Compare commits

..

1290 Commits

Author SHA1 Message Date
Martin Kroeker
51c22612eb Merge pull request #2907 from xianyi/develop
Update from develop for 0.3.11
2020-10-17 22:14:12 +02:00
Martin Kroeker
b8f689200e Update version number to 0.3.11 2020-10-17 22:11:34 +02:00
Martin Kroeker
fe9015b619 Update version for 0.3.11 release 2020-10-17 22:10:50 +02:00
Martin Kroeker
f99b8c1502 Merge pull request #2906 from martin-frbg/changelog-0311
Update Changelog.txt with the 0.3.11 changes
2020-10-17 22:07:14 +02:00
Martin Kroeker
5381a18056 Update Changelog.txt with the 0.3.11 changes 2020-10-17 22:05:36 +02:00
Martin Kroeker
e35576c6fc Merge pull request #2905 from martin-frbg/aocc-clang
Add -mavx for clang & aocc
2020-10-17 09:45:22 +02:00
Martin Kroeker
f1bb85d378 Add AVX flags for clang/aocc as well 2020-10-16 20:52:15 +02:00
Martin Kroeker
25907e672b Merge pull request #101 from xianyi/develop
rebase
2020-10-16 20:48:58 +02:00
Martin Kroeker
9789375389 Merge pull request #2900 from martin-frbg/fixcmake_sse
Add compiler options for SSE to the cmake support files
2020-10-16 16:17:36 +02:00
Martin Kroeker
f64243ff57 Add compiler options for sse/sse2/ssse3/sse4.1 2020-10-16 10:47:06 +02:00
Martin Kroeker
786c0a3ce8 Add sse options for use of intrinics with older compilers 2020-10-16 10:41:53 +02:00
Martin Kroeker
df70667043 fix core list for sse/sse2 2020-10-16 09:55:48 +02:00
Martin Kroeker
e6c5b13a18 Merge pull request #2898 from martin-frbg/morefixes
More pre-release fixes
2020-10-16 07:26:39 +02:00
Martin Kroeker
f071d1207a add sse2 2020-10-15 22:10:32 +02:00
Martin Kroeker
dc6cefd2f5 Expressly enable -msse for 32bit DYNAMIC_ARCH kernels 2020-10-15 20:16:15 +02:00
Martin Kroeker
c339c40c01 Silence a redefinition warning 2020-10-15 19:08:12 +02:00
Martin Kroeker
ac8af9cec6 Add -msse where supported, apparently required for older gcc 2020-10-15 19:06:45 +02:00
Martin Kroeker
10379fc83b Use ifdef instead of if 2020-10-15 19:05:37 +02:00
Martin Kroeker
a85ac71633 Merge pull request #100 from xianyi/develop
rebase
2020-10-15 18:54:20 +02:00
Martin Kroeker
4c25910da0 Merge pull request #2896 from martin-frbg/intrin-double
Add compiler flag for SSE4 where available
2020-10-15 11:12:35 +02:00
Martin Kroeker
9b9ee92d5f Merge pull request #2897 from Qiyu8/usimd-double
Add double precision universal intrinsics for X86/ARM
2020-10-15 08:38:24 +02:00
Martin Kroeker
ae6ac83991 Revert "add double precision SSE" 2020-10-15 08:37:02 +02:00
Qiyu8
4fac91ef37 adapt arm platform 2020-10-15 11:08:10 +08:00
Qiyu8
bfdf4b56da Add double precision universal intrinsics for X86/ARM 2020-10-15 10:29:42 +08:00
Martin Kroeker
ebf0470fc2 add sse4.1 for DYNAMIC_ARCH kernels 2020-10-14 20:34:33 +02:00
Martin Kroeker
ca160bb440 Add -msse4.1 when SSE4.1 is supported 2020-10-14 19:18:07 +02:00
Martin Kroeker
c9c3ae07af Add double precision operations 2020-10-14 18:10:45 +02:00
Martin Kroeker
a897bc3bd2 Merge pull request #99 from xianyi/develop
rebase
2020-10-14 18:09:20 +02:00
Martin Kroeker
756802df61 Merge pull request #2890 from martin-frbg/s-d-sum
Revert special handling of Windows xNRM2 and enable C+intrinsics kern…
2020-10-14 09:02:03 +02:00
Martin Kroeker
01492decf4 Merge pull request #2895 from martin-frbg/sb-tests
Fix remaining build errors related to bfloat16 and cmake
2020-10-14 09:01:16 +02:00
Martin Kroeker
bd0752444a Merge pull request #2894 from RajalakshmiSR/bf16_packing
POWER10: Change the packing format for bfloat16
2020-10-14 08:12:08 +02:00
Martin Kroeker
c1f4f5d4e7 Replace Makefile with simplified version again 2020-10-14 01:08:50 +02:00
Martin Kroeker
75e3a92df6 Add express -mavx and -msse options (and fix a stray = for cooperlake) 2020-10-14 01:01:58 +02:00
Martin Kroeker
2a329baa81 Add the BFLOAT16 functions to cmake builds 2020-10-13 23:21:38 +02:00
Rajalakshmi Srinivasaraghavan
0826d68f93 POWER10: Change the packing format for bfloat16
As the new MMA instructions need the inputs in 4x2 order for bfloat16,
changing the format in copy/packing code.  This avoids permute instructions
in the gemm kernel inner loop.
2020-10-13 16:05:10 -05:00
Martin Kroeker
4bb73c0171 Rename "HALF" type to "BFLOAT16" 2020-10-13 20:07:19 +02:00
Martin Kroeker
bc5c7f9578 Cleanup 2020-10-13 19:56:09 +02:00
Martin Kroeker
437b7fe261 sh prefix renamed to sb 2020-10-13 19:55:14 +02:00
Martin Kroeker
a0ada4bcb8 Merge pull request #98 from xianyi/develop
rebase
2020-10-13 18:50:30 +02:00
Martin Kroeker
602a0c7a69 Merge pull request #2892 from RajalakshmiSR/bf16_make
Fix build issues with bfloat16
2020-10-13 18:48:37 +02:00
Rajalakshmi Srinivasaraghavan
b5d30b390d Fix build issues with bfloat16
This patch fixes compilation errors due to recent renaming from SH to SB
with BUILD_BFLOAT16.
2020-10-13 11:00:22 -05:00
Martin Kroeker
137ae618db Fix typo 2020-10-13 15:02:17 +02:00
Martin Kroeker
9e3cff5cf2 Expressly enable -mavx2 on Zen, SkylakeX and Cooperlake as well 2020-10-13 14:41:25 +02:00
Martin Kroeker
d85b968424 Merge pull request #2891 from martin-frbg/fix-2886
Fix several bugs and omissions from the BFLOAT16 rename
2020-10-13 13:46:17 +02:00
Martin Kroeker
5f60a32cac Add -mssse3 if supported by the hardware 2020-10-13 11:57:04 +02:00
Martin Kroeker
fecedc9c69 Add -mssse3 2020-10-13 11:55:41 +02:00
Martin Kroeker
0eacbca85f Add Haswell and Zen to temporary sse3 whitelist 2020-10-13 11:42:39 +02:00
Martin Kroeker
6999086a2b whitelist SANDYBRIDGE for SSE3 2020-10-13 10:32:19 +02:00
Martin Kroeker
9dca578c79 Cleanup 2020-10-13 10:14:08 +02:00
Martin Kroeker
1e7eb7b7a9 Fix typos in currently unused sections 2020-10-13 09:17:15 +02:00
Martin Kroeker
84949754a0 Fix bfloat16 conditional 2020-10-13 09:11:36 +02:00
Martin Kroeker
2ae8785603 Add a POWER9 build with BFLOAT16 enabled 2020-10-13 09:07:50 +02:00
Martin Kroeker
e05af6575e Fix some overlooked "SHBLAS" entries 2020-10-13 09:05:04 +02:00
Martin Kroeker
c1643006ae Merge pull request #97 from xianyi/develop
rebase
2020-10-13 09:01:49 +02:00
Martin Kroeker
8d2df7d066 Revert special handling of Windows xNRM2 and enable C+intrinsics kernel for SSUM/DSUM 2020-10-13 00:14:29 +02:00
Martin Kroeker
08929430cd Merge pull request #2886 from martin-frbg/issue_2767
Rename "HALF" precision functions (sh prefix) to "BFLOAT16" with "sb" prefix
2020-10-13 00:04:35 +02:00
Martin Kroeker
0c84ffe05f Merge pull request #2881 from mattip/fninit
add fninit to reset fpu registers before assembler routines
2020-10-12 23:50:41 +02:00
Martin Kroeker
cb4274e3ad Merge pull request #2888 from Qiyu8/usimd-sum
Optimize the performance of sum by using universal intrinsics
2020-10-12 23:22:08 +02:00
Matti Picus
403eb513a0 use emms instead, add WIN guards 2020-10-12 18:15:01 +03:00
Martin Kroeker
cb839575ed Convert the prototypes of the unimplemented BFLOAT16 functions to the new naming scheme 2020-10-12 14:44:33 +02:00
Qiyu8
0ed1f07660 Optimize the performance of sum by using universal intrinsics 2020-10-12 19:48:53 +08:00
Martin Kroeker
bb74dd29db Restore -msse3 2020-10-12 00:42:05 +02:00
Martin Kroeker
629c497b6c common_sh.h renamed to common_sb.h 2020-10-12 00:27:11 +02:00
Martin Kroeker
2c552f1074 Change "HALF" and "sh" to "BFLOAT16" and "sb" 2020-10-12 00:11:31 +02:00
Martin Kroeker
7ae9e8960e Change "HALF" and "sh" to "BFLOAT16" and "sb" 2020-10-12 00:08:29 +02:00
Martin Kroeker
e3a29f6b58 Change "HALF" and "sh" to "BFLOAT16" and "sb" 2020-10-12 00:07:37 +02:00
Martin Kroeker
006c7f6671 Change "HALF" and "sh" to "BFLOAT16" and "sb" 2020-10-12 00:06:06 +02:00
Martin Kroeker
85154c2e18 Change "HALF" and "sh" to "BFLOAT16" and "sb" 2020-10-12 00:05:05 +02:00
Martin Kroeker
ae1ab5bfdf Change "HALF" and "sh" to "BFLOAT16" and "sb" 2020-10-12 00:03:21 +02:00
Martin Kroeker
052f31bc3c Change "HALF" and "sh" to "BFLOAT16" and "sb" 2020-10-12 00:02:16 +02:00
Martin Kroeker
3aecafad80 Change "HALF" and "sh" to "BFLOAT16" and "sb" 2020-10-12 00:00:55 +02:00
Martin Kroeker
756062afa5 Rename "HALF" and "sh" to "BFLOAT16" and "sb" 2020-10-11 23:56:17 +02:00
Martin Kroeker
2061f7fdff Rename "HALF" and "sh" to "BFLOAT16" and "sb" 2020-10-11 23:54:53 +02:00
Martin Kroeker
dc8a1afa63 Rename "HALF" and "sh" to "BFLOAT16" and "sb" 2020-10-11 23:53:50 +02:00
Martin Kroeker
32733ded04 Rename "HALF" and "sh" to "BFLOAT16" and "sb" 2020-10-11 23:52:45 +02:00
Martin Kroeker
3bc8e8c334 Rename "HALF" and "sh" to "BFLOAT16"and "sb" 2020-10-11 23:51:34 +02:00
Martin Kroeker
573508f0ee Rename common_sh.h to common_sb.h 2020-10-11 23:50:54 +02:00
Martin Kroeker
ca31c32693 Rename "HALF" and "sh" to "BFLOAT16" and "sb" 2020-10-11 23:49:22 +02:00
Martin Kroeker
5800758b43 Rename "HALF" and "sh" to "BFLOAT16" and "sb" 2020-10-11 23:44:38 +02:00
Martin Kroeker
924fd806d0 Rename "HALF" and "sh" to "BFLOAT16" and "sb" 2020-10-11 23:43:36 +02:00
Martin Kroeker
4db09c6cec Rename compare_sgemm_shgemm.c to compare_sgemm_sbgemm.c 2020-10-11 23:42:45 +02:00
Martin Kroeker
fd94236042 Rename "HALF" and "sh" to "BFLOAT16" and "sb" 2020-10-11 23:42:07 +02:00
Martin Kroeker
68ce719fac Rename shdot_microk_cooperlake.c to sbdot_microk_cooperlake.c 2020-10-11 23:41:13 +02:00
Martin Kroeker
d7dd9b396c Rename shdot.c to sbdot.c 2020-10-11 23:40:43 +02:00
Martin Kroeker
9ae80490e0 rename "HALF" and "sh" to "BFLOAT16" and "sb" 2020-10-11 23:39:42 +02:00
Martin Kroeker
d314d1f49f Rename shgemm_kernel_power10.c to sbgemm_kernel_power10.c 2020-10-11 23:37:38 +02:00
Martin Kroeker
f0883740e4 Merge pull request #96 from xianyi/develop
rebase
2020-10-11 23:34:36 +02:00
Martin Kroeker
1c0b03efb4 Merge branch 'develop' into develop 2020-10-11 23:34:14 +02:00
Martin Kroeker
c589c3e2a1 Merge pull request #2882 from martin-frbg/issue2709
Use generic C for (D/Z)NRM2 on Windows x86_64
2020-10-11 22:22:30 +02:00
Martin Kroeker
ec638a82bf Merge pull request #2852 from martin-frbg/issue2588-cmake
Support building only a subset of variable types
2020-10-11 22:21:33 +02:00
Martin Kroeker
caa0d757ca repair TABs 2020-10-11 18:29:34 +02:00
Martin Kroeker
6154f72d6d Copy BUILD_ settings to the LAPACK make.inc 2020-10-11 18:25:16 +02:00
Martin Kroeker
ae8b0d257a Set BUILD_ options to 1 instead of just defining them 2020-10-11 18:08:21 +02:00
Martin Kroeker
1da32cc1fc Add cblas_xerbla interface 2020-10-11 17:45:41 +02:00
Martin Kroeker
8c5e08076e If none of the BUILD_ options is set, enable them all 2020-10-11 17:33:51 +02:00
Martin Kroeker
5f23bdf437 remove debug output 2020-10-11 17:23:08 +02:00
Martin Kroeker
b593e6b650 Merge pull request #2885 from martin-frbg/ifexists
Improve CMAKE check for conflicting config_kernel.h
2020-10-11 15:45:24 +02:00
Martin Kroeker
082c86a538 Merge pull request #2884 from martin-frbg/sse_fixup
Add workaround for unwanted default activation of -msse3 in DYNAMIC_ARCH builds
2020-10-11 15:14:03 +02:00
Martin Kroeker
e396ec8b56 Allow building support for only a subset of variable types 2020-10-11 15:11:15 +02:00
Martin Kroeker
68e6823d36 Adapt for supporting only a subset of variable types 2020-10-11 15:01:32 +02:00
Martin Kroeker
887e00fd7f Adapt for supporting only a subset of variable types 2020-10-11 14:58:57 +02:00
Martin Kroeker
886a8e3190 Adapt for supporting only a subset of variable types 2020-10-11 14:57:32 +02:00
Martin Kroeker
0f7d73ff6d Allow supporting only a subset of variable types 2020-10-11 14:53:26 +02:00
Martin Kroeker
6b6adf8a4a Allow compiling only a subset of kernels for specific variable types 2020-10-11 14:52:09 +02:00
Martin Kroeker
a6570108c5 Add Makefile support for enabling only some variable types 2020-10-11 14:49:58 +02:00
Martin Kroeker
ef552bc578 Add Makefile support for enabling only some variable types 2020-10-11 14:49:06 +02:00
Martin Kroeker
efe1ad4700 Add Makefile support for enabling only some variable types 2020-10-11 14:48:23 +02:00
Martin Kroeker
b27ca78a21 Adapt to having only a subset of variable types supported 2020-10-11 14:46:24 +02:00
Martin Kroeker
93454022a9 Adapt to having only a subset of variable types supported 2020-10-11 14:45:40 +02:00
Martin Kroeker
20cf1d773f Adapt to having only a subset of variable types supported 2020-10-11 14:44:56 +02:00
Martin Kroeker
5c657fffad Adapt to having only a subset of variable types supported 2020-10-11 14:44:13 +02:00
Martin Kroeker
b262058059 Adapt to having only a subset of variable types supported 2020-10-11 14:43:13 +02:00
Martin Kroeker
bc319cee82 Adapt to having only a subset of variable types supported 2020-10-11 14:42:26 +02:00
Martin Kroeker
e5966f8606 Adapt to having only a subset of variable types supported 2020-10-11 14:41:43 +02:00
Martin Kroeker
9df12eb08f Adapt to having only a subset of variable types supported 2020-10-11 14:40:51 +02:00
Martin Kroeker
cf53970bcb Adapt to having only a subset of variable types supported 2020-10-11 14:40:06 +02:00
Martin Kroeker
dcd51d5c72 Adapt to having only a subset of variable types supported 2020-10-11 14:39:19 +02:00
Martin Kroeker
b8f95354c7 Adapt to having only a subset of variable types supported 2020-10-11 14:38:25 +02:00
Martin Kroeker
d33de97d60 Adapt to having only a subset of variable types supported 2020-10-11 14:36:45 +02:00
Martin Kroeker
6a83c591d6 Adapt for having only a subset of variable types 2020-10-11 14:34:12 +02:00
Martin Kroeker
f6d2827d0c Adapt ctests to having only a subset of types in the build 2020-10-11 14:32:00 +02:00
Martin Kroeker
08f4749eb4 Adapt tests to having only a subset of types in the build 2020-10-11 14:25:24 +02:00
Martin Kroeker
63d7dad04c Adapt utests for builds supportin only some variable types 2020-10-11 14:15:35 +02:00
Martin Kroeker
ac653c94f3 Merge branch 'develop' into issue2588-cmake 2020-10-11 13:57:07 +02:00
Martin Kroeker
190b74dd24 Add files via upload 2020-10-11 13:26:05 +02:00
Martin Kroeker
9d43140d61 Improve check for conflicting config_kernel.h 2020-10-11 12:58:17 +02:00
Martin Kroeker
8ef600f1a3 Merge pull request #95 from xianyi/develop
rebase
2020-10-11 12:53:18 +02:00
Martin Kroeker
88928650c4 Merge pull request #2883 from martin-frbg/issue2872
Minor CMAKE fixes
2020-10-11 10:30:33 +02:00
Martin Kroeker
7a53128481 Add whitelist of DYNAMIC_ARCH kernels for which -msse3 needs to be enabled 2020-10-11 01:06:46 +02:00
Martin Kroeker
0c773b8205 Do not rely on HAVE_SSE3 in DYNAMIC_ARCH builds 2020-10-11 01:04:57 +02:00
Martin Kroeker
fbda20c856 Merge pull request #94 from xianyi/develop
rebase
2020-10-11 01:03:00 +02:00
Martin Kroeker
82a497ec5d restore PRESCOTT default for DYNAMIC_LIST 2020-10-11 00:43:09 +02:00
Martin Kroeker
de27e4f5fb Stop DYNAMIC_ARCH build if the toplevel source contains a stray config_kernel.h from a gmake build
This is unlikely to happen in practice, but if it does, the rogue file would get included instead of the dynamically generated version for each target_core, leading to very confusing errors like "invalid operands (undefined UND and ABS sections)" in compilation of the assembly kernels as macros like PREFETCH would remain undefined
2020-10-11 00:40:22 +02:00
Martin Kroeker
e1b7123bbe Merge pull request #2867 from Qiyu8/usimd-floatdot
Optimize the performance of dot by using universal intrinsics in X86/ARM
2020-10-10 12:10:25 +02:00
Qiyu8
f32d34a015 add sse3 compiler flag 2020-10-10 10:36:15 +08:00
Martin Kroeker
599777ecb7 Merge pull request #2879 from martin-frbg/issue2839
Default BLAS3_MEM_ALLOC_THRESHOLD on all platforms to 32
2020-10-06 23:26:52 +02:00
Martin Kroeker
7812486091 Use generic C for D/Z nrm2 kernels on Windows to work around fpu exception bug 2020-10-06 21:33:16 +02:00
Matti Picus
a5b164946c add fninit to reset fpu registers before assembler routines 2020-10-05 22:13:25 +03:00
Martin Kroeker
a5feea6611 make BLAS3_MEM_ALLOC_THRESHOLD configurable on non-Windows 2020-10-04 23:01:06 +02:00
Martin Kroeker
dc8e4e1959 Reduce the BLAS3 heap allocation threshold to 32 and mark it as configurable 2020-10-04 22:59:24 +02:00
Martin Kroeker
cccd1438da Merge pull request #93 from xianyi/develop
rebase
2020-10-04 22:57:11 +02:00
Martin Kroeker
f032d8966e Merge pull request #2874 from Flamefire/memory_fixes
Avoid out of bounds access on invalid memory free
2020-10-04 15:16:51 +02:00
Martin Kroeker
f6e4cf2f9d Merge pull request #2876 from Flamefire/omp_fork_fix
Lazyly reinit threads after a fork in OMP mode
2020-10-03 22:52:17 +02:00
Martin Kroeker
9828343e12 Merge pull request #2878 from brada4/asms
fix clang std=c18 compilation on aarch64
2020-10-03 22:51:49 +02:00
User User-User
d2333e7842 aarch64 fix std=c18 compilation 2020-10-03 18:00:34 +03:00
Alexander Grund
3094fc6c83 Lazyly reinit threads after a fork in OMP mode
This initializes the per-thread memory buffers which get
cleared/released on a fork via pthread_at_fork. Not doing so leads to
each thread calling blas_memory_alloc on almost every execution which
slows down the code significantly as the threads race for the memory
allocation using locks to serialize that.
2020-10-01 15:41:42 +02:00
Alexander Grund
3c05f54df8 Avoid out of bounds access on invalid memory free 2020-10-01 10:48:45 +02:00
Alexander Grund
dee7c49938 Fix TABs and trailing space 2020-10-01 10:43:16 +02:00
Martin Kroeker
d3c0d6811b Merge pull request #2873 from martin-frbg/issue2871
Check for __linux rather than linux in cpuid code and benchmarks
2020-10-01 06:38:22 +02:00
Martin Kroeker
9637cd1fd1 Merge pull request #2865 from thisch/backticks
Consolidate usage of backticks for build options
2020-10-01 06:38:06 +02:00
Martin Kroeker
2367726578 Remove redundant status message 2020-09-30 23:28:49 +02:00
Martin Kroeker
5464eb13ea Change ifdef linux to __linux for C11 compatibility 2020-09-30 22:59:41 +02:00
Martin Kroeker
e1574cbc83 Change ifdef linux to __linux for C11 compatibility
and add a fallback for unsupported operating systems in detect()
2020-09-30 22:50:21 +02:00
Martin Kroeker
0b2bb5696a Change ifdef linux to __linux for C11 compatibility 2020-09-30 22:47:25 +02:00
Martin Kroeker
a7d5d0078d Change ifdef linux to __linux for C11 compatibility 2020-09-30 22:46:25 +02:00
Martin Kroeker
be40440ec5 Change ifdef linux to __linux for C11 compatibility 2020-09-30 22:45:18 +02:00
Martin Kroeker
2bf70c8e3b Change ifdef linux to __linux for C11 compatibility 2020-09-30 22:43:25 +02:00
Qiyu8
60e6c68e38 Adapt ARM architect 2020-09-29 16:36:14 +08:00
Martin Kroeker
64629cb5c7 Merge pull request #91 from xianyi/develop
rebase
2020-09-28 22:48:53 +02:00
Qiyu8
1b1a757f5f Optimize the performance of dot by using universal intrinsics in X86/ARM 2020-09-28 20:36:53 +08:00
Martin Kroeker
0d98ce202c Merge pull request #2866 from RajalakshmiSR/p10_dcopy
Optimize dcopy/zcopy for POWER10
2020-09-28 07:22:54 +02:00
Rajalakshmi Srinivasaraghavan
2df4235e00 Optimize dcopy/zcopy for POWER10
This patch makes use of new POWER10 vector pair instructions for
loads and stores. Tested in simulator and no new failures.
2020-09-27 21:42:32 -05:00
Thomas Hisch
fe8cd5ae7e Consolidate usage of backticks for build options
There were some build options in the README that were not
highlighted. Now all are highlighted.
2020-09-28 00:42:17 +02:00
Martin Kroeker
ba31c8f5f9 Merge pull request #2853 from Qiyu8/usimd-daxpy
Optimize the performance of daxpy by using universal intrinsics
2020-09-27 23:19:59 +02:00
Martin Kroeker
e961d4d609 Merge pull request #2864 from martin-frbg/lapack445
FIx underflow/rounding errors in LAPACK (S,D)LANV2
2020-09-27 23:11:17 +02:00
Martin Kroeker
7ed25e9e10 FIx underflow/rounding errors in LAPACK (S,D)LANV2
Reference-LAPACK PR 445, fixing their issue 263
2020-09-27 22:59:20 +02:00
Martin Kroeker
7b169379e0 Merge pull request #2863 from martin-frbg/readmefixes
Readmefixes
2020-09-27 22:50:25 +02:00
Martin Kroeker
7f539fb850 Update cpu list, outline cmake build, clarify scope of set_num_threads extension 2020-09-27 22:48:41 +02:00
Martin Kroeker
caf7a12295 Merge pull request #90 from xianyi/develop
rebase
2020-09-27 22:35:45 +02:00
Martin Kroeker
72b5b73647 Merge pull request #2850 from xiaojiayuan111/develop
fix a bug of trmm
2020-09-27 12:12:35 +02:00
Qiyu8
881c15179f remove default support for FMA4 on zen architect 2020-09-27 09:35:50 +08:00
Martin Kroeker
896bbd55e1 Add support for building only selected variable types 2020-09-26 23:25:55 +02:00
Martin Kroeker
c5a32288c6 Work around sgemm_r/dgemm_r not being properly defined with BUILD_COMPLEX/BUILD_COMPLEX16 2020-09-26 23:24:37 +02:00
Martin Kroeker
dfaafd3b55 Merge pull request #2854 from martin-frbg/travis-graviton
Add an AWS-Graviton2 build to Travis CI
2020-09-23 21:59:18 +02:00
Martin Kroeker
f2e9a24e1a Add AWS Graviton2 build 2020-09-23 19:02:20 +02:00
Martin Kroeker
98153875e9 Adapt tests to having only a subset of types in the library 2020-09-22 23:28:57 +02:00
Martin Kroeker
0eaae30e8c Adapt tests to having only a subset of types in the build 2020-09-22 23:28:03 +02:00
Martin Kroeker
dfbc62ef7e Support building only a subset of types 2020-09-22 23:25:59 +02:00
Martin Kroeker
b475b4bd0d Support building only a subset of types 2020-09-22 23:25:04 +02:00
Martin Kroeker
357bff06b5 Add BUILD_vartype defines 2020-09-22 23:24:22 +02:00
Martin Kroeker
988a6f429e Add BUILD_vartype defines 2020-09-22 23:23:33 +02:00
Martin Kroeker
e5e2fbd593 Support building only selected types 2020-09-22 23:21:30 +02:00
Martin Kroeker
3287848c8f Support building only seleced types 2020-09-22 23:20:51 +02:00
Martin Kroeker
26611af8e1 fix grouping of sources used for more than one type 2020-09-22 23:20:05 +02:00
Martin Kroeker
b886bd672b add defines for building a subset of types 2020-09-22 23:18:55 +02:00
Martin Kroeker
61fae59298 Merge pull request #88 from xianyi/develop
rebase
2020-09-22 23:15:33 +02:00
Martin Kroeker
33d22f99f1 Merge pull request #2851 from martin-frbg/travis-xcode12
Add an OSX build with xcode12
2020-09-22 21:44:55 +02:00
Martin Kroeker
5ba01dd1a8 Add an OSX build with xcode12 2020-09-22 17:26:19 +02:00
Qiyu8
14f7dad3b7 performance improved 2020-09-22 16:52:15 +08:00
y00512012
06cf73a239 fix a bug of trmm 2020-09-22 16:47:10 +08:00
Qiyu8
325b539c26 Optimize the performance of daxpy by using universal intrinsics 2020-09-22 10:38:35 +08:00
Martin Kroeker
0f112077e6 Merge pull request #2847 from mhillenibm/fixup_cscal
s390x: fix cscal and zscal implementations
2020-09-21 22:22:43 +02:00
Marius Hillenbrand
22aa81f3e5 s390x: fix cscal and zscal implementations
The implementation of complex scalar * vector multiplication for Z14
makes some LAPACK tests fail because the numerical differences to the
reference implementation exceed the threshold (as can be seen by running
make lapack-test and replacing kernel/zarch/cscal.c with a generic
implementation for comparison).

The complex multiplication uses terms of the form a * b + c * d for both
real and imaginary parts. The assembly code (and compiler-emitted code
as well) uses fused multiply add operations for the second product and
sum. The results can be "surprising", for example when both terms in the
imaginary part nearly cancel each other out. In that case, the second
product contributes more digits to the sum than the first product that
has been rounded before.

One option is to use separate multiplications (which then round the same
way) and a distinct add. Change the code to pursue that path, by (1)
requesting the compiler not to contract the operations into FMAs and (2)
replacing the assembly kernel with corresponding vectorized C code
(where change 1 also applies).

Signed-off-by: Marius Hillenbrand <mhillen@linux.ibm.com>
2020-09-21 13:10:05 +02:00
Marius Hillenbrand
77ea73f5e5 s390x: for clang use fp-contract=on instead of fast
Make clang slightly more cautious when contracting floating-point
operations (e.g., when applying fused multiply add) by setting
-ffp-contract=on (instead of fast).

Signed-off-by: Marius Hillenbrand <mhillen@linux.ibm.com>
2020-09-21 11:32:08 +02:00
Marius Hillenbrand
f91057cbad s390x: move common vector definitions and utils into header
... to facilitate reuse beyond gemm_vec.c and avoid code duplication.

Signed-off-by: Marius Hillenbrand <mhillen@linux.ibm.com>
2020-09-21 11:32:08 +02:00
Martin Kroeker
992d7ca63d Merge pull request #2845 from martin-frbg/lapack443
Fix workspace query in LAPACK xGELQ (Reference-LAPACK 443)
2020-09-18 23:18:41 +02:00
Martin Kroeker
7e4d5c237c Fix workspace query in xGELQ (Reference-LAPACK PR443) 2020-09-18 09:19:46 +02:00
Martin Kroeker
8d12027a79 Merge pull request #86 from xianyi/develop
rebase
2020-09-18 09:17:49 +02:00
Martin Kroeker
b1e0bcceec Merge pull request #2844 from RajalakshmiSR/daxpy_p10
Optimize daxpy/zaxpy for POWER10
2020-09-17 23:46:32 +02:00
Rajalakshmi Srinivasaraghavan
be43d2cb96 Optimize daxpy/zaxpy for POWER10
This patch makes use of new POWER10 vector pair instructions for
loads and stores. Tested in simulator and no new failures.
2020-09-17 12:56:28 -05:00
Martin Kroeker
2855e6000c Merge pull request #2841 from martin-frbg/cpp_gemvtest
Make thread safety tests available to CMAKE and support running only the GEMV version
2020-09-17 17:29:56 +02:00
Martin Kroeker
144a03446d Merge pull request #2843 from mhillenibm/fixup_merge_dynamic_zarch
s390x/DYNAMIC_ARCH: fixup broken merge and reapply simplification
2020-09-17 17:28:43 +02:00
Marius Hillenbrand
75d440caa0 s390x/DYNAMIC_ARCH: fixup broken merge and reapply simplification
An unrelated commit and merge inadvertently reverted our recent two
changes for simplifying DYNAMIC_ARCH on s390x. Simply reapply the
changes.

Simplify detection of which kernels we can compile on s390x. Instead of
decoding the gcc version in a complicated manner, just check if CC
supports a given -march=archXY flag. Together with the next patch, we
thereby gain support for builds with LLVM/clang with DYNAMIC_ARCH=1.

To enable builds with DYNAMIC_ARCH with older compiler releases, the
Makefile and drivers/other/dynamic_arch.c need a common view of the
architecture support built into the library.

We follow the notation from x86 when used with DYNAMIC_LIST, where
defines DYN_<ARCH NAME> denote support for a given generation to be
built in. Since there are far fewer architecture generations in OpenBLAS
for s390x, that does not bloat command lines too much.

Closes: #2842
Fixes: ba644378dc ("Copy BUILD_ options available to the compiler flags"

Signed-off-by: Marius Hillenbrand <mhillen@linux.ibm.com>
2020-09-17 17:09:03 +02:00
Martin Kroeker
6abca76c4e Add option for running only the less demanding GEMV version of the thread safety tests 2020-09-17 13:49:24 +02:00
Martin Kroeker
84c00c3c6e Support running just the GEMV version of the thread safety test 2020-09-17 13:46:41 +02:00
Martin Kroeker
8c5c991bd7 Add cpp_thread_test options 2020-09-17 13:45:40 +02:00
Martin Kroeker
2e3b15d68b Add CMakeLists.txt 2020-09-17 13:43:55 +02:00
Martin Kroeker
eaf7f825bd Merge pull request #85 from xianyi/develop
rebase
2020-09-17 13:42:47 +02:00
Martin Kroeker
4c10a1673d Merge pull request #2840 from martin-frbg/fixup2833
Fix for cmake BUILD_ settings PR 2833
2020-09-16 18:55:50 +02:00
Martin Kroeker
c4aeeeb9f4 Activate all BUILD_ options if none was specified 2020-09-15 23:15:34 +02:00
Martin Kroeker
3843bd188c Merge pull request #84 from xianyi/develop
rebase
2020-09-15 23:13:30 +02:00
Martin Kroeker
ddec244a5a Merge pull request #2838 from austinpagan/gordon_trmm
Adding performance patch for trmm, just like trsm (#2836)
2020-09-15 21:17:48 +02:00
fossum
dfeca46098 Adding performance patch for trmm, just like #2836 2020-09-15 08:59:50 -05:00
Martin Kroeker
f8950f40a2 Merge pull request #2836 from austinpagan/gordon_trsm
Fixing a performance bug in trsm_[LR].c.
2020-09-15 11:26:37 +02:00
fossum
274d6e015b Fixing a performance bug in trsm_[LR].c. 2020-09-14 13:10:48 -05:00
Martin Kroeker
91c84e1c01 Merge pull request #2796 from Guobing-Chen/BF16_dot_coversion_apis
Add bfloat16 based dot and conversion with single/double
2020-09-14 15:00:19 +02:00
Martin Kroeker
1ee1e7b495 Merge pull request #2833 from martin-frbg/issue2830
Make building the tests for individual data types conditional on the respective BUILD option
2020-09-14 07:24:23 +02:00
Martin Kroeker
ba644378dc Copy BUILD_ options available to the compiler flags 2020-09-14 00:03:33 +02:00
Martin Kroeker
9e11c2d62f Add BUILD_SINGLE etc 2020-09-13 23:55:11 +02:00
Martin Kroeker
4d250d0cdf Rearrange ifdefs 2020-09-13 23:29:01 +02:00
Martin Kroeker
de139337b8 Remove spurious tests for complex ASUM and NRM2 2020-09-13 22:20:41 +02:00
Martin Kroeker
ec2948f147 Make tests conditional on BUILD_DOUBLE 2020-09-13 22:17:46 +02:00
Martin Kroeker
ce89398636 Make tests for individual variable types conditional on the respective BUILD_ option 2020-09-13 21:52:18 +02:00
Martin Kroeker
593ce9e237 Make building individual tests depend on BUILD_SINGLE etc defines 2020-09-13 21:50:12 +02:00
Martin Kroeker
74e358bcd5 Remove spurious complex16 tests 2020-09-13 21:49:01 +02:00
Martin Kroeker
26792d2096 Copy BUILD_* directives to the compiler options to allow ifdef in tests 2020-09-13 21:47:55 +02:00
Martin Kroeker
6b52c7e172 Merge pull request #2832 from martin-frbg/issue2831
Fix gfortran detection by vendor matching
2020-09-13 21:20:30 +02:00
Martin Kroeker
746ad3bd19 Fix vendor match for GCC gfortran 2020-09-13 18:40:59 +02:00
Martin Kroeker
55d4d470ec Merge pull request #83 from xianyi/develop
rebase
2020-09-13 18:30:11 +02:00
Martin Kroeker
a270894730 Merge pull request #2829 from mhillenibm/clang_s390x
Fix DYNAMIC_ARCH=1 with clang s390x
2020-09-08 23:36:41 +02:00
Marius Hillenbrand
047b8d7aff Add an s390 build with clang to the Travis configuration
Since clang builds have been fixed on s390x, including support for
DYNAMIC_ARCH, cover that build type in Travis.

Explicitly request Ubuntu 20.04 (codename focal) to get a recent
LLVM/clang version 10.x and thereby cover all s390x architecture
generations supported in OpenBLAS. Ubuntu 18.10's LLVM/clang 6.x cannot
build the inline assembly in some of the Z13 and Z14 kernels.

LLVM/clang currently does not support OpenMP on s390x, so disable that
in the build.

Signed-off-by: Marius Hillenbrand <mhillen@linux.ibm.com>
2020-09-08 20:59:06 +02:00
Marius Hillenbrand
f7731a358a Update CONTRIBUTERS.md - clang build fixes for IBM z
Signed-off-by: Marius Hillenbrand <mhillen@linux.ibm.com>
2020-09-08 19:34:18 +02:00
Marius Hillenbrand
a55fe06f25 s390x/DYNAMIC_ARCH: define a HW_CAP flag to support slightly older glibc versions
Enable building DYNAMIC_ARCH support with older versions of glibc that
do not know about the hwcap flag HWCAP_S390_VXE yet.

Signed-off-by: Marius Hillenbrand <mhillen@linux.ibm.com>
2020-09-08 19:34:18 +02:00
Marius Hillenbrand
4f34bcfb5e s390x/DYNAMIC_ARCH: pass supported arch levels from Makefile to run-time code
... instead of duplicating the (old) mechanism from the Makefile that
aimed to derive supported architecture generations from the gcc
version.

To enable builds with DYNAMIC_ARCH with older compiler releases, the
Makefile and drivers/other/dynamic_arch.c need a common view of the
architecture support built into the library.

We follow the notation from x86 when used with DYNAMIC_LIST, where
defines DYN_<ARCH NAME> denote support for a given generation to be
built in. Since there are far fewer architecture generations in OpenBLAS
for s390x, that does not bloat command lines too much.

Signed-off-by: Marius Hillenbrand <mhillen@linux.ibm.com>
2020-09-08 19:34:18 +02:00
Marius Hillenbrand
0629d8ebdb s390x/DYNAMIC_ARCH: generalize detecting supported archs for clang
Simplify detection of which kernels we can compile on s390x. Instead of
decoding the gcc version in a complicated manner, just check if CC
supports a given -march=archXY flag. Together with the next patch, we
thereby gain support for builds with LLVM/clang with DYNAMIC_ARCH=1.

Signed-off-by: Marius Hillenbrand <mhillen@linux.ibm.com>
2020-09-08 19:34:18 +02:00
Martin Kroeker
15da2f9acb Merge pull request #2828 from martin-frbg/lapack438
Correct xLASET arguments in LAPACK EIG tests
2020-09-08 10:25:19 +02:00
Martin Kroeker
7d9c77f421 Correct dimension argument to xLASET
from Reference-LAPACK PR 438
2020-09-07 22:03:46 +02:00
Martin Kroeker
c8f029a518 Merge pull request #82 from xianyi/develop
rebase
2020-09-07 21:59:13 +02:00
Martin Kroeker
e72430fe46 Merge pull request #2803 from xiegengxin/AVX2-asum
Implementaion of dasum, sasum with AVX2 & AVX512 intrinsic
2020-09-06 18:32:15 +02:00
Martin Kroeker
6e0f6c5f00 Merge pull request #2824 from martin-frbg/asumbench
Use POSIX2001 clock.gettime in asum benchmark if available
2020-09-06 10:05:47 +02:00
Martin Kroeker
6f8fad87c5 Use POSIX2001 clock.gettime for higher resolution 2020-09-05 19:44:01 +02:00
Martin Kroeker
ed0f2d3dd7 Merge pull request #2816 from martin-frbg/silicon
Add basic support for Apple Vortex (ARM64) cpu
2020-09-05 19:17:59 +02:00
Martin Kroeker
43a31b7786 Merge pull request #2823 from martin-frbg/fix2778
Improve fix for lapack-test EIG/cchkhb2stg from PR 2778
2020-09-05 17:29:38 +02:00
Martin Kroeker
8a2a137a9e Correct argument to SLASET (Improves fix from PR2778)
as explained by serguei-patchkovskii in Reference-LAPACK/lapack#438 (comment) , passing in an index of 1 instead of N leads to a standards violation accessing matrix A in SLASET, i.e. undefined behavior
2020-09-05 13:06:31 +02:00
Martin Kroeker
0d1f30a297 Merge pull request #81 from xianyi/develop
rebase
2020-09-05 12:47:03 +02:00
Martin Kroeker
70a254d507 Merge pull request #2822 from martin-frbg/issue2821
Fix potential domain error in sqrt
2020-09-05 12:39:32 +02:00
Martin Kroeker
330044d821 Fix potentiol domain error in sqrt 2020-09-05 09:44:33 +02:00
Martin Kroeker
97636b2c8a Merge pull request #2819 from h-vetinari/carry_lapack_437
Carry lapack#437
2020-09-04 23:50:43 +02:00
Martin Kroeker
4d36711547 Merge pull request #2820 from RajalakshmiSR/clang
POWER9: Fix mcpu option with clang
2020-09-04 23:09:31 +02:00
Rajalakshmi Srinivasaraghavan
718f67421a POWER9: Fix mcpu option with clang
Adding check for compiler type before checking GCC version in Makefile.
This allows clang to use power9 instead of power8 when CORE is POWER9.
2020-09-04 10:36:19 -05:00
H. Vetinari
3426519ae2 adapt ?ggsv?-functions to ambient code style in LAPACKE/include/lapack.h 2020-09-04 17:33:24 +02:00
H. Vetinari
1c6c71fa85 Follow-up to lapack#434 & lapack#409: add missing 'const' in signatures
Based on how the surrounding functions in lapack.h are handling the
parameters, particularly the ?ggsv?3-variants of the affected functions
2020-09-04 17:33:11 +02:00
H. Vetinari
860247b5da Follow-up to lapack#434 & lapack#409: fix signature mismatches 2020-09-04 17:32:53 +02:00
Martin Kroeker
c61771e335 Merge pull request #2778 from martin-frbg/lapackeig
Fix various wrong calls to SLASET/DLASET in the EIG part of the LAPACK testsuite
2020-09-04 10:06:02 +02:00
Chen, Guobing
deaeb6c5b8 Add bfloat16 based dot and conversion with single/double
1. Added bfloat16 based dot as new API: shdot
2. Implemented generic kernel and cooperlake-specific (AVX512-BF16) kernel for shdot
3. Added 4 conversion APIs for bfloat16 data type <=> single/double: shstobf16 shdtobf16 sbf16tos dbf16tod
     shstobf16 -- convert single float array to bfloat16 array
     shdtobf16 -- convert double float array to bfloat16 array
     sbf16tos  -- convert bfloat16 array to single float array
     dbf16tod  -- convert bfloat16 array to double float array
4. Implemented generic kernels for all 4 conversion APIs, and cooperlake-specific kernel for shstobf16 and shdtobf16
5. Update level1 thread facilitate functions and macros to support multi-threading for these new APIs
6. Fix Cooperlake platform detection/specify issue when under dynamic-arch building
7. Change the typedef of bfloat16 from unsigned short to more strict uint16_t

Signed-off-by: Chen, Guobing <guobing.chen@intel.com>
2020-09-04 02:31:25 +08:00
Martin Kroeker
c7ef7174e4 Merge pull request #2817 from martin-frbg/lapack436
LAPACKE: fix declaration of work arrays in [cz]gesvdq
2020-09-03 17:10:23 +02:00
Martin Kroeker
775a87242d Rename KERNEL.SILICON to KERNEL.VORTEX 2020-09-03 08:44:20 +02:00
Martin Kroeker
af5bc95503 Rename SILICON to VORTEX and fix duplicate numbering 2020-09-03 08:43:26 +02:00
Martin Kroeker
ea3a58c844 Rename SILICON to VORTEX 2020-09-03 08:38:53 +02:00
Martin Kroeker
17dca035de rename SILICON to VORTEX 2020-09-03 08:38:08 +02:00
Gengxin Xie
1b0f17eeed align to 64, using SSE when input size is small 2020-09-03 14:25:54 +08:00
Martin Kroeker
c31b72965e Fix data type of work array in zgesvdq prototype 2020-09-02 23:44:44 +02:00
Martin Kroeker
0ce2aa3163 Fix data type of rwork array 2020-09-02 23:41:51 +02:00
Martin Kroeker
80794fe8fd Create KERNEL.SILICON 2020-09-02 22:56:58 +02:00
Martin Kroeker
4a4d1ca6e0 Add AppleSIlicon cpu 2020-09-02 22:52:12 +02:00
Martin Kroeker
b37d17382a Add Apple Silicon 2020-09-02 22:48:49 +02:00
Martin Kroeker
029fd01cfb Detect AppleSilicon cpu on OSX 2020-09-02 22:47:38 +02:00
Martin Kroeker
9d1ea75aa0 Merge pull request #80 from xianyi/develop
rebase
2020-09-02 22:16:41 +02:00
Martin Kroeker
776d005f4c Merge pull request #2815 from mhillenibm/clang_s390x
Fix build with clang on s390x
2020-09-02 16:56:01 +02:00
Marius Hillenbrand
2ee5b899ce s390x: enable S/DGEMM block with explicit loop unrolling + interleaving with clang
The code for SGEMM 16x4 and DGEMM 8x4 blocks on z14 and z15 uses
explicit unrolling and interleaving to improve performance. The code
employs an empty inline asm statement with operands that constrain the
compiler's instruction scheduling and thereby enforce proper overlapping
of load and compute phases. Fix an ifdef to apply that for clang builds,
as well.

Signed-off-by: Marius Hillenbrand <mhillen@linux.ibm.com>
2020-09-02 13:49:31 +02:00
Marius Hillenbrand
095f4e6964 s390x: allow clang to emit fused multiply-adds (replicates gcc's default behavior)
gcc's default setting for floating-point expression contraction is
"fast", which allows the compiler to emit fused multiply adds instead of
separate multiplies and adds (amongst others). Fused multiply-adds,
which assembly kernels typically apply, also bring a significant
performance advantage to the C implementation for matrix-matrix
multiplication on s390x. To enable that performance advantage for builds
with clang, add -ffp-contract=fast to the compiler options.

Signed-off-by: Marius Hillenbrand <mhillen@linux.ibm.com>
2020-09-02 13:49:31 +02:00
Marius Hillenbrand
87e5bbd887 s390x: avoid variable-length arrays in struct for asm operands
... since it is not required and clang does not support that gcc
extension. Instead, use a variable-length array directly for these
operands.

Note that, while the actual inline assembly code does not directly use
these memory operands, they serve to inform the compiler that it cannot
reorder reads or writes to/from the input and output data across the
inline asm statements.

Signed-off-by: Marius Hillenbrand <mhillen@linux.ibm.com>
2020-09-02 13:49:31 +02:00
Marius Hillenbrand
b9b3265ec8 s390x: avoid inline assembly for vector loads for clang
... since clang does not support the instruction format for inline
assembly and also it is not required for current versions of clang.

Signed-off-by: Marius Hillenbrand <mhillen@linux.ibm.com>
2020-09-02 13:49:30 +02:00
Marius Hillenbrand
a1616a0b86 s390x: replace nop with "nop 0" in inline assembly
... as a bandaid for building with clang until LLVM's internal assembler
supports nops without operand.

Signed-off-by: Marius Hillenbrand <mhillen@linux.ibm.com>
2020-09-02 13:49:30 +02:00
Marius Hillenbrand
60ef193258 s390x: use "lghi" for immediate values to fix build with clang
Some of the kernels written in assembly utilize a "load address"
instruction for loading an immediate value into a register. That is
both unnecessarily complex and LLVM's assembler does not understand that
specific syntax. Thus, replace with the appropriate "load immediate"
instruction, which is also clearer to read.

Signed-off-by: Marius Hillenbrand <mhillen@linux.ibm.com>
2020-09-02 13:49:30 +02:00
Martin Kroeker
18bfb6d6f7 Merge pull request #2813 from martin-frbg/issue2804-2
Fix for c_check misinterpreting arm64 in uname -m output as armv7
2020-09-01 23:39:46 +02:00
Martin Kroeker
e4900caa11 Fix c_check misinterpreting arm64 in uname output to mean armv7
additionla fix for upcoming OSX on ARM64 related to #2804, as suggested by fxcoudert in #2805
2020-09-01 19:54:08 +02:00
Martin Kroeker
68b1713c30 Merge pull request #2811 from martin-frbg/issue2806
Make NO_AVX512 option override the AVX512 compile test in CMAKE builds as well
2020-09-01 17:19:14 +02:00
Martin Kroeker
4074770d00 Merge pull request #2797 from martin-frbg/relafixes1
ReLAPACK fixes
2020-09-01 16:04:03 +02:00
Martin Kroeker
b87a77da02 Merge pull request #79 from xianyi/develop
rebase
2020-09-01 12:03:53 +02:00
Martin Kroeker
f42e84d46c Fix misnaming of LAPACK_?ggsvp function prototypes as LAPACKE_ (#2808)
* Fix misnaming of LAPACK_?ggsvp and ?ggsvd function prototypes as LAPACKE_

* Drop the LAPACKE matrix_layout parameter from the argument lists, change ints to pointers and add missing work arguments.
2020-09-01 10:44:48 +02:00
Martin Kroeker
0a4c5c4c44 Merge pull request #2807 from martin-frbg/issue2804
Work around ARMV8 build-time cpu detection problems on non-Linux systems
2020-08-31 23:44:56 +02:00
Martin Kroeker
3210a42734 Report cpu as ARMV8 instead of just giving up on non-Linux hosts 2020-08-31 20:03:21 +02:00
Martin Kroeker
5feb087c05 Handle Apple labeling armv8 as arm64 rather than aarch64 2020-08-31 20:02:08 +02:00
Gengxin Xie
448152cdd8 define __AVX2__ to ensure the haswell code compiled with avx2 2020-08-31 14:39:08 +08:00
Gengxin Xie
cb3c190a3a Implementaion of dasum, sasum with AVX2 & AVX512 intrinsic 2020-08-31 11:44:08 +08:00
Martin Kroeker
59e01b1aec Merge pull request #2799 from RajalakshmiSR/p10_ger
POWER10: Avoid setting accumulators to zero in gemm kernels
2020-08-28 22:52:11 +02:00
Rajalakshmi Srinivasaraghavan
317ff27cda POWER10: Avoid setting accumulators to zero in gemm kernels
For the first iteration, it is better to use xvf*ger instead of xvf*gerpp
builtins which helps to avoid setting accumulators to zero. This helps
to reduce few instructions.
2020-08-28 10:42:54 -05:00
Martin Kroeker
514a3d7d63 Merge pull request #2798 from kadler/aix-cpuid
Fix compile error on AIX cpuid detection
2020-08-28 08:30:59 +02:00
Kevin Adler
085aae8bdb Fix compile error on AIX cpuid detection
In 589c74a the cpuid detection was changed to use systemcfg, but a copy
and paste error was introduced during some refactoring that caused
POWER7 detection to reference CPUTYPE_POWER7 (which doesn't exist)
instead of CPUTYPE_POWER6.
2020-08-27 23:10:45 -05:00
Martin Kroeker
de63675717 Add early returns and fix sign errors in workspace calculations 2020-08-27 11:25:18 +02:00
Martin Kroeker
d64cc2be81 Add early returns 2020-08-27 11:22:50 +02:00
Martin Kroeker
c9b67141f0 Add early returns 2020-08-27 11:20:31 +02:00
Martin Kroeker
6797a3a1e0 Add early returns 2020-08-27 11:15:12 +02:00
Martin Kroeker
936966a42c Make ILAENV and xGETRF2 functions available 2020-08-27 10:59:08 +02:00
Martin Kroeker
5c6c2cd4f6 Merge pull request #2775 from Guobing-Chen/Fix_OMP_threads_specify
Fix OMP num specify issue
2020-08-24 20:18:09 +02:00
Martin Kroeker
e54be4ba1c Merge pull request #2792 from pkubaj/patch-1
Add aliases for armv6, armv7
2020-08-24 08:03:39 +02:00
pkubaj
48a1364e10 Add aliases for armv6, armv7
FreeBSD uses those names for 32-bit ARM variants.
2020-08-23 18:50:19 +00:00
Chen, Guobing
0c1c903f1e Fix OMP num specify issue
In current code, no matter what number of threads specified, all
available CPU count is used when invoking OMP, which leads to very bad
performance if the workload is small while all available CPUs are big.
Lots of time are wasted on inter-thread sync. Fix this issue by really
using the number specified by the variable 'num' from calling API.

Signed-off-by: Chen, Guobing <guobing.chen@intel.com>
2020-08-24 02:45:54 +08:00
Martin Kroeker
a073fa870e Merge pull request #2791 from martin-frbg/issue2787
Fix crashes in parallelized x86_64 ZDOT particularly on Windows
2020-08-23 19:33:03 +02:00
Martin Kroeker
b2053239fc Fix mssing dummy parameter (imag part of alpha) of zdot_thread_function 2020-08-23 15:08:16 +02:00
Martin Kroeker
b11bb6e728 Merge pull request #2790 from martin-frbg/issue2789
Add OpenMP dependency to pkgconfig information if needed
2020-08-23 14:42:35 +02:00
Martin Kroeker
1840bc5b52 Add OpenMP dependency to pkgconfig file if needed 2020-08-22 13:55:18 +02:00
Martin Kroeker
7c0977c267 Add OpenMP dependency to pkgconfig file if needed 2020-08-22 13:53:44 +02:00
Martin Kroeker
fb3d80c42a Merge pull request #78 from xianyi/develop
rebase
2020-08-22 13:52:29 +02:00
Martin Kroeker
9ee21a0a39 Merge pull request #2780 from Guobing-Chen/CPL_build_support
Enable COOPERLAKE build target
2020-08-20 19:54:29 +02:00
Martin Kroeker
bd3207b4b4 Update system.cmake 2020-08-19 22:51:10 +02:00
Martin Kroeker
b8ebfc9335 Update system.cmake 2020-08-19 22:30:19 +02:00
Martin Kroeker
7c1986640b fallback from cooperlake to skylake if gcc<10 2020-08-19 20:48:39 +02:00
Martin Kroeker
71d33c952d Typo fix 2020-08-19 17:44:23 +02:00
Martin Kroeker
6a3c074786 -march=cooperlake requires gcc10 2020-08-19 17:22:12 +02:00
Martin Kroeker
430f741b30 -march=cooperlake requires gcc10 2020-08-19 17:17:53 +02:00
Martin Kroeker
6f4dc7445d Fix typo 2020-08-19 16:36:55 +02:00
Martin Kroeker
81fbe8d088 -march=cooperlake only available in gcc >= 10 2020-08-19 16:10:15 +02:00
Martin Kroeker
bb9cf766f5 make march=cooperlake option conditional on gcc >= 10.1 2020-08-19 15:06:30 +02:00
Martin Kroeker
75eeb265d7 [WIP] Refactor the driver code for direct SGEMM (#2782)
Move "direct SGEMM" functionality out of the SkylakeX SGEMM kernel and make it available
(on x86_64 targets only for now) in DYNAMIC_ARCH builds
* Add  sgemm_direct targets in the kernel Makefile.L3 and CMakeLists.txt
* Add direct_sgemm functions to the gotoblas struct in common_param.h
* Move sgemm_direct_performant helper to separate file
* Update gemm.c  to macros for sgemm_direct to support dynamic_arch naming via common_s,h
* (Conditionally) add sgemm_direct functions in setparam-ref.c
2020-08-19 14:51:09 +02:00
Martin Kroeker
2c72972570 Merge pull request #2785 from albertziegenhagel/always-generate-pkg-config
Do not require pkg-config to generate the *.pc file
2020-08-19 14:42:58 +02:00
Albert Ziegenhagel
6b731d917f Do not require pkg-config to generate the *.pc file
Generating the pkg-config file does not actually depend on pkg-config being available.
2020-08-18 08:48:48 +02:00
Martin Kroeker
5dcf47cd97 Merge pull request #2784 from martin-frbg/issue2783
Add fallback typedef for bfloat16 to openblas_config.h template
2020-08-17 19:06:13 +02:00
Martin Kroeker
aa286e301b Add typedef for bfloat16 if needed 2020-08-17 15:32:14 +02:00
Martin Kroeker
9f0ef9cdfc Merge pull request #77 from xianyi/develop
rebase
2020-08-17 15:28:15 +02:00
Martin Kroeker
6bfc66663c revert 2020-08-17 15:20:41 +02:00
Martin Kroeker
a8c6fb9e1c revert 2020-08-17 15:20:16 +02:00
Martin Kroeker
5ec8f716cf revert 2020-08-17 15:19:40 +02:00
Martin Kroeker
82f8a0aeba Update .drone.yml 2020-08-15 15:46:18 +02:00
Martin Kroeker
d57d503c15 Update Makefile 2020-08-15 14:46:26 +02:00
Martin Kroeker
37ac23e8a3 Add simple MT sgemm precision test and INTERFACE64 build 2020-08-15 13:38:05 +02:00
Martin Kroeker
6a93e3b2ba Add simple sgemm preicsion test 2020-08-15 13:33:52 +02:00
Martin Kroeker
47ce1dd08f Update gemm64.cpp 2020-08-15 13:31:28 +02:00
Martin Kroeker
f5fcc5baec Add trivial gemm test for multithread consistency 2020-08-15 13:30:29 +02:00
Martin Kroeker
597010a968 Fix incorrect argument to SLASET
Reference-LAPACK issue 425 (and 318)
2020-08-14 00:41:56 +02:00
Martin Kroeker
d64f1ef26b Fix incorrect argument to SLASET
Reference-LAPACK issue 425 (and 318)
2020-08-14 00:40:24 +02:00
Martin Kroeker
c62aad62e5 Fix incorrect calls to DLASET
Reference-LAPACK issue 429
2020-08-14 00:35:45 +02:00
Chen, Guobing
e740c4873d Enable COOPERLAKE build target
Enable new build target platform -- COOPERLAKE. This target platform
supports all the SKYLAKEX supported ISAs + avx512bf16. So all the
SKYLAKEX specific kernels/drivers and related code are now extended
to be also active on COOPERLAKE. Besides, new BF16 related kernels
are active under this target.
2020-08-13 06:18:00 +08:00
Martin Kroeker
efdd237a91 Add a dedicated POWER9 build to the Travis CI (#2774)
* Add dedicated POWER9 build (using new syntax to ensure it runs as a P9-only containerized job rather than a VM that
might end up on P8 hardware half of the time)
* Bump gcc version for POWER9 build
2020-08-12 23:08:38 +02:00
Martin Kroeker
4573cb2f43 Merge pull request #2765 from martin-frbg/issue2760
Add memory barrier to the PPC blas_lock implementation for Linux
2020-08-11 22:40:17 +02:00
Martin Kroeker
2a4bb797db Merge pull request #2773 from martin-frbg/issue2770
Fix Makefiles still mishandling NO_CBLAS=0 and NO_LAPACKE=0
2020-08-11 21:02:55 +02:00
Martin Kroeker
cbbe38bb88 Merge pull request #2772 from mhillenibm/s390x_gemm_tuning
s390x: GEMM tuning for z14
2020-08-11 18:14:09 +02:00
Martin Kroeker
619343278d Fix mishandling of NO_CBLAS=0 and NO_LAPACKE=0 2020-08-11 13:40:40 +02:00
Martin Kroeker
fee361ae64 fix another source of NO_CBLAS=0 surprise 2020-08-11 13:27:19 +02:00
Martin Kroeker
62f4c84f27 Merge pull request #76 from xianyi/develop
rebase
2020-08-11 13:25:12 +02:00
Marius Hillenbrand
e115c97e05 s390x/SGEMM: adjust default P and Q to multiples of M
We recently changed the register blocking for SGEMM on s390x to 16x4.
However, we did not adjust Q to a multiple of 16 and thus fell back to
the 8x4 kernel at each block's margin, without need. Adjust P and Q to
multiples of 16 to employ the faster 16x4 kernel for complete full-sized
blocks.

Signed-off-by: Marius Hillenbrand <mhillen@linux.ibm.com>
2020-08-11 12:56:46 +02:00
Marius Hillenbrand
07c334e7be s390x: Factor out small block sizes for SGEMM/DGEMM on z14
For small register blockings that are too small to fill up vector
registers with column vectors, we currently use a generic code block.
Replace that with instantiations of the generic code as individual
functions, so that the compiler can optimize each one separately.

Signed-off-by: Marius Hillenbrand <mhillen@linux.ibm.com>
2020-08-11 12:56:39 +02:00
Marius Hillenbrand
e2828e30aa s390x: Optimize SGEMM/DGEMM blocks for z14 with explicit loop unrolling/interleaving
Improve performance of SGEMM and DGEMM on z14 and z15 by unrolling and
interleaving the inner loop of the SGEMM 16x4 and DGEMM 8x4 blocks.
Specifically, we explicitly interleave vector register loads and
computation of two iterations.

Note that this change only adds one C function, since SGEMM 16x4 and
DGEMM 8x4 actually map to the same C code: they both hold intermediate
results in a 4x4 grid of vector registers, and the C implementation is
built around that.

Signed-off-by: Marius Hillenbrand <mhillen@linux.ibm.com>
2020-08-11 12:55:42 +02:00
Martin Kroeker
7219c9cb87 Merge pull request #2764 from martin-frbg/lapacktests
Fix array overruns in the LIN part of the LAPACK testsuite
2020-08-10 13:27:51 +02:00
Martin Kroeker
c9d32674ea Add memory barrier to the blas_lock implementation for Linux
as recommended by cparrott73 in #2760
2020-08-09 19:17:04 +02:00
Martin Kroeker
64259d521a Fix use of unallocated array in workspace query and wrong type of argument to xSCAL 2020-08-09 13:02:27 +02:00
Martin Kroeker
6f5ca44c1a Expand TAU array as SGEMQR/DGEMQR read elements 2 and 3 2020-08-09 12:59:20 +02:00
Martin Kroeker
d28b3f2776 Create Jenkinsfile for OSUOSL PowerCI 2020-08-08 18:05:20 +02:00
Martin Kroeker
ba3f7b3acf Merge pull request #2761 from RajalakshmiSR/Makefile_err
Remove extra symbol in Makefile
2020-08-08 12:20:04 +02:00
Rajalakshmi Srinivasaraghavan
475b5c95b9 Remove extra symbol in Makefile
While trying out different unroll values, noted that
make failed due to this extra symbol.
2020-08-07 15:27:44 -05:00
Martin Kroeker
cd60080d4a Merge pull request #2758 from martin-frbg/undef_shift
Fix GCC ubsan warnings in x86_64 complex dot and gemv_t kernels
2020-08-03 23:30:26 +02:00
Martin Kroeker
4847bfdddd Merge pull request #2757 from martin-frbg/cmake64
Fix lapack-tests linking to a suffixed libopenblas in cmake builds
2020-08-02 23:05:21 +02:00
Martin Kroeker
81dcfdcf39 Multiply by 2 instead of left-shifting a potentially negative number
fixes GCC ubsan warning in the BLAS tests
2020-08-02 18:29:56 +02:00
Martin Kroeker
0ef4b3f1f2 Multiply instead of doing a left shift of a potentially negative number
fixes GCC ubsan report in the BLAS tests
2020-08-02 18:27:40 +02:00
Martin Kroeker
aa53a8a5cb Multiply by two instead of left-shifting one place
fixes GCC ubsan report of "left shift of negative value -2" in the BLAS tests
2020-08-02 18:25:09 +02:00
Martin Kroeker
aa3a1e7d8c Multiply by two rather than left shift by one place
fixes GCC ubsan report of "left shift of negative value -2" in the BLAS tests
2020-08-02 18:22:31 +02:00
Martin Kroeker
aaf1a17168 Apply current library name suffix 2020-08-02 17:58:33 +02:00
Martin Kroeker
53add6a80d Apply library name suffix to openblas if any 2020-08-02 17:57:12 +02:00
Martin Kroeker
9eb897cc01 Merge pull request #75 from xianyi/develop
rebase
2020-08-02 17:50:06 +02:00
Martin Kroeker
7cead56258 Merge pull request #2753 from martin-frbg/issue2751
Add SYMBOLPREFIX and/or SYMBOLSUFFIX to cblas prototypes
2020-08-02 15:32:46 +02:00
Martin Kroeker
6794ac3415 Add SYMBOLPREFIX and/or -SUFFIX to cblas.h if needed 2020-08-02 11:20:08 +02:00
Martin Kroeker
ecf4b9e0fc Improve substitution rules for SYMBOLPREFIX and -SUFFIX addition 2020-08-01 17:06:03 +02:00
Martin Kroeker
dfe5d09641 Merge pull request #2756 from martin-frbg/issue2755
Protect against inadvertent activation of USE_CUDA
2020-08-01 15:19:02 +02:00
Martin Kroeker
60cd5e55fc Protect against inadvertent activation of USE_CUDA 2020-08-01 12:31:39 +02:00
Martin Kroeker
da9e2a7ada Add SYMBOLPREFIX and/or SYMBOLSUFFIX to cblas prototypes 2020-07-31 16:03:33 +02:00
Martin Kroeker
c88cbc5e0d Merge pull request #2752 from kadler/cpuid_aix
Use systemcfg APIs for CPU detection on AIX
2020-07-31 12:52:24 +02:00
Kevin Adler
589c74aed3 Use systemcfg APIs for CPU detection on AIX
AIX libc already provides ready access to an integer that contains a bit
identifying the CPU it's running on, so there's no need to call a
program and grep its output. Additionally, prtconf is not available in
the PASE runtime, which provides an AIX emulation layer on the IBM i
operating system.

The AIX systemcfg.h also provides macro definitions like POWER_8,
POWER_9, etc for all the bits defining the CPUs as well as macros like
__power_8(), __power_9_andup() that return booleans, but I did not use
them. Since these macros depend on the level of the OS in which it is
built, they may not be defined and instead the associated hex literals
are used directly.
2020-07-30 21:06:40 -05:00
Martin Kroeker
104aa678b0 Fix inadvertent version number reversal to 0.3.9.dev caused by #2710 2020-07-30 11:40:52 +02:00
Martin Kroeker
c6b48e0394 Merge pull request #2749 from martin-frbg/make_ppc
Reorganize OpenMP build options for POWER and allow compiling for POWER9 with old gcc
2020-07-30 11:35:53 +02:00
Martin Kroeker
4927251298 Merge pull request #2750 from RajalakshmiSR/dgemv_p10
dgemv optimization for POWER10
2020-07-30 10:13:19 +02:00
Rajalakshmi Srinivasaraghavan
f77b6a83f4 dgemv optimization for POWER10
Making use of new vector pair POWER10 instructions in dgemv_n and dgemv_t.
Also adding a new block 4x128 to make use of Matrix-Multiply Assist (MMA)
feature introduced in POWER ISA v3.1.  Tested on simulator and there
are no new test failures.
2020-07-29 18:59:32 -05:00
Martin Kroeker
39724e8128 Separate OpenMP handling and allow compilation of Power9 code with older gcc 2020-07-30 01:14:08 +02:00
Martin Kroeker
525db5401c Merge pull request #74 from xianyi/develop
rebase
2020-07-30 01:04:09 +02:00
Martin Kroeker
cb097beba2 Merge pull request #2741 from martin-frbg/issue2739
Adjust A53 SGEMM parameters to reflect recent switch to 8x8 kernel
2020-07-29 10:01:14 +02:00
Martin Kroeker
7c02f4b1f7 Merge pull request #2744 from martin-frbg/issue2738
Add AMD Renoir/Matisse cpu autodetection and preliminary support for Zen3
2020-07-28 19:32:04 +02:00
Martin Kroeker
383262035d Merge pull request #2740 from RajalakshmiSR/clang-power
Fix compilation issues with clang on POWER
2020-07-28 18:15:25 +02:00
Martin Kroeker
5fa581c87e Put hint to use git develop rather than master branch in README 2020-07-28 14:22:41 +00:00
Martin Kroeker
12918358aa Add AMD Renoir/Matisse and preliminary support for Zen3 as Zen2
also support AMD family 22 Jaguar/Puma as Bobcat
2020-07-28 13:53:17 +00:00
Martin Kroeker
200f5c44cc Add AMD Renoir models and preliminary support for ZEN3 as ZEN2
also remap erroneous family 16 entry to BOBCAT and reclaim erroneous family 25 "Barcelona" for Zen3
2020-07-28 13:45:23 +00:00
Martin Kroeker
64e2e4aaf3 missing braces 2020-07-27 20:19:22 +00:00
Martin Kroeker
921ec4e9e2 Adjust A53 SGEMM parameters to reflect move to 8x8 kernel 2020-07-27 19:54:46 +00:00
Rajalakshmi Srinivasaraghavan
d557584b71 Fix compilation issues with clang on POWER
As gcc defaults to -malign-power, removing that option. Also
adding -fno-integrated-as to use GNU assembler for powerpc
assembly optimization files. Fixed other compilation errors
reported in dgemv_t.c file.
2020-07-27 14:11:07 -05:00
Martin Kroeker
a4ceb1ade9 Merge pull request #2737 from ashwinyes/add_thunderx3_target
ARM64: Add THUNDERX3T110 Target
2020-07-27 15:19:47 +02:00
Ashwin Sekhar T K
4e1be0e481 ARM64: Add THUNDERX3T110 Target 2020-07-26 23:32:24 -07:00
Martin Kroeker
49b83e00b7 Merge pull request #2735 from martin-frbg/move_potrf
Move potrf_parallel.c from lapack/getrf to lapack/potrf where it belongs
2020-07-26 19:54:11 +02:00
Martin Kroeker
769ed9ffad Merge pull request #2734 from RajalakshmiSR/p10_fix
Fix to store results in correct order for POWER10 GEMM kernels
2020-07-25 09:02:32 +02:00
Martin Kroeker
f194ad59e1 Use _Atomic instead of volatile where available (file moved from ../getrf)
must have misplaced this in ../getrf when I made that change in March 2018 (40160ff)
the only changes since then were 
RFC : Add half precision gemm for bfloat16 in OpenBLAS Rajalakshmi Srinivasaraghavan
Rajalakshmi Srinivasaraghavan committed on 14 Apr 2020 as 7ebbb50

    Change _STDC_VERSION__ to __STDC_VERSION__ 
Zhiyong Dang committed on 11 May 2018 as 3716267
2020-07-25 08:52:24 +02:00
Martin Kroeker
4fda217f99 Delete potrf_parallel.c (moving it to ../potrf) 2020-07-25 06:42:39 +00:00
Rajalakshmi Srinivasaraghavan
9be2688c78 Fix to store results in correct order for POWER10 GEMM kernels
There is a recent compiler change in __builtin_mma_disassemble_acc() which
affects the order of storing result in POWER10. Also removing new LDFLAG
-mno-power10-stub as it is handled by linker automatically.
2020-07-24 23:08:11 -05:00
Martin Kroeker
6a2a60038c Merge pull request #2720 from martin-frbg/issue2694
WIP Further fixes for 32bit POWER8
2020-07-24 23:19:45 +02:00
Martin Kroeker
251a09ec90 Typo fix 2020-07-24 16:04:58 +00:00
Martin Kroeker
95d37e1575 Regroup the 32 and 64bit sections and restore 64bit CAXPY 2020-07-24 10:13:46 +00:00
Martin Kroeker
3523bb778e Merge pull request #2721 from martin-frbg/p8align
Fix alignment errors in the power8 saxpy kernel
2020-07-24 11:06:20 +02:00
Martin Kroeker
a50d0e29c8 Merge pull request #2731 from martin-frbg/pgippc
Fixes for compilation on POWER with PGI compilers
2020-07-24 11:05:16 +02:00
Martin Kroeker
bf1f0734ff Use OPENBLAS_MAKE_COMPLEX_FLOAT on PPC only 2020-07-23 20:40:13 +00:00
Martin Kroeker
ca3561cab9 Add ifdefs around call to altivec microkernel 2020-07-23 18:30:42 +00:00
Martin Kroeker
21072e502a Typo fix 2020-07-23 17:34:56 +00:00
Martin Kroeker
7c6e56b5df Rewrite assignment to complex for better portability 2020-07-23 17:10:59 +02:00
Martin Kroeker
661c6bfa5a Exclude altivec code paths if the compiler does not support them 2020-07-23 17:08:20 +02:00
Martin Kroeker
9796e552ea Avoid undefining NAME,CNAME etc for pgcc as it makes it ignore the new defininitions 2020-07-23 17:03:28 +02:00
Martin Kroeker
d6b6e5ccd7 Merge pull request #73 from xianyi/develop
rebase
2020-07-23 16:59:06 +02:00
Martin Kroeker
349b722d8d Merge pull request #2729 from martin-frbg/issue2728
Unify BUFFER_SIZE settings for x86_64 again to fix DYNAMIC_ARCH crashes
2020-07-22 22:45:57 +02:00
Martin Kroeker
6c33764ca4 Unify BUFFER_SIZE settings for x86_64 again to fix potentially fatal mismatch in DYNAMIC_ARCH builds 2020-07-22 17:30:55 +00:00
Martin Kroeker
d1b9613fd4 Merge pull request #2727 from wyphan/develop
Patch for building on POWERPC with PGI compilers (was Patch for building on Summit)
2020-07-21 17:06:53 +02:00
Martin Kroeker
3cfc74b1a0 Merge pull request #2726 from martin-frbg/2725-2
Add detection of stdatomic.h for cmake
2020-07-21 16:42:06 +02:00
Wileam Phan
9ae154ba89 Patch for building on Summit 2020-07-20 23:30:28 -04:00
Martin Kroeker
9e21a100e3 Add trivial check for stdatomic.h 2020-07-20 22:52:09 +00:00
Martin Kroeker
31d30312dc Merge pull request #72 from xianyi/develop
rebase
2020-07-21 00:49:12 +02:00
Martin Kroeker
fcfb7ffafb Merge pull request #2725 from martin-frbg/ccheck_c11
Have c_check probe availability of C11 atomics support and stdatomic.h
2020-07-18 23:08:08 +02:00
Martin Kroeker
bbe119ee3b Update conditional for atomics to use HAVE_C11 2020-07-18 17:19:59 +00:00
Martin Kroeker
f4f74941bd Update conditional for atomics to use HAVE_C11 2020-07-18 17:14:50 +00:00
Martin Kroeker
a36eb19ae0 Update conditional for C11 atomics to use HAVE_C11 2020-07-18 17:13:24 +00:00
Martin Kroeker
ce45af8151 Update conditional for atomics to use HAVE_C11 2020-07-18 17:09:56 +00:00
Martin Kroeker
6f38de06d2 Update conditional for atomics to use HAVE_C11 2020-07-18 17:09:01 +00:00
Martin Kroeker
09eb9d2584 Update conditional for atomics to HAVE_C11 2020-07-18 17:07:38 +00:00
Martin Kroeker
791e046744 Update conditional for atomics to use HAVE_C11 2020-07-18 17:05:59 +00:00
Martin Kroeker
94bab9d1f9 Update conditional for atomics to use HAVE_C11 2020-07-18 17:03:31 +00:00
Martin Kroeker
97d6eb97b1 Report availability of C11 support 2020-07-18 16:59:33 +00:00
Martin Kroeker
4afd11dae5 Add a check for C11 atomics and stdatomic.h 2020-07-18 16:57:41 +00:00
Martin Kroeker
72ec6280c7 Merge pull request #2724 from martin-frbg/loongsonreadme
Update cross-compiling example in README to reflect change in Loongson gcc
2020-07-18 18:08:40 +02:00
Martin Kroeker
26b7f24d16 Update cross-compiling example to reflect change in Loongson gcc
for #2723
2020-07-18 12:51:37 +00:00
Martin Kroeker
0db4218fed Merge pull request #2722 from martin-frbg/cmakefcheck
Handle lack of fortran compiler more gracefully in cmake
2020-07-17 10:33:03 +02:00
Martin Kroeker
9d000ecaa2 include CheckLanguage module 2020-07-16 22:36:35 +00:00
Martin Kroeker
a847d00366 handle missing lack of fortran compiler more gracefully 2020-07-16 22:17:39 +00:00
Martin Kroeker
0033f8be0d Use vec_vsx_ld/st to fix misaligned accesses flagged by asan 2020-07-16 23:32:54 +02:00
Martin Kroeker
f308e741b2 remove debug output and revert changes to cdot and crot 2020-07-15 10:00:07 +02:00
Martin Kroeker
4f5d26bb02 Merge pull request #2716 from RajalakshmiSR/p10_ldflag
Add new linker option for POWER10
2020-07-15 01:20:54 +02:00
Rajalakshmi Srinivasaraghavan
417c4e8af8 Add new linker option for POWER10
While building with DYNAMIC_ARCH on POWER9 with POWER10
aware toolchain, new LDFLAG is needed to avoid POWER10
instructions on PLT calls .
2020-07-14 11:54:04 -05:00
Martin Kroeker
da17abec87 fix trailing whitespace 2020-07-14 18:20:03 +02:00
Martin Kroeker
f8c2697701 Use POWER6 GEMM, TRMM and DTRSM on 32bit POWER8 2020-07-14 18:11:19 +02:00
Martin Kroeker
b144423f0f Do not define USE_TRMM for 32bit POWER8 2020-07-14 18:10:12 +02:00
Martin Kroeker
bd2498c886 Use POWER6 GEMM parameters on 32bit POWER8 2020-07-14 18:07:58 +02:00
Martin Kroeker
d8e2edfc20 Merge pull request #71 from xianyi/develop
rebase
2020-07-14 18:01:34 +02:00
Martin Kroeker
419b8686d1 Merge pull request #2682 from martin-frbg/aix
[WIP] fix compilation on AIX
2020-07-13 14:43:24 +02:00
Martin Kroeker
3ab15ff34c Merge pull request #2651 from leezu/actionsflang
Add flang build to Github Actions
2020-07-13 13:00:39 +02:00
Martin Kroeker
8916c4ae2c Merge branch 'develop' into actionsflang 2020-07-12 20:37:29 +02:00
Martin Kroeker
4fa283de66 Merge pull request #2706 from jussienko/use-always-omp-threads
Fix OpenMP builds defaulting to singlethreading with OMP_PLACES or OMP_PROC_BIND set
2020-07-12 20:17:11 +02:00
Martin Kroeker
5865c7d4d6 Make 32bit POWER8 use POWER6 kernels for now 2020-07-12 18:59:01 +02:00
Martin Kroeker
ae3a90f78f merge overwritten part of power10 support 2020-07-12 18:51:58 +02:00
Martin Kroeker
009864edde Merge pull request #2710 from martin-frbg/cmake-lapacktest
Add LAPACK-TESTING to the cmake build
2020-07-10 12:06:50 +02:00
Martin Kroeker
3de80b3f5a Merge pull request #2713 from RajalakshmiSR/p10-gcc10
Change minimum gcc version for POWER10
2020-07-10 10:43:33 +02:00
Rajalakshmi Srinivasaraghavan
af1e140e35 Change minimum gcc version for POWER10
As the MMA patches for POWER10 are backported to gcc10.2, changing
the minimum gcc version needed to build OpenBLAS for POWER10.
2020-07-09 21:46:06 -05:00
Martin Kroeker
d4a0299e16 Do not build lapack-test on MSVC for now (same as with BLAS test) 2020-07-09 13:57:27 +02:00
Martin Kroeker
f766024749 enable fortran for cmake 2020-07-09 13:44:25 +02:00
Martin Kroeker
c502760bef Modify for building with OpenBLAS 2020-07-09 13:13:16 +02:00
Martin Kroeker
29b5887d5f Modify for building with OpenBLAS 2020-07-09 13:12:35 +02:00
Martin Kroeker
60188a8c82 Append crude hack for enabling lapack tests in the OpenBLAS build 2020-07-09 11:44:31 +02:00
Martin Kroeker
1d63631afe Add lapack-test 2020-07-09 11:42:02 +02:00
Martin Kroeker
e82bb953a7 Merge pull request #2708 from RajalakshmiSR/p10_future
Changing mcpu option as power10
2020-07-08 12:26:44 +02:00
Martin Kroeker
ed7e155c35 Merge branch 'develop' into aix 2020-07-07 18:52:06 +02:00
Rajalakshmi Srinivasaraghavan
45d819ca82 Changing mcpu option as power10
As compiler enabled mcpu option as power10, changing it from future.
2020-07-07 11:25:20 -05:00
Martin Kroeker
8751a69271 Obtain actual cpu count on AIX and suppress spurious NO_AVX512 on non-x86 2020-07-07 15:46:32 +02:00
Jussi Enkovaara
10a2923f64 fixes #2238
Always obey omp_get_max_threads() when build with USE_OPENMP
2020-07-07 13:35:43 +03:00
Martin Kroeker
5ff83a4261 Merge pull request #2670 from mhillenibm/dumpfullversion_on_gcc7
RFC: Use -dumpfullversion to get minor version on gcc-7 and newer
2020-07-07 00:12:28 +02:00
Martin Kroeker
5bc9680a86 Merge pull request #2703 from martin-frbg/issue2702
Compatibility fix for gcc < 4.7
2020-07-02 22:32:51 +02:00
Martin Kroeker
4ab3651591 Option -mavx2 requires at least gcc 4.7 2020-07-02 17:00:15 +02:00
Martin Kroeker
a83680b40b Merge pull request #69 from xianyi/develop
rebase
2020-07-02 16:56:00 +02:00
Martin Kroeker
c3aa036e99 Merge pull request #2693 from EGuesnet/AIX-build-on-POWER8-32bits
AIX build on POWER8 32bits
2020-07-01 08:29:52 +02:00
EGuesnet
634e1305f9 Update cgemm_kernel_8x4_power8.S 2020-06-30 15:16:39 +02:00
Martin Kroeker
c467516132 Merge pull request #2688 from martin-frbg/cometlake
Add autodetection of Intel Comet Lake H and S models
2020-06-27 17:47:24 +02:00
Martin Kroeker
83f4746825 Add support for Comet Lake H and S 2020-06-27 14:41:24 +02:00
Martin Kroeker
584ef8d4ae Add support for Comet Lake H & S 2020-06-27 14:36:37 +02:00
Martin Kroeker
8dfda02e89 Merge pull request #68 from xianyi/develop
rebase
2020-06-27 14:29:29 +02:00
Martin Kroeker
28d69e0097 Merge pull request #2687 from martin-frbg/utfbom
Strip UTF8 byte order marker from source files
2020-06-26 22:53:09 +02:00
Martin Kroeker
c2467c9619 Merge pull request #2686 from RajalakshmiSR/p10_shgemm
powerpc: Optimized SHGEMM kernel for POWER10
2020-06-26 22:52:45 +02:00
Martin Kroeker
f86e749df4 Merge pull request #2683 from mtreinish/add-comet-lake-support
Add cpu detection support for comet lake U
2020-06-26 12:11:03 +02:00
Martin Kroeker
d199c2787d Merge pull request #2680 from kavanabhat/aix_makefile_fix
Fix for #2671
2020-06-26 11:27:28 +02:00
Martin Kroeker
e30ad0e521 Strip UTF8 byte order marker from source 2020-06-26 09:00:43 +02:00
Rajalakshmi Srinivasaraghavan
d23419accc powerpc: Optimized SHGEMM kernel for POWER10
This patch introduces new optimized version of SHGEMM kernel
using power10 Matrix-Multiply Assist (MMA) feature introduced in
POWER ISA v3.1. This patch makes use of new POWER10 compute instructions
for matrix multiplication operation.

Tested on simulator and there are no new test failures.
2020-06-25 22:19:08 -05:00
Matthew Treinish
2f9c10810c Also set CPUTYPE in get_cpuname() 2020-06-25 15:53:56 -04:00
Matthew Treinish
f37e941d52 Add support to driver/others/dynamic.c too 2020-06-25 11:56:49 -04:00
Matthew Treinish
2a91452bdd Add cpu detection support for comet lake U
Comet Lake U CPUs have family: 6, model: 6, extended family: 0, and
extended model: 10 were not being correctly detected by GETARCH during
openblas builds and would show CORE=UNKNOWN and LIBCORE=unknown. This
commit adds the necessary information to cpuid_x86 to detect extended
family 10 model 6 and return the proper core information. It's
essentially just a skylake cpu, not skylake x, so I just took the used
the same return fields as skylake.
2020-06-25 11:32:09 -04:00
Martin Kroeker
c854ef5471 Fix variable names in conditional 2020-06-25 13:29:52 +02:00
Martin Kroeker
c0afc11742 Fix POWERPC builds on AIX (gcc/gfortran 7)
1. macro preprocessing for POWER8 and later kernels only
2. default buffer size used by AIX version of m4 is too small
2020-06-25 13:12:36 +02:00
Martin Kroeker
c592f0f80a Fix utest build on AIX 2020-06-25 12:58:13 +02:00
Martin Kroeker
3f613b1301 Tentative changes for building on AIX 2020-06-25 12:57:00 +02:00
Martin Kroeker
72a0ec8e75 Fix reading of CPU name from prtconf output on AIX 2020-06-25 12:55:10 +02:00
Martin Kroeker
3446e58daf Fix handling of uname output on AIX 2020-06-25 12:31:35 +02:00
Martin Kroeker
4ca8becc4b Merge pull request #67 from xianyi/develop
rebase
2020-06-25 10:33:03 +02:00
Martin Kroeker
dfe819f3bd Merge pull request #2679 from RajalakshmiSR/P10_GEMM
POWER10 Optimized GEMM kernels
2020-06-25 08:31:38 +02:00
Martin Kroeker
4369e52555 Merge pull request #2677 from brada4/develop
address vs2019 C4293 warning
2020-06-25 08:31:17 +02:00
Gordon Fossum
bb2f52844b powerpc: Optimized ZGEMM kernel for POWER10
This patch introduces new optimized version of ZGEMM kernel
using power10 Matrix-Multiply Assist (MMA) feature introduced in
POWER ISA v3.1. This patch makes use of new POWER10 compute instructions
for matrix multiplication operation.

Tested on simulator and there are no new test failures.
Cycles count reduced by 30-50%  compared to POWER9 version depending on
M/N/K sizes.
2020-06-24 14:50:12 -05:00
Rajalakshmi Srinivasaraghavan
571eadb880 powerpc: Optimized SGEMM/DGEMM/CGEMM for POWER10
This patch introduces new optimized version of SGEMM, CGEMM and DGEMM
using power10 Matrix-Multiply Assist (MMA) feature introduced in
POWER ISA v3.1. This patch makes use of new POWER10 compute instructions
for matrix multiplication operation.

Tested on simulator and there are no new test failures.
Cycles count reduced by 30-50%  compared to POWER9 version depending on
M/N/K sizes.
MMA GCC patch for reference:
https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=8ee2640bfdc62f835ec9740278f948034bc7d9f1
2020-06-24 14:48:15 -05:00
Kavana Bhat
df4ade070f Fix for #2671 2020-06-24 04:25:47 -05:00
User User-User
e6b9275034 address vs2019 C4293 2020-06-24 09:12:23 +03:00
Martin Kroeker
53ea5bfece Merge pull request #66 from xianyi/develop
rebase
2020-06-23 10:13:44 +02:00
Martin Kroeker
93592d1260 Merge pull request #2675 from wjc404/develop
AVX512 DGEMM TCOPY_16 Function
2020-06-23 09:29:02 +02:00
Martin Kroeker
6eaeb01263 Merge pull request #2658 from RajalakshmiSR/p10
powerpc: Add support for future processor
2020-06-23 00:02:37 +02:00
Martin Kroeker
45d542c9d1 Merge pull request #65 from xianyi/develop
rebase
2020-06-21 12:41:01 +02:00
wjc404
086d87a302 AVX512 dgemm tcopy_16 function 2020-06-20 00:07:43 +08:00
Martin Kroeker
af501eb753 Merge pull request #2669 from mhillenibm/zarch_fix_gcc_detection
Zarch fix gcc detection
2020-06-17 17:55:25 +02:00
Martin Kroeker
0eb6c4dded Merge pull request #2672 from mhillenibm/test_num_threads
cpp_thread_test: Change adjustment of concurrency on systems with <52 hw threads
2020-06-17 17:54:31 +02:00
Marius Hillenbrand
de838c38ef cpp_thread_test/dgemv: fail early if concurrency is zero
The two test cases dgemv_tester and dgemm_tester accept the degree of
concurrency as command line argument (amongst others). Fail early if
value 0 has been specified, instead of later with less-clear symptoms.

Signed-off-by: Marius Hillenbrand <mhillen@linux.ibm.com>
2020-06-17 16:15:44 +02:00
Marius Hillenbrand
478898b37a cpp_thread_test/dgemv: cap concurrency to number of hw threads on small systems
... instead of (number of hw threads - 4) to avoid invalid numbers on
smaller systems. Currently, systems with 4 or fewer CPUs (e.g., small CI
VMs) would fail the test. Fixes one of the issues discussed in #2668

Signed-off-by: Marius Hillenbrand <mhillen@linux.ibm.com>
2020-06-17 16:08:48 +02:00
Marius Hillenbrand
cde4690721 RFC: Use gcc -dumpfullversion to get minor version with gcc-7.x
In gcc-7.1, the behavior of -dumpversion changed to be configured
at compile-time. On some distributions it only dumps the major version
(e.g., Ubuntu), so the current checks for the gcc minor version report
false negatives. As a replacement, gcc-7.1 introduced -dumpfullversion
which always prints the full version.

Update the gcc version detection in Makefile.system to employ
-dumpfullversion with gcc-7 and newer.

Posting this patch for discussion, since it emerged from discussions
around issue #2668 and PR #2669. It is not solving a problem right now,
but may be useful in the future.

Signed-off-by: Marius Hillenbrand <mhillen@linux.ibm.com>
2020-06-16 15:45:59 +02:00
Marius Hillenbrand
2389291766 Makefile.system: remove duplicate variable GCCVERSIONGT5
... to bring unified gcc version detection with common variables to the
one remaining spot in Makefile.system.

Signed-off-by: Marius Hillenbrand <mhillen@linux.ibm.com>
2020-06-16 15:06:03 +02:00
Marius Hillenbrand
a2d13ea611 Fix gcc version detection for zarch
Employ common variables for gcc version detection and fix the broken
check for gcc >= 5.2.
Fixes #2668

Signed-off-by: Marius Hillenbrand <mhillen@linux.ibm.com>
2020-06-16 15:06:03 +02:00
Martin Kroeker
1bd3cd66c2 Increment version to 0.3.10.dev 2020-06-14 22:05:19 +02:00
Martin Kroeker
1c53e1366d Increment version to 0.3.10.dev 2020-06-14 22:04:37 +02:00
Martin Kroeker
63b03efc2a Merge pull request #2667 from xianyi/develop
Merge develop into 0.3.0 for 0.3.10 release
2020-06-14 22:03:04 +02:00
Martin Kroeker
95dbeff66d Merge branch 'release-0.3.0' into develop 2020-06-14 22:02:45 +02:00
Martin Kroeker
3b673a24b7 Increment version to 0.3.10.dev 2020-06-14 21:57:52 +02:00
Martin Kroeker
1eb1979050 Increment version to 0.3.10.dev 2020-06-14 21:57:15 +02:00
Martin Kroeker
efc53b6e7e Merge pull request #2665 from martin-frbg/flang-fixes-2a
Fix spelling of flang option -Mrecursive, add -Kieee and workaround for AOCC optimizer bug
2020-06-14 21:56:08 +02:00
Martin Kroeker
72888497e2 Update with 0.3.10 changes 2020-06-14 21:55:31 +02:00
Martin Kroeker
7e3e006af6 Merge pull request #2666 from martin-frbg/blastest
Update BLAS tests to what netlib 3.9.0 uses
2020-06-14 18:28:37 +02:00
Martin Kroeker
d906d14402 Merge pull request #2664 from ACSimon33/exported_symbols
Add missing exported symbols.
2020-06-14 18:27:03 +02:00
Martin Kroeker
3785c0e82b Merge pull request #2663 from martin-frbg/issue2654
Respect predefined defaults for AR, AS, LD and RANLIB
2020-06-14 18:26:43 +02:00
Martin Kroeker
f2d8879af6 Merge pull request #2661 from martin-frbg/issue2660
Report selected DYNAMIC_ARCH kernel rather than one of its aliases in gotoblas_corename
2020-06-14 18:25:37 +02:00
Martin Kroeker
6876221cf3 Remove optimization level limit for flang again and add -fno-unroll-loops for AOCC flang 2.x instead 2020-06-14 17:40:24 +02:00
Martin Kroeker
79cdcde717 Re-enable higher optimization levels for flang while disabling loop unrolling for AOCC flang 2020-06-14 17:18:16 +02:00
Martin Kroeker
18a11137f1 Update BLAS tests to correspond to Reference-LAPACK 3.9.0
replaces calculation of machine precision with call to epsilon intrinsic and removes the requirement for previous output files to be removed before rerunning tests
2020-06-14 10:26:25 +02:00
Martin Kroeker
1dd712131e Fix spelling of flang option -Mrecursive and add -Kieee 2020-06-14 00:09:31 +02:00
Martin Kroeker
0ed2adf0b2 Fix spelling of flang option -Mrecursive and add -Kieee 2020-06-14 00:01:20 +02:00
Martin Kroeker
abf670757b Respect predefined defaults for AR, AS, LD and RANLIB 2020-06-13 23:21:13 +02:00
Simon Märtens
41fc6f3cd2 Added missing exported symbols. 2020-06-13 22:37:39 +02:00
Martin Kroeker
007d9f97d7 Make gotoblas_corename report the name of the selected TARGET rather than its aliases 2020-06-13 19:25:28 +02:00
Martin Kroeker
63d26090f5 Merge pull request #64 from xianyi/develop
rebase
2020-06-13 19:14:47 +02:00
Rajalakshmi Srinivasaraghavan
9fe930f205 powerpc: Add support for future processor
This is the initial patch to support build infrastructure
for POWER10 architecture.
2020-06-11 15:47:20 -05:00
Martin Kroeker
3a1b58d54a Merge pull request #2653 from craft-zhang/cortex-a53
fix INIT8x4 of SGEMM on Arm Cortex-A53
2020-06-10 12:19:33 +02:00
Martin Kroeker
f7659be4a0 Merge pull request #2652 from martin-frbg/flang-fixes
Fixes for compilation with flang binary release 20190329
2020-06-09 20:31:06 +02:00
ZhangDanfeng
bc6fd20a40 fix INIT8x4
Signed-off-by: ZhangDanfeng <467688405@qq.com>
2020-06-10 01:01:16 +08:00
Martin Kroeker
3ce469a34f Limit optimization level to O1 for flang and add -frecursive 2020-06-09 16:11:13 +02:00
Martin Kroeker
ba2c5b404d When building with flang, use it also for the final link step to get dependencies right 2020-06-09 16:09:34 +02:00
Martin Kroeker
f07a80354b Apply previously AOCC-specific workaround to all versions of flang 2020-06-09 16:07:03 +02:00
Martin Kroeker
fdd1b50263 Merge pull request #63 from xianyi/develop
rebase
2020-06-09 15:54:30 +02:00
Leonard Lausen
b98923f33a Test enforce -O1 for flang 2020-06-09 06:54:47 +00:00
Leonard Lausen
4cb1db0e3b Test flang build 2020-06-09 06:31:17 +00:00
Martin Kroeker
430e8b45fe Merge pull request #2648 from martin-frbg/lapack411
Break out of potentially infinite rescaling loop in LAPACK xLARGV/xLARTG/xLARTGP
2020-06-07 19:45:52 +02:00
Martin Kroeker
88fe85f4e0 Merge pull request #2647 from martin-frbg/aocc-flang
Small fixes for flang in general and the AMD AOCC version of it in particular
2020-06-07 19:45:11 +02:00
Martin Kroeker
89091e6b64 Merge pull request #2645 from martin-frbg/misc_fixes
Miscellaneous fixes
2020-06-07 19:44:50 +02:00
Martin Kroeker
522aaf53bf Break out of potentially infinite rescaling loop in LAPACK xLARGV/xLARTG/xLARTGP
Reference-LAPACK issue 411
2020-06-07 14:30:20 +02:00
Martin Kroeker
c3574ffe53 Merge pull request #2646 from wjc404/develop
Optimize AVX512 parallel DGEMM performance
2020-06-07 13:18:22 +02:00
Martin Kroeker
4e28dc6353 Use only -O1 with AMD AOCC version of flang
to prevent miscompilation of LAPACK codes and tests on Ryzen
2020-06-07 00:05:02 +02:00
Martin Kroeker
13c28889a2 Update "cosmetic fixes for non-C99 compilers" 2020-06-06 15:22:27 +02:00
wjc404
0e3ac4a06b Add files via upload 2020-06-06 14:56:57 +08:00
Martin Kroeker
28915eed72 Cosmetic fixes for non-C99 compilers 2020-06-05 10:05:34 +02:00
Martin Kroeker
7f60fb6b91 Delete spurious copy of common_param.h 2020-06-05 10:04:16 +02:00
Martin Kroeker
0464e662ad make blas_quickdivide unsigned and guard against miscompilation 2020-06-05 10:03:36 +02:00
Martin Kroeker
0f9a935a5a Merge pull request #62 from xianyi/develop
rebase
2020-06-05 09:51:06 +02:00
Martin Kroeker
79cd69fea4 Merge pull request #2644 from martin-frbg/cmake-maxstack
Add CMAKE support for MAX_STACK_ALLOC setting
2020-06-05 08:33:48 +02:00
Martin Kroeker
bb12c2c854 Limit MAX_STACK_ALLOC availability to non-Wndows 2020-06-04 19:07:27 +02:00
Martin Kroeker
32c1c1e125 Update azure-pipelines.yml 2020-06-04 19:03:46 +02:00
Martin Kroeker
f1953b8b81 Update azure-pipelines.yml 2020-06-04 17:58:13 +02:00
Martin Kroeker
6e97df7b47 Add CMAKE support for MAX_STACK_ALLOC setting 2020-06-04 14:45:31 +02:00
Martin Kroeker
729303e5ed Merge pull request #2643 from craft-zhang/cortex-a53
Improve performance of SGEMM on Arm Cortex-A53
2020-06-04 07:58:45 +02:00
Martin Kroeker
547965530f Merge pull request #2638 from leezu/actions
Add Github Actions test for DYNAMIC_ARCH builds on Linux and macOS
2020-06-04 00:02:37 +02:00
ZhangDanfeng
9b7877ccf1 sgemm copy source init
Signed-off-by: ZhangDanfeng <467688405@qq.com>
2020-06-04 02:10:45 +08:00
ZhangDanfeng
f82fa802d1 Insert prefetch
Signed-off-by: ZhangDanfeng <467688405@qq.com>
2020-06-04 02:08:48 +08:00
Martin Kroeker
3eda3d34c3 Merge pull request #2641 from martin-frbg/ppcg4
Work around PPC G4 test failures
2020-06-03 16:43:46 +02:00
Martin Kroeker
a8f42ae85c set cmake build type to Release 2020-06-03 15:28:59 +02:00
Martin Kroeker
e6e2e531bc revert clang pragma 2020-06-03 15:16:27 +02:00
Martin Kroeker
456dc04441 Update sgemm_kernel_16x4_skylakex_3.c 2020-06-03 15:15:41 +02:00
Martin Kroeker
89323458a9 preset optimization level for apple clang 2020-06-03 15:07:25 +02:00
Martin Kroeker
e153bdeb70 Update dynamic_arch.yml 2020-06-03 13:46:43 +02:00
Martin Kroeker
c2001f7756 Make cmake build verbose to see options in use 2020-06-03 12:18:15 +02:00
Martin Kroeker
c2b3f0b3f6 Revert "keep Apple Clang from optimizing this" 2020-06-03 10:22:15 +02:00
Martin Kroeker
f16e39554d Change PPCG4 CGEMM_M to match kernel change 2020-06-03 09:15:29 +02:00
Martin Kroeker
b1ee81228a Change complex DOT and ROT to generic kernels and switch CGEMM
in response to test failures seen in #2628 and BLAS-Tester
2020-06-03 09:13:29 +02:00
Martin Kroeker
9f7358d7dc Keep Apple Clang from optimizing this 2020-06-03 08:52:53 +02:00
Martin Kroeker
54fa90fb25 Keep apple clang 11.0.3 from trying to optimize this (and running out of registers) 2020-06-02 17:31:45 +02:00
Leonard Lausen
5a709b8340 Print CPU info in output 2020-06-01 20:51:11 +00:00
Leonard Lausen
b31a68b835 Add Github Actions test for DYNAMIC_ARCH builds 2020-06-01 20:16:59 +00:00
Martin Kroeker
86552bf4c7 Update f_check 2020-05-31 15:22:12 +02:00
Martin Kroeker
a349d48d89 Merge pull request #2636 from martin-frbg/issue2634
Fix CMAKE build Issues on OS X
2020-05-31 15:16:09 +02:00
Martin Kroeker
4db00121dc Disable EXPRECISION and add -lm on OSX (same as the BSDs and Linux) 2020-05-31 12:39:36 +02:00
Martin Kroeker
909897f13b Document option USE_LOCKING 2020-05-31 12:37:57 +02:00
Martin Kroeker
e79245acd9 Merge pull request #2635 from ilayn/patch-1
BUG: Fix the loop range in ZHEEQUB.f
2020-05-30 14:37:12 +02:00
Ilhan Polat
76d2612e0c BUG: Fix the loop range in ZHEEQUB.f 2020-05-30 14:11:11 +02:00
Martin Kroeker
ced49466f0 Use the fortran compiler to link LAPACK-related benchmarks
to fix linking problems with (at least) the AMD version of flang that creates dependencies on more than just the fortran runtime.
2020-05-29 13:35:51 +02:00
Martin Kroeker
6e270f91ec add support for RETURN_BY_STACK semantics, e.g. clang 2020-05-29 13:29:10 +02:00
Martin Kroeker
200296b0f4 remove libomp from link list only for pgfortran
at least the AMD (aocc) flavor of flang wants to link to a (real or dummy) libomp by default
2020-05-29 13:23:51 +02:00
Martin Kroeker
dd7a650792 Merge pull request #59 from xianyi/develop
rebase
2020-05-29 13:06:25 +02:00
Martin Kroeker
4a4c50a7ce Merge pull request #2627 from pkubaj/patch-1
Add powerpc (32-bit)
2020-05-26 08:36:24 +02:00
Martin Kroeker
d069780e63 Merge pull request #2626 from docularxu/working-gcc-version-detections
make GCC version detection OS-independent
2020-05-26 08:35:58 +02:00
pkubaj
33c8790603 Add powerpc (32-bit)
Only powerpc64 is present.
2020-05-25 13:14:09 +02:00
Guodong Xu
06387ac0e6 make GCC version detection OS-independent
Previous design put GCC version detection inside of OSNAME 'WINNT'.
However, such detections are required for 'Linux' and possibly other
OS'es as well. For example, there is usage of the GCC versions
in Makefile.arm64. When compiling on Linux machine, in the previous
design, Markfile.arm64 will not know the correct GCC version.

The fix is to move GCC version detection into common part, not
wrapped by anything.

Signed-off-by: Guodong Xu <guodong.xu@linaro.com>
2020-05-25 10:57:03 +00:00
Martin Kroeker
f1a18d245b Merge pull request #2618 from craft-zhang/cortex-A53
Improve performance of SGEMM and STRMM on Arm Cortex-A53
2020-05-25 12:14:46 +02:00
张丹枫
2a3aa91354 update CONTRIBUTORS.md, adding myself 2020-05-20 22:35:26 +08:00
张丹枫
ea5bdc3f72 split cortex-a53 param to match 8x8 kernel 2020-05-20 22:34:47 +08:00
张丹枫
9df79ae9a3 update sgemm and strmm kernel selecting strategy 2020-05-20 22:26:58 +08:00
张丹枫
a1fc6041cd use general register to speedup 2020-05-20 22:26:58 +08:00
张丹枫
edb423d772 align general register using to strmm_kernel_8x8 2020-05-20 22:26:58 +08:00
zhangdanfeng
0e6eb8c247 sgemm kernel use sgemm_kernel_8x8_cortexa53
Signed-off-by: zhangdanfeng <zhangdanfeng@cloudwalk.cn>
2020-05-20 22:26:58 +08:00
zhangdanfeng
d475db29c6 optimized for cortex-a53
Signed-off-by: zhangdanfeng <zhangdanfeng@cloudwalk.cn>
2020-05-20 22:26:58 +08:00
Martin Kroeker
729ac6bd4a Merge pull request #2623 from mhillenibm/zarch_dgemm_z14
s390x: Use new sgemm kernel also for DGEMM and DTRMM on Z14 (+ small cleanup)
2020-05-20 14:51:04 +02:00
Marius Hillenbrand
89fe17f20e s390x: Use new sgemm kernel also for DGEMM and DTRMM on Z14
Apply our new GEMM kernel implementation, written in C with vector intrinsics,
also for DGEMM and DTRMM on Z14 and newer (i.e., architectures with FP32 SIMD
instructions). As a result, we gain around 10% in performance on z15, in
addition to improving maintainability.

Signed-off-by: Marius Hillenbrand <mhillen@linux.ibm.com>
2020-05-20 10:23:35 +02:00
Marius Hillenbrand
bdd795ed03 s390x/GEMM: replace 0-init with peeled first iteration
... since it gains another ~2% of SGEMM and DGEMM performance on z15;
also, the code just called for that cleanup.

Signed-off-by: Marius Hillenbrand <mhillen@linux.ibm.com>
2020-05-20 10:23:35 +02:00
Martin Kroeker
e1038ea836 Merge pull request #2622 from martin-frbg/issue2619
Improve declaration of LAPACKE_get_nancheck
2020-05-19 23:07:22 +02:00
Martin Kroeker
6baa9a778d Improve declaration of LAPACKE_get_nancheck 2020-05-19 17:59:31 +02:00
Martin Kroeker
cf46c9f84e Merge pull request #2617 from martin-frbg/issue2616
Add workaround for unhandled  gmake jobserver flags in c_check/f_check
2020-05-18 13:23:58 +02:00
Martin Kroeker
55602fce56 Ignore spurious all-numeric library names derived from mishandled jobserver flags 2020-05-17 15:28:14 +02:00
Martin Kroeker
3d5e159e7a Ignore spurious all-numeric library names derived from mishandled jobserver flags 2020-05-17 15:26:57 +02:00
Martin Kroeker
2931feb575 Merge pull request #58 from xianyi/develop
rebase
2020-05-17 15:23:32 +02:00
Martin Kroeker
20245ded5f Merge pull request #2615 from mhillenibm/z14_alignment_hints
s390x: improvise vector alignment hints for older compilers
2020-05-14 21:06:34 +02:00
Marius Hillenbrand
2840432e49 s390x: improvise vector alignment hints for older compilers
Introduce inline assembly so that we can employ vector loads with
alignment hints on older compilers (pre gcc-9), since these are still
used in distributions such as RHEL 8 and Ubuntu 18.04 LTS.

Informing the hardware about alignment can speed up vector loads. For
that purpose, we can encode hints about 8-byte or 16-byte alignment of
the memory operand into the opcodes. gcc-9 and newer automatically emit
such hints, where applicable. Add a bit of inline assembly that achieves
the same for older compilers. Since an older binutils may not know about
the additional operand for the hints, we explicitly encode the opcode in
hex.

Signed-off-by: Marius Hillenbrand <mhillen@linux.ibm.com>
2020-05-14 15:36:03 +02:00
Martin Kroeker
ea78106c71 Merge pull request #2614 from mhillenibm/gemm_vec_z14
s390x: Improve performance of SGEMM and STRMM on z14 and newer
2020-05-13 15:09:23 +02:00
Marius Hillenbrand
cb9dc36dd5 Update CONTRIBUTORS.md
Signed-off-by: Marius Hillenbrand <mhillen@linux.ibm.com>
2020-05-12 16:14:00 +02:00
Marius Hillenbrand
1b0b4349a1 s390x/Z14: Change register blocking for SGEMM to 16x4
Change register blocking for SGEMM (and STRMM) on z14 from 8x4 to 16x4
by adjusting SGEMM_DEFAULT_UNROLL_M and choosing the appropriate copy
implementations. Actually make KERNEL.Z14 more flexible, so that the
change in param.h suffices. As a result, performance for SGEMM improves
by around 30% on z15.

On z14, FP SIMD instructions can operate on float-sized scalars in
vector registers, while z13 could do that for double-sized scalars only.
Thus, we can double the amount of elements of C that are held in
registers in an SGEMM kernel.

Signed-off-by: Marius Hillenbrand <mhillen@linux.ibm.com>
2020-05-12 15:59:51 +02:00
Marius Hillenbrand
71b6eaf459 s390x: Use new sgemm kernel also for strmm on Z14 and newer
Employ the newly added GEMM kernel also for STRMM on Z14. The
implementation in C with vector intrinsics exploits FP32 SIMD operations
and thereby gains performance over the existing assembly code. Extend
the implementation for handling triangular matrix multiplication,
accordingly. As added benefit, the more flexible C code enables us to
adjust register blocking in the subsequent commit.

Tested via make -C test / ctest / utest and by a couple of additional
unit tests that exercise blocking.

Signed-off-by: Marius Hillenbrand <mhillen@linux.ibm.com>
2020-05-12 15:59:51 +02:00
Marius Hillenbrand
43c0d4f312 s390x: Add vectorized sgemm kernel for Z14 and newer
Add a new GEMM kernel implementation to exploit the FP32 SIMD
operations introduced with z14 and employ it for SGEMM on z14 and newer
architectures.

The SIMD extensions introduced with z13 support operations on
double-sized scalars in vector registers. Thus, the existing SGEMM code
would extend floats to doubles before operating on them. z14 extended
SIMD support to operations on 32-bit floats. By employing these
instructions, we can operate on twice the number of scalars per
instruction (four floats in each vector registers) and avoid the
conversion operations.

The code is written in C with explicit vectorization. In experiments,
this kernel improves performance on z14 and z15 by around 2x over the
current implementation in assembly. The flexibilty of the C code paves
the way for adjustments in subsequent commits.

Tested via make -C test / ctest / utest and by a couple of additional
unit tests that exercise blocking (e.g., partial register blocks with
fewer than UNROLL_M rows and/or fewer than UNROLL_N columns).

Signed-off-by: Marius Hillenbrand <mhillen@linux.ibm.com>
2020-05-12 15:59:51 +02:00
Marius Hillenbrand
d7c1677c20 Update CONTRIBUTORS.md, adding myself
Signed-off-by: Marius Hillenbrand <mhillen@linux.ibm.com>
2020-05-12 11:09:28 +02:00
Marius Hillenbrand
0dbe61a612 s390x: choose SIMD kernels at run-time based on OS and compiler support
Extend and simplify the run-time detection for dynamic architecture support for z
to check HW_CAP and only use SIMD features if advertised by the OS.
While at it, also honor the env variable LD_HWCAP_MASK and do not use
the CPU features masked there.

Note that we can only use the SIMD features on z13 or newer (i.e.,
Vector Facility or Vector-Enhancements Facilities) when the operating
system supports properly context-switching the vector registers. The OS
advertises that support as a bit in the HW_CAP value in the auxiliary
vector. While all recent Linux kernels have that support, we should
maintain compatibility with older versions that may still be in use.

Signed-off-by: Marius Hillenbrand <mhillen@linux.ibm.com>
2020-05-12 11:01:16 +02:00
Marius Hillenbrand
62cf391cbb s390x: only build kernels supported by gcc with dynamic arch support
When building with dynamic arch support, only build kernels for
architectures that are supported by the gcc we are building with.

Signed-off-by: Marius Hillenbrand <mhillen@linux.ibm.com>
2020-05-12 11:01:16 +02:00
Marius Hillenbrand
8c338616f9 s390x: gate dynamic arch detection on gcc version and add generic
When building OpenBLAS with DYNAMIC_ARCH=1 on s390x (aka zarch), make
sure to include support for systems without the facilities introduced
with z13 (i.e., zarch_generic). Adjust runtime detection to fallback to
that generic code when running on a unknown platform other than Z13
through Z15.

When detecting a Z13 or newer system, add a check for gcc support for
the architecture-specific features before selecting the respective
kernel. Fallback to Z13 or generic code, in case.

Signed-off-by: Marius Hillenbrand <mhillen@linux.ibm.com>
2020-05-12 11:01:16 +02:00
Martin Kroeker
f94c53ec0a Merge pull request #2612 from RajalakshmiSR/testshgemm
Improve shgemm test
2020-05-12 08:34:02 +02:00
Rajalakshmi Srinivasaraghavan
8efba9b7c0 Improve shgemm test
This patch adds another check to test shgemm results.
2020-05-11 17:15:10 -05:00
Martin Kroeker
4fffa556d8 Merge pull request #2611 from RajalakshmiSR/bench_half
Include shgemm in benchtest
2020-05-11 21:08:41 +02:00
Rajalakshmi Srinivasaraghavan
ce90e2bd3f Include shgemm in benchtest
This patch is to enable benchtest for half precision gemm
when BUILD_HALF is set during make.
2020-05-11 09:57:46 -05:00
Martin Kroeker
948b6712ba Merge pull request #2610 from martin-frbg/issue2552-3
Temporary workaround for excessive LAPACK test failures with COMPLEX on Skylake-X
2020-05-10 13:10:31 +02:00
Martin Kroeker
2271c3506b Work around excessive LAPACK test failures on Skylake-X
Something in the plain C parts of x86_64 cscal.c and zscal.c appears to be miscompiled by both gfortran9 and ifort when compiling for skylakex-avx512, even when the optimized Haswell microkernel is not in use.
2020-05-09 23:49:18 +02:00
Martin Kroeker
db00b21445 Merge pull request #2609 from martin-frbg/issue2552-2
Correct ifort options
2020-05-09 21:33:02 +02:00
Martin Kroeker
58d26b4448 Correct ifort options
to same as suggested by reference-lapack
2020-05-09 17:15:36 +02:00
Martin Kroeker
8e47d14053 Merge pull request #2608 from martin-frbg/issue2604
Handle trailing whitespace and empty variables in KERNEL files
2020-05-09 16:36:14 +02:00
Martin Kroeker
cd10b35fe9 Handle trailing spaces and empty condition variables 2020-05-09 13:42:33 +02:00
Martin Kroeker
9472dd99cd Merge pull request #57 from xianyi/develop
rebase
2020-05-09 13:20:44 +02:00
Martin Kroeker
7181665452 Merge pull request #2605 from RajalakshmiSR/cmake-power
Fix cmake compilation issue - POWER9
2020-05-09 11:29:28 +02:00
Rajalakshmi Srinivasaraghavan
bd9ff820bc Fix cmake compilation issue - POWER9
This patch removes extra space in the sgemmotcopy filename
thereby allowing it to create entry in kernel/Makefile
created by cmake.
2020-05-08 20:31:56 -05:00
Martin Kroeker
63e45def70 Merge pull request #2603 from martin-frbg/issue2552
Add FFLAGS_DRV entry to the generated make.inc to fix lapack-test failure with Intel compilers
2020-05-08 22:08:39 +02:00
Martin Kroeker
ec0f228632 Add FFLAGS_DRV to the generated make.inc to fix lapack-test on x86_64 with icc/ifort
fixes #2552
2020-05-08 18:06:12 +02:00
Martin Kroeker
90e2941c61 Merge pull request #56 from xianyi/develop
rebase
2020-05-07 22:43:48 +02:00
Martin Kroeker
10d5f3c87b Merge pull request #2602 from ashwinyes/thunderx2_develop
DAXPY Optimizations for ThunderX2
2020-05-07 22:06:41 +02:00
Ashwin Sekhar T K
8353cb245a ARM64: Improve DAXPY for ThunderX2
Improve performance of DAXPY for ThunderX2
when the vector fits in L1 Cache.
2020-05-07 09:22:50 -07:00
Martin Kroeker
ec2dd7b875 Merge pull request #2601 from martin-frbg/issue818
Undefine NAME/CNAME etc  in Makefile.system before defining them
2020-05-07 10:12:33 +02:00
Martin Kroeker
4e82eb9f8a Undefine ASMNAME/NAME/CNAME before defining them
to avoid redefinition warning when environment variables like CFLAGS are being used (fixes #818)
2020-05-07 00:31:32 +02:00
Martin Kroeker
61300bb735 Merge pull request #55 from xianyi/develop
rebase
2020-05-07 00:27:14 +02:00
Martin Kroeker
33e9b12464 Merge pull request #2597 from martin-frbg/appleclang
Use Clang 9.0.0 miscompilation fix for corresponding AppleClang version as well
2020-05-05 13:55:08 +02:00
Martin Kroeker
90dba9f716 Duplicate earlier Clang 9.0.0 workaround for corresponding Apple Clang version
As discussed on the original PR #2329, the "Apple Clang 11.0.3" that appears to be based the same LLVM release produces the same miscompilation of this file.
2020-05-05 10:44:50 +02:00
Martin Kroeker
424d551e01 Merge pull request #53 from xianyi/develop
rebase
2020-05-01 15:18:46 +02:00
Martin Kroeker
596f5df9e8 Merge pull request #2591 from RajalakshmiSR/testhalf
Add test for shgemm
2020-05-01 09:59:39 +02:00
Martin Kroeker
5dd14e3d48 Make building the bfloat16 functions conditional on option BUILD_HALF (#2590)
* make building the bfloat16 BLAS functions conditional on BUILD_HALF

* pass the BUILD_HALF option to gensymbol

* Pass BUILD_HALF as a compiler define for dynamic_arch builds
2020-05-01 09:58:30 +02:00
Martin Kroeker
a54e35e780 Merge pull request #2586 from martin-frbg/miscfixes
Trivial fix for compiler warnings
2020-04-29 22:01:41 +02:00
Rajalakshmi Srinivasaraghavan
564b0d39ef Add test for shgemm
This patch has Makefile changes to add test for shgemm which
compares sgemm and shgemm result.
2020-04-29 13:40:34 -05:00
Martin Kroeker
5d58b11101 Merge pull request #52 from xianyi/develop
rebase
2020-04-29 14:36:15 +02:00
Martin Kroeker
d394d4e677 Merge pull request #2585 from martin-frbg/mips64fix
Increase default BUFFER_SIZE on MIPS64
2020-04-28 19:47:55 +02:00
Martin Kroeker
f4248af26e Fix compiler warnings 2020-04-28 10:43:12 +02:00
Martin Kroeker
2d89603e9d Increase BUFFER_SIZE on mips64 to match SGEMM parameters 2020-04-28 10:40:40 +02:00
Martin Kroeker
26bc15258a Merge pull request #51 from xianyi/develop
rebase
2020-04-28 10:38:50 +02:00
Martin Kroeker
141998dce2 Merge pull request #2584 from martin-frbg/issue2583
[WIP] Have CMAKE parse conditional lines in KERNEL files
2020-04-28 10:35:12 +02:00
Martin Kroeker
3bd56846bb Silence a debug message 2020-04-27 16:27:09 +02:00
Martin Kroeker
e7bbdfdf84 Have CMAKE parse conditional lines in KERNEL files
Supports ifeq and ifneq, but requires both to have an else branch
2020-04-27 15:20:03 +02:00
Martin Kroeker
b6795db731 Merge pull request #2582 from martin-frbg/mips32fix
Increase BUFFER_SIZE on MIPS32 to accomodate SGEMM requirements
2020-04-27 09:18:34 +02:00
Martin Kroeker
5e0dbf8dfe Increase default BUFFER_SIZE to accomodate SGEMM parameters
in response to compile-time warning from #2551
2020-04-26 22:21:05 +02:00
Martin Kroeker
955d73127f Merge pull request #50 from xianyi/develop
rebase
2020-04-26 22:17:56 +02:00
Martin Kroeker
a8c1bea7ae Merge pull request #2581 from martin-frbg/raji
Fix travis configuration and update CONTRIBUTORS.md
2020-04-25 19:57:10 +02:00
Martin Kroeker
e43b49e064 Drop the set -e from travis scripts 2020-04-25 16:18:54 +02:00
Martin Kroeker
3e28db7f38 Update CONTRIBUTORS.md 2020-04-25 13:51:44 +02:00
Martin Kroeker
4b69ee31af Merge pull request #2580 from martin-frbg/issue2538-3
Increase POWER8 ZGEMM_R and use same R values for POWER9
2020-04-25 00:28:18 +02:00
Martin Kroeker
03ff213c51 Increase POWER8 ZGEMM_R and use same R values for POWER9
fixes lapack-test zger failures seen in #2299 after application of my PR #2551
2020-04-24 21:46:54 +02:00
Martin Kroeker
299d1c8de0 Merge pull request #2578 from martin-frbg/issue2576
Quote getarch include paths in prebuild.cmake
2020-04-24 14:32:46 +02:00
Martin Kroeker
70869d571f Quote include paths for getarch to protect any embedded spaces 2020-04-24 10:30:44 +02:00
Martin Kroeker
cba87222b2 Merge pull request #49 from xianyi/develop
rebase
2020-04-24 10:21:48 +02:00
Martin Kroeker
f80dd2151e xcode 11.4.1 for homebrew ? 2020-04-23 14:31:09 +02:00
Martin Kroeker
4412ee1754 Switch homebrew build env to new xcode 11.4
default 11.3.1 in the github image is causing brew to fail with "outdated xcode" message
2020-04-23 10:54:46 +02:00
Martin Kroeker
f6104b68c1 Merge pull request #2571 from martin-frbg/issue2299
Work around IDAMAX/IZAMAX bugs on POWER8BE with ELFv2 FreeBSD
2020-04-22 18:27:13 +02:00
Martin Kroeker
84f2c71e93 Merge pull request #2573 from martin-frbg/issue2572
Enable cblas interfaces to GEMM3M in CMAKE builds
2020-04-22 15:04:49 +02:00
Martin Kroeker
06208c8d01 Limit this fix to ELFv2 builds 2020-04-22 14:16:40 +02:00
Martin Kroeker
c90b28dee6 Export ELF_VERSION for use in powerpc kernel configurations 2020-04-22 14:14:20 +02:00
Martin Kroeker
6275b43918 Avoid duplicate printout of byte order and report ELF_VERSION 2020-04-22 14:12:27 +02:00
Martin Kroeker
2db5178e2d enable cblas interfaces to GEMM3M in CMAKE builds 2020-04-22 11:01:28 +02:00
Martin Kroeker
57549f5c92 Merge pull request #2569 from martin-frbg/issue2472-2
Fix linker option passing for MSVS and ReLAPACK
2020-04-21 20:26:53 +02:00
Martin Kroeker
f5c4c28b98 Work around POWER8BE bugs on FreeBSD (ELFv2)
for #2299
2020-04-21 17:17:17 +02:00
Martin Kroeker
239282d5e2 Use CMAKE_SHARED_LINKER_FLAGS to pass MSVC linker option
target_link_libraries does not work here according to issue 2472
2020-04-20 22:30:51 +02:00
Martin Kroeker
568674477c Merge pull request #48 from xianyi/develop
rebase
2020-04-20 21:51:59 +02:00
Martin Kroeker
fa42588e1f Merge pull request #2565 from martin-frbg/mips24k
Support MIPS32 24K family as P5600
2020-04-20 17:13:53 +02:00
Martin Kroeker
8a6d26458b Merge pull request #2559 from RajalakshmiSR/shgemm
Add half precision gemm for bfloat16 in OpenBLAS
2020-04-19 22:09:55 +02:00
Martin Kroeker
db86f516b9 Merge pull request #2568 from martin-frbg/azure-win
Add a Windows/CL build job to the Azure CI
2020-04-19 19:06:33 +02:00
Martin Kroeker
aec353b5a7 Add a Windows/CL build to the Azure Ci configuration 2020-04-19 19:04:33 +02:00
Martin Kroeker
c62fbefad4 Merge pull request #2567 from xianyi/revert-2566-azurewin
Revert "Add Windows build job on Azure CI"
2020-04-19 19:01:58 +02:00
Martin Kroeker
04706e760d Revert "Add Windows build job on Azure CI (#2566)"
This reverts commit e1e543b145.
2020-04-19 19:00:37 +02:00
Martin Kroeker
e1e543b145 Add Windows build job on Azure CI (#2566)
* Add Windows-CL build job on Azure
2020-04-19 16:16:15 +02:00
Martin Kroeker
e55ec82bb9 Delete KERNEL.1004K 2020-04-19 15:44:30 +02:00
Martin Kroeker
7353ea5afc Delete KERNEL.24K 2020-04-19 15:44:19 +02:00
Martin Kroeker
6a04efb122 Rename KERNEL files to include MIPS prefix 2020-04-19 15:43:54 +02:00
Martin Kroeker
5afb66812f Update getarch.c 2020-04-19 14:55:31 +02:00
Martin Kroeker
0d18f231fc Update getarch.c 2020-04-19 13:52:58 +02:00
Martin Kroeker
2f4a8e5bc4 Rename the FORCE entries for 24K and 1004K to include the MIPS prefix 2020-04-19 13:22:19 +02:00
Martin Kroeker
4f70512b97 Update kernel.cmake 2020-04-19 08:10:26 +02:00
Martin Kroeker
8792fc4d5f Disable RPCC macro on MIPS24K 2020-04-19 07:21:48 +02:00
Martin Kroeker
577c5d9f8f Update README.md 2020-04-19 06:54:52 +02:00
Martin Kroeker
6721f2750e Update TargetList.txt 2020-04-19 06:51:57 +02:00
Martin Kroeker
b0b02a080d Add compiler options for MIPS32 24K/1004K 2020-04-19 06:50:51 +02:00
Martin Kroeker
a1fc98dc57 rename 1004K, 24K to MIPS1004K, MIPS24K to avoid identifier naming problem 2020-04-18 23:50:23 +02:00
Martin Kroeker
d0737b0142 Update kernel.cmake 2020-04-18 21:36:28 +02:00
Martin Kroeker
7dbb59b256 Update common_macro.h 2020-04-18 21:34:14 +02:00
Martin Kroeker
00172d440b Typo fix in MIPS24K addition 2020-04-18 21:16:49 +02:00
Martin Kroeker
d712ea724c Add MIPS24K support 2020-04-18 21:10:18 +02:00
Martin Kroeker
61bbae3ac1 Handle MIPS24K like P5600
and allow enforcing TARGET=1004K as well (omission from earlier 1004K merge and later introduction of TARGET check)
2020-04-18 21:09:32 +02:00
Martin Kroeker
1c1ca2bc0a Merge pull request #47 from xianyi/develop
rebase
2020-04-18 21:07:14 +02:00
Martin Kroeker
c7d668c248 Update common_macro.h 2020-04-18 16:04:38 +02:00
Martin Kroeker
a83a59b038 Use generic kernels for ishama,shasum,shdot,shrot 2020-04-18 15:53:51 +02:00
Martin Kroeker
0a19bd813c Use generic codes for shamax and shcopy 2020-04-18 12:52:51 +02:00
Martin Kroeker
e7afe8a969 Define AXPBY_K fallback for float16 2020-04-18 11:10:15 +02:00
Martin Kroeker
f361de30a3 Use generic axpy.c for SHAXPY as x86 lacks saxpy.c 2020-04-18 11:07:16 +02:00
Martin Kroeker
9f6d6f6cb6 use saxpy.c instead of axpy.S for SHAXPY 2020-04-17 22:27:58 +02:00
Rajalakshmi Srinivasaraghavan
22bb50fb81 cmake fixes 2020-04-17 13:35:17 -05:00
Martin Kroeker
236a3d8ce6 Merge pull request #2563 from zelong-1024/develop
[OpenBLAS]: benchmark error of potrf
2020-04-16 11:45:32 +02:00
l00536773
6b7ef6543a [OpenBLAS]: benchmark error of potrf
[description]: when the matrix size goes higher than 5800 during the cpotrf test, error info, such as "Potrf info = 5679", will be returned on ARM64 and x86 machines. Uplo = L & F.
[solution]: changed the func for building the matrix so that the complex Hermitian matrix can stay positive definite during the computation.
[dts]:
2020-04-16 10:55:10 +08:00
Rajalakshmi Srinivasaraghavan
67cc4b9e16 Fix warnings in clang and export symbol 2020-04-15 19:15:23 -05:00
Martin Kroeker
250e6f8039 Merge pull request #2557 from martin-frbg/dronebadge
Update and reformat README
2020-04-15 20:23:43 +02:00
Martin Kroeker
7a6d0016b0 Merge pull request #2556 from martin-frbg/epicdrone
Add a drone.io multithread test for x86_64
2020-04-15 20:23:17 +02:00
Martin Kroeker
e8e8a6e608 Restore USE_OPENMP in the x86 thread test 2020-04-15 19:26:12 +02:00
Martin Kroeker
579811fb6a Move all 19.04-based jobs back to ubuntu 18.04 2020-04-15 17:38:33 +02:00
Rajalakshmi Srinivasaraghavan
a87793e03c Fix DYNAMIC_ARCH compilation errors 2020-04-15 09:09:50 -05:00
Rajalakshmi Srinivasaraghavan
ac6a22ae78 Update header 2020-04-14 22:58:39 -05:00
Rajalakshmi Srinivasaraghavan
ff010f496e Build shgemm for all architecture 2020-04-14 20:38:53 -05:00
Rajalakshmi Srinivasaraghavan
7eb55504b1 RFC : Add half precision gemm for bfloat16 in OpenBLAS
This patch adds support for bfloat16 data type matrix multiplication kernel.
For architectures that don't support bfloat16, it is defined as unsigned short
(2 bytes).  Default unroll sizes can be changed as per architecture as done for
SGEMM and for now 8 and 4 are used for M and N.  Size of ncopy/tcopy can be
changed as per architecture requirement and for now, size 2 is used.

Added shgemm in kernel/power/KERNEL.POWER9 and tested in powerpc64le and
powerpc64.  For reference, added a small test compare_sgemm_shgemm.c to compare
sgemm and shgemm output.

This patch does not cover OpenBLAS test, benchmark and lapack tests for shgemm.
Complex type implementation can be discussed and added once this is approved.
2020-04-14 14:55:08 -05:00
Martin Kroeker
84a9614345 try x86_64 test without openmp 2020-04-14 19:18:35 +02:00
Martin Kroeker
b969533703 Add drone.io badge, mention EMAG8180 support, reformat the DYNAMIC_ARCH paragraph 2020-04-14 10:53:28 +02:00
Martin Kroeker
0f08f3efa6 Add a multithread test for x86_64 2020-04-13 22:46:12 +02:00
Martin Kroeker
c861b2a7bd Merge pull request #2553 from martin-frbg/issue2444
Add a read memory barrier to the traversal of the buffer slot list
2020-04-13 21:28:59 +02:00
Martin Kroeker
cf62adffbb Merge pull request #2555 from martin-frbg/issue1137
Handle unaligned data in the SSE2 copy kernel
2020-04-13 18:29:56 +02:00
Martin Kroeker
3eec7d382c ARMV7 does not support DMB ISHLD, use DMB ISH 2020-04-13 15:56:31 +02:00
Martin Kroeker
5b0093b5fe Convert aligned moves to unaligned
should have no performance impact on reasonably modern cpus and fixes occasional crashes in actual user code.
2020-04-13 14:58:52 +02:00
Martin Kroeker
f41600e66f Add a read barrier in the traversing of the buffer list
Needed on systems with weak memory ordering - the inferior, partially working fix from #2544 was already removed in #2551
2020-04-13 12:34:02 +02:00
Martin Kroeker
f5efecb7ca Add (empty) read barrier definition 2020-04-13 12:24:10 +02:00
Martin Kroeker
a52bdd9d7b Add (empty) read barrier definition 2020-04-13 12:22:35 +02:00
Martin Kroeker
db3226a646 Add (empty) read barrier definition 2020-04-13 12:18:48 +02:00
Martin Kroeker
69b6e258d8 Add (empty) read barrier definition 2020-04-13 12:17:41 +02:00
Martin Kroeker
3d4db4d002 Add read barrier definition 2020-04-13 12:16:44 +02:00
Martin Kroeker
99dde1d2c9 Add read barrier definition 2020-04-13 12:14:58 +02:00
Martin Kroeker
ee6b3df02c Add read barrier definition 2020-04-13 12:14:06 +02:00
Martin Kroeker
25e879fe92 Add (empty) read barrier definition 2020-04-13 12:12:54 +02:00
Martin Kroeker
d237dc1360 Add read barrier definition 2020-04-13 12:11:58 +02:00
Martin Kroeker
8692456226 Add read barrier definition 2020-04-13 12:10:37 +02:00
Martin Kroeker
d1d69e1b9a Add read barrier definition 2020-04-13 12:09:24 +02:00
Martin Kroeker
20d0cb2f65 Merge pull request #46 from xianyi/develop
rebase
2020-04-13 12:06:40 +02:00
Martin Kroeker
e7f0da9295 Merge pull request #2551 from martin-frbg/issue2538-2
Increase BUFFER_SIZEs and add a safeguard; supply GEMM_R for POWER8/9
2020-04-12 22:34:41 +02:00
Martin Kroeker
e9bfa2291a Fix parameter overflow 2020-04-12 19:47:02 +02:00
Martin Kroeker
2a28448a96 Add safeguards for sufficient BUFFER_SIZE 2020-04-12 19:45:36 +02:00
Martin Kroeker
a33d177430 Increase default BUFFER_SIZE on ARM, ZARCH and newer x86_64, add GEMM_R for POWER8/9
As shown in #2538, default buffersizes on some platforms were smaller than required in memory.c
and the requirement could never be fulfilled for a calculated GEMM_R on PPC given the fomula used
2020-04-12 19:44:48 +02:00
Martin Kroeker
f73391c9c9 Merge pull request #45 from xianyi/develop
rebase
2020-04-12 19:39:05 +02:00
Martin Kroeker
7905383cb5 Merge pull request #2547 from sharvil/develop
Add API to set thread affinity on Linux.
2020-04-11 00:35:38 +02:00
Martin Kroeker
a8cbd451bf Merge pull request #2541 from bapt/develop
libname: treat FreeBSD and DragonFly like linux and sunos
2020-04-11 00:35:07 +02:00
Martin Kroeker
eecd8c3204 Merge pull request #2548 from gxw-loongson/develop
Add a GENERIC target for 64bit MIPS
2020-04-11 00:34:04 +02:00
Martin Kroeker
ea85eb2e02 Merge pull request #2549 from martin-frbg/fixthreadtest
Match thread count in cpp_thread_test to host capability
2020-04-10 23:54:40 +02:00
Martin Kroeker
66f89c0aaf Match thread count to machine capability 2020-04-10 22:06:44 +02:00
gxw
8d07cf9b67 Fix compilation problem on loongson platform
Using "make TARGET=GENERIC" on loongson platform will get the following
error messages:
"make[1]: *** No rule to make target 'sgemm_incopy.o', needed by 'libs'"
Add kernel/mips64/KERNEL.generic to slove the problem.
2020-04-09 19:28:15 +08:00
Sharvil Nanavati
7b4773b24d Add API to set thread affinity on Linux.
Issue: #2545
2020-04-08 12:49:35 -07:00
Martin Kroeker
69f277f8ee Add another memory barrier for ARM and a multicore test run on ThunderX to help detect such issues (#2544)
* Add another memory barrier in memory.c to prevent races in memory slot allocation

* Add an all-core test on Drone.io's ThunderX platform and modify dgemm_tester to use all 96 cores
2020-04-08 11:04:51 +02:00
Martin Kroeker
3a6d51c2fd Merge pull request #44 from xianyi/develop
Add a Z13 build to the Travis configuration (#2542)
2020-04-04 22:48:53 +02:00
Martin Kroeker
1c7771df96 Merge pull request #43 from martin-frbg/revert-42-z12ci
Revert 42 z12ci to keep forked develop clean
2020-04-04 22:46:58 +02:00
Martin Kroeker
a56c9ec52a Revert "Add IBM Z to Travis configuration (#42)"
This reverts commit 7972beb375.
2020-04-04 22:45:01 +02:00
Martin Kroeker
4ae6d1a01b Add a Z13 build to the Travis configuration (#2542)
* Add IBM Z to Travis configuration
2020-04-03 16:02:11 +02:00
Martin Kroeker
7972beb375 Add IBM Z to Travis configuration (#42)
* Add IBM Z to Travis configuration
2020-04-03 15:59:18 +02:00
Baptiste Daroussin
41e802443a libname: treat FreeBSD and DragonFly like linux and sunos
There is no difference in the way libnames are handle between FreeBSD
and linux or sunos. FreeBSD and DragonFly prefers having sonames as well
2020-04-03 06:20:42 +02:00
Martin Kroeker
7bd8624b79 Merge pull request #41 from xianyi/develop
rebase
2020-04-02 10:32:19 +02:00
Martin Kroeker
806f89166e Make ARMV7 compile with xcode and add a CI job for it (#2537)
* Add an ARMV7 iOS build on Travis

* thread_local appears to be unavailable on ARMV7 iOS

* Add no-thumb option for ARMV7 IOS build to get it to accept DMB ISH

* Make local labels in macros of nrm2_vfpv3.S compatible with the xcode assembler
2020-04-02 10:30:37 +02:00
Martin Kroeker
f059e614eb Merge pull request #2536 from martin-frbg/recurs
Add "recursive" option for LAPACK builds with ifort or pgfort as well
2020-04-01 20:00:13 +02:00
Martin Kroeker
e13b6773ee ifort and pgfort need "recursive" for safe compilation of LAPACK as well 2020-04-01 15:39:16 +02:00
Martin Kroeker
a05243d0f2 ifort and pgfort need "recursive" for compiling LAPACK as well
as shown in Reference-LAPACK issue 401 (their PR 403)
2020-04-01 15:38:07 +02:00
Martin Kroeker
c6af9bbb32 Merge pull request #2534 from martin-frbg/issue2496
Fix zero initialization for beta=0 case
2020-03-31 20:53:13 +02:00
Martin Kroeker
144be81ca1 fix initialization to zero in the NEON SGEMM_BETA kernel as well 2020-03-31 16:53:56 +02:00
Martin Kroeker
07cdd5d05c Fix zero initialization for beta=0 case
use immediate initialization instead of multiplication in case register content is a NaN
2020-03-31 00:21:02 +02:00
Martin Kroeker
567d2760e6 Merge pull request #2520 from wjc404/develop
Fix avx512 sgemm performance bug when ldc is a multiple of 1024
2020-03-30 20:15:59 +02:00
Martin Kroeker
018bb3e433 Merge pull request #2533 from martin-frbg/gemmdirect2
Use runtime check for AVX512 capability in DYNAMIC_ARCH builds made on SKX
2020-03-30 20:15:37 +02:00
Martin Kroeker
79fd006c58 Expose the support_avx512 function provided in dynamic.c 2020-03-26 21:25:39 +01:00
Martin Kroeker
8229c163b7 Use runtime check for AVX512 (sgemm_direct) capability when using DYNAMIC_ARCH 2020-03-26 21:12:56 +01:00
Martin Kroeker
a986d42ea6 Merge pull request #39 from xianyi/develop
rebase
2020-03-26 21:06:51 +01:00
Martin Kroeker
b6a948fbee Merge pull request #2530 from martin-frbg/dynmsg
Add message highlighting minimum target choice at end of DYNAMIC_ARCH…
2020-03-24 15:44:46 +01:00
Martin Kroeker
0cc352417e Merge pull request #2529 from shengyang-3390/dev1
add ctest for drotm and modified ctest for drot.
2020-03-24 15:44:27 +01:00
Martin Kroeker
fe47dc8673 Add message highlighting minimum target choice at end of DYNAMIC_ARCH builds
related to #2526
2020-03-23 19:35:51 +01:00
Martin Kroeker
9f67d03d3b Merge pull request #2527 from martin-frbg/gemmdirect
Avoid calling DIRECT codepath in DYNAMIC_ARCH on non-SKX
2020-03-23 12:47:19 +01:00
shengyang
50f4fb2fbd add ctest for drotm and modified ctest for drot.
make sure that test cases cover all code path when kernel uses looping unrolling.
2020-03-23 10:47:05 +08:00
Martin Kroeker
6a14b34c20 Avoid calling DIRECT codepath in DYNAMIC_ARCH on non-SKX 2020-03-22 14:33:16 +01:00
Martin Kroeker
8c7c1395da Merge pull request #2521 from martin-frbg/cm-avx512
Use proper extension on the avx512 testcase filename
2020-03-22 01:03:42 +01:00
Martin Kroeker
5f6f6a2c7d Merge pull request #2525 from andreas-schwab/develop
Fix ARCHCONFIG for Neoverse-N1
2020-03-21 18:47:48 +01:00
Andreas Schwab
71cf2acdef Fix ARCHCONFIG for Neoverse-N1
../config_kernel.h:24:9: warning: missing whitespace after the macro name
   24 | #define ARMV8-march armv8.2-a
      |         ^~~~~
2020-03-21 17:35:42 +01:00
Martin Kroeker
1d9773b800 Use proper extension on the avx512 testcase filename
The need to call it .tmp existed only when it was generated by a tmpfile call, and the "-x c" option to tell the compiler it is actually a C source is not universally supported (this broke the test with clang-cl at least)
2020-03-20 23:05:53 +01:00
Martin Kroeker
a46a8c4956 Merge pull request #2518 from shengyang-3390/dev
add ctest for srotm and modified ctest for srot.
2020-03-20 23:00:06 +01:00
Martin Kroeker
7ae737e04c Merge pull request #2519 from martin-frbg/issue2472
Fix cmake compilation with ifort on Windows
2020-03-20 22:57:44 +01:00
wjc404
64daad4365 Update param.h 2020-03-20 21:46:18 +00:00
wjc404
b8307768e2 Add files via upload 2020-03-21 05:42:10 +08:00
Martin Kroeker
6d54c94760 Make ifort on Windows create lowercase symbols with appended underscore
tentative fix for #2472
2020-03-20 01:08:10 +01:00
Martin Kroeker
c0da205412 Merge pull request #38 from xianyi/develop
rebase
2020-03-20 01:05:22 +01:00
shengyang
a06d78556d add ctest for srotm and modified ctest for srot.
make sure that test cases cover all code path when kernel uses looping unrolling.
2020-03-18 14:17:32 +08:00
Martin Kroeker
af8a619e1f Merge pull request #2517 from wjc404/develop
Temporary fix for SKX STRSM
2020-03-17 10:12:53 +01:00
wjc404
62b9608986 Update KERNEL.SKYLAKEX 2020-03-17 12:52:55 +08:00
Martin Kroeker
717c604aeb Merge pull request #2515 from zelong-1024/develop
[OpenBLAS]: benchmark for her/her2 LEVEL2 functions
2020-03-16 21:59:55 +01:00
Martin Kroeker
ce33da4cab Merge pull request #2513 from aaawuanjun/develop
[OpenBlas]: Add benchmark tpsv file and modify benchmark/Makefile
2020-03-16 21:58:55 +01:00
Martin Kroeker
a1b181cea2 Merge pull request #2516 from wjc404/develop
AVX2 STRSM kernels
2020-03-16 21:58:34 +01:00
wjc404
cdc0e9011e Update KERNEL.ZEN 2020-03-16 16:39:37 +00:00
wjc404
fa049d49c2 AVX2 STRSM kernel 2020-03-17 00:34:08 +08:00
l00536773
d45c53ecf1 [OpenBLAS]: benchmark for her/her2 LEVEL2 functions
[description]: benchmark for her/her2
[solution]: added benchmark for her/her2, modified makefile in benchmark
[dts]:
2020-03-16 11:19:05 +08:00
Martin Kroeker
c2840997db Merge pull request #2508 from liujingjue/develop
[OpenBLAS]:fix the iamax benchmark error
2020-03-14 14:21:30 +01:00
Martin Kroeker
c775458299 Merge pull request #2512 from martin-frbg/lapackh
Move declarations of lapack_complex_custom types outside the extern C
2020-03-14 13:27:40 +01:00
Martin Kroeker
c0649aa694 Merge pull request #2506 from xiaofengF/develop
Add benchmark for SPMV and fix segmentation fault when data size >= 50000
2020-03-14 13:08:36 +01:00
wuanjun 00447568
2428dc9fd3 [OpenBlas]: Add benchmark tpsv file and modify benchmark/Makefile
[Description]: Solve lack of tpsv benchmark.
2020-03-14 09:11:08 +08:00
Martin Kroeker
0fe0914437 Merge pull request #2505 from aaawuanjun/develop
[OpenBlas]:Add benchmark tpmv.c and modify benchmark/Makefile
2020-03-13 23:16:35 +01:00
Martin Kroeker
4cfd8a3f57 Merge pull request #2511 from martin-frbg/fixppctest
Prevent attempts to run ctest or test when fortran is not available
2020-03-13 23:04:01 +01:00
Martin Kroeker
ee2e758278 Move declarations of lapack_complex_custom types outside the extern C
fixes #2510
2020-03-13 20:34:13 +01:00
Martin Kroeker
2d8781b0dc Do not attempt to run test without fortran 2020-03-13 20:11:19 +01:00
Martin Kroeker
c436e8af7b Do not attempt to run ctest without fortran
The main Makefile takes care of this in the build process, but users or CI jobs may try to run this directly
2020-03-13 20:10:26 +01:00
l00546269
a0a3bf7c81 [OpenBLAS]:fix the iamax benchmark error
[Description]:the result for i?amax is not MFlops, it is MBytes
2020-03-13 10:58:39 +08:00
jayfely@qq.com
ae3f2c2e49 Remove cspmv and zspmv to remove the error occured in travis CI 2020-03-11 17:02:34 +08:00
jayfely@qq.com
83ecf9fea7 Modify Makefile in interface to remove the error occured in travis CI 2020-03-11 16:36:45 +08:00
jayfely@qq.com
649733ff15 Only keep spmv.goto and spmv.atlas 2020-03-11 15:48:58 +08:00
wuanjun 00447568
3e8f1c6cc5 [OpenBlas]:Add benchmark tpmv.c and modify Makefile
[Description]:Solve the problem of missing tpmv.c benchmark file
2020-03-11 12:31:48 +08:00
jayfely@qq.com
2f4c5bb3a9 Update spmv.c: solve segmentation fault when m and n are larger than 50000 2020-03-11 10:30:09 +08:00
Martin Kroeker
4e1c4e67d4 Merge pull request #2503 from martin-frbg/xerbl
Apply fix for LAPACK issue 394 (fixed-form code beyond column 72)
2020-03-10 23:38:07 +01:00
Martin Kroeker
b9a2a3c540 Merge pull request #2502 from martin-frbg/issue2497
Fix INTERFACE64 not propagating to the fortran codes on ARMV8
2020-03-10 20:01:23 +01:00
Martin Kroeker
047dfb216d Merge pull request #2501 from jijiwawa/Fix_mistakes
Fix  pr #2487 error
2020-03-10 16:44:40 +01:00
s00527847
cd8871f1a1 Use the correct unit of measure 2020-03-10 19:26:06 -04:00
Martin Kroeker
b25ae1fc60 Apply fix for Reference-LAPACK issue 394
reference to XERBLA extending beyond column 72, breaking builds with compilers that default to traditional punch card format
2020-03-10 13:37:41 +01:00
Martin Kroeker
3f7f7ab7e2 Restore INTERFACE64 for arm64 2020-03-10 12:51:07 +01:00
Martin Kroeker
9c22170f52 Merge pull request #37 from xianyi/develop
rebase
2020-03-10 12:49:21 +01:00
jayfely@qq.com
08e1d8cbae Modify Makefile in Benchmark 2020-03-10 14:32:18 +08:00
jayfely@qq.com
ff40a4e726 Add benchmark for SPMV 2020-03-10 14:22:18 +08:00
Zhang Xianyi
51019feae1 Merge pull request #2498 from njutcz/develop
Add benchmark for ?amax, ?max, ?amin, ?min, i?max, i?amin and i?min.
2020-03-09 16:04:33 +08:00
s00548429
bec7923a0d Fix the functional bugs for zamax. 2020-03-09 15:36:50 +08:00
s00548429
c5bdd21352 Add benchmark for ?amax, ?max, ?amin, ?min, i?max, i?amin and i?min. 2020-03-09 14:59:03 +08:00
njutcz
d2d16d091e Merge pull request #1 from xianyi/develop
update
2020-03-09 10:39:40 +08:00
Martin Kroeker
b6a6ccbbea Merge pull request #2495 from ZuoQ3/develop
add benchmark for axpby test
2020-03-08 08:09:58 +01:00
Martin Kroeker
8b720f7365 Merge pull request #2494 from shengyang-3390/develop
add benchmark for csrot and zdrot
2020-03-07 23:04:21 +01:00
Martin Kroeker
14df234edb Merge pull request #2489 from jijiwawa/brightness
Remove redundant code
2020-03-07 22:26:00 +01:00
s00527847
bbeda55b7b add trmm.c 2020-03-07 13:09:19 -05:00
s00527847
efcf89aec7 Remove redundant code 2020-03-07 12:03:05 -05:00
Martin Kroeker
37d456f7e0 Merge pull request #2493 from martin-frbg/plainmake
Fix use of make vs $(MAKE) in building lapack-testing
2020-03-07 16:55:53 +01:00
Martin Kroeker
0b9e96922b Merge pull request #2488 from liujingjue/develop
Modify the main Makefile in OpenBLAS
2020-03-07 16:52:29 +01:00
zq
0c8162eba6 Add benchmark file axpby.c and modify benchmark/Makefile to test s/d/c/zaxpby 2020-03-07 17:48:55 +08:00
zq
9a94a30132 Merge pull request #1 from xianyi/develop
update
2020-03-07 17:04:59 +08:00
shengyang
09c7a191bd add benchmark for csrot and zdrot
modified:   benchmark/Makefile
	modified:   benchmark/rot.c
2020-03-07 15:17:49 +08:00
l00546269
8a8df530e2 [OpenBLAS]:modifed the Makefile
[Description]: check the compiler version and show the detail info
2020-03-07 10:14:33 +08:00
Martin Kroeker
37f46f2fa0 Fix another spot where make was used instead of $(MAKE)
Broke lapack-testing on BSD as their default "make" does not support GNU Makefile syntax
2020-03-06 15:37:26 +01:00
Martin Kroeker
9afc561be4 Merge pull request #36 from xianyi/develop
rebase
2020-03-06 15:32:27 +01:00
Martin Kroeker
dca3e0cf20 Merge pull request #2491 from chenxuqiang/hbmv_benchmark
benchmark/hpmv&hbmv: add benchmark/hpmv.c and benchmark/hbmv.c
2020-03-06 15:06:42 +01:00
Martin Kroeker
c9f8db979b Merge pull request #2490 from shengyang-3390/develop
Add benchmark file rotm.c and modify benchmark/Makefile to test s/drotm
2020-03-06 15:05:55 +01:00
Martin Kroeker
18099de976 Merge pull request #2487 from jijiwawa/develop
add benchmark for spr/spr2
2020-03-06 14:42:25 +01:00
Martin Kroeker
97c36ca58c Merge branch 'develop' into develop 2020-03-06 14:41:40 +01:00
Martin Kroeker
9f5a74f3c7 Merge pull request #2486 from qqqil/develop
add benchmark for trsv
2020-03-06 14:30:09 +01:00
Martin Kroeker
2afb10975d Merge pull request #2485 from Darkness303/develop
Add syr2 benchmark
2020-03-06 14:29:27 +01:00
Martin Kroeker
dbef479227 Merge pull request #2469 from AGSaidi/acq-rel-2
Use acq/rel semantics to pass flags/pointers in getrf_parallel.
2020-03-06 14:28:58 +01:00
Ali Saidi
208c7e7ca5 Use acq/rel semantics to pass flags/pointers in getrf_parallel.
The current implementation has locks, but the locks each only
have a critical section of one variable so atomic reads/writes
with barriers can be used to achieve the same behavior.

Like the previous patch, pthread_mutex_lock isn't fair, so in a
tight loop the previous thread that has the lock can keep it
starving another thread, even if that thread is about to write
the data that will stop the current thread from spinning.

On a 64c Arm system this improves performance by 20x on sgesv.goto.
2020-03-06 06:22:31 +00:00
chenxuqiang
32c847df45 benchmark/hpmv&hbmv: add benchmark/hpmv.c and benchmark/hbmv.c
Signed-off-by: Xuqiang Chen chenxuqiang3@hisilicon.com
2020-03-06 01:02:02 -05:00
shengyang
e0df9485d4 Add benchmark file rotm.c and modify benchmark/Makefile to test s/drotm
modified:   benchmark/Makefile
	new file:   benchmark/rotm.c
2020-03-05 10:05:59 +08:00
s00527847
0f1a2b12f9 add benchmark for spr/spr2 2020-03-04 15:50:19 -05:00
q00437336
233838b4bc change clock to CLOCK_PROCESS_CPUTIME_ID 2020-03-04 03:54:40 -05:00
l00546269
13f9afbd99 [OpenBLAS]:modifed the Makefile
[Description]:add c/fortran compiler version information in final note
2020-03-04 16:47:23 +08:00
q00437336
de74e11641 add benchmark for trsv 2020-03-04 03:23:22 -05:00
Martin Kroeker
ad9e53154d Merge pull request #2484 from RajalakshmiSR/power-dynamic
Fix DYNAMIC_ARCH build for POWER9
2020-03-04 08:06:06 +01:00
Martin Kroeker
6e70621b0d Merge pull request #2483 from aaawuanjun/develop
Add benchmark file trmv.c and modify benchmark/Makefile to test s/d/c/ztrmv
2020-03-04 07:59:56 +01:00
Martin Kroeker
e6edb7431f Merge pull request #2466 from AGSaidi/acq-rel-1
Switch blas_server to use acq/rel semantics
2020-03-04 07:59:31 +01:00
Darkness303
114dbec947 1.Add syr2 benchmark
2.Fixed some errors
2020-03-04 14:09:10 +08:00
Martin Kroeker
d68e4ba59b Fix cut/paste glitch 2020-03-03 21:37:48 +01:00
Martin Kroeker
635c9e4e09 Restore initializers for mutex and conditional 2020-03-03 21:04:12 +01:00
Rajalakshmi Srinivasaraghavan
2afc074803 Fix DYNAMIC_ARCH build for POWER9
Setting DYNAMIC_ARCH=1 on POWER9 does not build POWER9 files due to some
compiler version checks.  This patch fixes some of the macros that are used
to check compiler version.  On fixing those checks, there are some new make
failures related to icamin, icamax, isamin, isamax and caxpy files on POWER9.
This patch fixes those failures as well.
2020-03-03 12:35:10 -06:00
wuanjun 00447568
5d6c688a7e Merge branch 'develop' of https://github.com/aaawuanjun/OpenBLAS into develop 2020-03-03 19:03:57 +08:00
wuanjun 00447568
87baf9cfe6 Merge branch 'develop' of https://github.com/aaawuanjun/OpenBLAS into develop 2020-03-03 19:03:28 +08:00
wuanjun 00447568
c0ca7d6258 Merge branch 'develop' of https://github.com/aaawuanjun/OpenBLAS into develop 2020-03-03 17:39:26 +08:00
wuanjun 00447568
f682d19ed4 [OpenBlas]: add benchmark file trmv.c and modify benchmark/Makefile to test s/d/c/ztrmv 2020-03-03 17:37:33 +08:00
wuanjun 00447568
790d50fbba [OpenBlas]: add benchmark file trmv.c and modify benchmark/Makefile to test s/d/c/ztrmv 2020-03-03 17:13:49 +08:00
Martin Kroeker
59243d49ab Merge pull request #2479 from Darkness303/develop
Fix potential index overflows at large matrix sizes in the benchmark codes
2020-03-03 08:46:49 +01:00
Martin Kroeker
d41f83e128 Merge pull request #2436 from marxin/improve-utest-coverage
Improve test coverage for utests.
2020-03-03 08:43:00 +01:00
Martin Kroeker
ee4ca7ca6b Merge pull request #2481 from ChinouneMehdi/fix2480
Fix #2480
2020-03-02 21:21:29 +01:00
Martin Kroeker
e326c89ae8 Merge pull request #2478 from MacChen02/develop
Update benchmark statistical time function
2020-03-02 21:20:51 +01:00
مهدي شينون (Mehdi Chinoune)
21f6c4b5a9 fixes #2480 2020-03-02 17:22:28 +01:00
Martin Liska
7ca4ffdbdd Improve test coverage for utests. 2020-03-02 13:38:17 +01:00
jianghesong
0f65c05cd1 fix core dumped error 2020-03-02 19:13:45 +08:00
MacChen02
917d243580 Update benchmark statistical time function
The function gettimeofday does not count the time,when testing the axpy small data volume use case.
Use the function clock_gettime to replace the gettimeofday function to count the time.
2020-03-02 14:36:27 +08:00
Ali Saidi
43c2e845ab Switch blas_server to use acq/rel semantics
Heavy-weight locking isn't required to pass the work queue
pointer between threads and simple atomic acquire/release
semantics can be used instead. This is especially important as
pthread_mutex_lock() isn't fair.

We've observed substantial variation in runtime because of the
the unfairness of these locks which complety goes away with
this implementation.

The locks themselves are left to provide a portable way for
idling threads to sleep/wakeup after many unsuccessful iterations
waiting.
2020-03-02 02:52:49 +00:00
Martin Kroeker
33f76a6c37 Version 0.3.9 2020-03-02 00:10:20 +01:00
Martin Kroeker
960dec234f Version 0.3.9 2020-03-02 00:09:49 +01:00
Martin Kroeker
6b92979f35 Merge pull request #2476 from xianyi/develop
Update from develop in preparation for 0.3.9
2020-03-02 00:08:32 +01:00
Martin Kroeker
014fc13995 Merge pull request #2475 from martin-frbg/039changes
Update ChangeLog for 0.3.9
2020-03-02 00:04:26 +01:00
Martin Kroeker
f1e05676a0 Merge pull request #2474 from martin-frbg/p9be
Use POWER8 kernels on big-endian POWER9 for now
2020-03-02 00:04:08 +01:00
Martin Kroeker
d221c50f27 Add Ampere EMAG8180 2020-03-02 00:02:36 +01:00
Martin Kroeker
f14013da7f Update with 0.3.9 changes 2020-03-02 00:01:22 +01:00
Martin Kroeker
4f371b0fbf Use POWER8 kernels on big-endian POWER9 for now 2020-03-01 23:45:58 +01:00
Martin Kroeker
02d60c1563 Merge pull request #35 from xianyi/develop
rebase
2020-03-01 23:44:10 +01:00
Martin Kroeker
2e6963259b Merge pull request #2471 from AGSaidi/l3-fix-2
Fix barriers in level3_thread
2020-03-01 19:41:07 +01:00
Martin Kroeker
e94590e400 Merge pull request #2468 from AGSaidi/wfe
Use wait-for-event to not spin in the blas_lock
2020-03-01 19:40:46 +01:00
Martin Kroeker
69d4687142 Merge pull request #2464 from Darkness303/develop
Add syr benchmark
2020-03-01 13:02:34 +01:00
Martin Kroeker
a731a9bbb9 Merge pull request #2467 from AGSaidi/rpcc
Make rpcc() on arm64 get closer to what x86 returns
2020-02-29 22:43:02 +01:00
Martin Kroeker
1aa5907a2c Merge pull request #2463 from martin-frbg/mingwfix
Apply MinGW AVX512 compilation fix to fortran options as well
2020-02-29 19:08:03 +01:00
Martin Kroeker
ea8eec5d17 Merge pull request #2422 from wjc404/develop
Adjust SkylakeX GEMM3M parameters, add an AVX512 STRMM kernel and fix performance bugs in AVX2 s/c/z GEMM
2020-02-29 19:07:35 +01:00
Ali Saidi
97ce6bbce2 Fix barriers in level3_thread 2020-02-29 17:45:17 +00:00
Martin Kroeker
a9aeb6745c Merge pull request #2465 from AGSaidi/neoverse-n1
Add Neoverse-N1 core
2020-02-29 13:24:44 +01:00
Ali Saidi
0af9991cc9 Use wait-for-event to not spin in the blas_lock 2020-02-29 04:23:48 +00:00
Ali Saidi
19f3a4091c Make rpcc() on arm64 get closer to what x86 returns
The Arm implementation of rpcc() uses the architected timer
which is defined by the SBSA to be between 10-400MHz. These numbers
are much smaller than the cycle counter frequency used by x86. Make
the numbers closer by shifting the cycle counter up by the number of
leading zeros in the cntfrq_el0 register which gets us closer to a
noraml cpu clock cycle range.
2020-02-29 04:23:22 +00:00
Ali Saidi
c623a965f9 Add Neoverse-N1 core
The implementation is a hybird of the ARMV8 one with some of the
improved TX2 rountines along with specifying -march=v8.2-a
2020-02-29 03:22:04 +00:00
j00520245
e1062400c4 New add syr benchmark 2020-02-28 16:36:53 +08:00
Martin Kroeker
a66f4d80c8 Apply MinGW AVX512 compilation fix to fortran options as well
original issue was #1708, I see now that the same problem affects gfortran compilation. The underlying issue is said to be fixed (but not yet released) on all branches of gcc as of a few days ago but it will certainly take time to reach mingw/msys.
2020-02-27 23:09:40 +01:00
wjc404
dd22eb7621 Update cgemm_kernel_8x2_haswell.c 2020-02-27 22:26:15 +08:00
wjc404
2352331e60 Update zgemm_kernel_4x2_haswell.c 2020-02-27 22:25:19 +08:00
Martin Kroeker
430ee31e66 Merge pull request #2447 from martin-frbg/issue2446
Always select ARMV8 parameters for big servers when cpu is TSV110 or EMAG8180
2020-02-27 15:07:02 +01:00
Martin Kroeker
8164fd1328 Always assume server-class cpu count for TSV110 and EMAG8180 2020-02-26 22:19:57 +01:00
Martin Kroeker
531c6b96d6 Merge pull request #34 from xianyi/develop
rebase
2020-02-26 22:16:28 +01:00
wjc404
1b980001dd Update zgemm_kernel_4x2_haswell.c 2020-02-26 18:38:12 +08:00
wjc404
2515e1152f Update cgemm_kernel_8x2_haswell.c 2020-02-26 18:36:54 +08:00
Martin Kroeker
ddcbed6690 Merge pull request #2437 from martin-frbg/issue2434
[WIP] Add support for Ampere EMAG8180 ARMV8 cpu
2020-02-25 18:42:52 +01:00
Martin Kroeker
f8ec538c82 Add Ampere EMAG8180 2020-02-25 14:30:00 +01:00
Martin Kroeker
ca4f7dceff Add parameters for EMAG8180 DYNAMIC_ARCH support with cmake 2020-02-24 20:23:18 +01:00
Martin Kroeker
1ddf9f1067 Add EMAG8180 to arm64 DYNAMIC_ARCH list for cmake 2020-02-24 20:16:18 +01:00
Martin Kroeker
4c5fac5a2b Typo fix 2020-02-24 20:15:04 +01:00
Martin Kroeker
320e2648cd Add EMAG8180 to DYNAMIC_CORE list for ARM64 2020-02-24 19:23:46 +01:00
Martin Kroeker
9b732696c6 Add DYNAMIC_ARCH support for ARMV8 EMAG8180 2020-02-24 19:20:00 +01:00
Martin Kroeker
c9dcb3d4a4 Merge pull request #2443 from aaawuanjun/develop
[OpenBlas]:benchmark/copy.c has time,x,y data loop problems
2020-02-24 13:14:51 +01:00
Martin Kroeker
3bb7f0138e Merge pull request #2442 from martin-frbg/lapackpr390
Apply fix from Reference-LAPACK PR 390
2020-02-24 12:27:01 +01:00
wuanjun 00447568
c93ae92579 [OpenBlas]:benchmark/copy.c has time,x,y data loop problems 2020-02-24 11:23:39 +08:00
Martin Kroeker
87ac1ceb0b Apply fix from Reference-LAPACK PR390, NaN not propagating 2020-02-23 22:40:40 +01:00
Martin Kroeker
9e40c080f2 Apply fix from Reference-LAPACK PR390, NaN not propagating 2020-02-23 22:39:01 +01:00
wjc404
903854c168 Add files via upload 2020-02-22 23:40:02 +08:00
wjc404
a2ff577a30 Update KERNEL.ZEN 2020-02-22 23:39:43 +08:00
wjc404
97a32cb0a5 Update KERNEL.HASWELL 2020-02-22 23:39:20 +08:00
wjc404
f1746e7284 Delete sgemm_kernel_8x4_haswell_2.c 2020-02-22 23:38:48 +08:00
wjc404
f6fcbd7906 Fix performance bug when LDC is a multiple of 1024 2020-02-22 23:37:45 +08:00
Martin Kroeker
1e8410f18c Merge pull request #2441 from martin-frbg/ismin2
Add proper defaults for the IxMIN/IxMAX kernels on mips64 and power
2020-02-22 11:21:03 +01:00
Martin Kroeker
07454bf4d5 Add proper defaults for IxMIN/IxMAX kernels
the fallbacks from Makefile.L1 assume a combined source for absolute value and non-absolute (with ifdef USE_ABS) but here we have separate implementations
2020-02-21 11:58:15 +01:00
Martin Kroeker
4046985913 Add proper defaults for IxMIN/IxMAX kernels
the fallbacks from Makefile.L1 assume a combined source for absolute value and non-absolute (with ifdef USE_ABS) but here we have separate implementations
2020-02-21 11:55:52 +01:00
Martin Kroeker
75577f95a7 Merge pull request #33 from xianyi/develop
rebase
2020-02-21 09:56:05 +01:00
Martin Kroeker
33d92c7a37 Merge pull request #2435 from martin-frbg/issue2433
Fix handling of ppc endianness
2020-02-21 00:01:58 +01:00
Martin Kroeker
e57b11acca Add preliminary support for EMAG8180 2020-02-19 19:00:28 +01:00
Martin Kroeker
71e5669c3e Add preliminary support for EMAG8180 ARMV8 processor 2020-02-19 18:57:26 +01:00
Martin Kroeker
e8d82c01d4 Recognize Ampere EMAG8180 2020-02-19 18:49:13 +01:00
Martin Kroeker
0b39cf95b0 Fix endianness conditionals 2020-02-19 18:09:54 +01:00
Martin Kroeker
76b2cec6ce Get endianness into Makefile variable 2020-02-19 18:08:20 +01:00
Martin Kroeker
276c1791ea Merge pull request #32 from xianyi/develop
rebase
2020-02-19 18:06:39 +01:00
Martin Kroeker
c5bbfd8fee Merge pull request #2432 from isuruf/install_name
Fix install name on osx again
2020-02-19 08:14:28 +01:00
Isuru Fernando
130c1741e5 Fix install name on osx again 2020-02-18 10:22:49 -08:00
Martin Kroeker
8f782f0673 Merge pull request #2426 from zbeekman/nightly-homebrew-check
Nightly homebrew check
2020-02-18 12:09:15 +01:00
Martin Kroeker
6a517dcb6a Merge pull request #2427 from martin-frbg/powermin
Fix ISMIN and ISMAX kernel choices for POWER8
2020-02-18 08:15:02 +01:00
Martin Kroeker
9f39f0a2c3 Specify ismin/ismax assembly kernels for POWER8 directly
to fix utest failure in new ismin test -  Makefile.L1 defaults look wrong
2020-02-17 19:55:39 +01:00
Izaak Beekman
1a88c4ab26 Fix bottle upload problem & typo 2020-02-17 13:36:17 -05:00
Izaak Beekman
0b44802164 Test push & PRs only when workflow file changes
Also, add comments to clarify what the test is testing
2020-02-17 13:22:09 -05:00
Izaak Beekman
2c242b4cef Add Github Action to build development branch nightly with Homebrew 2020-02-17 12:36:37 -05:00
Martin Kroeker
0bfb7336d2 Merge pull request #2424 from isuruf/osx
Fix building on osx
2020-02-17 17:00:08 +01:00
Martin Kroeker
403cde104e Merge pull request #30 from xianyi/develop
rebase
2020-02-17 14:53:46 +01:00
Martin Kroeker
634f2bddda Merge pull request #2414 from marxin/fix-iamax_sse-implementation
Fix iamax sse implementation and add utests
2020-02-17 14:50:18 +01:00
Martin Liska
aeea14ee40 Come up with LOAD_AND_COMPARE_TO_MXX macro in iamax_sse.S. 2020-02-17 09:01:53 +01:00
Martin Liska
18bcc36a69 Fix implementation of iamax_sse.S as reported in #2116.
The was a typo in iamax_sse.S where one of the comparison
was cmpeqps instead of cmpeqss. That misdetected index
for sequences where the minimum value was 0.
2020-02-17 09:01:53 +01:00
Martin Liska
0e7f43c898 Add missing USE_MIN in kernel/CMakeLists.txt. 2020-02-17 09:01:53 +01:00
Martin Kroeker
79e201fbba Merge pull request #2423 from xianyi/issue2419
Restore -march flag for Android builds
2020-02-17 07:24:02 +01:00
Isuru Fernando
4326dcb460 Pass CFLAGS from env to Makefile.prebuild and remove iOS hack 2020-02-16 15:13:01 -06:00
Martin Kroeker
e32f3b1447 Restore -march flag for Android builds
fixes #2419 - renewed discussion in #2112 suggests removal of the option was primarily aimed at non-Android builds
2020-02-16 17:32:13 +01:00
Martin Kroeker
d483e9270a Update KERNEL.POWER8 2020-02-16 17:29:35 +01:00
Martin Kroeker
01834aee33 Merge pull request #29 from xianyi/develop
rebase
2020-02-16 17:28:10 +01:00
wjc404
b0558c11b9 Update param.h 2020-02-16 23:01:31 +08:00
wjc404
f566787e6e Update KERNEL.SKYLAKEX 2020-02-16 22:58:44 +08:00
wjc404
e3368cbf18 AVX512 STRMM kernel 2020-02-16 22:58:00 +08:00
Martin Kroeker
d92bd5be24 Update KERNEL.POWER8 2020-02-15 23:07:50 +01:00
Martin Kroeker
46e4b12946 Update KERNEL.POWER8 2020-02-15 23:06:51 +01:00
Martin Kroeker
5e94aa4877 Merge pull request #2417 from marxin/make-ctest-verbose-for-drone
Make ctest verbose for drone
2020-02-15 21:57:41 +01:00
Martin Kroeker
93f3e27574 Merge pull request #2415 from marxin/add-cmake-to-gitignore
Add CMake related files to .gitignore.
2020-02-15 21:57:03 +01:00
Martin Kroeker
785c389b0e Merge pull request #2420 from martin-frbg/issue2396
Correct generation of GETRF files by the CMAKE build
2020-02-15 21:56:16 +01:00
Martin Kroeker
c222b25b81 Correct generation of GETRF files by the CMAKE build
fixes #2396
2020-02-15 19:29:14 +01:00
Martin Kroeker
221da8bf05 Merge pull request #2411 from martin-frbg/fix2254-038
Fix pre-processed POWER8 codes and wrong conditionals in the POWER8,PPC440 and PPC970 KERNEL files
2020-02-14 23:07:43 +01:00
Martin Liska
eb285b4d20 Make ctest verbose for drone builder. 2020-02-14 10:46:37 +01:00
Martin Kroeker
cafdd999b8 Update caxpy_power8.S 2020-02-13 22:44:09 +01:00
Martin Kroeker
92ca92a46c Update caxpy_power8.S 2020-02-13 21:24:54 +01:00
Martin Kroeker
486c35c5dc Update icamin_power8.S 2020-02-13 18:38:43 +01:00
Martin Liska
0e05ea9bac Add CMake related files to .gitignore. 2020-02-13 14:51:55 +01:00
Martin Kroeker
5ba3699f41 Update isamin_power8.S 2020-02-13 00:00:32 +01:00
Martin Kroeker
8eefa530cd Update isamax_power8.S 2020-02-12 23:59:50 +01:00
Martin Kroeker
de40d47edf Update isamin_power8.S 2020-02-12 23:57:48 +01:00
Martin Kroeker
7c162b8a21 Update isamax_power8.S 2020-02-12 23:56:57 +01:00
Martin Kroeker
0544cbc806 Fix syntax of endianness conditional 2020-02-12 20:00:29 +01:00
Martin Kroeker
120d20731f Fix syntax of endianness conditional 2020-02-12 19:58:42 +01:00
Martin Kroeker
dc345d84df Fix syntax of endianness conditional and add gcc version check for workaround 2020-02-12 19:56:52 +01:00
Martin Kroeker
616921fd91 Merge pull request #27 from xianyi/develop
rebase
2020-02-12 19:16:14 +01:00
Martin Kroeker
8a9e9a82a1 Merge pull request #2410 from bartoldeman/fix-dscal-inline-asm
Fix inline asm in dscal: mark x, x1 as clobbered. Fixes #2408
2020-02-12 15:38:37 +01:00
Bart Oldeman
7ea5e07d1c Fix inline asm in dscal: mark x, x1 as clobbered. Fixes #2408
The leaq instructions in dscal_kernel_inc_8 modify x and x1 so they
must be declared as input/output constraints, otherwise the compiler
may assume the corresponding registers are not modified.
2020-02-12 14:11:44 +00:00
Martin Kroeker
cb6ef49857 Merge pull request #2407 from susilehtola/patch-2
Patch out instances of Z15 in dynamic_zarch.c
2020-02-11 13:04:44 +01:00
Martin Kroeker
63994e1cdb Merge pull request #2405 from susilehtola/patch-1
Fix typo in dynamic_zarch.c
2020-02-11 13:03:35 +01:00
Martin Kroeker
496e3019bc Merge pull request #2404 from martin-frbg/issue2395
Fix spurious application of USE_TRMM in cmake builds
2020-02-11 13:00:36 +01:00
Martin Kroeker
169be3f097 Merge pull request #2403 from martin-frbg/issue2400
Fix coretype identification of Intel Cannon Lake, Ice Lake and Goldmont
2020-02-11 13:00:16 +01:00
Martin Kroeker
6ccbb089c2 Merge pull request #2402 from gxw-loongson/develop
Avoid printing the following information on mips and mips64 when check msa
2020-02-11 12:59:53 +01:00
Martin Kroeker
59ebe3636a Merge pull request #2399 from martin-frbg/buffersize
Make BUFFER_SIZE configurable at build time
2020-02-11 12:56:56 +01:00
Susi Lehtola
5a6bba3061 Patch out instances of Z15 in dynamic_zarch.c
There does not appear to be a Z15 kernel yet, causing link errors from the code. This patch fixes the issue.
2020-02-11 15:07:33 +13:00
Susi Lehtola
dff173e50e Fix typo in dynamic_zarch.c 2020-02-11 14:46:30 +13:00
Martin Kroeker
7e5cbb6f35 Fix bad conditional syntax that caused spurious application of USE_TRMM 2020-02-10 21:17:39 +01:00
Martin Kroeker
303bdb673b Fix coretype detection for Intel extended models 6 and 7
affecting Goldmont, Cannon Lake, Ice Lake autodetection
2020-02-10 19:17:32 +01:00
gxw
754433f420 Avoid printing the following information on mips and mips64 when check msa:
"unrecognized command line option ‘-mmsa’"
2020-02-10 19:11:45 +08:00
Martin Kroeker
7f0d523b42 Make BUFFER_SIZE configurable 2020-02-09 23:32:57 +01:00
Martin Kroeker
c353d8b106 Make BUFFER_SIZE configurable 2020-02-09 23:30:22 +01:00
Martin Kroeker
579be3aa9d Add configuration option for BUFFER_SIZE 2020-02-09 23:28:04 +01:00
Martin Kroeker
449e8ea443 Merge pull request #26 from xianyi/develop
rebase
2020-02-09 23:23:55 +01:00
Martin Kroeker
3bec250cf9 Increment version to 0.3.9.dev 2020-02-09 23:18:44 +01:00
Martin Kroeker
f03dd23e90 Increment version to 0.3.9.dev 2020-02-09 23:18:07 +01:00
Martin Kroeker
fb5eb47558 Merge pull request #2398 from xianyi/develop
Update from develop in preparation of the 0.3.8 release
2020-02-09 23:16:28 +01:00
Martin Kroeker
fa93d63365 Merge branch 'release-0.3.0' into develop 2020-02-09 23:16:06 +01:00
Martin Kroeker
90e6c66a57 Merge pull request #2397 from martin-frbg/038changes
Update Changelog with changes from 0.3.8
2020-02-09 23:01:52 +01:00
Martin Kroeker
32d97330b3 Update with changes from 0.3.8 2020-02-09 23:00:36 +01:00
Martin Kroeker
29eaf4b6d7 Merge pull request #25 from xianyi/develop
rebase
2020-02-09 22:48:15 +01:00
Martin Kroeker
47c1bf7f4d typo fixes 2020-02-09 01:06:40 +01:00
Martin Kroeker
2b55f0ad30 Merge pull request #2393 from martin-frbg/issue2388
Provide more documentation in README.md
2020-02-09 01:00:33 +01:00
Martin Kroeker
a5b32ab06c Merge pull request #2390 from martin-frbg/pgi
Small corrections for compilation with PGI compilers
2020-02-09 00:13:40 +01:00
Martin Kroeker
50545b19d0 Update CPU and OS support and document DYNAMIC_ARCH option in README.md
prompted by #2388
2020-02-09 00:06:07 +01:00
Martin Kroeker
b3cbd60d7a Remove PGI from list again as it is actually still not capable 2020-02-08 10:20:13 +01:00
Martin Kroeker
70199d1905 Merge pull request #2389 from Zeyiii/develop
Fix bugs in benchmark of gemv
2020-02-07 16:05:46 +01:00
Martin Kroeker
cfe63d8cc2 Remove OpenMP libraries from link list 2020-02-07 16:03:51 +01:00
Martin Kroeker
d55b10830f Remove OpenMP libraries from link list 2020-02-07 16:02:17 +01:00
Martin Kroeker
c1c10cbb21 Merge pull request #2384 from wjc404/develop
Optimize AVX512 DGEMM (& DTRMM)
2020-02-07 13:47:12 +01:00
Martin Kroeker
5989841524 Add PGI to avx512-supporting compilers 2020-02-07 13:01:31 +01:00
Martin Kroeker
68a43db358 Fix utest compilation with PGI 2020-02-07 10:15:18 +01:00
Martin Kroeker
9694037b23 Set SUFFIX in tempfile commands, fix bad architecture option for PGI compiler in avx512 test 2020-02-07 10:09:25 +01:00
Martin Kroeker
71faa1c1a7 Merge pull request #24 from xianyi/develop
rebase
2020-02-07 10:03:02 +01:00
wjc404
3447d04eaf Update dgemm_kernel_16x2_skylakex.c 2020-02-06 02:14:10 +00:00
wjc404
8b5cdcc64c Update sgemm_kernel_8x4_haswell.c 2020-02-06 01:47:46 +00:00
wjc404
4e00d96a78 Update dgemm_kernel_16x2_skylakex.c 2020-02-06 01:46:36 +00:00
w00421467
ce9ea8f826 Fix another branch 2020-02-05 15:07:18 +08:00
w00421467
0b909203cb Fix bugs in benchmark of gemv 2020-02-05 14:53:37 +08:00
wjc404
096da2f51a Update dgemm_kernel_16x2_skylakex.c 2020-02-05 13:36:57 +08:00
wjc404
2f96a2c55b Update trmm_R.c 2020-02-05 10:15:02 +08:00
wjc404
833bd0f8ff Update trmm_L.c 2020-02-05 10:09:41 +08:00
wjc404
77b8f49556 Update level3_thread.c 2020-02-04 20:33:08 +08:00
wjc404
1c3e20ce48 Update level3.c 2020-02-04 20:30:23 +08:00
wjc404
83b6be7976 Update param.h 2020-02-04 19:55:26 +08:00
wjc404
081b188529 Update KERNEL.SKYLAKEX 2020-02-03 21:38:08 +08:00
wjc404
f3f969f681 Update param.h 2020-02-03 21:34:12 +08:00
wjc404
8019e70211 AVX512 16x2 DGEMM kernel 2020-02-03 21:32:56 +08:00
Martin Kroeker
8d2a796f49 Merge pull request #2378 from martin-frbg/issue2377
Add -march option for AVX512 in cmake as well
2020-01-30 17:07:19 +01:00
Martin Kroeker
8dc9fd4dfe Add -march option for AVX512 2020-01-30 12:41:18 +01:00
Martin Kroeker
abc67bdd74 Merge pull request #2375 from ewanglong/master
fix a few performance drop in some matrix size per data type
2020-01-30 10:27:29 +01:00
Martin Kroeker
1f62a82789 Merge pull request #2376 from wjc404/develop
Fix remaining bugs in parallel GEMM3M
2020-01-23 21:50:19 +01:00
wjc404
e9fb8f62b1 Update level3_gemm3m_thread.c 2020-01-22 17:40:03 +00:00
Wang,Long
fbf4f48f4a fix a few performance drop in some matrix size per data type
Signed-off-by: Wang,Long <long1.wang@intel.com>
2020-01-22 15:15:04 +00:00
Martin Kroeker
b9ad450295 Merge pull request #2373 from Qiyu8/optimize#gemmbeta
Optimize genenal Gemm Beta
2020-01-21 15:05:38 +01:00
Martin Kroeker
e011ad820a Merge pull request #2372 from martin-frbg/winexit
Do not run any cleanup if the program is exiting anyway
2020-01-21 14:56:45 +01:00
Qiyu8
ff42e68652 Optimize genenal Gemm Beta 2020-01-20 11:49:42 +08:00
Martin Kroeker
23f322f997 Do not run any cleanup if the program is exiting anyway
From keno's PR #2350 - this avoids the potential hang in blas_thread_shutdown where we may wait for threads to exit while they are waiting on the loader lock from DllMain
2020-01-19 13:28:27 +01:00
Martin Kroeker
093d37de8d Merge pull request #2371 from martin-frbg/issue2370
Free Windows thread memory with MEM_RELEASE rather than MEM_DECOMMIT
2020-01-18 20:39:34 +01:00
Martin Kroeker
d65e9a2bbd Merge pull request #2253 from thrasibule/xerbla
fix error messages
2020-01-18 20:39:04 +01:00
Martin Kroeker
78100b8093 Free Windows thread memory with MEM_RELEASE rather than MEM_DECOMMIT
as suggested by hjmndv in #2370
2020-01-18 15:06:39 +01:00
Martin Kroeker
70f45749b9 Merge pull request #2367 from wjc404/develop
Improve paralleled SGEMM performance on SKYLAKEX CPUs
2020-01-15 21:13:43 +01:00
wjc404
e5dcdeb550 Update sgemm_direct_skylakex.c 2020-01-13 16:59:23 +08:00
wjc404
952cc2ba38 Update sgemm_kernel_16x4_skylakex_2.c 2020-01-13 16:58:54 +08:00
wjc404
feaafbedd3 make skylakex sgemm code more friendly for readers
BTW some kernels were adjusted to improve performance
2020-01-13 16:28:41 +08:00
wjc404
1c67567008 improve skylakex paralleled sgemm performance 2020-01-13 16:26:03 +08:00
Martin Kroeker
4e979bf75b Merge pull request #2366 from martin-frbg/install390
Add new file lapack.h from LAPACK 3.9.0 to installable headers
2020-01-13 09:00:21 +01:00
Martin Kroeker
daa4310db5 Install new lapack.h
new file in LAPACK 3.9.0, split off from lapacke.h
2020-01-12 22:00:50 +01:00
Martin Kroeker
b8f3605132 Merge pull request #23 from xianyi/develop
rebase
2020-01-12 21:57:23 +01:00
Martin Kroeker
b36018be6d Merge pull request #2365 from wjc404/develop
Fix SKYLAKEX STRMM issues
2020-01-09 23:23:09 +01:00
wjc404
3a100b2797 Update KERNEL.SKYLAKEX 2020-01-09 13:48:41 +08:00
Martin Kroeker
38742d5547 Merge pull request #2361 from wjc404/develop
Optimize AVX2 SGEMM & STRMM
2020-01-08 16:20:28 +01:00
wjc404
bd4c032f52 Update sgemm_kernel_8x4_haswell.c 2020-01-07 11:22:46 +08:00
wjc404
9dc9b7b95e Update sgemm_kernel_8x4_haswell.c 2020-01-06 20:11:36 +08:00
wjc404
9f5cdc49d4 Update CONTRIBUTORS.md 2020-01-06 12:28:43 +08:00
wjc404
b7b408a120 optimize AVX2 SGEMM 2020-01-06 12:16:09 +08:00
wjc404
92b10212de optimize AVX2 SGEMM 2020-01-06 12:11:21 +08:00
wjc404
b73bf01378 optimize AVX2 SGEMM 2020-01-06 12:09:14 +08:00
wjc404
eb3c9f1db9 optimize AVX2 SGEMM 2020-01-06 12:07:02 +08:00
Martin Kroeker
fd2ff2714f Merge pull request #2359 from martin-frbg/lapack-pr330
Apply LAPACKE fix for eigenvector transposition in symmetric eigensolvers
2020-01-03 15:03:30 +01:00
Martin Kroeker
2ea2bd99c7 Apply LAPACKE fix for eigenvector transposition in symmetric eigensolvers
from Reference-LAPACK PR 330
2020-01-03 11:10:00 +01:00
Martin Kroeker
fbb894948c Merge pull request #22 from xianyi/develop
rebase
2020-01-03 10:23:25 +01:00
Martin Kroeker
e711659c90 Merge pull request #2358 from shengyang-3390/develop
Test all 7 declared values of N in float and double cblas3 tests
2020-01-03 09:02:03 +01:00
shengyang
893e6e57c4 modified: ctest/din3 ctest/sin3 2020-01-03 10:03:33 +08:00
Martin Kroeker
456ee2e1f0 Merge pull request #2357 from chenxuqiang/dgemm_beta_zero
kernel/arm64/dgemm_beta.S: add beta == zero branch
2020-01-02 22:28:36 +01:00
Martin Kroeker
9998f8ed8b Merge pull request #2356 from shengyang-3390/develop
Use arm neon instructions to optimize ncopy operation (and enable 7th column in float+complex cblas3 test drivers)
2020-01-02 22:27:44 +01:00
shengyang
80db5f11e1 update 2020-01-02 11:01:57 +08:00
chenxuqiang
52de4cc8fd kernel/arm64/dgemm_beta.S: add beta == zero branch
added beta == zero branch, and no need to load C matrix.

Signed by: Xuqiang Chen <chenxuqiang3@hisilicon.com>
2020-01-01 21:50:45 -05:00
Martin Kroeker
44028581cc Merge pull request #2355 from Zeyiii/dev-zeyi2
Use arm neon instructions to optimize sgemm_beta operation
2020-01-01 22:14:16 +01:00
Martin Kroeker
86ab939936 Merge pull request #2354 from ZuoQ3/develop
[WIP] Use arm neon instructions to optimize tcopy operation
2020-01-01 22:13:37 +01:00
Martin Kroeker
375b1875c8 [WIP] Update LAPACK to 3.9.0 (#2353)
* Update make.inc entries for LAPACK 3.9.0

Reference-LAPACK PR 347 changed some variable names and relative paths

* Update LAPACK to 3.9.0

* Add new functions from LAPACK 3.9.0

* Add new functions from LAPACK 3.9.0

* Restore LOADER command 

as it makes it easier to specify pthread as needed

* Restore LOADER

* Restore EIG/LIN prefixes in cmdbase

* add binary path to lapack_testing.py call

* Restore OpenMP version check

* Restore OpenMP version check

* Restore fix for out-of-bounds array accesses

from #2096
2020-01-01 13:18:53 +01:00
Martin Kroeker
6c85cb1869 Merge pull request #2352 from wjc404/develop
AVX2 ZGEMM3M kernel
2019-12-31 18:08:10 +01:00
Martin Kroeker
995768bbc5 Merge pull request #2351 from Zeyiii/develop
prefetching for dgemm_beta
2019-12-31 18:07:37 +01:00
int_13h
96ad579428 add in runtime cpu detection for zarch (#2349)
add in runtime cpu detection for zarch
2019-12-31 18:03:27 +01:00
shengyang
8d84403205 Use arm neon instructions to optimize ncopy operation
modified:   KERNEL.ARMV8
	modified:   KERNEL.TSV110
	new file:   sgemm_ncopy_4.S
2019-12-31 17:06:35 +08:00
shengyang
8729db117c modified: ctest/din3
modified:   ctest/sin3
2019-12-31 15:59:52 +08:00
w00421467
0833a4846a Use arm neon instructions to optimize sgemm_beta operation 2019-12-31 10:42:03 +08:00
zq
50f7fc1401 [WIP] Use arm neon instructions to optimize tcopy operation 2019-12-31 10:21:23 +08:00
w00421467
d1b53806be Merge remote-tracking branch 'pub/develop' into develop 2019-12-31 10:13:24 +08:00
wjc404
a0f0a802fc Update zgemm3m_kernel_4x4_haswell.c 2019-12-30 17:33:42 +08:00
wjc404
700fe5b5ee Add files via upload 2019-12-30 17:18:59 +08:00
wjc404
bb2729c855 Update CONTRIBUTORS.md 2019-12-30 16:11:37 +08:00
wjc404
aae44d040d Update CONTRIBUTORS.md 2019-12-30 16:10:08 +08:00
wjc404
6362c34ee6 Update param.h 2019-12-30 16:08:19 +08:00
wjc404
f60840c420 Update KERNEL.ZEN 2019-12-30 16:04:23 +08:00
wjc404
109e18cd96 Update KERNEL.HASWELL 2019-12-30 16:03:24 +08:00
wjc404
ae1579be13 Create zgemm3m_kernel_4x4_haswell.c 2019-12-30 16:02:51 +08:00
w00421467
3ccf8885ac prefetching for dgemm_beta 2019-12-30 11:45:49 +08:00
Martin Kroeker
454847588e Update LAPACK to 3.9.0 2019-12-29 21:27:18 +01:00
Martin Kroeker
0257f26488 Merge pull request #21 from xianyi/develop
rebase
2019-12-29 18:08:55 +01:00
Martin Kroeker
c45b7aef14 Merge pull request #2348 from wjc404/develop
AVX2 CGEMM3M kernel
2019-12-28 20:07:56 +01:00
wjc404
312060d0d6 Update CONTRIBUTORS.md 2019-12-27 23:36:13 +08:00
wjc404
cd765f094b Update cgemm3m_kernel_8x4_haswell.c 2019-12-27 18:23:29 +08:00
wjc404
64639f440f Update param.h 2019-12-27 18:06:42 +08:00
wjc404
3a66c8cac1 Update KERNEL.ZEN 2019-12-27 18:04:08 +08:00
wjc404
4c35b8dbaa Update gemm3m_level3.c 2019-12-27 18:03:01 +08:00
wjc404
ed9af2f7da Update KERNEL.HASWELL 2019-12-27 18:01:38 +08:00
wjc404
5fd1edead9 Create cgemm3m_kernel_8x4_haswell.c 2019-12-27 18:00:55 +08:00
Martin Kroeker
26478eb0d0 Merge pull request #2345 from wjc404/develop
Optimize AVX2 CGEMM
2019-12-25 22:26:41 +01:00
wjc404
eeecd623d8 Update cgemm_kernel_8x2_haswell.c 2019-12-24 00:40:16 +08:00
wjc404
3ce6bcdb5f Update CONTRIBUTORS.md 2019-12-24 00:30:16 +08:00
wjc404
6fbe51072b Update CONTRIBUTORS.md 2019-12-24 00:24:40 +08:00
wjc404
611445c7f8 Update param.h 2019-12-23 23:44:55 +08:00
wjc404
2cd9306bb5 Update KERNEL.ZEN 2019-12-23 23:42:30 +08:00
wjc404
c418c81224 Update KERNEL.HASWELL 2019-12-23 23:41:44 +08:00
wjc404
025741f16a Fast Haswell CGEMM kernel 2019-12-23 23:40:03 +08:00
Martin Kroeker
0ae49d2990 Merge pull request #2344 from wjc404/develop
Optimize AVX2 ZGEMM
2019-12-21 12:16:55 +01:00
wjc404
105e26e12a Adjust Haswell ZGEMM blocking parameters 2019-12-21 14:38:51 +08:00
wjc404
f41d52665d Fast Haswell ZGEMM kernel 2019-12-21 14:37:06 +08:00
wjc404
d573d24de7 Fast Haswell ZGEMM kernel 2019-12-21 14:35:15 +08:00
Martin Kroeker
31d6c2eb7d Merge pull request #2340 from Zeyiii/develop
[WIP] Use arm neon instructions to optimize gemm beta operation
2019-12-20 08:38:57 +01:00
w00421467
b7cc69ee62 declare DGEMM_BETA in KERNEL.ARMV8 rather than the generic KERNEL 2019-12-20 10:11:50 +08:00
w00421467
aeef942c4f use arm neon instructions to optimize gemm beta operation 2019-12-17 10:00:13 +08:00
Martin Kroeker
445ca2f418 Merge pull request #2339 from Jehan/wip/Jehan/fix-timeout
driver: more reasonable thread wait timeout on Windows.
2019-12-13 14:57:26 +01:00
Jehan
13226e3101 driver: more reasonable thread wait timeout on Windows.
It used to be 5ms, which might not be long enough in some cases for the
thread to exit well, but then when set to 5000 (5s), it would slow down
any program depending on OpenBlas.

Let's just set it to 50ms, which is at least 10 times longer than
originally, but still reasonable in case of failed thread termination.
2019-12-13 09:52:33 +01:00
Martin Kroeker
1a6ea8ee6d Merge pull request #2338 from kavanabhat/aix_mod
Changes to build on AIX in POWER8 mode
2019-12-09 17:54:49 +01:00
Martin Kroeker
c6ecb195e6 Merge pull request #2337 from martin-frbg/issue2336
Support two-digit version numbers in gcc version check
2019-12-07 09:38:06 +01:00
Martin Kroeker
b28db31429 Support two-digit version numbers in gcc version check
fixes #2336 (non-recognition of gcc 10) with patch provided by JeffreyALaw.
2019-12-06 21:23:56 +01:00
Kavana Bhat
6baa9b07d7 AIX changes for Power8 2019-12-06 04:33:32 -06:00
Martin Kroeker
a4896b5538 Update DYNAMIC_ARCH support for ARM64 and PPC (#2332)
* Update DYNAMIC_ARCH list of ARM64 targets for gmake
* Update arm64 cpu list for runtime detection
* Update DYNAMIC_ARCH list of ARM64 targets for cmake and add POWERPC targets
2019-12-04 11:06:03 +01:00
Kavana Bhat
3938e59569 AIX changes for Power8 2019-12-04 00:23:46 -06:00
Martin Kroeker
9d5079008f Merge pull request #2334 from martin-frbg/fix2228
Remove misplaced file
2019-12-03 22:23:52 +01:00
Martin Kroeker
3518617f5b Add Intel Goldmont+ cpuid
was originally in #2228 but that PR had misplaced the file in the toplevel directory
2019-12-03 08:32:29 +01:00
Martin Kroeker
715f4650d9 Delete stray copy of dynamic.c from PR 2228 2019-12-03 08:24:10 +01:00
Martin Kroeker
10705183ce Merge pull request #20 from xianyi/develop
Rebase
2019-12-03 08:22:40 +01:00
Martin Kroeker
235599f17a Merge pull request #2329 from isuruf/patch-1
Workaround an ICE in clang 9.0.0
2019-12-02 08:30:43 +01:00
Isuru Fernando
b863b32ac5 Workaround an ICE in clang 9.0.0
This bug is not there in 8.x nor in the 9.0 daily snapshot.
2019-12-01 12:59:46 -06:00
Martin Kroeker
dd04143d4a Merge pull request #2328 from martin-frbg/ppc9
Fix precompiled kernels on POWER9 and make their use conditional on (old) gcc version
2019-11-30 12:23:57 +01:00
Martin Kroeker
f3a6164bff Merge pull request #2324 from antonblanchard/power9_segv
Fix SEGV in cdot_power9
2019-11-30 00:03:42 +01:00
Martin Kroeker
dedd822d1a Fix caxpy/caxpyc naming in localentry 2019-11-29 23:56:57 +01:00
Martin Kroeker
2181fb7047 Fix caxpy/caxpyc naming in localentry 2019-11-29 23:54:15 +01:00
Martin Kroeker
a9b62c03f8 Substitute precompiled gcc7 codes only when gcc is older than 9.x 2019-11-29 23:49:50 +01:00
Martin Kroeker
97762234f9 Add variable for gcc >=9 test
used in KERNEL.POWER9
2019-11-29 23:47:23 +01:00
Martin Kroeker
948d11fc51 Merge pull request #19 from xianyi/develop
rebase
2019-11-29 23:44:09 +01:00
Martin Kroeker
c815b8fb85 Merge pull request #2323 from wjc404/develop
some optimizations of AVX512 DGEMM
2019-11-28 20:55:16 +01:00
wjc404
e20709e976 Update param.h 2019-11-28 19:57:50 +08:00
wjc404
934e601e93 Update dgemm_kernel_4x8_skylakex_2.c 2019-11-28 19:56:35 +08:00
Martin Kroeker
a4c3668f99 Merge pull request #2321 from martin-frbg/issue2319
Fix race conditions in multithreaded GEMM3M
2019-11-28 09:30:24 +01:00
Martin Kroeker
867232c6a4 Merge pull request #2327 from martin-frbg/travisosx
Cleanup IOS build and disable FORTRAN on 32bit and ios builds for now
2019-11-28 08:43:45 +01:00
Martin Kroeker
5aaf70ef95 Merge pull request #2326 from xianyi/revert-2325-travisosx
Revert "Cleanup Travis IOS xbuild and disable FORTRAN on 32bit and ios builds for now"
2019-11-28 00:17:19 +01:00
Martin Kroeker
ae2a0995cc Cleanup IOS build and disable FORTRAN on 32bit and ios builds for now
Travis recently appears unable to find a matching homebrew package for 32bit gfortran,
and the IOS crossbuild suffered from excessive output due to the known problem with "ASMNAME redefined"
warnings when CFLAGS is set in the environment
2019-11-28 00:15:36 +01:00
Martin Kroeker
83dae28ae2 Revert "Cleanup Travis IOS xbuild and disable FORTRAN on 32bit and ios builds for now" 2019-11-28 00:09:06 +01:00
Martin Kroeker
da986d2e83 Merge pull request #2325 from martin-frbg/travisosx
Cleanup Travis IOS xbuild and disable FORTRAN on 32bit and ios builds for now
2019-11-27 21:59:36 +01:00
Martin Kroeker
6bc487de35 Cleanup IOS build and disable FORTRAN on 32bit and ios builds for now
Travis recently appears unable to find a matching homebrew package for 32bit gfortran,
and the IOS crossbuild suffered from excessive output due to the known problem with "ASMNAME redefined"
warnings when CFLAGS is set in the environment
2019-11-27 15:10:57 +01:00
Anton Blanchard
cf2a8e410c Fix SEGV in cdot_power9
We were corrupting r2 because the local entry wasn't being
setup correctly.
2019-11-26 21:55:04 -07:00
wjc404
eb1e9c8c92 some optimizations 2019-11-26 14:12:20 +08:00
Martin Kroeker
f95989cbc1 Fix AVX512 capability test (always returning zero)
from #2322
2019-11-23 22:38:07 +01:00
Martin Kroeker
f3065a0eed Fix race conditions in multithreaded GEMM3M
by adding barriers (and a mutex lock for the non-OpenMP case) like it was already done for GEMM in level3_thread.c some time ago
2019-11-23 19:54:56 +01:00
Martin Kroeker
04226f1e97 Add the cpuid of the business/rackmount version of z15 as well 2019-11-21 18:14:29 +01:00
Martin Kroeker
0925ef70db Merge pull request #2316 from sharkcz/s390x
zarch: treat z15 as z14 instead of generic
2019-11-21 18:03:00 +01:00
Martin Kroeker
371e6f73d4 Merge pull request #2317 from aarnez/develop
Change bad usage of "asum" to "sum" in ZARCH versions of ?sum
2019-11-21 17:59:21 +01:00
Andreas Arnez
d117dfd505 Change bad usage of "asum" to "sum" in ZARCH versions of ?sum
The ZARCH implementations of ?sum contain a cut & paste-error: An inline
assembly argument is named "sum", but the assembly references "asum"
instead.  The mismatch causes a build error.  This is fixed.
2019-11-21 13:49:13 +01:00
Dan Horák
883c39773a zarch: treat z15 as z14 instead of generic
Signed-off-by: Dan Horák <dan@danny.cz>
2019-11-21 12:53:23 +01:00
Martin Kroeker
b09b5be0a4 Merge pull request #2315 from ewanglong/develop
revised fix windows compatible for #2313
2019-11-21 05:06:44 +01:00
Wang, Long
bfb5fbdb4d revised fix windows compatible for #2313
Signed-off-by: Wang, Long <long1.wang@intel.com>
2019-11-21 10:22:58 +08:00
Martin Kroeker
3da6d66da9 Merge pull request #2314 from Jehan/wip/Jehan/fix-openblas-crash
Fix usage of TerminateThread() causing critical section corruption.
2019-11-20 16:16:35 +01:00
Martin Kroeker
08fa83aba2 Merge pull request #2312 from martin-frbg/power8be
Further Power8 big-endian corrections
2019-11-20 15:12:06 +01:00
Martin Kroeker
63d3ee8dfc Merge pull request #2313 from ewanglong/develop
Fix the integer overflow issue for large matrix size
2019-11-20 14:49:15 +01:00
Wang, Long
1191db1a49 For the sake of windows compatible, used "unsigned long long" to ensure 64-bit length
Signed-off-by: Wang, Long <long1.wang@intel.com>
2019-11-20 21:30:47 +08:00
Jehan
1f6071590d Fix usage of TerminateThread() causing critical section corruption.
This patch was submitted to the GIMP project by a publisher wishing to
keep confidentiality (hence anonymously). I just pass along the patch.
Here is the patch explanation which came with:

First they remind us what Microsoft documentation says about
TerminateThread:
> TerminateThread is a dangerous function that should only be used in
> the most extreme cases. You should call TerminateThread only if you
> know exactly what the target thread is doing, and you control all of
> the code that the target thread could possibly be running at the time
> of the termination.
(https://docs.microsoft.com/en-us/windows/win32/api/processthreadsapi/nf-processthreadsapi-terminatethread)

Then they say that 5 milliseconds time-out might not be long enough for
the thread to exit gracefully. They propose to set it to a much higher
value (for instance here 5 seconds).

And finally you should always check the return value of
WaitForSingleObject(). In particular you want to run TerminateThread()
only if WaitForSingleObject() failed, not on success case.
2019-11-20 13:00:49 +01:00
Wang, Long
0caf1434c9 Fix the integer overflow issue for large matrix size
For large matrix, e.g. M=N=K, and M>1290, int mnk=M*N*K will overflow.
This will lead to wrong branching to single-threading. The performance
is downgraded significantly.

Signed-off-by: Wang, Long <long1.wang@intel.com>
2019-11-20 14:11:17 +08:00
Martin Kroeker
73128f3883 Merge pull request #2310 from martin-frbg/ppc440
Fix PPC440 big-endian support and disable the QCDOC qalloc routine by default
2019-11-17 23:19:48 +01:00
Martin Kroeker
cad0d150db Define alternate kernels for big-endian POWER8 2019-11-17 23:12:10 +01:00
Martin Kroeker
eba0aeb7cd Fix compilation for big-endian POWER8 2019-11-17 22:58:32 +01:00
Martin Kroeker
0c07c356c1 Define alternate kernels for big-endian PPC440 2019-11-17 19:25:08 +01:00
Martin Kroeker
82b75f97e5 Disable the old QCDOC qalloc by default and copy utility functions from memory.c
1. qalloc() appears to have been a special routine written for the PPC440-based QCDOC supercomputer(s) from around 2005, its source does not seem to be readily available. So switch the #if 1 in the code to rely on standard malloc() by default.
2. Utility functions like get_num_procs, get_num_threads that were added to the "normally" used memory.c in the meantime were still missing here.
2019-11-17 19:22:04 +01:00
Martin Kroeker
7887c45077 Merge pull request #17 from xianyi/develop
rebase
2019-11-17 19:09:49 +01:00
Martin Kroeker
3e67017ac8 Merge pull request #2309 from martin-frbg/ppc970-be
Fix PPC970 big-endian support
2019-11-17 18:22:24 +01:00
Martin Kroeker
b3ac6ee222 Define alternate kernels for big-endian PPC970
The altivec versions of SGEMM and CGEMM fail most test in LAPACK-TESTING when compiled for big endian, STRSM/CTRSM even cause segfaults. The rot kernels either fail the corresponding utest or lead to failures in LAPACK-TESTING.
2019-11-17 15:19:39 +01:00
Martin Kroeker
6082e556cd Use "generic" S/CGEMM unroll M on big-endian PPC970
as the respective PPC970 "altivec" kernels give wrong results when compiled for big endian
2019-11-17 15:10:26 +01:00
Martin Kroeker
92315173d5 Merge pull request #2308 from martin-frbg/ctestfix
Fix potential issue in the c/z blas3 ctests
2019-11-15 08:33:17 +01:00
Martin Kroeker
351d12b94e Fix potential spurious failure from uninitialized variable 2019-11-15 00:20:36 +01:00
Martin Kroeker
bf73aa141b Fix potential spurious failure from uninitialized variable 2019-11-15 00:19:24 +01:00
Martin Kroeker
71e96163db Merge pull request #2305 from wjc404/develop
AVX512 CGEMM & ZGEMM kernels
2019-11-12 07:38:37 +01:00
wjc404
819e852ae7 AVX512 CGEMM & ZGEMM kernels
96-99% 1-thread performance of MKL2018
2019-11-11 20:04:52 +08:00
Martin Kroeker
4e466d739c Merge pull request #15 from xianyi/develop
rebase
2019-11-09 18:52:08 +01:00
Martin Kroeker
4c6a457358 Merge pull request #2300 from wjc404/develop
Optimize SGEMM on SKYLAKEX CPUs
2019-11-06 07:27:33 +01:00
wjc404
836c414e22 optimizations of software prefetching 2019-11-05 13:36:56 +08:00
Martin Kroeker
d403eb3c2f Merge pull request #2302 from martin-frbg/ppc970
Disable three-operand DCBT on PPC970 regardless of operating system
2019-11-04 22:55:05 +01:00
Martin Kroeker
3cd97f1a80 Merge pull request #2301 from martin-frbg/ppc8be
Disable IDAMIN/MAX and IZAMIN/MAX optimizations on big-endian POWER8
2019-11-04 22:54:28 +01:00
Martin Kroeker
9955f0996f Merge pull request #2294 from martin-frbg/ios-cleanup
Remove obsolete workarounds for IOS on ARMV8
2019-11-04 22:53:58 +01:00
wjc404
430c11e135 Add files via upload 2019-11-04 20:10:12 +08:00
wjc404
fbacd2605d optimizations via software prefetches 2019-11-04 19:37:19 +08:00
Martin Kroeker
6fa89b06a1 Use the two-operand form of DCBT on all PPC970 regardless of OS
There seems to be no advantage to the three-operand form used in the earliest GotoBLAS kernels, and it causes compilation problems  on other than the previously special-cased platforms as well
2019-11-03 22:55:31 +01:00
Martin Kroeker
68597002ea The assembly microkernel is not safe to use on ELFv1 2019-11-03 22:42:46 +01:00
Martin Kroeker
d2a6285549 The assembly microkernel is not safe to use on ELFv1 2019-11-03 22:41:19 +01:00
Martin Kroeker
d999688d1a The assembly microkernel is not safe to use on ELFv1 2019-11-03 22:39:06 +01:00
Martin Kroeker
928fe1b28e The assembly microkernel is not safe to use on ELFv1 2019-11-03 22:37:27 +01:00
Martin Kroeker
ccc28c6d60 Merge pull request #13 from xianyi/develop
resync with upstream
2019-11-03 22:33:31 +01:00
wjc404
ae43b75a6a Add files via upload 2019-11-02 10:09:19 +08:00
wjc404
54fc06fd70 Add files via upload 2019-11-02 10:06:13 +08:00
wjc404
1df9a2013d new sgemm kernel for skylakex 2019-11-02 00:00:48 +08:00
wjc404
274ff5cdb8 update sgemm_q on skylakex cpus 2019-11-01 23:59:18 +08:00
Martin Kroeker
eb2eddf241 Merge pull request #2296 from kdunee/develop
Fixed a minor cmake problem, occuring when DYNAMIC_ARCH=ON and CMAKE_C_FLAGS was empty
2019-10-28 13:24:18 +01:00
k.dunikowski
8691825944 Fixed a minor cmake problem, occuring when DYNAMIC_CORE=ON and CMAKE_C_FLAGS was empty 2019-10-28 08:51:05 +01:00
Martin Kroeker
7dc8a76f60 Merge pull request #2293 from martin-frbg/pr2288
Add support for NetBSD by adding it to the existing xBSD conditionals
2019-10-25 23:46:39 +02:00
Martin Kroeker
df857551c0 Remove special parameter set for obsolete IOS/ARMV8 workaround 2019-10-25 23:07:00 +02:00
Martin Kroeker
85ccdce8c4 Remove the IOS fallbacks to generic C kernels 2019-10-25 23:02:37 +02:00
Martin Kroeker
aeabe0a83f Fix regex to parse -R options with and without whitespace
Both forms are seen on NetBSD (#2288)
2019-10-25 22:52:30 +02:00
Martin Kroeker
1b90989662 Add NetBSD to the xBSD conditionals 2019-10-25 12:52:49 +02:00
Martin Kroeker
e3e8b5cdca Add NetBSD 2019-10-25 12:51:06 +02:00
Martin Kroeker
69b16a894d Merge pull request #2292 from martin-frbg/g95fixes
Improve support for g95 and non-GNU ld
2019-10-25 10:35:17 +02:00
Martin Kroeker
6782e5767d Merge pull request #2291 from martin-frbg/gensymbol
Fix netlib 3.7/3.8 function enumeration for linktest
2019-10-25 10:34:50 +02:00
Martin Kroeker
48f5a89f92 Merge pull request #2282 from martin-frbg/issue2281
Optimize RPCC function on ARM64
2019-10-25 09:56:30 +02:00
Martin Kroeker
4ae1610f37 Merge pull request #2290 from martin-frbg/cpuidfixes
Fixup x86 cpuid changes from #2283
2019-10-24 22:52:15 +02:00
Martin Kroeker
911c3e2f4b Improve support for g95 and non-GNU ld
Auto-add "-fno-second-underscore" option to make LAPACKE compile (as it calls LAPACK functions that may have gotten a second underscore added otherwise). Also support -R for rpath when parsing compiler directives in f_check
2019-10-24 22:43:27 +02:00
Martin Kroeker
fab49e49e5 Move most lapack 3.7/3.8 additions to the embedded_underscores list
to allow linktest to pass with a compiler that adds a second underscore to such names
2019-10-24 21:26:20 +02:00
Martin Kroeker
b687fba5bc Disable direct clock register access on IOS and Android
as I find conflicting information on accessibility from non-priviledged processes
2019-10-24 21:18:17 +02:00
luzpaz
46a8c2519a Remove prototype of unused, unimplemented function (#2274)
* Fix source typo

Found via `codespell -q 3 -L amin,als,ba,dum,mone,nd,nto,orign -S Changelog.txt,./lapack*`

* Remove beta-thread function per request
2019-10-24 18:56:53 +02:00
Martin Kroeker
e9437eebd2 Restore Goldmont ID and improve QEMU support
#2283 had inadvertently removed Goldmont+, and cpuid was reporting a mix of Core2 and Pentium2 for some QEMU configurations
2019-10-24 18:45:27 +02:00
Martin Kroeker
3a39062cfc Merge pull request #12 from xianyi/develop
resync with upstream
2019-10-24 18:40:13 +02:00
Martin Kroeker
eaa0be1313 Merge pull request #2286 from wjc404/develop
AVX512 DGEMM kernel
2019-10-20 12:44:19 +02:00
wjc404
6ff013bae0 native support for icopy_4
90% MKL 1-thread performance.
2019-10-19 03:54:44 +08:00
wjc404
0d669e04bb Update dgemm_kernel_8x8_skylakex.c 2019-10-18 15:00:17 +08:00
wjc404
17cdd9f9e1 some correction 2019-10-18 14:58:07 +08:00
wjc404
6bcb06fcb1 make further changes to icopy_8 easier 2019-10-18 10:47:31 +08:00
wjc404
b7315f8401 Add files via upload 2019-10-16 19:23:36 +08:00
wjc404
9b19e9e1b0 Update dgemm_kernel_8x8_skylakex.c 2019-10-16 10:14:51 +08:00
wjc404
6bd67ddbab Update dgemm_kernel_8x8_skylakex.c 2019-10-16 03:20:08 +08:00
wjc404
5da9484d93 Add files via upload 2019-10-16 02:01:13 +08:00
wjc404
844629af57 Add files via upload 2019-10-16 02:00:34 +08:00
Martin Kroeker
2beaa82c05 Merge pull request #2283 from martin-frbg/issue2176
Support QEMU virtual cpu in 64bit mode as CORE2 or BARCELONA
2019-10-09 22:06:09 +02:00
Martin Kroeker
e8a2aed2b9 Support QEMU cpu calling itself 64bit AMD Athlon as well
Some QEMU instances pretend to be "AuthenticAMD" with the same family 6/model 6 even when running on an Intel host
(could be related to qemu or libvirt version and/or kvm availability). Also fix the define to depend on __x86_64__ set by the
compiler, the defines using __64BIT__ will only work for getarch_2nd.
2019-10-09 18:24:13 +02:00
Martin Kroeker
f262031685 Support QEMU virtual cpu as CORE2
qemu itself claims it is a 64bit P6, which does not exist in the wild.
2019-10-08 22:30:02 +02:00
Martin Kroeker
5f6206fa2d Simplify OSX/IOS cross-compilation and add a CI test for it (#2279)
* Add automatic fixups for OSX/IOS cross-compilation

* Add OSX/IOS cross-compilation test to Travis CI

* Handle platforms that lack hwcap.h by falling back to ARMV8

* Fix PROLOGUE for OSX/IOS
2019-10-08 20:13:14 +02:00
Martin Kroeker
f2cde2ccfb Update common_arm64.h 2019-10-08 20:12:08 +02:00
Martin Kroeker
ba7838d2e1 Merge pull request #2280 from martin-frbg/iosfix
Add overlooked part of IOS compilation fix
2019-10-08 10:25:25 +02:00
Martin Kroeker
a448884a63 Remove automatic label postfixes from macro included only once 2019-10-08 08:37:50 +02:00
Martin Kroeker
17609f88f1 Merge pull request #11 from xianyi/develop
sync with upstream
2019-10-08 08:32:52 +02:00
Martin Kroeker
3a2df19db6 Fix accidental duplication of jump instruction 2019-10-08 08:09:26 +02:00
Martin Kroeker
d2093a40d3 Merge pull request #2277 from martin-frbg/issue2275
Rewrite ARMV8 code to allow cross-compilation for IOS
2019-10-06 23:01:54 +02:00
Martin Kroeker
aa04b0925e Merge pull request #2276 from xianyi/revert-2272-thread-sqrt-of-negative
Revert "Avoid taking root of negative number in symv_thread.c"
2019-10-06 11:12:44 +02:00
Martin Kroeker
258ac56e0a Move 32bit OSX build back to xcode 8.3 but switch to gcc8 2019-10-05 10:52:47 +02:00
Martin Kroeker
56837e9d92 Make local labels in macro compatible with the xcode assembler
... which does not perform the automatic numbering on instantiation that the _@ suffix signifies
2019-10-04 14:53:23 +02:00
Martin Kroeker
bb5413863f Rewrite ARM64 PROLOGUE to make it compatible with xcode/ios 2019-10-04 14:50:03 +02:00
Martin Kroeker
32f5907fef Update 32bit macOS again to xcode 9.3
os version 10.13 "High Sierra" appears to be the oldest release now for which Homebrew provides a gcc package.
Anything older and the Travis job will run out of time building gcc from source
2019-10-03 01:09:02 +02:00
Martin Kroeker
ac10236cc8 Update the OSX BINARY=32 test to xcode9.2
in response to Homebrew updates
2019-10-02 22:35:34 +02:00
Martin Kroeker
8617d75548 Revert "Avoid taking root of negative number in symv_thread.c" 2019-10-01 23:50:41 +02:00
Martin Kroeker
c07d78b9e9 Merge pull request #2272 from seberg/thread-sqrt-of-negative
Avoid taking root of negative number in symv_thread.c
2019-09-30 11:27:29 +02:00
Sebastian Berg
6355c25dde Avoid taking root of negative number in symv_thread.c
This is similar to fixes in gh-1929, but there was one remaining
occurance of this type of pattern in the driver/level2/*_thread.c
files.
2019-09-29 22:03:12 -07:00
Martin Kroeker
5e244d80f2 Merge pull request #2271 from quickwritereader/strmm_fix
fixed bug power9 strmm . BLAS-TESTER passes
2019-09-29 13:53:45 +02:00
AbdelRauf
ede5efebab trmm fix 2019-09-29 02:28:34 +00:00
Martin Kroeker
84908d60d2 Merge pull request #2269 from martin-frbg/ppc-fixes
Ppc fixes
2019-09-27 09:52:19 +02:00
Martin Kroeker
596a22325a Fix prologue of power9 assembly cdot(c) kernel to provide cdotc 2019-09-27 00:47:18 +02:00
Martin Kroeker
7f58f3ad0e Fix mis-edits in the gcc-derived power8 caxpy kernel 2019-09-27 00:44:26 +02:00
Martin Kroeker
c0d570a357 Merge pull request #7 from xianyi/develop
update
2019-09-27 00:42:32 +02:00
Martin Kroeker
6b83079368 Count cpu cores on ARMV8 and use that to pick the GEMM_PQ parameters (#2267)
There is currently no simple way to query cache sizes on ARMV8, so this takes the number of cores as a trivial indication if the target is a server-class device with a big cache, or just a single-board toy or smartphone.
2019-09-25 23:13:24 +02:00
Martin Kroeker
673e5a0495 Replace several POWER8/9 C kernels with their gcc7-generated assembly versions (#2263)
* Add gcc7-generated assembly files for POWER8/9 isa/ica-min/max and POWER9 caxpy

To work around internal compiler errors encountered when compiling the original C source with gcc 4 and 5, and wrong code generated by gcc 8.3.0

* Use gcc-generated assembly instead of original C sources

to work around internal compiler errors encountered with gcc 4.8/5.4 and wrong code generation by gcc 8.3

* Use gcc-generated assembly instead of the original C source

to work around internal compiler errors encountered with gcc 4.8 and 5.4, and wrong code generation by gcc 8.3

* Add gcc7-generated assembler version of caxpy for power8

to work around wrong code generated by gcc 8.3

* Handle CONJ define for caxpyc

* Handle CONJ define for caxpyc

* Add gcc7-generated assembly cdot for POWER9

* Use prebuilt assembly for POWER9 cdot

created with gcc 7.3.1 to work around ICE in older gcc versions

* Exclude POWER9 from DYNAMIC_ARCH when gcc versions is lower than 6

* Update Makefile.system

* Use PROLOGUE macro to ensure correct function name for DYNAMIC_ARCH

* Disable POWER9 with old gcc versions
2019-09-22 22:35:22 +02:00
Martin Kroeker
bfa2cc7d64 Restore ppc64 CI job and remove the travis_wait that caused the problem with it 2019-09-20 10:29:35 +02:00
Martin Kroeker
e7c4d6705a Revert #2051 and replace with a better fix (#2261)
* Revert #2051 and add a better fix for TARGET=generic with DYNAMIC_ARCH
fixes #2257 without breaking #2048 again
2019-09-17 18:56:04 +02:00
Martin Kroeker
2a1911cc14 Merge pull request #6 from xianyi/develop
update to current develop
2019-09-13 14:00:23 +02:00
Martin Kroeker
9f7a9a32e3 Merge pull request #2252 from thrasibule/trtrs
Optimized ?trtrs
2019-09-12 21:45:47 +02:00
Guillaume Horel
2463938879 fix error message 2019-09-11 10:35:25 -04:00
Guillaume Horel
5d6525c87c more bugfix 2019-09-10 17:30:57 -04:00
Guillaume Horel
6cb47ea3f0 fix Makefile 2019-09-10 17:11:01 -04:00
Guillaume Horel
459bb9291d fix error codes 2019-09-10 17:10:33 -04:00
Martin Kroeker
3f1077ce6f Merge pull request #2249 from brada4/gcc7minor
Address minor warnings popping up in gcc7+
2019-09-10 08:27:32 +02:00
Martin Kroeker
eb45eb6942 Fix C compiler handling and BINARY=32 mode in CMAKE builds (#2248)
* Fix compiler identification and option setting

* Handle BINARY=32 option on X86_64

* Add xGEMM3M unroll parameters for crossbuild-target CORE2

* Replace bogus mingw64/32bit CI job with actual 32bit build

mingw64 is not multilib-capable, so using an x86_64-mingw with BINARY=32 in the CI was not going to work anyway (but build passed while BINARY=32 was ignored).
2019-09-10 08:27:06 +02:00
Guillaume Horel
f2becb777a fix Makefile 2019-09-09 11:36:50 -04:00
Guillaume Horel
5997b6b491 bugfix 2019-09-08 11:14:49 -04:00
Guillaume Horel
4b21b646ea turn on optimized code 2019-09-08 11:14:49 -04:00
Guillaume Horel
7ec7b999a5 add missing file 2019-09-08 11:14:49 -04:00
Guillaume Horel
af9ac0898a fix Makefile 2019-09-08 11:14:49 -04:00
Guillaume Horel
c7b5a459b6 add missing defines and headers 2019-09-08 11:14:49 -04:00
Guillaume Horel
9b2f0323d6 update Makefile 2019-09-08 11:14:49 -04:00
Guillaume Horel
9f6984fe4b add missing files 2019-09-08 11:14:49 -04:00
Guillaume Horel
42203dafdc add logic 2019-09-08 11:14:49 -04:00
Guillaume Horel
a4f17a9297 add missing objects 2019-09-08 11:14:49 -04:00
Guillaume Horel
733d97b2df add files 2019-09-08 11:14:49 -04:00
Guillaume Horel
ea747cf933 start working on ?trtrs 2019-09-08 11:14:49 -04:00
Andrew
4de545aa7d address minor warnings from gcc7 2019-09-07 10:21:08 +03:00
Andrew
6e9a93ec19 init 2019-09-07 10:18:46 +03:00
Martin Kroeker
fde8a8e6a0 Improve cmake build behaviour with non-host cpu targets (#2246)
1. Supply appropriate values for C/Z GEMM unroll when cross-compiling for CORE2 or ARMV7
2. Add the required xLOCAL_BUFFER_SIZE parameters for cross-compiling CORE2
3. Add -DFORCE_<target> option to getarch when building with -DTARGET=target
for #2245
2019-09-03 22:41:17 +02:00
Martin Kroeker
256fc15f5f Merge pull request #2 from xianyi/develop
update
2019-09-03 15:12:14 +02:00
Martin Kroeker
ee498525e0 Merge pull request #2242 from martin-frbg/issue2235
Add arch data for cmake cross-compiling to CORE2
2019-09-02 22:06:29 +02:00
Martin Kroeker
1fec0570f6 Add cgemm and zgemm unroll factors for core2 2019-09-02 15:03:45 +02:00
Martin Kroeker
b5af7b9c78 Disable ppc64le test environment on Travis CI
as this semi-official beta option has suddenly reverted to a standard x86_64 environment causing spurious failures
2019-08-31 18:06:12 +02:00
Martin Kroeker
f3c314550c Merge pull request #2243 from quickwritereader/develop
possible cgemv,caxpy,cdot fix
2019-08-30 23:06:23 +02:00
AbdelRauf
847c20c9b7 fix uninitialized variables i 2019-08-30 11:14:55 +00:00
AbdelRauf
4c22828812 caxpy and cdot are using vec_vsx_ld 2019-08-30 04:09:15 +00:00
AbdelRauf
e79712d969 cgemv using vec_vsx_ld instead of letting gcc to decide 2019-08-30 02:52:04 +00:00
AbdelRauf
be09551cdf aligned 2019-08-29 23:22:23 +00:00
Martin Kroeker
ec1ef6aa9e Merge pull request #2241 from martin-frbg/zdotfix
Make x86_64 zdot compile with PGI and Sun C again
2019-08-29 07:12:54 +02:00
Martin Kroeker
11c59acfb1 Keep both PGI/SUN and default code paths to avoid breaking Clang/WIndows 2019-08-28 18:07:44 +02:00
Martin Kroeker
bf0d92a310 Add arch data for cross-compiling to CORE2
for #2235
2019-08-28 17:35:56 +02:00
Martin Kroeker
db066151ee Merge pull request #2240 from martin-frbg/issue2237
Fix PGI build options (again)
2019-08-28 15:30:53 +02:00
Martin Kroeker
3a55dca2dc Make x86_64 zdot compile with PGI and Sun C again
broken by #2222 as CREAL,CIMAG do not expand to a valid lvalue with these compilers
2019-08-28 11:35:31 +02:00
Martin Kroeker
7d380f7d79 Fix PGI build options (again)
for #2237
2019-08-28 11:31:20 +02:00
Martin Kroeker
300f158d3b Merge pull request #2239 from martin-frbg/issue2231
Fix 32bit armv8 compilation regression
2019-08-28 07:54:57 +02:00
Martin Kroeker
3635fdbf2b Do not abuse the global ARCH variable as a local temporary
Setting it with a simple "uname -m" just to be able to decide whether to compile getarch.c with -march=native
may actually keep getarch from doing a proper probe. Fixes #2231, a regression caused by #2110
2019-08-27 22:52:17 +02:00
Martin Kroeker
b6552b11eb Merge pull request #2 from xianyi/develop
merge develop
2019-08-27 22:41:31 +02:00
Kavana Bhat
3dc6b26eff AIX changes for Power8 2019-08-20 06:51:35 -05:00
Martin Kroeker
5fdf9ad24f Merge pull request #2228 from martin-frbg/issue2227
Add Intel Goldmont Plus CPUID
2019-08-19 18:26:51 +02:00
Martin Kroeker
2fe967c542 Merge branch 'develop' into issue2227 2019-08-19 14:20:39 +02:00
Martin Kroeker
6d8595351c Add Intel Goldmont Plus CPUID
fixes #2227
2019-08-19 14:19:21 +02:00
Martin Kroeker
f40200f559 Merge pull request #2223 from martin-frbg/getarch-pgi
Make getarch compile with PGI
2019-08-16 12:21:30 +02:00
Martin Kroeker
a95a5e52b8 Fix PGI compiler detection for getarch 2019-08-16 09:00:11 +02:00
Martin Kroeker
e3d846ab57 Do not use -march=native with the PGI compiler 2019-08-16 08:58:10 +02:00
Martin Kroeker
8506386d82 Merge pull request #1 from xianyi/develop
rebase
2019-08-16 08:56:15 +02:00
Martin Kroeker
9ef96b32a6 Add multithreading support to the x86_64 zdot kernel (#2222)
* Add multithreading support

copied from the ThunderX2T99 kernel. For #2221
2019-08-15 22:09:12 +02:00
Martin Kroeker
b48c025974 Merge pull request #2218 from martin-frbg/issue2215
Make the new DGEMM regression test properly depend on CBLAS and LAPACKE
2019-08-14 07:32:31 +02:00
Martin Kroeker
a1fce67743 Make the new DGEMM regression test properly depend on CBLAS and LAPACKE
fixes #2215
2019-08-13 22:29:48 +02:00
Martin Kroeker
103b32fdb7 Merge pull request #2216 from martin-frbg/issue2214
Remove case-sensitivity in x86 LSAME on (AMD) cpus without CMOV
2019-08-13 13:59:33 +02:00
Martin Kroeker
aef9804089 Fix unwanted case-sensitivity in x86 LSAME for (AMD) processors without CMOV
Problem was already noticed some years ago in #238, but back then the problem was only corrected in one of the #ifdef branches.
Fixes #2214
2019-08-13 10:19:10 +02:00
Martin Kroeker
303869f572 Update with changes from 0.3.7 2019-08-11 23:31:36 +02:00
Martin Kroeker
02d9203981 Increment version to 0.3.8.dev 2019-08-11 23:28:47 +02:00
Martin Kroeker
7b6808b69c Increment version to 0.3.8.dev 2019-08-11 23:28:13 +02:00
1476 changed files with 108567 additions and 21113 deletions

View File

@@ -8,7 +8,7 @@ platform:
steps:
- name: Build and Test
image: ubuntu:19.04
image: ubuntu:18.04
environment:
CC: gcc
COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32'
@@ -32,7 +32,7 @@ platform:
steps:
- name: Build and Test
image: ubuntu:19.04
image: ubuntu:18.04
environment:
CC: gcc
COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV6 NUM_THREADS=32'
@@ -92,7 +92,7 @@ steps:
- mkdir build && cd build
- cmake $CMAKE_FLAGS ..
- make -j
- ctest
- ctest -V
---
kind: pipeline
@@ -116,7 +116,7 @@ steps:
- mkdir build && cd build
- cmake $CMAKE_FLAGS ..
- make -j
- ctest
- ctest -V
---
kind: pipeline
@@ -140,4 +140,53 @@ steps:
- mkdir build && cd build
- cmake $CMAKE_FLAGS ..
- make -j
- ctest
- ctest -V
---
kind: pipeline
name: arm64_native_test
platform:
os: linux
arch: arm64
steps:
- name: Build and Test
image: ubuntu:18.04
environment:
CC: gcc
COMMON_FLAGS: 'USE_OPENMP=1'
commands:
- echo "MAKE_FLAGS:= $COMMON_FLAGS"
- apt-get update -y
- apt-get install -y make $CC gfortran perl python g++
- $CC --version
- make QUIET_MAKE=1 $COMMON_FLAGS
- make -C test $COMMON_FLAGS
- make -C ctest $COMMON_FLAGS
- make -C utest $COMMON_FLAGS
- make -C cpp_thread_test dgemm_tester
---
kind: pipeline
name: epyc_native_test
platform:
os: linux
arch: amd64
steps:
- name: Build and Test
image: ubuntu:18.04
environment:
CC: gcc
COMMON_FLAGS: 'USE_OPENMP=1'
commands:
- echo "MAKE_FLAGS:= $COMMON_FLAGS"
- apt-get update -y
- apt-get install -y make $CC gfortran perl python g++
- $CC --version
- make QUIET_MAKE=1 $COMMON_FLAGS
- make -C test $COMMON_FLAGS
- make -C ctest $COMMON_FLAGS
- make -C utest $COMMON_FLAGS
- make -C cpp_thread_test dgemm_tester

103
.github/workflows/dynamic_arch.yml vendored Normal file
View File

@@ -0,0 +1,103 @@
name: continuous build
on: [push, pull_request]
jobs:
build:
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
matrix:
os: [ubuntu-latest, macos-latest]
fortran: [gfortran, flang]
build: [cmake, make]
steps:
- name: Checkout repository
uses: actions/checkout@v2
- name: Compilation cache
uses: actions/cache@v2
with:
path: ~/.ccache
# We include the commit sha in the cache key, as new cache entries are
# only created if there is no existing entry for the key yet.
key: ${{ runner.os }}-ccache-${{ github.sha }}
# Restore any ccache cache entry, if none for
# ${{ runner.os }}-ccache-${{ github.sha }} exists
restore-keys: |
${{ runner.os }}-ccache-
- name: Print system information
run: |
if [ "$RUNNER_OS" == "Linux" ]; then
cat /proc/cpuinfo
elif [ "$RUNNER_OS" == "macOS" ]; then
sysctl -a | grep machdep.cpu
else
echo "$RUNNER_OS not supported"
exit 1
fi
- name: Install Dependencies
run: |
if [ "$RUNNER_OS" == "Linux" ]; then
sudo apt-get install -y gfortran cmake ccache
elif [ "$RUNNER_OS" == "macOS" ]; then
brew install coreutils cmake ccache
else
echo "$RUNNER_OS not supported"
exit 1
fi
ccache -M 300M # Limit the ccache size; Github's overall cache limit is 5GB
- name: gfortran build
if: matrix.build == 'make' && matrix.fortran == 'gfortran'
run: |
if [ "$RUNNER_OS" == "Linux" ]; then
export PATH="/usr/lib/ccache:${PATH}"
elif [ "$RUNNER_OS" == "macOS" ]; then
export PATH="$(brew --prefix)/opt/ccache/libexec:${PATH}"
else
echo "$RUNNER_OS not supported"
exit 1
fi
make -j$(nproc) DYNAMIC_ARCH=1 USE_OPENMP=0
- name: flang build
if: matrix.build == 'make' && matrix.fortran == 'flang'
run: |
if [ "$RUNNER_OS" == "Linux" ]; then
export PATH="/usr/lib/ccache:${PATH}"
elif [ "$RUNNER_OS" == "macOS" ]; then
exit 0
else
echo "$RUNNER_OS not supported"
exit 1
fi
cd /usr/
sudo wget -nv https://github.com/flang-compiler/flang/releases/download/flang_20190329/flang-20190329-x86-70.tgz
sudo tar xf flang-20190329-x86-70.tgz
sudo rm flang-20190329-x86-70.tgz
cd -
make -j$(nproc) DYNAMIC_ARCH=1 USE_OPENMP=0 FC=flang
- name: CMake gfortran build
if: matrix.build == 'cmake' && matrix.fortran == 'gfortran'
run: |
if [ "$RUNNER_OS" == "Linux" ]; then
export PATH="/usr/lib/ccache:${PATH}"
elif [ "$RUNNER_OS" == "macOS" ]; then
export PATH="$(brew --prefix)/opt/ccache/libexec:${PATH}"
else
echo "$RUNNER_OS not supported"
exit 1
fi
mkdir build
cd build
cmake -DDYNAMIC_ARCH=1 -DNOFORTRAN=0 -DBUILD_WITHOUT_LAPACK=0 -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_BUILD_TYPE=Release ..
make -j$(nproc)

View File

@@ -0,0 +1,79 @@
# Only the "head" branch of the OpenBLAS package is tested
on:
push:
paths:
- '**/nightly-Homebrew-build.yml'
pull_request:
branches:
- develop
paths:
- '**/nightly-Homebrew-build.yml'
schedule:
- cron: 45 7 * * *
# This is 7:45 AM UTC daily, late at night in the USA
# Since push and pull_request will still always be building and testing the `develop` branch,
# it only makes sense to test if this file has been changed
name: Nightly-Homebrew-Build
jobs:
build-OpenBLAS-with-Homebrew:
runs-on: macos-latest
env:
DEVELOPER_DIR: /Applications/Xcode_11.4.1.app/Contents/Developer
HOMEBREW_DEVELOPER: "ON"
HOMEBREW_DISPLAY_INSTALL_TIMES: "ON"
HOMEBREW_NO_ANALYTICS: "ON"
HOMEBREW_NO_AUTO_UPDATE: "ON"
HOMEBREW_NO_BOTTLE_SOURCE_FALLBACK: "ON"
HOMEBREW_NO_INSTALL_CLEANUP: "ON"
steps:
- name: Random delay for cron job
run: |
delay=$(( RANDOM % 600 ))
printf 'Delaying for %s seconds on event %s' ${delay} "${{ github.event_name }}"
sleep ${delay}
if: github.event_name == 'schedule'
- uses: actions/checkout@v2
# This isn't even needed, technically. Homebrew will get `develop` via git
- name: Update Homebrew
if: github.event_name != 'pull_request'
run: brew update || true
- name: Install prerequisites
run: brew install --fetch-HEAD --HEAD --only-dependencies --keep-tmp openblas
- name: Install and bottle OpenBLAS
run: brew install --fetch-HEAD --HEAD --build-bottle --keep-tmp openblas
# the HEAD flags tell Homebrew to build the develop branch fetch via git
- name: Create bottle
run: |
brew bottle -v openblas
mkdir bottles
mv *.bottle.tar.gz bottles
- name: Upload bottle
uses: actions/upload-artifact@v1
with:
name: openblas--HEAD.catalina.bottle.tar.gz
path: bottles
- name: Show linkage
run: brew linkage -v openblas
- name: Test openblas
run: brew test --HEAD --verbose openblas
- name: Audit openblas formula
run: |
brew audit --strict openblas
brew cat openblas
- name: Post logs on failure
if: failure()
run: brew gist-logs --with-hostname -v openblas

5
.gitignore vendored
View File

@@ -70,6 +70,7 @@ test/SBLAT2.SUMM
test/SBLAT3.SUMM
test/ZBLAT2.SUMM
test/ZBLAT3.SUMM
test/SHBLAT3.SUMM
test/cblat1
test/cblat2
test/cblat3
@@ -79,6 +80,7 @@ test/dblat3
test/sblat1
test/sblat2
test/sblat3
test/test_shgemm
test/zblat1
test/zblat2
test/zblat3
@@ -87,4 +89,5 @@ build.*
*.swp
benchmark/*.goto
benchmark/smallscaling
CMakeCache.txt
CMakeFiles/*

View File

@@ -16,8 +16,7 @@ matrix:
before_script: &common-before
- COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32"
script:
- set -e
- travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
- make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
- make -C test $COMMON_FLAGS $BTYPE
- make -C ctest $COMMON_FLAGS $BTYPE
- make -C utest $COMMON_FLAGS $BTYPE
@@ -34,6 +33,28 @@ matrix:
- TARGET_BOX=PPC64LE_LINUX
- BTYPE="BINARY=64 USE_OPENMP=1"
- <<: *test-ubuntu
os: linux
arch: s390x
before_script:
- COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=Z13 NUM_THREADS=32"
env:
# for matrix annotation only
- TARGET_BOX=IBMZ_LINUX
- BTYPE="BINARY=64 USE_OPENMP=1"
- <<: *test-ubuntu
os: linux
dist: focal
arch: s390x
compiler: clang
before_script:
- COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=Z13 NUM_THREADS=32"
env:
# for matrix annotation only
- TARGET_BOX=IBMZ_LINUX
- BTYPE="BINARY=64 USE_OPENMP=0 CC=clang"
- <<: *test-ubuntu
env:
- TARGET_BOX=LINUX64
@@ -66,6 +87,40 @@ matrix:
- TARGET_BOX=LINUX32
- BTYPE="BINARY=32"
- os: linux
arch: ppc64le
dist: bionic
compiler: gcc
before_script:
- sudo add-apt-repository 'ppa:ubuntu-toolchain-r/test' -y
- sudo apt-get update
- sudo apt-get install gcc-9 gfortran-9 -y
script:
- make QUIET_MAKE=1 BINARY=64 USE_OPENMP=1 CC=gcc-9 FC=gfortran-9
- make -C test $COMMON_FLAGS $BTYPE
- make -C ctest $COMMON_FLAGS $BTYPE
- make -C utest $COMMON_FLAGS $BTYPE
env:
# for matrix annotation only
- TARGET_BOX=PPC64LE_LINUX_P9
- os: linux
arch: ppc64le
dist: bionic
compiler: gcc
before_script:
- sudo add-apt-repository 'ppa:ubuntu-toolchain-r/test' -y
- sudo apt-get update
- sudo apt-get install gcc-9 gfortran-9 -y
script:
- make QUIET_MAKE=1 BUILD_BFLOAT16=1 BINARY=64 USE_OPENMP=1 CC=gcc-9 FC=gfortran-9
- make -C test $COMMON_FLAGS $BTYPE
- make -C ctest $COMMON_FLAGS $BTYPE
- make -C utest $COMMON_FLAGS $BTYPE
env:
# for matrix annotation only
- TARGET_BOX=PPC64LE_LINUX_P9
- os: linux
compiler: gcc
addons:
@@ -98,7 +153,6 @@ matrix:
- sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers'
before_script: *common-before
script:
- set -e
# XXX: Disable some warnings for now to avoid exceeding Travis limit for log size.
- alpine make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
CFLAGS="-Wno-misleading-indentation -Wno-sign-conversion -Wno-incompatible-pointer-types"
@@ -141,7 +195,6 @@ matrix:
before_script:
- COMMON_ARGS="-DTARGET=NEHALEM -DNUM_THREADS=32"
script:
- set -e
- mkdir build
- CONFIG=Release
- cmake -Bbuild -H. $CMAKE_ARGS $COMMON_ARGS -DCMAKE_BUILD_TYPE=$CONFIG
@@ -160,19 +213,58 @@ matrix:
os: osx
osx_image: xcode10.1
before_script:
- COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32"
- COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32"
- brew update
- brew install gcc # for gfortran
- brew install gcc@8 # for gfortran
script:
- travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
env:
- BTYPE="BINARY=64 INTERFACE64=1"
- BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-8"
- <<: *test-macos
osx_image: xcode8.3
osx_image: xcode12
before_script:
- COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32"
- brew update
- brew install gcc@10 # for gfortran
script:
- travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
env:
- BTYPE="BINARY=32"
- BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-10"
- <<: *test-macos
osx_image: xcode10.0
env:
- BTYPE="TARGET=NEHALEM BINARY=32 NOFORTRAN=1"
- <<: *test-macos
osx_image: xcode10.1
env:
- CC="/Applications/Xcode-10.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang"
- CFLAGS="-O2 -Wno-macro-redefined -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk -arch arm64 -miphoneos-version-min=10.0"
- BTYPE="TARGET=ARMV8 BINARY=64 HOSTCC=clang NOFORTRAN=1"
- <<: *test-macos
osx_image: xcode10.1
env:
- CC="/Applications/Xcode-10.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang"
- CFLAGS="-O2 -mno-thumb -Wno-macro-redefined -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk -arch armv7 -miphoneos-version-min=5.1"
- BTYPE="TARGET=ARMV7 HOSTCC=clang NOFORTRAN=1"
- &test-graviton2
os: linux
arch: arm64-graviton2
dist: focal
group: edge
virt: lxd
compiler: gcc
addons:
apt:
packages:
- gfortran
script:
- travis_wait 45 make && make lapack-test
# whitelist
branches:
only:

View File

@@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5)
project(OpenBLAS C ASM)
set(OpenBLAS_MAJOR_VERSION 0)
set(OpenBLAS_MINOR_VERSION 3)
set(OpenBLAS_PATCH_VERSION 7)
set(OpenBLAS_PATCH_VERSION 11)
set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
# Adhere to GNU filesystem layout conventions
@@ -23,11 +23,14 @@ option(BUILD_WITHOUT_CBLAS "Do not build the C interface (CBLAS) to the BLAS fun
option(DYNAMIC_ARCH "Include support for multiple CPU targets, with automatic selection at runtime (x86/x86_64, aarch64 or ppc only)" OFF)
option(DYNAMIC_OLDER "Include specific support for older x86 cpu models (Penryn,Dunnington,Atom,Nano,Opteron) with DYNAMIC_ARCH" OFF)
option(BUILD_RELAPACK "Build with ReLAPACK (recursive implementation of several LAPACK functions on top of standard LAPACK)" OFF)
option(USE_LOCKING "Use locks even in single-threaded builds to make them callable from multiple threads" OFF)
if(${CMAKE_SYSTEM_NAME} MATCHES "Linux")
option(NO_AFFINITY "Disable support for CPU affinity masks to avoid binding processes from e.g. R or numpy/scipy to a single core" ON)
else()
set(NO_AFFINITY 1)
endif()
option(CPP_THREAD_SAFETY_TEST "Run a massively parallel DGEMM test to confirm thread safety of the library (requires OpenMP and about 1.3GB of RAM)" OFF)
option(CPP_THREAD_SAFETY_GEMV "Run a massively parallel DGEMV test to confirm thread safety of the library (requires OpenMP)" OFF)
# Add a prefix or suffix to all exported symbol names in the shared library.
# Avoids conflicts with other BLAS libraries, especially when using
@@ -86,9 +89,13 @@ if (NOT NO_LAPACK)
list(APPEND SUBDIRS lapack)
endif ()
if (NOT DEFINED BUILD_BFLOAT16)
set (BUILD_BFLOAT16 false)
endif ()
# set which float types we want to build for
if (NOT DEFINED BUILD_SINGLE AND NOT DEFINED BUILD_DOUBLE AND NOT DEFINED BUILD_COMPLEX AND NOT DEFINED BUILD_COMPLEX16)
# if none are defined, build for all
# set(BUILD_BFLOAT16 true)
set(BUILD_SINGLE true)
set(BUILD_DOUBLE true)
set(BUILD_COMPLEX true)
@@ -120,6 +127,11 @@ if (BUILD_COMPLEX16)
list(APPEND FLOAT_TYPES "ZCOMPLEX") # defines COMPLEX and DOUBLE
endif ()
if (BUILD_BFLOAT16)
message(STATUS "Building Half Precision")
list(APPEND FLOAT_TYPES "BFLOAT16") # defines nothing
endif ()
if (NOT DEFINED CORE OR "${CORE}" STREQUAL "UNKNOWN")
message(FATAL_ERROR "Detecting CPU failed. Please set TARGET explicitly, e.g. make TARGET=your_cpu_target. Please read README for details.")
endif ()
@@ -223,6 +235,10 @@ if (NOT MSVC AND NOT NOFORTRAN)
if(NOT NO_CBLAS)
add_subdirectory(ctest)
endif()
add_subdirectory(lapack-netlib/TESTING)
if (CPP_THREAD_SAFETY_TEST OR CPP_THREAD_SAFETY_GEMV)
add_subdirectory(cpp_thread_test)
endif()
endif()
set_target_properties(${OpenBLAS_LIBNAME} PROPERTIES
@@ -234,11 +250,11 @@ if (BUILD_SHARED_LIBS AND BUILD_RELAPACK)
if (NOT MSVC)
target_link_libraries(${OpenBLAS_LIBNAME} "-Wl,-allow-multiple-definition")
else()
target_link_libraries(${OpenBLAS_LIBNAME} "/FORCE:MULTIPLE")
set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} /FORCE:MULTIPLE")
endif()
endif()
if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFIX} STREQUAL "")
if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFFIX} STREQUAL "")
if (NOT DEFINED ARCH)
set(ARCH_IN "x86_64")
else()
@@ -347,10 +363,21 @@ endif()
if(NOT NO_CBLAS)
message (STATUS "Generating cblas.h in ${CMAKE_INSTALL_INCLUDEDIR}")
set(CBLAS_H ${CMAKE_BINARY_DIR}/generated/cblas.h)
file(READ ${CMAKE_CURRENT_SOURCE_DIR}/cblas.h CBLAS_H_CONTENTS)
string(REPLACE "common" "openblas_config" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}")
if (NOT ${SYMBOLPREFIX} STREQUAL "")
string(REPLACE " cblas" " ${SYMBOLPREFIX}cblas" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}")
string(REPLACE " openblas" " ${SYMBOLPREFIX}openblas" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}")
string (REPLACE " ${SYMBOLPREFIX}openblas_complex" " openblas_complex" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}")
string(REPLACE " goto" " ${SYMBOLPREFIX}goto" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}")
endif()
if (NOT ${SYMBOLSUFFIX} STREQUAL "")
string(REGEX REPLACE "(cblas[^ (]*)" "\\1${SYMBOLSUFFIX}" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}")
string(REGEX REPLACE "(openblas[^ (]*)" "\\1${SYMBOLSUFFIX}" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}")
string(REGEX REPLACE "(openblas_complex[^ ]*)${SYMBOLSUFFIX}" "\\1" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}")
string(REGEX REPLACE "(goto[^ (]*)" "\\1${SYMBOLSUFFIX}" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}")
endif()
file(WRITE ${CBLAS_H} "${CBLAS_H_CONTENTS_NEW}")
install (FILES ${CBLAS_H} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
endif()
@@ -367,11 +394,9 @@ if(NOT NO_LAPACKE)
install (FILES ${CMAKE_BINARY_DIR}/lapacke_mangling.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/openblas${SUFFIX64})
endif()
include(FindPkgConfig QUIET)
if(PKG_CONFIG_FOUND)
configure_file(${PROJECT_SOURCE_DIR}/cmake/openblas.pc.in ${PROJECT_BINARY_DIR}/openblas${SUFFIX64}.pc @ONLY)
install (FILES ${PROJECT_BINARY_DIR}/openblas${SUFFIX64}.pc DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig/)
endif()
# Install pkg-config files
configure_file(${PROJECT_SOURCE_DIR}/cmake/openblas.pc.in ${PROJECT_BINARY_DIR}/openblas${SUFFIX64}.pc @ONLY)
install (FILES ${PROJECT_BINARY_DIR}/openblas${SUFFIX64}.pc DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig/)
# GNUInstallDirs "DATADIR" wrong here; CMake search path wants "share".

View File

@@ -171,3 +171,23 @@ In chronological order:
* [2019-02-01] added missing Blas Level-1,2 (single precision) simd codes
* [2019-03-14] power9 dgemm/dtrmm kernel
* [2019-04-29] power9 sgemm/strmm kernel
* Jiachen Wang <https://github.com/wjc404>
* [2019-07-29] optimize AVX2 DGEMM
* [2019-10-20] AVX512 DGEMM kernel (4x8)
* [2019-11-06] optimize AVX512 SGEMM
* [2019-11-12] AVX512 CGEMM & ZGEMM kernels
* [2019-12-23] optimize AVX2 CGEMM and ZGEMM
* [2019-12-30] AVX2 CGEMM3M & ZGEMM3M kernels
* [2020-01-07] optimize AVX2 SGEMM and STRMM
* Rajalakshmi Srinivasaraghavan <https://github.com/RajalakshmiSR>
* [2020-04-15] Half-precision GEMM for bfloat16
* Marius Hillenbrand <https://github.com/mhillenibm>
* [2020-05-12] Revise dynamic architecture detection for IBM z
* [2020-05-12] Add new sgemm and strmm kernel for IBM z14
* [2020-09-07] Fix builds with clang on IBM z, including dynamic architecture support
* Danfeng Zhang <https://github.com/craft-zhang>
* [2020-05-20] Improve performance of SGEMM and STRMM on Arm Cortex-A53

View File

@@ -1,46 +1,289 @@
OpenBLAS ChangeLog
====================================================================
Version 0.3.11
17-Oct-2020
common:
* API change:
the newly added BFLOAT16 functions were renamed to use the
letter "B" instead of "H" to avoid potential confusion with
the IEEE "half precision float" type, i.e. the 0.3.10
SHGEMM is now SBGEMM and the corresponding build option
was changed from "BUILD_HALF" to "BUILD_BFLOAT16".
* Reduced the default BLAS3_MEM_ALLOC_THRESHOLD (used as an upper
limit for placing temporary arrays on the stack) to be compatible
with a stack size of 1mb (as imposed by the JAVA runtime library)
* Added mixed-precision dot function SBDOT and utility functions
shstobf16, shdtobf16, sbf16tos and dbf16tod to convert between
single or double precision float arrays and bfloat16 arrays
* Fixed prototypes of LAPACK_?ggsvp and LAPACK_?ggsvd functions
in lapack.h
* Fixed underflow and rounding errors in LAPACK SLANV2 and DLANV2
(causing miscalculations in e.g. SHSEQR/DHSEQR, LAPACK issue #263)
* Fixed workspace calculation in LAPACK ?GELQ (LAPACK issue #415)
* Fixed several bugs in the LAPACK testsuite
* Improved performance of TRMM and TRSM for certain problem sizes
* Fixed infinite recursions and workspace miscalculations in ReLAPACK
* CMAKE builds no longer require pkg-config for creating the .pc file
* Makefile builds no longer misread NO_CBLAS=0 or NO_LAPACK=0 as
enabling these options
* Fixed detection of gfortran when invoked through an mpi wrapper
* Improve thread reinitialization performance with OpenMP xafter a fork
* Added support for building only the subset of the library required
for a particular precision by specifying BUILD_SINGLE, BUILD_DOUBLE
* Optional function name prefixes and suffixes are now correctly
reflected in the generated cblas.h
* Added CMAKE build support for the LAPACK and multithreading tests
POWER:
* Added optimized support for POWER10
* Added support for compiling for POWER8 in 32bit mode
* Added support for compilation with LLVM/clang
* Added support for compilation with NVIDIA/PGI compilers
* Fixed building on big-endian POWER8
* Fixed miscompilation of ZDOTC by gcc10
* Fixed alignment errors in the POWER8 SAXPY kernel
* Improved CPU detection on AIX
* Supported building with older compilers on POWER9
x86_64:
* Added support for Intel Cooperlake
* Added autodetection of AMD Renoir/Matisse/Zen3 cpus
* Added autodetection of Intel Comet Lake cpus
* Reimplemented ?sum, ?dot and daxpy using universal intrinsics
* Reset the fpu state before using the fpu on Windows as a workaround
for a problem introduced in Windows 10 build 19041 (a.k.a. SDK 2004)
* Fixed potentially undefined behaviour in the dot and gemv_t kernels
* Fixed a potential segmentation fault in DYNAMIC_ARCH builds
* Fixed building for ZEN with PGI/NVIDIA and AMD AOCC compilers
ARMV7:
* Fixed cpu detection on BSD-like systems
ARMV8:
* Added preliminary support for Apple Vortex cpus
* Added support for the Cavium ThunderX3T110 cpu
* Fixed cpu detection on BSD-like systems
* Fixed compilation in -std=C18 mode
IBM Z:
* Added support for compiling with the clang compiler
* Improved GEMM performance on Z14
====================================================================
Version 0.3.10
14-Jun-2020
common:
* Improved thread locking behaviour in blas_server and parallel getrf
* Imported bugfix 394 from LAPACK (spurious reference to "XERBL"
due to overlong lines)
* Imported bugfix 403 from LAPACK (compile option "recursive" required
for correctness with Intel and PGI)
* Imported bugfix 408 from LAPACK (wrong scaling in ZHEEQUB)
* Imported bugfix 411 from LAPACK (infinite loop in LARGV/LARTG/LARTGP)
* Fixed mismatches between BUFFERSIZE and GEMM_UNROLL parameters that
could lead to crashes at large matrix sizes
* Restored internal soname in dynamic libraries on FreeBSD and Dragonfly
* Added API (openblas_setaffinity) to set the thread affinity on Linux
* Added initial infrastructure for half-precision floating point
(bfloat16) support with a generic implementation of SHGEMM
* Added CMAKE build system support for building the cblas_Xgemm3m
functions
* Fixed CMAKE support for building in a path with embedded spaces
* Fixed CMAKE (non)handling of NO_EXPRECISION and MAX_STACK_ALLOC
* Fixed GCC version detection in the Makefiles
* Allowed overriding the names of AR, AS and LD in Makefile builds
POWER:
* Fixed big-endian POWER8 ELFv2 builds on FreeBSD
* Fixed GCC version checks and DYNAMIC_ARCH builds on POWER9
* Fixed CMAKE build support for POWER9
* fixed a potential race condition in the thread buffer allocation
* Worked around LAPACK test failures on PPC G4
MIPS:
* Fixed a potential race condition in the thread buffer allocation
* Added support for MIPS 24K/24KE family based on P5600 kernels
MIPS64:
* fixed a potential race condition in the thread buffer allocation
* Added TARGET=GENERIC
ARMV7:
* Fixed a race condition in the thread buffer allocation
ARMV8:
* Fixed a race condition in the thread buffer allocation
* Fixed zero initialisation in the assembly for SGEMM and DGEMM BETA
* Improved performance of the ThunderX2 DAXPY kernel
* Added an optimized SGEMM kernel for Cortex A53
* Fixed Makefile support for INTERFACE64 (8-byte integer)
x86_64:
* Fixed a syntax error in the CMAKE setup for SkylakeX
* Improved performance of STRSM on Haswell, SkylakeX and Ryzen
* Improved SGEMM performance on SGEMM for workloads with ldc a
multiple of 1024
* Improved DGEMM performance on Skylake X
* Fixed unwanted AVX512-dependency of SGEMM in DYNAMIC_ARCH
builds created on SkylakeX
* Removed data alignment requirement in the SSE2 copy kernels
that could cause spurious crashes
* Added a workaround for an optimizer bug in AppleClang 11.0.3
* Fixed LAPACK test failures due to wrong options for Intel Fortran
* Fixed compilation and LAPACK test results with recent Flang
and AMD AOCC
* Fixed DYNAMIC_ARCH builds with CMAKE on OS X
* Fixed missing exports of cblas_i?amin, cblas_i?min, cblas_i?max,
cblas_?sum, cblas_?gemm3m in the shared library on OS
* Fixed reporting of cpu name in DYNAMIC_ARCH builds (would sometimes
show the name of an older generation chip supported by the same kernels)
IBM Z:
* Improved performance of SGEMM/STRMM and DGEMM/DTRMM on Z14
====================================================================
Version 0.3.9
1-Mar-2020
common:
* Fixed a miscompilation of the GETRF functions with CMAKE
* Imported bugfix 390 from LAPACK (missing NaN propagation in xCOMBSSQ)
* The size of the memory buffer used for splitting GEMM tasks across
multiple threads can now be configured in the build system.
POWER:
* Fixed several compilation problems related to endianness
and ELF version on POWER8 and POWER9
* Fixed use of the absolute value IAMIN/IAMAX instead of IMIN/IMAX
* Fixed a race condition in the level3 blas code
MIPS64:
* Fixed use of the absoltute value IAMIN/IAMAX instead of IMIN/IMAX
ARMV7:
* Fixed a race condition in the level3 blas code
* Fixed compilation on Android
ARMV8:
* Added support for Ampere EMAG8180
* Added support for Neoverse N1
* Improved performance of the blas_lock function
* Fixed a race condition in the level3 blas code
* Fixed a performance regression on TSV110-based servers
x86_64:
* Fixed a long-standing error with undeclared register overwrites
in the DSCAL microkernel for HASWELL,SKYLAKEX and ZEN
* Fixed a long-standing bug in the SSE implementation of IAMAX
* Fixed a CMAKE build failure with DYNAMIC_ARCH
* Fixed cpu autodetection of Goldmont+, Cannon Lake and Ice Lake
* Fixed a compilation failure on OSX with compiler name containing dash
* Fixed compilation with MinGW on SkylakeX
* Improved speed of the AVX512 GEMM3M kernel on SkylakeX
* Added an AVX512 STRMM kernel for SkylakeX
* Improved GEMM performance on Haswell and Zen
zarch:
* fixed compilation of the DYNAMIC_ARCH code
====================================================================
Version 0.3.8
9-Feb-2020
common:
` * LAPACK has been updated to 3.9.0 (plus patches up to
January 2nd, 2020)
* CMAKE support has been improved in several areas including
cross-compilation
* a thread race condition in the GEMM3M kernels was resolved
* the "generic" (plain C) gemm beta kernel used by many targets
has been sped up
* an optimized version of the LAPACK trtrs functions has been added
* an incompatibilty between the LAPACK tests and the OpenBLAS
implementation of XERBLA was resolved, removing the numerous
warnings about wrong error exits in the former
* support for NetBSD has been added
* support for compilation with g95 and non-GNU versions of ld
has been improved
* support for compilation with (upcoming) gcc 10 has been added
POWER:
* worked around miscompilation of several POWER8 and POWER9
kernels by older versions of gcc
* added support for big-endian POWER8 and for compilation on AIX
* corrected bugs in the big-endian support for PPC440 and PPC970
* DYNAMIC_ARCH support is now available in CMAKE builds as well
ARMV8:
* performance of DGEMM_BETA and SGEMM_NCOPY has been improved
* compilation for 32bit works again
* performance of the RPCC function has been improved
* improved performance on small systems
* DYNAMIC_ARCH support is now available in CMAKE builds as well
* cross-compilation from OSX to IOS was simplified
x86_64:
* a new AVX512 DGEMM kernel was added and the AVX512 SGEMM kernel
was significantly improved
* optimized AVX512 kernels for CGEMM and ZGEMM have been added
* AVX2 kernels for STRMM, SGEMM, and CGEMM have been significantly
sped up and optimized CGEMM3M and ZGEMM3M kernels have been added
* added support for QEMU virtual cpus
* a compilation problem with PGI and SUN compilers was fixed
* Intel "Goldmont plus" is now autodetected
* a potential crash on program exit on MS Windows has been fixed
x86:
* an unwanted case sensitivity in the implementation of LSAME
on older 32bit AMD cpus was fixed
zarch:
* Z15 is now supported as Z14
* DYNAMIC_ARCH is now available on ZARCH as well
====================================================================
Version 0.3.7
11-Aug 2019
common:
* having the gmake special variables TARGET_ARCH or TARGET_MACH
defined no longer causes build failures in ctest or utest
* defining NO_AFFINITY or USE_TLS to 0 in gmake builds no longer
has the same effect as setting them to 1
* a new test program was added to allow checking the library for
thread safety
* a new option USE_LOCKING was added to ensure thread safety when
OpenBLAS itself is built without multithreading but will be
called from multiple threads.
* a build failure on Linux with glibc versions earlier than 2.5
was fixed
* a runtime error with CPU enumeration (and NO_AFFINITY not set)
on glibc 2.6 was fixed
* NO_AFFINITY was added to the CMAKE options (and defaults to being
active on Linux, as in the gmake builds)
* having the gmake special variables TARGET_ARCH or TARGET_MACH
defined no longer causes build failures in ctest or utest
* defining NO_AFFINITY or USE_TLS to 0 in gmake builds no longer
has the same effect as setting them to 1
* a new test program was added to allow checking the library for
thread safety
* a new option USE_LOCKING was added to ensure thread safety when
OpenBLAS itself is built without multithreading but will be
called from multiple threads.
* a build failure on Linux with glibc versions earlier than 2.5
was fixed
* a runtime error with CPU enumeration (and NO_AFFINITY not set)
on glibc 2.6 was fixed
* NO_AFFINITY was added to the CMAKE options (and defaults to being
active on Linux, as in the gmake builds)
x86_64:
* the build-time logic for detection of AVX512 availability in
the processor and compiler was fixed
* gmake builds on OSX now set the internal name of the library to
libopenblas.0.dylib (consistent with CMAKE)
* the Haswell DGEMM kernel received a significant speedup through
improved prefetch and load instructions
* performance of DGEMM, DTRMM, DTRSM and ZDOT on Zen/Zen2 was markedly
increased by avoiding vpermpd instructions
* the SKYLAKEX (AVX512) DGEMM helper functions have now been disabled
to fix remaining errors in DGEMM, DSYMM and DTRMM
* the build-time logic for detection of AVX512 availability in
the processor and compiler was fixed
* gmake builds on OSX now set the internal name of the library to
libopenblas.0.dylib (consistent with CMAKE)
* the Haswell DGEMM kernel received a significant speedup through
improved prefetch and load instructions
* performance of DGEMM, DTRMM, DTRSM and ZDOT on Zen/Zen2 was markedly
increased by avoiding vpermpd instructions
* the SKYLAKEX (AVX512) DGEMM helper functions have now been disabled
to fix remaining errors in DGEMM, DSYMM and DTRMM
## POWER:
* added support for building on FreeBSD/powerpc64 and FreeBSD/ppc970
* added optimized kernels for POWER9 single and double precision complex BLAS3
* added optimized kernels for POWER9 SGEMM and STRMM
POWER:
* added support for building on FreeBSD/powerpc64 and FreeBSD/ppc970
* added optimized kernels for POWER9 SGEMM and STRMM
## ARMV7:
* fixed the softfp implementations of xAMAX and IxAMAX
* removed the predefined -march= flags on both ARMV5 and ARMV6 as
they were appropriate for only a subset of platforms
ARMV7:
* fixed the softfp implementations of xAMAX and IxAMAX
* removed the predefined -march= flags on both ARMV5 and ARMV6 as
they were appropriate for only a subset of platforms
====================================================================
Version 0.3.6

9
Jenkinsfile vendored Normal file
View File

@@ -0,0 +1,9 @@
node {
stage('Checkout') {
checkout
}
stage('Build') {
sh("make")
}
}

View File

@@ -56,10 +56,21 @@ ifneq ($(INTERFACE64), 0)
@echo " Use 64 bits int (equivalent to \"-i8\" in Fortran) "
endif
endif
@echo " C compiler ... $(C_COMPILER) (command line : $(CC))"
@$(CC) --version > /dev/null 2>&1;\
if [ $$? -eq 0 ]; then \
cverinfo=`$(CC) --version | sed -n '1p'`; \
echo " C compiler ... $(C_COMPILER) (cmd & version : $${cverinfo})";\
else \
echo " C compiler ... $(C_COMPILER) (command line : $(CC))";\
fi
ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
@echo " Fortran compiler ... $(F_COMPILER) (command line : $(FC))"
@$(FC) --version > /dev/null 2>&1;\
if [ $$? -eq 0 ]; then \
fverinfo=`$(FC) --version | sed -n '1p'`; \
echo " Fortran compiler ... $(F_COMPILER) (cmd & version : $${fverinfo})";\
else \
echo " Fortran compiler ... $(F_COMPILER) (command line : $(FC))";\
fi
endif
ifneq ($(OSNAME), AIX)
@echo -n " Library Name ... $(LIBNAME)"
@@ -68,9 +79,13 @@ else
endif
ifndef SMP
@echo " (Single threaded) "
@echo " (Single-threading) "
else
@echo " (Multi threaded; Max num-threads is $(NUM_THREADS))"
@echo " (Multi-threading; Max num-threads is $(NUM_THREADS))"
endif
ifeq ($(DYNAMIC_ARCH), 1)
@echo " Supporting multiple $(ARCH) cpu models with minimum requirement for the common code being $(CORE)"
endif
ifeq ($(USE_OPENMP), 1)
@@ -97,12 +112,12 @@ endif
shared :
ifneq ($(NO_SHARED), 1)
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku))
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku FreeBSD DragonFly))
@$(MAKE) -C exports so
@ln -fs $(LIBSONAME) $(LIBPREFIX).so
@ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
endif
ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD OpenBSD NetBSD DragonFly))
ifeq ($(OSNAME), $(filter $(OSNAME),OpenBSD NetBSD))
@$(MAKE) -C exports so
@ln -fs $(LIBSONAME) $(LIBPREFIX).so
endif
@@ -126,7 +141,7 @@ ifndef NO_FBLAS
$(MAKE) -C test all
endif
$(MAKE) -C utest all
ifndef NO_CBLAS
ifneq ($(NO_CBLAS), 1)
$(MAKE) -C ctest all
ifeq ($(CPP_THREAD_SAFETY_TEST), 1)
$(MAKE) -C cpp_thread_test all
@@ -229,7 +244,7 @@ ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
@$(MAKE) -C $(NETLIB_LAPACK_DIR) lapacklib
@$(MAKE) -C $(NETLIB_LAPACK_DIR) tmglib
endif
ifndef NO_LAPACKE
ifneq ($(NO_LAPACKE), 1)
@$(MAKE) -C $(NETLIB_LAPACK_DIR) lapackelib
endif
endif
@@ -247,21 +262,22 @@ prof_lapack : lapack_prebuild
lapack_prebuild :
ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
-@echo "FORTRAN = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc
-@echo "OPTS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "FC = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc
-@echo "FFLAGS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "FFLAGS_DRV = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "POPTS = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "NOOPT = -O0 $(LAPACK_NOOPT)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "FFLAGS_NOOPT = -O0 $(LAPACK_NOOPT)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "PNOOPT = $(LAPACK_FPFLAGS) -O0" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "LOADOPTS = $(FFLAGS) $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "LDFLAGS = $(FFLAGS) $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "CC = $(CC)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "override CFLAGS = $(LAPACK_CFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "override ARCH = $(AR)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "ARCHFLAGS = $(ARFLAGS) -ru" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "AR = $(AR)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "ARFLAGS = $(ARFLAGS) -ru" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "RANLIB = $(RANLIB)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "LAPACKLIB = ../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "TMGLIB = ../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "LAPACKLIB = ../../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "TMGLIB = ../../../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "BLASLIB = ../../../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "LAPACKELIB = ../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "LAPACKELIB = ../../../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "LAPACKLIB_P = ../$(LIBNAME_P)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "SUFFIX = $(SUFFIX)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "PSUFFIX = $(PSUFFIX)" >> $(NETLIB_LAPACK_DIR)/make.inc
@@ -285,6 +301,18 @@ else
endif
ifeq ($(BUILD_LAPACK_DEPRECATED), 1)
-@echo "BUILD_DEPRECATED = 1" >> $(NETLIB_LAPACK_DIR)/make.inc
endif
ifeq ($(BUILD_SINGLE), 1)
-@echo "BUILD_SINGLE = 1" >> $(NETLIB_LAPACK_DIR)/make.inc
endif
ifeq ($(BUILD_DOUBLE), 1)
-@echo "BUILD_DOUBLE = 1" >> $(NETLIB_LAPACK_DIR)/make.inc
endif
ifeq ($(BUILD_COMPLEX), 1)
-@echo "BUILD_COMPLEX = 1" >> $(NETLIB_LAPACK_DIR)/make.inc
endif
ifeq ($(BUILD_COMPLEX16), 1)
-@echo "BUILD_COMPLEX16 = 1" >> $(NETLIB_LAPACK_DIR)/make.inc
endif
-@echo "LAPACKE_WITH_TMG = 1" >> $(NETLIB_LAPACK_DIR)/make.inc
-@cat make.inc >> $(NETLIB_LAPACK_DIR)/make.inc
@@ -317,9 +345,9 @@ lapack-test :
$(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR)/TESTING/EIG xeigtstc xeigtstd xeigtsts xeigtstz
$(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR)/TESTING/LIN xlintstc xlintstd xlintstds xlintstrfd xlintstrfz xlintsts xlintstz xlintstzc xlintstrfs xlintstrfc
ifneq ($(CROSS), 1)
( cd $(NETLIB_LAPACK_DIR)/INSTALL; make all; ./testlsame; ./testslamch; ./testdlamch; \
( cd $(NETLIB_LAPACK_DIR)/INSTALL; $(MAKE) all; ./testlsame; ./testslamch; ./testdlamch; \
./testsecond; ./testdsecnd; ./testieee; ./testversion )
(cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r )
(cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r -b TESTING)
endif
lapack-runtest:
@@ -349,11 +377,12 @@ clean ::
@$(MAKE) -C kernel clean
#endif
@$(MAKE) -C reference clean
@rm -f *.$(LIBSUFFIX) *.so *~ *.exe getarch getarch_2nd *.dll *.lib *.$(SUFFIX) *.dwf $(LIBPREFIX).$(LIBSUFFIX) $(LIBPREFIX)_p.$(LIBSUFFIX) $(LIBPREFIX).so.$(MAJOR_VERSION) *.lnk myconfig.h
@rm -f *.$(LIBSUFFIX) *.so *~ *.exe getarch getarch_2nd *.dll *.lib *.$(SUFFIX) *.dwf $(LIBPREFIX).$(LIBSUFFIX) $(LIBPREFIX)_p.$(LIBSUFFIX) $(LIBPREFIX).so.$(MAJOR_VERSION) *.lnk myconfig.h *.so.renamed *.a.renamed *.so.0
ifeq ($(OSNAME), Darwin)
@rm -rf getarch.dSYM getarch_2nd.dSYM
endif
@rm -f Makefile.conf config.h Makefile_kernel.conf config_kernel.h st* *.dylib
@rm -f cblas.tmp cblas.tmp2
@touch $(NETLIB_LAPACK_DIR)/make.inc
@$(MAKE) -C $(NETLIB_LAPACK_DIR) clean
@rm -f $(NETLIB_LAPACK_DIR)/make.inc $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_mangling.h

View File

@@ -1,7 +1,7 @@
ifeq ($(CORE), $(filter $(CORE),ARMV7 CORTEXA9 CORTEXA15))
ifeq ($(OSNAME), Android)
CCOMMON_OPT += -mfpu=neon
FCOMMON_OPT += -mfpu=neon
CCOMMON_OPT += -mfpu=neon -march=armv7-a
FCOMMON_OPT += -mfpu=neon -march=armv7-a
else
CCOMMON_OPT += -mfpu=vfpv3 -march=armv7-a
FCOMMON_OPT += -mfpu=vfpv3 -march=armv7-a

View File

@@ -24,6 +24,23 @@ CCOMMON_OPT += -march=armv8-a -mtune=cortex-a73
FCOMMON_OPT += -march=armv8-a -mtune=cortex-a73
endif
# Use a72 tunings because Neoverse-N1 is only available
# in GCC>=9
ifeq ($(CORE), NEOVERSEN1)
ifeq ($(GCCVERSIONGTEQ7), 1)
ifeq ($(GCCVERSIONGTEQ9), 1)
CCOMMON_OPT += -march=armv8.2-a -mtune=neoverse-n1
FCOMMON_OPT += -march=armv8.2-a -mtune=neoverse-n1
else
CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72
FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72
endif
else
CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
endif
endif
ifeq ($(CORE), THUNDERX)
CCOMMON_OPT += -march=armv8-a -mtune=thunderx
FCOMMON_OPT += -march=armv8-a -mtune=thunderx
@@ -39,7 +56,25 @@ CCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99
FCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99
endif
ifeq ($(CORE), THUNDERX3T110)
ifeq ($(GCCVERSIONGTEQ10), 1)
CCOMMON_OPT += -march=armv8.3-a -mtune=thunderx3t110
FCOMMON_OPT += -march=armv8.3-a -mtune=thunderx3t110
else
CCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99
FCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99
endif
endif
ifeq ($(CORE), VORTEX)
CCOMMON_OPT += -march=armv8.3-a
FCOMMON_OPT += -march=armv8.3-a
endif
ifeq ($(GCCVERSIONGTEQ9), 1)
ifeq ($(CORE), TSV110)
CCOMMON_OPT += -march=armv8.2-a -mtune=tsv110
FCOMMON_OPT += -march=armv8.2-a -mtune=tsv110
endif
endif

View File

@@ -13,6 +13,14 @@ OPENBLAS_CMAKE_DIR := $(OPENBLAS_LIBRARY_DIR)/cmake/openblas
OPENBLAS_CMAKE_CONFIG := OpenBLASConfig.cmake
OPENBLAS_CMAKE_CONFIG_VERSION := OpenBLASConfigVersion.cmake
OPENBLAS_PKGCONFIG_DIR := $(OPENBLAS_LIBRARY_DIR)/pkgconfig
PKG_EXTRALIB := $(EXTRALIB)
ifeq ($(USE_OPENMP), 1)
ifeq ($(C_COMPILER), PGI)
PKG_EXTRALIB += -lomp
else
PKG_EXTRALIB += -lgomp
endif
endif
.PHONY : install
.NOTPARALLEL : install
@@ -45,12 +53,28 @@ install : lib.grd
ifndef NO_CBLAS
@echo Generating cblas.h in $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
@sed 's/common/openblas_config/g' cblas.h > "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/cblas.h"
@cp cblas.h cblas.tmp
ifdef SYMBOLPREFIX
@sed 's/cblas[^( ]*/$(SYMBOLPREFIX)&/g' cblas.tmp > cblas.tmp2
@sed 's/openblas[^( ]*/$(SYMBOLPREFIX)&/g' cblas.tmp2 > cblas.tmp
#change back any openblas_complex_float and double that got hit
@sed 's/$(SYMBOLPREFIX)openblas_complex_/openblas_complex_/g' cblas.tmp > cblas.tmp2
@sed 's/goto[^( ]*/$(SYMBOLPREFIX)&/g' cblas.tmp2 > cblas.tmp
endif
ifdef SYMBOLSUFFIX
@sed 's/cblas[^( ]*/&$(SYMBOLSUFFIX)/g' cblas.tmp > cblas.tmp2
@sed 's/openblas[^( ]*/&$(SYMBOLSUFFIX)/g' cblas.tmp2 > cblas.tmp
#change back any openblas_complex_float and double that got hit
@sed 's/\(openblas_complex_\)\([^ ]*\)$(SYMBOLSUFFIX)/\1\2 /g' cblas.tmp > cblas.tmp2
@sed 's/goto[^( ]*/&$(SYMBOLSUFFIX)/g' cblas.tmp2 > cblas.tmp
endif
@sed 's/common/openblas_config/g' cblas.tmp > "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/cblas.h"
endif
ifneq ($(OSNAME), AIX)
ifndef NO_LAPACKE
@echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapack.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapack.h"
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h"
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_config.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h"
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h.in "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h"
@@ -67,21 +91,21 @@ endif
#for install shared library
ifneq ($(NO_SHARED),1)
@echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku))
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku FreeBSD DragonFly))
@install -pm755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \
ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
endif
ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD OpenBSD NetBSD DragonFly))
ifeq ($(OSNAME), $(filter $(OSNAME),OpenBSD NetBSD))
@cp $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
ln -fs $(LIBSONAME) $(LIBPREFIX).so
endif
ifeq ($(OSNAME), Darwin)
@-cp $(LIBDYNNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
@-install_name_tool -id "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)" "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)"
@-install_name_tool -id "$(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).$(MAJOR_VERSION).dylib" "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)"
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib ; \
ln -fs $(LIBDYNNAME) $(LIBPREFIX).$(MAJOR_VERSION).dylib
@@ -100,6 +124,7 @@ else
#install on AIX has different options syntax
ifndef NO_LAPACKE
@echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
@-installbsd -c -m 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapack.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapack.h"
@-installbsd -c -m 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h"
@-installbsd -c -m 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_config.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h"
@-installbsd -c -m 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h.in "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h"
@@ -130,7 +155,7 @@ endif
@echo 'includedir='$(OPENBLAS_INCLUDE_DIR) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
@echo 'openblas_config= USE_64BITINT='$(USE_64BITINT) 'DYNAMIC_ARCH='$(DYNAMIC_ARCH) 'DYNAMIC_OLDER='$(DYNAMIC_OLDER) 'NO_CBLAS='$(NO_CBLAS) 'NO_LAPACK='$(NO_LAPACK) 'NO_LAPACKE='$(NO_LAPACKE) 'NO_AFFINITY='$(NO_AFFINITY) 'USE_OPENMP='$(USE_OPENMP) $(CORE) 'MAX_THREADS='$(NUM_THREADS)>> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
@echo 'version='$(VERSION) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
@echo 'extralib='$(EXTRALIB) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
@echo 'extralib='$(PKG_EXTRALIB) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
@cat openblas.pc.in >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
@@ -166,4 +191,3 @@ endif
@echo " endif ()" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)"
@echo "endif ()" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)"
@echo Install OK!

View File

@@ -9,23 +9,71 @@ else
USE_OPENMP = 1
endif
ifeq ($(CORE), POWER10)
COMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math
FCOMMON_OPT += -O2 -frecursive -mcpu=power10 -mtune=power10 -fno-fast-math
endif
ifeq ($(CORE), POWER9)
ifeq ($(USE_OPENMP), 1)
COMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp
FCOMMON_OPT += -O2 -frecursive -mcpu=power9 -mtune=power9 -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp
ifneq ($(C_COMPILER), PGI)
CCOMMON_OPT += -Ofast -mvsx -fno-fast-math
ifeq ($(C_COMPILER), GCC)
ifneq ($(GCCVERSIONGT4), 1)
$(warning your compiler is too old to fully support POWER9, getting a newer version of gcc is recommended)
CCOMMON_OPT += -mcpu=power8 -mtune=power8
else
COMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -malign-power -fno-fast-math
FCOMMON_OPT += -O2 -frecursive -mcpu=power9 -mtune=power9 -malign-power -fno-fast-math
CCOMMON_OPT += -mcpu=power9 -mtune=power9
endif
else
CCOMMON_OPT += -mcpu=power9 -mtune=power9
endif
else
CCOMMON_OPT += -fast -Mvect=simd -Mcache_align
endif
ifneq ($(F_COMPILER), PGI)
FCOMMON_OPT += -O2 -frecursive -fno-fast-math
ifeq ($(C_COMPILER), GCC)
ifneq ($(GCCVERSIONGT4), 1)
$(warning your compiler is too old to fully support POWER9, getting a newer version of gcc is recommended)
FCOMMON_OPT += -mcpu=power8 -mtune=power8
else
FCOMMON_OPT += -mcpu=power9 -mtune=power9
endif
else
FCOMMON_OPT += -mcpu=power9 -mtune=power9
endif
else
FCOMMON_OPT += -O2 -Mrecursive
endif
endif
ifeq ($(CORE), POWER8)
ifeq ($(USE_OPENMP), 1)
COMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp
FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp
ifneq ($(C_COMPILER), PGI)
CCOMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math
else
COMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -malign-power -fno-fast-math
FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -malign-power -fno-fast-math
CCOMMON_OPT += -fast -Mvect=simd -Mcache_align
endif
ifneq ($(F_COMPILER), PGI)
ifeq ($(OSNAME), AIX)
FCOMMON_OPT += -O1 -frecursive -mcpu=power8 -mtune=power8 -fno-fast-math
else
FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -fno-fast-math
endif
else
FCOMMON_OPT += -O2 -Mrecursive
endif
endif
ifeq ($(USE_OPENMP), 1)
ifneq ($(C_COMPILER), PGI)
CCOMMON_OPT += -DUSE_OPENMP -fopenmp
else
CCOMMON_OPT += -DUSE_OPENMP -mp
endif
ifneq ($(F_COMPILER), PGI)
FCOMMON_OPT += -DUSE_OPENMP -fopenmp
else
FCOMMON_OPT += -DUSE_OPENMP -mp
endif
endif
@@ -68,6 +116,9 @@ CCOMMON_OPT += -mpowerpc64 -maix64
ifeq ($(COMPILER_F77), g77)
FCOMMON_OPT += -mpowerpc64 -maix64
endif
ifeq ($(F_COMPILER), GFORTRAN)
FCOMMON_OPT += -mpowerpc64 -maix64
endif
ifeq ($(COMPILER_F77), xlf)
FCOMMON_OPT += -q64
endif

View File

@@ -17,7 +17,11 @@ ifdef CPUIDEMU
EXFLAGS = -DCPUIDEMU -DVENDOR=99
endif
ifeq ($(TARGET), 1004K)
ifeq ($(TARGET), MIPS24K)
TARGET_FLAGS = -mips32r2
endif
ifeq ($(TARGET), MIPS1004K)
TARGET_FLAGS = -mips32r2
endif
@@ -42,7 +46,7 @@ all: getarch_2nd
./getarch_2nd 1 >> $(TARGET_CONF)
config.h : c_check f_check getarch
perl ./c_check $(TARGET_MAKE) $(TARGET_CONF) $(CC) $(TARGET_FLAGS)
perl ./c_check $(TARGET_MAKE) $(TARGET_CONF) $(CC) $(TARGET_FLAGS) $(CFLAGS)
ifneq ($(ONLY_CBLAS), 1)
perl ./f_check $(TARGET_MAKE) $(TARGET_CONF) $(FC) $(TARGET_FLAGS)
else
@@ -59,13 +63,13 @@ endif
getarch : getarch.c cpuid.S dummy $(CPUIDEMU)
$(HOSTCC) $(CFLAGS) $(EXFLAGS) -o $(@F) getarch.c cpuid.S $(CPUIDEMU)
$(HOSTCC) $(HOST_CFLAGS) $(EXFLAGS) -o $(@F) getarch.c cpuid.S $(CPUIDEMU)
getarch_2nd : getarch_2nd.c config.h dummy
ifndef TARGET_CORE
$(HOSTCC) -I. $(CFLAGS) -o $(@F) getarch_2nd.c
$(HOSTCC) -I. $(HOST_CFLAGS) -o $(@F) getarch_2nd.c
else
$(HOSTCC) -I. $(CFLAGS) -DBUILD_KERNEL -o $(@F) getarch_2nd.c
$(HOSTCC) -I. $(HOST_CFLAGS) -DBUILD_KERNEL -o $(@F) getarch_2nd.c
endif
dummy:

View File

@@ -3,7 +3,7 @@
#
# This library's version
VERSION = 0.3.7
VERSION = 0.3.11
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
@@ -97,6 +97,15 @@ VERSION = 0.3.7
# they need to wait for the preceding API calls to finish or risk data corruption.
# NUM_PARALLEL = 2
# When multithreading, OpenBLAS needs to use a memory buffer for communicating
# and collating results for individual subranges of the original matrix. Since
# the original GotoBLAS of the early 2000s, the default size of this buffer has
# been set at a value of 32<<20 (which is 32MB) on x86_64 , twice that on PPC.
# If you expect to handle large problem sizes (beyond about 30000x30000) uncomment
# this line and adjust the (32<<n) factor if necessary. Usually an insufficient value
# manifests itself as a crash in the relevant scal kernel (sscal_k, dscal_k etc)
# BUFFERSIZE = 25
# If you don't need to install the static library, please comment this in.
# NO_STATIC = 1
@@ -263,7 +272,33 @@ COMMON_PROF = -pg
# work at all.
#
# CPP_THREAD_SAFETY_TEST = 1
#
# use this to run only the less memory-hungry GEMV test
# CPP_THREAD_SAFETY_GEMV = 1
# If you want to enable the experimental BFLOAT16 support
# BUILD_BFLOAT16 = 1
# Set the thread number threshold beyond which the job array for the threaded level3 BLAS
# will be allocated on the heap rather than the stack. (This array alone requires
# NUM_THREADS*NUM_THREADS*128 bytes of memory so should not pose a problem at low cpu
# counts, but obviously it is not the only item that ends up on the stack.
# The default value of 32 ensures that the overall requirement is compatible
# with the default 1MB stacksize imposed by having the Java VM loaded without use
# of its -Xss parameter.
# The value of 160 formerly used from about version 0.2.7 until 0.3.10 is easily compatible
# with the common Linux stacksize of 8MB but will cause crashes with unwary use of the java
# VM e.g. in Octave or with the java-based libhdfs in numpy or scipy code
# BLAS3_MEM_ALLOC_THRESHOLD = 160
# the below is not yet configurable, use cmake if you need to build only select types
BUILD_SINGLE = 1
BUILD_DOUBLE = 1
BUILD_COMPLEX = 1
BUILD_COMPLEX16 = 1
# End of user configuration
#

View File

@@ -9,9 +9,11 @@ ifndef TOPDIR
TOPDIR = .
endif
# If ARCH is not set, we use the host system's architecture.
# If ARCH is not set, we use the host system's architecture for getarch compile options.
ifndef ARCH
ARCH := $(shell uname -m)
HOSTARCH := $(shell uname -m)
else
HOSTARCH = $(ARCH)
endif
# Catch conflicting usage of ARCH in some BSD environments
@@ -19,10 +21,18 @@ ifeq ($(ARCH), amd64)
override ARCH=x86_64
else ifeq ($(ARCH), powerpc64)
override ARCH=power
else ifeq ($(ARCH), powerpc)
override ARCH=power
else ifeq ($(ARCH), i386)
override ARCH=x86
else ifeq ($(ARCH), armv6)
override ARCH=arm
else ifeq ($(ARCH), armv7)
override ARCH=arm
else ifeq ($(ARCH), aarch64)
override ARCH=arm64
else ifeq ($(ARCH), zarch)
override ARCH=zarch
endif
NETLIB_LAPACK_DIR = $(TOPDIR)/lapack-netlib
@@ -63,6 +73,18 @@ endif
#
# Beginning of system configuration
#
ifneq ($(BUILD_SINGLE),1)
ifneq ($(BUILD_DOUBLE),1)
ifneq ($(BUILD_COMPLEX),1)
ifneq ($(BUILD_COMPLEX16),1)
override BUILD_SINGLE=1
override BUILD_DOUBLE=1
override BUILD_COMPLEX=1
override BUILD_COMPLEX16=1
endif
endif
endif
endif
ifndef HOSTCC
HOSTCC = $(CC)
@@ -82,6 +104,9 @@ endif
ifeq ($(TARGET), SKYLAKEX)
GETARCH_FLAGS := -DFORCE_NEHALEM
endif
ifeq ($(TARGET), COOPERLAKE)
GETARCH_FLAGS := -DFORCE_NEHALEM
endif
ifeq ($(TARGET), SANDYBRIDGE)
GETARCH_FLAGS := -DFORCE_NEHALEM
endif
@@ -103,6 +128,9 @@ endif
ifeq ($(TARGET), ARMV8)
GETARCH_FLAGS := -DFORCE_ARMV7
endif
ifeq ($(TARGET), POWER8)
GETARCH_FLAGS := -DFORCE_POWER6
endif
endif
@@ -121,6 +149,9 @@ endif
ifeq ($(TARGET_CORE), SKYLAKEX)
GETARCH_FLAGS := -DFORCE_NEHALEM
endif
ifeq ($(TARGET_CORE), COOPERLAKE)
GETARCH_FLAGS := -DFORCE_NEHALEM
endif
ifeq ($(TARGET_CORE), SANDYBRIDGE)
GETARCH_FLAGS := -DFORCE_NEHALEM
endif
@@ -142,9 +173,9 @@ endif
endif
# On x86_64 build getarch with march=native. This is required to detect AVX512 support in getarch.
ifeq ($(ARCH), x86_64)
ifneq ($(C_COMPILER), PGI)
# On x86_64 build getarch with march=native unless the compiler is PGI. This is required to detect AVX512 support in getarch.
ifeq ($(HOSTARCH), x86_64)
ifeq ($(findstring pgcc,$(HOSTCC)),)
GETARCH_FLAGS += -march=native
endif
endif
@@ -210,7 +241,7 @@ ifndef GOTOBLAS_MAKEFILE
export GOTOBLAS_MAKEFILE = 1
# Generating Makefile.conf and config.h
DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" CFLAGS="$(GETARCH_FLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) TARGET_CORE=$(TARGET_CORE) ONLY_CBLAS=$(ONLY_CBLAS) TARGET=$(TARGET) all)
DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" HOST_CFLAGS="$(GETARCH_FLAGS)" CFLAGS="$(CFLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) TARGET_CORE=$(TARGET_CORE) ONLY_CBLAS=$(ONLY_CBLAS) TARGET=$(TARGET) all)
ifndef TARGET_CORE
include $(TOPDIR)/Makefile.conf
@@ -257,10 +288,10 @@ endif
ARFLAGS =
CPP = $(COMPILER) -E
AR = $(CROSS_SUFFIX)ar
AS = $(CROSS_SUFFIX)as
LD = $(CROSS_SUFFIX)ld
RANLIB = $(CROSS_SUFFIX)ranlib
AR ?= $(CROSS_SUFFIX)ar
AS ?= $(CROSS_SUFFIX)as
LD ?= $(CROSS_SUFFIX)ld
RANLIB ?= $(CROSS_SUFFIX)ranlib
NM = $(CROSS_SUFFIX)nm
DLLWRAP = $(CROSS_SUFFIX)dllwrap
OBJCOPY = $(CROSS_SUFFIX)objcopy
@@ -273,6 +304,25 @@ NO_LAPACK = 1
override FEXTRALIB =
endif
ifeq ($(C_COMPILER), GCC)
GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4)
GCCVERSIONGT4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 4)
GCCVERSIONGT5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 5)
GCCVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 7)
GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9)
GCCVERSIONGTEQ11 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 11)
GCCVERSIONGTEQ10 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 10)
# Note that the behavior of -dumpversion is compile-time-configurable for
# gcc-7.x and newer. Use -dumpfullversion there
ifeq ($(GCCVERSIONGTEQ7),1)
GCCDUMPVERSION_PARAM := -dumpfullversion
else
GCCDUMPVERSION_PARAM := -dumpversion
endif
GCCMINORVERSIONGTEQ2 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 2)
GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 7)
endif
#
# OS dependent settings
#
@@ -319,13 +369,9 @@ ifeq ($(C_COMPILER), CLANG)
CCOMMON_OPT += -DMS_ABI
endif
ifeq ($(C_COMPILER), GCC)
#Test for supporting MS_ABI
GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4)
GCCVERSIONGT4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 4)
GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7)
#Version tests for supporting specific features (MS_ABI, POWER9 intrinsics)
ifeq ($(GCCVERSIONGT4), 1)
# GCC Majar version > 4
# GCC Major version > 4
# It is compatible with MSVC ABI.
CCOMMON_OPT += -DMS_ABI
endif
@@ -337,7 +383,6 @@ ifeq ($(GCCMINORVERSIONGTEQ7), 1)
CCOMMON_OPT += -DMS_ABI
endif
endif
endif
# Ensure the correct stack alignment on Win32
# http://permalink.gmane.org/gmane.comp.lib.openblas.general/97
@@ -529,7 +574,7 @@ DYNAMIC_CORE += HASWELL ZEN
endif
ifneq ($(NO_AVX512), 1)
ifneq ($(NO_AVX2), 1)
DYNAMIC_CORE += SKYLAKEX
DYNAMIC_CORE += SKYLAKEX COOPERLAKE
endif
endif
endif
@@ -544,15 +589,75 @@ endif
ifeq ($(ARCH), arm64)
DYNAMIC_CORE = ARMV8
DYNAMIC_CORE += CORTEXA53
DYNAMIC_CORE += CORTEXA57
DYNAMIC_CORE += CORTEXA72
DYNAMIC_CORE += CORTEXA73
DYNAMIC_CORE += NEOVERSEN1
DYNAMIC_CORE += FALKOR
DYNAMIC_CORE += THUNDERX
DYNAMIC_CORE += THUNDERX2T99
DYNAMIC_CORE += TSV110
DYNAMIC_CORE += EMAG8180
DYNAMIC_CORE += THUNDERX3T110
endif
ifeq ($(ARCH), zarch)
DYNAMIC_CORE = ZARCH_GENERIC
# if the compiler accepts -march=arch11 or -march=z13 and can compile a file
# with z13-specific inline assembly, then we can include support for Z13.
# note: -march=z13 is equivalent to -march=arch11 yet some compiler releases
# only support one or the other.
# note: LLVM version 6.x supported -march=z13 yet could not handle vector
# registers in inline assembly, so the check for supporting the -march flag is
# not enough.
ZARCH_TEST_COMPILE=-c $(TOPDIR)/kernel/zarch/damin_z13.c -I$(TOPDIR) -o /dev/null > /dev/null 2> /dev/null
ZARCH_CC_SUPPORTS_ARCH11=$(shell $(CC) -march=arch11 $(ZARCH_TEST_COMPILE) && echo 1)
ZARCH_CC_SUPPORTS_Z13=$(shell $(CC) -march=z13 $(ZARCH_TEST_COMPILE) && echo 1)
ifeq ($(or $(ZARCH_CC_SUPPORTS_ARCH11), $(ZARCH_CC_SUPPORTS_Z13)), 1)
DYNAMIC_CORE += Z13
CCOMMON_OPT += -DDYN_Z13
else
$(info OpenBLAS: Not building Z13 kernels because the compiler $(CC) does not support it)
endif
# as above for z13, check for -march=arch12 and z14 support in the compiler.
ZARCH_CC_SUPPORTS_ARCH12=$(shell $(CC) -march=arch12 $(ZARCH_TEST_COMPILE) && echo 1)
ZARCH_CC_SUPPORTS_Z14=$(shell $(CC) -march=z14 $(ZARCH_TEST_COMPILE) && echo 1)
ifeq ($(or $(ZARCH_CC_SUPPORTS_ARCH12), $(ZARCH_CC_SUPPORTS_Z14)), 1)
DYNAMIC_CORE += Z14
CCOMMON_OPT += -DDYN_Z14
else
$(info OpenBLAS: Not building Z14 kernels because the compiler $(CC) does not support it)
endif
endif # ARCH zarch
ifeq ($(ARCH), power)
DYNAMIC_CORE = POWER6
DYNAMIC_CORE += POWER8
ifneq ($(C_COMPILER), GCC)
DYNAMIC_CORE += POWER9
DYNAMIC_CORE += POWER10
endif
ifeq ($(C_COMPILER), GCC)
ifeq ($(GCCVERSIONGT5), 1)
DYNAMIC_CORE += POWER9
else
$(info, OpenBLAS: Your gcc version is too old to build the POWER9 kernels.)
endif
ifeq ($(GCCVERSIONGTEQ11), 1)
DYNAMIC_CORE += POWER10
else ifeq ($(GCCVERSIONGTEQ10), 1)
ifeq ($(GCCMINORVERSIONGTEQ2), 1)
DYNAMIC_CORE += POWER10
endif
else
$(info, OpenBLAS: Your gcc version is too old to build the POWER10 kernels.)
endif
endif
endif
# If DYNAMIC_CORE is not set, DYNAMIC_ARCH cannot do anything, so force it to empty
@@ -607,6 +712,16 @@ endif
ifeq ($(ARCH), arm64)
NO_BINARY_MODE = 1
BINARY_DEFINED = 1
ifdef INTERFACE64
ifneq ($(INTERFACE64), 0)
ifeq ($(F_COMPILER), GFORTRAN)
FCOMMON_OPT += -fdefault-integer-8
endif
ifeq ($(F_COMPILER), FLANG)
FCOMMON_OPT += -i8
endif
endif
endif
endif
@@ -652,7 +767,12 @@ CCOMMON_OPT += -march=mips64
FCOMMON_OPT += -march=mips64
endif
ifeq ($(CORE), 1004K)
ifeq ($(CORE), MIPS24K)
CCOMMON_OPT += -mips32r2 -mtune=24kc $(MSA_FLAGS)
FCOMMON_OPT += -mips32r2 -mtune=24kc $(MSA_FLAGS)
endif
ifeq ($(CORE), MIPS1004K)
CCOMMON_OPT += -mips32r2 $(MSA_FLAGS)
FCOMMON_OPT += -mips32r2 $(MSA_FLAGS)
endif
@@ -697,7 +817,18 @@ endif
ifeq ($(C_COMPILER), PGI)
ifdef BINARY64
CCOMMON_OPT += -tp p7-64
ifeq ($(ARCH), x86_64)
CCOMMON_OPT += -tp p7-64 -D__MMX__ -Mnollvm
else
ifeq ($(ARCH), power)
ifeq ($(CORE), POWER8)
CCOMMON_OPT += -tp pwr8
endif
ifeq ($(CORE), POWER9)
CCOMMON_OPT += -tp pwr9
endif
endif
endif
else
CCOMMON_OPT += -tp p7
endif
@@ -717,6 +848,15 @@ endif
ifeq ($(F_COMPILER), FLANG)
CCOMMON_OPT += -DF_INTERFACE_FLANG
FCOMMON_OPT += -Mrecursive -Kieee
ifeq ($(OSNAME), Linux)
ifeq ($(ARCH), x86_64)
FLANG_VENDOR := $(shell expr `$(FC) --version|cut -f 1 -d "."|head -1`)
ifeq ($(FLANG_VENDOR),AOCC)
FCOMMON_OPT += -fno-unroll-loops
endif
endif
endif
ifdef BINARY64
ifdef INTERFACE64
ifneq ($(INTERFACE64), 0)
@@ -757,6 +897,9 @@ else
FCOMMON_OPT += -m32
endif
endif
ifneq ($(NO_LAPACKE), 1)
FCOMMON_OPT += -fno-second-underscore
endif
endif
endif
@@ -809,6 +952,7 @@ ifneq ($(INTERFACE64), 0)
FCOMMON_OPT += -i8
endif
endif
FCOMMON_OPT += -recursive -fp-model strict -assume protect-parens
ifeq ($(USE_OPENMP), 1)
FCOMMON_OPT += -fopenmp
endif
@@ -848,10 +992,22 @@ ifneq ($(INTERFACE64), 0)
FCOMMON_OPT += -i8
endif
endif
ifeq ($(ARCH), x86_64)
FCOMMON_OPT += -tp p7-64
else
ifeq ($(ARCH), power)
ifeq ($(CORE), POWER8)
FCOMMON_OPT += -tp pwr8
endif
ifeq ($(CORE), POWER9)
FCOMMON_OPT += -tp pwr9
endif
endif
endif
else
FCOMMON_OPT += -tp p7
endif
FCOMMON_OPT += -Mrecursive
ifeq ($(USE_OPENMP), 1)
FCOMMON_OPT += -mp
endif
@@ -1076,6 +1232,22 @@ ifeq ($(USE_TLS), 1)
CCOMMON_OPT += -DUSE_TLS
endif
ifeq ($(BUILD_BFLOAT16), 1)
CCOMMON_OPT += -DBUILD_BFLOAT16
endif
ifeq ($(BUILD_SINGLE), 1)
CCOMMON_OPT += -DBUILD_SINGLE=1
endif
ifeq ($(BUILD_DOUBLE), 1)
CCOMMON_OPT += -DBUILD_DOUBLE=1
endif
ifeq ($(BUILD_COMPLEX), 1)
CCOMMON_OPT += -DBUILD_COMPLEX=1
endif
ifeq ($(BUILD_COMPLEX16), 1)
CCOMMON_OPT += -DBUILD_COMPLEX16=1
endif
CCOMMON_OPT += -DVERSION=\"$(VERSION)\"
ifndef SYMBOLPREFIX
@@ -1102,6 +1274,9 @@ KERNELDIR = $(TOPDIR)/kernel/$(ARCH)
include $(TOPDIR)/Makefile.$(ARCH)
ifneq ($(C_COMPILER), PGI)
CCOMMON_OPT += -UASMNAME -UASMFNAME -UNAME -UCNAME -UCHAR_NAME -UCHAR_CNAME
endif
CCOMMON_OPT += -DASMNAME=$(FU)$(*F) -DASMFNAME=$(FU)$(*F)$(BU) -DNAME=$(*F)$(BU) -DCNAME=$(*F) -DCHAR_NAME=\"$(*F)$(BU)\" -DCHAR_CNAME=\"$(*F)\"
ifeq ($(CORE), PPC440)
@@ -1194,7 +1369,6 @@ endif
override CFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR)
override PFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) -DPROFILE $(COMMON_PROF)
override FFLAGS += $(COMMON_OPT) $(FCOMMON_OPT)
override FPFLAGS += $(FCOMMON_OPT) $(COMMON_PROF)
#MAKEOVERRIDES =
@@ -1300,6 +1474,8 @@ export OSNAME
export ARCH
export CORE
export LIBCORE
export __BYTE_ORDER__
export ELF_VERSION
export PGCPATH
export CONFIG
export CC
@@ -1345,7 +1521,10 @@ export KERNELDIR
export FUNCTION_PROFILE
export TARGET_CORE
export NO_AVX512
export BUILD_BFLOAT16
export SBGEMM_UNROLL_M
export SBGEMM_UNROLL_N
export SGEMM_UNROLL_M
export SGEMM_UNROLL_N
export DGEMM_UNROLL_M

View File

@@ -1,16 +1,18 @@
SBBLASOBJS_P = $(SBBLASOBJS:.$(SUFFIX)=.$(PSUFFIX))
SBLASOBJS_P = $(SBLASOBJS:.$(SUFFIX)=.$(PSUFFIX))
DBLASOBJS_P = $(DBLASOBJS:.$(SUFFIX)=.$(PSUFFIX))
QBLASOBJS_P = $(QBLASOBJS:.$(SUFFIX)=.$(PSUFFIX))
CBLASOBJS_P = $(CBLASOBJS:.$(SUFFIX)=.$(PSUFFIX))
ZBLASOBJS_P = $(ZBLASOBJS:.$(SUFFIX)=.$(PSUFFIX))
XBLASOBJS_P = $(XBLASOBJS:.$(SUFFIX)=.$(PSUFFIX))
SBEXTOBJS_P = $(SBEXTOBJS:.$(SUFFIX)=.$(PSUFFIX))
COMMONOBJS_P = $(COMMONOBJS:.$(SUFFIX)=.$(PSUFFIX))
HPLOBJS_P = $(HPLOBJS:.$(SUFFIX)=.$(PSUFFIX))
BLASOBJS = $(SBLASOBJS) $(DBLASOBJS) $(CBLASOBJS) $(ZBLASOBJS)
BLASOBJS_P = $(SBLASOBJS_P) $(DBLASOBJS_P) $(CBLASOBJS_P) $(ZBLASOBJS_P)
BLASOBJS = $(SBEXTOBJS) $(SBBLASOBJS) $(SBLASOBJS) $(DBLASOBJS) $(CBLASOBJS) $(ZBLASOBJS) $(CBAUXOBJS)
BLASOBJS_P = $(SBEXTOBJS_P) $(SBBLASOBJS_P) $(SBLASOBJS_P) $(DBLASOBJS_P) $(CBLASOBJS_P) $(ZBLASOBJS_P) $(CBAUXOBJS_P)
ifdef EXPRECISION
BLASOBJS += $(QBLASOBJS) $(XBLASOBJS)
@@ -22,19 +24,23 @@ BLASOBJS += $(QBLASOBJS) $(XBLASOBJS)
BLASOBJS_P += $(QBLASOBJS_P) $(XBLASOBJS_P)
endif
$(SBBLASOBJS) $(SBBLASOBJS_P) : override CFLAGS += -DBFLOAT16 -UDOUBLE -UCOMPLEX
$(SBLASOBJS) $(SBLASOBJS_P) : override CFLAGS += -UDOUBLE -UCOMPLEX
$(DBLASOBJS) $(DBLASOBJS_P) : override CFLAGS += -DDOUBLE -UCOMPLEX
$(QBLASOBJS) $(QBLASOBJS_P) : override CFLAGS += -DXDOUBLE -UCOMPLEX
$(CBLASOBJS) $(CBLASOBJS_P) : override CFLAGS += -UDOUBLE -DCOMPLEX
$(ZBLASOBJS) $(ZBLASOBJS_P) : override CFLAGS += -DDOUBLE -DCOMPLEX
$(XBLASOBJS) $(XBLASOBJS_P) : override CFLAGS += -DXDOUBLE -DCOMPLEX
$(SBEXTOBJS) $(SBEXTOBJS_P) : override CFLAGS += -DBFLOAT16 -UDOUBLE -UCOMPLEX
$(SBBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF)
$(SBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF)
$(DBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF)
$(QBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF)
$(CBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF)
$(ZBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF)
$(XBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF)
$(SBEXTOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF)
libs :: $(BLASOBJS) $(COMMONOBJS)
$(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^

View File

@@ -54,3 +54,19 @@ LIBATLAS = -L$(ATLASPATH)/32 -lcblas -lf77blas -latlas -lm
else
LIBATLAS = -L$(ATLASPATH)/32 -lptf77blas -lptatlas -lpthread -lm
endif
ifdef HAVE_SSE3
ifndef DYNAMIC_ARCH
CCOMMON_OPT += -msse3
FCOMMON_OPT += -msse3
ifdef HAVE_SSSE3
CCOMMON_OPT += -mssse3
FCOMMON_OPT += -mssse3
endif
ifdef HAVE_SSE4_1
CCOMMON_OPT += -msse4.1
FCOMMON_OPT += -msse4.1
endif
endif
endif

View File

@@ -8,6 +8,21 @@ endif
endif
endif
ifdef HAVE_SSE3
ifndef DYNAMIC_ARCH
CCOMMON_OPT += -msse3
FCOMMON_OPT += -msse3
ifdef HAVE_SSSE3
CCOMMON_OPT += -mssse3
FCOMMON_OPT += -mssse3
endif
ifdef HAVE_SSE4_1
CCOMMON_OPT += -msse4.1
FCOMMON_OPT += -msse4.1
endif
endif
endif
ifeq ($(CORE), SKYLAKEX)
ifndef DYNAMIC_ARCH
ifndef NO_AVX512
@@ -15,28 +30,70 @@ CCOMMON_OPT += -march=skylake-avx512
FCOMMON_OPT += -march=skylake-avx512
ifeq ($(OSNAME), CYGWIN_NT)
CCOMMON_OPT += -fno-asynchronous-unwind-tables
FCOMMON_OPT += -fno-asynchronous-unwind-tables
endif
ifeq ($(OSNAME), WINNT)
ifeq ($(C_COMPILER), GCC)
CCOMMON_OPT += -fno-asynchronous-unwind-tables
FCOMMON_OPT += -fno-asynchronous-unwind-tables
endif
endif
endif
endif
endif
ifeq ($(CORE), HASWELL)
ifeq ($(CORE), COOPERLAKE)
ifndef DYNAMIC_ARCH
ifndef NO_AVX512
ifeq ($(C_COMPILER), GCC)
# cooperlake support was added in 10.1
GCCVERSIONGTEQ10 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 10)
GCCMINORVERSIONGTEQ1 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 1)
ifeq ($(GCCVERSIONGTEQ10)$(GCCMINORVERSIONGTEQ1), 11)
CCOMMON_OPT += -march=cooperlake
FCOMMON_OPT += -march=cooperlake
endif
endif
ifeq ($(OSNAME), CYGWIN_NT)
CCOMMON_OPT += -fno-asynchronous-unwind-tables
FCOMMON_OPT += -fno-asynchronous-unwind-tables
endif
ifeq ($(OSNAME), WINNT)
ifeq ($(C_COMPILER), GCC)
CCOMMON_OPT += -fno-asynchronous-unwind-tables
FCOMMON_OPT += -fno-asynchronous-unwind-tables
endif
endif
endif
endif
endif
ifeq ($(CORE), $(filter $(CORE), HASWELL ZEN SKYLAKEX COOPERLAKE))
ifndef DYNAMIC_ARCH
ifndef NO_AVX2
ifeq ($(C_COMPILER), GCC)
# AVX2 support was added in 4.7.0
GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4)
GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7)
ifeq ($(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7), 11)
CCOMMON_OPT += -mavx2
endif
else
ifeq ($(C_COMPILER), CLANG)
CCOMMON_OPT += -mavx2
endif
endif
ifeq ($(F_COMPILER), GFORTRAN)
# AVX2 support was added in 4.7.0
GCCVERSIONGTEQ4 := $(shell expr `$(FC) -dumpversion | cut -f1 -d.` \>= 4)
GCCMINORVERSIONGTEQ7 := $(shell expr `$(FC) -dumpversion | cut -f2 -d.` \>= 7)
ifeq ($(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7), 11)
FCOMMON_OPT += -mavx2
endif
endif
endif
endif
endif

View File

@@ -5,6 +5,12 @@ FCOMMON_OPT += -march=z13 -mzvector
endif
ifeq ($(CORE), Z14)
CCOMMON_OPT += -march=z14 -mzvector
CCOMMON_OPT += -march=z14 -mzvector -O3
FCOMMON_OPT += -march=z14 -mzvector
endif
# Enable floating-point expression contraction for clang, since it is the
# default for gcc
ifeq ($(C_COMPILER), CLANG)
CCOMMON_OPT += -ffp-contract=on
endif

View File

@@ -6,8 +6,11 @@ Travis CI: [![Build Status](https://travis-ci.org/xianyi/OpenBLAS.svg?branch=dev
AppVeyor: [![Build status](https://ci.appveyor.com/api/projects/status/09sohd35n8nkkx64/branch/develop?svg=true)](https://ci.appveyor.com/project/xianyi/openblas/branch/develop)
Drone CI: [![Build Status](https://cloud.drone.io/api/badges/xianyi/OpenBLAS/status.svg?branch=develop)](https://cloud.drone.io/xianyi/OpenBLAS/)
[![Build Status](https://dev.azure.com/xianyi/OpenBLAS/_apis/build/status/xianyi.OpenBLAS?branchName=develop)](https://dev.azure.com/xianyi/OpenBLAS/_build/latest?definitionId=1&branchName=develop)
## Introduction
OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version.
@@ -25,7 +28,10 @@ You can download them from [file hosting on sourceforge.net](https://sourceforge
## Installation from Source
Download from project homepage, https://xianyi.github.com/OpenBLAS/, or check out the code
using Git from https://github.com/xianyi/OpenBLAS.git.
using Git from https://github.com/xianyi/OpenBLAS.git. (If you want the most up to date version, be
sure to use the develop branch - master is several years out of date due to a change of maintainership.)
Buildtime parameters can be chosen in Makefile.rule, see there for a short description of each option.
Most can also be given directly on the make or cmake command line.
### Dependencies
@@ -40,7 +46,10 @@ Building OpenBLAS requires the following to be installed:
Simply invoking `make` (or `gmake` on BSD) will detect the CPU automatically.
To set a specific target CPU, use `make TARGET=xxx`, e.g. `make TARGET=NEHALEM`.
The full target list is in the file `TargetList.txt`.
The full target list is in the file `TargetList.txt`. For building with `cmake`, the
usual conventions apply, i.e. create a build directory either underneath the toplevel
OpenBLAS source directory or separate from it, and invoke `cmake` there with the path
to the source tree and any build options you plan to set.
### Cross compile
@@ -53,6 +62,10 @@ Examples:
```sh
make BINARY=64 CC=mips64el-unknown-linux-gnu-gcc FC=mips64el-unknown-linux-gnu-gfortran HOSTCC=gcc TARGET=LOONGSON3A
```
or same with the newer mips-crosscompiler put out by Loongson that defaults to the 32bit ABI:
```sh
make HOSTCC=gcc CC='/opt/mips-loongson-gcc7.3-linux-gnu/2019.06-29/bin/mips-linux-gnu-gcc -mabi=64' FC='/opt/mips-loongson-gcc7.3-linux-gnu/2019.06-29/bin/mips-linux-gnu-gfortran -mabi=64' TARGET=LOONGSON3A
```
* On an x86 box, compile this library for a loongson3a CPU with loongcc (based on Open64) compiler:
```sh
@@ -101,7 +114,7 @@ The default installation directory is `/opt/OpenBLAS`.
## Supported CPUs and Operating Systems
Please read `GotoBLAS_01Readme.txt`.
Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by the 2010 GotoBLAS.
### Additional supported CPUs
@@ -109,14 +122,19 @@ Please read `GotoBLAS_01Readme.txt`.
- **Intel Xeon 56xx (Westmere)**: Used GotoBLAS2 Nehalem codes.
- **Intel Sandy Bridge**: Optimized Level-3 and Level-2 BLAS with AVX on x86-64.
- **Intel Haswell**: Optimized Level-3 and Level-2 BLAS with AVX2 and FMA on x86-64.
- **Intel Skylake**: Optimized Level-3 and Level-2 BLAS with AVX512 and FMA on x86-64.
- **Intel Haswell**: Optimized Level-3 and Level-2 BLAS with AVX2 and FMA on x86-64.
- **Intel Skylake-X**: Optimized Level-3 and Level-2 BLAS with AVX512 and FMA on x86-64.
- **AMD Bobcat**: Used GotoBLAS2 Barcelona codes.
- **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thanks to Werner Saar)
- **AMD PILEDRIVER**: Uses Bulldozer codes with some optimizations.
- **AMD STEAMROLLER**: Uses Bulldozer codes with some optimizations.
- **AMD ZEN**: Uses Haswell codes with some optimizations.
#### MIPS32
- **MIPS 1004K**: uses P5600 codes
- **MIPS 24K**: uses P5600 codes
#### MIPS64
- **ICT Loongson 3A**: Optimized Level-3 BLAS and the part of Level-1,2.
@@ -129,28 +147,61 @@ Please read `GotoBLAS_01Readme.txt`.
#### ARM64
- **ARMv8**: Experimental
- **ARM Cortex-A57**: Experimental
- **ARMv8**: Basic ARMV8 with small caches, optimized Level-3 and Level-2 BLAS
- **Cortex-A53**: same as ARMV8 (different cpu specifications)
- **Cortex A57**: Optimized Level-3 and Level-2 functions
- **Cortex A72**: same as A57 ( different cpu specifications)
- **Cortex A73**: same as A57 (different cpu specifications)
- **Falkor**: same as A57 (different cpu specifications)
- **ThunderX**: Optimized some Level-1 functions
- **ThunderX2T99**: Optimized Level-3 BLAS and parts of Levels 1 and 2
- **ThunderX3T110**
- **TSV110**: Optimized some Level-3 helper functions
- **EMAG 8180**: preliminary support based on A57
- **Neoverse N1**: (AWS Graviton2) preliminary support
- **Apple Vortex**: preliminary support based on ARMV8
#### PPC/PPC64
- **POWER8**: Optimized BLAS, only for PPC64LE (Little Endian), only with `USE_OPENMP=1`
- **POWER9**: Optimized Level-3 BLAS (real) and some Level-1,2. PPC64LE with OpenMP only.
- **POWER10**:
#### IBM zEnterprise System
- **Z13**: Optimized Level-3 BLAS and Level-1,2 (double precision)
- **Z14**: Optimized Level-3 BLAS and Level-1,2 (single precision)
- **Z13**: Optimized Level-3 BLAS and Level-1,2
- **Z14**: Optimized Level-3 BLAS and (single precision) Level-1,2
### Support for multiple targets in a single library
OpenBLAS can be built for multiple targets with runtime detection of the target cpu by specifiying `DYNAMIC_ARCH=1` in Makefile.rule, on the gmake command line or as `-DDYNAMIC_ARCH=TRUE` in cmake.
For **x86_64**, the list of targets this activates contains Prescott, Core2, Nehalem, Barcelona, Sandybridge, Bulldozer, Piledriver, Steamroller, Excavator, Haswell, Zen, SkylakeX. For cpu generations not included in this list, the corresponding older model is used. If you also specify `DYNAMIC_OLDER=1`, specific support for Penryn, Dunnington, Opteron, Opteron/SSE3, Bobcat, Atom and Nano is added. Finally there is an option `DYNAMIC_LIST` that allows to specify an individual list of targets to include instead of the default.
`DYNAMIC_ARCH` is also supported on **x86**, where it translates to Katmai, Coppermine, Northwood, Prescott, Banias,
Core2, Penryn, Dunnington, Nehalem, Athlon, Opteron, Opteron_SSE3, Barcelona, Bobcat, Atom and Nano.
On **ARMV8**, it enables support for CortexA53, CortexA57, CortexA72, CortexA73, Falkor, ThunderX, ThunderX2T99, TSV110 as well as generic ARMV8 cpus.
For **POWER**, the list encompasses POWER6, POWER8 and POWER9, on **ZARCH** it comprises Z13 and Z14.
The `TARGET` option can be used in conjunction with `DYNAMIC_ARCH=1` to specify which cpu model should be assumed for all the
common code in the library, usually you will want to set this to the oldest model you expect to encounter.
Please note that it is not possible to combine support for different architectures, so no combined 32 and 64 bit or x86_64 and arm64 in the same library.
### Supported OS
- **GNU/Linux**
- **MinGW or Visual Studio (CMake)/Windows**: Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-in-Microsoft-Visual-Studio>.
- **Darwin/macOS**: Experimental. Although GotoBLAS2 supports Darwin, we are not macOS experts.
- **Darwin/macOS/OSX/iOS**: Experimental. Although GotoBLAS2 already supports Darwin, we are not OSX/iOS experts.
- **FreeBSD**: Supported by the community. We don't actively test the library on this OS.
- **OpenBSD**: Supported by the community. We don't actively test the library on this OS.
- **NetBSD**: Supported by the community. We don't actively test the library on this OS.
- **DragonFly BSD**: Supported by the community. We don't actively test the library on this OS.
- **Android**: Supported by the community. Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-build-OpenBLAS-for-Android>.
- **AIX**: Supported on PPC up to POWER8
- **Haiku**: Supported by the community. We don't actively test the library on this OS.
- **SunOS**: Supported by the community. We don't actively test the library on this OS:
## Usage
@@ -182,7 +233,8 @@ We provide the following functions to control the number of threads at runtime:
void goto_set_num_threads(int num_threads);
void openblas_set_num_threads(int num_threads);
```
Note that these are only used once at library initialization, and are not available for
fine-tuning thread numbers in individual BLAS calls.
If you compile this library with `USE_OPENMP=1`, you should use the above functions too.
## Reporting bugs
@@ -205,7 +257,7 @@ Please see Changelog.txt to view the differences between OpenBLAS and GotoBLAS2
* Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture.
Clang 3.0 will generate the wrong AVX binary code.
* Please use GCC version 6 or LLVM version 6 and above to compile Skylake AVX512 kernels.
* The number of CPUs/cores should less than or equal to 256. On Linux `x86_64` (`amd64`),
* The number of CPUs/cores should be less than or equal to 256. On Linux `x86_64` (`amd64`),
there is experimental support for up to 1024 CPUs/cores and 128 numa nodes if you build
the library with `BIGNUMA=1`.
* OpenBLAS does not set processor affinity by default.

View File

@@ -22,6 +22,7 @@ SANDYBRIDGE
HASWELL
SKYLAKEX
ATOM
COOPERLAKE
b)AMD CPU:
ATHLON
@@ -49,6 +50,7 @@ POWER6
POWER7
POWER8
POWER9
POWER10
PPCG4
PPC970
PPC970MP
@@ -58,7 +60,8 @@ CELL
3.MIPS CPU:
P5600
1004K
MIPS1004K
MIPS24K
4.MIPS64 CPU:
SICORTEX
@@ -88,10 +91,14 @@ CORTEXA53
CORTEXA57
CORTEXA72
CORTEXA73
NEOVERSEN1
EMAG8180
FALKOR
THUNDERX
THUNDERX2T99
TSV110
THUNDERX3T110
VORTEX
9.System Z:
ZARCH_GENERIC

View File

@@ -38,7 +38,8 @@ environment:
- COMPILER: MinGW64-gcc-7.2.0-mingw
DYNAMIC_ARCH: OFF
WITH_FORTRAN: ignore
- COMPILER: MinGW64-gcc-7.2.0
- APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015
COMPILER: MinGW-gcc-6.3.0-32
- APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015
COMPILER: MinGW-gcc-5.3.0
WITH_FORTRAN: ignore
@@ -62,10 +63,10 @@ before_build:
- set PATH=%PATH:C:\Program Files\Git\usr\bin;=%
- if [%COMPILER%]==[MinGW-gcc-5.3.0] set PATH=C:\MinGW\bin;C:\msys64\usr\bin;C:\mingw-w64\x86_64-7.2.0-posix-seh-rt_v5-rev1\mingw64\bin;%PATH%
- if [%COMPILER%]==[MinGW64-gcc-7.2.0-mingw] set PATH=C:\MinGW\bin;C:\mingw-w64\x86_64-7.2.0-posix-seh-rt_v5-rev1\mingw64\bin;%PATH%
- if [%COMPILER%]==[MinGW64-gcc-7.2.0] set PATH=C:\msys64\usr\bin;C:\mingw-w64\x86_64-7.2.0-posix-seh-rt_v5-rev1\mingw64\bin;%PATH%
- if [%COMPILER%]==[MinGW-gcc-6.3.0-32] set PATH=C:\msys64\usr\bin;C:\mingw-w64\i686-6.3.0-posix-dwarf-rt_v5-rev1\mingw64\bin;%PATH%
- if [%COMPILER%]==[cl] cmake -G "Visual Studio 15 2017 Win64" ..
- if [%COMPILER%]==[MinGW64-gcc-7.2.0-mingw] cmake -G "MinGW Makefiles" -DNOFORTRAN=1 ..
- if [%COMPILER%]==[MinGW64-gcc-7.2.0] cmake -G "MSYS Makefiles" -DBINARY=32 -DNOFORTRAN=1 ..
- if [%COMPILER%]==[MinGW-gcc-6.3.0-32] cmake -G "MSYS Makefiles" -DNOFORTRAN=1 ..
- if [%COMPILER%]==[MinGW-gcc-5.3.0] cmake -G "MSYS Makefiles" -DNOFORTRAN=1 ..
- if [%WITH_FORTRAN%]==[no] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DMSVC_STATIC_CRT=ON ..
- if [%WITH_FORTRAN%]==[yes] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DBUILD_WITHOUT_LAPACK=no -DNOFORTRAN=0 ..

View File

@@ -49,3 +49,23 @@ jobs:
# we need a privileged docker run for sde process attachment
docker run --privileged intel_sde
displayName: 'Run AVX512 SkylakeX docker build / test'
- job: Windows_cl
pool:
vmImage: 'windows-latest'
steps:
- task: CMake@1
inputs:
workingDirectory: 'build' # Optional
cmakeArgs: '-G "Visual Studio 16 2019" ..'
- task: CMake@1
inputs:
cmakeArgs: '--build . --config Release'
workingDirectory: 'build'
- script: |
cd build
cd utest
dir
openblas_utest.exe

File diff suppressed because it is too large Load Diff

191
benchmark/amax.c Normal file
View File

@@ -0,0 +1,191 @@
/***************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef AMAX
#ifdef COMPLEX
#ifdef DOUBLE
#define AMAX BLASFUNC(dzamax)
#else
#define AMAX BLASFUNC(scamax)
#endif
#else
#ifdef DOUBLE
#define AMAX BLASFUNC(damax)
#else
#define AMAX BLASFUNC(samax)
#endif
#endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]){
FLOAT *x;
blasint m, i;
blasint inc_x=1;
int loops = 1;
int l;
char *p;
int from = 1;
int to = 200;
int step = 1;
struct timeval start, stop;
double time1,timeg;
argc--;argv++;
if (argc > 0) { from = atol(*argv); argc--; argv++;}
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
if (argc > 0) { step = atol(*argv); argc--; argv++;}
if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops);
if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef __linux
srandom(getpid());
#endif
fprintf(stderr, " SIZE Flops\n");
for(m = from; m <= to; m += step)
{
timeg=0;
fprintf(stderr, " %6d : ", (int)m);
for (l=0; l<loops; l++)
{
for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
gettimeofday( &start, (struct timezone *)0);
AMAX (&m, x, &inc_x);
gettimeofday( &stop, (struct timezone *)0);
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
timeg += time1;
}
timeg /= loops;
fprintf(stderr,
" %10.2f MFlops %10.6f sec\n",
COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg);
}
return 0;
}
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

192
benchmark/amin.c Normal file
View File

@@ -0,0 +1,192 @@
/***************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef AMIN
#ifdef COMPLEX
#ifdef DOUBLE
#define AMIN BLASFUNC(dzamin)
#else
#define AMIN BLASFUNC(scamin)
#endif
#else
#ifdef DOUBLE
#define AMIN BLASFUNC(damin)
#else
#define AMIN BLASFUNC(samin)
#endif
#endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]){
FLOAT *x;
blasint m, i;
blasint inc_x=1;
int loops = 1;
int l;
char *p;
int from = 1;
int to = 200;
int step = 1;
struct timeval start, stop;
double time1,timeg;
argc--;argv++;
if (argc > 0) { from = atol(*argv); argc--; argv++;}
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
if (argc > 0) { step = atol(*argv); argc--; argv++;}
if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops);
if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef __linux
srandom(getpid());
#endif
fprintf(stderr, " SIZE Flops\n");
for(m = from; m <= to; m += step)
{
timeg=0;
fprintf(stderr, " %6d : ", (int)m);
for (l=0; l<loops; l++)
{
for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
gettimeofday( &start, (struct timezone *)0);
AMIN (&m, x, &inc_x);
gettimeofday( &stop, (struct timezone *)0);
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
timeg += time1;
}
timeg /= loops;
fprintf(stderr,
" %10.2f MFlops %10.6f sec\n",
COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg);
}
return 0;
}
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

View File

@@ -128,8 +128,13 @@ int main(int argc, char *argv[]){
int to = 200;
int step = 1;
#if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS)
struct timeval start, stop;
double time1,timeg;
#else
struct timespec start = { 0, 0 }, stop = { 0, 0 };
double time1, timeg;
#endif
argc--;argv++;
@@ -147,7 +152,7 @@ int main(int argc, char *argv[]){
}
#ifdef linux
#ifdef __linux
srandom(getpid());
#endif
@@ -160,26 +165,30 @@ int main(int argc, char *argv[]){
fprintf(stderr, " %6d : ", (int)m);
for (l=0; l<loops; l++)
{
for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
#if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS)
gettimeofday( &start, (struct timezone *)0);
#else
clock_gettime(CLOCK_REALTIME, &start);
#endif
result = ASUM (&m, x, &inc_x);
gettimeofday( &stop, (struct timezone *)0);
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
#if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS)
clock_gettime(CLOCK_REALTIME, &stop);
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
#else
gettimeofday( &stop, (struct timezone *)0);
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_nsec - start.tv_nsec)) / 1.e9;
#endif
timeg += time1;
}
if (loops >1)
timeg /= loops;
#ifdef COMPLEX

202
benchmark/axpby.c Normal file
View File

@@ -0,0 +1,202 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef AXPBY
#ifdef COMPLEX
#ifdef DOUBLE
#define AXPBY BLASFUNC(zaxpby)
#else
#define AXPBY BLASFUNC(caxpby)
#endif
#else
#ifdef DOUBLE
#define AXPBY BLASFUNC(daxpby)
#else
#define AXPBY BLASFUNC(saxpby)
#endif
#endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]){
FLOAT *x, *y;
FLOAT alpha[2] = { 2.0, 2.0 };
FLOAT beta[2] = {2.0, 2.0};
blasint m, i;
blasint inc_x=1,inc_y=1;
int loops = 1;
int l;
char *p;
int from = 1;
int to = 200;
int step = 1;
struct timeval start, stop;
double time1,timeg;
argc--;argv++;
if (argc > 0) { from = atol(*argv); argc--; argv++;}
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
if (argc > 0) { step = atol(*argv); argc--; argv++;}
if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p);
fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops);
if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef __linux
srandom(getpid());
#endif
fprintf(stderr, " SIZE Flops\n");
for(m = from; m <= to; m += step)
{
timeg=0;
fprintf(stderr, " %6d : ", (int)m);
for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
for (l=0; l<loops; l++)
{
gettimeofday( &start, (struct timezone *)0);
AXPBY (&m, alpha, x, &inc_x, beta, y, &inc_y );
gettimeofday( &stop, (struct timezone *)0);
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
timeg += time1;
}
timeg /= loops;
fprintf(stderr,
" %10.2f MFlops %10.6f sec\n",
(COMPSIZE * COMPSIZE * 4. - COMPSIZE) * (double)m / timeg * 1.e-6, timeg);
}
return 0;
}
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

View File

@@ -128,7 +128,7 @@ int main(int argc, char *argv[]){
int to = 200;
int step = 1;
struct timeval start, stop;
struct timespec start, stop;
double time1,timeg;
argc--;argv++;
@@ -151,7 +151,7 @@ int main(int argc, char *argv[]){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef linux
#ifdef __linux
srandom(getpid());
#endif
@@ -175,13 +175,13 @@ int main(int argc, char *argv[]){
for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
gettimeofday( &start, (struct timezone *)0);
clock_gettime( CLOCK_REALTIME, &start);
AXPY (&m, alpha, x, &inc_x, y, &inc_y );
gettimeofday( &stop, (struct timezone *)0);
clock_gettime( CLOCK_REALTIME, &stop);
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_nsec - start.tv_nsec)) * 1.e-9;
timeg += time1;
@@ -190,7 +190,7 @@ int main(int argc, char *argv[]){
timeg /= loops;
fprintf(stderr,
" %10.2f MFlops %10.6f sec\n",
" %10.2f MFlops %10.9f sec\n",
COMPSIZE * COMPSIZE * 2. * (double)m / timeg * 1.e-6, timeg);
}

View File

@@ -173,46 +173,46 @@ int main(int argc, char *argv[]){
#ifndef COMPLEX
if (uplos & 1) {
for (j = 0; j < m; j++) {
for(i = 0; i < j; i++) a[i + j * m] = 0.;
a[j + j * m] = ((double) rand() / (double) RAND_MAX) + 8.;
for(i = j + 1; i < m; i++) a[i + j * m] = ((double) rand() / (double) RAND_MAX) - 0.5;
for(i = 0; i < j; i++) a[(long)i + (long)j * (long)m] = 0.;
a[(long)j + (long)j * (long)m] = ((double) rand() / (double) RAND_MAX) + 8.;
for(i = j + 1; i < m; i++) a[(long)i + (long)j * (long)m] = ((double) rand() / (double) RAND_MAX) - 0.5;
}
} else {
for (j = 0; j < m; j++) {
for(i = 0; i < j; i++) a[i + j * m] = ((double) rand() / (double) RAND_MAX) - 0.5;
a[j + j * m] = ((double) rand() / (double) RAND_MAX) + 8.;
for(i = j + 1; i < m; i++) a[i + j * m] = 0.;
for(i = 0; i < j; i++) a[(long)i + (long)j * (long)m] = ((double) rand() / (double) RAND_MAX) - 0.5;
a[(long)j + (long)j * (long)m] = ((double) rand() / (double) RAND_MAX) + 8.;
for(i = j + 1; i < m; i++) a[(long)i + (long)j * (long)m] = 0.;
}
}
#else
if (uplos & 1) {
for (j = 0; j < m; j++) {
for(i = 0; i < j; i++) {
a[(i + j * m) * 2 + 0] = 0.;
a[(i + j * m) * 2 + 1] = 0.;
a[((long)i + (long)j * (long)m) * 2 + 0] = 0.;
a[((long)i + (long)j * (long)m) * 2 + 1] = 0.;
}
a[(j + j * m) * 2 + 0] = ((double) rand() / (double) RAND_MAX) + 8.;
a[(j + j * m) * 2 + 1] = 0.;
a[((long)j + (long)j * (long)m) * 2 + 0] = ((double) rand() / (double) RAND_MAX) + 8.;
a[((long)j + (long)j * (long)m) * 2 + 1] = 0.;
for(i = j + 1; i < m; i++) {
a[(i + j * m) * 2 + 0] = ((double) rand() / (double) RAND_MAX) - 0.5;
a[(i + j * m) * 2 + 1] = ((double) rand() / (double) RAND_MAX) - 0.5;
a[((long)i + (long)j * (long)m) * 2 + 0] = ((double) rand() / (double) RAND_MAX) - 0.5;
a[((long)i + (long)j * (long)m) * 2 + 1] = ((double) rand() / (double) RAND_MAX) - 0.5;
}
}
} else {
for (j = 0; j < m; j++) {
for(i = 0; i < j; i++) {
a[(i + j * m) * 2 + 0] = ((double) rand() / (double) RAND_MAX) - 0.5;
a[(i + j * m) * 2 + 1] = ((double) rand() / (double) RAND_MAX) - 0.5;
a[((long)i + (long)j * (long)m) * 2 + 0] = ((double) rand() / (double) RAND_MAX) - 0.5;
a[((long)i + (long)j * (long)m) * 2 + 1] = ((double) rand() / (double) RAND_MAX) - 0.5;
}
a[(j + j * m) * 2 + 0] = ((double) rand() / (double) RAND_MAX) + 8.;
a[(j + j * m) * 2 + 1] = 0.;
a[((long)j + (long)j * (long)m) * 2 + 0] = ((double) rand() / (double) RAND_MAX) + 8.;
a[((long)j + (long)j * (long)m) * 2 + 1] = 0.;
for(i = j + 1; i < m; i++) {
a[(i + j * m) * 2 + 0] = 0.;
a[(i + j * m) * 2 + 1] = 0.;
a[((long)i + (long)j * (long)m) * 2 + 0] = 0.;
a[((long)i + (long)j * (long)m) * 2 + 1] = 0.;
}
}
}
@@ -239,10 +239,13 @@ int main(int argc, char *argv[]){
for (j = 0; j < m; j++) {
for(i = 0; i <= j; i++) {
#ifndef COMPLEX
if (maxerr < fabs(a[i + j * m] - b[i + j * m])) maxerr = fabs(a[i + j * m] - b[i + j * m]);
if (maxerr < fabs(a[(long)i + (long)j * (long)m] - b[(long)i + (long)j * (long)m]))
maxerr = fabs(a[(long)i + (long)j * (long)m] - b[(long)i + (long)j * (long)m]);
#else
if (maxerr < fabs(a[(i + j * m) * 2 + 0] - b[(i + j * m) * 2 + 0])) maxerr = fabs(a[(i + j * m) * 2 + 0] - b[(i + j * m) * 2 + 0]);
if (maxerr < fabs(a[(i + j * m) * 2 + 1] - b[(i + j * m) * 2 + 1])) maxerr = fabs(a[(i + j * m) * 2 + 1] - b[(i + j * m) * 2 + 1]);
if (maxerr < fabs(a[((long)i + (long)j * (long)m) * 2 + 0] - b[((long)i + (long)j * (long)m) * 2 + 0]))
maxerr = fabs(a[((long)i + (long)j * (long)m) * 2 + 0] - b[((long)i + (long)j * (long)m) * 2 + 0]);
if (maxerr < fabs(a[((long)i + (long)j * (long)m) * 2 + 1] - b[((long)i + (long)j * (long)m) * 2 + 1]))
maxerr = fabs(a[((long)i + (long)j * (long)m) * 2 + 1] - b[((long)i + (long)j * (long)m) * 2 + 1]);
#endif
}
}
@@ -250,10 +253,13 @@ int main(int argc, char *argv[]){
for (j = 0; j < m; j++) {
for(i = j; i < m; i++) {
#ifndef COMPLEX
if (maxerr < fabs(a[i + j * m] - b[i + j * m])) maxerr = fabs(a[i + j * m] - b[i + j * m]);
if (maxerr < fabs(a[(long)i + (long)j * (long)m] - b[(long)i + (long)j * (long)m]))
maxerr = fabs(a[(long)i + (long)j * (long)m] - b[(long)i + (long)j * (long)m]);
#else
if (maxerr < fabs(a[(i + j * m) * 2 + 0] - b[(i + j * m) * 2 + 0])) maxerr = fabs(a[(i + j * m) * 2 + 0] - b[(i + j * m) * 2 + 0]);
if (maxerr < fabs(a[(i + j * m) * 2 + 1] - b[(i + j * m) * 2 + 1])) maxerr = fabs(a[(i + j * m) * 2 + 1] - b[(i + j * m) * 2 + 1]);
if (maxerr < fabs(a[((long)i + (long)j * (long)m) * 2 + 0] - b[((long)i + (long)j * (long)m) * 2 + 0]))
maxerr = fabs(a[((long)i + (long)j * (long)m) * 2 + 0] - b[((long)i + (long)j * (long)m) * 2 + 0]);
if (maxerr < fabs(a[((long)i + (long)j * (long)m) * 2 + 1] - b[((long)i + (long)j * (long)m) * 2 + 1]))
maxerr = fabs(a[((long)i + (long)j * (long)m) * 2 + 1] - b[((long)i + (long)j * (long)m) * 2 + 1]);
#endif
}
}

View File

@@ -129,7 +129,10 @@ int main(int argc, char *argv[]){
int step = 1;
struct timeval start, stop;
double time1,timeg;
double time1 = 0.0, timeg = 0.0;
long nanos = 0;
time_t seconds = 0;
struct timespec time_start = { 0, 0 }, time_end = { 0, 0 };
argc--;argv++;
@@ -151,7 +154,7 @@ int main(int argc, char *argv[]){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef linux
#ifdef __linux
srandom(getpid());
#endif
@@ -163,35 +166,32 @@ int main(int argc, char *argv[]){
timeg=0;
fprintf(stderr, " %6d : ", (int)m);
for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
for (l=0; l<loops; l++)
{
clock_gettime(CLOCK_REALTIME, &time_start);
COPY (&m, x, &inc_x, y, &inc_y );
clock_gettime(CLOCK_REALTIME, &time_end);
for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
nanos = time_end.tv_nsec - time_start.tv_nsec;
seconds = time_end.tv_sec - time_start.tv_sec;
for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
gettimeofday( &start, (struct timezone *)0);
time1 = seconds + nanos / 1.e9;
timeg += time1;
}
COPY (&m, x, &inc_x, y, &inc_y );
timeg /= loops;
gettimeofday( &stop, (struct timezone *)0);
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
timeg += time1;
}
timeg /= loops;
fprintf(stderr,
" %10.2f MBytes %10.6f sec\n",
COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg);
fprintf(stderr,
" %10.2f MBytes %12.9f sec\n",
COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg / 1.e6, timeg);
}

View File

@@ -145,7 +145,7 @@ int main(int argc, char *argv[]){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef linux
#ifdef __linux
srandom(getpid());
#endif

View File

@@ -195,7 +195,7 @@ int main(int argc, char *argv[]){
for(j = 0; j < to; j++){
for(i = 0; i < to * COMPSIZE; i++){
a[i + j * to * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
a[(long)i + (long)j * (long)to * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
}
@@ -214,7 +214,7 @@ int main(int argc, char *argv[]){
}
#ifdef linux
#ifdef __linux
srandom(getpid());
#endif

View File

@@ -39,6 +39,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifdef DOUBLE
#define GEMM BLASFUNC(dgemm)
#elif defined(HALF)
#define GEMM BLASFUNC(sbgemm)
#else
#define GEMM BLASFUNC(sgemm)
#endif
@@ -120,7 +122,8 @@ static void *huge_malloc(BLASLONG size){
int main(int argc, char *argv[]){
FLOAT *a, *b, *c;
IFLOAT *a, *b;
FLOAT *c;
FLOAT alpha[] = {1.0, 0.0};
FLOAT beta [] = {0.0, 0.0};
char transa = 'N';
@@ -184,25 +187,25 @@ int main(int argc, char *argv[]){
k = to;
}
if (( a = (FLOAT *)malloc(sizeof(FLOAT) * m * k * COMPSIZE)) == NULL) {
if (( a = (IFLOAT *)malloc(sizeof(IFLOAT) * m * k * COMPSIZE)) == NULL) {
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
if (( b = (FLOAT *)malloc(sizeof(FLOAT) * k * n * COMPSIZE)) == NULL) {
if (( b = (IFLOAT *)malloc(sizeof(IFLOAT) * k * n * COMPSIZE)) == NULL) {
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
if (( c = (FLOAT *)malloc(sizeof(FLOAT) * m * n * COMPSIZE)) == NULL) {
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef linux
#ifdef __linux
srandom(getpid());
#endif
for (i = 0; i < m * k * COMPSIZE; i++) {
a[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
a[i] = ((IFLOAT) rand() / (IFLOAT) RAND_MAX) - 0.5;
}
for (i = 0; i < k * n * COMPSIZE; i++) {
b[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
b[i] = ((IFLOAT) rand() / (IFLOAT) RAND_MAX) - 0.5;
}
for (i = 0; i < m * n * COMPSIZE; i++) {
c[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;

View File

@@ -163,7 +163,7 @@ int main(int argc, char *argv[]){
loops = atoi(p);
#ifdef linux
#ifdef __linux
srandom(getpid());
#endif
@@ -181,9 +181,9 @@ int main(int argc, char *argv[]){
for(j = 0; j < m; j++){
for(i = 0; i < m * COMPSIZE; i++){
a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
b[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
c[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
b[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
c[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
}

View File

@@ -181,7 +181,7 @@ int main(int argc, char *argv[]){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef linux
#ifdef __linux
srandom(getpid());
#endif
@@ -197,7 +197,7 @@ int main(int argc, char *argv[]){
fprintf(stderr, " %6dx%d : ", (int)m,(int)n);
for(j = 0; j < m; j++){
for(i = 0; i < n * COMPSIZE; i++){
a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
a[(long)j + (long)i * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
}
@@ -208,7 +208,7 @@ int main(int argc, char *argv[]){
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
for(i = 0; i < n * COMPSIZE * abs(inc_y); i++){
for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
gettimeofday( &start, (struct timezone *)0);
@@ -234,7 +234,7 @@ int main(int argc, char *argv[]){
fprintf(stderr, " %6dx%d : ", (int)m,(int)n);
for(j = 0; j < m; j++){
for(i = 0; i < n * COMPSIZE; i++){
a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
a[(long)j + (long)i * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
}
@@ -245,7 +245,7 @@ int main(int argc, char *argv[]){
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
for(i = 0; i < n * COMPSIZE * abs(inc_y); i++){
for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
gettimeofday( &start, (struct timezone *)0);

View File

@@ -165,7 +165,7 @@ int main(int argc, char *argv[]){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef linux
#ifdef __linux
srandom(getpid());
#endif
@@ -182,7 +182,7 @@ int main(int argc, char *argv[]){
for(j = 0; j < m; j++){
for(i = 0; i < n * COMPSIZE; i++){
a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
}

View File

@@ -165,7 +165,7 @@ int main(int argc, char *argv[]){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef linux
#ifdef __linux
srandom(getpid());
#endif
@@ -177,20 +177,20 @@ int main(int argc, char *argv[]){
for(j = 0; j < m; j++){
for(i = 0; i < m * COMPSIZE; i++){
a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
}
for(j = 0; j < m; j++){
for(i = 0; i < m * COMPSIZE; i++){
b[i + j * m * COMPSIZE] = 0.0;
b[(long)i + (long)j * (long)m * COMPSIZE] = 0.0;
}
}
for (j = 0; j < m; ++j) {
for (i = 0; i < m * COMPSIZE; ++i) {
b[i] += a[i + j * m * COMPSIZE];
b[i] += a[(long)i + (long)j * (long)m * COMPSIZE];
}
}

View File

@@ -172,7 +172,7 @@ int main(int argc, char *argv[]){
for(j = 0; j < to; j++){
for(i = 0; i < to * COMPSIZE; i++){
a[i + j * to * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
a[(long)i + (long)j * (long)to * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
}
@@ -188,7 +188,7 @@ int main(int argc, char *argv[]){
}
#ifdef linux
#ifdef __linux
srandom(getpid());
#endif

210
benchmark/hbmv.c Normal file
View File

@@ -0,0 +1,210 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef HBMV
#ifdef DOUBLE
#define HBMV BLASFUNC(zhbmv)
#else
#define HBMV BLASFUNC(chbmv)
#endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz) {
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size) {
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]){
FLOAT *a, *x, *y;
FLOAT alpha[] = {1.0, 1.0};
FLOAT beta [] = {0.0, 0.0};
blasint k = 1;
char uplo='L';
blasint m, i, j;
blasint inc_x=1, inc_y=1;
int loops = 1;
int l;
char *p;
int from = 1;
int to = 200;
int step = 1;
struct timeval start, stop;
double time1,timeg;
argc--;argv++;
if (argc > 0) { from = atol(*argv); argc--; argv++;}
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
if (argc > 0) { step = atol(*argv); argc--; argv++;}
if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p);
if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p;
if ((p = getenv("OPENBLAS_K"))) k = atoi(p);
fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = '%c' k = %d Inc_x = %d Inc_y = %d Loops = %d\n",
from, to, step, uplo, k, inc_x, inc_y, loops);
if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL) {
fprintf(stderr,"Out of Memory!!\n");
exit(1);
}
if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL) {
fprintf(stderr,"Out of Memory!!\n");
exit(1);
}
if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL) {
fprintf(stderr,"Out of Memory!!\n");
exit(1);
}
#ifdef __linux
srandom(getpid());
#endif
fprintf(stderr, " SIZE Flops\n");
for(m = from; m <= to; m += step) {
timeg=0;
fprintf(stderr, " %6dx%d : ", (int)m, (int)m);
for(j = 0; j < m; j++) {
for(i = 0; i < m * COMPSIZE; i++) {
a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
}
for (l = 0; l < loops; l++) {
for (i = 0; i < m * COMPSIZE * abs(inc_x); i++) {
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
for (i = 0; i < m * COMPSIZE * abs(inc_y); i++) {
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
gettimeofday( &start, (struct timezone *)0);
HBMV (&uplo, &m, &k, alpha, a, &m, x, &inc_x, beta, y, &inc_y );
gettimeofday( &stop, (struct timezone *)0);
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
timeg += time1;
}
timeg /= loops;
fprintf(stderr, " %10.2f MFlops\n",
COMPSIZE * COMPSIZE * 2. * (double)(2 * k + 1) * (double)m / timeg * 1.e-6);
}
return 0;
}
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

View File

@@ -151,7 +151,7 @@ int main(int argc, char *argv[]){
#ifdef linux
#ifdef __linux
srandom(getpid());
#endif
@@ -164,9 +164,9 @@ int main(int argc, char *argv[]){
for(j = 0; j < m; j++){
for(i = 0; i < m * COMPSIZE; i++){
a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
b[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
c[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
b[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
c[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
}
@@ -178,8 +178,6 @@ int main(int argc, char *argv[]){
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
gettimeofday( &start, (struct timezone *)0);
fprintf(stderr,
" %10.2f MFlops\n",
COMPSIZE * COMPSIZE * 2. * (double)m * (double)m * (double)m / time1 * 1.e-6);

View File

@@ -152,7 +152,7 @@ int main(int argc, char *argv[]){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef linux
#ifdef __linux
srandom(getpid());
#endif
@@ -167,7 +167,7 @@ int main(int argc, char *argv[]){
for(j = 0; j < m; j++){
for(i = 0; i < m * COMPSIZE; i++){
a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
}

186
benchmark/her.c Normal file
View File

@@ -0,0 +1,186 @@
/***************************************************************************
Copyright (c) 2020, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef HER
#ifdef DOUBLE
#define HER BLASFUNC(zher)
#else
#define HER BLASFUNC(cher)
#endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]){
FLOAT *a, *x;
FLOAT alpha[] = {1.0, 1.0};
blasint incx = 1;
char *p;
char uplo='U';
char trans='N';
if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p;
if ((p = getenv("OPENBLAS_TRANS"))) trans=*p;
blasint m, i, j;
int from = 1;
int to = 200;
int step = 1;
struct timeval start, stop;
double time1;
argc--;argv++;
if (argc > 0) { from = atol(*argv); argc--; argv++;}
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
if (argc > 0) { step = atol(*argv); argc--; argv++;}
fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = %c Trans = %c\n", from, to, step,uplo,trans);
if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef __linux
srandom(getpid());
#endif
fprintf(stderr, " SIZE Flops\n");
for(m = from; m <= to; m += step)
{
fprintf(stderr, " %6d : ", (int)m);
for(j = 0; j < m; j++){
for(i = 0; i < m * COMPSIZE; i++){
a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
x[ (long)j * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
gettimeofday( &start, (struct timezone *)0);
HER (&uplo, &m, alpha, x, &incx, a, &m );
gettimeofday( &stop, (struct timezone *)0);
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
gettimeofday( &start, (struct timezone *)0);
fprintf(stderr,
" %10.2f MFlops\n",
COMPSIZE * COMPSIZE * 1. * (double)m * (double)m / time1 * 1.e-6);
}
return 0;
}

190
benchmark/her2.c Normal file
View File

@@ -0,0 +1,190 @@
/***************************************************************************
Copyright (c) 2020, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef HER2
#ifdef DOUBLE
#define HER2 BLASFUNC(zher2)
#else
#define HER2 BLASFUNC(cher2)
#endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]){
FLOAT *a, *x, *y;
FLOAT alpha[] = {1.0, 1.0};
blasint inc = 1;
char *p;
char uplo='U';
char trans='N';
if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p;
if ((p = getenv("OPENBLAS_TRANS"))) trans=*p;
blasint m, i, j;
int from = 1;
int to = 200;
int step = 1;
struct timeval start, stop;
double time1;
argc--;argv++;
if (argc > 0) { from = atol(*argv); argc--; argv++;}
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
if (argc > 0) { step = atol(*argv); argc--; argv++;}
fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = %c Trans = %c\n", from, to, step,uplo,trans);
if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef __linux
srandom(getpid());
#endif
fprintf(stderr, " SIZE Flops\n");
for(m = from; m <= to; m += step)
{
fprintf(stderr, " %6d : ", (int)m);
for(j = 0; j < m; j++){
for(i = 0; i < m * COMPSIZE; i++){
a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
x[ (long)j * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
y[ (long)j * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
gettimeofday( &start, (struct timezone *)0);
HER2 (&uplo, &m, alpha, x, &inc, y, &inc, a, &m );
gettimeofday( &stop, (struct timezone *)0);
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
gettimeofday( &start, (struct timezone *)0);
fprintf(stderr,
" %10.2f MFlops\n",
COMPSIZE * COMPSIZE * 2. * (double)m * (double)m / time1 * 1.e-6);
}
return 0;
}

View File

@@ -150,7 +150,7 @@ int main(int argc, char *argv[]){
#ifdef linux
#ifdef __linux
srandom(getpid());
#endif
@@ -163,9 +163,9 @@ int main(int argc, char *argv[]){
for(j = 0; j < m; j++){
for(i = 0; i < m * COMPSIZE; i++){
a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
b[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
c[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
b[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
c[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
}
@@ -177,8 +177,6 @@ int main(int argc, char *argv[]){
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
gettimeofday( &start, (struct timezone *)0);
fprintf(stderr,
" %10.2f MFlops\n",
COMPSIZE * COMPSIZE * 2. * (double)m * (double)m * (double)m / time1 * 1.e-6);

View File

@@ -149,7 +149,7 @@ int main(int argc, char *argv[]){
#ifdef linux
#ifdef __linux
srandom(getpid());
#endif
@@ -162,8 +162,8 @@ int main(int argc, char *argv[]){
for(j = 0; j < m; j++){
for(i = 0; i < m * COMPSIZE; i++){
a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
c[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
c[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
}
@@ -175,8 +175,6 @@ int main(int argc, char *argv[]){
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
gettimeofday( &start, (struct timezone *)0);
fprintf(stderr,
" %10.2f MFlops\n",
COMPSIZE * COMPSIZE * 1. * (double)m * (double)m * (double)m / time1 * 1.e-6);

207
benchmark/hpmv.c Normal file
View File

@@ -0,0 +1,207 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef HPMV
#ifdef DOUBLE
#define HPMV BLASFUNC(zhpmv)
#else
#define HPMV BLASFUNC(chpmv)
#endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz) {
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size) {
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]){
FLOAT *a, *x, *y;
FLOAT alpha[] = {1.0, 1.0};
FLOAT beta [] = {1.0, 1.0};
char uplo='L';
blasint m, i, j;
blasint inc_x=1, inc_y=1;
int loops = 1;
int l;
char *p;
int from = 1;
int to = 200;
int step = 1;
struct timeval start, stop;
double time1,timeg;
argc--;argv++;
if (argc > 0) { from = atol(*argv); argc--; argv++;}
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
if (argc > 0) { step = atol(*argv); argc--; argv++;}
if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p);
if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p;
fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = '%c' Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,uplo,inc_x,inc_y,loops);
if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL) {
fprintf(stderr,"Out of Memory!!\n");
exit(1);
}
if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL) {
fprintf(stderr,"Out of Memory!!\n");
exit(1);
}
if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL) {
fprintf(stderr,"Out of Memory!!\n");
exit(1);
}
#ifdef __linux
srandom(getpid());
#endif
fprintf(stderr, " SIZE Flops\n");
for(m = from; m <= to; m += step) {
timeg=0;
fprintf(stderr, " %6dx%d : ", (int)m, (int)m);
for(j = 0; j < m; j++) {
for(i = 0; i < m * COMPSIZE; i++) {
a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
}
for (l = 0; l < loops; l++) {
for (i = 0; i < m * COMPSIZE * abs(inc_x); i++) {
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
for (i = 0; i < m * COMPSIZE * abs(inc_y); i++) {
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
gettimeofday( &start, (struct timezone *)0);
HPMV (&uplo, &m, alpha, a, x, &inc_x, beta, y, &inc_y );
gettimeofday( &stop, (struct timezone *)0);
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
timeg += time1;
}
timeg /= loops;
fprintf(stderr, " %10.2f MFlops\n",
COMPSIZE * COMPSIZE * 2. * (double)m * (double)m / timeg * 1.e-6);
}
return 0;
}
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

View File

@@ -145,7 +145,7 @@ int main(int argc, char *argv[]){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef linux
#ifdef __linux
srandom(getpid());
#endif
@@ -181,7 +181,7 @@ int main(int argc, char *argv[]){
timeg /= loops;
fprintf(stderr,
" %10.2f MFlops %10.6f sec\n",
" %10.2f MBytes %10.6f sec\n",
COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg);
}

192
benchmark/iamin.c Normal file
View File

@@ -0,0 +1,192 @@
/***************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef IAMIN
#ifdef COMPLEX
#ifdef DOUBLE
#define IAMIN BLASFUNC(izamin)
#else
#define IAMIN BLASFUNC(icamin)
#endif
#else
#ifdef DOUBLE
#define IAMIN BLASFUNC(idamin)
#else
#define IAMIN BLASFUNC(isamin)
#endif
#endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]){
FLOAT *x;
blasint m, i;
blasint inc_x=1;
int loops = 1;
int l;
char *p;
int from = 1;
int to = 200;
int step = 1;
struct timeval start, stop;
double time1,timeg;
argc--;argv++;
if (argc > 0) { from = atol(*argv); argc--; argv++;}
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
if (argc > 0) { step = atol(*argv); argc--; argv++;}
if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops);
if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef __linux
srandom(getpid());
#endif
fprintf(stderr, " SIZE Flops\n");
for(m = from; m <= to; m += step)
{
timeg=0;
fprintf(stderr, " %6d : ", (int)m);
for (l=0; l<loops; l++)
{
for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
gettimeofday( &start, (struct timezone *)0);
IAMIN (&m, x, &inc_x);
gettimeofday( &stop, (struct timezone *)0);
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
timeg += time1;
}
timeg /= loops;
fprintf(stderr,
" %10.2f MFlops %10.6f sec\n",
COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg);
}
return 0;
}
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

186
benchmark/imax.c Normal file
View File

@@ -0,0 +1,186 @@
/***************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef IMAX
#ifndef COMPLEX
#ifdef DOUBLE
#define IMAX BLASFUNC(idmax)
#else
#define IMAX BLASFUNC(ismax)
#endif
#endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]){
FLOAT *x;
blasint m, i;
blasint inc_x=1;
int loops = 1;
int l;
char *p;
int from = 1;
int to = 200;
int step = 1;
struct timeval start, stop;
double time1,timeg;
argc--;argv++;
if (argc > 0) { from = atol(*argv); argc--; argv++;}
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
if (argc > 0) { step = atol(*argv); argc--; argv++;}
if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops);
if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef __linux
srandom(getpid());
#endif
fprintf(stderr, " SIZE Flops\n");
for(m = from; m <= to; m += step)
{
timeg=0;
fprintf(stderr, " %6d : ", (int)m);
for (l=0; l<loops; l++)
{
for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
gettimeofday( &start, (struct timezone *)0);
IMAX (&m, x, &inc_x);
gettimeofday( &stop, (struct timezone *)0);
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
timeg += time1;
}
timeg /= loops;
fprintf(stderr,
" %10.2f MFlops %10.6f sec\n",
COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg);
}
return 0;
}
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

186
benchmark/imin.c Normal file
View File

@@ -0,0 +1,186 @@
/***************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef IMIN
#ifndef COMPLEX
#ifdef DOUBLE
#define IMIN BLASFUNC(idmin)
#else
#define IMIN BLASFUNC(ismin)
#endif
#endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]){
FLOAT *x;
blasint m, i;
blasint inc_x=1;
int loops = 1;
int l;
char *p;
int from = 1;
int to = 200;
int step = 1;
struct timeval start, stop;
double time1,timeg;
argc--;argv++;
if (argc > 0) { from = atol(*argv); argc--; argv++;}
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
if (argc > 0) { step = atol(*argv); argc--; argv++;}
if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops);
if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef __linux
srandom(getpid());
#endif
fprintf(stderr, " SIZE Flops\n");
for(m = from; m <= to; m += step)
{
timeg=0;
fprintf(stderr, " %6d : ", (int)m);
for (l=0; l<loops; l++)
{
for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
gettimeofday( &start, (struct timezone *)0);
IMIN (&m, x, &inc_x);
gettimeofday( &stop, (struct timezone *)0);
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
timeg += time1;
}
timeg /= loops;
fprintf(stderr,
" %10.2f MFlops %10.6f sec\n",
COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg);
}
return 0;
}
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

View File

@@ -174,7 +174,7 @@ int main(int argc, char *argv[]){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef linux
#ifdef __linux
srandom(getpid());
#endif
@@ -186,7 +186,7 @@ int main(int argc, char *argv[]){
for(j = 0; j < m; j++){
for(i = 0; i < m * COMPSIZE; i++){
a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
}
@@ -194,7 +194,7 @@ int main(int argc, char *argv[]){
for (j = 0; j < m; ++j) {
for (i = 0; i < m * COMPSIZE; ++i) {
b[i] += a[i + j * m * COMPSIZE];
b[i] += a[(long)i + (long)j * (long)m * COMPSIZE];
}
}

185
benchmark/max.c Normal file
View File

@@ -0,0 +1,185 @@
/***************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef NAMAX
#ifndef COMPLEX
#ifdef DOUBLE
#define NAMAX BLASFUNC(dmax)
#else
#define NAMAX BLASFUNC(smax)
#endif
#endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]){
FLOAT *x;
blasint m, i;
blasint inc_x=1;
int loops = 1;
int l;
char *p;
int from = 1;
int to = 200;
int step = 1;
struct timeval start, stop;
double time1,timeg;
argc--;argv++;
if (argc > 0) { from = atol(*argv); argc--; argv++;}
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
if (argc > 0) { step = atol(*argv); argc--; argv++;}
if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops);
if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef __linux
srandom(getpid());
#endif
fprintf(stderr, " SIZE Flops\n");
for(m = from; m <= to; m += step)
{
timeg=0;
fprintf(stderr, " %6d : ", (int)m);
for (l=0; l<loops; l++)
{
for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
gettimeofday( &start, (struct timezone *)0);
NAMAX (&m, x, &inc_x);
gettimeofday( &stop, (struct timezone *)0);
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
timeg += time1;
}
timeg /= loops;
fprintf(stderr,
" %10.2f MFlops %10.6f sec\n",
COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg);
}
return 0;
}
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

185
benchmark/min.c Normal file
View File

@@ -0,0 +1,185 @@
/***************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef NAMIN
#ifndef COMPLEX
#ifdef DOUBLE
#define NAMIN BLASFUNC(dmin)
#else
#define NAMIN BLASFUNC(smin)
#endif
#endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]){
FLOAT *x;
blasint m, i;
blasint inc_x=1;
int loops = 1;
int l;
char *p;
int from = 1;
int to = 200;
int step = 1;
struct timeval start, stop;
double time1,timeg;
argc--;argv++;
if (argc > 0) { from = atol(*argv); argc--; argv++;}
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
if (argc > 0) { step = atol(*argv); argc--; argv++;}
if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops);
if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef __linux
srandom(getpid());
#endif
fprintf(stderr, " SIZE Flops\n");
for(m = from; m <= to; m += step)
{
timeg=0;
fprintf(stderr, " %6d : ", (int)m);
for (l=0; l<loops; l++)
{
for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
gettimeofday( &start, (struct timezone *)0);
NAMIN (&m, x, &inc_x);
gettimeofday( &stop, (struct timezone *)0);
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
timeg += time1;
}
timeg /= loops;
fprintf(stderr,
" %10.2f MFlops %10.6f sec\n",
COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg);
}
return 0;
}
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

View File

@@ -145,7 +145,7 @@ int main(int argc, char *argv[]){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef linux
#ifdef __linux
srandom(getpid());
#endif

View File

@@ -170,46 +170,46 @@ int main(int argc, char *argv[]){
#ifndef COMPLEX
if (uplos & 1) {
for (j = 0; j < m; j++) {
for(i = 0; i < j; i++) a[i + j * m] = 0.;
a[j + j * m] = ((double) rand() / (double) RAND_MAX) + 8.;
for(i = j + 1; i < m; i++) a[i + j * m] = ((double) rand() / (double) RAND_MAX) - 0.5;
for(i = 0; i < j; i++) a[(long)i + (long)j * (long)m] = 0.;
a[(long)j + (long)j * (long)m] = ((double) rand() / (double) RAND_MAX) + 8.;
for(i = j + 1; i < m; i++) a[(long)i + (long)j * (long)m] = ((double) rand() / (double) RAND_MAX) - 0.5;
}
} else {
for (j = 0; j < m; j++) {
for(i = 0; i < j; i++) a[i + j * m] = ((double) rand() / (double) RAND_MAX) - 0.5;
a[j + j * m] = ((double) rand() / (double) RAND_MAX) + 8.;
for(i = j + 1; i < m; i++) a[i + j * m] = 0.;
for(i = 0; i < j; i++) a[(long)i + (long)j * (long)m] = ((double) rand() / (double) RAND_MAX) - 0.5;
a[(long)j + (long)j * (long)m] = ((double) rand() / (double) RAND_MAX) + 8.;
for(i = j + 1; i < m; i++) a[(long)i + (long)j * (long)m] = 0.;
}
}
#else
if (uplos & 1) {
for (j = 0; j < m; j++) {
for(i = 0; i < j; i++) {
a[(i + j * m) * 2 + 0] = 0.;
a[(i + j * m) * 2 + 1] = 0.;
a[((long)i + (long)j * (long)m) * 2 + 0] = 0.;
a[((long)i + (long)j * (long)m) * 2 + 1] = 0.;
}
a[(j + j * m) * 2 + 0] = ((double) rand() / (double) RAND_MAX) + 8.;
a[(j + j * m) * 2 + 1] = 0.;
a[((long)j + (long)j * (long)m) * 2 + 0] = ((double) rand() / (double) RAND_MAX) + 8.;
a[((long)j + (long)j * (long)m) * 2 + 1] = 0.;
for(i = j + 1; i < m; i++) {
a[(i + j * m) * 2 + 0] = ((double) rand() / (double) RAND_MAX) - 0.5;
a[(i + j * m) * 2 + 1] = ((double) rand() / (double) RAND_MAX) - 0.5;
a[((long)i + (long)j * (long)m) * 2 + 0] = 0;
a[((long)i + (long)j * (long)m) * 2 + 1] = ((double) rand() / (double) RAND_MAX) - 0.5;
}
}
} else {
for (j = 0; j < m; j++) {
for(i = 0; i < j; i++) {
a[(i + j * m) * 2 + 0] = ((double) rand() / (double) RAND_MAX) - 0.5;
a[(i + j * m) * 2 + 1] = ((double) rand() / (double) RAND_MAX) - 0.5;
a[((long)i + (long)j * (long)m) * 2 + 0] = 0.;
a[((long)i + (long)j * (long)m) * 2 + 1] = ((double) rand() / (double) RAND_MAX) - 0.5;
}
a[(j + j * m) * 2 + 0] = ((double) rand() / (double) RAND_MAX) + 8.;
a[(j + j * m) * 2 + 1] = 0.;
a[((long)j + (long)j * (long)m) * 2 + 0] = ((double) rand() / (double) RAND_MAX) + 8.;
a[((long)j + (long)j * (long)m) * 2 + 1] = 0.;
for(i = j + 1; i < m; i++) {
a[(i + j * m) * 2 + 0] = 0.;
a[(i + j * m) * 2 + 1] = 0.;
a[((long)i + (long)j * (long)m) * 2 + 0] = 0.;
a[((long)i + (long)j * (long)m) * 2 + 1] = 0.;
}
}
}

View File

@@ -32,9 +32,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
#include "common.h"
#undef ROT
#undef DOT
#ifndef COMPLEX
#ifdef DOUBLE
#define ROT BLASFUNC(drot)
@@ -42,6 +42,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ROT BLASFUNC(srot)
#endif
#else
#ifdef DOUBLE
#define ROT BLASFUNC(zdrot)
#else
#define ROT BLASFUNC(csrot)
#endif
#endif
#if defined(__WIN32__) || defined(__WIN64__)
@@ -147,7 +156,7 @@ int main(int argc, char *argv[]){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef linux
#ifdef __linux
srandom(getpid());
#endif
@@ -160,17 +169,16 @@ int main(int argc, char *argv[]){
fprintf(stderr, " %6d : ", (int)m);
for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
for (l=0; l<loops; l++)
{
for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
gettimeofday( &start, (struct timezone *)0);
ROT (&m, x, &inc_x, y, &inc_y, c, s);
@@ -179,13 +187,13 @@ int main(int argc, char *argv[]){
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
timeg += time1;
timeg += time1;
}
}
timeg /= loops;
timeg /= loops;
fprintf(stderr,
fprintf(stderr,
" %10.2f MFlops %10.6f sec\n",
COMPSIZE * COMPSIZE * 6. * (double)m / timeg * 1.e-6, timeg);

210
benchmark/rotm.c Normal file
View File

@@ -0,0 +1,210 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef ROTM
#ifdef DOUBLE
#define ROTM BLASFUNC(drotm)
#else
#define ROTM BLASFUNC(srotm)
#endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz)
{
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv) {
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size)
{
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =
shmget(IPC_PRIVATE, (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT | 0600)) < 0) {
printf("Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1) {
printf("Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[])
{
FLOAT *x, *y;
// FLOAT result;
blasint m, i;
blasint inc_x = 1, inc_y = 1;
FLOAT param[5] = {1, 2.0, 3.0, 4.0, 5.0};
int loops = 1;
int l;
char *p;
int from = 1;
int to = 200;
int step = 1;
struct timeval start, stop;
double time1, timeg;
argc--;
argv++;
if (argc > 0) {
from = atol(*argv);
argc--;
argv++;
}
if (argc > 0) {
to = MAX(atol(*argv), from);
argc--;
argv++;
}
if (argc > 0) {
step = atol(*argv);
argc--;
argv++;
}
if ((p = getenv("OPENBLAS_LOOPS")))
loops = atoi(p);
if ((p = getenv("OPENBLAS_INCX")))
inc_x = atoi(p);
if ((p = getenv("OPENBLAS_INCY")))
inc_y = atoi(p);
fprintf(
stderr,
"From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n",
from, to, step, inc_x, inc_y, loops);
if ((x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) ==
NULL) {
fprintf(stderr, "Out of Memory!!\n");
exit(1);
}
if ((y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) ==
NULL) {
fprintf(stderr, "Out of Memory!!\n");
exit(1);
}
#ifdef __linux
srandom(getpid());
#endif
fprintf(stderr, " SIZE Flops\n");
for (m = from; m <= to; m += step) {
timeg = 0;
fprintf(stderr, " %6d : ", (int)m);
for (i = 0; i < m * COMPSIZE * abs(inc_x); i++) {
x[i] = ((FLOAT)rand() / (FLOAT)RAND_MAX) - 0.5;
}
for (i = 0; i < m * COMPSIZE * abs(inc_y); i++) {
y[i] = ((FLOAT)rand() / (FLOAT)RAND_MAX) - 0.5;
}
for (l = 0; l < loops; l++) {
gettimeofday(&start, (struct timezone *)0);
ROTM(&m, x, &inc_x, y, &inc_y, param);
gettimeofday(&stop, (struct timezone *)0);
time1 = (double)(stop.tv_sec - start.tv_sec) +
(double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
timeg += time1;
}
timeg /= loops;
fprintf(stderr, " %10.2f MFlops %10.6f sec\n",
COMPSIZE * COMPSIZE * 6. * (double)m / timeg * 1.e-6, timeg);
}
return 0;
}

View File

@@ -150,7 +150,7 @@ int main(int argc, char *argv[]){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef linux
#ifdef __linux
srandom(getpid());
#endif

219
benchmark/spmv.c Normal file
View File

@@ -0,0 +1,219 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef SPMV
#ifndef COMPLEX
#ifdef DOUBLE
#define SPMV BLASFUNC(dspmv)
#else
#define SPMV BLASFUNC(sspmv)
#endif
#else
#ifdef DOUBLE
#define SPMV BLASFUNC(zspmv)
#else
#define SPMV BLASFUNC(cspmv)
#endif
#endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]){
FLOAT *a, *x, *y;
FLOAT alpha[] = {1.0, 1.0};
FLOAT beta [] = {1.0, 1.0};
char uplo='L';
blasint m, i, j;
blasint inc_x=1,inc_y=1;
int loops = 1;
int l;
char *p;
int from = 1;
int to = 200;
int step = 1;
struct timeval start, stop;
double time1,timeg;
argc--;argv++;
if (argc > 0) { from = atol(*argv); argc--; argv++;}
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
if (argc > 0) { step = atol(*argv); argc--; argv++;}
if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p);
if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p;
fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = '%c' Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,uplo,inc_x,inc_y,loops);
if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef __linux
srandom(getpid());
#endif
fprintf(stderr, " SIZE Flops\n");
for(m = from; m <= to; m += step)
{
timeg=0;
fprintf(stderr, " %6dx%d : ", (int)m,(int)m);
for(j = 0; j < m; j++){
for(i = 0; i < m * COMPSIZE; i++){
a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
}
for (l=0; l<loops; l++)
{
for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
gettimeofday( &start, (struct timezone *)0);
SPMV (&uplo, &m, alpha, a, x, &inc_x, beta, y, &inc_y );
gettimeofday( &stop, (struct timezone *)0);
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
timeg += time1;
}
timeg /= loops;
fprintf(stderr,
" %10.2f MFlops\n",
COMPSIZE * COMPSIZE * 2. * (double)m * (double)m / timeg * 1.e-6);
}
return 0;
}
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

198
benchmark/spr.c Executable file
View File

@@ -0,0 +1,198 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef SPR
#ifdef DOUBLE
#define SPR BLASFUNC(dspr)
#else
#define SPR BLASFUNC(sspr)
#endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]){
FLOAT *a,*c;
FLOAT alpha[] = {1.0, 1.0};
blasint inc_x=1;
int loops = 1;
int l;
char *p;
char uplo='U';
if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p;
if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
blasint m, i, j;
int from = 1;
int to = 200;
int step = 1;
struct timeval start, stop;
double time1,timeg;
argc--;argv++;
if (argc > 0) { from = atol(*argv); argc--; argv++;}
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
if (argc > 0) { step = atol(*argv); argc--; argv++;}
fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = %c Inc_x = %d\n", from, to, step,uplo,inc_x);
if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
if (( c = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef __linux
srandom(getpid());
#endif
fprintf(stderr, " SIZE Flops Time\n");
for(m = from; m <= to; m += step)
{
timeg=0;
fprintf(stderr, " %6d : ", (int)m);
for (l=0; l<loops; l++)
{
for(j = 0; j < m; j++){
for(i = 0; i < m * COMPSIZE; i++){
a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
}
for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
c[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
gettimeofday( &start, (struct timezone *)0);
SPR (&uplo, &m, alpha, c, &inc_x, a);
gettimeofday( &stop, (struct timezone *)0);
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
timeg += time1;
}
timeg /= loops;
fprintf(stderr,
" %10.2f MFlops %10.6f sec\n",
COMPSIZE * COMPSIZE * 1. * (double)m * (double)m / timeg * 1.e-6, timeg);
}
return 0;
}
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

207
benchmark/spr2.c Executable file
View File

@@ -0,0 +1,207 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef SPR2
#ifdef DOUBLE
#define SPR2 BLASFUNC(dspr2)
#else
#define SPR2 BLASFUNC(sspr2)
#endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]){
FLOAT *a,*b,*c;
FLOAT alpha[] = {1.0, 1.0};
blasint inc_x=1,inc_y=1;
int loops = 1;
int l;
char *p;
char uplo='U';
if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p;
if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
blasint m, i, j;
int from = 1;
int to = 200;
int step = 1;
struct timeval start, stop;
double time1,timeg;
argc--;argv++;
if (argc > 0) { from = atol(*argv); argc--; argv++;}
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
if (argc > 0) { step = atol(*argv); argc--; argv++;}
fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = %c Inc_x = %d Inc_y = %d\n", from, to, step,uplo,inc_x,inc_y);
if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
if (( b = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
if (( c = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef __linux
srandom(getpid());
#endif
fprintf(stderr, " SIZE Flops Time\n");
for(m = from; m <= to; m += step)
{
timeg=0;
fprintf(stderr, " %6d : ", (int)m);
for (l=0; l<loops; l++)
{
for(j = 0; j < m; j++){
for(i = 0; i < m * COMPSIZE; i++){
a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
}
for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
b[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
c[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
gettimeofday( &start, (struct timezone *)0);
SPR2 (&uplo, &m, alpha, c, &inc_x, b, &inc_y, a);
gettimeofday( &stop, (struct timezone *)0);
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
timeg += time1;
}
timeg /= loops;
fprintf(stderr,
" %10.2f MFlops %10.6f sec\n",
COMPSIZE * COMPSIZE * 2. * (double)m * (double)m / timeg * 1.e-6, timeg);
}
return 0;
}
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

View File

@@ -151,7 +151,7 @@ int main(int argc, char *argv[]){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef linux
#ifdef __linux
srandom(getpid());
#endif

View File

@@ -162,7 +162,7 @@ int main(int argc, char *argv[]){
#ifdef linux
#ifdef __linux
srandom(getpid());
#endif
@@ -175,9 +175,9 @@ int main(int argc, char *argv[]){
for(j = 0; j < m; j++){
for(i = 0; i < m * COMPSIZE; i++){
a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
b[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
c[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
b[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
c[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
}
@@ -189,8 +189,6 @@ int main(int argc, char *argv[]){
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
gettimeofday( &start, (struct timezone *)0);
fprintf(stderr,
" %10.2f MFlops\n",
COMPSIZE * COMPSIZE * 2. * (double)m * (double)m * (double)m / time1 * 1.e-6);

View File

@@ -162,7 +162,7 @@ int main(int argc, char *argv[]){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef linux
#ifdef __linux
srandom(getpid());
#endif
@@ -177,7 +177,7 @@ int main(int argc, char *argv[]){
for(j = 0; j < m; j++){
for(i = 0; i < m * COMPSIZE; i++){
a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
}

185
benchmark/syr.c Normal file
View File

@@ -0,0 +1,185 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef SYR
#ifdef DOUBLE
#define SYR BLASFUNC(dsyr)
#else
#define SYR BLASFUNC(ssyr)
#endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]){
FLOAT *x,*a;
FLOAT alpha[] = {1.0, 1.0};
char *p;
char uplo='U';
if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p;
blasint m, i, j;
blasint inc_x= 1;
int from = 1;
int to = 200;
int step = 1;
struct timeval start, stop;
double time1;
argc--;argv++;
if (argc > 0) { from = atol(*argv); argc--; argv++;}
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
if (argc > 0) { step = atol(*argv); argc--; argv++;}
fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = %c Inc_x = %d\n", from, to, step,uplo,inc_x);
if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef __linux
srandom(getpid());
#endif
fprintf(stderr, " SIZE Flops\n");
for(m = from; m <= to; m += step)
{
fprintf(stderr, " %6d : ", (int)m);
for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
for(j = 0; j < m; j++){
for(i = 0; i < m * COMPSIZE; i++){
a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
}
gettimeofday( &start, (struct timezone *)0);
SYR (&uplo, &m, alpha, x, &inc_x, a, &m );
gettimeofday( &stop, (struct timezone *)0);
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
fprintf(stderr,
" %10.2f MFlops\n",
COMPSIZE * COMPSIZE * 1. * (double)m * (double)m / time1 * 1.e-6);
}
return 0;
}
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

194
benchmark/syr2.c Normal file
View File

@@ -0,0 +1,194 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef SYR2
#ifdef DOUBLE
#define SYR2 BLASFUNC(dsyr2)
#else
#define SYR2 BLASFUNC(ssyr2)
#endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]){
FLOAT *x, *y, *a;
FLOAT alpha[] = {1.0, 1.0};
char *p;
char uplo='U';
if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p;
blasint m, i, j;
blasint inc_x= 1;
blasint inc_y= 1;
int from = 1;
int to = 200;
int step = 1;
struct timeval start, stop;
double time1;
argc--;argv++;
if (argc > 0) { from = atol(*argv); argc--; argv++;}
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
if (argc > 0) { step = atol(*argv); argc--; argv++;}
fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = %c Inc_x = %d Inc_y = %d\n", from, to, step,uplo,inc_x,inc_y);
if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef __linux
srandom(getpid());
#endif
fprintf(stderr, " SIZE Flops\n");
for(m = from; m <= to; m += step)
{
fprintf(stderr, " %6d : ", (int)m);
for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
for(j = 0; j < m; j++){
for(i = 0; i < m * COMPSIZE; i++){
a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
}
gettimeofday( &start, (struct timezone *)0);
SYR2 (&uplo, &m, alpha, x, &inc_x, y, &inc_y, a, &m );
gettimeofday( &stop, (struct timezone *)0);
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
fprintf(stderr,
" %10.2f MFlops\n",
COMPSIZE * COMPSIZE * 2. * (double)m * (double)m / time1 * 1.e-6);
}
return 0;
}
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

View File

@@ -162,7 +162,7 @@ int main(int argc, char *argv[]){
#ifdef linux
#ifdef __linux
srandom(getpid());
#endif
@@ -175,9 +175,9 @@ int main(int argc, char *argv[]){
for(j = 0; j < m; j++){
for(i = 0; i < m * COMPSIZE; i++){
a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
b[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
c[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
b[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
c[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
}
@@ -189,8 +189,6 @@ int main(int argc, char *argv[]){
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
gettimeofday( &start, (struct timezone *)0);
fprintf(stderr,
" %10.2f MFlops\n",
COMPSIZE * COMPSIZE * 2. * (double)m * (double)m * (double)m / time1 * 1.e-6);

View File

@@ -159,7 +159,7 @@ int main(int argc, char *argv[]){
#ifdef linux
#ifdef __linux
srandom(getpid());
#endif
@@ -172,8 +172,8 @@ int main(int argc, char *argv[]){
for(j = 0; j < m; j++){
for(i = 0; i < m * COMPSIZE; i++){
a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
c[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
c[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
}
@@ -185,8 +185,6 @@ int main(int argc, char *argv[]){
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
gettimeofday( &start, (struct timezone *)0);
fprintf(stderr,
" %10.2f MFlops\n",
COMPSIZE * COMPSIZE * 1. * (double)m * (double)m * (double)m / time1 * 1.e-6);

172
benchmark/tpmv.c Normal file
View File

@@ -0,0 +1,172 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef TPMV
#ifndef COMPLEX
#ifdef DOUBLE
#define TPMV BLASFUNC(dtpmv)
#else
#define TPMV BLASFUNC(stpmv)
#endif
#else
#ifdef DOUBLE
#define TPMV BLASFUNC(ztpmv)
#else
#define TPMV BLASFUNC(ctpmv)
#endif
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size)
{
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1) {
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[])
{
FLOAT *a, *x;
char *p;
char uplo ='U';
char trans='N';
char diag ='U';
int loops = 1;
int l;
blasint inc_x=1;
if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p;
if ((p = getenv("OPENBLAS_TRANS"))) trans=*p;
if ((p = getenv("OPENBLAS_DIAG"))) diag=*p;
if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
blasint n, i, j;
int from = 1;
int to = 200;
int step = 1;
struct timespec start = { 0, 0 }, stop = { 0, 0 };
double time1, timeg;
argc--;argv++;
if (argc > 0) { from = atol(*argv); argc--; argv++;}
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
if (argc > 0) { step = atol(*argv); argc--; argv++;}
fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = %c Trans = %c Diag = %c Loops=%d Inc_x=%d\n", from,
to, step, uplo, trans, diag, loops, inc_x);
if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL) {
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef __linux
srandom(getpid());
#endif
fprintf(stderr, " SIZE Flops\n");
for(n = from; n <= to; n += step) {
timeg=0;
fprintf(stderr, " %6d : ", (int)n);
for(j = 0; j < n; j++) {
for(i = 0; i < n * COMPSIZE; i++) {
a[(long)i + (long)j * (long)n * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
}
for (i = 0; i < n * COMPSIZE * abs(inc_x); i++) {
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
for (l = 0; l < loops; l++) {
clock_gettime(CLOCK_REALTIME, &start);
TPMV (&uplo, &trans, &diag, &n, a, x, &inc_x);
clock_gettime(CLOCK_REALTIME, &stop);
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_nsec - start.tv_nsec)) / 1.e9;
timeg += time1;
}
timeg /= loops;
fprintf(stderr, " %10.2f MFlops %12.9f sec\n",
COMPSIZE * COMPSIZE * 1. * (double)n * (double)n / timeg / 1.e6, timeg);
}
return 0;
}
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

172
benchmark/tpsv.c Normal file
View File

@@ -0,0 +1,172 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef TPSV
#ifndef COMPLEX
#ifdef DOUBLE
#define TPSV BLASFUNC(dtpsv)
#else
#define TPSV BLASFUNC(stpsv)
#endif
#else
#ifdef DOUBLE
#define TPSV BLASFUNC(ztpsv)
#else
#define TPSV BLASFUNC(ctpsv)
#endif
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size)
{
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1) {
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[])
{
FLOAT *a, *x;
char *p;
char uplo ='U';
char trans='N';
char diag ='U';
int loops = 1;
int l;
blasint inc_x=1;
if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p;
if ((p = getenv("OPENBLAS_TRANS"))) trans=*p;
if ((p = getenv("OPENBLAS_DIAG"))) diag=*p;
if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
blasint n, i, j;
int from = 1;
int to = 200;
int step = 1;
struct timespec start = { 0, 0 }, stop = { 0, 0 };
double time1, timeg;
argc--;argv++;
if (argc > 0) { from = atol(*argv); argc--; argv++;}
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
if (argc > 0) { step = atol(*argv); argc--; argv++;}
fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = %c Trans = %c Diag = %c Loops=%d Inc_x=%d\n", from,
to, step, uplo, trans, diag, loops, inc_x);
if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL) {
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef __linux
srandom(getpid());
#endif
fprintf(stderr, " SIZE Flops\n");
for(n = from; n <= to; n += step) {
timeg=0;
fprintf(stderr, " %6d : ", (int)n);
for(j = 0; j < n; j++) {
for(i = 0; i < n * COMPSIZE; i++) {
a[(long)i + (long)j * (long)n * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
}
for (i = 0; i < n * COMPSIZE * abs(inc_x); i++) {
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
for (l = 0; l < loops; l++) {
clock_gettime(CLOCK_REALTIME, &start);
TPSV (&uplo, &trans, &diag, &n, a, x, &inc_x);
clock_gettime(CLOCK_REALTIME, &stop);
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_nsec - start.tv_nsec)) / 1.e9;
timeg += time1;
}
timeg /= loops;
fprintf(stderr, " %10.2f MFlops %12.9f sec\n",
COMPSIZE * COMPSIZE * 1. * (double)n * (double)n / timeg / 1.e6, timeg);
}
return 0;
}
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

View File

@@ -162,7 +162,7 @@ int main(int argc, char *argv[]){
#ifdef linux
#ifdef __linux
srandom(getpid());
#endif
@@ -175,8 +175,8 @@ int main(int argc, char *argv[]){
for(j = 0; j < m; j++){
for(i = 0; i < m * COMPSIZE; i++){
a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
b[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
b[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
}
@@ -188,8 +188,6 @@ int main(int argc, char *argv[]){
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
gettimeofday( &start, (struct timezone *)0);
fprintf(stderr,
" %10.2f MFlops %10.6f sec\n",
COMPSIZE * COMPSIZE * 1. * (double)m * (double)m * (double)m / time1 * 1.e-6, time1);

172
benchmark/trmv.c Normal file
View File

@@ -0,0 +1,172 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef TRMV
#ifndef COMPLEX
#ifdef DOUBLE
#define TRMV BLASFUNC(dtrmv)
#else
#define TRMV BLASFUNC(strmv)
#endif
#else
#ifdef DOUBLE
#define TRMV BLASFUNC(ztrmv)
#else
#define TRMV BLASFUNC(ctrmv)
#endif
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size)
{
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1) {
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[])
{
FLOAT *a, *x;
char *p;
char uplo ='U';
char trans='N';
char diag ='U';
int loops = 1;
int l;
blasint inc_x=1;
if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p;
if ((p = getenv("OPENBLAS_TRANS"))) trans=*p;
if ((p = getenv("OPENBLAS_DIAG"))) diag=*p;
if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
blasint n, i, j;
int from = 1;
int to = 200;
int step = 1;
struct timespec start = { 0, 0 }, stop = { 0, 0 };
double time1, timeg;
argc--;argv++;
if (argc > 0) { from = atol(*argv); argc--; argv++;}
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
if (argc > 0) { step = atol(*argv); argc--; argv++;}
fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = %c Trans = %c Diag = %c Loops=%d Inc_x=%d\n", from,
to, step, uplo, trans, diag, loops, inc_x);
if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL) {
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef __linux
srandom(getpid());
#endif
fprintf(stderr, " SIZE Flops\n");
for(n = from; n <= to; n += step) {
timeg=0;
fprintf(stderr, " %6d : ", (int)n);
for(j = 0; j < n; j++) {
for(i = 0; i < n * COMPSIZE; i++) {
a[(long)i + (long)j * (long)n * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
}
for (i = 0; i < n * COMPSIZE * abs(inc_x); i++) {
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
for (l = 0; l < loops; l++) {
clock_gettime(CLOCK_REALTIME, &start);
TRMV (&uplo, &trans, &diag, &n, a, &n, x, &inc_x);
clock_gettime(CLOCK_REALTIME, &stop);
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_nsec - start.tv_nsec)) / 1.e9;
timeg += time1;
}
timeg /= loops;
fprintf(stderr, " %10.2f MFlops %12.9f sec\n",
COMPSIZE * COMPSIZE * 1. * (double)n * (double)n / timeg / 1.e6, timeg);
}
return 0;
}
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

View File

@@ -172,7 +172,7 @@ int main(int argc, char *argv[]){
#ifdef linux
#ifdef __linux
srandom(getpid());
#endif
@@ -191,8 +191,8 @@ int main(int argc, char *argv[]){
for(j = 0; j < m; j++){
for(i = 0; i < m * COMPSIZE; i++){
a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
b[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
b[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
}

222
benchmark/trsv.c Normal file
View File

@@ -0,0 +1,222 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include <time.h>
#include "common.h"
#undef GEMV
#undef TRSV
#ifndef COMPLEX
#ifdef DOUBLE
#define TRSV BLASFUNC(dtrsv)
#else
#define TRSV BLASFUNC(strsv)
#endif
#else
#ifdef DOUBLE
#define TRSV BLASFUNC(ztrsv)
#else
#define TRSV BLASFUNC(ctrsv)
#endif
#endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]){
FLOAT *a, *x;
blasint n = 0, i, j;
blasint inc_x=1;
int loops = 1;
int l;
char *p;
int from = 1;
int to = 200;
int step = 1;
struct timespec time_start, time_end;
time_t seconds = 0;
double time1,timeg;
long long nanos = 0;
argc--;argv++;
if (argc > 0) { from = atol(*argv); argc--; argv++;}
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
if (argc > 0) { step = atol(*argv); argc--; argv++;}
char uplo ='L';
char transa = 'N';
char diag ='U';
if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
if ((p = getenv("OPENBLAS_TRANSA"))) transa=*p;
if ((p = getenv("OPENBLAS_DIAG"))) diag=*p;
if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p;
fprintf(stderr, "From : %3d To : %3d Step = %3d Transa = '%c' Inc_x = %d uplo=%c diag=%c loop = %d\n", from, to, step,transa,inc_x,
uplo,diag,loops);
#ifdef __linux
srandom(getpid());
#endif
fprintf(stderr, " SIZE Flops\n");
fprintf(stderr, "============================================\n");
for(n = from; n <= to; n += step)
{
timeg=0;
if (( a = (FLOAT *)malloc(sizeof(FLOAT) * n * n * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
if (( x = (FLOAT *)malloc(sizeof(FLOAT) * n * abs(inc_x) * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
for(j = 0; j < n; j++){
for(i = 0; i < n * COMPSIZE; i++){
a[i + j * n * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
}
for(i = 0; i < n * COMPSIZE * abs(inc_x); i++){
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
for(l =0;l< loops;l++){
clock_gettime(CLOCK_PROCESS_CPUTIME_ID,&time_start);
TRSV(&uplo,&transa,&diag,&n,a,&n,x,&inc_x);
clock_gettime(CLOCK_PROCESS_CPUTIME_ID,&time_end);
nanos = time_end.tv_nsec - time_start.tv_nsec;
seconds = time_end.tv_sec - time_start.tv_sec;
time1 = seconds + nanos /1.e9;
timeg += time1;
}
timeg /= loops;
long long muls = n*(n+1)/2.0;
long long adds = (n - 1.0)*n/2.0;
fprintf(stderr, "%10d %10.2f MFlops %10.6f sec\n", n,(muls+adds) / timeg * 1.e-6, timeg);
if(a != NULL){
free(a);
}
if( x != NULL){
free(x);
}
}
return 0;
}

View File

@@ -146,7 +146,7 @@ int main(int argc, char *argv[]){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef linux
#ifdef __linux
srandom(getpid());
#endif

View File

@@ -145,7 +145,7 @@ int main(int argc, char *argv[]){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef linux
#ifdef __linux
srandom(getpid());
#endif
@@ -170,9 +170,11 @@ int main(int argc, char *argv[]){
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
gettimeofday( &start, (struct timezone *)0);
#ifdef RETURN_BY_STACK
DOT (&result , &m, x, &inc_x, y, &inc_y );
#else
result = DOT (&m, x, &inc_x, y, &inc_y );
#endif
gettimeofday( &stop, (struct timezone *)0);
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;

55
c_check
View File

@@ -6,8 +6,9 @@
# Checking cross compile
$hostos = `uname -s | sed -e s/\-.*//`; chop($hostos);
$hostarch = `uname -m | sed -e s/i.86/x86/`;chop($hostarch);
$hostarch = `uname -p` if ($hostos eq "AIX");
$hostarch = "x86_64" if ($hostarch eq "amd64");
$hostarch = "arm" if ($hostarch =~ /^arm.*/);
$hostarch = "arm" if ($hostarch ne "arm64" && $hostarch =~ /^arm.*/);
$hostarch = "arm64" if ($hostarch eq "aarch64");
$hostarch = "power" if ($hostarch =~ /^(powerpc|ppc).*/);
$hostarch = "zarch" if ($hostarch eq "s390x");
@@ -18,11 +19,12 @@ $binary = $ENV{"BINARY"};
$makefile = shift(@ARGV);
$config = shift(@ARGV);
$compiler_name = join(" ", @ARGV);
$compiler_name = shift(@ARGV);
$flags = join(" ", @ARGV);
# First, we need to know the target OS and compiler name
$data = `$compiler_name -E ctest.c`;
$data = `$compiler_name $flags -E ctest.c`;
if ($?) {
printf STDERR "C Compiler ($compiler_name) is something wrong.\n";
@@ -175,7 +177,7 @@ if ($defined == 0) {
# Do again
$data = `$compiler_name -E ctest.c`;
$data = `$compiler_name $flags -E ctest.c`;
if ($?) {
printf STDERR "C Compiler ($compiler_name) is something wrong.\n";
@@ -188,14 +190,14 @@ if (($architecture eq "mips") || ($architecture eq "mips64")) {
if ($@){
warn "could not load PERL module File::Temp, so could not check MSA capatibility";
} else {
$tmpf = new File::Temp( UNLINK => 1 );
$tmpf = new File::Temp( SUFFIX => '.c' , UNLINK => 1 );
$code = '"addvi.b $w0, $w1, 1"';
$msa_flags = "-mmsa -mfp64 -msched-weight -mload-store-pairs";
print $tmpf "#include <msa.h>\n\n";
print $tmpf "void main(void){ __asm__ volatile($code); }\n";
$args = "$msa_flags -o $tmpf.o -x c $tmpf";
my @cmd = ("$compiler_name $args");
$args = "$msa_flags -o $tmpf.o $tmpf";
my @cmd = ("$compiler_name $flags $args >/dev/null 2>/dev/null");
system(@cmd) == 0;
if ($? != 0) {
$have_msa = 0;
@@ -229,11 +231,14 @@ if (($architecture eq "x86") || ($architecture eq "x86_64")) {
$no_avx512 = 0;
} else {
# $tmpf = new File::Temp( UNLINK => 1 );
($fh,$tmpf) = tempfile( UNLINK => 1 );
($fh,$tmpf) = tempfile( SUFFIX => '.c' , UNLINK => 1 );
$code = '"vbroadcastss -4 * 4(%rsi), %zmm2"';
print $tmpf "#include <immintrin.h>\n\nint main(void){ __asm__ volatile($code); }\n";
$args = " -march=skylake-avx512 -c -o $tmpf.o -x c $tmpf";
my @cmd = ("$compiler_name $args >/dev/null 2>/dev/null");
$args = " -march=skylake-avx512 -c -o $tmpf.o $tmpf";
if ($compiler eq "PGI") {
$args = " -tp skylake -c -o $tmpf.o $tmpf";
}
my @cmd = ("$compiler_name $flags $args >/dev/null 2>/dev/null");
system(@cmd) == 0;
if ($? != 0) {
$no_avx512 = 1;
@@ -244,7 +249,29 @@ if (($architecture eq "x86") || ($architecture eq "x86_64")) {
}
}
$data = `$compiler_name -S ctest1.c && grep globl ctest1.s | head -n 1 && rm -f ctest1.s`;
$c11_atomics = 0;
if ($data =~ /HAVE_C11/) {
eval "use File::Temp qw(tempfile)";
if ($@){
warn "could not load PERL module File::Temp, so could not check compiler compatibility with C11";
$c11_atomics = 0;
} else {
($fh,$tmpf) = tempfile( SUFFIX => '.c' , UNLINK => 1 );
print $tmpf "#include <stdatomic.h>\nint main(void){}\n";
$args = " -c -o $tmpf.o $tmpf";
my @cmd = ("$compiler_name $flags $args >/dev/null 2>/dev/null");
system(@cmd) == 0;
if ($? != 0) {
$c11_atomics = 0;
} else {
$c11_atomics = 1;
}
unlink("$tmpf.o");
}
}
$data = `$compiler_name $flags -S ctest1.c && grep globl ctest1.s | head -n 1 && rm -f ctest1.s`;
$data =~ /globl\s([_\.]*)(.*)/;
@@ -267,7 +294,7 @@ $linker_l = "";
$linker_a = "";
{
$link = `$compiler_name -c ctest2.c -o ctest2.o 2>&1 && $compiler_name $openmp -v ctest2.o -o ctest2 2>&1 && rm -f ctest2.o ctest2 ctest2.exe`;
$link = `$compiler_name $flags -c ctest2.c -o ctest2.o 2>&1 && $compiler_name $flags $openmp -v ctest2.o -o ctest2 2>&1 && rm -f ctest2.o ctest2 ctest2.exe`;
$link =~ s/\-Y\sP\,/\-Y/g;
@@ -305,6 +332,8 @@ $linker_a = "";
&& ($flags !~ /kernel32/)
&& ($flags !~ /advapi32/)
&& ($flags !~ /shell32/)
&& ($flags !~ /omp/)
&& ($flags !~ /[0-9]+/)
) {
$linker_l .= $flags . " "
}
@@ -345,6 +374,8 @@ print CONFFILE "#define __32BIT__\t1\n" if $binformat eq bin32;
print CONFFILE "#define __64BIT__\t1\n" if $binformat eq bin64;
print CONFFILE "#define FUNDERSCORE\t$need_fu\n" if $need_fu ne "";
print CONFFILE "#define HAVE_MSA\t1\n" if $have_msa eq 1;
print CONFFILE "#define HAVE_C11\t1\n" if $c11_atomics eq 1;
if ($os eq "LINUX") {

16
cblas.h
View File

@@ -25,6 +25,11 @@ char* openblas_get_config(void);
/*Get the CPU corename on runtime.*/
char* openblas_get_corename(void);
#ifdef OPENBLAS_OS_LINUX
/* Sets thread affinity for OpenBLAS threads. `thread_idx` is in [0, openblas_get_num_threads()-1]. */
int openblas_setaffinity(int thread_idx, size_t cpusetsize, cpu_set_t* cpu_set);
#endif
/* Get the parallelization type which is used by OpenBLAS */
int openblas_get_parallel(void);
/* OpenBLAS is compiled for sequential use */
@@ -377,6 +382,17 @@ void cblas_cgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint
void cblas_zgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double *calpha, double *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST double *cbeta,
double *c, OPENBLAS_CONST blasint cldc);
/*** BFLOAT16 and INT8 extensions ***/
/* convert float array to BFLOAT16 array by rounding */
void cblas_sbstobf16(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *in, OPENBLAS_CONST blasint incin, bfloat16 *out, OPENBLAS_CONST blasint incout);
/* convert double array to BFLOAT16 array by rounding */
void cblas_sbdtobf16(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *in, OPENBLAS_CONST blasint incin, bfloat16 *out, OPENBLAS_CONST blasint incout);
/* convert BFLOAT16 array to float array */
void cblas_sbf16tos(OPENBLAS_CONST blasint n, OPENBLAS_CONST bfloat16 *in, OPENBLAS_CONST blasint incin, float *out, OPENBLAS_CONST blasint incout);
/* convert BFLOAT16 array to double array */
void cblas_dbf16tod(OPENBLAS_CONST blasint n, OPENBLAS_CONST bfloat16 *in, OPENBLAS_CONST blasint incin, double *out, OPENBLAS_CONST blasint incout);
/* dot production of BFLOAT16 input arrays, and output as float */
float cblas_sbdot(OPENBLAS_CONST blasint n, OPENBLAS_CONST bfloat16 *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST bfloat16 *y, OPENBLAS_CONST blasint incy);
#ifdef __cplusplus
}

View File

@@ -1,4 +1,3 @@
##
## Author: Hank Anderson <hank@statease.com>
## Description: Ported from portion of OpenBLAS/Makefile.system
## Sets various variables based on architecture.
@@ -45,7 +44,11 @@ endif ()
if (DYNAMIC_ARCH)
if (ARM64)
set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99)
set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110)
endif ()
if (POWER)
set(DYNAMIC_CORE POWER6 POWER8 POWER9 POWER10)
endif ()
if (X86)
@@ -72,12 +75,16 @@ if (DYNAMIC_ARCH)
set(DYNAMIC_CORE ${DYNAMIC_CORE} HASWELL ZEN)
endif ()
if (NOT NO_AVX512)
set(DYNAMIC_CORE ${DYNAMIC_CORE} SKYLAKEX)
string(REGEX REPLACE "-march=native" "" CMAKE_C_FLAGS ${CMAKE_C_FLAGS})
endif ()
if (DYNAMIC_LIST)
set(DYNAMIC_CORE PRESCOTT ${DYNAMIC_LIST})
set(DYNAMIC_CORE ${DYNAMIC_CORE} SKYLAKEX COOPERLAKE)
string(REGEX REPLACE "-march=native" "" CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
endif ()
if (DYNAMIC_LIST)
set(DYNAMIC_CORE PRESCOTT ${DYNAMIC_LIST})
endif ()
endif ()
if (EXISTS ${PROJECT_SOURCE_DIR}/config_kernel.h)
message (FATAL_ERROR "Your build directory contains a file config_kernel.h, probably from a previous compilation with make. This will conflict with the cmake compilation and cause strange compiler errors - please remove the file before trying again")
endif ()
if (NOT DYNAMIC_CORE)

View File

@@ -3,7 +3,7 @@
## Description: Ported from portion of OpenBLAS/Makefile.system
## Sets C related variables.
if (${CMAKE_C_COMPILER} STREQUAL "GNU" OR ${CMAKE_C_COMPILER} STREQUAL "LSB" OR ${CMAKE_C_COMPILER} STREQUAL "Clang")
if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "LSB" OR ${CMAKE_C_COMPILER_ID} MATCHES "Clang")
set(CCOMMON_OPT "${CCOMMON_OPT} -Wall")
set(COMMON_PROF "${COMMON_PROF} -fno-inline")
@@ -43,7 +43,7 @@ if (${CMAKE_C_COMPILER} STREQUAL "GNU" OR ${CMAKE_C_COMPILER} STREQUAL "LSB" OR
endif ()
endif ()
if (${CMAKE_C_COMPILER} STREQUAL "PGI")
if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI")
if (BINARY64)
set(CCOMMON_OPT "${CCOMMON_OPT} -tp p7-64")
else ()
@@ -51,7 +51,7 @@ if (${CMAKE_C_COMPILER} STREQUAL "PGI")
endif ()
endif ()
if (${CMAKE_C_COMPILER} STREQUAL "PATHSCALE")
if (${CMAKE_C_COMPILER_ID} STREQUAL "PATHSCALE")
if (BINARY64)
set(CCOMMON_OPT "${CCOMMON_OPT} -m64")
else ()
@@ -59,7 +59,7 @@ if (${CMAKE_C_COMPILER} STREQUAL "PATHSCALE")
endif ()
endif ()
if (${CMAKE_C_COMPILER} STREQUAL "OPEN64")
if (${CMAKE_C_COMPILER_ID} STREQUAL "OPEN64")
if (MIPS64)
@@ -87,7 +87,7 @@ if (${CMAKE_C_COMPILER} STREQUAL "OPEN64")
endif ()
endif ()
if (${CMAKE_C_COMPILER} STREQUAL "SUN")
if (${CMAKE_C_COMPILER_ID} STREQUAL "SUN")
set(CCOMMON_OPT "${CCOMMON_OPT} -w")
if (X86)
set(CCOMMON_OPT "${CCOMMON_OPT} -m32")
@@ -96,3 +96,47 @@ if (${CMAKE_C_COMPILER} STREQUAL "SUN")
endif ()
endif ()
if (${CORE} STREQUAL "SKYLAKEX")
if (NOT DYNAMIC_ARCH)
if (NOT NO_AVX512)
set (CCOMMON_OPT "${CCOMMON_OPT} -march=skylake-avx512")
endif ()
endif ()
endif ()
if (${CORE} STREQUAL "COOPERLAKE")
if (NOT DYNAMIC_ARCH)
if (NOT NO_AVX512)
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
if (${GCC_VERSION} VERSION_GREATER 10.1 OR ${GCC_VERSION} VERSION_EQUAL 10.1)
set (CCOMMON_OPT "${CCOMMON_OPT} -march=cooperlake")
else ()
set (CCOMMON_OPT "${CCOMMON_OPT} -march=skylake-avx512")
endif()
endif ()
endif ()
endif ()
if (NOT DYNAMIC_ARCH)
if (HAVE_AVX2)
set (CCOMMON_OPT "${CCOMMON_OPT} -mavx2")
endif ()
if (HAVE_AVX)
set (CCOMMON_OPT "${CCOMMON_OPT} -mavx")
endif ()
if (HAVE_SSE)
set (CCOMMON_OPT "${CCOMMON_OPT} -msse")
endif ()
if (HAVE_SSE2)
set (CCOMMON_OPT "${CCOMMON_OPT} -msse2")
endif ()
if (HAVE_SSE3)
set (CCOMMON_OPT "${CCOMMON_OPT} -msse3")
endif ()
if (HAVE_SSSE3)
set (CCOMMON_OPT "${CCOMMON_OPT} -mssse3")
endif ()
if (HAVE_SSE4_1)
set (CCOMMON_OPT "${CCOMMON_OPT} -msse4.1")
endif ()
endif()

View File

@@ -21,7 +21,15 @@
# NEED2UNDERSCORES
if (NOT NO_LAPACK)
enable_language(Fortran)
include(CheckLanguage)
check_language(Fortran)
if(CMAKE_Fortran_COMPILER)
enable_language(Fortran)
else()
message(STATUS "No Fortran compiler found, can build only BLAS but not LAPACK")
set (NOFORTRAN 1)
set (NO_LAPACK 1)
endif()
else()
include(CMakeForceCompiler)
CMAKE_FORCE_Fortran_COMPILER(gfortran GNU)

View File

@@ -16,6 +16,7 @@ if (${F_COMPILER} STREQUAL "FLANG")
if (USE_OPENMP)
set(FCOMMON_OPT "${FCOMMON_OPT} -fopenmp")
endif ()
set(FCOMMON_OPT "${FCOMMON_OPT} -Mrecursive -Kieee")
endif ()
if (${F_COMPILER} STREQUAL "G77")
@@ -81,6 +82,7 @@ if (${F_COMPILER} STREQUAL "INTEL")
if (INTERFACE64)
set(FCOMMON_OPT "${FCOMMON_OPT} -i8")
endif ()
set(FCOMMON_OPT "${FCOMMON_OPT} -recursive")
if (USE_OPENMP)
set(FCOMMON_OPT "${FCOMMON_OPT} -openmp")
endif ()
@@ -120,6 +122,7 @@ if (${F_COMPILER} STREQUAL "PGI")
else ()
set(FCOMMON_OPT "${FCOMMON_OPT} -tp p7")
endif ()
set(FCOMMON_OPT "${FCOMMON_OPT} -Mrecursive")
if (USE_OPENMP)
set(FCOMMON_OPT "${FCOMMON_OPT} -mp")
endif ()

View File

@@ -113,11 +113,33 @@ macro(SetDefaultL1)
set(ZSUMKERNEL zsum.S)
set(QSUMKERNEL sum.S)
set(XSUMKERNEL zsum.S)
if (BUILD_BFLOAT16)
set(SHAMINKERNEL ../arm/amin.c)
set(SHAMAXKERNEL ../arm/amax.c)
set(SHMAXKERNEL ../arm/max.c)
set(SHMINKERNEL ../arm/min.c)
set(ISHAMAXKERNEL ../arm/iamax.c)
set(ISHAMINKERNEL ../arm/iamin.c)
set(ISHMAXKERNEL ../arm/imax.c)
set(ISHMINKERNEL ../arm/imin.c)
set(SHASUMKERNEL ../arm/asum.c)
set(SHAXPYKERNEL ../arm/axpy.c)
set(SHAXPBYKERNEL ../arm/axpby.c)
set(SHCOPYKERNEL ../arm/copy.c)
set(SBDOTKERNEL ../x86_64/sbdot.c)
set(SHROTKERNEL ../arm/rot.c)
set(SHSCALKERNEL ../arm/scal.c)
set(SHNRM2KERNEL ../arm/nrm2.c)
set(SHSUMKERNEL ../arm/sum.c)
set(SHSWAPKERNEL ../arm/swap.c)
set(TOBF16KERNEL ../x86_64/tobf16.c)
set(BF16TOKERNEL ../x86_64/bf16to.c)
endif ()
endmacro ()
macro(SetDefaultL2)
set(SGEMVNKERNEL gemv_n.S)
set(SGEMVTKERNEL gemv_t.S)
set(SGEMVNKERNEL ../arm/gemv_n.c)
set(SGEMVTKERNEL ../arm/gemv_t.c)
set(DGEMVNKERNEL gemv_n.S)
set(DGEMVTKERNEL gemv_t.S)
set(CGEMVNKERNEL zgemv_n.S)
@@ -161,6 +183,11 @@ macro(SetDefaultL2)
set(XHEMV_L_KERNEL ../generic/zhemv_k.c)
set(XHEMV_V_KERNEL ../generic/zhemv_k.c)
set(XHEMV_M_KERNEL ../generic/zhemv_k.c)
if (BUILD_BFLOAT16)
set(SBGEMVNKERNEL ../arm/gemv_n.c)
set(SBGEMVTKERNEL ../arm/gemv_t.c)
set(SHGERKERNEL ../generic/ger.c)
endif ()
endmacro ()
macro(SetDefaultL3)
@@ -168,4 +195,18 @@ macro(SetDefaultL3)
set(DGEADD_KERNEL ../generic/geadd.c)
set(CGEADD_KERNEL ../generic/zgeadd.c)
set(ZGEADD_KERNEL ../generic/zgeadd.c)
if (BUILD_BFLOAT16)
set(SHGEADD_KERNEL ../generic/geadd.c)
set(SBGEMMKERNEL ../generic/gemmkernel_2x2.c)
set(SBGEMM_BETA ../generic/gemm_beta.c)
set(SBGEMMINCOPY ../generic/gemm_ncopy_2.c)
set(SBGEMMITCOPY ../generic/gemm_tcopy_2.c)
set(SBGEMMONCOPY ../generic/gemm_ncopy_2.c)
set(SBGEMMOTCOPY ../generic/gemm_tcopy_2.c)
set(SBGEMMINCOPYOBJ sbgemm_incopy.o)
set(SBGEMMITCOPYOBJ sbgemm_itcopy.o)
set(SBGEMMONCOPYOBJ sbgemm_oncopy.o)
set(SBGEMMOTCOPYOBJ sbgemm_otcopy.o)
endif ()
endmacro ()

View File

@@ -1,11 +1,12 @@
# Sources for compiling lapack-netlib. Can't use CMakeLists.txt because lapack-netlib already has its own cmake files.
set(ALLAUX ilaenv.f ilaenv2stage.f ieeeck.f lsamen.f iparmq.f iparam2stage.F
ilaprec.f ilatrans.f ilauplo.f iladiag.f chla_transtype.f
ilaprec.f ilatrans.f ilauplo.f iladiag.f chla_transtype.f dlaset.f
../INSTALL/ilaver.f xerbla_array.f
../INSTALL/slamch.f)
set(SCLAUX
scombssq.f sbdsvdx.f sstevx.f sstein.f
sbdsdc.f
sbdsqr.f sdisna.f slabad.f slacpy.f sladiv.f slae2.f slaebz.f
slaed0.f slaed1.f slaed2.f slaed3.f slaed4.f slaed5.f slaed6.f
@@ -25,6 +26,7 @@ set(SCLAUX
set(DZLAUX
dbdsdc.f
dbdsvdx.f dstevx.f dstein.f
dbdsqr.f ddisna.f dlabad.f dlacpy.f dladiv.f dlae2.f dlaebz.f
dlaed0.f dlaed1.f dlaed2.f dlaed3.f dlaed4.f dlaed5.f dlaed6.f
dlaed7.f dlaed8.f dlaed9.f dlaeda.f dlaev2.f dlagtf.f
@@ -35,14 +37,14 @@ set(DZLAUX
dlartg.f dlaruv.f dlas2.f dlascl.f
dlasd0.f dlasd1.f dlasd2.f dlasd3.f dlasd4.f dlasd5.f dlasd6.f
dlasd7.f dlasd8.f dlasda.f dlasdq.f dlasdt.f
dlaset.f dlasq1.f dlasq2.f dlasq3.f dlasq4.f dlasq5.f dlasq6.f
dlasq1.f dlasq2.f dlasq3.f dlasq4.f dlasq5.f dlasq6.f
dlasr.f dlasrt.f dlassq.f dlasv2.f dpttrf.f dstebz.f dstedc.f
dsteqr.f dsterf.f dlaisnan.f disnan.f
dlartgp.f dlartgs.f
../INSTALL/dlamch.f ../INSTALL/dsecnd_${TIMER}.f)
set(SLASRC
sbdsvdx.f sgbbrd.f sgbcon.f sgbequ.f sgbrfs.f sgbsv.f
sgbbrd.f sgbcon.f sgbequ.f sgbrfs.f sgbsv.f
sgbsvx.f sgbtf2.f sgbtrf.f sgbtrs.f sgebak.f sgebal.f sgebd2.f
sgebrd.f sgecon.f sgeequ.f sgees.f sgeesx.f sgeev.f sgeevx.f
sgehd2.f sgehrd.f sgelq2.f sgelqf.f
@@ -83,8 +85,8 @@ set(SLASRC
ssbev.f ssbevd.f ssbevx.f ssbgst.f ssbgv.f ssbgvd.f ssbgvx.f
ssbtrd.f sspcon.f sspev.f sspevd.f sspevx.f sspgst.f
sspgv.f sspgvd.f sspgvx.f ssprfs.f sspsv.f sspsvx.f ssptrd.f
ssptrf.f ssptri.f ssptrs.f sstegr.f sstein.f sstev.f sstevd.f sstevr.f
sstevx.f ssycon.f ssyev.f ssyevd.f ssyevr.f ssyevx.f ssygs2.f
ssptrf.f ssptri.f ssptrs.f sstegr.f sstev.f sstevd.f sstevr.f
ssycon.f ssyev.f ssyevd.f ssyevr.f ssyevx.f ssygs2.f
ssygst.f ssygv.f ssygvd.f ssygvx.f ssyrfs.f ssysv.f ssysvx.f
ssytd2.f ssytf2.f ssytrd.f ssytrf.f ssytri.f ssytri2.f ssytri2x.f
ssyswapr.f ssytrs.f ssytrs2.f
@@ -115,7 +117,9 @@ set(SLASRC
stplqt.f stplqt2.f stpmlqt.f
ssytrd_2stage.f ssytrd_sy2sb.f ssytrd_sb2st.F ssb2st_kernels.f
ssyevd_2stage.f ssyev_2stage.f ssyevx_2stage.f ssyevr_2stage.f
ssbev_2stage.f ssbevx_2stage.f ssbevd_2stage.f ssygv_2stage.f)
ssbev_2stage.f ssbevx_2stage.f ssbevd_2stage.f ssygv_2stage.f
sgesvdq.f slaorhr_col_getrfnp.f
slaorhr_col_getrfnp2.f sorgtsqr.f sorhr_col.f )
set(SXLASRC sgesvxx.f sgerfsx.f sla_gerfsx_extended.f sla_geamv.f
sla_gercond.f sla_gerpvgrw.f ssysvxx.f ssyrfsx.f
@@ -210,7 +214,9 @@ set(CLASRC
ctplqt.f ctplqt2.f ctpmlqt.f
chetrd_2stage.f chetrd_he2hb.f chetrd_hb2st.F chb2st_kernels.f
cheevd_2stage.f cheev_2stage.f cheevx_2stage.f cheevr_2stage.f
chbev_2stage.f chbevx_2stage.f chbevd_2stage.f chegv_2stage.f)
chbev_2stage.f chbevx_2stage.f chbevd_2stage.f chegv_2stage.f
cgesvdq.f claunhr_col_getrfnp.f claunhr_col_getrfnp2.f
cungtsqr.f cunhr_col.f )
set(CXLASRC cgesvxx.f cgerfsx.f cla_gerfsx_extended.f cla_geamv.f
cla_gercond_c.f cla_gercond_x.f cla_gerpvgrw.f
@@ -225,7 +231,7 @@ set(CXLASRC cgesvxx.f cgerfsx.f cla_gerfsx_extended.f cla_geamv.f
cla_lin_berr.f clarscl2.f clascl2.f cla_wwaddw.f)
set(DLASRC
dbdsvdx.f dgbbrd.f dgbcon.f dgbequ.f dgbrfs.f dgbsv.f
dgbbrd.f dgbcon.f dgbequ.f dgbrfs.f dgbsv.f
dgbsvx.f dgbtf2.f dgbtrf.f dgbtrs.f dgebak.f dgebal.f dgebd2.f
dgebrd.f dgecon.f dgeequ.f dgees.f dgeesx.f dgeev.f dgeevx.f
dgehd2.f dgehrd.f dgelq2.f dgelqf.f
@@ -266,8 +272,8 @@ set(DLASRC
dsbev.f dsbevd.f dsbevx.f dsbgst.f dsbgv.f dsbgvd.f dsbgvx.f
dsbtrd.f dspcon.f dspev.f dspevd.f dspevx.f dspgst.f
dspgv.f dspgvd.f dspgvx.f dsprfs.f dspsv.f dspsvx.f dsptrd.f
dsptrf.f dsptri.f dsptrs.f dstegr.f dstein.f dstev.f dstevd.f dstevr.f
dstevx.f dsycon.f dsyev.f dsyevd.f dsyevr.f
dsptrf.f dsptri.f dsptrs.f dstegr.f dstev.f dstevd.f dstevr.f
dsycon.f dsyev.f dsyevd.f dsyevr.f
dsyevx.f dsygs2.f dsygst.f dsygv.f dsygvd.f dsygvx.f dsyrfs.f
dsysv.f dsysvx.f
dsytd2.f dsytf2.f dsytrd.f dsytrf.f dsytri.f dsytrs.f dsytrs2.f
@@ -299,7 +305,9 @@ set(DLASRC
dtplqt.f dtplqt2.f dtpmlqt.f
dsytrd_2stage.f dsytrd_sy2sb.f dsytrd_sb2st.F dsb2st_kernels.f
dsyevd_2stage.f dsyev_2stage.f dsyevx_2stage.f dsyevr_2stage.f
dsbev_2stage.f dsbevx_2stage.f dsbevd_2stage.f dsygv_2stage.f)
dsbev_2stage.f dsbevx_2stage.f dsbevd_2stage.f dsygv_2stage.f
dcombssq.f dgesvdq.f dlaorhr_col_getrfnp.f
dlaorhr_col_getrfnp2.f dorgtsqr.f dorhr_col.f )
set(DXLASRC dgesvxx.f dgerfsx.f dla_gerfsx_extended.f dla_geamv.f
dla_gercond.f dla_gerpvgrw.f dsysvxx.f dsyrfsx.f
@@ -398,7 +406,9 @@ set(ZLASRC
zgelq.f zlaswlq.f zlamswlq.f zgemlq.f
zhetrd_2stage.f zhetrd_he2hb.f zhetrd_hb2st.F zhb2st_kernels.f
zheevd_2stage.f zheev_2stage.f zheevx_2stage.f zheevr_2stage.f
zhbev_2stage.f zhbevx_2stage.f zhbevd_2stage.f zhegv_2stage.f)
zhbev_2stage.f zhbevx_2stage.f zhbevd_2stage.f zhegv_2stage.f
zgesvdq.f zlaunhr_col_getrfnp.f zlaunhr_col_getrfnp2.f
zungtsqr.f zunhr_col.f)
set(ZXLASRC zgesvxx.f zgerfsx.f zla_gerfsx_extended.f zla_geamv.f
zla_gercond_c.f zla_gercond_x.f zla_gerpvgrw.f zsysvxx.f zsyrfsx.f
@@ -466,12 +476,16 @@ endif()
if(BUILD_COMPLEX)
set(LA_REL_SRC ${LA_REL_SRC} ${CLASRC} ${ZCLASRC} ${ALLAUX} ${SCLAUX})
SET(LA_GEN_SRC ${LA_GEN_SRC} ${CMATGEN} ${SCATGEN})
message(STATUS "Building Complex Precision")
message(STATUS "Building Single Precision Complex")
endif()
if(BUILD_COMPLEX16)
set(LA_REL_SRC ${LA_REL_SRC} ${ZLASRC} ${ZCLASRC} ${ALLAUX} ${DZLAUX})
SET(LA_GEN_SRC ${LA_GEN_SRC} ${ZMATGEN} ${DZATGEN})
message(STATUS "Building Double Complex Precision")
# for zlange/zlanhe
if (NOT BUILD_DOUBLE)
set (LA_REL_SRC ${LA_REL_SRC} dcombssq.f)
endif ()
message(STATUS "Building Double Precision Complex")
endif()
# add lapack-netlib folder to the sources

View File

@@ -715,6 +715,8 @@ set(DSRC
lapacke_dgesv_work.c
lapacke_dgesvd.c
lapacke_dgesvd_work.c
lapacke_dgesvdq.c
lapacke_dgesvdq_work.c
lapacke_dgesvdx.c
lapacke_dgesvdx_work.c
lapacke_dgesvj.c
@@ -1287,6 +1289,8 @@ set(SSRC
lapacke_sgesv_work.c
lapacke_sgesvd.c
lapacke_sgesvd_work.c
lapacke_sgesvdq.c
lapacke_sgesvdq_work.c
lapacke_sgesvdx.c
lapacke_sgesvdx_work.c
lapacke_sgesvj.c
@@ -1853,6 +1857,8 @@ set(ZSRC
lapacke_zgesv_work.c
lapacke_zgesvd.c
lapacke_zgesvd_work.c
lapacke_zgesvdq.c
lapacke_zgesvdq_work.c
lapacke_zgesvdx.c
lapacke_zgesvdx_work.c
lapacke_zgesvj.c

View File

@@ -7,5 +7,5 @@ Name: OpenBLAS
Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version
Version: @OPENBLAS_VERSION@
URL: https://github.com/xianyi/OpenBLAS
Libs: -L${libdir} -lopenblas${libsuffix}
Libs: @OpenMP_C_FLAGS@ -L${libdir} -lopenblas${libsuffix}
Cflags: -I${includedir}

View File

@@ -8,7 +8,7 @@ if (${CMAKE_SYSTEM_NAME} STREQUAL "Linux")
set(NO_EXPRECISION 1)
endif ()
if (${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD|OpenBSD|NetBSD|DragonFly")
if (${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD|OpenBSD|NetBSD|DragonFly|Darwin")
set(EXTRALIB "${EXTRALIB} -lm")
set(NO_EXPRECISION 1)
endif ()

View File

@@ -16,6 +16,8 @@
# HAVE_SSE2
# HAVE_SSE3
# MAKE
# SBGEMM_UNROLL_M
# SBGEMM_UNROLL_N
# SGEMM_UNROLL_M
# SGEMM_UNROLL_N
# DGEMM_UNROLL_M
@@ -105,8 +107,39 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS
# Perhaps this should be inside a different file as it grows larger
file(APPEND ${TARGET_CONF_TEMP}
"#define ${TCORE}\n"
"#define CORE_${TCORE}\n"
"#define CHAR_CORENAME \"${TCORE}\"\n")
if ("${TCORE}" STREQUAL "ARMV7")
if ("${TCORE}" STREQUAL "CORE2")
file(APPEND ${TARGET_CONF_TEMP}
"#define L1_DATA_SIZE\t32768\n"
"#define L1_DATA_LINESIZE\t64\n"
"#define L2_SIZE\t1048576\n"
"#define L2_LINESIZE\t64\n"
"#define DTB_DEFAULT_ENTRIES\t256\n"
"#define DTB_SIZE\t4096\n"
"#define HAVE_CMOV\n"
"#define HAVE_MMX\n"
"#define HAVE_SSE\n"
"#define HAVE_SSE2\n"
"#define HAVE_SSE3\n"
"#define HAVE_SSSE3\n"
"#define SLOCAL_BUFFER_SIZE\t16384\n"
"#define DLOCAL_BUFFER_SIZE\t16384\n"
"#define CLOCAL_BUFFER_SIZE\t16384\n"
"#define ZLOCAL_BUFFER_SIZE\t16384\n")
set(SGEMM_UNROLL_M 8)
set(SGEMM_UNROLL_N 4)
set(DGEMM_UNROLL_M 4)
set(DGEMM_UNROLL_N 4)
set(CGEMM_UNROLL_M 4)
set(CGEMM_UNROLL_N 2)
set(ZGEMM_UNROLL_M 2)
set(ZGEMM_UNROLL_N 2)
set(CGEMM3M_UNROLL_M 8)
set(CGEMM3M_UNROLL_N 4)
set(ZGEMM3M_UNROLL_M 4)
set(ZGEMM3M_UNROLL_N 4)
elseif ("${TCORE}" STREQUAL "ARMV7")
file(APPEND ${TARGET_CONF_TEMP}
"#define L1_DATA_SIZE\t65536\n"
"#define L1_DATA_LINESIZE\t32\n"
@@ -121,6 +154,10 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS
set(SGEMM_UNROLL_N 4)
set(DGEMM_UNROLL_M 4)
set(DGEMM_UNROLL_N 4)
set(CGEMM_UNROLL_M 2)
set(CGEMM_UNROLL_N 2)
set(ZGEMM_UNROLL_M 2)
set(ZGEMM_UNROLL_N 2)
elseif ("${TCORE}" STREQUAL "ARMV8")
file(APPEND ${TARGET_CONF_TEMP}
"#define L1_DATA_SIZE\t32768\n"
@@ -158,8 +195,13 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS
"#define HAVE_VFP\n"
"#define HAVE_NEON\n"
"#define ARMV8\n")
if ("${TCORE}" STREQUAL "CORTEXA57")
set(SGEMM_UNROLL_M 16)
set(SGEMM_UNROLL_N 4)
else ()
set(SGEMM_UNROLL_M 8)
set(SGEMM_UNROLL_N 8)
endif ()
set(DGEMM_UNROLL_M 8)
set(DGEMM_UNROLL_N 4)
set(CGEMM_UNROLL_M 8)
@@ -194,6 +236,33 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS
set(ZGEMM_UNROLL_M 4)
set(ZGEMM_UNROLL_N 4)
set(SYMV_P 16)
elseif ("${TCORE}" STREQUAL "NEOVERSEN1")
file(APPEND ${TARGET_CONF_TEMP}
"#define L1_CODE_SIZE\t65536\n"
"#define L1_CODE_LINESIZE\t64\n"
"#define L1_CODE_ASSOCIATIVE\t4\n"
"#define L1_DATA_SIZE\t65536\n"
"#define L1_DATA_LINESIZE\t64\n"
"#define L1_DATA_ASSOCIATIVE\t2\n"
"#define L2_SIZE\t1048576\n\n"
"#define L2_LINESIZE\t64\n"
"#define L2_ASSOCIATIVE\t16\n"
"#define DTB_DEFAULT_ENTRIES\t64\n"
"#define DTB_SIZE\t4096\n"
"#define HAVE_VFPV4\n"
"#define HAVE_VFPV3\n"
"#define HAVE_VFP\n"
"#define HAVE_NEON\n"
"#define ARMV8\n")
set(SGEMM_UNROLL_M 16)
set(SGEMM_UNROLL_N 4)
set(DGEMM_UNROLL_M 8)
set(DGEMM_UNROLL_N 4)
set(CGEMM_UNROLL_M 8)
set(CGEMM_UNROLL_N 4)
set(ZGEMM_UNROLL_M 4)
set(ZGEMM_UNROLL_N 4)
set(SYMV_P 16)
elseif ("${TCORE}" STREQUAL "FALKOR")
file(APPEND ${TARGET_CONF_TEMP}
"#define L1_CODE_SIZE\t65536\n"
@@ -274,7 +343,136 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS
set(ZGEMM_UNROLL_M 4)
set(ZGEMM_UNROLL_N 4)
set(SYMV_P 16)
elseif ("${TCORE}" STREQUAL "THUNDERX3T110")
file(APPEND ${TARGET_CONF_TEMP}
"#define THUNDERX3T110\n"
"#define L1_CODE_SIZE\t65536\n"
"#define L1_CODE_LINESIZE\t64\n"
"#define L1_CODE_ASSOCIATIVE\t8\n"
"#define L1_DATA_SIZE\t65536\n"
"#define L1_DATA_LINESIZE\t64\n"
"#define L1_DATA_ASSOCIATIVE\t8\n"
"#define L2_SIZE\t524288\n"
"#define L2_LINESIZE\t64\n"
"#define L2_ASSOCIATIVE\t8\n"
"#define L3_SIZE\t94371840\n"
"#define L3_LINESIZE\t64\n"
"#define L3_ASSOCIATIVE\t32\n"
"#define DTB_DEFAULT_ENTRIES\t64\n"
"#define DTB_SIZE\t4096\n"
"#define ARMV8\n")
set(SGEMM_UNROLL_M 16)
set(SGEMM_UNROLL_N 4)
set(DGEMM_UNROLL_M 8)
set(DGEMM_UNROLL_N 4)
set(CGEMM_UNROLL_M 8)
set(CGEMM_UNROLL_N 4)
set(ZGEMM_UNROLL_M 4)
set(ZGEMM_UNROLL_N 4)
set(SYMV_P 16)
elseif ("${TCORE}" STREQUAL "TSV110")
file(APPEND ${TARGET_CONF_TEMP}
"#define ARMV8\n"
"#define L1_CODE_SIZE\t65536\n"
"#define L1_CODE_LINESIZE\t64\n"
"#define L1_CODE_ASSOCIATIVE\t4\n"
"#define L1_DATA_SIZE\t65536\n"
"#define L1_DATA_LINESIZE\t64\n"
"#define L1_DATA_ASSOCIATIVE\t4\n"
"#define L2_SIZE\t524288\n"
"#define L2_LINESIZE\t64\n"
"#define L2_ASSOCIATIVE\t8\n"
"#define DTB_DEFAULT_ENTRIES\t64\n"
"#define DTB_SIZE\t4096\n")
set(SGEMM_UNROLL_M 16)
set(SGEMM_UNROLL_N 4)
set(DGEMM_UNROLL_M 8)
set(DGEMM_UNROLL_N 4)
set(CGEMM_UNROLL_M 8)
set(CGEMM_UNROLL_N 4)
set(ZGEMM_UNROLL_M 4)
set(ZGEMM_UNROLL_N 4)
set(SYMV_P 16)
elseif ("${TCORE}" STREQUAL "EMAG8180")
file(APPEND ${TARGET_CONF_TEMP}
"#define ARMV8\n"
"#define L1_CODE_SIZE\t32768\n"
"#define L1_CODE_LINESIZE\t64\n"
"#define L1_CODE_ASSOCIATIVE\t4\n"
"#define L1_DATA_SIZE\t32768\n"
"#define L1_DATA_LINESIZE\t64\n"
"#define L1_DATA_ASSOCIATIVE\t4\n"
"#define L2_SIZE\t5262144\n"
"#define L2_LINESIZE\t64\n"
"#define L2_ASSOCIATIVE\t8\n"
"#define DTB_DEFAULT_ENTRIES\t64\n"
"#define DTB_SIZE\t4096\n")
set(SGEMM_UNROLL_M 16)
set(SGEMM_UNROLL_N 4)
set(DGEMM_UNROLL_M 8)
set(DGEMM_UNROLL_N 4)
set(CGEMM_UNROLL_M 8)
set(CGEMM_UNROLL_N 4)
set(ZGEMM_UNROLL_M 4)
set(ZGEMM_UNROLL_N 4)
set(SYMV_P 16)
elseif ("${TCORE}" STREQUAL "POWER6")
file(APPEND ${TARGET_CONF_TEMP}
"#define L1_DATA_SIZE 32768\n"
"#define L1_DATA_LINESIZE 128\n"
"#define L2_SIZE 524288\n"
"#define L2_LINESIZE 128 \n"
"#define DTB_DEFAULT_ENTRIES 128\n"
"#define DTB_SIZE 4096\n"
"#define L2_ASSOCIATIVE 8\n")
set(SGEMM_UNROLL_M 4)
set(SGEMM_UNROLL_N 4)
set(DGEMM_UNROLL_M 4)
set(DGEMM_UNROLL_N 4)
set(CGEMM_UNROLL_M 2)
set(CGEMM_UNROLL_N 4)
set(ZGEMM_UNROLL_M 2)
set(ZGEMM_UNROLL_N 4)
set(SYMV_P 8)
elseif ("${TCORE}" STREQUAL "POWER8")
file(APPEND ${TARGET_CONF_TEMP}
"#define L1_DATA_SIZE 32768\n"
"#define L1_DATA_LINESIZE 128\n"
"#define L2_SIZE 524288\n"
"#define L2_LINESIZE 128 \n"
"#define DTB_DEFAULT_ENTRIES 128\n"
"#define DTB_SIZE 4096\n"
"#define L2_ASSOCIATIVE 8\n")
set(SGEMM_UNROLL_M 16)
set(SGEMM_UNROLL_N 8)
set(DGEMM_UNROLL_M 16)
set(DGEMM_UNROLL_N 4)
set(CGEMM_UNROLL_M 8)
set(CGEMM_UNROLL_N 4)
set(ZGEMM_UNROLL_M 8)
set(ZGEMM_UNROLL_N 2)
set(SYMV_P 8)
elseif ("${TCORE}" STREQUAL "POWER9" OR "${TCORE}" STREQUAL "POWER10")
file(APPEND ${TARGET_CONF_TEMP}
"#define L1_DATA_SIZE 32768\n"
"#define L1_DATA_LINESIZE 128\n"
"#define L2_SIZE 524288\n"
"#define L2_LINESIZE 128 \n"
"#define DTB_DEFAULT_ENTRIES 128\n"
"#define DTB_SIZE 4096\n"
"#define L2_ASSOCIATIVE 8\n")
set(SGEMM_UNROLL_M 16)
set(SGEMM_UNROLL_N 8)
set(DGEMM_UNROLL_M 16)
set(DGEMM_UNROLL_N 4)
set(CGEMM_UNROLL_M 8)
set(CGEMM_UNROLL_N 4)
set(ZGEMM_UNROLL_M 8)
set(ZGEMM_UNROLL_N 2)
set(SYMV_P 8)
endif()
set(SBGEMM_UNROLL_M 8)
set(SBGEMM_UNROLL_N 4)
# Or should this actually be NUM_CORES?
if (${NUM_THREADS} GREATER 0)
@@ -309,6 +507,9 @@ else(NOT CMAKE_CROSSCOMPILING)
set(GETARCH_FLAGS ${GETARCH_FLAGS} -DFORCE_GENERIC)
else()
list(APPEND GETARCH_SRC ${PROJECT_SOURCE_DIR}/cpuid.S)
if (DEFINED TARGET_CORE)
set(GETARCH_FLAGS ${GETARCH_FLAGS} -DFORCE_${TARGET_CORE})
endif ()
endif ()
if ("${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore")
@@ -323,7 +524,7 @@ else(NOT CMAKE_CROSSCOMPILING)
if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore")
try_compile(GETARCH_RESULT ${GETARCH_DIR}
SOURCES ${GETARCH_SRC}
COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} -I${GETARCH_DIR} -I"${PROJECT_SOURCE_DIR}" -I"${PROJECT_BINARY_DIR}"
COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} -I"${GETARCH_DIR}" -I"${PROJECT_SOURCE_DIR}" -I"${PROJECT_BINARY_DIR}"
OUTPUT_VARIABLE GETARCH_LOG
COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH_BIN}
)
@@ -351,7 +552,7 @@ execute_process(COMMAND "${PROJECT_BINARY_DIR}/${GETARCH_BIN}" 1 OUTPUT_VARIABLE
if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore")
try_compile(GETARCH2_RESULT ${GETARCH2_DIR}
SOURCES ${PROJECT_SOURCE_DIR}/getarch_2nd.c
COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GETARCH2_FLAGS} -I${GETARCH2_DIR} -I"${PROJECT_SOURCE_DIR}" -I"${PROJECT_BINARY_DIR}"
COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GETARCH2_FLAGS} -I"${GETARCH2_DIR}" -I"${PROJECT_SOURCE_DIR}" -I"${PROJECT_BINARY_DIR}"
OUTPUT_VARIABLE GETARCH2_LOG
COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH2_BIN}
)

View File

@@ -33,7 +33,7 @@ endif ()
if (DEFINED BINARY AND DEFINED TARGET AND BINARY EQUAL 32)
message(STATUS "Compiling a ${BINARY}-bit binary.")
set(NO_AVX 1)
if (${TARGET} STREQUAL "HASWELL" OR ${TARGET} STREQUAL "SANDYBRIDGE" OR ${TARGET} STREQUAL "SKYLAKEX")
if (${TARGET} STREQUAL "HASWELL" OR ${TARGET} STREQUAL "SANDYBRIDGE" OR ${TARGET} STREQUAL "SKYLAKEX" OR ${TARGET} STREQUAL "COOPERLAKE")
set(TARGET "NEHALEM")
endif ()
if (${TARGET} STREQUAL "BULLDOZER" OR ${TARGET} STREQUAL "PILEDRIVER" OR ${TARGET} STREQUAL "ZEN")
@@ -45,6 +45,18 @@ if (DEFINED BINARY AND DEFINED TARGET AND BINARY EQUAL 32)
endif ()
if (DEFINED TARGET)
if (${TARGET} STREQUAL "COOPERLAKE" AND NOT NO_AVX512)
# if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU")
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
if (${GCC_VERSION} VERSION_GREATER 10.1 OR ${GCC_VERSION} VERSION_EQUAL 10.1)
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=cooperlake")
else()
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512")
endif()
# elseif (${CMAKE_C_COMPILER_ID} STREQUAL "CLANG")
# set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2")
# endif()
endif()
if (${TARGET} STREQUAL "SKYLAKEX" AND NOT NO_AVX512)
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512")
endif()
@@ -58,6 +70,21 @@ if (DEFINED TARGET)
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2")
endif()
endif()
if (DEFINED HAVE_SSE)
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse")
endif()
if (DEFINED HAVE_SSE2)
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse2")
endif()
if (DEFINED HAVE_SSE3)
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3")
endif()
if (DEFINED HAVE_SSSE3)
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mssse3")
endif()
if (DEFINED HAVE_SSE4_1)
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse4.1")
endif()
endif()
if (DEFINED TARGET)
@@ -66,7 +93,7 @@ if (DEFINED TARGET)
endif ()
# On x86_64 build getarch with march=native. This is required to detect AVX512 support in getarch.
if (X86_64)
if (X86_64 AND NOT ${CMAKE_C_COMPILER_ID} STREQUAL "PGI")
set(GETARCH_FLAGS "${GETARCH_FLAGS} -march=native")
endif ()
@@ -98,6 +125,11 @@ if (NO_AVX2)
set(GETARCH_FLAGS "${GETARCH_FLAGS} -DNO_AVX2")
endif ()
if (NO_AVX512)
message(STATUS "Disabling Advanced Vector Extensions 512 (AVX512).")
set(GETARCH_FLAGS "${GETARCH_FLAGS} -DNO_AVX512")
endif ()
if (CMAKE_BUILD_TYPE STREQUAL "Debug")
set(GETARCH_FLAGS "${GETARCH_FLAGS} ${CMAKE_C_FLAGS_DEBUG}")
endif ()
@@ -289,10 +321,30 @@ set(CCOMMON_OPT "${CCOMMON_OPT} -DMAX_CPU_NUMBER=${NUM_THREADS}")
set(CCOMMON_OPT "${CCOMMON_OPT} -DMAX_PARALLEL_NUMBER=${NUM_PARALLEL}")
if (BUFFERSIZE)
set(CCOMMON_OPT "${CCOMMON_OPT} -DBUFFERSIZE=${BUFFERSIZE}")
endif ()
if (USE_SIMPLE_THREADED_LEVEL3)
set(CCOMMON_OPT "${CCOMMON_OPT} -DUSE_SIMPLE_THREADED_LEVEL3")
endif ()
if (NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Windows")
if (DEFINED MAX_STACK_ALLOC)
if (NOT ${MAX_STACK_ALLOC} EQUAL 0)
set(CCOMMON_OPT "${CCOMMON_OPT} -DMAX_STACK_ALLOC=${MAX_STACK_ALLOC}")
endif ()
else ()
set(CCOMMON_OPT "${CCOMMON_OPT} -DMAX_STACK_ALLOC=2048")
endif ()
endif ()
if (NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Windows")
if (DEFINED BLAS3_MEM_ALLOC_THRESHOLD)
if (NOT ${BLAS3_MEM_ALLOC_THRESHOLD} EQUAL 32)
set(CCOMMON_OPT "${CCOMMON_OPT} -DBLAS3_MEM_ALLOC_THRESHOLD=${BLAS3_MEM_ALLOC_THRESHOLD}")
endif()
endif()
endif()
if (DEFINED LIBNAMESUFFIX)
set(LIBPREFIX "libopenblas_${LIBNAMESUFFIX}")
else ()
@@ -362,6 +414,25 @@ set(REVISION "-r${OpenBLAS_VERSION}")
set(MAJOR_VERSION ${OpenBLAS_MAJOR_VERSION})
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${CCOMMON_OPT}")
if (NOT BUILD_SINGLE AND NOT BUILD_DOUBLE AND NOT BUILD_COMPLEX AND NOT BUILD_COMPLEX16)
set (BUILD_SINGLE ON)
set (BUILD_DOUBLE ON)
set (BUILD_COMPLEX ON)
set (BUILD_COMPLEX16 ON)
endif()
if (BUILD_SINGLE)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_SINGLE")
endif()
if (BUILD_DOUBLE)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_DOUBLE")
endif()
if (BUILD_COMPLEX)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_COMPLEX")
endif()
if (BUILD_COMPLEX16)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_COMPLEX16")
endif()
if(NOT MSVC)
set(CMAKE_ASM_FLAGS "${CMAKE_ASM_FLAGS} ${CCOMMON_OPT}")
endif()
@@ -403,6 +474,14 @@ if (${CMAKE_C_COMPILER} STREQUAL "LSB" OR ${CMAKE_SYSTEM_NAME} STREQUAL "Windows
set(LAPACK_CFLAGS "${LAPACK_CFLAGS} -DLAPACK_COMPLEX_STRUCTURE")
endif ()
if ("${CMAKE_BUILD_TYPE}" STREQUAL "Release")
if ("${F_COMPILER}" STREQUAL "FLANG")
if (${CMAKE_Fortran_COMPILER_VERSION} VERSION_LESS_EQUAL 3)
set(CMAKE_Fortran_FLAGS_RELEASE "${CMAKE_Fortran_FLAGS_RELEASE} -fno-unroll-loops")
endif ()
endif ()
endif ()
if (NOT DEFINED SUFFIX)
set(SUFFIX o)
endif ()
@@ -526,6 +605,8 @@ endif ()
#export FUNCTION_PROFILE
#export TARGET_CORE
#
#export SBGEMM_UNROLL_M
#export SBGEMM_UNROLL_N
#export SGEMM_UNROLL_M
#export SGEMM_UNROLL_N
#export DGEMM_UNROLL_M

View File

@@ -39,21 +39,45 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc.*|power.*|Power.*")
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "mips64.*")
set(MIPS64 1)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*")
if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
set(X86_64 1)
if (NOT BINARY)
if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
set(X86_64 1)
else()
set(X86 1)
endif()
else()
set(X86 1)
if (${BINARY} EQUAL "64")
set(X86_64 1)
else ()
set(X86 1)
endif()
endif()
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "i686.*|i386.*|x86.*|amd64.*|AMD64.*")
set(X86 1)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm.*|ARM.*)")
set(ARM 1)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64.*|AARCH64.*)")
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64.*|AARCH64.*|arm64.*|ARM64.*)")
if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
set(ARM64 1)
else()
set(ARM 1)
endif()
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm.*|ARM.*)")
set(ARM 1)
elseif (${CMAKE_CROSSCOMPILING})
if (${TARGET} STREQUAL "CORE2")
if (NOT BINARY)
set(X86 1)
elseif (${BINARY} EQUAL "64")
set(X86_64 1)
else ()
set(X86 1)
endif()
elseif (${TARGET} STREQUAL "ARMV7")
set(ARM 1)
else()
set(ARM64 1)
endif ()
else ()
message(WARNING "Target ARCH could not be determined, got \"${CMAKE_SYSTEM_PROCESSOR}\"")
endif()
if (X86_64)
@@ -85,11 +109,18 @@ else()
endif()
if (X86_64 OR X86)
file(WRITE ${PROJECT_BINARY_DIR}/avx512.tmp "#include <immintrin.h>\n\nint main(void){ __asm__ volatile(\"vbroadcastss -4 * 4(%rsi), %zmm2\"); }")
execute_process(COMMAND ${CMAKE_C_COMPILER} -march=skylake-avx512 -c -v -o ${PROJECT_BINARY_DIR}/avx512.o -x c ${PROJECT_BINARY_DIR}/avx512.tmp OUTPUT_QUIET ERROR_QUIET RESULT_VARIABLE NO_AVX512)
if (NOT NO_AVX512)
file(WRITE ${PROJECT_BINARY_DIR}/avx512.c "#include <immintrin.h>\n\nint main(void){ __asm__ volatile(\"vbroadcastss -4 * 4(%rsi), %zmm2\"); }")
execute_process(COMMAND ${CMAKE_C_COMPILER} -march=skylake-avx512 -c -v -o ${PROJECT_BINARY_DIR}/avx512.o ${PROJECT_BINARY_DIR}/avx512.c OUTPUT_QUIET ERROR_QUIET RESULT_VARIABLE NO_AVX512)
if (NO_AVX512 EQUAL 1)
set (CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX512")
endif()
file(REMOVE "avx512.tmp" "avx512.o")
file(REMOVE "avx512.c" "avx512.o")
endif()
endif()
include(CheckIncludeFile)
CHECK_INCLUDE_FILE("stdatomic.h" HAVE_C11)
if (HAVE_C11 EQUAL 1)
set (CCOMMON_OPT "${CCOMMON_OPT} -DHAVE_C11")
endif()

View File

@@ -15,12 +15,36 @@ endfunction ()
# Reads a Makefile into CMake vars.
macro(ParseMakefileVars MAKEFILE_IN)
message(STATUS "Reading vars from ${MAKEFILE_IN}...")
set (IfElse 0)
set (ElseSeen 0)
file(STRINGS ${MAKEFILE_IN} makefile_contents)
foreach (makefile_line ${makefile_contents})
#message(STATUS "parsing ${makefile_line}")
if (${IfElse} GREATER 0)
string(REGEX MATCH "endif[ \t]*" line_match "${makefile_line}")
if (NOT "${line_match}" STREQUAL "")
# message(STATUS "ENDIF ${makefile_line}")
set (IfElse 0)
set (ElseSeen 0)
continue ()
endif ()
string(REGEX MATCH "else[ \t]*" line_match "${makefile_line}")
if (NOT "${line_match}" STREQUAL "")
# message(STATUS "ELSE ${makefile_line}")
set (ElseSeen 1)
continue ()
endif()
if ( (${IfElse} EQUAL 2 AND ${ElseSeen} EQUAL 0) OR ( ${IfElse} EQUAL 1 AND ${ElseSeen} EQUAL 1))
# message(STATUS "skipping ${makefile_line}")
continue ()
endif ()
endif ()
string(REGEX MATCH "([0-9_a-zA-Z]+)[ \t]*=[ \t]*(.+)$" line_match "${makefile_line}")
if (NOT "${line_match}" STREQUAL "")
#message(STATUS "match on ${line_match}")
set(var_name ${CMAKE_MATCH_1})
set(var_value ${CMAKE_MATCH_2})
# set(var_value ${CMAKE_MATCH_2})
string(STRIP ${CMAKE_MATCH_2} var_value)
# check for Makefile variables in the string, e.g. $(TSUFFIX)
string(REGEX MATCHALL "\\$\\(([0-9_a-zA-Z]+)\\)" make_var_matches ${var_value})
foreach (make_var ${make_var_matches})
@@ -33,7 +57,31 @@ macro(ParseMakefileVars MAKEFILE_IN)
else ()
string(REGEX MATCH "include \\$\\(KERNELDIR\\)/(.+)$" line_match "${makefile_line}")
if (NOT "${line_match}" STREQUAL "")
#message(STATUS "match on include ${line_match}")
ParseMakefileVars(${KERNELDIR}/${CMAKE_MATCH_1})
else ()
# message(STATUS "unmatched line ${line_match}")
string(REGEX MATCH "ifeq \\(\\$\\(([_A-Z]+)\\),[ \t]*([0-9_A-Z]+)\\)" line_match "${makefile_line}")
if (NOT "${line_match}" STREQUAL "")
# message(STATUS "IFEQ: ${line_match} first: ${CMAKE_MATCH_1} second: ${CMAKE_MATCH_2}")
if (DEFINED ${${CMAKE_MATCH_1}} AND ${${CMAKE_MATCH_1}} STREQUAL ${CMAKE_MATCH_2})
# message (STATUS "condition is true")
set (IfElse 1)
else ()
set (IfElse 2)
endif ()
else ()
string(REGEX MATCH "ifneq \\(\\$\\(([_A-Z]+)\\),[ \t]*([0-9_A-Z]+)\\)" line_match "${makefile_line}")
if (NOT "${line_match}" STREQUAL "")
# message(STATUS "IFNEQ: ${line_match} first: ${CMAKE_MATCH_1} second: ${CMAKE_MATCH_2}")
if (NOT ( ${${CMAKE_MATCH_1}} STREQUAL ${CMAKE_MATCH_2}))
# message (STATUS "condition is true")
set (IfElse 1)
else ()
set (IfElse 2)
endif ()
endif ()
endif ()
endif ()
endif ()
endforeach ()
@@ -163,6 +211,7 @@ function(GenerateNamedObjects sources_in)
if (complex_only)
list(REMOVE_ITEM float_list "SINGLE")
list(REMOVE_ITEM float_list "DOUBLE")
list(REMOVE_ITEM float_list "BFLOAT16")
elseif (real_only)
list(REMOVE_ITEM float_list "COMPLEX")
list(REMOVE_ITEM float_list "ZCOMPLEX")
@@ -176,6 +225,9 @@ function(GenerateNamedObjects sources_in)
if (NOT no_float_type)
string(SUBSTRING ${float_type} 0 1 float_char)
string(TOLOWER ${float_char} float_char)
if (${float_type} STREQUAL "BFLOAT16")
set (float_char "sb")
endif ()
endif ()
if (NOT name_in)
@@ -210,6 +262,9 @@ function(GenerateNamedObjects sources_in)
if (${float_type} STREQUAL "DOUBLE" OR ${float_type} STREQUAL "ZCOMPLEX")
list(APPEND obj_defines "DOUBLE")
endif ()
if (${float_type} STREQUAL "BFLOAT16")
list(APPEND obj_defines "BFLOAT16")
endif ()
if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX")
list(APPEND obj_defines "COMPLEX")
if (mangle_complex_sources)

View File

@@ -257,6 +257,12 @@ typedef long BLASLONG;
typedef unsigned long BLASULONG;
#endif
#ifndef bfloat16
#include <stdint.h>
typedef uint16_t bfloat16;
#define BFLOAT16CONVERSION 1
#endif
#ifdef USE64BITINT
typedef BLASLONG blasint;
#if defined(OS_WINDOWS) && defined(__64BIT__)
@@ -297,6 +303,13 @@ typedef int blasint;
#define SIZE 8
#define BASE_SHIFT 3
#define ZBASE_SHIFT 4
#elif defined(BFLOAT16)
#define IFLOAT bfloat16
#define XFLOAT IFLOAT
#define FLOAT float
#define SIZE 2
#define BASE_SHIFT 1
#define ZBASE_SHIFT 2
#else
#define FLOAT float
#define SIZE 4
@@ -308,6 +321,10 @@ typedef int blasint;
#define XFLOAT FLOAT
#endif
#ifndef IFLOAT
#define IFLOAT FLOAT
#endif
#ifndef COMPLEX
#define COMPSIZE 1
#else
@@ -335,7 +352,7 @@ typedef int blasint;
#endif
#if defined(ARMV7) || defined(ARMV6) || defined(ARMV8) || defined(ARMV5)
#define YIELDING asm volatile ("nop;nop;nop;nop;nop;nop;nop;nop; \n");
#define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop; \n");
#endif
#ifdef BULLDOZER
@@ -344,13 +361,8 @@ typedef int blasint;
#endif
#endif
#ifdef POWER8
#ifndef YIELDING
#define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n");
#endif
#endif
#ifdef POWER9
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
#ifndef YIELDING
#define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n");
#endif
@@ -390,7 +402,7 @@ please https://github.com/xianyi/OpenBLAS/issues/246
#endif
#ifndef BLAS3_MEM_ALLOC_THRESHOLD
#define BLAS3_MEM_ALLOC_THRESHOLD 160
#define BLAS3_MEM_ALLOC_THRESHOLD 32
#endif
#ifdef QUAD_PRECISION
@@ -657,6 +669,8 @@ void gotoblas_dynamic_init(void);
void gotoblas_dynamic_quit(void);
void gotoblas_profile_init(void);
void gotoblas_profile_quit(void);
int support_avx512(void);
#ifdef USE_OPENMP
@@ -668,7 +682,7 @@ __declspec(dllimport) int __cdecl omp_in_parallel(void);
__declspec(dllimport) int __cdecl omp_get_num_procs(void);
#endif
#if (__STDC_VERSION__ >= 201112L)
#ifdef HAVE_C11
#if defined(C_GCC) && ( __GNUC__ < 7)
// workaround for GCC bug 65467
#ifndef _Atomic

View File

@@ -43,6 +43,7 @@
#define MB asm("mb")
#define WMB asm("wmb")
#define RMB asm("rmb")
static void __inline blas_lock(unsigned long *address){
#ifndef __DECC

View File

@@ -37,11 +37,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define MB
#define WMB
#define RMB
#else
#define MB __asm__ __volatile__ ("dmb ish" : : : "memory")
#define WMB __asm__ __volatile__ ("dmb ishst" : : : "memory")
#define RMB __asm__ __volatile__ ("dmb ish" : : : "memory")
#endif
@@ -121,7 +123,7 @@ REALNAME:
#endif
#define HUGE_PAGESIZE ( 4 << 20)
#define BUFFER_SIZE (16 << 20)
#define BUFFER_SIZE (32 << 20)
#define BASE_ADDRESS (START_ADDRESS - BUFFER_SIZE * MAX_CPU_NUMBER)

View File

@@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define MB __asm__ __volatile__ ("dmb ish" : : : "memory")
#define WMB __asm__ __volatile__ ("dmb ishst" : : : "memory")
#define RMB __asm__ __volatile__ ("dmb ishld" : : : "memory")
#define INLINE inline
@@ -53,16 +53,16 @@ static void __inline blas_lock(volatile BLASULONG *address){
BLASULONG ret;
do {
while (*address) {YIELDING;};
__asm__ __volatile__(
"mov x4, #1 \n\t"
"sevl \n\t"
"1: \n\t"
"wfe \n\t"
"2: \n\t"
"ldaxr x2, [%1] \n\t"
"cbnz x2, 1b \n\t"
"2: \n\t"
"stxr w3, x4, [%1] \n\t"
"cbnz w3, 1b \n\t"
"cbnz w3, 2b \n\t"
"mov %0, #0 \n\t"
: "=r"(ret), "=r"(address)
: "1"(address)
@@ -78,7 +78,20 @@ static void __inline blas_lock(volatile BLASULONG *address){
#define BLAS_LOCK_DEFINED
#if !defined(OS_DARWIN) && !defined (OS_ANDROID)
static __inline BLASULONG rpcc(void){
BLASULONG ret = 0;
blasint shift;
__asm__ __volatile__ ("isb; mrs %0,cntvct_el0":"=r"(ret));
__asm__ __volatile__ ("mrs %0,cntfrq_el0; clz %w0, %w0":"=&r"(shift));
return ret << shift;
}
#define RPCC_DEFINED
#define RPCC64BIT
#endif
static inline int blas_quickdivide(blasint x, blasint y){
return x / y;
@@ -103,12 +116,16 @@ static inline int blas_quickdivide(blasint x, blasint y){
#if defined(ASSEMBLER) && !defined(NEEDPARAM)
#define PROLOGUE \
.text ;\
.align 4 ;\
.global REALNAME ;\
.type REALNAME, %function ;\
.macro PROLOGUE
.text ;
.p2align 2 ;
.global REALNAME ;
#ifndef __APPLE__
.type REALNAME, %function ;
#endif
REALNAME:
.endm
#define EPILOGUE
@@ -124,12 +141,17 @@ REALNAME:
#endif
#define HUGE_PAGESIZE ( 4 << 20)
#ifndef BUFFERSIZE
#if defined(CORTEXA57)
#define BUFFER_SIZE (20 << 20)
#elif defined(TSV110) || defined(EMAG8180)
#define BUFFER_SIZE (32 << 20)
#else
#define BUFFER_SIZE (16 << 20)
#endif
#else
#define BUFFER_SIZE (32 << BUFFERSIZE)
#endif
#define BASE_ADDRESS (START_ADDRESS - BUFFER_SIZE * MAX_CPU_NUMBER)

View File

@@ -47,6 +47,7 @@
#define MB
#define WMB
#define RMB
#ifdef __ECC
#include <ia64intrin.h>

View File

@@ -54,6 +54,11 @@ double BLASFUNC(dsdot) (blasint *, float *, blasint *, float *, blasint *);
double BLASFUNC(ddot) (blasint *, double *, blasint *, double *, blasint *);
xdouble BLASFUNC(qdot) (blasint *, xdouble *, blasint *, xdouble *, blasint *);
float BLASFUNC(sbdot) (blasint *, bfloat16 *, blasint *, bfloat16 *, blasint *);
void BLASFUNC(sbstobf16) (blasint *, float *, blasint *, bfloat16 *, blasint *);
void BLASFUNC(sbdtobf16) (blasint *, double *, blasint *, bfloat16 *, blasint *);
void BLASFUNC(sbf16tos) (blasint *, bfloat16 *, blasint *, float *, blasint *);
void BLASFUNC(dbf16tod) (blasint *, bfloat16 *, blasint *, double *, blasint *);
#ifdef RETURN_BY_STRUCT
typedef struct {
@@ -469,6 +474,8 @@ void BLASFUNC(xhbmv)(char *, blasint *, blasint *, xdouble *, xdouble *, blasint
/* Level 3 routines */
void BLASFUNC(sbgemm)(char *, char *, blasint *, blasint *, blasint *, float *,
bfloat16 *, blasint *, bfloat16 *, blasint *, float *, float *, blasint *);
void BLASFUNC(sgemm)(char *, char *, blasint *, blasint *, blasint *, float *,
float *, blasint *, float *, blasint *, float *, float *, blasint *);
void BLASFUNC(dgemm)(char *, char *, blasint *, blasint *, blasint *, double *,

Some files were not shown because too many files have changed in this diff Show More