Compare commits

...

507 Commits

Author SHA1 Message Date
Zhang Xianyi
5f998efd7b Merge pull request #1247 from martin-frbg/revert-cgroups-pr
Revert #1246, "honor cgroup/cpuset limits" for now
2017-07-25 16:09:55 +08:00
Martin Kroeker
88a35ff457 Revert #1246, "honor cgroup/cpuset limits" for now
Unsafe usage of the __GLIBC_PREREQ macro lead to build breakage on non-glibc systems
2017-07-25 08:39:35 +02:00
Zhang Xianyi
5dde4e65d3 Merge branch 'develop'
0.2.20 version
2017-07-24 12:03:35 +08:00
Zhang Xianyi
27a9df6477 Update doc for 0.2.20 version. 2017-07-24 11:55:10 +08:00
Zhang Xianyi
7224022473 Merge pull request #1239 from martin-frbg/cgroups
Honor cgroup/cpuset limits when enumerating cpus
2017-07-24 11:46:52 +08:00
Zhang Xianyi
468ac3df9e Merge pull request #1244 from martin-frbg/micmuc_cimatcopy
Fix complex imatcopy for Trans cases with non-square matrix
2017-07-24 11:45:27 +08:00
Martin Kroeker
376048156b Use in-place transform shortcut only if matrix is square 2017-07-21 11:20:15 +02:00
Martin Kroeker
d1c5b8f913 Add files via upload 2017-07-20 20:51:06 +02:00
Martin Kroeker
91bde7d315 Exchange rows and cols in final omatcopy with BlasTrans
This is MicMuc's patch from #899
2017-07-15 22:02:53 +02:00
Martin Kroeker
80373ea039 More fixes for silly misedits 2017-07-15 12:48:42 +02:00
Martin Kroeker
d12b75a6c4 Fixup braces lost in previous edit 2017-07-15 11:53:28 +02:00
Martin Kroeker
7294fb1d9d Merge branch 'develop' into cgroups 2017-07-15 10:40:42 +02:00
Martin Kroeker
31e086d6a6 Disable ReLAPACK by default (#1238)
* Disable ReLAPACK by default; mention it in final build message if included

* Add files via upload

* Add files via upload

* Add files via upload
2017-07-13 22:01:47 +02:00
Zhang Xianyi
cbb47736af Merge pull request #1214 from martin-frbg/relapack
Initial import of ReLAPACK
2017-07-13 20:31:08 +08:00
Zhang Xianyi
2a7c6930ac Merge pull request #1234 from brada4/develop
Fix write past fixed size buffer
2017-07-13 20:27:37 +08:00
Martin Kroeker
c4ec882020 Merge pull request #1235 from xianyi/revert-1233-cpuid-fix
Revert "Fix unintentional fall-through cases in get_cacheinfo"
2017-07-12 09:37:55 +02:00
Martin Kroeker
d33fc32cf3 Revert "Fix unintentional fall-through cases in get_cacheinfo" 2017-07-12 09:35:11 +02:00
Andrew
529bfc36ec Fix write past fixed size buffer 2017-07-12 00:59:30 +02:00
Martin Kroeker
88249ca5f7 Add files via upload 2017-07-11 18:48:13 +02:00
Martin Kroeker
731c518cff Add files via upload 2017-07-11 18:42:39 +02:00
Martin Kroeker
29fc429d9a Honor cgroup/cpuset constraints when enumerating cpus 2017-07-11 18:27:33 +02:00
Martin Kroeker
e2d3b1561a Merge pull request #1233 from martin-frbg/cpuid-fix
Fix unintentional fall-through cases in get_cacheinfo
2017-07-11 17:10:55 +02:00
Martin Kroeker
4a012c3d20 Fix unintentional fall-through cases in get_cacheinfo
These appear to be unintended side effects of PR #1091, probably causing #1232
2017-07-11 15:39:15 +02:00
Zhang Xianyi
d5ef0dee9a Merge pull request #1226 from ashwinyes/develop_arm_clang_ual_fix
arm: Fix clang compilation for ARMv7
2017-07-10 20:04:42 +08:00
Zhang Xianyi
a590e6135c Merge pull request #1221 from ashwinyes/develop_arm_softfp
arm: add support for softfp in arm vfp assembly files
2017-07-10 20:03:57 +08:00
Zhang Xianyi
4239dd65ce Merge branch 'develop' into develop_arm_softfp 2017-07-10 20:02:36 +08:00
Martin Kroeker
3db2adf872 Merge pull request #1230 from martin-frbg/rhel5
Add sched_getcpu implementation for pre-2.6 glibc
2017-07-09 13:16:16 +02:00
Martin Kroeker
ad2462811a Do not add -lpthread on Android builds (#1229)
* Do not add -lpthread on Android builds

* Do not add -lpthread on Android cmake builds
2017-07-09 13:15:24 +02:00
Martin Kroeker
c1cf62d2c0 Add sched_getcpu implementation for pre-2.6 glibc
Fixes #1210, compilation on RHEL5 with affinity enabled
2017-07-09 09:45:38 +02:00
Zhang Xianyi
bfe1656b8b Merge pull request #1225 from martin-frbg/stolen_from_wernsaar_fork
fixed syrk_thread.c taken from wernsaar
2017-07-07 15:43:33 +08:00
Ashwin Sekhar T K
f02d535fde arm: Fix clang compilation for ARMv7
clang is not recognizing some pre-UAL VFP mnemonics like fnmacs, fnmacd,
fnmuls and fnmuld. Replaced them with equivalent UAL mnemonics which are
vmls.f32, vmls.f64, vnmul.f32 and vnmul.f64 respectively.
2017-07-07 12:35:58 +05:30
Martin Kroeker
49e62c0e77 fixed syrk_thread.c taken from wernsaar
Stride calculation fix copied from https://github.com/wernsaar/OpenBLAS/commit/88900e1
2017-07-06 17:30:12 +02:00
Martin Kroeker
3381f23709 Handle different object extensions in Makefile
The optimized LAPACK functions from interface use OS-dependent suffixes .o/.obj for the object files, while netlib LAPACK uses .o throughout. ReLAPACK object names have to match in order for function replacement in the growing library file to work.
2017-07-06 10:12:00 +02:00
Zhang Xianyi
fa6a920caa Link -lm or -lm_hard for Android ARMv7. 2017-07-05 17:05:06 +08:00
Zhang Xianyi
a6515bb858 Merge pull request #1218 from m-brow/power9
Optimise loads on Power9 LE
2017-07-03 13:48:29 +08:00
Zhang Xianyi
c66b842d66 Merge pull request #1212 from neilsh-msft/develop
Add Microsoft Windows 10 UWP build support
2017-07-03 13:43:48 +08:00
Martin Kroeker
df2dfe65d6 Update Makefile 2017-07-02 01:46:23 +02:00
Martin Kroeker
2c8d634619 Add files via upload 2017-07-02 00:50:14 +02:00
Ashwin Sekhar T K
37efb5bc1d arm: Remove unnecessary files/code
Since softfp code has been added to all required vfp kernels,
the code for auto detection of abi is no longer required.

The option to force softfp ABI on make command line by giving
ARM_SOFTFP_ABI=1 is retained. But there is no need to give this option
anymore.

Also the newly added C versions of 4x4/4x2 gemm/trmm kernels are removed.
These are longer required. Moreover these kernels has bugs.
2017-07-02 03:06:36 +05:30
Ashwin Sekhar T K
97d671eb61 arm: add softfp support in zgemm/ztrmm vfp kernels 2017-07-02 02:54:32 +05:30
Ashwin Sekhar T K
305cd2e8b4 arm: add softfp support in cgemm/ctrmm vfp kernels 2017-07-02 02:42:32 +05:30
Ashwin Sekhar T K
09bc6ebe5b arm: add softfp support in dgemm/dtrmm vfp kernels 2017-07-02 02:24:38 +05:30
Ashwin Sekhar T K
872a11a2bf arm: add softfp support in sgemm/strmm vfp kernels 2017-07-02 02:23:48 +05:30
Ashwin Sekhar T K
eda9e8632a generic: Bug fixes in generic 4x2 and 4x4 gemm kernels 2017-07-02 02:00:48 +05:30
Ashwin Sekhar T K
8f83d3f961 arm: add softfp support in vfp gemv kernels 2017-07-02 01:03:31 +05:30
Martin Kroeker
e5e47cfdb5 Merge pull request #1220 from ashwinyes/develop_aarch64_20170701_t99_options
arm64: Change mtune/mcpu options for THUNDERX2T99 target
2017-07-01 20:43:23 +02:00
Ashwin Sekhar T K
ebf9e9dabe arm64: Change mtune/mcpu options for THUNDERX2T99 target 2017-07-01 11:17:10 -07:00
Ashwin Sekhar T K
83bd547517 arm: add softfp support in kernel/arm/swap_vfp.S 2017-07-01 20:37:40 +05:30
Ashwin Sekhar T K
e25f4c01d6 arm: add softfp support in kernel/arm/nrm2_vfp*.S 2017-07-01 19:57:28 +05:30
Ashwin Sekhar T K
54915ce343 arm: add softfp support in kernel/arm/*dot_vfp.S 2017-06-30 23:46:02 +05:30
Ashwin Sekhar T K
0150fabdb6 arm: add softfp support in kernel/arm/rot_vfp.S 2017-06-30 21:52:32 +05:30
Ashwin Sekhar T K
4f0773f07d arm: add softfp support in kernel/arm/axpy_vfp.S 2017-06-30 20:25:59 +05:30
Ashwin Sekhar T K
aa5edebc80 arm: add softfp support in kernel/arm/asum_vfp.S 2017-06-30 18:21:05 +05:30
Ashwin Sekhar T K
89924b3d5b arm: Use assembly implementations based on the ARM abi
In case of softfp abi, assembly implementations of only those APIs are
used which doesnt have a floating point argument or return value.

In case of hard abi, all assembly implementations are used.
2017-06-30 18:21:05 +05:30
Ashwin Sekhar T K
da7f0ff425 generic: add some generic gemm and trmm kernels
Added generic 4x4 and 4x2 gemm kernels
Added generic 4x2 trmm kernel
2017-06-30 18:21:05 +05:30
Ashwin Sekhar T K
0d5c8e5386 arm: Determine the abi from compiler if not specified on command line
If ARM abi is not explicitly mentioned on the command line, then set the
arm abi to softfp or hard according to the compiler environment.
This assumes that compiler sets the defines __ARM_PCS and __ARM_PCS_VFP
accordingly.
2017-06-30 18:20:59 +05:30
Martin Kroeker
912410f214 Add ReLAPACK to Makefiles 2017-06-28 18:15:21 +02:00
Martin Kroeker
b122413fb0 Restore ReLAPACK test folder 2017-06-28 18:13:14 +02:00
Martin Kroeker
9b7b5f7fdc Add Elmar Peise's ReLAPACK 2017-06-28 17:38:41 +02:00
Neil Shipp
34513be726 Add Microsoft Windows 10 UWP build support 2017-06-23 13:07:34 -07:00
Zhang Xianyi
482015f8d6 Merge branch 'arm_soft_fp_abi' into develop 2017-06-23 11:35:25 +08:00
Zhang Xianyi
639000e34f Merge pull request #1211 from neilsh-msft/develop
Add 64bit support for Microsoft Visual Studio
2017-06-23 11:33:09 +08:00
Neil Shipp
5de7727cc7 Reorder dependencies to allow in-place build to succeed the first time. 2017-06-22 18:05:19 -07:00
Neil Shipp
96df4b9b17 Avoid truncating cblas.h when compiling gencblas target 2017-06-22 17:08:09 -07:00
Neil Shipp
29dc8e0c61 Revert changes to sed and awk 2017-06-21 17:49:57 -07:00
Neil Shipp
65e56cb29d Add 64bit support for Microsoft Visual Studio 2017-06-21 13:38:22 -07:00
Matt Brown
bd831a03a8 Optimise sscal for POWER9
Use lxvd2x instruction instead of lxvw4x.
lxvd2x performs far better on the new POWER architecture than lxvw4x.
2017-06-14 17:02:46 +10:00
Matt Brown
edc97918f8 Optimise srot for POWER9
Use lxvd2x instruction instead of lxvw4x.
lxvd2x performs far better on the new POWER architecture than lxvw4x.
2017-06-14 17:02:35 +10:00
Matt Brown
e0034de22d Optimise sdot for POWER9
Use lxvd2x instruction instead of lxvw4x.
lxvd2x performs far better on the new POWER architecture than lxvw4x.
2017-06-14 17:02:19 +10:00
Matt Brown
32c7fe6bff Optimise sasum for POWER9
Use lxvd2x instruction instead of lxvw4x.
lxvd2x performs far better on the new POWER architecture than lxvw4x.
2017-06-14 17:02:10 +10:00
Matt Brown
19bdf9d52b Optimise casum for POWER9
Use lxvd2x instruction instead of lxvw4x.
lxvd2x performs far better on the new POWER architecture than lxvw4x.
2017-06-14 17:00:07 +10:00
Matt Brown
4f09030fdc Optimise cswap for POWER9
Use lxvd2x instruction instead of lxvw4x.
lxvd2x performs far better on the new POWER architecture than lxvw4x.
2017-06-14 16:59:53 +10:00
Matt Brown
6f4eca5ea4 Optimise sswap for POWER9
Use lxvd2x instruction instead of lxvw4x.
lxvd2x performs far better on the new POWER architecture than lxvw4x.
2017-06-14 16:59:13 +10:00
Matt Brown
be55f96cbd Optimise scopy for POWER9
Use lxvd2x instruction instead of lxvw4x.
lxvd2x performs far better on the new POWER architecture than lxvw4x.
2017-06-14 16:59:13 +10:00
Matt Brown
96dd0ef4f7 Optimise ccopy for POWER9
Use lxvd2x instruction instead of lxvw4x.
lxvd2x performs far better on the new POWER architecture than lxvw4x.
2017-06-14 16:58:59 +10:00
Martin Kroeker
8f0d6c06a9 Fix installation of header files with cmake (#1186)
* Fix installation of header files with cmake 

Install only the required header files, with openblas_config.h preprocessed like in Makefile.install
Fixes #1184

* Update CMakeLists.txt

Escape remaining semicolons in awk argument list (to get it working on Windows as well)

* Update CMakeLists.txt

* Update CMakeLists.txt

* Update CMakeLists.txt

* Update CMakeLists.txt

* Update CMakeLists.txt

* Add files via upload

* Update CMakeLists.txt

* Update CMakeLists.txt

* Update CMakeLists.txt

see if it is the single quotes that cause the problem on windows

* Update CMakeLists.txt

* Update CMakeLists.txt

* Update CMakeLists.txt

* Update CMakeLists.txt

* Update CMakeLists.txt

* Use C utility instead of awk for header generation in cmake builds

* Update CMakeLists.txt

* Fix generation and installation of header files

Generate openblas_config.h and f77blas.h with same contents as in plain Makefile builds and install only the public header files
2017-06-01 16:36:26 +02:00
Martin Kroeker
410a07cbec Merge pull request #1190 from oviradoi/utest_make_complex
Update test to use openblas_make_complex_float and openblas_make_comp…
2017-06-01 16:35:52 +02:00
Ovidiu Radoi
72f95a0acc Update test to use openblas_make_complex_float and openblas_make_complex_double functions 2017-05-30 12:12:49 +03:00
Martin Kroeker
e545b81e76 Merge pull request #1189 from pawosm-arm/flang
build: Flang has the same interface as PGI
2017-05-28 11:07:57 +02:00
Paul Osmialowski
d7afdf9137 build: Flang has the same interface as PGI
Signed-off-by: Paul Osmialowski <pawel.osmialowski@arm.com>
2017-05-27 06:26:48 +01:00
Martin Kroeker
4f4daaa42a Merge pull request #1188 from pawosm-arm/flang
build: Flang compiler support
2017-05-26 23:02:47 +02:00
Paul Osmialowski
42bbe74791 build: LLVM: Add Flang compiler support and enable OpenMP for Clang
Signed-off-by: Paul Osmialowski <pawel.osmialowski@arm.com>
2017-05-25 17:03:20 +01:00
Zhang Xianyi
c8322c65e4 Merge pull request #1187 from mine260309/develop
build: fix libxlmass errors building on Power CPU
2017-05-24 15:54:58 +08:00
Lei YU
87dde1fde6 build: fix libxlmass errors building on Power CPU
IBM MASS library is upgraded to 8.1.5 and 8.1.3 is not available.
Update README.md and Makefile.power to use version 8.1.5 of libxlmass.
2017-05-24 14:51:52 +08:00
Martin Kroeker
42466e54fa Merge pull request #1182 from martin-frbg/martin-frbg-patch-1
Build shared library on Android without SONAME versioning
2017-05-10 19:39:09 +02:00
Martin Kroeker
3b0624d50f Build shared library on Android without SONAME versioning
Android does not support versioned SONAME entries, ref. #1173
2017-05-10 13:08:13 +02:00
Martin Kroeker
fd4e68128e Merge pull request #1178 from jcowgill/mips-fixes
MIPS threading fixes
2017-05-06 17:20:10 +02:00
Martin Kroeker
6464d1723a Merge pull request #1179 from jcowgill/memory-fixes
Fixes to driver/others/memory.c
2017-05-06 13:08:46 +02:00
James Cowgill
59c97cfee4 memory: Fix buffer overflow when position == NUM_BUFFERS 2017-05-05 17:47:03 +01:00
James Cowgill
de7875ca5d mips: remove incorrect blas_lock implementations
MIPS 32-bit currently has an empty blas_lock implementation which is
worse than nothing at all. MIPS 64-bit does has a blas_lock
implementation but is broken. Remove them and fallback to the generic
version in common.h which should do the right thing on MIPS.
2017-05-05 17:28:03 +01:00
James Cowgill
67836c2ab4 mips: implement MB and WMB
The MIPS architecture has weak memory ordering and therefore requires
sutible memory barriers when doing lock free programming with multiple
threads (just like ARM does). This commit implements those barriers for
MIPS and MIPS64 using GCC bultins which is probably easiest way.
2017-05-05 17:14:03 +01:00
James Cowgill
5fecfe0f42 memory: switch loop condition around in blas_memory_free
Before this commit, the "position < NUM_BUFFERS" loop condition from
blas_memory_free will be completely optimized away by GCC. This is
because the condition can only be false after undefined behavior has
already been invoked (reading past the end of an array). As a
consequence of this bug, GCC also removes the subsequent if statement
and all the code after the error label because all of it is dead.

This commit switches the loop condition around so it works as intended.
2017-05-05 16:01:58 +01:00
Martin Kroeker
bba6676803 Merge pull request #1175 from martin-frbg/lapack_143
Fix workspace computation in LAPACKE ?tpmqrt
2017-05-05 12:00:04 +02:00
Martin Kroeker
5649b2c53a Merge pull request #1176 from staticfloat/sf/dynamic_arch
Fix DYNAMIC_ARCH=1 breaking builds on non-x86 platforms
2017-05-05 11:59:41 +02:00
Elliot Saba
6e972994b2 Force DYNAMIC_ARCH to empty when DYNAMIC_CORE is not set 2017-05-04 12:55:31 -07:00
Elliot Saba
5b04cf7ab4 Add Makefile debugging trick so that we can inspect runtime Makefile variables 2017-05-04 11:51:29 -07:00
Martin Kroeker
d5ea8fd823 Fix workspace computation for side=L
From netlib PR#144
2017-05-04 20:01:41 +02:00
Martin Kroeker
4beffaaa4b Fix workspace computation for side=L
From netlib PR#144
2017-05-04 19:59:02 +02:00
Martin Kroeker
fb28e4adc9 Fix workspace computation for side=L
From netlib PR#144
2017-05-04 19:55:02 +02:00
Martin Kroeker
26faa3ca47 Fix workspace allocation in lapacke_ctp for side=L
from netlib PR #144
2017-05-04 19:49:51 +02:00
Martin Kroeker
4f75989634 Merge pull request #1169 from martin-frbg/cblas_xerbla
Add trivial implementation of cblas_xerbla
2017-05-04 19:32:50 +02:00
Martin Kroeker
1e06b49854 Update xerbla.c 2017-04-26 20:29:30 +02:00
Martin Kroeker
7f546f54fa Add cblas_xerbla 2017-04-26 20:01:34 +02:00
Martin Kroeker
a809431e34 Add cblas_xerbla() 2017-04-26 19:58:59 +02:00
Martin Kroeker
5ee1cf0223 Merge pull request #1165 from rcoscali/patch-1
README.md update
2017-04-21 15:14:16 +02:00
Rémi Cohen-Scali
9aea7a0d9a Update README.md 2017-04-21 14:18:57 +02:00
Martin Kroeker
da0987507c Merge pull request #1164 from sharkcz/s390x
detect CPU on zArch
2017-04-21 10:53:49 +02:00
Dan Horák
81fed55782 detect CPU on zArch 2017-04-20 21:13:41 +02:00
Martin Kroeker
35387edb8d Merge pull request #1160 from gcp/extra-streamroller-cpuid
Add an extra familiy/model combination used by AMD Steamrolller.
2017-04-19 20:03:23 +02:00
Gian-Carlo Pascutto
9c884986ad Add an extra familiy/model combination used by AMD Steamrolller (Godavari). 2017-04-19 19:15:47 +02:00
Martin Kroeker
f2f0e98bb5 Merge pull request #1158 from martin-frbg/force-zen
Make FORCE_ZEN option in getarch.c actually set target names to ZEN
2017-04-19 15:04:41 +02:00
Martin Kroeker
166d64eb7c Fix FORCE_ZEN option in getarch.c 2017-04-19 14:20:42 +02:00
Martin Kroeker
e078339e8d Merge pull request #1157 from gcp/revert-zen-param
Revert Zen param.h to Haswell values (instead of Excavator).
2017-04-18 13:32:16 +02:00
Gian-Carlo Pascutto
832a272784 Revert Zen param.h to Haswell values (instead of Excavator). 2017-04-18 12:40:25 +02:00
Martin Kroeker
356606314c Merge pull request #1156 from SoapGentoo/cmake-fixes
Use GNUInstallDirs to allow changing target directories
2017-04-18 09:00:24 +02:00
David Seifert
ed79a29d87 Use GNUInstallDirs to allow changing target directories
* Multi-lib distributions need to change the libdir
  which is only portably possible with `GNUInstallDirs`.
* Multi-arch distributions such as Debian and Exherbo
  need to be able to change the bindir.
2017-04-16 00:43:47 +02:00
Martin Kroeker
77d16ffc69 Merge pull request #1154 from sharkcz/s390x
add lapack laswp directory for zarch
2017-04-13 16:37:29 +02:00
Dan Horák
56762d5e4c add lapack laswp for zarch 2017-04-13 15:38:59 +02:00
Zhang Xianyi
90dd190a6d Build shared library for Android. 2017-04-11 12:01:18 +08:00
Martin Kroeker
ab9ec4ab4e Merge pull request #1148 from gcp/fix-dynamic-zen
Fix dynamic detection for ZEN CPUs.
2017-04-10 20:17:14 +02:00
Gian-Carlo Pascutto
0cbd2d34e4 Recognize ZEN when passed as OPENBLAS_CORETYPE. 2017-04-10 20:05:16 +02:00
Gian-Carlo Pascutto
62979fd104 Fix dynamic detection for ZEN CPUs. 2017-04-10 19:08:37 +02:00
Martin Kroeker
20a413e154 Merge pull request #1142 from amodra/develop
Power8 inline assembly tweaks
2017-04-06 16:20:01 +02:00
Alan Modra
dc40bc7368 Power8 inline assembly tweaks
Further fixes on top of 9e2f316ed.  Writing some doco for gcc on
inline assembly woke me up to some more errors.

- dgemv_kernel_4x4 asm did not mention *ap as a memory input, and
  *y is both read and write.
- sasum_kernel_32 and casum_kernel_16 did not use %x for a vsx insn
  operand, a problem if the "=f" sum output was ever allocated a vsx
  reg in the altivec set.  This might be possible with inlining and
  future gcc optimisation.
2017-04-04 23:13:54 +09:30
Martin Kroeker
1acfc78c8f Merge pull request #1140 from JohannesBuchner/develop
Autodetect AMD A8-6410 as BARCELONA
2017-04-03 09:47:09 +02:00
Johannes Buchner
b4071d0d16 Autodetect AMD A8-6410 as BARCELONA 2017-04-03 17:07:27 +10:00
Martin Kroeker
7908efafc8 Fix integer overflow in LAPACK DBDSQR, SBDSQR (#1135)
* Fix integer overflow in DBDSQR

As noted in lapack issue 135, an integer overflow in the calculation of the iteration limit could lead to an immediate return without any iterations having been performed if the input matrix is sufficiently big.

* Fix integer overflow in SBDSQR

As noted in lapack issue 135, an integer overflow in the calculation of the iteration limit could lead to an immediate return without any iterations having been performed if the input matrix is sufficiently big.

* Fix integer overflow in threshold calculation

Related to lapack issue 135, the threshold calculation can overflow as well as the multiplication is evaluated from left to right.
Without explicit parentheses, the calculation would overflow for N >= 18919

* Fix integer overflow in threshold calculation

Related to lapack issue 135, the threshold calculation can overflow as well as the multiplication is evaluated from left to right.
Without explicit parentheses, the calculation would overflow for N >= 18919
2017-03-24 22:05:22 +01:00
Martin Kroeker
66dc10b019 Merge pull request #1133 from steckdenis/develop
Add ZEN support
2017-03-24 13:47:32 +01:00
Zhang Xianyi
b5c96fcfcd Support ARM SOFTFP ABI for saxpy, sdot, snrm2, sscal, sgemv, sger. 2017-03-20 17:39:25 +08:00
Denis Steckelmacher
c9ff735da6 Add ZEN support (tested for auto-detected static backend) 2017-03-19 15:32:50 +01:00
Andrew
99880f7906 Address unlikely memleak in zimatcopy interface (#1129)
* fix unlikely memleak in zimatcopy interface

* fix only unlikely memleak in zimatcopy interface

* fix only unlikely memleak in zimatcopy interface
2017-03-16 13:13:31 +01:00
Martin Kroeker
cd135e2b59 Merge pull request #1130 from quickwritereader/develop
Blas 3 for single precision
2017-03-15 10:00:52 +01:00
Martin Kroeker
ad124a5e8b Merge pull request #1126 from martin-frbg/pgi
Fix compilation with PGI by replacing verbatim _real_, _imag_ extensions and updating macro definitions for modern, C99-capable versions of the PGI compiler
2017-03-14 17:17:39 +01:00
Martin Kroeker
211d2eceb5 Update zdot.c 2017-03-13 18:08:00 +01:00
Martin Kroeker
5813ed095b Update zdot.c 2017-03-13 17:49:07 +01:00
Martin Kroeker
e44b028fe5 Replace gnu _real_, _imag_ extensions in initializers 2017-03-13 00:40:11 +01:00
Martin Kroeker
a6efabf155 Replace gnu _real_ , _imag_ extensions in initializers 2017-03-13 00:38:37 +01:00
Martin Kroeker
ea26b00c06 Fix CREAL,CIMAG macros for PGI 2017-03-13 00:36:01 +01:00
Abdurrauf
08786c4b95 strmm and ctrmm 2017-03-13 01:23:16 +04:00
Martin Kroeker
12e476f7a2 Merge pull request #1124 from martin-frbg/c_check-ppc
Update c_check.cmake to label ppc64 as power ARCH
2017-03-10 12:58:38 +01:00
Martin Kroeker
8de40955ad Update c_check.cmake 2017-03-10 11:45:48 +01:00
Martin Kroeker
9b24688eed Merge pull request #1122 from martin-frbg/zlasyf
Fix misspelling of zlasyf_aa from previous commit
2017-03-10 09:51:34 +01:00
Martin Kroeker
43224f7273 Fix misspelling of zlasyf_aa from previous commit 2017-03-10 08:44:49 +01:00
Martin Kroeker
9254a701f3 Merge pull request #1121 from staticfloat/sf/Xsymv_export
Add `csymv` and `zsymv` into `@lapackobjs2` for exporting
2017-03-10 08:33:36 +01:00
Elliot Saba
26a614fdd1 Whitespace cleanup/reformatting 2017-03-09 15:30:43 -08:00
Elliot Saba
7ae64f4f9c Add csymv and zsymv into @lapackobjs2 for exporting 2017-03-09 15:22:40 -08:00
Zhang Xianyi
90e02ccf68 Support ARM softfp ABI for sgemm on ARMV7.
make ARM_SOFTFP_ABI=1
2017-03-06 22:16:13 +08:00
Zhang Xianyi
503dcbfde6 Merge branch 'develop' into arm_soft_fp_abi 2017-03-06 13:53:56 +08:00
Abdurrauf
82e80fa82b initial strmm(sgemm). not tuned yet 2017-03-06 04:27:40 +04:00
Martin Kroeker
4227049c7d Merge pull request #1111 from martin-frbg/kaby-no-avx
Fix core detection for Kaby Lake without AVX (G4560)
2017-03-02 18:43:59 +01:00
Martin Kroeker
688267edf3 Fix core detection for Kaby Lake without AVX (G4560)
Should fix #1109)
2017-03-02 17:36:16 +01:00
Martin Kroeker
d1fe040d9b Merge pull request #1110 from quickwritereader/develop
Conventional usage of the register save area.
2017-03-01 23:08:07 +01:00
Abdurrauf
411982715c conventional usage of the register save area 2017-03-01 20:39:39 +04:00
Abdurrauf
e831d6924e changed to conventional register save area 2017-03-01 03:13:21 +04:00
Martin Kroeker
ffc1d6c468 Merge pull request #1108 from ashwinyes/develop_20170203_thunderx2t99
Optimized Implementations for ThunderX2T99
2017-02-28 16:02:19 +01:00
Ashwin Sekhar T K
a86474c6f7 THUNDERX2T99: Performance fix for ZGEMM 2017-02-28 06:05:00 -08:00
Ashwin Sekhar T K
67473d09dd THUNDERX2T99: Bug Fixes in D/Z NRM2 and ZGEMM 2017-02-28 01:11:38 -08:00
Ashwin Sekhar T K
19ba133383 THUNDERX2T99: Add Optimized ZGEMM Implementation 2017-02-28 05:31:41 +00:00
Martin Kroeker
f09a9afa03 Merge pull request #1107 from quickwritereader/develop
ztrmm(zgemm) complex double precision kernel for ibm z13
2017-02-26 09:49:01 +01:00
Abdurrauf
0d96b0e2a7 Merge branch 'z13' into develop 2017-02-26 06:17:33 +04:00
Abdurrauf
848cb27b1e ztrmm kernel. 2017-02-26 06:14:12 +04:00
Martin Kroeker
dc34a0da96 Merge pull request #915 from mdong/small_fix_for_icc
remove input from clobbered list
2017-02-23 20:00:22 +01:00
Ashwin Sekhar T K
a3935f0dfb THUNDERX2T99: Add Optimized D/Z NRM2 Implementation 2017-02-23 10:02:15 -08:00
Martin Kroeker
47e9fe0bb4 Merge pull request #1105 from martin-frbg/testing-eig-typos
TESTING/EIG: fix spurious EXTERNAL references to nonexistent functions
2017-02-22 22:42:52 +01:00
Martin Kroeker
c7bc0ee823 Remove spurious names from EXTERNAL list
Remove unused (and nonexistent) functions ZHETRD_SY2SB and ZHETRD_SB2ST from comment and EXTERNAL declaration
2017-02-22 21:48:35 +01:00
Martin Kroeker
6bdee6d50a Remove spurious names from EXTERNAL list
Remove unused (and nonexistent) ZHETRD_SY2SB and ZHETRD_SB2ST
2017-02-22 21:45:27 +01:00
Martin Kroeker
009c0d2e5a Fix typo in EXTERNAL declaration
ZHBTRD_HB2ST  should be ZHETRD_HB2ST
2017-02-22 21:41:07 +01:00
Martin Kroeker
4d88e1a4ad Merge pull request #1104 from martin-frbg/lapack-comma
LAPACK: fix missing comma on continued lines
2017-02-22 10:31:39 +01:00
Martin Kroeker
0958b49811 Fix missing comma on continued line
EXTERNAL declaration of subroutines missed a comma before the continuation line,
causing a strange run-together name to appear in the object when compiled with ifort.
2017-02-22 08:40:39 +01:00
Martin Kroeker
09b240f1ef Fix missing comma on continued line
EXTERNAL declaration of subroutines missed a comma before the continuation line,
causing a strange run-together name to appear in the object when compiled with ifort.
2017-02-22 08:39:06 +01:00
Martin Kroeker
69f4e8b86c Fix missing comma on continued line
EXTERNAL declaration of subroutines missed a comma before the continuation line,
causing a strange run-together name to appear in the object when compiled with ifort.
2017-02-22 08:34:20 +01:00
Martin Kroeker
e072e68aa0 Fix missing comma in continued line
EXTERNAL declaration of subroutines missed a comma before the continuation line,
causing a strange run-together name to appear in the object when compiled with ifort.
2017-02-22 08:32:20 +01:00
Ashwin Sekhar T K
738628e9a8 ARM64: Remove unused code 2017-02-21 21:42:32 -08:00
Martin Kroeker
e527dbffaa Merge pull request #1103 from vladimir-ch/fix-lapacke-ormbr
LAPACKE: fix wrong matrix size in ?ormbr
2017-02-21 22:58:30 +01:00
Vladimir Chalupecky
eeaee46e86 LAPACKE: fix wrong matrix size in ?ormbr
Changes made upstream in Reference LAPACK in
https://github.com/Reference-LAPACK/lapack/pull/128
2017-02-21 21:57:18 +01:00
Martin Kroeker
040672ecf6 Merge pull request #1098 from martin-frbg/amodra-power8
Power8 inline assembly fixes
2017-02-21 15:26:14 +01:00
Martin Kroeker
c8ce9e4377 Merge pull request #1101 from martin-frbg/martin-frbg-patch-1
LAPACKE: fix wrong number of columns in ?ormlq
2017-02-21 15:19:56 +01:00
Ashwin Sekhar T K
ab3ffab96a THUNDERX2T99: Add Optimized C/Z DOT Implementation 2017-02-21 03:40:59 -08:00
Ashwin Sekhar T K
f036be9ce2 THUNDERX2T99: Add Optimized SDOT Implementation 2017-02-21 03:24:32 -08:00
Martin Kroeker
39eecfd20c Merge pull request #1102 from brada4/develop
Correct Apollo Lake CPUID identification in dynamic_arch builds
2017-02-21 08:26:39 +01:00
Andrew
5088523786 detect apollo lake for real 2017-02-20 23:54:59 +01:00
Martin Kroeker
3f7720ec4b LAPACKE: fix wrong number of columns in ?ormlq
Copied from lapack https://github.com/Reference-LAPACK/lapack/pull/127  by vladimir-ch (with earlier changes from echeresh's  
PR 115 "lapacke_*ormlq_work: move declarations under if" there as they touched some of the same files)
2017-02-20 16:20:43 +01:00
Ashwin Sekhar T K
faba876fda THUNDERX2T99: Bug fix in C/Z IAMAX 2017-02-19 23:11:50 -08:00
Ashwin Sekhar T K
172a62d73e THUNDERX2T99: Add Optimized C/Z IAMAX Implementation 2017-02-17 03:06:32 -08:00
Martin Kroeker
e545a66a5b Merge pull request #1091 from staticfloat/sf/corei5_7600k
CPUID mappings for Core i5-7600K (Kaby Lake)
2017-02-17 10:30:09 +01:00
Ashwin Sekhar T K
228c75a69c THUNDERX2T99: Add parallel SCNRM2 Implementation 2017-02-14 04:10:06 -08:00
Martin Kroeker
9e2f316ede Power8 inline assembly fixes
Quoting patch author amodra from #1078
Lots of issues here.
- The vsx regs weren't listed as clobbered.
- Poor choice of vsx regs, which along with the lack of clobbers led to
  trashing v0..v21 and fr14..fr23.  Ideally you'd let gcc choose all
  temp vsx regs, but asms currently have a limit of 30 i/o parms.
- Other regs were clobbered unnecessarily, seemingly in an attempt to
  clobber inputs, with gcc-7 complaining about the clobber of r2.
  (Changed inputs should be also listed as outputs or as an i/o.)
- "r" constraint used instead of "b" for gprs used in insns where the
  r0 encoding means zero rather than r0.
- There were unused asm inputs too.
- All memory was clobbered rather than hooking up memory outputs with
  proper memory constraints, and that and the lack of proper memory
  input constraints meant the asms needed to be volatile and their
  containing function noinline.
- Some parameters were being passed unnecessarily via memory.
- When a copy of a
2017-02-13 23:38:50 +01:00
Martin Kroeker
e2489c9a92 Merge pull request #1096 from martin-frbg/pkg-config
Build only openblas.pc for pkg-config and install it from cmake as well
2017-02-12 17:00:17 +01:00
Martin Kroeker
c4ea9eea67 Add cmake template for openblas.pc 2017-02-12 14:38:32 +01:00
Martin Kroeker
cd8f80634f Create and install openblas.pc in cmake builds 2017-02-12 14:37:33 +01:00
Martin Kroeker
faf06f0d8b Create and install only a single openblas.pc file 2017-02-12 14:35:48 +01:00
Martin Kroeker
c6fa4aef0c Rename blas.pc.in to openblas.pc.in 2017-02-12 14:34:03 +01:00
Martin Kroeker
1029dcd60d Merge pull request #1095 from martin-frbg/lapack370-cmake
Update cmakefiles for netlib 3.7.0
2017-02-12 14:30:29 +01:00
Martin Kroeker
d12c8bbcbb Add zlasyf_aa to lapack.cmake 2017-02-12 13:49:49 +01:00
Martin Kroeker
15f0d65010 Add another bunch of lapack 3.7 functions to cmake list 2017-02-12 01:59:30 +01:00
Martin Kroeker
7d831af1ba Add LAPACK 3.7 files not mentioned in announcement 2017-02-12 01:37:35 +01:00
Martin Kroeker
ee3e87cf46 Update cmake file list for lapacke 3.7.0 2017-02-12 00:40:16 +01:00
Martin Kroeker
8772c00bb0 Update cmake file list for lapack 3.7.0 2017-02-11 23:11:26 +01:00
Martin Kroeker
0a4a7e18f6 Merge pull request #1094 from martin-frbg/cmake-1
Update cmakefiles with changes from netlib 3.6.1
2017-02-11 20:48:41 +01:00
Martin Kroeker
357ef3cd8c Reflect name change of lapacke_mangling.h template 2017-02-11 19:56:02 +01:00
Martin Kroeker
002e646476 Add new functions from LAPACK 3.6.1 2017-02-11 19:54:02 +01:00
Martin Kroeker
3dad87bbb5 Merge pull request #1093 from martin-frbg/restore-cmakeinstall
Restore cmake install target
2017-02-11 17:41:39 +01:00
Martin Kroeker
bdd51cdabc Add cmake install target
Add CMAKE install target (based on patch provided by PrimarchOfTheSpaceWolves in #957)
This was originally merged as 988 but accidentally reverted by my subsequent PR the following day
2017-02-11 16:43:46 +01:00
Elliot Saba
1d8ab99e09 Add exfamily == 9 case (Kaby Lake) to dynamic arch detection 2017-02-10 15:23:55 -08:00
Elliot Saba
04b2b06665 CPUID mappings for Core i5-7600K (Kaby Lake) 2017-02-10 14:53:15 -08:00
Martin Kroeker
8a83daf4bf Merge pull request #1084 from isuruf/develop
Install pkg-config files
2017-02-08 01:01:18 +01:00
Martin Kroeker
39abb079fb Merge pull request #1087 from grisuthedragon/enable-a12
Enable EXCAVATOR kernels for A12-9800
2017-02-08 01:00:32 +01:00
Martin Koehler
76c6e33e54 Enable EXCAVATOR kernels for A12-9800 2017-02-07 21:38:28 +01:00
Martin Kroeker
a9594e8072 Merge pull request #1085 from vladimir-ch/lapacke_laswp_work
LAPACKE: fix incorrect value of lda_t in lapacke_?laswp_work
2017-02-07 11:40:41 +01:00
Ashwin Sekhar T K
8e89668f62 THUNDERX2T99: Fix bug in SNRM2 2017-02-07 02:14:33 -08:00
Ashwin Sekhar T K
f63deae9de THUNDERX2T99: Add Optimized S/D IAMAX Implementation 2017-02-07 01:35:55 -08:00
Vladimir Chalupecky
4c2b713ce5 LAPACKE: fix incorrect value of lda_t in lapacke_?laswp_work
Fixed in Reference LAPACK in commit:

07e1fbd897
2017-02-07 09:21:46 +01:00
Isuru Fernando
cdc954675c Install pkg-config files 2017-02-06 12:15:58 +05:30
Martin Kroeker
60eea75409 Merge pull request #1076 from ashwinyes/develop_20170130_thunderx2t99
More optimized implementations for ThunderX2T99
2017-02-04 17:25:43 +01:00
Ashwin Sekhar T K
071a830e8b THUNDERX2T99: Add optimized S/D/C/Z SWAP Implementations 2017-02-03 03:55:06 -08:00
Ashwin Sekhar T K
d09f88192c THUNDERX2T99: Add optimized S/D/C/Z COPY Implementations 2017-02-02 15:26:38 +05:30
Ashwin Sekhar T K
e58233460a THUDNERX2T99: Add optimized D/C/Z ASUM Implementations 2017-02-02 15:26:22 +05:30
Ashwin Sekhar T K
3918d17025 LAPACK: Fix lapack-test errors in ARM64 threaded version 2017-01-31 23:36:23 +05:30
Ashwin Sekhar T K
99bd2892bf THUNDERX2T99: Add optimized CASUM Implementation 2017-01-30 17:44:32 +05:30
Ashwin Sekhar T K
ff6f572f2e THUNDERX2T99: Rename labels in for DDOT and SNRM2 2017-01-30 17:44:32 +05:30
Ashwin Sekhar T K
e0dc5f58c5 THUNDERX2T99: Remove Duplicate Code 2017-01-30 17:44:32 +05:30
Ashwin Sekhar T K
2757b49767 THUNDERX2T99: Add Optimized CGEMM Implementation 2017-01-30 17:44:26 +05:30
Zhang Xianyi
ff41e13385 Merge pull request #1074 from ashwinyes/develop_20170116_thunderx2t99_sgemm
Add more THUNDERX2T99 Optimized APIs
2017-01-25 22:17:05 +08:00
Ashwin Sekhar T K
1de6fa0f50 Update .gitignore 2017-01-24 23:14:09 -08:00
Ashwin Sekhar T K
efda640723 Benchmark: Add MFlops print in iamax benchmark 2017-01-24 23:13:47 -08:00
Ashwin Sekhar T K
1530e78cfe Benchmarks: Avoid building lapack benchmarks when NO_LAPACK=1 2017-01-24 20:50:23 -08:00
Ashwin Sekhar T K
907e286eb6 THUNDERX2T99: Add threaded SNRM2 Implementation 2017-01-24 21:39:29 +05:30
Ashwin Sekhar T K
cde3aee08b ARM64: Rename kernel files to have consistent naming 2017-01-24 14:53:34 +05:30
Ashwin Sekhar T K
ee6ea7e988 THUNDERX2T99: Add Optimized CNRM2 Implementation 2017-01-24 10:23:32 +05:30
Ashwin Sekhar T K
ca0b36b012 THUNDERX2T99: Add Optimized SNRM2 Implementation 2017-01-24 10:23:21 +05:30
Ashwin Sekhar T K
01e1d85339 Update .gitignore 2017-01-19 11:58:59 +05:30
Ashwin Sekhar T K
d0a79ca6e0 THUNDERX2T99: Add threaded DDOT Implementation 2017-01-19 11:11:42 +05:30
Ashwin Sekhar T K
0c07003ccf THUNDERX2T99: Add Optimized DDOT Implementation 2017-01-19 11:11:07 +05:30
Ashwin Sekhar T K
f33fcedb30 THUNDERX2T99: Improve SGEMM 2017-01-19 11:11:07 +05:30
Ashwin Sekhar T K
0f1d6e8b39 THUNDERX2T99: Improve DGEMM 2017-01-19 11:11:07 +05:30
Ashwin Sekhar T K
981064acc6 THUNDERX2T99: Add Optimized DAXPY Implementation 2017-01-19 11:10:57 +05:30
Zhang Xianyi
ab2033f2db Merge pull request #1068 from sva-img/develop
Added MSA optimised rot functions.
2017-01-17 22:02:21 +08:00
Shivraj Patil
a4d97d980f Added rot functions.
Signed-off-by: Shivraj Patil <shivraj.patil@imgtec.com>
2017-01-17 12:15:07 +05:30
Ashwin Sekhar T K
f279ff4789 THUNDERX2T99: Add Optimized SGEMM Implementation 2017-01-16 21:44:33 +05:30
Ashwin Sekhar T K
759f37feba ARM64: Let target VULCAN inherit THUNDERX2T99 properties 2017-01-16 21:44:19 +05:30
Martin Kroeker
e8d0e66982 Merge pull request #1067 from martin-frbg/msysinst
Fix DESTDIR support for cygwin/msys2 install
2017-01-16 16:03:53 +01:00
Martin Kroeker
331fd51260 Fix DESTDIR support for cygwin/msys2 install
fixes #1066
2017-01-16 15:15:46 +01:00
Zhang Xianyi
0863a0d4b4 Merge pull request #1061 from ashwinyes/develop_aarch64_vulcan_thunderx_patch
Add new targets for ARM64
2017-01-16 13:20:10 +08:00
Martin Kroeker
2e5f906f41 Update Makefile.install (#1064)
* Update Makefile.install to reflect name change of lapacke_mangling.h source
2017-01-11 17:40:06 +01:00
Werner Saar
d1a97bad39 Merge pull request #1063 from wernsaar/develop
prepared kernel/setparam-ref.c for UNROLL values, that are not a power of two
2017-01-11 12:37:45 +01:00
Werner Saar
28e2fab33e prepared kernel/setparam-ref.c for UNROLL values, that are not a power of two 2017-01-11 11:56:50 +01:00
Werner Saar
752fdc6f82 Merge pull request #1062 from wernsaar/develop
prepared parameter.c for UNROLL values, that are not a power of two
2017-01-11 10:30:46 +01:00
Werner Saar
c1c5a63d3c prepared parameter.c for UNROLL values, that are not a power of two 2017-01-11 09:50:28 +01:00
Werner Saar
209b63197e prepared lapack/lauum for UNROLL values, that are not a power of two 2017-01-11 07:29:17 +01:00
Ashwin Sekhar T K
4b55fae337 ARM64: Add Cavium THUNDERX2T99 Target 2017-01-11 11:18:40 +05:30
Ashwin Sekhar T K
738d622feb ARM64: Fix auto detect of ARM64 cpus 2017-01-11 11:18:40 +05:30
Andrew Pinski
95649dee28 THUNDERX: Add optimized version of daxpy
This is better for single core but does not change anything for multiple cores
2017-01-11 11:18:36 +05:30
Martin Kroeker
3a8c5180b9 Merge pull request #1060 from martin-frbg/lapacke-mingw
Split LAPACKE 3.7.0 obj list (take 2, missed splitting the actual ar command invocation)
2017-01-10 19:09:49 +01:00
Martin Kroeker
7611a41f40 Split LAPACKE 3.7.0 obj list (take 2)
Missed the splitting of the actual ar call
2017-01-10 17:11:35 +01:00
Werner Saar
1a39b92b1d Merge pull request #1059 from wernsaar/develop
updated some level1 funcions, that are not thread save
2017-01-10 16:00:28 +01:00
Werner Saar
dd6212e684 updated some level1 funcions, that are not thread save 2017-01-10 14:05:07 +01:00
Werner Saar
9bcf50872b Merge pull request #1058 from wernsaar/develop
prepared lapack/potrf functions for UNROLL values, that are not a pow…
2017-01-10 11:30:08 +01:00
Werner Saar
c81dc6322f prepared lapack/potrf functions for UNROLL values, that are not a power of two 2017-01-10 10:50:28 +01:00
Andrew Pinski
8fdb0655e9 THUNDERX: Add an optimized version of ddot 2017-01-10 15:01:37 +05:30
Andrew Pinski
fb200c7245 ARM64: Add Cavium THUNDERX Target 2017-01-10 15:01:37 +05:30
Ashwin Sekhar T K
0b8e876d89 VULCAN: Add optimized DGEMM implementation 2017-01-10 15:01:37 +05:30
Ashwin Sekhar T K
4713e7c47f ARM64: Add the VULCAN Target 2017-01-10 15:01:17 +05:30
Ashwin Sekhar T K
6085386b10 CORTEXA57: Add assembly kernels for copy routines 2017-01-10 15:01:05 +05:30
Zhang Xianyi
002b41f024 Merge pull request #1055 from ksraste/develop
Add msa optimization for AXPY, COPY, SCALE, SWAP
2017-01-10 13:58:26 +08:00
jiahaipeng
84b8170bfb Adding multi-threading for copy, dot, rot, and asum funcitons 2017-01-10 11:48:58 +08:00
jiahaipeng
1aa1e6cb54 modify the blas_l1_thread.c for support multi-threded for L1 fuction with return value 2017-01-10 11:47:06 +08:00
Martin Kroeker
cbd2bf1f6e Merge pull request #1057 from martin-frbg/lapacke-mingw
Split the obj list of LAPACKE 3.7.0
2017-01-09 20:45:26 +01:00
Martin Kroeker
9f5cfd43dc Split the obj list of LAPACKE 3.7.0
Split obj list to allow building with mingw (argument list too long for the msys ar)
2017-01-09 18:29:53 +01:00
kaustubh
1480f3df71 Add msa optimization for AXPY, COPY, SCALE, SWAP
Signed-off-by: kaustubh <kaustubh.raste@imgtec.com>
2017-01-09 18:27:23 +05:30
kaustubh
88afb3bc94 Add msa optimization for AXPY, COPY, SCALE, SWAP
Signed-off-by: kaustubh <kaustubh.raste@imgtec.com>
2017-01-09 18:22:09 +05:30
Werner Saar
2ffbbb54f6 Merge pull request #1054 from wernsaar/develop
prepared lapack/getrf functions for UNROLL values, that are not a pow…
2017-01-09 13:38:56 +01:00
Werner Saar
3e1bbd6b5f prepared lapack/getrf functions for UNROLL values, that are not a power of two 2017-01-09 12:57:26 +01:00
Zhang Xianyi
b678471d65 Merge branch 'z13' into develop
Conflicts:
	CONTRIBUTORS.md
2017-01-09 05:52:42 -05:00
Zhang Xianyi
864e202afd Add USE_TRMM=1 for IBM z13 in kernel/Makefile.L3 2017-01-09 05:48:09 -05:00
Werner Saar
b9bb009236 Merge pull request #1053 from wernsaar/develop
prepared driver/level3 functions for UNROLL values, that are not a po…
2017-01-09 11:17:38 +01:00
Werner Saar
a2672d5589 prepared driver/level3 functions for UNROLL values, that are not a power of two 2017-01-09 10:38:15 +01:00
Zhang Xianyi
c2496d8f48 Merge pull request #1050 from martin-frbg/fflags
Apply COMMON_OPT to default FFLAGS
2017-01-09 16:23:22 +08:00
Zhang Xianyi
fb0afdaf99 Merge pull request #1052 from martin-frbg/locking
Fix thread data races detected by helgrind 3.12
2017-01-09 16:22:58 +08:00
Martin Kroeker
51aa157e64 Relocate declaration of alloc_lock outside ifdef block 2017-01-09 01:10:43 +01:00
Martin Kroeker
87c7d10b34 Fix thread data races detected by helgrind 3.12
Ref. #995, may possibly help solve issues seen in 660,883
2017-01-08 23:33:51 +01:00
Martin Kroeker
d0035b857d Apply COMMON_OPT to default FFLAGS to avoid building non-optimized LAPACK by mistake 2017-01-08 21:17:22 +01:00
Werner Saar
c61a7cd293 Merge pull request #1049 from wernsaar/develop
removed blas_thread_shutdown from gensymbol
2017-01-08 09:30:19 +01:00
Werner Saar
a8bb5003de removed blas_thread_shutdown from gensymbol 2017-01-08 08:51:30 +01:00
Zhang Xianyi
9a48adff3f Merge pull request #1047 from brada4/erre
Improve R benchmark timing
2017-01-08 11:19:06 +08:00
Zhang Xianyi
823a40a110 Merge pull request #1040 from martin-frbg/develop
Use appropriate int32/int64 format for error number in message string
2017-01-08 11:18:38 +08:00
Zhang Xianyi
0bd706ac8d Merge pull request #1036 from sva-img/develop
Added prefetch to CGEMV and ZGEMV.
2017-01-08 11:18:05 +08:00
Andrew
8379550076 anti GC and reflow 2017-01-07 19:01:42 +01:00
Andrew
fc148b7e4d init 2017-01-07 19:01:21 +01:00
Werner Saar
5bb2b91a03 Merge pull request #1046 from wernsaar/develop
updated lapack to version 3.7.0 with latest patches from git
2017-01-07 15:09:56 +01:00
Werner Saar
abc3304587 fix for appveyor test 2017-01-07 14:27:08 +01:00
Werner Saar
a836fe8ec1 updated exports/gensymbol for lapack-3.7.0 2017-01-07 13:20:28 +01:00
Werner Saar
1153e3ac39 filtered out -fopenmp and fix for mingw 2017-01-07 08:41:42 +01:00
Werner Saar
7c2c488c23 removed xerbla and lsame for Makefile 2017-01-06 16:35:20 +01:00
Werner Saar
ae4ac6f984 removed obj-files, that are moved to lapack 3.7.0 2017-01-06 16:14:53 +01:00
Werner Saar
4494d03a21 filtered out optimized functions 2017-01-06 13:42:31 +01:00
Werner Saar
d35baf30cf added lapack 3.7.0 with latest patches from git 2017-01-06 11:48:40 +01:00
Werner Saar
24efbbd339 removed lapack-devel.log 2017-01-06 11:46:58 +01:00
Werner Saar
8cd46acebb removed lapack 3.6.0 2017-01-06 11:44:57 +01:00
Martin Kroeker
9e4b6971e2 Merge pull request #1043 from quickwritereader/z13
Z13
2017-01-05 19:15:36 +01:00
Martin Kroeker
0ef7841473 Update xerbla.c 2017-01-04 23:16:48 +01:00
Abdurrauf
7f2a959e3e Update README.md 2017-01-04 19:41:24 +04:00
Abdurrauf
6418667818 dtrmm and dgemm for z13 2017-01-04 19:32:33 +04:00
Martin Kroeker
104ad066af Use appropriate int32/int64 format for error number in message string 2016-12-30 00:45:59 +01:00
Shivraj Patil
a9bf8a781a Added prefetch to CGEMV and ZGEMV.
Signed-off-by: Shivraj Patil <shivraj.patil@imgtec.com>
2016-12-27 11:33:51 +05:30
Zhang Xianyi
8f9975e013 Merge pull request #1032 from kiwifb/OSX_target
Do not override MACOSX_DEPLOYMENT_TARGET if it is already defined.
2016-12-18 14:48:22 +08:00
Zhang Xianyi
e7bd736802 Merge pull request #1025 from mfoster96/develop
Fix for issue #1024: arm-linux-androideabi-g++ Compiler Error in /cpu…
2016-12-18 14:47:59 +08:00
Zhang Xianyi
14571d8b08 Merge pull request #1031 from kiwifb/make
Never use "make" in makefiles. Only $(MAKE).
2016-12-18 14:46:52 +08:00
Zhang Xianyi
82d829c670 Merge pull request #1030 from ksraste/develop
Updated data prefetch in TRSM, ASUM, DOT functions
2016-12-18 14:46:16 +08:00
François Bissey
32ca9a9f68 Do not override MACOSX_DEPLOYMENT_TARGET if it is already defined. 2016-12-15 11:42:17 +13:00
François Bissey
c732f1a066 Never use "make" in makefiles. Only $(MAKE). 2016-12-15 11:38:23 +13:00
kaustubh
5f93aa5f87 Updated data prefetch in TRSM, ASUM, DOT functions
Signed-off-by: kaustubh <kaustubh.raste@imgtec.com>
2016-12-14 14:05:11 +05:30
kaustubh
9db451acd0 Updated data prefetch in TRSM, ASUM, DOT functions
Signed-off-by: kaustubh <kaustubh.raste@imgtec.com>
2016-12-13 14:02:14 +05:30
kaustubh
3eaff85191 Updated data prefetch in TRSM, ASUM, DOT functions
Signed-off-by: kaustubh <kaustubh.raste@imgtec.com>
2016-12-13 11:41:17 +05:30
Zhang Xianyi
db3efb2e14 Merge pull request #1017 from martin-frbg/develop
Make c_check, f_check convert any --exclude-libs arguments to linker flags
2016-12-09 16:55:13 +08:00
Michael Foster
19934bea8c Fix for issue #1024: arm-linux-androideabi-g++ Compiler Error in /cpuid_arm.c
Line 77: Compiler requires non-void function to return a value
2016-12-02 09:28:31 -08:00
Zhang Xianyi
433edc74ab Merge pull request #1015 from ararslan/aa/freebsd
Include system headers for blas_server on FreeBSD
2016-12-02 10:28:57 +08:00
Zhang Xianyi
99a6289d3c Merge pull request #996 from grisuthedragon/lapack-3.6.1
Lapack 3.6.1
2016-12-02 10:27:51 +08:00
Martin Kroeker
e7732d9941 Convert --exclude-libs argument to linker flag
Fixes build with TDM-GCC
2016-11-22 09:17:03 +01:00
Martin Kroeker
fa216717bf Convert --exclude-libs argument to linker flag
Fixes build with TDM-GCC
2016-11-22 09:14:55 +01:00
Zhang Xianyi
f8a6e6cce4 Merge pull request #1016 from ksraste/develop
Add data prefetch in DOT and ASUM functions
2016-11-22 15:54:56 +08:00
kaustubh
00abce3b93 Add data prefetch in DOT and ASUM functions
Signed-off-by: kaustubh <kaustubh.raste@imgtec.com>
2016-11-22 11:21:03 +05:30
Alex Arslan
a16ace68f5 Include system headers on FreeBSD 2016-11-16 21:58:20 -08:00
Zhang Xianyi
d4da3fbe9f Merge pull request #1002 from brada4/limpio
Remove few lines of dead code.
2016-11-07 10:41:20 +08:00
Zhang Xianyi
b590cd45fc Merge pull request #1010 from martin-frbg/cpuid
Add TARGETs for newer Intel CPUs - Kaby Lake, Knights Landing, Apollo Lake
2016-11-07 10:26:13 +08:00
Zhang Xianyi
10a62cb595 Merge pull request #1009 from martin-frbg/getarch-newline-fix
Getarch newline fix
2016-11-07 10:25:51 +08:00
Martin Kroeker
9dbdc7b2cf Add files via upload 2016-11-06 23:27:30 +01:00
Martin Kroeker
596ead0f8d Add files via upload 2016-11-06 23:26:39 +01:00
Martin Kroeker
60816c9259 Add files via upload 2016-11-06 23:26:04 +01:00
Martin Kroeker
aaba3e483f Add files via upload 2016-11-06 17:38:20 +01:00
Martin Kroeker
c295d61e82 Delete CMakeLists.txt 2016-11-06 17:37:37 +01:00
Martin Kroeker
570bc9afbd Fix spurious define in openblas_config.h
TARGET as specified with make is already return-terminated when getarch reads it. This led to an empty line written to config_last.h that awk in Makefile.install then expanded to a spurious "#define OPENBLAS_" in openblas_config.h (as noted by "kmb" on the mailing list)
2016-11-06 17:29:33 +01:00
Martin Kroeker
f00baf32a6 Update CMakeLists.txt 2016-11-05 13:38:57 +01:00
Martin Kroeker
8e7f2809af Update CMakeLists.txt 2016-11-05 13:30:40 +01:00
Martin Kroeker
81ac8aab81 Update CMakeLists.txt 2016-11-05 13:26:01 +01:00
Martin Kroeker
ce372062d6 Update CMakeLists.txt 2016-11-05 13:11:32 +01:00
Martin Kroeker
6c7b9b74f6 Update CMakeLists.txt 2016-11-05 13:05:05 +01:00
Martin Kroeker
f5b028eb37 Update CMakeLists.txt 2016-11-05 12:59:05 +01:00
Martin Kroeker
cebcca9987 Update CMakeLists.txt 2016-11-05 12:47:15 +01:00
Martin Kroeker
084e4573c1 Consolidate debug options
Use BUILD_DEBUG option only if CMAKE_BUILD_TYPE is not set
Consolidate debug postfixes in install target
2016-11-05 11:55:45 +01:00
Andrew
becf8bc7a0 remove dead code 2016-10-31 12:46:56 +01:00
Andrew
4e54d8ab7f new branch 2016-10-29 23:44:02 +02:00
Martin Koehler
6f58271190 Move remaining OpenBLAS related changes from 3.6.0 to 3.6.1 2016-10-26 21:43:41 +02:00
Martin Koehler
311e0a912c Fix #971 2016-10-26 21:34:56 +02:00
Martin Koehler
429dfd83ee Fix threshold in nep.in 2016-10-26 21:17:12 +02:00
Martin Köhler
5af06c764a Fix MingW build 2016-10-26 16:03:00 +02:00
Martin Köhler
7cd26f7e38 Update gitignore 2016-10-26 15:19:40 +02:00
Martin Köhler
77006cc2a3 Import LAPACK: top directory 2016-10-26 15:14:13 +02:00
Martin Köhler
57eee3fa43 Import LAPACK: TESTING directory 2016-10-26 15:13:03 +02:00
Martin Köhler
92a858e69e Import LAPACK: SRC directory 2016-10-26 15:12:09 +02:00
Martin Köhler
13d40e7591 Import LAPACK: LAPACKE directory 2016-10-26 15:06:08 +02:00
Martin Köhler
4c29d20108 Import LAPACK: INSTALL directory 2016-10-26 15:04:39 +02:00
Martin Köhler
1357b8d93b Import LAPACK: DOCS directory 2016-10-26 15:03:51 +02:00
Martin Köhler
fb7057babe Import LAPACK: CMAKE directory 2016-10-26 15:03:16 +02:00
Martin Köhler
358ee318ed Import LAPACK: CBLAS directory 2016-10-26 15:02:41 +02:00
Martin Köhler
4c024b85e4 Import LAPACK: BLAS directory 2016-10-26 15:02:09 +02:00
Martin Kroeker
e9ccbe738c Add CMAKE install target
Add CMAKE install target (copied from a patch provided by PrimarchOfTheSpaceWolves in #957)
2016-10-19 15:27:22 +02:00
Zhang Xianyi
e54b6ddaa0 Merge pull request #986 from ksraste/develop
SGEMM, DGEMM, CGEMM, ZGEMM functions data prefetch
2016-10-18 12:38:52 +08:00
Zhang Xianyi
bcfc298c38 Merge pull request #987 from Sbte/master
Fix HASWELL capitalization in kernel cmake file
2016-10-18 12:38:33 +08:00
Sven Baars
ce7c6c6b2d Fix HASWELL capitalization in kernel cmake file
Refs #951
2016-10-17 16:34:13 +02:00
kaustubh
f3419e634c SGEMM, DGEMM, CGEMM, ZGEMM functions data prefetch
Signed-off-by: kaustubh <kaustubh.raste@imgtec.com>
2016-10-17 18:29:38 +05:30
Zhang Xianyi
7472c79ea6 Merge pull request #984 from ksraste/develop
STRSM, DTRSM functions data prefetch
2016-10-17 11:33:16 +08:00
Zhang Xianyi
66c9a9b33d Merge pull request #981 from howard0su/develop
USE NPROCESSOR_CONF instaed of NPORCESSOR_ONLN
2016-10-17 11:32:57 +08:00
Zhang Xianyi
3705f5675a Merge pull request #982 from martin-frbg/develop
Change file comments to work around clang 3.9 assembler bug; add support for Bay Trail atom
2016-10-17 11:32:20 +08:00
Martin Kroeker
bce2b34f7a Merge pull request #1 from martin-frbg/martin-frbg-patch-1
Add Intel "Bay Trail" atom cpu
2016-10-16 22:51:42 +02:00
Martin Kroeker
da83ec94d1 Merge pull request #2 from martin-frbg/martin-frbg-patch-1-1
Update cpuid_x86.c
2016-10-16 22:48:58 +02:00
Martin Kroeker
3409bccb21 Update cpuid_x86.c
Add Bay Trail "Pentium N3520" atom cpu
2016-10-16 22:45:44 +02:00
Martin Kroeker
8a8f3932eb Update dynamic.c
Add Bay Trail "Pentium N3520" atom
2016-10-16 22:40:00 +02:00
kaustubh
90e2321ac3 STRSM, DTRSM functions data prefetch
Signed-off-by: kaustubh <kaustubh.raste@imgtec.com>
2016-10-14 16:41:28 +05:30
Martin Kroeker
4998e19869 Change file comments to work around clang 3.9 assembler bug 2016-10-13 16:51:08 +02:00
Howard Su
ff1da01476 USE NPROCESSOR_CONF instaed of NPORCESSOR_ONLN
to determine the number of CPU. In ARM platform,
online CPU will increasing when there is more workload.
while configure cpu is the max number of CPU.
2016-10-13 12:37:50 +00:00
Zhang Xianyi
ef52a9266b Fixed #979. Patch for NetBSD. 2016-10-13 10:17:07 +08:00
Zhang Xianyi
4f38ae3199 Merge pull request #970 from martin-frbg/develop
Remove implicit inclusions of complex.h in various zdot implementations
2016-10-13 10:13:56 +08:00
Zhang Xianyi
4baf0c7cfc Merge pull request #980 from kiwifb/utest_ldflags
make utest/Makefile respect LDFLAGS
2016-10-13 10:13:12 +08:00
Zhang Xianyi
595a0224e4 Merge pull request #973 from vladimir-ch/fix-lapacke-xlarfb
LAPACKE: fix wrong direction check in LAPACKE_?larfb_work
2016-10-13 10:12:35 +08:00
François Bissey
f124ffab47 make utest/Makefile respect LDFLAGS 2016-10-13 09:32:25 +13:00
Martin Kroeker
91610f3835 Update zdot_msa.c 2016-10-05 18:59:09 +02:00
Martin Kroeker
6e22ecf102 Update zdot.c 2016-10-05 18:58:03 +02:00
Martin Kroeker
6221d6df5f Update zdot.c 2016-10-05 18:57:14 +02:00
Vladimir Chalupecky
117d3371d4 LAPACKE: fix wrong direction check in LAPACKE_?larfb_work
Closes #971
2016-10-01 05:31:30 +09:00
Martin Kroeker
16446d1d23 Remove explicit include of complex.h 2016-09-29 23:45:56 +02:00
Martin Kroeker
a6e9e0b94b Remove explicit include of complex.h 2016-09-29 23:43:28 +02:00
Martin Kroeker
3178e4fea0 Remove explicit include of complex.h 2016-09-29 23:41:43 +02:00
Martin Kroeker
95c245ddb0 Remove explicit include of complex.h 2016-09-29 23:40:36 +02:00
Martin Kroeker
4b1b27347f Remove explicit include of complex.h 2016-09-29 23:39:35 +02:00
Zhang Xianyi
161c927071 Merge pull request #968 from buffer51/develop
Updated CROSS_SUFFIX regex to work with CC containing arguments
2016-09-22 11:34:57 -04:00
Zhang Xianyi
662f89f059 Merge pull request #969 from sva-img/develop
DGEMM function split and data prefech
2016-09-22 11:33:51 -04:00
Shivraj Patil
54747fe24a DGEMM function split and data prefech
Signed-off-by: Shivraj Patil <shivraj.patil@imgtec.com>
2016-09-22 17:25:46 +05:30
Paul MUSTIÈRE
157ee498ac Updated CROSS_SUFFIX regex to work with CC containing arguments 2016-09-14 11:42:22 -07:00
Zhang Xianyi
b09cc3b9bb Merge pull request #958 from intelfx/remove-stabs
common_arm.h, common_mips.h: get rid of .func directives
2016-09-13 16:15:37 -04:00
Ivan Shapovalov
6c0862a94f common_arm.h, common_mips.h: get rid of .func directives
.func/.endfunc are gcc/gas-specific directives for generating stabs
debug information (and nothing more). This is near-useless now because
DWARF is commonly used, and not implemented in Clang. Hence building
OpenBLAS with Clang fails, and there is no sane way to detect GCC vs.
anything else with preprocessor definitions.

Hence, just remove these directives.
2016-09-09 03:37:11 +03:00
Zhang Xianyi
842d842751 Update develop for 0.2.20.dev. 2016-09-01 00:01:23 -04:00
Zhang Xianyi
85636ff1a0 Merge branch 'develop' 2016-08-31 23:58:42 -04:00
Zhang Xianyi
821affb9a0 Update doc for 0.2.19. 2016-08-31 23:58:29 -04:00
Zhang Xianyi
515bc56ea9 Refs #946. Use nrm2 reference implementation for Power8. 2016-08-18 18:59:43 -07:00
Zhang Xianyi
ae70b916f4 Refs #929. Deal with zero and NaNs for scale. 2016-08-18 10:24:42 -07:00
Zhang Xianyi
9ea0144482 Merge pull request #941 from sva-img/develop
MIPS n32 ABI support, MSA support detection and rename ARCH, ARCHFLAGS
2016-08-18 09:31:31 -04:00
Zhang Xianyi
1f217a6175 Merge pull request #943 from ibmsoe/IBMMASS_Support
Added support of IBM's MASS library that optimizes performance on Pow…
2016-08-12 17:20:59 -04:00
nishidha@us.ibm.com
78348a2853 Added support of IBM's MASS library that optimizes performance on Power architectures 2016-08-11 14:43:26 +05:30
Shivraj Patil
9687437928 MIPS n32 ABI and build time mips simd support check
Signed-off-by: Shivraj Patil <shivraj.patil@imgtec.com>
2016-08-10 17:44:22 +05:30
Shivraj Patil
d1c6469283 MIPS n32 ABI support, MSA support detection and rename ARCH, ARCHFLAGS
Signed-off-by: Shivraj Patil <shivraj.patil@imgtec.com>
2016-08-08 11:58:01 +05:30
Zhang Xianyi
b544be914d Merge pull request #933 from ashwinyes/develop_aarch64_20160726_Dgemm_8x4_Opts
Cortex A57: Improvements to DGEMM 8x4 kernel
2016-07-26 09:54:31 -04:00
Ashwin Sekhar T K
c54a29bb48 Cortex A57: Improvements to DGEMM 8x4 kernel 2016-07-26 10:58:21 +05:30
Zhang Xianyi
ff4c5deafa Merge pull request #930 from sva-img/develop
P6600/I6400 Build fix.
2016-07-22 11:42:30 -04:00
Shivraj Patil
22b9c2747d P6600/I6400 Build fix. Reverted the changes which was done to support for MIPS n32 ABI
Signed-off-by: Shivraj Patil <shivraj.patil@imgtec.com>
2016-07-22 18:45:06 +05:30
Zhang Xianyi
27b5211ccd Merge pull request #927 from sva-img/develop
Added MSA optimization for GEMV_N, GEMV_T, ASUM, DOT functions
2016-07-15 11:17:30 -04:00
Shivraj Patil
beb1d076a4 Added MSA optimization for GEMV_N, GEMV_T, ASUM, DOT functions
Signed-off-by: Shivraj Patil <shivraj.patil@imgtec.com>
2016-07-15 18:38:25 +05:30
Zhang Xianyi
9e44f3ddd0 Refs #917 Avoid detecting gfortran bug on IBM POWER + Ubuntu 2016-07-14 13:09:36 -07:00
Zhang Xianyi
eece9fd889 Merge pull request #926 from vriera/develop
Complete support for MIPS n32 ABI
2016-07-14 15:49:33 -04:00
Zhang Xianyi
5dfa0712c3 Merge pull request #925 from martin-frbg/develop
Update zgetrf2.f, cpuid_x86.c, dynamic.c
2016-07-14 15:48:58 -04:00
Zhang Xianyi
8a592ee386 Merge pull request #924 from ashwinyes/develop_aarch64_improvements_20160714
Improvements to Aarch64 kernels
2016-07-14 15:47:55 -04:00
Zhang Xianyi
7f2409a8e1 Merge pull request #918 from sva-img/develop
Added CGEMM, ZGEMM, STRMM, DTRMM, CTRMM, ZTRMM.
2016-07-14 15:45:39 -04:00
Vicente Olivert Riera
7f28cd1f88 Complete support for MIPS n32 ABI
Signed-off-by: Vicente Olivert Riera <Vincent.Riera@imgtec.com>
2016-07-14 17:51:04 +01:00
Martin Kroeker
154729908e Update cpuid_x86.c 2016-07-14 17:29:34 +02:00
Martin Kroeker
97bd1e42c8 Update cpuid_x86.c 2016-07-14 12:25:17 +02:00
Martin Kroeker
7de829f713 Update dynamic.c
Add Braswell (extended model 4, model 12) N3150 as Nehalem
2016-07-14 12:22:55 +02:00
Martin Kroeker
9b69d8a8e5 Update zgetrf2.f
Trivial typo correction (ZERBLA => XERBLA) to fix #910
2016-07-14 11:41:57 +02:00
Ashwin Sekhar T K
0a5ff9f9f9 Improvements to TRMM and GEMM kernels 2016-07-14 13:56:04 +05:30
Ashwin Sekhar T K
8a40f1355e Improvements to GEMV kernels 2016-07-14 13:50:38 +05:30
Ashwin Sekhar T K
78782485b6 Improvements to COPY and IAMAX kernels 2016-07-14 13:49:34 +05:30
Ashwin Sekhar T K
8d86d14d3f Add time prints in benchmark output 2016-07-14 13:48:13 +05:30
Ashwin Sekhar T K
925d4e1dc6 Add IAMAX and NRM2 benchmarks 2016-07-14 13:46:01 +05:30
Shivraj Patil
57df7956ee Added CGEMM, ZGEMM, STRMM, DTRMM, CTRMM, ZTRMM. Updated macros in SGEMM, DGEMM, STRMM.
Signed-off-by: Shivraj Patil <shivraj.patil@imgtec.com>
2016-06-28 17:51:10 +05:30
Zhang Xianyi
437c7d64f2 Merge pull request #913 from dpfoose/develop
Small change to allow compiling with USE_OPENMP on MSVC
2016-06-27 10:05:30 -04:00
Zhang Xianyi
ca5c25c870 Merge pull request #907 from jeromerobert/bug786
Fix z/ctrmv stack allocation on AMD bulldozer and barcelona target
2016-06-27 10:04:54 -04:00
Zhang Xianyi
4a30a2584a Merge pull request #897 from ksraste/develop
STRSM optimized for MSA
2016-06-27 10:04:18 -04:00
mdong
098d8ec5d6 remove input from clobbered list 2016-06-24 16:37:58 -04:00
Daniel Patrick Foose
a94f2b7848 Change to allow compiling with USE_OPENMP on MSVC
MSVC treats the declaration of omp_in_parallel and omp_get_num_procs without the modifiers __declspec(dllimport) and __cdecl as a redefinition.
2016-06-14 14:37:28 -04:00
Jerome Robert
d346c533b1 Fix z/ctrmv stack allocation on AMD bulldozer and barcelona target
* Hopefully, because this was found by error and trial (dark magic)
* Ref #786
2016-06-07 16:11:09 +02:00
Werner Saar
f04af36ad0 Merge pull request #898 from wernsaar/develop
added experimental support for optimized lapack fortran functions
2016-05-31 14:13:52 +02:00
Werner Saar
41000c8443 added directory for optimized lapack fortan codes and added dlaqr5.f 2016-05-31 12:53:07 +02:00
Kaustubh Raste
011431b9d7 STRSM optimized for MSA
Signed-off-by: Kaustubh Raste <kaustubh.raste@imgtec.com>
2016-05-31 10:17:23 +05:30
Kaustubh Raste
c8a7860eb3 STRSM optimized
Signed-off-by: Kaustubh Raste <kaustubh.raste@imgtec.com>
2016-05-30 21:17:00 +05:30
Zhang Xianyi
2daad2bcb5 Merge pull request #893 from biddisco/develop
Replace CMAKE_SOURCE_DIR/CMAKE_BINARY_DIR with PROJECT_SOURCE_DIR/PRO…
2016-05-30 14:52:58 +08:00
Zhang Xianyi
bac478d17e Merge pull request #891 from rndfax/develop
mips64/axpy: fix error when INCY == 0
2016-05-30 14:52:40 +08:00
John Biddiscombe
053044ae4d Replace CMAKE_SOURCE_DIR/CMAKE_BINARY_DIR with PROJECT_SOURCE_DIR/PROJECT_BINARY_DIR
If OpenBLAS is built using add_subdirectory(OpenBlas) as part of another project
then the paths set by CMAKE_XXX_DIR are relative to the parent project
and not the OpenBLAS project.
2016-05-25 09:13:28 +02:00
Aleksey Kuleshov
fca66262c4 mips64/axpy: fix error when INCY == 0 2016-05-23 13:30:27 +03:00
Werner Saar
412bcd187a optimized dtrsm_logic_LT_16x4_power8.S and dtrsm_macros_LT_16x4_power8.S 2016-05-23 11:20:41 +02:00
Werner Saar
bd06b246cc Merge pull request #890 from wernsaar/develop
optimized dtrsm_kernel_LT for POWER8
2016-05-22 16:01:35 +02:00
Werner Saar
8b140220c8 optimized dtrsm_kernel_LT for POWER8 2016-05-22 15:20:04 +02:00
Werner Saar
318cad9c37 added trsm bencharks for POWER8 to benchmark/Makefile 2016-05-22 13:51:47 +02:00
Werner Saar
8fb5a1aaff added optimized dtrsm_LT kernel for POWER8 2016-05-22 13:09:05 +02:00
Zhang Xianyi
7d0358475d Merge the patch for musl libc. 2016-05-22 01:08:44 +08:00
Zhang Xianyi
b46f680f01 Merge pull request #887 from ksraste/develop
STRSM optimization for MIPS P5600 and I6400 using MSA
2016-05-21 07:17:21 +08:00
Kaustubh Raste
ad9f317870 STRSM optimization for MIPS P5600 and I6400 using MSA
Signed-off-by: Kaustubh Raste <kaustubh.raste@imgtec.com>
2016-05-20 10:59:03 +05:30
Zhang Xianyi
a8fcd89d6d Merge pull request #886 from vriera/develop
Makefile.system: P5600 and I6400 cores need -mmsa
2016-05-19 19:59:09 +08:00
Zhang Xianyi
232335fd49 Merge pull request #885 from sva-img/develop
SGEMM optimization for MIPS P5600 and I6400 using MSA.
2016-05-19 19:58:32 +08:00
Vicente Olivert Riera
e12cff87b8 Makefile.system: P5600 and I6400 cores need -mmsa
Signed-off-by: Vicente Olivert Riera <Vincent.Riera@imgtec.com>
2016-05-19 10:56:53 +01:00
Shivraj Patil
c4ba40e308 SGEMM optimization for MIPS P5600 and I6400 using MSA. Unrolled k loop in DGEMM kernel function
Signed-off-by: Shivraj Patil <shivraj.patil@imgtec.com>
2016-05-19 11:04:42 +05:30
Zhang Xianyi
7a19065369 Merge pull request #878 from ksraste/develop
DTRSM bug fix for MIPS P5600 and I6400
2016-05-19 11:16:43 +08:00
Werner Saar
8a149e6294 Merge pull request #879 from wernsaar/develop
optimized dgemm and dgetrf for POWER8
2016-05-17 17:10:36 +02:00
Werner Saar
956be69e1d optimized getrf_single.c for POWER8 2016-05-17 16:19:53 +02:00
Werner Saar
6a2bde7a2d optimized dgemm and dgetrf for POWER8 2016-05-17 14:45:27 +02:00
Kaustubh Raste
d7cbc7ac13 DTRSM bug fix for MIPS P5600 and I6400
Signed-off-by: Kaustubh Raste <kaustubh.raste@imgtec.com>
2016-05-17 15:48:02 +05:30
Zhang Xianyi
8bf71e9e06 Merge pull request #877 from jeromerobert/bug873
Disable multi-threading in swap
2016-05-16 23:21:56 +08:00
Jerome Robert
40af513669 Disable multi-threading in swap
* Close #873
2016-05-16 13:07:55 +00:00
Werner Saar
88011f625d Merge pull request #876 from wernsaar/develop
optimized dgemm on power8 for 20 threads
2016-05-16 14:52:40 +02:00
Werner Saar
8310d4d3f7 optimized dgemm for 20 threads 2016-05-16 14:14:25 +02:00
Zhang Xianyi
5faffc123f Merge pull request #869 from ksraste/develop
DTRSM optimization for MIPS P5600 and I6400 using MSA
2016-05-09 10:54:55 -04:00
Zhang Xianyi
81794ccb9a Merge pull request #868 from sva-img/develop
build fix for MIPS 32 bit
2016-05-09 10:54:30 -04:00
Kaustubh Raste
edb5980c13 DTRSM optimization for MIPS P5600 and I6400 using MSA
Signed-off-by: Kaustubh Raste <kaustubh.raste@imgtec.com>
2016-05-09 15:15:26 +05:30
Shivraj Patil
573d9218f2 build fix for MIPS 32 bit
Signed-off-by: Shivraj Patil <shivraj.patil@imgtec.com>
2016-05-09 14:45:12 +05:30
Zhang Xianyi
7e549d5f37 Merge pull request #866 from sva-img/develop
DGEMM optimization for MIPS P5600 and I6400 using MSA
2016-05-06 10:53:22 -04:00
Shivraj Patil
085cf236c2 conflict resolved by syncing with 'xianyi:develop'
Signed-off-by: Shivraj Patil <shivraj.patil@imgtec.com>
2016-05-04 11:07:14 +05:30
Zhang Xianyi
0d1c695508 Merge pull request #867 from IvanUkhov/space
Wrap CURDIR and DESTDIR in quotes
2016-05-03 17:06:31 -04:00
Ivan Ukhov
efaf30d536 Wrap CURDIR and DESTDIR in quotes 2016-05-03 21:31:32 +02:00
Shivraj Patil
b7b3d8ec8e DGEMM optimization for MIPS P5600 and I6400 using MSA
Signed-off-by: Shivraj Patil <shivraj.patil@imgtec.com>
2016-05-03 14:42:26 +05:30
Zhang Xianyi
2df60f7315 Merge pull request #863 from ashwinyes/develop_20160429_update_numa_binding
Update NUMA CPU binding
2016-04-29 11:46:24 -04:00
Zhang Xianyi
cd7af5260a Merge pull request #847 from sva-img/develop
MIPS P5600(32 bit) and I6400(64 bit) cores support added.
2016-04-29 11:44:36 -04:00
Werner Saar
3a2e8c3537 Merge pull request #864 from wernsaar/develop
optimized dgemm for POWER8
2016-04-29 13:33:45 +02:00
Werner Saar
56948dbf0f optimized dgemm for POWER8 2016-04-29 12:52:47 +02:00
Ashwin Sekhar T K
0fb380c966 Update NUMA CPU binding
When the number of process can all be
accommodated within the current node,
then use cores from the current node only.
2016-04-29 11:58:15 +05:30
Zhang Xianyi
c95f5008fe Merge pull request #858 from buffer51/develop
Fixed cross-suffix detection for path that contains dashes
2016-04-28 11:42:18 -04:00
buffer51
708dec5bb7 Use CROSS_SUFFIX only if CROSS is set 2016-04-27 22:23:02 -07:00
buffer51
20b0ed1da5 Fixed cross-suffix detection for path that contains dashes when the compiler itself doesn't 2016-04-27 12:09:44 -07:00
Werner Saar
6f43310de5 Merge pull request #856 from wernsaar/develop
optimized dgemm for POWER8
2016-04-27 16:34:15 +02:00
Werner Saar
782f75ba94 optimized param.h for POWER8 2016-04-27 15:48:09 +02:00
Werner Saar
0d0c6f7d7d optimized dgemm for POWER8 2016-04-27 14:01:08 +02:00
Zhang Xianyi
0551e571dd Merge pull request #852 from buffer51/develop
Added Android as a community-supported OS
2016-04-26 10:24:33 -04:00
Zhang Xianyi
7e253607c2 Merge pull request #851 from rndfax/develop
allow building tests when CROSS compiling but don't run them
2016-04-26 10:24:13 -04:00
buffer51
b5e98e4dda Added Android as a community-supported OS 2016-04-26 03:14:03 -07:00
Aleksey Kuleshov
3d50ccdc0d allow building tests when CROSS compiling but don't run them 2016-04-26 12:36:47 +03:00
Werner Saar
6abec09eb4 Merge pull request #850 from wernsaar/develop
Bugfixes and enhancements for EXCAVATOR
2016-04-25 12:00:43 +02:00
Werner Saar
40ac64ae4f updated param.h for EXCAVATOR 2016-04-25 10:40:04 +02:00
Werner Saar
298b13bba4 updated some kernel files for EXCAVATOR 2016-04-25 10:36:23 +02:00
Werner Saar
78b05f6476 bugfix for EXCAVATOR and DYNAMIC_ARCH 2016-04-25 10:13:30 +02:00
Werner Saar
2b967590a0 bugfix in dynamic.c 2016-04-25 09:08:38 +02:00
Werner Saar
91b4233e06 Merge pull request #849 from wernsaar/develop
optimized gemm for POWER8
2016-04-23 16:25:27 +02:00
Werner Saar
089aad57f7 updated param.h for POWER8 2016-04-23 14:26:24 +02:00
Werner Saar
a3da10662f added sgemm_tcopy_8_power8.S 2016-04-23 10:04:41 +02:00
Werner Saar
d46f07bb4e added cgemm_tcopy_8_power8.S 2016-04-23 07:37:18 +02:00
Werner Saar
a670e8061e Merge pull request #848 from wernsaar/develop
Optimized zgemm for POWER8 and tested zgemm again
2016-04-22 13:46:22 +02:00
Werner Saar
879a51165f Optimized zgemm and tested zgemm again 2016-04-22 13:07:12 +02:00
Shivraj Patil
2c3dfe2bf3 MIPS P5600(32 bit) and I6400(64 bit) cores support added.
Seperated mips and mips64 files.
Configurations support for mips 32 bit.

Signed-off-by: Shivraj Patil <shivraj.patil@imgtec.com>
2016-04-22 14:03:18 +05:30
Werner Saar
ef30e52c8f Merge pull request #846 from wernsaar/develop
Optimized sgemm and dgemm for POWER8
2016-04-21 13:52:24 +02:00
Werner Saar
dd2b897795 added bugfixes for some make files and smallscaling.c 2016-04-21 12:54:32 +02:00
Werner Saar
9276c9012f Optimized sgemm and dgemm and tested again. 2016-04-21 11:37:57 +02:00
Werner Saar
391584af85 optimized Makefile.power for POWER8 2016-04-20 15:28:28 +02:00
wernsaar
6fbca2a4a1 Merge pull request #845 from wernsaar/develop
optimized sgemm for power8
2016-04-20 13:44:22 +02:00
Werner Saar
0001260f4b optimized sgemm 2016-04-20 13:06:38 +02:00
Werner Saar
3c6294ca3d added optimized sgemm_tcopy for power8 2016-04-19 16:08:54 +02:00
Zhang Xianyi
dd43661cfd Init IBM z system (s390x) porting. 2016-04-15 18:02:24 -04:00
Zhang Xianyi
9253dadaa7 Bump to 0.2.19.dev. 2016-04-12 15:32:10 -04:00
Werner Saar
8037d78eed bugfix for arm scal.c and zscal.c 2016-04-11 11:21:36 +02:00
Werner Saar
1ca750471a added cholesky benchmarks to Makefile for ESSL 2016-04-10 11:28:20 +02:00
Zhang Xianyi
1367a64d09 Merge branch 'develop' of github.com:xianyi/OpenBLAS into arm_soft_fp_abi 2015-11-11 19:25:07 +00:00
Zhang Xianyi
ccf41ebf78 Merge branch 'develop' into arm_soft_fp_abi 2015-10-28 12:12:31 +00:00
Zhang Xianyi
857899526f ARM soft fp abi branch. 2015-09-26 14:10:18 +00:00
4520 changed files with 356777 additions and 75136 deletions

18
.gitignore vendored
View File

@@ -14,6 +14,21 @@ lapack-3.4.2.tgz
lapack-netlib/make.inc
lapack-netlib/lapacke/include/lapacke_mangling.h
lapack-netlib/TESTING/testing_results.txt
lapack-netlib/INSTALL/test*
lapack-netlib/TESTING/xeigtstc
lapack-netlib/TESTING/xeigtstd
lapack-netlib/TESTING/xeigtsts
lapack-netlib/TESTING/xeigtstz
lapack-netlib/TESTING/xlintstc
lapack-netlib/TESTING/xlintstd
lapack-netlib/TESTING/xlintstds
lapack-netlib/TESTING/xlintstrfc
lapack-netlib/TESTING/xlintstrfd
lapack-netlib/TESTING/xlintstrfs
lapack-netlib/TESTING/xlintstrfz
lapack-netlib/TESTING/xlintsts
lapack-netlib/TESTING/xlintstz
lapack-netlib/TESTING/xlintstzc
*.so
*.so.*
*.a
@@ -69,3 +84,6 @@ test/zblat3
build
build.*
*.swp
benchmark/*.goto
benchmark/smallscaling

View File

@@ -2,16 +2,19 @@
## Author: Hank Anderson <hank@statease.com>
##
cmake_minimum_required(VERSION 2.8.4)
cmake_minimum_required(VERSION 2.8.5)
project(OpenBLAS)
set(OpenBLAS_MAJOR_VERSION 0)
set(OpenBLAS_MINOR_VERSION 2)
set(OpenBLAS_PATCH_VERSION 18)
set(OpenBLAS_PATCH_VERSION 20)
set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
enable_language(ASM)
enable_language(C)
# Adhere to GNU filesystem layout conventions
include(GNUInstallDirs)
if(MSVC)
set(OpenBLAS_LIBNAME libopenblas)
else()
@@ -30,10 +33,20 @@ set(NO_LAPACK 1)
set(NO_LAPACKE 1)
endif()
if(BUILD_DEBUG)
set(CMAKE_BUILD_TYPE Debug)
if(CMAKE_CONFIGURATION_TYPES) # multiconfig generator?
set(CMAKE_CONFIGURATION_TYPES "Debug;Release" CACHE STRING "" FORCE)
set(CMAKE_BUILD_TYPE
Debug Debug
Release Release
)
else()
set(CMAKE_BUILD_TYPE Release)
if( NOT CMAKE_BUILD_TYPE )
if(BUILD_DEBUG)
set(CMAKE_BUILD_TYPE Debug)
else()
set(CMAKE_BUILD_TYPE Release)
endif()
endif()
endif()
if(BUILD_WITHOUT_CBLAS)
@@ -45,8 +58,8 @@ endif()
message(WARNING "CMake support is experimental. This will not produce the same Makefiles that OpenBLAS ships with. Only x86 support is currently available.")
include("${CMAKE_SOURCE_DIR}/cmake/utils.cmake")
include("${CMAKE_SOURCE_DIR}/cmake/system.cmake")
include("${PROJECT_SOURCE_DIR}/cmake/utils.cmake")
include("${PROJECT_SOURCE_DIR}/cmake/system.cmake")
set(BLASDIRS interface driver/level2 driver/level3 driver/others)
@@ -107,9 +120,12 @@ if (${NO_STATIC} AND ${NO_SHARED})
endif ()
#Set default output directory
set( CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib )
set( CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib )
set( CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib)
set( CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib)
if(MSVC)
set( CMAKE_LIBRARY_OUTPUT_DIRECTORY_DEBUG ${PROJECT_BINARY_DIR}/lib/Debug)
set( CMAKE_ARCHIVE_OUTPUT_DIRECTORY_RELEASE ${PROJECT_BINARY_DIR}/lib/Release)
endif ()
# get obj vars into format that add_library likes: $<TARGET_OBJS:objlib> (see http://www.cmake.org/cmake/help/v3.0/command/add_library.html)
set(TARGET_OBJS "")
foreach (SUBDIR ${SUBDIRS})
@@ -123,43 +139,50 @@ endforeach ()
# Can't just use lapack-netlib's CMake files, since they are set up to search for BLAS, build and install a binary. We just want to build a couple of lib files out of lapack and lapacke.
# Not using add_subdirectory here because lapack-netlib already has its own CMakeLists.txt. Instead include a cmake script with the sources we want.
if (NOT NOFORTRAN AND NOT NO_LAPACK)
include("${CMAKE_SOURCE_DIR}/cmake/lapack.cmake")
include("${PROJECT_SOURCE_DIR}/cmake/lapack.cmake")
if (NOT NO_LAPACKE)
include("${CMAKE_SOURCE_DIR}/cmake/lapacke.cmake")
include("${PROJECT_SOURCE_DIR}/cmake/lapacke.cmake")
endif ()
endif ()
#Only generate .def for dll on MSVC
# Only generate .def for dll on MSVC and always produce pdb files for debug and release
if(MSVC)
set(OpenBLAS_DEF_FILE "${PROJECT_BINARY_DIR}/openblas.def")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /Zi")
set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /Zi")
set(CMAKE_SHARED_LINKER_FLAGS_RELEASE "${CMAKE_SHARED_LINKER_FLAGS_RELEASE} /DEBUG /OPT:REF /OPT:ICF")
endif()
# add objects to the openblas lib
add_library(${OpenBLAS_LIBNAME} SHARED ${LA_SOURCES} ${LAPACKE_SOURCES} ${TARGET_OBJS} ${OpenBLAS_DEF_FILE})
include("${CMAKE_SOURCE_DIR}/cmake/export.cmake")
include("${PROJECT_SOURCE_DIR}/cmake/export.cmake")
# Set output for libopenblas
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib)
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES LIBRARY_OUTPUT_NAME_DEBUG "${OpenBLAS_LIBNAME}_d")
foreach (OUTPUTCONFIG ${CMAKE_CONFIGURATION_TYPES})
string( TOUPPER ${OUTPUTCONFIG} OUTPUTCONFIG )
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib)
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES LIBRARY_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib)
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES ARCHIVE_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib)
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib/${OUTPUTCONFIG} )
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES LIBRARY_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib/${OUTPUTCONFIG} )
set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES ARCHIVE_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib/${OUTPUTCONFIG} )
endforeach()
enable_testing()
add_subdirectory(utest)
if(NOT MSVC)
#only build shared library for MSVC
add_library(${OpenBLAS_LIBNAME}_static STATIC ${LA_SOURCES} ${LAPACKE_SOURCES} ${TARGET_OBJS})
set_target_properties(${OpenBLAS_LIBNAME}_static PROPERTIES OUTPUT_NAME ${OpenBLAS_LIBNAME})
set_target_properties(${OpenBLAS_LIBNAME}_static PROPERTIES CLEAN_DIRECT_OUTPUT 1)
if (NOT MSVC)
#only build shared library for MSVC
if(SMP)
target_link_libraries(${OpenBLAS_LIBNAME} pthread)
target_link_libraries(${OpenBLAS_LIBNAME}_static pthread)
add_library(${OpenBLAS_LIBNAME}_static STATIC ${LA_SOURCES} ${LAPACKE_SOURCES} ${TARGET_OBJS})
set_target_properties(${OpenBLAS_LIBNAME}_static PROPERTIES OUTPUT_NAME ${OpenBLAS_LIBNAME})
set_target_properties(${OpenBLAS_LIBNAME}_static PROPERTIES CLEAN_DIRECT_OUTPUT 1)
if(SMP)
target_link_libraries(${OpenBLAS_LIBNAME} pthread)
target_link_libraries(${OpenBLAS_LIBNAME}_static pthread)
endif()
#build test and ctest
@@ -198,3 +221,73 @@ set_target_properties(${OpenBLAS_LIBNAME} PROPERTIES
#endif
# @touch lib.grd
# Install project
# Install libraries
install(TARGETS ${OpenBLAS_LIBNAME}
RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} )
# Install include files
set (GENCONFIG_BIN ${CMAKE_BINARY_DIR}/gen_config_h${CMAKE_EXECUTABLE_SUFFIX})
ADD_CUSTOM_COMMAND(
OUTPUT ${CMAKE_BINARY_DIR}/openblas_config.h
DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/config.h
COMMAND ${GENCONFIG_BIN} ${CMAKE_CURRENT_SOURCE_DIR}/config.h ${CMAKE_CURRENT_SOURCE_DIR}/openblas_config_template.h > ${CMAKE_BINARY_DIR}/openblas_config.h
)
ADD_CUSTOM_TARGET(genconfig
ALL
DEPENDS openblas_config.h
)
add_dependencies(genconfig ${OpenBLAS_LIBNAME})
install (FILES ${CMAKE_BINARY_DIR}/openblas_config.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
message(STATUS "Generating f77blas.h in ${CMAKE_INSTALL_INCLUDEDIR}")
ADD_CUSTOM_TARGET(genf77blas
ALL
COMMAND ${AWK} 'BEGIN{print \"\#ifndef OPENBLAS_F77BLAS_H\" \; print \"\#define OPENBLAS_F77BLAS_H\" \; print \"\#include \\"openblas_config.h\\" \"}; NF {print}; END{print \"\#endif\"}' ${CMAKE_CURRENT_SOURCE_DIR}/common_interface.h > ${CMAKE_BINARY_DIR}/f77blas.h
DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/config.h
)
add_dependencies(genf77blas ${OpenBLAS_LIBNAME})
install (FILES ${CMAKE_BINARY_DIR}/f77blas.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
if(NOT NO_CBLAS)
message (STATUS "Generating cblas.h in ${CMAKE_INSTALL_INCLUDEDIR}")
ADD_CUSTOM_TARGET(gencblas
ALL
COMMAND ${SED} 's/common/openblas_config/g' ${CMAKE_CURRENT_SOURCE_DIR}/cblas.h > "${CMAKE_BINARY_DIR}/cblas.tmp"
COMMAND cp "${CMAKE_BINARY_DIR}/cblas.tmp" "${CMAKE_BINARY_DIR}/cblas.h"
DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/cblas.h
)
add_dependencies(gencblas ${OpenBLAS_LIBNAME})
install (FILES ${CMAKE_BINARY_DIR}/cblas.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
endif()
if(NOT NO_LAPACKE)
message (STATUS "Copying LAPACKE header files to ${CMAKE_INSTALL_INCLUDEDIR}")
add_dependencies( ${OpenBLAS_LIBNAME} genlapacke)
FILE(GLOB_RECURSE INCLUDE_FILES "${CMAKE_CURRENT_SOURCE_DIR}/lapack-netlib/LAPACKE/*.h")
install (FILES ${INCLUDE_FILES} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
ADD_CUSTOM_TARGET(genlapacke
COMMAND cp ${CMAKE_CURRENT_SOURCE_DIR}/lapack-netlib/LAPACKE/include/lapacke_mangling_with_flags.h.in "${CMAKE_BINARY_DIR}/lapacke_mangling.h"
)
install (FILES ${CMAKE_BINARY_DIR}/lapacke_mangling.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
endif()
if(NOT MSVC)
install (TARGETS ${OpenBLAS_LIBNAME}_static DESTINATION ${CMAKE_INSTALL_LIBDIR})
endif()
include(FindPkgConfig QUIET)
if(PKG_CONFIG_FOUND)
configure_file(${PROJECT_SOURCE_DIR}/cmake/openblas.pc.in ${PROJECT_BINARY_DIR}/openblas.pc @ONLY)
install (FILES ${PROJECT_BINARY_DIR}/openblas.pc DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig/)
endif()

View File

@@ -150,3 +150,21 @@ In chronological order:
* theoractice <https://github.com/theoractice/>
* [2016-03-20] Fix compiler error in VisualStudio with CMake
* [2016-03-22] Fix access violation on Windows while static linking
* Paul Mustière <https://github.com/buffer51/>
* [2016-02-04] Fix Android build on ARMV7
* [2016-04-26] Android build with LAPACK for ARMV7 & ARMV8
* Shivraj Patil <https://github.com/sva-img/>
* [2016-05-03] DGEMM optimization for MIPS P5600 and I6400 using MSA
* Kaustubh Raste <https://github.com/ksraste/>
* [2016-05-09] DTRSM optimization for MIPS P5600 and I6400 using MSA
* [2016-05-20] STRSM optimization for MIPS P5600 and I6400 using MSA
* Abdelrauf <https://github.com/quickwritereader>
* [2017-01-01] dgemm and dtrmm kernels for IBM z13
* [2017-02-26] ztrmm kernel for IBM z13
* [2017-03-13] strmm and ctrmm kernel for IBM z13

View File

@@ -1,4 +1,63 @@
OpenBLAS ChangeLog
====================================================================
Version 0.2.20
24-Jul-2017
common:
* Improved CMake support
* Fixed several thread race and locking bugs
* Fixed default LAPACK optimization level
* Updated LAPACK to 3.7.0
* Added ReLAPACK (https://github.com/HPAC/ReLAPACK, make BUILD_RELAPACK=1)
POWER:
* Optimizations for Power9
* Fixed several Power8 assembly bugs
ARM:
* New optimized Vulcan and ThunderX2T99 targets
* Support for ARMV7 SOFT_FP ABI (make ARM_SOFTFP_ABI=1)
* Detect all cpu cores including offline ones
* Fix compilation with CLANG
* Support building a shared library for Android
MIPS:
* Fixed several threading issues
* Fix compilation with CLANG
x86_64:
* Detect Intel Bay Trail and Apollo Lake
* Detect Intel Sky Lake and Kaby Lake
* Detect Intel Knights Landing
* Detect AMD A8, A10, A12 and Ryzen
* Support 64bit builds with Visual Studio
* Fix building with Intel and PGI compilers
* Fix building with MINGW and TDM-GCC
* Fix cmake builds for Haswell and related cpus
* Fix building for Sandybridge with CLANG 3.9
* Add support for the FLANG compiler
IBM Z:
* New target z13 with BLAS3 optimizations
====================================================================
Version 0.2.19
1-Sep-2016
common:
* Improved cross compiling.
* Fix the bug on musl libc.
POWER:
* Optimize BLAS on Power8
* Fixed Julia+OpenBLAS bugs on Power8
MIPS:
* Optimize BLAS on MIPS P5600 and I6400 (Thanks, Shivraj Patil, Kaustubh Raste)
ARM:
* Improved on ARM Cortex-A57. (Thanks, Ashwin Sekhar T K)
====================================================================
Version 0.2.18
12-Apr-2016

View File

@@ -16,14 +16,19 @@ ifneq ($(NO_LAPACK), 1)
SUBDIRS += lapack
endif
RELA =
ifeq ($(BUILD_RELAPACK), 1)
RELA = re_lapack
endif
LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast,$(LAPACK_FFLAGS))
SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench
.PHONY : all libs netlib test ctest shared install
.NOTPARALLEL : all libs prof lapack-test install blas-test
.PHONY : all libs netlib $(RELA) test ctest shared install
.NOTPARALLEL : all libs $(RELA) prof lapack-test install blas-test
all :: libs netlib tests shared
all :: libs netlib $(RELA) tests shared
@echo
@echo " OpenBLAS build complete. ($(LIB_COMPONENTS))"
@echo
@@ -81,7 +86,7 @@ endif
shared :
ifndef NO_SHARED
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS))
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android))
@$(MAKE) -C exports so
@ln -fs $(LIBSONAME) $(LIBPREFIX).so
@ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
@@ -108,8 +113,6 @@ endif
tests :
ifndef NOFORTRAN
ifndef TARGET
ifndef CROSS
touch $(LIBNAME)
ifndef NO_FBLAS
$(MAKE) -C test all
@@ -119,8 +122,6 @@ ifndef NO_CBLAS
$(MAKE) -C ctest all
endif
endif
endif
endif
libs :
ifeq ($(CORE), UNKOWN)
@@ -219,6 +220,14 @@ ifndef NO_LAPACKE
endif
endif
ifeq ($(NO_LAPACK), 1)
re_lapack :
else
re_lapack :
@$(MAKE) -C relapack
endif
prof_lapack : lapack_prebuild
@$(MAKE) -C $(NETLIB_LAPACK_DIR) lapack_prof
@@ -282,13 +291,13 @@ lapack-timing : large.tgz timing.tgz
ifndef NOFORTRAN
(cd $(NETLIB_LAPACK_DIR); $(TAR) zxf ../timing.tgz TIMING)
(cd $(NETLIB_LAPACK_DIR)/TIMING; $(TAR) zxf ../../large.tgz )
make -C $(NETLIB_LAPACK_DIR)/TIMING
$(MAKE) -C $(NETLIB_LAPACK_DIR)/TIMING
endif
lapack-test :
(cd $(NETLIB_LAPACK_DIR)/TESTING && rm -f x* *.out)
make -j 1 -C $(NETLIB_LAPACK_DIR)/TESTING xeigtstc xeigtstd xeigtsts xeigtstz xlintstc xlintstd xlintstds xlintstrfd xlintstrfz xlintsts xlintstz xlintstzc xlintstrfs xlintstrfc
$(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR)/TESTING xeigtstc xeigtstd xeigtsts xeigtstz xlintstc xlintstd xlintstds xlintstrfd xlintstrfz xlintsts xlintstz xlintstzc xlintstrfs xlintstrfc
ifneq ($(CROSS), 1)
( cd $(NETLIB_LAPACK_DIR)/INSTALL; ./testlsame; ./testslamch; ./testdlamch; \
./testsecond; ./testdsecnd; ./testieee; ./testversion )
@@ -303,7 +312,7 @@ lapack-runtest:
blas-test:
(cd $(NETLIB_LAPACK_DIR)/BLAS && rm -f x* *.out)
make -j 1 -C $(NETLIB_LAPACK_DIR) blas_testing
$(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR) blas_testing
(cd $(NETLIB_LAPACK_DIR)/BLAS && cat *.out)
@@ -330,6 +339,7 @@ endif
@touch $(NETLIB_LAPACK_DIR)/make.inc
@$(MAKE) -C $(NETLIB_LAPACK_DIR) clean
@rm -f $(NETLIB_LAPACK_DIR)/make.inc $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_mangling.h
@$(MAKE) -C relapack clean
@rm -f *.grd Makefile.conf_last config_last.h
@(cd $(NETLIB_LAPACK_DIR)/TESTING && rm -f x* *.out testing_results.txt)
@echo Done.

View File

@@ -1,31 +1,19 @@
# ifeq logical or
ifeq ($(CORE), $(filter $(CORE),CORTEXA9 CORTEXA15))
ifeq ($(CORE), $(filter $(CORE),ARMV7 CORTEXA9 CORTEXA15))
ifeq ($(OSNAME), Android)
CCOMMON_OPT += -marm -mfpu=neon -mfloat-abi=hard -march=armv7-a
FCOMMON_OPT += -marm -mfpu=neon -mfloat-abi=hard -march=armv7-a
CCOMMON_OPT += -mfpu=neon -march=armv7-a
FCOMMON_OPT += -mfpu=neon -march=armv7-a
else
CCOMMON_OPT += -marm -mfpu=vfpv3 -mfloat-abi=hard -march=armv7-a
FCOMMON_OPT += -marm -mfpu=vfpv3 -mfloat-abi=hard -march=armv7-a
endif
endif
ifeq ($(CORE), ARMV7)
ifeq ($(OSNAME), Android)
CCOMMON_OPT += -marm -mfpu=neon -mfloat-abi=hard -march=armv7-a -Wl,--no-warn-mismatch
FCOMMON_OPT += -marm -mfpu=neon -mfloat-abi=hard -march=armv7-a -Wl,--no-warn-mismatch
else
CCOMMON_OPT += -marm -mfpu=vfpv3 -mfloat-abi=hard -march=armv7-a
FCOMMON_OPT += -marm -mfpu=vfpv3 -mfloat-abi=hard -march=armv7-a
CCOMMON_OPT += -mfpu=vfpv3 -march=armv7-a
FCOMMON_OPT += -mfpu=vfpv3 -march=armv7-a
endif
endif
ifeq ($(CORE), ARMV6)
CCOMMON_OPT += -marm -mfpu=vfp -mfloat-abi=hard -march=armv6
FCOMMON_OPT += -marm -mfpu=vfp -mfloat-abi=hard -march=armv6
CCOMMON_OPT += -mfpu=vfp -march=armv6
FCOMMON_OPT += -mfpu=vfp -march=armv6
endif
ifeq ($(CORE), ARMV5)
CCOMMON_OPT += -marm -march=armv5
FCOMMON_OPT += -marm -march=armv5
CCOMMON_OPT += -march=armv5
FCOMMON_OPT += -march=armv5
endif

View File

@@ -9,3 +9,17 @@ CCOMMON_OPT += -march=armv8-a+crc+crypto+fp+simd -mtune=cortex-a57
FCOMMON_OPT += -march=armv8-a+crc+crypto+fp+simd -mtune=cortex-a57
endif
ifeq ($(CORE), VULCAN)
CCOMMON_OPT += -mtune=vulcan -mcpu=vulcan
FCOMMON_OPT += -mtune=vulcan -mcpu=vulcan
endif
ifeq ($(CORE), THUNDERX)
CCOMMON_OPT += -mtune=thunderx -mcpu=thunderx
FCOMMON_OPT += -mtune=thunderx -mcpu=thunderx
endif
ifeq ($(CORE), THUNDERX2T99)
CCOMMON_OPT += -mtune=thunderx2t99 -mcpu=thunderx2t99
FCOMMON_OPT += -mtune=thunderx2t99 -mcpu=thunderx2t99
endif

View File

@@ -12,6 +12,7 @@ OPENBLAS_BUILD_DIR := $(CURDIR)
OPENBLAS_CMAKE_DIR := $(OPENBLAS_LIBRARY_DIR)/cmake/openblas
OPENBLAS_CMAKE_CONFIG := OpenBLASConfig.cmake
OPENBLAS_CMAKE_CONFIG_VERSION := OpenBLASConfigVersion.cmake
OPENBLAS_PKGCONFIG_DIR := $(OPENBLAS_LIBRARY_DIR)/pkgconfig
.PHONY : install
.NOTPARALLEL : install
@@ -20,110 +21,122 @@ lib.grd :
$(error OpenBLAS: Please run "make" firstly)
install : lib.grd
@-mkdir -p $(DESTDIR)$(PREFIX)
@-mkdir -p $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
@-mkdir -p $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
@-mkdir -p $(DESTDIR)$(OPENBLAS_BINARY_DIR)
@-mkdir -p $(DESTDIR)$(OPENBLAS_CMAKE_DIR)
@-mkdir -p "$(DESTDIR)$(PREFIX)"
@-mkdir -p "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)"
@-mkdir -p "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
@-mkdir -p "$(DESTDIR)$(OPENBLAS_BINARY_DIR)"
@-mkdir -p "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)"
@-mkdir -p "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)"
@echo Generating openblas_config.h in $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
#for inc
@echo \#ifndef OPENBLAS_CONFIG_H > $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h
@echo \#define OPENBLAS_CONFIG_H >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h
@$(AWK) 'NF {print $$1, "OPENBLAS_"$$2, $$3}' config_last.h >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h
@echo \#define OPENBLAS_VERSION \" OpenBLAS $(VERSION) \" >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h
@cat openblas_config_template.h >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h
@echo \#endif \/\* OPENBLAS_CONFIG_H \*\/ >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h
@echo \#ifndef OPENBLAS_CONFIG_H > "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h"
@echo \#define OPENBLAS_CONFIG_H >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h"
@$(AWK) 'NF {print $$1, "OPENBLAS_"$$2, $$3}' config_last.h >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h"
@echo \#define OPENBLAS_VERSION \" OpenBLAS $(VERSION) \" >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h"
@cat openblas_config_template.h >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h"
@echo \#endif \/\* OPENBLAS_CONFIG_H \*\/ >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/openblas_config.h"
@echo Generating f77blas.h in $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
@echo \#ifndef OPENBLAS_F77BLAS_H > $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h
@echo \#define OPENBLAS_F77BLAS_H >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h
@echo \#include \"openblas_config.h\" >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h
@cat common_interface.h >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h
@echo \#endif >> $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h
@echo \#ifndef OPENBLAS_F77BLAS_H > "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h"
@echo \#define OPENBLAS_F77BLAS_H >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h"
@echo \#include \"openblas_config.h\" >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h"
@cat common_interface.h >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h"
@echo \#endif >> "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/f77blas.h"
ifndef NO_CBLAS
@echo Generating cblas.h in $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
@sed 's/common/openblas_config/g' cblas.h > $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/cblas.h
@sed 's/common/openblas_config/g' cblas.h > "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/cblas.h"
endif
ifndef NO_LAPACKE
@echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_config.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_utils.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h"
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_config.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h"
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h.in "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h"
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_utils.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h"
endif
#for install static library
ifndef NO_STATIC
@echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
@install -pm644 $(LIBNAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
@cd $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) ; \
@install -pm644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
endif
#for install shared library
ifndef NO_SHARED
@echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS))
@install -pm755 $(LIBSONAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
@cd $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) ; \
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android))
@install -pm755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \
ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
endif
ifeq ($(OSNAME), FreeBSD)
@cp $(LIBSONAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
@cd $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) ; \
@cp $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
ln -fs $(LIBSONAME) $(LIBPREFIX).so
endif
ifeq ($(OSNAME), NetBSD)
@cp $(LIBSONAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
@cd $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) ; \
@cp $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
ln -fs $(LIBSONAME) $(LIBPREFIX).so
endif
ifeq ($(OSNAME), Darwin)
@-cp $(LIBDYNNAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
@-install_name_tool -id $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME) $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)
@cd $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) ; \
@-cp $(LIBDYNNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
@-install_name_tool -id "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)" "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)"
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib
endif
ifeq ($(OSNAME), WINNT)
@-cp $(LIBDLLNAME) $(DESTDIR)$(OPENBLAS_BINARY_DIR)
@-cp $(LIBDLLNAME).a $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
@-cp $(LIBDLLNAME) "$(DESTDIR)$(OPENBLAS_BINARY_DIR)"
@-cp $(LIBDLLNAME).a "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
endif
ifeq ($(OSNAME), CYGWIN_NT)
@-cp $(LIBDLLNAME) $(OPENBLAS_BINARY_DIR)
@-cp $(LIBDLLNAME) "$(DESTDIR)$(OPENBLAS_BINARY_DIR)"
endif
endif
#Generating openblas.pc
@echo Generating openblas.pc in $(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)
@echo 'libdir='$(OPENBLAS_LIBRARY_DIR) >> $(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc
@echo 'includedir='$(OPENBLAS_INCLUDE_DIR) >> $(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc
@echo 'version='$(VERSION) >> $(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc
@echo 'extralib='$(EXTRALIB) >> $(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc
@cat openblas.pc.in >> $(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc
#Generating OpenBLASConfig.cmake
@echo Generating $(OPENBLAS_CMAKE_CONFIG) in $(DESTDIR)$(OPENBLAS_CMAKE_DIR)
@echo "SET(OpenBLAS_VERSION \"${VERSION}\")" > $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)
@echo "SET(OpenBLAS_INCLUDE_DIRS ${OPENBLAS_INCLUDE_DIR})" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)
@echo "SET(OpenBLAS_VERSION \"${VERSION}\")" > "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
@echo "SET(OpenBLAS_INCLUDE_DIRS ${OPENBLAS_INCLUDE_DIR})" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
ifndef NO_SHARED
#ifeq logical or
ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD))
@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).so)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)
@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).so)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
endif
ifeq ($(OSNAME), $(filter $(OSNAME),WINNT CYGWIN_NT))
@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_BINARY_DIR}/$(LIBDLLNAME))" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)
@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_BINARY_DIR}/$(LIBDLLNAME))" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
endif
ifeq ($(OSNAME), Darwin)
@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).dylib)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)
@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).dylib)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
endif
else
#only static
@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).$(LIBSUFFIX))" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)
@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).$(LIBSUFFIX))" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
endif
#Generating OpenBLASConfigVersion.cmake
@echo Generating $(OPENBLAS_CMAKE_CONFIG_VERSION) in $(DESTDIR)$(OPENBLAS_CMAKE_DIR)
@echo "set (PACKAGE_VERSION \"${VERSION}\")" > $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)
@echo "if (PACKAGE_VERSION VERSION_LESS PACKAGE_FIND_VERSION)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)
@echo " set (PACKAGE_VERSION_COMPATIBLE FALSE)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)
@echo "else ()" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)
@echo " set (PACKAGE_VERSION_COMPATIBLE TRUE)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)
@echo " if (PACKAGE_FIND_VERSION STREQUAL PACKAGE_VERSION)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)
@echo " set (PACKAGE_VERSION_EXACT TRUE)" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)
@echo " endif ()" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)
@echo "endif ()" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)
@echo "set (PACKAGE_VERSION \"${VERSION}\")" > "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)"
@echo "if (PACKAGE_VERSION VERSION_LESS PACKAGE_FIND_VERSION)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)"
@echo " set (PACKAGE_VERSION_COMPATIBLE FALSE)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)"
@echo "else ()" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)"
@echo " set (PACKAGE_VERSION_COMPATIBLE TRUE)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)"
@echo " if (PACKAGE_FIND_VERSION STREQUAL PACKAGE_VERSION)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)"
@echo " set (PACKAGE_VERSION_EXACT TRUE)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)"
@echo " endif ()" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)"
@echo "endif ()" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)"
@echo Install OK!

3
Makefile.mips Normal file
View File

@@ -0,0 +1,3 @@
ifdef BINARY64
else
endif

View File

@@ -1,4 +1,26 @@
# CCOMMON_OPT += -DALLOC_SHM
ifdef USE_THREAD
ifeq ($(USE_THREAD), 0)
USE_OPENMP = 0
else
USE_OPENMP = 1
endif
else
USE_OPENMP = 1
endif
ifeq ($(CORE), POWER8)
ifeq ($(USE_OPENMP), 1)
COMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp
FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp
else
COMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -malign-power -fno-fast-math
FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -malign-power -fno-fast-math
endif
endif
FLAMEPATH = $(HOME)/flame/lib
@@ -16,6 +38,16 @@ else
endif
endif
#Either uncomment below line or run make with `USE_MASS=1` to enable support of MASS library
#USE_MASS = 1
ifeq ($(USE_MASS), 1)
# Path to MASS libs, change it if the libs are installed at any other location
MASSPATH = /opt/ibm/xlmass/8.1.5/lib
COMMON_OPT += -mveclibabi=mass -ftree-vectorize -funsafe-math-optimizations -DUSE_MASS
EXTRALIB += -L$(MASSPATH) -lmass -lmassvp8 -lmass_simdp8
endif
ifdef BINARY64

View File

@@ -17,14 +17,26 @@ ifdef CPUIDEMU
EXFLAGS = -DCPUIDEMU -DVENDOR=99
endif
ifeq ($(TARGET), P5600)
TARGET_FLAGS = -mips32r5
endif
ifeq ($(TARGET), I6400)
TARGET_FLAGS = -mips64r6
endif
ifeq ($(TARGET), P6600)
TARGET_FLAGS = -mips64r6
endif
all: getarch_2nd
./getarch_2nd 0 >> $(TARGET_MAKE)
./getarch_2nd 1 >> $(TARGET_CONF)
config.h : c_check f_check getarch
perl ./c_check $(TARGET_MAKE) $(TARGET_CONF) $(CC)
perl ./c_check $(TARGET_MAKE) $(TARGET_CONF) $(CC) $(TARGET_FLAGS)
ifneq ($(ONLY_CBLAS), 1)
perl ./f_check $(TARGET_MAKE) $(TARGET_CONF) $(FC)
perl ./f_check $(TARGET_MAKE) $(TARGET_CONF) $(FC) $(TARGET_FLAGS)
else
#When we only build CBLAS, we set NOFORTRAN=2
echo "NOFORTRAN=2" >> $(TARGET_MAKE)

View File

@@ -3,7 +3,7 @@
#
# This library's version
VERSION = 0.2.18
VERSION = 0.2.20
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
@@ -52,6 +52,7 @@ VERSION = 0.2.18
# USE_THREAD = 0
# If you're going to use this library with OpenMP, please comment it in.
# This flag is always set for POWER8. Don't modify the flag
# USE_OPENMP = 1
# You can define maximum number of threads. Basically it should be
@@ -82,6 +83,9 @@ VERSION = 0.2.18
# Build LAPACK Deprecated functions since LAPACK 3.6.0
BUILD_LAPACK_DEPRECATED = 1
# Build RecursiveLAPACK on top of LAPACK
# BUILD_RELAPACK = 1
# If you want to use legacy threaded Level 3 implementation.
# USE_SIMPLE_THREADED_LEVEL3 = 1
@@ -96,7 +100,7 @@ BUILD_LAPACK_DEPRECATED = 1
NO_WARMUP = 1
# If you want to disable CPU/Memory affinity on Linux.
NO_AFFINITY = 1
#NO_AFFINITY = 1
# if you are compiling for Linux and you have more than 16 numa nodes or more than 256 cpus
# BIGNUMA = 1
@@ -153,10 +157,12 @@ NO_AFFINITY = 1
# Common Optimization Flag;
# The default -O2 is enough.
# Flags for POWER8 are defined in Makefile.power. Don't modify COMMON_OPT
# COMMON_OPT = -O2
# gfortran option for LAPACK
# enable this flag only on 64bit Linux and if you need a thread safe lapack library
# Flags for POWER8 are defined in Makefile.power. Don't modify FCOMMON_OPT
# FCOMMON_OPT = -frecursive
# Profiling flags

View File

@@ -68,6 +68,9 @@ endif
ifeq ($(TARGET), EXCAVATOR)
GETARCH_FLAGS := -DFORCE_BARCELONA
endif
ifeq ($(TARGET), ZEN)
GETARCH_FLAGS := -DFORCE_BARCELONA
endif
endif
@@ -98,6 +101,9 @@ endif
ifeq ($(TARGET_CORE), EXCAVATOR)
GETARCH_FLAGS := -DFORCE_BARCELONA
endif
ifeq ($(TARGET_CORE), ZEN)
GETARCH_FLAGS := -DFORCE_BARCELONA
endif
endif
@@ -159,7 +165,7 @@ ifndef GOTOBLAS_MAKEFILE
export GOTOBLAS_MAKEFILE = 1
# Generating Makefile.conf and config.h
DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" CFLAGS="$(GETARCH_FLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) TARGET_CORE=$(TARGET_CORE) ONLY_CBLAS=$(ONLY_CBLAS) all)
DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" CFLAGS="$(GETARCH_FLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) TARGET_CORE=$(TARGET_CORE) ONLY_CBLAS=$(ONLY_CBLAS) TARGET=$(TARGET) all)
ifndef TARGET_CORE
include $(TOPDIR)/Makefile.conf
@@ -217,7 +223,9 @@ endif
#
ifeq ($(OSNAME), Darwin)
ifndef MACOSX_DEPLOYMENT_TARGET
export MACOSX_DEPLOYMENT_TARGET=10.6
endif
MD5SUM = md5 -r
endif
@@ -234,6 +242,10 @@ EXTRALIB += -lm
NO_EXPRECISION = 1
endif
ifeq ($(OSNAME), Android)
EXTRALIB += -lm
endif
ifeq ($(OSNAME), AIX)
EXTRALIB += -lm
endif
@@ -406,7 +418,6 @@ CCOMMON_OPT += -fopenmp
endif
ifeq ($(C_COMPILER), CLANG)
$(error OpenBLAS: Clang didn't support OpenMP yet.)
CCOMMON_OPT += -fopenmp
endif
@@ -441,12 +452,13 @@ ifneq ($(NO_AVX), 1)
DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER STEAMROLLER EXCAVATOR
endif
ifneq ($(NO_AVX2), 1)
DYNAMIC_CORE += HASWELL
DYNAMIC_CORE += HASWELL ZEN
endif
endif
# If DYNAMIC_CORE is not set, DYNAMIC_ARCH cannot do anything, so force it to empty
ifndef DYNAMIC_CORE
DYNAMIC_ARCH =
override DYNAMIC_ARCH=
endif
endif
@@ -462,7 +474,7 @@ endif
endif
endif
ifeq ($(ARCH), mips64)
ifeq ($(ARCH), $(filter $(ARCH),mips64 mips))
NO_BINARY_MODE = 1
endif
@@ -474,6 +486,23 @@ endif
ifeq ($(ARCH), arm)
NO_BINARY_MODE = 1
BINARY_DEFINED = 1
CCOMMON_OPT += -marm
FCOMMON_OPT += -marm
# If softfp abi is mentioned on the command line, force it.
ifeq ($(ARM_SOFTFP_ABI), 1)
CCOMMON_OPT += -mfloat-abi=softfp
FCOMMON_OPT += -mfloat-abi=softfp
endif
ifeq ($(OSNAME), Android)
ifeq ($(ARM_SOFTFP_ABI), 1)
EXTRALIB += -lm
else
EXTRALIB += -Wl,-lm_hard
endif
endif
endif
ifeq ($(ARCH), arm64)
@@ -502,13 +531,16 @@ endif
ifdef NO_BINARY_MODE
ifeq ($(ARCH), mips64)
ifeq ($(ARCH), $(filter $(ARCH),mips64))
ifdef BINARY64
CCOMMON_OPT += -mabi=64
else
CCOMMON_OPT += -mabi=n32
endif
BINARY_DEFINED = 1
else ifeq ($(ARCH), $(filter $(ARCH),mips))
CCOMMON_OPT += -mabi=32
BINARY_DEFINED = 1
endif
ifeq ($(CORE), LOONGSON3A)
@@ -521,6 +553,21 @@ CCOMMON_OPT += -march=mips64
FCOMMON_OPT += -march=mips64
endif
ifeq ($(CORE), P5600)
CCOMMON_OPT += -mips32r5 -mnan=2008 -mtune=p5600 $(MSA_FLAGS)
FCOMMON_OPT += -mips32r5 -mnan=2008 -mtune=p5600 $(MSA_FLAGS)
endif
ifeq ($(CORE), I6400)
CCOMMON_OPT += -mips64r6 -mnan=2008 -mtune=i6400 $(MSA_FLAGS)
FCOMMON_OPT += -mips64r6 -mnan=2008 -mtune=i6400 $(MSA_FLAGS)
endif
ifeq ($(CORE), P6600)
CCOMMON_OPT += -mips64r6 -mnan=2008 -mtune=p6600 $(MSA_FLAGS)
FCOMMON_OPT += -mips64r6 -mnan=2008 -mtune=p6600 $(MSA_FLAGS)
endif
ifeq ($(OSNAME), AIX)
BINARY_DEFINED = 1
endif
@@ -557,6 +604,23 @@ endif
# Fortran Compiler dependent settings
#
ifeq ($(F_COMPILER), FLANG)
CCOMMON_OPT += -DF_INTERFACE_FLANG
ifdef BINARY64
ifdef INTERFACE64
ifneq ($(INTERFACE64), 0)
FCOMMON_OPT += -i8
endif
endif
FCOMMON_OPT += -Wall
else
FCOMMON_OPT += -Wall
endif
ifeq ($(USE_OPENMP), 1)
FCOMMON_OPT += -fopenmp
endif
endif
ifeq ($(F_COMPILER), G77)
CCOMMON_OPT += -DF_INTERFACE_G77
FCOMMON_OPT += -Wall
@@ -589,12 +653,14 @@ ifneq ($(NO_LAPACK), 1)
EXTRALIB += -lgfortran
endif
ifdef NO_BINARY_MODE
ifeq ($(ARCH), mips64)
ifeq ($(ARCH), $(filter $(ARCH),mips64))
ifdef BINARY64
FCOMMON_OPT += -mabi=64
else
FCOMMON_OPT += -mabi=n32
endif
else ifeq ($(ARCH), $(filter $(ARCH),mips))
FCOMMON_OPT += -mabi=32
endif
else
ifdef BINARY64
@@ -677,21 +743,7 @@ FCOMMON_OPT += -i8
endif
endif
endif
ifneq ($(ARCH), mips64)
ifndef BINARY64
FCOMMON_OPT += -m32
else
FCOMMON_OPT += -m64
endif
else
ifdef BINARY64
FCOMMON_OPT += -mabi=64
else
FCOMMON_OPT += -mabi=n32
endif
endif
ifeq ($(USE_OPENMP), 1)
FCOMMON_OPT += -mp
endif
@@ -707,7 +759,7 @@ endif
endif
endif
ifeq ($(ARCH), mips64)
ifeq ($(ARCH), $(filter $(ARCH),mips64 mips))
ifndef BINARY64
FCOMMON_OPT += -n32
else
@@ -737,7 +789,7 @@ endif
ifeq ($(C_COMPILER), OPEN64)
ifeq ($(ARCH), mips64)
ifeq ($(ARCH), $(filter $(ARCH),mips64 mips))
ifndef BINARY64
CCOMMON_OPT += -n32
else
@@ -996,7 +1048,7 @@ endif
override CFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR)
override PFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) -DPROFILE $(COMMON_PROF)
override FFLAGS += $(FCOMMON_OPT)
override FFLAGS += $(COMMON_OPT) $(FCOMMON_OPT)
override FPFLAGS += $(FCOMMON_OPT) $(COMMON_PROF)
#MAKEOVERRIDES =
@@ -1077,6 +1129,9 @@ LIB_COMPONENTS += LAPACK
ifneq ($(NO_LAPACKE), 1)
LIB_COMPONENTS += LAPACKE
endif
ifeq ($(BUILD_RELAPACK), 1)
LIB_COMPONENTS += ReLAPACK
endif
endif
ifeq ($(ONLY_CBLAS), 1)
@@ -1126,6 +1181,8 @@ export HAVE_VFP
export HAVE_VFPV3
export HAVE_VFPV4
export HAVE_NEON
export HAVE_MSA
export MSA_FLAGS
export KERNELDIR
export FUNCTION_PROFILE
export TARGET_CORE

6
Makefile.zarch Normal file
View File

@@ -0,0 +1,6 @@
ifeq ($(CORE), Z13)
CCOMMON_OPT += -march=z13 -mzvector
FCOMMON_OPT += -march=z13 -mzvector
endif

View File

@@ -43,6 +43,35 @@ On X86 box, compile this library for loongson3a CPU with loongcc (based on Open6
make DEBUG=1
### Compile with MASS Support on Power CPU (Optional dependency)
[IBM MASS](http://www-01.ibm.com/software/awdtools/mass/linux/mass-linux.html) library consists of a set of mathematical functions for C, C++, and
Fortran-language applications that are tuned for optimum performance on POWER architectures. OpenBLAS with MASS requires 64-bit, little-endian OS on POWER.
The library can be installed as below -
* On Ubuntu:
wget -q http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/ubuntu/public.gpg -O- | sudo apt-key add -</br>
echo "deb http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/ubuntu/ trusty main" | sudo tee /etc/apt/sources.list.d/ibm-xl-compiler-eval.list</br>
sudo apt-get update</br>
sudo apt-get install libxlmass-devel.8.1.5</br>
* On RHEL/CentOS:
wget http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/rhel7/repodata/repomd.xml.key</br>
sudo rpm --import repomd.xml.key</br>
wget http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/rhel7/ibm-xl-compiler-eval.repo</br>
sudo cp ibm-xl-compiler-eval.repo /etc/yum.repos.d/</br>
sudo yum install libxlmass-devel.8.1.5</br>
After installing MASS library, compile openblas with USE_MASS=1.
Example:
Compiling on Power8 with MASS support -
make USE_MASS=1 TARGET=POWER8
### Install to the directory (optional)
Example:
@@ -77,11 +106,16 @@ Please read GotoBLAS_01Readme.txt
- **ARMV8**: Experimental
- **ARM Cortex-A57**: Experimental
#### IBM zEnterprise System:
- **Z13**: Optimized Level-3 BLAS
### Support OS:
- **GNU/Linux**
- **MingWin or Visual Studio(CMake)/Windows**: Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-in-Microsoft-Visual-Studio>.
- **Darwin/Mac OS X**: Experimental. Although GotoBLAS2 supports Darwin, we are the beginner on Mac OS X.
- **FreeBSD**: Supported by community. We didn't test the library on this OS.
- **Android**: Supported by community. Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-build-OpenBLAS-for-Android>.
## Usages
Link with libopenblas.a or -lopenblas for shared library.

View File

@@ -34,6 +34,7 @@ BULLDOZER
PILEDRIVER
STEAMROLLER
EXCAVATOR
ZEN
c)VIA CPU:
SSE_GENERIC
@@ -53,26 +54,34 @@ PPC440
PPC440FP2
CELL
3.MIPS64 CPU:
3.MIPS CPU:
P5600
4.MIPS64 CPU:
SICORTEX
LOONGSON3A
LOONGSON3B
I6400
P6600
4.IA64 CPU:
5.IA64 CPU:
ITANIUM2
5.SPARC CPU:
6.SPARC CPU:
SPARC
SPARCV7
6.ARM CPU:
7.ARM CPU:
CORTEXA15
CORTEXA9
ARMV7
ARMV6
ARMV5
7.ARM 64-bit CPU:
8.ARM 64-bit CPU:
ARMV8
CORTEXA57
VULCAN
THUNDERX
THUNDERX2T99

View File

@@ -1,4 +1,4 @@
version: 0.2.18.{build}
version: 0.2.19.{build}
#environment:

View File

@@ -37,6 +37,18 @@ ESSL=/opt/ibm/lib
#LIBESSL = -lesslsmp $(ESSL)/libxlomp_ser.so.1 $(ESSL)/libxlf90_r.so.1 $(ESSL)/libxlfmath.so.1 $(ESSL)/libxlsmp.so.1 /opt/ibm/xlC/13.1.3/lib/libxl.a
LIBESSL = -lesslsmp $(ESSL)/libxlf90_r.so.1 $(ESSL)/libxlfmath.so.1 $(ESSL)/libxlsmp.so.1 /opt/ibm/xlC/13.1.3/lib/libxl.a
ifneq ($(NO_LAPACK), 1)
GOTO_LAPACK_TARGETS=slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \
scholesky.goto dcholesky.goto ccholesky.goto zcholesky.goto \
sgesv.goto dgesv.goto cgesv.goto zgesv.goto \
sgeev.goto dgeev.goto cgeev.goto zgeev.goto \
csymv.goto zsymv.goto \
sgetri.goto dgetri.goto cgetri.goto zgetri.goto \
spotrf.goto dpotrf.goto cpotrf.goto zpotrf.goto
else
GOTO_LAPACK_TARGETS=
endif
ifeq ($(OSNAME), WINNT)
goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \
@@ -147,9 +159,7 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \
else
goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \
scholesky.goto dcholesky.goto ccholesky.goto zcholesky.goto \
sgemm.goto dgemm.goto cgemm.goto zgemm.goto \
goto :: sgemm.goto dgemm.goto cgemm.goto zgemm.goto \
strmm.goto dtrmm.goto ctrmm.goto ztrmm.goto \
strsm.goto dtrsm.goto ctrsm.goto ztrsm.goto \
ssyrk.goto dsyrk.goto csyrk.goto zsyrk.goto \
@@ -162,18 +172,16 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \
sswap.goto dswap.goto cswap.goto zswap.goto \
sscal.goto dscal.goto cscal.goto zscal.goto \
sasum.goto dasum.goto casum.goto zasum.goto \
ssymv.goto dsymv.goto csymv.goto zsymv.goto \
ssymv.goto dsymv.goto \
chemv.goto zhemv.goto \
chemm.goto zhemm.goto \
cherk.goto zherk.goto \
cher2k.goto zher2k.goto \
sgemv.goto dgemv.goto cgemv.goto zgemv.goto \
sgesv.goto dgesv.goto cgesv.goto zgesv.goto \
sgeev.goto dgeev.goto cgeev.goto zgeev.goto \
sgetri.goto dgetri.goto cgetri.goto zgetri.goto \
spotrf.goto dpotrf.goto cpotrf.goto zpotrf.goto \
ssymm.goto dsymm.goto csymm.goto zsymm.goto \
smallscaling
smallscaling \
isamax.goto idamax.goto icamax.goto izamax.goto \
snrm2.goto dnrm2.goto scnrm2.goto dznrm2.goto $(GOTO_LAPACK_TARGETS)
acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \
scholesky.acml dcholesky.acml ccholesky.acml zcholesky.acml \
@@ -226,7 +234,9 @@ atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \
sgesv.atlas dgesv.atlas cgesv.atlas zgesv.atlas \
sgetri.atlas dgetri.atlas cgetri.atlas zgetri.atlas \
spotrf.atlas dpotrf.atlas cpotrf.atlas zpotrf.atlas \
ssymm.atlas dsymm.atlas csymm.atlas zsymm.atlas
ssymm.atlas dsymm.atlas csymm.atlas zsymm.atlas \
isamax.atlas idamax.atlas icamax.atlas izamax.atlas \
snrm2.goto dnrm2.goto scnrm2.goto dznrm2.goto
mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \
scholesky.mkl dcholesky.mkl ccholesky.mkl zcholesky.mkl \
@@ -261,7 +271,9 @@ endif
essl :: sgemm.essl strmm.essl dgemm.essl dtrmm.essl \
cgemm.essl ctrmm.essl zgemm.essl ztrmm.essl \
slinpack.essl clinpack.essl dlinpack.essl zlinpack.essl
slinpack.essl clinpack.essl dlinpack.essl zlinpack.essl \
scholesky.essl ccholesky.essl dcholesky.essl zcholesky.essl \
strsm.essl dtrsm.essl ctrsm.essl ztrsm.essl
veclib :: slinpack.veclib dlinpack.veclib clinpack.veclib zlinpack.veclib \
scholesky.veclib dcholesky.veclib ccholesky.veclib zcholesky.veclib \
@@ -393,6 +405,9 @@ scholesky.mkl : scholesky.$(SUFFIX)
scholesky.veclib : scholesky.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
scholesky.essl : scholesky.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
##################################### Dcholesky ###################################################
dcholesky.goto : dcholesky.$(SUFFIX) ../$(LIBNAME)
@@ -410,6 +425,9 @@ dcholesky.mkl : dcholesky.$(SUFFIX)
dcholesky.veclib : dcholesky.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
dcholesky.essl : dcholesky.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
##################################### Ccholesky ###################################################
ccholesky.goto : ccholesky.$(SUFFIX) ../$(LIBNAME)
@@ -427,6 +445,9 @@ ccholesky.mkl : ccholesky.$(SUFFIX)
ccholesky.veclib : ccholesky.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
ccholesky.essl : ccholesky.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
##################################### Zcholesky ###################################################
@@ -445,6 +466,9 @@ zcholesky.mkl : zcholesky.$(SUFFIX)
zcholesky.veclib : zcholesky.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
zcholesky.essl : zcholesky.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
##################################### Sgemm ####################################################
sgemm.goto : sgemm.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
@@ -683,6 +707,9 @@ strsm.mkl : strsm.$(SUFFIX)
strsm.veclib : strsm.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
strsm.essl : strsm.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
##################################### Dtrsm ####################################################
dtrsm.goto : dtrsm.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
@@ -699,6 +726,9 @@ dtrsm.mkl : dtrsm.$(SUFFIX)
dtrsm.veclib : dtrsm.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
dtrsm.essl : dtrsm.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
##################################### Ctrsm ####################################################
ctrsm.goto : ctrsm.$(SUFFIX) ../$(LIBNAME)
@@ -716,6 +746,9 @@ ctrsm.mkl : ctrsm.$(SUFFIX)
ctrsm.veclib : ctrsm.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
ctrsm.essl : ctrsm.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
##################################### Ztrsm ####################################################
ztrsm.goto : ztrsm.$(SUFFIX) ../$(LIBNAME)
@@ -733,6 +766,9 @@ ztrsm.mkl : ztrsm.$(SUFFIX)
ztrsm.veclib : ztrsm.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
ztrsm.essl : ztrsm.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
##################################### Ssyrk ####################################################
ssyrk.goto : ssyrk.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
@@ -1911,6 +1947,63 @@ zgemm3m.mkl : zgemm3m.$(SUFFIX)
zgemm3m.veclib : zgemm3m.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
############################################## ISAMAX ##############################################
isamax.goto : isamax.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
isamax.atlas : isamax.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
############################################## IDAMAX ##############################################
idamax.goto : idamax.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
idamax.atlas : idamax.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
############################################## ICAMAX ##############################################
icamax.goto : icamax.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
icamax.atlas : icamax.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
############################################## IZAMAX ##############################################
izamax.goto : izamax.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
izamax.atlas : izamax.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
############################################## SNRM2 ##############################################
snrm2.goto : snrm2.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
snrm2.atlas : snrm2.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
############################################## DNRM2 ##############################################
dnrm2.goto : dnrm2.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
dnrm2.atlas : dnrm2.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
############################################## Sscnrm2 ##############################################
scnrm2.goto : scnrm2.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
scnrm2.atlas : scnrm2.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
############################################## Ddznrm2 ##############################################
dznrm2.goto : dznrm2.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
dznrm2.atlas : dznrm2.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
###################################################################################################
slinpack.$(SUFFIX) : linpack.c
@@ -2217,11 +2310,38 @@ cgemm3m.$(SUFFIX) : gemm3m.c
zgemm3m.$(SUFFIX) : gemm3m.c
$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
isamax.$(SUFFIX) : iamax.c
$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
idamax.$(SUFFIX) : iamax.c
$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^
icamax.$(SUFFIX) : iamax.c
$(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^
izamax.$(SUFFIX) : iamax.c
$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
snrm2.$(SUFFIX) : nrm2.c
$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
dnrm2.$(SUFFIX) : nrm2.c
$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^
scnrm2.$(SUFFIX) : nrm2.c
$(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^
dznrm2.$(SUFFIX) : nrm2.c
$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
smallscaling: smallscaling.c ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(EXTRALIB) -fopenmp -lm
$(CC) $(CFLAGS) -o $(@F) $^ $(EXTRALIB) -fopenmp -lm -lpthread
clean ::
@rm -f *.goto *.mkl *.acml *.atlas *.veclib *.essl
@rm -f *.goto *.mkl *.acml *.atlas *.veclib *.essl smallscaling
include $(TOPDIR)/Makefile.tail

View File

@@ -183,9 +183,9 @@ int main(int argc, char *argv[]){
timeg /= loops;
#ifdef COMPLEX
fprintf(stderr, " %10.2f MFlops\n", 4. * (double)m / timeg * 1.e-6);
fprintf(stderr, " %10.2f MFlops %10.6f sec\n", 4. * (double)m / timeg * 1.e-6, timeg);
#else
fprintf(stderr, " %10.2f MFlops\n", 2. * (double)m / timeg * 1.e-6);
fprintf(stderr, " %10.2f MFlops %10.6f sec\n", 2. * (double)m / timeg * 1.e-6, timeg);
#endif
}

View File

@@ -190,8 +190,8 @@ int main(int argc, char *argv[]){
timeg /= loops;
fprintf(stderr,
" %10.2f MFlops\n",
COMPSIZE * COMPSIZE * 2. * (double)m / timeg * 1.e-6);
" %10.2f MFlops %10.6f sec\n",
COMPSIZE * COMPSIZE * 2. * (double)m / timeg * 1.e-6, timeg);
}

View File

@@ -190,8 +190,8 @@ int main(int argc, char *argv[]){
timeg /= loops;
fprintf(stderr,
" %10.2f MBytes\n",
COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6);
" %10.2f MBytes %10.6f sec\n",
COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg);
}

View File

@@ -184,8 +184,8 @@ int main(int argc, char *argv[]){
timeg /= loops;
fprintf(stderr,
" %10.2f MFlops\n",
COMPSIZE * COMPSIZE * 2. * (double)m / timeg * 1.e-6);
" %10.2f MFlops %10.6f sec\n",
COMPSIZE * COMPSIZE * 2. * (double)m / timeg * 1.e-6, timeg);
}

View File

@@ -221,7 +221,7 @@ int main(int argc, char *argv[]){
timeg /= loops;
fprintf(stderr, " %10.2f MFlops\n", COMPSIZE * COMPSIZE * 2. * (double)m * (double)n / timeg * 1.e-6);
fprintf(stderr, " %10.2f MFlops %10.6f sec\n", COMPSIZE * COMPSIZE * 2. * (double)m * (double)n / timeg * 1.e-6, timeg);
}
}
@@ -258,7 +258,7 @@ int main(int argc, char *argv[]){
timeg /= loops;
fprintf(stderr, " %10.2f MFlops\n", COMPSIZE * COMPSIZE * 2. * (double)m * (double)n / timeg * 1.e-6);
fprintf(stderr, " %10.2f MFlops %10.6f sec\n", COMPSIZE * COMPSIZE * 2. * (double)m * (double)n / timeg * 1.e-6, timeg);
}
}

192
benchmark/iamax.c Normal file
View File

@@ -0,0 +1,192 @@
/***************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef IAMAX
#ifdef COMPLEX
#ifdef DOUBLE
#define IAMAX BLASFUNC(izamax)
#else
#define IAMAX BLASFUNC(icamax)
#endif
#else
#ifdef DOUBLE
#define IAMAX BLASFUNC(idamax)
#else
#define IAMAX BLASFUNC(isamax)
#endif
#endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]){
FLOAT *x;
blasint m, i;
blasint inc_x=1;
int loops = 1;
int l;
char *p;
int from = 1;
int to = 200;
int step = 1;
struct timeval start, stop;
double time1,timeg;
argc--;argv++;
if (argc > 0) { from = atol(*argv); argc--; argv++;}
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
if (argc > 0) { step = atol(*argv); argc--; argv++;}
if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops);
if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef linux
srandom(getpid());
#endif
fprintf(stderr, " SIZE Flops\n");
for(m = from; m <= to; m += step)
{
timeg=0;
fprintf(stderr, " %6d : ", (int)m);
for (l=0; l<loops; l++)
{
for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
gettimeofday( &start, (struct timezone *)0);
IAMAX (&m, x, &inc_x);
gettimeofday( &stop, (struct timezone *)0);
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
timeg += time1;
}
timeg /= loops;
fprintf(stderr,
" %10.2f MFlops %10.6f sec\n",
COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg);
}
return 0;
}
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

193
benchmark/nrm2.c Normal file
View File

@@ -0,0 +1,193 @@
/***************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef NRM2
#ifdef COMPLEX
#ifdef DOUBLE
#define NRM2 BLASFUNC(dznrm2)
#else
#define NRM2 BLASFUNC(scnrm2)
#endif
#else
#ifdef DOUBLE
#define NRM2 BLASFUNC(dnrm2)
#else
#define NRM2 BLASFUNC(snrm2)
#endif
#endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]){
FLOAT *x;
blasint m, i;
blasint inc_x=1;
int loops = 1;
int l;
char *p;
int from = 1;
int to = 200;
int step = 1;
struct timeval start, stop;
double time1,timeg;
argc--;argv++;
if (argc > 0) { from = atol(*argv); argc--; argv++;}
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
if (argc > 0) { step = atol(*argv); argc--; argv++;}
if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops);
if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef linux
srandom(getpid());
#endif
fprintf(stderr, " SIZE Flops\n");
for(m = from; m <= to; m += step)
{
timeg=0;
fprintf(stderr, " %6d : ", (int)m);
for (l=0; l<loops; l++)
{
for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
gettimeofday( &start, (struct timezone *)0);
NRM2 (&m, x, &inc_x);
gettimeofday( &stop, (struct timezone *)0);
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
timeg += time1;
}
timeg /= loops;
fprintf(stderr,
" %10.2f MFlops %10.6f sec\n",
COMPSIZE * COMPSIZE * 2. * (double)m / timeg * 1.e-6, timeg);
}
return 0;
}
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

View File

@@ -186,8 +186,8 @@ int main(int argc, char *argv[]){
timeg /= loops;
fprintf(stderr,
" %10.2f MFlops\n",
COMPSIZE * COMPSIZE * 6. * (double)m / timeg * 1.e-6);
" %10.2f MFlops %10.6f sec\n",
COMPSIZE * COMPSIZE * 6. * (double)m / timeg * 1.e-6, timeg);
}

View File

@@ -189,9 +189,9 @@ int main(int argc, char *argv[]){
timeg /= loops;
#ifdef COMPLEX
fprintf(stderr, " %10.2f MFlops\n", 6. * (double)m / timeg * 1.e-6);
fprintf(stderr, " %10.2f MFlops %10.6f sec\n", 6. * (double)m / timeg * 1.e-6, timeg);
#else
fprintf(stderr, " %10.2f MFlops\n", 1. * (double)m / timeg * 1.e-6);
fprintf(stderr, " %10.2f MFlops %10.6f sec\n", 1. * (double)m / timeg * 1.e-6, timeg);
#endif
}

View File

@@ -2,61 +2,54 @@
argv <- commandArgs(trailingOnly = TRUE)
nfrom = 128
nto = 2048
nstep = 128
loops = 1
nfrom <- 128
nto <- 2048
nstep <- 128
loops <- 1
if ( length(argv) > 0 ) {
for ( z in 1:length(argv) ) {
if ( z == 1 ) {
nfrom <- as.numeric(argv[z])
} else if ( z==2 ) {
nto <- as.numeric(argv[z])
} else if ( z==3 ) {
nstep <- as.numeric(argv[z])
} else if ( z==4 ) {
loops <- as.numeric(argv[z])
}
}
if (length(argv) > 0) {
for (z in 1:length(argv)) {
if (z == 1) {
nfrom <- as.numeric(argv[z])
} else if (z == 2) {
nto <- as.numeric(argv[z])
} else if (z == 3) {
nstep <- as.numeric(argv[z])
} else if (z == 4) {
loops <- as.numeric(argv[z])
}
}
}
p=Sys.getenv("OPENBLAS_LOOPS")
if ( p != "" ) {
loops <- as.numeric(p)
}
p <- Sys.getenv("OPENBLAS_LOOPS")
if (p != "") {
loops <- as.numeric(p)
}
cat(sprintf("From %.0f To %.0f Step=%.0f Loops=%.0f\n",nfrom, nto, nstep, loops))
cat(sprintf(
"From %.0f To %.0f Step=%.0f Loops=%.0f\n",
nfrom,
nto,
nstep,
loops
))
cat(sprintf(" SIZE Flops Time\n"))
n = nfrom
while ( n <= nto ) {
n <- nfrom
while (n <= nto) {
A <- matrix(rnorm(n * n), ncol = n, nrow = n)
ev <- 0
z <- system.time(for (l in 1:loops) {
ev <- eigen(A)
})
A <- matrix(runif(n*n), ncol = n, nrow = n, byrow = TRUE)
l = 1
mflops <- (26.66 * n * n * n) * loops / (z[3] * 1.0e6)
start <- proc.time()[3]
st <- sprintf("%.0fx%.0f :", n, n)
cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, z[3]))
while ( l <= loops ) {
ev <- eigen(A)
l = l + 1
}
end <- proc.time()[3]
timeg = end - start
mflops = (26.66 *n*n*n ) * loops / ( timeg * 1.0e6 )
st = sprintf("%.0fx%.0f :",n , n)
cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, timeg))
n = n + nstep
n <- n + nstep
}

View File

@@ -2,62 +2,63 @@
argv <- commandArgs(trailingOnly = TRUE)
nfrom = 128
nto = 2048
nstep = 128
loops = 1
nfrom <- 128
nto <- 2048
nstep <- 128
loops <- 1
if ( length(argv) > 0 ) {
for ( z in 1:length(argv) ) {
if ( z == 1 ) {
nfrom <- as.numeric(argv[z])
} else if ( z==2 ) {
nto <- as.numeric(argv[z])
} else if ( z==3 ) {
nstep <- as.numeric(argv[z])
} else if ( z==4 ) {
loops <- as.numeric(argv[z])
}
}
if (length(argv) > 0) {
for (z in 1:length(argv)) {
if (z == 1) {
nfrom <- as.numeric(argv[z])
} else if (z == 2) {
nto <- as.numeric(argv[z])
} else if (z == 3) {
nstep <- as.numeric(argv[z])
} else if (z == 4) {
loops <- as.numeric(argv[z])
}
}
}
p=Sys.getenv("OPENBLAS_LOOPS")
if ( p != "" ) {
loops <- as.numeric(p)
}
p <- Sys.getenv("OPENBLAS_LOOPS")
if (p != "") {
loops <- as.numeric(p)
}
cat(sprintf("From %.0f To %.0f Step=%.0f Loops=%.0f\n",nfrom, nto, nstep, loops))
cat(sprintf(
"From %.0f To %.0f Step=%.0f Loops=%.0f\n",
nfrom,
nto,
nstep,
loops
))
cat(sprintf(" SIZE Flops Time\n"))
n = nfrom
while ( n <= nto ) {
n <- nfrom
while (n <= nto) {
A <- matrix(runif(n * n),
ncol = n,
nrow = n,
byrow = TRUE)
B <- matrix(runif(n * n),
ncol = n,
nrow = n,
byrow = TRUE)
C <- 1
A <- matrix(runif(n*n), ncol = n, nrow = n, byrow = TRUE)
B <- matrix(runif(n*n), ncol = n, nrow = n, byrow = TRUE)
l = 1
z <- system.time(for (l in 1:loops) {
C <- A %*% B
l <- l + 1
})
start <- proc.time()[3]
mflops <- (2.0 * n * n * n) * loops / (z[3] * 1.0e6)
while ( l <= loops ) {
st <- sprintf("%.0fx%.0f :", n, n)
cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, z[3]))
C <- A %*% B
l = l + 1
}
end <- proc.time()[3]
timeg = end - start
mflops = ( 2.0 *n*n*n ) * loops / ( timeg * 1.0e6 )
st = sprintf("%.0fx%.0f :",n , n)
cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, timeg))
n = n + nstep
n <- n + nstep
}

View File

@@ -2,62 +2,56 @@
argv <- commandArgs(trailingOnly = TRUE)
nfrom = 128
nto = 2048
nstep = 128
loops = 1
nfrom <- 128
nto <- 2048
nstep <- 128
loops <- 1
if ( length(argv) > 0 ) {
for ( z in 1:length(argv) ) {
if ( z == 1 ) {
nfrom <- as.numeric(argv[z])
} else if ( z==2 ) {
nto <- as.numeric(argv[z])
} else if ( z==3 ) {
nstep <- as.numeric(argv[z])
} else if ( z==4 ) {
loops <- as.numeric(argv[z])
}
}
if (length(argv) > 0) {
for (z in 1:length(argv)) {
if (z == 1) {
nfrom <- as.numeric(argv[z])
} else if (z == 2) {
nto <- as.numeric(argv[z])
} else if (z == 3) {
nstep <- as.numeric(argv[z])
} else if (z == 4) {
loops <- as.numeric(argv[z])
}
}
}
p=Sys.getenv("OPENBLAS_LOOPS")
if ( p != "" ) {
loops <- as.numeric(p)
}
p <- Sys.getenv("OPENBLAS_LOOPS")
if (p != "") {
loops <- as.numeric(p)
}
cat(sprintf("From %.0f To %.0f Step=%.0f Loops=%.0f\n",nfrom, nto, nstep, loops))
cat(sprintf(
"From %.0f To %.0f Step=%.0f Loops=%.0f\n",
nfrom,
nto,
nstep,
loops
))
cat(sprintf(" SIZE Flops Time\n"))
n = nfrom
while ( n <= nto ) {
n <- nfrom
while (n <= nto) {
A <- matrix(rnorm(n * n), ncol = n, nrow = n)
B <- matrix(rnorm(n * n), ncol = n, nrow = n)
A <- matrix(runif(n*n), ncol = n, nrow = n, byrow = TRUE)
B <- matrix(runif(n*n), ncol = n, nrow = n, byrow = TRUE)
l = 1
z <- system.time(for (l in 1:loops) {
solve(A, B)
})
start <- proc.time()[3]
mflops <-
(2.0 / 3.0 * n * n * n + 2.0 * n * n * n) * loops / (z[3] * 1.0e6)
while ( l <= loops ) {
st <- sprintf("%.0fx%.0f :", n, n)
cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, z[3]))
solve(A,B)
l = l + 1
}
end <- proc.time()[3]
timeg = end - start
mflops = (2.0/3.0 *n*n*n + 2.0 *n*n*n ) * loops / ( timeg * 1.0e6 )
st = sprintf("%.0fx%.0f :",n , n)
cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, timeg))
n = n + nstep
n <- n + nstep
}

View File

@@ -5,6 +5,7 @@
#include <time.h>
#include <cblas.h>
#include <omp.h>
#include <pthread.h>
#define MIN_SIZE 5
#define MAX_SIZE 60
#define NB_SIZE 10

View File

@@ -190,8 +190,8 @@ int main(int argc, char *argv[]){
timeg /= loops;
fprintf(stderr,
" %10.2f MBytes\n",
COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6);
" %10.2f MBytes %10.6f sec\n",
COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg);
}

View File

@@ -191,8 +191,8 @@ int main(int argc, char *argv[]){
gettimeofday( &start, (struct timezone *)0);
fprintf(stderr,
" %10.2f MFlops\n",
COMPSIZE * COMPSIZE * 1. * (double)m * (double)m * (double)m / time1 * 1.e-6);
" %10.2f MFlops %10.6f sec\n",
COMPSIZE * COMPSIZE * 1. * (double)m * (double)m * (double)m / time1 * 1.e-6, time1);
}

View File

@@ -184,8 +184,8 @@ int main(int argc, char *argv[]){
timeg /= loops;
fprintf(stderr,
" %10.2f MFlops\n",
COMPSIZE * COMPSIZE * 2. * (double)m / timeg * 1.e-6);
" %10.2f MFlops %10.6f sec\n",
COMPSIZE * COMPSIZE * 2. * (double)m / timeg * 1.e-6, timeg);
}

65
c_check
View File

@@ -1,5 +1,8 @@
#!/usr/bin/perl
use File::Basename;
use File::Temp qw(tempfile);
# Checking cross compile
$hostos = `uname -s | sed -e s/\-.*//`; chop($hostos);
$hostarch = `uname -m | sed -e s/i.86/x86/`;chop($hostarch);
@@ -7,7 +10,9 @@ $hostarch = "x86_64" if ($hostarch eq "amd64");
$hostarch = "arm" if ($hostarch =~ /^arm.*/);
$hostarch = "arm64" if ($hostarch eq "aarch64");
$hostarch = "power" if ($hostarch =~ /^(powerpc|ppc).*/);
$hostarch = "zarch" if ($hostarch eq "s390x");
$tmpf = new File::Temp( UNLINK => 1 );
$binary = $ENV{"BINARY"};
$makefile = shift(@ARGV);
@@ -26,14 +31,12 @@ if ($?) {
$cross_suffix = "";
if ($ARGV[0] =~ /(.*)(-[.\d]+)/) {
if ($1 =~ /(.*-)(.*)/) {
$cross_suffix = $1;
}
} else {
if ($ARGV[0] =~ /([^\/]*-)([^\/]*$)/) {
$cross_suffix = $1;
}
if (dirname($compiler_name) ne ".") {
$cross_suffix .= dirname($compiler_name) . "/";
}
if (basename($compiler_name) =~ /([^\s]*-)(.*)/) {
$cross_suffix .= $1;
}
$compiler = "";
@@ -63,13 +66,14 @@ $os = Android if ($data =~ /OS_ANDROID/);
$architecture = x86 if ($data =~ /ARCH_X86/);
$architecture = x86_64 if ($data =~ /ARCH_X86_64/);
$architecture = power if ($data =~ /ARCH_POWER/);
$architecture = mips32 if ($data =~ /ARCH_MIPS32/);
$architecture = mips if ($data =~ /ARCH_MIPS/);
$architecture = mips64 if ($data =~ /ARCH_MIPS64/);
$architecture = alpha if ($data =~ /ARCH_ALPHA/);
$architecture = sparc if ($data =~ /ARCH_SPARC/);
$architecture = ia64 if ($data =~ /ARCH_IA64/);
$architecture = arm if ($data =~ /ARCH_ARM/);
$architecture = arm64 if ($data =~ /ARCH_ARM64/);
$architecture = zarch if ($data =~ /ARCH_ZARCH/);
$defined = 0;
@@ -79,7 +83,12 @@ if ($os eq "AIX") {
$defined = 1;
}
if (($architecture eq "mips32") || ($architecture eq "mips64")) {
if ($architecture eq "mips") {
$compiler_name .= " -mabi=32";
$defined = 1;
}
if ($architecture eq "mips64") {
$compiler_name .= " -mabi=n32" if ($binary eq "32");
$compiler_name .= " -mabi=64" if ($binary eq "64");
$defined = 1;
@@ -89,6 +98,11 @@ if (($architecture eq "arm") || ($architecture eq "arm64")) {
$defined = 1;
}
if ($architecture eq "zarch") {
$defined = 1;
$binary = 64;
}
if ($architecture eq "alpha") {
$defined = 1;
$binary = 64;
@@ -152,16 +166,35 @@ if ($?) {
die 1;
}
$have_msa = 0;
if (($architecture eq "mips") || ($architecture eq "mips64")) {
$code = '"addvi.b $w0, $w1, 1"';
$msa_flags = "-mmsa -mfp64 -msched-weight -mload-store-pairs";
print $tmpf "#include <msa.h>\n\n";
print $tmpf "void main(void){ __asm__ volatile($code); }\n";
$args = "$msa_flags -o $tmpf.o -x c $tmpf";
my @cmd = ("$compiler_name $args");
system(@cmd) == 0;
if ($? != 0) {
$have_msa = 0;
} else {
$have_msa = 1;
}
unlink("$tmpf.o");
}
$architecture = x86 if ($data =~ /ARCH_X86/);
$architecture = x86_64 if ($data =~ /ARCH_X86_64/);
$architecture = power if ($data =~ /ARCH_POWER/);
$architecture = mips32 if ($data =~ /ARCH_MIPS32/);
$architecture = mips if ($data =~ /ARCH_MIPS/);
$architecture = mips64 if ($data =~ /ARCH_MIPS64/);
$architecture = alpha if ($data =~ /ARCH_ALPHA/);
$architecture = sparc if ($data =~ /ARCH_SPARC/);
$architecture = ia64 if ($data =~ /ARCH_IA64/);
$architecture = arm if ($data =~ /ARCH_ARM/);
$architecture = arm64 if ($data =~ /ARCH_ARM64/);
$architecture = zarch if ($data =~ /ARCH_ZARCH/);
$binformat = bin32;
$binformat = bin64 if ($data =~ /BINARY_64/);
@@ -209,6 +242,11 @@ $linker_a = "";
$linker_L .= "-Wl,". $flags . " "
}
if ($flags =~ /^\--exclude-libs/) {
$linker_L .= "-Wl,". $flags . " ";
$flags="";
}
if (
($flags =~ /^\-l/)
&& ($flags !~ /gfortranbegin/)
@@ -243,9 +281,11 @@ print MAKEFILE "BINARY64=\n" if $binformat ne bin64;
print MAKEFILE "BINARY32=1\n" if $binformat eq bin32;
print MAKEFILE "BINARY64=1\n" if $binformat eq bin64;
print MAKEFILE "FU=$need_fu\n" if $need_fu ne "";
print MAKEFILE "CROSS_SUFFIX=$cross_suffix\n" if $cross_suffix ne "";
print MAKEFILE "CROSS_SUFFIX=$cross_suffix\n" if $cross != 0 && $cross_suffix ne "";
print MAKEFILE "CROSS=1\n" if $cross != 0;
print MAKEFILE "CEXTRALIB=$linker_L $linker_l $linker_a\n";
print MAKEFILE "HAVE_MSA=1\n" if $have_msa eq 1;
print MAKEFILE "MSA_FLAGS=$msa_flags\n" if $have_msa eq 1;
$os =~ tr/[a-z]/[A-Z]/;
$architecture =~ tr/[a-z]/[A-Z]/;
@@ -257,6 +297,7 @@ print CONFFILE "#define C_$compiler\t1\n";
print CONFFILE "#define __32BIT__\t1\n" if $binformat eq bin32;
print CONFFILE "#define __64BIT__\t1\n" if $binformat eq bin64;
print CONFFILE "#define FUNDERSCORE\t$need_fu\n" if $need_fu ne "";
print CONFFILE "#define HAVE_MSA\t1\n" if $have_msa eq 1;
if ($os eq "LINUX") {

View File

@@ -73,7 +73,7 @@ if (DYNAMIC_ARCH)
set(DYNAMIC_CORE "${DYNAMIC_CORE} SANDYBRIDGE BULLDOZER PILEDRIVER STEAMROLLER")
endif ()
if (NOT NO_AVX2)
set(DYNAMIC_CORE "${DYNAMIC_CORE} HASWELL")
set(DYNAMIC_CORE "${DYNAMIC_CORE} HASWELL ZEN")
endif ()
endif ()

View File

@@ -73,6 +73,10 @@ if (${ARCH} STREQUAL "X86")
set(ARCH x86)
endif ()
if (${ARCH} MATCHES "ppc")
set(ARCH power)
endif ()
set(COMPILER_ID ${CMAKE_CXX_COMPILER_ID})
if (${COMPILER_ID} STREQUAL "GNU")
set(COMPILER_ID "GCC")
@@ -87,3 +91,8 @@ file(WRITE ${TARGET_CONF}
"#define __${BINARY}BIT__\t1\n"
"#define FUNDERSCORE\t${FU}\n")
if (${HOST_OS} STREQUAL "WINDOWSSTORE")
file(APPEND ${TARGET_CONF}
"#define OS_WINNT\t1\n")
endif ()

View File

@@ -53,7 +53,7 @@ endif()
add_custom_command(
TARGET ${OpenBLAS_LIBNAME} PRE_LINK
COMMAND perl
ARGS "${CMAKE_SOURCE_DIR}/exports/gensymbol" "win2k" "${ARCH_IN}" "dummy" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" "${SYMBOLPREFIX}" "${SYMBOLSUFFIX}" > "${PROJECT_BINARY_DIR}/openblas.def"
ARGS "${PROJECT_SOURCE_DIR}/exports/gensymbol" "win2k" "${ARCH_IN}" "dummy" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" "${SYMBOLPREFIX}" "${SYMBOLSUFFIX}" > "${PROJECT_BINARY_DIR}/openblas.def"
COMMENT "Create openblas.def file"
VERBATIM)

View File

@@ -3,6 +3,21 @@
## Description: Ported from portion of OpenBLAS/Makefile.system
## Sets Fortran related variables.
if (${F_COMPILER} STREQUAL "FLANG")
set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_FLANG")
if (BINARY64)
if (INTERFACE64)
set(FCOMMON_OPT "${FCOMMON_OPT} -i8")
endif ()
set(FCOMMON_OPT "${FCOMMON_OPT} -Wall")
else ()
set(FCOMMON_OPT "${FCOMMON_OPT} -Wall")
endif ()
if (USE_OPENMP)
set(FCOMMON_OPT "${FCOMMON_OPT} -fopenmp")
endif ()
endif ()
if (${F_COMPILER} STREQUAL "G77")
set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_G77")
set(FCOMMON_OPT "${FCOMMON_OPT} -Wall")

View File

@@ -2,7 +2,7 @@
set(ALLAUX
ilaenv.f ieeeck.f lsamen.f xerbla_array.f iparmq.f
ilaprec.f ilatrans.f ilauplo.f iladiag.f chla_transtype.f
ilaprec.f ilatrans.f ilauplo.f iladiag.f iparam2stage.F chla_transtype.f
../INSTALL/ilaver.f ../INSTALL/slamch.f
)
@@ -26,7 +26,7 @@ set(SCLAUX
)
set(DZLAUX
dbdsdc.f
dbdsdc.f dbdsvdx.f
dbdsqr.f ddisna.f dlabad.f dlacpy.f dladiv.f dlae2.f dlaebz.f
dlaed0.f dlaed1.f dlaed2.f dlaed3.f dlaed4.f dlaed5.f dlaed6.f
dlaed7.f dlaed8.f dlaed9.f dlaeda.f dlaev2.f dlagtf.f
@@ -42,20 +42,28 @@ set(DZLAUX
dsteqr.f dsterf.f dlaisnan.f disnan.f
dlartgp.f dlartgs.f
../INSTALL/dlamch.f ../INSTALL/dsecnd_${TIMER}.f
dgelq.f dgelqt.f dgelqt3.f dgemlq.f dgemlqt.f dgemqr.f dgeqr.f
dgetsls.f dlamswlq.f dlamtsqr.f dlaswlq.f dlatsqr.f dtplqt.f
dtplqt2.f dtpmlqt.f dsysv_aa.f dsytrf_aa.f dsytrs_aa.f dlasyf_aa.f
dsytf2_rk.f dlasyf_rk.f dsytrf_rk.f dsytrs_3.f dsycon_3.f dsytri_3.f
dsytri_3x.f dsysv_rk.f dsb2st_kernels.f dsbev_2stage.f dsbevd_2stage.f
dsbevx_2stage.f dsyev_2stage.f dsyevd_2stage.f dsyevr_2stage.f
dsyevx_2stage.f dsygv_2stage.f dsytrd_2stage.f dsytrd_sb2st.F
dsytrd_sy2sb.f dlarfy.f
)
set(SLASRC
sgbbrd.f sgbcon.f sgbequ.f sgbrfs.f sgbsv.f
sbdsvdx.f sgbbrd.f sgbcon.f sgbequ.f sgbrfs.f sgbsv.f
sgbsvx.f sgbtf2.f sgbtrf.f sgbtrs.f sgebak.f sgebal.f sgebd2.f
sgebrd.f sgecon.f sgeequ.f sgees.f sgeesx.f sgeev.f sgeevx.f
DEPRECATED/sgegs.f DEPRECATED/sgegv.f sgehd2.f sgehrd.f sgelq2.f sgelqf.f
sgels.f sgelsd.f sgelss.f DEPRECATED/sgelsx.f sgelsy.f sgeql2.f sgeqlf.f
sgeqp3.f DEPRECATED/sgeqpf.f sgeqr2.f sgeqr2p.f sgeqrf.f sgeqrfp.f sgerfs.f
sgerq2.f sgerqf.f sgesc2.f sgesdd.f sgesvd.f sgesvx.f
sgetc2.f sgetri.f
sggbak.f sggbal.f sgges.f sggesx.f sggev.f sggevx.f
sgerq2.f sgerqf.f sgesc2.f sgesdd.f sgesvd.f sgesvdx.f sgesvx.f
sgetc2.f sgetri.f sgetrf2.f
sggbak.f sggbal.f sgghd3.f sgges.f sgges3.f sggesx.f sggev.f sggev3.f sggevx.f
sggglm.f sgghrd.f sgglse.f sggqrf.f
sggrqf.f DEPRECATED/sggsvd.f DEPRECATED/sggsvp.f sgtcon.f sgtrfs.f sgtsv.f
sggrqf.f DEPRECATED/sggsvd.f sggsvd3.f DEPRECATED/sggsvp.f sggsvp3.f sgtcon.f sgtrfs.f sgtsv.f
sgtsvx.f sgttrf.f sgttrs.f sgtts2.f shgeqz.f
shsein.f shseqr.f slabrd.f slacon.f slacn2.f
slaein.f slaexc.f slag2.f slags2.f slagtm.f slagv2.f slahqr.f
@@ -72,7 +80,7 @@ set(SLASRC
slatbs.f slatdf.f slatps.f slatrd.f slatrs.f slatrz.f DEPRECATED/slatzm.f
sopgtr.f sopmtr.f sorg2l.f sorg2r.f
sorgbr.f sorghr.f sorgl2.f sorglq.f sorgql.f sorgqr.f sorgr2.f
sorgrq.f sorgtr.f sorm2l.f sorm2r.f
sorgrq.f sorgtr.f sorm2l.f sorm2r.f sorm22.f
sormbr.f sormhr.f sorml2.f sormlq.f sormql.f sormqr.f sormr2.f
sormr3.f sormrq.f sormrz.f sormtr.f spbcon.f spbequ.f spbrfs.f
spbstf.f spbsv.f spbsvx.f
@@ -96,7 +104,7 @@ set(SLASRC
stbrfs.f stbtrs.f stgevc.f stgex2.f stgexc.f stgsen.f
stgsja.f stgsna.f stgsy2.f stgsyl.f stpcon.f stprfs.f stptri.f
stptrs.f
strcon.f strevc.f strexc.f strrfs.f strsen.f strsna.f strsyl.f
strcon.f strevc.f strevc3.f strexc.f strrfs.f strsen.f strsna.f strsyl.f
strtrs.f DEPRECATED/stzrqf.f stzrzf.f sstemr.f
slansf.f spftrf.f spftri.f spftrs.f ssfrk.f stfsm.f stftri.f stfttp.f
stfttr.f stpttf.f stpttr.f strttf.f strttp.f
@@ -106,9 +114,16 @@ set(SLASRC
sorbdb5.f sorbdb6.f sorcsd.f sorcsd2by1.f
sgeqrt.f sgeqrt2.f sgeqrt3.f sgemqrt.f
stpqrt.f stpqrt2.f stpmqrt.f stprfb.f spotri.f
sgelq.f sgelqt.f sgelqt3.f sgemlq.f sgemlqt.f sgemqr.f sgeqr.f sgetsls.f
slamswlq.f slamtsqr.f slaswlq.f slatsqr.f stplqt.f stplqt2.f stpmlqt.f
ssysv_aa.f ssytrf_aa.f ssytrs_aa.f slasyf_aa.f ssytf2_rk.f slasyf_rk.f
ssytrf_rk.f ssytrs_3.f ssycon_3.f ssytri_3.f ssytri_3x.f ssysv_rk.f
ssb2st_kernels.f ssbev_2stage.f ssbevd_2stage.f ssbevx_2stage.f
ssyev_2stage.f ssyevd_2stage.f ssyevr_2stage.f ssyevx_2stage.f
ssygv_2stage.f ssytrd_2stage.f ssytrd_sb2st.F ssytrd_sy2sb.f slarfy.f
)
set(DSLASRC spotrs.f)
set(DSLASRC spotrs.f spotrf2.f)
set(CLASRC
cbdsqr.f cgbbrd.f cgbcon.f cgbequ.f cgbrfs.f cgbsv.f cgbsvx.f
@@ -165,7 +180,7 @@ set(CLASRC
ctbcon.f ctbrfs.f ctbtrs.f ctgevc.f ctgex2.f
ctgexc.f ctgsen.f ctgsja.f ctgsna.f ctgsy2.f ctgsyl.f ctpcon.f
ctprfs.f ctptri.f
ctptrs.f ctrcon.f ctrevc.f ctrexc.f ctrrfs.f ctrsen.f ctrsna.f
ctptrs.f ctrcon.f ctrevc.f ctrevc3.f ctrexc.f ctrrfs.f ctrsen.f ctrsna.f
ctrsyl.f ctrtrs.f DEPRECATED/ctzrqf.f ctzrzf.f cung2l.f cung2r.f
cungbr.f cunghr.f cungl2.f cunglq.f cungql.f cungqr.f cungr2.f
cungrq.f cungtr.f cunm2l.f cunm2r.f cunmbr.f cunmhr.f cunml2.f
@@ -178,6 +193,14 @@ set(CLASRC
cunbdb5.f cunbdb6.f cuncsd.f cuncsd2by1.f
cgeqrt.f cgeqrt2.f cgeqrt3.f cgemqrt.f
ctpqrt.f ctpqrt2.f ctpmqrt.f ctprfb.f cpotri.f
cgelq.f cgelqt.f cgelqt3.f cgemlq.f cgemlqt.f cgemqr.f cgeqr.f cgetsls.f
clamswlq.f clamtsqr.f claswlq.f clatsqr.f ctplqt.f ctplqt2.f ctpmlqt.f
chesv_aa.f chetrf_aa.f chetrs_aa.f clahef_aa.f csytf2_rk.f clasyf_rk.f
csytrf_rk.f csytrs_3.f csycon_3.f csytri_3.f csytri_3x.f csysv_rk.f
chetf2_rk.f clahef_rk.f chetrf_rk.f chetrs_3.f checon_3.f chetri_3.f
chetri_3x.f chesv_rk.f chb2st_kernels.f chbev_2stage.f chbevd_2stage.f
chbevx_2stage.f cheev_2stage.f cheevd_2stage.f cheevr_2stage.f cheevx_2stage.f
chegv_2stage.f chetrd_2stage.f chetrd_hb2st.F chetrd_he2hb.f clarfy.f
)
set(ZCLASRC cpotrs.f)
@@ -189,11 +212,11 @@ set(DLASRC
DEPRECATED/dgegs.f DEPRECATED/dgegv.f dgehd2.f dgehrd.f dgelq2.f dgelqf.f
dgels.f dgelsd.f dgelss.f DEPRECATED/dgelsx.f dgelsy.f dgeql2.f dgeqlf.f
dgeqp3.f DEPRECATED/dgeqpf.f dgeqr2.f dgeqr2p.f dgeqrf.f dgeqrfp.f dgerfs.f
dgerq2.f dgerqf.f dgesc2.f dgesdd.f dgesvd.f dgesvx.f
dgetc2.f dgetri.f
dggbak.f dggbal.f dgges.f dggesx.f dggev.f dggevx.f
dggglm.f dgghrd.f dgglse.f dggqrf.f
dggrqf.f DEPRECATED/dggsvd.f DEPRECATED/dggsvp.f dgtcon.f dgtrfs.f dgtsv.f
dgerq2.f dgerqf.f dgesc2.f dgesdd.f dgesvd.f dgesvdx.f dgesvx.f
dgetc2.f dgetri.f dgetrf2.f
dggbak.f dggbal.f dgges.f dgges3.f dggesx.f dggev.f dggev3.f dggevx.f
dggglm.f dgghd3.f dgghrd.f dgglse.f dggqrf.f
dggrqf.f dggsvd3.f dggsvp3.f DEPRECATED/dggsvd.f DEPRECATED/dggsvp.f dgtcon.f dgtrfs.f dgtsv.f
dgtsvx.f dgttrf.f dgttrs.f dgtts2.f dhgeqz.f
dhsein.f dhseqr.f dlabrd.f dlacon.f dlacn2.f
dlaein.f dlaexc.f dlag2.f dlags2.f dlagtm.f dlagv2.f dlahqr.f
@@ -210,12 +233,12 @@ set(DLASRC
dlatbs.f dlatdf.f dlatps.f dlatrd.f dlatrs.f dlatrz.f DEPRECATED/dlatzm.f
dopgtr.f dopmtr.f dorg2l.f dorg2r.f
dorgbr.f dorghr.f dorgl2.f dorglq.f dorgql.f dorgqr.f dorgr2.f
dorgrq.f dorgtr.f dorm2l.f dorm2r.f
dorgrq.f dorgtr.f dorm2l.f dorm2r.f dorm22.f
dormbr.f dormhr.f dorml2.f dormlq.f dormql.f dormqr.f dormr2.f
dormr3.f dormrq.f dormrz.f dormtr.f dpbcon.f dpbequ.f dpbrfs.f
dpbstf.f dpbsv.f dpbsvx.f
dpbtf2.f dpbtrf.f dpbtrs.f dpocon.f dpoequ.f dporfs.f dposv.f
dposvx.f dpotrs.f dpstrf.f dpstf2.f
dposvx.f dpotrf2.f dpotrs.f dpstrf.f dpstf2.f
dppcon.f dppequ.f
dpprfs.f dppsv.f dppsvx.f dpptrf.f dpptri.f dpptrs.f dptcon.f
dpteqr.f dptrfs.f dptsv.f dptsvx.f dpttrs.f dptts2.f drscl.f
@@ -234,7 +257,7 @@ set(DLASRC
dtbcon.f dtbrfs.f dtbtrs.f dtgevc.f dtgex2.f dtgexc.f dtgsen.f
dtgsja.f dtgsna.f dtgsy2.f dtgsyl.f dtpcon.f dtprfs.f dtptri.f
dtptrs.f
dtrcon.f dtrevc.f dtrexc.f dtrrfs.f dtrsen.f dtrsna.f dtrsyl.f
dtrcon.f dtrevc.f dtrevc3.f dtrexc.f dtrrfs.f dtrsen.f dtrsna.f dtrsyl.f
dtrtrs.f DEPRECATED/dtzrqf.f dtzrzf.f dstemr.f
dsgesv.f dsposv.f dlag2s.f slag2d.f dlat2s.f
dlansf.f dpftrf.f dpftri.f dpftrs.f dsfrk.f dtfsm.f dtftri.f dtfttp.f
@@ -245,20 +268,28 @@ set(DLASRC
dorbdb5.f dorbdb6.f dorcsd.f dorcsd2by1.f
dgeqrt.f dgeqrt2.f dgeqrt3.f dgemqrt.f
dtpqrt.f dtpqrt2.f dtpmqrt.f dtprfb.f dpotri.f
dgelq.f dgelqt.f dgelqt3.f dgemlq.f dgemlqt.f dgemqr.f dgeqr.f dgetsls.f
dlamswlq.f dlamtsqr.f dlaswlq.f dlatsqr.f dtplqt.f dtplqt2.f dtpmlqt.f
dsysv_aa.f dsytrf_aa.f dsytrs_aa.f dlasyf_aa.f dsytf2_rk.f dlasyf_rk.f
dsytrf_rk.f dsytrs_3.f dsycon_3.f dsytri_3.f dsytri_3x.f dsysv_rk.f
dsb2st_kernels.f dsbev_2stage.f dsbevd_2stage.f dsbevx_2stage.f
dsyev_2stage.f dsyevd_2stage.f dsyevr_2stage.f dsyevx_2stage.f
dsygv_2stage.f dsytrd_2stage.f dsytrd_sb2st.F dsytrd_sy2sb.f dlarfy.f
)
set(ZLASRC
zbdsqr.f zgbbrd.f zgbcon.f zgbequ.f zgbrfs.f zgbsv.f zgbsvx.f
zgbtf2.f zgbtrf.f zgbtrs.f zgebak.f zgebal.f zgebd2.f zgebrd.f
zgecon.f zgeequ.f zgees.f zgeesx.f zgeev.f zgeevx.f
DEPRECATED/zgegs.f DEPRECATED/zgegv.f zgehd2.f zgehrd.f zgelq2.f zgelqf.f
DEPRECATED/zgegs.f DEPRECATED/zgegv.f zgehd2.f zgehrd.f zgejsv.f zgelq2.f zgelqf.f
zgels.f zgelsd.f zgelss.f DEPRECATED/zgelsx.f zgelsy.f zgeql2.f zgeqlf.f zgeqp3.f
DEPRECATED/zgeqpf.f zgeqr2.f zgeqr2p.f zgeqrf.f zgeqrfp.f zgerfs.f zgerq2.f zgerqf.f
zgesc2.f zgesdd.f zgesvd.f zgesvx.f zgetc2.f
zgetri.f
zggbak.f zggbal.f zgges.f zggesx.f zggev.f zggevx.f zggglm.f
zgghrd.f zgglse.f zggqrf.f zggrqf.f
DEPRECATED/zggsvd.f DEPRECATED/zggsvp.f
zgesc2.f zgesdd.f zgesvd.f zgesvdx.f zgesvj.f zgesvx.f zgetc2.f
zgetri.f zgetrf2.f
zggbak.f zggbal.f zgges.f zgges3.f zggesx.f zggev.f zggev3.f zggevx.f zggglm.f
zgghd3.f zgghrd.f zgglse.f zggqrf.f zggrqf.f
DEPRECATED/zggsvd.f zggsvd3.f DEPRECATED/zggsvp.f zggsvp3.f
zgsvj0.f zgsvj1.f
zgtcon.f zgtrfs.f zgtsv.f zgtsvx.f zgttrf.f zgttrs.f zgtts2.f zhbev.f
zhbevd.f zhbevx.f zhbgst.f zhbgv.f zhbgvd.f zhbgvx.f zhbtrd.f
zhecon.f zheev.f zheevd.f zheevr.f zheevx.f zhegs2.f zhegst.f
@@ -287,28 +318,28 @@ set(ZLASRC
zlarfg.f zlarft.f zlarfgp.f
zlarfx.f zlargv.f zlarnv.f zlarrv.f zlartg.f zlartv.f
zlarz.f zlarzb.f zlarzt.f zlascl.f zlaset.f zlasr.f
zlassq.f zlasyf.f zlasyf_rook.f
zlassq.f zlasyf.f zlasyf_rook.f zlasyf_aa.f
zlatbs.f zlatdf.f zlatps.f zlatrd.f zlatrs.f zlatrz.f DEPRECATED/zlatzm.f
zpbcon.f zpbequ.f zpbrfs.f zpbstf.f zpbsv.f
zpbsvx.f zpbtf2.f zpbtrf.f zpbtrs.f zpocon.f zpoequ.f zporfs.f
zposv.f zposvx.f zpotrs.f zpstrf.f zpstf2.f
zposv.f zposvx.f zpotrf2.f zpotrs.f zpstrf.f zpstf2.f
zppcon.f zppequ.f zpprfs.f zppsv.f zppsvx.f zpptrf.f zpptri.f zpptrs.f
zptcon.f zpteqr.f zptrfs.f zptsv.f zptsvx.f zpttrf.f zpttrs.f zptts2.f
zrot.f zspcon.f zsprfs.f zspsv.f
zspsvx.f zsptrf.f zsptri.f zsptrs.f zdrscl.f zstedc.f
zstegr.f zstein.f zsteqr.f
zsycon.f
zsycon.f zsysv_aa.f
zsyrfs.f zsysv.f zsysvx.f zsytf2.f zsytrf.f zsytri.f zsytri2.f zsytri2x.f
zsyswapr.f zsytrs.f zsytrs2.f zsyconv.f
zsyswapr.f zsytrs.f zsytrs_aa.f zsytrs2.f zsyconv.f
zsytf2_rook.f zsytrf_rook.f zsytrs_rook.f
zsytri_rook.f zsycon_rook.f zsysv_rook.f
ztbcon.f ztbrfs.f ztbtrs.f ztgevc.f ztgex2.f
ztgexc.f ztgsen.f ztgsja.f ztgsna.f ztgsy2.f ztgsyl.f ztpcon.f
ztprfs.f ztptri.f
ztptrs.f ztrcon.f ztrevc.f ztrexc.f ztrrfs.f ztrsen.f ztrsna.f
ztptrs.f ztrcon.f ztrevc.f ztrevc3.f ztrexc.f ztrrfs.f ztrsen.f ztrsna.f
ztrsyl.f ztrtrs.f DEPRECATED/ztzrqf.f ztzrzf.f zung2l.f
zung2r.f zungbr.f zunghr.f zungl2.f zunglq.f zungql.f zungqr.f zungr2.f
zungrq.f zungtr.f zunm2l.f zunm2r.f zunmbr.f zunmhr.f zunml2.f
zungrq.f zungtr.f zunm2l.f zunm2r.f zunmbr.f zunmhr.f zunm22.f zunml2.f
zunmlq.f zunmql.f zunmqr.f zunmr2.f zunmr3.f zunmrq.f zunmrz.f
zunmtr.f zupgtr.f
zupmtr.f izmax1.f dzsum1.f zstemr.f
@@ -320,6 +351,15 @@ set(ZLASRC
zunbdb5.f zunbdb6.f zuncsd.f zuncsd2by1.f
zgeqrt.f zgeqrt2.f zgeqrt3.f zgemqrt.f
ztpqrt.f ztpqrt2.f ztpmqrt.f ztprfb.f zpotri.f
zgelq.f zgelqt.f zgelqt3.f zgemlq.f zgemlqt.f zgemqr.f zgeqr.f zgetsls.f
zlamswlq.f zlamtsqr.f zlaswlq.f zlatsqr.f ztplqt.f ztplqt2.f ztpmlqt.f
zhesv_aa.f zhetrf_aa.f zhetrs_aa.f zlahef_aa.f zsytf2_rk.f zlasyf_rk.f
zsytrf_aa.f zsytrf_rk.f zsytrs_3.f zsycon_3.f zsytri_3.f zsytri_3x.f zsysv_rk.f
zhetf2_rk.f zlahef_rk.f zhetrf_rk.f zhetrs_3.f zhecon_3.f zhetri_3.f
zhetri_3x.f zhesv_rk.f zhb2st_kernels.f zhbev_2stage.f zhbevd_2stage.f
zhbevx_2stage.f zheev_2stage.f zheevd_2stage.f zheevr_2stage.f
zheevx_2stage.f zhegv_2stage.f zhetrd_2stage.f zhetrd_hb2st.F zhetrd_he2hb.f
zlarfy.f
)
set(LA_REL_SRC ${ALLAUX})

File diff suppressed because it is too large Load Diff

9
cmake/openblas.pc.in Normal file
View File

@@ -0,0 +1,9 @@
libdir=@CMAKE_INSTALL_FULL_LIBDIR@
includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@
Name: OpenBLAS
Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version
Version: @OPENBLAS_VERSION@
URL: https://github.com/xianyi/OpenBLAS
Libs: -L${libdir} -lopenblas
Cflags: -I${includedir}

View File

@@ -77,7 +77,7 @@ if (CYGWIN)
set(NO_EXPRECISION 1)
endif ()
if (NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Windows" AND NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Interix")
if (NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Windows" AND NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Interix" AND NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Android")
if (SMP)
set(EXTRALIB "${EXTRALIB} -lpthread")
endif ()

View File

@@ -4,7 +4,8 @@
## This is triggered by system.cmake and runs before any of the code is built.
## Creates config.h and Makefile.conf by first running the c_check perl script (which creates those files).
## Next it runs f_check and appends some fortran information to the files.
## Finally it runs getarch and getarch_2nd for even more environment information.
## Then it runs getarch and getarch_2nd for even more environment information.
## Finally it builds gen_config_h for use at build time to generate config.h.
# CMake vars set by this file:
# CORE
@@ -50,20 +51,20 @@ else()
set(TARGET_CONF "config.h")
endif ()
include("${CMAKE_SOURCE_DIR}/cmake/c_check.cmake")
include("${PROJECT_SOURCE_DIR}/cmake/c_check.cmake")
if (NOT NOFORTRAN)
include("${CMAKE_SOURCE_DIR}/cmake/f_check.cmake")
include("${PROJECT_SOURCE_DIR}/cmake/f_check.cmake")
endif ()
# compile getarch
set(GETARCH_SRC
${CMAKE_SOURCE_DIR}/getarch.c
${PROJECT_SOURCE_DIR}/getarch.c
${CPUIDEMO}
)
if (NOT MSVC)
list(APPEND GETARCH_SRC ${CMAKE_SOURCE_DIR}/cpuid.S)
list(APPEND GETARCH_SRC ${PROJECT_SOURCE_DIR}/cpuid.S)
endif ()
if (MSVC)
@@ -71,16 +72,26 @@ if (MSVC)
set(GETARCH_FLAGS ${GETARCH_FLAGS} -DFORCE_GENERIC)
endif()
if ("${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore")
# disable WindowsStore strict CRT checks
set(GETARCH_FLAGS ${GETARCH_FLAGS} -D_CRT_SECURE_NO_WARNINGS)
endif ()
set(GETARCH_DIR "${PROJECT_BINARY_DIR}/getarch_build")
set(GETARCH_BIN "getarch${CMAKE_EXECUTABLE_SUFFIX}")
file(MAKE_DIRECTORY ${GETARCH_DIR})
try_compile(GETARCH_RESULT ${GETARCH_DIR}
SOURCES ${GETARCH_SRC}
COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} -I${CMAKE_SOURCE_DIR}
OUTPUT_VARIABLE GETARCH_LOG
COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH_BIN}
)
if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore")
try_compile(GETARCH_RESULT ${GETARCH_DIR}
SOURCES ${GETARCH_SRC}
COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} -I${PROJECT_SOURCE_DIR}
OUTPUT_VARIABLE GETARCH_LOG
COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH_BIN}
)
if (NOT ${GETARCH_RESULT})
MESSAGE(FATAL_ERROR "Compiling getarch failed ${GETARCH_LOG}")
endif ()
endif ()
message(STATUS "Running getarch")
# use the cmake binary w/ the -E param to run a shell command in a cross-platform way
@@ -96,12 +107,18 @@ ParseGetArchVars(${GETARCH_MAKE_OUT})
set(GETARCH2_DIR "${PROJECT_BINARY_DIR}/getarch2_build")
set(GETARCH2_BIN "getarch_2nd${CMAKE_EXECUTABLE_SUFFIX}")
file(MAKE_DIRECTORY ${GETARCH2_DIR})
try_compile(GETARCH2_RESULT ${GETARCH2_DIR}
SOURCES ${CMAKE_SOURCE_DIR}/getarch_2nd.c
COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GETARCH2_FLAGS} -I${CMAKE_SOURCE_DIR}
OUTPUT_VARIABLE GETARCH2_LOG
COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH2_BIN}
)
if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore")
try_compile(GETARCH2_RESULT ${GETARCH2_DIR}
SOURCES ${PROJECT_SOURCE_DIR}/getarch_2nd.c
COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GETARCH2_FLAGS} -I${PROJECT_SOURCE_DIR}
OUTPUT_VARIABLE GETARCH2_LOG
COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH2_BIN}
)
if (NOT ${GETARCH2_RESULT})
MESSAGE(FATAL_ERROR "Compiling getarch_2nd failed ${GETARCH2_LOG}")
endif ()
endif ()
# use the cmake binary w/ the -E param to run a shell command in a cross-platform way
execute_process(COMMAND ${PROJECT_BINARY_DIR}/${GETARCH2_BIN} 0 OUTPUT_VARIABLE GETARCH2_MAKE_OUT)
@@ -111,3 +128,21 @@ execute_process(COMMAND ${PROJECT_BINARY_DIR}/${GETARCH2_BIN} 1 OUTPUT_VARIABLE
file(APPEND ${TARGET_CONF} ${GETARCH2_CONF_OUT})
ParseGetArchVars(${GETARCH2_MAKE_OUT})
# compile get_config_h
set(GEN_CONFIG_H_DIR "${PROJECT_BINARY_DIR}/genconfig_h_build")
set(GEN_CONFIG_H_BIN "gen_config_h${CMAKE_EXECUTABLE_SUFFIX}")
set(GEN_CONFIG_H_FLAGS "-DVERSION=\"${OpenBLAS_VERSION}\"")
file(MAKE_DIRECTORY ${GEN_CONFIG_H_DIR})
if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore")
try_compile(GEN_CONFIG_H_RESULT ${GEN_CONFIG_H_DIR}
SOURCES ${PROJECT_SOURCE_DIR}/gen_config_h.c
COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GEN_CONFIG_H_FLAGS} -I${PROJECT_SOURCE_DIR}
OUTPUT_VARIABLE GEN_CONFIG_H_LOG
COPY_FILE ${PROJECT_BINARY_DIR}/${GEN_CONFIG_H_BIN}
)
if (NOT ${GEN_CONFIG_H_RESULT})
MESSAGE(FATAL_ERROR "Compiling gen_config_h failed ${GEN_CONFIG_H_LOG}")
endif ()
endif ()

View File

@@ -3,7 +3,7 @@
## Description: Ported from OpenBLAS/Makefile.system
##
set(NETLIB_LAPACK_DIR "${CMAKE_SOURCE_DIR}/lapack-netlib")
set(NETLIB_LAPACK_DIR "${PROJECT_SOURCE_DIR}/lapack-netlib")
# TODO: Makefile.system detects Darwin (mac) and switches to clang here -hpa
# http://stackoverflow.com/questions/714100/os-detecting-makefile
@@ -22,7 +22,7 @@ if (DEFINED BINARY AND DEFINED TARGET AND BINARY EQUAL 32)
if (${TARGET} STREQUAL "HASWELL" OR ${TARGET} STREQUAL "SANDYBRIDGE")
set(TARGET "NEHALEM")
endif ()
if (${TARGET} STREQUAL "BULLDOZER" OR ${TARGET} STREQUAL "PILEDRIVER")
if (${TARGET} STREQUAL "BULLDOZER" OR ${TARGET} STREQUAL "PILEDRIVER" OR ${TARGET} STREQUAL "ZEN")
set(TARGET "BARCELONA")
endif ()
endif ()
@@ -78,7 +78,7 @@ else ()
set(ONLY_CBLAS 0)
endif ()
include("${CMAKE_SOURCE_DIR}/cmake/prebuild.cmake")
include("${PROJECT_SOURCE_DIR}/cmake/prebuild.cmake")
if (NOT DEFINED NUM_THREADS)
set(NUM_THREADS ${NUM_CORES})
@@ -124,17 +124,17 @@ set(OBJCOPY "${CROSS_SUFFIX}objcopy")
set(OBJCONV "${CROSS_SUFFIX}objconv")
# OS dependent settings
include("${CMAKE_SOURCE_DIR}/cmake/os.cmake")
include("${PROJECT_SOURCE_DIR}/cmake/os.cmake")
# Architecture dependent settings
include("${CMAKE_SOURCE_DIR}/cmake/arch.cmake")
include("${PROJECT_SOURCE_DIR}/cmake/arch.cmake")
# C Compiler dependent settings
include("${CMAKE_SOURCE_DIR}/cmake/cc.cmake")
include("${PROJECT_SOURCE_DIR}/cmake/cc.cmake")
if (NOT NOFORTRAN)
# Fortran Compiler dependent settings
include("${CMAKE_SOURCE_DIR}/cmake/fc.cmake")
include("${PROJECT_SOURCE_DIR}/cmake/fc.cmake")
endif ()
if (BINARY64)
@@ -247,10 +247,10 @@ if (NOT DEFINED SYMBOLSUFFIX)
set(SYMBOLSUFFIX "")
endif ()
set(KERNELDIR "${CMAKE_SOURCE_DIR}/kernel/${ARCH}")
set(KERNELDIR "${PROJECT_SOURCE_DIR}/kernel/${ARCH}")
# TODO: nead to convert these Makefiles
# include ${CMAKE_SOURCE_DIR}/cmake/${ARCH}.cmake
# include ${PROJECT_SOURCE_DIR}/cmake/${ARCH}.cmake
if (${CORE} STREQUAL "PPC440")
set(CCOMMON_OPT "${CCOMMON_OPT} -DALLOC_QALLOC")
@@ -312,6 +312,8 @@ endif ()
set(AWK awk)
set(SED sed)
set(REVISION "-r${OpenBLAS_VERSION}")
set(MAJOR_VERSION ${OpenBLAS_MAJOR_VERSION})
@@ -410,8 +412,8 @@ set(LIBDEFNAME "${LIBNAME}.${LIBSUFFIX}.def")
set(LIBEXPNAME "${LIBNAME}.${LIBSUFFIX}.exp")
set(LIBZIPNAME "${LIBNAME}.${LIBSUFFIX}.zip")
set(LIBS "${CMAKE_SOURCE_DIR}/${LIBNAME}")
set(LIBS_P "${CMAKE_SOURCE_DIR}/${LIBNAME_P}")
set(LIBS "${PROJECT_SOURCE_DIR}/${LIBNAME}")
set(LIBS_P "${PROJECT_SOURCE_DIR}/${LIBNAME_P}")
set(LIB_COMPONENTS BLAS)

View File

@@ -332,6 +332,13 @@ typedef int blasint;
#endif
#endif
#ifdef POWER8
#ifndef YIELDING
#define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n");
#endif
#endif
/*
#ifdef PILEDRIVER
#ifndef YIELDING
@@ -397,6 +404,10 @@ please https://github.com/xianyi/OpenBLAS/issues/246
#include "common_sparc.h"
#endif
#ifdef ARCH_MIPS
#include "common_mips.h"
#endif
#ifdef ARCH_MIPS64
#include "common_mips64.h"
#endif
@@ -409,7 +420,15 @@ please https://github.com/xianyi/OpenBLAS/issues/246
#include "common_arm64.h"
#endif
#ifdef ARCH_ZARCH
#include "common_zarch.h"
#endif
#ifndef ASSEMBLER
#ifdef OS_WINDOWSSTORE
typedef char env_var_t[MAX_PATH];
#define readenv(p, n) 0
#else
#ifdef OS_WINDOWS
typedef char env_var_t[MAX_PATH];
#define readenv(p, n) GetEnvironmentVariable((LPCTSTR)(n), (LPTSTR)(p), sizeof(p))
@@ -417,6 +436,7 @@ typedef char env_var_t[MAX_PATH];
typedef char* env_var_t;
#define readenv(p, n) ((p)=getenv(n))
#endif
#endif
#if !defined(RPCC_DEFINED) && !defined(OS_WINDOWS)
#ifdef _POSIX_MONOTONIC_CLOCK
@@ -541,8 +561,13 @@ static void __inline blas_lock(volatile BLASULONG *address){
#endif
#if defined(C_PGI) || defined(C_SUN)
#define CREAL(X) (*((FLOAT *)&X + 0))
#define CIMAG(X) (*((FLOAT *)&X + 1))
#if defined(__STDC_IEC_559_COMPLEX__)
#define CREAL(X) creal(X)
#define CIMAG(X) cimag(X)
#else
#define CREAL(X) (*((FLOAT *)&X + 0))
#define CIMAG(X) (*((FLOAT *)&X + 1))
#endif
#else
#ifdef OPENBLAS_COMPLEX_STRUCT
#define CREAL(Z) ((Z).real)
@@ -615,9 +640,14 @@ void gotoblas_profile_init(void);
void gotoblas_profile_quit(void);
#ifdef USE_OPENMP
#ifndef C_MSVC
int omp_in_parallel(void);
int omp_get_num_procs(void);
#else
__declspec(dllimport) int __cdecl omp_in_parallel(void);
__declspec(dllimport) int __cdecl omp_get_num_procs(void);
#endif
#else
#ifdef __ELF__
int omp_in_parallel (void) __attribute__ ((weak));
int omp_get_num_procs(void) __attribute__ ((weak));
@@ -629,7 +659,11 @@ static __inline void blas_unlock(volatile BLASULONG *address){
*address = 0;
}
#ifdef OS_WINDOWSSTORE
static __inline int readenv_atoi(char *env) {
return 0;
}
#else
#ifdef OS_WINDOWS
static __inline int readenv_atoi(char *env) {
env_var_t p;
@@ -644,7 +678,7 @@ static __inline int readenv_atoi(char *env) {
return(0);
}
#endif
#endif
#if !defined(XDOUBLE) || !defined(QUAD_PRECISION)

View File

@@ -105,7 +105,6 @@ static inline int blas_quickdivide(blasint x, blasint y){
#define PROLOGUE \
.arm ;\
.global REALNAME ;\
.func REALNAME ;\
REALNAME:
#define EPILOGUE

View File

@@ -39,7 +39,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define INLINE inline
#ifdef F_INTERFACE_FLANG
#define RETURN_BY_STACK
#else
#define RETURN_BY_COMPLEX
#endif
#ifndef ASSEMBLER

View File

@@ -70,7 +70,7 @@ extern long int syscall (long int __sysno, ...);
static inline int my_mbind(void *addr, unsigned long len, int mode,
unsigned long *nodemask, unsigned long maxnode,
unsigned flags) {
#if defined (__LSB_VERSION__)
#if defined (__LSB_VERSION__) || defined(ARCH_ZARCH)
// So far, LSB (Linux Standard Base) don't support syscall().
// https://lsbbugs.linuxfoundation.org/show_bug.cgi?id=3482
return 0;
@@ -90,7 +90,7 @@ static inline int my_mbind(void *addr, unsigned long len, int mode,
}
static inline int my_set_mempolicy(int mode, const unsigned long *addr, unsigned long flag) {
#if defined (__LSB_VERSION__)
#if defined (__LSB_VERSION__) || defined(ARCH_ZARCH)
// So far, LSB (Linux Standard Base) don't support syscall().
// https://lsbbugs.linuxfoundation.org/show_bug.cgi?id=3482
return 0;

View File

@@ -2193,7 +2193,7 @@
#endif
#ifndef ASSEMBLER
#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64)
#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64)
extern BLASLONG gemm_offset_a;
extern BLASLONG gemm_offset_b;
extern BLASLONG sgemm_p;

103
common_mips.h Normal file
View File

@@ -0,0 +1,103 @@
/*****************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written
permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/
#ifndef COMMON_MIPS
#define COMMON_MIPS
#define MB __sync_synchronize()
#define WMB __sync_synchronize()
#define INLINE inline
#define RETURN_BY_COMPLEX
#ifndef ASSEMBLER
static inline unsigned int rpcc(void){
unsigned long ret;
__asm__ __volatile__(".set push \n"
"rdhwr %0, $30 \n"
".set pop" : "=r"(ret) : : "memory");
return ret;
}
#define RPCC_DEFINED
static inline int blas_quickdivide(blasint x, blasint y){
return x / y;
}
#define GET_IMAGE(res)
#define GET_IMAGE_CANCEL
#endif
#ifndef F_INTERFACE
#define REALNAME ASMNAME
#else
#define REALNAME ASMFNAME
#endif
#if defined(ASSEMBLER) && !defined(NEEDPARAM)
#define PROLOGUE \
.arm ;\
.global REALNAME ;\
REALNAME:
#define EPILOGUE
#define PROFCODE
#endif
#define SEEK_ADDRESS
#ifndef PAGESIZE
#define PAGESIZE ( 4 << 10)
#endif
#define HUGE_PAGESIZE ( 4 << 20)
#define BUFFER_SIZE (16 << 20)
#define BASE_ADDRESS (START_ADDRESS - BUFFER_SIZE * MAX_CPU_NUMBER)
#ifndef MAP_ANONYMOUS
#define MAP_ANONYMOUS MAP_ANON
#endif
#endif

View File

@@ -71,38 +71,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifndef COMMON_MIPS64
#define COMMON_MIPS64
#define MB
#define WMB
#define MB __sync_synchronize()
#define WMB __sync_synchronize()
#define INLINE inline
#ifndef ASSEMBLER
static void INLINE blas_lock(volatile unsigned long *address){
long int ret, val = 1;
do {
while (*address) {YIELDING;};
__asm__ __volatile__(
"1: ll %0, %3\n"
" ori %2, %0, 1\n"
" sc %2, %1\n"
" beqz %2, 1b\n"
" andi %2, %0, 1\n"
" sync\n"
: "=&r" (val), "=m" (address), "=&r" (ret)
: "m" (address)
: "memory");
} while (ret);
}
#define BLAS_LOCK_DEFINED
static inline unsigned int rpcc(void){
unsigned long ret;
#if defined(LOONGSON3A) || defined(LOONGSON3B)
// unsigned long long tmp;
//__asm__ __volatile__("dmfc0 %0, $25, 1": "=r"(tmp):: "memory");
//ret=tmp;
@@ -111,17 +89,10 @@ static inline unsigned int rpcc(void){
"rdhwr %0, $2\n"
".set pop": "=r"(ret):: "memory");
#else
__asm__ __volatile__(".set push \n"
".set mips32r2\n"
"rdhwr %0, $30 \n"
".set pop" : "=r"(ret) : : "memory");
#endif
return ret;
}
#define RPCC_DEFINED
#if defined(LOONGSON3A) || defined(LOONGSON3B)
#ifndef NO_AFFINITY
#define WHEREAMI
static inline int WhereAmI(void){
@@ -134,7 +105,6 @@ static inline int WhereAmI(void){
}
#endif
#endif
static inline int blas_quickdivide(blasint x, blasint y){
return x / y;

View File

@@ -39,8 +39,13 @@
#ifndef COMMON_POWER
#define COMMON_POWER
#if defined(POWER8)
#define MB __asm__ __volatile__ ("eieio":::"memory")
#define WMB __asm__ __volatile__ ("eieio":::"memory")
#else
#define MB __asm__ __volatile__ ("sync")
#define WMB __asm__ __volatile__ ("sync")
#endif
#define INLINE inline
@@ -798,7 +803,7 @@ Lmcount$lazy_ptr:
#elif defined(PPC440FP2)
#define BUFFER_SIZE ( 16 << 20)
#elif defined(POWER8)
#define BUFFER_SIZE ( 32 << 20)
#define BUFFER_SIZE ( 64 << 20)
#else
#define BUFFER_SIZE ( 16 << 20)
#endif

View File

@@ -245,6 +245,10 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
#define RETURN_BY_STACK
#endif
#ifdef F_INTERFACE_FLANG
#define RETURN_BY_STACK
#endif
#ifdef F_INTERFACE_PGI
#define RETURN_BY_STACK
#endif

140
common_zarch.h Normal file
View File

@@ -0,0 +1,140 @@
/*****************************************************************************
Copyright (c) 2011-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written
permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/
#ifndef COMMON_ZARCH
#define COMMON_ZARCH
#define MB
//__asm__ __volatile__ ("dmb ish" : : : "memory")
#define WMB
//__asm__ __volatile__ ("dmb ishst" : : : "memory")
#define INLINE inline
#define RETURN_BY_COMPLEX
#ifndef ASSEMBLER
/*
static void __inline blas_lock(volatile BLASULONG *address){
BLASULONG ret;
do {
while (*address) {YIELDING;};
__asm__ __volatile__(
"mov x4, #1 \n\t"
"1: \n\t"
"ldaxr x2, [%1] \n\t"
"cbnz x2, 1b \n\t"
"2: \n\t"
"stxr w3, x4, [%1] \n\t"
"cbnz w3, 1b \n\t"
"mov %0, #0 \n\t"
: "=r"(ret), "=r"(address)
: "1"(address)
: "memory", "x2" , "x3", "x4"
);
} while (ret);
}
*/
//#define BLAS_LOCK_DEFINED
static inline int blas_quickdivide(blasint x, blasint y){
return x / y;
}
#if defined(DOUBLE)
#define GET_IMAGE(res) __asm__ __volatile__("str d1, %0" : "=m"(res) : : "memory")
#else
#define GET_IMAGE(res) __asm__ __volatile__("str s1, %0" : "=m"(res) : : "memory")
#endif
#define GET_IMAGE_CANCEL
#endif
#ifndef F_INTERFACE
#define REALNAME ASMNAME
#else
#define REALNAME ASMFNAME
#endif
#if defined(ASSEMBLER) && !defined(NEEDPARAM)
#define PROLOGUE \
.text ;\
.align 256 ;\
.global REALNAME ;\
.type REALNAME, %function ;\
REALNAME:
#define EPILOGUE
#define PROFCODE
#endif
#define SEEK_ADDRESS
#ifndef PAGESIZE
#define PAGESIZE ( 4 << 10)
#endif
#define HUGE_PAGESIZE ( 4 << 20)
#if defined(CORTEXA57)
#define BUFFER_SIZE (20 << 20)
#else
#define BUFFER_SIZE (16 << 20)
#endif
#define BASE_ADDRESS (START_ADDRESS - BUFFER_SIZE * MAX_CPU_NUMBER)
#ifndef MAP_ANONYMOUS
#define MAP_ANONYMOUS MAP_ANON
#endif
#endif

View File

@@ -114,6 +114,7 @@
#define CORE_HASWELL 24
#define CORE_STEAMROLLER 25
#define CORE_EXCAVATOR 26
#define CORE_ZEN 27
#define HAVE_SSE (1 << 0)
#define HAVE_SSE2 (1 << 1)
@@ -209,5 +210,6 @@ typedef struct {
#define CPUTYPE_HASWELL 48
#define CPUTYPE_STEAMROLLER 49
#define CPUTYPE_EXCAVATOR 50
#define CPUTYPE_ZEN 51
#endif

View File

@@ -74,7 +74,7 @@ int get_feature(char *search)
fclose(infile);
if( p == NULL ) return;
if( p == NULL ) return 0;
t = strtok(p," ");
while( t = strtok(NULL," "))

View File

@@ -30,17 +30,26 @@
#define CPU_UNKNOWN 0
#define CPU_ARMV8 1
#define CPU_CORTEXA57 2
#define CPU_VULCAN 3
#define CPU_THUNDERX 4
#define CPU_THUNDERX2T99 5
static char *cpuname[] = {
"UNKNOWN",
"ARMV8" ,
"CORTEXA57"
"CORTEXA57",
"VULCAN",
"THUNDERX",
"THUNDERX2T99"
};
static char *cpuname_lower[] = {
"unknown",
"armv8" ,
"cortexa57"
"cortexa57",
"vulcan",
"thunderx",
"thunderx2t99"
};
int get_feature(char *search)
@@ -85,25 +94,34 @@ int detect(void)
#ifdef linux
FILE *infile;
char buffer[512], *p;
p = (char *) NULL ;
char buffer[512], *p, *cpu_part = NULL, *cpu_implementer = NULL;
p = (char *) NULL ;
infile = fopen("/proc/cpuinfo", "r");
while (fgets(buffer, sizeof(buffer), infile))
{
if (!strncmp("CPU part", buffer, 8))
{
p = strchr(buffer, ':') + 2;
infile = fopen("/proc/cpuinfo", "r");
while (fgets(buffer, sizeof(buffer), infile)) {
if ((cpu_part != NULL) && (cpu_implementer != NULL)) {
break;
}
if ((cpu_part == NULL) && !strncmp("CPU part", buffer, 8)) {
cpu_part = strchr(buffer, ':') + 2;
cpu_part = strdup(cpu_part);
} else if ((cpu_implementer == NULL) && !strncmp("CPU implementer", buffer, 15)) {
cpu_implementer = strchr(buffer, ':') + 2;
cpu_implementer = strdup(cpu_implementer);
}
}
fclose(infile);
if(p != NULL) {
if (strstr(p, "0xd07")) {
return CPU_CORTEXA57;
}
if(cpu_part != NULL && cpu_implementer != NULL) {
if (strstr(cpu_part, "0xd07") && strstr(cpu_implementer, "0x41"))
return CPU_CORTEXA57;
else if (strstr(cpu_part, "0x516") && strstr(cpu_implementer, "0x42"))
return CPU_VULCAN;
else if (strstr(cpu_part, "0x0a1") && strstr(cpu_implementer, "0x43"))
return CPU_THUNDERX;
else if (strstr(cpu_part, "0xFFF") && strstr(cpu_implementer, "0x43")) /* TODO */
return CPU_THUNDERX2T99;
}
p = (char *) NULL ;
@@ -176,6 +194,28 @@ void get_cpuconfig(void)
printf("#define L2_ASSOCIATIVE 4\n");
break;
case CPU_VULCAN:
printf("#define VULCAN \n");
printf("#define HAVE_VFP \n");
printf("#define HAVE_VFPV3 \n");
printf("#define HAVE_NEON \n");
printf("#define HAVE_VFPV4 \n");
printf("#define L1_CODE_SIZE 32768 \n");
printf("#define L1_CODE_LINESIZE 64 \n");
printf("#define L1_CODE_ASSOCIATIVE 8 \n");
printf("#define L1_DATA_SIZE 32768 \n");
printf("#define L1_DATA_LINESIZE 64 \n");
printf("#define L1_DATA_ASSOCIATIVE 8 \n");
printf("#define L2_SIZE 262144 \n");
printf("#define L2_LINESIZE 64 \n");
printf("#define L2_ASSOCIATIVE 8 \n");
printf("#define L3_SIZE 33554432 \n");
printf("#define L3_LINESIZE 64 \n");
printf("#define L3_ASSOCIATIVE 32 \n");
printf("#define DTB_DEFAULT_ENTRIES 64 \n");
printf("#define DTB_SIZE 4096 \n");
break;
case CPU_CORTEXA57:
printf("#define CORTEXA57\n");
printf("#define HAVE_VFP\n");
@@ -191,8 +231,42 @@ void get_cpuconfig(void)
printf("#define L2_SIZE 2097152\n");
printf("#define L2_LINESIZE 64\n");
printf("#define L2_ASSOCIATIVE 16\n");
printf("#define DTB_DEFAULT_ENTRIES 64\n");
printf("#define DTB_SIZE 4096\n");
printf("#define DTB_DEFAULT_ENTRIES 64\n");
printf("#define DTB_SIZE 4096\n");
break;
case CPU_THUNDERX:
printf("#define ARMV8\n");
printf("#define THUNDERX\n");
printf("#define L1_DATA_SIZE 32768\n");
printf("#define L1_DATA_LINESIZE 128\n");
printf("#define L2_SIZE 16777216\n");
printf("#define L2_LINESIZE 128\n");
printf("#define DTB_DEFAULT_ENTRIES 64\n");
printf("#define DTB_SIZE 4096\n");
printf("#define L2_ASSOCIATIVE 16\n");
break;
case CPU_THUNDERX2T99:
printf("#define VULCAN \n");
printf("#define HAVE_VFP \n");
printf("#define HAVE_VFPV3 \n");
printf("#define HAVE_NEON \n");
printf("#define HAVE_VFPV4 \n");
printf("#define L1_CODE_SIZE 32768 \n");
printf("#define L1_CODE_LINESIZE 64 \n");
printf("#define L1_CODE_ASSOCIATIVE 8 \n");
printf("#define L1_DATA_SIZE 32768 \n");
printf("#define L1_DATA_LINESIZE 64 \n");
printf("#define L1_DATA_ASSOCIATIVE 8 \n");
printf("#define L2_SIZE 262144 \n");
printf("#define L2_LINESIZE 64 \n");
printf("#define L2_ASSOCIATIVE 8 \n");
printf("#define L3_SIZE 33554432 \n");
printf("#define L3_LINESIZE 64 \n");
printf("#define L3_ASSOCIATIVE 32 \n");
printf("#define DTB_DEFAULT_ENTRIES 64 \n");
printf("#define DTB_SIZE 4096 \n");
break;
}
}

View File

@@ -71,15 +71,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/*********************************************************************/
#define CPU_UNKNOWN 0
#define CPU_SICORTEX 1
#define CPU_LOONGSON3A 2
#define CPU_LOONGSON3B 3
#define CPU_P5600 1
static char *cpuname[] = {
"UNKOWN",
"SICORTEX",
"LOONGSON3A",
"LOONGSON3B"
"P5600"
};
int detect(void){
@@ -120,7 +116,7 @@ int detect(void){
if (strstr(p, "loongson3a"))
return CPU_LOONGSON3A;
}else{
return CPU_SICORTEX;
return CPU_UNKNOWN;
}
}
//Check model name for Loongson3
@@ -149,64 +145,40 @@ char *get_corename(void){
}
void get_architecture(void){
printf("MIPS64");
printf("MIPS");
}
void get_subarchitecture(void){
if(detect()==CPU_LOONGSON3A) {
printf("LOONGSON3A");
}else if(detect()==CPU_LOONGSON3B){
printf("LOONGSON3B");
if(detect()==CPU_P5600){
printf("P5600");
}else{
printf("SICORTEX");
printf("UNKNOWN");
}
}
void get_subdirname(void){
printf("mips64");
printf("mips");
}
void get_cpuconfig(void){
if(detect()==CPU_LOONGSON3A) {
printf("#define LOONGSON3A\n");
if(detect()==CPU_P5600){
printf("#define P5600\n");
printf("#define L1_DATA_SIZE 65536\n");
printf("#define L1_DATA_LINESIZE 32\n");
printf("#define L2_SIZE 512488\n");
printf("#define L2_SIZE 1048576\n");
printf("#define L2_LINESIZE 32\n");
printf("#define DTB_DEFAULT_ENTRIES 64\n");
printf("#define DTB_SIZE 4096\n");
printf("#define L2_ASSOCIATIVE 4\n");
}else if(detect()==CPU_LOONGSON3B){
printf("#define LOONGSON3B\n");
printf("#define L1_DATA_SIZE 65536\n");
printf("#define L1_DATA_LINESIZE 32\n");
printf("#define L2_SIZE 512488\n");
printf("#define L2_LINESIZE 32\n");
printf("#define DTB_DEFAULT_ENTRIES 64\n");
printf("#define DTB_SIZE 4096\n");
printf("#define L2_ASSOCIATIVE 4\n");
}else{
printf("#define SICORTEX\n");
printf("#define L1_DATA_SIZE 32768\n");
printf("#define L1_DATA_LINESIZE 32\n");
printf("#define L2_SIZE 512488\n");
printf("#define L2_LINESIZE 32\n");
printf("#define DTB_DEFAULT_ENTRIES 32\n");
printf("#define DTB_SIZE 4096\n");
printf("#define L2_ASSOCIATIVE 8\n");
}else{
printf("#define UNKNOWN\n");
}
}
void get_libname(void){
if(detect()==CPU_LOONGSON3A) {
printf("loongson3a\n");
}else if(detect()==CPU_LOONGSON3B) {
printf("loongson3b\n");
if(detect()==CPU_P5600) {
printf("p5600\n");
}else{
#ifdef __mips64
printf("mips64\n");
#else
printf("mips32\n");
#endif
printf("mips\n");
}
}

238
cpuid_mips64.c Normal file
View File

@@ -0,0 +1,238 @@
/*****************************************************************************
Copyright (c) 2011-2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written
permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#define CPU_UNKNOWN 0
#define CPU_SICORTEX 1
#define CPU_LOONGSON3A 2
#define CPU_LOONGSON3B 3
#define CPU_I6400 4
#define CPU_P6600 5
static char *cpuname[] = {
"UNKOWN",
"SICORTEX",
"LOONGSON3A",
"LOONGSON3B",
"I6400",
"P6600"
};
int detect(void){
#ifdef linux
FILE *infile;
char buffer[512], *p;
p = (char *)NULL;
infile = fopen("/proc/cpuinfo", "r");
while (fgets(buffer, sizeof(buffer), infile)){
if (!strncmp("cpu", buffer, 3)){
p = strchr(buffer, ':') + 2;
#if 0
fprintf(stderr, "%s\n", p);
#endif
break;
}
}
fclose(infile);
if(p != NULL){
if (strstr(p, "Loongson-3A")){
return CPU_LOONGSON3A;
}else if(strstr(p, "Loongson-3B")){
return CPU_LOONGSON3B;
}else if (strstr(p, "Loongson-3")){
infile = fopen("/proc/cpuinfo", "r");
p = (char *)NULL;
while (fgets(buffer, sizeof(buffer), infile)){
if (!strncmp("system type", buffer, 11)){
p = strchr(buffer, ':') + 2;
break;
}
}
fclose(infile);
if (strstr(p, "loongson3a"))
return CPU_LOONGSON3A;
}else{
return CPU_SICORTEX;
}
}
//Check model name for Loongson3
infile = fopen("/proc/cpuinfo", "r");
p = (char *)NULL;
while (fgets(buffer, sizeof(buffer), infile)){
if (!strncmp("model name", buffer, 10)){
p = strchr(buffer, ':') + 2;
break;
}
}
fclose(infile);
if(p != NULL){
if (strstr(p, "Loongson-3A")){
return CPU_LOONGSON3A;
}else if(strstr(p, "Loongson-3B")){
return CPU_LOONGSON3B;
}
}
#endif
return CPU_UNKNOWN;
}
char *get_corename(void){
return cpuname[detect()];
}
void get_architecture(void){
printf("MIPS64");
}
void get_subarchitecture(void){
if(detect()==CPU_LOONGSON3A) {
printf("LOONGSON3A");
}else if(detect()==CPU_LOONGSON3B){
printf("LOONGSON3B");
}else if(detect()==CPU_I6400){
printf("I6400");
}else if(detect()==CPU_P6600){
printf("P6600");
}else{
printf("SICORTEX");
}
}
void get_subdirname(void){
printf("mips64");
}
void get_cpuconfig(void){
if(detect()==CPU_LOONGSON3A) {
printf("#define LOONGSON3A\n");
printf("#define L1_DATA_SIZE 65536\n");
printf("#define L1_DATA_LINESIZE 32\n");
printf("#define L2_SIZE 512488\n");
printf("#define L2_LINESIZE 32\n");
printf("#define DTB_DEFAULT_ENTRIES 64\n");
printf("#define DTB_SIZE 4096\n");
printf("#define L2_ASSOCIATIVE 4\n");
}else if(detect()==CPU_LOONGSON3B){
printf("#define LOONGSON3B\n");
printf("#define L1_DATA_SIZE 65536\n");
printf("#define L1_DATA_LINESIZE 32\n");
printf("#define L2_SIZE 512488\n");
printf("#define L2_LINESIZE 32\n");
printf("#define DTB_DEFAULT_ENTRIES 64\n");
printf("#define DTB_SIZE 4096\n");
printf("#define L2_ASSOCIATIVE 4\n");
}else if(detect()==CPU_I6400){
printf("#define I6400\n");
printf("#define L1_DATA_SIZE 65536\n");
printf("#define L1_DATA_LINESIZE 32\n");
printf("#define L2_SIZE 1048576\n");
printf("#define L2_LINESIZE 32\n");
printf("#define DTB_DEFAULT_ENTRIES 64\n");
printf("#define DTB_SIZE 4096\n");
printf("#define L2_ASSOCIATIVE 8\n");
}else if(detect()==CPU_P6600){
printf("#define P6600\n");
printf("#define L1_DATA_SIZE 65536\n");
printf("#define L1_DATA_LINESIZE 32\n");
printf("#define L2_SIZE 1048576\n");
printf("#define L2_LINESIZE 32\n");
printf("#define DTB_DEFAULT_ENTRIES 64\n");
printf("#define DTB_SIZE 4096\n");
printf("#define L2_ASSOCIATIVE 8\n");
}else{
printf("#define SICORTEX\n");
printf("#define L1_DATA_SIZE 32768\n");
printf("#define L1_DATA_LINESIZE 32\n");
printf("#define L2_SIZE 512488\n");
printf("#define L2_LINESIZE 32\n");
printf("#define DTB_DEFAULT_ENTRIES 32\n");
printf("#define DTB_SIZE 4096\n");
printf("#define L2_ASSOCIATIVE 8\n");
}
}
void get_libname(void){
if(detect()==CPU_LOONGSON3A) {
printf("loongson3a\n");
}else if(detect()==CPU_LOONGSON3B) {
printf("loongson3b\n");
}else if(detect()==CPU_I6400) {
printf("i6400\n");
}else if(detect()==CPU_P6600) {
printf("p6600\n");
}else{
printf("mips64\n");
}
}

View File

@@ -636,6 +636,13 @@ int get_cacheinfo(int type, cache_info_t *cacheinfo){
LD1.associative = 8;
LD1.linesize = 64;
break;
case 0x63 :
DTB.size = 2048;
DTB.associative = 4;
DTB.linesize = 32;
LDTB.size = 4096;
LDTB.associative= 4;
LDTB.linesize = 32;
case 0x66 :
LD1.size = 8;
LD1.associative = 4;
@@ -667,6 +674,13 @@ int get_cacheinfo(int type, cache_info_t *cacheinfo){
LC1.size = 64;
LC1.associative = 8;
break;
case 0x76 :
ITB.size = 2048;
ITB.associative = 0;
ITB.linesize = 8;
LITB.size = 4096;
LITB.associative= 0;
LITB.linesize = 8;
case 0x77 :
LC1.size = 16;
LC1.associative = 4;
@@ -1110,6 +1124,9 @@ int get_cpuname(void){
break;
case 3:
switch (model) {
case 7:
// Bay Trail
return CPUTYPE_ATOM;
case 10:
case 14:
// Ivy Bridge
@@ -1172,6 +1189,8 @@ int get_cpuname(void){
#endif
else
return CPUTYPE_NEHALEM;
case 12:
// Braswell
case 13:
// Avoton
return CPUTYPE_NEHALEM;
@@ -1200,8 +1219,35 @@ int get_cpuname(void){
#endif
else
return CPUTYPE_NEHALEM;
case 7:
// Xeon Phi Knights Landing
if(support_avx())
#ifndef NO_AVX2
return CPUTYPE_HASWELL;
#else
return CPUTYPE_SANDYBRIDGE;
#endif
else
return CPUTYPE_NEHALEM;
case 12:
// Apollo Lake
return CPUTYPE_NEHALEM;
}
break;
case 9:
case 8:
switch (model) {
case 14: // Kaby Lake
if(support_avx())
#ifndef NO_AVX2
return CPUTYPE_HASWELL;
#else
return CPUTYPE_SANDYBRIDGE;
#endif
else
return CPUTYPE_NEHALEM;
}
break;
}
break;
case 0x7:
@@ -1233,8 +1279,11 @@ int get_cpuname(void){
return CPUTYPE_OPTERON;
case 1:
case 3:
case 7:
case 10:
return CPUTYPE_BARCELONA;
case 5:
return CPUTYPE_BOBCAT;
case 6:
switch (model) {
case 1:
@@ -1249,7 +1298,13 @@ int get_cpuname(void){
return CPUTYPE_PILEDRIVER;
else
return CPUTYPE_BARCELONA; //OS don't support AVX.
case 5: // New EXCAVATOR CPUS
if(support_avx())
return CPUTYPE_EXCAVATOR;
else
return CPUTYPE_BARCELONA; //OS don't support AVX.
case 0:
case 8:
switch(exmodel){
case 1: //AMD Trinity
if(support_avx())
@@ -1271,8 +1326,19 @@ int get_cpuname(void){
break;
}
break;
case 5:
return CPUTYPE_BOBCAT;
case 8:
switch (model) {
case 1:
// AMD Ryzen
if(support_avx())
#ifndef NO_AVX2
return CPUTYPE_ZEN;
#else
return CPUTYPE_SANDYBRIDGE; // Zen is closer in architecture to Sandy Bridge than to Excavator
#endif
else
return CPUTYPE_BARCELONA;
}
}
break;
}
@@ -1399,6 +1465,7 @@ static char *cpuname[] = {
"HASWELL",
"STEAMROLLER",
"EXCAVATOR",
"ZEN",
};
static char *lowercpuname[] = {
@@ -1452,6 +1519,7 @@ static char *lowercpuname[] = {
"haswell",
"steamroller",
"excavator",
"zen",
};
static char *corename[] = {
@@ -1482,6 +1550,7 @@ static char *corename[] = {
"HASWELL",
"STEAMROLLER",
"EXCAVATOR",
"ZEN",
};
static char *corename_lower[] = {
@@ -1512,6 +1581,7 @@ static char *corename_lower[] = {
"haswell",
"steamroller",
"excavator",
"zen",
};
@@ -1678,6 +1748,8 @@ int get_coretype(void){
#endif
else
return CORE_NEHALEM;
case 12:
// Braswell
case 13:
// Avoton
return CORE_NEHALEM;
@@ -1706,8 +1778,33 @@ int get_coretype(void){
#endif
else
return CORE_NEHALEM;
}
case 7:
// Phi Knights Landing
if(support_avx())
#ifndef NO_AVX2
return CORE_HASWELL;
#else
return CORE_SANDYBRIDGE;
#endif
else
return CORE_NEHALEM;
case 12:
// Apollo Lake
return CORE_NEHALEM;
}
break;
case 9:
case 8:
if (model == 14) { // Kaby Lake
if(support_avx())
#ifndef NO_AVX2
return CORE_HASWELL;
#else
return CORE_SANDYBRIDGE;
#endif
else
return CORE_NEHALEM;
}
}
break;
@@ -1737,8 +1834,13 @@ int get_coretype(void){
return CORE_PILEDRIVER;
else
return CORE_BARCELONA; //OS don't support AVX.
case 5: // New EXCAVATOR
if(support_avx())
return CORE_EXCAVATOR;
else
return CORE_BARCELONA; //OS don't support AVX.
case 0:
case 8:
switch(exmodel){
case 1: //AMD Trinity
if(support_avx())
@@ -1760,9 +1862,22 @@ int get_coretype(void){
}
break;
}
}else return CORE_BARCELONA;
} else if (exfamily == 8) {
switch (model) {
case 1:
// AMD Ryzen
if(support_avx())
#ifndef NO_AVX2
return CORE_ZEN;
#else
return CORE_SANDYBRIDGE; // Zen is closer in architecture to Sandy Bridge than to Excavator
#endif
else
return CORE_BARCELONA;
}
} else {
return CORE_BARCELONA;
}
}
}

111
cpuid_zarch.c Normal file
View File

@@ -0,0 +1,111 @@
/**************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <string.h>
#define CPU_GENERIC 0
#define CPU_Z13 1
static char *cpuname[] = {
"ZARCH_GENERIC",
"Z13"
};
static char *cpuname_lower[] = {
"zarch_generic",
"z13"
};
int detect(void)
{
FILE *infile;
char buffer[512], *p;
p = (char *)NULL;
infile = fopen("/proc/sysinfo", "r");
while (fgets(buffer, sizeof(buffer), infile)){
if (!strncmp("Type", buffer, 4)){
p = strchr(buffer, ':') + 2;
#if 0
fprintf(stderr, "%s\n", p);
#endif
break;
}
}
fclose(infile);
if (strstr(p, "2964")) return CPU_Z13;
if (strstr(p, "2965")) return CPU_Z13;
return CPU_GENERIC;
}
void get_libname(void)
{
int d = detect();
printf("%s", cpuname_lower[d]);
}
char *get_corename(void)
{
return cpuname[detect()];
}
void get_architecture(void)
{
printf("ZARCH");
}
void get_subarchitecture(void)
{
int d = detect();
printf("%s", cpuname[d]);
}
void get_subdirname(void)
{
printf("zarch");
}
void get_cpuconfig(void)
{
int d = detect();
switch (d){
case CPU_GENERIC:
printf("#define ZARCH_GENERIC\n");
printf("#define DTB_DEFAULT_ENTRIES 64\n");
break;
case CPU_Z13:
printf("#define Z13\n");
printf("#define DTB_DEFAULT_ENTRIES 64\n");
break;
}
}

View File

@@ -105,12 +105,16 @@ ARCH_X86_64
ARCH_POWER
#endif
#if defined(__s390x__) || defined(__zarch__)
ARCH_ZARCH
#endif
#ifdef __mips64
ARCH_MIPS64
#endif
#if defined(__mips32) || defined(__mips)
ARCH_MIPS32
ARCH_MIPS
#endif
#ifdef __alpha

View File

@@ -1,4 +1,4 @@
include_directories(${CMAKE_SOURCE_DIR})
include_directories(${PROJECT_SOURCE_DIR})
enable_language(Fortran)

View File

@@ -42,6 +42,7 @@ ztestl3o_3m = c_zblas3_3m.o c_z3chke_3m.o auxiliary.o c_xerbla.o constant.o
all :: all1 all2 all3
all1: xscblat1 xdcblat1 xccblat1 xzcblat1
ifndef CROSS
ifeq ($(USE_OPENMP), 1)
OMP_NUM_THREADS=2 ./xscblat1
OMP_NUM_THREADS=2 ./xdcblat1
@@ -53,8 +54,10 @@ else
OPENBLAS_NUM_THREADS=2 ./xccblat1
OPENBLAS_NUM_THREADS=2 ./xzcblat1
endif
endif
all2: xscblat2 xdcblat2 xccblat2 xzcblat2
ifndef CROSS
ifeq ($(USE_OPENMP), 1)
OMP_NUM_THREADS=2 ./xscblat2 < sin2
OMP_NUM_THREADS=2 ./xdcblat2 < din2
@@ -66,8 +69,10 @@ else
OPENBLAS_NUM_THREADS=2 ./xccblat2 < cin2
OPENBLAS_NUM_THREADS=2 ./xzcblat2 < zin2
endif
endif
all3: xscblat3 xdcblat3 xccblat3 xzcblat3
ifndef CROSS
ifeq ($(USE_OPENMP), 1)
OMP_NUM_THREADS=2 ./xscblat3 < sin3
OMP_NUM_THREADS=2 ./xdcblat3 < din3
@@ -88,6 +93,7 @@ else
OPENBLAS_NUM_THREADS=2 ./xccblat3_3m < cin3_3m
OPENBLAS_NUM_THREADS=2 ./xzcblat3_3m < zin3_3m
endif
endif

View File

@@ -1,5 +1,5 @@
include_directories(${CMAKE_SOURCE_DIR})
include_directories(${PROJECT_SOURCE_DIR})
# sources that need to be compiled twice, once with no flags and once with LOWER
set(UL_SOURCES

View File

@@ -177,7 +177,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG ku, BLASLONG kl, FLOAT *alpha, FLOAT
blas_arg_t args;
blas_queue_t queue[MAX_CPU_NUMBER];
BLASLONG range_m[MAX_CPU_NUMBER];
BLASLONG range_m[MAX_CPU_NUMBER + 1];
BLASLONG range_n[MAX_CPU_NUMBER + 1];
BLASLONG width, i, num_cpu;

View File

@@ -177,7 +177,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x
#endif
blas_arg_t args;
blas_queue_t queue[MAX_CPU_NUMBER];
blas_queue_t queue[MAX_CPU_NUMBER + 1];
BLASLONG range_m[MAX_CPU_NUMBER + 1];
BLASLONG range_n[MAX_CPU_NUMBER];

View File

@@ -182,7 +182,7 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *y,
blas_arg_t args;
blas_queue_t queue[MAX_CPU_NUMBER];
BLASLONG range_m[MAX_CPU_NUMBER + 1];
BLASLONG range_n[MAX_CPU_NUMBER];
BLASLONG range_n[MAX_CPU_NUMBER + 1];
BLASLONG width, i, num_cpu;

View File

@@ -221,7 +221,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc
blas_arg_t args;
blas_queue_t queue[MAX_CPU_NUMBER];
BLASLONG range_m[MAX_CPU_NUMBER + 1];
BLASLONG range_n[MAX_CPU_NUMBER];
BLASLONG range_n[MAX_CPU_NUMBER + 1];
BLASLONG width, i, num_cpu;

View File

@@ -243,7 +243,7 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *x, BLASLONG incx, FLOAT *buffer, int nthr
blas_arg_t args;
blas_queue_t queue[MAX_CPU_NUMBER];
BLASLONG range_m[MAX_CPU_NUMBER + 1];
BLASLONG range_n[MAX_CPU_NUMBER];
BLASLONG range_n[MAX_CPU_NUMBER + 1];
BLASLONG width, i, num_cpu;

View File

@@ -281,7 +281,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *bu
blas_arg_t args;
blas_queue_t queue[MAX_CPU_NUMBER];
BLASLONG range_m[MAX_CPU_NUMBER + 1];
BLASLONG range_n[MAX_CPU_NUMBER];
BLASLONG range_n[MAX_CPU_NUMBER + 1];
BLASLONG width, i, num_cpu;

View File

@@ -1,4 +1,4 @@
include_directories(${CMAKE_SOURCE_DIR})
include_directories(${PROJECT_SOURCE_DIR})
# N.B. In the original makefile there was a BLOCKS define used in the compilation of these files but I don't see any evidence of it being set anywhere. -hpa

View File

@@ -316,7 +316,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
if (min_l > GEMM3M_Q) {
min_l = (min_l + 1) / 2;
#ifdef UNROLL_X
min_l = (min_l + UNROLL_X - 1) & ~(UNROLL_X - 1);
min_l = ((min_l + UNROLL_X - 1)/UNROLL_X) * UNROLL_X;
#endif
}
}
@@ -326,7 +326,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
min_i = GEMM3M_P;
} else {
if (min_i > GEMM3M_P) {
min_i = (min_i / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1);
min_i = ((min_i / 2 + GEMM3M_UNROLL_M - 1)/GEMM3M_UNROLL_M) * GEMM3M_UNROLL_M;
}
}
@@ -365,7 +365,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
min_i = GEMM3M_P;
} else
if (min_i > GEMM3M_P) {
min_i = (min_i / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1);
min_i = ((min_i / 2 + GEMM3M_UNROLL_M - 1)/GEMM3M_UNROLL_M) * GEMM3M_UNROLL_M;
}
START_RPCC();
@@ -386,7 +386,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
min_i = GEMM3M_P;
} else {
if (min_i > GEMM3M_P) {
min_i = (min_i / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1);
min_i = ((min_i / 2 + GEMM3M_UNROLL_M - 1)/GEMM3M_UNROLL_M) * GEMM3M_UNROLL_M;
}
}
@@ -429,7 +429,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
min_i = GEMM3M_P;
} else
if (min_i > GEMM3M_P) {
min_i = (min_i / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1);
min_i = ((min_i / 2 + GEMM3M_UNROLL_M - 1)/GEMM3M_UNROLL_M) * GEMM3M_UNROLL_M;
}
START_RPCC();
@@ -451,7 +451,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
min_i = GEMM3M_P;
} else {
if (min_i > GEMM3M_P) {
min_i = (min_i / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1);
min_i = ((min_i / 2 + GEMM3M_UNROLL_M - 1)/GEMM3M_UNROLL_M) * GEMM3M_UNROLL_M;
}
}
@@ -494,7 +494,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
min_i = GEMM3M_P;
} else
if (min_i > GEMM3M_P) {
min_i = (min_i / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1);
min_i = ((min_i / 2 + GEMM3M_UNROLL_M - 1)/GEMM3M_UNROLL_M) * GEMM3M_UNROLL_M;
}
START_RPCC();

View File

@@ -297,9 +297,9 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
min_l = GEMM_Q;
} else {
if (min_l > GEMM_Q) {
min_l = (min_l / 2 + GEMM_UNROLL_M - 1) & ~(GEMM_UNROLL_M - 1);
min_l = ((min_l / 2 + GEMM_UNROLL_M - 1)/GEMM_UNROLL_M) * GEMM_UNROLL_M;
}
gemm_p = ((l2size / min_l + GEMM_UNROLL_M - 1) & ~(GEMM_UNROLL_M - 1));
gemm_p = ((l2size / min_l + GEMM_UNROLL_M - 1)/GEMM_UNROLL_M) * GEMM_UNROLL_M;
while (gemm_p * min_l > l2size) gemm_p -= GEMM_UNROLL_M;
}
@@ -311,7 +311,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
min_i = GEMM_P;
} else {
if (min_i > GEMM_P) {
min_i = (min_i / 2 + GEMM_UNROLL_M - 1) & ~(GEMM_UNROLL_M - 1);
min_i = ((min_i / 2 + GEMM_UNROLL_M - 1)/GEMM_UNROLL_M) * GEMM_UNROLL_M;
} else {
l1stride = 0;
}
@@ -369,7 +369,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
min_i = GEMM_P;
} else
if (min_i > GEMM_P) {
min_i = (min_i / 2 + GEMM_UNROLL_M - 1) & ~(GEMM_UNROLL_M - 1);
min_i = ((min_i / 2 + GEMM_UNROLL_M - 1)/GEMM_UNROLL_M) * GEMM_UNROLL_M;
}
START_RPCC();

View File

@@ -365,7 +365,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
buffer[0] = sb;
for (i = 1; i < DIVIDE_RATE; i++) {
buffer[i] = buffer[i - 1] + GEMM3M_Q * ((div_n + GEMM3M_UNROLL_N - 1) & ~(GEMM3M_UNROLL_N - 1));
buffer[i] = buffer[i - 1] + GEMM3M_Q * (((div_n + GEMM3M_UNROLL_N - 1)/GEMM3M_UNROLL_N) * GEMM3M_UNROLL_N);
}
for(ls = 0; ls < k; ls += min_l){
@@ -384,7 +384,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
min_i = GEMM3M_P;
} else {
if (min_i > GEMM3M_P) {
min_i = (min_i / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1);
min_i = ((min_i / 2 + GEMM3M_UNROLL_M - 1)/GEMM3M_UNROLL_M) * GEMM3M_UNROLL_M;
}
}
@@ -482,7 +482,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
min_i = GEMM3M_P;
} else
if (min_i > GEMM3M_P) {
min_i = ((min_i + 1) / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1);
min_i = (((min_i + 1) / 2 + GEMM3M_UNROLL_M - 1)/GEMM3M_UNROLL_M) * GEMM3M_UNROLL_M;
}
START_RPCC();
@@ -618,7 +618,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
min_i = GEMM3M_P;
} else
if (min_i > GEMM3M_P) {
min_i = ((min_i + 1) / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1);
min_i = (((min_i + 1) / 2 + GEMM3M_UNROLL_M - 1)/GEMM3M_UNROLL_M) * GEMM3M_UNROLL_M;
}
START_RPCC();
@@ -754,7 +754,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
min_i = GEMM3M_P;
} else
if (min_i > GEMM3M_P) {
min_i = ((min_i + 1) / 2 + GEMM3M_UNROLL_M - 1) & ~(GEMM3M_UNROLL_M - 1);
min_i = (((min_i + 1) / 2 + GEMM3M_UNROLL_M - 1)/GEMM3M_UNROLL_M) * GEMM3M_UNROLL_M;
}
START_RPCC();

View File

@@ -189,7 +189,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
min_i = GEMM_P;
} else
if (min_i > GEMM_P) {
min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
min_i = ((min_i / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
}
#ifndef LOWER
@@ -230,7 +230,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
min_i = GEMM_P;
} else
if (min_i > GEMM_P) {
min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
min_i = ((min_i / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
}
ICOPY_OPERATION(min_l, min_i, a, lda, ls, is, sa);
@@ -245,7 +245,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
min_i = GEMM_P;
} else
if (min_i > GEMM_P) {
min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
min_i = ((min_i / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
}
if (m_start >= js) {
@@ -284,7 +284,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
min_i = GEMM_P;
} else
if (min_i > GEMM_P) {
min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
min_i = ((min_i / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
}
ICOPY_OPERATION(min_l, min_i, b, ldb, ls, is, sa);
@@ -322,7 +322,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
min_i = GEMM_P;
} else
if (min_i > GEMM_P) {
min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
min_i = ((min_i / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
}
aa = sb + min_l * (is - js) * COMPSIZE;
@@ -353,7 +353,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
min_i = GEMM_P;
} else
if (min_i > GEMM_P) {
min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
min_i = ((min_i / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
}
aa = sb + min_l * (m_start - js) * COMPSIZE;
@@ -383,7 +383,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
min_i = GEMM_P;
} else
if (min_i > GEMM_P) {
min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
min_i = ((min_i / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
}
aa = sb + min_l * (is - js) * COMPSIZE;

View File

@@ -198,7 +198,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
min_i = GEMM_P;
} else
if (min_i > GEMM_P) {
min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
min_i = ((min_i / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
}
#ifndef LOWER
@@ -239,7 +239,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
min_i = GEMM_P;
} else
if (min_i > GEMM_P) {
min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
min_i = ((min_i / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
}
aa = sb + min_l * (is - js) * COMPSIZE;
@@ -303,7 +303,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
min_i = GEMM_P;
} else
if (min_i > GEMM_P) {
min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
min_i = ((min_i / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
}
START_RPCC();
@@ -375,7 +375,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
min_i = GEMM_P;
} else
if (min_i > GEMM_P) {
min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
min_i = ((min_i / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
}
if (is < js + min_j) {
@@ -460,7 +460,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
min_i = GEMM_P;
} else
if (min_i > GEMM_P) {
min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
min_i = ((min_i / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
}
START_RPCC();

View File

@@ -210,8 +210,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
fprintf(stderr, "Thread[%ld] m_from : %ld m_to : %ld n_from : %ld n_to : %ld\n", mypos, m_from, m_to, n_from, n_to);
#endif
div_n = ((m_to - m_from + DIVIDE_RATE - 1) / DIVIDE_RATE
+ GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
div_n = (((m_to - m_from + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
buffer[0] = sb;
for (i = 1; i < DIVIDE_RATE; i++) {
@@ -233,7 +232,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
min_i = GEMM_P;
} else {
if (min_i > GEMM_P) {
min_i = (min_i / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
min_i = ((min_i / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
}
}
@@ -253,8 +252,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
STOP_RPCC(copy_A);
div_n = ((m_to - m_from + DIVIDE_RATE - 1) / DIVIDE_RATE
+ GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
div_n = (((m_to - m_from + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
for (xxx = m_from, bufferside = 0; xxx < m_to; xxx += div_n, bufferside ++) {
@@ -353,9 +351,8 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
while (current >= 0) {
#endif
div_n = ((range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE
+ GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
div_n = (((range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {
START_RPCC();
@@ -412,7 +409,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
min_i = GEMM_P;
} else
if (min_i > GEMM_P) {
min_i = ((min_i + 1) / 2 + GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
min_i = (((min_i + 1) / 2 + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
}
START_RPCC();
@@ -425,8 +422,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
do {
div_n = ((range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE
+ GEMM_UNROLL_MN - 1) & ~(GEMM_UNROLL_MN - 1);
div_n = (((range_n[current + 1] - range_n[current] + DIVIDE_RATE - 1) / DIVIDE_RATE + GEMM_UNROLL_MN - 1)/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
for (xxx = range_n[current], bufferside = 0; xxx < range_n[current + 1]; xxx += div_n, bufferside ++) {
@@ -602,9 +598,9 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
double di = (double)i;
width = (((BLASLONG)(sqrt(di * di + dnum) - di) + mask) & ~mask);
width = (((BLASLONG)((sqrt(di * di + dnum) - di) + mask)/(mask+1)) * (mask+1) );
if (num_cpu == 0) width = n - ((n - width) & ~mask);
if (num_cpu == 0) width = n - (((n - width)/(mask+1)) * (mask+1) );
if ((width > n - i) || (width < mask)) width = n - i;
@@ -644,7 +640,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
double di = (double)i;
width = (((BLASLONG)(sqrt(di * di + dnum) - di) + mask) & ~mask);
width = (((BLASLONG)((sqrt(di * di + dnum) - di) + mask)/(mask+1)) * (mask+1));
if ((width > n - i) || (width < mask)) width = n - i;

View File

@@ -310,7 +310,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
buffer[0] = sb;
for (i = 1; i < DIVIDE_RATE; i++) {
buffer[i] = buffer[i - 1] + GEMM_Q * ((div_n + GEMM_UNROLL_N - 1) & ~(GEMM_UNROLL_N - 1)) * COMPSIZE;
buffer[i] = buffer[i - 1] + GEMM_Q * ((div_n + GEMM_UNROLL_N - 1)/GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE;
}
@@ -331,7 +331,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
min_i = GEMM_P;
} else {
if (min_i > GEMM_P) {
min_i = (min_i / 2 + GEMM_UNROLL_M - 1) & ~(GEMM_UNROLL_M - 1);
min_i = ((min_i / 2 + GEMM_UNROLL_M - 1)/GEMM_UNROLL_M) * GEMM_UNROLL_M;
} else {
if (args -> nthreads == 1) l1stride = 0;
}
@@ -443,7 +443,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
min_i = GEMM_P;
} else
if (min_i > GEMM_P) {
min_i = ((min_i + 1) / 2 + GEMM_UNROLL_M - 1) & ~(GEMM_UNROLL_M - 1);
min_i = (((min_i + 1) / 2 + GEMM_UNROLL_M - 1)/GEMM_UNROLL_M) * GEMM_UNROLL_M;
}
START_RPCC();

View File

@@ -158,7 +158,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r,
int mm, nn;
mm = (loop & ~(GEMM_UNROLL_MN - 1));
mm = (loop/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
nn = MIN(GEMM_UNROLL_MN, n - loop);
#ifndef LOWER

View File

@@ -109,7 +109,7 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (
if (nthreads - num_cpu > 1) {
di = (double)i;
width = ((BLASLONG)( sqrt(di * di + dnum) - di) + mask) & ~mask;
width = (BLASLONG)(( sqrt(di * di + dnum) - di + mask)/(mask+1)) * (mask+1);
if ((width <= 0) || (width > n_to - i)) width = n_to - i;
@@ -149,7 +149,7 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (
if (nthreads - num_cpu > 1) {
di = (double)(arg -> n - i);
width = ((BLASLONG)(-sqrt(di * di + dnum) + di) + mask) & ~mask;
width = ((BLASLONG)((-sqrt(di * di + dnum) + di) + mask)/(mask+1)) * (mask+1);
if ((width <= 0) || (width > n_to - i)) width = n_to - i;

View File

@@ -149,7 +149,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r, FLOAT alpha_i,
int mm, nn;
mm = (loop & ~(GEMM_UNROLL_MN - 1));
mm = (loop/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
nn = MIN(GEMM_UNROLL_MN, n - loop);
#ifndef LOWER

View File

@@ -132,7 +132,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha_r,
int mm, nn;
mm = (loop & ~(GEMM_UNROLL_MN - 1));
mm = (loop/GEMM_UNROLL_MN) * GEMM_UNROLL_MN;
nn = MIN(GEMM_UNROLL_MN, n - loop);
#ifndef LOWER

View File

@@ -1,4 +1,4 @@
include_directories(${CMAKE_SOURCE_DIR})
include_directories(${PROJECT_SOURCE_DIR})
if (${CORE} STREQUAL "PPC440")
set(MEMORY memory_qalloc.c)
@@ -12,6 +12,8 @@ if (SMP)
set(BLAS_SERVER blas_server_omp.c)
elseif (${CMAKE_SYSTEM_NAME} STREQUAL "Windows")
set(BLAS_SERVER blas_server_win32.c)
elseif (${CMAKE_SYSTEM_NAME} STREQUAL "WindowsStore")
set(BLAS_SERVER blas_server_win32.c)
endif ()
if (NOT DEFINED BLAS_SERVER)

View File

@@ -110,3 +110,74 @@ int blas_level1_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha
return 0;
}
int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha,
void *a, BLASLONG lda,
void *b, BLASLONG ldb,
void *c, BLASLONG ldc, int (*function)(), int nthreads){
blas_queue_t queue[MAX_CPU_NUMBER];
blas_arg_t args [MAX_CPU_NUMBER];
BLASLONG i, width, astride, bstride;
int num_cpu, calc_type;
calc_type = (mode & BLAS_PREC) + ((mode & BLAS_COMPLEX) != 0) + 2;
mode |= BLAS_LEGACY;
for (i = 0; i < nthreads; i++) blas_queue_init(&queue[i]);
num_cpu = 0;
i = m;
while (i > 0){
/* Adjust Parameters */
width = blas_quickdivide(i + nthreads - num_cpu - 1,
nthreads - num_cpu);
i -= width;
if (i < 0) width = width + i;
astride = width * lda;
if (!(mode & BLAS_TRANSB_T)) {
bstride = width * ldb;
} else {
bstride = width;
}
astride <<= calc_type;
bstride <<= calc_type;
args[num_cpu].m = width;
args[num_cpu].n = n;
args[num_cpu].k = k;
args[num_cpu].a = (void *)a;
args[num_cpu].b = (void *)b;
args[num_cpu].c = (void *)((char *)c + num_cpu * sizeof(double)*2);
args[num_cpu].lda = lda;
args[num_cpu].ldb = ldb;
args[num_cpu].ldc = ldc;
args[num_cpu].alpha = alpha;
queue[num_cpu].mode = mode;
queue[num_cpu].routine = function;
queue[num_cpu].args = &args[num_cpu];
queue[num_cpu].next = &queue[num_cpu + 1];
a = (void *)((BLASULONG)a + astride);
b = (void *)((BLASULONG)b + bstride);
num_cpu ++;
}
if (num_cpu) {
queue[num_cpu - 1].next = NULL;
exec_blas(num_cpu, queue);
}
return 0;
}

View File

@@ -70,7 +70,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/*********************************************************************/
#include "common.h"
#if defined(OS_LINUX) || defined(OS_NETBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_SUNOS)
#if defined(OS_LINUX) || defined(OS_NETBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_SUNOS) || defined(OS_FREEBSD)
#include <dlfcn.h>
#include <signal.h>
#include <sys/resource.h>
@@ -276,6 +276,9 @@ static void* blas_thread_server(void *arg){
unsigned int last_tick;
void *buffer, *sa, *sb;
blas_queue_t *queue;
blas_queue_t *tscq;
#ifdef TIMING_DEBUG
unsigned long start, stop;
#endif
@@ -309,8 +312,11 @@ static void* blas_thread_server(void *arg){
last_tick = (unsigned int)rpcc();
while (!thread_status[cpu].queue) {
pthread_mutex_lock (&thread_status[cpu].lock);
tscq=thread_status[cpu].queue;
pthread_mutex_unlock (&thread_status[cpu].lock);
while(!tscq) {
YIELDING;
if ((unsigned int)rpcc() - last_tick > thread_timeout) {
@@ -333,6 +339,9 @@ static void* blas_thread_server(void *arg){
last_tick = (unsigned int)rpcc();
}
pthread_mutex_lock (&thread_status[cpu].lock);
tscq=thread_status[cpu].queue;
pthread_mutex_unlock (&thread_status[cpu].lock);
}
@@ -351,7 +360,9 @@ static void* blas_thread_server(void *arg){
if (queue) {
int (*routine)(blas_arg_t *, void *, void *, void *, void *, BLASLONG) = queue -> routine;
pthread_mutex_lock (&thread_status[cpu].lock);
thread_status[cpu].queue = (blas_queue_t *)1;
pthread_mutex_unlock (&thread_status[cpu].lock);
sa = queue -> sa;
sb = queue -> sb;
@@ -433,7 +444,10 @@ static void* blas_thread_server(void *arg){
// thread is marked as done and other threads use them
WMB;
pthread_mutex_lock (&thread_status[cpu].lock);
thread_status[cpu].queue = (blas_queue_t * volatile) ((long)thread_status[cpu].queue & 0); /* Need a trick */
pthread_mutex_unlock (&thread_status[cpu].lock);
WMB;
}
@@ -613,6 +627,7 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){
#endif
BLASLONG i = 0;
blas_queue_t *current = queue;
blas_queue_t *tsiq,*tspq;
#if defined(OS_LINUX) && !defined(NO_AFFINITY) && !defined(PARAMTEST)
int node = get_node();
int nodes = get_num_nodes();
@@ -660,15 +675,23 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){
}
}
#else
while(thread_status[i].queue) {
pthread_mutex_lock (&thread_status[i].lock);
tsiq=thread_status[i].queue ;
pthread_mutex_unlock (&thread_status[i].lock);
while(tsiq) {
i ++;
if (i >= blas_num_threads - 1) i = 0;
pthread_mutex_lock (&thread_status[i].lock);
tsiq=thread_status[i].queue ;
pthread_mutex_unlock (&thread_status[i].lock);
}
#endif
queue -> assigned = i;
WMB;
pthread_mutex_lock (&thread_status[i].lock);
thread_status[i].queue = queue;
pthread_mutex_unlock (&thread_status[i].lock);
WMB;
queue = queue -> next;
@@ -689,11 +712,15 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){
pos = current -> assigned;
if ((BLASULONG)thread_status[pos].queue > 1) {
pthread_mutex_lock (&thread_status[pos].lock);
tspq=thread_status[pos].queue;
pthread_mutex_unlock (&thread_status[pos].lock);
if ((BLASULONG)tspq > 1) {
pthread_mutex_lock (&thread_status[pos].lock);
if (thread_status[pos].status == THREAD_STATUS_SLEEP) {
pthread_mutex_lock (&thread_status[pos].lock);
#ifdef MONITOR
num_suspend ++;
@@ -703,8 +730,9 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){
thread_status[pos].status = THREAD_STATUS_WAKEUP;
pthread_cond_signal(&thread_status[pos].wakeup);
}
pthread_mutex_unlock(&thread_status[pos].lock);
}
pthread_mutex_unlock(&thread_status[pos].lock);
}
current = current -> next;
@@ -714,11 +742,22 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){
}
int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue){
blas_queue_t * tsqq;
while ((num > 0) && queue) {
while(thread_status[queue -> assigned].queue) {
pthread_mutex_lock(&thread_status[queue->assigned].lock);
tsqq=thread_status[queue -> assigned].queue;
pthread_mutex_unlock(&thread_status[queue->assigned].lock);
while(tsqq) {
YIELDING;
pthread_mutex_lock(&thread_status[queue->assigned].lock);
tsqq=thread_status[queue -> assigned].queue;
pthread_mutex_unlock(&thread_status[queue->assigned].lock);
};
queue = queue -> next;

View File

@@ -443,8 +443,11 @@ int BLASFUNC(blas_thread_shutdown)(void){
SetEvent(pool.killed);
for(i = 0; i < blas_num_threads - 1; i++){
WaitForSingleObject(blas_threads[i], 5); //INFINITE);
TerminateThread(blas_threads[i],0);
WaitForSingleObject(blas_threads[i], 5); //INFINITE);
#ifndef OS_WINDOWSSTORE
// TerminateThread is only available with WINAPI_DESKTOP and WINAPI_SYSTEM not WINAPI_APP in UWP
TerminateThread(blas_threads[i],0);
#endif
}
blas_server_avail = 0;

View File

@@ -70,8 +70,10 @@ extern gotoblas_t gotoblas_STEAMROLLER;
extern gotoblas_t gotoblas_EXCAVATOR;
#ifdef NO_AVX2
#define gotoblas_HASWELL gotoblas_SANDYBRIDGE
#define gotoblas_ZEN gotoblas_SANDYBRIDGE
#else
extern gotoblas_t gotoblas_HASWELL;
extern gotoblas_t gotoblas_ZEN;
#endif
#else
//Use NEHALEM kernels for sandy bridge
@@ -81,6 +83,7 @@ extern gotoblas_t gotoblas_HASWELL;
#define gotoblas_PILEDRIVER gotoblas_BARCELONA
#define gotoblas_STEAMROLLER gotoblas_BARCELONA
#define gotoblas_EXCAVATOR gotoblas_BARCELONA
#define gotoblas_ZEN gotoblas_BARCELONA
#endif
@@ -232,6 +235,7 @@ static gotoblas_t *get_coretype(void){
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
}
}
if (model == 7) return &gotoblas_ATOM; //Bay Trail
return NULL;
case 4:
//Intel Haswell
@@ -261,9 +265,8 @@ static gotoblas_t *get_coretype(void){
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
}
}
//Intel Avoton
if (model == 13) {
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
//Intel Braswell / Avoton
if (model == 12 || model == 13) {
return &gotoblas_NEHALEM;
}
return NULL;
@@ -286,6 +289,30 @@ static gotoblas_t *get_coretype(void){
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
}
}
//Intel Phi Knights Landing
if (model == 7) {
if(support_avx())
return &gotoblas_HASWELL;
else{
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
}
}
//Apollo Lake
if (model == 12) {
return &gotoblas_NEHALEM;
}
return NULL;
case 9:
case 8:
if (model == 14 ) { // Kaby Lake
if(support_avx())
return &gotoblas_HASWELL;
else{
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
}
}
return NULL;
}
case 0xf:
@@ -331,7 +358,14 @@ static gotoblas_t *get_coretype(void){
openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
}
}else if(model == 0){
}else if(model == 5){
if(support_avx())
return &gotoblas_EXCAVATOR;
else{
openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
}
}else if(model == 0 || model == 8){
if (exmodel == 1) {
//AMD Trinity
if(support_avx())
@@ -358,9 +392,16 @@ static gotoblas_t *get_coretype(void){
}
}
} else {
} else if (exfamily == 8) {
if (model == 1) {
if(support_avx())
return &gotoblas_ZEN;
else{
openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
}
}
}else {
return &gotoblas_BARCELONA;
}
}
@@ -370,7 +411,6 @@ static gotoblas_t *get_coretype(void){
switch (family) {
case 0x6:
return &gotoblas_NANO;
break;
}
}
@@ -401,6 +441,7 @@ static char *corename[] = {
"Haswell",
"Steamroller",
"Excavator",
"Zen"
};
char *gotoblas_corename(void) {
@@ -427,6 +468,7 @@ char *gotoblas_corename(void) {
if (gotoblas == &gotoblas_HASWELL) return corename[20];
if (gotoblas == &gotoblas_STEAMROLLER) return corename[21];
if (gotoblas == &gotoblas_EXCAVATOR) return corename[22];
if (gotoblas == &gotoblas_ZEN) return corename[23];
return corename[0];
}
@@ -439,7 +481,7 @@ static gotoblas_t *force_coretype(char *coretype){
char message[128];
//char mname[20];
for ( i=1 ; i <= 21; i++)
for ( i=1 ; i <= 23; i++)
{
if (!strncasecmp(coretype,corename[i],20))
{
@@ -457,6 +499,7 @@ static gotoblas_t *force_coretype(char *coretype){
switch (found)
{
case 23: return (&gotoblas_ZEN);
case 22: return (&gotoblas_EXCAVATOR);
case 21: return (&gotoblas_STEAMROLLER);
case 20: return (&gotoblas_HASWELL);

View File

@@ -354,6 +354,24 @@ static int numa_check(void) {
return common -> num_nodes;
}
#if defined(__GLIBC_PREREQ)
#if !__GLIBC_PREREQ(2, 6)
int sched_getcpu(void)
{
int cpu;
FILE *fp = NULL;
if ( (fp = fopen("/proc/self/stat", "r")) == NULL)
return -1;
if ( fscanf( fp, "%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%*s%d", &cpu) != 1) {
fclose (fp);
return -1;
}
fclose (fp);
return(cpu);
}
#endif
#endif
static void numa_mapping(void) {
int node, cpu, core;
@@ -361,6 +379,9 @@ static void numa_mapping(void) {
unsigned long work, bit;
int count = 0;
int bitmask_idx = 0;
int current_cpu;
int current_node = 0;
int cpu_count = 0;
for (node = 0; node < common -> num_nodes; node ++) {
core = 0;
@@ -382,33 +403,84 @@ static void numa_mapping(void) {
fprintf(stderr, "CPU (%2d) : %08lx\n", cpu, common -> cpu_info[cpu]);
#endif
h = 1;
current_cpu = sched_getcpu();
for (cpu = 0; cpu < count; cpu++) {
if (READ_CPU(common -> cpu_info[cpu]) == current_cpu) {
current_node = READ_NODE(common -> cpu_info[cpu]);
break;
}
}
for (i = 0; i < MAX_BITMASK_LEN; i++)
cpu_count += popcount(common -> node_info[current_node][i] & common -> avail[i]);
while (h < count) h = 2 * h + 1;
/*
* If all the processes can be accommodated in the
* in the current node itself, then bind to cores
* from the current node only
*/
if (numprocs <= cpu_count) {
/*
* First sort all the cores in order from the current node.
* Then take remaining nodes one by one in order,
* and sort their cores in order.
*/
for (i = 0; i < count; i++) {
for (j = 0; j < count - 1; j++) {
int node_1, node_2;
int core_1, core_2;
int swap = 0;
while (h > 1) {
h /= 2;
for (i = h; i < count; i++) {
work = common -> cpu_info[i];
bit = CPU_ISSET(i, &cpu_orig_mask[0]);
j = i - h;
while (work < common -> cpu_info[j]) {
common -> cpu_info[j + h] = common -> cpu_info[j];
if (CPU_ISSET(j, &cpu_orig_mask[0])) {
CPU_SET(j + h, &cpu_orig_mask[0]);
} else {
CPU_CLR(j + h, &cpu_orig_mask[0]);
}
j -= h;
if (j < 0) break;
}
common -> cpu_info[j + h] = work;
if (bit) {
CPU_SET(j + h, &cpu_orig_mask[0]);
} else {
CPU_CLR(j + h, &cpu_orig_mask[0]);
node_1 = READ_NODE(common -> cpu_info[j]);
node_2 = READ_NODE(common -> cpu_info[j + 1]);
core_1 = READ_CORE(common -> cpu_info[j]);
core_2 = READ_CORE(common -> cpu_info[j + 1]);
if (node_1 == node_2) {
if (core_1 > core_2)
swap = 1;
} else {
if ((node_2 == current_node) ||
((node_1 != current_node) && (node_1 > node_2)))
swap = 1;
}
if (swap) {
unsigned long temp;
temp = common->cpu_info[j];
common->cpu_info[j] = common->cpu_info[j + 1];
common->cpu_info[j + 1] = temp;
}
}
}
} else {
h = 1;
while (h < count) h = 2 * h + 1;
while (h > 1) {
h /= 2;
for (i = h; i < count; i++) {
work = common -> cpu_info[i];
bit = CPU_ISSET(i, &cpu_orig_mask[0]);
j = i - h;
while (work < common -> cpu_info[j]) {
common -> cpu_info[j + h] = common -> cpu_info[j];
if (CPU_ISSET(j, &cpu_orig_mask[0])) {
CPU_SET(j + h, &cpu_orig_mask[0]);
} else {
CPU_CLR(j + h, &cpu_orig_mask[0]);
}
j -= h;
if (j < 0) break;
}
common -> cpu_info[j + h] = work;
if (bit) {
CPU_SET(j + h, &cpu_orig_mask[0]);
} else {
CPU_CLR(j + h, &cpu_orig_mask[0]);
}
}
}
}
@@ -416,7 +488,10 @@ static void numa_mapping(void) {
fprintf(stderr, "\nSorting ...\n\n");
for (cpu = 0; cpu < count; cpu++)
fprintf(stderr, "CPU (%2d) : %08lx\n", cpu, common -> cpu_info[cpu]);
fprintf(stderr, "CPUINFO (%2d) : %08lx (CPU=%3lu CORE=%3lu NODE=%3lu)\n", cpu, common -> cpu_info[cpu],
READ_CPU(common -> cpu_info[cpu]),
READ_CORE(common -> cpu_info[cpu]),
READ_NODE(common -> cpu_info[cpu]));
#endif
}
@@ -751,13 +826,12 @@ void gotoblas_affinity_init(void) {
common -> shmid = pshmid;
if (common -> magic != SH_MAGIC) {
#ifdef DEBUG
fprintf(stderr, "Shared Memory Initialization.\n");
#endif
//returns the number of processors which are currently online
common -> num_procs = sysconf(_SC_NPROCESSORS_ONLN);;
common -> num_procs = sysconf(_SC_NPROCESSORS_CONF);;
if(common -> num_procs > MAX_CPUS) {
fprintf(stderr, "\nOpenBLAS Warining : The number of CPU/Cores(%d) is beyond the limit(%d). Terminated.\n", common->num_procs, MAX_CPUS);
@@ -773,7 +847,7 @@ void gotoblas_affinity_init(void) {
if (common -> num_nodes > 1) numa_mapping();
common -> final_num_procs = 0;
for(i = 0; i < common -> avail_count; i++) common -> final_num_procs += rcount(common -> avail[i]) + 1; //Make the max cpu number.
for(i = 0; i < common -> avail_count; i++) common -> final_num_procs += rcount(common -> avail[i]) + 1; //Make the max cpu number.
for (cpu = 0; cpu < common -> final_num_procs; cpu ++) common -> cpu_use[cpu] = 0;
@@ -866,7 +940,7 @@ void gotoblas_set_affinity2(int threads) {};
void gotoblas_affinity_reschedule(void) {};
int get_num_procs(void) { return sysconf(_SC_NPROCESSORS_ONLN); }
int get_num_procs(void) { return sysconf(_SC_NPROCESSORS_CONF); }
int get_num_nodes(void) { return 1; }

View File

@@ -169,13 +169,13 @@ void goto_set_num_threads(int num_threads) {};
#else
#if defined(OS_LINUX) || defined(OS_SUNOS)
#if defined(OS_LINUX) || defined(OS_SUNOS) || defined(OS_NETBSD)
#ifndef NO_AFFINITY
int get_num_procs(void);
#else
int get_num_procs(void) {
static int nums = 0;
if (!nums) nums = sysconf(_SC_NPROCESSORS_ONLN);
if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);
return nums;
}
#endif
@@ -184,7 +184,7 @@ int get_num_procs(void) {
#ifdef OS_ANDROID
int get_num_procs(void) {
static int nums = 0;
if (!nums) nums = sysconf(_SC_NPROCESSORS_ONLN);
if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);
return nums;
}
#endif
@@ -381,6 +381,16 @@ static int release_pos = 0;
static int hot_alloc = 0;
#endif
/* Global lock for memory allocation */
#if defined(USE_PTHREAD_LOCK)
static pthread_mutex_t alloc_lock = PTHREAD_MUTEX_INITIALIZER;
#elif defined(USE_PTHREAD_SPINLOCK)
static pthread_spinlock_t alloc_lock = 0;
#else
static BLASULONG alloc_lock = 0UL;
#endif
#ifdef ALLOC_MMAP
static void alloc_mmap_free(struct release_t *release){
@@ -390,6 +400,8 @@ static void alloc_mmap_free(struct release_t *release){
}
}
#ifdef NO_WARMUP
static void *alloc_mmap(void *address){
@@ -406,9 +418,11 @@ static void *alloc_mmap(void *address){
}
if (map_address != (void *)-1) {
LOCK_COMMAND(&alloc_lock);
release_info[release_pos].address = map_address;
release_info[release_pos].func = alloc_mmap_free;
release_pos ++;
UNLOCK_COMMAND(&alloc_lock);
}
#ifdef OS_LINUX
@@ -550,12 +564,14 @@ static void *alloc_mmap(void *address){
#if defined(OS_LINUX) && !defined(NO_WARMUP)
}
#endif
LOCK_COMMAND(&alloc_lock);
if (map_address != (void *)-1) {
release_info[release_pos].address = map_address;
release_info[release_pos].func = alloc_mmap_free;
release_pos ++;
}
UNLOCK_COMMAND(&alloc_lock);
return map_address;
}
@@ -889,15 +905,6 @@ static void *alloc_hugetlbfile(void *address){
}
#endif
/* Global lock for memory allocation */
#if defined(USE_PTHREAD_LOCK)
static pthread_mutex_t alloc_lock = PTHREAD_MUTEX_INITIALIZER;
#elif defined(USE_PTHREAD_SPINLOCK)
static pthread_spinlock_t alloc_lock = 0;
#else
static BLASULONG alloc_lock = 0UL;
#endif
#ifdef SEEK_ADDRESS
static BLASULONG base_address = 0UL;
@@ -963,45 +970,41 @@ void *blas_memory_alloc(int procpos){
NULL,
};
void *(**func)(void *address);
LOCK_COMMAND(&alloc_lock);
if (!memory_initialized) {
LOCK_COMMAND(&alloc_lock);
if (!memory_initialized) {
#if defined(WHEREAMI) && !defined(USE_OPENMP)
for (position = 0; position < NUM_BUFFERS; position ++){
memory[position].addr = (void *)0;
memory[position].pos = -1;
memory[position].used = 0;
memory[position].lock = 0;
}
for (position = 0; position < NUM_BUFFERS; position ++){
memory[position].addr = (void *)0;
memory[position].pos = -1;
memory[position].used = 0;
memory[position].lock = 0;
}
#endif
#ifdef DYNAMIC_ARCH
gotoblas_dynamic_init();
gotoblas_dynamic_init();
#endif
#if defined(SMP) && defined(OS_LINUX) && !defined(NO_AFFINITY)
gotoblas_affinity_init();
gotoblas_affinity_init();
#endif
#ifdef SMP
if (!blas_num_threads) blas_cpu_number = blas_get_cpu_number();
if (!blas_num_threads) blas_cpu_number = blas_get_cpu_number();
#endif
#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64)
#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64)
#ifndef DYNAMIC_ARCH
blas_set_parameter();
blas_set_parameter();
#endif
#endif
memory_initialized = 1;
}
memory_initialized = 1;
UNLOCK_COMMAND(&alloc_lock);
}
UNLOCK_COMMAND(&alloc_lock);
#ifdef DEBUG
printf("Alloc Start ...\n");
@@ -1012,7 +1015,7 @@ void *blas_memory_alloc(int procpos){
mypos = WhereAmI();
position = mypos;
while (position > NUM_BUFFERS) position >>= 1;
while (position >= NUM_BUFFERS) position >>= 1;
do {
if (!memory[position].used && (memory[position].pos == mypos)) {
@@ -1034,14 +1037,14 @@ void *blas_memory_alloc(int procpos){
position = 0;
do {
if (!memory[position].used) {
/* if (!memory[position].used) { */
blas_lock(&memory[position].lock);
if (!memory[position].used) goto allocation;
blas_unlock(&memory[position].lock);
}
/* } */
position ++;
@@ -1103,7 +1106,9 @@ void *blas_memory_alloc(int procpos){
} while ((BLASLONG)map_address == -1);
LOCK_COMMAND(&alloc_lock);
memory[position].addr = map_address;
UNLOCK_COMMAND(&alloc_lock);
#ifdef DEBUG
printf(" Mapping Succeeded. %p(%d)\n", (void *)memory[position].addr, position);
@@ -1157,9 +1162,10 @@ void blas_memory_free(void *free_area){
#endif
position = 0;
LOCK_COMMAND(&alloc_lock);
while ((memory[position].addr != free_area)
&& (position < NUM_BUFFERS)) position++;
while ((position < NUM_BUFFERS) && (memory[position].addr != free_area))
position++;
if (memory[position].addr != free_area) goto error;
@@ -1171,6 +1177,7 @@ void blas_memory_free(void *free_area){
WMB;
memory[position].used = 0;
UNLOCK_COMMAND(&alloc_lock);
#ifdef DEBUG
printf("Unmap Succeeded.\n\n");
@@ -1185,6 +1192,7 @@ void blas_memory_free(void *free_area){
for (position = 0; position < NUM_BUFFERS; position++)
printf("%4ld %p : %d\n", position, memory[position].addr, memory[position].used);
#endif
UNLOCK_COMMAND(&alloc_lock);
return;
}
@@ -1471,12 +1479,30 @@ static int on_process_term(void)
#else
#pragma comment(linker, "/INCLUDE:__tls_used")
#endif
#pragma data_seg(push, old_seg)
#ifdef _WIN64
#pragma const_seg(".CRT$XLB")
#else
#pragma data_seg(".CRT$XLB")
#endif
static void (APIENTRY *dll_callback)(HINSTANCE h, DWORD ul_reason_for_call, PVOID pv) = DllMain;
#ifdef _WIN64
#pragma const_seg()
#else
#pragma data_seg()
#endif
#ifdef _WIN64
#pragma const_seg(".CRT$XTU")
#else
#pragma data_seg(".CRT$XTU")
#endif
static int(*p_process_term)(void) = on_process_term;
#pragma data_seg(pop, old_seg)
#ifdef _WIN64
#pragma const_seg()
#else
#pragma data_seg()
#endif
#endif
#if (defined(C_PGI) || (!defined(C_SUN) && defined(F_INTERFACE_SUN))) && (defined(ARCH_X86) || defined(ARCH_X86_64))

View File

@@ -167,7 +167,7 @@ int get_L2_size(void){
#if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) || \
defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \
defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC) || \
defined(PILEDRIVER) || defined(HASWELL) || defined(STEAMROLLER)
defined(PILEDRIVER) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined(ZEN)
cpuid(0x80000006, &eax, &ebx, &ecx, &edx);
@@ -251,7 +251,7 @@ int get_L2_size(void){
void blas_set_parameter(void){
int factor;
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL) || defined(STEAMROLLER)
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined(ZEN)
int size = 16;
#else
int size = get_L2_size();
@@ -497,13 +497,13 @@ void blas_set_parameter(void){
if (xgemm_p == 0) xgemm_p = 64;
#endif
sgemm_p = (sgemm_p + SGEMM_UNROLL_M - 1) & ~(SGEMM_UNROLL_M - 1);
dgemm_p = (dgemm_p + DGEMM_UNROLL_M - 1) & ~(DGEMM_UNROLL_M - 1);
cgemm_p = (cgemm_p + CGEMM_UNROLL_M - 1) & ~(CGEMM_UNROLL_M - 1);
zgemm_p = (zgemm_p + ZGEMM_UNROLL_M - 1) & ~(ZGEMM_UNROLL_M - 1);
sgemm_p = ((sgemm_p + SGEMM_UNROLL_M - 1)/SGEMM_UNROLL_M) * SGEMM_UNROLL_M;
dgemm_p = ((dgemm_p + DGEMM_UNROLL_M - 1)/DGEMM_UNROLL_M) * DGEMM_UNROLL_M;
cgemm_p = ((cgemm_p + CGEMM_UNROLL_M - 1)/CGEMM_UNROLL_M) * CGEMM_UNROLL_M;
zgemm_p = ((zgemm_p + ZGEMM_UNROLL_M - 1)/ZGEMM_UNROLL_M) * ZGEMM_UNROLL_M;
#ifdef QUAD_PRECISION
qgemm_p = (qgemm_p + QGEMM_UNROLL_M - 1) & ~(QGEMM_UNROLL_M - 1);
xgemm_p = (xgemm_p + XGEMM_UNROLL_M - 1) & ~(XGEMM_UNROLL_M - 1);
qgemm_p = ((qgemm_p + QGEMM_UNROLL_M - 1)/QGEMM_UNROLL_M) * QGEMM_UNROLL_M;
xgemm_p = ((xgemm_p + XGEMM_UNROLL_M - 1)/XGEMM_UNROLL_M) * XGEMM_UNROLL_M;
#endif
sgemm_r = (((BUFFER_SIZE - ((SGEMM_P * SGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SGEMM_Q * 4)) - 15) & ~15;
@@ -727,3 +727,38 @@ void blas_set_parameter(void){
}
#endif
#if defined(ARCH_ARM64)
#if defined(VULCAN) || defined(THUNDERX2T99)
unsigned long dgemm_prefetch_size_a;
unsigned long dgemm_prefetch_size_b;
unsigned long dgemm_prefetch_size_c;
#endif
void blas_set_parameter(void)
{
#if defined(VULCAN) || defined(THUNDERX2T99)
dgemm_p = 160;
dgemm_q = 128;
dgemm_r = 4096;
sgemm_p = 128;
sgemm_q = 352;
sgemm_r = 4096;
cgemm_p = 128;
cgemm_q = 224;
cgemm_r = 4096;
zgemm_p = 128;
zgemm_q = 112;
zgemm_r = 4096;
dgemm_prefetch_size_a = 3584;
dgemm_prefetch_size_b = 512;
dgemm_prefetch_size_c = 128;
#endif
}
#endif

View File

@@ -46,10 +46,16 @@
#define printf _cprintf
#endif
#ifdef INTERFACE64
#define MSGFMT " ** On entry to %6s parameter number %2ld had an illegal value\n"
#else
#define MSGFMT " ** On entry to %6s parameter number %2d had an illegal value\n"
#endif
#ifdef __ELF__
int __xerbla(char *message, blasint *info, blasint length){
printf(" ** On entry to %6s parameter number %2d had an illegal value\n",
printf(MSGFMT,
message, *info);
return 0;
@@ -61,7 +67,7 @@ int BLASFUNC(xerbla)(char *, blasint *, blasint) __attribute__ ((weak, alias ("_
int BLASFUNC(xerbla)(char *message, blasint *info, blasint length){
printf(" ** On entry to %6s parameter number %2d had an illegal value\n",
printf(MSGFMT,
message, *info);
return 0;

View File

@@ -110,18 +110,24 @@ $(LIBDYNNAME) : ../$(LIBNAME).osx.renamed osx.def
endif
ifeq ($(NOFORTRAN), $(filter $(NOFORTRAN),1 2))
#only build without Fortran
$(CC) $(CFLAGS) -all_load -headerpad_max_install_names -install_name $(CURDIR)/../$(LIBDYNNAME) -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
$(CC) $(CFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(LIBDYNNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
else
$(FC) $(FFLAGS) -all_load -headerpad_max_install_names -install_name $(CURDIR)/../$(LIBDYNNAME) -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
$(FC) $(FFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(LIBDYNNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
endif
dllinit.$(SUFFIX) : dllinit.c
$(CC) $(CFLAGS) -c -o $(@F) -s $<
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS))
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android))
so : ../$(LIBSONAME)
ifeq ($(OSNAME), Android)
INTERNALNAME = $(LIBPREFIX).so
else
INTERNALNAME = $(LIBPREFIX).so.$(MAJOR_VERSION)
endif
ifeq (, $(SYMBOLPREFIX)$(SYMBOLSUFFIX))
../$(LIBSONAME) : ../$(LIBNAME) linktest.c
else
@@ -132,13 +138,13 @@ endif
ifneq ($(C_COMPILER), LSB)
$(CC) $(CFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \
-Wl,--whole-archive $< -Wl,--no-whole-archive \
-Wl,-soname,$(LIBPREFIX).so.$(MAJOR_VERSION) $(EXTRALIB)
-Wl,-soname,$(INTERNALNAME) $(EXTRALIB)
$(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK.
else
#for LSB
env LSBCC_SHAREDLIBS=gfortran $(CC) $(CFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \
-Wl,--whole-archive $< -Wl,--no-whole-archive \
-Wl,-soname,$(LIBPREFIX).so.$(MAJOR_VERSION) $(EXTRALIB)
-Wl,-soname,$(INTERNALNAME) $(EXTRALIB)
$(FC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK.
endif
rm -f linktest

61
exports/check_objs.sh Executable file
View File

@@ -0,0 +1,61 @@
#!/bin/bash
while read OBJ; do
if echo "$OBJ"|grep "_$" >/dev/null
then
[ "$OBJ" = "caxpyc_" ] && continue
[ "$OBJ" = "zaxpyc_" ] && continue
[ "$OBJ" = "blas_thread_shutdown_" ] && continue
O1=$(echo "$OBJ"|sed -e 's/_$//' )
if grep -w "$O1" exports/gensymbol >/dev/null
then
true
else
echo "$O1"
fi
continue
fi
if echo "$OBJ"|grep "^cblas" >/dev/null
then
if grep -w "$OBJ" exports/gensymbol >/dev/null
then
true
else
echo "$OBJ"
fi
continue
fi
if echo "$OBJ"|grep "^LAPACKE" >/dev/null
then
if grep -w "$OBJ" exports/gensymbol >/dev/null
then
true
else
echo "$OBJ"
fi
continue
fi
if echo "$OBJ"|grep "^lapack" >/dev/null
then
if grep -w "$OBJ" exports/gensymbol >/dev/null
then
true
else
echo "$OBJ"
fi
fi
done

File diff suppressed because it is too large Load Diff

35
f_check
View File

@@ -33,6 +33,7 @@ if ($compiler eq "") {
"ppuf77", "ppuf95", "ppuf90", "ppuxlf",
"pathf90", "pathf95",
"pgf95", "pgf90", "pgf77",
"flang",
"ifort");
OUTER:
@@ -78,8 +79,13 @@ if ($compiler eq "") {
$vendor = GFORTRAN;
$openmp = "-fopenmp";
} else {
$vendor = G77;
$openmp = "";
if ($compiler =~ /flang/) {
$vendor = FLANG;
$openmp = "-fopenmp";
} else {
$vendor = G77;
$openmp = "";
}
}
}
@@ -114,7 +120,7 @@ if ($compiler eq "") {
$openmp = "-mp";
}
if ($data =~ /IBM/) {
if ($data =~ /IBM XL/) {
$vendor = IBM;
$openmp = "-openmp";
}
@@ -197,6 +203,12 @@ if ($compiler eq "") {
$openmp = "-mp";
}
if ($compiler =~ /flang/) {
$vendor = FLANG;
$bu = "_";
$openmp = "-fopenmp";
}
if ($vendor eq "") {
$nofortran = 1;
$compiler = "gfortran";
@@ -223,7 +235,12 @@ if (!$?) {
}
#For gfortran MIPS
if ($?) {
$link = `$compiler $openmp -mabi=n32 -v ftest2.f 2>&1 && rm -f a.out a.exe`;
$mips_data = `$compiler_bin -E -dM - < /dev/null`;
if ($mips_data =~ /_MIPS_ISA_MIPS64/) {
$link = `$compiler $openmp -mabi=n32 -v ftest2.f 2>&1 && rm -f a.out a.exe`;
} else {
$link = `$compiler $openmp -mabi=32 -v ftest2.f 2>&1 && rm -f a.out a.exe`;
}
}
$binary = "" if ($?);
}
@@ -278,6 +295,12 @@ if ($link ne "") {
$linker_L .= "-Wl,". $flags . " ";
}
if ($flags =~ /^\--exclude-libs/) {
$linker_L .= "-Wl,". $flags . " ";
$flags="";
}
if ($flags =~ /^\-rpath\@/) {
$flags =~ s/\@/\,/g;
if ($vendor eq "PGI") {
@@ -320,6 +343,10 @@ if ($vendor eq "INTEL"){
$linker_a .= "-lgfortran"
}
if ($vendor eq "FLANG"){
$linker_a .= "-lflang"
}
open(MAKEFILE, ">> $makefile") || die "Can't append $makefile";
open(CONFFILE, ">> $config" ) || die "Can't append $config";

36
gen_config_h.c Normal file
View File

@@ -0,0 +1,36 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
int main(int argc, char**argv) {
FILE *fp;
char line[100];
char line2[80];
char *s;
int i;
fprintf(stdout,"#ifndef OPENBLAS_CONFIG_H\n");
fprintf(stdout,"#define OPENBLAS_CONFIG_H\n");
fp=fopen(argv[1],"r");
do{
s=fgets(line,80,fp);
if (s== NULL) break;
memset(line2,0,80);
i=sscanf(line,"#define %70c",line2);
if (i!=0) {
fprintf(stdout,"#define OPENBLAS_%s",line2);
} else {
fprintf(stdout,"\n");
}
} while (1);
fclose(fp);
fprintf(stdout,"#define OPENBLAS_VERSION \"OpenBLAS %s\"\n", VERSION);
fp=fopen(argv[2],"r");
do{
s=fgets(line,100,fp);
if (s== NULL) break;
fprintf(stdout,"%s",line);
} while(1);
fclose(fp);
fprintf(stdout,"#endif /* OPENBLAS_CONFIG_H */\n");
exit(0);
}

131
getarch.c
View File

@@ -131,6 +131,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/* #define FORCE_SICORTEX */
/* #define FORCE_LOONGSON3A */
/* #define FORCE_LOONGSON3B */
/* #define FORCE_I6400 */
/* #define FORCE_P6600 */
/* #define FORCE_P5600 */
/* #define FORCE_ITANIUM2 */
/* #define FORCE_SPARC */
/* #define FORCE_SPARCV7 */
@@ -470,6 +473,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CORENAME "EXCAVATOR"
#endif
#if defined (FORCE_ZEN)
#define FORCE
#define FORCE_INTEL
#define ARCHITECTURE "X86"
#define SUBARCHITECTURE "ZEN"
#define ARCHCONFIG "-DZEN " \
"-DL1_CODE_SIZE=32768 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=8 " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL2_CODE_ASSOCIATIVE=8 " \
"-DL2_SIZE=524288 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=8 " \
"-DL3_SIZE=16777216 -DL3_LINESIZE=64 -DL3_ASSOCIATIVE=8 " \
"-DITB_DEFAULT_ENTRIES=64 -DITB_SIZE=4096 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 " \
"-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU -DHAVE_CFLUSH " \
"-DHAVE_AVX -DHAVE_FMA3 -DFMA3"
#define LIBNAME "zen"
#define CORENAME "ZEN"
#endif
#ifdef FORCE_SSE_GENERIC
#define FORCE
@@ -699,6 +721,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#else
#endif
#ifdef FORCE_I6400
#define FORCE
#define ARCHITECTURE "MIPS"
#define SUBARCHITECTURE "I6400"
#define SUBDIRNAME "mips64"
#define ARCHCONFIG "-DI6400 " \
"-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \
"-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 "
#define LIBNAME "i6400"
#define CORENAME "I6400"
#else
#endif
#ifdef FORCE_P6600
#define FORCE
#define ARCHITECTURE "MIPS"
#define SUBARCHITECTURE "P6600"
#define SUBDIRNAME "mips64"
#define ARCHCONFIG "-DP6600 " \
"-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \
"-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 "
#define LIBNAME "p6600"
#define CORENAME "P6600"
#else
#endif
#ifdef FORCE_P5600
#define FORCE
#define ARCHITECTURE "MIPS"
#define SUBARCHITECTURE "P5600"
#define SUBDIRNAME "mips"
#define ARCHCONFIG "-DP5600 " \
"-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \
"-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 "
#define LIBNAME "p5600"
#define CORENAME "P5600"
#else
#endif
#ifdef FORCE_ITANIUM2
#define FORCE
#define ARCHITECTURE "IA64"
@@ -839,7 +903,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifdef FORCE_CORTEXA57
#define FORCE
#define ARCHITECTURE "ARM64"
#define SUBARCHITECTURE "ARMV8"
#define SUBARCHITECTURE "CORTEXA57"
#define SUBDIRNAME "arm64"
#define ARCHCONFIG "-DCORTEXA57 " \
"-DL1_CODE_SIZE=49152 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=3 " \
@@ -852,6 +916,54 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#else
#endif
#ifdef FORCE_VULCAN
#define FORCE
#define ARCHITECTURE "ARM64"
#define SUBARCHITECTURE "VULCAN"
#define SUBDIRNAME "arm64"
#define ARCHCONFIG "-DVULCAN " \
"-DL1_CODE_SIZE=32768 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=8 " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=8 " \
"-DL2_SIZE=262144 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=8 " \
"-DL3_SIZE=33554432 -DL3_LINESIZE=64 -DL3_ASSOCIATIVE=32 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON"
#define LIBNAME "vulcan"
#define CORENAME "VULCAN"
#else
#endif
#ifdef FORCE_THUNDERX
#define FORCE
#define ARCHITECTURE "ARM64"
#define SUBARCHITECTURE "THUNDERX"
#define SUBDIRNAME "arm64"
#define ARCHCONFIG "-DTHUNDERX " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=128 " \
"-DL2_SIZE=16777216 -DL2_LINESIZE=128 -DL2_ASSOCIATIVE=16 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 "
#define LIBNAME "thunderx"
#define CORENAME "THUNDERX"
#else
#endif
#ifdef FORCE_THUNDERX2T99
#define FORCE
#define ARCHITECTURE "ARM64"
#define SUBARCHITECTURE "THUNDERX2T99"
#define SUBDIRNAME "arm64"
#define ARCHCONFIG "-DTHUNDERX2T99 " \
"-DL1_CODE_SIZE=32768 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=8 " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=8 " \
"-DL2_SIZE=262144 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=8 " \
"-DL3_SIZE=33554432 -DL3_LINESIZE=64 -DL3_ASSOCIATIVE=32 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON"
#define LIBNAME "thunderx2t99"
#define CORENAME "THUNDERX2T99"
#else
#endif
#ifndef FORCE
#if defined(__powerpc__) || defined(__powerpc) || defined(powerpc) || \
@@ -862,6 +974,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define OPENBLAS_SUPPORTED
#endif
#if defined(__zarch__) || defined(__s390x__)
#define ZARCH
#include "cpuid_zarch.c"
#define OPENBLAS_SUPPORTED
#endif
#ifdef INTEL_AMD
#include "cpuid_x86.c"
#define OPENBLAS_SUPPORTED
@@ -888,7 +1006,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
#ifdef __mips__
#ifdef __mips64
#include "cpuid_mips64.c"
#else
#include "cpuid_mips.c"
#endif
#define OPENBLAS_SUPPORTED
#endif
@@ -922,7 +1044,7 @@ static int get_num_cores(void) {
#if defined(linux) || defined(__sun__)
//returns the number of processors which are currently online
return sysconf(_SC_NPROCESSORS_ONLN);
return sysconf(_SC_NPROCESSORS_CONF);
#elif defined(OS_WINDOWS)
@@ -957,7 +1079,7 @@ int main(int argc, char *argv[]){
#ifdef FORCE
printf("CORE=%s\n", CORENAME);
#else
#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__)
#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH)
printf("CORE=%s\n", get_corename());
#endif
#endif
@@ -1049,6 +1171,7 @@ int main(int argc, char *argv[]){
p ++;
}
} else {
if (*p != '\n')
printf("%c", *p);
p ++;
}
@@ -1064,7 +1187,7 @@ int main(int argc, char *argv[]){
#ifdef FORCE
printf("#define CHAR_CORENAME \"%s\"\n", CORENAME);
#else
#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__)
#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH)
printf("#define CHAR_CORENAME \"%s\"\n", get_corename());
#endif
#endif

View File

@@ -1,5 +1,5 @@
include_directories(${CMAKE_SOURCE_DIR})
include_directories(${PROJECT_SOURCE_DIR})
set(BLAS1_SOURCES

Some files were not shown because too many files have changed in this diff Show More