Compare commits

...

228 Commits

Author SHA1 Message Date
Zhang Xianyi
835293cc1a Merge branch 'hotfix-v0.2.8' 2013-08-01 23:53:12 +08:00
Zhang Xianyi
b736aa8110 Update the doc for 0.2.8 version. 2013-08-01 23:52:43 +08:00
Zhang Xianyi
ae521ecc3e OpenBLAS 0.2.8 rc1. 2013-07-31 14:49:16 +08:00
Zhang Xianyi
36adfe8d64 Merge branch 'hotfix-v0.2.8' into develop 2013-07-31 14:46:56 +08:00
Zhang Xianyi
a07cc39571 Refs #266. Fixed the compiling bug with Open64 5.0. 2013-07-31 14:41:39 +08:00
Zhang Xianyi
b5c2ac4fd6 Fixed #264 the memory leak bug in dtrtri_U. 2013-07-29 23:21:10 +08:00
Zhang Xianyi
749f45ffc8 Fixed the FMA3 detection bug. 2013-07-29 16:48:53 +08:00
Zhang Xianyi
534c5ec919 Fixed #261. Use strncmp instead of a comparing trick. 2013-07-29 16:48:35 +08:00
Zhang Xianyi
bd2da90e13 Fixed typo in getarch_2nd.c. 2013-07-29 15:42:00 +08:00
Zhang Xianyi
5b504d6c23 Refs #263. Rollback bulldozer and piledriver kernels to barcelona kernels. 2013-07-28 17:39:24 +08:00
Zhang Xianyi
a2930664f4 Refs #262. Added executable stack markings. 2013-07-28 00:09:40 +08:00
Zhang Xianyi
6e0db36373 Merge branch 'sfabbro-ldflags' into develop 2013-07-27 23:03:07 +08:00
Zhang Xianyi
1e1250b703 Fixed #260. Fixed generating 32-bit shared library on previous commit. 2013-07-27 23:01:36 +08:00
Zhang Xianyi
23186d9f21 Fixed the FMA3 detection bug. 2013-07-27 22:37:57 +08:00
Zhang Xianyi
e6ebbfd314 Merge branch 'ldflags' of https://github.com/sfabbro/OpenBLAS into sfabbro-ldflags 2013-07-27 22:19:54 +08:00
Zhang Xianyi
4471c77905 Fixed #261. Use strncmp instead of a comparing trick. 2013-07-26 23:43:54 +08:00
Sebastien Fabbro
9f0fb6e662 Respect user's LDFLAGS 2013-07-25 14:08:37 -07:00
Zhang Xianyi
f26b7a08aa Merge branch 'develop' 2013-07-26 01:34:45 +08:00
Zhang Xianyi
63f14189e3 Refs #259. Fixed missing LAPACK functions in shared library. 2013-07-26 01:32:32 +08:00
Zhang Xianyi
e39384432b Merge branch 'develop' 2013-07-23 13:40:08 +08:00
Zhang Xianyi
c5437149c0 Merge pull request #257 from staticfloat/develop
Add in return value for `interface/trtri.c`
2013-07-22 22:35:29 -07:00
Elliot Saba
6f5b395009 Fix xianyi/OpenBLAS#256 2013-07-22 17:02:06 -07:00
Zhang Xianyi
d4f9571818 Refs #255. Didn't use f77 compiler. 2013-07-22 11:34:43 +08:00
Zhang Xianyi
937d838619 Update CONTRIBUTORS.md 2013-07-20 23:32:23 +08:00
Zhang Xianyi
a8f9b6a665 Merge branch 'develop' 2013-07-20 23:05:36 +08:00
Zhang Xianyi
6209c8fc44 Fixed #253. Update doc for v0.2.7 version. 2013-07-20 23:05:12 +08:00
Zhang Xianyi
238ceb4ac0 Merge branch 'loongson3b' into develop 2013-07-20 22:33:35 +08:00
Zhang Xianyi
77b572fa0b Merge branch 'loongson3a' into develop
Conflicts:
	Makefile.system
2013-07-20 22:33:17 +08:00
Zhang Xianyi
f69f89b846 Fixed #254. Added the date of changes in contributors file. 2013-07-20 11:35:27 +08:00
Zhang Xianyi
c77032b0cc create contributor file. 2013-07-19 08:38:03 +08:00
wangqian
1b3b9e841d Fixed a computational error in zgemm_kernel_4x4_sandy.S file. 2013-07-18 20:23:21 +08:00
Zhang Xianyi
b67252c2e4 Ensure the correct stack alignment on Win32. 2013-07-17 15:19:07 +08:00
Zhang Xianyi
c69e73b868 Fixed typo in generating shared library on x86_64. 2013-07-16 23:18:18 +08:00
Zhang Xianyi
b51e2ba1ee Modified Makefile to avoid redundant echo. 2013-07-16 22:44:27 +08:00
Zhang Xianyi
9c0a834f98 Modified Makefile.install 2013-07-16 17:45:00 +08:00
Zhang Xianyi
2a7503e563 Refs #225. Fixed a bug in GEMM OpenMP threading. 2013-07-15 09:56:19 +08:00
Zhang Xianyi
fd0c388681 Refs #191. A walk around for dtrtri_U single thread bug.
This function caused the failure of ERKALE serial test.
I replaced it with LAPACK source code.
2013-07-14 22:16:30 +08:00
Zhang Xianyi
61a9582987 Changed makefile for lapack. 2013-07-14 10:41:54 +08:00
Zhang Xianyi
b681064c6c Updated travis. 2013-07-12 21:41:12 +08:00
Zhang Xianyi
e80e285928 Update build matrix for Travis CI. 2013-07-11 23:49:29 +08:00
Zhang Xianyi
2ed0f6ab60 Fixed the typo. 2013-07-11 23:47:07 +08:00
Zhang Xianyi
5448643557 Fixed generating dll bug in last commit. 2013-07-11 22:24:50 +08:00
Zhang Xianyi
824c3c4df3 Fixed #251. Merge branch 'grisuthedragon-develop' into develop 2013-07-11 21:42:04 +08:00
grisuthedragon
c19a488af2 create openblas_get_parallel to retrieve information which
parallelization model is used by OpenBLAS.
2013-07-11 21:39:19 +08:00
Zhang Xianyi
32d2ca3035 Refs #214, #221, #246. Fixed the getrf overflow bug on Windows.
I used a smaller threshold since the stack size is 1MB on windows.
2013-07-11 03:20:02 +08:00
Zhang Xianyi
6df39ad9e7 Refs #248. Support LAPACK and LAPACKE with lsbcc.
For LAPACKE, use LAPACK_COMPLEX_STRUCTURE.
The reson is lsbcc didn't define complex I in complex.h.
2013-07-10 16:02:27 +08:00
Zhang Xianyi
3a96e4cbcb Merge pull request #249 from wernsaar/develop
replaced defined(DOUBLE) by !defined(XDOUBLE)
2013-07-10 01:01:03 -07:00
wernsaar
6f008abcef replaced defined(DOUBLE) by !defined(XDOUBLE) 2013-07-09 18:17:50 +02:00
Zhang Xianyi
3eb5af1955 Refs #247. Included lapack source codes. Avoid downloading tar.gz from netlib.org
Based on 3.4.2 version, apply patch.for_lapack-3.4.2.
2013-07-09 18:13:48 +08:00
Zhang Xianyi
fbb75e58b1 Fixed the typo in getarch.c 2013-07-09 16:26:59 +08:00
Zhang Xianyi
f54f5bac9e Refs #248. Fixed the LSB compatiable issue for BLAS only.
For example, make CC=lsbcc NO_LAPACK=1.
2013-07-09 15:38:03 +08:00
Zhang Xianyi
5d3312142a Refs #221 #246. Fixed the overflowing stack bug in mutlithreading BLAS3.
When NUM_THREADS(MAX_CPU_NUNBERS) is very large ,e.g. 256.

typedef struct {
  volatile BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
} job_t;

job_t          job[MAX_CPU_NUMBER];

The job array is equal 8MB.

Thus, We use malloc instead of stack allocation.
2013-07-08 01:07:05 +08:00
Zhang Xianyi
886cbaf4e4 Support AMD Piledriver by bulldozer kernels. 2013-07-06 12:06:43 -03:00
Zhang Xianyi
0c4074e10b Added Travis CI status image. 2013-07-05 15:28:41 +08:00
Zhang Xianyi
cc522aa21d Use quiet make for Travis CI. 2013-07-05 14:52:57 +08:00
Zhang Xianyi
9c78fad721 Install gfortran in Travis CI. 2013-07-05 11:11:18 +08:00
Zhang Xianyi
6028232ad1 Added travis.yml file. 2013-07-04 23:30:53 +08:00
Zhang Xianyi
feb9a3889a Improved make clean on Mac OS X. 2013-07-02 14:37:30 +08:00
Zhang Xianyi
32dbeb636d Refs #221. Set stack limit to 16MB to prevent a SEGFAULT bug on Mac OS X with DYNAMIC_ARCH=1 & NUM_THREADS=256. 2013-07-02 14:17:55 +08:00
Zhang Xianyi
57944538b6 Use ALIGN_5 instead of .algin 32 in assembly kernel. Added ALIGN_5 for 32-bit OSX. 2013-07-01 16:09:05 +08:00
Zhang Xianyi
3ce2c62b0b Merge pull request #242 from danluu/readme.haswell
Update README to reflect Haswell support, etc.
2013-06-30 09:40:32 -07:00
Dan Luu
50464997a3 Fix miscellaneous typos 2013-06-30 11:36:13 -05:00
Zhang Xianyi
8e7cad1650 Fixed #217 openblas_config.h bug on Windows 64. 2013-07-01 00:35:14 +08:00
Dan Luu
590e6aeafc Add Haswell support 2013-06-30 11:35:00 -05:00
Dan Luu
88ef307cef Refs #241. Add Haswell support (using sandybridge optimizations) 2013-06-30 22:35:14 +08:00
Zhang Xianyi
6e8501c8a1 Fixed #239 bug in param.h about BARCELONA and BULLDOZER. 2013-06-29 10:36:01 +08:00
Zhang Xianyi
fa916a0fac Fixed #238 bug in lsame on x86. 2013-06-28 22:43:41 +08:00
Zhang Xianyi
fb298b34ae Merge pull request #235 from wernsaar/develop
Added ddot, daxpy, dcopy kernels for AMD bulldozer.
2013-06-21 17:59:26 -07:00
wernsaar
16012767f4 added dcopy_bulldozer.S 2013-06-21 16:06:51 +02:00
wernsaar
bcbac31b47 added ddot_bulldozer.S 2013-06-20 16:15:09 +02:00
wernsaar
8dc0c72583 added daxpy_bulldozer.S 2013-06-20 14:07:54 +02:00
wernsaar
89405a1a0b cleanup of dgemm_ncopy_8_bulldozer.S 2013-06-19 19:31:38 +02:00
wernsaar
4f2b12b8a8 added dgemv_t_bulldozer.S 2013-06-19 17:32:42 +02:00
Zhang Xianyi
646e168d26 Merge pull request #233 from wernsaar/develop
added dgemv_n and some faster gemm_copy routines to BULLDOZER.
2013-06-18 20:02:36 -07:00
wernsaar
93dbbe1fb8 added dgemm_ncopy_8_bulldozer.S 2013-06-18 13:29:23 +02:00
wernsaar
a135f5d9ed added gemm_tcopy_2_bulldozer.S 2013-06-18 11:01:33 +02:00
wernsaar
d0b6299b13 added dgemm_tcopy_8_bulldozer.S 2013-06-17 14:19:09 +02:00
wernsaar
9e58dd509e added gemm_ncopy_2_bulldozer.S 2013-06-17 12:55:12 +02:00
wernsaar
7c8227101b cleanup of dgemv_n_bulldozer.S and optimization of inner loop 2013-06-16 12:50:45 +02:00
wernsaar
f67fa62851 added dgemv_n_bulldozer.S 2013-06-15 16:42:37 +02:00
Zhang Xianyi
cd1d473ba0 Merge pull request #230 from wernsaar/develop
Refs #230. New dgemm and sgemm Kernel for BULLDOZER
2013-06-13 07:29:27 -07:00
Zhang Xianyi
56f160134d Refs #231. Change the default C compiler to clang on Mac OSX. 2013-06-13 22:15:19 +08:00
wernsaar
0ded1fcc1c performance optimizations in sgemm_kernel_16x2_bulldozer.S 2013-06-13 11:35:15 +02:00
wernsaar
a789b588cd added cgemm_kernel_4x2_bulldozer.S 2013-06-12 15:55:27 +02:00
wernsaar
8eaa04acbb added zgemm_kernel_2x2_bulldozer.S 2013-06-11 12:00:49 +02:00
wernsaar
d854b30ae6 Added UNROLL values for 3M to getarch_2nd.c, Makefile.system and Makefile.L3 2013-06-09 17:26:42 +02:00
wernsaar
d65bbec99b added new sgemm kernel for BULLDOZER 2013-06-09 15:57:42 +02:00
wernsaar
e4c39c7c26 changed stack touching 2013-06-08 10:43:08 +02:00
wernsaar
ba800f0883 correct GEMM_THREAD in param.h 2013-06-08 10:03:59 +02:00
wernsaar
25491e42f9 New dgemm kernel for BULLDOZER: dgemm_kernel_8x2_bulldozer.S 2013-06-08 09:40:17 +02:00
Zhang Xianyi
960b0c88a7 Refs #227. Detected LLVM/Clang compiler. 2013-06-06 23:43:40 +08:00
Zhang Xianyi
65ffead0cf Refs #124. Check XSAVE flag on x86 CPU. 2013-06-06 22:50:43 +08:00
Zhang Xianyi
f2fb8c7035 Change LIBSUFFIX from .lib to .a on windows. 2013-06-04 16:05:28 +08:00
Zhang Xianyi
9f59f384d8 Refs #223. Fixed s/dgemv bug on windows. 2013-06-04 16:01:05 +08:00
wangqian
23965f164c Fixed overflow internal buffer bug of (s/d/c/z)gemv on x86_64. 2013-05-29 19:48:31 +08:00
wangqian
6a72840945 Fixed overflow internal buffer bug of (s/d/c/z)gemv on x86. 2013-05-29 13:23:12 +08:00
Zhang Xianyi
947457fb7c Fixed the bug about testing the exist of lapack tar package. 2013-05-24 15:52:35 +08:00
Zhang Xianyi
79120bf9a0 Refs #205. Merge boegel's codes about downloading LAPACK. 2013-05-24 15:29:10 +08:00
Zhang Xianyi
acb11905d5 Fixed #199. Saved USE_THREAD switch for make install. 2013-05-24 15:15:52 +08:00
Zhang Xianyi
109500178c Refs #220. Support Power7 by old Power6 kernels. 2013-05-21 22:59:45 +08:00
Zhang Xianyi
e50a664865 Refs #215. Fixed the compatible between <complex.h> and <complex> in C++. 2013-05-17 16:41:05 +08:00
Zhang Xianyi
357078b93e Refs #216. Revert the default value of GEMM_MULTITHREAD_THRESHOLD to 4. 2013-05-03 09:08:54 +08:00
wernsaar
731220f870 changed DGEMM_DEFAULT_P and DGEMM_DEFAULT_Q to 248 for BULLDOZER 64bit 2013-04-30 10:07:17 +02:00
wernsaar
69aa6c8fb1 bad performance with some data 2013-04-28 11:14:23 +02:00
wernsaar
60b263f3d2 removed trsm_kernel_RT_4x4_bulldozer.S. wrong results 2013-04-27 17:23:08 +02:00
wernsaar
7ac306e0da added trsm_kernel_RT_4x4_bulldozer.S 2013-04-27 16:48:48 +02:00
wernsaar
4cb454cdf2 added trsm_kernel_LT_4x4_bulldozer.S 2013-04-27 14:30:00 +02:00
wernsaar
19ad2fb128 prefetch improved. Defined 2 different kernels for inner loop 2013-04-27 13:40:49 +02:00
Zhang Xianyi
5d96e4f224 Refs #210. Disable checking /lib/libpthread.so*. 2013-04-27 15:02:04 +08:00
wernsaar
6821677489 minor improvements and code cleanup 2013-04-26 20:05:42 +02:00
Xianyi Zhang
dbbda55e67 Updated the mailing list for OpenBLAS. 2013-04-25 00:45:42 +08:00
Xianyi Zhang
6c34a7f43c Updated the mailing list for OpenBLAS. 2013-04-25 00:44:22 +08:00
Zhang Xianyi
3326f3152c Merge pull request #213 from wernsaar/develop
Merged some improvements into dgemm_kernel_4x4_bulldozer.S.
2013-04-17 23:56:09 -07:00
wernsaar
7641f6e253 Merged some improvements into dgemm_kernel_4x4_bulldozer.S.
Changed the copy functions to generic to solve prefetch conflicts
2013-04-16 19:05:06 +02:00
Zhang Xianyi
48bdc1ad3b Added NO_PARALLEL_MAKE flag to disable parallel make. 2013-04-15 21:37:30 +08:00
Zhang Xianyi
3ad29452d1 Merge pull request #211 from wernsaar/develop
New version of dgemm_kernel_4x4_bulldozer.S
2013-04-15 00:20:55 -07:00
wernsaar
6e3f6f25a5 New version of dgemm_kernel_4x4_bulldozer.S
The peak performance with 8 cores is now 90 GFlops
2013-04-12 17:55:51 +02:00
Zhang Xianyi
990efcab6e Merge branch 'loongson3b' into loongson3a 2013-04-11 16:11:03 +00:00
Zhang Xianyi
75a5dc3975 Added the configure for the host loongcc compiling on Loongson3. 2013-04-11 16:10:47 +00:00
Xianyi Zhang
986d542acb Merge branch 'loongson3a' into loongson3b 2013-04-11 16:07:59 +08:00
Xianyi Zhang
6958c1a1aa Fixed the SEGFAULT bug with Loongcc and Loongson3. 2013-04-11 15:33:43 +08:00
Zhang Xianyi
a068d54981 Refs #209. Export the missing cblas_cdotc_sub functions. 2013-04-08 23:21:28 +08:00
Xianyi Zhang
d692ee07f7 Merge branch 'loongson3a' into loongson3b 2013-04-08 14:56:39 +08:00
Xianyi Zhang
1a57717b1a Added the configuration of Loongcc compiler for Loongson 3 CPU. 2013-04-07 15:42:07 +08:00
Xianyi Zhang
6b01d58712 Disable the optimization of muli-threading gemm on the Loongson3A. 2013-03-30 20:12:43 +00:00
Xianyi Zhang
35b943f17f Merge branch 'develop' into loongson3a 2013-03-27 14:36:15 +00:00
Zhang Xianyi
e029242870 Merge pull request #206 from wlbksy/patch-1
Fix #204 wget in mingw/msys sometimes download file with trailing name,
2013-03-23 09:57:41 -07:00
wlbksy
7a9b94b519 Fix #204 2013-03-23 14:41:26 +08:00
Kenneth Hoste
66b919d99f adjusted Makefile to allow for provided required LAPACK source files rather than downloading them 2013-03-22 19:45:11 +01:00
Zhang Xianyi
f4846afbad Merge pull request #201 from Explorer09/develop 2013-03-18 07:31:30 -07:00
Explorer09
53588bc786 getarch.c: Minor re-ordering of architecture list 2013-03-17 23:09:23 +08:00
Explorer09
b47f13ee4c getarch.c: Minor re-ordering of architecture list 2013-03-17 23:07:48 +08:00
Explorer09
309f90e563 TargetList.txt: minor re-ordering 2013-03-17 23:03:05 +08:00
Explorer09
773c01f496 Typo correction in README.md 2013-03-17 22:48:24 +08:00
Zhang Xianyi
d831b2ff8b Override CFLAGS in LAPACK make.in. 2013-03-10 01:01:16 +08:00
Zhang Xianyi
724ae159ce Fixed the Windows x86_64 ABI bug in s/daxpy kernels. 2013-03-08 22:28:34 +08:00
Zhang Xianyi
2c9a203bd1 Merge pull request #198 from wernsaar/develop
new optimization of dgemm kernel for bulldozer: 10% performance increase
2013-03-06 13:39:53 -08:00
wernsaar
f300ce3df5 new optimization of dgemm kernel for bulldozer: 10% performance increase 2013-03-06 17:26:03 +01:00
Zhang Xianyi
e2c7c75715 Merge pull request #197 from wernsaar/develop
optimized again bulldozer dgemm kernel
2013-03-06 01:11:08 -08:00
wernsaar
66e64131ed optimized again bulldozer dgemm kernel 2013-03-05 19:51:37 +01:00
Zhang Xianyi
5900b1462e Merge pull request #195 from wernsaar/develop
Develop dgemm for bullozer
2013-03-05 05:35:42 -08:00
wernsaar
9405f26f4b new dgemm_kernel for bulldozer 2013-03-04 17:37:38 +01:00
Zhang Xianyi
54e7b37630 Merge branch 'develop' 2013-03-02 14:42:06 +08:00
Zhang Xianyi
529f1b5006 Refs#194. Export the missing LAPACK s/dlamc3 functions. 2013-03-02 14:41:18 +08:00
Zhang Xianyi
e5ac3007e0 Merge branch 'develop' 2013-03-02 14:24:23 +08:00
Zhang Xianyi
0d0405b434 Updated the doc for 0.2.6 version. 2013-03-02 14:22:27 +08:00
Zhang Xianyi
f1ce74ffdd Improved the print when OS don't support AVX. 2013-03-02 14:15:54 +08:00
Zhang Xianyi
d744c9590a In OpenMP threading, preallocate the thread buffer instead of allocating the buffer every time. This patch improved the performance slightly. 2013-03-01 14:36:47 +08:00
Zhang Xianyi
3cc6ae793e Refs #174. Return sb pointer when OpenMP or Windows. 2013-02-26 00:48:21 +08:00
Zhang Xianyi
4c2123c334 Fixed the overflowing bug in single thread cholesky factorization. 2013-02-23 13:00:52 +08:00
Zhang Xianyi
5155e3f509 Refs #174. Fixed the overflowing buffer bug of multithreading hbmv and sbmv.
Instead of using thread 0 buffer, each thread uses its own sb buffer.
Thus, it can avoid overflowing thread 0 buffer.
2013-02-13 16:05:58 +08:00
Zhang Xianyi
5c8bf6ae0e Merge branch 'bulldozer' into develop 2013-02-10 01:19:42 +08:00
Zaheer Chothia
a9500d0079 Missing line continuation -- follow-up to last commit (64ad8b9809). 2013-02-01 09:34:12 +01:00
Zaheer Chothia
64ad8b9809 Refs #193. Don't use C99 complex numbers when building C++ code. 2013-02-01 09:24:44 +01:00
Zaheer Chothia
875d520ccf Refs #193. cblas: move #include out of extern "C" block.
Standard headers may contain C++ templates which are not permitted inside an
extern "C" block. This might be the case when we include <complex.h>.
2013-01-31 08:48:27 +01:00
Zhang Xianyi
d311236dfd Refs #189. Fixed the bug of s/cdot about invalid reading NAN on x86_64. 2013-01-25 20:56:14 +08:00
Zhang Xianyi
36e0982966 Refs #187. Use perl to generate cblas_noconst.h instead of sed.
Thank Dan Povey's patch. https://github.com/xianyi/OpenBLAS/issues/187
2013-01-22 00:29:54 +08:00
Zhang Xianyi
8cdb795438 Refs #187. Use binary code for xgetbv, which is compatible with old compiler. 2013-01-22 00:25:08 +08:00
Zaheer Chothia
4db6660de4 Refs #185. Add missing 'const' to declarations in <cblas.h>. Thanks to Dan Povey!
The 'const' modifications were done automatically using this scripts:
https://kaldi.svn.sourceforge.net/svnroot/kaldi/sandbox/dan/tools/for_openblas
2013-01-20 22:52:51 +01:00
Zhang Xianyi
0b08f7479e Refs #154. Fixed gemv_t bug about overflow 16MB buffer on x86. 2013-01-20 21:22:12 +08:00
Zaheer Chothia
200e4acf15 cblas: typedef enums for improved compatibility with Intel MKL.
Netlib style:
    enum CBLAS_XYZ {X=1, Y=2, Z=3};

Intel MKL style:
    typedef enum {X=1, Y=2, Z=3} CBLAS_XYZ;

With this hybrid style, code written in the latter form won't need any
modifications to be built with OpenBLAS.  This change should not affect existing
code, although a warning may be emitted for C code which does the following
(does not occur with C++):
    typedef enum CBLAS_XYZ CBLAS_XYZ;
    warning: redefinition of typedef 'CBLAS_XYZ' [-pedantic]
2013-01-19 22:57:13 +01:00
Zhang Xianyi
99d1978df7 Fixed #180. the typos in kernel/x86_64/sgemv_t.S 2013-01-12 12:31:14 +08:00
Zhang Xianyi
08bf6674d5 Refs #177. Fixed sgemv_t compiling bug on Win64. 2013-01-05 11:36:39 +08:00
Zhang Xianyi
8b122ff9dc Refs #176. Fixed make.inc overriding RANLIB bug when cross-compiling LAPACK. 2013-01-03 01:47:31 +08:00
Zhang Xianyi
69200884e1 Refs #173. Fixed overflow internal buffer bug of gemv_n on x86 2012-12-25 09:27:49 +08:00
Zhang Xianyi
0d1518add9 Refs #173. Fixed overflow internal buffer bug of sgemv_t on x86 2012-12-25 09:10:17 +08:00
Zhang Xianyi
91ed4e4450 Refs #171. Prevent loading the dirty number from the buffer in sgemv_t x86 kernel. 2012-12-23 23:14:17 +08:00
Zhang Xianyi
fd3046b32a Refs #173. Fixed overflow internal buffer bug of gemv_t on x86. 2012-12-23 21:47:22 +08:00
Zhang Xianyi
a4ee6f3915 Fixed #172. Support Intel Xeon E7540. 2012-12-18 08:57:46 +08:00
Zhang Xianyi
a0363e9b48 Merge branch 'master' into develop 2012-12-18 08:51:30 +08:00
Zhang Xianyi
b471d52e61 Merge pull request #170 from juliantaylor/athlon-defaults
set parameters for CORE_ATHLON
2012-12-15 15:50:02 -08:00
Julian Taylor
9fb341a9f8 set parameters for CORE_ATHLON
else dgemm_p is set to zero leading to a segfault in alloc_mmap due to
allocsize being zero
2012-12-15 16:05:33 +01:00
Zhang Xianyi
fba6b590f2 Merge branch 'master' into develop 2012-12-15 22:49:37 +08:00
Zhang Xianyi
97f68f7f3a Merge pull request #169 from juliantaylor/sanity-check-cpu
add a sanity check on the detected cpu type
2012-12-15 06:46:48 -08:00
Julian Taylor
1138817dd2 add a sanity check on the detected cpu type
if we have 64 bit pointers we can't have a 32 bit cpu, so fall back to
the 64bit cpu fallback (prescott)
E.g. the cpu detection fails in amd qemu64 emulation (family 6 model 2)
causing it to use the uninitialized gotoblas_ATHLON
2012-12-15 13:29:46 +01:00
Zhang Xianyi
13f8fc0b1a Write FMA4 flag to the configure file. 2012-12-11 10:55:10 +01:00
Zhang Xianyi
bdf8d9411e Refs #163. Obtain the build configure on runtime.
openblas_get_config function returns the configure string.
So far, it supports USE64BITINT, NO_CBLAS, NO_LAPACK, NO_LAPACKE,
DYNAMIC_ARCH, NO_AFFINITY.

Example:
 #include <stdio.h>
extern char * openblas_get_config();
void main()
{
  printf("%s\n",openblas_get_config());
  return;
}
2012-12-10 15:52:51 +08:00
Zhang Xianyi
bb10cb8442 Refs #165. fall back of DTB_DEFAULT_ENTRIES for some virtual machines. 2012-12-10 11:51:39 +08:00
Zhang Xianyi
f19af5ecc0 Refs #54. Added AMD Bulldozer x86_64 dgemm kernel developed by Werner Saar <wernsaar at googlemail.com>
Based on the dgemm kernel for AMD Barcelona, he used AVX and FMA4 instructions.
Thank Werner Saar!
2012-12-07 01:05:11 +08:00
Zhang Xianyi
bfaaa975e6 Added BULLDOZER target. So far it uses barcelona kernels. 2012-12-07 00:53:31 +08:00
Zhang Xianyi
b7c0fa6bd2 Init AMD Bulldozer codebase. 2012-12-06 07:29:54 -05:00
Zhang Xianyi
7110d17146 Added -lgomp for generating DLL on Windows. 2012-11-28 12:52:28 +08:00
Zhang Xianyi
e01b3d4b54 Merge branch 'develop' 2012-11-27 07:24:53 +08:00
Zhang Xianyi
cea1a885b5 Refs #154. Fixed the build bug of dgemv_t on MinW64. 2012-11-27 07:24:04 +08:00
Zhang Xianyi
f78eb335d6 Merge branch 'develop' 2012-11-26 17:32:56 +08:00
Zhang Xianyi
2345bdec68 Update the doc for 0.2.5 version. 2012-11-26 17:32:25 +08:00
Zhang Xianyi
5f0117385e Refs #154. Fixed a SEGFAULT bug of dgemv_t when m is very large.
It overflowed the internal buffer. Thus, we split vector x into blocks when m is very large.

Thank @wangqian for this patch.
2012-11-19 22:32:27 +08:00
Zhang Xianyi
6caf1bab73 Fixed #160. Merge branch 'master' of https://github.com/sebastien-villemot/OpenBLAS into develop 2012-11-15 18:21:04 +08:00
Sébastien Villemot
01e3c984ce Fix compilation with TARGET=GENERIC
Patch applied to Debian package
2012-11-14 21:04:05 +01:00
Zhang Xianyi
6751f7b9a7 Fixed #157. Only detect the number of physical CPU cores on Mac OSX. 2012-11-13 15:48:57 +08:00
Zhang Xianyi
d5717a97ea Compile lapacke with ILP64 modle when INTERFACE64=1 2012-11-13 00:54:20 +08:00
Zhang Xianyi
b45d43d295 Added the patch for lapacke example. 2012-11-13 00:53:26 +08:00
Zhang Xianyi
dcfb69c2b5 Merge branch 'master' of https://github.com/alnsn/OpenBLAS into develop 2012-11-12 11:17:04 +08:00
Alexander Nasonov
e85549ee11 Fix NetBSD build. 2012-11-10 23:20:44 +00:00
Zhang Xianyi
789f205177 Improved Makefile.rule for cross compiler. 2012-11-09 00:14:20 +08:00
Zhang Xianyi
378acfe826 Added NO_SHARED flag to disable generating the shared library. 2012-11-09 00:14:15 +08:00
Zhang Xianyi
538c764d2b Refs #153. Restore the original CPU affinity when calling openblas_set_num_threads(1).
Please read the issue on github.com for the detail.
2012-11-06 18:21:46 +08:00
Zaheer Chothia
0f26a21624 Alternative approach to avoid command-line length while archiving lapacke -- Thanks Michel! 2012-10-15 22:26:18 +02:00
Zaheer Chothia
5c1efa1149 Fix installation step on Windows (regression from e8306f623a)
Since the DLL now has a fixed name there is no need to install a versioned alias too.
2012-10-15 22:13:37 +02:00
Zaheer Chothia
ca4136cf41 Fixed #147: LAPACK symbols were not being exported for version 3.4.2 2012-10-12 23:44:23 +02:00
Zhang Xianyi
3a26470fb7 Merge branch 'develop' 2012-10-09 20:08:28 +08:00
Zhang Xianyi
6c5899dff5 Don't use xgetbv instruction when NO_AVX=1 2012-10-09 14:52:35 +08:00
Zhang Xianyi
2df2878dfc Merge branch 'develop' 2012-10-08 13:38:03 +08:00
Zhang Xianyi
0b719945c5 Updated the doc for 0.2.4 version. 2012-10-08 13:37:44 +08:00
Zhang Xianyi
b1a54a0107 Fixed #141. make f77blas.h compatible with compilers which lack C99 complex number.
Apply the patch from Tony @tonyhill. Thank you.
2012-10-08 12:48:20 +08:00
Zhang Xianyi
08c177ca36 Refs #145. Update LAPACK to 3.4.2 version. 2012-09-29 23:14:39 +08:00
Zhang Xianyi
2573311308 refs #140. Fixed zdot incompatibility ABI issue with GCC 4.7 on Win 32.
GCC 4.7 uses MSVC ABI on Win 32. This means the caller pops the hidden pointer for returning
aggregate structures larger than 8 bytes.
2012-09-24 20:34:33 +08:00
Zhang Xianyi
1d72b8bf1b Fixed generating shared library bug on MIPS. 2012-09-21 11:49:07 +00:00
Zhang Xianyi
758e34efbb Fixed the detection bug on Loongson 3A server. 2012-09-21 10:14:07 +00:00
Zhang Xianyi
735ca38b8f Refs #139. Check OS supporting AVX on runtime. 2012-09-18 15:46:20 +08:00
Zhang Xianyi
f76a384841 Refs #139. Added NO_AVX flag to use old Nehalem kernels on Sandy Bridge.
For example, make NO_AVX=1 or make DYNAMIC_ARCH=1 NO_AVX=1
2012-09-17 23:25:46 +08:00
Zhang Xianyi
9419a43a7f Fixed #142. Added the gesvd and potrs function families to common_interface.h. 2012-09-14 15:15:08 +08:00
Zhang Xianyi
b695680a33 Fixed #143. Don't generate cblas.h with NO_CBLAS. 2012-09-14 14:06:14 +08:00
Jameson Nash
d0e731e8b8 provide support for passing CFLAGS, FFLAGS, PFLAGS, FPFLAGS to make on the command line 2012-08-21 00:31:12 -04:00
Zhang Xianyi
48f075cfd5 Merge branch 'develop' 2012-08-20 16:52:35 +08:00
Zhang Xianyi
3e87648de3 Updated the doc for 0.2.3 version. 2012-08-20 16:51:47 +08:00
Zhang Xianyi
fe4ab95cd5 Refs #136. Fixed a bug about controlling the number of threads on Windows. 2012-08-19 23:50:54 +08:00
Xianyi Zhang
801383effe Fixed a hang bug when shutdown blas threads server on Windows. Added the feature about dynamic changing the number of threads on Windows. 2012-08-14 18:34:32 +08:00
Zhang Xianyi
54cd65e47f Use sandy bridge kernel when DYNAMIC_ARCH=1. 2012-08-13 15:25:08 +08:00
Zhang Xianyi
a55821a2ec Refs #132. Kill the threads when unload the library. 2012-08-11 21:33:15 +08:00
Zhang Xianyi
068861a927 Refs #133. Users can set COMMON_OPT flag to control CFLAGS and FFLAGS. 2012-08-10 14:36:26 +08:00
Zhang Xianyi
d007cca61d Refs #134. Fixed the building bug on IBM Power. 2012-08-10 11:54:21 +08:00
Zhang Xianyi
a92895939e Added the tip for Windows. 2012-08-09 20:37:55 +08:00
Zhang Xianyi
7bd1834d59 Refs #130 Fixed laswp building bug with DYNAMIC_ARCH=1. 2012-08-09 20:36:29 +08:00
Zhang Xianyi
1b056c5328 Refs #130 Prevent reading ipiv array beyond the bound in ?laswp. Use laswp instead of laswp_oncopy in getrf. 2012-08-09 20:06:51 +08:00
Zaheer Chothia
e8306f623a Refs #127. Generate DLL without a version suffix on Windows. 2012-07-30 19:46:30 +02:00
Xianyi Zhang
3108a1853d Added the doc for the conflict with R parallel. 2012-07-13 14:19:30 +08:00
Xianyi Zhang
25f1a573fd Fixed the build bug when DYNAMIC_ARCH=0. 2012-07-07 12:12:24 +08:00
5484 changed files with 1476424 additions and 1800 deletions

6
.gitignore vendored
View File

@@ -4,10 +4,16 @@
*.dylib
*.def
*.o
*.out
lapack-3.1.1
lapack-3.1.1.tgz
lapack-3.4.1
lapack-3.4.1.tgz
lapack-3.4.2
lapack-3.4.2.tgz
lapack-netlib/make.inc
lapack-netlib/lapacke/include/lapacke_mangling.h
lapack-netlib/TESTING/testing_results.txt
*.so
*.a
.svn

24
.travis.yml Normal file
View File

@@ -0,0 +1,24 @@
language: c
compiler:
- gcc
env:
- TARGET_BOX=LINUX64 BTYPE="BINARY=64"
- TARGET_BOX=LINUX64 BTYPE="BINARY=64 USE_OPENMP=1"
- TARGET_BOX=LINUX64 BTYPE="BINARY=64 INTERFACE64=1"
- TARGET_BOX=LINUX32 BTYPE="BINARY=32"
- TARGET_BOX=WIN64 BTYPE="BINARY=64 HOSTCC=gcc CC=x86_64-w64-mingw32-gcc FC=x86_64-w64-mingw32-gfortran"
before_install:
- sudo apt-get update -qq
- sudo apt-get install -qq gfortran
- if [[ "$TARGET_BOX" == "WIN64" ]]; then sudo apt-get install -qq binutils-mingw-w64-x86-64 gcc-mingw-w64-x86-64 gfortran-mingw-w64-x86-64; fi
- if [[ "$TARGET_BOX" == "LINUX32" ]]; then sudo apt-get install -qq gcc-multilib gfortran-multilib; fi
script: make QUIET_MAKE=1 DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32 $BTYPE
# whitelist
branches:
only:
- master
- develop

87
CONTRIBUTORS.md Normal file
View File

@@ -0,0 +1,87 @@
# Contributions to the OpenBLAS project
## Creator & Maintainer
* Zhang Xianyi <traits.zhang@gmail.com>
## Active Developers
* Wang Qian <traz0824@gmail.com>
* Optimize BLAS3 on ICT Loongson 3A.
* Optimize BLAS3 on Intel Sandy Bridge.
* Zaheer Chothia <zaheer.chothia@gmail.com>
* Improve the compatibility about complex number
* Build LAPACKE: C interface to LAPACK
* Improve the windows build.
## Previous Developers
* Chen Shaohu <huhumartinwar@gmail.com>
* Optimize GEMV on the Loongson 3A processor.
* Luo Wen
* Intern. Test Level-2 BLAS.
## Contributors
In chronological order:
* pipping <http://page.mi.fu-berlin.de/pipping>
* [2011-06-11] Make USE_OPENMP=0 disable openmp.
* Stefan Karpinski <stefan@karpinski.org>
* [2011-12-28] Fix a bug about SystemStubs on Mac OS X.
* Alexander Eberspächer <https://github.com/aeberspaecher>
* [2012-05-02] Add note on patch for segfaults on Linux kernel 2.6.32.
* Mike Nolta <mike@nolta.net>
* [2012-05-19] Fix building bug on FreeBSD and NetBSD.
* Sylvestre Ledru <https://github.com/sylvestre>
* [2012-07-01] Improve the detection of sparc. Fix building bug under
Hurd and kfreebsd.
* Jameson Nash <https://github.com/vtjnash>
* [2012-08-20] Provide support for passing CFLAGS, FFLAGS, PFLAGS, FPFLAGS to
make on the command line.
* Alexander Nasonov <alnsn@yandex.ru>
* [2012-11-10] Fix NetBSD build.
* Sébastien Villemot <sebastien@debian.org>
* [2012-11-14] Fix compilation with TARGET=GENERIC. Patch applied to Debian package.
* Werner Saar <wernsaar@googlemail.com>
* [2013-03-04] Optimize AVX and FMA4 DGEMM on AMD Bulldozer
* [2013-04-27] Optimize AVX and FMA4 TRSM on AMD Bulldozer
* [2013-06-09] Optimize AVX and FMA4 SGEMM on AMD Bulldozer
* [2013-06-11] Optimize AVX and FMA4 ZGEMM on AMD Bulldozer
* [2013-06-12] Optimize AVX and FMA4 CGEMM on AMD Bulldozer
* [2013-06-16] Optimize dgemv_n kernel on AMD Bulldozer
* [2013-06-20] Optimize ddot, daxpy kernel on AMD Bulldozer
* [2013-06-21] Optimize dcopy kernel on AMD Bulldozer
* Kang-Che Sung <Explorer09@gmail.com>
* [2013-05-17] Fix typo in the document. Re-order the architecture list in getarch.c.
* Kenneth Hoste <kenneth.hoste@gmail.com>
* [2013-05-22] Adjust Makefile about downloading LAPACK source files.
* Lei WANG <https://github.com/wlbksy>
* [2013-05-22] Fix a bug about wget.
* Dan Luu <http://www.linkedin.com/in/danluu>
* [2013-06-30] Add Intel Haswell support (using sandybridge optimizations).
* grisuthedragon <https://github.com/grisuthedragon>
* [2013-07-11] create openblas_get_parallel to retrieve information which parallelization
model is used by OpenBLAS.
* Sébastien Fabbro <bicatali@gentoo.org>
* [2013-07-24] Modify makefile to respect user's LDFLAGS
* [2013-07-24] Add stack markings for GNU as arch-independent for assembler files
* [Your name or handle] <[email or website]>
* [Date] [Brief summary of your changes]

View File

@@ -1,4 +1,111 @@
OpenBLAS ChangeLog
====================================================================
Version 0.2.8
01-Aug-2013
common:
* Support Open64 5.0. (#266)
* Add executable stack markings. (#262, Thank Sébastien Fabbro)
* Respect user's LDFLAGS (Thank Sébastien Fabbro)
x86/x86-64:
* Rollback bulldozer and piledriver kernels to barcelona kernels (#263)
We will fix the compuational error bug in bulldozer and piledriver kernels.
====================================================================
Version 0.2.7
20-Jul-2013
common:
* Support LSB (Linux Standard Base) 4.1.
e.g. make CC=lsbcc
* Include LAPACK 3.4.2 source codes to the repo.
Avoid downloading at compile time.
* Add NO_PARALLEL_MAKE flag to disable parallel make.
* Create openblas_get_parallel to retrieve information which
parallelization model is used by OpenBLAS. (Thank grisuthedragon)
* Detect LLVM/Clang compiler. The default compiler is Clang on Mac OS X.
* Change LIBSUFFIX from .lib to .a on windows.
* A walk round for dtrti_U single thread bug. Replace it with LAPACK codes. (#191)
x86/x86-64:
* Optimize c/zgemm, trsm, dgemv_n, ddot, daxpy, dcopy on
AMD Bulldozer. (Thank Werner Saar)
* Add Intel Haswell support (using Sandybridge optimizations).
(Thank Dan Luu)
* Add AMD Piledriver support (using Bulldozer optimizations).
* Fix the computational error in zgemm avx kernel on
Sandybridge. (#237)
* Fix the overflow bug in gemv.
* Fix the overflow bug in multi-threaded BLAS3, getrf when NUM_THREADS
is very large.(#214, #221, #246).
MIPS64:
* Support loongcc (Open64 based) compiler for ICT Loongson 3A/B.
Power:
* Support Power7 by old Power6 kernels. (#220)
====================================================================
Version 0.2.6
2-Mar-2013
common:
* Improved OpenMP performance slightly. (d744c9)
* Improved cblas.h compatibility with Intel MKL.(#185)
* Fixed the overflowing bug in single thread cholesky factorization.
* Fixed the overflowing buffer bug of multithreading hbmv and sbmv.(#174)
x86/x86-64:
* Added AMD Bulldozer x86-64 S/DGEMM AVX kernels. (Thank Werner Saar)
We will tune the performance in future.
* Auto-detect Intel Xeon E7540.
* Fixed the overflowing buffer bug of gemv. (#173)
* Fixed the bug of s/cdot about invalid reading NAN on x86_64. (#189)
MIPS64:
====================================================================
Version 0.2.5
26-Nov-2012
common:
* Added NO_SHARED flag to disable generating the shared library.
* Compile LAPACKE with ILP64 modle when INTERFACE64=1 (#158)
* Export LAPACK 3.4.2 symbols in shared library. (#147)
* Only detect the number of physical CPU cores on Mac OSX. (#157)
* Fixed NetBSD build. (#155)
* Fixed compilation with TARGET=GENERIC. (#160)
x86/x86-64:
* Restore the original CPU affinity when calling
openblas_set_num_threads(1) (#153)
* Fixed a SEGFAULT bug in dgemv_t when m is very large.(#154)
MIPS64:
====================================================================
Version 0.2.4
8-Oct-2012
common:
* Upgraded LAPACK to 3.4.2 version. (#145)
* Provided support for passing CFLAGS, FFLAGS, PFLAGS,
FPFLAGS to make. (#137)
* f77blas.h:compatibility for compilers without C99 complex
number support. (#141)
x86/x86-64:
* Added NO_AVX flag. Check OS supporting AVX on runtime. (#139)
* Fixed zdot incompatibility ABI issue with GCC 4.7 on
Windows 32-bit. (#140)
MIPS64:
* Fixed the generation of shared library bug.
* Fixed the detection bug on the Loongson 3A server.
====================================================================
Version 0.2.3
20-Aug-2012
common:
* Fixed LAPACK unstable bug about ?laswp. (#130)
* Fixed the shared library bug about unloading the library on
Linux (#132).
* Fixed the compilation failure on BlueGene/P (TARGET=PPC440FP2)
Please use gcc and IBM xlf. (#134)
x86/x86-64:
* Supported goto_set_num_threads and openblas_set_num_threads
APIs in Windows. They can set the number of threads on runtime.
====================================================================
Version 0.2.2
6-July-2012

115
Makefile
View File

@@ -3,7 +3,7 @@ include ./Makefile.system
BLASDIRS = interface driver/level2 driver/level3 driver/others
ifndef DYNAMIC_ARCH
ifneq ($(DYNAMIC_ARCH), 1)
BLASDIRS += kernel
endif
@@ -80,30 +80,30 @@ endif
@echo
shared :
ifndef NO_SHARED
ifeq ($(OSNAME), Linux)
$(MAKE) -C exports so
-ln -fs $(LIBSONAME) $(LIBPREFIX).so
-ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
@$(MAKE) -C exports so
@-ln -fs $(LIBSONAME) $(LIBPREFIX).so
@-ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
endif
ifeq ($(OSNAME), FreeBSD)
$(MAKE) -C exports so
-ln -fs $(LIBSONAME) $(LIBPREFIX).so
@$(MAKE) -C exports so
@-ln -fs $(LIBSONAME) $(LIBPREFIX).so
endif
ifeq ($(OSNAME), NetBSD)
$(MAKE) -C exports so
-ln -fs $(LIBSONAME) $(LIBPREFIX).so
@$(MAKE) -C exports so
@-ln -fs $(LIBSONAME) $(LIBPREFIX).so
endif
ifeq ($(OSNAME), Darwin)
$(MAKE) -C exports dyn
-ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib
@$(MAKE) -C exports dyn
@-ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib
endif
ifeq ($(OSNAME), WINNT)
$(MAKE) -C exports dll
-ln -fs $(LIBDLLNAME) $(LIBPREFIX).dll
@$(MAKE) -C exports dll
endif
ifeq ($(OSNAME), CYGWIN_NT)
$(MAKE) -C exports dll
-ln -fs $(LIBDLLNAME) $(LIBPREFIX).dll
@$(MAKE) -C exports dll
endif
endif
tests :
@@ -131,30 +131,33 @@ endif
ifeq ($(NOFORTRAN), 1)
$(error OpenBLAS: Detecting fortran compiler failed. Please install fortran compiler, e.g. gfortran, ifort, openf90.)
endif
-ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
for d in $(SUBDIRS) ; \
@-ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
@for d in $(SUBDIRS) ; \
do if test -d $$d; then \
$(MAKE) -C $$d $(@F) || exit 1 ; \
fi; \
done
#Save the config files for installation
cp Makefile.conf Makefile.conf_last
cp config.h config_last.h
@cp Makefile.conf Makefile.conf_last
@cp config.h config_last.h
ifdef QUAD_PRECISION
echo "#define QUAD_PRECISION">> config_last.h
@echo "#define QUAD_PRECISION">> config_last.h
endif
ifeq ($(EXPRECISION), 1)
echo "#define EXPRECISION">> config_last.h
@echo "#define EXPRECISION">> config_last.h
endif
##
ifdef DYNAMIC_ARCH
$(MAKE) -C kernel commonlibs || exit 1
for d in $(DYNAMIC_CORE) ; \
ifeq ($(DYNAMIC_ARCH), 1)
@$(MAKE) -C kernel commonlibs || exit 1
@for d in $(DYNAMIC_CORE) ; \
do $(MAKE) GOTOBLAS_MAKEFILE= -C kernel TARGET_CORE=$$d kernel || exit 1 ;\
done
echo DYNAMIC_ARCH=1 >> Makefile.conf_last
@echo DYNAMIC_ARCH=1 >> Makefile.conf_last
endif
touch lib.grd
ifdef USE_THREAD
@echo USE_THREAD=$(USE_THREAD) >> Makefile.conf_last
endif
@touch lib.grd
prof : prof_blas prof_lapack
@@ -165,7 +168,7 @@ prof_blas :
$(MAKE) -C $$d prof || exit 1 ; \
fi; \
done
ifdef DYNAMIC_ARCH
ifeq ($(DYNAMIC_ARCH), 1)
$(MAKE) -C kernel commonprof || exit 1
endif
@@ -184,7 +187,7 @@ hpl :
$(MAKE) -C $$d $(@F) || exit 1 ; \
fi; \
done
ifdef DYNAMIC_ARCH
ifeq ($(DYNAMIC_ARCH), 1)
$(MAKE) -C kernel commonlibs || exit 1
for d in $(DYNAMIC_CORE) ; \
do $(MAKE) GOTOBLAS_MAKEFILE= -C kernel TARGET_CORE=$$d kernel || exit 1 ;\
@@ -203,19 +206,19 @@ ifeq ($(NO_LAPACK), 1)
netlib :
else
netlib : lapack-3.4.1 patch.for_lapack-3.4.1 $(NETLIB_LAPACK_DIR)/make.inc
netlib : lapack_prebuild
ifndef NOFORTRAN
-@$(MAKE) -C $(NETLIB_LAPACK_DIR) lapacklib
@$(MAKE) -C $(NETLIB_LAPACK_DIR) lapacklib
endif
ifndef NO_LAPACKE
-@$(MAKE) -C $(NETLIB_LAPACK_DIR) lapackelib
@$(MAKE) -C $(NETLIB_LAPACK_DIR) lapackelib
endif
endif
prof_lapack : lapack-3.4.1 $(NETLIB_LAPACK_DIR)/make.inc
-@$(MAKE) -C $(NETLIB_LAPACK_DIR) lapack_prof
prof_lapack : lapack_prebuild
@$(MAKE) -C $(NETLIB_LAPACK_DIR) lapack_prof
$(NETLIB_LAPACK_DIR)/make.inc :
lapack_prebuild :
ifndef NOFORTRAN
-@echo "FORTRAN = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc
-@echo "OPTS = $(FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
@@ -224,7 +227,7 @@ ifndef NOFORTRAN
-@echo "PNOOPT = $(FPFLAGS) -O0" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "LOADOPTS = $(FFLAGS) $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "CC = $(CC)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "CFLAGS = $(CFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "override CFLAGS = $(LAPACK_CFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "ARCH = $(AR)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "ARCHFLAGS = -ru" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "RANLIB = $(RANLIB)" >> $(NETLIB_LAPACK_DIR)/make.inc
@@ -233,52 +236,56 @@ ifndef NOFORTRAN
-@echo "LAPACKLIB_P = ../$(LIBNAME_P)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "SUFFIX = $(SUFFIX)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "PSUFFIX = $(PSUFFIX)" >> $(NETLIB_LAPACK_DIR)/make.inc
# -@echo "CEXTRALIB = $(CEXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "CEXTRALIB = $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@cat make.inc >> $(NETLIB_LAPACK_DIR)/make.inc
endif
lapack-3.4.1 : lapack-3.4.1.tgz
lapack-3.4.2 : lapack-3.4.2.tgz
ifndef NOFORTRAN
ifndef NO_LAPACK
@if test `$(MD5SUM) lapack-3.4.1.tgz | $(AWK) '{print $$1}'` = 44c3869c38c8335c2b9c2a8bb276eb55; then \
@if test `$(MD5SUM) $< | $(AWK) '{print $$1}'` = 61bf1a8a4469d4bdb7604f5897179478; then \
echo $(TAR) zxf $< ;\
$(TAR) zxf $< && (cd $(NETLIB_LAPACK_DIR); $(PATCH) -p1 < ../patch.for_lapack-3.4.1) ;\
$(TAR) zxf $< && (cd $(NETLIB_LAPACK_DIR); $(PATCH) -p1 < ../patch.for_lapack-3.4.2) ;\
rm -f $(NETLIB_LAPACK_DIR)/lapacke/make.inc ;\
else \
rm -rf $(NETLIB_LAPACK_DIR) ;\
echo " Cannot download lapack-3.4.1.tgz or the MD5 check sum is wrong (Please use orignal)."; \
echo " Cannot download lapack-3.4.2.tgz or the MD5 check sum is wrong (Please use orignal)."; \
exit 1; \
fi
endif
endif
LAPACK_URL=http://www.netlib.org/lapack/lapack-3.4.1.tgz
LAPACK_URL=http://www.netlib.org/lapack/lapack-3.4.2.tgz
lapack-3.4.1.tgz :
lapack-3.4.2.tgz :
ifndef NOFORTRAN
#http://stackoverflow.com/questions/7656425/makefile-ifeq-logical-or
ifeq ($(OSNAME), $(filter $(OSNAME),Darwin NetBSD))
curl -O $(LAPACK_URL)
curl -O $(LAPACK_URL);
else
ifeq ($(OSNAME), FreeBSD)
fetch $(LAPACK_URL)
fetch $(LAPACK_URL);
else
wget $(LAPACK_URL)
wget -O $@ $(LAPACK_URL);
endif
endif
endif
large.tgz :
ifndef NOFORTRAN
-wget http://www.netlib.org/lapack/timing/large.tgz
if [ ! -a $< ]; then
-wget http://www.netlib.org/lapack/timing/large.tgz;
fi
endif
timing.tgz :
ifndef NOFORTRAN
-wget http://www.netlib.org/lapack/timing/timing.tgz
if [ ! -a $< ]; then
-wget http://www.netlib.org/lapack/timing/timing.tgz;
fi
endif
lapack-timing : lapack-3.4.1 large.tgz timing.tgz
lapack-timing : large.tgz timing.tgz
ifndef NOFORTRAN
(cd $(NETLIB_LAPACK_DIR); $(TAR) zxf ../timing.tgz TIMING)
(cd $(NETLIB_LAPACK_DIR)/TIMING; $(TAR) zxf ../../large.tgz )
@@ -310,10 +317,12 @@ clean ::
#endif
@$(MAKE) -C reference clean
@rm -f *.$(LIBSUFFIX) *.so *~ *.exe getarch getarch_2nd *.dll *.lib *.$(SUFFIX) *.dwf $(LIBPREFIX).$(LIBSUFFIX) $(LIBPREFIX)_p.$(LIBSUFFIX) $(LIBPREFIX).so.$(MAJOR_VERSION) *.lnk myconfig.h
@rm -f Makefile.conf config.h Makefile_kernel.conf config_kernel.h st* *.dylib
@if test -d $(NETLIB_LAPACK_DIR); then \
echo deleting $(NETLIB_LAPACK_DIR); \
rm -rf $(NETLIB_LAPACK_DIR) ;\
fi
ifeq ($(OSNAME), Darwin)
@rm -rf getarch.dSYM getarch_2nd.dSYM
endif
@rm -f Makefile.conf config.h cblas_noconst.h Makefile_kernel.conf config_kernel.h st* *.dylib
@touch $(NETLIB_LAPACK_DIR)/make.inc
@$(MAKE) -C $(NETLIB_LAPACK_DIR) clean
@rm -f $(NETLIB_LAPACK_DIR)/make.inc $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_mangling.h
@rm -f *.grd Makefile.conf_last config_last.h
@echo Done.
@echo Done.

View File

@@ -1,6 +1 @@
COPT = -Wall -O2 # -DGEMMTEST
ifdef BINARY64
else
# LDFLAGS = -m elf32ppc
LDFLAGS = -m elf_i386
endif

View File

@@ -5,6 +5,7 @@ include ./Makefile.system
OPENBLAS_INCLUDE_DIR:=$(PREFIX)/include
OPENBLAS_LIBRARY_DIR:=$(PREFIX)/lib
OPENBLAS_BUILD_DIR:=$(CURDIR)
.PHONY : install
.NOTPARALLEL : install
@@ -32,8 +33,10 @@ install : lib.grd
@cat common_interface.h >> $(OPENBLAS_INCLUDE_DIR)/f77blas.h
@echo \#endif >> $(OPENBLAS_INCLUDE_DIR)/f77blas.h
ifndef NO_CBLAS
@echo Generating cblas.h in $(OPENBLAS_INCLUDE_DIR)
@sed 's/common/openblas_config/g' cblas.h > $(OPENBLAS_INCLUDE_DIR)/cblas.h
endif
ifndef NO_LAPACKE
@echo Copying LAPACKE header files to $(OPENBLAS_LIBRARY_DIR)
@@ -46,34 +49,36 @@ endif
#for install static library
@echo Copy the static library to $(OPENBLAS_LIBRARY_DIR)
@cp $(LIBNAME) $(OPENBLAS_LIBRARY_DIR)
@-ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBNAME) $(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).$(LIBSUFFIX)
@cd $(OPENBLAS_LIBRARY_DIR) ; \
ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
#for install shared library
@echo Copy the shared library to $(OPENBLAS_LIBRARY_DIR)
ifeq ($(OSNAME), Linux)
-cp $(LIBSONAME) $(OPENBLAS_LIBRARY_DIR)
-ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBSONAME) $(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).so
-ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBSONAME) $(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).so.$(MAJOR_VERSION)
@cp $(LIBSONAME) $(OPENBLAS_LIBRARY_DIR)
@cd $(OPENBLAS_LIBRARY_DIR) ; \
ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \
ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
endif
ifeq ($(OSNAME), FreeBSD)
-cp $(LIBSONAME) $(OPENBLAS_LIBRARY_DIR)
-ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBSONAME) $(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).so
@cp $(LIBSONAME) $(OPENBLAS_LIBRARY_DIR)
@cd $(OPENBLAS_LIBRARY_DIR) ; \
ln -fs $(LIBSONAME) $(LIBPREFIX).so
endif
ifeq ($(OSNAME), NetBSD)
-cp $(LIBSONAME) $(OPENBLAS_LIBRARY_DIR)
-ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBSONAME) $(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).so
@cp $(LIBSONAME) $(OPENBLAS_LIBRARY_DIR)
@cd $(OPENBLAS_LIBRARY_DIR) ; \
ln -fs $(LIBSONAME) $(LIBPREFIX).so
endif
ifeq ($(OSNAME), Darwin)
-cp $(LIBDYNNAME) $(OPENBLAS_LIBRARY_DIR)
-install_name_tool -id $(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME) $(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)
-ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME) $(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).dylib
@-cp $(LIBDYNNAME) $(OPENBLAS_LIBRARY_DIR)
@-install_name_tool -id $(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME) $(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)
@-ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME) $(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).dylib
endif
ifeq ($(OSNAME), WINNT)
-cp $(LIBDLLNAME) $(OPENBLAS_LIBRARY_DIR)
-ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBDLLNAME) $(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).dll
@-cp $(LIBDLLNAME) $(OPENBLAS_LIBRARY_DIR)
endif
ifeq ($(OSNAME), CYGWIN_NT)
-cp $(LIBDLLNAME) $(OPENBLAS_LIBRARY_DIR)
-ln -fs $(OPENBLAS_LIBRARY_DIR)/$(LIBDLLNAME) $(OPENBLAS_LIBRARY_DIR)/$(LIBPREFIX).dll
@-cp $(LIBDLLNAME) $(OPENBLAS_LIBRARY_DIR)
endif
@echo Install OK!

View File

@@ -17,13 +17,7 @@ endif
endif
ifdef BINARY64
ifeq ($(OSNAME), Linux)
LDFLAGS = -m elf64ppc
endif
ifeq ($(OSNAME), Darwin)
LDFLAGS = -arch ppc64
endif
ifeq ($(OSNAME), AIX)
CCOMMON_OPT += -mpowerpc64 -maix64
@@ -34,17 +28,12 @@ ifeq ($(COMPILER_F77), xlf)
FCOMMON_OPT += -q64
endif
ARFLAGS = -X 64
LDFLAGS = -b64
ASFLAGS = -a64
endif
else
ifeq ($(OSNAME), Linux)
LDFLAGS = -m elf32ppc
endif
ifeq ($(OSNAME), AIX)
CCOMMON_OPT += -Wa,-a32
ARFLAGS = -X 32
LDFLAGS = -b32
ASFLAGS = -a32
endif
endif

View File

@@ -1,3 +1,5 @@
# This is triggered by Makefile.system and runs before any of the code is built.
export BINARY
export USE_OPENMP
@@ -15,7 +17,7 @@ ifdef CPUIDEMU
EXFLAGS = -DCPUIDEMU -DVENDOR=99
endif
all: getarch_2nd
all: getarch_2nd cblas_noconst.h
./getarch_2nd 0 >> $(TARGET_MAKE)
./getarch_2nd 1 >> $(TARGET_CONF)
@@ -36,4 +38,7 @@ else
$(HOSTCC) -I. $(CFLAGS) -DBUILD_KERNEL -o $(@F) getarch_2nd.c
endif
cblas_noconst.h : cblas.h
perl -ane ' s/\bconst\b\s*//g; print; ' < cblas.h > cblas_noconst.h
dummy:

View File

@@ -3,7 +3,7 @@
#
# This library's version
VERSION = 0.2.2
VERSION = 0.2.8
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
@@ -24,10 +24,13 @@ VERSION = 0.2.2
# Fortran compiler. Default is g77.
# FC = gfortran
# Even you can specify cross compiler
# Even you can specify cross compiler. Meanwhile, please set HOSTCC.
# CC = x86_64-w64-mingw32-gcc
# FC = x86_64-w64-mingw32-gfortran
# If you use the cross compiler, please set this host compiler.
# HOSTCC = gcc
# If you need 32bit binary, define BINARY=32, otherwise define BINARY=64
# BINARY=64
@@ -45,6 +48,9 @@ VERSION = 0.2.2
# automatically detected by the the script.
# NUM_THREADS = 24
# if you don't need generate the shared library, please comment it in.
# NO_SHARED = 1
# If you don't need CBLAS interface, please comment it in.
# NO_CBLAS = 1
@@ -71,6 +77,13 @@ VERSION = 0.2.2
# If you want to disable CPU/Memory affinity on Linux.
# NO_AFFINITY = 1
# Don't use AVX kernel on Sandy Bridge. It is compatible with old compilers
# and OS. However, the performance is low.
# NO_AVX = 1
# Don't use parallel make.
# NO_PARALLEL_MAKE = 1
# If you would like to know minute performance report of GotoBLAS.
# FUNCTION_PROFILE = 1
@@ -94,8 +107,8 @@ VERSION = 0.2.2
# If any gemm arguement m, n or k is less or equal this threshold, gemm will be execute
# with single thread. You can use this flag to avoid the overhead of multi-threading
# in small matrix sizes. The default value is 50.
# GEMM_MULTITHREAD_THRESHOLD = 50
# in small matrix sizes. The default value is 4.
# GEMM_MULTITHREAD_THRESHOLD = 4
# If you need santy check by comparing reference BLAS. It'll be very
# slow (Not implemented yet).
@@ -108,19 +121,16 @@ VERSION = 0.2.2
# The installation directory.
# PREFIX = /opt/OpenBLAS
# Common Optimization Flag; -O2 is enough.
# DEBUG = 1
ifeq ($(DEBUG), 1)
COMMON_OPT += -g
# -DDEBUG
else
COMMON_OPT += -O2
endif
# Common Optimization Flag;
# The default -O2 is enough.
# COMMON_OPT = -O2
# Profiling flags
COMMON_PROF = -pg
# Build Debug version
# DEBUG = 1
#
# End of user configuration
#

View File

@@ -10,7 +10,6 @@ endif
ifeq ($(COMPILER_F77), f90)
FCOMMON_OPT += -xarch=v9
endif
LDFLAGS = -64
else
CCOMMON_OPT += -mcpu=v9

View File

@@ -9,9 +9,7 @@ ifndef TOPDIR
TOPDIR = .
endif
ifndef NETLIB_LAPACK_DIR
NETLIB_LAPACK_DIR = $(TOPDIR)/lapack-3.4.1
endif
NETLIB_LAPACK_DIR = $(TOPDIR)/lapack-netlib
# Default C compiler
# - Only set if not specified on the command line or inherited from the environment.
@@ -20,6 +18,12 @@ endif
# - Default value is 'cc' which is not always a valid command (e.g. MinGW).
ifeq ($(origin CC),default)
CC = gcc
# Change the default compile to clang on Mac OSX.
# http://stackoverflow.com/questions/714100/os-detecting-makefile
UNAME_S := $(shell uname -s)
ifeq ($(UNAME_S),Darwin)
CC = clang
endif
endif
# Default Fortran compiler (FC) is selected by f_check.
@@ -53,16 +57,37 @@ GETARCH_FLAGS += -DUSE64BITINT
endif
ifndef GEMM_MULTITHREAD_THRESHOLD
GEMM_MULTITHREAD_THRESHOLD=50
GEMM_MULTITHREAD_THRESHOLD=4
endif
GETARCH_FLAGS += -DGEMM_MULTITHREAD_THRESHOLD=$(GEMM_MULTITHREAD_THRESHOLD)
ifeq ($(NO_AVX), 1)
GETARCH_FLAGS += -DNO_AVX
endif
ifeq ($(DEBUG), 1)
GETARCH_FLAGS += -g
endif
ifeq ($(QUIET_MAKE), 1)
MAKE += -s
endif
ifndef NO_PARALLEL_MAKE
NO_PARALLEL_MAKE=0
endif
GETARCH_FLAGS += -DNO_PARALLEL_MAKE=$(NO_PARALLEL_MAKE)
ifeq ($(HOSTCC), loongcc)
GETARCH_FLAGS += -static
endif
# This operation is expensive, so execution should be once.
ifndef GOTOBLAS_MAKEFILE
export GOTOBLAS_MAKEFILE = 1
# Generating Makefile.conf and config.h
DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.getarch CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" CFLAGS="$(GETARCH_FLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) TARGET_CORE=$(TARGET_CORE) all)
DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" CFLAGS="$(GETARCH_FLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) TARGET_CORE=$(TARGET_CORE) all)
ifndef TARGET_CORE
include $(TOPDIR)/Makefile.conf
@@ -121,7 +146,7 @@ MD5SUM = md5 -r
endif
ifeq ($(OSNAME), NetBSD)
MD5SUM = md5 -r
MD5SUM = md5 -n
endif
ifeq ($(OSNAME), Linux)
@@ -140,7 +165,39 @@ EXTRALIB += -defaultlib:advapi32
SUFFIX = obj
PSUFFIX = pobj
LIBSUFFIX = lib
LIBSUFFIX = a
ifeq ($(C_COMPILER), CLANG)
CCOMMON_OPT += -DMS_ABI
endif
ifeq ($(C_COMPILER), GCC)
#Test for supporting MS_ABI
GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4)
GCCVERSIONGT4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 4)
GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7)
ifeq ($(GCCVERSIONGT4), 1)
# GCC Majar version > 4
# It is compatible with MSVC ABI.
CCOMMON_OPT += -DMS_ABI
endif
ifeq ($(GCCVERSIONGTEQ4), 1)
ifeq ($(GCCMINORVERSIONGTEQ7), 1)
# GCC Version >=4.7
# It is compatible with MSVC ABI.
CCOMMON_OPT += -DMS_ABI
endif
endif
endif
# Ensure the correct stack alignment on Win32
# http://permalink.gmane.org/gmane.comp.lib.openblas.general/97
ifeq ($(ARCH), x86)
CCOMMON_OPT += -mincoming-stack-boundary=2
FCOMMON_OPT += -mincoming-stack-boundary=2
endif
endif
ifeq ($(OSNAME), Interix)
@@ -195,11 +252,17 @@ NO_BINARY_MODE = 1
endif
ifndef NO_EXPRECISION
ifeq ($(F_COMPILER), GFORTRAN)
ifeq ($(C_COMPILER), GCC)
# ifeq logical or. GCC or LSB
ifeq ($(C_COMPILER), $(filter $(C_COMPILER),GCC LSB))
EXPRECISION = 1
CCOMMON_OPT += -DEXPRECISION -m128bit-long-double
FCOMMON_OPT += -m128bit-long-double
endif
ifeq ($(C_COMPILER), CLANG)
EXPRECISION = 1
CCOMMON_OPT += -DEXPRECISION
FCOMMON_OPT += -m128bit-long-double
endif
endif
endif
endif
@@ -207,11 +270,17 @@ endif
ifeq ($(ARCH), x86_64)
ifndef NO_EXPRECISION
ifeq ($(F_COMPILER), GFORTRAN)
ifeq ($(C_COMPILER), GCC)
# ifeq logical or. GCC or LSB
ifeq ($(C_COMPILER), $(filter $(C_COMPILER),GCC LSB))
EXPRECISION = 1
CCOMMON_OPT += -DEXPRECISION -m128bit-long-double
FCOMMON_OPT += -m128bit-long-double
endif
ifeq ($(C_COMPILER), CLANG)
EXPRECISION = 1
CCOMMON_OPT += -DEXPRECISION
FCOMMON_OPT += -m128bit-long-double
endif
endif
endif
endif
@@ -221,7 +290,13 @@ CCOMMON_OPT += -wd981
endif
ifeq ($(USE_OPENMP), 1)
ifeq ($(C_COMPILER), GCC)
# ifeq logical or. GCC or LSB
ifeq ($(C_COMPILER), $(filter $(C_COMPILER),GCC LSB))
CCOMMON_OPT += -fopenmp
endif
ifeq ($(C_COMPILER), CLANG)
$(error OpenBLAS: Clang didn't support OpenMP yet.)
CCOMMON_OPT += -fopenmp
endif
@@ -244,14 +319,22 @@ endif
endif
ifdef DYNAMIC_ARCH
ifeq ($(DYNAMIC_ARCH), 1)
ifeq ($(ARCH), x86)
DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \
CORE2 PENRYN DUNNINGTON NEHALEM SANDYBRIDGE ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO
CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO
ifneq ($(NO_AVX), 1)
DYNAMIC_CORE += SANDYBRIDGE
#BULLDOZER PILEDRIVER
endif
endif
ifeq ($(ARCH), x86_64)
DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM SANDYBRIDGE OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO
DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO
ifneq ($(NO_AVX), 1)
DYNAMIC_CORE += SANDYBRIDGE
#BULLDOZER PILEDRIVER
endif
endif
ifndef DYNAMIC_CORE
@@ -284,11 +367,18 @@ endif
# C Compiler dependent settings
#
ifeq ($(C_COMPILER), GCC)
# ifeq logical or. GCC or CLANG or LSB
# http://stackoverflow.com/questions/7656425/makefile-ifeq-logical-or
ifeq ($(C_COMPILER), $(filter $(C_COMPILER),GCC CLANG LSB))
CCOMMON_OPT += -Wall
COMMON_PROF += -fno-inline
NO_UNINITIALIZED_WARN = -Wno-uninitialized
ifeq ($(QUIET_MAKE), 1)
CCOMMON_OPT += $(NO_UNINITIALIZED_WARN) -Wno-unused
endif
ifdef NO_BINARY_MODE
ifeq ($(ARCH), mips64)
@@ -373,7 +463,12 @@ endif
ifeq ($(F_COMPILER), GFORTRAN)
CCOMMON_OPT += -DF_INTERFACE_GFORT
FCOMMON_OPT += -Wall
#Don't include -lgfortran, when NO_LAPACK=1 or lsbcc
ifneq ($(NO_LAPACK), 1)
ifneq ($(C_COMPILER), LSB)
EXTRALIB += -lgfortran
endif
endif
ifdef NO_BINARY_MODE
ifeq ($(ARCH), mips64)
ifdef BINARY64
@@ -480,11 +575,28 @@ ifdef INTERFACE64
FCOMMON_OPT += -i8
endif
endif
ifeq ($(ARCH), mips64)
ifndef BINARY64
FCOMMON_OPT += -n32
else
FCOMMON_OPT += -n64
endif
ifeq ($(CORE), LOONGSON3A)
FCOMMON_OPT += -loongson3 -static
endif
ifeq ($(CORE), LOONGSON3B)
FCOMMON_OPT += -loongson3 -static
endif
else
ifndef BINARY64
FCOMMON_OPT += -m32
else
FCOMMON_OPT += -m64
endif
endif
ifdef USE_OPENMP
FEXTRALIB += -lstdc++
@@ -493,12 +605,30 @@ endif
endif
ifeq ($(C_COMPILER), OPEN64)
ifeq ($(ARCH), mips64)
ifndef BINARY64
CCOMMON_OPT += -n32
else
CCOMMON_OPT += -n64
endif
ifeq ($(CORE), LOONGSON3A)
CCOMMON_OPT += -loongson3 -static
endif
ifeq ($(CORE), LOONGSON3B)
CCOMMON_OPT += -loongson3 -static
endif
else
ifndef BINARY64
CCOMMON_OPT += -m32
else
CCOMMON_OPT += -m64
endif
endif
endif
ifeq ($(C_COMPILER), SUN)
CCOMMON_OPT += -w
@@ -562,6 +692,10 @@ ifeq ($(NO_LAPACKE), 1)
CCOMMON_OPT += -DNO_LAPACKE
endif
ifeq ($(NO_AVX), 1)
CCOMMON_OPT += -DNO_AVX
endif
ifdef SMP
CCOMMON_OPT += -DSMP_SERVER
@@ -687,11 +821,30 @@ AWK = awk
REVISION = -r$(VERSION)
MAJOR_VERSION = $(word 1,$(subst ., ,$(VERSION)))
CFLAGS = $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR)
PFLAGS = $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) -DPROFILE $(COMMON_PROF)
ifeq ($(DEBUG), 1)
COMMON_OPT += -g
endif
FFLAGS = $(COMMON_OPT) $(FCOMMON_OPT)
FPFLAGS = $(COMMON_OPT) $(FCOMMON_OPT) $(COMMON_PROF)
ifndef COMMON_OPT
COMMON_OPT = -O2
endif
override CFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR)
override PFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) -DPROFILE $(COMMON_PROF)
override FFLAGS += $(COMMON_OPT) $(FCOMMON_OPT)
override FPFLAGS += $(COMMON_OPT) $(FCOMMON_OPT) $(COMMON_PROF)
#MAKEOVERRIDES =
LAPACK_CFLAGS = $(CFLAGS)
LAPACK_CFLAGS += -DHAVE_LAPACK_CONFIG_H
ifdef INTERFACE64
LAPACK_CFLAGS += -DLAPACK_ILP64
endif
ifeq ($(C_COMPILER), LSB)
LAPACK_CFLAGS += -DLAPACK_COMPLEX_STRUCTURE
endif
ifndef SUFFIX
SUFFIX = o
@@ -705,7 +858,7 @@ ifndef LIBSUFFIX
LIBSUFFIX = a
endif
ifndef DYNAMIC_ARCH
ifneq ($(DYNAMIC_ARCH), 1)
ifndef SMP
LIBNAME = $(LIBPREFIX)_$(LIBCORE)$(REVISION).$(LIBSUFFIX)
LIBNAME_P = $(LIBPREFIX)_$(LIBCORE)$(REVISION)_p.$(LIBSUFFIX)
@@ -724,8 +877,8 @@ endif
endif
LIBDLLNAME = $(LIBPREFIX).dll
LIBSONAME = $(LIBNAME:.$(LIBSUFFIX)=.so)
LIBDLLNAME = $(LIBNAME:.$(LIBSUFFIX)=.dll)
LIBDYNNAME = $(LIBNAME:.$(LIBSUFFIX)=.dylib)
LIBDEFNAME = $(LIBNAME:.$(LIBSUFFIX)=.def)
LIBEXPNAME = $(LIBNAME:.$(LIBSUFFIX)=.exp)
@@ -744,6 +897,7 @@ export CC
export FC
export BU
export FU
export NEED2UNDERSCORES
export USE_THREAD
export NUM_THREADS
export NUM_CORES
@@ -787,6 +941,13 @@ export ZGEMM_UNROLL_M
export ZGEMM_UNROLL_N
export XGEMM_UNROLL_M
export XGEMM_UNROLL_N
export CGEMM3M_UNROLL_M
export CGEMM3M_UNROLL_N
export ZGEMM3M_UNROLL_M
export ZGEMM3M_UNROLL_N
export XGEMM3M_UNROLL_M
export XGEMM3M_UNROLL_N
ifdef USE_CUDA
export CUDADIR

View File

@@ -22,19 +22,19 @@ BLASOBJS += $(QBLASOBJS) $(XBLASOBJS)
BLASOBJS_P += $(QBLASOBJS_P) $(XBLASOBJS_P)
endif
$(SBLASOBJS) $(SBLASOBJS_P) : CFLAGS += -UDOUBLE -UCOMPLEX
$(DBLASOBJS) $(DBLASOBJS_P) : CFLAGS += -DDOUBLE -UCOMPLEX
$(QBLASOBJS) $(QBLASOBJS_P) : CFLAGS += -DXDOUBLE -UCOMPLEX
$(CBLASOBJS) $(CBLASOBJS_P) : CFLAGS += -UDOUBLE -DCOMPLEX
$(ZBLASOBJS) $(ZBLASOBJS_P) : CFLAGS += -DDOUBLE -DCOMPLEX
$(XBLASOBJS) $(XBLASOBJS_P) : CFLAGS += -DXDOUBLE -DCOMPLEX
$(SBLASOBJS) $(SBLASOBJS_P) : override CFLAGS += -UDOUBLE -UCOMPLEX
$(DBLASOBJS) $(DBLASOBJS_P) : override CFLAGS += -DDOUBLE -UCOMPLEX
$(QBLASOBJS) $(QBLASOBJS_P) : override CFLAGS += -DXDOUBLE -UCOMPLEX
$(CBLASOBJS) $(CBLASOBJS_P) : override CFLAGS += -UDOUBLE -DCOMPLEX
$(ZBLASOBJS) $(ZBLASOBJS_P) : override CFLAGS += -DDOUBLE -DCOMPLEX
$(XBLASOBJS) $(XBLASOBJS_P) : override CFLAGS += -DXDOUBLE -DCOMPLEX
$(SBLASOBJS_P) : CFLAGS += -DPROFILE $(COMMON_PROF)
$(DBLASOBJS_P) : CFLAGS += -DPROFILE $(COMMON_PROF)
$(QBLASOBJS_P) : CFLAGS += -DPROFILE $(COMMON_PROF)
$(CBLASOBJS_P) : CFLAGS += -DPROFILE $(COMMON_PROF)
$(ZBLASOBJS_P) : CFLAGS += -DPROFILE $(COMMON_PROF)
$(XBLASOBJS_P) : CFLAGS += -DPROFILE $(COMMON_PROF)
$(SBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF)
$(DBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF)
$(QBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF)
$(CBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF)
$(ZBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF)
$(XBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF)
libs :: $(BLASOBJS) $(COMMONOBJS)
$(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^

View File

@@ -1,8 +1,5 @@
# COMPILER_PREFIX = mingw32-
ifeq ($(OSNAME), Linux)
LDFLAGS = -melf_i386
endif
ifeq ($(OSNAME), Interix)
ARFLAGS = -m x86

View File

@@ -2,25 +2,12 @@
ifeq ($(OSNAME), SunOS)
ifdef BINARY64
LDFLAGS = -64
ifeq ($(F_COMPILER), SUN)
FCOMMON_OPT += -m64
endif
endif
endif
ifeq ($(OSNAME), FreeBSD)
LDFLAGS = -m elf_x86_64_fbsd
endif
ifeq ($(OSNAME), Linux)
LDFLAGS = -m elf_x86_64
endif
ifeq ($(OSNAME), Darwin)
LDFLAGS =
endif
ifeq ($(OSNAME), Interix)
ARFLAGS = -m x64
endif

View File

@@ -1,11 +1,20 @@
# OpenBLAS
[![Build Status](https://travis-ci.org/xianyi/OpenBLAS.png?branch=develop)](https://travis-ci.org/xianyi/OpenBLAS)
## Introduction
OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version. OpenBLAS is an open source project supported by Lab of Parallel Software and Computational Science, ISCAS <http://www.rdcps.ac.cn>.
OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version.
Please read the documents on OpenBLAS wiki pages <http://github.com/xianyi/OpenBLAS/wiki>.
## Installation
## Binary Packages
We provide binary packages for the following platform.
* Windows x86/x86_64
You can download them from [file hosting on sourceforge.net](https://sourceforge.net/projects/openblas/files/).
## Installation from Source
Download from project homepage. http://xianyi.github.com/OpenBLAS/
Or, check out codes from git://github.com/xianyi/OpenBLAS.git
@@ -23,11 +32,15 @@ On X86 box, compile this library for loongson3a CPU.
make BINARY=64 CC=mips64el-unknown-linux-gnu-gcc FC=mips64el-unknown-linux-gnu-gfortran HOSTCC=gcc TARGET=LOONGSON3A
On X86 box, compile this library for loongson3a CPU with loongcc (based on Open64) compiler.
make CC=loongcc FC=loongf95 HOSTCC=gcc TARGET=LOONGSON3A CROSS=1 CROSS_SUFFIX=mips64el-st-linux-gnu- NO_LAPACKE=1 NO_SHARED=1 BINARY=32
### Debug version
make DEBUG=1
### Intall to the directory (Optional)
### Install to the directory (optional)
Example:
@@ -43,8 +56,10 @@ Please read GotoBLAS_01Readme.txt
#### x86/x86-64:
- **Intel Xeon 56xx (Westmere)**: Used GotoBLAS2 Nehalem codes.
- **Intel Sandy Bridge**: Optimized Level-3 BLAS with AVX on x86-64.
- **Intel Haswell**: Optimized Level-3 BLAS with AVX on x86-64 (identical to Sandy Bridge).
- **AMD Bobcat**: Used GotoBLAS2 Barcelona codes.
- **AMD Bulldozer**: Used GotoBLAS2 Barcelona codes.
- **AMD Bulldozer**: x86-64 S/DGEMM AVX kernels. (Thank Werner Saar)
- **AMD PILEDRIVER**: Used Bulldozer codes.
#### MIPS64:
- **ICT Loongson 3A**: Optimized Level-3 BLAS and the part of Level-1,2.
@@ -54,7 +69,7 @@ Please read GotoBLAS_01Readme.txt
- **GNU/Linux**
- **MingWin/Windows**: Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-in-Microsoft-Visual-Studio>.
- **Darwin/Mac OS X**: Experimental. Although GotoBLAS2 supports Darwin, we are the beginner on Mac OS X.
- **FreeBSD**: Supportted by community. We didn't test the library on this OS.
- **FreeBSD**: Supported by community. We didn't test the library on this OS.
## Usages
Link with libopenblas.a or -lopenblas for shared library.
@@ -79,7 +94,7 @@ If you compile this lib with USE_OPENMP=1, you should set OMP_NUM_THREADS enviro
### Set the number of threads on runtime.
We provided the below functions to controll the number of threads on runtime. So far, we didn't support changing the number of threads on Windows. On Windows, these functions are dummy.
We provided the below functions to control the number of threads on runtime.
void goto_set_num_threads(int num_threads);
@@ -91,21 +106,23 @@ If you compile this lib with USE_OPENMP=1, you should use the above functions, t
Please add a issue in https://github.com/xianyi/OpenBLAS/issues
## Contact
OpenBLAS users mailing list: http://list.rdcps.ac.cn/mailman/listinfo/openblas
* OpenBLAS users mailing list: https://groups.google.com/forum/#!forum/openblas-users
* OpenBLAS developers mailing list: https://groups.google.com/forum/#!forum/openblas-dev
## ChangeLog
Please see Changelog.txt to obtain the differences between GotoBLAS2 1.13 BSD version.
## Troubleshooting
* Please read [Faq](https://github.com/xianyi/OpenBLAS/wiki/Faq) at first.
* Please use gcc version 4.6 and above to compile Sandy Bridge AVX kernels on Linux/MingW/BSD.
* Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture. The Clang 3.0 will generate the wrong AVX binary code.
* The number of CPUs/Cores should less than or equal to 256.
* On Linux, OpenBLAS sets the processor affinity by default. This may cause [the conflict with R parallel](https://stat.ethz.ch/pipermail/r-sig-hpc/2012-April/001348.html). You can build the library with NO_AFFINITY=1.
* On Loongson 3A. make test would be failed because of pthread_create error. The error code is EAGAIN. However, it will be OK when you run the same testcase on shell.
## Specification of Git Branches
We used the git branching model in this article (http://nvie.com/posts/a-successful-git-branching-model/).
Now, there are 4 branches in github.com.
* The master branch. This a main branch to reflect a production-ready state.
* The develop branch. This a main branch to reflect a state with the latest delivered development changes for the next release.
* The loongson3a branch. This is a feature branch. We develop Loongson3A codes on this branch. We will merge this feature to develop branch in future.
* The gh-pages branch. This is for web pages
## Contributing
1. [Check for open issues](https://github.com/xianyi/OpenBLAS/issues) or open a fresh issue to start a discussion around a feature idea or a bug.
1. Fork the [OpenBLAS](https://github.com/xianyi/OpenBLAS) repository to start making your changes.
1. Write a test which shows that the bug was fixed or that the feature works as expected.
1. Send a pull request. Make sure to add yourself to `CONTRIBUTORS.md`.

View File

@@ -8,8 +8,8 @@ Supported List:
1.X86/X86_64
a)Intel CPU:
P2
COPPERMINE
KATMAI
COPPERMINE
NORTHWOOD
PRESCOTT
BANIAS
@@ -29,6 +29,7 @@ BARCELONA
SHANGHAI
ISTANBUL
BOBCAT
BULLDOZER
c)VIA CPU:
SSE_GENERIC

18
c_check
View File

@@ -33,6 +33,8 @@ if ($ARGV[0] =~ /(.*)(-[.\d]+)/) {
}
$compiler = "";
$compiler = LSB if ($data =~ /COMPILER_LSB/);
$compiler = CLANG if ($data =~ /COMPILER_CLANG/);
$compiler = PGI if ($data =~ /COMPILER_PGI/);
$compiler = PATHSCALE if ($data =~ /COMPILER_PATHSCALE/);
$compiler = INTEL if ($data =~ /COMPILER_INTEL/);
@@ -117,7 +119,11 @@ if ($compiler eq "OPEN64") {
$openmp = "-mp";
}
if ($compiler eq "GCC") {
if ($compiler eq "CLANG") {
$openmp = "-fopenmp";
}
if ($compiler eq "GCC" || $compiler eq "LSB") {
$openmp = "-fopenmp";
}
@@ -241,13 +247,13 @@ print CONFFILE "#define FUNDERSCORE\t$need_fu\n" if $need_fu ne "";
if ($os eq "LINUX") {
@pthread = split(/\s+/, `nm /lib/libpthread.so* | grep _pthread_create`);
# @pthread = split(/\s+/, `nm /lib/libpthread.so* | grep _pthread_create`);
if ($pthread[2] ne "") {
print CONFFILE "#define PTHREAD_CREATE_FUNC $pthread[2]\n";
} else {
# if ($pthread[2] ne "") {
# print CONFFILE "#define PTHREAD_CREATE_FUNC $pthread[2]\n";
# } else {
print CONFFILE "#define PTHREAD_CREATE_FUNC pthread_create\n";
}
# }
} else {
print CONFFILE "#define PTHREAD_CREATE_FUNC pthread_create\n";
}

458
cblas.h
View File

@@ -1,291 +1,303 @@
#ifndef CBLAS_H
#define CBLAS_H
#include <stddef.h>
#include "common.h"
#ifdef __cplusplus
extern "C" {
/* Assume C declarations for C++ */
#endif /* __cplusplus */
#include <stddef.h>
#include "common.h"
/*Set the number of threads on runtime.*/
void openblas_set_num_threads(int num_threads);
void goto_set_num_threads(int num_threads);
/*Get the build configure on runtime.*/
char* openblas_get_config(void);
/* Get the parallelization type which is used by OpenBLAS */
int openblas_get_parallel(void);
/* OpenBLAS is compiled for sequential use */
#define OPENBLAS_SEQUENTIAL 0
/* OpenBLAS is compiled using normal threading model */
#define OPENBLAS_THREAD 1
/* OpenBLAS is compiled using OpenMP threading model */
#define OPENBLAS_OPENMP 2
#define CBLAS_INDEX size_t
enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102};
enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113, CblasConjNoTrans=114};
enum CBLAS_UPLO {CblasUpper=121, CblasLower=122};
enum CBLAS_DIAG {CblasNonUnit=131, CblasUnit=132};
enum CBLAS_SIDE {CblasLeft=141, CblasRight=142};
typedef enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102} CBLAS_ORDER;
typedef enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113, CblasConjNoTrans=114} CBLAS_TRANSPOSE;
typedef enum CBLAS_UPLO {CblasUpper=121, CblasLower=122} CBLAS_UPLO;
typedef enum CBLAS_DIAG {CblasNonUnit=131, CblasUnit=132} CBLAS_DIAG;
typedef enum CBLAS_SIDE {CblasLeft=141, CblasRight=142} CBLAS_SIDE;
float cblas_sdsdot(blasint n, float, float *x, blasint incx, float *y, blasint incy);
double cblas_dsdot (blasint n, float *x, blasint incx, float *y, blasint incy);
float cblas_sdot(blasint n, float *x, blasint incx, float *y, blasint incy);
double cblas_ddot(blasint n, double *x, blasint incx, double *y, blasint incy);
float cblas_sdsdot(const blasint n, const float alpha, const float *x, const blasint incx, const float *y, const blasint incy);
double cblas_dsdot (const blasint n, const float *x, const blasint incx, const float *y, const blasint incy);
float cblas_sdot(const blasint n, const float *x, const blasint incx, const float *y, const blasint incy);
double cblas_ddot(const blasint n, const double *x, const blasint incx, const double *y, const blasint incy);
openblas_complex_float cblas_cdotu(blasint n, float *x, blasint incx, float *y, blasint incy);
openblas_complex_float cblas_cdotc(blasint n, float *x, blasint incx, float *y, blasint incy);
openblas_complex_double cblas_zdotu(blasint n, double *x, blasint incx, double *y, blasint incy);
openblas_complex_double cblas_zdotc(blasint n, double *x, blasint incx, double *y, blasint incy);
openblas_complex_float cblas_cdotu(const blasint n, const float *x, const blasint incx, const float *y, const blasint incy);
openblas_complex_float cblas_cdotc(const blasint n, const float *x, const blasint incx, const float *y, const blasint incy);
openblas_complex_double cblas_zdotu(const blasint n, const double *x, const blasint incx, const double *y, const blasint incy);
openblas_complex_double cblas_zdotc(const blasint n, const double *x, const blasint incx, const double *y, const blasint incy);
void cblas_cdotu_sub(blasint n, float *x, blasint incx, float *y, blasint incy, openblas_complex_float *ret);
void cblas_cdotc_sub(blasint n, float *x, blasint incx, float *y, blasint incy, openblas_complex_float *ret);
void cblas_zdotu_sub(blasint n, double *x, blasint incx, double *y, blasint incy, openblas_complex_double *ret);
void cblas_zdotc_sub(blasint n, double *x, blasint incx, double *y, blasint incy, openblas_complex_double *ret);
void cblas_cdotu_sub(const blasint n, const float *x, const blasint incx, const float *y, const blasint incy, openblas_complex_float *ret);
void cblas_cdotc_sub(const blasint n, const float *x, const blasint incx, const float *y, const blasint incy, openblas_complex_float *ret);
void cblas_zdotu_sub(const blasint n, const double *x, const blasint incx, const double *y, const blasint incy, openblas_complex_double *ret);
void cblas_zdotc_sub(const blasint n, const double *x, const blasint incx, const double *y, const blasint incy, openblas_complex_double *ret);
float cblas_sasum (blasint n, float *x, blasint incx);
double cblas_dasum (blasint n, double *x, blasint incx);
float cblas_scasum(blasint n, float *x, blasint incx);
double cblas_dzasum(blasint n, double *x, blasint incx);
float cblas_sasum (const blasint n, const float *x, const blasint incx);
double cblas_dasum (const blasint n, const double *x, const blasint incx);
float cblas_scasum(const blasint n, const float *x, const blasint incx);
double cblas_dzasum(const blasint n, const double *x, const blasint incx);
float cblas_snrm2 (blasint N, float *X, blasint incX);
double cblas_dnrm2 (blasint N, double *X, blasint incX);
float cblas_scnrm2(blasint N, float *X, blasint incX);
double cblas_dznrm2(blasint N, double *X, blasint incX);
float cblas_snrm2 (const blasint N, const float *X, const blasint incX);
double cblas_dnrm2 (const blasint N, const double *X, const blasint incX);
float cblas_scnrm2(const blasint N, const float *X, const blasint incX);
double cblas_dznrm2(const blasint N, const double *X, const blasint incX);
CBLAS_INDEX cblas_isamax(blasint n, float *x, blasint incx);
CBLAS_INDEX cblas_idamax(blasint n, double *x, blasint incx);
CBLAS_INDEX cblas_icamax(blasint n, float *x, blasint incx);
CBLAS_INDEX cblas_izamax(blasint n, double *x, blasint incx);
CBLAS_INDEX cblas_isamax(const blasint n, const float *x, const blasint incx);
CBLAS_INDEX cblas_idamax(const blasint n, const double *x, const blasint incx);
CBLAS_INDEX cblas_icamax(const blasint n, const float *x, const blasint incx);
CBLAS_INDEX cblas_izamax(const blasint n, const double *x, const blasint incx);
void cblas_saxpy(blasint n, float, float *x, blasint incx, float *y, blasint incy);
void cblas_daxpy(blasint n, double, double *x, blasint incx, double *y, blasint incy);
void cblas_caxpy(blasint n, float *, float *x, blasint incx, float *y, blasint incy);
void cblas_zaxpy(blasint n, double *, double *x, blasint incx, double *y, blasint incy);
void cblas_saxpy(const blasint n, const float alpha, const float *x, const blasint incx, float *y, const blasint incy);
void cblas_daxpy(const blasint n, const double alpha, const double *x, const blasint incx, double *y, const blasint incy);
void cblas_caxpy(const blasint n, const float *alpha, const float *x, const blasint incx, float *y, const blasint incy);
void cblas_zaxpy(const blasint n, const double *alpha, const double *x, const blasint incx, double *y, const blasint incy);
void cblas_scopy(blasint n, float *x, blasint incx, float *y, blasint incy);
void cblas_dcopy(blasint n, double *x, blasint incx, double *y, blasint incy);
void cblas_ccopy(blasint n, float *x, blasint incx, float *y, blasint incy);
void cblas_zcopy(blasint n, double *x, blasint incx, double *y, blasint incy);
void cblas_scopy(const blasint n, const float *x, const blasint incx, float *y, const blasint incy);
void cblas_dcopy(const blasint n, const double *x, const blasint incx, double *y, const blasint incy);
void cblas_ccopy(const blasint n, const float *x, const blasint incx, float *y, const blasint incy);
void cblas_zcopy(const blasint n, const double *x, const blasint incx, double *y, const blasint incy);
void cblas_sswap(blasint n, float *x, blasint incx, float *y, blasint incy);
void cblas_dswap(blasint n, double *x, blasint incx, double *y, blasint incy);
void cblas_cswap(blasint n, float *x, blasint incx, float *y, blasint incy);
void cblas_zswap(blasint n, double *x, blasint incx, double *y, blasint incy);
void cblas_sswap(const blasint n, float *x, const blasint incx, float *y, const blasint incy);
void cblas_dswap(const blasint n, double *x, const blasint incx, double *y, const blasint incy);
void cblas_cswap(const blasint n, float *x, const blasint incx, float *y, const blasint incy);
void cblas_zswap(const blasint n, double *x, const blasint incx, double *y, const blasint incy);
void cblas_srot(blasint N, float *X, blasint incX, float *Y, blasint incY, float c, float s);
void cblas_drot(blasint N, double *X, blasint incX, double *Y, blasint incY, double c, double s);
void cblas_srot(const blasint N, float *X, const blasint incX, float *Y, const blasint incY, const float c, const float s);
void cblas_drot(const blasint N, double *X, const blasint incX, double *Y, const blasint incY, const double c, const double s);
void cblas_srotg(float *a, float *b, float *c, float *s);
void cblas_drotg(double *a, double *b, double *c, double *s);
void cblas_srotm(blasint N, float *X, blasint incX, float *Y, blasint incY, float *P);
void cblas_drotm(blasint N, double *X, blasint incX, double *Y, blasint incY, double *P);
void cblas_srotm(const blasint N, float *X, const blasint incX, float *Y, const blasint incY, const float *P);
void cblas_drotm(const blasint N, double *X, const blasint incX, double *Y, const blasint incY, const double *P);
void cblas_srotmg(float *d1, float *d2, float *b1, float b2, float *P);
void cblas_drotmg(double *d1, double *d2, double *b1, double b2, double *P);
void cblas_srotmg(float *d1, float *d2, float *b1, const float b2, float *P);
void cblas_drotmg(double *d1, double *d2, double *b1, const double b2, double *P);
void cblas_sscal(blasint N, float alpha, float *X, blasint incX);
void cblas_dscal(blasint N, double alpha, double *X, blasint incX);
void cblas_cscal(blasint N, float *alpha, float *X, blasint incX);
void cblas_zscal(blasint N, double *alpha, double *X, blasint incX);
void cblas_csscal(blasint N, float alpha, float *X, blasint incX);
void cblas_zdscal(blasint N, double alpha, double *X, blasint incX);
void cblas_sscal(const blasint N, const float alpha, float *X, const blasint incX);
void cblas_dscal(const blasint N, const double alpha, double *X, const blasint incX);
void cblas_cscal(const blasint N, const float *alpha, float *X, const blasint incX);
void cblas_zscal(const blasint N, const double *alpha, double *X, const blasint incX);
void cblas_csscal(const blasint N, const float alpha, float *X, const blasint incX);
void cblas_zdscal(const blasint N, const double alpha, double *X, const blasint incX);
void cblas_sgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n,
float alpha, float *a, blasint lda, float *x, blasint incx, float beta, float *y, blasint incy);
void cblas_dgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n,
double alpha, double *a, blasint lda, double *x, blasint incx, double beta, double *y, blasint incy);
void cblas_cgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n,
float *alpha, float *a, blasint lda, float *x, blasint incx, float *beta, float *y, blasint incy);
void cblas_zgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n,
double *alpha, double *a, blasint lda, double *x, blasint incx, double *beta, double *y, blasint incy);
void cblas_sgemv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE trans, const blasint m, const blasint n,
const float alpha, const float *a, const blasint lda, const float *x, const blasint incx, const float beta, float *y, const blasint incy);
void cblas_dgemv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE trans, const blasint m, const blasint n,
const double alpha, const double *a, const blasint lda, const double *x, const blasint incx, const double beta, double *y, const blasint incy);
void cblas_cgemv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE trans, const blasint m, const blasint n,
const float *alpha, const float *a, const blasint lda, const float *x, const blasint incx, const float *beta, float *y, const blasint incy);
void cblas_zgemv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE trans, const blasint m, const blasint n,
const double *alpha, const double *a, const blasint lda, const double *x, const blasint incx, const double *beta, double *y, const blasint incy);
void cblas_sger (enum CBLAS_ORDER order, blasint M, blasint N, float alpha, float *X, blasint incX, float *Y, blasint incY, float *A, blasint lda);
void cblas_dger (enum CBLAS_ORDER order, blasint M, blasint N, double alpha, double *X, blasint incX, double *Y, blasint incY, double *A, blasint lda);
void cblas_cgeru(enum CBLAS_ORDER order, blasint M, blasint N, float *alpha, float *X, blasint incX, float *Y, blasint incY, float *A, blasint lda);
void cblas_cgerc(enum CBLAS_ORDER order, blasint M, blasint N, float *alpha, float *X, blasint incX, float *Y, blasint incY, float *A, blasint lda);
void cblas_zgeru(enum CBLAS_ORDER order, blasint M, blasint N, double *alpha, double *X, blasint incX, double *Y, blasint incY, double *A, blasint lda);
void cblas_zgerc(enum CBLAS_ORDER order, blasint M, blasint N, double *alpha, double *X, blasint incX, double *Y, blasint incY, double *A, blasint lda);
void cblas_sger (const enum CBLAS_ORDER order, const blasint M, const blasint N, const float alpha, const float *X, const blasint incX, const float *Y, const blasint incY, float *A, const blasint lda);
void cblas_dger (const enum CBLAS_ORDER order, const blasint M, const blasint N, const double alpha, const double *X, const blasint incX, const double *Y, const blasint incY, double *A, const blasint lda);
void cblas_cgeru(const enum CBLAS_ORDER order, const blasint M, const blasint N, const float *alpha, const float *X, const blasint incX, const float *Y, const blasint incY, float *A, const blasint lda);
void cblas_cgerc(const enum CBLAS_ORDER order, const blasint M, const blasint N, const float *alpha, const float *X, const blasint incX, const float *Y, const blasint incY, float *A, const blasint lda);
void cblas_zgeru(const enum CBLAS_ORDER order, const blasint M, const blasint N, const double *alpha, const double *X, const blasint incX, const double *Y, const blasint incY, double *A, const blasint lda);
void cblas_zgerc(const enum CBLAS_ORDER order, const blasint M, const blasint N, const double *alpha, const double *X, const blasint incX, const double *Y, const blasint incY, double *A, const blasint lda);
void cblas_strsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX);
void cblas_dtrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX);
void cblas_ctrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX);
void cblas_ztrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX);
void cblas_strsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const float *A, const blasint lda, float *X, const blasint incX);
void cblas_dtrsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const double *A, const blasint lda, double *X, const blasint incX);
void cblas_ctrsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const float *A, const blasint lda, float *X, const blasint incX);
void cblas_ztrsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const double *A, const blasint lda, double *X, const blasint incX);
void cblas_strmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX);
void cblas_dtrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX);
void cblas_ctrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX);
void cblas_ztrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX);
void cblas_strmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const float *A, const blasint lda, float *X, const blasint incX);
void cblas_dtrmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const double *A, const blasint lda, double *X, const blasint incX);
void cblas_ctrmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const float *A, const blasint lda, float *X, const blasint incX);
void cblas_ztrmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const double *A, const blasint lda, double *X, const blasint incX);
void cblas_ssyr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *A, blasint lda);
void cblas_dsyr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *A, blasint lda);
void cblas_cher(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *A, blasint lda);
void cblas_zher(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *A, blasint lda);
void cblas_ssyr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *X, const blasint incX, float *A, const blasint lda);
void cblas_dsyr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *X, const blasint incX, double *A, const blasint lda);
void cblas_cher(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *X, const blasint incX, float *A, const blasint lda);
void cblas_zher(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *X, const blasint incX, double *A, const blasint lda);
void cblas_ssyr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,blasint N, float alpha, float *X,
blasint incX, float *Y, blasint incY, float *A, blasint lda);
void cblas_dsyr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X,
blasint incX, double *Y, blasint incY, double *A, blasint lda);
void cblas_cher2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float *alpha, float *X, blasint incX,
float *Y, blasint incY, float *A, blasint lda);
void cblas_zher2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double *alpha, double *X, blasint incX,
double *Y, blasint incY, double *A, blasint lda);
void cblas_ssyr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,const blasint N, const float alpha, const float *X,
const blasint incX, const float *Y, const blasint incY, float *A, const blasint lda);
void cblas_dsyr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *X,
const blasint incX, const double *Y, const blasint incY, double *A, const blasint lda);
void cblas_cher2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float *alpha, const float *X, const blasint incX,
const float *Y, const blasint incY, float *A, const blasint lda);
void cblas_zher2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double *alpha, const double *X, const blasint incX,
const double *Y, const blasint incY, double *A, const blasint lda);
void cblas_sgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N,
blasint KL, blasint KU, float alpha, float *A, blasint lda, float *X, blasint incX, float beta, float *Y, blasint incY);
void cblas_dgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N,
blasint KL, blasint KU, double alpha, double *A, blasint lda, double *X, blasint incX, double beta, double *Y, blasint incY);
void cblas_cgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N,
blasint KL, blasint KU, float *alpha, float *A, blasint lda, float *X, blasint incX, float *beta, float *Y, blasint incY);
void cblas_zgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N,
blasint KL, blasint KU, double *alpha, double *A, blasint lda, double *X, blasint incX, double *beta, double *Y, blasint incY);
void cblas_sgbmv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const blasint M, const blasint N,
const blasint KL, const blasint KU, const float alpha, const float *A, const blasint lda, const float *X, const blasint incX, const float beta, float *Y, const blasint incY);
void cblas_dgbmv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const blasint M, const blasint N,
const blasint KL, const blasint KU, const double alpha, const double *A, const blasint lda, const double *X, const blasint incX, const double beta, double *Y, const blasint incY);
void cblas_cgbmv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const blasint M, const blasint N,
const blasint KL, const blasint KU, const float *alpha, const float *A, const blasint lda, const float *X, const blasint incX, const float *beta, float *Y, const blasint incY);
void cblas_zgbmv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const blasint M, const blasint N,
const blasint KL, const blasint KU, const double *alpha, const double *A, const blasint lda, const double *X, const blasint incX, const double *beta, double *Y, const blasint incY);
void cblas_ssbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K, float alpha, float *A,
blasint lda, float *X, blasint incX, float beta, float *Y, blasint incY);
void cblas_dsbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K, double alpha, double *A,
blasint lda, double *X, blasint incX, double beta, double *Y, blasint incY);
void cblas_ssbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const blasint K, const float alpha, const float *A,
const blasint lda, const float *X, const blasint incX, const float beta, float *Y, const blasint incY);
void cblas_dsbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const blasint K, const double alpha, const double *A,
const blasint lda, const double *X, const blasint incX, const double beta, double *Y, const blasint incY);
void cblas_stbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
blasint N, blasint K, float *A, blasint lda, float *X, blasint incX);
void cblas_dtbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
blasint N, blasint K, double *A, blasint lda, double *X, blasint incX);
void cblas_ctbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
blasint N, blasint K, float *A, blasint lda, float *X, blasint incX);
void cblas_ztbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
blasint N, blasint K, double *A, blasint lda, double *X, blasint incX);
void cblas_stbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
const blasint N, const blasint K, const float *A, const blasint lda, float *X, const blasint incX);
void cblas_dtbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
const blasint N, const blasint K, const double *A, const blasint lda, double *X, const blasint incX);
void cblas_ctbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
const blasint N, const blasint K, const float *A, const blasint lda, float *X, const blasint incX);
void cblas_ztbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
const blasint N, const blasint K, const double *A, const blasint lda, double *X, const blasint incX);
void cblas_stbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
blasint N, blasint K, float *A, blasint lda, float *X, blasint incX);
void cblas_dtbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
blasint N, blasint K, double *A, blasint lda, double *X, blasint incX);
void cblas_ctbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
blasint N, blasint K, float *A, blasint lda, float *X, blasint incX);
void cblas_ztbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
blasint N, blasint K, double *A, blasint lda, double *X, blasint incX);
void cblas_stbsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
const blasint N, const blasint K, const float *A, const blasint lda, float *X, const blasint incX);
void cblas_dtbsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
const blasint N, const blasint K, const double *A, const blasint lda, double *X, const blasint incX);
void cblas_ctbsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
const blasint N, const blasint K, const float *A, const blasint lda, float *X, const blasint incX);
void cblas_ztbsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
const blasint N, const blasint K, const double *A, const blasint lda, double *X, const blasint incX);
void cblas_stpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
blasint N, float *Ap, float *X, blasint incX);
void cblas_dtpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
blasint N, double *Ap, double *X, blasint incX);
void cblas_ctpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
blasint N, float *Ap, float *X, blasint incX);
void cblas_ztpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
blasint N, double *Ap, double *X, blasint incX);
void cblas_stpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
const blasint N, const float *Ap, float *X, const blasint incX);
void cblas_dtpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
const blasint N, const double *Ap, double *X, const blasint incX);
void cblas_ctpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
const blasint N, const float *Ap, float *X, const blasint incX);
void cblas_ztpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
const blasint N, const double *Ap, double *X, const blasint incX);
void cblas_stpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
blasint N, float *Ap, float *X, blasint incX);
void cblas_dtpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
blasint N, double *Ap, double *X, blasint incX);
void cblas_ctpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
blasint N, float *Ap, float *X, blasint incX);
void cblas_ztpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
blasint N, double *Ap, double *X, blasint incX);
void cblas_stpsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
const blasint N, const float *Ap, float *X, const blasint incX);
void cblas_dtpsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
const blasint N, const double *Ap, double *X, const blasint incX);
void cblas_ctpsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
const blasint N, const float *Ap, float *X, const blasint incX);
void cblas_ztpsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag,
const blasint N, const double *Ap, double *X, const blasint incX);
void cblas_ssymv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *A,
blasint lda, float *X, blasint incX, float beta, float *Y, blasint incY);
void cblas_dsymv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *A,
blasint lda, double *X, blasint incX, double beta, double *Y, blasint incY);
void cblas_chemv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float *alpha, float *A,
blasint lda, float *X, blasint incX, float *beta, float *Y, blasint incY);
void cblas_zhemv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double *alpha, double *A,
blasint lda, double *X, blasint incX, double *beta, double *Y, blasint incY);
void cblas_ssymv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *A,
const blasint lda, const float *X, const blasint incX, const float beta, float *Y, const blasint incY);
void cblas_dsymv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *A,
const blasint lda, const double *X, const blasint incX, const double beta, double *Y, const blasint incY);
void cblas_chemv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float *alpha, const float *A,
const blasint lda, const float *X, const blasint incX, const float *beta, float *Y, const blasint incY);
void cblas_zhemv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double *alpha, const double *A,
const blasint lda, const double *X, const blasint incX, const double *beta, double *Y, const blasint incY);
void cblas_sspmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *Ap,
float *X, blasint incX, float beta, float *Y, blasint incY);
void cblas_dspmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *Ap,
double *X, blasint incX, double beta, double *Y, blasint incY);
void cblas_sspmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *Ap,
const float *X, const blasint incX, const float beta, float *Y, const blasint incY);
void cblas_dspmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *Ap,
const double *X, const blasint incX, const double beta, double *Y, const blasint incY);
void cblas_sspr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *Ap);
void cblas_dspr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *Ap);
void cblas_sspr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *X, const blasint incX, float *Ap);
void cblas_dspr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *X, const blasint incX, double *Ap);
void cblas_chpr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *A);
void cblas_zhpr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X,blasint incX, double *A);
void cblas_chpr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *X, const blasint incX, float *A);
void cblas_zhpr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *X,const blasint incX, double *A);
void cblas_sspr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *Y, blasint incY, float *A);
void cblas_dspr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *Y, blasint incY, double *A);
void cblas_chpr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float *alpha, float *X, blasint incX, float *Y, blasint incY, float *Ap);
void cblas_zhpr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double *alpha, double *X, blasint incX, double *Y, blasint incY, double *Ap);
void cblas_sspr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *X, const blasint incX, const float *Y, const blasint incY, float *A);
void cblas_dspr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *X, const blasint incX, const double *Y, const blasint incY, double *A);
void cblas_chpr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float *alpha, const float *X, const blasint incX, const float *Y, const blasint incY, float *Ap);
void cblas_zhpr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double *alpha, const double *X, const blasint incX, const double *Y, const blasint incY, double *Ap);
void cblas_chbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K,
float *alpha, float *A, blasint lda, float *X, blasint incX, float *beta, float *Y, blasint incY);
void cblas_zhbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K,
double *alpha, double *A, blasint lda, double *X, blasint incX, double *beta, double *Y, blasint incY);
void cblas_chbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const blasint K,
const float *alpha, const float *A, const blasint lda, const float *X, const blasint incX, const float *beta, float *Y, const blasint incY);
void cblas_zhbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const blasint K,
const double *alpha, const double *A, const blasint lda, const double *X, const blasint incX, const double *beta, double *Y, const blasint incY);
void cblas_chpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N,
float *alpha, float *Ap, float *X, blasint incX, float *beta, float *Y, blasint incY);
void cblas_zhpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N,
double *alpha, double *Ap, double *X, blasint incX, double *beta, double *Y, blasint incY);
void cblas_chpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N,
const float *alpha, const float *Ap, const float *X, const blasint incX, const float *beta, float *Y, const blasint incY);
void cblas_zhpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N,
const double *alpha, const double *Ap, const double *X, const blasint incX, const double *beta, double *Y, const blasint incY);
void cblas_sgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K,
float alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc);
void cblas_dgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K,
double alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc);
void cblas_cgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K,
float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc);
void cblas_zgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K,
double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc);
void cblas_sgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const blasint M, const blasint N, const blasint K,
const float alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float beta, float *C, const blasint ldc);
void cblas_dgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const blasint M, const blasint N, const blasint K,
const double alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double beta, double *C, const blasint ldc);
void cblas_cgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const blasint M, const blasint N, const blasint K,
const float *alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float *beta, float *C, const blasint ldc);
void cblas_zgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const blasint M, const blasint N, const blasint K,
const double *alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double *beta, double *C, const blasint ldc);
void cblas_ssymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N,
float alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc);
void cblas_dsymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N,
double alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc);
void cblas_csymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N,
float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc);
void cblas_zsymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N,
double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc);
void cblas_ssymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const blasint M, const blasint N,
const float alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float beta, float *C, const blasint ldc);
void cblas_dsymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const blasint M, const blasint N,
const double alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double beta, double *C, const blasint ldc);
void cblas_csymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const blasint M, const blasint N,
const float *alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float *beta, float *C, const blasint ldc);
void cblas_zsymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const blasint M, const blasint N,
const double *alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double *beta, double *C, const blasint ldc);
void cblas_ssyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans,
blasint N, blasint K, float alpha, float *A, blasint lda, float beta, float *C, blasint ldc);
void cblas_dsyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans,
blasint N, blasint K, double alpha, double *A, blasint lda, double beta, double *C, blasint ldc);
void cblas_csyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans,
blasint N, blasint K, float *alpha, float *A, blasint lda, float *beta, float *C, blasint ldc);
void cblas_zsyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans,
blasint N, blasint K, double *alpha, double *A, blasint lda, double *beta, double *C, blasint ldc);
void cblas_ssyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans,
const blasint N, const blasint K, const float alpha, const float *A, const blasint lda, const float beta, float *C, const blasint ldc);
void cblas_dsyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans,
const blasint N, const blasint K, const double alpha, const double *A, const blasint lda, const double beta, double *C, const blasint ldc);
void cblas_csyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans,
const blasint N, const blasint K, const float *alpha, const float *A, const blasint lda, const float *beta, float *C, const blasint ldc);
void cblas_zsyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans,
const blasint N, const blasint K, const double *alpha, const double *A, const blasint lda, const double *beta, double *C, const blasint ldc);
void cblas_ssyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans,
blasint N, blasint K, float alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc);
void cblas_dsyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans,
blasint N, blasint K, double alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc);
void cblas_csyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans,
blasint N, blasint K, float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc);
void cblas_zsyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans,
blasint N, blasint K, double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc);
void cblas_ssyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans,
const blasint N, const blasint K, const float alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float beta, float *C, const blasint ldc);
void cblas_dsyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans,
const blasint N, const blasint K, const double alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double beta, double *C, const blasint ldc);
void cblas_csyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans,
const blasint N, const blasint K, const float *alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float *beta, float *C, const blasint ldc);
void cblas_zsyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans,
const blasint N, const blasint K, const double *alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double *beta, double *C, const blasint ldc);
void cblas_strmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA,
enum CBLAS_DIAG Diag, blasint M, blasint N, float alpha, float *A, blasint lda, float *B, blasint ldb);
void cblas_dtrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA,
enum CBLAS_DIAG Diag, blasint M, blasint N, double alpha, double *A, blasint lda, double *B, blasint ldb);
void cblas_ctrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA,
enum CBLAS_DIAG Diag, blasint M, blasint N, float *alpha, float *A, blasint lda, float *B, blasint ldb);
void cblas_ztrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA,
enum CBLAS_DIAG Diag, blasint M, blasint N, double *alpha, double *A, blasint lda, double *B, blasint ldb);
void cblas_strmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const float alpha, const float *A, const blasint lda, float *B, const blasint ldb);
void cblas_dtrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const double alpha, const double *A, const blasint lda, double *B, const blasint ldb);
void cblas_ctrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const float *alpha, const float *A, const blasint lda, float *B, const blasint ldb);
void cblas_ztrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const double *alpha, const double *A, const blasint lda, double *B, const blasint ldb);
void cblas_strsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA,
enum CBLAS_DIAG Diag, blasint M, blasint N, float alpha, float *A, blasint lda, float *B, blasint ldb);
void cblas_dtrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA,
enum CBLAS_DIAG Diag, blasint M, blasint N, double alpha, double *A, blasint lda, double *B, blasint ldb);
void cblas_ctrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA,
enum CBLAS_DIAG Diag, blasint M, blasint N, float *alpha, float *A, blasint lda, float *B, blasint ldb);
void cblas_ztrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA,
enum CBLAS_DIAG Diag, blasint M, blasint N, double *alpha, double *A, blasint lda, double *B, blasint ldb);
void cblas_strsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const float alpha, const float *A, const blasint lda, float *B, const blasint ldb);
void cblas_dtrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const double alpha, const double *A, const blasint lda, double *B, const blasint ldb);
void cblas_ctrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const float *alpha, const float *A, const blasint lda, float *B, const blasint ldb);
void cblas_ztrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA,
const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const double *alpha, const double *A, const blasint lda, double *B, const blasint ldb);
void cblas_chemm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N,
float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc);
void cblas_zhemm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N,
double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc);
void cblas_chemm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const blasint M, const blasint N,
const float *alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float *beta, float *C, const blasint ldc);
void cblas_zhemm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const blasint M, const blasint N,
const double *alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double *beta, double *C, const blasint ldc);
void cblas_cherk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K,
float alpha, float *A, blasint lda, float beta, float *C, blasint ldc);
void cblas_zherk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K,
double alpha, double *A, blasint lda, double beta, double *C, blasint ldc);
void cblas_cherk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const blasint N, const blasint K,
const float alpha, const float *A, const blasint lda, const float beta, float *C, const blasint ldc);
void cblas_zherk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const blasint N, const blasint K,
const double alpha, const double *A, const blasint lda, const double beta, double *C, const blasint ldc);
void cblas_cher2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K,
float *alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc);
void cblas_zher2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K,
double *alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc);
void cblas_cher2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const blasint N, const blasint K,
const float *alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float beta, float *C, const blasint ldc);
void cblas_zher2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const blasint N, const blasint K,
const double *alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double beta, double *C, const blasint ldc);
void cblas_xerbla(blasint p, char *rout, char *form, ...);
#ifdef __cplusplus
}
#endif /* __cplusplus */
#endif

View File

@@ -314,6 +314,23 @@ typedef int blasint;
#define YIELDING sched_yield()
#endif
/***
To alloc job_t on heap or statck.
please https://github.com/xianyi/OpenBLAS/issues/246
***/
#if defined(OS_WINDOWS)
#define GETRF_MEM_ALLOC_THRESHOLD 32
#define BLAS3_MEM_ALLOC_THRESHOLD 32
#endif
#ifndef GETRF_MEM_ALLOC_THRESHOLD
#define GETRF_MEM_ALLOC_THRESHOLD 80
#endif
#ifndef BLAS3_MEM_ALLOC_THRESHOLD
#define BLAS3_MEM_ALLOC_THRESHOLD 160
#endif
#ifdef QUAD_PRECISION
#include "common_quad.h"
#endif
@@ -351,7 +368,12 @@ typedef int blasint;
#endif
#define MMAP_ACCESS (PROT_READ | PROT_WRITE)
#ifdef __NetBSD__
#define MMAP_POLICY (MAP_PRIVATE | MAP_ANON)
#else
#define MMAP_POLICY (MAP_PRIVATE | MAP_ANONYMOUS)
#endif
#include "param.h"
#include "common_param.h"
@@ -385,14 +407,17 @@ typedef int blasint;
/* C99 supports complex floating numbers natively, which GCC also offers as an
extension since version 3.0. If neither are available, use a compatible
structure as fallback (see Clause 6.2.5.13 of the C99 standard). */
#if defined(__STDC_IEC_559_COMPLEX__) || __STDC_VERSION__ >= 199901L || __GNUC__ >= 3
#if (defined(__STDC_IEC_559_COMPLEX__) || __STDC_VERSION__ >= 199901L || \
(__GNUC__ >= 3 && !defined(__cplusplus)))
#define OPENBLAS_COMPLEX_C99
typedef float _Complex openblas_complex_float;
typedef double _Complex openblas_complex_double;
typedef xdouble _Complex openblas_complex_xdouble;
#else
#define OPENBLAS_COMPLEX_STRUCT
typedef struct { float real, imag; } openblas_complex_float;
typedef struct { double real, imag; } openblas_complex_double;
typedef struct { xdouble real, imag; } openblas_complex_xdouble;
#endif
#endif // ASSEMBLER
@@ -550,7 +575,8 @@ typedef struct {
#include "common_level3.h"
#include "common_lapack.h"
#ifdef CBLAS
#include "cblas.h"
/* This header file is generated from "cblas.h" (see Makefile.prebuild). */
#include "cblas_noconst.h"
#endif
#ifndef ASSEMBLER

View File

@@ -150,9 +150,17 @@ REALNAME:
#define PROFCODE .prologue 0
#endif
#if defined(__linux__) && defined(__ELF__)
#define GNUSTACK .section .note.GNU-stack,"",%progbits
#else
#define GNUSTACK
#endif
#define EPILOGUE \
.end REALNAME; \
.ident VERSION
.ident VERSION; \
GNUSTACK
#endif
#ifdef DOUBLE

View File

@@ -379,8 +379,15 @@ REALNAME:
#define PROFCODE
#endif
#if defined(__linux__) && defined(__ELF__)
#define GNUSTACK .section .note.GNU-stack,"",%progbits
#else
#define GNUSTACK
#endif
#define EPILOGUE \
.endp REALNAME
.endp REALNAME ; \
GNUSTACK
#define START_ADDRESS 0x20000fc800000000UL

View File

@@ -45,7 +45,7 @@ extern "C" {
int BLASFUNC(xerbla)(char *, blasint *info, blasint);
void BLASFUNC(openblas_set_num_threads)(int *);
void openblas_set_num_threads_(int *);
FLOATRET BLASFUNC(sdot) (blasint *, float *, blasint *, float *, blasint *);
FLOATRET BLASFUNC(sdsdot)(blasint *, float *, float *, blasint *, float *, blasint *);
@@ -76,19 +76,19 @@ myxcomplex_t BLASFUNC(xdotu) (blasint *, xdouble *, blasint *, xdouble *,
myxcomplex_t BLASFUNC(xdotc) (blasint *, xdouble *, blasint *, xdouble *, blasint *);
#elif defined RETURN_BY_STACK
void BLASFUNC(cdotu) (float _Complex *, blasint *, float * , blasint *, float *, blasint *);
void BLASFUNC(cdotc) (float _Complex *, blasint *, float *, blasint *, float *, blasint *);
void BLASFUNC(zdotu) (double _Complex *, blasint *, double *, blasint *, double *, blasint *);
void BLASFUNC(zdotc) (double _Complex *, blasint *, double *, blasint *, double *, blasint *);
void BLASFUNC(xdotu) (xdouble _Complex *, blasint *, xdouble *, blasint *, xdouble *, blasint *);
void BLASFUNC(xdotc) (xdouble _Complex *, blasint *, xdouble *, blasint *, xdouble *, blasint *);
void BLASFUNC(cdotu) (openblas_complex_float *, blasint *, float * , blasint *, float *, blasint *);
void BLASFUNC(cdotc) (openblas_complex_float *, blasint *, float *, blasint *, float *, blasint *);
void BLASFUNC(zdotu) (openblas_complex_double *, blasint *, double *, blasint *, double *, blasint *);
void BLASFUNC(zdotc) (openblas_complex_double *, blasint *, double *, blasint *, double *, blasint *);
void BLASFUNC(xdotu) (openblas_complex_xdouble *, blasint *, xdouble *, blasint *, xdouble *, blasint *);
void BLASFUNC(xdotc) (openblas_complex_xdouble *, blasint *, xdouble *, blasint *, xdouble *, blasint *);
#else
float _Complex BLASFUNC(cdotu) (blasint *, float *, blasint *, float *, blasint *);
float _Complex BLASFUNC(cdotc) (blasint *, float *, blasint *, float *, blasint *);
double _Complex BLASFUNC(zdotu) (blasint *, double *, blasint *, double *, blasint *);
double _Complex BLASFUNC(zdotc) (blasint *, double *, blasint *, double *, blasint *);
xdouble _Complex BLASFUNC(xdotu) (blasint *, xdouble *, blasint *, xdouble *, blasint *);
xdouble _Complex BLASFUNC(xdotc) (blasint *, xdouble *, blasint *, xdouble *, blasint *);
openblas_complex_float BLASFUNC(cdotu) (blasint *, float *, blasint *, float *, blasint *);
openblas_complex_float BLASFUNC(cdotc) (blasint *, float *, blasint *, float *, blasint *);
openblas_complex_double BLASFUNC(zdotu) (blasint *, double *, blasint *, double *, blasint *);
openblas_complex_double BLASFUNC(zdotc) (blasint *, double *, blasint *, double *, blasint *);
openblas_complex_xdouble BLASFUNC(xdotu) (blasint *, xdouble *, blasint *, xdouble *, blasint *);
openblas_complex_xdouble BLASFUNC(xdotc) (blasint *, xdouble *, blasint *, xdouble *, blasint *);
#endif
void BLASFUNC(saxpy) (blasint *, float *, float *, blasint *, float *, blasint *);
@@ -642,6 +642,8 @@ int BLASFUNC(zgemc)(char *, char *, blasint *, blasint *, blasint *, double *,
int BLASFUNC(xgemc)(char *, char *, blasint *, blasint *, blasint *, xdouble *,
xdouble *, blasint *, xdouble *, blasint *, xdouble *, blasint *, xdouble *, xdouble *, blasint *);
/* Lapack routines */
int BLASFUNC(sgetf2)(blasint *, blasint *, float *, blasint *, blasint *, blasint *);
int BLASFUNC(dgetf2)(blasint *, blasint *, double *, blasint *, blasint *, blasint *);
int BLASFUNC(qgetf2)(blasint *, blasint *, xdouble *, blasint *, blasint *, blasint *);
@@ -677,6 +679,13 @@ int BLASFUNC(cgesv)(blasint *, blasint *, float *, blasint *, blasint *, float
int BLASFUNC(zgesv)(blasint *, blasint *, double *, blasint *, blasint *, double*, blasint *, blasint *);
int BLASFUNC(xgesv)(blasint *, blasint *, xdouble *, blasint *, blasint *, xdouble*, blasint *, blasint *);
int BLASFUNC(sgesvd)(char *, char *, blasint *, blasint *, float *, blasint *, float *, float *, blasint *, float *, blasint *, float *, blasint *, blasint *);
int BLASFUNC(dgesvd)(char *, char *, blasint *, blasint *, double *, blasint *, double *, double *, blasint *, double *, blasint *, double *, blasint *, blasint *);
int BLASFUNC(qgesvd)(char *, char *, blasint *, blasint *, xdouble *, blasint *, xdouble *, xdouble *, blasint *, xdouble *, blasint *, xdouble *, blasint *, blasint *);
int BLASFUNC(cgesvd)(char *, char *, blasint *, blasint *, float *, blasint *, float *, float *, blasint *, float *, blasint *, float *, blasint *, blasint *);
int BLASFUNC(zgesvd)(char *, char *, blasint *, blasint *, double *, blasint *, double *, double *, blasint *, double *, blasint *, double *, blasint *, blasint *);
int BLASFUNC(xgesvd)(char *, char *, blasint *, blasint *, xdouble *, blasint *, xdouble *, xdouble *, blasint *, xdouble *, blasint *, xdouble *, blasint *, blasint *);
int BLASFUNC(spotf2)(char *, blasint *, float *, blasint *, blasint *);
int BLASFUNC(dpotf2)(char *, blasint *, double *, blasint *, blasint *);
int BLASFUNC(qpotf2)(char *, blasint *, xdouble *, blasint *, blasint *);
@@ -691,6 +700,13 @@ int BLASFUNC(cpotrf)(char *, blasint *, float *, blasint *, blasint *);
int BLASFUNC(zpotrf)(char *, blasint *, double *, blasint *, blasint *);
int BLASFUNC(xpotrf)(char *, blasint *, xdouble *, blasint *, blasint *);
int BLASFUNC(spotrs)(char *, blasint *, blasint *, float *, blasint *, float *, blasint *, blasint *);
int BLASFUNC(dpotrs)(char *, blasint *, blasint *, double *, blasint *, double *, blasint *, blasint *);
int BLASFUNC(qpotrs)(char *, blasint *, blasint *, xdouble *, blasint *, xdouble *, blasint *, blasint *);
int BLASFUNC(cpotrs)(char *, blasint *, blasint *, float *, blasint *, float *, blasint *, blasint *);
int BLASFUNC(zpotrs)(char *, blasint *, blasint *, double *, blasint *, double *, blasint *, blasint *);
int BLASFUNC(xpotrs)(char *, blasint *, blasint *, xdouble *, blasint *, xdouble *, blasint *, blasint *);
int BLASFUNC(slauu2)(char *, blasint *, float *, blasint *, blasint *);
int BLASFUNC(dlauu2)(char *, blasint *, double *, blasint *, blasint *);
int BLASFUNC(qlauu2)(char *, blasint *, xdouble *, blasint *, blasint *);

View File

@@ -65,9 +65,16 @@ extern long int syscall (long int __sysno, ...);
#endif
#endif
static inline int my_mbind(void *addr, unsigned long len, int mode,
unsigned long *nodemask, unsigned long maxnode,
unsigned flags) {
#if defined (__LSB_VERSION__)
// So far, LSB (Linux Standard Base) don't support syscall().
// https://lsbbugs.linuxfoundation.org/show_bug.cgi?id=3482
return 0;
#else
#if defined (LOONGSON3B)
#if defined (__64BIT__)
return syscall(SYS_mbind, addr, len, mode, nodemask, maxnode, flags);
@@ -79,11 +86,17 @@ static inline int my_mbind(void *addr, unsigned long len, int mode,
// unsigned long null_nodemask=0;
return syscall(SYS_mbind, addr, len, mode, nodemask, maxnode, flags);
#endif
#endif
}
static inline int my_set_mempolicy(int mode, const unsigned long *addr, unsigned long flag) {
#if defined (__LSB_VERSION__)
// So far, LSB (Linux Standard Base) don't support syscall().
// https://lsbbugs.linuxfoundation.org/show_bug.cgi?id=3482
return 0;
#else
return syscall(SYS_set_mempolicy, mode, addr, flag);
#endif
}
static inline int my_gettid(void) {

View File

@@ -235,10 +235,17 @@ REALNAME: ;\
.set noreorder ;\
.set nomacro
#if defined(__linux__) && defined(__ELF__)
#define GNUSTACK .section .note.GNU-stack,"",%progbits
#else
#define GNUSTACK
#endif
#define EPILOGUE \
.set macro ;\
.set reorder ;\
.end REALNAME
.end REALNAME ;\
GNUSTACK
#define PROFCODE
#endif
@@ -255,8 +262,8 @@ REALNAME: ;\
#endif
#if defined(LOONGSON3B)
#define PAGESIZE (32UL << 10)
#define FIXED_PAGESIZE (32UL << 10)
#define PAGESIZE (16UL << 10)
#define FIXED_PAGESIZE (16UL << 10)
#endif
#ifndef PAGESIZE

View File

@@ -199,8 +199,17 @@ static __inline int blas_quickdivide(blasint x, blasint y){
.type REALNAME, #function; \
.proc 07; \
REALNAME:;
#if defined(__linux__) && defined(__ELF__)
#define GNUSTACK .section .note.GNU-stack,"",%progbits
#else
#define GNUSTACK
#endif
#define EPILOGUE \
.size REALNAME, .-REALNAME
.size REALNAME, .-REALNAME; \
GNUSTACK
#endif
#endif

View File

@@ -171,6 +171,11 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
#define MMXSTORE movd
#endif
#if defined(PILEDRIVER) || defined(BULLDOZER)
//Enable some optimazation for barcelona.
#define BARCELONA_OPTIMIZATION
#endif
#if defined(HAVE_3DNOW)
#define EMMS femms
#elif defined(HAVE_MMX)
@@ -296,7 +301,9 @@ REALNAME:
#define PROFCODE
#endif
#define EPILOGUE .size REALNAME, .-REALNAME
#define EPILOGUE \
.size REALNAME, .-REALNAME; \
.section .note.GNU-stack,"",%progbits
#endif
@@ -335,6 +342,7 @@ REALNAME:
#define ALIGN_2 .align 2
#define ALIGN_3 .align 3
#define ALIGN_4 .align 4
#define ALIGN_5 .align 5
#define ffreep fstp
#endif
@@ -356,11 +364,10 @@ REALNAME:
#ifndef ALIGN_6
#define ALIGN_6 .align 64
#endif
// ffreep %st(0).
// Because Clang didn't support ffreep, we directly use the opcode.
// Please check out http://www.sandpile.org/x86/opc_fpu.htm
#ifndef ffreep
#define ffreep .byte 0xdf, 0xc0 #
#endif
#endif

View File

@@ -218,6 +218,11 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
#ifdef ASSEMBLER
#if defined(PILEDRIVER) || defined(BULLDOZER)
//Enable some optimazation for barcelona.
#define BARCELONA_OPTIMIZATION
#endif
#if defined(HAVE_3DNOW)
#define EMMS femms
#elif defined(HAVE_MMX)
@@ -367,7 +372,10 @@ REALNAME:
#define PROFCODE
#endif
#define EPILOGUE .size REALNAME, .-REALNAME
#define EPILOGUE \
.size REALNAME, .-REALNAME; \
.section .note.GNU-stack,"",%progbits
#endif

14
cpuid.h
View File

@@ -105,7 +105,9 @@
#define CORE_NANO 19
#define CORE_SANDYBRIDGE 20
#define CORE_BOBCAT 21
#define CORE_BULLDOZER 22
#define CORE_BULLDOZER CORE_BARCELONA
#define CORE_PILEDRIVER CORE_BARCELONA
#define CORE_HASWELL CORE_SANDYBRIDGE
#define HAVE_SSE (1 << 0)
#define HAVE_SSE2 (1 << 1)
@@ -125,7 +127,9 @@
#define HAVE_MISALIGNSSE (1 << 15)
#define HAVE_128BITFPU (1 << 16)
#define HAVE_FASTMOVU (1 << 17)
#define HAVE_AVX (1 << 18)
#define HAVE_AVX (1 << 18)
#define HAVE_FMA4 (1 << 19)
#define HAVE_FMA3 (1 << 20)
#define CACHE_INFO_L1_I 1
#define CACHE_INFO_L1_D 2
@@ -194,5 +198,9 @@ typedef struct {
#define CPUTYPE_NANO 43
#define CPUTYPE_SANDYBRIDGE 44
#define CPUTYPE_BOBCAT 45
#define CPUTYPE_BULLDOZER 46
#define CPUTYPE_BULLDOZER CPUTYPE_BARCELONA
#define CPUTYPE_PILEDRIVER CPUTYPE_BARCELONA
// this define is because BLAS doesn't have haswell specific optimizations yet
#define CPUTYPE_HASWELL CPUTYPE_SANDYBRIDGE
#endif

View File

@@ -1,5 +1,5 @@
/*****************************************************************************
Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS
Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -101,12 +101,14 @@ int detect(void){
fclose(infile);
if(p != NULL){
if (strstr(p, "Loongson-3A")){
return CPU_LOONGSON3A;
}else if(strstr(p, "Loongson-3B")){
return CPU_LOONGSON3B;
}else if (strstr(p, "Loongson-3")){
infile = fopen("/proc/cpuinfo", "r");
p = (char *)NULL;
while (fgets(buffer, sizeof(buffer), infile)){
if (!strncmp("system type", buffer, 11)){
p = strchr(buffer, ':') + 2;
@@ -119,6 +121,24 @@ int detect(void){
}else{
return CPU_SICORTEX;
}
}
//Check model name for Loongson3
infile = fopen("/proc/cpuinfo", "r");
p = (char *)NULL;
while (fgets(buffer, sizeof(buffer), infile)){
if (!strncmp("model name", buffer, 10)){
p = strchr(buffer, ':') + 2;
break;
}
}
fclose(infile);
if(p != NULL){
if (strstr(p, "Loongson-3A")){
return CPU_LOONGSON3A;
}else if(strstr(p, "Loongson-3B")){
return CPU_LOONGSON3B;
}
}
#endif
return CPU_UNKNOWN;
}

View File

@@ -114,6 +114,7 @@ int detect(void){
if (!strncasecmp(p, "PPC970", 6)) return CPUTYPE_PPC970;
if (!strncasecmp(p, "POWER5", 6)) return CPUTYPE_POWER5;
if (!strncasecmp(p, "POWER6", 6)) return CPUTYPE_POWER6;
if (!strncasecmp(p, "POWER7", 6)) return CPUTYPE_POWER6;
if (!strncasecmp(p, "Cell", 4)) return CPUTYPE_CELL;
if (!strncasecmp(p, "7447", 4)) return CPUTYPE_PPCG4;

View File

@@ -40,6 +40,17 @@
#include <string.h>
#include "cpuid.h"
#ifdef NO_AVX
#define CPUTYPE_HASWELL CPUTYPE_NEHALEM
#define CORE_HASWELL CORE_NEHALEM
#define CPUTYPE_SANDYBRIDGE CPUTYPE_NEHALEM
#define CORE_SANDYBRIDGE CORE_NEHALEM
#define CPUTYPE_BULLDOZER CPUTYPE_BARCELONA
#define CORE_BULLDOZER CORE_BARCELONA
#define CPUTYPE_PILEDRIVER CPUTYPE_BARCELONA
#define CORE_PILEDRIVER CORE_BARCELONA
#endif
#ifndef CPUIDEMU
#if defined(__APPLE__) && defined(__i386__)
@@ -109,6 +120,33 @@ static inline int have_excpuid(void){
return eax & 0xffff;
}
#ifndef NO_AVX
static inline void xgetbv(int op, int * eax, int * edx){
//Use binary code for xgetbv
__asm__ __volatile__
(".byte 0x0f, 0x01, 0xd0": "=a" (*eax), "=d" (*edx) : "c" (op) : "cc");
}
#endif
int support_avx(){
#ifndef NO_AVX
int eax, ebx, ecx, edx;
int ret=0;
cpuid(1, &eax, &ebx, &ecx, &edx);
if ((ecx & (1 << 28)) != 0 && (ecx & (1 << 27)) != 0 && (ecx & (1 << 26)) != 0){
xgetbv(0, &eax, &edx);
if((eax & 6) == 6){
ret=1; //OS support AVX
}
}
return ret;
#else
return 0;
#endif
}
int get_vendor(void){
int eax, ebx, ecx, edx;
char vendor[13];
@@ -189,12 +227,18 @@ int get_cputype(int gettype){
if ((ecx & (1 << 9)) != 0) feature |= HAVE_SSSE3;
if ((ecx & (1 << 19)) != 0) feature |= HAVE_SSE4_1;
if ((ecx & (1 << 20)) != 0) feature |= HAVE_SSE4_2;
if ((ecx & (1 << 28)) != 0) feature |= HAVE_AVX;
#ifndef NO_AVX
if (support_avx()) feature |= HAVE_AVX;
if ((ecx & (1 << 12)) != 0) feature |= HAVE_FMA3;
#endif
if (have_excpuid() >= 0x01) {
cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
if ((ecx & (1 << 6)) != 0) feature |= HAVE_SSE4A;
if ((ecx & (1 << 7)) != 0) feature |= HAVE_MISALIGNSSE;
#ifndef NO_AVX
if ((ecx & (1 << 16)) != 0) feature |= HAVE_FMA4;
#endif
if ((edx & (1 << 30)) != 0) feature |= HAVE_3DNOWEX;
if ((edx & (1 << 31)) != 0) feature |= HAVE_3DNOW;
}
@@ -984,13 +1028,21 @@ int get_cpuname(void){
return CPUTYPE_NEHALEM;
case 10:
//Intel Core i5-2000 /i7-2000 (Sandy Bridge)
return CPUTYPE_SANDYBRIDGE;
if(support_avx())
return CPUTYPE_SANDYBRIDGE;
else
return CPUTYPE_NEHALEM; //OS doesn't support AVX
case 12:
//Xeon Processor 5600 (Westmere-EP)
return CPUTYPE_NEHALEM;
case 13:
//Intel Core i7-3000 / Xeon E5 (Sandy Bridge)
return CPUTYPE_SANDYBRIDGE;
if(support_avx())
return CPUTYPE_SANDYBRIDGE;
else
return CPUTYPE_NEHALEM;
case 14:
// Xeon E7540
case 15:
//Xeon Processor E7 (Westmere-EX)
return CPUTYPE_NEHALEM;
@@ -999,9 +1051,26 @@ int get_cpuname(void){
case 3:
switch (model) {
case 10:
return CPUTYPE_SANDYBRIDGE;
if(support_avx())
return CPUTYPE_SANDYBRIDGE;
else
return CPUTYPE_NEHALEM;
case 12:
if(support_avx())
return CPUTYPE_HASWELL;
else
return CPUTYPE_NEHALEM;
}
break;
case 4:
switch (model) {
case 5:
if(support_avx())
return CPUTYPE_HASWELL;
else
return CPUTYPE_NEHALEM;
}
break;
}
break;
case 0x7:
@@ -1033,8 +1102,22 @@ int get_cpuname(void){
return CPUTYPE_OPTERON;
case 1:
case 10:
case 6: //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series
return CPUTYPE_BARCELONA;
case 6:
switch (model) {
case 1:
//AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series
if(support_avx())
return CPUTYPE_BULLDOZER;
else
return CPUTYPE_BARCELONA; //OS don't support AVX.
case 2:
if(support_avx())
return CPUTYPE_PILEDRIVER;
else
return CPUTYPE_BARCELONA; //OS don't support AVX.
}
break;
case 5:
return CPUTYPE_BOBCAT;
}
@@ -1159,6 +1242,7 @@ static char *cpuname[] = {
"SANDYBRIDGE",
"BOBCAT",
"BULLDOZER",
"PILEDRIVER",
};
static char *lowercpuname[] = {
@@ -1208,6 +1292,7 @@ static char *lowercpuname[] = {
"sandybridge",
"bobcat",
"bulldozer",
"piledriver",
};
static char *corename[] = {
@@ -1234,6 +1319,7 @@ static char *corename[] = {
"SANDYBRIDGE",
"BOBCAT",
"BULLDOZER",
"PILEDRIVER",
};
static char *corename_lower[] = {
@@ -1260,6 +1346,7 @@ static char *corename_lower[] = {
"sandybridge",
"bobcat",
"bulldozer",
"piledriver",
};
@@ -1343,13 +1430,21 @@ int get_coretype(void){
return CORE_NEHALEM;
case 10:
//Intel Core i5-2000 /i7-2000 (Sandy Bridge)
return CORE_SANDYBRIDGE;
if(support_avx())
return CORE_SANDYBRIDGE;
else
return CORE_NEHALEM; //OS doesn't support AVX
case 12:
//Xeon Processor 5600 (Westmere-EP)
return CORE_NEHALEM;
case 13:
//Intel Core i7-3000 / Xeon E5 (Sandy Bridge)
return CORE_SANDYBRIDGE;
if(support_avx())
return CORE_SANDYBRIDGE;
else
return CORE_NEHALEM; //OS doesn't support AVX
case 14:
//Xeon E7540
case 15:
//Xeon Processor E7 (Westmere-EX)
return CORE_NEHALEM;
@@ -1358,9 +1453,26 @@ int get_coretype(void){
case 3:
switch (model) {
case 10:
return CORE_SANDYBRIDGE;
if(support_avx())
return CORE_SANDYBRIDGE;
else
return CORE_NEHALEM; //OS doesn't support AVX
case 12:
if(support_avx())
return CORE_HASWELL;
else
return CORE_NEHALEM;
}
break;
case 4:
switch (model) {
case 5:
if(support_avx())
return CORE_HASWELL;
else
return CORE_NEHALEM;
}
break;
}
break;
@@ -1376,8 +1488,21 @@ int get_coretype(void){
if (family == 0xf){
if ((exfamily == 0) || (exfamily == 2)) return CORE_OPTERON;
else if (exfamily == 5) return CORE_BOBCAT;
else if (exfamily == 6) return CORE_BARCELONA; //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series
else return CORE_BARCELONA;
else if (exfamily == 6) {
switch (model) {
case 1:
//AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series
if(support_avx())
return CORE_BULLDOZER;
else
return CORE_BARCELONA; //OS don't support AVX.
case 2:
if(support_avx())
return CORE_PILEDRIVER;
else
return CORE_BARCELONA; //OS don't support AVX.
}
}else return CORE_BARCELONA;
}
}
@@ -1443,6 +1568,9 @@ void get_cpuconfig(void){
printf("#define DTB_SIZE %d\n", info.size * 1024);
printf("#define DTB_ASSOCIATIVE %d\n", info.associative);
printf("#define DTB_DEFAULT_ENTRIES %d\n", info.linesize);
} else {
//fall back for some virtual machines.
printf("#define DTB_DEFAULT_ENTRIES 32\n");
}
features = get_cputype(GET_FEATURE);
@@ -1460,6 +1588,8 @@ void get_cpuconfig(void){
if (features & HAVE_AVX ) printf("#define HAVE_AVX\n");
if (features & HAVE_3DNOWEX) printf("#define HAVE_3DNOWEX\n");
if (features & HAVE_3DNOW) printf("#define HAVE_3DNOW\n");
if (features & HAVE_FMA4 ) printf("#define HAVE_FMA4\n");
if (features & HAVE_FMA3 ) printf("#define HAVE_FMA3\n");
if (features & HAVE_CFLUSH) printf("#define HAVE_CFLUSH\n");
if (features & HAVE_HIT) printf("#define HAVE_HIT 1\n");
if (features & HAVE_MISALIGNSSE) printf("#define HAVE_MISALIGNSSE\n");
@@ -1526,5 +1656,7 @@ void get_sse(void){
if (features & HAVE_AVX ) printf("HAVE_AVX=1\n");
if (features & HAVE_3DNOWEX) printf("HAVE_3DNOWEX=1\n");
if (features & HAVE_3DNOW) printf("HAVE_3DNOW=1\n");
if (features & HAVE_FMA4 ) printf("HAVE_FMA4=1\n");
if (features & HAVE_FMA3 ) printf("HAVE_FMA3=1\n");
}

14
ctest.c
View File

@@ -1,3 +1,17 @@
//LSB (Linux Standard Base) compiler
//only support lsbc++
#if defined (__LSB_VERSION__)
#if !defined (__cplusplus)
COMPILER_LSB
#else
#error "OpenBLAS only supports lsbcc."
#endif
#endif
#if defined(__clang__)
COMPILER_CLANG
#endif
#if defined(__PGI) || defined(__PGIC__)
COMPILER_PGI
#endif

View File

@@ -5,7 +5,7 @@
TOPDIR = ..
include $(TOPDIR)/Makefile.system
CFLAGS += -DADD$(BU) -DCBLAS
override CFLAGS += -DADD$(BU) -DCBLAS
LIB = $(TOPDIR)/$(LIBNAME)
@@ -77,7 +77,7 @@ endif
clean ::
rm -f x*
FLDFLAGS = $(FFLAGS:-fPIC=)
FLDFLAGS = $(FFLAGS:-fPIC=) $(LDFLAGS)
CEXTRALIB =
# Single real

View File

@@ -65,7 +65,6 @@ static int sbmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F
a = (FLOAT *)args -> a;
x = (FLOAT *)args -> b;
y = (FLOAT *)args -> c;
lda = args -> lda;
incx = args -> ldb;
@@ -76,6 +75,10 @@ static int sbmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F
n_from = 0;
n_to = n;
//Use y as each thread's n* COMPSIZE elements in sb buffer
y = buffer;
buffer += ((COMPSIZE * n + 1023) & ~1023);
if (range_m) {
n_from = *(range_m + 0);
n_to = *(range_m + 1);
@@ -83,7 +86,6 @@ static int sbmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F
a += n_from * lda * COMPSIZE;
}
if (range_n) y += *range_n * COMPSIZE;
if (incx != 1) {
COPY_K(n, x, incx, buffer, 1);
@@ -331,7 +333,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x
if (num_cpu) {
queue[0].sa = NULL;
queue[0].sb = buffer + num_cpu * (((n + 255) & ~255) + 16) * COMPSIZE;
queue[0].sb = buffer;
queue[num_cpu - 1].next = NULL;
exec_blas(num_cpu, queue);
@@ -344,7 +346,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x
#else
ONE, ZERO,
#endif
buffer + range_n[i] * COMPSIZE, 1, buffer, 1, NULL, 0);
(FLOAT*)(queue[i].sb), 1, buffer, 1, NULL, 0);
}
AXPYU_K(n, 0, 0,

View File

@@ -71,7 +71,7 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (
queue[num_cpu].args = arg;
queue[num_cpu].range_m = range_m;
queue[num_cpu].range_n = &range[num_cpu];
#if defined(LOONGSON3A)
#if 0 //defined(LOONGSON3A)
queue[num_cpu].sa = sa + GEMM_OFFSET_A1 * num_cpu;
queue[num_cpu].sb = queue[num_cpu].sa + GEMM_OFFSET_A1 * 5;
#else
@@ -83,7 +83,7 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (
}
if (num_cpu) {
#if defined(LOONGSON3A)
#if 0 //defined(LOONGSON3A)
queue[0].sa = sa;
queue[0].sb = sa + GEMM_OFFSET_A1 * 5;
#else

View File

@@ -332,7 +332,20 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
#else
for(jjs = js; jjs < js + min_j; jjs += min_jj){
min_jj = min_j + js - jjs;
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
#if defined(BULLDOZER) && defined(ARCH_X86_64) && !defined(XDOUBLE) && !defined(COMPLEX)
if (min_jj >= 12*GEMM_UNROLL_N) min_jj = 12*GEMM_UNROLL_N;
else
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
else
if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N;
else
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
#else
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
#endif
START_RPCC();

View File

@@ -48,6 +48,12 @@
#define SWITCH_RATIO 2
#endif
//The array of job_t may overflow the stack.
//Instead, use malloc to alloc job_t.
#if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD
#define USE_ALLOC_HEAP
#endif
#ifndef GEMM3M_LOCAL
#if defined(NN)
#define GEMM3M_LOCAL GEMM3M_NN
@@ -836,7 +842,11 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
BLASLONG range_M[MAX_CPU_NUMBER + 1];
BLASLONG range_N[MAX_CPU_NUMBER + 1];
job_t job[MAX_CPU_NUMBER];
#ifndef USE_ALLOC_HEAP
job_t job[MAX_CPU_NUMBER];
#else
job_t * job = NULL;
#endif
BLASLONG num_cpu_m, num_cpu_n;
@@ -866,6 +876,15 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
newarg.alpha = args -> alpha;
newarg.beta = args -> beta;
newarg.nthreads = args -> nthreads;
#ifdef USE_ALLOC_HEAP
job = (job_t*)malloc(MAX_CPU_NUMBER * sizeof(job_t));
if(job==NULL){
fprintf(stderr, "OpenBLAS: malloc failed in %s\n", __func__);
exit(1);
}
#endif
newarg.common = (void *)job;
if (!range_m) {
@@ -945,6 +964,10 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
exec_blas(num_cpu_m, queue);
}
#ifdef USE_ALLOC_HEAP
free(job);
#endif
return 0;
}

View File

@@ -48,6 +48,12 @@
#define SWITCH_RATIO 2
#endif
//The array of job_t may overflow the stack.
//Instead, use malloc to alloc job_t.
#if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD
#define USE_ALLOC_HEAP
#endif
#ifndef SYRK_LOCAL
#if !defined(LOWER) && !defined(TRANS)
#define SYRK_LOCAL SYRK_UN
@@ -502,7 +508,12 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
blas_arg_t newarg;
#ifndef USE_ALLOC_HEAP
job_t job[MAX_CPU_NUMBER];
#else
job_t * job = NULL;
#endif
blas_queue_t queue[MAX_CPU_NUMBER];
BLASLONG range[MAX_CPU_NUMBER + 100];
@@ -556,6 +567,15 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
newarg.ldc = args -> ldc;
newarg.alpha = args -> alpha;
newarg.beta = args -> beta;
#ifdef USE_ALLOC_HEAP
job = (job_t*)malloc(MAX_CPU_NUMBER * sizeof(job_t));
if(job==NULL){
fprintf(stderr, "OpenBLAS: malloc failed in %s\n", __func__);
exit(1);
}
#endif
newarg.common = (void *)job;
if (!range_n) {
@@ -668,6 +688,9 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
exec_blas(num_cpu, queue);
}
#ifdef USE_ALLOC_HEAP
free(job);
#endif
return 0;
}

View File

@@ -48,6 +48,12 @@
#define SWITCH_RATIO 2
#endif
//The array of job_t may overflow the stack.
//Instead, use malloc to alloc job_t.
#if MAX_CPU_NUMBER > BLAS3_MEM_ALLOC_THRESHOLD
#define USE_ALLOC_HEAP
#endif
#ifndef GEMM_LOCAL
#if defined(NN)
#define GEMM_LOCAL GEMM_NN
@@ -360,8 +366,20 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){
min_jj = MIN(n_to, xxx + div_n) - jjs;
#if defined(BULLDOZER) && defined(ARCH_X86_64) && !defined(XDOUBLE) && !defined(COMPLEX)
if (min_jj >= 12*GEMM_UNROLL_N) min_jj = 12*GEMM_UNROLL_N;
else
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
else
if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N;
else
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
#else
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
#endif
START_RPCC();
OCOPY_OPERATION(min_l, min_jj, b, ldb, ls, jjs,
@@ -519,7 +537,12 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
blas_arg_t newarg;
#ifndef USE_ALLOC_HEAP
job_t job[MAX_CPU_NUMBER];
#else
job_t * job = NULL;
#endif
blas_queue_t queue[MAX_CPU_NUMBER];
BLASLONG range_M[MAX_CPU_NUMBER + 1];
@@ -563,6 +586,15 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
newarg.alpha = args -> alpha;
newarg.beta = args -> beta;
newarg.nthreads = args -> nthreads;
#ifdef USE_ALLOC_HEAP
job = (job_t*)malloc(MAX_CPU_NUMBER * sizeof(job_t));
if(job==NULL){
fprintf(stderr, "OpenBLAS: malloc failed in %s\n", __func__);
exit(1);
}
#endif
newarg.common = (void *)job;
#ifdef PARAMTEST
@@ -634,7 +666,7 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
num_cpu_n ++;
}
for (j = 0; j < num_cpu_m; j++) {
for (i = 0; i < num_cpu_m; i++) {
for (k = 0; k < DIVIDE_RATE; k++) {
@@ -648,6 +680,10 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
exec_blas(num_cpu_m, queue);
}
#ifdef USE_ALLOC_HEAP
free(job);
#endif
return 0;
}

View File

@@ -1,7 +1,7 @@
TOPDIR = ../..
include ../../Makefile.system
COMMONOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX) openblas_set_num_threads.$(SUFFIX)
COMMONOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX) openblas_set_num_threads.$(SUFFIX) openblas_get_config.$(SUFFIX) openblas_get_parallel.$(SUFFIX)
COMMONOBJS += slamch.$(SUFFIX) slamc3.$(SUFFIX) dlamch.$(SUFFIX) dlamc3.$(SUFFIX)
@@ -14,7 +14,7 @@ endif
# COMMONOBJS += info.$(SUFFIX)
ifdef DYNAMIC_ARCH
ifeq ($(DYNAMIC_ARCH), 1)
COMMONOBJS += dynamic.$(SUFFIX)
else
COMMONOBJS += parameter.$(SUFFIX)
@@ -70,7 +70,7 @@ ifndef BLAS_SERVER
BLAS_SERVER = blas_server.c
endif
ifdef DYNAMIC_ARCH
ifeq ($(DYNAMIC_ARCH), 1)
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic.$(SUFFIX)
else
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) parameter.$(SUFFIX)
@@ -103,6 +103,12 @@ blas_server.$(SUFFIX) : $(BLAS_SERVER) ../../common.h ../../common_thread.h ../.
openblas_set_num_threads.$(SUFFIX) : openblas_set_num_threads.c
$(CC) $(CFLAGS) -c $< -o $(@F)
openblas_get_config.$(SUFFIX) : openblas_get_config.c
$(CC) $(CFLAGS) -c $< -o $(@F)
openblas_get_parallel.$(SUFFIX) : openblas_get_parallel.c
$(CC) $(CFLAGS) -c $< -o $(@F)
blasL1thread.$(SUFFIX) : blas_l1_thread.c ../../common.h ../../common_thread.h
$(CC) $(CFLAGS) -c $< -o $(@F)
@@ -215,7 +221,7 @@ info.$(SUFFIX) : info.c info.h ../../common.h ../../param.h
$(CC) $(CFLAGS) -c $< -o $(@F)
hpl : CFLAGS += -DHPL
hpl_p : CFLAGS += -DHPL
hpl : override CFLAGS += -DHPL
hpl_p : override CFLAGS += -DHPL
include $(TOPDIR)/Makefile.tail

View File

@@ -385,6 +385,7 @@ static int blas_thread_server(void *arg){
+ GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
}
}
queue->sb=sb;
}
#ifdef MONITOR
@@ -435,7 +436,7 @@ static int blas_thread_server(void *arg){
blas_memory_free(buffer);
pthread_exit(NULL);
//pthread_exit(NULL);
return 0;
}
@@ -770,6 +771,19 @@ void goto_set_num_threads(int num_threads) {
if (num_threads < 1) num_threads = blas_num_threads;
#ifndef NO_AFFINITY
if (num_threads == 1) {
if (blas_cpu_number == 1){
//OpenBLAS is already single thread.
return;
}else{
//From multi-threads to single thread
//Restore the original affinity mask
gotoblas_set_affinity(-1);
}
}
#endif
if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER;
if (num_threads > blas_num_threads) {
@@ -800,6 +814,13 @@ void goto_set_num_threads(int num_threads) {
UNLOCK_COMMAND(&server_lock);
}
#ifndef NO_AFFINITY
if(blas_cpu_number == 1 && num_threads > 1){
//Restore the thread 0 affinity.
gotoblas_set_affinity(0);
}
#endif
blas_cpu_number = num_threads;
#if defined(ARCH_MIPS64)

View File

@@ -49,8 +49,12 @@
int blas_server_avail = 0;
static void * blas_thread_buffer[MAX_CPU_NUMBER];
void goto_set_num_threads(int num_threads) {
int i=0;
if (num_threads < 1) num_threads = blas_num_threads;
if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER;
@@ -62,7 +66,19 @@ void goto_set_num_threads(int num_threads) {
blas_cpu_number = num_threads;
omp_set_num_threads(blas_cpu_number);
//adjust buffer for each thread
for(i=0; i<blas_cpu_number; i++){
if(blas_thread_buffer[i]==NULL){
blas_thread_buffer[i]=blas_memory_alloc(2);
}
}
for(; i<MAX_CPU_NUMBER; i++){
if(blas_thread_buffer[i]!=NULL){
blas_memory_free(blas_thread_buffer[i]);
blas_thread_buffer[i]=NULL;
}
}
#if defined(ARCH_MIPS64)
//set parameters for different number of threads.
blas_set_parameter();
@@ -76,17 +92,33 @@ void openblas_set_num_threads(int num_threads) {
int blas_thread_init(void){
int i=0;
blas_get_cpu_number();
blas_server_avail = 1;
for(i=0; i<blas_num_threads; i++){
blas_thread_buffer[i]=blas_memory_alloc(2);
}
for(; i<MAX_CPU_NUMBER; i++){
blas_thread_buffer[i]=NULL;
}
return 0;
}
int BLASFUNC(blas_thread_shutdown)(void){
int i=0;
blas_server_avail = 0;
for(i=0; i<MAX_CPU_NUMBER; i++){
if(blas_thread_buffer[i]!=NULL){
blas_memory_free(blas_thread_buffer[i]);
blas_thread_buffer[i]=NULL;
}
}
return 0;
}
@@ -177,7 +209,8 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
static void exec_threads(blas_queue_t *queue){
void *buffer, *sa, *sb;
int pos=0, release_flag=0;
buffer = NULL;
sa = queue -> sa;
sb = queue -> sb;
@@ -189,9 +222,19 @@ static void exec_threads(blas_queue_t *queue){
if ((sa == NULL) && (sb == NULL) && ((queue -> mode & BLAS_PTHREAD) == 0)) {
buffer = blas_memory_alloc(2);
pos = omp_get_thread_num();
buffer = blas_thread_buffer[pos];
if (sa == NULL) sa = (void *)((BLASLONG)buffer + GEMM_OFFSET_A);
//fallback
if(buffer==NULL) {
buffer = blas_memory_alloc(2);
release_flag=1;
}
if (sa == NULL) {
sa = (void *)((BLASLONG)buffer + GEMM_OFFSET_A);
queue->sa=sa;
}
if (sb == NULL) {
if (!(queue -> mode & BLAS_COMPLEX)){
@@ -224,6 +267,7 @@ static void exec_threads(blas_queue_t *queue){
+ GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
}
}
queue->sb=sb;
}
}
@@ -241,7 +285,7 @@ static void exec_threads(blas_queue_t *queue){
}
if (buffer != NULL) blas_memory_free(buffer);
if (release_flag) blas_memory_free(buffer);
}

View File

@@ -63,13 +63,7 @@ static blas_pool_t pool;
static HANDLE blas_threads [MAX_CPU_NUMBER];
static DWORD blas_threads_id[MAX_CPU_NUMBER];
void goto_set_num_threads(int num)
{
}
void openblas_set_num_threads(int num)
{
}
static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
@@ -187,7 +181,7 @@ static DWORD WINAPI blas_thread_server(void *arg){
do {
action = WaitForMultipleObjects(2, handles, FALSE, INFINITE);
} while ((action != WAIT_OBJECT_0) && (action == WAIT_OBJECT_0 + 1));
} while ((action != WAIT_OBJECT_0) && (action != WAIT_OBJECT_0 + 1));
if (action == WAIT_OBJECT_0 + 1) break;
@@ -259,6 +253,7 @@ static DWORD WINAPI blas_thread_server(void *arg){
+ GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
}
}
queue->sb=sb;
}
#ifdef MONITOR
@@ -271,7 +266,9 @@ static DWORD WINAPI blas_thread_server(void *arg){
} else {
legacy_exec(routine, queue -> mode, queue -> args, sb);
}
}
}else{
continue; //if queue == NULL
}
#ifdef SMP_DEBUG
fprintf(STDERR, "Server[%2ld] Finished!\n", cpu);
@@ -433,7 +430,7 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){
/* Shutdown procedure, but user don't have to call this routine. The */
/* kernel automatically kill threads. */
int blas_thread_shutdown_(void){
int BLASFUNC(blas_thread_shutdown)(void){
int i;
@@ -445,7 +442,7 @@ int blas_thread_shutdown_(void){
SetEvent(pool.killed);
for(i = 0; i < blas_cpu_number - 1; i++){
for(i = 0; i < blas_num_threads - 1; i++){
WaitForSingleObject(blas_threads[i], INFINITE);
}
@@ -456,3 +453,47 @@ int blas_thread_shutdown_(void){
return 0;
}
void goto_set_num_threads(int num_threads)
{
long i;
if (num_threads < 1) num_threads = blas_cpu_number;
if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER;
if (num_threads > blas_num_threads) {
LOCK_COMMAND(&server_lock);
//increased_threads = 1;
if (!blas_server_avail){
InitializeCriticalSection(&pool.lock);
pool.filled = CreateEvent(NULL, FALSE, FALSE, NULL);
pool.killed = CreateEvent(NULL, TRUE, FALSE, NULL);
pool.shutdown = 0;
pool.queue = NULL;
blas_server_avail = 1;
}
for(i = blas_num_threads - 1; i < num_threads - 1; i++){
blas_threads[i] = CreateThread(NULL, 0,
blas_thread_server, (void *)i,
0, &blas_threads_id[i]);
}
blas_num_threads = num_threads;
UNLOCK_COMMAND(&server_lock);
}
blas_cpu_number = num_threads;
}
void openblas_set_num_threads(int num)
{
goto_set_num_threads(num);
}

View File

@@ -60,6 +60,21 @@ extern gotoblas_t gotoblas_NEHALEM;
extern gotoblas_t gotoblas_OPTERON;
extern gotoblas_t gotoblas_OPTERON_SSE3;
extern gotoblas_t gotoblas_BARCELONA;
extern gotoblas_t gotoblas_BOBCAT;
#ifndef NO_AVX
extern gotoblas_t gotoblas_SANDYBRIDGE;
//extern gotoblas_t gotoblas_BULLDOZER;
//extern gotoblas_t gotoblas_PILEDRIVER;
#else
//Use NEHALEM kernels for sandy bridge
#define gotoblas_SANDYBRIDGE gotoblas_NEHALEM
#endif
#define gotoblas_BULLDOZER gotoblas_BARCELONA
#define gotoblas_PILEDRIVER gotoblas_BARCELONA
//Use sandy bridge kernels for haswell.
#define gotoblas_HASWELL gotoblas_SANDYBRIDGE
#define VENDOR_INTEL 1
#define VENDOR_AMD 2
@@ -68,6 +83,32 @@ extern gotoblas_t gotoblas_BARCELONA;
#define BITMASK(a, b, c) ((((a) >> (b)) & (c)))
#ifndef NO_AVX
static inline void xgetbv(int op, int * eax, int * edx){
//Use binary code for xgetbv
__asm__ __volatile__
(".byte 0x0f, 0x01, 0xd0": "=a" (*eax), "=d" (*edx) : "c" (op) : "cc");
}
#endif
int support_avx(){
#ifndef NO_AVX
int eax, ebx, ecx, edx;
int ret=0;
cpuid(1, &eax, &ebx, &ecx, &edx);
if ((ecx & (1 << 28)) != 0 && (ecx & (1 << 27)) != 0 && (ecx & (1 << 26)) != 0){
xgetbv(0, &eax, &edx);
if((eax & 6) == 6){
ret=1; //OS support AVX
}
}
return ret;
#else
return 0;
#endif
}
static int get_vendor(void){
int eax, ebx, ecx, edx;
char vendor[13];
@@ -122,15 +163,59 @@ static gotoblas_t *get_coretype(void){
if (model == 12) return &gotoblas_ATOM;
return NULL;
case 2:
//Intel Core (Clarkdale) / Core (Arrandale)
// Pentium (Clarkdale) / Pentium Mobile (Arrandale)
// Xeon (Clarkdale), 32nm
if (model == 5) return &gotoblas_NEHALEM;
case 2:
//Intel Core (Clarkdale) / Core (Arrandale)
// Pentium (Clarkdale) / Pentium Mobile (Arrandale)
// Xeon (Clarkdale), 32nm
if (model == 5) return &gotoblas_NEHALEM;
//Intel Xeon Processor 5600 (Westmere-EP)
if (model == 12) return &gotoblas_NEHALEM;
return NULL;
//Intel Xeon Processor 5600 (Westmere-EP)
//Xeon Processor E7 (Westmere-EX)
//Xeon E7540
if (model == 12 || model == 14 || model == 15) return &gotoblas_NEHALEM;
//Intel Core i5-2000 /i7-2000 (Sandy Bridge)
//Intel Core i7-3000 / Xeon E5
if (model == 10 || model == 13) {
if(support_avx())
return &gotoblas_SANDYBRIDGE;
else{
fprintf(stderr, "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n");
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
}
}
return NULL;
case 3:
//Intel Sandy Bridge 22nm (Ivy Bridge?)
if (model == 10) {
if(support_avx())
return &gotoblas_SANDYBRIDGE;
else{
fprintf(stderr, "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n");
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
}
}
//Intel Haswell
if (model == 12) {
if(support_avx())
return &gotoblas_HASWELL;
else{
fprintf(stderr, "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n");
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
}
}
return NULL;
case 4:
//Intel Haswell
if (model == 5) {
if(support_avx())
return &gotoblas_HASWELL;
else{
fprintf(stderr, "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n");
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
}
}
return NULL;
}
case 0xf:
if (model <= 0x2) return &gotoblas_NORTHWOOD;
@@ -144,7 +229,27 @@ static gotoblas_t *get_coretype(void){
if ((exfamily == 0) || (exfamily == 2)) {
if (ecx & (1 << 0)) return &gotoblas_OPTERON_SSE3;
else return &gotoblas_OPTERON;
} else {
} else if (exfamily == 5) {
return &gotoblas_BOBCAT;
} else if (exfamily == 6) {
if(model == 1){
//AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series
if(support_avx())
return &gotoblas_BULLDOZER;
else{
fprintf(stderr, "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Barcelona kernels as a fallback, which may give poorer performance.\n");
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
}
}else if(model == 2){
//AMD Bulldozer Opteron 6300 / Opteron 4300 / Opteron 3300
if(support_avx())
return &gotoblas_PILEDRIVER;
else{
fprintf(stderr, "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Barcelona kernels as a fallback, which may give poorer performance.\n");
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
}
}
} else {
return &gotoblas_BARCELONA;
}
}
@@ -178,6 +283,10 @@ static char *corename[] = {
"Opteron(SSE3)",
"Barcelona",
"Nano",
"Sandybridge",
"Bobcat",
"Bulldozer",
"Piledriver",
};
char *gotoblas_corename(void) {
@@ -197,7 +306,11 @@ char *gotoblas_corename(void) {
if (gotoblas == &gotoblas_OPTERON) return corename[13];
if (gotoblas == &gotoblas_BARCELONA) return corename[14];
if (gotoblas == &gotoblas_NANO) return corename[15];
if (gotoblas == &gotoblas_SANDYBRIDGE) return corename[16];
if (gotoblas == &gotoblas_BOBCAT) return corename[17];
if (gotoblas == &gotoblas_BULLDOZER) return corename[18];
if (gotoblas == &gotoblas_PILEDRIVER) return corename[19];
return corename[0];
}
@@ -211,12 +324,21 @@ void gotoblas_dynamic_init(void) {
if (gotoblas == NULL) gotoblas = &gotoblas_KATMAI;
#else
if (gotoblas == NULL) gotoblas = &gotoblas_PRESCOTT;
/* sanity check, if 64bit pointer we can't have a 32 bit cpu */
if (sizeof(void*) == 8) {
if (gotoblas == &gotoblas_KATMAI ||
gotoblas == &gotoblas_COPPERMINE ||
gotoblas == &gotoblas_NORTHWOOD ||
gotoblas == &gotoblas_BANIAS ||
gotoblas == &gotoblas_ATHLON)
gotoblas = &gotoblas_PRESCOTT;
}
#endif
if (gotoblas && gotoblas -> init) {
gotoblas -> init();
} else {
fprintf(stderr, "GotoBLAS : Architecture Initialization failed. No initialization function found.\n");
fprintf(stderr, "OpenBLAS : Architecture Initialization failed. No initialization function found.\n");
exit(1);
}

View File

@@ -82,6 +82,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <sched.h>
#include <dirent.h>
#include <dlfcn.h>
#include <unistd.h>
#include <string.h>
#define MAX_NODES 16
#define MAX_CPUS 256
@@ -314,7 +316,7 @@ static int numa_check(void) {
}
while ((dir = readdir(dp)) != NULL) {
if (*(unsigned int *) dir -> d_name == 0x065646f6eU) {
if (strncmp(dir->d_name, "node", 4)==0) {
node = atoi(&dir -> d_name[4]);
@@ -735,7 +737,8 @@ void gotoblas_affinity_init(void) {
fprintf(stderr, "Shared Memory Initialization.\n");
#endif
common -> num_procs = get_nprocs();
//returns the number of processors which are currently online
common -> num_procs = sysconf(_SC_NPROCESSORS_ONLN);;
if(common -> num_procs > MAX_CPUS) {
fprintf(stderr, "\nOpenBLAS Warining : The number of CPU/Cores(%d) is beyond the limit(%d). Terminated.\n", common->num_procs, MAX_CPUS);

View File

@@ -1,5 +1,5 @@
/*****************************************************************************
Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS
Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -105,6 +105,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(OS_FREEBSD) || defined(OS_DARWIN)
#include <sys/sysctl.h>
#include <sys/resource.h>
#endif
#if defined(OS_WINDOWS) && (defined(__MINGW32__) || defined(__MINGW64__))
@@ -125,7 +126,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define NO_WARMUP
#endif
#ifdef ALLOC_HUGETLB
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
@@ -185,7 +186,7 @@ int get_num_procs(void) {
#endif
#if defined(OS_FREEBSD) || defined(OS_DARWIN)
#if defined(OS_FREEBSD)
int get_num_procs(void) {
@@ -206,7 +207,46 @@ int get_num_procs(void) {
#endif
#if defined(OS_DARWIN)
int get_num_procs(void) {
static int nums = 0;
size_t len;
if (nums == 0){
len = sizeof(int);
sysctlbyname("hw.physicalcpu", &nums, &len, NULL, 0);
}
return nums;
}
/*
void set_stack_limit(int limitMB){
int result=0;
struct rlimit rl;
rlim_t StackSize;
StackSize=limitMB*1024*1024;
result=getrlimit(RLIMIT_STACK, &rl);
if(result==0){
if(rl.rlim_cur < StackSize){
rl.rlim_cur=StackSize;
result=setrlimit(RLIMIT_STACK, &rl);
if(result !=0){
fprintf(stderr, "OpenBLAS: set stack limit error =%d\n", result);
}
}
}
}
*/
#endif
/*
OpenBLAS uses the numbers of CPU cores in multithreading.
It can be set by openblas_set_num_threads(int num_threads);
*/
int blas_cpu_number = 0;
/*
The numbers of threads in the thread pool.
This value is equal or large than blas_cpu_number. This means some threads are sleep.
*/
int blas_num_threads = 0;
int goto_get_num_procs (void) {
@@ -1228,6 +1268,7 @@ void CONSTRUCTOR gotoblas_init(void) {
if (gotoblas_initialized) return;
#ifdef PROFILE
moncontrol (0);
#endif
@@ -1289,6 +1330,7 @@ void DESTRUCTOR gotoblas_quit(void) {
moncontrol (1);
#endif
blas_shutdown();
}
#if (defined(C_PGI) || (!defined(C_SUN) && defined(F_INTERFACE_SUN))) && (defined(ARCH_X86) || defined(ARCH_X86_64))

View File

@@ -0,0 +1,59 @@
/*****************************************************************************
Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the ISCAS nor the names of its contributors may
be used to endorse or promote products derived from this software
without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/
#include "common.h"
static char* openblas_config_str=""
#ifdef USE64BITINT
"USE64BITINT "
#endif
#ifdef NO_CBLAS
"NO_CBLAS "
#endif
#ifdef NO_LAPACK
"NO_LAPACK "
#endif
#ifdef NO_LAPACKE
"NO_LAPACKE "
#endif
#ifdef DYNAMIC_ARCH
"DYNAMIC_ARCH "
#endif
#ifdef NO_AFFINITY
"NO_AFFINITY "
#endif
;
char* CNAME() {
return openblas_config_str;
}

View File

@@ -0,0 +1,52 @@
/*****************************************************************************
Copyright (c) 2013 Martin Koehler, grisuthedragon@users.github.com
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the ISCAS nor the names of its contributors may
be used to endorse or promote products derived from this software
without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/
#include "common.h"
#if defined(USE_OPENMP)
static int parallel = 2 ;
#elif defined(SMP_SERVER)
static int parallel = 1;
#else
static int parallel = 0;
#endif
int CNAME() {
return parallel;
}
int NAME() {
return parallel;
}

View File

@@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
extern void openblas_set_num_threads(int num_threads) ;
void NAME(int* num_threads){
void openblas_set_num_threads_(int* num_threads){
openblas_set_num_threads(*num_threads);
}
@@ -46,7 +46,7 @@ void NAME(int* num_threads){
void openblas_set_num_threads(int num_threads) {
}
void NAME(int* num_threads){
void openblas_set_num_threads_(int* num_threads){
}
#endif

View File

@@ -163,7 +163,7 @@ int get_L2_size(void){
int eax, ebx, ecx, edx;
#if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || \
#if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) || \
defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \
defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC)

View File

@@ -18,10 +18,19 @@ ifndef NO_LAPACKE
NO_LAPACKE = 0
endif
ifndef NEED2UNDERSCORES
NEED2UNDERSCORES=0
endif
ifeq ($(OSNAME), WINNT)
ifeq ($(F_COMPILER), GFORTRAN)
EXTRALIB += -lgfortran
endif
ifeq ($(USE_OPENMP), 1)
ifeq ($(C_COMPILER), GCC)
EXTRALIB += -lgomp
endif
endif
endif
ifeq ($(OSNAME), CYGWIN_NT)
@@ -66,6 +75,11 @@ dll : ../$(LIBDLLNAME)
dll2 : libgoto2_shared.dll
# On Windows, we only generate a DLL without a version suffix. This is because
# applications which link against the dynamic library reference a fixed DLL name
# in their import table. By instead using a stable name it is possible to
# upgrade between library versions, without needing to re-link an application.
# For more details see: https://github.com/xianyi/OpenBLAS/issues/127.
../$(LIBDLLNAME) : ../$(LIBNAME) libopenblas.def dllinit.$(SUFFIX)
$(RANLIB) ../$(LIBNAME)
ifeq ($(BINARY32), 1)
@@ -79,18 +93,18 @@ else
endif
libgoto2_shared.dll : ../$(LIBNAME) libgoto2_shared.def
$(CC) $(CFLAGS) libgoto2_shared.def -shared -o $(@F) \
$(CC) $(CFLAGS) $(LDFLAGS) libgoto2_shared.def -shared -o $(@F) \
-Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive \
-Wl,--out-implib,libgoto2_shared.lib $(FEXTRALIB)
libopenblas.def : gensymbol
perl ./gensymbol win2k $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) > $(@F)
perl ./gensymbol win2k $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) > $(@F)
libgoto2_shared.def : gensymbol
perl ./gensymbol win2k $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) > $(@F)
perl ./gensymbol win2k $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) > $(@F)
libgoto_hpl.def : gensymbol
perl ./gensymbol win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) > $(@F)
perl ./gensymbol win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) > $(@F)
$(LIBDYNNAME) : ../$(LIBNAME) osx.def
$(CC) $(CFLAGS) -all_load -headerpad_max_install_names -install_name $(CURDIR)/../$(LIBDYNNAME) -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
@@ -106,23 +120,29 @@ ifeq ($(OSNAME), Linux)
so : ../$(LIBSONAME)
../$(LIBSONAME) : ../$(LIBNAME) linux.def linktest.c
$(CC) $(CFLAGS) -shared -o ../$(LIBSONAME) \
$(CC) $(CFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \
-Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive \
-Wl,--retain-symbols-file=linux.def -Wl,-soname,$(LIBPREFIX).so.$(MAJOR_VERSION) $(EXTRALIB)
$(CC) $(CFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK.
ifneq ($(C_COMPILER), LSB)
$(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK.
else
#Use FC on LSB
$(FC) $(FFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK.
endif
rm -f linktest
endif
ifeq ($(OSNAME), FreeBSD)
#http://stackoverflow.com/questions/7656425/makefile-ifeq-logical-or
ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD NetBSD))
so : ../$(LIBSONAME)
../$(LIBSONAME) : ../$(LIBNAME) linux.def linktest.c
$(CC) $(CFLAGS) -shared -o ../$(LIBSONAME) \
$(CC) $(CFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \
-Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive \
-Wl,--retain-symbols-file=linux.def $(FEXTRALIB) $(EXTRALIB)
$(CC) $(CFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK.
$(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK.
rm -f linktest
endif
@@ -132,15 +152,15 @@ ifeq ($(OSNAME), OSF1)
so : ../$(LIBSONAME)
../$(LIBSONAME) :
$(CC) -shared -o ../$(LIBSONAME) ../$(LIBNAME)
$(CC) $(CFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) ../$(LIBNAME)
endif
ifeq ($(OSNAME), SunOS)
so : ../$(LIBSONAME)
$(CC) $(CFLAGS) -shared -o ../$(LIBSONAME) \
$(CC) $(CFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \
-Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive $(EXTRALIB)
$(CC) $(CFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK.
$(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK.
rm -f linktest
endif
@@ -171,23 +191,23 @@ static : ../$(LIBNAME)
rm -f goto.$(SUFFIX)
linux.def : gensymbol ../Makefile.system ../getarch.c
perl ./gensymbol linux $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) > $(@F)
perl ./gensymbol linux $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) > $(@F)
osx.def : gensymbol ../Makefile.system ../getarch.c
perl ./gensymbol osx $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) > $(@F)
perl ./gensymbol osx $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) > $(@F)
aix.def : gensymbol ../Makefile.system ../getarch.c
perl ./gensymbol aix $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) > $(@F)
perl ./gensymbol aix $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) > $(@F)
symbol.S : gensymbol
perl ./gensymbol win2kasm noarch dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) > symbol.S
perl ./gensymbol win2kasm noarch dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) > symbol.S
test : linktest.c
$(CC) $(CFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) -lm && echo OK.
$(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) -lm && echo OK.
rm -f linktest
linktest.c : gensymbol ../Makefile.system ../getarch.c
perl ./gensymbol linktest $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) > linktest.c
perl ./gensymbol linktest $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) > linktest.c
clean ::
@rm -f *.def *.dylib __.SYMDEF*

View File

@@ -49,7 +49,7 @@
cblas_zhemv, cblas_zher2, cblas_zher2k, cblas_zher, cblas_zherk, cblas_zhpmv, cblas_zhpr2,
cblas_zhpr, cblas_zscal, cblas_zswap, cblas_zsymm, cblas_zsyr2k, cblas_zsyrk,
cblas_ztbmv, cblas_ztbsv, cblas_ztpmv, cblas_ztpsv, cblas_ztrmm, cblas_ztrmv, cblas_ztrsm,
cblas_ztrsv);
cblas_ztrsv, cblas_cdotc_sub, cblas_cdotu_sub, cblas_zdotc_sub, cblas_zdotu_sub );
@exblasobjs = (
qamax,qamin,qasum,qaxpy,qcabs1,qcopy,qdot,qgbmv,qgemm,
@@ -72,12 +72,18 @@
zgemm3m, cgemm3m, zsymm3m, csymm3m, zhemm3m, chemm3m,
);
#both underscore and no underscore
@misc_common_objs = (
openblas_set_num_threads, openblas_get_parallel,
);
@misc_no_underscore_objs = (
openblas_set_num_threads, goto_set_num_threads,
goto_set_num_threads,
openblas_get_config,
);
@misc_underscore_objs = (
openblas_set_num_threads,
);
@lapackobjs = (
@@ -108,9 +114,9 @@
# ALLAUX -- Auxiliary routines called from all precisions
# already provided by @blasobjs: xerbla, lsame
ilaenv, ieeeck, lsamen, xerbla_array, iparmq,
ilaprec, ilatrans, ilauplo, iladiag, chla_transtype,
ilaver, slamch,
ilaenv, ieeeck, lsamen, iparmq,
ilaprec, ilatrans, ilauplo, iladiag,
ilaver, slamch, slamc3,
# SCLAUX -- Auxiliary routines called from both REAL and COMPLEX.
# excluded: second_$(TIMER)
@@ -147,7 +153,7 @@
dlasr, dlasrt, dlassq, dlasv2, dpttrf, dstebz, dstedc,
dsteqr, dsterf, dlaisnan, disnan,
dlartgp, dlartgs,
dlamch,
dlamch, dlamc3,
# SLASRC -- Single precision real LAPACK routines
# already provided by @lapackobjs:
@@ -2666,18 +2672,32 @@
#LAPACKE_zlagsy_work,
);
#These function may need 2 underscores.
@lapack_embeded_underscore_objs=(xerbla_array, chla_transtype,);
if ($ARGV[5] == 1) {
#NO_LAPACK=1
@underscore_objs = (@blasobjs, @misc_underscore_objs);
} elsif (-d "../lapack-3.1.1" || -d "../lapack-3.4.0" || -d "../lapack-3.4.1") {
@underscore_objs = (@blasobjs, @lapackobjs, @lapackobjs2, @misc_underscore_objs);
} elsif (-d "../lapack-3.1.1" || -d "../lapack-3.4.0" || -d "../lapack-3.4.1" ||
-d "../lapack-3.4.2" || -d "../lapack-netlib") {
if ($ARGV[7] == 0){
# NEED2UNDERSCORES=0
# Don't need 2 underscores
@underscore_objs = (@blasobjs, @lapackobjs, @lapackobjs2, @misc_underscore_objs, @lapack_embeded_underscore_objs);
}else{
# Need 2 underscores
@underscore_objs = (@blasobjs, @lapackobjs, @lapackobjs2, @misc_underscore_objs);
@need_2underscore_objs = (@lapack_embeded_underscore_objs);
};
} else {
@underscore_objs = (@blasobjs, @lapackobjs, @misc_underscore_objs);
}
if ($ARGV[3] == 1){ @underscore_objs = (@underscore_objs, @exblasobjs); };
if ($ARGV[1] eq "X86_64"){ @underscore_objs = (@underscore_objs, @gemm3mobjs); };
if ($ARGV[1] eq "x86_64"){ @underscore_objs = (@underscore_objs, @gemm3mobjs); };
if ($ARGV[1] eq "x86"){ @underscore_objs = (@underscore_objs, @gemm3mobjs); };
@@ -2714,10 +2734,18 @@ $bu = $ARGV[2];
$bu = "" if (($bu eq "0") || ($bu eq "1"));
if ($ARGV[0] eq "linux"){
@underscore_objs = (@underscore_objs, @misc_common_objs);
@no_underscore_objs = (@no_underscore_objs, @misc_common_objs);
foreach $objs (@underscore_objs) {
print $objs, $bu, "\n";
}
foreach $objs (@need_2underscore_objs) {
print $objs, $bu, $bu, "\n";
}
# if ($ARGV[4] == 0) {
foreach $objs (@no_underscore_objs) {
print $objs, "\n";
@@ -2731,10 +2759,18 @@ if ($ARGV[0] eq "linux"){
}
if ($ARGV[0] eq "osx"){
@underscore_objs = (@underscore_objs, @misc_common_objs);
@no_underscore_objs = (@no_underscore_objs, @misc_common_objs);
foreach $objs (@underscore_objs) {
print "_", $objs, $bu, "\n";
}
foreach $objs (@need_2underscore_objs) {
print "_", $objs, $bu, $bu, "\n";
}
# if ($ARGV[4] == 0) {
foreach $objs (@no_underscore_objs) {
print "_", $objs, "\n";
@@ -2744,10 +2780,18 @@ if ($ARGV[0] eq "osx"){
}
if ($ARGV[0] eq "aix"){
@underscore_objs = (@underscore_objs, @misc_common_objs);
@no_underscore_objs = (@no_underscore_objs, @misc_common_objs);
foreach $objs (@underscore_objs) {
print $objs, $bu, "\n";
}
foreach $objs (@need_2underscore_objs) {
print $objs, $bu, $bu, "\n";
}
# if ($ARGV[4] == 0) {
foreach $objs (@no_underscore_objs) {
print $objs, "\n";
@@ -2759,23 +2803,42 @@ if ($ARGV[0] eq "aix"){
if ($ARGV[0] eq "win2k"){
print "EXPORTS\n";
$count = 1;
@no_underscore_objs = (@no_underscore_objs, @misc_common_objs);
foreach $objs (@underscore_objs) {
unless ($objs =~ /openblas_set_num_threads/) { #remove openblas_set_num_threads
$uppercase = $objs;
$uppercase =~ tr/[a-z]/[A-Z]/;
print "\t$objs=$objs","_ \@", $count, "\n";
$count ++;
print "\t",$objs, "_=$objs","_ \@", $count, "\n";
$count ++;
print "\t$uppercase=$objs", "_ \@", $count, "\n";
$count ++;
}
$uppercase = $objs;
$uppercase =~ tr/[a-z]/[A-Z]/;
print "\t$objs=$objs","_ \@", $count, "\n";
$count ++;
print "\t",$objs, "_=$objs","_ \@", $count, "\n";
$count ++;
print "\t$uppercase=$objs", "_ \@", $count, "\n";
$count ++;
}
foreach $objs (@need_2underscore_objs) {
$uppercase = $objs;
$uppercase =~ tr/[a-z]/[A-Z]/;
print "\t$objs=$objs","__ \@", $count, "\n";
$count ++;
print "\t",$objs, "__=$objs","__ \@", $count, "\n";
$count ++;
print "\t$uppercase=$objs", "__ \@", $count, "\n";
$count ++;
}
#for misc_common_objs
foreach $objs (@misc_common_objs) {
$uppercase = $objs;
$uppercase =~ tr/[a-z]/[A-Z]/;
print "\t",$objs, "_=$objs","_ \@", $count, "\n";
$count ++;
print "\t$uppercase=$objs", "_ \@", $count, "\n";
$count ++;
}
#for openblas_set_num_threads
print "\topenblas_set_num_threads_=openblas_set_num_threads_ \@", $count, "\n";
$count ++;
foreach $objs (@no_underscore_objs) {
print "\t",$objs,"=$objs"," \@", $count, "\n";
@@ -2808,6 +2871,9 @@ if ($ARGV[0] eq "win2khpl"){
}
if ($ARGV[0] eq "microsoft"){
@underscore_objs = (@underscore_objs, @misc_common_objs);
print "EXPORTS\n";
$count = 1;
foreach $objs (@underscore_objs) {
@@ -2822,10 +2888,25 @@ if ($ARGV[0] eq "microsoft"){
print "\t$uppercase\_ = $objs","_\n";
$count ++;
}
foreach $objs (@need_2underscore_objs) {
$uppercase = $objs;
$uppercase =~ tr/[a-z]/[A-Z]/;
print "\t$objs=$objs","__ \@", $count, "\n";
$count ++;
print "\t",$objs, "__=$objs","__ \@", $count, "\n";
$count ++;
print "\t$uppercase=$objs", "__ \@", $count, "\n";
$count ++;
}
exit(0);
}
if ($ARGV[0] eq "win2kasm"){
@underscore_objs = (@underscore_objs, @misc_common_objs);
print "\t.text\n";
foreach $objs (@underscore_objs) {
$uppercase = $objs;
@@ -2835,14 +2916,33 @@ if ($ARGV[0] eq "win2kasm"){
print "_", $uppercase, "_:\n";
print "\tjmp\t_", $objs, "_\n";
}
foreach $objs (@need_2underscore_objs) {
$uppercase = $objs;
$uppercase =~ tr/[a-z]/[A-Z]/;
print "\t.align 16\n";
print "\t.globl _", $uppercase, "__\n";
print "_", $uppercase, "__:\n";
print "\tjmp\t_", $objs, "__\n";
}
exit(0);
}
if ($ARGV[0] eq "linktest"){
@underscore_objs = (@underscore_objs, @misc_common_objs);
@no_underscore_objs = (@no_underscore_objs, @misc_common_objs);
print "int main(void){\n";
foreach $objs (@underscore_objs) {
print $objs, $bu, "();\n" if $objs ne "xerbla";
}
foreach $objs (@need_2underscore_objs) {
print $objs, $bu, $bu, "();\n";
}
# if ($ARGV[4] == 0) {
foreach $objs (@no_underscore_objs) {
print $objs, "();\n";

31
f_check
View File

@@ -24,7 +24,7 @@ $compiler = "" if $compiler eq "f77";
if ($compiler eq "") {
@lists = ("f77", "g77", "g95", "gfortran", "frt", "fort", "openf90", "openf95",
@lists = ("g77", "g95", "gfortran", "frt", "fort", "openf90", "openf95",
"sunf77", "sunf90", "sunf95",
"xlf95", "xlf90", "xlf",
"ppuf77", "ppuf95", "ppuf90", "ppuxlf",
@@ -114,6 +114,12 @@ if ($compiler eq "") {
$vendor = IBM;
$openmp = "-openmp";
}
# for embeded underscore name, e.g. zho_ge, it may append 2 underscores.
$data = `$compiler -O2 -S ftest3.f > /dev/null 2>&1 && cat ftest3.s && rm -f ftest3.s`;
if ($data =~ /zho_ge__/) {
$need2bu = 1;
}
}
if ($vendor eq "") {
@@ -211,6 +217,10 @@ if (!$?) {
if ($?) {
$link = `$compiler $openmp -q32 -v ftest2.f 2>&1 && rm -f a.out a.exe`;
}
#For gfortran MIPS
if ($?) {
$link = `$compiler $openmp -mabi=n32 -v ftest2.f 2>&1 && rm -f a.out a.exe`;
}
$binary = "" if ($?);
}
@@ -219,6 +229,10 @@ if (!$?) {
if ($?) {
$link = `$compiler $openmp -q64 -v ftest2.f 2>&1 && rm -f a.out a.exe`;
}
#For gfortran MIPS
if ($?) {
$link = `$compiler $openmp -mabi=64 -v ftest2.f 2>&1 && rm -f a.out a.exe`;
}
$binary = "" if ($?);
}
@@ -237,6 +251,8 @@ if ($link ne "") {
$link =~ s/\-rpath\s+/\-rpath\@/g;
$link =~ s/\-rpath-link\s+/\-rpath-link\@/g;
@flags = split(/[\s\,\n]/, $link);
# remove leading and trailing quotes from each flag.
@flags = map {s/^['"]|['"]$//g; $_} @flags;
@@ -257,7 +273,15 @@ if ($link ne "") {
$linker_L .= "-Wl,". $flags . " ";
}
if ($flags =~ /^\-rpath/) {
if ($flags =~ /^\-rpath\@/) {
$flags =~ s/\@/\,/g;
if ($vendor eq "PGI") {
$flags =~ s/lib$/libso/;
}
$linker_L .= "-Wl,". $flags . " " ;
}
if ($flags =~ /^\-rpath-link\@/) {
$flags =~ s/\@/\,/g;
if ($vendor eq "PGI") {
$flags =~ s/lib$/libso/;
@@ -301,6 +325,9 @@ print MAKEFILE "NOFORTRAN=1\n" if $nofortran == 1;
print CONFFILE "#define BUNDERSCORE\t$bu\n" if $bu ne "";
print CONFFILE "#define NEEDBUNDERSCORE\t1\n" if $bu ne "";
print CONFFILE "#define NEED2UNDERSCORES\t1\n" if $need2bu ne "";
print MAKEFILE "NEED2UNDERSCORES=1\n" if $need2bu ne "";
if (($linker_l ne "") || ($linker_a ne "")) {
print MAKEFILE "FEXTRALIB=$linker_L $linker_l $linker_a\n";

6
ftest3.f Normal file
View File

@@ -0,0 +1,6 @@
double complex function zho_ge()
zho_ge = (0.0d0,0.0d0)
return
end

View File

@@ -83,6 +83,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
#ifdef linux
#include <sys/sysinfo.h>
#include <unistd.h>
#endif
/* #define FORCE_P2 */
@@ -96,14 +97,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/* #define FORCE_PENRYN */
/* #define FORCE_DUNNINGTON */
/* #define FORCE_NEHALEM */
/* #define FORCE_SANDYBRIDGE */
/* #define FORCE_ATOM */
/* #define FORCE_ATHLON */
/* #define FORCE_OPTERON */
/* #define FORCE_OPTERON_SSE3 */
/* #define FORCE_BARCELONA */
/* #define FORCE_SHANGHAI */
/* #define FORCE_ISTANBUL */
/* #define FORCE_BOBCAT */
/* #define FORCE_BULLDOZER */
/* #define FORCE_BOBCAT */
/* #define FORCE_PILEDRIVER */
/* #define FORCE_SSE_GENERIC */
/* #define FORCE_VIAC3 */
/* #define FORCE_NANO */
@@ -118,12 +122,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/* #define FORCE_PPC440FP2 */
/* #define FORCE_CELL */
/* #define FORCE_SICORTEX */
/* #define FORCE_LOONGSON3A */
/* #define FORCE_LOONGSON3B */
/* #define FORCE_LOONGSON3A */
/* #define FORCE_LOONGSON3B */
/* #define FORCE_ITANIUM2 */
/* #define FORCE_GENERIC */
/* #define FORCE_SPARC */
/* #define FORCE_SPARCV7 */
/* #define FORCE_GENERIC */
#ifdef FORCE_P2
#define FORCE
@@ -139,20 +143,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CORENAME "P5"
#endif
#ifdef FORCE_COPPERMINE
#define FORCE
#define FORCE_INTEL
#define ARCHITECTURE "X86"
#define SUBARCHITECTURE "PENTIUM3"
#define ARCHCONFIG "-DPENTIUM3 " \
"-DL1_DATA_SIZE=16384 -DL1_DATA_LINESIZE=32 " \
"-DL2_SIZE=262144 -DL2_LINESIZE=32 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE "
#define LIBNAME "coppermine"
#define CORENAME "COPPERMINE"
#endif
#ifdef FORCE_KATMAI
#define FORCE
#define FORCE_INTEL
@@ -167,6 +157,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CORENAME "KATMAI"
#endif
#ifdef FORCE_COPPERMINE
#define FORCE
#define FORCE_INTEL
#define ARCHITECTURE "X86"
#define SUBARCHITECTURE "PENTIUM3"
#define ARCHCONFIG "-DPENTIUM3 " \
"-DL1_DATA_SIZE=16384 -DL1_DATA_LINESIZE=32 " \
"-DL2_SIZE=262144 -DL2_LINESIZE=32 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE "
#define LIBNAME "coppermine"
#define CORENAME "COPPERMINE"
#endif
#ifdef FORCE_NORTHWOOD
#define FORCE
#define FORCE_INTEL
@@ -350,7 +354,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CORENAME "OPTERON"
#endif
#if defined(FORCE_BARCELONA) || defined(FORCE_SHANGHAI) || defined(FORCE_ISTANBUL) || defined (FORCE_BULLDOZER)
#if defined(FORCE_BARCELONA) || defined(FORCE_SHANGHAI) || defined(FORCE_ISTANBUL) || defined (FORCE_PILEDRIVER) || defined (FORCE_BULLDOZER)
#define FORCE
#define FORCE_INTEL
#define ARCHITECTURE "X86"
@@ -380,6 +384,38 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CORENAME "BOBCAT"
#endif
#if 0
#define FORCE
#define FORCE_INTEL
#define ARCHITECTURE "X86"
#define SUBARCHITECTURE "BULLDOZER"
#define ARCHCONFIG "-DBULLDOZER " \
"-DL1_DATA_SIZE=49152 -DL1_DATA_LINESIZE=64 " \
"-DL2_SIZE=1024000 -DL2_LINESIZE=64 -DL3_SIZE=16777216 " \
"-DDTB_DEFAULT_ENTRIES=32 -DDTB_SIZE=4096 " \
"-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 " \
"-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU " \
"-DHAVE_AVX -DHAVE_FMA4"
#define LIBNAME "bulldozer"
#define CORENAME "BULLDOZER"
#endif
#if 0
#define FORCE
#define FORCE_INTEL
#define ARCHITECTURE "X86"
#define SUBARCHITECTURE "PILEDRIVER"
#define ARCHCONFIG "-DPILEDRIVER " \
"-DL1_DATA_SIZE=16384 -DL1_DATA_LINESIZE=64 " \
"-DL2_SIZE=2097152 -DL2_LINESIZE=64 -DL3_SIZE=12582912 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 " \
"-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU -DHAVE_CFLUSH " \
"-DHAVE_AVX -DHAVE_FMA4 -DHAVE_FMA3"
#define LIBNAME "piledriver"
#define CORENAME "PILEDRIVER"
#endif
#ifdef FORCE_SSE_GENERIC
#define FORCE
#define FORCE_INTEL
@@ -701,7 +737,8 @@ static int get_num_cores(void) {
#endif
#ifdef linux
return get_nprocs();
//returns the number of processors which are currently online
return sysconf(_SC_NPROCESSORS_ONLN);
#elif defined(OS_WINDOWS)
@@ -786,8 +823,12 @@ int main(int argc, char *argv[]){
#endif
#endif
#if NO_PARALLEL_MAKE==1
printf("MAKE += -j 1\n");
#else
#ifndef OS_WINDOWS
printf("MAKE += -j %d\n", get_num_cores());
#endif
#endif
break;

View File

@@ -8,7 +8,7 @@
int main(int argc, char **argv) {
if ((argc < 1) || (*argv[1] == '0')) {
if ( (argc <= 1) || ((argc >= 2) && (*argv[1] == '0'))) {
printf("SGEMM_UNROLL_M=%d\n", SGEMM_DEFAULT_UNROLL_M);
printf("SGEMM_UNROLL_N=%d\n", SGEMM_DEFAULT_UNROLL_N);
printf("DGEMM_UNROLL_M=%d\n", DGEMM_DEFAULT_UNROLL_M);
@@ -22,10 +22,48 @@ int main(int argc, char **argv) {
printf("ZGEMM_UNROLL_N=%d\n", ZGEMM_DEFAULT_UNROLL_N);
printf("XGEMM_UNROLL_M=%d\n", XGEMM_DEFAULT_UNROLL_M);
printf("XGEMM_UNROLL_N=%d\n", XGEMM_DEFAULT_UNROLL_N);
#ifdef CGEMM3M_DEFAULT_UNROLL_M
printf("CGEMM3M_UNROLL_M=%d\n", CGEMM3M_DEFAULT_UNROLL_M);
#else
printf("CGEMM3M_UNROLL_M=%d\n", SGEMM_DEFAULT_UNROLL_M);
#endif
#ifdef CGEMM3M_DEFAULT_UNROLL_N
printf("CGEMM3M_UNROLL_N=%d\n", CGEMM3M_DEFAULT_UNROLL_N);
#else
printf("CGEMM3M_UNROLL_N=%d\n", SGEMM_DEFAULT_UNROLL_N);
#endif
#ifdef ZGEMM3M_DEFAULT_UNROLL_M
printf("ZGEMM3M_UNROLL_M=%d\n", ZGEMM3M_DEFAULT_UNROLL_M);
#else
printf("ZGEMM3M_UNROLL_M=%d\n", DGEMM_DEFAULT_UNROLL_M);
#endif
#ifdef ZGEMM3M_DEFAULT_UNROLL_N
printf("ZGEMM3M_UNROLL_N=%d\n", ZGEMM3M_DEFAULT_UNROLL_N);
#else
printf("ZGEMM3M_UNROLL_N=%d\n", DGEMM_DEFAULT_UNROLL_N);
#endif
#ifdef XGEMM3M_DEFAULT_UNROLL_M
printf("XGEMM3M_UNROLL_M=%d\n", ZGEMM3M_DEFAULT_UNROLL_M);
#else
printf("XGEMM3M_UNROLL_M=%d\n", QGEMM_DEFAULT_UNROLL_M);
#endif
#ifdef XGEMM3M_DEFAULT_UNROLL_N
printf("XGEMM3M_UNROLL_N=%d\n", ZGEMM3M_DEFAULT_UNROLL_N);
#else
printf("XGEMM3M_UNROLL_N=%d\n", QGEMM_DEFAULT_UNROLL_N);
#endif
}
if ((argc >= 1) && (*argv[1] == '1')) {
if ((argc >= 2) && (*argv[1] == '1')) {
printf("#define SLOCAL_BUFFER_SIZE\t%ld\n", (SGEMM_DEFAULT_Q * SGEMM_DEFAULT_UNROLL_N * 4 * 1 * sizeof(float)));
printf("#define DLOCAL_BUFFER_SIZE\t%ld\n", (DGEMM_DEFAULT_Q * DGEMM_DEFAULT_UNROLL_N * 2 * 1 * sizeof(double)));
printf("#define CLOCAL_BUFFER_SIZE\t%ld\n", (CGEMM_DEFAULT_Q * CGEMM_DEFAULT_UNROLL_N * 4 * 2 * sizeof(float)));
@@ -34,7 +72,7 @@ int main(int argc, char **argv) {
#ifdef USE64BITINT
printf("#define USE64BITINT\n");
#endif
printf("#define GEMM_MULTITHREAD_THRESHOLD\t%ld\n", GEMM_MULTITHREAD_THRESHOLD);
printf("#define GEMM_MULTITHREAD_THRESHOLD\t%ld\n", (long int)GEMM_MULTITHREAD_THRESHOLD);
}
return 0;

View File

@@ -318,7 +318,7 @@ CZBLAS3OBJS = \
ifndef NO_CBLAS
CFLAGS += -I.
override CFLAGS += -I.
SBLAS1OBJS += $(CSBLAS1OBJS)
SBLAS2OBJS += $(CSBLAS2OBJS)
@@ -400,7 +400,7 @@ all :: libs
ifdef FUNCTION_PROFILE
$(BLASOBJS) $(BLASOBJS_P) : functable.h
$(BLASOBJS) $(BLASOBJS_P) : CFLAGS += -DPROFILE_FUNC_NAME=interface_$(*F)
$(BLASOBJS) $(BLASOBJS_P) : override CFLAGS += -DPROFILE_FUNC_NAME=interface_$(*F)
functable.h : Makefile
./create $(FUNCALLFILES) > functable.h
@@ -420,7 +420,7 @@ level3 : $(SBLAS3OBJS) $(DBLAS3OBJS) $(QBLAS3OBJS) $(CBLAS3OBJS) $(ZBLAS3OBJS) $
$(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^
$(CSBLASOBJS) $(CSBLASOBJS_P) $(CDBLASOBJS) $(CDBLASOBJS_P) $(CQBLASOBJS) $(CQBLASOBJS_P) \
$(CCBLASOBJS) $(CCBLASOBJS_P) $(CZBLASOBJS) $(CZBLASOBJS_P) $(CXBLASOBJS) $(CXBLASOBJS_P) : CFLAGS += -DCBLAS
$(CCBLASOBJS) $(CCBLASOBJS_P) $(CZBLASOBJS) $(CZBLASOBJS_P) $(CXBLASOBJS) $(CXBLASOBJS_P) : override CFLAGS += -DCBLAS
srot.$(SUFFIX) srot.$(PSUFFIX) : rot.c
$(CC) $(CFLAGS) -c $< -o $(@F)

View File

@@ -60,6 +60,8 @@ static blasint (*trtri_parallel[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *
};
#endif
extern void BLASFUNC(dtrtrilapack)(char *UPLO, char *DIAG, int *N, double *a, int *ldA, int *Info);
int NAME(char *UPLO, char *DIAG, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){
blas_arg_t args;
@@ -83,6 +85,7 @@ int NAME(char *UPLO, char *DIAG, blasint *N, FLOAT *a, blasint *ldA, blasint *In
TOUPPER(uplo_arg);
TOUPPER(diag_arg);
uplo = -1;
if (uplo_arg == 'U') uplo = 0;
if (uplo_arg == 'L') uplo = 1;
@@ -90,6 +93,7 @@ int NAME(char *UPLO, char *DIAG, blasint *N, FLOAT *a, blasint *ldA, blasint *In
if (diag_arg == 'U') diag = 0;
if (diag_arg == 'N') diag = 1;
info = 0;
if (args.lda < MAX(1,args.n)) info = 5;
if (args.n < 0) info = 3;
@@ -129,6 +133,18 @@ int NAME(char *UPLO, char *DIAG, blasint *N, FLOAT *a, blasint *ldA, blasint *In
if (args.nthreads == 1) {
#endif
#if DOUBLE
// double trtri_U single thread error
// call dtrtri from lapack for a walk around.
if(uplo==0){
BLASFUNC(dtrtrilapack)(UPLO, DIAG, N, a, ldA, Info);
#ifndef PPC440
blas_memory_free(buffer);
#endif
return 0;
}
#endif
*Info = (trtri_single[(uplo << 1) | diag])(&args, NULL, NULL, sa, sb, 0);
#ifdef SMP

View File

@@ -6,7 +6,7 @@ TOPDIR = ..
include $(TOPDIR)/Makefile.system
ifdef TARGET_CORE
CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE)
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE)
BUILD_KERNEL = 1
KDIR =
TSUFFIX = _$(TARGET_CORE)
@@ -48,7 +48,7 @@ HPLOBJS = \
COMMONOBJS += lsame.$(SUFFIX) scabs1.$(SUFFIX) dcabs1.$(SUFFIX)
ifdef DYNAMIC_ARCH
ifeq ($(DYNAMIC_ARCH), 1)
SBLASOBJS += setparam$(TSUFFIX).$(SUFFIX)
CCOMMON_OPT += -DTS=$(TSUFFIX)
endif

View File

@@ -388,7 +388,7 @@ $(KDIR)xgerv_k$(TSUFFIX).$(SUFFIX) $(KDIR)xgerv_k$(TSUFFIX).$(PSUFFIX) : $(KER
$(CC) -c $(CFLAGS) -DXDOUBLE -UCONJ -DXCONJ $< -o $@
$(KDIR)xgerd_k$(TSUFFIX).$(SUFFIX) $(KDIR)xgerd_k$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGERCKERNEL) $(XGERPARAM)
$(CC) -c $(CFLAGS) -DXDOUBLE -DCONJ-DXCONJ $< -o $@
$(CC) -c $(CFLAGS) -DXDOUBLE -DCONJ -DXCONJ $< -o $@
$(KDIR)chemv_U$(TSUFFIX).$(SUFFIX) $(KDIR)chemv_U$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CHEMV_U_KERNEL) $(CHEMV_U_PARAM)
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -ULOWER -DHEMV $< -o $@

View File

@@ -1206,328 +1206,328 @@ $(KDIR)xhemm_iutcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_utcopy_$(XGEMM_UNROLL_M
$(KDIR)xhemm_iltcopy$(TSUFFIX).$(SUFFIX) : generic/zhemm_ltcopy_$(XGEMM_UNROLL_M).c
$(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@
$(KDIR)cgemm3m_oncopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(SGEMM_UNROLL_N).c
$(KDIR)cgemm3m_oncopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
$(KDIR)cgemm3m_oncopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(SGEMM_UNROLL_N).c
$(KDIR)cgemm3m_oncopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)cgemm3m_oncopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(SGEMM_UNROLL_N).c
$(KDIR)cgemm3m_oncopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)cgemm3m_otcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(SGEMM_UNROLL_N).c
$(KDIR)cgemm3m_otcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
$(KDIR)cgemm3m_otcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(SGEMM_UNROLL_N).c
$(KDIR)cgemm3m_otcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)cgemm3m_otcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(SGEMM_UNROLL_N).c
$(KDIR)cgemm3m_otcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)cgemm3m_incopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(SGEMM_UNROLL_M).c
$(KDIR)cgemm3m_incopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@
$(KDIR)cgemm3m_incopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(SGEMM_UNROLL_M).c
$(KDIR)cgemm3m_incopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)cgemm3m_incopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(SGEMM_UNROLL_M).c
$(KDIR)cgemm3m_incopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)cgemm3m_itcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(SGEMM_UNROLL_M).c
$(KDIR)cgemm3m_itcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@
$(KDIR)cgemm3m_itcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(SGEMM_UNROLL_M).c
$(KDIR)cgemm3m_itcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)cgemm3m_itcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(SGEMM_UNROLL_M).c
$(KDIR)cgemm3m_itcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)zgemm3m_oncopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(DGEMM_UNROLL_N).c
$(KDIR)zgemm3m_oncopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
$(KDIR)zgemm3m_oncopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(DGEMM_UNROLL_N).c
$(KDIR)zgemm3m_oncopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)zgemm3m_oncopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(DGEMM_UNROLL_N).c
$(KDIR)zgemm3m_oncopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)zgemm3m_otcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(DGEMM_UNROLL_N).c
$(KDIR)zgemm3m_otcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
$(KDIR)zgemm3m_otcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(DGEMM_UNROLL_N).c
$(KDIR)zgemm3m_otcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)zgemm3m_otcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(DGEMM_UNROLL_N).c
$(KDIR)zgemm3m_otcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)zgemm3m_incopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(DGEMM_UNROLL_M).c
$(KDIR)zgemm3m_incopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@
$(KDIR)zgemm3m_incopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(DGEMM_UNROLL_M).c
$(KDIR)zgemm3m_incopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)zgemm3m_incopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(DGEMM_UNROLL_M).c
$(KDIR)zgemm3m_incopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)zgemm3m_itcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(DGEMM_UNROLL_M).c
$(KDIR)zgemm3m_itcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@
$(KDIR)zgemm3m_itcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(DGEMM_UNROLL_M).c
$(KDIR)zgemm3m_itcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)zgemm3m_itcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(DGEMM_UNROLL_M).c
$(KDIR)zgemm3m_itcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)xgemm3m_oncopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(QGEMM_UNROLL_N).c
$(KDIR)xgemm3m_oncopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
$(KDIR)xgemm3m_oncopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(QGEMM_UNROLL_N).c
$(KDIR)xgemm3m_oncopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)xgemm3m_oncopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(QGEMM_UNROLL_N).c
$(KDIR)xgemm3m_oncopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)xgemm3m_otcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(QGEMM_UNROLL_N).c
$(KDIR)xgemm3m_otcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
$(KDIR)xgemm3m_otcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(QGEMM_UNROLL_N).c
$(KDIR)xgemm3m_otcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)xgemm3m_otcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(QGEMM_UNROLL_N).c
$(KDIR)xgemm3m_otcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)xgemm3m_incopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(QGEMM_UNROLL_M).c
$(KDIR)xgemm3m_incopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@
$(KDIR)xgemm3m_incopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(QGEMM_UNROLL_M).c
$(KDIR)xgemm3m_incopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)xgemm3m_incopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(QGEMM_UNROLL_M).c
$(KDIR)xgemm3m_incopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)xgemm3m_itcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(QGEMM_UNROLL_M).c
$(KDIR)xgemm3m_itcopyb$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@
$(KDIR)xgemm3m_itcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(QGEMM_UNROLL_M).c
$(KDIR)xgemm3m_itcopyr$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)xgemm3m_itcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(QGEMM_UNROLL_M).c
$(KDIR)xgemm3m_itcopyi$(TSUFFIX).$(SUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)csymm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(SGEMM_UNROLL_N).c
$(KDIR)csymm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
$(KDIR)csymm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(SGEMM_UNROLL_N).c
$(KDIR)csymm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
$(KDIR)csymm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(SGEMM_UNROLL_N).c
$(KDIR)csymm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)csymm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(SGEMM_UNROLL_N).c
$(KDIR)csymm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)csymm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(SGEMM_UNROLL_N).c
$(KDIR)csymm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)csymm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(SGEMM_UNROLL_N).c
$(KDIR)csymm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)csymm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(SGEMM_UNROLL_M).c
$(KDIR)csymm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
$(KDIR)csymm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(SGEMM_UNROLL_M).c
$(KDIR)csymm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
$(KDIR)csymm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(SGEMM_UNROLL_M).c
$(KDIR)csymm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)csymm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(SGEMM_UNROLL_M).c
$(KDIR)csymm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)csymm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(SGEMM_UNROLL_M).c
$(KDIR)csymm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)csymm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(SGEMM_UNROLL_M).c
$(KDIR)csymm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)zsymm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(DGEMM_UNROLL_N).c
$(KDIR)zsymm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
$(KDIR)zsymm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(DGEMM_UNROLL_N).c
$(KDIR)zsymm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
$(KDIR)zsymm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(DGEMM_UNROLL_N).c
$(KDIR)zsymm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)zsymm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(DGEMM_UNROLL_N).c
$(KDIR)zsymm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)zsymm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(DGEMM_UNROLL_N).c
$(KDIR)zsymm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)zsymm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(DGEMM_UNROLL_N).c
$(KDIR)zsymm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)zsymm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(DGEMM_UNROLL_M).c
$(KDIR)zsymm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
$(KDIR)zsymm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(DGEMM_UNROLL_M).c
$(KDIR)zsymm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
$(KDIR)zsymm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(DGEMM_UNROLL_M).c
$(KDIR)zsymm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)zsymm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(DGEMM_UNROLL_M).c
$(KDIR)zsymm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)zsymm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(DGEMM_UNROLL_M).c
$(KDIR)zsymm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)zsymm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(DGEMM_UNROLL_M).c
$(KDIR)zsymm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)xsymm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(QGEMM_UNROLL_N).c
$(KDIR)xsymm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
$(KDIR)xsymm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(QGEMM_UNROLL_N).c
$(KDIR)xsymm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
$(KDIR)xsymm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(QGEMM_UNROLL_N).c
$(KDIR)xsymm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)xsymm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(QGEMM_UNROLL_N).c
$(KDIR)xsymm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)xsymm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(QGEMM_UNROLL_N).c
$(KDIR)xsymm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)xsymm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(QGEMM_UNROLL_N).c
$(KDIR)xsymm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)xsymm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(QGEMM_UNROLL_M).c
$(KDIR)xsymm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
$(KDIR)xsymm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(QGEMM_UNROLL_M).c
$(KDIR)xsymm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
$(KDIR)xsymm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(QGEMM_UNROLL_M).c
$(KDIR)xsymm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)xsymm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(QGEMM_UNROLL_M).c
$(KDIR)xsymm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)xsymm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(QGEMM_UNROLL_M).c
$(KDIR)xsymm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)xsymm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(QGEMM_UNROLL_M).c
$(KDIR)xsymm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)chemm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(SGEMM_UNROLL_N).c
$(KDIR)chemm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
$(KDIR)chemm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(SGEMM_UNROLL_N).c
$(KDIR)chemm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
$(KDIR)chemm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(SGEMM_UNROLL_N).c
$(KDIR)chemm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)chemm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(SGEMM_UNROLL_N).c
$(KDIR)chemm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)chemm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(SGEMM_UNROLL_N).c
$(KDIR)chemm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)chemm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(SGEMM_UNROLL_N).c
$(KDIR)chemm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)chemm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(SGEMM_UNROLL_M).c
$(KDIR)chemm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
$(KDIR)chemm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(SGEMM_UNROLL_M).c
$(KDIR)chemm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
$(KDIR)chemm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(SGEMM_UNROLL_M).c
$(KDIR)chemm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)chemm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(SGEMM_UNROLL_M).c
$(KDIR)chemm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)chemm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(SGEMM_UNROLL_M).c
$(KDIR)chemm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)chemm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(SGEMM_UNROLL_M).c
$(KDIR)chemm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)zhemm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(DGEMM_UNROLL_N).c
$(KDIR)zhemm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
$(KDIR)zhemm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(DGEMM_UNROLL_N).c
$(KDIR)zhemm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
$(KDIR)zhemm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(DGEMM_UNROLL_N).c
$(KDIR)zhemm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)zhemm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(DGEMM_UNROLL_N).c
$(KDIR)zhemm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)zhemm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(DGEMM_UNROLL_N).c
$(KDIR)zhemm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)zhemm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(DGEMM_UNROLL_N).c
$(KDIR)zhemm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)zhemm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(DGEMM_UNROLL_M).c
$(KDIR)zhemm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
$(KDIR)zhemm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(DGEMM_UNROLL_M).c
$(KDIR)zhemm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
$(KDIR)zhemm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(DGEMM_UNROLL_M).c
$(KDIR)zhemm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)zhemm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(DGEMM_UNROLL_M).c
$(KDIR)zhemm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)zhemm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(DGEMM_UNROLL_M).c
$(KDIR)zhemm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)zhemm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(DGEMM_UNROLL_M).c
$(KDIR)zhemm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)xhemm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(QGEMM_UNROLL_N).c
$(KDIR)xhemm3m_oucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
$(KDIR)xhemm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(QGEMM_UNROLL_N).c
$(KDIR)xhemm3m_olcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
$(KDIR)xhemm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(QGEMM_UNROLL_N).c
$(KDIR)xhemm3m_oucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)xhemm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(QGEMM_UNROLL_N).c
$(KDIR)xhemm3m_olcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)xhemm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(QGEMM_UNROLL_N).c
$(KDIR)xhemm3m_oucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)xhemm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(QGEMM_UNROLL_N).c
$(KDIR)xhemm3m_olcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)xhemm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(QGEMM_UNROLL_M).c
$(KDIR)xhemm3m_iucopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
$(KDIR)xhemm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(QGEMM_UNROLL_M).c
$(KDIR)xhemm3m_ilcopyb$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
$(KDIR)xhemm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(QGEMM_UNROLL_M).c
$(KDIR)xhemm3m_iucopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)xhemm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(QGEMM_UNROLL_M).c
$(KDIR)xhemm3m_ilcopyr$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)xhemm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(QGEMM_UNROLL_M).c
$(KDIR)xhemm3m_iucopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)xhemm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(QGEMM_UNROLL_M).c
$(KDIR)xhemm3m_ilcopyi$(TSUFFIX).$(SUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(CFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)strsm_iunucopy$(TSUFFIX).$(SUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_M).c
@@ -2608,328 +2608,328 @@ $(KDIR)xhemm_iutcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_utcopy_$(XGEMM_UNROLL_
$(KDIR)xhemm_iltcopy$(TSUFFIX).$(PSUFFIX) : generic/zhemm_ltcopy_$(XGEMM_UNROLL_M).c
$(CC) -c $(PFLAGS) $(NO_UNINITIALIZED_WARN) -DXDOUBLE -DCOMPLEX -UOUTER $< -DLOWER -o $@
$(KDIR)cgemm3m_oncopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(SGEMM_UNROLL_N).c
$(KDIR)cgemm3m_oncopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
$(KDIR)cgemm3m_oncopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(SGEMM_UNROLL_N).c
$(KDIR)cgemm3m_oncopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)cgemm3m_oncopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(SGEMM_UNROLL_N).c
$(KDIR)cgemm3m_oncopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)cgemm3m_otcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(SGEMM_UNROLL_N).c
$(KDIR)cgemm3m_otcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
$(KDIR)cgemm3m_otcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(SGEMM_UNROLL_N).c
$(KDIR)cgemm3m_otcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)cgemm3m_otcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(SGEMM_UNROLL_N).c
$(KDIR)cgemm3m_otcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)cgemm3m_incopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(SGEMM_UNROLL_M).c
$(KDIR)cgemm3m_incopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@
$(KDIR)cgemm3m_incopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(SGEMM_UNROLL_M).c
$(KDIR)cgemm3m_incopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)cgemm3m_incopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(SGEMM_UNROLL_M).c
$(KDIR)cgemm3m_incopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)cgemm3m_itcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(SGEMM_UNROLL_M).c
$(KDIR)cgemm3m_itcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@
$(KDIR)cgemm3m_itcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(SGEMM_UNROLL_M).c
$(KDIR)cgemm3m_itcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)cgemm3m_itcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(SGEMM_UNROLL_M).c
$(KDIR)cgemm3m_itcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)zgemm3m_oncopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(DGEMM_UNROLL_N).c
$(KDIR)zgemm3m_oncopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
$(KDIR)zgemm3m_oncopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(DGEMM_UNROLL_N).c
$(KDIR)zgemm3m_oncopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)zgemm3m_oncopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(DGEMM_UNROLL_N).c
$(KDIR)zgemm3m_oncopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)zgemm3m_otcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(DGEMM_UNROLL_N).c
$(KDIR)zgemm3m_otcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
$(KDIR)zgemm3m_otcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(DGEMM_UNROLL_N).c
$(KDIR)zgemm3m_otcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)zgemm3m_otcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(DGEMM_UNROLL_N).c
$(KDIR)zgemm3m_otcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)zgemm3m_incopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(DGEMM_UNROLL_M).c
$(KDIR)zgemm3m_incopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@
$(KDIR)zgemm3m_incopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(DGEMM_UNROLL_M).c
$(KDIR)zgemm3m_incopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)zgemm3m_incopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(DGEMM_UNROLL_M).c
$(KDIR)zgemm3m_incopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)zgemm3m_itcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(DGEMM_UNROLL_M).c
$(KDIR)zgemm3m_itcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@
$(KDIR)zgemm3m_itcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(DGEMM_UNROLL_M).c
$(KDIR)zgemm3m_itcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)zgemm3m_itcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(DGEMM_UNROLL_M).c
$(KDIR)zgemm3m_itcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)xgemm3m_oncopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(QGEMM_UNROLL_N).c
$(KDIR)xgemm3m_oncopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
$(KDIR)xgemm3m_oncopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(QGEMM_UNROLL_N).c
$(KDIR)xgemm3m_oncopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)xgemm3m_oncopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(QGEMM_UNROLL_N).c
$(KDIR)xgemm3m_oncopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)xgemm3m_otcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(QGEMM_UNROLL_N).c
$(KDIR)xgemm3m_otcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
$(KDIR)xgemm3m_otcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(QGEMM_UNROLL_N).c
$(KDIR)xgemm3m_otcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)xgemm3m_otcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(QGEMM_UNROLL_N).c
$(KDIR)xgemm3m_otcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)xgemm3m_incopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(QGEMM_UNROLL_M).c
$(KDIR)xgemm3m_incopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@
$(KDIR)xgemm3m_incopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(QGEMM_UNROLL_M).c
$(KDIR)xgemm3m_incopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)xgemm3m_incopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(QGEMM_UNROLL_M).c
$(KDIR)xgemm3m_incopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_ncopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)xgemm3m_itcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(QGEMM_UNROLL_M).c
$(KDIR)xgemm3m_itcopyb$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA $< -o $@
$(KDIR)xgemm3m_itcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(QGEMM_UNROLL_M).c
$(KDIR)xgemm3m_itcopyr$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)xgemm3m_itcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(QGEMM_UNROLL_M).c
$(KDIR)xgemm3m_itcopyi$(TSUFFIX).$(PSUFFIX) : generic/zgemm3m_tcopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DICOPY -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)csymm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(SGEMM_UNROLL_N).c
$(KDIR)csymm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
$(KDIR)csymm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(SGEMM_UNROLL_N).c
$(KDIR)csymm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
$(KDIR)csymm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(SGEMM_UNROLL_N).c
$(KDIR)csymm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)csymm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(SGEMM_UNROLL_N).c
$(KDIR)csymm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)csymm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(SGEMM_UNROLL_N).c
$(KDIR)csymm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)csymm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(SGEMM_UNROLL_N).c
$(KDIR)csymm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)csymm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(SGEMM_UNROLL_M).c
$(KDIR)csymm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
$(KDIR)csymm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(SGEMM_UNROLL_M).c
$(KDIR)csymm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
$(KDIR)csymm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(SGEMM_UNROLL_M).c
$(KDIR)csymm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)csymm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(SGEMM_UNROLL_M).c
$(KDIR)csymm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)csymm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(SGEMM_UNROLL_M).c
$(KDIR)csymm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)csymm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(SGEMM_UNROLL_M).c
$(KDIR)csymm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)zsymm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(DGEMM_UNROLL_N).c
$(KDIR)zsymm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
$(KDIR)zsymm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(DGEMM_UNROLL_N).c
$(KDIR)zsymm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
$(KDIR)zsymm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(DGEMM_UNROLL_N).c
$(KDIR)zsymm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)zsymm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(DGEMM_UNROLL_N).c
$(KDIR)zsymm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)zsymm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(DGEMM_UNROLL_N).c
$(KDIR)zsymm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)zsymm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(DGEMM_UNROLL_N).c
$(KDIR)zsymm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)zsymm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(DGEMM_UNROLL_M).c
$(KDIR)zsymm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
$(KDIR)zsymm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(DGEMM_UNROLL_M).c
$(KDIR)zsymm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
$(KDIR)zsymm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(DGEMM_UNROLL_M).c
$(KDIR)zsymm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)zsymm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(DGEMM_UNROLL_M).c
$(KDIR)zsymm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)zsymm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(DGEMM_UNROLL_M).c
$(KDIR)zsymm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)zsymm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(DGEMM_UNROLL_M).c
$(KDIR)zsymm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)xsymm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(QGEMM_UNROLL_N).c
$(KDIR)xsymm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
$(KDIR)xsymm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(QGEMM_UNROLL_N).c
$(KDIR)xsymm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
$(KDIR)xsymm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(QGEMM_UNROLL_N).c
$(KDIR)xsymm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)xsymm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(QGEMM_UNROLL_N).c
$(KDIR)xsymm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)xsymm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(QGEMM_UNROLL_N).c
$(KDIR)xsymm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)xsymm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(QGEMM_UNROLL_N).c
$(KDIR)xsymm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)xsymm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(QGEMM_UNROLL_M).c
$(KDIR)xsymm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
$(KDIR)xsymm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(QGEMM_UNROLL_M).c
$(KDIR)xsymm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
$(KDIR)xsymm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(QGEMM_UNROLL_M).c
$(KDIR)xsymm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)xsymm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(QGEMM_UNROLL_M).c
$(KDIR)xsymm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)xsymm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(QGEMM_UNROLL_M).c
$(KDIR)xsymm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_ucopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)xsymm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(QGEMM_UNROLL_M).c
$(KDIR)xsymm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zsymm3m_lcopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)chemm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(SGEMM_UNROLL_N).c
$(KDIR)chemm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
$(KDIR)chemm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(SGEMM_UNROLL_N).c
$(KDIR)chemm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
$(KDIR)chemm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(SGEMM_UNROLL_N).c
$(KDIR)chemm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)chemm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(SGEMM_UNROLL_N).c
$(KDIR)chemm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)chemm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(SGEMM_UNROLL_N).c
$(KDIR)chemm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)chemm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(SGEMM_UNROLL_N).c
$(KDIR)chemm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)chemm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(SGEMM_UNROLL_M).c
$(KDIR)chemm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
$(KDIR)chemm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(SGEMM_UNROLL_M).c
$(KDIR)chemm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
$(KDIR)chemm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(SGEMM_UNROLL_M).c
$(KDIR)chemm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)chemm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(SGEMM_UNROLL_M).c
$(KDIR)chemm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)chemm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(SGEMM_UNROLL_M).c
$(KDIR)chemm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)chemm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(SGEMM_UNROLL_M).c
$(KDIR)chemm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(CGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -UDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)zhemm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(DGEMM_UNROLL_N).c
$(KDIR)zhemm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
$(KDIR)zhemm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(DGEMM_UNROLL_N).c
$(KDIR)zhemm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
$(KDIR)zhemm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(DGEMM_UNROLL_N).c
$(KDIR)zhemm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)zhemm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(DGEMM_UNROLL_N).c
$(KDIR)zhemm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)zhemm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(DGEMM_UNROLL_N).c
$(KDIR)zhemm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)zhemm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(DGEMM_UNROLL_N).c
$(KDIR)zhemm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)zhemm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(DGEMM_UNROLL_M).c
$(KDIR)zhemm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
$(KDIR)zhemm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(DGEMM_UNROLL_M).c
$(KDIR)zhemm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
$(KDIR)zhemm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(DGEMM_UNROLL_M).c
$(KDIR)zhemm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)zhemm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(DGEMM_UNROLL_M).c
$(KDIR)zhemm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)zhemm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(DGEMM_UNROLL_M).c
$(KDIR)zhemm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)zhemm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(DGEMM_UNROLL_M).c
$(KDIR)zhemm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(ZGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)xhemm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(QGEMM_UNROLL_N).c
$(KDIR)xhemm3m_oucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
$(KDIR)xhemm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(QGEMM_UNROLL_N).c
$(KDIR)xhemm3m_olcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA $< -o $@
$(KDIR)xhemm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(QGEMM_UNROLL_N).c
$(KDIR)xhemm3m_oucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)xhemm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(QGEMM_UNROLL_N).c
$(KDIR)xhemm3m_olcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)xhemm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(QGEMM_UNROLL_N).c
$(KDIR)xhemm3m_oucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)xhemm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(QGEMM_UNROLL_N).c
$(KDIR)xhemm3m_olcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_N).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -DUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)xhemm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(QGEMM_UNROLL_M).c
$(KDIR)xhemm3m_iucopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
$(KDIR)xhemm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(QGEMM_UNROLL_M).c
$(KDIR)xhemm3m_ilcopyb$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA $< -o $@
$(KDIR)xhemm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(QGEMM_UNROLL_M).c
$(KDIR)xhemm3m_iucopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)xhemm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(QGEMM_UNROLL_M).c
$(KDIR)xhemm3m_ilcopyr$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DREAL_ONLY $< -o $@
$(KDIR)xhemm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(QGEMM_UNROLL_M).c
$(KDIR)xhemm3m_iucopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_ucopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)xhemm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(QGEMM_UNROLL_M).c
$(KDIR)xhemm3m_ilcopyi$(TSUFFIX).$(PSUFFIX) : generic/zhemm3m_lcopy_$(XGEMM3M_UNROLL_M).c
$(CC) $(PFLAGS) $(NO_UNINITIALIZED_WARN) -c -DXDOUBLE -DCOMPLEX -UUSE_ALPHA -DIMAGE_ONLY $< -o $@
$(KDIR)strsm_iunucopy$(TSUFFIX).$(PSUFFIX) : generic/trsm_uncopy_$(SGEMM_UNROLL_M).c

View File

@@ -634,10 +634,10 @@ static void init_parameter(void) {
TABLE_NAME.xgemm_q = XGEMM_DEFAULT_Q;
#endif
#if defined(CORE_KATMAI) || defined(CORE_COPPERMINE) || defined(CORE_BANIAS) || defined(CORE_YONAH)
#if defined(CORE_KATMAI) || defined(CORE_COPPERMINE) || defined(CORE_BANIAS) || defined(CORE_YONAH) || defined(CORE_ATHLON)
#ifdef DEBUG
fprintf(stderr, "Katmai, Coppermine, Banias\n");
fprintf(stderr, "Katmai, Coppermine, Banias, Athlon\n");
#endif
TABLE_NAME.sgemm_p = 64 * (l2 >> 7);
@@ -810,6 +810,38 @@ static void init_parameter(void) {
#endif
#endif
#ifdef BULLDOZER
#ifdef DEBUG
fprintf(stderr, "Bulldozer\n");
#endif
TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P;
TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P;
TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P;
TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P;
#ifdef EXPRECISION
TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P;
TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P;
#endif
#endif
#ifdef PILEDRIVER
#ifdef DEBUG
fprintf(stderr, "Piledriver\n");
#endif
TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P;
TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P;
TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P;
TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P;
#ifdef EXPRECISION
TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P;
TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P;
#endif
#endif
#ifdef NANO
#ifdef DEBUG

View File

@@ -0,0 +1,59 @@
SGEMMKERNEL = gemm_kernel_4x4_barcelona.S
SGEMMINCOPY =
SGEMMITCOPY =
SGEMMONCOPY = ../generic/gemm_ncopy_4.c
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
SGEMMINCOPYOBJ =
SGEMMITCOPYOBJ =
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
DGEMMKERNEL = gemm_kernel_2x4_barcelona.S
DGEMMINCOPY = ../generic/gemm_ncopy_2.c
DGEMMITCOPY = ../generic/gemm_tcopy_2.c
DGEMMONCOPY = ../generic/gemm_ncopy_4.c
DGEMMOTCOPY = ../generic/gemm_tcopy_4.c
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
CGEMMKERNEL = zgemm_kernel_2x2_barcelona.S
CGEMMINCOPY =
CGEMMITCOPY =
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
CGEMMINCOPYOBJ =
CGEMMITCOPYOBJ =
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
ZGEMMKERNEL = zgemm_kernel_1x2_barcelona.S
ZGEMMINCOPY = ../generic/zgemm_ncopy_1.c
ZGEMMITCOPY = ../generic/zgemm_tcopy_1.c
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
STRSMKERNEL_LN = trsm_kernel_LN_4x4_sse.S
STRSMKERNEL_LT = trsm_kernel_LT_4x4_sse.S
STRSMKERNEL_RN = trsm_kernel_LT_4x4_sse.S
STRSMKERNEL_RT = trsm_kernel_RT_4x4_sse.S
DTRSMKERNEL_LN = trsm_kernel_LN_2x4_sse2.S
DTRSMKERNEL_LT = trsm_kernel_LT_2x4_sse2.S
DTRSMKERNEL_RN = trsm_kernel_LT_2x4_sse2.S
DTRSMKERNEL_RT = trsm_kernel_RT_2x4_sse2.S
CTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse.S
CTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse.S
CTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse.S
CTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse.S
ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x2_sse2.S
ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x2_sse2.S
ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x2_sse2.S
ZTRSMKERNEL_RT = ztrsm_kernel_RT_1x2_sse2.S
CGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S
ZGEMM3MKERNEL = zgemm3m_kernel_2x4_barcelona.S

View File

@@ -0,0 +1,59 @@
SGEMMKERNEL = gemm_kernel_4x4_barcelona.S
SGEMMINCOPY =
SGEMMITCOPY =
SGEMMONCOPY = ../generic/gemm_ncopy_4.c
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
SGEMMINCOPYOBJ =
SGEMMITCOPYOBJ =
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
DGEMMKERNEL = gemm_kernel_2x4_barcelona.S
DGEMMINCOPY = ../generic/gemm_ncopy_2.c
DGEMMITCOPY = ../generic/gemm_tcopy_2.c
DGEMMONCOPY = ../generic/gemm_ncopy_4.c
DGEMMOTCOPY = ../generic/gemm_tcopy_4.c
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
CGEMMKERNEL = zgemm_kernel_2x2_barcelona.S
CGEMMINCOPY =
CGEMMITCOPY =
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
CGEMMINCOPYOBJ =
CGEMMITCOPYOBJ =
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
ZGEMMKERNEL = zgemm_kernel_1x2_barcelona.S
ZGEMMINCOPY = ../generic/zgemm_ncopy_1.c
ZGEMMITCOPY = ../generic/zgemm_tcopy_1.c
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
STRSMKERNEL_LN = trsm_kernel_LN_4x4_sse.S
STRSMKERNEL_LT = trsm_kernel_LT_4x4_sse.S
STRSMKERNEL_RN = trsm_kernel_LT_4x4_sse.S
STRSMKERNEL_RT = trsm_kernel_RT_4x4_sse.S
DTRSMKERNEL_LN = trsm_kernel_LN_2x4_sse2.S
DTRSMKERNEL_LT = trsm_kernel_LT_2x4_sse2.S
DTRSMKERNEL_RN = trsm_kernel_LT_2x4_sse2.S
DTRSMKERNEL_RT = trsm_kernel_RT_2x4_sse2.S
CTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse.S
CTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse.S
CTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse.S
CTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse.S
ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x2_sse2.S
ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x2_sse2.S
ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x2_sse2.S
ZTRSMKERNEL_RT = ztrsm_kernel_RT_1x2_sse2.S
CGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S
ZGEMM3MKERNEL = zgemm3m_kernel_2x4_barcelona.S

View File

@@ -596,7 +596,7 @@
.L22:
mulps %xmm0, %xmm2
addps %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movsd 4 * SIZE(BB), %xmm2
@@ -842,7 +842,7 @@
.L32:
mulss %xmm0, %xmm2
addss %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movss 4 * SIZE(BB), %xmm2
@@ -1168,7 +1168,7 @@
.L52:
mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
mulps 4 * SIZE(BB), %xmm0
@@ -1198,7 +1198,7 @@
addps %xmm0, %xmm5
movaps 32 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif
mulps %xmm1, %xmm2
@@ -1347,7 +1347,7 @@
ALIGN_4
.L62:
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
@@ -1531,7 +1531,7 @@
.L72:
mulss %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
mulss 4 * SIZE(BB), %xmm0
@@ -1778,7 +1778,7 @@
.L92:
mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movaps 4 * SIZE(AA), %xmm0
@@ -1793,7 +1793,7 @@
mulps 12 * SIZE(BB), %xmm0
addps %xmm0, %xmm7
movaps 32 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif
mulps %xmm1, %xmm3
@@ -1924,7 +1924,7 @@
.L102:
mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movsd 2 * SIZE(AA), %xmm0
@@ -2069,7 +2069,7 @@
.L112:
mulss %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movss 1 * SIZE(AA), %xmm0

View File

@@ -89,17 +89,22 @@
#endif
#define STACKSIZE 16
#define ARGS 16
#define M 4 + STACKSIZE(%esp)
#define N 8 + STACKSIZE(%esp)
#define ALPHA 16 + STACKSIZE(%esp)
#define A 20 + STACKSIZE(%esp)
#define STACK_LDA 24 + STACKSIZE(%esp)
#define STACK_X 28 + STACKSIZE(%esp)
#define STACK_INCX 32 + STACKSIZE(%esp)
#define Y 36 + STACKSIZE(%esp)
#define STACK_INCY 40 + STACKSIZE(%esp)
#define BUFFER 44 + STACKSIZE(%esp)
#define M 4 + STACKSIZE+ARGS(%esp)
#define N 8 + STACKSIZE+ARGS(%esp)
#define ALPHA 16 + STACKSIZE+ARGS(%esp)
#define A 20 + STACKSIZE+ARGS(%esp)
#define STACK_LDA 24 + STACKSIZE+ARGS(%esp)
#define STACK_X 28 + STACKSIZE+ARGS(%esp)
#define STACK_INCX 32 + STACKSIZE+ARGS(%esp)
#define Y 36 + STACKSIZE+ARGS(%esp)
#define STACK_INCY 40 + STACKSIZE+ARGS(%esp)
#define BUFFER 44 + STACKSIZE+ARGS(%esp)
#define MMM 0+ARGS(%esp)
#define YY 4+ARGS(%esp)
#define AA 8+ARGS(%esp)
#define I %eax
#define J %ebx
@@ -114,6 +119,7 @@
PROLOGUE
subl $ARGS,%esp
pushl %ebp
pushl %edi
pushl %esi
@@ -121,6 +127,33 @@
PROFCODE
movl Y,J
movl J,YY # backup Y
movl A,J
movl J,AA # backup A
movl M,J
movl J,MMM # backup MM
.L0t:
xorl J,J
addl $1,J
sall $21,J
subl J,MMM
movl J,M
jge .L00t
ALIGN_4
movl MMM,%eax
addl J,%eax
jle .L999x
movl %eax,M
.L00t:
movl AA,%eax
movl %eax,A
movl YY,J
movl J,Y
movl STACK_LDA, LDA
movl STACK_X, X
movl STACK_INCX, INCX
@@ -651,12 +684,22 @@
addss 0 * SIZE(X), %xmm0
movss %xmm0, (Y1)
ALIGN_3
.L999:
movl M,J
leal (,J,SIZE),%eax
addl %eax,AA
movl STACK_INCY,INCY
imull INCY,%eax
addl %eax,YY
jmp .L0t
ALIGN_4
.L999x:
popl %ebx
popl %esi
popl %edi
popl %ebp
addl $ARGS,%esp
ret
EPILOGUE

View File

@@ -76,17 +76,22 @@
#endif
#define STACKSIZE 16
#define ARGS 16
#define M 4 + STACKSIZE(%esp)
#define N 8 + STACKSIZE(%esp)
#define ALPHA 16 + STACKSIZE(%esp)
#define A 24 + STACKSIZE(%esp)
#define STACK_LDA 28 + STACKSIZE(%esp)
#define STACK_X 32 + STACKSIZE(%esp)
#define STACK_INCX 36 + STACKSIZE(%esp)
#define Y 40 + STACKSIZE(%esp)
#define STACK_INCY 44 + STACKSIZE(%esp)
#define BUFFER 48 + STACKSIZE(%esp)
#define M 4 + STACKSIZE+ARGS(%esp)
#define N 8 + STACKSIZE+ARGS(%esp)
#define ALPHA 16 + STACKSIZE+ARGS(%esp)
#define A 24 + STACKSIZE+ARGS(%esp)
#define STACK_LDA 28 + STACKSIZE+ARGS(%esp)
#define STACK_X 32 + STACKSIZE+ARGS(%esp)
#define STACK_INCX 36 + STACKSIZE+ARGS(%esp)
#define Y 40 + STACKSIZE+ARGS(%esp)
#define STACK_INCY 44 + STACKSIZE+ARGS(%esp)
#define BUFFER 48 + STACKSIZE+ARGS(%esp)
#define MMM 0+ARGS(%esp)
#define YY 4+ARGS(%esp)
#define AA 8+ARGS(%esp)
#define I %eax
#define J %ebx
@@ -101,6 +106,8 @@
PROLOGUE
subl $ARGS,%esp
pushl %ebp
pushl %edi
pushl %esi
@@ -108,6 +115,33 @@
PROFCODE
movl Y,J
movl J,YY # backup Y
movl A,J
movl J,AA # backup A
movl M,J
movl J,MMM # backup MM
.L0t:
xorl J,J
addl $1,J
sall $20,J
subl J,MMM
movl J,M
jge .L00t
ALIGN_4
movl MMM,%eax
addl J,%eax
jle .L999x
movl %eax,M
.L00t:
movl AA,%eax
movl %eax,A
movl YY,J
movl J,Y
movl STACK_LDA, LDA
movl STACK_X, X
movl STACK_INCX, INCX
@@ -677,10 +711,22 @@
ALIGN_3
.L999:
movl M,J
leal (,J,SIZE),%eax
addl %eax,AA
movl STACK_INCY,INCY
imull INCY,%eax
addl %eax,YY
jmp .L0t
ALIGN_4
.L999x:
popl %ebx
popl %esi
popl %edi
popl %ebp
addl $ARGS,%esp
ret
EPILOGUE

View File

@@ -89,17 +89,22 @@
#endif
#define STACKSIZE 16
#define ARGS 20
#define M 4 + STACKSIZE(%esp)
#define N 8 + STACKSIZE(%esp)
#define ALPHA 16 + STACKSIZE(%esp)
#define A 20 + STACKSIZE(%esp)
#define STACK_LDA 24 + STACKSIZE(%esp)
#define STACK_X 28 + STACKSIZE(%esp)
#define STACK_INCX 32 + STACKSIZE(%esp)
#define Y 36 + STACKSIZE(%esp)
#define STACK_INCY 40 + STACKSIZE(%esp)
#define BUFFER 44 + STACKSIZE(%esp)
#define M 4 + STACKSIZE+ARGS(%esp)
#define N 8 + STACKSIZE+ARGS(%esp)
#define ALPHA 16 + STACKSIZE+ARGS(%esp)
#define A 20 + STACKSIZE+ARGS(%esp)
#define STACK_LDA 24 + STACKSIZE+ARGS(%esp)
#define STACK_X 28 + STACKSIZE+ARGS(%esp)
#define STACK_INCX 32 + STACKSIZE+ARGS(%esp)
#define Y 36 + STACKSIZE+ARGS(%esp)
#define STACK_INCY 40 + STACKSIZE+ARGS(%esp)
#define BUFFER 44 + STACKSIZE+ARGS(%esp)
#define MMM 0+ARGS(%esp)
#define AA 4+ARGS(%esp)
#define XX 8+ARGS(%esp)
#define I %eax
#define J %ebx
@@ -114,6 +119,7 @@
PROLOGUE
subl $ARGS,%esp
pushl %ebp
pushl %edi
pushl %esi
@@ -121,8 +127,35 @@
PROFCODE
movl STACK_LDA, LDA
movl STACK_X, X
movl X,XX
movl A,J
movl J,AA # backup A
movl M,J
movl J,MMM # mov M to MMM
.L0t:
xorl J,J
addl $1,J
sall $22,J # J=2^24*sizeof(float)=buffer size(16MB)
subl $8, J # Don't use last 8 float in the buffer.
subl J,MMM # MMM=MMM-J
movl J,M
jge .L00t
ALIGN_4
movl MMM,%eax
addl J,%eax
jle .L999x
movl %eax,M
.L00t:
movl AA,%eax
movl %eax,A # mov AA to A
movl XX,%eax
movl %eax,X
movl STACK_LDA, LDA
movl STACK_INCX, INCX
movl STACK_INCY, INCY
@@ -198,6 +231,20 @@
jg .L06
ALIGN_4
//Padding zero to prevent loading the dirty number from buffer.
movl M, I
movl $8, J
andl $7, I
xorps %xmm0, %xmm0
subl I, J
ALIGN_2
.L07:
movss %xmm0, 0 * SIZE(Y1)
addl $SIZE, Y1
decl J
jg .L07
ALIGN_4
.L10:
movl Y, Y1
@@ -628,10 +675,22 @@
ALIGN_4
.L999:
movl M,J
leal (,J,SIZE),%eax
addl %eax,AA
movl STACK_INCX,INCX
imull INCX,%eax
addl %eax,XX
jmp .L0t
ALIGN_4
.L999x:
popl %ebx
popl %esi
popl %edi
popl %ebp
addl $ARGS,%esp
ret
EPILOGUE

View File

@@ -76,18 +76,23 @@
#endif
#define STACKSIZE 16
#define ARGS 20
#define M 4 + STACKSIZE+ARGS(%esp)
#define N 8 + STACKSIZE+ARGS(%esp)
#define ALPHA 16 + STACKSIZE+ARGS(%esp)
#define A 24 + STACKSIZE+ARGS(%esp)
#define STACK_LDA 28 + STACKSIZE+ARGS(%esp)
#define STACK_X 32 + STACKSIZE+ARGS(%esp)
#define STACK_INCX 36 + STACKSIZE+ARGS(%esp)
#define Y 40 + STACKSIZE+ARGS(%esp)
#define STACK_INCY 44 + STACKSIZE+ARGS(%esp)
#define BUFFER 48 + STACKSIZE+ARGS(%esp)
#define MMM 0+ARGS(%esp)
#define AA 4+ARGS(%esp)
#define XX 8+ARGS(%esp)
#define M 4 + STACKSIZE(%esp)
#define N 8 + STACKSIZE(%esp)
#define ALPHA 16 + STACKSIZE(%esp)
#define A 24 + STACKSIZE(%esp)
#define STACK_LDA 28 + STACKSIZE(%esp)
#define STACK_X 32 + STACKSIZE(%esp)
#define STACK_INCX 36 + STACKSIZE(%esp)
#define Y 40 + STACKSIZE(%esp)
#define STACK_INCY 44 + STACKSIZE(%esp)
#define BUFFER 48 + STACKSIZE(%esp)
#define I %eax
#define J %ebx
@@ -101,6 +106,8 @@
PROLOGUE
subl $ARGS,%esp
pushl %ebp
pushl %edi
pushl %esi
@@ -108,8 +115,36 @@
PROFCODE
movl STACK_X, X
movl X,XX
movl A,J
movl J,AA # backup A
movl M,J
movl J,MMM # mov M to MMM
.L0t:
xorl J,J
addl $1,J
sall $21,J # J=2^21*sizeof(double)=buffer size(16MB)
subl $4, J # Don't use last 4 double in the buffer.
subl J,MMM # MMM=MMM-J
movl J,M
jge .L00t
ALIGN_4
movl MMM,%eax
addl J,%eax
jle .L999x
movl %eax,M
.L00t:
movl XX,%eax
movl %eax, X
movl AA,%eax
movl %eax,A # mov AA to A
movl STACK_LDA, LDA
movl STACK_X, X
movl STACK_INCX, INCX
movl STACK_INCY, INCY
@@ -117,6 +152,7 @@
leal (,INCY, SIZE), INCY
leal (,LDA, SIZE), LDA
subl $-16 * SIZE, A
cmpl $0, N
@@ -560,10 +596,22 @@
ALIGN_4
.L999:
movl M,J
leal (,J,SIZE),%eax
addl %eax,AA
movl STACK_INCX,INCX
imull INCX,%eax
addl %eax,XX
jmp .L0t
ALIGN_4
.L999x:
popl %ebx
popl %esi
popl %edi
popl %ebp
addl $ARGS,%esp
ret
EPILOGUE

View File

@@ -74,11 +74,11 @@
#else
movl %eax, %ecx
subl $32, %ecx
cmovg %ecx, %eax
cmovge %ecx, %eax
movl %edx, %ecx
subl $32, %ecx
cmovg %ecx, %edx
cmovge %ecx, %edx
subl %eax, %edx
movl $0, %eax

View File

@@ -269,7 +269,7 @@
sarl $5, I
jle .L113
#if defined(BARCELONA)
#if defined(BARCELONA) || defined(BULLDOZER)
movaps %xmm0, %xmm1
mulps -32 * SIZE(X), %xmm1

View File

@@ -253,7 +253,7 @@
sarl $4, I
jle .L113
#if defined(BARCELONA)
#if defined(BARCELONA) || defined(BULLDOZER)
movaps %xmm0, %xmm1
mulpd -16 * SIZE(X), %xmm1

View File

@@ -69,7 +69,7 @@
#define STACK_ALIGN 4096
#define STACK_OFFSET 1024
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
#define PREFETCH prefetch
#define PREFETCHSIZE (8 * 10 + 4)
#endif
@@ -439,7 +439,7 @@
.L22:
mulsd %xmm0, %xmm2
addsd %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movlpd 2 * SIZE(BB), %xmm2
@@ -488,7 +488,7 @@
movlpd 40 * SIZE(BB), %xmm3
addsd %xmm0, %xmm7
movlpd 8 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
PREFETCH (PREFETCHSIZE + 8) * SIZE(AA)
#endif
mulsd %xmm1, %xmm2
@@ -1697,7 +1697,7 @@
.L42:
mulpd %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
mulpd 2 * SIZE(BB), %xmm0
@@ -1727,7 +1727,7 @@
addpd %xmm0, %xmm7
movapd 16 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA)
#endif
mulpd %xmm1, %xmm2

View File

@@ -64,7 +64,7 @@
#define BORIG 60(%esp)
#define BUFFER 128(%esp)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
#define PREFETCH prefetch
#define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 10 + 8)
@@ -437,7 +437,7 @@
.L32:
mulss %xmm0, %xmm2
addss %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movss 4 * SIZE(BB), %xmm2
@@ -833,7 +833,7 @@
.L22:
mulps %xmm0, %xmm2
addps %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movaps 4 * SIZE(BB), %xmm2
@@ -1848,7 +1848,7 @@
.L72:
mulss %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
mulss 4 * SIZE(BB), %xmm0
@@ -2109,7 +2109,7 @@
ALIGN_4
.L62:
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
@@ -2429,7 +2429,7 @@
.L52:
mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
mulps 4 * SIZE(BB), %xmm0
@@ -2459,7 +2459,7 @@
addps %xmm0, %xmm5
movaps 32 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif
mulps %xmm1, %xmm2
@@ -2952,7 +2952,7 @@
.L112:
mulss %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movss 1 * SIZE(AA), %xmm0
@@ -3148,7 +3148,7 @@
.L102:
mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movsd 2 * SIZE(AA), %xmm0
@@ -3389,7 +3389,7 @@
.L92:
mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movaps 4 * SIZE(AA), %xmm0
@@ -3404,7 +3404,7 @@
mulps 12 * SIZE(BB), %xmm0
addps %xmm0, %xmm7
movaps 32 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif
mulps %xmm1, %xmm3

View File

@@ -69,7 +69,7 @@
#define STACK_ALIGN 4096
#define STACK_OFFSET 1024
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
#define PREFETCH prefetch
#define PREFETCHSIZE (8 * 10 + 4)
#endif
@@ -910,7 +910,7 @@
.L22:
mulsd %xmm0, %xmm2
addsd %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movlpd 2 * SIZE(BB), %xmm2
@@ -959,7 +959,7 @@
movlpd 40 * SIZE(BB), %xmm3
addsd %xmm0, %xmm7
movlpd 8 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
PREFETCH (PREFETCHSIZE + 8) * SIZE(AA)
#endif
mulsd %xmm1, %xmm2
@@ -1439,7 +1439,7 @@
.L42:
mulpd %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
mulpd 2 * SIZE(BB), %xmm0
@@ -1469,7 +1469,7 @@
addpd %xmm0, %xmm7
movapd 16 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA)
#endif
mulpd %xmm1, %xmm2

View File

@@ -64,7 +64,7 @@
#define BORIG 60(%esp)
#define BUFFER 128(%esp)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
#define PREFETCH prefetch
#define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 10 + 8)
@@ -872,7 +872,7 @@
.L22:
mulps %xmm0, %xmm2
addps %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movaps 4 * SIZE(BB), %xmm2
@@ -1316,7 +1316,7 @@
.L32:
mulss %xmm0, %xmm2
addss %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movss 4 * SIZE(BB), %xmm2
@@ -1855,7 +1855,7 @@
.L52:
mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
mulps 4 * SIZE(BB), %xmm0
@@ -1885,7 +1885,7 @@
addps %xmm0, %xmm5
movaps 32 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif
mulps %xmm1, %xmm2
@@ -2249,7 +2249,7 @@
ALIGN_4
.L62:
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
@@ -2562,7 +2562,7 @@
.L72:
mulss %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
mulss 4 * SIZE(BB), %xmm0
@@ -2957,7 +2957,7 @@
.L92:
mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movaps 4 * SIZE(AA), %xmm0
@@ -2972,7 +2972,7 @@
mulps 12 * SIZE(BB), %xmm0
addps %xmm0, %xmm7
movaps 32 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif
mulps %xmm1, %xmm3
@@ -3280,7 +3280,7 @@
.L102:
mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movsd 2 * SIZE(AA), %xmm0
@@ -3515,7 +3515,7 @@
.L112:
mulss %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movss 1 * SIZE(AA), %xmm0

View File

@@ -69,7 +69,7 @@
#define STACK_ALIGN 4096
#define STACK_OFFSET 1024
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
#define PREFETCH prefetch
#define PREFETCHSIZE (8 * 10 + 4)
#endif
@@ -1036,7 +1036,7 @@
.L42:
mulpd %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
mulpd 2 * SIZE(BB), %xmm0
@@ -1066,7 +1066,7 @@
addpd %xmm0, %xmm7
movapd 16 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA)
#endif
mulpd %xmm1, %xmm2
@@ -2224,7 +2224,7 @@
.L22:
mulsd %xmm0, %xmm2
addsd %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movlpd 2 * SIZE(BB), %xmm2
@@ -2273,7 +2273,7 @@
movlpd 40 * SIZE(BB), %xmm3
addsd %xmm0, %xmm7
movlpd 8 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
PREFETCH (PREFETCHSIZE + 8) * SIZE(AA)
#endif
mulsd %xmm1, %xmm2

View File

@@ -64,7 +64,7 @@
#define BORIG 60(%esp)
#define BUFFER 128(%esp)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
#define PREFETCH prefetch
#define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 10 + 8)
@@ -439,7 +439,7 @@
.L92:
mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movaps 4 * SIZE(AA), %xmm0
@@ -454,7 +454,7 @@
mulps 12 * SIZE(BB), %xmm0
addps %xmm0, %xmm7
movaps 32 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif
mulps %xmm1, %xmm3
@@ -758,7 +758,7 @@
.L102:
mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movsd 2 * SIZE(AA), %xmm0
@@ -993,7 +993,7 @@
.L112:
mulss %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movss 1 * SIZE(AA), %xmm0
@@ -1324,7 +1324,7 @@
.L52:
mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
mulps 4 * SIZE(BB), %xmm0
@@ -1354,7 +1354,7 @@
addps %xmm0, %xmm5
movaps 32 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif
mulps %xmm1, %xmm2
@@ -1718,7 +1718,7 @@
ALIGN_4
.L62:
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
@@ -2031,7 +2031,7 @@
.L72:
mulss %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
mulss 4 * SIZE(BB), %xmm0
@@ -2859,7 +2859,7 @@
.L22:
mulps %xmm0, %xmm2
addps %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movaps 4 * SIZE(BB), %xmm2
@@ -3303,7 +3303,7 @@
.L32:
mulss %xmm0, %xmm2
addss %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movss 4 * SIZE(BB), %xmm2

View File

@@ -1541,6 +1541,16 @@
popl %ebx
popl %esi
popl %edi
/*remove the hidden return value address from the stack.*/
#if defined(OS_WINNT) || defined(OS_CYGWIN_NT) || defined(OS_INTERIX)
#ifdef MS_ABI
/* For MingW GCC >= 4.7. It is compatible with MSVC ABI. http://gcc.gnu.org/bugzilla/show_bug.cgi?id=36834 */
ret
#else
/* remove the hidden return value address from the stack. For MingW GCC < 4.7 */
ret $0x4
#endif
#else
/*remove the hidden return value address from the stack on Linux.*/
ret $0x4
#endif
EPILOGUE

View File

@@ -74,7 +74,7 @@
#define BB %ecx
#define LDC %ebp
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
#define movsd movlps
#endif
@@ -625,7 +625,7 @@
.L22:
mulps %xmm0, %xmm2
addps %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movsd 4 * SIZE(BB), %xmm2
@@ -870,7 +870,7 @@
.L32:
mulss %xmm0, %xmm2
addss %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movss 4 * SIZE(BB), %xmm2
@@ -1173,7 +1173,7 @@
.L52:
mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
mulps 4 * SIZE(BB), %xmm0
@@ -1203,7 +1203,7 @@
addps %xmm0, %xmm5
movaps 32 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif
mulps %xmm1, %xmm2
@@ -1359,7 +1359,7 @@
ALIGN_4
.L62:
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
@@ -1536,7 +1536,7 @@
.L72:
mulss %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
mulss 4 * SIZE(BB), %xmm0
@@ -1794,7 +1794,7 @@
.L92:
mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movaps 4 * SIZE(AA), %xmm0
@@ -1809,7 +1809,7 @@
mulps 12 * SIZE(BB), %xmm0
addps %xmm0, %xmm7
movaps 32 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif
mulps %xmm1, %xmm3
@@ -1936,7 +1936,7 @@
.L102:
mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movsd 2 * SIZE(AA), %xmm0
@@ -2069,7 +2069,7 @@
.L112:
mulss %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movss 1 * SIZE(AA), %xmm0

View File

@@ -71,7 +71,7 @@
#define movsd movlps
#endif
#ifdef BARCELONA
#if defined(BARCELONA) || defined(BULLDOZER)
#define PREFETCH prefetchnta
#define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 5)
@@ -89,18 +89,23 @@
#endif
#define STACKSIZE 16
#define ARGS 20
#define M 4 + STACKSIZE(%esp)
#define N 8 + STACKSIZE(%esp)
#define ALPHA_R 16 + STACKSIZE(%esp)
#define ALPHA_I 20 + STACKSIZE(%esp)
#define A 24 + STACKSIZE(%esp)
#define STACK_LDA 28 + STACKSIZE(%esp)
#define STACK_X 32 + STACKSIZE(%esp)
#define STACK_INCX 36 + STACKSIZE(%esp)
#define Y 40 + STACKSIZE(%esp)
#define STACK_INCY 44 + STACKSIZE(%esp)
#define BUFFER 48 + STACKSIZE(%esp)
#define M 4 + STACKSIZE+ARGS(%esp)
#define N 8 + STACKSIZE+ARGS(%esp)
#define ALPHA_R 16 + STACKSIZE+ARGS(%esp)
#define ALPHA_I 20 + STACKSIZE+ARGS(%esp)
#define A 24 + STACKSIZE+ARGS(%esp)
#define STACK_LDA 28 + STACKSIZE+ARGS(%esp)
#define STACK_X 32 + STACKSIZE+ARGS(%esp)
#define STACK_INCX 36 + STACKSIZE+ARGS(%esp)
#define Y 40 + STACKSIZE+ARGS(%esp)
#define STACK_INCY 44 + STACKSIZE+ARGS(%esp)
#define BUFFER 48 + STACKSIZE+ARGS(%esp)
#define MMM 0+ARGS(%esp)
#define YY 4+ARGS(%esp)
#define AA 8+ARGS(%esp)
#define I %eax
#define J %ebx
@@ -123,6 +128,7 @@
PROLOGUE
subl $ARGS,%esp
pushl %ebp
pushl %edi
pushl %esi
@@ -130,6 +136,33 @@
PROFCODE
movl Y,J
movl J,YY
movl A,J
movl J,AA
movl M,J
movl J,MMM
.L0t:
xorl J,J
addl $1,J
sall $20,J
subl J,MMM
movl J,M
jge .L00t
ALIGN_3
movl MMM,%eax
addl J,%eax
jle .L999x
movl %eax,M
.L00t:
movl AA,%eax
movl %eax,A
movl YY,J
movl J,Y
movl STACK_LDA, LDA
movl STACK_X, X
movl STACK_INCX, INCX
@@ -595,10 +628,21 @@
ALIGN_3
.L999:
movl M,%eax
sall $ZBASE_SHIFT,%eax
addl %eax,AA
movl STACK_INCY,INCY
imull INCY,%eax
addl %eax,YY
jmp .L0t
ALIGN_3
.L999x:
popl %ebx
popl %esi
popl %edi
popl %ebp
addl $ARGS,%esp
ret
EPILOGUE

View File

@@ -58,7 +58,7 @@
#define movsd movlps
#endif
#ifdef BARCELONA
#if defined(BARCELONA) || defined(BULLDOZER)
#define PREFETCH prefetchnta
#define PREFETCHW prefetchw
#define PREFETCHSIZE (8 * 5)
@@ -76,18 +76,23 @@
#endif
#define STACKSIZE 16
#define ARGS 16
#define M 4 + STACKSIZE+ARGS(%esp)
#define N 8 + STACKSIZE+ARGS(%esp)
#define ALPHA_R 16 + STACKSIZE+ARGS(%esp)
#define ALPHA_I 24 + STACKSIZE+ARGS(%esp)
#define A 32 + STACKSIZE+ARGS(%esp)
#define STACK_LDA 36 + STACKSIZE+ARGS(%esp)
#define STACK_X 40 + STACKSIZE+ARGS(%esp)
#define STACK_INCX 44 + STACKSIZE+ARGS(%esp)
#define Y 48 + STACKSIZE+ARGS(%esp)
#define STACK_INCY 52 + STACKSIZE+ARGS(%esp)
#define BUFFER 56 + STACKSIZE+ARGS(%esp)
#define MMM 0 + ARGS(%esp)
#define YY 4 + ARGS(%esp)
#define AA 8 + ARGS(%esp)
#define M 4 + STACKSIZE(%esp)
#define N 8 + STACKSIZE(%esp)
#define ALPHA_R 16 + STACKSIZE(%esp)
#define ALPHA_I 24 + STACKSIZE(%esp)
#define A 32 + STACKSIZE(%esp)
#define STACK_LDA 36 + STACKSIZE(%esp)
#define STACK_X 40 + STACKSIZE(%esp)
#define STACK_INCX 44 + STACKSIZE(%esp)
#define Y 48 + STACKSIZE(%esp)
#define STACK_INCY 52 + STACKSIZE(%esp)
#define BUFFER 56 + STACKSIZE(%esp)
#define I %eax
#define J %ebx
@@ -110,6 +115,7 @@
PROLOGUE
subl $ARGS,%esp
pushl %ebp
pushl %edi
pushl %esi
@@ -117,6 +123,33 @@
PROFCODE
movl Y,J
movl J,YY
movl A,J
movl J,AA
movl M,J
movl J,MMM
.L0t:
xorl J,J
addl $1,J
sall $18,J
subl J,MMM
movl J,M
jge .L00t
ALIGN_3
movl MMM,%eax
addl J,%eax
jle .L999x
movl %eax,M
.L00t:
movl AA,%eax
movl %eax,A
movl YY,J
movl J,Y
movl STACK_LDA, LDA
movl STACK_X, X
movl STACK_INCX, INCX
@@ -458,10 +491,21 @@
ALIGN_3
.L999:
movl M,%eax
sall $ZBASE_SHIFT,%eax
addl %eax,AA
movl STACK_INCY,INCY
imull INCY,%eax
addl %eax,YY
jmp .L0t
ALIGN_3
.L999x:
popl %ebx
popl %esi
popl %edi
popl %ebp
addl $ARGS,%esp
ret
EPILOGUE

View File

@@ -71,7 +71,7 @@
#define movsd movlps
#endif
#ifdef BARCELONA
#if defined(BARCELONA) || defined(BULLDOZER)
#define PREFETCH prefetchnta
#define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 5)
@@ -89,18 +89,23 @@
#endif
#define STACKSIZE 16
#define ARGS 20
#define M 4 + STACKSIZE(%esp)
#define N 8 + STACKSIZE(%esp)
#define ALPHA_R 16 + STACKSIZE(%esp)
#define ALPHA_I 20 + STACKSIZE(%esp)
#define A 24 + STACKSIZE(%esp)
#define STACK_LDA 28 + STACKSIZE(%esp)
#define STACK_X 32 + STACKSIZE(%esp)
#define STACK_INCX 36 + STACKSIZE(%esp)
#define Y 40 + STACKSIZE(%esp)
#define STACK_INCY 44 + STACKSIZE(%esp)
#define BUFFER 48 + STACKSIZE(%esp)
#define M 4 + STACKSIZE+ARGS(%esp)
#define N 8 + STACKSIZE+ARGS(%esp)
#define ALPHA_R 16 + STACKSIZE+ARGS(%esp)
#define ALPHA_I 20 + STACKSIZE+ARGS(%esp)
#define A 24 + STACKSIZE+ARGS(%esp)
#define STACK_LDA 28 + STACKSIZE+ARGS(%esp)
#define STACK_X 32 + STACKSIZE+ARGS(%esp)
#define STACK_INCX 36 + STACKSIZE+ARGS(%esp)
#define Y 40 + STACKSIZE+ARGS(%esp)
#define STACK_INCY 44 + STACKSIZE+ARGS(%esp)
#define BUFFER 48 + STACKSIZE+ARGS(%esp)
#define MMM 0+ARGS(%esp)
#define XX 4+ARGS(%esp)
#define AA 8+ARGS(%esp)
#define I %eax
#define J %ebx
@@ -123,6 +128,7 @@
PROLOGUE
subl $ARGS,%esp
pushl %ebp
pushl %edi
pushl %esi
@@ -130,8 +136,35 @@
PROFCODE
movl STACK_LDA, LDA
movl STACK_X, X
movl X,XX
movl A,J
movl J,AA #backup A
movl M,J
movl J,MMM
.L0t:
xorl J,J
addl $1,J
sall $20,J
subl $8,J
subl J,MMM #MMM-=J
movl J,M
jge .L00t
ALIGN_4
movl MMM,%eax
addl J,%eax
jle .L999x
movl %eax,M
.L00t:
movl AA,%eax
movl %eax,A
movl XX,%eax
movl %eax,X
movl STACK_LDA,LDA
movl STACK_INCX, INCX
movl STACK_INCY, INCY
@@ -513,10 +546,22 @@
ALIGN_4
.L999:
movl M,%eax
sall $ZBASE_SHIFT, %eax
addl %eax,AA
movl STACK_INCX,INCX
imull INCX,%eax
addl %eax,XX
jmp .L0t
ALIGN_4
.L999x:
popl %ebx
popl %esi
popl %edi
popl %ebp
addl $ARGS,%esp
ret
EPILOGUE

View File

@@ -58,7 +58,7 @@
#define movsd movlps
#endif
#ifdef BARCELONA
#if defined(BARCELONA) || defined(BULLDOZER)
#define PREFETCH prefetchnta
#define PREFETCHW prefetchw
#define PREFETCHSIZE (8 * 5)
@@ -76,19 +76,24 @@
#endif
#define STACKSIZE 16
#define ARGS 20
#define M 4 + STACKSIZE+ARGS(%esp)
#define N 8 + STACKSIZE+ARGS(%esp)
#define ALPHA_R 16 + STACKSIZE+ARGS(%esp)
#define ALPHA_I 24 + STACKSIZE+ARGS(%esp)
#define A 32 + STACKSIZE+ARGS(%esp)
#define STACK_LDA 36 + STACKSIZE+ARGS(%esp)
#define STACK_X 40 + STACKSIZE+ARGS(%esp)
#define STACK_INCX 44 + STACKSIZE+ARGS(%esp)
#define Y 48 + STACKSIZE+ARGS(%esp)
#define STACK_INCY 52 + STACKSIZE+ARGS(%esp)
#define BUFFER 56 + STACKSIZE+ARGS(%esp)
#define MMM 0 + ARGS(%esp)
#define AA 4 + ARGS(%esp)
#define XX 8 + ARGS(%esp)
#define M 4 + STACKSIZE(%esp)
#define N 8 + STACKSIZE(%esp)
#define ALPHA_R 16 + STACKSIZE(%esp)
#define ALPHA_I 24 + STACKSIZE(%esp)
#define A 32 + STACKSIZE(%esp)
#define STACK_LDA 36 + STACKSIZE(%esp)
#define STACK_X 40 + STACKSIZE(%esp)
#define STACK_INCX 44 + STACKSIZE(%esp)
#define Y 48 + STACKSIZE(%esp)
#define STACK_INCY 52 + STACKSIZE(%esp)
#define BUFFER 56 + STACKSIZE(%esp)
#define I %eax
#define J %ebx
@@ -110,6 +115,7 @@
PROLOGUE
subl $ARGS,%esp
pushl %ebp
pushl %edi
pushl %esi
@@ -117,8 +123,35 @@
PROFCODE
movl STACK_X, X
movl X, XX
movl A,J
movl J,AA
movl M,J
movl J,MMM
.L0t:
xorl J,J
addl $1,J
sall $18,J
subl $4,J
subl J,MMM
movl J,M
jge .L00t
ALIGN_4
movl MMM,%eax
addl J,%eax
jle .L999x
movl %eax, M
.L00t:
movl XX, %eax
movl %eax, X
movl AA,%eax
movl %eax,A
movl STACK_LDA, LDA
movl STACK_X, X
movl STACK_INCX, INCX
movl STACK_INCY, INCY
@@ -188,7 +221,7 @@
movl Y, Y1
movl N, J
ALIGN_3
ALIGN_4
.L11:
movl BUFFER, X
@@ -395,10 +428,21 @@
ALIGN_4
.L999:
movl M,%eax
sall $ZBASE_SHIFT,%eax
addl %eax,AA
movl STACK_INCX,INCX
imull INCX,%eax
addl %eax,XX
jmp .L0t
ALIGN_4
.L999x:
popl %ebx
popl %esi
popl %edi
popl %ebp
addl $ARGS,%esp
ret
EPILOGUE

View File

@@ -75,7 +75,7 @@
#define STACK_ALIGN 4096
#define STACK_OFFSET 1024
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
#define PREFETCHSIZE (16 * 10 + 8)
#define WPREFETCHSIZE 112
#define PREFETCH prefetch
@@ -533,7 +533,7 @@
addps %xmm0, %xmm7
movsd 16 * SIZE(AA), %xmm0
mulps %xmm1, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA)
#endif
addps %xmm2, %xmm4

View File

@@ -75,7 +75,7 @@
#define STACK_ALIGN 4096
#define STACK_OFFSET 1024
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
#define PREFETCHSIZE (16 * 10 + 8)
#define WPREFETCHSIZE 112
#define PREFETCH prefetch
@@ -994,7 +994,7 @@
addps %xmm0, %xmm7
movsd 16 * SIZE(AA), %xmm0
mulps %xmm1, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA)
#endif
addps %xmm2, %xmm4

View File

@@ -75,7 +75,7 @@
#define STACK_ALIGN 4096
#define STACK_OFFSET 1024
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
#define PREFETCHSIZE (16 * 10 + 8)
#define WPREFETCHSIZE 112
#define PREFETCH prefetch
@@ -1820,7 +1820,7 @@
addps %xmm0, %xmm7
movsd 16 * SIZE(AA), %xmm0
mulps %xmm1, %xmm2
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA)
#endif
addps %xmm2, %xmm4

View File

@@ -0,0 +1,70 @@
ZGEMVNKERNEL = zgemv_n_dup.S
ZGEMVTKERNEL = zgemv_t_dup.S
DGEMVNKERNEL = dgemv_n_bulldozer.S
DGEMVTKERNEL = dgemv_t_bulldozer.S
DAXPYKERNEL = daxpy_bulldozer.S
DDOTKERNEL = ddot_bulldozer.S
DCOPYKERNEL = dcopy_bulldozer.S
SGEMMKERNEL = sgemm_kernel_16x2_bulldozer.S
SGEMMINCOPY = ../generic/gemm_ncopy_16.c
SGEMMITCOPY = ../generic/gemm_tcopy_16.c
SGEMMONCOPY = gemm_ncopy_2_bulldozer.S
SGEMMOTCOPY = gemm_tcopy_2_bulldozer.S
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
DGEMMKERNEL = dgemm_kernel_8x2_bulldozer.S
DGEMMINCOPY = dgemm_ncopy_8_bulldozer.S
DGEMMITCOPY = dgemm_tcopy_8_bulldozer.S
DGEMMONCOPY = gemm_ncopy_2_bulldozer.S
DGEMMOTCOPY = gemm_tcopy_2_bulldozer.S
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
CGEMMKERNEL = cgemm_kernel_4x2_bulldozer.S
CGEMMINCOPY = ../generic/zgemm_ncopy_4.c
CGEMMITCOPY = ../generic/zgemm_tcopy_4.c
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
ZGEMMKERNEL = zgemm_kernel_2x2_bulldozer.S
ZGEMMINCOPY =
ZGEMMITCOPY =
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
ZGEMMINCOPYOBJ =
ZGEMMITCOPYOBJ =
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
CGEMM3MKERNEL = zgemm3m_kernel_8x4_barcelona.S
ZGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

View File

@@ -0,0 +1,70 @@
ZGEMVNKERNEL = zgemv_n_dup.S
ZGEMVTKERNEL = zgemv_t_dup.S
DGEMVNKERNEL = dgemv_n_bulldozer.S
DGEMVTKERNEL = dgemv_t_bulldozer.S
DAXPYKERNEL = daxpy_bulldozer.S
DDOTKERNEL = ddot_bulldozer.S
DCOPYKERNEL = dcopy_bulldozer.S
SGEMMKERNEL = sgemm_kernel_16x2_bulldozer.S
SGEMMINCOPY = ../generic/gemm_ncopy_16.c
SGEMMITCOPY = ../generic/gemm_tcopy_16.c
SGEMMONCOPY = gemm_ncopy_2_bulldozer.S
SGEMMOTCOPY = gemm_tcopy_2_bulldozer.S
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
DGEMMKERNEL = dgemm_kernel_8x2_bulldozer.S
DGEMMINCOPY = dgemm_ncopy_8_bulldozer.S
DGEMMITCOPY = dgemm_tcopy_8_bulldozer.S
DGEMMONCOPY = gemm_ncopy_2_bulldozer.S
DGEMMOTCOPY = gemm_tcopy_2_bulldozer.S
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
CGEMMKERNEL = cgemm_kernel_4x2_bulldozer.S
CGEMMINCOPY = ../generic/zgemm_ncopy_4.c
CGEMMITCOPY = ../generic/zgemm_tcopy_4.c
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
ZGEMMKERNEL = zgemm_kernel_2x2_bulldozer.S
ZGEMMINCOPY =
ZGEMMITCOPY =
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
ZGEMMINCOPYOBJ =
ZGEMMITCOPYOBJ =
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
CGEMM3MKERNEL = zgemm3m_kernel_8x4_barcelona.S
ZGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

View File

@@ -69,7 +69,7 @@
#endif
movaps %xmm0, ALPHA
#else
movaps %xmm3, ALPHA
movq 40(%rsp), X
movq 48(%rsp), INCX
@@ -79,6 +79,10 @@
SAVEREGISTERS
#ifdef WINDOWS_ABI
movaps %xmm3, ALPHA
#endif
shufps $0, ALPHA, ALPHA
leaq (, INCX, SIZE), INCX

View File

@@ -69,7 +69,6 @@
#endif
movaps %xmm0, ALPHA
#else
movaps %xmm3, ALPHA
movq 40(%rsp), X
movq 48(%rsp), INCX
@@ -79,6 +78,10 @@
SAVEREGISTERS
#ifdef WINDOWS_ABI
movaps %xmm3, ALPHA
#endif
unpcklpd ALPHA, ALPHA
leaq (, INCX, SIZE), INCX

File diff suppressed because it is too large Load Diff

View File

@@ -47,14 +47,22 @@
#ifndef WINDOWS_ABI
#define STACKSIZE 64
#define STACKSIZE 128
#define OLD_INCX 8 + STACKSIZE(%rsp)
#define OLD_Y 16 + STACKSIZE(%rsp)
#define OLD_INCY 24 + STACKSIZE(%rsp)
#define OLD_BUFFER 32 + STACKSIZE(%rsp)
#define ALPHA 48 (%rsp)
#define MMM 64(%rsp)
#define NN 72(%rsp)
#define AA 80(%rsp)
#define XX 88(%rsp)
#define LDAX 96(%rsp)
#define ALPHAR 104(%rsp)
#define ALPHAI 112(%rsp)
#define M %rdi
#define N %rsi
#define A %rcx
@@ -66,7 +74,7 @@
#else
#define STACKSIZE 256
#define STACKSIZE 288
#define OLD_ALPHA_I 40 + STACKSIZE(%rsp)
#define OLD_A 48 + STACKSIZE(%rsp)
@@ -78,6 +86,14 @@
#define OLD_BUFFER 96 + STACKSIZE(%rsp)
#define ALPHA 224 (%rsp)
#define MMM 232(%rsp)
#define NN 240(%rsp)
#define AA 248(%rsp)
#define XX 256(%rsp)
#define LDAX 264(%rsp)
#define ALPHAR 272(%rsp)
#define ALPHAI 280(%rsp)
#define M %rcx
#define N %rdx
#define A %r8
@@ -142,9 +158,37 @@
movaps %xmm3, %xmm0
movss OLD_ALPHA_I, %xmm1
#endif
movq A, AA
movq N, NN
movq M, MMM
movq LDA, LDAX
movq X, XX
movq OLD_Y, Y
movss %xmm0,ALPHAR
movss %xmm1,ALPHAI
.L0t:
xorq I,I
addq $1,I
salq $20,I
subq I,MMM
movq I,M
movss ALPHAR,%xmm0
movss ALPHAI,%xmm1
jge .L00t
movq MMM,M
addq I,M
jle .L999x
.L00t:
movq AA, A
movq NN, N
movq LDAX, LDA
movq XX, X
movq OLD_INCX, INCX
movq OLD_Y, Y
# movq OLD_Y, Y
movq OLD_INCY, INCY
movq OLD_BUFFER, BUFFER
@@ -4274,6 +4318,11 @@
ALIGN_3
.L999:
movq M, I
salq $ZBASE_SHIFT,I
addq I,AA
jmp .L0t
.L999x:
movq 0(%rsp), %rbx
movq 8(%rsp), %rbp
movq 16(%rsp), %r12

View File

@@ -47,13 +47,19 @@
#ifndef WINDOWS_ABI
#define STACKSIZE 64
#define STACKSIZE 128
#define OLD_INCX 8 + STACKSIZE(%rsp)
#define OLD_Y 16 + STACKSIZE(%rsp)
#define OLD_INCY 24 + STACKSIZE(%rsp)
#define OLD_BUFFER 32 + STACKSIZE(%rsp)
#define ALPHA 48 (%rsp)
#define MMM 64(%rsp)
#define NN 72(%rsp)
#define AA 80(%rsp)
#define LDAX 88(%rsp)
#define ALPHAR 96(%rsp)
#define ALPHAI 104(%rsp)
#define M %rdi
#define N %rsi
@@ -66,7 +72,7 @@
#else
#define STACKSIZE 256
#define STACKSIZE 288
#define OLD_ALPHA_I 40 + STACKSIZE(%rsp)
#define OLD_A 48 + STACKSIZE(%rsp)
@@ -78,6 +84,13 @@
#define OLD_BUFFER 96 + STACKSIZE(%rsp)
#define ALPHA 224 (%rsp)
#define MMM 232(%rsp)
#define NN 240(%rsp)
#define AA 248(%rsp)
#define LDAX 256(%rsp)
#define ALPHAR 264(%rsp)
#define ALPHAI 272(%rsp)
#define M %rcx
#define N %rdx
#define A %r8
@@ -144,6 +157,32 @@
movss OLD_ALPHA_I, %xmm1
#endif
movq A, AA
movq N, NN
movq M, MMM
movq LDA, LDAX
movss %xmm0,ALPHAR
movss %xmm1,ALPHAI
.L0t:
xorq I,I
addq $1,I
salq $20,I
subq I,MMM
movq I,M
movss ALPHAR,%xmm0
movss ALPHAI,%xmm1
jge .L00t
movq MMM,M
addq I,M
jle .L999x
.L00t:
movq AA, A
movq NN, N
movq LDAX, LDA
movq OLD_INCX, INCX
movq OLD_Y, Y
movq OLD_INCY, INCY
@@ -4350,6 +4389,11 @@
ALIGN_3
.L999:
movq M, I
salq $ZBASE_SHIFT,I
addq I,AA
jmp .L0t
.L999x:
movq 0(%rsp), %rbx
movq 8(%rsp), %rbp
movq 16(%rsp), %r12

View File

@@ -0,0 +1,408 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#define ASSEMBLER
#include "common.h"
#ifndef WINDOWS_ABI
#define M ARG1
#define X ARG4
#define INCX ARG5
#define Y ARG6
#define INCY ARG2
#else
#define M ARG1
#define X ARG2
#define INCX ARG3
#define Y ARG4
#define INCY %r10
#endif
#define YY %r11
#define ALPHA %xmm15
#define A_PRE 640
#include "l1param.h"
PROLOGUE
PROFCODE
#ifndef WINDOWS_ABI
#ifndef XDOUBLE
movq 8(%rsp), INCY
#else
movq 24(%rsp), INCY
#endif
vmovups %xmm0, ALPHA
#else
vmovups %xmm3, ALPHA
movq 40(%rsp), X
movq 48(%rsp), INCX
movq 56(%rsp), Y
movq 64(%rsp), INCY
#endif
SAVEREGISTERS
unpcklpd ALPHA, ALPHA
leaq (, INCX, SIZE), INCX
leaq (, INCY, SIZE), INCY
testq M, M
jle .L47
cmpq $SIZE, INCX
jne .L40
cmpq $SIZE, INCY
jne .L40
testq $SIZE, Y
je .L10
movsd (X), %xmm0
mulsd ALPHA, %xmm0
addsd (Y), %xmm0
movsd %xmm0, (Y)
addq $1 * SIZE, X
addq $1 * SIZE, Y
decq M
jle .L19
ALIGN_4
.L10:
subq $-16 * SIZE, X
subq $-16 * SIZE, Y
movq M, %rax
sarq $4, %rax
jle .L13
vmovups -16 * SIZE(X), %xmm0
vmovups -14 * SIZE(X), %xmm1
vmovups -12 * SIZE(X), %xmm2
vmovups -10 * SIZE(X), %xmm3
decq %rax
jle .L12
ALIGN_3
.L11:
prefetchnta A_PRE(Y)
vmovups -8 * SIZE(X), %xmm4
vfmaddpd -16 * SIZE(Y), ALPHA, %xmm0 , %xmm0
vfmaddpd -14 * SIZE(Y), ALPHA, %xmm1 , %xmm1
vmovups -6 * SIZE(X), %xmm5
vmovups -4 * SIZE(X), %xmm6
vfmaddpd -12 * SIZE(Y), ALPHA, %xmm2 , %xmm2
vfmaddpd -10 * SIZE(Y), ALPHA, %xmm3 , %xmm3
vmovups -2 * SIZE(X), %xmm7
vmovups %xmm0, -16 * SIZE(Y)
vmovups %xmm1, -14 * SIZE(Y)
prefetchnta A_PRE(X)
nop
vmovups %xmm2, -12 * SIZE(Y)
vmovups %xmm3, -10 * SIZE(Y)
prefetchnta A_PRE+64(Y)
vmovups 0 * SIZE(X), %xmm0
vfmaddpd -8 * SIZE(Y), ALPHA, %xmm4 , %xmm4
vfmaddpd -6 * SIZE(Y), ALPHA, %xmm5 , %xmm5
vmovups 2 * SIZE(X), %xmm1
vmovups 4 * SIZE(X), %xmm2
vfmaddpd -4 * SIZE(Y), ALPHA, %xmm6 , %xmm6
vfmaddpd -2 * SIZE(Y), ALPHA, %xmm7 , %xmm7
vmovups 6 * SIZE(X), %xmm3
vmovups %xmm4, -8 * SIZE(Y)
vmovups %xmm5, -6 * SIZE(Y)
prefetchnta A_PRE+64(X)
nop
vmovups %xmm6, -4 * SIZE(Y)
vmovups %xmm7, -2 * SIZE(Y)
subq $-16 * SIZE, Y
subq $-16 * SIZE, X
decq %rax
jg .L11
ALIGN_3
.L12:
vmovups -8 * SIZE(X), %xmm4
vfmaddpd -16 * SIZE(Y), ALPHA, %xmm0 , %xmm0
vfmaddpd -14 * SIZE(Y), ALPHA, %xmm1 , %xmm1
vmovups -6 * SIZE(X), %xmm5
vmovups -4 * SIZE(X), %xmm6
vfmaddpd -12 * SIZE(Y), ALPHA, %xmm2 , %xmm2
vfmaddpd -10 * SIZE(Y), ALPHA, %xmm3 , %xmm3
vmovups -2 * SIZE(X), %xmm7
vmovups %xmm0, -16 * SIZE(Y)
vmovups %xmm1, -14 * SIZE(Y)
vmovups %xmm2, -12 * SIZE(Y)
vmovups %xmm3, -10 * SIZE(Y)
vfmaddpd -8 * SIZE(Y), ALPHA, %xmm4 , %xmm4
vfmaddpd -6 * SIZE(Y), ALPHA, %xmm5 , %xmm5
vfmaddpd -4 * SIZE(Y), ALPHA, %xmm6 , %xmm6
vfmaddpd -2 * SIZE(Y), ALPHA, %xmm7 , %xmm7
vmovups %xmm4, -8 * SIZE(Y)
vmovups %xmm5, -6 * SIZE(Y)
vmovups %xmm6, -4 * SIZE(Y)
vmovups %xmm7, -2 * SIZE(Y)
subq $-16 * SIZE, Y
subq $-16 * SIZE, X
ALIGN_3
.L13:
movq M, %rax
andq $8, %rax
jle .L14
ALIGN_3
vmovups -16 * SIZE(X), %xmm0
vmovups -14 * SIZE(X), %xmm1
vmovups -12 * SIZE(X), %xmm2
vmovups -10 * SIZE(X), %xmm3
vfmaddpd -16 * SIZE(Y), ALPHA, %xmm0 , %xmm0
vfmaddpd -14 * SIZE(Y), ALPHA, %xmm1 , %xmm1
vfmaddpd -12 * SIZE(Y), ALPHA, %xmm2 , %xmm2
vfmaddpd -10 * SIZE(Y), ALPHA, %xmm3 , %xmm3
vmovups %xmm0, -16 * SIZE(Y)
vmovups %xmm1, -14 * SIZE(Y)
vmovups %xmm2, -12 * SIZE(Y)
vmovups %xmm3, -10 * SIZE(Y)
addq $8 * SIZE, X
addq $8 * SIZE, Y
ALIGN_3
.L14:
movq M, %rax
andq $4, %rax
jle .L15
ALIGN_3
vmovups -16 * SIZE(X), %xmm0
vmovups -14 * SIZE(X), %xmm1
vfmaddpd -16 * SIZE(Y), ALPHA, %xmm0 , %xmm0
vfmaddpd -14 * SIZE(Y), ALPHA, %xmm1 , %xmm1
vmovups %xmm0, -16 * SIZE(Y)
vmovups %xmm1, -14 * SIZE(Y)
addq $4 * SIZE, X
addq $4 * SIZE, Y
ALIGN_3
.L15:
movq M, %rax
andq $2, %rax
jle .L16
ALIGN_3
vmovups -16 * SIZE(X), %xmm0
vfmaddpd -16 * SIZE(Y), ALPHA, %xmm0 , %xmm0
vmovups %xmm0, -16 * SIZE(Y)
addq $2 * SIZE, X
addq $2 * SIZE, Y
ALIGN_3
.L16:
movq M, %rax
andq $1, %rax
jle .L19
ALIGN_3
vmovsd -16 * SIZE(X), %xmm0
vfmaddsd -16 * SIZE(Y), ALPHA, %xmm0 , %xmm0
vmovsd %xmm0, -16 * SIZE(Y)
ALIGN_3
.L19:
xorq %rax,%rax
RESTOREREGISTERS
ret
ALIGN_3
.L40:
movq Y, YY
movq M, %rax
//If incx==0 || incy==0, avoid unloop.
cmpq $0, INCX
je .L46
cmpq $0, INCY
je .L46
sarq $3, %rax
jle .L45
prefetchnta 512(X)
prefetchnta 512+64(X)
prefetchnta 512+128(X)
prefetchnta 512+192(X)
prefetchnta 512(Y)
prefetchnta 512+64(Y)
prefetchnta 512+128(Y)
prefetchnta 512+192(Y)
ALIGN_3
.L41:
vmovsd 0 * SIZE(X), %xmm0
addq INCX, X
vmovhpd 0 * SIZE(X), %xmm0 , %xmm0
addq INCX, X
vmovsd 0 * SIZE(YY), %xmm6
addq INCY, YY
vmovhpd 0 * SIZE(YY), %xmm6 , %xmm6
addq INCY, YY
vmovsd 0 * SIZE(X), %xmm1
addq INCX, X
vmovhpd 0 * SIZE(X), %xmm1 , %xmm1
addq INCX, X
vmovsd 0 * SIZE(YY), %xmm7
addq INCY, YY
vmovhpd 0 * SIZE(YY), %xmm7 , %xmm7
addq INCY, YY
vfmaddpd %xmm6 , ALPHA , %xmm0 , %xmm0
vmovsd 0 * SIZE(X), %xmm2
addq INCX, X
vmovhpd 0 * SIZE(X), %xmm2 , %xmm2
addq INCX, X
vmovsd 0 * SIZE(YY), %xmm8
addq INCY, YY
vmovhpd 0 * SIZE(YY), %xmm8 , %xmm8
addq INCY, YY
vfmaddpd %xmm7 , ALPHA , %xmm1 , %xmm1
vmovsd 0 * SIZE(X), %xmm3
addq INCX, X
vmovhpd 0 * SIZE(X), %xmm3 , %xmm3
addq INCX, X
vfmaddpd %xmm8 , ALPHA , %xmm2 , %xmm2
vmovsd 0 * SIZE(YY), %xmm9
addq INCY, YY
vmovhpd 0 * SIZE(YY), %xmm9 , %xmm9
addq INCY, YY
vmovsd %xmm0, 0 * SIZE(Y)
addq INCY, Y
vmovhpd %xmm0, 0 * SIZE(Y)
addq INCY, Y
vmovsd %xmm1, 0 * SIZE(Y)
addq INCY, Y
vmovhpd %xmm1, 0 * SIZE(Y)
addq INCY, Y
vmovsd %xmm2, 0 * SIZE(Y)
addq INCY, Y
vmovhpd %xmm2, 0 * SIZE(Y)
addq INCY, Y
vfmaddpd %xmm9 , ALPHA , %xmm3 , %xmm3
vmovsd %xmm3, 0 * SIZE(Y)
addq INCY, Y
vmovhpd %xmm3, 0 * SIZE(Y)
addq INCY, Y
decq %rax
jg .L41
ALIGN_3
.L45:
movq M, %rax
andq $7, %rax
jle .L47
ALIGN_3
.L46:
vmovsd (X), %xmm0
addq INCX, X
vfmaddsd (Y) , ALPHA , %xmm0 , %xmm0
vmovsd %xmm0, (Y)
addq INCY, Y
decq %rax
jg .L46
ALIGN_3
.L47:
xorq %rax, %rax
RESTOREREGISTERS
ret
EPILOGUE

View File

@@ -0,0 +1,291 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#define ASSEMBLER
#include "common.h"
#define M ARG1 /* rdi */
#define X ARG2 /* rsi */
#define INCX ARG3 /* rdx */
#define Y ARG4 /* rcx */
#ifndef WINDOWS_ABI
#define INCY ARG5 /* r8 */
#else
#define INCY %r10
#endif
#include "l1param.h"
#define VLOAD(OFFSET, ADDR, REG) vmovups OFFSET(ADDR), REG
#define VSHUFPD_1(REG1 , REG2) vshufpd $0x01, REG1, REG2, REG2
#define A_PRE 640
#define B_PRE 640
PROLOGUE
PROFCODE
#ifdef WINDOWS_ABI
movq 40(%rsp), INCY
#endif
SAVEREGISTERS
leaq (, INCX, SIZE), INCX
leaq (, INCY, SIZE), INCY
cmpq $SIZE, INCX
jne .L40
cmpq $SIZE, INCY
jne .L40
testq $SIZE, X
je .L10
vmovsd (X), %xmm0
vmovsd %xmm0, (Y)
addq $1 * SIZE, X
addq $1 * SIZE, Y
decq M
jle .L19
ALIGN_4
.L10:
subq $-16 * SIZE, X
subq $-16 * SIZE, Y
movq M, %rax
sarq $4, %rax
jle .L13
vmovups -16 * SIZE(X), %xmm0
vmovups -14 * SIZE(X), %xmm1
vmovups -12 * SIZE(X), %xmm2
vmovups -10 * SIZE(X), %xmm3
vmovups -8 * SIZE(X), %xmm4
vmovups -6 * SIZE(X), %xmm5
vmovups -4 * SIZE(X), %xmm6
vmovups -2 * SIZE(X), %xmm7
decq %rax
jle .L12
ALIGN_4
.L11:
prefetchnta A_PRE(X)
nop
vmovups %xmm0, -16 * SIZE(Y)
vmovups %xmm1, -14 * SIZE(Y)
prefetchnta B_PRE(Y)
nop
vmovups %xmm2, -12 * SIZE(Y)
vmovups %xmm3, -10 * SIZE(Y)
VLOAD( 0 * SIZE, X, %xmm0)
VLOAD( 2 * SIZE, X, %xmm1)
VLOAD( 4 * SIZE, X, %xmm2)
VLOAD( 6 * SIZE, X, %xmm3)
prefetchnta A_PRE+64(X)
nop
vmovups %xmm4, -8 * SIZE(Y)
vmovups %xmm5, -6 * SIZE(Y)
prefetchnta B_PRE+64(Y)
nop
vmovups %xmm6, -4 * SIZE(Y)
vmovups %xmm7, -2 * SIZE(Y)
VLOAD( 8 * SIZE, X, %xmm4)
VLOAD(10 * SIZE, X, %xmm5)
subq $-16 * SIZE, Y
VLOAD(12 * SIZE, X, %xmm6)
VLOAD(14 * SIZE, X, %xmm7)
subq $-16 * SIZE, X
decq %rax
jg .L11
ALIGN_3
.L12:
vmovups %xmm0, -16 * SIZE(Y)
vmovups %xmm1, -14 * SIZE(Y)
vmovups %xmm2, -12 * SIZE(Y)
vmovups %xmm3, -10 * SIZE(Y)
vmovups %xmm4, -8 * SIZE(Y)
vmovups %xmm5, -6 * SIZE(Y)
vmovups %xmm6, -4 * SIZE(Y)
vmovups %xmm7, -2 * SIZE(Y)
subq $-16 * SIZE, Y
subq $-16 * SIZE, X
ALIGN_3
.L13:
testq $8, M
jle .L14
ALIGN_3
vmovups -16 * SIZE(X), %xmm0
vmovups -14 * SIZE(X), %xmm1
vmovups -12 * SIZE(X), %xmm2
vmovups -10 * SIZE(X), %xmm3
vmovups %xmm0, -16 * SIZE(Y)
vmovups %xmm1, -14 * SIZE(Y)
vmovups %xmm2, -12 * SIZE(Y)
vmovups %xmm3, -10 * SIZE(Y)
addq $8 * SIZE, X
addq $8 * SIZE, Y
ALIGN_3
.L14:
testq $4, M
jle .L15
ALIGN_3
vmovups -16 * SIZE(X), %xmm0
vmovups -14 * SIZE(X), %xmm1
vmovups %xmm0, -16 * SIZE(Y)
vmovups %xmm1, -14 * SIZE(Y)
addq $4 * SIZE, X
addq $4 * SIZE, Y
ALIGN_3
.L15:
testq $2, M
jle .L16
ALIGN_3
vmovups -16 * SIZE(X), %xmm0
vmovups %xmm0, -16 * SIZE(Y)
addq $2 * SIZE, X
addq $2 * SIZE, Y
ALIGN_3
.L16:
testq $1, M
jle .L19
ALIGN_3
vmovsd -16 * SIZE(X), %xmm0
vmovsd %xmm0, -16 * SIZE(Y)
ALIGN_3
.L19:
xorq %rax,%rax
RESTOREREGISTERS
ret
ALIGN_3
.L40:
movq M, %rax
sarq $3, %rax
jle .L45
ALIGN_3
.L41:
vmovsd (X), %xmm0
addq INCX, X
vmovsd (X), %xmm4
addq INCX, X
vmovsd (X), %xmm1
addq INCX, X
vmovsd (X), %xmm5
addq INCX, X
vmovsd (X), %xmm2
addq INCX, X
vmovsd (X), %xmm6
addq INCX, X
vmovsd (X), %xmm3
addq INCX, X
vmovsd (X), %xmm7
addq INCX, X
vmovsd %xmm0, (Y)
addq INCY, Y
vmovsd %xmm4, (Y)
addq INCY, Y
vmovsd %xmm1, (Y)
addq INCY, Y
vmovsd %xmm5, (Y)
addq INCY, Y
vmovsd %xmm2, (Y)
addq INCY, Y
vmovsd %xmm6, (Y)
addq INCY, Y
vmovsd %xmm3, (Y)
addq INCY, Y
vmovsd %xmm7, (Y)
addq INCY, Y
decq %rax
jg .L41
ALIGN_3
.L45:
movq M, %rax
andq $7, %rax
jle .L47
ALIGN_3
.L46:
vmovsd (X), %xmm0
addq INCX, X
vmovsd %xmm0, (Y)
addq INCY, Y
decq %rax
jg .L46
ALIGN_3
.L47:
xorq %rax, %rax
RESTOREREGISTERS
ret
EPILOGUE

View File

@@ -0,0 +1,311 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#define ASSEMBLER
#include "common.h"
#define N ARG1 /* rdi */
#define X ARG2 /* rsi */
#define INCX ARG3 /* rdx */
#define Y ARG4 /* rcx */
#ifndef WINDOWS_ABI
#define INCY ARG5 /* r8 */
#else
#define INCY %r10
#endif
#define A_PRE 512
#include "l1param.h"
PROLOGUE
PROFCODE
#ifdef WINDOWS_ABI
movq 40(%rsp), INCY
#endif
SAVEREGISTERS
leaq (, INCX, SIZE), INCX
leaq (, INCY, SIZE), INCY
vxorps %xmm0, %xmm0 , %xmm0
vxorps %xmm1, %xmm1 , %xmm1
vxorps %xmm2, %xmm2 , %xmm2
vxorps %xmm3, %xmm3 , %xmm3
cmpq $0, N
jle .L999
cmpq $SIZE, INCX
jne .L50
cmpq $SIZE, INCY
jne .L50
subq $-16 * SIZE, X
subq $-16 * SIZE, Y
testq $SIZE, Y
je .L10
vmovsd -16 * SIZE(X), %xmm0
vmulsd -16 * SIZE(Y), %xmm0 , %xmm0
addq $1 * SIZE, X
addq $1 * SIZE, Y
decq N
ALIGN_2
.L10:
movq N, %rax
sarq $4, %rax
jle .L14
vmovups -16 * SIZE(X), %xmm4
vmovups -14 * SIZE(X), %xmm5
vmovups -12 * SIZE(X), %xmm6
vmovups -10 * SIZE(X), %xmm7
vmovups -8 * SIZE(X), %xmm8
vmovups -6 * SIZE(X), %xmm9
vmovups -4 * SIZE(X), %xmm10
vmovups -2 * SIZE(X), %xmm11
decq %rax
jle .L12
ALIGN_3
.L11:
prefetchnta A_PRE(Y)
vfmaddpd %xmm0 , -16 * SIZE(Y), %xmm4 , %xmm0
vfmaddpd %xmm1 , -14 * SIZE(Y), %xmm5 , %xmm1
prefetchnta A_PRE(X)
vfmaddpd %xmm2 , -12 * SIZE(Y), %xmm6 , %xmm2
vfmaddpd %xmm3 , -10 * SIZE(Y), %xmm7 , %xmm3
vmovups 0 * SIZE(X), %xmm4
vfmaddpd %xmm0 , -8 * SIZE(Y), %xmm8 , %xmm0
vfmaddpd %xmm1 , -6 * SIZE(Y), %xmm9 , %xmm1
vmovups 2 * SIZE(X), %xmm5
vmovups 4 * SIZE(X), %xmm6
vfmaddpd %xmm2 , -4 * SIZE(Y), %xmm10, %xmm2
vfmaddpd %xmm3 , -2 * SIZE(Y), %xmm11, %xmm3
vmovups 6 * SIZE(X), %xmm7
prefetchnta A_PRE+64(Y)
vmovups 8 * SIZE(X), %xmm8
vmovups 10 * SIZE(X), %xmm9
prefetchnta A_PRE+64(X)
vmovups 12 * SIZE(X), %xmm10
vmovups 14 * SIZE(X), %xmm11
subq $-16 * SIZE, X
subq $-16 * SIZE, Y
decq %rax
jg .L11
ALIGN_3
.L12:
vfmaddpd %xmm0 , -16 * SIZE(Y), %xmm4 , %xmm0
vfmaddpd %xmm1 , -14 * SIZE(Y), %xmm5 , %xmm1
vfmaddpd %xmm2 , -12 * SIZE(Y), %xmm6 , %xmm2
vfmaddpd %xmm3 , -10 * SIZE(Y), %xmm7 , %xmm3
vfmaddpd %xmm0 , -8 * SIZE(Y), %xmm8 , %xmm0
vfmaddpd %xmm1 , -6 * SIZE(Y), %xmm9 , %xmm1
vfmaddpd %xmm2 , -4 * SIZE(Y), %xmm10, %xmm2
vfmaddpd %xmm3 , -2 * SIZE(Y), %xmm11, %xmm3
subq $-16 * SIZE, X
subq $-16 * SIZE, Y
ALIGN_3
.L14:
testq $15, N
jle .L999
testq $8, N
jle .L15
vmovups -16 * SIZE(X), %xmm4
vmovups -14 * SIZE(X), %xmm5
vmovups -12 * SIZE(X), %xmm6
vmovups -10 * SIZE(X), %xmm7
vfmaddpd %xmm0 , -16 * SIZE(Y), %xmm4 , %xmm0
vfmaddpd %xmm1 , -14 * SIZE(Y), %xmm5 , %xmm1
vfmaddpd %xmm2 , -12 * SIZE(Y), %xmm6 , %xmm2
vfmaddpd %xmm3 , -10 * SIZE(Y), %xmm7 , %xmm3
addq $8 * SIZE, X
addq $8 * SIZE, Y
ALIGN_3
.L15:
testq $4, N
jle .L16
vmovups -16 * SIZE(X), %xmm4
vmovups -14 * SIZE(X), %xmm5
vfmaddpd %xmm0 , -16 * SIZE(Y), %xmm4 , %xmm0
vfmaddpd %xmm1 , -14 * SIZE(Y), %xmm5 , %xmm1
addq $4 * SIZE, X
addq $4 * SIZE, Y
ALIGN_3
.L16:
testq $2, N
jle .L17
vmovups -16 * SIZE(X), %xmm4
vfmaddpd %xmm0 , -16 * SIZE(Y), %xmm4 , %xmm0
addq $2 * SIZE, X
addq $2 * SIZE, Y
ALIGN_3
.L17:
testq $1, N
jle .L999
vmovsd -16 * SIZE(X), %xmm4
vmovsd -16 * SIZE(Y), %xmm5
vfmaddpd %xmm0, %xmm4 , %xmm5 , %xmm0
jmp .L999
ALIGN_3
.L50:
movq N, %rax
sarq $3, %rax
jle .L55
ALIGN_3
.L53:
vmovsd 0 * SIZE(X), %xmm4
addq INCX, X
vmovsd 0 * SIZE(Y), %xmm8
addq INCY, Y
vmovsd 0 * SIZE(X), %xmm5
addq INCX, X
vmovsd 0 * SIZE(Y), %xmm9
addq INCY, Y
vmovsd 0 * SIZE(X), %xmm6
addq INCX, X
vmovsd 0 * SIZE(Y), %xmm10
addq INCY, Y
vmovsd 0 * SIZE(X), %xmm7
addq INCX, X
vmovsd 0 * SIZE(Y), %xmm11
addq INCY, Y
vfmaddpd %xmm0 , %xmm4 , %xmm8 , %xmm0
vfmaddpd %xmm1 , %xmm5 , %xmm9 , %xmm1
vfmaddpd %xmm2 , %xmm6 , %xmm10, %xmm2
vfmaddpd %xmm3 , %xmm7 , %xmm11, %xmm3
vmovsd 0 * SIZE(X), %xmm4
addq INCX, X
vmovsd 0 * SIZE(Y), %xmm8
addq INCY, Y
vmovsd 0 * SIZE(X), %xmm5
addq INCX, X
vmovsd 0 * SIZE(Y), %xmm9
addq INCY, Y
vmovsd 0 * SIZE(X), %xmm6
addq INCX, X
vmovsd 0 * SIZE(Y), %xmm10
addq INCY, Y
vmovsd 0 * SIZE(X), %xmm7
addq INCX, X
vmovsd 0 * SIZE(Y), %xmm11
addq INCY, Y
vfmaddpd %xmm0 , %xmm4 , %xmm8 , %xmm0
vfmaddpd %xmm1 , %xmm5 , %xmm9 , %xmm1
vfmaddpd %xmm2 , %xmm6 , %xmm10, %xmm2
vfmaddpd %xmm3 , %xmm7 , %xmm11, %xmm3
decq %rax
jg .L53
ALIGN_3
.L55:
movq N, %rax
andq $7, %rax
jle .L999
ALIGN_3
.L56:
vmovsd 0 * SIZE(X), %xmm4
addq INCX, X
vmovsd 0 * SIZE(Y), %xmm8
addq INCY, Y
vfmaddpd %xmm0 , %xmm4 , %xmm8 , %xmm0
decq %rax
jg .L56
ALIGN_3
.L999:
vaddpd %xmm1, %xmm0 , %xmm0
vaddpd %xmm3, %xmm2 , %xmm2
vaddpd %xmm2, %xmm0 , %xmm0
vhaddpd %xmm0, %xmm0 , %xmm0
RESTOREREGISTERS
ret
EPILOGUE

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

Some files were not shown because too many files have changed in this diff Show More