Compare commits

..

199 Commits

Author SHA1 Message Date
Martin Kroeker
de8fff671d Revert "Use usleep instead of sched_yield by default" 2018-06-11 17:05:27 +02:00
Martin Kroeker
6f71c0fce4 Return a somewhat sane default value for L2 cache size if cpuid retur… (#1611)
* Return a somewhat sane default value for L2 cache size if cpuid returned something unexpected

Fixes #1610, the KVM hypervisor on Google Chromebooks returning zero for CPUID  0x80000006, causing DYNAMIC_ARCH
builds of OpenBLAS to hang
2018-06-11 13:26:19 +02:00
Martin Kroeker
3313e4b946 Merge pull request #1608 from martin-frbg/issue874
Enable parallel make on MS Windows by default
2018-06-09 19:57:33 +02:00
Martin Kroeker
e9cd11768c Enable parallel make on MS Windows by default
fixes #874
2018-06-09 17:54:36 +02:00
Martin Kroeker
0297b3211a Merge pull request #1605 from oon3m0oo/develop
Improve performance of GEMM for small matrices when SMP is defined.
2018-06-09 12:42:34 +02:00
Craig Donner
66316b9f4c Improve performance of GEMM for small matrices when SMP is defined.
Always checking num_cpu_avail() regardless of whether threading will actually
be used adds noticeable overhead for small matrices.  Most other uses of
num_cpu_avail() do so only if threading will be used, so do the same here.
2018-06-07 15:29:13 +01:00
Martin Kroeker
6adc4b7b36 Merge pull request #1601 from martin-frbg/zaxpy
Use a single thread for small input size in zaxpy
2018-06-07 14:09:58 +02:00
Martin Kroeker
2ade0ef085 Merge pull request #1600 from martin-frbg/noyield
Use usleep instead of sched_yield by default
2018-06-07 12:42:00 +02:00
Martin Kroeker
e8880c1699 Use a single thread for small input size
copies daxpy improvement from #27, see #1560
2018-06-07 10:26:55 +02:00
Martin Kroeker
ed7c4a043b Use usleep instead of sched_yield by default
sched_yield only burns cpu cycles, fixes #900,  see also #923, #1560
2018-06-07 10:18:26 +02:00
Martin Kroeker
cf234a0561 Merge pull request #1589 from fenrus75/skylakex
Initial support for SkylakeX / AVX512
2018-06-06 22:07:09 +02:00
Martin Kroeker
ae2a33128b Merge pull request #1599 from martin-frbg/c_check_avx512
Improved AVX512 test case for c_check
2018-06-06 18:42:42 +02:00
Martin Kroeker
e4718b1fee Better AVX512 test case 2018-06-06 16:51:30 +02:00
Martin Kroeker
9b87b64262 Improve AVX512 testcase
clang 3.4 managed to accept the original test code, only to fail on the actual Skylake asm later
2018-06-06 16:49:00 +02:00
Martin Kroeker
0218b884c1 Merge pull request #1598 from martin-frbg/issue1593-2
Restore _Atomic define before stdatomic.h for old gcc
2018-06-06 12:48:26 +02:00
Martin Kroeker
83da278093 Update common.h 2018-06-06 09:27:49 +02:00
Martin Kroeker
358d4df2bd Merge branch 'develop' into issue1593-2 2018-06-06 09:21:41 +02:00
Martin Kroeker
06d43760e4 Restore _Atomic define before stdatomic.h for old gcc
see #1593
2018-06-06 09:18:10 +02:00
Martin Kroeker
a4af8861ff Merge pull request #1597 from martin-frbg/cmake-avx512
Check build system support for AVX512 instructions
2018-06-06 07:22:20 +02:00
Martin Kroeker
7fb62aed7e Check build system support for AVX512 instructions 2018-06-05 23:29:33 +02:00
Martin Kroeker
f6021c798d Re-enable QUIET_MAKE 2018-06-05 19:09:38 +02:00
Martin Kroeker
e8002536ec disable quiet_make for the moment 2018-06-05 18:23:01 +02:00
Martin Kroeker
ce6317f6c0 Merge pull request #1594 from martin-frbg/issue1593
Fix inverted condition in _Atomic declaration
2018-06-05 16:02:51 +02:00
Martin Kroeker
15a78d6b66 export NO_AVX512 setting 2018-06-05 15:58:34 +02:00
Martin Kroeker
354a976a59 Fix inverted condition in _Atomic declaration
fixes #1593
2018-06-05 10:31:34 +02:00
Martin Kroeker
38ad05bd04 Extend loop range to find SkylakeX in force_coretype 2018-06-05 10:26:49 +02:00
Martin Kroeker
b7feded85a Propagate NO_AVX512 via CCOMMON_OPT 2018-06-05 10:24:05 +02:00
Martin Kroeker
dc9fe05ab5 Update cpuid_x86.c 2018-06-04 17:10:19 +02:00
Martin Kroeker
8be027e4c6 Update dynamic.c 2018-06-04 14:36:39 +02:00
Martin Kroeker
ac7b6e3e9a Fix misplaced endif 2018-06-04 08:23:40 +02:00
Martin Kroeker
fc66a0ec0b Merge pull request #1590 from martin-frbg/avx512_check
Disable AVX512 (Skylake X) support if the build system is too old
2018-06-04 08:18:38 +02:00
Arjan van de Ven
89372e0993 Use AVX512 also for DGEMM
this required switching to the generic gemm_beta code (which is faster anyway on SKX)
for both DGEMM and SGEMM

The performance gain for the not-yet-retuned version is in the 30% range
2018-06-03 22:17:27 +00:00
Martin Kroeker
ef626c6824 typo fix 2018-06-04 00:13:19 +02:00
Martin Kroeker
83fec56a3f Disable AVX512 (Skylake X) support if the build system is too old 2018-06-04 00:01:11 +02:00
Martin Kroeker
5a51cf4576 Separate Skylake X from Skylake 2018-06-03 23:41:33 +02:00
Martin Kroeker
5a92b311e0 Separate Skylake X from Skylake 2018-06-03 23:29:07 +02:00
Martin Kroeker
a7d0f49cec Add SKYLAKEX to DYNAMIC_CORE list only if AVX512 is available 2018-06-03 23:13:25 +02:00
Martin Kroeker
f1fb9a4745 Propagate NO_AVX512 if needed 2018-06-03 13:48:27 +02:00
Martin Kroeker
0023515733 Typo fix (misplaced parenthesis) 2018-06-03 13:22:59 +02:00
Arjan van de Ven
99c7bba8e4 Initial support for SkylakeX / AVX512
This patch adds the basic infrastructure for adding the SkylakeX (Intel Skylake server)
target. The SkylakeX target will use the AVX512 (AVX512VL level) instruction set,
which brings 2 basic things:
1) 512 bit wide SIMD (2x width of AVX2)
2) 32 SIMD registers (2x the number on AVX2)

This initial patch only contains a trivial transformation of the Haswell SGEMM kernel
to AVX512VL; more will follow later but this patch aims to get the infrastructure
in place for this "later".

Full performance tuning has not been done yet; with more registers and wider SIMD
it's in theory possible to retune the kernels but even without that there's an
interesting enough performance increase (30-40% range) with just this change.
2018-06-03 07:58:52 +00:00
Martin Kroeker
36c4523d85 Merge pull request #1587 from matthew-brett/fix-compile-error-early-glibc
Revert "take out unused variables"
2018-06-02 10:02:38 +02:00
Matthew Brett
a8002e283a Revert "take out unused variables"
This reverts commit e5752ff9b3.

The variables i and n are used in the `#if !__GLIBC_PREREQ(2, 7)`
branch.

Closes gh-1586.
2018-06-01 23:20:00 +01:00
Martin Kroeker
401adddb2b Merge pull request #1585 from martin-frbg/lapack-253
Fixes from Lapack-Reference PR 253
2018-06-01 18:59:33 +02:00
Martin Kroeker
c5b13d4e10 Fixes from netlib PR 253 2018-06-01 15:14:45 +02:00
Martin Kroeker
677e42d7b0 Fixes from netlib PR 253
When minimal workspace is given in ?hesv_aa, ?sysv_aa, ?hesv_aa_2stage, ?sysv_aa_2stage, now no error is given
Quick return for ?laqr1
2018-06-01 15:12:59 +02:00
Martin Kroeker
e2a8c35e5a Fixes from netlib PR253
LAPACKE interfaces for Aasen's functions now call ?sytrf_aa and ?hetrf_aa instead of ?sytrf and ?hetrf
2018-06-01 15:08:14 +02:00
Martin Kroeker
1a49fb1c05 Merge pull request #1584 from martin-frbg/issue1503
Work around name clash with Windows10's winnt.h
2018-05-31 21:56:04 +02:00
Martin Kroeker
8562d5787a Merge pull request #1583 from martin-frbg/issue1575
Handle INCX=0,INCY=0 case
2018-05-31 21:55:26 +02:00
Martin Kroeker
93f1eb09c3 Merge pull request #1582 from martin-frbg/develop-031
Update version number on the develop branch to 0.3.1.dev
2018-05-31 21:55:07 +02:00
Martin Kroeker
c90bbda3df Merge pull request #1581 from martin-frbg/issue1574-2
Fix paths to LIN and EIG tests
2018-05-31 21:54:45 +02:00
Martin Kroeker
7df8c4f76f typo fix 2018-05-31 17:23:08 +02:00
Martin Kroeker
2fc748bf72 Restore optimized swap kernel now that we have a proper fix 2018-05-31 13:41:12 +02:00
Martin Kroeker
a91f1587b9 Work around name clash with Windows10's winnt.h
fixes #1503
2018-05-31 13:26:00 +02:00
Martin Kroeker
d1b7be14aa Handle INCX=0,INCY=0 case
Fixes #1575 (sswap/dswap failing the swap utest on x86) as suggested by atsampson.
2018-05-31 12:52:04 +02:00
Martin Kroeker
b491b10057 Update version to 0.3.1.dev 2018-05-31 12:44:36 +02:00
Martin Kroeker
5fae96fb70 Update version to 0.3.1.dev 2018-05-31 12:43:45 +02:00
Martin Kroeker
a7dbd4c57d Fix paths to LIN and EIG tests
should fix #1574
2018-05-31 11:19:33 +02:00
Martin Kroeker
2cae104b5e Merge pull request #1579 from martin-frbg/issue1574
Adapt lapack-test and blas-test to changes in netlib directory layout
2018-05-29 22:02:06 +02:00
Martin Kroeker
908d40be71 Adapt lapack-test and blas-test to changes in netlib directory layout
partial fix for #1574 - the problem with lapack_testing.py looks like an upstream bug
2018-05-29 14:27:46 +02:00
Zhang Xianyi
43e592ceb3 Add -lm for Android.
Conflicts:
	exports/Makefile
2018-05-24 21:02:42 +08:00
Martin Kroeker
f0f27868d8 Merge pull request #1572 from martin-frbg/issue1571
Use the new zrot.c on POWER8 for crot as well
2018-05-23 22:55:37 +02:00
Martin Kroeker
961d25e9c7 Use the new zrot.c on POWER8 for crot as well
fixes #1571 (the old zrot.S assembly does not handle incx=0 correctly)
2018-05-23 22:54:39 +02:00
Martin Kroeker
f5959f2543 Merge pull request #1567 from martin-frbg/mipstrmm
Revert " Switch mips32 target to USE_TRMM to fix complex TRMM"
2018-05-17 20:50:23 +02:00
Martin Kroeker
82012b960b Revert " Switch mips32 target to USE_TRMM to fix complex TRMM"
... as it was just a silly workaround for the issue seen in #1563, caused by #1419
2018-05-17 20:30:03 +02:00
Martin Kroeker
8dd3515fa2 Merge pull request #1565 from martin-frbg/mipstypo
Remove extraneous brace from previous commit of mips dsdot fix
2018-05-17 20:22:58 +02:00
Martin Kroeker
95f7f0229c Remove extraneous brace from previous commit 2018-05-17 18:43:59 +02:00
Martin Kroeker
5082fe4306 Merge pull request #1564 from martin-frbg/issue1563
Revert changes from PR#1419
2018-05-17 14:04:13 +02:00
Martin Kroeker
7a7619af6d Revert changes from PR#1419
at least one of these changes apparently is an oversimplification, leading to TRMM breakage on some platforms as observed in #1563
2018-05-17 11:40:08 +02:00
Martin Kroeker
9a400b7014 Merge pull request #1562 from martin-frbg/issue1561
Use correct data type for initializers of v2f64, v4f32
2018-05-15 17:46:09 +02:00
Martin Kroeker
893b535540 Use correct data type for initializers of v2f64, v4f32
Fixes #1561
2018-05-15 14:42:12 +02:00
Martin Kroeker
6791294312 Merge pull request #1559 from martin-frbg/buildconf
Add build-time configuration options to pkgconfig file
2018-05-14 18:49:53 +02:00
Martin Kroeker
ddb8b124de Merge pull request #1558 from martin-frbg/instpc
Overwrite any pre-existing openblas.pc rather than append to it
2018-05-14 17:38:12 +02:00
Martin Kroeker
191746c493 Merge pull request #1557 from martin-frbg/getconfig
Add threading and OpenMP information to output
2018-05-14 17:37:55 +02:00
Martin Kroeker
eb9b021d38 Add build-time configuration options to pkgconfig file 2018-05-14 00:10:15 +02:00
Martin Kroeker
7d7564568c Add build-time configuration options to pkgconfig file 2018-05-14 00:09:35 +02:00
Martin Kroeker
a07843bc93 Overwrite any pre-existing openblas.pc rather than append to it 2018-05-12 22:11:27 +02:00
Martin Kroeker
41ae8e8d67 Add threading and OpenMP information to output
For #1416 and #1529, more information about the options OpenBLAS was built with is needed. Additionally we may want to add this data to the openblas.pc file (but not all projects use pkgconfig, and as far as I am aware the cmake module for accessing it does not make such "private" declarations available)
2018-05-12 12:11:38 +02:00
Zhang Xianyi
9c1aa0b0fe Merge pull request #1556 from WestAlgo/develop
move _Atomic define to common.h
2018-05-11 17:02:47 +08:00
zhiyong.dang
53457f222f move _Atomic define to common.h 2018-05-11 00:13:16 -07:00
Zhang Xianyi
458e3af5b1 Merge pull request #1555 from WestAlgo/develop
Change _STDC_VERSION__ to __STDC_VERSION__
2018-05-11 12:25:24 +08:00
Zhiyong Dang
3716267124 Change _STDC_VERSION__ to __STDC_VERSION__
Change-Id: Id3fa4e8d9eedd4ef7230df69b611e7f397301a42
2018-05-11 12:15:08 +08:00
Zhang Xianyi
50acc40613 Merge pull request #1536 from WestAlgo/develop
Fix race condition in blas_server_omp.c
2018-05-11 10:09:14 +08:00
Martin Kroeker
c720f1f019 Merge pull request #1554 from martin-frbg/lapack-249
LAPACKE fixes from lapack PR249
2018-05-10 15:32:08 +02:00
Martin Kroeker
d7d950fcf2 LAPACKE fixes from lapack PR249
Copied from Reference-LAPACK/lapack#249, this fixes out-of-bounds memory accesses
in the nancheck calls of the LAPACKE lacgv, lassq,larfg,larfb,larfx and mtr functions
2018-05-10 13:15:42 +02:00
Martin Kroeker
12398e53ce Merge pull request #1553 from martin-frbg/ifort-openmpflag
Change -openmp to -fopenmp for ifort entry as well
2018-05-09 14:39:52 +02:00
Martin Kroeker
193f835662 Change -openmp to -fopenmp for ifort entry as well 2018-05-09 12:34:09 +02:00
Martin Kroeker
7e3151ead7 Merge pull request #1551 from martin-frbg/f_check_fix
Fixes for ifort 2018
2018-05-09 09:02:52 +02:00
Martin Kroeker
e3a069f108 Merge pull request #1550 from martin-frbg/ifort-openmpflag
Update compiler flag for openmp use with ICC
2018-05-09 09:02:38 +02:00
Martin Kroeker
6fff8c626a Merge pull request #1549 from martin-frbg/fix_ompcheck
Drop C-style "L" suffix from OPENMP version number tests in the LAPACK source
2018-05-08 23:52:55 +02:00
Martin Kroeker
d2b9389f1b Fixes for ifort 2018
1. the already deprecated -openmp option was removed in 2018, switch to -fopenmp
2. add leading blank in search for "zho_ge__" symbol to work around misleading tags in the 2018 assembly
Expected to fix #1548
2018-05-08 21:55:37 +02:00
Martin Kroeker
65b8a5c5d8 Update compiler flag for openmp use with ICC
The deprecated -openmp option was finally removed in favor of -qopenmp or -fopenmp, picking the latter to stay compatible with Intel compiler versions before 2015 (when -q options were introduced). Fixes #1546
2018-05-08 21:47:10 +02:00
Martin Kroeker
9795adc7ef Drop C-style "L" suffix from OPENMP version number in check 2018-05-08 21:39:42 +02:00
Martin Kroeker
1a8e487c4a Drop C-style "L" suffix from OPENMP version number in check 2018-05-08 21:38:25 +02:00
Martin Kroeker
5966fd52a2 Drop C-style "L" suffix from OPENMP version number in check 2018-05-08 21:36:56 +02:00
Martin Kroeker
dbafe6357b Merge pull request #1543 from martin-frbg/mips32
Fix MIPS32 build and add MIPS 1004K cpu (MT7621 SOC)
2018-05-02 22:47:45 +02:00
Martin Kroeker
71051259e0 Restore compiler options for mips P5600 target 2018-05-02 20:37:06 +02:00
Martin Kroeker
73cc321190 Add MIPS 1004K target 2018-05-02 20:27:56 +02:00
Martin Kroeker
018f2dad27 Switch mips32 target to USE_TRMM to fix complex TRMM 2018-05-02 20:25:32 +02:00
Martin Kroeker
9d5098dbc9 Add MIPS 1004K target (Mediatek MT7621 SOC) 2018-05-02 20:20:44 +02:00
Martin Kroeker
d94d7baf7e Add mips32r2 api target 2018-05-02 20:17:26 +02:00
Martin Kroeker
3af1b5c805 Make cpuid_mips compile again and add 1004K cpu 2018-05-02 20:12:25 +02:00
Martin Kroeker
88e224f4c0 Merge pull request #1542 from martin-frbg/quickdiv64
Avoid out-of-bounds accesses in blas_quickdivide on big X86 systems
2018-05-02 18:11:50 +02:00
Martin Kroeker
d0c0506588 Omit the divide table overflow check on small systems 2018-05-02 14:44:50 +02:00
Martin Kroeker
e93355e5e1 Omit the table overflow check when building for small systems 2018-05-02 14:43:08 +02:00
Martin Kroeker
c1eb06e102 Update common_x86_64.h 2018-04-29 14:40:12 +02:00
Martin Kroeker
8145ecd70b Avoid out-of-bounds reads from blas_quick_divide_table on big systems 2018-04-29 14:38:55 +02:00
Martin Kroeker
26ce518d46 Avoid out of bounds reads from blas_quick_divide_table on big systems
Should fix #1541
2018-04-29 14:34:33 +02:00
Martin Kroeker
1d27fa8507 Merge pull request #1539 from martin-frbg/ztrmv-1332
Disable multithreading in ztrmv
2018-04-27 23:10:21 +02:00
Martin Kroeker
802cf6b22d Merge pull request #1486 from martin-frbg/atomic
Use _Atomic instead of volatile for thread safety where C11 is supported
2018-04-27 23:09:57 +02:00
Martin Kroeker
894433a7c7 Update Makefile.rule 2018-04-27 12:08:06 +02:00
Zhiyong Dang
1b83341d19 Fix race condition in blas_server_omp.c
Change-Id: Ic896276cd073d6b41930c7c5a29d66348cd1725d
2018-04-27 17:00:42 +08:00
Martin Kroeker
954f1832de Merge pull request #1540 from martin-frbg/mips32-zasum
Fix typo in MIPS P5600 complex ASUM code selection
2018-04-25 23:23:00 +02:00
Martin Kroeker
941ad280a8 Fix typo in MIPS P5600 complex ASUM code selection 2018-04-25 22:50:10 +02:00
Martin Kroeker
a8ed428bab Disable multithreading in ztrmv
BLAS-Tester shows that the same problem exists as with DTRMV (issue #1332)
2018-04-25 22:35:46 +02:00
Martin Kroeker
1da365312a Merge pull request #1538 from martin-frbg/arm7utest
Fix handling of zero INCX, INCY in ArmV7 AXPY and ROT
2018-04-25 08:38:58 +02:00
Martin Kroeker
2d0929fa7c Move the test for zero incx,incy in ARMV7 ROT
to pass the related utest (see #1469)
2018-04-24 22:43:00 +02:00
Martin Kroeker
125343cc88 Drop test for zero incx,incy in armv7 AXPY
...to pass the related utest (see #1469)
2018-04-24 22:39:50 +02:00
Martin Kroeker
8a3b6fa108 Use generic zrot.c on ppc64/POWER6 to work around utest failure from … (#1535)
* Use generic C implementation of zrot on ppc64/POWER6 to work around utest failure from #1469
2018-04-23 19:05:49 +02:00
Martin Kroeker
78694f1b7e Merge pull request #1534 from xianyi/revert-1333-haswell32
Revert "Fix 32bit HASWELL builds"
2018-04-22 23:34:17 +02:00
Martin Kroeker
9c5518319a Revert "Fix 32bit HASWELL builds" 2018-04-22 20:20:04 +02:00
Martin Kroeker
86f49c529d Merge pull request #1532 from martin-frbg/utest-cblas
Do not try to build the fork utest when NO_CBLAS=1
2018-04-20 23:44:15 +02:00
Martin Kroeker
625c74a38f fork utest depends on CBLAS 2018-04-20 15:43:59 +02:00
Martin Kroeker
5fcaca6438 fork utest depends on CBLAS 2018-04-20 15:42:13 +02:00
Martin Kroeker
4fcdd24459 Merge pull request #1530 from ashwinyes/develop_20180419_Tx2AutoDetect
ARM64: Enable Auto Detection of ThunderX2T99
2018-04-19 14:10:57 +02:00
Ashwin Sekhar T K
68a3c4fca6 ARM64: Enable Auto Detection of ThunderX2T99 2018-04-19 09:05:25 +00:00
Martin Kroeker
0c4718c57a Merge pull request #1523 from martin-frbg/utest_waith
Include sys/types.h for proper typedefs related to wait()
2018-04-15 13:09:30 +02:00
Martin Kroeker
f29389c7ac Merge pull request #1520 from martin-frbg/cpucounts
Catch invalid cpu count returned by CPU_COUNT_S
2018-04-14 22:24:34 +02:00
Martin Kroeker
734d7c6a93 Include sys/types.h for proper typedefs related to wait()
Should fix #1519
2018-04-14 18:59:46 +02:00
Martin Kroeker
7c861605b2 Catch invalid cpu count returned by CPU_COUNT_S
mips32 was seen to return zero here, driving nthreads to zero with subsequent fpe in blas_quickdivide
2018-04-14 18:29:10 +02:00
Martin Kroeker
2ca0faf495 Merge pull request #1515 from martin-frbg/mipsdot
Correct precision of mips dsdot
2018-04-11 08:21:25 +02:00
Martin Kroeker
0fe434598b Fix precision of mips dsdot 2018-04-10 23:30:59 +02:00
Martin Kroeker
15c437e092 Merge pull request #1512 from ararslan/aa/travis-macos-2
Add macOS to the Travis testing matrix: Take 2!
2018-04-07 23:31:26 +02:00
Alex Arslan
b966bd79d5 Add a BINARY=32 build to macOS 2018-04-07 12:29:57 -07:00
Alex Arslan
2e988dbf35 Add macOS to the Travis testing matrix 2018-04-07 10:56:34 -07:00
Martin Kroeker
be6090d396 Merge pull request #1511 from xianyi/revert-1510-aa/travis-macos
Revert "Add macOS to the Travis testing matrix"
2018-04-07 13:29:31 +02:00
Martin Kroeker
daae8fd197 Revert "Add macOS to the Travis testing matrix" 2018-04-07 13:27:24 +02:00
Martin Kroeker
20c6c38e51 Merge branch 'develop' into atomic 2018-04-07 12:09:39 +02:00
Martin Kroeker
a1fb7670f7 Merge pull request #1510 from ararslan/aa/travis-macos
Add macOS to the Travis testing matrix
2018-04-07 12:07:12 +02:00
Martin Kroeker
6c99c97489 Merge pull request #1509 from ararslan/aa/dragonfly
Add DragonFly to exports/Makefile
2018-04-07 12:06:57 +02:00
Alex Arslan
6a0930560e Add macOS to the Travis testing matrix 2018-04-06 17:53:58 -07:00
Alex Arslan
24f8d5b624 Add DragonFly to exports/Makefile
Its exclusion was an oversight on my part.
2018-04-06 17:30:10 -07:00
Martin Kroeker
77b4dbd53b Merge pull request #1506 from martin-frbg/issue1497
Fix thread races and infinite looping on systems with many cpus
2018-04-05 23:46:36 +02:00
Martin Kroeker
bc4c3bca01 Merge pull request #1507 from martin-frbg/threads_usage
Underline importance of NUM_THREADS setting for BUFFER allocation
2018-04-05 08:54:07 +02:00
Martin Kroeker
6b0a9d135c Merge pull request #1508 from ararslan/aa/wording
Minor changes to wording and formatting in the README
2018-04-05 08:53:38 +02:00
Alex Arslan
137ccd9dd9 Minor changes to wording and formatting in the README
The wording in some places is not grammatically correct. This change
also provides minor adjustments to the Markdown formatting which provide
modest improvements to readability.
2018-04-04 14:30:32 -07:00
Martin Kroeker
84923dedb7 Merge pull request #1505 from ararslan/aa/compiler
Compile with cc rather than gcc whenever possible
2018-04-04 22:45:33 +02:00
Martin Kroeker
8ec28ff461 Remove unguarded use of _Atomic and fix tabbing 2018-04-04 22:40:30 +02:00
Martin Kroeker
ca8ca796d3 Underline importance of NUM_THREADS setting for BUFFER allocation
following augray's suggestion from #1451, and incorporating ashwinyes' comments from #1141 on the importance of NUM_THREADS even for single-threaded builds.
2018-04-04 22:26:51 +02:00
Alex Arslan
8f811a9312 Reinstate macOS logic 2018-04-04 11:41:45 -07:00
Alex Arslan
36a17536ca Compile with cc rather than gcc whenever possible 2018-04-04 11:26:54 -07:00
Martin Kroeker
bb9876db33 Fix thread races and infinite looping on systems with many cpus
On systems with more than 64 cpus, blas_quickdivide will sometimes return zero which creates bogus workloads when used for the stride calculation. This then leads to threads spinning incessantly waiting for a status change that never happens, as seen in #1497.
This patch also fixes several data races that were found by helgrind and/or tsan while debugging the issue.
2018-04-04 18:16:52 +02:00
Martin Kroeker
d636b418af Merge pull request #1504 from ararslan/aa/openbsd
Allow building on OpenBSD
2018-04-04 15:26:46 +02:00
Martin Kroeker
a460c92577 Merge pull request #1501 from martin-frbg/issue875
Add workaround for old gcc and clang versions
2018-04-04 15:26:21 +02:00
Alex Arslan
33f838393c Add OpenBSD and DragonFly to community supported platforms 2018-04-03 16:42:01 -07:00
Alex Arslan
a41d241a0e Add support for DragonFly BSD 2018-04-03 16:39:29 -07:00
Alex Arslan
8da6b6ae52 Allow building on OpenBSD
With this change, OpenBLAS builds and all tests pass on OpenBSD 6.2
using Clang. Tested on x86-64 only, with and without DYNAMIC_ARCH=1.
2018-04-02 10:48:22 -07:00
Martin Kroeker
01c4b82f04 Update memory.c 2018-03-31 22:32:06 +02:00
Martin Kroeker
93db123f7e Update memory.c 2018-03-29 13:13:49 +02:00
Martin Kroeker
752fdb5dd8 Add workaround for old gcc and clang versions
Old gcc and clang do not handle constructor arguments, finally fix #875 as discussed there, using the fedora patch
2018-03-29 11:56:56 +02:00
Martin Kroeker
07ed01e97f Merge pull request #1500 from martin-frbg/issue1474
Correct index variables used in MFlops calculation
2018-03-28 09:15:34 +02:00
Martin Kroeker
35c5a32309 Correct index variables used in MFlops calculation
Fixes #1474
2018-03-27 21:52:29 +02:00
Martin Kroeker
c7b55b6082 Merge pull request #1499 from quickwritereader/develop
Implemented missing vsx simd  kernels for power8 blas1/2 double. z13 modifications
2018-03-27 21:43:23 +02:00
Martin Kroeker
840e01061f Merge pull request #1491 from martin-frbg/ddot_mt
Add multithreading support for Haswell DDOT
2018-03-27 21:43:05 +02:00
QWR QWR
28ca97015d power8:Added initial zgemv_(t|n) ,i(d|z)amax,i(d|z)amin,dgemv_t(transposed),zrot
z13: improved zgemv_(t|n)_4,zscal,zaxpy
2018-03-27 14:54:41 +00:00
Martin Kroeker
73c5ca74fa Merge pull request #1495 from martin-frbg/aff
Disable CPU affinity by default again
2018-03-19 18:03:25 +01:00
Martin Kroeker
e453555d97 Disable CPU affinity by default again
This setting must have been changed unintentionally by my PR #1214 (probably leftover from unrelated tests)
2018-03-19 18:02:23 +01:00
Martin Kroeker
6a6ffaff1e Merge pull request #1494 from martin-frbg/x86_dsdot
Use generic/dot.c instead of the inferior arm/dot.c for x86 DSDOT
2018-03-17 15:26:47 +01:00
Martin Kroeker
28ac9ea5a6 Use generic/dot.c instead of the inferior arm/dot.c for x86 DSDOT
to resolve dsdot utest failure seen in #1492
2018-03-17 13:49:15 +01:00
Martin Kroeker
a55694dd5b Declare dot_compute static to avoid conflicts in multiarch builds 2018-03-16 22:23:36 +01:00
Martin Kroeker
85a41e9cdb Add multithreading support for Haswell DDOT
copied from ashwinyes' implementation in dot_thunderx2t99.c
2018-03-16 16:58:47 +01:00
Martin Kroeker
40160ff3c1 Use _Atomic instead of volatile for thread safety where C11 is supported 2018-03-10 00:15:44 +01:00
Martin Kroeker
6a99fcce94 Use _Atomic instead of volatile for thread safety where C11 is supported
Suggested by dodomorandi in #660
2018-03-10 00:03:49 +01:00
Martin Kroeker
2c7392f07b Merge pull request #1482 from martin-frbg/haswell_axpy
Re-enable DAXPY AVX microkernels  for x86_64
2018-03-04 22:21:18 +01:00
Martin Kroeker
81215711a2 Re-enable DAXPY microkernels for x86_64
as the inaccuracies seen in the original testcase for #1332 appear to be due to an artefact that amplifies the very small rounding differences between FMA and discrete multiply+add
2018-03-04 19:37:03 +01:00
Martin Kroeker
809fd0d451 Rewrite ROTMG to address cases not covered by the netlib algorithm (#1480)
* Rewrite ROTMG based on the new implementation in GONUM based on the algorithm proposed by Tim Hopkins, see issue 1452 for the reference
* Correct ROTMG utest for issue1452 and add another from gonum, also correct transposition of expected and observed values in error messages
2018-03-04 17:39:56 +01:00
Martin Kroeker
72e65157df Merge pull request #1481 from martin-frbg/utest-fixup
Fix transposition of expected and computed values in error message
2018-03-03 22:43:56 +01:00
Martin Kroeker
69a8aa6de2 Fix transposition of expected and computed values in error message 2018-03-03 18:01:51 +01:00
Martin Kroeker
0ab5bf1746 Merge pull request #1476 from xsacha/patch-1
Fix CMake cross-compiling
2018-02-28 18:47:57 +01:00
Martin Kroeker
22167170b3 Merge pull request #1477 from quickwritereader/develop
Power8 blas3 copy-pack routines
2018-02-28 18:46:54 +01:00
Martin Kroeker
69d9f36ff4 Merge pull request #1468 from martin-frbg/martin-frbg-patch-1
Limit the additional locking from PRs 1052,1299 to non-OpenMP cases
2018-02-28 18:40:31 +01:00
Sacha
f81815e48a Fix CMake cross-compiling
Without specifying thread count, NUM_THREADS would not be defined and CMake would fail.
This is because core count cannot be determined when cross-compiling.
2018-02-28 10:25:25 +10:00
Martin Kroeker
5f855d965d Merge pull request #1475 from ashwinyes/develop_20180227_utest_dsdot_fixes
ARM64: Fix utest dsdot errors
2018-02-27 14:04:16 +01:00
Ashwin Sekhar T K
fa9ca65c0e ARM64: Fix utest dsdot errors 2018-02-27 10:47:55 +00:00
Martin Kroeker
719b68f077 Merge pull request #1473 from martin-frbg/p2align
Replace .align with .p2aligns in dscal.c and the Nehalem microkernels as well
2018-02-27 08:28:20 +01:00
Martin Kroeker
fe9f15f2d8 Merge pull request #1472 from martin-frbg/utest-fixes
Fix limited DSDOT precision on arm,aarch64 and zarch
2018-02-26 22:48:07 +01:00
Martin Kroeker
497f0c3d8a Replace .align with .p2align in the Nehalem microkernels 2018-02-26 20:58:33 +01:00
Martin Kroeker
ea37db828e Convert .align to .p2align for OSX compatibility 2018-02-26 20:48:03 +01:00
Martin Kroeker
e6a0a3de73 Merge pull request #1471 from martin-frbg/p2align
Use .p2align instead of .align for portability on Haswell and Sandybridge
2018-02-26 12:28:01 +01:00
Martin Kroeker
6e70287776 Use generic/dot.c for DSDOT on ARMV5 and above
The default arm/dot.c is less precise when used for DSDOT, as shown by utest
2018-02-25 19:57:23 +01:00
Martin Kroeker
58f236ad73 Use generic/dot.c for DSDOT on zarch 2018-02-25 19:52:14 +01:00
Martin Kroeker
e207107150 Use generic/dot.c for DSDOT on z13
The implementation in arm/dot.c has lower precision, as shown by the utest for dsdot.
2018-02-25 19:51:25 +01:00
Martin Kroeker
c9d408064a Use dot.S also for DSDOT on CORTEXA57 2018-02-25 19:48:09 +01:00
Martin Kroeker
288d1a3f6e Use dot.S also for DSDOT on ARMV8 2018-02-25 19:45:16 +01:00
Martin Kroeker
7c1925acec Use .p2align instead of .align for compatibility on Sandybridge as well 2018-02-24 19:43:15 +01:00
Martin Kroeker
2359c7c1a9 Use .p2align instead of .align for portability
The OSX assembler apparently mishandles the argument to decimal .align, leading to a significant loss of performance 
as observed in #730, #901 and most recently #1470
2018-02-24 17:50:13 +01:00
Martin Kroeker
7646974227 Limit the additional locking from PRs 1052,1299 to non-OpenMP multithreading 2018-02-21 11:45:33 +01:00
Martin Kroeker
e3a80e6aa8 Merge pull request #1466 from xianyi/revert-1464-issue1461
Revert "Add locks only for non-OPENMP multithreading"
2018-02-20 17:17:38 +01:00
the mslm
2c0a008281 dgemm_ncopy_4_ save/restore 2018-02-18 01:30:17 +00:00
the mslm
c5425daa6b power8 ?gemm_tcopy save/restore 2018-02-16 23:36:46 +00:00
280 changed files with 20802 additions and 3024 deletions

View File

@@ -7,6 +7,7 @@ language: c
jobs:
include:
- &test-ubuntu
os: linux
stage: test
compiler: gcc
addons:
@@ -57,7 +58,8 @@ jobs:
- TARGET_BOX=LINUX32
- BTYPE="BINARY=32"
- stage: test
- os: linux
stage: test
compiler: gcc
addons:
apt:
@@ -77,6 +79,7 @@ jobs:
# which is slower than container-based infrastructure used for jobs
# that don't require sudo.
- &test-alpine
os: linux
stage: test
dist: trusty
sudo: true
@@ -120,6 +123,7 @@ jobs:
- BTYPE="BINARY=64 NO_AFFINITY=1 USE_OPENMP=0 NO_LAPACK=0 TARGET=core2"
- &test-cmake
os: linux
stage: test
compiler: clang
addons:
@@ -147,6 +151,23 @@ jobs:
env:
- CMAKE=1
- &test-macos
os: osx
stage: test
osx_image: xcode8
before_script:
- COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32"
- brew update
- brew install gcc # for gfortran
script:
- travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
env:
- BTYPE="BINARY=64 INTERFACE64=1"
- <<: *test-macos
env:
- BTYPE="BINARY=32"
# whitelist
branches:
only:

View File

@@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5)
project(OpenBLAS C ASM)
set(OpenBLAS_MAJOR_VERSION 0)
set(OpenBLAS_MINOR_VERSION 3)
set(OpenBLAS_PATCH_VERSION 0.dev)
set(OpenBLAS_PATCH_VERSION 1.dev)
set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
# Adhere to GNU filesystem layout conventions

View File

@@ -91,11 +91,7 @@ ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android))
@ln -fs $(LIBSONAME) $(LIBPREFIX).so
@ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
endif
ifeq ($(OSNAME), FreeBSD)
@$(MAKE) -C exports so
@ln -fs $(LIBSONAME) $(LIBPREFIX).so
endif
ifeq ($(OSNAME), NetBSD)
ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD OpenBSD NetBSD DragonFly))
@$(MAKE) -C exports so
@ln -fs $(LIBSONAME) $(LIBPREFIX).so
endif
@@ -298,9 +294,10 @@ endif
lapack-test :
(cd $(NETLIB_LAPACK_DIR)/TESTING && rm -f x* *.out)
$(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR)/TESTING xeigtstc xeigtstd xeigtsts xeigtstz xlintstc xlintstd xlintstds xlintstrfd xlintstrfz xlintsts xlintstz xlintstzc xlintstrfs xlintstrfc
$(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR)/TESTING/EIG xeigtstc xeigtstd xeigtsts xeigtstz
$(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR)/TESTING/LIN xlintstc xlintstd xlintstds xlintstrfd xlintstrfz xlintsts xlintstz xlintstzc xlintstrfs xlintstrfc
ifneq ($(CROSS), 1)
( cd $(NETLIB_LAPACK_DIR)/INSTALL; ./testlsame; ./testslamch; ./testdlamch; \
( cd $(NETLIB_LAPACK_DIR)/INSTALL; make all; ./testlsame; ./testslamch; ./testdlamch; \
./testsecond; ./testdsecnd; ./testieee; ./testversion )
(cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r )
endif
@@ -312,9 +309,9 @@ lapack-runtest:
blas-test:
(cd $(NETLIB_LAPACK_DIR)/BLAS && rm -f x* *.out)
(cd $(NETLIB_LAPACK_DIR)/BLAS/TESTING && rm -f x* *.out)
$(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR) blas_testing
(cd $(NETLIB_LAPACK_DIR)/BLAS && cat *.out)
(cd $(NETLIB_LAPACK_DIR)/BLAS/TESTING && cat *.out)
dummy :

View File

@@ -72,12 +72,7 @@ ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android))
ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \
ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
endif
ifeq ($(OSNAME), FreeBSD)
@cp $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
ln -fs $(LIBSONAME) $(LIBPREFIX).so
endif
ifeq ($(OSNAME), NetBSD)
ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD OpenBSD NetBSD DragonFly))
@cp $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
ln -fs $(LIBSONAME) $(LIBPREFIX).so
@@ -101,8 +96,9 @@ endif
#Generating openblas.pc
@echo Generating openblas.pc in "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)"
@echo 'libdir='$(OPENBLAS_LIBRARY_DIR) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
@echo 'libdir='$(OPENBLAS_LIBRARY_DIR) > "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
@echo 'includedir='$(OPENBLAS_INCLUDE_DIR) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
@echo 'openblas_config= USE_64BITINT='$(USE_64BITINT) 'DYNAMIC_ARCH='$(DYNAMIC_ARCH) 'NO_CBLAS='$(NO_CBLAS) 'NO_LAPACK='$(NO_LAPACK) 'NO_LAPACKE='$(NO_LAPACKE) 'NO_AFFINITY='$(NO_AFFINITY) 'USE_OPENMP='$(USE_OPENMP) $(CORE) 'MAX_THREADS='$(NUM_THREADS)>> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
@echo 'version='$(VERSION) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
@echo 'extralib='$(EXTRALIB) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
@cat openblas.pc.in >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
@@ -115,7 +111,7 @@ endif
ifndef NO_SHARED
#ifeq logical or
ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD))
ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD OpenBSD DragonFly))
@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).so)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
endif
ifeq ($(OSNAME), $(filter $(OSNAME),WINNT CYGWIN_NT))

View File

@@ -17,6 +17,10 @@ ifdef CPUIDEMU
EXFLAGS = -DCPUIDEMU -DVENDOR=99
endif
ifeq ($(TARGET), 1004K)
TARGET_FLAGS = -mips32r2
endif
ifeq ($(TARGET), P5600)
TARGET_FLAGS = -mips32r5
endif

View File

@@ -3,7 +3,7 @@
#
# This library's version
VERSION = 0.3.0.dev
VERSION = 0.3.1.dev
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
@@ -60,6 +60,13 @@ VERSION = 0.3.0.dev
# automatically detected by the script.
# NUM_THREADS = 24
# If you have enabled USE_OPENMP and your application would call
# OpenBLAS's calculation API from multiple threads, please comment it in.
# This flag defines how many instances of OpenBLAS's calculation API can
# actually run in parallel. If more threads call OpenBLAS's calculation API,
# they need to wait for the preceding API calls to finish or risk data corruption.
# NUM_PARALLEL = 2
# if you don't need to install the static library, please comment it in.
# NO_STATIC = 1
@@ -100,7 +107,7 @@ BUILD_LAPACK_DEPRECATED = 1
NO_WARMUP = 1
# If you want to disable CPU/Memory affinity on Linux.
#NO_AFFINITY = 1
NO_AFFINITY = 1
# if you are compiling for Linux and you have more than 16 numa nodes or more than 256 cpus
# BIGNUMA = 1

View File

@@ -17,15 +17,24 @@ NETLIB_LAPACK_DIR = $(TOPDIR)/lapack-netlib
# http://stackoverflow.com/questions/4029274/mingw-and-make-variables
# - Default value is 'cc' which is not always a valid command (e.g. MinGW).
ifeq ($(origin CC),default)
# Check if $(CC) refers to a valid command and set the value to gcc if not
ifneq ($(findstring cmd.exe,$(SHELL)),)
ifeq ($(shell where $(CC) 2>NUL),)
CC = gcc
# Change the default compile to clang on Mac OSX.
# http://stackoverflow.com/questions/714100/os-detecting-makefile
UNAME_S := $(shell uname -s)
ifeq ($(UNAME_S),Darwin)
CC = clang
# EXTRALIB += -Wl,-no_compact_unwind
endif
endif
else # POSIX-ish
ifeq ($(shell command -v $(CC) 2>/dev/null),)
ifeq ($(shell uname -s),Darwin)
CC = clang
# EXTRALIB += -Wl,-no_compact_unwind
else
CC = gcc
endif # Darwin
endif # CC exists
endif # Shell is sane
endif # CC is set to default
# Default Fortran compiler (FC) is selected by f_check.
@@ -53,6 +62,9 @@ ifeq ($(BINARY), 32)
ifeq ($(TARGET), HASWELL)
GETARCH_FLAGS := -DFORCE_NEHALEM
endif
ifeq ($(TARGET), SKYLAKEX)
GETARCH_FLAGS := -DFORCE_NEHALEM
endif
ifeq ($(TARGET), SANDYBRIDGE)
GETARCH_FLAGS := -DFORCE_NEHALEM
endif
@@ -86,6 +98,9 @@ ifeq ($(BINARY), 32)
ifeq ($(TARGET_CORE), HASWELL)
GETARCH_FLAGS := -DFORCE_NEHALEM
endif
ifeq ($(TARGET_CORE), SKYLAKEX)
GETARCH_FLAGS := -DFORCE_NEHALEM
endif
ifeq ($(TARGET_CORE), SANDYBRIDGE)
GETARCH_FLAGS := -DFORCE_NEHALEM
endif
@@ -132,6 +147,10 @@ ifeq ($(NO_AVX2), 1)
GETARCH_FLAGS += -DNO_AVX2
endif
ifeq ($(NO_AVX512), 1)
GETARCH_FLAGS += -DNO_AVX512
endif
ifeq ($(DEBUG), 1)
GETARCH_FLAGS += -g
endif
@@ -175,6 +194,10 @@ endif
endif
ifndef NUM_PARALLEL
NUM_PARALLEL = 1
endif
ifndef NUM_THREADS
NUM_THREADS = $(NUM_CORES)
endif
@@ -230,7 +253,7 @@ endif
MD5SUM = md5 -r
endif
ifeq ($(OSNAME), FreeBSD)
ifneq (,$(findstring $(OSNAME), FreeBSD OpenBSD DragonFly))
MD5SUM = md5 -r
endif
@@ -424,7 +447,7 @@ CCOMMON_OPT += -fopenmp
endif
ifeq ($(C_COMPILER), INTEL)
CCOMMON_OPT += -openmp
CCOMMON_OPT += -fopenmp
endif
ifeq ($(C_COMPILER), PGI)
@@ -456,6 +479,11 @@ endif
ifneq ($(NO_AVX2), 1)
DYNAMIC_CORE += HASWELL ZEN
endif
ifneq ($(NO_AVX512), 1)
ifneq ($(NO_AVX2), 1)
DYNAMIC_CORE += SKYLAKEX
endif
endif
endif
# If DYNAMIC_CORE is not set, DYNAMIC_ARCH cannot do anything, so force it to empty
@@ -555,9 +583,14 @@ CCOMMON_OPT += -march=mips64
FCOMMON_OPT += -march=mips64
endif
ifeq ($(CORE), 1004K)
CCOMMON_OPT += -mips32r2 $(MSA_FLAGS)
FCOMMON_OPT += -mips32r2 $(MSA_FLAGS)
endif
ifeq ($(CORE), P5600)
CCOMMON_OPT += -mips32r5 -mnan=2008 -mtune=p5600 $(MSA_FLAGS)
FCOMMON_OPT += -mips32r5 -mnan=2008 -mtune=p5600 $(MSA_FLAGS)
CCOMMON_OPT += -mips32r5 -mnan=2008 -mtune=p5600 $(MSA_FLAGS)
FCOMMON_OPT += -mips32r5 -mnan=2008 -mtune=p5600 $(MSA_FLAGS)
endif
ifeq ($(CORE), I6400)
@@ -704,7 +737,7 @@ FCOMMON_OPT += -i8
endif
endif
ifeq ($(USE_OPENMP), 1)
FCOMMON_OPT += -openmp
FCOMMON_OPT += -fopenmp
endif
endif
@@ -906,6 +939,10 @@ ifeq ($(NO_AVX2), 1)
CCOMMON_OPT += -DNO_AVX2
endif
ifeq ($(NO_AVX512), 1)
CCOMMON_OPT += -DNO_AVX512
endif
ifdef SMP
CCOMMON_OPT += -DSMP_SERVER
@@ -952,6 +989,8 @@ endif
CCOMMON_OPT += -DMAX_CPU_NUMBER=$(NUM_THREADS)
CCOMMON_OPT += -DMAX_PARALLEL_NUMBER=$(NUM_PARALLEL)
ifdef USE_SIMPLE_THREADED_LEVEL3
CCOMMON_OPT += -DUSE_SIMPLE_THREADED_LEVEL3
endif
@@ -1210,6 +1249,7 @@ export MSA_FLAGS
export KERNELDIR
export FUNCTION_PROFILE
export TARGET_CORE
export NO_AVX512
export SGEMM_UNROLL_M
export SGEMM_UNROLL_N

230
README.md
View File

@@ -5,175 +5,219 @@
Travis CI: [![Build Status](https://travis-ci.org/xianyi/OpenBLAS.svg?branch=develop)](https://travis-ci.org/xianyi/OpenBLAS)
AppVeyor: [![Build status](https://ci.appveyor.com/api/projects/status/09sohd35n8nkkx64/branch/develop?svg=true)](https://ci.appveyor.com/project/xianyi/openblas/branch/develop)
## Introduction
OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version.
Please read the documents on OpenBLAS wiki pages <http://github.com/xianyi/OpenBLAS/wiki>.
Please read the documentation on the OpenBLAS wiki pages: <http://github.com/xianyi/OpenBLAS/wiki>.
## Binary Packages
We provide binary packages for the following platform.
We provide official binary packages for the following platform:
* Windows x86/x86_64
You can download them from [file hosting on sourceforge.net](https://sourceforge.net/projects/openblas/files/).
## Installation from Source
Download from project homepage. http://xianyi.github.com/OpenBLAS/
Or, check out codes from git://github.com/xianyi/OpenBLAS.git
Download from project homepage, http://xianyi.github.com/OpenBLAS/, or check out the code
using Git from https://github.com/xianyi/OpenBLAS.git.
### Dependencies
Building OpenBLAS requires the following to be installed:
* GNU Make
* A C compiler, e.g. GCC or Clang
* A Fortran compiler (optional, for LAPACK)
* IBM MASS (optional, see below)
### Normal compile
* type "make" to detect the CPU automatically.
or
* type "make TARGET=xxx" to set target CPU, e.g. "make TARGET=NEHALEM". The full target list is in file TargetList.txt.
Simply invoking `make` (or `gmake` on BSD) will detect the CPU automatically.
To set a specific target CPU, use `make TARGET=xxx`, e.g. `make TARGET=NEHALEM`.
The full target list is in the file `TargetList.txt`.
### Cross compile
Please set CC and FC with the cross toolchains. Then, set HOSTCC with your host C compiler. At last, set TARGET explicitly.
Set `CC` and `FC` to point to the cross toolchains, and set `HOSTCC` to your host C compiler.
The target must be specified explicitly when cross compiling.
Examples:
On X86 box, compile this library for loongson3a CPU.
* On an x86 box, compile this library for a loongson3a CPU:
```sh
make BINARY=64 CC=mips64el-unknown-linux-gnu-gcc FC=mips64el-unknown-linux-gnu-gfortran HOSTCC=gcc TARGET=LOONGSON3A
```
make BINARY=64 CC=mips64el-unknown-linux-gnu-gcc FC=mips64el-unknown-linux-gnu-gfortran HOSTCC=gcc TARGET=LOONGSON3A
On X86 box, compile this library for loongson3a CPU with loongcc (based on Open64) compiler.
make CC=loongcc FC=loongf95 HOSTCC=gcc TARGET=LOONGSON3A CROSS=1 CROSS_SUFFIX=mips64el-st-linux-gnu- NO_LAPACKE=1 NO_SHARED=1 BINARY=32
* On an x86 box, compile this library for a loongson3a CPU with loongcc (based on Open64) compiler:
```sh
make CC=loongcc FC=loongf95 HOSTCC=gcc TARGET=LOONGSON3A CROSS=1 CROSS_SUFFIX=mips64el-st-linux-gnu- NO_LAPACKE=1 NO_SHARED=1 BINARY=32
```
### Debug version
make DEBUG=1
A debug version can be built using `make DEBUG=1`.
### Compile with MASS Support on Power CPU (Optional dependency)
### Compile with MASS support on Power CPU (optional)
[IBM MASS](http://www-01.ibm.com/software/awdtools/mass/linux/mass-linux.html) library consists of a set of mathematical functions for C, C++, and
Fortran-language applications that are tuned for optimum performance on POWER architectures. OpenBLAS with MASS requires 64-bit, little-endian OS on POWER.
The library can be installed as below -
The [IBM MASS](http://www-01.ibm.com/software/awdtools/mass/linux/mass-linux.html) library
consists of a set of mathematical functions for C, C++, and Fortran applications that are
tuned for optimum performance on POWER architectures.
OpenBLAS with MASS requires a 64-bit, little-endian OS on POWER.
The library can be installed as shown:
* On Ubuntu:
* On Ubuntu:
```sh
wget -q http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/ubuntu/public.gpg -O- | sudo apt-key add -
echo "deb http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/ubuntu/ trusty main" | sudo tee /etc/apt/sources.list.d/ibm-xl-compiler-eval.list
sudo apt-get update
sudo apt-get install libxlmass-devel.8.1.5
```
wget -q http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/ubuntu/public.gpg -O- | sudo apt-key add -</br>
echo "deb http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/ubuntu/ trusty main" | sudo tee /etc/apt/sources.list.d/ibm-xl-compiler-eval.list</br>
sudo apt-get update</br>
sudo apt-get install libxlmass-devel.8.1.5</br>
* On RHEL/CentOS:
```sh
wget http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/rhel7/repodata/repomd.xml.key
sudo rpm --import repomd.xml.key
wget http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/rhel7/ibm-xl-compiler-eval.repo
sudo cp ibm-xl-compiler-eval.repo /etc/yum.repos.d/
sudo yum install libxlmass-devel.8.1.5
```
* On RHEL/CentOS:
After installing the MASS library, compile OpenBLAS with `USE_MASS=1`.
For example, to compile on Power8 with MASS support: `make USE_MASS=1 TARGET=POWER8`.
wget http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/rhel7/repodata/repomd.xml.key</br>
sudo rpm --import repomd.xml.key</br>
wget http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/rhel7/ibm-xl-compiler-eval.repo</br>
sudo cp ibm-xl-compiler-eval.repo /etc/yum.repos.d/</br>
sudo yum install libxlmass-devel.8.1.5</br>
### Install to a specific directory (optional)
After installing MASS library, compile openblas with USE_MASS=1.
Use `PREFIX=` when invoking `make`, for example
Example:
```sh
make install PREFIX=your_installation_directory
```
Compiling on Power8 with MASS support -
The default installation directory is `/opt/OpenBLAS`.
make USE_MASS=1 TARGET=POWER8
## Supported CPUs and Operating Systems
### Install to the directory (optional)
Please read `GotoBLAS_01Readme.txt`.
Example:
### Additional supported CPUs
make install PREFIX=your_installation_directory
#### x86/x86-64
The default directory is /opt/OpenBLAS
## Support CPU & OS
Please read GotoBLAS_01Readme.txt
### Additional support CPU:
#### x86/x86-64:
- **Intel Xeon 56xx (Westmere)**: Used GotoBLAS2 Nehalem codes.
- **Intel Sandy Bridge**: Optimized Level-3 and Level-2 BLAS with AVX on x86-64.
- **Intel Haswell**: Optimized Level-3 and Level-2 BLAS with AVX2 and FMA on x86-64.
- **AMD Bobcat**: Used GotoBLAS2 Barcelona codes.
- **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thank Werner Saar)
- **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thanks to Werner Saar)
- **AMD PILEDRIVER**: Uses Bulldozer codes with some optimizations.
- **AMD STEAMROLLER**: Uses Bulldozer codes with some optimizations.
#### MIPS64:
#### MIPS64
- **ICT Loongson 3A**: Optimized Level-3 BLAS and the part of Level-1,2.
- **ICT Loongson 3B**: Experimental
#### ARM:
- **ARMV6**: Optimized BLAS for vfpv2 and vfpv3-d16 ( e.g. BCM2835, Cortex M0+ )
- **ARMV7**: Optimized BLAS for vfpv3-d32 ( e.g. Cortex A8, A9 and A15 )
#### ARM
#### ARM64:
- **ARMV8**: Experimental
- **ARMv6**: Optimized BLAS for vfpv2 and vfpv3-d16 (e.g. BCM2835, Cortex M0+)
- **ARMv7**: Optimized BLAS for vfpv3-d32 (e.g. Cortex A8, A9 and A15)
#### ARM64
- **ARMv8**: Experimental
- **ARM Cortex-A57**: Experimental
#### PPC/PPC64
- **POWER8**: Optmized Level-3 BLAS and some Level-1, only with USE_OPENMP=1
#### IBM zEnterprise System:
- **POWER8**: Optimized Level-3 BLAS and some Level-1, only with `USE_OPENMP=1`
#### IBM zEnterprise System
- **Z13**: Optimized Level-3 BLAS and Level-1,2 (double precision)
### Support OS:
### Supported OS
- **GNU/Linux**
- **MingWin or Visual Studio(CMake)/Windows**: Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-in-Microsoft-Visual-Studio>.
- **Darwin/Mac OS X**: Experimental. Although GotoBLAS2 supports Darwin, we are the beginner on Mac OS X.
- **FreeBSD**: Supported by community. We didn't test the library on this OS.
- **Android**: Supported by community. Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-build-OpenBLAS-for-Android>.
- **MinGW or Visual Studio (CMake)/Windows**: Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-in-Microsoft-Visual-Studio>.
- **Darwin/macOS**: Experimental. Although GotoBLAS2 supports Darwin, we are not macOS experts.
- **FreeBSD**: Supported by the community. We don't actively test the library on this OS.
- **OpenBSD**: Supported by the community. We don't actively test the library on this OS.
- **DragonFly BSD**: Supported by the community. We don't actively test the library on this OS.
- **Android**: Supported by the community. Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-build-OpenBLAS-for-Android>.
## Usages
Link with libopenblas.a or -lopenblas for shared library.
## Usage
### Set the number of threads with environment variables.
Statically link with `libopenblas.a` or dynamically link with `-lopenblas` if OpenBLAS was
compiled as a shared library.
Examples:
### Setting the number of threads using environment variables
export OPENBLAS_NUM_THREADS=4
Environment variables are used to specify a maximum number of threads.
For example,
or
```sh
export OPENBLAS_NUM_THREADS=4
export GOTO_NUM_THREADS=4
export OMP_NUM_THREADS=4
```
export GOTO_NUM_THREADS=4
The priorities are `OPENBLAS_NUM_THREADS` > `GOTO_NUM_THREADS` > `OMP_NUM_THREADS`.
or
If you compile this library with `USE_OPENMP=1`, you should set the `OMP_NUM_THREADS`
environment variable; OpenBLAS ignores `OPENBLAS_NUM_THREADS` and `GOTO_NUM_THREADS` when
compiled with `USE_OPENMP=1`.
export OMP_NUM_THREADS=4
### Setting the number of threads at runtime
The priorities are OPENBLAS_NUM_THREADS > GOTO_NUM_THREADS > OMP_NUM_THREADS.
We provide the following functions to control the number of threads at runtime:
If you compile this lib with USE_OPENMP=1, you should set OMP_NUM_THREADS environment variable. OpenBLAS ignores OPENBLAS_NUM_THREADS and GOTO_NUM_THREADS with USE_OPENMP=1.
```c
void goto_set_num_threads(int num_threads);
void openblas_set_num_threads(int num_threads);
```
### Set the number of threads on runtime.
If you compile this library with `USE_OPENMP=1`, you should use the above functions too.
We provided the below functions to control the number of threads on runtime.
## Reporting bugs
void goto_set_num_threads(int num_threads);
void openblas_set_num_threads(int num_threads);
If you compile this lib with USE_OPENMP=1, you should use the above functions, too.
## Report Bugs
Please add a issue in https://github.com/xianyi/OpenBLAS/issues
Please submit an issue in https://github.com/xianyi/OpenBLAS/issues.
## Contact
* OpenBLAS users mailing list: https://groups.google.com/forum/#!forum/openblas-users
* OpenBLAS developers mailing list: https://groups.google.com/forum/#!forum/openblas-dev
## ChangeLog
Please see Changelog.txt to obtain the differences between GotoBLAS2 1.13 BSD version.
## Change log
Please see Changelog.txt to view the differences between OpenBLAS and GotoBLAS2 1.13 BSD version.
## Troubleshooting
* Please read [Faq](https://github.com/xianyi/OpenBLAS/wiki/Faq) at first.
* Please use gcc version 4.6 and above to compile Sandy Bridge AVX kernels on Linux/MingW/BSD.
* Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture. The Clang 3.0 will generate the wrong AVX binary code.
* The number of CPUs/Cores should less than or equal to 256. On Linux x86_64(amd64), there is experimental support for up to 1024 CPUs/Cores and 128 numa nodes if you build the library with BIGNUMA=1.
* OpenBLAS does not set processor affinity by default. On Linux, you can enable processor affinity by commenting the line NO_AFFINITY=1 in Makefile.rule. But this may cause [the conflict with R parallel](https://stat.ethz.ch/pipermail/r-sig-hpc/2012-April/001348.html).
* On Loongson 3A. make test would be failed because of pthread_create error. The error code is EAGAIN. However, it will be OK when you run the same testcase on shell.
* Please read the [FAQ](https://github.com/xianyi/OpenBLAS/wiki/Faq) first.
* Please use GCC version 4.6 and above to compile Sandy Bridge AVX kernels on Linux/MinGW/BSD.
* Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture.
Clang 3.0 will generate the wrong AVX binary code.
* The number of CPUs/cores should be less than or equal to 256. On Linux `x86_64` (`amd64`),
there is experimental support for up to 1024 CPUs/cores and 128 numa nodes if you build
the library with `BIGNUMA=1`.
* OpenBLAS does not set processor affinity by default.
On Linux, you can enable processor affinity by commenting out the line `NO_AFFINITY=1` in
Makefile.rule. However, note that this may cause
[a conflict with R parallel](https://stat.ethz.ch/pipermail/r-sig-hpc/2012-April/001348.html).
* On Loongson 3A, `make test` may fail with a `pthread_create` error (`EAGAIN`).
However, it will be okay when you run the same test case on the shell.
## Contributing
1. [Check for open issues](https://github.com/xianyi/OpenBLAS/issues) or open a fresh issue to start a discussion around a feature idea or a bug.
1. Fork the [OpenBLAS](https://github.com/xianyi/OpenBLAS) repository to start making your changes.
1. Write a test which shows that the bug was fixed or that the feature works as expected.
1. Send a pull request. Make sure to add yourself to `CONTRIBUTORS.md`.
1. [Check for open issues](https://github.com/xianyi/OpenBLAS/issues) or open a fresh issue
to start a discussion around a feature idea or a bug.
2. Fork the [OpenBLAS](https://github.com/xianyi/OpenBLAS) repository to start making your changes.
3. Write a test which shows that the bug was fixed or that the feature works as expected.
4. Send a pull request. Make sure to add yourself to `CONTRIBUTORS.md`.
## Donation
Please read [this wiki page](https://github.com/xianyi/OpenBLAS/wiki/Donation).

View File

@@ -20,6 +20,7 @@ DUNNINGTON
NEHALEM
SANDYBRIDGE
HASWELL
SKYLAKEX
ATOM
b)AMD CPU:
@@ -56,6 +57,7 @@ CELL
3.MIPS CPU:
P5600
1004K
4.MIPS64 CPU:
SICORTEX

View File

@@ -14,6 +14,20 @@ Please build OpenBLAS with larger `NUM_THREADS`. For example, `make
NUM_THREADS=32` or `make NUM_THREADS=64`. In `Makefile.system`, we will set
`MAX_CPU_NUMBER=NUM_THREADS`.
Despite its name, and due to the use of memory buffers in functions like SGEMM,
the setting of NUM_THREADS can be relevant even for a single-threaded build
of OpenBLAS, if such functions get called by multiple threads of a program
that uses OpenBLAS. In some cases, the affected code may simply crash or throw
a segmentation fault without displaying the above warning first.
Note that the number of threads used at runtime can be altered to differ from the
value NUM_THREADS was set to at build time. At runtime, the actual number of
threads can be set anywhere from 1 to the build's NUM_THREADS (note however,
that this does not change the number of memory buffers that will be allocated,
which is set at build time). The number of threads for a process can be set by
using the mechanisms described below.
#### How can I use OpenBLAS in multi-threaded applications?
If your application is already multi-threaded, it will conflict with OpenBLAS

View File

@@ -237,7 +237,7 @@ int main(int argc, char *argv[]){
timeg = time1/loops;
fprintf(stderr,
" %10.2f MFlops %10.6f sec\n",
COMPSIZE * COMPSIZE * 2. * (double)m * (double)m * (double)n / timeg * 1.e-6, time1);
COMPSIZE * COMPSIZE * 2. * (double)k * (double)m * (double)n / timeg * 1.e-6, time1);
}

18
c_check
View File

@@ -54,6 +54,8 @@ $compiler = GCC if ($compiler eq "");
$os = Linux if ($data =~ /OS_LINUX/);
$os = FreeBSD if ($data =~ /OS_FREEBSD/);
$os = NetBSD if ($data =~ /OS_NETBSD/);
$os = OpenBSD if ($data =~ /OS_OPENBSD/);
$os = DragonFly if ($data =~ /OS_DRAGONFLY/);
$os = Darwin if ($data =~ /OS_DARWIN/);
$os = SunOS if ($data =~ /OS_SUNOS/);
$os = AIX if ($data =~ /OS_AIX/);
@@ -199,6 +201,21 @@ $architecture = zarch if ($data =~ /ARCH_ZARCH/);
$binformat = bin32;
$binformat = bin64 if ($data =~ /BINARY_64/);
$no_avx512= 0;
if (($architecture eq "x86") || ($architecture eq "x86_64")) {
$code = '"vbroadcastss -4 * 4(%rsi), %zmm2"';
print $tmpf "int main(void){ __asm__ volatile($code); }\n";
$args = " -o $tmpf.o -x c $tmpf";
my @cmd = ("$compiler_name $args");
system(@cmd) == 0;
if ($? != 0) {
$no_avx512 = 1;
} else {
$no_avx512 = 0;
}
unlink("tmpf.o");
}
$data = `$compiler_name -S ctest1.c && grep globl ctest1.s | head -n 1 && rm -f ctest1.s`;
$data =~ /globl\s([_\.]*)(.*)/;
@@ -286,6 +303,7 @@ print MAKEFILE "CROSS=1\n" if $cross != 0;
print MAKEFILE "CEXTRALIB=$linker_L $linker_l $linker_a\n";
print MAKEFILE "HAVE_MSA=1\n" if $have_msa eq 1;
print MAKEFILE "MSA_FLAGS=$msa_flags\n" if $have_msa eq 1;
print MAKEFILE "NO_AVX512=1\n" if $no_avx512 eq 1;
$os =~ tr/[a-z]/[A-Z]/;
$architecture =~ tr/[a-z]/[A-Z]/;

View File

@@ -56,6 +56,9 @@ if (DYNAMIC_ARCH)
if (NOT NO_AVX2)
set(DYNAMIC_CORE ${DYNAMIC_CORE} HASWELL ZEN)
endif ()
if (NOT NO_AVX512)
set(DYNAMIC_CORE ${DYNAMIC_CORE} SKYLAKEX)
endif ()
endif ()
if (NOT DYNAMIC_CORE)

View File

@@ -1,6 +1,7 @@
libdir=@CMAKE_INSTALL_FULL_LIBDIR@
includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@
openblas_config=USE_64BITINT=@USE_64BITINT@ NO_CBLAS=@NO_CBLAS@ NO_LAPACK=@NO_LAPACK@ NO_LAPACKE=@NO_LAPACKE@ DYNAMIC_ARCH=@DYNAMIC_ARCH@ NO_AFFINITY=@NO_AFFINITY@ USE_OPENMP=@USE_OPENMP@ @CORE@ MAX_THREADS=@NUM_THREADS@
Name: OpenBLAS
Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version
Version: @OPENBLAS_VERSION@

View File

@@ -33,7 +33,7 @@ endif ()
if (DEFINED BINARY AND DEFINED TARGET AND BINARY EQUAL 32)
message(STATUS "Compiling a ${BINARY}-bit binary.")
set(NO_AVX 1)
if (${TARGET} STREQUAL "HASWELL" OR ${TARGET} STREQUAL "SANDYBRIDGE")
if (${TARGET} STREQUAL "HASWELL" OR ${TARGET} STREQUAL "SANDYBRIDGE" OR ${TARGET} STREQUAL "SKYLAKEX")
set(TARGET "NEHALEM")
endif ()
if (${TARGET} STREQUAL "BULLDOZER" OR ${TARGET} STREQUAL "PILEDRIVER" OR ${TARGET} STREQUAL "ZEN")
@@ -96,8 +96,12 @@ if (NOT CMAKE_CROSSCOMPILING)
endif()
if (NOT DEFINED NUM_PARALLEL)
set(NUM_PARALLEL 1)
endif()
if (NOT DEFINED NUM_THREADS)
if (NOT NUM_CORES EQUAL 0)
if (DEFINED NUM_CORES AND NOT NUM_CORES EQUAL 0)
# HT?
set(NUM_THREADS ${NUM_CORES})
else ()
@@ -224,6 +228,8 @@ endif ()
set(CCOMMON_OPT "${CCOMMON_OPT} -DMAX_CPU_NUMBER=${NUM_THREADS}")
set(CCOMMON_OPT "${CCOMMON_OPT} -DMAX_PARALLEL_NUMBER=${NUM_PARALLEL}")
if (USE_SIMPLE_THREADED_LEVEL3)
set(CCOMMON_OPT "${CCOMMON_OPT} -DUSE_SIMPLE_THREADED_LEVEL3")
endif ()

View File

@@ -66,3 +66,12 @@ else()
set(BINARY32 1)
endif()
if (X86_64 OR X86)
file(WRITE ${PROJECT_BINARY_DIR}/avx512.tmp "int main(void){ __asm__ volatile(\"vbroadcastss -4 * 4(%rsi), %zmm2\"); }")
execute_process(COMMAND ${CMAKE_C_COMPILER} -v -o ${PROJECT_BINARY_DIR}/avx512.o -x c ${PROJECT_BINARY_DIR}/avx512.tmp RESULT_VARIABLE NO_AVX512)
if (NO_AVX512 EQUAL 1)
set (CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX512")
endif()
file(REMOVE "avx512.tmp" "avx512.o")
endif()

View File

@@ -93,7 +93,7 @@ extern "C" {
#include <sched.h>
#endif
#if defined(OS_DARWIN) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(OS_ANDROID)
#if defined(OS_DARWIN) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_ANDROID)
#include <sched.h>
#endif
@@ -179,7 +179,7 @@ extern "C" {
#define ALLOCA_ALIGN 63UL
#define NUM_BUFFERS (MAX_CPU_NUMBER * 2)
#define NUM_BUFFERS (MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER)
#ifdef NEEDBUNDERSCORE
#define BLASFUNC(FUNC) FUNC##_
@@ -642,6 +642,7 @@ void gotoblas_profile_init(void);
void gotoblas_profile_quit(void);
#ifdef USE_OPENMP
#ifndef C_MSVC
int omp_in_parallel(void);
int omp_get_num_procs(void);
@@ -649,6 +650,21 @@ int omp_get_num_procs(void);
__declspec(dllimport) int __cdecl omp_in_parallel(void);
__declspec(dllimport) int __cdecl omp_get_num_procs(void);
#endif
#if (__STDC_VERSION__ >= 201112L)
#if defined(C_GCC) && ( __GNUC__ < 7)
// workaround for GCC bug 65467
#ifndef _Atomic
#define _Atomic volatile
#endif
#endif
#include <stdatomic.h>
#else
#ifndef _Atomic
#define _Atomic volatile
#endif
#endif
#else
#ifdef __ELF__
int omp_in_parallel (void) __attribute__ ((weak));

View File

@@ -178,7 +178,13 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
result = x/y;
return result;
#else
#if (MAX_CPU_NUMBER > 64)
if ( y > 64) {
result = x/y;
return result;
}
#endif
y = blas_quick_divide_table[y];
__asm__ __volatile__ ("mull %0" :"=d" (result) :"a"(x), "0" (y));
@@ -327,7 +333,7 @@ REALNAME:
#endif
#endif
#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(__ELF__)
#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(OS_OPENBSD) || defined(__ELF__)
#define PROLOGUE \
.text; \
.align 16; \

View File

@@ -196,6 +196,13 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
if (y <= 1) return x;
#if (MAX_CPU_NUMBER > 64)
if (y > 64) {
result = x / y;
return result;
}
#endif
y = blas_quick_divide_table[y];
__asm__ __volatile__ ("mull %0" :"=d" (result) :"a"(x), "0" (y));
@@ -403,7 +410,7 @@ REALNAME:
#define EPILOGUE .end
#endif
#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(__ELF__) || defined(C_PGI)
#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(__ELF__) || defined(C_PGI)
#define PROLOGUE \
.text; \
.align 512; \

View File

@@ -115,6 +115,7 @@
#define CORE_STEAMROLLER 25
#define CORE_EXCAVATOR 26
#define CORE_ZEN 27
#define CORE_SKYLAKEX 28
#define HAVE_SSE (1 << 0)
#define HAVE_SSE2 (1 << 1)
@@ -137,6 +138,7 @@
#define HAVE_AVX (1 << 18)
#define HAVE_FMA4 (1 << 19)
#define HAVE_FMA3 (1 << 20)
#define HAVE_AVX512VL (1 << 21)
#define CACHE_INFO_L1_I 1
#define CACHE_INFO_L1_D 2
@@ -211,5 +213,6 @@ typedef struct {
#define CPUTYPE_STEAMROLLER 49
#define CPUTYPE_EXCAVATOR 50
#define CPUTYPE_ZEN 51
#define CPUTYPE_SKYLAKEX 52
#endif

View File

@@ -121,7 +121,7 @@ int detect(void)
return CPU_VULCAN;
else if (strstr(cpu_part, "0x0a1") && strstr(cpu_implementer, "0x43"))
return CPU_THUNDERX;
else if (strstr(cpu_part, "0xFFF") && strstr(cpu_implementer, "0x43")) /* TODO */
else if (strstr(cpu_part, "0x0af") && strstr(cpu_implementer, "0x43"))
return CPU_THUNDERX2T99;
}

View File

@@ -72,10 +72,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CPU_UNKNOWN 0
#define CPU_P5600 1
#define CPU_1004K 2
static char *cpuname[] = {
"UNKOWN",
"P5600"
"P5600",
"1004K"
};
int detect(void){
@@ -90,7 +92,7 @@ int detect(void){
if (!strncmp("cpu", buffer, 3)){
p = strchr(buffer, ':') + 2;
#if 0
fprintf(stderr, "%s\n", p);
fprintf(stderr, "%s \n", p);
#endif
break;
}
@@ -99,43 +101,13 @@ int detect(void){
fclose(infile);
if(p != NULL){
if (strstr(p, "Loongson-3A")){
return CPU_LOONGSON3A;
}else if(strstr(p, "Loongson-3B")){
return CPU_LOONGSON3B;
}else if (strstr(p, "Loongson-3")){
infile = fopen("/proc/cpuinfo", "r");
p = (char *)NULL;
while (fgets(buffer, sizeof(buffer), infile)){
if (!strncmp("system type", buffer, 11)){
p = strchr(buffer, ':') + 2;
break;
}
}
fclose(infile);
if (strstr(p, "loongson3a"))
return CPU_LOONGSON3A;
}else{
if (strstr(p, "5600")) {
return CPU_P5600;
} else if (strstr(p, "1004K")) {
return CPU_1004K;
} else
return CPU_UNKNOWN;
}
}
//Check model name for Loongson3
infile = fopen("/proc/cpuinfo", "r");
p = (char *)NULL;
while (fgets(buffer, sizeof(buffer), infile)){
if (!strncmp("model name", buffer, 10)){
p = strchr(buffer, ':') + 2;
break;
}
}
fclose(infile);
if(p != NULL){
if (strstr(p, "Loongson-3A")){
return CPU_LOONGSON3A;
}else if(strstr(p, "Loongson-3B")){
return CPU_LOONGSON3B;
}
}
#endif
return CPU_UNKNOWN;
}
@@ -149,7 +121,7 @@ void get_architecture(void){
}
void get_subarchitecture(void){
if(detect()==CPU_P5600){
if(detect()==CPU_P5600|| detect()==CPU_1004K){
printf("P5600");
}else{
printf("UNKNOWN");
@@ -170,6 +142,14 @@ void get_cpuconfig(void){
printf("#define DTB_DEFAULT_ENTRIES 64\n");
printf("#define DTB_SIZE 4096\n");
printf("#define L2_ASSOCIATIVE 8\n");
} else if (detect()==CPU_1004K) {
printf("#define MIPS1004K\n");
printf("#define L1_DATA_SIZE 32768\n");
printf("#define L1_DATA_LINESIZE 32\n");
printf("#define L2_SIZE 26144\n");
printf("#define DTB_DEFAULT_ENTRIES 8\n");
printf("#define DTB_SIZE 4096\n");
printf("#define L2_ASSOCIATIVE 4\n");
}else{
printf("#define UNKNOWN\n");
}
@@ -178,6 +158,8 @@ void get_cpuconfig(void){
void get_libname(void){
if(detect()==CPU_P5600) {
printf("p5600\n");
} else if (detect()==CPU_1004K) {
printf("1004K\n");
}else{
printf("mips\n");
}

View File

@@ -50,6 +50,8 @@
#ifdef NO_AVX
#define CPUTYPE_HASWELL CPUTYPE_NEHALEM
#define CORE_HASWELL CORE_NEHALEM
#define CPUTYPE_SKYLAKEX CPUTYPE_NEHALEM
#define CORE_SKYLAKEX CORE_NEHALEM
#define CPUTYPE_SANDYBRIDGE CPUTYPE_NEHALEM
#define CORE_SANDYBRIDGE CORE_NEHALEM
#define CPUTYPE_BULLDOZER CPUTYPE_BARCELONA
@@ -1299,6 +1301,19 @@ int get_cpuname(void){
else
return CPUTYPE_NEHALEM;
case 5:
// Skylake X
#ifndef NO_AVX512
return CPUTYPE_SKYLAKEX;
#else
if(support_avx())
#ifndef NO_AVX2
return CPUTYPE_HASWELL;
#else
return CPUTYPE_SANDYBRIDGE;
#endif
else
return CPUTYPE_NEHALEM;
#endif
case 14:
// Skylake
if(support_avx())
@@ -1556,6 +1571,7 @@ static char *cpuname[] = {
"STEAMROLLER",
"EXCAVATOR",
"ZEN",
"SKYLAKEX"
};
static char *lowercpuname[] = {
@@ -1610,6 +1626,7 @@ static char *lowercpuname[] = {
"steamroller",
"excavator",
"zen",
"skylakex"
};
static char *corename[] = {
@@ -1641,6 +1658,7 @@ static char *corename[] = {
"STEAMROLLER",
"EXCAVATOR",
"ZEN",
"SKYLAKEX"
};
static char *corename_lower[] = {
@@ -1672,6 +1690,7 @@ static char *corename_lower[] = {
"steamroller",
"excavator",
"zen",
"skylakex"
};
@@ -1860,6 +1879,19 @@ int get_coretype(void){
else
return CORE_NEHALEM;
case 5:
// Skylake X
#ifndef NO_AVX512
return CORE_SKYLAKEX;
#else
if(support_avx())
#ifndef NO_AVX2
return CORE_HASWELL;
#else
return CORE_SANDYBRIDGE;
#endif
else
return CORE_NEHALEM;
#endif
case 14:
// Skylake
if(support_avx())

View File

@@ -60,6 +60,14 @@ OS_FREEBSD
OS_NETBSD
#endif
#if defined(__OpenBSD__)
OS_OPENBSD
#endif
#if defined(__DragonFly__)
OS_DRAGONFLY
#endif
#if defined(__sun)
OS_SUNOS
#endif

View File

@@ -362,7 +362,7 @@ cgemm_ct.$(SUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $(@F)
cgemm_cr.$(SUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR $< -o $(@F)
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
cgemm_cc.$(SUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@@ -410,7 +410,7 @@ zgemm_ct.$(SUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $(@F)
zgemm_cr.$(SUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR $< -o $(@F)
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
zgemm_cc.$(SUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@@ -458,7 +458,7 @@ xgemm_ct.$(SUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F)
xgemm_cr.$(SUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F)
$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
xgemm_cc.$(SUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@@ -558,7 +558,7 @@ cgemm_thread_ct.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCT $< -o $(@F)
cgemm_thread_cr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR $< -o $(@F)
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
cgemm_thread_cc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@@ -606,7 +606,7 @@ zgemm_thread_ct.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCT $< -o $(@F)
zgemm_thread_cr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR $< -o $(@F)
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
zgemm_thread_cc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@@ -654,7 +654,7 @@ xgemm_thread_ct.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F)
xgemm_thread_cr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F)
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
xgemm_thread_cc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@@ -1821,7 +1821,7 @@ cgemm3m_ct.$(SUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $(@F)
cgemm3m_cr.$(SUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR $< -o $(@F)
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
cgemm3m_cc.$(SUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@@ -1869,7 +1869,7 @@ zgemm3m_ct.$(SUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $(@F)
zgemm3m_cr.$(SUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR $< -o $(@F)
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
zgemm3m_cc.$(SUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@@ -1917,7 +1917,7 @@ xgemm3m_ct.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F)
xgemm3m_cr.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F)
$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
xgemm3m_cc.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@@ -1974,7 +1974,7 @@ cgemm3m_thread_ct.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCT $< -o $(@F)
cgemm3m_thread_cr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR $< -o $(@F)
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
cgemm3m_thread_cc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@@ -2022,7 +2022,7 @@ zgemm3m_thread_ct.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCT $< -o $(@F)
zgemm3m_thread_cr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR $< -o $(@F)
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
zgemm3m_thread_cc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@@ -2070,7 +2070,7 @@ xgemm3m_thread_ct.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F)
xgemm3m_thread_cr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F)
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
xgemm3m_thread_cc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@@ -2731,7 +2731,7 @@ cgemm_ct.$(PSUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $(@F)
cgemm_cr.$(PSUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR $< -o $(@F)
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
cgemm_cc.$(PSUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@@ -2779,7 +2779,7 @@ zgemm_ct.$(PSUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $(@F)
zgemm_cr.$(PSUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR $< -o $(@F)
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
zgemm_cc.$(PSUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@@ -2827,7 +2827,7 @@ xgemm_ct.$(PSUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F)
xgemm_cr.$(PSUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F)
$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
xgemm_cc.$(PSUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@@ -2927,7 +2927,7 @@ cgemm_thread_ct.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCT $< -o $(@F)
cgemm_thread_cr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR $< -o $(@F)
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
cgemm_thread_cc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@@ -2975,7 +2975,7 @@ zgemm_thread_ct.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCT $< -o $(@F)
zgemm_thread_cr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR $< -o $(@F)
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
zgemm_thread_cc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@@ -3023,7 +3023,7 @@ xgemm_thread_ct.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F)
xgemm_thread_cr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F)
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
xgemm_thread_cc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@@ -4190,7 +4190,7 @@ cgemm3m_ct.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $(@F)
cgemm3m_cr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR $< -o $(@F)
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
cgemm3m_cc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@@ -4238,7 +4238,7 @@ zgemm3m_ct.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $(@F)
zgemm3m_cr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR $< -o $(@F)
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
zgemm3m_cc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@@ -4286,7 +4286,7 @@ xgemm3m_ct.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F)
xgemm3m_cr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F)
$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
xgemm3m_cc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@@ -4343,7 +4343,7 @@ cgemm3m_thread_ct.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCT $< -o $(@F)
cgemm3m_thread_cr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR $< -o $(@F)
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
cgemm3m_thread_cc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@@ -4391,7 +4391,7 @@ zgemm3m_thread_ct.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCT $< -o $(@F)
zgemm3m_thread_cr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR $< -o $(@F)
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
zgemm3m_thread_cc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCC $< -o $(@F)
@@ -4439,7 +4439,7 @@ xgemm3m_thread_ct.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F)
xgemm3m_thread_cr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F)
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
xgemm3m_thread_cc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F)

View File

@@ -91,7 +91,12 @@
#endif
typedef struct {
volatile BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
#if __STDC_VERSION__ >= 201112L
_Atomic
#else
volatile
#endif
BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
} job_t;

View File

@@ -67,7 +67,12 @@
#endif
typedef struct {
volatile BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
#if __STDC_VERSION__ >= 201112L
_Atomic
#else
volatile
#endif
BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
} job_t;

View File

@@ -91,7 +91,12 @@
#endif
typedef struct {
volatile BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
#if __STDC_VERSION__ >= 201112L
_Atomic
#else
volatile
#endif
BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
} job_t;

View File

@@ -70,7 +70,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/*********************************************************************/
#include "common.h"
#if defined(OS_LINUX) || defined(OS_NETBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_SUNOS) || defined(OS_FREEBSD)
#if defined(OS_LINUX) || defined(OS_NETBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_SUNOS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY)
#include <dlfcn.h>
#include <signal.h>
#include <sys/resource.h>

View File

@@ -36,6 +36,7 @@
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
//#include <sys/mman.h>
@@ -49,11 +50,16 @@
int blas_server_avail = 0;
static void * blas_thread_buffer[MAX_CPU_NUMBER];
static void * blas_thread_buffer[MAX_PARALLEL_NUMBER][MAX_CPU_NUMBER];
#if __STDC_VERSION__ >= 201112L
static atomic_bool blas_buffer_inuse[MAX_PARALLEL_NUMBER];
#else
static _Bool blas_buffer_inuse[MAX_PARALLEL_NUMBER];
#endif
void goto_set_num_threads(int num_threads) {
int i=0;
int i=0, j=0;
if (num_threads < 1) num_threads = blas_num_threads;
@@ -68,15 +74,17 @@ void goto_set_num_threads(int num_threads) {
omp_set_num_threads(blas_cpu_number);
//adjust buffer for each thread
for(i=0; i<blas_cpu_number; i++){
if(blas_thread_buffer[i]==NULL){
blas_thread_buffer[i]=blas_memory_alloc(2);
for(i=0; i<MAX_PARALLEL_NUMBER; i++) {
for(j=0; j<blas_cpu_number; j++){
if(blas_thread_buffer[i][j]==NULL){
blas_thread_buffer[i][j]=blas_memory_alloc(2);
}
}
}
for(; i<MAX_CPU_NUMBER; i++){
if(blas_thread_buffer[i]!=NULL){
blas_memory_free(blas_thread_buffer[i]);
blas_thread_buffer[i]=NULL;
for(; j<MAX_CPU_NUMBER; j++){
if(blas_thread_buffer[i][j]!=NULL){
blas_memory_free(blas_thread_buffer[i][j]);
blas_thread_buffer[i][j]=NULL;
}
}
}
#if defined(ARCH_MIPS64)
@@ -92,30 +100,34 @@ void openblas_set_num_threads(int num_threads) {
int blas_thread_init(void){
int i=0;
int i=0, j=0;
blas_get_cpu_number();
blas_server_avail = 1;
for(i=0; i<blas_num_threads; i++){
blas_thread_buffer[i]=blas_memory_alloc(2);
}
for(; i<MAX_CPU_NUMBER; i++){
blas_thread_buffer[i]=NULL;
for(i=0; i<MAX_PARALLEL_NUMBER; i++) {
for(j=0; j<blas_num_threads; j++){
blas_thread_buffer[i][j]=blas_memory_alloc(2);
}
for(; j<MAX_CPU_NUMBER; j++){
blas_thread_buffer[i][j]=NULL;
}
}
return 0;
}
int BLASFUNC(blas_thread_shutdown)(void){
int i=0;
int i=0, j=0;
blas_server_avail = 0;
for(i=0; i<MAX_CPU_NUMBER; i++){
if(blas_thread_buffer[i]!=NULL){
blas_memory_free(blas_thread_buffer[i]);
blas_thread_buffer[i]=NULL;
for(i=0; i<MAX_PARALLEL_NUMBER; i++) {
for(j=0; j<MAX_CPU_NUMBER; j++){
if(blas_thread_buffer[i][j]!=NULL){
blas_memory_free(blas_thread_buffer[i][j]);
blas_thread_buffer[i][j]=NULL;
}
}
}
@@ -206,7 +218,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
}
}
static void exec_threads(blas_queue_t *queue){
static void exec_threads(blas_queue_t *queue, int buf_index){
void *buffer, *sa, *sb;
int pos=0, release_flag=0;
@@ -223,7 +235,7 @@ static void exec_threads(blas_queue_t *queue){
if ((sa == NULL) && (sb == NULL) && ((queue -> mode & BLAS_PTHREAD) == 0)) {
pos = omp_get_thread_num();
buffer = blas_thread_buffer[pos];
buffer = blas_thread_buffer[buf_index][pos];
//fallback
if(buffer==NULL) {
@@ -291,7 +303,7 @@ static void exec_threads(blas_queue_t *queue){
int exec_blas(BLASLONG num, blas_queue_t *queue){
BLASLONG i;
BLASLONG i, buf_index;
if ((num <= 0) || (queue == NULL)) return 0;
@@ -302,6 +314,23 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){
}
#endif
while(true) {
for(i=0; i < MAX_PARALLEL_NUMBER; i++) {
#if __STDC_VERSION__ >= 201112L
_Bool inuse = false;
if(atomic_compare_exchange_weak(&blas_buffer_inuse[i], &inuse, true)) {
#else
if(blas_buffer_inuse[i] == false) {
blas_buffer_inuse[i] = true;
#endif
buf_index = i;
break;
}
}
if(i != MAX_PARALLEL_NUMBER)
break;
}
#pragma omp parallel for schedule(static)
for (i = 0; i < num; i ++) {
@@ -309,9 +338,15 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){
queue[i].position = i;
#endif
exec_threads(&queue[i]);
exec_threads(&queue[i], buf_index);
}
#if __STDC_VERSION__ >= 201112L
atomic_store(&blas_buffer_inuse[buf_index], false);
#else
blas_buffer_inuse[buf_index] = false;
#endif
return 0;
}

View File

@@ -74,15 +74,22 @@ extern gotoblas_t gotoblas_STEAMROLLER;
extern gotoblas_t gotoblas_EXCAVATOR;
#ifdef NO_AVX2
#define gotoblas_HASWELL gotoblas_SANDYBRIDGE
#define gotoblas_SKYLAKEX gotoblas_SANDYBRIDGE
#define gotoblas_ZEN gotoblas_SANDYBRIDGE
#else
extern gotoblas_t gotoblas_HASWELL;
extern gotoblas_t gotoblas_ZEN;
#ifndef NO_AVX512
extern gotoblas_t gotoblas_SKYLAKEX;
#else
#define gotoblas_SKYLAKEX gotoblas_HASWELL
#endif
#endif
#else
//Use NEHALEM kernels for sandy bridge
#define gotoblas_SANDYBRIDGE gotoblas_NEHALEM
#define gotoblas_HASWELL gotoblas_NEHALEM
#define gotoblas_SKYLAKEX gotoblas_NEHALEM
#define gotoblas_BULLDOZER gotoblas_BARCELONA
#define gotoblas_PILEDRIVER gotoblas_BARCELONA
#define gotoblas_STEAMROLLER gotoblas_BARCELONA
@@ -284,8 +291,21 @@ static gotoblas_t *get_coretype(void){
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
}
}
if (model == 5) {
// Intel Skylake X
#ifndef NO_AVX512
return &gotoblas_SKYLAKEX;
#else
if(support_avx())
return &gotoblas_HASWELL;
else {
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
return &gotoblas_NEHALEM;
}
#endif
}
//Intel Skylake
if (model == 14 || model == 5) {
if (model == 14) {
if(support_avx())
return &gotoblas_HASWELL;
else{
@@ -445,7 +465,8 @@ static char *corename[] = {
"Haswell",
"Steamroller",
"Excavator",
"Zen"
"Zen",
"SkylakeX"
};
char *gotoblas_corename(void) {
@@ -473,7 +494,7 @@ char *gotoblas_corename(void) {
if (gotoblas == &gotoblas_STEAMROLLER) return corename[21];
if (gotoblas == &gotoblas_EXCAVATOR) return corename[22];
if (gotoblas == &gotoblas_ZEN) return corename[23];
if (gotoblas == &gotoblas_SKYLAKEX) return corename[24];
return corename[0];
}
@@ -485,7 +506,7 @@ static gotoblas_t *force_coretype(char *coretype){
char message[128];
//char mname[20];
for ( i=1 ; i <= 23; i++)
for ( i=1 ; i <= 24; i++)
{
if (!strncasecmp(coretype,corename[i],20))
{
@@ -503,6 +524,7 @@ static gotoblas_t *force_coretype(char *coretype){
switch (found)
{
case 24: return (&gotoblas_SKYLAKEX);
case 23: return (&gotoblas_ZEN);
case 22: return (&gotoblas_EXCAVATOR);
case 21: return (&gotoblas_STEAMROLLER);

View File

@@ -108,7 +108,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <sys/resource.h>
#endif
#if defined(OS_FREEBSD) || defined(OS_DARWIN)
#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN)
#include <sys/sysctl.h>
#include <sys/resource.h>
#endif
@@ -147,9 +147,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#elif (defined(OS_DARWIN) || defined(OS_SUNOS)) && defined(C_GCC)
#define CONSTRUCTOR __attribute__ ((constructor))
#define DESTRUCTOR __attribute__ ((destructor))
#else
#elif __GNUC__ && INIT_PRIORITY && ((GCC_VERSION >= 40300) || (CLANG_VERSION >= 20900))
#define CONSTRUCTOR __attribute__ ((constructor(101)))
#define DESTRUCTOR __attribute__ ((destructor(101)))
#else
#define CONSTRUCTOR __attribute__ ((constructor))
#define DESTRUCTOR __attribute__ ((destructor))
#endif
#ifdef DYNAMIC_ARCH
@@ -177,7 +180,7 @@ int get_num_procs(void) {
cpu_set_t *cpusetp;
size_t size;
int ret;
// int i,n;
int i,n;
if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);
#if !defined(OS_LINUX)
@@ -209,7 +212,8 @@ int ret;
size = CPU_ALLOC_SIZE(nums);
ret = sched_getaffinity(0,size,cpusetp);
if (ret!=0) return nums;
nums = CPU_COUNT_S(size,cpusetp);
ret = CPU_COUNT_S(size,cpusetp);
if (ret > 0 && ret < nums) nums = ret;
CPU_FREE(cpusetp);
return nums;
#endif
@@ -246,7 +250,7 @@ int get_num_procs(void) {
#endif
#if defined(OS_FREEBSD)
#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY)
int get_num_procs(void) {
@@ -336,7 +340,7 @@ extern int openblas_goto_num_threads_env();
extern int openblas_omp_num_threads_env();
int blas_get_cpu_number(void){
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID)
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
int max_num;
#endif
int blas_goto_num = 0;
@@ -344,7 +348,7 @@ int blas_get_cpu_number(void){
if (blas_num_threads) return blas_num_threads;
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID)
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
max_num = get_num_procs();
#endif
@@ -368,7 +372,7 @@ int blas_get_cpu_number(void){
else if (blas_omp_num > 0) blas_num_threads = blas_omp_num;
else blas_num_threads = MAX_CPU_NUMBER;
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID)
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
if (blas_num_threads > max_num) blas_num_threads = max_num;
#endif
@@ -455,11 +459,15 @@ static void *alloc_mmap(void *address){
}
if (map_address != (void *)-1) {
#if defined(SMP) && !defined(USE_OPENMP)
LOCK_COMMAND(&alloc_lock);
#endif
release_info[release_pos].address = map_address;
release_info[release_pos].func = alloc_mmap_free;
release_pos ++;
#if defined(SMP) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
#endif
}
#ifdef OS_LINUX
@@ -601,14 +609,18 @@ static void *alloc_mmap(void *address){
#if defined(OS_LINUX) && !defined(NO_WARMUP)
}
#endif
LOCK_COMMAND(&alloc_lock);
if (map_address != (void *)-1) {
#if defined(SMP) && !defined(USE_OPENMP)
LOCK_COMMAND(&alloc_lock);
#endif
release_info[release_pos].address = map_address;
release_info[release_pos].func = alloc_mmap_free;
release_pos ++;
#if defined(SMP) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
#endif
}
UNLOCK_COMMAND(&alloc_lock);
return map_address;
}
@@ -1007,6 +1019,11 @@ void *blas_memory_alloc(int procpos){
NULL,
};
void *(**func)(void *address);
#if defined(USE_OPENMP)
if (!memory_initialized) {
#endif
LOCK_COMMAND(&alloc_lock);
if (!memory_initialized) {
@@ -1042,6 +1059,9 @@ void *blas_memory_alloc(int procpos){
}
UNLOCK_COMMAND(&alloc_lock);
#if defined(USE_OPENMP)
}
#endif
#ifdef DEBUG
printf("Alloc Start ...\n");
@@ -1056,13 +1076,17 @@ void *blas_memory_alloc(int procpos){
do {
if (!memory[position].used && (memory[position].pos == mypos)) {
#if defined(SMP) && !defined(USE_OPENMP)
LOCK_COMMAND(&alloc_lock);
/* blas_lock(&memory[position].lock);*/
#else
blas_lock(&memory[position].lock);
#endif
if (!memory[position].used) goto allocation;
#if defined(SMP) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
/* blas_unlock(&memory[position].lock);*/
#else
blas_unlock(&memory[position].lock);
#endif
}
position ++;
@@ -1075,15 +1099,19 @@ void *blas_memory_alloc(int procpos){
position = 0;
do {
/* if (!memory[position].used) { */
#if defined(SMP) && !defined(USE_OPENMP)
LOCK_COMMAND(&alloc_lock);
/* blas_lock(&memory[position].lock);*/
#else
if (!memory[position].used) {
blas_lock(&memory[position].lock);
#endif
if (!memory[position].used) goto allocation;
#if defined(SMP) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
/* blas_unlock(&memory[position].lock);*/
/* } */
#else
blas_unlock(&memory[position].lock);
}
#endif
position ++;
@@ -1098,9 +1126,11 @@ void *blas_memory_alloc(int procpos){
#endif
memory[position].used = 1;
#if defined(SMP) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
/* blas_unlock(&memory[position].lock);*/
#else
blas_unlock(&memory[position].lock);
#endif
if (!memory[position].addr) {
do {
@@ -1146,9 +1176,13 @@ void *blas_memory_alloc(int procpos){
} while ((BLASLONG)map_address == -1);
#if defined(SMP) && !defined(USE_OPENMP)
LOCK_COMMAND(&alloc_lock);
#endif
memory[position].addr = map_address;
#if defined(SMP) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
#endif
#ifdef DEBUG
printf(" Mapping Succeeded. %p(%d)\n", (void *)memory[position].addr, position);
@@ -1202,8 +1236,9 @@ void blas_memory_free(void *free_area){
#endif
position = 0;
#if defined(SMP) && !defined(USE_OPENMP)
LOCK_COMMAND(&alloc_lock);
#endif
while ((position < NUM_BUFFERS) && (memory[position].addr != free_area))
position++;
@@ -1217,7 +1252,9 @@ void blas_memory_free(void *free_area){
WMB;
memory[position].used = 0;
#if defined(SMP) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
#endif
#ifdef DEBUG
printf("Unmap Succeeded.\n\n");
@@ -1232,8 +1269,9 @@ void blas_memory_free(void *free_area){
for (position = 0; position < NUM_BUFFERS; position++)
printf("%4ld %p : %d\n", position, memory[position].addr, memory[position].used);
#endif
#if defined(SMP) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
#endif
return;
}

View File

@@ -54,6 +54,9 @@ static char* openblas_config_str=""
#ifdef NO_AFFINITY
"NO_AFFINITY "
#endif
#ifdef USE_OPENMP
"USE_OPENMP "
#endif
#ifndef DYNAMIC_ARCH
CHAR_CORENAME
#endif
@@ -61,18 +64,23 @@ static char* openblas_config_str=""
#ifdef DYNAMIC_ARCH
char *gotoblas_corename();
static char tmp_config_str[256];
#endif
static char tmp_config_str[256];
int openblas_get_parallel();
char* CNAME() {
#ifndef DYNAMIC_ARCH
return openblas_config_str;
#else
char tmpstr[20];
strcpy(tmp_config_str, openblas_config_str);
#ifdef DYNAMIC_ARCH
strcat(tmp_config_str, gotoblas_corename());
return tmp_config_str;
#endif
if (openblas_get_parallel() == 0)
sprintf(tmpstr, " SINGLE_THREADED");
else
snprintf(tmpstr,19," MAX_THREADS=%d",MAX_CPU_NUMBER);
strcat(tmp_config_str, tmpstr);
return tmp_config_str;
}
@@ -83,3 +91,4 @@ char* openblas_get_corename() {
return gotoblas_corename();
#endif
}

View File

@@ -167,7 +167,7 @@ int get_L2_size(void){
#if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) || \
defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \
defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC) || \
defined(PILEDRIVER) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined(ZEN)
defined(PILEDRIVER) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined(ZEN) || defined(SKYLAKEX)
cpuid(0x80000006, &eax, &ebx, &ecx, &edx);
@@ -251,7 +251,7 @@ int get_L2_size(void){
void blas_set_parameter(void){
int factor;
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined(ZEN)
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined(ZEN) || defined(SKYLAKEX)
int size = 16;
#else
int size = get_L2_size();

View File

@@ -128,6 +128,8 @@ so : ../$(LIBSONAME)
ifeq ($(OSNAME), Android)
INTERNALNAME = $(LIBPREFIX).so
FEXTRALIB += -lm
EXTRALIB += -lm
else
INTERNALNAME = $(LIBPREFIX).so.$(MAJOR_VERSION)
endif
@@ -156,7 +158,7 @@ endif
endif
#http://stackoverflow.com/questions/7656425/makefile-ifeq-logical-or
ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD NetBSD))
ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD OpenBSD NetBSD DragonFly))
so : ../$(LIBSONAME)

View File

@@ -97,7 +97,7 @@ if ($compiler eq "") {
if ($data =~ /Intel/) {
$vendor = INTEL;
$openmp = "-openmp";
$openmp = "-fopenmp";
}
if ($data =~ /Sun Fortran/) {
@@ -127,7 +127,7 @@ if ($compiler eq "") {
# for a name with an embedded underscore, e.g. zho_ge, it may append 2 underscores.
$data = `$compiler -O2 -S ftest3.f > /dev/null 2>&1 && cat ftest3.s && rm -f ftest3.s`;
if ($data =~ /zho_ge__/) {
if ($data =~ / zho_ge__/) {
$need2bu = 1;
}
}
@@ -155,7 +155,7 @@ if ($compiler eq "") {
if ($compiler =~ /ifort/) {
$vendor = INTEL;
$bu = "_";
$openmp = "-openmp";
$openmp = "-fopenmp";
}
if ($compiler =~ /pathf/) {

View File

@@ -82,7 +82,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifdef OS_WINDOWS
#include <windows.h>
#endif
#if defined(__FreeBSD__) || defined(__APPLE__)
#if defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__DragonFly__) || defined(__APPLE__)
#include <sys/types.h>
#include <sys/sysctl.h>
#endif
@@ -326,6 +326,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CORENAME "HASWELL"
#endif
#ifdef FORCE_SKYLAKEX
#define FORCE
#define FORCE_INTEL
#define ARCHITECTURE "X86"
#define SUBARCHITECTURE "SKYLAKEX"
#define ARCHCONFIG "-DSKYLAKEX " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \
"-DFMA3 -DHAVE_AVX512VL -march=skylake-avx512"
#define LIBNAME "skylakex"
#define CORENAME "SKYLAKEX"
#endif
#ifdef FORCE_ATOM
#define FORCE
#define FORCE_INTEL
@@ -1074,7 +1089,7 @@ static int get_num_cores(void) {
#ifdef OS_WINDOWS
SYSTEM_INFO sysinfo;
#elif defined(__FreeBSD__) || defined(__APPLE__)
#elif defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__DragonFly__) || defined(__APPLE__)
int m[2], count;
size_t len;
#endif
@@ -1088,7 +1103,7 @@ static int get_num_cores(void) {
GetSystemInfo(&sysinfo);
return sysinfo.dwNumberOfProcessors;
#elif defined(__FreeBSD__) || defined(__APPLE__)
#elif defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__DragonFly__) || defined(__APPLE__)
m[0] = CTL_HW;
m[1] = HW_NCPU;
len = sizeof(int);
@@ -1181,9 +1196,7 @@ int main(int argc, char *argv[]){
#elif NO_PARALLEL_MAKE==1
printf("MAKE += -j 1\n");
#else
#ifndef OS_WINDOWS
printf("MAKE += -j %d\n", get_num_cores());
#endif
#endif
break;

View File

@@ -44,6 +44,7 @@
#endif
#ifndef COMPLEX
#define SMP_THRESHOLD_MIN 65536.0
#ifdef XDOUBLE
#define ERROR_NAME "QGEMM "
#elif defined(DOUBLE)
@@ -52,6 +53,7 @@
#define ERROR_NAME "SGEMM "
#endif
#else
#define SMP_THRESHOLD_MIN 8192.0
#ifndef GEMM3M
#ifdef XDOUBLE
#define ERROR_NAME "XGEMM "
@@ -121,8 +123,6 @@ void NAME(char *TRANSA, char *TRANSB,
FLOAT *sa, *sb;
#ifdef SMP
int nthreads_max;
int nthreads_avail;
double MNK;
#ifndef COMPLEX
#ifdef XDOUBLE
@@ -245,8 +245,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
XFLOAT *sa, *sb;
#ifdef SMP
int nthreads_max;
int nthreads_avail;
double MNK;
#ifndef COMPLEX
#ifdef XDOUBLE
@@ -411,25 +409,12 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
mode |= (transa << BLAS_TRANSA_SHIFT);
mode |= (transb << BLAS_TRANSB_SHIFT);
nthreads_max = num_cpu_avail(3);
nthreads_avail = nthreads_max;
#ifndef COMPLEX
MNK = (double) args.m * (double) args.n * (double) args.k;
if ( MNK <= (65536.0 * (double) GEMM_MULTITHREAD_THRESHOLD) )
nthreads_max = 1;
#else
MNK = (double) args.m * (double) args.n * (double) args.k;
if ( MNK <= (8192.0 * (double) GEMM_MULTITHREAD_THRESHOLD) )
nthreads_max = 1;
#endif
args.common = NULL;
if ( nthreads_max > nthreads_avail )
args.nthreads = nthreads_avail;
if ( MNK <= (SMP_THRESHOLD_MIN * (double) GEMM_MULTITHREAD_THRESHOLD) )
args.nthreads = 1;
else
args.nthreads = nthreads_max;
args.nthreads = num_cpu_avail(3);
args.common = NULL;
if (args.nthreads == 1) {
#endif

View File

@@ -64,6 +64,13 @@ void CNAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT dy1, FLOAT *dparam){
FLOAT du, dp1, dp2, dq2, dq1, dh11=ZERO, dh21=ZERO, dh12=ZERO, dh22=ZERO, dflag=-ONE, dtemp;
if (*dd2 == ZERO || dy1 == ZERO)
{
dflag = -TWO;
dparam[0] = dflag;
return;
}
if(*dd1 < ZERO)
{
dflag = -ONE;
@@ -76,6 +83,16 @@ void CNAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT dy1, FLOAT *dparam){
*dd2 = ZERO;
*dx1 = ZERO;
}
else if ((*dd1 == ZERO || *dx1 == ZERO) && *dd2 > ZERO)
{
dflag = ONE;
dh12 = 1;
dh21 = -1;
*dx1 = dy1;
dtemp = *dd1;
*dd1 = *dd2;
*dd2 = dtemp;
}
else
{
dp2 = *dd2 * dy1;
@@ -90,6 +107,9 @@ void CNAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT dy1, FLOAT *dparam){
dq1 = dp1 * *dx1;
if(ABS(dq1) > ABS(dq2))
{
dflag = ZERO;
dh11 = ONE;
dh22 = ONE;
dh21 = - dy1 / *dx1;
dh12 = dp2 / dp1;
@@ -100,8 +120,19 @@ void CNAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT dy1, FLOAT *dparam){
*dd1 = *dd1 / du;
*dd2 = *dd2 / du;
*dx1 = *dx1 * du;
} else {
dflag = -ONE;
dh11 = ZERO;
dh12 = ZERO;
dh21 = ZERO;
dh22 = ZERO;
*dd1 = ZERO;
*dd2 = ZERO;
*dx1 = ZERO;
}
}
else
{
@@ -120,7 +151,9 @@ void CNAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT dy1, FLOAT *dparam){
}
else
{
dflag = ONE;
dflag = ONE;
dh21 = -ONE;
dh12 = ONE;
dh11 = dp1 / dp2;
dh22 = *dx1 / dy1;
@@ -134,76 +167,33 @@ void CNAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT dy1, FLOAT *dparam){
}
if(*dd1 != ZERO)
while ( *dd1 <= RGAMSQ && *dd1 != ZERO)
{
if( (*dd1 <= RGAMSQ) || (*dd1 >= GAMSQ) )
{
if(dflag == ZERO)
{
dh11 = ONE;
dh22 = ONE;
dflag = -ONE;
}
else
{
dh21 = -ONE;
dh12 = ONE;
dflag = -ONE;
}
if( *dd1 <= RGAMSQ )
{
while (ABS(*dd1) <= RGAMSQ) {
*dd1 = *dd1 * (GAM * GAM);
*dx1 = *dx1 / GAM;
dh11 = dh11 / GAM;
dh12 = dh12 / GAM;
}
}
else
{
while (ABS(*dd1) >= GAMSQ) {
*dd1 = *dd1 / (GAM * GAM);
*dx1 = *dx1 * GAM;
dh11 = dh11 * GAM;
dh12 = dh12 * GAM;
}
}
}
dflag = -ONE;
*dd1 = *dd1 * (GAM * GAM);
*dx1 = *dx1 / GAM;
dh11 = dh11 / GAM;
dh12 = dh12 / GAM;
}
while (ABS(*dd1) > GAMSQ) {
dflag = -ONE;
*dd1 = *dd1 / (GAM * GAM);
*dx1 = *dx1 * GAM;
dh11 = dh11 * GAM;
dh12 = dh12 * GAM;
}
if(*dd2 != ZERO)
{
if( (ABS(*dd2) <= RGAMSQ) || (ABS(*dd2) >= GAMSQ) )
{
if(dflag == ZERO)
{
dh11 = ONE;
dh22 = ONE;
dflag = -ONE;
}
else
{
dh21 = -ONE;
dh12 = ONE;
dflag = -ONE;
}
if( ABS(*dd2) <= RGAMSQ )
{
while (ABS(*dd2) <= RGAMSQ) {
*dd2 = *dd2 * (GAM * GAM);
dh21 = dh21 / GAM;
dh22 = dh22 / GAM;
}
}
else
{
while (ABS(*dd2) >= GAMSQ) {
*dd2 = *dd2 / (GAM * GAM);
dh21 = dh21 * GAM;
dh22 = dh22 * GAM;
}
}
}
while (ABS(*dd2) <= RGAMSQ && *dd2 != ZERO) {
dflag = -ONE;
*dd2 = *dd2 * (GAM * GAM);
dh21 = dh21 / GAM;
dh22 = dh22 / GAM;
}
while (ABS(*dd2) > GAMSQ) {
dflag = -ONE;
*dd2 = *dd2 / (GAM * GAM);
dh21 = dh21 * GAM;
dh22 = dh22 * GAM;
}
}

View File

@@ -366,12 +366,13 @@ void CNAME(enum CBLAS_ORDER order,
mode |= (trans << BLAS_TRANSA_SHIFT);
mode |= (side << BLAS_RSIDE_SHIFT);
args.nthreads = num_cpu_avail(3);
if ( args.m < 2*GEMM_MULTITHREAD_THRESHOLD )
args.nthreads = 1;
else
if ( args.n < 2*GEMM_MULTITHREAD_THRESHOLD )
args.nthreads = 1;
else
args.nthreads = num_cpu_avail(3);
if (args.nthreads == 1) {

View File

@@ -41,7 +41,11 @@
#ifdef FUNCTION_PROFILE
#include "functable.h"
#endif
#if defined(Z13)
#define MULTI_THREAD_MINIMAL 200000
#else
#define MULTI_THREAD_MINIMAL 10000
#endif
#ifndef CBLAS
void NAME(blasint *N, FLOAT *ALPHA, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY){
@@ -69,7 +73,7 @@ void CNAME(blasint n, FLOAT *ALPHA, FLOAT *x, blasint incx, FLOAT *y, blasint in
#endif
#ifndef CBLAS
PRINT_DEBUG_CNAME;
PRINT_DEBUG_NAME;
#else
PRINT_DEBUG_CNAME;
#endif
@@ -93,6 +97,11 @@ void CNAME(blasint n, FLOAT *ALPHA, FLOAT *x, blasint incx, FLOAT *y, blasint in
if (incx == 0 || incy == 0)
nthreads = 1;
//Work around the low performance issue with small imput size &
//multithreads.
if (n <= MULTI_THREAD_MINIMAL) {
nthreads = 1;
}
if (nthreads == 1) {
#endif

View File

@@ -239,6 +239,9 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
} else
nthreads = 1;
/* FIXME TRMV multithreading appears to be broken, see issue 1332*/
nthreads = 1;
if(nthreads > 1) {
buffer_size = n > 16 ? 0 : n * 4 + 40;
}

View File

@@ -121,7 +121,7 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
# Makefile.L3
set(USE_TRMM false)
if (ARM OR ARM64 OR "${TARGET_CORE}" STREQUAL "LONGSOON3B" OR "${TARGET_CORE}" STREQUAL "GENERIC" OR "${CORE}" STREQUAL "generic" OR "${TARGET_CORE}" STREQUAL "HASWELL" OR "${CORE}" STREQUAL "haswell" OR "${CORE}" STREQUAL "zen")
if (ARM OR ARM64 OR "${TARGET_CORE}" STREQUAL "LONGSOON3B" OR "${TARGET_CORE}" STREQUAL "GENERIC" OR "${CORE}" STREQUAL "generic" OR "${TARGET_CORE}" STREQUAL "HASWELL" OR "${CORE}" STREQUAL "haswell" OR "${CORE}" STREQUAL "zen" OR "${TARGET_CORE}" STREQUAL "SKYLAKEX" OR "${CORE}" STREQUAL "skylakex")
set(USE_TRMM true)
endif ()

View File

@@ -29,9 +29,11 @@ USE_TRMM = 1
endif
ifeq ($(CORE), HASWELL)
ifeq ($(ARCH), x86_64)
USE_TRMM = 1
endif
ifeq ($(CORE), SKYLAKEX)
USE_TRMM = 1
endif
ifeq ($(CORE), ZEN)

View File

@@ -49,6 +49,7 @@ SDOTKERNEL = ../arm/dot.c
DDOTKERNEL = ../arm/dot.c
CDOTKERNEL = ../arm/zdot.c
ZDOTKERNEL = ../arm/zdot.c
DSDOTKERNEL = ../generic/dot.c
SNRM2KERNEL = ../arm/nrm2.c
DNRM2KERNEL = ../arm/nrm2.c

View File

@@ -440,13 +440,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
cmp N, #0
ble axpy_kernel_L999
/*
cmp INC_X, #0
beq axpy_kernel_L999
cmp INC_Y, #0
beq axpy_kernel_L999
*/
cmp INC_X, #1
bne axpy_kernel_S_BEGIN

View File

@@ -483,13 +483,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
cmp N, #0
ble rot_kernel_L999
/*
cmp INC_X, #0
beq rot_kernel_L999
cmp INC_Y, #0
beq rot_kernel_L999
*/
cmp INC_X, #1
bne rot_kernel_S_BEGIN
@@ -584,6 +584,12 @@ rot_kernel_S1:
rot_kernel_S10:
KERNEL_S1
cmp INC_X, #0
beq rot_kernel_L999
cmp INC_Y, #0
beq rot_kernel_L999
subs I, I, #1
bne rot_kernel_S10

View File

@@ -49,6 +49,7 @@ SDOTKERNEL = dot.S
DDOTKERNEL = dot.S
CDOTKERNEL = zdot.S
ZDOTKERNEL = zdot.S
DSDOTKERNEL = dot.S
SNRM2KERNEL = nrm2.S
DNRM2KERNEL = nrm2.S

View File

@@ -29,6 +29,7 @@ SDOTKERNEL = dot.S
DDOTKERNEL = dot.S
CDOTKERNEL = zdot.S
ZDOTKERNEL = zdot.S
DSDOTKERNEL = dot.S
SNRM2KERNEL = nrm2.S
DNRM2KERNEL = nrm2.S

View File

@@ -74,8 +74,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if !defined(DSDOT)
fmadd DOTF, TMPX, TMPY, DOTF
#else // DSDOT
fmul TMPX, TMPX, TMPY
fcvt d3, TMPY
fcvt d2, TMPX
fmul d2, d2, d3
fadd DOTF, DOTF, d2
#endif
.endm
@@ -87,12 +88,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if !defined(DSDOT)
fmla v0.4s, v2.4s, v3.4s
#else
fmul v2.4s, v2.4s, v3.4s
ext v3.16b, v2.16b, v2.16b, #8
fcvtl v2.2d, v2.2s
fcvtl2 v5.2d, v3.4s
fcvtl2 v4.2d, v2.4s
fcvtl v3.2d, v3.2s
fcvtl v2.2d, v2.2s
fmul v4.2d, v4.2d, v5.2d
fmul v2.2d, v2.2d, v3.2d
fadd v2.2d, v2.2d, v4.2d
fadd v0.2d, v0.2d, v2.2d
fadd v0.2d, v0.2d, v3.2d
#endif
#else //DOUBLE
ld1 {v2.2d, v3.2d}, [X], #32
@@ -136,8 +139,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if !defined(DSDOT)
fmadd DOTF, TMPX, TMPY, DOTF
#else // DSDOT
fmul TMPX, TMPX, TMPY
fcvt d3, TMPY
fcvt d2, TMPX
fmul d2, d2, d3
fadd DOTF, DOTF, d2
#endif
.endm

View File

@@ -116,22 +116,34 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (m & 1) {
if (X > posY) {
/* ao1 += 1;
ao2 += 1; */
ao1 += 1;
ao2 += 1;
b += 2;
} else
#ifdef UNIT
if (X < posY) {
#endif
b[ 0] = *(ao1 + 0);
#ifdef UNIT
data01 = *(ao1 + 0);
data02 = *(ao1 + 1);
b[ 0] = data01;
b[ 1] = data02;
ao1 += lda;
b += 2;
} else {
#ifdef UNIT
data02 = *(ao1 + 1);
b[ 0] = ONE;
b[ 1] = data02;
#else
data01 = *(ao1 + 0);
data02 = *(ao1 + 1);
b[ 0] = data01;
b[ 1] = data02;
#endif
ao1 += 2;
b += 2;
}
#endif
b[ 1] = *(ao1 + 1);
b += 2;
}
posY += 2;
@@ -178,7 +190,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
} while (i > 0);
}
// posY += 1;
posY += 1;
}
return 0;

View File

@@ -518,7 +518,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
i = (m & 15);
if (i > 0) {
if (X < posY) {
/* a01 += i;
a01 += i;
a02 += i;
a03 += i;
a04 += i;
@@ -533,7 +533,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
a13 += i;
a14 += i;
a15 += i;
a16 += i; */
a16 += i;
b += 16 * i;
} else
if (X > posY) {
@@ -1130,14 +1130,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
i = (m & 7);
if (i > 0) {
if (X < posY) {
/* a01 += i;
a01 += i;
a02 += i;
a03 += i;
a04 += i;
a05 += i;
a06 += i;
a07 += i;
a08 += i; */
a08 += i;
b += 8 * i;
} else
if (X > posY) {
@@ -1156,13 +1156,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b += 8;
}
/* a02 += i * lda;
a02 += i * lda;
a03 += i * lda;
a04 += i * lda;
a05 += i * lda;
a06 += i * lda;
a07 += i * lda;
a08 += i * lda; */
a08 += i * lda;
} else {
#ifdef UNIT
b[ 0] = ONE;
@@ -1371,10 +1371,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
i = (m & 3);
if (i > 0) {
if (X < posY) {
/* a01 += i;
a01 += i;
a02 += i;
a03 += i;
a04 += i; */
a04 += i;
b += 4 * i;
} else
if (X > posY) {
@@ -1387,9 +1387,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
a01 += lda;
b += 4;
}
/* a02 += lda;
a02 += lda;
a03 += lda;
a04 += lda; */
a04 += lda;
} else {
#ifdef UNIT
@@ -1487,19 +1487,23 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (X < posY) {
a01 ++;
a02 ++;
} else {
#ifdef UNIT
b += 2;
} else
if (X > posY) {
#endif
b[ 0] = *(a01 + 0);
#ifdef UNIT
b[ 1] = *(a01 + 1);
a01 += lda;
b += 2;
} else {
#ifdef UNIT
b[ 0] = ONE;
}
b[ 1] = *(a01 + 1);
#else
b[ 0] = *(a01 + 0);
b[ 1] = *(a01 + 1);
#endif
b[ 1] = *(a01 + 1);
}
b += 2;
b += 2;
}
}
posY += 2;
}
@@ -1518,25 +1522,28 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (i > 0) {
do {
if (X < posY) {
a01 ++;
} else {
#ifdef UNIT
a01 += 1;
b ++;
} else
if (X > posY) {
#endif
b[ 0] = *(a01 + 0);
#ifdef UNIT
a01 += lda;
b ++;
} else {
#ifdef UNIT
b[ 0] = ONE;
}
#else
b[ 0] = *(a01 + 0);
#endif
a01 += lda;
}
b ++;
X ++;
i --;
a01 += lda;
b ++;
}
X += 1;
i --;
} while (i > 0);
}
// posY += 1;
posY += 1;
}
return 0;

View File

@@ -117,8 +117,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (m & 1) {
if (X < posY) {
/* ao1 += 1;
ao2 += 1; */
ao1 += 1;
ao2 += 1;
b += 2;
} else
if (X > posY) {
@@ -127,7 +127,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 0] = data01;
b[ 1] = data02;
// ao1 += lda;
ao1 += lda;
b += 2;
} else {
#ifdef UNIT
@@ -139,7 +139,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 0] = data01;
b[ 1] = ZERO;
#endif
// ao1 += lda;
ao1 += lda;
b += 2;
}
}
@@ -161,18 +161,27 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
i = m;
if (m > 0) {
do {
if (X < posY) {
b += 1;
ao1 += 1;
} else
if (X > posY) {
data01 = *(ao1 + 0);
b[ 0] = data01;
b += 1;
ao1 += lda;
} else {
#ifdef UNIT
if (X > posY) {
b[ 0] = ONE;
#else
data01 = *(ao1 + 0);
b[ 0] = data01;
#endif
b[ 0] = *(ao1 + 0);
#ifdef UNIT
} else {
b[ 0] = ONE;
}
#endif
b ++;
ao1 += lda;
X ++;
b += 1;
ao1 += lda;
}
X += 1;
i --;
} while (i > 0);
}

View File

@@ -201,18 +201,18 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (X < posY) {
if (m & 2) {
/* ao1 += 2;
ao1 += 2;
ao2 += 2;
ao3 += 2;
ao4 += 2; */
ao4 += 2;
b += 8;
}
if (m & 1) {
/* ao1 += 1;
ao1 += 1;
ao2 += 1;
ao3 += 1;
ao4 += 1; */
ao4 += 1;
b += 4;
}
@@ -238,7 +238,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 7] = data08;
ao1 += 2 * lda;
// ao2 += 2 * lda;
ao2 += 2 * lda;
b += 8;
}
@@ -253,7 +253,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 2] = data03;
b[ 3] = data04;
// ao1 += lda;
ao1 += lda;
b += 4;
}
@@ -401,7 +401,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (i) {
if (X < posY) {
// ao1 += 2;
ao1 += 2;
b += 2;
} else
if (X > posY) {
@@ -411,7 +411,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 0] = data01;
b[ 1] = data02;
// ao1 += lda;
ao1 += lda;
b += 2;
} else {
#ifdef UNIT
@@ -443,21 +443,26 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
do {
if (X < posY) {
b += 1;
ao1 += 1;
} else {
#ifdef UNIT
} else
if (X > posY) {
#endif
b[ 0] = *(ao1 + 0);
#ifdef UNIT
data01 = *(ao1 + 0);
b[ 0] = data01;
ao1 += lda;
b += 1;
} else {
#ifdef UNIT
b[ 0] = ONE;
}
#else
data01 = *(ao1 + 0);
b[ 0] = data01;
#endif
ao1 += lda;
}
b ++;
X ++;
ao1 += lda;
b += 1;
}
X += 1;
i --;
} while (i > 0);
}

View File

@@ -206,7 +206,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
}
a1 += 2 * lda;
// a2 += 2 * lda;
a2 += 2 * lda;
b += 8;
ii += 2;

View File

@@ -139,18 +139,48 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
}
if (m & 1) {
#ifdef UNIT
if (X > posY) {
ao1 += 2;
ao2 += 2;
b += 4;
} else
if (X < posY) {
#endif
b[ 0] = *(ao1 + 0);
b[ 1] = *(ao1 + 1);
#ifdef UNIT
data1 = *(ao1 + 0);
data2 = *(ao1 + 1);
data3 = *(ao1 + 2);
data4 = *(ao1 + 3);
b[ 0] = data1;
b[ 1] = data2;
b[ 2] = data3;
b[ 3] = data4;
ao1 += lda;
b += 4;
} else {
#ifdef UNIT
data3 = *(ao1 + 2);
data4 = *(ao1 + 3);
b[ 0] = ONE;
b[ 1] = ZERO;
}
b[ 2] = data3;
b[ 3] = data4;
#else
data1 = *(ao1 + 0);
data2 = *(ao1 + 1);
data3 = *(ao1 + 2);
data4 = *(ao1 + 3);
b[ 0] = data1;
b[ 1] = data2;
b[ 2] = data3;
b[ 3] = data4;
#endif
b += 4;
b += 4;
}
}
posY += 2;
@@ -203,7 +233,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
} while (i > 0);
}
// posY += 1;
posY += 1;
}
return 0;

View File

@@ -43,7 +43,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
BLASLONG i, ii, j, jj;
FLOAT data01 = 0.0, data02 = 0.0;
FLOAT data01, data02;
FLOAT *a1;
lda *= 2;

View File

@@ -43,8 +43,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
BLASLONG i, ii, j, jj;
FLOAT data01 = 0.0, data02 = 0.0, data03, data04;
FLOAT data05, data06, data07 = 0.0, data08 = 0.0;
FLOAT data01, data02, data03, data04;
FLOAT data05, data06, data07, data08;
FLOAT *a1, *a2;
lda *= 2;

1
kernel/mips/KERNEL.1004K Normal file
View File

@@ -0,0 +1 @@
include $(KERNELDIR)/KERNEL.P5600

View File

@@ -38,8 +38,8 @@ ZASUMKERNEL = ../mips/zasum_msa.c
else
SASUMKERNEL = ../mips/asum.c
DASUMKERNEL = ../mips/asum.c
CASUMKERNEL = ../mips/asum.c
ZASUMKERNEL = ../mips/asum.c
CASUMKERNEL = ../mips/zasum.c
ZASUMKERNEL = ../mips/zasum.c
endif
ifdef HAVE_MSA
@@ -253,4 +253,4 @@ ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
endif
endif

View File

@@ -484,10 +484,10 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *A,
FLOAT *pa0, *pa1, *pa2, *pa3, *pa4, *pa5, *pa6, *pa7;
FLOAT temp, temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
v2f64 v_alpha;
v2f64 x0, x1, x2, x3, y0 = 0.0, y1 = 0.0, y2 = 0.0, y3 = 0.0;
v2f64 x0, x1, x2, x3, y0 = {0,0}, y1 = {0,0}, y2 = {0,0}, y3 = {0,0};
v2f64 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15;
v2f64 t16, t17, t18, t19, t20, t21, t22, t23, t24, t25, t26, t27, t28, t29;
v2f64 t30, t31, tp0 = 0.0, tp1 = 0.0, tp2 = 0.0, tp3 = 0.0, tp4 = 0.0, tp5 = 0.0, tp6 = 0.0, tp7 = 0.0;
v2f64 t30, t31, tp0 = {0,0}, tp1 = {0,0}, tp2 = {0,0}, tp3 = {0,0}, tp4 = {0,0}, tp5 = {0,0}, tp6 = {0,0}, tp7 = {0,0};
v_alpha = COPY_DOUBLE_TO_VECTOR(alpha);

View File

@@ -41,8 +41,11 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
while(i < n)
{
dot += y[iy] * x[ix] ;
#if defined(DSDOT)
dot += (double)y[iy] * (double)x[ix] ;
#else
dot += y[iy] * x[ix];
#endif
ix += inc_x ;
iy += inc_y ;
i++ ;

View File

@@ -423,9 +423,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *A,
FLOAT *y_org = y;
FLOAT *pa0, *pa1, *pa2, *pa3, *pa4, *pa5, *pa6, *pa7;
FLOAT temp, temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
v4f32 v_alpha, x0, x1, y0 = 0.0, y1 = 0.0;
v4f32 v_alpha, x0, x1, y0 = {0,0,0,0}, y1 = {0,0,0,0};
v4f32 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15;
v4f32 tp0 = 0.0, tp1 = 0.0, tp2 = 0.0, tp3 = 0.0, tp4 = 0.0, tp5 = 0.0, tp6 = 0.0, tp7 = 0.0;
v4f32 tp0 = {0,0,0,0}, tp1 = {0,0,0,0}, tp2 = {0,0,0,0}, tp3 = {0,0,0,0}, tp4 = {0,0,0,0}, tp5 = {0,0,0,0}, tp6 = {0,0,0,0}, tp7 = {0,0,0,0};
v_alpha = COPY_FLOAT_TO_VECTOR(alpha);

View File

@@ -54,3 +54,6 @@ ZTRSMKERNEL_LN = ztrsm_kernel_power6_LN.S
ZTRSMKERNEL_LT = ztrsm_kernel_power6_LT.S
ZTRSMKERNEL_RN = ztrsm_kernel_power6_LT.S
ZTRSMKERNEL_RT = ztrsm_kernel_power6_RT.S
CROTKERNEL = ../arm/zrot.c
ZROTKERNEL = ../arm/zrot.c

View File

@@ -90,14 +90,14 @@ ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
#DMINKERNEL = ../arm/min.c
#
#ISAMAXKERNEL = ../arm/iamax.c
#IDAMAXKERNEL = ../arm/iamax.c
IDAMAXKERNEL = idamax.c
#ICAMAXKERNEL = ../arm/izamax.c
#IZAMAXKERNEL = ../arm/izamax.c
IZAMAXKERNEL = izamax.c
#
#ISAMINKERNEL = ../arm/iamin.c
#IDAMINKERNEL = ../arm/iamin.c
IDAMINKERNEL = idamin.c
#ICAMINKERNEL = ../arm/izamin.c
#IZAMINKERNEL = ../arm/izamin.c
IZAMINKERNEL = izamin.c
#
#ISMAXKERNEL = ../arm/imax.c
#IDMAXKERNEL = ../arm/imax.c
@@ -133,8 +133,8 @@ ZNRM2KERNEL = ../arm/znrm2.c
#
SROTKERNEL = srot.c
DROTKERNEL = drot.c
#CROTKERNEL = ../arm/zrot.c
#ZROTKERNEL = ../arm/zrot.c
CROTKERNEL = zrot.c
ZROTKERNEL = zrot.c
#
SSCALKERNEL = sscal.c
DSCALKERNEL = dscal.c
@@ -150,12 +150,12 @@ ZSWAPKERNEL = zswap.c
#SGEMVNKERNEL = ../arm/gemv_n.c
DGEMVNKERNEL = dgemv_n.c
#CGEMVNKERNEL = ../arm/zgemv_n.c
#ZGEMVNKERNEL = ../arm/zgemv_n.c
ZGEMVNKERNEL = zgemv_n_4.c
#
#SGEMVTKERNEL = ../arm/gemv_t.c
#DGEMVTKERNEL = ../arm/gemv_t.c
DGEMVTKERNEL = dgemv_t.c
#CGEMVTKERNEL = ../arm/zgemv_t.c
#ZGEMVTKERNEL = zgemv_t_4.c
ZGEMVTKERNEL = zgemv_t_4.c
#SSYMV_U_KERNEL = ../generic/symv_k.c

View File

@@ -110,7 +110,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "cgemm_tcopy_macros_8_power8.S"
#define STACKSIZE 576
#define STACKSIZE 144
PROLOGUE
@@ -119,49 +119,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi SP, SP, -STACKSIZE
li r0, 0
std r31, 144(SP)
std r30, 152(SP)
std r29, 160(SP)
std r28, 168(SP)
std r27, 176(SP)
std r26, 184(SP)
std r25, 192(SP)
std r24, 200(SP)
std r23, 208(SP)
std r22, 216(SP)
std r21, 224(SP)
std r20, 232(SP)
std r19, 240(SP)
std r18, 248(SP)
std r17, 256(SP)
std r16, 264(SP)
std r15, 272(SP)
std r14, 280(SP)
addi r11, SP, 288
stvx v20, r11, r0
addi r11, r11, 16
stvx v21, r11, r0
addi r11, r11, 16
stvx v22, r11, r0
addi r11, r11, 16
stvx v23, r11, r0
addi r11, r11, 16
stvx v24, r11, r0
addi r11, r11, 16
stvx v25, r11, r0
addi r11, r11, 16
stvx v26, r11, r0
addi r11, r11, 16
stvx v27, r11, r0
addi r11, r11, 16
stvx v28, r11, r0
addi r11, r11, 16
stvx v29, r11, r0
addi r11, r11, 16
stvx v30, r11, r0
addi r11, r11, 16
stvx v31, r11, r0
li r11, 0
std r14, 0(SP)
std r15, 8(SP)
std r16, 16(SP)
std r17, 24(SP)
std r18, 32(SP)
std r19, 40(SP)
std r20, 48(SP)
std r21, 56(SP)
std r22, 64(SP)
std r23, 72(SP)
std r24, 80(SP)
std r25, 88(SP)
std r26, 96(SP)
std r27, 104(SP)
std r28, 112(SP)
std r29, 120(SP)
std r30, 128(SP)
std r31, 136(SP)
cmpwi cr0, M, 0
ble- L999
@@ -203,51 +178,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
L999:
li r3, 0
ld r31, 144(SP)
ld r30, 152(SP)
ld r29, 160(SP)
ld r28, 168(SP)
ld r27, 176(SP)
ld r26, 184(SP)
ld r25, 192(SP)
ld r24, 200(SP)
ld r23, 208(SP)
ld r22, 216(SP)
ld r21, 224(SP)
ld r20, 232(SP)
ld r19, 240(SP)
ld r18, 248(SP)
ld r17, 256(SP)
ld r16, 264(SP)
ld r15, 272(SP)
ld r14, 280(SP)
addi r11, SP, 288
lvx v20, r11, r3
addi r11, r11, 16
lvx v21, r11, r3
addi r11, r11, 16
lvx v22, r11, r3
addi r11, r11, 16
lvx v23, r11, r3
addi r11, r11, 16
lvx v24, r11, r3
addi r11, r11, 16
lvx v25, r11, r3
addi r11, r11, 16
lvx v26, r11, r3
addi r11, r11, 16
lvx v27, r11, r3
addi r11, r11, 16
lvx v28, r11, r3
addi r11, r11, 16
lvx v29, r11, r3
addi r11, r11, 16
lvx v30, r11, r3
addi r11, r11, 16
lvx v31, r11, r3
li r11, 0
ld r14, 0(SP)
ld r15, 8(SP)
ld r16, 16(SP)
ld r17, 24(SP)
ld r18, 32(SP)
ld r19, 40(SP)
ld r20, 48(SP)
ld r21, 56(SP)
ld r22, 64(SP)
ld r23, 72(SP)
ld r24, 80(SP)
ld r25, 88(SP)
ld r26, 96(SP)
ld r27, 104(SP)
ld r28, 112(SP)
ld r29, 120(SP)
ld r30, 128(SP)
ld r31, 136(SP)
addi SP, SP, STACKSIZE
blr

View File

@@ -109,81 +109,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "dgemm_ncopy_macros_4_power8.S"
#define STACKSIZE 384
#define STACKSIZE 576
#define STACKSIZE 144
PROLOGUE
PROFCODE
addi SP, SP, -STACKSIZE
//addi SP, SP, -208
li r0, 0
stfd f14, 0(SP)
stfd f15, 8(SP)
stfd f16, 16(SP)
stfd f17, 24(SP)
stfd f18, 32(SP)
stfd f19, 40(SP)
stfd f20, 48(SP)
stfd f21, 56(SP)
stfd f22, 64(SP)
stfd f23, 72(SP)
stfd f24, 80(SP)
stfd f25, 88(SP)
stfd f26, 96(SP)
stfd f27, 104(SP)
stfd f28, 112(SP)
stfd f29, 120(SP)
stfd f30, 128(SP)
stfd f31, 136(SP)
std r14, 0(SP)
std r15, 8(SP)
std r16, 16(SP)
std r17, 24(SP)
std r18, 32(SP)
std r19, 40(SP)
std r20, 48(SP)
std r21, 56(SP)
std r22, 64(SP)
std r23, 72(SP)
std r24, 80(SP)
std r25, 88(SP)
std r26, 96(SP)
std r27, 104(SP)
std r28, 112(SP)
std r29, 120(SP)
std r30, 128(SP)
std r31, 136(SP)
std r31, 144(SP)
std r30, 152(SP)
std r29, 160(SP)
std r28, 168(SP)
std r27, 176(SP)
std r26, 184(SP)
std r25, 192(SP)
std r24, 200(SP)
std r23, 208(SP)
std r22, 216(SP)
std r21, 224(SP)
std r20, 232(SP)
std r19, 240(SP)
std r18, 248(SP)
std r17, 256(SP)
std r16, 264(SP)
std r15, 272(SP)
std r14, 280(SP)
addi r11,SP,288
stvx v20, r11,r0
addi r11,r11,16
stvx v21, r11,r0
addi r11,r11,16
stvx v22, r11,r0
addi r11,r11,16
stvx v23, r11,r0
addi r11,r11,16
stvx v24, r11,r0
addi r11,r11,16
stvx v25, r11,r0
addi r11,r11,16
stvx v26, r11,r0
addi r11,r11,16
stvx v27, r11,r0
addi r11,r11,16
stvx v28, r11,r0
addi r11,r11,16
stvx v29, r11,r0
addi r11,r11,16
stvx v30, r11,r0
addi r11,r11,16
stvx v31, r11,r0
li r11,0
cmpwi cr0, M, 0
ble- L999
cmpwi cr0, N, 0
@@ -191,10 +146,8 @@ li r11,0
slwi LDA, LDA, BASE_SHIFT
//li PREA, 384
//li PREB, 384
li PREA, 576
li PREB, 576
li PREA, 384
li PREB, 384
li o8, 8
@@ -210,70 +163,24 @@ li r11,0
L999:
li r3, 0
lfd f14, 0(SP)
lfd f15, 8(SP)
lfd f16, 16(SP)
lfd f17, 24(SP)
lfd f18, 32(SP)
lfd f19, 40(SP)
lfd f20, 48(SP)
lfd f21, 56(SP)
lfd f22, 64(SP)
lfd f23, 72(SP)
lfd f24, 80(SP)
lfd f25, 88(SP)
lfd f26, 96(SP)
lfd f27, 104(SP)
lfd f28, 112(SP)
lfd f29, 120(SP)
lfd f30, 128(SP)
lfd f31, 136(SP)
ld r31, 144(SP)
ld r30, 152(SP)
ld r29, 160(SP)
ld r28, 168(SP)
ld r27, 176(SP)
ld r26, 184(SP)
ld r25, 192(SP)
ld r24, 200(SP)
ld r23, 208(SP)
ld r22, 216(SP)
ld r21, 224(SP)
ld r20, 232(SP)
ld r19, 240(SP)
ld r18, 248(SP)
ld r17, 256(SP)
ld r16, 264(SP)
ld r15, 272(SP)
ld r14, 280(SP)
addi r11,SP,288
lvx v20, r11,r3
addi r11,r11,16
lvx v21, r11,r3
addi r11,r11,16
lvx v22, r11,r3
addi r11,r11,16
lvx v23, r11,r3
addi r11,r11,16
lvx v24, r11,r3
addi r11,r11,16
lvx v25, r11,r3
addi r11,r11,16
lvx v26, r11,r3
addi r11,r11,16
lvx v27, r11,r3
addi r11,r11,16
lvx v28, r11,r3
addi r11,r11,16
lvx v29, r11,r3
addi r11,r11,16
lvx v30, r11,r3
addi r11,r11,16
lvx v31, r11,r3
li r11,0
ld r14, 0(SP)
ld r15, 8(SP)
ld r16, 16(SP)
ld r17, 24(SP)
ld r18, 32(SP)
ld r19, 40(SP)
ld r20, 48(SP)
ld r21, 56(SP)
ld r22, 64(SP)
ld r23, 72(SP)
ld r24, 80(SP)
ld r25, 88(SP)
ld r26, 96(SP)
ld r27, 104(SP)
ld r28, 112(SP)
ld r29, 120(SP)
ld r30, 128(SP)
ld r31, 136(SP)
addi SP, SP, STACKSIZE
//addi SP, SP, 208

View File

@@ -41,94 +41,98 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro COPY_4x16
lxvd2x vs0, o0, A0
lxvd2x vs8, o0, A1
lxvd2x vs24, o0, A3
lxvd2x vs16, o0, A2
lxvd2x vs1, o0, A1
lxvd2x vs2, o0, A2
lxvd2x vs3, o0, A3
lxvd2x vs1, o16, A0
lxvd2x vs9, o16, A1
lxvd2x vs17, o16, A2
lxvd2x vs25, o16, A3
lxvd2x vs4, o16, A0
lxvd2x vs5, o16, A1
lxvd2x vs6, o16, A2
lxvd2x vs7, o16, A3
lxvd2x vs2, o32, A0
lxvd2x vs10, o32, A1
lxvd2x vs18, o32, A2
lxvd2x vs26, o32, A3
xxpermdi vs32, vs0, vs1, 0
xxpermdi vs33, vs2, vs3, 0
xxpermdi vs34, vs0, vs1, 3
xxpermdi vs35, vs2, vs3, 3
lxvd2x vs3, o48, A0
lxvd2x vs11, o48, A1
lxvd2x vs19, o48, A2
lxvd2x vs27, o48, A3
xxpermdi vs36, vs4, vs5, 0
xxpermdi vs37, vs6, vs7, 0
xxpermdi vs38, vs4, vs5, 3
xxpermdi vs39, vs6, vs7, 3
lxvd2x vs4, o64, A0
lxvd2x vs12, o64, A1
lxvd2x vs20, o64, A2
lxvd2x vs28, o64, A3
lxvd2x vs0, o32, A0
lxvd2x vs1, o32, A1
lxvd2x vs2, o32, A2
lxvd2x vs3, o32, A3
lxvd2x vs5, o80, A0
lxvd2x vs13, o80, A1
lxvd2x vs21, o80, A2
lxvd2x vs29, o80, A3
lxvd2x vs6, o96, A0
lxvd2x vs14, o96, A1
lxvd2x vs22, o96, A2
lxvd2x vs30, o96, A3
lxvd2x vs7, o112, A0
lxvd2x vs15, o112, A1
lxvd2x vs23, o112, A2
lxvd2x vs31, o112, A3
lxvd2x vs4, o48, A0
lxvd2x vs5, o48, A1
lxvd2x vs6, o48, A2
lxvd2x vs7, o48, A3
xxpermdi vs32, vs0, vs8, 0
xxpermdi vs33, vs16, vs24, 0
xxpermdi vs34, vs0, vs8, 3
xxpermdi vs35, vs16, vs24, 3
xxpermdi vs40, vs0, vs1, 0
xxpermdi vs41, vs2, vs3, 0
xxpermdi vs42, vs0, vs1, 3
xxpermdi vs43, vs2, vs3, 3
xxpermdi vs36, vs1, vs9, 0
xxpermdi vs37, vs17, vs25, 0
xxpermdi vs38, vs1, vs9, 3
xxpermdi vs39, vs17, vs25, 3
xxpermdi vs44, vs4, vs5, 0
xxpermdi vs45, vs6, vs7, 0
xxpermdi vs46, vs4, vs5, 3
xxpermdi vs47, vs6, vs7, 3
xxpermdi vs40, vs2, vs10, 0
xxpermdi vs41, vs18, vs26, 0
xxpermdi vs42, vs2, vs10, 3
xxpermdi vs43, vs18, vs26, 3
lxvd2x vs0, o64, A0
lxvd2x vs1, o64, A1
lxvd2x vs2, o64, A2
lxvd2x vs3, o64, A3
xxpermdi vs44, vs3, vs11, 0
xxpermdi vs45, vs19, vs27, 0
xxpermdi vs46, vs3, vs11, 3
xxpermdi vs47, vs19, vs27, 3
lxvd2x vs4, o80, A0
lxvd2x vs5, o80, A1
lxvd2x vs6, o80, A2
lxvd2x vs7, o80, A3
xxpermdi vs48, vs4, vs12, 0
xxpermdi vs49, vs20, vs28, 0
xxpermdi vs50, vs4, vs12, 3
xxpermdi vs51, vs20, vs28, 3
xxpermdi vs52, vs5, vs13, 0
xxpermdi vs53, vs21, vs29, 0
xxpermdi vs54, vs5, vs13, 3
xxpermdi vs55, vs21, vs29, 3
xxpermdi vs48, vs0, vs1, 0
xxpermdi vs49, vs2, vs3, 0
xxpermdi vs50, vs0, vs1, 3
xxpermdi vs51, vs2, vs3, 3
xxpermdi vs8, vs4, vs5, 0
xxpermdi vs9, vs6, vs7, 0
xxpermdi vs10, vs4, vs5, 3
xxpermdi vs11, vs6, vs7, 3
lxvd2x vs0, o96, A0
lxvd2x vs1, o96, A1
lxvd2x vs2, o96, A2
lxvd2x vs3, o96, A3
lxvd2x vs6, o112, A0
lxvd2x vs7, o112, A1
lxvd2x vs12, o112, A2
lxvd2x vs13, o112, A3
xxpermdi vs4, vs0, vs1, 0
xxpermdi vs5, vs2, vs3, 0
xxpermdi vs0, vs0, vs1, 3
xxpermdi vs2, vs2, vs3, 3
addi A0, A0, 128
addi A1, A1, 128
xxpermdi vs56, vs6, vs14, 0
xxpermdi vs57, vs22, vs30, 0
xxpermdi vs58, vs6, vs14, 3
xxpermdi vs59, vs22, vs30, 3
xxpermdi vs1, vs6, vs7, 0
xxpermdi vs3, vs12, vs13, 0
xxpermdi vs6, vs6, vs7, 3
xxpermdi vs12, vs12, vs13, 3
dcbt BO, PREB
addi A3, A3, 128
addi A2, A2, 128
xxpermdi vs60, vs7, vs15, 0
xxpermdi vs61, vs23, vs31, 0
xxpermdi vs62, vs7, vs15, 3
xxpermdi vs63, vs23, vs31, 3
dcbt BO, PREB
stxvd2x vs32, o0, BO
stxvd2x vs33, o16, BO
stxvd2x vs34, o32, BO
@@ -157,22 +161,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvd2x vs49, o16, BO
stxvd2x vs50, o32, BO
stxvd2x vs51, o48, BO
stxvd2x vs52, o64, BO
stxvd2x vs53, o80, BO
stxvd2x vs54, o96, BO
stxvd2x vs55, o112, BO
stxvd2x vs8, o64, BO
stxvd2x vs9, o80, BO
stxvd2x vs10, o96, BO
stxvd2x vs11, o112, BO
addi BO, BO, 128
dcbt BO, PREB
stxvd2x vs56, o0, BO
stxvd2x vs57, o16, BO
stxvd2x vs58, o32, BO
stxvd2x vs59, o48, BO
stxvd2x vs60, o64, BO
stxvd2x vs61, o80, BO
stxvd2x vs62, o96, BO
stxvd2x vs63, o112, BO
stxvd2x vs4, o0, BO
stxvd2x vs5, o16, BO
stxvd2x vs0, o32, BO
stxvd2x vs2, o48, BO
stxvd2x vs1, o64, BO
stxvd2x vs3, o80, BO
stxvd2x vs6, o96, BO
stxvd2x vs12, o112, BO
addi BO, BO, 128
@@ -199,39 +203,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi A1, A1, 64
lxvd2x vs16, o0, A2
lxvd2x vs17, o16, A2
lxvd2x vs18, o32, A2
lxvd2x vs19, o48, A2
lxvd2x vs4, o0, A2
lxvd2x vs5, o16, A2
lxvd2x vs6, o32, A2
lxvd2x vs7, o48, A2
addi A2, A2, 64
lxvd2x vs24, o0, A3
lxvd2x vs25, o16, A3
lxvd2x vs26, o32, A3
lxvd2x vs27, o48, A3
lxvd2x vs12, o0, A3
lxvd2x vs13, o16, A3
lxvd2x vs50, o32, A3
lxvd2x vs51, o48, A3
addi A3, A3, 64
xxpermdi vs32, vs0, vs8, 0
xxpermdi vs33, vs16, vs24, 0
xxpermdi vs33, vs4, vs12, 0
xxpermdi vs34, vs0, vs8, 3
xxpermdi vs35, vs16, vs24, 3
xxpermdi vs35, vs4, vs12, 3
xxpermdi vs36, vs1, vs9, 0
xxpermdi vs37, vs17, vs25, 0
xxpermdi vs37, vs5, vs13, 0
xxpermdi vs38, vs1, vs9, 3
xxpermdi vs39, vs17, vs25, 3
xxpermdi vs39, vs5, vs13, 3
xxpermdi vs40, vs2, vs10, 0
xxpermdi vs41, vs18, vs26, 0
xxpermdi vs41, vs6, vs50, 0
xxpermdi vs42, vs2, vs10, 3
xxpermdi vs43, vs18, vs26, 3
xxpermdi vs43, vs6, vs50, 3
xxpermdi vs44, vs3, vs11, 0
xxpermdi vs45, vs19, vs27, 0
xxpermdi vs45, vs7, vs51, 0
xxpermdi vs46, vs3, vs11, 3
xxpermdi vs47, vs19, vs27, 3
xxpermdi vs47, vs7, vs51, 3
stxvd2x vs32, o0, BO
@@ -274,25 +278,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi A1, A1, 32
lxvd2x vs16, o0, A2
lxvd2x vs17, o16, A2
lxvd2x vs10, o0, A2
lxvd2x vs11, o16, A2
addi A2, A2, 32
lxvd2x vs24, o0, A3
lxvd2x vs25, o16, A3
lxvd2x vs12, o0, A3
lxvd2x vs13, o16, A3
addi A3, A3, 32
xxpermdi vs32, vs0, vs8, 0
xxpermdi vs33, vs16, vs24, 0
xxpermdi vs33, vs10, vs12, 0
xxpermdi vs34, vs0, vs8, 3
xxpermdi vs35, vs16, vs24, 3
xxpermdi vs35, vs10, vs12, 3
xxpermdi vs36, vs1, vs9, 0
xxpermdi vs37, vs17, vs25, 0
xxpermdi vs37, vs11, vs13, 0
xxpermdi vs38, vs1, vs9, 3
xxpermdi vs39, vs17, vs25, 3
xxpermdi vs39, vs11, vs13, 3
stxvd2x vs32, o0, BO
@@ -323,18 +327,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi A1, A1, 16
lxvd2x vs16, o0, A2
lxvd2x vs9, o0, A2
addi A2, A2, 16
lxvd2x vs24, o0, A3
lxvd2x vs10, o0, A3
addi A3, A3, 16
xxpermdi vs32, vs0, vs8, 0
xxpermdi vs33, vs16, vs24, 0
xxpermdi vs33, vs9, vs10, 0
xxpermdi vs34, vs0, vs8, 3
xxpermdi vs35, vs16, vs24, 3
xxpermdi vs35, vs9, vs10, 3
stxvd2x vs32, o0, BO
@@ -361,16 +365,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi A1, A1, 8
lxsdx vs16, o0, A2
lxsdx vs9, o0, A2
addi A2, A2, 8
lxsdx vs24, o0, A3
lxsdx vs10, o0, A3
addi A3, A3, 8
xxpermdi vs32, vs0, vs8, 0
xxpermdi vs33, vs16, vs24, 0
xxpermdi vs33, vs9, vs10, 0
stxvd2x vs32, o0, BO
@@ -404,8 +408,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
lxvd2x vs11, o48, A1
lxvd2x vs12, o64, A1
lxvd2x vs13, o80, A1
lxvd2x vs14, o96, A1
lxvd2x vs15, o112, A1
lxvd2x vs48, o96, A1
lxvd2x vs49, o112, A1
addi A1, A1, 128
@@ -427,11 +431,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xxpermdi vs42, vs5, vs13, 0
xxpermdi vs43, vs5, vs13, 3
xxpermdi vs44, vs6, vs14, 0
xxpermdi vs45, vs6, vs14, 3
xxpermdi vs44, vs6, vs48, 0
xxpermdi vs45, vs6, vs48, 3
xxpermdi vs46, vs7, vs15, 0
xxpermdi vs47, vs7, vs15, 3
xxpermdi vs46, vs7, vs49, 0
xxpermdi vs47, vs7, vs49, 3
stxvd2x vs32, o0, BO

View File

@@ -109,61 +109,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "dgemm_tcopy_macros_16_power8.S"
#define STACKSIZE 384
#define STACKSIZE 576
#define STACKSIZE 144
PROLOGUE
PROFCODE
addi SP, SP, -STACKSIZE
//addi SP, SP, -208
li r0, 0
std r31, 144(SP)
std r30, 152(SP)
std r29, 160(SP)
std r28, 168(SP)
std r27, 176(SP)
std r26, 184(SP)
std r25, 192(SP)
std r24, 200(SP)
std r23, 208(SP)
std r22, 216(SP)
std r21, 224(SP)
std r20, 232(SP)
std r19, 240(SP)
std r18, 248(SP)
std r17, 256(SP)
std r16, 264(SP)
std r15, 272(SP)
std r14, 280(SP)
addi r11,SP,288
stvx v20, r11,r0
addi r11,r11,16
stvx v21, r11,r0
addi r11,r11,16
stvx v22, r11,r0
addi r11,r11,16
stvx v23, r11,r0
addi r11,r11,16
stvx v24, r11,r0
addi r11,r11,16
stvx v25, r11,r0
addi r11,r11,16
stvx v26, r11,r0
addi r11,r11,16
stvx v27, r11,r0
addi r11,r11,16
stvx v28, r11,r0
addi r11,r11,16
stvx v29, r11,r0
addi r11,r11,16
stvx v30, r11,r0
addi r11,r11,16
stvx v31, r11,r0
li r11,0
std r14,0(SP)
std r15,8(SP)
std r16,16(SP)
std r17,24(SP)
std r18,32(SP)
std r19,40(SP)
std r20,48(SP)
std r21,56(SP)
std r22,64(SP)
std r23,72(SP)
std r24,80(SP)
std r25,88(SP)
std r26,96(SP)
std r27,104(SP)
std r28,112(SP)
std r29,120(SP)
std r30,128(SP)
std r31,136(SP)
cmpwi cr0, M, 0
ble- L999
@@ -198,8 +172,7 @@ li r11,0
add B2, B2, B
add B1, B1, B
//li PREA, 384
li PREA, 576
li PREA, 384
addi PREB, M16, 128
li o8, 8
@@ -213,52 +186,27 @@ L999:
li r3, 0
ld r31, 144(SP)
ld r30, 152(SP)
ld r29, 160(SP)
ld r28, 168(SP)
ld r27, 176(SP)
ld r26, 184(SP)
ld r25, 192(SP)
ld r24, 200(SP)
ld r23, 208(SP)
ld r22, 216(SP)
ld r21, 224(SP)
ld r20, 232(SP)
ld r19, 240(SP)
ld r18, 248(SP)
ld r17, 256(SP)
ld r16, 264(SP)
ld r15, 272(SP)
ld r14, 280(SP)
addi r11,SP,288
lvx v20, r11,r3
addi r11,r11,16
lvx v21, r11,r3
addi r11,r11,16
lvx v22, r11,r3
addi r11,r11,16
lvx v23, r11,r3
addi r11,r11,16
lvx v24, r11,r3
addi r11,r11,16
lvx v25, r11,r3
addi r11,r11,16
lvx v26, r11,r3
addi r11,r11,16
lvx v27, r11,r3
addi r11,r11,16
lvx v28, r11,r3
addi r11,r11,16
lvx v29, r11,r3
addi r11,r11,16
lvx v30, r11,r3
addi r11,r11,16
lvx v31, r11,r3
li r11,0
ld r14,0(SP)
ld r15,8(SP)
ld r16,16(SP)
ld r17,24(SP)
ld r18,32(SP)
ld r19,40(SP)
ld r20,48(SP)
ld r21,56(SP)
ld r22,64(SP)
ld r23,72(SP)
ld r24,80(SP)
ld r25,88(SP)
ld r26,96(SP)
ld r27,104(SP)
ld r28,112(SP)
ld r29,120(SP)
ld r30,128(SP)
ld r31,136(SP)
addi SP, SP, STACKSIZE
//addi SP, SP, 208
blr
EPILOGUE

View File

@@ -58,10 +58,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
lxvd2x vs51, o48, A2
addi A2, A2, 64
lxvd2x vs56, o0, A3
lxvd2x vs57, o16, A3
lxvd2x vs58, o32, A3
lxvd2x vs59, o48, A3
lxvd2x vs4, o0, A3
lxvd2x vs5, o16, A3
lxvd2x vs6, o32, A3
lxvd2x vs7, o48, A3
addi A3, A3, 64
lxvd2x vs36, o0, A0
@@ -76,16 +76,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
lxvd2x vs47, o48, A1
addi A1, A1, 64
lxvd2x vs52, o0, A2
lxvd2x vs53, o16, A2
lxvd2x vs54, o32, A2
lxvd2x vs55, o48, A2
lxvd2x vs12, o0, A2
lxvd2x vs13, o16, A2
lxvd2x vs2, o32, A2
lxvd2x vs3, o48, A2
addi A2, A2, 64
lxvd2x vs60, o0, A3
lxvd2x vs61, o16, A3
lxvd2x vs62, o32, A3
lxvd2x vs63, o48, A3
lxvd2x vs8, o0, A3
lxvd2x vs9, o16, A3
lxvd2x vs10, o32, A3
lxvd2x vs11, o48, A3
addi A3, A3, 64
mr T1, BO
@@ -122,23 +122,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvd2x vs51, o48, T1
addi T1, T1, 64
stxvd2x vs52, o0, T1
stxvd2x vs53, o16, T1
stxvd2x vs54, o32, T1
stxvd2x vs55, o48, T1
stxvd2x vs12, o0, T1
stxvd2x vs13, o16, T1
stxvd2x vs2, o32, T1
stxvd2x vs3, o48, T1
addi T1, T1, 64
stxvd2x vs56, o0, T1
stxvd2x vs57, o16, T1
stxvd2x vs58, o32, T1
stxvd2x vs59, o48, T1
stxvd2x vs4, o0, T1
stxvd2x vs5, o16, T1
stxvd2x vs6, o32, T1
stxvd2x vs7, o48, T1
addi T1, T1, 64
stxvd2x vs60, o0, T1
stxvd2x vs61, o16, T1
stxvd2x vs62, o32, T1
stxvd2x vs63, o48, T1
stxvd2x vs8, o0, T1
stxvd2x vs9, o16, T1
stxvd2x vs10, o32, T1
stxvd2x vs11, o48, T1
.endm

886
kernel/power/dgemv_t.c Normal file
View File

@@ -0,0 +1,886 @@
/***************************************************************************
Copyright (c) 2018, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#define NBMAX 8192
#define PREFETCH 1
#include <altivec.h>
#define HAVE_KERNEL4x8_ASM 1
#if defined(HAVE_KERNEL4x8_ASM)
/*
 * Inline-assembly (VSX) kernel: for eight consecutive columns of ap, spaced
 * lda doubles apart, accumulate the dot product of the column with x and then
 * update y[k] += alpha * dot(column k, x) for k = 0..7.
 *
 * n     number of rows to process; the loop consumes 4 rows per step
 *       (presumably callers pass a positive multiple of 4 - confirm)
 * lda   distance between consecutive columns, in doubles
 * ap    pointer to the first of the eight columns
 * x     input vector, read contiguously
 * y     eight contiguous doubles that are read, updated and stored back
 * alpha scalar applied to every dot product
 */
static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, double *y, double alpha) {
FLOAT *a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7;
BLASLONG off2;
BLASLONG tempR;
__asm__(
// Build the column pointers a1..a7 from a0 and lda, and clear the eight
// accumulators (vs34, vs35, vs4..vs9).
"sldi %[temp],%[off], 4 \n\t" // lda * sizeof (double) *2
"sldi %[off], %[off], 3 \n\t" // lda * sizeof (double)
"xxlxor 34,34,34 \n\t"
"xxlxor 35,34,34 \n\t"
"add %[a2], %[a0], %[temp] \n\t"
"add %[a1], %[a0], %[off] \n\t"
"xxlxor 4,34,34 \n\t"
"xxlxor 5,34,34 \n\t"
"xxlxor 6,34,34 \n\t"
"xxlxor 7,34,34 \n\t"
"add %[a3], %[a2], %[off] \n\t"
"add %[a4], %[a2], %[temp] \n\t"
"xxlxor 8,34,34 \n\t"
"xxlxor 9,34,34 \n\t"
"add %[a5], %[a3], %[temp] \n\t"
// From here on %[off] and %[off2] are byte offsets into x and the columns.
"li %[off],0 \n\t"
"li %[off2],16 \n\t"
"add %[a6], %[a4], %[temp] \n\t"
"add %[a7], %[a5], %[temp] \n\t"
// Preload the first four rows: x into vs32/vs33, the columns into vs36..vs51.
"lxvd2x 32, %[x], %[off] \n\t"
"lxvd2x 36, %[a0], %[off] \n\t"
"lxvd2x 38, %[a1], %[off] \n\t"
"lxvd2x 40, %[a2], %[off] \n\t"
"lxvd2x 42, %[a3], %[off] \n\t"
"lxvd2x 44, %[a4], %[off] \n\t"
"lxvd2x 46, %[a5], %[off] \n\t"
"lxvd2x 48, %[a6], %[off] \n\t"
"lxvd2x 50, %[a7], %[off] \n\t"
"lxvd2x 33, %[x], %[off2] \n\t"
"lxvd2x 37, %[a0], %[off2] \n\t"
"lxvd2x 39, %[a1], %[off2] \n\t"
"lxvd2x 41, %[a2], %[off2] \n\t"
"lxvd2x 43, %[a3], %[off2] \n\t"
"lxvd2x 45, %[a4], %[off2] \n\t"
"lxvd2x 47, %[a5], %[off2] \n\t"
"lxvd2x 49, %[a6], %[off2] \n\t"
"lxvd2x 51, %[a7], %[off2] \n\t"
// %[temp] is reused as the byte offset handed to the dcbt prefetch hints.
#if defined(PREFETCH)
"li %[temp],896 \n\t"
#endif
"addic. %[n],%[n],-4 \n\t"
"li %[off],32 \n\t"
"ble- 2f \n\t"
//--------------------------------------------------
// Main loop, unrolled four times. Each step multiplies the four rows that
// are already in registers, loads the next four rows, decrements n by 4 and
// leaves to label 2 once n is exhausted.
".p2align 5 \n\t"
"1: \n\t"
"xvmaddadp 34,36,32 \n\t"
"xvmaddadp 35,38,32 \n\t"
"addi %[off2], %[off2],32 \n\t"
"lxvd2x 36, %[a0], %[off] \n\t"
"lxvd2x 38, %[a1], %[off] \n\t"
"xvmaddadp 4,40,32 \n\t"
"xvmaddadp 5,42,32 \n\t"
"lxvd2x 40, %[a2], %[off] \n\t"
"lxvd2x 42, %[a3], %[off] \n\t"
"xvmaddadp 6,44,32 \n\t"
"xvmaddadp 7,46,32 \n\t"
"lxvd2x 44, %[a4], %[off] \n\t"
"lxvd2x 46, %[a5], %[off] \n\t"
"xvmaddadp 8,48,32 \n\t"
"xvmaddadp 9,50,32 \n\t"
"lxvd2x 48, %[a6], %[off] \n\t"
"lxvd2x 50, %[a7], %[off] \n\t"
"lxvd2x 32, %[x], %[off] \n\t"
"xvmaddadp 34,37,33 \n\t"
"xvmaddadp 35,39,33 \n\t"
"lxvd2x 37, %[a0], %[off2] \n\t"
"lxvd2x 39, %[a1], %[off2] \n\t"
"xvmaddadp 4,41,33 \n\t"
"xvmaddadp 5,43,33 \n\t"
"addi %[off], %[off],32 \n\t"
"lxvd2x 41, %[a2], %[off2] \n\t"
"lxvd2x 43, %[a3], %[off2] \n\t"
"xvmaddadp 6,45,33 \n\t"
"xvmaddadp 7,47,33 \n\t"
"lxvd2x 45, %[a4], %[off2] \n\t"
"lxvd2x 47, %[a5], %[off2] \n\t"
"xvmaddadp 8,49,33 \n\t"
"xvmaddadp 9,51,33 \n\t"
"addic. %[n],%[n],-4 \n\t"
"lxvd2x 49, %[a6], %[off2] \n\t"
"lxvd2x 51, %[a7], %[off2] \n\t"
"lxvd2x 33, %[x], %[off2] \n\t"
"ble- 2f \n\t"
// Second unrolled step.
"xvmaddadp 34,36,32 \n\t"
"xvmaddadp 35,38,32 \n\t"
"addi %[off2], %[off2],32 \n\t"
"lxvd2x 36, %[a0], %[off] \n\t"
"lxvd2x 38, %[a1], %[off] \n\t"
"xvmaddadp 4,40,32 \n\t"
"xvmaddadp 5,42,32 \n\t"
"lxvd2x 40, %[a2], %[off] \n\t"
"lxvd2x 42, %[a3], %[off] \n\t"
"xvmaddadp 6,44,32 \n\t"
"xvmaddadp 7,46,32 \n\t"
"lxvd2x 44, %[a4], %[off] \n\t"
"lxvd2x 46, %[a5], %[off] \n\t"
"xvmaddadp 8,48,32 \n\t"
"xvmaddadp 9,50,32 \n\t"
"lxvd2x 48, %[a6], %[off] \n\t"
"lxvd2x 50, %[a7], %[off] \n\t"
"lxvd2x 32, %[x], %[off] \n\t"
"xvmaddadp 34,37,33 \n\t"
"xvmaddadp 35,39,33 \n\t"
"lxvd2x 37, %[a0], %[off2] \n\t"
"lxvd2x 39, %[a1], %[off2] \n\t"
"xvmaddadp 4,41,33 \n\t"
"xvmaddadp 5,43,33 \n\t"
"addi %[off], %[off],32 \n\t"
"lxvd2x 41, %[a2], %[off2] \n\t"
"lxvd2x 43, %[a3], %[off2] \n\t"
"xvmaddadp 6,45,33 \n\t"
"xvmaddadp 7,47,33 \n\t"
"lxvd2x 45, %[a4], %[off2] \n\t"
"lxvd2x 47, %[a5], %[off2] \n\t"
"xvmaddadp 8,49,33 \n\t"
"xvmaddadp 9,51,33 \n\t"
"addic. %[n],%[n],-4 \n\t"
"lxvd2x 49, %[a6], %[off2] \n\t"
"lxvd2x 51, %[a7], %[off2] \n\t"
"lxvd2x 33, %[x], %[off2] \n\t"
"ble- 2f \n\t"
// Third unrolled step; the prefetch offset advances by 128 bytes once per
// full pass and dcbt hints are interleaved with the arithmetic.
"xvmaddadp 34,36,32 \n\t"
"xvmaddadp 35,38,32 \n\t"
#if defined(PREFETCH)
"addi %[temp],%[temp],128 \n\t"
#endif
"addi %[off2], %[off2],32 \n\t"
"lxvd2x 36, %[a0], %[off] \n\t"
"lxvd2x 38, %[a1], %[off] \n\t"
"xvmaddadp 4,40,32 \n\t"
"xvmaddadp 5,42,32 \n\t"
"lxvd2x 40, %[a2], %[off] \n\t"
"lxvd2x 42, %[a3], %[off] \n\t"
"xvmaddadp 6,44,32 \n\t"
"xvmaddadp 7,46,32 \n\t"
"lxvd2x 44, %[a4], %[off] \n\t"
"lxvd2x 46, %[a5], %[off] \n\t"
"xvmaddadp 8,48,32 \n\t"
"xvmaddadp 9,50,32 \n\t"
"lxvd2x 48, %[a6], %[off] \n\t"
"lxvd2x 50, %[a7], %[off] \n\t"
#if defined(PREFETCH)
"dcbt %[temp],%[a0] \n\t"
#endif
"lxvd2x 32, %[x], %[off] \n\t"
"xvmaddadp 34,37,33 \n\t"
"xvmaddadp 35,39,33 \n\t"
"lxvd2x 37, %[a0], %[off2] \n\t"
"lxvd2x 39, %[a1], %[off2] \n\t"
"xvmaddadp 4,41,33 \n\t"
"xvmaddadp 5,43,33 \n\t"
#if defined(PREFETCH)
"dcbt %[temp],%[a1] \n\t"
#endif
"lxvd2x 41, %[a2], %[off2] \n\t"
"addi %[off], %[off],32 \n\t"
"lxvd2x 43, %[a3], %[off2] \n\t"
"xvmaddadp 6,45,33 \n\t"
"xvmaddadp 7,47,33 \n\t"
"lxvd2x 45, %[a4], %[off2] \n\t"
"lxvd2x 47, %[a5], %[off2] \n\t"
"xvmaddadp 8,49,33 \n\t"
"xvmaddadp 9,51,33 \n\t"
#if defined(PREFETCH)
"dcbt %[temp],%[a3] \n\t"
#endif
"lxvd2x 49, %[a6], %[off2] \n\t"
"lxvd2x 51, %[a7], %[off2] \n\t"
"lxvd2x 33, %[x], %[off2] \n\t"
"addic. %[n],%[n],-4 \n\t"
"ble- 2f \n\t"
// Fourth unrolled step; falls through to label 2 when n runs out, otherwise
// branches back to label 1.
"addi %[off2], %[off2],32 \n\t"
#if defined(PREFETCH)
"dcbt %[temp],%[a2] \n\t"
#endif
"xvmaddadp 34,36,32 \n\t"
"xvmaddadp 35,38,32 \n\t"
"lxvd2x 36, %[a0], %[off] \n\t"
"lxvd2x 38, %[a1], %[off] \n\t"
"xvmaddadp 4,40,32 \n\t"
"xvmaddadp 5,42,32 \n\t"
"lxvd2x 40, %[a2], %[off] \n\t"
"lxvd2x 42, %[a3], %[off] \n\t"
#if defined(PREFETCH)
"dcbt %[temp],%[a4] \n\t"
#endif
"xvmaddadp 6,44,32 \n\t"
"xvmaddadp 7,46,32 \n\t"
"lxvd2x 44, %[a4], %[off] \n\t"
"lxvd2x 46, %[a5], %[off] \n\t"
"xvmaddadp 8,48,32 \n\t"
"xvmaddadp 9,50,32 \n\t"
"lxvd2x 48, %[a6], %[off] \n\t"
"lxvd2x 50, %[a7], %[off] \n\t"
"lxvd2x 32, %[x], %[off] \n\t"
#if defined(PREFETCH)
"dcbt %[temp],%[a5] \n\t"
#endif
"xvmaddadp 34,37,33 \n\t"
"xvmaddadp 35,39,33 \n\t"
"lxvd2x 37, %[a0], %[off2] \n\t"
"lxvd2x 39, %[a1], %[off2] \n\t"
"xvmaddadp 4,41,33 \n\t"
"xvmaddadp 5,43,33 \n\t"
"addi %[off], %[off],32 \n\t"
"lxvd2x 41, %[a2], %[off2] \n\t"
"lxvd2x 43, %[a3], %[off2] \n\t"
#if defined(PREFETCH)
"dcbt %[temp],%[a6] \n\t"
#endif
"xvmaddadp 6,45,33 \n\t"
"xvmaddadp 7,47,33 \n\t"
"lxvd2x 45, %[a4], %[off2] \n\t"
"lxvd2x 47, %[a5], %[off2] \n\t"
"xvmaddadp 8,49,33 \n\t"
"xvmaddadp 9,51,33 \n\t"
#if defined(PREFETCH)
"dcbt %[temp],%[a7] \n\t"
#endif
"lxvd2x 49, %[a6], %[off2] \n\t"
"addic. %[n],%[n],-4 \n\t"
"lxvd2x 51, %[a7], %[off2] \n\t"
"lxvd2x 33, %[x], %[off2] \n\t"
#if defined(PREFETCH)
"dcbt %[temp],%[x] \n\t"
#endif
"bgt+ 1b \n\t"
".p2align 5 \n\t"
"2: \n\t"
//--------------------------------------------
// Tail: consume the last four preloaded rows; vs36 is then reused to hold
// alpha in both lanes.
"xvmaddadp 34,36,32 \n\t"
"xvmaddadp 35,38,32 \n\t"
"xvmaddadp 4,40,32 \n\t"
"xvmaddadp 5,42,32 \n\t"
"xvmaddadp 6,44,32 \n\t"
"xvmaddadp 7,46,32 \n\t"
"xvmaddadp 8,48,32 \n\t"
"xvmaddadp 9,50,32 \n\t"
"xxspltd 36, %x[alpha], 0 \n\t"
"xvmaddadp 34,37,33 \n\t"
"xvmaddadp 35,39,33 \n\t"
"xvmaddadp 4,41,33 \n\t"
"xvmaddadp 5,43,33 \n\t"
"xvmaddadp 6,45,33 \n\t"
"xvmaddadp 7,47,33 \n\t"
"xvmaddadp 8,49,33 \n\t"
"xvmaddadp 9,51,33 \n\t"
// Load y[0..7] into vs37..vs40, add the two lanes of each accumulator
// (merge low/high halves of a pair, then add), and store y + alpha * sums.
"lxvd2x 37, 0, %[y] \n\t"
"li %[off2],16 \n\t"
"lxvd2x 38, %[off2], %[y] \n\t"
"li %[off2],32 \n\t"
"lxvd2x 39, %[off2], %[y] \n\t"
"li %[off2],48 \n\t"
"lxvd2x 40, %[off2], %[y] \n\t"
"xxmrgld 42,34,35 \n\t"
"xxmrghd 43,34,35 \n\t"
"xxmrgld 44,4,5 \n\t"
"xxmrghd 45,4,5 \n\t"
"xvadddp 42,42,43 \n\t"
"xxmrgld 46,6,7 \n\t"
"xxmrghd 47,6,7 \n\t"
"xvadddp 44,44,45 \n\t"
"xxmrgld 48,8,9 \n\t"
"xxmrghd 49,8,9 \n\t"
"xvadddp 46,46,47 \n\t"
"xvmaddadp 37,42,36 \n\t"
"xvmaddadp 38,44,36 \n\t"
"xvadddp 48,48,49 \n\t"
"xvmaddadp 39,46,36 \n\t"
"stxvd2x 37, 0, %[y] \n\t"
"li %[off],16 \n\t"
"stxvd2x 38, %[off], %[y] \n\t"
"xvmaddadp 40,48,36 \n\t"
"li %[off],32 \n\t"
"stxvd2x 39, %[off], %[y] \n\t"
"stxvd2x 40, %[off2], %[y] \n\t"
// Outputs: the eight doubles at y (as a memory operand), the loop counter,
// the column pointers and the scratch offset registers.
// NOTE(review): [memy] is written by the asm but is cast through a
// const-qualified array type - confirm this is intended.
: [memy] "+m" (*(const double (*)[8])y),
[n] "+&r" (n),
[a0] "=b" (a0),
[a1] "=&b" (a1),
[a2] "=&b" (a2),
[a3] "=&b" (a3),
[a4] "=&b" (a4),
[a5] "=&b" (a5),
[a6] "=&b" (a6),
[a7] "=&b" (a7),
[off] "+&b" (lda),
[off2]"=&b" (off2),
[temp] "=&b" (tempR)
// Inputs: x and ap are also passed as memory operands; ap is tied to the
// [a0] output register.
: [memx] "m" (*(const double (*)[n])x),
[mem_ap] "m" (*(const double (*)[]) ap),
[alpha] "d" (alpha),
"[a0]" (ap),
[x] "b" (x),
[y] "b" (y)
: "cc","vs4","vs5","vs6","vs7","vs8","vs9" ,"vs32","vs33","vs34","vs35", "vs36", "vs37", "vs38", "vs39",
"vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49", "vs50", "vs51"
);
return;
}
#else
/*
 * C fallback (GCC vector extensions) for the 8-column kernel: for eight
 * consecutive columns of ap, spaced lda elements apart, accumulate the dot
 * product of the column with x and update y[k] += alpha * dot(column k, x)
 * for k = 0..7.
 *
 * n     number of rows; each inner-loop pass consumes two 2-double vectors,
 *       i.e. 4 rows
 * lda   distance between consecutive columns, in elements
 * ap    first of the eight columns
 * x     input vector, read contiguously
 * y     eight contiguous results, updated in place
 * alpha scalar applied to every dot product
 */
static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) {
BLASLONG i;
#if defined(PREFETCH)
BLASLONG j, c, k;
#endif
FLOAT *a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7;
__vector double *va0, *va1, *va2, *va3, *va4, *va5, *va6, *va7, *v_x;
// One two-lane accumulator per column.
register __vector double temp0 = {0, 0};
register __vector double temp1 = {0, 0};
register __vector double temp2 = {0, 0};
register __vector double temp3 = {0, 0};
register __vector double temp4 = {0, 0};
register __vector double temp5 = {0, 0};
register __vector double temp6 = {0, 0};
register __vector double temp7 = {0, 0};
a0 = ap;
a1 = ap + lda;
a2 = a1 + lda;
a3 = a2 + lda;
a4 = a3 + lda;
a5 = a4 + lda;
a6 = a5 + lda;
a7 = a6 + lda;
// Vector (two doubles per element) views of the columns and of x.
va0 = (__vector double*) a0;
va1 = (__vector double*) a1;
va2 = (__vector double*) a2;
va3 = (__vector double*) a3;
va4 = (__vector double*) a4;
va5 = (__vector double*) a5;
va6 = (__vector double*) a6;
va7 = (__vector double*) a7;
v_x = (__vector double*) x;
// With PREFETCH the n/2 vectors are walked in chunks of up to 64 vectors;
// before each chunk a prefetch is issued 64 vectors ahead on every stream.
// NOTE: the loop braces opened inside this #if/#else are closed further down,
// partly inside another #if defined(PREFETCH) region.
#if defined(PREFETCH)
c = n >> 1;
for (j = 0; j < c; j += 64) {
k = (c - j) > 64 ? 64 : (c - j);
__builtin_prefetch(v_x + 64);
__builtin_prefetch(va0 + 64);
__builtin_prefetch(va1 + 64);
__builtin_prefetch(va2 + 64);
__builtin_prefetch(va3 + 64);
__builtin_prefetch(va4 + 64);
__builtin_prefetch(va5 + 64);
__builtin_prefetch(va6 + 64);
__builtin_prefetch(va7 + 64);
for (i = 0; i < k; i += 2) {
#else
for (i = 0; i < n/2; i += 2) {
#endif
temp0 += v_x[i] * va0[i];
temp1 += v_x[i] * va1[i];
temp2 += v_x[i] * va2[i];
temp3 += v_x[i] * va3[i];
temp4 += v_x[i] * va4[i];
temp5 += v_x[i] * va5[i];
temp6 += v_x[i] * va6[i];
temp7 += v_x[i] * va7[i];
temp0 += v_x[i + 1] * va0[i + 1];
temp1 += v_x[i + 1] * va1[i + 1];
temp2 += v_x[i + 1] * va2[i + 1];
temp3 += v_x[i + 1] * va3[i + 1];
temp4 += v_x[i + 1] * va4[i + 1];
temp5 += v_x[i + 1] * va5[i + 1];
temp6 += v_x[i + 1] * va6[i + 1];
temp7 += v_x[i + 1] * va7[i + 1];
}
// Chunked variant only: step every stream to the next chunk and close the
// outer loop.
#if defined(PREFETCH)
va0 += 64;
va1 += 64;
va2 += 64;
va3 += 64;
va4 += 64;
va5 += 64;
va6 += 64;
va7 += 64;
v_x += 64;
}
#endif
// Sum the two lanes of each accumulator and apply the scaled update.
y[0] += alpha * (temp0[0] + temp0[1]);
y[1] += alpha * (temp1[0] + temp1[1]);
y[2] += alpha * (temp2[0] + temp2[1]);
y[3] += alpha * (temp3[0] + temp3[1]);
y[4] += alpha * (temp4[0] + temp4[1]);
y[5] += alpha * (temp5[0] + temp5[1]);
y[6] += alpha * (temp6[0] + temp6[1]);
y[7] += alpha * (temp7[0] + temp7[1]);
}
#endif
/*
 * Four-column kernel: y[k] += alpha * dot(column k of ap, x) for k = 0..3,
 * where consecutive columns are lda elements apart. Every loop pass consumes
 * two 2-double vectors (4 rows) of x and of each column.
 */
static void dgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) {
    /* Vector (two doubles per element) views of x and of the four columns. */
    __vector double *xv = (__vector double *) x;
    __vector double *col0 = (__vector double *) ap;
    __vector double *col1 = (__vector double *) (ap + lda);
    __vector double *col2 = (__vector double *) (ap + lda + lda);
    __vector double *col3 = (__vector double *) (ap + lda + lda + lda);

    /* Separate accumulators for the first and second vector of each pass. */
    register __vector double even0 = {0, 0};
    register __vector double even1 = {0, 0};
    register __vector double even2 = {0, 0};
    register __vector double even3 = {0, 0};
    register __vector double odd0 = {0, 0};
    register __vector double odd1 = {0, 0};
    register __vector double odd2 = {0, 0};
    register __vector double odd3 = {0, 0};

    BLASLONG k = 0;
    while (k < n / 2) {
        even0 += xv[k] * col0[k];
        even1 += xv[k] * col1[k];
        even2 += xv[k] * col2[k];
        even3 += xv[k] * col3[k];
        odd0 += xv[k + 1] * col0[k + 1];
        odd1 += xv[k + 1] * col1[k + 1];
        odd2 += xv[k + 1] * col2[k + 1];
        odd3 += xv[k + 1] * col3[k + 1];
        k += 2;
    }

    /* Fold the second-vector partial sums into the first-vector ones. */
    even0 += odd0;
    even1 += odd1;
    even2 += odd2;
    even3 += odd3;

    /* Add the two lanes of each accumulator and apply the scaled update. */
    y[0] += alpha * (even0[0] + even0[1]);
    y[1] += alpha * (even1[0] + even1[1]);
    y[2] += alpha * (even2[0] + even2[1]);
    y[3] += alpha * (even3[0] + even3[1]);
}
/*
 * Two-column kernel: adds alpha * dot(column 0, x) to y[0] and
 * alpha * dot(column 1, x) to y[inc_y]; the columns are lda elements apart.
 * Every loop pass consumes two 2-double vectors (4 rows).
 */
static void dgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha, BLASLONG inc_y) {
    __vector double *xv = (__vector double *) x;
    __vector double *col0 = (__vector double *) ap;
    __vector double *col1 = (__vector double *) (ap + lda);
    __vector double sum0 = {0, 0};
    __vector double sum1 = {0, 0};

    BLASLONG k = 0;
    while (k < n / 2) {
        sum0 += xv[k] * col0[k] + xv[k + 1] * col0[k + 1];
        sum1 += xv[k] * col1[k] + xv[k + 1] * col1[k + 1];
        k += 2;
    }

    /* Add the two lanes of each accumulator before scaling by alpha. */
    y[0] += alpha * (sum0[0] + sum0[1]);
    y[inc_y] += alpha * (sum1[0] + sum1[1]);
}
/*
 * Single-column kernel: *y += alpha * dot(ap, x). Every loop pass consumes
 * two 2-double vectors (4 rows) of ap and x.
 */
static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) {
    __vector double *col = (__vector double *) ap;
    __vector double *xv = (__vector double *) x;
    __vector double sum = {0, 0};

    BLASLONG k = 0;
    while (k < n / 2) {
        sum += xv[k] * col[k] + xv[k + 1] * col[k + 1];
        k += 2;
    }

    /* Add the two lanes of the accumulator before scaling by alpha. */
    *y += alpha * (sum[0] + sum[1]);
}
/*
 * Gather n elements from src, stepping inc_src elements between reads, into
 * the contiguous array dest.
 */
static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) {
    BLASLONG remaining = n;

    while (remaining > 0) {
        *dest = *src;
        dest++;
        src += inc_src;
        remaining--;
    }
}
/*
 * Driver for the transposed matrix-vector update
 *     y[j] += alpha * sum_i a[i + j * lda] * x[i],   i < m, j < n.
 *
 * m, n    number of rows (length of x) and columns (length of y) processed
 * dummy1  not used in this function
 * alpha   scalar multiplier
 * a, lda  matrix data; consecutive columns are lda elements apart
 * x,inc_x input vector and its stride
 * y,inc_y output vector and its stride, updated in place
 * buffer  scratch space that receives a contiguous copy of x when inc_x != 1
 *
 * Rows are processed in blocks of at most NBMAX using the vector kernels;
 * the m & 3 leftover rows are handled by scalar code at the end.
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) {
BLASLONG i;
BLASLONG j;
FLOAT *a_ptr;
FLOAT *x_ptr;
FLOAT *y_ptr;
BLASLONG n1;
BLASLONG m1;
BLASLONG m2;
BLASLONG m3;
BLASLONG n2;
FLOAT ybuffer[8], *xbuffer;
if (m < 1) return (0);
if (n < 1) return (0);
xbuffer = buffer;
// n1: number of full 8-column groups, n2: leftover columns (0..7).
n1 = n >> 3;
n2 = n & 7;
// m3: leftover rows (0..3) for the scalar tail, m1: rows for the vector
// kernels, m2: size of the final partial row block.
m3 = m & 3;
m1 = m - m3;
m2 = (m & (NBMAX - 1)) - m3;
BLASLONG NB = NBMAX;
// Each pass handles one block of NB rows across all n columns. The loop
// ends after the first pass whose block is smaller than NBMAX, or as soon
// as no rows remain.
while (NB == NBMAX) {
m1 -= NB;
if (m1 < 0) {
if (m2 == 0) break;
NB = m2;
}
y_ptr = y;
a_ptr = a;
x_ptr = x;
// The kernels read x contiguously: copy a strided x into the scratch
// buffer, otherwise use it in place.
if (inc_x != 1)
copy_x(NB, x_ptr, xbuffer, inc_x);
else
xbuffer = x_ptr;
BLASLONG lda8 = lda << 3;
if (inc_y == 1) {
// Contiguous y: the 8-column kernel updates y directly.
for (i = 0; i < n1; i++) {
dgemv_kernel_4x8(NB, lda, a_ptr, xbuffer, y_ptr, alpha);
y_ptr += 8;
a_ptr += lda8;
#if defined(PREFETCH)
__builtin_prefetch(y_ptr+64);
#endif
}
} else {
// Strided y: accumulate into a zeroed local buffer, then scatter-add.
for (i = 0; i < n1; i++) {
ybuffer[0] = 0;
ybuffer[1] = 0;
ybuffer[2] = 0;
ybuffer[3] = 0;
ybuffer[4] = 0;
ybuffer[5] = 0;
ybuffer[6] = 0;
ybuffer[7] = 0;
dgemv_kernel_4x8(NB, lda, a_ptr, xbuffer, ybuffer, alpha);
*y_ptr += ybuffer[0];
y_ptr += inc_y;
*y_ptr += ybuffer[1];
y_ptr += inc_y;
*y_ptr += ybuffer[2];
y_ptr += inc_y;
*y_ptr += ybuffer[3];
y_ptr += inc_y;
*y_ptr += ybuffer[4];
y_ptr += inc_y;
*y_ptr += ybuffer[5];
y_ptr += inc_y;
*y_ptr += ybuffer[6];
y_ptr += inc_y;
*y_ptr += ybuffer[7];
y_ptr += inc_y;
a_ptr += lda8;
}
}
// Leftover columns: a group of 4, then 2, then 1.
if (n2 & 4) {
ybuffer[0] = 0;
ybuffer[1] = 0;
ybuffer[2] = 0;
ybuffer[3] = 0;
dgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, ybuffer, alpha);
a_ptr += lda<<2;
*y_ptr += ybuffer[0];
y_ptr += inc_y;
*y_ptr += ybuffer[1];
y_ptr += inc_y;
*y_ptr += ybuffer[2];
y_ptr += inc_y;
*y_ptr += ybuffer[3];
y_ptr += inc_y;
}
if (n2 & 2) {
dgemv_kernel_4x2(NB, lda, a_ptr, xbuffer, y_ptr, alpha, inc_y);
a_ptr += lda << 1;
y_ptr += 2 * inc_y;
}
if (n2 & 1) {
dgemv_kernel_4x1(NB, a_ptr, xbuffer, y_ptr, alpha);
a_ptr += lda;
y_ptr += inc_y;
}
// Advance to the next block of rows.
a += NB;
x += NB * inc_x;
}
if (m3 == 0) return (0);
// Scalar tail for the last m3 (1..3) rows: pre-scale the remaining x
// entries by alpha and add their contribution to every y[j].
x_ptr = x;
a_ptr = a;
if (m3 == 3) {
FLOAT xtemp0 = *x_ptr * alpha;
x_ptr += inc_x;
FLOAT xtemp1 = *x_ptr * alpha;
x_ptr += inc_x;
FLOAT xtemp2 = *x_ptr * alpha;
FLOAT *aj = a_ptr;
y_ptr = y;
// Special case: columns packed back to back (lda == 3) with contiguous y.
if (lda == 3 && inc_y == 1) {
for (j = 0; j < (n & -4); j += 4) {
y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2;
y_ptr[j + 1] += aj[3] * xtemp0 + aj[4] * xtemp1 + aj[5] * xtemp2;
y_ptr[j + 2] += aj[6] * xtemp0 + aj[7] * xtemp1 + aj[8] * xtemp2;
y_ptr[j + 3] += aj[9] * xtemp0 + aj[10] * xtemp1 + aj[11] * xtemp2;
aj += 12;
}
for (; j < n; j++) {
y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2;
aj += 3;
}
} else {
if (inc_y == 1) {
// General lda, contiguous y: four columns per pass, then the rest.
BLASLONG register lda2 = lda << 1;
BLASLONG register lda4 = lda << 2;
BLASLONG register lda3 = lda2 + lda;
for (j = 0; j < (n & -4); j += 4) {
y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2;
y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1 + *(aj + lda + 2) * xtemp2;
y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1 + *(aj + lda2 + 2) * xtemp2;
y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1 + *(aj + lda3 + 2) * xtemp2;
aj += lda4;
}
for (; j < n; j++) {
y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2;
aj += lda;
}
} else {
// Strided y: one column per pass.
for (j = 0; j < n; j++) {
*y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2;
y_ptr += inc_y;
aj += lda;
}
}
}
return (0);
}
if (m3 == 2) {
FLOAT xtemp0 = *x_ptr * alpha;
x_ptr += inc_x;
FLOAT xtemp1 = *x_ptr * alpha;
FLOAT *aj = a_ptr;
y_ptr = y;
// Special case: columns packed back to back (lda == 2) with contiguous y.
if (lda == 2 && inc_y == 1) {
for (j = 0; j < (n & -4); j += 4) {
y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1;
y_ptr[j + 1] += aj[2] * xtemp0 + aj[3] * xtemp1;
y_ptr[j + 2] += aj[4] * xtemp0 + aj[5] * xtemp1;
y_ptr[j + 3] += aj[6] * xtemp0 + aj[7] * xtemp1;
aj += 8;
}
for (; j < n; j++) {
y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1;
aj += 2;
}
} else {
if (inc_y == 1) {
BLASLONG register lda2 = lda << 1;
BLASLONG register lda4 = lda << 2;
BLASLONG register lda3 = lda2 + lda;
for (j = 0; j < (n & -4); j += 4) {
y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1;
y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1;
y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1;
y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1;
aj += lda4;
}
for (; j < n; j++) {
y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1;
aj += lda;
}
} else {
for (j = 0; j < n; j++) {
*y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1;
y_ptr += inc_y;
aj += lda;
}
}
}
return (0);
}
// m3 == 1: a single leftover row.
FLOAT xtemp = *x_ptr * alpha;
FLOAT *aj = a_ptr;
y_ptr = y;
if (lda == 1 && inc_y == 1) {
for (j = 0; j < (n & -4); j += 4) {
y_ptr[j] += aj[j] * xtemp;
y_ptr[j + 1] += aj[j + 1] * xtemp;
y_ptr[j + 2] += aj[j + 2] * xtemp;
y_ptr[j + 3] += aj[j + 3] * xtemp;
}
for (; j < n; j++) {
y_ptr[j] += aj[j] * xtemp;
}
} else {
if (inc_y == 1) {
BLASLONG register lda2 = lda << 1;
BLASLONG register lda4 = lda << 2;
BLASLONG register lda3 = lda2 + lda;
for (j = 0; j < (n & -4); j += 4) {
y_ptr[j] += *aj * xtemp;
y_ptr[j + 1] += *(aj + lda) * xtemp;
y_ptr[j + 2] += *(aj + lda2) * xtemp;
y_ptr[j + 3] += *(aj + lda3) * xtemp;
aj += lda4;
}
for (; j < n; j++) {
y_ptr[j] += *aj * xtemp;
aj += lda;
}
} else {
for (j = 0; j < n; j++) {
*y_ptr += *aj * xtemp;
y_ptr += inc_y;
aj += lda;
}
}
}
return (0);
}

383
kernel/power/idamax.c Normal file
View File

@@ -0,0 +1,383 @@
/***************************************************************************
Copyright (c) 2013-2018, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#include <math.h>
#include <altivec.h>
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
/**
* Find the index of the element with the largest absolute value, using
* VSX inline assembly. Sixteen doubles are held in registers at a time and
* the counter n is decremented by 32 per loop pass.
* Warning: requirements n>0 and n % 32 == 0
* @param n number of elements to scan
* @param x pointer to the vector
* @param maxf (out) maximum absolute value .( only for output )
* @return index of the maximum (the caller in this file adds 1 to it)
*/
static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) {
BLASLONG index;
// Index pair for the first loaded vector and the per-vector index increment.
register __vector long long start = {1,0};
register __vector long long temp_add_index = {2, 2};
__asm__(
// Load the first 16 doubles (8 vectors) into vs44..vs51.
"lxvd2x 44, 0,%[ptr_tmp] \n\t"
"lxvd2x 45, %[i16],%[ptr_tmp] \n\t"
"lxvd2x 46, %[i32],%[ptr_tmp] \n\t"
"lxvd2x 47, %[i48],%[ptr_tmp] \n\t"
"lxvd2x 48, %[i64],%[ptr_tmp] \n\t"
"lxvd2x 49, %[i80],%[ptr_tmp] \n\t"
"lxvd2x 50, %[i96],%[ptr_tmp] \n\t"
"lxvd2x 51,%[i112],%[ptr_tmp] \n\t"
// Build the index vectors vs40..vs43, the {8,8} constant in vs36, and zero
// the running index base (vs37), best index (vs38) and best value (vs39).
"xxlor 40,%x[start],%x[start] \n\t" //{ 1,0} vs40 | v8
"vaddudm 9,8,%[adder] \n\t" //{3,2} vs41
"xxlxor 37,37 ,37 \n\t" //v5 v37 index_count
"vaddudm 10,9,%[adder] \n\t" //{5,4} vs42
"xxlxor 38 ,38 ,38 \n\t" // v6 | vs38 vec_max_index
"vaddudm 11,10,%[adder] \n\t" //{7,6} vs43
"xxlxor 39,39,39 \n\t" // vs39 vec_max_value
"vaddudm 4,11, %[adder] \n\t" // {9,8} -{8;8} vs36 | v4
"xxspltd 36,36,0 \n\t"
// Take absolute values of the loaded elements.
"xvabsdp 44, 44 \n\t"
"xvabsdp 45, 45 \n\t"
"xvabsdp 46, 46 \n\t"
"xvabsdp 47, 47 \n\t"
"xvabsdp 48, 48 \n\t"
"xvabsdp 49, 49 \n\t"
"xvabsdp 50, 50 \n\t"
"xvabsdp 51, 51 \n\t"
//jump first half forward
"b 2f \n\t"
//===================================================================
// First half of the loop body: reduce the 16 values currently in registers
// (pairwise compare/select of values and their indices), merge the winner
// into vs38/vs39, and load + abs the next 16 values.
".p2align 5 \n\t"
"1: \n\t"
"xvcmpgtdp 2,45,44 \n\t "
"xvcmpgtdp 3,47,46 \n\t "
"xvcmpgtdp 4,49,48 \n\t "
"xvcmpgtdp 5,51,50 \n\t"
"xxsel 32,40,41,2 \n\t"
"xxsel 0,44,45,2 \n\t"
"xxsel 33,42,43,3 \n\t"
"xxsel 1,46,47,3 \n\t"
"xxsel 34,40,41,4 \n\t"
"xxsel 45,48,49,4 \n\t"
"xxsel 35,42,43,5 \n\t"
"xxsel 47,50,51,5 \n\t"
"xvcmpgtdp 2, 1,0 \n\t"
"xvcmpgtdp 3,47, 45 \n\t"
"addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t"
"xxsel 32,32,33,2 \n\t"
"xxsel 0 ,0,1,2 \n\t"
"xxsel 34,34,35,3 \n\t"
"xxsel 5,45,47,3 \n\t"
//load next 64
"lxvd2x 44, 0,%[ptr_tmp] \n\t"
"lxvd2x 45, %[i16],%[ptr_tmp] \n\t"
// for {second 8 elements } we have to add 8 to each so that it became {from 8 to 16}
"vaddudm 2,2,4 \n\t" // vs34=vs34 + vs36{8,8}
"lxvd2x 46, %[i32],%[ptr_tmp] \n\t"
"lxvd2x 47, %[i48],%[ptr_tmp] \n\t"
//choose bigger from first and second part
"xvcmpgtdp 4,5 , 0 \n\t"
"xxsel 3, 0,5,4 \n\t"
"xxsel 33,32,34,4 \n\t"
//load next 64
"lxvd2x 48, %[i64],%[ptr_tmp] \n\t"
"lxvd2x 49, %[i80],%[ptr_tmp] \n\t"
"vaddudm 1,1,5 \n\t" // get real index for first bigger
"lxvd2x 50, %[i96],%[ptr_tmp] \n\t"
"lxvd2x 51,%[i112],%[ptr_tmp] \n\t"
//compare with previous to get vec_max_index(v6 | vs38 ) and vec_max_value (vs39)
"xvcmpgtdp 2, 3,39 \n\t"
"xxsel 39,39,3,2 \n\t"
"xxsel 38,38,33,2 \n\t"
//update index += 8
"vaddudm 5,5,4 \n\t"
"xvabsdp 44, 44 \n\t"
"xvabsdp 45, 45 \n\t"
"xvabsdp 46, 46 \n\t"
"xvabsdp 47, 47 \n\t"
//update index += 8
"vaddudm 5,5,4 \n\t"
"xvabsdp 48, 48 \n\t"
"xvabsdp 49, 49 \n\t"
"xvabsdp 50, 50 \n\t"
"xvabsdp 51, 51 \n\t"
//<-----------jump here from first load
// Second half of the loop body: same reduction for the next 16 values,
// followed by the counter update and the loop branch.
"2: \n\t"
"xvcmpgtdp 2,45,44 \n\t "
"xvcmpgtdp 3,47,46 \n\t "
"xvcmpgtdp 4,49,48 \n\t "
"xvcmpgtdp 5,51,50 \n\t"
"xxsel 32,40,41,2 \n\t"
"xxsel 0,44,45,2 \n\t"
"xxsel 33,42,43,3 \n\t"
"xxsel 1,46,47,3 \n\t"
"xxsel 34,40,41,4 \n\t"
"xxsel 45,48,49,4 \n\t"
"xxsel 35,42,43,5 \n\t"
"xxsel 47,50,51,5 \n\t"
"xvcmpgtdp 2, 1,0 \n\t"
"xvcmpgtdp 3,47, 45 \n\t"
"xxsel 32,32,33,2 \n\t"
"xxsel 0 ,0,1,2 \n\t"
"xxsel 34,34,35,3 \n\t"
"xxsel 5,45,47,3 \n\t"
"addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t"
// for {second 8 elements } we have to add 8 to each so that it became {from 8 to 16}
"vaddudm 2,2,4 \n\t" // vs34=vs34 + vs36{8,8}
//load next 64
"lxvd2x 44, 0,%[ptr_tmp] \n\t"
"lxvd2x 45, %[i16],%[ptr_tmp] \n\t"
"lxvd2x 46, %[i32],%[ptr_tmp] \n\t"
"lxvd2x 47, %[i48],%[ptr_tmp] \n\t"
//choose bigger from first and second part
"xvcmpgtdp 4,5 , 0 \n\t"
"xxsel 3, 0,5,4 \n\t"
"xxsel 33,32,34,4 \n\t"
//load next 64
"lxvd2x 48, %[i64],%[ptr_tmp] \n\t"
"lxvd2x 49, %[i80],%[ptr_tmp] \n\t"
"vaddudm 1,1,5 \n\t" // get real index for first bigger
"lxvd2x 50, %[i96],%[ptr_tmp] \n\t"
"lxvd2x 51,%[i112],%[ptr_tmp] \n\t"
//compare with previous to get vec_max_index(v6 | vs38 ) and vec_max_value (vs39)
"xvcmpgtdp 2, 3,39 \n\t"
"xxsel 39,39,3,2 \n\t"
"xxsel 38,38,33,2 \n\t"
//update index += 8
"vaddudm 5,5,4 \n\t"
"xvabsdp 44, 44 \n\t"
"xvabsdp 45, 45 \n\t"
"xvabsdp 46, 46 \n\t"
"xvabsdp 47, 47 \n\t"
//update index += 8
"vaddudm 5,5,4 \n\t"
"xvabsdp 48, 48 \n\t"
"xvabsdp 49, 49 \n\t"
"xvabsdp 50, 50 \n\t"
"xvabsdp 51, 51 \n\t"
//decrement n
"addic. %[n], %[n], -32 \n\t"
//Loop back if >0
"bgt+ 1b \n\t"
//==============================================================================
// Epilogue: reduce the last 16 values still held in registers (no further
// loads) and merge them into vs38/vs39.
"xvcmpgtdp 2,45,44 \n\t "
"xvcmpgtdp 3,47,46 \n\t "
"xvcmpgtdp 4,49,48 \n\t "
"xvcmpgtdp 5,51,50 \n\t"
"xxsel 32,40,41,2 \n\t"
"xxsel 0,44,45,2 \n\t"
"xxsel 33,42,43,3 \n\t"
"xxsel 1,46,47,3 \n\t"
"xxsel 34,40,41,4 \n\t"
"xxsel 45,48,49,4 \n\t"
"xxsel 35,42,43,5 \n\t"
"xxsel 47,50,51,5 \n\t"
"xvcmpgtdp 2, 1,0 \n\t"
"xvcmpgtdp 3,47, 45 \n\t"
"xxsel 32,32,33,2 \n\t"
"xxsel 0 ,0,1,2 \n\t"
"xxsel 34,34,35,3 \n\t"
"xxsel 5,45,47,3 \n\t"
// for {second 8 elements } we have to add 8 to each so that it became {from 8 to 16}
"vaddudm 2,2,4 \n\t" // vs34=vs34 + vs36{8,8}
//choose bigger from first and second part
"xvcmpgtdp 4,5 , 0 \n\t"
"xxsel 3, 0,5,4 \n\t"
"xxsel 33,32,34,4 \n\t"
"vaddudm 1,1,5 \n\t" // get real index for first bigger
//compare with previous to get vec_max_index(v6 | vs38 ) and vec_max_value (vs39)
"xvcmpgtdp 2, 3,39 \n\t"
"xxsel 39,39,3,2 \n\t"
"xxsel 38,38,33,2 \n\t"
///////extract max value and max index from vector
// Compare the two lanes of the running maximum against each other; the
// larger value is stored through ptr_maxf and its index moved to %[index].
"xxspltd 32,38,1 \n\t"
"xxspltd 40,39,1 \n\t"
"xvcmpeqdp. 2, 40,39 \n\t"
//cr6 0 bit set if all true, cr6=4*6+bit_ind=24,0011at CR(BI)==1, at=10 hint that it occurs rarely
//0b001110=14
"bc 14,24, 3f \n\t"
"xvcmpgtdp 4, 40,39 \n\t"
"xxsel 0,39,40,4 \n\t"
"xxsel 1,38,32,4 \n\t"
"stxsdx 0,0,%[ptr_maxf] \n\t"
"b 4f \n\t"
"3: \n\t"
//if elements value are equal then choose minimum index
"xxspltd 0,40,0 \n\t"
"vminud 0,0,6 \n\t" //vs32 vs38
"xxlor 1,32,32 \n\t"
"stxsdx 0,0,%[ptr_maxf] \n\t"
"4: \n\t"
"mfvsrd %[index],1 \n\t"
// Operands: ptr_tmp walks over x; i16..i112 are byte offsets held in
// registers for the indexed loads.
: [maxf] "=m"(*maxf),[ptr_tmp] "+&b"(x),[index] "=r"(index), [n] "+&r"(n)
: [mem] "m"(*(const double (*)[n])x), [ptr_x] "b"(x), [ptr_maxf] "b"(maxf) ,
[i16] "b"(16), [i32] "b"(32), [i48] "b"(48),
[i64] "b"(64), [i80] "b"(80), [i96] "b"(96), [i112] "b"(112),
[start] "v"(start), [adder] "v"(temp_add_index)
: "cc", "vs0", "vs1","vs2","vs3", "vs4","vs5","vs32", "vs33", "vs34", "vs35", "vs36",
"vs37", "vs38", "vs39", "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49", "vs50", "vs51"
);
return index;
}
/*
 * Returns the 1-based position of the element of x with the largest
 * absolute value, or 0 when n or inc_x is not positive.
 * Contiguous input (inc_x == 1) is handed to diamax_kernel_32 in multiples
 * of 32 elements; everything else is scanned with scalar code that uses a
 * strict '>' comparison, so an earlier element is kept on equal magnitudes.
 */
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    BLASLONG best = 0;     /* 0-based position of the largest |x| seen so far */
    FLOAT best_abs = 0.0;  /* the magnitude found at that position */
    BLASLONG pos = 0;      /* logical element counter */

    if (n <= 0 || inc_x <= 0) {
        return best;
    }

    if (inc_x == 1) {
        /* The kernel only accepts lengths that are multiples of 32. */
        const BLASLONG vec_len = n & -32;

        if (vec_len > 0) {
            best = diamax_kernel_32(vec_len, x, &best_abs);
            pos = vec_len;
        }

        /* Scalar pass over the elements the kernel did not cover. */
        for (; pos < n; pos++) {
            const FLOAT cur = ABS(x[pos]);
            if (cur > best_abs) {
                best = pos;
                best_abs = cur;
            }
        }
    } else {
        const BLASLONG quad_len = n & -4;
        BLASLONG off = 0;  /* offset inside x of logical element `pos` */

        /* Strided input: four elements per trip ... */
        for (; pos < quad_len; pos += 4, off += inc_x * 4) {
            BLASLONG k;
            for (k = 0; k < 4; k++) {
                const FLOAT cur = ABS(x[off + k * inc_x]);
                if (cur > best_abs) {
                    best = pos + k;
                    best_abs = cur;
                }
            }
        }

        /* ... then whatever is left over. */
        for (; pos < n; pos++, off += inc_x) {
            const FLOAT cur = ABS(x[off]);
            if (cur > best_abs) {
                best = pos;
                best_abs = cur;
            }
        }
    }

    return best + 1;
}

384
kernel/power/idamin.c Normal file
View File

@@ -0,0 +1,384 @@
/***************************************************************************
Copyright (c) 2013-2018, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#include <math.h>
#if defined(DOUBLE)
#define ABS fabs
#else
#define ABS fabsf
#endif
/**
 * Find the index of the element with the smallest absolute value.
 * Warning: requirements n>0 and n % 32 == 0
 * @param n number of doubles to scan
 * @param x pointer to the vector
 * @param minf (in/out) on entry the starting minimum: the asm loads it with
 *        lxvdsx, takes its absolute value and pairs it with index 0;
 *        on exit the minimum absolute value that was found
 * @return 0-based index of the minimum (the caller adds 1)
 *
 * All reductions use a strict "greater than" compare feeding xxsel, so the
 * candidate that is already held (the one with the lower index) is kept when
 * two magnitudes are equal. This matches the strict '<' used by the scalar
 * loops in CNAME.
 */
static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) {
BLASLONG index;
// seed of the per-lane index vectors; stepped by temp_add_index below
register __vector long long start = {1,0};
register __vector long long temp_add_index = {2, 2};
__asm__(
// preload the first 128 bytes (16 doubles) into vs44..vs51
"lxvd2x 44, 0,%[ptr_tmp] \n\t"
"lxvd2x 45, %[i16],%[ptr_tmp] \n\t"
"lxvd2x 46, %[i32],%[ptr_tmp] \n\t"
"lxvd2x 47, %[i48],%[ptr_tmp] \n\t"
"lxvd2x 48, %[i64],%[ptr_tmp] \n\t"
"lxvd2x 49, %[i80],%[ptr_tmp] \n\t"
"lxvd2x 50, %[i96],%[ptr_tmp] \n\t"
"lxvd2x 51,%[i112],%[ptr_tmp] \n\t"
// build the index vectors vs40..vs43 and the running state
"xxlor 40,%x[start],%x[start] \n\t" //{ 1,0} vs40 | v8
"vaddudm 9,8, %[adder] \n\t" //{3,2} vs41
"xxlxor 37,37 ,37 \n\t" //v5 v37 index_count
"vaddudm 10,9,%[adder] \n\t" //{5,4} vs42
"xxlxor 38 ,38 ,38 \n\t" // v6 | vs38 vec_min_index
"vaddudm 11,10,%[adder] \n\t" //{7,6} vs43
"lxvdsx 39,0,%[ptr_minf] \n\t" // vs39 vec_min_value, seeded from *minf
"vaddudm 4,11, %[adder] \n\t" // {9,8} -{8;8} vs36 | v4
"xxspltd 36,36,0 \n\t"
"xvabsdp 39, 39 \n\t"
"xvabsdp 44, 44 \n\t"
"xvabsdp 45, 45 \n\t"
"xvabsdp 46, 46 \n\t"
"xvabsdp 47, 47 \n\t"
"xvabsdp 48, 48 \n\t"
"xvabsdp 49, 49 \n\t"
"xvabsdp 50, 50 \n\t"
"xvabsdp 51, 51 \n\t"
//jump first half forward
"b 2f \n\t"
//===================================================================
".p2align 5 \n\t"
"1: \n\t"
// pairwise reduction: the second operand replaces the first only when
// it is strictly smaller
"xvcmpgtdp 2,44,45 \n\t "
"xvcmpgtdp 3,46,47 \n\t "
"xvcmpgtdp 4,48,49 \n\t "
"xvcmpgtdp 5,50,51 \n\t"
"xxsel 32,40,41,2 \n\t"
"xxsel 0,44,45,2 \n\t"
"xxsel 33,42,43,3 \n\t"
"xxsel 1,46,47,3 \n\t"
"xxsel 34,40,41,4 \n\t"
"xxsel 45,48,49,4 \n\t"
"xxsel 35,42,43,5 \n\t"
"xxsel 47,50,51,5 \n\t"
"xvcmpgtdp 2,0, 1 \n\t"
"xvcmpgtdp 3, 45,47 \n\t"
"addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t"
"xxsel 32,32,33,2 \n\t"
"xxsel 0 ,0,1,2 \n\t"
"xxsel 34,34,35,3 \n\t"
"xxsel 5,45,47,3 \n\t"
//load next 64
"lxvd2x 44, 0,%[ptr_tmp] \n\t"
"lxvd2x 45, %[i16],%[ptr_tmp] \n\t"
// the second group of 8 elements needs 8 added to its indices (8..15)
"vaddudm 2,2,4 \n\t" // vs34=vs34 + vs36{8,8}
"lxvd2x 46, %[i32],%[ptr_tmp] \n\t"
"lxvd2x 47, %[i48],%[ptr_tmp] \n\t"
//choose smaller from first and second part
"xvcmpgtdp 4, 0,5 \n\t"
"xxsel 3, 0,5,4 \n\t"
"xxsel 33,32,34,4 \n\t"
//load next 64
"lxvd2x 48, %[i64],%[ptr_tmp] \n\t"
"lxvd2x 49, %[i80],%[ptr_tmp] \n\t"
"vaddudm 1,1,5 \n\t" // get real index for first smaller
"lxvd2x 50, %[i96],%[ptr_tmp] \n\t"
"lxvd2x 51,%[i112],%[ptr_tmp] \n\t"
//compare with previous to get vec_min_index(v6 | vs38 ) and vec_min_value (vs39)
"xvcmpgtdp 2,39, 3 \n\t"
"xxsel 39,39,3,2 \n\t"
"xxsel 38,38,33,2 \n\t"
//update index += 8
"vaddudm 5,5,4 \n\t"
"xvabsdp 44, 44 \n\t"
"xvabsdp 45, 45 \n\t"
"xvabsdp 46, 46 \n\t"
"xvabsdp 47, 47 \n\t"
//update index += 8
"vaddudm 5,5,4 \n\t"
"xvabsdp 48, 48 \n\t"
"xvabsdp 49, 49 \n\t"
"xvabsdp 50, 50 \n\t"
"xvabsdp 51, 51 \n\t"
//<-----------jump here from first load
"2: \n\t"
"xvcmpgtdp 2,44,45 \n\t "
"xvcmpgtdp 3,46,47 \n\t "
"xvcmpgtdp 4,48,49 \n\t "
"xvcmpgtdp 5,50,51 \n\t"
"xxsel 32,40,41,2 \n\t"
"xxsel 0,44,45,2 \n\t"
"xxsel 33,42,43,3 \n\t"
"xxsel 1,46,47,3 \n\t"
"xxsel 34,40,41,4 \n\t"
"xxsel 45,48,49,4 \n\t"
"xxsel 35,42,43,5 \n\t"
"xxsel 47,50,51,5 \n\t"
"xvcmpgtdp 2,0, 1 \n\t"
"xvcmpgtdp 3, 45,47 \n\t"
"xxsel 32,32,33,2 \n\t"
"xxsel 0 ,0,1,2 \n\t"
"xxsel 34,34,35,3 \n\t"
"xxsel 5,45,47,3 \n\t"
"addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t"
// the second group of 8 elements needs 8 added to its indices (8..15)
"vaddudm 2,2,4 \n\t" // vs34=vs34 + vs36{8,8}
//load next 64
"lxvd2x 44, 0,%[ptr_tmp] \n\t"
"lxvd2x 45, %[i16],%[ptr_tmp] \n\t"
"lxvd2x 46, %[i32],%[ptr_tmp] \n\t"
"lxvd2x 47, %[i48],%[ptr_tmp] \n\t"
//choose smaller from first and second part
"xvcmpgtdp 4, 0,5 \n\t"
"xxsel 3, 0,5,4 \n\t"
"xxsel 33,32,34,4 \n\t"
//load next 64
"lxvd2x 48, %[i64],%[ptr_tmp] \n\t"
"lxvd2x 49, %[i80],%[ptr_tmp] \n\t"
"vaddudm 1,1,5 \n\t" // get real index for first smaller
"lxvd2x 50, %[i96],%[ptr_tmp] \n\t"
"lxvd2x 51,%[i112],%[ptr_tmp] \n\t"
//compare with previous to get vec_min_index(v6 | vs38 ) and vec_min_value (vs39)
"xvcmpgtdp 2,39, 3 \n\t"
"xxsel 39,39,3,2 \n\t"
"xxsel 38,38,33,2 \n\t"
//update index += 8
"vaddudm 5,5,4 \n\t"
"xvabsdp 44, 44 \n\t"
"xvabsdp 45, 45 \n\t"
"xvabsdp 46, 46 \n\t"
"xvabsdp 47, 47 \n\t"
//update index += 8
"vaddudm 5,5,4 \n\t"
"xvabsdp 48, 48 \n\t"
"xvabsdp 49, 49 \n\t"
"xvabsdp 50, 50 \n\t"
"xvabsdp 51, 51 \n\t"
//decrement n
"addic. %[n], %[n], -32 \n\t"
//Loop back if >0
"bgt+ 1b \n\t"
//==============================================================================
// tail: reduce the last 16 preloaded elements, no further loads
"xvcmpgtdp 2,44,45 \n\t "
"xvcmpgtdp 3,46,47 \n\t "
"xvcmpgtdp 4,48,49 \n\t "
"xvcmpgtdp 5,50,51 \n\t"
"xxsel 32,40,41,2 \n\t"
"xxsel 0,44,45,2 \n\t"
"xxsel 33,42,43,3 \n\t"
"xxsel 1,46,47,3 \n\t"
"xxsel 34,40,41,4 \n\t"
"xxsel 45,48,49,4 \n\t"
"xxsel 35,42,43,5 \n\t"
"xxsel 47,50,51,5 \n\t"
"xvcmpgtdp 2,0, 1 \n\t"
"xvcmpgtdp 3, 45,47 \n\t"
"xxsel 32,32,33,2 \n\t"
"xxsel 0 ,0,1,2 \n\t"
"xxsel 34,34,35,3 \n\t"
"xxsel 5,45,47,3 \n\t"
// the second group of 8 elements needs 8 added to its indices (8..15)
"vaddudm 2,2,4 \n\t" // vs34=vs34 + vs36{8,8}
//choose smaller from first and second part
"xvcmpgtdp 4, 0,5 \n\t"
"xxsel 3, 0,5,4 \n\t"
"xxsel 33,32,34,4 \n\t"
"vaddudm 1,1,5 \n\t" // get real index for first smaller
//compare with previous to get vec_min_index(v6 | vs38 ) and vec_min_value (vs39)
"xvcmpgtdp 2,39, 3 \n\t"
"xxsel 39,39,3,2 \n\t"
"xxsel 38,38,33,2 \n\t"
///////extract min value and min index from vector
"xxspltd 32,38,1 \n\t"
"xxspltd 40,39,1 \n\t"
"xvcmpeqdp. 2, 40,39 \n\t"
//cr6 0 bit set if all true, cr6=4*6+bit_ind=24,0011at CR(BI)==1, at=10 hint that it occurs rarely
//0b001110=14
"bc 14,24, 3f \n\t"
// the two lanes differ here, so >= and > select the same lane
"xvcmpgedp 4,39, 40 \n\t"
"xxsel 0,39,40,4 \n\t"
"xxsel 1,38,32,4 \n\t"
"stxsdx 0,0,%[ptr_minf] \n\t"
"b 4f \n\t"
"3: \n\t"
//if the lane values are equal then choose the minimum index
"xxspltd 0,40,0 \n\t"
"vminud 0,0,6 \n\t" //vs32 vs38
"xxlor 1,32,32 \n\t"
"stxsdx 0,0,%[ptr_minf] \n\t"
"4: \n\t"
"mfvsrd %[index],1 \n\t"
// *minf is read (lxvdsx) as well as written (stxsdx), hence "+m"
: [minf] "+m"(*minf),[ptr_tmp] "+&b"(x),[index] "=r"(index), [n] "+&r"(n)
: [mem] "m"(*(const double (*)[n])x), [ptr_x] "b"(x), [ptr_minf] "b"(minf) ,
[i16] "b"(16), [i32] "b"(32), [i48] "b"(48),
[i64] "b"(64), [i80] "b"(80), [i96] "b"(96), [i112] "b"(112),
[start] "v"(start), [adder] "v"(temp_add_index)
: "cc", "vs0", "vs1","vs2","vs3", "vs4","vs5","vs32", "vs33", "vs34", "vs35", "vs36",
"vs37", "vs38", "vs39", "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49", "vs50", "vs51"
);
return index;
}
/*
 * Returns the 1-based position of the element of x with the smallest
 * absolute value, or 0 when n or inc_x is not positive.
 * The search is seeded with |x[0]| at position 0. Contiguous input
 * (inc_x == 1) is handed to diamin_kernel_32 in multiples of 32 elements;
 * everything else is scanned with scalar code that uses a strict '<'
 * comparison, so an earlier element is kept on equal magnitudes.
 */
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    BLASLONG best = 0;     /* 0-based position of the smallest |x| seen so far */
    FLOAT best_abs = 0.0;  /* the magnitude found at that position */
    BLASLONG pos = 0;      /* logical element counter */

    if (n <= 0 || inc_x <= 0) {
        return best;
    }

    /* Seed with the first element; the position counter is not advanced. */
    best_abs = ABS(x[0]);

    if (inc_x == 1) {
        /* The kernel only accepts lengths that are multiples of 32. */
        const BLASLONG vec_len = n & -32;

        if (vec_len > 0) {
            best = diamin_kernel_32(vec_len, x, &best_abs);
            pos = vec_len;
        }

        /* Scalar pass over the elements the kernel did not cover. */
        for (; pos < n; pos++) {
            const FLOAT cur = ABS(x[pos]);
            if (cur < best_abs) {
                best = pos;
                best_abs = cur;
            }
        }
    } else {
        const BLASLONG quad_len = n & -4;
        BLASLONG off = 0;  /* offset inside x of logical element `pos` */

        /* Strided input: four elements per trip ... */
        for (; pos < quad_len; pos += 4, off += inc_x * 4) {
            BLASLONG k;
            for (k = 0; k < 4; k++) {
                const FLOAT cur = ABS(x[off + k * inc_x]);
                if (cur < best_abs) {
                    best = pos + k;
                    best_abs = cur;
                }
            }
        }

        /* ... then whatever is left over. */
        for (; pos < n; pos++, off += inc_x) {
            const FLOAT cur = ABS(x[off]);
            if (cur < best_abs) {
                best = pos;
                best_abs = cur;
            }
        }
    }

    return best + 1;
}

362
kernel/power/izamax.c Normal file
View File

@@ -0,0 +1,362 @@
/***************************************************************************
Copyright (c) 2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#include <math.h>
#define ABS fabs
/* |re| + |im| of the complex value stored at x[i], x[i+1].
 * Arguments and the whole expansion are parenthesized so the macro can be
 * used safely inside a larger expression. */
#define CABS1(x,i) (ABS((x)[(i)]) + ABS((x)[(i) + 1]))
/**
 * Find the index of the complex element with the largest |re| + |im|.
 * Warning: requirements n>0 and n % 16 == 0
 * @param n number of complex elements (x is accessed as 2*n doubles)
 * @param x pointer to the vector
 * @param maxf (out) maximum absolute value; only written, the running
 *        maximum starts from zero inside the kernel
 * @return 0-based index of the maximum (the caller adds 1)
 */
static BLASLONG ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) {
BLASLONG index;
// seed of the per-lane index vectors; stepped by temp_add_index below
register __vector long long start = {1,0};
register __vector long long temp_add_index = {2, 2};
__asm__(
// preload the first 128 bytes (8 complex elements) into vs44..vs51
"lxvd2x 44, 0,%[ptr_tmp] \n\t"
"lxvd2x 45, %[i16],%[ptr_tmp] \n\t"
"lxvd2x 46, %[i32],%[ptr_tmp] \n\t"
"lxvd2x 47, %[i48],%[ptr_tmp] \n\t"
"lxvd2x 48, %[i64],%[ptr_tmp] \n\t"
"lxvd2x 49, %[i80],%[ptr_tmp] \n\t"
"lxvd2x 50, %[i96],%[ptr_tmp] \n\t"
"lxvd2x 51,%[i112],%[ptr_tmp] \n\t"
// build the index vectors vs40..vs43 and clear the running state
"xxlor 40,%x[start],%x[start] \n\t" //{ 1,0} vs40 | v8
"vaddudm 9,8,%[adder] \n\t" //{3,2} vs41
"xxlxor 37,37 ,37 \n\t" //v5 v37 index_count
"vaddudm 10,9,%[adder] \n\t" //{5,4} vs42
"xxlxor 38 ,38 ,38 \n\t" // v6 | vs38 vec_max_index
"vaddudm 11,10,%[adder] \n\t" //{7,6} vs43
"xxlxor 39,39,39 \n\t" // vs39 vec_max_value is zero
"vaddudm 4,11, %[adder] \n\t" // {9,8} -{8;8} vs36 | v4
"xxspltd 36,36,0 \n\t"
"xvabsdp 44, 44 \n\t"
"xvabsdp 45, 45 \n\t"
"xvabsdp 46, 46 \n\t"
"xvabsdp 47, 47 \n\t"
"xvabsdp 48, 48 \n\t"
"xvabsdp 49, 49 \n\t"
"xvabsdp 50, 50 \n\t"
"xvabsdp 51, 51 \n\t"
//jump first half forward
"b 2f \n\t"
".p2align 5 \n\t"
"1: \n\t"
// regroup the absolute values of two registers at a time (high doublewords
// together, low doublewords together) ...
"xxmrghd 0,44,45 \n\t"
"xxmrgld 1,44,45 \n\t"
"xxmrghd 2,46,47 \n\t"
"xxmrgld 3,46,47 \n\t"
"xxmrghd 4,48,49 \n\t"
"xxmrgld 5,48,49 \n\t"
"xxmrghd 44,50,51 \n\t"
"xxmrgld 45,50,51 \n\t"
// ... and add them, giving |re|+|im| for 8 elements in vs46..vs49
"xvadddp 46, 0,1 \n\t"
"xvadddp 47, 2,3 \n\t"
"xvadddp 48, 4,5 \n\t"
"xvadddp 49, 44,45 \n\t"
// pairwise reduction: strictly greater replaces, value and index together
"xvcmpgtdp 50,47,46 \n\t "
"xvcmpgtdp 51,49,48 \n\t "
"addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t"
"xxsel 32,40,41,50 \n\t"
"xxsel 0,46,47,50 \n\t"
"xxsel 33,42,43,51 \n\t"
"xxsel 1,48,49,51 \n\t"
// loads for the next block are interleaved with the rest of the reduction
"lxvd2x 44, 0,%[ptr_tmp] \n\t"
"lxvd2x 45, %[i16],%[ptr_tmp] \n\t"
"xvcmpgtdp 2,1,0 \n\t "
"lxvd2x 46, %[i32],%[ptr_tmp] \n\t"
"lxvd2x 47, %[i48],%[ptr_tmp] \n\t"
"xxsel 32,32,33,2 \n\t"
"xxsel 3,0,1,2 \n\t"
// add index_count (v5) to the block-local index
"vaddudm 0,0,5 \n\t"
//cmp with previous
"xvcmpgtdp 4,3,39 \n\t "
// index_count += {8,8} (v4 is vs36, not the vs4 mask written just above)
"vaddudm 5,5,4 \n\t"
"lxvd2x 48, %[i64],%[ptr_tmp] \n\t"
"lxvd2x 49, %[i80],%[ptr_tmp] \n\t"
"lxvd2x 50, %[i96],%[ptr_tmp] \n\t"
"lxvd2x 51,%[i112],%[ptr_tmp] \n\t"
//select with previous
"xxsel 38,38,32,4 \n\t"
"xxsel 39,39,3,4 \n\t"
"xvabsdp 44, 44 \n\t"
"xvabsdp 45, 45 \n\t"
"xvabsdp 46, 46 \n\t"
"xvabsdp 47, 47 \n\t"
"xvabsdp 48, 48 \n\t"
"xvabsdp 49, 49 \n\t"
"xvabsdp 50, 50 \n\t"
"xvabsdp 51, 51 \n\t"
//>>/////////////////////////////// half start
// second copy of the same 8-element step; execution enters here first
"2: \n\t"
"xxmrghd 0,44,45 \n\t"
"xxmrgld 1,44,45 \n\t"
"xxmrghd 2,46,47 \n\t"
"xxmrgld 3,46,47 \n\t"
"xxmrghd 4,48,49 \n\t"
"xxmrgld 5,48,49 \n\t"
"xxmrghd 44,50,51 \n\t"
"xxmrgld 45,50,51 \n\t"
"xvadddp 46, 0,1 \n\t"
"xvadddp 47, 2,3 \n\t"
"xvadddp 48, 4,5 \n\t"
"xvadddp 49, 44,45 \n\t"
"xvcmpgtdp 50,47,46 \n\t "
"xvcmpgtdp 51,49,48 \n\t "
"addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t"
"xxsel 32,40,41,50 \n\t"
"xxsel 0,46,47,50 \n\t"
"xxsel 33,42,43,51 \n\t"
"xxsel 1,48,49,51 \n\t"
"lxvd2x 44, 0,%[ptr_tmp] \n\t"
"lxvd2x 45, %[i16],%[ptr_tmp] \n\t"
"xvcmpgtdp 2,1,0 \n\t "
"lxvd2x 46, %[i32],%[ptr_tmp] \n\t"
"lxvd2x 47, %[i48],%[ptr_tmp] \n\t"
"xxsel 32,32,33,2 \n\t"
"xxsel 3,0,1,2 \n\t"
"vaddudm 0,0,5 \n\t"
//cmp with previous
"xvcmpgtdp 4,3,39 \n\t "
"vaddudm 5,5,4 \n\t"
"lxvd2x 48, %[i64],%[ptr_tmp] \n\t"
"lxvd2x 49, %[i80],%[ptr_tmp] \n\t"
"lxvd2x 50, %[i96],%[ptr_tmp] \n\t"
"lxvd2x 51,%[i112],%[ptr_tmp] \n\t"
//select with previous
"xxsel 38,38,32,4 \n\t"
"xxsel 39,39,3,4 \n\t"
"xvabsdp 44, 44 \n\t"
"xvabsdp 45, 45 \n\t"
"xvabsdp 46, 46 \n\t"
"xvabsdp 47, 47 \n\t"
"xvabsdp 48, 48 \n\t"
"xvabsdp 49, 49 \n\t"
"xvabsdp 50, 50 \n\t"
"xvabsdp 51, 51 \n\t"
//decrement n
"addic. %[n], %[n], -16 \n\t"
//Loop back if >0
"bgt+ 1b \n\t"
// tail: reduce the last 8 preloaded elements, no further loads
"xxmrghd 0,44,45 \n\t"
"xxmrgld 1,44,45 \n\t"
"xxmrghd 2,46,47 \n\t"
"xxmrgld 3,46,47 \n\t"
"xxmrghd 4,48,49 \n\t"
"xxmrgld 5,48,49 \n\t"
"xxmrghd 44,50,51 \n\t"
"xxmrgld 45,50,51 \n\t"
"xvadddp 46, 0,1 \n\t"
"xvadddp 47, 2,3 \n\t"
"xvadddp 48, 4,5 \n\t"
"xvadddp 49, 44,45 \n\t"
"xvcmpgtdp 50,47,46 \n\t "
"xvcmpgtdp 51,49,48 \n\t "
"xxsel 32,40,41,50 \n\t"
"xxsel 0,46,47,50 \n\t"
"xxsel 33,42,43,51 \n\t"
"xxsel 1,48,49,51 \n\t"
"xvcmpgtdp 2,1,0 \n\t "
"xxsel 32,32,33,2 \n\t"
"xxsel 3,0,1,2 \n\t"
"vaddudm 0,0,5 \n\t"
"addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t"
//cmp with previous
"xvcmpgtdp 4,3,39 \n\t "
"vaddudm 5,5,4 \n\t"
"xxsel 38,38,32,4 \n\t"
"xxsel 39,39,3,4 \n\t"
///////extract max value and max index from vector
"xxspltd 32,38,1 \n\t"
"xxspltd 40,39,1 \n\t"
"xvcmpeqdp. 2, 40,39 \n\t"
//cr6 0 bit set if all true, cr6=4*6+bit_ind=24,0011at CR(BI)==1, at=10 hint that it occurs rarely
//0b001110=14
"bc 14,24, 3f \n\t"
// lane values differ: keep the larger value together with its index
"xvcmpgtdp 4, 40,39 \n\t"
"xxsel 0,39,40,4 \n\t"
"xxsel 1,38,32,4 \n\t"
"stxsdx 0,0,%[ptr_maxf] \n\t"
"b 4f \n\t"
"3: \n\t"
//if the lane values are equal then choose the minimum index
"xxspltd 0,40,0 \n\t"
"vminud 0,0,6 \n\t" //vs32 vs38
"xxlor 1,32,32 \n\t"
"stxsdx 0,0,%[ptr_maxf] \n\t"
"4: \n\t"
"mfvsrd %[index],1 \n\t"
: [maxf] "=m"(*maxf),[ptr_tmp] "+&b"(x),[index] "=r"(index), [n] "+&r"(n)
: [mem] "m"(*(const double (*)[2*n])x), [ptr_x] "b"(x), [ptr_maxf] "b"(maxf) ,
[i16] "b"(16), [i32] "b"(32), [i48] "b"(48),
[i64] "b"(64), [i80] "b"(80), [i96] "b"(96), [i112] "b"(112),
[start] "v"(start), [adder] "v"(temp_add_index)
: "cc", "vs0", "vs1","vs2","vs3", "vs4","vs5","vs32", "vs33", "vs34", "vs35", "vs36",
"vs37", "vs38", "vs39", "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49", "vs50", "vs51"
);
return index;
}
/*
 * Returns the 1-based position of the complex element of x whose
 * |re| + |im| (CABS1) is largest, or 0 when n or inc_x is not positive.
 * Contiguous input (inc_x == 1) is handed to ziamax_kernel_16 in multiples
 * of 16 elements; the remainder and strided input are scanned with scalar
 * code using a strict '>' comparison.
 */
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    BLASLONG best = 0;    /* 0-based position of the current maximum */
    FLOAT best_abs = 0;   /* CABS1 value found at that position */
    BLASLONG pos = 0;     /* complex element counter */
    BLASLONG off = 0;     /* offset of that element's real part inside x */

    if (n <= 0 || inc_x <= 0) {
        return best;
    }

    if (inc_x == 1) {
        /* The kernel only accepts counts that are multiples of 16. */
        const BLASLONG vec_len = n & -16;

        if (vec_len > 0) {
            best = ziamax_kernel_16(vec_len, x, &best_abs);
            pos = vec_len;
            off = vec_len << 1;
        }

        /* Scalar pass over the elements the kernel did not cover. */
        for (; pos < n; pos++, off += 2) {
            if (CABS1(x,off) > best_abs) {
                best = pos;
                best_abs = CABS1(x,off);
            }
        }
    } else {
        /* Two FLOATs per complex element. */
        const BLASLONG step = 2 * inc_x;

        /* Seed with element 0, then examine elements 1 .. n-1. */
        best_abs = CABS1(x,0);
        for (pos = 1, off = step; pos < n; pos++, off += step) {
            if (CABS1(x,off) > best_abs) {
                best = pos;
                best_abs = CABS1(x,off);
            }
        }
    }

    return best + 1;
}

361
kernel/power/izamin.c Normal file
View File

@@ -0,0 +1,361 @@
/***************************************************************************
Copyright (c) 2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#include <math.h>
#define ABS fabs
/* |re| + |im| of the complex value stored at x[i], x[i+1].
 * Arguments and the whole expansion are parenthesized so the macro can be
 * used safely inside a larger expression. */
#define CABS1(x,i) (ABS((x)[(i)]) + ABS((x)[(i) + 1]))
/**
 * Find the index of the complex element with the smallest |re| + |im|.
 * Warning: requirements n>0 and n % 16 == 0
 * @param n number of complex elements (x is accessed as 2*n doubles)
 * @param x pointer to the vector
 * @param minf (in/out) on entry the starting minimum: the asm loads it with
 *        lxvdsx and pairs it with index 0; on exit the minimum value found
 * @return 0-based index of the minimum (the caller adds 1)
 *
 * All reductions use a strict "greater than" compare feeding xxsel, so the
 * candidate that is already held (the one with the lower index) is kept when
 * two values are equal. This matches the strict '<' used by the scalar
 * loops in CNAME.
 */
static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) {
BLASLONG index;
// seed of the per-lane index vectors; stepped by temp_add_index below
register __vector long long start = {1,0};
register __vector long long temp_add_index = {2, 2};
__asm__(
// preload the first 128 bytes (8 complex elements) into vs44..vs51
"lxvd2x 44, 0,%[ptr_tmp] \n\t"
"lxvd2x 45, %[i16],%[ptr_tmp] \n\t"
"lxvd2x 46, %[i32],%[ptr_tmp] \n\t"
"lxvd2x 47, %[i48],%[ptr_tmp] \n\t"
"lxvd2x 48, %[i64],%[ptr_tmp] \n\t"
"lxvd2x 49, %[i80],%[ptr_tmp] \n\t"
"lxvd2x 50, %[i96],%[ptr_tmp] \n\t"
"lxvd2x 51,%[i112],%[ptr_tmp] \n\t"
// build the index vectors vs40..vs43 and the running state
"xxlor 40,%x[start],%x[start] \n\t" //{ 1,0} vs40 | v8
"vaddudm 9,8,%[adder] \n\t" //{3,2} vs41
"xxlxor 37,37 ,37 \n\t" //v5 v37 index_count
"vaddudm 10,9,%[adder] \n\t" //{5,4} vs42
"xxlxor 38 ,38 ,38 \n\t" // v6 | vs38 vec_min_index
"vaddudm 11,10,%[adder] \n\t" //{7,6} vs43
"lxvdsx 39,0,%[ptr_minf] \n\t" // vs39 vec_min_value, seeded from *minf
"vaddudm 4,11, %[adder] \n\t" // {9,8} -{8;8} vs36 | v4
"xxspltd 36,36,0 \n\t"
"xvabsdp 44, 44 \n\t"
"xvabsdp 45, 45 \n\t"
"xvabsdp 46, 46 \n\t"
"xvabsdp 47, 47 \n\t"
"xvabsdp 48, 48 \n\t"
"xvabsdp 49, 49 \n\t"
"xvabsdp 50, 50 \n\t"
"xvabsdp 51, 51 \n\t"
//jump first half forward
"b 2f \n\t"
".p2align 5 \n\t"
"1: \n\t"
// regroup the absolute values of two registers at a time (high doublewords
// together, low doublewords together) ...
"xxmrghd 0,44,45 \n\t"
"xxmrgld 1,44,45 \n\t"
"xxmrghd 2,46,47 \n\t"
"xxmrgld 3,46,47 \n\t"
"xxmrghd 4,48,49 \n\t"
"xxmrgld 5,48,49 \n\t"
"xxmrghd 44,50,51 \n\t"
"xxmrgld 45,50,51 \n\t"
// ... and add them, giving |re|+|im| for 8 elements in vs46..vs49
"xvadddp 46, 0,1 \n\t"
"xvadddp 47, 2,3 \n\t"
"xvadddp 48, 4,5 \n\t"
"xvadddp 49, 44,45 \n\t"
// pairwise reduction: the second operand replaces the first only when
// it is strictly smaller
"xvcmpgtdp 50,46,47 \n\t "
"xvcmpgtdp 51,48,49 \n\t "
"addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t"
"xxsel 32,40,41,50 \n\t"
"xxsel 0,46,47,50 \n\t"
"xxsel 33,42,43,51 \n\t"
"xxsel 1,48,49,51 \n\t"
// loads for the next block are interleaved with the rest of the reduction
"lxvd2x 44, 0,%[ptr_tmp] \n\t"
"lxvd2x 45, %[i16],%[ptr_tmp] \n\t"
"xvcmpgtdp 2,0,1 \n\t "
"lxvd2x 46, %[i32],%[ptr_tmp] \n\t"
"lxvd2x 47, %[i48],%[ptr_tmp] \n\t"
"xxsel 32,32,33,2 \n\t"
"xxsel 3,0,1,2 \n\t"
// add index_count (v5) to the block-local index
"vaddudm 0,0,5 \n\t"
//cmp with previous
"xvcmpgtdp 4,39,3 \n\t "
// index_count += {8,8} (v4 is vs36, not the vs4 mask written just above)
"vaddudm 5,5,4 \n\t"
"lxvd2x 48, %[i64],%[ptr_tmp] \n\t"
"lxvd2x 49, %[i80],%[ptr_tmp] \n\t"
"lxvd2x 50, %[i96],%[ptr_tmp] \n\t"
"lxvd2x 51,%[i112],%[ptr_tmp] \n\t"
//select with previous
"xxsel 38,38,32,4 \n\t"
"xxsel 39,39,3,4 \n\t"
"xvabsdp 44, 44 \n\t"
"xvabsdp 45, 45 \n\t"
"xvabsdp 46, 46 \n\t"
"xvabsdp 47, 47 \n\t"
"xvabsdp 48, 48 \n\t"
"xvabsdp 49, 49 \n\t"
"xvabsdp 50, 50 \n\t"
"xvabsdp 51, 51 \n\t"
//>>/////////////////////////////// half start
// second copy of the same 8-element step; execution enters here first
"2: \n\t"
"xxmrghd 0,44,45 \n\t"
"xxmrgld 1,44,45 \n\t"
"xxmrghd 2,46,47 \n\t"
"xxmrgld 3,46,47 \n\t"
"xxmrghd 4,48,49 \n\t"
"xxmrgld 5,48,49 \n\t"
"xxmrghd 44,50,51 \n\t"
"xxmrgld 45,50,51 \n\t"
"xvadddp 46, 0,1 \n\t"
"xvadddp 47, 2,3 \n\t"
"xvadddp 48, 4,5 \n\t"
"xvadddp 49, 44,45 \n\t"
"xvcmpgtdp 50,46,47 \n\t "
"xvcmpgtdp 51,48,49 \n\t "
"addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t"
"xxsel 32,40,41,50 \n\t"
"xxsel 0,46,47,50 \n\t"
"xxsel 33,42,43,51 \n\t"
"xxsel 1,48,49,51 \n\t"
"lxvd2x 44, 0,%[ptr_tmp] \n\t"
"lxvd2x 45, %[i16],%[ptr_tmp] \n\t"
"xvcmpgtdp 2,0,1 \n\t "
"lxvd2x 46, %[i32],%[ptr_tmp] \n\t"
"lxvd2x 47, %[i48],%[ptr_tmp] \n\t"
"xxsel 32,32,33,2 \n\t"
"xxsel 3,0,1,2 \n\t"
"vaddudm 0,0,5 \n\t"
//cmp with previous
"xvcmpgtdp 4,39,3 \n\t "
"vaddudm 5,5,4 \n\t"
"lxvd2x 48, %[i64],%[ptr_tmp] \n\t"
"lxvd2x 49, %[i80],%[ptr_tmp] \n\t"
"lxvd2x 50, %[i96],%[ptr_tmp] \n\t"
"lxvd2x 51,%[i112],%[ptr_tmp] \n\t"
//select with previous
"xxsel 38,38,32,4 \n\t"
"xxsel 39,39,3,4 \n\t"
"xvabsdp 44, 44 \n\t"
"xvabsdp 45, 45 \n\t"
"xvabsdp 46, 46 \n\t"
"xvabsdp 47, 47 \n\t"
"xvabsdp 48, 48 \n\t"
"xvabsdp 49, 49 \n\t"
"xvabsdp 50, 50 \n\t"
"xvabsdp 51, 51 \n\t"
//decrement n
"addic. %[n], %[n], -16 \n\t"
//Loop back if >0
"bgt+ 1b \n\t"
// tail: reduce the last 8 preloaded elements, no further loads
"xxmrghd 0,44,45 \n\t"
"xxmrgld 1,44,45 \n\t"
"xxmrghd 2,46,47 \n\t"
"xxmrgld 3,46,47 \n\t"
"xxmrghd 4,48,49 \n\t"
"xxmrgld 5,48,49 \n\t"
"xxmrghd 44,50,51 \n\t"
"xxmrgld 45,50,51 \n\t"
"xvadddp 46, 0,1 \n\t"
"xvadddp 47, 2,3 \n\t"
"xvadddp 48, 4,5 \n\t"
"xvadddp 49, 44,45 \n\t"
"xvcmpgtdp 50,46,47 \n\t "
"xvcmpgtdp 51,48,49 \n\t "
"xxsel 32,40,41,50 \n\t"
"xxsel 0,46,47,50 \n\t"
"xxsel 33,42,43,51 \n\t"
"xxsel 1,48,49,51 \n\t"
"xvcmpgtdp 2,0,1 \n\t "
"xxsel 32,32,33,2 \n\t"
"xxsel 3,0,1,2 \n\t"
"vaddudm 0,0,5 \n\t"
"addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t"
//cmp with previous
"xvcmpgtdp 4,39,3 \n\t "
"vaddudm 5,5,4 \n\t"
"xxsel 38,38,32,4 \n\t"
"xxsel 39,39,3,4 \n\t"
///////extract min value and min index from vector
"xxspltd 32,38,1 \n\t"
"xxspltd 40,39,1 \n\t"
"xvcmpeqdp. 2, 40,39 \n\t"
//cr6 0 bit set if all true, cr6=4*6+bit_ind=24,0011at CR(BI)==1, at=10 hint that it occurs rarely
//0b001110=14
"bc 14,24, 3f \n\t"
// the two lanes differ here, so >= and > select the same lane
"xvcmpgedp 4,39, 40 \n\t"
"xxsel 0,39,40,4 \n\t"
"xxsel 1,38,32,4 \n\t"
"stxsdx 0,0,%[ptr_minf] \n\t"
"b 4f \n\t"
"3: \n\t"
//if the lane values are equal then choose the minimum index
"xxspltd 0,40,0 \n\t"
"vminud 0,0,6 \n\t" //vs32 vs38
"xxlor 1,32,32 \n\t"
"stxsdx 0,0,%[ptr_minf] \n\t"
"4: \n\t"
"mfvsrd %[index],1 \n\t"
// *minf is read (lxvdsx) as well as written (stxsdx), hence "+m"
: [minf] "+m"(*minf),[ptr_tmp] "+&b"(x),[index] "=r"(index), [n] "+&r"(n)
: [mem] "m"(*(const double (*)[2*n])x), [ptr_x] "b"(x), [ptr_minf] "b"(minf) ,
[i16] "b"(16), [i32] "b"(32), [i48] "b"(48),
[i64] "b"(64), [i80] "b"(80), [i96] "b"(96), [i112] "b"(112),
[start] "v"(start), [adder] "v"(temp_add_index)
: "cc", "vs0", "vs1","vs2","vs3", "vs4","vs5","vs32", "vs33", "vs34", "vs35", "vs36",
"vs37", "vs38", "vs39", "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49", "vs50", "vs51"
);
return index;
}
/*
 * Returns the 1-based position of the complex element of x whose
 * |re| + |im| (CABS1) is smallest, or 0 when n or inc_x is not positive.
 * The search is seeded with element 0. Contiguous input (inc_x == 1) is
 * handed to ziamin_kernel_16_TUNED in multiples of 16 elements; the
 * remainder and strided input are scanned with scalar code using a strict
 * '<' comparison.
 */
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    BLASLONG best = 0;    /* 0-based position of the current minimum */
    FLOAT best_abs;       /* CABS1 value found at that position */
    BLASLONG pos = 0;     /* complex element counter */
    BLASLONG off = 0;     /* offset of that element's real part inside x */

    if (n <= 0 || inc_x <= 0) {
        return best;
    }

    /* Both access patterns start from the first element's value. */
    best_abs = CABS1(x,0);

    if (inc_x == 1) {
        /* The kernel only accepts counts that are multiples of 16. */
        const BLASLONG vec_len = n & -16;

        if (vec_len > 0) {
            best = ziamin_kernel_16_TUNED(vec_len, x, &best_abs);
            pos = vec_len;
            off = vec_len << 1;
        }

        /* Scalar pass over the elements the kernel did not cover. */
        for (; pos < n; pos++, off += 2) {
            if (CABS1(x,off) < best_abs) {
                best = pos;
                best_abs = CABS1(x,off);
            }
        }
    } else {
        /* Two FLOATs per complex element. */
        const BLASLONG step = 2 * inc_x;

        /* Element 0 is already the seed; examine elements 1 .. n-1. */
        for (pos = 1, off = step; pos < n; pos++, off += step) {
            if (CABS1(x,off) < best_abs) {
                best = pos;
                best_abs = CABS1(x,off);
            }
        }
    }

    return best + 1;
}

View File

@@ -110,7 +110,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "sgemm_tcopy_macros_16_power8.S"
#define STACKSIZE 576
#define STACKSIZE 144
PROLOGUE
PROFCODE
@@ -118,49 +118,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi SP, SP, -STACKSIZE
li r0, 0
std r31, 144(SP)
std r30, 152(SP)
std r29, 160(SP)
std r28, 168(SP)
std r27, 176(SP)
std r26, 184(SP)
std r25, 192(SP)
std r24, 200(SP)
std r23, 208(SP)
std r22, 216(SP)
std r21, 224(SP)
std r20, 232(SP)
std r19, 240(SP)
std r18, 248(SP)
std r17, 256(SP)
std r16, 264(SP)
std r15, 272(SP)
std r14, 280(SP)
addi r11 ,SP, 288
stvx v20, r11, r0
addi r11, r11, 16
stvx v21, r11, r0
addi r11, r11, 16
stvx v22, r11, r0
addi r11, r11, 16
stvx v23, r11, r0
addi r11, r11, 16
stvx v24, r11, r0
addi r11, r11, 16
stvx v25, r11, r0
addi r11, r11, 16
stvx v26, r11, r0
addi r11, r11, 16
stvx v27, r11, r0
addi r11, r11, 16
stvx v28, r11, r0
addi r11, r11, 16
stvx v29, r11, r0
addi r11, r11, 16
stvx v30, r11, r0
addi r11, r11, 16
stvx v31, r11, r0
li r11, 0
std r14, 0(SP)
std r15, 8(SP)
std r16, 16(SP)
std r17, 24(SP)
std r18, 32(SP)
std r19, 40(SP)
std r20, 48(SP)
std r21, 56(SP)
std r22, 64(SP)
std r23, 72(SP)
std r24, 80(SP)
std r25, 88(SP)
std r26, 96(SP)
std r27, 104(SP)
std r28, 112(SP)
std r29, 120(SP)
std r30, 128(SP)
std r31, 136(SP)
cmpwi cr0, M, 0
ble- L999
@@ -207,51 +182,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
L999:
li r3, 0
ld r31, 144(SP)
ld r30, 152(SP)
ld r29, 160(SP)
ld r28, 168(SP)
ld r27, 176(SP)
ld r26, 184(SP)
ld r25, 192(SP)
ld r24, 200(SP)
ld r23, 208(SP)
ld r22, 216(SP)
ld r21, 224(SP)
ld r20, 232(SP)
ld r19, 240(SP)
ld r18, 248(SP)
ld r17, 256(SP)
ld r16, 264(SP)
ld r15, 272(SP)
ld r14, 280(SP)
addi r11, SP, 288
lvx v20, r11, r3
addi r11, r11, 16
lvx v21, r11, r3
addi r11, r11, 16
lvx v22, r11, r3
addi r11, r11, 16
lvx v23, r11, r3
addi r11, r11, 16
lvx v24, r11, r3
addi r11, r11, 16
lvx v25, r11, r3
addi r11, r11, 16
lvx v26, r11, r3
addi r11, r11, 16
lvx v27, r11, r3
addi r11, r11, 16
lvx v28, r11, r3
addi r11, r11, 16
lvx v29, r11, r3
addi r11, r11, 16
lvx v30, r11, r3
addi r11, r11, 16
lvx v31, r11, r3
li r11, 0
ld r14, 0(SP)
ld r15, 8(SP)
ld r16, 16(SP)
ld r17, 24(SP)
ld r18, 32(SP)
ld r19, 40(SP)
ld r20, 48(SP)
ld r21, 56(SP)
ld r22, 64(SP)
ld r23, 72(SP)
ld r24, 80(SP)
ld r25, 88(SP)
ld r26, 96(SP)
ld r27, 104(SP)
ld r28, 112(SP)
ld r29, 120(SP)
ld r30, 128(SP)
ld r31, 136(SP)
addi SP, SP, STACKSIZE
blr

View File

@@ -110,57 +110,32 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "sgemm_tcopy_macros_8_power8.S"
#define STACKSIZE 576
#define STACKSIZE 144
PROLOGUE
PROFCODE
addi SP, SP, -STACKSIZE
li r0, 0
std r14, 0(SP)
std r15, 8(SP)
std r16, 16(SP)
std r17, 24(SP)
std r18, 32(SP)
std r19, 40(SP)
std r20, 48(SP)
std r21, 56(SP)
std r22, 64(SP)
std r23, 72(SP)
std r24, 80(SP)
std r25, 88(SP)
std r26, 96(SP)
std r27, 104(SP)
std r28, 112(SP)
std r29, 120(SP)
std r30, 128(SP)
std r31, 136(SP)
std r31, 144(SP)
std r30, 152(SP)
std r29, 160(SP)
std r28, 168(SP)
std r27, 176(SP)
std r26, 184(SP)
std r25, 192(SP)
std r24, 200(SP)
std r23, 208(SP)
std r22, 216(SP)
std r21, 224(SP)
std r20, 232(SP)
std r19, 240(SP)
std r18, 248(SP)
std r17, 256(SP)
std r16, 264(SP)
std r15, 272(SP)
std r14, 280(SP)
addi r11, SP, 288
stvx v20, r11, r0
addi r11, r11, 16
stvx v21, r11, r0
addi r11, r11, 16
stvx v22, r11, r0
addi r11, r11, 16
stvx v23, r11, r0
addi r11, r11, 16
stvx v24, r11, r0
addi r11, r11, 16
stvx v25, r11, r0
addi r11, r11, 16
stvx v26, r11, r0
addi r11, r11, 16
stvx v27, r11, r0
addi r11, r11, 16
stvx v28, r11, r0
addi r11, r11, 16
stvx v29, r11, r0
addi r11, r11, 16
stvx v30, r11, r0
addi r11, r11, 16
stvx v31, r11, r0
li r11, 0
cmpwi cr0, M, 0
ble- L999
@@ -202,51 +177,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
L999:
li r3, 0
ld r31, 144(SP)
ld r30, 152(SP)
ld r29, 160(SP)
ld r28, 168(SP)
ld r27, 176(SP)
ld r26, 184(SP)
ld r25, 192(SP)
ld r24, 200(SP)
ld r23, 208(SP)
ld r22, 216(SP)
ld r21, 224(SP)
ld r20, 232(SP)
ld r19, 240(SP)
ld r18, 248(SP)
ld r17, 256(SP)
ld r16, 264(SP)
ld r15, 272(SP)
ld r14, 280(SP)
addi r11,SP,288
lvx v20, r11, r3
addi r11, r11, 16
lvx v21, r11, r3
addi r11, r11, 16
lvx v22, r11, r3
addi r11, r11, 16
lvx v23, r11, r3
addi r11, r11, 16
lvx v24, r11, r3
addi r11, r11, 16
lvx v25, r11, r3
addi r11, r11, 16
lvx v26, r11, r3
addi r11, r11, 16
lvx v27, r11, r3
addi r11, r11, 16
lvx v28, r11, r3
addi r11, r11, 16
lvx v29, r11, r3
addi r11, r11, 16
lvx v30, r11, r3
addi r11, r11, 16
lvx v31, r11, r3
li r11, 0
ld r14, 0(SP)
ld r15, 8(SP)
ld r16, 16(SP)
ld r17, 24(SP)
ld r18, 32(SP)
ld r19, 40(SP)
ld r20, 48(SP)
ld r21, 56(SP)
ld r22, 64(SP)
ld r23, 72(SP)
ld r24, 80(SP)
ld r25, 88(SP)
ld r26, 96(SP)
ld r27, 104(SP)
ld r28, 112(SP)
ld r29, 120(SP)
ld r30, 128(SP)
ld r31, 136(SP)
addi SP, SP, STACKSIZE
blr

View File

@@ -109,8 +109,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "zgemm_tcopy_macros_8_power8.S"
#define STACKSIZE 384
#define STACKSIZE 576
#define STACKSIZE 144
PROLOGUE
@@ -119,49 +119,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi SP, SP, -STACKSIZE
li r0, 0
std r31, 144(SP)
std r30, 152(SP)
std r29, 160(SP)
std r28, 168(SP)
std r27, 176(SP)
std r26, 184(SP)
std r25, 192(SP)
std r24, 200(SP)
std r23, 208(SP)
std r22, 216(SP)
std r21, 224(SP)
std r20, 232(SP)
std r19, 240(SP)
std r18, 248(SP)
std r17, 256(SP)
std r16, 264(SP)
std r15, 272(SP)
std r14, 280(SP)
addi r11, SP ,288
stvx v20, r11, r0
addi r11, r11, 16
stvx v21, r11, r0
addi r11, r11, 16
stvx v22, r11, r0
addi r11, r11, 16
stvx v23, r11, r0
addi r11, r11, 16
stvx v24, r11, r0
addi r11, r11, 16
stvx v25, r11, r0
addi r11, r11, 16
stvx v26, r11, r0
addi r11, r11, 16
stvx v27, r11, r0
addi r11, r11, 16
stvx v28, r11, r0
addi r11, r11, 16
stvx v29, r11, r0
addi r11, r11, 16
stvx v30, r11, r0
addi r11, r11 ,16
stvx v31, r11, r0
li r11,0
std r14, 0(SP)
std r15, 8(SP)
std r16, 16(SP)
std r17, 24(SP)
std r18, 32(SP)
std r19, 40(SP)
std r20, 48(SP)
std r21, 56(SP)
std r22, 64(SP)
std r23, 72(SP)
std r24, 80(SP)
std r25, 88(SP)
std r26, 96(SP)
std r27, 104(SP)
std r28, 112(SP)
std r29, 120(SP)
std r30, 128(SP)
std r31, 136(SP)
cmpwi cr0, M, 0
ble- L999
@@ -204,49 +180,24 @@ L999:
li r3, 0
ld r31, 144(SP)
ld r30, 152(SP)
ld r29, 160(SP)
ld r28, 168(SP)
ld r27, 176(SP)
ld r26, 184(SP)
ld r25, 192(SP)
ld r24, 200(SP)
ld r23, 208(SP)
ld r22, 216(SP)
ld r21, 224(SP)
ld r20, 232(SP)
ld r19, 240(SP)
ld r18, 248(SP)
ld r17, 256(SP)
ld r16, 264(SP)
ld r15, 272(SP)
ld r14, 280(SP)
addi r11, SP, 288
lvx v20, r11,r3
addi r11, r11, 16
lvx v21, r11, r3
addi r11, r11, 16
lvx v22, r11, r3
addi r11, r11, 16
lvx v23, r11, r3
addi r11, r11, 16
lvx v24, r11, r3
addi r11, r11, 16
lvx v25, r11, r3
addi r11, r11, 16
lvx v26, r11, r3
addi r11, r11, 16
lvx v27, r11, r3
addi r11, r11, 16
lvx v28, r11, r3
addi r11, r11, 16
lvx v29, r11, r3
addi r11, r11, 16
lvx v30, r11, r3
addi r11, r11, 16
lvx v31, r11, r3
li r11,0
ld r14, 0(SP)
ld r15, 8(SP)
ld r16, 16(SP)
ld r17, 24(SP)
ld r18, 32(SP)
ld r19, 40(SP)
ld r20, 48(SP)
ld r21, 56(SP)
ld r22, 64(SP)
ld r23, 72(SP)
ld r24, 80(SP)
ld r25, 88(SP)
ld r26, 96(SP)
ld r27, 104(SP)
ld r28, 112(SP)
ld r29, 120(SP)
ld r30, 128(SP)
ld r31, 136(SP)
addi SP, SP, STACKSIZE
blr

View File

@@ -72,23 +72,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
lxvd2x vs51, o48, A2
addi A2, A2, 64
lxvd2x vs52, o0, A2
lxvd2x vs53, o16, A2
lxvd2x vs54, o32, A2
lxvd2x vs55, o48, A2
lxvd2x vs2, o0, A2
lxvd2x vs3, o16, A2
lxvd2x vs4, o32, A2
lxvd2x vs5, o48, A2
addi A2, A2, 64
lxvd2x vs56, o0, A3
lxvd2x vs57, o16, A3
lxvd2x vs58, o32, A3
lxvd2x vs59, o48, A3
lxvd2x vs6, o0, A3
lxvd2x vs7, o16, A3
lxvd2x vs8, o32, A3
lxvd2x vs9, o48, A3
addi A3, A3, 64
lxvd2x vs60, o0, A3
lxvd2x vs61, o16, A3
lxvd2x vs62, o32, A3
lxvd2x vs63, o48, A3
lxvd2x vs10, o0, A3
lxvd2x vs11, o16, A3
lxvd2x vs12, o32, A3
lxvd2x vs13, o48, A3
addi A3, A3, 64
@@ -126,23 +126,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvd2x vs51, o48, T1
addi T1, T1, 64
stxvd2x vs52, o0, T1
stxvd2x vs53, o16, T1
stxvd2x vs54, o32, T1
stxvd2x vs55, o48, T1
stxvd2x vs2, o0, T1
stxvd2x vs3, o16, T1
stxvd2x vs4, o32, T1
stxvd2x vs5, o48, T1
addi T1, T1, 64
stxvd2x vs56, o0, T1
stxvd2x vs57, o16, T1
stxvd2x vs58, o32, T1
stxvd2x vs59, o48, T1
stxvd2x vs6, o0, T1
stxvd2x vs7, o16, T1
stxvd2x vs8, o32, T1
stxvd2x vs9, o48, T1
addi T1, T1, 64
stxvd2x vs60, o0, T1
stxvd2x vs61, o16, T1
stxvd2x vs62, o32, T1
stxvd2x vs63, o48, T1
stxvd2x vs10, o0, T1
stxvd2x vs11, o16, T1
stxvd2x vs12, o32, T1
stxvd2x vs13, o48, T1
.endm

958
kernel/power/zgemv_n_4.c Normal file
View File

@@ -0,0 +1,958 @@
/***************************************************************************
Copyright (c) 2018, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdlib.h>
#include <stdio.h>
#include "common.h"
#define HAVE_KERNEL_4x4_VEC 1
#define HAVE_KERNEL_4x2_VEC 1
#define HAVE_KERNEL_4x1_VEC 1
#define HAVE_KERNEL_ADDY 1
#if defined(HAVE_KERNEL_4x4_VEC) || defined(HAVE_KERNEL_4x2_VEC) || defined(HAVE_KERNEL_4x1_VEC)
#include <altivec.h>
#endif
//
#define NBMAX 4096
#ifdef HAVE_KERNEL_4x4_VEC_ASM
#elif HAVE_KERNEL_4x4_VEC
/*
 * VSX kernel: accumulates the product of four matrix columns with four
 * complex x values into y, i.e. for every complex row r in [0, n):
 *     y[r] += a0[r]*x[0] + a1[r]*x[1] + a2[r]*x[2] + a3[r]*x[3]
 * where the complex multiply is plain or conjugated depending on the
 * CONJ / XCONJ build macros (see the scalar fallback below for the exact
 * sign pattern).
 *
 * n   - number of complex rows to process; the loop advances four rows per
 *       iteration and has no remainder handling.
 * lda - distance between consecutive columns, counted in FLOATs.
 * ap  - first of the four columns (interleaved re/im pairs).
 * x   - four complex multipliers, x[0..7] as interleaved re/im pairs.
 * y   - accumulator, updated in place.
 *
 * Every complex value is handled as one __vector double {re, im}, so this
 * code assumes FLOAT is double.
 */
static void zgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) {
FLOAT *a0, *a1, *a2, *a3;
// The four columns start lda FLOATs apart.
a0 = ap;
a1 = ap + lda;
a2 = a1 + lda;
a3 = a2 + lda;
// Pre-splat x so that a complex multiply becomes two vector multiply-adds:
//   {a_re, a_im} * vx_r  followed by  {a_im, a_re} * vx_i.
// The signs placed in vx_r / vx_i select the plain or the conjugated product.
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
register __vector double vx0_r = {x[0], x[0]};
register __vector double vx0_i = {-x[1], x[1]};
register __vector double vx1_r = {x[2], x[2]};
register __vector double vx1_i = {-x[3], x[3]};
register __vector double vx2_r = {x[4], x[4]};
register __vector double vx2_i = {-x[5], x[5]};
register __vector double vx3_r = {x[6], x[6]};
register __vector double vx3_i = {-x[7], x[7]};
#else
register __vector double vx0_r = {x[0], -x[0]};
register __vector double vx0_i = {x[1], x[1]};
register __vector double vx1_r = {x[2], -x[2]};
register __vector double vx1_i = {x[3], x[3]};
register __vector double vx2_r = {x[4], -x[4]};
register __vector double vx2_i = {x[5], x[5]};
register __vector double vx3_r = {x[6], -x[6]};
register __vector double vx3_i = {x[7], x[7]};
#endif
// View y and the columns as arrays of complex values (one vector each).
register __vector double *vy = (__vector double *) y;
register __vector double *vptr_a0 = (__vector double *) a0;
register __vector double *vptr_a1 = (__vector double *) a1;
register __vector double *vptr_a2 = (__vector double *) a2;
register __vector double *vptr_a3 = (__vector double *) a3;
// Four independent accumulators (rows i .. i+3) with their column operands.
register __vector double vy_0;
register __vector double va0;
register __vector double va1;
register __vector double va2;
register __vector double va3;
register __vector double vy_1;
register __vector double va0_1;
register __vector double va1_1;
register __vector double va2_1;
register __vector double va3_1;
register __vector double vy_2;
register __vector double va0_2;
register __vector double va1_2;
register __vector double va2_2;
register __vector double va3_2;
register __vector double vy_3;
register __vector double va0_3;
register __vector double va1_3;
register __vector double va2_3;
register __vector double va3_3;
BLASLONG i = 0;
while (i < n) {
// Load four rows of y and of each of the four columns.
vy_0 = vy[i];
va0 = vptr_a0[i];
va1 = vptr_a1[i];
va2 = vptr_a2[i];
va3 = vptr_a3[i];
vy_1 = vy[i + 1];
va0_1 = vptr_a0[i + 1];
va1_1 = vptr_a1[i + 1];
va2_1 = vptr_a2[i + 1];
va3_1 = vptr_a3[i + 1];
vy_2 = vy[i + 2];
va0_2 = vptr_a0[i + 2];
va1_2 = vptr_a1[i + 2];
va2_2 = vptr_a2[i + 2];
va3_2 = vptr_a3[i + 2];
vy_3 = vy[i + 3];
va0_3 = vptr_a0[i + 3];
va1_3 = vptr_a1[i + 3];
va2_3 = vptr_a2[i + 3];
va3_3 = vptr_a3[i + 3];
// First half of each product: {a_re, a_im} * vx_r.
vy_0 += va0*vx0_r;
vy_1 += va0_1*vx0_r;
vy_2 += va0_2*vx0_r;
vy_3 += va0_3*vx0_r;
vy_0 += va1*vx1_r;
vy_1 += va1_1*vx1_r;
vy_2 += va1_2*vx1_r;
vy_3 += va1_3*vx1_r;
// From here on the lane swaps ({re, im} -> {im, re}) of operands that are
// already consumed are interleaved with the remaining multiply-adds
// (presumably to help instruction scheduling - keep the ordering).
va0 = vec_xxpermdi(va0, va0, 2);
va0_1 = vec_xxpermdi(va0_1, va0_1, 2);
vy_0 += va2*vx2_r;
vy_1 += va2_1*vx2_r;
va0_2 = vec_xxpermdi(va0_2, va0_2, 2);
va0_3 = vec_xxpermdi(va0_3, va0_3, 2);
vy_2 += va2_2*vx2_r;
vy_3 += va2_3*vx2_r;
va1 = vec_xxpermdi(va1, va1, 2);
va1_1 = vec_xxpermdi(va1_1, va1_1, 2);
vy_0 += va3*vx3_r;
vy_1 += va3_1*vx3_r;
va1_2 = vec_xxpermdi(va1_2, va1_2, 2);
va1_3 = vec_xxpermdi(va1_3, va1_3, 2);
vy_2 += va3_2*vx3_r;
vy_3 += va3_3*vx3_r;
va2 = vec_xxpermdi(va2, va2, 2);
va2_1 = vec_xxpermdi(va2_1, va2_1, 2);
// Second half of each product: swapped {a_im, a_re} * vx_i.
vy_0 += va0*vx0_i;
vy_1 += va0_1*vx0_i;
va2_2 = vec_xxpermdi(va2_2, va2_2, 2);
va2_3 = vec_xxpermdi(va2_3, va2_3, 2);
vy_2 += va0_2*vx0_i;
vy_3 += va0_3*vx0_i;
va3 = vec_xxpermdi(va3, va3, 2);
va3_1 = vec_xxpermdi(va3_1, va3_1, 2);
vy_0 += va1*vx1_i;
vy_1 += va1_1*vx1_i;
va3_2 = vec_xxpermdi(va3_2, va3_2, 2);
va3_3 = vec_xxpermdi(va3_3, va3_3, 2);
vy_2 += va1_2*vx1_i;
vy_3 += va1_3*vx1_i;
vy_0 += va2*vx2_i;
vy_1 += va2_1*vx2_i;
vy_2 += va2_2*vx2_i;
vy_3 += va2_3*vx2_i;
vy_0 += va3*vx3_i;
vy_1 += va3_1*vx3_i;
vy_2 += va3_2*vx3_i;
vy_3 += va3_3*vx3_i;
// Write the four updated rows back.
vy[i] = vy_0;
vy[i + 1] = vy_1;
vy[i + 2] = vy_2;
vy[i + 3] = vy_3;
i += 4;
}
}
#else
/*
 * Scalar fallback: for each of the n complex rows, adds the product of four
 * matrix columns with x[0..3] (complex) into y.
 *
 * Columns are lda FLOATs apart; a, x and y store complex numbers as
 * interleaved (re, im) pairs.  The CONJ / XCONJ build macros choose between
 * the plain complex product and the conjugated one.
 */
static void zgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) {
    FLOAT *col0 = ap;
    FLOAT *col1 = col0 + lda;
    FLOAT *col2 = col1 + lda;
    FLOAT *col3 = col2 + lda;
    BLASLONG row;

    for (row = 0; row < n; row++) {
        const BLASLONG re = 2 * row;   /* index of the real part      */
        const BLASLONG im = re + 1;    /* index of the imaginary part */
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
        y[re] += col0[re] * x[0] - col0[im] * x[1];
        y[im] += col0[re] * x[1] + col0[im] * x[0];
        y[re] += col1[re] * x[2] - col1[im] * x[3];
        y[im] += col1[re] * x[3] + col1[im] * x[2];
        y[re] += col2[re] * x[4] - col2[im] * x[5];
        y[im] += col2[re] * x[5] + col2[im] * x[4];
        y[re] += col3[re] * x[6] - col3[im] * x[7];
        y[im] += col3[re] * x[7] + col3[im] * x[6];
#else
        y[re] += col0[re] * x[0] + col0[im] * x[1];
        y[im] += col0[re] * x[1] - col0[im] * x[0];
        y[re] += col1[re] * x[2] + col1[im] * x[3];
        y[im] += col1[re] * x[3] - col1[im] * x[2];
        y[re] += col2[re] * x[4] + col2[im] * x[5];
        y[im] += col2[re] * x[5] - col2[im] * x[4];
        y[re] += col3[re] * x[6] + col3[im] * x[7];
        y[im] += col3[re] * x[7] - col3[im] * x[6];
#endif
    }
}
#endif
#ifdef HAVE_KERNEL_4x2_VEC
/*
 * VSX kernel: for every complex row r in [0, n),
 *     y[r] += a0[r]*x[0] + a1[r]*x[1]
 * using the plain or conjugated complex product selected by CONJ / XCONJ.
 *
 * Rows are consumed four at a time with no remainder handling.  Each complex
 * value occupies one __vector double {re, im}, so FLOAT is assumed to be
 * double.  The second column starts lda FLOATs after ap.
 */
static void zgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) {
    /* {a_re, a_im} * x_re  plus  {a_im, a_re} * x_im  forms the complex
     * product; the signs below pick the conjugation variant. */
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
    const __vector double x0_re = {x[0], x[0]};
    const __vector double x0_im = {-x[1], x[1]};
    const __vector double x1_re = {x[2], x[2]};
    const __vector double x1_im = {-x[3], x[3]};
#else
    const __vector double x0_re = {x[0], -x[0]};
    const __vector double x0_im = {x[1], x[1]};
    const __vector double x1_re = {x[2], -x[2]};
    const __vector double x1_im = {x[3], x[3]};
#endif
    __vector double *yv = (__vector double *) y;
    __vector double *col0 = (__vector double *) ap;
    __vector double *col1 = (__vector double *) (ap + lda);
    BLASLONG row = 0;

    while (row < n) {
        __vector double acc0 = yv[row];
        __vector double acc1 = yv[row + 1];
        __vector double acc2 = yv[row + 2];
        __vector double acc3 = yv[row + 3];

        __vector double p0 = col0[row];
        __vector double p1 = col0[row + 1];
        __vector double p2 = col0[row + 2];
        __vector double p3 = col0[row + 3];

        __vector double q0 = col1[row];
        __vector double q1 = col1[row + 1];
        __vector double q2 = col1[row + 2];
        __vector double q3 = col1[row + 3];

        /* Contributions of the un-swapped column values. */
        acc0 += p0 * x0_re;
        acc1 += p1 * x0_re;
        acc2 += p2 * x0_re;
        acc3 += p3 * x0_re;

        acc0 += q0 * x1_re;
        acc1 += q1 * x1_re;
        acc2 += q2 * x1_re;
        acc3 += q3 * x1_re;

        /* Exchange the two lanes of every column value. */
        p0 = vec_xxpermdi(p0, p0, 2);
        p1 = vec_xxpermdi(p1, p1, 2);
        p2 = vec_xxpermdi(p2, p2, 2);
        p3 = vec_xxpermdi(p3, p3, 2);

        q0 = vec_xxpermdi(q0, q0, 2);
        q1 = vec_xxpermdi(q1, q1, 2);
        q2 = vec_xxpermdi(q2, q2, 2);
        q3 = vec_xxpermdi(q3, q3, 2);

        /* Contributions of the swapped column values. */
        acc0 += p0 * x0_im;
        acc1 += p1 * x0_im;
        acc2 += p2 * x0_im;
        acc3 += p3 * x0_im;

        acc0 += q0 * x1_im;
        acc1 += q1 * x1_im;
        acc2 += q2 * x1_im;
        acc3 += q3 * x1_im;

        yv[row] = acc0;
        yv[row + 1] = acc1;
        yv[row + 2] = acc2;
        yv[row + 3] = acc3;

        row += 4;
    }
}
#else
/*
 * Scalar fallback: for each of the n complex rows, adds the product of two
 * matrix columns (lda FLOATs apart) with x[0..1] (complex) into y.
 * CONJ / XCONJ select the plain or the conjugated complex product.
 */
static void zgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) {
    FLOAT *col0 = ap;
    FLOAT *col1 = col0 + lda;
    BLASLONG row;

    for (row = 0; row < n; row++) {
        const BLASLONG re = 2 * row;   /* index of the real part      */
        const BLASLONG im = re + 1;    /* index of the imaginary part */
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
        y[re] += col0[re] * x[0] - col0[im] * x[1];
        y[im] += col0[re] * x[1] + col0[im] * x[0];
        y[re] += col1[re] * x[2] - col1[im] * x[3];
        y[im] += col1[re] * x[3] + col1[im] * x[2];
#else
        y[re] += col0[re] * x[0] + col0[im] * x[1];
        y[im] += col0[re] * x[1] - col0[im] * x[0];
        y[re] += col1[re] * x[2] + col1[im] * x[3];
        y[im] += col1[re] * x[3] - col1[im] * x[2];
#endif
    }
}
#endif
#ifdef HAVE_KERNEL_4x1_VEC
/*
 * VSX kernel: for every complex row r in [0, n),  y[r] += a0[r] * x[0],
 * using the plain or conjugated complex product selected by CONJ / XCONJ.
 *
 * Rows are consumed four at a time with no remainder handling.  Each complex
 * value occupies one __vector double {re, im}, so FLOAT is assumed to be
 * double.
 */
static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) {
    /* {a_re, a_im} * x_re  plus  {a_im, a_re} * x_im  forms the complex
     * product; the signs below pick the conjugation variant. */
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
    const __vector double x_re = {x[0], x[0]};
    const __vector double x_im = {-x[1], x[1]};
#else
    const __vector double x_re = {x[0], -x[0]};
    const __vector double x_im = {x[1], x[1]};
#endif
    __vector double *yv = (__vector double *) y;
    __vector double *col = (__vector double *) ap;
    BLASLONG row = 0;

    while (row < n) {
        __vector double acc0 = yv[row];
        __vector double acc1 = yv[row + 1];
        __vector double acc2 = yv[row + 2];
        __vector double acc3 = yv[row + 3];

        __vector double p0 = col[row];
        __vector double p1 = col[row + 1];
        __vector double p2 = col[row + 2];
        __vector double p3 = col[row + 3];

        /* Contribution of the un-swapped column values. */
        acc0 += p0 * x_re;
        acc1 += p1 * x_re;
        acc2 += p2 * x_re;
        acc3 += p3 * x_re;

        /* Exchange the two lanes of every column value. */
        p0 = vec_xxpermdi(p0, p0, 2);
        p1 = vec_xxpermdi(p1, p1, 2);
        p2 = vec_xxpermdi(p2, p2, 2);
        p3 = vec_xxpermdi(p3, p3, 2);

        /* Contribution of the swapped column values. */
        acc0 += p0 * x_im;
        acc1 += p1 * x_im;
        acc2 += p2 * x_im;
        acc3 += p3 * x_im;

        yv[row] = acc0;
        yv[row + 1] = acc1;
        yv[row + 2] = acc2;
        yv[row + 3] = acc3;

        row += 4;
    }
}
#else
/*
 * Scalar fallback: for each of the n complex rows, adds the product of one
 * matrix column with the complex value x[0..1] into y.
 * CONJ / XCONJ select the plain or the conjugated complex product.
 */
static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) {
    FLOAT *col = ap;
    BLASLONG row;

    for (row = 0; row < n; row++) {
        const BLASLONG re = 2 * row;   /* index of the real part      */
        const BLASLONG im = re + 1;    /* index of the imaginary part */
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
        y[re] += col[re] * x[0] - col[im] * x[1];
        y[im] += col[re] * x[1] + col[im] * x[0];
#else
        y[re] += col[re] * x[0] + col[im] * x[1];
        y[im] += col[re] * x[1] - col[im] * x[0];
#endif
    }
}
#endif
#ifdef HAVE_KERNEL_ADDY
/*
 * VSX version of the final update  dest += alpha * src  for n complex values
 * (with src conjugated when XCONJ is defined).
 *
 * n        - number of complex values; processed four at a time with no
 *            remainder handling.
 * src      - contiguous interleaved (re, im) pairs.
 * dest     - interleaved (re, im) pairs, inc_dest FLOATs apart.
 * inc_dest - stride of dest in FLOATs (the caller has already doubled the
 *            complex stride, so it is always even); 2 means contiguous.
 *            It may be negative.
 * alpha_r, alpha_i - real and imaginary part of the scale factor.
 *
 * Each complex value is handled as one __vector double {re, im}, so FLOAT is
 * assumed to be double.
 */
static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT alpha_r, FLOAT alpha_i) {
    BLASLONG i;
    /* {s_re, s_im} * valpha_r  plus  {s_im, s_re} * valpha_i  forms the
     * complex product; the signs pick the conjugation variant. */
#if !defined(XCONJ)
    register __vector double valpha_r = {alpha_r, alpha_r};
    register __vector double valpha_i = {-alpha_i, alpha_i};
#else
    register __vector double valpha_r = {alpha_r, -alpha_r};
    register __vector double valpha_i = {alpha_i, alpha_i};
#endif
    register __vector double *vptr_src = (__vector double *) src;

    if (inc_dest != 2) {
        /* Strided destination: keep one vector pointer per element of the
         * group of four.  inc_dest is in FLOATs, so the offsets are applied
         * to the FLOAT pointer before casting. */
        register __vector double *vptr_y = (__vector double *) dest;
        register __vector double *vptr_y1 = (__vector double *) (dest + inc_dest);
        register __vector double *vptr_y2 = (__vector double *) (dest + 2 * inc_dest);
        register __vector double *vptr_y3 = (__vector double *) (dest + 3 * inc_dest);
        BLASLONG dest_t = 0;
        /* Four elements ahead is 4 * inc_dest FLOATs, i.e. 2 * inc_dest
         * vectors.  A multiplication is used instead of a left shift because
         * inc_dest can be negative and shifting a negative value is
         * undefined behaviour. */
        BLASLONG add_dest = inc_dest * 2;

        for (i = 0; i < n; i += 4) {
            register __vector double vy_0 = vptr_y[dest_t];
            register __vector double vy_1 = vptr_y1[dest_t];
            register __vector double vy_2 = vptr_y2[dest_t];
            register __vector double vy_3 = vptr_y3[dest_t];
            register __vector double vsrc = vptr_src[i];
            register __vector double vsrc_1 = vptr_src[i + 1];
            register __vector double vsrc_2 = vptr_src[i + 2];
            register __vector double vsrc_3 = vptr_src[i + 3];

            vy_0 += vsrc*valpha_r;
            vy_1 += vsrc_1*valpha_r;
            vy_2 += vsrc_2*valpha_r;
            vy_3 += vsrc_3*valpha_r;

            /* Exchange the two lanes of every source value. */
            vsrc = vec_xxpermdi(vsrc, vsrc, 2);
            vsrc_1 = vec_xxpermdi(vsrc_1, vsrc_1, 2);
            vsrc_2 = vec_xxpermdi(vsrc_2, vsrc_2, 2);
            vsrc_3 = vec_xxpermdi(vsrc_3, vsrc_3, 2);

            vy_0 += vsrc*valpha_i;
            vy_1 += vsrc_1*valpha_i;
            vy_2 += vsrc_2*valpha_i;
            vy_3 += vsrc_3*valpha_i;

            vptr_y[dest_t] = vy_0;
            vptr_y1[dest_t] = vy_1;
            vptr_y2[dest_t] = vy_2;
            vptr_y3[dest_t] = vy_3;

            dest_t += add_dest;
        }
    } else {
        /* Contiguous destination: src and dest share the same vector index. */
        register __vector double *vptr_y = (__vector double *) dest;

        for (i = 0; i < n; i += 4) {
            register __vector double vy_0 = vptr_y[i];
            register __vector double vy_1 = vptr_y[i + 1];
            register __vector double vy_2 = vptr_y[i + 2];
            register __vector double vy_3 = vptr_y[i + 3];
            register __vector double vsrc = vptr_src[i];
            register __vector double vsrc_1 = vptr_src[i + 1];
            register __vector double vsrc_2 = vptr_src[i + 2];
            register __vector double vsrc_3 = vptr_src[i + 3];

            vy_0 += vsrc*valpha_r;
            vy_1 += vsrc_1*valpha_r;
            vy_2 += vsrc_2*valpha_r;
            vy_3 += vsrc_3*valpha_r;

            /* Exchange the two lanes of every source value. */
            vsrc = vec_xxpermdi(vsrc, vsrc, 2);
            vsrc_1 = vec_xxpermdi(vsrc_1, vsrc_1, 2);
            vsrc_2 = vec_xxpermdi(vsrc_2, vsrc_2, 2);
            vsrc_3 = vec_xxpermdi(vsrc_3, vsrc_3, 2);

            vy_0 += vsrc*valpha_i;
            vy_1 += vsrc_1*valpha_i;
            vy_2 += vsrc_2*valpha_i;
            vy_3 += vsrc_3*valpha_i;

            vptr_y[i] = vy_0;
            vptr_y[i + 1] = vy_1;
            vptr_y[i + 2] = vy_2;
            vptr_y[i + 3] = vy_3;
        }
    }
}
#else
/*
 * Scalar fallback of the final update  dest += alpha * src  for n complex
 * values (src is conjugated when XCONJ is defined).
 *
 * src is contiguous; dest elements are inc_dest FLOATs apart.  When
 * inc_dest == 2 (contiguous dest) the values are handled in groups of four
 * with no remainder handling; otherwise they are handled one by one.
 */
static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT alpha_r, FLOAT alpha_i) {
    BLASLONG done;

    if (inc_dest != 2) {
        /* Strided destination: one complex value per step. */
        FLOAT *in = src;
        FLOAT *out = dest;

        for (done = 0; done < n; done++) {
            FLOAT upd_re;
            FLOAT upd_im;
#if !defined(XCONJ)
            upd_re = alpha_r * in[0] - alpha_i * in[1];
            upd_im = alpha_r * in[1] + alpha_i * in[0];
#else
            upd_re = alpha_r * in[0] + alpha_i * in[1];
            upd_im = -alpha_r * in[1] + alpha_i * in[0];
#endif
            out[0] += upd_re;
            out[1] += upd_im;
            in += 2;
            out += inc_dest;
        }
        return;
    }

    /* Contiguous destination: compute four scaled values, then store them. */
    for (done = 0; done < n; done += 4) {
        FLOAT *in = src + 2 * done;
        FLOAT *out = dest + 2 * done;
        FLOAT upd_re[4];
        FLOAT upd_im[4];
        int k;

        for (k = 0; k < 4; k++) {
#if !defined(XCONJ)
            upd_re[k] = alpha_r * in[2 * k] - alpha_i * in[2 * k + 1];
            upd_im[k] = alpha_r * in[2 * k + 1] + alpha_i * in[2 * k];
#else
            upd_re[k] = alpha_r * in[2 * k] + alpha_i * in[2 * k + 1];
            upd_im[k] = -alpha_r * in[2 * k + 1] + alpha_i * in[2 * k];
#endif
        }
        for (k = 0; k < 4; k++) {
            out[2 * k] += upd_re[k];
            out[2 * k + 1] += upd_im[k];
        }
    }
}
#endif
/*
 * Handles the last `rows` (1..3) rows that do not fill a group of four:
 * for each row r,  y[r] += alpha * sum_j a[r, j] * x[j]  over the n columns.
 *
 * a, lda, inc_x and inc_y are expressed in FLOATs (complex strides already
 * doubled).  CONJ / XCONJ select the conjugation variant exactly as in the
 * blocked kernels.
 */
static void zgemv_n_tail(BLASLONG rows, BLASLONG n, FLOAT *a, BLASLONG lda,
                         FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,
                         FLOAT alpha_r, FLOAT alpha_i) {
    FLOAT temp_r[3] = {0.0, 0.0, 0.0};
    FLOAT temp_i[3] = {0.0, 0.0, 0.0};
    BLASLONG col;
    BLASLONG r;

    /* Walk the columns once, accumulating one dot product per row. */
    for (col = 0; col < n; col++) {
        for (r = 0; r < rows; r++) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
            temp_r[r] += a[2 * r] * x[0] - a[2 * r + 1] * x[1];
            temp_i[r] += a[2 * r] * x[1] + a[2 * r + 1] * x[0];
#else
            temp_r[r] += a[2 * r] * x[0] + a[2 * r + 1] * x[1];
            temp_i[r] += a[2 * r] * x[1] - a[2 * r + 1] * x[0];
#endif
        }
        a += lda;
        x += inc_x;
    }

    /* Scale by alpha and add into y. */
    for (r = 0; r < rows; r++) {
        FLOAT *y_elem = y + r * inc_y;
#if !defined(XCONJ)
        y_elem[0] += alpha_r * temp_r[r] - alpha_i * temp_i[r];
        y_elem[1] += alpha_r * temp_i[r] + alpha_i * temp_r[r];
#else
        y_elem[0] += alpha_r * temp_r[r] + alpha_i * temp_i[r];
        y_elem[1] -= alpha_r * temp_i[r] - alpha_i * temp_r[r];
#endif
    }
}

/*
 * Complex matrix-vector update  y += alpha * A * x  for an m x n matrix
 * stored column by column, with the conjugation variant selected at compile
 * time through CONJ / XCONJ.
 *
 * m, n          - number of rows / columns; nothing is done if either is < 1.
 * dummy1        - unused.
 * alpha_r/_i    - real and imaginary part of alpha.
 * a, lda        - matrix and its column stride in complex elements.
 * x, inc_x      - input vector and its stride in complex elements.
 * y, inc_y      - vector updated in place and its stride in complex elements.
 * buffer        - scratch space for up to NBMAX complex values.
 *
 * Rows are processed in blocks of at most NBMAX (always a multiple of 4):
 * the partial product of a block is accumulated in `buffer` by the 4x4 / 4x2 /
 * 4x1 kernels and then scaled by alpha and added to y by add_y().  The
 * remaining m % 4 rows are handled by zgemv_n_tail().
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT * buffer) {
    BLASLONG i;
    BLASLONG j;
    FLOAT *a_ptr;
    FLOAT *x_ptr;
    FLOAT *y_ptr;
    BLASLONG n1;
    BLASLONG m1;
    BLASLONG m2;
    BLASLONG m3;
    BLASLONG n2;
    FLOAT xbuffer[8], *ybuffer;

    if (m < 1) return (0);
    if (n < 1) return (0);

    ybuffer = buffer;

    /* From here on all strides are counted in FLOATs (two per complex). */
    inc_x *= 2;
    inc_y *= 2;
    lda *= 2;

    n1 = n / 4;                     /* full groups of four columns          */
    n2 = n % 4;                     /* leftover columns                     */
    m3 = m % 4;                     /* leftover rows, handled after the loop */
    m1 = m - (m % 4);               /* rows handled by the blocked loop     */
    m2 = (m % NBMAX) - (m % 4);     /* size of the final, partial row block */

    y_ptr = y;

    BLASLONG NB = NBMAX;
    while (NB == NBMAX) {
        m1 -= NB;
        if (m1 < 0) {
            /* Fewer than NBMAX rows left: run one last block of m2 rows. */
            if (m2 == 0) break;
            NB = m2;
        }

        a_ptr = a;
        x_ptr = x;

        /* Clear the accumulator for this block of NB complex values. */
        memset(ybuffer, 0, NB * 2 * sizeof(FLOAT));

        if (inc_x == 2) {
            /* Contiguous x: feed it to the kernels directly. */
            for (i = 0; i < n1; i++) {
                zgemv_kernel_4x4(NB, lda, a_ptr, x_ptr, ybuffer);
                a_ptr += 4 * lda;
                x_ptr += 8;
            }
            if (n2 & 2) {
                zgemv_kernel_4x2(NB, lda, a_ptr, x_ptr, ybuffer);
                x_ptr += 4;
                a_ptr += 2 * lda;
            }
            if (n2 & 1) {
                zgemv_kernel_4x1(NB, a_ptr, x_ptr, ybuffer);
                x_ptr += 2;
                a_ptr += lda;
            }
        } else {
            /* Strided x: gather the needed values into xbuffer first. */
            for (i = 0; i < n1; i++) {
                for (j = 0; j < 4; j++) {
                    xbuffer[2 * j] = x_ptr[0];
                    xbuffer[2 * j + 1] = x_ptr[1];
                    x_ptr += inc_x;
                }
                zgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, ybuffer);
                a_ptr += 4 * lda;
            }
            for (i = 0; i < n2; i++) {
                xbuffer[0] = x_ptr[0];
                xbuffer[1] = x_ptr[1];
                x_ptr += inc_x;
                zgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer);
                a_ptr += lda;
            }
        }

        /* y[block] += alpha * accumulated block */
        add_y(NB, ybuffer, y_ptr, inc_y, alpha_r, alpha_i);

        /* Move on to the next block of rows. */
        a += 2 * NB;
        y_ptr += NB * inc_y;
    }

    if (m3 == 0) return (0);

    /* 1 to 3 leftover rows; a and y_ptr already point at the first of them. */
    zgemv_n_tail(m3, n, a, lda, x, inc_x, y_ptr, inc_y, alpha_r, alpha_i);

    return (0);
}

847
kernel/power/zgemv_t_4.c Normal file
View File

@@ -0,0 +1,847 @@
/***************************************************************************
Copyright (c) 2018, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#define NBMAX 4096
#define HAVE_KERNEL_4x4_VEC 1
#define HAVE_KERNEL_4x2_VEC 1
#define HAVE_KERNEL_4x1_VEC 1
#if defined(HAVE_KERNEL_4x4_VEC) || defined(HAVE_KERNEL_4x2_VEC) || defined(HAVE_KERNEL_4x1_VEC)
#include <altivec.h>
#endif
#ifdef HAVE_KERNEL_4x4_VEC_ASM
#elif HAVE_KERNEL_4x4_VEC
/*
 * zgemv_kernel_4x4 (VSX vector variant)
 *
 * For four consecutive columns of A (ap, ap + lda, ap + 2*lda, ap + 3*lda)
 * accumulate the complex dot product of the column with x over n complex
 * elements, then add alpha * dot (or alpha * conj(dot) when XCONJ is
 * defined) into the four complex values y[0..7].
 *
 *   n        number of complex elements to process. The loop consumes four
 *            complex values (8 FLOATs) per iteration and has no remainder
 *            handling: if n is not a multiple of 4 the last iteration reads
 *            past element n.
 *   lda      distance between two columns of A, in FLOAT units.
 *   ap       first of the four columns (interleaved re/im).
 *   x        input vector (interleaved re/im, contiguous).
 *   y        eight FLOATs (four re/im pairs), updated in place.
 *   alpha_r, alpha_i   real and imaginary part of the scaling factor.
 *
 * The FLOAT pointers are reinterpreted as __vector double, i.e. this code
 * presumes FLOAT is double.
 */
static void zgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) {
BLASLONG i;
FLOAT *a0, *a1, *a2, *a3;
a0 = ap;
a1 = ap + lda;
a2 = a1 + lda;
a3 = a2 + lda;
// Two accumulators per column:
//   *_p ("plain")    sums x * a lane by lane:       { x_re*a_re, x_im*a_im }
//   *_r ("reversed") sums swapped(x) * a lane-wise: { x_im*a_re, x_re*a_im }
// The real/imaginary results are formed from these lanes after the loop.
register __vector double vtemp0_p = {0.0, 0.0};
register __vector double vtemp0_r = {0.0, 0.0};
register __vector double vtemp1_p = {0.0, 0.0};
register __vector double vtemp1_r = {0.0, 0.0};
register __vector double vtemp2_p = {0.0, 0.0};
register __vector double vtemp2_r = {0.0, 0.0};
register __vector double vtemp3_p = {0.0, 0.0};
register __vector double vtemp3_r = {0.0, 0.0};
i = 0;
// From here on n counts FLOATs (two per complex element), matching index i.
n = n << 1;
while (i < n) {
// __builtin_prefetch(&x[i]);
// __builtin_prefetch(&a0[i]);
// __builtin_prefetch(&a1[i]);
// __builtin_prefetch(&a2[i]);
// __builtin_prefetch(&a3[i]);
// Load four complex values of x and of each of the four columns.
register __vector double vx_0 = *(__vector double*) (&x[i]);
register __vector double vx_1 = *(__vector double*) (&x[i + 2]);
register __vector double vx_2 = *(__vector double*) (&x[i + 4]);
register __vector double vx_3 = *(__vector double*) (&x[i + 6]);
register __vector double va0 = *(__vector double*) (&a0[i]);
register __vector double va0_1 = *(__vector double*) (&a0[i + 2]);
register __vector double va0_2 = *(__vector double*) (&a0[i + 4]);
register __vector double va0_3 = *(__vector double*) (&a0[i + 6]);
register __vector double va1 = *(__vector double*) (&a1[i]);
register __vector double va1_1 = *(__vector double*) (&a1[i + 2]);
register __vector double va1_2 = *(__vector double*) (&a1[i + 4]);
register __vector double va1_3 = *(__vector double*) (&a1[i + 6]);
register __vector double va2 = *(__vector double*) (&a2[i]);
register __vector double va2_1 = *(__vector double*) (&a2[i + 2]);
register __vector double va2_2 = *(__vector double*) (&a2[i + 4]);
register __vector double va2_3 = *(__vector double*) (&a2[i + 6]);
register __vector double va3 = *(__vector double*) (&a3[i]);
register __vector double va3_1 = *(__vector double*) (&a3[i + 2]);
register __vector double va3_2 = *(__vector double*) (&a3[i + 4]);
register __vector double va3_3 = *(__vector double*) (&a3[i + 6]);
// vxr_N is vx_N with its real and imaginary lanes exchanged.
register __vector double vxr_0 = vec_xxpermdi(vx_0, vx_0, 2);
register __vector double vxr_1 = vec_xxpermdi(vx_1, vx_1, 2);
i += 8;
// Element 0 of this group, all four columns.
vtemp0_p += vx_0*va0;
vtemp0_r += vxr_0*va0;
vtemp1_p += vx_0*va1;
vtemp1_r += vxr_0*va1;
vtemp2_p += vx_0*va2;
vtemp2_r += vxr_0*va2;
vtemp3_p += vx_0*va3;
vtemp3_r += vxr_0*va3;
// Element 1; vxr_0 is recycled for element 2 once element 0 is finished.
vtemp0_p += vx_1*va0_1;
vtemp0_r += vxr_1*va0_1;
vtemp1_p += vx_1*va1_1;
vtemp1_r += vxr_1*va1_1;
vxr_0 = vec_xxpermdi(vx_2, vx_2, 2);
vtemp2_p += vx_1*va2_1;
vtemp2_r += vxr_1*va2_1;
vtemp3_p += vx_1*va3_1;
vtemp3_r += vxr_1*va3_1;
// Element 2; vxr_1 is recycled for element 3 once element 1 is finished.
vtemp0_p += vx_2*va0_2;
vtemp0_r += vxr_0*va0_2;
vxr_1 = vec_xxpermdi(vx_3, vx_3, 2);
vtemp1_p += vx_2*va1_2;
vtemp1_r += vxr_0*va1_2;
vtemp2_p += vx_2*va2_2;
vtemp2_r += vxr_0*va2_2;
vtemp3_p += vx_2*va3_2;
vtemp3_r += vxr_0*va3_2;
// Element 3.
vtemp0_p += vx_3*va0_3;
vtemp0_r += vxr_1*va0_3;
vtemp1_p += vx_3*va1_3;
vtemp1_r += vxr_1*va1_3;
vtemp2_p += vx_3*va2_3;
vtemp2_r += vxr_1*va2_3;
vtemp3_p += vx_3*va3_3;
vtemp3_r += vxr_1*va3_3;
}
// Horizontal reduction of the two lanes into one complex dot product per
// column. First branch: plain product a*x. Second branch: conj(a)*x
// (re = a_re*x_re + a_im*x_im, im = a_re*x_im - a_im*x_re).
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
register FLOAT temp_r0 = vtemp0_p[0] - vtemp0_p[1];
register FLOAT temp_i0 = vtemp0_r[0] + vtemp0_r[1];
register FLOAT temp_r1 = vtemp1_p[0] - vtemp1_p[1];
register FLOAT temp_i1 = vtemp1_r[0] + vtemp1_r[1];
register FLOAT temp_r2 = vtemp2_p[0] - vtemp2_p[1];
register FLOAT temp_i2 = vtemp2_r[0] + vtemp2_r[1];
register FLOAT temp_r3 = vtemp3_p[0] - vtemp3_p[1];
register FLOAT temp_i3 = vtemp3_r[0] + vtemp3_r[1];
#else
register FLOAT temp_r0 = vtemp0_p[0] + vtemp0_p[1];
register FLOAT temp_i0 = vtemp0_r[0] - vtemp0_r[1];
register FLOAT temp_r1 = vtemp1_p[0] + vtemp1_p[1];
register FLOAT temp_i1 = vtemp1_r[0] - vtemp1_r[1];
register FLOAT temp_r2 = vtemp2_p[0] + vtemp2_p[1];
register FLOAT temp_i2 = vtemp2_r[0] - vtemp2_r[1];
register FLOAT temp_r3 = vtemp3_p[0] + vtemp3_p[1];
register FLOAT temp_i3 = vtemp3_r[0] - vtemp3_r[1];
#endif
// y += alpha * temp without XCONJ, y += alpha * conj(temp) with XCONJ.
#if !defined(XCONJ)
y[0] += alpha_r * temp_r0 - alpha_i * temp_i0;
y[1] += alpha_r * temp_i0 + alpha_i * temp_r0;
y[2] += alpha_r * temp_r1 - alpha_i * temp_i1;
y[3] += alpha_r * temp_i1 + alpha_i * temp_r1;
y[4] += alpha_r * temp_r2 - alpha_i * temp_i2;
y[5] += alpha_r * temp_i2 + alpha_i * temp_r2;
y[6] += alpha_r * temp_r3 - alpha_i * temp_i3;
y[7] += alpha_r * temp_i3 + alpha_i * temp_r3;
#else
y[0] += alpha_r * temp_r0 + alpha_i * temp_i0;
y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0;
y[2] += alpha_r * temp_r1 + alpha_i * temp_i1;
y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1;
y[4] += alpha_r * temp_r2 + alpha_i * temp_i2;
y[5] -= alpha_r * temp_i2 - alpha_i * temp_r2;
y[6] += alpha_r * temp_r3 + alpha_i * temp_i3;
y[7] -= alpha_r * temp_i3 - alpha_i * temp_r3;
#endif
}
#else
/*
 * zgemv_kernel_4x4 (portable scalar fallback)
 *
 * Computes the complex dot product of x with each of four consecutive
 * columns of A (ap, ap + lda, ap + 2*lda, ap + 3*lda) over n complex
 * elements and adds alpha times the result into y[0..7] (four interleaved
 * re/im pairs). CONJ / XCONJ select the conjugation variant at compile time.
 */
static void zgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) {
    FLOAT *col[4];
    FLOAT dot_re[4] = {0.0, 0.0, 0.0, 0.0};
    FLOAT dot_im[4] = {0.0, 0.0, 0.0, 0.0};
    BLASLONG k;
    int c;

    /* Column c starts c * lda FLOATs after ap. */
    col[0] = ap;
    for (c = 1; c < 4; c++) {
        col[c] = col[c - 1] + lda;
    }

    /* k indexes FLOATs; every complex element occupies two of them. */
    for (k = 0; k < 2 * n; k += 2) {
        for (c = 0; c < 4; c++) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
            /* plain product a * x */
            dot_re[c] += col[c][k] * x[k] - col[c][k + 1] * x[k + 1];
            dot_im[c] += col[c][k] * x[k + 1] + col[c][k + 1] * x[k];
#else
            /* product with conjugated matrix element: conj(a) * x */
            dot_re[c] += col[c][k] * x[k] + col[c][k + 1] * x[k + 1];
            dot_im[c] += col[c][k] * x[k + 1] - col[c][k + 1] * x[k];
#endif
        }
    }

    /* y += alpha * dot, or alpha * conj(dot) when XCONJ is defined. */
    for (c = 0; c < 4; c++) {
#if !defined(XCONJ)
        y[2 * c] += alpha_r * dot_re[c] - alpha_i * dot_im[c];
        y[2 * c + 1] += alpha_r * dot_im[c] + alpha_i * dot_re[c];
#else
        y[2 * c] += alpha_r * dot_re[c] + alpha_i * dot_im[c];
        y[2 * c + 1] -= alpha_r * dot_im[c] - alpha_i * dot_re[c];
#endif
    }
}
#endif
#ifdef HAVE_KERNEL_4x2_VEC
/*
 * zgemv_kernel_4x2 (VSX vector variant)
 *
 * Same scheme as the four-column vector kernel, restricted to the two
 * columns ap and ap + lda: accumulates the complex dot product of each
 * column with x over n complex elements and adds alpha * dot (or
 * alpha * conj(dot) when XCONJ is defined) into y[0..3].
 *
 * The loop consumes four complex values (8 FLOATs) per iteration with no
 * remainder handling, so n is expected to be a multiple of 4. FLOAT
 * pointers are reinterpreted as __vector double (presumes FLOAT is double).
 */
static void zgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) {
BLASLONG i;
FLOAT *a0, *a1;
a0 = ap;
a1 = ap + lda;
// Two accumulators per column:
//   *_p sums x * a lane by lane:          { x_re*a_re, x_im*a_im }
//   *_r sums swapped(x) * a lane by lane: { x_im*a_re, x_re*a_im }
register __vector double vtemp0_p = {0.0, 0.0};
register __vector double vtemp0_r = {0.0, 0.0};
register __vector double vtemp1_p = {0.0, 0.0};
register __vector double vtemp1_r = {0.0, 0.0};
i = 0;
// From here on n counts FLOATs (two per complex element), matching index i.
n = n << 1;
while (i < n) {
register __vector double vx_0 = *(__vector double*) (&x[i]);
register __vector double vx_1 = *(__vector double*) (&x[i + 2]);
register __vector double vx_2 = *(__vector double*) (&x[i + 4]);
register __vector double vx_3 = *(__vector double*) (&x[i + 6]);
register __vector double va0 = *(__vector double*) (&a0[i]);
register __vector double va0_1 = *(__vector double*) (&a0[i + 2]);
register __vector double va0_2 = *(__vector double*) (&a0[i + 4]);
register __vector double va0_3 = *(__vector double*) (&a0[i + 6]);
register __vector double va1 = *(__vector double*) (&a1[i]);
register __vector double va1_1 = *(__vector double*) (&a1[i + 2]);
register __vector double va1_2 = *(__vector double*) (&a1[i + 4]);
register __vector double va1_3 = *(__vector double*) (&a1[i + 6]);
// vxr_N is vx_N with its real and imaginary lanes exchanged.
register __vector double vxr_0 = vec_xxpermdi(vx_0, vx_0, 2);
register __vector double vxr_1 = vec_xxpermdi(vx_1, vx_1, 2);
i += 8;
vtemp0_p += vx_0*va0;
vtemp0_r += vxr_0*va0;
vtemp1_p += vx_0*va1;
vtemp1_r += vxr_0*va1;
// vxr_0 is no longer needed for element 0; reuse it for element 2.
vxr_0 = vec_xxpermdi(vx_2, vx_2, 2);
vtemp0_p += vx_1*va0_1;
vtemp0_r += vxr_1*va0_1;
vtemp1_p += vx_1*va1_1;
vtemp1_r += vxr_1*va1_1;
// vxr_1 is no longer needed for element 1; reuse it for element 3.
vxr_1 = vec_xxpermdi(vx_3, vx_3, 2);
vtemp0_p += vx_2*va0_2;
vtemp0_r += vxr_0*va0_2;
vtemp1_p += vx_2*va1_2;
vtemp1_r += vxr_0*va1_2;
vtemp0_p += vx_3*va0_3;
vtemp0_r += vxr_1*va0_3;
vtemp1_p += vx_3*va1_3;
vtemp1_r += vxr_1*va1_3;
}
// Reduce the lanes to one complex dot product per column: a*x in the first
// branch, conj(a)*x in the second.
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
register FLOAT temp_r0 = vtemp0_p[0] - vtemp0_p[1];
register FLOAT temp_i0 = vtemp0_r[0] + vtemp0_r[1];
register FLOAT temp_r1 = vtemp1_p[0] - vtemp1_p[1];
register FLOAT temp_i1 = vtemp1_r[0] + vtemp1_r[1];
#else
register FLOAT temp_r0 = vtemp0_p[0] + vtemp0_p[1];
register FLOAT temp_i0 = vtemp0_r[0] - vtemp0_r[1];
register FLOAT temp_r1 = vtemp1_p[0] + vtemp1_p[1];
register FLOAT temp_i1 = vtemp1_r[0] - vtemp1_r[1];
#endif
// y += alpha * temp without XCONJ, y += alpha * conj(temp) with XCONJ.
#if !defined(XCONJ)
y[0] += alpha_r * temp_r0 - alpha_i * temp_i0;
y[1] += alpha_r * temp_i0 + alpha_i * temp_r0;
y[2] += alpha_r * temp_r1 - alpha_i * temp_i1;
y[3] += alpha_r * temp_i1 + alpha_i * temp_r1;
#else
y[0] += alpha_r * temp_r0 + alpha_i * temp_i0;
y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0;
y[2] += alpha_r * temp_r1 + alpha_i * temp_i1;
y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1;
#endif
}
#else
/*
 * zgemv_kernel_4x2 (portable scalar fallback)
 *
 * Computes the complex dot product of x with the two columns ap and
 * ap + lda over n complex elements and adds alpha times each result into
 * y[0..3] (two interleaved re/im pairs). CONJ / XCONJ select the
 * conjugation variant at compile time.
 */
static void zgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) {
    FLOAT *first = ap;
    FLOAT *second = ap + lda;
    FLOAT re_first = 0.0, im_first = 0.0;
    FLOAT re_second = 0.0, im_second = 0.0;
    BLASLONG k = 0;

    /* k indexes FLOATs; every complex element occupies two of them. */
    while (k < 2 * n) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
        /* plain product a * x */
        re_first += first[k] * x[k] - first[k + 1] * x[k + 1];
        im_first += first[k] * x[k + 1] + first[k + 1] * x[k];
        re_second += second[k] * x[k] - second[k + 1] * x[k + 1];
        im_second += second[k] * x[k + 1] + second[k + 1] * x[k];
#else
        /* product with conjugated matrix element: conj(a) * x */
        re_first += first[k] * x[k] + first[k + 1] * x[k + 1];
        im_first += first[k] * x[k + 1] - first[k + 1] * x[k];
        re_second += second[k] * x[k] + second[k + 1] * x[k + 1];
        im_second += second[k] * x[k + 1] - second[k + 1] * x[k];
#endif
        k += 2;
    }

    /* y += alpha * dot, or alpha * conj(dot) when XCONJ is defined. */
#if !defined(XCONJ)
    y[0] += alpha_r * re_first - alpha_i * im_first;
    y[1] += alpha_r * im_first + alpha_i * re_first;
    y[2] += alpha_r * re_second - alpha_i * im_second;
    y[3] += alpha_r * im_second + alpha_i * re_second;
#else
    y[0] += alpha_r * re_first + alpha_i * im_first;
    y[1] -= alpha_r * im_first - alpha_i * re_first;
    y[2] += alpha_r * re_second + alpha_i * im_second;
    y[3] -= alpha_r * im_second - alpha_i * re_second;
#endif
}
#endif
#ifdef HAVE_KERNEL_4x1_VEC
/*
 * zgemv_kernel_4x1 (VSX vector variant)
 *
 * Accumulates the complex dot product of the single column ap with x over
 * n complex elements and adds alpha * dot (or alpha * conj(dot) when XCONJ
 * is defined) into y[0..1].
 *
 * The loop consumes four complex values (8 FLOATs) per iteration with no
 * remainder handling, so n is expected to be a multiple of 4. FLOAT
 * pointers are reinterpreted as __vector double (presumes FLOAT is double).
 */
static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) {
BLASLONG i;
FLOAT *a0 ;
a0 = ap;
// vtemp0_p sums x * a lane by lane:          { x_re*a_re, x_im*a_im }
// vtemp0_r sums swapped(x) * a lane by lane: { x_im*a_re, x_re*a_im }
register __vector double vtemp0_p = {0.0, 0.0};
register __vector double vtemp0_r = {0.0, 0.0};
i = 0;
// From here on n counts FLOATs (two per complex element), matching index i.
n = n << 1;
while (i < n) {
register __vector double vx_0 = *(__vector double*) (&x[i]);
register __vector double vx_1 = *(__vector double*) (&x[i + 2]);
register __vector double vx_2 = *(__vector double*) (&x[i + 4]);
register __vector double vx_3 = *(__vector double*) (&x[i + 6]);
register __vector double va0 = *(__vector double*) (&a0[i]);
register __vector double va0_1 = *(__vector double*) (&a0[i + 2]);
register __vector double va0_2 = *(__vector double*) (&a0[i + 4]);
register __vector double va0_3 = *(__vector double*) (&a0[i + 6]);
// vxr_N is vx_N with its real and imaginary lanes exchanged.
register __vector double vxr_0 = vec_xxpermdi(vx_0, vx_0, 2);
register __vector double vxr_1 = vec_xxpermdi(vx_1, vx_1, 2);
i += 8;
vtemp0_p += vx_0*va0;
vtemp0_r += vxr_0*va0;
// vxr_0 is no longer needed for element 0; reuse it for element 2.
vxr_0 = vec_xxpermdi(vx_2, vx_2, 2);
vtemp0_p += vx_1*va0_1;
vtemp0_r += vxr_1*va0_1;
// vxr_1 is no longer needed for element 1; reuse it for element 3.
vxr_1 = vec_xxpermdi(vx_3, vx_3, 2);
vtemp0_p += vx_2*va0_2;
vtemp0_r += vxr_0*va0_2;
vtemp0_p += vx_3*va0_3;
vtemp0_r += vxr_1*va0_3;
}
// Reduce the lanes to one complex dot product: a*x in the first branch,
// conj(a)*x in the second.
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
register FLOAT temp_r0 = vtemp0_p[0] - vtemp0_p[1];
register FLOAT temp_i0 = vtemp0_r[0] + vtemp0_r[1];
#else
register FLOAT temp_r0 = vtemp0_p[0] + vtemp0_p[1];
register FLOAT temp_i0 = vtemp0_r[0] - vtemp0_r[1];
#endif
// y += alpha * temp without XCONJ, y += alpha * conj(temp) with XCONJ.
#if !defined(XCONJ)
y[0] += alpha_r * temp_r0 - alpha_i * temp_i0;
y[1] += alpha_r * temp_i0 + alpha_i * temp_r0;
#else
y[0] += alpha_r * temp_r0 + alpha_i * temp_i0;
y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0;
#endif
}
#else
/*
 * zgemv_kernel_4x1 (portable scalar fallback)
 *
 * Computes the complex dot product of x with the single column ap over n
 * complex elements and adds alpha times the result into y[0..1].
 * CONJ / XCONJ select the conjugation variant at compile time.
 */
static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) {
    FLOAT dot_re = 0.0;
    FLOAT dot_im = 0.0;
    BLASLONG k = 0;

    /* k indexes FLOATs; every complex element occupies two of them. */
    while (k < 2 * n) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
        /* plain product a * x */
        dot_re += ap[k] * x[k] - ap[k + 1] * x[k + 1];
        dot_im += ap[k] * x[k + 1] + ap[k + 1] * x[k];
#else
        /* product with conjugated matrix element: conj(a) * x */
        dot_re += ap[k] * x[k] + ap[k + 1] * x[k + 1];
        dot_im += ap[k] * x[k + 1] - ap[k + 1] * x[k];
#endif
        k += 2;
    }

    /* y += alpha * dot, or alpha * conj(dot) when XCONJ is defined. */
#if !defined(XCONJ)
    y[0] += alpha_r * dot_re - alpha_i * dot_im;
    y[1] += alpha_r * dot_im + alpha_i * dot_re;
#else
    y[0] += alpha_r * dot_re + alpha_i * dot_im;
    y[1] -= alpha_r * dot_im - alpha_i * dot_re;
#endif
}
#endif
/*
 * copy_x: gather n complex values from a strided vector into a contiguous
 * buffer.
 *
 *   n        number of complex elements to copy.
 *   src      source vector; consecutive elements are inc_src FLOATs apart.
 *   dest     destination, written as 2 * n contiguous FLOATs (re, im pairs).
 *   inc_src  source stride in FLOAT units (the caller passes the already
 *            doubled complex increment).
 *
 * The always_inline attribute is paired with an explicit inline specifier:
 * GCC warns ("always_inline function might not be inlinable") when the
 * attribute is applied to a function that is not declared inline.
 */
static __inline__ __attribute__((always_inline)) void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) {
    BLASLONG i;
    for (i = 0; i < n; i++) {
        dest[0] = src[0];   /* real part */
        dest[1] = src[1];   /* imaginary part */
        dest += 2;
        src += inc_src;
    }
}
/*
 * Complex GEMV driver, transposed form: for every column j of A (0 <= j < n)
 * the dot product of that column with x over m elements is formed and
 * alpha times it is added to y[j]. CONJ / XCONJ select the conjugation
 * variant at compile time (see the sign patterns in the tail code below).
 *
 *   m        length of x and of each column of A (complex elements).
 *   n        number of columns of A and number of elements of y updated.
 *   dummy1   unused.
 *   alpha_r, alpha_i   real / imaginary part of alpha.
 *   a, lda   matrix, columns lda complex elements apart.
 *   x, inc_x input vector and its increment in complex elements.
 *   y, inc_y output vector and its increment in complex elements; y is
 *            only accumulated into, never scaled.
 *   buffer   scratch space used to pack x contiguously when inc_x != 1;
 *            it receives up to NBMAX complex values.
 *
 * Always returns 0.
 *
 * Structure: the rows are processed in panels of at most NBMAX rows using
 * the 4x4 / 4x2 / 4x1 kernels (which need a row count that is a multiple
 * of 4); the remaining m & 3 rows are handled by scalar code at the end.
 */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) {
BLASLONG i;
BLASLONG j;
FLOAT *a_ptr;
FLOAT *x_ptr;
FLOAT *y_ptr;
BLASLONG n1;
BLASLONG m1;
BLASLONG m2;
BLASLONG m3;
BLASLONG n2;
FLOAT ybuffer[8], *xbuffer;
if (m < 1) return (0);
if (n < 1) return (0);
/* Convert increments and leading dimension from complex to FLOAT units. */
inc_x <<= 1;
inc_y <<= 1;
lda <<= 1;
xbuffer = buffer;
n1 = n >> 2;                  /* number of full groups of four columns */
n2 = n & 3;                   /* leftover columns                      */
m3 = m & 3;                   /* leftover rows, handled after the loop */
m1 = m - m3;                  /* rows handled by the vector kernels    */
m2 = (m & (NBMAX - 1)) - m3;  /* size of the final, partial row panel  */
BLASLONG NB = NBMAX;
/* One iteration per row panel. NB stays NBMAX for full panels; it is set
   to m2 for the last partial panel, which also terminates the loop. */
while (NB == NBMAX) {
m1 -= NB;
if (m1 < 0) {
if (m2 == 0) break;
NB = m2;
}
y_ptr = y;
a_ptr = a;
x_ptr = x;
/* The kernels need contiguous x: pack it unless it already is. */
if (inc_x != 2)
copy_x(NB, x_ptr, xbuffer, inc_x);
else
xbuffer = x_ptr;
if (inc_y == 2) {
/* Contiguous y: the kernels accumulate straight into y. */
for (i = 0; i < n1; i++) {
zgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, y_ptr, alpha_r, alpha_i);
a_ptr += lda << 2;
y_ptr += 8;
}
if (n2 & 2) {
zgemv_kernel_4x2(NB, lda, a_ptr, xbuffer, y_ptr, alpha_r, alpha_i);
a_ptr += lda << 1;
y_ptr += 4;
}
if (n2 & 1) {
zgemv_kernel_4x1(NB, a_ptr, xbuffer, y_ptr, alpha_r, alpha_i);
a_ptr += lda;
y_ptr += 2;
}
} else {
/* Strided y: let the kernel accumulate into a zeroed temporary and
   scatter the results into y with the requested increment. */
for (i = 0; i < n1; i++) {
memset(ybuffer, 0, sizeof (ybuffer));
zgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, ybuffer, alpha_r, alpha_i);
a_ptr += lda << 2;
y_ptr[0] += ybuffer[0];
y_ptr[1] += ybuffer[1];
y_ptr += inc_y;
y_ptr[0] += ybuffer[2];
y_ptr[1] += ybuffer[3];
y_ptr += inc_y;
y_ptr[0] += ybuffer[4];
y_ptr[1] += ybuffer[5];
y_ptr += inc_y;
y_ptr[0] += ybuffer[6];
y_ptr[1] += ybuffer[7];
y_ptr += inc_y;
}
/* Leftover columns are done one at a time in the strided case. */
for (i = 0; i < n2; i++) {
memset(ybuffer, 0, sizeof (ybuffer));
zgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer, alpha_r, alpha_i);
a_ptr += lda;
y_ptr[0] += ybuffer[0];
y_ptr[1] += ybuffer[1];
y_ptr += inc_y;
}
}
/* Advance to the next row panel. */
a += 2 * NB;
x += NB * inc_x;
}
/* Scalar tail for the last m3 (1..3) rows; a and x now point at them. */
if (m3 == 0) return (0);
x_ptr = x;
j = 0;
a_ptr = a;
y_ptr = y;
if (m3 == 3) {
/* Three remaining rows: cache the three x values, one column per pass. */
FLOAT temp_r;
FLOAT temp_i;
FLOAT x0 = x_ptr[0];
FLOAT x1 = x_ptr[1];
x_ptr += inc_x;
FLOAT x2 = x_ptr[0];
FLOAT x3 = x_ptr[1];
x_ptr += inc_x;
FLOAT x4 = x_ptr[0];
FLOAT x5 = x_ptr[1];
while (j < n) {
/* First branch: a * x. Second branch: conj(a) * x. */
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r = a_ptr[0] * x0 - a_ptr[1] * x1;
temp_i = a_ptr[0] * x1 + a_ptr[1] * x0;
temp_r += a_ptr[2] * x2 - a_ptr[3] * x3;
temp_i += a_ptr[2] * x3 + a_ptr[3] * x2;
temp_r += a_ptr[4] * x4 - a_ptr[5] * x5;
temp_i += a_ptr[4] * x5 + a_ptr[5] * x4;
#else
temp_r = a_ptr[0] * x0 + a_ptr[1] * x1;
temp_i = a_ptr[0] * x1 - a_ptr[1] * x0;
temp_r += a_ptr[2] * x2 + a_ptr[3] * x3;
temp_i += a_ptr[2] * x3 - a_ptr[3] * x2;
temp_r += a_ptr[4] * x4 + a_ptr[5] * x5;
temp_i += a_ptr[4] * x5 - a_ptr[5] * x4;
#endif
/* y += alpha * temp, or alpha * conj(temp) when XCONJ is defined. */
#if !defined(XCONJ)
y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i;
y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r;
#else
y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i;
y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r;
#endif
a_ptr += lda;
y_ptr += inc_y;
j++;
}
return (0);
}
if (m3 == 2) {
/* Two remaining rows: columns are processed in pairs, then one by one. */
FLOAT temp_r;
FLOAT temp_i;
FLOAT temp_r1;
FLOAT temp_i1;
FLOAT x0 = x_ptr[0];
FLOAT x1 = x_ptr[1];
x_ptr += inc_x;
FLOAT x2 = x_ptr[0];
FLOAT x3 = x_ptr[1];
/* n & -2 is n rounded down to an even count. */
while (j < (n & -2)) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r = a_ptr[0] * x0 - a_ptr[1] * x1;
temp_i = a_ptr[0] * x1 + a_ptr[1] * x0;
temp_r += a_ptr[2] * x2 - a_ptr[3] * x3;
temp_i += a_ptr[2] * x3 + a_ptr[3] * x2;
a_ptr += lda;
temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1;
temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0;
temp_r1 += a_ptr[2] * x2 - a_ptr[3] * x3;
temp_i1 += a_ptr[2] * x3 + a_ptr[3] * x2;
#else
temp_r = a_ptr[0] * x0 + a_ptr[1] * x1;
temp_i = a_ptr[0] * x1 - a_ptr[1] * x0;
temp_r += a_ptr[2] * x2 + a_ptr[3] * x3;
temp_i += a_ptr[2] * x3 - a_ptr[3] * x2;
a_ptr += lda;
temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1;
temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0;
temp_r1 += a_ptr[2] * x2 + a_ptr[3] * x3;
temp_i1 += a_ptr[2] * x3 - a_ptr[3] * x2;
#endif
#if !defined(XCONJ)
y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i;
y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r;
y_ptr += inc_y;
y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1;
y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1;
#else
y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i;
y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r;
y_ptr += inc_y;
y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1;
y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1;
#endif
a_ptr += lda;
y_ptr += inc_y;
j += 2;
}
/* Odd column left over, if any. */
while (j < n) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r = a_ptr[0] * x0 - a_ptr[1] * x1;
temp_i = a_ptr[0] * x1 + a_ptr[1] * x0;
temp_r += a_ptr[2] * x2 - a_ptr[3] * x3;
temp_i += a_ptr[2] * x3 + a_ptr[3] * x2;
#else
temp_r = a_ptr[0] * x0 + a_ptr[1] * x1;
temp_i = a_ptr[0] * x1 - a_ptr[1] * x0;
temp_r += a_ptr[2] * x2 + a_ptr[3] * x3;
temp_i += a_ptr[2] * x3 - a_ptr[3] * x2;
#endif
#if !defined(XCONJ)
y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i;
y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r;
#else
y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i;
y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r;
#endif
a_ptr += lda;
y_ptr += inc_y;
j++;
}
return (0);
}
if (m3 == 1) {
/* One remaining row: columns are processed in pairs, then one by one. */
FLOAT temp_r;
FLOAT temp_i;
FLOAT temp_r1;
FLOAT temp_i1;
FLOAT x0 = x_ptr[0];
FLOAT x1 = x_ptr[1];
/* n & -2 is n rounded down to an even count. */
while (j < (n & -2)) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r = a_ptr[0] * x0 - a_ptr[1] * x1;
temp_i = a_ptr[0] * x1 + a_ptr[1] * x0;
a_ptr += lda;
temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1;
temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0;
#else
temp_r = a_ptr[0] * x0 + a_ptr[1] * x1;
temp_i = a_ptr[0] * x1 - a_ptr[1] * x0;
a_ptr += lda;
temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1;
temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0;
#endif
#if !defined(XCONJ)
y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i;
y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r;
y_ptr += inc_y;
y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1;
y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1;
#else
y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i;
y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r;
y_ptr += inc_y;
y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1;
y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1;
#endif
a_ptr += lda;
y_ptr += inc_y;
j += 2;
}
/* Odd column left over, if any. */
while (j < n) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
temp_r = a_ptr[0] * x0 - a_ptr[1] * x1;
temp_i = a_ptr[0] * x1 + a_ptr[1] * x0;
#else
temp_r = a_ptr[0] * x0 + a_ptr[1] * x1;
temp_i = a_ptr[0] * x1 - a_ptr[1] * x0;
#endif
#if !defined(XCONJ)
y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i;
y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r;
#else
y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i;
y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r;
#endif
a_ptr += lda;
y_ptr += inc_y;
j++;
}
return (0);
}
return (0);
}

265
kernel/power/zrot.c Normal file
View File

@@ -0,0 +1,265 @@
/***************************************************************************
Copyright (c) 2018, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
/*
 * zrot_kernel_4: apply a plane rotation with real coefficients to two
 * contiguous complex vectors, in place:
 *
 *     x[k] = cosA * x[k] + sinA * y[k]
 *     y[k] = cosA * y[k] - sinA * x[k]      (using the old x[k])
 *
 * for all 2 * n doubles of x and y (n complex elements).
 *
 *   n      number of complex elements. Each pass handles 64 bytes of x and
 *          of y (four complex doubles) and decrements the counter by 4.
 *          The first block is loaded unconditionally, so n must be >= 4;
 *          the caller passes a positive multiple of 4.
 *   x, y   vectors, rotated in place.
 *   cosA, sinA   rotation coefficients, broadcast to both vector lanes.
 *
 * The loop is software-pipelined: the block for the next pass is loaded
 * while the current one is being computed, and label 2 is the epilogue that
 * finishes the last loaded block without loading another one.
 *
 * Fixed VSX registers used by the template (all listed as clobbers):
 *   vs32-35  current x block       vs48-51  current y block
 *   vs36     cos splat             vs37     sin splat
 *   vs40-43  c * x, then new x     vs44-47  s * x
 * t0..t3 hold c * y and then the new y; t4..t7 hold s * y.
 */
static void zrot_kernel_4(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT cosA, FLOAT sinA)
{
__vector double t0;
__vector double t1;
__vector double t2;
__vector double t3;
__vector double t4;
__vector double t5;
__vector double t6;
__vector double t7;
__asm__
(
"xxspltd 36, %x[cos], 0 \n\t" // load c to both dwords
"xxspltd 37, %x[sin], 0 \n\t" // load s to both dwords
// Prologue: load the first 64-byte block of x and y.
"lxvd2x 32, 0, %[x_ptr] \n\t" // load x
"lxvd2x 33, %[i16], %[x_ptr] \n\t"
"lxvd2x 34, %[i32], %[x_ptr] \n\t"
"lxvd2x 35, %[i48], %[x_ptr] \n\t"
"lxvd2x 48, 0, %[y_ptr] \n\t" // load y
"lxvd2x 49, %[i16], %[y_ptr] \n\t"
"lxvd2x 50, %[i32], %[y_ptr] \n\t"
"lxvd2x 51, %[i48], %[y_ptr] \n\t"
// Pointers now address the block after the one just loaded.
"addi %[x_ptr], %[x_ptr], 64 \n\t"
"addi %[y_ptr], %[y_ptr], 64 \n\t"
// Only one block in total: skip the loop and go to the epilogue.
"addic. %[temp_n], %[temp_n], -4 \n\t"
"ble 2f \n\t"
".p2align 5 \n"
"1: \n\t"
"xvmuldp 40, 32, 36 \n\t" // c * x
"xvmuldp 41, 33, 36 \n\t"
"xvmuldp 42, 34, 36 \n\t"
"xvmuldp 43, 35, 36 \n\t"
"xvmuldp %x[x0], 48, 36 \n\t" // c * y
"xvmuldp %x[x1], 49, 36 \n\t"
"xvmuldp %x[x2], 50, 36 \n\t"
"xvmuldp %x[x3], 51, 36 \n\t"
// The loads of the next block are interleaved with the remaining
// multiplies; each input register is reloaded only after its last use.
"xvmuldp 44, 32, 37 \n\t" // s * x
"xvmuldp 45, 33, 37 \n\t"
"lxvd2x 32, 0, %[x_ptr] \n\t" // load x
"lxvd2x 33, %[i16],%[x_ptr] \n\t"
"xvmuldp 46, 34, 37 \n\t"
"xvmuldp 47, 35, 37 \n\t"
"lxvd2x 34, %[i32], %[x_ptr] \n\t"
"lxvd2x 35, %[i48], %[x_ptr] \n\t"
"xvmuldp %x[x4], 48, 37 \n\t" // s * y
"xvmuldp %x[x5], 49, 37 \n\t"
"lxvd2x 48, 0, %[y_ptr] \n\t" // load y
"lxvd2x 49, %[i16], %[y_ptr] \n\t"
"xvmuldp %x[x6], 50, 37 \n\t"
"xvmuldp %x[x7], 51, 37 \n\t"
"lxvd2x 50, %[i32], %[y_ptr] \n\t"
"lxvd2x 51, %[i48], %[y_ptr] \n\t"
"xvadddp 40, 40, %x[x4] \n\t" // c * x + s * y
"xvadddp 41, 41, %x[x5] \n\t" // c * x + s * y
// Step back to the block being computed so the results can be stored.
"addi %[x_ptr], %[x_ptr], -64 \n\t"
"addi %[y_ptr], %[y_ptr], -64 \n\t"
"xvadddp 42, 42, %x[x6] \n\t" // c * x + s * y
"xvadddp 43, 43, %x[x7] \n\t" // c * x + s * y
"xvsubdp %x[x0], %x[x0], 44 \n\t" // c * y - s * x
"xvsubdp %x[x1], %x[x1], 45 \n\t" // c * y - s * x
"xvsubdp %x[x2], %x[x2], 46 \n\t" // c * y - s * x
"xvsubdp %x[x3], %x[x3], 47 \n\t" // c * y - s * x
"stxvd2x 40, 0, %[x_ptr] \n\t" // store x
"stxvd2x 41, %[i16], %[x_ptr] \n\t"
"stxvd2x 42, %[i32], %[x_ptr] \n\t"
"stxvd2x 43, %[i48], %[x_ptr] \n\t"
"stxvd2x %x[x0], 0, %[y_ptr] \n\t" // store y
"stxvd2x %x[x1], %[i16], %[y_ptr] \n\t"
"stxvd2x %x[x2], %[i32], %[y_ptr] \n\t"
"stxvd2x %x[x3], %[i48], %[y_ptr] \n\t"
// Skip the block just stored and the one already loaded.
"addi %[x_ptr], %[x_ptr], 128 \n\t"
"addi %[y_ptr], %[y_ptr], 128 \n\t"
"addic. %[temp_n], %[temp_n], -4 \n\t"
"bgt+ 1b \n"
// Epilogue: rotate and store the last loaded block, no further loads.
"2: \n\t"
"xvmuldp 40, 32, 36 \n\t" // c * x
"xvmuldp 41, 33, 36 \n\t"
"xvmuldp 42, 34, 36 \n\t"
"xvmuldp 43, 35, 36 \n\t"
"xvmuldp %x[x0], 48, 36 \n\t" // c * y
"xvmuldp %x[x1], 49, 36 \n\t"
"xvmuldp %x[x2], 50, 36 \n\t"
"xvmuldp %x[x3], 51, 36 \n\t"
"xvmuldp 44, 32, 37 \n\t" // s * x
"xvmuldp 45, 33, 37 \n\t"
"xvmuldp 46, 34, 37 \n\t"
"xvmuldp 47, 35, 37 \n\t"
"xvmuldp %x[x4], 48, 37 \n\t" // s * y
"xvmuldp %x[x5], 49, 37 \n\t"
"xvmuldp %x[x6], 50, 37 \n\t"
"xvmuldp %x[x7], 51, 37 \n\t"
"addi %[x_ptr], %[x_ptr], -64 \n\t"
"addi %[y_ptr], %[y_ptr], -64 \n\t"
"xvadddp 40, 40, %x[x4] \n\t" // c * x + s * y
"xvadddp 41, 41, %x[x5] \n\t" // c * x + s * y
"xvadddp 42, 42, %x[x6] \n\t" // c * x + s * y
"xvadddp 43, 43, %x[x7] \n\t" // c * x + s * y
"xvsubdp %x[x0], %x[x0], 44 \n\t" // c * y - s * x
"xvsubdp %x[x1], %x[x1], 45 \n\t" // c * y - s * x
"xvsubdp %x[x2], %x[x2], 46 \n\t" // c * y - s * x
"xvsubdp %x[x3], %x[x3], 47 \n\t" // c * y - s * x
"stxvd2x 40, 0, %[x_ptr] \n\t" // store x
"stxvd2x 41, %[i16], %[x_ptr] \n\t"
"stxvd2x 42, %[i32], %[x_ptr] \n\t"
"stxvd2x 43, %[i48], %[x_ptr] \n\t"
"stxvd2x %x[x0], 0, %[y_ptr] \n\t" // store y
"stxvd2x %x[x1], %[i16], %[y_ptr] \n\t"
"stxvd2x %x[x2], %[i32], %[y_ptr] \n\t"
"stxvd2x %x[x3], %[i48], %[y_ptr] \n\t"
:
// The "+m" operands tell the compiler that all 2*n doubles of x and y
// are read and written by the template.
[mem_x] "+m" (*(double (*)[2*n])x),
[mem_y] "+m" (*(double (*)[2*n])y),
[temp_n] "+&r" (n),
[x_ptr] "+&b"(x), [y_ptr] "+&b"(y),
[x0] "=wa" (t0),
[x1] "=wa" (t1),
[x2] "=wa" (t2),
[x3] "=wa" (t3),
[x4] "=wa" (t4),
[x5] "=wa" (t5),
[x6] "=wa" (t6),
[x7] "=wa" (t7)
:
[cos] "d" (cosA),
[sin] "d" (sinA),
// Byte offsets of the 2nd, 3rd and 4th vector within a 64-byte block.
[i16] "b" (16),
[i32] "b" (32),
[i48] "b" (48)
:
"cr0",
"vs32","vs33","vs34","vs35","vs36","vs37",
"vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
"vs48","vs49","vs50","vs51"
);
return;
}
/*
 * Complex plane rotation with real coefficients, applied in place:
 *
 *     x[k] = c * x[k] + s * y[k]
 *     y[k] = c * y[k] - s * x[k]      (using the old x[k])
 *
 * for n complex elements; inc_x / inc_y are the increments of x and y in
 * complex elements. When both increments are 1 the largest multiple of
 * four elements is handed to zrot_kernel_4 and only the remainder is done
 * here. Returns 0 in all cases; n <= 0 is a no-op.
 */
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
{
    BLASLONG done = 0;          /* complex elements processed so far */
    BLASLONG px = 0, py = 0;    /* FLOAT offsets into x and y        */
    FLOAT new_re, new_im;

    if (n <= 0) {
        return (0);
    }

    if ((inc_x != 1) || (inc_y != 1)) {
        /* General strided case; steps are in FLOAT units. */
        BLASLONG step_x = 2 * inc_x;
        BLASLONG step_y = 2 * inc_y;

        for (; done < n; done++) {
            new_re = c*x[px] + s*y[py];
            new_im = c*x[px+1] + s*y[py+1];
            y[py] = c*y[py] - s*x[px];
            y[py+1] = c*y[py+1] - s*x[px+1];
            x[px] = new_re;
            x[px+1] = new_im;
            px += step_x;
            py += step_y;
        }
        return (0);
    }

    /* Contiguous case: bulk part through the vector kernel. */
    {
        BLASLONG bulk = n & -4;

        if (bulk > 0) {
            zrot_kernel_4(bulk, x, y, c, s);
            done = bulk;
            px = 2 * bulk;
        }
    }

    /* Remaining (at most three) elements; x and y share the offset px. */
    for (; done < n; done++) {
        new_re = c*x[px] + s*y[px];
        new_im = c*x[px+1] + s*y[px+1];
        y[px] = c*y[px] - s*x[px];
        y[px+1] = c*y[px+1] - s*x[px+1];
        x[px] = new_re;
        x[px+1] = new_im;
        px += 2;
    }
    return (0);
}

View File

@@ -647,7 +647,9 @@ static int get_l2_size_old(void){
return 6144;
}
}
return 0;
// return 0;
fprintf (stderr,"OpenBLAS WARNING - could not determine the L2 cache size on this system, assuming 256k\n");
return 256;
}
#endif
@@ -660,6 +662,10 @@ static __inline__ int get_l2_size(void){
l2 = BITMASK(ecx, 16, 0xffff);
#ifndef ARCH_X86
if (l2 <= 0) {
fprintf (stderr,"OpenBLAS WARNING - could not determine the L2 cache size on this system, assuming 256k\n");
return 256;
}
return l2;
#else
@@ -871,6 +877,22 @@ static void init_parameter(void) {
#endif
#endif
#ifdef SKYLAKEX
#ifdef DEBUG
fprintf(stderr, "SkylakeX\n");
#endif
TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P;
TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P;
TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P;
TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P;
#ifdef EXPRECISION
TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P;
TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P;
#endif
#endif
#ifdef OPTERON

View File

@@ -169,7 +169,7 @@ ifndef ZDOTKERNEL
ZDOTKERNEL = ../arm/zdot.c
endif
DSDOTKERNEL = ../arm/dot.c
DSDOTKERNEL = ../generic/dot.c
# Bug in znrm2 assembler kernel
ifndef ZNRM2KERNEL

View File

@@ -1,3 +1 @@
include $(KERNELDIR)/KERNEL.PENRYN
SSWAPKERNEL = ../arm/swap.c
DSWAPKERNEL = ../arm/swap.c

View File

@@ -138,6 +138,14 @@
/* INCX != 1 or INCY != 1 */
.L14:
cmpl $0, %ebx
jne .L141
cmpl $0, %ecx
jne .L141
/* INCX == 0 and INCY == 0 */
jmp .L27
.L141:
movl %edx, %eax
sarl $2, %eax
jle .L28

View File

@@ -62,7 +62,7 @@
#define PREFETCHSIZE (8 * 21 + 4)
#endif
#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL)
#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX)
#define PREFETCH prefetcht0
#define PREFETCHSIZE (8 * 21 + 4)
#endif

View File

@@ -62,7 +62,7 @@
#define PREFETCHSIZE (8 * 21 + 4)
#endif
#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL)
#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX)
#define PREFETCH prefetcht0
#define PREFETCHSIZE (8 * 21 + 4)
#endif

View File

@@ -62,7 +62,7 @@
#define PREFETCHSIZE (8 * 21 + 4)
#endif
#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL)
#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX)
#define PREFETCH prefetcht0
#define PREFETCHSIZE (8 * 21 + 4)
#endif

View File

@@ -62,7 +62,7 @@
#define PREFETCHSIZE (8 * 21 + 4)
#endif
#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL)
#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX)
#define PREFETCH prefetcht0
#define PREFETCHSIZE (8 * 21 + 4)
#endif

View File

@@ -62,7 +62,7 @@
#define PREFETCHSIZE (8 * 21 + 4)
#endif
#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL)
#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX)
#define PREFETCH prefetcht0
#define PREFETCHSIZE (8 * 21 + 4)
#endif

View File

@@ -62,7 +62,7 @@
#define PREFETCHSIZE (8 * 21 + 4)
#endif
#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL)
#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX)
#define PREFETCH prefetcht0
#define PREFETCHSIZE (8 * 21 + 4)
#endif

View File

@@ -61,7 +61,7 @@
#define PREFETCHSIZE 84
#endif
#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL)
#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX)
#define PREFETCH prefetcht1
#define PREFETCHSIZE 84
#endif

View File

@@ -63,7 +63,7 @@
#define PREFETCHSIZE 84
#endif
#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL)
#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX)
#define PREFETCH prefetcht1
#define PREFETCHSIZE 84
#endif

View File

@@ -61,7 +61,7 @@
#define PREFETCHSIZE 84
#endif
#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL)
#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX)
#define PREFETCH prefetcht1
#define PREFETCHSIZE 84
#endif

View File

@@ -63,7 +63,7 @@
#define PREFETCHSIZE 84
#endif
#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL)
#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX)
#define PREFETCH prefetcht1
#define PREFETCHSIZE 84
#endif

View File

@@ -61,7 +61,7 @@
#define PREFETCHSIZE 84
#endif
#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL)
#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX)
#define PREFETCH prefetcht1
#define PREFETCHSIZE 84
#endif

View File

@@ -0,0 +1,19 @@
include $(KERNELDIR)/KERNEL.HASWELL
SGEMMKERNEL = sgemm_kernel_16x4_skylakex.S
DTRMMKERNEL = ../generic/trmmkernel_16x2.c
DGEMMKERNEL = dgemm_kernel_16x2_skylakex.S
DGEMMINCOPY = ../generic/gemm_ncopy_16.c
DGEMMITCOPY = ../generic/gemm_tcopy_16.c
DGEMMONCOPY = ../generic/gemm_ncopy_2.c
DGEMMOTCOPY = ../generic/gemm_tcopy_2.c
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
SGEMM_BETA = ../generic/gemm_beta.c
DGEMM_BETA = ../generic/gemm_beta.c

View File

@@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "caxpy_microk_steamroller-2.c"
#elif defined(BULLDOZER)
#include "caxpy_microk_bulldozer-2.c"
#elif defined(HASWELL) || defined(ZEN)
#elif defined(HASWELL) || defined(ZEN) || defined(SKYLAKEX)
#include "caxpy_microk_haswell-2.c"
#elif defined(SANDYBRIDGE)
#include "caxpy_microk_sandy-2.c"

View File

@@ -50,11 +50,11 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"vmulps (%5), %%ymm0 , %%ymm0 \n\t"
#endif
".align 16 \n\t"
".p2align 4 \n\t"
"1: \n\t"
"vmovups (%2,%0,4), %%ymm5 \n\t" // 4 complex values from x
".align 2 \n\t"
".p2align 1 \n\t"
"vmovups 32(%2,%0,4), %%ymm7 \n\t" // 4 complex values from x
"vmovups 64(%2,%0,4), %%ymm9 \n\t" // 4 complex values from x
"vmovups 96(%2,%0,4), %%ymm11 \n\t" // 4 complex values from x
@@ -70,7 +70,7 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"vpermilps $0xb1 , %%ymm11, %%ymm10 \n\t" // exchange real and imag part
"vfmadd213ps (%3,%0,4), %%ymm0 , %%ymm5 \n\t"
".align 2 \n\t"
".p2align 1 \n\t"
"vfmadd213ps 32(%3,%0,4), %%ymm0 , %%ymm7 \n\t"
"vfmadd213ps 64(%3,%0,4), %%ymm0 , %%ymm9 \n\t"
"vfmadd213ps 96(%3,%0,4), %%ymm0 , %%ymm11 \n\t"
@@ -96,7 +96,7 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"vfmadd231ps %%ymm1 , %%ymm10, %%ymm15 \n\t"
"vmovups %%ymm5 , (%3,%0,4) \n\t"
".align 2 \n\t"
".p2align 1 \n\t"
"vmovups %%ymm7 , 32(%3,%0,4) \n\t"
"vmovups %%ymm9 , 64(%3,%0,4) \n\t"
"vmovups %%ymm11, 96(%3,%0,4) \n\t"

Some files were not shown because too many files have changed in this diff Show More