Compare commits

..

285 Commits

Author SHA1 Message Date
Martin Kroeker
898212efcd Actually add the message to the TLS section 2021-08-02 14:50:14 +02:00
Martin Kroeker
210a1584c5 Rebase source and edit TLS version of the message as well 2021-08-02 14:19:16 +02:00
Martin Kroeker
f2a7a67f5a Improve the "tried to allocate too many buffers" error message 2021-07-31 17:23:40 +02:00
Martin Kroeker
e0e88f9edc Merge pull request #3329 from martin-frbg/issue3272
Work around gcc11+ miscompiling C/ZBLAS3 tests at -O3
2021-07-30 20:39:38 +02:00
Martin Kroeker
5dc6aa74f0 Disable gfortran tree vectorizer to avoid gcc11+ miscompilation at O3 2021-07-30 14:46:19 +02:00
Martin Kroeker
e78fbe4654 Disable gfortran tree vectorizer to avoid gcc11+ miscompilation at O3 2021-07-30 14:44:54 +02:00
Martin Kroeker
b4f4ed378b Disable gfortran tree vectorizer to avoid gcc11+ miscompilation at O3 2021-07-30 14:21:08 +02:00
Martin Kroeker
cbc41973fd Disable gfortran tree vectorizer to avoid gcc11+ miscompilation at O3 2021-07-30 14:20:12 +02:00
Martin Kroeker
1b6db3dbba Merge pull request #3327 from h-vetinari/lapack597_redux
Complete the carry of lapack PR 597
2021-07-28 23:04:02 +02:00
Martin Kroeker
f681553c6a Merge pull request #3326 from wattoc/develop
Include Haiku in processor count checks
2021-07-28 23:03:37 +02:00
Martin Kroeker
afadeeba2a Merge pull request #3325 from gxw-loongson/develop
Add support for LOONGARCH64
2021-07-28 23:03:15 +02:00
Isuru Fernando
02d4a49761 Also make sure the 1 is INTEGER*4 for OMP_SET_NUM_THREADS 2021-07-27 23:44:51 +02:00
Craig Watson
4d7dfe4845 Include Haiku in processor count checks 2021-07-27 09:00:30 +00:00
gxw
af0a69f355 Add support for LOONGARCH64 2021-07-27 15:29:12 +08:00
Martin Kroeker
5a2fe5bfb9 Merge pull request #3323 from martin-frbg/issue3322
GCC did not support -mtune for ARM64 before 5.1
2021-07-23 22:46:02 +02:00
Martin Kroeker
342d3e8b5c Merge pull request #3314 from martin-frbg/lapack597
Fix LAPACK testsuite compatibility with libomp (Reference-LAPACK PR 597)
2021-07-23 15:30:27 +02:00
Martin Kroeker
efbd7c7840 GCC did not support -mtune for ARM64 before 5.1 2021-07-23 13:42:52 +02:00
Martin Kroeker
3a7955cd93 Merge pull request #3320 from martin-frbg/issue3318
Empirical workaround for numpy SVD NaN problem from issue 3318
2021-07-22 21:28:50 +02:00
Martin Kroeker
47ba85f314 Fix regex to match kernels suffixed with cpuname too 2021-07-22 17:24:15 +02:00
Martin Kroeker
30f23be0f9 Rework setting of -mfma to only apply it where necessary 2021-07-22 12:00:03 +02:00
Martin Kroeker
49bbf330ca Empirical workaround for numpy SVD NaN problem from issue 3318 2021-07-18 22:19:19 +02:00
Martin Kroeker
38d5b4b124 Update version to 0.3.17.dev 2021-07-15 15:00:01 +02:00
Martin Kroeker
6e3fbe8ac5 Update version to 0.3.17.dev 2021-07-15 14:59:15 +02:00
Martin Kroeker
86273392e5 Merge pull request #3317 from xianyi/release-0.3.0
merge 0.3.17 back into develop to copy tag
2021-07-15 14:58:20 +02:00
Martin Kroeker
d909f9f3d4 Update version to 0.3.17 2021-07-15 14:52:54 +02:00
Martin Kroeker
12d3d94e2e Merge pull request #3316 from xianyi/develop
Merge develop for bugfix release 0.3.17
2021-07-15 14:51:50 +02:00
Martin Kroeker
f349be3bdb Merge branch 'release-0.3.0' into develop 2021-07-15 14:50:20 +02:00
Martin Kroeker
4777eb678f Update version to 0.3.17 2021-07-15 14:46:24 +02:00
Martin Kroeker
415876d117 Merge pull request #3315 from martin-frbg/changelog0317
Update Changelog for 0.3.17
2021-07-15 14:44:59 +02:00
Martin Kroeker
da8435dc36 Update Changelog for 0.3.17 2021-07-15 14:44:17 +02:00
Martin Kroeker
4c7065f3ee Merge pull request #3313 from martin-frbg/3266-2
Remove BLASLONG casts from SPARC parameter entries
2021-07-15 08:00:57 +02:00
Martin Kroeker
f62bfaafe8 Merge pull request #3312 from martin-frbg/revert_3260
Temporarily disable the SkylakeX sgemv_t microkernel
2021-07-15 08:00:34 +02:00
Martin Kroeker
d947116390 Merge pull request #3311 from martin-frbg/issue3309
Revert PR #3250 (shortcut without buffer allocation) as it is unsafe …
2021-07-15 07:58:47 +02:00
Martin Kroeker
f176ff90af Declare N_THREADS as *4 for compatibility of INTERFACE64 builds with LLVM libomp 2021-07-14 22:42:43 +02:00
Martin Kroeker
f4d4abd423 Declare N_THREADS as *4 for compatibility of INTERFACE64 builds with LLVM libomp 2021-07-14 22:41:45 +02:00
Martin Kroeker
2b9443b7e7 Declare N_THREADS as *4 for compatibility of INTERFACE64 builds with LLVM libomp 2021-07-14 22:40:29 +02:00
Martin Kroeker
fe0e66564e Declare N_THREADS as *4 for compatibility of INTERFACE64 builds with LLVM libomp 2021-07-14 22:39:20 +02:00
Martin Kroeker
a6351e32f0 Remove BLASLONG casts from SPARC entries
in response to https://github.com/xianyi/OpenBLAS/pull/3266#issuecomment-878637675
2021-07-14 21:09:36 +02:00
Martin Kroeker
5b4b385ecf Temporarily disable the SkylakeX sgemv_t microkernel due to LAPACK testsuite failures 2021-07-14 20:50:14 +02:00
Martin Kroeker
1dea57ab25 Revert PR #3250 (shortcut without buffer allocation) as it is unsafe on some x86_64 2021-07-14 20:32:57 +02:00
Martin Kroeker
54ffe280df Merge pull request #3310 from jeromerobert/develop
Avoid redefinition of _GNU_SOURCE
2021-07-14 16:06:13 +02:00
Jerome Robert
029d1e16b9 Avoid redefinition of _GNU_SOURCE
* _GNU_SOURCE may have been set by the application, and redefinition
  triggers warnings, or errors with -Werror
* Fix for 220f6a1c5
2021-07-14 12:21:58 +02:00
Martin Kroeker
ea8e208029 Merge pull request #3306 from jonaszhou1/develop
Add cpu detection support for Zhaoxin processors
2021-07-12 16:30:10 +02:00
JonasZhou
0fca36c8c3 Add cpu detection support for Zhaoxin processors
Signed-off-by: JonasZhou <JonasZhou@zhaoxin.com>
2021-07-12 13:43:45 +08:00
Martin Kroeker
44cc7cdecc Update version to 0.3.16.dev 2021-07-12 00:16:59 +02:00
Martin Kroeker
6492131792 Update version to 0.3.16.dev 2021-07-12 00:16:29 +02:00
Martin Kroeker
6c8ec55fb7 Merge pull request #3305 from xianyi/release-0.3.0
Merge release branch back into develop to copy tag
2021-07-12 00:15:52 +02:00
Martin Kroeker
fab746240c Merge pull request #3304 from xianyi/develop
Merge develop into 0.3.0 for release 0.3.16
2021-07-12 00:12:52 +02:00
Martin Kroeker
847607c768 Merge branch 'release-0.3.0' into develop 2021-07-12 00:12:25 +02:00
Martin Kroeker
4c81d1c3fe Update version to 0.3.16 2021-07-12 00:09:35 +02:00
Martin Kroeker
db4908ebfa Update version to 0.3.16 2021-07-12 00:08:55 +02:00
Martin Kroeker
ed3eb18cb2 Merge pull request #3303 from martin-frbg/changelog16
Update Changelog for 0.3.16
2021-07-11 23:50:02 +02:00
Martin Kroeker
239ff330f8 Update Changelog for 0.3.16 2021-07-11 23:48:39 +02:00
Martin Kroeker
19c81a07cb Merge pull request #3300 from martin-frbg/AzureAlpine
Move Alpine Linux build job from Travis to Azure
2021-07-11 22:50:20 +02:00
Martin Kroeker
e008646ba9 Merge pull request #3302 from martin-frbg/small_cleanup
Clean up some warnings
2021-07-11 22:26:41 +02:00
Martin Kroeker
498479b13e Update azure-pipelines.yml 2021-07-11 18:29:17 +02:00
Martin Kroeker
b4cbfe6677 Update azure-pipelines.yml 2021-07-11 18:08:30 +02:00
Martin Kroeker
be1a42507c Merge pull request #3297 from outerpassage/develop
fix compilation with musl libc
2021-07-11 17:10:20 +02:00
Martin Kroeker
7bb59fceb7 Clean up some warnings 2021-07-11 16:00:29 +02:00
Martin Kroeker
eba2cd951e Revert addition of test_install 2021-07-11 14:38:49 +02:00
Martin Kroeker
836c7fb9f5 Revert addition of test_install target 2021-07-11 14:37:38 +02:00
Martin Kroeker
d2693eac04 Update azure-pipelines.yml 2021-07-11 11:54:02 +02:00
Martin Kroeker
8acb6fe3a8 Update azure-pipelines.yml 2021-07-11 11:29:52 +02:00
Martin Kroeker
c47e35acee Update azure-pipelines.yml 2021-07-11 09:38:48 +02:00
Martin Kroeker
a27a61bb9a Update azure-pipelines.yml 2021-07-11 08:24:20 +02:00
Martin Kroeker
69560ad3ce Update azure-pipelines.yml 2021-07-11 07:25:07 +02:00
Martin Kroeker
b2319fd97a Merge pull request #3301 from martin-frbg/syr2bench
Handle OPENBLAS_LOOPS in SYR2 benchmark
2021-07-11 07:20:19 +02:00
Martin Kroeker
0266ba7cb6 Update azure-pipelines.yml 2021-07-10 23:21:58 +02:00
Martin Kroeker
7e09570e04 Update azure-pipelines.yml 2021-07-10 22:41:49 +02:00
Martin Kroeker
14e33e0f7e Handle OPENBLAS_LOOPS in SYR2 benchmark 2021-07-10 21:27:53 +02:00
Martin Kroeker
db57c449dc Update azure-pipelines.yml 2021-07-10 20:57:21 +02:00
Martin Kroeker
993e56b7b3 Merge pull request #3299 from martin-frbg/issue3298
Fix copy-paste error in LIBCORE assignment for Tiger Lake
2021-07-10 20:48:53 +02:00
Martin Kroeker
c9304199cf Update azure-pipelines.yml 2021-07-10 20:12:33 +02:00
Martin Kroeker
d86290edf0 add sudo for install in Alpine 2021-07-10 19:52:04 +02:00
Martin Kroeker
89429fdaa2 fix typo 2021-07-10 19:03:42 +02:00
Martin Kroeker
d511063098 Move Alpine Linux build job from Travis to Azure 2021-07-10 18:52:44 +02:00
Martin Kroeker
4f4e286bf6 Fix copy-paste error in LIBCORE assignment for Tiger Lake 2021-07-10 18:20:40 +02:00
River Dillon
ddb6cee0d5 Contribution note 2021-07-10 01:34:47 -07:00
River Dillon
cecc2c65aa Add test of installed <openblas_config.h> 2021-07-10 01:26:05 -07:00
River Dillon
220f6a1c55 Add feature test macro for proper inclusion of <sched.h> 2021-07-10 00:38:02 -07:00
River Dillon
2f6326a630 Remove <linux/unistd.h> 2021-07-10 00:36:07 -07:00
Martin Kroeker
c0d0406b97 Merge pull request #3296 from martin-frbg/issue3295
Support Zhaoxin/Centaur family 7 processors as Nehalem
2021-07-08 21:24:15 +02:00
Martin Kroeker
8f22ac552b Add vendor string Shanghai as successor to Centaur 2021-07-08 18:28:49 +02:00
Martin Kroeker
da623ae838 Add vendor string Shanghai as the successor to Centaur 2021-07-08 18:26:23 +02:00
Martin Kroeker
eb2fdd3af0 Recognize newer Zhaoxin/Centaur processors as Nehalem 2021-07-08 12:23:15 +02:00
Martin Kroeker
0d8d261dd4 Recognize newer Zhaoxin/Centaur cpus as Nehalem 2021-07-08 12:20:19 +02:00
Martin Kroeker
40caaef052 Merge pull request #3265 from TAAPArthur/improve_portability
Removed use of non portable '-p' arg to install
2021-07-07 20:58:29 +02:00
Martin Kroeker
25b602d8a6 Merge pull request #3293 from martin-frbg/issue3290
Enable (C)EXTRALIB as for any other platform when building the tests on RISCV C910V
2021-07-07 20:46:54 +02:00
Martin Kroeker
4ed99c2ce3 Merge pull request #3292 from martin-frbg/syrk_limit
Add lower limit for multithreading in xSYRK
2021-07-07 20:46:28 +02:00
Martin Kroeker
f20c4edc33 Merge pull request #3288 from martin-frbg/getrf-2
Add lower threshold for multithreading in ?GETRF
2021-07-07 20:45:57 +02:00
Martin Kroeker
3cfdb1770c Remove code that disabled EXTRALIB on RISCV C910V 2021-07-06 20:21:07 +02:00
Martin Kroeker
8186963d8c Add lower limit for multithreading 2021-07-04 17:00:26 +02:00
Martin Kroeker
a4543e4918 Handle OPENBLAS_LOOP 2021-07-04 16:59:43 +02:00
Martin Kroeker
2376aa1e8c Merge pull request #3289 from martin-frbg/issue3283
Update README to mention availability of the Windows binaries in the Releases section
2021-07-02 00:19:06 +02:00
Martin Kroeker
4620f98812 Mention availability of the Windows binaries in the Releases section 2021-07-01 19:24:35 +02:00
Martin Kroeker
726c44242b Add lower threshold for multithreading 2021-07-01 17:41:05 +02:00
Martin Kroeker
dcfc5cf714 Handle OPENBLAS_LOOPS for more stable results 2021-07-01 17:39:37 +02:00
Martin Kroeker
06e3b07ecb Handle OPENBLAS_LOOPS and OPENBLAS_TEST options 2021-07-01 17:38:45 +02:00
Martin Kroeker
623be6600a Merge pull request #3284 from martin-frbg/potrf_potri
Add lower thresholds for multithreading in POTRF/POTRI and improve the related benchmark
2021-06-30 07:42:45 +02:00
Martin Kroeker
7ddc9d384c Merge pull request #3287 from martin-frbg/appveyor-conda
Work around current conda/tqdm auto-update problem on Appveyor
2021-06-29 20:09:26 +02:00
Martin Kroeker
6ebcce229f Work around current conda/tqdm auto-update problem 2021-06-29 17:17:34 +02:00
Martin Kroeker
1b5620b66e Add lower threshold for multithreading in ?potrf and ?potri 2021-06-26 23:47:41 +02:00
Martin Kroeker
1f8bda71b9 Add OPENBLAS_LOOPS support to potrf/potrs/potri benchmark 2021-06-26 23:46:00 +02:00
Martin Kroeker
3be660c000 Add interface declarations for ?potri 2021-06-26 23:44:56 +02:00
Martin Kroeker
1a8b6134c2 Merge pull request #3278 from brada4/A55
Add CORTEXA55 cpuid 0xd05 support
2021-06-23 13:05:17 +02:00
Martin Kroeker
f0b822a709 Update cpuid_arm64.c 2021-06-23 10:11:01 +02:00
User User-User
130327e9af OK 2021-06-22 23:58:59 +02:00
User User-User
750719528a bugz 2021-06-20 16:40:43 +02:00
User User-User
91e2b11d3c add to cmake listings too 2021-06-20 15:32:42 +02:00
User User-User
548aa522e5 remove misplaced file 2021-06-20 15:29:25 +02:00
User User-User
6423b282a1 dynamic_arch 2021-06-20 14:19:41 +02:00
User User-User
9335d42740 add gcc8 version matching 2021-06-19 22:21:39 +02:00
User User-User
39ef0880ae copy conf 2021-06-19 21:49:58 +02:00
User User-User
b7da75e4fd WiP CORTEX A55 support 2021-06-19 21:37:51 +02:00
Martin Kroeker
a7627c5afd Merge pull request #3276 from martin-frbg/issue3274
Add workaround for another macro name collision with Windows 10 SDK winnt.h
2021-06-16 16:37:30 +02:00
Martin Kroeker
9499ab0d45 Merge pull request #3275 from martin-frbg/lapack580
Fix missing EXTERNAL declarations in LAPACK TESTING (LAPACK PR 580)
2021-06-16 13:41:38 +02:00
Martin Kroeker
307c4c0786 Fix typo 2021-06-16 13:41:16 +02:00
Martin Kroeker
e83df93975 Work around another recent macro name collision with winnt.h 2021-06-16 12:32:34 +02:00
Martin Kroeker
13fa9f737d Modify defines for CR and RC to work around name collision on Windows 2021-06-16 12:17:25 +02:00
Martin Kroeker
5958ffc9b6 Declare DZASUM as EXTERNAL 2021-06-16 09:43:39 +02:00
Martin Kroeker
cd0e4aadb1 Declare ZDROT as EXTERNAL 2021-06-16 09:41:18 +02:00
Martin Kroeker
e2621ef93a Declare SROT as EXTERNAL 2021-06-16 09:40:15 +02:00
Martin Kroeker
9e1b43ea9b Declare DROT as EXTERNAL 2021-06-16 09:39:28 +02:00
Martin Kroeker
5269348178 Declare CSROT as EXTERNAL 2021-06-16 09:35:12 +02:00
Martin Kroeker
92e024bbb3 Declare SCASUM as EXTERNAL 2021-06-16 09:33:23 +02:00
Martin Kroeker
c4b464cac6 Merge pull request #3273 from austinpagan/sbgemm_gcc10_fix
Power10: Fix for SBGEMM
2021-06-15 22:58:48 +02:00
Gordon Fossum
e6dd44d989 Power10: Fix for SBGEMM
While testing bfloat16 sbgemm kernel, there are some failures for odd value inputs due to updating result for
additional bytes.
2021-06-15 13:07:47 -05:00
Martin Kroeker
baf03a0937 Merge pull request #3252 from martin-frbg/more_shortcuts
Further shortcuts for (small) cases that do not need buffer allocation
2021-06-15 16:14:20 +02:00
Martin Kroeker
7aab5e826c Merge pull request #3250 from martin-frbg/gemv-shortcut
Add shortcut for small-size S/D GEMV_N with increments of one
2021-06-15 14:50:14 +02:00
Martin Kroeker
29417adf4c Merge pull request #3270 from ggouaillardet/topic/dznrm2_tx2
arm64: add the missing d9 register to the clobber list
2021-06-14 13:00:33 +02:00
Gilles Gouaillardet
9d292d37b2 arm64: add the missing d9 register to the clobber list
Refs. numpy/numpy#18422

Signed-off-by: Gilles Gouaillardet <gilles@rist.or.jp>
2021-06-14 17:01:28 +09:00
Martin Kroeker
2e8ff4a781 Merge pull request #3266 from martin-frbg/powerparam
Remove spurious casts from PPC parameters and fix compilation for older targets
2021-06-10 18:05:47 +02:00
Martin Kroeker
dbba381dc3 Merge pull request #3260 from intelmy/sgemv_t_opt
Optimized sgemv_t for small N based on AVX512
2021-06-10 16:08:24 +02:00
Martin Kroeker
f61991d439 Merge pull request #3264 from RajalakshmiSR/sbgemmp10
POWER10: Fixes for sbgemm kernel
2021-06-10 16:07:47 +02:00
Martin Kroeker
efdbdd8f82 Add prefetch values for power3 2021-06-10 11:20:29 +02:00
Martin Kroeker
3906ef3b0f Add prefetch values for power3 2021-06-10 11:19:40 +02:00
Martin Kroeker
8adf0971d8 Add prefetch values for power3 2021-06-10 11:18:22 +02:00
Martin Kroeker
08e2e60762 Add prefetch values for power3 2021-06-10 11:17:33 +02:00
Martin Kroeker
fb9e678235 Fix caxpy/zaxpy for big-endian 2021-06-10 11:15:48 +02:00
Martin Kroeker
dc4fcb48df Fix inverted conditional for caxpy/zaxpy 2021-06-10 11:14:03 +02:00
Martin Kroeker
7a48247761 fix c/zrot and sgemv for POWER5 2021-06-10 11:11:56 +02:00
Martin Kroeker
7dfc45e840 Remove casts for PPC/POWER and complete parameters for POWER3/4 2021-06-10 11:09:50 +02:00
Arthur Williams
7fb6e576c2 Removed use of non portable '-p' arg to install
Not all versions of install support '-p' flag and it isn't worth failing
the build if the installed files' timestamps get updated.
2021-06-09 20:50:36 -05:00
Rajalakshmi Srinivasaraghavan
cbb70438df POWER10: Fixes for sbgemm kernel
While testing bfloat16 sbgemm kernel, there are some failures
for odd value inputs due to array access beyond the boundary.
2021-06-09 12:20:09 -05:00
Ma, Yu
706a08d4a0 Optimized sgemv_t for small N based on AVX512 2021-06-08 15:08:28 -04:00
Zhang Xianyi
9f3d903817 Merge pull request #3259 from zhaofengli/riscv64-fixes
riscv64 fixes
2021-06-08 16:26:56 +08:00
Zhaofeng Li
590be3fae3 riscv64: Add Makefile 2021-06-07 22:55:56 +00:00
Zhaofeng Li
3521cd48cb RISCV64_GENERIC: Use generic kernel for DSDOT for better precision
The implementation in `riscv64/dot.c` fails the `test_dsdot` test, and
the generic kernel seems to have better precision. Tested on SiFive
FU740 (HiFive Unmatched) and QEMU.

Also see #1469.
2021-06-07 22:50:23 +00:00
Zhaofeng Li
1e0192a5cc riscv64/imin: Fix wrong comparison
Same as #1990.
2021-06-07 22:49:39 +00:00
Martin Kroeker
fe9aff17fe Merge pull request #3258 from martin-frbg/hbaction
revert "try to work around gcc update problems" in Homebrew workflow
2021-06-06 22:15:29 +02:00
Martin Kroeker
8c25b440a0 revert "try to work around gcc update problems"
...as homebrew has dropped at least gcc8 now
2021-06-06 19:17:36 +02:00
Martin Kroeker
f84197c1a7 Add shortcuts for (small) cases that do not need expensive buffer allocation 2021-05-29 22:28:00 +02:00
Martin Kroeker
734bd265a8 revert symv changes for now 2021-05-29 15:40:03 +02:00
Martin Kroeker
1217eb910d Fix copy-paste errors in variables used 2021-05-28 09:38:48 +02:00
Martin Kroeker
d6d7a6685d Add shortcuts for (small) cases that do not need expensive buffer allocation 2021-05-27 22:39:18 +02:00
Martin Kroeker
f0e7345fb8 Add shortcut for small-size gemv_n with increments of one 2021-05-26 22:02:34 +02:00
Martin Kroeker
42f048cf6c Merge pull request #3249 from MikaelUrankar/develop
Fix typo
2021-05-26 15:26:30 +02:00
MikaelUrankar
4fbc0777f4 Fix typo 2021-05-26 12:14:57 +02:00
Martin Kroeker
d7472606d5 Merge pull request #3244 from martin-frbg/issue3237
Add fast path for small xSYR with INCX==1
2021-05-22 22:38:09 +02:00
Martin Kroeker
03297ff9f0 Add fast path for small xSYR with INCX==1 2021-05-22 20:41:18 +02:00
Martin Kroeker
2d8d0af0ea Merge pull request #3243 from martin-frbg/lapack564
Fix spurious error exit test failures in the ?chktsqr tests (LAPACK564)
2021-05-22 19:25:56 +02:00
Martin Kroeker
5f677e782e Merge pull request #3196 from guowangy/skylakex-gemm-batch-k
GEMM: skylake: improve the performance when m is small
2021-05-22 19:25:28 +02:00
Martin Kroeker
04c60cee5d Merge pull request #3242 from martin-frbg/issue3239
Handle inadvertent use of DYNAMIC_ARCH=0
2021-05-22 19:24:46 +02:00
Martin Kroeker
3a53207cc9 Fix spurious error exit test failures in the ?chktsqr tests (LAPACK564) 2021-05-22 14:29:45 +02:00
Martin Kroeker
0e73d20629 Handle inadvertent use of DYNAMIC_ARCH=0 2021-05-22 14:23:49 +02:00
Martin Kroeker
02087a62e7 Merge pull request #3205 from intelmy/sgemv_n_opt
optimize on sgemv_n for small n
2021-05-17 17:49:01 +02:00
Martin Kroeker
03b4d79a7e Merge pull request #3238 from martin-frbg/lapack555
Correct function name in error message from SLASQ2 (LAPACK PR555)
2021-05-17 17:32:23 +02:00
Martin Kroeker
5c729c6dce Correct function name in error message from SLASQ2 (Reference-LAPACK PR 555) 2021-05-17 14:47:14 +02:00
Martin Kroeker
e1911b2e60 Merge pull request #3236 from martin-frbg/issue3234
Add -lm for FreeBSD on ARM/ARM64
2021-05-16 17:17:18 +02:00
Martin Kroeker
8f33da4f94 Merge pull request #3235 from dnoan/develop
Update Makefile.arm64
2021-05-16 17:15:45 +02:00
Martin Kroeker
26ccf643a3 Add -lm for FreeBSD on ARM/ARM64 2021-05-16 13:04:38 +02:00
Noan
32264ba496 Update Makefile.arm64
Added -march and -mtune flags for EMAG processors when GCC 9 or later
2021-05-16 09:49:13 +00:00
Martin Kroeker
4ecf631f95 Merge pull request #3228 from martin-frbg/issue3226
filter out -mavx flag on Sandybridge zgemm/ztrmm kernels
2021-05-15 09:06:12 +02:00
Martin Kroeker
5af510081d Merge pull request #3233 from martin-frbg/issue3230
Add autodetection for Intel Ice Lake SP
2021-05-15 01:04:09 +02:00
Martin Kroeker
164551d5a2 Merge pull request #3232 from martin-frbg/lapack553
Reduce stack size requirements in the LAPACK LIN tests (LAPACK PR 553)
2021-05-14 23:28:45 +02:00
Martin Kroeker
310b76aad7 Merge pull request #3231 from martin-frbg/issue3227
Support compilation with pre-C99 versions of MSVC
2021-05-14 23:28:06 +02:00
Martin Kroeker
c4da892ba0 Only filter out -mavx on Sandybridge ZGEMM/ZTRMM kernels 2021-05-14 23:19:10 +02:00
Martin Kroeker
cbfd3c87e1 Recognize Intel Ice Lake SP as Cooper Lake 2021-05-14 20:44:06 +02:00
Martin Kroeker
26e87ac517 Support Intel Ice Lake SP as Cooper Lake 2021-05-14 20:39:55 +02:00
Martin Kroeker
15b9d6b4a7 Delete zchkaa.f 2021-05-14 19:55:31 +02:00
Martin Kroeker
f7bcd962c1 Delete schkaa.f 2021-05-14 19:54:54 +02:00
Martin Kroeker
93cc066921 Delete dchkaa.f 2021-05-14 19:54:13 +02:00
Martin Kroeker
2c7d4a7766 Delete cchkaa.f 2021-05-14 19:53:38 +02:00
Martin Kroeker
eef1c42f03 Convert ?chkaa to use dynamic allocation for the larger arrays 2021-05-14 19:53:03 +02:00
Martin Kroeker
73f637e584 Support compilation with pre-C99 versions of MSVC 2021-05-14 15:08:12 +02:00
Martin Kroeker
8b90e5f202 Drop redundant inclusion of complex.h 2021-05-14 15:06:44 +02:00
Martin Kroeker
bd60fb6ffc filter out -mavx flag on zgemm kernels as it can cause problems with older gcc 2021-05-13 23:05:00 +02:00
Martin Kroeker
37ea8702ee Merge pull request #3192 from damonyu1989/develop
Update the intrinsic api to the official name.
2021-05-11 16:00:45 +02:00
Martin Kroeker
ec7d6c02bc Add an Android crossbuild on OSX to Azure CI (#3224)
* Add an Android crossbuild on OSX
2021-05-10 08:02:01 +02:00
Martin Kroeker
c90c23e78f Merge pull request #3223 from martin-frbg/develop
Use percent instead of ampersand as placeholder for substitutions
2021-05-07 08:51:45 +02:00
Martin Kroeker
bda8820da7 Use percent instead of ampersand as placeholder for substitutions 2021-05-06 20:20:08 +02:00
Martin Kroeker
c0ca63ea46 Fix missing conditionals for non-SKX kernels 2021-05-05 14:55:36 +02:00
Martin Kroeker
f497bb949b Merge pull request #3219 from austinpagan/Gemm.ErrorFix
Add error message token for SBGEMM in gemm.c
2021-05-05 14:30:41 +02:00
Martin Kroeker
f86b1bc3da Merge pull request #3220 from drhpc/drhpc-fixup
Delete lapack_wrappers.c.orig
2021-05-05 14:30:24 +02:00
drhpc
206e03fdac Delete lapack_wrappers.c.orig
This looks like a leftover from patching and confuses further patching;-)
2021-05-04 21:02:07 +02:00
Gordon Fossum
8b599836db Add error message token for SBGEMM in gemm.c 2021-05-04 13:55:02 -05:00
Martin Kroeker
9721b57ecf Update version to 0.3.15.dev 2021-05-03 00:01:08 +02:00
Martin Kroeker
380f955078 Update version to 0.3.15.dev 2021-05-03 00:00:29 +02:00
Martin Kroeker
49d18e65e3 Merge pull request #3217 from xianyi/release-0.3.0
merge 0.3.15 back into develop to copy tag
2021-05-02 23:59:55 +02:00
Martin Kroeker
904f9a267d Update version to 0.3.15 2021-05-02 23:50:22 +02:00
Martin Kroeker
4c033730bb Update version to 0.3.15 2021-05-02 23:49:49 +02:00
Martin Kroeker
65502c6af6 Merge pull request #3216 from xianyi/develop
Update from develop for 0.3.15 release
2021-05-02 23:48:28 +02:00
Martin Kroeker
f71627fa2e Merge pull request #3215 from martin-frbg/cl0315
Update Changelog for 0.3.15
2021-05-02 23:47:24 +02:00
Martin Kroeker
d8d7bd33cb Update Changelog for 0.3.15 2021-05-02 23:46:55 +02:00
Martin Kroeker
e72420e8c5 Merge pull request #3214 from martin-frbg/lapack-3.9.1hrt
Add new Householder Reconstruction functions from LAPACK 3.9.1
2021-05-02 23:40:03 +02:00
Martin Kroeker
d00709e016 Add files via upload 2021-05-02 20:47:58 +02:00
Martin Kroeker
d444344497 Add LAPACKE interfaces for the new Householder Reconstruction functions from 3.9.1 2021-05-02 19:57:47 +02:00
Martin Kroeker
fb7308b9b5 Add entries for the new Householder Reconstruction functions from 3.9.1 2021-05-02 19:56:11 +02:00
Martin Kroeker
db50b24a4a Add entries for the new Householder Reconstruction functions from 3.9.1 2021-05-02 19:55:15 +02:00
Martin Kroeker
88b70fba3e Add new tests for Householder reconstruction functions from 3.9.1 2021-05-02 19:28:21 +02:00
Martin Kroeker
4c1d47098b Add new files for Householder reconstruction functions from 3.9.1 2021-05-02 19:25:43 +02:00
Martin Kroeker
40000d1f64 Add entries for Householder reconstruction functions from 3.9.1 2021-05-02 19:21:59 +02:00
Martin Kroeker
dc3664993c Merge pull request #26 from xianyi/develop
rebase
2021-05-02 19:19:28 +02:00
Martin Kroeker
b8232c9054 Merge pull request #3213 from martin-frbg/lapack382
Avoid allocating the transposed triangular matrix in LAPACKE_xlantr_work (Reference-LAPACK 382)
2021-05-02 18:45:15 +02:00
Martin Kroeker
114bbbc6d7 Merge pull request #3212 from martin-frbg/lapack463
Initialize X and Y to zero for N=0 in xGGGLM (Reference-LAPACK PR463)
2021-05-02 18:44:59 +02:00
Martin Kroeker
b67a92c19f Merge pull request #3211 from martin-frbg/lapack471
Handle norm NaN value in xGESDD (Reference LAPACK PR471)
2021-05-02 18:44:29 +02:00
Martin Kroeker
4bf00da8fb Avoid allocating the transposed triangular matrix (Reference-LAPACK PR382) 2021-05-02 12:18:17 +02:00
Martin Kroeker
c26780d451 Initialize X and Y to zero for N=0 (Reference-LAPACK PR463) 2021-05-02 11:40:56 +02:00
Martin Kroeker
d77d9bc920 Handle norm NaN value (Reference LAPACK PR471) 2021-05-02 11:24:50 +02:00
Martin Kroeker
37d3e2bd94 Merge pull request #3210 from martin-frbg/lapack502
Fix possible division by zero in LAPACK xTGSJA (Reference-LAPACK PR502)
2021-05-02 09:02:11 +02:00
Martin Kroeker
de8656769c Fix possible division by zero in xTGSJA (Reference-LAPACK PR502) 2021-05-01 21:31:13 +02:00
Martin Kroeker
d43e07198d Merge pull request #3208 from martin-frbg/lapack534
Apply MKL team fixes to the LAPACKE interfaces (Reference-LAPACK PR 534)
2021-05-01 20:18:29 +02:00
Martin Kroeker
da16764c7a Merge pull request #3209 from martin-frbg/issue3160
Add casts to prevent overflow of intermediate results
2021-05-01 20:08:24 +02:00
Martin Kroeker
98ebc8ac59 Add casts to prevent overflow of intermediate result 2021-05-01 14:48:19 +02:00
Martin Kroeker
904b221f03 Add cast to prevent overflow of intermediate result 2021-05-01 14:47:22 +02:00
Martin Kroeker
5cc35abc3d Apply MKL team fixes to the LAPACKE interfaces (Reference-LAPACK PR 534)
Removed spurious checks for INFO in xLACPY,xLASET after routines not returning any,and redundant requirements for ldvt in xGESVD_WORK
2021-05-01 13:22:10 +02:00
Martin Kroeker
254774f5a6 Add const qualifiers 2021-05-01 13:10:16 +02:00
Martin Kroeker
ae9cdee753 Merge pull request #3207 from hjl-tools/hjl/cet/develop
x86: Enable Intel CET
2021-05-01 12:42:54 +02:00
H.J. Lu
53ee0b76bb x86: Enable Intel CET
When Intel CET is enabled, we need to include <cet.h> in assembly codes
to mark Intel CET support and place _CET_ENDBR at the function entry.
2021-04-30 19:45:39 -07:00
Martin Kroeker
dc6b04c375 Merge pull request #3206 from martin-frbg/lapack480535
Import packing improvements to LAPACK xLAQR from Reference-LAPACK (PR 480+535)
2021-04-30 21:42:44 +02:00
pnp
3d4ccd2a13 fix for build error 2021-04-30 12:25:33 -04:00
pnp
c59652f0ce optimize on sgemv_n for small n 2021-04-30 12:14:58 -04:00
Martin Kroeker
87d2e314db Import packing improvements in LAPACK xLAQR from Reference-LAPACK PR 480+535 2021-04-30 13:50:55 +02:00
Martin Kroeker
3a30c12019 Merge pull request #25 from xianyi/develop
rebase
2021-04-30 13:47:17 +02:00
Martin Kroeker
c9a82f54d1 Merge pull request #3204 from martin-frbg/lapack506
Correct INFO value returned by SLASQ2/DLASQ2 (Reference-LAPACK 506)
2021-04-30 13:25:48 +02:00
Martin Kroeker
444cb78be5 correct INFO value (Reference-LAPACK 506) 2021-04-30 09:26:54 +02:00
Martin Kroeker
171c20e3b6 Merge pull request #3202 from martin-frbg/issue3201
Fix division by zero in the non-x86 codepath of C/ZROTG
2021-04-29 18:58:27 +02:00
Martin Kroeker
c5fb91f1bc Fix division by zero in the non-x86 codepath 2021-04-29 09:47:18 +02:00
Martin Kroeker
9a36a283d3 Merge pull request #3199 from martin-frbg/lapack537
Add LAPACKE fixes from Reference-LAPACK PR 537
2021-04-29 05:39:50 +02:00
Martin Kroeker
7e35d25ea0 Merge pull request #3198 from martin-frbg/lapack539
Apply fixes from Reference-LAPACK PR468 and 539 for array declarations in ?ORGBR/?UNGBR
2021-04-29 05:39:35 +02:00
Martin Kroeker
3704f5e5b0 Add missing break statements in the ?lascl functions 2021-04-28 20:56:55 +02:00
Martin Kroeker
6b76066632 Add const qualifiers 2021-04-28 20:55:37 +02:00
Martin Kroeker
2b01132515 Clean up misdeclaration of the dummy stand-in for A in ?ORGBR/?UNGBR workspace queries (Reference-LAPACK PR 468 and 530) 2021-04-28 19:20:08 +02:00
Martin Kroeker
8e95a1e18d Merge pull request #3195 from martin-frbg/lapack536
Apply lapack-testing fix from Reference-LAPACK PR536
2021-04-28 18:17:25 +02:00
Wangyang Guo
aa7b3dc3db GEMM: skylake: improve the performance when m is small 2021-04-28 13:56:06 +00:00
Martin Kroeker
13a29d13fd Apply lapack-testing fix from Reference-LAPACK PR536
fixes changing back from a single OMP thread for error exit testing to the originally requested number of threads for computational tests
2021-04-27 15:48:22 +02:00
Martin Kroeker
a6c2cb8417 Merge pull request #3193 from martin-frbg/lapack538
Apply lapack-testing fixes from Reference-LAPACK PR538
2021-04-27 15:40:51 +02:00
Martin Kroeker
d511a7bb4f Merge pull request #3191 from martin-frbg/issue3188
Delay creation of the (soft)link until after the library has been built
2021-04-27 13:35:16 +02:00
Martin Kroeker
3526ff2507 Apply fixes from Reference-LAPACK PR538 2021-04-27 12:52:49 +02:00
Martin Kroeker
adcfe7b789 Merge pull request #3190 from martin-frbg/issue3128-2
Replace spurious AVX512 requirement in the Haswell drot microkernel with an AVX2/FMA3 guard
2021-04-27 06:36:28 +02:00
damonyu
ceb44bef14 update the intrinsic api to the official name. 2021-04-27 11:12:29 +08:00
damonyu1989
ed473267df Merge pull request #1 from xianyi/develop
update
2021-04-27 10:53:59 +08:00
Martin Kroeker
0608bc5d82 delay creation of the softlink until after the library has been created 2021-04-26 22:32:23 +02:00
Martin Kroeker
3d511f0e66 replace spurious avx512 requirement with fma check 2021-04-26 21:55:30 +02:00
Martin Kroeker
0b8a436af9 Add mixed clang/ifort build on OSX to Azure CI (#3185)
* Add mixed clang/ifort build on OSX to the Azure CI config based on https://github.com/oneapi-src/oneapi-ci
(and remove debugging tools from the clang+gfortran job)

* Remove extraneous libgfortran dependency of ifort builds

* remove FEXTRALIB from link line of shared library as ifort keeps track of dependencies (and they are different for a .dylib than what f_check got for an executable)
2021-04-22 02:11:20 +02:00
Martin Kroeker
352efdd13a Merge pull request #24 from xianyi/develop
rebase
2021-04-20 21:30:28 +02:00
Martin Kroeker
4855af02a3 Merge pull request #3184 from martin-frbg/ctestfix
Fix obscure ctest crashes on OSX and add OSX builds to Azure CI
2021-04-20 07:31:07 +02:00
Martin Kroeker
94a5a1f0f1 Add OSX build variations to Azure CI 2021-04-19 22:27:08 +02:00
Martin Kroeker
751d127d7c Include cblas_test.h to achieve int/long size change with INTERFACE64 2021-04-19 22:26:34 +02:00
Martin Kroeker
fc101b67e5 Merge pull request #23 from xianyi/develop
rebase
2021-04-19 22:24:12 +02:00
Martin Kroeker
b0239a05fd Merge pull request #3183 from martin-frbg/2715-x
Restore __volatile__ keyword in ARM64 DYNAMIC_ARCH detection mechanism
2021-04-16 14:52:12 +02:00
Martin Kroeker
623d580b4c Restore __volatile__ keyword 2021-04-16 10:27:32 +02:00
Martin Kroeker
974acb39ff Merge pull request #3181 from RajalakshmiSR/dgemmp10vp
POWER10: Improve dgemm performance
2021-04-14 22:43:02 +02:00
Rajalakshmi Srinivasaraghavan
2379abaa5e POWER10: Improve dgemm performance
This patch uses vector pair pointer for input load operation
which helps to generate power10 lxvp instructions.
2021-04-13 22:30:06 -05:00
Martin Kroeker
3caf781d7c Merge pull request #3179 from RajalakshmiSR/zgemvp10
POWER10: Optimized zgemv
2021-04-11 10:01:09 +02:00
Rajalakshmi Srinivasaraghavan
55bb9f639a POWER10: Optimized zgemv
This patch makes use of Matrix-Multiply Assist (MMA)
feature introduced in POWER ISA v3.1 for zgemv_n and zgemv_t.
2021-04-10 19:00:24 -05:00
Martin Kroeker
0dba04bb58 Merge pull request #3178 from martin-frbg/fix2864
Fix unwanted fallback to implicit typing in slanv2/dlanv2
2021-04-09 13:38:05 +02:00
Martin Kroeker
e96f5e3c65 Fix implicit typing of new variable TWO 2021-04-09 10:04:15 +02:00
Martin Kroeker
558724e99f Fix implicit typing of new variable TWO 2021-04-09 10:03:31 +02:00
Martin Kroeker
067c96a873 Merge pull request #3177 from martin-frbg/issue3176
Use "old" compute(24) function with clang due to register limitations
2021-04-07 08:22:42 +02:00
Martin Kroeker
4b380c0b40 Merge pull request #3175 from LYP951018/develop
Pass NO_AVX512 macro def when `DYNAMIC_ARCH` is enabled
2021-04-07 08:22:28 +02:00
Martin Kroeker
2dfb24730d Use "old" compute(24) function with clang due to register limitations 2021-04-06 19:58:32 +02:00
刘雨培
725432efaa pass NO_AVX512 macro def 2021-04-07 00:10:41 +08:00
Martin Kroeker
a2216ef19f Merge pull request #3173 from martin-frbg/dyna-sse3
Fix spillover of host-specific build flags into the shared part of x86 DYNAMIC_ARCH builds
2021-04-05 13:39:17 +02:00
Martin Kroeker
5332cbae18 Avoid adding host-specific cpuflags to the common part of DYNAMIC_ARCH builds 2021-04-04 23:12:17 +02:00
Martin Kroeker
209b026e46 Merge pull request #3172 from martin-frbg/lapack477-final
Copy missing fixes from the final revision of Reference-LAPACK PR477
2021-04-04 20:19:09 +02:00
Martin Kroeker
1ae607beca Update Makefile.x86_64 2021-04-04 12:31:22 +02:00
Martin Kroeker
d393f1923f Fix spillover of host-specific build flags into the shared part of DYNAMIC_ARCH builds with gmake
for #3139
2021-04-03 22:18:15 +02:00
Martin Kroeker
081d5ae971 Fix typo and potentially undefined variables
(copies fixes made in Reference-LAPACK PR 477 after the initial cherrypick)
2021-04-03 22:11:14 +02:00
Martin Kroeker
0492f0f3f9 Merge pull request #22 from xianyi/develop
rebase
2021-04-03 21:58:36 +02:00
Martin Kroeker
147e0a75fd Merge pull request #3170 from CodesWithWolves/sgemm_tcopy_16-invalid-read
Remove Unnecessary/Erroneous Adds/Reads In sgemm_tcopy_16.S COPY1x8 Macro
2021-04-03 19:49:47 +02:00
Martin Kroeker
ee068af843 Merge pull request #3171 from RajalakshmiSR/BE_p10
POWER10:  Adding check for little endian
2021-04-01 21:20:24 +02:00
Rajalakshmi Srinivasaraghavan
2dbcddd83d POWER10: Adding check for little endian
This patch makes sure that recent POWER10 patches are used
only for little endian.
2021-03-31 21:32:42 -05:00
CodesWithWolves
d2bda3b56a Remove Unnecessary/Erroneous Reads In sgemm_tcopy_16.S COPY1x8 Macro
There appears to have been some code leak when copying from the COPY2x8
macro above where we're reading 8 bytes into d4-d7 directly after
reading 4 bytes into s4-s7. These 32 bytes in d4-7 are unused and can
possibly overrun the boundary of allocated memory -- Valgrind detected
this which is what dragged my attention to it for a 128,1 copy.

Additionally, there is no need to update the addresses stored in A0-A7
as the only possible paths after running this macro will overwrite A0-7
if looping to the next 8 rows, or overwrite A0-3 if moving to 4 rows --
in which case A4-7 are unused.
2021-03-31 15:44:25 -04:00
Martin Kroeker
903fd85c85 Merge pull request #3167 from xianyi/fix3126
Fix compilation of the benchmarks on older OSX versions
2021-03-27 12:40:42 +01:00
399 changed files with 40980 additions and 4791 deletions

View File

@@ -43,11 +43,6 @@ jobs:
- name: Update Homebrew
if: github.event_name != 'pull_request'
run: brew update || true
- name: unlink installed gcc to allow updating
run: |
brew unlink gcc@8
brew unlink gcc@9
- name: Install prerequisites
run: brew install --fetch-HEAD --HEAD --only-dependencies --keep-tmp openblas

View File

@@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5)
project(OpenBLAS C ASM)
set(OpenBLAS_MAJOR_VERSION 0)
set(OpenBLAS_MINOR_VERSION 3)
set(OpenBLAS_PATCH_VERSION 14.dev)
set(OpenBLAS_PATCH_VERSION 17.dev)
set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
# Adhere to GNU filesystem layout conventions

View File

@@ -194,3 +194,6 @@ In chronological order:
* PingTouGe Semiconductor Co., Ltd.
* [2020-10] Add RISC-V Vector (0.7.1) support. Optimize BLAS kernels for Xuantie C910
* River Dillon <oss@outerpassage.net>
* [2021-07-10] fix compilation with musl libc

View File

@@ -1,4 +1,117 @@
OpenBLAS ChangeLog
====================================================================
Version 0.3.17
15-Jul-2021
common:
- reverted the optimization of SGEMV_N/DGEMV_N for small input sizes
and consecutive arguments as it led to stack overflows on x86_64
with some operating systems (notably OSX and Windows)
x86_64:
- reverted the performance patch for SGEMV_T on AVX512 as it caused
wrong results in some applications
SPARC:
- fixed compilation with compilers other than gcc
====================================================================
Version 0.3.16
11-Jul-2021
common:
- drastically reduced the stack size requirements for running the LAPACK
testsuite (Reference-LAPACK PR 553)
- fixed spurious test failures in the LAPACK testsuite (Reference-LAPACK
PR 564)
- expressly setting DYNAMIC_ARCH=0 no longer enables dynamic_arch mode
- improved performance of xGER, xSPR, xSPR2, xSYR, xSYR2, xTRSV, SGEMV_N
and DGEMV_N, for small input sizes and consecutive arguments
- improved performance of xGETRF, xPOTRF and xPOTRI for small input sizes
by disabling multithreading
- fixed installing with BSD versions of the "install" utility
RISCV:
- fixed the implementation of xIMIN
- improved the performance of DSDOT
- fixed linking of the tests on C910V with current vendor gcc
POWER:
- fixed SBGEMM computation for some odd value inputs
- fixed compilation for PPCG4, PPC970, POWER3, POWER4 and POWER5
x86_64:
- improved performance of SGEMV_N and SGEMV_T for small N on AVX512-capable cpus
- worked around a miscompilation of ZGEMM/ZTRMM on Sandybridge with old gcc
versions
- fixed compilation with MS Visual Studio versions older than 2017
- fixed macro name collision with winnt.h from the latest Win10 SDK
- added cpu type autodetection for Intel Ice Lake SP
- fixed cpu type autodetection for Intel Tiger Lake
- added cpu type autodetection for recent Centaur/Zhaoxin models
- fixed compilation with musl libc
ARM64:
- fixed compilation with gcc/gfortran on the Apple M1
- fixed linking of the tests on FreeBSD
- fixed missing restore of a register in the recently rewritten DNRM2 kernel
for ThunderX2 and Neoverse N1 that could cause spurious failures in e.g.
DGEEV
- added compiler optimization flags for the EMAG8180
- added initial support for Cortex A55
ARM:
- fixed linking of the tests on FreeBSD
====================================================================
Version 0.3.15
2-May-2021
common:
- imported improvements and bugfixes from Reference-LAPACK 3.9.1
- imported LAPACKE interface fixes from Reference-LAPACK PRs 534 + 537
- fixed a problem in the cpu detection of 0.3.14 that prevented cross-compilation
- fixed a sequence problem in the generation of softlinks to the library in GMAKE
RISCV:
- fixed compilation on RISCV (missing entry in getarch)
- fixed a potential division by zero in CROTG and ZROTG
POWER:
- fixed LAPACK testsuite failures seen with the NVIDIA HPC compiler
- improved CGEMM, DGEMM and ZGEMM performance on POWER10
- added an optimized ZGEMV kernel for POWER10
- fixed a potential division by zero in CROTG and ZROTG
x86_64:
- added support for Intel Control-flow Enforcement Technology (CET)
- reverted the DOMATCOPY_RT code to the generic C version
- fixed a bug in the AVX512 SGEMM kernel introduced in 0.3.14
- fixed misapplication of -msse flag to non-SSE cpus in DYNAMIC_ARCH
- added support for compilation of the benchmarks on older OSX versions
- fixed propagation of the NO_AVX512 option in CMAKE builds
- fixed compilation of the AVX512 SGEMM kernel with clang-cl on Windows
- fixed compilation of the CTESTs with INTERFACE64=1 (random faults on OSX)
- corrected the Haswell DROT kernel to require AVX2/FMA3 rather than AVX512
ARM:
- fixed a potential division by zero in CROTG and ZROTG
- fixed a potential overflow in IMATCOPY/ZIMATCOPY and the CTESTs
ARM64:
- fixed spurious reads outside the array in the SGEMM tcopy macro
- fixed a potential division by zero in CROTG and ZROTG
- fixed a segmentation fault in DYNAMIC_ARCH builds (reappeared in 0.3.14)
MIPS:
- fixed a potential division by zero in CROTG and ZROTG
- fixed a potential overflow in IMATCOPY/ZIMATCOPY and the CTESTs
MIPS64:
- fixed a potential division by zero in CROTG and ZROTG
SPARC:
- fixed a potential division by zero in CROTG and ZROTG
====================================================================
Version 0.3.14
17-Mar-2021

View File

@@ -167,7 +167,6 @@ ifeq ($(NO_SHARED), 1)
$(error OpenBLAS: neither static nor shared are enabled.)
endif
endif
@-ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
@for d in $(SUBDIRS) ; \
do if test -d $$d; then \
$(MAKE) -C $$d $(@F) || exit 1 ; \
@@ -196,6 +195,7 @@ endif
ifdef USE_THREAD
@echo USE_THREAD=$(USE_THREAD) >> Makefile.conf_last
endif
@-ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
@touch lib.grd
prof : prof_blas prof_lapack

View File

@@ -1,4 +1,15 @@
ifneq ($(C_COMPILER), PGI)
ifneq ($(GCCVERSIONGT4), 1)
CCOMMON_OPT += -march=armv8-a
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=armv8-a
endif
else
ifeq ($(CORE), ARMV8)
CCOMMON_OPT += -march=armv8-a
ifneq ($(F_COMPILER), NAG)
@@ -57,6 +68,28 @@ endif
endif
endif
# Use a53 tunings because a55 is only available in GCC>=8.1
ifeq ($(CORE), CORTEXA55)
ifeq ($(GCCVERSIONGTEQ7), 1)
ifeq ($(GCCVERSIONGTEQ8), 1)
CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a55
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a55
endif
else
CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a53
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a53
endif
endif
else
CCOMMON_OPT += -march=armv8-a -mtune=cortex-a53
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=armv8-a -mtune=cortex-a53
endif
endif
endif
ifeq ($(CORE), THUNDERX)
CCOMMON_OPT += -march=armv8-a -mtune=thunderx
ifneq ($(F_COMPILER), NAG)
@@ -107,4 +140,16 @@ FCOMMON_OPT += -march=armv8.2-a -mtune=tsv110
endif
endif
endif
ifeq ($(GCCVERSIONGTEQ9), 1)
ifeq ($(CORE), EMAG8180)
CCOMMON_OPT += -march=armv8-a -mtune=emag
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=armv8-a -mtune=emag
endif
endif
endif
endif
endif

View File

@@ -74,17 +74,17 @@ endif
ifneq ($(OSNAME), AIX)
ifndef NO_LAPACKE
@echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapack.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapack.h"
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h"
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_config.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h"
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h.in "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h"
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_utils.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h"
@-install -m644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapack.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapack.h"
@-install -m644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h"
@-install -m644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_config.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h"
@-install -m644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h.in "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h"
@-install -m644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_utils.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h"
endif
#for install static library
ifneq ($(NO_STATIC),1)
@echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
@install -pm644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
@install -m644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
endif
@@ -92,7 +92,7 @@ endif
ifneq ($(NO_SHARED),1)
@echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku FreeBSD DragonFly))
@install -pm755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
@install -m755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \
ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)

3
Makefile.loongarch64 Normal file
View File

@@ -0,0 +1,3 @@
ifdef BINARY64
else
endif

View File

@@ -3,7 +3,7 @@
#
# This library's version
VERSION = 0.3.14.dev
VERSION = 0.3.17.dev
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library

View File

@@ -333,6 +333,7 @@ GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4)
GCCVERSIONGT4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 4)
GCCVERSIONGT5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 5)
GCCVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 7)
GCCVERSIONGTEQ8 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 8)
GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9)
GCCVERSIONGTEQ11 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 11)
GCCVERSIONGTEQ10 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 10)
@@ -380,6 +381,12 @@ ifeq ($(OSNAME), AIX)
EXTRALIB += -lm
endif
ifeq ($(OSNAME), FreeBSD)
ifeq ($(ARCH), $(filter $(ARCH),arm arm64))
EXTRALIB += -lm
endif
endif
ifeq ($(OSNAME), WINNT)
NEED_PIC = 0
NO_EXPRECISION = 1
@@ -619,6 +626,7 @@ DYNAMIC_CORE += CORTEXA57
DYNAMIC_CORE += CORTEXA72
DYNAMIC_CORE += CORTEXA73
DYNAMIC_CORE += NEOVERSEN1
DYNAMIC_CORE += CORTEXA55
DYNAMIC_CORE += FALKOR
DYNAMIC_CORE += THUNDERX
DYNAMIC_CORE += THUNDERX2T99
@@ -772,6 +780,11 @@ NO_BINARY_MODE = 1
BINARY_DEFINED = 1
endif
ifeq ($(ARCH), loongarch64)
NO_BINARY_MODE = 1
BINARY_DEFINED = 1
endif
#
# C Compiler dependent settings
@@ -842,6 +855,13 @@ ifeq ($(OSNAME), AIX)
BINARY_DEFINED = 1
endif
# LoongArch64: select the ISA and ABI flags for the Loongson 3R5 core.
# The core name must match the LOONGSON3R5 entry in the target list,
# otherwise this branch is never taken and the flags are silently dropped.
ifeq ($(ARCH), loongarch64)
ifeq ($(CORE), LOONGSON3R5)
CCOMMON_OPT += -march=loongarch64 -mabi=lp64
FCOMMON_OPT += -march=loongarch64 -mabi=lp64
endif
endif
endif
ifndef BINARY_DEFINED

View File

@@ -1,10 +1,21 @@
# COMPILER_PREFIX = mingw32-
ifdef HAVE_SSE
CCOMMON_OPT += -msse
FCOMMON_OPT += -msse
ifneq ($(DYNAMIC_ARCH),1)
ADD_CPUFLAGS = 1
else
ifdef TARGET_CORE
ADD_CPUFLAGS = 1
endif
endif
ifdef ADD_CPUFLAGS
ifdef HAVE_SSE
CCOMMON_OPT += -msse
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -msse
endif
endif
endif
ifeq ($(OSNAME), Interix)
ARFLAGS = -m x86

View File

@@ -8,6 +8,16 @@ endif
endif
endif
ifneq ($(DYNAMIC_ARCH),1)
ADD_CPUFLAGS = 1
else
ifdef TARGET_CORE
ADD_CPUFLAGS = 1
endif
endif
ifdef ADD_CPUFLAGS
ifdef HAVE_SSE3
CCOMMON_OPT += -msse3
ifneq ($(F_COMPILER), NAG)
@@ -44,7 +54,6 @@ endif
endif
ifeq ($(CORE), SKYLAKEX)
ifndef DYNAMIC_ARCH
ifndef NO_AVX512
CCOMMON_OPT += -march=skylake-avx512
ifneq ($(F_COMPILER), NAG)
@@ -62,10 +71,8 @@ endif
endif
endif
endif
endif
ifeq ($(CORE), COOPERLAKE)
ifndef DYNAMIC_ARCH
ifndef NO_AVX512
ifeq ($(C_COMPILER), GCC)
# cooperlake support was added in 10.1
@@ -88,7 +95,6 @@ endif
endif
endif
endif
endif
ifdef HAVE_AVX2
ifndef NO_AVX2
@@ -120,6 +126,7 @@ endif
endif
endif
endif
ifeq ($(OSNAME), Interix)

View File

@@ -27,7 +27,7 @@ We provide official binary packages for the following platform:
* Windows x86/x86_64
You can download them from [file hosting on sourceforge.net](https://sourceforge.net/projects/openblas/files/).
You can download them from [file hosting on sourceforge.net](https://sourceforge.net/projects/openblas/files/) or from the Releases section of the GitHub project page, [https://github.com/xianyi/OpenBLAS/releases](https://github.com/xianyi/OpenBLAS/releases).
## Installation from Source

View File

@@ -92,6 +92,7 @@ CORTEXA57
CORTEXA72
CORTEXA73
NEOVERSEN1
CORTEXA55
EMAG8180
FALKOR
THUNDERX
@@ -109,3 +110,5 @@ Z14
RISCV64_GENERIC
C910V
11.LOONGARCH64:
LOONGSON3R5

View File

@@ -47,6 +47,7 @@ environment:
install:
- if [%COMPILER%]==[clang-cl] call %CONDA_INSTALL_LOCN%\Scripts\activate.bat
- if [%COMPILER%]==[clang-cl] conda config --add channels conda-forge --force
- if [%COMPILER%]==[clang-cl] conda config --set auto_update_conda false
- if [%COMPILER%]==[clang-cl] conda install --yes --quiet clangdev cmake ninja flang=11.0.1
- if [%COMPILER%]==[clang-cl] call "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvarsall.bat" x64
- if [%COMPILER%]==[clang-cl] set "LIB=%CONDA_INSTALL_LOCN%\Library\lib;%LIB%"

View File

@@ -4,7 +4,15 @@ trigger:
branches:
include:
- develop
resources:
containers:
- container: oneapi-hpckit
image: intel/oneapi-hpckit:latest
options: '-v /usr/bin/sudo:/usr/bin/sudo -v /usr/lib/sudo/libsudo_util.so.0:/usr/lib/sudo/libsudo_util.so.0 -v /usr/lib/sudo/sudoers.so:/usr/lib/sudo/sudoers.so'
- container: oneapi-basekit
image: intel/oneapi-basekit:latest
options: '-v /usr/bin/sudo:/usr/bin/sudo -v /usr/lib/sudo/libsudo_util.so.0:/usr/lib/sudo/libsudo_util.so.0 -v /usr/lib/sudo/sudoers.so:/usr/lib/sudo/sudoers.so'
jobs:
# manylinux1 is useful to test because the
# standard Docker container uses an old version
@@ -74,7 +82,86 @@ jobs:
steps:
- script: |
brew update
make TARGET=CORE2 DYNAMIC_ARCH=1 USE_OPENMP=1 CC=gcc-10 FC=gfortran-10
make TARGET=CORE2 DYNAMIC_ARCH=1 USE_OPENMP=1 INTERFACE64=1 CC=gcc-10 FC=gfortran-10
- job: OSX_GCC_Nothreads
pool:
vmImage: 'macOS-10.15'
steps:
- script: |
brew update
make USE_THREADS=0 CC=gcc-10 FC=gfortran-10
- job: OSX_OpenMP_Clang
pool:
vmImage: 'macOS-10.15'
variables:
LD_LIBRARY_PATH: /usr/local/opt/llvm/lib
LIBRARY_PATH: /usr/local/opt/llvm/lib
steps:
- script: |
brew update
brew install llvm libomp
make TARGET=CORE2 USE_OPENMP=1 INTERFACE64=1 DYNAMIC_ARCH=1 CC=/usr/local/opt/llvm/bin/clang FC=gfortran-10
- job: OSX_Ifort_Clang
pool:
vmImage: 'macOS-10.15'
variables:
LD_LIBRARY_PATH: /usr/local/opt/llvm/lib
MACOS_HPCKIT_URL: https://registrationcenter-download.intel.com/akdlm/irc_nas/17643/m_HPCKit_p_2021.2.0.2903_offline.dmg
LIBRARY_PATH: /usr/local/opt/llvm/lib
MACOS_FORTRAN_COMPONENTS: intel.oneapi.mac.ifort-compiler
steps:
- script: |
brew update
brew install llvm libomp
sudo mkdir -p /opt/intel
sudo chown $USER /opt/intel
displayName: prepare for cache restore
- task: Cache@2
inputs:
path: /opt/intel/oneapi
key: '"install" | "$(MACOS_HPCKIT_URL)" | "$(MACOS_FORTRAN_COMPONENTS)"'
cacheHitVar: CACHE_RESTORED
# Download the oneAPI HPC kit image and install the Fortran compiler components.
# Runs only when the preceding cache step did not set CACHE_RESTORED to 'true'.
- script: |
curl --output webimage.dmg --url $(MACOS_HPCKIT_URL) --retry 5 --retry-delay 5
hdiutil attach webimage.dmg
sudo /Volumes/"$(basename "$(MACOS_HPCKIT_URL)" .dmg)"/bootstrapper.app/Contents/MacOS/bootstrapper -s --action install --components="$(MACOS_FORTRAN_COMPONENTS)" --eula=accept --continue-with-optional-error=yes --log-dir=.
# Keep the installer status so the image is detached before the step reports it.
installer_exit_code=$?
# Detach the same volume that was mounted above; "$URL" is never defined in this job.
hdiutil detach /Volumes/"$(basename "$(MACOS_HPCKIT_URL)" .dmg)" -quiet
exit $installer_exit_code
displayName: install
condition: ne(variables.CACHE_RESTORED, 'true')
- script: |
source /opt/intel/oneapi/setvars.sh
make CC=/usr/local/opt/llvm/bin/clang FC=ifort
- job: OSX_NDK_ARMV7
pool:
vmImage: 'macOS-10.15'
steps:
- script: |
brew update
brew install --cask android-ndk
export ANDROID_NDK_HOME=/usr/local/share/android-ndk
make TARGET=ARMV7 ONLY_CBLAS=1 CC=$ANDROID_NDK_HOME/toolchains/llvm/prebuilt/darwin-x86_64/bin/armv7a-linux-androideabi21-clang AR=$ANDROID_NDK_HOME/toolchains/llvm/prebuilt/darwin-x86_64/bin/arm-linux-androideabi-ar HOSTCC=gcc ARM_SOFTFP_ABI=1 -j4
- job: ALPINE_MUSL
pool:
vmImage: 'ubuntu-latest'
steps:
- script: |
wget 'https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.9.0/alpine-chroot-install' \
&& echo 'e5dfbbdc0c4b3363b99334510976c86bfa6cb251 alpine-chroot-install' | sha1sum -c || exit 1
alpine() { /alpine/enter-chroot -u "$USER" "$@"; }
sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers sudo'
alpine make DYNAMIC_ARCH=1 BINARY=64
alpine make DYNAMIC_ARCH=1 BINARY=64 PREFIX=mytestdir install
alpine ls -l mytestdir/include
alpine echo "// tests that inclusion of openblas_config.h works with musl" >test_install.c
alpine echo "#include <openblas_config.h>" >>test_install.c
alpine echo "int main(){" >> test_install.c
alpine echo "cpu_set_t* cpu_set = NULL;}" >>test_install.c
alpine gcc -Imytestdir/include test_install.c -Lmytestdir/lib -lopenblas -lpthread -lgfortran -o test_install

View File

@@ -72,13 +72,17 @@ int main(int argc, char *argv[]){
FLOAT *a,*work;
FLOAT wkopt[4];
blasint *ipiv;
blasint m, i, j, info,lwork;
blasint m, i, j, l, info,lwork;
int from = 1;
int to = 200;
int step = 1;
int loops = 1;
double time1;
double time1,timeg;
char *p;
char btest = 'I';
argc--;argv++;
@@ -86,6 +90,9 @@ int main(int argc, char *argv[]){
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
if (argc > 0) { step = atol(*argv); argc--; argv++;}
if ((p = getenv("OPENBLAS_TEST"))) btest=*p;
/* OPENBLAS_LOOPS is a decimal repeat count: parse it as a number instead of
   taking the character code of its first byte, and run at least one pass so
   the averaged time is never divided by zero. */
if ((p = getenv("OPENBLAS_LOOPS"))) { loops = atoi(p); if (loops < 1) loops = 1; }
fprintf(stderr, "From : %3d To : %3d Step = %3d\n", from, to, step);
@@ -124,32 +131,41 @@ int main(int argc, char *argv[]){
fprintf(stderr, " SIZE FLops Time Lwork\n");
for(m = from; m <= to; m += step){
timeg = 0.;
fprintf(stderr, " %6d : ", (int)m);
GETRF (&m, &m, a, &m, ipiv, &info);
for (l = 0; l < loops; l++) {
if (btest == 'F') begin();
GETRF (&m, &m, a, &m, ipiv, &info);
if (btest == 'F') {
end();
timeg += getsec();
}
if (info) {
fprintf(stderr, "Matrix is not singular .. %d\n", info);
exit(1);
}
begin();
if (btest == 'I') begin();
lwork = -1;
GETRI(&m, a, &m, ipiv, wkopt, &lwork, &info);
lwork = (blasint)wkopt[0];
GETRI(&m, a, &m, ipiv, work, &lwork, &info);
end();
if (btest == 'I') end();
if (info) {
fprintf(stderr, "failed compute inverse matrix .. %d\n", info);
exit(1);
}
time1 = getsec();
if (btest == 'I')
timeg += getsec();
} // loops
time1 = timeg/(double)loops;
fprintf(stderr,
" %10.2f MFlops : %10.2f Sec : %d\n",
COMPSIZE * COMPSIZE * (4.0/3.0 * (double)m * (double)m *(double)m - (double)m *(double)m + 5.0/3.0* (double)m) / time1 * 1.e-6,time1,lwork);

View File

@@ -72,17 +72,21 @@ int main(int argc, char *argv[]){
FLOAT *a, *b;
blasint *ipiv;
blasint m, i, j, info;
blasint m, i, j, l, info;
blasint unit = 1;
int from = 1;
int to = 200;
int step = 1;
int loops = 1;
FLOAT maxerr;
double time1, time2;
double time1, time2, timeg1,timeg2;
char *p;
/* OPENBLAS_LOOPS is a decimal repeat count: parse it as a number instead of
   taking the character code of its first byte, and run at least one pass so
   the averaged times are never divided by zero. */
if ((p = getenv("OPENBLAS_LOOPS"))) { loops = atoi(p); if (loops < 1) loops = 1; }
argc--;argv++;
if (argc > 0) { from = atol(*argv); argc--; argv++;}
@@ -110,9 +114,9 @@ int main(int argc, char *argv[]){
fprintf(stderr, " SIZE Residual Decompose Solve Total\n");
for(m = from; m <= to; m += step){
timeg1 = timeg2 = 0.;
fprintf(stderr, " %6d : ", (int)m);
for (l = 0; l < loops; l++) {
for(j = 0; j < m; j++){
for(i = 0; i < m * COMPSIZE; i++){
a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
@@ -138,7 +142,7 @@ int main(int argc, char *argv[]){
exit(1);
}
time1 = getsec();
timeg1 += getsec();
begin();
@@ -151,8 +155,10 @@ int main(int argc, char *argv[]){
exit(1);
}
time2 = getsec();
timeg2 += getsec();
} //loops
time1=timeg1/(double)loops;
time2=timeg2/(double)loops;
maxerr = 0.;
for(i = 0; i < m; i++){

View File

@@ -99,14 +99,15 @@ int main(int argc, char *argv[]){
char *p;
char btest = 'F';
blasint m, i, j, info, uplos=0;
double flops;
blasint m, i, j, l, info, uplos=0;
double flops = 0.;
int from = 1;
int to = 200;
int step = 1;
int loops = 1;
double time1;
double time1, timeg;
argc--;argv++;
@@ -119,6 +120,8 @@ int main(int argc, char *argv[]){
if ((p = getenv("OPENBLAS_TEST"))) btest=*p;
/* OPENBLAS_LOOPS is a decimal repeat count: parse it as a number instead of
   taking the character code of its first byte, and run at least one pass so
   the averaged time is never divided by zero. */
if ((p = getenv("OPENBLAS_LOOPS"))) { loops = atoi(p); if (loops < 1) loops = 1; }
fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = %c\n", from, to, step,*uplo[uplos]);
if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
@@ -129,19 +132,21 @@ int main(int argc, char *argv[]){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
for(m = from; m <= to; m += step){
for(m = from; m <= to; m += step){
timeg=0.;
for (l = 0; l < loops; l++) {
#ifndef COMPLEX
if (uplos & 1) {
for (j = 0; j < m; j++) {
for(i = 0; i < j; i++) a[(long)i + (long)j * (long)m] = 0.;
a[(long)j + (long)j * (long)m] = ((double) rand() / (double) RAND_MAX) + 8.;
a[(long)j + (long)j * (long)m] = ((double) rand() / (double) RAND_MAX) + 8.;
for(i = j + 1; i < m; i++) a[(long)i + (long)j * (long)m] = ((double) rand() / (double) RAND_MAX) - 0.5;
}
} else {
for (j = 0; j < m; j++) {
for(i = 0; i < j; i++) a[(long)i + (long)j * (long)m] = ((double) rand() / (double) RAND_MAX) - 0.5;
a[(long)j + (long)j * (long)m] = ((double) rand() / (double) RAND_MAX) + 8.;
a[(long)j + (long)j * (long)m] = ((double) rand() / (double) RAND_MAX) + 8.;
for(i = j + 1; i < m; i++) a[(long)i + (long)j * (long)m] = 0.;
}
}
@@ -192,8 +197,8 @@ int main(int argc, char *argv[]){
exit(1);
}
time1 = getsec();
flops = COMPSIZE * COMPSIZE * (1.0/3.0 * (double)m * (double)m *(double)m +1.0/2.0* (double)m *(double)m + 1.0/6.0* (double)m) / time1 * 1.e-6;
if ( btest == 'F')
timeg += getsec();
if ( btest == 'S' )
{
@@ -214,9 +219,7 @@ int main(int argc, char *argv[]){
fprintf(stderr, "Potrs info = %d\n", info);
exit(1);
}
time1 = getsec();
flops = COMPSIZE * COMPSIZE * (2.0 * (double)m * (double)m *(double)m ) / time1 * 1.e-6;
timeg += getsec();
}
if ( btest == 'I' )
@@ -232,11 +235,17 @@ int main(int argc, char *argv[]){
fprintf(stderr, "Potri info = %d\n", info);
exit(1);
}
time1 = getsec();
flops = COMPSIZE * COMPSIZE * (2.0/3.0 * (double)m * (double)m *(double)m +1.0/2.0* (double)m *(double)m + 5.0/6.0* (double)m) / time1 * 1.e-6;
timeg += getsec();
}
} // loops
time1 = timeg/(double)loops;
if ( btest == 'F')
flops = COMPSIZE * COMPSIZE * (1.0/3.0 * (double)m * (double)m *(double)m +1.0/2.0* (double)m *(double)m + 1.0/6.0* (double)m) / time1 * 1.e-6;
if ( btest == 'S')
flops = COMPSIZE * COMPSIZE * (2.0 * (double)m * (double)m *(double)m ) / time1 * 1.e-6;
if ( btest == 'I')
flops = COMPSIZE * COMPSIZE * (2.0/3.0 * (double)m * (double)m *(double)m +1.0/2.0* (double)m *(double)m + 5.0/6.0* (double)m) / time1 * 1.e-6;
fprintf(stderr, "%8d : %10.2f MFlops : %10.3f Sec : Test=%c\n",m,flops ,time1,btest);

View File

@@ -46,14 +46,17 @@ int main(int argc, char *argv[]){
if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p;
blasint m, i, j;
blasint m, i, j, l;
blasint inc_x= 1;
blasint inc_y= 1;
int from = 1;
int to = 200;
int step = 1;
int loops = 1;
double time1;
/* OPENBLAS_LOOPS is a decimal repeat count: parse it as a number instead of
   taking the character code of its first byte, and run at least one pass so
   the averaged time is never divided by zero. */
if ((p = getenv("OPENBLAS_LOOPS"))) { loops = atoi(p); if (loops < 1) loops = 1; }
double time1,timeg;
argc--;argv++;
@@ -85,8 +88,9 @@ int main(int argc, char *argv[]){
for(m = from; m <= to; m += step)
{
timeg = 0.;
fprintf(stderr, " %6d : ", (int)m);
for (l = 0; l < loops; l++) {
for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
@@ -107,8 +111,10 @@ int main(int argc, char *argv[]){
end();
time1 = getsec();
timeg += getsec();
} // loops
time1 = timeg/(double)loops;
fprintf(stderr,
" %10.2f MFlops\n",
COMPSIZE * COMPSIZE * 2. * (double)m * (double)m / time1 * 1.e-6);

View File

@@ -56,17 +56,20 @@ int main(int argc, char *argv[]){
char uplo='U';
char trans='N';
if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p;
if ((p = getenv("OPENBLAS_TRANS"))) trans=*p;
blasint m, i, j;
blasint m, i, j, l;
int from = 1;
int to = 200;
int step = 1;
int loops = 1;
double time1;
/* OPENBLAS_LOOPS is a decimal repeat count: parse it as a number instead of
   taking the character code of its first byte, and run at least one pass so
   the averaged time is never divided by zero. */
if ((p = getenv("OPENBLAS_LOOPS"))) { loops = atoi(p); if (loops < 1) loops = 1; }
double time1,timeg;
argc--;argv++;
@@ -95,9 +98,12 @@ int main(int argc, char *argv[]){
for(m = from; m <= to; m += step)
{
timeg = 0.;
fprintf(stderr, " %6d : ", (int)m);
for(l = 0; l < loops; l++) {
for(j = 0; j < m; j++){
for(i = 0; i < m * COMPSIZE; i++){
a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
@@ -111,8 +117,10 @@ int main(int argc, char *argv[]){
end();
time1 = getsec();
timeg += getsec();
} //loops
time1 = timeg / (double)loops;
fprintf(stderr,
" %10.2f MFlops\n",
COMPSIZE * COMPSIZE * 1. * (double)m * (double)m * (double)m / time1 * 1.e-6);

53
c_check
View File

@@ -82,18 +82,19 @@ $os = Interix if ($data =~ /OS_INTERIX/);
$os = Android if ($data =~ /OS_ANDROID/);
$os = Haiku if ($data =~ /OS_HAIKU/);
$architecture = x86 if ($data =~ /ARCH_X86/);
$architecture = x86_64 if ($data =~ /ARCH_X86_64/);
$architecture = power if ($data =~ /ARCH_POWER/);
$architecture = mips if ($data =~ /ARCH_MIPS/);
$architecture = mips64 if ($data =~ /ARCH_MIPS64/);
$architecture = alpha if ($data =~ /ARCH_ALPHA/);
$architecture = sparc if ($data =~ /ARCH_SPARC/);
$architecture = ia64 if ($data =~ /ARCH_IA64/);
$architecture = arm if ($data =~ /ARCH_ARM/);
$architecture = arm64 if ($data =~ /ARCH_ARM64/);
$architecture = zarch if ($data =~ /ARCH_ZARCH/);
$architecture = riscv64 if ($data =~ /ARCH_RISCV64/);
$architecture = x86 if ($data =~ /ARCH_X86/);
$architecture = x86_64 if ($data =~ /ARCH_X86_64/);
$architecture = power if ($data =~ /ARCH_POWER/);
$architecture = mips if ($data =~ /ARCH_MIPS/);
$architecture = mips64 if ($data =~ /ARCH_MIPS64/);
$architecture = alpha if ($data =~ /ARCH_ALPHA/);
$architecture = sparc if ($data =~ /ARCH_SPARC/);
$architecture = ia64 if ($data =~ /ARCH_IA64/);
$architecture = arm if ($data =~ /ARCH_ARM/);
$architecture = arm64 if ($data =~ /ARCH_ARM64/);
$architecture = zarch if ($data =~ /ARCH_ZARCH/);
$architecture = riscv64 if ($data =~ /ARCH_RISCV64/);
$architecture = loongarch64 if ($data =~ /ARCH_LOONGARCH64/);
$defined = 0;
@@ -143,6 +144,11 @@ if ($architecture eq "riscv64") {
$binary = 64;
}
if ($architecture eq "loongarch64") {
$defined = 1;
$binary = 64;
}
if ($compiler eq "PGI") {
$compiler_name .= " -tp p7" if ($binary eq "32");
$compiler_name .= " -tp p7-64" if ($binary eq "64");
@@ -215,17 +221,18 @@ if (($architecture eq "mips") || ($architecture eq "mips64")) {
}
}
$architecture = x86 if ($data =~ /ARCH_X86/);
$architecture = x86_64 if ($data =~ /ARCH_X86_64/);
$architecture = power if ($data =~ /ARCH_POWER/);
$architecture = mips if ($data =~ /ARCH_MIPS/);
$architecture = mips64 if ($data =~ /ARCH_MIPS64/);
$architecture = alpha if ($data =~ /ARCH_ALPHA/);
$architecture = sparc if ($data =~ /ARCH_SPARC/);
$architecture = ia64 if ($data =~ /ARCH_IA64/);
$architecture = arm if ($data =~ /ARCH_ARM/);
$architecture = arm64 if ($data =~ /ARCH_ARM64/);
$architecture = zarch if ($data =~ /ARCH_ZARCH/);
$architecture = x86 if ($data =~ /ARCH_X86/);
$architecture = x86_64 if ($data =~ /ARCH_X86_64/);
$architecture = power if ($data =~ /ARCH_POWER/);
$architecture = mips if ($data =~ /ARCH_MIPS/);
$architecture = mips64 if ($data =~ /ARCH_MIPS64/);
$architecture = alpha if ($data =~ /ARCH_ALPHA/);
$architecture = sparc if ($data =~ /ARCH_SPARC/);
$architecture = ia64 if ($data =~ /ARCH_IA64/);
$architecture = arm if ($data =~ /ARCH_ARM/);
$architecture = arm64 if ($data =~ /ARCH_ARM64/);
$architecture = zarch if ($data =~ /ARCH_ZARCH/);
$architecture = loongarch64 if ($data =~ /ARCH_LOONGARCH64/);
$binformat = bin32;
$binformat = bin64 if ($data =~ /BINARY_64/);

View File

@@ -44,7 +44,7 @@ endif ()
if (DYNAMIC_ARCH)
if (ARM64)
set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110)
set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA55 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110)
if (DYNAMIC_LIST)
set(DYNAMIC_CORE ARMV8 ${DYNAMIC_LIST})
endif ()

View File

@@ -124,9 +124,9 @@ if (NOT DYNAMIC_ARCH)
if (HAVE_AVX)
set (CCOMMON_OPT "${CCOMMON_OPT} -mavx")
endif ()
if (HAVE_FMA3)
set (CCOMMON_OPT "${CCOMMON_OPT} -mfma")
endif ()
# if (HAVE_FMA3)
#set (CCOMMON_OPT "${CCOMMON_OPT} -mfma")
#endif ()
if (HAVE_SSE)
set (CCOMMON_OPT "${CCOMMON_OPT} -msse")
endif ()

View File

@@ -66,7 +66,7 @@ set(SLASRC
slaqgb.f slaqge.f slaqp2.f slaqps.f slaqsb.f slaqsp.f slaqsy.f
slaqr0.f slaqr1.f slaqr2.f slaqr3.f slaqr4.f slaqr5.f
slaqtr.f slar1v.f slar2v.f ilaslr.f ilaslc.f
slarf.f slarfb.f slarfg.f slarfgp.f slarft.f slarfx.f slarfy.f slargv.f
slarf.f slarfb.f slarfb_gett.f slarfg.f slarfgp.f slarft.f slarfx.f slarfy.f slargv.f
slarrv.f slartv.f
slarz.f slarzb.f slarzt.f slasy2.f
slasyf.f slasyf_rook.f slasyf_rk.f slasyf_aa.f
@@ -112,14 +112,14 @@ set(SLASRC
sgeqrt.f sgeqrt2.f sgeqrt3.f sgemqrt.f
stpqrt.f stpqrt2.f stpmqrt.f stprfb.f
sgelqt.f sgelqt3.f sgemlqt.f
sgetsls.f sgeqr.f slatsqr.f slamtsqr.f sgemqr.f
sgetsls.f sgetsqrhrt.f sgeqr.f slatsqr.f slamtsqr.f sgemqr.f
sgelq.f slaswlq.f slamswlq.f sgemlq.f
stplqt.f stplqt2.f stpmlqt.f
ssytrd_2stage.f ssytrd_sy2sb.f ssytrd_sb2st.F ssb2st_kernels.f
ssyevd_2stage.f ssyev_2stage.f ssyevx_2stage.f ssyevr_2stage.f
ssbev_2stage.f ssbevx_2stage.f ssbevd_2stage.f ssygv_2stage.f
sgesvdq.f slaorhr_col_getrfnp.f
slaorhr_col_getrfnp2.f sorgtsqr.f sorhr_col.f )
slaorhr_col_getrfnp2.f sorgtsqr.f sorgtsqr_row.f sorhr_col.f )
set(SXLASRC sgesvxx.f sgerfsx.f sla_gerfsx_extended.f sla_geamv.f
sla_gercond.f sla_gerpvgrw.f ssysvxx.f ssyrfsx.f
@@ -171,7 +171,7 @@ set(CLASRC
claqhb.f claqhe.f claqhp.f claqp2.f claqps.f claqsb.f
claqr0.f claqr1.f claqr2.f claqr3.f claqr4.f claqr5.f
claqsp.f claqsy.f clar1v.f clar2v.f ilaclr.f ilaclc.f
clarf.f clarfb.f clarfg.f clarfgp.f clarft.f
clarf.f clarfb.f clarfb_gett.f clarfg.f clarfgp.f clarft.f
clarfx.f clarfy.f clargv.f clarnv.f clarrv.f clartg.f clartv.f
clarz.f clarzb.f clarzt.f clascl.f claset.f clasr.f classq.f
clasyf.f clasyf_rook.f clasyf_rk.f clasyf_aa.f
@@ -209,14 +209,14 @@ set(CLASRC
cgeqrt.f cgeqrt2.f cgeqrt3.f cgemqrt.f
ctpqrt.f ctpqrt2.f ctpmqrt.f ctprfb.f
cgelqt.f cgelqt3.f cgemlqt.f
cgetsls.f cgeqr.f clatsqr.f clamtsqr.f cgemqr.f
cgetsls.f cgetsqrhrt.f cgeqr.f clatsqr.f clamtsqr.f cgemqr.f
cgelq.f claswlq.f clamswlq.f cgemlq.f
ctplqt.f ctplqt2.f ctpmlqt.f
chetrd_2stage.f chetrd_he2hb.f chetrd_hb2st.F chb2st_kernels.f
cheevd_2stage.f cheev_2stage.f cheevx_2stage.f cheevr_2stage.f
chbev_2stage.f chbevx_2stage.f chbevd_2stage.f chegv_2stage.f
cgesvdq.f claunhr_col_getrfnp.f claunhr_col_getrfnp2.f
cungtsqr.f cunhr_col.f )
cungtsqr.f cungtsqr_row.f cunhr_col.f )
set(CXLASRC cgesvxx.f cgerfsx.f cla_gerfsx_extended.f cla_geamv.f
cla_gercond_c.f cla_gercond_x.f cla_gerpvgrw.f
@@ -253,7 +253,7 @@ set(DLASRC
dlaqgb.f dlaqge.f dlaqp2.f dlaqps.f dlaqsb.f dlaqsp.f dlaqsy.f
dlaqr0.f dlaqr1.f dlaqr2.f dlaqr3.f dlaqr4.f dlaqr5.f
dlaqtr.f dlar1v.f dlar2v.f iladlr.f iladlc.f
dlarf.f dlarfb.f dlarfg.f dlarfgp.f dlarft.f dlarfx.f dlarfy.f
dlarf.f dlarfb.f dlarfb_gett.f dlarfg.f dlarfgp.f dlarft.f dlarfx.f dlarfy.f
dlargv.f dlarrv.f dlartv.f
dlarz.f dlarzb.f dlarzt.f dlasy2.f
dlasyf.f dlasyf_rook.f dlasyf_rk.f dlasyf_aa.f
@@ -300,14 +300,14 @@ set(DLASRC
dgeqrt.f dgeqrt2.f dgeqrt3.f dgemqrt.f
dtpqrt.f dtpqrt2.f dtpmqrt.f dtprfb.f
dgelqt.f dgelqt3.f dgemlqt.f
dgetsls.f dgeqr.f dlatsqr.f dlamtsqr.f dgemqr.f
dgetsls.f dgetsqrhrt.f dgeqr.f dlatsqr.f dlamtsqr.f dgemqr.f
dgelq.f dlaswlq.f dlamswlq.f dgemlq.f
dtplqt.f dtplqt2.f dtpmlqt.f
dsytrd_2stage.f dsytrd_sy2sb.f dsytrd_sb2st.F dsb2st_kernels.f
dsyevd_2stage.f dsyev_2stage.f dsyevx_2stage.f dsyevr_2stage.f
dsbev_2stage.f dsbevx_2stage.f dsbevd_2stage.f dsygv_2stage.f
dcombssq.f dgesvdq.f dlaorhr_col_getrfnp.f
dlaorhr_col_getrfnp2.f dorgtsqr.f dorhr_col.f )
dlaorhr_col_getrfnp2.f dorgtsqr.f dorgtsqr_row.f dorhr_col.f )
set(DXLASRC dgesvxx.f dgerfsx.f dla_gerfsx_extended.f dla_geamv.f
dla_gercond.f dla_gerpvgrw.f dsysvxx.f dsyrfsx.f
@@ -360,7 +360,7 @@ set(ZLASRC
zlaqhb.f zlaqhe.f zlaqhp.f zlaqp2.f zlaqps.f zlaqsb.f
zlaqr0.f zlaqr1.f zlaqr2.f zlaqr3.f zlaqr4.f zlaqr5.f
zlaqsp.f zlaqsy.f zlar1v.f zlar2v.f ilazlr.f ilazlc.f
zlarcm.f zlarf.f zlarfb.f
zlarcm.f zlarf.f zlarfb.f zlarfb_gett.f
zlarfg.f zlarfgp.f zlarft.f
zlarfx.f zlarfy.f zlargv.f zlarnv.f zlarrv.f zlartg.f zlartv.f
zlarz.f zlarzb.f zlarzt.f zlascl.f zlaset.f zlasr.f
@@ -402,13 +402,13 @@ set(ZLASRC
ztpqrt.f ztpqrt2.f ztpmqrt.f ztprfb.f
ztplqt.f ztplqt2.f ztpmlqt.f
zgelqt.f zgelqt3.f zgemlqt.f
zgetsls.f zgeqr.f zlatsqr.f zlamtsqr.f zgemqr.f
zgetsls.f zgetsqrhrt.f zgeqr.f zlatsqr.f zlamtsqr.f zgemqr.f
zgelq.f zlaswlq.f zlamswlq.f zgemlq.f
zhetrd_2stage.f zhetrd_he2hb.f zhetrd_hb2st.F zhb2st_kernels.f
zheevd_2stage.f zheev_2stage.f zheevx_2stage.f zheevr_2stage.f
zhbev_2stage.f zhbevx_2stage.f zhbevd_2stage.f zhegv_2stage.f
zgesvdq.f zlaunhr_col_getrfnp.f zlaunhr_col_getrfnp2.f
zungtsqr.f zunhr_col.f)
zungtsqr.f zungtsqr_row.f zunhr_col.f)
set(ZXLASRC zgesvxx.f zgerfsx.f zla_gerfsx_extended.f zla_geamv.f
zla_gercond_c.f zla_gercond_x.f zla_gerpvgrw.f zsysvxx.f zsyrfsx.f

View File

@@ -114,6 +114,8 @@ set(CSRC
lapacke_cgetrs_work.c
lapacke_cgetsls.c
lapacke_cgetsls_work.c
lapacke_cgetsqrhrt.c
lapacke_cgetsqrhrt_work.c
lapacke_cggbak.c
lapacke_cggbak_work.c
lapacke_cggbal.c
@@ -590,6 +592,8 @@ set(CSRC
lapacke_cungrq_work.c
lapacke_cungtr.c
lapacke_cungtr_work.c
lapacke_cungtsqr_row.c
lapacke_cungtsqr_row_work.c
lapacke_cunmbr.c
lapacke_cunmbr_work.c
lapacke_cunmhr.c
@@ -735,6 +739,8 @@ set(DSRC
lapacke_dgetrs_work.c
lapacke_dgetsls.c
lapacke_dgetsls_work.c
lapacke_dgetsqrhrt.c
lapacke_dgetsqrhrt_work.c
lapacke_dggbak.c
lapacke_dggbak_work.c
lapacke_dggbal.c
@@ -862,6 +868,8 @@ set(DSRC
lapacke_dorgrq_work.c
lapacke_dorgtr.c
lapacke_dorgtr_work.c
lapacke_dorgtsqr_row.c
lapacke_dorgtsqr_row_work.c
lapacke_dormbr.c
lapacke_dormbr_work.c
lapacke_dormhr.c
@@ -1309,6 +1317,8 @@ set(SSRC
lapacke_sgetrs_work.c
lapacke_sgetsls.c
lapacke_sgetsls_work.c
lapacke_sgetsqrhrt.c
lapacke_sgetsqrhrt_work.c
lapacke_sggbak.c
lapacke_sggbak_work.c
lapacke_sggbal.c
@@ -1435,6 +1445,8 @@ set(SSRC
lapacke_sorgrq_work.c
lapacke_sorgtr.c
lapacke_sorgtr_work.c
lapacke_sorgtsqr_row.c
lapacke_sorgtsqr_row_work.c
lapacke_sormbr.c
lapacke_sormbr_work.c
lapacke_sormhr.c
@@ -1877,6 +1889,8 @@ set(ZSRC
lapacke_zgetrs_work.c
lapacke_zgetsls.c
lapacke_zgetsls_work.c
lapacke_zgetsqrhrt.c
lapacke_zgetsqrhrt_work.c
lapacke_zggbak.c
lapacke_zggbak_work.c
lapacke_zggbal.c
@@ -2351,6 +2365,8 @@ set(ZSRC
lapacke_zungrq_work.c
lapacke_zungtr.c
lapacke_zungtr_work.c
lapacke_zungtsqr_row.c
lapacke_zungtsqr_row_work.c
lapacke_zunmbr.c
lapacke_zunmbr_work.c
lapacke_zunmhr.c

View File

@@ -177,7 +177,7 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS
set(ZGEMM_UNROLL_M 4)
set(ZGEMM_UNROLL_N 4)
set(SYMV_P 16)
elseif ("${TCORE}" STREQUAL "CORTEXA57" OR "${TCORE}" STREQUAL "CORTEXA53")
elseif ("${TCORE}" STREQUAL "CORTEXA57" OR "${TCORE}" STREQUAL "CORTEXA53" OR "${TCORE}" STREQUAL "CORTEXA55")
file(APPEND ${TARGET_CONF_TEMP}
"#define L1_CODE_SIZE\t32768\n"
"#define L1_CODE_LINESIZE\t64\n"

View File

@@ -39,7 +39,7 @@ if (DEFINED BINARY AND DEFINED TARGET AND BINARY EQUAL 32)
if (${TARGET} STREQUAL "BULLDOZER" OR ${TARGET} STREQUAL "PILEDRIVER" OR ${TARGET} STREQUAL "ZEN")
set(TARGET "BARCELONA")
endif ()
if (${TARGET} STREQUAL "ARMV8" OR ${TARGET} STREQUAL "CORTEXA57" OR ${TARGET} STREQUAL "CORTEXA53")
if (${TARGET} STREQUAL "ARMV8" OR ${TARGET} STREQUAL "CORTEXA57" OR ${TARGET} STREQUAL "CORTEXA53" OR ${TARGET} STREQUAL "CORTEXA55")
set(TARGET "ARMV7")
endif ()
endif ()
@@ -186,11 +186,11 @@ if (DEFINED TARGET)
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2")
endif()
endif()
if (DEFINED HAVE_FMA3)
if (NOT NO_AVX2)
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mfma")
endif()
endif()
# if (DEFINED HAVE_FMA3)
# if (NOT NO_AVX2)
# set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mfma")
# endif()
# endif()
if (DEFINED HAVE_SSE)
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse")
endif()
@@ -299,6 +299,10 @@ if (NO_AVX2)
set(CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX2")
endif ()
if (NO_AVX512)
set(CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX512")
endif ()
if (USE_THREAD)
# USE_SIMPLE_THREADED_LEVEL3 = 1
# NO_AFFINITY = 1

View File

@@ -254,6 +254,19 @@ function(GenerateNamedObjects sources_in)
# now add the object and set the defines
set(obj_defines ${defines_in})
list(FIND obj_defines "RC" def_idx)
if (${def_idx} GREATER -1)
# list(REMOVE_AT ${obj_defines} ${def_idx})
list (REMOVE_ITEM obj_defines "RC")
list(APPEND obj_defines "RC=RC")
endif ()
list(FIND obj_defines "CR" def_idx)
if (${def_idx} GREATER -1)
# list(REMOVE_AT ${obj_defines} ${def_idx})
list (REMOVE_ITEM obj_defines "CR")
list(APPEND obj_defines "CR=CR")
endif ()
if (use_cblas)
set(obj_name "cblas_${obj_name}")
list(APPEND obj_defines "CBLAS")
@@ -298,7 +311,15 @@ function(GenerateNamedObjects sources_in)
configure_file(${new_source_file}.tmp ${new_source_file} COPYONLY)
file(REMOVE ${new_source_file}.tmp)
list(APPEND SRC_LIST_OUT ${new_source_file})
message (STATUS ${new_source_file})
if (DEFINED HAVE_FMA3)
if ( ${new_source_file} MATCHES "(s|d?)rot_k.*c")
set_source_files_properties(${new_source_file} PROPERTIES COMPILE_OPTIONS "-mfma")
endif ()
if ( ${new_source_file} MATCHES "dgemv_t_k.*c")
set_source_files_properties(${new_source_file} PROPERTIES COMPILE_OPTIONS "-mfma")
endif ()
endif ()
endforeach ()
endforeach ()

View File

@@ -416,6 +416,15 @@ please https://github.com/xianyi/OpenBLAS/issues/246
#include "common_alpha.h"
#endif
#if (defined(ARCH_X86) || defined(ARCH_X86_64)) && defined(__CET__) && defined(__has_include)
#if __has_include(<cet.h>)
#include <cet.h>
#endif
#endif
#ifndef _CET_ENDBR
#define _CET_ENDBR
#endif
#ifdef ARCH_X86
#include "common_x86.h"
#endif
@@ -440,7 +449,7 @@ please https://github.com/xianyi/OpenBLAS/issues/246
#include "common_mips.h"
#endif
#ifdef ARCH_RISCV64
#include "common_riscv64.h"
#endif
@@ -461,6 +470,10 @@ please https://github.com/xianyi/OpenBLAS/issues/246
#include "common_zarch.h"
#endif
#ifdef ARCH_LOONGARCH64
#include "common_loongarch64.h"
#endif
#ifndef ASSEMBLER
#ifdef OS_WINDOWSSTORE
typedef char env_var_t[MAX_PATH];

View File

@@ -709,6 +709,13 @@ int BLASFUNC(cpotrf)(char *, blasint *, float *, blasint *, blasint *);
int BLASFUNC(zpotrf)(char *, blasint *, double *, blasint *, blasint *);
int BLASFUNC(xpotrf)(char *, blasint *, xdouble *, blasint *, blasint *);
int BLASFUNC(spotri)(char *, blasint *, float *, blasint *, blasint *);
int BLASFUNC(dpotri)(char *, blasint *, double *, blasint *, blasint *);
int BLASFUNC(qpotri)(char *, blasint *, xdouble *, blasint *, blasint *);
int BLASFUNC(cpotri)(char *, blasint *, float *, blasint *, blasint *);
int BLASFUNC(zpotri)(char *, blasint *, double *, blasint *, blasint *);
int BLASFUNC(xpotri)(char *, blasint *, xdouble *, blasint *, blasint *);
int BLASFUNC(spotrs)(char *, blasint *, blasint *, float *, blasint *, float *, blasint *, blasint *);
int BLASFUNC(dpotrs)(char *, blasint *, blasint *, double *, blasint *, double *, blasint *, blasint *);
int BLASFUNC(qpotrs)(char *, blasint *, blasint *, xdouble *, blasint *, xdouble *, blasint *, blasint *);

199
common_loongarch64.h Normal file
View File

@@ -0,0 +1,199 @@
/*****************************************************************************
Copyright (c) 2011-2020, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written
permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#ifndef COMMON_LOONGARCH64
#define COMMON_LOONGARCH64
/* Memory barriers. The general (MB), write (WMB) and read (RMB) barriers all
 * expand to the same compiler builtin, __sync_synchronize(). */
#define MB __sync_synchronize()
#define WMB __sync_synchronize()
#define RMB __sync_synchronize()
/* INLINE expands to the plain C inline keyword. */
#define INLINE inline
#ifndef ASSEMBLER
/* Integer division helper: returns x / y as an int.
 * No check is made for y == 0. */
static inline int blas_quickdivide(blasint x, blasint y){
return x / y;
}
/* GET_IMAGE(res): copies floating-point register $f2 into res, using the
 * double-precision move (fmov.d) when DOUBLE is defined and the
 * single-precision move (fmov.s) otherwise.
 * NOTE(review): presumably this fetches the imaginary part of a complex
 * function result; confirm that $f2 is the right register under the
 * LoongArch calling convention. */
#ifdef DOUBLE
#define GET_IMAGE(res) __asm__ __volatile__("fmov.d %0, $f2" : "=f"(res) : : "memory")
#else
#define GET_IMAGE(res) __asm__ __volatile__("fmov.s %0, $f2" : "=f"(res) : : "memory")
#endif
/* GET_IMAGE_CANCEL expands to nothing on this architecture. */
#define GET_IMAGE_CANCEL
#else
/* Precision-neutral aliases for floating-point instructions in assembly
 * sources: each name maps to the ".d" form when DOUBLE is defined and to the
 * ".s" form otherwise. CMOVT maps to fsel in both cases, and MTC maps to
 * movgr2fr.d / movgr2fr.w respectively. */
#ifdef DOUBLE
#define LD fld.d
#define ST fst.d
#define MADD fmadd.d
#define NMADD fnmadd.d
#define MSUB fmsub.d
#define NMSUB fnmsub.d
#define ADD fadd.d
#define SUB fsub.d
#define MUL fmul.d
#define MOV fmov.d
#define CMOVT fsel
#define MTC movgr2fr.d
#define FABS fabs.d
#define CMPEQ fcmp.ceq.d
#define CMPLE fcmp.cle.d
#define CMPLT fcmp.clt.d
#define NEG fneg.d
#else
#define LD fld.s
#define ST fst.s
#define MADD fmadd.s
#define NMADD fnmadd.s
#define MSUB fmsub.s
#define NMSUB fnmsub.s
#define ADD fadd.s
#define SUB fsub.s
#define MUL fmul.s
#define MOV fmov.s
#define CMOVT fsel
#define MTC movgr2fr.w
#define FABS fabs.s
#define CMPEQ fcmp.ceq.s
#define CMPLE fcmp.cle.s
#define CMPLT fcmp.clt.s
#define NEG fneg.s
#endif /* defined(DOUBLE) */
/* Integer load/store aliases.
 *   LDINT        : ld.d only when both __64BIT__ and USE64BITINT are defined,
 *                  ld.w in every other configuration.
 *   LDARG, SDARG : ld.d / st.d when __64BIT__ is defined, ld.w / st.w
 *                  otherwise. */
#if defined(__64BIT__) && defined(USE64BITINT)
#define LDINT ld.d
#define LDARG ld.d
#define SDARG st.d
#elif defined(__64BIT__) && !defined(USE64BITINT)
#define LDINT ld.w
#define LDARG ld.d
#define SDARG st.d
#else
#define LDINT ld.w
#define LDARG ld.w
#define SDARG st.w
#endif
/* REALNAME is the symbol name a kernel is emitted under: ASMFNAME when
 * F_INTERFACE is defined, ASMNAME otherwise. */
#ifndef F_INTERFACE
#define REALNAME ASMNAME
#else
#define REALNAME ASMFNAME
#endif /* defined(F_INTERFACE) */
#if defined(ASSEMBLER) && !defined(NEEDPARAM)
/* PROLOGUE: switches to .text, aligns, exports REALNAME as a function symbol
 * and places its label. The final line deliberately has no trailing
 * backslash, so the macro ends here and cannot absorb the directive that
 * follows it. */
#define PROLOGUE \
.text ;\
.align 5 ;\
.globl REALNAME ;\
.type REALNAME, @function ;\
REALNAME: ;
/* GNUSTACK: on Linux/ELF, emits the .note.GNU-stack section marker;
 * expands to nothing elsewhere. */
#if defined(__linux__) && defined(__ELF__)
#define GNUSTACK .section .note.GNU-stack,"",@progbits
#else
#define GNUSTACK
#endif /* defined(__linux__) && defined(__ELF__) */
/* EPILOGUE: closes the function with .end and appends GNUSTACK. */
#define EPILOGUE \
.end REALNAME ;\
GNUSTACK
/* PROFCODE expands to nothing on this architecture. */
#define PROFCODE
/* MOVT(dst, src, cc): conditional move. bceqz skips to local label 1 when
 * condition flag cc is zero; otherwise dst receives src (add.d with $r0). */
#define MOVT(dst, src, cc) \
bceqz cc, 1f; \
add.d dst, src, $r0; \
1:
#endif /* defined(ASSEMBLER) && !defined(NEEDPARAM) */
#endif /* defined(ASSEMBLER) */
/* SEEK_ADDRESS is defined (with an empty value) for this architecture. */
#define SEEK_ADDRESS
/* Size of one work buffer: 32 << 20 bytes (32 MiB). */
#define BUFFER_SIZE ( 32 << 20)
/* Page size in bytes: 16 KiB, kept equal to FIXED_PAGESIZE. A shift of 1
 * would give 32 bytes, which is not a page size. */
#define PAGESIZE (16UL << 10)
#define FIXED_PAGESIZE (16UL << 10)
/* Huge page size in bytes: 2 << 20 (2 MiB). */
#define HUGE_PAGESIZE ( 2 << 20)
/* Base address is START_ADDRESS minus one buffer per possible CPU. */
#define BASE_ADDRESS (START_ADDRESS - BUFFER_SIZE * MAX_CPU_NUMBER)
/* Fall back to MAP_ANON when the platform does not define MAP_ANONYMOUS. */
#ifndef MAP_ANONYMOUS
#define MAP_ANONYMOUS MAP_ANON
#endif
#endif

View File

@@ -2490,7 +2490,8 @@
#endif
#ifndef ASSEMBLER
#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64)
#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64)\
|| defined(ARCH_LOONGARCH64)
extern BLASLONG gemm_offset_a;
extern BLASLONG gemm_offset_b;
extern BLASLONG sbgemm_p;

View File

@@ -340,7 +340,8 @@ REALNAME:
.align 16; \
.globl REALNAME ;\
.type REALNAME, @function; \
REALNAME:
REALNAME: \
_CET_ENDBR
#ifdef PROFILE
#define PROFCODE call mcount

View File

@@ -451,7 +451,8 @@ REALNAME:
.align 512; \
.globl REALNAME ;\
.type REALNAME, @function; \
REALNAME:
REALNAME: \
_CET_ENDBR
#ifdef PROFILE
#define PROFCODE call *mcount@GOTPCREL(%rip)

View File

@@ -54,6 +54,7 @@
#define VENDOR_TRANSMETA 9
#define VENDOR_NSC 10
#define VENDOR_HYGON 11
#define VENDOR_ZHAOXIN 12
#define VENDOR_UNKNOWN 99
#define BITMASK(a, b, c) ((((a) >> (b)) & (c)))

View File

@@ -36,6 +36,7 @@ size_t length=sizeof(value);
#define CPU_ARMV8 1
// Arm
#define CPU_CORTEXA53 2
#define CPU_CORTEXA55 14
#define CPU_CORTEXA57 3
#define CPU_CORTEXA72 4
#define CPU_CORTEXA73 5
@@ -67,7 +68,8 @@ static char *cpuname[] = {
"EMAG8180",
"NEOVERSEN1",
"THUNDERX3T110",
"VORTEX"
"VORTEX",
"CORTEXA55"
};
static char *cpuname_lower[] = {
@@ -84,7 +86,8 @@ static char *cpuname_lower[] = {
"emag8180",
"neoversen1",
"thunderx3t110",
"vortex"
"vortex",
"cortexa55"
};
int get_feature(char *search)
@@ -161,6 +164,8 @@ int detect(void)
return CPU_CORTEXA73;
else if (strstr(cpu_part, "0xd0c"))
return CPU_NEOVERSEN1;
else if (strstr(cpu_part, "0xd05"))
return CPU_CORTEXA55;
}
// Qualcomm
else if (strstr(cpu_implementer, "0x51") && strstr(cpu_part, "0xc00"))
@@ -281,6 +286,7 @@ void get_cpuconfig(void)
{
case CPU_CORTEXA53:
case CPU_CORTEXA55:
printf("#define %s\n", cpuname[d]);
// Fall-through
case CPU_ARMV8:

110
cpuid_loongarch64.c Normal file
View File

@@ -0,0 +1,110 @@
/*****************************************************************************
Copyright (c) 2011-2020, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written
permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/
#include <stdint.h>
/* Core identifiers returned by detect(); they are also the indices into
 * cpuname[]. */
#define CPU_UNKNOWN 0
#define CPU_LOONGSON3R5 1
/* CPUCFG word number read by detect(), and the LASX flag bit tested in it.
 * The bit mask is parenthesised so that it evaluates as a single value in
 * any surrounding expression (e.g. next to +, ~ or ==). */
#define LOONGARCH_CFG2 0x02
#define LOONGARCH_LASX (1<<7)
/* Printable core names, indexed by the CPU_* identifiers above. */
static char *cpuname[] = {
"UNKNOWN",
"LOONGSON3R5"
};
/* Identify the running core.
 * Executes the cpucfg instruction with LOONGARCH_CFG2 as the word selector
 * and returns CPU_LOONGSON3R5 when the LOONGARCH_LASX bit is set in the
 * result, CPU_UNKNOWN otherwise. */
int detect(void) {
uint32_t reg = 0;
/* %0 receives the configuration word; %1 holds the word number to read. */
__asm__ volatile (
"cpucfg %0, %1 \n\t"
: "+&r"(reg)
: "r"(LOONGARCH_CFG2)
);
if (reg & LOONGARCH_LASX)
return CPU_LOONGSON3R5;
else
return CPU_UNKNOWN;
}
/* Return the cpuname[] entry for the core reported by detect(). */
char *get_corename(void) {
  const int core_id = detect();

  return cpuname[core_id];
}
/* Print the architecture name to stdout, without a trailing newline. */
void get_architecture(void) {
  static const char arch_name[] = "LOONGARCH64";

  printf("%s", arch_name);
}
/* Print the sub-architecture name to stdout, without a trailing newline:
 * "LOONGSON3R5" when detect() reports that core, "UNKNOWN" otherwise. */
void get_subarchitecture(void) {
  const char *subarch = "UNKNOWN";

  if (detect() == CPU_LOONGSON3R5)
    subarch = "LOONGSON3R5";

  printf("%s", subarch);
}
/* Print the sub-directory name for this architecture to stdout, without a
 * trailing newline. */
void get_subdirname(void) {
  static const char subdir_name[] = "loongarch64";

  printf("%s", subdir_name);
}
/* Print the configuration #define lines for the build to stdout.
 * The same LOONGSON3R5 parameter set is emitted whether or not detect()
 * recognises the core, so the lines are kept in one table and printed
 * unconditionally. */
void get_cpuconfig(void) {
  static const char *const config_lines[] = {
    "#define LOONGSON3R5\n",
    "#define L1_DATA_SIZE 65536\n",
    "#define L1_DATA_LINESIZE 64\n",
    "#define L2_SIZE 1048576\n",
    "#define L2_LINESIZE 64\n",
    "#define DTB_DEFAULT_ENTRIES 64\n",
    "#define DTB_SIZE 4096\n",
    "#define L2_ASSOCIATIVE 16\n"
  };
  const int line_count = (int)(sizeof(config_lines) / sizeof(config_lines[0]));
  int i;

  for (i = 0; i < line_count; i++)
    printf("%s", config_lines[i]);
}
/* Print the library name suffix followed by a newline: "loongson3r5" when
 * detect() reports that core, "loongarch64" otherwise. */
void get_libname(void){
  const int is_loongson3r5 = (detect() == CPU_LOONGSON3R5);

  printf("%s", is_loongson3r5 ? "loongson3r5\n" : "loongarch64\n");
}

View File

@@ -283,6 +283,7 @@ int get_vendor(void){
if (!strcmp(vendor, "CyrixInstead")) return VENDOR_CYRIX;
if (!strcmp(vendor, "NexGenDriven")) return VENDOR_NEXGEN;
if (!strcmp(vendor, "CentaurHauls")) return VENDOR_CENTAUR;
if (!strcmp(vendor, " Shanghai ")) return VENDOR_ZHAOXIN;
if (!strcmp(vendor, "RiseRiseRise")) return VENDOR_RISE;
if (!strcmp(vendor, " SiS SiS SiS")) return VENDOR_SIS;
if (!strcmp(vendor, "GenuineTMx86")) return VENDOR_TRANSMETA;
@@ -1066,7 +1067,8 @@ int get_cacheinfo(int type, cache_info_t *cacheinfo){
if ((get_vendor() == VENDOR_AMD) ||
(get_vendor() == VENDOR_HYGON) ||
(get_vendor() == VENDOR_CENTAUR)) {
(get_vendor() == VENDOR_CENTAUR) ||
(get_vendor() == VENDOR_ZHAOXIN)) {
cpuid(0x80000005, &eax, &ebx, &ecx, &edx);
LDTB.size = 4096;
@@ -1189,7 +1191,7 @@ int get_cacheinfo(int type, cache_info_t *cacheinfo){
int get_cpuname(void){
int family, exfamily, model, vendor, exmodel;
int family, exfamily, model, vendor, exmodel, stepping;
if (!have_cpuid()) return CPUTYPE_80386;
@@ -1197,6 +1199,7 @@ int get_cpuname(void){
exfamily = get_cputype(GET_EXFAMILY);
model = get_cputype(GET_MODEL);
exmodel = get_cputype(GET_EXMODEL);
stepping = get_cputype(GET_STEPPING);
vendor = get_vendor();
@@ -1398,6 +1401,17 @@ int get_cpuname(void){
return CPUTYPE_SANDYBRIDGE;
else
return CPUTYPE_NEHALEM;
case 10: // Ice Lake SP
if(support_avx512_bf16())
return CPUTYPE_COOPERLAKE;
if(support_avx512())
return CPUTYPE_SKYLAKEX;
if(support_avx2())
return CPUTYPE_HASWELL;
if(support_avx())
return CPUTYPE_SANDYBRIDGE;
else
return CPUTYPE_NEHALEM;
}
break;
case 7: // family 6 exmodel 7
@@ -1616,13 +1630,20 @@ int get_cpuname(void){
switch (family) {
case 0x5:
return CPUTYPE_CENTAURC6;
break;
case 0x6:
return CPUTYPE_NANO;
break;
if (model == 0xf && stepping < 0xe)
return CPUTYPE_NANO;
return CPUTYPE_NEHALEM;
default:
if (family >= 0x7)
return CPUTYPE_NEHALEM;
else
return CPUTYPE_VIAC3;
}
return CPUTYPE_VIAC3;
}
if (vendor == VENDOR_ZHAOXIN){
return CPUTYPE_NEHALEM;
}
if (vendor == VENDOR_RISE){
@@ -1855,7 +1876,7 @@ char *get_lower_cpunamechar(void){
int get_coretype(void){
int family, exfamily, model, exmodel, vendor;
int family, exfamily, model, exmodel, vendor, stepping;
if (!have_cpuid()) return CORE_80486;
@@ -1863,6 +1884,7 @@ int get_coretype(void){
exfamily = get_cputype(GET_EXFAMILY);
model = get_cputype(GET_MODEL);
exmodel = get_cputype(GET_EXMODEL);
stepping = get_cputype(GET_STEPPING);
vendor = get_vendor();
@@ -2112,7 +2134,22 @@ int get_coretype(void){
#endif
else
return CORE_NEHALEM;
#endif
#endif
if (model == 10) { // Ice Lake SP
  /* The braces keep every return below scoped to model 10. Without them the
   * unconditional "return CORE_SKYLAKEX" in the AVX512 build sits outside
   * the model test and is taken for any model that reaches this point. */
#ifndef NO_AVX512
  if(support_avx512_bf16())
    return CORE_COOPERLAKE;
  return CORE_SKYLAKEX;
#else
  if(support_avx())
#ifndef NO_AVX2
    return CORE_HASWELL;
#else
    return CORE_SANDYBRIDGE;
#endif
  else
    return CORE_NEHALEM;
#endif
}
break;
case 7:
if (model == 10)
@@ -2135,13 +2172,13 @@ int get_coretype(void){
case 8:
if (model == 12) { // Tiger Lake
if(support_avx512())
return CPUTYPE_SKYLAKEX;
return CORE_SKYLAKEX;
if(support_avx2())
return CPUTYPE_HASWELL;
return CORE_HASWELL;
if(support_avx())
return CPUTYPE_SANDYBRIDGE;
return CORE_SANDYBRIDGE;
else
return CPUTYPE_NEHALEM;
return CORE_NEHALEM;
}
if (model == 14) { // Kaby Lake
if(support_avx())
@@ -2257,10 +2294,19 @@ int get_coretype(void){
if (vendor == VENDOR_CENTAUR) {
switch (family) {
case 0x6:
return CORE_NANO;
break;
if (model == 0xf && stepping < 0xe)
return CORE_NANO;
return CORE_NEHALEM;
default:
if (family >= 0x7)
return CORE_NEHALEM;
else
return CORE_VIAC3;
}
return CORE_VIAC3;
}
if (vendor == VENDOR_ZHAOXIN) {
return CORE_NEHALEM;
}
return CORE_UNKNOWN;

View File

@@ -157,6 +157,10 @@ ARCH_ARM64
ARCH_RISCV64
#endif
#ifdef __loongarch64
ARCH_LOONGARCH64
#endif
#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L)
HAVE_C11
#endif

View File

@@ -4,6 +4,9 @@ include_directories(${PROJECT_BINARY_DIR})
enable_language(Fortran)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DADD${BU} -DCBLAS")
if (CMAKE_Fortran_COMPILER_ID STREQUAL GNU)
set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -fno-tree-vectorize")
endif()
if(WIN32)
FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_cblas_helper.ps1

View File

@@ -6,6 +6,9 @@ TOPDIR = ..
include $(TOPDIR)/Makefile.system
override CFLAGS += -DADD$(BU) -DCBLAS
ifeq ($(F_COMPILER),GFORTRAN)
override FFLAGS += -fno-tree-vectorize
endif
override TARGET_ARCH=
override TARGET_MACH=

View File

@@ -20,7 +20,7 @@ void F77_cgemv(int *order, char *transp, int *m, int *n,
get_transpose_type(transp, &trans);
if (*order == TEST_ROW_MJR) {
LDA = *n+1;
A = (CBLAS_TEST_COMPLEX *)malloc( (*m)*LDA*sizeof( CBLAS_TEST_COMPLEX) );
A = (CBLAS_TEST_COMPLEX *)malloc( (*m)*(size_t)LDA*sizeof( CBLAS_TEST_COMPLEX) );
for( i=0; i<*m; i++ )
for( j=0; j<*n; j++ ){
A[ LDA*i+j ].real=a[ (*lda)*j+i ].real;
@@ -50,7 +50,7 @@ void F77_cgbmv(int *order, char *transp, int *m, int *n, int *kl, int *ku,
get_transpose_type(transp, &trans);
if (*order == TEST_ROW_MJR) {
LDA = *ku+*kl+2;
A=( CBLAS_TEST_COMPLEX* )malloc((*n+*kl)*LDA*sizeof(CBLAS_TEST_COMPLEX));
A=( CBLAS_TEST_COMPLEX* )malloc((*n+*kl)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX));
for( i=0; i<*ku; i++ ){
irow=*ku+*kl-i;
jcol=(*ku)-i;
@@ -94,7 +94,7 @@ void F77_cgeru(int *order, int *m, int *n, CBLAS_TEST_COMPLEX *alpha,
if (*order == TEST_ROW_MJR) {
LDA = *n+1;
A=(CBLAS_TEST_COMPLEX*)malloc((*m)*LDA*sizeof(CBLAS_TEST_COMPLEX));
A=(CBLAS_TEST_COMPLEX*)malloc((*m)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX));
for( i=0; i<*m; i++ )
for( j=0; j<*n; j++ ){
A[ LDA*i+j ].real=a[ (*lda)*j+i ].real;
@@ -122,7 +122,7 @@ void F77_cgerc(int *order, int *m, int *n, CBLAS_TEST_COMPLEX *alpha,
if (*order == TEST_ROW_MJR) {
LDA = *n+1;
A=(CBLAS_TEST_COMPLEX* )malloc((*m)*LDA*sizeof(CBLAS_TEST_COMPLEX ) );
A=(CBLAS_TEST_COMPLEX* )malloc((*m)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX ) );
for( i=0; i<*m; i++ )
for( j=0; j<*n; j++ ){
A[ LDA*i+j ].real=a[ (*lda)*j+i ].real;
@@ -154,7 +154,7 @@ void F77_chemv(int *order, char *uplow, int *n, CBLAS_TEST_COMPLEX *alpha,
if (*order == TEST_ROW_MJR) {
LDA = *n+1;
A = (CBLAS_TEST_COMPLEX *)malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX));
A = (CBLAS_TEST_COMPLEX *)malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX));
for( i=0; i<*n; i++ )
for( j=0; j<*n; j++ ){
A[ LDA*i+j ].real=a[ (*lda)*j+i ].real;
@@ -190,7 +190,7 @@ int i,irow,j,jcol,LDA;
*incx, beta, y, *incy );
else {
LDA = *k+2;
A =(CBLAS_TEST_COMPLEX*)malloc((*n+*k)*LDA*sizeof(CBLAS_TEST_COMPLEX));
A =(CBLAS_TEST_COMPLEX*)malloc((*n+*k)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX));
if (uplo == CblasUpper) {
for( i=0; i<*k; i++ ){
irow=*k-i;
@@ -251,8 +251,8 @@ void F77_chpmv(int *order, char *uplow, int *n, CBLAS_TEST_COMPLEX *alpha,
beta, y, *incy);
else {
LDA = *n;
A = (CBLAS_TEST_COMPLEX* )malloc(LDA*LDA*sizeof(CBLAS_TEST_COMPLEX ));
AP = (CBLAS_TEST_COMPLEX* )malloc( (((LDA+1)*LDA)/2)*
A = (CBLAS_TEST_COMPLEX* )malloc((size_t)LDA*LDA*sizeof(CBLAS_TEST_COMPLEX ));
AP = (CBLAS_TEST_COMPLEX* )malloc( ((((size_t)LDA+1)*LDA)/2)*
sizeof( CBLAS_TEST_COMPLEX ));
if (uplo == CblasUpper) {
for( j=0, k=0; j<*n; j++ )
@@ -311,7 +311,7 @@ void F77_ctbmv(int *order, char *uplow, char *transp, char *diagn,
x, *incx);
else {
LDA = *k+2;
A=(CBLAS_TEST_COMPLEX *)malloc((*n+*k)*LDA*sizeof(CBLAS_TEST_COMPLEX));
A=(CBLAS_TEST_COMPLEX *)malloc((*n+*k)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX));
if (uplo == CblasUpper) {
for( i=0; i<*k; i++ ){
irow=*k-i;
@@ -375,7 +375,7 @@ void F77_ctbsv(int *order, char *uplow, char *transp, char *diagn,
*incx);
else {
LDA = *k+2;
A=(CBLAS_TEST_COMPLEX*)malloc((*n+*k)*LDA*sizeof(CBLAS_TEST_COMPLEX ));
A=(CBLAS_TEST_COMPLEX*)malloc((*n+*k)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX ));
if (uplo == CblasUpper) {
for( i=0; i<*k; i++ ){
irow=*k-i;
@@ -436,8 +436,8 @@ void F77_ctpmv(int *order, char *uplow, char *transp, char *diagn,
cblas_ctpmv( CblasRowMajor, UNDEFINED, trans, diag, *n, ap, x, *incx );
else {
LDA = *n;
A=(CBLAS_TEST_COMPLEX*)malloc(LDA*LDA*sizeof(CBLAS_TEST_COMPLEX));
AP=(CBLAS_TEST_COMPLEX*)malloc((((LDA+1)*LDA)/2)*
A=(CBLAS_TEST_COMPLEX*)malloc((size_t)LDA*LDA*sizeof(CBLAS_TEST_COMPLEX));
AP=(CBLAS_TEST_COMPLEX*)malloc(((((size_t)LDA+1)*LDA)/2)*
sizeof(CBLAS_TEST_COMPLEX));
if (uplo == CblasUpper) {
for( j=0, k=0; j<*n; j++ )
@@ -491,8 +491,8 @@ void F77_ctpsv(int *order, char *uplow, char *transp, char *diagn,
cblas_ctpsv( CblasRowMajor, UNDEFINED, trans, diag, *n, ap, x, *incx );
else {
LDA = *n;
A=(CBLAS_TEST_COMPLEX*)malloc(LDA*LDA*sizeof(CBLAS_TEST_COMPLEX));
AP=(CBLAS_TEST_COMPLEX*)malloc((((LDA+1)*LDA)/2)*
A=(CBLAS_TEST_COMPLEX*)malloc((size_t)LDA*LDA*sizeof(CBLAS_TEST_COMPLEX));
AP=(CBLAS_TEST_COMPLEX*)malloc(((((size_t)LDA+1)*LDA)/2)*
sizeof(CBLAS_TEST_COMPLEX));
if (uplo == CblasUpper) {
for( j=0, k=0; j<*n; j++ )
@@ -544,7 +544,7 @@ void F77_ctrmv(int *order, char *uplow, char *transp, char *diagn,
if (*order == TEST_ROW_MJR) {
LDA=*n+1;
A=(CBLAS_TEST_COMPLEX*)malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX));
A=(CBLAS_TEST_COMPLEX*)malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX));
for( i=0; i<*n; i++ )
for( j=0; j<*n; j++ ) {
A[ LDA*i+j ].real=a[ (*lda)*j+i ].real;
@@ -573,7 +573,7 @@ void F77_ctrsv(int *order, char *uplow, char *transp, char *diagn,
if (*order == TEST_ROW_MJR) {
LDA = *n+1;
A =(CBLAS_TEST_COMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX ) );
A =(CBLAS_TEST_COMPLEX* )malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX ) );
for( i=0; i<*n; i++ )
for( j=0; j<*n; j++ ) {
A[ LDA*i+j ].real=a[ (*lda)*j+i ].real;
@@ -601,8 +601,8 @@ void F77_chpr(int *order, char *uplow, int *n, float *alpha,
cblas_chpr(CblasRowMajor, UNDEFINED, *n, *alpha, x, *incx, ap );
else {
LDA = *n;
A = (CBLAS_TEST_COMPLEX* )malloc(LDA*LDA*sizeof(CBLAS_TEST_COMPLEX ) );
AP = ( CBLAS_TEST_COMPLEX* )malloc( (((LDA+1)*LDA)/2)*
A = (CBLAS_TEST_COMPLEX* )malloc((size_t)LDA*LDA*sizeof(CBLAS_TEST_COMPLEX ) );
AP = ( CBLAS_TEST_COMPLEX* )malloc( ((((size_t)LDA+1)*LDA)/2)*
sizeof( CBLAS_TEST_COMPLEX ));
if (uplo == CblasUpper) {
for( j=0, k=0; j<*n; j++ )
@@ -678,8 +678,8 @@ void F77_chpr2(int *order, char *uplow, int *n, CBLAS_TEST_COMPLEX *alpha,
*incy, ap );
else {
LDA = *n;
A=(CBLAS_TEST_COMPLEX*)malloc( LDA*LDA*sizeof(CBLAS_TEST_COMPLEX ) );
AP=(CBLAS_TEST_COMPLEX*)malloc( (((LDA+1)*LDA)/2)*
A=(CBLAS_TEST_COMPLEX*)malloc( (size_t)LDA*LDA*sizeof(CBLAS_TEST_COMPLEX ) );
AP=(CBLAS_TEST_COMPLEX*)malloc( ((((size_t)LDA+1)*LDA)/2)*
sizeof( CBLAS_TEST_COMPLEX ));
if (uplo == CblasUpper) {
for( j=0, k=0; j<*n; j++ )
@@ -750,7 +750,7 @@ void F77_cher(int *order, char *uplow, int *n, float *alpha,
if (*order == TEST_ROW_MJR) {
LDA = *n+1;
A=(CBLAS_TEST_COMPLEX*)malloc((*n)*LDA*sizeof( CBLAS_TEST_COMPLEX ));
A=(CBLAS_TEST_COMPLEX*)malloc((*n)*(size_t)LDA*sizeof( CBLAS_TEST_COMPLEX ));
for( i=0; i<*n; i++ )
for( j=0; j<*n; j++ ) {
@@ -784,7 +784,7 @@ void F77_cher2(int *order, char *uplow, int *n, CBLAS_TEST_COMPLEX *alpha,
if (*order == TEST_ROW_MJR) {
LDA = *n+1;
A= ( CBLAS_TEST_COMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX ) );
A= ( CBLAS_TEST_COMPLEX* )malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX ) );
for( i=0; i<*n; i++ )
for( j=0; j<*n; j++ ) {

View File

@@ -19,7 +19,7 @@ void F77_dgemv(int *order, char *transp, int *m, int *n, double *alpha,
get_transpose_type(transp, &trans);
if (*order == TEST_ROW_MJR) {
LDA = *n+1;
A = ( double* )malloc( (*m)*LDA*sizeof( double ) );
A = ( double* )malloc( (*m)*(size_t)LDA*sizeof( double ) );
for( i=0; i<*m; i++ )
for( j=0; j<*n; j++ )
A[ LDA*i+j ]=a[ (*lda)*j+i ];
@@ -43,7 +43,7 @@ void F77_dger(int *order, int *m, int *n, double *alpha, double *x, int *incx,
if (*order == TEST_ROW_MJR) {
LDA = *n+1;
A = ( double* )malloc( (*m)*LDA*sizeof( double ) );
A = ( double* )malloc( (*m)*(size_t)LDA*sizeof( double ) );
for( i=0; i<*m; i++ ) {
for( j=0; j<*n; j++ )
@@ -74,7 +74,7 @@ void F77_dtrmv(int *order, char *uplow, char *transp, char *diagn,
if (*order == TEST_ROW_MJR) {
LDA = *n+1;
A = ( double* )malloc( (*n)*LDA*sizeof( double ) );
A = ( double* )malloc( (*n)*(size_t)LDA*sizeof( double ) );
for( i=0; i<*n; i++ )
for( j=0; j<*n; j++ )
A[ LDA*i+j ]=a[ (*lda)*j+i ];
@@ -102,7 +102,7 @@ void F77_dtrsv(int *order, char *uplow, char *transp, char *diagn,
if (*order == TEST_ROW_MJR) {
LDA = *n+1;
A = ( double* )malloc( (*n)*LDA*sizeof( double ) );
A = ( double* )malloc( (*n)*(size_t)LDA*sizeof( double ) );
for( i=0; i<*n; i++ )
for( j=0; j<*n; j++ )
A[ LDA*i+j ]=a[ (*lda)*j+i ];
@@ -123,7 +123,7 @@ void F77_dsymv(int *order, char *uplow, int *n, double *alpha, double *a,
if (*order == TEST_ROW_MJR) {
LDA = *n+1;
A = ( double* )malloc( (*n)*LDA*sizeof( double ) );
A = ( double* )malloc( (*n)*(size_t)LDA*sizeof( double ) );
for( i=0; i<*n; i++ )
for( j=0; j<*n; j++ )
A[ LDA*i+j ]=a[ (*lda)*j+i ];
@@ -146,7 +146,7 @@ void F77_dsyr(int *order, char *uplow, int *n, double *alpha, double *x,
if (*order == TEST_ROW_MJR) {
LDA = *n+1;
A = ( double* )malloc( (*n)*LDA*sizeof( double ) );
A = ( double* )malloc( (*n)*(size_t)LDA*sizeof( double ) );
for( i=0; i<*n; i++ )
for( j=0; j<*n; j++ )
A[ LDA*i+j ]=a[ (*lda)*j+i ];
@@ -170,7 +170,7 @@ void F77_dsyr2(int *order, char *uplow, int *n, double *alpha, double *x,
if (*order == TEST_ROW_MJR) {
LDA = *n+1;
A = ( double* )malloc( (*n)*LDA*sizeof( double ) );
A = ( double* )malloc( (*n)*(size_t)LDA*sizeof( double ) );
for( i=0; i<*n; i++ )
for( j=0; j<*n; j++ )
A[ LDA*i+j ]=a[ (*lda)*j+i ];
@@ -196,7 +196,7 @@ void F77_dgbmv(int *order, char *transp, int *m, int *n, int *kl, int *ku,
if (*order == TEST_ROW_MJR) {
LDA = *ku+*kl+2;
A = ( double* )malloc( (*n+*kl)*LDA*sizeof( double ) );
A = ( double* )malloc( (*n+*kl)*(size_t)LDA*sizeof( double ) );
for( i=0; i<*ku; i++ ){
irow=*ku+*kl-i;
jcol=(*ku)-i;
@@ -236,7 +236,7 @@ void F77_dtbmv(int *order, char *uplow, char *transp, char *diagn,
if (*order == TEST_ROW_MJR) {
LDA = *k+1;
A = ( double* )malloc( (*n+*k)*LDA*sizeof( double ) );
A = ( double* )malloc( (*n+*k)*(size_t)LDA*sizeof( double ) );
if (uplo == CblasUpper) {
for( i=0; i<*k; i++ ){
irow=*k-i;
@@ -282,7 +282,7 @@ void F77_dtbsv(int *order, char *uplow, char *transp, char *diagn,
if (*order == TEST_ROW_MJR) {
LDA = *k+1;
A = ( double* )malloc( (*n+*k)*LDA*sizeof( double ) );
A = ( double* )malloc( (*n+*k)*(size_t)LDA*sizeof( double ) );
if (uplo == CblasUpper) {
for( i=0; i<*k; i++ ){
irow=*k-i;
@@ -325,7 +325,7 @@ void F77_dsbmv(int *order, char *uplow, int *n, int *k, double *alpha,
if (*order == TEST_ROW_MJR) {
LDA = *k+1;
A = ( double* )malloc( (*n+*k)*LDA*sizeof( double ) );
A = ( double* )malloc( (*n+*k)*(size_t)LDA*sizeof( double ) );
if (uplo == CblasUpper) {
for( i=0; i<*k; i++ ){
irow=*k-i;
@@ -369,8 +369,8 @@ void F77_dspmv(int *order, char *uplow, int *n, double *alpha, double *ap,
if (*order == TEST_ROW_MJR) {
LDA = *n;
A = ( double* )malloc( LDA*LDA*sizeof( double ) );
AP = ( double* )malloc( (((LDA+1)*LDA)/2)*sizeof( double ) );
A = ( double* )malloc( (size_t)LDA*LDA*sizeof( double ) );
AP = ( double* )malloc( ((((size_t)LDA+1)*LDA)/2)*sizeof( double ) );
if (uplo == CblasUpper) {
for( j=0, k=0; j<*n; j++ )
for( i=0; i<j+1; i++, k++ )
@@ -411,8 +411,8 @@ void F77_dtpmv(int *order, char *uplow, char *transp, char *diagn,
if (*order == TEST_ROW_MJR) {
LDA = *n;
A = ( double* )malloc( LDA*LDA*sizeof( double ) );
AP = ( double* )malloc( (((LDA+1)*LDA)/2)*sizeof( double ) );
A = ( double* )malloc( (size_t)LDA*LDA*sizeof( double ) );
AP = ( double* )malloc( ((((size_t)LDA+1)*LDA)/2)*sizeof( double ) );
if (uplo == CblasUpper) {
for( j=0, k=0; j<*n; j++ )
for( i=0; i<j+1; i++, k++ )
@@ -451,8 +451,8 @@ void F77_dtpsv(int *order, char *uplow, char *transp, char *diagn,
if (*order == TEST_ROW_MJR) {
LDA = *n;
A = ( double* )malloc( LDA*LDA*sizeof( double ) );
AP = ( double* )malloc( (((LDA+1)*LDA)/2)*sizeof( double ) );
A = ( double* )malloc( (size_t)LDA*LDA*sizeof( double ) );
AP = ( double* )malloc( ((((size_t)LDA+1)*LDA)/2)*sizeof( double ) );
if (uplo == CblasUpper) {
for( j=0, k=0; j<*n; j++ )
for( i=0; i<j+1; i++, k++ )
@@ -488,8 +488,8 @@ void F77_dspr(int *order, char *uplow, int *n, double *alpha, double *x,
if (*order == TEST_ROW_MJR) {
LDA = *n;
A = ( double* )malloc( LDA*LDA*sizeof( double ) );
AP = ( double* )malloc( (((LDA+1)*LDA)/2)*sizeof( double ) );
A = ( double* )malloc( (size_t)LDA*LDA*sizeof( double ) );
AP = ( double* )malloc( ((((size_t)LDA+1)*LDA)/2)*sizeof( double ) );
if (uplo == CblasUpper) {
for( j=0, k=0; j<*n; j++ )
for( i=0; i<j+1; i++, k++ )
@@ -540,8 +540,8 @@ void F77_dspr2(int *order, char *uplow, int *n, double *alpha, double *x,
if (*order == TEST_ROW_MJR) {
LDA = *n;
A = ( double* )malloc( LDA*LDA*sizeof( double ) );
AP = ( double* )malloc( (((LDA+1)*LDA)/2)*sizeof( double ) );
A = ( double* )malloc( (size_t)LDA*LDA*sizeof( double ) );
AP = ( double* )malloc( ((((size_t)LDA+1)*LDA)/2)*sizeof( double ) );
if (uplo == CblasUpper) {
for( j=0, k=0; j<*n; j++ )
for( i=0; i<j+1; i++, k++ )

View File

@@ -26,34 +26,34 @@ void F77_dgemm(int *order, char *transpa, char *transpb, int *m, int *n,
if (*order == TEST_ROW_MJR) {
if (transa == CblasNoTrans) {
LDA = *k+1;
A = (double *)malloc( (*m)*LDA*sizeof( double ) );
A = (double *)malloc( (*m)*(size_t)LDA*sizeof( double ) );
for( i=0; i<*m; i++ )
for( j=0; j<*k; j++ )
A[i*LDA+j]=a[j*(*lda)+i];
}
else {
LDA = *m+1;
A = ( double* )malloc( LDA*(*k)*sizeof( double ) );
A = ( double* )malloc( (size_t)LDA*(*k)*sizeof( double ) );
for( i=0; i<*k; i++ )
for( j=0; j<*m; j++ )
A[i*LDA+j]=a[j*(*lda)+i];
}
if (transb == CblasNoTrans) {
LDB = *n+1;
B = ( double* )malloc( (*k)*LDB*sizeof( double ) );
B = ( double* )malloc( (*k)*(size_t)LDB*sizeof( double ) );
for( i=0; i<*k; i++ )
for( j=0; j<*n; j++ )
B[i*LDB+j]=b[j*(*ldb)+i];
}
else {
LDB = *k+1;
B = ( double* )malloc( LDB*(*n)*sizeof( double ) );
B = ( double* )malloc( (size_t)LDB*(*n)*sizeof( double ) );
for( i=0; i<*n; i++ )
for( j=0; j<*k; j++ )
B[i*LDB+j]=b[j*(*ldb)+i];
}
LDC = *n+1;
C = ( double* )malloc( (*m)*LDC*sizeof( double ) );
C = ( double* )malloc( (*m)*(size_t)LDC*sizeof( double ) );
for( j=0; j<*n; j++ )
for( i=0; i<*m; i++ )
C[i*LDC+j]=c[j*(*ldc)+i];
@@ -89,25 +89,25 @@ void F77_dsymm(int *order, char *rtlf, char *uplow, int *m, int *n,
if (*order == TEST_ROW_MJR) {
if (side == CblasLeft) {
LDA = *m+1;
A = ( double* )malloc( (*m)*LDA*sizeof( double ) );
A = ( double* )malloc( (*m)*(size_t)LDA*sizeof( double ) );
for( i=0; i<*m; i++ )
for( j=0; j<*m; j++ )
A[i*LDA+j]=a[j*(*lda)+i];
}
else{
LDA = *n+1;
A = ( double* )malloc( (*n)*LDA*sizeof( double ) );
A = ( double* )malloc( (*n)*(size_t)LDA*sizeof( double ) );
for( i=0; i<*n; i++ )
for( j=0; j<*n; j++ )
A[i*LDA+j]=a[j*(*lda)+i];
}
LDB = *n+1;
B = ( double* )malloc( (*m)*LDB*sizeof( double ) );
B = ( double* )malloc( (*m)*(size_t)LDB*sizeof( double ) );
for( i=0; i<*m; i++ )
for( j=0; j<*n; j++ )
B[i*LDB+j]=b[j*(*ldb)+i];
LDC = *n+1;
C = ( double* )malloc( (*m)*LDC*sizeof( double ) );
C = ( double* )malloc( (*m)*(size_t)LDC*sizeof( double ) );
for( j=0; j<*n; j++ )
for( i=0; i<*m; i++ )
C[i*LDC+j]=c[j*(*ldc)+i];
@@ -143,20 +143,20 @@ void F77_dsyrk(int *order, char *uplow, char *transp, int *n, int *k,
if (*order == TEST_ROW_MJR) {
if (trans == CblasNoTrans) {
LDA = *k+1;
A = ( double* )malloc( (*n)*LDA*sizeof( double ) );
A = ( double* )malloc( (*n)*(size_t)LDA*sizeof( double ) );
for( i=0; i<*n; i++ )
for( j=0; j<*k; j++ )
A[i*LDA+j]=a[j*(*lda)+i];
}
else{
LDA = *n+1;
A = ( double* )malloc( (*k)*LDA*sizeof( double ) );
A = ( double* )malloc( (*k)*(size_t)LDA*sizeof( double ) );
for( i=0; i<*k; i++ )
for( j=0; j<*n; j++ )
A[i*LDA+j]=a[j*(*lda)+i];
}
LDC = *n+1;
C = ( double* )malloc( (*n)*LDC*sizeof( double ) );
C = ( double* )malloc( (*n)*(size_t)LDC*sizeof( double ) );
for( i=0; i<*n; i++ )
for( j=0; j<*n; j++ )
C[i*LDC+j]=c[j*(*ldc)+i];
@@ -191,8 +191,8 @@ void F77_dsyr2k(int *order, char *uplow, char *transp, int *n, int *k,
if (trans == CblasNoTrans) {
LDA = *k+1;
LDB = *k+1;
A = ( double* )malloc( (*n)*LDA*sizeof( double ) );
B = ( double* )malloc( (*n)*LDB*sizeof( double ) );
A = ( double* )malloc( (*n)*(size_t)LDA*sizeof( double ) );
B = ( double* )malloc( (*n)*(size_t)LDB*sizeof( double ) );
for( i=0; i<*n; i++ )
for( j=0; j<*k; j++ ) {
A[i*LDA+j]=a[j*(*lda)+i];
@@ -202,8 +202,8 @@ void F77_dsyr2k(int *order, char *uplow, char *transp, int *n, int *k,
else {
LDA = *n+1;
LDB = *n+1;
A = ( double* )malloc( LDA*(*k)*sizeof( double ) );
B = ( double* )malloc( LDB*(*k)*sizeof( double ) );
A = ( double* )malloc( (size_t)LDA*(*k)*sizeof( double ) );
B = ( double* )malloc( (size_t)LDB*(*k)*sizeof( double ) );
for( i=0; i<*k; i++ )
for( j=0; j<*n; j++ ){
A[i*LDA+j]=a[j*(*lda)+i];
@@ -211,7 +211,7 @@ void F77_dsyr2k(int *order, char *uplow, char *transp, int *n, int *k,
}
}
LDC = *n+1;
C = ( double* )malloc( (*n)*LDC*sizeof( double ) );
C = ( double* )malloc( (*n)*(size_t)LDC*sizeof( double ) );
for( i=0; i<*n; i++ )
for( j=0; j<*n; j++ )
C[i*LDC+j]=c[j*(*ldc)+i];
@@ -249,20 +249,20 @@ void F77_dtrmm(int *order, char *rtlf, char *uplow, char *transp, char *diagn,
if (*order == TEST_ROW_MJR) {
if (side == CblasLeft) {
LDA = *m+1;
A = ( double* )malloc( (*m)*LDA*sizeof( double ) );
A = ( double* )malloc( (*m)*(size_t)LDA*sizeof( double ) );
for( i=0; i<*m; i++ )
for( j=0; j<*m; j++ )
A[i*LDA+j]=a[j*(*lda)+i];
}
else{
LDA = *n+1;
A = ( double* )malloc( (*n)*LDA*sizeof( double ) );
A = ( double* )malloc( (*n)*(size_t)LDA*sizeof( double ) );
for( i=0; i<*n; i++ )
for( j=0; j<*n; j++ )
A[i*LDA+j]=a[j*(*lda)+i];
}
LDB = *n+1;
B = ( double* )malloc( (*m)*LDB*sizeof( double ) );
B = ( double* )malloc( (*m)*(size_t)LDB*sizeof( double ) );
for( i=0; i<*m; i++ )
for( j=0; j<*n; j++ )
B[i*LDB+j]=b[j*(*ldb)+i];
@@ -300,20 +300,20 @@ void F77_dtrsm(int *order, char *rtlf, char *uplow, char *transp, char *diagn,
if (*order == TEST_ROW_MJR) {
if (side == CblasLeft) {
LDA = *m+1;
A = ( double* )malloc( (*m)*LDA*sizeof( double ) );
A = ( double* )malloc( (*m)*(size_t)LDA*sizeof( double ) );
for( i=0; i<*m; i++ )
for( j=0; j<*m; j++ )
A[i*LDA+j]=a[j*(*lda)+i];
}
else{
LDA = *n+1;
A = ( double* )malloc( (*n)*LDA*sizeof( double ) );
A = ( double* )malloc( (*n)*(size_t)LDA*sizeof( double ) );
for( i=0; i<*n; i++ )
for( j=0; j<*n; j++ )
A[i*LDA+j]=a[j*(*lda)+i];
}
LDB = *n+1;
B = ( double* )malloc( (*m)*LDB*sizeof( double ) );
B = ( double* )malloc( (*m)*(size_t)LDB*sizeof( double ) );
for( i=0; i<*m; i++ )
for( j=0; j<*n; j++ )
B[i*LDB+j]=b[j*(*ldb)+i];

View File

@@ -19,7 +19,7 @@ void F77_sgemv(int *order, char *transp, int *m, int *n, float *alpha,
get_transpose_type(transp, &trans);
if (*order == TEST_ROW_MJR) {
LDA = *n+1;
A = ( float* )malloc( (*m)*LDA*sizeof( float ) );
A = ( float* )malloc( (*m)*(size_t)LDA*sizeof( float ) );
for( i=0; i<*m; i++ )
for( j=0; j<*n; j++ )
A[ LDA*i+j ]=a[ (*lda)*j+i ];
@@ -43,7 +43,7 @@ void F77_sger(int *order, int *m, int *n, float *alpha, float *x, int *incx,
if (*order == TEST_ROW_MJR) {
LDA = *n+1;
A = ( float* )malloc( (*m)*LDA*sizeof( float ) );
A = ( float* )malloc( (*m)*(size_t)LDA*sizeof( float ) );
for( i=0; i<*m; i++ ) {
for( j=0; j<*n; j++ )
@@ -74,7 +74,7 @@ void F77_strmv(int *order, char *uplow, char *transp, char *diagn,
if (*order == TEST_ROW_MJR) {
LDA = *n+1;
A = ( float* )malloc( (*n)*LDA*sizeof( float ) );
A = ( float* )malloc( (*n)*(size_t)LDA*sizeof( float ) );
for( i=0; i<*n; i++ )
for( j=0; j<*n; j++ )
A[ LDA*i+j ]=a[ (*lda)*j+i ];
@@ -102,7 +102,7 @@ void F77_strsv(int *order, char *uplow, char *transp, char *diagn,
if (*order == TEST_ROW_MJR) {
LDA = *n+1;
A = ( float* )malloc( (*n)*LDA*sizeof( float ) );
A = ( float* )malloc( (*n)*(size_t)LDA*sizeof( float ) );
for( i=0; i<*n; i++ )
for( j=0; j<*n; j++ )
A[ LDA*i+j ]=a[ (*lda)*j+i ];
@@ -123,7 +123,7 @@ void F77_ssymv(int *order, char *uplow, int *n, float *alpha, float *a,
if (*order == TEST_ROW_MJR) {
LDA = *n+1;
A = ( float* )malloc( (*n)*LDA*sizeof( float ) );
A = ( float* )malloc( (*n)*(size_t)LDA*sizeof( float ) );
for( i=0; i<*n; i++ )
for( j=0; j<*n; j++ )
A[ LDA*i+j ]=a[ (*lda)*j+i ];
@@ -146,7 +146,7 @@ void F77_ssyr(int *order, char *uplow, int *n, float *alpha, float *x,
if (*order == TEST_ROW_MJR) {
LDA = *n+1;
A = ( float* )malloc( (*n)*LDA*sizeof( float ) );
A = ( float* )malloc( (*n)*(size_t)LDA*sizeof( float ) );
for( i=0; i<*n; i++ )
for( j=0; j<*n; j++ )
A[ LDA*i+j ]=a[ (*lda)*j+i ];
@@ -170,7 +170,7 @@ void F77_ssyr2(int *order, char *uplow, int *n, float *alpha, float *x,
if (*order == TEST_ROW_MJR) {
LDA = *n+1;
A = ( float* )malloc( (*n)*LDA*sizeof( float ) );
A = ( float* )malloc( (*n)*(size_t)LDA*sizeof( float ) );
for( i=0; i<*n; i++ )
for( j=0; j<*n; j++ )
A[ LDA*i+j ]=a[ (*lda)*j+i ];
@@ -196,7 +196,7 @@ void F77_sgbmv(int *order, char *transp, int *m, int *n, int *kl, int *ku,
if (*order == TEST_ROW_MJR) {
LDA = *ku+*kl+2;
A = ( float* )malloc( (*n+*kl)*LDA*sizeof( float ) );
A = ( float* )malloc( (*n+*kl)*(size_t)LDA*sizeof( float ) );
for( i=0; i<*ku; i++ ){
irow=*ku+*kl-i;
jcol=(*ku)-i;
@@ -236,7 +236,7 @@ void F77_stbmv(int *order, char *uplow, char *transp, char *diagn,
if (*order == TEST_ROW_MJR) {
LDA = *k+1;
A = ( float* )malloc( (*n+*k)*LDA*sizeof( float ) );
A = ( float* )malloc( (*n+*k)*(size_t)LDA*sizeof( float ) );
if (uplo == CblasUpper) {
for( i=0; i<*k; i++ ){
irow=*k-i;
@@ -282,7 +282,7 @@ void F77_stbsv(int *order, char *uplow, char *transp, char *diagn,
if (*order == TEST_ROW_MJR) {
LDA = *k+1;
A = ( float* )malloc( (*n+*k)*LDA*sizeof( float ) );
A = ( float* )malloc( (*n+*k)*(size_t)LDA*sizeof( float ) );
if (uplo == CblasUpper) {
for( i=0; i<*k; i++ ){
irow=*k-i;
@@ -325,7 +325,7 @@ void F77_ssbmv(int *order, char *uplow, int *n, int *k, float *alpha,
if (*order == TEST_ROW_MJR) {
LDA = *k+1;
A = ( float* )malloc( (*n+*k)*LDA*sizeof( float ) );
A = ( float* )malloc( (*n+*k)*(size_t)LDA*sizeof( float ) );
if (uplo == CblasUpper) {
for( i=0; i<*k; i++ ){
irow=*k-i;
@@ -369,8 +369,8 @@ void F77_sspmv(int *order, char *uplow, int *n, float *alpha, float *ap,
if (*order == TEST_ROW_MJR) {
LDA = *n;
A = ( float* )malloc( LDA*LDA*sizeof( float ) );
AP = ( float* )malloc( (((LDA+1)*LDA)/2)*sizeof( float ) );
A = ( float* )malloc( (size_t)LDA*LDA*sizeof( float ) );
AP = ( float* )malloc( ((((size_t)LDA+1)*LDA)/2)*sizeof( float ) );
if (uplo == CblasUpper) {
for( j=0, k=0; j<*n; j++ )
for( i=0; i<j+1; i++, k++ )
@@ -410,8 +410,8 @@ void F77_stpmv(int *order, char *uplow, char *transp, char *diagn,
if (*order == TEST_ROW_MJR) {
LDA = *n;
A = ( float* )malloc( LDA*LDA*sizeof( float ) );
AP = ( float* )malloc( (((LDA+1)*LDA)/2)*sizeof( float ) );
A = ( float* )malloc( (size_t)LDA*LDA*sizeof( float ) );
AP = ( float* )malloc( ((((size_t)LDA+1)*LDA)/2)*sizeof( float ) );
if (uplo == CblasUpper) {
for( j=0, k=0; j<*n; j++ )
for( i=0; i<j+1; i++, k++ )
@@ -449,8 +449,8 @@ void F77_stpsv(int *order, char *uplow, char *transp, char *diagn,
if (*order == TEST_ROW_MJR) {
LDA = *n;
A = ( float* )malloc( LDA*LDA*sizeof( float ) );
AP = ( float* )malloc( (((LDA+1)*LDA)/2)*sizeof( float ) );
A = ( float* )malloc( (size_t)LDA*LDA*sizeof( float ) );
AP = ( float* )malloc( ((((size_t)LDA+1)*LDA)/2)*sizeof( float ) );
if (uplo == CblasUpper) {
for( j=0, k=0; j<*n; j++ )
for( i=0; i<j+1; i++, k++ )
@@ -485,8 +485,8 @@ void F77_sspr(int *order, char *uplow, int *n, float *alpha, float *x,
if (*order == TEST_ROW_MJR) {
LDA = *n;
A = ( float* )malloc( LDA*LDA*sizeof( float ) );
AP = ( float* )malloc( (((LDA+1)*LDA)/2)*sizeof( float ) );
A = ( float* )malloc( (size_t)LDA*LDA*sizeof( float ) );
AP = ( float* )malloc( ((((size_t)LDA+1)*LDA)/2)*sizeof( float ) );
if (uplo == CblasUpper) {
for( j=0, k=0; j<*n; j++ )
for( i=0; i<j+1; i++, k++ )
@@ -536,8 +536,8 @@ void F77_sspr2(int *order, char *uplow, int *n, float *alpha, float *x,
if (*order == TEST_ROW_MJR) {
LDA = *n;
A = ( float* )malloc( LDA*LDA*sizeof( float ) );
AP = ( float* )malloc( (((LDA+1)*LDA)/2)*sizeof( float ) );
A = ( float* )malloc( (size_t)LDA*LDA*sizeof( float ) );
AP = ( float* )malloc( ((((size_t)LDA+1)*LDA)/2)*sizeof( float ) );
if (uplo == CblasUpper) {
for( j=0, k=0; j<*n; j++ )
for( i=0; i<j+1; i++, k++ )

View File

@@ -23,34 +23,34 @@ void F77_sgemm(int *order, char *transpa, char *transpb, int *m, int *n,
if (*order == TEST_ROW_MJR) {
if (transa == CblasNoTrans) {
LDA = *k+1;
A = (float *)malloc( (*m)*LDA*sizeof( float ) );
A = (float *)malloc( (*m)*(size_t)LDA*sizeof( float ) );
for( i=0; i<*m; i++ )
for( j=0; j<*k; j++ )
A[i*LDA+j]=a[j*(*lda)+i];
}
else {
LDA = *m+1;
A = ( float* )malloc( LDA*(*k)*sizeof( float ) );
A = ( float* )malloc( (size_t)LDA*(*k)*sizeof( float ) );
for( i=0; i<*k; i++ )
for( j=0; j<*m; j++ )
A[i*LDA+j]=a[j*(*lda)+i];
}
if (transb == CblasNoTrans) {
LDB = *n+1;
B = ( float* )malloc( (*k)*LDB*sizeof( float ) );
B = ( float* )malloc( (*k)*(size_t)LDB*sizeof( float ) );
for( i=0; i<*k; i++ )
for( j=0; j<*n; j++ )
B[i*LDB+j]=b[j*(*ldb)+i];
}
else {
LDB = *k+1;
B = ( float* )malloc( LDB*(*n)*sizeof( float ) );
B = ( float* )malloc( (size_t)LDB*(*n)*sizeof( float ) );
for( i=0; i<*n; i++ )
for( j=0; j<*k; j++ )
B[i*LDB+j]=b[j*(*ldb)+i];
}
LDC = *n+1;
C = ( float* )malloc( (*m)*LDC*sizeof( float ) );
C = ( float* )malloc( (*m)*(size_t)LDC*sizeof( float ) );
for( j=0; j<*n; j++ )
for( i=0; i<*m; i++ )
C[i*LDC+j]=c[j*(*ldc)+i];
@@ -85,25 +85,25 @@ void F77_ssymm(int *order, char *rtlf, char *uplow, int *m, int *n,
if (*order == TEST_ROW_MJR) {
if (side == CblasLeft) {
LDA = *m+1;
A = ( float* )malloc( (*m)*LDA*sizeof( float ) );
A = ( float* )malloc( (*m)*(size_t)LDA*sizeof( float ) );
for( i=0; i<*m; i++ )
for( j=0; j<*m; j++ )
A[i*LDA+j]=a[j*(*lda)+i];
}
else{
LDA = *n+1;
A = ( float* )malloc( (*n)*LDA*sizeof( float ) );
A = ( float* )malloc( (*n)*(size_t)LDA*sizeof( float ) );
for( i=0; i<*n; i++ )
for( j=0; j<*n; j++ )
A[i*LDA+j]=a[j*(*lda)+i];
}
LDB = *n+1;
B = ( float* )malloc( (*m)*LDB*sizeof( float ) );
B = ( float* )malloc( (*m)*(size_t)LDB*sizeof( float ) );
for( i=0; i<*m; i++ )
for( j=0; j<*n; j++ )
B[i*LDB+j]=b[j*(*ldb)+i];
LDC = *n+1;
C = ( float* )malloc( (*m)*LDC*sizeof( float ) );
C = ( float* )malloc( (*m)*(size_t)LDC*sizeof( float ) );
for( j=0; j<*n; j++ )
for( i=0; i<*m; i++ )
C[i*LDC+j]=c[j*(*ldc)+i];
@@ -139,20 +139,20 @@ void F77_ssyrk(int *order, char *uplow, char *transp, int *n, int *k,
if (*order == TEST_ROW_MJR) {
if (trans == CblasNoTrans) {
LDA = *k+1;
A = ( float* )malloc( (*n)*LDA*sizeof( float ) );
A = ( float* )malloc( (*n)*(size_t)LDA*sizeof( float ) );
for( i=0; i<*n; i++ )
for( j=0; j<*k; j++ )
A[i*LDA+j]=a[j*(*lda)+i];
}
else{
LDA = *n+1;
A = ( float* )malloc( (*k)*LDA*sizeof( float ) );
A = ( float* )malloc( (*k)*(size_t)LDA*sizeof( float ) );
for( i=0; i<*k; i++ )
for( j=0; j<*n; j++ )
A[i*LDA+j]=a[j*(*lda)+i];
}
LDC = *n+1;
C = ( float* )malloc( (*n)*LDC*sizeof( float ) );
C = ( float* )malloc( (*n)*(size_t)LDC*sizeof( float ) );
for( i=0; i<*n; i++ )
for( j=0; j<*n; j++ )
C[i*LDC+j]=c[j*(*ldc)+i];
@@ -187,8 +187,8 @@ void F77_ssyr2k(int *order, char *uplow, char *transp, int *n, int *k,
if (trans == CblasNoTrans) {
LDA = *k+1;
LDB = *k+1;
A = ( float* )malloc( (*n)*LDA*sizeof( float ) );
B = ( float* )malloc( (*n)*LDB*sizeof( float ) );
A = ( float* )malloc( (*n)*(size_t)LDA*sizeof( float ) );
B = ( float* )malloc( (*n)*(size_t)LDB*sizeof( float ) );
for( i=0; i<*n; i++ )
for( j=0; j<*k; j++ ) {
A[i*LDA+j]=a[j*(*lda)+i];
@@ -198,8 +198,8 @@ void F77_ssyr2k(int *order, char *uplow, char *transp, int *n, int *k,
else {
LDA = *n+1;
LDB = *n+1;
A = ( float* )malloc( LDA*(*k)*sizeof( float ) );
B = ( float* )malloc( LDB*(*k)*sizeof( float ) );
A = ( float* )malloc( (size_t)LDA*(*k)*sizeof( float ) );
B = ( float* )malloc( (size_t)LDB*(*k)*sizeof( float ) );
for( i=0; i<*k; i++ )
for( j=0; j<*n; j++ ){
A[i*LDA+j]=a[j*(*lda)+i];
@@ -207,7 +207,7 @@ void F77_ssyr2k(int *order, char *uplow, char *transp, int *n, int *k,
}
}
LDC = *n+1;
C = ( float* )malloc( (*n)*LDC*sizeof( float ) );
C = ( float* )malloc( (*n)*(size_t)LDC*sizeof( float ) );
for( i=0; i<*n; i++ )
for( j=0; j<*n; j++ )
C[i*LDC+j]=c[j*(*ldc)+i];
@@ -245,20 +245,20 @@ void F77_strmm(int *order, char *rtlf, char *uplow, char *transp, char *diagn,
if (*order == TEST_ROW_MJR) {
if (side == CblasLeft) {
LDA = *m+1;
A = ( float* )malloc( (*m)*LDA*sizeof( float ) );
A = ( float* )malloc( (*m)*(size_t)LDA*sizeof( float ) );
for( i=0; i<*m; i++ )
for( j=0; j<*m; j++ )
A[i*LDA+j]=a[j*(*lda)+i];
}
else{
LDA = *n+1;
A = ( float* )malloc( (*n)*LDA*sizeof( float ) );
A = ( float* )malloc( (*n)*(size_t)LDA*sizeof( float ) );
for( i=0; i<*n; i++ )
for( j=0; j<*n; j++ )
A[i*LDA+j]=a[j*(*lda)+i];
}
LDB = *n+1;
B = ( float* )malloc( (*m)*LDB*sizeof( float ) );
B = ( float* )malloc( (*m)*(size_t)LDB*sizeof( float ) );
for( i=0; i<*m; i++ )
for( j=0; j<*n; j++ )
B[i*LDB+j]=b[j*(*ldb)+i];
@@ -296,20 +296,20 @@ void F77_strsm(int *order, char *rtlf, char *uplow, char *transp, char *diagn,
if (*order == TEST_ROW_MJR) {
if (side == CblasLeft) {
LDA = *m+1;
A = ( float* )malloc( (*m)*LDA*sizeof( float ) );
A = ( float* )malloc( (*m)*(size_t)LDA*sizeof( float ) );
for( i=0; i<*m; i++ )
for( j=0; j<*m; j++ )
A[i*LDA+j]=a[j*(*lda)+i];
}
else{
LDA = *n+1;
A = ( float* )malloc( (*n)*LDA*sizeof( float ) );
A = ( float* )malloc( (*n)*(size_t)LDA*sizeof( float ) );
for( i=0; i<*n; i++ )
for( j=0; j<*n; j++ )
A[i*LDA+j]=a[j*(*lda)+i];
}
LDB = *n+1;
B = ( float* )malloc( (*m)*LDB*sizeof( float ) );
B = ( float* )malloc( (*m)*(size_t)LDB*sizeof( float ) );
for( i=0; i<*m; i++ )
for( j=0; j<*n; j++ )
B[i*LDB+j]=b[j*(*ldb)+i];

View File

@@ -20,7 +20,7 @@ void F77_zgemv(int *order, char *transp, int *m, int *n,
get_transpose_type(transp, &trans);
if (*order == TEST_ROW_MJR) {
LDA = *n+1;
A = (CBLAS_TEST_ZOMPLEX *)malloc( (*m)*LDA*sizeof( CBLAS_TEST_ZOMPLEX) );
A = (CBLAS_TEST_ZOMPLEX *)malloc( (*m)*(size_t)LDA*sizeof( CBLAS_TEST_ZOMPLEX) );
for( i=0; i<*m; i++ )
for( j=0; j<*n; j++ ){
A[ LDA*i+j ].real=a[ (*lda)*j+i ].real;
@@ -50,7 +50,7 @@ void F77_zgbmv(int *order, char *transp, int *m, int *n, int *kl, int *ku,
get_transpose_type(transp, &trans);
if (*order == TEST_ROW_MJR) {
LDA = *ku+*kl+2;
A=( CBLAS_TEST_ZOMPLEX* )malloc((*n+*kl)*LDA*sizeof(CBLAS_TEST_ZOMPLEX));
A=( CBLAS_TEST_ZOMPLEX* )malloc((*n+*kl)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX));
for( i=0; i<*ku; i++ ){
irow=*ku+*kl-i;
jcol=(*ku)-i;
@@ -94,7 +94,7 @@ void F77_zgeru(int *order, int *m, int *n, CBLAS_TEST_ZOMPLEX *alpha,
if (*order == TEST_ROW_MJR) {
LDA = *n+1;
A=(CBLAS_TEST_ZOMPLEX*)malloc((*m)*LDA*sizeof(CBLAS_TEST_ZOMPLEX));
A=(CBLAS_TEST_ZOMPLEX*)malloc((*m)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX));
for( i=0; i<*m; i++ )
for( j=0; j<*n; j++ ){
A[ LDA*i+j ].real=a[ (*lda)*j+i ].real;
@@ -122,7 +122,7 @@ void F77_zgerc(int *order, int *m, int *n, CBLAS_TEST_ZOMPLEX *alpha,
if (*order == TEST_ROW_MJR) {
LDA = *n+1;
A=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
A=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
for( i=0; i<*m; i++ )
for( j=0; j<*n; j++ ){
A[ LDA*i+j ].real=a[ (*lda)*j+i ].real;
@@ -154,7 +154,7 @@ void F77_zhemv(int *order, char *uplow, int *n, CBLAS_TEST_ZOMPLEX *alpha,
if (*order == TEST_ROW_MJR) {
LDA = *n+1;
A = (CBLAS_TEST_ZOMPLEX *)malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX));
A = (CBLAS_TEST_ZOMPLEX *)malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX));
for( i=0; i<*n; i++ )
for( j=0; j<*n; j++ ){
A[ LDA*i+j ].real=a[ (*lda)*j+i ].real;
@@ -190,7 +190,7 @@ int i,irow,j,jcol,LDA;
*incx, beta, y, *incy );
else {
LDA = *k+2;
A =(CBLAS_TEST_ZOMPLEX*)malloc((*n+*k)*LDA*sizeof(CBLAS_TEST_ZOMPLEX));
A =(CBLAS_TEST_ZOMPLEX*)malloc((*n+*k)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX));
if (uplo == CblasUpper) {
for( i=0; i<*k; i++ ){
irow=*k-i;
@@ -251,8 +251,8 @@ void F77_zhpmv(int *order, char *uplow, int *n, CBLAS_TEST_ZOMPLEX *alpha,
beta, y, *incy);
else {
LDA = *n;
A = (CBLAS_TEST_ZOMPLEX* )malloc(LDA*LDA*sizeof(CBLAS_TEST_ZOMPLEX ));
AP = (CBLAS_TEST_ZOMPLEX* )malloc( (((LDA+1)*LDA)/2)*
A = (CBLAS_TEST_ZOMPLEX* )malloc((size_t)LDA*LDA*sizeof(CBLAS_TEST_ZOMPLEX ));
AP = (CBLAS_TEST_ZOMPLEX* )malloc( ((((size_t)LDA+1)*LDA)/2)*
sizeof( CBLAS_TEST_ZOMPLEX ));
if (uplo == CblasUpper) {
for( j=0, k=0; j<*n; j++ )
@@ -311,7 +311,7 @@ void F77_ztbmv(int *order, char *uplow, char *transp, char *diagn,
x, *incx);
else {
LDA = *k+2;
A=(CBLAS_TEST_ZOMPLEX *)malloc((*n+*k)*LDA*sizeof(CBLAS_TEST_ZOMPLEX));
A=(CBLAS_TEST_ZOMPLEX *)malloc((*n+*k)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX));
if (uplo == CblasUpper) {
for( i=0; i<*k; i++ ){
irow=*k-i;
@@ -375,7 +375,7 @@ void F77_ztbsv(int *order, char *uplow, char *transp, char *diagn,
*incx);
else {
LDA = *k+2;
A=(CBLAS_TEST_ZOMPLEX*)malloc((*n+*k)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ));
A=(CBLAS_TEST_ZOMPLEX*)malloc((*n+*k)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX ));
if (uplo == CblasUpper) {
for( i=0; i<*k; i++ ){
irow=*k-i;
@@ -436,8 +436,8 @@ void F77_ztpmv(int *order, char *uplow, char *transp, char *diagn,
cblas_ztpmv( CblasRowMajor, UNDEFINED, trans, diag, *n, ap, x, *incx );
else {
LDA = *n;
A=(CBLAS_TEST_ZOMPLEX*)malloc(LDA*LDA*sizeof(CBLAS_TEST_ZOMPLEX));
AP=(CBLAS_TEST_ZOMPLEX*)malloc((((LDA+1)*LDA)/2)*
A=(CBLAS_TEST_ZOMPLEX*)malloc((size_t)LDA*LDA*sizeof(CBLAS_TEST_ZOMPLEX));
AP=(CBLAS_TEST_ZOMPLEX*)malloc(((((size_t)LDA+1)*LDA)/2)*
sizeof(CBLAS_TEST_ZOMPLEX));
if (uplo == CblasUpper) {
for( j=0, k=0; j<*n; j++ )
@@ -491,8 +491,8 @@ void F77_ztpsv(int *order, char *uplow, char *transp, char *diagn,
cblas_ztpsv( CblasRowMajor, UNDEFINED, trans, diag, *n, ap, x, *incx );
else {
LDA = *n;
A=(CBLAS_TEST_ZOMPLEX*)malloc(LDA*LDA*sizeof(CBLAS_TEST_ZOMPLEX));
AP=(CBLAS_TEST_ZOMPLEX*)malloc((((LDA+1)*LDA)/2)*
A=(CBLAS_TEST_ZOMPLEX*)malloc((size_t)LDA*LDA*sizeof(CBLAS_TEST_ZOMPLEX));
AP=(CBLAS_TEST_ZOMPLEX*)malloc(((((size_t)LDA+1)*LDA)/2)*
sizeof(CBLAS_TEST_ZOMPLEX));
if (uplo == CblasUpper) {
for( j=0, k=0; j<*n; j++ )
@@ -544,7 +544,7 @@ void F77_ztrmv(int *order, char *uplow, char *transp, char *diagn,
if (*order == TEST_ROW_MJR) {
LDA=*n+1;
A=(CBLAS_TEST_ZOMPLEX*)malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX));
A=(CBLAS_TEST_ZOMPLEX*)malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX));
for( i=0; i<*n; i++ )
for( j=0; j<*n; j++ ) {
A[ LDA*i+j ].real=a[ (*lda)*j+i ].real;
@@ -573,7 +573,7 @@ void F77_ztrsv(int *order, char *uplow, char *transp, char *diagn,
if (*order == TEST_ROW_MJR) {
LDA = *n+1;
A =(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
A =(CBLAS_TEST_ZOMPLEX* )malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
for( i=0; i<*n; i++ )
for( j=0; j<*n; j++ ) {
A[ LDA*i+j ].real=a[ (*lda)*j+i ].real;
@@ -601,8 +601,8 @@ void F77_zhpr(int *order, char *uplow, int *n, double *alpha,
cblas_zhpr(CblasRowMajor, UNDEFINED, *n, *alpha, x, *incx, ap );
else {
LDA = *n;
A = (CBLAS_TEST_ZOMPLEX* )malloc(LDA*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
AP = ( CBLAS_TEST_ZOMPLEX* )malloc( (((LDA+1)*LDA)/2)*
A = (CBLAS_TEST_ZOMPLEX* )malloc((size_t)LDA*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
AP = ( CBLAS_TEST_ZOMPLEX* )malloc( ((((size_t)LDA+1)*LDA)/2)*
sizeof( CBLAS_TEST_ZOMPLEX ));
if (uplo == CblasUpper) {
for( j=0, k=0; j<*n; j++ )
@@ -678,8 +678,8 @@ void F77_zhpr2(int *order, char *uplow, int *n, CBLAS_TEST_ZOMPLEX *alpha,
*incy, ap );
else {
LDA = *n;
A=(CBLAS_TEST_ZOMPLEX*)malloc( LDA*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
AP=(CBLAS_TEST_ZOMPLEX*)malloc( (((LDA+1)*LDA)/2)*
A=(CBLAS_TEST_ZOMPLEX*)malloc( (size_t)LDA*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
AP=(CBLAS_TEST_ZOMPLEX*)malloc( ((((size_t)LDA+1)*LDA)/2)*
sizeof( CBLAS_TEST_ZOMPLEX ));
if (uplo == CblasUpper) {
for( j=0, k=0; j<*n; j++ )
@@ -750,7 +750,7 @@ void F77_zher(int *order, char *uplow, int *n, double *alpha,
if (*order == TEST_ROW_MJR) {
LDA = *n+1;
A=(CBLAS_TEST_ZOMPLEX*)malloc((*n)*LDA*sizeof( CBLAS_TEST_ZOMPLEX ));
A=(CBLAS_TEST_ZOMPLEX*)malloc((*n)*(size_t)LDA*sizeof( CBLAS_TEST_ZOMPLEX ));
for( i=0; i<*n; i++ )
for( j=0; j<*n; j++ ) {
@@ -784,7 +784,7 @@ void F77_zher2(int *order, char *uplow, int *n, CBLAS_TEST_ZOMPLEX *alpha,
if (*order == TEST_ROW_MJR) {
LDA = *n+1;
A= ( CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
A= ( CBLAS_TEST_ZOMPLEX* )malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
for( i=0; i<*n; i++ )
for( j=0; j<*n; j++ ) {

View File

@@ -26,7 +26,7 @@ void F77_zgemm(int *order, char *transpa, char *transpb, int *m, int *n,
if (*order == TEST_ROW_MJR) {
if (transa == CblasNoTrans) {
LDA = *k+1;
A=(CBLAS_TEST_ZOMPLEX*)malloc((*m)*LDA*sizeof(CBLAS_TEST_ZOMPLEX));
A=(CBLAS_TEST_ZOMPLEX*)malloc((*m)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX));
for( i=0; i<*m; i++ )
for( j=0; j<*k; j++ ) {
A[i*LDA+j].real=a[j*(*lda)+i].real;
@@ -35,7 +35,7 @@ void F77_zgemm(int *order, char *transpa, char *transpb, int *m, int *n,
}
else {
LDA = *m+1;
A=(CBLAS_TEST_ZOMPLEX* )malloc(LDA*(*k)*sizeof(CBLAS_TEST_ZOMPLEX));
A=(CBLAS_TEST_ZOMPLEX* )malloc((size_t)LDA*(*k)*sizeof(CBLAS_TEST_ZOMPLEX));
for( i=0; i<*k; i++ )
for( j=0; j<*m; j++ ) {
A[i*LDA+j].real=a[j*(*lda)+i].real;
@@ -45,7 +45,7 @@ void F77_zgemm(int *order, char *transpa, char *transpb, int *m, int *n,
if (transb == CblasNoTrans) {
LDB = *n+1;
B=(CBLAS_TEST_ZOMPLEX* )malloc((*k)*LDB*sizeof(CBLAS_TEST_ZOMPLEX) );
B=(CBLAS_TEST_ZOMPLEX* )malloc((*k)*(size_t)LDB*sizeof(CBLAS_TEST_ZOMPLEX) );
for( i=0; i<*k; i++ )
for( j=0; j<*n; j++ ) {
B[i*LDB+j].real=b[j*(*ldb)+i].real;
@@ -54,7 +54,7 @@ void F77_zgemm(int *order, char *transpa, char *transpb, int *m, int *n,
}
else {
LDB = *k+1;
B=(CBLAS_TEST_ZOMPLEX* )malloc(LDB*(*n)*sizeof(CBLAS_TEST_ZOMPLEX));
B=(CBLAS_TEST_ZOMPLEX* )malloc((size_t)LDB*(*n)*sizeof(CBLAS_TEST_ZOMPLEX));
for( i=0; i<*n; i++ )
for( j=0; j<*k; j++ ) {
B[i*LDB+j].real=b[j*(*ldb)+i].real;
@@ -63,7 +63,7 @@ void F77_zgemm(int *order, char *transpa, char *transpb, int *m, int *n,
}
LDC = *n+1;
C=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDC*sizeof(CBLAS_TEST_ZOMPLEX));
C=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*(size_t)LDC*sizeof(CBLAS_TEST_ZOMPLEX));
for( j=0; j<*n; j++ )
for( i=0; i<*m; i++ ) {
C[i*LDC+j].real=c[j*(*ldc)+i].real;
@@ -103,7 +103,7 @@ void F77_zhemm(int *order, char *rtlf, char *uplow, int *m, int *n,
if (*order == TEST_ROW_MJR) {
if (side == CblasLeft) {
LDA = *m+1;
A= (CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDA*sizeof(CBLAS_TEST_ZOMPLEX));
A= (CBLAS_TEST_ZOMPLEX* )malloc((*m)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX));
for( i=0; i<*m; i++ )
for( j=0; j<*m; j++ ) {
A[i*LDA+j].real=a[j*(*lda)+i].real;
@@ -112,7 +112,7 @@ void F77_zhemm(int *order, char *rtlf, char *uplow, int *m, int *n,
}
else{
LDA = *n+1;
A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
for( i=0; i<*n; i++ )
for( j=0; j<*n; j++ ) {
A[i*LDA+j].real=a[j*(*lda)+i].real;
@@ -120,14 +120,14 @@ void F77_zhemm(int *order, char *rtlf, char *uplow, int *m, int *n,
}
}
LDB = *n+1;
B=(CBLAS_TEST_ZOMPLEX* )malloc( (*m)*LDB*sizeof(CBLAS_TEST_ZOMPLEX ) );
B=(CBLAS_TEST_ZOMPLEX* )malloc( (*m)*(size_t)LDB*sizeof(CBLAS_TEST_ZOMPLEX ) );
for( i=0; i<*m; i++ )
for( j=0; j<*n; j++ ) {
B[i*LDB+j].real=b[j*(*ldb)+i].real;
B[i*LDB+j].imag=b[j*(*ldb)+i].imag;
}
LDC = *n+1;
C=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDC*sizeof(CBLAS_TEST_ZOMPLEX ) );
C=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*(size_t)LDC*sizeof(CBLAS_TEST_ZOMPLEX ) );
for( j=0; j<*n; j++ )
for( i=0; i<*m; i++ ) {
C[i*LDC+j].real=c[j*(*ldc)+i].real;
@@ -167,25 +167,25 @@ void F77_zsymm(int *order, char *rtlf, char *uplow, int *m, int *n,
if (*order == TEST_ROW_MJR) {
if (side == CblasLeft) {
LDA = *m+1;
A=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDA*sizeof(CBLAS_TEST_ZOMPLEX));
A=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX));
for( i=0; i<*m; i++ )
for( j=0; j<*m; j++ )
A[i*LDA+j]=a[j*(*lda)+i];
}
else{
LDA = *n+1;
A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
for( i=0; i<*n; i++ )
for( j=0; j<*n; j++ )
A[i*LDA+j]=a[j*(*lda)+i];
}
LDB = *n+1;
B=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDB*sizeof(CBLAS_TEST_ZOMPLEX ));
B=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*(size_t)LDB*sizeof(CBLAS_TEST_ZOMPLEX ));
for( i=0; i<*m; i++ )
for( j=0; j<*n; j++ )
B[i*LDB+j]=b[j*(*ldb)+i];
LDC = *n+1;
C=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDC*sizeof(CBLAS_TEST_ZOMPLEX));
C=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*(size_t)LDC*sizeof(CBLAS_TEST_ZOMPLEX));
for( j=0; j<*n; j++ )
for( i=0; i<*m; i++ )
C[i*LDC+j]=c[j*(*ldc)+i];
@@ -221,7 +221,7 @@ void F77_zherk(int *order, char *uplow, char *transp, int *n, int *k,
if (*order == TEST_ROW_MJR) {
if (trans == CblasNoTrans) {
LDA = *k+1;
A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
for( i=0; i<*n; i++ )
for( j=0; j<*k; j++ ) {
A[i*LDA+j].real=a[j*(*lda)+i].real;
@@ -230,7 +230,7 @@ void F77_zherk(int *order, char *uplow, char *transp, int *n, int *k,
}
else{
LDA = *n+1;
A=(CBLAS_TEST_ZOMPLEX* )malloc((*k)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
A=(CBLAS_TEST_ZOMPLEX* )malloc((*k)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
for( i=0; i<*k; i++ )
for( j=0; j<*n; j++ ) {
A[i*LDA+j].real=a[j*(*lda)+i].real;
@@ -238,7 +238,7 @@ void F77_zherk(int *order, char *uplow, char *transp, int *n, int *k,
}
}
LDC = *n+1;
C=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDC*sizeof(CBLAS_TEST_ZOMPLEX ) );
C=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*(size_t)LDC*sizeof(CBLAS_TEST_ZOMPLEX ) );
for( i=0; i<*n; i++ )
for( j=0; j<*n; j++ ) {
C[i*LDC+j].real=c[j*(*ldc)+i].real;
@@ -277,7 +277,7 @@ void F77_zsyrk(int *order, char *uplow, char *transp, int *n, int *k,
if (*order == TEST_ROW_MJR) {
if (trans == CblasNoTrans) {
LDA = *k+1;
A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX));
A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX));
for( i=0; i<*n; i++ )
for( j=0; j<*k; j++ ) {
A[i*LDA+j].real=a[j*(*lda)+i].real;
@@ -286,7 +286,7 @@ void F77_zsyrk(int *order, char *uplow, char *transp, int *n, int *k,
}
else{
LDA = *n+1;
A=(CBLAS_TEST_ZOMPLEX* )malloc((*k)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
A=(CBLAS_TEST_ZOMPLEX* )malloc((*k)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
for( i=0; i<*k; i++ )
for( j=0; j<*n; j++ ) {
A[i*LDA+j].real=a[j*(*lda)+i].real;
@@ -294,7 +294,7 @@ void F77_zsyrk(int *order, char *uplow, char *transp, int *n, int *k,
}
}
LDC = *n+1;
C=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDC*sizeof(CBLAS_TEST_ZOMPLEX ) );
C=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*(size_t)LDC*sizeof(CBLAS_TEST_ZOMPLEX ) );
for( i=0; i<*n; i++ )
for( j=0; j<*n; j++ ) {
C[i*LDC+j].real=c[j*(*ldc)+i].real;
@@ -333,8 +333,8 @@ void F77_zher2k(int *order, char *uplow, char *transp, int *n, int *k,
if (trans == CblasNoTrans) {
LDA = *k+1;
LDB = *k+1;
A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ));
B=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDB*sizeof(CBLAS_TEST_ZOMPLEX ));
A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX ));
B=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*(size_t)LDB*sizeof(CBLAS_TEST_ZOMPLEX ));
for( i=0; i<*n; i++ )
for( j=0; j<*k; j++ ) {
A[i*LDA+j].real=a[j*(*lda)+i].real;
@@ -346,8 +346,8 @@ void F77_zher2k(int *order, char *uplow, char *transp, int *n, int *k,
else {
LDA = *n+1;
LDB = *n+1;
A=(CBLAS_TEST_ZOMPLEX* )malloc( LDA*(*k)*sizeof(CBLAS_TEST_ZOMPLEX ) );
B=(CBLAS_TEST_ZOMPLEX* )malloc( LDB*(*k)*sizeof(CBLAS_TEST_ZOMPLEX ) );
A=(CBLAS_TEST_ZOMPLEX* )malloc( (size_t)LDA*(*k)*sizeof(CBLAS_TEST_ZOMPLEX ) );
B=(CBLAS_TEST_ZOMPLEX* )malloc( (size_t)LDB*(*k)*sizeof(CBLAS_TEST_ZOMPLEX ) );
for( i=0; i<*k; i++ )
for( j=0; j<*n; j++ ){
A[i*LDA+j].real=a[j*(*lda)+i].real;
@@ -357,7 +357,7 @@ void F77_zher2k(int *order, char *uplow, char *transp, int *n, int *k,
}
}
LDC = *n+1;
C=(CBLAS_TEST_ZOMPLEX* )malloc( (*n)*LDC*sizeof(CBLAS_TEST_ZOMPLEX ) );
C=(CBLAS_TEST_ZOMPLEX* )malloc( (*n)*(size_t)LDC*sizeof(CBLAS_TEST_ZOMPLEX ) );
for( i=0; i<*n; i++ )
for( j=0; j<*n; j++ ) {
C[i*LDC+j].real=c[j*(*ldc)+i].real;
@@ -397,8 +397,8 @@ void F77_zsyr2k(int *order, char *uplow, char *transp, int *n, int *k,
if (trans == CblasNoTrans) {
LDA = *k+1;
LDB = *k+1;
A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX));
B=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDB*sizeof(CBLAS_TEST_ZOMPLEX));
A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX));
B=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*(size_t)LDB*sizeof(CBLAS_TEST_ZOMPLEX));
for( i=0; i<*n; i++ )
for( j=0; j<*k; j++ ) {
A[i*LDA+j].real=a[j*(*lda)+i].real;
@@ -410,8 +410,8 @@ void F77_zsyr2k(int *order, char *uplow, char *transp, int *n, int *k,
else {
LDA = *n+1;
LDB = *n+1;
A=(CBLAS_TEST_ZOMPLEX* )malloc(LDA*(*k)*sizeof(CBLAS_TEST_ZOMPLEX));
B=(CBLAS_TEST_ZOMPLEX* )malloc(LDB*(*k)*sizeof(CBLAS_TEST_ZOMPLEX));
A=(CBLAS_TEST_ZOMPLEX* )malloc((size_t)LDA*(*k)*sizeof(CBLAS_TEST_ZOMPLEX));
B=(CBLAS_TEST_ZOMPLEX* )malloc((size_t)LDB*(*k)*sizeof(CBLAS_TEST_ZOMPLEX));
for( i=0; i<*k; i++ )
for( j=0; j<*n; j++ ){
A[i*LDA+j].real=a[j*(*lda)+i].real;
@@ -421,7 +421,7 @@ void F77_zsyr2k(int *order, char *uplow, char *transp, int *n, int *k,
}
}
LDC = *n+1;
C=(CBLAS_TEST_ZOMPLEX* )malloc( (*n)*LDC*sizeof(CBLAS_TEST_ZOMPLEX));
C=(CBLAS_TEST_ZOMPLEX* )malloc( (*n)*(size_t)LDC*sizeof(CBLAS_TEST_ZOMPLEX));
for( i=0; i<*n; i++ )
for( j=0; j<*n; j++ ) {
C[i*LDC+j].real=c[j*(*ldc)+i].real;
@@ -463,7 +463,7 @@ void F77_ztrmm(int *order, char *rtlf, char *uplow, char *transp, char *diagn,
if (*order == TEST_ROW_MJR) {
if (side == CblasLeft) {
LDA = *m+1;
A=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDA*sizeof(CBLAS_TEST_ZOMPLEX));
A=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX));
for( i=0; i<*m; i++ )
for( j=0; j<*m; j++ ) {
A[i*LDA+j].real=a[j*(*lda)+i].real;
@@ -472,7 +472,7 @@ void F77_ztrmm(int *order, char *rtlf, char *uplow, char *transp, char *diagn,
}
else{
LDA = *n+1;
A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX));
A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX));
for( i=0; i<*n; i++ )
for( j=0; j<*n; j++ ) {
A[i*LDA+j].real=a[j*(*lda)+i].real;
@@ -480,7 +480,7 @@ void F77_ztrmm(int *order, char *rtlf, char *uplow, char *transp, char *diagn,
}
}
LDB = *n+1;
B=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDB*sizeof(CBLAS_TEST_ZOMPLEX));
B=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*(size_t)LDB*sizeof(CBLAS_TEST_ZOMPLEX));
for( i=0; i<*m; i++ )
for( j=0; j<*n; j++ ) {
B[i*LDB+j].real=b[j*(*ldb)+i].real;
@@ -522,7 +522,7 @@ void F77_ztrsm(int *order, char *rtlf, char *uplow, char *transp, char *diagn,
if (*order == TEST_ROW_MJR) {
if (side == CblasLeft) {
LDA = *m+1;
A=(CBLAS_TEST_ZOMPLEX* )malloc( (*m)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
A=(CBLAS_TEST_ZOMPLEX* )malloc( (*m)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
for( i=0; i<*m; i++ )
for( j=0; j<*m; j++ ) {
A[i*LDA+j].real=a[j*(*lda)+i].real;
@@ -531,7 +531,7 @@ void F77_ztrsm(int *order, char *rtlf, char *uplow, char *transp, char *diagn,
}
else{
LDA = *n+1;
A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX));
A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX));
for( i=0; i<*n; i++ )
for( j=0; j<*n; j++ ) {
A[i*LDA+j].real=a[j*(*lda)+i].real;
@@ -539,7 +539,7 @@ void F77_ztrsm(int *order, char *rtlf, char *uplow, char *transp, char *diagn,
}
}
LDB = *n+1;
B=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDB*sizeof(CBLAS_TEST_ZOMPLEX));
B=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*(size_t)LDB*sizeof(CBLAS_TEST_ZOMPLEX));
for( i=0; i<*m; i++ )
for( j=0; j<*n; j++ ) {
B[i*LDB+j].real=b[j*(*ldb)+i].real;

View File

@@ -1,3 +1,4 @@
#include "cblas_test.h"
int CBLAS_CallFromC;
int RowMajorStrg;

View File

@@ -425,7 +425,7 @@ cgemm_rr.$(SUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRR $< -o $(@F)
cgemm_rc.$(SUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRC $< -o $(@F)
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
cgemm_cn.$(SUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $(@F)
@@ -473,7 +473,7 @@ zgemm_rr.$(SUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRR $< -o $(@F)
zgemm_rc.$(SUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRC $< -o $(@F)
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
zgemm_cn.$(SUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $(@F)
@@ -521,7 +521,7 @@ xgemm_rr.$(SUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRR $< -o $(@F)
xgemm_rc.$(SUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRC $< -o $(@F)
$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
xgemm_cn.$(SUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCN $< -o $(@F)
@@ -632,7 +632,7 @@ cgemm_thread_rr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRR $< -o $(@F)
cgemm_thread_rc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRC $< -o $(@F)
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
cgemm_thread_cn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCN $< -o $(@F)
@@ -680,7 +680,7 @@ zgemm_thread_rr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRR $< -o $(@F)
zgemm_thread_rc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRC $< -o $(@F)
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
zgemm_thread_cn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCN $< -o $(@F)
@@ -728,7 +728,7 @@ xgemm_thread_rr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRR $< -o $(@F)
xgemm_thread_rc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRC $< -o $(@F)
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
xgemm_thread_cn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCN $< -o $(@F)
@@ -1895,7 +1895,7 @@ cgemm3m_rr.$(SUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRR $< -o $(@F)
cgemm3m_rc.$(SUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRC $< -o $(@F)
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
cgemm3m_cn.$(SUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $(@F)
@@ -1943,7 +1943,7 @@ zgemm3m_rr.$(SUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRR $< -o $(@F)
zgemm3m_rc.$(SUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRC $< -o $(@F)
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
zgemm3m_cn.$(SUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $(@F)
@@ -1991,7 +1991,7 @@ xgemm3m_rr.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRR $< -o $(@F)
xgemm3m_rc.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRC $< -o $(@F)
$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
xgemm3m_cn.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCN $< -o $(@F)
@@ -2048,7 +2048,7 @@ cgemm3m_thread_rr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRR $< -o $(@F)
cgemm3m_thread_rc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRC $< -o $(@F)
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
cgemm3m_thread_cn.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCN $< -o $(@F)
@@ -2096,7 +2096,7 @@ zgemm3m_thread_rr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRR $< -o $(@F)
zgemm3m_thread_rc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRC $< -o $(@F)
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
zgemm3m_thread_cn.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCN $< -o $(@F)
@@ -2144,7 +2144,7 @@ xgemm3m_thread_rr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRR $< -o $(@F)
xgemm3m_thread_rc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRC $< -o $(@F)
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
xgemm3m_thread_cn.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCN $< -o $(@F)
@@ -2817,7 +2817,7 @@ cgemm_rr.$(PSUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRR $< -o $(@F)
cgemm_rc.$(PSUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRC $< -o $(@F)
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
cgemm_cn.$(PSUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $(@F)
@@ -2865,7 +2865,7 @@ zgemm_rr.$(PSUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRR $< -o $(@F)
zgemm_rc.$(PSUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRC $< -o $(@F)
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
zgemm_cn.$(PSUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $(@F)
@@ -2913,7 +2913,7 @@ xgemm_rr.$(PSUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRR $< -o $(@F)
xgemm_rc.$(PSUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRC $< -o $(@F)
$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
xgemm_cn.$(PSUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCN $< -o $(@F)
@@ -3025,7 +3025,7 @@ cgemm_thread_rr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRR $< -o $(@F)
cgemm_thread_rc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRC $< -o $(@F)
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
cgemm_thread_cn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCN $< -o $(@F)
@@ -3073,7 +3073,7 @@ zgemm_thread_rr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRR $< -o $(@F)
zgemm_thread_rc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRC $< -o $(@F)
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
zgemm_thread_cn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCN $< -o $(@F)
@@ -3121,7 +3121,7 @@ xgemm_thread_rr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRR $< -o $(@F)
xgemm_thread_rc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRC $< -o $(@F)
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
xgemm_thread_cn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCN $< -o $(@F)
@@ -4288,7 +4288,7 @@ cgemm3m_rr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRR $< -o $(@F)
cgemm3m_rc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRC $< -o $(@F)
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
cgemm3m_cn.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $(@F)
@@ -4336,7 +4336,7 @@ zgemm3m_rr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRR $< -o $(@F)
zgemm3m_rc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRC $< -o $(@F)
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
zgemm3m_cn.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $(@F)
@@ -4384,7 +4384,7 @@ xgemm3m_rr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRR $< -o $(@F)
xgemm3m_rc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRC $< -o $(@F)
$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
xgemm3m_cn.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCN $< -o $(@F)
@@ -4441,7 +4441,7 @@ cgemm3m_thread_rr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRR $< -o $(@F)
cgemm3m_thread_rc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRC $< -o $(@F)
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
cgemm3m_thread_cn.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCN $< -o $(@F)
@@ -4489,7 +4489,7 @@ zgemm3m_thread_rr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRR $< -o $(@F)
zgemm3m_thread_rc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRC $< -o $(@F)
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
zgemm3m_thread_cn.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCN $< -o $(@F)
@@ -4537,7 +4537,7 @@ xgemm3m_thread_rr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRR $< -o $(@F)
xgemm3m_thread_rc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRC $< -o $(@F)
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
xgemm3m_thread_cn.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCN $< -o $(@F)

View File

@@ -292,6 +292,7 @@ extern gotoblas_t gotoblas_COOPERLAKE;
#define VENDOR_AMD 2
#define VENDOR_CENTAUR 3
#define VENDOR_HYGON 4
#define VENDOR_ZHAOXIN 5
#define VENDOR_UNKNOWN 99
#define BITMASK(a, b, c) ((((a) >> (b)) & (c)))
@@ -404,6 +405,7 @@ static int get_vendor(void){
if (!strcmp(vendor.vchar, "GenuineIntel")) return VENDOR_INTEL;
if (!strcmp(vendor.vchar, "AuthenticAMD")) return VENDOR_AMD;
if (!strcmp(vendor.vchar, "CentaurHauls")) return VENDOR_CENTAUR;
if (!strcmp(vendor.vchar, " Shanghai ")) return VENDOR_ZHAOXIN;
if (!strcmp(vendor.vchar, "HygonGenuine")) return VENDOR_HYGON;
if ((eax == 0) || ((eax & 0x500) != 0)) return VENDOR_INTEL;
@@ -414,7 +416,7 @@ static int get_vendor(void){
static gotoblas_t *get_coretype(void){
int eax, ebx, ecx, edx;
int family, exfamily, model, vendor, exmodel;
int family, exfamily, model, vendor, exmodel, stepping;
cpuid(1, &eax, &ebx, &ecx, &edx);
@@ -422,6 +424,7 @@ static gotoblas_t *get_coretype(void){
exfamily = BITMASK(eax, 20, 0xff);
model = BITMASK(eax, 4, 0x0f);
exmodel = BITMASK(eax, 16, 0x0f);
stepping = BITMASK(eax, 0, 0x0f);
vendor = get_vendor();
@@ -621,6 +624,22 @@ static gotoblas_t *get_coretype(void){
return &gotoblas_NEHALEM;
}
}
if (model == 10) {
// Ice Lake SP
if(support_avx512_bf16())
return &gotoblas_COOPERLAKE;
if (support_avx512())
return &gotoblas_SKYLAKEX;
if(support_avx2())
return &gotoblas_HASWELL;
if(support_avx()) {
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
return &gotoblas_SANDYBRIDGE;
} else {
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
return &gotoblas_NEHALEM;
}
}
return NULL;
case 7:
if (model == 10) // Goldmont Plus
@@ -807,10 +826,19 @@ static gotoblas_t *get_coretype(void){
if (vendor == VENDOR_CENTAUR) {
switch (family) {
case 0x6:
return &gotoblas_NANO;
if (model == 0xf && stepping < 0xe)
return &gotoblas_NANO;
return &gotoblas_NEHALEM;
default:
if (family >= 0x7)
return &gotoblas_NEHALEM;
}
}
if (vendor == VENDOR_ZHAOXIN) {
return &gotoblas_NEHALEM;
}
return NULL;
}

View File

@@ -99,6 +99,11 @@ extern gotoblas_t gotoblas_NEOVERSEN1;
#else
#define gotoblas_NEOVERSEN1 gotoblas_ARMV8
#endif
#ifdef DYN_CORTEX_A55
extern gotoblas_t gotoblas_CORTEXA55;
#else
#define gotoblas_CORTEXA55 gotoblas_ARMV8
#endif
#else
extern gotoblas_t gotoblas_CORTEXA53;
extern gotoblas_t gotoblas_CORTEXA57;
@@ -111,11 +116,12 @@ extern gotoblas_t gotoblas_TSV110;
extern gotoblas_t gotoblas_EMAG8180;
extern gotoblas_t gotoblas_NEOVERSEN1;
extern gotoblas_t gotoblas_THUNDERX3T110;
extern gotoblas_t gotoblas_CORTEXA55;
#endif
extern void openblas_warning(int verbose, const char * msg);
#define NUM_CORETYPES 12
#define NUM_CORETYPES 13
/*
* In case asm/hwcap.h is outdated on the build system, make sure
@@ -126,7 +132,7 @@ extern void openblas_warning(int verbose, const char * msg);
#endif
#define get_cpu_ftr(id, var) ({ \
__asm__ ("mrs %0, "#id : "=r" (var)); \
__asm__ __volatile__ ("mrs %0, "#id : "=r" (var)); \
})
static char *corename[] = {
@@ -142,6 +148,7 @@ static char *corename[] = {
"emag8180",
"neoversen1",
"thunderx3t110",
"cortexa55",
"unknown"
};
@@ -158,6 +165,7 @@ char *gotoblas_corename(void) {
if (gotoblas == &gotoblas_EMAG8180) return corename[ 9];
if (gotoblas == &gotoblas_NEOVERSEN1) return corename[10];
if (gotoblas == &gotoblas_THUNDERX3T110) return corename[11];
if (gotoblas == &gotoblas_CORTEXA55) return corename[12];
return corename[NUM_CORETYPES];
}
@@ -189,6 +197,7 @@ static gotoblas_t *force_coretype(char *coretype) {
case 9: return (&gotoblas_EMAG8180);
case 10: return (&gotoblas_NEOVERSEN1);
case 11: return (&gotoblas_THUNDERX3T110);
case 12: return (&gotoblas_CORTEXA55);
}
snprintf(message, 128, "Core not found: %s\n", coretype);
openblas_warning(1, message);
@@ -247,6 +256,8 @@ static gotoblas_t *get_coretype(void) {
return &gotoblas_CORTEXA73;
case 0xd0c: // Neoverse N1
return &gotoblas_NEOVERSEN1;
case 0xd05: // Cortex A55
return &gotoblas_CORTEXA55;
}
break;
case 0x42: // Broadcom

View File

@@ -428,7 +428,7 @@ extern int openblas_goto_num_threads_env();
extern int openblas_omp_num_threads_env();
int blas_get_cpu_number(void){
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU)
int max_num;
#endif
int blas_goto_num = 0;
@@ -436,7 +436,7 @@ int blas_get_cpu_number(void){
if (blas_num_threads) return blas_num_threads;
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU)
max_num = get_num_procs();
#endif
@@ -460,7 +460,7 @@ int blas_get_cpu_number(void){
else if (blas_omp_num > 0) blas_num_threads = blas_omp_num;
else blas_num_threads = MAX_CPU_NUMBER;
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU)
if (blas_num_threads > max_num) blas_num_threads = max_num;
#endif
@@ -1291,7 +1291,12 @@ UNLOCK_COMMAND(&alloc_lock);
return (void *)(((char *)alloc_info) + sizeof(struct alloc_t));
error:
printf("OpenBLAS : Program will terminate because you tried to allocate too many memory regions.\n");
printf("OpenBLAS : Program will terminate because you tried to allocate too many TLS memory regions.\n");
printf("This library was built to support a maximum of %d threads - either rebuild OpenBLAS\n", NUM_BUFFERS);
printf("with a larger NUM_THREADS value or set the environment variable OPENBLAS_NUM_THREADS to\n");
printf("a sufficiently small number. This error typically occurs when the software that relies on\n");
printf("OpenBLAS calls BLAS functions from many threads in parallel, or when your computer has more\n");
printf("cpu cores than what OpenBLAS was configured to handle.\n");
return NULL;
}
@@ -1702,7 +1707,6 @@ inline int atoi(const char *str) { return 0; }
#include <sys/sysinfo.h>
#include <sched.h>
#include <errno.h>
#include <linux/unistd.h>
#include <sys/syscall.h>
#include <sys/time.h>
#include <sys/resource.h>
@@ -1980,7 +1984,7 @@ extern int openblas_goto_num_threads_env();
extern int openblas_omp_num_threads_env();
int blas_get_cpu_number(void){
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU)
int max_num;
#endif
int blas_goto_num = 0;
@@ -1988,7 +1992,7 @@ int blas_get_cpu_number(void){
if (blas_num_threads) return blas_num_threads;
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU)
max_num = get_num_procs();
#endif
@@ -2012,7 +2016,7 @@ int blas_get_cpu_number(void){
else if (blas_omp_num > 0) blas_num_threads = blas_omp_num;
else blas_num_threads = MAX_CPU_NUMBER;
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU)
if (blas_num_threads > max_num) blas_num_threads = max_num;
#endif
@@ -2879,8 +2883,12 @@ void *blas_memory_alloc(int procpos){
return (void *)memory[position].addr;
error:
printf("BLAS : Program is Terminated. Because you tried to allocate too many memory regions.\n");
printf("OpenBLAS : Program is Terminated. Because you tried to allocate too many memory regions.\n");
printf("This library was built to support a maximum of %d threads - either rebuild OpenBLAS\n", NUM_BUFFERS);
printf("with a larger NUM_THREADS value or set the environment variable OPENBLAS_NUM_THREADS to\n");
printf("a sufficiently small number. This error typically occurs when the software that relies on\n");
printf("OpenBLAS calls BLAS functions from many threads in parallel, or when your computer has more\n");
printf("cpu cores than what OpenBLAS was configured to handle.\n");
return NULL;
}

View File

@@ -139,9 +139,13 @@ endif
ifneq (,$(filter 1 2,$(NOFORTRAN)))
#only build without Fortran
$(CC) $(CFLAGS) $(LDFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(INTERNALNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
else
ifeq ($(F_COMPILER), INTEL)
$(FC) $(FFLAGS) $(LDFLAGS) -all-load -headerpad_max_install_names -install_name "$(CURDIR)/../$(INTERNALNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def
else
$(FC) $(FFLAGS) $(LDFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(INTERNALNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
endif
endif
dllinit.$(SUFFIX) : dllinit.c
$(CC) $(CFLAGS) -c -o $(@F) -s $<

18
f_check
View File

@@ -314,11 +314,11 @@ if ($link ne "") {
$link =~ s/\-Y\sP\,/\-Y/g;
$link =~ s/\-R\s*/\-rpath\@/g;
$link =~ s/\-R\s*/\-rpath\%/g;
$link =~ s/\-rpath\s+/\-rpath\@/g;
$link =~ s/\-rpath\s+/\-rpath\%/g;
$link =~ s/\-rpath-link\s+/\-rpath-link\@/g;
$link =~ s/\-rpath-link\s+/\-rpath-link\%/g;
@flags = split(/[\s\,\n]/, $link);
# remove leading and trailing quotes from each flag.
@@ -344,13 +344,13 @@ if ($link ne "") {
}
if ($flags =~ /^\-rpath\@/) {
$flags =~ s/\@/\,/g;
if ($flags =~ /^\-rpath\%/) {
$flags =~ s/\%/\,/g;
$linker_L .= "-Wl,". $flags . " " ;
}
if ($flags =~ /^\-rpath-link\@/) {
$flags =~ s/\@/\,/g;
if ($flags =~ /^\-rpath-link\%/) {
$flags =~ s/\%/\,/g;
$linker_L .= "-Wl,". $flags . " " ;
}
if ($flags =~ /-lgomp/ && $ENV{"CC"} =~ /clang/) {
@@ -391,10 +391,6 @@ if ($link ne "") {
}
if ($vendor eq "INTEL"){
$linker_a .= "-lgfortran"
}
if ($vendor eq "FLANG"){
$linker_a .= "-lflang"
}

View File

@@ -142,6 +142,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/* #define FORCE_SICORTEX */
/* #define FORCE_LOONGSON3R3 */
/* #define FORCE_LOONGSON3R4 */
/* #define FORCE_LOONGSON3R5 */
/* #define FORCE_I6400 */
/* #define FORCE_P6600 */
/* #define FORCE_P5600 */
@@ -842,6 +843,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#else
#endif
/*
 * Forced-target description for the LOONGSON3R5 core, active only when
 * FORCE_LOONGSON3R5 is defined.  Defining FORCE makes main() print CORENAME
 * instead of calling get_corename().
 *
 * ARCHCONFIG is a string of -D options describing the target:
 *   L1 data cache : 65536 bytes, 64-byte lines
 *   L2 cache      : 1048576 bytes, 64-byte lines, 16-way associative
 *   DTB           : 64 default entries, size 4096
 *
 * The empty #else branch follows the layout of the neighbouring FORCE_* blocks.
 */
#ifdef FORCE_LOONGSON3R5
#define FORCE
#define ARCHITECTURE "LOONGARCH"
#define SUBARCHITECTURE "LOONGSON3R5"
#define SUBDIRNAME "loongarch64"
#define ARCHCONFIG "-DLOONGSON3R5 " \
"-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \
"-DL2_SIZE=1048576 -DL2_LINESIZE=64 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=16 "
#define LIBNAME "loongson3r5"
#define CORENAME "LOONGSON3R5"
#else
#endif
#ifdef FORCE_I6400
#define FORCE
#define ARCHITECTURE "MIPS"
@@ -1159,6 +1174,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#else
#endif
/*
 * Forced-target description for the CORTEXA55 core (ARM64), active only when
 * FORCE_CORTEXA55 is defined.  Defining FORCE makes main() print CORENAME
 * instead of calling get_corename().
 *
 * ARCHCONFIG is a string of -D options describing the target:
 *   L1 code cache : 16384 bytes, 64-byte lines, 3-way associative
 *   L1 data cache : 16384 bytes, 64-byte lines, 2-way associative
 *   L2 cache      : 65536 bytes, 64-byte lines, 16-way associative
 *   DTB           : 64 default entries, size 4096
 *   feature flags : HAVE_VFPV4, HAVE_VFPV3, HAVE_VFP, HAVE_NEON, ARMV8
 *
 * NOTE(review): the cache sizes are taken as given here; confirm them against
 * the core's documentation if they are tuned later.
 */
#ifdef FORCE_CORTEXA55
#define FORCE
#define ARCHITECTURE "ARM64"
#define SUBARCHITECTURE "CORTEXA55"
#define SUBDIRNAME "arm64"
#define ARCHCONFIG "-DCORTEXA55 " \
"-DL1_CODE_SIZE=16384 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=3 " \
"-DL1_DATA_SIZE=16384 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=2 " \
"-DL2_SIZE=65536 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
#define LIBNAME "cortexa55"
#define CORENAME "CORTEXA55"
#else
#endif
#ifdef FORCE_FALKOR
#define FORCE
@@ -1373,6 +1403,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define OPENBLAS_SUPPORTED
#endif
/*
 * LoongArch64 build: textually include the CPU identification source and mark
 * the platform as supported.
 * NOTE(review): presumably cpuid_loongarch64.c supplies get_corename() and the
 * related detection helpers used by main() -- confirm against that file.
 */
#ifdef __loongarch64
#include "cpuid_loongarch64.c"
#define OPENBLAS_SUPPORTED
#endif
#ifdef __riscv
#include "cpuid_riscv64.c"
#define OPENBLAS_SUPPORTED
@@ -1448,7 +1483,7 @@ int main(int argc, char *argv[]){
#ifdef FORCE
printf("CORE=%s\n", CORENAME);
#else
#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc)
#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__)
printf("CORE=%s\n", get_corename());
#endif
#endif
@@ -1596,7 +1631,7 @@ printf("ELF_VERSION=2\n");
#ifdef FORCE
printf("#define CHAR_CORENAME \"%s\"\n", CORENAME);
#else
#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc)
#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__)
printf("#define CHAR_CORENAME \"%s\"\n", get_corename());
#endif
#endif

View File

@@ -49,6 +49,8 @@
#define ERROR_NAME "QGEMM "
#elif defined(DOUBLE)
#define ERROR_NAME "DGEMM "
#elif defined(BFLOAT16)
#define ERROR_NAME "SBGEMM "
#else
#define ERROR_NAME "SGEMM "
#endif
@@ -124,6 +126,7 @@ void NAME(char *TRANSA, char *TRANSB,
#ifdef SMP
double MNK;
#if defined(USE_SIMPLE_THREADED_LEVEL3) || !defined(NO_AFFINITY)
#ifndef COMPLEX
#ifdef XDOUBLE
int mode = BLAS_XDOUBLE | BLAS_REAL;
@@ -142,6 +145,7 @@ void NAME(char *TRANSA, char *TRANSB,
#endif
#endif
#endif
#endif
#if defined(SMP) && !defined(NO_AFFINITY) && !defined(USE_SIMPLE_THREADED_LEVEL3)
int nodes;

View File

@@ -201,7 +201,14 @@ void CNAME(enum CBLAS_ORDER order,
if (beta != ONE) SCAL_K(leny, 0, 0, beta, y, blasabs(incy), NULL, 0, NULL, 0);
if (alpha == ZERO) return;
#if 0
/* this optimization causes stack corruption on x86_64 under OSX, Windows and FreeBSD */
if (trans == 0 && incx == 1 && incy == 1 && m*n < 2304 *GEMM_MULTITHREAD_THRESHOLD) {
GEMV_N(m, n, 0, alpha, a, lda, x, incx, y, incy, NULL);
return;
}
#endif
IDEBUG_START;
FUNCTION_PROFILE_START();

View File

@@ -164,6 +164,11 @@ void CNAME(enum CBLAS_ORDER order,
if (m == 0 || n == 0) return;
if (alpha == 0.) return;
if (incx == 1 && incy == 1 && 1L*m*n <= 2048 *GEMM_MULTITHREAD_THRESHOLD) {
GER(m, n, 0, alpha, x, incx, y, incy, a, lda, NULL);
return;
}
IDEBUG_START;
FUNCTION_PROFILE_START();

View File

@@ -150,9 +150,9 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows,
#endif
if ( *lda > *ldb )
msize = (*lda) * (*ldb) * sizeof(FLOAT);
msize = (size_t)(*lda) * (*ldb) * sizeof(FLOAT);
else
msize = (*ldb) * (*ldb) * sizeof(FLOAT);
msize = (size_t)(*ldb) * (*ldb) * sizeof(FLOAT);
b = malloc(msize);
if ( b == NULL )

View File

@@ -95,7 +95,14 @@ int NAME(blasint *M, blasint *N, FLOAT *a, blasint *ldA, blasint *ipiv, blasint
#ifdef SMP
args.common = NULL;
args.nthreads = num_cpu_avail(4);
#ifndef DOUBLE
if (args.m*args.n < 40000)
#else
if (args.m*args.n < 10000)
#endif
args.nthreads=1;
else
args.nthreads = num_cpu_avail(4);
if (args.nthreads == 1) {
#endif

View File

@@ -112,6 +112,13 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){
#ifdef SMP
args.common = NULL;
#ifndef DOUBLE
if (args.n <128)
#else
if (args.n <64)
#endif
args.nthreads = 1;
else
args.nthreads = num_cpu_avail(4);
if (args.nthreads == 1) {

View File

@@ -121,6 +121,9 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){
#ifdef SMP
args.common = NULL;
if (args.n < 180)
args.nthreads = 1;
else
args.nthreads = num_cpu_avail(4);
if (args.nthreads == 1) {

View File

@@ -95,7 +95,10 @@ int NAME(blasint *M, blasint *N, FLOAT *a, blasint *ldA, blasint *ipiv, blasint
#ifdef SMP
args.common = NULL;
args.nthreads = num_cpu_avail(4);
if (args.m*args.n <10000)
args.nthreads = 1;
else
args.nthreads = num_cpu_avail(4);
if (args.nthreads == 1) {
#endif

View File

@@ -112,6 +112,13 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){
#ifdef SMP
args.common = NULL;
#ifndef DOUBLE
if (args.n < 64)
#else
if (args.n < 64)
#endif
args.nthreads = 1;
else
args.nthreads = num_cpu_avail(4);
if (args.nthreads == 1) {

View File

@@ -121,6 +121,15 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){
#ifdef SMP
args.nthreads = num_cpu_avail(4);
#ifndef DOUBLE
if (args.n < 200)
#else
if (args.n < 150)
#endif
args.nthreads=1;
else
#endif
args.nthreads = num_cpu_avail(4);
if (args.nthreads == 1) {
#endif

View File

@@ -167,6 +167,26 @@ void CNAME(enum CBLAS_ORDER order,
FUNCTION_PROFILE_START();
if (incx == 1 && n <100) {
blasint i;
if (uplo==0) {
for (i = 0; i < n; i++){
if (x[i] != ZERO) {
AXPYU_K(i + 1, 0, 0, alpha * x[i], x, 1, a, 1, NULL, 0);
}
a += i + 1;
}
} else {
for (i = 0; i < n; i++){
if (x[i] != ZERO) {
AXPYU_K(n - i, 0, 0, alpha * x[i], x + i, 1, a, 1, NULL, 0);
}
a += n - i;
}
}
return;
}
if (incx < 0 ) x -= (n - 1) * incx;
buffer = (FLOAT *)blas_memory_alloc(1);

View File

@@ -168,6 +168,24 @@ void CNAME(enum CBLAS_ORDER order,
if (alpha == ZERO) return;
if (incx == 1 && incy == 1 && n < 50) {
blasint i;
if (!uplo) {
for (i = 0; i < n; i++){
AXPYU_K(i + 1, 0, 0, alpha * x[i], y, 1, a, 1, NULL, 0);
AXPYU_K(i + 1, 0, 0, alpha * y[i], x, 1, a, 1, NULL, 0);
a += i + 1;
}
} else {
for (i = 0; i < n; i++){
AXPYU_K(n - i, 0, 0, alpha * x[i], y + i, 1, a, 1, NULL, 0);
AXPYU_K(n - i, 0, 0, alpha * y[i], x + i, 1, a, 1, NULL, 0);
a += n - i;
}
}
return;
}
IDEBUG_START;
FUNCTION_PROFILE_START();

View File

@@ -168,7 +168,28 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha,
IDEBUG_START;
FUNCTION_PROFILE_START();
#if 1
if (incx == 1 && n < 100) {
BLASLONG i;
if (uplo == 0) {
for (i = 0; i < n; i++){
if (x[i] != ZERO) {
AXPYU_K(i + 1, 0, 0, alpha * x[i], x, 1, a, 1, NULL, 0);
}
a += lda;
}
} else {
for (i = 0; i < n; i++){
if (x[i] != ZERO) {
AXPYU_K(n - i, 0, 0, alpha * x[i], x + i, 1, a, 1, NULL, 0);
}
a += 1 + lda;
}
}
return;
}
#endif
if (incx < 0 ) x -= (n - 1) * incx;
buffer = (FLOAT *)blas_memory_alloc(1);

View File

@@ -170,6 +170,25 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha,
IDEBUG_START;
if (incx == 1 && incy == 1 && n < 100) {
blasint i;
if (!uplo) {
for (i = 0; i < n; i++){
AXPYU_K(i + 1, 0, 0, alpha * x[i], y, 1, a, 1, NULL, 0);
AXPYU_K(i + 1, 0, 0, alpha * y[i], x, 1, a, 1, NULL, 0);
a += lda;
}
} else {
for (i = 0; i < n; i++){
AXPYU_K(n - i, 0, 0, alpha * x[i], y + i, 1, a, 1, NULL, 0);
AXPYU_K(n - i, 0, 0, alpha * y[i], x + i, 1, a, 1, NULL, 0);
a += 1 + lda;
}
}
return;
}
FUNCTION_PROFILE_START();
if (incx < 0 ) x -= (n - 1) * incx;

View File

@@ -354,6 +354,17 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Tr
#endif
args.common = NULL;
#ifndef COMPLEX
#ifdef DOUBLE
if (args.n < 100)
#else
if (args.n < 200)
#endif
#else
if (args.n < 65)
#endif
args.nthreads = 1;
else
args.nthreads = num_cpu_avail(3);
if (args.nthreads == 1) {

View File

@@ -188,6 +188,12 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
if (n == 0) return;
if (incx == 1 && trans == 0 && n < 50) {
buffer = NULL;
(trsv[(trans<<2) | (uplo<<1) | unit])(n, a, lda, x, incx, buffer);
return;
}
IDEBUG_START;
FUNCTION_PROFILE_START();

View File

@@ -172,9 +172,9 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows,
#endif
if ( *lda > *ldb )
msize = (*lda) * (*ldb) * sizeof(FLOAT) * 2;
msize = (size_t)(*lda) * (*ldb) * sizeof(FLOAT) * 2;
else
msize = (*ldb) * (*ldb) * sizeof(FLOAT) * 2;
msize = (size_t)(*ldb) * (*ldb) * sizeof(FLOAT) * 2;
b = malloc(msize);
if ( b == NULL )

View File

@@ -79,8 +79,12 @@ void NAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){
aa_i = fabs(da_r);
}
scale = (aa_i / aa_r);
ada = aa_r * sqrt(ONE + scale * scale);
if (aa_r == ZERO) {
ada = 0.;
} else {
scale = (aa_i / aa_r);
ada = aa_r * sqrt(ONE + scale * scale);
}
bb_r = fabs(db_r);
bb_i = fabs(db_i);
@@ -90,9 +94,12 @@ void NAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){
bb_i = fabs(bb_r);
}
scale = (bb_i / bb_r);
adb = bb_r * sqrt(ONE + scale * scale);
if (bb_r == ZERO) {
adb = 0.;
} else {
scale = (bb_i / bb_r);
adb = bb_r * sqrt(ONE + scale * scale);
}
scale = ada + adb;
aa_r = da_r / scale;

View File

@@ -172,6 +172,32 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, int n, FLOAT alpha, FLO
if ((alpha_r == ZERO) && (alpha_i == ZERO)) return;
if (incx == 1 && n < 50) {
blasint i;
if (!uplo) {
for (i = 0; i < n; i++){
if ((x[i * 2 + 0] != ZERO) || (x[i * 2 + 1] != ZERO)) {
AXPYU_K(i + 1, 0, 0,
alpha_r * x[i * 2 + 0] - alpha_i * x[i * 2 + 1],
alpha_i * x[i * 2 + 0] + alpha_r * x[i * 2 + 1],
x, 1, a, 1, NULL, 0);
}
a += lda;
}
} else {
for (i = 0; i < n; i++){
if ((x[i * 2 + 0] != ZERO) || (x[i * 2 + 1] != ZERO)) {
AXPYU_K(n - i, 0, 0,
alpha_r * x[i * 2 + 0] - alpha_i * x[i * 2 + 1],
alpha_i * x[i * 2 + 0] + alpha_r * x[i * 2 + 1],
x + i * 2, 1, a, 1, NULL, 0);
}
a += 2 + lda;
}
}
return;
}
IDEBUG_START;
FUNCTION_PROFILE_START();

View File

@@ -199,6 +199,12 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
if (n == 0) return;
if (incx == 1 && trans == 0 && n < 50) {
buffer = NULL;
(trsv[(trans<<2) | (uplo<<1) | unit])(n, a, lda, x, incx, buffer);
return;
}
IDEBUG_START;
FUNCTION_PROFILE_START();

View File

@@ -1,3 +1,10 @@
# FMAFLAG: optional extra compiler flag for kernels that are built with FMA.
# It starts out empty and becomes -mfma only when OLDGCC is not set and
# HAVE_FMA3 is set.  Rules that want it add $(FMAFLAG) next to $(CFLAGS)
# (the dgemv_t rule in this file does so).
FMAFLAG=
ifndef OLDGCC
ifdef HAVE_FMA3
FMAFLAG = -mfma
endif
endif
### GEMV ###
ifndef SGEMVNKERNEL
@@ -263,7 +270,7 @@ $(KDIR)dgemv_n$(TSUFFIX).$(SUFFIX) $(KDIR)dgemv_n$(TSUFFIX).$(PSUFFIX) : $(KER
$(CC) -c $(CFLAGS) -DDOUBLE -UCOMPLEX -UTRANS $< -o $@
$(KDIR)dgemv_t$(TSUFFIX).$(SUFFIX) $(KDIR)dgemv_t$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMVTKERNEL) $(TOPDIR)/common.h $(GEMVDEP)
$(CC) -c $(CFLAGS) -DDOUBLE -UCOMPLEX -DTRANS $< -o $@
$(CC) -c $(CFLAGS) $(FMAFLAG) -DDOUBLE -UCOMPLEX -DTRANS $< -o $@
endif
$(KDIR)qgemv_n$(TSUFFIX).$(SUFFIX) $(KDIR)qgemv_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QGEMVNKERNEL)

View File

@@ -818,6 +818,8 @@ ifeq ($(OS), AIX)
m4 zgemm_kernel_n.s > zgemm_kernel_n_nomacros.s
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN zgemm_kernel_n_nomacros.s -o $@
rm zgemm_kernel_n.s zgemm_kernel_n_nomacros.s
else ifeq ($(CORE),SANDYBRIDGE)
$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DDOUBLE -DCOMPLEX -DNN $< -o $@
else
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN $< -o $@
endif
@@ -828,6 +830,8 @@ ifeq ($(OS), AIX)
m4 zgemm_kernel_l.s > zgemm_kernel_l_nomacros.s
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN zgemm_kernel_l_nomacros.s -o $@
rm zgemm_kernel_l.s zgemm_kernel_l_nomacros.s
else ifeq ($(CORE),SANDYBRIDGE)
$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DDOUBLE -DCOMPLEX -DCN $< -o $@
else
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $@
endif
@@ -838,6 +842,8 @@ ifeq ($(OS), AIX)
m4 zgemm_kernel_r.s > zgemm_kernel_r_nomacros.s
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC zgemm_kernel_r_nomacros.s -o $@
rm zgemm_kernel_r.s zgemm_kernel_r_nomacros.s
else ifeq ($(CORE),SANDYBRIDGE)
$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DDOUBLE -DCOMPLEX -DNC $< -o $@
else
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC $< -o $@
endif
@@ -848,6 +854,8 @@ ifeq ($(OS), AIX)
m4 zgemm_kernel_b.s > zgemm_kernel_b_nomacros.s
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC zgemm_kernel_b_nomacros.s -o $@
rm zgemm_kernel_b.s zgemm_kernel_b_nomacros.s
else ifeq ($(CORE),SANDYBRIDGE)
$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DDOUBLE -DCOMPLEX -DCC $< -o $@
else
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $@
endif
@@ -1044,6 +1052,8 @@ ifeq ($(OS), AIX)
m4 ztrmm_kernel_ln.s > ztrmm_kernel_ln_nomacros.s
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN ztrmm_kernel_ln_nomacros.s -o $@
rm ztrmm_kernel_ln.s ztrmm_kernel_ln_nomacros.s
else ifeq ($(CORE), SANDYBRIDGE)
$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@
else
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@
endif
@@ -1054,6 +1064,8 @@ ifeq ($(OS), AIX)
m4 ztrmm_kernel_lt.s > ztrmm_kernel_lt_nomacros.s
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN ztrmm_kernel_lt_nomacros.s -o $@
rm ztrmm_kernel_lt.s ztrmm_kernel_lt_nomacros.s
else ifeq ($(CORE), SANDYBRIDGE)
$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@
else
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@
endif
@@ -1064,6 +1076,8 @@ ifeq ($(OS), AIX)
m4 ztrmm_kernel_lr.s > ztrmm_kernel_lr_nomacros.s
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN ztrmm_kernel_lr_nomacros.s -o $@
rm ztrmm_kernel_lr.s ztrmm_kernel_lr_nomacros.s
else ifeq ($(CORE), SANDYBRIDGE)
$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@
else
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@
endif
@@ -1074,6 +1088,8 @@ ifeq ($(OS), AIX)
m4 ztrmm_kernel_lc.s >ztrmm_kernel_lc_nomacros.s
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN ztrmm_kernel_lc_nomacros.s -o $@
rm ztrmm_kernel_lc.s ztrmm_kernel_lc_nomacros.s
else ifeq ($(CORE), SANDYBRIDGE)
$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@
else
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@
endif
@@ -1084,6 +1100,8 @@ ifeq ($(OS), AIX)
m4 ztrmm_kernel_rn.s > ztrmm_kernel_rn_nomacros.s
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN ztrmm_kernel_rn_nomacros.s -o $@
rm ztrmm_kernel_rn.s ztrmm_kernel_rn_nomacros.s
else ifeq ($(CORE), SANDYBRIDGE)
$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@
else
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@
endif
@@ -1094,6 +1112,8 @@ ifeq ($(OS), AIX)
m4 ztrmm_kernel_rt.s > ztrmm_kernel_rt_nomacros.s
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN ztrmm_kernel_rt_nomacros.s -o $@
rm ztrmm_kernel_rt.s ztrmm_kernel_rt_nomacros.s
else ifeq ($(CORE), SANDYBRIDGE)
$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@
else
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@
endif
@@ -1104,6 +1124,8 @@ ifeq ($(OS), AIX)
m4 ztrmm_kernel_rr.s > ztrmm_kernel_rr_nomacros.s
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC ztrmm_kernel_rr_nomacros.s -o $@
rm ztrmm_kernel_rr.s ztrmm_kernel_rr_nomacros.s
else ifeq ($(CORE), SANDYBRIDGE)
$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@
else
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@
endif
@@ -1114,6 +1136,8 @@ ifeq ($(OS), AIX)
m4 ztrmm_kernel_rc.s > ztrmm_kernel_rc_nomacros.s
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC ztrmm_kernel_rc_nomacros.s -o $@
rm ztrmm_kernel_rc.s ztrmm_kernel_rc_nomacros.s
else ifeq ($(CORE), SANDYBRIDGE)
$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
else
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
endif
@@ -1187,29 +1211,55 @@ $(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
# Double-complex TRMM kernels, all compiled from $(ZGEMMKERNEL) with -DTRMMKERNEL.
# The eight targets differ only in the LEFT / TRANSA / CONJ defines and in the
# NN / CN / NC variant define.
# When CORE is SANDYBRIDGE the same command is used with -mavx removed from
# CFLAGS via $(filter-out ...); every other core uses CFLAGS unchanged.
# NOTE(review): the reason for dropping -mavx on SANDYBRIDGE is not visible
# here -- confirm against the commit or issue that introduced it.
$(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL)
ifeq ($(CORE),SANDYBRIDGE)
$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@
else
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@
endif
$(KDIR)ztrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL)
ifeq ($(CORE),SANDYBRIDGE)
$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@
else
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@
endif
$(KDIR)ztrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL)
ifeq ($(CORE),SANDYBRIDGE)
$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@
else
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@
endif
$(KDIR)ztrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL)
ifeq ($(CORE),SANDYBRIDGE)
$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@
else
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@
endif
$(KDIR)ztrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL)
ifeq ($(CORE),SANDYBRIDGE)
$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@
else
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@
endif
$(KDIR)ztrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL)
ifeq ($(CORE),SANDYBRIDGE)
$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@
else
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@
endif
$(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL)
ifeq ($(CORE),SANDYBRIDGE)
$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@
else
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@
endif
$(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL)
ifeq ($(CORE),SANDYBRIDGE)
$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
else
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
endif
endif

View File

@@ -0,0 +1,196 @@
# Kernel source selection for an ARM64 core.
# Each variable names the source file for one routine and precision
# (S = single, D = double, C = single complex, Z = double complex).
# Paths starting with ../ point into sibling kernel directories; bare file
# names are resolved against $(KERNELDIR) by the kernel Makefiles.

# AMIN / MAX / MIN and their index-returning variants: C sources from ../arm.
SAMINKERNEL = ../arm/amin.c
DAMINKERNEL = ../arm/amin.c
CAMINKERNEL = ../arm/zamin.c
ZAMINKERNEL = ../arm/zamin.c
SMAXKERNEL = ../arm/max.c
DMAXKERNEL = ../arm/max.c
SMINKERNEL = ../arm/min.c
DMINKERNEL = ../arm/min.c
ISAMINKERNEL = ../arm/iamin.c
IDAMINKERNEL = ../arm/iamin.c
ICAMINKERNEL = ../arm/izamin.c
IZAMINKERNEL = ../arm/izamin.c
ISMAXKERNEL = ../arm/imax.c
IDMAXKERNEL = ../arm/imax.c
ISMINKERNEL = ../arm/imin.c
IDMINKERNEL = ../arm/imin.c
# TRSM kernels: the generic C implementations, shared by all four precisions.
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
# Level-1 and GEMV routines below use assembly (.S) sources from this
# kernel directory.  Real precisions share one file per routine; complex
# precisions share the z-prefixed file, except CASUM which has its own casum.S
# and COPY/SWAP where all four precisions use the same file.
SAMAXKERNEL = amax.S
DAMAXKERNEL = amax.S
CAMAXKERNEL = zamax.S
ZAMAXKERNEL = zamax.S
SAXPYKERNEL = axpy.S
DAXPYKERNEL = axpy.S
CAXPYKERNEL = zaxpy.S
ZAXPYKERNEL = zaxpy.S
SROTKERNEL = rot.S
DROTKERNEL = rot.S
CROTKERNEL = zrot.S
ZROTKERNEL = zrot.S
SSCALKERNEL = scal.S
DSCALKERNEL = scal.S
CSCALKERNEL = zscal.S
ZSCALKERNEL = zscal.S
SGEMVNKERNEL = gemv_n.S
DGEMVNKERNEL = gemv_n.S
CGEMVNKERNEL = zgemv_n.S
ZGEMVNKERNEL = zgemv_n.S
SGEMVTKERNEL = gemv_t.S
DGEMVTKERNEL = gemv_t.S
CGEMVTKERNEL = zgemv_t.S
ZGEMVTKERNEL = zgemv_t.S
SASUMKERNEL = asum.S
DASUMKERNEL = asum.S
CASUMKERNEL = casum.S
ZASUMKERNEL = zasum.S
SCOPYKERNEL = copy.S
DCOPYKERNEL = copy.S
CCOPYKERNEL = copy.S
ZCOPYKERNEL = copy.S
SSWAPKERNEL = swap.S
DSWAPKERNEL = swap.S
CSWAPKERNEL = swap.S
ZSWAPKERNEL = swap.S
ISAMAXKERNEL = iamax.S
IDAMAXKERNEL = iamax.S
ICAMAXKERNEL = izamax.S
IZAMAXKERNEL = izamax.S
SNRM2KERNEL = nrm2.S
DNRM2KERNEL = nrm2.S
CNRM2KERNEL = znrm2.S
ZNRM2KERNEL = znrm2.S
# DOT kernels; the choice depends on whether the C compiler is PGI.
#   SDOT      : generic C for non-PGI compilers, dot.S with PGI.
#   CDOT/ZDOT : zdot.S for non-PGI compilers, ../arm/zdot.c with PGI.
#   DDOT/DSDOT: always dot.S.
# NOTE(review): the reason for the PGI-specific choices is not visible here.
ifneq ($(C_COMPILER), PGI)
SDOTKERNEL = ../generic/dot.c
else
SDOTKERNEL = dot.S
endif
DDOTKERNEL = dot.S
ifneq ($(C_COMPILER), PGI)
CDOTKERNEL = zdot.S
ZDOTKERNEL = zdot.S
else
CDOTKERNEL = ../arm/zdot.c
ZDOTKERNEL = ../arm/zdot.c
endif
DSDOTKERNEL = dot.S
# GEMM beta-scaling kernels for the real precisions (assembly).
DGEMM_BETA = dgemm_beta.S
SGEMM_BETA = sgemm_beta.S
# SGEMM / STRMM compute kernels, named after the unroll factors
# SGEMM_UNROLL_M x SGEMM_UNROLL_N.  The 8x8 case uses the files with the
# _cortexa53 suffix; any other combination uses the unsuffixed file name.
ifeq ($(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N), 8x8)
SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_cortexa53.S
STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_cortexa53.S
else
SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
endif
# Inner (M-side) copy routines are only defined when the M and N unroll
# factors differ.  Assembly is used for tcopy when M is 16 and for ncopy when
# M is 4; every other M falls back to the generic C copy routine.
ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
ifeq ($(SGEMM_UNROLL_M), 16)
SGEMMITCOPY = sgemm_tcopy_$(SGEMM_UNROLL_M).S
else
SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
endif
ifeq ($(SGEMM_UNROLL_M), 4)
SGEMMINCOPY = sgemm_ncopy_$(SGEMM_UNROLL_M).S
else
SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
endif
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
endif
# Outer (N-side) copy routines: always the assembly file for SGEMM_UNROLL_N.
SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S
SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
# DGEMM / DTRMM compute kernels, named after DGEMM_UNROLL_M x DGEMM_UNROLL_N.
DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
# Inner (M-side) copy routines exist only when the unroll factors differ:
# assembly when M is 8, generic C otherwise.
ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N))
ifeq ($(DGEMM_UNROLL_M), 8)
DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S
DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S
else
DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c
DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c
endif
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
endif
# Outer (N-side) copy routines: assembly when N is 4, generic C otherwise.
ifeq ($(DGEMM_UNROLL_N), 4)
DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S
DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S
else
DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c
DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c
endif
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
# Complex GEMM / TRMM kernels.  The compute kernels are assembly files named
# after the unroll factors; all copy routines are the generic C zgemm copies.
# Inner (M-side) copies are only defined when the M and N unroll factors differ.

# Single-precision complex.
CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c
CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
endif
CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c
CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
# Double-precision complex.
ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N))
ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c
ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
endif
ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)

View File

@@ -321,7 +321,7 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x,
: "cc",
"memory",
"x0", "x1", "x2", "x3", "x4", "x5", "x6",
"d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8"
"d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", REGINF
);
}

View File

@@ -270,11 +270,6 @@ All rights reserved.
ldr s1, [A02]
ldr s2, [A03]
ldr s3, [A04]
add A01, A01, #4
add A02, A02, #4
add A03, A03, #4
add A04, A04, #4
stp s0, s1, [B04]
add B04, B04, #8
@@ -285,11 +280,6 @@ All rights reserved.
ldr s5, [A06]
ldr s6, [A07]
ldr s7, [A08]
ldr d4, [A05], #8
ldr d5, [A06], #8
ldr d6, [A07], #8
ldr d7, [A08], #8
stp s4, s5, [B04]
add B04, B04, #8

236
kernel/loongarch64/KERNEL Normal file
View File

@@ -0,0 +1,236 @@
# Default kernel selection for loongarch64.
# Every assignment is wrapped in ifndef, so it only applies when the variable
# does not already have a value when this file is read.

# AXPY, ROT, complex SWAP, SUM and index MAX/MIN: C sources from ../arm.
ifndef SAXPYKERNEL
SAXPYKERNEL = ../arm/axpy.c
endif
ifndef DAXPYKERNEL
DAXPYKERNEL = ../arm/axpy.c
endif
ifndef CAXPYKERNEL
CAXPYKERNEL = ../arm/zaxpy.c
endif
ifndef ZAXPYKERNEL
ZAXPYKERNEL = ../arm/zaxpy.c
endif
ifndef SROTKERNEL
SROTKERNEL = ../arm/rot.c
endif
ifndef DROTKERNEL
DROTKERNEL = ../arm/rot.c
endif
ifndef CROTKERNEL
CROTKERNEL = ../arm/zrot.c
endif
ifndef ZROTKERNEL
ZROTKERNEL = ../arm/zrot.c
endif
ifndef CSWAPKERNEL
CSWAPKERNEL = ../arm/zswap.c
endif
ifndef ZSWAPKERNEL
ZSWAPKERNEL = ../arm/zswap.c
endif
ifndef SSUMKERNEL
SSUMKERNEL = ../arm/sum.c
endif
ifndef DSUMKERNEL
DSUMKERNEL = ../arm/sum.c
endif
ifndef CSUMKERNEL
CSUMKERNEL = ../arm/zsum.c
endif
ifndef ZSUMKERNEL
ZSUMKERNEL = ../arm/zsum.c
endif
ifndef ISMAXKERNEL
ISMAXKERNEL = ../arm/imax.c
endif
ifndef IDMAXKERNEL
IDMAXKERNEL = ../arm/imax.c
endif
ifndef ISMINKERNEL
ISMINKERNEL = ../arm/imin.c
endif
ifndef IDMINKERNEL
IDMINKERNEL = ../arm/imin.c
endif
# NRM2: one assembly source per precision in this kernel directory.
ifndef SNRM2KERNEL
SNRM2KERNEL = snrm2.S
endif
ifndef DNRM2KERNEL
DNRM2KERNEL = dnrm2.S
endif
ifndef CNRM2KERNEL
CNRM2KERNEL = cnrm2.S
endif
ifndef ZNRM2KERNEL
ZNRM2KERNEL = znrm2.S
endif
# CABS and LSAME helpers: generic C sources.
ifndef SCABS_KERNEL
SCABS_KERNEL = ../generic/cabs.c
endif
ifndef DCABS_KERNEL
DCABS_KERNEL = ../generic/cabs.c
endif
ifndef QCABS_KERNEL
QCABS_KERNEL = ../generic/cabs.c
endif
ifndef LSAME_KERNEL
LSAME_KERNEL = ../generic/lsame.c
endif
# Default GEMM kernels and packing (copy) routines.
# Each group is skipped as a whole when its *GEMMKERNEL variable is already set,
# so an override of the kernel must also supply its own copy routines.
# Real precisions: gemm_kernel.S with generic copies of width 2 (inner) and
# width 8 (outer).  Complex precisions: zgemm_kernel.S with generic zgemm
# copies of width 1 (inner) and width 4 (outer).
# NOTE(review): the copy widths presumably have to match the *GEMM_UNROLL_M/N
# values configured for this architecture -- confirm against param.h.
ifndef SGEMMKERNEL
SGEMMKERNEL = gemm_kernel.S
SGEMMINCOPY = ../generic/gemm_ncopy_2.c
SGEMMITCOPY = ../generic/gemm_tcopy_2.c
SGEMMONCOPY = ../generic/gemm_ncopy_8.c
SGEMMOTCOPY = ../generic/gemm_tcopy_8.c
SGEMMINCOPYOBJ = sgemm_incopy.o
SGEMMITCOPYOBJ = sgemm_itcopy.o
SGEMMONCOPYOBJ = sgemm_oncopy.o
SGEMMOTCOPYOBJ = sgemm_otcopy.o
endif
ifndef DGEMMKERNEL
DGEMMKERNEL = gemm_kernel.S
DGEMMINCOPY = ../generic/gemm_ncopy_2.c
DGEMMITCOPY = ../generic/gemm_tcopy_2.c
DGEMMONCOPY = ../generic/gemm_ncopy_8.c
DGEMMOTCOPY = ../generic/gemm_tcopy_8.c
DGEMMINCOPYOBJ = dgemm_incopy.o
DGEMMITCOPYOBJ = dgemm_itcopy.o
DGEMMONCOPYOBJ = dgemm_oncopy.o
DGEMMOTCOPYOBJ = dgemm_otcopy.o
endif
ifndef CGEMMKERNEL
CGEMMKERNEL = zgemm_kernel.S
CGEMMINCOPY = ../generic/zgemm_ncopy_1.c
CGEMMITCOPY = ../generic/zgemm_tcopy_1.c
CGEMMONCOPY = ../generic/zgemm_ncopy_4.c
CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c
CGEMMINCOPYOBJ = cgemm_incopy.o
CGEMMITCOPYOBJ = cgemm_itcopy.o
CGEMMONCOPYOBJ = cgemm_oncopy.o
CGEMMOTCOPYOBJ = cgemm_otcopy.o
endif
ifndef ZGEMMKERNEL
ZGEMMKERNEL = zgemm_kernel.S
ZGEMMINCOPY = ../generic/zgemm_ncopy_1.c
ZGEMMITCOPY = ../generic/zgemm_tcopy_1.c
ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c
ZGEMMINCOPYOBJ = zgemm_incopy.o
ZGEMMITCOPYOBJ = zgemm_itcopy.o
ZGEMMONCOPYOBJ = zgemm_oncopy.o
ZGEMMOTCOPYOBJ = zgemm_otcopy.o
endif
# Default GEMM beta-scaling kernels: generic C, with the z-prefixed file for
# both complex precisions.
ifndef SGEMM_BETA
SGEMM_BETA = ../generic/gemm_beta.c
endif
ifndef DGEMM_BETA
DGEMM_BETA = ../generic/gemm_beta.c
endif
ifndef CGEMM_BETA
CGEMM_BETA = ../generic/zgemm_beta.c
endif
ifndef ZGEMM_BETA
ZGEMM_BETA = ../generic/zgemm_beta.c
endif
# Default TRSM kernels (assembly sources in this kernel directory).
# Not every variant has its own file:
#   real    : RN reuses trsm_kernel_LT.S; LN, LT and RT have dedicated files.
#   complex : LN, LT and RN all use ztrsm_kernel_LT.S; RT uses ztrsm_kernel_RT.S.
# NOTE(review): presumably the shared LT sources select the variant through
# compile-time defines passed by the kernel Makefile -- confirm before pointing
# any of these at a different file.
ifndef STRSMKERNEL_LN
STRSMKERNEL_LN = trsm_kernel_LN.S
endif
ifndef STRSMKERNEL_LT
STRSMKERNEL_LT = trsm_kernel_LT.S
endif
ifndef STRSMKERNEL_RN
STRSMKERNEL_RN = trsm_kernel_LT.S
endif
ifndef STRSMKERNEL_RT
STRSMKERNEL_RT = trsm_kernel_RT.S
endif
ifndef DTRSMKERNEL_LN
DTRSMKERNEL_LN = trsm_kernel_LN.S
endif
ifndef DTRSMKERNEL_LT
DTRSMKERNEL_LT = trsm_kernel_LT.S
endif
ifndef DTRSMKERNEL_RN
DTRSMKERNEL_RN = trsm_kernel_LT.S
endif
ifndef DTRSMKERNEL_RT
DTRSMKERNEL_RT = trsm_kernel_RT.S
endif
ifndef CTRSMKERNEL_LN
CTRSMKERNEL_LN = ztrsm_kernel_LT.S
endif
ifndef CTRSMKERNEL_LT
CTRSMKERNEL_LT = ztrsm_kernel_LT.S
endif
ifndef CTRSMKERNEL_RN
CTRSMKERNEL_RN = ztrsm_kernel_LT.S
endif
ifndef CTRSMKERNEL_RT
CTRSMKERNEL_RT = ztrsm_kernel_RT.S
endif
ifndef ZTRSMKERNEL_LN
ZTRSMKERNEL_LN = ztrsm_kernel_LT.S
endif
ifndef ZTRSMKERNEL_LT
ZTRSMKERNEL_LT = ztrsm_kernel_LT.S
endif
ifndef ZTRSMKERNEL_RN
ZTRSMKERNEL_RN = ztrsm_kernel_LT.S
endif
ifndef ZTRSMKERNEL_RT
ZTRSMKERNEL_RT = ztrsm_kernel_RT.S
endif
# Default GEMM3M kernels: both complex precisions use zgemm3m_kernel.S.
ifndef CGEMM3MKERNEL
CGEMM3MKERNEL = zgemm3m_kernel.S
endif
ifndef ZGEMM3MKERNEL
ZGEMM3MKERNEL = zgemm3m_kernel.S
endif

View File

@@ -0,0 +1 @@
#TODO: Add loongarch64 SIMD optimizations

View File

@@ -0,0 +1,167 @@
# Generic kernel table: every entry below names a portable C source, taken
# either from ../generic or from ../arm. Assignments are unconditional.

# GEMM beta routines (real precisions share one file, complex the other).
SGEMM_BETA = ../generic/gemm_beta.c
DGEMM_BETA = ../generic/gemm_beta.c
CGEMM_BETA = ../generic/zgemm_beta.c
ZGEMM_BETA = ../generic/zgemm_beta.c
# TRMM kernels, 2x2 variants.
STRMMKERNEL = ../generic/trmmkernel_2x2.c
DTRMMKERNEL = ../generic/trmmkernel_2x2.c
CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
# GEMM kernels (2x2) with their outer packing sources and object names.
# No inner-copy (INCOPY/ITCOPY) entries are set in this table.
SGEMMKERNEL = ../generic/gemmkernel_2x2.c
SGEMMONCOPY = ../generic/gemm_ncopy_2.c
SGEMMOTCOPY = ../generic/gemm_tcopy_2.c
SGEMMONCOPYOBJ = sgemm_oncopy.o
SGEMMOTCOPYOBJ = sgemm_otcopy.o
DGEMMKERNEL = ../generic/gemmkernel_2x2.c
DGEMMONCOPY = ../generic/gemm_ncopy_2.c
DGEMMOTCOPY = ../generic/gemm_tcopy_2.c
DGEMMONCOPYOBJ = dgemm_oncopy.o
DGEMMOTCOPYOBJ = dgemm_otcopy.o
CGEMMKERNEL = ../generic/zgemmkernel_2x2.c
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
CGEMMONCOPYOBJ = cgemm_oncopy.o
CGEMMOTCOPYOBJ = cgemm_otcopy.o
ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
ZGEMMONCOPYOBJ = zgemm_oncopy.o
ZGEMMOTCOPYOBJ = zgemm_otcopy.o
# TRSM kernels: all four precisions, complex included, use the same four
# generic sources (one per LN/LT/RN/RT variant).
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
#Pure C for other kernels
# Reductions returning a value: amax/amin, max/min.
SAMAXKERNEL = ../arm/amax.c
DAMAXKERNEL = ../arm/amax.c
CAMAXKERNEL = ../arm/zamax.c
ZAMAXKERNEL = ../arm/zamax.c
SAMINKERNEL = ../arm/amin.c
DAMINKERNEL = ../arm/amin.c
CAMINKERNEL = ../arm/zamin.c
ZAMINKERNEL = ../arm/zamin.c
SMAXKERNEL = ../arm/max.c
DMAXKERNEL = ../arm/max.c
SMINKERNEL = ../arm/min.c
DMINKERNEL = ../arm/min.c
# Index-returning counterparts of the reductions above.
ISAMAXKERNEL = ../arm/iamax.c
IDAMAXKERNEL = ../arm/iamax.c
ICAMAXKERNEL = ../arm/izamax.c
IZAMAXKERNEL = ../arm/izamax.c
ISAMINKERNEL = ../arm/iamin.c
IDAMINKERNEL = ../arm/iamin.c
ICAMINKERNEL = ../arm/izamin.c
IZAMINKERNEL = ../arm/izamin.c
ISMAXKERNEL = ../arm/imax.c
IDMAXKERNEL = ../arm/imax.c
ISMINKERNEL = ../arm/imin.c
IDMINKERNEL = ../arm/imin.c
# Sums: asum and sum.
SASUMKERNEL = ../arm/asum.c
DASUMKERNEL = ../arm/asum.c
CASUMKERNEL = ../arm/zasum.c
ZASUMKERNEL = ../arm/zasum.c
SSUMKERNEL = ../arm/sum.c
DSUMKERNEL = ../arm/sum.c
CSUMKERNEL = ../arm/zsum.c
ZSUMKERNEL = ../arm/zsum.c
# Level-1 vector routines: axpy, copy, dot, nrm2, rot, scal, swap.
SAXPYKERNEL = ../arm/axpy.c
DAXPYKERNEL = ../arm/axpy.c
CAXPYKERNEL = ../arm/zaxpy.c
ZAXPYKERNEL = ../arm/zaxpy.c
SCOPYKERNEL = ../arm/copy.c
DCOPYKERNEL = ../arm/copy.c
CCOPYKERNEL = ../arm/zcopy.c
ZCOPYKERNEL = ../arm/zcopy.c
# NOTE(review): SDOT is taken from ../generic while DDOT comes from ../arm;
# this looks deliberate but confirm before unifying the two.
SDOTKERNEL = ../generic/dot.c
DDOTKERNEL = ../arm/dot.c
CDOTKERNEL = ../arm/zdot.c
ZDOTKERNEL = ../arm/zdot.c
SNRM2KERNEL = ../arm/nrm2.c
DNRM2KERNEL = ../arm/nrm2.c
CNRM2KERNEL = ../arm/znrm2.c
ZNRM2KERNEL = ../arm/znrm2.c
SROTKERNEL = ../arm/rot.c
DROTKERNEL = ../arm/rot.c
CROTKERNEL = ../arm/zrot.c
ZROTKERNEL = ../arm/zrot.c
SSCALKERNEL = ../arm/scal.c
DSCALKERNEL = ../arm/scal.c
CSCALKERNEL = ../arm/zscal.c
ZSCALKERNEL = ../arm/zscal.c
SSWAPKERNEL = ../arm/swap.c
DSWAPKERNEL = ../arm/swap.c
CSWAPKERNEL = ../arm/zswap.c
ZSWAPKERNEL = ../arm/zswap.c
# Level-2 matrix-vector kernels: gemv (N and T variants).
SGEMVNKERNEL = ../arm/gemv_n.c
DGEMVNKERNEL = ../arm/gemv_n.c
CGEMVNKERNEL = ../arm/zgemv_n.c
ZGEMVNKERNEL = ../arm/zgemv_n.c
SGEMVTKERNEL = ../arm/gemv_t.c
DGEMVTKERNEL = ../arm/gemv_t.c
CGEMVTKERNEL = ../arm/zgemv_t.c
ZGEMVTKERNEL = ../arm/zgemv_t.c
# symv/hemv kernels, upper (U) and lower (L) variants.
# NOTE(review): the Q* and X* prefixes are presumably the extended-precision
# real/complex types -- confirm. Only ZHEMV is listed; no CHEMV entry is set here.
SSYMV_U_KERNEL = ../generic/symv_k.c
SSYMV_L_KERNEL = ../generic/symv_k.c
DSYMV_U_KERNEL = ../generic/symv_k.c
DSYMV_L_KERNEL = ../generic/symv_k.c
QSYMV_U_KERNEL = ../generic/symv_k.c
QSYMV_L_KERNEL = ../generic/symv_k.c
CSYMV_U_KERNEL = ../generic/zsymv_k.c
CSYMV_L_KERNEL = ../generic/zsymv_k.c
ZSYMV_U_KERNEL = ../generic/zsymv_k.c
ZSYMV_L_KERNEL = ../generic/zsymv_k.c
XSYMV_U_KERNEL = ../generic/zsymv_k.c
XSYMV_L_KERNEL = ../generic/zsymv_k.c
ZHEMV_U_KERNEL = ../generic/zhemv_k.c
ZHEMV_L_KERNEL = ../generic/zhemv_k.c
# Small helper routines: lsame and cabs.
LSAME_KERNEL = ../generic/lsame.c
SCABS_KERNEL = ../generic/cabs.c
DCABS_KERNEL = ../generic/cabs.c
QCABS_KERNEL = ../generic/cabs.c
#Dump kernel
# Both complex GEMM3M entries point at the "dump" source.
# NOTE(review): presumably a placeholder implementation -- confirm.
CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c
ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c

View File

@@ -0,0 +1 @@
# Double-colon clean rule with no prerequisites and no recipe: this file adds
# nothing of its own to the clean target.
clean ::

230
kernel/loongarch64/amax.S Normal file
View File

@@ -0,0 +1,230 @@
/***************************************************************************
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/*
 * Absolute-maximum kernel for LoongArch64 (scalar FP).
 * Walks N elements of X with stride INCX, tracks the largest |x[i]| seen and
 * leaves it in $f0 before jumping back through $r1.
 * The result register is preset to zero, so N <= 0 or INCX <= 0 yields 0.
 *
 * PROLOGUE, EPILOGUE, LD, LDINT, MTC, FABS, CMPLT, CMOVT, SIZE and BASE_SHIFT
 * come from common.h. The comments below assume they expand to the
 * precision-appropriate load, integer load, GPR-to-FPR move, absolute value,
 * compare-less-than, conditional select, element size in bytes and its log2
 * -- TODO confirm against common.h.
 */
#define ASSEMBLER
#include "common.h"
/* Arguments: element count, vector pointer, stride (scaled to bytes below). */
#define N $r4
#define X $r5
#define INCX $r6
/* I is the loop counter. TEMP is defined but not referenced in this file. */
#define I $r17
#define TEMP $r18
/* a1..a8: the eight elements held in flight by the unrolled loop. */
#define a1 $f10
#define a2 $f11
#define a3 $f12
#define a4 $f13
#define a5 $f14
#define a6 $f15
#define a7 $f16
#define a8 $f17
/* t1..t4: absolute values currently being compared. */
#define t1 $f0
#define t2 $f1
#define t3 $f2
#define t4 $f3
/* s1..s4: four independent running maxima, merged into s1 at .L998. */
#define s1 $f22
#define s2 $f8
#define s3 $f23
#define s4 $f9
PROLOGUE
#ifdef F_INTERFACE
/* With F_INTERFACE the count and stride are fetched through their own addresses. */
LDINT N, 0(N)
LDINT INCX, 0(INCX)
#endif
/* Preset the result to zero for the early exits below. */
MTC s1, $r0
bge $r0, N, .L999
/* Convert the stride from elements to bytes; non-positive strides exit. */
slli.d INCX, INCX, BASE_SHIFT
bge $r0, INCX, .L999
/* Seed all four running maxima with |x[0]|; N now counts the remaining elements. */
LD a1, X, 0 * SIZE
addi.d N, N, -1
add.d X, X, INCX
FABS s1, a1
FABS s2, a1
bge $r0, N, .L999
FABS s3, a1
/* I = number of full groups of eight among the remaining elements. */
srai.d I, N, 3
FABS s4, a1
bge $r0, I, .L15
/* Preload the first group of eight. */
LD a1, X, 0 * SIZE
add.d X, X, INCX
LD a2, X, 0 * SIZE
add.d X, X, INCX
LD a3, X, 0 * SIZE
add.d X, X, INCX
LD a4, X, 0 * SIZE
add.d X, X, INCX
LD a5, X, 0 * SIZE
add.d X, X, INCX
LD a6, X, 0 * SIZE
add.d X, X, INCX
LD a7, X, 0 * SIZE
add.d X, X, INCX
LD a8, X, 0 * SIZE
addi.d I, I, -1
add.d X, X, INCX
/* Only one group: skip straight to the drain code. */
bge $r0, I, .L13
.align 3
/*
 * Main loop, software-pipelined: each pass compares the eight values loaded
 * by the previous pass while loading the next eight into the same registers.
 * Instruction order matters here; loads reuse a register right after its
 * absolute value has been taken.
 */
.L12:
FABS t1, a1
LD a1, X, 0 * SIZE
FABS t2, a2
add.d X, X, INCX
FABS t3, a3
LD a2, X, 0 * SIZE
FABS t4, a4
add.d X, X, INCX
CMPLT $fcc0, s1, t1
LD a3, X, 0 * SIZE
CMPLT $fcc1, s2, t2
add.d X, X, INCX
CMPLT $fcc2, s3, t3
LD a4, X, 0 * SIZE
CMPLT $fcc3, s4, t4
add.d X, X, INCX
CMOVT s1, s1, t1, $fcc0
CMOVT s2, s2, t2, $fcc1
CMOVT s3, s3, t3, $fcc2
CMOVT s4, s4, t4, $fcc3
FABS t1, a5
LD a5, X, 0 * SIZE
FABS t2, a6
add.d X, X, INCX
FABS t3, a7
LD a6, X, 0 * SIZE
FABS t4, a8
add.d X, X, INCX
CMPLT $fcc0, s1, t1
LD a7, X, 0 * SIZE
CMPLT $fcc1, s2, t2
add.d X, X, INCX
CMPLT $fcc2, s3, t3
LD a8, X, 0 * SIZE
CMPLT $fcc3, s4, t4
add.d X, X, INCX
CMOVT s1, s1, t1, $fcc0
addi.d I, I, -1
CMOVT s2, s2, t2, $fcc1
CMOVT s3, s3, t3, $fcc2
CMOVT s4, s4, t4, $fcc3
blt $r0, I, .L12
.align 3
/* Drain: compare the last eight loaded values; no further loads are issued. */
.L13:
FABS t1, a1
FABS t2, a2
FABS t3, a3
FABS t4, a4
CMPLT $fcc0, s1, t1
CMPLT $fcc1, s2, t2
CMPLT $fcc2, s3, t3
CMPLT $fcc3, s4, t4
CMOVT s1, s1, t1, $fcc0
CMOVT s2, s2, t2, $fcc1
CMOVT s3, s3, t3, $fcc2
CMOVT s4, s4, t4, $fcc3
FABS t1, a5
FABS t2, a6
FABS t3, a7
FABS t4, a8
CMPLT $fcc0, s1, t1
CMPLT $fcc1, s2, t2
CMPLT $fcc2, s3, t3
CMPLT $fcc3, s4, t4
CMOVT s1, s1, t1, $fcc0
CMOVT s2, s2, t2, $fcc1
CMOVT s3, s3, t3, $fcc2
CMOVT s4, s4, t4, $fcc3
.align 3
/* Remainder: the low three bits of N give the leftover element count. */
.L15:
andi I, N, 7
bge $r0, I, .L998
.align 3
/* One element per pass, folded into s1 only. */
.L16:
LD a1, X, 0 * SIZE
addi.d I, I, -1
FABS t1, a1
CMPLT $fcc0, s1, t1
CMOVT s1, s1, t1, $fcc0
add.d X, X, INCX
blt $r0, I, .L16
.align 3
/* Merge the four running maxima pairwise into s1. */
.L998:
CMPLT $fcc0, s1, s2
CMPLT $fcc1, s3, s4
CMOVT s1, s1, s2, $fcc0
CMOVT s3, s3, s4, $fcc1
CMPLT $fcc0, s1, s3
CMOVT s1, s1, s3, $fcc0
.align 3
/*
 * Exit: copy s1 ($f22) into $f0 and jump to the address in $r1.
 * NOTE(review): the move of $r17 (I) into $r4 has no visible consumer in this
 * file, and on the early-exit paths I has not been written yet -- confirm it
 * is harmless boilerplate.
 */
.L999:
move $r4, $r17
fmov.d $f0, $f22
jirl $r0, $r1, 0x0
EPILOGUE

186
kernel/loongarch64/amin.S Normal file
View File

@@ -0,0 +1,186 @@
/***************************************************************************
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/*
 * Absolute-minimum kernel for LoongArch64 (scalar FP).
 * Walks N elements of X with stride INCX, tracks the smallest |x[i]| seen and
 * leaves it in $f0 before jumping back through $r1.
 * The result register is preset to zero, so N <= 0 or INCX <= 0 yields 0.
 *
 * PROLOGUE, EPILOGUE, LD, LDINT, MTC, FABS, CMPLT, CMOVT, NOP, SIZE and
 * BASE_SHIFT come from common.h. The comments below assume they expand to the
 * precision-appropriate load, integer load, GPR-to-FPR move, absolute value,
 * compare-less-than, conditional select, no-op, element size in bytes and its
 * log2 -- TODO confirm against common.h.
 */
#define ASSEMBLER
#include "common.h"
/* Arguments: element count, vector pointer, stride (scaled to bytes below). */
#define N $r4
#define X $r5
#define INCX $r6
/* I is the loop counter. TEMP is defined but not referenced in this file. */
#define I $r17
#define TEMP $r18
/* a1..a8: the eight elements held in flight by the unrolled loop. */
#define a1 $f10
#define a2 $f11
#define a3 $f12
#define a4 $f13
#define a5 $f14
#define a6 $f15
#define a7 $f16
#define a8 $f17
/* t1..t4: absolute values currently being compared. */
#define t1 $f0
#define t2 $f1
#define t3 $f2
#define t4 $f3
/* s1..s4: four independent running minima, merged into s1 at .L998. */
#define s1 $f22
#define s2 $f8
#define s3 $f23
#define s4 $f9
PROLOGUE
#ifdef F_INTERFACE
/* With F_INTERFACE the count and stride are fetched through their own addresses. */
LDINT N, 0(N)
LDINT INCX, 0(INCX)
#endif
/* Preset the result to zero for the early exits below. */
MTC s1, $r0
bge $r0, N, .L999
/* Convert the stride from elements to bytes; non-positive strides exit. */
slli.d INCX, INCX, BASE_SHIFT
bge $r0, INCX, .L999
/* Seed all four running minima with |x[0]|; N now counts the remaining elements. */
LD a1, X, 0 * SIZE
addi.d N, N, -1
add.d X, X, INCX
FABS s1, a1
FABS s2, a1
bge $r0, N, .L999
FABS s3, a1
/* I = number of full groups of eight among the remaining elements. */
srai.d I, N, 3
FABS s4, a1
bge $r0, I, .L15
/* Preload the first group of eight. */
LD a1, X, 0 * SIZE
add.d X, X, INCX
LD a2, X, 0 * SIZE
add.d X, X, INCX
LD a3, X, 0 * SIZE
add.d X, X, INCX
LD a4, X, 0 * SIZE
add.d X, X, INCX
LD a5, X, 0 * SIZE
add.d X, X, INCX
LD a6, X, 0 * SIZE
add.d X, X, INCX
LD a7, X, 0 * SIZE
add.d X, X, INCX
LD a8, X, 0 * SIZE
addi.d I, I, -1
add.d X, X, INCX
/* Only one group: skip straight to the drain code. */
bge $r0, I, .L13
.align 3
/*
 * Main loop, software-pipelined: each pass compares the eight values loaded
 * by the previous pass while loading the next eight into the same registers.
 * The compare operands are (candidate, current), so the candidate is selected
 * when it is the smaller one.
 */
.L12:
FABS t1, a1
LD a1, X, 0 * SIZE
FABS t2, a2
add.d X, X, INCX
FABS t3, a3
LD a2, X, 0 * SIZE
FABS t4, a4
add.d X, X, INCX
CMPLT $fcc0, t1, s1
LD a3, X, 0 * SIZE
CMPLT $fcc1, t2, s2
add.d X, X, INCX
CMPLT $fcc2, t3, s3
LD a4, X, 0 * SIZE
CMPLT $fcc3, t4, s4
add.d X, X, INCX
CMOVT s1, s1, t1, $fcc0
CMOVT s2, s2, t2, $fcc1
CMOVT s3, s3, t3, $fcc2
CMOVT s4, s4, t4, $fcc3
FABS t1, a5
LD a5, X, 0 * SIZE
FABS t2, a6
add.d X, X, INCX
FABS t3, a7
LD a6, X, 0 * SIZE
FABS t4, a8
add.d X, X, INCX
CMPLT $fcc0, t1, s1
LD a7, X, 0 * SIZE
CMPLT $fcc1, t2, s2
add.d X, X, INCX
CMPLT $fcc2, t3, s3
LD a8, X, 0 * SIZE
CMPLT $fcc3, t4, s4
add.d X, X, INCX
CMOVT s1, s1, t1, $fcc0
addi.d I, I, -1
CMOVT s2, s2, t2, $fcc1
CMOVT s3, s3, t3, $fcc2
CMOVT s4, s4, t4, $fcc3
blt $r0, I, .L12
.align 3
/* Drain: compare the last eight loaded values; no further loads are issued. */
.L13:
FABS t1, a1
FABS t2, a2
FABS t3, a3
FABS t4, a4
CMPLT $fcc0, t1, s1
CMPLT $fcc1, t2, s2
CMPLT $fcc2, t3, s3
CMPLT $fcc3, t4, s4
CMOVT s1, s1, t1, $fcc0
CMOVT s2, s2, t2, $fcc1
CMOVT s3, s3, t3, $fcc2
CMOVT s4, s4, t4, $fcc3
FABS t1, a5
FABS t2, a6
FABS t3, a7
FABS t4, a8
CMPLT $fcc0, t1, s1
CMPLT $fcc1, t2, s2
CMPLT $fcc2, t3, s3
CMPLT $fcc3, t4, s4
CMOVT s1, s1, t1, $fcc0
CMOVT s2, s2, t2, $fcc1
CMOVT s3, s3, t3, $fcc2
CMOVT s4, s4, t4, $fcc3
.align 3
/* Remainder: the low three bits of N give the leftover element count. */
.L15:
andi I, N, 7
NOP
bge $r0, I, .L998
.align 3
/* One element per pass, folded into s1 only. */
.L16:
LD a1, X, 0 * SIZE
addi.d I, I, -1
FABS t1, a1
CMPLT $fcc0, t1, s1
CMOVT s1, s1, t1, $fcc0
add.d X, X, INCX
blt $r0, I, .L16
.align 3
/* Merge the four running minima pairwise into s1. */
.L998:
CMPLT $fcc0, s2, s1
CMPLT $fcc1, s4, s3
CMOVT s1, s1, s2, $fcc0
CMOVT s3, s3, s4, $fcc1
CMPLT $fcc0, s3, s1
CMOVT s1, s1, s3, $fcc0
.align 3
/*
 * Exit: copy s1 ($f22) into $f0 and jump to the address in $r1.
 * NOTE(review): the move of $r17 (I) into $r4 has no visible consumer in this
 * file, and on the early-exit paths I has not been written yet -- confirm it
 * is harmless boilerplate.
 */
.L999:
move $r4, $r17
fmov.d $f0, $f22
jirl $r0, $r1, 0x0
EPILOGUE

232
kernel/loongarch64/asum.S Normal file
View File

@@ -0,0 +1,232 @@
/***************************************************************************
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/*
 * Absolute-sum kernel for LoongArch64 (scalar FP).
 * Adds up |x[i]| over N elements of X with stride INCX using two partial
 * sums (s1, s2), combines them at .L999 and leaves the total in $f0.
 * Both partial sums start at zero, so N <= 0 yields 0.
 * INCX is not range-checked in this routine; any stride other than one
 * element takes the strided path at .L20.
 *
 * PROLOGUE, EPILOGUE, LD, LDINT, MTC, FABS, ADD, NOP, SIZE and BASE_SHIFT come
 * from common.h. The comments below assume they expand to the
 * precision-appropriate load, integer load, GPR-to-FPR move, absolute value,
 * FP add, no-op, element size in bytes and its log2 -- TODO confirm.
 */
#define ASSEMBLER
#include "common.h"
/* Arguments: element count, vector pointer, stride (scaled to bytes below). */
#define N $r4
#define X $r5
#define INCX $r6
/* I is the loop counter; TEMP holds SIZE for the unit-stride test. */
#define I $r17
#define TEMP $r18
/* a1..a8: the eight elements held in flight by the unrolled loops. */
#define a1 $f23
#define a2 $f9
#define a3 $f10
#define a4 $f11
#define a5 $f12
#define a6 $f13
#define a7 $f14
#define a8 $f15
/* t1..t4: absolute values waiting to be accumulated. */
#define t1 $f16
#define t2 $f17
#define t3 $f0
#define t4 $f1
/* s1, s2: the two partial sums. */
#define s1 $f22
#define s2 $f8
PROLOGUE
#ifdef F_INTERFACE
/* With F_INTERFACE the count and stride are fetched through their own addresses. */
LDINT N, 0(N)
LDINT INCX, 0(INCX)
#endif
/* Clear both partial sums. */
MTC s1, $r0
MTC s2, $r0
/* Stride in bytes, compared against one element (SIZE) to pick the path. */
slli.d INCX, INCX, BASE_SHIFT
li TEMP, SIZE
bge $r0, N, .L999
/* I = number of full groups of eight. */
srai.d I, N, 3
bne INCX, TEMP, .L20
/* ---- Unit-stride path: elements addressed by immediate offsets from X. ---- */
bge $r0, I, .L15
LD a1, X, 0 * SIZE
LD a2, X, 1 * SIZE
LD a3, X, 2 * SIZE
LD a4, X, 3 * SIZE
LD a5, X, 4 * SIZE
FABS t1, a1
LD a6, X, 5 * SIZE
FABS t2, a2
LD a7, X, 6 * SIZE
FABS t3, a3
FABS t4, a4
addi.d I, I, -1
LD a8, X, 7 * SIZE
bge $r0, I, .L13
.align 3
/*
 * Software-pipelined main loop: accumulates the absolute values prepared
 * earlier while loading the next group. X is advanced by eight elements in
 * the middle of the body, so the offsets used before that point (8..11) and
 * after it (4..7) refer to the same upcoming group.
 */
.L12:
ADD s1, s1, t1
LD a1, X, 8 * SIZE
FABS t1, a5
addi.d I, I, -1
ADD s2, s2, t2
LD a2, X, 9 * SIZE
FABS t2, a6
NOP
ADD s1, s1, t3
LD a3, X, 10 * SIZE
FABS t3, a7
NOP
ADD s2, s2, t4
LD a4, X, 11 * SIZE
FABS t4, a8
addi.d X, X, 8 * SIZE
ADD s1, s1, t1
LD a5, X, 4 * SIZE
FABS t1, a1
NOP
ADD s2, s2, t2
LD a6, X, 5 * SIZE
FABS t2, a2
NOP
ADD s1, s1, t3
LD a7, X, 6 * SIZE
FABS t3, a3
NOP
ADD s2, s2, t4
LD a8, X, 7 * SIZE
FABS t4, a4
blt $r0, I, .L12
.align 3
/* Drain the last loaded group and step X past it. */
.L13:
ADD s1, s1, t1
addi.d X, X, 8 * SIZE
FABS t1, a5
NOP
ADD s2, s2, t2
FABS t2, a6
ADD s1, s1, t3
FABS t3, a7
ADD s2, s2, t4
FABS t4, a8
ADD s1, s1, t1
ADD s2, s2, t2
ADD s1, s1, t3
ADD s2, s2, t4
.align 3
/* Unit-stride remainder: the low three bits of N give the leftover count. */
.L15:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L16:
LD a1, X, 0 * SIZE
addi.d I, I, -1
FABS t1, a1
ADD s1, s1, t1
addi.d X, X, SIZE
blt $r0, I, .L16
b .L999
.align 3
/* ---- Strided path: X is advanced by INCX bytes after every load. ---- */
.L20:
bge $r0, I, .L25
LD a1, X, 0 * SIZE
add.d X, X, INCX
LD a2, X, 0 * SIZE
add.d X, X, INCX
LD a3, X, 0 * SIZE
add.d X, X, INCX
LD a4, X, 0 * SIZE
add.d X, X, INCX
LD a5, X, 0 * SIZE
add.d X, X, INCX
LD a6, X, 0 * SIZE
add.d X, X, INCX
FABS t1, a1
LD a7, X, 0 * SIZE
FABS t2, a2
add.d X, X, INCX
FABS t3, a3
LD a8, X, 0 * SIZE
FABS t4, a4
addi.d I, I, -1
add.d X, X, INCX
bge $r0, I, .L24
.align 3
/* Software-pipelined main loop for the strided case (same scheme as .L12). */
.L23:
ADD s1, s1, t1
LD a1, X, 0 * SIZE
FABS t1, a5
add.d X, X, INCX
ADD s2, s2, t2
LD a2, X, 0 * SIZE
FABS t2, a6
add.d X, X, INCX
ADD s1, s1, t3
LD a3, X, 0 * SIZE
FABS t3, a7
add.d X, X, INCX
ADD s2, s2, t4
LD a4, X, 0 * SIZE
FABS t4, a8
add.d X, X, INCX
ADD s1, s1, t1
LD a5, X, 0 * SIZE
FABS t1, a1
add.d X, X, INCX
ADD s2, s2, t2
LD a6, X, 0 * SIZE
FABS t2, a2
add.d X, X, INCX
ADD s1, s1, t3
LD a7, X, 0 * SIZE
FABS t3, a3
add.d X, X, INCX
ADD s2, s2, t4
LD a8, X, 0 * SIZE
FABS t4, a4
addi.d I, I, -1
add.d X, X, INCX
blt $r0, I, .L23
.align 3
/* Drain the last loaded group. */
.L24:
ADD s1, s1, t1
FABS t1, a5
ADD s2, s2, t2
FABS t2, a6
ADD s1, s1, t3
FABS t3, a7
ADD s2, s2, t4
FABS t4, a8
ADD s1, s1, t1
ADD s2, s2, t2
ADD s1, s1, t3
ADD s2, s2, t4
.align 3
/* Strided remainder: the low three bits of N give the leftover count. */
.L25:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L26:
LD a1, X, 0 * SIZE
addi.d I, I, -1
FABS t1, a1
add.d X, X, INCX
ADD s1, s1, t1
blt $r0, I, .L26
.align 3
/*
 * Exit: combine the partial sums, copy s1 ($f22) into $f0 and jump to the
 * address in $r1.
 * NOTE(review): the move of $r17 (I) into $r4 has no visible consumer in this
 * file -- confirm it is harmless boilerplate.
 */
.L999:
ADD s1, s1, s2
move $r4, $r17
fmov.d $f0, $f22
jirl $r0, $r1, 0x0
EPILOGUE

159
kernel/loongarch64/cnrm2.S Normal file
View File

@@ -0,0 +1,159 @@
/***************************************************************************
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/*
 * Euclidean-norm kernel for single-precision complex vectors (LoongArch64).
 * Each element is read as two consecutive values (offsets 0 and 1 * SIZE).
 * Every component is widened to double with fcvt.d.s, squared and accumulated
 * with fmadd.d into two partial sums; the square root of their total is
 * narrowed back to single precision with fcvt.s.d and left in $f0.
 * The partial sums start at zero, so N <= 0 or INCX <= 0 yields sqrt(0).
 *
 * PROLOGUE, EPILOGUE, LD, LDINT, SIZE and ZBASE_SHIFT come from common.h.
 * The comments assume LD is the single-precision load, SIZE the size of one
 * real component and ZBASE_SHIFT the log2 of one complex element's size
 * -- TODO confirm against common.h.
 */
#define ASSEMBLER
#include "common.h"
/* Arguments: element count, vector pointer, stride (scaled to bytes below). */
#define N $r4
#define X $r5
#define INCX $r6
/* I is the loop counter; TEMP is written once below but never read in this file. */
#define I $r17
#define TEMP $r18
/* a1..a8: real/imaginary components of the four elements held in flight. */
#define a1 $f12
#define a2 $f13
#define a3 $f14
#define a4 $f15
#define a5 $f16
#define a6 $f17
#define a7 $f0
#define a8 $f1
/* s1, s2: double-precision partial sums of squares. */
#define s1 $f22
#define s2 $f8
/* t1..t4: components widened to double, waiting to be squared. */
#define t1 $f23
#define t2 $f9
#define t3 $f10
#define t4 $f11
PROLOGUE
#ifdef F_INTERFACE
/* With F_INTERFACE the count and stride are fetched through their own addresses. */
LDINT N, 0(N)
LDINT INCX, 0(INCX)
#endif
/* Clear both partial sums. */
movgr2fr.d s1, $r0
/* NOTE(review): TEMP has no reader after this point; looks like leftover code. */
li TEMP, 2 * SIZE
fmov.d s2, s1
bge $r0, N, .L999
/* Convert the stride from elements to bytes; non-positive strides exit. */
slli.d INCX, INCX, ZBASE_SHIFT
bge $r0, INCX, .L999
/* I = number of full groups of four complex elements. */
srai.d I, N, 2
bge $r0, I, .L25
/* Preload four elements and start widening the first two. */
LD a1, X, 0 * SIZE
LD a2, X, 1 * SIZE
add.d X, X, INCX
LD a3, X, 0 * SIZE
LD a4, X, 1 * SIZE
add.d X, X, INCX
LD a5, X, 0 * SIZE
LD a6, X, 1 * SIZE
add.d X, X, INCX
fcvt.d.s t1, a1
LD a7, X, 0 * SIZE
fcvt.d.s t2, a2
LD a8, X, 1 * SIZE
fcvt.d.s t3, a3
addi.d I, I, -1
fcvt.d.s t4, a4
add.d X, X, INCX
bge $r0, I, .L24
.align 3
/*
 * Software-pipelined main loop: accumulates the squares of values widened
 * earlier while loading and widening the next four elements. Instruction
 * order matters; each aN is reloaded only after it has been converted.
 */
.L23:
fmadd.d s1, t1, t1, s1
LD a1, X, 0 * SIZE
fcvt.d.s t1, a5
fmadd.d s2, t2, t2, s2
LD a2, X, 1 * SIZE
fcvt.d.s t2, a6
add.d X, X, INCX
fmadd.d s1, t3, t3, s1
LD a3, X, 0 * SIZE
fcvt.d.s t3, a7
fmadd.d s2, t4, t4, s2
LD a4, X, 1 * SIZE
fcvt.d.s t4, a8
add.d X, X, INCX
fmadd.d s1, t1, t1, s1
LD a5, X, 0 * SIZE
fcvt.d.s t1, a1
addi.d I, I, -1
fmadd.d s2, t2, t2, s2
LD a6, X, 1 * SIZE
fcvt.d.s t2, a2
add.d X, X, INCX
fmadd.d s1, t3, t3, s1
LD a7, X, 0 * SIZE
fcvt.d.s t3, a3
LD a8, X, 1 * SIZE
fmadd.d s2, t4, t4, s2
add.d X, X, INCX
fcvt.d.s t4, a4
blt $r0, I, .L23
.align 3
/* Drain the last four loaded elements. */
.L24:
fmadd.d s1, t1, t1, s1
fcvt.d.s t1, a5
fmadd.d s2, t2, t2, s2
fcvt.d.s t2, a6
fmadd.d s1, t3, t3, s1
fcvt.d.s t3, a7
fmadd.d s2, t4, t4, s2
fcvt.d.s t4, a8
fmadd.d s1, t1, t1, s1
fmadd.d s2, t2, t2, s2
fmadd.d s1, t3, t3, s1
fmadd.d s2, t4, t4, s2
.align 3
/* Remainder: the low two bits of N give the leftover element count. */
.L25:
andi I, N, 3
bge $r0, I, .L999
.align 3
/* One complex element per pass: real part into s1, imaginary part into s2. */
.L26:
LD a1, X, 0 * SIZE
LD a2, X, 1 * SIZE
addi.d I, I, -1
fcvt.d.s t1, a1
fcvt.d.s t2, a2
fmadd.d s1, t1, t1, s1
add.d X, X, INCX
fmadd.d s2, t2, t2, s2
blt $r0, I, .L26
.align 3
/*
 * Exit: total the partial sums, take the square root, narrow to single
 * precision into $f0 and jump to the address in $r1.
 * NOTE(review): the move of $r17 (I) into $r4 has no visible consumer in this
 * file -- confirm it is harmless boilerplate.
 */
.L999:
fadd.d s1, s1, s2
fsqrt.d s1, s1
move $r4, $r17
fcvt.s.d $f0, s1
jirl $r0, $r1, 0x0
EPILOGUE

225
kernel/loongarch64/copy.S Normal file
View File

@@ -0,0 +1,225 @@
/***************************************************************************
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/*
 * Vector copy kernel for LoongArch64 (scalar FP).
 * Copies N elements from X (stride INCX) to Y (stride INCY), eight elements
 * per unrolled iteration. A contiguous path is used when both strides equal
 * one element; every other combination takes the strided path at .L20.
 * Nothing is copied when N <= 0.
 *
 * PROLOGUE, EPILOGUE, LD, ST, LDINT, NOP, SIZE and BASE_SHIFT come from
 * common.h. The comments below assume they expand to the
 * precision-appropriate load and store, integer load, no-op, element size in
 * bytes and its log2 -- TODO confirm against common.h.
 */
#define ASSEMBLER
#include "common.h"
/* Arguments: element count, source pointer/stride, destination pointer/stride. */
#define N $r4
#define X $r5
#define INCX $r6
#define Y $r7
#define INCY $r8
/* I is the loop counter; TEMP holds SIZE for the unit-stride tests. */
#define I $r17
#define TEMP $r18
/* a1..a8: the eight elements held in flight between load and store. */
#define a1 $f22
#define a2 $f8
#define a3 $f23
#define a4 $f9
#define a5 $f10
#define a6 $f11
#define a7 $f12
#define a8 $f13
PROLOGUE
#ifdef F_INTERFACE
/* With F_INTERFACE the count and strides are fetched through their own addresses. */
LDINT N, 0(N)
LDINT INCX, 0(INCX)
LDINT INCY, 0(INCY)
#endif
li TEMP, SIZE
NOP
/* Strides in bytes; either one differing from SIZE selects the strided path. */
slli.d INCX, INCX, BASE_SHIFT
bge $r0, N, .L999
slli.d INCY, INCY, BASE_SHIFT
bne INCX, TEMP, .L20
srai.d I, N, 3
bne INCY, TEMP, .L20
/* ---- Contiguous path. I = (N / 8) - 1; negative means no full group. ---- */
addi.d I, I, -1
blt I, $r0, .L15
LD a1, X, 0 * SIZE
LD a2, X, 1 * SIZE
LD a3, X, 2 * SIZE
LD a4, X, 3 * SIZE
LD a5, X, 4 * SIZE
LD a6, X, 5 * SIZE
LD a7, X, 6 * SIZE
LD a8, X, 7 * SIZE
/* Exactly one group: skip the loop and just store it. */
bge $r0, I, .L13
.align 3
/* Each pass stores the group loaded previously and loads the next one. */
.L12:
ST a1, Y, 0 * SIZE
LD a1, X, 8 * SIZE
ST a2, Y, 1 * SIZE
LD a2, X, 9 * SIZE
ST a3, Y, 2 * SIZE
LD a3, X, 10 * SIZE
ST a4, Y, 3 * SIZE
LD a4, X, 11 * SIZE
ST a5, Y, 4 * SIZE
LD a5, X, 12 * SIZE
ST a6, Y, 5 * SIZE
LD a6, X, 13 * SIZE
ST a7, Y, 6 * SIZE
LD a7, X, 14 * SIZE
ST a8, Y, 7 * SIZE
LD a8, X, 15 * SIZE
addi.d I, I, -1
addi.d X, X, 8 * SIZE
addi.d Y, Y, 8 * SIZE
blt $r0, I, .L12
.align 3
/* Store the final loaded group and step both pointers past it. */
.L13:
ST a1, Y, 0 * SIZE
ST a2, Y, 1 * SIZE
ST a3, Y, 2 * SIZE
ST a4, Y, 3 * SIZE
ST a5, Y, 4 * SIZE
ST a6, Y, 5 * SIZE
ST a7, Y, 6 * SIZE
ST a8, Y, 7 * SIZE
addi.d X, X, 8 * SIZE
addi.d Y, Y, 8 * SIZE
.align 3
/* Contiguous remainder: the low three bits of N give the leftover count. */
.L15:
andi I, N, 7
bge $r0, I, .L999
.align 3
/* Y is advanced before the store, hence the -1 * SIZE offset. */
.L16:
LD a1, X, 0 * SIZE
addi.d X, X, SIZE
addi.d I, I, -1
addi.d Y, Y, SIZE
ST a1, Y, -1 * SIZE
blt $r0, I, .L16
b .L999
.align 3
/* ---- Strided path. I = (N / 8) - 1; negative means no full group. ---- */
.L20:
srai.d I, N, 3
addi.d I, I, -1
blt I, $r0, .L25
LD a1, X, 0 * SIZE
add.d X, X, INCX
LD a2, X, 0 * SIZE
add.d X, X, INCX
LD a3, X, 0 * SIZE
add.d X, X, INCX
LD a4, X, 0 * SIZE
add.d X, X, INCX
LD a5, X, 0 * SIZE
add.d X, X, INCX
LD a6, X, 0 * SIZE
add.d X, X, INCX
LD a7, X, 0 * SIZE
add.d X, X, INCX
LD a8, X, 0 * SIZE
add.d X, X, INCX
/* Exactly one group: skip the loop and just store it. */
bge $r0, I, .L23
.align 3
/* Each pass stores the group loaded previously and loads the next one. */
.L22:
ST a1, Y, 0 * SIZE
add.d Y, Y, INCY
LD a1, X, 0 * SIZE
add.d X, X, INCX
ST a2, Y, 0 * SIZE
add.d Y, Y, INCY
LD a2, X, 0 * SIZE
add.d X, X, INCX
ST a3, Y, 0 * SIZE
add.d Y, Y, INCY
LD a3, X, 0 * SIZE
add.d X, X, INCX
ST a4, Y, 0 * SIZE
add.d Y, Y, INCY
LD a4, X, 0 * SIZE
add.d X, X, INCX
ST a5, Y, 0 * SIZE
add.d Y, Y, INCY
LD a5, X, 0 * SIZE
add.d X, X, INCX
ST a6, Y, 0 * SIZE
add.d Y, Y, INCY
LD a6, X, 0 * SIZE
add.d X, X, INCX
ST a7, Y, 0 * SIZE
add.d Y, Y, INCY
LD a7, X, 0 * SIZE
add.d X, X, INCX
ST a8, Y, 0 * SIZE
add.d Y, Y, INCY
LD a8, X, 0 * SIZE
addi.d I, I, -1
add.d X, X, INCX
blt $r0, I, .L22
.align 3
/* Store the final loaded group. */
.L23:
ST a1, Y, 0 * SIZE
add.d Y, Y, INCY
ST a2, Y, 0 * SIZE
add.d Y, Y, INCY
ST a3, Y, 0 * SIZE
add.d Y, Y, INCY
ST a4, Y, 0 * SIZE
add.d Y, Y, INCY
ST a5, Y, 0 * SIZE
add.d Y, Y, INCY
ST a6, Y, 0 * SIZE
add.d Y, Y, INCY
ST a7, Y, 0 * SIZE
add.d Y, Y, INCY
ST a8, Y, 0 * SIZE
add.d Y, Y, INCY
.align 3
/* Strided remainder: the low three bits of N give the leftover count. */
.L25:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L26:
LD a1, X, 0 * SIZE
add.d X, X, INCX
addi.d I, I, -1
ST a1, Y, 0 * SIZE
add.d Y, Y, INCY
blt $r0, I, .L26
.align 3
/*
 * Exit: jump to the address in $r1.
 * NOTE(review): the two moves below copy I into $r4 and a1 ($f22) into $f0;
 * neither value has a visible consumer for a copy routine -- confirm whether
 * callers rely on any return value.
 */
.L999:
move $r4, $r17
fmov.d $f0, $f22
jirl $r0, $r1, 0x0
EPILOGUE

314
kernel/loongarch64/dnrm2.S Normal file
View File

@@ -0,0 +1,314 @@
/***************************************************************************
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/*
 * Scaled Euclidean-norm kernel for LoongArch64 (scalar FP), done in two passes.
 *   Pass 1 (.L12/.L13/.L16, merged at .L100): find max = largest |x[i]|.
 *   Pass 2 (.L103/.L104/.L106): restart from the saved pointer XX and
 *          accumulate (ALPHA * x[i])^2 with ALPHA = 1 / max.
 *   Result (.L998): max * sqrt(sum), left in $f0.
 * Early exits through .L999 return s1 as is: 0 when N <= 0 or INCX <= 0,
 * |x[0]| when N == 1, and the maximum itself when it compares equal to zero.
 *
 * PROLOGUE, EPILOGUE, LD, LDINT, MTC, FABS, CMPLT, CMPEQ, CMOVT, MOV, MUL,
 * MADD, ADD, NOP, SIZE and BASE_SHIFT come from common.h. The comments assume
 * the obvious precision-appropriate expansions, with MADD d, a, b, c meaning
 * d = a * b + c -- TODO confirm against common.h.
 */
#define ASSEMBLER
#include "common.h"
/* Arguments: element count, vector pointer, stride (scaled to bytes below). */
#define N $r4
#define X $r5
#define INCX $r6
/* XX keeps the original value of X for the second pass. */
#define XX $r7
/* I is the loop counter; TEMP carries the bit pattern used to build 1.0. */
#define I $r17
#define TEMP $r18
/* a1..a8: the eight elements held in flight by the unrolled loops. */
#define a1 $f10
#define a2 $f11
#define a3 $f12
#define a4 $f13
#define a5 $f14
#define a6 $f15
#define a7 $f16
#define a8 $f17
/* t1..t4: absolute values (pass 1) or scaled elements (pass 2). */
#define t1 $f0
#define t2 $f1
#define t3 $f2
#define t4 $f3
/* s1..s4: running maxima in pass 1, partial sums of squares in pass 2. */
#define s1 $f22
#define s2 $f8
#define s3 $f23
#define s4 $f9
/* ALPHA = 1 / max; max = largest absolute value found in pass 1. */
#define ALPHA $f4
#define max $f5
PROLOGUE
#ifdef F_INTERFACE
/* With F_INTERFACE the count and stride are fetched through their own addresses. */
LDINT N, 0(N)
LDINT INCX, 0(INCX)
#endif
/* Preset the result to zero for the early exits below. */
MTC s1, $r0
bge $r0, N, .L999
/* Convert the stride from elements to bytes; non-positive strides exit. */
slli.d INCX, INCX, BASE_SHIFT
bge $r0, INCX, .L999
/* Remember the start of the vector for pass 2. */
move XX, X
NOP
/* ---- Pass 1: seed the running maxima with |x[0]|; N counts what is left. ---- */
LD a1, X, 0 * SIZE
addi.d N, N, -1
add.d X, X, INCX
FABS s1, a1
FABS s2, a1
/* Single-element vector: |x[0]| is already the answer. */
bge $r0, N, .L999
FABS s3, a1
srai.d I, N, 3
FABS s4, a1
bge $r0, I, .L15
/* Preload the first group of eight. */
LD a1, X, 0 * SIZE
add.d X, X, INCX
LD a2, X, 0 * SIZE
add.d X, X, INCX
LD a3, X, 0 * SIZE
add.d X, X, INCX
LD a4, X, 0 * SIZE
add.d X, X, INCX
LD a5, X, 0 * SIZE
add.d X, X, INCX
LD a6, X, 0 * SIZE
add.d X, X, INCX
LD a7, X, 0 * SIZE
add.d X, X, INCX
LD a8, X, 0 * SIZE
addi.d I, I, -1
add.d X, X, INCX
bge $r0, I, .L13
.align 3
/* Software-pipelined maximum search: compare the previous group while loading the next. */
.L12:
FABS t1, a1
LD a1, X, 0 * SIZE
FABS t2, a2
add.d X, X, INCX
FABS t3, a3
LD a2, X, 0 * SIZE
FABS t4, a4
add.d X, X, INCX
CMPLT $fcc0, s1, t1
LD a3, X, 0 * SIZE
CMPLT $fcc1, s2, t2
add.d X, X, INCX
CMPLT $fcc2, s3, t3
LD a4, X, 0 * SIZE
CMPLT $fcc3, s4, t4
add.d X, X, INCX
CMOVT s1, s1, t1, $fcc0
CMOVT s2, s2, t2, $fcc1
CMOVT s3, s3, t3, $fcc2
CMOVT s4, s4, t4, $fcc3
FABS t1, a5
LD a5, X, 0 * SIZE
FABS t2, a6
add.d X, X, INCX
FABS t3, a7
LD a6, X, 0 * SIZE
FABS t4, a8
add.d X, X, INCX
CMPLT $fcc0, s1, t1
LD a7, X, 0 * SIZE
CMPLT $fcc1, s2, t2
add.d X, X, INCX
CMPLT $fcc2, s3, t3
LD a8, X, 0 * SIZE
CMPLT $fcc3, s4, t4
add.d X, X, INCX
CMOVT s1, s1, t1, $fcc0
addi.d I, I, -1
CMOVT s2, s2, t2, $fcc1
CMOVT s3, s3, t3, $fcc2
CMOVT s4, s4, t4, $fcc3
blt $r0, I, .L12
.align 3
/* Drain: compare the last eight loaded values. */
.L13:
FABS t1, a1
FABS t2, a2
FABS t3, a3
FABS t4, a4
CMPLT $fcc0, s1, t1
CMPLT $fcc1, s2, t2
CMPLT $fcc2, s3, t3
CMPLT $fcc3, s4, t4
CMOVT s1, s1, t1, $fcc0
CMOVT s2, s2, t2, $fcc1
CMOVT s3, s3, t3, $fcc2
CMOVT s4, s4, t4, $fcc3
FABS t1, a5
FABS t2, a6
FABS t3, a7
FABS t4, a8
CMPLT $fcc0, s1, t1
CMPLT $fcc1, s2, t2
CMPLT $fcc2, s3, t3
CMPLT $fcc3, s4, t4
CMOVT s1, s1, t1, $fcc0
CMOVT s2, s2, t2, $fcc1
CMOVT s3, s3, t3, $fcc2
CMOVT s4, s4, t4, $fcc3
.align 3
/* Pass-1 remainder: the low three bits of N give the leftover count. */
.L15:
andi I, N, 7
bge $r0, I, .L100
.align 3
.L16:
LD a1, X, 0 * SIZE
addi.d I, I, -1
FABS t1, a1
CMPLT $fcc0, s1, t1
CMOVT s1, s1, t1, $fcc0
add.d X, X, INCX
blt $r0, I, .L16
.align 3
/* ---- Between passes: merge the maxima and build the scale factor. ---- */
.L100:
CMPLT $fcc0, s1, s2
CMPLT $fcc1, s3, s4
CMOVT s1, s1, s2, $fcc0
CMOVT s3, s3, s4, $fcc1
CMPLT $fcc0, s1, s3
CMOVT s1, s1, s3, $fcc0
/* Undo the earlier decrement so pass 2 covers every element, x[0] included. */
addi.d N, N, 1
/* 0x3f800 << 12 = 0x3f800000, the single-precision bit pattern of 1.0. */
lu12i.w TEMP, 0x3f800
movgr2fr.d a1, $r0
movgr2fr.w ALPHA, TEMP
/* A maximum equal to zero is returned directly, avoiding the division below. */
CMPEQ $fcc0, s1, a1
fcvt.d.s ALPHA, ALPHA
bcnez $fcc0, .L999
/* ALPHA = 1.0 / max; save max and clear the four accumulators (a1 holds 0). */
fdiv.d ALPHA, ALPHA, s1
MOV max, s1
MOV s1, a1
MOV s2, a1
MOV s3, a1
MOV s4, a1
/* ---- Pass 2: sum of squares of the scaled elements, read through XX. ---- */
srai.d I, N, 3
bge $r0, I, .L105
LD a1, XX, 0 * SIZE
add.d XX, XX, INCX
LD a2, XX, 0 * SIZE
add.d XX, XX, INCX
LD a3, XX, 0 * SIZE
add.d XX, XX, INCX
LD a4, XX, 0 * SIZE
add.d XX, XX, INCX
LD a5, XX, 0 * SIZE
add.d XX, XX, INCX
LD a6, XX, 0 * SIZE
add.d XX, XX, INCX
LD a7, XX, 0 * SIZE
add.d XX, XX, INCX
LD a8, XX, 0 * SIZE
addi.d I, I, -1
add.d XX, XX, INCX
bge $r0, I, .L104
.align 3
/* Software-pipelined accumulation: scale and square the previous group while loading the next. */
.L103:
MUL t1, ALPHA, a1
LD a1, XX, 0 * SIZE
MUL t2, ALPHA, a2
add.d XX, XX, INCX
MUL t3, ALPHA, a3
LD a2, XX, 0 * SIZE
MUL t4, ALPHA, a4
add.d XX, XX, INCX
MADD s1, t1, t1, s1
LD a3, XX, 0 * SIZE
MADD s2, t2, t2, s2
add.d XX, XX, INCX
MADD s3, t3, t3, s3
LD a4, XX, 0 * SIZE
MADD s4, t4, t4, s4
add.d XX, XX, INCX
MUL t1, ALPHA, a5
LD a5, XX, 0 * SIZE
MUL t2, ALPHA, a6
add.d XX, XX, INCX
MUL t3, ALPHA, a7
LD a6, XX, 0 * SIZE
MUL t4, ALPHA, a8
add.d XX, XX, INCX
MADD s1, t1, t1, s1
LD a7, XX, 0 * SIZE
MADD s2, t2, t2, s2
add.d XX, XX, INCX
MADD s3, t3, t3, s3
LD a8, XX, 0 * SIZE
MADD s4, t4, t4, s4
addi.d I, I, -1
add.d XX, XX, INCX
blt $r0, I, .L103
.align 3
/* Drain the last loaded group. */
.L104:
MUL t1, ALPHA, a1
MUL t2, ALPHA, a2
MUL t3, ALPHA, a3
MUL t4, ALPHA, a4
MADD s1, t1, t1, s1
MADD s2, t2, t2, s2
MADD s3, t3, t3, s3
MADD s4, t4, t4, s4
MUL t1, ALPHA, a5
MUL t2, ALPHA, a6
MUL t3, ALPHA, a7
MUL t4, ALPHA, a8
MADD s1, t1, t1, s1
MADD s2, t2, t2, s2
MADD s3, t3, t3, s3
MADD s4, t4, t4, s4
.align 3
/* Pass-2 remainder: the low three bits of N give the leftover count. */
.L105:
andi I, N, 7
bge $r0, I, .L998
.align 3
.L106:
LD a1, XX, 0 * SIZE
addi.d I, I, -1
MUL t1, ALPHA, a1
add.d XX, XX, INCX
MADD s1, t1, t1, s1
blt $r0, I, .L106
.align 3
/* Normal exit: total the partial sums, take the root and rescale by max into $f0. */
.L998:
ADD s1, s1, s2
ADD s3, s3, s4
ADD s1, s1, s3
fsqrt.d s1, s1
move $r4, $r17
MUL $f0, max, s1
jirl $r0, $r1, 0x0
.align 3
/*
 * Early exit: return s1 ($f22) unchanged in $f0.
 * NOTE(review): the move of $r17 (I) into $r4 has no visible consumer in this
 * file, and on the earliest exits I has not been written yet -- confirm it is
 * harmless boilerplate.
 */
.L999:
move $r4, $r17
fmov.d $f0, $f22
jirl $r0, $r1, 0x0
EPILOGUE

391
kernel/loongarch64/dot.S Normal file
View File

@@ -0,0 +1,391 @@
/***************************************************************************
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define ASSEMBLER
#include "common.h"
/*
 * DOT kernel: accumulates x[i] * y[i] over N elements and leaves the sum
 * in $f0.  When DSDOT is defined every loaded element is widened with
 * fcvt.d.s and the accumulation uses fmadd.d / fadd.d.
 *
 * Integer registers ($r4-$r8 are used as they arrive, without being loaded):
 *   N           element count
 *   X, Y        vector pointers
 *   INCX, INCY  strides; shifted left by BASE_SHIFT in the prologue
 *   I           loop counter
 *   TEMP        scratch
 * Floating-point registers:
 *   a1-a4, b1-b4  staged elements of x and y
 *   s1, s2        two independent partial sums, added together at .L999
 */
#define N $r4
#define X $r5
#define INCX $r6
#define Y $r7
#define INCY $r8
#define I $r17
#define TEMP $r18
#define a1 $f23
#define a2 $f9
#define a3 $f10
#define a4 $f11
#define b1 $f12
#define b2 $f13
#define b3 $f14
#define b4 $f15
#define s1 $f22
#define s2 $f8
PROLOGUE
#ifdef F_INTERFACE
/* Arguments arrive by reference here: replace each pointer by the value it
   points to.  Operands use the same "rd, rj, imm" order as every other load
   in this file (assumes LDINT expands to an ld.w/ld.d-style mnemonic). */
LDINT N, N, 0
LDINT INCX, INCX, 0
LDINT INCY, INCY, 0
#endif
/* Clear both partial sums. */
MTC s1, $r0
MTC s2, $r0
slli.d INCX, INCX, BASE_SHIFT
li TEMP, SIZE
slli.d INCY, INCY, BASE_SHIFT
/* N <= 0: return the zero sum. */
bge $r0, N, .L999
/* I = number of 8-element groups. */
srai.d I, N, 3
/* Take the strided path unless both strides equal SIZE. */
bne INCX, TEMP, .L20
bne INCY, TEMP, .L20
bge $r0, I, .L15
/* Preload elements 0..3 of the first group. */
LD a1, X, 0 * SIZE
LD b1, Y, 0 * SIZE
LD a2, X, 1 * SIZE
LD b2, Y, 1 * SIZE
LD a3, X, 2 * SIZE
LD b3, Y, 2 * SIZE
LD a4, X, 3 * SIZE
addi.d I, I, -1
LD b4, Y, 3 * SIZE
bge $r0, I, .L13
.align 3
/* Contiguous main loop: consumes 8 element pairs per iteration.  Each
   multiply-add is followed by the load that refills the same registers, so
   on exit elements 0..3 of the next group are already staged. */
.L12:
#ifdef DSDOT
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s1, b1, a1, s1
#else
MADD s1, b1, a1, s1
#endif
LD a1, X, 4 * SIZE
LD b1, Y, 4 * SIZE
#ifdef DSDOT
fcvt.d.s a2, a2
fcvt.d.s b2, b2
fmadd.d s2, b2, a2, s2
#else
MADD s2, b2, a2, s2
#endif
LD a2, X, 5 * SIZE
LD b2, Y, 5 * SIZE
#ifdef DSDOT
fcvt.d.s a3, a3
fcvt.d.s b3, b3
fmadd.d s1, b3, a3, s1
#else
MADD s1, b3, a3, s1
#endif
LD a3, X, 6 * SIZE
LD b3, Y, 6 * SIZE
#ifdef DSDOT
fcvt.d.s a4, a4
fcvt.d.s b4, b4
fmadd.d s2, b4, a4, s2
#else
MADD s2, b4, a4, s2
#endif
LD a4, X, 7 * SIZE
LD b4, Y, 7 * SIZE
#ifdef DSDOT
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s1, b1, a1, s1
#else
MADD s1, b1, a1, s1
#endif
LD a1, X, 8 * SIZE
LD b1, Y, 8 * SIZE
#ifdef DSDOT
fcvt.d.s a2, a2
fcvt.d.s b2, b2
fmadd.d s2, b2, a2, s2
#else
MADD s2, b2, a2, s2
#endif
LD a2, X, 9 * SIZE
LD b2, Y, 9 * SIZE
#ifdef DSDOT
fcvt.d.s a3, a3
fcvt.d.s b3, b3
fmadd.d s1, b3, a3, s1
#else
MADD s1, b3, a3, s1
#endif
LD a3, X, 10 * SIZE
LD b3, Y, 10 * SIZE
#ifdef DSDOT
fcvt.d.s a4, a4
fcvt.d.s b4, b4
fmadd.d s2, b4, a4, s2
#else
MADD s2, b4, a4, s2
#endif
LD a4, X, 11 * SIZE
LD b4, Y, 11 * SIZE
addi.d I, I, -1
addi.d X, X, 8 * SIZE
addi.d Y, Y, 8 * SIZE
blt $r0, I, .L12
.align 3
/* Drain: finish the last staged group.  Only elements 4..7 are still
   loaded; nothing beyond the group is read. */
.L13:
#ifdef DSDOT
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s1, b1, a1, s1
#else
MADD s1, b1, a1, s1
#endif
LD a1, X, 4 * SIZE
LD b1, Y, 4 * SIZE
#ifdef DSDOT
fcvt.d.s a2, a2
fcvt.d.s b2, b2
fmadd.d s2, b2, a2, s2
#else
MADD s2, b2, a2, s2
#endif
LD a2, X, 5 * SIZE
LD b2, Y, 5 * SIZE
#ifdef DSDOT
fcvt.d.s a3, a3
fcvt.d.s b3, b3
fmadd.d s1, b3, a3, s1
#else
MADD s1, b3, a3, s1
#endif
LD a3, X, 6 * SIZE
LD b3, Y, 6 * SIZE
#ifdef DSDOT
fcvt.d.s a4, a4
fcvt.d.s b4, b4
fmadd.d s2, b4, a4, s2
#else
MADD s2, b4, a4, s2
#endif
LD a4, X, 7 * SIZE
LD b4, Y, 7 * SIZE
#ifdef DSDOT
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s1, b1, a1, s1
#else
MADD s1, b1, a1, s1
#endif
addi.d X, X, 8 * SIZE
#ifdef DSDOT
fcvt.d.s a2, a2
fcvt.d.s b2, b2
fmadd.d s2, b2, a2, s2
#else
MADD s2, b2, a2, s2
#endif
addi.d Y, Y, 8 * SIZE
#ifdef DSDOT
fcvt.d.s a3, a3
fcvt.d.s b3, b3
fmadd.d s1, b3, a3, s1
#else
MADD s1, b3, a3, s1
#endif
#ifdef DSDOT
fcvt.d.s a4, a4
fcvt.d.s b4, b4
fmadd.d s2, b4, a4, s2
#else
MADD s2, b4, a4, s2
#endif
.align 3
/* Contiguous tail: the remaining N & 7 elements, one per iteration. */
.L15:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L16:
LD a1, X, 0 * SIZE
LD b1, Y, 0 * SIZE
#ifdef DSDOT
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s1, b1, a1, s1
#else
MADD s1, b1, a1, s1
#endif
addi.d I, I, -1
addi.d X, X, SIZE
addi.d Y, Y, SIZE
blt $r0, I, .L16
b .L999
.align 3
/* Strided path (at least one stride differs from SIZE). */
.L20:
#ifdef F_INTERFACE
/* For a negative stride, rebase the pointer: X -= (N - 1) * INCX, where
   INCX is already the shifted stride.  Written with LoongArch mul.d/sub.d;
   the MIPS mult/mflo/dsub sequence does not exist on this architecture. */
bge INCX, $r0, .L21
addi.d TEMP, N, -1
mul.d TEMP, TEMP, INCX
sub.d X, X, TEMP
.align 3
.L21:
/* Same adjustment for Y. */
bge INCY, $r0, .L22
addi.d TEMP, N, -1
mul.d TEMP, TEMP, INCY
sub.d Y, Y, TEMP
.align 3
.L22:
#endif
bge $r0, I, .L25
.align 3
/* Strided main loop: 8 element pairs per iteration, not pipelined.  The
   products alternate between s1 and s2. */
.L23:
LD a1, X, 0 * SIZE
add.d X, X, INCX
LD b1, Y, 0 * SIZE
add.d Y, Y, INCY
#ifdef DSDOT
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s1, b1, a1, s1
#else
MADD s1, b1, a1, s1
#endif
LD a1, X, 0 * SIZE
add.d X, X, INCX
LD b1, Y, 0 * SIZE
add.d Y, Y, INCY
#ifdef DSDOT
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s2, b1, a1, s2
#else
MADD s2, b1, a1, s2
#endif
LD a1, X, 0 * SIZE
add.d X, X, INCX
LD b1, Y, 0 * SIZE
add.d Y, Y, INCY
#ifdef DSDOT
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s1, b1, a1, s1
#else
MADD s1, b1, a1, s1
#endif
LD a1, X, 0 * SIZE
add.d X, X, INCX
LD b1, Y, 0 * SIZE
add.d Y, Y, INCY
#ifdef DSDOT
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s2, b1, a1, s2
#else
MADD s2, b1, a1, s2
#endif
LD a1, X, 0 * SIZE
add.d X, X, INCX
LD b1, Y, 0 * SIZE
add.d Y, Y, INCY
#ifdef DSDOT
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s1, b1, a1, s1
#else
MADD s1, b1, a1, s1
#endif
LD a1, X, 0 * SIZE
add.d X, X, INCX
LD b1, Y, 0 * SIZE
add.d Y, Y, INCY
#ifdef DSDOT
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s2, b1, a1, s2
#else
MADD s2, b1, a1, s2
#endif
LD a1, X, 0 * SIZE
add.d X, X, INCX
LD b1, Y, 0 * SIZE
add.d Y, Y, INCY
#ifdef DSDOT
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s1, b1, a1, s1
#else
MADD s1, b1, a1, s1
#endif
LD a1, X, 0 * SIZE
add.d X, X, INCX
LD b1, Y, 0 * SIZE
add.d Y, Y, INCY
addi.d I, I, -1
#ifdef DSDOT
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s2, b1, a1, s2
#else
MADD s2, b1, a1, s2
#endif
blt $r0, I, .L23
.align 3
/* Strided tail: the remaining N & 7 elements. */
.L25:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L26:
LD a1, X, 0 * SIZE
add.d X, X, INCX
LD b1, Y, 0 * SIZE
add.d Y, Y, INCY
addi.d I, I, -1
#ifdef DSDOT
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s1, b1, a1, s1
#else
MADD s1, b1, a1, s1
#endif
blt $r0, I, .L26
.align 3
/* Combine the two partial sums into the result register $f0 and return. */
.L999:
#ifdef DSDOT
fadd.d $f0, s1, s2
#else
ADD $f0, s1, s2
#endif
/* NOTE(review): this copies the loop counter I ($r17) into $r4; the computed
   result is the sum in $f0.  Kept as in the original -- confirm that no
   caller reads $r4. */
move $r4, $r17
jirl $r0, $r1, 0x0
EPILOGUE

File diff suppressed because it is too large Load Diff

531
kernel/loongarch64/gemv_n.S Normal file
View File

@@ -0,0 +1,531 @@
/***************************************************************************
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define ASSEMBLER
#include "common.h"
/*
 * GEMV (non-transposed) kernel: for every column j of A it adds
 * (ALPHA * x[j]) * A[0..M-1, j] into y.  Columns are LDA elements apart
 * and are processed two at a time (AO1 / AO2).
 *
 * The third integer argument register ($r6, dummy1) is unused as an input
 * and is reused to hold INCY, which is loaded from the stack together with
 * BUFFER.
 *
 * When INCY (after scaling by BASE_SHIFT) differs from SIZE, y is first
 * gathered into BUFFER, updated there, and scattered back at .L900.
 *
 *   YORIG   base of the contiguous y actually updated (Y or BUFFER)
 *   XX, YY  walking pointers; I, J row / column-pair counters
 *   a1-a8   staged elements of A; x1, x2 the ALPHA-scaled x values
 *   y1-y8   staged elements of y; t1-t4 results waiting to be stored
 */
#define M $r4
#define N $r5
#define A $r7
#define LDA $r8
#define X $r9
#define INCX $r10
#define Y $r11
#define INCY $r6
#define BUFFER $r16
#define YORIG $r18
#define XX $r12
#define YY $r13
#define I $r14
#define J $r15
#define AO1 $r23
#define AO2 $r24
#define ALPHA $f0
#define a1 $f22
#define a2 $f8
#define a3 $f23
#define a4 $f9
#define a5 $f10
#define a6 $f11
#define a7 $f12
#define a8 $f13
#define x1 $f14
#define x2 $f15
#define y1 $f16
#define y2 $f17
#define y3 $f3
#define y4 $f1
#define y5 $f2
#define y6 $f4
#define y7 $f5
#define y8 $f6
#define t1 $f7
#define t2 $f18
#define t3 $f19
#define t4 $f20
PROLOGUE
/* Stack-passed arguments are read before the frame is allocated. */
LDARG INCY, $sp, 0
LDARG BUFFER, $sp, 8
#ifdef __64BIT__
addi.d $sp, $sp, -16
#else
addi.d $sp, $sp, -48
#endif
/* $r23 / $r24 (AO1 / AO2) are saved here and restored at .L999. */
SDARG $r23, $sp, 0
SDARG $r24, $sp, 8
slli.d LDA, LDA, BASE_SHIFT
#ifndef __64BIT__
/* The larger frame also preserves $f18-$f20 (t2-t4). */
fst.d $f18, $sp, 16
fst.d $f19, $sp, 24
fst.d $f20, $sp, 32
#endif
slli.d INCX, INCX, BASE_SHIFT
/* Nothing to do when M <= 0 or N <= 0. */
bge $r0, M, .L999
slli.d INCY, INCY, BASE_SHIFT
bge $r0, N, .L999
li I, SIZE
move YORIG, Y
/* y already contiguous: work on it in place. */
beq INCY, I, .L10
/* Otherwise gather y into BUFFER, four elements per iteration. */
srai.d I, M, 2
move YORIG, BUFFER
move XX, Y
move YY, BUFFER
bge $r0, I, .L05
.align 3
.L02:
LD a1, XX, 0 * SIZE
add.d XX, XX, INCY
LD a2, XX, 0 * SIZE
add.d XX, XX, INCY
LD a3, XX, 0 * SIZE
add.d XX, XX, INCY
LD a4, XX, 0 * SIZE
add.d XX, XX, INCY
ST a1, YY, 0 * SIZE
ST a2, YY, 1 * SIZE
ST a3, YY, 2 * SIZE
ST a4, YY, 3 * SIZE
addi.d I, I, -1
addi.d YY, YY, 4 * SIZE
blt $r0, I, .L02
.align 3
/* Gather the remaining M & 3 elements. */
.L05:
andi I, M, 3
bge $r0, I, .L10
.align 3
.L06:
LD a1, XX, 0 * SIZE
add.d XX, XX, INCY
ST a1, YY, 0 * SIZE
addi.d I, I, -1
addi.d YY, YY, 1 * SIZE
blt $r0, I, .L06
.align 3
/* Column-pair loop: J = N / 2. */
.L10:
srai.d J, N, 1
bge $r0, J, .L20
.align 3
.L11:
/* Fetch two x values, point AO1/AO2 at the two columns, advance A past
   both, and pre-scale the x values by ALPHA. */
LD x1, X, 0 * SIZE
add.d X, X, INCX
LD x2, X, 0 * SIZE
add.d X, X, INCX
move AO1, A
add.d AO2, A, LDA
add.d A, AO2, LDA
move YY, YORIG
MUL x1, ALPHA, x1
srai.d I, M, 3
MUL x2, ALPHA, x2
bge $r0, I, .L15
/* Preload rows 0..3 of both columns and y[0..7]. */
LD a1, AO1, 0 * SIZE
LD y1, YY, 0 * SIZE
LD a2, AO1, 1 * SIZE
LD y2, YY, 1 * SIZE
LD a3, AO1, 2 * SIZE
LD y3, YY, 2 * SIZE
LD a4, AO1, 3 * SIZE
LD y4, YY, 3 * SIZE
LD a5, AO2, 0 * SIZE
LD y5, YY, 4 * SIZE
LD a6, AO2, 1 * SIZE
LD y6, YY, 5 * SIZE
LD a7, AO2, 2 * SIZE
LD y7, YY, 6 * SIZE
LD a8, AO2, 3 * SIZE
addi.d I, I, -1
LD y8, YY, 7 * SIZE
bge $r0, I, .L13
.align 3
/* Pipelined body: 8 rows per iteration in two halves of 4.  Each half
   computes t = a(AO1)*x1 + y, then t += a(AO2)*x2, and stores t back to y
   while the operands of the following half are being loaded. */
.L12:
MADD t1, a1, x1, y1
LD a1, AO1, 4 * SIZE
MADD t2, a2, x1, y2
LD a2, AO1, 5 * SIZE
LD y1, YY, 8 * SIZE
LD y2, YY, 9 * SIZE
MADD t3, a3, x1, y3
LD a3, AO1, 6 * SIZE
MADD t4, a4, x1, y4
LD a4, AO1, 7 * SIZE
LD y3, YY, 10 * SIZE
LD y4, YY, 11 * SIZE
MADD t1, a5, x2, t1
LD a5, AO2, 4 * SIZE
MADD t2, a6, x2, t2
LD a6, AO2, 5 * SIZE
MADD t3, a7, x2, t3
LD a7, AO2, 6 * SIZE
MADD t4, a8, x2, t4
LD a8, AO2, 7 * SIZE
ST t1, YY, 0 * SIZE
ST t2, YY, 1 * SIZE
ST t3, YY, 2 * SIZE
ST t4, YY, 3 * SIZE
MADD t1, a1, x1, y5
LD a1, AO1, 8 * SIZE
MADD t2, a2, x1, y6
LD a2, AO1, 9 * SIZE
LD y5, YY, 12 * SIZE
LD y6, YY, 13 * SIZE
MADD t3, a3, x1, y7
LD a3, AO1, 10 * SIZE
MADD t4, a4, x1, y8
LD a4, AO1, 11 * SIZE
LD y7, YY, 14 * SIZE
LD y8, YY, 15 * SIZE
MADD t1, a5, x2, t1
LD a5, AO2, 8 * SIZE
MADD t2, a6, x2, t2
LD a6, AO2, 9 * SIZE
MADD t3, a7, x2, t3
LD a7, AO2, 10 * SIZE
MADD t4, a8, x2, t4
LD a8, AO2, 11 * SIZE
ST t1, YY, 4 * SIZE
ST t2, YY, 5 * SIZE
ST t3, YY, 6 * SIZE
ST t4, YY, 7 * SIZE
addi.d I, I, -1
addi.d YY, YY, 8 * SIZE
addi.d AO1, AO1, 8 * SIZE
addi.d AO2, AO2, 8 * SIZE
blt $r0, I, .L12
.align 3
/* Drain the final 8-row group; only rows 4..7 of the group are loaded. */
.L13:
MADD t1, a1, x1, y1
LD a1, AO1, 4 * SIZE
MADD t2, a2, x1, y2
LD a2, AO1, 5 * SIZE
MADD t3, a3, x1, y3
LD a3, AO1, 6 * SIZE
MADD t4, a4, x1, y4
LD a4, AO1, 7 * SIZE
MADD t1, a5, x2, t1
LD a5, AO2, 4 * SIZE
MADD t2, a6, x2, t2
LD a6, AO2, 5 * SIZE
MADD t3, a7, x2, t3
LD a7, AO2, 6 * SIZE
MADD t4, a8, x2, t4
LD a8, AO2, 7 * SIZE
ST t1, YY, 0 * SIZE
MADD t1, a1, x1, y5
ST t2, YY, 1 * SIZE
MADD t2, a2, x1, y6
ST t3, YY, 2 * SIZE
MADD t3, a3, x1, y7
ST t4, YY, 3 * SIZE
MADD t4, a4, x1, y8
MADD t1, a5, x2, t1
addi.d AO1, AO1, 8 * SIZE
MADD t2, a6, x2, t2
addi.d AO2, AO2, 8 * SIZE
MADD t3, a7, x2, t3
addi.d YY, YY, 8 * SIZE
MADD t4, a8, x2, t4
/* YY has already been advanced, hence the negative store offsets. */
ST t1, YY, -4 * SIZE
ST t2, YY, -3 * SIZE
ST t3, YY, -2 * SIZE
ST t4, YY, -1 * SIZE
.align 3
/* Row remainder for the column pair: a block of 4 if bit 2 of M is set. */
.L15:
andi I, M, 4
bge $r0, I, .L16
LD a1, AO1, 0 * SIZE
LD y1, YY, 0 * SIZE
LD a2, AO1, 1 * SIZE
LD y2, YY, 1 * SIZE
LD a3, AO1, 2 * SIZE
LD y3, YY, 2 * SIZE
LD a4, AO1, 3 * SIZE
LD y4, YY, 3 * SIZE
LD a5, AO2, 0 * SIZE
MADD y1, a1, x1, y1
LD a6, AO2, 1 * SIZE
MADD y2, a2, x1, y2
LD a7, AO2, 2 * SIZE
MADD y3, a3, x1, y3
LD a8, AO2, 3 * SIZE
MADD y4, a4, x1, y4
MADD y1, a5, x2, y1
addi.d YY, YY, 4 * SIZE
MADD y2, a6, x2, y2
addi.d AO1, AO1, 4 * SIZE
MADD y3, a7, x2, y3
addi.d AO2, AO2, 4 * SIZE
MADD y4, a8, x2, y4
ST y1, YY, -4 * SIZE
ST y2, YY, -3 * SIZE
ST y3, YY, -2 * SIZE
ST y4, YY, -1 * SIZE
.align 3
/* A block of 2 rows if bit 1 of M is set. */
.L16:
andi I, M, 2
bge $r0, I, .L17
LD a1, AO1, 0 * SIZE
LD y1, YY, 0 * SIZE
LD a2, AO1, 1 * SIZE
LD y2, YY, 1 * SIZE
LD a5, AO2, 0 * SIZE
LD a6, AO2, 1 * SIZE
MADD y1, a1, x1, y1
MADD y2, a2, x1, y2
addi.d YY, YY, 2 * SIZE
MADD y1, a5, x2, y1
addi.d AO1, AO1, 2 * SIZE
MADD y2, a6, x2, y2
addi.d AO2, AO2, 2 * SIZE
ST y1, YY, -2 * SIZE
ST y2, YY, -1 * SIZE
.align 3
/* A final single row if bit 0 of M is set. */
.L17:
andi I, M, 1
bge $r0, I, .L19
LD y1, YY, 0 * SIZE
LD a1, AO1, 0 * SIZE
LD a5, AO2, 0 * SIZE
MADD y1, a1, x1, y1
MADD y1, a5, x2, y1
ST y1, YY, 0 * SIZE
.align 3
/* Next column pair. */
.L19:
addi.d J, J, -1
blt $r0, J, .L11
.align 3
/* Odd N: one last column, same structure as above with AO1 only. */
.L20:
andi J, N, 1
bge $r0, J, .L900
.align 3
.L21:
LD x1, X, 0 * SIZE
add.d X, X, INCX
move YY, YORIG
move AO1, A
srai.d I, M, 3
MUL x1, ALPHA, x1
bge $r0, I, .L25
LD a1, AO1, 0 * SIZE
LD y1, YY, 0 * SIZE
LD a2, AO1, 1 * SIZE
LD y2, YY, 1 * SIZE
LD a3, AO1, 2 * SIZE
LD y3, YY, 2 * SIZE
LD a4, AO1, 3 * SIZE
LD y4, YY, 3 * SIZE
LD y5, YY, 4 * SIZE
LD y6, YY, 5 * SIZE
LD y7, YY, 6 * SIZE
addi.d I, I, -1
LD y8, YY, 7 * SIZE
bge $r0, I, .L23
.align 3
/* Pipelined single-column body, 8 rows per iteration. */
.L22:
MADD t1, a1, x1, y1
LD a1, AO1, 4 * SIZE
MADD t2, a2, x1, y2
LD a2, AO1, 5 * SIZE
LD y1, YY, 8 * SIZE
LD y2, YY, 9 * SIZE
MADD t3, a3, x1, y3
LD a3, AO1, 6 * SIZE
MADD t4, a4, x1, y4
LD a4, AO1, 7 * SIZE
LD y3, YY, 10 * SIZE
LD y4, YY, 11 * SIZE
ST t1, YY, 0 * SIZE
ST t2, YY, 1 * SIZE
ST t3, YY, 2 * SIZE
ST t4, YY, 3 * SIZE
MADD t1, a1, x1, y5
LD a1, AO1, 8 * SIZE
MADD t2, a2, x1, y6
LD a2, AO1, 9 * SIZE
LD y5, YY, 12 * SIZE
LD y6, YY, 13 * SIZE
MADD t3, a3, x1, y7
LD a3, AO1, 10 * SIZE
MADD t4, a4, x1, y8
LD a4, AO1, 11 * SIZE
LD y7, YY, 14 * SIZE
LD y8, YY, 15 * SIZE
ST t1, YY, 4 * SIZE
ST t2, YY, 5 * SIZE
ST t3, YY, 6 * SIZE
ST t4, YY, 7 * SIZE
addi.d I, I, -1
addi.d YY, YY, 8 * SIZE
addi.d AO1, AO1, 8 * SIZE
blt $r0, I, .L22
.align 3
/* Drain the final 8-row group of the single column. */
.L23:
MADD t1, a1, x1, y1
LD a1, AO1, 4 * SIZE
MADD t2, a2, x1, y2
LD a2, AO1, 5 * SIZE
MADD t3, a3, x1, y3
LD a3, AO1, 6 * SIZE
MADD t4, a4, x1, y4
LD a4, AO1, 7 * SIZE
ST t1, YY, 0 * SIZE
MADD t1, a1, x1, y5
ST t2, YY, 1 * SIZE
MADD t2, a2, x1, y6
ST t3, YY, 2 * SIZE
MADD t3, a3, x1, y7
ST t4, YY, 3 * SIZE
MADD t4, a4, x1, y8
ST t1, YY, 4 * SIZE
ST t2, YY, 5 * SIZE
ST t3, YY, 6 * SIZE
ST t4, YY, 7 * SIZE
addi.d AO1, AO1, 8 * SIZE
addi.d YY, YY, 8 * SIZE
.align 3
/* Row remainders for the single column: 4, then 2, then 1. */
.L25:
andi I, M, 4
bge $r0, I, .L26
LD a1, AO1, 0 * SIZE
LD y1, YY, 0 * SIZE
LD a2, AO1, 1 * SIZE
LD y2, YY, 1 * SIZE
LD a3, AO1, 2 * SIZE
LD y3, YY, 2 * SIZE
LD a4, AO1, 3 * SIZE
LD y4, YY, 3 * SIZE
MADD y1, a1, x1, y1
MADD y2, a2, x1, y2
MADD y3, a3, x1, y3
addi.d YY, YY, 4 * SIZE
MADD y4, a4, x1, y4
addi.d AO1, AO1, 4 * SIZE
ST y1, YY, -4 * SIZE
ST y2, YY, -3 * SIZE
ST y3, YY, -2 * SIZE
ST y4, YY, -1 * SIZE
.align 3
.L26:
andi I, M, 2
bge $r0, I, .L27
LD a1, AO1, 0 * SIZE
LD y1, YY, 0 * SIZE
LD a2, AO1, 1 * SIZE
LD y2, YY, 1 * SIZE
MADD y1, a1, x1, y1
addi.d YY, YY, 2 * SIZE
MADD y2, a2, x1, y2
addi.d AO1, AO1, 2 * SIZE
ST y1, YY, -2 * SIZE
ST y2, YY, -1 * SIZE
.align 3
.L27:
andi I, M, 1
bge $r0, I, .L900
LD y1, YY, 0 * SIZE
LD a1, AO1, 0 * SIZE
MADD y1, a1, x1, y1
ST y1, YY, 0 * SIZE
.align 3
/* Write-back: if y was gathered into BUFFER, scatter the result back to Y
   with stride INCY.  YORIG is reused here as a scratch holding SIZE. */
.L900:
li YORIG, SIZE
srai.d I, M, 2
beq INCY, YORIG, .L999
move XX, BUFFER
bge $r0, I, .L905
.align 3
.L902:
LD a1, XX, 0 * SIZE
LD a2, XX, 1 * SIZE
LD a3, XX, 2 * SIZE
LD a4, XX, 3 * SIZE
ST a1, Y, 0 * SIZE
add.d Y, Y, INCY
ST a2, Y, 0 * SIZE
add.d Y, Y, INCY
ST a3, Y, 0 * SIZE
add.d Y, Y, INCY
ST a4, Y, 0 * SIZE
add.d Y, Y, INCY
addi.d I, I, -1
addi.d XX, XX, 4 * SIZE
blt $r0, I, .L902
.align 3
/* Scatter the remaining M & 3 elements. */
.L905:
andi I, M, 3
bge $r0, I, .L999
.align 3
.L906:
LD a1, XX, 0 * SIZE
addi.d XX, XX, 1 * SIZE
ST a1, Y, 0 * SIZE
addi.d I, I, -1
add.d Y, Y, INCY
blt $r0, I, .L906
.align 3
/* Epilogue: restore the saved registers, release the frame, return. */
.L999:
LDARG $r23, $sp, 0
LDARG $r24, $sp, 8
#ifndef __64BIT__
fld.d $f18, $sp, 16
fld.d $f19, $sp, 24
fld.d $f20, $sp, 32
#endif
#ifdef __64BIT__
addi.d $sp, $sp, 16
#else
addi.d $sp, $sp, 48
#endif
/* NOTE(review): $r17 is never written in this routine and $f22 is just the
   a1 staging register, so the values placed in $r4 / $f0 look like porting
   leftovers -- confirm that callers ignore the return value. */
move $r4, $r17
fmov.d $f0, $f22
jirl $r0, $r1, 0x0
EPILOGUE

436
kernel/loongarch64/gemv_t.S Normal file
View File

@@ -0,0 +1,436 @@
/***************************************************************************
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define ASSEMBLER
#include "common.h"
/*
 * GEMV (transposed) kernel: for every column j of A it computes the dot
 * product of that column (M elements, columns LDA elements apart) with x and
 * adds ALPHA times the result to the j-th element of y.  Columns are
 * processed two at a time (AO1 / AO2).
 *
 * The third integer argument register ($r6, dummy1) is unused as an input
 * and is reused to hold INCY, which is loaded from the stack together with
 * BUFFER.
 *
 * When INCX (after scaling by BASE_SHIFT) differs from SIZE, x is first
 * gathered into BUFFER so that the inner loops can read it contiguously.
 *
 *   XORIG   base of the contiguous x actually read (X or BUFFER)
 *   XX      walking pointer into x
 *   Y / YY  read pointer / write pointer into y, both stepping by INCY
 *   a1-a8   staged elements of A; x1-x8 staged elements of x
 *   y1-y4   accumulators: y1, y3 for column AO1 and y2, y4 for column AO2
 */
#define M $r4
#define N $r5
#define A $r7
#define LDA $r8
#define X $r9
#define INCX $r10
#define Y $r11
#define INCY $r6
#define BUFFER $r16
#define XORIG $r18
#define XX $r12
#define YY $r13
#define I $r14
#define J $r15
#define AO1 $r23
#define AO2 $r24
#define ALPHA $f0
#define a1 $f22
#define a2 $f8
#define a3 $f23
#define a4 $f9
#define a5 $f10
#define a6 $f11
#define a7 $f12
#define a8 $f13
#define y1 $f14
#define y2 $f15
#define y3 $f16
#define y4 $f17
#define x1 $f3
#define x2 $f1
#define x3 $f2
#define x4 $f4
#define x5 $f5
#define x6 $f6
#define x7 $f7
#define x8 $f18
PROLOGUE
/* Stack-passed arguments are read before the frame is allocated. */
LDARG INCY, $sp, 0
LDARG BUFFER, $sp, 8
#ifdef __64BIT__
addi.d $sp, $sp, -16
#else
addi.d $sp, $sp, -32
#endif
/* y1 starts at zero; y2-y4 are copied from it at the top of each pass. */
MTC y1, $r0
/* $r23 / $r24 (AO1 / AO2) are saved here and restored at .L999. */
SDARG $r23, $sp, 0
SDARG $r24, $sp, 8
slli.d LDA, LDA, BASE_SHIFT
#ifndef __64BIT__
/* The larger frame also preserves $f18 (x8). */
fst.d $f18, $sp, 16
#endif
slli.d INCX, INCX, BASE_SHIFT
/* Nothing to do when M <= 0 or N <= 0. */
bge $r0, M, .L999
slli.d INCY, INCY, BASE_SHIFT
bge $r0, N, .L999
li I, SIZE
move XORIG, X
/* x already contiguous: read it in place. */
beq INCX, I, .L10
/* Otherwise gather x into BUFFER, four elements per iteration
   (YY is used as the destination pointer during the copy). */
srai.d I, M, 2
move XORIG, BUFFER
move YY, BUFFER
bge $r0, I, .L05
.align 3
.L02:
LD a1, X, 0 * SIZE
add.d X, X, INCX
LD a2, X, 0 * SIZE
add.d X, X, INCX
LD a3, X, 0 * SIZE
add.d X, X, INCX
LD a4, X, 0 * SIZE
add.d X, X, INCX
ST a1, YY, 0 * SIZE
ST a2, YY, 1 * SIZE
ST a3, YY, 2 * SIZE
ST a4, YY, 3 * SIZE
addi.d I, I, -1
addi.d YY, YY, 4 * SIZE
blt $r0, I, .L02
.align 3
/* Gather the remaining M & 3 elements. */
.L05:
andi I, M, 3
bge $r0, I, .L10
.align 3
.L06:
LD a1, X, 0 * SIZE
add.d X, X, INCX
ST a1, YY, 0 * SIZE
addi.d I, I, -1
addi.d YY, YY, 1 * SIZE
blt $r0, I, .L06
.align 3
/* Column-pair loop: J = N / 2; YY becomes the y write pointer. */
.L10:
srai.d J, N, 1
move YY, Y
bge $r0, J, .L20
.align 3
.L11:
/* Point AO1/AO2 at the two columns, advance A past both, and clear the
   other three accumulators from y1 (which is zero here). */
move AO1, A
MOV y2, y1
add.d AO2, A, LDA
MOV y3, y1
add.d A, AO2, LDA
MOV y4, y1
srai.d I, M, 3
move XX, XORIG
bge $r0, I, .L15
/* Preload rows 0..3 of both columns and x[0..7]. */
LD a1, AO1, 0 * SIZE
LD x1, XX, 0 * SIZE
LD a2, AO2, 0 * SIZE
LD x2, XX, 1 * SIZE
LD a3, AO1, 1 * SIZE
LD x3, XX, 2 * SIZE
LD a4, AO2, 1 * SIZE
LD x4, XX, 3 * SIZE
LD a5, AO1, 2 * SIZE
LD x5, XX, 4 * SIZE
LD a6, AO2, 2 * SIZE
LD x6, XX, 5 * SIZE
LD a7, AO1, 3 * SIZE
LD x7, XX, 6 * SIZE
LD a8, AO2, 3 * SIZE
addi.d I, I, -1
LD x8, XX, 7 * SIZE
bge $r0, I, .L13
.align 3
/* Pipelined body: 8 rows per iteration.  Even rows of the group feed
   y1 (AO1) / y2 (AO2), odd rows feed y3 (AO1) / y4 (AO2); every
   multiply-add is followed by the load that refills its operand. */
.L12:
MADD y1, a1, x1, y1
LD a1, AO1, 4 * SIZE
MADD y2, a2, x1, y2
LD a2, AO2, 4 * SIZE
MADD y3, a3, x2, y3
LD a3, AO1, 5 * SIZE
MADD y4, a4, x2, y4
LD a4, AO2, 5 * SIZE
LD x1, XX, 8 * SIZE
LD x2, XX, 9 * SIZE
MADD y1, a5, x3, y1
LD a5, AO1, 6 * SIZE
MADD y2, a6, x3, y2
LD a6, AO2, 6 * SIZE
MADD y3, a7, x4, y3
LD a7, AO1, 7 * SIZE
MADD y4, a8, x4, y4
LD a8, AO2, 7 * SIZE
LD x3, XX, 10 * SIZE
LD x4, XX, 11 * SIZE
MADD y1, a1, x5, y1
LD a1, AO1, 8 * SIZE
MADD y2, a2, x5, y2
LD a2, AO2, 8 * SIZE
MADD y3, a3, x6, y3
LD a3, AO1, 9 * SIZE
MADD y4, a4, x6, y4
LD a4, AO2, 9 * SIZE
LD x5, XX, 12 * SIZE
LD x6, XX, 13 * SIZE
MADD y1, a5, x7, y1
LD a5, AO1, 10 * SIZE
MADD y2, a6, x7, y2
LD a6, AO2, 10 * SIZE
MADD y3, a7, x8, y3
LD a7, AO1, 11 * SIZE
MADD y4, a8, x8, y4
LD a8, AO2, 11 * SIZE
LD x7, XX, 14 * SIZE
LD x8, XX, 15 * SIZE
addi.d I, I, -1
addi.d XX, XX, 8 * SIZE
addi.d AO1, AO1, 8 * SIZE
addi.d AO2, AO2, 8 * SIZE
blt $r0, I, .L12
.align 3
/* Drain the final 8-row group; only rows 4..7 of A are still loaded
   (x5-x8 are already staged). */
.L13:
MADD y1, a1, x1, y1
LD a1, AO1, 4 * SIZE
MADD y2, a2, x1, y2
LD a2, AO2, 4 * SIZE
MADD y3, a3, x2, y3
LD a3, AO1, 5 * SIZE
MADD y4, a4, x2, y4
LD a4, AO2, 5 * SIZE
MADD y1, a5, x3, y1
LD a5, AO1, 6 * SIZE
MADD y2, a6, x3, y2
LD a6, AO2, 6 * SIZE
MADD y3, a7, x4, y3
LD a7, AO1, 7 * SIZE
MADD y4, a8, x4, y4
LD a8, AO2, 7 * SIZE
MADD y1, a1, x5, y1
MADD y2, a2, x5, y2
MADD y3, a3, x6, y3
MADD y4, a4, x6, y4
MADD y1, a5, x7, y1
addi.d XX, XX, 8 * SIZE
MADD y2, a6, x7, y2
addi.d AO1, AO1, 8 * SIZE
MADD y3, a7, x8, y3
addi.d AO2, AO2, 8 * SIZE
MADD y4, a8, x8, y4
.align 3
/* Row remainder: a block of 4 if bit 2 of M is set. */
.L15:
andi I, M, 4
bge $r0, I, .L17
LD a1, AO1, 0 * SIZE
LD x1, XX, 0 * SIZE
LD a2, AO2, 0 * SIZE
LD a3, AO1, 1 * SIZE
LD x2, XX, 1 * SIZE
LD a4, AO2, 1 * SIZE
LD a5, AO1, 2 * SIZE
LD x3, XX, 2 * SIZE
MADD y1, a1, x1, y1
LD a6, AO2, 2 * SIZE
MADD y2, a2, x1, y2
LD a7, AO1, 3 * SIZE
MADD y3, a3, x2, y3
LD x4, XX, 3 * SIZE
MADD y4, a4, x2, y4
LD a8, AO2, 3 * SIZE
MADD y1, a5, x3, y1
MADD y2, a6, x3, y2
addi.d XX, XX, 4 * SIZE
MADD y3, a7, x4, y3
addi.d AO1, AO1, 4 * SIZE
MADD y4, a8, x4, y4
addi.d AO2, AO2, 4 * SIZE
.align 3
/* Fold the paired accumulators (y1 += y3, y2 += y4), then handle the last
   M & 3 rows one at a time. */
.L17:
andi I, M, 3
ADD y1, y1, y3
ADD y2, y2, y4
bge $r0, I, .L19
.align 3
.L18:
LD x1, XX, 0 * SIZE
LD a1, AO1, 0 * SIZE
LD a2, AO2, 0 * SIZE
addi.d I, I, -1
addi.d XX, XX, 1 * SIZE
addi.d AO1, AO1, 1 * SIZE
addi.d AO2, AO2, 1 * SIZE
MADD y1, a1, x1, y1
MADD y2, a2, x1, y2
blt $r0, I, .L18
.align 3
/* Update two elements of y: read through Y, add ALPHA * accumulator, write
   through YY.  y1 is cleared again for the next pass. */
.L19:
LD a1, Y, 0 * SIZE
add.d Y, Y, INCY
LD a2, Y, 0 * SIZE
add.d Y, Y, INCY
MADD a1, y1, ALPHA, a1
addi.d J, J, -1
MADD a2, y2, ALPHA, a2
MTC y1, $r0
ST a1, YY, 0 * SIZE
add.d YY, YY, INCY
ST a2, YY, 0 * SIZE
add.d YY, YY, INCY
blt $r0, J, .L11
.align 3
/* Odd N: one last column, using y1 / y3 as the two accumulators. */
.L20:
andi J, N, 1
MOV y3, y1
move AO1, A
bge $r0, J, .L999
srai.d I, M, 3
move XX, XORIG
bge $r0, I, .L25
LD a1, AO1, 0 * SIZE
LD x1, XX, 0 * SIZE
LD a3, AO1, 1 * SIZE
LD x2, XX, 1 * SIZE
LD a5, AO1, 2 * SIZE
LD x3, XX, 2 * SIZE
LD a7, AO1, 3 * SIZE
LD x4, XX, 3 * SIZE
LD x5, XX, 4 * SIZE
LD x6, XX, 5 * SIZE
LD x7, XX, 6 * SIZE
addi.d I, I, -1
LD x8, XX, 7 * SIZE
bge $r0, I, .L23
.align 3
/* Pipelined single-column body, 8 rows per iteration. */
.L22:
MADD y1, a1, x1, y1
LD a1, AO1, 4 * SIZE
MADD y3, a3, x2, y3
LD a3, AO1, 5 * SIZE
LD x1, XX, 8 * SIZE
LD x2, XX, 9 * SIZE
MADD y1, a5, x3, y1
LD a5, AO1, 6 * SIZE
MADD y3, a7, x4, y3
LD a7, AO1, 7 * SIZE
LD x3, XX, 10 * SIZE
LD x4, XX, 11 * SIZE
MADD y1, a1, x5, y1
LD a1, AO1, 8 * SIZE
MADD y3, a3, x6, y3
LD a3, AO1, 9 * SIZE
LD x5, XX, 12 * SIZE
LD x6, XX, 13 * SIZE
MADD y1, a5, x7, y1
LD a5, AO1, 10 * SIZE
MADD y3, a7, x8, y3
LD a7, AO1, 11 * SIZE
LD x7, XX, 14 * SIZE
LD x8, XX, 15 * SIZE
addi.d I, I, -1
addi.d XX, XX, 8 * SIZE
addi.d AO1, AO1, 8 * SIZE
blt $r0, I, .L22
.align 3
/* Drain the final 8-row group of the single column. */
.L23:
MADD y1, a1, x1, y1
LD a1, AO1, 4 * SIZE
MADD y3, a3, x2, y3
LD a3, AO1, 5 * SIZE
MADD y1, a5, x3, y1
LD a5, AO1, 6 * SIZE
MADD y3, a7, x4, y3
LD a7, AO1, 7 * SIZE
MADD y1, a1, x5, y1
MADD y3, a3, x6, y3
MADD y1, a5, x7, y1
MADD y3, a7, x8, y3
addi.d XX, XX, 8 * SIZE
addi.d AO1, AO1, 8 * SIZE
.align 3
/* Row remainder: a block of 4 if bit 2 of M is set. */
.L25:
andi I, M, 4
bge $r0, I, .L27
LD a1, AO1, 0 * SIZE
LD x1, XX, 0 * SIZE
LD a3, AO1, 1 * SIZE
LD x2, XX, 1 * SIZE
LD a5, AO1, 2 * SIZE
LD x3, XX, 2 * SIZE
MADD y1, a1, x1, y1
LD a7, AO1, 3 * SIZE
MADD y3, a3, x2, y3
LD x4, XX, 3 * SIZE
MADD y1, a5, x3, y1
addi.d XX, XX, 4 * SIZE
MADD y3, a7, x4, y3
addi.d AO1, AO1, 4 * SIZE
.align 3
/* Fold y3 into y1, then handle the last M & 3 rows one at a time. */
.L27:
andi I, M, 3
ADD y1, y1, y3
bge $r0, I, .L29
.align 3
.L28:
LD x1, XX, 0 * SIZE
LD a1, AO1, 0 * SIZE
addi.d I, I, -1
addi.d XX, XX, 1 * SIZE
addi.d AO1, AO1, 1 * SIZE
MADD y1, a1, x1, y1
blt $r0, I, .L28
.align 3
/* Update the last element of y. */
.L29:
LD a1, Y, 0 * SIZE
add.d Y, Y, INCY
MADD a1, y1, ALPHA, a1
ST a1, YY, 0 * SIZE
add.d YY, YY, INCY
.align 3
/* Epilogue: restore the saved registers, release the frame, return. */
.L999:
LDARG $r23, $sp, 0
LDARG $r24, $sp, 8
#ifndef __64BIT__
fld.d $f18, $sp, 16
#endif
#ifdef __64BIT__
addi.d $sp, $sp, 16
#else
addi.d $sp, $sp, 32
#endif
/* NOTE(review): $r17 is never written in this routine and $f22 is just the
   a1 staging register, so the values placed in $r4 / $f0 look like porting
   leftovers -- confirm that callers ignore the return value. */
move $r4, $r17
fmov.d $f0, $f22
jirl $r0, $r1, 0x0
EPILOGUE

233
kernel/loongarch64/iamax.S Normal file
View File

@@ -0,0 +1,233 @@
/***************************************************************************
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define ASSEMBLER
#include "common.h"
/*
 * IAMAX kernel: scans N elements of x (stride INCX) and returns in $r4 the
 * 1-based position of an element with the largest absolute value.
 * Returns 0 when N <= 0 or INCX <= 0.
 *
 *   I       loop counter
 *   TEMP    1-based position of the next element to be examined
 *   a1-a8   staged elements; t1-t4 their absolute values
 *   s1-s4   running maxima of four independent lanes
 *   x1-x4   position recorded for each lane; x1 doubles as the result
 *
 * In the unrolled loop every lane records TEMP, i.e. the position of the
 * first element of its group of four; the lane offsets (+1, +2, +3) are
 * added to x2-x4 once, after the loop.
 * CMOVT / MOVT come from common.h; they are used here as "replace the
 * destination with the last source operand when the flag is set".
 */
#define N $r4
#define X $r5
#define INCX $r6
#define I $r18
#define TEMP $r7
#define a1 $f10
#define a2 $f11
#define a3 $f12
#define a4 $f13
#define a5 $f14
#define a6 $f15
#define a7 $f16
#define a8 $f17
#define t1 $f0
#define t2 $f1
#define t3 $f2
#define t4 $f3
#define s1 $f22
#define s2 $f8
#define s3 $f23
#define s4 $f9
#define x1 $r17
#define x2 $r8
#define x3 $r9
#define x4 $r10
PROLOGUE
#ifdef F_INTERFACE
/* Arguments arrive by reference here: replace each pointer by the value it
   points to.  Operands use the same "rd, rj, imm" order as every other load
   in this file (assumes LDINT expands to an ld.w/ld.d-style mnemonic). */
LDINT N, N, 0
LDINT INCX, INCX, 0
#endif
/* Result 0 for N <= 0 or a non-positive stride. */
li x1, 0
bge $r0, N, .L999
slli.d INCX, INCX, BASE_SHIFT
bge $r0, INCX, .L999
/* Seed: the first element is the current maximum at position 1.  From here
   on N counts the elements that remain. */
LD a1, X, 0 * SIZE
addi.d N, N, -1
li x1, 1
bge $r0, N, .L999
FABS s1, a1
add.d X, X, INCX
FABS s2, a1
li x2, 1
FABS s3, a1
srai.d I, N, 3
FABS s4, a1
li x3, 1
li TEMP, 2
li x4, 1
bge $r0, I, .L15
/* Preload the first group of 8. */
LD a1, X, 0 * SIZE
add.d X, X, INCX
LD a2, X, 0 * SIZE
add.d X, X, INCX
LD a3, X, 0 * SIZE
add.d X, X, INCX
LD a4, X, 0 * SIZE
add.d X, X, INCX
LD a5, X, 0 * SIZE
add.d X, X, INCX
LD a6, X, 0 * SIZE
add.d X, X, INCX
LD a7, X, 0 * SIZE
add.d X, X, INCX
LD a8, X, 0 * SIZE
addi.d I, I, -1
add.d X, X, INCX
bge $r0, I, .L13
.align 3
/* Pipelined body: 8 elements per iteration as two groups of 4.  A lane is
   updated only when the new absolute value is strictly greater than its
   current maximum; TEMP advances by 4 after each group. */
.L12:
FABS t1, a1
LD a1, X, 0 * SIZE
FABS t2, a2
add.d X, X, INCX
FABS t3, a3
LD a2, X, 0 * SIZE
FABS t4, a4
add.d X, X, INCX
CMPLT $fcc0, s1, t1
LD a3, X, 0 * SIZE
CMPLT $fcc1, s2, t2
add.d X, X, INCX
CMPLT $fcc2, s3, t3
LD a4, X, 0 * SIZE
CMPLT $fcc3, s4, t4
add.d X, X, INCX
CMOVT s1, s1, t1, $fcc0
MOVT(x1, TEMP, $fcc0)
CMOVT s2, s2, t2, $fcc1
MOVT(x2, TEMP, $fcc1)
CMOVT s3, s3, t3, $fcc2
MOVT(x3, TEMP, $fcc2)
CMOVT s4, s4, t4, $fcc3
MOVT(x4, TEMP, $fcc3)
addi.d TEMP, TEMP, 4
addi.d I, I, -1
FABS t1, a5
LD a5, X, 0 * SIZE
FABS t2, a6
add.d X, X, INCX
FABS t3, a7
LD a6, X, 0 * SIZE
FABS t4, a8
add.d X, X, INCX
CMPLT $fcc0, s1, t1
LD a7, X, 0 * SIZE
CMPLT $fcc1, s2, t2
add.d X, X, INCX
CMPLT $fcc2, s3, t3
LD a8, X, 0 * SIZE
CMPLT $fcc3, s4, t4
add.d X, X, INCX
CMOVT s1, s1, t1, $fcc0
MOVT(x1, TEMP, $fcc0)
CMOVT s2, s2, t2, $fcc1
MOVT(x2, TEMP, $fcc1)
CMOVT s3, s3, t3, $fcc2
MOVT(x3, TEMP, $fcc2)
CMOVT s4, s4, t4, $fcc3
MOVT(x4, TEMP, $fcc3)
addi.d TEMP, TEMP, 4
blt $r0, I, .L12
.align 3
/* Drain the last staged group of 8 without loading anything further. */
.L13:
FABS t1, a1
FABS t2, a2
FABS t3, a3
FABS t4, a4
CMPLT $fcc0, s1, t1
CMPLT $fcc1, s2, t2
CMPLT $fcc2, s3, t3
CMPLT $fcc3, s4, t4
CMOVT s1, s1, t1, $fcc0
MOVT(x1, TEMP, $fcc0)
CMOVT s2, s2, t2, $fcc1
MOVT(x2, TEMP, $fcc1)
CMOVT s3, s3, t3, $fcc2
MOVT(x3, TEMP, $fcc2)
CMOVT s4, s4, t4, $fcc3
MOVT(x4, TEMP, $fcc3)
FABS t1, a5
addi.d TEMP, TEMP, 4
FABS t2, a6
FABS t3, a7
FABS t4, a8
CMPLT $fcc0, s1, t1
CMPLT $fcc1, s2, t2
CMPLT $fcc2, s3, t3
CMPLT $fcc3, s4, t4
CMOVT s1, s1, t1, $fcc0
MOVT(x1, TEMP, $fcc0)
CMOVT s2, s2, t2, $fcc1
MOVT(x2, TEMP, $fcc1)
CMOVT s3, s3, t3, $fcc2
MOVT(x3, TEMP, $fcc2)
CMOVT s4, s4, t4, $fcc3
MOVT(x4, TEMP, $fcc3)
addi.d TEMP, TEMP, 4
/* Turn the recorded group positions of lanes 2-4 into element positions. */
addi.d x2, x2, 1
addi.d x3, x3, 2
addi.d x4, x4, 3
.align 3
/* Tail: the remaining N & 7 elements, tracked in lane 1 with the exact
   position TEMP. */
.L15:
andi I, N, 7
bge $r0, I, .L998
.align 3
.L16:
LD a1, X, 0 * SIZE
add.d X, X, INCX
FABS t1, a1
addi.d I, I, -1
CMPLT $fcc0, s1, t1
CMOVT s1, s1, t1, $fcc0
MOVT(x1, TEMP, $fcc0)
addi.d TEMP, TEMP, 1
blt $r0, I, .L16
.align 3
/* Merge the four lanes pairwise; a lane replaces another only when its
   maximum is strictly greater. */
.L998:
CMPLT $fcc0, s1, s2
CMPLT $fcc1, s3, s4
CMOVT s1, s1, s2, $fcc0
MOVT(x1, x2, $fcc0)
CMOVT s3, s3, s4, $fcc1
MOVT(x3, x4, $fcc1)
CMPLT $fcc0, s1, s3
CMOVT s1, s1, s3, $fcc0
MOVT(x1, x3, $fcc0)
.align 3
/* Return the recorded position in $r4 (s1 is also copied to $f0). */
.L999:
move $r4, $r17
fmov.d $f0, $f22
jirl $r0, $r1, 0x0
EPILOGUE

233
kernel/loongarch64/iamin.S Normal file
View File

@@ -0,0 +1,233 @@
/***************************************************************************
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define ASSEMBLER
#include "common.h"
/*
 * i?amin kernel: returns the 1-based index of the element of X with the
 * smallest absolute value, or 0 when N <= 0 or INCX <= 0.
 *
 * The element width and the exact instructions behind LD / FABS / CMPLT /
 * CMOVT / MOVT come from common.h (not shown here); the comments below assume
 * "CMPLT cc, a, b" sets cc when a < b, that CMOVT selects its third operand
 * when the flag is set, and that MOVT copies its second operand into the
 * first when the flag is set -- TODO confirm against common.h.
 *
 * Four running minima (s1..s4) are kept, one per lane of the unrolled loop,
 * each with the index at which it was found (x1..x4).  The lanes are merged
 * at .L998; ties are resolved in favour of the smaller index so that the
 * first occurrence of the minimum is reported.
 */

/* Integer registers */
#define N	$r4	/* element count; also receives the returned index */
#define X	$r5	/* pointer to the current element */
#define INCX	$r6	/* stride, converted to bytes with BASE_SHIFT */
#define I	$r18	/* loop counter */
#define TEMP	$r7	/* 1-based index of the first element of the current group */

/* Values loaded from X */
#define a1	$f10
#define a2	$f11
#define a3	$f12
#define a4	$f13
#define a5	$f14
#define a6	$f15
#define a7	$f16
#define a8	$f17

/* Absolute values of the loaded elements */
#define t1	$f0
#define t2	$f1
#define t3	$f2
#define t4	$f3

/* Running minimum of each lane */
#define s1	$f22
#define s2	$f8
#define s3	$f23
#define s4	$f9

/* Index belonging to each running minimum */
#define x1	$r17
#define x2	$r8
#define x3	$r9
#define x4	$r10

	PROLOGUE

#ifdef F_INTERFACE
	/* Arguments are passed by reference: fetch the values.  Uses the same
	   three-operand load form as every other load in this file. */
	LDINT	N, N, 0
	LDINT	INCX, INCX, 0
#endif

	li	x1, 0				/* result for N <= 0 or INCX <= 0 */
	bge	$r0, N, .L999

	slli.d	INCX, INCX, BASE_SHIFT		/* stride in bytes */
	bge	$r0, INCX, .L999

	LD	a1, X, 0 * SIZE			/* first element */
	addi.d	N, N, -1			/* N = elements still to inspect */

	li	x1, 1				/* a single element: index 1 */
	bge	$r0, N, .L999

	/* Seed all four lanes with |x[0]| at index 1. */
	FABS	s1, a1
	add.d	X, X, INCX
	FABS	s2, a1
	li	x2, 1
	FABS	s3, a1
	srai.d	I, N, 3				/* number of 8-element groups */
	FABS	s4, a1
	li	x3, 1
	li	TEMP, 2				/* next element has index 2 */
	li	x4, 1
	bge	$r0, I, .L15

	/* Preload the first group of eight elements. */
	LD	a1, X, 0 * SIZE
	add.d	X, X, INCX
	LD	a2, X, 0 * SIZE
	add.d	X, X, INCX
	LD	a3, X, 0 * SIZE
	add.d	X, X, INCX
	LD	a4, X, 0 * SIZE
	add.d	X, X, INCX
	LD	a5, X, 0 * SIZE
	add.d	X, X, INCX
	LD	a6, X, 0 * SIZE
	add.d	X, X, INCX
	LD	a7, X, 0 * SIZE
	add.d	X, X, INCX
	LD	a8, X, 0 * SIZE
	addi.d	I, I, -1
	add.d	X, X, INCX
	bge	$r0, I, .L13
	.align 3

	/* Main loop: process the group already in a1..a8 while loading the
	   next one.  Lane k records TEMP (the index of the first element of
	   its half-group); the per-lane offset is added once after .L13. */
.L12:
	FABS	t1, a1
	LD	a1, X, 0 * SIZE
	FABS	t2, a2
	add.d	X, X, INCX
	FABS	t3, a3
	LD	a2, X, 0 * SIZE
	FABS	t4, a4
	add.d	X, X, INCX
	CMPLT	$fcc0, t1, s1
	LD	a3, X, 0 * SIZE
	CMPLT	$fcc1, t2, s2
	add.d	X, X, INCX
	CMPLT	$fcc2, t3, s3
	LD	a4, X, 0 * SIZE
	CMPLT	$fcc3, t4, s4
	add.d	X, X, INCX
	CMOVT	s1, s1, t1, $fcc0
	MOVT(x1, TEMP, $fcc0)
	CMOVT	s2, s2, t2, $fcc1
	MOVT(x2, TEMP, $fcc1)
	CMOVT	s3, s3, t3, $fcc2
	MOVT(x3, TEMP, $fcc2)
	CMOVT	s4, s4, t4, $fcc3
	MOVT(x4, TEMP, $fcc3)
	addi.d	TEMP, TEMP, 4
	addi.d	I, I, -1
	FABS	t1, a5
	LD	a5, X, 0 * SIZE
	FABS	t2, a6
	add.d	X, X, INCX
	FABS	t3, a7
	LD	a6, X, 0 * SIZE
	FABS	t4, a8
	add.d	X, X, INCX
	CMPLT	$fcc0, t1, s1
	LD	a7, X, 0 * SIZE
	CMPLT	$fcc1, t2, s2
	add.d	X, X, INCX
	CMPLT	$fcc2, t3, s3
	LD	a8, X, 0 * SIZE
	CMPLT	$fcc3, t4, s4
	add.d	X, X, INCX
	CMOVT	s1, s1, t1, $fcc0
	MOVT(x1, TEMP, $fcc0)
	CMOVT	s2, s2, t2, $fcc1
	MOVT(x2, TEMP, $fcc1)
	CMOVT	s3, s3, t3, $fcc2
	MOVT(x3, TEMP, $fcc2)
	CMOVT	s4, s4, t4, $fcc3
	MOVT(x4, TEMP, $fcc3)
	addi.d	TEMP, TEMP, 4
	blt	$r0, I, .L12
	.align 3

	/* Drain: process the last preloaded group without loading more. */
.L13:
	FABS	t1, a1
	FABS	t2, a2
	FABS	t3, a3
	FABS	t4, a4
	CMPLT	$fcc0, t1, s1
	CMPLT	$fcc1, t2, s2
	CMPLT	$fcc2, t3, s3
	CMPLT	$fcc3, t4, s4
	CMOVT	s1, s1, t1, $fcc0
	MOVT(x1, TEMP, $fcc0)
	CMOVT	s2, s2, t2, $fcc1
	MOVT(x2, TEMP, $fcc1)
	CMOVT	s3, s3, t3, $fcc2
	MOVT(x3, TEMP, $fcc2)
	CMOVT	s4, s4, t4, $fcc3
	MOVT(x4, TEMP, $fcc3)
	FABS	t1, a5
	addi.d	TEMP, TEMP, 4
	FABS	t2, a6
	FABS	t3, a7
	FABS	t4, a8
	CMPLT	$fcc0, t1, s1
	CMPLT	$fcc1, t2, s2
	CMPLT	$fcc2, t3, s3
	CMPLT	$fcc3, t4, s4
	CMOVT	s1, s1, t1, $fcc0
	MOVT(x1, TEMP, $fcc0)
	CMOVT	s2, s2, t2, $fcc1
	MOVT(x2, TEMP, $fcc1)
	CMOVT	s3, s3, t3, $fcc2
	MOVT(x3, TEMP, $fcc2)
	CMOVT	s4, s4, t4, $fcc3
	MOVT(x4, TEMP, $fcc3)
	addi.d	TEMP, TEMP, 4
	/* Turn the recorded group indices into element indices.  A lane that
	   was never updated still holds |x[0]| and ends up with an index
	   larger than 1 here; it can only tie with lane 1 while lane 1 still
	   carries index 1, so the merge below never selects that index. */
	addi.d	x2, x2, 1
	addi.d	x3, x3, 2
	addi.d	x4, x4, 3
	.align 3

.L15:
	andi	I, N, 7				/* leftover elements */
	bge	$r0, I, .L998
	.align 3

	/* Remainder loop: one element per iteration, tracked in lane 1. */
.L16:
	LD	a1, X, 0 * SIZE
	add.d	X, X, INCX
	FABS	t1, a1
	addi.d	I, I, -1
	CMPLT	$fcc0, t1, s1
	CMOVT	s1, s1, t1, $fcc0
	MOVT(x1, TEMP, $fcc0)
	addi.d	TEMP, TEMP, 1
	blt	$r0, I, .L16
	.align 3

	/* Merge the lanes.  A lane replaces another when its value is
	   strictly smaller, or when the values compare neither smaller nor
	   larger and its index is lower (first occurrence wins). */
.L998:
	CMPLT	$fcc0, s2, s1
	bcnez	$fcc0, .L998_take2		/* lane 2 strictly smaller */
	CMPLT	$fcc0, s1, s2
	bcnez	$fcc0, .L998_merge34		/* lane 1 strictly smaller */
	bge	x2, x1, .L998_merge34		/* tie: keep the lower index */
.L998_take2:
	fmov.d	s1, s2
	move	x1, x2
.L998_merge34:
	CMPLT	$fcc0, s4, s3
	bcnez	$fcc0, .L998_take4		/* lane 4 strictly smaller */
	CMPLT	$fcc0, s3, s4
	bcnez	$fcc0, .L998_final		/* lane 3 strictly smaller */
	bge	x4, x3, .L998_final		/* tie: keep the lower index */
.L998_take4:
	fmov.d	s3, s4
	move	x3, x4
.L998_final:
	CMPLT	$fcc0, s3, s1
	bcnez	$fcc0, .L998_take3		/* lanes 3/4 strictly smaller */
	CMPLT	$fcc0, s1, s3
	bcnez	$fcc0, .L999			/* lanes 1/2 strictly smaller */
	bge	x3, x1, .L999			/* tie: keep the lower index */
.L998_take3:
	fmov.d	s1, s3
	move	x1, x3
	.align 3

.L999:
	move	$r4, $r17			/* return the index held in x1 */
	fmov.d	$f0, $f22			/* copy s1 into $f0 as well */
	jirl	$r0, $r1, 0x0
	EPILOGUE

217
kernel/loongarch64/izamax.S Normal file
View File

@@ -0,0 +1,217 @@
/***************************************************************************
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define ASSEMBLER
#include "common.h"
/*
 * i?amax kernel for complex vectors: returns the 1-based index of the
 * element of X with the largest |re| + |im|, or 0 when N <= 0 or INCX <= 0.
 *
 * The element width and the exact instructions behind LD / FABS / ADD /
 * CMPLT / CMOVT / MOVT come from common.h (not shown here); the comments
 * below assume "CMPLT cc, a, b" sets cc when a < b, that CMOVT selects its
 * third operand when the flag is set, and that MOVT copies its second
 * operand into the first when the flag is set -- TODO confirm against
 * common.h.
 *
 * Four running maxima (s1..s4) are kept, one per lane of the unrolled loop,
 * each with the index at which it was found (x1..x4).  The lanes are merged
 * at .L998; ties are resolved in favour of the smaller index so that the
 * first occurrence of the maximum is reported.
 */

/* Integer registers */
#define N	$r4	/* element count; also receives the returned index */
#define X	$r5	/* pointer to the current complex element */
#define INCX	$r6	/* stride, converted to bytes with ZBASE_SHIFT */
#define I	$r18	/* loop counter */
#define TEMP	$r7	/* 1-based index of the first element of the current group */

/* Values loaded from X: (a1,a2), (a3,a4), (a5,a6), (a7,a8) are the two
   components of four consecutive elements */
#define a1	$f10
#define a2	$f11
#define a3	$f12
#define a4	$f13
#define a5	$f14
#define a6	$f15
#define a7	$f16
#define a8	$f17

/* Absolute values of the components; t1, t3, t5, t7 then hold the sums */
#define t1	$f0
#define t2	$f1
#define t3	$f2
#define t4	$f3
#define t5	$f4
#define t6	$f5
#define t7	$f6
#define t8	$f7

/* Running maximum of each lane */
#define s1	$f22
#define s2	$f8
#define s3	$f23
#define s4	$f9

/* Index belonging to each running maximum */
#define x1	$r17
#define x2	$r8
#define x3	$r9
#define x4	$r10

	PROLOGUE

#ifdef F_INTERFACE
	/* Arguments are passed by reference: fetch the values.  Uses the same
	   three-operand load form as every other load in this file. */
	LDINT	N, N, 0
	LDINT	INCX, INCX, 0
#endif

	li	x1, 0				/* result for N <= 0 or INCX <= 0 */
	bge	$r0, N, .L999

	slli.d	INCX, INCX, ZBASE_SHIFT		/* stride in bytes */
	bge	$r0, INCX, .L999

	/* Seed all four lanes with |re| + |im| of the first element. */
	LD	a1, X, 0 * SIZE
	LD	a2, X, 1 * SIZE
	FABS	t1, a1
	FABS	t2, a2
	ADD	s1, t1, t2
	ADD	s2, t1, t2
	ADD	s3, t1, t2
	ADD	s4, t1, t2
	addi.d	N, N, -1			/* N = elements still to inspect */

	li	x1, 1				/* a single element: index 1 */
	bge	$r0, N, .L999

	add.d	X, X, INCX
	li	x2, 1
	srai.d	I, N, 2				/* number of 4-element groups */
	li	x3, 1
	li	TEMP, 2				/* next element has index 2 */
	li	x4, 1
	bge	$r0, I, .L15

	/* Preload the first group of four complex elements. */
	LD	a1, X, 0 * SIZE
	LD	a2, X, 1 * SIZE
	add.d	X, X, INCX
	LD	a3, X, 0 * SIZE
	LD	a4, X, 1 * SIZE
	add.d	X, X, INCX
	LD	a5, X, 0 * SIZE
	LD	a6, X, 1 * SIZE
	add.d	X, X, INCX
	LD	a7, X, 0 * SIZE
	LD	a8, X, 1 * SIZE
	addi.d	I, I, -1
	add.d	X, X, INCX
	bge	$r0, I, .L13
	.align 3

	/* Main loop: process the group already in a1..a8 while loading the
	   next one.  Every lane records TEMP (the index of the first element
	   of the group); the per-lane offset is added once after .L13. */
.L12:
	FABS	t1, a1
	LD	a1, X, 0 * SIZE
	FABS	t2, a2
	LD	a2, X, 1 * SIZE
	FABS	t3, a3
	add.d	X, X, INCX
	FABS	t4, a4
	FABS	t5, a5
	LD	a3, X, 0 * SIZE
	FABS	t6, a6
	LD	a4, X, 1 * SIZE
	FABS	t7, a7
	add.d	X, X, INCX
	FABS	t8, a8
	ADD	t1, t1, t2
	LD	a5, X, 0 * SIZE
	ADD	t3, t3, t4
	LD	a6, X, 1 * SIZE
	ADD	t5, t5, t6
	add.d	X, X, INCX
	ADD	t7, t7, t8
	CMPLT	$fcc0, s1, t1
	LD	a7, X, 0 * SIZE
	CMPLT	$fcc1, s2, t3
	LD	a8, X, 1 * SIZE
	CMPLT	$fcc2, s3, t5
	add.d	X, X, INCX
	CMPLT	$fcc3, s4, t7
	addi.d	I, I, -1
	CMOVT	s1, s1, t1, $fcc0
	MOVT(x1, TEMP, $fcc0)
	CMOVT	s2, s2, t3, $fcc1
	MOVT(x2, TEMP, $fcc1)
	CMOVT	s3, s3, t5, $fcc2
	MOVT(x3, TEMP, $fcc2)
	CMOVT	s4, s4, t7, $fcc3
	MOVT(x4, TEMP, $fcc3)
	addi.d	TEMP, TEMP, 4
	blt	$r0, I, .L12
	.align 3

	/* Drain: process the last preloaded group without loading more. */
.L13:
	FABS	t1, a1
	FABS	t2, a2
	FABS	t3, a3
	FABS	t4, a4
	FABS	t5, a5
	FABS	t6, a6
	FABS	t7, a7
	FABS	t8, a8
	ADD	t1, t1, t2
	ADD	t3, t3, t4
	ADD	t5, t5, t6
	ADD	t7, t7, t8
	CMPLT	$fcc0, s1, t1
	CMPLT	$fcc1, s2, t3
	CMPLT	$fcc2, s3, t5
	CMPLT	$fcc3, s4, t7
	CMOVT	s1, s1, t1, $fcc0
	MOVT(x1, TEMP, $fcc0)
	CMOVT	s2, s2, t3, $fcc1
	MOVT(x2, TEMP, $fcc1)
	CMOVT	s3, s3, t5, $fcc2
	MOVT(x3, TEMP, $fcc2)
	CMOVT	s4, s4, t7, $fcc3
	MOVT(x4, TEMP, $fcc3)
	addi.d	TEMP, TEMP, 4
	/* Turn the recorded group indices into element indices.  A lane that
	   was never updated still holds the first element's value and ends up
	   with an index larger than 1 here; it can only tie with lane 1 while
	   lane 1 still carries index 1, so the merge below never selects that
	   index. */
	addi.d	x2, x2, 1
	addi.d	x3, x3, 2
	addi.d	x4, x4, 3
	.align 3

.L15:
	andi	I, N, 3				/* leftover elements */
	bge	$r0, I, .L998
	.align 3

	/* Remainder loop: one element per iteration, tracked in lane 1. */
.L16:
	LD	a1, X, 0 * SIZE
	LD	a2, X, 1 * SIZE
	add.d	X, X, INCX
	FABS	t1, a1
	FABS	t2, a2
	ADD	t1, t1, t2
	addi.d	I, I, -1
	CMPLT	$fcc0, s1, t1
	CMOVT	s1, s1, t1, $fcc0
	MOVT(x1, TEMP, $fcc0)
	addi.d	TEMP, TEMP, 1
	blt	$r0, I, .L16
	.align 3

	/* Merge the lanes.  A lane replaces another when its value is
	   strictly larger, or when the values compare neither smaller nor
	   larger and its index is lower (first occurrence wins). */
.L998:
	CMPLT	$fcc0, s1, s2
	bcnez	$fcc0, .L998_take2		/* lane 2 strictly larger */
	CMPLT	$fcc0, s2, s1
	bcnez	$fcc0, .L998_merge34		/* lane 1 strictly larger */
	bge	x2, x1, .L998_merge34		/* tie: keep the lower index */
.L998_take2:
	fmov.d	s1, s2
	move	x1, x2
.L998_merge34:
	CMPLT	$fcc0, s3, s4
	bcnez	$fcc0, .L998_take4		/* lane 4 strictly larger */
	CMPLT	$fcc0, s4, s3
	bcnez	$fcc0, .L998_final		/* lane 3 strictly larger */
	bge	x4, x3, .L998_final		/* tie: keep the lower index */
.L998_take4:
	fmov.d	s3, s4
	move	x3, x4
.L998_final:
	CMPLT	$fcc0, s1, s3
	bcnez	$fcc0, .L998_take3		/* lanes 3/4 strictly larger */
	CMPLT	$fcc0, s3, s1
	bcnez	$fcc0, .L999			/* lanes 1/2 strictly larger */
	bge	x3, x1, .L999			/* tie: keep the lower index */
.L998_take3:
	fmov.d	s1, s3
	move	x1, x3
	.align 3

.L999:
	move	$r4, $r17			/* return the index held in x1 */
	fmov.d	$f0, $f22			/* copy s1 into $f0 as well */
	jirl	$r0, $r1, 0x0
	EPILOGUE

217
kernel/loongarch64/izamin.S Normal file
View File

@@ -0,0 +1,217 @@
/***************************************************************************
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define ASSEMBLER
#include "common.h"
/*
 * i?amin kernel for complex vectors: returns the 1-based index of the
 * element of X with the smallest |re| + |im|, or 0 when N <= 0 or INCX <= 0.
 *
 * The element width and the exact instructions behind LD / FABS / ADD /
 * CMPLT / CMOVT / MOVT come from common.h (not shown here); the comments
 * below assume "CMPLT cc, a, b" sets cc when a < b, that CMOVT selects its
 * third operand when the flag is set, and that MOVT copies its second
 * operand into the first when the flag is set -- TODO confirm against
 * common.h.
 *
 * Four running minima (s1..s4) are kept, one per lane of the unrolled loop,
 * each with the index at which it was found (x1..x4).  The lanes are merged
 * at .L998; ties are resolved in favour of the smaller index so that the
 * first occurrence of the minimum is reported.
 */

/* Integer registers */
#define N	$r4	/* element count; also receives the returned index */
#define X	$r5	/* pointer to the current complex element */
#define INCX	$r6	/* stride, converted to bytes with ZBASE_SHIFT */
#define I	$r18	/* loop counter */
#define TEMP	$r7	/* 1-based index of the first element of the current group */

/* Values loaded from X: (a1,a2), (a3,a4), (a5,a6), (a7,a8) are the two
   components of four consecutive elements */
#define a1	$f10
#define a2	$f11
#define a3	$f12
#define a4	$f13
#define a5	$f14
#define a6	$f15
#define a7	$f16
#define a8	$f17

/* Absolute values of the components; t1, t3, t5, t7 then hold the sums */
#define t1	$f0
#define t2	$f1
#define t3	$f2
#define t4	$f3
#define t5	$f4
#define t6	$f5
#define t7	$f6
#define t8	$f7

/* Running minimum of each lane */
#define s1	$f22
#define s2	$f8
#define s3	$f23
#define s4	$f9

/* Index belonging to each running minimum */
#define x1	$r17
#define x2	$r8
#define x3	$r9
#define x4	$r10

	PROLOGUE

#ifdef F_INTERFACE
	/* Arguments are passed by reference: fetch the values.  Uses the same
	   three-operand load form as every other load in this file. */
	LDINT	N, N, 0
	LDINT	INCX, INCX, 0
#endif

	li	x1, 0				/* result for N <= 0 or INCX <= 0 */
	bge	$r0, N, .L999

	slli.d	INCX, INCX, ZBASE_SHIFT		/* stride in bytes */
	bge	$r0, INCX, .L999

	/* Seed all four lanes with |re| + |im| of the first element. */
	LD	a1, X, 0 * SIZE
	LD	a2, X, 1 * SIZE
	FABS	t1, a1
	FABS	t2, a2
	ADD	s1, t1, t2
	ADD	s2, t1, t2
	ADD	s3, t1, t2
	ADD	s4, t1, t2
	addi.d	N, N, -1			/* N = elements still to inspect */

	li	x1, 1				/* a single element: index 1 */
	bge	$r0, N, .L999

	add.d	X, X, INCX
	li	x2, 1
	srai.d	I, N, 2				/* number of 4-element groups */
	li	x3, 1
	li	TEMP, 2				/* next element has index 2 */
	li	x4, 1
	bge	$r0, I, .L15

	/* Preload the first group of four complex elements. */
	LD	a1, X, 0 * SIZE
	LD	a2, X, 1 * SIZE
	add.d	X, X, INCX
	LD	a3, X, 0 * SIZE
	LD	a4, X, 1 * SIZE
	add.d	X, X, INCX
	LD	a5, X, 0 * SIZE
	LD	a6, X, 1 * SIZE
	add.d	X, X, INCX
	LD	a7, X, 0 * SIZE
	LD	a8, X, 1 * SIZE
	addi.d	I, I, -1
	add.d	X, X, INCX
	bge	$r0, I, .L13
	.align 3

	/* Main loop: process the group already in a1..a8 while loading the
	   next one.  Every lane records TEMP (the index of the first element
	   of the group); the per-lane offset is added once after .L13. */
.L12:
	FABS	t1, a1
	LD	a1, X, 0 * SIZE
	FABS	t2, a2
	LD	a2, X, 1 * SIZE
	FABS	t3, a3
	add.d	X, X, INCX
	FABS	t4, a4
	FABS	t5, a5
	LD	a3, X, 0 * SIZE
	FABS	t6, a6
	LD	a4, X, 1 * SIZE
	FABS	t7, a7
	add.d	X, X, INCX
	FABS	t8, a8
	ADD	t1, t1, t2
	LD	a5, X, 0 * SIZE
	ADD	t3, t3, t4
	LD	a6, X, 1 * SIZE
	ADD	t5, t5, t6
	add.d	X, X, INCX
	ADD	t7, t7, t8
	CMPLT	$fcc0, t1, s1
	LD	a7, X, 0 * SIZE
	CMPLT	$fcc1, t3, s2
	LD	a8, X, 1 * SIZE
	CMPLT	$fcc2, t5, s3
	add.d	X, X, INCX
	CMPLT	$fcc3, t7, s4
	addi.d	I, I, -1
	CMOVT	s1, s1, t1, $fcc0
	MOVT(x1, TEMP, $fcc0)
	CMOVT	s2, s2, t3, $fcc1
	MOVT(x2, TEMP, $fcc1)
	CMOVT	s3, s3, t5, $fcc2
	MOVT(x3, TEMP, $fcc2)
	CMOVT	s4, s4, t7, $fcc3
	MOVT(x4, TEMP, $fcc3)
	addi.d	TEMP, TEMP, 4
	blt	$r0, I, .L12
	.align 3

	/* Drain: process the last preloaded group without loading more. */
.L13:
	FABS	t1, a1
	FABS	t2, a2
	FABS	t3, a3
	FABS	t4, a4
	FABS	t5, a5
	FABS	t6, a6
	FABS	t7, a7
	FABS	t8, a8
	ADD	t1, t1, t2
	ADD	t3, t3, t4
	ADD	t5, t5, t6
	ADD	t7, t7, t8
	CMPLT	$fcc0, t1, s1
	CMPLT	$fcc1, t3, s2
	CMPLT	$fcc2, t5, s3
	CMPLT	$fcc3, t7, s4
	CMOVT	s1, s1, t1, $fcc0
	MOVT(x1, TEMP, $fcc0)
	CMOVT	s2, s2, t3, $fcc1
	MOVT(x2, TEMP, $fcc1)
	CMOVT	s3, s3, t5, $fcc2
	MOVT(x3, TEMP, $fcc2)
	CMOVT	s4, s4, t7, $fcc3
	MOVT(x4, TEMP, $fcc3)
	addi.d	TEMP, TEMP, 4
	/* Turn the recorded group indices into element indices.  A lane that
	   was never updated still holds the first element's value and ends up
	   with an index larger than 1 here; it can only tie with lane 1 while
	   lane 1 still carries index 1, so the merge below never selects that
	   index. */
	addi.d	x2, x2, 1
	addi.d	x3, x3, 2
	addi.d	x4, x4, 3
	.align 3

.L15:
	andi	I, N, 3				/* leftover elements */
	bge	$r0, I, .L998
	.align 3

	/* Remainder loop: one element per iteration, tracked in lane 1. */
.L16:
	LD	a1, X, 0 * SIZE
	LD	a2, X, 1 * SIZE
	add.d	X, X, INCX
	FABS	t1, a1
	FABS	t2, a2
	ADD	t1, t1, t2
	addi.d	I, I, -1
	CMPLT	$fcc0, t1, s1
	CMOVT	s1, s1, t1, $fcc0
	MOVT(x1, TEMP, $fcc0)
	addi.d	TEMP, TEMP, 1
	blt	$r0, I, .L16
	.align 3

	/* Merge the lanes.  A lane replaces another when its value is
	   strictly smaller, or when the values compare neither smaller nor
	   larger and its index is lower (first occurrence wins). */
.L998:
	CMPLT	$fcc0, s2, s1
	bcnez	$fcc0, .L998_take2		/* lane 2 strictly smaller */
	CMPLT	$fcc0, s1, s2
	bcnez	$fcc0, .L998_merge34		/* lane 1 strictly smaller */
	bge	x2, x1, .L998_merge34		/* tie: keep the lower index */
.L998_take2:
	fmov.d	s1, s2
	move	x1, x2
.L998_merge34:
	CMPLT	$fcc0, s4, s3
	bcnez	$fcc0, .L998_take4		/* lane 4 strictly smaller */
	CMPLT	$fcc0, s3, s4
	bcnez	$fcc0, .L998_final		/* lane 3 strictly smaller */
	bge	x4, x3, .L998_final		/* tie: keep the lower index */
.L998_take4:
	fmov.d	s3, s4
	move	x3, x4
.L998_final:
	CMPLT	$fcc0, s3, s1
	bcnez	$fcc0, .L998_take3		/* lanes 3/4 strictly smaller */
	CMPLT	$fcc0, s1, s3
	bcnez	$fcc0, .L999			/* lanes 1/2 strictly smaller */
	bge	x3, x1, .L999			/* tie: keep the lower index */
.L998_take3:
	fmov.d	s1, s3
	move	x1, x3
	.align 3

.L999:
	move	$r4, $r17			/* return the index held in x1 */
	fmov.d	$f0, $f22			/* copy s1 into $f0 as well */
	jirl	$r0, $r1, 0x0
	EPILOGUE

Some files were not shown because too many files have changed in this diff Show More