Compare commits
285 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
898212efcd | ||
|
|
210a1584c5 | ||
|
|
f2a7a67f5a | ||
|
|
e0e88f9edc | ||
|
|
5dc6aa74f0 | ||
|
|
e78fbe4654 | ||
|
|
b4f4ed378b | ||
|
|
cbc41973fd | ||
|
|
1b6db3dbba | ||
|
|
f681553c6a | ||
|
|
afadeeba2a | ||
|
|
02d4a49761 | ||
|
|
4d7dfe4845 | ||
|
|
af0a69f355 | ||
|
|
5a2fe5bfb9 | ||
|
|
342d3e8b5c | ||
|
|
efbd7c7840 | ||
|
|
3a7955cd93 | ||
|
|
47ba85f314 | ||
|
|
30f23be0f9 | ||
|
|
49bbf330ca | ||
|
|
38d5b4b124 | ||
|
|
6e3fbe8ac5 | ||
|
|
86273392e5 | ||
|
|
d909f9f3d4 | ||
|
|
12d3d94e2e | ||
|
|
f349be3bdb | ||
|
|
4777eb678f | ||
|
|
415876d117 | ||
|
|
da8435dc36 | ||
|
|
4c7065f3ee | ||
|
|
f62bfaafe8 | ||
|
|
d947116390 | ||
|
|
f176ff90af | ||
|
|
f4d4abd423 | ||
|
|
2b9443b7e7 | ||
|
|
fe0e66564e | ||
|
|
a6351e32f0 | ||
|
|
5b4b385ecf | ||
|
|
1dea57ab25 | ||
|
|
54ffe280df | ||
|
|
029d1e16b9 | ||
|
|
ea8e208029 | ||
|
|
0fca36c8c3 | ||
|
|
44cc7cdecc | ||
|
|
6492131792 | ||
|
|
6c8ec55fb7 | ||
|
|
fab746240c | ||
|
|
847607c768 | ||
|
|
4c81d1c3fe | ||
|
|
db4908ebfa | ||
|
|
ed3eb18cb2 | ||
|
|
239ff330f8 | ||
|
|
19c81a07cb | ||
|
|
e008646ba9 | ||
|
|
498479b13e | ||
|
|
b4cbfe6677 | ||
|
|
be1a42507c | ||
|
|
7bb59fceb7 | ||
|
|
eba2cd951e | ||
|
|
836c7fb9f5 | ||
|
|
d2693eac04 | ||
|
|
8acb6fe3a8 | ||
|
|
c47e35acee | ||
|
|
a27a61bb9a | ||
|
|
69560ad3ce | ||
|
|
b2319fd97a | ||
|
|
0266ba7cb6 | ||
|
|
7e09570e04 | ||
|
|
14e33e0f7e | ||
|
|
db57c449dc | ||
|
|
993e56b7b3 | ||
|
|
c9304199cf | ||
|
|
d86290edf0 | ||
|
|
89429fdaa2 | ||
|
|
d511063098 | ||
|
|
4f4e286bf6 | ||
|
|
ddb6cee0d5 | ||
|
|
cecc2c65aa | ||
|
|
220f6a1c55 | ||
|
|
2f6326a630 | ||
|
|
c0d0406b97 | ||
|
|
8f22ac552b | ||
|
|
da623ae838 | ||
|
|
eb2fdd3af0 | ||
|
|
0d8d261dd4 | ||
|
|
40caaef052 | ||
|
|
25b602d8a6 | ||
|
|
4ed99c2ce3 | ||
|
|
f20c4edc33 | ||
|
|
3cfdb1770c | ||
|
|
8186963d8c | ||
|
|
a4543e4918 | ||
|
|
2376aa1e8c | ||
|
|
4620f98812 | ||
|
|
726c44242b | ||
|
|
dcfc5cf714 | ||
|
|
06e3b07ecb | ||
|
|
623be6600a | ||
|
|
7ddc9d384c | ||
|
|
6ebcce229f | ||
|
|
1b5620b66e | ||
|
|
1f8bda71b9 | ||
|
|
3be660c000 | ||
|
|
1a8b6134c2 | ||
|
|
f0b822a709 | ||
|
|
130327e9af | ||
|
|
750719528a | ||
|
|
91e2b11d3c | ||
|
|
548aa522e5 | ||
|
|
6423b282a1 | ||
|
|
9335d42740 | ||
|
|
39ef0880ae | ||
|
|
b7da75e4fd | ||
|
|
a7627c5afd | ||
|
|
9499ab0d45 | ||
|
|
307c4c0786 | ||
|
|
e83df93975 | ||
|
|
13fa9f737d | ||
|
|
5958ffc9b6 | ||
|
|
cd0e4aadb1 | ||
|
|
e2621ef93a | ||
|
|
9e1b43ea9b | ||
|
|
5269348178 | ||
|
|
92e024bbb3 | ||
|
|
c4b464cac6 | ||
|
|
e6dd44d989 | ||
|
|
baf03a0937 | ||
|
|
7aab5e826c | ||
|
|
29417adf4c | ||
|
|
9d292d37b2 | ||
|
|
2e8ff4a781 | ||
|
|
dbba381dc3 | ||
|
|
f61991d439 | ||
|
|
efdbdd8f82 | ||
|
|
3906ef3b0f | ||
|
|
8adf0971d8 | ||
|
|
08e2e60762 | ||
|
|
fb9e678235 | ||
|
|
dc4fcb48df | ||
|
|
7a48247761 | ||
|
|
7dfc45e840 | ||
|
|
7fb6e576c2 | ||
|
|
cbb70438df | ||
|
|
706a08d4a0 | ||
|
|
9f3d903817 | ||
|
|
590be3fae3 | ||
|
|
3521cd48cb | ||
|
|
1e0192a5cc | ||
|
|
fe9aff17fe | ||
|
|
8c25b440a0 | ||
|
|
f84197c1a7 | ||
|
|
734bd265a8 | ||
|
|
1217eb910d | ||
|
|
d6d7a6685d | ||
|
|
f0e7345fb8 | ||
|
|
42f048cf6c | ||
|
|
4fbc0777f4 | ||
|
|
d7472606d5 | ||
|
|
03297ff9f0 | ||
|
|
2d8d0af0ea | ||
|
|
5f677e782e | ||
|
|
04c60cee5d | ||
|
|
3a53207cc9 | ||
|
|
0e73d20629 | ||
|
|
02087a62e7 | ||
|
|
03b4d79a7e | ||
|
|
5c729c6dce | ||
|
|
e1911b2e60 | ||
|
|
8f33da4f94 | ||
|
|
26ccf643a3 | ||
|
|
32264ba496 | ||
|
|
4ecf631f95 | ||
|
|
5af510081d | ||
|
|
164551d5a2 | ||
|
|
310b76aad7 | ||
|
|
c4da892ba0 | ||
|
|
cbfd3c87e1 | ||
|
|
26e87ac517 | ||
|
|
15b9d6b4a7 | ||
|
|
f7bcd962c1 | ||
|
|
93cc066921 | ||
|
|
2c7d4a7766 | ||
|
|
eef1c42f03 | ||
|
|
73f637e584 | ||
|
|
8b90e5f202 | ||
|
|
bd60fb6ffc | ||
|
|
37ea8702ee | ||
|
|
ec7d6c02bc | ||
|
|
c90c23e78f | ||
|
|
bda8820da7 | ||
|
|
c0ca63ea46 | ||
|
|
f497bb949b | ||
|
|
f86b1bc3da | ||
|
|
206e03fdac | ||
|
|
8b599836db | ||
|
|
9721b57ecf | ||
|
|
380f955078 | ||
|
|
49d18e65e3 | ||
|
|
904f9a267d | ||
|
|
4c033730bb | ||
|
|
65502c6af6 | ||
|
|
f71627fa2e | ||
|
|
d8d7bd33cb | ||
|
|
e72420e8c5 | ||
|
|
d00709e016 | ||
|
|
d444344497 | ||
|
|
fb7308b9b5 | ||
|
|
db50b24a4a | ||
|
|
88b70fba3e | ||
|
|
4c1d47098b | ||
|
|
40000d1f64 | ||
|
|
dc3664993c | ||
|
|
b8232c9054 | ||
|
|
114bbbc6d7 | ||
|
|
b67a92c19f | ||
|
|
4bf00da8fb | ||
|
|
c26780d451 | ||
|
|
d77d9bc920 | ||
|
|
37d3e2bd94 | ||
|
|
de8656769c | ||
|
|
d43e07198d | ||
|
|
da16764c7a | ||
|
|
98ebc8ac59 | ||
|
|
904b221f03 | ||
|
|
5cc35abc3d | ||
|
|
254774f5a6 | ||
|
|
ae9cdee753 | ||
|
|
53ee0b76bb | ||
|
|
dc6b04c375 | ||
|
|
3d4ccd2a13 | ||
|
|
c59652f0ce | ||
|
|
87d2e314db | ||
|
|
3a30c12019 | ||
|
|
c9a82f54d1 | ||
|
|
444cb78be5 | ||
|
|
171c20e3b6 | ||
|
|
c5fb91f1bc | ||
|
|
9a36a283d3 | ||
|
|
7e35d25ea0 | ||
|
|
3704f5e5b0 | ||
|
|
6b76066632 | ||
|
|
2b01132515 | ||
|
|
8e95a1e18d | ||
|
|
aa7b3dc3db | ||
|
|
13a29d13fd | ||
|
|
a6c2cb8417 | ||
|
|
d511a7bb4f | ||
|
|
3526ff2507 | ||
|
|
adcfe7b789 | ||
|
|
ceb44bef14 | ||
|
|
ed473267df | ||
|
|
0608bc5d82 | ||
|
|
3d511f0e66 | ||
|
|
0b8a436af9 | ||
|
|
352efdd13a | ||
|
|
4855af02a3 | ||
|
|
94a5a1f0f1 | ||
|
|
751d127d7c | ||
|
|
fc101b67e5 | ||
|
|
b0239a05fd | ||
|
|
623d580b4c | ||
|
|
974acb39ff | ||
|
|
2379abaa5e | ||
|
|
3caf781d7c | ||
|
|
55bb9f639a | ||
|
|
0dba04bb58 | ||
|
|
e96f5e3c65 | ||
|
|
558724e99f | ||
|
|
067c96a873 | ||
|
|
4b380c0b40 | ||
|
|
2dfb24730d | ||
|
|
725432efaa | ||
|
|
a2216ef19f | ||
|
|
5332cbae18 | ||
|
|
209b026e46 | ||
|
|
1ae607beca | ||
|
|
d393f1923f | ||
|
|
081d5ae971 | ||
|
|
0492f0f3f9 | ||
|
|
147e0a75fd | ||
|
|
ee068af843 | ||
|
|
2dbcddd83d | ||
|
|
d2bda3b56a | ||
|
|
903fd85c85 |
5
.github/workflows/nightly-Homebrew-build.yml
vendored
5
.github/workflows/nightly-Homebrew-build.yml
vendored
@@ -43,11 +43,6 @@ jobs:
|
||||
- name: Update Homebrew
|
||||
if: github.event_name != 'pull_request'
|
||||
run: brew update || true
|
||||
|
||||
- name: unlink installed gcc to allow updating
|
||||
run: |
|
||||
brew unlink gcc@8
|
||||
brew unlink gcc@9
|
||||
|
||||
- name: Install prerequisites
|
||||
run: brew install --fetch-HEAD --HEAD --only-dependencies --keep-tmp openblas
|
||||
|
||||
@@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5)
|
||||
project(OpenBLAS C ASM)
|
||||
set(OpenBLAS_MAJOR_VERSION 0)
|
||||
set(OpenBLAS_MINOR_VERSION 3)
|
||||
set(OpenBLAS_PATCH_VERSION 14.dev)
|
||||
set(OpenBLAS_PATCH_VERSION 17.dev)
|
||||
set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
|
||||
|
||||
# Adhere to GNU filesystem layout conventions
|
||||
|
||||
@@ -194,3 +194,6 @@ In chronological order:
|
||||
|
||||
* PingTouGe Semiconductor Co., Ltd.
|
||||
* [2020-10] Add RISC-V Vector (0.7.1) support. Optimize BLAS kernels for Xuantie C910
|
||||
|
||||
* River Dillon <oss@outerpassage.net>
|
||||
* [2021-07-10] fix compilation with musl libc
|
||||
|
||||
113
Changelog.txt
113
Changelog.txt
@@ -1,4 +1,117 @@
|
||||
OpenBLAS ChangeLog
|
||||
====================================================================
|
||||
Version 0.3.17
|
||||
15-Jul-2021
|
||||
|
||||
common:
|
||||
- reverted the optimization of SGEMV_N/DGEMV_N for small input sizes
|
||||
and consecutive arguments as it led to stack overflows on x86_64
|
||||
with some operating systems (notably OSX and Windows)
|
||||
|
||||
x86_64:
|
||||
- reverted the performance patch for SGEMV_T on AVX512 as it caused
|
||||
wrong results in some applications
|
||||
|
||||
SPARC:
|
||||
- fixed compilation with compilers other than gcc
|
||||
====================================================================
|
||||
Version 0.3.16
|
||||
11-Jul-2021
|
||||
|
||||
common:
|
||||
- drastically reduced the stack size requirements for running the LAPACK
|
||||
testsuite (Reference-LAPACK PR 553)
|
||||
- fixed spurious test failures in the LAPACK testsuite (Reference-LAPACK
|
||||
PR 564)
|
||||
- expressly setting DYNAMIC_ARCH=0 no longer enables dynamic_arch mode
|
||||
- improved performance of xGER, xSPR, xSPR2, xSYR, xSYR2, xTRSV, SGEMV_N
|
||||
and DGEMV_N, for small input sizes and consecutive arguments
|
||||
- improved performance of xGETRF, xPOTRF and xPOTRI for small input sizes
|
||||
by disabling multithreading
|
||||
- fixed installing with BSD versions of the "install" utility
|
||||
|
||||
RISCV:
|
||||
- fixed the implementation of xIMIN
|
||||
- improved the performance of DSDOT
|
||||
- fixed linking of the tests on C910V with current vendor gcc
|
||||
|
||||
POWER:
|
||||
- fixed SBGEMM computation for some odd value inputs
|
||||
- fixed compilation for PPCG4, PPC970, POWER3, POWER4 and POWER5
|
||||
|
||||
x86_64:
|
||||
- improved performance of SGEMV_N and SGEMV_T for small N on AVX512-capable cpus
|
||||
- worked around a miscompilation of ZGEMM/ZTRMM on Sandybridge with old gcc
|
||||
versions
|
||||
- fixed compilation with MS Visual Studio versions older than 2017
|
||||
- fixed macro name collision with winnt.h from the latest Win10 SDK
|
||||
- added cpu type autodetection for Intel Ice Lake SP
|
||||
- fixed cpu type autodetection for Intel Tiger Lake
|
||||
- added cpu type autodetection for recent Centaur/Zhaoxin models
|
||||
- fixed compilation with musl libc
|
||||
|
||||
ARM64:
|
||||
- fixed compilation with gcc/gfortran on the Apple M1
|
||||
- fixed linking of the tests on FreeBSD
|
||||
- fixed missing restore of a register in the recently rewritten DNRM2 kernel
|
||||
for ThunderX2 and Neoverse N1 that could cause spurious failures in e.g.
|
||||
DGEEV
|
||||
- added compiler optimization flags for the EMAG8180
|
||||
- added initial support for Cortex A55
|
||||
|
||||
ARM:
|
||||
- fixed linking of the tests on FreeBSD
|
||||
|
||||
====================================================================
|
||||
Version 0.3.15
|
||||
2-May-2021
|
||||
|
||||
common:
|
||||
- imported improvements and bugfixes from Reference-LAPACK 3.9.1
|
||||
- imported LAPACKE interface fixes from Reference-LAPACK PRs 534 + 537
|
||||
- fixed a problem in the cpu detection of 0.3.14 that prevented cross-compilation
|
||||
- fixed a sequence problem in the generation of softlinks to the library in GMAKE
|
||||
|
||||
RISCV:
|
||||
- fixed compilation on RISCV (missing entry in getarch)
|
||||
- fixed a potential division by zero in CROTG and ZROTG
|
||||
|
||||
POWER:
|
||||
- fixed LAPACK testsuite failures seen with the NVIDIA HPC compiler
|
||||
- improved CGEMM, DGEMM and ZGEMM performance on POWER10
|
||||
- added an optimized ZGEMV kernel for POWER10
|
||||
- fixed a potential division by zero in CROTG and ZROTG
|
||||
|
||||
x86_64:
|
||||
- added support for Intel Control-flow Enforcement Technology (CET)
|
||||
- reverted the DOMATCOPY_RT code to the generic C version
|
||||
- fixed a bug in the AVX512 SGEMM kernel introduced in 0.3.14
|
||||
- fixed misapplication of -msse flag to non-SSE cpus in DYNAMIC_ARCH
|
||||
- added support for compilation of the benchmarks on older OSX versions
|
||||
- fixed propagation of the NO_AVX512 option in CMAKE builds
|
||||
- fixed compilation of the AVX512 SGEMM kernel with clang-cl on Windows
|
||||
- fixed compilation of the CTESTs with INTERFACE64=1 (random faults on OSX)
|
||||
- corrected the Haswell DROT kernel to require AVX2/FMA3 rather than AVX512
|
||||
|
||||
ARM:
|
||||
- fixed a potential division by zero in CROTG and ZROTG
|
||||
- fixed a potential overflow in IMATCOPY/ZIMATCOPY and the CTESTs
|
||||
|
||||
ARM64:
|
||||
- fixed spurious reads outside the array in the SGEMM tcopy macro
|
||||
- fixed a potential division by zero in CROTG and ZROTG
|
||||
- fixed a segmentation fault in DYNAMIC_ARCH builds (reappeared in 0.3.14)
|
||||
|
||||
MIPS:
|
||||
- fixed a potential division by zero in CROTG and ZROTG
|
||||
- fixed a potential overflow in IMATCOPY/ZIMATCOPY and the CTESTs
|
||||
|
||||
MIPS64:
|
||||
- fixed a potential division by zero in CROTG and ZROTG
|
||||
|
||||
SPARC:
|
||||
- fixed a potential division by zero in CROTG and ZROTG
|
||||
|
||||
====================================================================
|
||||
Version 0.3.14
|
||||
17-Mar-2021
|
||||
|
||||
2
Makefile
2
Makefile
@@ -167,7 +167,6 @@ ifeq ($(NO_SHARED), 1)
|
||||
$(error OpenBLAS: neither static nor shared are enabled.)
|
||||
endif
|
||||
endif
|
||||
@-ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
|
||||
@for d in $(SUBDIRS) ; \
|
||||
do if test -d $$d; then \
|
||||
$(MAKE) -C $$d $(@F) || exit 1 ; \
|
||||
@@ -196,6 +195,7 @@ endif
|
||||
ifdef USE_THREAD
|
||||
@echo USE_THREAD=$(USE_THREAD) >> Makefile.conf_last
|
||||
endif
|
||||
@-ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
|
||||
@touch lib.grd
|
||||
|
||||
prof : prof_blas prof_lapack
|
||||
|
||||
@@ -1,4 +1,15 @@
|
||||
ifneq ($(C_COMPILER), PGI)
|
||||
|
||||
ifneq ($(GCCVERSIONGT4), 1)
|
||||
CCOMMON_OPT += -march=armv8-a
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8-a
|
||||
endif
|
||||
|
||||
|
||||
else
|
||||
|
||||
|
||||
ifeq ($(CORE), ARMV8)
|
||||
CCOMMON_OPT += -march=armv8-a
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
@@ -57,6 +68,28 @@ endif
|
||||
endif
|
||||
endif
|
||||
|
||||
# Use a53 tunings because a55 is only available in GCC>=8.1
|
||||
ifeq ($(CORE), CORTEXA55)
|
||||
ifeq ($(GCCVERSIONGTEQ7), 1)
|
||||
ifeq ($(GCCVERSIONGTEQ8), 1)
|
||||
CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a55
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a55
|
||||
endif
|
||||
else
|
||||
CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a53
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a53
|
||||
endif
|
||||
endif
|
||||
else
|
||||
CCOMMON_OPT += -march=armv8-a -mtune=cortex-a53
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8-a -mtune=cortex-a53
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), THUNDERX)
|
||||
CCOMMON_OPT += -march=armv8-a -mtune=thunderx
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
@@ -107,4 +140,16 @@ FCOMMON_OPT += -march=armv8.2-a -mtune=tsv110
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(GCCVERSIONGTEQ9), 1)
|
||||
ifeq ($(CORE), EMAG8180)
|
||||
CCOMMON_OPT += -march=armv8-a -mtune=emag
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8-a -mtune=emag
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
endif
|
||||
|
||||
endif
|
||||
@@ -74,17 +74,17 @@ endif
|
||||
ifneq ($(OSNAME), AIX)
|
||||
ifndef NO_LAPACKE
|
||||
@echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
|
||||
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapack.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapack.h"
|
||||
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h"
|
||||
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_config.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h"
|
||||
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h.in "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h"
|
||||
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_utils.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h"
|
||||
@-install -m644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapack.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapack.h"
|
||||
@-install -m644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h"
|
||||
@-install -m644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_config.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h"
|
||||
@-install -m644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h.in "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h"
|
||||
@-install -m644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_utils.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h"
|
||||
endif
|
||||
|
||||
#for install static library
|
||||
ifneq ($(NO_STATIC),1)
|
||||
@echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
|
||||
@install -pm644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
|
||||
@install -m644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
|
||||
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
|
||||
ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
|
||||
endif
|
||||
@@ -92,7 +92,7 @@ endif
|
||||
ifneq ($(NO_SHARED),1)
|
||||
@echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
|
||||
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku FreeBSD DragonFly))
|
||||
@install -pm755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
|
||||
@install -m755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
|
||||
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
|
||||
ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \
|
||||
ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
|
||||
|
||||
3
Makefile.loongarch64
Normal file
3
Makefile.loongarch64
Normal file
@@ -0,0 +1,3 @@
|
||||
ifdef BINARY64
|
||||
else
|
||||
endif
|
||||
@@ -3,7 +3,7 @@
|
||||
#
|
||||
|
||||
# This library's version
|
||||
VERSION = 0.3.14.dev
|
||||
VERSION = 0.3.17.dev
|
||||
|
||||
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
|
||||
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
|
||||
|
||||
@@ -333,6 +333,7 @@ GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4)
|
||||
GCCVERSIONGT4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 4)
|
||||
GCCVERSIONGT5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 5)
|
||||
GCCVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 7)
|
||||
GCCVERSIONGTEQ8 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 8)
|
||||
GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9)
|
||||
GCCVERSIONGTEQ11 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 11)
|
||||
GCCVERSIONGTEQ10 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 10)
|
||||
@@ -380,6 +381,12 @@ ifeq ($(OSNAME), AIX)
|
||||
EXTRALIB += -lm
|
||||
endif
|
||||
|
||||
ifeq ($(OSNAME), FreeBSD)
|
||||
ifeq ($(ARCH), $(filter $(ARCH),arm arm64))
|
||||
EXTRALIB += -lm
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(OSNAME), WINNT)
|
||||
NEED_PIC = 0
|
||||
NO_EXPRECISION = 1
|
||||
@@ -619,6 +626,7 @@ DYNAMIC_CORE += CORTEXA57
|
||||
DYNAMIC_CORE += CORTEXA72
|
||||
DYNAMIC_CORE += CORTEXA73
|
||||
DYNAMIC_CORE += NEOVERSEN1
|
||||
DYNAMIC_CORE += CORTEXA55
|
||||
DYNAMIC_CORE += FALKOR
|
||||
DYNAMIC_CORE += THUNDERX
|
||||
DYNAMIC_CORE += THUNDERX2T99
|
||||
@@ -772,6 +780,11 @@ NO_BINARY_MODE = 1
|
||||
BINARY_DEFINED = 1
|
||||
endif
|
||||
|
||||
ifeq ($(ARCH), loongarch64)
|
||||
NO_BINARY_MODE = 1
|
||||
BINARY_DEFINED = 1
|
||||
endif
|
||||
|
||||
|
||||
#
|
||||
# C Compiler dependent settings
|
||||
@@ -842,6 +855,13 @@ ifeq ($(OSNAME), AIX)
|
||||
BINARY_DEFINED = 1
|
||||
endif
|
||||
|
||||
ifeq ($(ARCH), loongarch64)
|
||||
ifeq ($(CORE), LOONGSONG3R5)
|
||||
CCOMMON_OPT += -march=loongarch64 -mabi=lp64
|
||||
FCOMMON_OPT += -march=loongarch64 -mabi=lp64
|
||||
endif
|
||||
endif
|
||||
|
||||
endif
|
||||
|
||||
ifndef BINARY_DEFINED
|
||||
|
||||
17
Makefile.x86
17
Makefile.x86
@@ -1,10 +1,21 @@
|
||||
# COMPILER_PREFIX = mingw32-
|
||||
|
||||
ifdef HAVE_SSE
|
||||
CCOMMON_OPT += -msse
|
||||
FCOMMON_OPT += -msse
|
||||
ifneq ($(DYNAMIC_ARCH),1)
|
||||
ADD_CPUFLAGS = 1
|
||||
else
|
||||
ifdef TARGET_CORE
|
||||
ADD_CPUFLAGS = 1
|
||||
endif
|
||||
endif
|
||||
|
||||
ifdef ADD_CPUFLAGS
|
||||
ifdef HAVE_SSE
|
||||
CCOMMON_OPT += -msse
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -msse
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(OSNAME), Interix)
|
||||
ARFLAGS = -m x86
|
||||
|
||||
@@ -8,6 +8,16 @@ endif
|
||||
endif
|
||||
endif
|
||||
|
||||
|
||||
ifneq ($(DYNAMIC_ARCH),1)
|
||||
ADD_CPUFLAGS = 1
|
||||
else
|
||||
ifdef TARGET_CORE
|
||||
ADD_CPUFLAGS = 1
|
||||
endif
|
||||
endif
|
||||
|
||||
ifdef ADD_CPUFLAGS
|
||||
ifdef HAVE_SSE3
|
||||
CCOMMON_OPT += -msse3
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
@@ -44,7 +54,6 @@ endif
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), SKYLAKEX)
|
||||
ifndef DYNAMIC_ARCH
|
||||
ifndef NO_AVX512
|
||||
CCOMMON_OPT += -march=skylake-avx512
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
@@ -62,10 +71,8 @@ endif
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), COOPERLAKE)
|
||||
ifndef DYNAMIC_ARCH
|
||||
ifndef NO_AVX512
|
||||
ifeq ($(C_COMPILER), GCC)
|
||||
# cooperlake support was added in 10.1
|
||||
@@ -88,7 +95,6 @@ endif
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
ifdef HAVE_AVX2
|
||||
ifndef NO_AVX2
|
||||
@@ -120,6 +126,7 @@ endif
|
||||
endif
|
||||
endif
|
||||
|
||||
endif
|
||||
|
||||
|
||||
ifeq ($(OSNAME), Interix)
|
||||
|
||||
@@ -27,7 +27,7 @@ We provide official binary packages for the following platform:
|
||||
|
||||
* Windows x86/x86_64
|
||||
|
||||
You can download them from [file hosting on sourceforge.net](https://sourceforge.net/projects/openblas/files/).
|
||||
You can download them from [file hosting on sourceforge.net](https://sourceforge.net/projects/openblas/files/) or from the Releases section of the GitHub project page, [https://github.com/xianyi/OpenBLAS/releases](https://github.com/xianyi/OpenBLAS/releases).
|
||||
|
||||
## Installation from Source
|
||||
|
||||
|
||||
@@ -92,6 +92,7 @@ CORTEXA57
|
||||
CORTEXA72
|
||||
CORTEXA73
|
||||
NEOVERSEN1
|
||||
CORTEXA55
|
||||
EMAG8180
|
||||
FALKOR
|
||||
THUNDERX
|
||||
@@ -109,3 +110,5 @@ Z14
|
||||
RISCV64_GENERIC
|
||||
C910V
|
||||
|
||||
11.LOONGARCH64:
|
||||
LOONGSON3R5
|
||||
|
||||
@@ -47,6 +47,7 @@ environment:
|
||||
install:
|
||||
- if [%COMPILER%]==[clang-cl] call %CONDA_INSTALL_LOCN%\Scripts\activate.bat
|
||||
- if [%COMPILER%]==[clang-cl] conda config --add channels conda-forge --force
|
||||
- if [%COMPILER%]==[clang-cl] conda config --set auto_update_conda false
|
||||
- if [%COMPILER%]==[clang-cl] conda install --yes --quiet clangdev cmake ninja flang=11.0.1
|
||||
- if [%COMPILER%]==[clang-cl] call "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvarsall.bat" x64
|
||||
- if [%COMPILER%]==[clang-cl] set "LIB=%CONDA_INSTALL_LOCN%\Library\lib;%LIB%"
|
||||
|
||||
@@ -4,7 +4,15 @@ trigger:
|
||||
branches:
|
||||
include:
|
||||
- develop
|
||||
|
||||
resources:
|
||||
containers:
|
||||
- container: oneapi-hpckit
|
||||
image: intel/oneapi-hpckit:latest
|
||||
options: '-v /usr/bin/sudo:/usr/bin/sudo -v /usr/lib/sudo/libsudo_util.so.0:/usr/lib/sudo/libsudo_util.so.0 -v /usr/lib/sudo/sudoers.so:/usr/lib/sudo/sudoers.so'
|
||||
- container: oneapi-basekit
|
||||
image: intel/oneapi-basekit:latest
|
||||
options: '-v /usr/bin/sudo:/usr/bin/sudo -v /usr/lib/sudo/libsudo_util.so.0:/usr/lib/sudo/libsudo_util.so.0 -v /usr/lib/sudo/sudoers.so:/usr/lib/sudo/sudoers.so'
|
||||
|
||||
jobs:
|
||||
# manylinux1 is useful to test because the
|
||||
# standard Docker container uses an old version
|
||||
@@ -74,7 +82,86 @@ jobs:
|
||||
steps:
|
||||
- script: |
|
||||
brew update
|
||||
make TARGET=CORE2 DYNAMIC_ARCH=1 USE_OPENMP=1 CC=gcc-10 FC=gfortran-10
|
||||
|
||||
|
||||
|
||||
make TARGET=CORE2 DYNAMIC_ARCH=1 USE_OPENMP=1 INTERFACE64=1 CC=gcc-10 FC=gfortran-10
|
||||
|
||||
- job: OSX_GCC_Nothreads
|
||||
pool:
|
||||
vmImage: 'macOS-10.15'
|
||||
steps:
|
||||
- script: |
|
||||
brew update
|
||||
make USE_THREADS=0 CC=gcc-10 FC=gfortran-10
|
||||
|
||||
- job: OSX_OpenMP_Clang
|
||||
pool:
|
||||
vmImage: 'macOS-10.15'
|
||||
variables:
|
||||
LD_LIBRARY_PATH: /usr/local/opt/llvm/lib
|
||||
LIBRARY_PATH: /usr/local/opt/llvm/lib
|
||||
steps:
|
||||
- script: |
|
||||
brew update
|
||||
brew install llvm libomp
|
||||
make TARGET=CORE2 USE_OPENMP=1 INTERFACE64=1 DYNAMIC_ARCH=1 CC=/usr/local/opt/llvm/bin/clang FC=gfortran-10
|
||||
|
||||
- job: OSX_Ifort_Clang
|
||||
pool:
|
||||
vmImage: 'macOS-10.15'
|
||||
variables:
|
||||
LD_LIBRARY_PATH: /usr/local/opt/llvm/lib
|
||||
MACOS_HPCKIT_URL: https://registrationcenter-download.intel.com/akdlm/irc_nas/17643/m_HPCKit_p_2021.2.0.2903_offline.dmg
|
||||
LIBRARY_PATH: /usr/local/opt/llvm/lib
|
||||
MACOS_FORTRAN_COMPONENTS: intel.oneapi.mac.ifort-compiler
|
||||
steps:
|
||||
- script: |
|
||||
brew update
|
||||
brew install llvm libomp
|
||||
sudo mkdir -p /opt/intel
|
||||
sudo chown $USER /opt/intel
|
||||
displayName: prepare for cache restore
|
||||
- task: Cache@2
|
||||
inputs:
|
||||
path: /opt/intel/oneapi
|
||||
key: '"install" | "$(MACOS_HPCKIT_URL)" | "$(MACOS_FORTRAN_COMPONENTS)"'
|
||||
cacheHitVar: CACHE_RESTORED
|
||||
- script: |
|
||||
curl --output webimage.dmg --url $(MACOS_HPCKIT_URL) --retry 5 --retry-delay 5
|
||||
hdiutil attach webimage.dmg
|
||||
sudo /Volumes/"$(basename "$(MACOS_HPCKIT_URL)" .dmg)"/bootstrapper.app/Contents/MacOS/bootstrapper -s --action install --components="$(MACOS_FORTRAN_COMPONENTS)" --eula=accept --continue-with-optional-error=yes --log-dir=.
|
||||
installer_exit_code=$?
|
||||
hdiutil detach /Volumes/"$(basename "$URL" .dmg)" -quiet
|
||||
exit $installer_exit_code
|
||||
displayName: install
|
||||
condition: ne(variables.CACHE_RESTORED, 'true')
|
||||
- script: |
|
||||
source /opt/intel/oneapi/setvars.sh
|
||||
make CC=/usr/local/opt/llvm/bin/clang FC=ifort
|
||||
|
||||
- job: OSX_NDK_ARMV7
|
||||
pool:
|
||||
vmImage: 'macOS-10.15'
|
||||
steps:
|
||||
- script: |
|
||||
brew update
|
||||
brew install --cask android-ndk
|
||||
export ANDROID_NDK_HOME=/usr/local/share/android-ndk
|
||||
make TARGET=ARMV7 ONLY_CBLAS=1 CC=$ANDROID_NDK_HOME/toolchains/llvm/prebuilt/darwin-x86_64/bin/armv7a-linux-androideabi21-clang AR=$ANDROID_NDK_HOME/toolchains/llvm/prebuilt/darwin-x86_64/bin/arm-linux-androideabi-ar HOSTCC=gcc ARM_SOFTFP_ABI=1 -j4
|
||||
|
||||
- job: ALPINE_MUSL
|
||||
pool:
|
||||
vmImage: 'ubuntu-latest'
|
||||
steps:
|
||||
- script: |
|
||||
wget 'https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.9.0/alpine-chroot-install' \
|
||||
&& echo 'e5dfbbdc0c4b3363b99334510976c86bfa6cb251 alpine-chroot-install' | sha1sum -c || exit 1
|
||||
alpine() { /alpine/enter-chroot -u "$USER" "$@"; }
|
||||
sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers sudo'
|
||||
alpine make DYNAMIC_ARCH=1 BINARY=64
|
||||
alpine make DYNAMIC_ARCH=1 BINARY=64 PREFIX=mytestdir install
|
||||
alpine ls -l mytestdir/include
|
||||
alpine echo "// tests that inclusion of openblas_config.h works with musl" >test_install.c
|
||||
alpine echo "#include <openblas_config.h>" >>test_install.c
|
||||
alpine echo "int main(){" >> test_install.c
|
||||
alpine echo "cpu_set_t* cpu_set = NULL;}" >>test_install.c
|
||||
alpine gcc -Imytestdir/include test_install.c -Lmytestdir/lib -lopenblas -lpthread -lgfortran -o test_install
|
||||
|
||||
|
||||
@@ -72,13 +72,17 @@ int main(int argc, char *argv[]){
|
||||
FLOAT *a,*work;
|
||||
FLOAT wkopt[4];
|
||||
blasint *ipiv;
|
||||
blasint m, i, j, info,lwork;
|
||||
blasint m, i, j, l, info,lwork;
|
||||
|
||||
int from = 1;
|
||||
int to = 200;
|
||||
int step = 1;
|
||||
int loops = 1;
|
||||
|
||||
double time1;
|
||||
double time1,timeg;
|
||||
|
||||
char *p;
|
||||
char btest = 'I';
|
||||
|
||||
argc--;argv++;
|
||||
|
||||
@@ -86,6 +90,9 @@ int main(int argc, char *argv[]){
|
||||
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
|
||||
if (argc > 0) { step = atol(*argv); argc--; argv++;}
|
||||
|
||||
if ((p = getenv("OPENBLAS_TEST"))) btest=*p;
|
||||
|
||||
if ((p = getenv("OPENBLAS_LOOPS"))) loops=*p;
|
||||
|
||||
fprintf(stderr, "From : %3d To : %3d Step = %3d\n", from, to, step);
|
||||
|
||||
@@ -124,32 +131,41 @@ int main(int argc, char *argv[]){
|
||||
fprintf(stderr, " SIZE FLops Time Lwork\n");
|
||||
|
||||
for(m = from; m <= to; m += step){
|
||||
|
||||
timeg = 0.;
|
||||
fprintf(stderr, " %6d : ", (int)m);
|
||||
|
||||
GETRF (&m, &m, a, &m, ipiv, &info);
|
||||
for (l = 0; l < loops; l++) {
|
||||
|
||||
if (btest == 'F') begin();
|
||||
GETRF (&m, &m, a, &m, ipiv, &info);
|
||||
if (btest == 'F') {
|
||||
end();
|
||||
timeg += getsec();
|
||||
}
|
||||
if (info) {
|
||||
fprintf(stderr, "Matrix is not singular .. %d\n", info);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
begin();
|
||||
if (btest == 'I') begin();
|
||||
|
||||
lwork = -1;
|
||||
GETRI(&m, a, &m, ipiv, wkopt, &lwork, &info);
|
||||
|
||||
lwork = (blasint)wkopt[0];
|
||||
GETRI(&m, a, &m, ipiv, work, &lwork, &info);
|
||||
end();
|
||||
if (btest == 'I') end();
|
||||
|
||||
if (info) {
|
||||
fprintf(stderr, "failed compute inverse matrix .. %d\n", info);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
time1 = getsec();
|
||||
|
||||
if (btest == 'I')
|
||||
timeg += getsec();
|
||||
|
||||
} // loops
|
||||
time1 = timeg/(double)loops;
|
||||
fprintf(stderr,
|
||||
" %10.2f MFlops : %10.2f Sec : %d\n",
|
||||
COMPSIZE * COMPSIZE * (4.0/3.0 * (double)m * (double)m *(double)m - (double)m *(double)m + 5.0/3.0* (double)m) / time1 * 1.e-6,time1,lwork);
|
||||
|
||||
@@ -72,17 +72,21 @@ int main(int argc, char *argv[]){
|
||||
FLOAT *a, *b;
|
||||
blasint *ipiv;
|
||||
|
||||
blasint m, i, j, info;
|
||||
blasint m, i, j, l, info;
|
||||
blasint unit = 1;
|
||||
|
||||
int from = 1;
|
||||
int to = 200;
|
||||
int step = 1;
|
||||
int loops = 1;
|
||||
|
||||
FLOAT maxerr;
|
||||
|
||||
double time1, time2;
|
||||
double time1, time2, timeg1,timeg2;
|
||||
|
||||
char *p;
|
||||
if ((p = getenv("OPENBLAS_LOOPS"))) loops=*p;
|
||||
|
||||
argc--;argv++;
|
||||
|
||||
if (argc > 0) { from = atol(*argv); argc--; argv++;}
|
||||
@@ -110,9 +114,9 @@ int main(int argc, char *argv[]){
|
||||
fprintf(stderr, " SIZE Residual Decompose Solve Total\n");
|
||||
|
||||
for(m = from; m <= to; m += step){
|
||||
|
||||
timeg1 = timeg2 = 0.;
|
||||
fprintf(stderr, " %6d : ", (int)m);
|
||||
|
||||
for (l = 0; l < loops; l++) {
|
||||
for(j = 0; j < m; j++){
|
||||
for(i = 0; i < m * COMPSIZE; i++){
|
||||
a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
@@ -138,7 +142,7 @@ int main(int argc, char *argv[]){
|
||||
exit(1);
|
||||
}
|
||||
|
||||
time1 = getsec();
|
||||
timeg1 += getsec();
|
||||
|
||||
begin();
|
||||
|
||||
@@ -151,8 +155,10 @@ int main(int argc, char *argv[]){
|
||||
exit(1);
|
||||
}
|
||||
|
||||
time2 = getsec();
|
||||
|
||||
timeg2 += getsec();
|
||||
} //loops
|
||||
time1=timeg1/(double)loops;
|
||||
time2=timeg2/(double)loops;
|
||||
maxerr = 0.;
|
||||
|
||||
for(i = 0; i < m; i++){
|
||||
|
||||
@@ -99,14 +99,15 @@ int main(int argc, char *argv[]){
|
||||
char *p;
|
||||
char btest = 'F';
|
||||
|
||||
blasint m, i, j, info, uplos=0;
|
||||
double flops;
|
||||
blasint m, i, j, l, info, uplos=0;
|
||||
double flops = 0.;
|
||||
|
||||
int from = 1;
|
||||
int to = 200;
|
||||
int step = 1;
|
||||
int loops = 1;
|
||||
|
||||
double time1;
|
||||
double time1, timeg;
|
||||
|
||||
argc--;argv++;
|
||||
|
||||
@@ -119,6 +120,8 @@ int main(int argc, char *argv[]){
|
||||
|
||||
if ((p = getenv("OPENBLAS_TEST"))) btest=*p;
|
||||
|
||||
if ((p = getenv("OPENBLAS_LOOPS"))) loops=*p;
|
||||
|
||||
fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = %c\n", from, to, step,*uplo[uplos]);
|
||||
|
||||
if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
|
||||
@@ -129,19 +132,21 @@ int main(int argc, char *argv[]){
|
||||
fprintf(stderr,"Out of Memory!!\n");exit(1);
|
||||
}
|
||||
|
||||
for(m = from; m <= to; m += step){
|
||||
|
||||
for(m = from; m <= to; m += step){
|
||||
timeg=0.;
|
||||
for (l = 0; l < loops; l++) {
|
||||
#ifndef COMPLEX
|
||||
if (uplos & 1) {
|
||||
for (j = 0; j < m; j++) {
|
||||
for(i = 0; i < j; i++) a[(long)i + (long)j * (long)m] = 0.;
|
||||
a[(long)j + (long)j * (long)m] = ((double) rand() / (double) RAND_MAX) + 8.;
|
||||
a[(long)j + (long)j * (long)m] = ((double) rand() / (double) RAND_MAX) + 8.;
|
||||
for(i = j + 1; i < m; i++) a[(long)i + (long)j * (long)m] = ((double) rand() / (double) RAND_MAX) - 0.5;
|
||||
}
|
||||
} else {
|
||||
for (j = 0; j < m; j++) {
|
||||
for(i = 0; i < j; i++) a[(long)i + (long)j * (long)m] = ((double) rand() / (double) RAND_MAX) - 0.5;
|
||||
a[(long)j + (long)j * (long)m] = ((double) rand() / (double) RAND_MAX) + 8.;
|
||||
a[(long)j + (long)j * (long)m] = ((double) rand() / (double) RAND_MAX) + 8.;
|
||||
for(i = j + 1; i < m; i++) a[(long)i + (long)j * (long)m] = 0.;
|
||||
}
|
||||
}
|
||||
@@ -192,8 +197,8 @@ int main(int argc, char *argv[]){
|
||||
exit(1);
|
||||
}
|
||||
|
||||
time1 = getsec();
|
||||
flops = COMPSIZE * COMPSIZE * (1.0/3.0 * (double)m * (double)m *(double)m +1.0/2.0* (double)m *(double)m + 1.0/6.0* (double)m) / time1 * 1.e-6;
|
||||
if ( btest == 'F')
|
||||
timeg += getsec();
|
||||
|
||||
if ( btest == 'S' )
|
||||
{
|
||||
@@ -214,9 +219,7 @@ int main(int argc, char *argv[]){
|
||||
fprintf(stderr, "Potrs info = %d\n", info);
|
||||
exit(1);
|
||||
}
|
||||
time1 = getsec();
|
||||
flops = COMPSIZE * COMPSIZE * (2.0 * (double)m * (double)m *(double)m ) / time1 * 1.e-6;
|
||||
|
||||
timeg += getsec();
|
||||
}
|
||||
|
||||
if ( btest == 'I' )
|
||||
@@ -232,11 +235,17 @@ int main(int argc, char *argv[]){
|
||||
fprintf(stderr, "Potri info = %d\n", info);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
time1 = getsec();
|
||||
flops = COMPSIZE * COMPSIZE * (2.0/3.0 * (double)m * (double)m *(double)m +1.0/2.0* (double)m *(double)m + 5.0/6.0* (double)m) / time1 * 1.e-6;
|
||||
timeg += getsec();
|
||||
}
|
||||
|
||||
} // loops
|
||||
|
||||
time1 = timeg/(double)loops;
|
||||
if ( btest == 'F')
|
||||
flops = COMPSIZE * COMPSIZE * (1.0/3.0 * (double)m * (double)m *(double)m +1.0/2.0* (double)m *(double)m + 1.0/6.0* (double)m) / time1 * 1.e-6;
|
||||
if ( btest == 'S')
|
||||
flops = COMPSIZE * COMPSIZE * (2.0 * (double)m * (double)m *(double)m ) / time1 * 1.e-6;
|
||||
if ( btest == 'I')
|
||||
flops = COMPSIZE * COMPSIZE * (2.0/3.0 * (double)m * (double)m *(double)m +1.0/2.0* (double)m *(double)m + 5.0/6.0* (double)m) / time1 * 1.e-6;
|
||||
fprintf(stderr, "%8d : %10.2f MFlops : %10.3f Sec : Test=%c\n",m,flops ,time1,btest);
|
||||
|
||||
|
||||
|
||||
@@ -46,14 +46,17 @@ int main(int argc, char *argv[]){
|
||||
|
||||
if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p;
|
||||
|
||||
blasint m, i, j;
|
||||
blasint m, i, j, l;
|
||||
blasint inc_x= 1;
|
||||
blasint inc_y= 1;
|
||||
int from = 1;
|
||||
int to = 200;
|
||||
int step = 1;
|
||||
int loops = 1;
|
||||
|
||||
double time1;
|
||||
/* OPENBLAS_LOOPS is a decimal repeat count.  Parse the whole string instead of
   taking the character code of its first byte ("5" used to yield 53), and keep
   the count at 1 or more because the averaged time is later divided by loops. */
if ((p = getenv("OPENBLAS_LOOPS"))) {
  loops = atoi(p);
  if (loops < 1) loops = 1;
}
|
||||
|
||||
double time1,timeg;
|
||||
|
||||
argc--;argv++;
|
||||
|
||||
@@ -85,8 +88,9 @@ int main(int argc, char *argv[]){
|
||||
|
||||
for(m = from; m <= to; m += step)
|
||||
{
|
||||
|
||||
timeg = 0.;
|
||||
fprintf(stderr, " %6d : ", (int)m);
|
||||
for (l = 0; l < loops; l++) {
|
||||
for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
|
||||
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
}
|
||||
@@ -107,8 +111,10 @@ int main(int argc, char *argv[]){
|
||||
|
||||
end();
|
||||
|
||||
time1 = getsec();
|
||||
timeg += getsec();
|
||||
} // loops
|
||||
|
||||
time1 = timeg/(double)loops;
|
||||
fprintf(stderr,
|
||||
" %10.2f MFlops\n",
|
||||
COMPSIZE * COMPSIZE * 2. * (double)m * (double)m / time1 * 1.e-6);
|
||||
|
||||
@@ -56,17 +56,20 @@ int main(int argc, char *argv[]){
|
||||
|
||||
char uplo='U';
|
||||
char trans='N';
|
||||
|
||||
|
||||
if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p;
|
||||
if ((p = getenv("OPENBLAS_TRANS"))) trans=*p;
|
||||
|
||||
blasint m, i, j;
|
||||
blasint m, i, j, l;
|
||||
|
||||
int from = 1;
|
||||
int to = 200;
|
||||
int step = 1;
|
||||
int loops = 1;
|
||||
|
||||
double time1;
|
||||
/* OPENBLAS_LOOPS is a decimal repeat count.  Parse the whole string instead of
   taking the character code of its first byte ("5" used to yield 53), and keep
   the count at 1 or more because the averaged time is later divided by loops. */
if ((p = getenv("OPENBLAS_LOOPS"))) {
  loops = atoi(p);
  if (loops < 1) loops = 1;
}
|
||||
|
||||
double time1,timeg;
|
||||
|
||||
argc--;argv++;
|
||||
|
||||
@@ -95,9 +98,12 @@ int main(int argc, char *argv[]){
|
||||
|
||||
for(m = from; m <= to; m += step)
|
||||
{
|
||||
timeg = 0.;
|
||||
|
||||
fprintf(stderr, " %6d : ", (int)m);
|
||||
|
||||
for(l = 0; l < loops; l++) {
|
||||
|
||||
for(j = 0; j < m; j++){
|
||||
for(i = 0; i < m * COMPSIZE; i++){
|
||||
a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
@@ -111,8 +117,10 @@ int main(int argc, char *argv[]){
|
||||
|
||||
end();
|
||||
|
||||
time1 = getsec();
|
||||
|
||||
timeg += getsec();
|
||||
|
||||
} //loops
|
||||
time1 = timeg / (double)loops;
|
||||
fprintf(stderr,
|
||||
" %10.2f MFlops\n",
|
||||
COMPSIZE * COMPSIZE * 1. * (double)m * (double)m * (double)m / time1 * 1.e-6);
|
||||
|
||||
53
c_check
53
c_check
@@ -82,18 +82,19 @@ $os = Interix if ($data =~ /OS_INTERIX/);
|
||||
$os = Android if ($data =~ /OS_ANDROID/);
|
||||
$os = Haiku if ($data =~ /OS_HAIKU/);
|
||||
|
||||
$architecture = x86 if ($data =~ /ARCH_X86/);
|
||||
$architecture = x86_64 if ($data =~ /ARCH_X86_64/);
|
||||
$architecture = power if ($data =~ /ARCH_POWER/);
|
||||
$architecture = mips if ($data =~ /ARCH_MIPS/);
|
||||
$architecture = mips64 if ($data =~ /ARCH_MIPS64/);
|
||||
$architecture = alpha if ($data =~ /ARCH_ALPHA/);
|
||||
$architecture = sparc if ($data =~ /ARCH_SPARC/);
|
||||
$architecture = ia64 if ($data =~ /ARCH_IA64/);
|
||||
$architecture = arm if ($data =~ /ARCH_ARM/);
|
||||
$architecture = arm64 if ($data =~ /ARCH_ARM64/);
|
||||
$architecture = zarch if ($data =~ /ARCH_ZARCH/);
|
||||
$architecture = riscv64 if ($data =~ /ARCH_RISCV64/);
|
||||
$architecture = x86 if ($data =~ /ARCH_X86/);
|
||||
$architecture = x86_64 if ($data =~ /ARCH_X86_64/);
|
||||
$architecture = power if ($data =~ /ARCH_POWER/);
|
||||
$architecture = mips if ($data =~ /ARCH_MIPS/);
|
||||
$architecture = mips64 if ($data =~ /ARCH_MIPS64/);
|
||||
$architecture = alpha if ($data =~ /ARCH_ALPHA/);
|
||||
$architecture = sparc if ($data =~ /ARCH_SPARC/);
|
||||
$architecture = ia64 if ($data =~ /ARCH_IA64/);
|
||||
$architecture = arm if ($data =~ /ARCH_ARM/);
|
||||
$architecture = arm64 if ($data =~ /ARCH_ARM64/);
|
||||
$architecture = zarch if ($data =~ /ARCH_ZARCH/);
|
||||
$architecture = riscv64 if ($data =~ /ARCH_RISCV64/);
|
||||
$architecture = loongarch64 if ($data =~ /ARCH_LOONGARCH64/);
|
||||
|
||||
$defined = 0;
|
||||
|
||||
@@ -143,6 +144,11 @@ if ($architecture eq "riscv64") {
|
||||
$binary = 64;
|
||||
}
|
||||
|
||||
if ($architecture eq "loongarch64") {
|
||||
$defined = 1;
|
||||
$binary = 64;
|
||||
}
|
||||
|
||||
if ($compiler eq "PGI") {
|
||||
$compiler_name .= " -tp p7" if ($binary eq "32");
|
||||
$compiler_name .= " -tp p7-64" if ($binary eq "64");
|
||||
@@ -215,17 +221,18 @@ if (($architecture eq "mips") || ($architecture eq "mips64")) {
|
||||
}
|
||||
}
|
||||
|
||||
$architecture = x86 if ($data =~ /ARCH_X86/);
|
||||
$architecture = x86_64 if ($data =~ /ARCH_X86_64/);
|
||||
$architecture = power if ($data =~ /ARCH_POWER/);
|
||||
$architecture = mips if ($data =~ /ARCH_MIPS/);
|
||||
$architecture = mips64 if ($data =~ /ARCH_MIPS64/);
|
||||
$architecture = alpha if ($data =~ /ARCH_ALPHA/);
|
||||
$architecture = sparc if ($data =~ /ARCH_SPARC/);
|
||||
$architecture = ia64 if ($data =~ /ARCH_IA64/);
|
||||
$architecture = arm if ($data =~ /ARCH_ARM/);
|
||||
$architecture = arm64 if ($data =~ /ARCH_ARM64/);
|
||||
$architecture = zarch if ($data =~ /ARCH_ZARCH/);
|
||||
$architecture = x86 if ($data =~ /ARCH_X86/);
|
||||
$architecture = x86_64 if ($data =~ /ARCH_X86_64/);
|
||||
$architecture = power if ($data =~ /ARCH_POWER/);
|
||||
$architecture = mips if ($data =~ /ARCH_MIPS/);
|
||||
$architecture = mips64 if ($data =~ /ARCH_MIPS64/);
|
||||
$architecture = alpha if ($data =~ /ARCH_ALPHA/);
|
||||
$architecture = sparc if ($data =~ /ARCH_SPARC/);
|
||||
$architecture = ia64 if ($data =~ /ARCH_IA64/);
|
||||
$architecture = arm if ($data =~ /ARCH_ARM/);
|
||||
$architecture = arm64 if ($data =~ /ARCH_ARM64/);
|
||||
$architecture = zarch if ($data =~ /ARCH_ZARCH/);
|
||||
$architecture = loongarch64 if ($data =~ /ARCH_LOONGARCH64/);
|
||||
|
||||
$binformat = bin32;
|
||||
$binformat = bin64 if ($data =~ /BINARY_64/);
|
||||
|
||||
@@ -44,7 +44,7 @@ endif ()
|
||||
|
||||
if (DYNAMIC_ARCH)
|
||||
if (ARM64)
|
||||
set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110)
|
||||
set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA55 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110)
|
||||
if (DYNAMIC_LIST)
|
||||
set(DYNAMIC_CORE ARMV8 ${DYNAMIC_LIST})
|
||||
endif ()
|
||||
|
||||
@@ -124,9 +124,9 @@ if (NOT DYNAMIC_ARCH)
|
||||
if (HAVE_AVX)
|
||||
set (CCOMMON_OPT "${CCOMMON_OPT} -mavx")
|
||||
endif ()
|
||||
if (HAVE_FMA3)
|
||||
set (CCOMMON_OPT "${CCOMMON_OPT} -mfma")
|
||||
endif ()
|
||||
# if (HAVE_FMA3)
|
||||
#set (CCOMMON_OPT "${CCOMMON_OPT} -mfma")
|
||||
#endif ()
|
||||
if (HAVE_SSE)
|
||||
set (CCOMMON_OPT "${CCOMMON_OPT} -msse")
|
||||
endif ()
|
||||
|
||||
@@ -66,7 +66,7 @@ set(SLASRC
|
||||
slaqgb.f slaqge.f slaqp2.f slaqps.f slaqsb.f slaqsp.f slaqsy.f
|
||||
slaqr0.f slaqr1.f slaqr2.f slaqr3.f slaqr4.f slaqr5.f
|
||||
slaqtr.f slar1v.f slar2v.f ilaslr.f ilaslc.f
|
||||
slarf.f slarfb.f slarfg.f slarfgp.f slarft.f slarfx.f slarfy.f slargv.f
|
||||
slarf.f slarfb.f slarfb_gett.f slarfg.f slarfgp.f slarft.f slarfx.f slarfy.f slargv.f
|
||||
slarrv.f slartv.f
|
||||
slarz.f slarzb.f slarzt.f slasy2.f
|
||||
slasyf.f slasyf_rook.f slasyf_rk.f slasyf_aa.f
|
||||
@@ -112,14 +112,14 @@ set(SLASRC
|
||||
sgeqrt.f sgeqrt2.f sgeqrt3.f sgemqrt.f
|
||||
stpqrt.f stpqrt2.f stpmqrt.f stprfb.f
|
||||
sgelqt.f sgelqt3.f sgemlqt.f
|
||||
sgetsls.f sgeqr.f slatsqr.f slamtsqr.f sgemqr.f
|
||||
sgetsls.f sgetsqrhrt.f sgeqr.f slatsqr.f slamtsqr.f sgemqr.f
|
||||
sgelq.f slaswlq.f slamswlq.f sgemlq.f
|
||||
stplqt.f stplqt2.f stpmlqt.f
|
||||
ssytrd_2stage.f ssytrd_sy2sb.f ssytrd_sb2st.F ssb2st_kernels.f
|
||||
ssyevd_2stage.f ssyev_2stage.f ssyevx_2stage.f ssyevr_2stage.f
|
||||
ssbev_2stage.f ssbevx_2stage.f ssbevd_2stage.f ssygv_2stage.f
|
||||
sgesvdq.f slaorhr_col_getrfnp.f
|
||||
slaorhr_col_getrfnp2.f sorgtsqr.f sorhr_col.f )
|
||||
slaorhr_col_getrfnp2.f sorgtsqr.f sorgtsqr_row.f sorhr_col.f )
|
||||
|
||||
set(SXLASRC sgesvxx.f sgerfsx.f sla_gerfsx_extended.f sla_geamv.f
|
||||
sla_gercond.f sla_gerpvgrw.f ssysvxx.f ssyrfsx.f
|
||||
@@ -171,7 +171,7 @@ set(CLASRC
|
||||
claqhb.f claqhe.f claqhp.f claqp2.f claqps.f claqsb.f
|
||||
claqr0.f claqr1.f claqr2.f claqr3.f claqr4.f claqr5.f
|
||||
claqsp.f claqsy.f clar1v.f clar2v.f ilaclr.f ilaclc.f
|
||||
clarf.f clarfb.f clarfg.f clarfgp.f clarft.f
|
||||
clarf.f clarfb.f clarfb_gett.f clarfg.f clarfgp.f clarft.f
|
||||
clarfx.f clarfy.f clargv.f clarnv.f clarrv.f clartg.f clartv.f
|
||||
clarz.f clarzb.f clarzt.f clascl.f claset.f clasr.f classq.f
|
||||
clasyf.f clasyf_rook.f clasyf_rk.f clasyf_aa.f
|
||||
@@ -209,14 +209,14 @@ set(CLASRC
|
||||
cgeqrt.f cgeqrt2.f cgeqrt3.f cgemqrt.f
|
||||
ctpqrt.f ctpqrt2.f ctpmqrt.f ctprfb.f
|
||||
cgelqt.f cgelqt3.f cgemlqt.f
|
||||
cgetsls.f cgeqr.f clatsqr.f clamtsqr.f cgemqr.f
|
||||
cgetsls.f cgetsqrhrt.f cgeqr.f clatsqr.f clamtsqr.f cgemqr.f
|
||||
cgelq.f claswlq.f clamswlq.f cgemlq.f
|
||||
ctplqt.f ctplqt2.f ctpmlqt.f
|
||||
chetrd_2stage.f chetrd_he2hb.f chetrd_hb2st.F chb2st_kernels.f
|
||||
cheevd_2stage.f cheev_2stage.f cheevx_2stage.f cheevr_2stage.f
|
||||
chbev_2stage.f chbevx_2stage.f chbevd_2stage.f chegv_2stage.f
|
||||
cgesvdq.f claunhr_col_getrfnp.f claunhr_col_getrfnp2.f
|
||||
cungtsqr.f cunhr_col.f )
|
||||
cungtsqr.f cungtsqr_row.f cunhr_col.f )
|
||||
|
||||
set(CXLASRC cgesvxx.f cgerfsx.f cla_gerfsx_extended.f cla_geamv.f
|
||||
cla_gercond_c.f cla_gercond_x.f cla_gerpvgrw.f
|
||||
@@ -253,7 +253,7 @@ set(DLASRC
|
||||
dlaqgb.f dlaqge.f dlaqp2.f dlaqps.f dlaqsb.f dlaqsp.f dlaqsy.f
|
||||
dlaqr0.f dlaqr1.f dlaqr2.f dlaqr3.f dlaqr4.f dlaqr5.f
|
||||
dlaqtr.f dlar1v.f dlar2v.f iladlr.f iladlc.f
|
||||
dlarf.f dlarfb.f dlarfg.f dlarfgp.f dlarft.f dlarfx.f dlarfy.f
|
||||
dlarf.f dlarfb.f dlarfb_gett.f dlarfg.f dlarfgp.f dlarft.f dlarfx.f dlarfy.f
|
||||
dlargv.f dlarrv.f dlartv.f
|
||||
dlarz.f dlarzb.f dlarzt.f dlasy2.f
|
||||
dlasyf.f dlasyf_rook.f dlasyf_rk.f dlasyf_aa.f
|
||||
@@ -300,14 +300,14 @@ set(DLASRC
|
||||
dgeqrt.f dgeqrt2.f dgeqrt3.f dgemqrt.f
|
||||
dtpqrt.f dtpqrt2.f dtpmqrt.f dtprfb.f
|
||||
dgelqt.f dgelqt3.f dgemlqt.f
|
||||
dgetsls.f dgeqr.f dlatsqr.f dlamtsqr.f dgemqr.f
|
||||
dgetsls.f dgetsqrhrt.f dgeqr.f dlatsqr.f dlamtsqr.f dgemqr.f
|
||||
dgelq.f dlaswlq.f dlamswlq.f dgemlq.f
|
||||
dtplqt.f dtplqt2.f dtpmlqt.f
|
||||
dsytrd_2stage.f dsytrd_sy2sb.f dsytrd_sb2st.F dsb2st_kernels.f
|
||||
dsyevd_2stage.f dsyev_2stage.f dsyevx_2stage.f dsyevr_2stage.f
|
||||
dsbev_2stage.f dsbevx_2stage.f dsbevd_2stage.f dsygv_2stage.f
|
||||
dcombssq.f dgesvdq.f dlaorhr_col_getrfnp.f
|
||||
dlaorhr_col_getrfnp2.f dorgtsqr.f dorhr_col.f )
|
||||
dlaorhr_col_getrfnp2.f dorgtsqr.f dorgtsqr_row.f dorhr_col.f )
|
||||
|
||||
set(DXLASRC dgesvxx.f dgerfsx.f dla_gerfsx_extended.f dla_geamv.f
|
||||
dla_gercond.f dla_gerpvgrw.f dsysvxx.f dsyrfsx.f
|
||||
@@ -360,7 +360,7 @@ set(ZLASRC
|
||||
zlaqhb.f zlaqhe.f zlaqhp.f zlaqp2.f zlaqps.f zlaqsb.f
|
||||
zlaqr0.f zlaqr1.f zlaqr2.f zlaqr3.f zlaqr4.f zlaqr5.f
|
||||
zlaqsp.f zlaqsy.f zlar1v.f zlar2v.f ilazlr.f ilazlc.f
|
||||
zlarcm.f zlarf.f zlarfb.f
|
||||
zlarcm.f zlarf.f zlarfb.f zlarfb_gett.f
|
||||
zlarfg.f zlarfgp.f zlarft.f
|
||||
zlarfx.f zlarfy.f zlargv.f zlarnv.f zlarrv.f zlartg.f zlartv.f
|
||||
zlarz.f zlarzb.f zlarzt.f zlascl.f zlaset.f zlasr.f
|
||||
@@ -402,13 +402,13 @@ set(ZLASRC
|
||||
ztpqrt.f ztpqrt2.f ztpmqrt.f ztprfb.f
|
||||
ztplqt.f ztplqt2.f ztpmlqt.f
|
||||
zgelqt.f zgelqt3.f zgemlqt.f
|
||||
zgetsls.f zgeqr.f zlatsqr.f zlamtsqr.f zgemqr.f
|
||||
zgetsls.f zgetsqrhrt.f zgeqr.f zlatsqr.f zlamtsqr.f zgemqr.f
|
||||
zgelq.f zlaswlq.f zlamswlq.f zgemlq.f
|
||||
zhetrd_2stage.f zhetrd_he2hb.f zhetrd_hb2st.F zhb2st_kernels.f
|
||||
zheevd_2stage.f zheev_2stage.f zheevx_2stage.f zheevr_2stage.f
|
||||
zhbev_2stage.f zhbevx_2stage.f zhbevd_2stage.f zhegv_2stage.f
|
||||
zgesvdq.f zlaunhr_col_getrfnp.f zlaunhr_col_getrfnp2.f
|
||||
zungtsqr.f zunhr_col.f)
|
||||
zungtsqr.f zungtsqr_row.f zunhr_col.f)
|
||||
|
||||
set(ZXLASRC zgesvxx.f zgerfsx.f zla_gerfsx_extended.f zla_geamv.f
|
||||
zla_gercond_c.f zla_gercond_x.f zla_gerpvgrw.f zsysvxx.f zsyrfsx.f
|
||||
|
||||
@@ -114,6 +114,8 @@ set(CSRC
|
||||
lapacke_cgetrs_work.c
|
||||
lapacke_cgetsls.c
|
||||
lapacke_cgetsls_work.c
|
||||
lapacke_cgetsqrhrt.c
|
||||
lapacke_cgetsqrhrt_work.c
|
||||
lapacke_cggbak.c
|
||||
lapacke_cggbak_work.c
|
||||
lapacke_cggbal.c
|
||||
@@ -590,6 +592,8 @@ set(CSRC
|
||||
lapacke_cungrq_work.c
|
||||
lapacke_cungtr.c
|
||||
lapacke_cungtr_work.c
|
||||
lapacke_cungtsqr_row.c
|
||||
lapacke_cungtsqr_row_work.c
|
||||
lapacke_cunmbr.c
|
||||
lapacke_cunmbr_work.c
|
||||
lapacke_cunmhr.c
|
||||
@@ -735,6 +739,8 @@ set(DSRC
|
||||
lapacke_dgetrs_work.c
|
||||
lapacke_dgetsls.c
|
||||
lapacke_dgetsls_work.c
|
||||
lapacke_dgetsqrhrt.c
|
||||
lapacke_dgetsqrhrt_work.c
|
||||
lapacke_dggbak.c
|
||||
lapacke_dggbak_work.c
|
||||
lapacke_dggbal.c
|
||||
@@ -862,6 +868,8 @@ set(DSRC
|
||||
lapacke_dorgrq_work.c
|
||||
lapacke_dorgtr.c
|
||||
lapacke_dorgtr_work.c
|
||||
lapacke_dorgtsqr_row.c
|
||||
lapacke_dorgtsqr_row_work.c
|
||||
lapacke_dormbr.c
|
||||
lapacke_dormbr_work.c
|
||||
lapacke_dormhr.c
|
||||
@@ -1309,6 +1317,8 @@ set(SSRC
|
||||
lapacke_sgetrs_work.c
|
||||
lapacke_sgetsls.c
|
||||
lapacke_sgetsls_work.c
|
||||
lapacke_sgetsqrhrt.c
|
||||
lapacke_sgetsqrhrt_work.c
|
||||
lapacke_sggbak.c
|
||||
lapacke_sggbak_work.c
|
||||
lapacke_sggbal.c
|
||||
@@ -1435,6 +1445,8 @@ set(SSRC
|
||||
lapacke_sorgrq_work.c
|
||||
lapacke_sorgtr.c
|
||||
lapacke_sorgtr_work.c
|
||||
lapacke_sorgtsqr_row.c
|
||||
lapacke_sorgtsqr_row_work.c
|
||||
lapacke_sormbr.c
|
||||
lapacke_sormbr_work.c
|
||||
lapacke_sormhr.c
|
||||
@@ -1877,6 +1889,8 @@ set(ZSRC
|
||||
lapacke_zgetrs_work.c
|
||||
lapacke_zgetsls.c
|
||||
lapacke_zgetsls_work.c
|
||||
lapacke_zgetsqrhrt.c
|
||||
lapacke_zgetsqrhrt_work.c
|
||||
lapacke_zggbak.c
|
||||
lapacke_zggbak_work.c
|
||||
lapacke_zggbal.c
|
||||
@@ -2351,6 +2365,8 @@ set(ZSRC
|
||||
lapacke_zungrq_work.c
|
||||
lapacke_zungtr.c
|
||||
lapacke_zungtr_work.c
|
||||
lapacke_zungtsqr_row.c
|
||||
lapacke_zungtsqr_row_work.c
|
||||
lapacke_zunmbr.c
|
||||
lapacke_zunmbr_work.c
|
||||
lapacke_zunmhr.c
|
||||
|
||||
@@ -177,7 +177,7 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS
|
||||
set(ZGEMM_UNROLL_M 4)
|
||||
set(ZGEMM_UNROLL_N 4)
|
||||
set(SYMV_P 16)
|
||||
elseif ("${TCORE}" STREQUAL "CORTEXA57" OR "${TCORE}" STREQUAL "CORTEXA53")
|
||||
elseif ("${TCORE}" STREQUAL "CORTEXA57" OR "${TCORE}" STREQUAL "CORTEXA53" OR "${TCORE}" STREQUAL "CORTEXA55")
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define L1_CODE_SIZE\t32768\n"
|
||||
"#define L1_CODE_LINESIZE\t64\n"
|
||||
|
||||
@@ -39,7 +39,7 @@ if (DEFINED BINARY AND DEFINED TARGET AND BINARY EQUAL 32)
|
||||
if (${TARGET} STREQUAL "BULLDOZER" OR ${TARGET} STREQUAL "PILEDRIVER" OR ${TARGET} STREQUAL "ZEN")
|
||||
set(TARGET "BARCELONA")
|
||||
endif ()
|
||||
if (${TARGET} STREQUAL "ARMV8" OR ${TARGET} STREQUAL "CORTEXA57" OR ${TARGET} STREQUAL "CORTEXA53")
|
||||
if (${TARGET} STREQUAL "ARMV8" OR ${TARGET} STREQUAL "CORTEXA57" OR ${TARGET} STREQUAL "CORTEXA53" OR ${TARGET} STREQUAL "CORTEXA55")
|
||||
set(TARGET "ARMV7")
|
||||
endif ()
|
||||
endif ()
|
||||
@@ -186,11 +186,11 @@ if (DEFINED TARGET)
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2")
|
||||
endif()
|
||||
endif()
|
||||
if (DEFINED HAVE_FMA3)
|
||||
if (NOT NO_AVX2)
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mfma")
|
||||
endif()
|
||||
endif()
|
||||
# if (DEFINED HAVE_FMA3)
|
||||
# if (NOT NO_AVX2)
|
||||
# set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mfma")
|
||||
# endif()
|
||||
# endif()
|
||||
if (DEFINED HAVE_SSE)
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse")
|
||||
endif()
|
||||
@@ -299,6 +299,10 @@ if (NO_AVX2)
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX2")
|
||||
endif ()
|
||||
|
||||
if (NO_AVX512)
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX512")
|
||||
endif ()
|
||||
|
||||
if (USE_THREAD)
|
||||
# USE_SIMPLE_THREADED_LEVEL3 = 1
|
||||
# NO_AFFINITY = 1
|
||||
|
||||
@@ -254,6 +254,19 @@ function(GenerateNamedObjects sources_in)
|
||||
# now add the object and set the defines
|
||||
set(obj_defines ${defines_in})
|
||||
|
||||
list(FIND obj_defines "RC" def_idx)
|
||||
if (${def_idx} GREATER -1)
|
||||
# list(REMOVE_AT ${obj_defines} ${def_idx})
|
||||
list (REMOVE_ITEM obj_defines "RC")
|
||||
list(APPEND obj_defines "RC=RC")
|
||||
endif ()
|
||||
list(FIND obj_defines "CR" def_idx)
|
||||
if (${def_idx} GREATER -1)
|
||||
# list(REMOVE_AT ${obj_defines} ${def_idx})
|
||||
list (REMOVE_ITEM obj_defines "CR")
|
||||
list(APPEND obj_defines "CR=CR")
|
||||
endif ()
|
||||
|
||||
if (use_cblas)
|
||||
set(obj_name "cblas_${obj_name}")
|
||||
list(APPEND obj_defines "CBLAS")
|
||||
@@ -298,7 +311,15 @@ function(GenerateNamedObjects sources_in)
|
||||
configure_file(${new_source_file}.tmp ${new_source_file} COPYONLY)
|
||||
file(REMOVE ${new_source_file}.tmp)
|
||||
list(APPEND SRC_LIST_OUT ${new_source_file})
|
||||
|
||||
message (STATUS ${new_source_file})
|
||||
if (DEFINED HAVE_FMA3)
|
||||
if ( ${new_source_file} MATCHES "(s|d?)rot_k.*c")
|
||||
set_source_files_properties(${new_source_file} PROPERTIES COMPILE_OPTIONS "-mfma")
|
||||
endif ()
|
||||
if ( ${new_source_file} MATCHES "dgemv_t_k.*c")
|
||||
set_source_files_properties(${new_source_file} PROPERTIES COMPILE_OPTIONS "-mfma")
|
||||
endif ()
|
||||
endif ()
|
||||
endforeach ()
|
||||
endforeach ()
|
||||
|
||||
|
||||
15
common.h
15
common.h
@@ -416,6 +416,15 @@ please https://github.com/xianyi/OpenBLAS/issues/246
|
||||
#include "common_alpha.h"
|
||||
#endif
|
||||
|
||||
/* Pull in <cet.h> only on x86 / x86_64 builds where the compiler defines
   __CET__ and supports __has_include, and only if the header actually exists.
   NOTE(review): <cet.h> is presumably what supplies _CET_ENDBR - confirm. */
#if (defined(ARCH_X86) || defined(ARCH_X86_64)) && defined(__CET__) && defined(__has_include)
|
||||
#if __has_include(<cet.h>)
|
||||
#include <cet.h>
|
||||
#endif
|
||||
#endif
|
||||
/* Fallback: when nothing above defined _CET_ENDBR, make it expand to nothing
   so code that names it unconditionally still preprocesses. */
#ifndef _CET_ENDBR
|
||||
#define _CET_ENDBR
|
||||
#endif
|
||||
|
||||
#ifdef ARCH_X86
|
||||
#include "common_x86.h"
|
||||
#endif
|
||||
@@ -440,7 +449,7 @@ please https://github.com/xianyi/OpenBLAS/issues/246
|
||||
#include "common_mips.h"
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
#ifdef ARCH_RISCV64
|
||||
#include "common_riscv64.h"
|
||||
#endif
|
||||
@@ -461,6 +470,10 @@ please https://github.com/xianyi/OpenBLAS/issues/246
|
||||
#include "common_zarch.h"
|
||||
#endif
|
||||
|
||||
#ifdef ARCH_LOONGARCH64
|
||||
#include "common_loongarch64.h"
|
||||
#endif
|
||||
|
||||
#ifndef ASSEMBLER
|
||||
#ifdef OS_WINDOWSSTORE
|
||||
typedef char env_var_t[MAX_PATH];
|
||||
|
||||
@@ -709,6 +709,13 @@ int BLASFUNC(cpotrf)(char *, blasint *, float *, blasint *, blasint *);
|
||||
int BLASFUNC(zpotrf)(char *, blasint *, double *, blasint *, blasint *);
|
||||
int BLASFUNC(xpotrf)(char *, blasint *, xdouble *, blasint *, blasint *);
|
||||
|
||||
int BLASFUNC(spotri)(char *, blasint *, float *, blasint *, blasint *);
|
||||
int BLASFUNC(dpotri)(char *, blasint *, double *, blasint *, blasint *);
|
||||
int BLASFUNC(qpotri)(char *, blasint *, xdouble *, blasint *, blasint *);
|
||||
int BLASFUNC(cpotri)(char *, blasint *, float *, blasint *, blasint *);
|
||||
int BLASFUNC(zpotri)(char *, blasint *, double *, blasint *, blasint *);
|
||||
int BLASFUNC(xpotri)(char *, blasint *, xdouble *, blasint *, blasint *);
|
||||
|
||||
int BLASFUNC(spotrs)(char *, blasint *, blasint *, float *, blasint *, float *, blasint *, blasint *);
|
||||
int BLASFUNC(dpotrs)(char *, blasint *, blasint *, double *, blasint *, double *, blasint *, blasint *);
|
||||
int BLASFUNC(qpotrs)(char *, blasint *, blasint *, xdouble *, blasint *, xdouble *, blasint *, blasint *);
|
||||
|
||||
199
common_loongarch64.h
Normal file
199
common_loongarch64.h
Normal file
@@ -0,0 +1,199 @@
|
||||
/*****************************************************************************
|
||||
Copyright (c) 2011-2020, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written
|
||||
permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
**********************************************************************************/
|
||||
|
||||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#ifndef COMMON_LOONGARCH64
|
||||
#define COMMON_LOONGARCH64
|
||||
|
||||
#define MB __sync_synchronize()
|
||||
#define WMB __sync_synchronize()
|
||||
#define RMB __sync_synchronize()
|
||||
|
||||
#define INLINE inline
|
||||
|
||||
#ifndef ASSEMBLER
|
||||
|
||||
/* Quotient helper: evaluates x / y with the plain C division operator and
 * returns the result converted to int.  y is not checked against zero. */
static inline int blas_quickdivide(blasint x, blasint y){
  blasint quotient = x / y;
  return quotient;
}
|
||||
|
||||
#ifdef DOUBLE
|
||||
#define GET_IMAGE(res) __asm__ __volatile__("fmov.d %0, $f2" : "=f"(res) : : "memory")
|
||||
#else
|
||||
#define GET_IMAGE(res) __asm__ __volatile__("fmov.s %0, $f2" : "=f"(res) : : "memory")
|
||||
#endif
|
||||
|
||||
#define GET_IMAGE_CANCEL
|
||||
|
||||
#else
|
||||
|
||||
#ifdef DOUBLE
|
||||
#define LD fld.d
|
||||
#define ST fst.d
|
||||
#define MADD fmadd.d
|
||||
#define NMADD fnmadd.d
|
||||
#define MSUB fmsub.d
|
||||
#define NMSUB fnmsub.d
|
||||
#define ADD fadd.d
|
||||
#define SUB fsub.d
|
||||
#define MUL fmul.d
|
||||
#define MOV fmov.d
|
||||
#define CMOVT fsel
|
||||
#define MTC movgr2fr.d
|
||||
#define FABS fabs.d
|
||||
#define CMPEQ fcmp.ceq.d
|
||||
#define CMPLE fcmp.cle.d
|
||||
#define CMPLT fcmp.clt.d
|
||||
#define NEG fneg.d
|
||||
#else
|
||||
#define LD fld.s
|
||||
#define ST fst.s
|
||||
#define MADD fmadd.s
|
||||
#define NMADD fnmadd.s
|
||||
#define MSUB fmsub.s
|
||||
#define NMSUB fnmsub.s
|
||||
#define ADD fadd.s
|
||||
#define SUB fsub.s
|
||||
#define MUL fmul.s
|
||||
#define MOV fmov.s
|
||||
#define CMOVT fsel
|
||||
#define MTC movgr2fr.w
|
||||
#define FABS fabs.s
|
||||
#define CMPEQ fcmp.ceq.s
|
||||
#define CMPLE fcmp.cle.s
|
||||
#define CMPLT fcmp.clt.s
|
||||
#define NEG fneg.s
|
||||
#endif /* defined(DOUBLE) */
|
||||
|
||||
#if defined(__64BIT__) && defined(USE64BITINT)
|
||||
#define LDINT ld.d
|
||||
#define LDARG ld.d
|
||||
#define SDARG st.d
|
||||
#elif defined(__64BIT__) && !defined(USE64BITINT)
|
||||
#define LDINT ld.w
|
||||
#define LDARG ld.d
|
||||
#define SDARG st.d
|
||||
#else
|
||||
#define LDINT ld.w
|
||||
#define LDARG ld.w
|
||||
#define SDARG st.w
|
||||
#endif
|
||||
|
||||
|
||||
#ifndef F_INTERFACE
|
||||
#define REALNAME ASMNAME
|
||||
#else
|
||||
#define REALNAME ASMFNAME
|
||||
#endif /* defined(F_INTERFACE) */
|
||||
|
||||
#if defined(ASSEMBLER) && !defined(NEEDPARAM)
|
||||
|
||||
#define PROLOGUE \
|
||||
.text ;\
|
||||
.align 5 ;\
|
||||
.globl REALNAME ;\
|
||||
.type REALNAME, @function ;\
|
||||
REALNAME: ;\
|
||||
|
||||
#if defined(__linux__) && defined(__ELF__)
|
||||
#define GNUSTACK .section .note.GNU-stack,"",@progbits
|
||||
#else
|
||||
#define GNUSTACK
|
||||
#endif /* defined(__linux__) && defined(__ELF__) */
|
||||
|
||||
#define EPILOGUE \
|
||||
.end REALNAME ;\
|
||||
GNUSTACK
|
||||
|
||||
#define PROFCODE
|
||||
|
||||
#define MOVT(dst, src, cc) \
|
||||
bceqz cc, 1f; \
|
||||
add.d dst, src, $r0; \
|
||||
1:
|
||||
|
||||
#endif /* defined(ASSEMBLER) && !defined(NEEDPARAM) */
|
||||
|
||||
#endif /* defined(ASSEMBLER) */
|
||||
|
||||
#define SEEK_ADDRESS
|
||||
|
||||
#define BUFFER_SIZE ( 32 << 20)
|
||||
|
||||
/* 16 KiB, the same value as FIXED_PAGESIZE; the previous shift count of 1
   evaluated to 32 bytes.  NOTE(review): confirm nothing relied on the old value. */
#define PAGESIZE (16UL << 10)
|
||||
#define FIXED_PAGESIZE (16UL << 10)
|
||||
#define HUGE_PAGESIZE ( 2 << 20)
|
||||
|
||||
#define BASE_ADDRESS (START_ADDRESS - BUFFER_SIZE * MAX_CPU_NUMBER)
|
||||
|
||||
#ifndef MAP_ANONYMOUS
|
||||
#define MAP_ANONYMOUS MAP_ANON
|
||||
#endif
|
||||
|
||||
#endif
|
||||
@@ -2490,7 +2490,8 @@
|
||||
#endif
|
||||
|
||||
#ifndef ASSEMBLER
|
||||
#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64)
|
||||
#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64)\
|
||||
|| defined(ARCH_LOONGARCH64)
|
||||
extern BLASLONG gemm_offset_a;
|
||||
extern BLASLONG gemm_offset_b;
|
||||
extern BLASLONG sbgemm_p;
|
||||
|
||||
@@ -340,7 +340,8 @@ REALNAME:
|
||||
.align 16; \
|
||||
.globl REALNAME ;\
|
||||
.type REALNAME, @function; \
|
||||
REALNAME:
|
||||
REALNAME: \
|
||||
_CET_ENDBR
|
||||
|
||||
#ifdef PROFILE
|
||||
#define PROFCODE call mcount
|
||||
|
||||
@@ -451,7 +451,8 @@ REALNAME:
|
||||
.align 512; \
|
||||
.globl REALNAME ;\
|
||||
.type REALNAME, @function; \
|
||||
REALNAME:
|
||||
REALNAME: \
|
||||
_CET_ENDBR
|
||||
|
||||
#ifdef PROFILE
|
||||
#define PROFCODE call *mcount@GOTPCREL(%rip)
|
||||
|
||||
1
cpuid.h
1
cpuid.h
@@ -54,6 +54,7 @@
|
||||
#define VENDOR_TRANSMETA 9
|
||||
#define VENDOR_NSC 10
|
||||
#define VENDOR_HYGON 11
|
||||
#define VENDOR_ZHAOXIN 12
|
||||
#define VENDOR_UNKNOWN 99
|
||||
|
||||
#define BITMASK(a, b, c) ((((a) >> (b)) & (c)))
|
||||
|
||||
@@ -36,6 +36,7 @@ size_t length=sizeof(value);
|
||||
#define CPU_ARMV8 1
|
||||
// Arm
|
||||
#define CPU_CORTEXA53 2
|
||||
#define CPU_CORTEXA55 14
|
||||
#define CPU_CORTEXA57 3
|
||||
#define CPU_CORTEXA72 4
|
||||
#define CPU_CORTEXA73 5
|
||||
@@ -67,7 +68,8 @@ static char *cpuname[] = {
|
||||
"EMAG8180",
|
||||
"NEOVERSEN1",
|
||||
"THUNDERX3T110",
|
||||
"VORTEX"
|
||||
"VORTEX",
|
||||
"CORTEXA55"
|
||||
};
|
||||
|
||||
static char *cpuname_lower[] = {
|
||||
@@ -84,7 +86,8 @@ static char *cpuname_lower[] = {
|
||||
"emag8180",
|
||||
"neoversen1",
|
||||
"thunderx3t110",
|
||||
"vortex"
|
||||
"vortex",
|
||||
"cortexa55"
|
||||
};
|
||||
|
||||
int get_feature(char *search)
|
||||
@@ -161,6 +164,8 @@ int detect(void)
|
||||
return CPU_CORTEXA73;
|
||||
else if (strstr(cpu_part, "0xd0c"))
|
||||
return CPU_NEOVERSEN1;
|
||||
else if (strstr(cpu_part, "0xd05"))
|
||||
return CPU_CORTEXA55;
|
||||
}
|
||||
// Qualcomm
|
||||
else if (strstr(cpu_implementer, "0x51") && strstr(cpu_part, "0xc00"))
|
||||
@@ -281,6 +286,7 @@ void get_cpuconfig(void)
|
||||
{
|
||||
|
||||
case CPU_CORTEXA53:
|
||||
case CPU_CORTEXA55:
|
||||
printf("#define %s\n", cpuname[d]);
|
||||
// Fall-through
|
||||
case CPU_ARMV8:
|
||||
|
||||
110
cpuid_loongarch64.c
Normal file
110
cpuid_loongarch64.c
Normal file
@@ -0,0 +1,110 @@
|
||||
/*****************************************************************************
|
||||
Copyright (c) 2011-2020, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written
|
||||
permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
**********************************************************************************/
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
/* Core identifiers returned by detect(); also used as indices into cpuname[]. */
#define CPU_UNKNOWN     0
#define CPU_LOONGSON3R5 1

/* CPUCFG configuration word number that detect() queries. */
#define LOONGARCH_CFG2  0x02
/* Bit mask detect() tests in that word.  Parenthesized so the macro expands
   as a single value inside any surrounding expression. */
#define LOONGARCH_LASX  (1 << 7)
|
||||
|
||||
/* Printable core names, indexed by the CPU_* value that detect() returns. */
static char *cpuname[] = { "UNKNOWN", "LOONGSON3R5" };
|
||||
|
||||
int detect(void) {
|
||||
uint32_t reg = 0;
|
||||
|
||||
__asm__ volatile (
|
||||
"cpucfg %0, %1 \n\t"
|
||||
: "+&r"(reg)
|
||||
: "r"(LOONGARCH_CFG2)
|
||||
);
|
||||
|
||||
if (reg & LOONGARCH_LASX)
|
||||
return CPU_LOONGSON3R5;
|
||||
else
|
||||
return CPU_UNKNOWN;
|
||||
}
|
||||
|
||||
char *get_corename(void) {
|
||||
return cpuname[detect()];
|
||||
}
|
||||
|
||||
/* Print the architecture identifier to stdout, without a trailing newline. */
void get_architecture(void) {
  static const char arch_name[] = "LOONGARCH64";

  printf("%s", arch_name);
}
|
||||
|
||||
void get_subarchitecture(void) {
|
||||
if (detect() == CPU_LOONGSON3R5) {
|
||||
printf("LOONGSON3R5");
|
||||
} else {
|
||||
printf("UNKNOWN");
|
||||
}
|
||||
}
|
||||
|
||||
/* Print the sub-directory name for this architecture to stdout, without a trailing newline. */
void get_subdirname(void) {
  static const char subdir[] = "loongarch64";

  printf("%s", subdir);
}
|
||||
|
||||
/*
 * Print the core-selection and cache/TLB configuration macros to stdout,
 * one "#define" per line.
 *
 * Every core, recognised or not, gets the same LOONGSON3R5 parameter set,
 * so the output does not depend on the detected core and no CPU probe is
 * needed here.  If a second parameter set is ever introduced, branch on
 * detect() at this point.
 */
void get_cpuconfig(void) {
  /* Adjacent literals form one string; it contains no '%' conversions. */
  printf("#define LOONGSON3R5\n"
         "#define L1_DATA_SIZE 65536\n"
         "#define L1_DATA_LINESIZE 64\n"
         "#define L2_SIZE 1048576\n"
         "#define L2_LINESIZE 64\n"
         "#define DTB_DEFAULT_ENTRIES 64\n"
         "#define DTB_SIZE 4096\n"
         "#define L2_ASSOCIATIVE 16\n");
}
|
||||
|
||||
void get_libname(void){
|
||||
if (detect() == CPU_LOONGSON3R5) {
|
||||
printf("loongson3r5\n");
|
||||
} else {
|
||||
printf("loongarch64\n");
|
||||
}
|
||||
}
|
||||
78
cpuid_x86.c
78
cpuid_x86.c
@@ -283,6 +283,7 @@ int get_vendor(void){
|
||||
if (!strcmp(vendor, "CyrixInstead")) return VENDOR_CYRIX;
|
||||
if (!strcmp(vendor, "NexGenDriven")) return VENDOR_NEXGEN;
|
||||
if (!strcmp(vendor, "CentaurHauls")) return VENDOR_CENTAUR;
|
||||
if (!strcmp(vendor, " Shanghai ")) return VENDOR_ZHAOXIN;
|
||||
if (!strcmp(vendor, "RiseRiseRise")) return VENDOR_RISE;
|
||||
if (!strcmp(vendor, " SiS SiS SiS")) return VENDOR_SIS;
|
||||
if (!strcmp(vendor, "GenuineTMx86")) return VENDOR_TRANSMETA;
|
||||
@@ -1066,7 +1067,8 @@ int get_cacheinfo(int type, cache_info_t *cacheinfo){
|
||||
|
||||
if ((get_vendor() == VENDOR_AMD) ||
|
||||
(get_vendor() == VENDOR_HYGON) ||
|
||||
(get_vendor() == VENDOR_CENTAUR)) {
|
||||
(get_vendor() == VENDOR_CENTAUR) ||
|
||||
(get_vendor() == VENDOR_ZHAOXIN)) {
|
||||
cpuid(0x80000005, &eax, &ebx, &ecx, &edx);
|
||||
|
||||
LDTB.size = 4096;
|
||||
@@ -1189,7 +1191,7 @@ int get_cacheinfo(int type, cache_info_t *cacheinfo){
|
||||
|
||||
int get_cpuname(void){
|
||||
|
||||
int family, exfamily, model, vendor, exmodel;
|
||||
int family, exfamily, model, vendor, exmodel, stepping;
|
||||
|
||||
if (!have_cpuid()) return CPUTYPE_80386;
|
||||
|
||||
@@ -1197,6 +1199,7 @@ int get_cpuname(void){
|
||||
exfamily = get_cputype(GET_EXFAMILY);
|
||||
model = get_cputype(GET_MODEL);
|
||||
exmodel = get_cputype(GET_EXMODEL);
|
||||
stepping = get_cputype(GET_STEPPING);
|
||||
|
||||
vendor = get_vendor();
|
||||
|
||||
@@ -1398,6 +1401,17 @@ int get_cpuname(void){
|
||||
return CPUTYPE_SANDYBRIDGE;
|
||||
else
|
||||
return CPUTYPE_NEHALEM;
|
||||
case 10: // Ice Lake SP
|
||||
if(support_avx512_bf16())
|
||||
return CPUTYPE_COOPERLAKE;
|
||||
if(support_avx512())
|
||||
return CPUTYPE_SKYLAKEX;
|
||||
if(support_avx2())
|
||||
return CPUTYPE_HASWELL;
|
||||
if(support_avx())
|
||||
return CPUTYPE_SANDYBRIDGE;
|
||||
else
|
||||
return CPUTYPE_NEHALEM;
|
||||
}
|
||||
break;
|
||||
case 7: // family 6 exmodel 7
|
||||
@@ -1616,13 +1630,20 @@ int get_cpuname(void){
|
||||
switch (family) {
|
||||
case 0x5:
|
||||
return CPUTYPE_CENTAURC6;
|
||||
break;
|
||||
case 0x6:
|
||||
return CPUTYPE_NANO;
|
||||
break;
|
||||
|
||||
if (model == 0xf && stepping < 0xe)
|
||||
return CPUTYPE_NANO;
|
||||
return CPUTYPE_NEHALEM;
|
||||
default:
|
||||
if (family >= 0x7)
|
||||
return CPUTYPE_NEHALEM;
|
||||
else
|
||||
return CPUTYPE_VIAC3;
|
||||
}
|
||||
return CPUTYPE_VIAC3;
|
||||
}
|
||||
|
||||
if (vendor == VENDOR_ZHAOXIN){
|
||||
return CPUTYPE_NEHALEM;
|
||||
}
|
||||
|
||||
if (vendor == VENDOR_RISE){
|
||||
@@ -1855,7 +1876,7 @@ char *get_lower_cpunamechar(void){
|
||||
|
||||
int get_coretype(void){
|
||||
|
||||
int family, exfamily, model, exmodel, vendor;
|
||||
int family, exfamily, model, exmodel, vendor, stepping;
|
||||
|
||||
if (!have_cpuid()) return CORE_80486;
|
||||
|
||||
@@ -1863,6 +1884,7 @@ int get_coretype(void){
|
||||
exfamily = get_cputype(GET_EXFAMILY);
|
||||
model = get_cputype(GET_MODEL);
|
||||
exmodel = get_cputype(GET_EXMODEL);
|
||||
stepping = get_cputype(GET_STEPPING);
|
||||
|
||||
vendor = get_vendor();
|
||||
|
||||
@@ -2112,7 +2134,22 @@ int get_coretype(void){
|
||||
#endif
|
||||
else
|
||||
return CORE_NEHALEM;
|
||||
#endif
|
||||
#endif
|
||||
if (model == 10)
|
||||
#ifndef NO_AVX512
|
||||
if(support_avx512_bf16())
|
||||
return CORE_COOPERLAKE;
|
||||
return CORE_SKYLAKEX;
|
||||
#else
|
||||
if(support_avx())
|
||||
#ifndef NO_AVX2
|
||||
return CORE_HASWELL;
|
||||
#else
|
||||
return CORE_SANDYBRIDGE;
|
||||
#endif
|
||||
else
|
||||
return CORE_NEHALEM;
|
||||
#endif
|
||||
break;
|
||||
case 7:
|
||||
if (model == 10)
|
||||
@@ -2135,13 +2172,13 @@ int get_coretype(void){
|
||||
case 8:
|
||||
if (model == 12) { // Tiger Lake
|
||||
if(support_avx512())
|
||||
return CPUTYPE_SKYLAKEX;
|
||||
return CORE_SKYLAKEX;
|
||||
if(support_avx2())
|
||||
return CPUTYPE_HASWELL;
|
||||
return CORE_HASWELL;
|
||||
if(support_avx())
|
||||
return CPUTYPE_SANDYBRIDGE;
|
||||
return CORE_SANDYBRIDGE;
|
||||
else
|
||||
return CPUTYPE_NEHALEM;
|
||||
return CORE_NEHALEM;
|
||||
}
|
||||
if (model == 14) { // Kaby Lake
|
||||
if(support_avx())
|
||||
@@ -2257,10 +2294,19 @@ int get_coretype(void){
|
||||
if (vendor == VENDOR_CENTAUR) {
|
||||
switch (family) {
|
||||
case 0x6:
|
||||
return CORE_NANO;
|
||||
break;
|
||||
if (model == 0xf && stepping < 0xe)
|
||||
return CORE_NANO;
|
||||
return CORE_NEHALEM;
|
||||
default:
|
||||
if (family >= 0x7)
|
||||
return CORE_NEHALEM;
|
||||
else
|
||||
return CORE_VIAC3;
|
||||
}
|
||||
return CORE_VIAC3;
|
||||
}
|
||||
|
||||
if (vendor == VENDOR_ZHAOXIN) {
|
||||
return CORE_NEHALEM;
|
||||
}
|
||||
|
||||
return CORE_UNKNOWN;
|
||||
|
||||
4
ctest.c
4
ctest.c
@@ -157,6 +157,10 @@ ARCH_ARM64
|
||||
ARCH_RISCV64
|
||||
#endif
|
||||
|
||||
#ifdef __loongarch64
|
||||
ARCH_LOONGARCH64
|
||||
#endif
|
||||
|
||||
#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L)
|
||||
HAVE_C11
|
||||
#endif
|
||||
|
||||
@@ -4,6 +4,9 @@ include_directories(${PROJECT_BINARY_DIR})
|
||||
enable_language(Fortran)
|
||||
|
||||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DADD${BU} -DCBLAS")
|
||||
if (CMAKE_Fortran_COMPILER_ID STREQUAL GNU)
|
||||
set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -fno-tree-vectorize")
|
||||
endif()
|
||||
|
||||
if(WIN32)
|
||||
FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_cblas_helper.ps1
|
||||
|
||||
@@ -6,6 +6,9 @@ TOPDIR = ..
|
||||
include $(TOPDIR)/Makefile.system
|
||||
|
||||
override CFLAGS += -DADD$(BU) -DCBLAS
|
||||
ifeq ($(F_COMPILER),GFORTRAN)
|
||||
override FFLAGS += -fno-tree-vectorize
|
||||
endif
|
||||
override TARGET_ARCH=
|
||||
override TARGET_MACH=
|
||||
|
||||
|
||||
@@ -20,7 +20,7 @@ void F77_cgemv(int *order, char *transp, int *m, int *n,
|
||||
get_transpose_type(transp, &trans);
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n+1;
|
||||
A = (CBLAS_TEST_COMPLEX *)malloc( (*m)*LDA*sizeof( CBLAS_TEST_COMPLEX) );
|
||||
A = (CBLAS_TEST_COMPLEX *)malloc( (*m)*(size_t)LDA*sizeof( CBLAS_TEST_COMPLEX) );
|
||||
for( i=0; i<*m; i++ )
|
||||
for( j=0; j<*n; j++ ){
|
||||
A[ LDA*i+j ].real=a[ (*lda)*j+i ].real;
|
||||
@@ -50,7 +50,7 @@ void F77_cgbmv(int *order, char *transp, int *m, int *n, int *kl, int *ku,
|
||||
get_transpose_type(transp, &trans);
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *ku+*kl+2;
|
||||
A=( CBLAS_TEST_COMPLEX* )malloc((*n+*kl)*LDA*sizeof(CBLAS_TEST_COMPLEX));
|
||||
A=( CBLAS_TEST_COMPLEX* )malloc((*n+*kl)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX));
|
||||
for( i=0; i<*ku; i++ ){
|
||||
irow=*ku+*kl-i;
|
||||
jcol=(*ku)-i;
|
||||
@@ -94,7 +94,7 @@ void F77_cgeru(int *order, int *m, int *n, CBLAS_TEST_COMPLEX *alpha,
|
||||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n+1;
|
||||
A=(CBLAS_TEST_COMPLEX*)malloc((*m)*LDA*sizeof(CBLAS_TEST_COMPLEX));
|
||||
A=(CBLAS_TEST_COMPLEX*)malloc((*m)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX));
|
||||
for( i=0; i<*m; i++ )
|
||||
for( j=0; j<*n; j++ ){
|
||||
A[ LDA*i+j ].real=a[ (*lda)*j+i ].real;
|
||||
@@ -122,7 +122,7 @@ void F77_cgerc(int *order, int *m, int *n, CBLAS_TEST_COMPLEX *alpha,
|
||||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n+1;
|
||||
A=(CBLAS_TEST_COMPLEX* )malloc((*m)*LDA*sizeof(CBLAS_TEST_COMPLEX ) );
|
||||
A=(CBLAS_TEST_COMPLEX* )malloc((*m)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX ) );
|
||||
for( i=0; i<*m; i++ )
|
||||
for( j=0; j<*n; j++ ){
|
||||
A[ LDA*i+j ].real=a[ (*lda)*j+i ].real;
|
||||
@@ -154,7 +154,7 @@ void F77_chemv(int *order, char *uplow, int *n, CBLAS_TEST_COMPLEX *alpha,
|
||||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n+1;
|
||||
A = (CBLAS_TEST_COMPLEX *)malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX));
|
||||
A = (CBLAS_TEST_COMPLEX *)malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX));
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ ){
|
||||
A[ LDA*i+j ].real=a[ (*lda)*j+i ].real;
|
||||
@@ -190,7 +190,7 @@ int i,irow,j,jcol,LDA;
|
||||
*incx, beta, y, *incy );
|
||||
else {
|
||||
LDA = *k+2;
|
||||
A =(CBLAS_TEST_COMPLEX*)malloc((*n+*k)*LDA*sizeof(CBLAS_TEST_COMPLEX));
|
||||
A =(CBLAS_TEST_COMPLEX*)malloc((*n+*k)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX));
|
||||
if (uplo == CblasUpper) {
|
||||
for( i=0; i<*k; i++ ){
|
||||
irow=*k-i;
|
||||
@@ -251,8 +251,8 @@ void F77_chpmv(int *order, char *uplow, int *n, CBLAS_TEST_COMPLEX *alpha,
|
||||
beta, y, *incy);
|
||||
else {
|
||||
LDA = *n;
|
||||
A = (CBLAS_TEST_COMPLEX* )malloc(LDA*LDA*sizeof(CBLAS_TEST_COMPLEX ));
|
||||
AP = (CBLAS_TEST_COMPLEX* )malloc( (((LDA+1)*LDA)/2)*
|
||||
A = (CBLAS_TEST_COMPLEX* )malloc((size_t)LDA*LDA*sizeof(CBLAS_TEST_COMPLEX ));
|
||||
AP = (CBLAS_TEST_COMPLEX* )malloc( ((((size_t)LDA+1)*LDA)/2)*
|
||||
sizeof( CBLAS_TEST_COMPLEX ));
|
||||
if (uplo == CblasUpper) {
|
||||
for( j=0, k=0; j<*n; j++ )
|
||||
@@ -311,7 +311,7 @@ void F77_ctbmv(int *order, char *uplow, char *transp, char *diagn,
|
||||
x, *incx);
|
||||
else {
|
||||
LDA = *k+2;
|
||||
A=(CBLAS_TEST_COMPLEX *)malloc((*n+*k)*LDA*sizeof(CBLAS_TEST_COMPLEX));
|
||||
A=(CBLAS_TEST_COMPLEX *)malloc((*n+*k)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX));
|
||||
if (uplo == CblasUpper) {
|
||||
for( i=0; i<*k; i++ ){
|
||||
irow=*k-i;
|
||||
@@ -375,7 +375,7 @@ void F77_ctbsv(int *order, char *uplow, char *transp, char *diagn,
|
||||
*incx);
|
||||
else {
|
||||
LDA = *k+2;
|
||||
A=(CBLAS_TEST_COMPLEX*)malloc((*n+*k)*LDA*sizeof(CBLAS_TEST_COMPLEX ));
|
||||
A=(CBLAS_TEST_COMPLEX*)malloc((*n+*k)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX ));
|
||||
if (uplo == CblasUpper) {
|
||||
for( i=0; i<*k; i++ ){
|
||||
irow=*k-i;
|
||||
@@ -436,8 +436,8 @@ void F77_ctpmv(int *order, char *uplow, char *transp, char *diagn,
|
||||
cblas_ctpmv( CblasRowMajor, UNDEFINED, trans, diag, *n, ap, x, *incx );
|
||||
else {
|
||||
LDA = *n;
|
||||
A=(CBLAS_TEST_COMPLEX*)malloc(LDA*LDA*sizeof(CBLAS_TEST_COMPLEX));
|
||||
AP=(CBLAS_TEST_COMPLEX*)malloc((((LDA+1)*LDA)/2)*
|
||||
A=(CBLAS_TEST_COMPLEX*)malloc((size_t)LDA*LDA*sizeof(CBLAS_TEST_COMPLEX));
|
||||
AP=(CBLAS_TEST_COMPLEX*)malloc(((((size_t)LDA+1)*LDA)/2)*
|
||||
sizeof(CBLAS_TEST_COMPLEX));
|
||||
if (uplo == CblasUpper) {
|
||||
for( j=0, k=0; j<*n; j++ )
|
||||
@@ -491,8 +491,8 @@ void F77_ctpsv(int *order, char *uplow, char *transp, char *diagn,
|
||||
cblas_ctpsv( CblasRowMajor, UNDEFINED, trans, diag, *n, ap, x, *incx );
|
||||
else {
|
||||
LDA = *n;
|
||||
A=(CBLAS_TEST_COMPLEX*)malloc(LDA*LDA*sizeof(CBLAS_TEST_COMPLEX));
|
||||
AP=(CBLAS_TEST_COMPLEX*)malloc((((LDA+1)*LDA)/2)*
|
||||
A=(CBLAS_TEST_COMPLEX*)malloc((size_t)LDA*LDA*sizeof(CBLAS_TEST_COMPLEX));
|
||||
AP=(CBLAS_TEST_COMPLEX*)malloc(((((size_t)LDA+1)*LDA)/2)*
|
||||
sizeof(CBLAS_TEST_COMPLEX));
|
||||
if (uplo == CblasUpper) {
|
||||
for( j=0, k=0; j<*n; j++ )
|
||||
@@ -544,7 +544,7 @@ void F77_ctrmv(int *order, char *uplow, char *transp, char *diagn,
|
||||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA=*n+1;
|
||||
A=(CBLAS_TEST_COMPLEX*)malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX));
|
||||
A=(CBLAS_TEST_COMPLEX*)malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX));
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ ) {
|
||||
A[ LDA*i+j ].real=a[ (*lda)*j+i ].real;
|
||||
@@ -573,7 +573,7 @@ void F77_ctrsv(int *order, char *uplow, char *transp, char *diagn,
|
||||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n+1;
|
||||
A =(CBLAS_TEST_COMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX ) );
|
||||
A =(CBLAS_TEST_COMPLEX* )malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX ) );
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ ) {
|
||||
A[ LDA*i+j ].real=a[ (*lda)*j+i ].real;
|
||||
@@ -601,8 +601,8 @@ void F77_chpr(int *order, char *uplow, int *n, float *alpha,
|
||||
cblas_chpr(CblasRowMajor, UNDEFINED, *n, *alpha, x, *incx, ap );
|
||||
else {
|
||||
LDA = *n;
|
||||
A = (CBLAS_TEST_COMPLEX* )malloc(LDA*LDA*sizeof(CBLAS_TEST_COMPLEX ) );
|
||||
AP = ( CBLAS_TEST_COMPLEX* )malloc( (((LDA+1)*LDA)/2)*
|
||||
A = (CBLAS_TEST_COMPLEX* )malloc((size_t)LDA*LDA*sizeof(CBLAS_TEST_COMPLEX ) );
|
||||
AP = ( CBLAS_TEST_COMPLEX* )malloc( ((((size_t)LDA+1)*LDA)/2)*
|
||||
sizeof( CBLAS_TEST_COMPLEX ));
|
||||
if (uplo == CblasUpper) {
|
||||
for( j=0, k=0; j<*n; j++ )
|
||||
@@ -678,8 +678,8 @@ void F77_chpr2(int *order, char *uplow, int *n, CBLAS_TEST_COMPLEX *alpha,
|
||||
*incy, ap );
|
||||
else {
|
||||
LDA = *n;
|
||||
A=(CBLAS_TEST_COMPLEX*)malloc( LDA*LDA*sizeof(CBLAS_TEST_COMPLEX ) );
|
||||
AP=(CBLAS_TEST_COMPLEX*)malloc( (((LDA+1)*LDA)/2)*
|
||||
A=(CBLAS_TEST_COMPLEX*)malloc( (size_t)LDA*LDA*sizeof(CBLAS_TEST_COMPLEX ) );
|
||||
AP=(CBLAS_TEST_COMPLEX*)malloc( ((((size_t)LDA+1)*LDA)/2)*
|
||||
sizeof( CBLAS_TEST_COMPLEX ));
|
||||
if (uplo == CblasUpper) {
|
||||
for( j=0, k=0; j<*n; j++ )
|
||||
@@ -750,7 +750,7 @@ void F77_cher(int *order, char *uplow, int *n, float *alpha,
|
||||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n+1;
|
||||
A=(CBLAS_TEST_COMPLEX*)malloc((*n)*LDA*sizeof( CBLAS_TEST_COMPLEX ));
|
||||
A=(CBLAS_TEST_COMPLEX*)malloc((*n)*(size_t)LDA*sizeof( CBLAS_TEST_COMPLEX ));
|
||||
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ ) {
|
||||
@@ -784,7 +784,7 @@ void F77_cher2(int *order, char *uplow, int *n, CBLAS_TEST_COMPLEX *alpha,
|
||||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n+1;
|
||||
A= ( CBLAS_TEST_COMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX ) );
|
||||
A= ( CBLAS_TEST_COMPLEX* )malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX ) );
|
||||
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ ) {
|
||||
|
||||
@@ -19,7 +19,7 @@ void F77_dgemv(int *order, char *transp, int *m, int *n, double *alpha,
|
||||
get_transpose_type(transp, &trans);
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n+1;
|
||||
A = ( double* )malloc( (*m)*LDA*sizeof( double ) );
|
||||
A = ( double* )malloc( (*m)*(size_t)LDA*sizeof( double ) );
|
||||
for( i=0; i<*m; i++ )
|
||||
for( j=0; j<*n; j++ )
|
||||
A[ LDA*i+j ]=a[ (*lda)*j+i ];
|
||||
@@ -43,7 +43,7 @@ void F77_dger(int *order, int *m, int *n, double *alpha, double *x, int *incx,
|
||||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n+1;
|
||||
A = ( double* )malloc( (*m)*LDA*sizeof( double ) );
|
||||
A = ( double* )malloc( (*m)*(size_t)LDA*sizeof( double ) );
|
||||
|
||||
for( i=0; i<*m; i++ ) {
|
||||
for( j=0; j<*n; j++ )
|
||||
@@ -74,7 +74,7 @@ void F77_dtrmv(int *order, char *uplow, char *transp, char *diagn,
|
||||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n+1;
|
||||
A = ( double* )malloc( (*n)*LDA*sizeof( double ) );
|
||||
A = ( double* )malloc( (*n)*(size_t)LDA*sizeof( double ) );
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ )
|
||||
A[ LDA*i+j ]=a[ (*lda)*j+i ];
|
||||
@@ -102,7 +102,7 @@ void F77_dtrsv(int *order, char *uplow, char *transp, char *diagn,
|
||||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n+1;
|
||||
A = ( double* )malloc( (*n)*LDA*sizeof( double ) );
|
||||
A = ( double* )malloc( (*n)*(size_t)LDA*sizeof( double ) );
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ )
|
||||
A[ LDA*i+j ]=a[ (*lda)*j+i ];
|
||||
@@ -123,7 +123,7 @@ void F77_dsymv(int *order, char *uplow, int *n, double *alpha, double *a,
|
||||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n+1;
|
||||
A = ( double* )malloc( (*n)*LDA*sizeof( double ) );
|
||||
A = ( double* )malloc( (*n)*(size_t)LDA*sizeof( double ) );
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ )
|
||||
A[ LDA*i+j ]=a[ (*lda)*j+i ];
|
||||
@@ -146,7 +146,7 @@ void F77_dsyr(int *order, char *uplow, int *n, double *alpha, double *x,
|
||||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n+1;
|
||||
A = ( double* )malloc( (*n)*LDA*sizeof( double ) );
|
||||
A = ( double* )malloc( (*n)*(size_t)LDA*sizeof( double ) );
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ )
|
||||
A[ LDA*i+j ]=a[ (*lda)*j+i ];
|
||||
@@ -170,7 +170,7 @@ void F77_dsyr2(int *order, char *uplow, int *n, double *alpha, double *x,
|
||||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n+1;
|
||||
A = ( double* )malloc( (*n)*LDA*sizeof( double ) );
|
||||
A = ( double* )malloc( (*n)*(size_t)LDA*sizeof( double ) );
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ )
|
||||
A[ LDA*i+j ]=a[ (*lda)*j+i ];
|
||||
@@ -196,7 +196,7 @@ void F77_dgbmv(int *order, char *transp, int *m, int *n, int *kl, int *ku,
|
||||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *ku+*kl+2;
|
||||
A = ( double* )malloc( (*n+*kl)*LDA*sizeof( double ) );
|
||||
A = ( double* )malloc( (*n+*kl)*(size_t)LDA*sizeof( double ) );
|
||||
for( i=0; i<*ku; i++ ){
|
||||
irow=*ku+*kl-i;
|
||||
jcol=(*ku)-i;
|
||||
@@ -236,7 +236,7 @@ void F77_dtbmv(int *order, char *uplow, char *transp, char *diagn,
|
||||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *k+1;
|
||||
A = ( double* )malloc( (*n+*k)*LDA*sizeof( double ) );
|
||||
A = ( double* )malloc( (*n+*k)*(size_t)LDA*sizeof( double ) );
|
||||
if (uplo == CblasUpper) {
|
||||
for( i=0; i<*k; i++ ){
|
||||
irow=*k-i;
|
||||
@@ -282,7 +282,7 @@ void F77_dtbsv(int *order, char *uplow, char *transp, char *diagn,
|
||||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *k+1;
|
||||
A = ( double* )malloc( (*n+*k)*LDA*sizeof( double ) );
|
||||
A = ( double* )malloc( (*n+*k)*(size_t)LDA*sizeof( double ) );
|
||||
if (uplo == CblasUpper) {
|
||||
for( i=0; i<*k; i++ ){
|
||||
irow=*k-i;
|
||||
@@ -325,7 +325,7 @@ void F77_dsbmv(int *order, char *uplow, int *n, int *k, double *alpha,
|
||||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *k+1;
|
||||
A = ( double* )malloc( (*n+*k)*LDA*sizeof( double ) );
|
||||
A = ( double* )malloc( (*n+*k)*(size_t)LDA*sizeof( double ) );
|
||||
if (uplo == CblasUpper) {
|
||||
for( i=0; i<*k; i++ ){
|
||||
irow=*k-i;
|
||||
@@ -369,8 +369,8 @@ void F77_dspmv(int *order, char *uplow, int *n, double *alpha, double *ap,
|
||||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n;
|
||||
A = ( double* )malloc( LDA*LDA*sizeof( double ) );
|
||||
AP = ( double* )malloc( (((LDA+1)*LDA)/2)*sizeof( double ) );
|
||||
A = ( double* )malloc( (size_t)LDA*LDA*sizeof( double ) );
|
||||
AP = ( double* )malloc( ((((size_t)LDA+1)*LDA)/2)*sizeof( double ) );
|
||||
if (uplo == CblasUpper) {
|
||||
for( j=0, k=0; j<*n; j++ )
|
||||
for( i=0; i<j+1; i++, k++ )
|
||||
@@ -411,8 +411,8 @@ void F77_dtpmv(int *order, char *uplow, char *transp, char *diagn,
|
||||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n;
|
||||
A = ( double* )malloc( LDA*LDA*sizeof( double ) );
|
||||
AP = ( double* )malloc( (((LDA+1)*LDA)/2)*sizeof( double ) );
|
||||
A = ( double* )malloc( (size_t)LDA*LDA*sizeof( double ) );
|
||||
AP = ( double* )malloc( ((((size_t)LDA+1)*LDA)/2)*sizeof( double ) );
|
||||
if (uplo == CblasUpper) {
|
||||
for( j=0, k=0; j<*n; j++ )
|
||||
for( i=0; i<j+1; i++, k++ )
|
||||
@@ -451,8 +451,8 @@ void F77_dtpsv(int *order, char *uplow, char *transp, char *diagn,
|
||||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n;
|
||||
A = ( double* )malloc( LDA*LDA*sizeof( double ) );
|
||||
AP = ( double* )malloc( (((LDA+1)*LDA)/2)*sizeof( double ) );
|
||||
A = ( double* )malloc( (size_t)LDA*LDA*sizeof( double ) );
|
||||
AP = ( double* )malloc( ((((size_t)LDA+1)*LDA)/2)*sizeof( double ) );
|
||||
if (uplo == CblasUpper) {
|
||||
for( j=0, k=0; j<*n; j++ )
|
||||
for( i=0; i<j+1; i++, k++ )
|
||||
@@ -488,8 +488,8 @@ void F77_dspr(int *order, char *uplow, int *n, double *alpha, double *x,
|
||||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n;
|
||||
A = ( double* )malloc( LDA*LDA*sizeof( double ) );
|
||||
AP = ( double* )malloc( (((LDA+1)*LDA)/2)*sizeof( double ) );
|
||||
A = ( double* )malloc( (size_t)LDA*LDA*sizeof( double ) );
|
||||
AP = ( double* )malloc( ((((size_t)LDA+1)*LDA)/2)*sizeof( double ) );
|
||||
if (uplo == CblasUpper) {
|
||||
for( j=0, k=0; j<*n; j++ )
|
||||
for( i=0; i<j+1; i++, k++ )
|
||||
@@ -540,8 +540,8 @@ void F77_dspr2(int *order, char *uplow, int *n, double *alpha, double *x,
|
||||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n;
|
||||
A = ( double* )malloc( LDA*LDA*sizeof( double ) );
|
||||
AP = ( double* )malloc( (((LDA+1)*LDA)/2)*sizeof( double ) );
|
||||
A = ( double* )malloc( (size_t)LDA*LDA*sizeof( double ) );
|
||||
AP = ( double* )malloc( ((((size_t)LDA+1)*LDA)/2)*sizeof( double ) );
|
||||
if (uplo == CblasUpper) {
|
||||
for( j=0, k=0; j<*n; j++ )
|
||||
for( i=0; i<j+1; i++, k++ )
|
||||
|
||||
@@ -26,34 +26,34 @@ void F77_dgemm(int *order, char *transpa, char *transpb, int *m, int *n,
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
if (transa == CblasNoTrans) {
|
||||
LDA = *k+1;
|
||||
A = (double *)malloc( (*m)*LDA*sizeof( double ) );
|
||||
A = (double *)malloc( (*m)*(size_t)LDA*sizeof( double ) );
|
||||
for( i=0; i<*m; i++ )
|
||||
for( j=0; j<*k; j++ )
|
||||
A[i*LDA+j]=a[j*(*lda)+i];
|
||||
}
|
||||
else {
|
||||
LDA = *m+1;
|
||||
A = ( double* )malloc( LDA*(*k)*sizeof( double ) );
|
||||
A = ( double* )malloc( (size_t)LDA*(*k)*sizeof( double ) );
|
||||
for( i=0; i<*k; i++ )
|
||||
for( j=0; j<*m; j++ )
|
||||
A[i*LDA+j]=a[j*(*lda)+i];
|
||||
}
|
||||
if (transb == CblasNoTrans) {
|
||||
LDB = *n+1;
|
||||
B = ( double* )malloc( (*k)*LDB*sizeof( double ) );
|
||||
B = ( double* )malloc( (*k)*(size_t)LDB*sizeof( double ) );
|
||||
for( i=0; i<*k; i++ )
|
||||
for( j=0; j<*n; j++ )
|
||||
B[i*LDB+j]=b[j*(*ldb)+i];
|
||||
}
|
||||
else {
|
||||
LDB = *k+1;
|
||||
B = ( double* )malloc( LDB*(*n)*sizeof( double ) );
|
||||
B = ( double* )malloc( (size_t)LDB*(*n)*sizeof( double ) );
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*k; j++ )
|
||||
B[i*LDB+j]=b[j*(*ldb)+i];
|
||||
}
|
||||
LDC = *n+1;
|
||||
C = ( double* )malloc( (*m)*LDC*sizeof( double ) );
|
||||
C = ( double* )malloc( (*m)*(size_t)LDC*sizeof( double ) );
|
||||
for( j=0; j<*n; j++ )
|
||||
for( i=0; i<*m; i++ )
|
||||
C[i*LDC+j]=c[j*(*ldc)+i];
|
||||
@@ -89,25 +89,25 @@ void F77_dsymm(int *order, char *rtlf, char *uplow, int *m, int *n,
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
if (side == CblasLeft) {
|
||||
LDA = *m+1;
|
||||
A = ( double* )malloc( (*m)*LDA*sizeof( double ) );
|
||||
A = ( double* )malloc( (*m)*(size_t)LDA*sizeof( double ) );
|
||||
for( i=0; i<*m; i++ )
|
||||
for( j=0; j<*m; j++ )
|
||||
A[i*LDA+j]=a[j*(*lda)+i];
|
||||
}
|
||||
else{
|
||||
LDA = *n+1;
|
||||
A = ( double* )malloc( (*n)*LDA*sizeof( double ) );
|
||||
A = ( double* )malloc( (*n)*(size_t)LDA*sizeof( double ) );
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ )
|
||||
A[i*LDA+j]=a[j*(*lda)+i];
|
||||
}
|
||||
LDB = *n+1;
|
||||
B = ( double* )malloc( (*m)*LDB*sizeof( double ) );
|
||||
B = ( double* )malloc( (*m)*(size_t)LDB*sizeof( double ) );
|
||||
for( i=0; i<*m; i++ )
|
||||
for( j=0; j<*n; j++ )
|
||||
B[i*LDB+j]=b[j*(*ldb)+i];
|
||||
LDC = *n+1;
|
||||
C = ( double* )malloc( (*m)*LDC*sizeof( double ) );
|
||||
C = ( double* )malloc( (*m)*(size_t)LDC*sizeof( double ) );
|
||||
for( j=0; j<*n; j++ )
|
||||
for( i=0; i<*m; i++ )
|
||||
C[i*LDC+j]=c[j*(*ldc)+i];
|
||||
@@ -143,20 +143,20 @@ void F77_dsyrk(int *order, char *uplow, char *transp, int *n, int *k,
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
if (trans == CblasNoTrans) {
|
||||
LDA = *k+1;
|
||||
A = ( double* )malloc( (*n)*LDA*sizeof( double ) );
|
||||
A = ( double* )malloc( (*n)*(size_t)LDA*sizeof( double ) );
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*k; j++ )
|
||||
A[i*LDA+j]=a[j*(*lda)+i];
|
||||
}
|
||||
else{
|
||||
LDA = *n+1;
|
||||
A = ( double* )malloc( (*k)*LDA*sizeof( double ) );
|
||||
A = ( double* )malloc( (*k)*(size_t)LDA*sizeof( double ) );
|
||||
for( i=0; i<*k; i++ )
|
||||
for( j=0; j<*n; j++ )
|
||||
A[i*LDA+j]=a[j*(*lda)+i];
|
||||
}
|
||||
LDC = *n+1;
|
||||
C = ( double* )malloc( (*n)*LDC*sizeof( double ) );
|
||||
C = ( double* )malloc( (*n)*(size_t)LDC*sizeof( double ) );
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ )
|
||||
C[i*LDC+j]=c[j*(*ldc)+i];
|
||||
@@ -191,8 +191,8 @@ void F77_dsyr2k(int *order, char *uplow, char *transp, int *n, int *k,
|
||||
if (trans == CblasNoTrans) {
|
||||
LDA = *k+1;
|
||||
LDB = *k+1;
|
||||
A = ( double* )malloc( (*n)*LDA*sizeof( double ) );
|
||||
B = ( double* )malloc( (*n)*LDB*sizeof( double ) );
|
||||
A = ( double* )malloc( (*n)*(size_t)LDA*sizeof( double ) );
|
||||
B = ( double* )malloc( (*n)*(size_t)LDB*sizeof( double ) );
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*k; j++ ) {
|
||||
A[i*LDA+j]=a[j*(*lda)+i];
|
||||
@@ -202,8 +202,8 @@ void F77_dsyr2k(int *order, char *uplow, char *transp, int *n, int *k,
|
||||
else {
|
||||
LDA = *n+1;
|
||||
LDB = *n+1;
|
||||
A = ( double* )malloc( LDA*(*k)*sizeof( double ) );
|
||||
B = ( double* )malloc( LDB*(*k)*sizeof( double ) );
|
||||
A = ( double* )malloc( (size_t)LDA*(*k)*sizeof( double ) );
|
||||
B = ( double* )malloc( (size_t)LDB*(*k)*sizeof( double ) );
|
||||
for( i=0; i<*k; i++ )
|
||||
for( j=0; j<*n; j++ ){
|
||||
A[i*LDA+j]=a[j*(*lda)+i];
|
||||
@@ -211,7 +211,7 @@ void F77_dsyr2k(int *order, char *uplow, char *transp, int *n, int *k,
|
||||
}
|
||||
}
|
||||
LDC = *n+1;
|
||||
C = ( double* )malloc( (*n)*LDC*sizeof( double ) );
|
||||
C = ( double* )malloc( (*n)*(size_t)LDC*sizeof( double ) );
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ )
|
||||
C[i*LDC+j]=c[j*(*ldc)+i];
|
||||
@@ -249,20 +249,20 @@ void F77_dtrmm(int *order, char *rtlf, char *uplow, char *transp, char *diagn,
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
if (side == CblasLeft) {
|
||||
LDA = *m+1;
|
||||
A = ( double* )malloc( (*m)*LDA*sizeof( double ) );
|
||||
A = ( double* )malloc( (*m)*(size_t)LDA*sizeof( double ) );
|
||||
for( i=0; i<*m; i++ )
|
||||
for( j=0; j<*m; j++ )
|
||||
A[i*LDA+j]=a[j*(*lda)+i];
|
||||
}
|
||||
else{
|
||||
LDA = *n+1;
|
||||
A = ( double* )malloc( (*n)*LDA*sizeof( double ) );
|
||||
A = ( double* )malloc( (*n)*(size_t)LDA*sizeof( double ) );
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ )
|
||||
A[i*LDA+j]=a[j*(*lda)+i];
|
||||
}
|
||||
LDB = *n+1;
|
||||
B = ( double* )malloc( (*m)*LDB*sizeof( double ) );
|
||||
B = ( double* )malloc( (*m)*(size_t)LDB*sizeof( double ) );
|
||||
for( i=0; i<*m; i++ )
|
||||
for( j=0; j<*n; j++ )
|
||||
B[i*LDB+j]=b[j*(*ldb)+i];
|
||||
@@ -300,20 +300,20 @@ void F77_dtrsm(int *order, char *rtlf, char *uplow, char *transp, char *diagn,
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
if (side == CblasLeft) {
|
||||
LDA = *m+1;
|
||||
A = ( double* )malloc( (*m)*LDA*sizeof( double ) );
|
||||
A = ( double* )malloc( (*m)*(size_t)LDA*sizeof( double ) );
|
||||
for( i=0; i<*m; i++ )
|
||||
for( j=0; j<*m; j++ )
|
||||
A[i*LDA+j]=a[j*(*lda)+i];
|
||||
}
|
||||
else{
|
||||
LDA = *n+1;
|
||||
A = ( double* )malloc( (*n)*LDA*sizeof( double ) );
|
||||
A = ( double* )malloc( (*n)*(size_t)LDA*sizeof( double ) );
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ )
|
||||
A[i*LDA+j]=a[j*(*lda)+i];
|
||||
}
|
||||
LDB = *n+1;
|
||||
B = ( double* )malloc( (*m)*LDB*sizeof( double ) );
|
||||
B = ( double* )malloc( (*m)*(size_t)LDB*sizeof( double ) );
|
||||
for( i=0; i<*m; i++ )
|
||||
for( j=0; j<*n; j++ )
|
||||
B[i*LDB+j]=b[j*(*ldb)+i];
|
||||
|
||||
@@ -19,7 +19,7 @@ void F77_sgemv(int *order, char *transp, int *m, int *n, float *alpha,
|
||||
get_transpose_type(transp, &trans);
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n+1;
|
||||
A = ( float* )malloc( (*m)*LDA*sizeof( float ) );
|
||||
A = ( float* )malloc( (*m)*(size_t)LDA*sizeof( float ) );
|
||||
for( i=0; i<*m; i++ )
|
||||
for( j=0; j<*n; j++ )
|
||||
A[ LDA*i+j ]=a[ (*lda)*j+i ];
|
||||
@@ -43,7 +43,7 @@ void F77_sger(int *order, int *m, int *n, float *alpha, float *x, int *incx,
|
||||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n+1;
|
||||
A = ( float* )malloc( (*m)*LDA*sizeof( float ) );
|
||||
A = ( float* )malloc( (*m)*(size_t)LDA*sizeof( float ) );
|
||||
|
||||
for( i=0; i<*m; i++ ) {
|
||||
for( j=0; j<*n; j++ )
|
||||
@@ -74,7 +74,7 @@ void F77_strmv(int *order, char *uplow, char *transp, char *diagn,
|
||||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n+1;
|
||||
A = ( float* )malloc( (*n)*LDA*sizeof( float ) );
|
||||
A = ( float* )malloc( (*n)*(size_t)LDA*sizeof( float ) );
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ )
|
||||
A[ LDA*i+j ]=a[ (*lda)*j+i ];
|
||||
@@ -102,7 +102,7 @@ void F77_strsv(int *order, char *uplow, char *transp, char *diagn,
|
||||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n+1;
|
||||
A = ( float* )malloc( (*n)*LDA*sizeof( float ) );
|
||||
A = ( float* )malloc( (*n)*(size_t)LDA*sizeof( float ) );
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ )
|
||||
A[ LDA*i+j ]=a[ (*lda)*j+i ];
|
||||
@@ -123,7 +123,7 @@ void F77_ssymv(int *order, char *uplow, int *n, float *alpha, float *a,
|
||||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n+1;
|
||||
A = ( float* )malloc( (*n)*LDA*sizeof( float ) );
|
||||
A = ( float* )malloc( (*n)*(size_t)LDA*sizeof( float ) );
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ )
|
||||
A[ LDA*i+j ]=a[ (*lda)*j+i ];
|
||||
@@ -146,7 +146,7 @@ void F77_ssyr(int *order, char *uplow, int *n, float *alpha, float *x,
|
||||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n+1;
|
||||
A = ( float* )malloc( (*n)*LDA*sizeof( float ) );
|
||||
A = ( float* )malloc( (*n)*(size_t)LDA*sizeof( float ) );
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ )
|
||||
A[ LDA*i+j ]=a[ (*lda)*j+i ];
|
||||
@@ -170,7 +170,7 @@ void F77_ssyr2(int *order, char *uplow, int *n, float *alpha, float *x,
|
||||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n+1;
|
||||
A = ( float* )malloc( (*n)*LDA*sizeof( float ) );
|
||||
A = ( float* )malloc( (*n)*(size_t)LDA*sizeof( float ) );
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ )
|
||||
A[ LDA*i+j ]=a[ (*lda)*j+i ];
|
||||
@@ -196,7 +196,7 @@ void F77_sgbmv(int *order, char *transp, int *m, int *n, int *kl, int *ku,
|
||||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *ku+*kl+2;
|
||||
A = ( float* )malloc( (*n+*kl)*LDA*sizeof( float ) );
|
||||
A = ( float* )malloc( (*n+*kl)*(size_t)LDA*sizeof( float ) );
|
||||
for( i=0; i<*ku; i++ ){
|
||||
irow=*ku+*kl-i;
|
||||
jcol=(*ku)-i;
|
||||
@@ -236,7 +236,7 @@ void F77_stbmv(int *order, char *uplow, char *transp, char *diagn,
|
||||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *k+1;
|
||||
A = ( float* )malloc( (*n+*k)*LDA*sizeof( float ) );
|
||||
A = ( float* )malloc( (*n+*k)*(size_t)LDA*sizeof( float ) );
|
||||
if (uplo == CblasUpper) {
|
||||
for( i=0; i<*k; i++ ){
|
||||
irow=*k-i;
|
||||
@@ -282,7 +282,7 @@ void F77_stbsv(int *order, char *uplow, char *transp, char *diagn,
|
||||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *k+1;
|
||||
A = ( float* )malloc( (*n+*k)*LDA*sizeof( float ) );
|
||||
A = ( float* )malloc( (*n+*k)*(size_t)LDA*sizeof( float ) );
|
||||
if (uplo == CblasUpper) {
|
||||
for( i=0; i<*k; i++ ){
|
||||
irow=*k-i;
|
||||
@@ -325,7 +325,7 @@ void F77_ssbmv(int *order, char *uplow, int *n, int *k, float *alpha,
|
||||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *k+1;
|
||||
A = ( float* )malloc( (*n+*k)*LDA*sizeof( float ) );
|
||||
A = ( float* )malloc( (*n+*k)*(size_t)LDA*sizeof( float ) );
|
||||
if (uplo == CblasUpper) {
|
||||
for( i=0; i<*k; i++ ){
|
||||
irow=*k-i;
|
||||
@@ -369,8 +369,8 @@ void F77_sspmv(int *order, char *uplow, int *n, float *alpha, float *ap,
|
||||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n;
|
||||
A = ( float* )malloc( LDA*LDA*sizeof( float ) );
|
||||
AP = ( float* )malloc( (((LDA+1)*LDA)/2)*sizeof( float ) );
|
||||
A = ( float* )malloc( (size_t)LDA*LDA*sizeof( float ) );
|
||||
AP = ( float* )malloc( ((((size_t)LDA+1)*LDA)/2)*sizeof( float ) );
|
||||
if (uplo == CblasUpper) {
|
||||
for( j=0, k=0; j<*n; j++ )
|
||||
for( i=0; i<j+1; i++, k++ )
|
||||
@@ -410,8 +410,8 @@ void F77_stpmv(int *order, char *uplow, char *transp, char *diagn,
|
||||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n;
|
||||
A = ( float* )malloc( LDA*LDA*sizeof( float ) );
|
||||
AP = ( float* )malloc( (((LDA+1)*LDA)/2)*sizeof( float ) );
|
||||
A = ( float* )malloc( (size_t)LDA*LDA*sizeof( float ) );
|
||||
AP = ( float* )malloc( ((((size_t)LDA+1)*LDA)/2)*sizeof( float ) );
|
||||
if (uplo == CblasUpper) {
|
||||
for( j=0, k=0; j<*n; j++ )
|
||||
for( i=0; i<j+1; i++, k++ )
|
||||
@@ -449,8 +449,8 @@ void F77_stpsv(int *order, char *uplow, char *transp, char *diagn,
|
||||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n;
|
||||
A = ( float* )malloc( LDA*LDA*sizeof( float ) );
|
||||
AP = ( float* )malloc( (((LDA+1)*LDA)/2)*sizeof( float ) );
|
||||
A = ( float* )malloc( (size_t)LDA*LDA*sizeof( float ) );
|
||||
AP = ( float* )malloc( ((((size_t)LDA+1)*LDA)/2)*sizeof( float ) );
|
||||
if (uplo == CblasUpper) {
|
||||
for( j=0, k=0; j<*n; j++ )
|
||||
for( i=0; i<j+1; i++, k++ )
|
||||
@@ -485,8 +485,8 @@ void F77_sspr(int *order, char *uplow, int *n, float *alpha, float *x,
|
||||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n;
|
||||
A = ( float* )malloc( LDA*LDA*sizeof( float ) );
|
||||
AP = ( float* )malloc( (((LDA+1)*LDA)/2)*sizeof( float ) );
|
||||
A = ( float* )malloc( (size_t)LDA*LDA*sizeof( float ) );
|
||||
AP = ( float* )malloc( ((((size_t)LDA+1)*LDA)/2)*sizeof( float ) );
|
||||
if (uplo == CblasUpper) {
|
||||
for( j=0, k=0; j<*n; j++ )
|
||||
for( i=0; i<j+1; i++, k++ )
|
||||
@@ -536,8 +536,8 @@ void F77_sspr2(int *order, char *uplow, int *n, float *alpha, float *x,
|
||||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n;
|
||||
A = ( float* )malloc( LDA*LDA*sizeof( float ) );
|
||||
AP = ( float* )malloc( (((LDA+1)*LDA)/2)*sizeof( float ) );
|
||||
A = ( float* )malloc( (size_t)LDA*LDA*sizeof( float ) );
|
||||
AP = ( float* )malloc( ((((size_t)LDA+1)*LDA)/2)*sizeof( float ) );
|
||||
if (uplo == CblasUpper) {
|
||||
for( j=0, k=0; j<*n; j++ )
|
||||
for( i=0; i<j+1; i++, k++ )
|
||||
|
||||
@@ -23,34 +23,34 @@ void F77_sgemm(int *order, char *transpa, char *transpb, int *m, int *n,
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
if (transa == CblasNoTrans) {
|
||||
LDA = *k+1;
|
||||
A = (float *)malloc( (*m)*LDA*sizeof( float ) );
|
||||
A = (float *)malloc( (*m)*(size_t)LDA*sizeof( float ) );
|
||||
for( i=0; i<*m; i++ )
|
||||
for( j=0; j<*k; j++ )
|
||||
A[i*LDA+j]=a[j*(*lda)+i];
|
||||
}
|
||||
else {
|
||||
LDA = *m+1;
|
||||
A = ( float* )malloc( LDA*(*k)*sizeof( float ) );
|
||||
A = ( float* )malloc( (size_t)LDA*(*k)*sizeof( float ) );
|
||||
for( i=0; i<*k; i++ )
|
||||
for( j=0; j<*m; j++ )
|
||||
A[i*LDA+j]=a[j*(*lda)+i];
|
||||
}
|
||||
if (transb == CblasNoTrans) {
|
||||
LDB = *n+1;
|
||||
B = ( float* )malloc( (*k)*LDB*sizeof( float ) );
|
||||
B = ( float* )malloc( (*k)*(size_t)LDB*sizeof( float ) );
|
||||
for( i=0; i<*k; i++ )
|
||||
for( j=0; j<*n; j++ )
|
||||
B[i*LDB+j]=b[j*(*ldb)+i];
|
||||
}
|
||||
else {
|
||||
LDB = *k+1;
|
||||
B = ( float* )malloc( LDB*(*n)*sizeof( float ) );
|
||||
B = ( float* )malloc( (size_t)LDB*(*n)*sizeof( float ) );
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*k; j++ )
|
||||
B[i*LDB+j]=b[j*(*ldb)+i];
|
||||
}
|
||||
LDC = *n+1;
|
||||
C = ( float* )malloc( (*m)*LDC*sizeof( float ) );
|
||||
C = ( float* )malloc( (*m)*(size_t)LDC*sizeof( float ) );
|
||||
for( j=0; j<*n; j++ )
|
||||
for( i=0; i<*m; i++ )
|
||||
C[i*LDC+j]=c[j*(*ldc)+i];
|
||||
@@ -85,25 +85,25 @@ void F77_ssymm(int *order, char *rtlf, char *uplow, int *m, int *n,
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
if (side == CblasLeft) {
|
||||
LDA = *m+1;
|
||||
A = ( float* )malloc( (*m)*LDA*sizeof( float ) );
|
||||
A = ( float* )malloc( (*m)*(size_t)LDA*sizeof( float ) );
|
||||
for( i=0; i<*m; i++ )
|
||||
for( j=0; j<*m; j++ )
|
||||
A[i*LDA+j]=a[j*(*lda)+i];
|
||||
}
|
||||
else{
|
||||
LDA = *n+1;
|
||||
A = ( float* )malloc( (*n)*LDA*sizeof( float ) );
|
||||
A = ( float* )malloc( (*n)*(size_t)LDA*sizeof( float ) );
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ )
|
||||
A[i*LDA+j]=a[j*(*lda)+i];
|
||||
}
|
||||
LDB = *n+1;
|
||||
B = ( float* )malloc( (*m)*LDB*sizeof( float ) );
|
||||
B = ( float* )malloc( (*m)*(size_t)LDB*sizeof( float ) );
|
||||
for( i=0; i<*m; i++ )
|
||||
for( j=0; j<*n; j++ )
|
||||
B[i*LDB+j]=b[j*(*ldb)+i];
|
||||
LDC = *n+1;
|
||||
C = ( float* )malloc( (*m)*LDC*sizeof( float ) );
|
||||
C = ( float* )malloc( (*m)*(size_t)LDC*sizeof( float ) );
|
||||
for( j=0; j<*n; j++ )
|
||||
for( i=0; i<*m; i++ )
|
||||
C[i*LDC+j]=c[j*(*ldc)+i];
|
||||
@@ -139,20 +139,20 @@ void F77_ssyrk(int *order, char *uplow, char *transp, int *n, int *k,
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
if (trans == CblasNoTrans) {
|
||||
LDA = *k+1;
|
||||
A = ( float* )malloc( (*n)*LDA*sizeof( float ) );
|
||||
A = ( float* )malloc( (*n)*(size_t)LDA*sizeof( float ) );
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*k; j++ )
|
||||
A[i*LDA+j]=a[j*(*lda)+i];
|
||||
}
|
||||
else{
|
||||
LDA = *n+1;
|
||||
A = ( float* )malloc( (*k)*LDA*sizeof( float ) );
|
||||
A = ( float* )malloc( (*k)*(size_t)LDA*sizeof( float ) );
|
||||
for( i=0; i<*k; i++ )
|
||||
for( j=0; j<*n; j++ )
|
||||
A[i*LDA+j]=a[j*(*lda)+i];
|
||||
}
|
||||
LDC = *n+1;
|
||||
C = ( float* )malloc( (*n)*LDC*sizeof( float ) );
|
||||
C = ( float* )malloc( (*n)*(size_t)LDC*sizeof( float ) );
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ )
|
||||
C[i*LDC+j]=c[j*(*ldc)+i];
|
||||
@@ -187,8 +187,8 @@ void F77_ssyr2k(int *order, char *uplow, char *transp, int *n, int *k,
|
||||
if (trans == CblasNoTrans) {
|
||||
LDA = *k+1;
|
||||
LDB = *k+1;
|
||||
A = ( float* )malloc( (*n)*LDA*sizeof( float ) );
|
||||
B = ( float* )malloc( (*n)*LDB*sizeof( float ) );
|
||||
A = ( float* )malloc( (*n)*(size_t)LDA*sizeof( float ) );
|
||||
B = ( float* )malloc( (*n)*(size_t)LDB*sizeof( float ) );
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*k; j++ ) {
|
||||
A[i*LDA+j]=a[j*(*lda)+i];
|
||||
@@ -198,8 +198,8 @@ void F77_ssyr2k(int *order, char *uplow, char *transp, int *n, int *k,
|
||||
else {
|
||||
LDA = *n+1;
|
||||
LDB = *n+1;
|
||||
A = ( float* )malloc( LDA*(*k)*sizeof( float ) );
|
||||
B = ( float* )malloc( LDB*(*k)*sizeof( float ) );
|
||||
A = ( float* )malloc( (size_t)LDA*(*k)*sizeof( float ) );
|
||||
B = ( float* )malloc( (size_t)LDB*(*k)*sizeof( float ) );
|
||||
for( i=0; i<*k; i++ )
|
||||
for( j=0; j<*n; j++ ){
|
||||
A[i*LDA+j]=a[j*(*lda)+i];
|
||||
@@ -207,7 +207,7 @@ void F77_ssyr2k(int *order, char *uplow, char *transp, int *n, int *k,
|
||||
}
|
||||
}
|
||||
LDC = *n+1;
|
||||
C = ( float* )malloc( (*n)*LDC*sizeof( float ) );
|
||||
C = ( float* )malloc( (*n)*(size_t)LDC*sizeof( float ) );
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ )
|
||||
C[i*LDC+j]=c[j*(*ldc)+i];
|
||||
@@ -245,20 +245,20 @@ void F77_strmm(int *order, char *rtlf, char *uplow, char *transp, char *diagn,
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
if (side == CblasLeft) {
|
||||
LDA = *m+1;
|
||||
A = ( float* )malloc( (*m)*LDA*sizeof( float ) );
|
||||
A = ( float* )malloc( (*m)*(size_t)LDA*sizeof( float ) );
|
||||
for( i=0; i<*m; i++ )
|
||||
for( j=0; j<*m; j++ )
|
||||
A[i*LDA+j]=a[j*(*lda)+i];
|
||||
}
|
||||
else{
|
||||
LDA = *n+1;
|
||||
A = ( float* )malloc( (*n)*LDA*sizeof( float ) );
|
||||
A = ( float* )malloc( (*n)*(size_t)LDA*sizeof( float ) );
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ )
|
||||
A[i*LDA+j]=a[j*(*lda)+i];
|
||||
}
|
||||
LDB = *n+1;
|
||||
B = ( float* )malloc( (*m)*LDB*sizeof( float ) );
|
||||
B = ( float* )malloc( (*m)*(size_t)LDB*sizeof( float ) );
|
||||
for( i=0; i<*m; i++ )
|
||||
for( j=0; j<*n; j++ )
|
||||
B[i*LDB+j]=b[j*(*ldb)+i];
|
||||
@@ -296,20 +296,20 @@ void F77_strsm(int *order, char *rtlf, char *uplow, char *transp, char *diagn,
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
if (side == CblasLeft) {
|
||||
LDA = *m+1;
|
||||
A = ( float* )malloc( (*m)*LDA*sizeof( float ) );
|
||||
A = ( float* )malloc( (*m)*(size_t)LDA*sizeof( float ) );
|
||||
for( i=0; i<*m; i++ )
|
||||
for( j=0; j<*m; j++ )
|
||||
A[i*LDA+j]=a[j*(*lda)+i];
|
||||
}
|
||||
else{
|
||||
LDA = *n+1;
|
||||
A = ( float* )malloc( (*n)*LDA*sizeof( float ) );
|
||||
A = ( float* )malloc( (*n)*(size_t)LDA*sizeof( float ) );
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ )
|
||||
A[i*LDA+j]=a[j*(*lda)+i];
|
||||
}
|
||||
LDB = *n+1;
|
||||
B = ( float* )malloc( (*m)*LDB*sizeof( float ) );
|
||||
B = ( float* )malloc( (*m)*(size_t)LDB*sizeof( float ) );
|
||||
for( i=0; i<*m; i++ )
|
||||
for( j=0; j<*n; j++ )
|
||||
B[i*LDB+j]=b[j*(*ldb)+i];
|
||||
|
||||
@@ -20,7 +20,7 @@ void F77_zgemv(int *order, char *transp, int *m, int *n,
|
||||
get_transpose_type(transp, &trans);
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n+1;
|
||||
A = (CBLAS_TEST_ZOMPLEX *)malloc( (*m)*LDA*sizeof( CBLAS_TEST_ZOMPLEX) );
|
||||
A = (CBLAS_TEST_ZOMPLEX *)malloc( (*m)*(size_t)LDA*sizeof( CBLAS_TEST_ZOMPLEX) );
|
||||
for( i=0; i<*m; i++ )
|
||||
for( j=0; j<*n; j++ ){
|
||||
A[ LDA*i+j ].real=a[ (*lda)*j+i ].real;
|
||||
@@ -50,7 +50,7 @@ void F77_zgbmv(int *order, char *transp, int *m, int *n, int *kl, int *ku,
|
||||
get_transpose_type(transp, &trans);
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *ku+*kl+2;
|
||||
A=( CBLAS_TEST_ZOMPLEX* )malloc((*n+*kl)*LDA*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
A=( CBLAS_TEST_ZOMPLEX* )malloc((*n+*kl)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
for( i=0; i<*ku; i++ ){
|
||||
irow=*ku+*kl-i;
|
||||
jcol=(*ku)-i;
|
||||
@@ -94,7 +94,7 @@ void F77_zgeru(int *order, int *m, int *n, CBLAS_TEST_ZOMPLEX *alpha,
|
||||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n+1;
|
||||
A=(CBLAS_TEST_ZOMPLEX*)malloc((*m)*LDA*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
A=(CBLAS_TEST_ZOMPLEX*)malloc((*m)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
for( i=0; i<*m; i++ )
|
||||
for( j=0; j<*n; j++ ){
|
||||
A[ LDA*i+j ].real=a[ (*lda)*j+i ].real;
|
||||
@@ -122,7 +122,7 @@ void F77_zgerc(int *order, int *m, int *n, CBLAS_TEST_ZOMPLEX *alpha,
|
||||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n+1;
|
||||
A=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
|
||||
A=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
|
||||
for( i=0; i<*m; i++ )
|
||||
for( j=0; j<*n; j++ ){
|
||||
A[ LDA*i+j ].real=a[ (*lda)*j+i ].real;
|
||||
@@ -154,7 +154,7 @@ void F77_zhemv(int *order, char *uplow, int *n, CBLAS_TEST_ZOMPLEX *alpha,
|
||||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n+1;
|
||||
A = (CBLAS_TEST_ZOMPLEX *)malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
A = (CBLAS_TEST_ZOMPLEX *)malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ ){
|
||||
A[ LDA*i+j ].real=a[ (*lda)*j+i ].real;
|
||||
@@ -190,7 +190,7 @@ int i,irow,j,jcol,LDA;
|
||||
*incx, beta, y, *incy );
|
||||
else {
|
||||
LDA = *k+2;
|
||||
A =(CBLAS_TEST_ZOMPLEX*)malloc((*n+*k)*LDA*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
A =(CBLAS_TEST_ZOMPLEX*)malloc((*n+*k)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
if (uplo == CblasUpper) {
|
||||
for( i=0; i<*k; i++ ){
|
||||
irow=*k-i;
|
||||
@@ -251,8 +251,8 @@ void F77_zhpmv(int *order, char *uplow, int *n, CBLAS_TEST_ZOMPLEX *alpha,
|
||||
beta, y, *incy);
|
||||
else {
|
||||
LDA = *n;
|
||||
A = (CBLAS_TEST_ZOMPLEX* )malloc(LDA*LDA*sizeof(CBLAS_TEST_ZOMPLEX ));
|
||||
AP = (CBLAS_TEST_ZOMPLEX* )malloc( (((LDA+1)*LDA)/2)*
|
||||
A = (CBLAS_TEST_ZOMPLEX* )malloc((size_t)LDA*LDA*sizeof(CBLAS_TEST_ZOMPLEX ));
|
||||
AP = (CBLAS_TEST_ZOMPLEX* )malloc( ((((size_t)LDA+1)*LDA)/2)*
|
||||
sizeof( CBLAS_TEST_ZOMPLEX ));
|
||||
if (uplo == CblasUpper) {
|
||||
for( j=0, k=0; j<*n; j++ )
|
||||
@@ -311,7 +311,7 @@ void F77_ztbmv(int *order, char *uplow, char *transp, char *diagn,
|
||||
x, *incx);
|
||||
else {
|
||||
LDA = *k+2;
|
||||
A=(CBLAS_TEST_ZOMPLEX *)malloc((*n+*k)*LDA*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
A=(CBLAS_TEST_ZOMPLEX *)malloc((*n+*k)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
if (uplo == CblasUpper) {
|
||||
for( i=0; i<*k; i++ ){
|
||||
irow=*k-i;
|
||||
@@ -375,7 +375,7 @@ void F77_ztbsv(int *order, char *uplow, char *transp, char *diagn,
|
||||
*incx);
|
||||
else {
|
||||
LDA = *k+2;
|
||||
A=(CBLAS_TEST_ZOMPLEX*)malloc((*n+*k)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ));
|
||||
A=(CBLAS_TEST_ZOMPLEX*)malloc((*n+*k)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX ));
|
||||
if (uplo == CblasUpper) {
|
||||
for( i=0; i<*k; i++ ){
|
||||
irow=*k-i;
|
||||
@@ -436,8 +436,8 @@ void F77_ztpmv(int *order, char *uplow, char *transp, char *diagn,
|
||||
cblas_ztpmv( CblasRowMajor, UNDEFINED, trans, diag, *n, ap, x, *incx );
|
||||
else {
|
||||
LDA = *n;
|
||||
A=(CBLAS_TEST_ZOMPLEX*)malloc(LDA*LDA*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
AP=(CBLAS_TEST_ZOMPLEX*)malloc((((LDA+1)*LDA)/2)*
|
||||
A=(CBLAS_TEST_ZOMPLEX*)malloc((size_t)LDA*LDA*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
AP=(CBLAS_TEST_ZOMPLEX*)malloc(((((size_t)LDA+1)*LDA)/2)*
|
||||
sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
if (uplo == CblasUpper) {
|
||||
for( j=0, k=0; j<*n; j++ )
|
||||
@@ -491,8 +491,8 @@ void F77_ztpsv(int *order, char *uplow, char *transp, char *diagn,
|
||||
cblas_ztpsv( CblasRowMajor, UNDEFINED, trans, diag, *n, ap, x, *incx );
|
||||
else {
|
||||
LDA = *n;
|
||||
A=(CBLAS_TEST_ZOMPLEX*)malloc(LDA*LDA*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
AP=(CBLAS_TEST_ZOMPLEX*)malloc((((LDA+1)*LDA)/2)*
|
||||
A=(CBLAS_TEST_ZOMPLEX*)malloc((size_t)LDA*LDA*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
AP=(CBLAS_TEST_ZOMPLEX*)malloc(((((size_t)LDA+1)*LDA)/2)*
|
||||
sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
if (uplo == CblasUpper) {
|
||||
for( j=0, k=0; j<*n; j++ )
|
||||
@@ -544,7 +544,7 @@ void F77_ztrmv(int *order, char *uplow, char *transp, char *diagn,
|
||||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA=*n+1;
|
||||
A=(CBLAS_TEST_ZOMPLEX*)malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
A=(CBLAS_TEST_ZOMPLEX*)malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ ) {
|
||||
A[ LDA*i+j ].real=a[ (*lda)*j+i ].real;
|
||||
@@ -573,7 +573,7 @@ void F77_ztrsv(int *order, char *uplow, char *transp, char *diagn,
|
||||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n+1;
|
||||
A =(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
|
||||
A =(CBLAS_TEST_ZOMPLEX* )malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ ) {
|
||||
A[ LDA*i+j ].real=a[ (*lda)*j+i ].real;
|
||||
@@ -601,8 +601,8 @@ void F77_zhpr(int *order, char *uplow, int *n, double *alpha,
|
||||
cblas_zhpr(CblasRowMajor, UNDEFINED, *n, *alpha, x, *incx, ap );
|
||||
else {
|
||||
LDA = *n;
|
||||
A = (CBLAS_TEST_ZOMPLEX* )malloc(LDA*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
|
||||
AP = ( CBLAS_TEST_ZOMPLEX* )malloc( (((LDA+1)*LDA)/2)*
|
||||
A = (CBLAS_TEST_ZOMPLEX* )malloc((size_t)LDA*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
|
||||
AP = ( CBLAS_TEST_ZOMPLEX* )malloc( ((((size_t)LDA+1)*LDA)/2)*
|
||||
sizeof( CBLAS_TEST_ZOMPLEX ));
|
||||
if (uplo == CblasUpper) {
|
||||
for( j=0, k=0; j<*n; j++ )
|
||||
@@ -678,8 +678,8 @@ void F77_zhpr2(int *order, char *uplow, int *n, CBLAS_TEST_ZOMPLEX *alpha,
|
||||
*incy, ap );
|
||||
else {
|
||||
LDA = *n;
|
||||
A=(CBLAS_TEST_ZOMPLEX*)malloc( LDA*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
|
||||
AP=(CBLAS_TEST_ZOMPLEX*)malloc( (((LDA+1)*LDA)/2)*
|
||||
A=(CBLAS_TEST_ZOMPLEX*)malloc( (size_t)LDA*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
|
||||
AP=(CBLAS_TEST_ZOMPLEX*)malloc( ((((size_t)LDA+1)*LDA)/2)*
|
||||
sizeof( CBLAS_TEST_ZOMPLEX ));
|
||||
if (uplo == CblasUpper) {
|
||||
for( j=0, k=0; j<*n; j++ )
|
||||
@@ -750,7 +750,7 @@ void F77_zher(int *order, char *uplow, int *n, double *alpha,
|
||||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n+1;
|
||||
A=(CBLAS_TEST_ZOMPLEX*)malloc((*n)*LDA*sizeof( CBLAS_TEST_ZOMPLEX ));
|
||||
A=(CBLAS_TEST_ZOMPLEX*)malloc((*n)*(size_t)LDA*sizeof( CBLAS_TEST_ZOMPLEX ));
|
||||
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ ) {
|
||||
@@ -784,7 +784,7 @@ void F77_zher2(int *order, char *uplow, int *n, CBLAS_TEST_ZOMPLEX *alpha,
|
||||
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
LDA = *n+1;
|
||||
A= ( CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
|
||||
A= ( CBLAS_TEST_ZOMPLEX* )malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
|
||||
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ ) {
|
||||
|
||||
@@ -26,7 +26,7 @@ void F77_zgemm(int *order, char *transpa, char *transpb, int *m, int *n,
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
if (transa == CblasNoTrans) {
|
||||
LDA = *k+1;
|
||||
A=(CBLAS_TEST_ZOMPLEX*)malloc((*m)*LDA*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
A=(CBLAS_TEST_ZOMPLEX*)malloc((*m)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
for( i=0; i<*m; i++ )
|
||||
for( j=0; j<*k; j++ ) {
|
||||
A[i*LDA+j].real=a[j*(*lda)+i].real;
|
||||
@@ -35,7 +35,7 @@ void F77_zgemm(int *order, char *transpa, char *transpb, int *m, int *n,
|
||||
}
|
||||
else {
|
||||
LDA = *m+1;
|
||||
A=(CBLAS_TEST_ZOMPLEX* )malloc(LDA*(*k)*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
A=(CBLAS_TEST_ZOMPLEX* )malloc((size_t)LDA*(*k)*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
for( i=0; i<*k; i++ )
|
||||
for( j=0; j<*m; j++ ) {
|
||||
A[i*LDA+j].real=a[j*(*lda)+i].real;
|
||||
@@ -45,7 +45,7 @@ void F77_zgemm(int *order, char *transpa, char *transpb, int *m, int *n,
|
||||
|
||||
if (transb == CblasNoTrans) {
|
||||
LDB = *n+1;
|
||||
B=(CBLAS_TEST_ZOMPLEX* )malloc((*k)*LDB*sizeof(CBLAS_TEST_ZOMPLEX) );
|
||||
B=(CBLAS_TEST_ZOMPLEX* )malloc((*k)*(size_t)LDB*sizeof(CBLAS_TEST_ZOMPLEX) );
|
||||
for( i=0; i<*k; i++ )
|
||||
for( j=0; j<*n; j++ ) {
|
||||
B[i*LDB+j].real=b[j*(*ldb)+i].real;
|
||||
@@ -54,7 +54,7 @@ void F77_zgemm(int *order, char *transpa, char *transpb, int *m, int *n,
|
||||
}
|
||||
else {
|
||||
LDB = *k+1;
|
||||
B=(CBLAS_TEST_ZOMPLEX* )malloc(LDB*(*n)*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
B=(CBLAS_TEST_ZOMPLEX* )malloc((size_t)LDB*(*n)*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*k; j++ ) {
|
||||
B[i*LDB+j].real=b[j*(*ldb)+i].real;
|
||||
@@ -63,7 +63,7 @@ void F77_zgemm(int *order, char *transpa, char *transpb, int *m, int *n,
|
||||
}
|
||||
|
||||
LDC = *n+1;
|
||||
C=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDC*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
C=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*(size_t)LDC*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
for( j=0; j<*n; j++ )
|
||||
for( i=0; i<*m; i++ ) {
|
||||
C[i*LDC+j].real=c[j*(*ldc)+i].real;
|
||||
@@ -103,7 +103,7 @@ void F77_zhemm(int *order, char *rtlf, char *uplow, int *m, int *n,
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
if (side == CblasLeft) {
|
||||
LDA = *m+1;
|
||||
A= (CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDA*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
A= (CBLAS_TEST_ZOMPLEX* )malloc((*m)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
for( i=0; i<*m; i++ )
|
||||
for( j=0; j<*m; j++ ) {
|
||||
A[i*LDA+j].real=a[j*(*lda)+i].real;
|
||||
@@ -112,7 +112,7 @@ void F77_zhemm(int *order, char *rtlf, char *uplow, int *m, int *n,
|
||||
}
|
||||
else{
|
||||
LDA = *n+1;
|
||||
A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
|
||||
A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ ) {
|
||||
A[i*LDA+j].real=a[j*(*lda)+i].real;
|
||||
@@ -120,14 +120,14 @@ void F77_zhemm(int *order, char *rtlf, char *uplow, int *m, int *n,
|
||||
}
|
||||
}
|
||||
LDB = *n+1;
|
||||
B=(CBLAS_TEST_ZOMPLEX* )malloc( (*m)*LDB*sizeof(CBLAS_TEST_ZOMPLEX ) );
|
||||
B=(CBLAS_TEST_ZOMPLEX* )malloc( (*m)*(size_t)LDB*sizeof(CBLAS_TEST_ZOMPLEX ) );
|
||||
for( i=0; i<*m; i++ )
|
||||
for( j=0; j<*n; j++ ) {
|
||||
B[i*LDB+j].real=b[j*(*ldb)+i].real;
|
||||
B[i*LDB+j].imag=b[j*(*ldb)+i].imag;
|
||||
}
|
||||
LDC = *n+1;
|
||||
C=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDC*sizeof(CBLAS_TEST_ZOMPLEX ) );
|
||||
C=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*(size_t)LDC*sizeof(CBLAS_TEST_ZOMPLEX ) );
|
||||
for( j=0; j<*n; j++ )
|
||||
for( i=0; i<*m; i++ ) {
|
||||
C[i*LDC+j].real=c[j*(*ldc)+i].real;
|
||||
@@ -167,25 +167,25 @@ void F77_zsymm(int *order, char *rtlf, char *uplow, int *m, int *n,
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
if (side == CblasLeft) {
|
||||
LDA = *m+1;
|
||||
A=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDA*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
A=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
for( i=0; i<*m; i++ )
|
||||
for( j=0; j<*m; j++ )
|
||||
A[i*LDA+j]=a[j*(*lda)+i];
|
||||
}
|
||||
else{
|
||||
LDA = *n+1;
|
||||
A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
|
||||
A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ )
|
||||
A[i*LDA+j]=a[j*(*lda)+i];
|
||||
}
|
||||
LDB = *n+1;
|
||||
B=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDB*sizeof(CBLAS_TEST_ZOMPLEX ));
|
||||
B=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*(size_t)LDB*sizeof(CBLAS_TEST_ZOMPLEX ));
|
||||
for( i=0; i<*m; i++ )
|
||||
for( j=0; j<*n; j++ )
|
||||
B[i*LDB+j]=b[j*(*ldb)+i];
|
||||
LDC = *n+1;
|
||||
C=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDC*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
C=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*(size_t)LDC*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
for( j=0; j<*n; j++ )
|
||||
for( i=0; i<*m; i++ )
|
||||
C[i*LDC+j]=c[j*(*ldc)+i];
|
||||
@@ -221,7 +221,7 @@ void F77_zherk(int *order, char *uplow, char *transp, int *n, int *k,
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
if (trans == CblasNoTrans) {
|
||||
LDA = *k+1;
|
||||
A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
|
||||
A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*k; j++ ) {
|
||||
A[i*LDA+j].real=a[j*(*lda)+i].real;
|
||||
@@ -230,7 +230,7 @@ void F77_zherk(int *order, char *uplow, char *transp, int *n, int *k,
|
||||
}
|
||||
else{
|
||||
LDA = *n+1;
|
||||
A=(CBLAS_TEST_ZOMPLEX* )malloc((*k)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
|
||||
A=(CBLAS_TEST_ZOMPLEX* )malloc((*k)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
|
||||
for( i=0; i<*k; i++ )
|
||||
for( j=0; j<*n; j++ ) {
|
||||
A[i*LDA+j].real=a[j*(*lda)+i].real;
|
||||
@@ -238,7 +238,7 @@ void F77_zherk(int *order, char *uplow, char *transp, int *n, int *k,
|
||||
}
|
||||
}
|
||||
LDC = *n+1;
|
||||
C=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDC*sizeof(CBLAS_TEST_ZOMPLEX ) );
|
||||
C=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*(size_t)LDC*sizeof(CBLAS_TEST_ZOMPLEX ) );
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ ) {
|
||||
C[i*LDC+j].real=c[j*(*ldc)+i].real;
|
||||
@@ -277,7 +277,7 @@ void F77_zsyrk(int *order, char *uplow, char *transp, int *n, int *k,
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
if (trans == CblasNoTrans) {
|
||||
LDA = *k+1;
|
||||
A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*k; j++ ) {
|
||||
A[i*LDA+j].real=a[j*(*lda)+i].real;
|
||||
@@ -286,7 +286,7 @@ void F77_zsyrk(int *order, char *uplow, char *transp, int *n, int *k,
|
||||
}
|
||||
else{
|
||||
LDA = *n+1;
|
||||
A=(CBLAS_TEST_ZOMPLEX* )malloc((*k)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
|
||||
A=(CBLAS_TEST_ZOMPLEX* )malloc((*k)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
|
||||
for( i=0; i<*k; i++ )
|
||||
for( j=0; j<*n; j++ ) {
|
||||
A[i*LDA+j].real=a[j*(*lda)+i].real;
|
||||
@@ -294,7 +294,7 @@ void F77_zsyrk(int *order, char *uplow, char *transp, int *n, int *k,
|
||||
}
|
||||
}
|
||||
LDC = *n+1;
|
||||
C=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDC*sizeof(CBLAS_TEST_ZOMPLEX ) );
|
||||
C=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*(size_t)LDC*sizeof(CBLAS_TEST_ZOMPLEX ) );
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ ) {
|
||||
C[i*LDC+j].real=c[j*(*ldc)+i].real;
|
||||
@@ -333,8 +333,8 @@ void F77_zher2k(int *order, char *uplow, char *transp, int *n, int *k,
|
||||
if (trans == CblasNoTrans) {
|
||||
LDA = *k+1;
|
||||
LDB = *k+1;
|
||||
A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ));
|
||||
B=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDB*sizeof(CBLAS_TEST_ZOMPLEX ));
|
||||
A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX ));
|
||||
B=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*(size_t)LDB*sizeof(CBLAS_TEST_ZOMPLEX ));
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*k; j++ ) {
|
||||
A[i*LDA+j].real=a[j*(*lda)+i].real;
|
||||
@@ -346,8 +346,8 @@ void F77_zher2k(int *order, char *uplow, char *transp, int *n, int *k,
|
||||
else {
|
||||
LDA = *n+1;
|
||||
LDB = *n+1;
|
||||
A=(CBLAS_TEST_ZOMPLEX* )malloc( LDA*(*k)*sizeof(CBLAS_TEST_ZOMPLEX ) );
|
||||
B=(CBLAS_TEST_ZOMPLEX* )malloc( LDB*(*k)*sizeof(CBLAS_TEST_ZOMPLEX ) );
|
||||
A=(CBLAS_TEST_ZOMPLEX* )malloc( (size_t)LDA*(*k)*sizeof(CBLAS_TEST_ZOMPLEX ) );
|
||||
B=(CBLAS_TEST_ZOMPLEX* )malloc( (size_t)LDB*(*k)*sizeof(CBLAS_TEST_ZOMPLEX ) );
|
||||
for( i=0; i<*k; i++ )
|
||||
for( j=0; j<*n; j++ ){
|
||||
A[i*LDA+j].real=a[j*(*lda)+i].real;
|
||||
@@ -357,7 +357,7 @@ void F77_zher2k(int *order, char *uplow, char *transp, int *n, int *k,
|
||||
}
|
||||
}
|
||||
LDC = *n+1;
|
||||
C=(CBLAS_TEST_ZOMPLEX* )malloc( (*n)*LDC*sizeof(CBLAS_TEST_ZOMPLEX ) );
|
||||
C=(CBLAS_TEST_ZOMPLEX* )malloc( (*n)*(size_t)LDC*sizeof(CBLAS_TEST_ZOMPLEX ) );
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ ) {
|
||||
C[i*LDC+j].real=c[j*(*ldc)+i].real;
|
||||
@@ -397,8 +397,8 @@ void F77_zsyr2k(int *order, char *uplow, char *transp, int *n, int *k,
|
||||
if (trans == CblasNoTrans) {
|
||||
LDA = *k+1;
|
||||
LDB = *k+1;
|
||||
A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
B=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDB*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
B=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*(size_t)LDB*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*k; j++ ) {
|
||||
A[i*LDA+j].real=a[j*(*lda)+i].real;
|
||||
@@ -410,8 +410,8 @@ void F77_zsyr2k(int *order, char *uplow, char *transp, int *n, int *k,
|
||||
else {
|
||||
LDA = *n+1;
|
||||
LDB = *n+1;
|
||||
A=(CBLAS_TEST_ZOMPLEX* )malloc(LDA*(*k)*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
B=(CBLAS_TEST_ZOMPLEX* )malloc(LDB*(*k)*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
A=(CBLAS_TEST_ZOMPLEX* )malloc((size_t)LDA*(*k)*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
B=(CBLAS_TEST_ZOMPLEX* )malloc((size_t)LDB*(*k)*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
for( i=0; i<*k; i++ )
|
||||
for( j=0; j<*n; j++ ){
|
||||
A[i*LDA+j].real=a[j*(*lda)+i].real;
|
||||
@@ -421,7 +421,7 @@ void F77_zsyr2k(int *order, char *uplow, char *transp, int *n, int *k,
|
||||
}
|
||||
}
|
||||
LDC = *n+1;
|
||||
C=(CBLAS_TEST_ZOMPLEX* )malloc( (*n)*LDC*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
C=(CBLAS_TEST_ZOMPLEX* )malloc( (*n)*(size_t)LDC*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ ) {
|
||||
C[i*LDC+j].real=c[j*(*ldc)+i].real;
|
||||
@@ -463,7 +463,7 @@ void F77_ztrmm(int *order, char *rtlf, char *uplow, char *transp, char *diagn,
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
if (side == CblasLeft) {
|
||||
LDA = *m+1;
|
||||
A=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDA*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
A=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
for( i=0; i<*m; i++ )
|
||||
for( j=0; j<*m; j++ ) {
|
||||
A[i*LDA+j].real=a[j*(*lda)+i].real;
|
||||
@@ -472,7 +472,7 @@ void F77_ztrmm(int *order, char *rtlf, char *uplow, char *transp, char *diagn,
|
||||
}
|
||||
else{
|
||||
LDA = *n+1;
|
||||
A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ ) {
|
||||
A[i*LDA+j].real=a[j*(*lda)+i].real;
|
||||
@@ -480,7 +480,7 @@ void F77_ztrmm(int *order, char *rtlf, char *uplow, char *transp, char *diagn,
|
||||
}
|
||||
}
|
||||
LDB = *n+1;
|
||||
B=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDB*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
B=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*(size_t)LDB*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
for( i=0; i<*m; i++ )
|
||||
for( j=0; j<*n; j++ ) {
|
||||
B[i*LDB+j].real=b[j*(*ldb)+i].real;
|
||||
@@ -522,7 +522,7 @@ void F77_ztrsm(int *order, char *rtlf, char *uplow, char *transp, char *diagn,
|
||||
if (*order == TEST_ROW_MJR) {
|
||||
if (side == CblasLeft) {
|
||||
LDA = *m+1;
|
||||
A=(CBLAS_TEST_ZOMPLEX* )malloc( (*m)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
|
||||
A=(CBLAS_TEST_ZOMPLEX* )malloc( (*m)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX ) );
|
||||
for( i=0; i<*m; i++ )
|
||||
for( j=0; j<*m; j++ ) {
|
||||
A[i*LDA+j].real=a[j*(*lda)+i].real;
|
||||
@@ -531,7 +531,7 @@ void F77_ztrsm(int *order, char *rtlf, char *uplow, char *transp, char *diagn,
|
||||
}
|
||||
else{
|
||||
LDA = *n+1;
|
||||
A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
for( i=0; i<*n; i++ )
|
||||
for( j=0; j<*n; j++ ) {
|
||||
A[i*LDA+j].real=a[j*(*lda)+i].real;
|
||||
@@ -539,7 +539,7 @@ void F77_ztrsm(int *order, char *rtlf, char *uplow, char *transp, char *diagn,
|
||||
}
|
||||
}
|
||||
LDB = *n+1;
|
||||
B=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDB*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
B=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*(size_t)LDB*sizeof(CBLAS_TEST_ZOMPLEX));
|
||||
for( i=0; i<*m; i++ )
|
||||
for( j=0; j<*n; j++ ) {
|
||||
B[i*LDB+j].real=b[j*(*ldb)+i].real;
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
#include "cblas_test.h"
|
||||
int CBLAS_CallFromC;
|
||||
int RowMajorStrg;
|
||||
|
||||
|
||||
@@ -425,7 +425,7 @@ cgemm_rr.$(SUFFIX) : gemm.c level3.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRR $< -o $(@F)
|
||||
|
||||
cgemm_rc.$(SUFFIX) : gemm.c level3.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRC $< -o $(@F)
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
|
||||
|
||||
cgemm_cn.$(SUFFIX) : gemm.c level3.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $(@F)
|
||||
@@ -473,7 +473,7 @@ zgemm_rr.$(SUFFIX) : gemm.c level3.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRR $< -o $(@F)
|
||||
|
||||
zgemm_rc.$(SUFFIX) : gemm.c level3.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRC $< -o $(@F)
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
|
||||
|
||||
zgemm_cn.$(SUFFIX) : gemm.c level3.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $(@F)
|
||||
@@ -521,7 +521,7 @@ xgemm_rr.$(SUFFIX) : gemm.c level3.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRR $< -o $(@F)
|
||||
|
||||
xgemm_rc.$(SUFFIX) : gemm.c level3.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRC $< -o $(@F)
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
|
||||
|
||||
xgemm_cn.$(SUFFIX) : gemm.c level3.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCN $< -o $(@F)
|
||||
@@ -632,7 +632,7 @@ cgemm_thread_rr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRR $< -o $(@F)
|
||||
|
||||
cgemm_thread_rc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRC $< -o $(@F)
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
|
||||
|
||||
cgemm_thread_cn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCN $< -o $(@F)
|
||||
@@ -680,7 +680,7 @@ zgemm_thread_rr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRR $< -o $(@F)
|
||||
|
||||
zgemm_thread_rc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRC $< -o $(@F)
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
|
||||
|
||||
zgemm_thread_cn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCN $< -o $(@F)
|
||||
@@ -728,7 +728,7 @@ xgemm_thread_rr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRR $< -o $(@F)
|
||||
|
||||
xgemm_thread_rc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRC $< -o $(@F)
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
|
||||
|
||||
xgemm_thread_cn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCN $< -o $(@F)
|
||||
@@ -1895,7 +1895,7 @@ cgemm3m_rr.$(SUFFIX) : gemm3m.c gemm3m_level3.c
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRR $< -o $(@F)
|
||||
|
||||
cgemm3m_rc.$(SUFFIX) : gemm3m.c gemm3m_level3.c
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRC $< -o $(@F)
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
|
||||
|
||||
cgemm3m_cn.$(SUFFIX) : gemm3m.c gemm3m_level3.c
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $(@F)
|
||||
@@ -1943,7 +1943,7 @@ zgemm3m_rr.$(SUFFIX) : gemm3m.c gemm3m_level3.c
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRR $< -o $(@F)
|
||||
|
||||
zgemm3m_rc.$(SUFFIX) : gemm3m.c gemm3m_level3.c
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRC $< -o $(@F)
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
|
||||
|
||||
zgemm3m_cn.$(SUFFIX) : gemm3m.c gemm3m_level3.c
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $(@F)
|
||||
@@ -1991,7 +1991,7 @@ xgemm3m_rr.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRR $< -o $(@F)
|
||||
|
||||
xgemm3m_rc.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRC $< -o $(@F)
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
|
||||
|
||||
xgemm3m_cn.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCN $< -o $(@F)
|
||||
@@ -2048,7 +2048,7 @@ cgemm3m_thread_rr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRR $< -o $(@F)
|
||||
|
||||
cgemm3m_thread_rc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRC $< -o $(@F)
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
|
||||
|
||||
cgemm3m_thread_cn.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCN $< -o $(@F)
|
||||
@@ -2096,7 +2096,7 @@ zgemm3m_thread_rr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRR $< -o $(@F)
|
||||
|
||||
zgemm3m_thread_rc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRC $< -o $(@F)
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
|
||||
|
||||
zgemm3m_thread_cn.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCN $< -o $(@F)
|
||||
@@ -2144,7 +2144,7 @@ xgemm3m_thread_rr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRR $< -o $(@F)
|
||||
|
||||
xgemm3m_thread_rc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRC $< -o $(@F)
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
|
||||
|
||||
xgemm3m_thread_cn.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCN $< -o $(@F)
|
||||
@@ -2817,7 +2817,7 @@ cgemm_rr.$(PSUFFIX) : gemm.c level3.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRR $< -o $(@F)
|
||||
|
||||
cgemm_rc.$(PSUFFIX) : gemm.c level3.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRC $< -o $(@F)
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
|
||||
|
||||
cgemm_cn.$(PSUFFIX) : gemm.c level3.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $(@F)
|
||||
@@ -2865,7 +2865,7 @@ zgemm_rr.$(PSUFFIX) : gemm.c level3.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRR $< -o $(@F)
|
||||
|
||||
zgemm_rc.$(PSUFFIX) : gemm.c level3.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRC $< -o $(@F)
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
|
||||
|
||||
zgemm_cn.$(PSUFFIX) : gemm.c level3.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $(@F)
|
||||
@@ -2913,7 +2913,7 @@ xgemm_rr.$(PSUFFIX) : gemm.c level3.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRR $< -o $(@F)
|
||||
|
||||
xgemm_rc.$(PSUFFIX) : gemm.c level3.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRC $< -o $(@F)
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
|
||||
|
||||
xgemm_cn.$(PSUFFIX) : gemm.c level3.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCN $< -o $(@F)
|
||||
@@ -3025,7 +3025,7 @@ cgemm_thread_rr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRR $< -o $(@F)
|
||||
|
||||
cgemm_thread_rc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRC $< -o $(@F)
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
|
||||
|
||||
cgemm_thread_cn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCN $< -o $(@F)
|
||||
@@ -3073,7 +3073,7 @@ zgemm_thread_rr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRR $< -o $(@F)
|
||||
|
||||
zgemm_thread_rc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRC $< -o $(@F)
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
|
||||
|
||||
zgemm_thread_cn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCN $< -o $(@F)
|
||||
@@ -3121,7 +3121,7 @@ xgemm_thread_rr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRR $< -o $(@F)
|
||||
|
||||
xgemm_thread_rc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRC $< -o $(@F)
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
|
||||
|
||||
xgemm_thread_cn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCN $< -o $(@F)
|
||||
@@ -4288,7 +4288,7 @@ cgemm3m_rr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRR $< -o $(@F)
|
||||
|
||||
cgemm3m_rc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRC $< -o $(@F)
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
|
||||
|
||||
cgemm3m_cn.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $(@F)
|
||||
@@ -4336,7 +4336,7 @@ zgemm3m_rr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRR $< -o $(@F)
|
||||
|
||||
zgemm3m_rc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRC $< -o $(@F)
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
|
||||
|
||||
zgemm3m_cn.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $(@F)
|
||||
@@ -4384,7 +4384,7 @@ xgemm3m_rr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRR $< -o $(@F)
|
||||
|
||||
xgemm3m_rc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRC $< -o $(@F)
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
|
||||
|
||||
xgemm3m_cn.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCN $< -o $(@F)
|
||||
@@ -4441,7 +4441,7 @@ cgemm3m_thread_rr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRR $< -o $(@F)
|
||||
|
||||
cgemm3m_thread_rc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRC $< -o $(@F)
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
|
||||
|
||||
cgemm3m_thread_cn.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCN $< -o $(@F)
|
||||
@@ -4489,7 +4489,7 @@ zgemm3m_thread_rr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRR $< -o $(@F)
|
||||
|
||||
zgemm3m_thread_rc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRC $< -o $(@F)
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
|
||||
|
||||
zgemm3m_thread_cn.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCN $< -o $(@F)
|
||||
@@ -4537,7 +4537,7 @@ xgemm3m_thread_rr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRR $< -o $(@F)
|
||||
|
||||
xgemm3m_thread_rc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRC $< -o $(@F)
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F)
|
||||
|
||||
xgemm3m_thread_cn.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCN $< -o $(@F)
|
||||
|
||||
@@ -292,6 +292,7 @@ extern gotoblas_t gotoblas_COOPERLAKE;
|
||||
#define VENDOR_AMD 2
|
||||
#define VENDOR_CENTAUR 3
|
||||
#define VENDOR_HYGON 4
|
||||
#define VENDOR_ZHAOXIN 5
|
||||
#define VENDOR_UNKNOWN 99
|
||||
|
||||
#define BITMASK(a, b, c) ((((a) >> (b)) & (c)))
|
||||
@@ -404,6 +405,7 @@ static int get_vendor(void){
|
||||
if (!strcmp(vendor.vchar, "GenuineIntel")) return VENDOR_INTEL;
|
||||
if (!strcmp(vendor.vchar, "AuthenticAMD")) return VENDOR_AMD;
|
||||
if (!strcmp(vendor.vchar, "CentaurHauls")) return VENDOR_CENTAUR;
|
||||
if (!strcmp(vendor.vchar, " Shanghai ")) return VENDOR_ZHAOXIN;
|
||||
if (!strcmp(vendor.vchar, "HygonGenuine")) return VENDOR_HYGON;
|
||||
|
||||
if ((eax == 0) || ((eax & 0x500) != 0)) return VENDOR_INTEL;
|
||||
@@ -414,7 +416,7 @@ static int get_vendor(void){
|
||||
static gotoblas_t *get_coretype(void){
|
||||
|
||||
int eax, ebx, ecx, edx;
|
||||
int family, exfamily, model, vendor, exmodel;
|
||||
int family, exfamily, model, vendor, exmodel, stepping;
|
||||
|
||||
cpuid(1, &eax, &ebx, &ecx, &edx);
|
||||
|
||||
@@ -422,6 +424,7 @@ static gotoblas_t *get_coretype(void){
|
||||
exfamily = BITMASK(eax, 20, 0xff);
|
||||
model = BITMASK(eax, 4, 0x0f);
|
||||
exmodel = BITMASK(eax, 16, 0x0f);
|
||||
stepping = BITMASK(eax, 0, 0x0f);
|
||||
|
||||
vendor = get_vendor();
|
||||
|
||||
@@ -621,6 +624,22 @@ static gotoblas_t *get_coretype(void){
|
||||
return &gotoblas_NEHALEM;
|
||||
}
|
||||
}
|
||||
if (model == 10) {
|
||||
// Ice Lake SP
|
||||
if(support_avx512_bf16())
|
||||
return &gotoblas_COOPERLAKE;
|
||||
if (support_avx512())
|
||||
return &gotoblas_SKYLAKEX;
|
||||
if(support_avx2())
|
||||
return &gotoblas_HASWELL;
|
||||
if(support_avx()) {
|
||||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
|
||||
return &gotoblas_SANDYBRIDGE;
|
||||
} else {
|
||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
||||
return &gotoblas_NEHALEM;
|
||||
}
|
||||
}
|
||||
return NULL;
|
||||
case 7:
|
||||
if (model == 10) // Goldmont Plus
|
||||
@@ -807,10 +826,19 @@ static gotoblas_t *get_coretype(void){
|
||||
if (vendor == VENDOR_CENTAUR) {
|
||||
switch (family) {
|
||||
case 0x6:
|
||||
return &gotoblas_NANO;
|
||||
if (model == 0xf && stepping < 0xe)
|
||||
return &gotoblas_NANO;
|
||||
return &gotoblas_NEHALEM;
|
||||
default:
|
||||
if (family >= 0x7)
|
||||
return &gotoblas_NEHALEM;
|
||||
}
|
||||
}
|
||||
|
||||
if (vendor == VENDOR_ZHAOXIN) {
|
||||
return &gotoblas_NEHALEM;
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
||||
@@ -99,6 +99,11 @@ extern gotoblas_t gotoblas_NEOVERSEN1;
|
||||
#else
|
||||
#define gotoblas_NEOVERSEN1 gotoblas_ARMV8
|
||||
#endif
|
||||
#ifdef DYN_CORTEX_A55
|
||||
extern gotoblas_t gotoblas_CORTEXA55;
|
||||
#else
|
||||
#define gotoblas_CORTEXA55 gotoblas_ARMV8
|
||||
#endif
|
||||
#else
|
||||
extern gotoblas_t gotoblas_CORTEXA53;
|
||||
extern gotoblas_t gotoblas_CORTEXA57;
|
||||
@@ -111,11 +116,12 @@ extern gotoblas_t gotoblas_TSV110;
|
||||
extern gotoblas_t gotoblas_EMAG8180;
|
||||
extern gotoblas_t gotoblas_NEOVERSEN1;
|
||||
extern gotoblas_t gotoblas_THUNDERX3T110;
|
||||
extern gotoblas_t gotoblas_CORTEXA55;
|
||||
#endif
|
||||
|
||||
extern void openblas_warning(int verbose, const char * msg);
|
||||
|
||||
#define NUM_CORETYPES 12
|
||||
#define NUM_CORETYPES 13
|
||||
|
||||
/*
|
||||
* In case asm/hwcap.h is outdated on the build system, make sure
|
||||
@@ -126,7 +132,7 @@ extern void openblas_warning(int verbose, const char * msg);
|
||||
#endif
|
||||
|
||||
#define get_cpu_ftr(id, var) ({ \
|
||||
__asm__ ("mrs %0, "#id : "=r" (var)); \
|
||||
__asm__ __volatile__ ("mrs %0, "#id : "=r" (var)); \
|
||||
})
|
||||
|
||||
static char *corename[] = {
|
||||
@@ -142,6 +148,7 @@ static char *corename[] = {
|
||||
"emag8180",
|
||||
"neoversen1",
|
||||
"thunderx3t110",
|
||||
"cortexa55",
|
||||
"unknown"
|
||||
};
|
||||
|
||||
@@ -158,6 +165,7 @@ char *gotoblas_corename(void) {
|
||||
if (gotoblas == &gotoblas_EMAG8180) return corename[ 9];
|
||||
if (gotoblas == &gotoblas_NEOVERSEN1) return corename[10];
|
||||
if (gotoblas == &gotoblas_THUNDERX3T110) return corename[11];
|
||||
if (gotoblas == &gotoblas_CORTEXA55) return corename[12];
|
||||
return corename[NUM_CORETYPES];
|
||||
}
|
||||
|
||||
@@ -189,6 +197,7 @@ static gotoblas_t *force_coretype(char *coretype) {
|
||||
case 9: return (&gotoblas_EMAG8180);
|
||||
case 10: return (&gotoblas_NEOVERSEN1);
|
||||
case 11: return (&gotoblas_THUNDERX3T110);
|
||||
case 12: return (&gotoblas_CORTEXA55);
|
||||
}
|
||||
snprintf(message, 128, "Core not found: %s\n", coretype);
|
||||
openblas_warning(1, message);
|
||||
@@ -247,6 +256,8 @@ static gotoblas_t *get_coretype(void) {
|
||||
return &gotoblas_CORTEXA73;
|
||||
case 0xd0c: // Neoverse N1
|
||||
return &gotoblas_NEOVERSEN1;
|
||||
case 0xd05: // Cortex A55
|
||||
return &gotoblas_CORTEXA55;
|
||||
}
|
||||
break;
|
||||
case 0x42: // Broadcom
|
||||
|
||||
@@ -428,7 +428,7 @@ extern int openblas_goto_num_threads_env();
|
||||
extern int openblas_omp_num_threads_env();
|
||||
|
||||
int blas_get_cpu_number(void){
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU)
|
||||
int max_num;
|
||||
#endif
|
||||
int blas_goto_num = 0;
|
||||
@@ -436,7 +436,7 @@ int blas_get_cpu_number(void){
|
||||
|
||||
if (blas_num_threads) return blas_num_threads;
|
||||
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU)
|
||||
max_num = get_num_procs();
|
||||
#endif
|
||||
|
||||
@@ -460,7 +460,7 @@ int blas_get_cpu_number(void){
|
||||
else if (blas_omp_num > 0) blas_num_threads = blas_omp_num;
|
||||
else blas_num_threads = MAX_CPU_NUMBER;
|
||||
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU)
|
||||
if (blas_num_threads > max_num) blas_num_threads = max_num;
|
||||
#endif
|
||||
|
||||
@@ -1291,7 +1291,12 @@ UNLOCK_COMMAND(&alloc_lock);
|
||||
return (void *)(((char *)alloc_info) + sizeof(struct alloc_t));
|
||||
|
||||
error:
|
||||
printf("OpenBLAS : Program will terminate because you tried to allocate too many memory regions.\n");
|
||||
printf("OpenBLAS : Program will terminate because you tried to allocate too many TLS memory regions.\n");
|
||||
printf("This library was built to support a maximum of %d threads - either rebuild OpenBLAS\n", NUM_BUFFERS);
|
||||
printf("with a larger NUM_THREADS value or set the environment variable OPENBLAS_NUM_THREADS to\n");
|
||||
printf("a sufficiently small number. This error typically occurs when the software that relies on\n");
|
||||
printf("OpenBLAS calls BLAS functions from many threads in parallel, or when your computer has more\n");
|
||||
printf("cpu cores than what OpenBLAS was configured to handle.\n");
|
||||
|
||||
return NULL;
|
||||
}
|
||||
@@ -1702,7 +1707,6 @@ inline int atoi(const char *str) { return 0; }
|
||||
#include <sys/sysinfo.h>
|
||||
#include <sched.h>
|
||||
#include <errno.h>
|
||||
#include <linux/unistd.h>
|
||||
#include <sys/syscall.h>
|
||||
#include <sys/time.h>
|
||||
#include <sys/resource.h>
|
||||
@@ -1980,7 +1984,7 @@ extern int openblas_goto_num_threads_env();
|
||||
extern int openblas_omp_num_threads_env();
|
||||
|
||||
int blas_get_cpu_number(void){
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU)
|
||||
int max_num;
|
||||
#endif
|
||||
int blas_goto_num = 0;
|
||||
@@ -1988,7 +1992,7 @@ int blas_get_cpu_number(void){
|
||||
|
||||
if (blas_num_threads) return blas_num_threads;
|
||||
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU)
|
||||
max_num = get_num_procs();
|
||||
#endif
|
||||
|
||||
@@ -2012,7 +2016,7 @@ int blas_get_cpu_number(void){
|
||||
else if (blas_omp_num > 0) blas_num_threads = blas_omp_num;
|
||||
else blas_num_threads = MAX_CPU_NUMBER;
|
||||
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU)
|
||||
if (blas_num_threads > max_num) blas_num_threads = max_num;
|
||||
#endif
|
||||
|
||||
@@ -2879,8 +2883,12 @@ void *blas_memory_alloc(int procpos){
|
||||
return (void *)memory[position].addr;
|
||||
|
||||
error:
|
||||
printf("BLAS : Program is Terminated. Because you tried to allocate too many memory regions.\n");
|
||||
|
||||
printf("OpenBLAS : Program is Terminated. Because you tried to allocate too many memory regions.\n");
|
||||
printf("This library was built to support a maximum of %d threads - either rebuild OpenBLAS\n", NUM_BUFFERS);
|
||||
printf("with a larger NUM_THREADS value or set the environment variable OPENBLAS_NUM_THREADS to\n");
|
||||
printf("a sufficiently small number. This error typically occurs when the software that relies on\n");
|
||||
printf("OpenBLAS calls BLAS functions from many threads in parallel, or when your computer has more\n");
|
||||
printf("cpu cores than what OpenBLAS was configured to handle.\n");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
||||
@@ -139,9 +139,13 @@ endif
|
||||
ifneq (,$(filter 1 2,$(NOFORTRAN)))
|
||||
#only build without Fortran
|
||||
$(CC) $(CFLAGS) $(LDFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(INTERNALNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
|
||||
else
|
||||
ifeq ($(F_COMPILER), INTEL)
|
||||
$(FC) $(FFLAGS) $(LDFLAGS) -all-load -headerpad_max_install_names -install_name "$(CURDIR)/../$(INTERNALNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def
|
||||
else
|
||||
$(FC) $(FFLAGS) $(LDFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(INTERNALNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
|
||||
endif
|
||||
endif
|
||||
|
||||
dllinit.$(SUFFIX) : dllinit.c
|
||||
$(CC) $(CFLAGS) -c -o $(@F) -s $<
|
||||
|
||||
18
f_check
18
f_check
@@ -314,11 +314,11 @@ if ($link ne "") {
|
||||
|
||||
$link =~ s/\-Y\sP\,/\-Y/g;
|
||||
|
||||
$link =~ s/\-R\s*/\-rpath\@/g;
|
||||
$link =~ s/\-R\s*/\-rpath\%/g;
|
||||
|
||||
$link =~ s/\-rpath\s+/\-rpath\@/g;
|
||||
$link =~ s/\-rpath\s+/\-rpath\%/g;
|
||||
|
||||
$link =~ s/\-rpath-link\s+/\-rpath-link\@/g;
|
||||
$link =~ s/\-rpath-link\s+/\-rpath-link\%/g;
|
||||
|
||||
@flags = split(/[\s\,\n]/, $link);
|
||||
# remove leading and trailing quotes from each flag.
|
||||
@@ -344,13 +344,13 @@ if ($link ne "") {
|
||||
}
|
||||
|
||||
|
||||
if ($flags =~ /^\-rpath\@/) {
|
||||
$flags =~ s/\@/\,/g;
|
||||
if ($flags =~ /^\-rpath\%/) {
|
||||
$flags =~ s/\%/\,/g;
|
||||
$linker_L .= "-Wl,". $flags . " " ;
|
||||
}
|
||||
|
||||
if ($flags =~ /^\-rpath-link\@/) {
|
||||
$flags =~ s/\@/\,/g;
|
||||
if ($flags =~ /^\-rpath-link\%/) {
|
||||
$flags =~ s/\%/\,/g;
|
||||
$linker_L .= "-Wl,". $flags . " " ;
|
||||
}
|
||||
if ($flags =~ /-lgomp/ && $ENV{"CC"} =~ /clang/) {
|
||||
@@ -391,10 +391,6 @@ if ($link ne "") {
|
||||
|
||||
}
|
||||
|
||||
if ($vendor eq "INTEL"){
|
||||
$linker_a .= "-lgfortran"
|
||||
}
|
||||
|
||||
if ($vendor eq "FLANG"){
|
||||
$linker_a .= "-lflang"
|
||||
}
|
||||
|
||||
39
getarch.c
39
getarch.c
@@ -142,6 +142,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
/* #define FORCE_SICORTEX */
|
||||
/* #define FORCE_LOONGSON3R3 */
|
||||
/* #define FORCE_LOONGSON3R4 */
|
||||
/* #define FORCE_LOONGSON3R5 */
|
||||
/* #define FORCE_I6400 */
|
||||
/* #define FORCE_P6600 */
|
||||
/* #define FORCE_P5600 */
|
||||
@@ -842,6 +843,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#else
|
||||
#endif
|
||||
|
||||
#ifdef FORCE_LOONGSON3R5
|
||||
#define FORCE
|
||||
#define ARCHITECTURE "LOONGARCH"
|
||||
#define SUBARCHITECTURE "LOONGSON3R5"
|
||||
#define SUBDIRNAME "loongarch64"
|
||||
#define ARCHCONFIG "-DLOONGSON3R5 " \
|
||||
"-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \
|
||||
"-DL2_SIZE=1048576 -DL2_LINESIZE=64 " \
|
||||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=16 "
|
||||
#define LIBNAME "loongson3r5"
|
||||
#define CORENAME "LOONGSON3R5"
|
||||
#else
|
||||
#endif
|
||||
|
||||
#ifdef FORCE_I6400
|
||||
#define FORCE
|
||||
#define ARCHITECTURE "MIPS"
|
||||
@@ -1159,6 +1174,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#else
|
||||
#endif
|
||||
|
||||
#ifdef FORCE_CORTEXA55
|
||||
#define FORCE
|
||||
#define ARCHITECTURE "ARM64"
|
||||
#define SUBARCHITECTURE "CORTEXA55"
|
||||
#define SUBDIRNAME "arm64"
|
||||
#define ARCHCONFIG "-DCORTEXA55 " \
|
||||
"-DL1_CODE_SIZE=16384 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=3 " \
|
||||
"-DL1_DATA_SIZE=16384 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=2 " \
|
||||
"-DL2_SIZE=65536 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \
|
||||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
|
||||
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
|
||||
#define LIBNAME "cortexa55"
|
||||
#define CORENAME "CORTEXA55"
|
||||
#else
|
||||
#endif
|
||||
|
||||
#ifdef FORCE_FALKOR
|
||||
#define FORCE
|
||||
@@ -1373,6 +1403,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#define OPENBLAS_SUPPORTED
|
||||
#endif
|
||||
|
||||
#ifdef __loongarch64
|
||||
#include "cpuid_loongarch64.c"
|
||||
#define OPENBLAS_SUPPORTED
|
||||
#endif
|
||||
|
||||
#ifdef __riscv
|
||||
#include "cpuid_riscv64.c"
|
||||
#define OPENBLAS_SUPPORTED
|
||||
@@ -1448,7 +1483,7 @@ int main(int argc, char *argv[]){
|
||||
#ifdef FORCE
|
||||
printf("CORE=%s\n", CORENAME);
|
||||
#else
|
||||
#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc)
|
||||
#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__)
|
||||
printf("CORE=%s\n", get_corename());
|
||||
#endif
|
||||
#endif
|
||||
@@ -1596,7 +1631,7 @@ printf("ELF_VERSION=2\n");
|
||||
#ifdef FORCE
|
||||
printf("#define CHAR_CORENAME \"%s\"\n", CORENAME);
|
||||
#else
|
||||
#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc)
|
||||
#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__)
|
||||
printf("#define CHAR_CORENAME \"%s\"\n", get_corename());
|
||||
#endif
|
||||
#endif
|
||||
|
||||
@@ -49,6 +49,8 @@
|
||||
#define ERROR_NAME "QGEMM "
|
||||
#elif defined(DOUBLE)
|
||||
#define ERROR_NAME "DGEMM "
|
||||
#elif defined(BFLOAT16)
|
||||
#define ERROR_NAME "SBGEMM "
|
||||
#else
|
||||
#define ERROR_NAME "SGEMM "
|
||||
#endif
|
||||
@@ -124,6 +126,7 @@ void NAME(char *TRANSA, char *TRANSB,
|
||||
|
||||
#ifdef SMP
|
||||
double MNK;
|
||||
#if defined(USE_SIMPLE_THREADED_LEVEL3) || !defined(NO_AFFINITY)
|
||||
#ifndef COMPLEX
|
||||
#ifdef XDOUBLE
|
||||
int mode = BLAS_XDOUBLE | BLAS_REAL;
|
||||
@@ -142,6 +145,7 @@ void NAME(char *TRANSA, char *TRANSB,
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(SMP) && !defined(NO_AFFINITY) && !defined(USE_SIMPLE_THREADED_LEVEL3)
|
||||
int nodes;
|
||||
|
||||
@@ -201,7 +201,14 @@ void CNAME(enum CBLAS_ORDER order,
|
||||
if (beta != ONE) SCAL_K(leny, 0, 0, beta, y, blasabs(incy), NULL, 0, NULL, 0);
|
||||
|
||||
if (alpha == ZERO) return;
|
||||
|
||||
|
||||
#if 0
|
||||
/* this optimization causes stack corruption on x86_64 under OSX, Windows and FreeBSD */
|
||||
if (trans == 0 && incx == 1 && incy == 1 && m*n < 2304 *GEMM_MULTITHREAD_THRESHOLD) {
|
||||
GEMV_N(m, n, 0, alpha, a, lda, x, incx, y, incy, NULL);
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
IDEBUG_START;
|
||||
|
||||
FUNCTION_PROFILE_START();
|
||||
|
||||
@@ -164,6 +164,11 @@ void CNAME(enum CBLAS_ORDER order,
|
||||
if (m == 0 || n == 0) return;
|
||||
if (alpha == 0.) return;
|
||||
|
||||
if (incx == 1 && incy == 1 && 1L*m*n <= 2048 *GEMM_MULTITHREAD_THRESHOLD) {
|
||||
GER(m, n, 0, alpha, x, incx, y, incy, a, lda, NULL);
|
||||
return;
|
||||
}
|
||||
|
||||
IDEBUG_START;
|
||||
|
||||
FUNCTION_PROFILE_START();
|
||||
|
||||
@@ -150,9 +150,9 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows,
|
||||
#endif
|
||||
|
||||
if ( *lda > *ldb )
|
||||
msize = (*lda) * (*ldb) * sizeof(FLOAT);
|
||||
msize = (size_t)(*lda) * (*ldb) * sizeof(FLOAT);
|
||||
else
|
||||
msize = (*ldb) * (*ldb) * sizeof(FLOAT);
|
||||
msize = (size_t)(*ldb) * (*ldb) * sizeof(FLOAT);
|
||||
|
||||
b = malloc(msize);
|
||||
if ( b == NULL )
|
||||
|
||||
@@ -95,7 +95,14 @@ int NAME(blasint *M, blasint *N, FLOAT *a, blasint *ldA, blasint *ipiv, blasint
|
||||
|
||||
#ifdef SMP
|
||||
args.common = NULL;
|
||||
args.nthreads = num_cpu_avail(4);
|
||||
#ifndef DOUBLE
|
||||
if (args.m*args.n < 40000)
|
||||
#else
|
||||
if (args.m*args.n < 10000)
|
||||
#endif
|
||||
args.nthreads=1;
|
||||
else
|
||||
args.nthreads = num_cpu_avail(4);
|
||||
|
||||
if (args.nthreads == 1) {
|
||||
#endif
|
||||
|
||||
@@ -112,6 +112,13 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){
|
||||
|
||||
#ifdef SMP
|
||||
args.common = NULL;
|
||||
#ifndef DOUBLE
|
||||
if (args.n <128)
|
||||
#else
|
||||
if (args.n <64)
|
||||
#endif
|
||||
args.nthreads = 1;
|
||||
else
|
||||
args.nthreads = num_cpu_avail(4);
|
||||
|
||||
if (args.nthreads == 1) {
|
||||
|
||||
@@ -121,6 +121,9 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){
|
||||
|
||||
#ifdef SMP
|
||||
args.common = NULL;
|
||||
if (args.n < 180)
|
||||
args.nthreads = 1;
|
||||
else
|
||||
args.nthreads = num_cpu_avail(4);
|
||||
|
||||
if (args.nthreads == 1) {
|
||||
|
||||
@@ -95,7 +95,10 @@ int NAME(blasint *M, blasint *N, FLOAT *a, blasint *ldA, blasint *ipiv, blasint
|
||||
|
||||
#ifdef SMP
|
||||
args.common = NULL;
|
||||
args.nthreads = num_cpu_avail(4);
|
||||
if (args.m*args.n <10000)
|
||||
args.nthreads = 1;
|
||||
else
|
||||
args.nthreads = num_cpu_avail(4);
|
||||
|
||||
if (args.nthreads == 1) {
|
||||
#endif
|
||||
|
||||
@@ -112,6 +112,13 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){
|
||||
|
||||
#ifdef SMP
|
||||
args.common = NULL;
|
||||
#ifndef DOUBLE
|
||||
if (args.n < 64)
|
||||
#else
|
||||
if (args.n < 64)
|
||||
#endif
|
||||
args.nthreads = 1;
|
||||
else
|
||||
args.nthreads = num_cpu_avail(4);
|
||||
|
||||
if (args.nthreads == 1) {
|
||||
|
||||
@@ -121,6 +121,15 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){
|
||||
|
||||
#ifdef SMP
|
||||
args.nthreads = num_cpu_avail(4);
|
||||
#ifndef DOUBLE
|
||||
if (args.n < 200)
|
||||
#else
|
||||
if (args.n < 150)
|
||||
#endif
|
||||
args.nthreads=1;
|
||||
else
|
||||
#endif
|
||||
args.nthreads = num_cpu_avail(4);
|
||||
|
||||
if (args.nthreads == 1) {
|
||||
#endif
|
||||
|
||||
@@ -167,6 +167,26 @@ void CNAME(enum CBLAS_ORDER order,
|
||||
|
||||
FUNCTION_PROFILE_START();
|
||||
|
||||
if (incx == 1 && n <100) {
|
||||
blasint i;
|
||||
if (uplo==0) {
|
||||
for (i = 0; i < n; i++){
|
||||
if (x[i] != ZERO) {
|
||||
AXPYU_K(i + 1, 0, 0, alpha * x[i], x, 1, a, 1, NULL, 0);
|
||||
}
|
||||
a += i + 1;
|
||||
}
|
||||
} else {
|
||||
for (i = 0; i < n; i++){
|
||||
if (x[i] != ZERO) {
|
||||
AXPYU_K(n - i, 0, 0, alpha * x[i], x + i, 1, a, 1, NULL, 0);
|
||||
}
|
||||
a += n - i;
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
if (incx < 0 ) x -= (n - 1) * incx;
|
||||
|
||||
buffer = (FLOAT *)blas_memory_alloc(1);
|
||||
|
||||
@@ -168,6 +168,24 @@ void CNAME(enum CBLAS_ORDER order,
|
||||
|
||||
if (alpha == ZERO) return;
|
||||
|
||||
if (incx == 1 && incy == 1 && n < 50) {
|
||||
blasint i;
|
||||
if (!uplo) {
|
||||
for (i = 0; i < n; i++){
|
||||
AXPYU_K(i + 1, 0, 0, alpha * x[i], y, 1, a, 1, NULL, 0);
|
||||
AXPYU_K(i + 1, 0, 0, alpha * y[i], x, 1, a, 1, NULL, 0);
|
||||
a += i + 1;
|
||||
}
|
||||
} else {
|
||||
for (i = 0; i < n; i++){
|
||||
AXPYU_K(n - i, 0, 0, alpha * x[i], y + i, 1, a, 1, NULL, 0);
|
||||
AXPYU_K(n - i, 0, 0, alpha * y[i], x + i, 1, a, 1, NULL, 0);
|
||||
a += n - i;
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
IDEBUG_START;
|
||||
|
||||
FUNCTION_PROFILE_START();
|
||||
|
||||
@@ -168,7 +168,28 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha,
|
||||
IDEBUG_START;
|
||||
|
||||
FUNCTION_PROFILE_START();
|
||||
#if 1
|
||||
if (incx == 1 && n < 100) {
|
||||
BLASLONG i;
|
||||
|
||||
if (uplo == 0) {
|
||||
for (i = 0; i < n; i++){
|
||||
if (x[i] != ZERO) {
|
||||
AXPYU_K(i + 1, 0, 0, alpha * x[i], x, 1, a, 1, NULL, 0);
|
||||
}
|
||||
a += lda;
|
||||
}
|
||||
} else {
|
||||
for (i = 0; i < n; i++){
|
||||
if (x[i] != ZERO) {
|
||||
AXPYU_K(n - i, 0, 0, alpha * x[i], x + i, 1, a, 1, NULL, 0);
|
||||
}
|
||||
a += 1 + lda;
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
if (incx < 0 ) x -= (n - 1) * incx;
|
||||
|
||||
buffer = (FLOAT *)blas_memory_alloc(1);
|
||||
|
||||
@@ -170,6 +170,25 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha,
|
||||
|
||||
IDEBUG_START;
|
||||
|
||||
if (incx == 1 && incy == 1 && n < 100) {
|
||||
blasint i;
|
||||
if (!uplo) {
|
||||
for (i = 0; i < n; i++){
|
||||
AXPYU_K(i + 1, 0, 0, alpha * x[i], y, 1, a, 1, NULL, 0);
|
||||
AXPYU_K(i + 1, 0, 0, alpha * y[i], x, 1, a, 1, NULL, 0);
|
||||
a += lda;
|
||||
}
|
||||
} else {
|
||||
for (i = 0; i < n; i++){
|
||||
AXPYU_K(n - i, 0, 0, alpha * x[i], y + i, 1, a, 1, NULL, 0);
|
||||
AXPYU_K(n - i, 0, 0, alpha * y[i], x + i, 1, a, 1, NULL, 0);
|
||||
a += 1 + lda;
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
FUNCTION_PROFILE_START();
|
||||
|
||||
if (incx < 0 ) x -= (n - 1) * incx;
|
||||
|
||||
@@ -354,6 +354,17 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Tr
|
||||
#endif
|
||||
|
||||
args.common = NULL;
|
||||
#ifndef COMPLEX
|
||||
#ifdef DOUBLE
|
||||
if (args.n < 100)
|
||||
#else
|
||||
if (args.n < 200)
|
||||
#endif
|
||||
#else
|
||||
if (args.n < 65)
|
||||
#endif
|
||||
args.nthreads = 1;
|
||||
else
|
||||
args.nthreads = num_cpu_avail(3);
|
||||
|
||||
if (args.nthreads == 1) {
|
||||
|
||||
@@ -188,6 +188,12 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
||||
|
||||
if (n == 0) return;
|
||||
|
||||
if (incx == 1 && trans == 0 && n < 50) {
|
||||
buffer = NULL;
|
||||
(trsv[(trans<<2) | (uplo<<1) | unit])(n, a, lda, x, incx, buffer);
|
||||
return;
|
||||
}
|
||||
|
||||
IDEBUG_START;
|
||||
|
||||
FUNCTION_PROFILE_START();
|
||||
|
||||
@@ -172,9 +172,9 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows,
|
||||
#endif
|
||||
|
||||
if ( *lda > *ldb )
|
||||
msize = (*lda) * (*ldb) * sizeof(FLOAT) * 2;
|
||||
msize = (size_t)(*lda) * (*ldb) * sizeof(FLOAT) * 2;
|
||||
else
|
||||
msize = (*ldb) * (*ldb) * sizeof(FLOAT) * 2;
|
||||
msize = (size_t)(*ldb) * (*ldb) * sizeof(FLOAT) * 2;
|
||||
|
||||
b = malloc(msize);
|
||||
if ( b == NULL )
|
||||
|
||||
@@ -79,8 +79,12 @@ void NAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){
|
||||
aa_i = fabs(da_r);
|
||||
}
|
||||
|
||||
scale = (aa_i / aa_r);
|
||||
ada = aa_r * sqrt(ONE + scale * scale);
|
||||
if (aa_r == ZERO) {
|
||||
ada = 0.;
|
||||
} else {
|
||||
scale = (aa_i / aa_r);
|
||||
ada = aa_r * sqrt(ONE + scale * scale);
|
||||
}
|
||||
|
||||
bb_r = fabs(db_r);
|
||||
bb_i = fabs(db_i);
|
||||
@@ -90,9 +94,12 @@ void NAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){
|
||||
bb_i = fabs(bb_r);
|
||||
}
|
||||
|
||||
scale = (bb_i / bb_r);
|
||||
adb = bb_r * sqrt(ONE + scale * scale);
|
||||
|
||||
if (bb_r == ZERO) {
|
||||
adb = 0.;
|
||||
} else {
|
||||
scale = (bb_i / bb_r);
|
||||
adb = bb_r * sqrt(ONE + scale * scale);
|
||||
}
|
||||
scale = ada + adb;
|
||||
|
||||
aa_r = da_r / scale;
|
||||
|
||||
@@ -172,6 +172,32 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, int n, FLOAT alpha, FLO
|
||||
|
||||
if ((alpha_r == ZERO) && (alpha_i == ZERO)) return;
|
||||
|
||||
if (incx == 1 && n < 50) {
|
||||
blasint i;
|
||||
if (!uplo) {
|
||||
for (i = 0; i < n; i++){
|
||||
if ((x[i * 2 + 0] != ZERO) || (x[i * 2 + 1] != ZERO)) {
|
||||
AXPYU_K(i + 1, 0, 0,
|
||||
alpha_r * x[i * 2 + 0] - alpha_i * x[i * 2 + 1],
|
||||
alpha_i * x[i * 2 + 0] + alpha_r * x[i * 2 + 1],
|
||||
x, 1, a, 1, NULL, 0);
|
||||
}
|
||||
a += lda;
|
||||
}
|
||||
} else {
|
||||
for (i = 0; i < n; i++){
|
||||
if ((x[i * 2 + 0] != ZERO) || (x[i * 2 + 1] != ZERO)) {
|
||||
AXPYU_K(n - i, 0, 0,
|
||||
alpha_r * x[i * 2 + 0] - alpha_i * x[i * 2 + 1],
|
||||
alpha_i * x[i * 2 + 0] + alpha_r * x[i * 2 + 1],
|
||||
x + i * 2, 1, a, 1, NULL, 0);
|
||||
}
|
||||
a += 2 + lda;
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
IDEBUG_START;
|
||||
|
||||
FUNCTION_PROFILE_START();
|
||||
|
||||
@@ -199,6 +199,12 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
||||
|
||||
if (n == 0) return;
|
||||
|
||||
if (incx == 1 && trans == 0 && n < 50) {
|
||||
buffer = NULL;
|
||||
(trsv[(trans<<2) | (uplo<<1) | unit])(n, a, lda, x, incx, buffer);
|
||||
return;
|
||||
}
|
||||
|
||||
IDEBUG_START;
|
||||
|
||||
FUNCTION_PROFILE_START();
|
||||
|
||||
@@ -1,3 +1,10 @@
|
||||
FMAFLAG=
|
||||
ifndef OLDGCC
|
||||
ifdef HAVE_FMA3
|
||||
FMAFLAG = -mfma
|
||||
endif
|
||||
endif
|
||||
|
||||
### GEMV ###
|
||||
|
||||
ifndef SGEMVNKERNEL
|
||||
@@ -263,7 +270,7 @@ $(KDIR)dgemv_n$(TSUFFIX).$(SUFFIX) $(KDIR)dgemv_n$(TSUFFIX).$(PSUFFIX) : $(KER
|
||||
$(CC) -c $(CFLAGS) -DDOUBLE -UCOMPLEX -UTRANS $< -o $@
|
||||
|
||||
$(KDIR)dgemv_t$(TSUFFIX).$(SUFFIX) $(KDIR)dgemv_t$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMVTKERNEL) $(TOPDIR)/common.h $(GEMVDEP)
|
||||
$(CC) -c $(CFLAGS) -DDOUBLE -UCOMPLEX -DTRANS $< -o $@
|
||||
$(CC) -c $(CFLAGS) $(FMAFLAG) -DDOUBLE -UCOMPLEX -DTRANS $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)qgemv_n$(TSUFFIX).$(SUFFIX) $(KDIR)qgemv_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QGEMVNKERNEL)
|
||||
|
||||
@@ -818,6 +818,8 @@ ifeq ($(OS), AIX)
|
||||
m4 zgemm_kernel_n.s > zgemm_kernel_n_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN zgemm_kernel_n_nomacros.s -o $@
|
||||
rm zgemm_kernel_n.s zgemm_kernel_n_nomacros.s
|
||||
else ifeq ($(CORE),SANDYBRIDGE)
|
||||
$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DDOUBLE -DCOMPLEX -DNN $< -o $@
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN $< -o $@
|
||||
endif
|
||||
@@ -828,6 +830,8 @@ ifeq ($(OS), AIX)
|
||||
m4 zgemm_kernel_l.s > zgemm_kernel_l_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN zgemm_kernel_l_nomacros.s -o $@
|
||||
rm zgemm_kernel_l.s zgemm_kernel_l_nomacros.s
|
||||
else ifeq ($(CORE),SANDYBRIDGE)
|
||||
$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DDOUBLE -DCOMPLEX -DCN $< -o $@
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $@
|
||||
endif
|
||||
@@ -838,6 +842,8 @@ ifeq ($(OS), AIX)
|
||||
m4 zgemm_kernel_r.s > zgemm_kernel_r_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC zgemm_kernel_r_nomacros.s -o $@
|
||||
rm zgemm_kernel_r.s zgemm_kernel_r_nomacros.s
|
||||
else ifeq ($(CORE),SANDYBRIDGE)
|
||||
$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DDOUBLE -DCOMPLEX -DNC $< -o $@
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC $< -o $@
|
||||
endif
|
||||
@@ -848,6 +854,8 @@ ifeq ($(OS), AIX)
|
||||
m4 zgemm_kernel_b.s > zgemm_kernel_b_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC zgemm_kernel_b_nomacros.s -o $@
|
||||
rm zgemm_kernel_b.s zgemm_kernel_b_nomacros.s
|
||||
else ifeq ($(CORE),SANDYBRIDGE)
|
||||
$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DDOUBLE -DCOMPLEX -DCC $< -o $@
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $@
|
||||
endif
|
||||
@@ -1044,6 +1052,8 @@ ifeq ($(OS), AIX)
|
||||
m4 ztrmm_kernel_ln.s > ztrmm_kernel_ln_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN ztrmm_kernel_ln_nomacros.s -o $@
|
||||
rm ztrmm_kernel_ln.s ztrmm_kernel_ln_nomacros.s
|
||||
else ifeq ($(CORE), SANDYBRIDGE)
|
||||
$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@
|
||||
endif
|
||||
@@ -1054,6 +1064,8 @@ ifeq ($(OS), AIX)
|
||||
m4 ztrmm_kernel_lt.s > ztrmm_kernel_lt_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN ztrmm_kernel_lt_nomacros.s -o $@
|
||||
rm ztrmm_kernel_lt.s ztrmm_kernel_lt_nomacros.s
|
||||
else ifeq ($(CORE), SANDYBRIDGE)
|
||||
$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@
|
||||
endif
|
||||
@@ -1064,6 +1076,8 @@ ifeq ($(OS), AIX)
|
||||
m4 ztrmm_kernel_lr.s > ztrmm_kernel_lr_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN ztrmm_kernel_lr_nomacros.s -o $@
|
||||
rm ztrmm_kernel_lr.s ztrmm_kernel_lr_nomacros.s
|
||||
else ifeq ($(CORE), SANDYBRIDGE)
|
||||
$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@
|
||||
endif
|
||||
@@ -1074,6 +1088,8 @@ ifeq ($(OS), AIX)
|
||||
m4 ztrmm_kernel_lc.s >ztrmm_kernel_lc_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN ztrmm_kernel_lc_nomacros.s -o $@
|
||||
rm ztrmm_kernel_lc.s ztrmm_kernel_lc_nomacros.s
|
||||
else ifeq ($(CORE), SANDYBRIDGE)
|
||||
$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@
|
||||
endif
|
||||
@@ -1084,6 +1100,8 @@ ifeq ($(OS), AIX)
|
||||
m4 ztrmm_kernel_rn.s > ztrmm_kernel_rn_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN ztrmm_kernel_rn_nomacros.s -o $@
|
||||
rm ztrmm_kernel_rn.s ztrmm_kernel_rn_nomacros.s
|
||||
else ifeq ($(CORE), SANDYBRIDGE)
|
||||
$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@
|
||||
endif
|
||||
@@ -1094,6 +1112,8 @@ ifeq ($(OS), AIX)
|
||||
m4 ztrmm_kernel_rt.s > ztrmm_kernel_rt_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN ztrmm_kernel_rt_nomacros.s -o $@
|
||||
rm ztrmm_kernel_rt.s ztrmm_kernel_rt_nomacros.s
|
||||
else ifeq ($(CORE), SANDYBRIDGE)
|
||||
$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@
|
||||
endif
|
||||
@@ -1104,6 +1124,8 @@ ifeq ($(OS), AIX)
|
||||
m4 ztrmm_kernel_rr.s > ztrmm_kernel_rr_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC ztrmm_kernel_rr_nomacros.s -o $@
|
||||
rm ztrmm_kernel_rr.s ztrmm_kernel_rr_nomacros.s
|
||||
else ifeq ($(CORE), SANDYBRIDGE)
|
||||
$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@
|
||||
endif
|
||||
@@ -1114,6 +1136,8 @@ ifeq ($(OS), AIX)
|
||||
m4 ztrmm_kernel_rc.s > ztrmm_kernel_rc_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC ztrmm_kernel_rc_nomacros.s -o $@
|
||||
rm ztrmm_kernel_rc.s ztrmm_kernel_rc_nomacros.s
|
||||
else ifeq ($(CORE), SANDYBRIDGE)
|
||||
$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
|
||||
endif
|
||||
@@ -1187,29 +1211,55 @@ $(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL)
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
|
||||
|
||||
$(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL)
|
||||
ifeq ($(CORE),SANDYBRIDGE)
|
||||
$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)ztrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL)
|
||||
ifeq ($(CORE),SANDYBRIDGE)
|
||||
$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@
|
||||
|
||||
endif
|
||||
$(KDIR)ztrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL)
|
||||
ifeq ($(CORE),SANDYBRIDGE)
|
||||
$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@
|
||||
|
||||
endif
|
||||
$(KDIR)ztrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL)
|
||||
ifeq ($(CORE),SANDYBRIDGE)
|
||||
$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@
|
||||
|
||||
endif
|
||||
$(KDIR)ztrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL)
|
||||
ifeq ($(CORE),SANDYBRIDGE)
|
||||
$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@
|
||||
|
||||
endif
|
||||
$(KDIR)ztrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL)
|
||||
ifeq ($(CORE),SANDYBRIDGE)
|
||||
$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@
|
||||
|
||||
endif
|
||||
$(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL)
|
||||
ifeq ($(CORE),SANDYBRIDGE)
|
||||
$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@
|
||||
|
||||
endif
|
||||
$(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL)
|
||||
ifeq ($(CORE),SANDYBRIDGE)
|
||||
$(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
|
||||
endif
|
||||
endif
|
||||
|
||||
|
||||
|
||||
|
||||
196
kernel/arm64/KERNEL.CORTEXA55
Normal file
196
kernel/arm64/KERNEL.CORTEXA55
Normal file
@@ -0,0 +1,196 @@
|
||||
SAMINKERNEL = ../arm/amin.c
|
||||
DAMINKERNEL = ../arm/amin.c
|
||||
CAMINKERNEL = ../arm/zamin.c
|
||||
ZAMINKERNEL = ../arm/zamin.c
|
||||
|
||||
SMAXKERNEL = ../arm/max.c
|
||||
DMAXKERNEL = ../arm/max.c
|
||||
|
||||
SMINKERNEL = ../arm/min.c
|
||||
DMINKERNEL = ../arm/min.c
|
||||
|
||||
ISAMINKERNEL = ../arm/iamin.c
|
||||
IDAMINKERNEL = ../arm/iamin.c
|
||||
ICAMINKERNEL = ../arm/izamin.c
|
||||
IZAMINKERNEL = ../arm/izamin.c
|
||||
|
||||
ISMAXKERNEL = ../arm/imax.c
|
||||
IDMAXKERNEL = ../arm/imax.c
|
||||
|
||||
ISMINKERNEL = ../arm/imin.c
|
||||
IDMINKERNEL = ../arm/imin.c
|
||||
|
||||
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
SAMAXKERNEL = amax.S
|
||||
DAMAXKERNEL = amax.S
|
||||
CAMAXKERNEL = zamax.S
|
||||
ZAMAXKERNEL = zamax.S
|
||||
|
||||
SAXPYKERNEL = axpy.S
|
||||
DAXPYKERNEL = axpy.S
|
||||
CAXPYKERNEL = zaxpy.S
|
||||
ZAXPYKERNEL = zaxpy.S
|
||||
|
||||
SROTKERNEL = rot.S
|
||||
DROTKERNEL = rot.S
|
||||
CROTKERNEL = zrot.S
|
||||
ZROTKERNEL = zrot.S
|
||||
|
||||
SSCALKERNEL = scal.S
|
||||
DSCALKERNEL = scal.S
|
||||
CSCALKERNEL = zscal.S
|
||||
ZSCALKERNEL = zscal.S
|
||||
|
||||
SGEMVNKERNEL = gemv_n.S
|
||||
DGEMVNKERNEL = gemv_n.S
|
||||
CGEMVNKERNEL = zgemv_n.S
|
||||
ZGEMVNKERNEL = zgemv_n.S
|
||||
|
||||
SGEMVTKERNEL = gemv_t.S
|
||||
DGEMVTKERNEL = gemv_t.S
|
||||
CGEMVTKERNEL = zgemv_t.S
|
||||
ZGEMVTKERNEL = zgemv_t.S
|
||||
|
||||
|
||||
SASUMKERNEL = asum.S
|
||||
DASUMKERNEL = asum.S
|
||||
CASUMKERNEL = casum.S
|
||||
ZASUMKERNEL = zasum.S
|
||||
|
||||
SCOPYKERNEL = copy.S
|
||||
DCOPYKERNEL = copy.S
|
||||
CCOPYKERNEL = copy.S
|
||||
ZCOPYKERNEL = copy.S
|
||||
|
||||
SSWAPKERNEL = swap.S
|
||||
DSWAPKERNEL = swap.S
|
||||
CSWAPKERNEL = swap.S
|
||||
ZSWAPKERNEL = swap.S
|
||||
|
||||
ISAMAXKERNEL = iamax.S
|
||||
IDAMAXKERNEL = iamax.S
|
||||
ICAMAXKERNEL = izamax.S
|
||||
IZAMAXKERNEL = izamax.S
|
||||
|
||||
SNRM2KERNEL = nrm2.S
|
||||
DNRM2KERNEL = nrm2.S
|
||||
CNRM2KERNEL = znrm2.S
|
||||
ZNRM2KERNEL = znrm2.S
|
||||
|
||||
ifneq ($(C_COMPILER), PGI)
|
||||
SDOTKERNEL = ../generic/dot.c
|
||||
else
|
||||
SDOTKERNEL = dot.S
|
||||
endif
|
||||
DDOTKERNEL = dot.S
|
||||
ifneq ($(C_COMPILER), PGI)
|
||||
CDOTKERNEL = zdot.S
|
||||
ZDOTKERNEL = zdot.S
|
||||
else
|
||||
CDOTKERNEL = ../arm/zdot.c
|
||||
ZDOTKERNEL = ../arm/zdot.c
|
||||
endif
|
||||
DSDOTKERNEL = dot.S
|
||||
|
||||
DGEMM_BETA = dgemm_beta.S
|
||||
SGEMM_BETA = sgemm_beta.S
|
||||
|
||||
ifeq ($(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N), 8x8)
|
||||
SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_cortexa53.S
|
||||
STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_cortexa53.S
|
||||
else
|
||||
SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
|
||||
STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
|
||||
endif
|
||||
ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
|
||||
ifeq ($(SGEMM_UNROLL_M), 16)
|
||||
SGEMMITCOPY = sgemm_tcopy_$(SGEMM_UNROLL_M).S
|
||||
else
|
||||
SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
|
||||
endif
|
||||
ifeq ($(SGEMM_UNROLL_M), 4)
|
||||
SGEMMINCOPY = sgemm_ncopy_$(SGEMM_UNROLL_M).S
|
||||
else
|
||||
SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
|
||||
endif
|
||||
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
|
||||
SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S
|
||||
SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S
|
||||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
|
||||
DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
|
||||
|
||||
ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N))
|
||||
|
||||
ifeq ($(DGEMM_UNROLL_M), 8)
|
||||
DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S
|
||||
DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S
|
||||
else
|
||||
DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c
|
||||
DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c
|
||||
endif
|
||||
|
||||
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
|
||||
ifeq ($(DGEMM_UNROLL_N), 4)
|
||||
DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S
|
||||
DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S
|
||||
else
|
||||
DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c
|
||||
DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c
|
||||
endif
|
||||
|
||||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
|
||||
CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
|
||||
ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
|
||||
CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c
|
||||
CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c
|
||||
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c
|
||||
CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c
|
||||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
|
||||
ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
|
||||
ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N))
|
||||
ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c
|
||||
ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c
|
||||
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
|
||||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
|
||||
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
@@ -321,7 +321,7 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x,
|
||||
: "cc",
|
||||
"memory",
|
||||
"x0", "x1", "x2", "x3", "x4", "x5", "x6",
|
||||
"d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8"
|
||||
"d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", REGINF
|
||||
);
|
||||
|
||||
}
|
||||
|
||||
@@ -270,11 +270,6 @@ All rights reserved.
|
||||
ldr s1, [A02]
|
||||
ldr s2, [A03]
|
||||
ldr s3, [A04]
|
||||
|
||||
add A01, A01, #4
|
||||
add A02, A02, #4
|
||||
add A03, A03, #4
|
||||
add A04, A04, #4
|
||||
|
||||
stp s0, s1, [B04]
|
||||
add B04, B04, #8
|
||||
@@ -285,11 +280,6 @@ All rights reserved.
|
||||
ldr s5, [A06]
|
||||
ldr s6, [A07]
|
||||
ldr s7, [A08]
|
||||
|
||||
ldr d4, [A05], #8
|
||||
ldr d5, [A06], #8
|
||||
ldr d6, [A07], #8
|
||||
ldr d7, [A08], #8
|
||||
|
||||
stp s4, s5, [B04]
|
||||
add B04, B04, #8
|
||||
|
||||
236
kernel/loongarch64/KERNEL
Normal file
236
kernel/loongarch64/KERNEL
Normal file
@@ -0,0 +1,236 @@
|
||||
ifndef SAXPYKERNEL
|
||||
SAXPYKERNEL = ../arm/axpy.c
|
||||
endif
|
||||
|
||||
ifndef DAXPYKERNEL
|
||||
DAXPYKERNEL = ../arm/axpy.c
|
||||
endif
|
||||
|
||||
ifndef CAXPYKERNEL
|
||||
CAXPYKERNEL = ../arm/zaxpy.c
|
||||
endif
|
||||
|
||||
ifndef ZAXPYKERNEL
|
||||
ZAXPYKERNEL = ../arm/zaxpy.c
|
||||
endif
|
||||
|
||||
ifndef SROTKERNEL
|
||||
SROTKERNEL = ../arm/rot.c
|
||||
endif
|
||||
|
||||
ifndef DROTKERNEL
|
||||
DROTKERNEL = ../arm/rot.c
|
||||
endif
|
||||
|
||||
ifndef CROTKERNEL
|
||||
CROTKERNEL = ../arm/zrot.c
|
||||
endif
|
||||
|
||||
ifndef ZROTKERNEL
|
||||
ZROTKERNEL = ../arm/zrot.c
|
||||
endif
|
||||
|
||||
ifndef CSWAPKERNEL
|
||||
CSWAPKERNEL = ../arm/zswap.c
|
||||
endif
|
||||
|
||||
ifndef ZSWAPKERNEL
|
||||
ZSWAPKERNEL = ../arm/zswap.c
|
||||
endif
|
||||
|
||||
ifndef SSUMKERNEL
|
||||
SSUMKERNEL = ../arm/sum.c
|
||||
endif
|
||||
|
||||
ifndef DSUMKERNEL
|
||||
DSUMKERNEL = ../arm/sum.c
|
||||
endif
|
||||
|
||||
ifndef CSUMKERNEL
|
||||
CSUMKERNEL = ../arm/zsum.c
|
||||
endif
|
||||
|
||||
ifndef ZSUMKERNEL
|
||||
ZSUMKERNEL = ../arm/zsum.c
|
||||
endif
|
||||
|
||||
ifndef ISMAXKERNEL
|
||||
ISMAXKERNEL = ../arm/imax.c
|
||||
endif
|
||||
|
||||
ifndef IDMAXKERNEL
|
||||
IDMAXKERNEL = ../arm/imax.c
|
||||
endif
|
||||
|
||||
ifndef ISMINKERNEL
|
||||
ISMINKERNEL = ../arm/imin.c
|
||||
endif
|
||||
|
||||
ifndef IDMINKERNEL
|
||||
IDMINKERNEL = ../arm/imin.c
|
||||
endif
|
||||
|
||||
ifndef SNRM2KERNEL
|
||||
SNRM2KERNEL = snrm2.S
|
||||
endif
|
||||
|
||||
ifndef DNRM2KERNEL
|
||||
DNRM2KERNEL = dnrm2.S
|
||||
endif
|
||||
|
||||
ifndef CNRM2KERNEL
|
||||
CNRM2KERNEL = cnrm2.S
|
||||
endif
|
||||
|
||||
ifndef ZNRM2KERNEL
|
||||
ZNRM2KERNEL = znrm2.S
|
||||
endif
|
||||
|
||||
ifndef SCABS_KERNEL
|
||||
SCABS_KERNEL = ../generic/cabs.c
|
||||
endif
|
||||
|
||||
ifndef DCABS_KERNEL
|
||||
DCABS_KERNEL = ../generic/cabs.c
|
||||
endif
|
||||
|
||||
ifndef QCABS_KERNEL
|
||||
QCABS_KERNEL = ../generic/cabs.c
|
||||
endif
|
||||
|
||||
ifndef LSAME_KERNEL
|
||||
LSAME_KERNEL = ../generic/lsame.c
|
||||
endif
|
||||
|
||||
ifndef SGEMMKERNEL
|
||||
SGEMMKERNEL = gemm_kernel.S
|
||||
SGEMMINCOPY = ../generic/gemm_ncopy_2.c
|
||||
SGEMMITCOPY = ../generic/gemm_tcopy_2.c
|
||||
SGEMMONCOPY = ../generic/gemm_ncopy_8.c
|
||||
SGEMMOTCOPY = ../generic/gemm_tcopy_8.c
|
||||
SGEMMINCOPYOBJ = sgemm_incopy.o
|
||||
SGEMMITCOPYOBJ = sgemm_itcopy.o
|
||||
SGEMMONCOPYOBJ = sgemm_oncopy.o
|
||||
SGEMMOTCOPYOBJ = sgemm_otcopy.o
|
||||
endif
|
||||
|
||||
ifndef DGEMMKERNEL
|
||||
DGEMMKERNEL = gemm_kernel.S
|
||||
DGEMMINCOPY = ../generic/gemm_ncopy_2.c
|
||||
DGEMMITCOPY = ../generic/gemm_tcopy_2.c
|
||||
DGEMMONCOPY = ../generic/gemm_ncopy_8.c
|
||||
DGEMMOTCOPY = ../generic/gemm_tcopy_8.c
|
||||
DGEMMINCOPYOBJ = dgemm_incopy.o
|
||||
DGEMMITCOPYOBJ = dgemm_itcopy.o
|
||||
DGEMMONCOPYOBJ = dgemm_oncopy.o
|
||||
DGEMMOTCOPYOBJ = dgemm_otcopy.o
|
||||
endif
|
||||
|
||||
ifndef CGEMMKERNEL
|
||||
CGEMMKERNEL = zgemm_kernel.S
|
||||
CGEMMINCOPY = ../generic/zgemm_ncopy_1.c
|
||||
CGEMMITCOPY = ../generic/zgemm_tcopy_1.c
|
||||
CGEMMONCOPY = ../generic/zgemm_ncopy_4.c
|
||||
CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c
|
||||
CGEMMINCOPYOBJ = cgemm_incopy.o
|
||||
CGEMMITCOPYOBJ = cgemm_itcopy.o
|
||||
CGEMMONCOPYOBJ = cgemm_oncopy.o
|
||||
CGEMMOTCOPYOBJ = cgemm_otcopy.o
|
||||
endif
|
||||
|
||||
ifndef ZGEMMKERNEL
|
||||
ZGEMMKERNEL = zgemm_kernel.S
|
||||
ZGEMMINCOPY = ../generic/zgemm_ncopy_1.c
|
||||
ZGEMMITCOPY = ../generic/zgemm_tcopy_1.c
|
||||
ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c
|
||||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c
|
||||
ZGEMMINCOPYOBJ = zgemm_incopy.o
|
||||
ZGEMMITCOPYOBJ = zgemm_itcopy.o
|
||||
ZGEMMONCOPYOBJ = zgemm_oncopy.o
|
||||
ZGEMMOTCOPYOBJ = zgemm_otcopy.o
|
||||
endif
|
||||
|
||||
ifndef SGEMM_BETA
|
||||
SGEMM_BETA = ../generic/gemm_beta.c
|
||||
endif
|
||||
ifndef DGEMM_BETA
|
||||
DGEMM_BETA = ../generic/gemm_beta.c
|
||||
endif
|
||||
ifndef CGEMM_BETA
|
||||
CGEMM_BETA = ../generic/zgemm_beta.c
|
||||
endif
|
||||
ifndef ZGEMM_BETA
|
||||
ZGEMM_BETA = ../generic/zgemm_beta.c
|
||||
endif
|
||||
|
||||
ifndef STRSMKERNEL_LN
|
||||
STRSMKERNEL_LN = trsm_kernel_LN.S
|
||||
endif
|
||||
|
||||
ifndef STRSMKERNEL_LT
|
||||
STRSMKERNEL_LT = trsm_kernel_LT.S
|
||||
endif
|
||||
|
||||
ifndef STRSMKERNEL_RN
|
||||
STRSMKERNEL_RN = trsm_kernel_LT.S
|
||||
endif
|
||||
|
||||
ifndef STRSMKERNEL_RT
|
||||
STRSMKERNEL_RT = trsm_kernel_RT.S
|
||||
endif
|
||||
|
||||
ifndef DTRSMKERNEL_LN
|
||||
DTRSMKERNEL_LN = trsm_kernel_LN.S
|
||||
endif
|
||||
|
||||
ifndef DTRSMKERNEL_LT
|
||||
DTRSMKERNEL_LT = trsm_kernel_LT.S
|
||||
endif
|
||||
|
||||
ifndef DTRSMKERNEL_RN
|
||||
DTRSMKERNEL_RN = trsm_kernel_LT.S
|
||||
endif
|
||||
|
||||
ifndef DTRSMKERNEL_RT
|
||||
DTRSMKERNEL_RT = trsm_kernel_RT.S
|
||||
endif
|
||||
|
||||
ifndef CTRSMKERNEL_LN
|
||||
CTRSMKERNEL_LN = ztrsm_kernel_LT.S
|
||||
endif
|
||||
|
||||
ifndef CTRSMKERNEL_LT
|
||||
CTRSMKERNEL_LT = ztrsm_kernel_LT.S
|
||||
endif
|
||||
|
||||
ifndef CTRSMKERNEL_RN
|
||||
CTRSMKERNEL_RN = ztrsm_kernel_LT.S
|
||||
endif
|
||||
|
||||
ifndef CTRSMKERNEL_RT
|
||||
CTRSMKERNEL_RT = ztrsm_kernel_RT.S
|
||||
endif
|
||||
|
||||
ifndef ZTRSMKERNEL_LN
|
||||
ZTRSMKERNEL_LN = ztrsm_kernel_LT.S
|
||||
endif
|
||||
|
||||
ifndef ZTRSMKERNEL_LT
|
||||
ZTRSMKERNEL_LT = ztrsm_kernel_LT.S
|
||||
endif
|
||||
|
||||
ifndef ZTRSMKERNEL_RN
|
||||
ZTRSMKERNEL_RN = ztrsm_kernel_LT.S
|
||||
endif
|
||||
|
||||
ifndef ZTRSMKERNEL_RT
|
||||
ZTRSMKERNEL_RT = ztrsm_kernel_RT.S
|
||||
endif
|
||||
|
||||
ifndef CGEMM3MKERNEL
|
||||
CGEMM3MKERNEL = zgemm3m_kernel.S
|
||||
endif
|
||||
|
||||
ifndef ZGEMM3MKERNEL
|
||||
ZGEMM3MKERNEL = zgemm3m_kernel.S
|
||||
endif
|
||||
1
kernel/loongarch64/KERNEL.LOONGSON3R5
Normal file
1
kernel/loongarch64/KERNEL.LOONGSON3R5
Normal file
@@ -0,0 +1 @@
|
||||
#TODO: Add loongarch64 SIMD optimizations
|
||||
167
kernel/loongarch64/KERNEL.generic
Normal file
167
kernel/loongarch64/KERNEL.generic
Normal file
@@ -0,0 +1,167 @@
|
||||
# BETA scaling and TRMM kernels, all taken from ../generic. Real precisions
# (S, D) share one source per routine; complex precisions (C, Z) share the
# z-prefixed source. The assignments are unconditional (no ifndef guard).
SGEMM_BETA = ../generic/gemm_beta.c
|
||||
DGEMM_BETA = ../generic/gemm_beta.c
|
||||
CGEMM_BETA = ../generic/zgemm_beta.c
|
||||
ZGEMM_BETA = ../generic/zgemm_beta.c
|
||||
|
||||
STRMMKERNEL = ../generic/trmmkernel_2x2.c
|
||||
DTRMMKERNEL = ../generic/trmmkernel_2x2.c
|
||||
CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
|
||||
ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
|
||||
|
||||
# GEMM per precision: a 2x2 generic C kernel plus the "ncopy"/"tcopy" sources
# with a _2 suffix. The *COPYOBJ variables name the object files for those
# copy routines (presumably consumed by the kernel build rules -- confirm).
SGEMMKERNEL = ../generic/gemmkernel_2x2.c
|
||||
SGEMMONCOPY = ../generic/gemm_ncopy_2.c
|
||||
SGEMMOTCOPY = ../generic/gemm_tcopy_2.c
|
||||
SGEMMONCOPYOBJ = sgemm_oncopy.o
|
||||
SGEMMOTCOPYOBJ = sgemm_otcopy.o
|
||||
|
||||
DGEMMKERNEL = ../generic/gemmkernel_2x2.c
|
||||
DGEMMONCOPY = ../generic/gemm_ncopy_2.c
|
||||
DGEMMOTCOPY = ../generic/gemm_tcopy_2.c
|
||||
DGEMMONCOPYOBJ = dgemm_oncopy.o
|
||||
DGEMMOTCOPYOBJ = dgemm_otcopy.o
|
||||
|
||||
# Complex precisions use the z-prefixed generic sources.
CGEMMKERNEL = ../generic/zgemmkernel_2x2.c
|
||||
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
||||
CGEMMONCOPYOBJ = cgemm_oncopy.o
|
||||
CGEMMOTCOPYOBJ = cgemm_otcopy.o
|
||||
|
||||
ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c
|
||||
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
||||
ZGEMMONCOPYOBJ = zgemm_oncopy.o
|
||||
ZGEMMOTCOPYOBJ = zgemm_otcopy.o
|
||||
|
||||
# TRSM kernels: one generic C source per variant (LN, LT, RN, RT). All four
# precisions, including complex C and Z, point at the same trsm_kernel_*.c
# files; there is no z-prefixed source here.
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
# Pure C implementations for the remaining kernels.
# Reductions below are taken from the ../arm directory. Real precisions share
# one source; complex precisions share the z-prefixed (or iz-prefixed) source.
# The plain MAX/MIN and IMAX/IMIN groups define S and D entries only.
SAMAXKERNEL = ../arm/amax.c
|
||||
DAMAXKERNEL = ../arm/amax.c
|
||||
CAMAXKERNEL = ../arm/zamax.c
|
||||
ZAMAXKERNEL = ../arm/zamax.c
|
||||
|
||||
SAMINKERNEL = ../arm/amin.c
|
||||
DAMINKERNEL = ../arm/amin.c
|
||||
CAMINKERNEL = ../arm/zamin.c
|
||||
ZAMINKERNEL = ../arm/zamin.c
|
||||
|
||||
SMAXKERNEL = ../arm/max.c
|
||||
DMAXKERNEL = ../arm/max.c
|
||||
|
||||
SMINKERNEL = ../arm/min.c
|
||||
DMINKERNEL = ../arm/min.c
|
||||
|
||||
ISAMAXKERNEL = ../arm/iamax.c
|
||||
IDAMAXKERNEL = ../arm/iamax.c
|
||||
ICAMAXKERNEL = ../arm/izamax.c
|
||||
IZAMAXKERNEL = ../arm/izamax.c
|
||||
|
||||
ISAMINKERNEL = ../arm/iamin.c
|
||||
IDAMINKERNEL = ../arm/iamin.c
|
||||
ICAMINKERNEL = ../arm/izamin.c
|
||||
IZAMINKERNEL = ../arm/izamin.c
|
||||
|
||||
ISMAXKERNEL = ../arm/imax.c
|
||||
IDMAXKERNEL = ../arm/imax.c
|
||||
|
||||
ISMINKERNEL = ../arm/imin.c
|
||||
IDMINKERNEL = ../arm/imin.c
|
||||
|
||||
SASUMKERNEL = ../arm/asum.c
|
||||
DASUMKERNEL = ../arm/asum.c
|
||||
CASUMKERNEL = ../arm/zasum.c
|
||||
ZASUMKERNEL = ../arm/zasum.c
|
||||
|
||||
SSUMKERNEL = ../arm/sum.c
|
||||
DSUMKERNEL = ../arm/sum.c
|
||||
CSUMKERNEL = ../arm/zsum.c
|
||||
ZSUMKERNEL = ../arm/zsum.c
|
||||
|
||||
|
||||
# Level-1 vector kernels (AXPY, COPY, DOT, NRM2, ROT, SCAL, SWAP), taken from
# the ../arm C sources with one exception noted at the DOT group.
SAXPYKERNEL = ../arm/axpy.c
|
||||
DAXPYKERNEL = ../arm/axpy.c
|
||||
CAXPYKERNEL = ../arm/zaxpy.c
|
||||
ZAXPYKERNEL = ../arm/zaxpy.c
|
||||
|
||||
SCOPYKERNEL = ../arm/copy.c
|
||||
DCOPYKERNEL = ../arm/copy.c
|
||||
CCOPYKERNEL = ../arm/zcopy.c
|
||||
ZCOPYKERNEL = ../arm/zcopy.c
|
||||
|
||||
# NOTE(review): SDOT is the only entry in this group taken from ../generic
# rather than ../arm -- presumably deliberate; confirm before unifying.
SDOTKERNEL = ../generic/dot.c
|
||||
DDOTKERNEL = ../arm/dot.c
|
||||
CDOTKERNEL = ../arm/zdot.c
|
||||
ZDOTKERNEL = ../arm/zdot.c
|
||||
|
||||
SNRM2KERNEL = ../arm/nrm2.c
|
||||
DNRM2KERNEL = ../arm/nrm2.c
|
||||
CNRM2KERNEL = ../arm/znrm2.c
|
||||
ZNRM2KERNEL = ../arm/znrm2.c
|
||||
|
||||
SROTKERNEL = ../arm/rot.c
|
||||
DROTKERNEL = ../arm/rot.c
|
||||
CROTKERNEL = ../arm/zrot.c
|
||||
ZROTKERNEL = ../arm/zrot.c
|
||||
|
||||
SSCALKERNEL = ../arm/scal.c
|
||||
DSCALKERNEL = ../arm/scal.c
|
||||
CSCALKERNEL = ../arm/zscal.c
|
||||
ZSCALKERNEL = ../arm/zscal.c
|
||||
|
||||
SSWAPKERNEL = ../arm/swap.c
|
||||
DSWAPKERNEL = ../arm/swap.c
|
||||
CSWAPKERNEL = ../arm/zswap.c
|
||||
ZSWAPKERNEL = ../arm/zswap.c
|
||||
|
||||
# Level-2 kernels and small helpers. GEMV (N and T variants) comes from
# ../arm; SYMV, HEMV, LSAME and CABS come from ../generic.
SGEMVNKERNEL = ../arm/gemv_n.c
|
||||
DGEMVNKERNEL = ../arm/gemv_n.c
|
||||
CGEMVNKERNEL = ../arm/zgemv_n.c
|
||||
ZGEMVNKERNEL = ../arm/zgemv_n.c
|
||||
|
||||
SGEMVTKERNEL = ../arm/gemv_t.c
|
||||
DGEMVTKERNEL = ../arm/gemv_t.c
|
||||
CGEMVTKERNEL = ../arm/zgemv_t.c
|
||||
ZGEMVTKERNEL = ../arm/zgemv_t.c
|
||||
|
||||
# SYMV: the U and L variants of every precision use the same source
# (symv_k.c for S/D/Q, zsymv_k.c for C/Z/X).
SSYMV_U_KERNEL = ../generic/symv_k.c
|
||||
SSYMV_L_KERNEL = ../generic/symv_k.c
|
||||
DSYMV_U_KERNEL = ../generic/symv_k.c
|
||||
DSYMV_L_KERNEL = ../generic/symv_k.c
|
||||
QSYMV_U_KERNEL = ../generic/symv_k.c
|
||||
QSYMV_L_KERNEL = ../generic/symv_k.c
|
||||
CSYMV_U_KERNEL = ../generic/zsymv_k.c
|
||||
CSYMV_L_KERNEL = ../generic/zsymv_k.c
|
||||
ZSYMV_U_KERNEL = ../generic/zsymv_k.c
|
||||
ZSYMV_L_KERNEL = ../generic/zsymv_k.c
|
||||
XSYMV_U_KERNEL = ../generic/zsymv_k.c
|
||||
XSYMV_L_KERNEL = ../generic/zsymv_k.c
|
||||
|
||||
# Only the Z-precision HEMV kernels are set here.
ZHEMV_U_KERNEL = ../generic/zhemv_k.c
|
||||
ZHEMV_L_KERNEL = ../generic/zhemv_k.c
|
||||
|
||||
LSAME_KERNEL = ../generic/lsame.c
|
||||
SCABS_KERNEL = ../generic/cabs.c
|
||||
DCABS_KERNEL = ../generic/cabs.c
|
||||
QCABS_KERNEL = ../generic/cabs.c
|
||||
|
||||
# GEMM3M kernels: both complex precisions use the "dump" source from
# ../generic (NOTE(review): presumably a placeholder/stub implementation --
# confirm against zgemm3mkernel_dump.c).
CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c
|
||||
ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c
|
||||
1
kernel/loongarch64/Makefile
Normal file
1
kernel/loongarch64/Makefile
Normal file
@@ -0,0 +1 @@
|
||||
clean ::
|
||||
230
kernel/loongarch64/amax.S
Normal file
230
kernel/loongarch64/amax.S
Normal file
@@ -0,0 +1,230 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2021, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
||||
/* Integer registers: N is compared against zero and decremented, X is the
   load base pointer, INCX is added to X after every load. */
#define N $r4
|
||||
#define X $r5
|
||||
#define INCX $r6
|
||||
|
||||
/* I is the loop counter. TEMP is defined but never referenced in this routine. */
#define I $r17
|
||||
#define TEMP $r18
|
||||
|
||||
/* a1..a8: staging registers for the eight elements of one unrolled group. */
#define a1 $f10
|
||||
#define a2 $f11
|
||||
#define a3 $f12
|
||||
#define a4 $f13
|
||||
#define a5 $f14
|
||||
#define a6 $f15
|
||||
#define a7 $f16
|
||||
#define a8 $f17
|
||||
|
||||
/* t1..t4: FABS results for the elements currently being compared. */
#define t1 $f0
|
||||
#define t2 $f1
|
||||
#define t3 $f2
|
||||
#define t4 $f3
|
||||
|
||||
/* s1..s4: four independent running results, merged into s1 at .L998. */
#define s1 $f22
|
||||
#define s2 $f8
|
||||
#define s3 $f23
|
||||
#define s4 $f9
|
||||
|
||||
/* Entry: validates N and INCX, seeds the four running maxima with |x[0]| and
   preloads the first group of eight elements.
   LDINT, MTC, LD, FABS, BASE_SHIFT and SIZE come from common.h; the notes
   below assume their names describe them (integer load, GPR->FPR move, FP
   load, absolute value, log2 of the element size, element size) -- confirm there. */
PROLOGUE
|
||||
|
||||
#ifdef F_INTERFACE
|
||||
/* With F_INTERFACE, N and INCX hold addresses and are replaced by the values loaded from them. */
LDINT N, 0(N)
|
||||
LDINT INCX, 0(INCX)
|
||||
#endif
|
||||
|
||||
/* Default result s1 = 0 (from $r0); taken as-is when N <= 0 or INCX <= 0. */
MTC s1, $r0
|
||||
bge $r0, N, .L999
|
||||
|
||||
/* INCX <<= BASE_SHIFT (presumably element stride -> byte stride); exit if the result is <= 0. */
slli.d INCX, INCX, BASE_SHIFT
|
||||
bge $r0, INCX, .L999
|
||||
|
||||
/* Consume the first element: N now counts the elements still to be read. */
LD a1, X, 0 * SIZE
|
||||
addi.d N, N, -1
|
||||
|
||||
add.d X, X, INCX
|
||||
FABS s1, a1
|
||||
|
||||
FABS s2, a1
|
||||
/* Nothing left after the first element: s1 already holds the answer. */
bge $r0, N, .L999
|
||||
|
||||
/* I = N / 8 = number of full 8-element groups; none -> remainder loop at .L15. */
FABS s3, a1
|
||||
srai.d I, N, 3
|
||||
|
||||
FABS s4, a1
|
||||
bge $r0, I, .L15
|
||||
|
||||
/* Preload the first group a1..a8, advancing X by INCX after each load. */
LD a1, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a2, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a3, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a4, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a5, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a6, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a7, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a8, X, 0 * SIZE
|
||||
addi.d I, I, -1
|
||||
|
||||
add.d X, X, INCX
|
||||
/* Only one group in total: skip the steady-state loop and drain at .L13. */
bge $r0, I, .L13
|
||||
.align 3
|
||||
|
||||
/* Steady-state loop: folds the eight elements loaded on the previous pass into
   s1..s4 while loading the next eight. Loads and pointer bumps are interleaved
   with the arithmetic, so statement order matters.
   Assumes CMPLT fcc, a, b sets fcc when a < b and CMOVT d, x, y, fcc selects
   y when fcc is set (common.h macros -- confirm). */
.L12:
|
||||
/* First half: |a1..a4| against s1..s4; a1..a4 are reloaded for the next pass. */
FABS t1, a1
|
||||
LD a1, X, 0 * SIZE
|
||||
FABS t2, a2
|
||||
add.d X, X, INCX
|
||||
|
||||
FABS t3, a3
|
||||
LD a2, X, 0 * SIZE
|
||||
FABS t4, a4
|
||||
add.d X, X, INCX
|
||||
|
||||
CMPLT $fcc0, s1, t1
|
||||
LD a3, X, 0 * SIZE
|
||||
CMPLT $fcc1, s2, t2
|
||||
add.d X, X, INCX
|
||||
|
||||
CMPLT $fcc2, s3, t3
|
||||
LD a4, X, 0 * SIZE
|
||||
CMPLT $fcc3, s4, t4
|
||||
add.d X, X, INCX
|
||||
|
||||
CMOVT s1, s1, t1, $fcc0
|
||||
CMOVT s2, s2, t2, $fcc1
|
||||
CMOVT s3, s3, t3, $fcc2
|
||||
CMOVT s4, s4, t4, $fcc3
|
||||
|
||||
/* Second half: |a5..a8| against s1..s4; a5..a8 are reloaded for the next pass. */
FABS t1, a5
|
||||
LD a5, X, 0 * SIZE
|
||||
FABS t2, a6
|
||||
add.d X, X, INCX
|
||||
|
||||
FABS t3, a7
|
||||
LD a6, X, 0 * SIZE
|
||||
FABS t4, a8
|
||||
add.d X, X, INCX
|
||||
|
||||
CMPLT $fcc0, s1, t1
|
||||
LD a7, X, 0 * SIZE
|
||||
CMPLT $fcc1, s2, t2
|
||||
add.d X, X, INCX
|
||||
|
||||
CMPLT $fcc2, s3, t3
|
||||
LD a8, X, 0 * SIZE
|
||||
CMPLT $fcc3, s4, t4
|
||||
add.d X, X, INCX
|
||||
|
||||
CMOVT s1, s1, t1, $fcc0
|
||||
addi.d I, I, -1
|
||||
|
||||
CMOVT s2, s2, t2, $fcc1
|
||||
CMOVT s3, s3, t3, $fcc2
|
||||
|
||||
CMOVT s4, s4, t4, $fcc3
|
||||
/* Loop while groups remain (I > 0). */
blt $r0, I, .L12
|
||||
.align 3
|
||||
|
||||
/* Drain: folds the last preloaded group a1..a8 into s1..s4 without issuing
   any further loads. */
.L13:
|
||||
FABS t1, a1
|
||||
FABS t2, a2
|
||||
FABS t3, a3
|
||||
FABS t4, a4
|
||||
|
||||
CMPLT $fcc0, s1, t1
|
||||
CMPLT $fcc1, s2, t2
|
||||
CMPLT $fcc2, s3, t3
|
||||
CMPLT $fcc3, s4, t4
|
||||
|
||||
CMOVT s1, s1, t1, $fcc0
|
||||
CMOVT s2, s2, t2, $fcc1
|
||||
CMOVT s3, s3, t3, $fcc2
|
||||
CMOVT s4, s4, t4, $fcc3
|
||||
|
||||
/* Same again for a5..a8. */
FABS t1, a5
|
||||
FABS t2, a6
|
||||
FABS t3, a7
|
||||
FABS t4, a8
|
||||
|
||||
CMPLT $fcc0, s1, t1
|
||||
CMPLT $fcc1, s2, t2
|
||||
CMPLT $fcc2, s3, t3
|
||||
CMPLT $fcc3, s4, t4
|
||||
|
||||
CMOVT s1, s1, t1, $fcc0
|
||||
CMOVT s2, s2, t2, $fcc1
|
||||
CMOVT s3, s3, t3, $fcc2
|
||||
CMOVT s4, s4, t4, $fcc3
|
||||
.align 3
|
||||
|
||||
/* Remainder: I = N & 7 elements are left after the 8-wide groups; none -> .L998. */
.L15:
|
||||
andi I, N, 7
|
||||
|
||||
bge $r0, I, .L998
|
||||
.align 3
|
||||
|
||||
/* One element per iteration, folded into s1 only. */
.L16:
|
||||
LD a1, X, 0 * SIZE
|
||||
addi.d I, I, -1
|
||||
|
||||
FABS t1, a1
|
||||
|
||||
CMPLT $fcc0, s1, t1
|
||||
|
||||
CMOVT s1, s1, t1, $fcc0
|
||||
|
||||
add.d X, X, INCX
|
||||
blt $r0, I, .L16
|
||||
.align 3
|
||||
|
||||
/* Final reduction: merge the four partial results pairwise (s1 with s2, s3
   with s4, then s1 with s3) so that s1 holds the overall result. */
.L998:
|
||||
CMPLT $fcc0, s1, s2
|
||||
CMPLT $fcc1, s3, s4
|
||||
|
||||
CMOVT s1, s1, s2, $fcc0
|
||||
CMOVT s3, s3, s4, $fcc1
|
||||
|
||||
CMPLT $fcc0, s1, s3
|
||||
CMOVT s1, s1, s3, $fcc0
|
||||
.align 3
|
||||
|
||||
/* Exit: copy s1 ($f22) into $f0 and return through $r1. */
.L999:
|
||||
/* NOTE(review): copies the loop counter I ($r17) into $r4; the purpose is not evident from this file. */
move $r4, $r17
|
||||
fmov.d $f0, $f22
|
||||
jirl $r0, $r1, 0x0
|
||||
|
||||
EPILOGUE
|
||||
186
kernel/loongarch64/amin.S
Normal file
186
kernel/loongarch64/amin.S
Normal file
@@ -0,0 +1,186 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2021, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
/* Integer registers: N is compared against zero and decremented, X is the
   load base pointer, INCX is added to X after every load. */
#define N $r4
|
||||
#define X $r5
|
||||
#define INCX $r6
|
||||
/* I is the loop counter. TEMP is defined but never referenced in this routine. */
#define I $r17
|
||||
#define TEMP $r18
|
||||
/* a1..a8: staging registers for the eight elements of one unrolled group. */
#define a1 $f10
|
||||
#define a2 $f11
|
||||
#define a3 $f12
|
||||
#define a4 $f13
|
||||
#define a5 $f14
|
||||
#define a6 $f15
|
||||
#define a7 $f16
|
||||
#define a8 $f17
|
||||
/* t1..t4: FABS results for the elements currently being compared. */
#define t1 $f0
|
||||
#define t2 $f1
|
||||
#define t3 $f2
|
||||
#define t4 $f3
|
||||
/* s1..s4: four independent running results, merged into s1 at .L998. */
#define s1 $f22
|
||||
#define s2 $f8
|
||||
#define s3 $f23
|
||||
#define s4 $f9
|
||||
|
||||
/* Entry: validates N and INCX, seeds the four running minima with |x[0]| and
   preloads the first group of eight elements.
   LDINT, MTC, LD, FABS, BASE_SHIFT and SIZE come from common.h; the notes
   below assume their names describe them (integer load, GPR->FPR move, FP
   load, absolute value, log2 of the element size, element size) -- confirm there. */
PROLOGUE
|
||||
#ifdef F_INTERFACE
|
||||
/* With F_INTERFACE, N and INCX hold addresses and are replaced by the values loaded from them. */
LDINT N, 0(N)
|
||||
LDINT INCX, 0(INCX)
|
||||
#endif
|
||||
/* Default result s1 = 0 (from $r0); taken as-is when N <= 0 or INCX <= 0. */
MTC s1, $r0
|
||||
bge $r0, N, .L999
|
||||
/* INCX <<= BASE_SHIFT (presumably element stride -> byte stride); exit if the result is <= 0. */
slli.d INCX, INCX, BASE_SHIFT
|
||||
bge $r0, INCX, .L999
|
||||
/* Consume the first element: N now counts the elements still to be read. */
LD a1, X, 0 * SIZE
|
||||
addi.d N, N, -1
|
||||
add.d X, X, INCX
|
||||
FABS s1, a1
|
||||
FABS s2, a1
|
||||
/* Nothing left after the first element: s1 already holds the answer. */
bge $r0, N, .L999
|
||||
/* I = N / 8 = number of full 8-element groups; none -> remainder loop at .L15. */
FABS s3, a1
|
||||
srai.d I, N, 3
|
||||
FABS s4, a1
|
||||
bge $r0, I, .L15
|
||||
/* Preload the first group a1..a8, advancing X by INCX after each load. */
LD a1, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a2, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a3, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a4, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a5, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a6, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a7, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a8, X, 0 * SIZE
|
||||
addi.d I, I, -1
|
||||
add.d X, X, INCX
|
||||
/* Only one group in total: skip the steady-state loop and drain at .L13. */
bge $r0, I, .L13
|
||||
.align 3
|
||||
/* Steady-state loop: folds the eight elements loaded on the previous pass into
   s1..s4 while loading the next eight. Loads and pointer bumps are interleaved
   with the arithmetic, so statement order matters.
   Assumes CMPLT fcc, a, b sets fcc when a < b and CMOVT d, x, y, fcc selects
   y when fcc is set (common.h macros -- confirm); here the new value t is
   taken when t < s. */
.L12:
|
||||
/* First half: |a1..a4| against s1..s4; a1..a4 are reloaded for the next pass. */
FABS t1, a1
|
||||
LD a1, X, 0 * SIZE
|
||||
FABS t2, a2
|
||||
add.d X, X, INCX
|
||||
FABS t3, a3
|
||||
LD a2, X, 0 * SIZE
|
||||
FABS t4, a4
|
||||
add.d X, X, INCX
|
||||
CMPLT $fcc0, t1, s1
|
||||
LD a3, X, 0 * SIZE
|
||||
CMPLT $fcc1, t2, s2
|
||||
add.d X, X, INCX
|
||||
CMPLT $fcc2, t3, s3
|
||||
LD a4, X, 0 * SIZE
|
||||
CMPLT $fcc3, t4, s4
|
||||
add.d X, X, INCX
|
||||
CMOVT s1, s1, t1, $fcc0
|
||||
CMOVT s2, s2, t2, $fcc1
|
||||
CMOVT s3, s3, t3, $fcc2
|
||||
CMOVT s4, s4, t4, $fcc3
|
||||
/* Second half: |a5..a8| against s1..s4; a5..a8 are reloaded for the next pass. */
FABS t1, a5
|
||||
LD a5, X, 0 * SIZE
|
||||
FABS t2, a6
|
||||
add.d X, X, INCX
|
||||
FABS t3, a7
|
||||
LD a6, X, 0 * SIZE
|
||||
FABS t4, a8
|
||||
add.d X, X, INCX
|
||||
CMPLT $fcc0, t1, s1
|
||||
LD a7, X, 0 * SIZE
|
||||
CMPLT $fcc1, t2, s2
|
||||
add.d X, X, INCX
|
||||
CMPLT $fcc2, t3, s3
|
||||
LD a8, X, 0 * SIZE
|
||||
CMPLT $fcc3, t4, s4
|
||||
add.d X, X, INCX
|
||||
CMOVT s1, s1, t1, $fcc0
|
||||
addi.d I, I, -1
|
||||
CMOVT s2, s2, t2, $fcc1
|
||||
CMOVT s3, s3, t3, $fcc2
|
||||
CMOVT s4, s4, t4, $fcc3
|
||||
/* Loop while groups remain (I > 0). */
blt $r0, I, .L12
|
||||
.align 3
|
||||
/* Drain: folds the last preloaded group a1..a8 into s1..s4 without issuing
   any further loads. */
.L13:
|
||||
FABS t1, a1
|
||||
FABS t2, a2
|
||||
FABS t3, a3
|
||||
FABS t4, a4
|
||||
CMPLT $fcc0, t1, s1
|
||||
CMPLT $fcc1, t2, s2
|
||||
CMPLT $fcc2, t3, s3
|
||||
CMPLT $fcc3, t4, s4
|
||||
CMOVT s1, s1, t1, $fcc0
|
||||
CMOVT s2, s2, t2, $fcc1
|
||||
CMOVT s3, s3, t3, $fcc2
|
||||
CMOVT s4, s4, t4, $fcc3
|
||||
/* Same again for a5..a8. */
FABS t1, a5
|
||||
FABS t2, a6
|
||||
FABS t3, a7
|
||||
FABS t4, a8
|
||||
CMPLT $fcc0, t1, s1
|
||||
CMPLT $fcc1, t2, s2
|
||||
CMPLT $fcc2, t3, s3
|
||||
CMPLT $fcc3, t4, s4
|
||||
CMOVT s1, s1, t1, $fcc0
|
||||
CMOVT s2, s2, t2, $fcc1
|
||||
CMOVT s3, s3, t3, $fcc2
|
||||
CMOVT s4, s4, t4, $fcc3
|
||||
.align 3
|
||||
/* Remainder: I = N & 7 elements are left after the 8-wide groups; none -> .L998. */
.L15:
|
||||
andi I, N, 7
|
||||
NOP
|
||||
bge $r0, I, .L998
|
||||
.align 3
|
||||
/* One element per iteration, folded into s1 only. */
.L16:
|
||||
LD a1, X, 0 * SIZE
|
||||
addi.d I, I, -1
|
||||
FABS t1, a1
|
||||
CMPLT $fcc0, t1, s1
|
||||
CMOVT s1, s1, t1, $fcc0
|
||||
add.d X, X, INCX
|
||||
blt $r0, I, .L16
|
||||
.align 3
|
||||
/* Final reduction: merge the four partial results pairwise (s1 with s2, s3
   with s4, then s1 with s3) so that s1 holds the overall result. */
.L998:
|
||||
CMPLT $fcc0, s2, s1
|
||||
CMPLT $fcc1, s4, s3
|
||||
CMOVT s1, s1, s2, $fcc0
|
||||
CMOVT s3, s3, s4, $fcc1
|
||||
CMPLT $fcc0, s3, s1
|
||||
CMOVT s1, s1, s3, $fcc0
|
||||
.align 3
|
||||
/* Exit: copy s1 ($f22) into $f0 and return through $r1. */
.L999:
|
||||
/* NOTE(review): copies the loop counter I ($r17) into $r4; the purpose is not evident from this file. */
move $r4, $r17
|
||||
fmov.d $f0, $f22
|
||||
jirl $r0, $r1, 0x0
|
||||
EPILOGUE
|
||||
232
kernel/loongarch64/asum.S
Normal file
232
kernel/loongarch64/asum.S
Normal file
@@ -0,0 +1,232 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2021, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
/* Integer registers: N is the element count, X the load base pointer, INCX
   the stride (shifted by BASE_SHIFT at entry). */
#define N $r4
|
||||
#define X $r5
|
||||
#define INCX $r6
|
||||
/* I is the loop counter; TEMP holds SIZE for the unit-stride test. */
#define I $r17
|
||||
#define TEMP $r18
|
||||
/* a1..a8: staging registers for the eight elements of one unrolled group. */
#define a1 $f23
|
||||
#define a2 $f9
|
||||
#define a3 $f10
|
||||
#define a4 $f11
|
||||
#define a5 $f12
|
||||
#define a6 $f13
|
||||
#define a7 $f14
|
||||
#define a8 $f15
|
||||
/* t1..t4: FABS results waiting to be added. */
#define t1 $f16
|
||||
#define t2 $f17
|
||||
#define t3 $f0
|
||||
#define t4 $f1
|
||||
/* s1, s2: two partial sums, combined at .L999. */
#define s1 $f22
|
||||
#define s2 $f8
|
||||
/* Entry: zeroes both partial sums, picks the unit-stride or strided path and
   preloads the first contiguous group of eight elements.
   LDINT, MTC, LD, FABS, BASE_SHIFT and SIZE come from common.h; the notes
   below assume their names describe them -- confirm there. */
PROLOGUE
|
||||
#ifdef F_INTERFACE
|
||||
/* With F_INTERFACE, N and INCX hold addresses and are replaced by the values loaded from them. */
LDINT N, 0(N)
|
||||
LDINT INCX, 0(INCX)
|
||||
#endif
|
||||
/* s1 = s2 = 0 (from $r0). */
MTC s1, $r0
|
||||
MTC s2, $r0
|
||||
/* INCX <<= BASE_SHIFT (presumably element stride -> byte stride); TEMP = SIZE for the test below.
   NOTE(review): this routine has no INCX <= 0 guard; confirm the caller rejects such strides. */
slli.d INCX, INCX, BASE_SHIFT
|
||||
li TEMP, SIZE
|
||||
bge $r0, N, .L999
|
||||
/* I = N / 8 full groups. A stride other than SIZE takes the strided path at .L20. */
srai.d I, N, 3
|
||||
bne INCX, TEMP, .L20
|
||||
bge $r0, I, .L15
|
||||
/* Unit stride: preload a1..a8 from consecutive slots and start t1..t4 = |a1..a4|. */
LD a1, X, 0 * SIZE
|
||||
LD a2, X, 1 * SIZE
|
||||
LD a3, X, 2 * SIZE
|
||||
LD a4, X, 3 * SIZE
|
||||
LD a5, X, 4 * SIZE
|
||||
FABS t1, a1
|
||||
LD a6, X, 5 * SIZE
|
||||
FABS t2, a2
|
||||
LD a7, X, 6 * SIZE
|
||||
FABS t3, a3
|
||||
FABS t4, a4
|
||||
addi.d I, I, -1
|
||||
LD a8, X, 7 * SIZE
|
||||
/* Only one group in total: drain at .L13. */
bge $r0, I, .L13
|
||||
.align 3
|
||||
/* Unit-stride steady-state loop: adds the pending |a| values into s1/s2
   (alternating) while loading the next group. On entry t1..t4 hold |a1..a4|
   of the current group and a5..a8 hold its second half. */
.L12:
|
||||
ADD s1, s1, t1
|
||||
LD a1, X, 8 * SIZE
|
||||
FABS t1, a5
|
||||
addi.d I, I, -1
|
||||
ADD s2, s2, t2
|
||||
LD a2, X, 9 * SIZE
|
||||
FABS t2, a6
|
||||
NOP
|
||||
ADD s1, s1, t3
|
||||
LD a3, X, 10 * SIZE
|
||||
FABS t3, a7
|
||||
NOP
|
||||
ADD s2, s2, t4
|
||||
LD a4, X, 11 * SIZE
|
||||
FABS t4, a8
|
||||
/* X advances by one group here, so the a5..a8 reloads below use offsets 4..7 from the new X. */
addi.d X, X, 8 * SIZE
|
||||
ADD s1, s1, t1
|
||||
LD a5, X, 4 * SIZE
|
||||
FABS t1, a1
|
||||
NOP
|
||||
ADD s2, s2, t2
|
||||
LD a6, X, 5 * SIZE
|
||||
FABS t2, a2
|
||||
NOP
|
||||
ADD s1, s1, t3
|
||||
LD a7, X, 6 * SIZE
|
||||
FABS t3, a3
|
||||
NOP
|
||||
ADD s2, s2, t4
|
||||
LD a8, X, 7 * SIZE
|
||||
FABS t4, a4
|
||||
/* Loop while groups remain (I > 0). */
blt $r0, I, .L12
|
||||
.align 3
|
||||
/* Unit-stride drain: adds the last preloaded group (pending t1..t4, then
   |a5..a8|) and steps X past it; no further loads. */
.L13:
|
||||
ADD s1, s1, t1
|
||||
addi.d X, X, 8 * SIZE
|
||||
FABS t1, a5
|
||||
NOP
|
||||
ADD s2, s2, t2
|
||||
FABS t2, a6
|
||||
ADD s1, s1, t3
|
||||
FABS t3, a7
|
||||
ADD s2, s2, t4
|
||||
FABS t4, a8
|
||||
ADD s1, s1, t1
|
||||
ADD s2, s2, t2
|
||||
ADD s1, s1, t3
|
||||
ADD s2, s2, t4
|
||||
.align 3
|
||||
/* Unit-stride remainder: I = N & 7 leftover elements; none -> .L999. */
.L15:
|
||||
andi I, N, 7
|
||||
bge $r0, I, .L999
|
||||
.align 3
|
||||
/* One element per iteration, added into s1; X steps by SIZE. */
.L16:
|
||||
LD a1, X, 0 * SIZE
|
||||
addi.d I, I, -1
|
||||
FABS t1, a1
|
||||
ADD s1, s1, t1
|
||||
addi.d X, X, SIZE
|
||||
blt $r0, I, .L16
|
||||
/* Skip the strided path below. */
b .L999
|
||||
.align 3
|
||||
/* Strided path entry (INCX != SIZE): I still holds N / 8. Preloads a1..a8,
   advancing X by INCX after each load, and starts t1..t4 = |a1..a4|. */
.L20:
|
||||
/* No full group: go to the strided remainder loop. */
bge $r0, I, .L25
|
||||
LD a1, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a2, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a3, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a4, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a5, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a6, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
FABS t1, a1
|
||||
LD a7, X, 0 * SIZE
|
||||
FABS t2, a2
|
||||
add.d X, X, INCX
|
||||
FABS t3, a3
|
||||
LD a8, X, 0 * SIZE
|
||||
FABS t4, a4
|
||||
addi.d I, I, -1
|
||||
add.d X, X, INCX
|
||||
/* Only one group in total: drain at .L24. */
bge $r0, I, .L24
|
||||
.align 3
|
||||
/* Strided steady-state loop: same schedule as the unit-stride loop, but every
   load reads offset 0 and is followed by X += INCX. Sums alternate between
   s1 and s2. */
.L23:
|
||||
ADD s1, s1, t1
|
||||
LD a1, X, 0 * SIZE
|
||||
FABS t1, a5
|
||||
add.d X, X, INCX
|
||||
ADD s2, s2, t2
|
||||
LD a2, X, 0 * SIZE
|
||||
FABS t2, a6
|
||||
add.d X, X, INCX
|
||||
ADD s1, s1, t3
|
||||
LD a3, X, 0 * SIZE
|
||||
FABS t3, a7
|
||||
add.d X, X, INCX
|
||||
ADD s2, s2, t4
|
||||
LD a4, X, 0 * SIZE
|
||||
FABS t4, a8
|
||||
add.d X, X, INCX
|
||||
ADD s1, s1, t1
|
||||
LD a5, X, 0 * SIZE
|
||||
FABS t1, a1
|
||||
add.d X, X, INCX
|
||||
ADD s2, s2, t2
|
||||
LD a6, X, 0 * SIZE
|
||||
FABS t2, a2
|
||||
add.d X, X, INCX
|
||||
ADD s1, s1, t3
|
||||
LD a7, X, 0 * SIZE
|
||||
FABS t3, a3
|
||||
add.d X, X, INCX
|
||||
ADD s2, s2, t4
|
||||
LD a8, X, 0 * SIZE
|
||||
FABS t4, a4
|
||||
addi.d I, I, -1
|
||||
add.d X, X, INCX
|
||||
/* Loop while groups remain (I > 0). */
blt $r0, I, .L23
|
||||
.align 3
|
||||
/* Strided drain: adds the last preloaded group (pending t1..t4, then |a5..a8|);
   X was already advanced past it by the loads. */
.L24:
|
||||
ADD s1, s1, t1
|
||||
FABS t1, a5
|
||||
ADD s2, s2, t2
|
||||
FABS t2, a6
|
||||
ADD s1, s1, t3
|
||||
FABS t3, a7
|
||||
ADD s2, s2, t4
|
||||
FABS t4, a8
|
||||
ADD s1, s1, t1
|
||||
ADD s2, s2, t2
|
||||
ADD s1, s1, t3
|
||||
ADD s2, s2, t4
|
||||
.align 3
|
||||
/* Strided remainder: I = N & 7 leftover elements; none -> .L999. */
.L25:
|
||||
andi I, N, 7
|
||||
bge $r0, I, .L999
|
||||
.align 3
|
||||
/* One element per iteration, added into s1; X steps by INCX. */
.L26:
|
||||
LD a1, X, 0 * SIZE
|
||||
addi.d I, I, -1
|
||||
FABS t1, a1
|
||||
add.d X, X, INCX
|
||||
ADD s1, s1, t1
|
||||
blt $r0, I, .L26
|
||||
.align 3
|
||||
/* Exit: combine the two partial sums, copy s1 ($f22) into $f0 and return
   through $r1. */
.L999:
|
||||
ADD s1, s1, s2
|
||||
/* NOTE(review): copies the loop counter I ($r17) into $r4; the purpose is not evident from this file. */
move $r4, $r17
|
||||
fmov.d $f0, $f22
|
||||
jirl $r0, $r1, 0x0
|
||||
EPILOGUE
|
||||
159
kernel/loongarch64/cnrm2.S
Normal file
159
kernel/loongarch64/cnrm2.S
Normal file
@@ -0,0 +1,159 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2021, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/* Integer registers: N is the element count, X the load base pointer, INCX
   the stride (shifted by ZBASE_SHIFT at entry). */
#define N $r4
|
||||
#define X $r5
|
||||
#define INCX $r6
|
||||
/* I is the loop counter. TEMP is written once at entry and never read in this routine. */
#define I $r17
|
||||
#define TEMP $r18
|
||||
/* a1..a8: loaded values; each element contributes two (offsets 0 and 1 * SIZE). */
#define a1 $f12
|
||||
#define a2 $f13
|
||||
#define a3 $f14
|
||||
#define a4 $f15
|
||||
#define a5 $f16
|
||||
#define a6 $f17
|
||||
#define a7 $f0
|
||||
#define a8 $f1
|
||||
/* s1, s2: two double-precision partial sums of squares, combined at .L999. */
#define s1 $f22
|
||||
#define s2 $f8
|
||||
/* t1..t4: loaded values widened to double by fcvt.d.s. */
#define t1 $f23
|
||||
#define t2 $f9
|
||||
#define t3 $f10
|
||||
#define t4 $f11
|
||||
|
||||
/* Entry: zeroes both partial sums, validates N and INCX, and preloads the
   first group of four elements (two values each, presumably real and
   imaginary parts). Values are widened to double before squaring.
   LDINT, LD, ZBASE_SHIFT and SIZE come from common.h -- confirm their
   definitions there. */
PROLOGUE
|
||||
|
||||
#ifdef F_INTERFACE
|
||||
/* With F_INTERFACE, N and INCX hold addresses and are replaced by the values loaded from them. */
LDINT N, 0(N)
|
||||
LDINT INCX, 0(INCX)
|
||||
#endif
|
||||
|
||||
/* s1 = s2 = 0. TEMP is loaded with 2 * SIZE but not used afterwards. */
movgr2fr.d s1, $r0
|
||||
li TEMP, 2 * SIZE
|
||||
fmov.d s2, s1
|
||||
bge $r0, N, .L999
|
||||
/* INCX <<= ZBASE_SHIFT (presumably element stride -> byte stride); exit if the result is <= 0. */
slli.d INCX, INCX, ZBASE_SHIFT
|
||||
bge $r0, INCX, .L999
|
||||
/* I = N / 4 full groups; none -> remainder loop at .L25. */
srai.d I, N, 2
|
||||
bge $r0, I, .L25
|
||||
/* Preload four elements into a1..a8 and start widening the first two (t1..t4). */
LD a1, X, 0 * SIZE
|
||||
LD a2, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a3, X, 0 * SIZE
|
||||
LD a4, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a5, X, 0 * SIZE
|
||||
LD a6, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
fcvt.d.s t1, a1
|
||||
LD a7, X, 0 * SIZE
|
||||
fcvt.d.s t2, a2
|
||||
LD a8, X, 1 * SIZE
|
||||
fcvt.d.s t3, a3
|
||||
addi.d I, I, -1
|
||||
fcvt.d.s t4, a4
|
||||
add.d X, X, INCX
|
||||
/* Only one group in total: drain at .L24. */
bge $r0, I, .L24
|
||||
.align 3
|
||||
|
||||
/* Steady-state loop: accumulates t*t into s1/s2 (fmadd.d, alternating) for
   the four elements loaded on the previous pass while loading and widening
   the next four. Statement order is part of the schedule. */
.L23:
|
||||
fmadd.d s1, t1, t1, s1
|
||||
LD a1, X, 0 * SIZE
|
||||
fcvt.d.s t1, a5
|
||||
fmadd.d s2, t2, t2, s2
|
||||
LD a2, X, 1 * SIZE
|
||||
fcvt.d.s t2, a6
|
||||
add.d X, X, INCX
|
||||
fmadd.d s1, t3, t3, s1
|
||||
LD a3, X, 0 * SIZE
|
||||
fcvt.d.s t3, a7
|
||||
fmadd.d s2, t4, t4, s2
|
||||
LD a4, X, 1 * SIZE
|
||||
fcvt.d.s t4, a8
|
||||
add.d X, X, INCX
|
||||
fmadd.d s1, t1, t1, s1
|
||||
LD a5, X, 0 * SIZE
|
||||
fcvt.d.s t1, a1
|
||||
addi.d I, I, -1
|
||||
fmadd.d s2, t2, t2, s2
|
||||
LD a6, X, 1 * SIZE
|
||||
fcvt.d.s t2, a2
|
||||
add.d X, X, INCX
|
||||
fmadd.d s1, t3, t3, s1
|
||||
LD a7, X, 0 * SIZE
|
||||
fcvt.d.s t3, a3
|
||||
LD a8, X, 1 * SIZE
|
||||
fmadd.d s2, t4, t4, s2
|
||||
add.d X, X, INCX
|
||||
fcvt.d.s t4, a4
|
||||
/* Loop while groups remain (I > 0). */
blt $r0, I, .L23
|
||||
.align 3
|
||||
|
||||
/* Drain: accumulates the squares of the last preloaded group (pending t1..t4,
   then a5..a8 widened to double); no further loads. */
.L24:
|
||||
fmadd.d s1, t1, t1, s1
|
||||
fcvt.d.s t1, a5
|
||||
fmadd.d s2, t2, t2, s2
|
||||
fcvt.d.s t2, a6
|
||||
fmadd.d s1, t3, t3, s1
|
||||
fcvt.d.s t3, a7
|
||||
fmadd.d s2, t4, t4, s2
|
||||
fcvt.d.s t4, a8
|
||||
fmadd.d s1, t1, t1, s1
|
||||
fmadd.d s2, t2, t2, s2
|
||||
fmadd.d s1, t3, t3, s1
|
||||
fmadd.d s2, t4, t4, s2
|
||||
.align 3
|
||||
|
||||
/* Remainder: I = N & 3 elements are left after the 4-wide groups; none -> .L999. */
.L25:
|
||||
andi I, N, 3
|
||||
bge $r0, I, .L999
|
||||
.align 3
|
||||
|
||||
/* One element per iteration: its two values are widened to double and their
   squares added to s1 and s2 respectively. */
.L26:
|
||||
LD a1, X, 0 * SIZE
|
||||
LD a2, X, 1 * SIZE
|
||||
addi.d I, I, -1
|
||||
fcvt.d.s t1, a1
|
||||
fcvt.d.s t2, a2
|
||||
fmadd.d s1, t1, t1, s1
|
||||
add.d X, X, INCX
|
||||
fmadd.d s2, t2, t2, s2
|
||||
blt $r0, I, .L26
|
||||
.align 3
|
||||
|
||||
/* Exit: s1 = sqrt(s1 + s2) in double precision, narrowed to single precision
   into $f0, then return through $r1. On the early-exit paths both sums are
   still zero, so the result is 0. */
.L999:
|
||||
fadd.d s1, s1, s2
|
||||
fsqrt.d s1, s1
|
||||
/* NOTE(review): copies the loop counter I ($r17) into $r4; the purpose is not evident from this file. */
move $r4, $r17
|
||||
fcvt.s.d $f0, s1
|
||||
jirl $r0, $r1, 0x0
|
||||
|
||||
EPILOGUE
|
||||
225
kernel/loongarch64/copy.S
Normal file
225
kernel/loongarch64/copy.S
Normal file
@@ -0,0 +1,225 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2021, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
|
||||
#include "common.h"
|
||||
/*
 * Vector copy kernel: reads N elements from X (stride INCX) and writes them
 * to Y (stride INCY).
 *
 * LD, ST, SIZE, BASE_SHIFT, LDINT, PROLOGUE and EPILOGUE come from common.h
 * and are not visible here; presumably LD/ST are the element load/store for
 * the configured precision and SIZE == 1 << BASE_SHIFT is the element size
 * in bytes - TODO confirm against common.h.
 *
 * Integer registers:
 *   N, X, INCX, Y, INCY : $r4..$r8, the incoming arguments
 *   I                   : loop counter
 *   TEMP                : holds SIZE, compared against the scaled strides
 * FP registers a1..a8 stage up to eight elements between load and store.
 */
#define N $r4
|
||||
#define X $r5
|
||||
#define INCX $r6
|
||||
#define Y $r7
|
||||
#define INCY $r8
|
||||
#define I $r17
|
||||
#define TEMP $r18
|
||||
#define a1 $f22
|
||||
#define a2 $f8
|
||||
#define a3 $f23
|
||||
#define a4 $f9
|
||||
#define a5 $f10
|
||||
#define a6 $f11
|
||||
#define a7 $f12
|
||||
#define a8 $f13
|
||||
|
||||
PROLOGUE
|
||||
|
||||
/* With F_INTERFACE the scalar arguments arrive by reference and are loaded
   here.  NOTE(review): the "0(N)" operand form looks like MIPS syntax;
   whether it assembles depends on how LDINT is defined - confirm. */
#ifdef F_INTERFACE
|
||||
LDINT N, 0(N)
|
||||
LDINT INCX, 0(INCX)
|
||||
LDINT INCY, 0(INCY)
|
||||
#endif
|
||||
|
||||
/* Scale both strides by BASE_SHIFT, leave immediately when N <= 0, and take
   the contiguous path only when both scaled strides equal SIZE. */
li TEMP, SIZE
|
||||
NOP
|
||||
slli.d INCX, INCX, BASE_SHIFT
|
||||
bge $r0, N, .L999
|
||||
slli.d INCY, INCY, BASE_SHIFT
|
||||
bne INCX, TEMP, .L20
|
||||
/* I = N / 8 = number of full 8-element groups */
srai.d I, N, 3
|
||||
bne INCY, TEMP, .L20
|
||||
/* I = groups - 1; negative means there is no full group at all */
addi.d I, I, -1
|
||||
blt I, $r0, .L15
|
||||
/* Preload the first group of eight elements */
LD a1, X, 0 * SIZE
|
||||
LD a2, X, 1 * SIZE
|
||||
LD a3, X, 2 * SIZE
|
||||
LD a4, X, 3 * SIZE
|
||||
LD a5, X, 4 * SIZE
|
||||
LD a6, X, 5 * SIZE
|
||||
LD a7, X, 6 * SIZE
|
||||
LD a8, X, 7 * SIZE
|
||||
/* Exactly one group: skip the pipelined loop and just drain it */
bge $r0, I, .L13
|
||||
.align 3
|
||||
|
||||
/* Contiguous main loop: store the group loaded previously while loading the
   next one (offsets 8..15), then advance X and Y by eight elements. */
.L12:
|
||||
ST a1, Y, 0 * SIZE
|
||||
LD a1, X, 8 * SIZE
|
||||
ST a2, Y, 1 * SIZE
|
||||
LD a2, X, 9 * SIZE
|
||||
ST a3, Y, 2 * SIZE
|
||||
LD a3, X, 10 * SIZE
|
||||
ST a4, Y, 3 * SIZE
|
||||
LD a4, X, 11 * SIZE
|
||||
ST a5, Y, 4 * SIZE
|
||||
LD a5, X, 12 * SIZE
|
||||
ST a6, Y, 5 * SIZE
|
||||
LD a6, X, 13 * SIZE
|
||||
ST a7, Y, 6 * SIZE
|
||||
LD a7, X, 14 * SIZE
|
||||
ST a8, Y, 7 * SIZE
|
||||
LD a8, X, 15 * SIZE
|
||||
addi.d I, I, -1
|
||||
addi.d X, X, 8 * SIZE
|
||||
addi.d Y, Y, 8 * SIZE
|
||||
blt $r0, I, .L12
|
||||
.align 3
|
||||
|
||||
/* Drain: store the last group that is still held in a1..a8 */
.L13:
|
||||
ST a1, Y, 0 * SIZE
|
||||
ST a2, Y, 1 * SIZE
|
||||
ST a3, Y, 2 * SIZE
|
||||
ST a4, Y, 3 * SIZE
|
||||
ST a5, Y, 4 * SIZE
|
||||
ST a6, Y, 5 * SIZE
|
||||
ST a7, Y, 6 * SIZE
|
||||
ST a8, Y, 7 * SIZE
|
||||
addi.d X, X, 8 * SIZE
|
||||
addi.d Y, Y, 8 * SIZE
|
||||
.align 3
|
||||
|
||||
/* Contiguous tail: the remaining N % 8 elements */
.L15:
|
||||
andi I, N, 7
|
||||
bge $r0, I, .L999
|
||||
.align 3
|
||||
|
||||
/* One element per iteration; Y is advanced before the store, hence the
   -1 * SIZE offset. */
.L16:
|
||||
LD a1, X, 0 * SIZE
|
||||
addi.d X, X, SIZE
|
||||
addi.d I, I, -1
|
||||
addi.d Y, Y, SIZE
|
||||
ST a1, Y, -1 * SIZE
|
||||
blt $r0, I, .L16
|
||||
b .L999
|
||||
.align 3
|
||||
|
||||
/* Strided path: same structure as above, but X and Y are advanced by the
   scaled strides INCX / INCY after every element. */
.L20:
|
||||
srai.d I, N, 3
|
||||
addi.d I, I, -1
|
||||
blt I, $r0, .L25
|
||||
/* Preload the first group of eight elements */
LD a1, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a2, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a3, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a4, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a5, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a6, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a7, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a8, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
bge $r0, I, .L23
|
||||
.align 3
|
||||
|
||||
/* Strided main loop: store element k of the previous group, then reload
   register k with element k of the next group. */
.L22:
|
||||
ST a1, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
LD a1, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ST a2, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
LD a2, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ST a3, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
LD a3, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ST a4, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
LD a4, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ST a5, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
LD a5, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ST a6, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
LD a6, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ST a7, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
LD a7, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ST a8, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
LD a8, X, 0 * SIZE
|
||||
addi.d I, I, -1
|
||||
add.d X, X, INCX
|
||||
blt $r0, I, .L22
|
||||
.align 3
|
||||
|
||||
/* Drain: store the last strided group held in a1..a8 */
.L23:
|
||||
ST a1, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ST a2, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ST a3, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ST a4, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ST a5, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ST a6, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ST a7, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ST a8, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
.align 3
|
||||
|
||||
/* Strided tail: the remaining N % 8 elements */
.L25:
|
||||
andi I, N, 7
|
||||
bge $r0, I, .L999
|
||||
.align 3
|
||||
|
||||
.L26:
|
||||
LD a1, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
addi.d I, I, -1
|
||||
ST a1, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
blt $r0, I, .L26
|
||||
.align 3
|
||||
|
||||
/* Common exit: copies I ($r17) into $r4 and a1 ($f22) into $f0, then returns
   through the link register $r1.
   NOTE(review): these values look incidental for a copy routine - confirm
   that no caller relies on them. */
.L999:
|
||||
move $r4, $r17
|
||||
fmov.d $f0, $f22
|
||||
jirl $r0, $r1, 0x0
|
||||
|
||||
EPILOGUE
|
||||
314
kernel/loongarch64/dnrm2.S
Normal file
314
kernel/loongarch64/dnrm2.S
Normal file
@@ -0,0 +1,314 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2021, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/*
 * Euclidean norm kernel, computed in two passes over X to limit
 * overflow/underflow of the squares:
 *   pass 1: max = largest |x[i]| (four running maxima s1..s4)
 *   pass 2: sum = sum of (ALPHA * x[i])^2 with ALPHA = 1 / max
 *   result: $f0 = max * sqrt(sum)
 * Early exits (.L999) return s1: 0 for N <= 0 or INCX <= 0, |x[0]| for
 * N == 1, and 0 when every element is zero.
 *
 * LD, FABS, CMPLT, CMOVT, CMPEQ, MOV, MUL, MADD, ADD, MTC, SIZE and
 * BASE_SHIFT are macros from common.h and are not visible here; presumably
 * they expand to the precision-specific FP instructions - TODO confirm.
 *
 * Integer registers:
 *   N, X, INCX : $r4..$r6, the incoming arguments
 *   XX         : copy of the original X, walked by pass 2
 *   I          : loop counter
 *   TEMP       : scratch for building the constant 1.0
 * FP registers:
 *   a1..a8 : loaded elements      t1..t4 : |x| or ALPHA * x temporaries
 *   s1..s4 : running maxima in pass 1, partial sums in pass 2
 *   ALPHA  : 1 / max              max    : largest |x[i]|
 */
#define N $r4
|
||||
#define X $r5
|
||||
#define INCX $r6
|
||||
#define XX $r7
|
||||
#define I $r17
|
||||
#define TEMP $r18
|
||||
#define a1 $f10
|
||||
#define a2 $f11
|
||||
#define a3 $f12
|
||||
#define a4 $f13
|
||||
#define a5 $f14
|
||||
#define a6 $f15
|
||||
#define a7 $f16
|
||||
#define a8 $f17
|
||||
#define t1 $f0
|
||||
#define t2 $f1
|
||||
#define t3 $f2
|
||||
#define t4 $f3
|
||||
#define s1 $f22
|
||||
#define s2 $f8
|
||||
#define s3 $f23
|
||||
#define s4 $f9
|
||||
#define ALPHA $f4
|
||||
#define max $f5
|
||||
|
||||
PROLOGUE
|
||||
|
||||
/* With F_INTERFACE the scalar arguments arrive by reference.
   NOTE(review): the "0(N)" operand form looks like MIPS syntax; whether it
   assembles depends on how LDINT is defined - confirm. */
#ifdef F_INTERFACE
|
||||
LDINT N, 0(N)
|
||||
LDINT INCX, 0(INCX)
|
||||
#endif
|
||||
|
||||
/* s1 = 0 so that the early exits below return zero */
MTC s1, $r0
|
||||
bge $r0, N, .L999
|
||||
slli.d INCX, INCX, BASE_SHIFT
|
||||
/* A zero or negative stride also returns 0 */
bge $r0, INCX, .L999
|
||||
/* Remember the start of the vector for pass 2 */
move XX, X
|
||||
NOP
|
||||
/* Seed all four running maxima with |x[0]|; N now counts the elements that
   are still unread.  With nothing left (N was 1) the result is |x[0]|. */
LD a1, X, 0 * SIZE
|
||||
addi.d N, N, -1
|
||||
add.d X, X, INCX
|
||||
FABS s1, a1
|
||||
FABS s2, a1
|
||||
bge $r0, N, .L999
|
||||
FABS s3, a1
|
||||
/* I = number of full 8-element groups among the remaining elements */
srai.d I, N, 3
|
||||
FABS s4, a1
|
||||
bge $r0, I, .L15
|
||||
/* Preload the first group of eight elements */
LD a1, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a2, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a3, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a4, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a5, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a6, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a7, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a8, X, 0 * SIZE
|
||||
addi.d I, I, -1
|
||||
add.d X, X, INCX
|
||||
/* Exactly one group: skip the pipelined loop and just drain it */
bge $r0, I, .L13
|
||||
.align 3
|
||||
|
||||
/* Pass 1 main loop: take |x| of the group loaded previously while loading
   the next group.  Each CMPLT sets a condition flag from (sK < tK) and the
   matching CMOVT presumably replaces sK with tK when that flag is set, i.e.
   sK = max(sK, tK) - verify against the CMOVT definition. */
.L12:
|
||||
FABS t1, a1
|
||||
LD a1, X, 0 * SIZE
|
||||
FABS t2, a2
|
||||
add.d X, X, INCX
|
||||
FABS t3, a3
|
||||
LD a2, X, 0 * SIZE
|
||||
FABS t4, a4
|
||||
add.d X, X, INCX
|
||||
CMPLT $fcc0, s1, t1
|
||||
LD a3, X, 0 * SIZE
|
||||
CMPLT $fcc1, s2, t2
|
||||
add.d X, X, INCX
|
||||
CMPLT $fcc2, s3, t3
|
||||
LD a4, X, 0 * SIZE
|
||||
CMPLT $fcc3, s4, t4
|
||||
add.d X, X, INCX
|
||||
CMOVT s1, s1, t1, $fcc0
|
||||
CMOVT s2, s2, t2, $fcc1
|
||||
CMOVT s3, s3, t3, $fcc2
|
||||
CMOVT s4, s4, t4, $fcc3
|
||||
FABS t1, a5
|
||||
LD a5, X, 0 * SIZE
|
||||
FABS t2, a6
|
||||
add.d X, X, INCX
|
||||
FABS t3, a7
|
||||
LD a6, X, 0 * SIZE
|
||||
FABS t4, a8
|
||||
add.d X, X, INCX
|
||||
CMPLT $fcc0, s1, t1
|
||||
LD a7, X, 0 * SIZE
|
||||
CMPLT $fcc1, s2, t2
|
||||
add.d X, X, INCX
|
||||
CMPLT $fcc2, s3, t3
|
||||
LD a8, X, 0 * SIZE
|
||||
CMPLT $fcc3, s4, t4
|
||||
add.d X, X, INCX
|
||||
CMOVT s1, s1, t1, $fcc0
|
||||
addi.d I, I, -1
|
||||
CMOVT s2, s2, t2, $fcc1
|
||||
CMOVT s3, s3, t3, $fcc2
|
||||
CMOVT s4, s4, t4, $fcc3
|
||||
blt $r0, I, .L12
|
||||
.align 3
|
||||
|
||||
/* Pass 1 drain: fold the last loaded group into the running maxima */
.L13:
|
||||
FABS t1, a1
|
||||
FABS t2, a2
|
||||
FABS t3, a3
|
||||
FABS t4, a4
|
||||
CMPLT $fcc0, s1, t1
|
||||
CMPLT $fcc1, s2, t2
|
||||
CMPLT $fcc2, s3, t3
|
||||
CMPLT $fcc3, s4, t4
|
||||
CMOVT s1, s1, t1, $fcc0
|
||||
CMOVT s2, s2, t2, $fcc1
|
||||
CMOVT s3, s3, t3, $fcc2
|
||||
CMOVT s4, s4, t4, $fcc3
|
||||
FABS t1, a5
|
||||
FABS t2, a6
|
||||
FABS t3, a7
|
||||
FABS t4, a8
|
||||
CMPLT $fcc0, s1, t1
|
||||
CMPLT $fcc1, s2, t2
|
||||
CMPLT $fcc2, s3, t3
|
||||
CMPLT $fcc3, s4, t4
|
||||
CMOVT s1, s1, t1, $fcc0
|
||||
CMOVT s2, s2, t2, $fcc1
|
||||
CMOVT s3, s3, t3, $fcc2
|
||||
CMOVT s4, s4, t4, $fcc3
|
||||
.align 3
|
||||
|
||||
/* Pass 1 tail: the remaining (N % 8) elements update s1 only */
.L15:
|
||||
andi I, N, 7
|
||||
bge $r0, I, .L100
|
||||
.align 3
|
||||
|
||||
.L16:
|
||||
LD a1, X, 0 * SIZE
|
||||
addi.d I, I, -1
|
||||
FABS t1, a1
|
||||
CMPLT $fcc0, s1, t1
|
||||
CMOVT s1, s1, t1, $fcc0
|
||||
add.d X, X, INCX
|
||||
blt $r0, I, .L16
|
||||
.align 3
|
||||
|
||||
/* Reduce the four running maxima into s1, then set up pass 2 */
.L100:
|
||||
CMPLT $fcc0, s1, s2
|
||||
CMPLT $fcc1, s3, s4
|
||||
CMOVT s1, s1, s2, $fcc0
|
||||
CMOVT s3, s3, s4, $fcc1
|
||||
CMPLT $fcc0, s1, s3
|
||||
CMOVT s1, s1, s3, $fcc0
|
||||
/* Undo the decrement made after reading x[0]: pass 2 visits every element */
addi.d N, N, 1
|
||||
/* TEMP = 0x3f800000, the IEEE-754 single-precision bit pattern of 1.0 */
lu12i.w TEMP, 0x3f800
|
||||
/* a1 = 0.0, used for the zero test and to clear the accumulators */
movgr2fr.d a1, $r0
|
||||
movgr2fr.w ALPHA, TEMP
|
||||
CMPEQ $fcc0, s1, a1
|
||||
/* Widen the single-precision 1.0 to double */
fcvt.d.s ALPHA, ALPHA
|
||||
/* max == 0: all elements are zero, return s1 (= 0) and avoid dividing by 0 */
bcnez $fcc0, .L999
|
||||
/* ALPHA = 1 / max */
fdiv.d ALPHA, ALPHA, s1
|
||||
MOV max, s1
|
||||
/* s1..s4 become the four partial sums of squares, starting at 0 */
MOV s1, a1
|
||||
MOV s2, a1
|
||||
MOV s3, a1
|
||||
MOV s4, a1
|
||||
srai.d I, N, 3
|
||||
bge $r0, I, .L105
|
||||
/* Preload the first group of eight elements, restarting from XX */
LD a1, XX, 0 * SIZE
|
||||
add.d XX, XX, INCX
|
||||
LD a2, XX, 0 * SIZE
|
||||
add.d XX, XX, INCX
|
||||
LD a3, XX, 0 * SIZE
|
||||
add.d XX, XX, INCX
|
||||
LD a4, XX, 0 * SIZE
|
||||
add.d XX, XX, INCX
|
||||
LD a5, XX, 0 * SIZE
|
||||
add.d XX, XX, INCX
|
||||
LD a6, XX, 0 * SIZE
|
||||
add.d XX, XX, INCX
|
||||
LD a7, XX, 0 * SIZE
|
||||
add.d XX, XX, INCX
|
||||
LD a8, XX, 0 * SIZE
|
||||
addi.d I, I, -1
|
||||
add.d XX, XX, INCX
|
||||
bge $r0, I, .L104
|
||||
.align 3
|
||||
|
||||
/* Pass 2 main loop: tK = ALPHA * x, then MADD sK, tK, tK, sK (presumably
   sK = tK * tK + sK), interleaved with the loads of the next group. */
.L103:
|
||||
MUL t1, ALPHA, a1
|
||||
LD a1, XX, 0 * SIZE
|
||||
MUL t2, ALPHA, a2
|
||||
add.d XX, XX, INCX
|
||||
MUL t3, ALPHA, a3
|
||||
LD a2, XX, 0 * SIZE
|
||||
MUL t4, ALPHA, a4
|
||||
add.d XX, XX, INCX
|
||||
MADD s1, t1, t1, s1
|
||||
LD a3, XX, 0 * SIZE
|
||||
MADD s2, t2, t2, s2
|
||||
add.d XX, XX, INCX
|
||||
MADD s3, t3, t3, s3
|
||||
LD a4, XX, 0 * SIZE
|
||||
MADD s4, t4, t4, s4
|
||||
add.d XX, XX, INCX
|
||||
MUL t1, ALPHA, a5
|
||||
LD a5, XX, 0 * SIZE
|
||||
MUL t2, ALPHA, a6
|
||||
add.d XX, XX, INCX
|
||||
MUL t3, ALPHA, a7
|
||||
LD a6, XX, 0 * SIZE
|
||||
MUL t4, ALPHA, a8
|
||||
add.d XX, XX, INCX
|
||||
MADD s1, t1, t1, s1
|
||||
LD a7, XX, 0 * SIZE
|
||||
MADD s2, t2, t2, s2
|
||||
add.d XX, XX, INCX
|
||||
MADD s3, t3, t3, s3
|
||||
LD a8, XX, 0 * SIZE
|
||||
MADD s4, t4, t4, s4
|
||||
addi.d I, I, -1
|
||||
add.d XX, XX, INCX
|
||||
blt $r0, I, .L103
|
||||
.align 3
|
||||
|
||||
/* Pass 2 drain: accumulate the last loaded group */
.L104:
|
||||
MUL t1, ALPHA, a1
|
||||
MUL t2, ALPHA, a2
|
||||
MUL t3, ALPHA, a3
|
||||
MUL t4, ALPHA, a4
|
||||
MADD s1, t1, t1, s1
|
||||
MADD s2, t2, t2, s2
|
||||
MADD s3, t3, t3, s3
|
||||
MADD s4, t4, t4, s4
|
||||
MUL t1, ALPHA, a5
|
||||
MUL t2, ALPHA, a6
|
||||
MUL t3, ALPHA, a7
|
||||
MUL t4, ALPHA, a8
|
||||
MADD s1, t1, t1, s1
|
||||
MADD s2, t2, t2, s2
|
||||
MADD s3, t3, t3, s3
|
||||
MADD s4, t4, t4, s4
|
||||
.align 3
|
||||
|
||||
/* Pass 2 tail: the remaining N % 8 elements accumulate into s1 */
.L105:
|
||||
andi I, N, 7
|
||||
bge $r0, I, .L998
|
||||
.align 3
|
||||
|
||||
.L106:
|
||||
LD a1, XX, 0 * SIZE
|
||||
addi.d I, I, -1
|
||||
MUL t1, ALPHA, a1
|
||||
add.d XX, XX, INCX
|
||||
MADD s1, t1, t1, s1
|
||||
blt $r0, I, .L106
|
||||
.align 3
|
||||
|
||||
/* Combine the four partial sums and rescale: $f0 = max * sqrt(sum) */
.L998:
|
||||
ADD s1, s1, s2
|
||||
ADD s3, s3, s4
|
||||
ADD s1, s1, s3
|
||||
fsqrt.d s1, s1
|
||||
move $r4, $r17
|
||||
MUL $f0, max, s1
|
||||
jirl $r0, $r1, 0x0
|
||||
.align 3
|
||||
|
||||
/* Early exit: return s1 ($f22) unchanged in $f0 */
.L999:
|
||||
move $r4, $r17
|
||||
fmov.d $f0, $f22
|
||||
jirl $r0, $r1, 0x0
|
||||
|
||||
EPILOGUE
|
||||
391
kernel/loongarch64/dot.S
Normal file
391
kernel/loongarch64/dot.S
Normal file
@@ -0,0 +1,391 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2021, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
|
||||
#include "common.h"
|
||||
/*
 * Dot-product kernel: returns sum(x[i] * y[i]) for i in [0, N) in $f0.
 *
 * Two accumulators (s1, s2) are used alternately and added at the end.
 * With DSDOT defined, every element is widened with fcvt.d.s and the
 * multiply-add is done with fmadd.d; otherwise the MADD macro from common.h
 * is used.
 *
 * LD, MADD, ADD, MTC, LDINT, SIZE, BASE_SHIFT, PROLOGUE and EPILOGUE come
 * from common.h and are not visible here; presumably LD is the element load
 * for the configured precision and SIZE == 1 << BASE_SHIFT - TODO confirm.
 *
 * Integer registers:
 *   N, X, INCX, Y, INCY : $r4..$r8, the incoming arguments
 *   I                   : loop counter (number of 8-element groups, then N % 8)
 *   TEMP                : SIZE for the stride test; scratch in the
 *                         F_INTERFACE pointer adjustment
 * FP registers:
 *   a1..a4 / b1..b4     : staged elements of X / Y
 *   s1, s2              : partial sums
 */
#define N       $r4
#define X       $r5
#define INCX    $r6
#define Y       $r7
#define INCY    $r8
#define I       $r17
#define TEMP    $r18
#define a1      $f23
#define a2      $f9
#define a3      $f10
#define a4      $f11
#define b1      $f12
#define b2      $f13
#define b3      $f14
#define b4      $f15
#define s1      $f22
#define s2      $f8

        PROLOGUE

/* With F_INTERFACE the scalar arguments arrive by reference.
   NOTE(review): the "0(N)" operand form looks like MIPS syntax; whether it
   assembles depends on how LDINT is defined - confirm. */
#ifdef F_INTERFACE
        LDINT   N, 0(N)
        LDINT   INCX, 0(INCX)
        LDINT   INCY, 0(INCY)
#endif

        /* Clear both accumulators so that the N <= 0 exit returns 0 */
        MTC     s1, $r0
        MTC     s2, $r0
        /* Scale the strides by BASE_SHIFT */
        slli.d  INCX, INCX, BASE_SHIFT
        li      TEMP, SIZE
        slli.d  INCY, INCY, BASE_SHIFT
        bge     $r0, N, .L999
        /* I = N / 8 = number of full 8-element groups */
        srai.d  I, N, 3
        /* Contiguous path only when both scaled strides equal SIZE */
        bne     INCX, TEMP, .L20
        bne     INCY, TEMP, .L20
        bge     $r0, I, .L15
        /* Preload elements 0..3 of the first group */
        LD      a1, X, 0 * SIZE
        LD      b1, Y, 0 * SIZE
        LD      a2, X, 1 * SIZE
        LD      b2, Y, 1 * SIZE
        LD      a3, X, 2 * SIZE
        LD      b3, Y, 2 * SIZE
        LD      a4, X, 3 * SIZE
        addi.d  I, I, -1
        LD      b4, Y, 3 * SIZE
        /* Exactly one group: skip the pipelined loop */
        bge     $r0, I, .L13
        .align 3

/* Contiguous main loop: eight elements per iteration.  Each multiply-add
   consumes a pair loaded earlier and is followed by the reload of that pair
   from four elements further on. */
.L12:
#ifdef DSDOT
        fcvt.d.s a1, a1
        fcvt.d.s b1, b1
        fmadd.d s1, b1, a1, s1
#else
        MADD    s1, b1, a1, s1
#endif
        LD      a1, X, 4 * SIZE
        LD      b1, Y, 4 * SIZE
#ifdef DSDOT
        fcvt.d.s a2, a2
        fcvt.d.s b2, b2
        fmadd.d s2, b2, a2, s2
#else
        MADD    s2, b2, a2, s2
#endif
        LD      a2, X, 5 * SIZE
        LD      b2, Y, 5 * SIZE
#ifdef DSDOT
        fcvt.d.s a3, a3
        fcvt.d.s b3, b3
        fmadd.d s1, b3, a3, s1
#else
        MADD    s1, b3, a3, s1
#endif
        LD      a3, X, 6 * SIZE
        LD      b3, Y, 6 * SIZE
#ifdef DSDOT
        fcvt.d.s a4, a4
        fcvt.d.s b4, b4
        fmadd.d s2, b4, a4, s2
#else
        MADD    s2, b4, a4, s2
#endif
        LD      a4, X, 7 * SIZE
        LD      b4, Y, 7 * SIZE
#ifdef DSDOT
        fcvt.d.s a1, a1
        fcvt.d.s b1, b1
        fmadd.d s1, b1, a1, s1
#else
        MADD    s1, b1, a1, s1
#endif
        LD      a1, X, 8 * SIZE
        LD      b1, Y, 8 * SIZE
#ifdef DSDOT
        fcvt.d.s a2, a2
        fcvt.d.s b2, b2
        fmadd.d s2, b2, a2, s2
#else
        MADD    s2, b2, a2, s2
#endif
        LD      a2, X, 9 * SIZE
        LD      b2, Y, 9 * SIZE
#ifdef DSDOT
        fcvt.d.s a3, a3
        fcvt.d.s b3, b3
        fmadd.d s1, b3, a3, s1
#else
        MADD    s1, b3, a3, s1
#endif
        LD      a3, X, 10 * SIZE
        LD      b3, Y, 10 * SIZE
#ifdef DSDOT
        fcvt.d.s a4, a4
        fcvt.d.s b4, b4
        fmadd.d s2, b4, a4, s2
#else
        MADD    s2, b4, a4, s2
#endif
        LD      a4, X, 11 * SIZE
        LD      b4, Y, 11 * SIZE
        addi.d  I, I, -1
        addi.d  X, X, 8 * SIZE
        addi.d  Y, Y, 8 * SIZE
        blt     $r0, I, .L12
        .align 3

/* Drain: finish the last group (elements 0..3 are already loaded, 4..7 are
   loaded here) and step X and Y past it. */
.L13:
#ifdef DSDOT
        fcvt.d.s a1, a1
        fcvt.d.s b1, b1
        fmadd.d s1, b1, a1, s1
#else
        MADD    s1, b1, a1, s1
#endif
        LD      a1, X, 4 * SIZE
        LD      b1, Y, 4 * SIZE
#ifdef DSDOT
        fcvt.d.s a2, a2
        fcvt.d.s b2, b2
        fmadd.d s2, b2, a2, s2
#else
        MADD    s2, b2, a2, s2
#endif
        LD      a2, X, 5 * SIZE
        LD      b2, Y, 5 * SIZE
#ifdef DSDOT
        fcvt.d.s a3, a3
        fcvt.d.s b3, b3
        fmadd.d s1, b3, a3, s1
#else
        MADD    s1, b3, a3, s1
#endif
        LD      a3, X, 6 * SIZE
        LD      b3, Y, 6 * SIZE
#ifdef DSDOT
        fcvt.d.s a4, a4
        fcvt.d.s b4, b4
        fmadd.d s2, b4, a4, s2
#else
        MADD    s2, b4, a4, s2
#endif
        LD      a4, X, 7 * SIZE
        LD      b4, Y, 7 * SIZE
#ifdef DSDOT
        fcvt.d.s a1, a1
        fcvt.d.s b1, b1
        fmadd.d s1, b1, a1, s1
#else
        MADD    s1, b1, a1, s1
#endif
        addi.d  X, X, 8 * SIZE
#ifdef DSDOT
        fcvt.d.s a2, a2
        fcvt.d.s b2, b2
        fmadd.d s2, b2, a2, s2
#else
        MADD    s2, b2, a2, s2
#endif
        addi.d  Y, Y, 8 * SIZE
#ifdef DSDOT
        fcvt.d.s a3, a3
        fcvt.d.s b3, b3
        fmadd.d s1, b3, a3, s1
#else
        MADD    s1, b3, a3, s1
#endif
#ifdef DSDOT
        fcvt.d.s a4, a4
        fcvt.d.s b4, b4
        fmadd.d s2, b4, a4, s2
#else
        MADD    s2, b4, a4, s2
#endif
        .align 3

/* Contiguous tail: the remaining N % 8 elements, one per iteration */
.L15:
        andi    I, N, 7
        bge     $r0, I, .L999
        .align 3

.L16:
        LD      a1, X, 0 * SIZE
        LD      b1, Y, 0 * SIZE
#ifdef DSDOT
        fcvt.d.s a1, a1
        fcvt.d.s b1, b1
        fmadd.d s1, b1, a1, s1
#else
        MADD    s1, b1, a1, s1
#endif
        addi.d  I, I, -1
        addi.d  X, X, SIZE
        addi.d  Y, Y, SIZE
        blt     $r0, I, .L16
        b       .L999
        .align 3

/* Strided path */
.L20:
#ifdef F_INTERFACE
        /* For a negative stride, rewind the pointer by (N - 1) * stride bytes:
           subtracting the negative product moves it forward, so the loops
           below can keep adding the stride.
           This section previously used the MIPS mnemonics mult / mflo / dsub,
           which are not LoongArch instructions; it now uses mul.d / sub.d and
           the canonical "bge rj, $r0" form of the sign test. */
        bge     INCX, $r0, .L21
        addi.d  TEMP, N, -1
        mul.d   TEMP, TEMP, INCX
        sub.d   X, X, TEMP
        .align 3

.L21:
        bge     INCY, $r0, .L22
        addi.d  TEMP, N, -1
        mul.d   TEMP, TEMP, INCY
        sub.d   Y, Y, TEMP
        .align 3

.L22:
#endif
        bge     $r0, I, .L25
        .align 3

/* Strided main loop: eight load / multiply-add steps per iteration,
   alternating between the accumulators s1 and s2. */
.L23:
        LD      a1, X, 0 * SIZE
        add.d   X, X, INCX
        LD      b1, Y, 0 * SIZE
        add.d   Y, Y, INCY
#ifdef DSDOT
        fcvt.d.s a1, a1
        fcvt.d.s b1, b1
        fmadd.d s1, b1, a1, s1
#else
        MADD    s1, b1, a1, s1
#endif
        LD      a1, X, 0 * SIZE
        add.d   X, X, INCX
        LD      b1, Y, 0 * SIZE
        add.d   Y, Y, INCY
#ifdef DSDOT
        fcvt.d.s a1, a1
        fcvt.d.s b1, b1
        fmadd.d s2, b1, a1, s2
#else
        MADD    s2, b1, a1, s2
#endif
        LD      a1, X, 0 * SIZE
        add.d   X, X, INCX
        LD      b1, Y, 0 * SIZE
        add.d   Y, Y, INCY
#ifdef DSDOT
        fcvt.d.s a1, a1
        fcvt.d.s b1, b1
        fmadd.d s1, b1, a1, s1
#else
        MADD    s1, b1, a1, s1
#endif
        LD      a1, X, 0 * SIZE
        add.d   X, X, INCX
        LD      b1, Y, 0 * SIZE
        add.d   Y, Y, INCY
#ifdef DSDOT
        fcvt.d.s a1, a1
        fcvt.d.s b1, b1
        fmadd.d s2, b1, a1, s2
#else
        MADD    s2, b1, a1, s2
#endif
        LD      a1, X, 0 * SIZE
        add.d   X, X, INCX
        LD      b1, Y, 0 * SIZE
        add.d   Y, Y, INCY
#ifdef DSDOT
        fcvt.d.s a1, a1
        fcvt.d.s b1, b1
        fmadd.d s1, b1, a1, s1
#else
        MADD    s1, b1, a1, s1
#endif
        LD      a1, X, 0 * SIZE
        add.d   X, X, INCX
        LD      b1, Y, 0 * SIZE
        add.d   Y, Y, INCY
#ifdef DSDOT
        fcvt.d.s a1, a1
        fcvt.d.s b1, b1
        fmadd.d s2, b1, a1, s2
#else
        MADD    s2, b1, a1, s2
#endif
        LD      a1, X, 0 * SIZE
        add.d   X, X, INCX
        LD      b1, Y, 0 * SIZE
        add.d   Y, Y, INCY
#ifdef DSDOT
        fcvt.d.s a1, a1
        fcvt.d.s b1, b1
        fmadd.d s1, b1, a1, s1
#else
        MADD    s1, b1, a1, s1
#endif
        LD      a1, X, 0 * SIZE
        add.d   X, X, INCX
        LD      b1, Y, 0 * SIZE
        add.d   Y, Y, INCY
        addi.d  I, I, -1
#ifdef DSDOT
        fcvt.d.s a1, a1
        fcvt.d.s b1, b1
        fmadd.d s2, b1, a1, s2
#else
        MADD    s2, b1, a1, s2
#endif
        blt     $r0, I, .L23
        .align 3

/* Strided tail: the remaining N % 8 elements */
.L25:
        andi    I, N, 7
        bge     $r0, I, .L999
        .align 3

.L26:
        LD      a1, X, 0 * SIZE
        add.d   X, X, INCX
        LD      b1, Y, 0 * SIZE
        add.d   Y, Y, INCY
        addi.d  I, I, -1
#ifdef DSDOT
        fcvt.d.s a1, a1
        fcvt.d.s b1, b1
        fmadd.d s1, b1, a1, s1
#else
        MADD    s1, b1, a1, s1
#endif
        blt     $r0, I, .L26
        .align 3

/* Exit: $f0 = s1 + s2, then return through the link register $r1.
   $r4 receives a copy of I ($r17); NOTE(review): that integer value looks
   incidental - confirm no caller relies on it. */
.L999:
#ifdef DSDOT
        fadd.d  $f0, s1, s2
#else
        ADD     $f0, s1, s2
#endif
        move    $r4, $r17
        jirl    $r0, $r1, 0x0

        EPILOGUE
|
||||
1859
kernel/loongarch64/gemm_kernel.S
Normal file
1859
kernel/loongarch64/gemm_kernel.S
Normal file
File diff suppressed because it is too large
Load Diff
531
kernel/loongarch64/gemv_n.S
Normal file
531
kernel/loongarch64/gemv_n.S
Normal file
@@ -0,0 +1,531 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2021, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/* Unused param dummy1 */
|
||||
#define M $r4
|
||||
#define N $r5
|
||||
#define A $r7
|
||||
#define LDA $r8
|
||||
#define X $r9
|
||||
#define INCX $r10
|
||||
#define Y $r11
|
||||
#define INCY $r6
|
||||
#define BUFFER $r16
|
||||
#define YORIG $r18
|
||||
#define XX $r12
|
||||
#define YY $r13
|
||||
#define I $r14
|
||||
#define J $r15
|
||||
#define AO1 $r23
|
||||
#define AO2 $r24
|
||||
#define ALPHA $f0
|
||||
#define a1 $f22
|
||||
#define a2 $f8
|
||||
#define a3 $f23
|
||||
#define a4 $f9
|
||||
#define a5 $f10
|
||||
#define a6 $f11
|
||||
#define a7 $f12
|
||||
#define a8 $f13
|
||||
#define x1 $f14
|
||||
#define x2 $f15
|
||||
#define y1 $f16
|
||||
#define y2 $f17
|
||||
#define y3 $f3
|
||||
#define y4 $f1
|
||||
#define y5 $f2
|
||||
#define y6 $f4
|
||||
#define y7 $f5
|
||||
#define y8 $f6
|
||||
#define t1 $f7
|
||||
#define t2 $f18
|
||||
#define t3 $f19
|
||||
#define t4 $f20
|
||||
|
||||
PROLOGUE
|
||||
|
||||
LDARG INCY, $sp, 0
|
||||
LDARG BUFFER, $sp, 8
|
||||
#ifdef __64BIT__
|
||||
addi.d $sp, $sp, -16
|
||||
#else
|
||||
addi.d $sp, $sp, -48
|
||||
#endif
|
||||
SDARG $r23, $sp, 0
|
||||
SDARG $r24, $sp, 8
|
||||
slli.d LDA, LDA, BASE_SHIFT
|
||||
#ifndef __64BIT__
|
||||
fst.d $f18, $sp, 16
|
||||
fst.d $f19, $sp, 24
|
||||
fst.d $f20, $sp, 32
|
||||
#endif
|
||||
slli.d INCX, INCX, BASE_SHIFT
|
||||
bge $r0, M, .L999
|
||||
slli.d INCY, INCY, BASE_SHIFT
|
||||
bge $r0, N, .L999
|
||||
li I, SIZE
|
||||
move YORIG, Y
|
||||
beq INCY, I, .L10
|
||||
srai.d I, M, 2
|
||||
move YORIG, BUFFER
|
||||
move XX, Y
|
||||
move YY, BUFFER
|
||||
bge $r0, I, .L05
|
||||
.align 3
|
||||
|
||||
.L02:
|
||||
LD a1, XX, 0 * SIZE
|
||||
add.d XX, XX, INCY
|
||||
LD a2, XX, 0 * SIZE
|
||||
add.d XX, XX, INCY
|
||||
LD a3, XX, 0 * SIZE
|
||||
add.d XX, XX, INCY
|
||||
LD a4, XX, 0 * SIZE
|
||||
add.d XX, XX, INCY
|
||||
ST a1, YY, 0 * SIZE
|
||||
ST a2, YY, 1 * SIZE
|
||||
ST a3, YY, 2 * SIZE
|
||||
ST a4, YY, 3 * SIZE
|
||||
addi.d I, I, -1
|
||||
addi.d YY, YY, 4 * SIZE
|
||||
blt $r0, I, .L02
|
||||
.align 3
|
||||
|
||||
.L05:
|
||||
andi I, M, 3
|
||||
bge $r0, I, .L10
|
||||
.align 3
|
||||
|
||||
.L06:
|
||||
LD a1, XX, 0 * SIZE
|
||||
add.d XX, XX, INCY
|
||||
ST a1, YY, 0 * SIZE
|
||||
addi.d I, I, -1
|
||||
addi.d YY, YY, 1 * SIZE
|
||||
blt $r0, I, .L06
|
||||
.align 3
|
||||
|
||||
.L10:
|
||||
srai.d J, N, 1
|
||||
bge $r0, J, .L20
|
||||
.align 3
|
||||
|
||||
.L11:
|
||||
LD x1, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD x2, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
move AO1, A
|
||||
add.d AO2, A, LDA
|
||||
add.d A, AO2, LDA
|
||||
move YY, YORIG
|
||||
MUL x1, ALPHA, x1
|
||||
srai.d I, M, 3
|
||||
MUL x2, ALPHA, x2
|
||||
bge $r0, I, .L15
|
||||
LD a1, AO1, 0 * SIZE
|
||||
LD y1, YY, 0 * SIZE
|
||||
LD a2, AO1, 1 * SIZE
|
||||
LD y2, YY, 1 * SIZE
|
||||
LD a3, AO1, 2 * SIZE
|
||||
LD y3, YY, 2 * SIZE
|
||||
LD a4, AO1, 3 * SIZE
|
||||
LD y4, YY, 3 * SIZE
|
||||
LD a5, AO2, 0 * SIZE
|
||||
LD y5, YY, 4 * SIZE
|
||||
LD a6, AO2, 1 * SIZE
|
||||
LD y6, YY, 5 * SIZE
|
||||
LD a7, AO2, 2 * SIZE
|
||||
LD y7, YY, 6 * SIZE
|
||||
LD a8, AO2, 3 * SIZE
|
||||
addi.d I, I, -1
|
||||
LD y8, YY, 7 * SIZE
|
||||
bge $r0, I, .L13
|
||||
.align 3
|
||||
.L12:
|
||||
MADD t1, a1, x1, y1
|
||||
LD a1, AO1, 4 * SIZE
|
||||
MADD t2, a2, x1, y2
|
||||
LD a2, AO1, 5 * SIZE
|
||||
LD y1, YY, 8 * SIZE
|
||||
LD y2, YY, 9 * SIZE
|
||||
MADD t3, a3, x1, y3
|
||||
LD a3, AO1, 6 * SIZE
|
||||
MADD t4, a4, x1, y4
|
||||
LD a4, AO1, 7 * SIZE
|
||||
LD y3, YY, 10 * SIZE
|
||||
LD y4, YY, 11 * SIZE
|
||||
MADD t1, a5, x2, t1
|
||||
LD a5, AO2, 4 * SIZE
|
||||
MADD t2, a6, x2, t2
|
||||
LD a6, AO2, 5 * SIZE
|
||||
MADD t3, a7, x2, t3
|
||||
LD a7, AO2, 6 * SIZE
|
||||
MADD t4, a8, x2, t4
|
||||
LD a8, AO2, 7 * SIZE
|
||||
ST t1, YY, 0 * SIZE
|
||||
ST t2, YY, 1 * SIZE
|
||||
ST t3, YY, 2 * SIZE
|
||||
ST t4, YY, 3 * SIZE
|
||||
MADD t1, a1, x1, y5
|
||||
LD a1, AO1, 8 * SIZE
|
||||
MADD t2, a2, x1, y6
|
||||
LD a2, AO1, 9 * SIZE
|
||||
LD y5, YY, 12 * SIZE
|
||||
LD y6, YY, 13 * SIZE
|
||||
MADD t3, a3, x1, y7
|
||||
LD a3, AO1, 10 * SIZE
|
||||
MADD t4, a4, x1, y8
|
||||
LD a4, AO1, 11 * SIZE
|
||||
LD y7, YY, 14 * SIZE
|
||||
LD y8, YY, 15 * SIZE
|
||||
MADD t1, a5, x2, t1
|
||||
LD a5, AO2, 8 * SIZE
|
||||
MADD t2, a6, x2, t2
|
||||
LD a6, AO2, 9 * SIZE
|
||||
MADD t3, a7, x2, t3
|
||||
LD a7, AO2, 10 * SIZE
|
||||
MADD t4, a8, x2, t4
|
||||
LD a8, AO2, 11 * SIZE
|
||||
ST t1, YY, 4 * SIZE
|
||||
ST t2, YY, 5 * SIZE
|
||||
ST t3, YY, 6 * SIZE
|
||||
ST t4, YY, 7 * SIZE
|
||||
addi.d I, I, -1
|
||||
addi.d YY, YY, 8 * SIZE
|
||||
addi.d AO1, AO1, 8 * SIZE
|
||||
addi.d AO2, AO2, 8 * SIZE
|
||||
blt $r0, I, .L12
|
||||
.align 3
|
||||
|
||||
.L13:
|
||||
MADD t1, a1, x1, y1
|
||||
LD a1, AO1, 4 * SIZE
|
||||
MADD t2, a2, x1, y2
|
||||
LD a2, AO1, 5 * SIZE
|
||||
MADD t3, a3, x1, y3
|
||||
LD a3, AO1, 6 * SIZE
|
||||
MADD t4, a4, x1, y4
|
||||
LD a4, AO1, 7 * SIZE
|
||||
MADD t1, a5, x2, t1
|
||||
LD a5, AO2, 4 * SIZE
|
||||
MADD t2, a6, x2, t2
|
||||
LD a6, AO2, 5 * SIZE
|
||||
MADD t3, a7, x2, t3
|
||||
LD a7, AO2, 6 * SIZE
|
||||
MADD t4, a8, x2, t4
|
||||
LD a8, AO2, 7 * SIZE
|
||||
ST t1, YY, 0 * SIZE
|
||||
MADD t1, a1, x1, y5
|
||||
ST t2, YY, 1 * SIZE
|
||||
MADD t2, a2, x1, y6
|
||||
ST t3, YY, 2 * SIZE
|
||||
MADD t3, a3, x1, y7
|
||||
ST t4, YY, 3 * SIZE
|
||||
MADD t4, a4, x1, y8
|
||||
MADD t1, a5, x2, t1
|
||||
addi.d AO1, AO1, 8 * SIZE
|
||||
MADD t2, a6, x2, t2
|
||||
addi.d AO2, AO2, 8 * SIZE
|
||||
MADD t3, a7, x2, t3
|
||||
addi.d YY, YY, 8 * SIZE
|
||||
MADD t4, a8, x2, t4
|
||||
ST t1, YY, -4 * SIZE
|
||||
ST t2, YY, -3 * SIZE
|
||||
ST t3, YY, -2 * SIZE
|
||||
ST t4, YY, -1 * SIZE
|
||||
.align 3
|
||||
|
||||
.L15:
|
||||
andi I, M, 4
|
||||
bge $r0, I, .L16
|
||||
LD a1, AO1, 0 * SIZE
|
||||
LD y1, YY, 0 * SIZE
|
||||
LD a2, AO1, 1 * SIZE
|
||||
LD y2, YY, 1 * SIZE
|
||||
LD a3, AO1, 2 * SIZE
|
||||
LD y3, YY, 2 * SIZE
|
||||
LD a4, AO1, 3 * SIZE
|
||||
LD y4, YY, 3 * SIZE
|
||||
LD a5, AO2, 0 * SIZE
|
||||
MADD y1, a1, x1, y1
|
||||
LD a6, AO2, 1 * SIZE
|
||||
MADD y2, a2, x1, y2
|
||||
LD a7, AO2, 2 * SIZE
|
||||
MADD y3, a3, x1, y3
|
||||
LD a8, AO2, 3 * SIZE
|
||||
MADD y4, a4, x1, y4
|
||||
MADD y1, a5, x2, y1
|
||||
addi.d YY, YY, 4 * SIZE
|
||||
MADD y2, a6, x2, y2
|
||||
addi.d AO1, AO1, 4 * SIZE
|
||||
MADD y3, a7, x2, y3
|
||||
addi.d AO2, AO2, 4 * SIZE
|
||||
MADD y4, a8, x2, y4
|
||||
ST y1, YY, -4 * SIZE
|
||||
ST y2, YY, -3 * SIZE
|
||||
ST y3, YY, -2 * SIZE
|
||||
ST y4, YY, -1 * SIZE
|
||||
.align 3
|
||||
|
||||
.L16:
|
||||
andi I, M, 2
|
||||
bge $r0, I, .L17
|
||||
LD a1, AO1, 0 * SIZE
|
||||
LD y1, YY, 0 * SIZE
|
||||
LD a2, AO1, 1 * SIZE
|
||||
LD y2, YY, 1 * SIZE
|
||||
LD a5, AO2, 0 * SIZE
|
||||
LD a6, AO2, 1 * SIZE
|
||||
MADD y1, a1, x1, y1
|
||||
MADD y2, a2, x1, y2
|
||||
addi.d YY, YY, 2 * SIZE
|
||||
MADD y1, a5, x2, y1
|
||||
addi.d AO1, AO1, 2 * SIZE
|
||||
MADD y2, a6, x2, y2
|
||||
addi.d AO2, AO2, 2 * SIZE
|
||||
ST y1, YY, -2 * SIZE
|
||||
ST y2, YY, -1 * SIZE
|
||||
.align 3
|
||||
|
||||
.L17:
|
||||
andi I, M, 1
|
||||
bge $r0, I, .L19
|
||||
LD y1, YY, 0 * SIZE
|
||||
LD a1, AO1, 0 * SIZE
|
||||
LD a5, AO2, 0 * SIZE
|
||||
MADD y1, a1, x1, y1
|
||||
MADD y1, a5, x2, y1
|
||||
ST y1, YY, 0 * SIZE
|
||||
.align 3
|
||||
|
||||
.L19:
|
||||
addi.d J, J, -1
|
||||
blt $r0, J, .L11
|
||||
.align 3
|
||||
|
||||
.L20:
|
||||
andi J, N, 1
|
||||
bge $r0, J, .L900
|
||||
.align 3
|
||||
|
||||
.L21:
|
||||
LD x1, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
move YY, YORIG
|
||||
move AO1, A
|
||||
srai.d I, M, 3
|
||||
MUL x1, ALPHA, x1
|
||||
bge $r0, I, .L25
|
||||
LD a1, AO1, 0 * SIZE
|
||||
LD y1, YY, 0 * SIZE
|
||||
LD a2, AO1, 1 * SIZE
|
||||
LD y2, YY, 1 * SIZE
|
||||
LD a3, AO1, 2 * SIZE
|
||||
LD y3, YY, 2 * SIZE
|
||||
LD a4, AO1, 3 * SIZE
|
||||
LD y4, YY, 3 * SIZE
|
||||
LD y5, YY, 4 * SIZE
|
||||
LD y6, YY, 5 * SIZE
|
||||
LD y7, YY, 6 * SIZE
|
||||
addi.d I, I, -1
|
||||
LD y8, YY, 7 * SIZE
|
||||
bge $r0, I, .L23
|
||||
.align 3
|
||||
.L22:
|
||||
MADD t1, a1, x1, y1
|
||||
LD a1, AO1, 4 * SIZE
|
||||
MADD t2, a2, x1, y2
|
||||
LD a2, AO1, 5 * SIZE
|
||||
LD y1, YY, 8 * SIZE
|
||||
LD y2, YY, 9 * SIZE
|
||||
MADD t3, a3, x1, y3
|
||||
LD a3, AO1, 6 * SIZE
|
||||
MADD t4, a4, x1, y4
|
||||
LD a4, AO1, 7 * SIZE
|
||||
LD y3, YY, 10 * SIZE
|
||||
LD y4, YY, 11 * SIZE
|
||||
ST t1, YY, 0 * SIZE
|
||||
ST t2, YY, 1 * SIZE
|
||||
ST t3, YY, 2 * SIZE
|
||||
ST t4, YY, 3 * SIZE
|
||||
MADD t1, a1, x1, y5
|
||||
LD a1, AO1, 8 * SIZE
|
||||
MADD t2, a2, x1, y6
|
||||
LD a2, AO1, 9 * SIZE
|
||||
LD y5, YY, 12 * SIZE
|
||||
LD y6, YY, 13 * SIZE
|
||||
MADD t3, a3, x1, y7
|
||||
LD a3, AO1, 10 * SIZE
|
||||
MADD t4, a4, x1, y8
|
||||
LD a4, AO1, 11 * SIZE
|
||||
LD y7, YY, 14 * SIZE
|
||||
LD y8, YY, 15 * SIZE
|
||||
ST t1, YY, 4 * SIZE
|
||||
ST t2, YY, 5 * SIZE
|
||||
ST t3, YY, 6 * SIZE
|
||||
ST t4, YY, 7 * SIZE
|
||||
addi.d I, I, -1
|
||||
addi.d YY, YY, 8 * SIZE
|
||||
addi.d AO1, AO1, 8 * SIZE
|
||||
blt $r0, I, .L22
|
||||
.align 3
|
||||
|
||||
.L23:
|
||||
MADD t1, a1, x1, y1
|
||||
LD a1, AO1, 4 * SIZE
|
||||
MADD t2, a2, x1, y2
|
||||
LD a2, AO1, 5 * SIZE
|
||||
MADD t3, a3, x1, y3
|
||||
LD a3, AO1, 6 * SIZE
|
||||
MADD t4, a4, x1, y4
|
||||
LD a4, AO1, 7 * SIZE
|
||||
ST t1, YY, 0 * SIZE
|
||||
MADD t1, a1, x1, y5
|
||||
ST t2, YY, 1 * SIZE
|
||||
MADD t2, a2, x1, y6
|
||||
ST t3, YY, 2 * SIZE
|
||||
MADD t3, a3, x1, y7
|
||||
ST t4, YY, 3 * SIZE
|
||||
MADD t4, a4, x1, y8
|
||||
ST t1, YY, 4 * SIZE
|
||||
ST t2, YY, 5 * SIZE
|
||||
ST t3, YY, 6 * SIZE
|
||||
ST t4, YY, 7 * SIZE
|
||||
addi.d AO1, AO1, 8 * SIZE
|
||||
addi.d YY, YY, 8 * SIZE
|
||||
.align 3
|
||||
|
||||
.L25:
|
||||
andi I, M, 4
|
||||
bge $r0, I, .L26
|
||||
LD a1, AO1, 0 * SIZE
|
||||
LD y1, YY, 0 * SIZE
|
||||
LD a2, AO1, 1 * SIZE
|
||||
LD y2, YY, 1 * SIZE
|
||||
LD a3, AO1, 2 * SIZE
|
||||
LD y3, YY, 2 * SIZE
|
||||
LD a4, AO1, 3 * SIZE
|
||||
LD y4, YY, 3 * SIZE
|
||||
MADD y1, a1, x1, y1
|
||||
MADD y2, a2, x1, y2
|
||||
MADD y3, a3, x1, y3
|
||||
addi.d YY, YY, 4 * SIZE
|
||||
MADD y4, a4, x1, y4
|
||||
addi.d AO1, AO1, 4 * SIZE
|
||||
ST y1, YY, -4 * SIZE
|
||||
ST y2, YY, -3 * SIZE
|
||||
ST y3, YY, -2 * SIZE
|
||||
ST y4, YY, -1 * SIZE
|
||||
.align 3
|
||||
|
||||
.L26:
|
||||
andi I, M, 2
|
||||
bge $r0, I, .L27
|
||||
LD a1, AO1, 0 * SIZE
|
||||
LD y1, YY, 0 * SIZE
|
||||
LD a2, AO1, 1 * SIZE
|
||||
LD y2, YY, 1 * SIZE
|
||||
MADD y1, a1, x1, y1
|
||||
addi.d YY, YY, 2 * SIZE
|
||||
MADD y2, a2, x1, y2
|
||||
addi.d AO1, AO1, 2 * SIZE
|
||||
ST y1, YY, -2 * SIZE
|
||||
ST y2, YY, -1 * SIZE
|
||||
.align 3
|
||||
|
||||
.L27:
|
||||
andi I, M, 1
|
||||
bge $r0, I, .L900
|
||||
LD y1, YY, 0 * SIZE
|
||||
LD a1, AO1, 0 * SIZE
|
||||
MADD y1, a1, x1, y1
|
||||
ST y1, YY, 0 * SIZE
|
||||
.align 3
|
||||
|
||||
.L900:
|
||||
li YORIG, SIZE
|
||||
srai.d I, M, 2
|
||||
beq INCY, YORIG, .L999
|
||||
move XX, BUFFER
|
||||
bge $r0, I, .L905
|
||||
.align 3
|
||||
|
||||
.L902:
|
||||
LD a1, XX, 0 * SIZE
|
||||
LD a2, XX, 1 * SIZE
|
||||
LD a3, XX, 2 * SIZE
|
||||
LD a4, XX, 3 * SIZE
|
||||
ST a1, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ST a2, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ST a3, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
ST a4, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
addi.d I, I, -1
|
||||
addi.d XX, XX, 4 * SIZE
|
||||
blt $r0, I, .L902
|
||||
.align 3
|
||||
|
||||
.L905:
|
||||
andi I, M, 3
|
||||
bge $r0, I, .L999
|
||||
.align 3
|
||||
|
||||
.L906:
|
||||
LD a1, XX, 0 * SIZE
|
||||
addi.d XX, XX, 1 * SIZE
|
||||
ST a1, Y, 0 * SIZE
|
||||
addi.d I, I, -1
|
||||
add.d Y, Y, INCY
|
||||
blt $r0, I, .L906
|
||||
.align 3
|
||||
|
||||
.L999:
|
||||
LDARG $r23, $sp, 0
|
||||
LDARG $r24, $sp, 8
|
||||
#ifndef __64BIT__
|
||||
fld.d $f18, $sp, 16
|
||||
fld.d $f19, $sp, 24
|
||||
fld.d $f20, $sp, 32
|
||||
#endif
|
||||
#ifdef __64BIT__
|
||||
addi.d $sp, $sp, 16
|
||||
#else
|
||||
addi.d $sp, $sp, 48
|
||||
#endif
|
||||
move $r4, $r17
|
||||
fmov.d $f0, $f22
|
||||
jirl $r0, $r1, 0x0
|
||||
|
||||
EPILOGUE
|
||||
436
kernel/loongarch64/gemv_t.S
Normal file
436
kernel/loongarch64/gemv_t.S
Normal file
@@ -0,0 +1,436 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2021, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/* Register aliases used by the gemv_t kernel that follows. */
/* Unused param dummy1 */
|
||||
/* Integer registers that the code below reads as inputs without loading them. */
#define M $r4
|
||||
#define N $r5
|
||||
#define A $r7
|
||||
#define LDA $r8
|
||||
#define X $r9
|
||||
#define INCX $r10
|
||||
#define Y $r11
|
||||
/* INCY and BUFFER are loaded from the stack at the start of the routine. */
#define INCY $r6
|
||||
#define BUFFER $r16
|
||||
/* Scratch pointers (XORIG, XX, YY) and loop counters (I, J). */
#define XORIG $r18
|
||||
#define XX $r12
|
||||
#define YY $r13
|
||||
#define I $r14
|
||||
#define J $r15
|
||||
/* Column pointers into A; $r23/$r24 are saved to the stack by the routine before use. */
#define AO1 $r23
|
||||
#define AO2 $r24
|
||||
/* Floating-point input scalar. */
#define ALPHA $f0
|
||||
/* a1-a8: values loaded from A (a1/a2 are also reused for x and y elements). */
#define a1 $f22
|
||||
#define a2 $f8
|
||||
#define a3 $f23
|
||||
#define a4 $f9
|
||||
#define a5 $f10
|
||||
#define a6 $f11
|
||||
#define a7 $f12
|
||||
#define a8 $f13
|
||||
/* y1-y4: partial dot-product accumulators. */
#define y1 $f14
|
||||
#define y2 $f15
|
||||
#define y3 $f16
|
||||
#define y4 $f17
|
||||
/* x1-x8: values loaded from the (possibly packed) x vector. */
#define x1 $f3
|
||||
#define x2 $f1
|
||||
#define x3 $f2
|
||||
#define x4 $f4
|
||||
#define x5 $f5
|
||||
#define x6 $f6
|
||||
#define x7 $f7
|
||||
#define x8 $f18
|
||||
|
||||
/*
 * Entry of the gemv_t kernel: sets up the frame, scales strides, and packs
 * x into BUFFER when x is not unit-stride.
 * PROLOGUE, LDARG, SDARG, MTC, LD, ST, SIZE and BASE_SHIFT come from
 * common.h (not shown here); comments assume LDARG/SDARG are pointer-width
 * load/store, LD/ST are FP load/store, and BASE_SHIFT is log2(SIZE) -
 * TODO confirm against common.h.
 */
PROLOGUE
|
||||
|
||||
/* Read INCY and BUFFER from the incoming stack area before $sp is moved. */
LDARG INCY, $sp, 0
|
||||
LDARG BUFFER, $sp, 8
|
||||
/* Allocate the local frame: 16 bytes with __64BIT__, otherwise 32. */
#ifdef __64BIT__
|
||||
addi.d $sp, $sp, -16
|
||||
#else
|
||||
addi.d $sp, $sp, -32
|
||||
#endif
|
||||
/* Presumably clears accumulator y1 (moves integer zero into the FP register). */
MTC y1, $r0
|
||||
/* Save $r23/$r24, which are used as AO1/AO2 below. */
SDARG $r23, $sp, 0
|
||||
SDARG $r24, $sp, 8
|
||||
/* Scale LDA by BASE_SHIFT (presumably elements -> bytes). */
slli.d LDA, LDA, BASE_SHIFT
|
||||
#ifndef __64BIT__
|
||||
fst.d $f18, $sp, 16
|
||||
#endif
|
||||
slli.d INCX, INCX, BASE_SHIFT
|
||||
/* Nothing to do when M <= 0. */
bge $r0, M, .L999
|
||||
slli.d INCY, INCY, BASE_SHIFT
|
||||
/* Nothing to do when N <= 0. */
bge $r0, N, .L999
|
||||
/* If the scaled x stride equals SIZE, x is used in place (XORIG = X). */
li I, SIZE
|
||||
move XORIG, X
|
||||
beq INCX, I, .L10
|
||||
/* Otherwise copy M strided x elements into BUFFER, four per iteration. */
srai.d I, M, 2
|
||||
move XORIG, BUFFER
|
||||
move YY, BUFFER
|
||||
bge $r0, I, .L05
|
||||
.align 3
|
||||
|
||||
/* Packing loop: 4 elements of x per pass; YY is the write pointer into BUFFER. */
.L02:
|
||||
LD a1, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a2, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a3, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a4, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ST a1, YY, 0 * SIZE
|
||||
ST a2, YY, 1 * SIZE
|
||||
ST a3, YY, 2 * SIZE
|
||||
ST a4, YY, 3 * SIZE
|
||||
addi.d I, I, -1
|
||||
addi.d YY, YY, 4 * SIZE
|
||||
blt $r0, I, .L02
|
||||
.align 3
|
||||
|
||||
/* Remaining M & 3 elements of x. */
.L05:
|
||||
andi I, M, 3
|
||||
bge $r0, I, .L10
|
||||
.align 3
|
||||
|
||||
/* Packing loop: one element of x per pass. */
.L06:
|
||||
LD a1, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
ST a1, YY, 0 * SIZE
|
||||
addi.d I, I, -1
|
||||
addi.d YY, YY, 1 * SIZE
|
||||
blt $r0, I, .L06
|
||||
.align 3
|
||||
|
||||
/*
 * Two columns of A per outer iteration, eight rows per inner iteration.
 * MADD/MOV/LD are macros from common.h (not shown); comments assume
 * MADD d, a, b, c computes d = a * b + c - TODO confirm.
 * Accumulators: y1/y3 take rows 0,2,.. / 1,3,.. of column AO1,
 *               y2/y4 take rows 0,2,.. / 1,3,.. of column AO2.
 */
.L10:
|
||||
/* J = number of column pairs; YY becomes the y write pointer (Y stays the read pointer). */
srai.d J, N, 1
|
||||
move YY, Y
|
||||
bge $r0, J, .L20
|
||||
.align 3
|
||||
|
||||
/* Start of one column pair. y1 is expected to be zero here, so y2-y4 are cleared from it. */
.L11:
|
||||
move AO1, A
|
||||
MOV y2, y1
|
||||
add.d AO2, A, LDA
|
||||
MOV y3, y1
|
||||
/* Advance A by two columns for the next outer iteration. */
add.d A, AO2, LDA
|
||||
MOV y4, y1
|
||||
/* I = number of 8-row blocks; XX restarts at the beginning of x. */
srai.d I, M, 3
|
||||
move XX, XORIG
|
||||
bge $r0, I, .L15
|
||||
/* Prime the pipeline: rows 0-3 of both columns and x[0..7]. */
LD a1, AO1, 0 * SIZE
|
||||
LD x1, XX, 0 * SIZE
|
||||
LD a2, AO2, 0 * SIZE
|
||||
LD x2, XX, 1 * SIZE
|
||||
LD a3, AO1, 1 * SIZE
|
||||
LD x3, XX, 2 * SIZE
|
||||
LD a4, AO2, 1 * SIZE
|
||||
LD x4, XX, 3 * SIZE
|
||||
LD a5, AO1, 2 * SIZE
|
||||
LD x5, XX, 4 * SIZE
|
||||
LD a6, AO2, 2 * SIZE
|
||||
LD x6, XX, 5 * SIZE
|
||||
LD a7, AO1, 3 * SIZE
|
||||
LD x7, XX, 6 * SIZE
|
||||
LD a8, AO2, 3 * SIZE
|
||||
addi.d I, I, -1
|
||||
LD x8, XX, 7 * SIZE
|
||||
/* Only one 8-row block: go straight to the drain code. */
bge $r0, I, .L13
|
||||
.align 3
|
||||
/* Steady-state loop: consume 8 rows while loading rows 4-7 of this block and rows 0-3 plus x[8..15] of the next. */
.L12:
|
||||
/* Rows 0-1; reload a1-a4 with rows 4-5. */
MADD y1, a1, x1, y1
|
||||
LD a1, AO1, 4 * SIZE
|
||||
MADD y2, a2, x1, y2
|
||||
LD a2, AO2, 4 * SIZE
|
||||
MADD y3, a3, x2, y3
|
||||
LD a3, AO1, 5 * SIZE
|
||||
MADD y4, a4, x2, y4
|
||||
LD a4, AO2, 5 * SIZE
|
||||
LD x1, XX, 8 * SIZE
|
||||
LD x2, XX, 9 * SIZE
|
||||
/* Rows 2-3; reload a5-a8 with rows 6-7. */
MADD y1, a5, x3, y1
|
||||
LD a5, AO1, 6 * SIZE
|
||||
MADD y2, a6, x3, y2
|
||||
LD a6, AO2, 6 * SIZE
|
||||
MADD y3, a7, x4, y3
|
||||
LD a7, AO1, 7 * SIZE
|
||||
MADD y4, a8, x4, y4
|
||||
LD a8, AO2, 7 * SIZE
|
||||
LD x3, XX, 10 * SIZE
|
||||
LD x4, XX, 11 * SIZE
|
||||
/* Rows 4-5; reload a1-a4 with rows 0-1 of the next block. */
MADD y1, a1, x5, y1
|
||||
LD a1, AO1, 8 * SIZE
|
||||
MADD y2, a2, x5, y2
|
||||
LD a2, AO2, 8 * SIZE
|
||||
MADD y3, a3, x6, y3
|
||||
LD a3, AO1, 9 * SIZE
|
||||
MADD y4, a4, x6, y4
|
||||
LD a4, AO2, 9 * SIZE
|
||||
LD x5, XX, 12 * SIZE
|
||||
LD x6, XX, 13 * SIZE
|
||||
/* Rows 6-7; reload a5-a8 with rows 2-3 of the next block. */
MADD y1, a5, x7, y1
|
||||
LD a5, AO1, 10 * SIZE
|
||||
MADD y2, a6, x7, y2
|
||||
LD a6, AO2, 10 * SIZE
|
||||
MADD y3, a7, x8, y3
|
||||
LD a7, AO1, 11 * SIZE
|
||||
MADD y4, a8, x8, y4
|
||||
LD a8, AO2, 11 * SIZE
|
||||
LD x7, XX, 14 * SIZE
|
||||
LD x8, XX, 15 * SIZE
|
||||
addi.d I, I, -1
|
||||
addi.d XX, XX, 8 * SIZE
|
||||
addi.d AO1, AO1, 8 * SIZE
|
||||
addi.d AO2, AO2, 8 * SIZE
|
||||
blt $r0, I, .L12
|
||||
.align 3
|
||||
|
||||
/* Drain: last 8-row block. Loads rows 4-7 only; x1-x8 are already in registers. */
.L13:
|
||||
MADD y1, a1, x1, y1
|
||||
LD a1, AO1, 4 * SIZE
|
||||
MADD y2, a2, x1, y2
|
||||
LD a2, AO2, 4 * SIZE
|
||||
MADD y3, a3, x2, y3
|
||||
LD a3, AO1, 5 * SIZE
|
||||
MADD y4, a4, x2, y4
|
||||
LD a4, AO2, 5 * SIZE
|
||||
MADD y1, a5, x3, y1
|
||||
LD a5, AO1, 6 * SIZE
|
||||
MADD y2, a6, x3, y2
|
||||
LD a6, AO2, 6 * SIZE
|
||||
MADD y3, a7, x4, y3
|
||||
LD a7, AO1, 7 * SIZE
|
||||
MADD y4, a8, x4, y4
|
||||
LD a8, AO2, 7 * SIZE
|
||||
MADD y1, a1, x5, y1
|
||||
MADD y2, a2, x5, y2
|
||||
MADD y3, a3, x6, y3
|
||||
MADD y4, a4, x6, y4
|
||||
MADD y1, a5, x7, y1
|
||||
/* Pointer bumps are interleaved with the final multiply-adds. */
addi.d XX, XX, 8 * SIZE
|
||||
MADD y2, a6, x7, y2
|
||||
addi.d AO1, AO1, 8 * SIZE
|
||||
MADD y3, a7, x8, y3
|
||||
addi.d AO2, AO2, 8 * SIZE
|
||||
MADD y4, a8, x8, y4
|
||||
.align 3
|
||||
|
||||
/*
 * Row remainders for the current column pair, then the update of two y elements.
 * MADD/ADD/MTC/LD/ST are macros from common.h (not shown); comments assume
 * MADD d, a, b, c computes d = a * b + c - TODO confirm.
 */
.L15:
|
||||
/* Handle a block of 4 rows when bit 2 of M is set. */
andi I, M, 4
|
||||
bge $r0, I, .L17
|
||||
LD a1, AO1, 0 * SIZE
|
||||
LD x1, XX, 0 * SIZE
|
||||
LD a2, AO2, 0 * SIZE
|
||||
LD a3, AO1, 1 * SIZE
|
||||
LD x2, XX, 1 * SIZE
|
||||
LD a4, AO2, 1 * SIZE
|
||||
LD a5, AO1, 2 * SIZE
|
||||
LD x3, XX, 2 * SIZE
|
||||
MADD y1, a1, x1, y1
|
||||
LD a6, AO2, 2 * SIZE
|
||||
MADD y2, a2, x1, y2
|
||||
LD a7, AO1, 3 * SIZE
|
||||
MADD y3, a3, x2, y3
|
||||
LD x4, XX, 3 * SIZE
|
||||
MADD y4, a4, x2, y4
|
||||
LD a8, AO2, 3 * SIZE
|
||||
MADD y1, a5, x3, y1
|
||||
MADD y2, a6, x3, y2
|
||||
addi.d XX, XX, 4 * SIZE
|
||||
MADD y3, a7, x4, y3
|
||||
addi.d AO1, AO1, 4 * SIZE
|
||||
MADD y4, a8, x4, y4
|
||||
addi.d AO2, AO2, 4 * SIZE
|
||||
.align 3
|
||||
|
||||
/* Fold the partial sums (y1 += y3, y2 += y4) and set up the last M & 3 rows. */
.L17:
|
||||
andi I, M, 3
|
||||
ADD y1, y1, y3
|
||||
ADD y2, y2, y4
|
||||
bge $r0, I, .L19
|
||||
.align 3
/* One row per pass for both columns. */
.L18:
|
||||
LD x1, XX, 0 * SIZE
|
||||
LD a1, AO1, 0 * SIZE
|
||||
LD a2, AO2, 0 * SIZE
|
||||
addi.d I, I, -1
|
||||
addi.d XX, XX, 1 * SIZE
|
||||
addi.d AO1, AO1, 1 * SIZE
|
||||
addi.d AO2, AO2, 1 * SIZE
|
||||
MADD y1, a1, x1, y1
|
||||
MADD y2, a2, x1, y2
|
||||
blt $r0, I, .L18
|
||||
.align 3
|
||||
|
||||
/* Update two y elements: read through Y, add ALPHA * dot product, write through YY. */
.L19:
|
||||
LD a1, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
LD a2, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
MADD a1, y1, ALPHA, a1
|
||||
addi.d J, J, -1
|
||||
MADD a2, y2, ALPHA, a2
|
||||
/* Presumably clears y1 again so the next column pair (or .L20) starts from zero. */
MTC y1, $r0
|
||||
ST a1, YY, 0 * SIZE
|
||||
add.d YY, YY, INCY
|
||||
ST a2, YY, 0 * SIZE
|
||||
add.d YY, YY, INCY
|
||||
/* Next column pair while J > 0. */
blt $r0, J, .L11
|
||||
.align 3
|
||||
|
||||
/*
 * Last column when N is odd, eight rows per inner iteration.
 * y1 accumulates rows 0,2,4,6 and y3 rows 1,3,5,7 of each block.
 * MADD/MOV/LD are macros from common.h (not shown); comments assume
 * MADD d, a, b, c computes d = a * b + c - TODO confirm.
 */
.L20:
|
||||
andi J, N, 1
|
||||
/* y1 is expected to be zero here, so y3 is cleared from it. */
MOV y3, y1
|
||||
move AO1, A
|
||||
/* No odd column left: return. */
bge $r0, J, .L999
|
||||
srai.d I, M, 3
|
||||
move XX, XORIG
|
||||
bge $r0, I, .L25
|
||||
/* Prime the pipeline: rows 0-3 of the column and x[0..7]. */
LD a1, AO1, 0 * SIZE
|
||||
LD x1, XX, 0 * SIZE
|
||||
LD a3, AO1, 1 * SIZE
|
||||
LD x2, XX, 1 * SIZE
|
||||
LD a5, AO1, 2 * SIZE
|
||||
LD x3, XX, 2 * SIZE
|
||||
LD a7, AO1, 3 * SIZE
|
||||
LD x4, XX, 3 * SIZE
|
||||
LD x5, XX, 4 * SIZE
|
||||
LD x6, XX, 5 * SIZE
|
||||
LD x7, XX, 6 * SIZE
|
||||
addi.d I, I, -1
|
||||
LD x8, XX, 7 * SIZE
|
||||
/* Only one 8-row block: go straight to the drain code. */
bge $r0, I, .L23
|
||||
.align 3
/* Steady-state loop: consume 8 rows while loading the data for the following ones. */
.L22:
|
||||
/* Rows 0-1; reload a1/a3 with rows 4-5. */
MADD y1, a1, x1, y1
|
||||
LD a1, AO1, 4 * SIZE
|
||||
MADD y3, a3, x2, y3
|
||||
LD a3, AO1, 5 * SIZE
|
||||
LD x1, XX, 8 * SIZE
|
||||
LD x2, XX, 9 * SIZE
|
||||
/* Rows 2-3; reload a5/a7 with rows 6-7. */
MADD y1, a5, x3, y1
|
||||
LD a5, AO1, 6 * SIZE
|
||||
MADD y3, a7, x4, y3
|
||||
LD a7, AO1, 7 * SIZE
|
||||
LD x3, XX, 10 * SIZE
|
||||
LD x4, XX, 11 * SIZE
|
||||
/* Rows 4-5; reload a1/a3 with rows 0-1 of the next block. */
MADD y1, a1, x5, y1
|
||||
LD a1, AO1, 8 * SIZE
|
||||
MADD y3, a3, x6, y3
|
||||
LD a3, AO1, 9 * SIZE
|
||||
LD x5, XX, 12 * SIZE
|
||||
LD x6, XX, 13 * SIZE
|
||||
/* Rows 6-7; reload a5/a7 with rows 2-3 of the next block. */
MADD y1, a5, x7, y1
|
||||
LD a5, AO1, 10 * SIZE
|
||||
MADD y3, a7, x8, y3
|
||||
LD a7, AO1, 11 * SIZE
|
||||
LD x7, XX, 14 * SIZE
|
||||
LD x8, XX, 15 * SIZE
|
||||
addi.d I, I, -1
|
||||
addi.d XX, XX, 8 * SIZE
|
||||
addi.d AO1, AO1, 8 * SIZE
|
||||
blt $r0, I, .L22
|
||||
.align 3
|
||||
|
||||
/* Drain: last 8-row block. Loads rows 4-7 only; x1-x8 are already in registers. */
.L23:
|
||||
MADD y1, a1, x1, y1
|
||||
LD a1, AO1, 4 * SIZE
|
||||
MADD y3, a3, x2, y3
|
||||
LD a3, AO1, 5 * SIZE
|
||||
MADD y1, a5, x3, y1
|
||||
LD a5, AO1, 6 * SIZE
|
||||
MADD y3, a7, x4, y3
|
||||
LD a7, AO1, 7 * SIZE
|
||||
MADD y1, a1, x5, y1
|
||||
MADD y3, a3, x6, y3
|
||||
MADD y1, a5, x7, y1
|
||||
MADD y3, a7, x8, y3
|
||||
addi.d XX, XX, 8 * SIZE
|
||||
addi.d AO1, AO1, 8 * SIZE
|
||||
.align 3
|
||||
|
||||
/*
 * Row remainders for the single last column, then the update of one y element.
 * MADD/ADD/LD/ST are macros from common.h (not shown); comments assume
 * MADD d, a, b, c computes d = a * b + c - TODO confirm.
 */
.L25:
|
||||
/* Handle a block of 4 rows when bit 2 of M is set. */
andi I, M, 4
|
||||
bge $r0, I, .L27
|
||||
LD a1, AO1, 0 * SIZE
|
||||
LD x1, XX, 0 * SIZE
|
||||
LD a3, AO1, 1 * SIZE
|
||||
LD x2, XX, 1 * SIZE
|
||||
LD a5, AO1, 2 * SIZE
|
||||
LD x3, XX, 2 * SIZE
|
||||
MADD y1, a1, x1, y1
|
||||
LD a7, AO1, 3 * SIZE
|
||||
MADD y3, a3, x2, y3
|
||||
LD x4, XX, 3 * SIZE
|
||||
MADD y1, a5, x3, y1
|
||||
addi.d XX, XX, 4 * SIZE
|
||||
MADD y3, a7, x4, y3
|
||||
addi.d AO1, AO1, 4 * SIZE
|
||||
.align 3
|
||||
|
||||
/* Fold the partial sums (y1 += y3) and set up the last M & 3 rows. */
.L27:
|
||||
andi I, M, 3
|
||||
ADD y1, y1, y3
|
||||
bge $r0, I, .L29
|
||||
.align 3
/* One row per pass. */
.L28:
|
||||
LD x1, XX, 0 * SIZE
|
||||
LD a1, AO1, 0 * SIZE
|
||||
addi.d I, I, -1
|
||||
addi.d XX, XX, 1 * SIZE
|
||||
addi.d AO1, AO1, 1 * SIZE
|
||||
MADD y1, a1, x1, y1
|
||||
blt $r0, I, .L28
|
||||
.align 3
|
||||
|
||||
/* Update the last y element: read through Y, add ALPHA * dot product, write through YY. */
.L29:
|
||||
LD a1, Y, 0 * SIZE
|
||||
add.d Y, Y, INCY
|
||||
MADD a1, y1, ALPHA, a1
|
||||
ST a1, YY, 0 * SIZE
|
||||
add.d YY, YY, INCY
|
||||
.align 3
|
||||
|
||||
/*
 * Common exit: restore the registers saved at entry, release the frame and return.
 * LDARG and EPILOGUE are macros from common.h (not shown).
 */
.L999:
|
||||
/* Restore $r23/$r24 (AO1/AO2) from the frame. */
LDARG $r23, $sp, 0
|
||||
LDARG $r24, $sp, 8
|
||||
#ifndef __64BIT__
|
||||
fld.d $f18, $sp, 16
|
||||
#endif
|
||||
/* Release the frame; the sizes mirror the allocation at entry (16 or 32 bytes). */
#ifdef __64BIT__
|
||||
addi.d $sp, $sp, 16
|
||||
#else
|
||||
addi.d $sp, $sp, 32
|
||||
#endif
|
||||
/* NOTE(review): $r17 is never written by this kernel, so the value copied into $r4 looks meaningless here; presumably shared boilerplate - confirm. */
move $r4, $r17
|
||||
fmov.d $f0, $f22
|
||||
/* Return: jump to the address in $r1 without linking. */
jirl $r0, $r1, 0x0
|
||||
|
||||
EPILOGUE
|
||||
233
kernel/loongarch64/iamax.S
Normal file
233
kernel/loongarch64/iamax.S
Normal file
@@ -0,0 +1,233 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2021, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/* Register aliases used by the iamax kernel that follows. */
/* Integer registers read as inputs by the code below. */
#define N $r4
|
||||
#define X $r5
|
||||
#define INCX $r6
|
||||
/* I: loop counter. TEMP: running 1-based element index. */
#define I $r18
|
||||
#define TEMP $r7
|
||||
/* a1-a8: elements loaded from x. */
#define a1 $f10
|
||||
#define a2 $f11
|
||||
#define a3 $f12
|
||||
#define a4 $f13
|
||||
#define a5 $f14
|
||||
#define a6 $f15
|
||||
#define a7 $f16
|
||||
#define a8 $f17
|
||||
/* t1-t4: absolute values of the elements currently being compared. */
#define t1 $f0
|
||||
#define t2 $f1
|
||||
#define t3 $f2
|
||||
#define t4 $f3
|
||||
/* s1-s4: best absolute value found so far in each of four lanes. */
#define s1 $f22
|
||||
#define s2 $f8
|
||||
#define s3 $f23
|
||||
#define s4 $f9
|
||||
/* x1-x4: index associated with s1-s4; x1 ($r17) is copied to $r4 at exit. */
#define x1 $r17
|
||||
#define x2 $r8
|
||||
#define x3 $r9
|
||||
#define x4 $r10
|
||||
|
||||
/*
 * Entry of the iamax kernel: argument checks, lane initialisation and
 * pipeline priming.
 * PROLOGUE, LDINT, LD, FABS, SIZE and BASE_SHIFT come from common.h
 * (not shown); comments assume FABS d, s computes d = |s| and BASE_SHIFT
 * is log2(SIZE) - TODO confirm.
 */
PROLOGUE
|
||||
|
||||
/* With F_INTERFACE, N and INCX are passed by reference and loaded here. */
#ifdef F_INTERFACE
|
||||
LDINT N, 0(N)
|
||||
LDINT INCX, 0(INCX)
|
||||
#endif
|
||||
|
||||
/* Result index 0 when N <= 0 or INCX <= 0. */
li x1, 0
|
||||
bge $r0, N, .L999
|
||||
slli.d INCX, INCX, BASE_SHIFT
|
||||
bge $r0, INCX, .L999
|
||||
/* First element is the initial candidate (index 1); N now counts the remaining elements. */
LD a1, X, 0 * SIZE
|
||||
addi.d N, N, -1
|
||||
li x1, 1
|
||||
bge $r0, N, .L999
|
||||
/* Seed all four lanes with |x[0]| and index 1; TEMP = 2 is the index of the next element. */
FABS s1, a1
|
||||
add.d X, X, INCX
|
||||
FABS s2, a1
|
||||
li x2, 1
|
||||
FABS s3, a1
|
||||
/* I = number of 8-element blocks among the remaining elements. */
srai.d I, N, 3
|
||||
FABS s4, a1
|
||||
li x3, 1
|
||||
li TEMP, 2
|
||||
li x4, 1
|
||||
bge $r0, I, .L15
|
||||
/* Prime the pipeline with the first block of eight elements. */
LD a1, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a2, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a3, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a4, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a5, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a6, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a7, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a8, X, 0 * SIZE
|
||||
addi.d I, I, -1
|
||||
add.d X, X, INCX
|
||||
/* Only one block: go straight to the drain code. */
bge $r0, I, .L13
|
||||
.align 3
|
||||
|
||||
/*
 * Steady-state loop: compares the eight elements already in a1-a8 against the
 * four lane maxima while loading the next eight.
 * CMPLT/CMOVT/MOVT are macros from common.h (not shown); comments assume
 * CMPLT cc, a, b sets cc when a < b, CMOVT d, d, t, cc replaces d with t when
 * cc is set, and MOVT(d, s, cc) copies s to d when cc is set - TODO confirm.
 * Every lane records the same TEMP (index of the lane-1 element of the group);
 * the per-lane offsets 1/2/3 are added once after the loop, in .L13.
 */
.L12:
|
||||
/* First half: |a1..a4| versus lanes 1-4, interleaved with reloading a1-a4. */
FABS t1, a1
|
||||
LD a1, X, 0 * SIZE
|
||||
FABS t2, a2
|
||||
add.d X, X, INCX
|
||||
FABS t3, a3
|
||||
LD a2, X, 0 * SIZE
|
||||
FABS t4, a4
|
||||
add.d X, X, INCX
|
||||
CMPLT $fcc0, s1, t1
|
||||
LD a3, X, 0 * SIZE
|
||||
CMPLT $fcc1, s2, t2
|
||||
add.d X, X, INCX
|
||||
CMPLT $fcc2, s3, t3
|
||||
LD a4, X, 0 * SIZE
|
||||
CMPLT $fcc3, s4, t4
|
||||
add.d X, X, INCX
|
||||
CMOVT s1, s1, t1, $fcc0
|
||||
MOVT(x1, TEMP, $fcc0)
|
||||
CMOVT s2, s2, t2, $fcc1
|
||||
MOVT(x2, TEMP, $fcc1)
|
||||
CMOVT s3, s3, t3, $fcc2
|
||||
MOVT(x3, TEMP, $fcc2)
|
||||
CMOVT s4, s4, t4, $fcc3
|
||||
MOVT(x4, TEMP, $fcc3)
|
||||
/* Four elements consumed. */
addi.d TEMP, TEMP, 4
|
||||
addi.d I, I, -1
|
||||
/* Second half: |a5..a8| versus lanes 1-4, interleaved with reloading a5-a8. */
FABS t1, a5
|
||||
LD a5, X, 0 * SIZE
|
||||
FABS t2, a6
|
||||
add.d X, X, INCX
|
||||
FABS t3, a7
|
||||
LD a6, X, 0 * SIZE
|
||||
FABS t4, a8
|
||||
add.d X, X, INCX
|
||||
CMPLT $fcc0, s1, t1
|
||||
LD a7, X, 0 * SIZE
|
||||
CMPLT $fcc1, s2, t2
|
||||
add.d X, X, INCX
|
||||
CMPLT $fcc2, s3, t3
|
||||
LD a8, X, 0 * SIZE
|
||||
CMPLT $fcc3, s4, t4
|
||||
add.d X, X, INCX
|
||||
CMOVT s1, s1, t1, $fcc0
|
||||
MOVT(x1, TEMP, $fcc0)
|
||||
CMOVT s2, s2, t2, $fcc1
|
||||
MOVT(x2, TEMP, $fcc1)
|
||||
CMOVT s3, s3, t3, $fcc2
|
||||
MOVT(x3, TEMP, $fcc2)
|
||||
CMOVT s4, s4, t4, $fcc3
|
||||
MOVT(x4, TEMP, $fcc3)
|
||||
addi.d TEMP, TEMP, 4
|
||||
blt $r0, I, .L12
|
||||
.align 3
|
||||
|
||||
/*
 * Drain: compare the last eight preloaded elements (no further loads), then
 * convert the group indices recorded in x2-x4 into element indices.
 * CMPLT/CMOVT/MOVT/FABS are macros from common.h (not shown); comments assume
 * CMPLT cc, a, b sets cc when a < b and CMOVT/MOVT copy their source when cc
 * is set - TODO confirm.
 */
.L13:
|
||||
/* First half: |a1..a4| versus lanes 1-4. */
FABS t1, a1
|
||||
FABS t2, a2
|
||||
FABS t3, a3
|
||||
FABS t4, a4
|
||||
CMPLT $fcc0, s1, t1
|
||||
CMPLT $fcc1, s2, t2
|
||||
CMPLT $fcc2, s3, t3
|
||||
CMPLT $fcc3, s4, t4
|
||||
CMOVT s1, s1, t1, $fcc0
|
||||
MOVT(x1, TEMP, $fcc0)
|
||||
CMOVT s2, s2, t2, $fcc1
|
||||
MOVT(x2, TEMP, $fcc1)
|
||||
CMOVT s3, s3, t3, $fcc2
|
||||
MOVT(x3, TEMP, $fcc2)
|
||||
CMOVT s4, s4, t4, $fcc3
|
||||
MOVT(x4, TEMP, $fcc3)
|
||||
/* Second half: |a5..a8| versus lanes 1-4. */
FABS t1, a5
|
||||
addi.d TEMP, TEMP, 4
|
||||
FABS t2, a6
|
||||
FABS t3, a7
|
||||
FABS t4, a8
|
||||
CMPLT $fcc0, s1, t1
|
||||
CMPLT $fcc1, s2, t2
|
||||
CMPLT $fcc2, s3, t3
|
||||
CMPLT $fcc3, s4, t4
|
||||
CMOVT s1, s1, t1, $fcc0
|
||||
MOVT(x1, TEMP, $fcc0)
|
||||
CMOVT s2, s2, t2, $fcc1
|
||||
MOVT(x2, TEMP, $fcc1)
|
||||
CMOVT s3, s3, t3, $fcc2
|
||||
MOVT(x3, TEMP, $fcc2)
|
||||
CMOVT s4, s4, t4, $fcc3
|
||||
MOVT(x4, TEMP, $fcc3)
|
||||
addi.d TEMP, TEMP, 4
|
||||
/* Lanes 2-4 recorded the index of the lane-1 element of their group; add the lane offset. */
/* These adds are unconditional, so a lane that never updated ends with 1 + offset; the final reduction appears to rely on such a lane never winning its strict compare. */
addi.d x2, x2, 1
|
||||
addi.d x3, x3, 2
|
||||
addi.d x4, x4, 3
|
||||
.align 3
|
||||
|
||||
/*
 * Remainder elements, reduction of the four lanes, and return.
 * CMPLT/CMOVT/MOVT/FABS/LD/EPILOGUE are macros from common.h (not shown);
 * comments assume CMPLT cc, a, b sets cc when a < b and CMOVT/MOVT copy their
 * source when cc is set - TODO confirm.
 */
.L15:
|
||||
/* I = remaining elements (N & 7; N was decremented at entry). */
andi I, N, 7
|
||||
bge $r0, I, .L998
|
||||
.align 3
|
||||
|
||||
/* One element per pass, tracked in lane 1 only; TEMP advances by one each time. */
.L16:
|
||||
LD a1, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
FABS t1, a1
|
||||
addi.d I, I, -1
|
||||
CMPLT $fcc0, s1, t1
|
||||
CMOVT s1, s1, t1, $fcc0
|
||||
MOVT(x1, TEMP, $fcc0)
|
||||
addi.d TEMP, TEMP, 1
|
||||
blt $r0, I, .L16
|
||||
.align 3
|
||||
|
||||
/* Reduce the four lanes: (s1,x1) vs (s2,x2), (s3,x3) vs (s4,x4), then the two winners. */
/* NOTE(review): the compares are strict, so on equal values the s1 (or s3) index is kept even if the other lane holds a smaller index; confirm whether first-occurrence semantics are required on ties. */
.L998:
|
||||
CMPLT $fcc0, s1, s2
|
||||
CMPLT $fcc1, s3, s4
|
||||
CMOVT s1, s1, s2, $fcc0
|
||||
MOVT(x1, x2, $fcc0)
|
||||
CMOVT s3, s3, s4, $fcc1
|
||||
MOVT(x3, x4, $fcc1)
|
||||
CMPLT $fcc0, s1, s3
|
||||
CMOVT s1, s1, s3, $fcc0
|
||||
MOVT(x1, x3, $fcc0)
|
||||
.align 3
|
||||
|
||||
/* Return: x1 ($r17) is copied to $r4 and s1 ($f22) to $f0, then jump to the address in $r1. */
.L999:
|
||||
move $r4, $r17
|
||||
fmov.d $f0, $f22
|
||||
jirl $r0, $r1, 0x0
|
||||
|
||||
EPILOGUE
|
||||
233
kernel/loongarch64/iamin.S
Normal file
233
kernel/loongarch64/iamin.S
Normal file
@@ -0,0 +1,233 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2021, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/* Register aliases used by the iamin kernel that follows. */
/* Integer registers read as inputs by the code below. */
#define N $r4
|
||||
#define X $r5
|
||||
#define INCX $r6
|
||||
/* I: loop counter. TEMP: running 1-based element index. */
#define I $r18
|
||||
#define TEMP $r7
|
||||
/* a1-a8: elements loaded from x. */
#define a1 $f10
|
||||
#define a2 $f11
|
||||
#define a3 $f12
|
||||
#define a4 $f13
|
||||
#define a5 $f14
|
||||
#define a6 $f15
|
||||
#define a7 $f16
|
||||
#define a8 $f17
|
||||
/* t1-t4: absolute values of the elements currently being compared. */
#define t1 $f0
|
||||
#define t2 $f1
|
||||
#define t3 $f2
|
||||
#define t4 $f3
|
||||
/* s1-s4: smallest absolute value found so far in each of four lanes. */
#define s1 $f22
|
||||
#define s2 $f8
|
||||
#define s3 $f23
|
||||
#define s4 $f9
|
||||
/* x1-x4: index associated with s1-s4; x1 ($r17) is copied to $r4 at exit. */
#define x1 $r17
|
||||
#define x2 $r8
|
||||
#define x3 $r9
|
||||
#define x4 $r10
|
||||
|
||||
/*
 * Entry of the iamin kernel: argument checks, lane initialisation and
 * pipeline priming.
 * PROLOGUE, LDINT, LD, FABS, SIZE and BASE_SHIFT come from common.h
 * (not shown); comments assume FABS d, s computes d = |s| and BASE_SHIFT
 * is log2(SIZE) - TODO confirm.
 */
PROLOGUE
|
||||
|
||||
/* With F_INTERFACE, N and INCX are passed by reference and loaded here. */
#ifdef F_INTERFACE
|
||||
LDINT N, 0(N)
|
||||
LDINT INCX, 0(INCX)
|
||||
#endif
|
||||
|
||||
/* Result index 0 when N <= 0 or INCX <= 0. */
li x1, 0
|
||||
bge $r0, N, .L999
|
||||
slli.d INCX, INCX, BASE_SHIFT
|
||||
bge $r0, INCX, .L999
|
||||
/* First element is the initial candidate (index 1); N now counts the remaining elements. */
LD a1, X, 0 * SIZE
|
||||
addi.d N, N, -1
|
||||
li x1, 1
|
||||
bge $r0, N, .L999
|
||||
/* Seed all four lanes with |x[0]| and index 1; TEMP = 2 is the index of the next element. */
FABS s1, a1
|
||||
add.d X, X, INCX
|
||||
FABS s2, a1
|
||||
li x2, 1
|
||||
FABS s3, a1
|
||||
/* I = number of 8-element blocks among the remaining elements. */
srai.d I, N, 3
|
||||
FABS s4, a1
|
||||
li x3, 1
|
||||
li TEMP, 2
|
||||
li x4, 1
|
||||
bge $r0, I, .L15
|
||||
/* Prime the pipeline with the first block of eight elements. */
LD a1, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a2, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a3, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a4, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a5, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a6, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a7, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a8, X, 0 * SIZE
|
||||
addi.d I, I, -1
|
||||
add.d X, X, INCX
|
||||
/* Only one block: go straight to the drain code. */
bge $r0, I, .L13
|
||||
.align 3
|
||||
|
||||
/*
 * Steady-state loop: compares the eight elements already in a1-a8 against the
 * four lane minima while loading the next eight.
 * CMPLT/CMOVT/MOVT are macros from common.h (not shown); comments assume
 * CMPLT cc, a, b sets cc when a < b, CMOVT d, d, t, cc replaces d with t when
 * cc is set, and MOVT(d, s, cc) copies s to d when cc is set - TODO confirm.
 * Every lane records the same TEMP (index of the lane-1 element of the group);
 * the per-lane offsets 1/2/3 are added once after the loop, in .L13.
 */
.L12:
|
||||
/* First half: |a1..a4| versus lanes 1-4 (new value wins when strictly smaller), interleaved with reloading a1-a4. */
FABS t1, a1
|
||||
LD a1, X, 0 * SIZE
|
||||
FABS t2, a2
|
||||
add.d X, X, INCX
|
||||
FABS t3, a3
|
||||
LD a2, X, 0 * SIZE
|
||||
FABS t4, a4
|
||||
add.d X, X, INCX
|
||||
CMPLT $fcc0, t1, s1
|
||||
LD a3, X, 0 * SIZE
|
||||
CMPLT $fcc1, t2, s2
|
||||
add.d X, X, INCX
|
||||
CMPLT $fcc2, t3, s3
|
||||
LD a4, X, 0 * SIZE
|
||||
CMPLT $fcc3, t4, s4
|
||||
add.d X, X, INCX
|
||||
CMOVT s1, s1, t1, $fcc0
|
||||
MOVT(x1, TEMP, $fcc0)
|
||||
CMOVT s2, s2, t2, $fcc1
|
||||
MOVT(x2, TEMP, $fcc1)
|
||||
CMOVT s3, s3, t3, $fcc2
|
||||
MOVT(x3, TEMP, $fcc2)
|
||||
CMOVT s4, s4, t4, $fcc3
|
||||
MOVT(x4, TEMP, $fcc3)
|
||||
/* Four elements consumed. */
addi.d TEMP, TEMP, 4
|
||||
addi.d I, I, -1
|
||||
/* Second half: |a5..a8| versus lanes 1-4, interleaved with reloading a5-a8. */
FABS t1, a5
|
||||
LD a5, X, 0 * SIZE
|
||||
FABS t2, a6
|
||||
add.d X, X, INCX
|
||||
FABS t3, a7
|
||||
LD a6, X, 0 * SIZE
|
||||
FABS t4, a8
|
||||
add.d X, X, INCX
|
||||
CMPLT $fcc0, t1, s1
|
||||
LD a7, X, 0 * SIZE
|
||||
CMPLT $fcc1, t2, s2
|
||||
add.d X, X, INCX
|
||||
CMPLT $fcc2, t3, s3
|
||||
LD a8, X, 0 * SIZE
|
||||
CMPLT $fcc3, t4, s4
|
||||
add.d X, X, INCX
|
||||
CMOVT s1, s1, t1, $fcc0
|
||||
MOVT(x1, TEMP, $fcc0)
|
||||
CMOVT s2, s2, t2, $fcc1
|
||||
MOVT(x2, TEMP, $fcc1)
|
||||
CMOVT s3, s3, t3, $fcc2
|
||||
MOVT(x3, TEMP, $fcc2)
|
||||
CMOVT s4, s4, t4, $fcc3
|
||||
MOVT(x4, TEMP, $fcc3)
|
||||
addi.d TEMP, TEMP, 4
|
||||
blt $r0, I, .L12
|
||||
.align 3
|
||||
|
||||
/*
 * Drain: compare the last eight preloaded elements (no further loads), then
 * convert the group indices recorded in x2-x4 into element indices.
 * CMPLT/CMOVT/MOVT/FABS are macros from common.h (not shown); comments assume
 * CMPLT cc, a, b sets cc when a < b and CMOVT/MOVT copy their source when cc
 * is set - TODO confirm.
 */
.L13:
|
||||
/* First half: |a1..a4| versus lanes 1-4. */
FABS t1, a1
|
||||
FABS t2, a2
|
||||
FABS t3, a3
|
||||
FABS t4, a4
|
||||
CMPLT $fcc0, t1, s1
|
||||
CMPLT $fcc1, t2, s2
|
||||
CMPLT $fcc2, t3, s3
|
||||
CMPLT $fcc3, t4, s4
|
||||
CMOVT s1, s1, t1, $fcc0
|
||||
MOVT(x1, TEMP, $fcc0)
|
||||
CMOVT s2, s2, t2, $fcc1
|
||||
MOVT(x2, TEMP, $fcc1)
|
||||
CMOVT s3, s3, t3, $fcc2
|
||||
MOVT(x3, TEMP, $fcc2)
|
||||
CMOVT s4, s4, t4, $fcc3
|
||||
MOVT(x4, TEMP, $fcc3)
|
||||
/* Second half: |a5..a8| versus lanes 1-4. */
FABS t1, a5
|
||||
addi.d TEMP, TEMP, 4
|
||||
FABS t2, a6
|
||||
FABS t3, a7
|
||||
FABS t4, a8
|
||||
CMPLT $fcc0, t1, s1
|
||||
CMPLT $fcc1, t2, s2
|
||||
CMPLT $fcc2, t3, s3
|
||||
CMPLT $fcc3, t4, s4
|
||||
CMOVT s1, s1, t1, $fcc0
|
||||
MOVT(x1, TEMP, $fcc0)
|
||||
CMOVT s2, s2, t2, $fcc1
|
||||
MOVT(x2, TEMP, $fcc1)
|
||||
CMOVT s3, s3, t3, $fcc2
|
||||
MOVT(x3, TEMP, $fcc2)
|
||||
CMOVT s4, s4, t4, $fcc3
|
||||
MOVT(x4, TEMP, $fcc3)
|
||||
addi.d TEMP, TEMP, 4
|
||||
/* Lanes 2-4 recorded the index of the lane-1 element of their group; add the lane offset. */
/* These adds are unconditional, so a lane that never updated ends with 1 + offset; the final reduction appears to rely on such a lane never winning its strict compare. */
addi.d x2, x2, 1
|
||||
addi.d x3, x3, 2
|
||||
addi.d x4, x4, 3
|
||||
.align 3
|
||||
|
||||
/*
 * Remainder elements, reduction of the four lanes, and return.
 * CMPLT/CMOVT/MOVT/FABS/LD/EPILOGUE are macros from common.h (not shown);
 * comments assume CMPLT cc, a, b sets cc when a < b and CMOVT/MOVT copy their
 * source when cc is set - TODO confirm.
 */
.L15:
|
||||
/* I = remaining elements (N & 7; N was decremented at entry). */
andi I, N, 7
|
||||
bge $r0, I, .L998
|
||||
.align 3
|
||||
|
||||
/* One element per pass, tracked in lane 1 only; TEMP advances by one each time. */
.L16:
|
||||
LD a1, X, 0 * SIZE
|
||||
add.d X, X, INCX
|
||||
FABS t1, a1
|
||||
addi.d I, I, -1
|
||||
CMPLT $fcc0, t1, s1
|
||||
CMOVT s1, s1, t1, $fcc0
|
||||
MOVT(x1, TEMP, $fcc0)
|
||||
addi.d TEMP, TEMP, 1
|
||||
blt $r0, I, .L16
|
||||
.align 3
|
||||
|
||||
/* Reduce the four lanes: (s1,x1) vs (s2,x2), (s3,x3) vs (s4,x4), then the two winners. */
/* NOTE(review): the compares are strict, so on equal values the s1 (or s3) index is kept even if the other lane holds a smaller index; confirm whether first-occurrence semantics are required on ties. */
.L998:
|
||||
CMPLT $fcc0, s2, s1
|
||||
CMPLT $fcc1, s4, s3
|
||||
CMOVT s1, s1, s2, $fcc0
|
||||
MOVT(x1, x2, $fcc0)
|
||||
CMOVT s3, s3, s4, $fcc1
|
||||
MOVT(x3, x4, $fcc1)
|
||||
CMPLT $fcc0, s3, s1
|
||||
CMOVT s1, s1, s3, $fcc0
|
||||
MOVT(x1, x3, $fcc0)
|
||||
.align 3
|
||||
|
||||
/* Return: x1 ($r17) is copied to $r4 and s1 ($f22) to $f0, then jump to the address in $r1. */
.L999:
|
||||
move $r4, $r17
|
||||
fmov.d $f0, $f22
|
||||
jirl $r0, $r1, 0x0
|
||||
|
||||
EPILOGUE
|
||||
217
kernel/loongarch64/izamax.S
Normal file
217
kernel/loongarch64/izamax.S
Normal file
@@ -0,0 +1,217 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2021, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/* Register aliases for the routine that follows.
 * N, X, INCX: element count, current element address, stride added to X.
 * I: loop trip counter.  TEMP: running 1-based element index. */
#define N $r4
|
||||
#define X $r5
|
||||
#define INCX $r6
|
||||
#define I $r18
|
||||
#define TEMP $r7
|
||||
/* a1..a8: raw values loaded from X, two per element. */
#define a1 $f10
|
||||
#define a2 $f11
|
||||
#define a3 $f12
|
||||
#define a4 $f13
|
||||
#define a5 $f14
|
||||
#define a6 $f15
|
||||
#define a7 $f16
|
||||
#define a8 $f17
|
||||
/* t1..t8: FABS results; t1/t3/t5/t7 are then overwritten with the per-element sums. */
#define t1 $f0
|
||||
#define t2 $f1
|
||||
#define t3 $f2
|
||||
#define t4 $f3
|
||||
#define t5 $f4
|
||||
#define t6 $f5
|
||||
#define t7 $f6
|
||||
#define t8 $f7
|
||||
/* s1..s4: running key kept by each of four lanes (all seeded from the first element); s1 is copied to $f0 on exit. */
#define s1 $f22
|
||||
#define s2 $f8
|
||||
#define s3 $f23
|
||||
#define s4 $f9
|
||||
/* x1..x4: index recorded for s1..s4; x1 is copied to $r4 on exit. */
#define x1 $r17
|
||||
#define x2 $r8
|
||||
#define x3 $r9
|
||||
#define x4 $r10
|
||||
|
||||
/*
 * Index search over N strided elements of X. Each element is two
 * consecutive values loaded at offsets 0 and 1 * SIZE; the key of an
 * element is FABS(first) + FABS(second).
 *
 * Assumptions about macros defined outside this view (confirm in common.h):
 *   CMPLT cc, a, b      sets cc when a < b
 *   CMOVT d, a, b, cc   d = b when cc is set, otherwise a
 *   MOVT(d, s, cc)      d = s only when cc is set
 * Under that reading every compare below replaces a running value only when
 * it is strictly smaller than the new key, i.e. the routine tracks the
 * largest key and the 1-based index at which it was recorded.
 *
 * Inputs : N ($r4), X ($r5), INCX ($r6).
 * Output : $r4 = x1 (0 when N <= 0 or the scaled INCX <= 0), $f0 = s1.
 *          On the two early exits s1 has not been written by this routine.
 *
 * NOTE(review): the four lanes are merged in .L998 with strict compares, so
 * when two lanes hold equal keys the lower-numbered lane wins even if its
 * recorded index is the larger one - confirm this tie-breaking is intended.
 */
PROLOGUE
|
||||
|
||||
/* With F_INTERFACE, N and INCX are replaced by what LDINT reads at offset 0 from them (LDINT is defined outside this view; presumably an integer load through a pointer). */
#ifdef F_INTERFACE
|
||||
LDINT N, 0(N)
|
||||
LDINT INCX, 0(INCX)
|
||||
#endif
|
||||
|
||||
/* Default result index 0, returned when N <= 0 or the scaled INCX <= 0. */
li x1, 0
|
||||
bge $r0, N, .L999
|
||||
/* Scale INCX by 2^ZBASE_SHIFT; it is added directly to the address X below (presumably a byte stride - confirm ZBASE_SHIFT in common.h). */
slli.d INCX, INCX, ZBASE_SHIFT
|
||||
bge $r0, INCX, .L999
|
||||
/* Element 1: key = FABS(first) + FABS(second); all four lanes are seeded with it. */
LD a1, X, 0 * SIZE
|
||||
LD a2, X, 1 * SIZE
|
||||
FABS t1, a1
|
||||
FABS t2, a2
|
||||
ADD s1, t1, t2
|
||||
ADD s2, t1, t2
|
||||
ADD s3, t1, t2
|
||||
ADD s4, t1, t2
|
||||
/* One element consumed; index 1 is the answer if nothing remains. */
addi.d N, N, -1
|
||||
li x1, 1
|
||||
bge $r0, N, .L999
|
||||
add.d X, X, INCX
|
||||
li x2, 1
|
||||
/* I = number of complete groups of four among the remaining elements. */
srai.d I, N, 2
|
||||
li x3, 1
|
||||
/* TEMP = 1-based index of the next element to examine. */
li TEMP, 2
|
||||
li x4, 1
|
||||
/* No complete group of four: go straight to the one-at-a-time loop. */
bge $r0, I, .L15
|
||||
/* Preload the first group of four elements into a1..a8. */
LD a1, X, 0 * SIZE
|
||||
LD a2, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a3, X, 0 * SIZE
|
||||
LD a4, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a5, X, 0 * SIZE
|
||||
LD a6, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a7, X, 0 * SIZE
|
||||
LD a8, X, 1 * SIZE
|
||||
addi.d I, I, -1
|
||||
add.d X, X, INCX
|
||||
/* Only one group in total: skip the loop and evaluate it in .L13. */
bge $r0, I, .L13
|
||||
.align 3
|
||||
|
||||
/* Main loop: evaluates the group already held in a1..a8 while the loads for the next group are interleaved with the arithmetic. */
.L12:
|
||||
FABS t1, a1
|
||||
LD a1, X, 0 * SIZE
|
||||
FABS t2, a2
|
||||
LD a2, X, 1 * SIZE
|
||||
FABS t3, a3
|
||||
add.d X, X, INCX
|
||||
FABS t4, a4
|
||||
FABS t5, a5
|
||||
LD a3, X, 0 * SIZE
|
||||
FABS t6, a6
|
||||
LD a4, X, 1 * SIZE
|
||||
FABS t7, a7
|
||||
add.d X, X, INCX
|
||||
FABS t8, a8
|
||||
/* Keys of the four elements end up in t1, t3, t5, t7. */
ADD t1, t1, t2
|
||||
LD a5, X, 0 * SIZE
|
||||
ADD t3, t3, t4
|
||||
LD a6, X, 1 * SIZE
|
||||
ADD t5, t5, t6
|
||||
add.d X, X, INCX
|
||||
ADD t7, t7, t8
|
||||
/* Each lane compares its running value with the new key (running value first, key second). */
CMPLT $fcc0, s1, t1
|
||||
LD a7, X, 0 * SIZE
|
||||
CMPLT $fcc1, s2, t3
|
||||
LD a8, X, 1 * SIZE
|
||||
CMPLT $fcc2, s3, t5
|
||||
add.d X, X, INCX
|
||||
CMPLT $fcc3, s4, t7
|
||||
addi.d I, I, -1
|
||||
/* Where the compare hit, take the new key and record TEMP, the index of the group's first element; the per-lane offsets are added once, after .L13. */
CMOVT s1, s1, t1, $fcc0
|
||||
MOVT(x1, TEMP, $fcc0)
|
||||
CMOVT s2, s2, t3, $fcc1
|
||||
MOVT(x2, TEMP, $fcc1)
|
||||
CMOVT s3, s3, t5, $fcc2
|
||||
MOVT(x3, TEMP, $fcc2)
|
||||
CMOVT s4, s4, t7, $fcc3
|
||||
MOVT(x4, TEMP, $fcc3)
|
||||
/* Advance the index by one group. */
addi.d TEMP, TEMP, 4
|
||||
blt $r0, I, .L12
|
||||
.align 3
|
||||
|
||||
/* Drain: the same evaluation for the last preloaded group, with no further loads. */
.L13:
|
||||
FABS t1, a1
|
||||
FABS t2, a2
|
||||
FABS t3, a3
|
||||
FABS t4, a4
|
||||
FABS t5, a5
|
||||
FABS t6, a6
|
||||
FABS t7, a7
|
||||
FABS t8, a8
|
||||
ADD t1, t1, t2
|
||||
ADD t3, t3, t4
|
||||
ADD t5, t5, t6
|
||||
ADD t7, t7, t8
|
||||
CMPLT $fcc0, s1, t1
|
||||
CMPLT $fcc1, s2, t3
|
||||
CMPLT $fcc2, s3, t5
|
||||
CMPLT $fcc3, s4, t7
|
||||
CMOVT s1, s1, t1, $fcc0
|
||||
MOVT(x1, TEMP, $fcc0)
|
||||
CMOVT s2, s2, t3, $fcc1
|
||||
MOVT(x2, TEMP, $fcc1)
|
||||
CMOVT s3, s3, t5, $fcc2
|
||||
MOVT(x3, TEMP, $fcc2)
|
||||
CMOVT s4, s4, t7, $fcc3
|
||||
MOVT(x4, TEMP, $fcc3)
|
||||
addi.d TEMP, TEMP, 4
|
||||
/* Lanes 2..4 recorded the group's base index; add each lane's position within the group (1, 2, 3). */
/* NOTE(review): the offsets are added even when a lane never replaced its seed, leaving x2..x4 = 2, 3, 4 with s2..s4 still equal to the seed; the strict compares in .L998 then appear never to select them - confirm. */
addi.d x2, x2, 1
|
||||
addi.d x3, x3, 2
|
||||
addi.d x4, x4, 3
|
||||
.align 3
|
||||
|
||||
/* Remainder: the N & 3 leftover elements are handled one at a time and update lane 1 only. */
.L15:
|
||||
andi I, N, 3
|
||||
bge $r0, I, .L998
|
||||
.align 3
|
||||
|
||||
/* Per element: load, form the key, compare against s1, record TEMP on a hit, advance TEMP by one. */
.L16:
|
||||
LD a1, X, 0 * SIZE
|
||||
LD a2, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
FABS t1, a1
|
||||
FABS t2, a2
|
||||
ADD t1, t1, t2
|
||||
addi.d I, I, -1
|
||||
CMPLT $fcc0, s1, t1
|
||||
CMOVT s1, s1, t1, $fcc0
|
||||
MOVT(x1, TEMP, $fcc0)
|
||||
addi.d TEMP, TEMP, 1
|
||||
blt $r0, I, .L16
|
||||
.align 3
|
||||
|
||||
/* Merge the lanes: (s1,x1) against (s2,x2), (s3,x3) against (s4,x4), then the two winners; the result lands in s1/x1. */
.L998:
|
||||
CMPLT $fcc0, s1, s2
|
||||
CMPLT $fcc1, s3, s4
|
||||
CMOVT s1, s1, s2, $fcc0
|
||||
MOVT(x1, x2, $fcc0)
|
||||
CMOVT s3, s3, s4, $fcc1
|
||||
MOVT(x3, x4, $fcc1)
|
||||
CMPLT $fcc0, s1, s3
|
||||
CMOVT s1, s1, s3, $fcc0
|
||||
MOVT(x1, x3, $fcc0)
|
||||
.align 3
|
||||
|
||||
/* Exit: copy x1 ($r17) into $r4 and s1 ($f22) into $f0, then jump to the address held in $r1. */
.L999:
|
||||
move $r4, $r17
|
||||
fmov.d $f0, $f22
|
||||
jirl $r0, $r1, 0x0
|
||||
|
||||
EPILOGUE
|
||||
217
kernel/loongarch64/izamin.S
Normal file
217
kernel/loongarch64/izamin.S
Normal file
@@ -0,0 +1,217 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2021, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/* Register aliases for the izamin routine that follows.
 * N, X, INCX: element count, current element address, stride added to X.
 * I: loop trip counter.  TEMP: running 1-based element index. */
#define N $r4
|
||||
#define X $r5
|
||||
#define INCX $r6
|
||||
#define I $r18
|
||||
#define TEMP $r7
|
||||
/* a1..a8: raw values loaded from X, two per element. */
#define a1 $f10
|
||||
#define a2 $f11
|
||||
#define a3 $f12
|
||||
#define a4 $f13
|
||||
#define a5 $f14
|
||||
#define a6 $f15
|
||||
#define a7 $f16
|
||||
#define a8 $f17
|
||||
/* t1..t8: FABS results; t1/t3/t5/t7 are then overwritten with the per-element sums. */
#define t1 $f0
|
||||
#define t2 $f1
|
||||
#define t3 $f2
|
||||
#define t4 $f3
|
||||
#define t5 $f4
|
||||
#define t6 $f5
|
||||
#define t7 $f6
|
||||
#define t8 $f7
|
||||
/* s1..s4: running key kept by each of four lanes (all seeded from the first element); s1 is copied to $f0 on exit. */
#define s1 $f22
|
||||
#define s2 $f8
|
||||
#define s3 $f23
|
||||
#define s4 $f9
|
||||
/* x1..x4: index recorded for s1..s4; x1 is copied to $r4 on exit. */
#define x1 $r17
|
||||
#define x2 $r8
|
||||
#define x3 $r9
|
||||
#define x4 $r10
|
||||
|
||||
/*
 * Index search over N strided elements of X. Each element is two
 * consecutive values loaded at offsets 0 and 1 * SIZE; the key of an
 * element is FABS(first) + FABS(second).
 *
 * Assumptions about macros defined outside this view (confirm in common.h):
 *   CMPLT cc, a, b      sets cc when a < b
 *   CMOVT d, a, b, cc   d = b when cc is set, otherwise a
 *   MOVT(d, s, cc)      d = s only when cc is set
 * Under that reading every compare below replaces a running value only when
 * the new key is strictly smaller than it, i.e. the routine tracks the
 * smallest key and the 1-based index at which it was recorded.
 *
 * Inputs : N ($r4), X ($r5), INCX ($r6).
 * Output : $r4 = x1 (0 when N <= 0 or the scaled INCX <= 0), $f0 = s1.
 *          On the two early exits s1 has not been written by this routine.
 *
 * NOTE(review): the four lanes are merged in .L998 with strict compares, so
 * when two lanes hold equal keys the lower-numbered lane wins even if its
 * recorded index is the larger one - confirm this tie-breaking is intended.
 */
PROLOGUE
|
||||
|
||||
/* With F_INTERFACE, N and INCX are replaced by what LDINT reads at offset 0 from them (LDINT is defined outside this view; presumably an integer load through a pointer). */
#ifdef F_INTERFACE
|
||||
LDINT N, 0(N)
|
||||
LDINT INCX, 0(INCX)
|
||||
#endif
|
||||
|
||||
/* Default result index 0, returned when N <= 0 or the scaled INCX <= 0. */
li x1, 0
|
||||
bge $r0, N, .L999
|
||||
/* Scale INCX by 2^ZBASE_SHIFT; it is added directly to the address X below (presumably a byte stride - confirm ZBASE_SHIFT in common.h). */
slli.d INCX, INCX, ZBASE_SHIFT
|
||||
bge $r0, INCX, .L999
|
||||
/* Element 1: key = FABS(first) + FABS(second); all four lanes are seeded with it. */
LD a1, X, 0 * SIZE
|
||||
LD a2, X, 1 * SIZE
|
||||
FABS t1, a1
|
||||
FABS t2, a2
|
||||
ADD s1, t1, t2
|
||||
ADD s2, t1, t2
|
||||
ADD s3, t1, t2
|
||||
ADD s4, t1, t2
|
||||
/* One element consumed; index 1 is the answer if nothing remains. */
addi.d N, N, -1
|
||||
li x1, 1
|
||||
bge $r0, N, .L999
|
||||
add.d X, X, INCX
|
||||
li x2, 1
|
||||
/* I = number of complete groups of four among the remaining elements. */
srai.d I, N, 2
|
||||
li x3, 1
|
||||
/* TEMP = 1-based index of the next element to examine. */
li TEMP, 2
|
||||
li x4, 1
|
||||
/* No complete group of four: go straight to the one-at-a-time loop. */
bge $r0, I, .L15
|
||||
/* Preload the first group of four elements into a1..a8. */
LD a1, X, 0 * SIZE
|
||||
LD a2, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a3, X, 0 * SIZE
|
||||
LD a4, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a5, X, 0 * SIZE
|
||||
LD a6, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
LD a7, X, 0 * SIZE
|
||||
LD a8, X, 1 * SIZE
|
||||
addi.d I, I, -1
|
||||
add.d X, X, INCX
|
||||
/* Only one group in total: skip the loop and evaluate it in .L13. */
bge $r0, I, .L13
|
||||
.align 3
|
||||
|
||||
/* Main loop: evaluates the group already held in a1..a8 while the loads for the next group are interleaved with the arithmetic. */
.L12:
|
||||
FABS t1, a1
|
||||
LD a1, X, 0 * SIZE
|
||||
FABS t2, a2
|
||||
LD a2, X, 1 * SIZE
|
||||
FABS t3, a3
|
||||
add.d X, X, INCX
|
||||
FABS t4, a4
|
||||
FABS t5, a5
|
||||
LD a3, X, 0 * SIZE
|
||||
FABS t6, a6
|
||||
LD a4, X, 1 * SIZE
|
||||
FABS t7, a7
|
||||
add.d X, X, INCX
|
||||
FABS t8, a8
|
||||
/* Keys of the four elements end up in t1, t3, t5, t7. */
ADD t1, t1, t2
|
||||
LD a5, X, 0 * SIZE
|
||||
ADD t3, t3, t4
|
||||
LD a6, X, 1 * SIZE
|
||||
ADD t5, t5, t6
|
||||
add.d X, X, INCX
|
||||
ADD t7, t7, t8
|
||||
/* Each lane compares the new key with its running value (key first, running value second). */
CMPLT $fcc0, t1, s1
|
||||
LD a7, X, 0 * SIZE
|
||||
CMPLT $fcc1, t3, s2
|
||||
LD a8, X, 1 * SIZE
|
||||
CMPLT $fcc2, t5, s3
|
||||
add.d X, X, INCX
|
||||
CMPLT $fcc3, t7, s4
|
||||
addi.d I, I, -1
|
||||
/* Where the compare hit, take the new key and record TEMP, the index of the group's first element; the per-lane offsets are added once, after .L13. */
CMOVT s1, s1, t1, $fcc0
|
||||
MOVT(x1, TEMP, $fcc0)
|
||||
CMOVT s2, s2, t3, $fcc1
|
||||
MOVT(x2, TEMP, $fcc1)
|
||||
CMOVT s3, s3, t5, $fcc2
|
||||
MOVT(x3, TEMP, $fcc2)
|
||||
CMOVT s4, s4, t7, $fcc3
|
||||
MOVT(x4, TEMP, $fcc3)
|
||||
/* Advance the index by one group. */
addi.d TEMP, TEMP, 4
|
||||
blt $r0, I, .L12
|
||||
.align 3
|
||||
|
||||
/* Drain: the same evaluation for the last preloaded group, with no further loads. */
.L13:
|
||||
FABS t1, a1
|
||||
FABS t2, a2
|
||||
FABS t3, a3
|
||||
FABS t4, a4
|
||||
FABS t5, a5
|
||||
FABS t6, a6
|
||||
FABS t7, a7
|
||||
FABS t8, a8
|
||||
ADD t1, t1, t2
|
||||
ADD t3, t3, t4
|
||||
ADD t5, t5, t6
|
||||
ADD t7, t7, t8
|
||||
CMPLT $fcc0, t1, s1
|
||||
CMPLT $fcc1, t3, s2
|
||||
CMPLT $fcc2, t5, s3
|
||||
CMPLT $fcc3, t7, s4
|
||||
CMOVT s1, s1, t1, $fcc0
|
||||
MOVT(x1, TEMP, $fcc0)
|
||||
CMOVT s2, s2, t3, $fcc1
|
||||
MOVT(x2, TEMP, $fcc1)
|
||||
CMOVT s3, s3, t5, $fcc2
|
||||
MOVT(x3, TEMP, $fcc2)
|
||||
CMOVT s4, s4, t7, $fcc3
|
||||
MOVT(x4, TEMP, $fcc3)
|
||||
addi.d TEMP, TEMP, 4
|
||||
/* Lanes 2..4 recorded the group's base index; add each lane's position within the group (1, 2, 3). */
/* NOTE(review): the offsets are added even when a lane never replaced its seed, leaving x2..x4 = 2, 3, 4 with s2..s4 still equal to the seed; the strict compares in .L998 then appear never to select them - confirm. */
addi.d x2, x2, 1
|
||||
addi.d x3, x3, 2
|
||||
addi.d x4, x4, 3
|
||||
.align 3
|
||||
|
||||
/* Remainder: the N & 3 leftover elements are handled one at a time and update lane 1 only. */
.L15:
|
||||
andi I, N, 3
|
||||
bge $r0, I, .L998
|
||||
.align 3
|
||||
|
||||
/* Per element: load, form the key, compare against s1, record TEMP on a hit, advance TEMP by one. */
.L16:
|
||||
LD a1, X, 0 * SIZE
|
||||
LD a2, X, 1 * SIZE
|
||||
add.d X, X, INCX
|
||||
FABS t1, a1
|
||||
FABS t2, a2
|
||||
ADD t1, t1, t2
|
||||
addi.d I, I, -1
|
||||
CMPLT $fcc0, t1, s1
|
||||
CMOVT s1, s1, t1, $fcc0
|
||||
MOVT(x1, TEMP, $fcc0)
|
||||
addi.d TEMP, TEMP, 1
|
||||
blt $r0, I, .L16
|
||||
.align 3
|
||||
|
||||
/* Merge the lanes: (s1,x1) against (s2,x2), (s3,x3) against (s4,x4), then the two winners; the result lands in s1/x1. */
.L998:
|
||||
CMPLT $fcc0, s2, s1
|
||||
CMPLT $fcc1, s4, s3
|
||||
CMOVT s1, s1, s2, $fcc0
|
||||
MOVT(x1, x2, $fcc0)
|
||||
CMOVT s3, s3, s4, $fcc1
|
||||
MOVT(x3, x4, $fcc1)
|
||||
CMPLT $fcc0, s3, s1
|
||||
CMOVT s1, s1, s3, $fcc0
|
||||
MOVT(x1, x3, $fcc0)
|
||||
.align 3
|
||||
|
||||
/* Exit: copy x1 ($r17) into $r4 and s1 ($f22) into $f0, then jump to the address held in $r1. */
.L999:
|
||||
move $r4, $r17
|
||||
fmov.d $f0, $f22
|
||||
jirl $r0, $r1, 0x0
|
||||
|
||||
EPILOGUE
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user