Compare commits
273 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
d57c681a6d | ||
|
|
d7efe5857c | ||
|
|
8fd694c18f | ||
|
|
e69b0b1771 | ||
|
|
9dc0bfd617 | ||
|
|
e6664ec2c9 | ||
|
|
dbb33f412f | ||
|
|
70b89a6205 | ||
|
|
07b144855a | ||
|
|
292a0aed66 | ||
|
|
42f0201e21 | ||
|
|
22db876d48 | ||
|
|
bdd6e3a153 | ||
|
|
7b8f580941 | ||
|
|
198adea961 | ||
|
|
86c5a0013f | ||
|
|
ef85c22474 | ||
|
|
d3555d2e50 | ||
|
|
c4b91bfcf1 | ||
|
|
0f5e86a0d9 | ||
|
|
7b294a99fd | ||
|
|
1e4b2e98d9 | ||
|
|
3fd6ccdf76 | ||
|
|
fa9a30b491 | ||
|
|
d90ca75a6c | ||
|
|
e107454454 | ||
|
|
d43962d013 | ||
|
|
2f6d35c3d4 | ||
|
|
86de5f768b | ||
|
|
2663e44724 | ||
|
|
6f2900c164 | ||
|
|
7888b5127c | ||
|
|
8808c291b9 | ||
|
|
8cdf0825de | ||
|
|
9e0dbe8e59 | ||
|
|
52f99d3944 | ||
|
|
186368ddc3 | ||
|
|
c0b94ae1df | ||
|
|
ddd86309a1 | ||
|
|
e9d453b623 | ||
|
|
ecb4babcf4 | ||
|
|
34753eaebb | ||
|
|
efa72a631b | ||
|
|
30d835168a | ||
|
|
8f6a744807 | ||
|
|
6726771645 | ||
|
|
a51cae6b2e | ||
|
|
d30b943251 | ||
|
|
0934568d9c | ||
|
|
697e64bbb6 | ||
|
|
bffb9b0e95 | ||
|
|
6ae7af78a3 | ||
|
|
041a26fd79 | ||
|
|
3c356b1a1f | ||
|
|
b1215f2f8c | ||
|
|
0b73041b16 | ||
|
|
9579bd47e5 | ||
|
|
09d47af2c0 | ||
|
|
ef0238ba2b | ||
|
|
a9f6f7ad39 | ||
|
|
1d254d321b | ||
|
|
41646ed006 | ||
|
|
3679781872 | ||
|
|
38dcf3454b | ||
|
|
e34d57ca90 | ||
|
|
20f492c298 | ||
|
|
c7c82be1c3 | ||
|
|
9564f688c4 | ||
|
|
90c1776c86 | ||
|
|
9cf861e8fa | ||
|
|
9b7b1da133 | ||
|
|
a5ab891292 | ||
|
|
90bb4ac821 | ||
|
|
23a0d1bc1f | ||
|
|
0e96c378fd | ||
|
|
ee16efff3c | ||
|
|
0197519dd7 | ||
|
|
865829cfac | ||
|
|
0571c3187b | ||
|
|
d12a2d0d04 | ||
|
|
2d369bd916 | ||
|
|
93843c55b6 | ||
|
|
e3a6132e12 | ||
|
|
736f0146c3 | ||
|
|
897fc2b6ef | ||
|
|
441c116105 | ||
|
|
8ecd80a34a | ||
|
|
4ba53db0da | ||
|
|
6c365ff648 | ||
|
|
e33bcdbb7b | ||
|
|
ec6b354c32 | ||
|
|
292d1af1a0 | ||
|
|
325b398e3c | ||
|
|
6f5667b4d4 | ||
|
|
cceeee7806 | ||
|
|
0a4546b742 | ||
|
|
b1eed27a54 | ||
|
|
1a3ad4b670 | ||
|
|
86a5f98e4a | ||
|
|
1caa44bea9 | ||
|
|
dbbf92c1d1 | ||
|
|
cb429d6b12 | ||
|
|
b0bded3f2f | ||
|
|
f9aaf22fc3 | ||
|
|
35ff3c731d | ||
|
|
63fa6c832e | ||
|
|
e4e5042e38 | ||
|
|
ae53e3e233 | ||
|
|
074d9bff7f | ||
|
|
f36862603a | ||
|
|
47691c031f | ||
|
|
ce7ddd8921 | ||
|
|
950c047b49 | ||
|
|
46509953a9 | ||
|
|
db348dcff2 | ||
|
|
a33f471065 | ||
|
|
ece3ce581e | ||
|
|
8189a98d85 | ||
|
|
d7a77091a3 | ||
|
|
3e1e74fca6 | ||
|
|
33b5670122 | ||
|
|
95e19e2e23 | ||
|
|
99ac042702 | ||
|
|
774b9f8653 | ||
|
|
eb1d2344f7 | ||
|
|
6fa9860dbe | ||
|
|
0cc36770f1 | ||
|
|
558cd543bf | ||
|
|
bd906e3410 | ||
|
|
35086cb501 | ||
|
|
2056ffc227 | ||
|
|
7745439312 | ||
|
|
c4b5abbe43 | ||
|
|
f87842483e | ||
|
|
3dbb32c734 | ||
|
|
609ea80276 | ||
|
|
3dfecaaf7c | ||
|
|
3165c915b6 | ||
|
|
457ccc42c9 | ||
|
|
00880c720a | ||
|
|
856bc36533 | ||
|
|
fe71887b68 | ||
|
|
10094bd885 | ||
|
|
eea0c0f2ed | ||
|
|
85be43e0df | ||
|
|
0cb9e9fc8d | ||
|
|
cb61d3b46b | ||
|
|
113840da12 | ||
|
|
deb2e66bcc | ||
|
|
9b2d69aa80 | ||
|
|
e3ff4cdd23 | ||
|
|
0745ba43a4 | ||
|
|
3ede843d50 | ||
|
|
2e8d6e8690 | ||
|
|
69a5558203 | ||
|
|
d6905403e3 | ||
|
|
411926b572 | ||
|
|
439b93f6d2 | ||
|
|
d6cf67778c | ||
|
|
b94dab5250 | ||
|
|
6178974cd9 | ||
|
|
0b9e4d1278 | ||
|
|
63fa3c3f8f | ||
|
|
3612d9a57a | ||
|
|
b60de4447a | ||
|
|
16dddb760e | ||
|
|
eff7c9166e | ||
|
|
f1bf2603e6 | ||
|
|
6f32991eae | ||
|
|
202fc9e8ed | ||
|
|
e378b24487 | ||
|
|
3628b22d49 | ||
|
|
af2b0d0205 | ||
|
|
4bf988959a | ||
|
|
a0e4fb3a28 | ||
|
|
2c445be8ba | ||
|
|
e3f4063683 | ||
|
|
6bbe6d5b92 | ||
|
|
89ae305e11 | ||
|
|
da8d7f09f1 | ||
|
|
25c986db5a | ||
|
|
a8f249458d | ||
|
|
bc5b35367f | ||
|
|
930aff2c2e | ||
|
|
ac3e2a3fdd | ||
|
|
9ccb12b031 | ||
|
|
e18a2c22db | ||
|
|
b716c0ef01 | ||
|
|
2efa3b70dc | ||
|
|
49959d4f1c | ||
|
|
0f27a03607 | ||
|
|
c2a8ebfe69 | ||
|
|
43aac5bacc | ||
|
|
bff2b7c94d | ||
|
|
2d45a262d9 | ||
|
|
ed652d8136 | ||
|
|
6fe0f1fab9 | ||
|
|
b0beb0b1ca | ||
|
|
018dec8588 | ||
|
|
5d6209e1f9 | ||
|
|
601b711c78 | ||
|
|
78702753f2 | ||
|
|
7aa1ff8ff6 | ||
|
|
d6c97cf010 | ||
|
|
1b2508362b | ||
|
|
cd898af59f | ||
|
|
0a535e58d8 | ||
|
|
9ce9e295fe | ||
|
|
9a38592c79 | ||
|
|
9b3965b08c | ||
|
|
531cb4f673 | ||
|
|
3559c5d7a2 | ||
|
|
8631e2976a | ||
|
|
2768bc1764 | ||
|
|
6f4698ee1f | ||
|
|
85e5165e98 | ||
|
|
17c16f2a71 | ||
|
|
91c3f86c2b | ||
|
|
75b1f3becc | ||
|
|
07c5e549b2 | ||
|
|
114eb159a4 | ||
|
|
005cce5507 | ||
|
|
b859b6e79d | ||
|
|
b212a2fb9f | ||
|
|
e40416567a | ||
|
|
b37e5fa2f8 | ||
|
|
326469ef4a | ||
|
|
c73d8ee40d | ||
|
|
abef2ea770 | ||
|
|
b26e32c3af | ||
|
|
7822eff936 | ||
|
|
865676682d | ||
|
|
0f7776af0b | ||
|
|
b03dc011be | ||
|
|
00ce35336e | ||
|
|
723776ddf7 | ||
|
|
5a77ec7f1c | ||
|
|
2fb11f873b | ||
|
|
ad63647446 | ||
|
|
87315e8a8d | ||
|
|
9031ebd7d5 | ||
|
|
12b41d5598 | ||
|
|
d2b11c4777 | ||
|
|
7bc0e4a2e0 | ||
|
|
d3ec787f77 | ||
|
|
2c309c235d | ||
|
|
3dec81200c | ||
|
|
737724607f | ||
|
|
77edf82c7f | ||
|
|
6232237dba | ||
|
|
7d81acc762 | ||
|
|
18d8a67485 | ||
|
|
043128cbe5 | ||
|
|
3331ca492d | ||
|
|
346e30a46a | ||
|
|
83de62c20d | ||
|
|
658da9a769 | ||
|
|
be24c66a7c | ||
|
|
4b548857d6 | ||
|
|
d71fe4ed4e | ||
|
|
a554712439 | ||
|
|
5d26223f4a | ||
|
|
980ab349bc | ||
|
|
d67babf345 | ||
|
|
7f11e33e8d | ||
|
|
ad38bd0e89 | ||
|
|
47b639cc9b | ||
|
|
213c0e7abb | ||
|
|
043f3d6faa | ||
|
|
fdf71d66b3 | ||
|
|
6cfd6195c5 | ||
|
|
5163a85d40 | ||
|
|
dbf9ad1f3d |
24
.drone.yml
24
.drone.yml
@@ -190,3 +190,27 @@ steps:
|
||||
- make -C ctest $COMMON_FLAGS
|
||||
- make -C utest $COMMON_FLAGS
|
||||
- make -C cpp_thread_test dgemm_tester
|
||||
---
|
||||
kind: pipeline
|
||||
name: arm64_gcc10
|
||||
|
||||
platform:
|
||||
os: linux
|
||||
arch: arm64
|
||||
|
||||
steps:
|
||||
- name: Build and Test
|
||||
image: ubuntu:20.04
|
||||
environment:
|
||||
CC: gcc-10
|
||||
FC: gfortran-10
|
||||
COMMON_FLAGS: 'TARGET=ARMV8 DYNAMIC_ARCH=1'
|
||||
commands:
|
||||
- echo "MAKE_FLAGS:= $COMMON_FLAGS"
|
||||
- apt-get update -y
|
||||
- apt-get install -y make $CC gfortran-10 perl python g++
|
||||
- $CC --version
|
||||
- make QUIET_MAKE=1 $COMMON_FLAGS
|
||||
- make -C utest $COMMON_FLAGS
|
||||
- make -C test $COMMON_FLAGS
|
||||
|
||||
|
||||
5
.github/workflows/nightly-Homebrew-build.yml
vendored
5
.github/workflows/nightly-Homebrew-build.yml
vendored
@@ -44,6 +44,11 @@ jobs:
|
||||
if: github.event_name != 'pull_request'
|
||||
run: brew update || true
|
||||
|
||||
- name: unlink installed gcc to allow updating
|
||||
run: |
|
||||
brew unlink gcc@8
|
||||
brew unlink gcc@9
|
||||
|
||||
- name: Install prerequisites
|
||||
run: brew install --fetch-HEAD --HEAD --only-dependencies --keep-tmp openblas
|
||||
|
||||
|
||||
2
.gitignore
vendored
2
.gitignore
vendored
@@ -89,5 +89,7 @@ build.*
|
||||
*.swp
|
||||
benchmark/*.goto
|
||||
benchmark/smallscaling
|
||||
.vscode
|
||||
CMakeCache.txt
|
||||
CMakeFiles/*
|
||||
.vscode
|
||||
|
||||
15
.travis.yml
15
.travis.yml
@@ -224,12 +224,21 @@ matrix:
|
||||
before_script:
|
||||
- COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32"
|
||||
- brew update
|
||||
- brew install gcc@10
|
||||
script:
|
||||
- travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
|
||||
env:
|
||||
- BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-10"
|
||||
|
||||
- BTYPE="TARGET=HASWELL USE_OPENMP=1 BINARY=64 INTERFACE64=1 CC=gcc-10 FC=gfortran-10"
|
||||
|
||||
- <<: *test-macos
|
||||
osx_image: xcode12
|
||||
before_script:
|
||||
- COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32"
|
||||
- brew update
|
||||
script:
|
||||
- travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
|
||||
env:
|
||||
- BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-10"
|
||||
|
||||
# - <<: *test-macos
|
||||
# osx_image: xcode10
|
||||
# env:
|
||||
|
||||
@@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5)
|
||||
project(OpenBLAS C ASM)
|
||||
set(OpenBLAS_MAJOR_VERSION 0)
|
||||
set(OpenBLAS_MINOR_VERSION 3)
|
||||
set(OpenBLAS_PATCH_VERSION 12.dev)
|
||||
set(OpenBLAS_PATCH_VERSION 14.dev)
|
||||
set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
|
||||
|
||||
# Adhere to GNU filesystem layout conventions
|
||||
@@ -14,6 +14,9 @@ include(GNUInstallDirs)
|
||||
|
||||
include(CMakePackageConfigHelpers)
|
||||
|
||||
if(MSVC AND NOT DEFINED NOFORTRAN)
|
||||
set(NOFORTRAN ON)
|
||||
endif()
|
||||
|
||||
#######
|
||||
if(MSVC)
|
||||
@@ -229,7 +232,7 @@ if (NOT NO_CBLAS)
|
||||
add_subdirectory(utest)
|
||||
endif()
|
||||
|
||||
if (NOT MSVC AND NOT NOFORTRAN)
|
||||
if (NOT NOFORTRAN)
|
||||
# Build test and ctest
|
||||
add_subdirectory(test)
|
||||
if(NOT NO_CBLAS)
|
||||
|
||||
@@ -1,4 +1,102 @@
|
||||
OpenBLAS ChangeLog
|
||||
====================================================================
|
||||
Version 0.3.14
|
||||
17-Mar-2021
|
||||
|
||||
common:
|
||||
* Fixed a race condition on thread shutdown in non-OpenMP builds
|
||||
* Fixed custom BUFFERSIZE option getting ignored in gmake builds
|
||||
* Fixed CMAKE compilation of the TRMM kernels for GENERIC platforms
|
||||
* Added CBLAS interfaces for CROTG, ZROTG, CSROT and ZDROT
|
||||
* Improved performance of OMATCOPY_RT across all platforms
|
||||
* Changed perl scripts to use env instead of a hardcoded /usr/bin/perl
|
||||
* Fixed potential misreading of the GCC compiler version in the build scripts
|
||||
* Fixed convergence problems in LAPACK complex GGEV/GGES (Reference-LAPACK #477)
|
||||
* Reduced the stacksize requirements for running the LAPACK testsuite (Reference-LAPACK #335)
|
||||
|
||||
RISCV:
|
||||
* Fixed compilation on RISCV (missing entry in getarch)
|
||||
|
||||
POWER:
|
||||
* Fixed compilation for DYNAMIC_ARCH with clang and with old gcc versions
|
||||
* Added support for compilation on FreeBSD/ppc64le
|
||||
* Added optimized POWER10 kernels for SSCAL, DSCAL, CSCAL, ZSCAL
|
||||
* Added optimized POWER10 kernels for SROT, DROT, CDOT, SASUM, DASUM
|
||||
* Improved SSWAP, DSWAP, CSWAP, ZSWAP performance on POWER10
|
||||
* Improved SCOPY and CCOPY performance on POWER10
|
||||
* Improved SGEMM and DGEMM performance on POWER10
|
||||
* Added support for compilation with the NVIDIA HPC compiler
|
||||
|
||||
x86_64:
|
||||
* Added an optimized bfloat16 GEMM kernel for Cooperlake
|
||||
* Added CPUID autodetection for Intel Rocket Lake and Tiger Lake cpus
|
||||
* Improved the performance of SASUM,DASUM,SROT,DROT on AMD Ryzen cpus
|
||||
* Added support for compilation with the NAG Fortran compiler
|
||||
* Fixed recognition of the AMD AOCC compiler
|
||||
* Fixed compilation for DYNAMIC_ARCH with clang on Windows
|
||||
* Added support for running the BLAS/CBLAS tests on Windows
|
||||
* Fixed signatures of the tls callback functions for Windows x64
|
||||
* Fixed various issues with fma intrinsics support handling
|
||||
|
||||
ARM:
|
||||
* Added support for embedded Cortex M targets via a new option EMBEDDED
|
||||
|
||||
ARMV8:
|
||||
* Fixed the THUNDERX2T99 and NEOVERSEN1 DNRM2/ZNRM2 kernels for inputs with Inf
|
||||
* Added support for the DYNAMIC_LIST option
|
||||
* Added support for compilation with the NVIDIA HPC compiler
|
||||
* Added support for compiling with the NAG Fortran compiler
|
||||
|
||||
====================================================================
|
||||
Version 0.3.13
|
||||
12-Dec-2020
|
||||
|
||||
common:
|
||||
* Added a generic bfloat16 SBGEMV kernel
|
||||
* Fixed a potentially severe memory leak after fork in OpenMP builds
|
||||
that was introduced in 0.3.12
|
||||
* Added detection of the Fujitsu Fortran compiler
|
||||
* Added detection of the (e)gfortran compiler on OpenBSD
|
||||
* Added support for overriding the default name of the library independently
|
||||
from symbol suffixing in the gmake builds (already supported in cmake)
|
||||
|
||||
RISCV:
|
||||
* Added a RISC V port optimized for C910V
|
||||
|
||||
POWER:
|
||||
* Added optimized POWER10 kernels for SAXPY, CAXPY, SDOT, DDOT and DGEMV_N
|
||||
* Improved DGEMM performance on POWER10
|
||||
* Improved STRSM and DTRSM performance on POWER9 and POWER10
|
||||
* Fixed segmentation faults in DYNAMIC_ARCH builds
|
||||
* Fixed compilation with the PGI compiler
|
||||
|
||||
x86:
|
||||
* Fixed compilation of kernels that require SSE2 intrinsics since 0.3.12
|
||||
|
||||
x86_64:
|
||||
* Added an optimized bfloat16 SBGEMV kernel for SkylakeX and Cooperlake
|
||||
* Improved the performance of SASUM and DASUM kernels through parallelization
|
||||
* Improved the performance of SROT and DROT kernels
|
||||
* Improved the performance of multithreaded xSYRK
|
||||
* Fixed OpenMP builds that use the LLVM Clang compiler together with GNU gfortran
|
||||
(where linking of both the LLVM libomp and GNU libgomp could lead to lockups or
|
||||
wrong results)
|
||||
* Fixed miscompilations by old gcc 4.6
|
||||
* Fixed misdetection of AVX2 capability in some Sandybridge cpus
|
||||
* Fixed lockups in builds combining DYNAMIC_ARCH with TARGET=GENERIC on OpenBSD
|
||||
|
||||
ARM64:
|
||||
* Fixed segmentation faults in DYNAMIC_ARCH builds
|
||||
|
||||
MIPS:
|
||||
* Improved kernels for Loongson 3R3 ("3A") and 3R4 ("3B") models, including MSA
|
||||
* Fixed bugs in the MSA kernels for CGEMM, CTRMM, CGEMV and ZGEMV
|
||||
* Added handling of zero increments in the MSA kernels for SSWAP and DSWAP
|
||||
* Added DYNAMIC_ARCH support for MIPS64 (currently Loongson3R3/3R4 only)
|
||||
|
||||
SPARC:
|
||||
* Fixed building 32 and 64 bit SPARC kernels with the SolarisStudio compilers
|
||||
|
||||
====================================================================
|
||||
Version 0.3.12
|
||||
24-Oct-2020
|
||||
|
||||
6
Makefile
6
Makefile
@@ -59,6 +59,9 @@ endif
|
||||
@$(CC) --version > /dev/null 2>&1;\
|
||||
if [ $$? -eq 0 ]; then \
|
||||
cverinfo=`$(CC) --version | sed -n '1p'`; \
|
||||
if [ -z "$${cverinfo}" ]; then \
|
||||
cverinfo=`$(CC) --version | sed -n '2p'`; \
|
||||
fi; \
|
||||
echo " C compiler ... $(C_COMPILER) (cmd & version : $${cverinfo})";\
|
||||
else \
|
||||
echo " C compiler ... $(C_COMPILER) (command line : $(CC))";\
|
||||
@@ -67,6 +70,9 @@ ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
|
||||
@$(FC) --version > /dev/null 2>&1;\
|
||||
if [ $$? -eq 0 ]; then \
|
||||
fverinfo=`$(FC) --version | sed -n '1p'`; \
|
||||
if [ -z "$${fverinfo}" ]; then \
|
||||
fverinfo=`$(FC) --version | sed -n '2p'`; \
|
||||
fi; \
|
||||
echo " Fortran compiler ... $(F_COMPILER) (cmd & version : $${fverinfo})";\
|
||||
else \
|
||||
echo " Fortran compiler ... $(F_COMPILER) (command line : $(FC))";\
|
||||
|
||||
@@ -1,28 +1,38 @@
|
||||
|
||||
ifneq ($(C_COMPILER), PGI)
|
||||
ifeq ($(CORE), ARMV8)
|
||||
CCOMMON_OPT += -march=armv8-a
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8-a
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), CORTEXA53)
|
||||
CCOMMON_OPT += -march=armv8-a -mtune=cortex-a53
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8-a -mtune=cortex-a53
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), CORTEXA57)
|
||||
CCOMMON_OPT += -march=armv8-a -mtune=cortex-a57
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8-a -mtune=cortex-a57
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), CORTEXA72)
|
||||
CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), CORTEXA73)
|
||||
CCOMMON_OPT += -march=armv8-a -mtune=cortex-a73
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8-a -mtune=cortex-a73
|
||||
endif
|
||||
endif
|
||||
|
||||
# Use a72 tunings because Neoverse-N1 is only available
|
||||
# in GCC>=9
|
||||
@@ -30,51 +40,71 @@ ifeq ($(CORE), NEOVERSEN1)
|
||||
ifeq ($(GCCVERSIONGTEQ7), 1)
|
||||
ifeq ($(GCCVERSIONGTEQ9), 1)
|
||||
CCOMMON_OPT += -march=armv8.2-a -mtune=neoverse-n1
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8.2-a -mtune=neoverse-n1
|
||||
endif
|
||||
else
|
||||
CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72
|
||||
endif
|
||||
endif
|
||||
else
|
||||
CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), THUNDERX)
|
||||
CCOMMON_OPT += -march=armv8-a -mtune=thunderx
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8-a -mtune=thunderx
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), FALKOR)
|
||||
CCOMMON_OPT += -march=armv8-a -mtune=falkor
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8-a -mtune=falkor
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), THUNDERX2T99)
|
||||
CCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), THUNDERX3T110)
|
||||
ifeq ($(GCCVERSIONGTEQ10), 1)
|
||||
CCOMMON_OPT += -march=armv8.3-a -mtune=thunderx3t110
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8.3-a -mtune=thunderx3t110
|
||||
endif
|
||||
else
|
||||
CCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), VORTEX)
|
||||
CCOMMON_OPT += -march=armv8.3-a
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8.3-a
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(GCCVERSIONGTEQ9), 1)
|
||||
ifeq ($(CORE), TSV110)
|
||||
CCOMMON_OPT += -march=armv8.2-a -mtune=tsv110
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8.2-a -mtune=tsv110
|
||||
endif
|
||||
endif
|
||||
|
||||
endif
|
||||
endif
|
||||
|
||||
@@ -10,9 +10,11 @@ USE_OPENMP = 1
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), POWER10)
|
||||
ifneq ($(C_COMPILER), PGI)
|
||||
CCOMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math
|
||||
FCOMMON_OPT += -O2 -frecursive -mcpu=power10 -mtune=power10 -fno-fast-math
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), POWER9)
|
||||
ifneq ($(C_COMPILER), PGI)
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
#
|
||||
|
||||
# This library's version
|
||||
VERSION = 0.3.12.dev
|
||||
VERSION = 0.3.14.dev
|
||||
|
||||
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
|
||||
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
|
||||
|
||||
@@ -21,6 +21,8 @@ ifeq ($(ARCH), amd64)
|
||||
override ARCH=x86_64
|
||||
else ifeq ($(ARCH), powerpc64)
|
||||
override ARCH=power
|
||||
else ifeq ($(ARCH), powerpc64le)
|
||||
override ARCH=power
|
||||
else ifeq ($(ARCH), powerpc)
|
||||
override ARCH=power
|
||||
else ifeq ($(ARCH), i386)
|
||||
@@ -181,7 +183,7 @@ endif
|
||||
|
||||
# On x86_64 build getarch with march=native unless the compiler is PGI. This is required to detect AVX512 support in getarch.
|
||||
ifeq ($(HOSTARCH), x86_64)
|
||||
ifeq ($(findstring pgcc,$(HOSTCC)),)
|
||||
ifeq ($(findstring pgcc,$(HOSTCC))$(findstring nvc,$(HOSTCC)),)
|
||||
GETARCH_FLAGS += -march=native
|
||||
endif
|
||||
endif
|
||||
@@ -623,6 +625,15 @@ DYNAMIC_CORE += THUNDERX2T99
|
||||
DYNAMIC_CORE += TSV110
|
||||
DYNAMIC_CORE += EMAG8180
|
||||
DYNAMIC_CORE += THUNDERX3T110
|
||||
ifdef DYNAMIC_LIST
|
||||
override DYNAMIC_CORE = ARMV8 $(DYNAMIC_LIST)
|
||||
XCCOMMON_OPT = -DDYNAMIC_LIST -DDYN_ARMV8
|
||||
XCCOMMON_OPT += $(foreach dcore,$(DYNAMIC_LIST),-DDYN_$(dcore))
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(ARCH), mips64)
|
||||
DYNAMIC_CORE = LOONGSON3R3 LOONGSON3R4
|
||||
endif
|
||||
|
||||
ifeq ($(ARCH), zarch)
|
||||
@@ -659,6 +670,7 @@ endif
|
||||
endif # ARCH zarch
|
||||
|
||||
ifeq ($(ARCH), power)
|
||||
ifneq ($(C_COMPILER), PGI)
|
||||
DYNAMIC_CORE = POWER6
|
||||
DYNAMIC_CORE += POWER8
|
||||
ifneq ($(C_COMPILER), GCC)
|
||||
@@ -672,7 +684,7 @@ DYNAMIC_CORE += POWER9
|
||||
else
|
||||
# No comma after the function name: "$(info, ...)" is not parsed as a call to
# the info function, so the message would never be printed.
$(info OpenBLAS: Your gcc version is too old to build the POWER9 kernels.)
|
||||
endif
|
||||
LDVERSIONGTEQ35 := $(shell expr `ld --version | head -1 | cut -f2 -d "." | cut -f1 -d "-"` >= 35)
|
||||
LDVERSIONGTEQ35 := $(shell expr `$(CC) -Wl,--version 2> /dev/null | head -1 | cut -f2 -d "." | cut -f1 -d "-"` \>= 35)
|
||||
ifeq ($(GCCVERSIONGTEQ11)$(LDVERSIONGTEQ35), 11)
|
||||
DYNAMIC_CORE += POWER10
|
||||
CCOMMON_OPT += -DHAVE_P10_SUPPORT
|
||||
@@ -685,6 +697,10 @@ else
|
||||
# No comma after the function name: "$(info, ...)" is not parsed as a call to
# the info function, so the message would never be printed.
$(info OpenBLAS: Your gcc version is too old to build the POWER10 kernels.)
|
||||
endif
|
||||
endif
|
||||
else
|
||||
DYNAMIC_CORE = POWER8
|
||||
DYNAMIC_CORE += POWER9
|
||||
endif
|
||||
endif
|
||||
|
||||
# If DYNAMIC_CORE is not set, DYNAMIC_ARCH cannot do anything, so force it to empty
|
||||
@@ -787,14 +803,9 @@ CCOMMON_OPT += -mabi=32
|
||||
BINARY_DEFINED = 1
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), LOONGSON3A)
|
||||
CCOMMON_OPT += -march=mips64
|
||||
FCOMMON_OPT += -march=mips64
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), LOONGSON3B)
|
||||
CCOMMON_OPT += -march=mips64
|
||||
FCOMMON_OPT += -march=mips64
|
||||
ifeq ($(CORE), $(filter $(CORE),LOONGSON3R3 LOONGSON3R4))
|
||||
CCOMMON_OPT += -march=loongson3a
|
||||
FCOMMON_OPT += -march=loongson3a
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), MIPS24K)
|
||||
@@ -848,9 +859,19 @@ endif
|
||||
endif
|
||||
|
||||
ifeq ($(C_COMPILER), PGI)
|
||||
PGCVERSIONGT20 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -d "." -f 1` \> 20)
|
||||
PGCVERSIONGTEQ20 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -d "." -f 1` \>= 20)
|
||||
PGCMINORVERSIONGE11 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -c 4-5` == 11)
|
||||
# Three-digit key built from the checks above: (major > 20)(major >= 20)(minor == 11).
# It is matched against "110 111 011" below, so all three digits must be present;
# the middle digit comes from PGCVERSIONGTEQ20 (PGCVERSIONEQ20 is never defined).
PGCVERSIONCHECK := $(PGCVERSIONGT20)$(PGCVERSIONGTEQ20)$(PGCMINORVERSIONGE11)
|
||||
ifeq ($(PGCVERSIONCHECK), $(filter $(PGCVERSIONCHECK), 110 111 011))
|
||||
NEWPGI := 1
|
||||
endif
|
||||
ifdef BINARY64
|
||||
ifeq ($(ARCH), x86_64)
|
||||
CCOMMON_OPT += -tp p7-64 -D__MMX__ -Mnollvm
|
||||
CCOMMON_OPT += -tp p7-64
|
||||
ifneq ($(NEWPGI),1)
|
||||
CCOMMON_OPT += -D__MMX__ -Mnollvm
|
||||
endif
|
||||
else
|
||||
ifeq ($(ARCH), power)
|
||||
ifeq ($(CORE), POWER8)
|
||||
@@ -878,13 +899,25 @@ endif
|
||||
# Fortran Compiler dependent settings
|
||||
#
|
||||
|
||||
ifeq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -dcfuns -recursive -ieee=full -w=obs -thread_safe
|
||||
ifdef INTERFACE64
|
||||
ifneq ($(INTERFACE64), 0)
|
||||
FCOMMON_OPT += -i8
|
||||
endif
|
||||
endif
|
||||
ifeq ($(USE_OPENMP), 1)
|
||||
FCOMMON_OPT += -openmp
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(F_COMPILER), FLANG)
|
||||
CCOMMON_OPT += -DF_INTERFACE_FLANG
|
||||
FCOMMON_OPT += -Mrecursive -Kieee
|
||||
ifeq ($(OSNAME), Linux)
|
||||
ifeq ($(ARCH), x86_64)
|
||||
FLANG_VENDOR := $(shell `$(FC) --version|cut -f 1 -d "."|head -1`)
|
||||
ifeq ($(FLANG_VENDOR),AOCC)
|
||||
FLANG_VENDOR := $(shell $(FC) --version|head -1 |cut -f 1 -d " ")
|
||||
ifeq ($(FLANG_VENDOR), AMD)
|
||||
FCOMMON_OPT += -fno-unroll-loops
|
||||
endif
|
||||
endif
|
||||
@@ -1030,18 +1063,24 @@ ifeq ($(ARCH), x86_64)
|
||||
FCOMMON_OPT += -tp p7-64
|
||||
else
|
||||
ifeq ($(ARCH), power)
|
||||
ifeq ($(CORE), POWER6)
|
||||
$(warning NVIDIA HPC compilers do not support POWER6.)
|
||||
endif
|
||||
ifeq ($(CORE), POWER8)
|
||||
FCOMMON_OPT += -tp pwr8
|
||||
endif
|
||||
ifeq ($(CORE), POWER9)
|
||||
FCOMMON_OPT += -tp pwr9
|
||||
endif
|
||||
ifeq ($(CORE), POWER10)
|
||||
$(warning NVIDIA HPC compilers do not support POWER10.)
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
else
|
||||
FCOMMON_OPT += -tp p7
|
||||
endif
|
||||
FCOMMON_OPT += -Mrecursive
|
||||
FCOMMON_OPT += -Mrecursive -Kieee
|
||||
ifeq ($(USE_OPENMP), 1)
|
||||
FCOMMON_OPT += -mp
|
||||
endif
|
||||
@@ -1078,11 +1117,11 @@ FCOMMON_OPT += -n32
|
||||
else
|
||||
FCOMMON_OPT += -n64
|
||||
endif
|
||||
ifeq ($(CORE), LOONGSON3A)
|
||||
ifeq ($(CORE), LOONGSON3R3)
|
||||
FCOMMON_OPT += -loongson3 -static
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), LOONGSON3B)
|
||||
ifeq ($(CORE), LOONGSON3R4)
|
||||
FCOMMON_OPT += -loongson3 -static
|
||||
endif
|
||||
|
||||
@@ -1108,11 +1147,11 @@ CCOMMON_OPT += -n32
|
||||
else
|
||||
CCOMMON_OPT += -n64
|
||||
endif
|
||||
ifeq ($(CORE), LOONGSON3A)
|
||||
ifeq ($(CORE), LOONGSON3R3)
|
||||
CCOMMON_OPT += -loongson3 -static
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), LOONGSON3B)
|
||||
ifeq ($(CORE), LOONGSON3R4)
|
||||
CCOMMON_OPT += -loongson3 -static
|
||||
endif
|
||||
|
||||
@@ -1180,6 +1219,8 @@ CCOMMON_OPT += -fPIC
|
||||
endif
|
||||
ifeq ($(F_COMPILER), SUN)
|
||||
FCOMMON_OPT += -pic
|
||||
else ifeq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -PIC
|
||||
else
|
||||
FCOMMON_OPT += -fPIC
|
||||
endif
|
||||
@@ -1223,10 +1264,8 @@ ifdef SMP
|
||||
CCOMMON_OPT += -DSMP_SERVER
|
||||
|
||||
ifeq ($(ARCH), mips64)
|
||||
ifneq ($(CORE), LOONGSON3B)
|
||||
USE_SIMPLE_THREADED_LEVEL3 = 1
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(USE_OPENMP), 1)
|
||||
# USE_SIMPLE_THREADED_LEVEL3 = 1
|
||||
@@ -1259,6 +1298,10 @@ CCOMMON_OPT += -DUSE_PAPI
|
||||
EXTRALIB += -lpapi -lperfctr
|
||||
endif
|
||||
|
||||
ifdef BUFFERSIZE
|
||||
CCOMMON_OPT += -DBUFFERSIZE=$(BUFFERSIZE)
|
||||
endif
|
||||
|
||||
ifdef DYNAMIC_THREADS
|
||||
CCOMMON_OPT += -DDYNAMIC_THREADS
|
||||
endif
|
||||
@@ -1342,11 +1385,9 @@ endif
|
||||
|
||||
ifneq ($(ARCH), x86_64)
|
||||
ifneq ($(ARCH), x86)
|
||||
ifneq ($(CORE), LOONGSON3B)
|
||||
NO_AFFINITY = 1
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
ifdef NO_AFFINITY
|
||||
ifeq ($(NO_AFFINITY), 0)
|
||||
@@ -1438,6 +1479,10 @@ LAPACK_FFLAGS := $(FFLAGS)
|
||||
LAPACK_FPFLAGS := $(FPFLAGS)
|
||||
endif
|
||||
|
||||
ifeq ($(F_COMPILER),NAG)
|
||||
LAPACK_FFLAGS := $(filter-out -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS))
|
||||
endif
|
||||
|
||||
LAPACK_CFLAGS = $(CFLAGS)
|
||||
LAPACK_CFLAGS += -DHAVE_LAPACK_CONFIG_H
|
||||
ifdef INTERFACE64
|
||||
|
||||
@@ -10,40 +10,46 @@ endif
|
||||
|
||||
ifdef HAVE_SSE3
|
||||
CCOMMON_OPT += -msse3
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -msse3
|
||||
endif
|
||||
endif
|
||||
ifdef HAVE_SSSE3
|
||||
CCOMMON_OPT += -mssse3
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -mssse3
|
||||
endif
|
||||
endif
|
||||
ifdef HAVE_SSE4_1
|
||||
CCOMMON_OPT += -msse4.1
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -msse4.1
|
||||
endif
|
||||
endif
|
||||
ifndef OLDGCC
|
||||
ifdef HAVE_AVX
|
||||
CCOMMON_OPT += -mavx
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -mavx
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
ifndef NO_AVX2
|
||||
ifdef HAVE_AVX2
|
||||
CCOMMON_OPT += -mavx2
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -mavx2
|
||||
endif
|
||||
endif
|
||||
ifndef OLDGCC
|
||||
ifdef HAVE_FMA3
|
||||
CCOMMON_OPT += -mfma
|
||||
FCOMMON_OPT += -mfma
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), SKYLAKEX)
|
||||
ifndef DYNAMIC_ARCH
|
||||
ifndef NO_AVX512
|
||||
CCOMMON_OPT += -march=skylake-avx512
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=skylake-avx512
|
||||
endif
|
||||
ifeq ($(OSNAME), CYGWIN_NT)
|
||||
CCOMMON_OPT += -fno-asynchronous-unwind-tables
|
||||
FCOMMON_OPT += -fno-asynchronous-unwind-tables
|
||||
@@ -65,9 +71,11 @@ ifeq ($(C_COMPILER), GCC)
|
||||
# cooperlake support was added in 10.1
|
||||
ifeq ($(GCCVERSIONGTEQ10)$(GCCMINORVERSIONGTEQ1), 11)
|
||||
CCOMMON_OPT += -march=cooperlake
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=cooperlake
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
ifeq ($(OSNAME), CYGWIN_NT)
|
||||
CCOMMON_OPT += -fno-asynchronous-unwind-tables
|
||||
FCOMMON_OPT += -fno-asynchronous-unwind-tables
|
||||
|
||||
@@ -13,10 +13,14 @@ Drone CI: [ library based on GotoBLAS2 1.13 BSD version.
|
||||
|
||||
Please read the documentation on the OpenBLAS wiki pages: <https://github.com/xianyi/OpenBLAS/wiki>.
|
||||
|
||||
For a general introduction to the BLAS routines, please refer to the extensive documentation of their reference implementation hosted at netlib:
|
||||
<https://www.netlib.org/blas>. On that site you will likewise find documentation for the reference implementation of the higher-level library LAPACK - the **L**inear **A**lgebra **Pack**age that comes included with OpenBLAS. If you are looking for a general primer or refresher on Linear Algebra, the set of six
|
||||
20-minute lecture videos by Prof. Gilbert Strang on either MIT OpenCourseWare <https://ocw.mit.edu/resources/res-18-010-a-2020-vision-of-linear-algebra-spring-2020/> or Youtube <https://www.youtube.com/playlist?list=PLUl4u3cNGP61iQEFiWLE21EJCxwmWvvek> may be helpful.
|
||||
|
||||
## Binary Packages
|
||||
|
||||
We provide official binary packages for the following platform:
|
||||
@@ -208,7 +212,8 @@ Please note that it is not possible to combine support for different architectur
|
||||
- **Android**: Supported by the community. Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-build-OpenBLAS-for-Android>.
|
||||
- **AIX**: Supported on PPC up to POWER8
|
||||
- **Haiku**: Supported by the community. We don't actively test the library on this OS.
|
||||
- **SunOS**: Supported by the community. We don't actively test the library on this OS:
|
||||
- **SunOS**: Supported by the community. We don't actively test the library on this OS.
|
||||
- **Cortex-M**: Supported by the community. Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-on-Cortex-M>.
|
||||
|
||||
## Usage
|
||||
|
||||
|
||||
20
appveyor.yml
20
appveyor.yml
@@ -30,10 +30,10 @@ environment:
|
||||
CONDA_INSTALL_LOCN: C:\\Miniconda36-x64
|
||||
matrix:
|
||||
- COMPILER: clang-cl
|
||||
WITH_FORTRAN: yes
|
||||
WITH_FORTRAN: ON
|
||||
- COMPILER: clang-cl
|
||||
DYNAMIC_ARCH: ON
|
||||
WITH_FORTRAN: no
|
||||
WITH_FORTRAN: OFF
|
||||
- COMPILER: cl
|
||||
- COMPILER: MinGW64-gcc-7.2.0-mingw
|
||||
DYNAMIC_ARCH: OFF
|
||||
@@ -47,12 +47,7 @@ environment:
|
||||
install:
|
||||
- if [%COMPILER%]==[clang-cl] call %CONDA_INSTALL_LOCN%\Scripts\activate.bat
|
||||
- if [%COMPILER%]==[clang-cl] conda config --add channels conda-forge --force
|
||||
- if [%COMPILER%]==[clang-cl] conda install --yes --quiet clangdev cmake
|
||||
|
||||
- if [%WITH_FORTRAN%]==[no] conda install --yes --quiet ninja
|
||||
- if [%WITH_FORTRAN%]==[yes] conda install --yes --quiet -c isuruf kitware-ninja
|
||||
- if [%WITH_FORTRAN%]==[yes] conda install --yes --quiet flang
|
||||
|
||||
- if [%COMPILER%]==[clang-cl] conda install --yes --quiet clangdev cmake ninja flang=11.0.1
|
||||
- if [%COMPILER%]==[clang-cl] call "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvarsall.bat" x64
|
||||
- if [%COMPILER%]==[clang-cl] set "LIB=%CONDA_INSTALL_LOCN%\Library\lib;%LIB%"
|
||||
- if [%COMPILER%]==[clang-cl] set "CPATH=%CONDA_INSTALL_LOCN%\Library\include;%CPATH%"
|
||||
@@ -68,15 +63,14 @@ before_build:
|
||||
- if [%COMPILER%]==[MinGW64-gcc-7.2.0-mingw] cmake -G "MinGW Makefiles" -DNOFORTRAN=1 ..
|
||||
- if [%COMPILER%]==[MinGW-gcc-6.3.0-32] cmake -G "MSYS Makefiles" -DNOFORTRAN=1 ..
|
||||
- if [%COMPILER%]==[MinGW-gcc-5.3.0] cmake -G "MSYS Makefiles" -DNOFORTRAN=1 ..
|
||||
- if [%WITH_FORTRAN%]==[no] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DMSVC_STATIC_CRT=ON ..
|
||||
- if [%WITH_FORTRAN%]==[yes] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DBUILD_WITHOUT_LAPACK=no -DNOFORTRAN=0 ..
|
||||
- if [%WITH_FORTRAN%]==[OFF] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DMSVC_STATIC_CRT=ON ..
|
||||
- if [%WITH_FORTRAN%]==[ON] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DBUILD_WITHOUT_LAPACK=no -DNOFORTRAN=0 ..
|
||||
- if [%USE_OPENMP%]==[ON] cmake -DUSE_OPENMP=ON ..
|
||||
- if [%DYNAMIC_ARCH%]==[ON] cmake -DDYNAMIC_ARCH=ON -DDYNAMIC_LIST='CORE2;NEHALEM;SANDYBRIDGE;BULLDOZER;HASWELL' ..
|
||||
|
||||
build_script:
|
||||
- cmake --build .
|
||||
|
||||
test_script:
|
||||
- echo Running Test
|
||||
- cd utest
|
||||
- openblas_utest
|
||||
- ctest -j2
|
||||
|
||||
|
||||
@@ -68,4 +68,13 @@ jobs:
|
||||
dir
|
||||
openblas_utest.exe
|
||||
|
||||
- job: OSX_OpenMP
|
||||
pool:
|
||||
vmImage: 'macOS-10.15'
|
||||
steps:
|
||||
- script: |
|
||||
brew update
|
||||
make TARGET=CORE2 DYNAMIC_ARCH=1 USE_OPENMP=1 CC=gcc-10 FC=gfortran-10
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -3,6 +3,8 @@
|
||||
#include <time.h>
|
||||
#ifdef __CYGWIN32__
|
||||
#include <sys/time.h>
|
||||
#elif defined(__APPLE__)
|
||||
#include <mach/mach_time.h>
|
||||
#endif
|
||||
#include "common.h"
|
||||
|
||||
@@ -74,6 +76,9 @@ static void *huge_malloc(BLASLONG size){
|
||||
|
||||
#if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS)
|
||||
struct timeval start, stop;
|
||||
#elif defined(__APPLE__)
|
||||
mach_timebase_info_data_t info;
|
||||
uint64_t start = 0, stop = 0;
|
||||
#else
|
||||
struct timespec start = { 0, 0 }, stop = { 0, 0 };
|
||||
#endif
|
||||
@@ -82,6 +87,9 @@ double getsec()
|
||||
{
|
||||
#if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS)
|
||||
return (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
|
||||
#elif defined(__APPLE__)
|
||||
mach_timebase_info(&info);
|
||||
return (double)(((stop - start) * info.numer)/info.denom) * 1.e-9;
|
||||
#else
|
||||
return (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_nsec - start.tv_nsec)) * 1.e-9;
|
||||
#endif
|
||||
@@ -90,6 +98,8 @@ double getsec()
|
||||
void begin() {
|
||||
#if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS)
|
||||
gettimeofday( &start, (struct timezone *)0);
|
||||
#elif defined(__APPLE__)
|
||||
start = clock_gettime_nsec_np(CLOCK_UPTIME_RAW);
|
||||
#else
|
||||
clock_gettime(CLOCK_REALTIME, &start);
|
||||
#endif
|
||||
@@ -98,7 +108,9 @@ void begin() {
|
||||
void end() {
|
||||
#if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS)
|
||||
gettimeofday( &stop, (struct timezone *)0);
|
||||
#elif defined(__APPLE__)
|
||||
stop = clock_gettime_nsec_np(CLOCK_UPTIME_RAW);
|
||||
#else
|
||||
clock_gettime(CLOCK_REALTIME, &stop);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
6
c_check
6
c_check
@@ -1,11 +1,11 @@
|
||||
#!/usr/bin/perl
|
||||
#!/usr/bin/env perl
|
||||
|
||||
#use File::Basename;
|
||||
# use File::Temp qw(tempfile);
|
||||
|
||||
# Checking cross compile
|
||||
$hostos = `uname -s | sed -e s/\-.*//`; chop($hostos);
|
||||
$hostarch = `uname -m | sed -e s/i.86/x86/`;chop($hostarch);
|
||||
$hostarch = `uname -m | sed -e s/i.86/x86/`;
|
||||
$hostarch = `uname -p` if ($hostos eq "AIX" || $hostos eq "SunOS");
|
||||
chop($hostarch);
|
||||
$hostarch = "x86_64" if ($hostarch eq "amd64");
|
||||
@@ -199,7 +199,7 @@ if (($architecture eq "mips") || ($architecture eq "mips64")) {
|
||||
} else {
|
||||
$tmpf = new File::Temp( SUFFIX => '.c' , UNLINK => 1 );
|
||||
$code = '"addvi.b $w0, $w1, 1"';
|
||||
$msa_flags = "-mmsa -mfp64 -msched-weight -mload-store-pairs";
|
||||
$msa_flags = "-mmsa -mfp64 -mload-store-pairs";
|
||||
print $tmpf "#include <msa.h>\n\n";
|
||||
print $tmpf "void main(void){ __asm__ volatile($code); }\n";
|
||||
|
||||
|
||||
5
cblas.h
5
cblas.h
@@ -125,9 +125,14 @@ void cblas_zswap(OPENBLAS_CONST blasint n, void *x, OPENBLAS_CONST blasint incx,
|
||||
|
||||
void cblas_srot(OPENBLAS_CONST blasint N, float *X, OPENBLAS_CONST blasint incX, float *Y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST float c, OPENBLAS_CONST float s);
|
||||
void cblas_drot(OPENBLAS_CONST blasint N, double *X, OPENBLAS_CONST blasint incX, double *Y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST double c, OPENBLAS_CONST double s);
|
||||
void cblas_csrot(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST float c, OPENBLAS_CONST float s);
|
||||
void cblas_zdrot(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST double c, OPENBLAS_CONST double s);
|
||||
|
||||
void cblas_srotg(float *a, float *b, float *c, float *s);
|
||||
void cblas_drotg(double *a, double *b, double *c, double *s);
|
||||
void cblas_crotg(void *a, void *b, float *c, void *s);
|
||||
void cblas_zrotg(void *a, void *b, double *c, void *s);
|
||||
|
||||
|
||||
void cblas_srotm(OPENBLAS_CONST blasint N, float *X, OPENBLAS_CONST blasint incX, float *Y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST float *P);
|
||||
void cblas_drotm(OPENBLAS_CONST blasint N, double *X, OPENBLAS_CONST blasint incX, double *Y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST double *P);
|
||||
|
||||
@@ -45,6 +45,9 @@ endif ()
|
||||
if (DYNAMIC_ARCH)
|
||||
if (ARM64)
|
||||
set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110)
|
||||
if (DYNAMIC_LIST)
|
||||
set(DYNAMIC_CORE ARMV8 ${DYNAMIC_LIST})
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
if (POWER)
|
||||
|
||||
@@ -2499,6 +2499,5 @@ foreach (Utils_FILE ${Utils_SRC})
|
||||
endforeach ()
|
||||
|
||||
set(lapacke_include_dir "${NETLIB_LAPACK_DIR}/LAPACKE/include")
|
||||
configure_file("${lapacke_include_dir}/lapacke_mangling_with_flags.h.in" "${lapacke_include_dir}/lapacke_mangling.h" COPYONLY)
|
||||
include_directories(${lapacke_include_dir})
|
||||
set_source_files_properties(${LAPACKE_SOURCES} PROPERTIES COMPILE_FLAGS "${LAPACK_CFLAGS}")
|
||||
|
||||
@@ -148,16 +148,20 @@ endif ()
|
||||
include("${PROJECT_SOURCE_DIR}/cmake/prebuild.cmake")
|
||||
if (DEFINED TARGET)
|
||||
if (${TARGET} STREQUAL COOPERLAKE AND NOT NO_AVX512)
|
||||
# if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU")
|
||||
if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU")
|
||||
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
|
||||
if (${GCC_VERSION} VERSION_GREATER 10.1 OR ${GCC_VERSION} VERSION_EQUAL 10.1)
|
||||
if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 10.09)
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=cooperlake")
|
||||
else()
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512")
|
||||
endif()
|
||||
# elseif (${CMAKE_C_COMPILER_ID} STREQUAL "CLANG")
|
||||
# set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2")
|
||||
# endif()
|
||||
elseif (${CMAKE_C_COMPILER_ID} STREQUAL "Clang" OR ${CMAKE_C_COMPILER_ID} STREQUAL "AppleClang")
|
||||
if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 8.99)
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=cooperlake")
|
||||
else()
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512")
|
||||
endif()
|
||||
endif()
|
||||
endif()
|
||||
if (${TARGET} STREQUAL SKYLAKEX AND NOT NO_AVX512)
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512")
|
||||
@@ -233,6 +237,11 @@ if (BINARY64)
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
if(EMBEDDED)
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DOS_EMBEDDED")
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -mthumb -mcpu=cortex-m4 -mfloat-abi=hard -mfpu=fpv4-sp-d16")
|
||||
endif()
|
||||
|
||||
if (NEED_PIC)
|
||||
if (${CMAKE_C_COMPILER} STREQUAL "IBM")
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -qpic=large")
|
||||
|
||||
@@ -74,6 +74,9 @@ macro(ParseMakefileVars MAKEFILE_IN)
|
||||
string(REGEX MATCH "ifneq \\(\\$\\(([_A-Z]+)\\),[ \t]*([0-9_A-Z]+)\\)" line_match "${makefile_line}")
|
||||
if (NOT "${line_match}" STREQUAL "")
|
||||
# message(STATUS "IFNEQ: ${line_match} first: ${CMAKE_MATCH_1} second: ${CMAKE_MATCH_2}")
|
||||
if ( ${CMAKE_MATCH_1} STREQUAL C_COMPILER)
|
||||
set (CMAKE_MATCH_1 CMAKE_C_COMPILER)
|
||||
endif ()
|
||||
if (NOT ( ${${CMAKE_MATCH_1}} STREQUAL ${CMAKE_MATCH_2}))
|
||||
# message (STATUS "condition is true")
|
||||
set (IfElse 1)
|
||||
|
||||
13
common.h
13
common.h
@@ -122,7 +122,7 @@ extern "C" {
|
||||
#define ATOM GOTO_ATOM
|
||||
#undef GOTO_ATOM
|
||||
#endif
|
||||
#else
|
||||
#elif !defined(OS_EMBEDDED)
|
||||
#include <sys/mman.h>
|
||||
#ifndef NO_SYSV_IPC
|
||||
#include <sys/shm.h>
|
||||
@@ -134,6 +134,9 @@ extern "C" {
|
||||
#if defined(SMP) || defined(USE_LOCKING)
|
||||
#include <pthread.h>
|
||||
#endif
|
||||
#else
|
||||
#include <time.h>
|
||||
#include <math.h>
|
||||
#endif
|
||||
|
||||
#if defined(OS_SUNOS)
|
||||
@@ -488,10 +491,12 @@ static inline unsigned long long rpcc(void){
|
||||
struct timespec ts;
|
||||
clock_gettime(CLOCK_MONOTONIC, &ts);
|
||||
return (unsigned long long)ts.tv_sec * 1000000000ull + ts.tv_nsec;
|
||||
#else
|
||||
#elif !defined(OS_EMBEDDED)
|
||||
struct timeval tv;
|
||||
gettimeofday(&tv,NULL);
|
||||
return (unsigned long long)tv.tv_sec * 1000000000ull + tv.tv_usec * 1000;
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
#define RPCC_DEFINED
|
||||
@@ -521,6 +526,10 @@ static void __inline blas_lock(volatile BLASULONG *address){
|
||||
#include "common_linux.h"
|
||||
#endif
|
||||
|
||||
#ifdef OS_EMBEDDED
|
||||
#define DTB_DEFAULT_ENTRIES 64
|
||||
#endif
|
||||
|
||||
#define MMAP_ACCESS (PROT_READ | PROT_WRITE)
|
||||
|
||||
#ifdef __NetBSD__
|
||||
|
||||
@@ -39,7 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#define INLINE inline
|
||||
|
||||
#ifdef F_INTERFACE_FLANG
|
||||
#if defined( F_INTERFACE_FLANG) || defined(F_INTERFACE_PGI)
|
||||
#define RETURN_BY_STACK
|
||||
#else
|
||||
#define RETURN_BY_COMPLEX
|
||||
|
||||
@@ -75,18 +75,10 @@ static inline int my_mbind(void *addr, unsigned long len, int mode,
|
||||
// https://lsbbugs.linuxfoundation.org/show_bug.cgi?id=3482
|
||||
return 0;
|
||||
#else
|
||||
#if defined (LOONGSON3B)
|
||||
#if defined (__64BIT__)
|
||||
return syscall(SYS_mbind, addr, len, mode, nodemask, maxnode, flags);
|
||||
#else
|
||||
return 0; //NULL Implementation on Loongson 3B 32bit.
|
||||
#endif
|
||||
#else
|
||||
//Fixed randomly SEGFAULT when nodemask==NULL with above Linux 2.6.34
|
||||
// unsigned long null_nodemask=0;
|
||||
return syscall(SYS_mbind, addr, len, mode, nodemask, maxnode, flags);
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline int my_set_mempolicy(int mode, const unsigned long *addr, unsigned long flag) {
|
||||
|
||||
@@ -229,12 +229,7 @@ REALNAME: ;\
|
||||
|
||||
#define BUFFER_SIZE ( 32 << 21)
|
||||
|
||||
#if defined(LOONGSON3A)
|
||||
#define PAGESIZE (16UL << 10)
|
||||
#define FIXED_PAGESIZE (16UL << 10)
|
||||
#endif
|
||||
|
||||
#if defined(LOONGSON3B)
|
||||
#if defined(LOONGSON3R3) || defined(LOONGSON3R4)
|
||||
#define PAGESIZE (16UL << 10)
|
||||
#define FIXED_PAGESIZE (16UL << 10)
|
||||
#endif
|
||||
@@ -250,7 +245,7 @@ REALNAME: ;\
|
||||
#define MAP_ANONYMOUS MAP_ANON
|
||||
#endif
|
||||
|
||||
#if defined(LOONGSON3A) || defined(LOONGSON3B)
|
||||
#if defined(LOONGSON3R3) || defined(LOONGSON3R4)
|
||||
#define PREFETCHD_(x) ld $0, x
|
||||
#define PREFETCHD(x) PREFETCHD_(x)
|
||||
#else
|
||||
|
||||
@@ -70,19 +70,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#define CPU_UNKNOWN 0
|
||||
#define CPU_SICORTEX 1
|
||||
#define CPU_LOONGSON3A 2
|
||||
#define CPU_LOONGSON3B 3
|
||||
#define CPU_I6400 4
|
||||
#define CPU_P6600 5
|
||||
#define CPU_I6500 6
|
||||
#define CPU_UNKNOWN 0
|
||||
#define CPU_SICORTEX 1
|
||||
#define CPU_LOONGSON3R3 2
|
||||
#define CPU_LOONGSON3R4 3
|
||||
#define CPU_I6400 4
|
||||
#define CPU_P6600 5
|
||||
#define CPU_I6500 6
|
||||
|
||||
static char *cpuname[] = {
|
||||
"UNKNOWN",
|
||||
"SICORTEX",
|
||||
"LOONGSON3A",
|
||||
"LOONGSON3B",
|
||||
"LOONGSON3R3",
|
||||
"LOONGSON3R4",
|
||||
"I6400",
|
||||
"P6600",
|
||||
"I6500"
|
||||
@@ -90,48 +90,13 @@ static char *cpuname[] = {
|
||||
|
||||
int detect(void){
|
||||
|
||||
#ifdef __linux
|
||||
#ifdef linux
|
||||
FILE *infile;
|
||||
char buffer[512], *p;
|
||||
|
||||
p = (char *)NULL;
|
||||
infile = fopen("/proc/cpuinfo", "r");
|
||||
while (fgets(buffer, sizeof(buffer), infile)){
|
||||
if (!strncmp("cpu", buffer, 3)){
|
||||
p = strchr(buffer, ':') + 2;
|
||||
#if 0
|
||||
fprintf(stderr, "%s\n", p);
|
||||
#endif
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
fclose(infile);
|
||||
|
||||
if(p != NULL){
|
||||
if (strstr(p, "Loongson-3A")){
|
||||
return CPU_LOONGSON3A;
|
||||
}else if(strstr(p, "Loongson-3B")){
|
||||
return CPU_LOONGSON3B;
|
||||
}else if (strstr(p, "Loongson-3")){
|
||||
infile = fopen("/proc/cpuinfo", "r");
|
||||
p = (char *)NULL;
|
||||
while (fgets(buffer, sizeof(buffer), infile)){
|
||||
if (!strncmp("system type", buffer, 11)){
|
||||
p = strchr(buffer, ':') + 2;
|
||||
break;
|
||||
}
|
||||
}
|
||||
fclose(infile);
|
||||
if (strstr(p, "loongson3a"))
|
||||
return CPU_LOONGSON3A;
|
||||
}else{
|
||||
return CPU_SICORTEX;
|
||||
}
|
||||
}
|
||||
//Check model name for Loongson3
|
||||
infile = fopen("/proc/cpuinfo", "r");
|
||||
p = (char *)NULL;
|
||||
while (fgets(buffer, sizeof(buffer), infile)){
|
||||
if (!strncmp("model name", buffer, 10)){
|
||||
p = strchr(buffer, ':') + 2;
|
||||
@@ -140,14 +105,16 @@ int detect(void){
|
||||
}
|
||||
fclose(infile);
|
||||
if(p != NULL){
|
||||
if (strstr(p, "Loongson-3A")){
|
||||
return CPU_LOONGSON3A;
|
||||
}else if(strstr(p, "Loongson-3B")){
|
||||
return CPU_LOONGSON3B;
|
||||
}
|
||||
if (strstr(p, "Loongson-3A3000") || strstr(p, "Loongson-3B3000")){
|
||||
return CPU_LOONGSON3R3;
|
||||
}else if(strstr(p, "Loongson-3A4000") || strstr(p, "Loongson-3B4000")){
|
||||
return CPU_LOONGSON3R4;
|
||||
} else{
|
||||
return CPU_SICORTEX;
|
||||
}
|
||||
#endif
|
||||
return CPU_UNKNOWN;
|
||||
}
|
||||
}
|
||||
|
||||
char *get_corename(void){
|
||||
@@ -159,10 +126,10 @@ void get_architecture(void){
|
||||
}
|
||||
|
||||
void get_subarchitecture(void){
|
||||
if(detect()==CPU_LOONGSON3A) {
|
||||
printf("LOONGSON3A");
|
||||
}else if(detect()==CPU_LOONGSON3B){
|
||||
printf("LOONGSON3B");
|
||||
if(detect()==CPU_LOONGSON3R3) {
|
||||
printf("LOONGSON3R3");
|
||||
}else if(detect()==CPU_LOONGSON3R4){
|
||||
printf("LOONGSON3R4");
|
||||
}else if(detect()==CPU_I6400){
|
||||
printf("I6400");
|
||||
}else if(detect()==CPU_P6600){
|
||||
@@ -179,8 +146,8 @@ void get_subdirname(void){
|
||||
}
|
||||
|
||||
void get_cpuconfig(void){
|
||||
if(detect()==CPU_LOONGSON3A) {
|
||||
printf("#define LOONGSON3A\n");
|
||||
if(detect()==CPU_LOONGSON3R3) {
|
||||
printf("#define LOONGSON3R3\n");
|
||||
printf("#define L1_DATA_SIZE 65536\n");
|
||||
printf("#define L1_DATA_LINESIZE 32\n");
|
||||
printf("#define L2_SIZE 512488\n");
|
||||
@@ -188,8 +155,8 @@ void get_cpuconfig(void){
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
||||
printf("#define DTB_SIZE 4096\n");
|
||||
printf("#define L2_ASSOCIATIVE 4\n");
|
||||
}else if(detect()==CPU_LOONGSON3B){
|
||||
printf("#define LOONGSON3B\n");
|
||||
}else if(detect()==CPU_LOONGSON3R4){
|
||||
printf("#define LOONGSON3R4\n");
|
||||
printf("#define L1_DATA_SIZE 65536\n");
|
||||
printf("#define L1_DATA_LINESIZE 32\n");
|
||||
printf("#define L2_SIZE 512488\n");
|
||||
@@ -237,10 +204,10 @@ void get_cpuconfig(void){
|
||||
}
|
||||
|
||||
void get_libname(void){
|
||||
if(detect()==CPU_LOONGSON3A) {
|
||||
printf("loongson3a\n");
|
||||
}else if(detect()==CPU_LOONGSON3B) {
|
||||
printf("loongson3b\n");
|
||||
if(detect()==CPU_LOONGSON3R3) {
|
||||
printf("loongson3r3\n");
|
||||
}else if(detect()==CPU_LOONGSON3R4) {
|
||||
printf("loongson3r4\n");
|
||||
}else if(detect()==CPU_I6400) {
|
||||
printf("i6400\n");
|
||||
}else if(detect()==CPU_P6600) {
|
||||
|
||||
41
cpuid_x86.c
41
cpuid_x86.c
@@ -1418,6 +1418,15 @@ int get_cpuname(void){
|
||||
case 9:
|
||||
case 8:
|
||||
switch (model) {
|
||||
case 12: // Tiger Lake
|
||||
if(support_avx512())
|
||||
return CPUTYPE_SKYLAKEX;
|
||||
if(support_avx2())
|
||||
return CPUTYPE_HASWELL;
|
||||
if(support_avx())
|
||||
return CPUTYPE_SANDYBRIDGE;
|
||||
else
|
||||
return CPUTYPE_NEHALEM;
|
||||
case 14: // Kaby Lake and refreshes
|
||||
if(support_avx2())
|
||||
return CPUTYPE_HASWELL;
|
||||
@@ -1436,6 +1445,15 @@ int get_cpuname(void){
|
||||
return CPUTYPE_SANDYBRIDGE;
|
||||
else
|
||||
return CPUTYPE_NEHALEM;
|
||||
case 7: // Rocket Lake
|
||||
if(support_avx512())
|
||||
return CPUTYPE_SKYLAKEX;
|
||||
if(support_avx2())
|
||||
return CPUTYPE_HASWELL;
|
||||
if(support_avx())
|
||||
return CPUTYPE_SANDYBRIDGE;
|
||||
else
|
||||
return CPUTYPE_NEHALEM;
|
||||
}
|
||||
break;
|
||||
}
|
||||
@@ -2014,6 +2032,19 @@ int get_coretype(void){
|
||||
#endif
|
||||
else
|
||||
return CORE_NEHALEM;
|
||||
case 7:// Rocket Lake
|
||||
#ifndef NO_AVX512
|
||||
if(support_avx512())
|
||||
return CORE_SKYLAKEX;
|
||||
#endif
|
||||
#ifndef NO_AVX2
|
||||
if(support_avx2())
|
||||
return CORE_HASWELL;
|
||||
#endif
|
||||
if(support_avx())
|
||||
return CORE_SANDYBRIDGE;
|
||||
else
|
||||
return CORE_NEHALEM;
|
||||
}
|
||||
case 5:
|
||||
switch (model) {
|
||||
@@ -2102,6 +2133,16 @@ int get_coretype(void){
|
||||
break;
|
||||
case 9:
|
||||
case 8:
|
||||
if (model == 12) { // Tiger Lake
|
||||
if(support_avx512())
|
||||
return CPUTYPE_SKYLAKEX;
|
||||
if(support_avx2())
|
||||
return CPUTYPE_HASWELL;
|
||||
if(support_avx())
|
||||
return CPUTYPE_SANDYBRIDGE;
|
||||
else
|
||||
return CPUTYPE_NEHALEM;
|
||||
}
|
||||
if (model == 14) { // Kaby Lake
|
||||
if(support_avx())
|
||||
#ifndef NO_AVX2
|
||||
|
||||
@@ -5,9 +5,18 @@ enable_language(Fortran)
|
||||
|
||||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DADD${BU} -DCBLAS")
|
||||
|
||||
if(WIN32)
|
||||
FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_cblas_helper.ps1
|
||||
"$ErrorActionPreference = \"Stop\"\n"
|
||||
"Get-Content $args[1] | & $args[0]\n"
|
||||
)
|
||||
set(test_helper powershell -ExecutionPolicy Bypass "${CMAKE_CURRENT_BINARY_DIR}/test_cblas_helper.ps1")
|
||||
else()
|
||||
FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_cblas_helper.sh
|
||||
"$1 < $2\n"
|
||||
)
|
||||
set(test_helper sh "${CMAKE_CURRENT_BINARY_DIR}/test_cblas_helper.sh")
|
||||
endif()
|
||||
|
||||
foreach(float_type ${FLOAT_TYPES})
|
||||
string(SUBSTRING ${float_type} 0 1 float_char_upper)
|
||||
@@ -21,7 +30,7 @@ foreach(float_type ${FLOAT_TYPES})
|
||||
c_${float_char}blas1.c)
|
||||
target_link_libraries(x${float_char}cblat1 ${OpenBLAS_LIBNAME})
|
||||
add_test(NAME "x${float_char}cblat1"
|
||||
COMMAND "${CMAKE_CURRENT_BINARY_DIR}/x${float_char}cblat1")
|
||||
COMMAND $<TARGET_FILE:x${float_char}cblat1>)
|
||||
|
||||
#level2
|
||||
add_executable(x${float_char}cblat2
|
||||
@@ -33,7 +42,7 @@ foreach(float_type ${FLOAT_TYPES})
|
||||
constant.c)
|
||||
target_link_libraries(x${float_char}cblat2 ${OpenBLAS_LIBNAME})
|
||||
add_test(NAME "x${float_char}cblat2"
|
||||
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_cblas_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/x${float_char}cblat2" "${PROJECT_SOURCE_DIR}/ctest/${float_char}in2")
|
||||
COMMAND ${test_helper} $<TARGET_FILE:x${float_char}cblat2> "${PROJECT_SOURCE_DIR}/ctest/${float_char}in2")
|
||||
|
||||
#level3
|
||||
add_executable(x${float_char}cblat3
|
||||
@@ -45,6 +54,6 @@ foreach(float_type ${FLOAT_TYPES})
|
||||
constant.c)
|
||||
target_link_libraries(x${float_char}cblat3 ${OpenBLAS_LIBNAME})
|
||||
add_test(NAME "x${float_char}cblat3"
|
||||
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_cblas_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/x${float_char}cblat3" "${PROJECT_SOURCE_DIR}/ctest/${float_char}in3")
|
||||
COMMAND ${test_helper} $<TARGET_FILE:x${float_char}cblat3> "${PROJECT_SOURCE_DIR}/ctest/${float_char}in3")
|
||||
|
||||
endforeach()
|
||||
|
||||
@@ -212,6 +212,9 @@ ifeq ($(C_COMPILER), CLANG)
|
||||
CEXTRALIB = -lomp
|
||||
endif
|
||||
endif
|
||||
ifeq ($(F_COMPILER), NAG)
|
||||
CEXTRALIB = -lgomp
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(BUILD_SINGLE),1)
|
||||
|
||||
@@ -339,8 +339,10 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
||||
#else
|
||||
if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N;
|
||||
else
|
||||
if (min_jj >= 2*GEMM_UNROLL_N) min_jj = 2*GEMM_UNROLL_N;
|
||||
/*
|
||||
if (min_jj >= 2*GEMM_UNROLL_N) min_jj = 2*GEMM_UNROLL_N;
|
||||
else
|
||||
*/
|
||||
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
||||
#endif
|
||||
|
||||
|
||||
@@ -373,8 +373,10 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
||||
#else
|
||||
if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N;
|
||||
else
|
||||
/*
|
||||
if (min_jj >= 2*GEMM_UNROLL_N) min_jj = 2*GEMM_UNROLL_N;
|
||||
else
|
||||
*/
|
||||
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
||||
#endif
|
||||
/* Copy part of local region of B into workspace */
|
||||
|
||||
@@ -24,10 +24,14 @@ else
|
||||
ifeq ($(ARCH),zarch)
|
||||
COMMONOBJS += dynamic_zarch.$(SUFFIX)
|
||||
else
|
||||
ifeq ($(ARCH),mips64)
|
||||
COMMONOBJS += dynamic_mips64.$(SUFFIX)
|
||||
else
|
||||
COMMONOBJS += dynamic.$(SUFFIX)
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
else
|
||||
COMMONOBJS += parameter.$(SUFFIX)
|
||||
endif
|
||||
@@ -92,10 +96,14 @@ else
|
||||
ifeq ($(ARCH),zarch)
|
||||
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_zarch.$(SUFFIX)
|
||||
else
|
||||
ifeq ($(ARCH),mips64)
|
||||
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_mips64.$(SUFFIX)
|
||||
else
|
||||
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic.$(SUFFIX)
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
else
|
||||
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) parameter.$(SUFFIX)
|
||||
endif
|
||||
|
||||
@@ -967,9 +967,11 @@ void goto_set_num_threads(int num_threads) {
|
||||
blas_cpu_number = num_threads;
|
||||
|
||||
#if defined(ARCH_MIPS64)
|
||||
#ifndef DYNAMIC_ARCH
|
||||
//set parameters for different number of threads.
|
||||
blas_set_parameter();
|
||||
#endif
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
@@ -1022,38 +1024,39 @@ int BLASFUNC(blas_thread_shutdown)(void){
|
||||
|
||||
int i;
|
||||
|
||||
if (!blas_server_avail) return 0;
|
||||
|
||||
LOCK_COMMAND(&server_lock);
|
||||
|
||||
for (i = 0; i < blas_num_threads - 1; i++) {
|
||||
if (blas_server_avail) {
|
||||
|
||||
for (i = 0; i < blas_num_threads - 1; i++) {
|
||||
|
||||
|
||||
pthread_mutex_lock (&thread_status[i].lock);
|
||||
pthread_mutex_lock (&thread_status[i].lock);
|
||||
|
||||
atomic_store_queue(&thread_status[i].queue, (blas_queue_t *)-1);
|
||||
thread_status[i].status = THREAD_STATUS_WAKEUP;
|
||||
pthread_cond_signal (&thread_status[i].wakeup);
|
||||
atomic_store_queue(&thread_status[i].queue, (blas_queue_t *)-1);
|
||||
thread_status[i].status = THREAD_STATUS_WAKEUP;
|
||||
pthread_cond_signal (&thread_status[i].wakeup);
|
||||
|
||||
pthread_mutex_unlock(&thread_status[i].lock);
|
||||
pthread_mutex_unlock(&thread_status[i].lock);
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
for(i = 0; i < blas_num_threads - 1; i++){
|
||||
pthread_join(blas_threads[i], NULL);
|
||||
}
|
||||
for(i = 0; i < blas_num_threads - 1; i++){
|
||||
pthread_join(blas_threads[i], NULL);
|
||||
}
|
||||
|
||||
for(i = 0; i < blas_num_threads - 1; i++){
|
||||
pthread_mutex_destroy(&thread_status[i].lock);
|
||||
pthread_cond_destroy (&thread_status[i].wakeup);
|
||||
}
|
||||
for(i = 0; i < blas_num_threads - 1; i++){
|
||||
pthread_mutex_destroy(&thread_status[i].lock);
|
||||
pthread_cond_destroy (&thread_status[i].wakeup);
|
||||
}
|
||||
|
||||
#ifdef NEED_STACKATTR
|
||||
pthread_attr_destory(&attr);
|
||||
pthread_attr_destroy(&attr);
|
||||
#endif
|
||||
|
||||
blas_server_avail = 0;
|
||||
blas_server_avail = 0;
|
||||
|
||||
}
|
||||
UNLOCK_COMMAND(&server_lock);
|
||||
|
||||
return 0;
|
||||
|
||||
@@ -644,6 +644,21 @@ static gotoblas_t *get_coretype(void){
|
||||
return NULL;
|
||||
case 9:
|
||||
case 8:
|
||||
if (model == 12) { // Tiger Lake
|
||||
if (support_avx512())
|
||||
return &gotoblas_SKYLAKEX;
|
||||
if(support_avx2()){
|
||||
openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK);
|
||||
return &gotoblas_HASWELL;
|
||||
}
|
||||
if(support_avx()) {
|
||||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
|
||||
return &gotoblas_SANDYBRIDGE;
|
||||
} else {
|
||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
||||
return &gotoblas_NEHALEM;
|
||||
}
|
||||
}
|
||||
if (model == 14 ) { // Kaby Lake, Coffee Lake
|
||||
if(support_avx2())
|
||||
return &gotoblas_HASWELL;
|
||||
@@ -656,7 +671,7 @@ static gotoblas_t *get_coretype(void){
|
||||
}
|
||||
}
|
||||
case 10:
|
||||
if (model == 5 || model == 6) {
|
||||
if (model == 5 || model == 6) {
|
||||
if(support_avx2())
|
||||
return &gotoblas_HASWELL;
|
||||
if(support_avx()) {
|
||||
@@ -666,7 +681,20 @@ static gotoblas_t *get_coretype(void){
|
||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
||||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}
|
||||
}
|
||||
if (model == 7) {
|
||||
if (support_avx512())
|
||||
return &gotoblas_SKYLAKEX;
|
||||
if(support_avx2())
|
||||
return &gotoblas_HASWELL;
|
||||
if(support_avx()) {
|
||||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
|
||||
return &gotoblas_SANDYBRIDGE;
|
||||
} else {
|
||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
||||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
case 0xf:
|
||||
|
||||
@@ -43,6 +43,63 @@
|
||||
#endif
|
||||
|
||||
extern gotoblas_t gotoblas_ARMV8;
|
||||
#ifdef DYNAMIC_LIST
|
||||
#ifdef DYN_CORTEXA53
|
||||
extern gotoblas_t gotoblas_CORTEXA53;
|
||||
#else
|
||||
#define gotoblas_CORTEXA53 gotoblas_ARMV8
|
||||
#endif
|
||||
#ifdef DYN_CORTEXA57
|
||||
extern gotoblas_t gotoblas_CORTEXA57;
|
||||
#else
|
||||
#define gotoblas_CORTEXA57 gotoblas_ARMV8
|
||||
#endif
|
||||
#ifdef DYN_CORTEXA72
|
||||
extern gotoblas_t gotoblas_CORTEXA72;
|
||||
#else
|
||||
#define gotoblas_CORTEXA72 gotoblas_ARMV8
|
||||
#endif
|
||||
#ifdef DYN_CORTEXA73
|
||||
extern gotoblas_t gotoblas_CORTEXA73;
|
||||
#else
|
||||
#define gotoblas_CORTEXA73 gotoblas_ARMV8
|
||||
#endif
|
||||
#ifdef DYN_FALKOR
|
||||
extern gotoblas_t gotoblas_FALKOR;
|
||||
#else
|
||||
#define gotoblas_FALKOR gotoblas_ARMV8
|
||||
#endif
|
||||
#ifdef DYN_TSV110
|
||||
extern gotoblas_t gotoblas_TSV110;
|
||||
#else
|
||||
#define gotoblas_TSV110 gotoblas_ARMV8
|
||||
#endif
|
||||
#ifdef DYN_THUNDERX
|
||||
extern gotoblas_t gotoblas_THUNDERX;
|
||||
#else
|
||||
#define gotoblas_THUNDERX gotoblas_ARMV8
|
||||
#endif
|
||||
#ifdef DYN_THUNDERX2T99
|
||||
extern gotoblas_t gotoblas_THUNDERX2T99;
|
||||
#else
|
||||
#define gotoblas_THUNDERX2T99 gotoblas_ARMV8
|
||||
#endif
|
||||
#ifdef DYN_THUNDERX3T110
|
||||
extern gotoblas_t gotoblas_THUNDERX3T110;
|
||||
#else
|
||||
#define gotoblas_THUNDERX3T110 gotoblas_ARMV8
|
||||
#endif
|
||||
#ifdef DYN_EMAG8180
|
||||
extern gotoblas_t gotoblas_EMAG8180;
|
||||
#else
|
||||
#define gotoblas_EMAG8180 gotoblas_ARMV8
|
||||
#endif
|
||||
#ifdef DYN_NEOVERSEN1
|
||||
extern gotoblas_t gotoblas_NEOVERSEN1;
|
||||
#else
|
||||
#define gotoblas_NEOVERSEN1 gotoblas_ARMV8
|
||||
#endif
|
||||
#else
|
||||
extern gotoblas_t gotoblas_CORTEXA53;
|
||||
extern gotoblas_t gotoblas_CORTEXA57;
|
||||
extern gotoblas_t gotoblas_CORTEXA72;
|
||||
@@ -54,6 +111,7 @@ extern gotoblas_t gotoblas_TSV110;
|
||||
extern gotoblas_t gotoblas_EMAG8180;
|
||||
extern gotoblas_t gotoblas_NEOVERSEN1;
|
||||
extern gotoblas_t gotoblas_THUNDERX3T110;
|
||||
#endif
|
||||
|
||||
extern void openblas_warning(int verbose, const char * msg);
|
||||
|
||||
@@ -68,7 +126,7 @@ extern void openblas_warning(int verbose, const char * msg);
|
||||
#endif
|
||||
|
||||
#define get_cpu_ftr(id, var) ({ \
|
||||
__asm__("mrs %0, "#id : "=r" (var)); \
|
||||
__asm__ ("mrs %0, "#id : "=r" (var)); \
|
||||
})
|
||||
|
||||
static char *corename[] = {
|
||||
|
||||
230
driver/others/dynamic_mips64.c
Normal file
230
driver/others/dynamic_mips64.c
Normal file
@@ -0,0 +1,230 @@
|
||||
/*****************************************************************************
|
||||
Copyright (c) 2020, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written
|
||||
permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
**********************************************************************************/
|
||||
|
||||
#include <sys/wait.h>
|
||||
#include <stdio.h>
|
||||
#include <unistd.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <sys/resource.h>
|
||||
#include "common.h"
|
||||
|
||||
extern gotoblas_t gotoblas_LOONGSON3R3;
|
||||
extern gotoblas_t gotoblas_LOONGSON3R4;
|
||||
|
||||
extern void openblas_warning(int verbose, const char * msg);
|
||||
|
||||
#define NUM_CORETYPES 2
|
||||
|
||||
static char *corename[] = {
|
||||
"loongson3r3",
|
||||
"loongson3r4",
|
||||
"UNKNOWN"
|
||||
};
|
||||
|
||||
char *gotoblas_corename(void) {
|
||||
if (gotoblas == &gotoblas_LOONGSON3R3) return corename[0];
|
||||
if (gotoblas == &gotoblas_LOONGSON3R4) return corename[1];
|
||||
return corename[NUM_CORETYPES];
|
||||
}
|
||||
|
||||
static gotoblas_t *force_coretype(char *coretype) {
|
||||
int i;
|
||||
int found = -1;
|
||||
char message[128];
|
||||
|
||||
for ( i=0 ; i < NUM_CORETYPES; i++)
|
||||
{
|
||||
if (!strncasecmp(coretype, corename[i], 20))
|
||||
{
|
||||
found = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
switch (found)
|
||||
{
|
||||
case 0: return (&gotoblas_LOONGSON3R3);
|
||||
case 1: return (&gotoblas_LOONGSON3R4);
|
||||
}
|
||||
snprintf(message, 128, "Core not found: %s\n", coretype);
|
||||
openblas_warning(1, message);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
#define MMI_MASK 0x00000010
|
||||
#define MSA_MASK 0x00000020
|
||||
|
||||
int fd[2];
|
||||
int support_cpucfg;
|
||||
|
||||
static void handler(int signum)
|
||||
{
|
||||
close(fd[1]);
|
||||
exit(1);
|
||||
}
|
||||
|
||||
/* Brief : Function to check if cpucfg supported on loongson
|
||||
* Return: 1 supported
|
||||
* 0 not supported
|
||||
*/
|
||||
static int cpucfg_test(void) {
|
||||
pid_t pid;
|
||||
int status = 0;
|
||||
|
||||
support_cpucfg = 0;
|
||||
pipe(fd);
|
||||
pid = fork();
|
||||
if (pid == 0) { /* Subprocess */
|
||||
struct sigaction act;
|
||||
close(fd[0]);
|
||||
/* Set signal action for SIGILL. */
|
||||
act.sa_handler = handler;
|
||||
sigaction(SIGILL,&act,NULL);
|
||||
|
||||
/* Execute cpucfg in subprocess. */
|
||||
__asm__ volatile(
|
||||
".insn \n\t"
|
||||
".word (0xc8080118) \n\t"
|
||||
:::
|
||||
);
|
||||
support_cpucfg = 1;
|
||||
write(fd[1],&support_cpucfg,sizeof(support_cpucfg));
|
||||
close(fd[1]);
|
||||
exit(0);
|
||||
} else if (pid > 0){ /* Parent process*/
|
||||
close(fd[1]);
|
||||
if ((waitpid(pid,&status,0) <= 0) ||
|
||||
(read(fd[0],&support_cpucfg,sizeof(support_cpucfg)) <= 0))
|
||||
support_cpucfg = 0;
|
||||
close(fd[0]);
|
||||
} else {
|
||||
support_cpucfg = 0;
|
||||
}
|
||||
|
||||
return support_cpucfg;
|
||||
}
|
||||
|
||||
static gotoblas_t *get_coretype_from_cpucfg(void) {
|
||||
int flag = 0;
|
||||
__asm__ volatile(
|
||||
".insn \n\t"
|
||||
"dli $8, 0x01 \n\t"
|
||||
".word (0xc9084918) \n\t"
|
||||
"usw $9, 0x00(%0) \n\t"
|
||||
:
|
||||
: "r"(&flag)
|
||||
: "memory"
|
||||
);
|
||||
if (flag & MSA_MASK)
|
||||
return (&gotoblas_LOONGSON3R4);
|
||||
if (flag & MMI_MASK)
|
||||
return (&gotoblas_LOONGSON3R3);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static gotoblas_t *get_coretype_from_cpuinfo(void) {
|
||||
#ifdef linux
|
||||
FILE *infile;
|
||||
char buffer[512], *p;
|
||||
|
||||
p = (char *)NULL;
|
||||
//Check model name for Loongson3
|
||||
infile = fopen("/proc/cpuinfo", "r");
|
||||
while (fgets(buffer, sizeof(buffer), infile)){
|
||||
if (!strncmp("model name", buffer, 10)){
|
||||
p = strchr(buffer, ':') + 2;
|
||||
break;
|
||||
}
|
||||
}
|
||||
fclose(infile);
|
||||
if(p != NULL){
|
||||
if (strstr(p, "Loongson-3A3000") || strstr(p, "Loongson-3B3000"))
|
||||
return (&gotoblas_LOONGSON3R3);
|
||||
else if(strstr(p, "Loongson-3A4000") || strstr(p, "Loongson-3B4000"))
|
||||
return (&gotoblas_LOONGSON3R4);
|
||||
else
|
||||
return NULL;
|
||||
}
|
||||
#endif
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* Auto-detect the kernel table: use cpucfg when the probe says the
 * instruction is available, otherwise fall back to /proc/cpuinfo. */
static gotoblas_t *get_coretype(void) {
  if (cpucfg_test() == 1) {
    return get_coretype_from_cpucfg();
  }
  return get_coretype_from_cpuinfo();
}
|
||||
|
||||
void gotoblas_dynamic_init(void) {
|
||||
char coremsg[128];
|
||||
char coren[22];
|
||||
char *p;
|
||||
|
||||
if (gotoblas) return;
|
||||
|
||||
p = getenv("OPENBLAS_CORETYPE");
|
||||
if ( p )
|
||||
{
|
||||
gotoblas = force_coretype(p);
|
||||
}
|
||||
else
|
||||
{
|
||||
gotoblas = get_coretype();
|
||||
}
|
||||
|
||||
if (gotoblas == NULL)
|
||||
{
|
||||
snprintf(coremsg, 128, "Falling back to loongson3r3 core\n");
|
||||
openblas_warning(1, coremsg);
|
||||
gotoblas = &gotoblas_LOONGSON3R3;
|
||||
}
|
||||
|
||||
if (gotoblas && gotoblas->init) {
|
||||
strncpy(coren, gotoblas_corename(), 20);
|
||||
sprintf(coremsg, "Core: %s\n", coren);
|
||||
openblas_warning(2, coremsg);
|
||||
gotoblas -> init();
|
||||
} else {
|
||||
openblas_warning(0, "OpenBLAS : Architecture Initialization failed. No initialization function found.\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
void gotoblas_dynamic_quit(void) {
|
||||
gotoblas = NULL;
|
||||
}
|
||||
@@ -27,7 +27,9 @@ static char *corename[] = {
|
||||
#define NUM_CORETYPES 4
|
||||
|
||||
char *gotoblas_corename(void) {
|
||||
#ifndef C_PGI
|
||||
if (gotoblas == &gotoblas_POWER6) return corename[1];
|
||||
#endif
|
||||
if (gotoblas == &gotoblas_POWER8) return corename[2];
|
||||
#if (!defined __GNUC__) || ( __GNUC__ >= 6)
|
||||
if (gotoblas == &gotoblas_POWER9) return corename[3];
|
||||
@@ -38,10 +40,164 @@ char *gotoblas_corename(void) {
|
||||
return corename[0];
|
||||
}
|
||||
|
||||
#if defined(__clang__)
|
||||
static int __builtin_cpu_supports(char* arg)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(C_PGI) || defined(__clang__)
|
||||
/*
|
||||
* NV HPC compilers do not yet implement __builtin_cpu_is().
|
||||
* Fake a version here for use in the CPU detection code below.
|
||||
*
|
||||
* Strategy here is to first check the CPU to see what it actually is,
|
||||
* and then test the input to see if what the CPU actually is matches
|
||||
* what was requested.
|
||||
*/
|
||||
|
||||
#include <string.h>
|
||||
|
||||
/*
|
||||
* Define POWER processor version table.
|
||||
*
|
||||
* NOTE NV HPC SDK compilers only support POWER8 and POWER9 at this time
|
||||
*/
|
||||
|
||||
#define CPU_UNKNOWN 0
|
||||
#define CPU_POWER5 5
|
||||
#define CPU_POWER6 6
|
||||
#define CPU_POWER8 8
|
||||
#define CPU_POWER9 9
|
||||
#define CPU_POWER10 10
|
||||
|
||||
static struct {
|
||||
uint32_t pvr_mask;
|
||||
uint32_t pvr_value;
|
||||
const char* cpu_name;
|
||||
uint32_t cpu_type;
|
||||
} pvrPOWER [] = {
|
||||
|
||||
{ /* POWER6 in P5+ mode; 2.04-compliant processor */
|
||||
.pvr_mask = 0xffffffff,
|
||||
.pvr_value = 0x0f000001,
|
||||
.cpu_name = "POWER5+",
|
||||
.cpu_type = CPU_POWER5,
|
||||
},
|
||||
|
||||
{ /* Power6 aka POWER6X*/
|
||||
.pvr_mask = 0xffff0000,
|
||||
.pvr_value = 0x003e0000,
|
||||
.cpu_name = "POWER6 (raw)",
|
||||
.cpu_type = CPU_POWER6,
|
||||
},
|
||||
|
||||
{ /* Power7 */
|
||||
.pvr_mask = 0xffff0000,
|
||||
.pvr_value = 0x003f0000,
|
||||
.cpu_name = "POWER7 (raw)",
|
||||
.cpu_type = CPU_POWER6,
|
||||
},
|
||||
|
||||
{ /* Power7+ */
|
||||
.pvr_mask = 0xffff0000,
|
||||
.pvr_value = 0x004A0000,
|
||||
.cpu_name = "POWER7+ (raw)",
|
||||
.cpu_type = CPU_POWER6,
|
||||
},
|
||||
|
||||
{ /* Power8E */
|
||||
.pvr_mask = 0xffff0000,
|
||||
.pvr_value = 0x004b0000,
|
||||
.cpu_name = "POWER8E (raw)",
|
||||
.cpu_type = CPU_POWER8,
|
||||
},
|
||||
|
||||
{ /* Power8NVL */
|
||||
.pvr_mask = 0xffff0000,
|
||||
.pvr_value = 0x004c0000,
|
||||
.cpu_name = "POWER8NVL (raw)",
|
||||
.cpu_type = CPU_POWER8,
|
||||
},
|
||||
|
||||
{ /* Power8 */
|
||||
.pvr_mask = 0xffff0000,
|
||||
.pvr_value = 0x004d0000,
|
||||
.cpu_name = "POWER8 (raw)",
|
||||
.cpu_type = CPU_POWER8,
|
||||
},
|
||||
|
||||
{ /* Power9 DD2.0 */
|
||||
.pvr_mask = 0xffffefff,
|
||||
.pvr_value = 0x004e0200,
|
||||
.cpu_name = "POWER9 (raw)",
|
||||
.cpu_type = CPU_POWER9,
|
||||
},
|
||||
|
||||
{ /* Power9 DD 2.1 */
|
||||
.pvr_mask = 0xffffefff,
|
||||
.pvr_value = 0x004e0201,
|
||||
.cpu_name = "POWER9 (raw)",
|
||||
.cpu_type = CPU_POWER9,
|
||||
},
|
||||
|
||||
{ /* Power9 DD2.2 or later */
|
||||
.pvr_mask = 0xffff0000,
|
||||
.pvr_value = 0x004e0000,
|
||||
.cpu_name = "POWER9 (raw)",
|
||||
.cpu_type = CPU_POWER9,
|
||||
},
|
||||
|
||||
{ /* Power10 */
|
||||
.pvr_mask = 0xffff0000,
|
||||
.pvr_value = 0x00800000,
|
||||
.cpu_name = "POWER10 (raw)",
|
||||
.cpu_type = CPU_POWER10,
|
||||
},
|
||||
|
||||
{ /* End of table, pvr_mask and pvr_value must be zero */
|
||||
.pvr_mask = 0x0,
|
||||
.pvr_value = 0x0,
|
||||
.cpu_name = "Unknown",
|
||||
.cpu_type = CPU_UNKNOWN,
|
||||
},
|
||||
};
|
||||
|
||||
static int __builtin_cpu_is(const char *cpu) {
|
||||
int i;
|
||||
uint32_t pvr;
|
||||
uint32_t cpu_type;
|
||||
|
||||
asm("mfpvr %0" : "=r"(pvr));
|
||||
|
||||
for (i = 0 ; i < sizeof pvrPOWER / sizeof *pvrPOWER ; ++i) {
|
||||
if ((pvr & pvrPOWER[i].pvr_mask) == pvrPOWER[i].pvr_value) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
#if defined(DEBUG)
|
||||
printf("%s: returning CPU=%s, cpu_type=%p\n", __func__,
|
||||
pvrPOWER[i].cpu_name, pvrPOWER[i].cpu_type);
|
||||
#endif
|
||||
cpu_type = pvrPOWER[i].cpu_type;
|
||||
|
||||
if (!strcmp(cpu, "power8"))
|
||||
return cpu_type == CPU_POWER8;
|
||||
if (!strcmp(cpu, "power9"))
|
||||
return cpu_type == CPU_POWER9;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif /* C_PGI */
|
||||
|
||||
static gotoblas_t *get_coretype(void) {
|
||||
|
||||
#ifndef C_PGI
|
||||
if (__builtin_cpu_is("power6") || __builtin_cpu_is("power6x"))
|
||||
return &gotoblas_POWER6;
|
||||
#endif
|
||||
if (__builtin_cpu_is("power8"))
|
||||
return &gotoblas_POWER8;
|
||||
#if (!defined __GNUC__) || ( __GNUC__ >= 6)
|
||||
@@ -52,6 +208,11 @@ static gotoblas_t *get_coretype(void) {
|
||||
if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma"))
|
||||
return &gotoblas_POWER10;
|
||||
#endif
|
||||
/* Fall back to the POWER9 implementation if the toolchain is too old or the MMA feature is not set */
|
||||
#if (!defined __GNUC__) || ( __GNUC__ >= 11) || (__GNUC__ == 10 && __GNUC_MINOR__ >= 2)
|
||||
if (__builtin_cpu_is("power10"))
|
||||
return &gotoblas_POWER9;
|
||||
#endif
|
||||
return NULL;
|
||||
}
|
||||
|
||||
@@ -72,7 +233,9 @@ static gotoblas_t *force_coretype(char * coretype) {
|
||||
|
||||
switch (found)
|
||||
{
|
||||
#ifndef C_PGI
|
||||
case 1: return (&gotoblas_POWER6);
|
||||
#endif
|
||||
case 2: return (&gotoblas_POWER8);
|
||||
#if (!defined __GNUC__) || ( __GNUC__ >= 6)
|
||||
case 3: return (&gotoblas_POWER9);
|
||||
|
||||
@@ -222,11 +222,11 @@ int get_num_procs(void);
|
||||
#else
|
||||
int get_num_procs(void) {
|
||||
static int nums = 0;
|
||||
|
||||
#if defined(__GLIBC_PREREQ)
|
||||
cpu_set_t cpuset,*cpusetp;
|
||||
size_t size;
|
||||
int ret;
|
||||
|
||||
#if defined(__GLIBC_PREREQ)
|
||||
#if !__GLIBC_PREREQ(2, 7)
|
||||
int i;
|
||||
#if !__GLIBC_PREREQ(2, 6)
|
||||
@@ -1241,7 +1241,7 @@ UNLOCK_COMMAND(&alloc_lock);
|
||||
|
||||
func = &memoryalloc[0];
|
||||
|
||||
while ((func != NULL) && (map_address == (void *) -1)) {
|
||||
while ((*func != NULL) && (map_address == (void *) -1)) {
|
||||
|
||||
map_address = (*func)((void *)base_address);
|
||||
|
||||
@@ -1619,10 +1619,12 @@ static int on_process_term(void)
|
||||
#else
|
||||
#pragma data_seg(".CRT$XLB")
|
||||
#endif
|
||||
static void (APIENTRY *dll_callback)(HINSTANCE h, DWORD ul_reason_for_call, PVOID pv) = DllMain;
|
||||
|
||||
#ifdef _WIN64
|
||||
static const PIMAGE_TLS_CALLBACK dll_callback(HINSTANCE h, DWORD ul_reason_for_call, PVOID pv) = DllMain;
|
||||
#pragma const_seg()
|
||||
#else
|
||||
static void (APIENTRY *dll_callback)(HINSTANCE h, DWORD ul_reason_for_call, PVOID pv) = DllMain;
|
||||
#pragma data_seg()
|
||||
#endif
|
||||
|
||||
@@ -1631,10 +1633,12 @@ static void (APIENTRY *dll_callback)(HINSTANCE h, DWORD ul_reason_for_call, PVOI
|
||||
#else
|
||||
#pragma data_seg(".CRT$XTU")
|
||||
#endif
|
||||
static int(*p_process_term)(void) = on_process_term;
|
||||
|
||||
#ifdef _WIN64
|
||||
static const int(*p_process_term)(void) = on_process_term;
|
||||
#pragma const_seg()
|
||||
#else
|
||||
static int(*p_process_term)(void) = on_process_term;
|
||||
#pragma data_seg()
|
||||
#endif
|
||||
#endif
|
||||
@@ -1668,16 +1672,23 @@ void gotoblas_dummy_for_PGI(void) {
|
||||
#ifndef MEM_LARGE_PAGES
|
||||
#define MEM_LARGE_PAGES 0x20000000
|
||||
#endif
|
||||
#else
|
||||
#elif !defined(OS_EMBEDDED)
|
||||
#define ALLOC_MMAP
|
||||
#define ALLOC_MALLOC
|
||||
#else
|
||||
#define ALLOC_MALLOC
|
||||
|
||||
inline int puts(const char *str) { return 0; }
|
||||
inline int printf(const char *format, ...) { return 0; }
|
||||
inline char *getenv(const char *name) { return ""; }
|
||||
inline int atoi(const char *str) { return 0; }
|
||||
#endif
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <fcntl.h>
|
||||
|
||||
#if !defined(OS_WINDOWS) || defined(OS_CYGWIN_NT)
|
||||
#if (!defined(OS_WINDOWS) || defined(OS_CYGWIN_NT)) && !defined(OS_EMBEDDED)
|
||||
#include <sys/mman.h>
|
||||
#ifndef NO_SYSV_IPC
|
||||
#include <sys/shm.h>
|
||||
|
||||
@@ -717,7 +717,7 @@ void blas_set_parameter(void){
|
||||
|
||||
#if defined(ARCH_MIPS64)
|
||||
void blas_set_parameter(void){
|
||||
#if defined(LOONGSON3A)
|
||||
#if defined(LOONGSON3R3) || defined(LOONGSON3R4)
|
||||
#ifdef SMP
|
||||
if(blas_num_threads == 1){
|
||||
#endif
|
||||
@@ -731,20 +731,6 @@ void blas_set_parameter(void){
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(LOONGSON3B)
|
||||
#ifdef SMP
|
||||
if(blas_num_threads == 1 || blas_num_threads == 2){
|
||||
#endif
|
||||
//single thread
|
||||
dgemm_r = 640;
|
||||
#ifdef SMP
|
||||
}else{
|
||||
//multi thread
|
||||
dgemm_r = 160;
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
#!/usr/bin/perl
|
||||
#!/usr/bin/env perl
|
||||
|
||||
# Changelog
|
||||
# 2017/09/03 staticfloat
|
||||
|
||||
52
f_check
52
f_check
@@ -1,4 +1,4 @@
|
||||
#!/usr/bin/perl
|
||||
#!/usr/bin/env perl
|
||||
|
||||
$hostos = `uname -s | sed -e s/\-.*//`; chop($hostos);
|
||||
|
||||
@@ -32,9 +32,9 @@ if ($compiler eq "") {
|
||||
"xlf95", "xlf90", "xlf",
|
||||
"ppuf77", "ppuf95", "ppuf90", "ppuxlf",
|
||||
"pathf90", "pathf95",
|
||||
"pgf95", "pgf90", "pgf77",
|
||||
"pgf95", "pgf90", "pgf77", "pgfortran", "nvfortran",
|
||||
"flang", "egfortran",
|
||||
"ifort");
|
||||
"ifort", "nagfor");
|
||||
|
||||
OUTER:
|
||||
foreach $lists (@lists) {
|
||||
@@ -64,7 +64,9 @@ if ($compiler eq "") {
|
||||
if (!$?) {
|
||||
|
||||
$data = `$compiler -O2 -S ftest.f > /dev/null 2>&1 && cat ftest.s && rm -f ftest.s`;
|
||||
|
||||
if ($data eq "") {
|
||||
$data = `$compiler -O2 -S ftest.f > /dev/null 2>&1 && cat ftest.c && rm -f ftest.c`;
|
||||
}
|
||||
if ($data =~ /zhoge_/) {
|
||||
$bu = "_";
|
||||
}
|
||||
@@ -76,6 +78,7 @@ if ($compiler eq "") {
|
||||
|
||||
} elsif ($data =~ /GNU/ || $data =~ /GCC/ ) {
|
||||
|
||||
$data =~ s/\(+.*?\)+//g;
|
||||
$data =~ /(\d+)\.(\d+).(\d+)/;
|
||||
$major = $1;
|
||||
$minor = $2;
|
||||
@@ -87,7 +90,7 @@ if ($compiler eq "") {
|
||||
if ($compiler =~ /flang/) {
|
||||
$vendor = FLANG;
|
||||
$openmp = "-fopenmp";
|
||||
} elsif ($compiler =~ /pgf/) {
|
||||
} elsif ($compiler =~ /pgf/ || $compiler =~ /nvf/) {
|
||||
$vendor = PGI;
|
||||
$openmp = "-mp";
|
||||
} else {
|
||||
@@ -123,7 +126,7 @@ if ($compiler eq "") {
|
||||
$openmp = "-mp";
|
||||
}
|
||||
|
||||
if ($data =~ /PGF/) {
|
||||
if ($data =~ /PGF/ || $data =~ /NVF/) {
|
||||
$vendor = PGI;
|
||||
$openmp = "-mp";
|
||||
}
|
||||
@@ -133,8 +136,16 @@ if ($compiler eq "") {
|
||||
$openmp = "-openmp";
|
||||
}
|
||||
|
||||
if ($data =~ /NAG/) {
|
||||
$vendor = NAG;
|
||||
$openmp = "-openmp";
|
||||
}
|
||||
|
||||
# for embedded underscore name, e.g. zho_ge, it may append 2 underscores.
|
||||
$data = `$compiler -O2 -S ftest3.f > /dev/null 2>&1 && cat ftest3.s && rm -f ftest3.s`;
|
||||
if ($data eq "") {
|
||||
$data = `$compiler -O2 -S ftest3.f > /dev/null 2>&1 && cat ftest3.c && rm -f ftest3.c`;
|
||||
}
|
||||
if ($data =~ / zho_ge__/) {
|
||||
$need2bu = 1;
|
||||
}
|
||||
@@ -177,7 +188,7 @@ if ($compiler eq "") {
|
||||
$openmp = "-mp";
|
||||
}
|
||||
|
||||
if ($compiler =~ /pgf/) {
|
||||
if ($compiler =~ /pgf/ || $compiler =~ /nvf/) {
|
||||
$vendor = PGI;
|
||||
$bu = "_";
|
||||
$openmp = "-mp";
|
||||
@@ -222,6 +233,12 @@ if ($compiler eq "") {
|
||||
$openmp = "-fopenmp";
|
||||
}
|
||||
|
||||
if ($compiler =~ /nagfor/) {
|
||||
$vendor = NAG;
|
||||
$bu = "_";
|
||||
$openmp = "-openmp";
|
||||
}
|
||||
|
||||
if ($vendor eq "") {
|
||||
$nofortran = 1;
|
||||
$compiler = "gfortran";
|
||||
@@ -275,14 +292,20 @@ if (!$?) {
|
||||
if ($?) {
|
||||
$link = `$compiler $openmp -mabi=64 -v ftest2.f 2>&1 && rm -f a.out a.exe`;
|
||||
}
|
||||
#For nagfor
|
||||
if ($?) {
|
||||
$link = `$compiler $openmp -dryrun ftest2.f 2>&1 && rm -f a.out a.exe`;
|
||||
}
|
||||
$binary = "" if ($?);
|
||||
}
|
||||
|
||||
if ($binary eq "") {
|
||||
$link = `$compiler $openmp -v ftest2.f 2>&1 && rm -f a.out a.exe`;
|
||||
}
|
||||
}
|
||||
|
||||
if ( $vendor eq "NAG") {
|
||||
$link = `$compiler $openmp -dryrun ftest2.f 2>&1 && rm -f a.out a.exe`;
|
||||
}
|
||||
$linker_L = "";
|
||||
$linker_l = "";
|
||||
$linker_a = "";
|
||||
@@ -330,12 +353,13 @@ if ($link ne "") {
|
||||
$flags =~ s/\@/\,/g;
|
||||
$linker_L .= "-Wl,". $flags . " " ;
|
||||
}
|
||||
if ($flags =~ /-lgomp/ && $CC =~ /clang/) {
|
||||
if ($flags =~ /-lgomp/ && $ENV{"CC"} =~ /clang/) {
|
||||
$flags = "-lomp";
|
||||
}
|
||||
|
||||
if (
|
||||
($flags =~ /^\-l/)
|
||||
&& ($flags !~ /ibrary/)
|
||||
&& ($flags !~ /gfortranbegin/)
|
||||
&& ($flags !~ /frtbegin/)
|
||||
&& ($flags !~ /pathfstart/)
|
||||
@@ -352,6 +376,16 @@ if ($link ne "") {
|
||||
$linker_l .= $flags . " ";
|
||||
}
|
||||
|
||||
if ( $flags =~ /quickfit.o/ && $vendor == NAG) {
|
||||
$linker_l .= $flags . " ";
|
||||
}
|
||||
if ( $flags =~ /safefit.o/ && $vendor == NAG) {
|
||||
$linker_l .= $flags . " ";
|
||||
}
|
||||
if ( $flags =~ /thsafe.o/ && $vendor == NAG) {
|
||||
$linker_l .= $flags . " ";
|
||||
}
|
||||
|
||||
$linker_a .= $flags . " " if $flags =~ /\.a$/;
|
||||
}
|
||||
|
||||
|
||||
25
getarch.c
25
getarch.c
@@ -140,8 +140,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
/* #define FORCE_PPC440FP2 */
|
||||
/* #define FORCE_CELL */
|
||||
/* #define FORCE_SICORTEX */
|
||||
/* #define FORCE_LOONGSON3A */
|
||||
/* #define FORCE_LOONGSON3B */
|
||||
/* #define FORCE_LOONGSON3R3 */
|
||||
/* #define FORCE_LOONGSON3R4 */
|
||||
/* #define FORCE_I6400 */
|
||||
/* #define FORCE_P6600 */
|
||||
/* #define FORCE_P5600 */
|
||||
@@ -814,31 +814,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef FORCE_LOONGSON3A
|
||||
#if defined FORCE_LOONGSON3R3 || defined FORCE_LOONGSON3A || defined FORCE_LOONGSON3B
|
||||
#define FORCE
|
||||
#define ARCHITECTURE "MIPS"
|
||||
#define SUBARCHITECTURE "LOONGSON3A"
|
||||
#define SUBARCHITECTURE "LOONGSON3R3"
|
||||
#define SUBDIRNAME "mips64"
|
||||
#define ARCHCONFIG "-DLOONGSON3A " \
|
||||
#define ARCHCONFIG "-DLOONGSON3R3 " \
|
||||
"-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \
|
||||
"-DL2_SIZE=512488 -DL2_LINESIZE=32 " \
|
||||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 "
|
||||
#define LIBNAME "loongson3a"
|
||||
#define CORENAME "LOONGSON3A"
|
||||
#define LIBNAME "loongson3r3"
|
||||
#define CORENAME "LOONGSON3R3"
|
||||
#else
|
||||
#endif
|
||||
|
||||
#ifdef FORCE_LOONGSON3B
|
||||
#ifdef FORCE_LOONGSON3R4
|
||||
#define FORCE
|
||||
#define ARCHITECTURE "MIPS"
|
||||
#define SUBARCHITECTURE "LOONGSON3B"
|
||||
#define SUBARCHITECTURE "LOONGSON3R4"
|
||||
#define SUBDIRNAME "mips64"
|
||||
#define ARCHCONFIG "-DLOONGSON3B " \
|
||||
#define ARCHCONFIG "-DLOONGSON3R4 " \
|
||||
"-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \
|
||||
"-DL2_SIZE=512488 -DL2_LINESIZE=32 " \
|
||||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 "
|
||||
#define LIBNAME "loongson3b"
|
||||
#define CORENAME "LOONGSON3B"
|
||||
#define LIBNAME "loongson3r4"
|
||||
#define CORENAME "LOONGSON3R4"
|
||||
#else
|
||||
#endif
|
||||
|
||||
@@ -1375,6 +1375,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#ifdef __riscv
|
||||
#include "cpuid_riscv64.c"
|
||||
#define OPENBLAS_SUPPORTED
|
||||
#endif
|
||||
|
||||
#ifdef __arm__
|
||||
|
||||
@@ -4,6 +4,14 @@
|
||||
#else
|
||||
#include "config_kernel.h"
|
||||
#endif
|
||||
#if (defined(__WIN32__) || defined(__WIN64__) || defined(__CYGWIN32__) || defined(__CYGWIN64__) || defined(_WIN32) || defined(_WIN64)) && defined(__64BIT__)
|
||||
typedef long long BLASLONG;
|
||||
typedef unsigned long long BLASULONG;
|
||||
#else
|
||||
typedef long BLASLONG;
|
||||
typedef unsigned long BLASULONG;
|
||||
#endif
|
||||
|
||||
#include "param.h"
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
|
||||
@@ -316,7 +316,7 @@ CCBLAS1OBJS = \
|
||||
cblas_cscal.$(SUFFIX) cblas_csscal.$(SUFFIX) \
|
||||
cblas_cswap.$(SUFFIX) cblas_scnrm2.$(SUFFIX) \
|
||||
cblas_caxpby.$(SUFFIX) \
|
||||
cblas_icmin.$(SUFFIX) cblas_icmax.$(SUFFIX) cblas_scsum.$(SUFFIX)
|
||||
cblas_icmin.$(SUFFIX) cblas_icmax.$(SUFFIX) cblas_scsum.$(SUFFIX) cblas_csrot.$(SUFFIX) cblas_crotg.$(SUFFIX)
|
||||
|
||||
CCBLAS2OBJS = \
|
||||
cblas_cgemv.$(SUFFIX) cblas_cgerc.$(SUFFIX) cblas_cgeru.$(SUFFIX) \
|
||||
@@ -346,7 +346,7 @@ CZBLAS1OBJS = \
|
||||
cblas_zscal.$(SUFFIX) cblas_zdscal.$(SUFFIX) \
|
||||
cblas_zswap.$(SUFFIX) cblas_dznrm2.$(SUFFIX) \
|
||||
cblas_zaxpby.$(SUFFIX) \
|
||||
cblas_izmin.$(SUFFIX) cblas_izmax.$(SUFFIX) cblas_dzsum.$(SUFFIX)
|
||||
cblas_izmin.$(SUFFIX) cblas_izmax.$(SUFFIX) cblas_dzsum.$(SUFFIX) cblas_zdrot.$(SUFFIX) cblas_zrotg.$(SUFFIX)
|
||||
|
||||
|
||||
CZBLAS2OBJS = \
|
||||
@@ -1634,6 +1634,12 @@ cblas_srotg.$(SUFFIX) cblas_srotg.$(PSUFFIX): rotg.c
|
||||
cblas_drotg.$(SUFFIX) cblas_drotg.$(PSUFFIX): rotg.c
|
||||
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
|
||||
|
||||
cblas_crotg.$(SUFFIX) cblas_crotg.$(PSUFFIX): zrotg.c
|
||||
$(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F)
|
||||
|
||||
cblas_zrotg.$(SUFFIX) cblas_zrotg.$(PSUFFIX): zrotg.c
|
||||
$(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F)
|
||||
|
||||
cblas_srotm.$(SUFFIX) cblas_srotm.$(PSUFFIX): rotm.c
|
||||
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
|
||||
|
||||
@@ -1664,6 +1670,12 @@ cblas_csscal.$(SUFFIX) cblas_csscal.$(PSUFFIX) : zscal.c
|
||||
cblas_zdscal.$(SUFFIX) cblas_zdscal.$(PSUFFIX) : zscal.c
|
||||
$(CC) $(CFLAGS) -DCBLAS -c -DSSCAL $< -o $(@F)
|
||||
|
||||
cblas_csrot.$(SUFFIX) cblas_csrot.$(PSUFFIX) : zrot.c
|
||||
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
|
||||
|
||||
cblas_zdrot.$(SUFFIX) cblas_zdrot.$(PSUFFIX) : zrot.c
|
||||
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
|
||||
|
||||
ifeq ($(BUILD_BFLOAT16),1)
|
||||
cblas_sbgemv.$(SUFFIX) cblas_sbgemv.$(PSUFFIX) : sbgemv.c
|
||||
$(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F)
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
#!/usr/bin/perl
|
||||
#!/usr/bin/env perl
|
||||
|
||||
$count = 0;
|
||||
|
||||
|
||||
@@ -246,6 +246,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
|
||||
|
||||
#ifdef SMP
|
||||
double MNK;
|
||||
#if defined(USE_SIMPLE_THREADED_LEVEL3) || !defined(NO_AFFINITY)
|
||||
#ifndef COMPLEX
|
||||
#ifdef XDOUBLE
|
||||
int mode = BLAS_XDOUBLE | BLAS_REAL;
|
||||
@@ -264,6 +265,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(SMP) && !defined(NO_AFFINITY) && !defined(USE_SIMPLE_THREADED_LEVEL3)
|
||||
int nodes;
|
||||
@@ -417,8 +419,10 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
|
||||
sb = (XFLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
|
||||
|
||||
#ifdef SMP
|
||||
#if defined(USE_SIMPLE_THREADED_LEVEL3) || !defined(NO_AFFINITY)
|
||||
mode |= (transa << BLAS_TRANSA_SHIFT);
|
||||
mode |= (transb << BLAS_TRANSB_SHIFT);
|
||||
#endif
|
||||
|
||||
MNK = (double) args.m * (double) args.n * (double) args.k;
|
||||
if ( MNK <= (SMP_THRESHOLD_MIN * (double) GEMM_MULTITHREAD_THRESHOLD) )
|
||||
|
||||
@@ -107,7 +107,6 @@ void CNAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT dy1, FLOAT *dparam){
|
||||
dq1 = dp1 * *dx1;
|
||||
if(ABS(dq1) > ABS(dq2))
|
||||
{
|
||||
dflag = ZERO;
|
||||
dh11 = ONE;
|
||||
dh22 = ONE;
|
||||
dh21 = - dy1 / *dx1;
|
||||
|
||||
@@ -187,10 +187,11 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
|
||||
endif ()
|
||||
# Makefile.L3
|
||||
set(USE_TRMM false)
|
||||
if (ARM OR ARM64 OR (TARGET_CORE MATCHES LONGSOON3B) OR (TARGET_CORE MATCHES GENERIC) OR (TARGET_CORE MATCHES HASWELL) OR (TARGET_CORE MATCHES ZEN) OR (TARGET_CORE MATCHES SKYLAKEX) OR (TARGET_CORE MATCHES COOPERLAKE))
|
||||
string(TOUPPER ${TARGET_CORE} UC_TARGET_CORE)
|
||||
if (ARM OR ARM64 OR (UC_TARGET_CORE MATCHES LONGSOON3B) OR (UC_TARGET_CORE MATCHES GENERIC) OR (UC_TARGET_CORE MATCHES HASWELL) OR (UC_TARGET_CORE MATCHES ZEN) OR (UC_TARGET_CORE MATCHES SKYLAKEX) OR (UC_TARGET_CORE MATCHES COOPERLAKE))
|
||||
set(USE_TRMM true)
|
||||
endif ()
|
||||
if (ZARCH OR (TARGET_CORE MATCHES POWER8) OR (TARGET_CORE MATCHES POWER9) OR (TARGET_CORE MATCHES POWER10))
|
||||
if (ZARCH OR (UC_TARGET_CORE MATCHES POWER8) OR (UC_TARGET_CORE MATCHES POWER9) OR (UC_TARGET_CORE MATCHES POWER10))
|
||||
set(USE_TRMM true)
|
||||
endif ()
|
||||
|
||||
|
||||
@@ -36,7 +36,7 @@ ifeq ($(TARGET_CORE), COOPERLAKE)
|
||||
ifeq ($(GCCVERSIONGTEQ10), 1)
|
||||
override CFLAGS += -march=cooperlake
|
||||
else
|
||||
override CFLAGS += -march=skylake-avx512
|
||||
override CFLAGS += -march=skylake-avx512 -mavx512f
|
||||
endif
|
||||
ifeq ($(OSNAME), CYGWIN_NT)
|
||||
override CFLAGS += -fno-asynchronous-unwind-tables
|
||||
@@ -47,7 +47,7 @@ ifeq ($(TARGET_CORE), COOPERLAKE)
|
||||
endif
|
||||
endif
|
||||
else ifeq ($(TARGET_CORE), SKYLAKEX)
|
||||
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -march=skylake-avx512
|
||||
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -march=skylake-avx512 -mavx512f
|
||||
ifeq ($(OSNAME), CYGWIN_NT)
|
||||
override CFLAGS += -fno-asynchronous-unwind-tables
|
||||
endif
|
||||
@@ -58,6 +58,8 @@ else ifeq ($(TARGET_CORE), SKYLAKEX)
|
||||
endif
|
||||
else ifeq ($(TARGET_CORE), HASWELL)
|
||||
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) $(AVX2OPT)
|
||||
else ifeq ($(TARGET_CORE), LOONGSON3R4)
|
||||
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) $(MSA_FLAGS)
|
||||
else
|
||||
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE)
|
||||
endif
|
||||
@@ -68,6 +70,9 @@ else
|
||||
TARGET_CORE = $(CORE)
|
||||
KDIR =
|
||||
TSUFFIX =
|
||||
ifeq ($(TARGET_CORE), LOONGSON3R4)
|
||||
override CFLAGS += $(MSA_FLAGS)
|
||||
endif
|
||||
endif
|
||||
|
||||
-include $(KERNELDIR)/KERNEL.$(TARGET_CORE)
|
||||
|
||||
@@ -1,3 +1,11 @@
|
||||
FMAFLAG=
|
||||
ifndef OLDGCC
|
||||
ifdef HAVE_FMA3
|
||||
FMAFLAG = -mfma
|
||||
endif
|
||||
endif
|
||||
|
||||
|
||||
### AMAX ###
|
||||
|
||||
ifndef SAMAXKERNEL
|
||||
@@ -828,10 +836,10 @@ $(KDIR)xnrm2_k$(TSUFFIX).$(SUFFIX) $(KDIR)xnrm2_k$(TPSUFFIX).$(PSUFFIX) : $(KE
|
||||
$(CC) $(CFLAGS) -DCOMPLEX -c -DXDOUBLE $< -o $@
|
||||
|
||||
$(KDIR)srot_k$(TSUFFIX).$(SUFFIX) $(KDIR)srot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SROTKERNEL)
|
||||
$(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -UDOUBLE $< -o $@
|
||||
$(CC) -c $(CFLAGS) $(FMAFLAG) -UCOMPLEX -UCOMPLEX -UDOUBLE $< -o $@
|
||||
|
||||
$(KDIR)drot_k$(TSUFFIX).$(SUFFIX) $(KDIR)drot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DROTKERNEL)
|
||||
$(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -DDOUBLE $< -o $@
|
||||
$(CC) -c $(CFLAGS) $(FMAFLAG) -UCOMPLEX -UCOMPLEX -DDOUBLE $< -o $@
|
||||
|
||||
$(KDIR)qrot_k$(TSUFFIX).$(SUFFIX) $(KDIR)qrot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QROTKERNEL)
|
||||
$(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -DXDOUBLE $< -o $@
|
||||
|
||||
@@ -29,10 +29,6 @@ ifeq ($(ARCH), riscv64)
|
||||
USE_TRMM = 1
|
||||
endif
|
||||
|
||||
ifeq ($(TARGET), LOONGSON3B)
|
||||
USE_TRMM = 1
|
||||
endif
|
||||
|
||||
ifneq ($(DYNAMIC_ARCH), 1)
|
||||
ifeq ($(TARGET), GENERIC)
|
||||
USE_TRMM = 1
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2013, The OpenBLAS Project
|
||||
Copyright (c) 2021, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
@@ -27,36 +27,208 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/*****************************************************
|
||||
* 2014/06/09 Saar
|
||||
*
|
||||
* Order rowMajor
|
||||
* Trans
|
||||
*
|
||||
******************************************************/
|
||||
|
||||
int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG ldb)
|
||||
{
|
||||
BLASLONG i,j;
|
||||
FLOAT *aptr,*bptr;
|
||||
BLASLONG i, j;
|
||||
FLOAT *a_offset, *a_offset1, *a_offset2, *a_offset3, *a_offset4;
|
||||
FLOAT *b_offset, *b_offset1, *b_offset2, *b_offset3, *b_offset4;
|
||||
|
||||
if ( rows <= 0 ) return(0);
|
||||
if ( cols <= 0 ) return(0);
|
||||
if (rows <= 0) return 0;
|
||||
if (cols <= 0) return 0;
|
||||
|
||||
aptr = a;
|
||||
a_offset = a;
|
||||
b_offset = b;
|
||||
|
||||
for ( i=0; i<rows ; i++ )
|
||||
{
|
||||
bptr = &b[i];
|
||||
for(j=0; j<cols; j++)
|
||||
{
|
||||
bptr[j*ldb] = alpha * aptr[j];
|
||||
}
|
||||
aptr += lda;
|
||||
}
|
||||
i = (rows >> 2);
|
||||
if (i > 0) {
|
||||
do {
|
||||
a_offset1 = a_offset;
|
||||
a_offset2 = a_offset1 + lda;
|
||||
a_offset3 = a_offset2 + lda;
|
||||
a_offset4 = a_offset3 + lda;
|
||||
a_offset += 4 * lda;
|
||||
|
||||
return(0);
|
||||
b_offset1 = b_offset;
|
||||
b_offset2 = b_offset1 + ldb;
|
||||
b_offset3 = b_offset2 + ldb;
|
||||
b_offset4 = b_offset3 + ldb;
|
||||
b_offset += 4;
|
||||
|
||||
j = (cols >> 2);
|
||||
if (j > 0) {
|
||||
do {
|
||||
/* Column 1 of MAT_B */
|
||||
*(b_offset1 + 0) = *(a_offset1 + 0)*alpha; // Row 1 of MAT_A
|
||||
*(b_offset2 + 0) = *(a_offset1 + 1)*alpha;
|
||||
*(b_offset3 + 0) = *(a_offset1 + 2)*alpha;
|
||||
*(b_offset4 + 0) = *(a_offset1 + 3)*alpha;
|
||||
|
||||
/* Column 2 of MAT_B */
|
||||
*(b_offset1 + 1) = *(a_offset2 + 0)*alpha; // Row 2 of MAT_A
|
||||
*(b_offset2 + 1) = *(a_offset2 + 1)*alpha;
|
||||
*(b_offset3 + 1) = *(a_offset2 + 2)*alpha;
|
||||
*(b_offset4 + 1) = *(a_offset2 + 3)*alpha;
|
||||
|
||||
/* Column 3 of MAT_B */
|
||||
*(b_offset1 + 2) = *(a_offset3 + 0)*alpha; // Row 3 of MAT_A
|
||||
*(b_offset2 + 2) = *(a_offset3 + 1)*alpha;
|
||||
*(b_offset3 + 2) = *(a_offset3 + 2)*alpha;
|
||||
*(b_offset4 + 2) = *(a_offset3 + 3)*alpha;
|
||||
|
||||
/* Column 4 of MAT_B */
|
||||
*(b_offset1 + 3) = *(a_offset4 + 0)*alpha; // Row 4 of MAT_A
|
||||
*(b_offset2 + 3) = *(a_offset4 + 1)*alpha;
|
||||
*(b_offset3 + 3) = *(a_offset4 + 2)*alpha;
|
||||
*(b_offset4 + 3) = *(a_offset4 + 3)*alpha;
|
||||
|
||||
a_offset1 += 4;
|
||||
a_offset2 += 4;
|
||||
a_offset3 += 4;
|
||||
a_offset4 += 4;
|
||||
b_offset1 += ldb * 4;
|
||||
b_offset2 += ldb * 4;
|
||||
b_offset3 += ldb * 4;
|
||||
b_offset4 += ldb * 4;
|
||||
|
||||
j--;
|
||||
} while (j > 0);
|
||||
} // if(j > 0)
|
||||
|
||||
|
||||
if (cols & 2) {
|
||||
*(b_offset1 + 0) = *(a_offset1 + 0)*alpha;
|
||||
*(b_offset2 + 0) = *(a_offset1 + 1)*alpha;
|
||||
|
||||
*(b_offset1 + 1) = *(a_offset2 + 0)*alpha;
|
||||
*(b_offset2 + 1) = *(a_offset2 + 1)*alpha;
|
||||
|
||||
*(b_offset1 + 2) = *(a_offset3 + 0)*alpha;
|
||||
*(b_offset2 + 2) = *(a_offset3 + 1)*alpha;
|
||||
|
||||
*(b_offset1 + 3) = *(a_offset4 + 0)*alpha;
|
||||
*(b_offset2 + 3) = *(a_offset4 + 1)*alpha;
|
||||
|
||||
a_offset1 += 2;
|
||||
a_offset2 += 2;
|
||||
a_offset3 += 2;
|
||||
a_offset4 += 2;
|
||||
|
||||
b_offset1 += ldb*2;
|
||||
|
||||
}
|
||||
|
||||
if (cols & 1) {
|
||||
*(b_offset1 + 0) = *(a_offset1 + 0)*alpha;
|
||||
|
||||
*(b_offset1 + 1) = *(a_offset2 + 0)*alpha;
|
||||
|
||||
*(b_offset1 + 2) = *(a_offset3 + 0)*alpha;
|
||||
|
||||
*(b_offset1 + 3) = *(a_offset4 + 0)*alpha;
|
||||
}
|
||||
|
||||
i--;
|
||||
} while (i > 0);
|
||||
}
|
||||
|
||||
|
||||
if (rows & 2) {
|
||||
a_offset1 = a_offset;
|
||||
a_offset2 = a_offset1 + lda;
|
||||
a_offset += 2 * lda;
|
||||
|
||||
b_offset1 = b_offset;
|
||||
b_offset2 = b_offset1 + ldb;
|
||||
b_offset3 = b_offset2 + ldb;
|
||||
b_offset4 = b_offset3 + ldb;
|
||||
b_offset += 2;
|
||||
|
||||
j = (cols >> 2);
|
||||
if (j > 0){
|
||||
do {
|
||||
*(b_offset1 + 0) = *(a_offset1 + 0)*alpha;
|
||||
*(b_offset2 + 0) = *(a_offset1 + 1)*alpha;
|
||||
*(b_offset3 + 0) = *(a_offset1 + 2)*alpha;
|
||||
*(b_offset4 + 0) = *(a_offset1 + 3)*alpha;
|
||||
|
||||
*(b_offset1 + 1) = *(a_offset2 + 0)*alpha;
|
||||
*(b_offset2 + 1) = *(a_offset2 + 1)*alpha;
|
||||
*(b_offset3 + 1) = *(a_offset2 + 2)*alpha;
|
||||
*(b_offset4 + 1) = *(a_offset2 + 3)*alpha;
|
||||
|
||||
a_offset1 += 4;
|
||||
a_offset2 += 4;
|
||||
b_offset1 += ldb * 4;
|
||||
b_offset2 += ldb * 4;
|
||||
b_offset3 += ldb * 4;
|
||||
b_offset4 += ldb * 4;
|
||||
|
||||
j--;
|
||||
} while (j > 0);
|
||||
}
|
||||
|
||||
|
||||
if (cols & 2){
|
||||
*(b_offset1 + 0) = *(a_offset1 + 0)*alpha;
|
||||
*(b_offset2 + 0) = *(a_offset1 + 1)*alpha;
|
||||
|
||||
*(b_offset1 + 1) = *(a_offset2 + 0)*alpha;
|
||||
*(b_offset2 + 1) = *(a_offset2 + 1)*alpha;
|
||||
|
||||
a_offset1 += 2;
|
||||
a_offset2 += 2;
|
||||
b_offset1 += ldb*2;
|
||||
|
||||
}
|
||||
|
||||
|
||||
if (cols & 1){
|
||||
*(b_offset1 + 0) = *(a_offset1 + 0)*alpha;
|
||||
*(b_offset1 + 1) = *(a_offset2 + 0)*alpha;
|
||||
}
|
||||
} // if (rows & 2)
|
||||
|
||||
|
||||
if (rows & 1) {
|
||||
a_offset1 = a_offset;
|
||||
a_offset += lda;
|
||||
|
||||
b_offset1 = b_offset;
|
||||
b_offset2 = b_offset1 + ldb;
|
||||
b_offset3 = b_offset2 + ldb;
|
||||
b_offset4 = b_offset3 + ldb;
|
||||
|
||||
j = (cols >> 2);
|
||||
if (j > 0){
|
||||
do {
|
||||
*(b_offset1 + 0) = *(a_offset1 + 0)*alpha;
|
||||
*(b_offset2 + 0) = *(a_offset1 + 1)*alpha;
|
||||
*(b_offset3 + 0) = *(a_offset1 + 2)*alpha;
|
||||
*(b_offset4 + 0) = *(a_offset1 + 3)*alpha;
|
||||
|
||||
a_offset1 += 4;
|
||||
b_offset1 += ldb * 4;
|
||||
b_offset2 += ldb * 4;
|
||||
b_offset3 += ldb * 4;
|
||||
b_offset4 += ldb * 4;
|
||||
|
||||
j--;
|
||||
} while (j > 0);
|
||||
}
|
||||
|
||||
if (cols & 2){
|
||||
*(b_offset1 + 0) = *(a_offset1 + 0)*alpha;
|
||||
*(b_offset2 + 0) = *(a_offset1 + 1)*alpha;
|
||||
|
||||
a_offset1 += 2;
|
||||
b_offset1 += ldb * 2;
|
||||
}
|
||||
|
||||
if (cols & 1){
|
||||
*(b_offset1 + 0) = *(a_offset1 + 0)*alpha;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -48,7 +48,7 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA
|
||||
|
||||
dot[0]=0.0;
|
||||
dot[1]=0.0;
|
||||
#if !defined(__PPC__) && !defined(__SunOS)
|
||||
#if !defined(__PPC__) && !defined(__SunOS) && !defined(__PGI)
|
||||
CREAL(result) = 0.0 ;
|
||||
CIMAG(result) = 0.0 ;
|
||||
#else
|
||||
@@ -73,7 +73,7 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA
|
||||
i++ ;
|
||||
|
||||
}
|
||||
#if !defined(__PPC__) && !defined(__SunOS)
|
||||
#if !defined(__PPC__) && !defined(__SunOS) && !defined(__PGI)
|
||||
CREAL(result) = dot[0];
|
||||
CIMAG(result) = dot[1];
|
||||
#else
|
||||
|
||||
@@ -97,9 +97,18 @@ CNRM2KERNEL = znrm2.S
|
||||
ZNRM2KERNEL = znrm2.S
|
||||
|
||||
DDOTKERNEL = dot.S
|
||||
ifneq ($(C_COMPILER), PGI)
|
||||
SDOTKERNEL = ../generic/dot.c
|
||||
else
|
||||
SDOTKERNEL = dot.S
|
||||
endif
|
||||
ifneq ($(C_COMPILER), PGI)
|
||||
CDOTKERNEL = zdot.S
|
||||
ZDOTKERNEL = zdot.S
|
||||
else
|
||||
CDOTKERNEL = ../arm/zdot.c
|
||||
ZDOTKERNEL = ../arm/zdot.c
|
||||
endif
|
||||
DSDOTKERNEL = dot.S
|
||||
|
||||
DGEMM_BETA = dgemm_beta.S
|
||||
|
||||
@@ -96,11 +96,20 @@ DNRM2KERNEL = nrm2.S
|
||||
CNRM2KERNEL = znrm2.S
|
||||
ZNRM2KERNEL = znrm2.S
|
||||
|
||||
DDOTKERNEL = dot.S
|
||||
SDOTKERNEL = ../generic/dot.c
|
||||
CDOTKERNEL = zdot.S
|
||||
ZDOTKERNEL = zdot.S
|
||||
DSDOTKERNEL = dot.S
|
||||
ifneq ($(C_COMPILER), PGI)
|
||||
SDOTKERNEL = ../generic/dot.c
|
||||
else
|
||||
SDOTKERNEL = dot.S
|
||||
endif
|
||||
DDOTKERNEL = dot.S
|
||||
ifneq ($(C_COMPILER), PGI)
|
||||
CDOTKERNEL = zdot.S
|
||||
ZDOTKERNEL = zdot.S
|
||||
else
|
||||
CDOTKERNEL = ../arm/zdot.c
|
||||
ZDOTKERNEL = ../arm/zdot.c
|
||||
endif
|
||||
DSDOTKERNEL = dot.S
|
||||
|
||||
DGEMM_BETA = dgemm_beta.S
|
||||
SGEMM_BETA = sgemm_beta.S
|
||||
|
||||
@@ -70,10 +70,19 @@ DCOPYKERNEL = copy.S
|
||||
CCOPYKERNEL = copy.S
|
||||
ZCOPYKERNEL = copy.S
|
||||
|
||||
ifneq ($(C_COMPILER), PGI)
|
||||
SDOTKERNEL = ../generic/dot.c
|
||||
else
|
||||
SDOTKERNEL = dot.S
|
||||
endif
|
||||
DDOTKERNEL = dot.S
|
||||
ifneq ($(C_COMPILER), PGI)
|
||||
CDOTKERNEL = zdot.S
|
||||
ZDOTKERNEL = zdot.S
|
||||
else
|
||||
CDOTKERNEL = ../arm/zdot.c
|
||||
ZDOTKERNEL = ../arm/zdot.c
|
||||
endif
|
||||
DSDOTKERNEL = dot.S
|
||||
|
||||
SNRM2KERNEL = nrm2.S
|
||||
|
||||
@@ -47,8 +47,13 @@ ZCOPYKERNEL = copy.S
|
||||
|
||||
SDOTKERNEL = dot_thunderx.c
|
||||
DDOTKERNEL = ddot_thunderx.c
|
||||
ifneq ($(C_COMPILER), PGI)
|
||||
CDOTKERNEL = zdot.S
|
||||
ZDOTKERNEL = zdot.S
|
||||
else
|
||||
CDOTKERNEL = ../arm/zdot.c
|
||||
ZDOTKERNEL = ../arm/zdot.c
|
||||
endif
|
||||
DSDOTKERNEL = dot.S
|
||||
|
||||
SNRM2KERNEL = nrm2.S
|
||||
|
||||
@@ -72,8 +72,13 @@ ZCOPYKERNEL = copy.S
|
||||
|
||||
SDOTKERNEL = dot.S
|
||||
DDOTKERNEL = dot.S
|
||||
ifneq ($(C_COMPILER), PGI)
|
||||
CDOTKERNEL = zdot.S
|
||||
ZDOTKERNEL = zdot.S
|
||||
else
|
||||
CDOTKERNEL = ../arm/zdot.c
|
||||
ZDOTKERNEL = ../arm/zdot.c
|
||||
endif
|
||||
DSDOTKERNEL = dot.S
|
||||
|
||||
SNRM2KERNEL = nrm2.S
|
||||
|
||||
@@ -58,6 +58,7 @@ extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n
|
||||
#define CUR_MAXINV "d8"
|
||||
#define CUR_MAXINV_V "v8.2d"
|
||||
#define CUR_MAX_V "v8.2d"
|
||||
#define REGINF "d9"
|
||||
|
||||
static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x,
|
||||
double *ssq, double *scale)
|
||||
@@ -79,8 +80,10 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x,
|
||||
" ble 9f //nrm2_kernel_L999 \n"
|
||||
|
||||
"1: //nrm2_kernel_F_BEGIN: \n"
|
||||
" mov x6, #0x7FF0000000000000 //+Infinity \n"
|
||||
" fmov "REGZERO", xzr \n"
|
||||
" fmov "REGONE", #1.0 \n"
|
||||
" fmov "REGINF", x6 \n"
|
||||
" lsl "INC_X", "INC_X", #"INC_SHIFT" \n"
|
||||
" mov "J", "N" \n"
|
||||
" cmp "J", xzr \n"
|
||||
@@ -104,6 +107,8 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x,
|
||||
" ldr d4, ["X"] \n"
|
||||
" fabs d4, d4 \n"
|
||||
" fmax "CUR_MAX", "SCALE", d4 \n"
|
||||
" fcmp "CUR_MAX", "REGINF" \n"
|
||||
" beq 10f \n"
|
||||
" fdiv "SCALE", "SCALE", "CUR_MAX" \n"
|
||||
" fmul "SCALE", "SCALE", "SCALE" \n"
|
||||
" fmul "SSQ", "SSQ", "SCALE" \n"
|
||||
@@ -116,6 +121,8 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x,
|
||||
" ldr d3, ["X", #8] \n"
|
||||
" fabs d3, d3 \n"
|
||||
" fmax "CUR_MAX", "SCALE", d3 \n"
|
||||
" fcmp "CUR_MAX", "REGINF" \n"
|
||||
" beq 10f \n"
|
||||
" fdiv "SCALE", "SCALE", "CUR_MAX" \n"
|
||||
" fmul "SCALE", "SCALE", "SCALE" \n"
|
||||
" fmul "SSQ", "SSQ", "SCALE" \n"
|
||||
@@ -158,6 +165,8 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x,
|
||||
" fmaxp v24.2d, v24.2d, v26.2d \n"
|
||||
" fmaxp v24.2d, v24.2d, v24.2d \n"
|
||||
" fmax "CUR_MAX", "SCALE", d24 \n"
|
||||
" fcmp "CUR_MAX", "REGINF" \n"
|
||||
" beq 10f \n"
|
||||
" fdiv "CUR_MAXINV", "REGONE", "CUR_MAX" \n"
|
||||
" //dup "CUR_MAX_V", v7.d[0] \n"
|
||||
" fdiv "SCALE", "SCALE", "CUR_MAX" \n"
|
||||
@@ -217,6 +226,8 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x,
|
||||
" fmaxp v24.2d, v24.2d, v26.2d \n"
|
||||
" fmaxp v24.2d, v24.2d, v24.2d \n"
|
||||
" fmax "CUR_MAX", "SCALE", d24 \n"
|
||||
" fcmp "CUR_MAX", "REGINF" \n"
|
||||
" beq 10f \n"
|
||||
" fdiv "CUR_MAXINV", "REGONE", "CUR_MAX" \n"
|
||||
" //dup "CUR_MAX_V", v7.d[0] \n"
|
||||
" fdiv "SCALE", "SCALE", "CUR_MAX" \n"
|
||||
@@ -265,6 +276,8 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x,
|
||||
" ldr d4, ["X"] \n"
|
||||
" fabs d4, d4 \n"
|
||||
" fmax "CUR_MAX", "SCALE", d4 \n"
|
||||
" fcmp "CUR_MAX", "REGINF" \n"
|
||||
" beq 10f \n"
|
||||
" fdiv "SCALE", "SCALE", "CUR_MAX" \n"
|
||||
" fmul "SCALE", "SCALE", "SCALE" \n"
|
||||
" fmul "SSQ", "SSQ", "SCALE" \n"
|
||||
@@ -276,6 +289,8 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x,
|
||||
" ldr d3, ["X", #8] \n"
|
||||
" fabs d3, d3 \n"
|
||||
" fmax "CUR_MAX", "SCALE", d3 \n"
|
||||
" fcmp "CUR_MAX", "REGINF" \n"
|
||||
" beq 10f \n"
|
||||
" fdiv "SCALE", "SCALE", "CUR_MAX" \n"
|
||||
" fmul "SCALE", "SCALE", "SCALE" \n"
|
||||
" fmul "SSQ", "SSQ", "SCALE" \n"
|
||||
@@ -291,6 +306,11 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x,
|
||||
"9: //nrm2_kernel_L999: \n"
|
||||
" str "SSQ", [%[SSQ_]] \n"
|
||||
" str "SCALE", [%[SCALE_]] \n"
|
||||
" b 11f \n"
|
||||
"10: \n"
|
||||
" str "REGINF", [%[SSQ_]] \n"
|
||||
" str "REGINF", [%[SCALE_]] \n"
|
||||
"11: \n"
|
||||
|
||||
:
|
||||
: [SSQ_] "r" (ssq), //%0
|
||||
@@ -300,7 +320,7 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x,
|
||||
[INCX_] "r" (inc_x) //%4
|
||||
: "cc",
|
||||
"memory",
|
||||
"x0", "x1", "x2", "x3", "x4", "x5",
|
||||
"x0", "x1", "x2", "x3", "x4", "x5", "x6",
|
||||
"d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8"
|
||||
);
|
||||
|
||||
@@ -359,6 +379,12 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||
cur_ssq = *ptr;
|
||||
cur_scale = *(ptr + 1);
|
||||
|
||||
if (cur_ssq == INFINITY) {
|
||||
ssq = INFINITY;
|
||||
scale = INFINITY;
|
||||
break;
|
||||
}
|
||||
|
||||
if (cur_scale != 0) {
|
||||
if (cur_scale > scale) {
|
||||
scale = (scale / cur_scale);
|
||||
|
||||
@@ -121,7 +121,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#define CGEMM_KERNEL_8X1_MSA(OP0, OP1, OP2, OP3, OP4) \
|
||||
{ \
|
||||
LD_SP4_INC(pa0, 4, src_a0, src_a1, src_a2, src_a3); \
|
||||
src_bi = (v4f32) __msa_cast_to_vector_double(*((double *) pb0)); \
|
||||
src_bi = (v4f32) COPY_DOUBLE_TO_VECTOR(*((double *) pb0)); \
|
||||
SPLATI_W2_SP(src_bi, 0, src_br, src_bi); \
|
||||
\
|
||||
PCKEVOD_W2_SP(src_a1, src_a0, src_a0r, src_a0i); \
|
||||
@@ -200,7 +200,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#define CGEMM_KERNEL_4X1_MSA(OP0, OP1, OP2, OP3, OP4) \
|
||||
{ \
|
||||
LD_SP2_INC(pa0, 4, src_a0, src_a1); \
|
||||
src_bi = (v4f32) __msa_cast_to_vector_double(*((double *) pb0)); \
|
||||
src_bi = (v4f32) COPY_DOUBLE_TO_VECTOR(*((double *) pb0)); \
|
||||
SPLATI_W2_SP(src_bi, 0, src_br, src_bi); \
|
||||
\
|
||||
PCKEVOD_W2_SP(src_a1, src_a0, src_a0r, src_a0i); \
|
||||
|
||||
@@ -56,11 +56,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#if !defined(XCONJ)
|
||||
#define OP0 +=
|
||||
#define OP1 -=
|
||||
#define OP2 -=
|
||||
#define OP2 +=
|
||||
#else
|
||||
#define OP0 -=
|
||||
#define OP1 -=
|
||||
#define OP2 +=
|
||||
#define OP2 -=
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
@@ -32,14 +32,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#undef OP1
|
||||
#undef OP2
|
||||
|
||||
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
||||
#define OP0 -=
|
||||
#define OP1 +=
|
||||
#define OP2 +=
|
||||
#if !defined(CONJ)
|
||||
#if !defined(XCONJ)
|
||||
#define OP0 -=
|
||||
#define OP1 +=
|
||||
#define OP2 +=
|
||||
#else
|
||||
#define OP0 +=
|
||||
#define OP1 +=
|
||||
#define OP2 -=
|
||||
#endif
|
||||
#else
|
||||
#define OP0 +=
|
||||
#define OP1 +=
|
||||
#define OP2 -=
|
||||
#if !defined(XCONJ)
|
||||
#define OP0 +=
|
||||
#define OP1 -=
|
||||
#define OP2 +=
|
||||
#else
|
||||
#define OP0 -=
|
||||
#define OP1 -=
|
||||
#define OP2 -=
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#define CGEMV_T_8x4() \
|
||||
|
||||
@@ -49,11 +49,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,
|
||||
{
|
||||
if ((0 == c) && (0 == s))
|
||||
{
|
||||
v4f32 zero = __msa_cast_to_vector_float(0);
|
||||
zero = (v4f32) __msa_insert_w((v4i32) zero, 0, 0.0);
|
||||
zero = (v4f32) __msa_insert_w((v4i32) zero, 1, 0.0);
|
||||
zero = (v4f32) __msa_insert_w((v4i32) zero, 2, 0.0);
|
||||
zero = (v4f32) __msa_insert_w((v4i32) zero, 3, 0.0);
|
||||
v4f32 zero = {0.0, 0.0, 0.0, 0.0};
|
||||
|
||||
/* process 2 elements */
|
||||
for (j = (n >> 1); j--;)
|
||||
|
||||
@@ -49,11 +49,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
|
||||
{
|
||||
if ((0.0 == da_r) && (0.0 == da_i))
|
||||
{
|
||||
v4f32 zero_v = __msa_cast_to_vector_float(0);
|
||||
zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 0, 0.0);
|
||||
zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 1, 0.0);
|
||||
zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 2, 0.0);
|
||||
zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 3, 0.0);
|
||||
v4f32 zero_v = {0.0, 0.0, 0.0, 0.0};
|
||||
|
||||
for (i = (n >> 5); i--;)
|
||||
{
|
||||
|
||||
@@ -44,9 +44,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x,
|
||||
{
|
||||
if (0.0 == da)
|
||||
{
|
||||
v2f64 zero_v = __msa_cast_to_vector_double(0);
|
||||
zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 0, 0.0);
|
||||
zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 1, 0.0);
|
||||
v2f64 zero_v = {0.0, 0.0};
|
||||
|
||||
for (i = (n >> 5); i--;)
|
||||
{
|
||||
|
||||
@@ -184,7 +184,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3,
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
else if ((inc_x != 0) && (inc_y != 0))
|
||||
{
|
||||
for (i = (n >> 3); i--;)
|
||||
{
|
||||
@@ -248,6 +248,32 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
else
|
||||
{
|
||||
if (inc_x == inc_y)
|
||||
{
|
||||
if (n & 1)
|
||||
{
|
||||
x0 = *srcx;
|
||||
*srcx = *srcy;
|
||||
*srcy = x0;
|
||||
}
|
||||
else
|
||||
return (0);
|
||||
}
|
||||
else
|
||||
{
|
||||
BLASLONG ix = 0, iy = 0;
|
||||
while (i < n)
|
||||
{
|
||||
x0 = srcx[ix];
|
||||
srcx[ix] = srcy[iy];
|
||||
srcy[iy] = x0;
|
||||
ix += inc_x;
|
||||
iy += inc_y;
|
||||
i++;
|
||||
}
|
||||
}
|
||||
}
|
||||
return (0);
|
||||
}
|
||||
|
||||
@@ -186,8 +186,7 @@ void dsolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
|
||||
ILVRL_D2_DP(src_c14, src_c10, res_c12, res_c13);
|
||||
ILVRL_D2_DP(src_c15, src_c11, res_c14, res_c15);
|
||||
|
||||
src_a54 = __msa_cast_to_vector_double(*(a + 54));
|
||||
src_a54 = (v2f64) __msa_splati_d((v2i64) src_a54, 0);
|
||||
src_a54 = COPY_DOUBLE_TO_VECTOR(*(a + 54));
|
||||
src_a62 = LD_DP(a + 62);
|
||||
src_a63 = (v2f64) __msa_splati_d((v2i64) src_a62, 1);
|
||||
src_a62 = (v2f64) __msa_splati_d((v2i64) src_a62, 0);
|
||||
@@ -200,8 +199,7 @@ void dsolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
|
||||
src_a44 = LD_DP(a + 44);
|
||||
src_a45 = (v2f64) __msa_splati_d((v2i64) src_a44, 1);
|
||||
src_a44 = (v2f64) __msa_splati_d((v2i64) src_a44, 0);
|
||||
src_a36 = __msa_cast_to_vector_double(*(a + 36));
|
||||
src_a36 = (v2f64) __msa_splati_d((v2i64) src_a36, 0);
|
||||
src_a36 = COPY_DOUBLE_TO_VECTOR(*(a + 36));
|
||||
|
||||
res_c7 *= src_a63;
|
||||
res_c6 -= res_c7 * src_a62;
|
||||
@@ -271,8 +269,7 @@ void dsolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
|
||||
src_a26 = LD_DP(a + 26);
|
||||
src_a27 = (v2f64) __msa_splati_d((v2i64) src_a26, 1);
|
||||
src_a26 = (v2f64) __msa_splati_d((v2i64) src_a26, 0);
|
||||
src_a18 = __msa_cast_to_vector_double(*(a + 18));
|
||||
src_a18 = (v2f64) __msa_splati_d((v2i64) src_a18, 0);
|
||||
src_a18 = COPY_DOUBLE_TO_VECTOR(*(a + 18));
|
||||
|
||||
res_c3 -= res_c7 * src_a59;
|
||||
res_c2 -= res_c7 * src_a58;
|
||||
@@ -358,8 +355,7 @@ void dsolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
|
||||
src_a8 = LD_DP(a + 8);
|
||||
src_a9 = (v2f64) __msa_splati_d((v2i64) src_a8, 1);
|
||||
src_a8 = (v2f64) __msa_splati_d((v2i64) src_a8, 0);
|
||||
src_a0 = __msa_cast_to_vector_double(*(a + 0));
|
||||
src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0);
|
||||
src_a0 = COPY_DOUBLE_TO_VECTOR(*(a + 0));
|
||||
|
||||
res_c1 -= res_c2 * src_a17;
|
||||
res_c1 *= src_a9;
|
||||
@@ -488,8 +484,7 @@ static void dsolve_8x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
|
||||
src_a52 = LD_DP(a - 12);
|
||||
src_a53 = (v2f64) __msa_splati_d((v2i64) src_a52, 1);
|
||||
src_a52 = (v2f64) __msa_splati_d((v2i64) src_a52, 0);
|
||||
src_a54 = __msa_cast_to_vector_double(*(a - 10));
|
||||
src_a54 = (v2f64) __msa_splati_d((v2i64) src_a54, 0);
|
||||
src_a54 = COPY_DOUBLE_TO_VECTOR(*(a -10));
|
||||
|
||||
src_a40 = LD_DP(a - 24);
|
||||
src_a41 = (v2f64) __msa_splati_d((v2i64) src_a40, 1);
|
||||
@@ -526,8 +521,7 @@ static void dsolve_8x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
|
||||
src_a34 = LD_DP(a - 30);
|
||||
src_a35 = (v2f64) __msa_splati_d((v2i64) src_a34, 1);
|
||||
src_a34 = (v2f64) __msa_splati_d((v2i64) src_a34, 0);
|
||||
src_a36 = __msa_cast_to_vector_double(*(a - 28));
|
||||
src_a36 = (v2f64) __msa_splati_d((v2i64) src_a36, 0);
|
||||
src_a36 = COPY_DOUBLE_TO_VECTOR(*(a -28));
|
||||
|
||||
res_c4 *= src_a36;
|
||||
res_c3 -= res_c4 * src_a35;
|
||||
@@ -544,10 +538,8 @@ static void dsolve_8x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
|
||||
src_a16 = LD_DP(a - 48);
|
||||
src_a17 = (v2f64) __msa_splati_d((v2i64) src_a16, 1);
|
||||
src_a16 = (v2f64) __msa_splati_d((v2i64) src_a16, 0);
|
||||
src_a18 = __msa_cast_to_vector_double(*(a - 46));
|
||||
src_a18 = (v2f64) __msa_splati_d((v2i64) src_a18, 0);
|
||||
src_a0 = __msa_cast_to_vector_double(*(a - 64));
|
||||
src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0);
|
||||
src_a18 = COPY_DOUBLE_TO_VECTOR(*(a - 46));
|
||||
src_a0 = COPY_DOUBLE_TO_VECTOR(*(a - 64));
|
||||
src_a8 = LD_DP(a - 56);
|
||||
src_a9 = (v2f64) __msa_splati_d((v2i64) src_a8, 1);
|
||||
src_a8 = (v2f64) __msa_splati_d((v2i64) src_a8, 0);
|
||||
@@ -785,11 +777,8 @@ static void dsolve_4x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
|
||||
src_a10 = (v2f64) __msa_splati_d((v2i64) src_a9, 1);
|
||||
src_a9 = (v2f64) __msa_splati_d((v2i64) src_a9, 0);
|
||||
|
||||
src_a8 = __msa_cast_to_vector_double(*(a + 8));
|
||||
src_a0 = __msa_cast_to_vector_double(*(a + 0));
|
||||
|
||||
src_a8 = (v2f64) __msa_splati_d((v2i64) src_a8, 0);
|
||||
src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0);
|
||||
src_a8 = COPY_DOUBLE_TO_VECTOR(*(a + 8));
|
||||
src_a0 = COPY_DOUBLE_TO_VECTOR(*(a + 0));
|
||||
|
||||
src_a4 = LD_DP(a + 4);
|
||||
src_a5 = (v2f64) __msa_splati_d((v2i64) src_a4, 1);
|
||||
@@ -890,11 +879,8 @@ static void dsolve_4x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
|
||||
src_a10 = (v2f64) __msa_splati_d((v2i64) src_a9, 1);
|
||||
src_a9 = (v2f64) __msa_splati_d((v2i64) src_a9, 0);
|
||||
|
||||
src_a8 = __msa_cast_to_vector_double(*(a + 8));
|
||||
src_a0 = __msa_cast_to_vector_double(*(a + 0));
|
||||
|
||||
src_a8 = (v2f64) __msa_splati_d((v2i64) src_a8, 0);
|
||||
src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0);
|
||||
src_a8 = COPY_DOUBLE_TO_VECTOR(*(a + 8));
|
||||
src_a0 = COPY_DOUBLE_TO_VECTOR(*(a + 0));
|
||||
|
||||
src_a4 = LD_DP(a + 4);
|
||||
src_a5 = (v2f64) __msa_splati_d((v2i64) src_a4, 1);
|
||||
|
||||
@@ -215,8 +215,7 @@ void dsolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
|
||||
res_c14 -= res_c8 * src_a6;
|
||||
res_c15 -= res_c8 * src_a7;
|
||||
|
||||
src_a9 = __msa_cast_to_vector_double(*(a + 9));
|
||||
src_a9 = (v2f64) __msa_splati_d((v2i64) src_a9, 0);
|
||||
src_a9 = COPY_DOUBLE_TO_VECTOR(*(a + 9));
|
||||
src_a10 = LD_DP(a + 10);
|
||||
src_a11 = (v2f64) __msa_splati_d((v2i64) src_a10, 1);
|
||||
src_a10 = (v2f64) __msa_splati_d((v2i64) src_a10, 0);
|
||||
@@ -280,8 +279,7 @@ void dsolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
|
||||
res_c14 -= res_c10 * src_a22;
|
||||
res_c15 -= res_c10 * src_a23;
|
||||
|
||||
src_a27 = __msa_cast_to_vector_double(*(a + 27));
|
||||
src_a27 = (v2f64) __msa_splati_d((v2i64) src_a27, 0);
|
||||
src_a27 = COPY_DOUBLE_TO_VECTOR(*(a + 27));
|
||||
src_a28 = LD_DP(a + 28);
|
||||
src_a29 = (v2f64) __msa_splati_d((v2i64) src_a28, 1);
|
||||
src_a28 = (v2f64) __msa_splati_d((v2i64) src_a28, 0);
|
||||
@@ -326,8 +324,7 @@ void dsolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
|
||||
res_c14 -= res_c12 * src_a38;
|
||||
res_c15 -= res_c12 * src_a39;
|
||||
|
||||
src_a45 = __msa_cast_to_vector_double(*(a + 45));
|
||||
src_a45 = (v2f64) __msa_splati_d((v2i64) src_a45, 0);
|
||||
src_a45 = COPY_DOUBLE_TO_VECTOR(*(a + 45));
|
||||
src_a46 = LD_DP(a + 46);
|
||||
src_a47 = (v2f64) __msa_splati_d((v2i64) src_a46, 1);
|
||||
src_a46 = (v2f64) __msa_splati_d((v2i64) src_a46, 0);
|
||||
@@ -353,8 +350,7 @@ void dsolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
|
||||
ILVRL_D2_DP(res_c5, res_c4, src_c2, src_c6);
|
||||
ILVRL_D2_DP(res_c13, res_c12, src_c10, src_c14);
|
||||
|
||||
src_a63 = __msa_cast_to_vector_double(*(a + 63));
|
||||
src_a63 = (v2f64) __msa_splati_d((v2i64) src_a63, 0);
|
||||
src_a63 = COPY_DOUBLE_TO_VECTOR(*(a + 63));
|
||||
src_a54 = LD_DP(a + 54);
|
||||
src_a55 = (v2f64) __msa_splati_d((v2i64) src_a54, 1);
|
||||
src_a54 = (v2f64) __msa_splati_d((v2i64) src_a54, 0);
|
||||
@@ -478,8 +474,7 @@ static void dsolve_8x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
|
||||
res_c6 -= res_c0 * src_a6;
|
||||
res_c7 -= res_c0 * src_a7;
|
||||
|
||||
src_a9 = __msa_cast_to_vector_double(*(a + 9));
|
||||
src_a9 = (v2f64) __msa_splati_d((v2i64) src_a9, 0);
|
||||
src_a9 = COPY_DOUBLE_TO_VECTOR(*(a + 9));
|
||||
src_a10 = LD_DP(a + 10);
|
||||
src_a11 = (v2f64) __msa_splati_d((v2i64) src_a10, 1);
|
||||
src_a10 = (v2f64) __msa_splati_d((v2i64) src_a10, 0);
|
||||
@@ -515,8 +510,7 @@ static void dsolve_8x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
|
||||
res_c6 -= res_c2 * src_a22;
|
||||
res_c7 -= res_c2 * src_a23;
|
||||
|
||||
src_a27 = __msa_cast_to_vector_double(*(a + 27));
|
||||
src_a27 = (v2f64) __msa_splati_d((v2i64) src_a27, 0);
|
||||
src_a27 = COPY_DOUBLE_TO_VECTOR(*(a + 27));
|
||||
src_a28 = LD_DP(a + 28);
|
||||
src_a29 = (v2f64) __msa_splati_d((v2i64) src_a28, 1);
|
||||
src_a28 = (v2f64) __msa_splati_d((v2i64) src_a28, 0);
|
||||
@@ -553,8 +547,7 @@ static void dsolve_8x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
|
||||
res_c6 -= res_c4 * src_a38;
|
||||
res_c7 -= res_c4 * src_a39;
|
||||
|
||||
src_a45 = __msa_cast_to_vector_double(*(a + 45));
|
||||
src_a45 = (v2f64) __msa_splati_d((v2i64) src_a45, 0);
|
||||
src_a45 = COPY_DOUBLE_TO_VECTOR(*(a + 45));
|
||||
src_a46 = LD_DP(a + 46);
|
||||
src_a47 = (v2f64) __msa_splati_d((v2i64) src_a46, 1);
|
||||
src_a46 = (v2f64) __msa_splati_d((v2i64) src_a46, 0);
|
||||
@@ -563,8 +556,7 @@ static void dsolve_8x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
|
||||
res_c6 -= res_c5 * src_a46;
|
||||
res_c7 -= res_c5 * src_a47;
|
||||
|
||||
src_a63 = __msa_cast_to_vector_double(*(a + 63));
|
||||
src_a63 = (v2f64) __msa_splati_d((v2i64) src_a63, 0);
|
||||
src_a63 = COPY_DOUBLE_TO_VECTOR(*(a + 63));
|
||||
src_a54 = LD_DP(a + 54);
|
||||
src_a55 = (v2f64) __msa_splati_d((v2i64) src_a54, 1);
|
||||
src_a54 = (v2f64) __msa_splati_d((v2i64) src_a54, 0);
|
||||
@@ -786,8 +778,7 @@ static void dsolve_4x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
|
||||
res_c6 -= res_c4 * src_a2;
|
||||
res_c7 -= res_c4 * src_a3;
|
||||
|
||||
src_a5 = __msa_cast_to_vector_double(*(a + 5));
|
||||
src_a5 = (v2f64) __msa_splati_d((v2i64) src_a5, 0);
|
||||
src_a5 = COPY_DOUBLE_TO_VECTOR(*(a + 5));
|
||||
src_a6 = LD_DP(a + 6);
|
||||
src_a7 = (v2f64) __msa_splati_d((v2i64) src_a6, 1);
|
||||
src_a6 = (v2f64) __msa_splati_d((v2i64) src_a6, 0);
|
||||
@@ -803,8 +794,7 @@ static void dsolve_4x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
|
||||
src_a10 = LD_DP(a + 10);
|
||||
src_a11 = (v2f64) __msa_splati_d((v2i64) src_a10, 1);
|
||||
src_a10 = (v2f64) __msa_splati_d((v2i64) src_a10, 0);
|
||||
src_a15 = __msa_cast_to_vector_double(*(a + 15));
|
||||
src_a15 = (v2f64) __msa_splati_d((v2i64) src_a15, 0);
|
||||
src_a15 = COPY_DOUBLE_TO_VECTOR(*(a + 15));
|
||||
|
||||
res_c2 *= src_a10;
|
||||
res_c3 -= res_c2 * src_a11;
|
||||
@@ -881,8 +871,7 @@ static void dsolve_4x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
|
||||
res_c2 -= res_c0 * src_a2;
|
||||
res_c3 -= res_c0 * src_a3;
|
||||
|
||||
src_a5 = __msa_cast_to_vector_double(*(a + 5));
|
||||
src_a5 = (v2f64) __msa_splati_d((v2i64) src_a5, 0);
|
||||
src_a5 = COPY_DOUBLE_TO_VECTOR(*(a + 5));
|
||||
src_a6 = LD_DP(a + 6);
|
||||
src_a7 = (v2f64) __msa_splati_d((v2i64) src_a6, 1);
|
||||
src_a6 = (v2f64) __msa_splati_d((v2i64) src_a6, 0);
|
||||
@@ -894,8 +883,7 @@ static void dsolve_4x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
|
||||
src_a10 = LD_DP(a + 10);
|
||||
src_a11 = (v2f64) __msa_splati_d((v2i64) src_a10, 1);
|
||||
src_a10 = (v2f64) __msa_splati_d((v2i64) src_a10, 0);
|
||||
src_a15 = __msa_cast_to_vector_double(*(a + 15));
|
||||
src_a15 = (v2f64) __msa_splati_d((v2i64) src_a15, 0);
|
||||
src_a15 = COPY_DOUBLE_TO_VECTOR(*(a + 15));
|
||||
|
||||
res_c2 *= src_a10;
|
||||
res_c3 -= res_c2 * src_a11;
|
||||
|
||||
@@ -161,16 +161,14 @@ void dsolve_8x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
|
||||
src_b2 = LD_DP(b + 2);
|
||||
src_b3 = (v2f64) __msa_splati_d((v2i64) src_b2, 1);
|
||||
src_b2 = (v2f64) __msa_splati_d((v2i64) src_b2, 0);
|
||||
src_b5 = __msa_cast_to_vector_double(*(b + 5));
|
||||
src_b5 = (v2f64) __msa_splati_d((v2i64) src_b5, 0);
|
||||
src_b5 = COPY_DOUBLE_TO_VECTOR(*(b + 5));
|
||||
src_b6 = LD_DP(b + 6);
|
||||
src_b7 = (v2f64) __msa_splati_d((v2i64) src_b6, 1);
|
||||
src_b6 = (v2f64) __msa_splati_d((v2i64) src_b6, 0);
|
||||
src_b10 = LD_DP(b + 10);
|
||||
src_b11 = (v2f64) __msa_splati_d((v2i64) src_b10, 1);
|
||||
src_b10 = (v2f64) __msa_splati_d((v2i64) src_b10, 0);
|
||||
src_b15 = __msa_cast_to_vector_double(*(b + 15));
|
||||
src_b15 = (v2f64) __msa_splati_d((v2i64) src_b15, 0);
|
||||
src_b15 = COPY_DOUBLE_TO_VECTOR(*(b + 15));
|
||||
|
||||
src_c0 *= src_b0;
|
||||
src_c1 *= src_b0;
|
||||
@@ -294,8 +292,7 @@ static void dsolve_8x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
|
||||
src_b0 = LD_DP(b + 0);
|
||||
src_b1 = (v2f64) __msa_splati_d((v2i64) src_b0, 1);
|
||||
src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0);
|
||||
src_b3 = __msa_cast_to_vector_double(*(b + 3));
|
||||
src_b3 = (v2f64) __msa_splati_d((v2i64) src_b3, 0);
|
||||
src_b3 = COPY_DOUBLE_TO_VECTOR(*(b + 3));
|
||||
|
||||
src_c0 *= src_b0;
|
||||
src_c1 *= src_b0;
|
||||
@@ -347,8 +344,7 @@ static void dsolve_8x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
|
||||
}
|
||||
}
|
||||
|
||||
src_b0 = __msa_cast_to_vector_double(*b);
|
||||
src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0);
|
||||
src_b0 = COPY_DOUBLE_TO_VECTOR(*b);
|
||||
|
||||
src_c0 *= src_b0;
|
||||
src_c1 *= src_b0;
|
||||
@@ -407,16 +403,14 @@ static void dsolve_4x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
|
||||
src_b2 = LD_DP(b + 2);
|
||||
src_b3 = (v2f64) __msa_splati_d((v2i64) src_b2, 1);
|
||||
src_b2 = (v2f64) __msa_splati_d((v2i64) src_b2, 0);
|
||||
src_b5 = __msa_cast_to_vector_double(*(b + 5));
|
||||
src_b5 = (v2f64) __msa_splati_d((v2i64) src_b5, 0);
|
||||
src_b5 = COPY_DOUBLE_TO_VECTOR(*(b + 5));
|
||||
src_b6 = LD_DP(b + 6);
|
||||
src_b7 = (v2f64) __msa_splati_d((v2i64) src_b6, 1);
|
||||
src_b6 = (v2f64) __msa_splati_d((v2i64) src_b6, 0);
|
||||
src_b10 = LD_DP(b + 10);
|
||||
src_b11 = (v2f64) __msa_splati_d((v2i64) src_b10, 1);
|
||||
src_b10 = (v2f64) __msa_splati_d((v2i64) src_b10, 0);
|
||||
src_b15 = __msa_cast_to_vector_double(*(b + 15));
|
||||
src_b15 = (v2f64) __msa_splati_d((v2i64) src_b15, 0);
|
||||
src_b15 = COPY_DOUBLE_TO_VECTOR(*(b + 15));
|
||||
|
||||
src_c0 *= src_b0;
|
||||
src_c1 *= src_b0;
|
||||
@@ -490,8 +484,7 @@ static void dsolve_4x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
|
||||
src_b0 = LD_DP(b + 0);
|
||||
src_b1 = (v2f64) __msa_splati_d((v2i64) src_b0, 1);
|
||||
src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0);
|
||||
src_b3 = __msa_cast_to_vector_double(*(b + 3));
|
||||
src_b3 = (v2f64) __msa_splati_d((v2i64) src_b3, 0);
|
||||
src_b3 = COPY_DOUBLE_TO_VECTOR(*(b + 3));
|
||||
|
||||
src_c0 *= src_b0;
|
||||
src_c1 *= src_b0;
|
||||
|
||||
@@ -168,11 +168,9 @@ void dsolve_8x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
|
||||
src_b8 = LD_DP(b + 8);
|
||||
src_b9 = (v2f64) __msa_splati_d((v2i64) src_b8, 1);
|
||||
src_b8 = (v2f64) __msa_splati_d((v2i64) src_b8, 0);
|
||||
src_b10 = __msa_cast_to_vector_double(*(b + 10));
|
||||
src_b10 = (v2f64) __msa_splati_d((v2i64) src_b10, 0);
|
||||
src_b10 = COPY_DOUBLE_TO_VECTOR(*(b + 10));
|
||||
|
||||
src_b0 = __msa_cast_to_vector_double(*(b + 0));
|
||||
src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0);
|
||||
src_b0 = COPY_DOUBLE_TO_VECTOR(*(b + 0));
|
||||
src_b4 = LD_DP(b + 4);
|
||||
src_b5 = (v2f64) __msa_splati_d((v2i64) src_b4, 1);
|
||||
src_b4 = (v2f64) __msa_splati_d((v2i64) src_b4, 0);
|
||||
@@ -298,8 +296,7 @@ static void dsolve_8x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
|
||||
a -= 16;
|
||||
b -= 4;
|
||||
|
||||
src_b0 = __msa_cast_to_vector_double(*(b + 0));
|
||||
src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0);
|
||||
src_b0 = COPY_DOUBLE_TO_VECTOR(*(b + 0));
|
||||
src_b2 = LD_DP(b + 2);
|
||||
src_b3 = (v2f64) __msa_splati_d((v2i64) src_b2, 1);
|
||||
src_b2 = (v2f64) __msa_splati_d((v2i64) src_b2, 0);
|
||||
@@ -377,8 +374,7 @@ static void dsolve_8x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
|
||||
a -= 8;
|
||||
b -= 1;
|
||||
|
||||
src_b0 = __msa_cast_to_vector_double(*b);
|
||||
src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0);
|
||||
src_b0 = COPY_DOUBLE_TO_VECTOR(*b);
|
||||
|
||||
src_c0 *= src_b0;
|
||||
src_c1 *= src_b0;
|
||||
@@ -445,11 +441,9 @@ static void dsolve_4x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
|
||||
src_b8 = LD_DP(b + 8);
|
||||
src_b9 = (v2f64) __msa_splati_d((v2i64) src_b8, 1);
|
||||
src_b8 = (v2f64) __msa_splati_d((v2i64) src_b8, 0);
|
||||
src_b10 = __msa_cast_to_vector_double(*(b + 10));
|
||||
src_b10 = (v2f64) __msa_splati_d((v2i64) src_b10, 0);
|
||||
src_b10 = COPY_DOUBLE_TO_VECTOR(*(b + 10));
|
||||
|
||||
src_b0 = __msa_cast_to_vector_double(*(b + 0));
|
||||
src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0);
|
||||
src_b0 = COPY_DOUBLE_TO_VECTOR(*(b + 0));
|
||||
src_b4 = LD_DP(b + 4);
|
||||
src_b5 = (v2f64) __msa_splati_d((v2i64) src_b4, 1);
|
||||
src_b4 = (v2f64) __msa_splati_d((v2i64) src_b4, 0);
|
||||
@@ -527,8 +521,7 @@ static void dsolve_4x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
|
||||
a -= 8;
|
||||
b -= 4;
|
||||
|
||||
src_b0 = __msa_cast_to_vector_double(*(b + 0));
|
||||
src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0);
|
||||
src_b0 = COPY_DOUBLE_TO_VECTOR(*(b + 0));
|
||||
src_b2 = LD_DP(b + 2);
|
||||
src_b3 = (v2f64) __msa_splati_d((v2i64) src_b2, 1);
|
||||
src_b2 = (v2f64) __msa_splati_d((v2i64) src_b2, 0);
|
||||
|
||||
@@ -63,16 +63,12 @@ inline static void prefetch_load_lf(unsigned char *src)
|
||||
#define ST_DP(...) ST_D(v2f64, __VA_ARGS__)
|
||||
|
||||
#define COPY_FLOAT_TO_VECTOR(a) ( { \
|
||||
v4f32 out; \
|
||||
out = __msa_cast_to_vector_float(a); \
|
||||
out = (v4f32) __msa_splati_w((v4i32) out, 0); \
|
||||
v4f32 out = {a, a, a, a}; \
|
||||
out; \
|
||||
} )
|
||||
|
||||
#define COPY_DOUBLE_TO_VECTOR(a) ( { \
|
||||
v2f64 out; \
|
||||
out = __msa_cast_to_vector_double(a); \
|
||||
out = (v2f64) __msa_splati_d((v2i64) out, 0); \
|
||||
v2f64 out = {a, a}; \
|
||||
out; \
|
||||
} )
|
||||
|
||||
|
||||
@@ -48,11 +48,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,
|
||||
{
|
||||
if ((0 == c) && (0 == s))
|
||||
{
|
||||
v4f32 zero = __msa_cast_to_vector_float(0);
|
||||
zero = (v4f32) __msa_insert_w((v4i32) zero, 0, 0.0);
|
||||
zero = (v4f32) __msa_insert_w((v4i32) zero, 1, 0.0);
|
||||
zero = (v4f32) __msa_insert_w((v4i32) zero, 2, 0.0);
|
||||
zero = (v4f32) __msa_insert_w((v4i32) zero, 3, 0.0);
|
||||
v4f32 zero = {0.0, 0.0, 0.0, 0.0};
|
||||
|
||||
/* process 4 floats */
|
||||
for (j = (n >> 2); j--;)
|
||||
|
||||
@@ -44,11 +44,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x,
|
||||
{
|
||||
if (0.0 == da)
|
||||
{
|
||||
v4f32 zero_v = __msa_cast_to_vector_float(0);
|
||||
zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 0, 0.0);
|
||||
zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 1, 0.0);
|
||||
zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 2, 0.0);
|
||||
zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 3, 0.0);
|
||||
v4f32 zero_v = {0.0, 0.0, 0.0, 0.0};
|
||||
|
||||
for (i = (n >> 6); i--;)
|
||||
{
|
||||
|
||||
@@ -198,7 +198,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3,
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
else if ((inc_x != 0) && (inc_y != 0))
|
||||
{
|
||||
for (i = (n >> 3); i--;)
|
||||
{
|
||||
@@ -262,6 +262,33 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3,
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if (inc_x == inc_y)
|
||||
{
|
||||
if (n & 1)
|
||||
{
|
||||
x0 = *srcx;
|
||||
*srcx = *srcy;
|
||||
*srcy = x0;
|
||||
}
|
||||
else
|
||||
return (0);
|
||||
}
|
||||
else
|
||||
{
|
||||
BLASLONG ix = 0, iy = 0;
|
||||
while (i < n)
|
||||
{
|
||||
x0 = srcx[ix];
|
||||
srcx[ix] = srcy[iy];
|
||||
srcy[iy] = x0;
|
||||
ix += inc_x;
|
||||
iy += inc_y;
|
||||
i++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return (0);
|
||||
}
|
||||
|
||||
@@ -56,11 +56,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#if !defined(XCONJ)
|
||||
#define OP0 +=
|
||||
#define OP1 -=
|
||||
#define OP2 -=
|
||||
#define OP2 +=
|
||||
#else
|
||||
#define OP0 -=
|
||||
#define OP1 -=
|
||||
#define OP2 +=
|
||||
#define OP2 -=
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
@@ -34,14 +34,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#undef OP3
|
||||
#undef OP4
|
||||
|
||||
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
||||
#define OP0 -=
|
||||
#define OP1 +=
|
||||
#define OP2 +=
|
||||
#if !defined(CONJ)
|
||||
#if !defined(XCONJ)
|
||||
#define OP0 -=
|
||||
#define OP1 +=
|
||||
#define OP2 +=
|
||||
#else
|
||||
#define OP0 +=
|
||||
#define OP1 +=
|
||||
#define OP2 -=
|
||||
#endif
|
||||
#else
|
||||
#define OP0 +=
|
||||
#define OP1 +=
|
||||
#define OP2 -=
|
||||
#if !defined(XCONJ)
|
||||
#define OP0 +=
|
||||
#define OP1 -=
|
||||
#define OP2 +=
|
||||
#else
|
||||
#define OP0 -=
|
||||
#define OP1 -=
|
||||
#define OP2 -=
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#define ZGEMV_T_8x1() \
|
||||
|
||||
@@ -49,9 +49,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
|
||||
{
|
||||
if ((0.0 == da_r) && (0.0 == da_i))
|
||||
{
|
||||
v2f64 zero_v = __msa_cast_to_vector_double(0);
|
||||
zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 0, 0.0);
|
||||
zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 1, 0.0);
|
||||
v2f64 zero_v = {0.0, 0.0};
|
||||
|
||||
for (i = (n >> 4); i--;)
|
||||
{
|
||||
@@ -475,9 +473,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
|
||||
|
||||
if ((0.0 == da_r) && (0.0 == da_i))
|
||||
{
|
||||
v2f64 zero_v = __msa_cast_to_vector_double(0);
|
||||
zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 0, 0.0);
|
||||
zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 1, 0.0);
|
||||
v2f64 zero_v = {0.0, 0.0};
|
||||
|
||||
for (i = (n >> 4); i--;)
|
||||
{
|
||||
|
||||
@@ -1,64 +0,0 @@
|
||||
SAXPYKERNEL=axpy_loongson3a.S
|
||||
DAXPYKERNEL=daxpy_loongson3a_simd.S
|
||||
|
||||
SGEMVNKERNEL = gemv_n_loongson3a.c
|
||||
SGEMVTKERNEL = gemv_t_loongson3a.c
|
||||
DGEMVNKERNEL = gemv_n_loongson3a.c
|
||||
DGEMVTKERNEL = gemv_t_loongson3a.c
|
||||
CGEMVNKERNEL = zgemv_n_loongson3a.c
|
||||
CGEMVTKERNEL = zgemv_t_loongson3a.c
|
||||
ZGEMVNKERNEL = zgemv_n_loongson3a.c
|
||||
ZGEMVTKERNEL = zgemv_t_loongson3a.c
|
||||
|
||||
STRMMKERNEL = ../generic/trmmkernel_2x2.c
|
||||
DTRMMKERNEL = ../generic/trmmkernel_2x2.c
|
||||
CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
|
||||
ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
|
||||
|
||||
SGEMMKERNEL = ../generic/gemmkernel_2x2.c
|
||||
SGEMMONCOPY = ../generic/gemm_ncopy_2.c
|
||||
SGEMMOTCOPY = ../generic/gemm_tcopy_2.c
|
||||
SGEMMONCOPYOBJ = sgemm_oncopy.o
|
||||
SGEMMOTCOPYOBJ = sgemm_otcopy.o
|
||||
|
||||
DGEMMKERNEL = ../generic/gemmkernel_2x2.c
|
||||
DGEMMONCOPY = ../generic/gemm_ncopy_2.c
|
||||
DGEMMOTCOPY = ../generic/gemm_tcopy_2.c
|
||||
DGEMMONCOPYOBJ = dgemm_oncopy.o
|
||||
DGEMMOTCOPYOBJ = dgemm_otcopy.o
|
||||
|
||||
CGEMMKERNEL = ../generic/zgemmkernel_2x2.c
|
||||
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
||||
CGEMMONCOPYOBJ = cgemm_oncopy.o
|
||||
CGEMMOTCOPYOBJ = cgemm_otcopy.o
|
||||
|
||||
ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c
|
||||
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
||||
ZGEMMONCOPYOBJ = zgemm_oncopy.o
|
||||
ZGEMMOTCOPYOBJ = zgemm_otcopy.o
|
||||
|
||||
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -16,32 +16,32 @@ SGEMMINCOPY = ../generic/gemm_ncopy_8.c
|
||||
SGEMMITCOPY = ../generic/gemm_tcopy_8.c
|
||||
SGEMMONCOPY = ../generic/gemm_ncopy_4.c
|
||||
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
|
||||
SGEMMINCOPYOBJ = sgemm_incopy.o
|
||||
SGEMMITCOPYOBJ = sgemm_itcopy.o
|
||||
SGEMMONCOPYOBJ = sgemm_oncopy.o
|
||||
SGEMMOTCOPYOBJ = sgemm_otcopy.o
|
||||
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
DGEMMKERNEL = dgemm_kernel_loongson3a_4x4.S
|
||||
DGEMMONCOPY = ../generic/gemm_ncopy_4.c
|
||||
DGEMMOTCOPY = ../generic/gemm_tcopy_4.c
|
||||
DGEMMONCOPYOBJ = dgemm_oncopy.o
|
||||
DGEMMOTCOPYOBJ = dgemm_otcopy.o
|
||||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
CGEMMKERNEL = cgemm_kernel_loongson3a_4x2_ps.S
|
||||
CGEMMINCOPY = ../generic/zgemm_ncopy_4.c
|
||||
CGEMMITCOPY = ../generic/zgemm_tcopy_4.c
|
||||
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
||||
CGEMMINCOPYOBJ = cgemm_incopy.o
|
||||
CGEMMITCOPYOBJ = cgemm_itcopy.o
|
||||
CGEMMONCOPYOBJ = cgemm_oncopy.o
|
||||
CGEMMOTCOPYOBJ = cgemm_otcopy.o
|
||||
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
ZGEMMKERNEL = zgemm_kernel_loongson3a_2x2.S
|
||||
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
||||
ZGEMMONCOPYOBJ = zgemm_oncopy.o
|
||||
ZGEMMOTCOPYOBJ = zgemm_otcopy.o
|
||||
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
@@ -64,6 +64,3 @@ ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
DSDOTKERNEL = ../mips/dot.c
|
||||
|
||||
|
||||
|
||||
192
kernel/mips64/KERNEL.LOONGSON3R4
Normal file
192
kernel/mips64/KERNEL.LOONGSON3R4
Normal file
@@ -0,0 +1,192 @@
|
||||
ifdef HAVE_MSA
|
||||
SAXPYKERNEL = ../mips/saxpy_msa.c
|
||||
DAXPYKERNEL = ../mips/daxpy_msa.c
|
||||
CAXPYKERNEL = ../mips/caxpy_msa.c
|
||||
ZAXPYKERNEL = ../mips/zaxpy_msa.c
|
||||
else
|
||||
SAXPYKERNEL = axpy_loongson3a.S
|
||||
DAXPYKERNEL = daxpy_loongson3a_simd.S
|
||||
endif
|
||||
|
||||
ifdef HAVE_MSA
|
||||
SCOPYKERNEL = ../mips/scopy_msa.c
|
||||
DCOPYKERNEL = ../mips/dcopy_msa.c
|
||||
CCOPYKERNEL = ../mips/ccopy_msa.c
|
||||
ZCOPYKERNEL = ../mips/zcopy_msa.c
|
||||
endif
|
||||
|
||||
ifdef HAVE_MSA
|
||||
SDOTKERNEL = ../mips/sdot_msa.c
|
||||
DDOTKERNEL = ../mips/ddot_msa.c
|
||||
CDOTKERNEL = ../mips/cdot_msa.c
|
||||
ZDOTKERNEL = ../mips/zdot_msa.c
|
||||
endif
|
||||
DSDOTKERNEL = ../mips/dot.c
|
||||
|
||||
ifdef HAVE_MSA
|
||||
SROTKERNEL = ../mips/srot_msa.c
|
||||
DROTKERNEL = ../mips/drot_msa.c
|
||||
CROTKERNEL = ../mips/crot_msa.c
|
||||
ZROTKERNEL = ../mips/zrot_msa.c
|
||||
endif
|
||||
|
||||
ifdef HAVE_MSA
|
||||
SSCALKERNEL = ../mips/sscal_msa.c
|
||||
DSCALKERNEL = ../mips/dscal_msa.c
|
||||
CSCALKERNEL = ../mips/cscal_msa.c
|
||||
ZSCALKERNEL = ../mips/zscal_msa.c
|
||||
endif
|
||||
|
||||
ifdef HAVE_MSA
|
||||
SGEMVNKERNEL = ../mips/sgemv_n_msa.c
|
||||
DGEMVNKERNEL = ../mips/dgemv_n_msa.c
|
||||
SGEMVTKERNEL = ../mips/sgemv_t_msa.c
|
||||
DGEMVTKERNEL = ../mips/dgemv_t_msa.c
|
||||
CGEMVNKERNEL = ../mips/cgemv_n_msa.c
|
||||
CGEMVTKERNEL = ../mips/cgemv_t_msa.c
|
||||
ZGEMVNKERNEL = ../mips/zgemv_n_msa.c
|
||||
ZGEMVTKERNEL = ../mips/zgemv_t_msa.c
|
||||
else
|
||||
SGEMVNKERNEL = gemv_n_loongson3a.c
|
||||
SGEMVTKERNEL = gemv_t_loongson3a.c
|
||||
DGEMVNKERNEL = gemv_n_loongson3a.c
|
||||
DGEMVTKERNEL = gemv_t_loongson3a.c
|
||||
CGEMVNKERNEL = zgemv_n_loongson3a.c
|
||||
CGEMVTKERNEL = zgemv_t_loongson3a.c
|
||||
ZGEMVNKERNEL = zgemv_n_loongson3a.c
|
||||
ZGEMVTKERNEL = zgemv_t_loongson3a.c
|
||||
endif
|
||||
|
||||
ifdef HAVE_MSA
|
||||
SASUMKERNEL = ../mips/sasum_msa.c
|
||||
DASUMKERNEL = ../mips/dasum_msa.c
|
||||
CASUMKERNEL = ../mips/casum_msa.c
|
||||
ZASUMKERNEL = ../mips/zasum_msa.c
|
||||
endif
|
||||
|
||||
ifdef HAVE_MSA
|
||||
SSWAPKERNEL = ../mips/sswap_msa.c
|
||||
DSWAPKERNEL = ../mips/dswap_msa.c
|
||||
CSWAPKERNEL = ../mips/cswap_msa.c
|
||||
ZSWAPKERNEL = ../mips/zswap_msa.c
|
||||
endif
|
||||
|
||||
ifdef HAVE_MSA
|
||||
SGEMMKERNEL = ../mips/sgemm_kernel_8x8_msa.c
|
||||
SGEMMONCOPY = ../mips/sgemm_ncopy_8_msa.c
|
||||
SGEMMOTCOPY = ../mips/sgemm_tcopy_8_msa.c
|
||||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
else
|
||||
SGEMMKERNEL = sgemm_kernel_8x4_ps.S
|
||||
SGEMMINCOPY = ../generic/gemm_ncopy_8.c
|
||||
SGEMMITCOPY = ../generic/gemm_tcopy_8.c
|
||||
SGEMMONCOPY = ../generic/gemm_ncopy_4.c
|
||||
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
|
||||
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
|
||||
ifdef HAVE_MSA
|
||||
DGEMMKERNEL = ../mips/dgemm_kernel_8x4_msa.c
|
||||
DGEMMINCOPY = ../mips/dgemm_ncopy_8_msa.c
|
||||
DGEMMITCOPY = ../mips/dgemm_tcopy_8_msa.c
|
||||
DGEMMONCOPY = ../mips/dgemm_ncopy_4_msa.c
|
||||
DGEMMOTCOPY = ../mips/dgemm_tcopy_4_msa.c
|
||||
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
else
|
||||
DGEMMKERNEL = dgemm_kernel_loongson3a_4x4.S
|
||||
DGEMMONCOPY = ../generic/gemm_ncopy_4.c
|
||||
DGEMMOTCOPY = ../generic/gemm_tcopy_4.c
|
||||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
|
||||
ifdef HAVE_MSA
|
||||
CGEMMKERNEL = ../mips/cgemm_kernel_8x4_msa.c
|
||||
CGEMMINCOPY = ../mips/cgemm_ncopy_8_msa.c
|
||||
CGEMMITCOPY = ../mips/cgemm_tcopy_8_msa.c
|
||||
CGEMMONCOPY = ../mips/cgemm_ncopy_4_msa.c
|
||||
CGEMMOTCOPY = ../mips/cgemm_tcopy_4_msa.c
|
||||
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
else
|
||||
CGEMMKERNEL = cgemm_kernel_loongson3a_4x2_ps.S
|
||||
CGEMMINCOPY = ../generic/zgemm_ncopy_4.c
|
||||
CGEMMITCOPY = ../generic/zgemm_tcopy_4.c
|
||||
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
||||
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
|
||||
ifdef HAVE_MSA
|
||||
ZGEMMKERNEL = ../mips/zgemm_kernel_4x4_msa.c
|
||||
ZGEMMONCOPY = ../mips/zgemm_ncopy_4_msa.c
|
||||
ZGEMMOTCOPY = ../mips/zgemm_tcopy_4_msa.c
|
||||
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
else
|
||||
ZGEMMKERNEL = zgemm_kernel_loongson3a_2x2.S
|
||||
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
||||
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
|
||||
ifdef HAVE_MSA
|
||||
STRSMKERNEL_LN = ../mips/strsm_kernel_LN_8x8_msa.c
|
||||
STRSMKERNEL_LT = ../mips/strsm_kernel_LT_8x8_msa.c
|
||||
STRSMKERNEL_RN = ../mips/strsm_kernel_RN_8x8_msa.c
|
||||
STRSMKERNEL_RT = ../mips/strsm_kernel_RT_8x8_msa.c
|
||||
else
|
||||
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
endif
|
||||
|
||||
ifdef HAVE_MSA
|
||||
DTRSMKERNEL_LN = ../mips/dtrsm_kernel_LN_8x4_msa.c
|
||||
DTRSMKERNEL_LT = ../mips/dtrsm_kernel_LT_8x4_msa.c
|
||||
DTRSMKERNEL_RN = ../mips/dtrsm_kernel_RN_8x4_msa.c
|
||||
DTRSMKERNEL_RT = ../mips/dtrsm_kernel_RT_8x4_msa.c
|
||||
else
|
||||
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
endif
|
||||
|
||||
ifdef HAVE_MSA
|
||||
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
else
|
||||
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
endif
|
||||
|
||||
ifdef HAVE_MSA
|
||||
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
else
|
||||
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
endif
|
||||
@@ -63,15 +63,15 @@ ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
STRSMKERNEL_LN = trsm_kernel_LN_power10.c
|
||||
STRSMKERNEL_LT = trsm_kernel_LT_power10.c
|
||||
STRSMKERNEL_RN = trsm_kernel_RN_power10.c
|
||||
STRSMKERNEL_RT = trsm_kernel_RT_power10.c
|
||||
|
||||
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
DTRSMKERNEL_LN = trsm_kernel_LN_power10.c
|
||||
DTRSMKERNEL_LT = trsm_kernel_LT_power10.c
|
||||
DTRSMKERNEL_RN = trsm_kernel_RN_power10.c
|
||||
DTRSMKERNEL_RT = trsm_kernel_RT_power10.c
|
||||
|
||||
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
@@ -154,11 +154,7 @@ ZCOPYKERNEL = zcopy_power10.c
|
||||
SDOTKERNEL = sdot_power10.c
|
||||
DDOTKERNEL = ddot_power10.c
|
||||
DSDOTKERNEL = sdot_power10.c
|
||||
ifneq ($(GCCVERSIONGTEQ9),1)
|
||||
CDOTKERNEL = cdot_power9.S
|
||||
else
|
||||
CDOTKERNEL = cdot.c
|
||||
endif
|
||||
ZDOTKERNEL = zdot.c
|
||||
#
|
||||
SNRM2KERNEL = ../arm/nrm2.c
|
||||
@@ -173,8 +169,13 @@ ZROTKERNEL = zrot.c
|
||||
#
|
||||
SSCALKERNEL = sscal.c
|
||||
DSCALKERNEL = dscal.c
|
||||
ifeq ($(C_COMPILER), PGI)
|
||||
CSCALKERNEL = ../arm/zscal.c
|
||||
ZSCALKERNEL = ../arm/zscal.c
|
||||
else
|
||||
CSCALKERNEL = zscal.c
|
||||
ZSCALKERNEL = zscal.c
|
||||
endif
|
||||
#
|
||||
SSWAPKERNEL = sswap.c
|
||||
DSWAPKERNEL = dswap.c
|
||||
|
||||
@@ -242,8 +242,13 @@ ZROTKERNEL = zrot.c
|
||||
#
|
||||
SSCALKERNEL = sscal.c
|
||||
DSCALKERNEL = dscal.c
|
||||
ifeq ($(C_COMPILER), PGI)
|
||||
CSCALKERNEL = ../arm/zscal.c
|
||||
ZSCALKERNEL = ../arm/zscal.c
|
||||
else
|
||||
CSCALKERNEL = zscal.c
|
||||
ZSCALKERNEL = zscal.c
|
||||
endif
|
||||
#
|
||||
SSWAPKERNEL = sswap.c
|
||||
DSWAPKERNEL = dswap.c
|
||||
|
||||
@@ -52,15 +52,15 @@ ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
STRSMKERNEL_LN = trsm_kernel_LN_power10.c
|
||||
STRSMKERNEL_LT = trsm_kernel_LT_power10.c
|
||||
STRSMKERNEL_RN = trsm_kernel_RN_power10.c
|
||||
STRSMKERNEL_RT = trsm_kernel_RT_power10.c
|
||||
|
||||
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
DTRSMKERNEL_LN = trsm_kernel_LN_power10.c
|
||||
DTRSMKERNEL_LT = dtrsm_kernel_LT_16x4_power8.S
|
||||
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
DTRSMKERNEL_RN = trsm_kernel_RN_power10.c
|
||||
DTRSMKERNEL_RT = trsm_kernel_RT_power10.c
|
||||
|
||||
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
@@ -166,8 +166,13 @@ ZROTKERNEL = zrot.c
|
||||
#
|
||||
SSCALKERNEL = sscal.c
|
||||
DSCALKERNEL = dscal.c
|
||||
ifeq ($(C_COMPILER), PGI)
|
||||
CSCALKERNEL = ../arm/zscal.c
|
||||
ZSCALKERNEL = ../arm/zscal.c
|
||||
else
|
||||
CSCALKERNEL = zscal.c
|
||||
ZSCALKERNEL = zscal.c
|
||||
endif
|
||||
#
|
||||
SSWAPKERNEL = sswap.c
|
||||
DSWAPKERNEL = dswap.c
|
||||
|
||||
@@ -112,10 +112,14 @@ static void caxpy_kernel_8 (long n, float *x, float *y,
|
||||
"xvmaddasp 38, 58, 33 \n\t"
|
||||
"xvmaddasp 39, 59, 33 \n\t"
|
||||
|
||||
"stxvp 48, 0(%4) \n\t"
|
||||
"stxvp 50, 32(%4) \n\t"
|
||||
"stxvp 34, 64(%4) \n\t"
|
||||
"stxvp 38, 96(%4) \n\t"
|
||||
"stxv 49, 0(%4) \n\t"
|
||||
"stxv 48, 16(%4) \n\t"
|
||||
"stxv 51, 32(%4) \n\t"
|
||||
"stxv 50, 48(%4) \n\t"
|
||||
"stxv 35, 64(%4) \n\t"
|
||||
"stxv 34, 80(%4) \n\t"
|
||||
"stxv 39, 96(%4) \n\t"
|
||||
"stxv 38, 112(%4) \n\t"
|
||||
|
||||
"addi %4, %4, 128 \n\t"
|
||||
"xxperm 52, 40, %x10 \n\t" // exchange real and imag part
|
||||
@@ -159,10 +163,14 @@ static void caxpy_kernel_8 (long n, float *x, float *y,
|
||||
"xvmaddasp 38, 58, 33 \n\t"
|
||||
"xvmaddasp 39, 59, 33 \n\t"
|
||||
|
||||
"stxvp 48, 0(%4) \n\t"
|
||||
"stxvp 50, 32(%4) \n\t"
|
||||
"stxvp 34, 64(%4) \n\t"
|
||||
"stxvp 38, 96(%4) \n\t"
|
||||
"stxv 49, 0(%4) \n\t"
|
||||
"stxv 48, 16(%4) \n\t"
|
||||
"stxv 51, 32(%4) \n\t"
|
||||
"stxv 50, 48(%4) \n\t"
|
||||
"stxv 35, 64(%4) \n\t"
|
||||
"stxv 34, 80(%4) \n\t"
|
||||
"stxv 39, 96(%4) \n\t"
|
||||
"stxv 38, 112(%4) \n\t"
|
||||
|
||||
"#n=%1 x=%5=%2 y=%0=%3 alpha=(%7,%8) mvecp=%6=%9 ytmp=%4\n"
|
||||
:
|
||||
|
||||
115
kernel/power/ccopy_microk_power10.c
Normal file
115
kernel/power/ccopy_microk_power10.c
Normal file
@@ -0,0 +1,115 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2020, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define HAVE_KERNEL 1
|
||||
|
||||
static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y)
|
||||
{
|
||||
__asm__
|
||||
(
|
||||
"lxvp 32, 0(%2) \n\t"
|
||||
"lxvp 34, 32(%2) \n\t"
|
||||
"lxvp 36, 64(%2) \n\t"
|
||||
"lxvp 38, 96(%2) \n\t"
|
||||
"lxvp 40, 128(%2) \n\t"
|
||||
"lxvp 42, 160(%2) \n\t"
|
||||
"lxvp 44, 192(%2) \n\t"
|
||||
"lxvp 46, 224(%2) \n\t"
|
||||
|
||||
"addi %2, %2, 256 \n\t"
|
||||
"addic. %1, %1, -32 \n\t"
|
||||
"ble two%= \n\t"
|
||||
|
||||
".align 5 \n"
|
||||
"one%=: \n\t"
|
||||
|
||||
"stxv 33, 0(%3) \n\t"
|
||||
"stxv 32, 16(%3) \n\t"
|
||||
"stxv 35, 32(%3) \n\t"
|
||||
"stxv 34, 48(%3) \n\t"
|
||||
"stxv 37, 64(%3) \n\t"
|
||||
"stxv 36, 80(%3) \n\t"
|
||||
"stxv 39, 96(%3) \n\t"
|
||||
"stxv 38, 112(%3) \n\t"
|
||||
"lxvp 32, 0(%2) \n\t"
|
||||
"lxvp 34, 32(%2) \n\t"
|
||||
"lxvp 36, 64(%2) \n\t"
|
||||
"lxvp 38, 96(%2) \n\t"
|
||||
|
||||
"stxv 41, 128(%3) \n\t"
|
||||
"stxv 40, 144(%3) \n\t"
|
||||
"stxv 43, 160(%3) \n\t"
|
||||
"stxv 42, 176(%3) \n\t"
|
||||
"stxv 45, 192(%3) \n\t"
|
||||
"stxv 44, 208(%3) \n\t"
|
||||
"stxv 47, 224(%3) \n\t"
|
||||
"stxv 46, 240(%3) \n\t"
|
||||
"lxvp 40, 128(%2) \n\t"
|
||||
"lxvp 42, 160(%2) \n\t"
|
||||
"lxvp 44, 192(%2) \n\t"
|
||||
"lxvp 46, 224(%2) \n\t"
|
||||
|
||||
|
||||
"addi %3, %3, 256 \n\t"
|
||||
"addi %2, %2, 256 \n\t"
|
||||
|
||||
"addic. %1, %1, -32 \n\t"
|
||||
"bgt one%= \n"
|
||||
|
||||
"two%=: \n\t"
|
||||
|
||||
"stxv 33, 0(%3) \n\t"
|
||||
"stxv 32, 16(%3) \n\t"
|
||||
"stxv 35, 32(%3) \n\t"
|
||||
"stxv 34, 48(%3) \n\t"
|
||||
"stxv 37, 64(%3) \n\t"
|
||||
"stxv 36, 80(%3) \n\t"
|
||||
"stxv 39, 96(%3) \n\t"
|
||||
"stxv 38, 112(%3) \n\t"
|
||||
"stxv 41, 128(%3) \n\t"
|
||||
"stxv 40, 144(%3) \n\t"
|
||||
"stxv 43, 160(%3) \n\t"
|
||||
"stxv 42, 176(%3) \n\t"
|
||||
"stxv 45, 192(%3) \n\t"
|
||||
"stxv 44, 208(%3) \n\t"
|
||||
"stxv 47, 224(%3) \n\t"
|
||||
"stxv 46, 240(%3) \n\t"
|
||||
|
||||
"#n=%1 x=%4=%2 y=%0=%3"
|
||||
:
|
||||
"=m" (*y),
|
||||
"+r" (n), // 1
|
||||
"+b" (x), // 2
|
||||
"+b" (y) // 3
|
||||
:
|
||||
"m" (*x)
|
||||
:
|
||||
"cr0",
|
||||
"vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
|
||||
"vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47"
|
||||
);
|
||||
}
|
||||
@@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#include "common.h"
|
||||
|
||||
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||
#include "copy_microk_power10.c"
|
||||
#include "ccopy_microk_power10.c"
|
||||
#endif
|
||||
|
||||
#ifndef HAVE_KERNEL
|
||||
@@ -86,7 +86,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
||||
if ( (inc_x == 1) && (inc_y == 1 ))
|
||||
{
|
||||
|
||||
BLASLONG n1 = n & -64;
|
||||
BLASLONG n1 = n & -32;
|
||||
if ( n1 > 0 )
|
||||
{
|
||||
copy_kernel(n1, x, y);
|
||||
|
||||
@@ -28,6 +28,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#else
|
||||
|
||||
#include "common.h"
|
||||
#if defined(POWER10)
|
||||
#include "cdot_microk_power10.c"
|
||||
#else
|
||||
#ifndef HAVE_KERNEL_8
|
||||
#include <altivec.h>
|
||||
|
||||
@@ -99,6 +102,7 @@ static void cdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, float *dot)
|
||||
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {
|
||||
@@ -116,7 +120,11 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA
|
||||
|
||||
if ((inc_x == 1) && (inc_y == 1)) {
|
||||
|
||||
#if defined(POWER10)
|
||||
BLASLONG n1 = n & -16;
|
||||
#else
|
||||
BLASLONG n1 = n & -8;
|
||||
#endif
|
||||
BLASLONG j=0;
|
||||
|
||||
if (n1){
|
||||
|
||||
177
kernel/power/cdot_microk_power10.c
Normal file
177
kernel/power/cdot_microk_power10.c
Normal file
@@ -0,0 +1,177 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2021, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define HAVE_KERNEL_8 1
|
||||
|
||||
static void cdot_kernel_8 (long n, float *x, float *y, float *dot)
|
||||
{
|
||||
__vector unsigned char mask = { 11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4};
|
||||
__asm__
|
||||
(
|
||||
"dcbt 0, %2 \n\t"
|
||||
"dcbt 0, %3 \n\t"
|
||||
|
||||
"xxlxor 32, 32, 32 \n\t"
|
||||
"xxlxor 33, 33, 33 \n\t"
|
||||
"xxlxor 34, 34, 34 \n\t"
|
||||
"xxlxor 35, 35, 35 \n\t"
|
||||
"xxlxor 36, 36, 36 \n\t"
|
||||
"xxlxor 37, 37, 37 \n\t"
|
||||
"xxlxor 38, 38, 38 \n\t"
|
||||
"xxlxor 39, 39, 39 \n\t"
|
||||
|
||||
"lxvp 40, 0(%2) \n\t"
|
||||
"lxvp 42, 32(%2) \n\t"
|
||||
"lxvp 44, 64(%2) \n\t"
|
||||
"lxvp 46, 96(%2) \n\t"
|
||||
"lxvp 48, 0(%3) \n\t"
|
||||
"lxvp 50, 32(%3) \n\t"
|
||||
"lxvp 52, 64(%3) \n\t"
|
||||
"lxvp 54, 96(%3) \n\t"
|
||||
|
||||
"xxperm 56, 48, %x7 \n\t"
|
||||
"xxperm 57, 49, %x7 \n\t"
|
||||
"xxperm 58, 50, %x7 \n\t"
|
||||
"xxperm 59, 51, %x7 \n\t"
|
||||
|
||||
"xxperm 60, 52, %x7 \n\t"
|
||||
"xxperm 61, 53, %x7 \n\t"
|
||||
"xxperm 62, 54, %x7 \n\t"
|
||||
"xxperm 63, 55, %x7 \n\t"
|
||||
|
||||
"addi %2, %2, 128 \n\t"
|
||||
"addi %3, %3, 128 \n\t"
|
||||
|
||||
"addic. %1, %1, -16 \n\t"
|
||||
"ble two%= \n\t"
|
||||
|
||||
".align 5 \n"
|
||||
"one%=: \n\t"
|
||||
|
||||
"xvmaddasp 32, 40, 48 \n\t" // x0_r * y0_r , x0_i * y0_i
|
||||
"xvmaddasp 34, 41, 49 \n\t" // x1_r * y1_r , x1_i * y1_i
|
||||
"lxvp 48, 0(%3) \n\t"
|
||||
|
||||
"xvmaddasp 36, 42, 50 \n\t" // x2_r * y2_r , x2_i * y2_i
|
||||
"xvmaddasp 38, 43, 51 \n\t" // x3_r * y3_r , x3_i * y3_i
|
||||
"lxvp 50, 32(%3) \n\t"
|
||||
|
||||
"xvmaddasp 33, 40, 56 \n\t" // x0_r * y0_i , x0_i * y0_r
|
||||
"xvmaddasp 35, 41, 57 \n\t" // x1_r * y1_i , x1_i * y1_r
|
||||
"lxvp 40, 0(%2) \n\t"
|
||||
|
||||
"xvmaddasp 37, 42, 58 \n\t" // x2_r * y2_i , x2_i * y2_r
|
||||
"xvmaddasp 39, 43, 59 \n\t" // x3_r * y3_i , x3_i * y3_r
|
||||
"lxvp 42, 32(%2) \n\t"
|
||||
|
||||
"xxperm 56, 48, %x7 \n\t"
|
||||
"xxperm 57, 49, %x7 \n\t"
|
||||
"xxperm 58, 50, %x7 \n\t"
|
||||
"xxperm 59, 51, %x7 \n\t"
|
||||
|
||||
"xvmaddasp 32, 44, 52 \n\t" // x0_r * y0_r , x0_i * y0_i
|
||||
"xvmaddasp 34, 45, 53 \n\t" // x1_r * y1_r , x1_i * y1_i
|
||||
"lxvp 52, 64(%3) \n\t"
|
||||
|
||||
"xvmaddasp 36, 46, 54 \n\t" // x2_r * y2_r , x2_i * y2_i
|
||||
"xvmaddasp 38, 47, 55 \n\t" // x3_r * y3_r , x3_i * y3_i
|
||||
"lxvp 54, 96(%3) \n\t"
|
||||
|
||||
"xvmaddasp 33, 44, 60 \n\t" // x0_r * y0_i , x0_i * y0_r
|
||||
"xvmaddasp 35, 45, 61 \n\t" // x1_r * y1_i , x1_i * y1_r
|
||||
"lxvp 44, 64(%2) \n\t"
|
||||
"xvmaddasp 37, 46, 62 \n\t" // x2_r * y2_i , x2_i * y2_r
|
||||
"xvmaddasp 39, 47, 63 \n\t" // x3_r * y3_i , x3_i * y3_r
|
||||
"lxvp 46, 96(%2) \n\t"
|
||||
|
||||
"xxperm 60, 52, %x7 \n\t"
|
||||
"xxperm 61, 53, %x7 \n\t"
|
||||
"xxperm 62, 54, %x7 \n\t"
|
||||
"xxperm 63, 55, %x7 \n\t"
|
||||
|
||||
"addi %2, %2, 128 \n\t"
|
||||
"addi %3, %3, 128 \n\t"
|
||||
|
||||
"addic. %1, %1, -16 \n\t"
|
||||
"bgt one%= \n"
|
||||
|
||||
"two%=: \n\t"
|
||||
|
||||
"xvmaddasp 32, 40, 48 \n\t" // x0_r * y0_r , x0_i * y0_i
|
||||
"xvmaddasp 34, 41, 49 \n\t" // x1_r * y1_r , x1_i * y1_i
|
||||
"xvmaddasp 36, 42, 50 \n\t" // x2_r * y2_r , x2_i * y2_i
|
||||
"xvmaddasp 38, 43, 51 \n\t" // x3_r * y3_r , x3_i * y3_i
|
||||
|
||||
"xvmaddasp 33, 40, 56 \n\t" // x0_r * y0_i , x0_i * y0_r
|
||||
"xvmaddasp 35, 41, 57 \n\t" // x1_r * y1_i , x1_i * y1_r
|
||||
"xvmaddasp 37, 42, 58 \n\t" // x2_r * y2_i , x2_i * y2_r
|
||||
"xvmaddasp 39, 43, 59 \n\t" // x3_r * y3_i , x3_i * y3_r
|
||||
|
||||
"xvmaddasp 32, 44, 52 \n\t" // x0_r * y0_r , x0_i * y0_i
|
||||
"xvmaddasp 34, 45, 53 \n\t" // x1_r * y1_r , x1_i * y1_i
|
||||
"xvmaddasp 36, 46, 54 \n\t" // x2_r * y2_r , x2_i * y2_i
|
||||
"xvmaddasp 38, 47, 55 \n\t" // x3_r * y3_r , x3_i * y3_i
|
||||
|
||||
"xvmaddasp 33, 44, 60 \n\t" // x0_r * y0_i , x0_i * y0_r
|
||||
"xvmaddasp 35, 45, 61 \n\t" // x1_r * y1_i , x1_i * y1_r
|
||||
"xvmaddasp 37, 46, 62 \n\t" // x2_r * y2_i , x2_i * y2_r
|
||||
"xvmaddasp 39, 47, 63 \n\t" // x3_r * y3_i , x3_i * y3_r
|
||||
|
||||
"xvaddsp 32, 32, 34 \n\t"
|
||||
"xvaddsp 36, 36, 38 \n\t"
|
||||
|
||||
"xvaddsp 33, 33, 35 \n\t"
|
||||
"xvaddsp 37, 37, 39 \n\t"
|
||||
|
||||
"xvaddsp 35, 32, 36 \n\t"
|
||||
"xvaddsp 34, 33, 37 \n\t"
|
||||
"xxswapd 32, 35 \n\t"
|
||||
"xxswapd 33, 34 \n\t"
|
||||
"xvaddsp 35, 35, 32 \n\t"
|
||||
"xvaddsp 34, 34, 33 \n\t"
|
||||
"xxpermdi 34, 34, 35, 2 \n\t"
|
||||
"stxv 34, 0(%6) \n\t"
|
||||
|
||||
"#n=%1 x=%4=%2 y=%5=%3 dot=%0=%6"
|
||||
:
|
||||
"=m" (*dot),
|
||||
"+r" (n), // 1
|
||||
"+b" (x), // 2
|
||||
"+b" (y) // 3
|
||||
:
|
||||
"m" (*x),
|
||||
"m" (*y),
|
||||
"b" (dot), // 6
|
||||
"wa" (mask)
|
||||
:
|
||||
"cr0",
|
||||
"vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
|
||||
"vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
|
||||
"vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55",
|
||||
"vs56","vs57","vs58","vs59","vs60","vs61","vs62","vs63"
|
||||
);
|
||||
}
|
||||
@@ -62,38 +62,39 @@ static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y)
|
||||
"one%=: \n\t"
|
||||
|
||||
"stxvp 32, 0(%3) \n\t"
|
||||
"lxvp 32, 0(%2) \n\t"
|
||||
"stxvp 34, 32(%3) \n\t"
|
||||
"lxvp 34, 32(%2) \n\t"
|
||||
"stxvp 36, 64(%3) \n\t"
|
||||
"lxvp 36, 64(%2) \n\t"
|
||||
"stxvp 38, 96(%3) \n\t"
|
||||
"lxvp 32, 0(%2) \n\t"
|
||||
"lxvp 34, 32(%2) \n\t"
|
||||
"lxvp 36, 64(%2) \n\t"
|
||||
"lxvp 38, 96(%2) \n\t"
|
||||
|
||||
"stxvp 40, 128(%3) \n\t"
|
||||
"lxvp 40, 128(%2) \n\t"
|
||||
"stxvp 42, 160(%3) \n\t"
|
||||
"lxvp 42, 160(%2) \n\t"
|
||||
"stxvp 44, 192(%3) \n\t"
|
||||
"lxvp 44, 192(%2) \n\t"
|
||||
"stxvp 46, 224(%3) \n\t"
|
||||
"lxvp 40, 128(%2) \n\t"
|
||||
"lxvp 42, 160(%2) \n\t"
|
||||
"lxvp 44, 192(%2) \n\t"
|
||||
"lxvp 46, 224(%2) \n\t"
|
||||
|
||||
"stxvp 48, 256(%3) \n\t"
|
||||
"lxvp 48, 256(%2) \n\t"
|
||||
"stxvp 50, 288(%3) \n\t"
|
||||
"lxvp 50, 288(%2) \n\t"
|
||||
"stxvp 52, 320(%3) \n\t"
|
||||
"lxvp 52, 320(%2) \n\t"
|
||||
"stxvp 54, 352(%3) \n\t"
|
||||
"lxvp 48, 256(%2) \n\t"
|
||||
"lxvp 50, 288(%2) \n\t"
|
||||
"lxvp 52, 320(%2) \n\t"
|
||||
"lxvp 54, 352(%2) \n\t"
|
||||
|
||||
"stxvp 56, 384(%3) \n\t"
|
||||
"lxvp 56, 384(%2) \n\t"
|
||||
"stxvp 58, 416(%3) \n\t"
|
||||
"lxvp 58, 416(%2) \n\t"
|
||||
"stxvp 60, 448(%3) \n\t"
|
||||
"lxvp 60, 448(%2) \n\t"
|
||||
"stxvp 62, 480(%3) \n\t"
|
||||
"lxvp 56, 384(%2) \n\t"
|
||||
"lxvp 58, 416(%2) \n\t"
|
||||
"lxvp 60, 448(%2) \n\t"
|
||||
"lxvp 62, 480(%2) \n\t"
|
||||
|
||||
"addi %3, %3, 512 \n\t"
|
||||
|
||||
176
kernel/power/cscal_microk_power10.c
Normal file
176
kernel/power/cscal_microk_power10.c
Normal file
@@ -0,0 +1,176 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2021, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define HAVE_KERNEL_8 1
|
||||
|
||||
static void zscal_kernel_8 (long n, float *x, float alpha_r, float alpha_i)
|
||||
{
|
||||
__vector float t0 = {-alpha_i, alpha_i, -alpha_i, alpha_i};
|
||||
__vector unsigned char mask = { 11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4};
|
||||
__asm__
|
||||
(
|
||||
"dcbt 0, %2 \n\t"
|
||||
"xscvdpspn 32, %x3 \n\t"
|
||||
"xxspltw 32, 32, 0 \n\t"
|
||||
|
||||
"lxvp 40, 0(%2) \n\t"
|
||||
"lxvp 42, 32(%2) \n\t"
|
||||
"lxvp 44, 64(%2) \n\t"
|
||||
"lxvp 46, 96(%2) \n\t"
|
||||
|
||||
"addic. %1, %1, -16 \n\t"
|
||||
"ble two%= \n\t"
|
||||
|
||||
".align 5 \n"
|
||||
"one%=: \n\t"
|
||||
|
||||
"xvmulsp 48, 40, 32 \n\t" // x0_r * alpha_r, x0_i * alpha_r
|
||||
"xvmulsp 49, 41, 32 \n\t"
|
||||
"xvmulsp 50, 42, 32 \n\t"
|
||||
"xvmulsp 51, 43, 32 \n\t"
|
||||
"xvmulsp 52, 44, 32 \n\t"
|
||||
"xvmulsp 53, 45, 32 \n\t"
|
||||
"xvmulsp 54, 46, 32 \n\t"
|
||||
"xvmulsp 55, 47, 32 \n\t"
|
||||
|
||||
"xxperm 34, 40, %x5 \n\t"
|
||||
"xxperm 35, 41, %x5 \n\t"
|
||||
"xxperm 36, 42, %x5 \n\t"
|
||||
"xxperm 37, 43, %x5 \n\t"
|
||||
"xxperm 38, 44, %x5 \n\t"
|
||||
"xxperm 39, 45, %x5 \n\t"
|
||||
"xxperm 56, 46, %x5 \n\t"
|
||||
"xxperm 57, 47, %x5 \n\t"
|
||||
|
||||
"xvmulsp 34, 34, %x4 \n\t" // x0_i * -alpha_i, x0_r * alpha_i
|
||||
"xvmulsp 35, 35, %x4 \n\t"
|
||||
|
||||
"lxvp 40, 128(%2) \n\t"
|
||||
|
||||
"xvmulsp 36, 36, %x4 \n\t"
|
||||
"xvmulsp 37, 37, %x4 \n\t"
|
||||
|
||||
"lxvp 42, 160(%2) \n\t"
|
||||
|
||||
"xvmulsp 38, 38, %x4 \n\t"
|
||||
"xvmulsp 39, 39, %x4 \n\t"
|
||||
|
||||
"lxvp 44, 192(%2) \n\t"
|
||||
|
||||
"xvmulsp 56, 56, %x4 \n\t"
|
||||
"xvmulsp 57, 57, %x4 \n\t"
|
||||
|
||||
"lxvp 46, 224(%2) \n\t"
|
||||
|
||||
"xvaddsp 48, 48, 34 \n\t"
|
||||
"xvaddsp 49, 49, 35 \n\t"
|
||||
"xvaddsp 50, 50, 36 \n\t"
|
||||
"xvaddsp 51, 51, 37 \n\t"
|
||||
|
||||
"stxvp 48, 0(%2) \n\t"
|
||||
|
||||
"xvaddsp 52, 52, 38 \n\t"
|
||||
"xvaddsp 53, 53, 39 \n\t"
|
||||
|
||||
"stxvp 50, 32(%2) \n\t"
|
||||
|
||||
"xvaddsp 54, 54, 56 \n\t"
|
||||
"xvaddsp 55, 55, 57 \n\t"
|
||||
|
||||
"stxvp 52, 64(%2) \n\t"
|
||||
"stxvp 54, 96(%2) \n\t"
|
||||
|
||||
"addi %2, %2, 128 \n\t"
|
||||
|
||||
"addic. %1, %1, -16 \n\t"
|
||||
"bgt one%= \n"
|
||||
|
||||
"two%=: \n\t"
|
||||
|
||||
"xvmulsp 48, 40, 32 \n\t" // x0_r * alpha_r, x0_i * alpha_r
|
||||
"xvmulsp 49, 41, 32 \n\t"
|
||||
"xvmulsp 50, 42, 32 \n\t"
|
||||
"xvmulsp 51, 43, 32 \n\t"
|
||||
"xvmulsp 52, 44, 32 \n\t"
|
||||
"xvmulsp 53, 45, 32 \n\t"
|
||||
"xvmulsp 54, 46, 32 \n\t"
|
||||
"xvmulsp 55, 47, 32 \n\t"
|
||||
|
||||
"xxperm 34, 40, %x5 \n\t"
|
||||
"xxperm 35, 41, %x5 \n\t"
|
||||
"xxperm 36, 42, %x5 \n\t"
|
||||
"xxperm 37, 43, %x5 \n\t"
|
||||
"xxperm 38, 44, %x5 \n\t"
|
||||
"xxperm 39, 45, %x5 \n\t"
|
||||
"xxperm 56, 46, %x5 \n\t"
|
||||
"xxperm 57, 47, %x5 \n\t"
|
||||
|
||||
|
||||
"xvmulsp 34, 34, %x4 \n\t" // x0_i * -alpha_i, x0_r * alpha_i
|
||||
"xvmulsp 35, 35, %x4 \n\t"
|
||||
"xvmulsp 36, 36, %x4 \n\t"
|
||||
"xvmulsp 37, 37, %x4 \n\t"
|
||||
"xvmulsp 38, 38, %x4 \n\t"
|
||||
"xvmulsp 39, 39, %x4 \n\t"
|
||||
"xvmulsp 56, 56, %x4 \n\t"
|
||||
"xvmulsp 57, 57, %x4 \n\t"
|
||||
|
||||
"xvaddsp 48, 48, 34 \n\t"
|
||||
"xvaddsp 49, 49, 35 \n\t"
|
||||
"xvaddsp 50, 50, 36 \n\t"
|
||||
"xvaddsp 51, 51, 37 \n\t"
|
||||
|
||||
"stxvp 48, 0(%2) \n\t"
|
||||
|
||||
"xvaddsp 52, 52, 38 \n\t"
|
||||
"xvaddsp 53, 53, 39 \n\t"
|
||||
|
||||
"stxvp 50, 32(%2) \n\t"
|
||||
|
||||
"xvaddsp 54, 54, 56 \n\t"
|
||||
"xvaddsp 55, 55, 57 \n\t"
|
||||
|
||||
"stxvp 52, 64(%2) \n\t"
|
||||
"stxvp 54, 96(%2) \n\t"
|
||||
|
||||
"#n=%1 x=%0=%2 alpha=(%3,%4)\n"
|
||||
:
|
||||
"+m" (*x),
|
||||
"+r" (n), // 1
|
||||
"+b" (x) // 2
|
||||
:
|
||||
"f" (alpha_r), // 3
|
||||
"wa" (t0), // 4
|
||||
"wa" (mask) // 5
|
||||
:
|
||||
"cr0",
|
||||
"vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
|
||||
"vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
|
||||
"vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55",
|
||||
"vs56","vs57"
|
||||
);
|
||||
}
|
||||
@@ -36,9 +36,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#include "common.h"
|
||||
|
||||
|
||||
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||
#if defined(POWER8) || defined(POWER9)
|
||||
#include "cswap_microk_power8.c"
|
||||
#elif defined(POWER10)
|
||||
#include "cswap_microk_power10.c"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
127
kernel/power/cswap_microk_power10.c
Normal file
127
kernel/power/cswap_microk_power10.c
Normal file
@@ -0,0 +1,127 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2021, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
/*
 * Exchange the contents of two complex vectors x and y, 256 bytes of each
 * per loop iteration, using POWER10 paired vector loads.
 *
 *   n   element count in complex units.  One iteration moves 256 bytes per
 *       vector and subtracts 16 (DOUBLE) or 32 (single) from n, i.e. one
 *       complex element per count.  The loop body runs before n is tested,
 *       so at least one full block is always swapped; callers are expected
 *       to pass a positive multiple of the block size.
 *   x   first vector, overwritten with the data read from y.
 *   y   second vector, overwritten with the data read from x.
 */
#if defined(DOUBLE)
#define HAVE_KERNEL_16 1
static void zswap_kernel_16 (long n, double *x, double *y)
#else
#define HAVE_KERNEL_32 1
static void cswap_kernel_32 (long n, float *x, float *y)
#endif
{
  __asm__
    (
       ".align 5 \n"
     "one%=: \n\t"
       /* vs32..vs47 <- 256 bytes of y */
       "lxvp 32, 0(%4) \n\t"
       "lxvp 34, 32(%4) \n\t"
       "lxvp 36, 64(%4) \n\t"
       "lxvp 38, 96(%4) \n\t"

       "lxvp 40, 128(%4) \n\t"
       "lxvp 42, 160(%4) \n\t"
       "lxvp 44, 192(%4) \n\t"
       "lxvp 46, 224(%4) \n\t"

       /* vs48..vs63 <- 256 bytes of x */
       "lxvp 48, 0(%3) \n\t"
       "lxvp 50, 32(%3) \n\t"
       "lxvp 52, 64(%3) \n\t"
       "lxvp 54, 96(%3) \n\t"

       "lxvp 56, 128(%3) \n\t"
       "lxvp 58, 160(%3) \n\t"
       "lxvp 60, 192(%3) \n\t"
       "lxvp 62, 224(%3) \n\t"

       /*
        * lxvp places the lower-addressed 16 bytes in the odd register of
        * the pair on little-endian and in the even register on big-endian.
        * The individual stxv stores below must follow the same order, or
        * the two halves of every 32-byte chunk end up exchanged.
        */
#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
       /* old y -> x, first 128 bytes */
       "stxv 32, 0(%3) \n\t"
       "stxv 33, 16(%3) \n\t"
       "stxv 34, 32(%3) \n\t"
       "stxv 35, 48(%3) \n\t"
       "stxv 36, 64(%3) \n\t"
       "stxv 37, 80(%3) \n\t"
       "stxv 38, 96(%3) \n\t"
       "stxv 39, 112(%3) \n\t"
#else
       /* old y -> x, first 128 bytes */
       "stxv 33, 0(%3) \n\t"
       "stxv 32, 16(%3) \n\t"
       "stxv 35, 32(%3) \n\t"
       "stxv 34, 48(%3) \n\t"
       "stxv 37, 64(%3) \n\t"
       "stxv 36, 80(%3) \n\t"
       "stxv 39, 96(%3) \n\t"
       "stxv 38, 112(%3) \n\t"
#endif

       "addi %3, %3, 128 \n\t"

#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
       /* old y -> x, second 128 bytes */
       "stxv 40, 0(%3) \n\t"
       "stxv 41, 16(%3) \n\t"
       "stxv 42, 32(%3) \n\t"
       "stxv 43, 48(%3) \n\t"
       "stxv 44, 64(%3) \n\t"
       "stxv 45, 80(%3) \n\t"
       "stxv 46, 96(%3) \n\t"
       "stxv 47, 112(%3) \n\t"
#else
       /* old y -> x, second 128 bytes */
       "stxv 41, 0(%3) \n\t"
       "stxv 40, 16(%3) \n\t"
       "stxv 43, 32(%3) \n\t"
       "stxv 42, 48(%3) \n\t"
       "stxv 45, 64(%3) \n\t"
       "stxv 44, 80(%3) \n\t"
       "stxv 47, 96(%3) \n\t"
       "stxv 46, 112(%3) \n\t"
#endif

       "addi %3, %3, 128 \n\t"

#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
       /* old x -> y, first 128 bytes */
       "stxv 48, 0(%4) \n\t"
       "stxv 49, 16(%4) \n\t"
       "stxv 50, 32(%4) \n\t"
       "stxv 51, 48(%4) \n\t"
       "stxv 52, 64(%4) \n\t"
       "stxv 53, 80(%4) \n\t"
       "stxv 54, 96(%4) \n\t"
       "stxv 55, 112(%4) \n\t"
#else
       /* old x -> y, first 128 bytes */
       "stxv 49, 0(%4) \n\t"
       "stxv 48, 16(%4) \n\t"
       "stxv 51, 32(%4) \n\t"
       "stxv 50, 48(%4) \n\t"
       "stxv 53, 64(%4) \n\t"
       "stxv 52, 80(%4) \n\t"
       "stxv 55, 96(%4) \n\t"
       "stxv 54, 112(%4) \n\t"
#endif

       "addi %4, %4, 128 \n\t"

#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
       /* old x -> y, second 128 bytes */
       "stxv 56, 0(%4) \n\t"
       "stxv 57, 16(%4) \n\t"
       "stxv 58, 32(%4) \n\t"
       "stxv 59, 48(%4) \n\t"
       "stxv 60, 64(%4) \n\t"
       "stxv 61, 80(%4) \n\t"
       "stxv 62, 96(%4) \n\t"
       "stxv 63, 112(%4) \n\t"
#else
       /* old x -> y, second 128 bytes */
       "stxv 57, 0(%4) \n\t"
       "stxv 56, 16(%4) \n\t"
       "stxv 59, 32(%4) \n\t"
       "stxv 58, 48(%4) \n\t"
       "stxv 61, 64(%4) \n\t"
       "stxv 60, 80(%4) \n\t"
       "stxv 63, 96(%4) \n\t"
       "stxv 62, 112(%4) \n\t"
#endif

       "addi %4, %4, 128 \n\t"

#if defined(DOUBLE)
       "addic. %2, %2, -16 \n\t"
#else
       "addic. %2, %2, -32 \n\t"
#endif
       "bgt one%= \n"

     "#n=%2 x=%0=%3 y=%1=%4"
     :
       "+m" (*x),
       "+m" (*y),
       "+r" (n),	// 2
       "+b" (x),	// 3
       "+b" (y)		// 4
     :
     :
       "cr0",
       /* Both vectors are read and written in full, not only *x and *y. */
       "memory",
       "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
       "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
       "vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55",
       "vs56","vs57","vs58","vs59","vs60","vs61","vs62","vs63"
     );
}
|
||||
@@ -46,9 +46,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||
#if defined(POWER8) || defined(POWER9)
|
||||
#include "dasum_microk_power8.c"
|
||||
#elif defined(POWER10)
|
||||
#include "dasum_microk_power10.c"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
@@ -110,6 +112,21 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||
if ( inc_x == 1 )
|
||||
{
|
||||
|
||||
#if defined(POWER10)
|
||||
if ( n >= 16 )
|
||||
{
|
||||
BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3;
|
||||
for (i = 0; i < align; i++) {
|
||||
sumf += ABS(x[i]);
|
||||
}
|
||||
}
|
||||
n1 = (n-i) & -16;
|
||||
if ( n1 > 0 )
|
||||
{
|
||||
sumf += dasum_kernel_16(n1, &x[i]);
|
||||
i+=n1;
|
||||
}
|
||||
#else
|
||||
n1 = n & -16;
|
||||
if ( n1 > 0 )
|
||||
{
|
||||
@@ -117,6 +134,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||
sumf = dasum_kernel_16(n1, x);
|
||||
i=n1;
|
||||
}
|
||||
#endif
|
||||
|
||||
while(i < n)
|
||||
{
|
||||
|
||||
152
kernel/power/dasum_microk_power10.c
Normal file
152
kernel/power/dasum_microk_power10.c
Normal file
@@ -0,0 +1,152 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2021, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define HAVE_KERNEL_16 1
|
||||
|
||||
static double dasum_kernel_16 (long n, double *x)
|
||||
{
|
||||
double sum;
|
||||
__vector double t0;
|
||||
__vector double t1;
|
||||
__vector double t2;
|
||||
__vector double t3;
|
||||
|
||||
__asm__
|
||||
(
|
||||
"dcbt 0, %2 \n\t"
|
||||
|
||||
"xxlxor 32, 32, 32 \n\t"
|
||||
"xxlxor 33, 33, 33 \n\t"
|
||||
"xxlxor 34, 34, 34 \n\t"
|
||||
"xxlxor 35, 35, 35 \n\t"
|
||||
"xxlxor 36, 36, 36 \n\t"
|
||||
"xxlxor 37, 37, 37 \n\t"
|
||||
"xxlxor 38, 38, 38 \n\t"
|
||||
"xxlxor 39, 39, 39 \n\t"
|
||||
|
||||
"lxvp 40, 0(%2) \n\t"
|
||||
"lxvp 42, 32(%2) \n\t"
|
||||
"lxvp 44, 64(%2) \n\t"
|
||||
"lxvp 46, 96(%2) \n\t"
|
||||
|
||||
"addi %2, %2, 128 \n\t"
|
||||
|
||||
"addic. %1, %1, -16 \n\t"
|
||||
"ble two%= \n\t"
|
||||
|
||||
".align 5 \n"
|
||||
"one%=: \n\t"
|
||||
|
||||
"xvabsdp 48, 40 \n\t"
|
||||
"xvabsdp 49, 41 \n\t"
|
||||
"xvabsdp 50, 42 \n\t"
|
||||
"xvabsdp 51, 43 \n\t"
|
||||
"lxvp 40, 0(%2) \n\t"
|
||||
|
||||
|
||||
"xvabsdp %x3, 44 \n\t"
|
||||
"xvabsdp %x4, 45 \n\t"
|
||||
"lxvp 42, 32(%2) \n\t"
|
||||
|
||||
|
||||
"xvabsdp %x5, 46 \n\t"
|
||||
"xvabsdp %x6, 47 \n\t"
|
||||
"lxvp 44, 64(%2) \n\t"
|
||||
|
||||
|
||||
"xvadddp 32, 32, 48 \n\t"
|
||||
"xvadddp 33, 33, 49 \n\t"
|
||||
|
||||
"lxvp 46, 96(%2) \n\t"
|
||||
|
||||
"xvadddp 34, 34, 50 \n\t"
|
||||
"xvadddp 35, 35, 51 \n\t"
|
||||
"addi %2, %2, 128 \n\t"
|
||||
"xvadddp 36, 36, %x3 \n\t"
|
||||
"xvadddp 37, 37, %x4 \n\t"
|
||||
"addic. %1, %1, -16 \n\t"
|
||||
"xvadddp 38, 38, %x5 \n\t"
|
||||
"xvadddp 39, 39, %x6 \n\t"
|
||||
|
||||
"bgt one%= \n"
|
||||
|
||||
"two%=: \n\t"
|
||||
|
||||
"xvabsdp 48, 40 \n\t"
|
||||
"xvabsdp 49, 41 \n\t"
|
||||
"xvabsdp 50, 42 \n\t"
|
||||
"xvabsdp 51, 43 \n\t"
|
||||
"xvabsdp %x3, 44 \n\t"
|
||||
"xvabsdp %x4, 45 \n\t"
|
||||
"xvabsdp %x5, 46 \n\t"
|
||||
"xvabsdp %x6, 47 \n\t"
|
||||
|
||||
"xvadddp 32, 32, 48 \n\t"
|
||||
"xvadddp 33, 33, 49 \n\t"
|
||||
"xvadddp 34, 34, 50 \n\t"
|
||||
"xvadddp 35, 35, 51 \n\t"
|
||||
"xvadddp 36, 36, %x3 \n\t"
|
||||
"xvadddp 37, 37, %x4 \n\t"
|
||||
"xvadddp 38, 38, %x5 \n\t"
|
||||
"xvadddp 39, 39, %x6 \n\t"
|
||||
|
||||
"xvadddp 32, 32, 33 \n\t"
|
||||
"xvadddp 34, 34, 35 \n\t"
|
||||
"xvadddp 36, 36, 37 \n\t"
|
||||
"xvadddp 38, 38, 39 \n\t"
|
||||
|
||||
"xvadddp 32, 32, 34 \n\t"
|
||||
"xvadddp 36, 36, 38 \n\t"
|
||||
|
||||
"xvadddp 32, 32, 36 \n\t"
|
||||
|
||||
XXSWAPD_S(33,32)
|
||||
"xsadddp %x0, 32, 33 \n"
|
||||
|
||||
"#n=%1 x=%3=%2 sum=%0\n"
|
||||
"#t0=%x3 t1=%x4 t2=%x5 t3=%x6"
|
||||
:
|
||||
"=d" (sum), // 0
|
||||
"+r" (n), // 1
|
||||
"+b" (x), // 2
|
||||
"=wa" (t0), // 3
|
||||
"=wa" (t1), // 4
|
||||
"=wa" (t2), // 5
|
||||
"=wa" (t3) // 6
|
||||
:
|
||||
"m" (*x)
|
||||
:
|
||||
"cr0",
|
||||
"vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
|
||||
"vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
|
||||
"vs48","vs49","vs50","vs51"
|
||||
);
|
||||
|
||||
return sum;
|
||||
}
|
||||
|
||||
|
||||
@@ -66,12 +66,19 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
|
||||
if ( (inc_x == 1) && (inc_y == 1) )
|
||||
{
|
||||
|
||||
BLASLONG n1 = n & -16;
|
||||
if ( n >= 16 )
|
||||
{
|
||||
BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 3) & 0x3;
|
||||
for (i = 0; i < align; i++) {
|
||||
y[i] += da * x[i] ;
|
||||
}
|
||||
}
|
||||
BLASLONG n1 = (n-i) & -16;
|
||||
if ( n1 )
|
||||
daxpy_kernel_8(n1, &x[i], &y[i], da);
|
||||
|
||||
if ( n1 )
|
||||
daxpy_kernel_8(n1, x, y, da);
|
||||
i += n1;
|
||||
|
||||
i = n1;
|
||||
while(i < n)
|
||||
{
|
||||
|
||||
|
||||
@@ -85,12 +85,18 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
||||
|
||||
if ( (inc_x == 1) && (inc_y == 1 ))
|
||||
{
|
||||
|
||||
BLASLONG n1 = n & -64;
|
||||
if ( n1 > 0 )
|
||||
if ( n >= 64 )
|
||||
{
|
||||
copy_kernel(n1, x, y);
|
||||
i=n1;
|
||||
BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 3) & 0x3;
|
||||
for (i = 0; i < align; i++) {
|
||||
y[i] = x[i] ;
|
||||
}
|
||||
}
|
||||
BLASLONG n1 = (n-i) & -64;
|
||||
if ( n1 )
|
||||
{
|
||||
copy_kernel(n1, &x[i], &y[i]);
|
||||
i += n1;
|
||||
}
|
||||
|
||||
while(i < n)
|
||||
|
||||
@@ -29,7 +29,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
typedef __vector unsigned char vec_t;
|
||||
typedef FLOAT v4sf_t __attribute__ ((vector_size (16)));
|
||||
typedef FLOAT v2sf_t __attribute__ ((vector_size (8)));
|
||||
#if !__has_builtin(__builtin_vsx_assemble_pair)
|
||||
#define __builtin_vsx_assemble_pair __builtin_mma_assemble_pair
|
||||
#endif
|
||||
|
||||
#if !__has_builtin(__builtin_vsx_disassemble_pair)
|
||||
#define __builtin_vsx_disassemble_pair __builtin_mma_disassemble_pair
|
||||
#endif
|
||||
|
||||
#ifdef TRMMKERNEL
|
||||
#define SAVE_ACC(ACC, J) \
|
||||
@@ -186,8 +192,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
|
||||
vec_t *rowA = (vec_t *) & AO[0];
|
||||
vec_t *rb = (vec_t *) & BO[0];
|
||||
__vector_pair rowB, rowB1;
|
||||
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
|
||||
__builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]);
|
||||
__builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
|
||||
__builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]);
|
||||
__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
|
||||
__builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]);
|
||||
__builtin_mma_xvf64ger (&acc2, rowB, rowA[1]);
|
||||
@@ -200,8 +206,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
|
||||
{
|
||||
rowA = (vec_t *) & AO[l << 3];
|
||||
rb = (vec_t *) & BO[l << 3];
|
||||
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
|
||||
__builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]);
|
||||
__builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
|
||||
__builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]);
|
||||
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
|
||||
__builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]);
|
||||
__builtin_mma_xvf64gerpp (&acc2, rowB, rowA[1]);
|
||||
@@ -242,8 +248,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
|
||||
vec_t *rowA = (vec_t *) & AO[0];
|
||||
__vector_pair rowB, rowB1;
|
||||
vec_t *rb = (vec_t *) & BO[0];
|
||||
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
|
||||
__builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]);
|
||||
__builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
|
||||
__builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]);
|
||||
__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
|
||||
__builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]);
|
||||
__builtin_mma_xvf64ger (&acc2, rowB, rowA[1]);
|
||||
@@ -252,8 +258,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
|
||||
{
|
||||
rowA = (vec_t *) & AO[l << 2];
|
||||
rb = (vec_t *) & BO[l << 3];
|
||||
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
|
||||
__builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]);
|
||||
__builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
|
||||
__builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]);
|
||||
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
|
||||
__builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]);
|
||||
__builtin_mma_xvf64gerpp (&acc2, rowB, rowA[1]);
|
||||
@@ -286,16 +292,16 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
|
||||
vec_t *rowA = (vec_t *) & AO[0];
|
||||
__vector_pair rowB, rowB1;
|
||||
vec_t *rb = (vec_t *) & BO[0];
|
||||
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
|
||||
__builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]);
|
||||
__builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
|
||||
__builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]);
|
||||
__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
|
||||
__builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]);
|
||||
for (l = 1; l < temp; l++)
|
||||
{
|
||||
rowA = (vec_t *) & AO[l << 1];
|
||||
rb = (vec_t *) & BO[l << 3];
|
||||
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
|
||||
__builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]);
|
||||
__builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
|
||||
__builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]);
|
||||
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
|
||||
__builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]);
|
||||
}
|
||||
@@ -398,7 +404,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
|
||||
vec_t *rowA = (vec_t *) & AO[0];
|
||||
__vector_pair rowB;
|
||||
vec_t *rb = (vec_t *) & BO[0];
|
||||
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
|
||||
__builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
|
||||
__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
|
||||
__builtin_mma_xvf64ger (&acc1, rowB, rowA[1]);
|
||||
__builtin_mma_xvf64ger (&acc2, rowB, rowA[2]);
|
||||
@@ -407,7 +413,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
|
||||
{
|
||||
rowA = (vec_t *) & AO[l << 3];
|
||||
rb = (vec_t *) & BO[l << 2];
|
||||
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
|
||||
__builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
|
||||
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
|
||||
__builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
|
||||
__builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]);
|
||||
@@ -440,14 +446,14 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
|
||||
vec_t *rowA = (vec_t *) & AO[0];
|
||||
__vector_pair rowB;
|
||||
vec_t *rb = (vec_t *) & BO[0];
|
||||
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
|
||||
__builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
|
||||
__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
|
||||
__builtin_mma_xvf64ger (&acc1, rowB, rowA[1]);
|
||||
for (l = 1; l < temp; l++)
|
||||
{
|
||||
rowA = (vec_t *) & AO[l << 2];
|
||||
rb = (vec_t *) & BO[l << 2];
|
||||
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
|
||||
__builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
|
||||
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
|
||||
__builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
|
||||
}
|
||||
@@ -476,13 +482,13 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
|
||||
vec_t *rowA = (vec_t *) & AO[0];
|
||||
__vector_pair rowB;
|
||||
vec_t *rb = (vec_t *) & BO[0];
|
||||
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
|
||||
__builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
|
||||
__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
|
||||
for (l = 1; l < temp; l++)
|
||||
{
|
||||
rowA = (vec_t *) & AO[l << 1];
|
||||
rb = (vec_t *) & BO[l << 2];
|
||||
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
|
||||
__builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
|
||||
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
|
||||
}
|
||||
SAVE_ACC (&acc0, 0);
|
||||
@@ -562,11 +568,9 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
|
||||
v4sf_t result[4];
|
||||
__vector_quad acc0, acc1, acc2, acc3;
|
||||
BLASLONG l = 0;
|
||||
FLOAT t[4] = { 0, 0, 0, 0 };
|
||||
t[0] = BO[0], t[1] = BO[1];
|
||||
__vector_pair rowB;
|
||||
vec_t *rb = (vec_t *) & t[0];
|
||||
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
|
||||
vec_t *rb = (vec_t *) & BO[0];
|
||||
__builtin_vsx_assemble_pair (&rowB, rb[0], rb[0]);
|
||||
vec_t *rowA = (vec_t *) & AO[0];
|
||||
__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
|
||||
__builtin_mma_xvf64ger (&acc1, rowB, rowA[1]);
|
||||
@@ -574,9 +578,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
|
||||
__builtin_mma_xvf64ger (&acc3, rowB, rowA[3]);
|
||||
for (l = 1; l < temp; l++)
|
||||
{
|
||||
t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1];
|
||||
rb = (vec_t *) & t[0];
|
||||
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
|
||||
rb = (vec_t *) & BO[l << 1];
|
||||
__builtin_vsx_assemble_pair (&rowB, rb[0], rb[0]);
|
||||
rowA = (vec_t *) & AO[l << 3];
|
||||
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
|
||||
__builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
|
||||
@@ -607,19 +610,16 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
|
||||
v4sf_t result[4];
|
||||
__vector_quad acc0, acc1;
|
||||
BLASLONG l = 0;
|
||||
FLOAT t[4] = { 0, 0, 0, 0 };
|
||||
t[0] = BO[0], t[1] = BO[1];
|
||||
__vector_pair rowB;
|
||||
vec_t *rb = (vec_t *) & t[0];
|
||||
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
|
||||
vec_t *rb = (vec_t *) & BO[0];
|
||||
__builtin_vsx_assemble_pair (&rowB, rb[0], rb[0]);
|
||||
vec_t *rowA = (vec_t *) & AO[0];
|
||||
__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
|
||||
__builtin_mma_xvf64ger (&acc1, rowB, rowA[1]);
|
||||
for (l = 1; l < temp; l++)
|
||||
{
|
||||
t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1];
|
||||
rb = (vec_t *) & t[0];
|
||||
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
|
||||
rb = (vec_t *) & BO[l << 1];
|
||||
__builtin_vsx_assemble_pair (&rowB, rb[0], rb[0]);
|
||||
rowA = (vec_t *) & AO[l << 2];
|
||||
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
|
||||
__builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
|
||||
@@ -646,18 +646,15 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
|
||||
v4sf_t result[4];
|
||||
__vector_quad acc0;
|
||||
BLASLONG l = 0;
|
||||
FLOAT t[4] = { 0, 0, 0, 0 };
|
||||
t[0] = BO[0], t[1] = BO[1];
|
||||
__vector_pair rowB;
|
||||
vec_t *rb = (vec_t *) & t[0];
|
||||
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
|
||||
vec_t *rb = (vec_t *) & BO[0];
|
||||
__builtin_vsx_assemble_pair (&rowB, rb[0], rb[0]);
|
||||
vec_t *rowA = (vec_t *) & AO[0];
|
||||
__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
|
||||
for (l = 1; l < temp; l++)
|
||||
{
|
||||
t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1];
|
||||
rb = (vec_t *) & t[0];
|
||||
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
|
||||
rb = (vec_t *) & BO[l << 1];
|
||||
__builtin_vsx_assemble_pair (&rowB, rb[0], rb[0]);
|
||||
rowA = (vec_t *) & AO[l << 1];
|
||||
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
|
||||
}
|
||||
|
||||
@@ -39,9 +39,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#pragma GCC optimize "O1"
|
||||
|
||||
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||
#if defined(POWER8) || defined(POWER9)
|
||||
#include "drot_microk_power8.c"
|
||||
#elif defined(POWER10)
|
||||
#include "drot_microk_power10.c"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
@@ -115,12 +117,30 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
|
||||
if ( (inc_x == 1) && (inc_y == 1) )
|
||||
{
|
||||
|
||||
#if defined(POWER10)
|
||||
if ( n >= 16 )
|
||||
{
|
||||
BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 3) & 0x3;
|
||||
for (i = 0; i < align; i++) {
|
||||
temp = c*x[i] + s*y[i] ;
|
||||
y[i] = c*y[i] - s*x[i] ;
|
||||
x[i] = temp ;
|
||||
}
|
||||
}
|
||||
BLASLONG n1 = (n-i) & -16;
|
||||
if ( n1 > 0 )
|
||||
{
|
||||
drot_kernel_16(n1,&x[i], &y[i], c, s);
|
||||
i+=n1;
|
||||
}
|
||||
#else
|
||||
BLASLONG n1 = n & -16;
|
||||
if ( n1 > 0 )
|
||||
{
|
||||
drot_kernel_16(n1, x1, y1, c, s);
|
||||
i=n1;
|
||||
}
|
||||
#endif
|
||||
|
||||
while(i < n)
|
||||
{
|
||||
|
||||
148
kernel/power/drot_microk_power10.c
Normal file
148
kernel/power/drot_microk_power10.c
Normal file
@@ -0,0 +1,148 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2021, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define HAVE_KERNEL_16 1
|
||||
|
||||
static void drot_kernel_16 (long n, double *x, double *y, double c, double s)
|
||||
{
|
||||
__asm__
|
||||
(
|
||||
XXSPLTD_S(36,%x5,0) // load c to both dwords
|
||||
XXSPLTD_S(37,%x6,0) // load s to both dwords
|
||||
"lxvp 32, 0(%3) \n\t" // load x
|
||||
"lxvp 34, 32(%3) \n\t"
|
||||
"lxvp 48, 0(%4) \n\t" // load y
|
||||
"lxvp 50, 32(%4) \n\t"
|
||||
|
||||
"addic. %2, %2, -8 \n\t"
|
||||
"ble two%= \n\t"
|
||||
|
||||
".align 5 \n"
|
||||
"one%=: \n\t"
|
||||
|
||||
"xvmuldp 40, 32, 36 \n\t" // c * x
|
||||
"xvmuldp 41, 33, 36 \n\t"
|
||||
"xvmuldp 42, 34, 36 \n\t"
|
||||
"xvmuldp 43, 35, 36 \n\t"
|
||||
|
||||
"xvmuldp 44, 32, 37 \n\t" // s * x
|
||||
"xvmuldp 45, 33, 37 \n\t"
|
||||
"xvmuldp 46, 34, 37 \n\t"
|
||||
"xvmuldp 47, 35, 37 \n\t"
|
||||
|
||||
"lxvp 32, 64(%3) \n\t" // load x
|
||||
"lxvp 34, 96(%3) \n\t"
|
||||
"xvmuldp 52, 48, 36 \n\t" // c * y
|
||||
"xvmuldp 53, 49, 36 \n\t"
|
||||
"xvmuldp 54, 50, 36 \n\t"
|
||||
"xvmuldp 55, 51, 36 \n\t"
|
||||
|
||||
"xvmuldp 38, 48, 37 \n\t" // s * y
|
||||
"xvmuldp 39, 49, 37 \n\t"
|
||||
"xvmuldp 56, 50, 37 \n\t"
|
||||
"xvmuldp 57, 51, 37 \n\t"
|
||||
|
||||
"lxvp 48, 64(%4) \n\t" // load y
|
||||
"lxvp 50, 96(%4) \n\t"
|
||||
|
||||
"xvadddp 40, 40, 38 \n\t" // c * x + s * y
|
||||
"xvadddp 41, 41, 39 \n\t" // c * x + s * y
|
||||
"xvadddp 42, 42, 56 \n\t" // c * x + s * y
|
||||
"xvadddp 43, 43, 57 \n\t" // c * x + s * y
|
||||
|
||||
"stxvp 40, 0(%3) \n\t" // store x
|
||||
"stxvp 42, 32(%3) \n\t"
|
||||
|
||||
"xvsubdp 52, 52, 44 \n\t" // c * y - s * x
|
||||
"xvsubdp 53, 53, 45 \n\t" // c * y - s * x
|
||||
"xvsubdp 54, 54, 46 \n\t" // c * y - s * x
|
||||
"xvsubdp 55, 55, 47 \n\t" // c * y - s * x
|
||||
|
||||
"stxvp 52, 0(%4) \n\t" // store y
|
||||
"stxvp 54, 32(%4) \n\t"
|
||||
|
||||
"addi %3, %3, 64 \n\t"
|
||||
"addi %4, %4, 64 \n\t"
|
||||
|
||||
"addic. %2, %2, -8 \n\t"
|
||||
"bgt one%= \n"
|
||||
|
||||
"two%=: \n\t"
|
||||
|
||||
"xvmuldp 40, 32, 36 \n\t" // c * x
|
||||
"xvmuldp 41, 33, 36 \n\t"
|
||||
"xvmuldp 42, 34, 36 \n\t"
|
||||
"xvmuldp 43, 35, 36 \n\t"
|
||||
|
||||
"xvmuldp 52, 48, 36 \n\t" // c * y
|
||||
"xvmuldp 53, 49, 36 \n\t"
|
||||
"xvmuldp 54, 50, 36 \n\t"
|
||||
"xvmuldp 55, 51, 36 \n\t"
|
||||
|
||||
"xvmuldp 44, 32, 37 \n\t" // s * x
|
||||
"xvmuldp 45, 33, 37 \n\t"
|
||||
"xvmuldp 46, 34, 37 \n\t"
|
||||
"xvmuldp 47, 35, 37 \n\t"
|
||||
|
||||
"xvmuldp 38, 48, 37 \n\t" // s * y
|
||||
"xvmuldp 39, 49, 37 \n\t"
|
||||
"xvmuldp 56, 50, 37 \n\t"
|
||||
"xvmuldp 57, 51, 37 \n\t"
|
||||
|
||||
"xvadddp 40, 40, 38 \n\t" // c * x + s * y
|
||||
"xvadddp 41, 41, 39 \n\t" // c * x + s * y
|
||||
"xvadddp 42, 42, 56 \n\t" // c * x + s * y
|
||||
"xvadddp 43, 43, 57 \n\t" // c * x + s * y
|
||||
|
||||
"stxvp 40, 0(%3) \n\t" // store x
|
||||
"stxvp 42, 32(%3) \n\t"
|
||||
"xvsubdp 52, 52, 44 \n\t" // c * y - s * x
|
||||
"xvsubdp 53, 53, 45 \n\t" // c * y - s * x
|
||||
"xvsubdp 54, 54, 46 \n\t" // c * y - s * x
|
||||
"xvsubdp 55, 55, 47 \n\t" // c * y - s * x
|
||||
|
||||
"stxvp 52, 0(%4) \n\t" // store y
|
||||
"stxvp 54, 32(%4) \n\t"
|
||||
|
||||
"#n=%2 x=%0=%3 y=%1=%4 c=%5 s=%6\n"
|
||||
:
|
||||
"+m" (*x),
|
||||
"+m" (*y),
|
||||
"+r" (n), // 2
|
||||
"+b" (x), // 3
|
||||
"+b" (y) // 4
|
||||
:
|
||||
"d" (c), // 5
|
||||
"d" (s) // 6
|
||||
:
|
||||
"cr0",
|
||||
"vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
|
||||
"vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
|
||||
"vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55",
|
||||
"vs56","vs57"
|
||||
);
|
||||
}
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user