Compare commits
199 Commits
revert-146
...
revert-160
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
de8fff671d | ||
|
|
6f71c0fce4 | ||
|
|
3313e4b946 | ||
|
|
e9cd11768c | ||
|
|
0297b3211a | ||
|
|
66316b9f4c | ||
|
|
6adc4b7b36 | ||
|
|
2ade0ef085 | ||
|
|
e8880c1699 | ||
|
|
ed7c4a043b | ||
|
|
cf234a0561 | ||
|
|
ae2a33128b | ||
|
|
e4718b1fee | ||
|
|
9b87b64262 | ||
|
|
0218b884c1 | ||
|
|
83da278093 | ||
|
|
358d4df2bd | ||
|
|
06d43760e4 | ||
|
|
a4af8861ff | ||
|
|
7fb62aed7e | ||
|
|
f6021c798d | ||
|
|
e8002536ec | ||
|
|
ce6317f6c0 | ||
|
|
15a78d6b66 | ||
|
|
354a976a59 | ||
|
|
38ad05bd04 | ||
|
|
b7feded85a | ||
|
|
dc9fe05ab5 | ||
|
|
8be027e4c6 | ||
|
|
ac7b6e3e9a | ||
|
|
fc66a0ec0b | ||
|
|
89372e0993 | ||
|
|
ef626c6824 | ||
|
|
83fec56a3f | ||
|
|
5a51cf4576 | ||
|
|
5a92b311e0 | ||
|
|
a7d0f49cec | ||
|
|
f1fb9a4745 | ||
|
|
0023515733 | ||
|
|
99c7bba8e4 | ||
|
|
36c4523d85 | ||
|
|
a8002e283a | ||
|
|
401adddb2b | ||
|
|
c5b13d4e10 | ||
|
|
677e42d7b0 | ||
|
|
e2a8c35e5a | ||
|
|
1a49fb1c05 | ||
|
|
8562d5787a | ||
|
|
93f1eb09c3 | ||
|
|
c90bbda3df | ||
|
|
7df8c4f76f | ||
|
|
2fc748bf72 | ||
|
|
a91f1587b9 | ||
|
|
d1b7be14aa | ||
|
|
b491b10057 | ||
|
|
5fae96fb70 | ||
|
|
a7dbd4c57d | ||
|
|
2cae104b5e | ||
|
|
908d40be71 | ||
|
|
43e592ceb3 | ||
|
|
f0f27868d8 | ||
|
|
961d25e9c7 | ||
|
|
f5959f2543 | ||
|
|
82012b960b | ||
|
|
8dd3515fa2 | ||
|
|
95f7f0229c | ||
|
|
5082fe4306 | ||
|
|
7a7619af6d | ||
|
|
9a400b7014 | ||
|
|
893b535540 | ||
|
|
6791294312 | ||
|
|
ddb8b124de | ||
|
|
191746c493 | ||
|
|
eb9b021d38 | ||
|
|
7d7564568c | ||
|
|
a07843bc93 | ||
|
|
41ae8e8d67 | ||
|
|
9c1aa0b0fe | ||
|
|
53457f222f | ||
|
|
458e3af5b1 | ||
|
|
3716267124 | ||
|
|
50acc40613 | ||
|
|
c720f1f019 | ||
|
|
d7d950fcf2 | ||
|
|
12398e53ce | ||
|
|
193f835662 | ||
|
|
7e3151ead7 | ||
|
|
e3a069f108 | ||
|
|
6fff8c626a | ||
|
|
d2b9389f1b | ||
|
|
65b8a5c5d8 | ||
|
|
9795adc7ef | ||
|
|
1a8e487c4a | ||
|
|
5966fd52a2 | ||
|
|
dbafe6357b | ||
|
|
71051259e0 | ||
|
|
73cc321190 | ||
|
|
018f2dad27 | ||
|
|
9d5098dbc9 | ||
|
|
d94d7baf7e | ||
|
|
3af1b5c805 | ||
|
|
88e224f4c0 | ||
|
|
d0c0506588 | ||
|
|
e93355e5e1 | ||
|
|
c1eb06e102 | ||
|
|
8145ecd70b | ||
|
|
26ce518d46 | ||
|
|
1d27fa8507 | ||
|
|
802cf6b22d | ||
|
|
894433a7c7 | ||
|
|
1b83341d19 | ||
|
|
954f1832de | ||
|
|
941ad280a8 | ||
|
|
a8ed428bab | ||
|
|
1da365312a | ||
|
|
2d0929fa7c | ||
|
|
125343cc88 | ||
|
|
8a3b6fa108 | ||
|
|
78694f1b7e | ||
|
|
9c5518319a | ||
|
|
86f49c529d | ||
|
|
625c74a38f | ||
|
|
5fcaca6438 | ||
|
|
4fcdd24459 | ||
|
|
68a3c4fca6 | ||
|
|
0c4718c57a | ||
|
|
f29389c7ac | ||
|
|
734d7c6a93 | ||
|
|
7c861605b2 | ||
|
|
2ca0faf495 | ||
|
|
0fe434598b | ||
|
|
15c437e092 | ||
|
|
b966bd79d5 | ||
|
|
2e988dbf35 | ||
|
|
be6090d396 | ||
|
|
daae8fd197 | ||
|
|
20c6c38e51 | ||
|
|
a1fb7670f7 | ||
|
|
6c99c97489 | ||
|
|
6a0930560e | ||
|
|
24f8d5b624 | ||
|
|
77b4dbd53b | ||
|
|
bc4c3bca01 | ||
|
|
6b0a9d135c | ||
|
|
137ccd9dd9 | ||
|
|
84923dedb7 | ||
|
|
8ec28ff461 | ||
|
|
ca8ca796d3 | ||
|
|
8f811a9312 | ||
|
|
36a17536ca | ||
|
|
bb9876db33 | ||
|
|
d636b418af | ||
|
|
a460c92577 | ||
|
|
33f838393c | ||
|
|
a41d241a0e | ||
|
|
8da6b6ae52 | ||
|
|
01c4b82f04 | ||
|
|
93db123f7e | ||
|
|
752fdb5dd8 | ||
|
|
07ed01e97f | ||
|
|
35c5a32309 | ||
|
|
c7b55b6082 | ||
|
|
840e01061f | ||
|
|
28ca97015d | ||
|
|
73c5ca74fa | ||
|
|
e453555d97 | ||
|
|
6a6ffaff1e | ||
|
|
28ac9ea5a6 | ||
|
|
a55694dd5b | ||
|
|
85a41e9cdb | ||
|
|
40160ff3c1 | ||
|
|
6a99fcce94 | ||
|
|
2c7392f07b | ||
|
|
81215711a2 | ||
|
|
809fd0d451 | ||
|
|
72e65157df | ||
|
|
69a8aa6de2 | ||
|
|
0ab5bf1746 | ||
|
|
22167170b3 | ||
|
|
69d9f36ff4 | ||
|
|
f81815e48a | ||
|
|
5f855d965d | ||
|
|
fa9ca65c0e | ||
|
|
719b68f077 | ||
|
|
fe9f15f2d8 | ||
|
|
497f0c3d8a | ||
|
|
ea37db828e | ||
|
|
e6a0a3de73 | ||
|
|
6e70287776 | ||
|
|
58f236ad73 | ||
|
|
e207107150 | ||
|
|
c9d408064a | ||
|
|
288d1a3f6e | ||
|
|
7c1925acec | ||
|
|
2359c7c1a9 | ||
|
|
7646974227 | ||
|
|
e3a80e6aa8 | ||
|
|
2c0a008281 | ||
|
|
c5425daa6b |
23
.travis.yml
23
.travis.yml
@@ -7,6 +7,7 @@ language: c
|
||||
jobs:
|
||||
include:
|
||||
- &test-ubuntu
|
||||
os: linux
|
||||
stage: test
|
||||
compiler: gcc
|
||||
addons:
|
||||
@@ -57,7 +58,8 @@ jobs:
|
||||
- TARGET_BOX=LINUX32
|
||||
- BTYPE="BINARY=32"
|
||||
|
||||
- stage: test
|
||||
- os: linux
|
||||
stage: test
|
||||
compiler: gcc
|
||||
addons:
|
||||
apt:
|
||||
@@ -77,6 +79,7 @@ jobs:
|
||||
# which is slower than container-based infrastructure used for jobs
|
||||
# that don't require sudo.
|
||||
- &test-alpine
|
||||
os: linux
|
||||
stage: test
|
||||
dist: trusty
|
||||
sudo: true
|
||||
@@ -120,6 +123,7 @@ jobs:
|
||||
- BTYPE="BINARY=64 NO_AFFINITY=1 USE_OPENMP=0 NO_LAPACK=0 TARGET=core2"
|
||||
|
||||
- &test-cmake
|
||||
os: linux
|
||||
stage: test
|
||||
compiler: clang
|
||||
addons:
|
||||
@@ -147,6 +151,23 @@ jobs:
|
||||
env:
|
||||
- CMAKE=1
|
||||
|
||||
- &test-macos
|
||||
os: osx
|
||||
stage: test
|
||||
osx_image: xcode8
|
||||
before_script:
|
||||
- COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32"
|
||||
- brew update
|
||||
- brew install gcc # for gfortran
|
||||
script:
|
||||
- travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
|
||||
env:
|
||||
- BTYPE="BINARY=64 INTERFACE64=1"
|
||||
|
||||
- <<: *test-macos
|
||||
env:
|
||||
- BTYPE="BINARY=32"
|
||||
|
||||
# whitelist
|
||||
branches:
|
||||
only:
|
||||
|
||||
@@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5)
|
||||
project(OpenBLAS C ASM)
|
||||
set(OpenBLAS_MAJOR_VERSION 0)
|
||||
set(OpenBLAS_MINOR_VERSION 3)
|
||||
set(OpenBLAS_PATCH_VERSION 0.dev)
|
||||
set(OpenBLAS_PATCH_VERSION 1.dev)
|
||||
set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
|
||||
|
||||
# Adhere to GNU filesystem layout conventions
|
||||
|
||||
15
Makefile
15
Makefile
@@ -91,11 +91,7 @@ ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android))
|
||||
@ln -fs $(LIBSONAME) $(LIBPREFIX).so
|
||||
@ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
|
||||
endif
|
||||
ifeq ($(OSNAME), FreeBSD)
|
||||
@$(MAKE) -C exports so
|
||||
@ln -fs $(LIBSONAME) $(LIBPREFIX).so
|
||||
endif
|
||||
ifeq ($(OSNAME), NetBSD)
|
||||
ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD OpenBSD NetBSD DragonFly))
|
||||
@$(MAKE) -C exports so
|
||||
@ln -fs $(LIBSONAME) $(LIBPREFIX).so
|
||||
endif
|
||||
@@ -298,9 +294,10 @@ endif
|
||||
|
||||
lapack-test :
|
||||
(cd $(NETLIB_LAPACK_DIR)/TESTING && rm -f x* *.out)
|
||||
$(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR)/TESTING xeigtstc xeigtstd xeigtsts xeigtstz xlintstc xlintstd xlintstds xlintstrfd xlintstrfz xlintsts xlintstz xlintstzc xlintstrfs xlintstrfc
|
||||
$(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR)/TESTING/EIG xeigtstc xeigtstd xeigtsts xeigtstz
|
||||
$(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR)/TESTING/LIN xlintstc xlintstd xlintstds xlintstrfd xlintstrfz xlintsts xlintstz xlintstzc xlintstrfs xlintstrfc
|
||||
ifneq ($(CROSS), 1)
|
||||
( cd $(NETLIB_LAPACK_DIR)/INSTALL; ./testlsame; ./testslamch; ./testdlamch; \
|
||||
( cd $(NETLIB_LAPACK_DIR)/INSTALL; make all; ./testlsame; ./testslamch; ./testdlamch; \
|
||||
./testsecond; ./testdsecnd; ./testieee; ./testversion )
|
||||
(cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r )
|
||||
endif
|
||||
@@ -312,9 +309,9 @@ lapack-runtest:
|
||||
|
||||
|
||||
blas-test:
|
||||
(cd $(NETLIB_LAPACK_DIR)/BLAS && rm -f x* *.out)
|
||||
(cd $(NETLIB_LAPACK_DIR)/BLAS/TESTING && rm -f x* *.out)
|
||||
$(MAKE) -j 1 -C $(NETLIB_LAPACK_DIR) blas_testing
|
||||
(cd $(NETLIB_LAPACK_DIR)/BLAS && cat *.out)
|
||||
(cd $(NETLIB_LAPACK_DIR)/BLAS/TESTING && cat *.out)
|
||||
|
||||
|
||||
dummy :
|
||||
|
||||
@@ -72,12 +72,7 @@ ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android))
|
||||
ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \
|
||||
ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION)
|
||||
endif
|
||||
ifeq ($(OSNAME), FreeBSD)
|
||||
@cp $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
|
||||
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
|
||||
ln -fs $(LIBSONAME) $(LIBPREFIX).so
|
||||
endif
|
||||
ifeq ($(OSNAME), NetBSD)
|
||||
ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD OpenBSD NetBSD DragonFly))
|
||||
@cp $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
|
||||
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
|
||||
ln -fs $(LIBSONAME) $(LIBPREFIX).so
|
||||
@@ -101,8 +96,9 @@ endif
|
||||
|
||||
#Generating openblas.pc
|
||||
@echo Generating openblas.pc in "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)"
|
||||
@echo 'libdir='$(OPENBLAS_LIBRARY_DIR) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
|
||||
@echo 'libdir='$(OPENBLAS_LIBRARY_DIR) > "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
|
||||
@echo 'includedir='$(OPENBLAS_INCLUDE_DIR) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
|
||||
@echo 'openblas_config= USE_64BITINT='$(USE_64BITINT) 'DYNAMIC_ARCH='$(DYNAMIC_ARCH) 'NO_CBLAS='$(NO_CBLAS) 'NO_LAPACK='$(NO_LAPACK) 'NO_LAPACKE='$(NO_LAPACKE) 'NO_AFFINITY='$(NO_AFFINITY) 'USE_OPENMP='$(USE_OPENMP) $(CORE) 'MAX_THREADS='$(NUM_THREADS)>> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
|
||||
@echo 'version='$(VERSION) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
|
||||
@echo 'extralib='$(EXTRALIB) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
|
||||
@cat openblas.pc.in >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc"
|
||||
@@ -115,7 +111,7 @@ endif
|
||||
|
||||
ifndef NO_SHARED
|
||||
#ifeq logical or
|
||||
ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD))
|
||||
ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD OpenBSD DragonFly))
|
||||
@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).so)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
|
||||
endif
|
||||
ifeq ($(OSNAME), $(filter $(OSNAME),WINNT CYGWIN_NT))
|
||||
|
||||
@@ -17,6 +17,10 @@ ifdef CPUIDEMU
|
||||
EXFLAGS = -DCPUIDEMU -DVENDOR=99
|
||||
endif
|
||||
|
||||
ifeq ($(TARGET), 1004K)
|
||||
TARGET_FLAGS = -mips32r2
|
||||
endif
|
||||
|
||||
ifeq ($(TARGET), P5600)
|
||||
TARGET_FLAGS = -mips32r5
|
||||
endif
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
#
|
||||
|
||||
# This library's version
|
||||
VERSION = 0.3.0.dev
|
||||
VERSION = 0.3.1.dev
|
||||
|
||||
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
|
||||
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
|
||||
@@ -60,6 +60,13 @@ VERSION = 0.3.0.dev
|
||||
# automatically detected by the script.
|
||||
# NUM_THREADS = 24
|
||||
|
||||
# If you have enabled USE_OPENMP and your application would call
|
||||
# OpenBLAS's calculation API from multi threads, please comment it in.
|
||||
# This flag defines how many instances of OpenBLAS's calculation API can
|
||||
# actually run in parallel. If more threads call OpenBLAS's calculation API,
|
||||
# they need to wait for the preceding API calls to finish or risk data corruption.
|
||||
# NUM_PARALLEL = 2
|
||||
|
||||
# if you don't need to install the static library, please comment it in.
|
||||
# NO_STATIC = 1
|
||||
|
||||
@@ -100,7 +107,7 @@ BUILD_LAPACK_DEPRECATED = 1
|
||||
NO_WARMUP = 1
|
||||
|
||||
# If you want to disable CPU/Memory affinity on Linux.
|
||||
#NO_AFFINITY = 1
|
||||
NO_AFFINITY = 1
|
||||
|
||||
# if you are compiling for Linux and you have more than 16 numa nodes or more than 256 cpus
|
||||
# BIGNUMA = 1
|
||||
|
||||
@@ -17,15 +17,24 @@ NETLIB_LAPACK_DIR = $(TOPDIR)/lapack-netlib
|
||||
# http://stackoverflow.com/questions/4029274/mingw-and-make-variables
|
||||
# - Default value is 'cc' which is not always a valid command (e.g. MinGW).
|
||||
ifeq ($(origin CC),default)
|
||||
|
||||
# Check if $(CC) refers to a valid command and set the value to gcc if not
|
||||
ifneq ($(findstring cmd.exe,$(SHELL)),)
|
||||
ifeq ($(shell where $(CC) 2>NUL),)
|
||||
CC = gcc
|
||||
# Change the default compile to clang on Mac OSX.
|
||||
# http://stackoverflow.com/questions/714100/os-detecting-makefile
|
||||
UNAME_S := $(shell uname -s)
|
||||
ifeq ($(UNAME_S),Darwin)
|
||||
CC = clang
|
||||
# EXTRALIB += -Wl,-no_compact_unwind
|
||||
endif
|
||||
endif
|
||||
else # POSIX-ish
|
||||
ifeq ($(shell command -v $(CC) 2>/dev/null),)
|
||||
ifeq ($(shell uname -s),Darwin)
|
||||
CC = clang
|
||||
# EXTRALIB += -Wl,-no_compact_unwind
|
||||
else
|
||||
CC = gcc
|
||||
endif # Darwin
|
||||
endif # CC exists
|
||||
endif # Shell is sane
|
||||
|
||||
endif # CC is set to default
|
||||
|
||||
# Default Fortran compiler (FC) is selected by f_check.
|
||||
|
||||
@@ -53,6 +62,9 @@ ifeq ($(BINARY), 32)
|
||||
ifeq ($(TARGET), HASWELL)
|
||||
GETARCH_FLAGS := -DFORCE_NEHALEM
|
||||
endif
|
||||
ifeq ($(TARGET), SKYLAKEX)
|
||||
GETARCH_FLAGS := -DFORCE_NEHALEM
|
||||
endif
|
||||
ifeq ($(TARGET), SANDYBRIDGE)
|
||||
GETARCH_FLAGS := -DFORCE_NEHALEM
|
||||
endif
|
||||
@@ -86,6 +98,9 @@ ifeq ($(BINARY), 32)
|
||||
ifeq ($(TARGET_CORE), HASWELL)
|
||||
GETARCH_FLAGS := -DFORCE_NEHALEM
|
||||
endif
|
||||
ifeq ($(TARGET_CORE), SKYLAKEX)
|
||||
GETARCH_FLAGS := -DFORCE_NEHALEM
|
||||
endif
|
||||
ifeq ($(TARGET_CORE), SANDYBRIDGE)
|
||||
GETARCH_FLAGS := -DFORCE_NEHALEM
|
||||
endif
|
||||
@@ -132,6 +147,10 @@ ifeq ($(NO_AVX2), 1)
|
||||
GETARCH_FLAGS += -DNO_AVX2
|
||||
endif
|
||||
|
||||
ifeq ($(NO_AVX512), 1)
|
||||
GETARCH_FLAGS += -DNO_AVX512
|
||||
endif
|
||||
|
||||
ifeq ($(DEBUG), 1)
|
||||
GETARCH_FLAGS += -g
|
||||
endif
|
||||
@@ -175,6 +194,10 @@ endif
|
||||
|
||||
endif
|
||||
|
||||
ifndef NUM_PARALLEL
|
||||
NUM_PARALLEL = 1
|
||||
endif
|
||||
|
||||
ifndef NUM_THREADS
|
||||
NUM_THREADS = $(NUM_CORES)
|
||||
endif
|
||||
@@ -230,7 +253,7 @@ endif
|
||||
MD5SUM = md5 -r
|
||||
endif
|
||||
|
||||
ifeq ($(OSNAME), FreeBSD)
|
||||
ifneq (,$(findstring $(OSNAME), FreeBSD OpenBSD DragonFly))
|
||||
MD5SUM = md5 -r
|
||||
endif
|
||||
|
||||
@@ -424,7 +447,7 @@ CCOMMON_OPT += -fopenmp
|
||||
endif
|
||||
|
||||
ifeq ($(C_COMPILER), INTEL)
|
||||
CCOMMON_OPT += -openmp
|
||||
CCOMMON_OPT += -fopenmp
|
||||
endif
|
||||
|
||||
ifeq ($(C_COMPILER), PGI)
|
||||
@@ -456,6 +479,11 @@ endif
|
||||
ifneq ($(NO_AVX2), 1)
|
||||
DYNAMIC_CORE += HASWELL ZEN
|
||||
endif
|
||||
ifneq ($(NO_AVX512), 1)
|
||||
ifneq ($(NO_AVX2), 1)
|
||||
DYNAMIC_CORE += SKYLAKEX
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
# If DYNAMIC_CORE is not set, DYNAMIC_ARCH cannot do anything, so force it to empty
|
||||
@@ -555,9 +583,14 @@ CCOMMON_OPT += -march=mips64
|
||||
FCOMMON_OPT += -march=mips64
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), 1004K)
|
||||
CCOMMON_OPT += -mips32r2 $(MSA_FLAGS)
|
||||
FCOMMON_OPT += -mips32r2 $(MSA_FLAGS)
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), P5600)
|
||||
CCOMMON_OPT += -mips32r5 -mnan=2008 -mtune=p5600 $(MSA_FLAGS)
|
||||
FCOMMON_OPT += -mips32r5 -mnan=2008 -mtune=p5600 $(MSA_FLAGS)
|
||||
CCOMMON_OPT += -mips32r5 -mnan=2008 -mtune=p5600 $(MSA_FLAGS)
|
||||
FCOMMON_OPT += -mips32r5 -mnan=2008 -mtune=p5600 $(MSA_FLAGS)
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), I6400)
|
||||
@@ -704,7 +737,7 @@ FCOMMON_OPT += -i8
|
||||
endif
|
||||
endif
|
||||
ifeq ($(USE_OPENMP), 1)
|
||||
FCOMMON_OPT += -openmp
|
||||
FCOMMON_OPT += -fopenmp
|
||||
endif
|
||||
endif
|
||||
|
||||
@@ -906,6 +939,10 @@ ifeq ($(NO_AVX2), 1)
|
||||
CCOMMON_OPT += -DNO_AVX2
|
||||
endif
|
||||
|
||||
ifeq ($(NO_AVX512), 1)
|
||||
CCOMMON_OPT += -DNO_AVX512
|
||||
endif
|
||||
|
||||
ifdef SMP
|
||||
CCOMMON_OPT += -DSMP_SERVER
|
||||
|
||||
@@ -952,6 +989,8 @@ endif
|
||||
|
||||
CCOMMON_OPT += -DMAX_CPU_NUMBER=$(NUM_THREADS)
|
||||
|
||||
CCOMMON_OPT += -DMAX_PARALLEL_NUMBER=$(NUM_PARALLEL)
|
||||
|
||||
ifdef USE_SIMPLE_THREADED_LEVEL3
|
||||
CCOMMON_OPT += -DUSE_SIMPLE_THREADED_LEVEL3
|
||||
endif
|
||||
@@ -1210,6 +1249,7 @@ export MSA_FLAGS
|
||||
export KERNELDIR
|
||||
export FUNCTION_PROFILE
|
||||
export TARGET_CORE
|
||||
export NO_AVX512
|
||||
|
||||
export SGEMM_UNROLL_M
|
||||
export SGEMM_UNROLL_N
|
||||
|
||||
230
README.md
230
README.md
@@ -5,175 +5,219 @@
|
||||
Travis CI: [](https://travis-ci.org/xianyi/OpenBLAS)
|
||||
|
||||
AppVeyor: [](https://ci.appveyor.com/project/xianyi/openblas/branch/develop)
|
||||
|
||||
## Introduction
|
||||
|
||||
OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version.
|
||||
|
||||
Please read the documents on OpenBLAS wiki pages <http://github.com/xianyi/OpenBLAS/wiki>.
|
||||
Please read the documentation on the OpenBLAS wiki pages: <http://github.com/xianyi/OpenBLAS/wiki>.
|
||||
|
||||
## Binary Packages
|
||||
We provide binary packages for the following platform.
|
||||
|
||||
We provide official binary packages for the following platform:
|
||||
|
||||
* Windows x86/x86_64
|
||||
|
||||
You can download them from [file hosting on sourceforge.net](https://sourceforge.net/projects/openblas/files/).
|
||||
|
||||
## Installation from Source
|
||||
Download from project homepage. http://xianyi.github.com/OpenBLAS/
|
||||
|
||||
Or, check out codes from git://github.com/xianyi/OpenBLAS.git
|
||||
Download from project homepage, http://xianyi.github.com/OpenBLAS/, or check out the code
|
||||
using Git from https://github.com/xianyi/OpenBLAS.git.
|
||||
|
||||
### Dependencies
|
||||
|
||||
Building OpenBLAS requires the following to be installed:
|
||||
|
||||
* GNU Make
|
||||
* A C compiler, e.g. GCC or Clang
|
||||
* A Fortran compiler (optional, for LAPACK)
|
||||
* IBM MASS (optional, see below)
|
||||
|
||||
### Normal compile
|
||||
* type "make" to detect the CPU automatically.
|
||||
or
|
||||
* type "make TARGET=xxx" to set target CPU, e.g. "make TARGET=NEHALEM". The full target list is in file TargetList.txt.
|
||||
|
||||
Simply invoking `make` (or `gmake` on BSD) will detect the CPU automatically.
|
||||
To set a specific target CPU, use `make TARGET=xxx`, e.g. `make TARGET=NEHALEM`.
|
||||
The full target list is in the file `TargetList.txt`.
|
||||
|
||||
### Cross compile
|
||||
Please set CC and FC with the cross toolchains. Then, set HOSTCC with your host C compiler. At last, set TARGET explicitly.
|
||||
|
||||
Set `CC` and `FC` to point to the cross toolchains, and set `HOSTCC` to your host C compiler.
|
||||
The target must be specified explicitly when cross compiling.
|
||||
|
||||
Examples:
|
||||
|
||||
On X86 box, compile this library for loongson3a CPU.
|
||||
* On an x86 box, compile this library for a loongson3a CPU:
|
||||
```sh
|
||||
make BINARY=64 CC=mips64el-unknown-linux-gnu-gcc FC=mips64el-unknown-linux-gnu-gfortran HOSTCC=gcc TARGET=LOONGSON3A
|
||||
```
|
||||
|
||||
make BINARY=64 CC=mips64el-unknown-linux-gnu-gcc FC=mips64el-unknown-linux-gnu-gfortran HOSTCC=gcc TARGET=LOONGSON3A
|
||||
|
||||
On X86 box, compile this library for loongson3a CPU with loongcc (based on Open64) compiler.
|
||||
|
||||
make CC=loongcc FC=loongf95 HOSTCC=gcc TARGET=LOONGSON3A CROSS=1 CROSS_SUFFIX=mips64el-st-linux-gnu- NO_LAPACKE=1 NO_SHARED=1 BINARY=32
|
||||
* On an x86 box, compile this library for a loongson3a CPU with loongcc (based on Open64) compiler:
|
||||
```sh
|
||||
make CC=loongcc FC=loongf95 HOSTCC=gcc TARGET=LOONGSON3A CROSS=1 CROSS_SUFFIX=mips64el-st-linux-gnu- NO_LAPACKE=1 NO_SHARED=1 BINARY=32
|
||||
```
|
||||
|
||||
### Debug version
|
||||
|
||||
make DEBUG=1
|
||||
A debug version can be built using `make DEBUG=1`.
|
||||
|
||||
### Compile with MASS Support on Power CPU (Optional dependency)
|
||||
### Compile with MASS support on Power CPU (optional)
|
||||
|
||||
[IBM MASS](http://www-01.ibm.com/software/awdtools/mass/linux/mass-linux.html) library consists of a set of mathematical functions for C, C++, and
|
||||
Fortran-language applications that are tuned for optimum performance on POWER architectures. OpenBLAS with MASS requires 64-bit, little-endian OS on POWER.
|
||||
The library can be installed as below -
|
||||
The [IBM MASS](http://www-01.ibm.com/software/awdtools/mass/linux/mass-linux.html) library
|
||||
consists of a set of mathematical functions for C, C++, and Fortran applications that are
|
||||
tuned for optimum performance on POWER architectures.
|
||||
OpenBLAS with MASS requires a 64-bit, little-endian OS on POWER.
|
||||
The library can be installed as shown:
|
||||
|
||||
* On Ubuntu:
|
||||
* On Ubuntu:
|
||||
```sh
|
||||
wget -q http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/ubuntu/public.gpg -O- | sudo apt-key add -
|
||||
echo "deb http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/ubuntu/ trusty main" | sudo tee /etc/apt/sources.list.d/ibm-xl-compiler-eval.list
|
||||
sudo apt-get update
|
||||
sudo apt-get install libxlmass-devel.8.1.5
|
||||
```
|
||||
|
||||
wget -q http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/ubuntu/public.gpg -O- | sudo apt-key add -</br>
|
||||
echo "deb http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/ubuntu/ trusty main" | sudo tee /etc/apt/sources.list.d/ibm-xl-compiler-eval.list</br>
|
||||
sudo apt-get update</br>
|
||||
sudo apt-get install libxlmass-devel.8.1.5</br>
|
||||
* On RHEL/CentOS:
|
||||
```sh
|
||||
wget http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/rhel7/repodata/repomd.xml.key
|
||||
sudo rpm --import repomd.xml.key
|
||||
wget http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/rhel7/ibm-xl-compiler-eval.repo
|
||||
sudo cp ibm-xl-compiler-eval.repo /etc/yum.repos.d/
|
||||
sudo yum install libxlmass-devel.8.1.5
|
||||
```
|
||||
|
||||
* On RHEL/CentOS:
|
||||
After installing the MASS library, compile OpenBLAS with `USE_MASS=1`.
|
||||
For example, to compile on Power8 with MASS support: `make USE_MASS=1 TARGET=POWER8`.
|
||||
|
||||
wget http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/rhel7/repodata/repomd.xml.key</br>
|
||||
sudo rpm --import repomd.xml.key</br>
|
||||
wget http://public.dhe.ibm.com/software/server/POWER/Linux/xl-compiler/eval/ppc64le/rhel7/ibm-xl-compiler-eval.repo</br>
|
||||
sudo cp ibm-xl-compiler-eval.repo /etc/yum.repos.d/</br>
|
||||
sudo yum install libxlmass-devel.8.1.5</br>
|
||||
### Install to a specific directory (optional)
|
||||
|
||||
After installing MASS library, compile openblas with USE_MASS=1.
|
||||
Use `PREFIX=` when invoking `make`, for example
|
||||
|
||||
Example:
|
||||
```sh
|
||||
make install PREFIX=your_installation_directory
|
||||
```
|
||||
|
||||
Compiling on Power8 with MASS support -
|
||||
The default installation directory is `/opt/OpenBLAS`.
|
||||
|
||||
make USE_MASS=1 TARGET=POWER8
|
||||
## Supported CPUs and Operating Systems
|
||||
|
||||
### Install to the directory (optional)
|
||||
Please read `GotoBLAS_01Readme.txt`.
|
||||
|
||||
Example:
|
||||
### Additional supported CPUs
|
||||
|
||||
make install PREFIX=your_installation_directory
|
||||
#### x86/x86-64
|
||||
|
||||
The default directory is /opt/OpenBLAS
|
||||
|
||||
## Support CPU & OS
|
||||
Please read GotoBLAS_01Readme.txt
|
||||
|
||||
### Additional support CPU:
|
||||
|
||||
#### x86/x86-64:
|
||||
- **Intel Xeon 56xx (Westmere)**: Used GotoBLAS2 Nehalem codes.
|
||||
- **Intel Sandy Bridge**: Optimized Level-3 and Level-2 BLAS with AVX on x86-64.
|
||||
- **Intel Haswell**: Optimized Level-3 and Level-2 BLAS with AVX2 and FMA on x86-64.
|
||||
- **AMD Bobcat**: Used GotoBLAS2 Barcelona codes.
|
||||
- **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thank Werner Saar)
|
||||
- **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thanks to Werner Saar)
|
||||
- **AMD PILEDRIVER**: Uses Bulldozer codes with some optimizations.
|
||||
- **AMD STEAMROLLER**: Uses Bulldozer codes with some optimizations.
|
||||
|
||||
#### MIPS64:
|
||||
#### MIPS64
|
||||
|
||||
- **ICT Loongson 3A**: Optimized Level-3 BLAS and parts of Level-1,2.
|
||||
- **ICT Loongson 3B**: Experimental
|
||||
|
||||
#### ARM:
|
||||
- **ARMV6**: Optimized BLAS for vfpv2 and vfpv3-d16 ( e.g. BCM2835, Cortex M0+ )
|
||||
- **ARMV7**: Optimized BLAS for vfpv3-d32 ( e.g. Cortex A8, A9 and A15 )
|
||||
#### ARM
|
||||
|
||||
#### ARM64:
|
||||
- **ARMV8**: Experimental
|
||||
- **ARMv6**: Optimized BLAS for vfpv2 and vfpv3-d16 (e.g. BCM2835, Cortex M0+)
|
||||
- **ARMv7**: Optimized BLAS for vfpv3-d32 (e.g. Cortex A8, A9 and A15)
|
||||
|
||||
#### ARM64
|
||||
|
||||
- **ARMv8**: Experimental
|
||||
- **ARM Cortex-A57**: Experimental
|
||||
|
||||
#### PPC/PPC64
|
||||
- **POWER8**: Optmized Level-3 BLAS and some Level-1, only with USE_OPENMP=1
|
||||
|
||||
#### IBM zEnterprise System:
|
||||
- **POWER8**: Optimized Level-3 BLAS and some Level-1, only with `USE_OPENMP=1`
|
||||
|
||||
#### IBM zEnterprise System
|
||||
|
||||
- **Z13**: Optimized Level-3 BLAS and Level-1,2 (double precision)
|
||||
|
||||
|
||||
### Support OS:
|
||||
### Supported OS
|
||||
|
||||
- **GNU/Linux**
|
||||
- **MingWin or Visual Studio(CMake)/Windows**: Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-in-Microsoft-Visual-Studio>.
|
||||
- **Darwin/Mac OS X**: Experimental. Although GotoBLAS2 supports Darwin, we are the beginner on Mac OS X.
|
||||
- **FreeBSD**: Supported by community. We didn't test the library on this OS.
|
||||
- **Android**: Supported by community. Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-build-OpenBLAS-for-Android>.
|
||||
- **MinGW or Visual Studio (CMake)/Windows**: Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-in-Microsoft-Visual-Studio>.
|
||||
- **Darwin/macOS**: Experimental. Although GotoBLAS2 supports Darwin, we are not macOS experts.
|
||||
- **FreeBSD**: Supported by the community. We don't actively test the library on this OS.
|
||||
- **OpenBSD**: Supported by the community. We don't actively test the library on this OS.
|
||||
- **DragonFly BSD**: Supported by the community. We don't actively test the library on this OS.
|
||||
- **Android**: Supported by the community. Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-build-OpenBLAS-for-Android>.
|
||||
|
||||
## Usages
|
||||
Link with libopenblas.a or -lopenblas for shared library.
|
||||
## Usage
|
||||
|
||||
### Set the number of threads with environment variables.
|
||||
Statically link with `libopenblas.a` or dynamically link with `-lopenblas` if OpenBLAS was
|
||||
compiled as a shared library.
|
||||
|
||||
Examples:
|
||||
### Setting the number of threads using environment variables
|
||||
|
||||
export OPENBLAS_NUM_THREADS=4
|
||||
Environment variables are used to specify a maximum number of threads.
|
||||
For example,
|
||||
|
||||
or
|
||||
```sh
|
||||
export OPENBLAS_NUM_THREADS=4
|
||||
export GOTO_NUM_THREADS=4
|
||||
export OMP_NUM_THREADS=4
|
||||
```
|
||||
|
||||
export GOTO_NUM_THREADS=4
|
||||
The priorities are `OPENBLAS_NUM_THREADS` > `GOTO_NUM_THREADS` > `OMP_NUM_THREADS`.
|
||||
|
||||
or
|
||||
If you compile this library with `USE_OPENMP=1`, you should set the `OMP_NUM_THREADS`
|
||||
environment variable; OpenBLAS ignores `OPENBLAS_NUM_THREADS` and `GOTO_NUM_THREADS` when
|
||||
compiled with `USE_OPENMP=1`.
|
||||
|
||||
export OMP_NUM_THREADS=4
|
||||
### Setting the number of threads at runtime
|
||||
|
||||
The priorities are OPENBLAS_NUM_THREADS > GOTO_NUM_THREADS > OMP_NUM_THREADS.
|
||||
We provide the following functions to control the number of threads at runtime:
|
||||
|
||||
If you compile this lib with USE_OPENMP=1, you should set OMP_NUM_THREADS environment variable. OpenBLAS ignores OPENBLAS_NUM_THREADS and GOTO_NUM_THREADS with USE_OPENMP=1.
|
||||
```c
|
||||
void goto_set_num_threads(int num_threads);
|
||||
void openblas_set_num_threads(int num_threads);
|
||||
```
|
||||
|
||||
### Set the number of threads on runtime.
|
||||
If you compile this library with `USE_OPENMP=1`, you should use the above functions too.
|
||||
|
||||
We provided the below functions to control the number of threads on runtime.
|
||||
## Reporting bugs
|
||||
|
||||
void goto_set_num_threads(int num_threads);
|
||||
|
||||
void openblas_set_num_threads(int num_threads);
|
||||
|
||||
If you compile this lib with USE_OPENMP=1, you should use the above functions, too.
|
||||
|
||||
## Report Bugs
|
||||
Please add a issue in https://github.com/xianyi/OpenBLAS/issues
|
||||
Please submit an issue in https://github.com/xianyi/OpenBLAS/issues.
|
||||
|
||||
## Contact
|
||||
|
||||
* OpenBLAS users mailing list: https://groups.google.com/forum/#!forum/openblas-users
|
||||
* OpenBLAS developers mailing list: https://groups.google.com/forum/#!forum/openblas-dev
|
||||
|
||||
## ChangeLog
|
||||
Please see Changelog.txt to obtain the differences between GotoBLAS2 1.13 BSD version.
|
||||
## Change log
|
||||
|
||||
Please see Changelog.txt to view the differences between OpenBLAS and GotoBLAS2 1.13 BSD version.
|
||||
|
||||
## Troubleshooting
|
||||
* Please read [Faq](https://github.com/xianyi/OpenBLAS/wiki/Faq) at first.
|
||||
* Please use gcc version 4.6 and above to compile Sandy Bridge AVX kernels on Linux/MingW/BSD.
|
||||
* Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture. The Clang 3.0 will generate the wrong AVX binary code.
|
||||
* The number of CPUs/Cores should less than or equal to 256. On Linux x86_64(amd64), there is experimental support for up to 1024 CPUs/Cores and 128 numa nodes if you build the library with BIGNUMA=1.
|
||||
* OpenBLAS does not set processor affinity by default. On Linux, you can enable processor affinity by commenting the line NO_AFFINITY=1 in Makefile.rule. But this may cause [the conflict with R parallel](https://stat.ethz.ch/pipermail/r-sig-hpc/2012-April/001348.html).
|
||||
* On Loongson 3A. make test would be failed because of pthread_create error. The error code is EAGAIN. However, it will be OK when you run the same testcase on shell.
|
||||
|
||||
* Please read the [FAQ](https://github.com/xianyi/OpenBLAS/wiki/Faq) first.
|
||||
* Please use GCC version 4.6 and above to compile Sandy Bridge AVX kernels on Linux/MinGW/BSD.
|
||||
* Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture.
|
||||
Clang 3.0 will generate the wrong AVX binary code.
|
||||
* The number of CPUs/cores should be less than or equal to 256. On Linux `x86_64` (`amd64`),
|
||||
there is experimental support for up to 1024 CPUs/cores and 128 numa nodes if you build
|
||||
the library with `BIGNUMA=1`.
|
||||
* OpenBLAS does not set processor affinity by default.
|
||||
On Linux, you can enable processor affinity by commenting out the line `NO_AFFINITY=1` in
|
||||
Makefile.rule. However, note that this may cause
|
||||
[a conflict with R parallel](https://stat.ethz.ch/pipermail/r-sig-hpc/2012-April/001348.html).
|
||||
* On Loongson 3A, `make test` may fail with a `pthread_create` error (`EAGAIN`).
|
||||
However, it will be okay when you run the same test case on the shell.
|
||||
|
||||
## Contributing
|
||||
1. [Check for open issues](https://github.com/xianyi/OpenBLAS/issues) or open a fresh issue to start a discussion around a feature idea or a bug.
|
||||
1. Fork the [OpenBLAS](https://github.com/xianyi/OpenBLAS) repository to start making your changes.
|
||||
1. Write a test which shows that the bug was fixed or that the feature works as expected.
|
||||
1. Send a pull request. Make sure to add yourself to `CONTRIBUTORS.md`.
|
||||
|
||||
1. [Check for open issues](https://github.com/xianyi/OpenBLAS/issues) or open a fresh issue
|
||||
to start a discussion around a feature idea or a bug.
|
||||
2. Fork the [OpenBLAS](https://github.com/xianyi/OpenBLAS) repository to start making your changes.
|
||||
3. Write a test which shows that the bug was fixed or that the feature works as expected.
|
||||
4. Send a pull request. Make sure to add yourself to `CONTRIBUTORS.md`.
|
||||
|
||||
## Donation
|
||||
|
||||
Please read [this wiki page](https://github.com/xianyi/OpenBLAS/wiki/Donation).
|
||||
|
||||
@@ -20,6 +20,7 @@ DUNNINGTON
|
||||
NEHALEM
|
||||
SANDYBRIDGE
|
||||
HASWELL
|
||||
SKYLAKEX
|
||||
ATOM
|
||||
|
||||
b)AMD CPU:
|
||||
@@ -56,6 +57,7 @@ CELL
|
||||
|
||||
3.MIPS CPU:
|
||||
P5600
|
||||
1004K
|
||||
|
||||
4.MIPS64 CPU:
|
||||
SICORTEX
|
||||
|
||||
14
USAGE.md
14
USAGE.md
@@ -14,6 +14,20 @@ Please build OpenBLAS with larger `NUM_THREADS`. For example, `make
|
||||
NUM_THREADS=32` or `make NUM_THREADS=64`. In `Makefile.system`, we will set
|
||||
`MAX_CPU_NUMBER=NUM_THREADS`.
|
||||
|
||||
Despite its name, and due to the use of memory buffers in functions like SGEMM,
|
||||
the setting of NUM_THREADS can be relevant even for a single-threaded build
|
||||
of OpenBLAS, if such functions get called by multiple threads of a program
|
||||
that uses OpenBLAS. In some cases, the affected code may simply crash or throw
|
||||
a segmentation fault without displaying the above warning first.
|
||||
|
||||
Note that the number of threads used at runtime can be altered to differ from the
|
||||
value NUM_THREADS was set to at build time. At runtime, the actual number of
|
||||
threads can be set anywhere from 1 to the build's NUM_THREADS (note however,
|
||||
that this does not change the number of memory buffers that will be allocated,
|
||||
which is set at build time). The number of threads for a process can be set by
|
||||
using the mechanisms described below.
|
||||
|
||||
|
||||
#### How can I use OpenBLAS in multi-threaded applications?
|
||||
|
||||
If your application is already multi-threaded, it will conflict with OpenBLAS
|
||||
|
||||
@@ -237,7 +237,7 @@ int main(int argc, char *argv[]){
|
||||
timeg = time1/loops;
|
||||
fprintf(stderr,
|
||||
" %10.2f MFlops %10.6f sec\n",
|
||||
COMPSIZE * COMPSIZE * 2. * (double)m * (double)m * (double)n / timeg * 1.e-6, time1);
|
||||
COMPSIZE * COMPSIZE * 2. * (double)k * (double)m * (double)n / timeg * 1.e-6, time1);
|
||||
|
||||
}
|
||||
|
||||
|
||||
18
c_check
18
c_check
@@ -54,6 +54,8 @@ $compiler = GCC if ($compiler eq "");
|
||||
$os = Linux if ($data =~ /OS_LINUX/);
|
||||
$os = FreeBSD if ($data =~ /OS_FREEBSD/);
|
||||
$os = NetBSD if ($data =~ /OS_NETBSD/);
|
||||
$os = OpenBSD if ($data =~ /OS_OPENBSD/);
|
||||
$os = DragonFly if ($data =~ /OS_DRAGONFLY/);
|
||||
$os = Darwin if ($data =~ /OS_DARWIN/);
|
||||
$os = SunOS if ($data =~ /OS_SUNOS/);
|
||||
$os = AIX if ($data =~ /OS_AIX/);
|
||||
@@ -199,6 +201,21 @@ $architecture = zarch if ($data =~ /ARCH_ZARCH/);
|
||||
$binformat = bin32;
|
||||
$binformat = bin64 if ($data =~ /BINARY_64/);
|
||||
|
||||
$no_avx512= 0;
|
||||
if (($architecture eq "x86") || ($architecture eq "x86_64")) {
|
||||
$code = '"vbroadcastss -4 * 4(%rsi), %zmm2"';
|
||||
print $tmpf "int main(void){ __asm__ volatile($code); }\n";
|
||||
$args = " -o $tmpf.o -x c $tmpf";
|
||||
my @cmd = ("$compiler_name $args");
|
||||
system(@cmd) == 0;
|
||||
if ($? != 0) {
|
||||
$no_avx512 = 1;
|
||||
} else {
|
||||
$no_avx512 = 0;
|
||||
}
|
||||
unlink("tmpf.o");
|
||||
}
|
||||
|
||||
$data = `$compiler_name -S ctest1.c && grep globl ctest1.s | head -n 1 && rm -f ctest1.s`;
|
||||
|
||||
$data =~ /globl\s([_\.]*)(.*)/;
|
||||
@@ -286,6 +303,7 @@ print MAKEFILE "CROSS=1\n" if $cross != 0;
|
||||
print MAKEFILE "CEXTRALIB=$linker_L $linker_l $linker_a\n";
|
||||
print MAKEFILE "HAVE_MSA=1\n" if $have_msa eq 1;
|
||||
print MAKEFILE "MSA_FLAGS=$msa_flags\n" if $have_msa eq 1;
|
||||
print MAKEFILE "NO_AVX512=1\n" if $no_avx512 eq 1;
|
||||
|
||||
$os =~ tr/[a-z]/[A-Z]/;
|
||||
$architecture =~ tr/[a-z]/[A-Z]/;
|
||||
|
||||
@@ -56,6 +56,9 @@ if (DYNAMIC_ARCH)
|
||||
if (NOT NO_AVX2)
|
||||
set(DYNAMIC_CORE ${DYNAMIC_CORE} HASWELL ZEN)
|
||||
endif ()
|
||||
if (NOT NO_AVX512)
|
||||
set(DYNAMIC_CORE ${DYNAMIC_CORE} SKYLAKEX)
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
if (NOT DYNAMIC_CORE)
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
libdir=@CMAKE_INSTALL_FULL_LIBDIR@
|
||||
includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@
|
||||
|
||||
openblas_config=USE_64BITINT=@USE_64BITINT@ NO_CBLAS=@NO_CBLAS@ NO_LAPACK=@NO_LAPACK@ NO_LAPACKE=@NO_LAPACKE@ DYNAMIC_ARCH=@DYNAMIC_ARCH@ NO_AFFINITY=@NO_AFFINITY@ USE_OPENMP=@USE_OPENMP@ @CORE@ MAX_THREADS=@NUM_THREADS@
|
||||
Name: OpenBLAS
|
||||
Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version
|
||||
Version: @OPENBLAS_VERSION@
|
||||
|
||||
@@ -33,7 +33,7 @@ endif ()
|
||||
if (DEFINED BINARY AND DEFINED TARGET AND BINARY EQUAL 32)
|
||||
message(STATUS "Compiling a ${BINARY}-bit binary.")
|
||||
set(NO_AVX 1)
|
||||
if (${TARGET} STREQUAL "HASWELL" OR ${TARGET} STREQUAL "SANDYBRIDGE")
|
||||
if (${TARGET} STREQUAL "HASWELL" OR ${TARGET} STREQUAL "SANDYBRIDGE" OR ${TARGET} STREQUAL "SKYLAKEX")
|
||||
set(TARGET "NEHALEM")
|
||||
endif ()
|
||||
if (${TARGET} STREQUAL "BULLDOZER" OR ${TARGET} STREQUAL "PILEDRIVER" OR ${TARGET} STREQUAL "ZEN")
|
||||
@@ -96,8 +96,12 @@ if (NOT CMAKE_CROSSCOMPILING)
|
||||
|
||||
endif()
|
||||
|
||||
if (NOT DEFINED NUM_PARALLEL)
|
||||
set(NUM_PARALLEL 1)
|
||||
endif()
|
||||
|
||||
if (NOT DEFINED NUM_THREADS)
|
||||
if (NOT NUM_CORES EQUAL 0)
|
||||
if (DEFINED NUM_CORES AND NOT NUM_CORES EQUAL 0)
|
||||
# HT?
|
||||
set(NUM_THREADS ${NUM_CORES})
|
||||
else ()
|
||||
@@ -224,6 +228,8 @@ endif ()
|
||||
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DMAX_CPU_NUMBER=${NUM_THREADS}")
|
||||
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DMAX_PARALLEL_NUMBER=${NUM_PARALLEL}")
|
||||
|
||||
if (USE_SIMPLE_THREADED_LEVEL3)
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DUSE_SIMPLE_THREADED_LEVEL3")
|
||||
endif ()
|
||||
|
||||
@@ -66,3 +66,12 @@ else()
|
||||
set(BINARY32 1)
|
||||
endif()
|
||||
|
||||
if (X86_64 OR X86)
|
||||
file(WRITE ${PROJECT_BINARY_DIR}/avx512.tmp "int main(void){ __asm__ volatile(\"vbroadcastss -4 * 4(%rsi), %zmm2\"); }")
|
||||
execute_process(COMMAND ${CMAKE_C_COMPILER} -v -o ${PROJECT_BINARY_DIR}/avx512.o -x c ${PROJECT_BINARY_DIR}/avx512.tmp RESULT_VARIABLE NO_AVX512)
|
||||
if (NO_AVX512 EQUAL 1)
|
||||
set (CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX512")
|
||||
endif()
|
||||
file(REMOVE "avx512.tmp" "avx512.o")
|
||||
endif()
|
||||
|
||||
|
||||
20
common.h
20
common.h
@@ -93,7 +93,7 @@ extern "C" {
|
||||
#include <sched.h>
|
||||
#endif
|
||||
|
||||
#if defined(OS_DARWIN) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(OS_ANDROID)
|
||||
#if defined(OS_DARWIN) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_ANDROID)
|
||||
#include <sched.h>
|
||||
#endif
|
||||
|
||||
@@ -179,7 +179,7 @@ extern "C" {
|
||||
|
||||
#define ALLOCA_ALIGN 63UL
|
||||
|
||||
#define NUM_BUFFERS (MAX_CPU_NUMBER * 2)
|
||||
#define NUM_BUFFERS (MAX_CPU_NUMBER * 2 * MAX_PARALLEL_NUMBER)
|
||||
|
||||
#ifdef NEEDBUNDERSCORE
|
||||
#define BLASFUNC(FUNC) FUNC##_
|
||||
@@ -642,6 +642,7 @@ void gotoblas_profile_init(void);
|
||||
void gotoblas_profile_quit(void);
|
||||
|
||||
#ifdef USE_OPENMP
|
||||
|
||||
#ifndef C_MSVC
|
||||
int omp_in_parallel(void);
|
||||
int omp_get_num_procs(void);
|
||||
@@ -649,6 +650,21 @@ int omp_get_num_procs(void);
|
||||
__declspec(dllimport) int __cdecl omp_in_parallel(void);
|
||||
__declspec(dllimport) int __cdecl omp_get_num_procs(void);
|
||||
#endif
|
||||
|
||||
#if (__STDC_VERSION__ >= 201112L)
|
||||
#if defined(C_GCC) && ( __GNUC__ < 7)
|
||||
// workaround for GCC bug 65467
|
||||
#ifndef _Atomic
|
||||
#define _Atomic volatile
|
||||
#endif
|
||||
#endif
|
||||
#include <stdatomic.h>
|
||||
#else
|
||||
#ifndef _Atomic
|
||||
#define _Atomic volatile
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#else
|
||||
#ifdef __ELF__
|
||||
int omp_in_parallel (void) __attribute__ ((weak));
|
||||
|
||||
10
common_x86.h
10
common_x86.h
@@ -178,7 +178,13 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
|
||||
result = x/y;
|
||||
return result;
|
||||
#else
|
||||
|
||||
#if (MAX_CPU_NUMBER > 64)
|
||||
if ( y > 64) {
|
||||
result = x/y;
|
||||
return result;
|
||||
}
|
||||
#endif
|
||||
|
||||
y = blas_quick_divide_table[y];
|
||||
|
||||
__asm__ __volatile__ ("mull %0" :"=d" (result) :"a"(x), "0" (y));
|
||||
@@ -327,7 +333,7 @@ REALNAME:
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(__ELF__)
|
||||
#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(OS_OPENBSD) || defined(__ELF__)
|
||||
#define PROLOGUE \
|
||||
.text; \
|
||||
.align 16; \
|
||||
|
||||
@@ -196,6 +196,13 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
|
||||
|
||||
if (y <= 1) return x;
|
||||
|
||||
#if (MAX_CPU_NUMBER > 64)
|
||||
if (y > 64) {
|
||||
result = x / y;
|
||||
return result;
|
||||
}
|
||||
#endif
|
||||
|
||||
y = blas_quick_divide_table[y];
|
||||
|
||||
__asm__ __volatile__ ("mull %0" :"=d" (result) :"a"(x), "0" (y));
|
||||
@@ -403,7 +410,7 @@ REALNAME:
|
||||
#define EPILOGUE .end
|
||||
#endif
|
||||
|
||||
#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(__ELF__) || defined(C_PGI)
|
||||
#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(__ELF__) || defined(C_PGI)
|
||||
#define PROLOGUE \
|
||||
.text; \
|
||||
.align 512; \
|
||||
|
||||
3
cpuid.h
3
cpuid.h
@@ -115,6 +115,7 @@
|
||||
#define CORE_STEAMROLLER 25
|
||||
#define CORE_EXCAVATOR 26
|
||||
#define CORE_ZEN 27
|
||||
#define CORE_SKYLAKEX 28
|
||||
|
||||
#define HAVE_SSE (1 << 0)
|
||||
#define HAVE_SSE2 (1 << 1)
|
||||
@@ -137,6 +138,7 @@
|
||||
#define HAVE_AVX (1 << 18)
|
||||
#define HAVE_FMA4 (1 << 19)
|
||||
#define HAVE_FMA3 (1 << 20)
|
||||
#define HAVE_AVX512VL (1 << 21)
|
||||
|
||||
#define CACHE_INFO_L1_I 1
|
||||
#define CACHE_INFO_L1_D 2
|
||||
@@ -211,5 +213,6 @@ typedef struct {
|
||||
#define CPUTYPE_STEAMROLLER 49
|
||||
#define CPUTYPE_EXCAVATOR 50
|
||||
#define CPUTYPE_ZEN 51
|
||||
#define CPUTYPE_SKYLAKEX 52
|
||||
|
||||
#endif
|
||||
|
||||
@@ -121,7 +121,7 @@ int detect(void)
|
||||
return CPU_VULCAN;
|
||||
else if (strstr(cpu_part, "0x0a1") && strstr(cpu_implementer, "0x43"))
|
||||
return CPU_THUNDERX;
|
||||
else if (strstr(cpu_part, "0xFFF") && strstr(cpu_implementer, "0x43")) /* TODO */
|
||||
else if (strstr(cpu_part, "0x0af") && strstr(cpu_implementer, "0x43"))
|
||||
return CPU_THUNDERX2T99;
|
||||
}
|
||||
|
||||
|
||||
58
cpuid_mips.c
58
cpuid_mips.c
@@ -72,10 +72,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#define CPU_UNKNOWN 0
|
||||
#define CPU_P5600 1
|
||||
#define CPU_1004K 2
|
||||
|
||||
static char *cpuname[] = {
|
||||
"UNKOWN",
|
||||
"P5600"
|
||||
"P5600",
|
||||
"1004K"
|
||||
};
|
||||
|
||||
int detect(void){
|
||||
@@ -90,7 +92,7 @@ int detect(void){
|
||||
if (!strncmp("cpu", buffer, 3)){
|
||||
p = strchr(buffer, ':') + 2;
|
||||
#if 0
|
||||
fprintf(stderr, "%s\n", p);
|
||||
fprintf(stderr, "%s \n", p);
|
||||
#endif
|
||||
break;
|
||||
}
|
||||
@@ -99,43 +101,13 @@ int detect(void){
|
||||
fclose(infile);
|
||||
|
||||
if(p != NULL){
|
||||
if (strstr(p, "Loongson-3A")){
|
||||
return CPU_LOONGSON3A;
|
||||
}else if(strstr(p, "Loongson-3B")){
|
||||
return CPU_LOONGSON3B;
|
||||
}else if (strstr(p, "Loongson-3")){
|
||||
infile = fopen("/proc/cpuinfo", "r");
|
||||
p = (char *)NULL;
|
||||
while (fgets(buffer, sizeof(buffer), infile)){
|
||||
if (!strncmp("system type", buffer, 11)){
|
||||
p = strchr(buffer, ':') + 2;
|
||||
break;
|
||||
}
|
||||
}
|
||||
fclose(infile);
|
||||
if (strstr(p, "loongson3a"))
|
||||
return CPU_LOONGSON3A;
|
||||
}else{
|
||||
if (strstr(p, "5600")) {
|
||||
return CPU_P5600;
|
||||
} else if (strstr(p, "1004K")) {
|
||||
return CPU_1004K;
|
||||
} else
|
||||
return CPU_UNKNOWN;
|
||||
}
|
||||
}
|
||||
//Check model name for Loongson3
|
||||
infile = fopen("/proc/cpuinfo", "r");
|
||||
p = (char *)NULL;
|
||||
while (fgets(buffer, sizeof(buffer), infile)){
|
||||
if (!strncmp("model name", buffer, 10)){
|
||||
p = strchr(buffer, ':') + 2;
|
||||
break;
|
||||
}
|
||||
}
|
||||
fclose(infile);
|
||||
if(p != NULL){
|
||||
if (strstr(p, "Loongson-3A")){
|
||||
return CPU_LOONGSON3A;
|
||||
}else if(strstr(p, "Loongson-3B")){
|
||||
return CPU_LOONGSON3B;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
return CPU_UNKNOWN;
|
||||
}
|
||||
@@ -149,7 +121,7 @@ void get_architecture(void){
|
||||
}
|
||||
|
||||
void get_subarchitecture(void){
|
||||
if(detect()==CPU_P5600){
|
||||
if(detect()==CPU_P5600|| detect()==CPU_1004K){
|
||||
printf("P5600");
|
||||
}else{
|
||||
printf("UNKNOWN");
|
||||
@@ -170,6 +142,14 @@ void get_cpuconfig(void){
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
||||
printf("#define DTB_SIZE 4096\n");
|
||||
printf("#define L2_ASSOCIATIVE 8\n");
|
||||
} else if (detect()==CPU_1004K) {
|
||||
printf("#define MIPS1004K\n");
|
||||
printf("#define L1_DATA_SIZE 32768\n");
|
||||
printf("#define L1_DATA_LINESIZE 32\n");
|
||||
printf("#define L2_SIZE 26144\n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 8\n");
|
||||
printf("#define DTB_SIZE 4096\n");
|
||||
printf("#define L2_ASSOCIATIVE 4\n");
|
||||
}else{
|
||||
printf("#define UNKNOWN\n");
|
||||
}
|
||||
@@ -178,6 +158,8 @@ void get_cpuconfig(void){
|
||||
void get_libname(void){
|
||||
if(detect()==CPU_P5600) {
|
||||
printf("p5600\n");
|
||||
} else if (detect()==CPU_1004K) {
|
||||
printf("1004K\n");
|
||||
}else{
|
||||
printf("mips\n");
|
||||
}
|
||||
|
||||
32
cpuid_x86.c
32
cpuid_x86.c
@@ -50,6 +50,8 @@
|
||||
#ifdef NO_AVX
|
||||
#define CPUTYPE_HASWELL CPUTYPE_NEHALEM
|
||||
#define CORE_HASWELL CORE_NEHALEM
|
||||
#define CPUTYPE_SKYLAKEX CPUTYPE_NEHALEM
|
||||
#define CORE_SKYLAKEX CORE_NEHALEM
|
||||
#define CPUTYPE_SANDYBRIDGE CPUTYPE_NEHALEM
|
||||
#define CORE_SANDYBRIDGE CORE_NEHALEM
|
||||
#define CPUTYPE_BULLDOZER CPUTYPE_BARCELONA
|
||||
@@ -1299,6 +1301,19 @@ int get_cpuname(void){
|
||||
else
|
||||
return CPUTYPE_NEHALEM;
|
||||
case 5:
|
||||
// Skylake X
|
||||
#ifndef NO_AVX512
|
||||
return CPUTYPE_SKYLAKEX;
|
||||
#else
|
||||
if(support_avx())
|
||||
#ifndef NO_AVX2
|
||||
return CPUTYPE_HASWELL;
|
||||
#else
|
||||
return CPUTYPE_SANDYBRIDGE;
|
||||
#endif
|
||||
else
|
||||
return CPUTYPE_NEHALEM;
|
||||
#endif
|
||||
case 14:
|
||||
// Skylake
|
||||
if(support_avx())
|
||||
@@ -1556,6 +1571,7 @@ static char *cpuname[] = {
|
||||
"STEAMROLLER",
|
||||
"EXCAVATOR",
|
||||
"ZEN",
|
||||
"SKYLAKEX"
|
||||
};
|
||||
|
||||
static char *lowercpuname[] = {
|
||||
@@ -1610,6 +1626,7 @@ static char *lowercpuname[] = {
|
||||
"steamroller",
|
||||
"excavator",
|
||||
"zen",
|
||||
"skylakex"
|
||||
};
|
||||
|
||||
static char *corename[] = {
|
||||
@@ -1641,6 +1658,7 @@ static char *corename[] = {
|
||||
"STEAMROLLER",
|
||||
"EXCAVATOR",
|
||||
"ZEN",
|
||||
"SKYLAKEX"
|
||||
};
|
||||
|
||||
static char *corename_lower[] = {
|
||||
@@ -1672,6 +1690,7 @@ static char *corename_lower[] = {
|
||||
"steamroller",
|
||||
"excavator",
|
||||
"zen",
|
||||
"skylakex"
|
||||
};
|
||||
|
||||
|
||||
@@ -1860,6 +1879,19 @@ int get_coretype(void){
|
||||
else
|
||||
return CORE_NEHALEM;
|
||||
case 5:
|
||||
// Skylake X
|
||||
#ifndef NO_AVX512
|
||||
return CORE_SKYLAKEX;
|
||||
#else
|
||||
if(support_avx())
|
||||
#ifndef NO_AVX2
|
||||
return CORE_HASWELL;
|
||||
#else
|
||||
return CORE_SANDYBRIDGE;
|
||||
#endif
|
||||
else
|
||||
return CORE_NEHALEM;
|
||||
#endif
|
||||
case 14:
|
||||
// Skylake
|
||||
if(support_avx())
|
||||
|
||||
8
ctest.c
8
ctest.c
@@ -60,6 +60,14 @@ OS_FREEBSD
|
||||
OS_NETBSD
|
||||
#endif
|
||||
|
||||
#if defined(__OpenBSD__)
|
||||
OS_OPENBSD
|
||||
#endif
|
||||
|
||||
#if defined(__DragonFly__)
|
||||
OS_DRAGONFLY
|
||||
#endif
|
||||
|
||||
#if defined(__sun)
|
||||
OS_SUNOS
|
||||
#endif
|
||||
|
||||
@@ -362,7 +362,7 @@ cgemm_ct.$(SUFFIX) : gemm.c level3.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
||||
|
||||
cgemm_cr.$(SUFFIX) : gemm.c level3.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR $< -o $(@F)
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
|
||||
|
||||
cgemm_cc.$(SUFFIX) : gemm.c level3.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
||||
@@ -410,7 +410,7 @@ zgemm_ct.$(SUFFIX) : gemm.c level3.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
||||
|
||||
zgemm_cr.$(SUFFIX) : gemm.c level3.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR $< -o $(@F)
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
|
||||
|
||||
zgemm_cc.$(SUFFIX) : gemm.c level3.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
||||
@@ -458,7 +458,7 @@ xgemm_ct.$(SUFFIX) : gemm.c level3.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
||||
|
||||
xgemm_cr.$(SUFFIX) : gemm.c level3.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F)
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
|
||||
|
||||
xgemm_cc.$(SUFFIX) : gemm.c level3.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
||||
@@ -558,7 +558,7 @@ cgemm_thread_ct.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
||||
|
||||
cgemm_thread_cr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR $< -o $(@F)
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
|
||||
|
||||
cgemm_thread_cc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
||||
@@ -606,7 +606,7 @@ zgemm_thread_ct.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
||||
|
||||
zgemm_thread_cr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR $< -o $(@F)
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
|
||||
|
||||
zgemm_thread_cc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
||||
@@ -654,7 +654,7 @@ xgemm_thread_ct.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
||||
|
||||
xgemm_thread_cr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F)
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
|
||||
|
||||
xgemm_thread_cc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
||||
@@ -1821,7 +1821,7 @@ cgemm3m_ct.$(SUFFIX) : gemm3m.c gemm3m_level3.c
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
||||
|
||||
cgemm3m_cr.$(SUFFIX) : gemm3m.c gemm3m_level3.c
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR $< -o $(@F)
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
|
||||
|
||||
cgemm3m_cc.$(SUFFIX) : gemm3m.c gemm3m_level3.c
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
||||
@@ -1869,7 +1869,7 @@ zgemm3m_ct.$(SUFFIX) : gemm3m.c gemm3m_level3.c
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
||||
|
||||
zgemm3m_cr.$(SUFFIX) : gemm3m.c gemm3m_level3.c
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR $< -o $(@F)
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
|
||||
|
||||
zgemm3m_cc.$(SUFFIX) : gemm3m.c gemm3m_level3.c
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
||||
@@ -1917,7 +1917,7 @@ xgemm3m_ct.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
||||
|
||||
xgemm3m_cr.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F)
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
|
||||
|
||||
xgemm3m_cc.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
||||
@@ -1974,7 +1974,7 @@ cgemm3m_thread_ct.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
||||
|
||||
cgemm3m_thread_cr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR $< -o $(@F)
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
|
||||
|
||||
cgemm3m_thread_cc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
||||
@@ -2022,7 +2022,7 @@ zgemm3m_thread_ct.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
||||
|
||||
zgemm3m_thread_cr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR $< -o $(@F)
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
|
||||
|
||||
zgemm3m_thread_cc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
||||
@@ -2070,7 +2070,7 @@ xgemm3m_thread_ct.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
||||
|
||||
xgemm3m_thread_cr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F)
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
|
||||
|
||||
xgemm3m_thread_cc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
||||
@@ -2731,7 +2731,7 @@ cgemm_ct.$(PSUFFIX) : gemm.c level3.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
||||
|
||||
cgemm_cr.$(PSUFFIX) : gemm.c level3.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR $< -o $(@F)
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
|
||||
|
||||
cgemm_cc.$(PSUFFIX) : gemm.c level3.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
||||
@@ -2779,7 +2779,7 @@ zgemm_ct.$(PSUFFIX) : gemm.c level3.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
||||
|
||||
zgemm_cr.$(PSUFFIX) : gemm.c level3.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR $< -o $(@F)
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
|
||||
|
||||
zgemm_cc.$(PSUFFIX) : gemm.c level3.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
||||
@@ -2827,7 +2827,7 @@ xgemm_ct.$(PSUFFIX) : gemm.c level3.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
||||
|
||||
xgemm_cr.$(PSUFFIX) : gemm.c level3.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F)
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
|
||||
|
||||
xgemm_cc.$(PSUFFIX) : gemm.c level3.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
||||
@@ -2927,7 +2927,7 @@ cgemm_thread_ct.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
||||
|
||||
cgemm_thread_cr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR $< -o $(@F)
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
|
||||
|
||||
cgemm_thread_cc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
||||
@@ -2975,7 +2975,7 @@ zgemm_thread_ct.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
||||
|
||||
zgemm_thread_cr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR $< -o $(@F)
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
|
||||
|
||||
zgemm_thread_cc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
||||
@@ -3023,7 +3023,7 @@ xgemm_thread_ct.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
||||
|
||||
xgemm_thread_cr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F)
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
|
||||
|
||||
xgemm_thread_cc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
||||
@@ -4190,7 +4190,7 @@ cgemm3m_ct.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
||||
|
||||
cgemm3m_cr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR $< -o $(@F)
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
|
||||
|
||||
cgemm3m_cc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
||||
@@ -4238,7 +4238,7 @@ zgemm3m_ct.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
||||
|
||||
zgemm3m_cr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR $< -o $(@F)
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
|
||||
|
||||
zgemm3m_cc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
||||
@@ -4286,7 +4286,7 @@ xgemm3m_ct.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
||||
|
||||
xgemm3m_cr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F)
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
|
||||
|
||||
xgemm3m_cc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
||||
@@ -4343,7 +4343,7 @@ cgemm3m_thread_ct.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
||||
|
||||
cgemm3m_thread_cr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR $< -o $(@F)
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
|
||||
|
||||
cgemm3m_thread_cc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
||||
@@ -4391,7 +4391,7 @@ zgemm3m_thread_ct.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
||||
|
||||
zgemm3m_thread_cr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR $< -o $(@F)
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
|
||||
|
||||
zgemm3m_thread_cc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
||||
@@ -4439,7 +4439,7 @@ xgemm3m_thread_ct.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCT $< -o $(@F)
|
||||
|
||||
xgemm3m_thread_cr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR $< -o $(@F)
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCR=CR $< -o $(@F)
|
||||
|
||||
xgemm3m_thread_cc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCC $< -o $(@F)
|
||||
|
||||
@@ -91,7 +91,12 @@
|
||||
#endif
|
||||
|
||||
typedef struct {
|
||||
volatile BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
|
||||
#if __STDC_VERSION__ >= 201112L
|
||||
_Atomic
|
||||
#else
|
||||
volatile
|
||||
#endif
|
||||
BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
|
||||
} job_t;
|
||||
|
||||
|
||||
|
||||
@@ -67,7 +67,12 @@
|
||||
#endif
|
||||
|
||||
typedef struct {
|
||||
volatile BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
|
||||
#if __STDC_VERSION__ >= 201112L
|
||||
_Atomic
|
||||
#else
|
||||
volatile
|
||||
#endif
|
||||
BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
|
||||
} job_t;
|
||||
|
||||
|
||||
|
||||
@@ -91,7 +91,12 @@
|
||||
#endif
|
||||
|
||||
typedef struct {
|
||||
volatile BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
|
||||
#if __STDC_VERSION__ >= 201112L
|
||||
_Atomic
|
||||
#else
|
||||
volatile
|
||||
#endif
|
||||
BLASLONG working[MAX_CPU_NUMBER][CACHE_LINE_SIZE * DIVIDE_RATE];
|
||||
} job_t;
|
||||
|
||||
|
||||
|
||||
@@ -70,7 +70,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
/*********************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#if defined(OS_LINUX) || defined(OS_NETBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_SUNOS) || defined(OS_FREEBSD)
|
||||
#if defined(OS_LINUX) || defined(OS_NETBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_SUNOS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY)
|
||||
#include <dlfcn.h>
|
||||
#include <signal.h>
|
||||
#include <sys/resource.h>
|
||||
|
||||
@@ -36,6 +36,7 @@
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#include <stdbool.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
//#include <sys/mman.h>
|
||||
@@ -49,11 +50,16 @@
|
||||
|
||||
int blas_server_avail = 0;
|
||||
|
||||
static void * blas_thread_buffer[MAX_CPU_NUMBER];
|
||||
static void * blas_thread_buffer[MAX_PARALLEL_NUMBER][MAX_CPU_NUMBER];
|
||||
#if __STDC_VERSION__ >= 201112L
|
||||
static atomic_bool blas_buffer_inuse[MAX_PARALLEL_NUMBER];
|
||||
#else
|
||||
static _Bool blas_buffer_inuse[MAX_PARALLEL_NUMBER];
|
||||
#endif
|
||||
|
||||
void goto_set_num_threads(int num_threads) {
|
||||
|
||||
int i=0;
|
||||
int i=0, j=0;
|
||||
|
||||
if (num_threads < 1) num_threads = blas_num_threads;
|
||||
|
||||
@@ -68,15 +74,17 @@ void goto_set_num_threads(int num_threads) {
|
||||
omp_set_num_threads(blas_cpu_number);
|
||||
|
||||
//adjust buffer for each thread
|
||||
for(i=0; i<blas_cpu_number; i++){
|
||||
if(blas_thread_buffer[i]==NULL){
|
||||
blas_thread_buffer[i]=blas_memory_alloc(2);
|
||||
for(i=0; i<MAX_PARALLEL_NUMBER; i++) {
|
||||
for(j=0; j<blas_cpu_number; j++){
|
||||
if(blas_thread_buffer[i][j]==NULL){
|
||||
blas_thread_buffer[i][j]=blas_memory_alloc(2);
|
||||
}
|
||||
}
|
||||
}
|
||||
for(; i<MAX_CPU_NUMBER; i++){
|
||||
if(blas_thread_buffer[i]!=NULL){
|
||||
blas_memory_free(blas_thread_buffer[i]);
|
||||
blas_thread_buffer[i]=NULL;
|
||||
for(; j<MAX_CPU_NUMBER; j++){
|
||||
if(blas_thread_buffer[i][j]!=NULL){
|
||||
blas_memory_free(blas_thread_buffer[i][j]);
|
||||
blas_thread_buffer[i][j]=NULL;
|
||||
}
|
||||
}
|
||||
}
|
||||
#if defined(ARCH_MIPS64)
|
||||
@@ -92,30 +100,34 @@ void openblas_set_num_threads(int num_threads) {
|
||||
|
||||
int blas_thread_init(void){
|
||||
|
||||
int i=0;
|
||||
int i=0, j=0;
|
||||
|
||||
blas_get_cpu_number();
|
||||
|
||||
blas_server_avail = 1;
|
||||
|
||||
for(i=0; i<blas_num_threads; i++){
|
||||
blas_thread_buffer[i]=blas_memory_alloc(2);
|
||||
}
|
||||
for(; i<MAX_CPU_NUMBER; i++){
|
||||
blas_thread_buffer[i]=NULL;
|
||||
for(i=0; i<MAX_PARALLEL_NUMBER; i++) {
|
||||
for(j=0; j<blas_num_threads; j++){
|
||||
blas_thread_buffer[i][j]=blas_memory_alloc(2);
|
||||
}
|
||||
for(; j<MAX_CPU_NUMBER; j++){
|
||||
blas_thread_buffer[i][j]=NULL;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int BLASFUNC(blas_thread_shutdown)(void){
|
||||
int i=0;
|
||||
int i=0, j=0;
|
||||
blas_server_avail = 0;
|
||||
|
||||
for(i=0; i<MAX_CPU_NUMBER; i++){
|
||||
if(blas_thread_buffer[i]!=NULL){
|
||||
blas_memory_free(blas_thread_buffer[i]);
|
||||
blas_thread_buffer[i]=NULL;
|
||||
for(i=0; i<MAX_PARALLEL_NUMBER; i++) {
|
||||
for(j=0; j<MAX_CPU_NUMBER; j++){
|
||||
if(blas_thread_buffer[i][j]!=NULL){
|
||||
blas_memory_free(blas_thread_buffer[i][j]);
|
||||
blas_thread_buffer[i][j]=NULL;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -206,7 +218,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
|
||||
}
|
||||
}
|
||||
|
||||
static void exec_threads(blas_queue_t *queue){
|
||||
static void exec_threads(blas_queue_t *queue, int buf_index){
|
||||
|
||||
void *buffer, *sa, *sb;
|
||||
int pos=0, release_flag=0;
|
||||
@@ -223,7 +235,7 @@ static void exec_threads(blas_queue_t *queue){
|
||||
if ((sa == NULL) && (sb == NULL) && ((queue -> mode & BLAS_PTHREAD) == 0)) {
|
||||
|
||||
pos = omp_get_thread_num();
|
||||
buffer = blas_thread_buffer[pos];
|
||||
buffer = blas_thread_buffer[buf_index][pos];
|
||||
|
||||
//fallback
|
||||
if(buffer==NULL) {
|
||||
@@ -291,7 +303,7 @@ static void exec_threads(blas_queue_t *queue){
|
||||
|
||||
int exec_blas(BLASLONG num, blas_queue_t *queue){
|
||||
|
||||
BLASLONG i;
|
||||
BLASLONG i, buf_index;
|
||||
|
||||
if ((num <= 0) || (queue == NULL)) return 0;
|
||||
|
||||
@@ -302,6 +314,23 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){
|
||||
}
|
||||
#endif
|
||||
|
||||
while(true) {
|
||||
for(i=0; i < MAX_PARALLEL_NUMBER; i++) {
|
||||
#if __STDC_VERSION__ >= 201112L
|
||||
_Bool inuse = false;
|
||||
if(atomic_compare_exchange_weak(&blas_buffer_inuse[i], &inuse, true)) {
|
||||
#else
|
||||
if(blas_buffer_inuse[i] == false) {
|
||||
blas_buffer_inuse[i] = true;
|
||||
#endif
|
||||
buf_index = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if(i != MAX_PARALLEL_NUMBER)
|
||||
break;
|
||||
}
|
||||
|
||||
#pragma omp parallel for schedule(static)
|
||||
for (i = 0; i < num; i ++) {
|
||||
|
||||
@@ -309,9 +338,15 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){
|
||||
queue[i].position = i;
|
||||
#endif
|
||||
|
||||
exec_threads(&queue[i]);
|
||||
exec_threads(&queue[i], buf_index);
|
||||
}
|
||||
|
||||
#if __STDC_VERSION__ >= 201112L
|
||||
atomic_store(&blas_buffer_inuse[buf_index], false);
|
||||
#else
|
||||
blas_buffer_inuse[buf_index] = false;
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
@@ -74,15 +74,22 @@ extern gotoblas_t gotoblas_STEAMROLLER;
|
||||
extern gotoblas_t gotoblas_EXCAVATOR;
|
||||
#ifdef NO_AVX2
|
||||
#define gotoblas_HASWELL gotoblas_SANDYBRIDGE
|
||||
#define gotoblas_SKYLAKEX gotoblas_SANDYBRIDGE
|
||||
#define gotoblas_ZEN gotoblas_SANDYBRIDGE
|
||||
#else
|
||||
extern gotoblas_t gotoblas_HASWELL;
|
||||
extern gotoblas_t gotoblas_ZEN;
|
||||
#ifndef NO_AVX512
|
||||
extern gotoblas_t gotoblas_SKYLAKEX;
|
||||
#else
|
||||
#define gotoblas_SKYLAKEX gotoblas_HASWELL
|
||||
#endif
|
||||
#endif
|
||||
#else
|
||||
//Use NEHALEM kernels for sandy bridge
|
||||
#define gotoblas_SANDYBRIDGE gotoblas_NEHALEM
|
||||
#define gotoblas_HASWELL gotoblas_NEHALEM
|
||||
#define gotoblas_SKYLAKEX gotoblas_NEHALEM
|
||||
#define gotoblas_BULLDOZER gotoblas_BARCELONA
|
||||
#define gotoblas_PILEDRIVER gotoblas_BARCELONA
|
||||
#define gotoblas_STEAMROLLER gotoblas_BARCELONA
|
||||
@@ -284,8 +291,21 @@ static gotoblas_t *get_coretype(void){
|
||||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}
|
||||
if (model == 5) {
|
||||
// Intel Skylake X
|
||||
#ifndef NO_AVX512
|
||||
return &gotoblas_SKYLAKEX;
|
||||
#else
|
||||
if(support_avx())
|
||||
return &gotoblas_HASWELL;
|
||||
else {
|
||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
||||
return &gotoblas_NEHALEM;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
//Intel Skylake
|
||||
if (model == 14 || model == 5) {
|
||||
if (model == 14) {
|
||||
if(support_avx())
|
||||
return &gotoblas_HASWELL;
|
||||
else{
|
||||
@@ -445,7 +465,8 @@ static char *corename[] = {
|
||||
"Haswell",
|
||||
"Steamroller",
|
||||
"Excavator",
|
||||
"Zen"
|
||||
"Zen",
|
||||
"SkylakeX"
|
||||
};
|
||||
|
||||
char *gotoblas_corename(void) {
|
||||
@@ -473,7 +494,7 @@ char *gotoblas_corename(void) {
|
||||
if (gotoblas == &gotoblas_STEAMROLLER) return corename[21];
|
||||
if (gotoblas == &gotoblas_EXCAVATOR) return corename[22];
|
||||
if (gotoblas == &gotoblas_ZEN) return corename[23];
|
||||
|
||||
if (gotoblas == &gotoblas_SKYLAKEX) return corename[24];
|
||||
return corename[0];
|
||||
}
|
||||
|
||||
@@ -485,7 +506,7 @@ static gotoblas_t *force_coretype(char *coretype){
|
||||
char message[128];
|
||||
//char mname[20];
|
||||
|
||||
for ( i=1 ; i <= 23; i++)
|
||||
for ( i=1 ; i <= 24; i++)
|
||||
{
|
||||
if (!strncasecmp(coretype,corename[i],20))
|
||||
{
|
||||
@@ -503,6 +524,7 @@ static gotoblas_t *force_coretype(char *coretype){
|
||||
|
||||
switch (found)
|
||||
{
|
||||
case 24: return (&gotoblas_SKYLAKEX);
|
||||
case 23: return (&gotoblas_ZEN);
|
||||
case 22: return (&gotoblas_EXCAVATOR);
|
||||
case 21: return (&gotoblas_STEAMROLLER);
|
||||
|
||||
@@ -108,7 +108,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#include <sys/resource.h>
|
||||
#endif
|
||||
|
||||
#if defined(OS_FREEBSD) || defined(OS_DARWIN)
|
||||
#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN)
|
||||
#include <sys/sysctl.h>
|
||||
#include <sys/resource.h>
|
||||
#endif
|
||||
@@ -147,9 +147,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#elif (defined(OS_DARWIN) || defined(OS_SUNOS)) && defined(C_GCC)
|
||||
#define CONSTRUCTOR __attribute__ ((constructor))
|
||||
#define DESTRUCTOR __attribute__ ((destructor))
|
||||
#else
|
||||
#elif __GNUC__ && INIT_PRIORITY && ((GCC_VERSION >= 40300) || (CLANG_VERSION >= 20900))
|
||||
#define CONSTRUCTOR __attribute__ ((constructor(101)))
|
||||
#define DESTRUCTOR __attribute__ ((destructor(101)))
|
||||
#else
|
||||
#define CONSTRUCTOR __attribute__ ((constructor))
|
||||
#define DESTRUCTOR __attribute__ ((destructor))
|
||||
#endif
|
||||
|
||||
#ifdef DYNAMIC_ARCH
|
||||
@@ -177,7 +180,7 @@ int get_num_procs(void) {
|
||||
cpu_set_t *cpusetp;
|
||||
size_t size;
|
||||
int ret;
|
||||
// int i,n;
|
||||
int i,n;
|
||||
|
||||
if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);
|
||||
#if !defined(OS_LINUX)
|
||||
@@ -209,7 +212,8 @@ int ret;
|
||||
size = CPU_ALLOC_SIZE(nums);
|
||||
ret = sched_getaffinity(0,size,cpusetp);
|
||||
if (ret!=0) return nums;
|
||||
nums = CPU_COUNT_S(size,cpusetp);
|
||||
ret = CPU_COUNT_S(size,cpusetp);
|
||||
if (ret > 0 && ret < nums) nums = ret;
|
||||
CPU_FREE(cpusetp);
|
||||
return nums;
|
||||
#endif
|
||||
@@ -246,7 +250,7 @@ int get_num_procs(void) {
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(OS_FREEBSD)
|
||||
#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY)
|
||||
|
||||
int get_num_procs(void) {
|
||||
|
||||
@@ -336,7 +340,7 @@ extern int openblas_goto_num_threads_env();
|
||||
extern int openblas_omp_num_threads_env();
|
||||
|
||||
int blas_get_cpu_number(void){
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
int max_num;
|
||||
#endif
|
||||
int blas_goto_num = 0;
|
||||
@@ -344,7 +348,7 @@ int blas_get_cpu_number(void){
|
||||
|
||||
if (blas_num_threads) return blas_num_threads;
|
||||
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
max_num = get_num_procs();
|
||||
#endif
|
||||
|
||||
@@ -368,7 +372,7 @@ int blas_get_cpu_number(void){
|
||||
else if (blas_omp_num > 0) blas_num_threads = blas_omp_num;
|
||||
else blas_num_threads = MAX_CPU_NUMBER;
|
||||
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
if (blas_num_threads > max_num) blas_num_threads = max_num;
|
||||
#endif
|
||||
|
||||
@@ -455,11 +459,15 @@ static void *alloc_mmap(void *address){
|
||||
}
|
||||
|
||||
if (map_address != (void *)-1) {
|
||||
#if defined(SMP) && !defined(USE_OPENMP)
|
||||
LOCK_COMMAND(&alloc_lock);
|
||||
#endif
|
||||
release_info[release_pos].address = map_address;
|
||||
release_info[release_pos].func = alloc_mmap_free;
|
||||
release_pos ++;
|
||||
#if defined(SMP) && !defined(USE_OPENMP)
|
||||
UNLOCK_COMMAND(&alloc_lock);
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifdef OS_LINUX
|
||||
@@ -601,14 +609,18 @@ static void *alloc_mmap(void *address){
|
||||
#if defined(OS_LINUX) && !defined(NO_WARMUP)
|
||||
}
|
||||
#endif
|
||||
LOCK_COMMAND(&alloc_lock);
|
||||
|
||||
if (map_address != (void *)-1) {
|
||||
#if defined(SMP) && !defined(USE_OPENMP)
|
||||
LOCK_COMMAND(&alloc_lock);
|
||||
#endif
|
||||
release_info[release_pos].address = map_address;
|
||||
release_info[release_pos].func = alloc_mmap_free;
|
||||
release_pos ++;
|
||||
#if defined(SMP) && !defined(USE_OPENMP)
|
||||
UNLOCK_COMMAND(&alloc_lock);
|
||||
#endif
|
||||
}
|
||||
UNLOCK_COMMAND(&alloc_lock);
|
||||
|
||||
return map_address;
|
||||
}
|
||||
@@ -1007,6 +1019,11 @@ void *blas_memory_alloc(int procpos){
|
||||
NULL,
|
||||
};
|
||||
void *(**func)(void *address);
|
||||
|
||||
#if defined(USE_OPENMP)
|
||||
if (!memory_initialized) {
|
||||
#endif
|
||||
|
||||
LOCK_COMMAND(&alloc_lock);
|
||||
|
||||
if (!memory_initialized) {
|
||||
@@ -1042,6 +1059,9 @@ void *blas_memory_alloc(int procpos){
|
||||
|
||||
}
|
||||
UNLOCK_COMMAND(&alloc_lock);
|
||||
#if defined(USE_OPENMP)
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef DEBUG
|
||||
printf("Alloc Start ...\n");
|
||||
@@ -1056,13 +1076,17 @@ void *blas_memory_alloc(int procpos){
|
||||
|
||||
do {
|
||||
if (!memory[position].used && (memory[position].pos == mypos)) {
|
||||
#if defined(SMP) && !defined(USE_OPENMP)
|
||||
LOCK_COMMAND(&alloc_lock);
|
||||
/* blas_lock(&memory[position].lock);*/
|
||||
|
||||
#else
|
||||
blas_lock(&memory[position].lock);
|
||||
#endif
|
||||
if (!memory[position].used) goto allocation;
|
||||
|
||||
#if defined(SMP) && !defined(USE_OPENMP)
|
||||
UNLOCK_COMMAND(&alloc_lock);
|
||||
/* blas_unlock(&memory[position].lock);*/
|
||||
#else
|
||||
blas_unlock(&memory[position].lock);
|
||||
#endif
|
||||
}
|
||||
|
||||
position ++;
|
||||
@@ -1075,15 +1099,19 @@ void *blas_memory_alloc(int procpos){
|
||||
position = 0;
|
||||
|
||||
do {
|
||||
/* if (!memory[position].used) { */
|
||||
#if defined(SMP) && !defined(USE_OPENMP)
|
||||
LOCK_COMMAND(&alloc_lock);
|
||||
/* blas_lock(&memory[position].lock);*/
|
||||
|
||||
#else
|
||||
if (!memory[position].used) {
|
||||
blas_lock(&memory[position].lock);
|
||||
#endif
|
||||
if (!memory[position].used) goto allocation;
|
||||
|
||||
#if defined(SMP) && !defined(USE_OPENMP)
|
||||
UNLOCK_COMMAND(&alloc_lock);
|
||||
/* blas_unlock(&memory[position].lock);*/
|
||||
/* } */
|
||||
#else
|
||||
blas_unlock(&memory[position].lock);
|
||||
}
|
||||
#endif
|
||||
|
||||
position ++;
|
||||
|
||||
@@ -1098,9 +1126,11 @@ void *blas_memory_alloc(int procpos){
|
||||
#endif
|
||||
|
||||
memory[position].used = 1;
|
||||
|
||||
#if defined(SMP) && !defined(USE_OPENMP)
|
||||
UNLOCK_COMMAND(&alloc_lock);
|
||||
/* blas_unlock(&memory[position].lock);*/
|
||||
#else
|
||||
blas_unlock(&memory[position].lock);
|
||||
#endif
|
||||
|
||||
if (!memory[position].addr) {
|
||||
do {
|
||||
@@ -1146,9 +1176,13 @@ void *blas_memory_alloc(int procpos){
|
||||
|
||||
} while ((BLASLONG)map_address == -1);
|
||||
|
||||
#if defined(SMP) && !defined(USE_OPENMP)
|
||||
LOCK_COMMAND(&alloc_lock);
|
||||
#endif
|
||||
memory[position].addr = map_address;
|
||||
#if defined(SMP) && !defined(USE_OPENMP)
|
||||
UNLOCK_COMMAND(&alloc_lock);
|
||||
#endif
|
||||
|
||||
#ifdef DEBUG
|
||||
printf(" Mapping Succeeded. %p(%d)\n", (void *)memory[position].addr, position);
|
||||
@@ -1202,8 +1236,9 @@ void blas_memory_free(void *free_area){
|
||||
#endif
|
||||
|
||||
position = 0;
|
||||
#if defined(SMP) && !defined(USE_OPENMP)
|
||||
LOCK_COMMAND(&alloc_lock);
|
||||
|
||||
#endif
|
||||
while ((position < NUM_BUFFERS) && (memory[position].addr != free_area))
|
||||
position++;
|
||||
|
||||
@@ -1217,7 +1252,9 @@ void blas_memory_free(void *free_area){
|
||||
WMB;
|
||||
|
||||
memory[position].used = 0;
|
||||
#if defined(SMP) && !defined(USE_OPENMP)
|
||||
UNLOCK_COMMAND(&alloc_lock);
|
||||
#endif
|
||||
|
||||
#ifdef DEBUG
|
||||
printf("Unmap Succeeded.\n\n");
|
||||
@@ -1232,8 +1269,9 @@ void blas_memory_free(void *free_area){
|
||||
for (position = 0; position < NUM_BUFFERS; position++)
|
||||
printf("%4ld %p : %d\n", position, memory[position].addr, memory[position].used);
|
||||
#endif
|
||||
#if defined(SMP) && !defined(USE_OPENMP)
|
||||
UNLOCK_COMMAND(&alloc_lock);
|
||||
|
||||
#endif
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
@@ -54,6 +54,9 @@ static char* openblas_config_str=""
|
||||
#ifdef NO_AFFINITY
|
||||
"NO_AFFINITY "
|
||||
#endif
|
||||
#ifdef USE_OPENMP
|
||||
"USE_OPENMP "
|
||||
#endif
|
||||
#ifndef DYNAMIC_ARCH
|
||||
CHAR_CORENAME
|
||||
#endif
|
||||
@@ -61,18 +64,23 @@ static char* openblas_config_str=""
|
||||
|
||||
#ifdef DYNAMIC_ARCH
|
||||
char *gotoblas_corename();
|
||||
static char tmp_config_str[256];
|
||||
#endif
|
||||
|
||||
static char tmp_config_str[256];
|
||||
int openblas_get_parallel();
|
||||
|
||||
char* CNAME() {
|
||||
#ifndef DYNAMIC_ARCH
|
||||
return openblas_config_str;
|
||||
#else
|
||||
char tmpstr[20];
|
||||
strcpy(tmp_config_str, openblas_config_str);
|
||||
#ifdef DYNAMIC_ARCH
|
||||
strcat(tmp_config_str, gotoblas_corename());
|
||||
return tmp_config_str;
|
||||
#endif
|
||||
if (openblas_get_parallel() == 0)
|
||||
sprintf(tmpstr, " SINGLE_THREADED");
|
||||
else
|
||||
snprintf(tmpstr,19," MAX_THREADS=%d",MAX_CPU_NUMBER);
|
||||
strcat(tmp_config_str, tmpstr);
|
||||
return tmp_config_str;
|
||||
}
|
||||
|
||||
|
||||
@@ -83,3 +91,4 @@ char* openblas_get_corename() {
|
||||
return gotoblas_corename();
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
@@ -167,7 +167,7 @@ int get_L2_size(void){
|
||||
#if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) || \
|
||||
defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \
|
||||
defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC) || \
|
||||
defined(PILEDRIVER) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined(ZEN)
|
||||
defined(PILEDRIVER) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined(ZEN) || defined(SKYLAKEX)
|
||||
|
||||
cpuid(0x80000006, &eax, &ebx, &ecx, &edx);
|
||||
|
||||
@@ -251,7 +251,7 @@ int get_L2_size(void){
|
||||
void blas_set_parameter(void){
|
||||
|
||||
int factor;
|
||||
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined(ZEN)
|
||||
#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined(ZEN) || defined(SKYLAKEX)
|
||||
int size = 16;
|
||||
#else
|
||||
int size = get_L2_size();
|
||||
|
||||
@@ -128,6 +128,8 @@ so : ../$(LIBSONAME)
|
||||
|
||||
ifeq ($(OSNAME), Android)
|
||||
INTERNALNAME = $(LIBPREFIX).so
|
||||
FEXTRALIB += -lm
|
||||
EXTRALIB += -lm
|
||||
else
|
||||
INTERNALNAME = $(LIBPREFIX).so.$(MAJOR_VERSION)
|
||||
endif
|
||||
@@ -156,7 +158,7 @@ endif
|
||||
endif
|
||||
|
||||
#http://stackoverflow.com/questions/7656425/makefile-ifeq-logical-or
|
||||
ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD NetBSD))
|
||||
ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD OpenBSD NetBSD DragonFly))
|
||||
|
||||
so : ../$(LIBSONAME)
|
||||
|
||||
|
||||
6
f_check
6
f_check
@@ -97,7 +97,7 @@ if ($compiler eq "") {
|
||||
|
||||
if ($data =~ /Intel/) {
|
||||
$vendor = INTEL;
|
||||
$openmp = "-openmp";
|
||||
$openmp = "-fopenmp";
|
||||
}
|
||||
|
||||
if ($data =~ /Sun Fortran/) {
|
||||
@@ -127,7 +127,7 @@ if ($compiler eq "") {
|
||||
|
||||
# for embedded underscore name, e.g. zho_ge, it may append 2 underscores.
|
||||
$data = `$compiler -O2 -S ftest3.f > /dev/null 2>&1 && cat ftest3.s && rm -f ftest3.s`;
|
||||
if ($data =~ /zho_ge__/) {
|
||||
if ($data =~ / zho_ge__/) {
|
||||
$need2bu = 1;
|
||||
}
|
||||
}
|
||||
@@ -155,7 +155,7 @@ if ($compiler eq "") {
|
||||
if ($compiler =~ /ifort/) {
|
||||
$vendor = INTEL;
|
||||
$bu = "_";
|
||||
$openmp = "-openmp";
|
||||
$openmp = "-fopenmp";
|
||||
}
|
||||
|
||||
if ($compiler =~ /pathf/) {
|
||||
|
||||
23
getarch.c
23
getarch.c
@@ -82,7 +82,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#ifdef OS_WINDOWS
|
||||
#include <windows.h>
|
||||
#endif
|
||||
#if defined(__FreeBSD__) || defined(__APPLE__)
|
||||
#if defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__DragonFly__) || defined(__APPLE__)
|
||||
#include <sys/types.h>
|
||||
#include <sys/sysctl.h>
|
||||
#endif
|
||||
@@ -326,6 +326,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#define CORENAME "HASWELL"
|
||||
#endif
|
||||
|
||||
#ifdef FORCE_SKYLAKEX
|
||||
#define FORCE
|
||||
#define FORCE_INTEL
|
||||
#define ARCHITECTURE "X86"
|
||||
#define SUBARCHITECTURE "SKYLAKEX"
|
||||
#define ARCHCONFIG "-DSKYLAKEX " \
|
||||
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
|
||||
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
|
||||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
|
||||
"-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \
|
||||
"-DFMA3 -DHAVE_AVX512VL -march=skylake-avx512"
|
||||
#define LIBNAME "skylakex"
|
||||
#define CORENAME "SKYLAKEX"
|
||||
#endif
|
||||
|
||||
#ifdef FORCE_ATOM
|
||||
#define FORCE
|
||||
#define FORCE_INTEL
|
||||
@@ -1074,7 +1089,7 @@ static int get_num_cores(void) {
|
||||
|
||||
#ifdef OS_WINDOWS
|
||||
SYSTEM_INFO sysinfo;
|
||||
#elif defined(__FreeBSD__) || defined(__APPLE__)
|
||||
#elif defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__DragonFly__) || defined(__APPLE__)
|
||||
int m[2], count;
|
||||
size_t len;
|
||||
#endif
|
||||
@@ -1088,7 +1103,7 @@ static int get_num_cores(void) {
|
||||
GetSystemInfo(&sysinfo);
|
||||
return sysinfo.dwNumberOfProcessors;
|
||||
|
||||
#elif defined(__FreeBSD__) || defined(__APPLE__)
|
||||
#elif defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__DragonFly__) || defined(__APPLE__)
|
||||
m[0] = CTL_HW;
|
||||
m[1] = HW_NCPU;
|
||||
len = sizeof(int);
|
||||
@@ -1181,9 +1196,7 @@ int main(int argc, char *argv[]){
|
||||
#elif NO_PARALLEL_MAKE==1
|
||||
printf("MAKE += -j 1\n");
|
||||
#else
|
||||
#ifndef OS_WINDOWS
|
||||
printf("MAKE += -j %d\n", get_num_cores());
|
||||
#endif
|
||||
#endif
|
||||
|
||||
break;
|
||||
|
||||
@@ -44,6 +44,7 @@
|
||||
#endif
|
||||
|
||||
#ifndef COMPLEX
|
||||
#define SMP_THRESHOLD_MIN 65536.0
|
||||
#ifdef XDOUBLE
|
||||
#define ERROR_NAME "QGEMM "
|
||||
#elif defined(DOUBLE)
|
||||
@@ -52,6 +53,7 @@
|
||||
#define ERROR_NAME "SGEMM "
|
||||
#endif
|
||||
#else
|
||||
#define SMP_THRESHOLD_MIN 8192.0
|
||||
#ifndef GEMM3M
|
||||
#ifdef XDOUBLE
|
||||
#define ERROR_NAME "XGEMM "
|
||||
@@ -121,8 +123,6 @@ void NAME(char *TRANSA, char *TRANSB,
|
||||
FLOAT *sa, *sb;
|
||||
|
||||
#ifdef SMP
|
||||
int nthreads_max;
|
||||
int nthreads_avail;
|
||||
double MNK;
|
||||
#ifndef COMPLEX
|
||||
#ifdef XDOUBLE
|
||||
@@ -245,8 +245,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
|
||||
XFLOAT *sa, *sb;
|
||||
|
||||
#ifdef SMP
|
||||
int nthreads_max;
|
||||
int nthreads_avail;
|
||||
double MNK;
|
||||
#ifndef COMPLEX
|
||||
#ifdef XDOUBLE
|
||||
@@ -411,25 +409,12 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
|
||||
mode |= (transa << BLAS_TRANSA_SHIFT);
|
||||
mode |= (transb << BLAS_TRANSB_SHIFT);
|
||||
|
||||
nthreads_max = num_cpu_avail(3);
|
||||
nthreads_avail = nthreads_max;
|
||||
|
||||
#ifndef COMPLEX
|
||||
MNK = (double) args.m * (double) args.n * (double) args.k;
|
||||
if ( MNK <= (65536.0 * (double) GEMM_MULTITHREAD_THRESHOLD) )
|
||||
nthreads_max = 1;
|
||||
#else
|
||||
MNK = (double) args.m * (double) args.n * (double) args.k;
|
||||
if ( MNK <= (8192.0 * (double) GEMM_MULTITHREAD_THRESHOLD) )
|
||||
nthreads_max = 1;
|
||||
#endif
|
||||
args.common = NULL;
|
||||
|
||||
if ( nthreads_max > nthreads_avail )
|
||||
args.nthreads = nthreads_avail;
|
||||
if ( MNK <= (SMP_THRESHOLD_MIN * (double) GEMM_MULTITHREAD_THRESHOLD) )
|
||||
args.nthreads = 1;
|
||||
else
|
||||
args.nthreads = nthreads_max;
|
||||
|
||||
args.nthreads = num_cpu_avail(3);
|
||||
args.common = NULL;
|
||||
|
||||
if (args.nthreads == 1) {
|
||||
#endif
|
||||
|
||||
@@ -64,6 +64,13 @@ void CNAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT dy1, FLOAT *dparam){
|
||||
|
||||
FLOAT du, dp1, dp2, dq2, dq1, dh11=ZERO, dh21=ZERO, dh12=ZERO, dh22=ZERO, dflag=-ONE, dtemp;
|
||||
|
||||
if (*dd2 == ZERO || dy1 == ZERO)
|
||||
{
|
||||
dflag = -TWO;
|
||||
dparam[0] = dflag;
|
||||
return;
|
||||
}
|
||||
|
||||
if(*dd1 < ZERO)
|
||||
{
|
||||
dflag = -ONE;
|
||||
@@ -76,6 +83,16 @@ void CNAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT dy1, FLOAT *dparam){
|
||||
*dd2 = ZERO;
|
||||
*dx1 = ZERO;
|
||||
}
|
||||
else if ((*dd1 == ZERO || *dx1 == ZERO) && *dd2 > ZERO)
|
||||
{
|
||||
dflag = ONE;
|
||||
dh12 = 1;
|
||||
dh21 = -1;
|
||||
*dx1 = dy1;
|
||||
dtemp = *dd1;
|
||||
*dd1 = *dd2;
|
||||
*dd2 = dtemp;
|
||||
}
|
||||
else
|
||||
{
|
||||
dp2 = *dd2 * dy1;
|
||||
@@ -90,6 +107,9 @@ void CNAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT dy1, FLOAT *dparam){
|
||||
dq1 = dp1 * *dx1;
|
||||
if(ABS(dq1) > ABS(dq2))
|
||||
{
|
||||
dflag = ZERO;
|
||||
dh11 = ONE;
|
||||
dh22 = ONE;
|
||||
dh21 = - dy1 / *dx1;
|
||||
dh12 = dp2 / dp1;
|
||||
|
||||
@@ -100,8 +120,19 @@ void CNAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT dy1, FLOAT *dparam){
|
||||
*dd1 = *dd1 / du;
|
||||
*dd2 = *dd2 / du;
|
||||
*dx1 = *dx1 * du;
|
||||
} else {
|
||||
dflag = -ONE;
|
||||
|
||||
dh11 = ZERO;
|
||||
dh12 = ZERO;
|
||||
dh21 = ZERO;
|
||||
dh22 = ZERO;
|
||||
|
||||
*dd1 = ZERO;
|
||||
*dd2 = ZERO;
|
||||
*dx1 = ZERO;
|
||||
}
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -120,7 +151,9 @@ void CNAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT dy1, FLOAT *dparam){
|
||||
}
|
||||
else
|
||||
{
|
||||
dflag = ONE;
|
||||
dflag = ONE;
|
||||
dh21 = -ONE;
|
||||
dh12 = ONE;
|
||||
|
||||
dh11 = dp1 / dp2;
|
||||
dh22 = *dx1 / dy1;
|
||||
@@ -134,76 +167,33 @@ void CNAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT dy1, FLOAT *dparam){
|
||||
}
|
||||
|
||||
|
||||
if(*dd1 != ZERO)
|
||||
while ( *dd1 <= RGAMSQ && *dd1 != ZERO)
|
||||
{
|
||||
if( (*dd1 <= RGAMSQ) || (*dd1 >= GAMSQ) )
|
||||
{
|
||||
if(dflag == ZERO)
|
||||
{
|
||||
dh11 = ONE;
|
||||
dh22 = ONE;
|
||||
dflag = -ONE;
|
||||
}
|
||||
else
|
||||
{
|
||||
dh21 = -ONE;
|
||||
dh12 = ONE;
|
||||
dflag = -ONE;
|
||||
}
|
||||
if( *dd1 <= RGAMSQ )
|
||||
{
|
||||
while (ABS(*dd1) <= RGAMSQ) {
|
||||
*dd1 = *dd1 * (GAM * GAM);
|
||||
*dx1 = *dx1 / GAM;
|
||||
dh11 = dh11 / GAM;
|
||||
dh12 = dh12 / GAM;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
while (ABS(*dd1) >= GAMSQ) {
|
||||
*dd1 = *dd1 / (GAM * GAM);
|
||||
*dx1 = *dx1 * GAM;
|
||||
dh11 = dh11 * GAM;
|
||||
dh12 = dh12 * GAM;
|
||||
}
|
||||
}
|
||||
}
|
||||
dflag = -ONE;
|
||||
*dd1 = *dd1 * (GAM * GAM);
|
||||
*dx1 = *dx1 / GAM;
|
||||
dh11 = dh11 / GAM;
|
||||
dh12 = dh12 / GAM;
|
||||
}
|
||||
while (ABS(*dd1) > GAMSQ) {
|
||||
dflag = -ONE;
|
||||
*dd1 = *dd1 / (GAM * GAM);
|
||||
*dx1 = *dx1 * GAM;
|
||||
dh11 = dh11 * GAM;
|
||||
dh12 = dh12 * GAM;
|
||||
}
|
||||
|
||||
if(*dd2 != ZERO)
|
||||
{
|
||||
if( (ABS(*dd2) <= RGAMSQ) || (ABS(*dd2) >= GAMSQ) )
|
||||
{
|
||||
if(dflag == ZERO)
|
||||
{
|
||||
dh11 = ONE;
|
||||
dh22 = ONE;
|
||||
dflag = -ONE;
|
||||
}
|
||||
else
|
||||
{
|
||||
dh21 = -ONE;
|
||||
dh12 = ONE;
|
||||
dflag = -ONE;
|
||||
}
|
||||
if( ABS(*dd2) <= RGAMSQ )
|
||||
{
|
||||
while (ABS(*dd2) <= RGAMSQ) {
|
||||
*dd2 = *dd2 * (GAM * GAM);
|
||||
dh21 = dh21 / GAM;
|
||||
dh22 = dh22 / GAM;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
while (ABS(*dd2) >= GAMSQ) {
|
||||
*dd2 = *dd2 / (GAM * GAM);
|
||||
dh21 = dh21 * GAM;
|
||||
dh22 = dh22 * GAM;
|
||||
}
|
||||
}
|
||||
}
|
||||
while (ABS(*dd2) <= RGAMSQ && *dd2 != ZERO) {
|
||||
dflag = -ONE;
|
||||
*dd2 = *dd2 * (GAM * GAM);
|
||||
dh21 = dh21 / GAM;
|
||||
dh22 = dh22 / GAM;
|
||||
}
|
||||
while (ABS(*dd2) > GAMSQ) {
|
||||
dflag = -ONE;
|
||||
*dd2 = *dd2 / (GAM * GAM);
|
||||
dh21 = dh21 * GAM;
|
||||
dh22 = dh22 * GAM;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -366,12 +366,13 @@ void CNAME(enum CBLAS_ORDER order,
|
||||
mode |= (trans << BLAS_TRANSA_SHIFT);
|
||||
mode |= (side << BLAS_RSIDE_SHIFT);
|
||||
|
||||
args.nthreads = num_cpu_avail(3);
|
||||
if ( args.m < 2*GEMM_MULTITHREAD_THRESHOLD )
|
||||
args.nthreads = 1;
|
||||
else
|
||||
if ( args.n < 2*GEMM_MULTITHREAD_THRESHOLD )
|
||||
args.nthreads = 1;
|
||||
else
|
||||
args.nthreads = num_cpu_avail(3);
|
||||
|
||||
|
||||
if (args.nthreads == 1) {
|
||||
|
||||
@@ -41,7 +41,11 @@
|
||||
#ifdef FUNCTION_PROFILE
|
||||
#include "functable.h"
|
||||
#endif
|
||||
|
||||
#if defined(Z13)
|
||||
#define MULTI_THREAD_MINIMAL 200000
|
||||
#else
|
||||
#define MULTI_THREAD_MINIMAL 10000
|
||||
#endif
|
||||
#ifndef CBLAS
|
||||
|
||||
void NAME(blasint *N, FLOAT *ALPHA, FLOAT *x, blasint *INCX, FLOAT *y, blasint *INCY){
|
||||
@@ -69,7 +73,7 @@ void CNAME(blasint n, FLOAT *ALPHA, FLOAT *x, blasint incx, FLOAT *y, blasint in
|
||||
#endif
|
||||
|
||||
#ifndef CBLAS
|
||||
PRINT_DEBUG_CNAME;
|
||||
PRINT_DEBUG_NAME;
|
||||
#else
|
||||
PRINT_DEBUG_CNAME;
|
||||
#endif
|
||||
@@ -93,6 +97,11 @@ void CNAME(blasint n, FLOAT *ALPHA, FLOAT *x, blasint incx, FLOAT *y, blasint in
|
||||
if (incx == 0 || incy == 0)
|
||||
nthreads = 1;
|
||||
|
||||
//Work around the low performance issue with small input size &
|
||||
//multithreads.
|
||||
if (n <= MULTI_THREAD_MINIMAL) {
|
||||
nthreads = 1;
|
||||
}
|
||||
if (nthreads == 1) {
|
||||
#endif
|
||||
|
||||
|
||||
@@ -239,6 +239,9 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
|
||||
} else
|
||||
nthreads = 1;
|
||||
|
||||
/* FIXME TRMV multithreading appears to be broken, see issue 1332*/
|
||||
nthreads = 1;
|
||||
|
||||
if(nthreads > 1) {
|
||||
buffer_size = n > 16 ? 0 : n * 4 + 40;
|
||||
}
|
||||
|
||||
@@ -121,7 +121,7 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
|
||||
# Makefile.L3
|
||||
set(USE_TRMM false)
|
||||
|
||||
if (ARM OR ARM64 OR "${TARGET_CORE}" STREQUAL "LONGSOON3B" OR "${TARGET_CORE}" STREQUAL "GENERIC" OR "${CORE}" STREQUAL "generic" OR "${TARGET_CORE}" STREQUAL "HASWELL" OR "${CORE}" STREQUAL "haswell" OR "${CORE}" STREQUAL "zen")
|
||||
if (ARM OR ARM64 OR "${TARGET_CORE}" STREQUAL "LONGSOON3B" OR "${TARGET_CORE}" STREQUAL "GENERIC" OR "${CORE}" STREQUAL "generic" OR "${TARGET_CORE}" STREQUAL "HASWELL" OR "${CORE}" STREQUAL "haswell" OR "${CORE}" STREQUAL "zen" OR "${TARGET_CORE}" STREQUAL "SKYLAKEX" OR "${CORE}" STREQUAL "skylakex")
|
||||
set(USE_TRMM true)
|
||||
endif ()
|
||||
|
||||
|
||||
@@ -29,9 +29,11 @@ USE_TRMM = 1
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), HASWELL)
|
||||
ifeq ($(ARCH), x86_64)
|
||||
USE_TRMM = 1
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), SKYLAKEX)
|
||||
USE_TRMM = 1
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), ZEN)
|
||||
|
||||
@@ -49,6 +49,7 @@ SDOTKERNEL = ../arm/dot.c
|
||||
DDOTKERNEL = ../arm/dot.c
|
||||
CDOTKERNEL = ../arm/zdot.c
|
||||
ZDOTKERNEL = ../arm/zdot.c
|
||||
DSDOTKERNEL = ../generic/dot.c
|
||||
|
||||
SNRM2KERNEL = ../arm/nrm2.c
|
||||
DNRM2KERNEL = ../arm/nrm2.c
|
||||
|
||||
@@ -440,13 +440,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
cmp N, #0
|
||||
ble axpy_kernel_L999
|
||||
|
||||
/*
|
||||
cmp INC_X, #0
|
||||
beq axpy_kernel_L999
|
||||
|
||||
cmp INC_Y, #0
|
||||
beq axpy_kernel_L999
|
||||
|
||||
*/
|
||||
cmp INC_X, #1
|
||||
bne axpy_kernel_S_BEGIN
|
||||
|
||||
|
||||
@@ -483,13 +483,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
cmp N, #0
|
||||
ble rot_kernel_L999
|
||||
|
||||
/*
|
||||
cmp INC_X, #0
|
||||
beq rot_kernel_L999
|
||||
|
||||
cmp INC_Y, #0
|
||||
beq rot_kernel_L999
|
||||
|
||||
*/
|
||||
cmp INC_X, #1
|
||||
bne rot_kernel_S_BEGIN
|
||||
|
||||
@@ -584,6 +584,12 @@ rot_kernel_S1:
|
||||
rot_kernel_S10:
|
||||
|
||||
KERNEL_S1
|
||||
|
||||
cmp INC_X, #0
|
||||
beq rot_kernel_L999
|
||||
|
||||
cmp INC_Y, #0
|
||||
beq rot_kernel_L999
|
||||
|
||||
subs I, I, #1
|
||||
bne rot_kernel_S10
|
||||
|
||||
@@ -49,6 +49,7 @@ SDOTKERNEL = dot.S
|
||||
DDOTKERNEL = dot.S
|
||||
CDOTKERNEL = zdot.S
|
||||
ZDOTKERNEL = zdot.S
|
||||
DSDOTKERNEL = dot.S
|
||||
|
||||
SNRM2KERNEL = nrm2.S
|
||||
DNRM2KERNEL = nrm2.S
|
||||
|
||||
@@ -29,6 +29,7 @@ SDOTKERNEL = dot.S
|
||||
DDOTKERNEL = dot.S
|
||||
CDOTKERNEL = zdot.S
|
||||
ZDOTKERNEL = zdot.S
|
||||
DSDOTKERNEL = dot.S
|
||||
|
||||
SNRM2KERNEL = nrm2.S
|
||||
DNRM2KERNEL = nrm2.S
|
||||
|
||||
@@ -74,8 +74,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#if !defined(DSDOT)
|
||||
fmadd DOTF, TMPX, TMPY, DOTF
|
||||
#else // DSDOT
|
||||
fmul TMPX, TMPX, TMPY
|
||||
fcvt d3, TMPY
|
||||
fcvt d2, TMPX
|
||||
fmul d2, d2, d3
|
||||
fadd DOTF, DOTF, d2
|
||||
#endif
|
||||
.endm
|
||||
@@ -87,12 +88,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#if !defined(DSDOT)
|
||||
fmla v0.4s, v2.4s, v3.4s
|
||||
#else
|
||||
fmul v2.4s, v2.4s, v3.4s
|
||||
ext v3.16b, v2.16b, v2.16b, #8
|
||||
fcvtl v2.2d, v2.2s
|
||||
fcvtl2 v5.2d, v3.4s
|
||||
fcvtl2 v4.2d, v2.4s
|
||||
fcvtl v3.2d, v3.2s
|
||||
fcvtl v2.2d, v2.2s
|
||||
fmul v4.2d, v4.2d, v5.2d
|
||||
fmul v2.2d, v2.2d, v3.2d
|
||||
fadd v2.2d, v2.2d, v4.2d
|
||||
fadd v0.2d, v0.2d, v2.2d
|
||||
fadd v0.2d, v0.2d, v3.2d
|
||||
#endif
|
||||
#else //DOUBLE
|
||||
ld1 {v2.2d, v3.2d}, [X], #32
|
||||
@@ -136,8 +139,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#if !defined(DSDOT)
|
||||
fmadd DOTF, TMPX, TMPY, DOTF
|
||||
#else // DSDOT
|
||||
fmul TMPX, TMPX, TMPY
|
||||
fcvt d3, TMPY
|
||||
fcvt d2, TMPX
|
||||
fmul d2, d2, d3
|
||||
fadd DOTF, DOTF, d2
|
||||
#endif
|
||||
.endm
|
||||
|
||||
@@ -116,22 +116,34 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
||||
if (m & 1) {
|
||||
|
||||
if (X > posY) {
|
||||
/* ao1 += 1;
|
||||
ao2 += 1; */
|
||||
ao1 += 1;
|
||||
ao2 += 1;
|
||||
b += 2;
|
||||
} else
|
||||
#ifdef UNIT
|
||||
if (X < posY) {
|
||||
#endif
|
||||
b[ 0] = *(ao1 + 0);
|
||||
#ifdef UNIT
|
||||
data01 = *(ao1 + 0);
|
||||
data02 = *(ao1 + 1);
|
||||
|
||||
b[ 0] = data01;
|
||||
b[ 1] = data02;
|
||||
ao1 += lda;
|
||||
b += 2;
|
||||
} else {
|
||||
#ifdef UNIT
|
||||
data02 = *(ao1 + 1);
|
||||
|
||||
b[ 0] = ONE;
|
||||
b[ 1] = data02;
|
||||
#else
|
||||
data01 = *(ao1 + 0);
|
||||
data02 = *(ao1 + 1);
|
||||
|
||||
b[ 0] = data01;
|
||||
b[ 1] = data02;
|
||||
#endif
|
||||
ao1 += 2;
|
||||
b += 2;
|
||||
}
|
||||
#endif
|
||||
b[ 1] = *(ao1 + 1);
|
||||
b += 2;
|
||||
}
|
||||
|
||||
posY += 2;
|
||||
@@ -178,7 +190,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
||||
} while (i > 0);
|
||||
}
|
||||
|
||||
// posY += 1;
|
||||
posY += 1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
|
||||
@@ -518,7 +518,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
||||
i = (m & 15);
|
||||
if (i > 0) {
|
||||
if (X < posY) {
|
||||
/* a01 += i;
|
||||
a01 += i;
|
||||
a02 += i;
|
||||
a03 += i;
|
||||
a04 += i;
|
||||
@@ -533,7 +533,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
||||
a13 += i;
|
||||
a14 += i;
|
||||
a15 += i;
|
||||
a16 += i; */
|
||||
a16 += i;
|
||||
b += 16 * i;
|
||||
} else
|
||||
if (X > posY) {
|
||||
@@ -1130,14 +1130,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
||||
i = (m & 7);
|
||||
if (i > 0) {
|
||||
if (X < posY) {
|
||||
/* a01 += i;
|
||||
a01 += i;
|
||||
a02 += i;
|
||||
a03 += i;
|
||||
a04 += i;
|
||||
a05 += i;
|
||||
a06 += i;
|
||||
a07 += i;
|
||||
a08 += i; */
|
||||
a08 += i;
|
||||
b += 8 * i;
|
||||
} else
|
||||
if (X > posY) {
|
||||
@@ -1156,13 +1156,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
||||
b += 8;
|
||||
}
|
||||
|
||||
/* a02 += i * lda;
|
||||
a02 += i * lda;
|
||||
a03 += i * lda;
|
||||
a04 += i * lda;
|
||||
a05 += i * lda;
|
||||
a06 += i * lda;
|
||||
a07 += i * lda;
|
||||
a08 += i * lda; */
|
||||
a08 += i * lda;
|
||||
} else {
|
||||
#ifdef UNIT
|
||||
b[ 0] = ONE;
|
||||
@@ -1371,10 +1371,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
||||
i = (m & 3);
|
||||
if (i > 0) {
|
||||
if (X < posY) {
|
||||
/* a01 += i;
|
||||
a01 += i;
|
||||
a02 += i;
|
||||
a03 += i;
|
||||
a04 += i; */
|
||||
a04 += i;
|
||||
b += 4 * i;
|
||||
} else
|
||||
if (X > posY) {
|
||||
@@ -1387,9 +1387,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
||||
a01 += lda;
|
||||
b += 4;
|
||||
}
|
||||
/* a02 += lda;
|
||||
a02 += lda;
|
||||
a03 += lda;
|
||||
a04 += lda; */
|
||||
a04 += lda;
|
||||
} else {
|
||||
|
||||
#ifdef UNIT
|
||||
@@ -1487,19 +1487,23 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
||||
if (X < posY) {
|
||||
a01 ++;
|
||||
a02 ++;
|
||||
} else {
|
||||
#ifdef UNIT
|
||||
b += 2;
|
||||
} else
|
||||
if (X > posY) {
|
||||
#endif
|
||||
b[ 0] = *(a01 + 0);
|
||||
#ifdef UNIT
|
||||
b[ 1] = *(a01 + 1);
|
||||
a01 += lda;
|
||||
b += 2;
|
||||
} else {
|
||||
#ifdef UNIT
|
||||
b[ 0] = ONE;
|
||||
}
|
||||
b[ 1] = *(a01 + 1);
|
||||
#else
|
||||
b[ 0] = *(a01 + 0);
|
||||
b[ 1] = *(a01 + 1);
|
||||
#endif
|
||||
b[ 1] = *(a01 + 1);
|
||||
}
|
||||
b += 2;
|
||||
b += 2;
|
||||
}
|
||||
}
|
||||
posY += 2;
|
||||
}
|
||||
@@ -1518,25 +1522,28 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
||||
if (i > 0) {
|
||||
do {
|
||||
if (X < posY) {
|
||||
a01 ++;
|
||||
} else {
|
||||
#ifdef UNIT
|
||||
a01 += 1;
|
||||
b ++;
|
||||
} else
|
||||
if (X > posY) {
|
||||
#endif
|
||||
b[ 0] = *(a01 + 0);
|
||||
#ifdef UNIT
|
||||
a01 += lda;
|
||||
b ++;
|
||||
} else {
|
||||
#ifdef UNIT
|
||||
b[ 0] = ONE;
|
||||
}
|
||||
#else
|
||||
b[ 0] = *(a01 + 0);
|
||||
#endif
|
||||
a01 += lda;
|
||||
}
|
||||
b ++;
|
||||
X ++;
|
||||
i --;
|
||||
a01 += lda;
|
||||
b ++;
|
||||
}
|
||||
|
||||
X += 1;
|
||||
i --;
|
||||
} while (i > 0);
|
||||
}
|
||||
// posY += 1;
|
||||
posY += 1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
|
||||
@@ -117,8 +117,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
||||
if (m & 1) {
|
||||
|
||||
if (X < posY) {
|
||||
/* ao1 += 1;
|
||||
ao2 += 1; */
|
||||
ao1 += 1;
|
||||
ao2 += 1;
|
||||
b += 2;
|
||||
} else
|
||||
if (X > posY) {
|
||||
@@ -127,7 +127,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
||||
|
||||
b[ 0] = data01;
|
||||
b[ 1] = data02;
|
||||
// ao1 += lda;
|
||||
ao1 += lda;
|
||||
b += 2;
|
||||
} else {
|
||||
#ifdef UNIT
|
||||
@@ -139,7 +139,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
||||
b[ 0] = data01;
|
||||
b[ 1] = ZERO;
|
||||
#endif
|
||||
// ao1 += lda;
|
||||
ao1 += lda;
|
||||
b += 2;
|
||||
}
|
||||
}
|
||||
@@ -161,18 +161,27 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
||||
i = m;
|
||||
if (m > 0) {
|
||||
do {
|
||||
if (X < posY) {
|
||||
b += 1;
|
||||
ao1 += 1;
|
||||
} else
|
||||
if (X > posY) {
|
||||
data01 = *(ao1 + 0);
|
||||
b[ 0] = data01;
|
||||
b += 1;
|
||||
ao1 += lda;
|
||||
} else {
|
||||
#ifdef UNIT
|
||||
if (X > posY) {
|
||||
b[ 0] = ONE;
|
||||
#else
|
||||
data01 = *(ao1 + 0);
|
||||
b[ 0] = data01;
|
||||
#endif
|
||||
b[ 0] = *(ao1 + 0);
|
||||
#ifdef UNIT
|
||||
} else {
|
||||
b[ 0] = ONE;
|
||||
}
|
||||
#endif
|
||||
b ++;
|
||||
ao1 += lda;
|
||||
X ++;
|
||||
b += 1;
|
||||
ao1 += lda;
|
||||
}
|
||||
|
||||
X += 1;
|
||||
i --;
|
||||
} while (i > 0);
|
||||
}
|
||||
|
||||
@@ -201,18 +201,18 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
||||
if (X < posY) {
|
||||
|
||||
if (m & 2) {
|
||||
/* ao1 += 2;
|
||||
ao1 += 2;
|
||||
ao2 += 2;
|
||||
ao3 += 2;
|
||||
ao4 += 2; */
|
||||
ao4 += 2;
|
||||
b += 8;
|
||||
}
|
||||
|
||||
if (m & 1) {
|
||||
/* ao1 += 1;
|
||||
ao1 += 1;
|
||||
ao2 += 1;
|
||||
ao3 += 1;
|
||||
ao4 += 1; */
|
||||
ao4 += 1;
|
||||
b += 4;
|
||||
}
|
||||
|
||||
@@ -238,7 +238,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
||||
b[ 7] = data08;
|
||||
|
||||
ao1 += 2 * lda;
|
||||
// ao2 += 2 * lda;
|
||||
ao2 += 2 * lda;
|
||||
b += 8;
|
||||
}
|
||||
|
||||
@@ -253,7 +253,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
||||
b[ 2] = data03;
|
||||
b[ 3] = data04;
|
||||
|
||||
// ao1 += lda;
|
||||
ao1 += lda;
|
||||
b += 4;
|
||||
}
|
||||
|
||||
@@ -401,7 +401,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
||||
if (i) {
|
||||
|
||||
if (X < posY) {
|
||||
// ao1 += 2;
|
||||
ao1 += 2;
|
||||
b += 2;
|
||||
} else
|
||||
if (X > posY) {
|
||||
@@ -411,7 +411,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
||||
b[ 0] = data01;
|
||||
b[ 1] = data02;
|
||||
|
||||
// ao1 += lda;
|
||||
ao1 += lda;
|
||||
b += 2;
|
||||
} else {
|
||||
#ifdef UNIT
|
||||
@@ -443,21 +443,26 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
||||
do {
|
||||
|
||||
if (X < posY) {
|
||||
b += 1;
|
||||
ao1 += 1;
|
||||
} else {
|
||||
#ifdef UNIT
|
||||
} else
|
||||
if (X > posY) {
|
||||
#endif
|
||||
b[ 0] = *(ao1 + 0);
|
||||
#ifdef UNIT
|
||||
data01 = *(ao1 + 0);
|
||||
b[ 0] = data01;
|
||||
ao1 += lda;
|
||||
b += 1;
|
||||
} else {
|
||||
#ifdef UNIT
|
||||
b[ 0] = ONE;
|
||||
}
|
||||
#else
|
||||
data01 = *(ao1 + 0);
|
||||
b[ 0] = data01;
|
||||
#endif
|
||||
ao1 += lda;
|
||||
}
|
||||
b ++;
|
||||
X ++;
|
||||
ao1 += lda;
|
||||
b += 1;
|
||||
}
|
||||
|
||||
X += 1;
|
||||
i --;
|
||||
} while (i > 0);
|
||||
}
|
||||
|
||||
@@ -206,7 +206,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
|
||||
}
|
||||
|
||||
a1 += 2 * lda;
|
||||
// a2 += 2 * lda;
|
||||
a2 += 2 * lda;
|
||||
b += 8;
|
||||
|
||||
ii += 2;
|
||||
|
||||
@@ -139,18 +139,48 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
||||
}
|
||||
|
||||
if (m & 1) {
|
||||
#ifdef UNIT
|
||||
|
||||
if (X > posY) {
|
||||
ao1 += 2;
|
||||
ao2 += 2;
|
||||
b += 4;
|
||||
|
||||
} else
|
||||
if (X < posY) {
|
||||
#endif
|
||||
b[ 0] = *(ao1 + 0);
|
||||
b[ 1] = *(ao1 + 1);
|
||||
#ifdef UNIT
|
||||
data1 = *(ao1 + 0);
|
||||
data2 = *(ao1 + 1);
|
||||
data3 = *(ao1 + 2);
|
||||
data4 = *(ao1 + 3);
|
||||
|
||||
b[ 0] = data1;
|
||||
b[ 1] = data2;
|
||||
b[ 2] = data3;
|
||||
b[ 3] = data4;
|
||||
|
||||
ao1 += lda;
|
||||
b += 4;
|
||||
} else {
|
||||
#ifdef UNIT
|
||||
data3 = *(ao1 + 2);
|
||||
data4 = *(ao1 + 3);
|
||||
|
||||
b[ 0] = ONE;
|
||||
b[ 1] = ZERO;
|
||||
}
|
||||
b[ 2] = data3;
|
||||
b[ 3] = data4;
|
||||
#else
|
||||
data1 = *(ao1 + 0);
|
||||
data2 = *(ao1 + 1);
|
||||
data3 = *(ao1 + 2);
|
||||
data4 = *(ao1 + 3);
|
||||
|
||||
b[ 0] = data1;
|
||||
b[ 1] = data2;
|
||||
b[ 2] = data3;
|
||||
b[ 3] = data4;
|
||||
#endif
|
||||
b += 4;
|
||||
b += 4;
|
||||
}
|
||||
}
|
||||
|
||||
posY += 2;
|
||||
@@ -203,7 +233,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
||||
} while (i > 0);
|
||||
}
|
||||
|
||||
// posY += 1;
|
||||
posY += 1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
|
||||
@@ -43,7 +43,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
|
||||
|
||||
BLASLONG i, ii, j, jj;
|
||||
|
||||
FLOAT data01 = 0.0, data02 = 0.0;
|
||||
FLOAT data01, data02;
|
||||
FLOAT *a1;
|
||||
|
||||
lda *= 2;
|
||||
|
||||
@@ -43,8 +43,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
|
||||
|
||||
BLASLONG i, ii, j, jj;
|
||||
|
||||
FLOAT data01 = 0.0, data02 = 0.0, data03, data04;
|
||||
FLOAT data05, data06, data07 = 0.0, data08 = 0.0;
|
||||
FLOAT data01, data02, data03, data04;
|
||||
FLOAT data05, data06, data07, data08;
|
||||
FLOAT *a1, *a2;
|
||||
|
||||
lda *= 2;
|
||||
|
||||
1
kernel/mips/KERNEL.1004K
Normal file
1
kernel/mips/KERNEL.1004K
Normal file
@@ -0,0 +1 @@
|
||||
include $(KERNELDIR)/KERNEL.P5600
|
||||
@@ -38,8 +38,8 @@ ZASUMKERNEL = ../mips/zasum_msa.c
|
||||
else
|
||||
SASUMKERNEL = ../mips/asum.c
|
||||
DASUMKERNEL = ../mips/asum.c
|
||||
CASUMKERNEL = ../mips/asum.c
|
||||
ZASUMKERNEL = ../mips/asum.c
|
||||
CASUMKERNEL = ../mips/zasum.c
|
||||
ZASUMKERNEL = ../mips/zasum.c
|
||||
endif
|
||||
|
||||
ifdef HAVE_MSA
|
||||
@@ -253,4 +253,4 @@ ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
endif
|
||||
endif
|
||||
|
||||
@@ -484,10 +484,10 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *A,
|
||||
FLOAT *pa0, *pa1, *pa2, *pa3, *pa4, *pa5, *pa6, *pa7;
|
||||
FLOAT temp, temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
|
||||
v2f64 v_alpha;
|
||||
v2f64 x0, x1, x2, x3, y0 = 0.0, y1 = 0.0, y2 = 0.0, y3 = 0.0;
|
||||
v2f64 x0, x1, x2, x3, y0 = {0,0}, y1 = {0,0}, y2 = {0,0}, y3 = {0,0};
|
||||
v2f64 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15;
|
||||
v2f64 t16, t17, t18, t19, t20, t21, t22, t23, t24, t25, t26, t27, t28, t29;
|
||||
v2f64 t30, t31, tp0 = 0.0, tp1 = 0.0, tp2 = 0.0, tp3 = 0.0, tp4 = 0.0, tp5 = 0.0, tp6 = 0.0, tp7 = 0.0;
|
||||
v2f64 t30, t31, tp0 = {0,0}, tp1 = {0,0}, tp2 = {0,0}, tp3 = {0,0}, tp4 = {0,0}, tp5 = {0,0}, tp6 = {0,0}, tp7 = {0,0};
|
||||
|
||||
v_alpha = COPY_DOUBLE_TO_VECTOR(alpha);
|
||||
|
||||
|
||||
@@ -41,8 +41,11 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
||||
|
||||
while(i < n)
|
||||
{
|
||||
|
||||
dot += y[iy] * x[ix] ;
|
||||
#if defined(DSDOT)
|
||||
dot += (double)y[iy] * (double)x[ix] ;
|
||||
#else
|
||||
dot += y[iy] * x[ix];
|
||||
#endif
|
||||
ix += inc_x ;
|
||||
iy += inc_y ;
|
||||
i++ ;
|
||||
|
||||
@@ -423,9 +423,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *A,
|
||||
FLOAT *y_org = y;
|
||||
FLOAT *pa0, *pa1, *pa2, *pa3, *pa4, *pa5, *pa6, *pa7;
|
||||
FLOAT temp, temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
|
||||
v4f32 v_alpha, x0, x1, y0 = 0.0, y1 = 0.0;
|
||||
v4f32 v_alpha, x0, x1, y0 = {0,0,0,0}, y1 = {0,0,0,0};
|
||||
v4f32 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15;
|
||||
v4f32 tp0 = 0.0, tp1 = 0.0, tp2 = 0.0, tp3 = 0.0, tp4 = 0.0, tp5 = 0.0, tp6 = 0.0, tp7 = 0.0;
|
||||
v4f32 tp0 = {0,0,0,0}, tp1 = {0,0,0,0}, tp2 = {0,0,0,0}, tp3 = {0,0,0,0}, tp4 = {0,0,0,0}, tp5 = {0,0,0,0}, tp6 = {0,0,0,0}, tp7 = {0,0,0,0};
|
||||
|
||||
v_alpha = COPY_FLOAT_TO_VECTOR(alpha);
|
||||
|
||||
|
||||
@@ -54,3 +54,6 @@ ZTRSMKERNEL_LN = ztrsm_kernel_power6_LN.S
|
||||
ZTRSMKERNEL_LT = ztrsm_kernel_power6_LT.S
|
||||
ZTRSMKERNEL_RN = ztrsm_kernel_power6_LT.S
|
||||
ZTRSMKERNEL_RT = ztrsm_kernel_power6_RT.S
|
||||
|
||||
CROTKERNEL = ../arm/zrot.c
|
||||
ZROTKERNEL = ../arm/zrot.c
|
||||
|
||||
@@ -90,14 +90,14 @@ ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
#DMINKERNEL = ../arm/min.c
|
||||
#
|
||||
#ISAMAXKERNEL = ../arm/iamax.c
|
||||
#IDAMAXKERNEL = ../arm/iamax.c
|
||||
IDAMAXKERNEL = idamax.c
|
||||
#ICAMAXKERNEL = ../arm/izamax.c
|
||||
#IZAMAXKERNEL = ../arm/izamax.c
|
||||
IZAMAXKERNEL = izamax.c
|
||||
#
|
||||
#ISAMINKERNEL = ../arm/iamin.c
|
||||
#IDAMINKERNEL = ../arm/iamin.c
|
||||
IDAMINKERNEL = idamin.c
|
||||
#ICAMINKERNEL = ../arm/izamin.c
|
||||
#IZAMINKERNEL = ../arm/izamin.c
|
||||
IZAMINKERNEL = izamin.c
|
||||
#
|
||||
#ISMAXKERNEL = ../arm/imax.c
|
||||
#IDMAXKERNEL = ../arm/imax.c
|
||||
@@ -133,8 +133,8 @@ ZNRM2KERNEL = ../arm/znrm2.c
|
||||
#
|
||||
SROTKERNEL = srot.c
|
||||
DROTKERNEL = drot.c
|
||||
#CROTKERNEL = ../arm/zrot.c
|
||||
#ZROTKERNEL = ../arm/zrot.c
|
||||
CROTKERNEL = zrot.c
|
||||
ZROTKERNEL = zrot.c
|
||||
#
|
||||
SSCALKERNEL = sscal.c
|
||||
DSCALKERNEL = dscal.c
|
||||
@@ -150,12 +150,12 @@ ZSWAPKERNEL = zswap.c
|
||||
#SGEMVNKERNEL = ../arm/gemv_n.c
|
||||
DGEMVNKERNEL = dgemv_n.c
|
||||
#CGEMVNKERNEL = ../arm/zgemv_n.c
|
||||
#ZGEMVNKERNEL = ../arm/zgemv_n.c
|
||||
ZGEMVNKERNEL = zgemv_n_4.c
|
||||
#
|
||||
#SGEMVTKERNEL = ../arm/gemv_t.c
|
||||
#DGEMVTKERNEL = ../arm/gemv_t.c
|
||||
DGEMVTKERNEL = dgemv_t.c
|
||||
#CGEMVTKERNEL = ../arm/zgemv_t.c
|
||||
#ZGEMVTKERNEL = zgemv_t_4.c
|
||||
ZGEMVTKERNEL = zgemv_t_4.c
|
||||
|
||||
|
||||
#SSYMV_U_KERNEL = ../generic/symv_k.c
|
||||
|
||||
@@ -110,7 +110,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#include "cgemm_tcopy_macros_8_power8.S"
|
||||
|
||||
#define STACKSIZE 576
|
||||
#define STACKSIZE 144
|
||||
|
||||
|
||||
PROLOGUE
|
||||
@@ -119,49 +119,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
addi SP, SP, -STACKSIZE
|
||||
li r0, 0
|
||||
|
||||
std r31, 144(SP)
|
||||
std r30, 152(SP)
|
||||
std r29, 160(SP)
|
||||
std r28, 168(SP)
|
||||
std r27, 176(SP)
|
||||
std r26, 184(SP)
|
||||
std r25, 192(SP)
|
||||
std r24, 200(SP)
|
||||
std r23, 208(SP)
|
||||
std r22, 216(SP)
|
||||
std r21, 224(SP)
|
||||
std r20, 232(SP)
|
||||
std r19, 240(SP)
|
||||
std r18, 248(SP)
|
||||
std r17, 256(SP)
|
||||
std r16, 264(SP)
|
||||
std r15, 272(SP)
|
||||
std r14, 280(SP)
|
||||
addi r11, SP, 288
|
||||
stvx v20, r11, r0
|
||||
addi r11, r11, 16
|
||||
stvx v21, r11, r0
|
||||
addi r11, r11, 16
|
||||
stvx v22, r11, r0
|
||||
addi r11, r11, 16
|
||||
stvx v23, r11, r0
|
||||
addi r11, r11, 16
|
||||
stvx v24, r11, r0
|
||||
addi r11, r11, 16
|
||||
stvx v25, r11, r0
|
||||
addi r11, r11, 16
|
||||
stvx v26, r11, r0
|
||||
addi r11, r11, 16
|
||||
stvx v27, r11, r0
|
||||
addi r11, r11, 16
|
||||
stvx v28, r11, r0
|
||||
addi r11, r11, 16
|
||||
stvx v29, r11, r0
|
||||
addi r11, r11, 16
|
||||
stvx v30, r11, r0
|
||||
addi r11, r11, 16
|
||||
stvx v31, r11, r0
|
||||
li r11, 0
|
||||
std r14, 0(SP)
|
||||
std r15, 8(SP)
|
||||
std r16, 16(SP)
|
||||
std r17, 24(SP)
|
||||
std r18, 32(SP)
|
||||
std r19, 40(SP)
|
||||
std r20, 48(SP)
|
||||
std r21, 56(SP)
|
||||
std r22, 64(SP)
|
||||
std r23, 72(SP)
|
||||
std r24, 80(SP)
|
||||
std r25, 88(SP)
|
||||
std r26, 96(SP)
|
||||
std r27, 104(SP)
|
||||
std r28, 112(SP)
|
||||
std r29, 120(SP)
|
||||
std r30, 128(SP)
|
||||
std r31, 136(SP)
|
||||
|
||||
cmpwi cr0, M, 0
|
||||
ble- L999
|
||||
@@ -203,51 +178,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
L999:
|
||||
|
||||
li r3, 0
|
||||
|
||||
ld r31, 144(SP)
|
||||
ld r30, 152(SP)
|
||||
ld r29, 160(SP)
|
||||
ld r28, 168(SP)
|
||||
ld r27, 176(SP)
|
||||
ld r26, 184(SP)
|
||||
ld r25, 192(SP)
|
||||
ld r24, 200(SP)
|
||||
ld r23, 208(SP)
|
||||
ld r22, 216(SP)
|
||||
ld r21, 224(SP)
|
||||
ld r20, 232(SP)
|
||||
ld r19, 240(SP)
|
||||
ld r18, 248(SP)
|
||||
ld r17, 256(SP)
|
||||
ld r16, 264(SP)
|
||||
ld r15, 272(SP)
|
||||
ld r14, 280(SP)
|
||||
addi r11, SP, 288
|
||||
lvx v20, r11, r3
|
||||
addi r11, r11, 16
|
||||
lvx v21, r11, r3
|
||||
addi r11, r11, 16
|
||||
lvx v22, r11, r3
|
||||
addi r11, r11, 16
|
||||
lvx v23, r11, r3
|
||||
addi r11, r11, 16
|
||||
lvx v24, r11, r3
|
||||
addi r11, r11, 16
|
||||
lvx v25, r11, r3
|
||||
addi r11, r11, 16
|
||||
lvx v26, r11, r3
|
||||
addi r11, r11, 16
|
||||
lvx v27, r11, r3
|
||||
addi r11, r11, 16
|
||||
lvx v28, r11, r3
|
||||
addi r11, r11, 16
|
||||
lvx v29, r11, r3
|
||||
addi r11, r11, 16
|
||||
lvx v30, r11, r3
|
||||
addi r11, r11, 16
|
||||
lvx v31, r11, r3
|
||||
li r11, 0
|
||||
ld r14, 0(SP)
|
||||
ld r15, 8(SP)
|
||||
ld r16, 16(SP)
|
||||
ld r17, 24(SP)
|
||||
ld r18, 32(SP)
|
||||
ld r19, 40(SP)
|
||||
ld r20, 48(SP)
|
||||
ld r21, 56(SP)
|
||||
ld r22, 64(SP)
|
||||
ld r23, 72(SP)
|
||||
ld r24, 80(SP)
|
||||
ld r25, 88(SP)
|
||||
ld r26, 96(SP)
|
||||
ld r27, 104(SP)
|
||||
ld r28, 112(SP)
|
||||
ld r29, 120(SP)
|
||||
ld r30, 128(SP)
|
||||
ld r31, 136(SP)
|
||||
|
||||
addi SP, SP, STACKSIZE
|
||||
blr
|
||||
|
||||
@@ -109,81 +109,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#include "dgemm_ncopy_macros_4_power8.S"
|
||||
|
||||
#define STACKSIZE 384
|
||||
#define STACKSIZE 576
|
||||
#define STACKSIZE 144
|
||||
|
||||
|
||||
PROLOGUE
|
||||
PROFCODE
|
||||
|
||||
addi SP, SP, -STACKSIZE
|
||||
//addi SP, SP, -208
|
||||
|
||||
li r0, 0
|
||||
|
||||
stfd f14, 0(SP)
|
||||
stfd f15, 8(SP)
|
||||
stfd f16, 16(SP)
|
||||
stfd f17, 24(SP)
|
||||
stfd f18, 32(SP)
|
||||
stfd f19, 40(SP)
|
||||
stfd f20, 48(SP)
|
||||
stfd f21, 56(SP)
|
||||
stfd f22, 64(SP)
|
||||
stfd f23, 72(SP)
|
||||
stfd f24, 80(SP)
|
||||
stfd f25, 88(SP)
|
||||
stfd f26, 96(SP)
|
||||
stfd f27, 104(SP)
|
||||
stfd f28, 112(SP)
|
||||
stfd f29, 120(SP)
|
||||
stfd f30, 128(SP)
|
||||
stfd f31, 136(SP)
|
||||
std r14, 0(SP)
|
||||
std r15, 8(SP)
|
||||
std r16, 16(SP)
|
||||
std r17, 24(SP)
|
||||
std r18, 32(SP)
|
||||
std r19, 40(SP)
|
||||
std r20, 48(SP)
|
||||
std r21, 56(SP)
|
||||
std r22, 64(SP)
|
||||
std r23, 72(SP)
|
||||
std r24, 80(SP)
|
||||
std r25, 88(SP)
|
||||
std r26, 96(SP)
|
||||
std r27, 104(SP)
|
||||
std r28, 112(SP)
|
||||
std r29, 120(SP)
|
||||
std r30, 128(SP)
|
||||
std r31, 136(SP)
|
||||
|
||||
|
||||
std r31, 144(SP)
|
||||
std r30, 152(SP)
|
||||
std r29, 160(SP)
|
||||
std r28, 168(SP)
|
||||
std r27, 176(SP)
|
||||
std r26, 184(SP)
|
||||
std r25, 192(SP)
|
||||
std r24, 200(SP)
|
||||
std r23, 208(SP)
|
||||
std r22, 216(SP)
|
||||
std r21, 224(SP)
|
||||
std r20, 232(SP)
|
||||
std r19, 240(SP)
|
||||
std r18, 248(SP)
|
||||
std r17, 256(SP)
|
||||
std r16, 264(SP)
|
||||
std r15, 272(SP)
|
||||
std r14, 280(SP)
|
||||
|
||||
addi r11,SP,288
|
||||
stvx v20, r11,r0
|
||||
addi r11,r11,16
|
||||
stvx v21, r11,r0
|
||||
addi r11,r11,16
|
||||
stvx v22, r11,r0
|
||||
addi r11,r11,16
|
||||
stvx v23, r11,r0
|
||||
addi r11,r11,16
|
||||
stvx v24, r11,r0
|
||||
addi r11,r11,16
|
||||
stvx v25, r11,r0
|
||||
addi r11,r11,16
|
||||
stvx v26, r11,r0
|
||||
addi r11,r11,16
|
||||
stvx v27, r11,r0
|
||||
addi r11,r11,16
|
||||
stvx v28, r11,r0
|
||||
addi r11,r11,16
|
||||
stvx v29, r11,r0
|
||||
addi r11,r11,16
|
||||
stvx v30, r11,r0
|
||||
addi r11,r11,16
|
||||
stvx v31, r11,r0
|
||||
li r11,0
|
||||
|
||||
cmpwi cr0, M, 0
|
||||
ble- L999
|
||||
cmpwi cr0, N, 0
|
||||
@@ -191,10 +146,8 @@ li r11,0
|
||||
|
||||
slwi LDA, LDA, BASE_SHIFT
|
||||
|
||||
//li PREA, 384
|
||||
//li PREB, 384
|
||||
li PREA, 576
|
||||
li PREB, 576
|
||||
li PREA, 384
|
||||
li PREB, 384
|
||||
|
||||
|
||||
li o8, 8
|
||||
@@ -210,70 +163,24 @@ li r11,0
|
||||
|
||||
L999:
|
||||
|
||||
li r3, 0
|
||||
|
||||
lfd f14, 0(SP)
|
||||
lfd f15, 8(SP)
|
||||
lfd f16, 16(SP)
|
||||
lfd f17, 24(SP)
|
||||
lfd f18, 32(SP)
|
||||
lfd f19, 40(SP)
|
||||
lfd f20, 48(SP)
|
||||
lfd f21, 56(SP)
|
||||
lfd f22, 64(SP)
|
||||
lfd f23, 72(SP)
|
||||
lfd f24, 80(SP)
|
||||
lfd f25, 88(SP)
|
||||
lfd f26, 96(SP)
|
||||
lfd f27, 104(SP)
|
||||
lfd f28, 112(SP)
|
||||
lfd f29, 120(SP)
|
||||
lfd f30, 128(SP)
|
||||
lfd f31, 136(SP)
|
||||
|
||||
ld r31, 144(SP)
|
||||
ld r30, 152(SP)
|
||||
ld r29, 160(SP)
|
||||
ld r28, 168(SP)
|
||||
ld r27, 176(SP)
|
||||
ld r26, 184(SP)
|
||||
ld r25, 192(SP)
|
||||
ld r24, 200(SP)
|
||||
ld r23, 208(SP)
|
||||
ld r22, 216(SP)
|
||||
ld r21, 224(SP)
|
||||
ld r20, 232(SP)
|
||||
ld r19, 240(SP)
|
||||
ld r18, 248(SP)
|
||||
ld r17, 256(SP)
|
||||
ld r16, 264(SP)
|
||||
ld r15, 272(SP)
|
||||
ld r14, 280(SP)
|
||||
addi r11,SP,288
|
||||
lvx v20, r11,r3
|
||||
addi r11,r11,16
|
||||
lvx v21, r11,r3
|
||||
addi r11,r11,16
|
||||
lvx v22, r11,r3
|
||||
addi r11,r11,16
|
||||
lvx v23, r11,r3
|
||||
addi r11,r11,16
|
||||
lvx v24, r11,r3
|
||||
addi r11,r11,16
|
||||
lvx v25, r11,r3
|
||||
addi r11,r11,16
|
||||
lvx v26, r11,r3
|
||||
addi r11,r11,16
|
||||
lvx v27, r11,r3
|
||||
addi r11,r11,16
|
||||
lvx v28, r11,r3
|
||||
addi r11,r11,16
|
||||
lvx v29, r11,r3
|
||||
addi r11,r11,16
|
||||
lvx v30, r11,r3
|
||||
addi r11,r11,16
|
||||
lvx v31, r11,r3
|
||||
li r11,0
|
||||
ld r14, 0(SP)
|
||||
ld r15, 8(SP)
|
||||
ld r16, 16(SP)
|
||||
ld r17, 24(SP)
|
||||
ld r18, 32(SP)
|
||||
ld r19, 40(SP)
|
||||
ld r20, 48(SP)
|
||||
ld r21, 56(SP)
|
||||
ld r22, 64(SP)
|
||||
ld r23, 72(SP)
|
||||
ld r24, 80(SP)
|
||||
ld r25, 88(SP)
|
||||
ld r26, 96(SP)
|
||||
ld r27, 104(SP)
|
||||
ld r28, 112(SP)
|
||||
ld r29, 120(SP)
|
||||
ld r30, 128(SP)
|
||||
ld r31, 136(SP)
|
||||
|
||||
addi SP, SP, STACKSIZE
|
||||
//addi SP, SP, 208
|
||||
|
||||
@@ -41,94 +41,98 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
.macro COPY_4x16
|
||||
|
||||
lxvd2x vs0, o0, A0
|
||||
lxvd2x vs8, o0, A1
|
||||
lxvd2x vs24, o0, A3
|
||||
lxvd2x vs16, o0, A2
|
||||
lxvd2x vs1, o0, A1
|
||||
lxvd2x vs2, o0, A2
|
||||
lxvd2x vs3, o0, A3
|
||||
|
||||
lxvd2x vs1, o16, A0
|
||||
lxvd2x vs9, o16, A1
|
||||
lxvd2x vs17, o16, A2
|
||||
lxvd2x vs25, o16, A3
|
||||
lxvd2x vs4, o16, A0
|
||||
lxvd2x vs5, o16, A1
|
||||
lxvd2x vs6, o16, A2
|
||||
lxvd2x vs7, o16, A3
|
||||
|
||||
lxvd2x vs2, o32, A0
|
||||
lxvd2x vs10, o32, A1
|
||||
lxvd2x vs18, o32, A2
|
||||
lxvd2x vs26, o32, A3
|
||||
xxpermdi vs32, vs0, vs1, 0
|
||||
xxpermdi vs33, vs2, vs3, 0
|
||||
xxpermdi vs34, vs0, vs1, 3
|
||||
xxpermdi vs35, vs2, vs3, 3
|
||||
|
||||
lxvd2x vs3, o48, A0
|
||||
lxvd2x vs11, o48, A1
|
||||
lxvd2x vs19, o48, A2
|
||||
lxvd2x vs27, o48, A3
|
||||
xxpermdi vs36, vs4, vs5, 0
|
||||
xxpermdi vs37, vs6, vs7, 0
|
||||
xxpermdi vs38, vs4, vs5, 3
|
||||
xxpermdi vs39, vs6, vs7, 3
|
||||
|
||||
lxvd2x vs4, o64, A0
|
||||
lxvd2x vs12, o64, A1
|
||||
lxvd2x vs20, o64, A2
|
||||
lxvd2x vs28, o64, A3
|
||||
lxvd2x vs0, o32, A0
|
||||
lxvd2x vs1, o32, A1
|
||||
lxvd2x vs2, o32, A2
|
||||
lxvd2x vs3, o32, A3
|
||||
|
||||
lxvd2x vs5, o80, A0
|
||||
lxvd2x vs13, o80, A1
|
||||
lxvd2x vs21, o80, A2
|
||||
lxvd2x vs29, o80, A3
|
||||
|
||||
lxvd2x vs6, o96, A0
|
||||
lxvd2x vs14, o96, A1
|
||||
lxvd2x vs22, o96, A2
|
||||
lxvd2x vs30, o96, A3
|
||||
|
||||
lxvd2x vs7, o112, A0
|
||||
lxvd2x vs15, o112, A1
|
||||
lxvd2x vs23, o112, A2
|
||||
lxvd2x vs31, o112, A3
|
||||
lxvd2x vs4, o48, A0
|
||||
lxvd2x vs5, o48, A1
|
||||
lxvd2x vs6, o48, A2
|
||||
lxvd2x vs7, o48, A3
|
||||
|
||||
|
||||
xxpermdi vs32, vs0, vs8, 0
|
||||
xxpermdi vs33, vs16, vs24, 0
|
||||
xxpermdi vs34, vs0, vs8, 3
|
||||
xxpermdi vs35, vs16, vs24, 3
|
||||
xxpermdi vs40, vs0, vs1, 0
|
||||
xxpermdi vs41, vs2, vs3, 0
|
||||
xxpermdi vs42, vs0, vs1, 3
|
||||
xxpermdi vs43, vs2, vs3, 3
|
||||
|
||||
xxpermdi vs36, vs1, vs9, 0
|
||||
xxpermdi vs37, vs17, vs25, 0
|
||||
xxpermdi vs38, vs1, vs9, 3
|
||||
xxpermdi vs39, vs17, vs25, 3
|
||||
xxpermdi vs44, vs4, vs5, 0
|
||||
xxpermdi vs45, vs6, vs7, 0
|
||||
xxpermdi vs46, vs4, vs5, 3
|
||||
xxpermdi vs47, vs6, vs7, 3
|
||||
|
||||
xxpermdi vs40, vs2, vs10, 0
|
||||
xxpermdi vs41, vs18, vs26, 0
|
||||
xxpermdi vs42, vs2, vs10, 3
|
||||
xxpermdi vs43, vs18, vs26, 3
|
||||
lxvd2x vs0, o64, A0
|
||||
lxvd2x vs1, o64, A1
|
||||
lxvd2x vs2, o64, A2
|
||||
lxvd2x vs3, o64, A3
|
||||
|
||||
xxpermdi vs44, vs3, vs11, 0
|
||||
xxpermdi vs45, vs19, vs27, 0
|
||||
xxpermdi vs46, vs3, vs11, 3
|
||||
xxpermdi vs47, vs19, vs27, 3
|
||||
lxvd2x vs4, o80, A0
|
||||
lxvd2x vs5, o80, A1
|
||||
lxvd2x vs6, o80, A2
|
||||
lxvd2x vs7, o80, A3
|
||||
|
||||
xxpermdi vs48, vs4, vs12, 0
|
||||
xxpermdi vs49, vs20, vs28, 0
|
||||
xxpermdi vs50, vs4, vs12, 3
|
||||
xxpermdi vs51, vs20, vs28, 3
|
||||
|
||||
xxpermdi vs52, vs5, vs13, 0
|
||||
xxpermdi vs53, vs21, vs29, 0
|
||||
xxpermdi vs54, vs5, vs13, 3
|
||||
xxpermdi vs55, vs21, vs29, 3
|
||||
xxpermdi vs48, vs0, vs1, 0
|
||||
xxpermdi vs49, vs2, vs3, 0
|
||||
xxpermdi vs50, vs0, vs1, 3
|
||||
xxpermdi vs51, vs2, vs3, 3
|
||||
|
||||
xxpermdi vs8, vs4, vs5, 0
|
||||
xxpermdi vs9, vs6, vs7, 0
|
||||
xxpermdi vs10, vs4, vs5, 3
|
||||
xxpermdi vs11, vs6, vs7, 3
|
||||
|
||||
lxvd2x vs0, o96, A0
|
||||
lxvd2x vs1, o96, A1
|
||||
lxvd2x vs2, o96, A2
|
||||
lxvd2x vs3, o96, A3
|
||||
|
||||
|
||||
lxvd2x vs6, o112, A0
|
||||
lxvd2x vs7, o112, A1
|
||||
lxvd2x vs12, o112, A2
|
||||
lxvd2x vs13, o112, A3
|
||||
|
||||
|
||||
xxpermdi vs4, vs0, vs1, 0
|
||||
xxpermdi vs5, vs2, vs3, 0
|
||||
xxpermdi vs0, vs0, vs1, 3
|
||||
xxpermdi vs2, vs2, vs3, 3
|
||||
|
||||
|
||||
addi A0, A0, 128
|
||||
addi A1, A1, 128
|
||||
|
||||
xxpermdi vs56, vs6, vs14, 0
|
||||
xxpermdi vs57, vs22, vs30, 0
|
||||
xxpermdi vs58, vs6, vs14, 3
|
||||
xxpermdi vs59, vs22, vs30, 3
|
||||
xxpermdi vs1, vs6, vs7, 0
|
||||
xxpermdi vs3, vs12, vs13, 0
|
||||
xxpermdi vs6, vs6, vs7, 3
|
||||
xxpermdi vs12, vs12, vs13, 3
|
||||
|
||||
dcbt BO, PREB
|
||||
|
||||
addi A3, A3, 128
|
||||
addi A2, A2, 128
|
||||
|
||||
xxpermdi vs60, vs7, vs15, 0
|
||||
xxpermdi vs61, vs23, vs31, 0
|
||||
xxpermdi vs62, vs7, vs15, 3
|
||||
xxpermdi vs63, vs23, vs31, 3
|
||||
|
||||
dcbt BO, PREB
|
||||
|
||||
stxvd2x vs32, o0, BO
|
||||
stxvd2x vs33, o16, BO
|
||||
stxvd2x vs34, o32, BO
|
||||
@@ -157,22 +161,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
stxvd2x vs49, o16, BO
|
||||
stxvd2x vs50, o32, BO
|
||||
stxvd2x vs51, o48, BO
|
||||
stxvd2x vs52, o64, BO
|
||||
stxvd2x vs53, o80, BO
|
||||
stxvd2x vs54, o96, BO
|
||||
stxvd2x vs55, o112, BO
|
||||
stxvd2x vs8, o64, BO
|
||||
stxvd2x vs9, o80, BO
|
||||
stxvd2x vs10, o96, BO
|
||||
stxvd2x vs11, o112, BO
|
||||
addi BO, BO, 128
|
||||
|
||||
dcbt BO, PREB
|
||||
|
||||
stxvd2x vs56, o0, BO
|
||||
stxvd2x vs57, o16, BO
|
||||
stxvd2x vs58, o32, BO
|
||||
stxvd2x vs59, o48, BO
|
||||
stxvd2x vs60, o64, BO
|
||||
stxvd2x vs61, o80, BO
|
||||
stxvd2x vs62, o96, BO
|
||||
stxvd2x vs63, o112, BO
|
||||
stxvd2x vs4, o0, BO
|
||||
stxvd2x vs5, o16, BO
|
||||
stxvd2x vs0, o32, BO
|
||||
stxvd2x vs2, o48, BO
|
||||
stxvd2x vs1, o64, BO
|
||||
stxvd2x vs3, o80, BO
|
||||
stxvd2x vs6, o96, BO
|
||||
stxvd2x vs12, o112, BO
|
||||
addi BO, BO, 128
|
||||
|
||||
|
||||
@@ -199,39 +203,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
addi A1, A1, 64
|
||||
|
||||
|
||||
lxvd2x vs16, o0, A2
|
||||
lxvd2x vs17, o16, A2
|
||||
lxvd2x vs18, o32, A2
|
||||
lxvd2x vs19, o48, A2
|
||||
lxvd2x vs4, o0, A2
|
||||
lxvd2x vs5, o16, A2
|
||||
lxvd2x vs6, o32, A2
|
||||
lxvd2x vs7, o48, A2
|
||||
addi A2, A2, 64
|
||||
|
||||
|
||||
lxvd2x vs24, o0, A3
|
||||
lxvd2x vs25, o16, A3
|
||||
lxvd2x vs26, o32, A3
|
||||
lxvd2x vs27, o48, A3
|
||||
lxvd2x vs12, o0, A3
|
||||
lxvd2x vs13, o16, A3
|
||||
lxvd2x vs50, o32, A3
|
||||
lxvd2x vs51, o48, A3
|
||||
addi A3, A3, 64
|
||||
|
||||
|
||||
xxpermdi vs32, vs0, vs8, 0
|
||||
xxpermdi vs33, vs16, vs24, 0
|
||||
xxpermdi vs33, vs4, vs12, 0
|
||||
xxpermdi vs34, vs0, vs8, 3
|
||||
xxpermdi vs35, vs16, vs24, 3
|
||||
xxpermdi vs35, vs4, vs12, 3
|
||||
|
||||
xxpermdi vs36, vs1, vs9, 0
|
||||
xxpermdi vs37, vs17, vs25, 0
|
||||
xxpermdi vs37, vs5, vs13, 0
|
||||
xxpermdi vs38, vs1, vs9, 3
|
||||
xxpermdi vs39, vs17, vs25, 3
|
||||
xxpermdi vs39, vs5, vs13, 3
|
||||
|
||||
xxpermdi vs40, vs2, vs10, 0
|
||||
xxpermdi vs41, vs18, vs26, 0
|
||||
xxpermdi vs41, vs6, vs50, 0
|
||||
xxpermdi vs42, vs2, vs10, 3
|
||||
xxpermdi vs43, vs18, vs26, 3
|
||||
xxpermdi vs43, vs6, vs50, 3
|
||||
|
||||
xxpermdi vs44, vs3, vs11, 0
|
||||
xxpermdi vs45, vs19, vs27, 0
|
||||
xxpermdi vs45, vs7, vs51, 0
|
||||
xxpermdi vs46, vs3, vs11, 3
|
||||
xxpermdi vs47, vs19, vs27, 3
|
||||
xxpermdi vs47, vs7, vs51, 3
|
||||
|
||||
|
||||
stxvd2x vs32, o0, BO
|
||||
@@ -274,25 +278,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
addi A1, A1, 32
|
||||
|
||||
|
||||
lxvd2x vs16, o0, A2
|
||||
lxvd2x vs17, o16, A2
|
||||
lxvd2x vs10, o0, A2
|
||||
lxvd2x vs11, o16, A2
|
||||
addi A2, A2, 32
|
||||
|
||||
|
||||
lxvd2x vs24, o0, A3
|
||||
lxvd2x vs25, o16, A3
|
||||
lxvd2x vs12, o0, A3
|
||||
lxvd2x vs13, o16, A3
|
||||
addi A3, A3, 32
|
||||
|
||||
|
||||
xxpermdi vs32, vs0, vs8, 0
|
||||
xxpermdi vs33, vs16, vs24, 0
|
||||
xxpermdi vs33, vs10, vs12, 0
|
||||
xxpermdi vs34, vs0, vs8, 3
|
||||
xxpermdi vs35, vs16, vs24, 3
|
||||
xxpermdi vs35, vs10, vs12, 3
|
||||
|
||||
xxpermdi vs36, vs1, vs9, 0
|
||||
xxpermdi vs37, vs17, vs25, 0
|
||||
xxpermdi vs37, vs11, vs13, 0
|
||||
xxpermdi vs38, vs1, vs9, 3
|
||||
xxpermdi vs39, vs17, vs25, 3
|
||||
xxpermdi vs39, vs11, vs13, 3
|
||||
|
||||
|
||||
stxvd2x vs32, o0, BO
|
||||
@@ -323,18 +327,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
addi A1, A1, 16
|
||||
|
||||
|
||||
lxvd2x vs16, o0, A2
|
||||
lxvd2x vs9, o0, A2
|
||||
addi A2, A2, 16
|
||||
|
||||
|
||||
lxvd2x vs24, o0, A3
|
||||
lxvd2x vs10, o0, A3
|
||||
addi A3, A3, 16
|
||||
|
||||
|
||||
xxpermdi vs32, vs0, vs8, 0
|
||||
xxpermdi vs33, vs16, vs24, 0
|
||||
xxpermdi vs33, vs9, vs10, 0
|
||||
xxpermdi vs34, vs0, vs8, 3
|
||||
xxpermdi vs35, vs16, vs24, 3
|
||||
xxpermdi vs35, vs9, vs10, 3
|
||||
|
||||
|
||||
stxvd2x vs32, o0, BO
|
||||
@@ -361,16 +365,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
addi A1, A1, 8
|
||||
|
||||
|
||||
lxsdx vs16, o0, A2
|
||||
lxsdx vs9, o0, A2
|
||||
addi A2, A2, 8
|
||||
|
||||
|
||||
lxsdx vs24, o0, A3
|
||||
lxsdx vs10, o0, A3
|
||||
addi A3, A3, 8
|
||||
|
||||
|
||||
xxpermdi vs32, vs0, vs8, 0
|
||||
xxpermdi vs33, vs16, vs24, 0
|
||||
xxpermdi vs33, vs9, vs10, 0
|
||||
|
||||
|
||||
stxvd2x vs32, o0, BO
|
||||
@@ -404,8 +408,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
lxvd2x vs11, o48, A1
|
||||
lxvd2x vs12, o64, A1
|
||||
lxvd2x vs13, o80, A1
|
||||
lxvd2x vs14, o96, A1
|
||||
lxvd2x vs15, o112, A1
|
||||
lxvd2x vs48, o96, A1
|
||||
lxvd2x vs49, o112, A1
|
||||
addi A1, A1, 128
|
||||
|
||||
|
||||
@@ -427,11 +431,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
xxpermdi vs42, vs5, vs13, 0
|
||||
xxpermdi vs43, vs5, vs13, 3
|
||||
|
||||
xxpermdi vs44, vs6, vs14, 0
|
||||
xxpermdi vs45, vs6, vs14, 3
|
||||
xxpermdi vs44, vs6, vs48, 0
|
||||
xxpermdi vs45, vs6, vs48, 3
|
||||
|
||||
xxpermdi vs46, vs7, vs15, 0
|
||||
xxpermdi vs47, vs7, vs15, 3
|
||||
xxpermdi vs46, vs7, vs49, 0
|
||||
xxpermdi vs47, vs7, vs49, 3
|
||||
|
||||
|
||||
stxvd2x vs32, o0, BO
|
||||
|
||||
@@ -109,61 +109,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#include "dgemm_tcopy_macros_16_power8.S"
|
||||
|
||||
#define STACKSIZE 384
|
||||
#define STACKSIZE 576
|
||||
#define STACKSIZE 144
|
||||
|
||||
|
||||
|
||||
PROLOGUE
|
||||
PROFCODE
|
||||
|
||||
addi SP, SP, -STACKSIZE
|
||||
//addi SP, SP, -208
|
||||
|
||||
li r0, 0
|
||||
|
||||
std r31, 144(SP)
|
||||
std r30, 152(SP)
|
||||
std r29, 160(SP)
|
||||
std r28, 168(SP)
|
||||
std r27, 176(SP)
|
||||
std r26, 184(SP)
|
||||
std r25, 192(SP)
|
||||
std r24, 200(SP)
|
||||
std r23, 208(SP)
|
||||
std r22, 216(SP)
|
||||
std r21, 224(SP)
|
||||
std r20, 232(SP)
|
||||
std r19, 240(SP)
|
||||
std r18, 248(SP)
|
||||
std r17, 256(SP)
|
||||
std r16, 264(SP)
|
||||
std r15, 272(SP)
|
||||
std r14, 280(SP)
|
||||
addi r11,SP,288
|
||||
stvx v20, r11,r0
|
||||
addi r11,r11,16
|
||||
stvx v21, r11,r0
|
||||
addi r11,r11,16
|
||||
stvx v22, r11,r0
|
||||
addi r11,r11,16
|
||||
stvx v23, r11,r0
|
||||
addi r11,r11,16
|
||||
stvx v24, r11,r0
|
||||
addi r11,r11,16
|
||||
stvx v25, r11,r0
|
||||
addi r11,r11,16
|
||||
stvx v26, r11,r0
|
||||
addi r11,r11,16
|
||||
stvx v27, r11,r0
|
||||
addi r11,r11,16
|
||||
stvx v28, r11,r0
|
||||
addi r11,r11,16
|
||||
stvx v29, r11,r0
|
||||
addi r11,r11,16
|
||||
stvx v30, r11,r0
|
||||
addi r11,r11,16
|
||||
stvx v31, r11,r0
|
||||
li r11,0
|
||||
std r14,0(SP)
|
||||
std r15,8(SP)
|
||||
std r16,16(SP)
|
||||
std r17,24(SP)
|
||||
std r18,32(SP)
|
||||
std r19,40(SP)
|
||||
std r20,48(SP)
|
||||
std r21,56(SP)
|
||||
std r22,64(SP)
|
||||
std r23,72(SP)
|
||||
std r24,80(SP)
|
||||
std r25,88(SP)
|
||||
std r26,96(SP)
|
||||
std r27,104(SP)
|
||||
std r28,112(SP)
|
||||
std r29,120(SP)
|
||||
std r30,128(SP)
|
||||
std r31,136(SP)
|
||||
|
||||
cmpwi cr0, M, 0
|
||||
ble- L999
|
||||
@@ -198,8 +172,7 @@ li r11,0
|
||||
add B2, B2, B
|
||||
add B1, B1, B
|
||||
|
||||
//li PREA, 384
|
||||
li PREA, 576
|
||||
li PREA, 384
|
||||
addi PREB, M16, 128
|
||||
|
||||
li o8, 8
|
||||
@@ -213,52 +186,27 @@ L999:
|
||||
|
||||
li r3, 0
|
||||
|
||||
ld r31, 144(SP)
|
||||
ld r30, 152(SP)
|
||||
ld r29, 160(SP)
|
||||
ld r28, 168(SP)
|
||||
ld r27, 176(SP)
|
||||
ld r26, 184(SP)
|
||||
ld r25, 192(SP)
|
||||
ld r24, 200(SP)
|
||||
ld r23, 208(SP)
|
||||
ld r22, 216(SP)
|
||||
ld r21, 224(SP)
|
||||
ld r20, 232(SP)
|
||||
ld r19, 240(SP)
|
||||
ld r18, 248(SP)
|
||||
ld r17, 256(SP)
|
||||
ld r16, 264(SP)
|
||||
ld r15, 272(SP)
|
||||
ld r14, 280(SP)
|
||||
addi r11,SP,288
|
||||
lvx v20, r11,r3
|
||||
addi r11,r11,16
|
||||
lvx v21, r11,r3
|
||||
addi r11,r11,16
|
||||
lvx v22, r11,r3
|
||||
addi r11,r11,16
|
||||
lvx v23, r11,r3
|
||||
addi r11,r11,16
|
||||
lvx v24, r11,r3
|
||||
addi r11,r11,16
|
||||
lvx v25, r11,r3
|
||||
addi r11,r11,16
|
||||
lvx v26, r11,r3
|
||||
addi r11,r11,16
|
||||
lvx v27, r11,r3
|
||||
addi r11,r11,16
|
||||
lvx v28, r11,r3
|
||||
addi r11,r11,16
|
||||
lvx v29, r11,r3
|
||||
addi r11,r11,16
|
||||
lvx v30, r11,r3
|
||||
addi r11,r11,16
|
||||
lvx v31, r11,r3
|
||||
li r11,0
|
||||
ld r14,0(SP)
|
||||
ld r15,8(SP)
|
||||
ld r16,16(SP)
|
||||
ld r17,24(SP)
|
||||
ld r18,32(SP)
|
||||
ld r19,40(SP)
|
||||
ld r20,48(SP)
|
||||
ld r21,56(SP)
|
||||
ld r22,64(SP)
|
||||
ld r23,72(SP)
|
||||
ld r24,80(SP)
|
||||
ld r25,88(SP)
|
||||
ld r26,96(SP)
|
||||
ld r27,104(SP)
|
||||
ld r28,112(SP)
|
||||
ld r29,120(SP)
|
||||
ld r30,128(SP)
|
||||
ld r31,136(SP)
|
||||
|
||||
addi SP, SP, STACKSIZE
|
||||
//addi SP, SP, 208
|
||||
|
||||
blr
|
||||
EPILOGUE
|
||||
|
||||
|
||||
@@ -58,10 +58,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
lxvd2x vs51, o48, A2
|
||||
addi A2, A2, 64
|
||||
|
||||
lxvd2x vs56, o0, A3
|
||||
lxvd2x vs57, o16, A3
|
||||
lxvd2x vs58, o32, A3
|
||||
lxvd2x vs59, o48, A3
|
||||
lxvd2x vs4, o0, A3
|
||||
lxvd2x vs5, o16, A3
|
||||
lxvd2x vs6, o32, A3
|
||||
lxvd2x vs7, o48, A3
|
||||
addi A3, A3, 64
|
||||
|
||||
lxvd2x vs36, o0, A0
|
||||
@@ -76,16 +76,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
lxvd2x vs47, o48, A1
|
||||
addi A1, A1, 64
|
||||
|
||||
lxvd2x vs52, o0, A2
|
||||
lxvd2x vs53, o16, A2
|
||||
lxvd2x vs54, o32, A2
|
||||
lxvd2x vs55, o48, A2
|
||||
lxvd2x vs12, o0, A2
|
||||
lxvd2x vs13, o16, A2
|
||||
lxvd2x vs2, o32, A2
|
||||
lxvd2x vs3, o48, A2
|
||||
addi A2, A2, 64
|
||||
|
||||
lxvd2x vs60, o0, A3
|
||||
lxvd2x vs61, o16, A3
|
||||
lxvd2x vs62, o32, A3
|
||||
lxvd2x vs63, o48, A3
|
||||
lxvd2x vs8, o0, A3
|
||||
lxvd2x vs9, o16, A3
|
||||
lxvd2x vs10, o32, A3
|
||||
lxvd2x vs11, o48, A3
|
||||
addi A3, A3, 64
|
||||
|
||||
mr T1, BO
|
||||
@@ -122,23 +122,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
stxvd2x vs51, o48, T1
|
||||
addi T1, T1, 64
|
||||
|
||||
stxvd2x vs52, o0, T1
|
||||
stxvd2x vs53, o16, T1
|
||||
stxvd2x vs54, o32, T1
|
||||
stxvd2x vs55, o48, T1
|
||||
stxvd2x vs12, o0, T1
|
||||
stxvd2x vs13, o16, T1
|
||||
stxvd2x vs2, o32, T1
|
||||
stxvd2x vs3, o48, T1
|
||||
|
||||
addi T1, T1, 64
|
||||
|
||||
stxvd2x vs56, o0, T1
|
||||
stxvd2x vs57, o16, T1
|
||||
stxvd2x vs58, o32, T1
|
||||
stxvd2x vs59, o48, T1
|
||||
stxvd2x vs4, o0, T1
|
||||
stxvd2x vs5, o16, T1
|
||||
stxvd2x vs6, o32, T1
|
||||
stxvd2x vs7, o48, T1
|
||||
addi T1, T1, 64
|
||||
|
||||
stxvd2x vs60, o0, T1
|
||||
stxvd2x vs61, o16, T1
|
||||
stxvd2x vs62, o32, T1
|
||||
stxvd2x vs63, o48, T1
|
||||
stxvd2x vs8, o0, T1
|
||||
stxvd2x vs9, o16, T1
|
||||
stxvd2x vs10, o32, T1
|
||||
stxvd2x vs11, o48, T1
|
||||
|
||||
.endm
|
||||
|
||||
|
||||
886
kernel/power/dgemv_t.c
Normal file
886
kernel/power/dgemv_t.c
Normal file
@@ -0,0 +1,886 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2018, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#define NBMAX 8192
|
||||
#define PREFETCH 1
|
||||
#include <altivec.h>
|
||||
|
||||
#define HAVE_KERNEL4x8_ASM 1
|
||||
|
||||
|
||||
#if defined(HAVE_KERNEL4x8_ASM)
|
||||
/*
 * Inline-assembly (VSX) kernel: processes 8 consecutive columns of the
 * matrix at `ap` (column stride `lda` elements) against the vector `x`
 * and updates y[0..7] in place:
 *
 *     y[c] += alpha * sum_{r < n} ap[c * lda + r] * x[r],   c = 0..7
 *
 * n     : number of rows to process. The code consumes 4 doubles per step
 *         and loads the first 4 unconditionally, so it assumes n is a
 *         positive multiple of 4 (the caller in this file passes such a value).
 * lda   : column stride of `ap`, in elements.
 * ap    : first element of the first of the 8 columns.
 * x     : contiguous vector of n doubles.
 * y     : 8 contiguous doubles, read and written.
 * alpha : scalar applied to each column dot product.
 *
 * Register use inside the asm: vs32/vs33 hold the current x values,
 * vs36..vs51 hold column data (even = first pair, odd = second pair of each
 * 4-element step), and vs34, vs35, vs4..vs9 are the eight per-column
 * accumulators.
 */
static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, double *y, double alpha) {

    FLOAT *a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7;
    BLASLONG off2;
    BLASLONG tempR;
    __asm__(

            /* Column pointers a1..a7 and zeroed accumulators. */
            "sldi %[temp],%[off], 4 \n\t" // lda * sizeof (double) *2
            "sldi %[off], %[off], 3 \n\t" // lda * sizeof (double)
            "xxlxor 34,34,34 \n\t"
            "xxlxor 35,34,34 \n\t"
            "add %[a2], %[a0], %[temp] \n\t"
            "add %[a1], %[a0], %[off] \n\t"
            "xxlxor 4,34,34 \n\t"
            "xxlxor 5,34,34 \n\t"
            "xxlxor 6,34,34 \n\t"
            "xxlxor 7,34,34 \n\t"
            "add %[a3], %[a2], %[off] \n\t"
            "add %[a4], %[a2], %[temp] \n\t"

            "xxlxor 8,34,34 \n\t"
            "xxlxor 9,34,34 \n\t"
            "add %[a5], %[a3], %[temp] \n\t"
            "li %[off],0 \n\t"
            "li %[off2],16 \n\t"

            "add %[a6], %[a4], %[temp] \n\t"
            "add %[a7], %[a5], %[temp] \n\t"

            /* Preload the first 4 elements of x and of every column. */
            "lxvd2x 32, %[x], %[off] \n\t"
            "lxvd2x 36, %[a0], %[off] \n\t"
            "lxvd2x 38, %[a1], %[off] \n\t"
            "lxvd2x 40, %[a2], %[off] \n\t"
            "lxvd2x 42, %[a3], %[off] \n\t"
            "lxvd2x 44, %[a4], %[off] \n\t"
            "lxvd2x 46, %[a5], %[off] \n\t"
            "lxvd2x 48, %[a6], %[off] \n\t"
            "lxvd2x 50, %[a7], %[off] \n\t"
            "lxvd2x 33, %[x], %[off2] \n\t"
            "lxvd2x 37, %[a0], %[off2] \n\t"
            "lxvd2x 39, %[a1], %[off2] \n\t"
            "lxvd2x 41, %[a2], %[off2] \n\t"
            "lxvd2x 43, %[a3], %[off2] \n\t"
            "lxvd2x 45, %[a4], %[off2] \n\t"
            "lxvd2x 47, %[a5], %[off2] \n\t"
            "lxvd2x 49, %[a6], %[off2] \n\t"
            "lxvd2x 51, %[a7], %[off2] \n\t"
#if defined(PREFETCH)
            /* temp is reused from here on as the prefetch byte offset. */
            "li %[temp],896 \n\t"
#endif
            "addic. %[n],%[n],-4 \n\t"

            "li %[off],32 \n\t"

            "ble- 2f \n\t"

            //--------------------------------------------------
            /* Main loop, unrolled 4x: each step multiplies the 4 elements
               already in registers and loads the next 4; n is decremented
               by 4 per step and the loop exits to 2: once it reaches 0. */
            ".p2align 5 \n\t"
            "1: \n\t"
            /* step 1 */
            "xvmaddadp 34,36,32 \n\t"
            "xvmaddadp 35,38,32 \n\t"
            "addi %[off2], %[off2],32 \n\t"
            "lxvd2x 36, %[a0], %[off] \n\t"
            "lxvd2x 38, %[a1], %[off] \n\t"
            "xvmaddadp 4,40,32 \n\t"
            "xvmaddadp 5,42,32 \n\t"
            "lxvd2x 40, %[a2], %[off] \n\t"
            "lxvd2x 42, %[a3], %[off] \n\t"
            "xvmaddadp 6,44,32 \n\t"
            "xvmaddadp 7,46,32 \n\t"
            "lxvd2x 44, %[a4], %[off] \n\t"
            "lxvd2x 46, %[a5], %[off] \n\t"
            "xvmaddadp 8,48,32 \n\t"
            "xvmaddadp 9,50,32 \n\t"
            "lxvd2x 48, %[a6], %[off] \n\t"
            "lxvd2x 50, %[a7], %[off] \n\t"
            "lxvd2x 32, %[x], %[off] \n\t"

            "xvmaddadp 34,37,33 \n\t"
            "xvmaddadp 35,39,33 \n\t"
            "lxvd2x 37, %[a0], %[off2] \n\t"
            "lxvd2x 39, %[a1], %[off2] \n\t"
            "xvmaddadp 4,41,33 \n\t"
            "xvmaddadp 5,43,33 \n\t"
            "addi %[off], %[off],32 \n\t"
            "lxvd2x 41, %[a2], %[off2] \n\t"
            "lxvd2x 43, %[a3], %[off2] \n\t"
            "xvmaddadp 6,45,33 \n\t"
            "xvmaddadp 7,47,33 \n\t"
            "lxvd2x 45, %[a4], %[off2] \n\t"
            "lxvd2x 47, %[a5], %[off2] \n\t"
            "xvmaddadp 8,49,33 \n\t"
            "xvmaddadp 9,51,33 \n\t"

            "addic. %[n],%[n],-4 \n\t"
            "lxvd2x 49, %[a6], %[off2] \n\t"
            "lxvd2x 51, %[a7], %[off2] \n\t"
            "lxvd2x 33, %[x], %[off2] \n\t"
            "ble- 2f \n\t"
            /* step 2 */
            "xvmaddadp 34,36,32 \n\t"
            "xvmaddadp 35,38,32 \n\t"
            "addi %[off2], %[off2],32 \n\t"
            "lxvd2x 36, %[a0], %[off] \n\t"
            "lxvd2x 38, %[a1], %[off] \n\t"
            "xvmaddadp 4,40,32 \n\t"
            "xvmaddadp 5,42,32 \n\t"
            "lxvd2x 40, %[a2], %[off] \n\t"
            "lxvd2x 42, %[a3], %[off] \n\t"
            "xvmaddadp 6,44,32 \n\t"
            "xvmaddadp 7,46,32 \n\t"
            "lxvd2x 44, %[a4], %[off] \n\t"
            "lxvd2x 46, %[a5], %[off] \n\t"
            "xvmaddadp 8,48,32 \n\t"
            "xvmaddadp 9,50,32 \n\t"
            "lxvd2x 48, %[a6], %[off] \n\t"
            "lxvd2x 50, %[a7], %[off] \n\t"
            "lxvd2x 32, %[x], %[off] \n\t"

            "xvmaddadp 34,37,33 \n\t"
            "xvmaddadp 35,39,33 \n\t"
            "lxvd2x 37, %[a0], %[off2] \n\t"
            "lxvd2x 39, %[a1], %[off2] \n\t"
            "xvmaddadp 4,41,33 \n\t"
            "xvmaddadp 5,43,33 \n\t"
            "addi %[off], %[off],32 \n\t"
            "lxvd2x 41, %[a2], %[off2] \n\t"
            "lxvd2x 43, %[a3], %[off2] \n\t"
            "xvmaddadp 6,45,33 \n\t"
            "xvmaddadp 7,47,33 \n\t"
            "lxvd2x 45, %[a4], %[off2] \n\t"
            "lxvd2x 47, %[a5], %[off2] \n\t"
            "xvmaddadp 8,49,33 \n\t"
            "xvmaddadp 9,51,33 \n\t"

            "addic. %[n],%[n],-4 \n\t"
            "lxvd2x 49, %[a6], %[off2] \n\t"
            "lxvd2x 51, %[a7], %[off2] \n\t"
            "lxvd2x 33, %[x], %[off2] \n\t"
            "ble- 2f \n\t"
            /* step 3 (advances the prefetch offset by 128 bytes per loop pass) */
            "xvmaddadp 34,36,32 \n\t"
            "xvmaddadp 35,38,32 \n\t"
#if defined(PREFETCH)
            "addi %[temp],%[temp],128 \n\t"
#endif
            "addi %[off2], %[off2],32 \n\t"
            "lxvd2x 36, %[a0], %[off] \n\t"
            "lxvd2x 38, %[a1], %[off] \n\t"
            "xvmaddadp 4,40,32 \n\t"
            "xvmaddadp 5,42,32 \n\t"
            "lxvd2x 40, %[a2], %[off] \n\t"
            "lxvd2x 42, %[a3], %[off] \n\t"
            "xvmaddadp 6,44,32 \n\t"
            "xvmaddadp 7,46,32 \n\t"
            "lxvd2x 44, %[a4], %[off] \n\t"
            "lxvd2x 46, %[a5], %[off] \n\t"
            "xvmaddadp 8,48,32 \n\t"
            "xvmaddadp 9,50,32 \n\t"
            "lxvd2x 48, %[a6], %[off] \n\t"
            "lxvd2x 50, %[a7], %[off] \n\t"
#if defined(PREFETCH)
            "dcbt %[temp],%[a0] \n\t"
#endif
            "lxvd2x 32, %[x], %[off] \n\t"

            "xvmaddadp 34,37,33 \n\t"
            "xvmaddadp 35,39,33 \n\t"
            "lxvd2x 37, %[a0], %[off2] \n\t"
            "lxvd2x 39, %[a1], %[off2] \n\t"
            "xvmaddadp 4,41,33 \n\t"
            "xvmaddadp 5,43,33 \n\t"
#if defined(PREFETCH)
            "dcbt %[temp],%[a1] \n\t"
#endif
            "lxvd2x 41, %[a2], %[off2] \n\t"
            "addi %[off], %[off],32 \n\t"
            "lxvd2x 43, %[a3], %[off2] \n\t"
            "xvmaddadp 6,45,33 \n\t"
            "xvmaddadp 7,47,33 \n\t"
            "lxvd2x 45, %[a4], %[off2] \n\t"
            "lxvd2x 47, %[a5], %[off2] \n\t"
            "xvmaddadp 8,49,33 \n\t"
            "xvmaddadp 9,51,33 \n\t"
#if defined(PREFETCH)
            "dcbt %[temp],%[a3] \n\t"
#endif
            "lxvd2x 49, %[a6], %[off2] \n\t"
            "lxvd2x 51, %[a7], %[off2] \n\t"

            "lxvd2x 33, %[x], %[off2] \n\t"
            "addic. %[n],%[n],-4 \n\t"
            "ble- 2f \n\t"

            /* step 4 */
            "addi %[off2], %[off2],32 \n\t"
#if defined(PREFETCH)
            "dcbt %[temp],%[a2] \n\t"
#endif
            "xvmaddadp 34,36,32 \n\t"
            "xvmaddadp 35,38,32 \n\t"
            "lxvd2x 36, %[a0], %[off] \n\t"
            "lxvd2x 38, %[a1], %[off] \n\t"
            "xvmaddadp 4,40,32 \n\t"
            "xvmaddadp 5,42,32 \n\t"
            "lxvd2x 40, %[a2], %[off] \n\t"
            "lxvd2x 42, %[a3], %[off] \n\t"
#if defined(PREFETCH)
            "dcbt %[temp],%[a4] \n\t"
#endif
            "xvmaddadp 6,44,32 \n\t"
            "xvmaddadp 7,46,32 \n\t"
            "lxvd2x 44, %[a4], %[off] \n\t"
            "lxvd2x 46, %[a5], %[off] \n\t"
            "xvmaddadp 8,48,32 \n\t"
            "xvmaddadp 9,50,32 \n\t"
            "lxvd2x 48, %[a6], %[off] \n\t"
            "lxvd2x 50, %[a7], %[off] \n\t"
            "lxvd2x 32, %[x], %[off] \n\t"

#if defined(PREFETCH)
            "dcbt %[temp],%[a5] \n\t"
#endif
            "xvmaddadp 34,37,33 \n\t"
            "xvmaddadp 35,39,33 \n\t"
            "lxvd2x 37, %[a0], %[off2] \n\t"
            "lxvd2x 39, %[a1], %[off2] \n\t"
            "xvmaddadp 4,41,33 \n\t"
            "xvmaddadp 5,43,33 \n\t"
            "addi %[off], %[off],32 \n\t"
            "lxvd2x 41, %[a2], %[off2] \n\t"
            "lxvd2x 43, %[a3], %[off2] \n\t"
#if defined(PREFETCH)
            "dcbt %[temp],%[a6] \n\t"
#endif
            "xvmaddadp 6,45,33 \n\t"
            "xvmaddadp 7,47,33 \n\t"
            "lxvd2x 45, %[a4], %[off2] \n\t"
            "lxvd2x 47, %[a5], %[off2] \n\t"
            "xvmaddadp 8,49,33 \n\t"
            "xvmaddadp 9,51,33 \n\t"

#if defined(PREFETCH)
            "dcbt %[temp],%[a7] \n\t"
#endif
            "lxvd2x 49, %[a6], %[off2] \n\t"
            "addic. %[n],%[n],-4 \n\t"
            "lxvd2x 51, %[a7], %[off2] \n\t"
            "lxvd2x 33, %[x], %[off2] \n\t"
#if defined(PREFETCH)
            "dcbt %[temp],%[x] \n\t"
#endif
            "bgt+ 1b \n\t"
            ".p2align 5 \n\t"
            "2: \n\t"
            //--------------------------------------------

            /* Drain: multiply the last 4 elements still held in registers
               and splat alpha into vs36. */
            "xvmaddadp 34,36,32 \n\t"
            "xvmaddadp 35,38,32 \n\t"
            "xvmaddadp 4,40,32 \n\t"
            "xvmaddadp 5,42,32 \n\t"
            "xvmaddadp 6,44,32 \n\t"
            "xvmaddadp 7,46,32 \n\t"
            "xvmaddadp 8,48,32 \n\t"
            "xvmaddadp 9,50,32 \n\t"
            "xxspltd 36, %x[alpha], 0 \n\t"
            "xvmaddadp 34,37,33 \n\t"
            "xvmaddadp 35,39,33 \n\t"
            "xvmaddadp 4,41,33 \n\t"
            "xvmaddadp 5,43,33 \n\t"
            "xvmaddadp 6,45,33 \n\t"
            "xvmaddadp 7,47,33 \n\t"
            "xvmaddadp 8,49,33 \n\t"
            "xvmaddadp 9,51,33 \n\t"

            /* Load the current y[0..7]. */
            "lxvd2x 37, 0, %[y] \n\t"
            "li %[off2],16 \n\t"
            "lxvd2x 38, %[off2], %[y] \n\t"

            "li %[off2],32 \n\t"
            "lxvd2x 39, %[off2], %[y] \n\t"
            "li %[off2],48 \n\t"
            "lxvd2x 40, %[off2], %[y] \n\t"

            /* Horizontal reduction: merge the low and high halves of each
               accumulator pair and add them, giving two column sums per
               vector; then y += alpha * sums and store back. */
            "xxmrgld 42,34,35 \n\t"
            "xxmrghd 43,34,35 \n\t"

            "xxmrgld 44,4,5 \n\t"
            "xxmrghd 45,4,5 \n\t"

            "xvadddp 42,42,43 \n\t"

            "xxmrgld 46,6,7 \n\t"
            "xxmrghd 47,6,7 \n\t"

            "xvadddp 44,44,45 \n\t"

            "xxmrgld 48,8,9 \n\t"
            "xxmrghd 49,8,9 \n\t"

            "xvadddp 46,46,47 \n\t"

            "xvmaddadp 37,42,36 \n\t"
            "xvmaddadp 38,44,36 \n\t"

            "xvadddp 48,48,49 \n\t"

            "xvmaddadp 39,46,36 \n\t"

            "stxvd2x 37, 0, %[y] \n\t"
            "li %[off],16 \n\t"
            "stxvd2x 38, %[off], %[y] \n\t"
            "xvmaddadp 40,48,36 \n\t"
            "li %[off],32 \n\t"
            "stxvd2x 39, %[off], %[y] \n\t"
            "stxvd2x 40, %[off2], %[y] \n\t" /* off2 still holds 48 here */

            /* y[0..7] is read AND written by the asm, so the memory operand
               must be a modifiable (non-const) lvalue. */
            : [memy] "+m" (*(double (*)[8])y),
            [n] "+&r" (n),
            [a0] "=b" (a0),
            [a1] "=&b" (a1),
            [a2] "=&b" (a2),
            [a3] "=&b" (a3),
            [a4] "=&b" (a4),
            [a5] "=&b" (a5),
            [a6] "=&b" (a6),
            [a7] "=&b" (a7),
            [off] "+&b" (lda),
            [off2]"=&b" (off2),
            [temp] "=&b" (tempR)
            : [memx] "m" (*(const double (*)[n])x),
            [mem_ap] "m" (*(const double (*)[]) ap),
            [alpha] "d" (alpha),
            "[a0]" (ap),
            [x] "b" (x),
            [y] "b" (y)
            : "cc","vs4","vs5","vs6","vs7","vs8","vs9" ,"vs32","vs33","vs34","vs35", "vs36", "vs37", "vs38", "vs39",
            "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49", "vs50", "vs51"
            );
    return;
}
|
||||
#else
|
||||
/*
 * Portable (vector-extension) variant of the 8-column kernel:
 *
 *     y[c] += alpha * sum_r ap[c * lda + r] * x[r],   c = 0..7
 *
 * Data is consumed as 2-double vectors, two vectors (4 doubles) per inner
 * iteration. With PREFETCH defined the rows are walked in chunks of 64
 * vectors and a prefetch 64 vectors ahead is issued at the start of each
 * chunk for x and for every column.
 */
static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) {
    __vector double *xv = (__vector double *) x;
    __vector double *c0 = (__vector double *) ap;
    __vector double *c1 = (__vector double *) (ap + lda);
    __vector double *c2 = (__vector double *) (ap + 2 * lda);
    __vector double *c3 = (__vector double *) (ap + 3 * lda);
    __vector double *c4 = (__vector double *) (ap + 4 * lda);
    __vector double *c5 = (__vector double *) (ap + 5 * lda);
    __vector double *c6 = (__vector double *) (ap + 6 * lda);
    __vector double *c7 = (__vector double *) (ap + 7 * lda);

    /* One two-lane accumulator per column. */
    register __vector double s0 = {0, 0};
    register __vector double s1 = {0, 0};
    register __vector double s2 = {0, 0};
    register __vector double s3 = {0, 0};
    register __vector double s4 = {0, 0};
    register __vector double s5 = {0, 0};
    register __vector double s6 = {0, 0};
    register __vector double s7 = {0, 0};

    BLASLONG remaining = n / 2; /* vectors still to process */
    BLASLONG span;              /* vectors handled in the current pass */
    BLASLONG p;

    while (remaining > 0) {
#if defined(PREFETCH)
        span = remaining > 64 ? 64 : remaining;

        __builtin_prefetch(xv + 64);
        __builtin_prefetch(c0 + 64);
        __builtin_prefetch(c1 + 64);
        __builtin_prefetch(c2 + 64);
        __builtin_prefetch(c3 + 64);
        __builtin_prefetch(c4 + 64);
        __builtin_prefetch(c5 + 64);
        __builtin_prefetch(c6 + 64);
        __builtin_prefetch(c7 + 64);
#else
        span = remaining; /* no chunking: a single pass over everything */
#endif

        for (p = 0; p < span; p += 2) {
            s0 += xv[p] * c0[p];
            s1 += xv[p] * c1[p];
            s2 += xv[p] * c2[p];
            s3 += xv[p] * c3[p];
            s4 += xv[p] * c4[p];
            s5 += xv[p] * c5[p];
            s6 += xv[p] * c6[p];
            s7 += xv[p] * c7[p];

            s0 += xv[p + 1] * c0[p + 1];
            s1 += xv[p + 1] * c1[p + 1];
            s2 += xv[p + 1] * c2[p + 1];
            s3 += xv[p + 1] * c3[p + 1];
            s4 += xv[p + 1] * c4[p + 1];
            s5 += xv[p + 1] * c5[p + 1];
            s6 += xv[p + 1] * c6[p + 1];
            s7 += xv[p + 1] * c7[p + 1];
        }

#if defined(PREFETCH)
        /* Step every stream to the next 64-vector chunk. */
        xv += 64;
        c0 += 64;
        c1 += 64;
        c2 += 64;
        c3 += 64;
        c4 += 64;
        c5 += 64;
        c6 += 64;
        c7 += 64;
        remaining -= 64;
#else
        remaining = 0;
#endif
    }

    /* Fold the two lanes of each accumulator and apply alpha. */
    y[0] += alpha * (s0[0] + s0[1]);
    y[1] += alpha * (s1[0] + s1[1]);
    y[2] += alpha * (s2[0] + s2[1]);
    y[3] += alpha * (s3[0] + s3[1]);
    y[4] += alpha * (s4[0] + s4[1]);
    y[5] += alpha * (s5[0] + s5[1]);
    y[6] += alpha * (s6[0] + s6[1]);
    y[7] += alpha * (s7[0] + s7[1]);
}
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
/*
 * 4-column kernel:
 *
 *     y[c] += alpha * sum_r ap[c * lda + r] * x[r],   c = 0..3
 *
 * Each iteration handles two 2-double vectors per column; the first and
 * second vector of every pair feed separate accumulators that are merged
 * after the loop.
 */
static void dgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) {
    __vector double *xv = (__vector double *) x;
    __vector double *c0 = (__vector double *) ap;
    __vector double *c1 = (__vector double *) (ap + lda);
    __vector double *c2 = (__vector double *) (ap + 2 * lda);
    __vector double *c3 = (__vector double *) (ap + 3 * lda);

    /* first* accumulate vector p, second* accumulate vector p + 1 */
    register __vector double first0 = {0, 0};
    register __vector double first1 = {0, 0};
    register __vector double first2 = {0, 0};
    register __vector double first3 = {0, 0};
    register __vector double second0 = {0, 0};
    register __vector double second1 = {0, 0};
    register __vector double second2 = {0, 0};
    register __vector double second3 = {0, 0};

    const BLASLONG nvec = n / 2;
    BLASLONG p = 0;

    while (p < nvec) {
        first0 += xv[p] * c0[p];
        first1 += xv[p] * c1[p];
        first2 += xv[p] * c2[p];
        first3 += xv[p] * c3[p];
        second0 += xv[p + 1] * c0[p + 1];
        second1 += xv[p + 1] * c1[p + 1];
        second2 += xv[p + 1] * c2[p + 1];
        second3 += xv[p + 1] * c3[p + 1];
        p += 2;
    }

    first0 += second0;
    first1 += second1;
    first2 += second2;
    first3 += second3;

    /* Fold the two lanes of each accumulator and apply alpha. */
    y[0] += alpha * (first0[0] + first0[1]);
    y[1] += alpha * (first1[0] + first1[1]);
    y[2] += alpha * (first2[0] + first2[1]);
    y[3] += alpha * (first3[0] + first3[1]);
}
|
||||
|
||||
|
||||
/*
 * 2-column kernel: adds alpha * (column . x) to y[0] for the column at `ap`
 * and to y[inc_y] for the column at `ap + lda`. Rows are consumed as pairs
 * of 2-double vectors.
 */
static void dgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha, BLASLONG inc_y) {
    __vector double *xv = (__vector double *) x;
    __vector double *c0 = (__vector double *) ap;
    __vector double *c1 = (__vector double *) (ap + lda);
    __vector double sum0 = {0, 0};
    __vector double sum1 = {0, 0};
    const BLASLONG nvec = n / 2;
    BLASLONG p = 0;

    while (p < nvec) {
        sum0 += xv[p] * c0[p] + xv[p + 1] * c0[p + 1];
        sum1 += xv[p] * c1[p] + xv[p + 1] * c1[p + 1];
        p += 2;
    }

    /* Fold the two lanes of each accumulator; the outputs are inc_y apart. */
    y[0] += alpha * (sum0[0] + sum0[1]);
    y[inc_y] += alpha * (sum1[0] + sum1[1]);
}
|
||||
|
||||
/*
 * Single-column kernel: *y += alpha * (ap . x), with the data consumed as
 * pairs of 2-double vectors.
 */
static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) {
    __vector double *col = (__vector double *) ap;
    __vector double *xv = (__vector double *) x;
    __vector double sum = {0, 0};
    const BLASLONG nvec = n / 2;
    BLASLONG p = 0;

    while (p < nvec) {
        sum += xv[p] * col[p] + xv[p + 1] * col[p + 1];
        p += 2;
    }

    /* Fold the two lanes and apply alpha. */
    *y += alpha * (sum[0] + sum[1]);
}
|
||||
|
||||
/*
 * Gather n elements of `src`, spaced inc_src elements apart, into the
 * contiguous array `dest`.
 */
static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) {
    BLASLONG k;

    for (k = 0; k < n; k++) {
        dest[k] = src[k * inc_src];
    }
}
|
||||
|
||||
/*
 * Handle the last `rows` (1..3) matrix rows that the 4-row vector kernels
 * cannot take: for every column j,
 *
 *     y[j * inc_y] += sum_{k < rows} a[j * lda + k] * (alpha * x[k * inc_x])
 *
 * The products are summed left to right starting from k = 0.
 */
static void dgemv_t_tail_rows(BLASLONG rows, BLASLONG n, FLOAT alpha, FLOAT *a, BLASLONG lda,
                              FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {
    FLOAT xs[3];
    BLASLONG j, k;

    /* alpha is folded into x once instead of once per column. */
    for (k = 0; k < rows; k++) {
        xs[k] = x[k * inc_x] * alpha;
    }

    for (j = 0; j < n; j++) {
        const FLOAT *col = a + j * lda;
        FLOAT dot = col[0] * xs[0];

        for (k = 1; k < rows; k++) {
            dot += col[k] * xs[k];
        }
        y[j * inc_y] += dot;
    }
}

/*
 * Transposed matrix-vector update: y += alpha * A^T * x, where A is stored
 * column by column with `lda` elements between consecutive columns, has m
 * rows and n columns, x has m elements (stride inc_x) and y has n elements
 * (stride inc_y).
 *
 * Rows are processed in blocks of at most NBMAX (always a multiple of 4);
 * within a block the columns are handled 8, then 4, 2 and 1 at a time by the
 * dgemv_kernel_4x* helpers. The final m & 3 rows are handled by
 * dgemv_t_tail_rows().
 *
 * dummy1 : unused.
 * buffer : scratch space used to pack a row block of x contiguously when
 *          inc_x != 1; it must hold at least min(m, NBMAX) elements.
 *
 * Returns 0 in all cases (including m < 1 or n < 1, where nothing is done).
 */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) {
    BLASLONG i;
    BLASLONG k;
    FLOAT *a_ptr;
    FLOAT *x_ptr;
    FLOAT *y_ptr;

    BLASLONG n1;
    BLASLONG m1;
    BLASLONG m2;
    BLASLONG m3;
    BLASLONG n2;

    FLOAT ybuffer[8], *xbuffer;

    if (m < 1) return (0);
    if (n < 1) return (0);

    xbuffer = buffer;

    n1 = n >> 3; /* number of 8-column groups */
    n2 = n & 7;  /* leftover columns */

    m3 = m & 3;                    /* leftover rows, handled after the blocks */
    m1 = m - m3;                   /* rows handled by the vector kernels */
    m2 = (m & (NBMAX - 1)) - m3;   /* size of the final, partial row block */

    BLASLONG NB = NBMAX;

    /* Full NBMAX blocks first; the last pass shrinks NB to m2 and thereby
       terminates the loop. */
    while (NB == NBMAX) {

        m1 -= NB;
        if (m1 < 0) {
            if (m2 == 0) break;
            NB = m2;
        }

        y_ptr = y;
        a_ptr = a;
        x_ptr = x;

        /* The kernels need contiguous x. */
        if (inc_x != 1)
            copy_x(NB, x_ptr, xbuffer, inc_x);
        else
            xbuffer = x_ptr;

        BLASLONG lda8 = lda << 3;

        if (inc_y == 1) {

            /* Contiguous y: the kernel accumulates straight into y. */
            for (i = 0; i < n1; i++) {
                dgemv_kernel_4x8(NB, lda, a_ptr, xbuffer, y_ptr, alpha);

                y_ptr += 8;
                a_ptr += lda8;
#if defined(PREFETCH)
                __builtin_prefetch(y_ptr+64);
#endif
            }

        } else {

            /* Strided y: accumulate into a zeroed scratch array, then scatter. */
            for (i = 0; i < n1; i++) {
                for (k = 0; k < 8; k++) {
                    ybuffer[k] = 0;
                }
                dgemv_kernel_4x8(NB, lda, a_ptr, xbuffer, ybuffer, alpha);

                for (k = 0; k < 8; k++) {
                    *y_ptr += ybuffer[k];
                    y_ptr += inc_y;
                }

                a_ptr += lda8;
            }

        }

        if (n2 & 4) {
            for (k = 0; k < 4; k++) {
                ybuffer[k] = 0;
            }
            dgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, ybuffer, alpha);

            a_ptr += lda<<2;

            for (k = 0; k < 4; k++) {
                *y_ptr += ybuffer[k];
                y_ptr += inc_y;
            }
        }

        if (n2 & 2) {
            dgemv_kernel_4x2(NB, lda, a_ptr, xbuffer, y_ptr, alpha, inc_y);
            a_ptr += lda << 1;
            y_ptr += 2 * inc_y;
        }

        if (n2 & 1) {
            dgemv_kernel_4x1(NB, a_ptr, xbuffer, y_ptr, alpha);
            a_ptr += lda;
            y_ptr += inc_y;
        }

        /* Move on to the next block of rows. */
        a += NB;
        x += NB * inc_x;
    }

    if (m3 == 0) return (0);

    /* a and x now point at the first of the 1..3 remaining rows. */
    dgemv_t_tail_rows(m3, n, alpha, a, lda, x, inc_x, y, inc_y);

    return (0);
}
|
||||
|
||||
383
kernel/power/idamax.c
Normal file
383
kernel/power/idamax.c
Normal file
@@ -0,0 +1,383 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2013-2018, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
#include "common.h"
|
||||
#include <math.h>
|
||||
#include <altivec.h>
|
||||
#if defined(DOUBLE)
|
||||
|
||||
#define ABS fabs
|
||||
|
||||
#else
|
||||
|
||||
#define ABS fabsf
|
||||
|
||||
#endif
|
||||
|
||||
/**
|
||||
* Find maximum index
|
||||
* Warning: requirements n>0 and n % 32 == 0
|
||||
* @param n
|
||||
* @param x pointer to the vector
|
||||
* @param maxf (out) receives the maximum absolute value found (output only)
|
||||
* @return index
|
||||
*/
|
||||
static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) {
/*
 * Hand-scheduled POWER VSX inline assembly. Register roles, as set up by
 * the prologue below:
 *   vs44-vs51 : the 16 values currently loaded (absolute values after
 *               xvabsdp); vs45/vs47 are also reused as scratch
 *   vs40-vs43 : constant lane-index vectors {1,0},{3,2},{5,4},{7,6}
 *   vs36 (v4) : constant {8,8}, added to the indices of the second 8 values
 *   vs37 (v5) : running index offset (index_count), advanced by 8 twice
 *               for every 16 values processed
 *   vs38 (v6) : per-lane index of the running maximum (vec_max_index)
 *   vs39      : per-lane running maximum absolute value (vec_max_value)
 * Each loop iteration (labels 1 and 2) consumes 32 values and subtracts 32
 * from n; the code after the loop handles the last 16 loaded values and
 * then reduces the two lanes to a single value and index. The value is
 * stored through ptr_maxf and the index (counted from 0) is returned.
 */
|
||||
BLASLONG index;
|
||||
register __vector long long start = {1,0};
|
||||
register __vector long long temp_add_index = {2, 2};
|
||||
__asm__(
|
||||
|
||||
|
||||
// prologue: load the first 16 doubles (8 vectors, 128 bytes)
"lxvd2x 44, 0,%[ptr_tmp] \n\t"
|
||||
"lxvd2x 45, %[i16],%[ptr_tmp] \n\t"
|
||||
"lxvd2x 46, %[i32],%[ptr_tmp] \n\t"
|
||||
"lxvd2x 47, %[i48],%[ptr_tmp] \n\t"
|
||||
"lxvd2x 48, %[i64],%[ptr_tmp] \n\t"
|
||||
"lxvd2x 49, %[i80],%[ptr_tmp] \n\t"
|
||||
"lxvd2x 50, %[i96],%[ptr_tmp] \n\t"
|
||||
"lxvd2x 51,%[i112],%[ptr_tmp] \n\t"
|
||||
|
||||
// build the constant index vectors and zero the running state
"xxlor 40,%x[start],%x[start] \n\t" //{ 1,0} vs40 | v8
|
||||
"vaddudm 9,8,%[adder] \n\t" //{3,2} vs41
|
||||
"xxlxor 37,37 ,37 \n\t" //v5 v37 index_count
|
||||
"vaddudm 10,9,%[adder] \n\t" //{5,4} vs42
|
||||
"xxlxor 38 ,38 ,38 \n\t" // v6 | vs38 vec_max_index
|
||||
"vaddudm 11,10,%[adder] \n\t" //{7,6} vs43
|
||||
"xxlxor 39,39,39 \n\t" // vs39 vec_max_value
|
||||
"vaddudm 4,11, %[adder] \n\t" // {9,8} -{8;8} vs36 | v4
|
||||
"xxspltd 36,36,0 \n\t"
|
||||
|
||||
// absolute values of the 16 loaded elements
"xvabsdp 44, 44 \n\t"
|
||||
"xvabsdp 45, 45 \n\t"
|
||||
"xvabsdp 46, 46 \n\t"
|
||||
"xvabsdp 47, 47 \n\t"
|
||||
"xvabsdp 48, 48 \n\t"
|
||||
"xvabsdp 49, 49 \n\t"
|
||||
"xvabsdp 50, 50 \n\t"
|
||||
"xvabsdp 51, 51 \n\t"
|
||||
|
||||
//jump first half forward
|
||||
"b 2f \n\t"
|
||||
|
||||
//===================================================================
|
||||
|
||||
".p2align 5 \n\t"
|
||||
|
||||
// first half of the loop body: reduce 16 values pairwise, keeping the
// matching index vector alongside each selected value
"1: \n\t"
|
||||
"xvcmpgtdp 2,45,44 \n\t "
|
||||
"xvcmpgtdp 3,47,46 \n\t "
|
||||
"xvcmpgtdp 4,49,48 \n\t "
|
||||
"xvcmpgtdp 5,51,50 \n\t"
|
||||
|
||||
"xxsel 32,40,41,2 \n\t"
|
||||
"xxsel 0,44,45,2 \n\t"
|
||||
"xxsel 33,42,43,3 \n\t"
|
||||
"xxsel 1,46,47,3 \n\t"
|
||||
"xxsel 34,40,41,4 \n\t"
|
||||
"xxsel 45,48,49,4 \n\t"
|
||||
"xxsel 35,42,43,5 \n\t"
|
||||
"xxsel 47,50,51,5 \n\t"
|
||||
|
||||
"xvcmpgtdp 2, 1,0 \n\t"
|
||||
"xvcmpgtdp 3,47, 45 \n\t"
|
||||
|
||||
"addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t"
|
||||
|
||||
"xxsel 32,32,33,2 \n\t"
|
||||
"xxsel 0 ,0,1,2 \n\t"
|
||||
"xxsel 34,34,35,3 \n\t"
|
||||
"xxsel 5,45,47,3 \n\t"
|
||||
|
||||
|
||||
//load next 64
|
||||
"lxvd2x 44, 0,%[ptr_tmp] \n\t"
|
||||
"lxvd2x 45, %[i16],%[ptr_tmp] \n\t"
|
||||
|
||||
// for {second 8 elements } we have to add 8 to each so that it became {from 8 to 16}
|
||||
"vaddudm 2,2,4 \n\t" // vs34=vs34 + vs36{8,8}
|
||||
|
||||
"lxvd2x 46, %[i32],%[ptr_tmp] \n\t"
|
||||
"lxvd2x 47, %[i48],%[ptr_tmp] \n\t"
|
||||
|
||||
//choose bigger from first and second part
|
||||
"xvcmpgtdp 4,5 , 0 \n\t"
|
||||
"xxsel 3, 0,5,4 \n\t"
|
||||
"xxsel 33,32,34,4 \n\t"
|
||||
|
||||
//load next 64
|
||||
"lxvd2x 48, %[i64],%[ptr_tmp] \n\t"
|
||||
"lxvd2x 49, %[i80],%[ptr_tmp] \n\t"
|
||||
|
||||
"vaddudm 1,1,5 \n\t" // get real index for first bigger
|
||||
|
||||
"lxvd2x 50, %[i96],%[ptr_tmp] \n\t"
|
||||
"lxvd2x 51,%[i112],%[ptr_tmp] \n\t"
|
||||
|
||||
//compare with previous to get vec_max_index(v6 | vs38 ) and vec_max_value (vs39)
|
||||
"xvcmpgtdp 2, 3,39 \n\t"
|
||||
"xxsel 39,39,3,2 \n\t"
|
||||
"xxsel 38,38,33,2 \n\t"
|
||||
|
||||
//update index += 8
|
||||
"vaddudm 5,5,4 \n\t"
|
||||
|
||||
"xvabsdp 44, 44 \n\t"
|
||||
"xvabsdp 45, 45 \n\t"
|
||||
"xvabsdp 46, 46 \n\t"
|
||||
"xvabsdp 47, 47 \n\t"
|
||||
|
||||
//update index += 8
|
||||
"vaddudm 5,5,4 \n\t"
|
||||
|
||||
"xvabsdp 48, 48 \n\t"
|
||||
"xvabsdp 49, 49 \n\t"
|
||||
"xvabsdp 50, 50 \n\t"
|
||||
"xvabsdp 51, 51 \n\t"
|
||||
|
||||
//<-----------jump here from first load
|
||||
// second half of the loop body: same reduction on the next 16 values
"2: \n\t"
|
||||
|
||||
"xvcmpgtdp 2,45,44 \n\t "
|
||||
"xvcmpgtdp 3,47,46 \n\t "
|
||||
"xvcmpgtdp 4,49,48 \n\t "
|
||||
"xvcmpgtdp 5,51,50 \n\t"
|
||||
|
||||
"xxsel 32,40,41,2 \n\t"
|
||||
"xxsel 0,44,45,2 \n\t"
|
||||
"xxsel 33,42,43,3 \n\t"
|
||||
"xxsel 1,46,47,3 \n\t"
|
||||
"xxsel 34,40,41,4 \n\t"
|
||||
"xxsel 45,48,49,4 \n\t"
|
||||
"xxsel 35,42,43,5 \n\t"
|
||||
"xxsel 47,50,51,5 \n\t"
|
||||
|
||||
"xvcmpgtdp 2, 1,0 \n\t"
|
||||
"xvcmpgtdp 3,47, 45 \n\t"
|
||||
"xxsel 32,32,33,2 \n\t"
|
||||
"xxsel 0 ,0,1,2 \n\t"
|
||||
"xxsel 34,34,35,3 \n\t"
|
||||
"xxsel 5,45,47,3 \n\t"
|
||||
|
||||
"addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t"
|
||||
// for {second 8 elements } we have to add 8 to each so that it became {from 8 to 16}
|
||||
"vaddudm 2,2,4 \n\t" // vs34=vs34 + vs36{8,8}
|
||||
|
||||
//load next 64
|
||||
"lxvd2x 44, 0,%[ptr_tmp] \n\t"
|
||||
"lxvd2x 45, %[i16],%[ptr_tmp] \n\t"
|
||||
"lxvd2x 46, %[i32],%[ptr_tmp] \n\t"
|
||||
"lxvd2x 47, %[i48],%[ptr_tmp] \n\t"
|
||||
|
||||
//choose bigger from first and second part
|
||||
"xvcmpgtdp 4,5 , 0 \n\t"
|
||||
"xxsel 3, 0,5,4 \n\t"
|
||||
"xxsel 33,32,34,4 \n\t"
|
||||
|
||||
//load next 64
|
||||
"lxvd2x 48, %[i64],%[ptr_tmp] \n\t"
|
||||
"lxvd2x 49, %[i80],%[ptr_tmp] \n\t"
|
||||
|
||||
"vaddudm 1,1,5 \n\t" // get real index for first bigger
|
||||
|
||||
"lxvd2x 50, %[i96],%[ptr_tmp] \n\t"
|
||||
"lxvd2x 51,%[i112],%[ptr_tmp] \n\t"
|
||||
|
||||
|
||||
|
||||
//compare with previous to get vec_max_index(v6 | vs38 ) and vec_max_value (vs39)
|
||||
"xvcmpgtdp 2, 3,39 \n\t"
|
||||
"xxsel 39,39,3,2 \n\t"
|
||||
"xxsel 38,38,33,2 \n\t"
|
||||
|
||||
//update index += 8
|
||||
"vaddudm 5,5,4 \n\t"
|
||||
|
||||
"xvabsdp 44, 44 \n\t"
|
||||
"xvabsdp 45, 45 \n\t"
|
||||
"xvabsdp 46, 46 \n\t"
|
||||
"xvabsdp 47, 47 \n\t"
|
||||
|
||||
//update index += 8
|
||||
"vaddudm 5,5,4 \n\t"
|
||||
|
||||
"xvabsdp 48, 48 \n\t"
|
||||
"xvabsdp 49, 49 \n\t"
|
||||
"xvabsdp 50, 50 \n\t"
|
||||
"xvabsdp 51, 51 \n\t"
|
||||
|
||||
//decrement n
|
||||
"addic. %[n], %[n], -32 \n\t"
|
||||
|
||||
//Loop back if >0
|
||||
"bgt+ 1b \n\t"
|
||||
|
||||
//==============================================================================
|
||||
|
||||
// epilogue: reduce the last 16 loaded values (no further loads)
"xvcmpgtdp 2,45,44 \n\t "
|
||||
"xvcmpgtdp 3,47,46 \n\t "
|
||||
"xvcmpgtdp 4,49,48 \n\t "
|
||||
"xvcmpgtdp 5,51,50 \n\t"
|
||||
|
||||
"xxsel 32,40,41,2 \n\t"
|
||||
"xxsel 0,44,45,2 \n\t"
|
||||
"xxsel 33,42,43,3 \n\t"
|
||||
"xxsel 1,46,47,3 \n\t"
|
||||
"xxsel 34,40,41,4 \n\t"
|
||||
"xxsel 45,48,49,4 \n\t"
|
||||
"xxsel 35,42,43,5 \n\t"
|
||||
"xxsel 47,50,51,5 \n\t"
|
||||
|
||||
"xvcmpgtdp 2, 1,0 \n\t"
|
||||
"xvcmpgtdp 3,47, 45 \n\t"
|
||||
|
||||
|
||||
"xxsel 32,32,33,2 \n\t"
|
||||
"xxsel 0 ,0,1,2 \n\t"
|
||||
"xxsel 34,34,35,3 \n\t"
|
||||
"xxsel 5,45,47,3 \n\t"
|
||||
|
||||
// for {second 8 elements } we have to add 8 to each so that it became {from 8 to 16}
|
||||
"vaddudm 2,2,4 \n\t" // vs34=vs34 + vs36{8,8}
|
||||
//choose bigger from first and second part
|
||||
"xvcmpgtdp 4,5 , 0 \n\t"
|
||||
"xxsel 3, 0,5,4 \n\t"
|
||||
"xxsel 33,32,34,4 \n\t"
|
||||
|
||||
"vaddudm 1,1,5 \n\t" // get real index for first bigger
|
||||
|
||||
//compare with previous to get vec_max_index(v6 | vs38 ) and vec_max_value (vs39)
|
||||
"xvcmpgtdp 2, 3,39 \n\t"
|
||||
"xxsel 39,39,3,2 \n\t"
|
||||
"xxsel 38,38,33,2 \n\t"
|
||||
|
||||
///////extract max value and max index from vector
|
||||
|
||||
// splat one lane of the index/value vectors and compare it with the other
"xxspltd 32,38,1 \n\t"
|
||||
"xxspltd 40,39,1 \n\t"
|
||||
"xvcmpeqdp. 2, 40,39 \n\t"
|
||||
|
||||
//cr6 0 bit set if all true, cr6=4*6+bit_ind=24,0011at CR(BI)==1, at=10 hint that it occurs rarely
|
||||
//0b001110=14
|
||||
"bc 14,24, 3f \n\t"
|
||||
// lanes differ: keep the larger value and its index, store the value
"xvcmpgtdp 4, 40,39 \n\t"
|
||||
"xxsel 0,39,40,4 \n\t"
|
||||
"xxsel 1,38,32,4 \n\t"
|
||||
"stxsdx 0,0,%[ptr_maxf] \n\t"
|
||||
"b 4f \n\t"
|
||||
|
||||
"3: \n\t"
|
||||
//if elements value are equal then choose minimum index
|
||||
"xxspltd 0,40,0 \n\t"
|
||||
"vminud 0,0,6 \n\t" //vs32 vs38
|
||||
"xxlor 1,32,32 \n\t"
|
||||
"stxsdx 0,0,%[ptr_maxf] \n\t"
|
||||
|
||||
|
||||
// move the selected index into the integer result register
"4: \n\t"
|
||||
"mfvsrd %[index],1 \n\t"
|
||||
|
||||
: [maxf] "=m"(*maxf),[ptr_tmp] "+&b"(x),[index] "=r"(index), [n] "+&r"(n)
|
||||
: [mem] "m"(*(const double (*)[n])x), [ptr_x] "b"(x), [ptr_maxf] "b"(maxf) ,
|
||||
[i16] "b"(16), [i32] "b"(32), [i48] "b"(48),
|
||||
[i64] "b"(64), [i80] "b"(80), [i96] "b"(96), [i112] "b"(112),
|
||||
[start] "v"(start), [adder] "v"(temp_add_index)
|
||||
: "cc", "vs0", "vs1","vs2","vs3", "vs4","vs5","vs32", "vs33", "vs34", "vs35", "vs36",
|
||||
"vs37", "vs38", "vs39", "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49", "vs50", "vs51"
|
||||
);
|
||||
|
||||
|
||||
return index;
|
||||
|
||||
}
|
||||
|
||||
/*
 * Driver for the index-of-maximum-absolute-value search.
 *
 * Returns 0 when n <= 0 or inc_x <= 0; otherwise returns the position
 * (counted from 1) of an element with the largest absolute value. The
 * scalar loops use a strict ">" comparison, so they keep the earliest
 * such element they see.
 *
 * For unit stride the leading multiple-of-32 part is handed to
 * diamax_kernel_32 and the remaining elements are scanned one by one.
 * For other strides the vector is scanned in groups of four followed
 * by a scalar tail.
 */
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    BLASLONG best = 0;      /* position (from 0) of the current maximum */
    FLOAT best_abs = 0.0;   /* absolute value of the current maximum */
    BLASLONG pos = 0;       /* logical element counter */
    BLASLONG off = 0;       /* offset into x for the strided path */

    if (n <= 0 || inc_x <= 0) return (best);

    if (inc_x == 1) {
        BLASLONG bulk = n & -32;

        if (bulk > 0) {
            best = diamax_kernel_32(bulk, x, &best_abs);
            pos = bulk;
        }

        /* leftover elements after the last full group of 32 */
        for (; pos < n; pos++) {
            FLOAT mag = ABS(x[pos]);
            if (mag > best_abs) {
                best = pos;
                best_abs = mag;
            }
        }
        return (best + 1);
    }

    {
        BLASLONG grouped = n & -4;

        /* groups of four strided elements */
        while (pos < grouped) {
            BLASLONG k;
            for (k = 0; k < 4; k++) {
                FLOAT mag = ABS(x[off + k * inc_x]);
                if (mag > best_abs) {
                    best = pos + k;
                    best_abs = mag;
                }
            }
            off += inc_x * 4;
            pos += 4;
        }

        /* scalar tail */
        for (; pos < n; pos++, off += inc_x) {
            FLOAT mag = ABS(x[off]);
            if (mag > best_abs) {
                best = pos;
                best_abs = mag;
            }
        }
    }
    return (best + 1);
}
|
||||
384
kernel/power/idamin.c
Normal file
384
kernel/power/idamin.c
Normal file
@@ -0,0 +1,384 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2013-2018, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
#include "common.h"
|
||||
#include <math.h>
|
||||
|
||||
#if defined(DOUBLE)
|
||||
|
||||
#define ABS fabs
|
||||
|
||||
#else
|
||||
|
||||
#define ABS fabsf
|
||||
|
||||
#endif
|
||||
|
||||
/**
|
||||
* Find minimum index
|
||||
* Warning: requirements n>0 and n % 32 == 0
|
||||
* @param n
|
||||
* @param x pointer to the vector
|
||||
* @param minf (in/out) on entry, the initial candidate minimum (the kernel loads it and takes its absolute value); on exit, the minimum absolute value found
|
||||
* @return minimum index
|
||||
*/
|
||||
static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) {
    /*
     * Hand-scheduled POWER VSX inline assembly. Register roles, as set up
     * by the prologue below:
     *   vs44-vs51 : the 16 values currently loaded (absolute values after
     *               xvabsdp); vs45/vs47 are also reused as scratch
     *   vs40-vs43 : constant lane-index vectors {1,0},{3,2},{5,4},{7,6}
     *   vs36 (v4) : constant {8,8}, added to indices of the second 8 values
     *   vs37 (v5) : running index offset (index_count)
     *   vs38 (v6) : per-lane index of the running minimum (vec_min_index)
     *   vs39      : per-lane running minimum, seeded from |*minf|
     * Each loop iteration (labels 1 and 2) consumes 32 values and
     * subtracts 32 from n. The result value is stored through ptr_minf and
     * the index (counted from 0) is returned.
     *
     * *minf is READ by the prologue (lxvdsx) as well as written at the end,
     * so it must be declared as a read-write memory operand ("+m").
     * Declaring it write-only would allow the compiler to discard the
     * caller's initialisation of *minf before the asm executes.
     *
     * NOTE(review): all comparisons are ">=" (xvcmpgedp), so when two
     * compared values are equal the later one is selected; confirm this is
     * the intended tie-breaking.
     */
    BLASLONG index;
    register __vector long long start = {1,0};
    register __vector long long temp_add_index = {2, 2};
    __asm__(

        /* prologue: load the first 16 doubles (8 vectors, 128 bytes) */
        "lxvd2x 44, 0,%[ptr_tmp] \n\t"
        "lxvd2x 45, %[i16],%[ptr_tmp] \n\t"
        "lxvd2x 46, %[i32],%[ptr_tmp] \n\t"
        "lxvd2x 47, %[i48],%[ptr_tmp] \n\t"
        "lxvd2x 48, %[i64],%[ptr_tmp] \n\t"
        "lxvd2x 49, %[i80],%[ptr_tmp] \n\t"
        "lxvd2x 50, %[i96],%[ptr_tmp] \n\t"
        "lxvd2x 51,%[i112],%[ptr_tmp] \n\t"

        "xxlor 40,%x[start],%x[start] \n\t" // {1,0}  vs40 | v8
        "vaddudm 9,8, %[adder] \n\t" // {3,2}  vs41
        "xxlxor 37,37 ,37 \n\t" // v5 | vs37  index_count
        "vaddudm 10,9,%[adder] \n\t" // {5,4}  vs42
        "xxlxor 38 ,38 ,38 \n\t" // v6 | vs38  vec_min_index
        "vaddudm 11,10,%[adder] \n\t" // {7,6}  vs43
        "lxvdsx 39,0,%[ptr_minf] \n\t" // vs39  vec_min_value, seeded from *minf
        "vaddudm 4,11, %[adder] \n\t" // {9,8} -> {8,8} after the splat  vs36 | v4
        "xxspltd 36,36,0 \n\t"
        "xvabsdp 39, 39 \n\t"

        "xvabsdp 44, 44 \n\t"
        "xvabsdp 45, 45 \n\t"
        "xvabsdp 46, 46 \n\t"
        "xvabsdp 47, 47 \n\t"
        "xvabsdp 48, 48 \n\t"
        "xvabsdp 49, 49 \n\t"
        "xvabsdp 50, 50 \n\t"
        "xvabsdp 51, 51 \n\t"

        // jump forward into the second half of the loop body
        "b 2f \n\t"

        //===================================================================

        ".p2align 5 \n\t"

        // first half of the loop body
        "1: \n\t"
        "xvcmpgedp 2,44,45 \n\t "
        "xvcmpgedp 3,46,47 \n\t "
        "xvcmpgedp 4,48,49 \n\t "
        "xvcmpgedp 5,50,51 \n\t"

        "xxsel 32,40,41,2 \n\t"
        "xxsel 0,44,45,2 \n\t"
        "xxsel 33,42,43,3 \n\t"
        "xxsel 1,46,47,3 \n\t"
        "xxsel 34,40,41,4 \n\t"
        "xxsel 45,48,49,4 \n\t"
        "xxsel 35,42,43,5 \n\t"
        "xxsel 47,50,51,5 \n\t"

        "xvcmpgedp 2,0, 1 \n\t"
        "xvcmpgedp 3, 45,47 \n\t"

        "addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t"

        "xxsel 32,32,33,2 \n\t"
        "xxsel 0 ,0,1,2 \n\t"
        "xxsel 34,34,35,3 \n\t"
        "xxsel 5,45,47,3 \n\t"

        // load next 64 bytes
        "lxvd2x 44, 0,%[ptr_tmp] \n\t"
        "lxvd2x 45, %[i16],%[ptr_tmp] \n\t"

        // indices chosen from the second 8 elements need +8
        "vaddudm 2,2,4 \n\t" // vs34 = vs34 + vs36{8,8}

        "lxvd2x 46, %[i32],%[ptr_tmp] \n\t"
        "lxvd2x 47, %[i48],%[ptr_tmp] \n\t"

        // choose smaller from first and second part
        "xvcmpgedp 4, 0,5 \n\t"
        "xxsel 3, 0,5,4 \n\t"
        "xxsel 33,32,34,4 \n\t"

        // load next 64 bytes
        "lxvd2x 48, %[i64],%[ptr_tmp] \n\t"
        "lxvd2x 49, %[i80],%[ptr_tmp] \n\t"

        "vaddudm 1,1,5 \n\t" // add the running offset to get the real index

        "lxvd2x 50, %[i96],%[ptr_tmp] \n\t"
        "lxvd2x 51,%[i112],%[ptr_tmp] \n\t"

        // compare with previous to update vec_min_index (vs38) and vec_min_value (vs39)
        "xvcmpgedp 2,39, 3 \n\t"
        "xxsel 39,39,3,2 \n\t"
        "xxsel 38,38,33,2 \n\t"

        // update index += 8
        "vaddudm 5,5,4 \n\t"

        "xvabsdp 44, 44 \n\t"
        "xvabsdp 45, 45 \n\t"
        "xvabsdp 46, 46 \n\t"
        "xvabsdp 47, 47 \n\t"

        // update index += 8
        "vaddudm 5,5,4 \n\t"

        "xvabsdp 48, 48 \n\t"
        "xvabsdp 49, 49 \n\t"
        "xvabsdp 50, 50 \n\t"
        "xvabsdp 51, 51 \n\t"

        // <----------- entry point after the initial load
        "2: \n\t"

        "xvcmpgedp 2,44,45 \n\t "
        "xvcmpgedp 3,46,47 \n\t "
        "xvcmpgedp 4,48,49 \n\t "
        "xvcmpgedp 5,50,51 \n\t"

        "xxsel 32,40,41,2 \n\t"
        "xxsel 0,44,45,2 \n\t"
        "xxsel 33,42,43,3 \n\t"
        "xxsel 1,46,47,3 \n\t"
        "xxsel 34,40,41,4 \n\t"
        "xxsel 45,48,49,4 \n\t"
        "xxsel 35,42,43,5 \n\t"
        "xxsel 47,50,51,5 \n\t"

        "xvcmpgedp 2,0, 1 \n\t"
        "xvcmpgedp 3, 45,47 \n\t"
        "xxsel 32,32,33,2 \n\t"
        "xxsel 0 ,0,1,2 \n\t"
        "xxsel 34,34,35,3 \n\t"
        "xxsel 5,45,47,3 \n\t"

        "addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t"
        // indices chosen from the second 8 elements need +8
        "vaddudm 2,2,4 \n\t" // vs34 = vs34 + vs36{8,8}

        // load next 64 bytes
        "lxvd2x 44, 0,%[ptr_tmp] \n\t"
        "lxvd2x 45, %[i16],%[ptr_tmp] \n\t"
        "lxvd2x 46, %[i32],%[ptr_tmp] \n\t"
        "lxvd2x 47, %[i48],%[ptr_tmp] \n\t"

        // choose smaller from first and second part
        "xvcmpgedp 4, 0,5 \n\t"
        "xxsel 3, 0,5,4 \n\t"
        "xxsel 33,32,34,4 \n\t"

        // load next 64 bytes
        "lxvd2x 48, %[i64],%[ptr_tmp] \n\t"
        "lxvd2x 49, %[i80],%[ptr_tmp] \n\t"

        "vaddudm 1,1,5 \n\t" // add the running offset to get the real index

        "lxvd2x 50, %[i96],%[ptr_tmp] \n\t"
        "lxvd2x 51,%[i112],%[ptr_tmp] \n\t"

        // compare with previous to update vec_min_index (vs38) and vec_min_value (vs39)
        "xvcmpgedp 2,39, 3 \n\t"
        "xxsel 39,39,3,2 \n\t"
        "xxsel 38,38,33,2 \n\t"

        // update index += 8
        "vaddudm 5,5,4 \n\t"

        "xvabsdp 44, 44 \n\t"
        "xvabsdp 45, 45 \n\t"
        "xvabsdp 46, 46 \n\t"
        "xvabsdp 47, 47 \n\t"

        // update index += 8
        "vaddudm 5,5,4 \n\t"

        "xvabsdp 48, 48 \n\t"
        "xvabsdp 49, 49 \n\t"
        "xvabsdp 50, 50 \n\t"
        "xvabsdp 51, 51 \n\t"

        // decrement n
        "addic. %[n], %[n], -32 \n\t"

        // loop back if > 0
        "bgt+ 1b \n\t"

        //==============================================================================

        // epilogue: reduce the last 16 loaded values (no further loads)
        "xvcmpgedp 2,44,45 \n\t "
        "xvcmpgedp 3,46,47 \n\t "
        "xvcmpgedp 4,48,49 \n\t "
        "xvcmpgedp 5,50,51 \n\t"

        "xxsel 32,40,41,2 \n\t"
        "xxsel 0,44,45,2 \n\t"
        "xxsel 33,42,43,3 \n\t"
        "xxsel 1,46,47,3 \n\t"
        "xxsel 34,40,41,4 \n\t"
        "xxsel 45,48,49,4 \n\t"
        "xxsel 35,42,43,5 \n\t"
        "xxsel 47,50,51,5 \n\t"

        "xvcmpgedp 2,0, 1 \n\t"
        "xvcmpgedp 3, 45,47 \n\t"

        "xxsel 32,32,33,2 \n\t"
        "xxsel 0 ,0,1,2 \n\t"
        "xxsel 34,34,35,3 \n\t"
        "xxsel 5,45,47,3 \n\t"

        // indices chosen from the second 8 elements need +8
        "vaddudm 2,2,4 \n\t" // vs34 = vs34 + vs36{8,8}
        // choose smaller from first and second part
        "xvcmpgedp 4, 0,5 \n\t"
        "xxsel 3, 0,5,4 \n\t"
        "xxsel 33,32,34,4 \n\t"

        "vaddudm 1,1,5 \n\t" // add the running offset to get the real index

        // compare with previous to update vec_min_index (vs38) and vec_min_value (vs39)
        "xvcmpgedp 2,39, 3 \n\t"
        "xxsel 39,39,3,2 \n\t"
        "xxsel 38,38,33,2 \n\t"

        // extract min value and min index from the two lanes

        "xxspltd 32,38,1 \n\t"
        "xxspltd 40,39,1 \n\t"
        "xvcmpeqdp. 2, 40,39 \n\t"

        // cr6 bit 0 is set if all lanes compare true; BI = 4*6+0 = 24,
        // BO = 0b001110 = 14 (branch if set, hinted as rarely taken)
        "bc 14,24, 3f \n\t"
        "xvcmpgedp 4,39, 40 \n\t"
        "xxsel 0,39,40,4 \n\t"
        "xxsel 1,38,32,4 \n\t"
        "stxsdx 0,0,%[ptr_minf] \n\t"
        "b 4f \n\t"

        "3: \n\t"
        // both lanes hold the same value: choose the minimum index
        "xxspltd 0,40,0 \n\t"
        "vminud 0,0,6 \n\t" // vs32 = min(vs32, vs38)
        "xxlor 1,32,32 \n\t"
        "stxsdx 0,0,%[ptr_minf] \n\t"

        "4: \n\t"
        "mfvsrd %[index],1 \n\t"

        : [minf] "+m"(*minf),[ptr_tmp] "+&b"(x),[index] "=r"(index), [n] "+&r"(n)
        : [mem] "m"(*(const double (*)[n])x), [ptr_x] "b"(x), [ptr_minf] "b"(minf) ,
          [i16] "b"(16), [i32] "b"(32), [i48] "b"(48),
          [i64] "b"(64), [i80] "b"(80), [i96] "b"(96), [i112] "b"(112),
          [start] "v"(start), [adder] "v"(temp_add_index)
        : "cc", "vs0", "vs1","vs2","vs3", "vs4","vs5","vs32", "vs33", "vs34", "vs35", "vs36",
          "vs37", "vs38", "vs39", "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49", "vs50", "vs51"
    );

    return index;
}
|
||||
|
||||
|
||||
|
||||
/*
 * Driver for the index-of-minimum-absolute-value search.
 *
 * Returns 0 when n <= 0 or inc_x <= 0; otherwise returns a position
 * counted from 1. The running minimum starts as |x[0]| with position 0,
 * and the scalar loops replace it only on a strict "<" comparison.
 *
 * For unit stride the leading multiple-of-32 part is handed to
 * diamin_kernel_32 (which receives the running minimum through its last
 * argument) and the remaining elements are scanned one by one. For other
 * strides the vector is scanned in groups of four followed by a scalar
 * tail.
 */
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
    BLASLONG pos = 0;        /* logical element counter */
    BLASLONG off = 0;        /* offset into x for the strided path */
    BLASLONG best = 0;       /* position (from 0) of the current minimum */
    FLOAT best_abs = 0.0;    /* absolute value of the current minimum */

    if (n <= 0 || inc_x <= 0) return (best);

    /* seed with the first element; its position stays 0 */
    best_abs = ABS(x[0]);

    if (inc_x == 1) {
        BLASLONG bulk = n & -32;

        if (bulk > 0) {
            best = diamin_kernel_32(bulk, x, &best_abs);
            pos = bulk;
        }

        /* leftover elements after the last full group of 32 */
        for (; pos < n; pos++) {
            FLOAT mag = ABS(x[pos]);
            if (mag < best_abs) {
                best = pos;
                best_abs = mag;
            }
        }
        return (best + 1);
    }

    {
        BLASLONG grouped = n & -4;

        /* groups of four strided elements */
        while (pos < grouped) {
            BLASLONG k;
            for (k = 0; k < 4; k++) {
                FLOAT mag = ABS(x[off + k * inc_x]);
                if (mag < best_abs) {
                    best = pos + k;
                    best_abs = mag;
                }
            }
            off += inc_x * 4;
            pos += 4;
        }

        /* scalar tail */
        for (; pos < n; pos++, off += inc_x) {
            FLOAT mag = ABS(x[off]);
            if (mag < best_abs) {
                best = pos;
                best_abs = mag;
            }
        }
    }
    return (best + 1);
}
|
||||
362
kernel/power/izamax.c
Normal file
362
kernel/power/izamax.c
Normal file
@@ -0,0 +1,362 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2017, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
|
||||
#include "common.h"
|
||||
#include <math.h>
|
||||
|
||||
#define ABS fabs
|
||||
/* |re| + |im| of the complex element stored at x[i], x[i+1]. Fully parenthesized so the expansion is safe inside comparisons and larger expressions. */
#define CABS1(x,i) (ABS((x)[(i)])+ABS((x)[(i)+1]))
|
||||
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* Find maximum index
|
||||
* Warning: requirements n>0 and n % 16 == 0
|
||||
* @param n
|
||||
* @param x pointer to the vector
|
||||
* @param maxf (out) receives the maximum absolute value found (output only)
|
||||
* @return index
|
||||
*/
|
||||
static BLASLONG ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) {
/*
 * Hand-scheduled POWER VSX inline assembly. Each loaded vector holds two
 * consecutive doubles of x; after xvabsdp, the xxmrghd/xxmrgld + xvadddp
 * sequence adds the two doublewords of each vector, giving one sum per
 * loaded vector (two sums per result register). Register roles, as set up
 * by the prologue below:
 *   vs44-vs51 : the 8 vectors currently loaded (reused as scratch)
 *   vs40-vs43 : constant lane-index vectors {1,0},{3,2},{5,4},{7,6}
 *   vs36 (v4) : constant {8,8}, the index step per 8 sums
 *   vs37 (v5) : running index offset (index_count)
 *   vs38 (v6) : per-lane index of the running maximum (vec_max_index)
 *   vs39      : per-lane running maximum sum (vec_max_value), starts at 0
 * Each loop iteration (labels 1 and 2) handles two groups of 8 sums and
 * subtracts 16 from n; the code after the loop handles the last loaded
 * group and reduces the two lanes to one value and index. The value is
 * stored through ptr_maxf and the index (counted from 0) is returned.
 */
|
||||
|
||||
BLASLONG index;
|
||||
register __vector long long start = {1,0};
|
||||
register __vector long long temp_add_index = {2, 2};
|
||||
__asm__(
|
||||
|
||||
// prologue: load the first 8 vectors (128 bytes)
"lxvd2x 44, 0,%[ptr_tmp] \n\t"
|
||||
"lxvd2x 45, %[i16],%[ptr_tmp] \n\t"
|
||||
"lxvd2x 46, %[i32],%[ptr_tmp] \n\t"
|
||||
"lxvd2x 47, %[i48],%[ptr_tmp] \n\t"
|
||||
"lxvd2x 48, %[i64],%[ptr_tmp] \n\t"
|
||||
"lxvd2x 49, %[i80],%[ptr_tmp] \n\t"
|
||||
"lxvd2x 50, %[i96],%[ptr_tmp] \n\t"
|
||||
"lxvd2x 51,%[i112],%[ptr_tmp] \n\t"
|
||||
|
||||
// build the constant index vectors and zero the running state
"xxlor 40,%x[start],%x[start] \n\t" //{ 1,0} vs40 | v8
|
||||
"vaddudm 9,8,%[adder] \n\t" //{3,2} vs41
|
||||
"xxlxor 37,37 ,37 \n\t" //v5 v37 index_count
|
||||
"vaddudm 10,9,%[adder] \n\t" //{5,4} vs42
|
||||
"xxlxor 38 ,38 ,38 \n\t" // v6 | vs38 vec_max_index
|
||||
"vaddudm 11,10,%[adder] \n\t" //{7,6} vs43
|
||||
"xxlxor 39,39,39 \n\t" // vs39 vec_max_value is zero
|
||||
"vaddudm 4,11, %[adder] \n\t" // {9,8} -{8;8} vs36 | v4
|
||||
"xxspltd 36,36,0 \n\t"
|
||||
|
||||
|
||||
|
||||
"xvabsdp 44, 44 \n\t"
|
||||
"xvabsdp 45, 45 \n\t"
|
||||
"xvabsdp 46, 46 \n\t"
|
||||
"xvabsdp 47, 47 \n\t"
|
||||
"xvabsdp 48, 48 \n\t"
|
||||
"xvabsdp 49, 49 \n\t"
|
||||
"xvabsdp 50, 50 \n\t"
|
||||
"xvabsdp 51, 51 \n\t"
|
||||
|
||||
//jump first half forward
|
||||
"b 2f \n\t"
|
||||
|
||||
".p2align 5 \n\t"
|
||||
// first half of the loop body
"1: \n\t"
|
||||
|
||||
|
||||
// regroup so that matching doublewords of two vectors sit in one register
"xxmrghd 0,44,45 \n\t"
|
||||
"xxmrgld 1,44,45 \n\t"
|
||||
"xxmrghd 2,46,47 \n\t"
|
||||
"xxmrgld 3,46,47 \n\t"
|
||||
"xxmrghd 4,48,49 \n\t"
|
||||
"xxmrgld 5,48,49 \n\t"
|
||||
"xxmrghd 44,50,51 \n\t"
|
||||
"xxmrgld 45,50,51 \n\t"
|
||||
|
||||
// sum the two absolute values belonging to each loaded vector
"xvadddp 46, 0,1 \n\t"
|
||||
"xvadddp 47, 2,3 \n\t"
|
||||
"xvadddp 48, 4,5 \n\t"
|
||||
"xvadddp 49, 44,45 \n\t"
|
||||
|
||||
|
||||
|
||||
"xvcmpgtdp 50,47,46 \n\t "
|
||||
"xvcmpgtdp 51,49,48 \n\t "
|
||||
|
||||
"addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t"
|
||||
|
||||
"xxsel 32,40,41,50 \n\t"
|
||||
"xxsel 0,46,47,50 \n\t"
|
||||
"xxsel 33,42,43,51 \n\t"
|
||||
"xxsel 1,48,49,51 \n\t"
|
||||
|
||||
"lxvd2x 44, 0,%[ptr_tmp] \n\t"
|
||||
"lxvd2x 45, %[i16],%[ptr_tmp] \n\t"
|
||||
|
||||
"xvcmpgtdp 2,1,0 \n\t "
|
||||
"lxvd2x 46, %[i32],%[ptr_tmp] \n\t"
|
||||
"lxvd2x 47, %[i48],%[ptr_tmp] \n\t"
|
||||
|
||||
|
||||
"xxsel 32,32,33,2 \n\t"
|
||||
"xxsel 3,0,1,2 \n\t"
|
||||
|
||||
// add the running offset to the selected lane indices
"vaddudm 0,0,5 \n\t"
|
||||
|
||||
//cmp with previous
|
||||
|
||||
"xvcmpgtdp 4,3,39 \n\t "
|
||||
// advance the running offset by 8
"vaddudm 5,5,4 \n\t"
|
||||
|
||||
"lxvd2x 48, %[i64],%[ptr_tmp] \n\t"
|
||||
"lxvd2x 49, %[i80],%[ptr_tmp] \n\t"
|
||||
"lxvd2x 50, %[i96],%[ptr_tmp] \n\t"
|
||||
"lxvd2x 51,%[i112],%[ptr_tmp] \n\t"
|
||||
//select with previous
|
||||
"xxsel 38,38,32,4 \n\t"
|
||||
"xxsel 39,39,3,4 \n\t"
|
||||
|
||||
|
||||
|
||||
|
||||
"xvabsdp 44, 44 \n\t"
|
||||
"xvabsdp 45, 45 \n\t"
|
||||
"xvabsdp 46, 46 \n\t"
|
||||
"xvabsdp 47, 47 \n\t"
|
||||
"xvabsdp 48, 48 \n\t"
|
||||
"xvabsdp 49, 49 \n\t"
|
||||
"xvabsdp 50, 50 \n\t"
|
||||
"xvabsdp 51, 51 \n\t"
|
||||
|
||||
|
||||
//>>/////////////////////////////// half start
|
||||
// second half of the loop body (also the entry point after the prologue)
"2: \n\t"
|
||||
"xxmrghd 0,44,45 \n\t"
|
||||
"xxmrgld 1,44,45 \n\t"
|
||||
"xxmrghd 2,46,47 \n\t"
|
||||
"xxmrgld 3,46,47 \n\t"
|
||||
"xxmrghd 4,48,49 \n\t"
|
||||
"xxmrgld 5,48,49 \n\t"
|
||||
"xxmrghd 44,50,51 \n\t"
|
||||
"xxmrgld 45,50,51 \n\t"
|
||||
|
||||
"xvadddp 46, 0,1 \n\t"
|
||||
"xvadddp 47, 2,3 \n\t"
|
||||
"xvadddp 48, 4,5 \n\t"
|
||||
"xvadddp 49, 44,45 \n\t"
|
||||
|
||||
"xvcmpgtdp 50,47,46 \n\t "
|
||||
"xvcmpgtdp 51,49,48 \n\t "
|
||||
|
||||
"addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t"
|
||||
|
||||
"xxsel 32,40,41,50 \n\t"
|
||||
"xxsel 0,46,47,50 \n\t"
|
||||
"xxsel 33,42,43,51 \n\t"
|
||||
"xxsel 1,48,49,51 \n\t"
|
||||
|
||||
"lxvd2x 44, 0,%[ptr_tmp] \n\t"
|
||||
"lxvd2x 45, %[i16],%[ptr_tmp] \n\t"
|
||||
|
||||
"xvcmpgtdp 2,1,0 \n\t "
|
||||
"lxvd2x 46, %[i32],%[ptr_tmp] \n\t"
|
||||
"lxvd2x 47, %[i48],%[ptr_tmp] \n\t"
|
||||
|
||||
|
||||
"xxsel 32,32,33,2 \n\t"
|
||||
"xxsel 3,0,1,2 \n\t"
|
||||
|
||||
"vaddudm 0,0,5 \n\t"
|
||||
|
||||
//cmp with previous
|
||||
|
||||
"xvcmpgtdp 4,3,39 \n\t "
|
||||
"vaddudm 5,5,4 \n\t"
|
||||
|
||||
"lxvd2x 48, %[i64],%[ptr_tmp] \n\t"
|
||||
"lxvd2x 49, %[i80],%[ptr_tmp] \n\t"
|
||||
"lxvd2x 50, %[i96],%[ptr_tmp] \n\t"
|
||||
"lxvd2x 51,%[i112],%[ptr_tmp] \n\t"
|
||||
//select with previous
|
||||
"xxsel 38,38,32,4 \n\t"
|
||||
"xxsel 39,39,3,4 \n\t"
|
||||
|
||||
|
||||
"xvabsdp 44, 44 \n\t"
|
||||
"xvabsdp 45, 45 \n\t"
|
||||
"xvabsdp 46, 46 \n\t"
|
||||
"xvabsdp 47, 47 \n\t"
|
||||
"xvabsdp 48, 48 \n\t"
|
||||
"xvabsdp 49, 49 \n\t"
|
||||
"xvabsdp 50, 50 \n\t"
|
||||
"xvabsdp 51, 51 \n\t"
|
||||
|
||||
|
||||
//decrement n
|
||||
"addic. %[n], %[n], -16 \n\t"
|
||||
//Loop back if >0
|
||||
"bgt+ 1b \n\t"
|
||||
|
||||
|
||||
// epilogue: reduce the last loaded group (no further loads)
"xxmrghd 0,44,45 \n\t"
|
||||
"xxmrgld 1,44,45 \n\t"
|
||||
"xxmrghd 2,46,47 \n\t"
|
||||
"xxmrgld 3,46,47 \n\t"
|
||||
"xxmrghd 4,48,49 \n\t"
|
||||
"xxmrgld 5,48,49 \n\t"
|
||||
"xxmrghd 44,50,51 \n\t"
|
||||
"xxmrgld 45,50,51 \n\t"
|
||||
|
||||
"xvadddp 46, 0,1 \n\t"
|
||||
"xvadddp 47, 2,3 \n\t"
|
||||
"xvadddp 48, 4,5 \n\t"
|
||||
"xvadddp 49, 44,45 \n\t"
|
||||
|
||||
|
||||
|
||||
"xvcmpgtdp 50,47,46 \n\t "
|
||||
"xvcmpgtdp 51,49,48 \n\t "
|
||||
|
||||
"xxsel 32,40,41,50 \n\t"
|
||||
"xxsel 0,46,47,50 \n\t"
|
||||
"xxsel 33,42,43,51 \n\t"
|
||||
"xxsel 1,48,49,51 \n\t"
|
||||
|
||||
"xvcmpgtdp 2,1,0 \n\t "
|
||||
"xxsel 32,32,33,2 \n\t"
|
||||
"xxsel 3,0,1,2 \n\t"
|
||||
|
||||
"vaddudm 0,0,5 \n\t"
|
||||
|
||||
"addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t"
|
||||
//cmp with previous
|
||||
|
||||
"xvcmpgtdp 4,3,39 \n\t "
|
||||
"vaddudm 5,5,4 \n\t"
|
||||
"xxsel 38,38,32,4 \n\t"
|
||||
"xxsel 39,39,3,4 \n\t"
|
||||
|
||||
|
||||
///////extract max value and max index from vector
|
||||
|
||||
// splat one lane of the index/value vectors and compare it with the other
"xxspltd 32,38,1 \n\t"
|
||||
"xxspltd 40,39,1 \n\t"
|
||||
"xvcmpeqdp. 2, 40,39 \n\t"
|
||||
|
||||
//cr6 0 bit set if all true, cr6=4*6+bit_ind=24,0011at CR(BI)==1, at=10 hint that it occurs rarely
|
||||
//0b001110=14
|
||||
"bc 14,24, 3f \n\t"
|
||||
// lanes differ: keep the larger value and its index, store the value
"xvcmpgtdp 4, 40,39 \n\t"
|
||||
"xxsel 0,39,40,4 \n\t"
|
||||
"xxsel 1,38,32,4 \n\t"
|
||||
"stxsdx 0,0,%[ptr_maxf] \n\t"
|
||||
"b 4f \n\t"
|
||||
|
||||
"3: \n\t"
|
||||
//if elements value are equal then choose minimum index
|
||||
"xxspltd 0,40,0 \n\t"
|
||||
"vminud 0,0,6 \n\t" //vs32 vs38
|
||||
"xxlor 1,32,32 \n\t"
|
||||
"stxsdx 0,0,%[ptr_maxf] \n\t"
|
||||
|
||||
|
||||
// move the selected index into the integer result register
"4: \n\t"
|
||||
"mfvsrd %[index],1 \n\t"
|
||||
|
||||
: [maxf] "=m"(*maxf),[ptr_tmp] "+&b"(x),[index] "=r"(index), [n] "+&r"(n)
|
||||
: [mem] "m"(*(const double (*)[2*n])x), [ptr_x] "b"(x), [ptr_maxf] "b"(maxf) ,
|
||||
[i16] "b"(16), [i32] "b"(32), [i48] "b"(48),
|
||||
[i64] "b"(64), [i80] "b"(80), [i96] "b"(96), [i112] "b"(112),
|
||||
[start] "v"(start), [adder] "v"(temp_add_index)
|
||||
: "cc", "vs0", "vs1","vs2","vs3", "vs4","vs5","vs32", "vs33", "vs34", "vs35", "vs36",
|
||||
"vs37", "vs38", "vs39", "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49", "vs50", "vs51"
|
||||
);
|
||||
|
||||
return index;
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
/*
 * Index of the complex element of x with the largest |re| + |im|.
 *
 * n      number of complex elements
 * x      interleaved (re, im) values
 * inc_x  stride between consecutive elements, counted in complex elements
 *
 * Returns 0 when n <= 0 or inc_x <= 0; otherwise the tracked index plus one.
 * The scalar scan uses a strict '>' comparison, so among equal values it
 * keeps the one it saw first.
 */
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    BLASLONG best = 0;      /* index of the current maximum (before the +1) */
    FLOAT best_val = 0;     /* |re| + |im| of the current maximum */
    BLASLONG k = 0;         /* element counter */
    BLASLONG pos = 0;       /* offset of element k inside x, in FLOATs */
    BLASLONG step;          /* FLOATs to advance per element */

    if (n <= 0 || inc_x <= 0) return best;

    if (inc_x == 1) {
        /* Contiguous data: give the largest multiple of 16 to the kernel. */
        const BLASLONG bulk = n & -16;

        step = 2;
        if (bulk > 0) {
            best = ziamax_kernel_16(bulk, x, &best_val);
            k = bulk;
            pos = bulk << 1;
        }
    } else {
        /* Strided data: seed with element 0 and scan from element 1. */
        step = 2 * inc_x;
        best_val = CABS1(x, 0);
        k = 1;
        pos = step;
    }

    /* Scalar scan over whatever the kernel did not cover. */
    for (; k < n; k++, pos += step) {
        const FLOAT cur = CABS1(x, pos);

        if (cur > best_val) {
            best = k;
            best_val = cur;
        }
    }

    return best + 1;
}
|
||||
|
||||
|
||||
361
kernel/power/izamin.c
Normal file
361
kernel/power/izamin.c
Normal file
@@ -0,0 +1,361 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2017, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
|
||||
#include "common.h"
|
||||
#include <math.h>
|
||||
|
||||
/* Absolute value used for FLOAT components in this file. */
#define ABS fabs

/*
 * |re| + |im| of the complex value stored at x[i], x[i+1].
 * The arguments and the whole expansion are parenthesized so the macro
 * behaves as a single operand inside larger expressions
 * (e.g. 2 * CABS1(x, i)) and accepts compound arguments.
 */
#define CABS1(x,i) (ABS((x)[(i)]) + ABS((x)[(i) + 1]))
|
||||
|
||||
|
||||
/**
|
||||
* Find minimum index
|
||||
* Warning: requirements n>0 and n % 16 == 0
|
||||
* @param n
|
||||
* @param x pointer to the vector
|
||||
* @param minf (out) minimum absolute value .( only for output )
|
||||
* @return minimum index
|
||||
*/
|
||||
static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) {
|
||||
|
||||
BLASLONG index;
|
||||
register __vector long long start = {1,0};
|
||||
register __vector long long temp_add_index = {2, 2};
|
||||
__asm__(
|
||||
|
||||
"lxvd2x 44, 0,%[ptr_tmp] \n\t"
|
||||
"lxvd2x 45, %[i16],%[ptr_tmp] \n\t"
|
||||
"lxvd2x 46, %[i32],%[ptr_tmp] \n\t"
|
||||
"lxvd2x 47, %[i48],%[ptr_tmp] \n\t"
|
||||
"lxvd2x 48, %[i64],%[ptr_tmp] \n\t"
|
||||
"lxvd2x 49, %[i80],%[ptr_tmp] \n\t"
|
||||
"lxvd2x 50, %[i96],%[ptr_tmp] \n\t"
|
||||
"lxvd2x 51,%[i112],%[ptr_tmp] \n\t"
|
||||
|
||||
"xxlor 40,%x[start],%x[start] \n\t" //{ 1,0} vs40 | v8
|
||||
"vaddudm 9,8,%[adder] \n\t" //{3,2} vs41
|
||||
"xxlxor 37,37 ,37 \n\t" //v5 v37 index_count
|
||||
"vaddudm 10,9,%[adder] \n\t" //{5,4} vs42
|
||||
"xxlxor 38 ,38 ,38 \n\t" // v6 | vs38 vec_min_index
|
||||
"vaddudm 11,10,%[adder] \n\t" //{7,6} vs43
|
||||
"lxvdsx 39,0,%[ptr_minf] \n\t" // vs39 vec_min_value
|
||||
"vaddudm 4,11, %[adder] \n\t" // {9,8} -{8;8} vs36 | v4
|
||||
"xxspltd 36,36,0 \n\t"
|
||||
|
||||
|
||||
|
||||
"xvabsdp 44, 44 \n\t"
|
||||
"xvabsdp 45, 45 \n\t"
|
||||
"xvabsdp 46, 46 \n\t"
|
||||
"xvabsdp 47, 47 \n\t"
|
||||
"xvabsdp 48, 48 \n\t"
|
||||
"xvabsdp 49, 49 \n\t"
|
||||
"xvabsdp 50, 50 \n\t"
|
||||
"xvabsdp 51, 51 \n\t"
|
||||
|
||||
//jump first half forward
|
||||
"b 2f \n\t"
|
||||
|
||||
".p2align 5 \n\t"
|
||||
"1: \n\t"
|
||||
|
||||
|
||||
"xxmrghd 0,44,45 \n\t"
|
||||
"xxmrgld 1,44,45 \n\t"
|
||||
"xxmrghd 2,46,47 \n\t"
|
||||
"xxmrgld 3,46,47 \n\t"
|
||||
"xxmrghd 4,48,49 \n\t"
|
||||
"xxmrgld 5,48,49 \n\t"
|
||||
"xxmrghd 44,50,51 \n\t"
|
||||
"xxmrgld 45,50,51 \n\t"
|
||||
|
||||
"xvadddp 46, 0,1 \n\t"
|
||||
"xvadddp 47, 2,3 \n\t"
|
||||
"xvadddp 48, 4,5 \n\t"
|
||||
"xvadddp 49, 44,45 \n\t"
|
||||
|
||||
|
||||
|
||||
"xvcmpgedp 50,46,47 \n\t "
|
||||
"xvcmpgedp 51,48,49 \n\t "
|
||||
|
||||
"addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t"
|
||||
|
||||
"xxsel 32,40,41,50 \n\t"
|
||||
"xxsel 0,46,47,50 \n\t"
|
||||
"xxsel 33,42,43,51 \n\t"
|
||||
"xxsel 1,48,49,51 \n\t"
|
||||
|
||||
"lxvd2x 44, 0,%[ptr_tmp] \n\t"
|
||||
"lxvd2x 45, %[i16],%[ptr_tmp] \n\t"
|
||||
|
||||
"xvcmpgedp 2,0,1 \n\t "
|
||||
"lxvd2x 46, %[i32],%[ptr_tmp] \n\t"
|
||||
"lxvd2x 47, %[i48],%[ptr_tmp] \n\t"
|
||||
|
||||
|
||||
"xxsel 32,32,33,2 \n\t"
|
||||
"xxsel 3,0,1,2 \n\t"
|
||||
|
||||
"vaddudm 0,0,5 \n\t"
|
||||
|
||||
//cmp with previous
|
||||
|
||||
"xvcmpgedp 4,39,3 \n\t "
|
||||
"vaddudm 5,5,4 \n\t"
|
||||
|
||||
"lxvd2x 48, %[i64],%[ptr_tmp] \n\t"
|
||||
"lxvd2x 49, %[i80],%[ptr_tmp] \n\t"
|
||||
"lxvd2x 50, %[i96],%[ptr_tmp] \n\t"
|
||||
"lxvd2x 51,%[i112],%[ptr_tmp] \n\t"
|
||||
//select with previous
|
||||
"xxsel 38,38,32,4 \n\t"
|
||||
"xxsel 39,39,3,4 \n\t"
|
||||
|
||||
|
||||
|
||||
|
||||
"xvabsdp 44, 44 \n\t"
|
||||
"xvabsdp 45, 45 \n\t"
|
||||
"xvabsdp 46, 46 \n\t"
|
||||
"xvabsdp 47, 47 \n\t"
|
||||
"xvabsdp 48, 48 \n\t"
|
||||
"xvabsdp 49, 49 \n\t"
|
||||
"xvabsdp 50, 50 \n\t"
|
||||
"xvabsdp 51, 51 \n\t"
|
||||
|
||||
|
||||
//>>/////////////////////////////// half start
|
||||
"2: \n\t"
|
||||
"xxmrghd 0,44,45 \n\t"
|
||||
"xxmrgld 1,44,45 \n\t"
|
||||
"xxmrghd 2,46,47 \n\t"
|
||||
"xxmrgld 3,46,47 \n\t"
|
||||
"xxmrghd 4,48,49 \n\t"
|
||||
"xxmrgld 5,48,49 \n\t"
|
||||
"xxmrghd 44,50,51 \n\t"
|
||||
"xxmrgld 45,50,51 \n\t"
|
||||
|
||||
"xvadddp 46, 0,1 \n\t"
|
||||
"xvadddp 47, 2,3 \n\t"
|
||||
"xvadddp 48, 4,5 \n\t"
|
||||
"xvadddp 49, 44,45 \n\t"
|
||||
|
||||
"xvcmpgedp 50,46,47 \n\t "
|
||||
"xvcmpgedp 51,48,49 \n\t "
|
||||
|
||||
"addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t"
|
||||
|
||||
"xxsel 32,40,41,50 \n\t"
|
||||
"xxsel 0,46,47,50 \n\t"
|
||||
"xxsel 33,42,43,51 \n\t"
|
||||
"xxsel 1,48,49,51 \n\t"
|
||||
|
||||
"lxvd2x 44, 0,%[ptr_tmp] \n\t"
|
||||
"lxvd2x 45, %[i16],%[ptr_tmp] \n\t"
|
||||
|
||||
"xvcmpgedp 2,0,1 \n\t "
|
||||
"lxvd2x 46, %[i32],%[ptr_tmp] \n\t"
|
||||
"lxvd2x 47, %[i48],%[ptr_tmp] \n\t"
|
||||
|
||||
|
||||
"xxsel 32,32,33,2 \n\t"
|
||||
"xxsel 3,0,1,2 \n\t"
|
||||
|
||||
"vaddudm 0,0,5 \n\t"
|
||||
|
||||
//cmp with previous
|
||||
|
||||
"xvcmpgedp 4,39,3 \n\t "
|
||||
"vaddudm 5,5,4 \n\t"
|
||||
|
||||
"lxvd2x 48, %[i64],%[ptr_tmp] \n\t"
|
||||
"lxvd2x 49, %[i80],%[ptr_tmp] \n\t"
|
||||
"lxvd2x 50, %[i96],%[ptr_tmp] \n\t"
|
||||
"lxvd2x 51,%[i112],%[ptr_tmp] \n\t"
|
||||
//select with previous
|
||||
"xxsel 38,38,32,4 \n\t"
|
||||
"xxsel 39,39,3,4 \n\t"
|
||||
|
||||
|
||||
"xvabsdp 44, 44 \n\t"
|
||||
"xvabsdp 45, 45 \n\t"
|
||||
"xvabsdp 46, 46 \n\t"
|
||||
"xvabsdp 47, 47 \n\t"
|
||||
"xvabsdp 48, 48 \n\t"
|
||||
"xvabsdp 49, 49 \n\t"
|
||||
"xvabsdp 50, 50 \n\t"
|
||||
"xvabsdp 51, 51 \n\t"
|
||||
|
||||
|
||||
//decrement n
|
||||
"addic. %[n], %[n], -16 \n\t"
|
||||
//Loop back if >0
|
||||
"bgt+ 1b \n\t"
|
||||
|
||||
|
||||
"xxmrghd 0,44,45 \n\t"
|
||||
"xxmrgld 1,44,45 \n\t"
|
||||
"xxmrghd 2,46,47 \n\t"
|
||||
"xxmrgld 3,46,47 \n\t"
|
||||
"xxmrghd 4,48,49 \n\t"
|
||||
"xxmrgld 5,48,49 \n\t"
|
||||
"xxmrghd 44,50,51 \n\t"
|
||||
"xxmrgld 45,50,51 \n\t"
|
||||
|
||||
"xvadddp 46, 0,1 \n\t"
|
||||
"xvadddp 47, 2,3 \n\t"
|
||||
"xvadddp 48, 4,5 \n\t"
|
||||
"xvadddp 49, 44,45 \n\t"
|
||||
|
||||
|
||||
|
||||
"xvcmpgedp 50,46,47 \n\t "
|
||||
"xvcmpgedp 51,48,49 \n\t "
|
||||
|
||||
"xxsel 32,40,41,50 \n\t"
|
||||
"xxsel 0,46,47,50 \n\t"
|
||||
"xxsel 33,42,43,51 \n\t"
|
||||
"xxsel 1,48,49,51 \n\t"
|
||||
|
||||
"xvcmpgedp 2,0,1 \n\t "
|
||||
"xxsel 32,32,33,2 \n\t"
|
||||
"xxsel 3,0,1,2 \n\t"
|
||||
|
||||
"vaddudm 0,0,5 \n\t"
|
||||
|
||||
"addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t"
|
||||
//cmp with previous
|
||||
|
||||
"xvcmpgedp 4,39,3 \n\t "
|
||||
"vaddudm 5,5,4 \n\t"
|
||||
"xxsel 38,38,32,4 \n\t"
|
||||
"xxsel 39,39,3,4 \n\t"
|
||||
|
||||
|
||||
///////extract min value and min index from vector
|
||||
|
||||
"xxspltd 32,38,1 \n\t"
|
||||
"xxspltd 40,39,1 \n\t"
|
||||
"xvcmpeqdp. 2, 40,39 \n\t"
|
||||
|
||||
//cr6 0 bit set if all true, cr6=4*6+bit_ind=24,0011at CR(BI)==1, at=10 hint that it occurs rarely
|
||||
//0b001110=14
|
||||
"bc 14,24, 3f \n\t"
|
||||
"xvcmpgedp 4,39, 40 \n\t"
|
||||
"xxsel 0,39,40,4 \n\t"
|
||||
"xxsel 1,38,32,4 \n\t"
|
||||
"stxsdx 0,0,%[ptr_minf] \n\t"
|
||||
"b 4f \n\t"
|
||||
|
||||
"3: \n\t"
|
||||
//if elements value are equal then choose minimum index
|
||||
"xxspltd 0,40,0 \n\t"
|
||||
"vminud 0,0,6 \n\t" //vs32 vs38
|
||||
"xxlor 1,32,32 \n\t"
|
||||
"stxsdx 0,0,%[ptr_minf] \n\t"
|
||||
|
||||
|
||||
"4: \n\t"
|
||||
"mfvsrd %[index],1 \n\t"
|
||||
|
||||
: [minf] "=m"(*minf),[ptr_tmp] "+&b"(x),[index] "=r"(index), [n] "+&r"(n)
|
||||
: [mem] "m"(*(const double (*)[2*n])x), [ptr_x] "b"(x), [ptr_minf] "b"(minf) ,
|
||||
[i16] "b"(16), [i32] "b"(32), [i48] "b"(48),
|
||||
[i64] "b"(64), [i80] "b"(80), [i96] "b"(96), [i112] "b"(112),
|
||||
[start] "v"(start), [adder] "v"(temp_add_index)
|
||||
: "cc", "vs0", "vs1","vs2","vs3", "vs4","vs5","vs32", "vs33", "vs34", "vs35", "vs36",
|
||||
"vs37", "vs38", "vs39", "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49", "vs50", "vs51"
|
||||
);
|
||||
|
||||
return index;
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
/*
 * Index of the complex element of x with the smallest |re| + |im|.
 *
 * n      number of complex elements
 * x      interleaved (re, im) values
 * inc_x  stride between consecutive elements, counted in complex elements
 *
 * Returns 0 when n <= 0 or inc_x <= 0; otherwise the tracked index plus one.
 * The scalar scan uses a strict '<' comparison, so among equal values it
 * keeps the one it saw first.
 */
BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    BLASLONG best = 0;      /* index of the current minimum (before the +1) */
    FLOAT best_val;         /* |re| + |im| of the current minimum */
    BLASLONG k = 0;         /* element counter */
    BLASLONG pos = 0;       /* offset of element k inside x, in FLOATs */
    BLASLONG step;          /* FLOATs to advance per element */

    if (n <= 0 || inc_x <= 0) return best;

    /* Both paths start from element 0 as the provisional minimum. */
    best_val = CABS1(x, 0);

    if (inc_x == 1) {
        /* Contiguous data: give the largest multiple of 16 to the kernel.
           The counters stay at 0 when the kernel is not used. */
        const BLASLONG bulk = n & -16;

        step = 2;
        if (bulk > 0) {
            best = ziamin_kernel_16_TUNED(bulk, x, &best_val);
            k = bulk;
            pos = bulk << 1;
        }
    } else {
        /* Strided data: element 0 is already accounted for. */
        step = 2 * inc_x;
        k = 1;
        pos = step;
    }

    /* Scalar scan over whatever the kernel did not cover. */
    for (; k < n; k++, pos += step) {
        const FLOAT cur = CABS1(x, pos);

        if (cur < best_val) {
            best = k;
            best_val = cur;
        }
    }

    return best + 1;
}
|
||||
|
||||
|
||||
@@ -110,7 +110,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#include "sgemm_tcopy_macros_16_power8.S"
|
||||
|
||||
#define STACKSIZE 576
|
||||
#define STACKSIZE 144
|
||||
|
||||
PROLOGUE
|
||||
PROFCODE
|
||||
@@ -118,49 +118,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
addi SP, SP, -STACKSIZE
|
||||
li r0, 0
|
||||
|
||||
std r31, 144(SP)
|
||||
std r30, 152(SP)
|
||||
std r29, 160(SP)
|
||||
std r28, 168(SP)
|
||||
std r27, 176(SP)
|
||||
std r26, 184(SP)
|
||||
std r25, 192(SP)
|
||||
std r24, 200(SP)
|
||||
std r23, 208(SP)
|
||||
std r22, 216(SP)
|
||||
std r21, 224(SP)
|
||||
std r20, 232(SP)
|
||||
std r19, 240(SP)
|
||||
std r18, 248(SP)
|
||||
std r17, 256(SP)
|
||||
std r16, 264(SP)
|
||||
std r15, 272(SP)
|
||||
std r14, 280(SP)
|
||||
addi r11 ,SP, 288
|
||||
stvx v20, r11, r0
|
||||
addi r11, r11, 16
|
||||
stvx v21, r11, r0
|
||||
addi r11, r11, 16
|
||||
stvx v22, r11, r0
|
||||
addi r11, r11, 16
|
||||
stvx v23, r11, r0
|
||||
addi r11, r11, 16
|
||||
stvx v24, r11, r0
|
||||
addi r11, r11, 16
|
||||
stvx v25, r11, r0
|
||||
addi r11, r11, 16
|
||||
stvx v26, r11, r0
|
||||
addi r11, r11, 16
|
||||
stvx v27, r11, r0
|
||||
addi r11, r11, 16
|
||||
stvx v28, r11, r0
|
||||
addi r11, r11, 16
|
||||
stvx v29, r11, r0
|
||||
addi r11, r11, 16
|
||||
stvx v30, r11, r0
|
||||
addi r11, r11, 16
|
||||
stvx v31, r11, r0
|
||||
li r11, 0
|
||||
std r14, 0(SP)
|
||||
std r15, 8(SP)
|
||||
std r16, 16(SP)
|
||||
std r17, 24(SP)
|
||||
std r18, 32(SP)
|
||||
std r19, 40(SP)
|
||||
std r20, 48(SP)
|
||||
std r21, 56(SP)
|
||||
std r22, 64(SP)
|
||||
std r23, 72(SP)
|
||||
std r24, 80(SP)
|
||||
std r25, 88(SP)
|
||||
std r26, 96(SP)
|
||||
std r27, 104(SP)
|
||||
std r28, 112(SP)
|
||||
std r29, 120(SP)
|
||||
std r30, 128(SP)
|
||||
std r31, 136(SP)
|
||||
|
||||
cmpwi cr0, M, 0
|
||||
ble- L999
|
||||
@@ -207,51 +182,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
L999:
|
||||
|
||||
li r3, 0
|
||||
|
||||
ld r31, 144(SP)
|
||||
ld r30, 152(SP)
|
||||
ld r29, 160(SP)
|
||||
ld r28, 168(SP)
|
||||
ld r27, 176(SP)
|
||||
ld r26, 184(SP)
|
||||
ld r25, 192(SP)
|
||||
ld r24, 200(SP)
|
||||
ld r23, 208(SP)
|
||||
ld r22, 216(SP)
|
||||
ld r21, 224(SP)
|
||||
ld r20, 232(SP)
|
||||
ld r19, 240(SP)
|
||||
ld r18, 248(SP)
|
||||
ld r17, 256(SP)
|
||||
ld r16, 264(SP)
|
||||
ld r15, 272(SP)
|
||||
ld r14, 280(SP)
|
||||
addi r11, SP, 288
|
||||
lvx v20, r11, r3
|
||||
addi r11, r11, 16
|
||||
lvx v21, r11, r3
|
||||
addi r11, r11, 16
|
||||
lvx v22, r11, r3
|
||||
addi r11, r11, 16
|
||||
lvx v23, r11, r3
|
||||
addi r11, r11, 16
|
||||
lvx v24, r11, r3
|
||||
addi r11, r11, 16
|
||||
lvx v25, r11, r3
|
||||
addi r11, r11, 16
|
||||
lvx v26, r11, r3
|
||||
addi r11, r11, 16
|
||||
lvx v27, r11, r3
|
||||
addi r11, r11, 16
|
||||
lvx v28, r11, r3
|
||||
addi r11, r11, 16
|
||||
lvx v29, r11, r3
|
||||
addi r11, r11, 16
|
||||
lvx v30, r11, r3
|
||||
addi r11, r11, 16
|
||||
lvx v31, r11, r3
|
||||
li r11, 0
|
||||
ld r14, 0(SP)
|
||||
ld r15, 8(SP)
|
||||
ld r16, 16(SP)
|
||||
ld r17, 24(SP)
|
||||
ld r18, 32(SP)
|
||||
ld r19, 40(SP)
|
||||
ld r20, 48(SP)
|
||||
ld r21, 56(SP)
|
||||
ld r22, 64(SP)
|
||||
ld r23, 72(SP)
|
||||
ld r24, 80(SP)
|
||||
ld r25, 88(SP)
|
||||
ld r26, 96(SP)
|
||||
ld r27, 104(SP)
|
||||
ld r28, 112(SP)
|
||||
ld r29, 120(SP)
|
||||
ld r30, 128(SP)
|
||||
ld r31, 136(SP)
|
||||
|
||||
addi SP, SP, STACKSIZE
|
||||
blr
|
||||
|
||||
@@ -110,57 +110,32 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#include "sgemm_tcopy_macros_8_power8.S"
|
||||
|
||||
#define STACKSIZE 576
|
||||
#define STACKSIZE 144
|
||||
|
||||
PROLOGUE
|
||||
PROFCODE
|
||||
|
||||
addi SP, SP, -STACKSIZE
|
||||
li r0, 0
|
||||
std r14, 0(SP)
|
||||
std r15, 8(SP)
|
||||
std r16, 16(SP)
|
||||
std r17, 24(SP)
|
||||
std r18, 32(SP)
|
||||
std r19, 40(SP)
|
||||
std r20, 48(SP)
|
||||
std r21, 56(SP)
|
||||
std r22, 64(SP)
|
||||
std r23, 72(SP)
|
||||
std r24, 80(SP)
|
||||
std r25, 88(SP)
|
||||
std r26, 96(SP)
|
||||
std r27, 104(SP)
|
||||
std r28, 112(SP)
|
||||
std r29, 120(SP)
|
||||
std r30, 128(SP)
|
||||
std r31, 136(SP)
|
||||
|
||||
std r31, 144(SP)
|
||||
std r30, 152(SP)
|
||||
std r29, 160(SP)
|
||||
std r28, 168(SP)
|
||||
std r27, 176(SP)
|
||||
std r26, 184(SP)
|
||||
std r25, 192(SP)
|
||||
std r24, 200(SP)
|
||||
std r23, 208(SP)
|
||||
std r22, 216(SP)
|
||||
std r21, 224(SP)
|
||||
std r20, 232(SP)
|
||||
std r19, 240(SP)
|
||||
std r18, 248(SP)
|
||||
std r17, 256(SP)
|
||||
std r16, 264(SP)
|
||||
std r15, 272(SP)
|
||||
std r14, 280(SP)
|
||||
addi r11, SP, 288
|
||||
stvx v20, r11, r0
|
||||
addi r11, r11, 16
|
||||
stvx v21, r11, r0
|
||||
addi r11, r11, 16
|
||||
stvx v22, r11, r0
|
||||
addi r11, r11, 16
|
||||
stvx v23, r11, r0
|
||||
addi r11, r11, 16
|
||||
stvx v24, r11, r0
|
||||
addi r11, r11, 16
|
||||
stvx v25, r11, r0
|
||||
addi r11, r11, 16
|
||||
stvx v26, r11, r0
|
||||
addi r11, r11, 16
|
||||
stvx v27, r11, r0
|
||||
addi r11, r11, 16
|
||||
stvx v28, r11, r0
|
||||
addi r11, r11, 16
|
||||
stvx v29, r11, r0
|
||||
addi r11, r11, 16
|
||||
stvx v30, r11, r0
|
||||
addi r11, r11, 16
|
||||
stvx v31, r11, r0
|
||||
li r11, 0
|
||||
|
||||
cmpwi cr0, M, 0
|
||||
ble- L999
|
||||
@@ -202,51 +177,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
L999:
|
||||
|
||||
li r3, 0
|
||||
|
||||
ld r31, 144(SP)
|
||||
ld r30, 152(SP)
|
||||
ld r29, 160(SP)
|
||||
ld r28, 168(SP)
|
||||
ld r27, 176(SP)
|
||||
ld r26, 184(SP)
|
||||
ld r25, 192(SP)
|
||||
ld r24, 200(SP)
|
||||
ld r23, 208(SP)
|
||||
ld r22, 216(SP)
|
||||
ld r21, 224(SP)
|
||||
ld r20, 232(SP)
|
||||
ld r19, 240(SP)
|
||||
ld r18, 248(SP)
|
||||
ld r17, 256(SP)
|
||||
ld r16, 264(SP)
|
||||
ld r15, 272(SP)
|
||||
ld r14, 280(SP)
|
||||
addi r11,SP,288
|
||||
lvx v20, r11, r3
|
||||
addi r11, r11, 16
|
||||
lvx v21, r11, r3
|
||||
addi r11, r11, 16
|
||||
lvx v22, r11, r3
|
||||
addi r11, r11, 16
|
||||
lvx v23, r11, r3
|
||||
addi r11, r11, 16
|
||||
lvx v24, r11, r3
|
||||
addi r11, r11, 16
|
||||
lvx v25, r11, r3
|
||||
addi r11, r11, 16
|
||||
lvx v26, r11, r3
|
||||
addi r11, r11, 16
|
||||
lvx v27, r11, r3
|
||||
addi r11, r11, 16
|
||||
lvx v28, r11, r3
|
||||
addi r11, r11, 16
|
||||
lvx v29, r11, r3
|
||||
addi r11, r11, 16
|
||||
lvx v30, r11, r3
|
||||
addi r11, r11, 16
|
||||
lvx v31, r11, r3
|
||||
li r11, 0
|
||||
ld r14, 0(SP)
|
||||
ld r15, 8(SP)
|
||||
ld r16, 16(SP)
|
||||
ld r17, 24(SP)
|
||||
ld r18, 32(SP)
|
||||
ld r19, 40(SP)
|
||||
ld r20, 48(SP)
|
||||
ld r21, 56(SP)
|
||||
ld r22, 64(SP)
|
||||
ld r23, 72(SP)
|
||||
ld r24, 80(SP)
|
||||
ld r25, 88(SP)
|
||||
ld r26, 96(SP)
|
||||
ld r27, 104(SP)
|
||||
ld r28, 112(SP)
|
||||
ld r29, 120(SP)
|
||||
ld r30, 128(SP)
|
||||
ld r31, 136(SP)
|
||||
|
||||
addi SP, SP, STACKSIZE
|
||||
blr
|
||||
|
||||
@@ -109,8 +109,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#include "zgemm_tcopy_macros_8_power8.S"
|
||||
|
||||
#define STACKSIZE 384
|
||||
#define STACKSIZE 576
|
||||
#define STACKSIZE 144
|
||||
|
||||
|
||||
|
||||
PROLOGUE
|
||||
@@ -119,49 +119,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
addi SP, SP, -STACKSIZE
|
||||
li r0, 0
|
||||
|
||||
std r31, 144(SP)
|
||||
std r30, 152(SP)
|
||||
std r29, 160(SP)
|
||||
std r28, 168(SP)
|
||||
std r27, 176(SP)
|
||||
std r26, 184(SP)
|
||||
std r25, 192(SP)
|
||||
std r24, 200(SP)
|
||||
std r23, 208(SP)
|
||||
std r22, 216(SP)
|
||||
std r21, 224(SP)
|
||||
std r20, 232(SP)
|
||||
std r19, 240(SP)
|
||||
std r18, 248(SP)
|
||||
std r17, 256(SP)
|
||||
std r16, 264(SP)
|
||||
std r15, 272(SP)
|
||||
std r14, 280(SP)
|
||||
addi r11, SP ,288
|
||||
stvx v20, r11, r0
|
||||
addi r11, r11, 16
|
||||
stvx v21, r11, r0
|
||||
addi r11, r11, 16
|
||||
stvx v22, r11, r0
|
||||
addi r11, r11, 16
|
||||
stvx v23, r11, r0
|
||||
addi r11, r11, 16
|
||||
stvx v24, r11, r0
|
||||
addi r11, r11, 16
|
||||
stvx v25, r11, r0
|
||||
addi r11, r11, 16
|
||||
stvx v26, r11, r0
|
||||
addi r11, r11, 16
|
||||
stvx v27, r11, r0
|
||||
addi r11, r11, 16
|
||||
stvx v28, r11, r0
|
||||
addi r11, r11, 16
|
||||
stvx v29, r11, r0
|
||||
addi r11, r11, 16
|
||||
stvx v30, r11, r0
|
||||
addi r11, r11 ,16
|
||||
stvx v31, r11, r0
|
||||
li r11,0
|
||||
std r14, 0(SP)
|
||||
std r15, 8(SP)
|
||||
std r16, 16(SP)
|
||||
std r17, 24(SP)
|
||||
std r18, 32(SP)
|
||||
std r19, 40(SP)
|
||||
std r20, 48(SP)
|
||||
std r21, 56(SP)
|
||||
std r22, 64(SP)
|
||||
std r23, 72(SP)
|
||||
std r24, 80(SP)
|
||||
std r25, 88(SP)
|
||||
std r26, 96(SP)
|
||||
std r27, 104(SP)
|
||||
std r28, 112(SP)
|
||||
std r29, 120(SP)
|
||||
std r30, 128(SP)
|
||||
std r31, 136(SP)
|
||||
|
||||
|
||||
cmpwi cr0, M, 0
|
||||
ble- L999
|
||||
@@ -204,49 +180,24 @@ L999:
|
||||
|
||||
li r3, 0
|
||||
|
||||
ld r31, 144(SP)
|
||||
ld r30, 152(SP)
|
||||
ld r29, 160(SP)
|
||||
ld r28, 168(SP)
|
||||
ld r27, 176(SP)
|
||||
ld r26, 184(SP)
|
||||
ld r25, 192(SP)
|
||||
ld r24, 200(SP)
|
||||
ld r23, 208(SP)
|
||||
ld r22, 216(SP)
|
||||
ld r21, 224(SP)
|
||||
ld r20, 232(SP)
|
||||
ld r19, 240(SP)
|
||||
ld r18, 248(SP)
|
||||
ld r17, 256(SP)
|
||||
ld r16, 264(SP)
|
||||
ld r15, 272(SP)
|
||||
ld r14, 280(SP)
|
||||
addi r11, SP, 288
|
||||
lvx v20, r11,r3
|
||||
addi r11, r11, 16
|
||||
lvx v21, r11, r3
|
||||
addi r11, r11, 16
|
||||
lvx v22, r11, r3
|
||||
addi r11, r11, 16
|
||||
lvx v23, r11, r3
|
||||
addi r11, r11, 16
|
||||
lvx v24, r11, r3
|
||||
addi r11, r11, 16
|
||||
lvx v25, r11, r3
|
||||
addi r11, r11, 16
|
||||
lvx v26, r11, r3
|
||||
addi r11, r11, 16
|
||||
lvx v27, r11, r3
|
||||
addi r11, r11, 16
|
||||
lvx v28, r11, r3
|
||||
addi r11, r11, 16
|
||||
lvx v29, r11, r3
|
||||
addi r11, r11, 16
|
||||
lvx v30, r11, r3
|
||||
addi r11, r11, 16
|
||||
lvx v31, r11, r3
|
||||
li r11,0
|
||||
ld r14, 0(SP)
|
||||
ld r15, 8(SP)
|
||||
ld r16, 16(SP)
|
||||
ld r17, 24(SP)
|
||||
ld r18, 32(SP)
|
||||
ld r19, 40(SP)
|
||||
ld r20, 48(SP)
|
||||
ld r21, 56(SP)
|
||||
ld r22, 64(SP)
|
||||
ld r23, 72(SP)
|
||||
ld r24, 80(SP)
|
||||
ld r25, 88(SP)
|
||||
ld r26, 96(SP)
|
||||
ld r27, 104(SP)
|
||||
ld r28, 112(SP)
|
||||
ld r29, 120(SP)
|
||||
ld r30, 128(SP)
|
||||
ld r31, 136(SP)
|
||||
|
||||
addi SP, SP, STACKSIZE
|
||||
blr
|
||||
|
||||
@@ -72,23 +72,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
lxvd2x vs51, o48, A2
|
||||
addi A2, A2, 64
|
||||
|
||||
lxvd2x vs52, o0, A2
|
||||
lxvd2x vs53, o16, A2
|
||||
lxvd2x vs54, o32, A2
|
||||
lxvd2x vs55, o48, A2
|
||||
lxvd2x vs2, o0, A2
|
||||
lxvd2x vs3, o16, A2
|
||||
lxvd2x vs4, o32, A2
|
||||
lxvd2x vs5, o48, A2
|
||||
addi A2, A2, 64
|
||||
|
||||
|
||||
lxvd2x vs56, o0, A3
|
||||
lxvd2x vs57, o16, A3
|
||||
lxvd2x vs58, o32, A3
|
||||
lxvd2x vs59, o48, A3
|
||||
lxvd2x vs6, o0, A3
|
||||
lxvd2x vs7, o16, A3
|
||||
lxvd2x vs8, o32, A3
|
||||
lxvd2x vs9, o48, A3
|
||||
addi A3, A3, 64
|
||||
|
||||
lxvd2x vs60, o0, A3
|
||||
lxvd2x vs61, o16, A3
|
||||
lxvd2x vs62, o32, A3
|
||||
lxvd2x vs63, o48, A3
|
||||
lxvd2x vs10, o0, A3
|
||||
lxvd2x vs11, o16, A3
|
||||
lxvd2x vs12, o32, A3
|
||||
lxvd2x vs13, o48, A3
|
||||
addi A3, A3, 64
|
||||
|
||||
|
||||
@@ -126,23 +126,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
stxvd2x vs51, o48, T1
|
||||
addi T1, T1, 64
|
||||
|
||||
stxvd2x vs52, o0, T1
|
||||
stxvd2x vs53, o16, T1
|
||||
stxvd2x vs54, o32, T1
|
||||
stxvd2x vs55, o48, T1
|
||||
stxvd2x vs2, o0, T1
|
||||
stxvd2x vs3, o16, T1
|
||||
stxvd2x vs4, o32, T1
|
||||
stxvd2x vs5, o48, T1
|
||||
|
||||
addi T1, T1, 64
|
||||
|
||||
stxvd2x vs56, o0, T1
|
||||
stxvd2x vs57, o16, T1
|
||||
stxvd2x vs58, o32, T1
|
||||
stxvd2x vs59, o48, T1
|
||||
stxvd2x vs6, o0, T1
|
||||
stxvd2x vs7, o16, T1
|
||||
stxvd2x vs8, o32, T1
|
||||
stxvd2x vs9, o48, T1
|
||||
addi T1, T1, 64
|
||||
|
||||
stxvd2x vs60, o0, T1
|
||||
stxvd2x vs61, o16, T1
|
||||
stxvd2x vs62, o32, T1
|
||||
stxvd2x vs63, o48, T1
|
||||
stxvd2x vs10, o0, T1
|
||||
stxvd2x vs11, o16, T1
|
||||
stxvd2x vs12, o32, T1
|
||||
stxvd2x vs13, o48, T1
|
||||
|
||||
.endm
|
||||
|
||||
|
||||
958
kernel/power/zgemv_n_4.c
Normal file
958
kernel/power/zgemv_n_4.c
Normal file
@@ -0,0 +1,958 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2018, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include "common.h"
|
||||
|
||||
#define HAVE_KERNEL_4x4_VEC 1
|
||||
#define HAVE_KERNEL_4x2_VEC 1
|
||||
#define HAVE_KERNEL_4x1_VEC 1
|
||||
#define HAVE_KERNEL_ADDY 1
|
||||
|
||||
#if defined(HAVE_KERNEL_4x4_VEC) || defined(HAVE_KERNEL_4x2_VEC) || defined(HAVE_KERNEL_4x1_VEC)
|
||||
#include <altivec.h>
|
||||
#endif
|
||||
|
||||
//
|
||||
#define NBMAX 4096
|
||||
|
||||
#ifdef HAVE_KERNEL_4x4_VEC_ASM
|
||||
|
||||
#elif HAVE_KERNEL_4x4_VEC
|
||||
|
||||
/*
 * y += A(:, 0..3) * x(0..3) for complex doubles, using 2-double vectors.
 *
 * n    number of complex rows; the loop consumes four rows per trip
 * lda  distance between consecutive columns of ap, in FLOATs
 * ap   first of four columns, interleaved (re, im)
 * x    four complex multipliers, interleaved (re, im)
 * y    accumulator vector, interleaved (re, im), updated in place
 *
 * Each row is accumulated in two passes: the column entry times the
 * broadcast "r" multiplier, then the doubleword-swapped entry times the
 * "i" multiplier. The CONJ/XCONJ setting decides which lane of which
 * multiplier carries the minus sign.
 */
static void zgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) {

    FLOAT *col0 = ap;
    FLOAT *col1 = col0 + lda;
    FLOAT *col2 = col1 + lda;
    FLOAT *col3 = col2 + lda;

#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )

    const __vector double x0r = {x[0], x[0]};
    const __vector double x0i = {-x[1], x[1]};
    const __vector double x1r = {x[2], x[2]};
    const __vector double x1i = {-x[3], x[3]};
    const __vector double x2r = {x[4], x[4]};
    const __vector double x2i = {-x[5], x[5]};
    const __vector double x3r = {x[6], x[6]};
    const __vector double x3i = {-x[7], x[7]};

#else

    const __vector double x0r = {x[0], -x[0]};
    const __vector double x0i = {x[1], x[1]};
    const __vector double x1r = {x[2], -x[2]};
    const __vector double x1i = {x[3], x[3]};
    const __vector double x2r = {x[4], -x[4]};
    const __vector double x2i = {x[5], x[5]};
    const __vector double x3r = {x[6], -x[6]};
    const __vector double x3i = {x[7], x[7]};

#endif

    /* One vector element == one complex value. */
    __vector double *yv = (__vector double *) y;
    __vector double *p0 = (__vector double *) col0;
    __vector double *p1 = (__vector double *) col1;
    __vector double *p2 = (__vector double *) col2;
    __vector double *p3 = (__vector double *) col3;

    BLASLONG i;

    for (i = 0; i < n; i += 4) {

        /* Load four rows of y and of each column before any store. */
        __vector double acc0 = yv[i];
        __vector double c0l0 = p0[i];
        __vector double c1l0 = p1[i];
        __vector double c2l0 = p2[i];
        __vector double c3l0 = p3[i];

        __vector double acc1 = yv[i + 1];
        __vector double c0l1 = p0[i + 1];
        __vector double c1l1 = p1[i + 1];
        __vector double c2l1 = p2[i + 1];
        __vector double c3l1 = p3[i + 1];

        __vector double acc2 = yv[i + 2];
        __vector double c0l2 = p0[i + 2];
        __vector double c1l2 = p1[i + 2];
        __vector double c2l2 = p2[i + 2];
        __vector double c3l2 = p3[i + 2];

        __vector double acc3 = yv[i + 3];
        __vector double c0l3 = p0[i + 3];
        __vector double c1l3 = p1[i + 3];
        __vector double c2l3 = p2[i + 3];
        __vector double c3l3 = p3[i + 3];

        /* Pass 1: unswapped entries times the "r" multipliers,
           column 0 through column 3 for every accumulator. */
        acc0 += c0l0 * x0r;
        acc1 += c0l1 * x0r;
        acc2 += c0l2 * x0r;
        acc3 += c0l3 * x0r;

        acc0 += c1l0 * x1r;
        acc1 += c1l1 * x1r;
        acc2 += c1l2 * x1r;
        acc3 += c1l3 * x1r;

        acc0 += c2l0 * x2r;
        acc1 += c2l1 * x2r;
        acc2 += c2l2 * x2r;
        acc3 += c2l3 * x2r;

        acc0 += c3l0 * x3r;
        acc1 += c3l1 * x3r;
        acc2 += c3l2 * x3r;
        acc3 += c3l3 * x3r;

        /* Swap the two doublewords of every column entry. */
        c0l0 = vec_xxpermdi(c0l0, c0l0, 2);
        c0l1 = vec_xxpermdi(c0l1, c0l1, 2);
        c0l2 = vec_xxpermdi(c0l2, c0l2, 2);
        c0l3 = vec_xxpermdi(c0l3, c0l3, 2);

        c1l0 = vec_xxpermdi(c1l0, c1l0, 2);
        c1l1 = vec_xxpermdi(c1l1, c1l1, 2);
        c1l2 = vec_xxpermdi(c1l2, c1l2, 2);
        c1l3 = vec_xxpermdi(c1l3, c1l3, 2);

        c2l0 = vec_xxpermdi(c2l0, c2l0, 2);
        c2l1 = vec_xxpermdi(c2l1, c2l1, 2);
        c2l2 = vec_xxpermdi(c2l2, c2l2, 2);
        c2l3 = vec_xxpermdi(c2l3, c2l3, 2);

        c3l0 = vec_xxpermdi(c3l0, c3l0, 2);
        c3l1 = vec_xxpermdi(c3l1, c3l1, 2);
        c3l2 = vec_xxpermdi(c3l2, c3l2, 2);
        c3l3 = vec_xxpermdi(c3l3, c3l3, 2);

        /* Pass 2: swapped entries times the "i" multipliers,
           again column 0 through column 3. */
        acc0 += c0l0 * x0i;
        acc1 += c0l1 * x0i;
        acc2 += c0l2 * x0i;
        acc3 += c0l3 * x0i;

        acc0 += c1l0 * x1i;
        acc1 += c1l1 * x1i;
        acc2 += c1l2 * x1i;
        acc3 += c1l3 * x1i;

        acc0 += c2l0 * x2i;
        acc1 += c2l1 * x2i;
        acc2 += c2l2 * x2i;
        acc3 += c2l3 * x2i;

        acc0 += c3l0 * x3i;
        acc1 += c3l1 * x3i;
        acc2 += c3l2 * x3i;
        acc3 += c3l3 * x3i;

        /* Write the four updated rows back. */
        yv[i] = acc0;
        yv[i + 1] = acc1;
        yv[i + 2] = acc2;
        yv[i + 3] = acc3;
    }
}
|
||||
#else
|
||||
|
||||
/*
 * Portable fallback used when no vector 4x4 kernel is compiled in.
 *
 * For each of the n complex rows, accumulates into y the products of four
 * consecutive columns of A with the four complex scalars stored in x[0..7].
 * The columns start at ap and are lda FLOATs apart; all data is interleaved
 * as (real, imaginary) pairs.  The CONJ / XCONJ macros select the sign
 * pattern of the complex multiplication.
 */
static void zgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) {
    FLOAT *col0 = ap;
    FLOAT *col1 = col0 + lda;
    FLOAT *col2 = col1 + lda;
    FLOAT *col3 = col2 + lda;
    BLASLONG row;

    for (row = 0; row < n; row++) {
        /* offsets of the real and imaginary parts of this row */
        const BLASLONG re = 2 * row;
        const BLASLONG im = re + 1;

#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
        y[re] += col0[re] * x[0] - col0[im] * x[1];
        y[im] += col0[re] * x[1] + col0[im] * x[0];
        y[re] += col1[re] * x[2] - col1[im] * x[3];
        y[im] += col1[re] * x[3] + col1[im] * x[2];
        y[re] += col2[re] * x[4] - col2[im] * x[5];
        y[im] += col2[re] * x[5] + col2[im] * x[4];
        y[re] += col3[re] * x[6] - col3[im] * x[7];
        y[im] += col3[re] * x[7] + col3[im] * x[6];
#else
        y[re] += col0[re] * x[0] + col0[im] * x[1];
        y[im] += col0[re] * x[1] - col0[im] * x[0];
        y[re] += col1[re] * x[2] + col1[im] * x[3];
        y[im] += col1[re] * x[3] - col1[im] * x[2];
        y[re] += col2[re] * x[4] + col2[im] * x[5];
        y[im] += col2[re] * x[5] - col2[im] * x[4];
        y[re] += col3[re] * x[6] + col3[im] * x[7];
        y[im] += col3[re] * x[7] - col3[im] * x[6];
#endif
    }
}
|
||||
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_KERNEL_4x2_VEC
|
||||
|
||||
/*
 * VSX kernel: y[0..n) += A[:, 0..1] * x[0..1] on interleaved complex doubles.
 *
 * Each __vector double holds one complex value as {real, imaginary}.  The
 * product is formed in two passes per column: first the column element times
 * a vector built from the real part of x, then the lane-swapped column
 * element (vec_xxpermdi(v, v, 2)) times a vector built from the imaginary
 * part of x.  The signs baked into those x vectors implement the CONJ/XCONJ
 * variants.  Four rows are processed per iteration, so the loop touches rows
 * in groups of four starting at 0 while the index is below n.
 */
static void zgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
    const __vector double x0_re = {x[0], x[0]};
    const __vector double x0_im = {-x[1], x[1]};
    const __vector double x1_re = {x[2], x[2]};
    const __vector double x1_im = {-x[3], x[3]};
#else
    const __vector double x0_re = {x[0], -x[0]};
    const __vector double x0_im = {x[1], x[1]};
    const __vector double x1_re = {x[2], -x[2]};
    const __vector double x1_im = {x[3], x[3]};
#endif

    __vector double *out = (__vector double *) y;
    __vector double *col0 = (__vector double *) ap;
    __vector double *col1 = (__vector double *) (ap + lda);
    BLASLONG row;

    for (row = 0; row < n; row += 4) {
        __vector double acc0 = out[row];
        __vector double acc1 = out[row + 1];
        __vector double acc2 = out[row + 2];
        __vector double acc3 = out[row + 3];

        __vector double p0 = col0[row];
        __vector double p1 = col0[row + 1];
        __vector double p2 = col0[row + 2];
        __vector double p3 = col0[row + 3];

        __vector double q0 = col1[row];
        __vector double q1 = col1[row + 1];
        __vector double q2 = col1[row + 2];
        __vector double q3 = col1[row + 3];

        /* contributions of the real parts of x */
        acc0 += p0 * x0_re;
        acc1 += p1 * x0_re;
        acc2 += p2 * x0_re;
        acc3 += p3 * x0_re;

        acc0 += q0 * x1_re;
        acc1 += q1 * x1_re;
        acc2 += q2 * x1_re;
        acc3 += q3 * x1_re;

        /* swap {re, im} -> {im, re} for the imaginary-part pass */
        p0 = vec_xxpermdi(p0, p0, 2);
        p1 = vec_xxpermdi(p1, p1, 2);
        p2 = vec_xxpermdi(p2, p2, 2);
        p3 = vec_xxpermdi(p3, p3, 2);

        q0 = vec_xxpermdi(q0, q0, 2);
        q1 = vec_xxpermdi(q1, q1, 2);
        q2 = vec_xxpermdi(q2, q2, 2);
        q3 = vec_xxpermdi(q3, q3, 2);

        /* contributions of the imaginary parts of x */
        acc0 += p0 * x0_im;
        acc1 += p1 * x0_im;
        acc2 += p2 * x0_im;
        acc3 += p3 * x0_im;

        acc0 += q0 * x1_im;
        acc1 += q1 * x1_im;
        acc2 += q2 * x1_im;
        acc3 += q3 * x1_im;

        out[row] = acc0;
        out[row + 1] = acc1;
        out[row + 2] = acc2;
        out[row + 3] = acc3;
    }
}
|
||||
#else
|
||||
|
||||
/*
 * Portable fallback used when no vector 4x2 kernel is compiled in.
 *
 * For each of the n complex rows, accumulates into y the products of two
 * consecutive columns of A (at ap and ap + lda) with the two complex scalars
 * stored in x[0..3].  Data is interleaved (real, imaginary); CONJ / XCONJ
 * select the sign pattern of the complex multiplication.
 */
static void zgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) {
    FLOAT *col0 = ap;
    FLOAT *col1 = ap + lda;
    BLASLONG row;

    for (row = 0; row < n; row++) {
        /* offsets of the real and imaginary parts of this row */
        const BLASLONG re = 2 * row;
        const BLASLONG im = re + 1;

#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
        y[re] += col0[re] * x[0] - col0[im] * x[1];
        y[im] += col0[re] * x[1] + col0[im] * x[0];
        y[re] += col1[re] * x[2] - col1[im] * x[3];
        y[im] += col1[re] * x[3] + col1[im] * x[2];
#else
        y[re] += col0[re] * x[0] + col0[im] * x[1];
        y[im] += col0[re] * x[1] - col0[im] * x[0];
        y[re] += col1[re] * x[2] + col1[im] * x[3];
        y[im] += col1[re] * x[3] - col1[im] * x[2];
#endif
    }
}
|
||||
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_KERNEL_4x1_VEC
|
||||
|
||||
/*
 * VSX kernel: y[0..n) += A[:, 0] * x[0] on interleaved complex doubles.
 *
 * Each __vector double holds one complex value as {real, imaginary}.  The
 * column element is first multiplied by a vector built from the real part
 * of x, then lane-swapped (vec_xxpermdi(v, v, 2)) and multiplied by a vector
 * built from the imaginary part of x; the signs in those vectors implement
 * the CONJ/XCONJ variants.  Four rows are processed per iteration.
 */
static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
    const __vector double x0_re = {x[0], x[0]};
    const __vector double x0_im = {-x[1], x[1]};
#else
    const __vector double x0_re = {x[0], -x[0]};
    const __vector double x0_im = {x[1], x[1]};
#endif

    __vector double *out = (__vector double *) y;
    __vector double *col0 = (__vector double *) ap;
    BLASLONG row;

    for (row = 0; row < n; row += 4) {
        __vector double acc0 = out[row];
        __vector double acc1 = out[row + 1];
        __vector double acc2 = out[row + 2];
        __vector double acc3 = out[row + 3];

        __vector double p0 = col0[row];
        __vector double p1 = col0[row + 1];
        __vector double p2 = col0[row + 2];
        __vector double p3 = col0[row + 3];

        /* contribution of the real part of x */
        acc0 += p0 * x0_re;
        acc1 += p1 * x0_re;
        acc2 += p2 * x0_re;
        acc3 += p3 * x0_re;

        /* swap {re, im} -> {im, re} for the imaginary-part pass */
        p0 = vec_xxpermdi(p0, p0, 2);
        p1 = vec_xxpermdi(p1, p1, 2);
        p2 = vec_xxpermdi(p2, p2, 2);
        p3 = vec_xxpermdi(p3, p3, 2);

        /* contribution of the imaginary part of x */
        acc0 += p0 * x0_im;
        acc1 += p1 * x0_im;
        acc2 += p2 * x0_im;
        acc3 += p3 * x0_im;

        out[row] = acc0;
        out[row + 1] = acc1;
        out[row + 2] = acc2;
        out[row + 3] = acc3;
    }
}
|
||||
|
||||
#else
|
||||
|
||||
/*
 * Portable fallback used when no vector 4x1 kernel is compiled in.
 *
 * For each of the n complex rows, accumulates into y the product of one
 * column of A (at ap) with the complex scalar stored in x[0..1].  Data is
 * interleaved (real, imaginary); CONJ / XCONJ select the sign pattern of the
 * complex multiplication.
 */
static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) {
    FLOAT *col0 = ap;
    BLASLONG row;

    for (row = 0; row < n; row++) {
        /* offsets of the real and imaginary parts of this row */
        const BLASLONG re = 2 * row;
        const BLASLONG im = re + 1;

#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
        y[re] += col0[re] * x[0] - col0[im] * x[1];
        y[im] += col0[re] * x[1] + col0[im] * x[0];
#else
        y[re] += col0[re] * x[0] + col0[im] * x[1];
        y[im] += col0[re] * x[1] - col0[im] * x[0];
#endif
    }
}
|
||||
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_KERNEL_ADDY
|
||||
|
||||
/*
 * VSX version of the final update: dest[k] += alpha * src[k] for k in [0, n),
 * where the XCONJ build uses conj(src[k]) instead of src[k].
 *
 *   n         number of complex elements; processed four at a time
 *   src       contiguous interleaved (real, imaginary) input
 *   dest      interleaved output, consecutive elements inc_dest FLOATs apart
 *   inc_dest  destination stride in FLOATs (the caller has already doubled
 *             the complex stride); 2 means contiguous
 *   alpha_r, alpha_i  real and imaginary parts of alpha
 *
 * All four destination elements of a group are loaded before any of them is
 * stored.  The contiguous case (inc_dest == 2) is simply the strided case
 * with a stride of one vector, so a single loop serves both.
 */
static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT alpha_r, FLOAT alpha_i) {
#if !defined(XCONJ)
    const __vector double valpha_r = {alpha_r, alpha_r};
    const __vector double valpha_i = {-alpha_i, alpha_i};
#else
    const __vector double valpha_r = {alpha_r, -alpha_r};
    const __vector double valpha_i = {alpha_i, alpha_i};
#endif

    __vector double *vptr_src = (__vector double *) src;

    /* inc_dest counts FLOATs, so it is added to the FLOAT pointer before the cast */
    __vector double *vptr_y0 = (__vector double *) dest;
    __vector double *vptr_y1 = (__vector double *) (dest + inc_dest);
    __vector double *vptr_y2 = (__vector double *) (dest + 2 * inc_dest);
    __vector double *vptr_y3 = (__vector double *) (dest + 3 * inc_dest);

    /*
     * Four elements advance dest by 4 * inc_dest FLOATs, i.e. 2 * inc_dest
     * vectors.  A multiplication is used instead of a shift because the
     * stride may be negative and left-shifting a negative value is undefined.
     */
    const BLASLONG add_dest = inc_dest * 2;
    BLASLONG dest_t = 0;
    BLASLONG i;

    for (i = 0; i < n; i += 4) {
        __vector double vy_0 = vptr_y0[dest_t];
        __vector double vy_1 = vptr_y1[dest_t];
        __vector double vy_2 = vptr_y2[dest_t];
        __vector double vy_3 = vptr_y3[dest_t];

        __vector double vsrc_0 = vptr_src[i];
        __vector double vsrc_1 = vptr_src[i + 1];
        __vector double vsrc_2 = vptr_src[i + 2];
        __vector double vsrc_3 = vptr_src[i + 3];

        /* contribution of the real part of alpha */
        vy_0 += vsrc_0 * valpha_r;
        vy_1 += vsrc_1 * valpha_r;
        vy_2 += vsrc_2 * valpha_r;
        vy_3 += vsrc_3 * valpha_r;

        /* swap {re, im} -> {im, re} for the imaginary-part pass */
        vsrc_0 = vec_xxpermdi(vsrc_0, vsrc_0, 2);
        vsrc_1 = vec_xxpermdi(vsrc_1, vsrc_1, 2);
        vsrc_2 = vec_xxpermdi(vsrc_2, vsrc_2, 2);
        vsrc_3 = vec_xxpermdi(vsrc_3, vsrc_3, 2);

        /* contribution of the imaginary part of alpha */
        vy_0 += vsrc_0 * valpha_i;
        vy_1 += vsrc_1 * valpha_i;
        vy_2 += vsrc_2 * valpha_i;
        vy_3 += vsrc_3 * valpha_i;

        vptr_y0[dest_t] = vy_0;
        vptr_y1[dest_t] = vy_1;
        vptr_y2[dest_t] = vy_2;
        vptr_y3[dest_t] = vy_3;

        dest_t += add_dest;
    }
}
|
||||
|
||||
#else
|
||||
|
||||
/*
 * Portable version of the final update: dest[k] += alpha * src[k] for k in
 * [0, n), where the XCONJ build uses conj(src[k]) instead of src[k].
 *
 *   n         number of complex elements
 *   src       contiguous interleaved (real, imaginary) input
 *   dest      interleaved output, consecutive elements inc_dest FLOATs apart
 *   inc_dest  destination stride in FLOATs; 2 means contiguous
 *   alpha_r, alpha_i  real and imaginary parts of alpha
 *
 * The strided path handles one element per iteration.  The contiguous path
 * handles four elements per iteration and computes all eight products
 * before touching dest.
 */
static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT alpha_r, FLOAT alpha_i) {
    BLASLONG k;

    if (inc_dest != 2) {
        for (k = 0; k < n; k++) {
            FLOAT prod_r;
            FLOAT prod_i;

#if !defined(XCONJ)
            prod_r = alpha_r * src[0] - alpha_i * src[1];
            prod_i = alpha_r * src[1] + alpha_i * src[0];
#else
            prod_r = alpha_r * src[0] + alpha_i * src[1];
            prod_i = -alpha_r * src[1] + alpha_i * src[0];
#endif

            dest[0] += prod_r;
            dest[1] += prod_i;

            src += 2;
            dest += inc_dest;
        }
        return;
    }

    for (k = 0; k < n; k += 4) {
        FLOAT prod[8];
        BLASLONG e;

        /* all products first, then the accumulation into dest */
        for (e = 0; e < 8; e += 2) {
#if !defined(XCONJ)
            prod[e] = alpha_r * src[e] - alpha_i * src[e + 1];
            prod[e + 1] = alpha_r * src[e + 1] + alpha_i * src[e];
#else
            prod[e] = alpha_r * src[e] + alpha_i * src[e + 1];
            prod[e + 1] = -alpha_r * src[e + 1] + alpha_i * src[e];
#endif
        }

        for (e = 0; e < 8; e++) {
            dest[e] += prod[e];
        }

        src += 8;
        dest += 8;
    }
}
|
||||
#endif
|
||||
|
||||
/*
 * Tail helper: accumulates one complex product a * x into (*acc_r, *acc_i).
 * a and x each point at an interleaved (real, imaginary) pair; CONJ / XCONJ
 * select the sign pattern.
 */
static inline void zgemv_n_mac(const FLOAT *a, const FLOAT *x, FLOAT *acc_r, FLOAT *acc_i) {
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
    *acc_r += a[0] * x[0] - a[1] * x[1];
    *acc_i += a[0] * x[1] + a[1] * x[0];
#else
    *acc_r += a[0] * x[0] + a[1] * x[1];
    *acc_i += a[0] * x[1] - a[1] * x[0];
#endif
}

/*
 * Tail helper: adds alpha * (t_r, t_i) to the complex value at y; the XCONJ
 * build uses the conjugate of (t_r, t_i).
 */
static inline void zgemv_n_axpy(FLOAT *y, FLOAT alpha_r, FLOAT alpha_i, FLOAT t_r, FLOAT t_i) {
#if !defined(XCONJ)
    y[0] += alpha_r * t_r - alpha_i * t_i;
    y[1] += alpha_r * t_i + alpha_i * t_r;
#else
    y[0] += alpha_r * t_r + alpha_i * t_i;
    y[1] -= alpha_r * t_i - alpha_i * t_r;
#endif
}

/*
 * Complex GEMV driver (non-transposed A): y += alpha * A * x, with
 * conjugation variants selected by CONJ / XCONJ.
 *
 *   m, n      rows and columns of A; returns immediately if either is < 1
 *   dummy1    unused
 *   alpha_r, alpha_i  real and imaginary parts of alpha
 *   a, lda    column-major matrix and its leading dimension (complex units)
 *   x, inc_x  input vector and its stride (complex units)
 *   y, inc_y  output vector and its stride (complex units)
 *   buffer    scratch space receiving up to NBMAX complex partial results
 *
 * Rows are processed in panels of at most NBMAX rows, each a multiple of 4:
 * the kernels accumulate A * x for the panel into the zeroed scratch buffer
 * and add_y then folds alpha * buffer into y.  The remaining m % 4 rows are
 * handled by scalar code at the end.  Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT * buffer) {
    BLASLONG i;
    BLASLONG k;
    FLOAT *a_ptr;
    FLOAT *x_ptr;
    FLOAT *y_ptr;

    BLASLONG n1;
    BLASLONG m1;
    BLASLONG m2;
    BLASLONG m3;
    BLASLONG n2;

    FLOAT xbuffer[8], *ybuffer;

    if (m < 1) return (0);
    if (n < 1) return (0);

    ybuffer = buffer;

    /* from here on strides and lda are expressed in FLOATs, not complex units */
    inc_x *= 2;
    inc_y *= 2;
    lda *= 2;

    n1 = n / 4;                     /* groups of four columns */
    n2 = n % 4;                     /* leftover columns */

    m3 = m % 4;                     /* leftover rows for the scalar tail */
    m1 = m - m3;                    /* rows handled by the kernels */
    m2 = (m % NBMAX) - m3;          /* size of the final, partial panel */

    y_ptr = y;

    BLASLONG NB = NBMAX;

    /* the loop ends after the first panel that is smaller than NBMAX */
    while (NB == NBMAX) {

        m1 -= NB;
        if (m1 < 0) {
            if (m2 == 0) break;
            NB = m2;
        }

        a_ptr = a;
        x_ptr = x;

        /* clear NB complex accumulators */
        memset(ybuffer, 0, NB * 2 * sizeof(FLOAT));

        if (inc_x == 2) {
            /* contiguous x: the kernels read it in place */
            for (i = 0; i < n1; i++) {
                zgemv_kernel_4x4(NB, lda, a_ptr, x_ptr, ybuffer);
                a_ptr += 4 * lda;
                x_ptr += 8;
            }

            if (n2 & 2) {
                zgemv_kernel_4x2(NB, lda, a_ptr, x_ptr, ybuffer);
                x_ptr += 4;
                a_ptr += 2 * lda;
            }

            if (n2 & 1) {
                zgemv_kernel_4x1(NB, a_ptr, x_ptr, ybuffer);
                x_ptr += 2;
                a_ptr += lda;
            }
        } else {
            /* strided x: gather the needed elements into xbuffer first */
            for (i = 0; i < n1; i++) {
                for (k = 0; k < 8; k += 2) {
                    xbuffer[k] = x_ptr[0];
                    xbuffer[k + 1] = x_ptr[1];
                    x_ptr += inc_x;
                }

                zgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, ybuffer);
                a_ptr += 4 * lda;
            }

            for (i = 0; i < n2; i++) {
                xbuffer[0] = x_ptr[0];
                xbuffer[1] = x_ptr[1];
                x_ptr += inc_x;
                zgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer);
                a_ptr += lda;
            }
        }

        add_y(NB, ybuffer, y_ptr, inc_y, alpha_r, alpha_i);
        a += 2 * NB;
        y_ptr += NB * inc_y;
    }

    if (m3 == 0) return (0);

    /*
     * Scalar tail for the last m3 rows.  Each case has a fast path for a
     * densely packed matrix (lda equal to the number of tail rows) with
     * contiguous x, and a general strided path.
     */
    if (m3 == 1) {
        a_ptr = a;
        x_ptr = x;
        FLOAT temp_r = 0.0;
        FLOAT temp_i = 0.0;

        if (lda == 2 && inc_x == 2) {
            /* two columns per iteration, then an odd leftover column */
            for (i = 0; i < (n & -2); i += 2) {
                zgemv_n_mac(a_ptr, x_ptr, &temp_r, &temp_i);
                zgemv_n_mac(a_ptr + 2, x_ptr + 2, &temp_r, &temp_i);
                a_ptr += 4;
                x_ptr += 4;
            }

            for (; i < n; i++) {
                zgemv_n_mac(a_ptr, x_ptr, &temp_r, &temp_i);
                a_ptr += 2;
                x_ptr += 2;
            }
        } else {
            for (i = 0; i < n; i++) {
                zgemv_n_mac(a_ptr, x_ptr, &temp_r, &temp_i);
                a_ptr += lda;
                x_ptr += inc_x;
            }
        }

        zgemv_n_axpy(y_ptr, alpha_r, alpha_i, temp_r, temp_i);
        return (0);
    }

    if (m3 == 2) {
        a_ptr = a;
        x_ptr = x;
        FLOAT temp_r0 = 0.0;
        FLOAT temp_i0 = 0.0;
        FLOAT temp_r1 = 0.0;
        FLOAT temp_i1 = 0.0;

        if (lda == 4 && inc_x == 2) {
            /* two columns per iteration, then an odd leftover column */
            for (i = 0; i < (n & -2); i += 2) {
                zgemv_n_mac(a_ptr, x_ptr, &temp_r0, &temp_i0);
                zgemv_n_mac(a_ptr + 2, x_ptr, &temp_r1, &temp_i1);

                zgemv_n_mac(a_ptr + 4, x_ptr + 2, &temp_r0, &temp_i0);
                zgemv_n_mac(a_ptr + 6, x_ptr + 2, &temp_r1, &temp_i1);

                a_ptr += 8;
                x_ptr += 4;
            }

            for (; i < n; i++) {
                zgemv_n_mac(a_ptr, x_ptr, &temp_r0, &temp_i0);
                zgemv_n_mac(a_ptr + 2, x_ptr, &temp_r1, &temp_i1);
                a_ptr += 4;
                x_ptr += 2;
            }
        } else {
            for (i = 0; i < n; i++) {
                zgemv_n_mac(a_ptr, x_ptr, &temp_r0, &temp_i0);
                zgemv_n_mac(a_ptr + 2, x_ptr, &temp_r1, &temp_i1);
                a_ptr += lda;
                x_ptr += inc_x;
            }
        }

        zgemv_n_axpy(y_ptr, alpha_r, alpha_i, temp_r0, temp_i0);
        y_ptr += inc_y;
        zgemv_n_axpy(y_ptr, alpha_r, alpha_i, temp_r1, temp_i1);
        return (0);
    }

    if (m3 == 3) {
        a_ptr = a;
        x_ptr = x;
        FLOAT temp_r0 = 0.0;
        FLOAT temp_i0 = 0.0;
        FLOAT temp_r1 = 0.0;
        FLOAT temp_i1 = 0.0;
        FLOAT temp_r2 = 0.0;
        FLOAT temp_i2 = 0.0;

        if (lda == 6 && inc_x == 2) {
            for (i = 0; i < n; i++) {
                zgemv_n_mac(a_ptr, x_ptr, &temp_r0, &temp_i0);
                zgemv_n_mac(a_ptr + 2, x_ptr, &temp_r1, &temp_i1);
                zgemv_n_mac(a_ptr + 4, x_ptr, &temp_r2, &temp_i2);
                a_ptr += 6;
                x_ptr += 2;
            }
        } else {
            for (i = 0; i < n; i++) {
                zgemv_n_mac(a_ptr, x_ptr, &temp_r0, &temp_i0);
                zgemv_n_mac(a_ptr + 2, x_ptr, &temp_r1, &temp_i1);
                zgemv_n_mac(a_ptr + 4, x_ptr, &temp_r2, &temp_i2);
                a_ptr += lda;
                x_ptr += inc_x;
            }
        }

        zgemv_n_axpy(y_ptr, alpha_r, alpha_i, temp_r0, temp_i0);
        y_ptr += inc_y;
        zgemv_n_axpy(y_ptr, alpha_r, alpha_i, temp_r1, temp_i1);
        y_ptr += inc_y;
        zgemv_n_axpy(y_ptr, alpha_r, alpha_i, temp_r2, temp_i2);
        return (0);
    }

    return (0);
}
|
||||
|
||||
847
kernel/power/zgemv_t_4.c
Normal file
847
kernel/power/zgemv_t_4.c
Normal file
@@ -0,0 +1,847 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2018, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#define NBMAX 4096
|
||||
#define HAVE_KERNEL_4x4_VEC 1
|
||||
#define HAVE_KERNEL_4x2_VEC 1
|
||||
#define HAVE_KERNEL_4x1_VEC 1
|
||||
|
||||
#if defined(HAVE_KERNEL_4x4_VEC) || defined(HAVE_KERNEL_4x2_VEC) || defined(HAVE_KERNEL_4x1_VEC)
|
||||
#include <altivec.h>
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_KERNEL_4x4_VEC_ASM
|
||||
|
||||
#elif HAVE_KERNEL_4x4_VEC
|
||||
|
||||
/*
 * VSX kernel for the transposed product: for each of four consecutive
 * columns of A (starting at ap, lda FLOATs apart) it forms the complex dot
 * product of the first n column entries with x[0..n), then adds alpha times
 * that result to y[0..3] (interleaved real/imaginary, so y[0..7] as FLOATs).
 * CONJ / XCONJ select the conjugation variant.
 *
 * Per column two vector accumulators are kept:
 *   *_p  ("plain")   sums x * a lane by lane:    {xr*ar, xi*ai}
 *   *_s  ("swapped") sums swap(x) * a:           {xi*ar, xr*ai}
 * where swap is vec_xxpermdi(v, v, 2).  The real part of the dot product is
 * a sum/difference of the two lanes of *_p, the imaginary part of the two
 * lanes of *_s.  Four complex elements are consumed per loop iteration.
 */
static void zgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) {
    FLOAT *col0 = ap;
    FLOAT *col1 = col0 + lda;
    FLOAT *col2 = col1 + lda;
    FLOAT *col3 = col2 + lda;

    __vector double acc0_p = {0.0, 0.0};
    __vector double acc0_s = {0.0, 0.0};
    __vector double acc1_p = {0.0, 0.0};
    __vector double acc1_s = {0.0, 0.0};
    __vector double acc2_p = {0.0, 0.0};
    __vector double acc2_s = {0.0, 0.0};
    __vector double acc3_p = {0.0, 0.0};
    __vector double acc3_s = {0.0, 0.0};

    const BLASLONG len = n << 1;    /* length in FLOATs */
    BLASLONG pos;

    for (pos = 0; pos < len; pos += 8) {
        __vector double *xv = (__vector double *) (&x[pos]);
        __vector double *a0v = (__vector double *) (&col0[pos]);
        __vector double *a1v = (__vector double *) (&col1[pos]);
        __vector double *a2v = (__vector double *) (&col2[pos]);
        __vector double *a3v = (__vector double *) (&col3[pos]);

        __vector double xe;     /* current x element {re, im} */
        __vector double xw;     /* the same element lane-swapped {im, re} */
        __vector double e0, e1, e2, e3;

        /* first complex element of the group */
        xe = xv[0];
        xw = vec_xxpermdi(xe, xe, 2);
        e0 = a0v[0];
        e1 = a1v[0];
        e2 = a2v[0];
        e3 = a3v[0];
        acc0_p += xe * e0;
        acc0_s += xw * e0;
        acc1_p += xe * e1;
        acc1_s += xw * e1;
        acc2_p += xe * e2;
        acc2_s += xw * e2;
        acc3_p += xe * e3;
        acc3_s += xw * e3;

        /* second complex element */
        xe = xv[1];
        xw = vec_xxpermdi(xe, xe, 2);
        e0 = a0v[1];
        e1 = a1v[1];
        e2 = a2v[1];
        e3 = a3v[1];
        acc0_p += xe * e0;
        acc0_s += xw * e0;
        acc1_p += xe * e1;
        acc1_s += xw * e1;
        acc2_p += xe * e2;
        acc2_s += xw * e2;
        acc3_p += xe * e3;
        acc3_s += xw * e3;

        /* third complex element */
        xe = xv[2];
        xw = vec_xxpermdi(xe, xe, 2);
        e0 = a0v[2];
        e1 = a1v[2];
        e2 = a2v[2];
        e3 = a3v[2];
        acc0_p += xe * e0;
        acc0_s += xw * e0;
        acc1_p += xe * e1;
        acc1_s += xw * e1;
        acc2_p += xe * e2;
        acc2_s += xw * e2;
        acc3_p += xe * e3;
        acc3_s += xw * e3;

        /* fourth complex element */
        xe = xv[3];
        xw = vec_xxpermdi(xe, xe, 2);
        e0 = a0v[3];
        e1 = a1v[3];
        e2 = a2v[3];
        e3 = a3v[3];
        acc0_p += xe * e0;
        acc0_s += xw * e0;
        acc1_p += xe * e1;
        acc1_s += xw * e1;
        acc2_p += xe * e2;
        acc2_s += xw * e2;
        acc3_p += xe * e3;
        acc3_s += xw * e3;
    }

    /* fold the two lanes of each accumulator into real / imaginary parts */
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
    const FLOAT dot_r0 = acc0_p[0] - acc0_p[1];
    const FLOAT dot_i0 = acc0_s[0] + acc0_s[1];

    const FLOAT dot_r1 = acc1_p[0] - acc1_p[1];
    const FLOAT dot_i1 = acc1_s[0] + acc1_s[1];

    const FLOAT dot_r2 = acc2_p[0] - acc2_p[1];
    const FLOAT dot_i2 = acc2_s[0] + acc2_s[1];

    const FLOAT dot_r3 = acc3_p[0] - acc3_p[1];
    const FLOAT dot_i3 = acc3_s[0] + acc3_s[1];
#else
    const FLOAT dot_r0 = acc0_p[0] + acc0_p[1];
    const FLOAT dot_i0 = acc0_s[0] - acc0_s[1];

    const FLOAT dot_r1 = acc1_p[0] + acc1_p[1];
    const FLOAT dot_i1 = acc1_s[0] - acc1_s[1];

    const FLOAT dot_r2 = acc2_p[0] + acc2_p[1];
    const FLOAT dot_i2 = acc2_s[0] - acc2_s[1];

    const FLOAT dot_r3 = acc3_p[0] + acc3_p[1];
    const FLOAT dot_i3 = acc3_s[0] - acc3_s[1];
#endif

    /* y += alpha * dot (XCONJ build: alpha * conj(dot)) */
#if !defined(XCONJ)
    y[0] += alpha_r * dot_r0 - alpha_i * dot_i0;
    y[1] += alpha_r * dot_i0 + alpha_i * dot_r0;
    y[2] += alpha_r * dot_r1 - alpha_i * dot_i1;
    y[3] += alpha_r * dot_i1 + alpha_i * dot_r1;
    y[4] += alpha_r * dot_r2 - alpha_i * dot_i2;
    y[5] += alpha_r * dot_i2 + alpha_i * dot_r2;
    y[6] += alpha_r * dot_r3 - alpha_i * dot_i3;
    y[7] += alpha_r * dot_i3 + alpha_i * dot_r3;
#else
    y[0] += alpha_r * dot_r0 + alpha_i * dot_i0;
    y[1] -= alpha_r * dot_i0 - alpha_i * dot_r0;
    y[2] += alpha_r * dot_r1 + alpha_i * dot_i1;
    y[3] -= alpha_r * dot_i1 - alpha_i * dot_r1;
    y[4] += alpha_r * dot_r2 + alpha_i * dot_i2;
    y[5] -= alpha_r * dot_i2 - alpha_i * dot_r2;
    y[6] += alpha_r * dot_r3 + alpha_i * dot_i3;
    y[7] -= alpha_r * dot_i3 - alpha_i * dot_r3;
#endif
}
|
||||
|
||||
#else
|
||||
|
||||
/*
 * Scalar fallback for four columns at once.
 *
 * Forms the complex dot product of x (n interleaved re/im pairs) with each of
 * four columns of A, where the columns start at ap, ap + lda, ap + 2*lda and
 * ap + 3*lda (lda counted in FLOATs), then adds alpha times each dot product
 * to the four interleaved complex values y[0..7].
 *
 * CONJ / XCONJ select the sign pattern of the products and of the final
 * scaling by (alpha_r, alpha_i).
 */
static void zgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) {
    FLOAT *col0 = ap;
    FLOAT *col1 = ap + lda;
    FLOAT *col2 = col1 + lda;
    FLOAT *col3 = col2 + lda;

    FLOAT sum_r0 = 0.0, sum_i0 = 0.0;
    FLOAT sum_r1 = 0.0, sum_i1 = 0.0;
    FLOAT sum_r2 = 0.0, sum_i2 = 0.0;
    FLOAT sum_r3 = 0.0, sum_i3 = 0.0;

    BLASLONG k;

    /* k indexes FLOATs, so it moves two at a time (one complex element). */
    for (k = 0; k < 2 * n; k += 2) {
        const FLOAT xr = x[k];
        const FLOAT xi = x[k + 1];

#if defined(CONJ) == defined(XCONJ)
        sum_r0 += col0[k] * xr - col0[k + 1] * xi;
        sum_i0 += col0[k] * xi + col0[k + 1] * xr;
        sum_r1 += col1[k] * xr - col1[k + 1] * xi;
        sum_i1 += col1[k] * xi + col1[k + 1] * xr;
        sum_r2 += col2[k] * xr - col2[k + 1] * xi;
        sum_i2 += col2[k] * xi + col2[k + 1] * xr;
        sum_r3 += col3[k] * xr - col3[k + 1] * xi;
        sum_i3 += col3[k] * xi + col3[k + 1] * xr;
#else
        sum_r0 += col0[k] * xr + col0[k + 1] * xi;
        sum_i0 += col0[k] * xi - col0[k + 1] * xr;
        sum_r1 += col1[k] * xr + col1[k + 1] * xi;
        sum_i1 += col1[k] * xi - col1[k + 1] * xr;
        sum_r2 += col2[k] * xr + col2[k + 1] * xi;
        sum_i2 += col2[k] * xi - col2[k + 1] * xr;
        sum_r3 += col3[k] * xr + col3[k + 1] * xi;
        sum_i3 += col3[k] * xi - col3[k + 1] * xr;
#endif
    }

    /* y += alpha * sum, with the XCONJ build using the alternate signs. */
#ifndef XCONJ
    y[0] += alpha_r * sum_r0 - alpha_i * sum_i0;
    y[1] += alpha_r * sum_i0 + alpha_i * sum_r0;
    y[2] += alpha_r * sum_r1 - alpha_i * sum_i1;
    y[3] += alpha_r * sum_i1 + alpha_i * sum_r1;
    y[4] += alpha_r * sum_r2 - alpha_i * sum_i2;
    y[5] += alpha_r * sum_i2 + alpha_i * sum_r2;
    y[6] += alpha_r * sum_r3 - alpha_i * sum_i3;
    y[7] += alpha_r * sum_i3 + alpha_i * sum_r3;
#else
    y[0] += alpha_r * sum_r0 + alpha_i * sum_i0;
    y[1] -= alpha_r * sum_i0 - alpha_i * sum_r0;
    y[2] += alpha_r * sum_r1 + alpha_i * sum_i1;
    y[3] -= alpha_r * sum_i1 - alpha_i * sum_r1;
    y[4] += alpha_r * sum_r2 + alpha_i * sum_i2;
    y[5] -= alpha_r * sum_i2 - alpha_i * sum_r2;
    y[6] += alpha_r * sum_r3 + alpha_i * sum_i3;
    y[7] -= alpha_r * sum_i3 - alpha_i * sum_r3;
#endif
}
|
||||
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_KERNEL_4x2_VEC
|
||||
|
||||
/*
 * Vector (__vector double) version for two columns at once.
 *
 * Each loop trip consumes four complex elements (eight FLOATs) of x and of
 * the two columns starting at ap and ap + lda. The loop runs while the FLOAT
 * index is below 2*n, always in steps of eight.
 *
 * Two accumulators are kept per column:
 *   *_same  : lane-wise x * a           (real*real, imag*imag)
 *   *_cross : lane-swapped x times a    (real*imag, imag*real)
 * They are folded into a scalar real/imaginary pair after the loop, with the
 * signs chosen by CONJ / XCONJ, and alpha times the result is added to y[0..3].
 */
static void zgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) {
    FLOAT *col0 = ap;
    FLOAT *col1 = ap + lda;
    const BLASLONG len = n << 1; /* number of FLOATs to walk */
    BLASLONG k;

    register __vector double acc0_same = {0.0, 0.0};
    register __vector double acc0_cross = {0.0, 0.0};
    register __vector double acc1_same = {0.0, 0.0};
    register __vector double acc1_cross = {0.0, 0.0};

    for (k = 0; k < len; k += 8) {
        register __vector double xv0 = *(__vector double*) (&x[k]);
        register __vector double xv1 = *(__vector double*) (&x[k + 2]);
        register __vector double xv2 = *(__vector double*) (&x[k + 4]);
        register __vector double xv3 = *(__vector double*) (&x[k + 6]);

        register __vector double c0v0 = *(__vector double*) (&col0[k]);
        register __vector double c0v1 = *(__vector double*) (&col0[k + 2]);
        register __vector double c0v2 = *(__vector double*) (&col0[k + 4]);
        register __vector double c0v3 = *(__vector double*) (&col0[k + 6]);

        register __vector double c1v0 = *(__vector double*) (&col1[k]);
        register __vector double c1v1 = *(__vector double*) (&col1[k + 2]);
        register __vector double c1v2 = *(__vector double*) (&col1[k + 4]);
        register __vector double c1v3 = *(__vector double*) (&col1[k + 6]);

        /* Copies of x with the two lanes exchanged (xxpermdi selector 2). */
        register __vector double xs0 = vec_xxpermdi(xv0, xv0, 2);
        register __vector double xs1 = vec_xxpermdi(xv1, xv1, 2);
        register __vector double xs2 = vec_xxpermdi(xv2, xv2, 2);
        register __vector double xs3 = vec_xxpermdi(xv3, xv3, 2);

        /* Element order 0,1,2,3 is kept for every accumulator. */
        acc0_same += xv0*c0v0;
        acc0_cross += xs0*c0v0;
        acc1_same += xv0*c1v0;
        acc1_cross += xs0*c1v0;

        acc0_same += xv1*c0v1;
        acc0_cross += xs1*c0v1;
        acc1_same += xv1*c1v1;
        acc1_cross += xs1*c1v1;

        acc0_same += xv2*c0v2;
        acc0_cross += xs2*c0v2;
        acc1_same += xv2*c1v2;
        acc1_cross += xs2*c1v2;

        acc0_same += xv3*c0v3;
        acc0_cross += xs3*c0v3;
        acc1_same += xv3*c1v3;
        acc1_cross += xs3*c1v3;
    }

    /* Horizontal fold of each accumulator into real / imaginary scalars. */
#if defined(CONJ) == defined(XCONJ)
    register FLOAT dot_r0 = acc0_same[0] - acc0_same[1];
    register FLOAT dot_i0 = acc0_cross[0] + acc0_cross[1];

    register FLOAT dot_r1 = acc1_same[0] - acc1_same[1];
    register FLOAT dot_i1 = acc1_cross[0] + acc1_cross[1];
#else
    register FLOAT dot_r0 = acc0_same[0] + acc0_same[1];
    register FLOAT dot_i0 = acc0_cross[0] - acc0_cross[1];

    register FLOAT dot_r1 = acc1_same[0] + acc1_same[1];
    register FLOAT dot_i1 = acc1_cross[0] - acc1_cross[1];
#endif

#ifndef XCONJ
    y[0] += alpha_r * dot_r0 - alpha_i * dot_i0;
    y[1] += alpha_r * dot_i0 + alpha_i * dot_r0;
    y[2] += alpha_r * dot_r1 - alpha_i * dot_i1;
    y[3] += alpha_r * dot_i1 + alpha_i * dot_r1;
#else
    y[0] += alpha_r * dot_r0 + alpha_i * dot_i0;
    y[1] -= alpha_r * dot_i0 - alpha_i * dot_r0;
    y[2] += alpha_r * dot_r1 + alpha_i * dot_i1;
    y[3] -= alpha_r * dot_i1 - alpha_i * dot_r1;
#endif
}
|
||||
|
||||
#else
|
||||
|
||||
/*
 * Scalar fallback for two columns at once.
 *
 * Forms the complex dot product of x (n interleaved re/im pairs) with the
 * columns starting at ap and ap + lda, then adds alpha times each result to
 * the two interleaved complex values y[0..3]. CONJ / XCONJ choose the signs.
 */
static void zgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) {
    FLOAT *col0 = ap;
    FLOAT *col1 = ap + lda;

    FLOAT sum_r0 = 0.0, sum_i0 = 0.0;
    FLOAT sum_r1 = 0.0, sum_i1 = 0.0;

    BLASLONG k;

    /* One complex element (two FLOATs) per trip. */
    for (k = 0; k < 2 * n; k += 2) {
        const FLOAT xr = x[k];
        const FLOAT xi = x[k + 1];

#if defined(CONJ) == defined(XCONJ)
        sum_r0 += col0[k] * xr - col0[k + 1] * xi;
        sum_i0 += col0[k] * xi + col0[k + 1] * xr;
        sum_r1 += col1[k] * xr - col1[k + 1] * xi;
        sum_i1 += col1[k] * xi + col1[k + 1] * xr;
#else
        sum_r0 += col0[k] * xr + col0[k + 1] * xi;
        sum_i0 += col0[k] * xi - col0[k + 1] * xr;
        sum_r1 += col1[k] * xr + col1[k + 1] * xi;
        sum_i1 += col1[k] * xi - col1[k + 1] * xr;
#endif
    }

#ifndef XCONJ
    y[0] += alpha_r * sum_r0 - alpha_i * sum_i0;
    y[1] += alpha_r * sum_i0 + alpha_i * sum_r0;
    y[2] += alpha_r * sum_r1 - alpha_i * sum_i1;
    y[3] += alpha_r * sum_i1 + alpha_i * sum_r1;
#else
    y[0] += alpha_r * sum_r0 + alpha_i * sum_i0;
    y[1] -= alpha_r * sum_i0 - alpha_i * sum_r0;
    y[2] += alpha_r * sum_r1 + alpha_i * sum_i1;
    y[3] -= alpha_r * sum_i1 - alpha_i * sum_r1;
#endif
}
|
||||
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_KERNEL_4x1_VEC
|
||||
|
||||
/*
 * Vector (__vector double) version for a single column.
 *
 * Each loop trip consumes four complex elements (eight FLOATs) of x and of
 * the column at ap; the loop runs while the FLOAT index is below 2*n.
 *
 *   acc_same  : lane-wise x * a           (real*real, imag*imag)
 *   acc_cross : lane-swapped x times a    (real*imag, imag*real)
 *
 * After the loop the lanes are folded into one real/imaginary pair, using the
 * signs selected by CONJ / XCONJ, and alpha times that pair is added to y[0..1].
 */
static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) {
    FLOAT *col = ap;
    const BLASLONG len = n << 1; /* number of FLOATs to walk */
    BLASLONG k;

    register __vector double acc_same = {0.0, 0.0};
    register __vector double acc_cross = {0.0, 0.0};

    for (k = 0; k < len; k += 8) {
        register __vector double xv0 = *(__vector double*) (&x[k]);
        register __vector double xv1 = *(__vector double*) (&x[k + 2]);
        register __vector double xv2 = *(__vector double*) (&x[k + 4]);
        register __vector double xv3 = *(__vector double*) (&x[k + 6]);

        register __vector double av0 = *(__vector double*) (&col[k]);
        register __vector double av1 = *(__vector double*) (&col[k + 2]);
        register __vector double av2 = *(__vector double*) (&col[k + 4]);
        register __vector double av3 = *(__vector double*) (&col[k + 6]);

        /* Copies of x with the two lanes exchanged (xxpermdi selector 2). */
        register __vector double xs0 = vec_xxpermdi(xv0, xv0, 2);
        register __vector double xs1 = vec_xxpermdi(xv1, xv1, 2);
        register __vector double xs2 = vec_xxpermdi(xv2, xv2, 2);
        register __vector double xs3 = vec_xxpermdi(xv3, xv3, 2);

        /* Element order 0,1,2,3 is kept for both accumulators. */
        acc_same += xv0*av0;
        acc_cross += xs0*av0;

        acc_same += xv1*av1;
        acc_cross += xs1*av1;

        acc_same += xv2*av2;
        acc_cross += xs2*av2;

        acc_same += xv3*av3;
        acc_cross += xs3*av3;
    }

#if defined(CONJ) == defined(XCONJ)
    register FLOAT dot_r = acc_same[0] - acc_same[1];
    register FLOAT dot_i = acc_cross[0] + acc_cross[1];
#else
    register FLOAT dot_r = acc_same[0] + acc_same[1];
    register FLOAT dot_i = acc_cross[0] - acc_cross[1];
#endif

#ifndef XCONJ
    y[0] += alpha_r * dot_r - alpha_i * dot_i;
    y[1] += alpha_r * dot_i + alpha_i * dot_r;
#else
    y[0] += alpha_r * dot_r + alpha_i * dot_i;
    y[1] -= alpha_r * dot_i - alpha_i * dot_r;
#endif
}
|
||||
|
||||
#else
|
||||
|
||||
/*
 * Scalar fallback for a single column.
 *
 * Forms the complex dot product of x (n interleaved re/im pairs) with the
 * column at ap and adds alpha times the result to the complex value y[0..1].
 * CONJ / XCONJ choose the signs.
 */
static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) {
    FLOAT *col = ap;
    FLOAT sum_r = 0.0;
    FLOAT sum_i = 0.0;
    BLASLONG k;

    /* One complex element (two FLOATs) per trip. */
    for (k = 0; k < 2 * n; k += 2) {
        const FLOAT xr = x[k];
        const FLOAT xi = x[k + 1];

#if defined(CONJ) == defined(XCONJ)
        sum_r += col[k] * xr - col[k + 1] * xi;
        sum_i += col[k] * xi + col[k + 1] * xr;
#else
        sum_r += col[k] * xr + col[k + 1] * xi;
        sum_i += col[k] * xi - col[k + 1] * xr;
#endif
    }

#ifndef XCONJ
    y[0] += alpha_r * sum_r - alpha_i * sum_i;
    y[1] += alpha_r * sum_i + alpha_i * sum_r;
#else
    y[0] += alpha_r * sum_r + alpha_i * sum_i;
    y[1] -= alpha_r * sum_i - alpha_i * sum_r;
#endif
}
|
||||
|
||||
#endif
|
||||
|
||||
/*
 * Gathers n complex values from src into the contiguous array dest.
 * Consecutive source elements are inc_src FLOATs apart; each element is a
 * pair of adjacent FLOATs, written to dest[2*k] and dest[2*k + 1].
 */
static __attribute__((always_inline)) void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) {
    BLASLONG k;

    for (k = 0; k < n; k++) {
        dest[2 * k] = src[0];
        dest[2 * k + 1] = src[1];
        src += inc_src;
    }
}
|
||||
|
||||
/*
 * Driver: for every column j of A (columns are lda complex elements apart),
 * adds alpha times the complex dot product of that column with x to the
 * j-th complex element of y. CONJ / XCONJ select the conjugation variants.
 *
 *   m, n          rows and columns of A; nothing is done if either is < 1
 *   alpha_r/_i    real and imaginary part of the scalar
 *   a, lda        matrix data and column stride (complex elements)
 *   x, inc_x      input vector and its stride (complex elements)
 *   y, inc_y      output vector and its stride (complex elements)
 *   buffer        scratch used to pack x when x is not contiguous
 *   dummy1        unused
 *
 * Rows are processed in blocks of at most NBMAX (each a multiple of four)
 * through the zgemv_kernel_4x{4,2,1} helpers; the final m & 3 rows are
 * handled by scalar code at the end. Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) {
    FLOAT ybuffer[8];
    FLOAT *xbuffer = buffer;
    FLOAT *a_ptr;
    FLOAT *y_ptr;
    BLASLONG col;
    BLASLONG k;

    if (m < 1 || n < 1) return (0);

    /* From here on strides are expressed in FLOATs (two per complex). */
    inc_x <<= 1;
    inc_y <<= 1;
    lda <<= 1;

    const BLASLONG quad_cols = n >> 2;   /* groups of four columns */
    const BLASLONG rest_cols = n & 3;    /* columns left after the groups */
    const BLASLONG tail_rows = m & 3;    /* rows left for the scalar tail */
    const BLASLONG short_block = (m & (NBMAX - 1)) - tail_rows;
    BLASLONG rows_left = m - tail_rows;
    BLASLONG block_rows = NBMAX;

    /* Full NBMAX blocks first; a final shorter block ends the loop. */
    while (block_rows == NBMAX) {

        rows_left -= block_rows;
        if (rows_left < 0) {
            if (short_block == 0) break;
            block_rows = short_block;
        }

        a_ptr = a;
        y_ptr = y;

        /* Kernels need contiguous x: use it in place or pack it. */
        if (inc_x == 2)
            xbuffer = x;
        else
            copy_x(block_rows, x, xbuffer, inc_x);

        if (inc_y == 2) {

            /* Contiguous y: kernels accumulate straight into it. */
            for (col = 0; col < quad_cols; col++) {
                zgemv_kernel_4x4(block_rows, lda, a_ptr, xbuffer, y_ptr, alpha_r, alpha_i);
                a_ptr += lda << 2;
                y_ptr += 8;
            }

            if (rest_cols & 2) {
                zgemv_kernel_4x2(block_rows, lda, a_ptr, xbuffer, y_ptr, alpha_r, alpha_i);
                a_ptr += lda << 1;
                y_ptr += 4;
            }

            if (rest_cols & 1) {
                zgemv_kernel_4x1(block_rows, a_ptr, xbuffer, y_ptr, alpha_r, alpha_i);
                a_ptr += lda;
                y_ptr += 2;
            }

        } else {

            /* Strided y: accumulate into a zeroed scratch, then scatter. */
            for (col = 0; col < quad_cols; col++) {
                memset(ybuffer, 0, sizeof (ybuffer));
                zgemv_kernel_4x4(block_rows, lda, a_ptr, xbuffer, ybuffer, alpha_r, alpha_i);
                a_ptr += lda << 2;

                for (k = 0; k < 4; k++) {
                    y_ptr[0] += ybuffer[2 * k];
                    y_ptr[1] += ybuffer[2 * k + 1];
                    y_ptr += inc_y;
                }
            }

            for (col = 0; col < rest_cols; col++) {
                memset(ybuffer, 0, sizeof (ybuffer));
                zgemv_kernel_4x1(block_rows, a_ptr, xbuffer, ybuffer, alpha_r, alpha_i);
                a_ptr += lda;
                y_ptr[0] += ybuffer[0];
                y_ptr[1] += ybuffer[1];
                y_ptr += inc_y;
            }
        }

        /* Step down to the next block of rows. */
        a += 2 * block_rows;
        x += block_rows * inc_x;
    }

    if (tail_rows == 0) return (0);

    /* Scalar tail: one, two or three remaining rows of every column. */
    FLOAT xr[3];
    FLOAT xi[3];

    for (k = 0; k < tail_rows; k++) {
        xr[k] = x[k * inc_x];
        xi[k] = x[k * inc_x + 1];
    }

    a_ptr = a;
    y_ptr = y;

    for (col = 0; col < n; col++) {
        FLOAT dot_r;
        FLOAT dot_i;

        /* First row initialises the sums, later rows accumulate in order. */
#if defined(CONJ) == defined(XCONJ)
        dot_r = a_ptr[0] * xr[0] - a_ptr[1] * xi[0];
        dot_i = a_ptr[0] * xi[0] + a_ptr[1] * xr[0];
        for (k = 1; k < tail_rows; k++) {
            dot_r += a_ptr[2 * k] * xr[k] - a_ptr[2 * k + 1] * xi[k];
            dot_i += a_ptr[2 * k] * xi[k] + a_ptr[2 * k + 1] * xr[k];
        }
#else
        dot_r = a_ptr[0] * xr[0] + a_ptr[1] * xi[0];
        dot_i = a_ptr[0] * xi[0] - a_ptr[1] * xr[0];
        for (k = 1; k < tail_rows; k++) {
            dot_r += a_ptr[2 * k] * xr[k] + a_ptr[2 * k + 1] * xi[k];
            dot_i += a_ptr[2 * k] * xi[k] - a_ptr[2 * k + 1] * xr[k];
        }
#endif

#ifndef XCONJ
        y_ptr[0] += alpha_r * dot_r - alpha_i * dot_i;
        y_ptr[1] += alpha_r * dot_i + alpha_i * dot_r;
#else
        y_ptr[0] += alpha_r * dot_r + alpha_i * dot_i;
        y_ptr[1] -= alpha_r * dot_i - alpha_i * dot_r;
#endif

        a_ptr += lda;
        y_ptr += inc_y;
    }

    return (0);
}
|
||||
|
||||
265
kernel/power/zrot.c
Normal file
265
kernel/power/zrot.c
Normal file
@@ -0,0 +1,265 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2018, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/*
 * Inline-assembly rotation kernel for contiguous complex vectors.
 *
 * For every FLOAT of x and y it computes
 *     x_new = cosA * x + sinA * y
 *     y_new = cosA * y - sinA * x
 * and stores both back in place. Four complex elements (64 bytes of each
 * vector) are handled per step; n is decremented by 4 per step, so callers
 * are expected to pass a positive multiple of 4.
 *
 * The code is software pipelined: the first 64 bytes of x and y are loaded
 * before the loop, each loop trip loads the next 64 bytes while finishing the
 * current ones, and the section after label 2 drains the last loaded group.
 */
static void zrot_kernel_4(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT cosA, FLOAT sinA)
{
    /* Compiler-allocated VSX temporaries: c*y (then the new y) and s*y. */
    __vector double cy0, cy1, cy2, cy3;
    __vector double sy0, sy1, sy2, sy3;

    __asm__
    (
        /* vs36 = {c, c}, vs37 = {s, s} */
        "xxspltd 36, %x[cos], 0 \n\t"
        "xxspltd 37, %x[sin], 0 \n\t"

        /* prologue: first 64 bytes of x into vs32-35 */
        "lxvd2x 32, 0, %[x_ptr] \n\t"
        "lxvd2x 33, %[i16], %[x_ptr] \n\t"
        "lxvd2x 34, %[i32], %[x_ptr] \n\t"
        "lxvd2x 35, %[i48], %[x_ptr] \n\t"

        /* prologue: first 64 bytes of y into vs48-51 */
        "lxvd2x 48, 0, %[y_ptr] \n\t"
        "lxvd2x 49, %[i16], %[y_ptr] \n\t"
        "lxvd2x 50, %[i32], %[y_ptr] \n\t"
        "lxvd2x 51, %[i48], %[y_ptr] \n\t"

        "addi %[x_ptr], %[x_ptr], 64 \n\t"
        "addi %[y_ptr], %[y_ptr], 64 \n\t"

        /* only one group in total: go straight to the drain code */
        "addic. %[temp_n], %[temp_n], -4 \n\t"
        "ble 2f \n\t"

        ".p2align 5 \n"
        "1: \n\t"

        /* vs40-43 = c * x */
        "xvmuldp 40, 32, 36 \n\t"
        "xvmuldp 41, 33, 36 \n\t"
        "xvmuldp 42, 34, 36 \n\t"
        "xvmuldp 43, 35, 36 \n\t"

        /* x0-x3 = c * y */
        "xvmuldp %x[x0], 48, 36 \n\t"
        "xvmuldp %x[x1], 49, 36 \n\t"
        "xvmuldp %x[x2], 50, 36 \n\t"
        "xvmuldp %x[x3], 51, 36 \n\t"

        /* vs44-47 = s * x, interleaved with loading the next x group */
        "xvmuldp 44, 32, 37 \n\t"
        "xvmuldp 45, 33, 37 \n\t"

        "lxvd2x 32, 0, %[x_ptr] \n\t"
        "lxvd2x 33, %[i16],%[x_ptr] \n\t"

        "xvmuldp 46, 34, 37 \n\t"
        "xvmuldp 47, 35, 37 \n\t"

        "lxvd2x 34, %[i32], %[x_ptr] \n\t"
        "lxvd2x 35, %[i48], %[x_ptr] \n\t"

        /* x4-x7 = s * y, interleaved with loading the next y group */
        "xvmuldp %x[x4], 48, 37 \n\t"
        "xvmuldp %x[x5], 49, 37 \n\t"

        "lxvd2x 48, 0, %[y_ptr] \n\t"
        "lxvd2x 49, %[i16], %[y_ptr] \n\t"

        "xvmuldp %x[x6], 50, 37 \n\t"
        "xvmuldp %x[x7], 51, 37 \n\t"

        "lxvd2x 50, %[i32], %[y_ptr] \n\t"
        "lxvd2x 51, %[i48], %[y_ptr] \n\t"

        /* vs40-43 = c * x + s * y */
        "xvadddp 40, 40, %x[x4] \n\t"
        "xvadddp 41, 41, %x[x5] \n\t"

        /* step back to the group whose results are about to be stored */
        "addi %[x_ptr], %[x_ptr], -64 \n\t"
        "addi %[y_ptr], %[y_ptr], -64 \n\t"

        "xvadddp 42, 42, %x[x6] \n\t"
        "xvadddp 43, 43, %x[x7] \n\t"

        /* x0-x3 = c * y - s * x */
        "xvsubdp %x[x0], %x[x0], 44 \n\t"
        "xvsubdp %x[x1], %x[x1], 45 \n\t"
        "xvsubdp %x[x2], %x[x2], 46 \n\t"
        "xvsubdp %x[x3], %x[x3], 47 \n\t"

        /* write the rotated x */
        "stxvd2x 40, 0, %[x_ptr] \n\t"
        "stxvd2x 41, %[i16], %[x_ptr] \n\t"
        "stxvd2x 42, %[i32], %[x_ptr] \n\t"
        "stxvd2x 43, %[i48], %[x_ptr] \n\t"

        /* write the rotated y */
        "stxvd2x %x[x0], 0, %[y_ptr] \n\t"
        "stxvd2x %x[x1], %[i16], %[y_ptr] \n\t"
        "stxvd2x %x[x2], %[i32], %[y_ptr] \n\t"
        "stxvd2x %x[x3], %[i48], %[y_ptr] \n\t"

        /* skip past the stored group and the one already loaded */
        "addi %[x_ptr], %[x_ptr], 128 \n\t"
        "addi %[y_ptr], %[y_ptr], 128 \n\t"

        "addic. %[temp_n], %[temp_n], -4 \n\t"
        "bgt+ 1b \n"

        /* drain: finish the last loaded group, no further loads */
        "2: \n\t"

        "xvmuldp 40, 32, 36 \n\t"
        "xvmuldp 41, 33, 36 \n\t"
        "xvmuldp 42, 34, 36 \n\t"
        "xvmuldp 43, 35, 36 \n\t"

        "xvmuldp %x[x0], 48, 36 \n\t"
        "xvmuldp %x[x1], 49, 36 \n\t"
        "xvmuldp %x[x2], 50, 36 \n\t"
        "xvmuldp %x[x3], 51, 36 \n\t"

        "xvmuldp 44, 32, 37 \n\t"
        "xvmuldp 45, 33, 37 \n\t"
        "xvmuldp 46, 34, 37 \n\t"
        "xvmuldp 47, 35, 37 \n\t"

        "xvmuldp %x[x4], 48, 37 \n\t"
        "xvmuldp %x[x5], 49, 37 \n\t"
        "xvmuldp %x[x6], 50, 37 \n\t"
        "xvmuldp %x[x7], 51, 37 \n\t"

        "addi %[x_ptr], %[x_ptr], -64 \n\t"
        "addi %[y_ptr], %[y_ptr], -64 \n\t"

        "xvadddp 40, 40, %x[x4] \n\t"
        "xvadddp 41, 41, %x[x5] \n\t"
        "xvadddp 42, 42, %x[x6] \n\t"
        "xvadddp 43, 43, %x[x7] \n\t"

        "xvsubdp %x[x0], %x[x0], 44 \n\t"
        "xvsubdp %x[x1], %x[x1], 45 \n\t"
        "xvsubdp %x[x2], %x[x2], 46 \n\t"
        "xvsubdp %x[x3], %x[x3], 47 \n\t"

        "stxvd2x 40, 0, %[x_ptr] \n\t"
        "stxvd2x 41, %[i16], %[x_ptr] \n\t"
        "stxvd2x 42, %[i32], %[x_ptr] \n\t"
        "stxvd2x 43, %[i48], %[x_ptr] \n\t"

        "stxvd2x %x[x0], 0, %[y_ptr] \n\t"
        "stxvd2x %x[x1], %[i16], %[y_ptr] \n\t"
        "stxvd2x %x[x2], %[i32], %[y_ptr] \n\t"
        "stxvd2x %x[x3], %[i48], %[y_ptr] \n\t"

        : /* outputs: both arrays are read and written; n, x, y are consumed */
        [mem_x] "+m" (*(double (*)[2*n])x),
        [mem_y] "+m" (*(double (*)[2*n])y),
        [temp_n] "+&r" (n),
        [x_ptr] "+&b"(x), [y_ptr] "+&b"(y),
        [x0] "=wa" (cy0),
        [x1] "=wa" (cy1),
        [x2] "=wa" (cy2),
        [x3] "=wa" (cy3),
        [x4] "=wa" (sy0),
        [x5] "=wa" (sy1),
        [x6] "=wa" (sy2),
        [x7] "=wa" (sy3)
        : /* inputs: rotation coefficients and the byte offsets 16/32/48 */
        [cos] "d" (cosA),
        [sin] "d" (sinA),
        [i16] "b" (16),
        [i32] "b" (32),
        [i48] "b" (48)
        : /* clobbers: condition field 0 and the hard-coded VSX registers */
        "cr0",
        "vs32","vs33","vs34","vs35","vs36","vs37",
        "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
        "vs48","vs49","vs50","vs51"
    );
}
|
||||
|
||||
/*
 * Applies a plane rotation with real coefficients c and s to two complex
 * vectors in place. For each of the n complex elements, and separately for
 * the real and the imaginary component:
 *     x_new = c * x + s * y
 *     y_new = c * y - s * x
 *
 * inc_x / inc_y are strides in complex elements. When both are 1, the
 * leading multiple-of-four elements go through zrot_kernel_4 and only the
 * remainder is done by the scalar loop. Returns 0 in all cases.
 */
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s)
{
    BLASLONG done = 0;   /* complex elements already rotated */
    BLASLONG ix = 0;     /* FLOAT index of the current element in x */
    BLASLONG iy = 0;     /* FLOAT index of the current element in y */

    if ( n <= 0 ) return(0);

    if ( (inc_x == 1) && (inc_y == 1) )
    {
        const BLASLONG bulk = n & -4;

        if ( bulk > 0 )
        {
            zrot_kernel_4(bulk, x, y, c, s);
            done = bulk;
            ix = 2 * bulk;
            iy = 2 * bulk;
        }
    }

    /* Strides in FLOATs; both are 2 on the contiguous path. */
    const BLASLONG step_x = 2 * inc_x;
    const BLASLONG step_y = 2 * inc_y;

    for ( ; done < n; done++ )
    {
        /* New x is held back until y has been updated from the old x. */
        const FLOAT new_xr = c*x[ix] + s*y[iy];
        const FLOAT new_xi = c*x[ix+1] + s*y[iy+1];

        y[iy] = c*y[iy] - s*x[ix];
        y[iy+1] = c*y[iy+1] - s*x[ix+1];
        x[ix] = new_xr;
        x[ix+1] = new_xi;

        ix += step_x;
        iy += step_y;
    }

    return(0);
}
|
||||
|
||||
|
||||
@@ -647,7 +647,9 @@ static int get_l2_size_old(void){
|
||||
return 6144;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
// return 0;
|
||||
fprintf (stderr,"OpenBLAS WARNING - could not determine the L2 cache size on this system, assuming 256k\n");
|
||||
return 256;
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -660,6 +662,10 @@ static __inline__ int get_l2_size(void){
|
||||
l2 = BITMASK(ecx, 16, 0xffff);
|
||||
|
||||
#ifndef ARCH_X86
|
||||
if (l2 <= 0) {
|
||||
fprintf (stderr,"OpenBLAS WARNING - could not determine the L2 cache size on this system, assuming 256k\n");
|
||||
return 256;
|
||||
}
|
||||
return l2;
|
||||
|
||||
#else
|
||||
@@ -871,6 +877,22 @@ static void init_parameter(void) {
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef SKYLAKEX
|
||||
|
||||
#ifdef DEBUG
|
||||
fprintf(stderr, "SkylakeX\n");
|
||||
#endif
|
||||
|
||||
TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P;
|
||||
TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P;
|
||||
TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P;
|
||||
TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P;
|
||||
#ifdef EXPRECISION
|
||||
TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P;
|
||||
TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef OPTERON
|
||||
|
||||
|
||||
@@ -169,7 +169,7 @@ ifndef ZDOTKERNEL
|
||||
ZDOTKERNEL = ../arm/zdot.c
|
||||
endif
|
||||
|
||||
DSDOTKERNEL = ../arm/dot.c
|
||||
DSDOTKERNEL = ../generic/dot.c
|
||||
|
||||
# Bug in znrm2 assembler kernel
|
||||
ifndef ZNRM2KERNEL
|
||||
|
||||
@@ -1,3 +1 @@
|
||||
include $(KERNELDIR)/KERNEL.PENRYN
|
||||
SSWAPKERNEL = ../arm/swap.c
|
||||
DSWAPKERNEL = ../arm/swap.c
|
||||
|
||||
@@ -138,6 +138,14 @@
|
||||
/* INCX != 1 or INCY != 1 */
|
||||
|
||||
.L14:
|
||||
cmpl $0, %ebx
|
||||
jne .L141
|
||||
cmpl $0, %ecx
|
||||
jne .L141
|
||||
/* INCX == 0 and INCY == 0 */
|
||||
jmp .L27
|
||||
|
||||
.L141:
|
||||
movl %edx, %eax
|
||||
sarl $2, %eax
|
||||
jle .L28
|
||||
|
||||
@@ -62,7 +62,7 @@
|
||||
#define PREFETCHSIZE (8 * 21 + 4)
|
||||
#endif
|
||||
|
||||
#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL)
|
||||
#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX)
|
||||
#define PREFETCH prefetcht0
|
||||
#define PREFETCHSIZE (8 * 21 + 4)
|
||||
#endif
|
||||
|
||||
@@ -62,7 +62,7 @@
|
||||
#define PREFETCHSIZE (8 * 21 + 4)
|
||||
#endif
|
||||
|
||||
#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL)
|
||||
#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX)
|
||||
#define PREFETCH prefetcht0
|
||||
#define PREFETCHSIZE (8 * 21 + 4)
|
||||
#endif
|
||||
|
||||
@@ -62,7 +62,7 @@
|
||||
#define PREFETCHSIZE (8 * 21 + 4)
|
||||
#endif
|
||||
|
||||
#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL)
|
||||
#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX)
|
||||
#define PREFETCH prefetcht0
|
||||
#define PREFETCHSIZE (8 * 21 + 4)
|
||||
#endif
|
||||
|
||||
@@ -62,7 +62,7 @@
|
||||
#define PREFETCHSIZE (8 * 21 + 4)
|
||||
#endif
|
||||
|
||||
#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL)
|
||||
#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX)
|
||||
#define PREFETCH prefetcht0
|
||||
#define PREFETCHSIZE (8 * 21 + 4)
|
||||
#endif
|
||||
|
||||
@@ -62,7 +62,7 @@
|
||||
#define PREFETCHSIZE (8 * 21 + 4)
|
||||
#endif
|
||||
|
||||
#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL)
|
||||
#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX)
|
||||
#define PREFETCH prefetcht0
|
||||
#define PREFETCHSIZE (8 * 21 + 4)
|
||||
#endif
|
||||
|
||||
@@ -62,7 +62,7 @@
|
||||
#define PREFETCHSIZE (8 * 21 + 4)
|
||||
#endif
|
||||
|
||||
#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL)
|
||||
#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX)
|
||||
#define PREFETCH prefetcht0
|
||||
#define PREFETCHSIZE (8 * 21 + 4)
|
||||
#endif
|
||||
|
||||
@@ -61,7 +61,7 @@
|
||||
#define PREFETCHSIZE 84
|
||||
#endif
|
||||
|
||||
#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL)
|
||||
#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX)
|
||||
#define PREFETCH prefetcht1
|
||||
#define PREFETCHSIZE 84
|
||||
#endif
|
||||
|
||||
@@ -63,7 +63,7 @@
|
||||
#define PREFETCHSIZE 84
|
||||
#endif
|
||||
|
||||
#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL)
|
||||
#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX)
|
||||
#define PREFETCH prefetcht1
|
||||
#define PREFETCHSIZE 84
|
||||
#endif
|
||||
|
||||
@@ -61,7 +61,7 @@
|
||||
#define PREFETCHSIZE 84
|
||||
#endif
|
||||
|
||||
#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL)
|
||||
#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX)
|
||||
#define PREFETCH prefetcht1
|
||||
#define PREFETCHSIZE 84
|
||||
#endif
|
||||
|
||||
@@ -63,7 +63,7 @@
|
||||
#define PREFETCHSIZE 84
|
||||
#endif
|
||||
|
||||
#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL)
|
||||
#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX)
|
||||
#define PREFETCH prefetcht1
|
||||
#define PREFETCHSIZE 84
|
||||
#endif
|
||||
|
||||
@@ -61,7 +61,7 @@
|
||||
#define PREFETCHSIZE 84
|
||||
#endif
|
||||
|
||||
#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL)
|
||||
#if defined(NEHALEM) || defined(SANDYBRIDGE) || defined(HASWELL) || defined (SKYLAKEX)
|
||||
#define PREFETCH prefetcht1
|
||||
#define PREFETCHSIZE 84
|
||||
#endif
|
||||
|
||||
19
kernel/x86_64/KERNEL.SKYLAKEX
Normal file
19
kernel/x86_64/KERNEL.SKYLAKEX
Normal file
@@ -0,0 +1,19 @@
|
||||
include $(KERNELDIR)/KERNEL.HASWELL
|
||||
|
||||
SGEMMKERNEL = sgemm_kernel_16x4_skylakex.S
|
||||
|
||||
|
||||
DTRMMKERNEL = ../generic/trmmkernel_16x2.c
|
||||
DGEMMKERNEL = dgemm_kernel_16x2_skylakex.S
|
||||
DGEMMINCOPY = ../generic/gemm_ncopy_16.c
|
||||
DGEMMITCOPY = ../generic/gemm_tcopy_16.c
|
||||
DGEMMONCOPY = ../generic/gemm_ncopy_2.c
|
||||
DGEMMOTCOPY = ../generic/gemm_tcopy_2.c
|
||||
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
|
||||
SGEMM_BETA = ../generic/gemm_beta.c
|
||||
DGEMM_BETA = ../generic/gemm_beta.c
|
||||
@@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#include "caxpy_microk_steamroller-2.c"
|
||||
#elif defined(BULLDOZER)
|
||||
#include "caxpy_microk_bulldozer-2.c"
|
||||
#elif defined(HASWELL) || defined(ZEN)
|
||||
#elif defined(HASWELL) || defined(ZEN) || defined(SKYLAKEX)
|
||||
#include "caxpy_microk_haswell-2.c"
|
||||
#elif defined(SANDYBRIDGE)
|
||||
#include "caxpy_microk_sandy-2.c"
|
||||
|
||||
@@ -50,11 +50,11 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
||||
"vmulps (%5), %%ymm0 , %%ymm0 \n\t"
|
||||
#endif
|
||||
|
||||
".align 16 \n\t"
|
||||
".p2align 4 \n\t"
|
||||
"1: \n\t"
|
||||
|
||||
"vmovups (%2,%0,4), %%ymm5 \n\t" // 4 complex values from x
|
||||
".align 2 \n\t"
|
||||
".p2align 1 \n\t"
|
||||
"vmovups 32(%2,%0,4), %%ymm7 \n\t" // 4 complex values from x
|
||||
"vmovups 64(%2,%0,4), %%ymm9 \n\t" // 4 complex values from x
|
||||
"vmovups 96(%2,%0,4), %%ymm11 \n\t" // 4 complex values from x
|
||||
@@ -70,7 +70,7 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
||||
"vpermilps $0xb1 , %%ymm11, %%ymm10 \n\t" // exchange real and imag part
|
||||
|
||||
"vfmadd213ps (%3,%0,4), %%ymm0 , %%ymm5 \n\t"
|
||||
".align 2 \n\t"
|
||||
".p2align 1 \n\t"
|
||||
"vfmadd213ps 32(%3,%0,4), %%ymm0 , %%ymm7 \n\t"
|
||||
"vfmadd213ps 64(%3,%0,4), %%ymm0 , %%ymm9 \n\t"
|
||||
"vfmadd213ps 96(%3,%0,4), %%ymm0 , %%ymm11 \n\t"
|
||||
@@ -96,7 +96,7 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
||||
"vfmadd231ps %%ymm1 , %%ymm10, %%ymm15 \n\t"
|
||||
|
||||
"vmovups %%ymm5 , (%3,%0,4) \n\t"
|
||||
".align 2 \n\t"
|
||||
".p2align 1 \n\t"
|
||||
"vmovups %%ymm7 , 32(%3,%0,4) \n\t"
|
||||
"vmovups %%ymm9 , 64(%3,%0,4) \n\t"
|
||||
"vmovups %%ymm11, 96(%3,%0,4) \n\t"
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user