Compare commits
256 Commits
revert-214
...
revert-232
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
83dae28ae2 | ||
|
|
da986d2e83 | ||
|
|
6bc487de35 | ||
|
|
f95989cbc1 | ||
|
|
04226f1e97 | ||
|
|
0925ef70db | ||
|
|
371e6f73d4 | ||
|
|
d117dfd505 | ||
|
|
883c39773a | ||
|
|
b09b5be0a4 | ||
|
|
bfb5fbdb4d | ||
|
|
3da6d66da9 | ||
|
|
08fa83aba2 | ||
|
|
63d3ee8dfc | ||
|
|
1191db1a49 | ||
|
|
1f6071590d | ||
|
|
0caf1434c9 | ||
|
|
73128f3883 | ||
|
|
cad0d150db | ||
|
|
eba0aeb7cd | ||
|
|
0c07c356c1 | ||
|
|
82b75f97e5 | ||
|
|
7887c45077 | ||
|
|
3e67017ac8 | ||
|
|
b3ac6ee222 | ||
|
|
6082e556cd | ||
|
|
92315173d5 | ||
|
|
351d12b94e | ||
|
|
bf73aa141b | ||
|
|
71e96163db | ||
|
|
819e852ae7 | ||
|
|
4e466d739c | ||
|
|
4c6a457358 | ||
|
|
836c414e22 | ||
|
|
d403eb3c2f | ||
|
|
3cd97f1a80 | ||
|
|
9955f0996f | ||
|
|
430c11e135 | ||
|
|
fbacd2605d | ||
|
|
6fa89b06a1 | ||
|
|
68597002ea | ||
|
|
d2a6285549 | ||
|
|
d999688d1a | ||
|
|
928fe1b28e | ||
|
|
ccc28c6d60 | ||
|
|
ae43b75a6a | ||
|
|
54fc06fd70 | ||
|
|
1df9a2013d | ||
|
|
274ff5cdb8 | ||
|
|
eb2eddf241 | ||
|
|
8691825944 | ||
|
|
7dc8a76f60 | ||
|
|
df857551c0 | ||
|
|
85ccdce8c4 | ||
|
|
aeabe0a83f | ||
|
|
1b90989662 | ||
|
|
e3e8b5cdca | ||
|
|
69b16a894d | ||
|
|
6782e5767d | ||
|
|
48f5a89f92 | ||
|
|
4ae1610f37 | ||
|
|
911c3e2f4b | ||
|
|
fab49e49e5 | ||
|
|
b687fba5bc | ||
|
|
46a8c2519a | ||
|
|
e9437eebd2 | ||
|
|
3a39062cfc | ||
|
|
eaa0be1313 | ||
|
|
6ff013bae0 | ||
|
|
0d669e04bb | ||
|
|
17cdd9f9e1 | ||
|
|
6bcb06fcb1 | ||
|
|
b7315f8401 | ||
|
|
9b19e9e1b0 | ||
|
|
6bd67ddbab | ||
|
|
5da9484d93 | ||
|
|
844629af57 | ||
|
|
2beaa82c05 | ||
|
|
e8a2aed2b9 | ||
|
|
f262031685 | ||
|
|
5f6206fa2d | ||
|
|
f2cde2ccfb | ||
|
|
ba7838d2e1 | ||
|
|
a448884a63 | ||
|
|
17609f88f1 | ||
|
|
3a2df19db6 | ||
|
|
d2093a40d3 | ||
|
|
aa04b0925e | ||
|
|
258ac56e0a | ||
|
|
56837e9d92 | ||
|
|
bb5413863f | ||
|
|
32f5907fef | ||
|
|
ac10236cc8 | ||
|
|
8617d75548 | ||
|
|
c07d78b9e9 | ||
|
|
6355c25dde | ||
|
|
5e244d80f2 | ||
|
|
ede5efebab | ||
|
|
84908d60d2 | ||
|
|
596a22325a | ||
|
|
7f58f3ad0e | ||
|
|
c0d570a357 | ||
|
|
6b83079368 | ||
|
|
673e5a0495 | ||
|
|
bfa2cc7d64 | ||
|
|
e7c4d6705a | ||
|
|
2a1911cc14 | ||
|
|
9f7a9a32e3 | ||
|
|
5d6525c87c | ||
|
|
6cb47ea3f0 | ||
|
|
459bb9291d | ||
|
|
3f1077ce6f | ||
|
|
eb45eb6942 | ||
|
|
f2becb777a | ||
|
|
5997b6b491 | ||
|
|
4b21b646ea | ||
|
|
7ec7b999a5 | ||
|
|
af9ac0898a | ||
|
|
c7b5a459b6 | ||
|
|
9b2f0323d6 | ||
|
|
9f6984fe4b | ||
|
|
42203dafdc | ||
|
|
a4f17a9297 | ||
|
|
733d97b2df | ||
|
|
ea747cf933 | ||
|
|
4de545aa7d | ||
|
|
6e9a93ec19 | ||
|
|
fde8a8e6a0 | ||
|
|
256fc15f5f | ||
|
|
ee498525e0 | ||
|
|
1fec0570f6 | ||
|
|
b5af7b9c78 | ||
|
|
f3c314550c | ||
|
|
847c20c9b7 | ||
|
|
4c22828812 | ||
|
|
e79712d969 | ||
|
|
be09551cdf | ||
|
|
ec1ef6aa9e | ||
|
|
11c59acfb1 | ||
|
|
bf0d92a310 | ||
|
|
db066151ee | ||
|
|
3a55dca2dc | ||
|
|
7d380f7d79 | ||
|
|
300f158d3b | ||
|
|
3635fdbf2b | ||
|
|
b6552b11eb | ||
|
|
5fdf9ad24f | ||
|
|
2fe967c542 | ||
|
|
6d8595351c | ||
|
|
f40200f559 | ||
|
|
a95a5e52b8 | ||
|
|
e3d846ab57 | ||
|
|
8506386d82 | ||
|
|
9ef96b32a6 | ||
|
|
b48c025974 | ||
|
|
a1fce67743 | ||
|
|
103b32fdb7 | ||
|
|
aef9804089 | ||
|
|
303869f572 | ||
|
|
02d9203981 | ||
|
|
7b6808b69c | ||
|
|
321288597c | ||
|
|
be147a9f28 | ||
|
|
c275290ea6 | ||
|
|
b7bbb02447 | ||
|
|
bf1430f7d7 | ||
|
|
dccff2e785 | ||
|
|
5c3458a6e7 | ||
|
|
1776ad82c0 | ||
|
|
4e2f81cfa1 | ||
|
|
acf6002ab2 | ||
|
|
96a794e9fd | ||
|
|
3d36c45116 | ||
|
|
648491e1aa | ||
|
|
2dfb804cb9 | ||
|
|
4c153ec9da | ||
|
|
7eecd8e39c | ||
|
|
f0406a7708 | ||
|
|
561f3fd995 | ||
|
|
30efed14d1 | ||
|
|
af2e7f28fc | ||
|
|
4250e6ed64 | ||
|
|
7b0b7c11d2 | ||
|
|
d14cf1ccf4 | ||
|
|
3f6ab1582a | ||
|
|
28e96458e5 | ||
|
|
95fb98f556 | ||
|
|
4801c6d36b | ||
|
|
9440fa607d | ||
|
|
94db259e5b | ||
|
|
f49f8047ac | ||
|
|
825777faab | ||
|
|
9c89757562 | ||
|
|
b0b7600bef | ||
|
|
9b04baeaee | ||
|
|
8a074b3965 | ||
|
|
211ab03b14 | ||
|
|
1733f927e6 | ||
|
|
182b06d6ad | ||
|
|
7a9050d681 | ||
|
|
0ba29fd262 | ||
|
|
bafa021ed6 | ||
|
|
b89d9762a2 | ||
|
|
08dedf4c5e | ||
|
|
b89c781637 | ||
|
|
dd7ff77f4b | ||
|
|
8fb76134bc | ||
|
|
04d671aae2 | ||
|
|
f69a0be712 | ||
|
|
ae9e8b131e | ||
|
|
9086543f50 | ||
|
|
abea977ded | ||
|
|
6b6c9b1441 | ||
|
|
a97b301aaa | ||
|
|
2f13f04224 | ||
|
|
7c7505a778 | ||
|
|
5a4f1a2118 | ||
|
|
3b761892df | ||
|
|
eebfeba768 | ||
|
|
7684c4f8f8 | ||
|
|
7faf42b7bb | ||
|
|
a575f1e4c7 | ||
|
|
cdbfb891da | ||
|
|
280552b988 | ||
|
|
bbd4bb0154 | ||
|
|
6d3efb2b58 | ||
|
|
d9ff2cd90d | ||
|
|
2a43062de7 | ||
|
|
4ea794a522 | ||
|
|
ece0bfb881 | ||
|
|
1f4b6a5d5d | ||
|
|
be8f70d269 | ||
|
|
e674e1c735 | ||
|
|
6ca898b63b | ||
|
|
26411acd56 | ||
|
|
0ab4076dd8 | ||
|
|
a0caa762b3 | ||
|
|
900d5a3205 | ||
|
|
a17cf36225 | ||
|
|
148c4cc5fd | ||
|
|
d0c3543c3f | ||
|
|
909ad04aef | ||
|
|
417efd41c6 | ||
|
|
9cdc828afa | ||
|
|
7a9a4dbc4f | ||
|
|
a469b32cf4 | ||
|
|
27649b9543 | ||
|
|
16f3df5d35 | ||
|
|
1aded69821 | ||
|
|
c00289ba54 | ||
|
|
8fe794f059 | ||
|
|
74c10b57c6 | ||
|
|
c5495d2056 | ||
|
|
c70496b108 | ||
|
|
ca8d8835f5 | ||
|
|
5a9cce2bf6 |
16
.travis.yml
16
.travis.yml
@@ -17,7 +17,7 @@ matrix:
|
||||
- COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32"
|
||||
script:
|
||||
- set -e
|
||||
- travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
|
||||
- make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
|
||||
- make -C test $COMMON_FLAGS $BTYPE
|
||||
- make -C ctest $COMMON_FLAGS $BTYPE
|
||||
- make -C utest $COMMON_FLAGS $BTYPE
|
||||
@@ -162,16 +162,24 @@ matrix:
|
||||
before_script:
|
||||
- COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32"
|
||||
- brew update
|
||||
- brew install gcc # for gfortran
|
||||
- brew install gcc@8 # for gfortran
|
||||
script:
|
||||
- travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
|
||||
env:
|
||||
- BTYPE="BINARY=64 INTERFACE64=1"
|
||||
- BTYPE="BINARY=64 INTERFACE64=1 FC=gfortran-8"
|
||||
|
||||
- <<: *test-macos
|
||||
osx_image: xcode8.3
|
||||
env:
|
||||
- BTYPE="BINARY=32"
|
||||
- BTYPE="BINARY=32 FC=gfortran-8"
|
||||
|
||||
- <<: *test-macos
|
||||
osx_image: xcode10.1
|
||||
env:
|
||||
- COMMON_FLAGS="NUM_THREADS=32"
|
||||
- CC="/Applications/Xcode-10.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk"
|
||||
- CFLAGS="-O2 -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk -arch arm64 -miphoneos-version-min=10.0"
|
||||
- BTYPE="TARGET=ARMV8 BINARY=64 HOSTCC=clang"
|
||||
|
||||
# whitelist
|
||||
branches:
|
||||
|
||||
@@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5)
|
||||
project(OpenBLAS C ASM)
|
||||
set(OpenBLAS_MAJOR_VERSION 0)
|
||||
set(OpenBLAS_MINOR_VERSION 3)
|
||||
set(OpenBLAS_PATCH_VERSION 7.dev)
|
||||
set(OpenBLAS_PATCH_VERSION 8.dev)
|
||||
set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
|
||||
|
||||
# Adhere to GNU filesystem layout conventions
|
||||
@@ -211,7 +211,8 @@ if (USE_THREAD)
|
||||
target_link_libraries(${OpenBLAS_LIBNAME} ${CMAKE_THREAD_LIBS_INIT})
|
||||
endif()
|
||||
|
||||
if (MSVC OR NOT NOFORTRAN)
|
||||
#if (MSVC OR NOT NOFORTRAN)
|
||||
if (NOT NO_CBLAS)
|
||||
# Broken without fortran on unix
|
||||
add_subdirectory(utest)
|
||||
endif()
|
||||
|
||||
@@ -1,4 +1,46 @@
|
||||
OpenBLAS ChangeLog
|
||||
====================================================================
|
||||
Version 0.3.7
|
||||
11-Aug 2019
|
||||
|
||||
common:
|
||||
* having the gmake special variables TARGET_ARCH or TARGET_MACH
|
||||
defined no longer causes build failures in ctest or utest
|
||||
* defining NO_AFFINITY or USE_TLS to 0 in gmake builds no longer
|
||||
has the same effect as setting them to 1
|
||||
* a new test program was added to allow checking the library for
|
||||
thread safety
|
||||
* a new option USE_LOCKING was added to ensure thread safety when
|
||||
OpenBLAS itself is built without multithreading but will be
|
||||
called from multiple threads.
|
||||
* a build failure on Linux with glibc versions earlier than 2.5
|
||||
was fixed
|
||||
* a runtime error with CPU enumeration (and NO_AFFINITY not set)
|
||||
on glibc 2.6 was fixed
|
||||
* NO_AFFINITY was added to the CMAKE options (and defaults to being
|
||||
active on Linux, as in the gmake builds)
|
||||
|
||||
x86_64:
|
||||
* the build-time logic for detection of AVX512 availability in
|
||||
the processor and compiler was fixed
|
||||
* gmake builds on OSX now set the internal name of the library to
|
||||
libopenblas.0.dylib (consistent with CMAKE)
|
||||
* the Haswell DGEMM kernel received a significant speedup through
|
||||
improved prefetch and load instructions
|
||||
* performance of DGEMM, DTRMM, DTRSM and ZDOT on Zen/Zen2 was markedly
|
||||
increased by avoiding vpermpd instructions
|
||||
* the SKYLAKEX (AVX512) DGEMM helper functions have now been disabled
|
||||
to fix remaining errors in DGEMM, DSYMM and DTRMM
|
||||
|
||||
## POWER:
|
||||
* added support for building on FreeBSD/powerpc64 and FreeBSD/ppc970
|
||||
* added optimized kernels for POWER9 SGEMM and STRMM
|
||||
|
||||
## ARMV7:
|
||||
* fixed the softfp implementations of xAMAX and IxAMAX
|
||||
* removed the predefined -march= flags on both ARMV5 and ARMV6 as
|
||||
they were appropriate for only a subset of platforms
|
||||
|
||||
====================================================================
|
||||
Version 0.3.6
|
||||
29-Apr-2019
|
||||
|
||||
6
Makefile
6
Makefile
@@ -34,7 +34,7 @@ endif
|
||||
|
||||
LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast,$(LAPACK_FFLAGS))
|
||||
|
||||
SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench
|
||||
SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench cpp_thread_test
|
||||
|
||||
.PHONY : all libs netlib $(RELA) test ctest shared install
|
||||
.NOTPARALLEL : all libs $(RELA) prof lapack-test install blas-test
|
||||
@@ -109,6 +109,7 @@ endif
|
||||
ifeq ($(OSNAME), Darwin)
|
||||
@$(MAKE) -C exports dyn
|
||||
@ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib
|
||||
@ln -fs $(LIBDYNNAME) $(LIBPREFIX).$(MAJOR_VERSION).dylib
|
||||
endif
|
||||
ifeq ($(OSNAME), WINNT)
|
||||
@$(MAKE) -C exports dll
|
||||
@@ -127,6 +128,9 @@ endif
|
||||
$(MAKE) -C utest all
|
||||
ifndef NO_CBLAS
|
||||
$(MAKE) -C ctest all
|
||||
ifeq ($(CPP_THREAD_SAFETY_TEST), 1)
|
||||
$(MAKE) -C cpp_thread_test all
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
|
||||
13
Makefile.arm
13
Makefile.arm
@@ -1,7 +1,7 @@
|
||||
ifeq ($(CORE), $(filter $(CORE),ARMV7 CORTEXA9 CORTEXA15))
|
||||
ifeq ($(OSNAME), Android)
|
||||
CCOMMON_OPT += -mfpu=neon -march=armv7-a
|
||||
FCOMMON_OPT += -mfpu=neon -march=armv7-a
|
||||
CCOMMON_OPT += -mfpu=neon
|
||||
FCOMMON_OPT += -mfpu=neon
|
||||
else
|
||||
CCOMMON_OPT += -mfpu=vfpv3 -march=armv7-a
|
||||
FCOMMON_OPT += -mfpu=vfpv3 -march=armv7-a
|
||||
@@ -9,11 +9,6 @@ endif
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), ARMV6)
|
||||
CCOMMON_OPT += -mfpu=vfp -march=armv6
|
||||
FCOMMON_OPT += -mfpu=vfp -march=armv6
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), ARMV5)
|
||||
CCOMMON_OPT += -march=armv5
|
||||
FCOMMON_OPT += -march=armv5
|
||||
CCOMMON_OPT += -mfpu=vfp
|
||||
FCOMMON_OPT += -mfpu=vfp
|
||||
endif
|
||||
|
||||
@@ -83,7 +83,8 @@ ifeq ($(OSNAME), Darwin)
|
||||
@-cp $(LIBDYNNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
|
||||
@-install_name_tool -id "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)" "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)/$(LIBDYNNAME)"
|
||||
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
|
||||
ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib
|
||||
ln -fs $(LIBDYNNAME) $(LIBPREFIX).dylib ; \
|
||||
ln -fs $(LIBDYNNAME) $(LIBPREFIX).$(MAJOR_VERSION).dylib
|
||||
endif
|
||||
ifeq ($(OSNAME), WINNT)
|
||||
@-cp $(LIBDLLNAME) "$(DESTDIR)$(OPENBLAS_BINARY_DIR)"
|
||||
|
||||
@@ -29,6 +29,10 @@ FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -malign-power -fno-fas
|
||||
endif
|
||||
endif
|
||||
|
||||
# workaround for C->FORTRAN ABI violation in LAPACKE
|
||||
ifeq ($(F_COMPILER), GFORTRAN)
|
||||
FCOMMON_OPT += -fno-optimize-sibling-calls
|
||||
endif
|
||||
|
||||
FLAMEPATH = $(HOME)/flame/lib
|
||||
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
#
|
||||
|
||||
# This library's version
|
||||
VERSION = 0.3.7.dev
|
||||
VERSION = 0.3.8.dev
|
||||
|
||||
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
|
||||
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
|
||||
@@ -163,6 +163,10 @@ NO_AFFINITY = 1
|
||||
# Don't use Haswell optimizations if binutils is too old (e.g. RHEL6)
|
||||
# NO_AVX2 = 1
|
||||
|
||||
# Don't use SkylakeX optimizations if binutils or compiler are too old (the build
|
||||
# system will try to determine this automatically)
|
||||
# NO_AVX512 = 1
|
||||
|
||||
# Don't use parallel make.
|
||||
# NO_PARALLEL_MAKE = 1
|
||||
|
||||
@@ -245,6 +249,21 @@ COMMON_PROF = -pg
|
||||
# SYMBOLPREFIX=
|
||||
# SYMBOLSUFFIX=
|
||||
|
||||
# Run a C++ based thread safety tester after the build is done.
|
||||
# This is mostly intended as a developer feature to spot regressions, but users and
|
||||
# package maintainers can enable this if they have doubts about the thread safety of
|
||||
# the library, given the configuration in this file.
|
||||
# By default, the thread safety tester launches 52 concurrent calculations at the same
|
||||
# time.
|
||||
#
|
||||
# Please note that the test uses ~1300 MiB of RAM for the DGEMM test.
|
||||
#
|
||||
# The test requires CBLAS to be built, a C++11 capable compiler and the presence of
|
||||
# an OpenMP implementation. If you are cross-compiling this test will probably not
|
||||
# work at all.
|
||||
#
|
||||
# CPP_THREAD_SAFETY_TEST = 1
|
||||
|
||||
#
|
||||
# End of user configuration
|
||||
#
|
||||
|
||||
@@ -9,6 +9,13 @@ ifndef TOPDIR
|
||||
TOPDIR = .
|
||||
endif
|
||||
|
||||
# If ARCH is not set, we use the host system's architecture for getarch compile options.
|
||||
ifndef ARCH
|
||||
HOSTARCH := $(shell uname -m)
|
||||
else
|
||||
HOSTARCH = $(ARCH)
|
||||
endif
|
||||
|
||||
# Catch conflicting usage of ARCH in some BSD environments
|
||||
ifeq ($(ARCH), amd64)
|
||||
override ARCH=x86_64
|
||||
@@ -137,7 +144,12 @@ endif
|
||||
endif
|
||||
|
||||
|
||||
|
||||
# On x86_64 build getarch with march=native unless the compiler is PGI. This is required to detect AVX512 support in getarch.
|
||||
ifeq ($(HOSTARCH), x86_64)
|
||||
ifeq ($(findstring pgcc,$(HOSTCC)),)
|
||||
GETARCH_FLAGS += -march=native
|
||||
endif
|
||||
endif
|
||||
|
||||
ifdef INTERFACE64
|
||||
ifneq ($(INTERFACE64), 0)
|
||||
@@ -257,9 +269,10 @@ OBJCOPY = $(CROSS_SUFFIX)objcopy
|
||||
OBJCONV = $(CROSS_SUFFIX)objconv
|
||||
|
||||
|
||||
# For detect fortran failed, only build BLAS.
|
||||
# When fortran support was either not detected or actively deselected, only build BLAS.
|
||||
ifeq ($(NOFORTRAN), 1)
|
||||
NO_LAPACK = 1
|
||||
override FEXTRALIB =
|
||||
endif
|
||||
|
||||
#
|
||||
@@ -309,12 +322,13 @@ CCOMMON_OPT += -DMS_ABI
|
||||
endif
|
||||
|
||||
ifeq ($(C_COMPILER), GCC)
|
||||
#Test for supporting MS_ABI
|
||||
#Version tests for supporting specific features (MS_ABI, POWER9 intrinsics)
|
||||
GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4)
|
||||
GCCVERSIONGT4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 4)
|
||||
GCCVERSIONGT5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 5)
|
||||
GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7)
|
||||
ifeq ($(GCCVERSIONGT4), 1)
|
||||
# GCC Majar version > 4
|
||||
# GCC Major version > 4
|
||||
# It is compatible with MSVC ABI.
|
||||
CCOMMON_OPT += -DMS_ABI
|
||||
endif
|
||||
@@ -541,8 +555,17 @@ endif
|
||||
ifeq ($(ARCH), power)
|
||||
DYNAMIC_CORE = POWER6
|
||||
DYNAMIC_CORE += POWER8
|
||||
ifneq ($(C_COMPILER), GCC)
|
||||
DYNAMIC_CORE += POWER9
|
||||
endif
|
||||
ifeq ($(C_COMPILER), GCC)
|
||||
ifeq ($(GCCVERSIONGT5), 1)
|
||||
DYNAMIC_CORE += POWER9
|
||||
else
|
||||
$(info, OpenBLAS: Your gcc version is too old to build the POWER9 kernels.)
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
# If DYNAMIC_CORE is not set, DYNAMIC_ARCH cannot do anything, so force it to empty
|
||||
ifndef DYNAMIC_CORE
|
||||
@@ -686,7 +709,7 @@ endif
|
||||
|
||||
ifeq ($(C_COMPILER), PGI)
|
||||
ifdef BINARY64
|
||||
CCOMMON_OPT += -tp p7-64
|
||||
CCOMMON_OPT += -tp p7-64 -D__MMX__ -Mnollvm
|
||||
else
|
||||
CCOMMON_OPT += -tp p7
|
||||
endif
|
||||
@@ -746,6 +769,9 @@ else
|
||||
FCOMMON_OPT += -m32
|
||||
endif
|
||||
endif
|
||||
ifneq ($(NO_LAPACKE), 1)
|
||||
FCOMMON_OPT += -fno-second-underscore
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
@@ -754,6 +780,8 @@ CCOMMON_OPT += -DF_INTERFACE_GFORT
|
||||
FCOMMON_OPT += -Wall
|
||||
# make single-threaded LAPACK calls thread-safe #1847
|
||||
FCOMMON_OPT += -frecursive
|
||||
# work around ABI problem with passing single-character arguments
|
||||
FCOMMON_OPT += -fno-optimize-sibling-calls
|
||||
#Don't include -lgfortran, when NO_LAPACK=1 or lsbcc
|
||||
ifneq ($(NO_LAPACK), 1)
|
||||
EXTRALIB += -lgfortran
|
||||
@@ -1059,7 +1087,7 @@ ifdef USE_SIMPLE_THREADED_LEVEL3
|
||||
CCOMMON_OPT += -DUSE_SIMPLE_THREADED_LEVEL3
|
||||
endif
|
||||
|
||||
ifdef USE_TLS
|
||||
ifeq ($(USE_TLS), 1)
|
||||
CCOMMON_OPT += -DUSE_TLS
|
||||
endif
|
||||
|
||||
@@ -1112,8 +1140,12 @@ endif
|
||||
endif
|
||||
|
||||
ifdef NO_AFFINITY
|
||||
ifeq ($(NO_AFFINITY), 0)
|
||||
override undefine NO_AFFINITY
|
||||
else
|
||||
CCOMMON_OPT += -DNO_AFFINITY
|
||||
endif
|
||||
endif
|
||||
|
||||
ifdef FUNCTION_PROFILE
|
||||
CCOMMON_OPT += -DFUNCTION_PROFILE
|
||||
|
||||
@@ -28,11 +28,15 @@ endif
|
||||
ifeq ($(CORE), HASWELL)
|
||||
ifndef DYNAMIC_ARCH
|
||||
ifndef NO_AVX2
|
||||
ifeq ($(C_COMPILER), GCC)
|
||||
CCOMMON_OPT += -mavx2
|
||||
endif
|
||||
ifeq ($(F_COMPILER), GFORTRAN)
|
||||
FCOMMON_OPT += -mavx2
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
|
||||
|
||||
|
||||
18
appveyor.yml
18
appveyor.yml
@@ -35,7 +35,15 @@ environment:
|
||||
DYNAMIC_ARCH: ON
|
||||
WITH_FORTRAN: no
|
||||
- COMPILER: cl
|
||||
|
||||
- COMPILER: MinGW64-gcc-7.2.0-mingw
|
||||
DYNAMIC_ARCH: OFF
|
||||
WITH_FORTRAN: ignore
|
||||
- APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015
|
||||
COMPILER: MinGW-gcc-6.3.0-32
|
||||
- APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015
|
||||
COMPILER: MinGW-gcc-5.3.0
|
||||
WITH_FORTRAN: ignore
|
||||
|
||||
install:
|
||||
- if [%COMPILER%]==[clang-cl] call %CONDA_INSTALL_LOCN%\Scripts\activate.bat
|
||||
- if [%COMPILER%]==[clang-cl] conda config --add channels conda-forge --force
|
||||
@@ -52,7 +60,14 @@ install:
|
||||
before_build:
|
||||
- ps: if (-Not (Test-Path .\build)) { mkdir build }
|
||||
- cd build
|
||||
- set PATH=%PATH:C:\Program Files\Git\usr\bin;=%
|
||||
- if [%COMPILER%]==[MinGW-gcc-5.3.0] set PATH=C:\MinGW\bin;C:\msys64\usr\bin;C:\mingw-w64\x86_64-7.2.0-posix-seh-rt_v5-rev1\mingw64\bin;%PATH%
|
||||
- if [%COMPILER%]==[MinGW64-gcc-7.2.0-mingw] set PATH=C:\MinGW\bin;C:\mingw-w64\x86_64-7.2.0-posix-seh-rt_v5-rev1\mingw64\bin;%PATH%
|
||||
- if [%COMPILER%]==[MinGW-gcc-6.3.0-32] set PATH=C:\msys64\usr\bin;C:\mingw-w64\i686-6.3.0-posix-dwarf-rt_v5-rev1\mingw64\bin;%PATH%
|
||||
- if [%COMPILER%]==[cl] cmake -G "Visual Studio 15 2017 Win64" ..
|
||||
- if [%COMPILER%]==[MinGW64-gcc-7.2.0-mingw] cmake -G "MinGW Makefiles" -DNOFORTRAN=1 ..
|
||||
- if [%COMPILER%]==[MinGW-gcc-6.3.0-32] cmake -G "MSYS Makefiles" -DNOFORTRAN=1 ..
|
||||
- if [%COMPILER%]==[MinGW-gcc-5.3.0] cmake -G "MSYS Makefiles" -DNOFORTRAN=1 ..
|
||||
- if [%WITH_FORTRAN%]==[no] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DMSVC_STATIC_CRT=ON ..
|
||||
- if [%WITH_FORTRAN%]==[yes] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DBUILD_WITHOUT_LAPACK=no -DNOFORTRAN=0 ..
|
||||
- if [%DYNAMIC_ARCH%]==[ON] cmake -DDYNAMIC_ARCH=ON -DDYNAMIC_LIST='CORE2;NEHALEM;SANDYBRIDGE;BULLDOZER;HASWELL' ..
|
||||
@@ -64,3 +79,4 @@ test_script:
|
||||
- echo Running Test
|
||||
- cd utest
|
||||
- openblas_utest
|
||||
|
||||
|
||||
@@ -207,7 +207,7 @@ int main(int argc, char *argv[]){
|
||||
for (i = 0; i < m * n * COMPSIZE; i++) {
|
||||
c[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
}
|
||||
|
||||
|
||||
fprintf(stderr, " SIZE Flops Time\n");
|
||||
|
||||
for (i = from; i <= to; i += step) {
|
||||
|
||||
15
c_check
15
c_check
@@ -240,7 +240,7 @@ if (($architecture eq "x86") || ($architecture eq "x86_64")) {
|
||||
} else {
|
||||
$no_avx512 = 0;
|
||||
}
|
||||
unlink("tmpf.o");
|
||||
unlink("$tmpf.o");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -260,6 +260,19 @@ if ($architecture ne $hostarch) {
|
||||
|
||||
$cross = 1 if ($os ne $hostos);
|
||||
|
||||
# rework cross suffix and architecture if we are on OSX cross-compiling for ARMV8-based IOS
|
||||
# the initial autodetection will have been confused by the command-line arguments to clang
|
||||
# and the cross-compiler apparently still claims to build for x86_64 in its CC -E output
|
||||
if (($os eq "Darwin") && ($cross_suffix ne "")) {
|
||||
my $tmpnam = `xcrun --sdk iphoneos --find clang`;
|
||||
$cross_suffix = substr($tmpnam, 0, rindex($tmpnam, "/")+1 );
|
||||
# this should produce something like $cross_suffix="/Applications/Xcode-10.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/";
|
||||
$cross =1;
|
||||
$architecture = arm64;
|
||||
}
|
||||
|
||||
|
||||
|
||||
$openmp = "" if $ENV{USE_OPENMP} != 1;
|
||||
|
||||
$linker_L = "";
|
||||
|
||||
@@ -73,14 +73,16 @@ if (DYNAMIC_ARCH)
|
||||
endif ()
|
||||
if (NOT NO_AVX512)
|
||||
set(DYNAMIC_CORE ${DYNAMIC_CORE} SKYLAKEX)
|
||||
endif ()
|
||||
string(REGEX REPLACE "-march=native" "" CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
|
||||
endif ()
|
||||
if (DYNAMIC_LIST)
|
||||
set(DYNAMIC_CORE PRESCOTT ${DYNAMIC_LIST})
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
if (NOT DYNAMIC_CORE)
|
||||
unset(DYNAMIC_ARCH)
|
||||
message (STATUS "DYNAMIC_ARCH is not supported on this architecture, removing from options")
|
||||
unset(DYNAMIC_ARCH CACHE)
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
## Description: Ported from portion of OpenBLAS/Makefile.system
|
||||
## Sets C related variables.
|
||||
|
||||
if (${CMAKE_C_COMPILER} STREQUAL "GNU" OR ${CMAKE_C_COMPILER} STREQUAL "LSB" OR ${CMAKE_C_COMPILER} STREQUAL "Clang")
|
||||
if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "LSB" OR ${CMAKE_C_COMPILER_ID} MATCHES "Clang")
|
||||
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -Wall")
|
||||
set(COMMON_PROF "${COMMON_PROF} -fno-inline")
|
||||
@@ -43,7 +43,7 @@ if (${CMAKE_C_COMPILER} STREQUAL "GNU" OR ${CMAKE_C_COMPILER} STREQUAL "LSB" OR
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
if (${CMAKE_C_COMPILER} STREQUAL "PGI")
|
||||
if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI")
|
||||
if (BINARY64)
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -tp p7-64")
|
||||
else ()
|
||||
@@ -51,7 +51,7 @@ if (${CMAKE_C_COMPILER} STREQUAL "PGI")
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
if (${CMAKE_C_COMPILER} STREQUAL "PATHSCALE")
|
||||
if (${CMAKE_C_COMPILER_ID} STREQUAL "PATHSCALE")
|
||||
if (BINARY64)
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -m64")
|
||||
else ()
|
||||
@@ -59,7 +59,7 @@ if (${CMAKE_C_COMPILER} STREQUAL "PATHSCALE")
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
if (${CMAKE_C_COMPILER} STREQUAL "OPEN64")
|
||||
if (${CMAKE_C_COMPILER_ID} STREQUAL "OPEN64")
|
||||
|
||||
if (MIPS64)
|
||||
|
||||
@@ -87,7 +87,7 @@ if (${CMAKE_C_COMPILER} STREQUAL "OPEN64")
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
if (${CMAKE_C_COMPILER} STREQUAL "SUN")
|
||||
if (${CMAKE_C_COMPILER_ID} STREQUAL "SUN")
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -w")
|
||||
if (X86)
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -m32")
|
||||
|
||||
@@ -44,7 +44,10 @@ endif ()
|
||||
|
||||
if (${F_COMPILER} STREQUAL "GFORTRAN")
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_GFORT")
|
||||
# ensure reentrancy of lapack codes
|
||||
set(FCOMMON_OPT "${FCOMMON_OPT} -Wall -frecursive")
|
||||
# work around ABI violation in passing string arguments from C
|
||||
set(FCOMMON_OPT "${FCOMMON_OPT} -fno-optimize-sibling-calls")
|
||||
#Don't include -lgfortran, when NO_LAPACK=1 or lsbcc
|
||||
if (NOT NO_LAPACK)
|
||||
set(EXTRALIB "{EXTRALIB} -lgfortran")
|
||||
|
||||
@@ -59,6 +59,9 @@ set(FU "")
|
||||
if (APPLE OR (MSVC AND NOT ${CMAKE_C_COMPILER_ID} MATCHES "Clang"))
|
||||
set(FU "_")
|
||||
endif()
|
||||
if(MINGW AND NOT MINGW64)
|
||||
set(FU "_")
|
||||
endif()
|
||||
|
||||
set(COMPILER_ID ${CMAKE_C_COMPILER_ID})
|
||||
if (${COMPILER_ID} STREQUAL "GNU")
|
||||
@@ -82,6 +85,11 @@ endif ()
|
||||
# f_check
|
||||
if (NOT NOFORTRAN)
|
||||
include("${PROJECT_SOURCE_DIR}/cmake/f_check.cmake")
|
||||
else ()
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define BUNDERSCORE _\n"
|
||||
"#define NEEDBUNDERSCORE 1\n")
|
||||
set(BU "_")
|
||||
endif ()
|
||||
|
||||
# Cannot run getarch on target if we are cross-compiling
|
||||
@@ -97,8 +105,39 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS
|
||||
# Perhaps this should be inside a different file as it grows larger
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define ${TCORE}\n"
|
||||
"#define CORE_${TCORE}\n"
|
||||
"#define CHAR_CORENAME \"${TCORE}\"\n")
|
||||
if ("${TCORE}" STREQUAL "ARMV7")
|
||||
if ("${TCORE}" STREQUAL "CORE2")
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define L1_DATA_SIZE\t32768\n"
|
||||
"#define L1_DATA_LINESIZE\t64\n"
|
||||
"#define L2_SIZE\t1048576\n"
|
||||
"#define L2_LINESIZE\t64\n"
|
||||
"#define DTB_DEFAULT_ENTRIES\t256\n"
|
||||
"#define DTB_SIZE\t4096\n"
|
||||
"#define HAVE_CMOV\n"
|
||||
"#define HAVE_MMX\n"
|
||||
"#define HAVE_SSE\n"
|
||||
"#define HAVE_SSE2\n"
|
||||
"#define HAVE_SSE3\n"
|
||||
"#define HAVE_SSSE3\n"
|
||||
"#define SLOCAL_BUFFER_SIZE\t16384\n"
|
||||
"#define DLOCAL_BUFFER_SIZE\t16384\n"
|
||||
"#define CLOCAL_BUFFER_SIZE\t16384\n"
|
||||
"#define ZLOCAL_BUFFER_SIZE\t16384\n")
|
||||
set(SGEMM_UNROLL_M 8)
|
||||
set(SGEMM_UNROLL_N 4)
|
||||
set(DGEMM_UNROLL_M 4)
|
||||
set(DGEMM_UNROLL_N 4)
|
||||
set(CGEMM_UNROLL_M 4)
|
||||
set(CGEMM_UNROLL_N 2)
|
||||
set(ZGEMM_UNROLL_M 2)
|
||||
set(ZGEMM_UNROLL_N 2)
|
||||
set(CGEMM3M_UNROLL_M 8)
|
||||
set(CGEMM3M_UNROLL_N 4)
|
||||
set(ZGEMM3M_UNROLL_M 4)
|
||||
set(ZGEMM3M_UNROLL_N 4)
|
||||
elseif ("${TCORE}" STREQUAL "ARMV7")
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define L1_DATA_SIZE\t65536\n"
|
||||
"#define L1_DATA_LINESIZE\t32\n"
|
||||
@@ -113,6 +152,10 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS
|
||||
set(SGEMM_UNROLL_N 4)
|
||||
set(DGEMM_UNROLL_M 4)
|
||||
set(DGEMM_UNROLL_N 4)
|
||||
set(CGEMM_UNROLL_M 2)
|
||||
set(CGEMM_UNROLL_N 2)
|
||||
set(ZGEMM_UNROLL_M 2)
|
||||
set(ZGEMM_UNROLL_N 2)
|
||||
elseif ("${TCORE}" STREQUAL "ARMV8")
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define L1_DATA_SIZE\t32768\n"
|
||||
@@ -301,6 +344,9 @@ else(NOT CMAKE_CROSSCOMPILING)
|
||||
set(GETARCH_FLAGS ${GETARCH_FLAGS} -DFORCE_GENERIC)
|
||||
else()
|
||||
list(APPEND GETARCH_SRC ${PROJECT_SOURCE_DIR}/cpuid.S)
|
||||
if (DEFINED TARGET_CORE)
|
||||
set(GETARCH_FLAGS ${GETARCH_FLAGS} -DFORCE_${TARGET_CORE})
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
if ("${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore")
|
||||
|
||||
@@ -65,6 +65,18 @@ if (DEFINED TARGET)
|
||||
set(GETARCH_FLAGS "-DFORCE_${TARGET}")
|
||||
endif ()
|
||||
|
||||
# On x86_64 build getarch with march=native. This is required to detect AVX512 support in getarch.
|
||||
if (X86_64 AND NOT ${CMAKE_C_COMPILER_ID} STREQUAL "PGI")
|
||||
set(GETARCH_FLAGS "${GETARCH_FLAGS} -march=native")
|
||||
endif ()
|
||||
|
||||
# On x86 no AVX support is available
|
||||
if (X86 OR X86_64)
|
||||
if ((DEFINED BINARY AND BINARY EQUAL 32) OR ("$CMAKE_SIZEOF_VOID_P}" EQUAL "4"))
|
||||
set(GETARCH_FLAGS "${GETARCH_FLAGS} -DNO_AVX -DNO_AVX2 -DNO_AVX512")
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
if (INTERFACE64)
|
||||
message(STATUS "Using 64-bit integers.")
|
||||
set(GETARCH_FLAGS "${GETARCH_FLAGS} -DUSE64BITINT")
|
||||
@@ -143,7 +155,9 @@ else()
|
||||
endif ()
|
||||
|
||||
include("${PROJECT_SOURCE_DIR}/cmake/prebuild.cmake")
|
||||
|
||||
if (DEFINED BINARY)
|
||||
message(STATUS "Compiling a ${BINARY}-bit binary.")
|
||||
endif ()
|
||||
if (NOT DEFINED NEED_PIC)
|
||||
set(NEED_PIC 1)
|
||||
endif ()
|
||||
@@ -160,6 +174,9 @@ include("${PROJECT_SOURCE_DIR}/cmake/cc.cmake")
|
||||
if (NOT NOFORTRAN)
|
||||
# Fortran Compiler dependent settings
|
||||
include("${PROJECT_SOURCE_DIR}/cmake/fc.cmake")
|
||||
else ()
|
||||
set(NO_LAPACK 1)
|
||||
set(NO_LAPACKE 1)
|
||||
endif ()
|
||||
|
||||
if (BINARY64)
|
||||
@@ -185,9 +202,14 @@ if (NEED_PIC)
|
||||
endif ()
|
||||
|
||||
if (DYNAMIC_ARCH)
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_ARCH")
|
||||
if (DYNAMIC_OLDER)
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_OLDER")
|
||||
if (X86 OR X86_64 OR ARM64 OR PPC)
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_ARCH")
|
||||
if (DYNAMIC_OLDER)
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_OLDER")
|
||||
endif ()
|
||||
else ()
|
||||
unset (DYNAMIC_ARCH)
|
||||
message (STATUS "DYNAMIC_ARCH is not supported on the target architecture, removing")
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
|
||||
@@ -15,7 +15,7 @@ if (${HOST_OS} STREQUAL "LINUX")
|
||||
EXECUTE_PROCESS( COMMAND uname -o COMMAND tr -d '\n' OUTPUT_VARIABLE OPERATING_SYSTEM)
|
||||
if(${OPERATING_SYSTEM} MATCHES "Android")
|
||||
set(HOST_OS ANDROID)
|
||||
endif(${OPERATING_SYSTEM} MATCHES "Android")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
|
||||
@@ -39,10 +39,18 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc.*|power.*|Power.*")
|
||||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "mips64.*")
|
||||
set(MIPS64 1)
|
||||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*")
|
||||
if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
|
||||
set(X86_64 1)
|
||||
if (NOT BINARY)
|
||||
if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
|
||||
set(X86_64 1)
|
||||
else()
|
||||
set(X86 1)
|
||||
endif()
|
||||
else()
|
||||
set(X86 1)
|
||||
if (${BINARY} EQUAL "64")
|
||||
set(X86_64 1)
|
||||
else ()
|
||||
set(X86 1)
|
||||
endif()
|
||||
endif()
|
||||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "i686.*|i386.*|x86.*|amd64.*|AMD64.*")
|
||||
set(X86 1)
|
||||
@@ -54,6 +62,22 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64.*|AARCH64.*)")
|
||||
else()
|
||||
set(ARM 1)
|
||||
endif()
|
||||
elseif (${CMAKE_CROSSCOMPILING})
|
||||
if (${TARGET} STREQUAL "CORE2")
|
||||
if (NOT BINARY)
|
||||
set(X86 1)
|
||||
elseif (${BINARY} EQUAL "64")
|
||||
set(X86_64 1)
|
||||
else ()
|
||||
set(X86 1)
|
||||
endif()
|
||||
elseif (${TARGET} STREQUAL "ARMV7")
|
||||
set(ARM 1)
|
||||
else()
|
||||
set(ARM64 1)
|
||||
endif ()
|
||||
else ()
|
||||
message(WARNING "Target ARCH could not be determined, got \"${CMAKE_SYSTEM_PROCESSOR}\"")
|
||||
endif()
|
||||
|
||||
if (X86_64)
|
||||
@@ -92,4 +116,3 @@ set (CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX512")
|
||||
endif()
|
||||
file(REMOVE "avx512.tmp" "avx512.o")
|
||||
endif()
|
||||
|
||||
|
||||
@@ -78,7 +78,18 @@ static void __inline blas_lock(volatile BLASULONG *address){
|
||||
|
||||
#define BLAS_LOCK_DEFINED
|
||||
|
||||
#if !defined(OS_DARWIN) && !defined (OS_ANDROID)
|
||||
static __inline BLASULONG rpcc(void){
|
||||
BLASULONG ret = 0;
|
||||
|
||||
__asm__ __volatile__ ("isb; mrs %0,cntvct_el0":"=r"(ret));
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
#define RPCC_DEFINED
|
||||
#define RPCC64BIT
|
||||
#endif
|
||||
|
||||
static inline int blas_quickdivide(blasint x, blasint y){
|
||||
return x / y;
|
||||
@@ -103,12 +114,16 @@ static inline int blas_quickdivide(blasint x, blasint y){
|
||||
|
||||
#if defined(ASSEMBLER) && !defined(NEEDPARAM)
|
||||
|
||||
#define PROLOGUE \
|
||||
.text ;\
|
||||
.align 4 ;\
|
||||
.global REALNAME ;\
|
||||
.type REALNAME, %function ;\
|
||||
.macro PROLOGUE
|
||||
.text ;
|
||||
.p2align 2 ;
|
||||
.global REALNAME ;
|
||||
#ifndef __APPLE__
|
||||
.type REALNAME, %function ;
|
||||
#endif
|
||||
REALNAME:
|
||||
.endm
|
||||
|
||||
|
||||
#define EPILOGUE
|
||||
|
||||
|
||||
146
common_lapack.h
146
common_lapack.h
@@ -293,4 +293,150 @@ blasint zlarf_R(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLO
|
||||
blasint xlarf_L(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xlarf_R(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
|
||||
blasint strtrs_UNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint strtrs_UNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint strtrs_UTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint strtrs_UTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint strtrs_LNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint strtrs_LNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint strtrs_LTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint strtrs_LTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint dtrtrs_UNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint dtrtrs_UNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint dtrtrs_UTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint dtrtrs_UTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint dtrtrs_LNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint dtrtrs_LNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint dtrtrs_LTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint dtrtrs_LTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint qtrtrs_UNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint qtrtrs_UNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint qtrtrs_UTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint qtrtrs_UTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint qtrtrs_LNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint qtrtrs_LNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint qtrtrs_LTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint qtrtrs_LTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint ctrtrs_UNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_UNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_UTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_UTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_URU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_URN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_UCU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_UCN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_LNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_LNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_LTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_LTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_LRU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_LRN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_LCU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_LCN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ztrtrs_UNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_UNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_UTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_UTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_URU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_URN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_UCU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_UCN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_LNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_LNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_LTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_LTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_LRU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_LRN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_LCU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_LCN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint xtrtrs_UNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_UNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_UTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_UTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_URU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_URN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_UCU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_UCN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_LNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_LNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_LTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_LTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_LRU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_LRN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_LCU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_LCN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
|
||||
blasint strtrs_UNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint strtrs_UNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint strtrs_UTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint strtrs_UTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint strtrs_LNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint strtrs_LNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint strtrs_LTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint strtrs_LTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint dtrtrs_UNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint dtrtrs_UNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint dtrtrs_UTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint dtrtrs_UTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint dtrtrs_LNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint dtrtrs_LNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint dtrtrs_LTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint dtrtrs_LTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint qtrtrs_UNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint qtrtrs_UNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint qtrtrs_UTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint qtrtrs_UTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint qtrtrs_LNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint qtrtrs_LNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint qtrtrs_LTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint qtrtrs_LTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint ctrtrs_UNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_UNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_UTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_UTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_URU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_URN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_UCU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_UCN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_LNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_LNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_LTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_LTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_LRU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_LRN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_LCU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_LCN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ztrtrs_UNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_UNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_UTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_UTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_URU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_URN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_UCU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_UCN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_LNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_LNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_LTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_LTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_LRU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_LRN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_LCU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_LCN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint xtrtrs_UNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_UNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_UTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_UTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_URU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_URN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_UCU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_UCN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_LNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_LNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_LTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_LTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_LRU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_LRN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_LCU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_LCN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
|
||||
#endif
|
||||
|
||||
165
common_macro.h
165
common_macro.h
@@ -641,7 +641,7 @@
|
||||
#define IMATCOPY_K_CT DIMATCOPY_K_CT
|
||||
#define IMATCOPY_K_RT DIMATCOPY_K_RT
|
||||
|
||||
#define GEADD_K DGEADD_K
|
||||
#define GEADD_K DGEADD_K
|
||||
#else
|
||||
|
||||
#define AMAX_K SAMAX_K
|
||||
@@ -944,7 +944,7 @@
|
||||
#define IMATCOPY_K_CT SIMATCOPY_K_CT
|
||||
#define IMATCOPY_K_RT SIMATCOPY_K_RT
|
||||
|
||||
#define GEADD_K SGEADD_K
|
||||
#define GEADD_K SGEADD_K
|
||||
#endif
|
||||
#else
|
||||
#ifdef XDOUBLE
|
||||
@@ -1770,7 +1770,7 @@
|
||||
#define IMATCOPY_K_CTC ZIMATCOPY_K_CTC
|
||||
#define IMATCOPY_K_RTC ZIMATCOPY_K_RTC
|
||||
|
||||
#define GEADD_K ZGEADD_K
|
||||
#define GEADD_K ZGEADD_K
|
||||
|
||||
#else
|
||||
|
||||
@@ -2193,7 +2193,7 @@
|
||||
#define IMATCOPY_K_CTC CIMATCOPY_K_CTC
|
||||
#define IMATCOPY_K_RTC CIMATCOPY_K_RTC
|
||||
|
||||
#define GEADD_K CGEADD_K
|
||||
#define GEADD_K CGEADD_K
|
||||
|
||||
#endif
|
||||
#endif
|
||||
@@ -2806,3 +2806,160 @@ typedef struct {
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
#ifndef COMPLEX
|
||||
#ifdef XDOUBLE
|
||||
#define TRTRS_UNU_SINGLE qtrtrs_UNU_single
|
||||
#define TRTRS_UNN_SINGLE qtrtrs_UNN_single
|
||||
#define TRTRS_UTU_SINGLE qtrtrs_UTU_single
|
||||
#define TRTRS_UTN_SINGLE qtrtrs_UTN_single
|
||||
#define TRTRS_LNU_SINGLE qtrtrs_LNU_single
|
||||
#define TRTRS_LNN_SINGLE qtrtrs_LNN_single
|
||||
#define TRTRS_LTU_SINGLE qtrtrs_LTU_single
|
||||
#define TRTRS_LTN_SINGLE qtrtrs_LTN_single
|
||||
#define TRTRS_UNU_PARALLEL qtrtrs_UNU_parallel
|
||||
#define TRTRS_UNN_PARALLEL qtrtrs_UNN_parallel
|
||||
#define TRTRS_UTU_PARALLEL qtrtrs_UTU_parallel
|
||||
#define TRTRS_UTN_PARALLEL qtrtrs_UTN_parallel
|
||||
#define TRTRS_LNU_PARALLEL qtrtrs_LNU_parallel
|
||||
#define TRTRS_LNN_PARALLEL qtrtrs_LNN_parallel
|
||||
#define TRTRS_LTU_PARALLEL qtrtrs_LTU_parallel
|
||||
#define TRTRS_LTN_PARALLEL qtrtrs_LTN_parallel
|
||||
|
||||
#elif defined(DOUBLE)
|
||||
#define TRTRS_UNU_SINGLE dtrtrs_UNU_single
|
||||
#define TRTRS_UNN_SINGLE dtrtrs_UNN_single
|
||||
#define TRTRS_UTU_SINGLE dtrtrs_UTU_single
|
||||
#define TRTRS_UTN_SINGLE dtrtrs_UTN_single
|
||||
#define TRTRS_LNU_SINGLE dtrtrs_LNU_single
|
||||
#define TRTRS_LNN_SINGLE dtrtrs_LNN_single
|
||||
#define TRTRS_LTU_SINGLE dtrtrs_LTU_single
|
||||
#define TRTRS_LTN_SINGLE dtrtrs_LTN_single
|
||||
#define TRTRS_UNU_PARALLEL dtrtrs_UNU_parallel
|
||||
#define TRTRS_UNN_PARALLEL dtrtrs_UNN_parallel
|
||||
#define TRTRS_UTU_PARALLEL dtrtrs_UTU_parallel
|
||||
#define TRTRS_UTN_PARALLEL dtrtrs_UTN_parallel
|
||||
#define TRTRS_LNU_PARALLEL dtrtrs_LNU_parallel
|
||||
#define TRTRS_LNN_PARALLEL dtrtrs_LNN_parallel
|
||||
#define TRTRS_LTU_PARALLEL dtrtrs_LTU_parallel
|
||||
#define TRTRS_LTN_PARALLEL dtrtrs_LTN_parallel
|
||||
#else
|
||||
#define TRTRS_UNU_SINGLE strtrs_UNU_single
|
||||
#define TRTRS_UNN_SINGLE strtrs_UNN_single
|
||||
#define TRTRS_UTU_SINGLE strtrs_UTU_single
|
||||
#define TRTRS_UTN_SINGLE strtrs_UTN_single
|
||||
#define TRTRS_LNU_SINGLE strtrs_LNU_single
|
||||
#define TRTRS_LNN_SINGLE strtrs_LNN_single
|
||||
#define TRTRS_LTU_SINGLE strtrs_LTU_single
|
||||
#define TRTRS_LTN_SINGLE strtrs_LTN_single
|
||||
#define TRTRS_UNU_PARALLEL strtrs_UNU_parallel
|
||||
#define TRTRS_UNN_PARALLEL strtrs_UNN_parallel
|
||||
#define TRTRS_UTU_PARALLEL strtrs_UTU_parallel
|
||||
#define TRTRS_UTN_PARALLEL strtrs_UTN_parallel
|
||||
#define TRTRS_LNU_PARALLEL strtrs_LNU_parallel
|
||||
#define TRTRS_LNN_PARALLEL strtrs_LNN_parallel
|
||||
#define TRTRS_LTU_PARALLEL strtrs_LTU_parallel
|
||||
#define TRTRS_LTN_PARALLEL strtrs_LTN_parallel
|
||||
#endif
|
||||
#else
|
||||
#ifdef XDOUBLE
|
||||
#define TRTRS_UNU_SINGLE xtrtrs_UNU_single
|
||||
#define TRTRS_UNN_SINGLE xtrtrs_UNN_single
|
||||
#define TRTRS_UTU_SINGLE xtrtrs_UTU_single
|
||||
#define TRTRS_UTN_SINGLE xtrtrs_UTN_single
|
||||
#define TRTRS_URU_SINGLE xtrtrs_URU_single
|
||||
#define TRTRS_URN_SINGLE xtrtrs_URN_single
|
||||
#define TRTRS_UCU_SINGLE xtrtrs_UCU_single
|
||||
#define TRTRS_UCN_SINGLE xtrtrs_UCN_single
|
||||
#define TRTRS_LNU_SINGLE xtrtrs_LNU_single
|
||||
#define TRTRS_LNN_SINGLE xtrtrs_LNN_single
|
||||
#define TRTRS_LTU_SINGLE xtrtrs_LTU_single
|
||||
#define TRTRS_LTN_SINGLE xtrtrs_LTN_single
|
||||
#define TRTRS_LRU_SINGLE xtrtrs_LRU_single
|
||||
#define TRTRS_LRN_SINGLE xtrtrs_LRN_single
|
||||
#define TRTRS_LCU_SINGLE xtrtrs_LCU_single
|
||||
#define TRTRS_LCN_SINGLE xtrtrs_LCN_single
|
||||
#define TRTRS_UNU_PARALLEL xtrtrs_UNU_parallel
|
||||
#define TRTRS_UNN_PARALLEL xtrtrs_UNN_parallel
|
||||
#define TRTRS_UTU_PARALLEL xtrtrs_UTU_parallel
|
||||
#define TRTRS_UTN_PARALLEL xtrtrs_UTN_parallel
|
||||
#define TRTRS_URU_PARALLEL xtrtrs_URU_parallel
|
||||
#define TRTRS_URN_PARALLEL xtrtrs_URN_parallel
|
||||
#define TRTRS_UCU_PARALLEL xtrtrs_UCU_parallel
|
||||
#define TRTRS_UCN_PARALLEL xtrtrs_UCN_parallel
|
||||
#define TRTRS_LNU_PARALLEL xtrtrs_LNU_parallel
|
||||
#define TRTRS_LNN_PARALLEL xtrtrs_LNN_parallel
|
||||
#define TRTRS_LTU_PARALLEL xtrtrs_LTU_parallel
|
||||
#define TRTRS_LTN_PARALLEL xtrtrs_LTN_parallel
|
||||
#define TRTRS_LRU_PARALLEL xtrtrs_LRU_parallel
|
||||
#define TRTRS_LRN_PARALLEL xtrtrs_LRN_parallel
|
||||
#define TRTRS_LCU_PARALLEL xtrtrs_LCU_parallel
|
||||
#define TRTRS_LCN_PARALLEL xtrtrs_LCN_parallel
|
||||
#elif defined(DOUBLE)
|
||||
#define TRTRS_UNU_SINGLE ztrtrs_UNU_single
|
||||
#define TRTRS_UNN_SINGLE ztrtrs_UNN_single
|
||||
#define TRTRS_UTU_SINGLE ztrtrs_UTU_single
|
||||
#define TRTRS_UTN_SINGLE ztrtrs_UTN_single
|
||||
#define TRTRS_URU_SINGLE ztrtrs_URU_single
|
||||
#define TRTRS_URN_SINGLE ztrtrs_URN_single
|
||||
#define TRTRS_UCU_SINGLE ztrtrs_UCU_single
|
||||
#define TRTRS_UCN_SINGLE ztrtrs_UCN_single
|
||||
#define TRTRS_LNU_SINGLE ztrtrs_LNU_single
|
||||
#define TRTRS_LNN_SINGLE ztrtrs_LNN_single
|
||||
#define TRTRS_LTU_SINGLE ztrtrs_LTU_single
|
||||
#define TRTRS_LTN_SINGLE ztrtrs_LTN_single
|
||||
#define TRTRS_LRU_SINGLE ztrtrs_LRU_single
|
||||
#define TRTRS_LRN_SINGLE ztrtrs_LRN_single
|
||||
#define TRTRS_LCU_SINGLE ztrtrs_LCU_single
|
||||
#define TRTRS_LCN_SINGLE ztrtrs_LCN_single
|
||||
#define TRTRS_UNU_PARALLEL ztrtrs_UNU_parallel
|
||||
#define TRTRS_UNN_PARALLEL ztrtrs_UNN_parallel
|
||||
#define TRTRS_UTU_PARALLEL ztrtrs_UTU_parallel
|
||||
#define TRTRS_UTN_PARALLEL ztrtrs_UTN_parallel
|
||||
#define TRTRS_URU_PARALLEL ztrtrs_URU_parallel
|
||||
#define TRTRS_URN_PARALLEL ztrtrs_URN_parallel
|
||||
#define TRTRS_UCU_PARALLEL ztrtrs_UCU_parallel
|
||||
#define TRTRS_UCN_PARALLEL ztrtrs_UCN_parallel
|
||||
#define TRTRS_LNU_PARALLEL ztrtrs_LNU_parallel
|
||||
#define TRTRS_LNN_PARALLEL ztrtrs_LNN_parallel
|
||||
#define TRTRS_LTU_PARALLEL ztrtrs_LTU_parallel
|
||||
#define TRTRS_LTN_PARALLEL ztrtrs_LTN_parallel
|
||||
#define TRTRS_LRU_PARALLEL ztrtrs_LRU_parallel
|
||||
#define TRTRS_LRN_PARALLEL ztrtrs_LRN_parallel
|
||||
#define TRTRS_LCU_PARALLEL ztrtrs_LCU_parallel
|
||||
#define TRTRS_LCN_PARALLEL ztrtrs_LCN_parallel
|
||||
#else
|
||||
#define TRTRS_UNU_SINGLE ctrtrs_UNU_single
|
||||
#define TRTRS_UNN_SINGLE ctrtrs_UNN_single
|
||||
#define TRTRS_UTU_SINGLE ctrtrs_UTU_single
|
||||
#define TRTRS_UTN_SINGLE ctrtrs_UTN_single
|
||||
#define TRTRS_URU_SINGLE ctrtrs_URU_single
|
||||
#define TRTRS_URN_SINGLE ctrtrs_URN_single
|
||||
#define TRTRS_UCU_SINGLE ctrtrs_UCU_single
|
||||
#define TRTRS_UCN_SINGLE ctrtrs_UCN_single
|
||||
#define TRTRS_LNU_SINGLE ctrtrs_LNU_single
|
||||
#define TRTRS_LNN_SINGLE ctrtrs_LNN_single
|
||||
#define TRTRS_LTU_SINGLE ctrtrs_LTU_single
|
||||
#define TRTRS_LTN_SINGLE ctrtrs_LTN_single
|
||||
#define TRTRS_LRU_SINGLE ctrtrs_LRU_single
|
||||
#define TRTRS_LRN_SINGLE ctrtrs_LRN_single
|
||||
#define TRTRS_LCU_SINGLE ctrtrs_LCU_single
|
||||
#define TRTRS_LCN_SINGLE ctrtrs_LCN_single
|
||||
#define TRTRS_UNU_PARALLEL ctrtrs_UNU_parallel
|
||||
#define TRTRS_UNN_PARALLEL ctrtrs_UNN_parallel
|
||||
#define TRTRS_UTU_PARALLEL ctrtrs_UTU_parallel
|
||||
#define TRTRS_UTN_PARALLEL ctrtrs_UTN_parallel
|
||||
#define TRTRS_URU_PARALLEL ctrtrs_URU_parallel
|
||||
#define TRTRS_URN_PARALLEL ctrtrs_URN_parallel
|
||||
#define TRTRS_UCU_PARALLEL ctrtrs_UCU_parallel
|
||||
#define TRTRS_UCN_PARALLEL ctrtrs_UCN_parallel
|
||||
#define TRTRS_LNU_PARALLEL ctrtrs_LNU_parallel
|
||||
#define TRTRS_LNN_PARALLEL ctrtrs_LNN_parallel
|
||||
#define TRTRS_LTU_PARALLEL ctrtrs_LTU_parallel
|
||||
#define TRTRS_LTN_PARALLEL ctrtrs_LTN_parallel
|
||||
#define TRTRS_LRU_PARALLEL ctrtrs_LRU_parallel
|
||||
#define TRTRS_LRN_PARALLEL ctrtrs_LRN_parallel
|
||||
#define TRTRS_LCU_PARALLEL ctrtrs_LCU_parallel
|
||||
#define TRTRS_LCN_PARALLEL ctrtrs_LCN_parallel
|
||||
#endif
|
||||
#endif
|
||||
|
||||
@@ -241,7 +241,7 @@ static inline int blas_quickdivide(blasint x, blasint y){
|
||||
#define HAVE_PREFETCH
|
||||
#endif
|
||||
|
||||
#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8) || defined(POWER9) || ( defined(PPC970) && defined(OS_DARWIN) )
|
||||
#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8) || defined(POWER9) || defined(PPC970)
|
||||
#define DCBT_ARG 0
|
||||
#else
|
||||
#define DCBT_ARG 8
|
||||
@@ -499,7 +499,7 @@ static inline int blas_quickdivide(blasint x, blasint y){
|
||||
|
||||
#if defined(ASSEMBLER) && !defined(NEEDPARAM)
|
||||
|
||||
#ifdef OS_LINUX
|
||||
#if defined(OS_LINUX) || defined(OS_FREEBSD)
|
||||
#ifndef __64BIT__
|
||||
#define PROLOGUE \
|
||||
.section .text;\
|
||||
@@ -784,7 +784,7 @@ Lmcount$lazy_ptr:
|
||||
|
||||
#define HALT mfspr r0, 1023
|
||||
|
||||
#ifdef OS_LINUX
|
||||
#if defined(OS_LINUX) || defined(OS_FREEBSD)
|
||||
#if defined(PPC440) || defined(PPC440FP2)
|
||||
#undef MAX_CPU_NUMBER
|
||||
#define MAX_CPU_NUMBER 1
|
||||
@@ -829,7 +829,7 @@ Lmcount$lazy_ptr:
|
||||
#define MAP_ANONYMOUS MAP_ANON
|
||||
#endif
|
||||
|
||||
#ifdef OS_LINUX
|
||||
#if defined(OS_LINUX) || defined(OS_FREEBSD)
|
||||
#ifndef __64BIT__
|
||||
#define FRAMESLOT(X) (((X) * 4) + 8)
|
||||
#else
|
||||
|
||||
@@ -194,10 +194,6 @@ int trsm_thread(int mode, BLASLONG m, BLASLONG n,
|
||||
|
||||
int syrk_thread(int mode, blas_arg_t *, BLASLONG *, BLASLONG *, int (*function)(), void *, void *, BLASLONG);
|
||||
|
||||
int beta_thread(int mode, BLASLONG m, BLASLONG n,
|
||||
double alpha_r, double alpha_i,
|
||||
void *c, BLASLONG ldc, int (*fuction)());
|
||||
|
||||
int getrf_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k,
|
||||
void *offsetA, BLASLONG lda,
|
||||
void *offsetB, BLASLONG jb,
|
||||
|
||||
@@ -129,12 +129,13 @@ static __inline void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx){
|
||||
*ecx=cpuinfo[2];
|
||||
*edx=cpuinfo[3];
|
||||
#else
|
||||
__asm__ __volatile__("cpuid"
|
||||
__asm__ __volatile__("mov $0, %%ecx;"
|
||||
"cpuid"
|
||||
: "=a" (*eax),
|
||||
"=b" (*ebx),
|
||||
"=c" (*ecx),
|
||||
"=d" (*edx)
|
||||
: "0" (op), "c"(0));
|
||||
: "0" (op));
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
14
cpp_thread_test/Makefile
Normal file
14
cpp_thread_test/Makefile
Normal file
@@ -0,0 +1,14 @@
|
||||
# Builds and runs the C++ thread-safety testers against ../libopenblas.a.
include ../Makefile.rule

# Warning, OpenMP and language-standard switches shared by both testers.
THREAD_TEST_CXXFLAGS = -Wall -Wextra -Wshadow -fopenmp -std=c++11

all :: dgemv_tester dgemm_tester

# Each recipe links the tester and then runs it straight away, so a failing
# test fails the make invocation.
dgemv_tester :
	$(CXX) $(COMMON_OPT) $(THREAD_TEST_CXXFLAGS) dgemv_thread_safety.cpp ../libopenblas.a -lpthread -o $@
	./$@

# dgemv_tester is a prerequisite, so the DGEMV test is built and run first.
dgemm_tester : dgemv_tester
	$(CXX) $(COMMON_OPT) $(THREAD_TEST_CXXFLAGS) dgemm_thread_safety.cpp ../libopenblas.a -lpthread -o $@
	./$@

clean ::
	rm -f dgemv_tester dgemm_tester
|
||||
55
cpp_thread_test/cpp_thread_safety_common.h
Normal file
55
cpp_thread_test/cpp_thread_safety_common.h
Normal file
@@ -0,0 +1,55 @@
|
||||
/// Portable "pause": prints a prompt and blocks until a line is read from std::cin.
inline void pauser(){
    std::cout << "Press enter to continue...";
    std::string discardedLine; // contents are ignored; only the wait for input matters
    std::getline(std::cin, discardedLine);
}
|
||||
|
||||
/// Fill the first `numMat` matrices of `matBlock` with values drawn from `rngdist`,
/// then copy that set of `numMat` matrices into every following group of `numMat`
/// entries, so that all `numConcurrentThreads` groups hold identical inputs.
///
/// @param matBlock              matrix storage; entries [0, numConcurrentThreads*numMat) are written
///                              and each must already hold randomMatSize*randomMatSize elements
/// @param PRNG                  random engine used for every draw (its state is advanced)
/// @param rngdist               distribution the element values are drawn from
/// @param randomMatSize         dimension N of the square N x N matrices
/// @param numConcurrentThreads  number of identical groups to produce
/// @param numMat                number of matrices per group
///
/// Declared inline because this definition lives in a header; without it, including
/// the header from more than one translation unit yields multiple-definition errors.
inline void FillMatrices(std::vector<std::vector<double>>& matBlock, std::mt19937_64& PRNG, std::uniform_real_distribution<double>& rngdist, const blasint randomMatSize, const uint32_t numConcurrentThreads, const uint32_t numMat){
    typedef std::vector<double>::size_type index_t;
    // Widen before multiplying: N*N evaluated in blasint can overflow for large N.
    const index_t elemsPerMat = static_cast<index_t>(randomMatSize) * static_cast<index_t>(randomMatSize);

    // Randomize the reference group.
    for(uint32_t m = 0; m < numMat; m++){
        std::vector<double>& mat = matBlock[m];
        for(index_t e = 0; e < elemsPerMat; e++){
            mat[e] = rngdist(PRNG);
        }
    }

    // Replicate the reference group into each remaining group.
    for(uint32_t base = numMat; base < (numConcurrentThreads*numMat); base += numMat){
        for(uint32_t m = 0; m < numMat; m++){
            matBlock[base+m] = matBlock[m];
        }
    }
}
|
||||
|
||||
/// Fill the first `numVec` vectors of `vecBlock` with values drawn from `rngdist`,
/// then copy that set of `numVec` vectors into every following group of `numVec`
/// entries, so that all `numConcurrentThreads` groups hold identical inputs.
///
/// @param vecBlock              vector storage; entries [0, numConcurrentThreads*numVec) are written
///                              and each must already hold randomMatSize elements
/// @param PRNG                  random engine used for every draw (its state is advanced)
/// @param rngdist               distribution the element values are drawn from
/// @param randomMatSize         length of each vector
/// @param numConcurrentThreads  number of identical groups to produce
/// @param numVec                number of vectors per group
///
/// Declared inline because this definition lives in a header; without it, including
/// the header from more than one translation unit yields multiple-definition errors.
inline void FillVectors(std::vector<std::vector<double>>& vecBlock, std::mt19937_64& PRNG, std::uniform_real_distribution<double>& rngdist, const blasint randomMatSize, const uint32_t numConcurrentThreads, const uint32_t numVec){
    const uint32_t vecLength = static_cast<uint32_t>(randomMatSize);

    // Randomize the reference group.
    for(uint32_t v = 0; v < numVec; v++){
        std::vector<double>& vec = vecBlock[v];
        for(uint32_t e = 0; e < vecLength; e++){
            vec[e] = rngdist(PRNG);
        }
    }

    // Replicate the reference group into each remaining group.
    const uint32_t totalVectors = numConcurrentThreads*numVec;
    for(uint32_t base = numVec; base < totalVectors; base += numVec){
        for(uint32_t v = 0; v < numVec; v++){
            vecBlock[base+v] = vecBlock[v];
        }
    }
}
|
||||
|
||||
/// Create a std::mt19937_64 seeded from std::random_device and advanced by ten
/// million draws before it is returned.
///
/// Declared inline because this definition lives in a header; without it, including
/// the header from more than one translation unit yields multiple-definition errors.
///
/// @return the warmed-up engine, by value
inline std::mt19937_64 InitPRNG(){
    std::random_device entropySource;
    std::mt19937_64 PRNG(entropySource()); // seed from the OS-provided RNG (/dev/urandom or similar)
    std::uniform_real_distribution<double> warmupDist{-1.0, 1.0};

    // Mix the engine's internal state by discarding 10M draws: PRNGs often show
    // unreliable uniformity and other statistical weaknesses until their state
    // has been sufficiently mixed.
    const uint32_t warmupDraws = 10000000;
    for (uint32_t draw = 0; draw < warmupDraws; draw++){
        warmupDist(PRNG);
    }
    return PRNG;
}
|
||||
|
||||
/// Dump the first numConcurrentThreads*numMat matrices of `matBlock` to std::cout.
/// For each matrix the index is printed on its own line, followed by
/// randomMatSize rows of randomMatSize space-separated values and a blank line.
/// Element (row, col) is read from position row*randomMatSize + col.
///
/// Declared inline because this definition lives in a header; without it, including
/// the header from more than one translation unit yields multiple-definition errors.
inline void PrintMatrices(const std::vector<std::vector<double>>& matBlock, const blasint randomMatSize, const uint32_t numConcurrentThreads, const uint32_t numMat){
    const uint32_t dim = static_cast<uint32_t>(randomMatSize);
    const uint32_t totalMatrices = numConcurrentThreads*numMat;

    for (uint32_t m = 0; m < totalMatrices; m++){
        const std::vector<double>& mat = matBlock[m];
        std::cout<<m<<std::endl;
        for (uint32_t row = 0; row < dim; row++){
            for (uint32_t col = 0; col < dim; col++){
                std::cout<<mat[row*randomMatSize + col]<<" ";
            }
            std::cout<<std::endl;
        }
        std::cout<<std::endl;
    }
}
|
||||
92
cpp_thread_test/dgemm_thread_safety.cpp
Normal file
92
cpp_thread_test/dgemm_thread_safety.cpp
Normal file
@@ -0,0 +1,92 @@
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
#include <random>
|
||||
#include <future>
|
||||
#include <omp.h>
|
||||
#include "../cblas.h"
|
||||
#include "cpp_thread_safety_common.h"
|
||||
|
||||
void launch_cblas_dgemm(double* A, double* B, double* C, const blasint randomMatSize){
|
||||
cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, randomMatSize, randomMatSize, randomMatSize, 1.0, A, randomMatSize, B, randomMatSize, 0.1, C, randomMatSize);
|
||||
}
|
||||
|
||||
int main(int argc, char* argv[]){
|
||||
blasint randomMatSize = 1024; //dimension of the random square matrices used
|
||||
uint32_t numConcurrentThreads = 52; //number of concurrent calls of the functions being tested
|
||||
uint32_t numTestRounds = 16; //number of testing rounds before success exit
|
||||
|
||||
if (argc > 4){
|
||||
std::cout<<"ERROR: too many arguments for thread safety tester"<<std::endl;
|
||||
abort();
|
||||
}
|
||||
|
||||
if(argc == 4){
|
||||
std::vector<std::string> cliArgs;
|
||||
for (int i = 1; i < argc; i++){
|
||||
cliArgs.push_back(argv[i]);
|
||||
std::cout<<argv[i]<<std::endl;
|
||||
}
|
||||
randomMatSize = std::stoul(cliArgs[0]);
|
||||
numConcurrentThreads = std::stoul(cliArgs[1]);
|
||||
numTestRounds = std::stoul(cliArgs[2]);
|
||||
}
|
||||
|
||||
std::uniform_real_distribution<double> rngdist{-1.0, 1.0};
|
||||
std::vector<std::vector<double>> matBlock(numConcurrentThreads*3);
|
||||
std::vector<std::future<void>> futureBlock(numConcurrentThreads);
|
||||
|
||||
std::cout<<"*----------------------------*\n";
|
||||
std::cout<<"| DGEMM thread safety tester |\n";
|
||||
std::cout<<"*----------------------------*\n";
|
||||
std::cout<<"Size of random matrices(N=M=K): "<<randomMatSize<<'\n';
|
||||
std::cout<<"Number of concurrent calls into OpenBLAS : "<<numConcurrentThreads<<'\n';
|
||||
std::cout<<"Number of testing rounds : "<<numTestRounds<<'\n';
|
||||
std::cout<<"This test will need "<<(static_cast<uint64_t>(randomMatSize*randomMatSize)*numConcurrentThreads*3*8)/static_cast<double>(1024*1024)<<" MiB of RAM\n"<<std::endl;
|
||||
|
||||
std::cout<<"Initializing random number generator..."<<std::flush;
|
||||
std::mt19937_64 PRNG = InitPRNG();
|
||||
std::cout<<"done\n";
|
||||
|
||||
std::cout<<"Preparing to test CBLAS DGEMM thread safety\n";
|
||||
std::cout<<"Allocating matrices..."<<std::flush;
|
||||
for(uint32_t i=0; i<(numConcurrentThreads*3); i++){
|
||||
matBlock[i].resize(randomMatSize*randomMatSize);
|
||||
}
|
||||
std::cout<<"done\n";
|
||||
//pauser();
|
||||
std::cout<<"Filling matrices with random numbers..."<<std::flush;
|
||||
FillMatrices(matBlock, PRNG, rngdist, randomMatSize, numConcurrentThreads, 3);
|
||||
//PrintMatrices(matBlock, randomMatSize, numConcurrentThreads, 3);
|
||||
std::cout<<"done\n";
|
||||
std::cout<<"Testing CBLAS DGEMM thread safety\n";
|
||||
omp_set_num_threads(numConcurrentThreads);
|
||||
for(uint32_t R=0; R<numTestRounds; R++){
|
||||
std::cout<<"DGEMM round #"<<R<<std::endl;
|
||||
std::cout<<"Launching "<<numConcurrentThreads<<" threads simultaneously using OpenMP..."<<std::flush;
|
||||
#pragma omp parallel for default(none) shared(futureBlock, matBlock, randomMatSize, numConcurrentThreads)
|
||||
for(uint32_t i=0; i<numConcurrentThreads; i++){
|
||||
futureBlock[i] = std::async(std::launch::async, launch_cblas_dgemm, &matBlock[i*3][0], &matBlock[i*3+1][0], &matBlock[i*3+2][0], randomMatSize);
|
||||
//launch_cblas_dgemm( &matBlock[i][0], &matBlock[i+1][0], &matBlock[i+2][0]);
|
||||
}
|
||||
std::cout<<"done\n";
|
||||
std::cout<<"Waiting for threads to finish..."<<std::flush;
|
||||
for(uint32_t i=0; i<numConcurrentThreads; i++){
|
||||
futureBlock[i].get();
|
||||
}
|
||||
std::cout<<"done\n";
|
||||
//PrintMatrices(matBlock, randomMatSize, numConcurrentThreads, 3);
|
||||
std::cout<<"Comparing results from different threads..."<<std::flush;
|
||||
for(uint32_t i=3; i<(numConcurrentThreads*3); i+=3){ //i is the index of matrix A, for a given thread
|
||||
for(uint32_t j = 0; j < static_cast<uint32_t>(randomMatSize*randomMatSize); j++){
|
||||
if (std::abs(matBlock[i+2][j] - matBlock[2][j]) > 1.0E-13){ //i+2 is the index of matrix C, for a given thread
|
||||
std::cout<<"ERROR: one of the threads returned a different result! Index : "<<i+2<<std::endl;
|
||||
std::cout<<"CBLAS DGEMM thread safety test FAILED!"<<std::endl;
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
}
|
||||
std::cout<<"OK!\n"<<std::endl;
|
||||
}
|
||||
std::cout<<"CBLAS DGEMM thread safety test PASSED!\n"<<std::endl;
|
||||
return 0;
|
||||
}
|
||||
101
cpp_thread_test/dgemv_thread_safety.cpp
Normal file
101
cpp_thread_test/dgemv_thread_safety.cpp
Normal file
@@ -0,0 +1,101 @@
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
#include <random>
|
||||
#include <future>
|
||||
#include <omp.h>
|
||||
#include "../cblas.h"
|
||||
#include "cpp_thread_safety_common.h"
|
||||
|
||||
void launch_cblas_dgemv(double* A, double* x, double* y, const blasint randomMatSize){
|
||||
const blasint inc = 1;
|
||||
cblas_dgemv(CblasColMajor, CblasNoTrans, randomMatSize, randomMatSize, 1.0, A, randomMatSize, x, inc, 0.1, y, inc);
|
||||
}
|
||||
|
||||
int main(int argc, char* argv[]){
|
||||
blasint randomMatSize = 1024; //dimension of the random square matrices and vectors being used
|
||||
uint32_t numConcurrentThreads = 52; //number of concurrent calls of the functions being tested
|
||||
uint32_t numTestRounds = 16; //number of testing rounds before success exit
|
||||
|
||||
if (argc > 4){
|
||||
std::cout<<"ERROR: too many arguments for thread safety tester"<<std::endl;
|
||||
abort();
|
||||
}
|
||||
if(argc == 4){
|
||||
std::vector<std::string> cliArgs;
|
||||
for (int i = 1; i < argc; i++){
|
||||
cliArgs.push_back(argv[i]);
|
||||
std::cout<<argv[i]<<std::endl;
|
||||
}
|
||||
randomMatSize = std::stoul(cliArgs.at(0));
|
||||
numConcurrentThreads = std::stoul(cliArgs.at(1));
|
||||
numTestRounds = std::stoul(cliArgs.at(2));
|
||||
}
|
||||
|
||||
std::uniform_real_distribution<double> rngdist{-1.0, 1.0};
|
||||
std::vector<std::vector<double>> matBlock(numConcurrentThreads);
|
||||
std::vector<std::vector<double>> vecBlock(numConcurrentThreads*2);
|
||||
std::vector<std::future<void>> futureBlock(numConcurrentThreads);
|
||||
|
||||
std::cout<<"*----------------------------*\n";
|
||||
std::cout<<"| DGEMV thread safety tester |\n";
|
||||
std::cout<<"*----------------------------*\n";
|
||||
std::cout<<"Size of random matrices and vectors(N=M): "<<randomMatSize<<'\n';
|
||||
std::cout<<"Number of concurrent calls into OpenBLAS : "<<numConcurrentThreads<<'\n';
|
||||
std::cout<<"Number of testing rounds : "<<numTestRounds<<'\n';
|
||||
std::cout<<"This test will need "<<((static_cast<uint64_t>(randomMatSize*randomMatSize)*numConcurrentThreads*8)+(static_cast<uint64_t>(randomMatSize)*numConcurrentThreads*8*2))/static_cast<double>(1024*1024)<<" MiB of RAM\n"<<std::endl;
|
||||
|
||||
std::cout<<"Initializing random number generator..."<<std::flush;
|
||||
std::mt19937_64 PRNG = InitPRNG();
|
||||
std::cout<<"done\n";
|
||||
|
||||
std::cout<<"Preparing to test CBLAS DGEMV thread safety\n";
|
||||
std::cout<<"Allocating matrices..."<<std::flush;
|
||||
for(uint32_t i=0; i<numConcurrentThreads; i++){
|
||||
matBlock.at(i).resize(randomMatSize*randomMatSize);
|
||||
}
|
||||
std::cout<<"done\n";
|
||||
std::cout<<"Allocating vectors..."<<std::flush;
|
||||
for(uint32_t i=0; i<(numConcurrentThreads*2); i++){
|
||||
vecBlock.at(i).resize(randomMatSize);
|
||||
}
|
||||
std::cout<<"done\n";
|
||||
//pauser();
|
||||
|
||||
std::cout<<"Filling matrices with random numbers..."<<std::flush;
|
||||
FillMatrices(matBlock, PRNG, rngdist, randomMatSize, numConcurrentThreads, 1);
|
||||
//PrintMatrices(matBlock, randomMatSize, numConcurrentThreads);
|
||||
std::cout<<"done\n";
|
||||
std::cout<<"Filling vectors with random numbers..."<<std::flush;
|
||||
FillVectors(vecBlock, PRNG, rngdist, randomMatSize, numConcurrentThreads, 2);
|
||||
std::cout<<"done\n";
|
||||
|
||||
std::cout<<"Testing CBLAS DGEMV thread safety"<<std::endl;
|
||||
omp_set_num_threads(numConcurrentThreads);
|
||||
for(uint32_t R=0; R<numTestRounds; R++){
|
||||
std::cout<<"DGEMV round #"<<R<<std::endl;
|
||||
std::cout<<"Launching "<<numConcurrentThreads<<" threads simultaneously using OpenMP..."<<std::flush;
|
||||
#pragma omp parallel for default(none) shared(futureBlock, matBlock, vecBlock, randomMatSize, numConcurrentThreads)
|
||||
for(uint32_t i=0; i<numConcurrentThreads; i++){
|
||||
futureBlock[i] = std::async(std::launch::async, launch_cblas_dgemv, &matBlock[i][0], &vecBlock[i*2][0], &vecBlock[i*2+1][0], randomMatSize);
|
||||
}
|
||||
std::cout<<"done\n";
|
||||
std::cout<<"Waiting for threads to finish..."<<std::flush;
|
||||
for(uint32_t i=0; i<numConcurrentThreads; i++){
|
||||
futureBlock[i].get();
|
||||
}
|
||||
std::cout<<"done\n";
|
||||
std::cout<<"Comparing results from different threads..."<<std::flush;
|
||||
for(uint32_t i=2; i<(numConcurrentThreads*2); i+=2){ //i is the index of vector x, for a given thread
|
||||
for(uint32_t j = 0; j < static_cast<uint32_t>(randomMatSize); j++){
|
||||
if (std::abs(vecBlock[i+1][j] - vecBlock[1][j]) > 1.0E-13){ //i+1 is the index of vector y, for a given thread
|
||||
std::cout<<"ERROR: one of the threads returned a different result! Index : "<<i+1<<std::endl;
|
||||
std::cout<<"CBLAS DGEMV thread safety test FAILED!"<<std::endl;
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
}
|
||||
std::cout<<"OK!\n"<<std::endl;
|
||||
}
|
||||
std::cout<<"CBLAS DGEMV thread safety test PASSED!\n"<<std::endl;
|
||||
return 0;
|
||||
}
|
||||
@@ -94,7 +94,7 @@ int get_feature(char *search)
|
||||
if( p == NULL ) return 0;
|
||||
|
||||
t = strtok(p," ");
|
||||
while( t = strtok(NULL," "))
|
||||
while( (t = strtok(NULL," ")))
|
||||
{
|
||||
if (!strcmp(t, search)) { return(1); }
|
||||
}
|
||||
@@ -206,6 +206,33 @@ void get_subdirname(void)
|
||||
printf("arm64");
|
||||
}
|
||||
|
||||
/*
 * Print a "#define NUM_CORES <n>" line on stdout, where <n> is the number of
 * lines in /proc/cpuinfo that start with "processor".
 *
 * The body is only compiled when the `linux` macro is defined; otherwise the
 * function does nothing. If /proc/cpuinfo cannot be opened, nothing is
 * printed (previously the NULL stream was passed straight to fgets/fclose).
 */
void get_cpucount(void)
{
#ifdef linux
	FILE *infile;
	char buffer[2048];
	int n = 0;

	infile = fopen("/proc/cpuinfo", "r");
	if (infile == NULL)
		return;		/* no cpuinfo available: emit no definition at all */

	while (fgets(buffer, sizeof(buffer), infile))
	{
		/* one "processor" line per logical CPU entry */
		if (!strncmp("processor", buffer, 9))
			n++;
	}

	fclose(infile);

	printf("#define NUM_CORES %d\n",n);
#endif
}
|
||||
|
||||
|
||||
|
||||
void get_cpuconfig(void)
|
||||
{
|
||||
|
||||
@@ -309,6 +336,7 @@ void get_cpuconfig(void)
|
||||
printf("#define DTB_SIZE 4096 \n");
|
||||
break;
|
||||
}
|
||||
get_cpucount();
|
||||
}
|
||||
|
||||
|
||||
@@ -344,12 +372,10 @@ void get_features(void)
|
||||
if( p == NULL ) return;
|
||||
|
||||
t = strtok(p," ");
|
||||
while( t = strtok(NULL," "))
|
||||
while( (t = strtok(NULL," ")))
|
||||
{
|
||||
}
|
||||
|
||||
#endif
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
|
||||
45
cpuid_x86.c
45
cpuid_x86.c
@@ -1197,7 +1197,11 @@ int get_cpuname(void){
|
||||
case 3:
|
||||
case 5:
|
||||
case 6:
|
||||
#if defined(__x86_64__) || defined(__amd64__)
|
||||
return CPUTYPE_CORE2;
|
||||
#else
|
||||
return CPUTYPE_PENTIUM2;
|
||||
#endif
|
||||
case 7:
|
||||
case 8:
|
||||
case 10:
|
||||
@@ -1211,7 +1215,7 @@ int get_cpuname(void){
|
||||
return CPUTYPE_CORE2;
|
||||
}
|
||||
break;
|
||||
case 1:
|
||||
case 1: // family 6 exmodel 1
|
||||
switch (model) {
|
||||
case 6:
|
||||
return CPUTYPE_CORE2;
|
||||
@@ -1228,7 +1232,7 @@ int get_cpuname(void){
|
||||
return CPUTYPE_DUNNINGTON;
|
||||
}
|
||||
break;
|
||||
case 2:
|
||||
case 2: // family 6 exmodel 2
|
||||
switch (model) {
|
||||
case 5:
|
||||
//Intel Core (Clarkdale) / Core (Arrandale)
|
||||
@@ -1257,7 +1261,7 @@ int get_cpuname(void){
|
||||
return CPUTYPE_NEHALEM;
|
||||
}
|
||||
break;
|
||||
case 3:
|
||||
case 3: // family 6 exmodel 3
|
||||
switch (model) {
|
||||
case 7:
|
||||
// Bay Trail
|
||||
@@ -1287,7 +1291,7 @@ int get_cpuname(void){
|
||||
return CPUTYPE_NEHALEM;
|
||||
}
|
||||
break;
|
||||
case 4:
|
||||
case 4: // family 6 exmodel 4
|
||||
switch (model) {
|
||||
case 5:
|
||||
case 6:
|
||||
@@ -1321,7 +1325,7 @@ int get_cpuname(void){
|
||||
return CPUTYPE_NEHALEM;
|
||||
}
|
||||
break;
|
||||
case 5:
|
||||
case 5: // family 6 exmodel 5
|
||||
switch (model) {
|
||||
case 6:
|
||||
//Broadwell
|
||||
@@ -1364,7 +1368,7 @@ int get_cpuname(void){
|
||||
return CPUTYPE_NEHALEM;
|
||||
}
|
||||
break;
|
||||
case 6:
|
||||
case 6: // family 6 exmodel 6
|
||||
switch (model) {
|
||||
case 6: // Cannon Lake
|
||||
if(support_avx512())
|
||||
@@ -1376,7 +1380,22 @@ int get_cpuname(void){
|
||||
else
|
||||
return CPUTYPE_NEHALEM;
|
||||
}
|
||||
break;
|
||||
break;
|
||||
case 7: // family 6 exmodel 7
|
||||
switch (model) {
|
||||
case 10: // Goldmont Plus
|
||||
return CPUTYPE_NEHALEM;
|
||||
case 14: // Ice Lake
|
||||
if(support_avx512())
|
||||
return CPUTYPE_SKYLAKEX;
|
||||
if(support_avx2())
|
||||
return CPUTYPE_HASWELL;
|
||||
if(support_avx())
|
||||
return CPUTYPE_SANDYBRIDGE;
|
||||
else
|
||||
return CPUTYPE_NEHALEM;
|
||||
}
|
||||
break;
|
||||
case 9:
|
||||
case 8:
|
||||
switch (model) {
|
||||
@@ -1412,7 +1431,11 @@ int get_cpuname(void){
|
||||
case 0x5:
|
||||
return CPUTYPE_AMDK6;
|
||||
case 0x6:
|
||||
#if defined(__x86_64__) || defined(__amd64__)
|
||||
return CPUTYPE_BARCELONA;
|
||||
#else
|
||||
return CPUTYPE_ATHLON;
|
||||
#endif
|
||||
case 0xf:
|
||||
switch (exfamily) {
|
||||
case 0:
|
||||
@@ -1795,7 +1818,11 @@ int get_coretype(void){
|
||||
case 4:
|
||||
case 5:
|
||||
case 6:
|
||||
#if defined(__x86_64__) || defined(__amd64__)
|
||||
return CORE_CORE2;
|
||||
#else
|
||||
return CORE_P6;
|
||||
#endif
|
||||
case 7:
|
||||
return CORE_KATMAI;
|
||||
case 8:
|
||||
@@ -2002,7 +2029,11 @@ int get_coretype(void){
|
||||
|
||||
if (vendor == VENDOR_AMD){
|
||||
if (family <= 0x5) return CORE_80486;
|
||||
#if defined(__x86_64__) || defined(__amd64__)
|
||||
if (family <= 0xe) return CORE_BARCELONA;
|
||||
#else
|
||||
if (family <= 0xe) return CORE_ATHLON;
|
||||
#endif
|
||||
if (family == 0xf){
|
||||
if ((exfamily == 0) || (exfamily == 2)) return CORE_OPTERON;
|
||||
else if (exfamily == 5) return CORE_BOBCAT;
|
||||
|
||||
@@ -30,17 +30,20 @@
|
||||
#define CPU_GENERIC 0
|
||||
#define CPU_Z13 1
|
||||
#define CPU_Z14 2
|
||||
#define CPU_Z15 3
|
||||
|
||||
static char *cpuname[] = {
|
||||
"ZARCH_GENERIC",
|
||||
"Z13",
|
||||
"Z14"
|
||||
"Z14",
|
||||
"Z15"
|
||||
};
|
||||
|
||||
static char *cpuname_lower[] = {
|
||||
"zarch_generic",
|
||||
"z13",
|
||||
"z14"
|
||||
"z14",
|
||||
"z15"
|
||||
};
|
||||
|
||||
int detect(void)
|
||||
@@ -66,6 +69,8 @@ int detect(void)
|
||||
if (strstr(p, "2965")) return CPU_Z13;
|
||||
if (strstr(p, "3906")) return CPU_Z14;
|
||||
if (strstr(p, "3907")) return CPU_Z14;
|
||||
if (strstr(p, "8561")) return CPU_Z14; // fallback z15 to z14
|
||||
if (strstr(p, "8562")) return CPU_Z14; // fallback z15 to z14
|
||||
|
||||
return CPU_GENERIC;
|
||||
}
|
||||
|
||||
@@ -6,6 +6,8 @@ TOPDIR = ..
|
||||
include $(TOPDIR)/Makefile.system
|
||||
|
||||
override CFLAGS += -DADD$(BU) -DCBLAS
|
||||
override TARGET_ARCH=
|
||||
override TARGET_MACH=
|
||||
|
||||
LIB = $(TOPDIR)/$(LIBNAME)
|
||||
|
||||
|
||||
@@ -1503,6 +1503,8 @@ C $ ' .' )
|
||||
NC = 0
|
||||
RESET = .TRUE.
|
||||
ERRMAX = RZERO
|
||||
RALS = RONE
|
||||
RBETS = RONE
|
||||
*
|
||||
DO 100 IN = 1, NIDIM
|
||||
N = IDIM( IN )
|
||||
|
||||
@@ -1504,6 +1504,8 @@ C $ ' .' )
|
||||
NC = 0
|
||||
RESET = .TRUE.
|
||||
ERRMAX = RZERO
|
||||
RALS = RONE
|
||||
RBETS = RONE
|
||||
*
|
||||
DO 100 IN = 1, NIDIM
|
||||
N = IDIM( IN )
|
||||
|
||||
@@ -462,11 +462,15 @@ int BLASFUNC(blas_thread_shutdown)(void){
|
||||
|
||||
for(i = 0; i < blas_num_threads - 1; i++){
|
||||
// Could also just use WaitForMultipleObjects
|
||||
WaitForSingleObject(blas_threads[i], 5); //INFINITE);
|
||||
DWORD wait_thread_value = WaitForSingleObject(blas_threads[i], 5000);
|
||||
|
||||
#ifndef OS_WINDOWSSTORE
|
||||
// TerminateThread is only available with WINAPI_DESKTOP and WINAPI_SYSTEM not WINAPI_APP in UWP
|
||||
TerminateThread(blas_threads[i],0);
|
||||
// TerminateThread is only available with WINAPI_DESKTOP and WINAPI_SYSTEM not WINAPI_APP in UWP
|
||||
if (WAIT_OBJECT_0 != wait_thread_value) {
|
||||
TerminateThread(blas_threads[i],0);
|
||||
}
|
||||
#endif
|
||||
|
||||
CloseHandle(blas_threads[i]);
|
||||
}
|
||||
|
||||
|
||||
@@ -329,7 +329,7 @@ int support_avx512(){
|
||||
if (!support_avx())
|
||||
return 0;
|
||||
cpuid(7, &eax, &ebx, &ecx, &edx);
|
||||
if((ebx & (1<<7)) != 1){
|
||||
if((ebx & (1<<7)) == 0){
|
||||
ret=0; //OS does not even support AVX2
|
||||
}
|
||||
if((ebx & (1<<31)) != 0){
|
||||
@@ -585,9 +585,27 @@ static gotoblas_t *get_coretype(void){
|
||||
}
|
||||
}
|
||||
return NULL;
|
||||
case 7:
|
||||
if (model == 14) {
|
||||
// Ice Lake
|
||||
if (support_avx512())
|
||||
return &gotoblas_SKYLAKEX;
|
||||
if(support_avx2()){
|
||||
openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK);
|
||||
return &gotoblas_HASWELL;
|
||||
}
|
||||
if(support_avx()) {
|
||||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
|
||||
return &gotoblas_SANDYBRIDGE;
|
||||
} else {
|
||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
||||
return &gotoblas_NEHALEM;
|
||||
}
|
||||
}
|
||||
return NULL;
|
||||
case 9:
|
||||
case 8:
|
||||
if (model == 14 ) { // Kaby Lake
|
||||
if (model == 14 ) { // Kaby Lake, Coffee Lake
|
||||
if(support_avx2())
|
||||
return &gotoblas_HASWELL;
|
||||
if(support_avx()) {
|
||||
|
||||
@@ -37,8 +37,10 @@
|
||||
/*********************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#if (defined OS_LINUX || defined OS_ANDROID)
|
||||
#include <asm/hwcap.h>
|
||||
#include <sys/auxv.h>
|
||||
#endif
|
||||
|
||||
extern gotoblas_t gotoblas_ARMV8;
|
||||
extern gotoblas_t gotoblas_CORTEXA57;
|
||||
@@ -105,13 +107,17 @@ static gotoblas_t *force_coretype(char *coretype) {
|
||||
|
||||
static gotoblas_t *get_coretype(void) {
|
||||
int implementer, variant, part, arch, revision, midr_el1;
|
||||
|
||||
|
||||
#if (defined OS_LINUX || defined OS_ANDROID)
|
||||
if (!(getauxval(AT_HWCAP) & HWCAP_CPUID)) {
|
||||
char coremsg[128];
|
||||
snprintf(coremsg, 128, "Kernel lacks cpuid feature support. Auto detection of core type failed !!!\n");
|
||||
openblas_warning(1, coremsg);
|
||||
return NULL;
|
||||
}
|
||||
#else
|
||||
return NULL;
|
||||
#endif
|
||||
|
||||
get_cpu_ftr(MIDR_EL1, midr_el1);
|
||||
/*
|
||||
|
||||
@@ -3,7 +3,9 @@
|
||||
|
||||
extern gotoblas_t gotoblas_POWER6;
|
||||
extern gotoblas_t gotoblas_POWER8;
|
||||
#if (!defined C_GCC) || (GCC_VERSION >= 60000)
|
||||
extern gotoblas_t gotoblas_POWER9;
|
||||
#endif
|
||||
|
||||
extern void openblas_warning(int verbose, const char *msg);
|
||||
|
||||
@@ -19,7 +21,9 @@ static char *corename[] = {
|
||||
char *gotoblas_corename(void) {
|
||||
if (gotoblas == &gotoblas_POWER6) return corename[1];
|
||||
if (gotoblas == &gotoblas_POWER8) return corename[2];
|
||||
#if (!defined C_GCC) || (GCC_VERSION >= 60000)
|
||||
if (gotoblas == &gotoblas_POWER9) return corename[3];
|
||||
#endif
|
||||
return corename[0];
|
||||
}
|
||||
|
||||
@@ -29,8 +33,10 @@ static gotoblas_t *get_coretype(void) {
|
||||
return &gotoblas_POWER6;
|
||||
if (__builtin_cpu_is("power8"))
|
||||
return &gotoblas_POWER8;
|
||||
#if (!defined C_GCC) || (GCC_VERSION >= 60000)
|
||||
if (__builtin_cpu_is("power9"))
|
||||
return &gotoblas_POWER9;
|
||||
#endif
|
||||
return NULL;
|
||||
}
|
||||
|
||||
@@ -53,7 +59,9 @@ static gotoblas_t *force_coretype(char * coretype) {
|
||||
{
|
||||
case 1: return (&gotoblas_POWER6);
|
||||
case 2: return (&gotoblas_POWER8);
|
||||
#if (!defined C_GCC) || (GCC_VERSION >= 60000)
|
||||
case 3: return (&gotoblas_POWER9);
|
||||
#endif
|
||||
default: return NULL;
|
||||
}
|
||||
snprintf(message, 128, "Core not found: %s\n", coretype);
|
||||
|
||||
@@ -129,7 +129,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
|
||||
#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN)
|
||||
#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN)
|
||||
#include <sys/sysctl.h>
|
||||
#include <sys/resource.h>
|
||||
#endif
|
||||
@@ -192,7 +192,7 @@ void goto_set_num_threads(int num_threads) {};
|
||||
|
||||
#else
|
||||
|
||||
#if defined(OS_LINUX) || defined(OS_SUNOS) || defined(OS_NETBSD)
|
||||
#if defined(OS_LINUX) || defined(OS_SUNOS)
|
||||
#ifndef NO_AFFINITY
|
||||
int get_num_procs(void);
|
||||
#else
|
||||
@@ -312,7 +312,7 @@ int get_num_procs(void) {
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY)
|
||||
#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY)
|
||||
|
||||
int get_num_procs(void) {
|
||||
|
||||
@@ -404,7 +404,7 @@ extern int openblas_goto_num_threads_env();
|
||||
extern int openblas_omp_num_threads_env();
|
||||
|
||||
int blas_get_cpu_number(void){
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
int max_num;
|
||||
#endif
|
||||
int blas_goto_num = 0;
|
||||
@@ -412,7 +412,7 @@ int blas_get_cpu_number(void){
|
||||
|
||||
if (blas_num_threads) return blas_num_threads;
|
||||
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
max_num = get_num_procs();
|
||||
#endif
|
||||
|
||||
@@ -436,7 +436,7 @@ int blas_get_cpu_number(void){
|
||||
else if (blas_omp_num > 0) blas_num_threads = blas_omp_num;
|
||||
else blas_num_threads = MAX_CPU_NUMBER;
|
||||
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
if (blas_num_threads > max_num) blas_num_threads = max_num;
|
||||
#endif
|
||||
|
||||
@@ -1673,7 +1673,7 @@ void gotoblas_dummy_for_PGI(void) {
|
||||
#include <sys/resource.h>
|
||||
#endif
|
||||
|
||||
#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN)
|
||||
#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN)
|
||||
#include <sys/sysctl.h>
|
||||
#include <sys/resource.h>
|
||||
#endif
|
||||
@@ -1736,7 +1736,7 @@ void goto_set_num_threads(int num_threads) {};
|
||||
|
||||
#else
|
||||
|
||||
#if defined(OS_LINUX) || defined(OS_SUNOS) || defined(OS_NETBSD)
|
||||
#if defined(OS_LINUX) || defined(OS_SUNOS)
|
||||
#ifndef NO_AFFINITY
|
||||
int get_num_procs(void);
|
||||
#else
|
||||
@@ -1855,7 +1855,7 @@ int get_num_procs(void) {
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY)
|
||||
#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY)
|
||||
|
||||
int get_num_procs(void) {
|
||||
|
||||
@@ -1945,7 +1945,7 @@ extern int openblas_goto_num_threads_env();
|
||||
extern int openblas_omp_num_threads_env();
|
||||
|
||||
int blas_get_cpu_number(void){
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
int max_num;
|
||||
#endif
|
||||
int blas_goto_num = 0;
|
||||
@@ -1953,7 +1953,7 @@ int blas_get_cpu_number(void){
|
||||
|
||||
if (blas_num_threads) return blas_num_threads;
|
||||
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
max_num = get_num_procs();
|
||||
#endif
|
||||
|
||||
@@ -1977,7 +1977,7 @@ int blas_get_cpu_number(void){
|
||||
else if (blas_omp_num > 0) blas_num_threads = blas_omp_num;
|
||||
else blas_num_threads = MAX_CPU_NUMBER;
|
||||
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
if (blas_num_threads > max_num) blas_num_threads = max_num;
|
||||
#endif
|
||||
|
||||
@@ -2041,8 +2041,12 @@ static BLASULONG alloc_lock = 0UL;
|
||||
|
||||
static void alloc_mmap_free(struct release_t *release){
|
||||
|
||||
if (!release->address) return;
|
||||
|
||||
if (munmap(release -> address, BUFFER_SIZE)) {
|
||||
printf("OpenBLAS : munmap failed\n");
|
||||
int errsv=errno;
|
||||
perror("OpenBLAS : munmap failed:");
|
||||
printf("error code=%d,\trelease->address=%lx\n",errsv,release->address);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2073,6 +2077,12 @@ static void *alloc_mmap(void *address){
|
||||
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
|
||||
UNLOCK_COMMAND(&alloc_lock);
|
||||
#endif
|
||||
} else {
|
||||
#ifdef DEBUG
|
||||
int errsv=errno;
|
||||
perror("OpenBLAS : mmap failed:");
|
||||
printf("error code=%d,\tmap_address=%lx\n",errsv,map_address);
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifdef OS_LINUX
|
||||
|
||||
@@ -38,21 +38,29 @@
|
||||
|
||||
#include <stdio.h>
|
||||
#include "common.h"
|
||||
|
||||
#ifndef SMP
|
||||
#define blas_cpu_number 1
|
||||
#else
|
||||
|
||||
int blas_cpu_number = 1;
|
||||
|
||||
int blas_get_cpu_number(void){
|
||||
|
||||
return blas_cpu_number;
|
||||
}
|
||||
#ifdef OS_LINUX
|
||||
#include <sys/sysinfo.h>
|
||||
#include <sched.h>
|
||||
#include <errno.h>
|
||||
#include <linux/unistd.h>
|
||||
#include <sys/syscall.h>
|
||||
#include <sys/time.h>
|
||||
#include <sys/resource.h>
|
||||
#endif
|
||||
|
||||
#ifdef OS_HAIKU
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
|
||||
#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN)
|
||||
#include <sys/sysctl.h>
|
||||
#include <sys/resource.h>
|
||||
#endif
|
||||
|
||||
|
||||
#define FIXED_PAGESIZE 4096
|
||||
|
||||
|
||||
void *sa = NULL;
|
||||
void *sb = NULL;
|
||||
static double static_buffer[BUFFER_SIZE/sizeof(double)];
|
||||
@@ -60,7 +68,7 @@ static double static_buffer[BUFFER_SIZE/sizeof(double)];
|
||||
void *blas_memory_alloc(int numproc){
|
||||
|
||||
if (sa == NULL){
|
||||
#if 1
|
||||
#if 0
|
||||
sa = (void *)qalloc(QFAST, BUFFER_SIZE);
|
||||
#else
|
||||
sa = (void *)malloc(BUFFER_SIZE);
|
||||
@@ -75,3 +83,296 @@ void blas_memory_free(void *free_area){
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
|
||||
extern void openblas_warning(int verbose, const char * msg);
|
||||
|
||||
#ifndef SMP
|
||||
|
||||
#define blas_cpu_number 1
|
||||
#define blas_num_threads 1
|
||||
|
||||
/* Placeholder implementations: always one processor, thread count is ignored. */
int goto_get_num_procs (void)
{
  return 1;
}

void goto_set_num_threads(int num_threads)
{
  (void)num_threads;   /* intentionally unused */
}
|
||||
|
||||
#else
|
||||
|
||||
#if defined(OS_LINUX) || defined(OS_SUNOS)
|
||||
#ifndef NO_AFFINITY
|
||||
int get_num_procs(void);
|
||||
#else
|
||||
/*
 * Processor count for builds without affinity support.
 *
 * The configured processor count is queried once via sysconf() and cached.
 * When OS_LINUX is not defined that cached value is returned; when it is
 * defined the function returns 1, because the affinity-based counting below
 * is commented out.
 */
int get_num_procs(void) {

  static int nums = 0;          /* cached sysconf() result, 0 = not queried yet */

  /* The remaining locals are referenced only by the disabled code below. */
  cpu_set_t cpuset,*cpusetp;
  size_t size;
  int ret;

#if defined(__GLIBC_PREREQ)
#if !__GLIBC_PREREQ(2, 7)
  int i;
#if !__GLIBC_PREREQ(2, 6)
  int n;
#endif
#endif
#endif

  if (nums == 0) {
    nums = sysconf(_SC_NPROCESSORS_CONF);
  }

/*
#if !defined(__GLIBC_PREREQ)
  return nums;
#else
 #if !__GLIBC_PREREQ(2, 3)
  return nums;
 #endif

 #if !__GLIBC_PREREQ(2, 7)
  ret = sched_getaffinity(0,sizeof(cpuset), &cpuset);
  if (ret!=0) return nums;
  n=0;
#if !__GLIBC_PREREQ(2, 6)
  for (i=0;i<nums;i++)
     if (CPU_ISSET(i,&cpuset)) n++;
  nums=n;
#else
  nums = CPU_COUNT(sizeof(cpuset),&cpuset);
#endif
  return nums;
 #else
  if (nums >= CPU_SETSIZE) {
    cpusetp = CPU_ALLOC(nums);
      if (cpusetp == NULL) {
        return nums;
      }
    size = CPU_ALLOC_SIZE(nums);
    ret = sched_getaffinity(0,size,cpusetp);
    if (ret!=0) {
      CPU_FREE(cpusetp);
      return nums;
    }
    ret = CPU_COUNT_S(size,cpusetp);
    if (ret > 0 && ret < nums) nums = ret;
    CPU_FREE(cpusetp);
    return nums;
  } else {
    ret = sched_getaffinity(0,sizeof(cpuset),&cpuset);
    if (ret!=0) {
      return nums;
    }
    ret = CPU_COUNT(&cpuset);
    if (ret > 0 && ret < nums) nums = ret;
    return nums;
  }
 #endif
#endif
*/

#if defined(OS_LINUX)
  return 1;
#else
  return nums;
#endif
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef OS_ANDROID
|
||||
/*
 * Return the number of configured processors as reported by
 * sysconf(_SC_NPROCESSORS_CONF). The value is looked up on the first call
 * and served from a function-local cache afterwards.
 */
int get_num_procs(void) {
  static int cached_procs = 0;   /* 0 means "not queried yet" */

  if (cached_procs == 0) {
    cached_procs = sysconf(_SC_NPROCESSORS_CONF);
  }
  return cached_procs;
}
|
||||
#endif
|
||||
|
||||
#ifdef OS_HAIKU
|
||||
/*
 * Configured processor count, obtained from sysconf(_SC_NPROCESSORS_CONF)
 * the first time it is needed and remembered in a static for later calls.
 */
int get_num_procs(void) {
  static int proc_total = 0;

  if (proc_total != 0) {
    return proc_total;          /* already looked up */
  }
  proc_total = sysconf(_SC_NPROCESSORS_CONF);
  return proc_total;
}
|
||||
#endif
|
||||
|
||||
#ifdef OS_AIX
|
||||
/*
 * Report how many processors are configured. sysconf(_SC_NPROCESSORS_CONF)
 * is consulted only while the cached value is still zero.
 */
int get_num_procs(void) {
  static int known_cpus = 0;

  known_cpus = known_cpus ? known_cpus : (int)sysconf(_SC_NPROCESSORS_CONF);
  return known_cpus;
}
|
||||
#endif
|
||||
|
||||
#ifdef OS_WINDOWS
|
||||
|
||||
int get_num_procs(void) {
|
||||
|
||||
static int nums = 0;
|
||||
|
||||
if (nums == 0) {
|
||||
|
||||
SYSTEM_INFO sysinfo;
|
||||
|
||||
GetSystemInfo(&sysinfo);
|
||||
|
||||
nums = sysinfo.dwNumberOfProcessors;
|
||||
}
|
||||
|
||||
return nums;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY)
|
||||
|
||||
int get_num_procs(void) {
|
||||
|
||||
static int nums = 0;
|
||||
|
||||
int m[2];
|
||||
size_t len;
|
||||
|
||||
if (nums == 0) {
|
||||
m[0] = CTL_HW;
|
||||
m[1] = HW_NCPU;
|
||||
len = sizeof(int);
|
||||
sysctl(m, 2, &nums, &len, NULL, 0);
|
||||
}
|
||||
|
||||
return nums;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(OS_DARWIN)
|
||||
/*
 * Processor count read from the "hw.physicalcpu" sysctl entry. The value is
 * fetched while the cache is still zero and reused on later calls; the
 * sysctlbyname() return code is not checked.
 */
int get_num_procs(void) {
  static int physical_cpus = 0;

  if (physical_cpus == 0) {
    size_t out_len = sizeof(int);   /* size of the result buffer */
    sysctlbyname("hw.physicalcpu", &physical_cpus, &out_len, NULL, 0);
  }
  return physical_cpus;
}
|
||||
/*
|
||||
void set_stack_limit(int limitMB){
|
||||
int result=0;
|
||||
struct rlimit rl;
|
||||
rlim_t StackSize;
|
||||
|
||||
StackSize=limitMB*1024*1024;
|
||||
result=getrlimit(RLIMIT_STACK, &rl);
|
||||
if(result==0){
|
||||
if(rl.rlim_cur < StackSize){
|
||||
rl.rlim_cur=StackSize;
|
||||
result=setrlimit(RLIMIT_STACK, &rl);
|
||||
if(result !=0){
|
||||
fprintf(stderr, "OpenBLAS: set stack limit error =%d\n", result);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
*/
|
||||
#endif
|
||||
|
||||
|
||||
/*
 * Number of CPU cores OpenBLAS uses for multithreading.
 * It can be changed with openblas_set_num_threads(int num_threads).
 */
int blas_cpu_number = 0;

/*
 * Number of threads in the thread pool.
 * This value is equal to or larger than blas_cpu_number, which means some
 * of the pool's threads may be sleeping.
 */
int blas_num_threads = 0;

/* Accessor for the current value of blas_cpu_number. */
int goto_get_num_procs (void)
{
  return blas_cpu_number;
}
|
||||
|
||||
/*
 * Register blas_thread_shutdown as the pthread_atfork() "prepare" handler so
 * that the OpenBLAS-managed pthread pool (builds made with
 * "make USE_OPENMP=0") is shut down before a fork.
 *
 * A hang is still possible when OpenBLAS is built against the libgomp
 * implementation of OpenMP; that problem is tracked at
 *   http://gcc.gnu.org/bugzilla/show_bug.cgi?id=60035
 * Until it is resolved, build with USE_OPENMP=0 or link against a different
 * OpenMP implementation.
 *
 * Compiled to an empty function on native Windows (non-Cygwin), on Android,
 * and whenever SMP_SERVER is not defined.
 */
void openblas_fork_handler()
{
#if !((defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT)) || defined(OS_ANDROID)) && defined(SMP_SERVER)
  int err = pthread_atfork ((void (*)(void)) BLASFUNC(blas_thread_shutdown), NULL, NULL);

  if (err != 0) {
    openblas_warning(0, "OpenBLAS Warning ... cannot install fork handler. You may meet hang after fork.\n");
  }
#endif
}
|
||||
|
||||
extern int openblas_num_threads_env();
|
||||
extern int openblas_goto_num_threads_env();
|
||||
extern int openblas_omp_num_threads_env();
|
||||
|
||||
int blas_get_cpu_number(void){
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
int max_num;
|
||||
#endif
|
||||
int blas_goto_num = 0;
|
||||
int blas_omp_num = 0;
|
||||
|
||||
if (blas_num_threads) return blas_num_threads;
|
||||
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
max_num = get_num_procs();
|
||||
#endif
|
||||
|
||||
// blas_goto_num = 0;
|
||||
#ifndef USE_OPENMP
|
||||
blas_goto_num=openblas_num_threads_env();
|
||||
if (blas_goto_num < 0) blas_goto_num = 0;
|
||||
|
||||
if (blas_goto_num == 0) {
|
||||
blas_goto_num=openblas_goto_num_threads_env();
|
||||
if (blas_goto_num < 0) blas_goto_num = 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
// blas_omp_num = 0;
|
||||
blas_omp_num=openblas_omp_num_threads_env();
|
||||
if (blas_omp_num < 0) blas_omp_num = 0;
|
||||
|
||||
if (blas_goto_num > 0) blas_num_threads = blas_goto_num;
|
||||
else if (blas_omp_num > 0) blas_num_threads = blas_omp_num;
|
||||
else blas_num_threads = MAX_CPU_NUMBER;
|
||||
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
if (blas_num_threads > max_num) blas_num_threads = max_num;
|
||||
#endif
|
||||
|
||||
if (blas_num_threads > MAX_CPU_NUMBER) blas_num_threads = MAX_CPU_NUMBER;
|
||||
|
||||
#ifdef DEBUG
|
||||
printf( "Adjusted number of threads : %3d\n", blas_num_threads);
|
||||
#endif
|
||||
|
||||
blas_cpu_number = blas_num_threads;
|
||||
|
||||
return blas_num_threads;
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
/*
 * Return get_num_procs() in SMP builds; builds without SMP always
 * report 1.
 */
int openblas_get_num_procs(void) {
#ifdef SMP
  return get_num_procs();
#else
  return 1;
#endif
}
|
||||
|
||||
/*
 * Return the thread count OpenBLAS is configured to use.
 * Builds without SMP always report 1. SMP builds first call
 * blas_get_cpu_number() so that blas_cpu_number is initialised, then
 * return that global.
 */
int openblas_get_num_threads(void) {
#ifdef SMP
  // init blas_cpu_number if needed
  (void) blas_get_cpu_number();
  return blas_cpu_number;
#else
  return 1;
#endif
}
|
||||
|
||||
@@ -78,10 +78,10 @@ char tmpstr[20];
|
||||
#ifdef DYNAMIC_ARCH
|
||||
strcat(tmp_config_str, gotoblas_corename());
|
||||
#endif
|
||||
if (openblas_get_parallel() == 0)
|
||||
sprintf(tmpstr, " SINGLE_THREADED");
|
||||
else
|
||||
snprintf(tmpstr,19," MAX_THREADS=%d",MAX_CPU_NUMBER);
|
||||
if (openblas_get_parallel() == 0)
|
||||
sprintf(tmpstr, " SINGLE_THREADED");
|
||||
else
|
||||
snprintf(tmpstr,19," MAX_THREADS=%d",MAX_CPU_NUMBER);
|
||||
strcat(tmp_config_str, tmpstr);
|
||||
return tmp_config_str;
|
||||
}
|
||||
|
||||
897
dynamic.c
Normal file
897
dynamic.c
Normal file
@@ -0,0 +1,897 @@
|
||||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#define strncasecmp _strnicmp
|
||||
#define strcasecmp _stricmp
|
||||
#endif
|
||||
|
||||
#ifdef ARCH_X86
|
||||
#define EXTERN extern
|
||||
#else
|
||||
#define EXTERN
|
||||
#endif
|
||||
|
||||
#ifdef DYNAMIC_LIST
|
||||
extern gotoblas_t gotoblas_PRESCOTT;
|
||||
|
||||
#ifdef DYN_ATHLON
|
||||
extern gotoblas_t gotoblas_ATHLON;
|
||||
#else
|
||||
#define gotoblas_ATHLON gotoblas_PRESCOTT
|
||||
#endif
|
||||
#ifdef DYN_KATMAI
|
||||
extern gotoblas_t gotoblas_KATMAI;
|
||||
#else
|
||||
#define gotoblas_KATMAI gotoblas_PRESCOTT
|
||||
#endif
|
||||
#ifdef DYN_BANIAS
|
||||
extern gotoblas_t gotoblas_BANIAS;
|
||||
#else
|
||||
#define gotoblas_BANIAS gotoblas_PRESCOTT
|
||||
#endif
|
||||
#ifdef DYN_COPPERMINE
|
||||
extern gotoblas_t gotoblas_COPPERMINE;
|
||||
#else
|
||||
#define gotoblas_COPPERMINE gotoblas_PRESCOTT
|
||||
#endif
|
||||
#ifdef DYN_NORTHWOOD
|
||||
extern gotoblas_t gotoblas_NORTHWOOD;
|
||||
#else
|
||||
#define gotoblas_NORTHWOOD gotoblas_PRESCOTT
|
||||
#endif
|
||||
#ifdef DYN_CORE2
|
||||
extern gotoblas_t gotoblas_CORE2;
|
||||
#else
|
||||
#define gotoblas_CORE2 gotoblas_PRESCOTT
|
||||
#endif
|
||||
#ifdef DYN_NEHALEM
|
||||
extern gotoblas_t gotoblas_NEHALEM;
|
||||
#else
|
||||
#define gotoblas_NEHALEM gotoblas_PRESCOTT
|
||||
#endif
|
||||
#ifdef DYN_BARCELONA
|
||||
extern gotoblas_t gotoblas_BARCELONA;
|
||||
#elif defined(DYN_NEHALEM)
|
||||
#define gotoblas_BARCELONA gotoblas_NEHALEM
|
||||
#else
|
||||
#define gotoblas_BARCELONA gotoblas_PRESCOTT
|
||||
#endif
|
||||
/* ATOM kernels: use the dedicated table when it is built (DYN_ATOM),
 * otherwise alias to NEHALEM if that is built, else to PRESCOTT.
 * (The "#elif" below previously lacked its leading '#', so the NEHALEM
 * alias was never selected and the line was a syntax error whenever
 * DYN_ATOM was defined.) */
#ifdef DYN_ATOM
extern gotoblas_t gotoblas_ATOM;
#elif defined(DYN_NEHALEM)
#define gotoblas_ATOM gotoblas_NEHALEM
#else
#define gotoblas_ATOM gotoblas_PRESCOTT
#endif
|
||||
#ifdef DYN_NANO
|
||||
extern gotoblas_t gotoblas_NANO;
|
||||
#else
|
||||
#define gotoblas_NANO gotoblas_PRESCOTT
|
||||
#endif
|
||||
#ifdef DYN_PENRYN
|
||||
extern gotoblas_t gotoblas_PENRYN;
|
||||
#else
|
||||
#define gotoblas_PENRYN gotoblas_PRESCOTT
|
||||
#endif
|
||||
#ifdef DYN_DUNNINGTON
|
||||
extern gotoblas_t gotoblas_DUNNINGTON;
|
||||
#else
|
||||
#define gotoblas_DUNNINGTON gotoblas_PRESCOTT
|
||||
#endif
|
||||
#ifdef DYN_OPTERON
|
||||
extern gotoblas_t gotoblas_OPTERON;
|
||||
#else
|
||||
#define gotoblas_OPTERON gotoblas_PRESCOTT
|
||||
#endif
|
||||
#ifdef DYN_OPTERON_SSE3
|
||||
extern gotoblas_t gotoblas_OPTERON_SSE3;
|
||||
#else
|
||||
#define gotoblas_OPTERON_SSE3 gotoblas_PRESCOTT
|
||||
#endif
|
||||
#ifdef DYN_BOBCAT
|
||||
extern gotoblas_t gotoblas_BOBCAT;
|
||||
#elif defined(DYN_NEHALEM)
|
||||
#define gotoblas_BOBCAT gotoblas_NEHALEM
|
||||
#else
|
||||
#define gotoblas_BOBCAT gotoblas_PRESCOTT
|
||||
#endif
|
||||
#ifdef DYN_SANDYBRIDGE
|
||||
extern gotoblas_t gotoblas_SANDYBRIDGE;
|
||||
#elif defined(DYN_NEHALEM)
|
||||
#define gotoblas_SANDYBRIDGE gotoblas_NEHALEM
|
||||
#else
|
||||
#define gotoblas_SANDYBRIDGE gotoblas_PRESCOTT
|
||||
#endif
|
||||
#ifdef DYN_BULLDOZER
|
||||
extern gotoblas_t gotoblas_BULLDOZER;
|
||||
#elif defined(DYN_SANDYBRIDGE)
|
||||
#define gotoblas_BULLDOZER gotoblas_SANDYBRIDGE
|
||||
#elif defined(DYN_NEHALEM)
|
||||
#define gotoblas_BULLDOZER gotoblas_NEHALEM
|
||||
#else
|
||||
#define gotoblas_BULLDOZER gotoblas_PRESCOTT
|
||||
#endif
|
||||
#ifdef DYN_PILEDRIVER
|
||||
extern gotoblas_t gotoblas_PILEDRIVER;
|
||||
#elif defined(DYN_SANDYBRIDGE)
|
||||
#define gotoblas_PILEDRIVER gotoblas_SANDYBRIDGE
|
||||
#elif defined(DYN_NEHALEM)
|
||||
#define gotoblas_PILEDRIVER gotoblas_NEHALEM
|
||||
#else
|
||||
#define gotoblas_PILEDRIVER gotoblas_PRESCOTT
|
||||
#endif
|
||||
#ifdef DYN_STEAMROLLER
|
||||
extern gotoblas_t gotoblas_STEAMROLLER;
|
||||
#elif defined(DYN_SANDYBRIDGE)
|
||||
#define gotoblas_STEAMROLLER gotoblas_SANDYBRIDGE
|
||||
#elif defined(DYN_NEHALEM)
|
||||
#define gotoblas_STEAMROLLER gotoblas_NEHALEM
|
||||
#else
|
||||
#define gotoblas_STEAMROLLER gotoblas_PRESCOTT
|
||||
#endif
|
||||
#ifdef DYN_EXCAVATOR
|
||||
extern gotoblas_t gotoblas_EXCAVATOR;
|
||||
#elif defined(DYN_SANDYBRIDGE)
|
||||
#define gotoblas_EXCAVATOR gotoblas_SANDYBRIDGE
|
||||
#elif defined(DYN_NEHALEM)
|
||||
#define gotoblas_EXCAVATOR gotoblas_NEHALEM
|
||||
#else
|
||||
#define gotoblas_EXCAVATOR gotoblas_PRESCOTT
|
||||
#endif
|
||||
#ifdef DYN_HASWELL
|
||||
extern gotoblas_t gotoblas_HASWELL;
|
||||
#elif defined(DYN_SANDYBRIDGE)
|
||||
#define gotoblas_HASWELL gotoblas_SANDYBRIDGE
|
||||
#elif defined(DYN_NEHALEM)
|
||||
#define gotoblas_HASWELL gotoblas_NEHALEM
|
||||
#else
|
||||
#define gotoblas_HASWELL gotoblas_PRESCOTT
|
||||
#endif
|
||||
#ifdef DYN_ZEN
|
||||
extern gotoblas_t gotoblas_ZEN;
|
||||
#elif defined(DYN_HASWELL)
|
||||
#define gotoblas_ZEN gotoblas_HASWELL
|
||||
#elif defined(DYN_SANDYBRIDGE)
|
||||
#define gotoblas_ZEN gotoblas_SANDYBRIDGE
|
||||
#elif defined(DYN_NEHALEM)
|
||||
#define gotoblas_ZEN gotoblas_NEHALEM
|
||||
#else
|
||||
#define gotoblas_ZEN gotoblas_PRESCOTT
|
||||
#endif
|
||||
#ifdef DYN_SKYLAKEX
|
||||
extern gotoblas_t gotoblas_SKYLAKEX;
|
||||
#elif defined(DYN_HASWELL)
|
||||
#define gotoblas_SKYLAKEX gotoblas_HASWELL
|
||||
#elif defined(DYN_SANDYBRIDGE)
|
||||
#define gotoblas_SKYLAKEX gotoblas_SANDYBRIDGE
|
||||
#elif defined(DYN_NEHALEM)
|
||||
#define gotoblas_SKYLAKEX gotoblas_NEHALEM
|
||||
#else
|
||||
#define gotoblas_SKYLAKEX gotoblas_PRESCOTT
|
||||
#endif
|
||||
|
||||
|
||||
#else // not DYNAMIC_LIST
|
||||
EXTERN gotoblas_t gotoblas_KATMAI;
|
||||
EXTERN gotoblas_t gotoblas_COPPERMINE;
|
||||
EXTERN gotoblas_t gotoblas_NORTHWOOD;
|
||||
EXTERN gotoblas_t gotoblas_BANIAS;
|
||||
EXTERN gotoblas_t gotoblas_ATHLON;
|
||||
|
||||
extern gotoblas_t gotoblas_PRESCOTT;
|
||||
extern gotoblas_t gotoblas_CORE2;
|
||||
extern gotoblas_t gotoblas_NEHALEM;
|
||||
extern gotoblas_t gotoblas_BARCELONA;
|
||||
#ifdef DYNAMIC_OLDER
|
||||
extern gotoblas_t gotoblas_ATOM;
|
||||
extern gotoblas_t gotoblas_NANO;
|
||||
extern gotoblas_t gotoblas_PENRYN;
|
||||
extern gotoblas_t gotoblas_DUNNINGTON;
|
||||
extern gotoblas_t gotoblas_OPTERON;
|
||||
extern gotoblas_t gotoblas_OPTERON_SSE3;
|
||||
extern gotoblas_t gotoblas_BOBCAT;
|
||||
#else
|
||||
#define gotoblas_ATOM gotoblas_NEHALEM
|
||||
#define gotoblas_NANO gotoblas_NEHALEM
|
||||
#define gotoblas_PENRYN gotoblas_CORE2
|
||||
#define gotoblas_DUNNINGTON gotoblas_CORE2
|
||||
#define gotoblas_OPTERON gotoblas_CORE2
|
||||
#define gotoblas_OPTERON_SSE3 gotoblas_CORE2
|
||||
#define gotoblas_BOBCAT gotoblas_CORE2
|
||||
#endif
|
||||
|
||||
#ifndef NO_AVX
|
||||
extern gotoblas_t gotoblas_SANDYBRIDGE;
|
||||
extern gotoblas_t gotoblas_BULLDOZER;
|
||||
extern gotoblas_t gotoblas_PILEDRIVER;
|
||||
extern gotoblas_t gotoblas_STEAMROLLER;
|
||||
extern gotoblas_t gotoblas_EXCAVATOR;
|
||||
#ifdef NO_AVX2
|
||||
#define gotoblas_HASWELL gotoblas_SANDYBRIDGE
|
||||
#define gotoblas_SKYLAKEX gotoblas_SANDYBRIDGE
|
||||
#define gotoblas_ZEN gotoblas_SANDYBRIDGE
|
||||
#else
|
||||
extern gotoblas_t gotoblas_HASWELL;
|
||||
extern gotoblas_t gotoblas_ZEN;
|
||||
#ifndef NO_AVX512
|
||||
extern gotoblas_t gotoblas_SKYLAKEX;
|
||||
#else
|
||||
#define gotoblas_SKYLAKEX gotoblas_HASWELL
|
||||
#endif
|
||||
#endif
|
||||
#else
|
||||
//Use NEHALEM kernels for sandy bridge
|
||||
#define gotoblas_SANDYBRIDGE gotoblas_NEHALEM
|
||||
#define gotoblas_HASWELL gotoblas_NEHALEM
|
||||
#define gotoblas_SKYLAKEX gotoblas_NEHALEM
|
||||
#define gotoblas_BULLDOZER gotoblas_BARCELONA
|
||||
#define gotoblas_PILEDRIVER gotoblas_BARCELONA
|
||||
#define gotoblas_STEAMROLLER gotoblas_BARCELONA
|
||||
#define gotoblas_EXCAVATOR gotoblas_BARCELONA
|
||||
#define gotoblas_ZEN gotoblas_BARCELONA
|
||||
#endif
|
||||
|
||||
#endif // DYNAMIC_LIST
|
||||
|
||||
#define VENDOR_INTEL 1
|
||||
#define VENDOR_AMD 2
|
||||
#define VENDOR_CENTAUR 3
|
||||
#define VENDOR_HYGON 4
|
||||
#define VENDOR_UNKNOWN 99
|
||||
|
||||
#define BITMASK(a, b, c) ((((a) >> (b)) & (c)))
|
||||
|
||||
#ifndef NO_AVX
/*
 * Execute XGETBV with ECX = op and store the resulting EAX and EDX
 * through the given pointers.
 * Only compiled when NO_AVX is not defined.
 */
static inline void xgetbv(int op, int * eax, int * edx){
  int lo_word, hi_word;

  //Use binary code for xgetbv
  __asm__ __volatile__
    (".byte 0x0f, 0x01, 0xd0"
     : "=a" (lo_word), "=d" (hi_word)
     : "c" (op)
     : "cc");

  *eax = lo_word;
  *edx = hi_word;
}
#endif
|
||||
|
||||
/*
 * Return 1 when AVX kernels may be used, 0 otherwise.
 *
 * Requires ECX bits 26, 27 and 28 of CPUID leaf 1 to all be set (per the
 * Intel SDM: XSAVE, OSXSAVE and AVX), and then bits 1 and 2 of the value
 * returned by xgetbv(0) (mask 6), i.e. the OS has enabled the state.
 * Always returns 0 when built with NO_AVX.
 */
int support_avx(){
#ifndef NO_AVX
  const int cpu_bits = (1 << 28) | (1 << 27) | (1 << 26);
  int eax, ebx, ecx, edx;

  cpuid(1, &eax, &ebx, &ecx, &edx);
  if ((ecx & cpu_bits) != cpu_bits)
    return 0;

  xgetbv(0, &eax, &edx);
  return ((eax & 6) == 6) ? 1 : 0; //OS support AVX
#else
  return 0;
#endif
}
|
||||
|
||||
/*
 * Return 1 when AVX2 kernels may be used, 0 otherwise.
 *
 * AVX2 is accepted only if support_avx() already succeeded and EBX bit 7
 * of CPUID leaf 7 is set. Always returns 0 when built with NO_AVX2.
 */
int support_avx2(){
#ifndef NO_AVX2
  int eax, ebx, ecx=0, edx;   /* ecx is zeroed before the leaf-7 query */

  if (support_avx() == 0)
    return 0;

  cpuid(7, &eax, &ebx, &ecx, &edx);
  return ((ebx & (1<<7)) != 0) ? 1 : 0; //OS supports AVX2
#else
  return 0;
#endif
}
|
||||
|
||||
/*
 * Return 1 when the AVX512 (SkylakeX) kernels may be used, 0 otherwise.
 *
 * Conditions, all of which must hold:
 *   - support_avx() succeeds;
 *   - CPUID leaf 7 EBX bit 7 (AVX2) is set;
 *   - CPUID leaf 7 EBX bit 31 (AVX512VL) is set;
 *   - xgetbv(0) reports bits 5..7 (mask 0xe0) enabled by the OS.
 * Always returns 0 when built with NO_AVX or NO_AVX512.
 */
int support_avx512(){
#if !defined(NO_AVX) && !defined(NO_AVX512)
  /* ecx is zeroed as in support_avx2(); presumably cpuid() forwards it as
   * the sub-leaf selector - TODO confirm against the cpuid() helper. */
  int eax, ebx, ecx=0, edx;

  if (!support_avx())
    return 0;

  cpuid(7, &eax, &ebx, &ecx, &edx);

  /* The previous test "(ebx & (1<<7)) != 1" was always true and only
   * assigned ret=0, so a missing AVX2 flag was never actually rejected. */
  if ((ebx & (1<<7)) == 0)
    return 0; //OS does not even support AVX2

  /* 1U avoids shifting into the sign bit of a signed int. */
  if ((ebx & (1U<<31)) == 0)
    return 0;

  xgetbv(0, &eax, &edx);
  if ((eax & 0xe0) == 0xe0)
    return 1; //OS supports AVX512VL

  return 0;
#else
  return 0;
#endif
}
|
||||
|
||||
extern void openblas_warning(int verbose, const char * msg);
|
||||
#define FALLBACK_VERBOSE 1
|
||||
#define NEHALEM_FALLBACK "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n"
|
||||
#define SANDYBRIDGE_FALLBACK "OpenBLAS : Your OS does not support AVX2 instructions. OpenBLAS is using Sandybridge kernels as a fallback, which may give poorer performance.\n"
|
||||
#define HASWELL_FALLBACK "OpenBLAS : Your OS does not support AVX512VL instructions. OpenBLAS is using Haswell kernels as a fallback, which may give poorer performance.\n"
|
||||
#define BARCELONA_FALLBACK "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Barcelona kernels as a fallback, which may give poorer performance.\n"
|
||||
|
||||
static int get_vendor(void){
|
||||
int eax, ebx, ecx, edx;
|
||||
|
||||
union
|
||||
{
|
||||
char vchar[16];
|
||||
int vint[4];
|
||||
} vendor;
|
||||
|
||||
cpuid(0, &eax, &ebx, &ecx, &edx);
|
||||
|
||||
*(&vendor.vint[0]) = ebx;
|
||||
*(&vendor.vint[1]) = edx;
|
||||
*(&vendor.vint[2]) = ecx;
|
||||
|
||||
vendor.vchar[12] = '\0';
|
||||
|
||||
if (!strcmp(vendor.vchar, "GenuineIntel")) return VENDOR_INTEL;
|
||||
if (!strcmp(vendor.vchar, "AuthenticAMD")) return VENDOR_AMD;
|
||||
if (!strcmp(vendor.vchar, "CentaurHauls")) return VENDOR_CENTAUR;
|
||||
if (!strcmp(vendor.vchar, "HygonGenuine")) return VENDOR_HYGON;
|
||||
|
||||
if ((eax == 0) || ((eax & 0x500) != 0)) return VENDOR_INTEL;
|
||||
|
||||
return VENDOR_UNKNOWN;
|
||||
}
|
||||
|
||||
static gotoblas_t *get_coretype(void){
|
||||
|
||||
int eax, ebx, ecx, edx;
|
||||
int family, exfamily, model, vendor, exmodel;
|
||||
|
||||
cpuid(1, &eax, &ebx, &ecx, &edx);
|
||||
|
||||
family = BITMASK(eax, 8, 0x0f);
|
||||
exfamily = BITMASK(eax, 20, 0xff);
|
||||
model = BITMASK(eax, 4, 0x0f);
|
||||
exmodel = BITMASK(eax, 16, 0x0f);
|
||||
|
||||
vendor = get_vendor();
|
||||
|
||||
if (vendor == VENDOR_INTEL){
|
||||
switch (family) {
|
||||
case 0x6:
|
||||
switch (exmodel) {
|
||||
case 0:
|
||||
if (model <= 0x7) return &gotoblas_KATMAI;
|
||||
if ((model == 0x8) || (model == 0xa) || (model == 0xb)) return &gotoblas_COPPERMINE;
|
||||
if ((model == 0x9) || (model == 0xd)) return &gotoblas_BANIAS;
|
||||
if (model == 14) return &gotoblas_BANIAS;
|
||||
if (model == 15) return &gotoblas_CORE2;
|
||||
return NULL;
|
||||
|
||||
case 1:
|
||||
if (model == 6) return &gotoblas_CORE2;
|
||||
if (model == 7) return &gotoblas_PENRYN;
|
||||
if (model == 13) return &gotoblas_DUNNINGTON;
|
||||
if ((model == 10) || (model == 11) || (model == 14) || (model == 15)) return &gotoblas_NEHALEM;
|
||||
if (model == 12) return &gotoblas_ATOM;
|
||||
return NULL;
|
||||
|
||||
case 2:
|
||||
//Intel Core (Clarkdale) / Core (Arrandale)
|
||||
// Pentium (Clarkdale) / Pentium Mobile (Arrandale)
|
||||
// Xeon (Clarkdale), 32nm
|
||||
if (model == 5) return &gotoblas_NEHALEM;
|
||||
|
||||
//Intel Xeon Processor 5600 (Westmere-EP)
|
||||
//Xeon Processor E7 (Westmere-EX)
|
||||
//Xeon E7540
|
||||
if (model == 12 || model == 14 || model == 15) return &gotoblas_NEHALEM;
|
||||
|
||||
//Intel Core i5-2000 /i7-2000 (Sandy Bridge)
|
||||
//Intel Core i7-3000 / Xeon E5
|
||||
if (model == 10 || model == 13) {
|
||||
if(support_avx())
|
||||
return &gotoblas_SANDYBRIDGE;
|
||||
else{
|
||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
||||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}
|
||||
return NULL;
|
||||
case 3:
|
||||
//Intel Sandy Bridge 22nm (Ivy Bridge?)
|
||||
if (model == 10 || model == 14) {
|
||||
if(support_avx())
|
||||
return &gotoblas_SANDYBRIDGE;
|
||||
else{
|
||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
||||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}
|
||||
//Intel Haswell
|
||||
if (model == 12 || model == 15) {
|
||||
if(support_avx2())
|
||||
return &gotoblas_HASWELL;
|
||||
if(support_avx()) {
|
||||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
|
||||
return &gotoblas_SANDYBRIDGE;
|
||||
} else {
|
||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
||||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}
|
||||
//Intel Broadwell
|
||||
if (model == 13) {
|
||||
if(support_avx2())
|
||||
return &gotoblas_HASWELL;
|
||||
if(support_avx()) {
|
||||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
|
||||
return &gotoblas_SANDYBRIDGE;
|
||||
} else {
|
||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
||||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}
|
||||
if (model == 7) return &gotoblas_ATOM; //Bay Trail
|
||||
return NULL;
|
||||
case 4:
|
||||
//Intel Haswell
|
||||
if (model == 5 || model == 6) {
|
||||
if(support_avx2())
|
||||
return &gotoblas_HASWELL;
|
||||
if(support_avx()) {
|
||||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
|
||||
return &gotoblas_SANDYBRIDGE;
|
||||
} else {
|
||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
||||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}
|
||||
//Intel Broadwell
|
||||
if (model == 7 || model == 15) {
|
||||
if(support_avx2())
|
||||
return &gotoblas_HASWELL;
|
||||
if(support_avx()) {
|
||||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
|
||||
return &gotoblas_SANDYBRIDGE;
|
||||
} else {
|
||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
||||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}
|
||||
//Intel Skylake
|
||||
if (model == 14) {
|
||||
if(support_avx2())
|
||||
return &gotoblas_HASWELL;
|
||||
if(support_avx()) {
|
||||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
|
||||
return &gotoblas_SANDYBRIDGE;
|
||||
} else {
|
||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
||||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}
|
||||
//Intel Braswell / Avoton
|
||||
if (model == 12 || model == 13) {
|
||||
return &gotoblas_NEHALEM;
|
||||
}
|
||||
return NULL;
|
||||
case 5:
|
||||
//Intel Broadwell
|
||||
if (model == 6) {
|
||||
if(support_avx2())
|
||||
return &gotoblas_HASWELL;
|
||||
if(support_avx()) {
|
||||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
|
||||
return &gotoblas_SANDYBRIDGE;
|
||||
} else {
|
||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
||||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}
|
||||
if (model == 5) {
|
||||
// Intel Skylake X
|
||||
if (support_avx512())
|
||||
return &gotoblas_SKYLAKEX;
|
||||
if(support_avx2()){
|
||||
openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK);
|
||||
return &gotoblas_HASWELL;
|
||||
}
|
||||
if(support_avx()) {
|
||||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
|
||||
return &gotoblas_SANDYBRIDGE;
|
||||
} else {
|
||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
||||
return &gotoblas_NEHALEM;
|
||||
}
|
||||
}
|
||||
//Intel Skylake
|
||||
if (model == 14) {
|
||||
if(support_avx2())
|
||||
return &gotoblas_HASWELL;
|
||||
if(support_avx()) {
|
||||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
|
||||
return &gotoblas_SANDYBRIDGE;
|
||||
} else {
|
||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
||||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}
|
||||
//Intel Phi Knights Landing
|
||||
if (model == 7) {
|
||||
if(support_avx2()){
|
||||
openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK);
|
||||
return &gotoblas_HASWELL;
|
||||
}
|
||||
if(support_avx()) {
|
||||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
|
||||
return &gotoblas_SANDYBRIDGE;
|
||||
} else {
|
||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
||||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}
|
||||
//Apollo Lake or Denverton
|
||||
if (model == 12 || model == 15) {
|
||||
return &gotoblas_NEHALEM;
|
||||
}
|
||||
return NULL;
|
||||
case 6:
|
||||
if (model == 6) {
|
||||
// Cannon Lake
|
||||
if(support_avx2())
|
||||
return &gotoblas_HASWELL;
|
||||
if(support_avx()) {
|
||||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
|
||||
return &gotoblas_SANDYBRIDGE;
|
||||
} else {
|
||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
||||
return &gotoblas_NEHALEM;
|
||||
}
|
||||
}
|
||||
return NULL;
|
||||
case 7:
|
||||
if (model == 10) // Goldmont plus
|
||||
return &gotoblas_NEHALEM;
|
||||
if (model == 14) {
|
||||
// Ice Lake
|
||||
if (support_avx512())
|
||||
return &gotoblas_SKYLAKEX;
|
||||
if(support_avx2()){
|
||||
openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK);
|
||||
return &gotoblas_HASWELL;
|
||||
}
|
||||
if(support_avx()) {
|
||||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
|
||||
return &gotoblas_SANDYBRIDGE;
|
||||
} else {
|
||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
||||
return &gotoblas_NEHALEM;
|
||||
}
|
||||
}
|
||||
return NULL;
|
||||
case 9:
|
||||
case 8:
|
||||
if (model == 14 ) { // Kaby Lake, Coffee Lake
|
||||
if(support_avx2())
|
||||
return &gotoblas_HASWELL;
|
||||
if(support_avx()) {
|
||||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
|
||||
return &gotoblas_SANDYBRIDGE;
|
||||
} else {
|
||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
||||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
case 0xf:
|
||||
if (model <= 0x2) return &gotoblas_NORTHWOOD;
|
||||
return &gotoblas_PRESCOTT;
|
||||
}
|
||||
}
|
||||
|
||||
if (vendor == VENDOR_AMD || vendor == VENDOR_HYGON){
|
||||
if (family <= 0xe) {
|
||||
// Verify that CPU has 3dnow and 3dnowext before claiming it is Athlon
|
||||
cpuid(0x80000000, &eax, &ebx, &ecx, &edx);
|
||||
if ( (eax & 0xffff) >= 0x01) {
|
||||
cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
|
||||
if ((edx & (1 << 30)) == 0 || (edx & (1 << 31)) == 0)
|
||||
return NULL;
|
||||
}
|
||||
else
|
||||
return NULL;
|
||||
|
||||
return &gotoblas_ATHLON;
|
||||
}
|
||||
if (family == 0xf){
|
||||
if ((exfamily == 0) || (exfamily == 2)) {
|
||||
if (ecx & (1 << 0)) return &gotoblas_OPTERON_SSE3;
|
||||
else return &gotoblas_OPTERON;
|
||||
} else if (exfamily == 5) {
|
||||
return &gotoblas_BOBCAT;
|
||||
} else if (exfamily == 6) {
|
||||
if(model == 1){
|
||||
//AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series
|
||||
if(support_avx())
|
||||
return &gotoblas_BULLDOZER;
|
||||
else{
|
||||
openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
|
||||
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}else if(model == 2 || model == 3){
|
||||
//AMD Bulldozer Opteron 6300 / Opteron 4300 / Opteron 3300
|
||||
if(support_avx())
|
||||
return &gotoblas_PILEDRIVER;
|
||||
else{
|
||||
openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
|
||||
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}else if(model == 5){
|
||||
if(support_avx())
|
||||
return &gotoblas_EXCAVATOR;
|
||||
else{
|
||||
openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
|
||||
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}else if(model == 0 || model == 8){
|
||||
if (exmodel == 1) {
|
||||
//AMD Trinity
|
||||
if(support_avx())
|
||||
return &gotoblas_PILEDRIVER;
|
||||
else{
|
||||
openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
|
||||
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}else if (exmodel == 3) {
|
||||
//AMD STEAMROLLER
|
||||
if(support_avx())
|
||||
return &gotoblas_STEAMROLLER;
|
||||
else{
|
||||
openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
|
||||
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}else if (exmodel == 6) {
|
||||
if(support_avx())
|
||||
return &gotoblas_EXCAVATOR;
|
||||
else{
|
||||
openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
|
||||
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
} else if (exfamily == 8) {
|
||||
if (model == 1 || model == 8) {
|
||||
if(support_avx())
|
||||
return &gotoblas_ZEN;
|
||||
else{
|
||||
openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
|
||||
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}
|
||||
} else if (exfamily == 9) {
|
||||
if(support_avx())
|
||||
return &gotoblas_ZEN;
|
||||
else{
|
||||
openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
|
||||
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}else {
|
||||
return &gotoblas_BARCELONA;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (vendor == VENDOR_CENTAUR) {
|
||||
switch (family) {
|
||||
case 0x6:
|
||||
return &gotoblas_NANO;
|
||||
}
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static char *corename[] = {
|
||||
"Unknown",
|
||||
"Katmai",
|
||||
"Coppermine",
|
||||
"Northwood",
|
||||
"Prescott",
|
||||
"Banias",
|
||||
"Atom",
|
||||
"Core2",
|
||||
"Penryn",
|
||||
"Dunnington",
|
||||
"Nehalem",
|
||||
"Athlon",
|
||||
"Opteron",
|
||||
"Opteron_SSE3",
|
||||
"Barcelona",
|
||||
"Nano",
|
||||
"Sandybridge",
|
||||
"Bobcat",
|
||||
"Bulldozer",
|
||||
"Piledriver",
|
||||
"Haswell",
|
||||
"Steamroller",
|
||||
"Excavator",
|
||||
"Zen",
|
||||
"SkylakeX"
|
||||
};
|
||||
|
||||
char *gotoblas_corename(void) {
|
||||
|
||||
if (gotoblas == &gotoblas_KATMAI) return corename[ 1];
|
||||
if (gotoblas == &gotoblas_COPPERMINE) return corename[ 2];
|
||||
if (gotoblas == &gotoblas_NORTHWOOD) return corename[ 3];
|
||||
if (gotoblas == &gotoblas_PRESCOTT) return corename[ 4];
|
||||
if (gotoblas == &gotoblas_BANIAS) return corename[ 5];
|
||||
if (gotoblas == &gotoblas_ATOM) return corename[ 6];
|
||||
if (gotoblas == &gotoblas_CORE2) return corename[ 7];
|
||||
if (gotoblas == &gotoblas_PENRYN) return corename[ 8];
|
||||
if (gotoblas == &gotoblas_DUNNINGTON) return corename[ 9];
|
||||
if (gotoblas == &gotoblas_NEHALEM) return corename[10];
|
||||
if (gotoblas == &gotoblas_ATHLON) return corename[11];
|
||||
if (gotoblas == &gotoblas_OPTERON_SSE3) return corename[12];
|
||||
if (gotoblas == &gotoblas_OPTERON) return corename[13];
|
||||
if (gotoblas == &gotoblas_BARCELONA) return corename[14];
|
||||
if (gotoblas == &gotoblas_NANO) return corename[15];
|
||||
if (gotoblas == &gotoblas_SANDYBRIDGE) return corename[16];
|
||||
if (gotoblas == &gotoblas_BOBCAT) return corename[17];
|
||||
if (gotoblas == &gotoblas_BULLDOZER) return corename[18];
|
||||
if (gotoblas == &gotoblas_PILEDRIVER) return corename[19];
|
||||
if (gotoblas == &gotoblas_HASWELL) return corename[20];
|
||||
if (gotoblas == &gotoblas_STEAMROLLER) return corename[21];
|
||||
if (gotoblas == &gotoblas_EXCAVATOR) return corename[22];
|
||||
if (gotoblas == &gotoblas_ZEN) return corename[23];
|
||||
if (gotoblas == &gotoblas_SKYLAKEX) return corename[24];
|
||||
return corename[0];
|
||||
}
|
||||
|
||||
|
||||
static gotoblas_t *force_coretype(char *coretype){
|
||||
|
||||
int i ;
|
||||
int found = -1;
|
||||
char message[128];
|
||||
//char mname[20];
|
||||
|
||||
for ( i=1 ; i <= 24; i++)
|
||||
{
|
||||
if (!strncasecmp(coretype,corename[i],20))
|
||||
{
|
||||
found = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (found < 0)
|
||||
{
|
||||
//strncpy(mname,coretype,20);
|
||||
snprintf(message, 128, "Core not found: %s\n",coretype);
|
||||
openblas_warning(1, message);
|
||||
return(NULL);
|
||||
}
|
||||
|
||||
switch (found)
|
||||
{
|
||||
case 24: return (&gotoblas_SKYLAKEX);
|
||||
case 23: return (&gotoblas_ZEN);
|
||||
case 22: return (&gotoblas_EXCAVATOR);
|
||||
case 21: return (&gotoblas_STEAMROLLER);
|
||||
case 20: return (&gotoblas_HASWELL);
|
||||
case 19: return (&gotoblas_PILEDRIVER);
|
||||
case 18: return (&gotoblas_BULLDOZER);
|
||||
case 17: return (&gotoblas_BOBCAT);
|
||||
case 16: return (&gotoblas_SANDYBRIDGE);
|
||||
case 15: return (&gotoblas_NANO);
|
||||
case 14: return (&gotoblas_BARCELONA);
|
||||
case 13: return (&gotoblas_OPTERON);
|
||||
case 12: return (&gotoblas_OPTERON_SSE3);
|
||||
case 11: return (&gotoblas_ATHLON);
|
||||
case 10: return (&gotoblas_NEHALEM);
|
||||
case 9: return (&gotoblas_DUNNINGTON);
|
||||
case 8: return (&gotoblas_PENRYN);
|
||||
case 7: return (&gotoblas_CORE2);
|
||||
case 6: return (&gotoblas_ATOM);
|
||||
case 5: return (&gotoblas_BANIAS);
|
||||
case 4: return (&gotoblas_PRESCOTT);
|
||||
case 3: return (&gotoblas_NORTHWOOD);
|
||||
case 2: return (&gotoblas_COPPERMINE);
|
||||
case 1: return (&gotoblas_KATMAI);
|
||||
}
|
||||
return(NULL);
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
void gotoblas_dynamic_init(void) {
|
||||
|
||||
char coremsg[128];
|
||||
char coren[22];
|
||||
char *p;
|
||||
|
||||
|
||||
if (gotoblas) return;
|
||||
|
||||
p = getenv("OPENBLAS_CORETYPE");
|
||||
if ( p )
|
||||
{
|
||||
gotoblas = force_coretype(p);
|
||||
}
|
||||
else
|
||||
{
|
||||
gotoblas = get_coretype();
|
||||
}
|
||||
|
||||
#ifdef ARCH_X86
|
||||
if (gotoblas == NULL) gotoblas = &gotoblas_KATMAI;
|
||||
#else
|
||||
if (gotoblas == NULL) gotoblas = &gotoblas_PRESCOTT;
|
||||
/* sanity check, if 64bit pointer we can't have a 32 bit cpu */
|
||||
if (sizeof(void*) == 8) {
|
||||
if (gotoblas == &gotoblas_KATMAI ||
|
||||
gotoblas == &gotoblas_COPPERMINE ||
|
||||
gotoblas == &gotoblas_NORTHWOOD ||
|
||||
gotoblas == &gotoblas_BANIAS ||
|
||||
gotoblas == &gotoblas_ATHLON)
|
||||
gotoblas = &gotoblas_PRESCOTT;
|
||||
}
|
||||
#endif
|
||||
|
||||
if (gotoblas && gotoblas -> init) {
|
||||
strncpy(coren,gotoblas_corename(),20);
|
||||
sprintf(coremsg, "Core: %s\n",coren);
|
||||
openblas_warning(2, coremsg);
|
||||
gotoblas -> init();
|
||||
} else {
|
||||
openblas_warning(0, "OpenBLAS : Architecture Initialization failed. No initialization function found.\n");
|
||||
exit(1);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/* Drop the active kernel table.  gotoblas_dynamic_init() returns early only
   while `gotoblas` is non-NULL, so after this call it will select a table
   again the next time it runs. */
void gotoblas_dynamic_quit(void) {
|
||||
|
||||
gotoblas = NULL;
|
||||
|
||||
}
|
||||
@@ -105,6 +105,10 @@ $(LIBPREFIX).def : gensymbol
|
||||
libgoto_hpl.def : gensymbol
|
||||
perl ./gensymbol win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F)
|
||||
|
||||
ifeq ($(OSNAME), Darwin)
|
||||
INTERNALNAME = $(LIBPREFIX).$(MAJOR_VERSION).dylib
|
||||
endif
|
||||
|
||||
ifeq (, $(SYMBOLPREFIX)$(SYMBOLSUFFIX))
|
||||
$(LIBDYNNAME) : ../$(LIBNAME) osx.def
|
||||
else
|
||||
@@ -114,9 +118,9 @@ $(LIBDYNNAME) : ../$(LIBNAME).osx.renamed osx.def
|
||||
endif
|
||||
ifneq (,$(filter 1 2,$(NOFORTRAN)))
|
||||
#only build without Fortran
|
||||
$(CC) $(CFLAGS) $(LDFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(LIBDYNNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
|
||||
$(CC) $(CFLAGS) $(LDFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(INTERNALNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
|
||||
else
|
||||
$(FC) $(FFLAGS) $(LDFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(LIBDYNNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
|
||||
$(FC) $(FFLAGS) $(LDFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(INTERNALNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB)
|
||||
endif
|
||||
|
||||
dllinit.$(SUFFIX) : dllinit.c
|
||||
|
||||
@@ -618,19 +618,6 @@
|
||||
# functions added for lapack-3.7.0
|
||||
|
||||
slarfy,
|
||||
slasyf_rk,
|
||||
ssyconvf_rook,
|
||||
ssytf2_rk,
|
||||
ssytrf_rk,
|
||||
ssytrs_3,
|
||||
ssytri_3,
|
||||
ssytri_3x,
|
||||
ssycon_3,
|
||||
ssysv_rk,
|
||||
slasyf_aa,
|
||||
ssysv_aa,
|
||||
ssytrf_aa,
|
||||
ssytrs_aa,
|
||||
strevc3,
|
||||
sgelqt,
|
||||
sgelqt3,
|
||||
@@ -647,33 +634,8 @@
|
||||
stplqt,
|
||||
stplqt2,
|
||||
stpmlqt,
|
||||
ssytrd_2stage,
|
||||
ssytrd_sy2sb,
|
||||
ssytrd_sb2st,
|
||||
ssb2st_kernels,
|
||||
ssyevd_2stage,
|
||||
ssyev_2stage,
|
||||
ssyevx_2stage,
|
||||
ssyevr_2stage,
|
||||
ssbev_2stage,
|
||||
ssbevx_2stage,
|
||||
ssbevd_2stage,
|
||||
ssygv_2stage,
|
||||
dlarfy,
|
||||
dlasyf_rk,
|
||||
dsyconvf,
|
||||
dsyconvf_rook,
|
||||
dsytf2_rk,
|
||||
dsytrf_rk,
|
||||
dsytrs_3,
|
||||
dsytri_3,
|
||||
dsytri_3x,
|
||||
dsycon_3,
|
||||
dsysv_rk,
|
||||
dlasyf_aa,
|
||||
dsysv_aa,
|
||||
dsytrf_aa,
|
||||
dsytrs_aa,
|
||||
dtrevc3,
|
||||
dgelqt,
|
||||
dgelqt3,
|
||||
@@ -690,45 +652,8 @@
|
||||
dtplqt,
|
||||
dtplqt2,
|
||||
dtpmlqt,
|
||||
dsytrd_2stage,
|
||||
dsytrd_sy2sb,
|
||||
dsytrd_sb2st,
|
||||
dsb2st_kernels,
|
||||
dsyevd_2stage,
|
||||
dsyev_2stage,
|
||||
dsyevx_2stage,
|
||||
dsyevr_2stage,
|
||||
dsbev_2stage,
|
||||
dsbevx_2stage,
|
||||
dsbevd_2stage,
|
||||
dsygv_2stage,
|
||||
chetf2_rk,
|
||||
chetrf_rk,
|
||||
chetri_3,
|
||||
chetri_3x,
|
||||
chetrs_3,
|
||||
checon_3,
|
||||
chesv_rk,
|
||||
chesv_aa,
|
||||
chetrf_aa,
|
||||
chetrs_aa,
|
||||
clahef_aa,
|
||||
clahef_rk,
|
||||
clarfy,
|
||||
clasyf_rk,
|
||||
clasyf_aa,
|
||||
csyconvf,
|
||||
csyconvf_rook,
|
||||
csytf2_rk,
|
||||
csytrf_rk,
|
||||
csytrf_aa,
|
||||
csytrs_3,
|
||||
csytrs_aa,
|
||||
csytri_3,
|
||||
csytri_3x,
|
||||
csycon_3,
|
||||
csysv_rk,
|
||||
csysv_aa,
|
||||
ctrevc3,
|
||||
cgelqt,
|
||||
cgelqt3,
|
||||
@@ -745,45 +670,8 @@
|
||||
ctplqt,
|
||||
ctplqt2,
|
||||
ctpmlqt,
|
||||
chetrd_2stage,
|
||||
chetrd_he2hb,
|
||||
chetrd_hb2st,
|
||||
chb2st_kernels,
|
||||
cheevd_2stage,
|
||||
cheev_2stage,
|
||||
cheevx_2stage,
|
||||
cheevr_2stage,
|
||||
chbev_2stage,
|
||||
chbevx_2stage,
|
||||
chbevd_2stage,
|
||||
chegv_2stage,
|
||||
zhetf2_rk,
|
||||
zhetrf_rk,
|
||||
zhetri_3,
|
||||
zhetri_3x,
|
||||
zhetrs_3,
|
||||
zhecon_3,
|
||||
zhesv_rk,
|
||||
zhesv_aa,
|
||||
zhetrf_aa,
|
||||
zhetrs_aa,
|
||||
zlahef_aa,
|
||||
zlahef_rk,
|
||||
zlarfy,
|
||||
zlasyf_rk,
|
||||
zlasyf_aa,
|
||||
zsyconvf,
|
||||
zsyconvf_rook,
|
||||
zsytrs_aa,
|
||||
zsytf2_rk,
|
||||
zsytrf_rk,
|
||||
zsytrf_aa,
|
||||
zsytrs_3,
|
||||
zsytri_3,
|
||||
zsytri_3x,
|
||||
zsycon_3,
|
||||
zsysv_rk,
|
||||
zsysv_aa,
|
||||
ztrevc3,
|
||||
ztplqt,
|
||||
ztplqt2,
|
||||
@@ -800,43 +688,13 @@
|
||||
zlaswlq,
|
||||
zlamswlq,
|
||||
zgemlq,
|
||||
zhetrd_2stage,
|
||||
zhetrd_he2hb,
|
||||
zhetrd_hb2st,
|
||||
zhb2st_kernels,
|
||||
zheevd_2stage,
|
||||
zheev_2stage,
|
||||
zheevx_2stage,
|
||||
zheevr_2stage,
|
||||
zhbev_2stage,
|
||||
zhbevx_2stage,
|
||||
zhbevd_2stage,
|
||||
zhegv_2stage,
|
||||
sladiv1,
|
||||
dladiv1,
|
||||
iparam2stage,
|
||||
|
||||
# functions added for lapack-3.8.0
|
||||
|
||||
ilaenv2stage,
|
||||
ssysv_aa_2stage,
|
||||
ssytrf_aa_2stage,
|
||||
ssytrs_aa_2stage,
|
||||
chesv_aa_2stage,
|
||||
chetrf_aa_2stage,
|
||||
chetrs_aa_2stage,
|
||||
csysv_aa_2stage,
|
||||
csytrf_aa_2stage,
|
||||
csytrs_aa_2stage,
|
||||
dsysv_aa_2stage,
|
||||
dsytrf_aa_2stage,
|
||||
dsytrs_aa_2stage,
|
||||
zhesv_aa_2stage,
|
||||
zhetrf_aa_2stage,
|
||||
zhetrs_aa_2stage,
|
||||
zsysv_aa_2stage,
|
||||
zsytrf_aa_2stage,
|
||||
zsytrs_aa_2stage
|
||||
ilaenv2stage
|
||||
);
|
||||
|
||||
@lapack_extendedprecision_objs = (
|
||||
@@ -3509,6 +3367,59 @@
|
||||
zlahef_rook, zlasyf_rook,
|
||||
zsytf2_rook, zsytrf_rook, zsytrs_rook,
|
||||
zsytri_rook, zsycon_rook, zsysv_rook,
|
||||
# 3.7.0
|
||||
slasyf_rk, ssyconvf_rook, ssytf2_rk,
|
||||
ssytrf_rk, ssytrs_3, ssytri_3,
|
||||
ssytri_3x, ssycon_3, ssysv_rk,
|
||||
slasyf_aa, ssysv_aa, ssytrf_aa,
|
||||
ssytrs_aa, ssytrd_2stage, ssytrd_sy2sb,
|
||||
ssytrd_sb2st, ssb2st_kernels, ssyevd_2stage,
|
||||
ssyev_2stage, ssyevx_2stage, ssyevr_2stage,
|
||||
ssbev_2stage, ssbevx_2stage, ssbevd_2stage,
|
||||
ssygv_2stage, dlasyf_rk, dsyconvf_rook,
|
||||
dsytf2_rk, dsytrf_rk, dsytrs_3,
|
||||
dsytri_3, dsytri_3x, dsycon_3,
|
||||
dsysv_rk, dlasyf_aa, dsysv_aa,
|
||||
dsytrf_aa, dsytrs_aa, dsytrd_2stage,
|
||||
dsytrd_sy2sb, dsytrd_sb2st, dsb2st_kernels,
|
||||
dsyevd_2stage, dsyev_2stage, dsyevx_2stage,
|
||||
dsyevr_2stage, dsbev_2stage, dsbevx_2stage,
|
||||
dsbevd_2stage, dsygv_2stage, chetf2_rk,
|
||||
chetrf_rk, chetri_3, chetri_3x,
|
||||
chetrs_3, checon_3, chesv_rk,
|
||||
chesv_aa, chetrf_aa, chetrs_aa,
|
||||
clahef_aa, clahef_rk, clasyf_rk,
|
||||
clasyf_aa, csytf2_rk, csytrf_rk,
|
||||
csytrf_aa, csytrs_3, csytrs_aa,
|
||||
csytri_3, csytri_3x, csycon_3,
|
||||
csysv_rk, csysv_aa, csyconvf_rook,
|
||||
chetrd_2stage, chetrd_he2hb, chetrd_hb2st,
|
||||
chb2st_kernels, cheevd_2stage, cheev_2stage,
|
||||
cheevx_2stage, cheevr_2stage, chbev_2stage,
|
||||
chbevx_2stage, chbevd_2stage, chegv_2stage,
|
||||
zhetf2_rk, zhetrf_rk, zhetri_3,
|
||||
zhetri_3x, zhetrs_3, zhecon_3,
|
||||
zhesv_rk, zhesv_aa, zhetrf_aa,
|
||||
zhetrs_aa, zlahef_aa, zlahef_rk,
|
||||
zlasyf_rk, zlasyf_aa, zsyconvf_rook,
|
||||
zsytrs_aa, zsytf2_rk, zsytrf_rk,
|
||||
zsytrf_aa, zsytrs_3, zsytri_3,
|
||||
zsytri_3x, zsycon_3, zsysv_rk,
|
||||
zsysv_aa, zhetrd_2stage, zhetrd_he2hb,
|
||||
zhetrd_hb2st, zhb2st_kernels, zheevd_2stage,
|
||||
zheev_2stage, zheevx_2stage, zheevr_2stage,
|
||||
zhbev_2stage, zhbevx_2stage, zhbevd_2stage,
|
||||
zhegv_2stage,
|
||||
# 3.8.0
|
||||
ssysv_aa_2stage, ssytrf_aa_2stage,
|
||||
ssytrs_aa_2stage, chesv_aa_2stage,
|
||||
chetrf_aa_2stage, chetrs_aa_2stage,
|
||||
csysv_aa_2stage, csytrf_aa_2stage,
|
||||
csytrs_aa_2stage, dsysv_aa_2stage,
|
||||
dsytrf_aa_2stage, dsytrs_aa_2stage,
|
||||
zhesv_aa_2stage, zhetrf_aa_2stage,
|
||||
zhetrs_aa_2stage, zsysv_aa_2stage,
|
||||
zsytrf_aa_2stage, zsytrs_aa_2stage
|
||||
);
|
||||
|
||||
|
||||
|
||||
9
f_check
9
f_check
@@ -19,7 +19,7 @@ $nofortran = 0;
|
||||
|
||||
$compiler = join(" ", @ARGV);
|
||||
$compiler_bin = shift(@ARGV);
|
||||
|
||||
|
||||
# f77 is too ambiguous
|
||||
$compiler = "" if $compiler eq "f77";
|
||||
|
||||
@@ -130,6 +130,11 @@ if ($compiler eq "") {
|
||||
if ($data =~ / zho_ge__/) {
|
||||
$need2bu = 1;
|
||||
}
|
||||
if ($vendor =~ /G95/) {
|
||||
if ($ENV{NO_LAPACKE} != 1) {
|
||||
$need2bu = "";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if ($vendor eq "") {
|
||||
@@ -277,6 +282,8 @@ $linker_a = "";
|
||||
if ($link ne "") {
|
||||
|
||||
$link =~ s/\-Y\sP\,/\-Y/g;
|
||||
|
||||
$link =~ s/\-R\s*/\-rpath\@/g;
|
||||
|
||||
$link =~ s/\-rpath\s+/\-rpath\@/g;
|
||||
|
||||
|
||||
@@ -82,7 +82,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#ifdef OS_WINDOWS
|
||||
#include <windows.h>
|
||||
#endif
|
||||
#if defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__DragonFly__) || defined(__APPLE__)
|
||||
#if defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__) || defined(__DragonFly__) || defined(__APPLE__)
|
||||
#include <sys/types.h>
|
||||
#include <sys/sysctl.h>
|
||||
#endif
|
||||
@@ -1201,7 +1201,7 @@ static int get_num_cores(void) {
|
||||
|
||||
#ifdef OS_WINDOWS
|
||||
SYSTEM_INFO sysinfo;
|
||||
#elif defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__DragonFly__) || defined(__APPLE__)
|
||||
#elif defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__) || defined(__DragonFly__) || defined(__APPLE__)
|
||||
int m[2], count;
|
||||
size_t len;
|
||||
#endif
|
||||
@@ -1215,7 +1215,7 @@ static int get_num_cores(void) {
|
||||
GetSystemInfo(&sysinfo);
|
||||
return sysinfo.dwNumberOfProcessors;
|
||||
|
||||
#elif defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__DragonFly__) || defined(__APPLE__)
|
||||
#elif defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__) || defined(__DragonFly__) || defined(__APPLE__)
|
||||
m[0] = CTL_HW;
|
||||
m[1] = HW_NCPU;
|
||||
len = sizeof(int);
|
||||
|
||||
@@ -394,7 +394,7 @@ XBLASOBJS = $(XBLAS1OBJS) $(XBLAS2OBJS) $(XBLAS3OBJS)
|
||||
SLAPACKOBJS = \
|
||||
sgetrf.$(SUFFIX) sgetrs.$(SUFFIX) spotrf.$(SUFFIX) sgetf2.$(SUFFIX) \
|
||||
spotf2.$(SUFFIX) slaswp.$(SUFFIX) sgesv.$(SUFFIX) slauu2.$(SUFFIX) \
|
||||
slauum.$(SUFFIX) strti2.$(SUFFIX) strtri.$(SUFFIX)
|
||||
slauum.$(SUFFIX) strti2.$(SUFFIX) strtri.$(SUFFIX) strtrs.$(SUFFIX)
|
||||
|
||||
|
||||
#DLAPACKOBJS = \
|
||||
@@ -405,14 +405,14 @@ SLAPACKOBJS = \
|
||||
DLAPACKOBJS = \
|
||||
dgetrf.$(SUFFIX) dgetrs.$(SUFFIX) dpotrf.$(SUFFIX) dgetf2.$(SUFFIX) \
|
||||
dpotf2.$(SUFFIX) dlaswp.$(SUFFIX) dgesv.$(SUFFIX) dlauu2.$(SUFFIX) \
|
||||
dlauum.$(SUFFIX) dtrti2.$(SUFFIX) dtrtri.$(SUFFIX)
|
||||
dlauum.$(SUFFIX) dtrti2.$(SUFFIX) dtrtri.$(SUFFIX) dtrtrs.$(SUFFIX)
|
||||
|
||||
|
||||
QLAPACKOBJS = \
|
||||
qgetf2.$(SUFFIX) qgetrf.$(SUFFIX) qlauu2.$(SUFFIX) qlauum.$(SUFFIX) \
|
||||
qpotf2.$(SUFFIX) qpotrf.$(SUFFIX) qtrti2.$(SUFFIX) qtrtri.$(SUFFIX) \
|
||||
qlaswp.$(SUFFIX) qgetrs.$(SUFFIX) qgesv.$(SUFFIX) qpotri.$(SUFFIX) \
|
||||
|
||||
qlaswp.$(SUFFIX) qtrtrs.$(SUFFIX) qgesv.$(SUFFIX) qpotri.$(SUFFIX) \
|
||||
qtrtrs.$(SUFFIX)
|
||||
|
||||
#CLAPACKOBJS = \
|
||||
# cgetrf.$(SUFFIX) cgetrs.$(SUFFIX) cpotrf.$(SUFFIX) cgetf2.$(SUFFIX) \
|
||||
@@ -423,7 +423,7 @@ QLAPACKOBJS = \
|
||||
CLAPACKOBJS = \
|
||||
cgetrf.$(SUFFIX) cgetrs.$(SUFFIX) cpotrf.$(SUFFIX) cgetf2.$(SUFFIX) \
|
||||
cpotf2.$(SUFFIX) claswp.$(SUFFIX) cgesv.$(SUFFIX) clauu2.$(SUFFIX) \
|
||||
clauum.$(SUFFIX) ctrti2.$(SUFFIX) ctrtri.$(SUFFIX)
|
||||
clauum.$(SUFFIX) ctrti2.$(SUFFIX) ctrtri.$(SUFFIX) ctrtrs.$(SUFFIX)
|
||||
|
||||
|
||||
#ZLAPACKOBJS = \
|
||||
@@ -435,13 +435,14 @@ CLAPACKOBJS = \
|
||||
ZLAPACKOBJS = \
|
||||
zgetrf.$(SUFFIX) zgetrs.$(SUFFIX) zpotrf.$(SUFFIX) zgetf2.$(SUFFIX) \
|
||||
zpotf2.$(SUFFIX) zlaswp.$(SUFFIX) zgesv.$(SUFFIX) zlauu2.$(SUFFIX) \
|
||||
zlauum.$(SUFFIX) ztrti2.$(SUFFIX) ztrtri.$(SUFFIX)
|
||||
zlauum.$(SUFFIX) ztrti2.$(SUFFIX) ztrtri.$(SUFFIX) ztrtrs.$(SUFFIX)
|
||||
|
||||
|
||||
XLAPACKOBJS = \
|
||||
xgetf2.$(SUFFIX) xgetrf.$(SUFFIX) xlauu2.$(SUFFIX) xlauum.$(SUFFIX) \
|
||||
xpotf2.$(SUFFIX) xpotrf.$(SUFFIX) xtrti2.$(SUFFIX) xtrtri.$(SUFFIX) \
|
||||
xlaswp.$(SUFFIX) xgetrs.$(SUFFIX) xgesv.$(SUFFIX) xpotri.$(SUFFIX) \
|
||||
xlaswp.$(SUFFIX) xtrtrs.$(SUFFIX) xgesv.$(SUFFIX) xpotri.$(SUFFIX) \
|
||||
xtrtrs.$(SUFFIX)
|
||||
|
||||
ifneq ($(NO_LAPACK), 1)
|
||||
SBLASOBJS += $(SLAPACKOBJS)
|
||||
@@ -2031,7 +2032,7 @@ sgetrs.$(SUFFIX) sgetrs.$(PSUFFIX) : lapack/getrs.c
|
||||
dgetrs.$(SUFFIX) dgetrs.$(PSUFFIX) : lapack/getrs.c
|
||||
$(CC) -c $(CFLAGS) $< -o $(@F)
|
||||
|
||||
qgetrs.$(SUFFIX) qgetrs.$(PSUFFIX) : getrs.c
|
||||
qgetrs.$(SUFFIX) qgetrs.$(PSUFFIX) : lapack/getrs.c
|
||||
$(CC) -c $(CFLAGS) $< -o $(@F)
|
||||
|
||||
cgetrs.$(SUFFIX) cgetrs.$(PSUFFIX) : lapack/zgetrs.c
|
||||
@@ -2040,7 +2041,25 @@ cgetrs.$(SUFFIX) cgetrs.$(PSUFFIX) : lapack/zgetrs.c
|
||||
zgetrs.$(SUFFIX) zgetrs.$(PSUFFIX) : lapack/zgetrs.c
|
||||
$(CC) -c $(CFLAGS) $< -o $(@F)
|
||||
|
||||
xgetrs.$(SUFFIX) xgetrs.$(PSUFFIX) : zgetrs.c
|
||||
xgetrs.$(SUFFIX) xgetrs.$(PSUFFIX) : lapack/zgetrs.c
|
||||
$(CC) -c $(CFLAGS) $< -o $(@F)
|
||||
|
||||
strtrs.$(SUFFIX) strtrs.$(PSUFFIX) : lapack/trtrs.c
|
||||
$(CC) -c $(CFLAGS) $< -o $(@F)
|
||||
|
||||
dtrtrs.$(SUFFIX) dtrtrs.$(PSUFFIX) : lapack/trtrs.c
|
||||
$(CC) -c $(CFLAGS) $< -o $(@F)
|
||||
|
||||
qtrtrs.$(SUFFIX) qtrtrs.$(PSUFFIX) : lapack/trtrs.c
|
||||
$(CC) -c $(CFLAGS) $< -o $(@F)
|
||||
|
||||
ctrtrs.$(SUFFIX) ctrtrs.$(PSUFFIX) : lapack/ztrtrs.c
|
||||
$(CC) -c $(CFLAGS) $< -o $(@F)
|
||||
|
||||
ztrtrs.$(SUFFIX) ztrtrs.$(PSUFFIX) : lapack/ztrtrs.c
|
||||
$(CC) -c $(CFLAGS) $< -o $(@F)
|
||||
|
||||
xtrtrs.$(SUFFIX) xtrtrs.$(PSUFFIX) : lapack/ztrtrs.c
|
||||
$(CC) -c $(CFLAGS) $< -o $(@F)
|
||||
|
||||
sgesv.$(SUFFIX) sgesv.$(PSUFFIX) : lapack/gesv.c
|
||||
|
||||
171
interface/lapack/trtrs.c
Normal file
171
interface/lapack/trtrs.c
Normal file
@@ -0,0 +1,171 @@
|
||||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include "common.h"
|
||||
#ifdef FUNCTION_PROFILE
|
||||
#include "functable.h"
|
||||
#endif
|
||||
|
||||
#ifdef XDOUBLE
|
||||
#define ERROR_NAME "QTRTRS"
|
||||
#elif defined(DOUBLE)
|
||||
#define ERROR_NAME "DTRTRS"
|
||||
#else
|
||||
#define ERROR_NAME "STRTRS"
|
||||
#endif
|
||||
|
||||
/*
 * Solver dispatch tables for the real precisions.
 *
 * NAME() indexes them with (uplo << 2) | (trans << 1) | diag, where
 *   uplo : 0 = 'U', 1 = 'L'
 *   trans: 0 = 'N' or 'R', 1 = 'T' or 'C'
 *   diag : 0 = 'U', 1 = 'N'
 * so the order of the entries below must not change.
 */
static blasint (*trtrs_single[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = {
  TRTRS_UNU_SINGLE,   /* 0 */
  TRTRS_UNN_SINGLE,   /* 1 */
  TRTRS_UTU_SINGLE,   /* 2 */
  TRTRS_UTN_SINGLE,   /* 3 */
  TRTRS_LNU_SINGLE,   /* 4 */
  TRTRS_LNN_SINGLE,   /* 5 */
  TRTRS_LTU_SINGLE,   /* 6 */
  TRTRS_LTN_SINGLE,   /* 7 */
};

#ifdef SMP
/* Same layout as trtrs_single; used when more than one thread is available. */
static blasint (*trtrs_parallel[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = {
  TRTRS_UNU_PARALLEL, /* 0 */
  TRTRS_UNN_PARALLEL, /* 1 */
  TRTRS_UTU_PARALLEL, /* 2 */
  TRTRS_UTN_PARALLEL, /* 3 */
  TRTRS_LNU_PARALLEL, /* 4 */
  TRTRS_LNN_PARALLEL, /* 5 */
  TRTRS_LTU_PARALLEL, /* 6 */
  TRTRS_LTN_PARALLEL, /* 7 */
};
#endif
|
||||
|
||||
/*
 * Fortran-callable xTRTRS entry point for the real precisions (reported to
 * xerbla as STRTRS / DTRTRS / QTRTRS).
 *
 * UPLO, TRANS, DIAG : option characters, compared case-insensitively.
 * N, NRHS           : order of A and number of columns of B.
 * a, ldA / b, ldB   : the two matrices and their leading dimensions.
 * Info              : 0 on success; -k when argument k is invalid (xerbla
 *                     is called as well); for DIAG = 'N', the value
 *                     returned by IAMIN_K over the diagonal when the
 *                     smallest diagonal magnitude is exactly ZERO.
 *
 * The solve itself is delegated to trtrs_single[] / trtrs_parallel[].
 * The function always returns 0.
 */
int NAME(char *UPLO, char* TRANS, char* DIAG, blasint *N, blasint *NRHS, FLOAT *a, blasint *ldA,
	    FLOAT *b, blasint *ldB, blasint *Info){

  char uplo_arg  = *UPLO;
  char trans_arg = *TRANS;
  char diag_arg  = *DIAG;

  blas_arg_t args;

  blasint info;
  int uplo, trans, diag;
  FLOAT *buffer;
#ifdef PPC440
  extern
#endif
  FLOAT *sa, *sb;

  PRINT_DEBUG_NAME;

  args.m    = *N;
  args.n    = *NRHS;
  args.a    = (void *)a;
  args.lda  = *ldA;
  args.b    = (void *)b;
  args.ldb  = *ldB;

  info = 0;

  /* All three option characters are case-insensitive, as in reference
     LAPACK; previously only TRANS was folded to upper case. */
  TOUPPER(uplo_arg);
  TOUPPER(trans_arg);
  TOUPPER(diag_arg);

  trans = -1;
  if (trans_arg == 'N') trans = 0;
  if (trans_arg == 'T') trans = 1;
  if (trans_arg == 'R') trans = 0;
  if (trans_arg == 'C') trans = 1;

  uplo = -1;
  if (uplo_arg == 'U') uplo = 0;
  if (uplo_arg == 'L') uplo = 1;

  diag = -1;
  if (diag_arg == 'U') diag = 0;
  if (diag_arg == 'N') diag = 1;

  /* Checked from the last argument to the first so that the assignment
     that survives is the lowest-numbered invalid argument. */
  if (args.ldb < MAX(1, args.m)) info = 9;
  if (args.lda < MAX(1, args.m)) info = 7;
  if (args.n < 0)                info = 5;
  if (args.m < 0)                info = 4;
  if (diag < 0)                  info = 3;
  if (trans < 0)                 info = 2;
  if (uplo < 0)                  info = 1;

  if (info != 0) {
    BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
    *Info = - info;
    return 0;
  }

  args.alpha = NULL;
  args.beta  = NULL;

  *Info = 0;

  /* Nothing to solve for an empty system. */
  if (args.m == 0) return 0;

  /* DIAG = 'N': scan the diagonal (stride lda + 1) and refuse to solve
     when its smallest magnitude is exactly zero. */
  if (diag) {
    if (AMIN_K(args.m, args.a, args.lda + 1) == ZERO) {
      *Info = IAMIN_K(args.m, args.a, args.lda + 1);
      return 0;
    }
  }

  IDEBUG_START;

  FUNCTION_PROFILE_START();

#ifndef PPC440
  /* Carve the two work areas handed to the solver out of one allocation. */
  buffer = (FLOAT *)blas_memory_alloc(1);

  sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A);
  sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
#endif

#ifdef SMP
  args.common = NULL;
  args.nthreads = num_cpu_avail(4);

  if (args.nthreads == 1) {
#endif

    (trtrs_single[(uplo << 2) | (trans << 1) | diag])(&args, NULL, NULL, sa, sb, 0);

#ifdef SMP
  } else {
    (trtrs_parallel[(uplo << 2) | (trans << 1) | diag])(&args, NULL, NULL, sa, sb, 0);
  }
#endif

#ifndef PPC440
  blas_memory_free(buffer);
#endif

  FUNCTION_PROFILE_END(COMPSIZE * COMPSIZE, args.m * args.n, 2 * args.m * args.m * args.n);

  IDEBUG_END;

  return 0;

}
|
||||
171
interface/lapack/ztrtrs.c
Normal file
171
interface/lapack/ztrtrs.c
Normal file
@@ -0,0 +1,171 @@
|
||||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include "common.h"
|
||||
#ifdef FUNCTION_PROFILE
|
||||
#include "functable.h"
|
||||
#endif
|
||||
|
||||
#ifdef XDOUBLE
|
||||
#define ERROR_NAME "XTRTRS"
|
||||
#elif defined(DOUBLE)
|
||||
#define ERROR_NAME "ZTRTRS"
|
||||
#else
|
||||
#define ERROR_NAME "CTRTRS"
|
||||
#endif
|
||||
|
||||
/*
 * Solver dispatch tables for the complex precisions.
 *
 * NAME() indexes them with (uplo << 3) | (trans << 1) | diag, where
 *   uplo : 0 = 'U', 1 = 'L'
 *   trans: 0 = 'N', 1 = 'T', 2 = 'R', 3 = 'C'
 *   diag : 0 = 'U', 1 = 'N'
 * so the order of the entries below must not change.
 */
static blasint (*trtrs_single[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = {
  TRTRS_UNU_SINGLE,   /*  0 */
  TRTRS_UNN_SINGLE,   /*  1 */
  TRTRS_UTU_SINGLE,   /*  2 */
  TRTRS_UTN_SINGLE,   /*  3 */
  TRTRS_URU_SINGLE,   /*  4 */
  TRTRS_URN_SINGLE,   /*  5 */
  TRTRS_UCU_SINGLE,   /*  6 */
  TRTRS_UCN_SINGLE,   /*  7 */
  TRTRS_LNU_SINGLE,   /*  8 */
  TRTRS_LNN_SINGLE,   /*  9 */
  TRTRS_LTU_SINGLE,   /* 10 */
  TRTRS_LTN_SINGLE,   /* 11 */
  TRTRS_LRU_SINGLE,   /* 12 */
  TRTRS_LRN_SINGLE,   /* 13 */
  TRTRS_LCU_SINGLE,   /* 14 */
  TRTRS_LCN_SINGLE,   /* 15 */
};

#ifdef SMP
/* Same layout as trtrs_single; used when more than one thread is available. */
static blasint (*trtrs_parallel[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = {
  TRTRS_UNU_PARALLEL, /*  0 */
  TRTRS_UNN_PARALLEL, /*  1 */
  TRTRS_UTU_PARALLEL, /*  2 */
  TRTRS_UTN_PARALLEL, /*  3 */
  TRTRS_URU_PARALLEL, /*  4 */
  TRTRS_URN_PARALLEL, /*  5 */
  TRTRS_UCU_PARALLEL, /*  6 */
  TRTRS_UCN_PARALLEL, /*  7 */
  TRTRS_LNU_PARALLEL, /*  8 */
  TRTRS_LNN_PARALLEL, /*  9 */
  TRTRS_LTU_PARALLEL, /* 10 */
  TRTRS_LTN_PARALLEL, /* 11 */
  TRTRS_LRU_PARALLEL, /* 12 */
  TRTRS_LRN_PARALLEL, /* 13 */
  TRTRS_LCU_PARALLEL, /* 14 */
  TRTRS_LCN_PARALLEL, /* 15 */
};
#endif
|
||||
|
||||
/*
 * Fortran-callable xTRTRS entry point for the complex precisions (reported
 * to xerbla as CTRTRS / ZTRTRS / XTRTRS).
 *
 * UPLO, TRANS, DIAG : option characters, compared case-insensitively.
 * N, NRHS           : order of A and number of columns of B.
 * a, ldA / b, ldB   : the two matrices and their leading dimensions.
 * Info              : 0 on success; -k when argument k is invalid (xerbla
 *                     is called as well); for DIAG = 'N', the value
 *                     returned by IAMIN_K over the diagonal when the
 *                     smallest diagonal magnitude is exactly ZERO.
 *
 * The solve itself is delegated to trtrs_single[] / trtrs_parallel[].
 * The function always returns 0.
 */
int NAME(char *UPLO, char* TRANS, char* DIAG, blasint *N, blasint *NRHS, FLOAT *a, blasint *ldA,
	    FLOAT *b, blasint *ldB, blasint *Info){

  char uplo_arg  = *UPLO;
  char trans_arg = *TRANS;
  char diag_arg  = *DIAG;

  blas_arg_t args;

  blasint info;
  int uplo, trans, diag;
  FLOAT *buffer;
#ifdef PPC440
  extern
#endif
  FLOAT *sa, *sb;

  PRINT_DEBUG_NAME;

  args.m    = *N;
  args.n    = *NRHS;
  args.a    = (void *)a;
  args.lda  = *ldA;
  args.b    = (void *)b;
  args.ldb  = *ldB;

  info = 0;

  /* All three option characters are case-insensitive, as in reference
     LAPACK; previously only TRANS was folded to upper case. */
  TOUPPER(uplo_arg);
  TOUPPER(trans_arg);
  TOUPPER(diag_arg);

  trans = -1;
  if (trans_arg == 'N') trans = 0;
  if (trans_arg == 'T') trans = 1;
  if (trans_arg == 'R') trans = 2;
  if (trans_arg == 'C') trans = 3;

  uplo = -1;
  if (uplo_arg == 'U') uplo = 0;
  if (uplo_arg == 'L') uplo = 1;

  diag = -1;
  if (diag_arg == 'U') diag = 0;
  if (diag_arg == 'N') diag = 1;

  /* Checked from the last argument to the first so that the assignment
     that survives is the lowest-numbered invalid argument. */
  if (args.ldb < MAX(1, args.m)) info = 9;
  if (args.lda < MAX(1, args.m)) info = 7;
  if (args.n < 0)                info = 5;
  if (args.m < 0)                info = 4;
  if (diag < 0)                  info = 3;
  if (trans < 0)                 info = 2;
  if (uplo < 0)                  info = 1;

  if (info != 0) {
    BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
    *Info = - info;
    return 0;
  }

  args.alpha = NULL;
  args.beta  = NULL;

  *Info = 0;

  /* Nothing to solve for an empty system. */
  if (args.m == 0) return 0;

  /* DIAG = 'N': scan the diagonal (stride lda + 1) and refuse to solve
     when its smallest magnitude is exactly zero. */
  if (diag) {
    if (AMIN_K(args.m, args.a, args.lda + 1) == ZERO) {
      *Info = IAMIN_K(args.m, args.a, args.lda + 1);
      return 0;
    }
  }

  IDEBUG_START;

  FUNCTION_PROFILE_START();

#ifndef PPC440
  /* Carve the two work areas handed to the solver out of one allocation. */
  buffer = (FLOAT *)blas_memory_alloc(1);

  sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A);
  sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
#endif

#ifdef SMP
  args.common = NULL;
  args.nthreads = num_cpu_avail(4);

  if (args.nthreads == 1) {
#endif

    (trtrs_single[(uplo << 3) | (trans << 1) | diag])(&args, NULL, NULL, sa, sb, 0);

#ifdef SMP
  } else {
    (trtrs_parallel[(uplo << 3) | (trans << 1) | diag])(&args, NULL, NULL, sa, sb, 0);
  }
#endif

#ifndef PPC440
  blas_memory_free(buffer);
#endif

  FUNCTION_PROFILE_END(COMPSIZE * COMPSIZE, args.m * args.n, 2 * args.m * args.m * args.n);

  IDEBUG_END;

  return 0;

}
|
||||
@@ -24,9 +24,11 @@ ifeq ($(TARGET), LOONGSON3B)
|
||||
USE_TRMM = 1
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), GENERIC)
|
||||
ifneq ($(DYNAMIC_ARCH), 1)
|
||||
ifeq ($(TARGET), GENERIC)
|
||||
USE_TRMM = 1
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), HASWELL)
|
||||
USE_TRMM = 1
|
||||
|
||||
@@ -1,30 +1,30 @@
|
||||
include $(KERNELDIR)/KERNEL.ARMV5
|
||||
|
||||
SAMAXKERNEL = iamax_vfp.S
|
||||
DAMAXKERNEL = iamax_vfp.S
|
||||
CAMAXKERNEL = iamax_vfp.S
|
||||
ZAMAXKERNEL = iamax_vfp.S
|
||||
SAMAXKERNEL = amax_vfp.S
|
||||
DAMAXKERNEL = amax_vfp.S
|
||||
#CAMAXKERNEL = amax_vfp.S
|
||||
#ZAMAXKERNEL = amax_vfp.S
|
||||
|
||||
SAMINKERNEL = iamax_vfp.S
|
||||
DAMINKERNEL = iamax_vfp.S
|
||||
CAMINKERNEL = iamax_vfp.S
|
||||
ZAMINKERNEL = iamax_vfp.S
|
||||
SAMINKERNEL = amax_vfp.S
|
||||
DAMINKERNEL = amax_vfp.S
|
||||
#CAMINKERNEL = amax_vfp.S
|
||||
#ZAMINKERNEL = amax_vfp.S
|
||||
|
||||
SMAXKERNEL = iamax_vfp.S
|
||||
DMAXKERNEL = iamax_vfp.S
|
||||
SMAXKERNEL = amax_vfp.S
|
||||
DMAXKERNEL = amax_vfp.S
|
||||
|
||||
SMINKERNEL = iamax_vfp.S
|
||||
DMINKERNEL = iamax_vfp.S
|
||||
SMINKERNEL = amax_vfp.S
|
||||
DMINKERNEL = amax_vfp.S
|
||||
|
||||
ISAMAXKERNEL = iamax_vfp.S
|
||||
IDAMAXKERNEL = iamax_vfp.S
|
||||
ICAMAXKERNEL = iamax_vfp.S
|
||||
IZAMAXKERNEL = iamax_vfp.S
|
||||
#ICAMAXKERNEL = iamax_vfp.S
|
||||
#IZAMAXKERNEL = iamax_vfp.S
|
||||
|
||||
ISAMINKERNEL = iamax_vfp.S
|
||||
IDAMINKERNEL = iamax_vfp.S
|
||||
ICAMINKERNEL = iamax_vfp.S
|
||||
IZAMINKERNEL = iamax_vfp.S
|
||||
#ICAMINKERNEL = iamax_vfp.S
|
||||
#IZAMINKERNEL = iamax_vfp.S
|
||||
|
||||
ISMAXKERNEL = iamax_vfp.S
|
||||
IDMAXKERNEL = iamax_vfp.S
|
||||
|
||||
445
kernel/arm/amax_vfp.S
Normal file
445
kernel/arm/amax_vfp.S
Normal file
@@ -0,0 +1,445 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2013, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
/**************************************************************************************
|
||||
* 2013/11/14 Saar
|
||||
* BLASTEST : OK
|
||||
* CTEST : OK
|
||||
* TEST : OK
|
||||
*
|
||||
**************************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
||||
#define STACKSIZE 256
|
||||
|
||||
#define N r0
|
||||
#define X r1
|
||||
#define INC_X r2
|
||||
|
||||
#define I r12
|
||||
|
||||
#define X_PRE 512
|
||||
|
||||
/**************************************************************************************
|
||||
* Macro definitions
|
||||
**************************************************************************************/
|
||||
|
||||
#if defined(USE_ABS)
|
||||
|
||||
#if defined(DOUBLE)
|
||||
|
||||
#define VABS(x0,x1) vabs.f64 x0, x1
|
||||
|
||||
#else
|
||||
|
||||
#define VABS(x0,x1) vabs.f32 x0, x1
|
||||
|
||||
#endif
|
||||
|
||||
#else
|
||||
|
||||
#define VABS(x0,x1) nop
|
||||
|
||||
#endif
|
||||
|
||||
/*****************************************************************************************/
|
||||
|
||||
#if defined(USE_MIN)
|
||||
|
||||
#define MOVCOND movlt
|
||||
|
||||
#if defined(DOUBLE)
|
||||
|
||||
#define VMOVCOND vmovlt.f64
|
||||
|
||||
#else
|
||||
|
||||
#define VMOVCOND vmovlt.f32
|
||||
|
||||
#endif
|
||||
|
||||
#else
|
||||
|
||||
#define MOVCOND movgt
|
||||
|
||||
#if defined(DOUBLE)
|
||||
|
||||
#define VMOVCOND vmovgt.f64
|
||||
|
||||
#else
|
||||
|
||||
#define VMOVCOND vmovgt.f32
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
/*****************************************************************************************/
|
||||
|
||||
|
||||
|
||||
#if !defined(COMPLEX)
|
||||
|
||||
#if defined(DOUBLE)
|
||||
|
||||
/* Real, double precision.
   d0 carries the running result, d4 the element being examined.
   VABS expands to vabs or to nop depending on USE_ABS; VMOVCOND is a
   conditional vmov (lt with USE_MIN, gt otherwise). */

/* Seed d0 from the element at X; the load writes the advanced address
   back to X. */
.macro INIT_F

	vldmia.f64	X!, { d0 }
	VABS( d0, d0 )

.endm

/* Examine one element, advancing X through load writeback, and replace
   d0 when the comparison selects the new element. */
.macro KERNEL_F1

	vldmia.f64	X!, { d4 }
	VABS( d4, d4 )
	vcmpe.f64	d4, d0
	vmrs		APSR_nzcv, fpscr
	VMOVCOND	d0, d4

.endm

/* As INIT_F, but X is advanced by INC_X instead of by load writeback
   (INC_X is presumably a byte stride prepared by the caller - confirm). */
.macro INIT_S

	vldmia.f64	X, { d0 }
	VABS( d0, d0 )
	add		X, X, INC_X

.endm

/* As KERNEL_F1, but X is advanced by INC_X. */
.macro KERNEL_S1

	vldmia.f64	X, { d4 }
	VABS( d4, d4 )
	vcmpe.f64	d4, d0
	vmrs		APSR_nzcv, fpscr
	VMOVCOND	d0, d4
	add		X, X, INC_X

.endm
|
||||
|
||||
#else
|
||||
|
||||
.macro INIT_F
	/*
	 * Real, single precision, unit stride: load the first element into the
	 * result register s0 (post-incrementing X) and apply VABS, which only
	 * takes the absolute value when USE_ABS is defined.
	 */
	vldmia.f32	X!, { s0 }
	VABS(s0, s0)
.endm
|
||||
|
||||
.macro KERNEL_F1
	/*
	 * Real, single precision, unit stride: process one element.
	 * s4 = next element (X post-incremented), optionally made absolute;
	 * s0 is overwritten with s4 when the VMOVCOND condition holds.
	 * Overwrites the core condition flags.
	 */
	vldmia.f32	X!, { s4 }
	VABS(s4, s4)
	vcmpe.f32	s4, s0
	vmrs	APSR_nzcv, fpscr	/* VFP compare flags -> core flags */
	VMOVCOND	s0, s4
.endm
|
||||
|
||||
.macro INIT_S
	/*
	 * Real, single precision, strided: load the first element into s0
	 * without write-back, apply VABS, then step X by INC_X (the routine
	 * scales INC_X to bytes before invoking this macro).
	 */
	vldmia.f32	X, { s0 }
	VABS(s0, s0)
	add	X, X, INC_X
.endm
|
||||
|
||||
|
||||
.macro KERNEL_S1
	/*
	 * Real, single precision, strided: process one element.
	 * s4 = element at X, optionally made absolute; s0 is overwritten with
	 * s4 when the VMOVCOND condition holds; X then advances by INC_X.
	 * Overwrites the core condition flags.
	 */
	vldmia.f32	X, { s4 }
	VABS(s4, s4)
	vcmpe.f32	s4, s0
	vmrs	APSR_nzcv, fpscr	/* VFP compare flags -> core flags */
	VMOVCOND	s0, s4
	add	X, X, INC_X
.endm
|
||||
|
||||
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
#else
|
||||
|
||||
#if defined(DOUBLE)
|
||||
|
||||
.macro INIT_F
	/*
	 * Complex, double precision, unit stride: load the first element's two
	 * components into d0/d1 (post-incrementing X) and leave
	 * |d0| + |d1| in the result register d0.
	 */
	vldmia.f64	X!, { d0 - d1 }
	vabs.f64	d0, d0
	vabs.f64	d1, d1
	vadd.f64	d0, d0, d1
.endm
|
||||
|
||||
|
||||
.macro KERNEL_F1
	/*
	 * Complex, double precision, unit stride: process one element.
	 * d4 = |first component| + |second component| of the next element
	 * (X post-incremented); d0 is overwritten with d4 when the VMOVCOND
	 * condition holds.  Uses d5 as scratch and overwrites the core
	 * condition flags.
	 */
	vldmia.f64	X!, { d4 - d5 }
	vabs.f64	d4, d4
	vabs.f64	d5, d5
	vadd.f64	d4, d4, d5
	vcmpe.f64	d4, d0
	vmrs	APSR_nzcv, fpscr	/* VFP compare flags -> core flags */
	VMOVCOND	d0, d4
.endm
|
||||
|
||||
.macro INIT_S
	/*
	 * Complex, double precision, strided: load the first element's two
	 * components into d0/d1 without write-back, leave |d0| + |d1| in d0,
	 * then step X by INC_X (the routine scales INC_X to bytes before
	 * invoking this macro).
	 */
	vldmia.f64	X, { d0 - d1 }
	vabs.f64	d0, d0
	vabs.f64	d1, d1
	vadd.f64	d0, d0, d1
	add	X, X, INC_X
.endm
|
||||
|
||||
|
||||
|
||||
.macro KERNEL_S1
	/*
	 * Complex, double precision, strided: process one element.
	 * d4 = |first component| + |second component| of the element at X;
	 * d0 is overwritten with d4 when the VMOVCOND condition holds; X then
	 * advances by INC_X.  Uses d5 as scratch and overwrites the core
	 * condition flags.
	 */
	vldmia.f64	X, { d4 - d5 }
	vabs.f64	d4, d4
	vabs.f64	d5, d5
	vadd.f64	d4, d4, d5
	vcmpe.f64	d4, d0
	vmrs	APSR_nzcv, fpscr	/* VFP compare flags -> core flags */
	VMOVCOND	d0, d4
	add	X, X, INC_X
.endm
|
||||
|
||||
#else
|
||||
|
||||
.macro INIT_F
	/*
	 * Complex, single precision, unit stride: load the first element's two
	 * components into s0/s1 (post-incrementing X) and leave
	 * |s0| + |s1| in the result register s0.
	 */
	vldmia.f32	X!, { s0 - s1 }
	vabs.f32	s0, s0
	vabs.f32	s1, s1
	vadd.f32	s0, s0, s1
.endm
|
||||
|
||||
|
||||
.macro KERNEL_F1
	/*
	 * Complex, single precision, unit stride: process one element.
	 * s4 = |first component| + |second component| of the next element
	 * (X post-incremented); s0 is overwritten with s4 when the VMOVCOND
	 * condition holds.  Uses s5 as scratch and overwrites the core
	 * condition flags.
	 */
	vldmia.f32	X!, { s4 - s5 }
	vabs.f32	s4, s4
	vabs.f32	s5, s5
	vadd.f32	s4, s4, s5
	vcmpe.f32	s4, s0
	vmrs	APSR_nzcv, fpscr	/* VFP compare flags -> core flags */
	VMOVCOND	s0, s4
.endm
|
||||
|
||||
.macro INIT_S
	/*
	 * Complex, single precision, strided: load the first element's two
	 * components into s0/s1 without write-back, leave |s0| + |s1| in s0,
	 * then step X by INC_X (the routine scales INC_X to bytes before
	 * invoking this macro).
	 */
	vldmia.f32	X, { s0 - s1 }
	vabs.f32	s0, s0
	vabs.f32	s1, s1
	vadd.f32	s0, s0, s1
	add	X, X, INC_X
.endm
|
||||
|
||||
|
||||
|
||||
.macro KERNEL_S1
	/*
	 * Complex, single precision, strided: process one element.
	 * s4 = |first component| + |second component| of the element at X;
	 * s0 is overwritten with s4 when the VMOVCOND condition holds; X then
	 * advances by INC_X.  Uses s5 as scratch and overwrites the core
	 * condition flags.
	 */
	vldmia.f32	X, { s4 - s5 }
	vabs.f32	s4, s4
	vabs.f32	s5, s5
	vadd.f32	s4, s4, s5
	vcmpe.f32	s4, s0
	vmrs	APSR_nzcv, fpscr	/* VFP compare flags -> core flags */
	VMOVCOND	s0, s4
	add	X, X, INC_X
.endm
|
||||
|
||||
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
/**************************************************************************************
|
||||
* End of macro definitions
|
||||
**************************************************************************************/
|
||||
|
||||
/*
 * Kernel entry point.
 *
 * Register roles (aliases come from the #defines at the top of the file):
 *   N      number of elements still to process
 *   X      pointer to the current element
 *   INC_X  stride in elements on entry; converted to a byte stride before
 *          the strided path starts
 *   I      loop counter (r12, which is also used to build the initial 0.0)
 *
 * Result: d0 (DOUBLE) or s0 holds the value selected by the INIT_x and
 * KERNEL_x macros.  When __ARM_PCS_VFP is not defined the result is also
 * copied to r0 (and r1 for DOUBLE) before returning.  If N <= 0 or
 * INC_X == 0 the result is 0.0.
 *
 * The "is the count zero?" tests after asrs/ands use beq.  Neither asrs nor
 * ands writes the V flag, so a signed "ble" there would depend on whatever
 * an earlier subs left in V; beq states the intended test directly and
 * behaves the same for every count that can reach those points.
 *
 * NOTE(review): only INC_X == 0 is rejected.  A negative INC_X takes the
 * strided path and steps X backwards from its starting address -- confirm
 * that callers never pass a negative stride.
 */
	PROLOGUE

	.align 5

	movs	r12, #0				/* result register starts as 0.0 */
	vmov	s0, r12
#if defined(DOUBLE)
	vcvt.f64.f32	d0, s0
#endif

	cmp	N, #0
	ble	amax_kernel_L999		/* no elements: return 0.0 */

	cmp	INC_X, #0
	beq	amax_kernel_L999		/* zero stride: return 0.0 */

	cmp	INC_X, #1
	bne	amax_kernel_S_BEGIN		/* non-unit stride */

/* ---- unit-stride path ------------------------------------------------ */

amax_kernel_F_BEGIN:

	INIT_F					/* result = first element */

	subs	N, N, #1
	ble	amax_kernel_L999		/* that was the only element */

	asrs	I, N, #2			/* I = N / 4 */
	beq	amax_kernel_F1			/* fewer than four remain */

	.align 5

	/* Each pass handles four elements; the body is unrolled twice. */
amax_kernel_F4:

	pld	[ X, #X_PRE ]
	KERNEL_F1
	KERNEL_F1
#if defined(COMPLEX) && defined(DOUBLE)
	pld	[ X, #X_PRE ]
#endif
	KERNEL_F1
	KERNEL_F1

	subs	I, I, #1
	ble	amax_kernel_F1

#if defined(COMPLEX) || defined(DOUBLE)
	pld	[ X, #X_PRE ]
#endif
	KERNEL_F1
	KERNEL_F1
#if defined(COMPLEX) && defined(DOUBLE)
	pld	[ X, #X_PRE ]
#endif
	KERNEL_F1
	KERNEL_F1

	subs	I, I, #1
	bne	amax_kernel_F4

amax_kernel_F1:

	ands	I, N, #3			/* I = N % 4 leftover elements */
	beq	amax_kernel_L999

amax_kernel_F10:

	KERNEL_F1

	subs	I, I, #1
	bne	amax_kernel_F10

	b	amax_kernel_L999

/* ---- strided path ---------------------------------------------------- */

amax_kernel_S_BEGIN:

#if defined(COMPLEX)

#if defined(DOUBLE)
	lsl	INC_X, INC_X, #4		/* INC_X * SIZE * 2 */
#else
	lsl	INC_X, INC_X, #3		/* INC_X * SIZE * 2 */
#endif

#else

#if defined(DOUBLE)
	lsl	INC_X, INC_X, #3		/* INC_X * SIZE */
#else
	lsl	INC_X, INC_X, #2		/* INC_X * SIZE */
#endif

#endif

	INIT_S					/* result = first element */

	subs	N, N, #1
	ble	amax_kernel_L999		/* that was the only element */

	asrs	I, N, #2			/* I = N / 4 */
	beq	amax_kernel_S1			/* fewer than four remain */

	.align 5

amax_kernel_S4:

	KERNEL_S1
	KERNEL_S1
	KERNEL_S1
	KERNEL_S1

	subs	I, I, #1
	bne	amax_kernel_S4

amax_kernel_S1:

	ands	I, N, #3			/* I = N % 4 leftover elements */
	beq	amax_kernel_L999

amax_kernel_S10:

	KERNEL_S1

	subs	I, I, #1
	bne	amax_kernel_S10

/* ---- common exit ----------------------------------------------------- */

amax_kernel_L999:
#if !defined(__ARM_PCS_VFP)
	/* Also hand the result back in core registers. */
#if defined(DOUBLE)
	vmov	r0, r1, d0
#else
	vmov	r0, s0
#endif
#endif
	bx	lr

	EPILOGUE
|
||||
|
||||
@@ -91,12 +91,10 @@ IDAMAXKERNEL = iamax.S
|
||||
ICAMAXKERNEL = izamax.S
|
||||
IZAMAXKERNEL = izamax.S
|
||||
|
||||
ifneq ($(OS_DARWIN)$(CROSS),11)
|
||||
SNRM2KERNEL = nrm2.S
|
||||
DNRM2KERNEL = nrm2.S
|
||||
CNRM2KERNEL = znrm2.S
|
||||
ZNRM2KERNEL = znrm2.S
|
||||
endif
|
||||
|
||||
DDOTKERNEL = dot.S
|
||||
SDOTKERNEL = dot.S
|
||||
@@ -104,38 +102,6 @@ CDOTKERNEL = zdot.S
|
||||
ZDOTKERNEL = zdot.S
|
||||
DSDOTKERNEL = dot.S
|
||||
|
||||
ifeq ($(OS_DARWIN)$(CROSS),11)
|
||||
|
||||
STRMMKERNEL = ../generic/trmmkernel_2x2.c
|
||||
DTRMMKERNEL = ../generic/trmmkernel_2x2.c
|
||||
CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
|
||||
ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
|
||||
|
||||
SGEMMKERNEL = ../generic/gemmkernel_2x2.c
|
||||
SGEMMONCOPY = ../generic/gemm_ncopy_2.c
|
||||
SGEMMOTCOPY = ../generic/gemm_tcopy_2.c
|
||||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
DGEMMKERNEL = ../generic/gemmkernel_2x2.c
|
||||
DGEMMONCOPY = ../generic/gemm_ncopy_2.c
|
||||
DGEMMOTCOPY = ../generic/gemm_tcopy_2.c
|
||||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
CGEMMKERNEL = ../generic/zgemmkernel_2x2.c
|
||||
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
||||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c
|
||||
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
||||
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
else
|
||||
SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
|
||||
STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
|
||||
ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
|
||||
@@ -202,5 +168,3 @@ ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
|
||||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
|
||||
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
endif
|
||||
|
||||
@@ -54,37 +54,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#if !defined(DOUBLE)
|
||||
ldr s4, [X], #4
|
||||
fcmp s4, REGZERO
|
||||
beq KERNEL_F1_NEXT_\@
|
||||
beq 2f /* KERNEL_F1_NEXT_\@ */
|
||||
fabs s4, s4
|
||||
fcmp SCALE, s4
|
||||
bge KERNEL_F1_SCALE_GE_X_\@
|
||||
bge 1f /* KERNEL_F1_SCALE_GE_X_\@ */
|
||||
fdiv s2, SCALE, s4
|
||||
fmul s2, s2, s2
|
||||
fmul s3, SSQ, s2
|
||||
fadd SSQ, REGONE, s3
|
||||
fmov SCALE, s4
|
||||
b KERNEL_F1_NEXT_\@
|
||||
KERNEL_F1_SCALE_GE_X_\@:
|
||||
b 2f /* KERNEL_F1_NEXT_\@ */
|
||||
1: /* KERNEL_F1_SCALE_GE_X_\@: */
|
||||
fdiv s2, s4, SCALE
|
||||
fmla SSQ, s2, v2.s[0]
|
||||
#else
|
||||
ldr d4, [X], #8
|
||||
fcmp d4, REGZERO
|
||||
beq KERNEL_F1_NEXT_\@
|
||||
beq 2f /* KERNEL_F1_NEXT_\@ */
|
||||
fabs d4, d4
|
||||
fcmp SCALE, d4
|
||||
bge KERNEL_F1_SCALE_GE_X_\@
|
||||
bge 1f /* KERNEL_F1_SCALE_GE_X_\@ */
|
||||
fdiv d2, SCALE, d4
|
||||
fmul d2, d2, d2
|
||||
fmul d3, SSQ, d2
|
||||
fadd SSQ, REGONE, d3
|
||||
fmov SCALE, d4
|
||||
b KERNEL_F1_NEXT_\@
|
||||
KERNEL_F1_SCALE_GE_X_\@:
|
||||
b 2f /* KERNEL_F1_NEXT_\@ */
|
||||
1: /* KERNEL_F1_SCALE_GE_X_\@: */
|
||||
fdiv d2, d4, SCALE
|
||||
fmla SSQ, d2, v2.d[0]
|
||||
#endif
|
||||
KERNEL_F1_NEXT_\@:
|
||||
2: /* KERNEL_F1_NEXT_\@: */
|
||||
.endm
|
||||
|
||||
.macro KERNEL_S1
|
||||
|
||||
@@ -54,138 +54,138 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#if !defined(DOUBLE)
|
||||
ldr s4, [X], #4
|
||||
fcmp s4, REGZERO
|
||||
beq KERNEL_F1_NEXT_\@
|
||||
beq 2f /* KERNEL_F1_NEXT_\@ */
|
||||
fabs s4, s4
|
||||
fcmp SCALE, s4
|
||||
bge KERNEL_F1_SCALE_GE_XR_\@
|
||||
bge 1f /* KERNEL_F1_SCALE_GE_XR_\@ */
|
||||
fdiv s2, SCALE, s4
|
||||
fmul s2, s2, s2
|
||||
fmul s3, SSQ, s2
|
||||
fadd SSQ, REGONE, s3
|
||||
fmov SCALE, s4
|
||||
b KERNEL_F1_NEXT_\@
|
||||
KERNEL_F1_SCALE_GE_XR_\@:
|
||||
b 2f /* KERNEL_F1_NEXT_\@ */
|
||||
1: /* KERNEL_F1_SCALE_GE_XR_\@: */
|
||||
fdiv s2, s4, SCALE
|
||||
fmla SSQ, s2, v2.s[0]
|
||||
KERNEL_F1_NEXT_\@:
|
||||
2: /* KERNEL_F1_NEXT_\@: */
|
||||
ldr s5, [X], #4
|
||||
fcmp s5, REGZERO
|
||||
beq KERNEL_F1_END_\@
|
||||
beq 4f /* KERNEL_F1_END_\@ */
|
||||
fabs s5, s5
|
||||
fcmp SCALE, s5
|
||||
bge KERNEL_F1_SCALE_GE_XI_\@
|
||||
bge 3f /* KERNEL_F1_SCALE_GE_XI_\@ */
|
||||
fdiv s2, SCALE, s5
|
||||
fmul s2, s2, s2
|
||||
fmul s3, SSQ, s2
|
||||
fadd SSQ, REGONE, s3
|
||||
fmov SCALE, s5
|
||||
b KERNEL_F1_END_\@
|
||||
KERNEL_F1_SCALE_GE_XI_\@:
|
||||
b 4f /* KERNEL_F1_END_\@ */
|
||||
3: /* KERNEL_F1_SCALE_GE_XI_\@: */
|
||||
fdiv s2, s5, SCALE
|
||||
fmla SSQ, s2, v2.s[0]
|
||||
#else
|
||||
ldr d4, [X], #8
|
||||
fcmp d4, REGZERO
|
||||
beq KERNEL_F1_NEXT_\@
|
||||
beq 2f /* KERNEL_F1_NEXT_\@ */
|
||||
fabs d4, d4
|
||||
fcmp SCALE, d4
|
||||
bge KERNEL_F1_SCALE_GE_XR_\@
|
||||
bge 1f /* KERNEL_F1_SCALE_GE_XR_\@ */
|
||||
fdiv d2, SCALE, d4
|
||||
fmul d2, d2, d2
|
||||
fmul d3, SSQ, d2
|
||||
fadd SSQ, REGONE, d3
|
||||
fmov SCALE, d4
|
||||
b KERNEL_F1_NEXT_\@
|
||||
KERNEL_F1_SCALE_GE_XR_\@:
|
||||
b 2f /* KERNEL_F1_NEXT_\@ */
|
||||
1: /* KERNEL_F1_SCALE_GE_XR_\@: */
|
||||
fdiv d2, d4, SCALE
|
||||
fmla SSQ, d2, v2.d[0]
|
||||
KERNEL_F1_NEXT_\@:
|
||||
2: /* KERNEL_F1_NEXT_\@: */
|
||||
ldr d5, [X], #8
|
||||
fcmp d5, REGZERO
|
||||
beq KERNEL_F1_END_\@
|
||||
beq 4f /* KERNEL_F1_END_\@ */
|
||||
fabs d5, d5
|
||||
fcmp SCALE, d5
|
||||
bge KERNEL_F1_SCALE_GE_XI_\@
|
||||
bge 3f /* KERNEL_F1_SCALE_GE_XI_\@ */
|
||||
fdiv d2, SCALE, d5
|
||||
fmul d2, d2, d2
|
||||
fmul d3, SSQ, d2
|
||||
fadd SSQ, REGONE, d3
|
||||
fmov SCALE, d5
|
||||
b KERNEL_F1_END_\@
|
||||
KERNEL_F1_SCALE_GE_XI_\@:
|
||||
b 4f /* KERNEL_F1_END_\@ */
|
||||
3: /* KERNEL_F1_SCALE_GE_XI_\@: */
|
||||
fdiv d2, d5, SCALE
|
||||
fmla SSQ, d2, v2.d[0]
|
||||
#endif
|
||||
KERNEL_F1_END_\@:
|
||||
4: /* KERNEL_F1_END_\@: */
|
||||
.endm
|
||||
|
||||
.macro KERNEL_S1
	/*
	 * Strided step for the complex scaled-sum-of-squares update: folds the
	 * two components of the element at X into (SCALE, SSQ), then advances
	 * X by INC_X.
	 *
	 * For each component v (first at [X], second at [X, #4] in single
	 * precision or [X, #8] in double precision):
	 *
	 *     if v == REGZERO:   skip
	 *     a = |v|
	 *     if SCALE >= a:     SSQ   = SSQ + (a / SCALE)^2
	 *     else:              SSQ   = REGONE + SSQ * (SCALE / a)^2
	 *                        SCALE = a
	 *
	 * The square in the first case comes from fmla with lane 0 of v2,
	 * which is the same register as s2/d2.
	 *
	 * Numeric local labels are used (1: .. 4:), as in KERNEL_F1, so the
	 * macro can be expanded more than once without defining the same
	 * symbol twice.
	 *
	 * Writes s2-s5 (or d2-d5) and the condition flags.
	 */
#if !defined(DOUBLE)
	ldr	s4, [X]
	fcmp	s4, REGZERO
	beq	2f			/* first component is zero */
	fabs	s4, s4
	fcmp	SCALE, s4
	bge	1f			/* SCALE >= |v| */
	fdiv	s2, SCALE, s4
	fmul	s2, s2, s2
	fmul	s3, SSQ, s2
	fadd	SSQ, REGONE, s3
	fmov	SCALE, s4
	b	2f
1:
	fdiv	s2, s4, SCALE
	fmla	SSQ, s2, v2.s[0]
2:
	ldr	s5, [X, #4]
	fcmp	s5, REGZERO
	beq	4f			/* second component is zero */
	fabs	s5, s5
	fcmp	SCALE, s5
	bge	3f			/* SCALE >= |v| */
	fdiv	s2, SCALE, s5
	fmul	s2, s2, s2
	fmul	s3, SSQ, s2
	fadd	SSQ, REGONE, s3
	fmov	SCALE, s5
	b	4f
3:
	fdiv	s2, s5, SCALE
	fmla	SSQ, s2, v2.s[0]
#else
	ldr	d4, [X]
	fcmp	d4, REGZERO
	beq	2f			/* first component is zero */
	fabs	d4, d4
	fcmp	SCALE, d4
	bge	1f			/* SCALE >= |v| */
	fdiv	d2, SCALE, d4
	fmul	d2, d2, d2
	fmul	d3, SSQ, d2
	fadd	SSQ, REGONE, d3
	fmov	SCALE, d4
	b	2f
1:
	fdiv	d2, d4, SCALE
	fmla	SSQ, d2, v2.d[0]
2:
	ldr	d5, [X, #8]
	fcmp	d5, REGZERO
	beq	4f			/* second component is zero */
	fabs	d5, d5
	fcmp	SCALE, d5
	bge	3f			/* SCALE >= |v| */
	fdiv	d2, SCALE, d5
	fmul	d2, d2, d2
	fmul	d3, SSQ, d2
	fadd	SSQ, REGONE, d3
	fmov	SCALE, d5
	b	4f
3:
	fdiv	d2, d5, SCALE
	fmla	SSQ, d2, v2.d[0]
#endif
4:
	add	X, X, INC_X
.endm
|
||||
|
||||
|
||||
@@ -89,14 +89,30 @@ ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
#SMINKERNEL = ../arm/min.c
|
||||
#DMINKERNEL = ../arm/min.c
|
||||
#
|
||||
ifneq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__))
|
||||
ISAMAXKERNEL = isamax_power8.S
|
||||
else
|
||||
ISAMAXKERNEL = isamax.c
|
||||
endif
|
||||
IDAMAXKERNEL = idamax.c
|
||||
ifneq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__))
|
||||
ICAMAXKERNEL = icamax_power8.S
|
||||
else
|
||||
ICAMAXKERNEL = icamax.c
|
||||
endif
|
||||
IZAMAXKERNEL = izamax.c
|
||||
#
|
||||
ifneq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__))
|
||||
ISAMINKERNEL = isamin_power8.S
|
||||
else
|
||||
ISAMINKERNEL = isamin.c
|
||||
endif
|
||||
IDAMINKERNEL = idamin.c
|
||||
ifneq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__))
|
||||
ICAMINKERNEL = icamin_power8.S
|
||||
else
|
||||
ICAMINKERNEL = icamin.c
|
||||
endif
|
||||
IZAMINKERNEL = izamin.c
|
||||
#
|
||||
#ISMAXKERNEL = ../arm/imax.c
|
||||
@@ -112,7 +128,11 @@ ZASUMKERNEL = zasum.c
|
||||
#
|
||||
SAXPYKERNEL = saxpy.c
|
||||
DAXPYKERNEL = daxpy.c
|
||||
ifneq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__))
|
||||
CAXPYKERNEL = caxpy_power8.S
|
||||
else
|
||||
CAXPYKERNEL = caxpy.c
|
||||
endif
|
||||
ZAXPYKERNEL = zaxpy.c
|
||||
#
|
||||
SCOPYKERNEL = scopy.c
|
||||
|
||||
@@ -5,8 +5,8 @@
|
||||
|
||||
STRMMKERNEL = sgemm_kernel_power9.S
|
||||
DTRMMKERNEL = dgemm_kernel_power9.S
|
||||
CTRMMKERNEL = ctrmm_kernel_8x4_power8.S
|
||||
ZTRMMKERNEL = ztrmm_kernel_8x2_power8.S
|
||||
CTRMMKERNEL = cgemm_kernel_power9.S
|
||||
ZTRMMKERNEL = zgemm_kernel_power9.S
|
||||
|
||||
SGEMMKERNEL = sgemm_kernel_power9.S
|
||||
SGEMMINCOPY = ../generic/gemm_ncopy_16.c
|
||||
@@ -28,9 +28,9 @@ DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
CGEMMKERNEL = cgemm_kernel_8x4_power8.S
|
||||
CGEMMKERNEL = cgemm_kernel_power9.S
|
||||
CGEMMINCOPY = ../generic/zgemm_ncopy_8.c
|
||||
CGEMMITCOPY = cgemm_tcopy_8_power8.S
|
||||
CGEMMITCOPY = ../generic/zgemm_tcopy_8.c
|
||||
CGEMMONCOPY = ../generic/zgemm_ncopy_4.c
|
||||
CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c
|
||||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
@@ -38,7 +38,7 @@ CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
ZGEMMKERNEL = zgemm_kernel_8x2_power8.S
|
||||
ZGEMMKERNEL = zgemm_kernel_power9.S
|
||||
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
||||
ZGEMMINCOPY = ../generic/zgemm_ncopy_8.c
|
||||
@@ -89,14 +89,14 @@ ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
#SMINKERNEL = ../arm/min.c
|
||||
#DMINKERNEL = ../arm/min.c
|
||||
#
|
||||
ISAMAXKERNEL = isamax.c
|
||||
ISAMAXKERNEL = isamax_power9.S
|
||||
IDAMAXKERNEL = idamax.c
|
||||
ICAMAXKERNEL = icamax.c
|
||||
ICAMAXKERNEL = icamax_power9.S
|
||||
IZAMAXKERNEL = izamax.c
|
||||
#
|
||||
ISAMINKERNEL = isamin.c
|
||||
ISAMINKERNEL = isamin_power9.S
|
||||
IDAMINKERNEL = idamin.c
|
||||
ICAMINKERNEL = icamin.c
|
||||
ICAMINKERNEL = icamin_power9.S
|
||||
IZAMINKERNEL = izamin.c
|
||||
#
|
||||
#ISMAXKERNEL = ../arm/imax.c
|
||||
@@ -112,7 +112,7 @@ ZASUMKERNEL = zasum.c
|
||||
#
|
||||
SAXPYKERNEL = saxpy.c
|
||||
DAXPYKERNEL = daxpy.c
|
||||
CAXPYKERNEL = caxpy.c
|
||||
CAXPYKERNEL = caxpy_power9.S
|
||||
ZAXPYKERNEL = zaxpy.c
|
||||
#
|
||||
SCOPYKERNEL = scopy.c
|
||||
@@ -123,7 +123,7 @@ ZCOPYKERNEL = zcopy.c
|
||||
SDOTKERNEL = sdot.c
|
||||
DDOTKERNEL = ddot.c
|
||||
DSDOTKERNEL = sdot.c
|
||||
CDOTKERNEL = cdot.c
|
||||
CDOTKERNEL = cdot_power9.S
|
||||
ZDOTKERNEL = zdot.c
|
||||
#
|
||||
SNRM2KERNEL = ../arm/nrm2.c
|
||||
|
||||
@@ -15,13 +15,23 @@ ZASUMKERNEL = zasum_ppc440.S
|
||||
|
||||
SAXPYKERNEL = axpy_ppc440.S
|
||||
DAXPYKERNEL = axpy_ppc440.S
|
||||
ifneq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__))
|
||||
CAXPYKERNEL = ../arm/zaxpy.c
|
||||
ZAXPYKERNEL = ../arm/zaxpy.c
|
||||
else
|
||||
CAXPYKERNEL = zaxpy_ppc440.S
|
||||
ZAXPYKERNEL = zaxpy_ppc440.S
|
||||
endif
|
||||
|
||||
SDOTKERNEL = dot_ppc440.S
|
||||
DDOTKERNEL = dot_ppc440.S
|
||||
ifneq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__))
|
||||
CDOTKERNEL = zdot_ppc440.S
|
||||
ZDOTKERNEL = zdot_ppc440.S
|
||||
else
|
||||
CDOTKERNEL = ../arm/zdot.c
|
||||
ZDOTKERNEL = ../arm/zdot.c
|
||||
endif
|
||||
|
||||
ISAMAXKERNEL = iamax_ppc440.S
|
||||
IDAMAXKERNEL = iamax_ppc440.S
|
||||
@@ -52,8 +62,13 @@ ZNRM2KERNEL = znrm2_ppc440.S
|
||||
|
||||
SROTKERNEL = rot_ppc440.S
|
||||
DROTKERNEL = rot_ppc440.S
|
||||
ifneq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__))
|
||||
CROTKERNEL = zrot_ppc440.S
|
||||
ZROTKERNEL = zrot_ppc440.S
|
||||
else
|
||||
CROTKERNEL = ../arm/zrot.c
|
||||
ZROTKERNEL = ../arm/zrot.c
|
||||
endif
|
||||
|
||||
SSCALKERNEL = scal_ppc440.S
|
||||
DSCALKERNEL = scal_ppc440.S
|
||||
@@ -116,3 +131,15 @@ ZTRSMKERNEL_LN = ztrsm_kernel_ppc440_LN.S
|
||||
ZTRSMKERNEL_LT = ztrsm_kernel_ppc440_LT.S
|
||||
ZTRSMKERNEL_RN = ztrsm_kernel_ppc440_LT.S
|
||||
ZTRSMKERNEL_RT = ztrsm_kernel_ppc440_RT.S
|
||||
|
||||
ifeq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__))
|
||||
SGEMVNKERNEL = ../arm/gemv_n.c
|
||||
DGEMVNKERNEL = ../arm/gemv_n.c
|
||||
SGEMVTKERNEL = ../arm/gemv_t.c
|
||||
DGEMVTKERNEL = ../arm/gemv_t.c
|
||||
CGEMVNKERNEL = ../arm/zgemv_n.c
|
||||
ZGEMVNKERNEL = ../arm/zgemv_n.c
|
||||
CGEMVTKERNEL = ../arm/zgemv_t.c
|
||||
ZGEMVTKERNEL = ../arm/zgemv_t.c
|
||||
endif
|
||||
|
||||
|
||||
@@ -1,3 +1,14 @@
|
||||
ifeq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__))
|
||||
SGEMMKERNEL = gemm_kernel.S
|
||||
SGEMMINCOPY =
|
||||
SGEMMITCOPY =
|
||||
SGEMMONCOPY = ../generic/gemm_ncopy_4.c
|
||||
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
|
||||
SGEMMINCOPYOBJ =
|
||||
SGEMMITCOPYOBJ =
|
||||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
else
|
||||
SGEMMKERNEL = gemm_kernel_altivec.S
|
||||
SGEMMINCOPY = ../generic/gemm_ncopy_16.c
|
||||
SGEMMITCOPY = ../generic/gemm_tcopy_16.c
|
||||
@@ -7,6 +18,8 @@ SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
|
||||
DGEMMKERNEL = gemm_kernel.S
|
||||
DGEMMINCOPY =
|
||||
DGEMMITCOPY =
|
||||
@@ -16,6 +29,18 @@ DGEMMINCOPYOBJ =
|
||||
DGEMMITCOPYOBJ =
|
||||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
ifeq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__))
|
||||
CGEMMKERNEL = zgemm_kernel.S
|
||||
CGEMMINCOPY =
|
||||
CGEMMITCOPY =
|
||||
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
||||
CGEMMINCOPYOBJ =
|
||||
CGEMMITCOPYOBJ =
|
||||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
else
|
||||
CGEMMKERNEL = zgemm_kernel_altivec.S
|
||||
CGEMMINCOPY = ../generic/zgemm_ncopy_8.c
|
||||
CGEMMITCOPY = ../generic/zgemm_tcopy_8.c
|
||||
@@ -25,6 +50,8 @@ CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
|
||||
ZGEMMKERNEL = zgemm_kernel.S
|
||||
ZGEMMINCOPY =
|
||||
ZGEMMITCOPY =
|
||||
@@ -35,22 +62,30 @@ ZGEMMITCOPYOBJ =
|
||||
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
#STRSMKERNEL_LN = trsm_kernel_LN.S
|
||||
#STRSMKERNEL_LT = trsm_kernel_LT.S
|
||||
#STRSMKERNEL_RN = trsm_kernel_LT.S
|
||||
#STRSMKERNEL_RT = trsm_kernel_RT.S
|
||||
|
||||
DTRSMKERNEL_LN = trsm_kernel_LN.S
|
||||
DTRSMKERNEL_LT = trsm_kernel_LT.S
|
||||
DTRSMKERNEL_RN = trsm_kernel_LT.S
|
||||
DTRSMKERNEL_RT = trsm_kernel_RT.S
|
||||
|
||||
#CTRSMKERNEL_LN = ztrsm_kernel_LN.S
|
||||
#CTRSMKERNEL_LT = ztrsm_kernel_LT.S
|
||||
#CTRSMKERNEL_RN = ztrsm_kernel_LT.S
|
||||
#CTRSMKERNEL_RT = ztrsm_kernel_RT.S
|
||||
|
||||
ZTRSMKERNEL_LN = ztrsm_kernel_LN.S
|
||||
ZTRSMKERNEL_LT = ztrsm_kernel_LT.S
|
||||
ZTRSMKERNEL_RN = ztrsm_kernel_LT.S
|
||||
ZTRSMKERNEL_RT = ztrsm_kernel_RT.S
|
||||
|
||||
ifeq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__))
|
||||
STRSMKERNEL_LN = trsm_kernel_LN.S
|
||||
STRSMKERNEL_LT = trsm_kernel_LT.S
|
||||
STRSMKERNEL_RN = trsm_kernel_LT.S
|
||||
STRSMKERNEL_RT = trsm_kernel_RT.S
|
||||
|
||||
CTRSMKERNEL_LN = ztrsm_kernel_LN.S
|
||||
CTRSMKERNEL_LT = ztrsm_kernel_LT.S
|
||||
CTRSMKERNEL_RN = ztrsm_kernel_LT.S
|
||||
CTRSMKERNEL_RT = ztrsm_kernel_RT.S
|
||||
|
||||
|
||||
SROTKERNEL = ../arm/rot.c
|
||||
DROTKERNEL = ../arm/rot.c
|
||||
CROTKERNEL = ../arm/zrot.c
|
||||
ZROTKERNEL = ../arm/zrot.c
|
||||
endif
|
||||
|
||||
@@ -39,7 +39,7 @@
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
||||
#ifdef linux
|
||||
#if defined(linux) || defined(__FreeBSD__)
|
||||
#ifndef __64BIT__
|
||||
#define N r3
|
||||
#define X r6
|
||||
|
||||
@@ -39,7 +39,7 @@
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
||||
#ifdef linux
|
||||
#if defined(linux) || defined(__FreeBSD__)
|
||||
#ifndef __64BIT__
|
||||
#define N r3
|
||||
#define X r6
|
||||
|
||||
@@ -24,12 +24,21 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
|
||||
#ifndef HAVE_ASM_KERNEL
|
||||
#include <altivec.h>
|
||||
|
||||
#define offset_0 0
|
||||
#define offset_1 16
|
||||
#define offset_2 32
|
||||
#define offset_3 48
|
||||
#define offset_4 64
|
||||
#define offset_5 80
|
||||
#define offset_6 96
|
||||
#define offset_7 112
|
||||
|
||||
static const unsigned char __attribute__((aligned(16))) swap_mask_arr[]={ 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11};
|
||||
|
||||
static void caxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i)
|
||||
{
|
||||
|
||||
@@ -43,28 +52,29 @@ static void caxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT
|
||||
register __vector float valpha_i = {alpha_i, alpha_i,alpha_i, alpha_i};
|
||||
#endif
|
||||
|
||||
__vector unsigned char swap_mask = { 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11};
|
||||
register __vector float *vy = (__vector float *) y;
|
||||
register __vector float *vx = (__vector float *) x;
|
||||
__vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr);
|
||||
register __vector float *vptr_y = (__vector float *) y;
|
||||
register __vector float *vptr_x = (__vector float *) x;
|
||||
BLASLONG i=0;
|
||||
for (; i < n/2; i += 8) {
|
||||
for(;i<n/2;i+=8){
|
||||
|
||||
register __vector float vy_0 = vy[i];
|
||||
register __vector float vy_1 = vy[i + 1];
|
||||
register __vector float vy_2 = vy[i + 2];
|
||||
register __vector float vy_3 = vy[i + 3];
|
||||
register __vector float vy_4 = vy[i + 4];
|
||||
register __vector float vy_5 = vy[i + 5];
|
||||
register __vector float vy_6 = vy[i + 6];
|
||||
register __vector float vy_7 = vy[i + 7];
|
||||
register __vector float vx_0 = vx[i];
|
||||
register __vector float vx_1 = vx[i + 1];
|
||||
register __vector float vx_2 = vx[i + 2];
|
||||
register __vector float vx_3 = vx[i + 3];
|
||||
register __vector float vx_4 = vx[i + 4];
|
||||
register __vector float vx_5 = vx[i + 5];
|
||||
register __vector float vx_6 = vx[i + 6];
|
||||
register __vector float vx_7 = vx[i + 7];
|
||||
register __vector float vy_0 = vec_vsx_ld( offset_0 ,vptr_y ) ;
|
||||
register __vector float vy_1 = vec_vsx_ld( offset_1 ,vptr_y ) ;
|
||||
register __vector float vy_2 = vec_vsx_ld( offset_2 ,vptr_y ) ;
|
||||
register __vector float vy_3 = vec_vsx_ld( offset_3 ,vptr_y ) ;
|
||||
register __vector float vy_4 = vec_vsx_ld( offset_4 ,vptr_y ) ;
|
||||
register __vector float vy_5 = vec_vsx_ld( offset_5 ,vptr_y ) ;
|
||||
register __vector float vy_6 = vec_vsx_ld( offset_6 ,vptr_y ) ;
|
||||
register __vector float vy_7 = vec_vsx_ld( offset_7 ,vptr_y ) ;
|
||||
|
||||
register __vector float vx_0 = vec_vsx_ld( offset_0 ,vptr_x ) ;
|
||||
register __vector float vx_1 = vec_vsx_ld( offset_1 ,vptr_x ) ;
|
||||
register __vector float vx_2 = vec_vsx_ld( offset_2 ,vptr_x ) ;
|
||||
register __vector float vx_3 = vec_vsx_ld( offset_3 ,vptr_x ) ;
|
||||
register __vector float vx_4 = vec_vsx_ld( offset_4 ,vptr_x ) ;
|
||||
register __vector float vx_5 = vec_vsx_ld( offset_5 ,vptr_x ) ;
|
||||
register __vector float vx_6 = vec_vsx_ld( offset_6 ,vptr_x ) ;
|
||||
register __vector float vx_7 = vec_vsx_ld( offset_7 ,vptr_x ) ;
|
||||
vy_0 += vx_0*valpha_r;
|
||||
vy_1 += vx_1*valpha_r;
|
||||
vy_2 += vx_2*valpha_r;
|
||||
@@ -89,15 +99,17 @@ static void caxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT
|
||||
vy_5 += vx_5*valpha_i;
|
||||
vy_6 += vx_6*valpha_i;
|
||||
vy_7 += vx_7*valpha_i;
|
||||
vy[i] = vy_0;
|
||||
vy[i + 1] = vy_1;
|
||||
vy[i + 2] = vy_2;
|
||||
vy[i + 3] = vy_3;
|
||||
vy[i + 4] = vy_4;
|
||||
vy[i + 5] = vy_5 ;
|
||||
vy[i + 6] = vy_6 ;
|
||||
vy[i + 7] = vy_7 ;
|
||||
vec_vsx_st( vy_0, offset_0 ,vptr_y ) ;
|
||||
vec_vsx_st( vy_1, offset_1 ,vptr_y ) ;
|
||||
vec_vsx_st( vy_2, offset_2 ,vptr_y ) ;
|
||||
vec_vsx_st( vy_3, offset_3 ,vptr_y ) ;
|
||||
vec_vsx_st( vy_4, offset_4 ,vptr_y ) ;
|
||||
vec_vsx_st( vy_5, offset_5 ,vptr_y ) ;
|
||||
vec_vsx_st( vy_6, offset_6 ,vptr_y ) ;
|
||||
vec_vsx_st( vy_7, offset_7 ,vptr_y ) ;
|
||||
|
||||
vptr_x+=8;
|
||||
vptr_y+=8;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
582
kernel/power/caxpy_power8.S
Normal file
582
kernel/power/caxpy_power8.S
Normal file
@@ -0,0 +1,582 @@
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
/*
|
||||
.file "caxpy.c"
|
||||
.abiversion 2
|
||||
.section ".text"
|
||||
.align 2
|
||||
.p2align 4,,15
|
||||
.globl caxpy_k
|
||||
.type caxpy_k, @function
|
||||
*/
|
||||
|
||||
PROLOGUE
|
||||
|
||||
.LCF0:
|
||||
0: addis 2,12,.TOC.-.LCF0@ha
|
||||
addi 2,2,.TOC.-.LCF0@l
|
||||
#if _CALL_ELF ==2
|
||||
.localentry caxpy_k,.-caxpy_k
|
||||
#endif
|
||||
mr. 7,3
|
||||
ble 0,.L33
|
||||
cmpdi 7,9,1
|
||||
beq 7,.L41
|
||||
.L3:
|
||||
mtctr 7
|
||||
ld 7,96(1)
|
||||
sldi 9,9,3
|
||||
sldi 7,7,3
|
||||
.p2align 4,,15
|
||||
.L14:
|
||||
lfs 10,4(8)
|
||||
lfs 11,0(8)
|
||||
lfs 12,0(10)
|
||||
lfs 0,4(10)
|
||||
fmuls 10,2,10
|
||||
#ifdef CONJ
|
||||
fmadds 11,11,1,10
|
||||
#else
|
||||
fmsubs 11,11,1,10
|
||||
#endif
|
||||
fadds 12,12,11
|
||||
stfs 12,0(10)
|
||||
lfs 11,0(8)
|
||||
lfs 12,4(8)
|
||||
add 8,8,9
|
||||
fmuls 11,2,11
|
||||
#ifdef CONJ
|
||||
fmsubs 12,12,1,11
|
||||
fsubs 0,0,12
|
||||
#else
|
||||
fmadds 12,12,1,11
|
||||
fadds 0,0,12
|
||||
#endif
|
||||
stfs 0,4(10)
|
||||
add 10,10,7
|
||||
bdnz .L14
|
||||
.L33:
|
||||
li 3,0
|
||||
blr
|
||||
.p2align 4,,15
|
||||
.L41:
|
||||
ld 6,96(1)
|
||||
cmpdi 7,6,1
|
||||
bne 7,.L3
|
||||
rldicr. 4,7,0,59
|
||||
std 31,-8(1)
|
||||
li 11,0
|
||||
bne 0,.L42
|
||||
.L4:
|
||||
addi 6,11,8
|
||||
subf 0,4,7
|
||||
sldi 6,6,2
|
||||
addi 9,6,-32
|
||||
add 5,10,6
|
||||
add 3,8,9
|
||||
add 6,8,6
|
||||
subfc 5,5,3
|
||||
add 9,10,9
|
||||
subfe 5,5,5
|
||||
subfc 6,6,9
|
||||
subfe 31,31,31
|
||||
addi 6,5,1
|
||||
addi 5,31,1
|
||||
or 6,6,5
|
||||
rlwinm 6,6,0,0xff
|
||||
cmpwi 7,6,0
|
||||
beq 7,.L7
|
||||
sradi 6,4,63
|
||||
srdi 5,7,63
|
||||
subfc 31,7,4
|
||||
adde 6,5,6
|
||||
subfic 31,0,3
|
||||
subfe 31,31,31
|
||||
xori 6,6,0x1
|
||||
neg 31,31
|
||||
and 6,6,31
|
||||
rlwinm 6,6,0,0xff
|
||||
cmpwi 7,6,0
|
||||
beq 7,.L7
|
||||
cmpd 7,4,7
|
||||
li 6,1
|
||||
blt 7,.L43
|
||||
.L9:
|
||||
addi 0,7,-1
|
||||
subf 0,4,0
|
||||
subfic 0,0,3
|
||||
subfe 31,31,31
|
||||
addi 0,31,1
|
||||
rlwinm 0,0,0,0xff
|
||||
cmpwi 7,0,0
|
||||
bne 7,.L10
|
||||
sradi 0,4,63
|
||||
subfc 31,7,4
|
||||
adde 5,5,0
|
||||
rlwinm 5,5,0,0xff
|
||||
cmpwi 7,5,0
|
||||
bne 7,.L10
|
||||
addi 0,6,-1
|
||||
addis 31,2,.LC3@toc@ha
|
||||
std 30,-16(1)
|
||||
xscvdpspn 12,1
|
||||
xscvdpspn 11,2
|
||||
srdi. 30,0,2
|
||||
addis 6,2,.LC2@toc@ha
|
||||
addi 6,6,.LC2@toc@l
|
||||
mtctr 30
|
||||
addi 31,31,.LC3@toc@l
|
||||
lxvd2x 42,0,6
|
||||
li 5,16
|
||||
li 6,0
|
||||
lxvd2x 41,0,31
|
||||
xxspltw 12,12,0
|
||||
xxspltw 11,11,0
|
||||
xxpermdi 42,42,42,2
|
||||
xxpermdi 41,41,41,2
|
||||
beq 0,.L44
|
||||
.p2align 4,,15
|
||||
.L11:
|
||||
#ifdef CONJ
|
||||
lxvd2x 44,3,6
|
||||
lxvd2x 45,3,5
|
||||
lxvd2x 33,9,6
|
||||
lxvd2x 0,9,5
|
||||
xxpermdi 44,44,44,2
|
||||
xxpermdi 45,45,45,2
|
||||
xxpermdi 32,33,33,2
|
||||
xxpermdi 33,0,0,2
|
||||
vperm 11,13,12,10
|
||||
vperm 13,13,12,9
|
||||
vperm 12,1,0,10
|
||||
vperm 1,1,0,9
|
||||
xvmulsp 0,11,43
|
||||
xvmulsp 32,11,45
|
||||
xvmsubmsp 45,12,0
|
||||
xvmaddasp 32,12,43
|
||||
xvaddsp 44,32,44
|
||||
xvsubsp 32,33,45
|
||||
vmrglw 1,0,12
|
||||
vmrghw 0,0,12
|
||||
#else
|
||||
lxvd2x 45,3,6
|
||||
lxvd2x 33,3,5
|
||||
lxvd2x 43,9,6
|
||||
lxvd2x 0,9,5
|
||||
xxpermdi 45,45,45,2
|
||||
xxpermdi 33,33,33,2
|
||||
xxpermdi 32,43,43,2
|
||||
xxpermdi 43,0,0,2
|
||||
vperm 12,1,13,10
|
||||
vperm 1,1,13,9
|
||||
vperm 13,11,0,10
|
||||
vperm 11,11,0,9
|
||||
xvmulsp 0,11,44
|
||||
xvmulsp 32,11,33
|
||||
xvmaddmsp 33,12,0
|
||||
xvmsubasp 32,12,44
|
||||
xvaddsp 45,32,45
|
||||
xvaddsp 32,33,43
|
||||
vmrglw 1,0,13
|
||||
vmrghw 0,0,13
|
||||
#endif
|
||||
xxpermdi 0,33,33,2
|
||||
xxpermdi 32,32,32,2
|
||||
stxvd2x 0,9,6
|
||||
addi 6,6,32
|
||||
stxvd2x 32,9,5
|
||||
addi 5,5,32
|
||||
bdnz .L11
|
||||
rldicr 0,0,0,61
|
||||
ld 30,-16(1)
|
||||
sldi 9,0,1
|
||||
add 4,4,0
|
||||
add 11,11,9
|
||||
.L10:
|
||||
sldi 6,11,2
|
||||
addi 9,4,1
|
||||
addi 5,6,4
|
||||
cmpd 7,7,9
|
||||
lfsx 12,8,6
|
||||
lfsx 0,10,6
|
||||
addi 9,11,2
|
||||
lfsx 11,8,5
|
||||
fmuls 11,2,11
|
||||
#ifdef CONJ
|
||||
fmadds 12,12,1,11
|
||||
#else
|
||||
fmsubs 12,12,1,11
|
||||
#endif
|
||||
fadds 0,0,12
|
||||
stfsx 0,10,6
|
||||
lfsx 11,8,6
|
||||
lfsx 12,8,5
|
||||
lfsx 0,10,5
|
||||
fmuls 11,2,11
|
||||
#ifdef CONJ
|
||||
fmsubs 12,12,1,11
|
||||
fsubs 0,0,12
|
||||
#else
|
||||
fmadds 12,12,1,11
|
||||
fadds 0,0,12
|
||||
#endif
|
||||
stfsx 0,10,5
|
||||
ble 7,.L39
|
||||
sldi 9,9,2
|
||||
addi 6,4,2
|
||||
addi 5,9,4
|
||||
cmpd 7,7,6
|
||||
lfsx 12,8,9
|
||||
lfsx 0,10,9
|
||||
addi 6,11,4
|
||||
lfsx 11,8,5
|
||||
fmuls 11,2,11
|
||||
#ifdef CONJ
|
||||
fmadds 12,1,12,11
|
||||
#else
|
||||
fmsubs 12,1,12,11
|
||||
#endif
|
||||
fadds 0,0,12
|
||||
stfsx 0,10,9
|
||||
lfsx 11,8,9
|
||||
lfsx 12,8,5
|
||||
lfsx 0,10,5
|
||||
fmuls 11,2,11
|
||||
#ifdef CONJ
|
||||
fmsubs 12,1,12,11
|
||||
fsubs 0,0,12
|
||||
#else
|
||||
fmadds 12,1,12,11
|
||||
fadds 0,0,12
|
||||
#endif
|
||||
stfsx 0,10,5
|
||||
ble 7,.L39
|
||||
sldi 6,6,2
|
||||
addi 4,4,3
|
||||
addi 5,6,4
|
||||
cmpd 7,7,4
|
||||
lfsx 12,8,6
|
||||
lfsx 0,10,6
|
||||
addi 9,11,6
|
||||
lfsx 11,8,5
|
||||
fmuls 11,2,11
|
||||
#ifdef CONJ
|
||||
fmadds 12,1,12,11
|
||||
#else
|
||||
fmsubs 12,1,12,11
|
||||
#endif
|
||||
fadds 0,0,12
|
||||
stfsx 0,10,6
|
||||
lfsx 11,8,6
|
||||
lfsx 12,8,5
|
||||
lfsx 0,10,5
|
||||
fmuls 11,2,11
|
||||
#ifdef CONJ
|
||||
fmsubs 12,1,12,11
|
||||
fsubs 0,0,12
|
||||
#else
|
||||
fmadds 12,1,12,11
|
||||
fadds 0,0,12
|
||||
#endif
|
||||
stfsx 0,10,5
|
||||
ble 7,.L39
|
||||
sldi 9,9,2
|
||||
ld 31,-8(1)
|
||||
addi 7,9,4
|
||||
lfsx 12,8,9
|
||||
lfsx 0,10,9
|
||||
lfsx 11,8,7
|
||||
fmuls 11,2,11
|
||||
#ifdef CONJ
|
||||
fmadds 12,1,12,11
|
||||
#else
|
||||
fmsubs 12,1,12,11
|
||||
#endif
|
||||
fadds 0,0,12
|
||||
stfsx 0,10,9
|
||||
lfsx 11,8,9
|
||||
lfsx 12,8,7
|
||||
lfsx 0,10,7
|
||||
fmuls 2,2,11
|
||||
#ifdef CONJ
|
||||
fmsubs 1,1,12,2
|
||||
fsubs 1,0,1
|
||||
#else
|
||||
fmadds 1,1,12,2
|
||||
fadds 1,0,1
|
||||
#endif
|
||||
stfsx 1,10,7
|
||||
b .L33
|
||||
.L43:
|
||||
mr 6,0
|
||||
b .L9
|
||||
.L7:
|
||||
addi 10,4,1
|
||||
cmpd 7,10,7
|
||||
subf 10,4,7
|
||||
mtctr 10
|
||||
bgt 7,.L26
|
||||
li 10,-1
|
||||
rldicr 10,10,0,0
|
||||
cmpd 7,7,10
|
||||
beq 7,.L26
|
||||
.p2align 4,,15
|
||||
.L13:
|
||||
lfs 10,4(3)
|
||||
lfs 11,0(3)
|
||||
addi 9,9,8
|
||||
addi 3,3,8
|
||||
lfs 12,-8(9)
|
||||
lfs 0,-4(9)
|
||||
fmuls 10,2,10
|
||||
#ifdef CONJ
|
||||
fmadds 11,1,11,10
|
||||
#else
|
||||
fmsubs 11,1,11,10
|
||||
#endif
|
||||
fadds 12,12,11
|
||||
stfs 12,-8(9)
|
||||
lfs 11,-8(3)
|
||||
lfs 12,-4(3)
|
||||
fmuls 11,2,11
|
||||
#ifdef CONJ
|
||||
fmsubs 12,1,12,11
|
||||
fsubs 0,0,12
|
||||
#else
|
||||
fmadds 12,1,12,11
|
||||
fadds 0,0,12
|
||||
#endif
|
||||
stfs 0,-4(9)
|
||||
bdnz .L13
|
||||
.L39:
|
||||
ld 31,-8(1)
|
||||
b .L33
|
||||
.L42:
|
||||
#ifdef CONJ
|
||||
fneg 0,1
|
||||
xxpermdi 32,1,1,0
|
||||
addis 9,2,.LANCHOR0@toc@ha
|
||||
std 28,-32(1)
|
||||
sradi. 28,4,1
|
||||
addi 9,9,.LANCHOR0@toc@l
|
||||
xscvdpspn 5,2
|
||||
xvcvdpsp 32,32
|
||||
lxvd2x 12,0,9
|
||||
xxpermdi 39,0,0,0
|
||||
xxspltw 5,5,0
|
||||
xvcvdpsp 39,39
|
||||
#else
|
||||
fneg 0,2
|
||||
xxpermdi 39,2,2,0
|
||||
addis 9,2,.LANCHOR0@toc@ha
|
||||
std 28,-32(1)
|
||||
sradi. 28,4,1
|
||||
addi 9,9,.LANCHOR0@toc@l
|
||||
xscvdpspn 5,1
|
||||
xvcvdpsp 39,39
|
||||
lxvd2x 12,0,9
|
||||
xxpermdi 32,0,0,0
|
||||
xxspltw 5,5,0
|
||||
xvcvdpsp 32,32
|
||||
#endif
|
||||
xxpermdi 12,12,12,2
|
||||
vmrgew 7,7,0
|
||||
beq 0,.L5
|
||||
xxlnor 38,12,12
|
||||
std 29,-24(1)
|
||||
std 30,-16(1)
|
||||
mr 6,8
|
||||
mr 9,10
|
||||
li 29,0
|
||||
li 30,16
|
||||
li 31,32
|
||||
li 12,48
|
||||
li 0,64
|
||||
li 11,80
|
||||
li 3,96
|
||||
li 5,112
|
||||
.p2align 4,,15
|
||||
.L6:
|
||||
lxvd2x 6,0,9
|
||||
lxvd2x 40,0,6
|
||||
addi 29,29,8
|
||||
lxvd2x 41,6,30
|
||||
lxvd2x 42,6,31
|
||||
cmpd 7,28,29
|
||||
lxvd2x 43,6,12
|
||||
lxvd2x 44,6,0
|
||||
lxvd2x 45,6,11
|
||||
lxvd2x 33,6,3
|
||||
lxvd2x 32,6,5
|
||||
lxvd2x 7,9,30
|
||||
addi 6,6,128
|
||||
lxvd2x 8,9,31
|
||||
lxvd2x 9,9,12
|
||||
xxpermdi 40,40,40,2
|
||||
xxpermdi 6,6,6,2
|
||||
lxvd2x 10,9,0
|
||||
lxvd2x 11,9,11
|
||||
xxpermdi 41,41,41,2
|
||||
xxpermdi 42,42,42,2
|
||||
lxvd2x 12,9,3
|
||||
lxvd2x 0,9,5
|
||||
xxpermdi 43,43,43,2
|
||||
xxpermdi 44,44,44,2
|
||||
xxpermdi 45,45,45,2
|
||||
xxpermdi 33,33,33,2
|
||||
xxpermdi 32,32,32,2
|
||||
xxpermdi 7,7,7,2
|
||||
xxpermdi 8,8,8,2
|
||||
xxpermdi 9,9,9,2
|
||||
xxpermdi 10,10,10,2
|
||||
xxpermdi 11,11,11,2
|
||||
xxpermdi 12,12,12,2
|
||||
xxpermdi 0,0,0,2
|
||||
#ifndef CONJ
|
||||
xvmaddasp 6,5,40
|
||||
xvmaddasp 7,5,41
|
||||
xvmaddasp 8,5,42
|
||||
xvmaddasp 9,5,43
|
||||
xvmaddasp 10,5,44
|
||||
xvmaddasp 11,5,45
|
||||
xvmaddasp 12,5,33
|
||||
xvmaddasp 0,5,32
|
||||
vperm 8,8,8,6
|
||||
vperm 9,9,9,6
|
||||
vperm 10,10,10,6
|
||||
vperm 11,11,11,6
|
||||
vperm 12,12,12,6
|
||||
vperm 13,13,13,6
|
||||
vperm 1,1,1,6
|
||||
vperm 0,0,0,6
|
||||
#endif
|
||||
xvmaddasp 6,39,40
|
||||
xvmaddasp 7,39,41
|
||||
xvmaddasp 8,39,42
|
||||
xvmaddasp 9,39,43
|
||||
xvmaddasp 10,39,44
|
||||
xvmaddasp 11,39,45
|
||||
xvmaddasp 12,39,33
|
||||
xvmaddasp 0,39,32
|
||||
#ifdef CONJ
|
||||
vperm 8,8,8,6
|
||||
vperm 9,9,9,6
|
||||
vperm 10,10,10,6
|
||||
vperm 11,11,11,6
|
||||
vperm 12,12,12,6
|
||||
vperm 13,13,13,6
|
||||
vperm 1,1,1,6
|
||||
vperm 0,0,0,6
|
||||
xvmaddasp 6,5,40
|
||||
xvmaddasp 7,5,41
|
||||
xvmaddasp 8,5,42
|
||||
xvmaddasp 9,5,43
|
||||
xvmaddasp 10,5,44
|
||||
xvmaddasp 11,5,45
|
||||
xvmaddasp 12,5,33
|
||||
xvmaddasp 0,5,32
|
||||
#endif
|
||||
xxpermdi 6,6,6,2
|
||||
xxpermdi 7,7,7,2
|
||||
xxpermdi 8,8,8,2
|
||||
xxpermdi 9,9,9,2
|
||||
stxvd2x 6,0,9
|
||||
xxpermdi 10,10,10,2
|
||||
stxvd2x 7,9,30
|
||||
xxpermdi 11,11,11,2
|
||||
stxvd2x 8,9,31
|
||||
xxpermdi 12,12,12,2
|
||||
stxvd2x 9,9,12
|
||||
xxpermdi 0,0,0,2
|
||||
stxvd2x 10,9,0
|
||||
stxvd2x 11,9,11
|
||||
stxvd2x 12,9,3
|
||||
stxvd2x 0,9,5
|
||||
addi 9,9,128
|
||||
bgt 7,.L6
|
||||
ld 29,-24(1)
|
||||
ld 30,-16(1)
|
||||
.L5:
|
||||
cmpd 7,7,4
|
||||
ble 7,.L36
|
||||
sldi 11,4,1
|
||||
ld 28,-32(1)
|
||||
b .L4
|
||||
.L36:
|
||||
ld 28,-32(1)
|
||||
ld 31,-8(1)
|
||||
b .L33
|
||||
.L44:
|
||||
li 31,1
|
||||
mtctr 31
|
||||
b .L11
|
||||
.L26:
|
||||
li 10,1
|
||||
mtctr 10
|
||||
b .L13
|
||||
.long 0
|
||||
.byte 0,0,0,0,0,4,0,0
|
||||
#if _CALL_ELF ==2
|
||||
.size caxpy_k,.-caxpy_k
|
||||
#endif
|
||||
/*
 * swap_mask_arr: 16-byte permute control vector, 16-byte aligned (.align 4).
 * The byte indices are 4..7, 0..3, 12..15, 8..11, i.e. within each 8-byte
 * half the two 4-byte words change places.
 * .LANCHOR0 is set to this address; the code loads the mask through
 * .LANCHOR0@toc.
 */
.section .rodata
|
||||
.align 4
|
||||
.set .LANCHOR0,. + 0
|
||||
.type swap_mask_arr, @object
|
||||
.size swap_mask_arr, 16
|
||||
swap_mask_arr:
|
||||
.byte 4
|
||||
.byte 5
|
||||
.byte 6
|
||||
.byte 7
|
||||
.byte 0
|
||||
.byte 1
|
||||
.byte 2
|
||||
.byte 3
|
||||
.byte 12
|
||||
.byte 13
|
||||
.byte 14
|
||||
.byte 15
|
||||
.byte 8
|
||||
.byte 9
|
||||
.byte 10
|
||||
.byte 11
|
||||
/*
 * .LC2 / .LC3: two 16-byte vperm control vectors in a mergeable constant
 * section. Indices 0..15 address the first vperm source and 16..31 the second.
 * .LC2 lists bytes 28..31, 20..23, 12..15, 4..7 and .LC3 lists bytes
 * 24..27, 16..19, 8..11, 0..3 (each 4-byte group written high byte first),
 * so each mask gathers every other 4-byte word out of a pair of vectors.
 * NOTE(review): which mask yields the real and which the imaginary floats
 * depends on lane numbering -- confirm before relying on it.
 */
.section .rodata.cst16,"aM",@progbits,16
|
||||
.align 4
|
||||
.LC2:
|
||||
.byte 31
|
||||
.byte 30
|
||||
.byte 29
|
||||
.byte 28
|
||||
.byte 23
|
||||
.byte 22
|
||||
.byte 21
|
||||
.byte 20
|
||||
.byte 15
|
||||
.byte 14
|
||||
.byte 13
|
||||
.byte 12
|
||||
.byte 7
|
||||
.byte 6
|
||||
.byte 5
|
||||
.byte 4
|
||||
.LC3:
|
||||
.byte 27
|
||||
.byte 26
|
||||
.byte 25
|
||||
.byte 24
|
||||
.byte 19
|
||||
.byte 18
|
||||
.byte 17
|
||||
.byte 16
|
||||
.byte 11
|
||||
.byte 10
|
||||
.byte 9
|
||||
.byte 8
|
||||
.byte 3
|
||||
.byte 2
|
||||
.byte 1
|
||||
.byte 0
|
||||
.ident "GCC: (SUSE Linux) 7.3.1 20180323 [gcc-7-branch revision 258812]"
|
||||
.gnu_attribute 4, 1
|
||||
.section .note.GNU-stack,"",@progbits
|
||||
538
kernel/power/caxpy_power9.S
Normal file
538
kernel/power/caxpy_power9.S
Normal file
@@ -0,0 +1,538 @@
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
||||
/*
|
||||
.file "caxpy.c"
|
||||
.abiversion 2
|
||||
.section ".text"
|
||||
.align 2
|
||||
.p2align 4,,15
|
||||
.globl caxpy_k
|
||||
.type caxpy_k, @function
|
||||
*/
|
||||
|
||||
PROLOGUE
|
||||
|
||||
/*
 * caxpy_k entry point, general strided loop and common return.
 * Register use in this block:
 *   r3        element count on entry (copied to r7); return value 0 on exit
 *   r8        pointer that is only read (source)
 *   r10       pointer that is read and written (destination)
 *   r9        source stride, in 8-byte elements
 *   96(r1)    destination stride, in 8-byte elements
 *   f1, f2    the two parts of the scalar multiplier
 * Each element is a pair of single-precision floats at offsets 0 and 4.
 * Per element, writing s = source and d = destination:
 *   without CONJ: d.0 += f1*s.0 - f2*s.4 ;  d.4 += f1*s.4 + f2*s.0
 *   with CONJ:    d.0 += f1*s.0 + f2*s.4 ;  d.4 += f2*s.0 - f1*s.4
 * Presumably r8 = x, r10 = y, r9 = inc_x, 96(r1) = inc_y and (f1, f2) is
 * (alpha_r, alpha_i) -- confirm against the C prototype.
 */
caxpy_k:
|
||||
.LCF0:
|
||||
/* Global entry: r2 = r12 + (.TOC. - .LCF0). */
0: addis 2,12,.TOC.-.LCF0@ha
|
||||
addi 2,2,.TOC.-.LCF0@l
|
||||
.localentry caxpy_k,.-caxpy_k
|
||||
/* r7 = count; a count <= 0 goes straight to the return at .L33. */
mr. 7,3
|
||||
ble 0,.L33
|
||||
/* r9 == 1: let .L37 test the other stride before choosing a path. */
cmpdi 7,9,1
|
||||
beq 7,.L37
|
||||
/* .L3: strided path, one element per iteration, CTR = count. */
.L3:
|
||||
mtctr 7
|
||||
ld 7,96(1)
|
||||
/* Convert both strides from elements to bytes (8 bytes per element). */
sldi 9,9,3
|
||||
sldi 7,7,3
|
||||
.p2align 4,,15
|
||||
.L14:
|
||||
/* f10 = s.4, f11 = s.0, f12 = d.0, f0 = d.4 */
lfs 10,4(8)
|
||||
lfs 11,0(8)
|
||||
lfs 12,0(10)
|
||||
lfs 0,4(10)
|
||||
fmuls 10,2,10
|
||||
#ifdef CONJ
|
||||
fmadds 11,11,1,10
|
||||
#else
|
||||
fmsubs 11,11,1,10
|
||||
#endif
|
||||
fadds 12,12,11
|
||||
stfs 12,0(10)
|
||||
/* The source element is loaded again here, after the store above. */
lfs 11,0(8)
|
||||
lfs 12,4(8)
|
||||
add 8,8,9
|
||||
fmuls 11,2,11
|
||||
#ifdef CONJ
|
||||
fmsubs 12,12,1,11
|
||||
fsubs 0,0,12
|
||||
#else
|
||||
fmadds 12,12,1,11
|
||||
fadds 0,0,12
|
||||
#endif
|
||||
stfs 0,4(10)
|
||||
add 10,10,7
|
||||
bdnz .L14
|
||||
/* .L33: common exit, returns 0 in r3. */
.L33:
|
||||
li 3,0
|
||||
blr
|
||||
.p2align 4,,15
|
||||
/*
 * .L37: reached when r9 == 1. Unless the doubleword at 96(r1) is also 1
 * the strided loop at .L3 is used.
 */
.L37:
|
||||
ld 6,96(1)
|
||||
cmpdi 7,6,1
|
||||
bne 7,.L3
|
||||
/*
 * r4 = r7 with its low four bits cleared (count rounded down to a multiple
 * of 16), r11 = 0. A non-zero r4 is processed by the vector loop at .L38.
 */
rldicr. 4,7,0,59
|
||||
li 11,0
|
||||
bne 0,.L38
|
||||
/*
 * .L4: process elements r4 .. r7-1; r11 is the starting offset in floats.
 * Computes r0 = r7 - r4, r3 = r8 + 4*r11 and r9 = r10 + 4*r11.
 */
.L4:
|
||||
addi 6,11,8
|
||||
subf 0,4,7
|
||||
sldi 6,6,2
|
||||
addi 9,6,-32
|
||||
add 5,10,6
|
||||
add 6,8,6
|
||||
add 3,8,9
|
||||
add 9,10,9
|
||||
/*
 * Carry-based unsigned compares: r6 = (r3 >= r9 + 32), r5 = (r9 >= r3 + 32).
 * If neither holds the two 32-byte windows overlap and the scalar loop at
 * .L7 is used instead of the vector code.
 */
subfc 5,5,3
|
||||
subfe 5,5,5
|
||||
subfc 6,6,9
|
||||
subfe 12,12,12
|
||||
addi 6,5,1
|
||||
addi 5,12,1
|
||||
or 6,6,5
|
||||
rlwinm 6,6,0,0xff
|
||||
cmpwi 7,6,0
|
||||
beq 7,.L7
|
||||
/*
 * Second guard: continue only if r4 < r7 (signed) and r0 > 4 (unsigned),
 * otherwise fall back to .L7. r5 is left holding r7 >> 63, which the code at
 * .L9 feeds into its own adde.
 */
sradi 6,4,63
|
||||
srdi 5,7,63
|
||||
subfc 12,7,4
|
||||
adde 6,5,6
|
||||
subfic 12,0,4
|
||||
subfe 12,12,12
|
||||
xori 6,6,0x1
|
||||
neg 12,12
|
||||
and 6,6,12
|
||||
rlwinm 6,6,0,0xff
|
||||
cmpwi 7,6,0
|
||||
beq 7,.L7
|
||||
/* r6 = 1 by default; when r4 < r7, .L39 replaces it with r0 (= r7 - r4). */
cmpd 7,4,7
|
||||
li 6,1
|
||||
blt 7,.L39
|
||||
/*
 * .L9: decide between the 4-element vector loop .L11 and the unrolled scalar
 * tail .L10. On entry r6 holds the remaining element count.
 */
.L9:
|
||||
/* (r7 - 1 - r4) <= 3 (unsigned): too few elements, go to .L10. */
addi 0,7,-1
|
||||
subf 0,4,0
|
||||
subfic 0,0,3
|
||||
subfe 12,12,12
|
||||
addi 0,12,1
|
||||
rlwinm 0,0,0,0xff
|
||||
cmpwi 7,0,0
|
||||
bne 7,.L10
|
||||
/* Uses r5 (= r7 >> 63 from .L4); a non-zero result also goes to .L10. */
sradi 0,4,63
|
||||
subfc 12,7,4
|
||||
adde 5,5,0
|
||||
rlwinm 5,5,0,0xff
|
||||
cmpwi 7,5,0
|
||||
bne 7,.L10
|
||||
/*
 * Vector setup:
 *   vs0, vs12  = f1 and f2 converted to single precision and splatted
 *   v9 (vs41)  = .LC3 mask, v10 (vs42) = .LC2 mask
 *   r6 = 0, r5 = 16 : byte offsets of the two vectors handled per iteration
 *   CTR = (r6 - 1) >> 2; if that is zero .L40 forces CTR to 1
 * r31 is saved at -8(r1) and reloaded after the loop.
 */
xscvdpspn 0,1
|
||||
xscvdpspn 12,2
|
||||
addi 0,6,-1
|
||||
std 31,-8(1)
|
||||
addis 12,2,.LC2@toc@ha
|
||||
addis 6,2,.LC3@toc@ha
|
||||
li 5,16
|
||||
srdi. 31,0,2
|
||||
addi 6,6,.LC3@toc@l
|
||||
addi 12,12,.LC2@toc@l
|
||||
mtctr 31
|
||||
lxv 41,0(6)
|
||||
lxv 42,0(12)
|
||||
li 6,0
|
||||
xxspltw 0,0,0
|
||||
xxspltw 12,12,0
|
||||
beq 0,.L40
|
||||
.p2align 4,,15
|
||||
/*
 * .L11: 32 bytes (four 8-byte elements) per iteration. Two vectors are loaded
 * from r3 and two from r9; vperm with the v10/v9 masks splits each pair into
 * two vectors of alternating words, the multiply-adds with vs0/vs12 are
 * applied, vmrglw/vmrghw interleave the two result vectors again, and both
 * are stored back through r9.
 */
.L11:
|
||||
#ifdef CONJ
|
||||
lxvx 33,3,5
|
||||
lxvx 44,3,6
|
||||
lxvx 43,9,6
|
||||
lxvx 32,9,5
|
||||
vperm 13,1,12,10
|
||||
vperm 12,1,12,9
|
||||
vperm 8,0,11,10
|
||||
vperm 0,0,11,9
|
||||
xvmulsp 33,12,44
|
||||
xvmulsp 11,12,45
|
||||
xvmaddasp 33,0,45
|
||||
xvmsubmsp 44,0,11
|
||||
xvaddsp 33,33,40
|
||||
xvsubsp 32,32,44
|
||||
#else
|
||||
lxvx 33,3,6
|
||||
lxvx 32,3,5
|
||||
lxvx 43,9,6
|
||||
lxvx 44,9,5
|
||||
vperm 13,0,1,10
|
||||
vperm 0,0,1,9
|
||||
vperm 8,12,11,10
|
||||
vperm 12,12,11,9
|
||||
xvmulsp 33,12,32
|
||||
xvmulsp 11,12,45
|
||||
xvmsubasp 33,0,45
|
||||
xvmaddmsp 32,0,11
|
||||
xvaddsp 33,33,40
|
||||
xvaddsp 32,32,44
|
||||
#endif
|
||||
vmrglw 13,0,1
|
||||
vmrghw 0,0,1
|
||||
stxvx 45,9,6
|
||||
stxvx 32,9,5
|
||||
addi 6,6,32
|
||||
addi 5,5,32
|
||||
bdnz .L11
|
||||
/*
 * After the loop: r0 is rounded down to a multiple of 4, r4 advances by r0
 * elements and r11 by 2*r0 floats; execution then falls into .L10.
 */
rldicr 0,0,0,61
|
||||
ld 31,-8(1)
|
||||
sldi 9,0,1
|
||||
add 4,4,0
|
||||
add 11,11,9
|
||||
/*
 * .L10: scalar tail, unrolled for at most four elements starting at float
 * offset r11 (element index r4). Each step does the same update as the
 * scalar loops, using indexed loads/stores from r8 (read) and r10
 * (read/write), then returns through .L33 as soon as r7 <= index of the
 * next element.
 */
.L10:
|
||||
/* Element 0: byte offsets r5 = 4*r11 and r3 = r5 + 4; compare r7 with r4 + 1. */
sldi 5,11,2
|
||||
addi 6,4,1
|
||||
addi 9,11,2
|
||||
addi 3,5,4
|
||||
lfsx 12,8,5
|
||||
cmpd 7,7,6
|
||||
lfsx 0,10,5
|
||||
lfsx 11,8,3
|
||||
fmuls 11,2,11
|
||||
#ifdef CONJ
|
||||
fmadds 12,12,1,11
|
||||
#else
|
||||
fmsubs 12,12,1,11
|
||||
#endif
|
||||
fadds 0,0,12
|
||||
stfsx 0,10,5
|
||||
lfsx 11,8,5
|
||||
lfsx 12,8,3
|
||||
lfsx 0,10,3
|
||||
fmuls 11,2,11
|
||||
#ifdef CONJ
|
||||
fmsubs 12,12,1,11
|
||||
fsubs 0,0,12
|
||||
#else
|
||||
fmadds 12,12,1,11
|
||||
fadds 0,0,12
|
||||
#endif
|
||||
stfsx 0,10,3
|
||||
ble 7,.L33
|
||||
/* Element 1: float offset r11 + 2; compare r7 with r4 + 2. */
sldi 9,9,2
|
||||
addi 5,4,2
|
||||
addi 6,11,4
|
||||
addi 3,9,4
|
||||
lfsx 12,8,9
|
||||
cmpd 7,7,5
|
||||
lfsx 0,10,9
|
||||
lfsx 11,8,3
|
||||
fmuls 11,2,11
|
||||
#ifdef CONJ
|
||||
fmadds 12,1,12,11
|
||||
#else
|
||||
fmsubs 12,1,12,11
|
||||
#endif
|
||||
fadds 0,0,12
|
||||
stfsx 0,10,9
|
||||
lfsx 11,8,9
|
||||
lfsx 12,8,3
|
||||
lfsx 0,10,3
|
||||
fmuls 11,2,11
|
||||
#ifdef CONJ
|
||||
fmsubs 12,1,12,11
|
||||
fsubs 0,0,12
|
||||
#else
|
||||
fmadds 12,1,12,11
|
||||
fadds 0,0,12
|
||||
#endif
|
||||
stfsx 0,10,3
|
||||
ble 7,.L33
|
||||
/* Element 2: float offset r11 + 4; compare r7 with r4 + 3. */
sldi 6,6,2
|
||||
addi 4,4,3
|
||||
addi 9,11,6
|
||||
addi 5,6,4
|
||||
lfsx 12,8,6
|
||||
cmpd 7,7,4
|
||||
lfsx 0,10,6
|
||||
lfsx 11,8,5
|
||||
fmuls 11,2,11
|
||||
#ifdef CONJ
|
||||
fmadds 12,1,12,11
|
||||
#else
|
||||
fmsubs 12,1,12,11
|
||||
#endif
|
||||
fadds 0,0,12
|
||||
stfsx 0,10,6
|
||||
lfsx 11,8,6
|
||||
lfsx 12,8,5
|
||||
lfsx 0,10,5
|
||||
fmuls 11,2,11
|
||||
#ifdef CONJ
|
||||
fmsubs 12,1,12,11
|
||||
fsubs 0,0,12
|
||||
#else
|
||||
fmadds 12,1,12,11
|
||||
fadds 0,0,12
|
||||
#endif
|
||||
stfsx 0,10,5
|
||||
ble 7,.L33
|
||||
/*
 * Element 3 (last): float offset r11 + 6. r7 is reused as a byte offset and
 * f1/f2 are overwritten as temporaries, which is fine because the function
 * returns right after.
 */
sldi 9,9,2
|
||||
addi 7,9,4
|
||||
lfsx 12,8,9
|
||||
lfsx 0,10,9
|
||||
lfsx 11,8,7
|
||||
fmuls 11,2,11
|
||||
#ifdef CONJ
|
||||
fmadds 12,1,12,11
|
||||
#else
|
||||
fmsubs 12,1,12,11
|
||||
#endif
|
||||
fadds 0,0,12
|
||||
stfsx 0,10,9
|
||||
lfsx 11,8,9
|
||||
lfsx 12,8,7
|
||||
lfsx 0,10,7
|
||||
fmuls 2,2,11
|
||||
#ifdef CONJ
|
||||
fmsubs 1,1,12,2
|
||||
fsubs 1,0,1
|
||||
#else
|
||||
fmadds 1,1,12,2
|
||||
fadds 1,0,1
|
||||
#endif
|
||||
stfsx 1,10,7
|
||||
b .L33
|
||||
/* .L39: r6 = r0 (remaining count computed in .L4), then continue at .L9. */
.L39:
|
||||
mr 6,0
|
||||
b .L9
|
||||
/*
 * .L38: vector path for the first r4 elements (r4 is a non-zero multiple of
 * 16 when this label is reached). Builds the constant multipliers from f1/f2
 * and loads the swap_mask_arr permute control (via .LANCHOR0) into v1 (vs33).
 * r3 = r4 >> 1 is the loop bound.
 * For every 16-byte vector the loop computes
 *     dst = dst + A*src + B*swap(src)
 * where swap(src) is vpermr of src with the mask in v1 (given the mask bytes,
 * this exchanges the two floats of each pair), and
 *   without CONJ: A = vs0 (f1 splatted), B = vs32 (vmrgew of f2 and -f2)
 *   with CONJ:    A = vs32 (vmrgew of -f1 and f1), B = vs12 (f2 splatted)
 */
.L38:
|
||||
#ifdef CONJ
|
||||
fneg 0,1
|
||||
xxpermdi 45,1,1,0
|
||||
xscvdpspn 12,2
|
||||
addis 9,2,.LANCHOR0@toc@ha
|
||||
sradi. 3,4,1
|
||||
xxpermdi 44,0,0,0
|
||||
addi 9,9,.LANCHOR0@toc@l
|
||||
xvcvdpsp 45,45
|
||||
lxv 33,0(9)
|
||||
xvcvdpsp 32,44
|
||||
xxspltw 12,12,0
|
||||
#else
|
||||
fneg 12,2
|
||||
xxpermdi 32,2,2,0
|
||||
xscvdpspn 0,1
|
||||
addis 9,2,.LANCHOR0@toc@ha
|
||||
sradi. 3,4,1
|
||||
xxpermdi 45,12,12,0
|
||||
addi 9,9,.LANCHOR0@toc@l
|
||||
xvcvdpsp 32,32
|
||||
lxv 33,0(9)
|
||||
xvcvdpsp 45,45
|
||||
xxspltw 0,0,0
|
||||
#endif
|
||||
vmrgew 0,0,13
|
||||
/* cr0 comes from the sradi. above: skip the loop when r3 == 0. */
beq 0,.L5
|
||||
/* r6 = read pointer, r9 = read/write pointer, r5 = loop counter. */
mr 6,8
|
||||
mr 9,10
|
||||
li 5,0
|
||||
.p2align 4,,15
|
||||
/*
 * .L6: each iteration loads eight 16-byte vectors through r6 and eight
 * through r9, stores eight back through r9, advances both pointers by 128
 * bytes and r5 by 8, and repeats while r3 > r5. vs11 (and vs0 or vs12,
 * depending on CONJ) are reused as temporaries for the r9 loads.
 */
.L6:
|
||||
lxv 38,16(6)
|
||||
lxv 11,16(9)
|
||||
addi 5,5,8
|
||||
addi 6,6,128
|
||||
addi 9,9,128
|
||||
lxv 39,-96(6)
|
||||
lxv 40,-80(6)
|
||||
lxv 41,-64(6)
|
||||
lxv 42,-48(6)
|
||||
cmpd 7,3,5
|
||||
lxv 43,-32(6)
|
||||
lxv 45,-128(6)
|
||||
lxv 44,-16(6)
|
||||
#ifdef CONJ
|
||||
lxv 0,-128(9)
|
||||
vpermr 17,6,6,1
|
||||
xvmaddmsp 38,32,11
|
||||
lxv 11,-96(9)
|
||||
vpermr 18,7,7,1
|
||||
vpermr 19,8,8,1
|
||||
vpermr 2,9,9,1
|
||||
vpermr 3,10,10,1
|
||||
vpermr 4,11,11,1
|
||||
xvmaddasp 0,32,45
|
||||
vpermr 5,12,12,1
|
||||
xvmaddmsp 39,32,11
|
||||
lxv 11,-80(9)
|
||||
vpermr 13,13,13,1
|
||||
xvmaddasp 38,12,49
|
||||
xvmaddmsp 40,32,11
|
||||
lxv 11,-64(9)
|
||||
xvmaddmsp 45,12,0
|
||||
xvmaddasp 39,12,50
|
||||
stxv 38,-112(9)
|
||||
xvmaddmsp 41,32,11
|
||||
lxv 11,-48(9)
|
||||
xvmaddasp 40,12,51
|
||||
stxv 45,-128(9)
|
||||
stxv 39,-96(9)
|
||||
xvmaddmsp 42,32,11
|
||||
lxv 11,-32(9)
|
||||
xvmaddasp 41,12,34
|
||||
stxv 40,-80(9)
|
||||
xvmaddmsp 43,32,11
|
||||
lxv 11,-16(9)
|
||||
xvmaddasp 42,12,35
|
||||
stxv 41,-64(9)
|
||||
xvmaddmsp 44,32,11
|
||||
xvmaddasp 43,12,36
|
||||
stxv 42,-48(9)
|
||||
xvmaddasp 44,12,37
|
||||
#else
|
||||
lxv 12,-128(9)
|
||||
vpermr 17,6,6,1
|
||||
xvmaddmsp 38,0,11
|
||||
lxv 11,-96(9)
|
||||
vpermr 18,7,7,1
|
||||
vpermr 19,8,8,1
|
||||
vpermr 2,9,9,1
|
||||
vpermr 3,10,10,1
|
||||
vpermr 4,11,11,1
|
||||
xvmaddasp 12,0,45
|
||||
vpermr 5,12,12,1
|
||||
xvmaddmsp 39,0,11
|
||||
lxv 11,-80(9)
|
||||
vpermr 13,13,13,1
|
||||
xvmaddasp 38,32,49
|
||||
xvmaddmsp 40,0,11
|
||||
lxv 11,-64(9)
|
||||
xvmaddmsp 45,32,12
|
||||
xvmaddasp 39,32,50
|
||||
stxv 38,-112(9)
|
||||
xvmaddmsp 41,0,11
|
||||
lxv 11,-48(9)
|
||||
xvmaddasp 40,32,51
|
||||
stxv 45,-128(9)
|
||||
stxv 39,-96(9)
|
||||
xvmaddmsp 42,0,11
|
||||
lxv 11,-32(9)
|
||||
xvmaddasp 41,32,34
|
||||
stxv 40,-80(9)
|
||||
xvmaddmsp 43,0,11
|
||||
lxv 11,-16(9)
|
||||
xvmaddasp 42,32,35
|
||||
stxv 41,-64(9)
|
||||
xvmaddmsp 44,0,11
|
||||
xvmaddasp 43,32,36
|
||||
stxv 42,-48(9)
|
||||
xvmaddasp 44,32,37
|
||||
#endif
|
||||
stxv 43,-32(9)
|
||||
stxv 44,-16(9)
|
||||
bgt 7,.L6
|
||||
/*
 * .L5: return if r7 <= r4 (nothing left); otherwise r11 = 2*r4 (float
 * offset of the first unprocessed element) and the remainder goes to .L4.
 */
.L5:
|
||||
cmpd 7,7,4
|
||||
ble 7,.L33
|
||||
sldi 11,4,1
|
||||
b .L4
|
||||
/*
 * .L7: contiguous scalar fallback. r3 and r9 are the read and read/write
 * pointers prepared at .L4. CTR = r7 - r4; if r4 + 1 > r7, or r7 equals the
 * value with only the top bit set (built by li -1 / rldicr), .L26 forces
 * CTR to 1 instead. r10 and r8 are overwritten as scratch here.
 */
.L7:
|
||||
addi 10,4,1
|
||||
subf 8,4,7
|
||||
cmpd 7,10,7
|
||||
mtctr 8
|
||||
bgt 7,.L26
|
||||
li 10,-1
|
||||
rldicr 10,10,0,0
|
||||
cmpd 7,7,10
|
||||
beq 7,.L26
|
||||
.p2align 4,,15
|
||||
/*
 * .L13: one 8-byte element per iteration, same arithmetic as the strided
 * loop at .L14 but with both pointers advanced by 8 bytes. The pointers are
 * bumped before the stores, hence the negative displacements.
 */
.L13:
|
||||
lfs 10,4(3)
|
||||
lfs 11,0(3)
|
||||
lfs 12,0(9)
|
||||
lfs 0,4(9)
|
||||
addi 3,3,8
|
||||
addi 9,9,8
|
||||
fmuls 10,2,10
|
||||
#ifdef CONJ
|
||||
fmadds 11,1,11,10
|
||||
#else
|
||||
fmsubs 11,1,11,10
|
||||
#endif
|
||||
fadds 12,12,11
|
||||
stfs 12,-8(9)
|
||||
lfs 11,-8(3)
|
||||
lfs 12,-4(3)
|
||||
fmuls 11,2,11
|
||||
#ifdef CONJ
|
||||
fmsubs 12,1,12,11
|
||||
fsubs 0,0,12
|
||||
#else
|
||||
fmadds 12,1,12,11
|
||||
fadds 0,0,12
|
||||
#endif
|
||||
stfs 0,-4(9)
|
||||
bdnz .L13
|
||||
b .L33
|
||||
/* .L40: CTR = 1, then enter the vector loop .L11 (taken when its count was 0). */
.L40:
|
||||
li 31,1
|
||||
mtctr 31
|
||||
b .L11
|
||||
/* .L26: CTR = 1, then enter the scalar loop .L13. */
.L26:
|
||||
li 10,1
|
||||
mtctr 10
|
||||
b .L13
|
||||
/* NOTE(review): presumably the compiler-emitted traceback table -- confirm. */
.long 0
|
||||
.byte 0,0,0,0,0,1,0,0
|
||||
.size caxpy_k,.-caxpy_k
|
||||
/*
 * swap_mask_arr: 16-byte permute control vector, 16-byte aligned (.align 4).
 * The byte indices are 4..7, 0..3, 12..15, 8..11, i.e. within each 8-byte
 * half the two 4-byte words change places.
 * .LANCHOR0 is set to this address; caxpy_k loads the mask through
 * .LANCHOR0@toc and uses it as the vpermr control in its .L6 loop.
 */
.section .rodata
|
||||
.align 4
|
||||
.set .LANCHOR0,. + 0
|
||||
.type swap_mask_arr, @object
|
||||
.size swap_mask_arr, 16
|
||||
swap_mask_arr:
|
||||
.byte 4
|
||||
.byte 5
|
||||
.byte 6
|
||||
.byte 7
|
||||
.byte 0
|
||||
.byte 1
|
||||
.byte 2
|
||||
.byte 3
|
||||
.byte 12
|
||||
.byte 13
|
||||
.byte 14
|
||||
.byte 15
|
||||
.byte 8
|
||||
.byte 9
|
||||
.byte 10
|
||||
.byte 11
|
||||
/*
 * .LC2 / .LC3: two 16-byte vperm control vectors in a mergeable constant
 * section; caxpy_k loads .LC2 into vs42 (v10) and .LC3 into vs41 (v9).
 * Indices 0..15 address the first vperm source and 16..31 the second.
 * .LC2 lists bytes 28..31, 20..23, 12..15, 4..7 and .LC3 lists bytes
 * 24..27, 16..19, 8..11, 0..3 (each 4-byte group written high byte first),
 * so each mask gathers every other 4-byte word out of a pair of vectors.
 * NOTE(review): which mask yields the real and which the imaginary floats
 * depends on lane numbering -- confirm before relying on it.
 */
.section .rodata.cst16,"aM",@progbits,16
|
||||
.align 4
|
||||
.LC2:
|
||||
.byte 31
|
||||
.byte 30
|
||||
.byte 29
|
||||
.byte 28
|
||||
.byte 23
|
||||
.byte 22
|
||||
.byte 21
|
||||
.byte 20
|
||||
.byte 15
|
||||
.byte 14
|
||||
.byte 13
|
||||
.byte 12
|
||||
.byte 7
|
||||
.byte 6
|
||||
.byte 5
|
||||
.byte 4
|
||||
.LC3:
|
||||
.byte 27
|
||||
.byte 26
|
||||
.byte 25
|
||||
.byte 24
|
||||
.byte 19
|
||||
.byte 18
|
||||
.byte 17
|
||||
.byte 16
|
||||
.byte 11
|
||||
.byte 10
|
||||
.byte 9
|
||||
.byte 8
|
||||
.byte 3
|
||||
.byte 2
|
||||
.byte 1
|
||||
.byte 0
|
||||
.ident "GCC: (SUSE Linux) 7.3.1 20180323 [gcc-7-branch revision 258812]"
|
||||
.gnu_attribute 4, 1
|
||||
.section .note.GNU-stack,"",@progbits
|
||||
@@ -25,15 +25,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#ifndef HAVE_KERNEL_8
|
||||
#include <altivec.h>
|
||||
|
||||
#define offset_0 0
|
||||
#define offset_1 16
|
||||
#define offset_2 32
|
||||
#define offset_3 48
|
||||
|
||||
|
||||
|
||||
static const unsigned char __attribute__((aligned(16))) swap_mask_arr[]={ 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11};
|
||||
static void cdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, float *dot)
|
||||
{
|
||||
__vector unsigned char swap_mask = { 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11};
|
||||
register __vector float *vy = (__vector float *) y;
|
||||
register __vector float *vx = (__vector float *) x;
|
||||
BLASLONG i = 0;
|
||||
__vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr);
|
||||
register __vector float *vptr_y = (__vector float *) y;
|
||||
register __vector float *vptr_x = (__vector float *) x;
|
||||
register __vector float vd_0 = { 0 };
|
||||
register __vector float vd_1 = { 0 };
|
||||
register __vector float vd_2 = { 0 };
|
||||
@@ -41,26 +48,23 @@ static void cdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, float *dot)
|
||||
register __vector float vdd_0 = { 0 };
|
||||
register __vector float vdd_1 = { 0 };
|
||||
register __vector float vdd_2 = { 0 };
|
||||
register __vector float vdd_3 = { 0 };
|
||||
for (; i < n/2; i += 4) {
|
||||
register __vector float vdd_3 = { 0 };
|
||||
BLASLONG i=0;
|
||||
for(;i<n/2;i+=4){
|
||||
|
||||
register __vector float vyy_0 ;
|
||||
register __vector float vyy_1 ;
|
||||
register __vector float vyy_2 ;
|
||||
register __vector float vyy_3 ;
|
||||
register __vector float vy_0 = vec_vsx_ld( offset_0 ,vptr_y ) ;
|
||||
register __vector float vy_1 = vec_vsx_ld( offset_1 ,vptr_y ) ;
|
||||
register __vector float vy_2 = vec_vsx_ld( offset_2 ,vptr_y ) ;
|
||||
register __vector float vy_3 = vec_vsx_ld( offset_3 ,vptr_y ) ;
|
||||
|
||||
register __vector float vy_0 = vy[i];
|
||||
register __vector float vy_1 = vy[i + 1];
|
||||
register __vector float vy_2 = vy[i + 2];
|
||||
register __vector float vy_3 = vy[i + 3];
|
||||
register __vector float vx_0= vx[i];
|
||||
register __vector float vx_1 = vx[i + 1];
|
||||
register __vector float vx_2 = vx[i + 2];
|
||||
register __vector float vx_3 = vx[i + 3];
|
||||
vyy_0 = vec_perm(vy_0, vy_0, swap_mask);
|
||||
vyy_1 = vec_perm(vy_1, vy_1, swap_mask);
|
||||
vyy_2 = vec_perm(vy_2, vy_2, swap_mask);
|
||||
vyy_3 = vec_perm(vy_3, vy_3, swap_mask);
|
||||
register __vector float vx_0 = vec_vsx_ld( offset_0 ,vptr_x ) ;
|
||||
register __vector float vx_1 = vec_vsx_ld( offset_1 ,vptr_x ) ;
|
||||
register __vector float vyy_0 = vec_perm(vy_0, vy_0, swap_mask);
|
||||
register __vector float vyy_1 = vec_perm(vy_1, vy_1, swap_mask);
|
||||
register __vector float vx_2 = vec_vsx_ld( offset_2 ,vptr_x ) ;
|
||||
register __vector float vx_3 = vec_vsx_ld( offset_3 ,vptr_x ) ;
|
||||
register __vector float vyy_2 = vec_perm(vy_2, vy_2, swap_mask);
|
||||
register __vector float vyy_3 = vec_perm(vy_3, vy_3, swap_mask);
|
||||
|
||||
vd_0 += vx_0 * vy_0;
|
||||
vd_1 += vx_1 * vy_1;
|
||||
@@ -72,6 +76,8 @@ static void cdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, float *dot)
|
||||
vdd_2 += vx_2 * vyy_2;
|
||||
vdd_3 += vx_3 * vyy_3;
|
||||
|
||||
vptr_x+=4;
|
||||
vptr_y+=4;
|
||||
|
||||
}
|
||||
//aggregate
|
||||
@@ -96,7 +102,7 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA
|
||||
BLASLONG i = 0;
|
||||
BLASLONG ix=0, iy=0;
|
||||
OPENBLAS_COMPLEX_FLOAT result;
|
||||
FLOAT dot[4] __attribute__ ((aligned(16))) = {0.0, 0.0, 0.0, 0.0};
|
||||
FLOAT dot[4] __attribute__((aligned(16))) = {0.0, 0.0, 0.0, 0.0};
|
||||
|
||||
if (n <= 0) {
|
||||
CREAL(result) = 0.0;
|
||||
|
||||
248
kernel/power/cdot_power9.S
Normal file
248
kernel/power/cdot_power9.S
Normal file
@@ -0,0 +1,248 @@
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
/*
|
||||
.file "cdot.c"
|
||||
.abiversion 2
|
||||
.section ".text"
|
||||
.align 2
|
||||
.p2align 4,,15
|
||||
.globl cdot_k
|
||||
.type cdot_k, @function
|
||||
*/
|
||||
PROLOGUE
|
||||
|
||||
/*
 * cdot_k entry point, general strided loop and final combine.
 * Register use in this block:
 *   r3       element count on entry (copied to r9)
 *   r4, r5   first pointer and its stride (in 8-byte elements)
 *   r6, r7   second pointer and its stride (in 8-byte elements)
 * Each element is a pair of single-precision floats at offsets 0 and 4.
 * Writing a for the element at r4 and b for the element at r6, the values
 * left in f1 and f2 at the blr are
 *   without CONJ: f1 = sum(a.0*b.0 - a.4*b.4), f2 = sum(a.4*b.0 + a.0*b.4)
 *   with CONJ:    f1 = sum(a.0*b.0 + a.4*b.4), f2 = sum(a.0*b.4 - a.4*b.0)
 * Presumably (f1, f2) is the complex return value (real, imaginary) --
 * confirm against the C prototype.
 */
cdot_k:
|
||||
.LCF0:
|
||||
/* Global entry: r2 = r12 + (.TOC. - .LCF0). */
0: addis 2,12,.TOC.-.LCF0@ha
|
||||
addi 2,2,.TOC.-.LCF0@l
|
||||
.localentry cdot_k,.-cdot_k
|
||||
/* r9 = count; a count <= 0 returns zeros through .L10. */
mr. 9,3
|
||||
ble 0,.L10
|
||||
/* r5 == 1: let .L18 test the other stride before choosing a path. */
cmpdi 7,5,1
|
||||
beq 7,.L18
|
||||
/* .L3: strided path. CTR = count, strides scaled to bytes, accumulators zeroed. */
.L3:
|
||||
mtctr 9
|
||||
xxlxor 2,2,2
|
||||
sldi 5,5,3
|
||||
sldi 7,7,3
|
||||
#ifdef CONJ
|
||||
fmr 12,2
|
||||
#endif
|
||||
fmr 8,2
|
||||
#ifndef CONJ
|
||||
fmr 9,2
|
||||
#endif
|
||||
fmr 1,2
|
||||
.p2align 4,,15
|
||||
/* .L9: four independent partial sums, one per product term. */
.L9:
|
||||
#ifdef CONJ
|
||||
lfs 9,0(4)
|
||||
lfs 11,0(6)
|
||||
lfs 10,4(6)
|
||||
lfs 0,4(4)
|
||||
add 6,6,7
|
||||
add 4,4,5
|
||||
fmadds 1,9,11,1
|
||||
fmadds 12,9,10,12
|
||||
fmadds 8,0,10,8
|
||||
fmadds 2,11,0,2
|
||||
#else
|
||||
lfs 10,0(4)
|
||||
lfs 12,0(6)
|
||||
lfs 11,4(6)
|
||||
lfs 0,4(4)
|
||||
add 6,6,7
|
||||
add 4,4,5
|
||||
fmadds 1,10,12,1
|
||||
fmadds 8,10,11,8
|
||||
fmadds 9,0,11,9
|
||||
fmadds 2,12,0,2
|
||||
#endif
|
||||
bdnz .L9
|
||||
/* .L7: combine the four partial sums into f1 and f2 and return. */
.L7:
|
||||
#ifdef CONJ
|
||||
fsubs 2,12,2
|
||||
fadds 1,1,8
|
||||
#else
|
||||
fadds 2,2,8
|
||||
fsubs 1,1,9
|
||||
#endif
|
||||
blr
|
||||
.p2align 4,,15
|
||||
/*
 * .L18: reached when r5 == 1. Unless r7 is also 1 the strided loop at .L3
 * is used. r10 = r9 with its low three bits cleared (count rounded down to
 * a multiple of 8); a non-zero r10 is processed by the vector code at .L19.
 * Otherwise the scalar accumulators are zeroed and r8 (float offset) = 0.
 */
.L18:
|
||||
cmpdi 7,7,1
|
||||
bne 7,.L3
|
||||
rldicr. 10,9,0,60
|
||||
bne 0,.L19
|
||||
xxlxor 2,2,2
|
||||
li 8,0
|
||||
#ifdef CONJ
|
||||
fmr 12,2
|
||||
#endif
|
||||
fmr 8,2
|
||||
#ifndef CONJ
|
||||
fmr 9,2
|
||||
#endif
|
||||
fmr 1,2
|
||||
/*
 * .L4: contiguous scalar remainder. Both pointers advance by 4*r8 bytes and
 * CTR = r9 - r10. If r10 + 1 > r9, or r9 equals the value with only the top
 * bit set (built by li -1 / rldicr), .L16 forces CTR to 1 instead.
 */
.L4:
|
||||
addi 7,10,1
|
||||
sldi 8,8,2
|
||||
subf 10,10,9
|
||||
cmpd 7,7,9
|
||||
mtctr 10
|
||||
add 4,4,8
|
||||
add 6,6,8
|
||||
bgt 7,.L16
|
||||
li 10,-1
|
||||
rldicr 10,10,0,0
|
||||
cmpd 7,9,10
|
||||
beq 7,.L16
|
||||
.p2align 4,,15
|
||||
/* .L8: same accumulation as .L9, with both pointers stepping by 8 bytes. */
.L8:
|
||||
#ifdef CONJ
|
||||
lfs 9,0(4)
|
||||
lfs 11,0(6)
|
||||
lfs 10,4(6)
|
||||
lfs 0,4(4)
|
||||
addi 6,6,8
|
||||
addi 4,4,8
|
||||
fmadds 1,9,11,1
|
||||
fmadds 12,9,10,12
|
||||
fmadds 8,0,10,8
|
||||
fmadds 2,11,0,2
|
||||
#else
|
||||
lfs 10,0(4)
|
||||
lfs 12,0(6)
|
||||
lfs 11,4(6)
|
||||
lfs 0,4(4)
|
||||
addi 6,6,8
|
||||
addi 4,4,8
|
||||
fmadds 1,10,12,1
|
||||
fmadds 8,10,11,8
|
||||
fmadds 9,0,11,9
|
||||
fmadds 2,12,0,2
|
||||
#endif
|
||||
bdnz .L8
|
||||
b .L7
|
||||
.p2align 4,,15
|
||||
/* .L10: count <= 0, return with f1 = f2 = 0. */
.L10:
|
||||
xxlxor 1,1,1
|
||||
fmr 2,1
|
||||
blr
|
||||
/*
 * .L19: vector path for the first r10 elements (r10 is a non-zero multiple
 * of 8 when this label is reached). Loads the swap_mask_arr permute control
 * (via .LANCHOR0) into v0 (vs32), zeroes vs42 and copies it into the other
 * seven accumulators. r3 = r10 >> 1 is the loop bound.
 */
.L19:
|
||||
addis 8,2,.LANCHOR0@toc@ha
|
||||
sradi. 3,10,1
|
||||
xxspltib 42,0
|
||||
addi 8,8,.LANCHOR0@toc@l
|
||||
lxv 32,0(8)
|
||||
beq 0,.L12
|
||||
xxlor 6,42,42
|
||||
xxlor 4,42,42
|
||||
xxlor 0,42,42
|
||||
xxlor 7,42,42
|
||||
xxlor 5,42,42
|
||||
xxlor 3,42,42
|
||||
xxlor 12,42,42
|
||||
/* r7 walks the r4 stream, r8 walks the r6 stream, r5 = loop counter. */
mr 7,4
|
||||
mr 8,6
|
||||
li 5,0
|
||||
.p2align 4,,15
|
||||
/*
 * .L6: 64 bytes (eight elements) from each stream per iteration; r5 += 4,
 * repeat while r3 > r5. The r8 vectors are also permuted with the v0 mask
 * (vpermr). vs12/vs3/vs5/vs7 accumulate the plain products, and
 * vs0/vs4/vs6/vs42 accumulate the r7 vectors times the permuted r8 vectors.
 * After the loop the two groups are summed into vs12 and vs42.
 */
.L6:
|
||||
lxv 43,0(8)
|
||||
lxv 44,16(8)
|
||||
addi 5,5,4
|
||||
addi 8,8,64
|
||||
addi 7,7,64
|
||||
lxv 45,-32(8)
|
||||
lxv 33,-16(8)
|
||||
lxv 8,-64(7)
|
||||
lxv 9,-48(7)
|
||||
cmpd 7,3,5
|
||||
lxv 10,-32(7)
|
||||
lxv 11,-16(7)
|
||||
vpermr 6,11,11,0
|
||||
vpermr 7,12,12,0
|
||||
vpermr 8,13,13,0
|
||||
vpermr 9,1,1,0
|
||||
xvmaddasp 12,43,8
|
||||
xvmaddasp 3,44,9
|
||||
xvmaddasp 0,8,38
|
||||
xvmaddasp 4,9,39
|
||||
xvmaddasp 6,10,40
|
||||
xvmaddasp 5,45,10
|
||||
xvmaddasp 42,11,41
|
||||
xvmaddasp 7,33,11
|
||||
bgt 7,.L6
|
||||
xvaddsp 12,12,3
|
||||
xvaddsp 0,0,4
|
||||
xvaddsp 12,12,5
|
||||
xvaddsp 0,0,6
|
||||
xvaddsp 12,12,7
|
||||
xvaddsp 42,0,42
|
||||
/*
 * .L5: reduce vs12 and vs42 to the four scalar partial sums used by the
 * scalar code (converted to double with xscvspdp). r8 = 2*r10 is the float
 * offset of the first unprocessed element. If r9 > r10 the remainder is
 * handled at .L4, otherwise control goes straight to the combine at .L7.
 */
.L5:
|
||||
#ifdef CONJ
|
||||
xxpermdi 8,12,12,2
|
||||
xxpermdi 0,42,42,2
|
||||
cmpd 7,9,10
|
||||
sldi 8,10,1
|
||||
xvaddsp 8,8,12
|
||||
xvaddsp 0,0,42
|
||||
xxsldwi 1,8,8,3
|
||||
xxsldwi 12,0,0,3
|
||||
xxsldwi 8,8,8,2
|
||||
xxsldwi 0,0,0,2
|
||||
xscvspdp 1,1
|
||||
xscvspdp 12,12
|
||||
xscvspdp 8,8
|
||||
#else
|
||||
xxpermdi 9,12,12,2
|
||||
xxpermdi 0,42,42,2
|
||||
cmpd 7,9,10
|
||||
sldi 8,10,1
|
||||
xvaddsp 9,9,12
|
||||
xvaddsp 0,0,42
|
||||
xxsldwi 1,9,9,3
|
||||
xxsldwi 2,0,0,3
|
||||
xxsldwi 9,9,9,2
|
||||
xxsldwi 0,0,0,2
|
||||
xscvspdp 8,2
|
||||
xscvspdp 1,1
|
||||
xscvspdp 9,9
|
||||
#endif
|
||||
xscvspdp 2,0
|
||||
bgt 7,.L4
|
||||
b .L7
|
||||
/* .L12: loop skipped (r3 == 0); vs12 = vs42 (zero), then reduce at .L5. */
.L12:
|
||||
xxlor 12,42,42
|
||||
b .L5
|
||||
/* .L16: CTR = 1, then enter the scalar loop .L8. */
.L16:
|
||||
li 9,1
|
||||
mtctr 9
|
||||
b .L8
|
||||
/* NOTE(review): presumably the compiler-emitted traceback table -- confirm. */
.long 0
|
||||
.byte 0,0,0,0,0,0,0,0
|
||||
.size cdot_k,.-cdot_k
|
||||
/*
 * swap_mask_arr: 16-byte permute control vector, 16-byte aligned (.align 4).
 * The byte indices are 4..7, 0..3, 12..15, 8..11, i.e. within each 8-byte
 * half the two 4-byte words change places.
 * .LANCHOR0 is set to this address; cdot_k loads the mask through
 * .LANCHOR0@toc and uses it as the vpermr control in its .L6 loop.
 */
.section .rodata
|
||||
.align 4
|
||||
.set .LANCHOR0,. + 0
|
||||
.type swap_mask_arr, @object
|
||||
.size swap_mask_arr, 16
|
||||
swap_mask_arr:
|
||||
.byte 4
|
||||
.byte 5
|
||||
.byte 6
|
||||
.byte 7
|
||||
.byte 0
|
||||
.byte 1
|
||||
.byte 2
|
||||
.byte 3
|
||||
.byte 12
|
||||
.byte 13
|
||||
.byte 14
|
||||
.byte 15
|
||||
.byte 8
|
||||
.byte 9
|
||||
.byte 10
|
||||
.byte 11
|
||||
.ident "GCC: (SUSE Linux) 7.3.1 20180323 [gcc-7-branch revision 258812]"
|
||||
.section .note.GNU-stack,"",@progbits
|
||||
@@ -97,7 +97,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#define N r4
|
||||
#define K r5
|
||||
|
||||
#ifdef linux
|
||||
#if defined(linux) || defined(__FreeBSD__)
|
||||
#ifndef __64BIT__
|
||||
#define A r6
|
||||
#define B r7
|
||||
@@ -265,7 +265,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
stfs f2, ALPHA_I_SP
|
||||
// stw r0, FZERO
|
||||
|
||||
#ifdef linux
|
||||
#if defined(linux) || defined(__FreeBSD__)
|
||||
#ifdef __64BIT__
|
||||
ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER)
|
||||
#endif
|
||||
@@ -286,7 +286,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#endif
|
||||
|
||||
#ifdef TRMMKERNEL
|
||||
#if defined(linux) && defined(__64BIT__)
|
||||
#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
|
||||
ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER)
|
||||
#endif
|
||||
|
||||
|
||||
293
kernel/power/cgemm_kernel_power9.S
Normal file
293
kernel/power/cgemm_kernel_power9.S
Normal file
@@ -0,0 +1,293 @@
|
||||
/***************************************************************************
|
||||
Copyright (c) 2013-2019, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
/**************************************************************************************
|
||||
* Abdelrauf(quickwritereader@gmail.com)
|
||||
* BLASTEST : OK
|
||||
* CTEST : OK
|
||||
* TEST : OK
|
||||
* LAPACK-TEST : OK
|
||||
**************************************************************************************/
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
#include "def_vsx.h"
|
||||
|
||||
|
||||
/*
 * Stack frame layout constants. The prologue lowers SP by STACKSIZE and
 * stores the link register (read into r0 with mflr) at FLINK_SAVE(SP).
 */
#define LOAD ld
|
||||
#define STACKSIZE (512 )
|
||||
#define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */
|
||||
/* Symbolic names for the registers used as M, N and K. */
#define M r3
|
||||
#define N r4
|
||||
#define K r5
|
||||
|
||||
|
||||
/*
 * A, B and C live in r8..r10. LDC (r6) is loaded from the stack at
 * FRAMESLOT(0) + STACKSIZE(SP) by the prologue; OFFSET (r7) is loaded from
 * FRAMESLOT(1) + STACKSIZE(SP) only when TRMMKERNEL is defined.
 */
#define A r8
|
||||
#define B r9
|
||||
#define C r10
|
||||
#define LDC r6
|
||||
#define OFFSET r7
|
||||
|
||||
|
||||
/*
 * Vector registers with fixed roles. The prologue fills alpha_r / alpha_i
 * from vs1 / vs2 (converted to single precision and splatted).
 */
#define alpha_r vs19
|
||||
#define alpha_i vs20
|
||||
#define save_permute_1 vs21
|
||||
#define permute_mask vs22
|
||||
#define o0 0
|
||||
|
||||
|
||||
/*
 * General-purpose register aliases. r14..r31 are saved on the stack by the
 * prologue before they are used.
 */
#define T1 r11
|
||||
#define T2 r12
|
||||
#define T3 r14
|
||||
#define T4 r15
|
||||
#define T5 r16
|
||||
#define T6 r17
|
||||
#define L r18
|
||||
#define T7 r19
|
||||
#define T8 r20
|
||||
#define TEMP_REG r21
|
||||
#define I r22
|
||||
#define J r23
|
||||
#define AO r24
|
||||
#define BO r25
|
||||
#define CO r26
|
||||
#define T9 r27
|
||||
#define T10 r28
|
||||
#define PRE r29
|
||||
|
||||
|
||||
#define T12 r30
|
||||
#define T13 r31
|
||||
|
||||
|
||||
#include "cgemm_macros_power9.S"
|
||||
|
||||
|
||||
/*
 * 64-bit halves of 128-bit byte-permute control values. Read as byte
 * indices, perm_const2:perm_const1 is 0c 0d 0e 0f 08 09 0a 0b 04 05 06 07
 * 00 01 02 03. The prologue starts assembling them with lis ...@highest.
 * NOTE(review): how save_permute_11 / save_permute_12 are consumed is not
 * visible in this part of the file -- confirm in the macros file.
 */
.equ perm_const1, 0x0405060700010203
|
||||
.equ perm_const2, 0x0c0d0e0f08090a0b
|
||||
.equ save_permute_12, 0x0c0d0e0f1c1d1e1f
|
||||
.equ save_permute_11, 0x0405060714151617
|
||||
|
||||
|
||||
|
||||
#ifndef NEEDPARAM
|
||||
|
||||
PROLOGUE
|
||||
PROFCODE
|
||||
|
||||
|
||||
addi SP, SP, -STACKSIZE
|
||||
mflr r0
|
||||
|
||||
|
||||
stfd f14, 0(SP)
|
||||
stfd f15, 8(SP)
|
||||
stfd f16, 16(SP)
|
||||
stfd f17, 24(SP)
|
||||
|
||||
stfd f18, 32(SP)
|
||||
stfd f19, 40(SP)
|
||||
stfd f20, 48(SP)
|
||||
stfd f21, 56(SP)
|
||||
|
||||
stfd f22, 64(SP)
|
||||
stfd f23, 72(SP)
|
||||
stfd f24, 80(SP)
|
||||
stfd f25, 88(SP)
|
||||
|
||||
stfd f26, 96(SP)
|
||||
stfd f27, 104(SP)
|
||||
stfd f28, 112(SP)
|
||||
stfd f29, 120(SP)
|
||||
|
||||
stfd f30, 128(SP)
|
||||
stfd f31, 136(SP)
|
||||
|
||||
|
||||
std r31, 144(SP)
|
||||
std r30, 152(SP)
|
||||
std r29, 160(SP)
|
||||
std r28, 168(SP)
|
||||
std r27, 176(SP)
|
||||
std r26, 184(SP)
|
||||
std r25, 192(SP)
|
||||
std r24, 200(SP)
|
||||
std r23, 208(SP)
|
||||
std r22, 216(SP)
|
||||
std r21, 224(SP)
|
||||
std r20, 232(SP)
|
||||
std r19, 240(SP)
|
||||
std r18, 248(SP)
|
||||
std r17, 256(SP)
|
||||
std r16, 264(SP)
|
||||
std r15, 272(SP)
|
||||
std r14, 280(SP)
|
||||
|
||||
|
||||
stxv vs52, 288(SP)
|
||||
stxv vs53, 304(SP)
|
||||
stxv vs54, 320(SP)
|
||||
stxv vs55, 336(SP)
|
||||
stxv vs56, 352(SP)
|
||||
stxv vs57, 368(SP)
|
||||
stxv vs58, 384(SP)
|
||||
stxv vs59, 400(SP)
|
||||
stxv vs60, 416(SP)
|
||||
stxv vs61, 432(SP)
|
||||
stxv vs62, 448(SP)
|
||||
stxv vs63, 464(SP)
|
||||
std r0, FLINK_SAVE(SP)
|
||||
|
||||
|
||||
|
||||
ld LDC, FRAMESLOT(0) + STACKSIZE(SP)
|
||||
|
||||
|
||||
|
||||
#ifdef TRMMKERNEL
|
||||
ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
|
||||
#endif
|
||||
slwi LDC, LDC, ZBASE_SHIFT
|
||||
|
||||
|
||||
|
||||
/* alpha_r is in f1 (vs1) and alpha_i is in f2 (vs2): convert each to single precision and splat word 0 across the vector */
|
||||
xscvdpspn alpha_r,vs1
|
||||
xscvdpspn alpha_i,vs2
|
||||
xxspltw alpha_r,alpha_r,0
|
||||
xxspltw alpha_i,alpha_i,0
|
||||
/*load reverse permute mask for big endian
|
||||
uint128 = 0x0c0d0e0f08090a0b0405060700010203 (perm_const2:perm_const1)
|
||||
*/
|
||||
|
||||
lis T2, perm_const2@highest
|
||||
lis T1, perm_const1@highest
|
||||
lis T3, save_permute_12@highest
|
||||
lis T4, save_permute_11@highest
|
||||
|
||||
|
||||
ori T2, T2, perm_const2@higher
|
||||
ori T1, T1, perm_const1@higher
|
||||
ori T3, T3, save_permute_12@higher
|
||||
ori T4, T4, save_permute_11@higher
|
||||
|
||||
|
||||
rldicr T2, T2, 32, 31
|
||||
rldicr T1, T1, 32, 31
|
||||
rldicr T3, T3, 32, 31
|
||||
rldicr T4, T4, 32, 31
|
||||
|
||||
oris T2, T2, perm_const2@h
|
||||
oris T1, T1, perm_const1@h
|
||||
oris T3, T3, save_permute_12@h
|
||||
oris T4, T4, save_permute_11@h
|
||||
|
||||
|
||||
ori T2, T2, perm_const2@l
|
||||
ori T1, T1, perm_const1@l
|
||||
ori T3, T3, save_permute_12@l
|
||||
ori T4, T4, save_permute_11@l
|
||||
|
||||
|
||||
li r0,0
|
||||
li PRE,512
|
||||
|
||||
#if defined(CC) || defined(CR) || defined(RC) || defined(RR)
|
||||
/*negate for this case as we will use addition -1*(a+b) */
|
||||
xvnegsp alpha_r,alpha_r
|
||||
xvnegsp alpha_i,alpha_i
|
||||
#endif
|
||||
|
||||
mtvsrdd permute_mask,T2,T1
|
||||
mtvsrdd save_permute_1,T3,T4
|
||||
|
||||
/* the mask was built as a reverse permute; swap its two doublewords to turn it into the inner permute */
|
||||
xxpermdi permute_mask, permute_mask, permute_mask,2
|
||||
|
||||
#include "cgemm_logic_power9.S"
|
||||
|
||||
.L999:
|
||||
lfd f14, 0(SP)
|
||||
lfd f15, 8(SP)
|
||||
lfd f16, 16(SP)
|
||||
lfd f17, 24(SP)
|
||||
|
||||
lfd f18, 32(SP)
|
||||
lfd f19, 40(SP)
|
||||
lfd f20, 48(SP)
|
||||
lfd f21, 56(SP)
|
||||
|
||||
lfd f22, 64(SP)
|
||||
lfd f23, 72(SP)
|
||||
lfd f24, 80(SP)
|
||||
lfd f25, 88(SP)
|
||||
|
||||
lfd f26, 96(SP)
|
||||
lfd f27, 104(SP)
|
||||
lfd f28, 112(SP)
|
||||
lfd f29, 120(SP)
|
||||
|
||||
lfd f30, 128(SP)
|
||||
lfd f31, 136(SP)
|
||||
|
||||
ld r31, 144(SP)
|
||||
ld r30, 152(SP)
|
||||
ld r29, 160(SP)
|
||||
ld r28, 168(SP)
|
||||
ld r27, 176(SP)
|
||||
ld r26, 184(SP)
|
||||
ld r25, 192(SP)
|
||||
ld r24, 200(SP)
|
||||
ld r23, 208(SP)
|
||||
ld r22, 216(SP)
|
||||
ld r21, 224(SP)
|
||||
ld r20, 232(SP)
|
||||
ld r19, 240(SP)
|
||||
ld r18, 248(SP)
|
||||
ld r17, 256(SP)
|
||||
ld r16, 264(SP)
|
||||
ld r15, 272(SP)
|
||||
ld r14, 280(SP)
|
||||
|
||||
ld r0, FLINK_SAVE(SP)
|
||||
|
||||
lxv vs52, 288(SP)
|
||||
lxv vs53, 304(SP)
|
||||
lxv vs54, 320(SP)
|
||||
lxv vs55, 336(SP)
|
||||
lxv vs56, 352(SP)
|
||||
lxv vs57, 368(SP)
|
||||
lxv vs58, 384(SP)
|
||||
lxv vs59, 400(SP)
|
||||
mtlr r0
|
||||
lxv vs60, 416(SP)
|
||||
lxv vs61, 432(SP)
|
||||
lxv vs62, 448(SP)
|
||||
lxv vs63, 464(SP)
|
||||
|
||||
addi SP, SP, STACKSIZE
|
||||
blr
|
||||
|
||||
|
||||
EPILOGUE
|
||||
#endif
|
||||
2816
kernel/power/cgemm_logic_power9.S
Normal file
2816
kernel/power/cgemm_logic_power9.S
Normal file
File diff suppressed because it is too large
Load Diff
3019
kernel/power/cgemm_macros_power9.S
Normal file
3019
kernel/power/cgemm_macros_power9.S
Normal file
File diff suppressed because it is too large
Load Diff
@@ -32,7 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#define NBMAX 1024
|
||||
|
||||
|
||||
static const unsigned char swap_mask_arr[]={ 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11};
|
||||
static const unsigned char __attribute__((aligned(16))) swap_mask_arr[]={ 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11};
|
||||
|
||||
|
||||
static void cgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) {
|
||||
@@ -62,23 +62,24 @@ static void cgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA
|
||||
register __vector float vx3_r = {x[6], -x[6],x[6], -x[6]};
|
||||
register __vector float vx3_i = {x[7], x[7],x[7], x[7]};
|
||||
#endif
|
||||
register __vector float *vy = (__vector float *) y;
|
||||
register __vector float *vptr_y = (__vector float *) y;
|
||||
register __vector float *vptr_a0 = (__vector float *) a0;
|
||||
register __vector float *vptr_a1 = (__vector float *) a1;
|
||||
register __vector float *vptr_a2 = (__vector float *) a2;
|
||||
register __vector float *vptr_a3 = (__vector float *) a3;
|
||||
BLASLONG i = 0;
|
||||
for (;i< n / 2; i+=2) {
|
||||
register __vector float vy_0 = vy[i];
|
||||
register __vector float vy_1 = vy[i + 1];
|
||||
register __vector float va0 = vptr_a0[i];
|
||||
register __vector float va1 = vptr_a1[i];
|
||||
register __vector float va2 = vptr_a2[i];
|
||||
register __vector float va3 = vptr_a3[i];
|
||||
register __vector float va0_1 = vptr_a0[i + 1];
|
||||
register __vector float va1_1 = vptr_a1[i + 1];
|
||||
register __vector float va2_1 = vptr_a2[i + 1];
|
||||
register __vector float va3_1 = vptr_a3[i + 1];
|
||||
BLASLONG i2=16;
|
||||
for (;i< n * 8; i+=32,i2+=32) {
|
||||
register __vector float vy_0 = vec_vsx_ld(i,vptr_y);
|
||||
register __vector float vy_1 = vec_vsx_ld(i2,vptr_y);
|
||||
register __vector float va0 = vec_vsx_ld(i,vptr_a0);
|
||||
register __vector float va1 = vec_vsx_ld(i, vptr_a1);
|
||||
register __vector float va2 = vec_vsx_ld(i ,vptr_a2);
|
||||
register __vector float va3 = vec_vsx_ld(i ,vptr_a3);
|
||||
register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0);
|
||||
register __vector float va1_1 = vec_vsx_ld(i2 ,vptr_a1);
|
||||
register __vector float va2_1 = vec_vsx_ld(i2 ,vptr_a2);
|
||||
register __vector float va3_1 = vec_vsx_ld(i2 ,vptr_a3);
|
||||
|
||||
vy_0 += va0*vx0_r + va1*vx1_r + va2*vx2_r + va3*vx3_r;
|
||||
vy_1 += va0_1*vx0_r + va1_1*vx1_r + va2_1*vx2_r + va3_1*vx3_r;
|
||||
@@ -93,8 +94,8 @@ static void cgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA
|
||||
vy_0 += va0*vx0_i + va1*vx1_i + va2*vx2_i + va3*vx3_i;
|
||||
vy_1 += va0_1*vx0_i + va1_1*vx1_i + va2_1*vx2_i + va3_1*vx3_i;
|
||||
|
||||
vy[i] = vy_0;
|
||||
vy[i + 1] = vy_1;
|
||||
vec_vsx_st(vy_0 ,i, vptr_y);
|
||||
vec_vsx_st(vy_1,i2,vptr_y);
|
||||
}
|
||||
|
||||
}
|
||||
@@ -118,17 +119,19 @@ static void cgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA
|
||||
register __vector float vx1_r = {x[2], -x[2],x[2], -x[2]};
|
||||
register __vector float vx1_i = {x[3], x[3],x[3], x[3]};
|
||||
#endif
|
||||
register __vector float *vy = (__vector float *) y;
|
||||
register __vector float *vptr_y = (__vector float *) y;
|
||||
register __vector float *vptr_a0 = (__vector float *) a0;
|
||||
register __vector float *vptr_a1 = (__vector float *) a1;
|
||||
BLASLONG i = 0;
|
||||
for (;i< n / 2; i+=2) {
|
||||
register __vector float vy_0 = vy[i];
|
||||
register __vector float vy_1 = vy[i + 1];
|
||||
register __vector float va0 = vptr_a0[i];
|
||||
register __vector float va1 = vptr_a1[i];
|
||||
register __vector float va0_1 = vptr_a0[i + 1];
|
||||
register __vector float va1_1 = vptr_a1[i + 1];
|
||||
BLASLONG i = 0;
|
||||
BLASLONG i2 = 16;
|
||||
for (;i< n * 8; i+=32, i2+=32) {
|
||||
register __vector float vy_0 = vec_vsx_ld(i,vptr_y);
|
||||
register __vector float vy_1 = vec_vsx_ld(i2,vptr_y);
|
||||
register __vector float va0 = vec_vsx_ld(i,vptr_a0);
|
||||
register __vector float va1 = vec_vsx_ld(i, vptr_a1);
|
||||
register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0);
|
||||
register __vector float va1_1 = vec_vsx_ld(i2 ,vptr_a1);
|
||||
|
||||
register __vector float va0x = vec_perm(va0, va0,swap_mask);
|
||||
register __vector float va0x_1 = vec_perm(va0_1, va0_1,swap_mask);
|
||||
register __vector float va1x = vec_perm(va1, va1,swap_mask);
|
||||
@@ -136,8 +139,8 @@ static void cgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA
|
||||
vy_0 += va0*vx0_r + va1*vx1_r + va0x*vx0_i + va1x*vx1_i;
|
||||
vy_1 += va0_1*vx0_r + va1_1*vx1_r + va0x_1*vx0_i + va1x_1*vx1_i;
|
||||
|
||||
vy[i] = vy_0;
|
||||
vy[i + 1] = vy_1;
|
||||
vec_vsx_st(vy_0 ,i, vptr_y);
|
||||
vec_vsx_st(vy_1,i2,vptr_y);
|
||||
}
|
||||
|
||||
}
|
||||
@@ -154,21 +157,23 @@ static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) {
|
||||
register __vector float vx0_r = {x[0], -x[0],x[0], -x[0]};
|
||||
register __vector float vx0_i = {x[1], x[1],x[1], x[1]};
|
||||
#endif
|
||||
register __vector float *vy = (__vector float *) y;
|
||||
register __vector float *vptr_y = (__vector float *) y;
|
||||
register __vector float *vptr_a0 = (__vector float *) ap;
|
||||
BLASLONG i = 0;
|
||||
for (;i< n / 2; i+=2) {
|
||||
register __vector float vy_0 = vy[i];
|
||||
register __vector float vy_1 = vy[i + 1];
|
||||
register __vector float va0 = vptr_a0[i];
|
||||
register __vector float va0_1 = vptr_a0[i + 1];
|
||||
BLASLONG i2 = 16;
|
||||
for (;i< n * 8; i+=32, i2+=32) {
|
||||
register __vector float vy_0 = vec_vsx_ld(i,vptr_y);
|
||||
register __vector float vy_1 = vec_vsx_ld(i2,vptr_y);
|
||||
register __vector float va0 = vec_vsx_ld(i,vptr_a0);
|
||||
register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0);
|
||||
|
||||
register __vector float va0x = vec_perm(va0, va0,swap_mask);
|
||||
register __vector float va0x_1 = vec_perm(va0_1, va0_1,swap_mask);
|
||||
vy_0 += va0*vx0_r + va0x*vx0_i;
|
||||
vy_1 += va0_1*vx0_r + va0x_1*vx0_i;
|
||||
|
||||
vy[i] = vy_0;
|
||||
vy[i + 1] = vy_1;
|
||||
vec_vsx_st(vy_0 ,i, vptr_y);
|
||||
vec_vsx_st(vy_1,i2,vptr_y);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -176,7 +181,7 @@ static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) {
|
||||
|
||||
|
||||
static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT alpha_r, FLOAT alpha_i) {
|
||||
BLASLONG i;
|
||||
BLASLONG i=0;
|
||||
|
||||
|
||||
if (inc_dest != 2) {
|
||||
@@ -213,20 +218,24 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT
|
||||
|
||||
register __vector float *vptr_src = (__vector float *) src;
|
||||
register __vector float *vptr_y = (__vector float *) dest;
|
||||
for (i = 0; i < n/2; i += 2 ){
|
||||
|
||||
register __vector float vy_0 = vptr_y[i];
|
||||
register __vector float vy_1 = vptr_y[i +1];
|
||||
BLASLONG i2 = 16;
|
||||
for (;i< n * 8; i+=32, i2+=32) {
|
||||
register __vector float vy_0 = vec_vsx_ld(i,vptr_y);
|
||||
register __vector float vy_1 = vec_vsx_ld(i2,vptr_y);
|
||||
|
||||
register __vector float vsrc = vptr_src[i];
|
||||
register __vector float vsrc_1 = vptr_src[i + 1];
|
||||
register __vector float vsrcx = vec_perm(vsrc, vsrc, swap_mask);
|
||||
register __vector float vsrcx_1 = vec_perm(vsrc_1, vsrc_1, swap_mask);
|
||||
|
||||
vy_0 += vsrc*valpha_r + vsrcx*valpha_i;
|
||||
vy_1 += vsrc_1*valpha_r + vsrcx_1*valpha_i;
|
||||
vptr_y[i] = vy_0;
|
||||
vptr_y[i+1 ] = vy_1;
|
||||
register __vector float vsrc = vec_vsx_ld(i,vptr_src);
|
||||
register __vector float vsrc_1 = vec_vsx_ld(i2,vptr_src);
|
||||
|
||||
register __vector float vsrcx = vec_perm(vsrc, vsrc, swap_mask);
|
||||
register __vector float vsrcx_1 = vec_perm(vsrc_1, vsrc_1, swap_mask);
|
||||
|
||||
vy_0 += vsrc*valpha_r + vsrcx*valpha_i;
|
||||
vy_1 += vsrc_1*valpha_r + vsrcx_1*valpha_i;
|
||||
|
||||
vec_vsx_st(vy_0 ,i, vptr_y);
|
||||
vec_vsx_st(vy_1,i2,vptr_y);
|
||||
|
||||
}
|
||||
|
||||
@@ -237,7 +246,7 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT
|
||||
|
||||
|
||||
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT * buffer) {
|
||||
BLASLONG i;
|
||||
BLASLONG i=0;
|
||||
FLOAT *a_ptr;
|
||||
FLOAT *x_ptr;
|
||||
FLOAT *y_ptr;
|
||||
@@ -247,8 +256,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
|
||||
BLASLONG m2;
|
||||
BLASLONG m3;
|
||||
BLASLONG n2;
|
||||
|
||||
FLOAT xbuffer[8], *ybuffer;
|
||||
FLOAT xbuffer[8] __attribute__((aligned(16)));
|
||||
FLOAT *ybuffer;
|
||||
|
||||
if (m < 1) return (0);
|
||||
if (n < 1) return (0);
|
||||
|
||||
@@ -29,10 +29,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#define NBMAX 1024
|
||||
#include <altivec.h>
|
||||
static const unsigned char swap_mask_arr[]={ 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11};
|
||||
static const unsigned char __attribute__((aligned(16))) swap_mask_arr[]={ 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11};
|
||||
|
||||
static void cgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) {
|
||||
BLASLONG i;
|
||||
|
||||
FLOAT *a0, *a1, *a2, *a3;
|
||||
a0 = ap;
|
||||
a1 = ap + lda;
|
||||
@@ -48,26 +48,39 @@ static void cgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA
|
||||
register __vector float vtemp2_r = {0.0, 0.0,0.0,0.0};
|
||||
register __vector float vtemp3_p = {0.0, 0.0,0.0,0.0};
|
||||
register __vector float vtemp3_r = {0.0, 0.0,0.0,0.0};
|
||||
__vector float* va0 = (__vector float*) a0;
|
||||
__vector float* va1 = (__vector float*) a1;
|
||||
__vector float* va2 = (__vector float*) a2;
|
||||
__vector float* va3 = (__vector float*) a3;
|
||||
__vector float* vptr_a0 = (__vector float*) a0;
|
||||
__vector float* vptr_a1 = (__vector float*) a1;
|
||||
__vector float* vptr_a2 = (__vector float*) a2;
|
||||
__vector float* vptr_a3 = (__vector float*) a3;
|
||||
__vector float* v_x = (__vector float*) x;
|
||||
|
||||
for (i = 0; i < n / 2; i+=2) {
|
||||
register __vector float vx_0 = v_x[i];
|
||||
register __vector float vx_1 = v_x[i+1];
|
||||
BLASLONG i = 0;
|
||||
BLASLONG i2 = 16;
|
||||
for (;i< n * 8; i+=32, i2+=32) {
|
||||
register __vector float vx_0 = vec_vsx_ld( i,v_x) ;
|
||||
register __vector float vx_1 = vec_vsx_ld(i2, v_x);
|
||||
|
||||
register __vector float vxr_0 = vec_perm(vx_0, vx_0, swap_mask);
|
||||
register __vector float vxr_1 = vec_perm(vx_1, vx_1, swap_mask);
|
||||
|
||||
vtemp0_p += vx_0*va0[i] + vx_1*va0[i+1] ;
|
||||
vtemp0_r += vxr_0*va0[i] + vxr_1*va0[i+1];
|
||||
vtemp1_p += vx_0*va1[i] + vx_1*va1[i+1];
|
||||
vtemp1_r += vxr_0*va1[i] + vxr_1*va1[i+1];
|
||||
vtemp2_p += vx_0*va2[i] + vx_1*va2[i+1];
|
||||
vtemp2_r += vxr_0*va2[i] + vxr_1*va2[i+1];
|
||||
vtemp3_p += vx_0*va3[i] + vx_1*va3[i+1];
|
||||
vtemp3_r += vxr_0*va3[i] + vxr_1*va3[i+1];
|
||||
register __vector float va0 = vec_vsx_ld(i,vptr_a0);
|
||||
register __vector float va1 = vec_vsx_ld(i, vptr_a1);
|
||||
register __vector float va2 = vec_vsx_ld(i ,vptr_a2);
|
||||
register __vector float va3 = vec_vsx_ld(i ,vptr_a3);
|
||||
register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0);
|
||||
register __vector float va1_1 = vec_vsx_ld(i2 ,vptr_a1);
|
||||
register __vector float va2_1 = vec_vsx_ld(i2 ,vptr_a2);
|
||||
register __vector float va3_1 = vec_vsx_ld(i2 ,vptr_a3);
|
||||
|
||||
|
||||
vtemp0_p += vx_0*va0 + vx_1*va0_1 ;
|
||||
vtemp0_r += vxr_0*va0 + vxr_1*va0_1;
|
||||
vtemp1_p += vx_0*va1 + vx_1*va1_1;
|
||||
vtemp1_r += vxr_0*va1 + vxr_1*va1_1;
|
||||
vtemp2_p += vx_0*va2 + vx_1*va2_1;
|
||||
vtemp2_r += vxr_0*va2 + vxr_1*va2_1;
|
||||
vtemp3_p += vx_0*va3 + vx_1*va3_1;
|
||||
vtemp3_r += vxr_0*va3 + vxr_1*va3_1;
|
||||
|
||||
}
|
||||
|
||||
@@ -128,7 +141,7 @@ static void cgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA
|
||||
|
||||
|
||||
static void cgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) {
|
||||
BLASLONG i;
|
||||
|
||||
FLOAT *a0, *a1;
|
||||
a0 = ap;
|
||||
a1 = ap + lda;
|
||||
@@ -138,23 +151,33 @@ static void cgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA
|
||||
register __vector float vtemp0_r = {0.0, 0.0,0.0,0.0};
|
||||
register __vector float vtemp1_p = {0.0, 0.0,0.0,0.0};
|
||||
register __vector float vtemp1_r = {0.0, 0.0,0.0,0.0};
|
||||
__vector float* va0 = (__vector float*) a0;
|
||||
__vector float* va1 = (__vector float*) a1;
|
||||
|
||||
|
||||
__vector float* vptr_a0 = (__vector float*) a0;
|
||||
__vector float* vptr_a1 = (__vector float*) a1;
|
||||
__vector float* v_x = (__vector float*) x;
|
||||
|
||||
for (i = 0; i < n / 2; i+=2) {
|
||||
register __vector float vx_0 = v_x[i];
|
||||
register __vector float vx_1 = v_x[i+1];
|
||||
BLASLONG i = 0;
|
||||
BLASLONG i2 = 16;
|
||||
for (;i< n * 8; i+=32, i2+=32) {
|
||||
register __vector float vx_0 = vec_vsx_ld( i,v_x) ;
|
||||
register __vector float vx_1 = vec_vsx_ld(i2, v_x);
|
||||
|
||||
register __vector float vxr_0 = vec_perm(vx_0, vx_0, swap_mask);
|
||||
register __vector float vxr_1 = vec_perm(vx_1, vx_1, swap_mask);
|
||||
|
||||
vtemp0_p += vx_0*va0[i] + vx_1*va0[i+1] ;
|
||||
vtemp0_r += vxr_0*va0[i] + vxr_1*va0[i+1];
|
||||
vtemp1_p += vx_0*va1[i] + vx_1*va1[i+1];
|
||||
vtemp1_r += vxr_0*va1[i] + vxr_1*va1[i+1];
|
||||
register __vector float va0 = vec_vsx_ld(i,vptr_a0);
|
||||
register __vector float va1 = vec_vsx_ld(i, vptr_a1);
|
||||
register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0);
|
||||
register __vector float va1_1 = vec_vsx_ld(i2 ,vptr_a1);
|
||||
|
||||
|
||||
vtemp0_p += vx_0*va0 + vx_1*va0_1 ;
|
||||
vtemp0_r += vxr_0*va0 + vxr_1*va0_1;
|
||||
vtemp1_p += vx_0*va1 + vx_1*va1_1;
|
||||
vtemp1_r += vxr_0*va1 + vxr_1*va1_1;
|
||||
|
||||
}
|
||||
|
||||
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
||||
|
||||
register FLOAT temp_r0 = vtemp0_p[0] - vtemp0_p[1] + vtemp0_p[2] - vtemp0_p[3];
|
||||
@@ -193,23 +216,27 @@ static void cgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA
|
||||
|
||||
|
||||
static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) {
|
||||
BLASLONG i;
|
||||
|
||||
__vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr);
|
||||
// _p accumulates the like products (real*real, imag*imag, real*real, imag*imag); _r accumulates the cross products formed with the swapped x (real*imag, imag*real, real*imag, imag*real)
|
||||
register __vector float vtemp0_p = {0.0, 0.0,0.0,0.0};
|
||||
register __vector float vtemp0_r = {0.0, 0.0,0.0,0.0};
|
||||
__vector float* va0 = (__vector float*) ap;
|
||||
__vector float* vptr_a0 = (__vector float*) ap;
|
||||
__vector float* v_x = (__vector float*) x;
|
||||
|
||||
for (i = 0; i < n / 2; i+=2) {
|
||||
register __vector float vx_0 = v_x[i];
|
||||
register __vector float vx_1 = v_x[i+1];
|
||||
BLASLONG i = 0;
|
||||
BLASLONG i2 = 16;
|
||||
for (;i< n * 8; i+=32, i2+=32) {
|
||||
register __vector float vx_0 = vec_vsx_ld( i,v_x) ;
|
||||
register __vector float vx_1 = vec_vsx_ld(i2, v_x);
|
||||
|
||||
register __vector float vxr_0 = vec_perm(vx_0, vx_0, swap_mask);
|
||||
register __vector float vxr_1 = vec_perm(vx_1, vx_1, swap_mask);
|
||||
|
||||
vtemp0_p += vx_0*va0[i] + vx_1*va0[i+1] ;
|
||||
vtemp0_r += vxr_0*va0[i] + vxr_1*va0[i+1];
|
||||
register __vector float va0 = vec_vsx_ld(i,vptr_a0);
|
||||
register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0);
|
||||
|
||||
vtemp0_p += vx_0*va0 + vx_1*va0_1 ;
|
||||
vtemp0_r += vxr_0*va0 + vxr_1*va0_1;
|
||||
}
|
||||
|
||||
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
||||
@@ -249,8 +276,8 @@ static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) {
|
||||
}
|
||||
|
||||
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) {
|
||||
BLASLONG i;
|
||||
BLASLONG j;
|
||||
BLASLONG i=0;
|
||||
BLASLONG j=0;
|
||||
FLOAT *a_ptr;
|
||||
FLOAT *x_ptr;
|
||||
FLOAT *y_ptr;
|
||||
@@ -260,8 +287,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
|
||||
BLASLONG m2;
|
||||
BLASLONG m3;
|
||||
BLASLONG n2;
|
||||
|
||||
FLOAT ybuffer[8], *xbuffer;
|
||||
FLOAT ybuffer[8] __attribute__((aligned(16)));
|
||||
FLOAT *xbuffer;
|
||||
|
||||
if (m < 1) return (0);
|
||||
if (n < 1) return (0);
|
||||
|
||||
@@ -98,7 +98,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#define N r4
|
||||
#define K r5
|
||||
|
||||
#ifdef linux
|
||||
#if defined(linux) || defined(__FreeBSD__)
|
||||
#ifndef __64BIT__
|
||||
#define A r6
|
||||
#define B r7
|
||||
@@ -264,7 +264,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
stfs f2, ALPHA_I_SP
|
||||
// stw r0, FZERO
|
||||
|
||||
#ifdef linux
|
||||
#if defined(linux) || defined(__FreeBSD__)
|
||||
#ifdef __64BIT__
|
||||
ld LDC, FRAMESLOT(0) + STACKSIZE(SP)
|
||||
#endif
|
||||
@@ -285,7 +285,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#endif
|
||||
|
||||
#ifdef TRMMKERNEL
|
||||
#if defined(linux) && defined(__64BIT__)
|
||||
#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
|
||||
ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
|
||||
#endif
|
||||
|
||||
|
||||
@@ -97,7 +97,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#define N r4
|
||||
#define K r5
|
||||
|
||||
#ifdef linux
|
||||
#if defined(linux) || defined(__FreeBSD__)
|
||||
#ifndef __64BIT__
|
||||
#define A r6
|
||||
#define B r7
|
||||
@@ -271,7 +271,7 @@ li r11,0
|
||||
slwi LDC, LDC, BASE_SHIFT
|
||||
|
||||
#if defined(TRMMKERNEL)
|
||||
#if defined(linux) && defined(__64BIT__)
|
||||
#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
|
||||
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
|
||||
#endif
|
||||
|
||||
|
||||
@@ -135,18 +135,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
std r14, 280(SP)
|
||||
|
||||
|
||||
stxv v20, 288(SP)
|
||||
stxv v21, 304(SP)
|
||||
stxv v22, 320(SP)
|
||||
stxv v23, 336(SP)
|
||||
stxv v24, 352(SP)
|
||||
stxv v25, 368(SP)
|
||||
stxv v26, 384(SP)
|
||||
stxv v27, 400(SP)
|
||||
stxv v28, 416(SP)
|
||||
stxv v29, 432(SP)
|
||||
stxv v30, 448(SP)
|
||||
stxv v31, 464(SP)
|
||||
stxv vs52, 288(SP)
|
||||
stxv vs53, 304(SP)
|
||||
stxv vs54, 320(SP)
|
||||
stxv vs55, 336(SP)
|
||||
stxv vs56, 352(SP)
|
||||
stxv vs57, 368(SP)
|
||||
stxv vs58, 384(SP)
|
||||
stxv vs59, 400(SP)
|
||||
stxv vs60, 416(SP)
|
||||
stxv vs61, 432(SP)
|
||||
stxv vs62, 448(SP)
|
||||
stxv vs63, 464(SP)
|
||||
|
||||
|
||||
stfd f1, ALPHA_SP
|
||||
@@ -229,18 +229,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
ld r15, 272(SP)
|
||||
ld r14, 280(SP)
|
||||
|
||||
lxv v20, 288(SP)
|
||||
lxv v21, 304(SP)
|
||||
lxv v22, 320(SP)
|
||||
lxv v23, 336(SP)
|
||||
lxv v24, 352(SP)
|
||||
lxv v25, 368(SP)
|
||||
lxv v26, 384(SP)
|
||||
lxv v27, 400(SP)
|
||||
lxv v28, 416(SP)
|
||||
lxv v29, 432(SP)
|
||||
lxv v30, 448(SP)
|
||||
lxv v31, 464(SP)
|
||||
lxv vs52, 288(SP)
|
||||
lxv vs53, 304(SP)
|
||||
lxv vs54, 320(SP)
|
||||
lxv vs55, 336(SP)
|
||||
lxv vs56, 352(SP)
|
||||
lxv vs57, 368(SP)
|
||||
lxv vs58, 384(SP)
|
||||
lxv vs59, 400(SP)
|
||||
lxv vs60, 416(SP)
|
||||
lxv vs61, 432(SP)
|
||||
lxv vs62, 448(SP)
|
||||
lxv vs63, 464(SP)
|
||||
|
||||
addi SP, SP, STACKSIZE
|
||||
blr
|
||||
|
||||
@@ -145,7 +145,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
|
||||
BLASLONG m3;
|
||||
BLASLONG n2;
|
||||
BLASLONG lda4 = lda << 2;
|
||||
FLOAT xbuffer[8] __attribute__ ((aligned (16)));;
|
||||
FLOAT xbuffer[8] __attribute__ ((aligned (16)));
|
||||
FLOAT *ybuffer;
|
||||
|
||||
if ( m < 1 ) return(0);
|
||||
|
||||
@@ -581,9 +581,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
|
||||
BLASLONG m1;
|
||||
BLASLONG m2;
|
||||
BLASLONG m3;
|
||||
BLASLONG n2;
|
||||
|
||||
FLOAT ybuffer[8], *xbuffer;
|
||||
BLASLONG n2;
|
||||
FLOAT ybuffer[8] __attribute__((aligned(16)));
|
||||
FLOAT *xbuffer;
|
||||
|
||||
if (m < 1) return (0);
|
||||
if (n < 1) return (0);
|
||||
|
||||
@@ -96,7 +96,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#define N r4
|
||||
#define K r5
|
||||
|
||||
#ifdef linux
|
||||
#if defined(linux) || defined(__FreeBSD__)
|
||||
#ifndef __64BIT__
|
||||
#define A r6
|
||||
#define B r7
|
||||
@@ -257,8 +257,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
stvx v31, r11, r0
|
||||
li r11,0
|
||||
|
||||
stw r31, 144(SP)
|
||||
|
||||
stfd f1, ALPHA_SP
|
||||
stw r0, FZERO
|
||||
|
||||
@@ -271,7 +269,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
slwi LDC, LDC, BASE_SHIFT
|
||||
|
||||
#if defined(TRMMKERNEL)
|
||||
#if defined(linux) && defined(__64BIT__)
|
||||
#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
|
||||
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
|
||||
#endif
|
||||
|
||||
|
||||
@@ -61,7 +61,7 @@
|
||||
#define N r4
|
||||
#define K r5
|
||||
|
||||
#ifdef linux
|
||||
#if defined(linux) || defined(__FreeBSD__)
|
||||
#ifndef __64BIT__
|
||||
#define A r6
|
||||
#define B r7
|
||||
@@ -217,7 +217,7 @@ li r11,0
|
||||
#endif
|
||||
|
||||
|
||||
#if defined(linux) && defined(__64BIT__)
|
||||
#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
|
||||
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
|
||||
#endif
|
||||
|
||||
|
||||
@@ -62,7 +62,7 @@
|
||||
stfd f31, 16(SP)
|
||||
stw r0, 24(SP)
|
||||
|
||||
#ifdef linux
|
||||
#if defined(linux) || defined(__FreeBSD__)
|
||||
#ifndef __64BIT__
|
||||
lwz LDC, FRAMESLOT(0) + STACKSIZE(SP)
|
||||
#else
|
||||
|
||||
@@ -59,7 +59,7 @@
|
||||
#define N r4
|
||||
#define K r5
|
||||
|
||||
#ifdef linux
|
||||
#if defined(linux) || defined(__FreeBSD__)
|
||||
#ifndef __64BIT__
|
||||
#define A r6
|
||||
#define B r7
|
||||
@@ -186,7 +186,7 @@
|
||||
slwi LDC, LDC, BASE_SHIFT
|
||||
|
||||
#if defined(TRMMKERNEL)
|
||||
#if defined(linux) && defined(__64BIT__)
|
||||
#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
|
||||
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
|
||||
#endif
|
||||
|
||||
@@ -228,7 +228,7 @@
|
||||
|
||||
#else
|
||||
|
||||
#ifdef linux
|
||||
#if defined(linux) || defined(__FreeBSD__)
|
||||
#ifndef __64BIT__
|
||||
mr PREA, r10
|
||||
lwz PREB, FRAMESLOT(0) + STACKSIZE(SP)
|
||||
|
||||
@@ -58,7 +58,7 @@
|
||||
#define N r4
|
||||
#define K r5
|
||||
|
||||
#ifdef linux
|
||||
#if defined(linux) || defined(__FreeBSD__)
|
||||
#ifndef __64BIT__
|
||||
#define A r6
|
||||
#define B r7
|
||||
|
||||
@@ -58,7 +58,7 @@
|
||||
#define N r4
|
||||
#define K r5
|
||||
|
||||
#ifdef linux
|
||||
#if defined(linux) || defined(__FreeBSD__)
|
||||
#ifndef __64BIT__
|
||||
#define A r6
|
||||
#define B r7
|
||||
|
||||
@@ -58,7 +58,7 @@
|
||||
#define N r4
|
||||
#define K r5
|
||||
|
||||
#ifdef linux
|
||||
#if defined(linux) || defined(__FreeBSD__)
|
||||
#ifndef __64BIT__
|
||||
#define A r6
|
||||
#define B r7
|
||||
|
||||
@@ -59,7 +59,7 @@
|
||||
#define N r4
|
||||
#define K r5
|
||||
|
||||
#ifdef linux
|
||||
#if defined(linux) || defined(__FreeBSD__)
|
||||
#ifndef __64BIT__
|
||||
#define A r6
|
||||
#define B r7
|
||||
@@ -192,7 +192,7 @@
|
||||
slwi LDC, LDC, BASE_SHIFT
|
||||
|
||||
#if defined(TRMMKERNEL)
|
||||
#if defined(linux) && defined(__64BIT__)
|
||||
#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
|
||||
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
|
||||
#endif
|
||||
|
||||
@@ -226,7 +226,7 @@
|
||||
li PREC, 4 * SIZE
|
||||
#endif
|
||||
|
||||
#ifdef linux
|
||||
#if defined(linux) || defined(__FreeBSD__)
|
||||
#ifndef __64BIT__
|
||||
mr PREA, r10
|
||||
lwz PREB, FRAMESLOT(0) + STACKSIZE(SP)
|
||||
|
||||
@@ -59,7 +59,7 @@
|
||||
#define N r4
|
||||
#define K r5
|
||||
|
||||
#ifdef linux
|
||||
#if defined(linux) || defined(__FreeBSD__)
|
||||
#ifndef __64BIT__
|
||||
#define A r6
|
||||
#define B r7
|
||||
@@ -184,7 +184,7 @@
|
||||
slwi LDC, LDC, BASE_SHIFT
|
||||
|
||||
#if defined(TRMMKERNEL)
|
||||
#if defined(linux) && defined(__64BIT__)
|
||||
#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
|
||||
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
|
||||
#endif
|
||||
|
||||
|
||||
@@ -46,7 +46,7 @@
|
||||
#define N r4
|
||||
#define K r5
|
||||
|
||||
#ifdef linux
|
||||
#if defined(linux) || defined(__FreeBSD__)
|
||||
#define A r6
|
||||
#define B r7
|
||||
#define C r8
|
||||
|
||||
@@ -59,7 +59,7 @@
|
||||
#define N r4
|
||||
#define K r5
|
||||
|
||||
#ifdef linux
|
||||
#if defined(linux) || defined(__FreeBSD__)
|
||||
#ifndef __64BIT__
|
||||
#define A r6
|
||||
#define B r7
|
||||
@@ -187,7 +187,7 @@
|
||||
li PREC, 4 * SIZE
|
||||
#else
|
||||
|
||||
#ifdef linux
|
||||
#if defined(linux) || defined(__FreeBSD__)
|
||||
#ifndef __64BIT__
|
||||
mr PREA, r10
|
||||
lwz PREB, FRAMESLOT(0) + STACKSIZE(SP)
|
||||
|
||||
@@ -59,7 +59,7 @@
|
||||
#define N r4
|
||||
#define K r5
|
||||
|
||||
#ifdef linux
|
||||
#if defined(linux) || defined(__FreeBSD__)
|
||||
#ifndef __64BIT__
|
||||
#define A r6
|
||||
#define B r7
|
||||
@@ -183,7 +183,7 @@
|
||||
slwi LDC, LDC, BASE_SHIFT
|
||||
|
||||
#if defined(TRMMKERNEL)
|
||||
#if defined(linux) && defined(__64BIT__)
|
||||
#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
|
||||
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
|
||||
#endif
|
||||
|
||||
|
||||
@@ -59,7 +59,7 @@
|
||||
#define N r4
|
||||
#define K r5
|
||||
|
||||
#ifdef linux
|
||||
#if defined(linux) || defined(__FreeBSD__)
|
||||
#ifndef __64BIT__
|
||||
#define A r6
|
||||
#define B r7
|
||||
@@ -183,7 +183,7 @@
|
||||
slwi LDC, LDC, BASE_SHIFT
|
||||
|
||||
#if defined(TRMMKERNEL)
|
||||
#if defined(linux) && defined(__64BIT__)
|
||||
#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__)
|
||||
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
|
||||
#endif
|
||||
|
||||
|
||||
@@ -39,7 +39,7 @@
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
||||
#ifdef linux
|
||||
#if defined(linux) || defined(__FreeBSD__)
|
||||
#ifndef __64BIT__
|
||||
#define M r3
|
||||
#define N r4
|
||||
@@ -252,7 +252,7 @@
|
||||
stw r27, 196(SP)
|
||||
#endif
|
||||
|
||||
#ifdef linux
|
||||
#if defined(linux) || defined(__FreeBSD__)
|
||||
#ifndef __64BIT__
|
||||
lwz INCY, FRAMESLOT(0) + STACKSIZE(SP)
|
||||
lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP)
|
||||
|
||||
@@ -39,7 +39,7 @@
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
||||
#ifdef linux
|
||||
#if defined(linux) || defined(__FreeBSD__)
|
||||
#ifndef __64BIT__
|
||||
#define M r3
|
||||
#define N r4
|
||||
@@ -199,7 +199,7 @@
|
||||
stw r23, 180(SP)
|
||||
#endif
|
||||
|
||||
#ifdef linux
|
||||
#if defined(linux) || defined(__FreeBSD__)
|
||||
#ifndef __64BIT__
|
||||
lwz INCY, FRAMESLOT(0) + STACKSIZE(SP)
|
||||
lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP)
|
||||
|
||||
@@ -39,7 +39,7 @@
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
||||
#ifdef linux
|
||||
#if defined(linux) || defined(__FreeBSD__)
|
||||
#ifndef __64BIT__
|
||||
#define M r3
|
||||
#define N r4
|
||||
@@ -260,7 +260,7 @@
|
||||
stw r29, 220(SP)
|
||||
#endif
|
||||
|
||||
#ifdef linux
|
||||
#if defined(linux) || defined(__FreeBSD__)
|
||||
#ifndef __64BIT__
|
||||
lwz INCY, FRAMESLOT(0) + STACKSIZE(SP)
|
||||
lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP)
|
||||
|
||||
@@ -39,7 +39,7 @@
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
||||
#ifdef linux
|
||||
#if defined(linux) || defined(__FreeBSD__)
|
||||
#ifndef __64BIT__
|
||||
#define M r3
|
||||
#define N r4
|
||||
@@ -190,7 +190,7 @@
|
||||
stw r22, 192(SP)
|
||||
#endif
|
||||
|
||||
#ifdef linux
|
||||
#if defined(linux) || defined(__FreeBSD__)
|
||||
#ifndef __64BIT__
|
||||
lwz INCY, FRAMESLOT(0) + STACKSIZE(SP)
|
||||
lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP)
|
||||
|
||||
@@ -47,7 +47,7 @@
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef linux
|
||||
#if defined(linux) || defined(__FreeBSD__)
|
||||
#ifndef __64BIT__
|
||||
#define M r3
|
||||
#define N r4
|
||||
@@ -224,7 +224,7 @@
|
||||
stw r27, 196(SP)
|
||||
#endif
|
||||
|
||||
#ifdef linux
|
||||
#if defined(linux) || defined(__FreeBSD__)
|
||||
#ifndef __64BIT__
|
||||
lwz LDA, FRAMESLOT(0) + STACKSIZE(SP)
|
||||
lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP)
|
||||
|
||||
458
kernel/power/icamax_power8.S
Normal file
458
kernel/power/icamax_power8.S
Normal file
@@ -0,0 +1,458 @@
|
||||
/*    .file   "icamax.c"
      .abiversion 2
      .section        ".text"
      .align 2
      .p2align 4,,15
      .globl icamax_k
      .type   icamax_k, @function
*/
/*
 * icamax_k -- POWER8 kernel (compiler-generated body wrapped for common.h).
 *
 * Register usage as seen in the code:
 *   r3  in : element count
 *       out: 1-based position of the selected element, or 0 when the
 *            count (r3) or the stride (r5) is <= 0 on entry
 *   r4  in : address of the first element; every element is two
 *            consecutive 4-byte floats
 *   r5  in : stride in elements (multiplied by 8 bytes in the strided path)
 *
 * The value compared per element is fabs(first float) + fabs(second float);
 * the element with the largest value is selected.  The scalar loops only
 * replace the current best on a strictly greater value.
 *
 * The entry symbol is provided by PROLOGUE (the sibling icamin_power8.S
 * relies on exactly that), so no explicit "icamax_k:" label is emitted
 * here, and .localentry/.size are limited to ELFv2 as in that sibling.
 *
 * The vector path stores r31 and v24-v31 below r1 without moving r1.
 */
#define ASSEMBLER
#include "common.h"

PROLOGUE

.LCF0:
0:	addis 2,12,.TOC.-.LCF0@ha
	addi 2,2,.TOC.-.LCF0@l
#if _CALL_ELF ==2
	.localentry icamax_k,.-icamax_k
#endif
	/* r9 = count; count <= 0 or stride <= 0 returns 0; stride 1 -> .L54 */
	mr. 9,3
	ble 0,.L25
	cmpdi 7,5,0
	li 3,0
	blelr 7
	cmpdi 7,5,1
	beq 7,.L54
	/* Strided path: f11 = value of element 0 (running maximum) */
	lfs 11,0(4)
	lfs 0,4(4)
	cmpdi 7,9,1
	fabs 11,11
	fabs 0,0
	fadds 11,11,0
	beq 7,.L29
	/* CTR = count - 1, r5 = stride in bytes, r3 = best index, r9 = index */
	addi 9,9,-1
	sldi 5,5,3
	mtctr 9
	add 4,4,5
	li 3,0
	li 9,1
	.p2align 4,,15
.L24:
	lfs 0,4(4)
	lfs 12,0(4)
	add 4,4,5
	fabs 0,0
	fabs 12,12
	fadds 0,0,12
	fcmpu 7,0,11
	bng 7,.L23
	fmr 11,0
	mr 3,9
.L23:
	addi 9,9,1
	bdnz .L24
	/* Common exit of the scalar-only paths: make the index 1-based */
.L52:
	addi 3,3,1
	blr
	.p2align 4,,15
.L25:
	li 3,0
	blr
	.p2align 4,,15
	/* Stride 1: r8 = count with the low five bits cleared (multiple of 32) */
.L54:
	rldicr. 8,9,0,58
	bne 0,.L55
	/* Fewer than 32 elements: scalar loop from index 0 with f11 = 0 */
	addi 7,8,1
	li 10,0
	xxlxor 11,11,11
	cmpd 7,7,9
	sldi 10,10,2
	add 4,4,10
	subf 10,8,9
	mtctr 10
	li 3,0
	bgt 7,.L43
	/* r10 = 0x8000000000000000; that count value also takes the .L43 entry */
	li 10,-1
	rldicr 10,10,0,0
	cmpd 7,9,10
	beq 7,.L43
	.p2align 4,,15
.L44:
	lfs 0,4(4)
	lfs 12,0(4)
	addi 4,4,8
	fabs 0,0
	fabs 12,12
	fadds 0,0,12
	fcmpu 7,0,11
	bng 7,.L46
	fmr 11,0
	mr 3,8
.L46:
	addi 8,8,1
	bdnz .L44
	b .L52
	.p2align 4,,15
	/*
	 * Vector path (count >= 32).  Interleaved below:
	 *   - r31 and v24-v31 are stored at -8(r1) and r1-144 .. r1-32
	 *   - v18 (vs50) = 0 : lane indices of the running maximum
	 *     v19 (vs51) = 0 : lane values of the running maximum
	 *     v17 (vs49) = 0 : index base of the current 32-element block
	 *   - .LC2/.LC3 are loaded and complemented (xxlnand) for use as
	 *     vperm controls; .LC4-.LC7 supply the word values 0..15,
	 *     .LC8 supplies 32 per word, v29 becomes 8+8 = 16 per word
	 *   - r10 = working pointer, r6 = elements consumed so far
	 */
.L55:
	li 0,-144
	std 31,-8(1)
	addis 5,2,.LC2@toc@ha
	vspltisw 18,0
	vspltisw 19,0
	addis 6,2,.LC3@toc@ha
	addi 5,5,.LC2@toc@l
	stvx 24,1,0
	li 0,-128
	addi 6,6,.LC3@toc@l
	xxlor 49,50,50
	addis 7,2,.LC4@toc@ha
	lxvd2x 44,0,5
	addis 10,2,.LC5@toc@ha
	stvx 25,1,0
	li 0,-112
	addi 7,7,.LC4@toc@l
	lxvd2x 45,0,6
	addis 5,2,.LC6@toc@ha
	addis 6,2,.LC7@toc@ha
	stvx 26,1,0
	li 0,-96
	addi 10,10,.LC5@toc@l
	addi 6,6,.LC7@toc@l
	addi 5,5,.LC6@toc@l
	stvx 27,1,0
	li 0,-80
	lxvd2x 46,0,10
	xxpermdi 44,44,44,2
	mr 10,4
	lxvd2x 48,0,6
	lxvd2x 47,0,5
	xxpermdi 45,45,45,2
	li 6,0
	stvx 28,1,0
	li 0,-64
	xxlnand 44,44,44
	xxlnand 45,45,45
	stvx 29,1,0
	li 0,-48
	vspltisw 29,8
	vadduwm 29,29,29
	xxpermdi 46,46,46,2
	stvx 30,1,0
	li 0,-32
	xxpermdi 47,47,47,2
	xxpermdi 48,48,48,2
	stvx 31,1,0
	lxvd2x 63,0,7
	addis 7,2,.LC8@toc@ha
	addi 7,7,.LC8@toc@l
	lxvd2x 62,0,7
	xxpermdi 63,63,63,2
	.p2align 4,,15
	/*
	 * Main loop: every pass loads sixteen 16-byte vectors (256 bytes,
	 * i.e. 32 elements), takes absolute values, separates even and odd
	 * words with vperm, adds them, and keeps the larger value together
	 * with its index through xvcmpgtsp/xxsel.  r6 += 32 per pass; the
	 * loop repeats while r8 > r6.
	 */
.L5:
	addi 3,10,16
	addi 5,10,32
	lxvd2x 34,0,10
	addi 7,10,64
	addi 31,10,48
	addi 12,10,80
	addi 11,10,96
	lxvd2x 36,0,3
	lxvd2x 37,0,5
	addi 3,10,112
	addi 5,10,128
	lxvd2x 38,0,7
	lxvd2x 7,0,31
	addi 7,10,160
	addi 31,10,144
	lxvd2x 33,0,12
	lxvd2x 39,0,11
	addi 12,10,176
	addi 11,10,192
	lxvd2x 8,0,3
	lxvd2x 40,0,5
	xxpermdi 34,34,34,2
	addi 3,10,208
	addi 5,10,224
	lxvd2x 41,0,7
	lxvd2x 9,0,31
	addi 7,10,240
	lxvd2x 10,0,12
	lxvd2x 42,0,11
	xxpermdi 37,37,37,2
	xxpermdi 36,36,36,2
	addi 6,6,32
	lxvd2x 32,0,3
	lxvd2x 43,0,5
	xxpermdi 7,7,7,2
	xxpermdi 38,38,38,2
	cmpd 7,8,6
	addi 10,10,256
	lxvd2x 11,0,7
	xxpermdi 39,39,39,2
	xxpermdi 33,33,33,2
	xxpermdi 40,40,40,2
	xxpermdi 8,8,8,2
	xxpermdi 41,41,41,2
	xxpermdi 9,9,9,2
	xxpermdi 10,10,10,2
	xxpermdi 42,42,42,2
	xxpermdi 43,43,43,2
	xxpermdi 32,32,32,2
	xxpermdi 11,11,11,2
	xvabssp 57,37
	xvabssp 58,39
	xvabssp 35,40
	xvabssp 59,41
	xvabssp 34,34
	xvabssp 33,33
	xvabssp 32,32
	xvabssp 60,43
	xvabssp 36,36
	xvabssp 37,7
	xvabssp 38,38
	xvabssp 39,8
	xvabssp 40,9
	xvabssp 41,10
	xvabssp 42,42
	xvabssp 43,11
	vperm 24,4,2,12
	vperm 4,4,2,13
	vperm 2,5,25,12
	vperm 5,5,25,13
	vperm 25,1,6,12
	vperm 6,1,6,13
	vperm 1,7,26,12
	vperm 7,7,26,13
	vperm 26,8,3,12
	vperm 8,8,3,13
	vperm 3,9,27,12
	vperm 9,9,27,13
	vperm 27,0,10,12
	vperm 10,0,10,13
	vperm 0,11,28,12
	vperm 11,11,28,13
	xvaddsp 12,33,39
	xvaddsp 38,57,38
	xvaddsp 0,32,43
	xvaddsp 42,59,42
	xvaddsp 36,56,36
	xvaddsp 37,34,37
	xvaddsp 40,58,40
	xvaddsp 41,35,41
	xvcmpgtsp 32,12,38
	xvcmpgtsp 33,0,42
	xvcmpgtsp 43,37,36
	xvcmpgtsp 39,41,40
	xxsel 12,38,12,32
	xxsel 38,47,48,32
	xxsel 0,42,0,33
	xxsel 42,47,48,33
	xxsel 37,36,37,43
	xxsel 43,63,46,43
	xxsel 41,40,41,39
	xxsel 39,63,46,39
	xvcmpgtsp 32,12,37
	xvcmpgtsp 33,0,41
	xxsel 12,37,12,32
	xxsel 43,43,38,32
	xxsel 0,41,0,33
	xxsel 33,39,42,33
	xvcmpgtsp 32,0,12
	vadduwm 1,1,29
	xxsel 0,12,0,32
	xxsel 32,43,33,32
	xvcmpgtsp 33,0,51
	vadduwm 0,17,0
	vadduwm 17,17,30
	xxsel 50,50,32,33
	xxsel 51,51,0,33
	bgt 7,.L5
	/*
	 * Horizontal reduction: the four lanes of vs51 (values) and v18
	 * (indices) are moved to scalar registers and compared pairwise;
	 * when two values compare equal the smaller index is kept.
	 * Result: f11 = maximum so far, r3 = its 0-based index.
	 */
	xxsldwi 11,51,51,3
	xxsldwi 12,51,51,2
	vspltw 0,18,3
	xxsldwi 0,51,51,1
	xscvspdp 11,11
	xscvspdp 12,12
	mfvsrwz 6,32
	vspltw 0,18,2
	xscvspdp 0,0
	mfvsrwz 7,50
	mfvsrwz 5,32
	vspltw 0,18,0
	xscvspdp 51,51
	mfvsrwz 10,32
	fcmpu 7,11,12
	rldicl 3,6,0,32
	fmr 10,0
	rldicl 11,7,0,32
	rldicl 31,5,0,32
	rldicl 0,10,0,32
	beq 7,.L56
	bnl 7,.L8
	fmr 11,12
	mr 3,31
.L8:
	xscmpudp 7,0,51
	bne 7,.L11
	cmplw 7,7,10
	ble 7,.L12
	mr 7,10
.L12:
	rldicl 11,7,0,32
.L13:
	fcmpu 7,11,10
	beq 7,.L57
	blt 7,.L58
	/* Scalar tail: elements r8 .. r9-1 left over after the vector loop */
.L17:
	cmpd 7,9,8
	ble 7,.L19
	addi 7,8,1
	sldi 10,8,1
	cmpd 7,7,9
	sldi 10,10,2
	add 4,4,10
	subf 10,8,9
	mtctr 10
	bgt 7,.L37
	li 10,-1
	rldicr 10,10,0,0
	cmpd 7,9,10
	beq 7,.L37
	.p2align 4,,15
.L21:
	lfs 0,4(4)
	lfs 12,0(4)
	addi 4,4,8
	fabs 0,0
	fabs 12,12
	fadds 0,0,12
	fcmpu 7,0,11
	bng 7,.L20
	fmr 11,0
	mr 3,8
.L20:
	addi 8,8,1
	bdnz .L21
	/* Vector-path exit: restore r31 and v24-v31, make the index 1-based */
.L19:
	li 0,-144
	ld 31,-8(1)
	addi 3,3,1
	lvx 24,1,0
	li 0,-128
	lvx 25,1,0
	li 0,-112
	lvx 26,1,0
	li 0,-96
	lvx 27,1,0
	li 0,-80
	lvx 28,1,0
	li 0,-64
	lvx 29,1,0
	li 0,-48
	lvx 30,1,0
	li 0,-32
	lvx 31,1,0
	blr
	.p2align 4,,15
	/* Out-of-line pieces of the lane reduction */
.L56:
	cmplw 7,6,5
	ble 7,.L7
	mr 6,5
.L7:
	rldicl 3,6,0,32
	b .L8
	.p2align 4,,15
	/* Strided path with a count of exactly 1 */
.L29:
	li 3,1
	blr
	.p2align 4,,15
.L11:
	bnl 7,.L13
	xscpsgndp 10,51,51
	mr 11,0
	b .L13
	.p2align 4,,15
.L57:
	cmpd 7,3,11
	ble 7,.L17
	mr 3,11
	b .L17
	.p2align 4,,15
.L58:
	fmr 11,10
	mr 3,11
	b .L17
	/* Alternate entries into the scalar loops that force CTR = 1 */
.L43:
	li 9,1
	mtctr 9
	b .L44
.L37:
	li 9,1
	mtctr 9
	b .L21
	.long 0
	.byte 0,0,0,0,0,1,0,0
#if _CALL_ELF ==2
	.size icamax_k,.-icamax_k
#endif
|
||||
/* 16-byte constant vectors, one label per vector. */
	.section .rodata.cst16,"aM",@progbits,16
	.p2align 4
/* Byte numbers 0-3, 8-11, 16-19, 24-27. */
.LC2:
	.byte 0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27
/* Byte numbers 4-7, 12-15, 20-23, 28-31. */
.LC3:
	.byte 4,5,6,7,12,13,14,15,20,21,22,23,28,29,30,31
/* Word values 0..15, four per vector. */
.LC4:
	.long 0,1,2,3
.LC5:
	.long 4,5,6,7
.LC6:
	.long 8,9,10,11
.LC7:
	.long 12,13,14,15
/* The value 32 in every word. */
.LC8:
	.long 32,32,32,32
	.ident "GCC: (SUSE Linux) 7.3.1 20180323 [gcc-7-branch revision 258812]"
	.section .note.GNU-stack,"",@progbits
|
||||
387
kernel/power/icamax_power9.S
Normal file
387
kernel/power/icamax_power9.S
Normal file
@@ -0,0 +1,387 @@
|
||||
/*
 * icamax_k, POWER9 build.
 * As used by the code: r3 = element count, r4 = address of the first
 * element (two consecutive 4-byte floats per element), r5 = stride in
 * elements.  r3 returns the 1-based position of the element with the
 * largest fabs(first float) + fabs(second float), or 0 when r3 <= 0 or
 * r5 <= 0 on entry.
 */
.file "icamax.c"
|
||||
.abiversion 2
|
||||
.section ".text"
|
||||
.align 2
|
||||
.p2align 4,,15
|
||||
.globl icamax_k
|
||||
.type icamax_k, @function
|
||||
icamax_k:
|
||||
.LCF0:
|
||||
0: addis 2,12,.TOC.-.LCF0@ha
|
||||
addi 2,2,.TOC.-.LCF0@l
|
||||
.localentry icamax_k,.-icamax_k
|
||||
/* r9 = count; count <= 0 or stride <= 0 returns 0; stride 1 goes to .L53 */
mr. 9,3
|
||||
ble 0,.L25
|
||||
cmpdi 7,5,0
|
||||
li 3,0
|
||||
blelr 7
|
||||
cmpdi 7,5,1
|
||||
beq 7,.L53
|
||||
/* Strided path: f11 = value of element 0 (running maximum) */
lfs 11,0(4)
|
||||
lfs 0,4(4)
|
||||
cmpdi 7,9,1
|
||||
fabs 11,11
|
||||
fabs 0,0
|
||||
fadds 11,11,0
|
||||
beq 7,.L29
|
||||
/* CTR = count - 1, r5 = stride in bytes, r3 = best index, r9 = current index */
addi 9,9,-1
|
||||
sldi 5,5,3
|
||||
li 3,0
|
||||
mtctr 9
|
||||
add 4,4,5
|
||||
li 9,1
|
||||
.p2align 4,,15
|
||||
.L24:
|
||||
lfs 0,4(4)
|
||||
lfs 12,0(4)
|
||||
add 4,4,5
|
||||
fabs 0,0
|
||||
fabs 12,12
|
||||
fadds 0,0,12
|
||||
fcmpu 7,0,11
|
||||
bng 7,.L23
|
||||
fmr 11,0
|
||||
mr 3,9
|
||||
.L23:
|
||||
addi 9,9,1
|
||||
bdnz .L24
|
||||
/* Common exit of the scalar-only paths: make the index 1-based */
.L51:
|
||||
addi 3,3,1
|
||||
blr
|
||||
.p2align 4,,15
|
||||
.L25:
|
||||
li 3,0
|
||||
blr
|
||||
.p2align 4,,15
|
||||
/* Stride 1: r8 = count with the low five bits cleared (multiple of 32) */
.L53:
|
||||
rldicr. 8,9,0,58
|
||||
bne 0,.L54
|
||||
/* Fewer than 32 elements: scalar loop from index 0 with f11 = 0 */
addi 7,8,1
|
||||
li 10,0
|
||||
subf 6,8,9
|
||||
li 3,0
|
||||
xxlxor 11,11,11
|
||||
cmpd 7,7,9
|
||||
sldi 10,10,2
|
||||
mtctr 6
|
||||
add 4,4,10
|
||||
bgt 7,.L43
|
||||
/* r10 = 0x8000000000000000; that count value also takes the .L43 entry */
li 10,-1
|
||||
rldicr 10,10,0,0
|
||||
cmpd 7,9,10
|
||||
beq 7,.L43
|
||||
.p2align 4,,15
|
||||
.L44:
|
||||
lfs 0,4(4)
|
||||
lfs 12,0(4)
|
||||
addi 4,4,8
|
||||
fabs 0,0
|
||||
fabs 12,12
|
||||
fadds 0,0,12
|
||||
fcmpu 7,0,11
|
||||
bng 7,.L46
|
||||
fmr 11,0
|
||||
mr 3,8
|
||||
.L46:
|
||||
addi 8,8,1
|
||||
bdnz .L44
|
||||
b .L51
|
||||
.p2align 4,,15
|
||||
/*
 * Vector path (count >= 32).  Interleaved below:
 *   - vs56-vs63 are stored at -128(r1) .. -16(r1) without moving r1
 *   - vs47 (v15) = 0 : lane indices of the running maximum
 *     vs48 (v16) = 0 : lane values of the running maximum
 *     vs46 (v14) = 0 : index base of the current 32-element block
 *   - .LC2/.LC3 are loaded as vpermr controls, .LC4-.LC7 supply the
 *     word values 0..15; v26 and v27 become 16 and 32 per word
 *   - r10 = working pointer, r7 = elements consumed so far
 */
.L54:
|
||||
addis 11,2,.LC2@toc@ha
|
||||
addis 3,2,.LC3@toc@ha
|
||||
addis 5,2,.LC6@toc@ha
|
||||
addis 6,2,.LC7@toc@ha
|
||||
xxspltib 47,0
|
||||
addis 7,2,.LC4@toc@ha
|
||||
addis 10,2,.LC5@toc@ha
|
||||
stxv 58,-96(1)
|
||||
stxv 59,-80(1)
|
||||
addi 11,11,.LC2@toc@l
|
||||
addi 3,3,.LC3@toc@l
|
||||
addi 5,5,.LC6@toc@l
|
||||
addi 6,6,.LC7@toc@l
|
||||
stxv 62,-32(1)
|
||||
stxv 63,-16(1)
|
||||
xxspltib 58,16
|
||||
addi 7,7,.LC4@toc@l
|
||||
addi 10,10,.LC5@toc@l
|
||||
xxspltib 59,32
|
||||
lxv 44,0(11)
|
||||
lxv 45,0(3)
|
||||
xxspltib 48,0
|
||||
lxv 62,0(5)
|
||||
xxlor 46,47,47
|
||||
lxv 63,0(6)
|
||||
stxv 60,-64(1)
|
||||
stxv 61,-48(1)
|
||||
lxv 60,0(7)
|
||||
lxv 61,0(10)
|
||||
li 7,0
|
||||
mr 10,4
|
||||
vextsb2w 26,26
|
||||
vextsb2w 27,27
|
||||
stxv 56,-128(1)
|
||||
stxv 57,-112(1)
|
||||
.p2align 4,,15
|
||||
/*
 * Main loop: r10 is advanced by 256 bytes (32 elements) and r7 by 32 at
 * the top of each pass, so the loads use displacements -240 .. -16.
 * Absolute values are taken, even and odd words are separated with
 * vpermr and added, and the larger value is kept together with its
 * index through xvcmpgtsp/xxsel.  The loop repeats while r8 > r7.
 */
.L5:
|
||||
lxv 0,0(10)
|
||||
addi 7,7,32
|
||||
addi 10,10,256
|
||||
cmpd 7,8,7
|
||||
xvabssp 34,0
|
||||
lxv 0,-240(10)
|
||||
xvabssp 42,0
|
||||
lxv 0,-224(10)
|
||||
xvabssp 49,0
|
||||
lxv 0,-208(10)
|
||||
vpermr 25,10,2,12
|
||||
vpermr 2,10,2,13
|
||||
xvabssp 35,0
|
||||
lxv 0,-192(10)
|
||||
xvaddsp 34,57,34
|
||||
xvabssp 36,0
|
||||
lxv 0,-176(10)
|
||||
vpermr 10,3,17,12
|
||||
vpermr 3,3,17,13
|
||||
xvabssp 33,0
|
||||
lxv 0,-160(10)
|
||||
xvaddsp 10,42,35
|
||||
xvabssp 50,0
|
||||
lxv 0,-144(10)
|
||||
vpermr 17,1,4,12
|
||||
vpermr 4,1,4,13
|
||||
xvabssp 37,0
|
||||
lxv 0,-128(10)
|
||||
xvaddsp 36,49,36
|
||||
xvabssp 38,0
|
||||
lxv 0,-112(10)
|
||||
vpermr 1,5,18,12
|
||||
vpermr 5,5,18,13
|
||||
xvabssp 43,0
|
||||
lxv 0,-96(10)
|
||||
xvaddsp 12,33,37
|
||||
xvabssp 51,0
|
||||
lxv 0,-80(10)
|
||||
vpermr 18,11,6,12
|
||||
vpermr 6,11,6,13
|
||||
xvabssp 39,0
|
||||
lxv 0,-64(10)
|
||||
xvaddsp 38,50,38
|
||||
xvabssp 40,0
|
||||
lxv 0,-48(10)
|
||||
vpermr 11,7,19,12
|
||||
vpermr 7,7,19,13
|
||||
xvabssp 32,0
|
||||
lxv 0,-32(10)
|
||||
xvaddsp 11,43,39
|
||||
xvcmpgtsp 39,10,34
|
||||
xvcmpgtsp 43,12,36
|
||||
xvabssp 56,0
|
||||
lxv 0,-16(10)
|
||||
vpermr 19,0,8,12
|
||||
vpermr 8,0,8,13
|
||||
xxsel 10,34,10,39
|
||||
xxsel 12,36,12,43
|
||||
xxsel 39,60,61,39
|
||||
xxsel 43,62,63,43
|
||||
xvabssp 41,0
|
||||
xvaddsp 40,51,40
|
||||
vpermr 0,9,24,12
|
||||
vpermr 9,9,24,13
|
||||
xvaddsp 0,32,41
|
||||
xvcmpgtsp 41,11,38
|
||||
xvcmpgtsp 32,12,10
|
||||
xvcmpgtsp 42,0,40
|
||||
xxsel 11,38,11,41
|
||||
xxsel 12,10,12,32
|
||||
xxsel 43,39,43,32
|
||||
xxsel 41,60,61,41
|
||||
xxsel 0,40,0,42
|
||||
xxsel 42,62,63,42
|
||||
xvcmpgtsp 33,0,11
|
||||
xxsel 0,11,0,33
|
||||
xxsel 33,41,42,33
|
||||
xvcmpgtsp 32,0,12
|
||||
vadduwm 1,1,26
|
||||
xxsel 0,12,0,32
|
||||
xxsel 32,43,33,32
|
||||
xvcmpgtsp 33,0,48
|
||||
vadduwm 0,14,0
|
||||
vadduwm 14,14,27
|
||||
xxsel 47,47,32,33
|
||||
xxsel 48,48,0,33
|
||||
bgt 7,.L5
|
||||
/*
 * Horizontal reduction: the four lanes of vs48 (values) and v15
 * (indices, extracted with vextuwrx) are compared pairwise; when two
 * values compare equal the smaller index is kept.
 * Result: f11 = maximum so far, r3 = its 0-based index.
 */
xxsldwi 11,48,48,3
|
||||
xxsldwi 12,48,48,2
|
||||
li 10,0
|
||||
li 3,12
|
||||
xxsldwi 0,48,48,1
|
||||
xscvspdp 48,48
|
||||
vextuwrx 6,10,15
|
||||
li 10,4
|
||||
xscvspdp 11,11
|
||||
xscvspdp 12,12
|
||||
xscvspdp 0,0
|
||||
vextuwrx 5,10,15
|
||||
li 10,8
|
||||
vextuwrx 7,10,15
|
||||
vextuwrx 10,3,15
|
||||
rldicl 12,5,0,32
|
||||
rldicl 3,6,0,32
|
||||
rldicl 11,7,0,32
|
||||
rldicl 0,10,0,32
|
||||
fcmpu 7,11,12
|
||||
fmr 10,0
|
||||
beq 7,.L55
|
||||
bnl 7,.L8
|
||||
mr 3,12
|
||||
fmr 11,12
|
||||
.L8:
|
||||
xscmpudp 7,0,48
|
||||
bne 7,.L11
|
||||
cmplw 7,7,10
|
||||
ble 7,.L12
|
||||
mr 7,10
|
||||
.L12:
|
||||
rldicl 11,7,0,32
|
||||
.L13:
|
||||
fcmpu 7,11,10
|
||||
beq 7,.L56
|
||||
bnl 7,.L17
|
||||
mr 3,11
|
||||
fmr 11,10
|
||||
/* Scalar tail: elements r8 .. r9-1 left over after the vector loop */
.L17:
|
||||
cmpd 7,9,8
|
||||
ble 7,.L19
|
||||
addi 7,8,1
|
||||
sldi 10,8,1
|
||||
subf 6,8,9
|
||||
cmpd 7,7,9
|
||||
sldi 10,10,2
|
||||
mtctr 6
|
||||
add 4,4,10
|
||||
bgt 7,.L37
|
||||
li 10,-1
|
||||
rldicr 10,10,0,0
|
||||
cmpd 7,9,10
|
||||
beq 7,.L37
|
||||
.p2align 4,,15
|
||||
.L21:
|
||||
lfs 0,4(4)
|
||||
lfs 12,0(4)
|
||||
addi 4,4,8
|
||||
fabs 0,0
|
||||
fabs 12,12
|
||||
fadds 0,0,12
|
||||
fcmpu 7,0,11
|
||||
bng 7,.L20
|
||||
fmr 11,0
|
||||
mr 3,8
|
||||
.L20:
|
||||
addi 8,8,1
|
||||
bdnz .L21
|
||||
/* Vector-path exit: restore vs56-vs63, make the index 1-based, return */
.L19:
|
||||
lxv 56,-128(1)
|
||||
lxv 57,-112(1)
|
||||
addi 3,3,1
|
||||
lxv 58,-96(1)
|
||||
lxv 59,-80(1)
|
||||
lxv 60,-64(1)
|
||||
lxv 61,-48(1)
|
||||
lxv 62,-32(1)
|
||||
lxv 63,-16(1)
|
||||
blr
|
||||
.p2align 4,,15
|
||||
/* Out-of-line pieces of the lane reduction */
.L55:
|
||||
cmplw 7,6,5
|
||||
ble 7,.L7
|
||||
mr 6,5
|
||||
.L7:
|
||||
rldicl 3,6,0,32
|
||||
b .L8
|
||||
.p2align 4,,15
|
||||
/* Strided path with a count of exactly 1 */
.L29:
|
||||
li 3,1
|
||||
blr
|
||||
.p2align 4,,15
|
||||
.L11:
|
||||
bnl 7,.L13
|
||||
mr 11,0
|
||||
xscpsgndp 10,48,48
|
||||
b .L13
|
||||
.p2align 4,,15
|
||||
.L56:
|
||||
cmpd 7,3,11
|
||||
ble 7,.L17
|
||||
mr 3,11
|
||||
b .L17
|
||||
/* Alternate entries into the scalar loops that force CTR = 1 */
.L37:
|
||||
li 9,1
|
||||
mtctr 9
|
||||
b .L21
|
||||
.L43:
|
||||
li 9,1
|
||||
mtctr 9
|
||||
b .L44
|
||||
.long 0
|
||||
.byte 0,0,0,0,0,0,0,0
|
||||
.size icamax_k,.-icamax_k
|
||||
/* 16-byte constant vectors, one label per vector. */
	.section .rodata.cst16,"aM",@progbits,16
	.p2align 4
/* Byte numbers 0-3, 8-11, 16-19, 24-27. */
.LC2:
	.byte 0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27
/* Byte numbers 4-7, 12-15, 20-23, 28-31. */
.LC3:
	.byte 4,5,6,7,12,13,14,15,20,21,22,23,28,29,30,31
/* Word values 0..15, four per vector. */
.LC4:
	.long 0,1,2,3
.LC5:
	.long 4,5,6,7
.LC6:
	.long 8,9,10,11
.LC7:
	.long 12,13,14,15
	.ident "GCC: (SUSE Linux) 7.3.1 20180323 [gcc-7-branch revision 258812]"
	.section .note.GNU-stack,"",@progbits
|
||||
457
kernel/power/icamin_power8.S
Normal file
457
kernel/power/icamin_power8.S
Normal file
@@ -0,0 +1,457 @@
|
||||
/* .file "icamin.c"
|
||||
.abiversion 2
|
||||
.section ".text"
|
||||
.align 2
|
||||
.p2align 4,,15
|
||||
.globl icamin_k
|
||||
.type icamin_k, @function
|
||||
*/
|
||||
/*
 * icamin_k, POWER8 build.
 * As used by the code: r3 = element count, r4 = address of the first
 * element (two consecutive 4-byte floats per element), r5 = stride in
 * elements.  r3 returns the 1-based position of the element with the
 * smallest fabs(first float) + fabs(second float), or 0 when r3 <= 0 or
 * r5 <= 0 on entry.
 * No entry label is written here; .localentry/.size refer to icamin_k,
 * so the symbol is expected to come from PROLOGUE (defined in common.h,
 * not visible in this file).
 */
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
||||
PROLOGUE
|
||||
|
||||
.LCF0:
|
||||
0: addis 2,12,.TOC.-.LCF0@ha
|
||||
addi 2,2,.TOC.-.LCF0@l
|
||||
#if _CALL_ELF ==2
|
||||
.localentry icamin_k,.-icamin_k
|
||||
#endif
|
||||
/* r9 = count; count <= 0 or stride <= 0 returns 0 */
mr. 9,3
|
||||
ble 0,.L25
|
||||
cmpdi 7,5,0
|
||||
li 3,0
|
||||
blelr 7
|
||||
/* f11 = value of element 0 (running minimum); stride 1 goes to .L54 */
lfs 11,0(4)
|
||||
lfs 0,4(4)
|
||||
cmpdi 7,5,1
|
||||
fabs 11,11
|
||||
fabs 0,0
|
||||
fadds 11,11,0
|
||||
beq 7,.L54
|
||||
cmpdi 7,9,1
|
||||
beq 7,.L29
|
||||
/* Strided path: CTR = count - 1, r5 = stride in bytes, r3 = best index, r9 = current index */
addi 9,9,-1
|
||||
sldi 5,5,3
|
||||
mtctr 9
|
||||
add 4,4,5
|
||||
li 3,0
|
||||
li 9,1
|
||||
.p2align 4,,15
|
||||
.L24:
|
||||
lfs 0,4(4)
|
||||
lfs 12,0(4)
|
||||
add 4,4,5
|
||||
fabs 0,0
|
||||
fabs 12,12
|
||||
fadds 0,0,12
|
||||
fcmpu 7,0,11
|
||||
bnl 7,.L23
|
||||
fmr 11,0
|
||||
mr 3,9
|
||||
.L23:
|
||||
addi 9,9,1
|
||||
bdnz .L24
|
||||
/* Common exit of the scalar-only paths: make the index 1-based */
.L52:
|
||||
addi 3,3,1
|
||||
blr
|
||||
.p2align 4,,15
|
||||
.L25:
|
||||
li 3,0
|
||||
blr
|
||||
.p2align 4,,15
|
||||
/* Stride 1: r8 = count with the low five bits cleared (multiple of 32) */
.L54:
|
||||
rldicr. 8,9,0,58
|
||||
bne 0,.L55
|
||||
/* Fewer than 32 elements: scalar loop from index 0, f11 still holds element 0 */
addi 7,8,1
|
||||
li 10,0
|
||||
cmpd 7,7,9
|
||||
sldi 10,10,2
|
||||
add 4,4,10
|
||||
subf 10,8,9
|
||||
mtctr 10
|
||||
li 3,0
|
||||
bgt 7,.L43
|
||||
/* r10 = 0x8000000000000000; that count value also takes the .L43 entry */
li 10,-1
|
||||
rldicr 10,10,0,0
|
||||
cmpd 7,9,10
|
||||
beq 7,.L43
|
||||
.p2align 4,,15
|
||||
.L44:
|
||||
lfs 0,0(4)
|
||||
lfs 12,4(4)
|
||||
addi 4,4,8
|
||||
fabs 0,0
|
||||
fabs 12,12
|
||||
fadds 0,0,12
|
||||
fcmpu 7,11,0
|
||||
bng 7,.L46
|
||||
fmr 11,0
|
||||
mr 3,8
|
||||
.L46:
|
||||
addi 8,8,1
|
||||
bdnz .L44
|
||||
b .L52
|
||||
.p2align 4,,15
|
||||
/*
 * Vector path (count >= 32).  Interleaved below:
 *   - r31 and v25-v31 are stored at -8(r1) and r1-128 .. r1-32
 *     without moving r1
 *   - f11 (value of element 0) is converted and splatted into vs5,
 *     the lane values of the running minimum
 *   - v19 (vs51) = 0 : lane indices of the running minimum
 *     v18 (vs50) = 0 : index base of the current 32-element block
 *   - .LC2/.LC3 are loaded and complemented (xxlnand) for use as
 *     vperm controls; .LC4-.LC7 supply the word values 0..15,
 *     .LC8 supplies 32 per word, v31 becomes 8+8 = 16 per word
 *   - r10 = working pointer, r6 = elements consumed so far
 */
.L55:
|
||||
li 0,-128
|
||||
std 31,-8(1)
|
||||
addis 5,2,.LC2@toc@ha
|
||||
xscvdpspn 11,11
|
||||
vspltisw 19,0
|
||||
addis 6,2,.LC3@toc@ha
|
||||
addi 5,5,.LC2@toc@l
|
||||
stvx 25,1,0
|
||||
li 0,-112
|
||||
addi 6,6,.LC3@toc@l
|
||||
xxlor 50,51,51
|
||||
addis 7,2,.LC4@toc@ha
|
||||
lxvd2x 44,0,5
|
||||
addis 10,2,.LC5@toc@ha
|
||||
stvx 26,1,0
|
||||
li 0,-96
|
||||
addi 7,7,.LC4@toc@l
|
||||
lxvd2x 45,0,6
|
||||
addis 5,2,.LC6@toc@ha
|
||||
addis 6,2,.LC7@toc@ha
|
||||
stvx 27,1,0
|
||||
li 0,-80
|
||||
addi 10,10,.LC5@toc@l
|
||||
xxspltw 5,11,0
|
||||
addi 6,6,.LC7@toc@l
|
||||
addi 5,5,.LC6@toc@l
|
||||
stvx 28,1,0
|
||||
li 0,-64
|
||||
lxvd2x 47,0,10
|
||||
xxpermdi 44,44,44,2
|
||||
mr 10,4
|
||||
lxvd2x 49,0,6
|
||||
lxvd2x 48,0,5
|
||||
xxpermdi 45,45,45,2
|
||||
li 6,0
|
||||
stvx 29,1,0
|
||||
li 0,-48
|
||||
xxlnand 44,44,44
|
||||
xxlnand 45,45,45
|
||||
stvx 30,1,0
|
||||
lxvd2x 62,0,7
|
||||
addis 7,2,.LC8@toc@ha
|
||||
li 0,-32
|
||||
addi 7,7,.LC8@toc@l
|
||||
xxpermdi 47,47,47,2
|
||||
stvx 31,1,0
|
||||
vspltisw 31,8
|
||||
xxpermdi 48,48,48,2
|
||||
lxvd2x 46,0,7
|
||||
vadduwm 31,31,31
|
||||
xxpermdi 49,49,49,2
|
||||
xxpermdi 62,62,62,2
|
||||
.p2align 4,,15
|
||||
/*
 * Main loop: every pass loads sixteen 16-byte vectors (256 bytes,
 * i.e. 32 elements), takes absolute values, separates even and odd
 * words with vperm, adds them, and keeps the smaller value together
 * with its index through xvcmpgtsp/xxsel.  r6 += 32 per pass; the
 * loop repeats while r8 > r6.
 */
.L5:
|
||||
addi 3,10,16
|
||||
addi 5,10,32
|
||||
lxvd2x 34,0,10
|
||||
addi 7,10,64
|
||||
addi 31,10,48
|
||||
addi 12,10,80
|
||||
addi 11,10,96
|
||||
lxvd2x 36,0,3
|
||||
lxvd2x 37,0,5
|
||||
addi 3,10,112
|
||||
addi 5,10,128
|
||||
lxvd2x 38,0,7
|
||||
lxvd2x 6,0,31
|
||||
addi 7,10,160
|
||||
addi 31,10,144
|
||||
lxvd2x 33,0,12
|
||||
lxvd2x 39,0,11
|
||||
addi 12,10,176
|
||||
addi 11,10,192
|
||||
lxvd2x 7,0,3
|
||||
lxvd2x 40,0,5
|
||||
xxpermdi 34,34,34,2
|
||||
addi 3,10,208
|
||||
addi 5,10,224
|
||||
lxvd2x 41,0,7
|
||||
lxvd2x 8,0,31
|
||||
addi 7,10,240
|
||||
lxvd2x 9,0,12
|
||||
lxvd2x 42,0,11
|
||||
xxpermdi 37,37,37,2
|
||||
xxpermdi 36,36,36,2
|
||||
addi 6,6,32
|
||||
lxvd2x 32,0,3
|
||||
lxvd2x 43,0,5
|
||||
xxpermdi 6,6,6,2
|
||||
xxpermdi 38,38,38,2
|
||||
cmpd 7,8,6
|
||||
addi 10,10,256
|
||||
lxvd2x 10,0,7
|
||||
xxpermdi 39,39,39,2
|
||||
xxpermdi 33,33,33,2
|
||||
xxpermdi 40,40,40,2
|
||||
xxpermdi 7,7,7,2
|
||||
xxpermdi 41,41,41,2
|
||||
xxpermdi 8,8,8,2
|
||||
xxpermdi 9,9,9,2
|
||||
xxpermdi 42,42,42,2
|
||||
xxpermdi 43,43,43,2
|
||||
xxpermdi 32,32,32,2
|
||||
xxpermdi 10,10,10,2
|
||||
xvabssp 58,37
|
||||
xvabssp 59,39
|
||||
xvabssp 35,40
|
||||
xvabssp 60,41
|
||||
xvabssp 34,34
|
||||
xvabssp 33,33
|
||||
xvabssp 32,32
|
||||
xvabssp 61,43
|
||||
xvabssp 36,36
|
||||
xvabssp 37,6
|
||||
xvabssp 38,38
|
||||
xvabssp 39,7
|
||||
xvabssp 40,8
|
||||
xvabssp 41,9
|
||||
xvabssp 42,42
|
||||
xvabssp 43,10
|
||||
vperm 25,4,2,12
|
||||
vperm 4,4,2,13
|
||||
vperm 2,5,26,12
|
||||
vperm 5,5,26,13
|
||||
vperm 26,1,6,12
|
||||
vperm 6,1,6,13
|
||||
vperm 1,7,27,12
|
||||
vperm 7,7,27,13
|
||||
vperm 27,8,3,12
|
||||
vperm 8,8,3,13
|
||||
vperm 3,9,28,12
|
||||
vperm 9,9,28,13
|
||||
vperm 28,0,10,12
|
||||
vperm 10,0,10,13
|
||||
vperm 0,11,29,12
|
||||
vperm 11,11,29,13
|
||||
xvaddsp 12,33,39
|
||||
xvaddsp 38,58,38
|
||||
xvaddsp 0,32,43
|
||||
xvaddsp 42,60,42
|
||||
xvaddsp 36,57,36
|
||||
xvaddsp 37,34,37
|
||||
xvaddsp 40,59,40
|
||||
xvaddsp 41,35,41
|
||||
xvcmpgtsp 32,38,12
|
||||
xvcmpgtsp 33,42,0
|
||||
xvcmpgtsp 43,36,37
|
||||
xvcmpgtsp 39,40,41
|
||||
xxsel 12,38,12,32
|
||||
xxsel 38,48,49,32
|
||||
xxsel 0,42,0,33
|
||||
xxsel 42,48,49,33
|
||||
xxsel 37,36,37,43
|
||||
xxsel 43,62,47,43
|
||||
xxsel 41,40,41,39
|
||||
xxsel 39,62,47,39
|
||||
xvcmpgtsp 32,37,12
|
||||
xvcmpgtsp 33,41,0
|
||||
xxsel 12,37,12,32
|
||||
xxsel 43,43,38,32
|
||||
xxsel 0,41,0,33
|
||||
xxsel 33,39,42,33
|
||||
xvcmpgtsp 32,12,0
|
||||
vadduwm 1,1,31
|
||||
xxsel 0,12,0,32
|
||||
xxsel 32,43,33,32
|
||||
xvcmpgtsp 33,5,0
|
||||
vadduwm 0,0,18
|
||||
vadduwm 18,18,14
|
||||
xxsel 51,51,32,33
|
||||
xxsel 5,5,0,33
|
||||
bgt 7,.L5
|
||||
/*
 * Horizontal reduction: the four lanes of vs5 (values) and v19
 * (indices) are moved to scalar registers and compared pairwise; when
 * two values compare equal the smaller index is kept.
 * Result: f11 = minimum so far, r3 = its 0-based index.
 */
xxsldwi 11,5,5,3
|
||||
xxsldwi 12,5,5,2
|
||||
vspltw 0,19,3
|
||||
xxsldwi 0,5,5,1
|
||||
xscvspdp 11,11
|
||||
xscvspdp 12,12
|
||||
mfvsrwz 6,32
|
||||
vspltw 0,19,2
|
||||
xscvspdp 0,0
|
||||
mfvsrwz 7,51
|
||||
mfvsrwz 5,32
|
||||
vspltw 0,19,0
|
||||
xscvspdp 5,5
|
||||
mfvsrwz 10,32
|
||||
fcmpu 7,11,12
|
||||
rldicl 3,6,0,32
|
||||
fmr 10,0
|
||||
rldicl 11,7,0,32
|
||||
rldicl 31,5,0,32
|
||||
rldicl 0,10,0,32
|
||||
beq 7,.L56
|
||||
bng 7,.L8
|
||||
fmr 11,12
|
||||
mr 3,31
|
||||
.L8:
|
||||
fcmpu 7,0,5
|
||||
bne 7,.L11
|
||||
cmplw 7,7,10
|
||||
ble 7,.L12
|
||||
mr 7,10
|
||||
.L12:
|
||||
rldicl 11,7,0,32
|
||||
.L13:
|
||||
fcmpu 7,11,10
|
||||
beq 7,.L57
|
||||
bgt 7,.L58
|
||||
/* Scalar tail: elements r8 .. r9-1 left over after the vector loop */
.L17:
|
||||
cmpd 7,9,8
|
||||
ble 7,.L19
|
||||
addi 7,8,1
|
||||
sldi 10,8,1
|
||||
cmpd 7,7,9
|
||||
sldi 10,10,2
|
||||
add 4,4,10
|
||||
subf 10,8,9
|
||||
mtctr 10
|
||||
bgt 7,.L37
|
||||
li 10,-1
|
||||
rldicr 10,10,0,0
|
||||
cmpd 7,9,10
|
||||
beq 7,.L37
|
||||
.p2align 4,,15
|
||||
.L21:
|
||||
lfs 0,0(4)
|
||||
lfs 12,4(4)
|
||||
addi 4,4,8
|
||||
fabs 0,0
|
||||
fabs 12,12
|
||||
fadds 0,0,12
|
||||
fcmpu 7,11,0
|
||||
bng 7,.L20
|
||||
fmr 11,0
|
||||
mr 3,8
|
||||
.L20:
|
||||
addi 8,8,1
|
||||
bdnz .L21
|
||||
/* Vector-path exit: restore r31 and v25-v31, make the index 1-based */
.L19:
|
||||
li 0,-128
|
||||
ld 31,-8(1)
|
||||
addi 3,3,1
|
||||
lvx 25,1,0
|
||||
li 0,-112
|
||||
lvx 26,1,0
|
||||
li 0,-96
|
||||
lvx 27,1,0
|
||||
li 0,-80
|
||||
lvx 28,1,0
|
||||
li 0,-64
|
||||
lvx 29,1,0
|
||||
li 0,-48
|
||||
lvx 30,1,0
|
||||
li 0,-32
|
||||
lvx 31,1,0
|
||||
blr
|
||||
.p2align 4,,15
|
||||
/* Out-of-line pieces of the lane reduction */
.L56:
|
||||
cmplw 7,6,5
|
||||
ble 7,.L7
|
||||
mr 6,5
|
||||
.L7:
|
||||
rldicl 3,6,0,32
|
||||
b .L8
|
||||
.p2align 4,,15
|
||||
/* Strided path with a count of exactly 1 */
.L29:
|
||||
li 3,1
|
||||
blr
|
||||
.p2align 4,,15
|
||||
.L11:
|
||||
bng 7,.L13
|
||||
fmr 10,5
|
||||
mr 11,0
|
||||
b .L13
|
||||
.p2align 4,,15
|
||||
.L57:
|
||||
cmpd 7,3,11
|
||||
ble 7,.L17
|
||||
mr 3,11
|
||||
b .L17
|
||||
.p2align 4,,15
|
||||
.L58:
|
||||
fmr 11,10
|
||||
mr 3,11
|
||||
b .L17
|
||||
/* Alternate entries into the scalar loops that force CTR = 1 */
.L43:
|
||||
li 9,1
|
||||
mtctr 9
|
||||
b .L44
|
||||
.L37:
|
||||
li 9,1
|
||||
mtctr 9
|
||||
b .L21
|
||||
.long 0
|
||||
.byte 0,0,0,0,0,1,0,0
|
||||
#if _CALL_ELF ==2
|
||||
.size icamin_k,.-icamin_k
|
||||
#endif
|
||||
/* 16-byte constant vectors, one label per vector. */
	.section .rodata.cst16,"aM",@progbits,16
	.p2align 4
/* Byte numbers 0-3, 8-11, 16-19, 24-27. */
.LC2:
	.byte 0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27
/* Byte numbers 4-7, 12-15, 20-23, 28-31. */
.LC3:
	.byte 4,5,6,7,12,13,14,15,20,21,22,23,28,29,30,31
/* Word values 0..15, four per vector. */
.LC4:
	.long 0,1,2,3
.LC5:
	.long 4,5,6,7
.LC6:
	.long 8,9,10,11
.LC7:
	.long 12,13,14,15
/* The value 32 in every word. */
.LC8:
	.long 32,32,32,32
	.ident "GCC: (SUSE Linux) 7.3.1 20180323 [gcc-7-branch revision 258812]"
	.section .note.GNU-stack,"",@progbits
|
||||
385
kernel/power/icamin_power9.S
Normal file
385
kernel/power/icamin_power9.S
Normal file
@@ -0,0 +1,385 @@
|
||||
/* icamin_k: GCC output for icamin.c (see the .ident string near the end of
   this file), ELFv2 ABI (.abiversion 2).
   As used by the code below: r3 = element count, r4 = address of the first
   element, r5 = stride in elements; each element is a pair of single-precision
   floats (8 bytes). Returns in r3 the 1-based position of the element whose
   |first float| + |second float| is smallest, or 0 when count <= 0 or
   stride <= 0.
   NOTE(review): presumably these are n, x and inc_x of the BLAS icamin
   interface and the float pair is (real, imaginary) - confirm against
   icamin.c. */
.file "icamin.c"
|
||||
.abiversion 2
|
||||
.section ".text"
|
||||
.align 2
|
||||
.p2align 4,,15
|
||||
.globl icamin_k
|
||||
.type icamin_k, @function
|
||||
icamin_k:
|
||||
.LCF0:
|
||||
0: addis 2,12,.TOC.-.LCF0@ha    /* global entry: build the TOC pointer r2 from r12 */
|
||||
addi 2,2,.TOC.-.LCF0@l
|
||||
.localentry icamin_k,.-icamin_k    /* local entry point starts after the two TOC instructions */
|
||||
/* Argument checks, initial minimum from element 0, and path selection. */
mr. 9,3    /* r9 = count; cr0 = count compared with 0 */
|
||||
ble 0,.L25    /* count <= 0: return 0 */
|
||||
cmpdi 7,5,0
|
||||
li 3,0    /* return value for the early exit on the next line */
|
||||
blelr 7    /* stride <= 0: return 0 */
|
||||
lfs 11,0(4)    /* first float of element 0 */
|
||||
lfs 0,4(4)    /* second float of element 0 */
|
||||
cmpdi 7,5,1
|
||||
fabs 11,11
|
||||
fabs 0,0
|
||||
fadds 11,11,0    /* f11 = running minimum, seeded with element 0 */
|
||||
beq 7,.L53    /* stride == 1: contiguous path */
|
||||
cmpdi 7,9,1
|
||||
beq 7,.L29    /* strided and count == 1: return 1 */
|
||||
/* Strided path (stride != 1, count > 1): scalar scan of elements 1..count-1.
   f11 = running minimum, r3 = its 0-based index, r9 = index being examined. */
addi 9,9,-1
|
||||
sldi 5,5,3    /* r5 = stride in bytes (8 bytes per element) */
|
||||
li 3,0
|
||||
mtctr 9    /* CTR = count - 1 iterations */
|
||||
add 4,4,5    /* r4 -> element 1 */
|
||||
li 9,1
|
||||
.p2align 4,,15
|
||||
.L24:
|
||||
lfs 0,4(4)
|
||||
lfs 12,0(4)
|
||||
add 4,4,5    /* step to the next element */
|
||||
fabs 0,0
|
||||
fabs 12,12
|
||||
fadds 0,0,12    /* f0 = sum of the two absolute values */
|
||||
fcmpu 7,0,11
|
||||
bnl 7,.L23    /* keep the current minimum unless f0 < f11 */
|
||||
fmr 11,0
|
||||
mr 3,9
|
||||
.L23:
|
||||
addi 9,9,1
|
||||
bdnz .L24
|
||||
/* .L51: shared exit for the scalar-only paths (also reached from the .L44 loop). */
.L51:
|
||||
addi 3,3,1    /* 0-based index -> 1-based result */
|
||||
blr
|
||||
.p2align 4,,15
|
||||
/* .L25: count <= 0, return 0. */
.L25:
|
||||
li 3,0
|
||||
blr
|
||||
.p2align 4,,15
|
||||
/* .L53: contiguous path (stride == 1). r8 = count rounded down to a multiple
   of 32. A non-zero r8 goes to the vector code at .L54; otherwise (count < 32)
   all elements are scanned by the scalar loop .L44 starting at index 0. */
.L53:
|
||||
rldicr. 8,9,0,58    /* r8 = r9 with the low 5 bits cleared; cr0 set from the result */
|
||||
bne 0,.L54
|
||||
addi 7,8,1
|
||||
li 10,0
|
||||
subf 6,8,9    /* r6 = count - r8 = trip count */
|
||||
li 3,0    /* r3 = 0-based index of the running minimum */
|
||||
cmpd 7,7,9
|
||||
sldi 10,10,2
|
||||
mtctr 6
|
||||
add 4,4,10    /* r10 is 0 here, so r4 is unchanged */
|
||||
bgt 7,.L43    /* r8 + 1 > count: .L43 forces a trip count of 1 */
|
||||
li 10,-1
|
||||
rldicr 10,10,0,0    /* r10 = 0x8000000000000000 */
|
||||
cmpd 7,9,10
|
||||
beq 7,.L43
|
||||
.p2align 4,,15
|
||||
.L44:
|
||||
lfs 0,0(4)
|
||||
lfs 12,4(4)
|
||||
addi 4,4,8    /* next element (8 bytes) */
|
||||
fabs 0,0
|
||||
fabs 12,12
|
||||
fadds 0,0,12
|
||||
fcmpu 7,11,0
|
||||
bng 7,.L46    /* update only when f11 > f0 */
|
||||
fmr 11,0
|
||||
mr 3,8    /* r8 = index of the element just examined */
|
||||
.L46:
|
||||
addi 8,8,1
|
||||
bdnz .L44
|
||||
b .L51    /* shared exit: return r3 + 1 */
|
||||
.p2align 4,,15
|
||||
/* .L54: set-up for the vector loop .L5.
   - vs57..vs63 (v25..v31) are stored at negative offsets from r1, without
     moving r1, and reloaded at .L19.
   - v12/v13 receive the .LC2/.LC3 byte patterns; vs61, vs62, vs63, vs46
     receive the word tables .LC4..LC7 (0..3, 4..7, 8..11, 12..15).
   - vs9 = running minimum in all four lanes, v16 (vs48) = index per lane,
     v15 (vs47) = index base of the current 32-element block. */
.L54:
|
||||
xscvdpspn 9,11    /* running minimum f11 -> single precision in vs9 */
|
||||
addis 11,2,.LC2@toc@ha
|
||||
addis 3,2,.LC3@toc@ha
|
||||
addis 5,2,.LC6@toc@ha
|
||||
addis 6,2,.LC7@toc@ha
|
||||
addis 7,2,.LC4@toc@ha
|
||||
addis 10,2,.LC5@toc@ha
|
||||
xxspltib 48,0    /* v16 = 0: initial minimum index in every lane */
|
||||
addi 11,11,.LC2@toc@l
|
||||
addi 3,3,.LC3@toc@l
|
||||
addi 5,5,.LC6@toc@l
|
||||
stxv 59,-80(1)
|
||||
addi 6,6,.LC7@toc@l
|
||||
stxv 60,-64(1)
|
||||
stxv 63,-16(1)
|
||||
addi 7,7,.LC4@toc@l
|
||||
xxspltib 59,16    /* bytes of 16; widened to words by vextsb2w below */
|
||||
lxv 44,0(11)    /* v12 = .LC2 */
|
||||
xxspltib 60,32    /* bytes of 32; widened to words by vextsb2w below */
|
||||
lxv 45,0(3)    /* v13 = .LC3 */
|
||||
lxv 63,0(5)    /* vs63 = .LC6 */
|
||||
xxlor 47,48,48    /* v15 = copy of v16 = 0 */
|
||||
lxv 46,0(6)    /* vs46 = .LC7 */
|
||||
addi 10,10,.LC5@toc@l
|
||||
stxv 61,-48(1)
|
||||
stxv 62,-32(1)
|
||||
xxspltw 9,9,0    /* replicate the initial minimum into all four lanes of vs9 */
|
||||
lxv 61,0(7)    /* vs61 = .LC4 */
|
||||
lxv 62,0(10)    /* vs62 = .LC5 */
|
||||
li 7,0    /* r7 = number of elements consumed so far */
|
||||
mr 10,4    /* r10 = read pointer */
|
||||
vextsb2w 27,27    /* v27 = 16 in each word */
|
||||
vextsb2w 28,28    /* v28 = 32 in each word */
|
||||
stxv 57,-112(1)
|
||||
stxv 58,-96(1)
|
||||
.p2align 4,,15
|
||||
/* .L5: vector main loop, 32 elements (256 bytes, sixteen 16-byte loads) per
   iteration.
   1. Each load is made absolute with xvabssp.
   2. For each pair of consecutive loads, two vpermr (controls v12 and v13)
      regroup the 32-bit words so that one xvaddsp yields four per-element
      sums. This produces eight sum vectors.
   3. A tree of xvcmpgtsp/xxsel pairs keeps the smaller sum per lane and, with
      the same mask, the matching index word. Indices come from the tables
      0..3 / 4..7 / 8..11 / 12..15; 16 (v27) is added for the second half of
      the block.
   4. The block-local index gets the block base v15 added, then is merged into
      the running minimum vs9 / index v16 only where vs9 > new value.
   The instructions are interleaved by the compiler; the order must be kept. */
.L5:
|
||||
lxv 0,0(10)
|
||||
addi 7,7,32    /* 32 more elements consumed */
|
||||
addi 10,10,256    /* advance pointer; remaining loads use negative offsets */
|
||||
cmpd 7,8,7    /* cr7 for the loop branch at the bottom */
|
||||
xvabssp 34,0
|
||||
lxv 0,-240(10)
|
||||
xvabssp 42,0
|
||||
lxv 0,-224(10)
|
||||
xvabssp 49,0
|
||||
lxv 0,-208(10)
|
||||
vpermr 26,10,2,12
|
||||
vpermr 2,10,2,13
|
||||
xvabssp 35,0
|
||||
lxv 0,-192(10)
|
||||
xvaddsp 34,58,34    /* vs34 = sums for block bytes 0..31 */
|
||||
xvabssp 36,0
|
||||
lxv 0,-176(10)
|
||||
vpermr 10,3,17,12
|
||||
vpermr 3,3,17,13
|
||||
xvabssp 33,0
|
||||
lxv 0,-160(10)
|
||||
xvaddsp 10,42,35    /* vs10 = sums for block bytes 32..63 */
|
||||
xvabssp 50,0
|
||||
lxv 0,-144(10)
|
||||
vpermr 17,1,4,12
|
||||
vpermr 4,1,4,13
|
||||
xvabssp 37,0
|
||||
lxv 0,-128(10)
|
||||
xvaddsp 36,49,36    /* vs36 = sums for block bytes 64..95 */
|
||||
xvabssp 38,0
|
||||
lxv 0,-112(10)
|
||||
vpermr 1,5,18,12
|
||||
vpermr 5,5,18,13
|
||||
xvabssp 43,0
|
||||
lxv 0,-96(10)
|
||||
xvaddsp 12,33,37    /* vs12 = sums for block bytes 96..127 */
|
||||
xvabssp 51,0
|
||||
lxv 0,-80(10)
|
||||
vpermr 18,11,6,12
|
||||
vpermr 6,11,6,13
|
||||
xvabssp 39,0
|
||||
lxv 0,-64(10)
|
||||
xvaddsp 38,50,38    /* vs38 = sums for block bytes 128..159 */
|
||||
xvabssp 40,0
|
||||
lxv 0,-48(10)
|
||||
vpermr 11,7,19,12
|
||||
vpermr 7,7,19,13
|
||||
xvabssp 32,0
|
||||
lxv 0,-32(10)
|
||||
xvaddsp 11,43,39    /* vs11 = sums for block bytes 160..191 */
|
||||
xvcmpgtsp 39,34,10    /* mask: vs34 > vs10 */
|
||||
xvcmpgtsp 43,36,12    /* mask: vs36 > vs12 */
|
||||
xvabssp 57,0
|
||||
lxv 0,-16(10)
|
||||
vpermr 19,0,8,12
|
||||
vpermr 8,0,8,13
|
||||
xxsel 10,34,10,39    /* vs10 = lane-wise smaller of vs34, vs10 */
|
||||
xxsel 12,36,12,43    /* vs12 = lane-wise smaller of vs36, vs12 */
|
||||
xxsel 39,61,62,39    /* matching indices from the 0..3 / 4..7 tables */
|
||||
xxsel 43,63,46,43    /* matching indices from the 8..11 / 12..15 tables */
|
||||
xvabssp 41,0
|
||||
xvaddsp 40,51,40    /* vs40 = sums for block bytes 192..223 */
|
||||
vpermr 0,9,25,12
|
||||
vpermr 9,9,25,13
|
||||
xvaddsp 0,32,41    /* vs0 = sums for block bytes 224..255 */
|
||||
xvcmpgtsp 41,38,11
|
||||
xvcmpgtsp 32,10,12
|
||||
xvcmpgtsp 42,40,0
|
||||
xxsel 11,38,11,41    /* second half: smaller of vs38, vs11 */
|
||||
xxsel 12,10,12,32    /* first half: minimum of its four sum vectors */
|
||||
xxsel 43,39,43,32    /* first half: index of that minimum (0..15) */
|
||||
xxsel 41,61,62,41
|
||||
xxsel 0,40,0,42    /* second half: smaller of vs40, vs0 */
|
||||
xxsel 42,63,46,42
|
||||
xvcmpgtsp 33,11,0
|
||||
xxsel 0,11,0,33    /* second half: minimum of its four sum vectors */
|
||||
xxsel 33,41,42,33    /* second half: index within the half (0..15) */
|
||||
xvcmpgtsp 32,12,0
|
||||
vadduwm 1,1,27    /* second-half index += 16 */
|
||||
xxsel 0,12,0,32    /* vs0 = per-lane minimum of the whole block */
|
||||
xxsel 32,43,33,32    /* v0 = its block-local index (0..31) */
|
||||
xvcmpgtsp 33,9,0    /* mask: running minimum > block minimum */
|
||||
vadduwm 0,0,15    /* block-local index + block base */
|
||||
vadduwm 15,15,28    /* block base += 32 for the next iteration */
|
||||
xxsel 48,48,32,33    /* update running index where the mask is set */
|
||||
xxsel 9,9,0,33    /* update running minimum where the mask is set */
|
||||
bgt 7,.L5    /* loop while r8 > r7 (cr7 from the cmpd at the top) */
|
||||
/* Cross-lane reduction after the vector loop. The four lane minima of vs9 are
   moved to scalar doubles in f9, f11, f12, f0 and the four lane indices of
   v16 to r6, r5, r7, r10 (word extracts at byte offsets 0, 4, 8, 12). They
   are then reduced pairwise: (f11,r3) with (f12,r12), (f0,r11) with (f9,r0),
   and finally the two winners. On equal values the smaller index is kept.
   Result: f11 = minimum so far, r3 = its 0-based index. */
xxsldwi 11,9,9,3
|
||||
xxsldwi 12,9,9,2
|
||||
li 10,0
|
||||
li 3,12
|
||||
xxsldwi 0,9,9,1
|
||||
xscvspdp 9,9
|
||||
vextuwrx 6,10,16    /* r6 = index word at offset 0 */
|
||||
li 10,4
|
||||
xscvspdp 11,11
|
||||
xscvspdp 12,12
|
||||
xscvspdp 0,0
|
||||
vextuwrx 5,10,16    /* r5 = index word at offset 4 */
|
||||
li 10,8
|
||||
vextuwrx 7,10,16    /* r7 = index word at offset 8 */
|
||||
vextuwrx 10,3,16    /* r10 = index word at offset 12 (r3 = 12) */
|
||||
rldicl 12,5,0,32    /* zero-extend the 32-bit indices */
|
||||
rldicl 3,6,0,32
|
||||
rldicl 11,7,0,32
|
||||
rldicl 0,10,0,32
|
||||
fcmpu 7,11,12
|
||||
fmr 10,0    /* f10 = candidate value paired with index r11 */
|
||||
beq 7,.L55    /* equal values: .L55 keeps the smaller of r6, r5 */
|
||||
bng 7,.L8
|
||||
mr 3,12    /* f11 > f12: take the second candidate */
|
||||
fmr 11,12
|
||||
.L8:
|
||||
fcmpu 7,0,9
|
||||
bne 7,.L11    /* unequal: .L11 selects (f9, r0) when f0 > f9 */
|
||||
cmplw 7,7,10    /* equal values: keep the smaller index (unsigned) */
|
||||
ble 7,.L12
|
||||
mr 7,10
|
||||
.L12:
|
||||
rldicl 11,7,0,32
|
||||
.L13:
|
||||
fcmpu 7,11,10    /* compare the winners of the two pairs */
|
||||
beq 7,.L56    /* equal values: .L56 keeps the smaller of r3, r11 */
|
||||
bng 7,.L17
|
||||
mr 3,11
|
||||
fmr 11,10
|
||||
/* .L17: scalar tail after the vector loop. r8 = number of elements already
   handled (multiple of 32), r9 = count. Elements r8..count-1 are scanned with
   the same compare as the .L44 loop. Falls into the epilogue .L19. */
.L17:
|
||||
cmpd 7,9,8
|
||||
ble 7,.L19    /* count <= r8: nothing left */
|
||||
addi 7,8,1
|
||||
sldi 10,8,1
|
||||
subf 6,8,9    /* r6 = count - r8 = trip count */
|
||||
cmpd 7,7,9
|
||||
sldi 10,10,2    /* r10 = r8 * 8 = byte offset of element r8 */
|
||||
mtctr 6
|
||||
add 4,4,10    /* r4 -> element r8 */
|
||||
bgt 7,.L37    /* r8 + 1 > count: .L37 forces a trip count of 1 */
|
||||
li 10,-1
|
||||
rldicr 10,10,0,0    /* r10 = 0x8000000000000000 */
|
||||
cmpd 7,9,10
|
||||
beq 7,.L37
|
||||
.p2align 4,,15
|
||||
.L21:
|
||||
lfs 0,0(4)
|
||||
lfs 12,4(4)
|
||||
addi 4,4,8
|
||||
fabs 0,0
|
||||
fabs 12,12
|
||||
fadds 0,0,12
|
||||
fcmpu 7,11,0
|
||||
bng 7,.L20    /* update only when f11 > f0 */
|
||||
fmr 11,0
|
||||
mr 3,8
|
||||
.L20:
|
||||
addi 8,8,1
|
||||
bdnz .L21
|
||||
/* .L19: epilogue of the vector path. Reloads vs57..vs63 from the slots
   written at .L54 and returns the 1-based index. */
.L19:
|
||||
lxv 57,-112(1)
|
||||
lxv 58,-96(1)
|
||||
addi 3,3,1    /* 0-based index -> 1-based result */
|
||||
lxv 59,-80(1)
|
||||
lxv 60,-64(1)
|
||||
lxv 61,-48(1)
|
||||
lxv 62,-32(1)
|
||||
lxv 63,-16(1)
|
||||
blr
|
||||
.p2align 4,,15
|
||||
/* Out-of-line branch targets used by the code above. */
/* .L55: first lane pair had equal values; r3 = smaller of r6, r5 (unsigned 32-bit). */
.L55:
|
||||
cmplw 7,6,5
|
||||
ble 7,.L7
|
||||
mr 6,5
|
||||
.L7:
|
||||
rldicl 3,6,0,32
|
||||
b .L8
|
||||
.p2align 4,,15
|
||||
/* .L29: strided call with count == 1; the result is 1. */
.L29:
|
||||
li 3,1
|
||||
blr
|
||||
.p2align 4,,15
|
||||
/* .L11: second lane pair had unequal values (cr7 from fcmpu 7,0,9);
   when f0 > f9 take index r0 and value f9. */
.L11:
|
||||
bng 7,.L13
|
||||
mr 11,0
|
||||
fmr 10,9
|
||||
b .L13
|
||||
.p2align 4,,15
|
||||
/* .L56: the two pair winners had equal values; keep the smaller index. */
.L56:
|
||||
cmpd 7,3,11
|
||||
ble 7,.L17
|
||||
mr 3,11
|
||||
b .L17
|
||||
/* .L37 / .L43: replace the trip count with 1 before entering the scalar
   loops .L21 / .L44. */
.L37:
|
||||
li 9,1
|
||||
mtctr 9
|
||||
b .L21
|
||||
.L43:
|
||||
li 9,1
|
||||
mtctr 9
|
||||
b .L44
|
||||
/* Data words after the last instruction; no visible branch leads here.
   NOTE(review): presumably the compiler-emitted traceback tag - confirm. */
.long 0
|
||||
.byte 0,0,0,0,0,0,0,0
|
||||
.size icamin_k,.-icamin_k
|
||||
/* 16-byte mergeable read-only constants loaded by icamin_k at .L54. */
.section .rodata.cst16,"aM",@progbits,16
|
||||
.align 4
|
||||
/* .LC2: loaded into v12 and used as the control of the first vpermr of each
   pair in the vector loop. Byte numbers 0-3, 8-11, 16-19, 24-27, i.e. every
   other 4-byte word of a 32-byte span. */
.LC2:
|
||||
.byte 0
|
||||
.byte 1
|
||||
.byte 2
|
||||
.byte 3
|
||||
.byte 8
|
||||
.byte 9
|
||||
.byte 10
|
||||
.byte 11
|
||||
.byte 16
|
||||
.byte 17
|
||||
.byte 18
|
||||
.byte 19
|
||||
.byte 24
|
||||
.byte 25
|
||||
.byte 26
|
||||
.byte 27
|
||||
/* .LC3: loaded into v13 and used as the control of the second vpermr of each
   pair. Byte numbers 4-7, 12-15, 20-23, 28-31 (the words .LC2 leaves out). */
|
||||
.LC3:
|
||||
.byte 4
|
||||
.byte 5
|
||||
.byte 6
|
||||
.byte 7
|
||||
.byte 12
|
||||
.byte 13
|
||||
.byte 14
|
||||
.byte 15
|
||||
.byte 20
|
||||
.byte 21
|
||||
.byte 22
|
||||
.byte 23
|
||||
.byte 28
|
||||
.byte 29
|
||||
.byte 30
|
||||
.byte 31
|
||||
/* .LC4 - .LC7: index tables 0..3, 4..7, 8..11, 12..15, loaded into vs61,
   vs62, vs63 and vs46; the vector loop selects from them with xxsel to track
   which element produced each lane minimum. */
|
||||
.LC4:
|
||||
.long 0
|
||||
|
||||
.long 1
|
||||
|
||||
.long 2
|
||||
|
||||
.long 3
|
||||
|
||||
.LC5:
|
||||
.long 4
|
||||
|
||||
.long 5
|
||||
|
||||
.long 6
|
||||
|
||||
.long 7
|
||||
|
||||
.LC6:
|
||||
.long 8
|
||||
|
||||
.long 9
|
||||
|
||||
.long 10
|
||||
|
||||
.long 11
|
||||
|
||||
.LC7:
|
||||
.long 12
|
||||
|
||||
.long 13
|
||||
|
||||
.long 14
|
||||
|
||||
.long 15
|
||||
|
||||
.ident "GCC: (SUSE Linux) 7.3.1 20180323 [gcc-7-branch revision 258812]"
|
||||
/* Empty .note.GNU-stack section with no "x" flag. */
.section .note.GNU-stack,"",@progbits
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user