Merge pull request #2398 from xianyi/develop
Update from develop in preparation of the 0.3.8 release
This commit is contained in:
commit
fb5eb47558
19
.travis.yml
19
.travis.yml
|
@ -17,7 +17,7 @@ matrix:
|
|||
- COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32"
|
||||
script:
|
||||
- set -e
|
||||
- travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
|
||||
- make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
|
||||
- make -C test $COMMON_FLAGS $BTYPE
|
||||
- make -C ctest $COMMON_FLAGS $BTYPE
|
||||
- make -C utest $COMMON_FLAGS $BTYPE
|
||||
|
@ -160,18 +160,25 @@ matrix:
|
|||
os: osx
|
||||
osx_image: xcode10.1
|
||||
before_script:
|
||||
- COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32"
|
||||
- COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32"
|
||||
- brew update
|
||||
- brew install gcc # for gfortran
|
||||
- brew install gcc@8 # for gfortran
|
||||
script:
|
||||
- travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
|
||||
env:
|
||||
- BTYPE="BINARY=64 INTERFACE64=1"
|
||||
- BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-8"
|
||||
|
||||
- <<: *test-macos
|
||||
osx_image: xcode8.3
|
||||
osx_image: xcode10.0
|
||||
env:
|
||||
- BTYPE="BINARY=32"
|
||||
- BTYPE="TARGET=NEHALEM BINARY=32 NOFORTRAN=1"
|
||||
|
||||
- <<: *test-macos
|
||||
osx_image: xcode10.1
|
||||
env:
|
||||
- CC="/Applications/Xcode-10.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk"
|
||||
- CFLAGS="-O2 -Wno-macro-redefined -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk -arch arm64 -miphoneos-version-min=10.0"
|
||||
- BTYPE="TARGET=ARMV8 BINARY=64 HOSTCC=clang NOFORTRAN=1"
|
||||
|
||||
# whitelist
|
||||
branches:
|
||||
|
|
|
@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5)
|
|||
project(OpenBLAS C ASM)
|
||||
set(OpenBLAS_MAJOR_VERSION 0)
|
||||
set(OpenBLAS_MINOR_VERSION 3)
|
||||
set(OpenBLAS_PATCH_VERSION 7)
|
||||
set(OpenBLAS_PATCH_VERSION 8)
|
||||
set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
|
||||
|
||||
# Adhere to GNU filesystem layout conventions
|
||||
|
|
|
@ -171,3 +171,12 @@ In chronological order:
|
|||
* [2019-02-01] added missing Blas Level-1,2 (single precision) simd codes
|
||||
* [2019-03-14] power9 dgemm/dtrmm kernel
|
||||
* [2019-04-29] power9 sgemm/strmm kernel
|
||||
|
||||
* Jiachen Wang <https://github.com/wjc404>
|
||||
* [2019-07-29] optimize AVX2 DGEMM
|
||||
* [2019-10-20] AVX512 DGEMM kernel (4x8)
|
||||
* [2019-11-06] optimize AVX512 SGEMM
|
||||
* [2019-11-12] AVX512 CGEMM & ZGEMM kernels
|
||||
* [2019-12-23] optimize AVX2 CGEMM and ZGEMM
|
||||
* [2019-12-30] AVX2 CGEMM3M & ZGEMM3M kernels
|
||||
* [2020-01-07] optimize AVX2 SGEMM and STRMM
|
||||
|
|
120
Changelog.txt
120
Changelog.txt
|
@ -1,46 +1,100 @@
|
|||
OpenBLAS ChangeLog
|
||||
====================================================================
|
||||
Version 0.3.8
|
||||
9-Feb-2020
|
||||
|
||||
common:
|
||||
* LAPACK has been updated to 3.9.0 (plus patches up to
|
||||
January 2nd, 2020)
|
||||
* CMAKE support has been improved in several areas including
|
||||
cross-compilation
|
||||
* a thread race condition in the GEMM3M kernels was resolved
|
||||
* the "generic" (plain C) gemm beta kernel used by many targets
|
||||
has been sped up
|
||||
* an optimized version of the LAPACK trtrs functions has been added
|
||||
* an incompatibility between the LAPACK tests and the OpenBLAS
|
||||
implementation of XERBLA was resolved, removing the numerous
|
||||
warnings about wrong error exits in the former
|
||||
* support for NetBSD has been added
|
||||
* support for compilation with g95 and non-GNU versions of ld
|
||||
has been improved
|
||||
* support for compilation with (upcoming) gcc 10 has been added
|
||||
|
||||
POWER:
|
||||
* worked around miscompilation of several POWER8 and POWER9
|
||||
kernels by older versions of gcc
|
||||
* added support for big-endian POWER8 and for compilation on AIX
|
||||
* corrected bugs in the big-endian support for PPC440 and PPC970
|
||||
* DYNAMIC_ARCH support is now available in CMAKE builds as well
|
||||
|
||||
ARMV8:
|
||||
* performance of DGEMM_BETA and SGEMM_NCOPY has been improved
|
||||
* compilation for 32bit works again
|
||||
* performance of the RPCC function has been improved
|
||||
* improved performance on small systems
|
||||
* DYNAMIC_ARCH support is now available in CMAKE builds as well
|
||||
* cross-compilation from OSX to IOS was simplified
|
||||
|
||||
x86_64:
|
||||
* a new AVX512 DGEMM kernel was added and the AVX512 SGEMM kernel
|
||||
was significantly improved
|
||||
* optimized AVX512 kernels for CGEMM and ZGEMM have been added
|
||||
* AVX2 kernels for STRMM, SGEMM, and CGEMM have been significantly
|
||||
sped up and optimized CGEMM3M and ZGEMM3M kernels have been added
|
||||
* added support for QEMU virtual cpus
|
||||
* a compilation problem with PGI and SUN compilers was fixed
|
||||
* Intel "Goldmont plus" is now autodetected
|
||||
* a potential crash on program exit on MS Windows has been fixed
|
||||
|
||||
x86:
|
||||
* an unwanted case sensitivity in the implementation of LSAME
|
||||
on older 32bit AMD cpus was fixed
|
||||
|
||||
zarch:
|
||||
* Z15 is now supported as Z14
|
||||
* DYNAMIC_ARCH is now available on ZARCH as well
|
||||
|
||||
====================================================================
|
||||
Version 0.3.7
|
||||
11-Aug-2019
|
||||
|
||||
common:
|
||||
* having the gmake special variables TARGET_ARCH or TARGET_MACH
|
||||
defined no longer causes build failures in ctest or utest
|
||||
* defining NO_AFFINITY or USE_TLS to 0 in gmake builds no longer
|
||||
has the same effect as setting them to 1
|
||||
* a new test program was added to allow checking the library for
|
||||
thread safety
|
||||
* a new option USE_LOCKING was added to ensure thread safety when
|
||||
OpenBLAS itself is built without multithreading but will be
|
||||
called from multiple threads.
|
||||
* a build failure on Linux with glibc versions earlier than 2.5
|
||||
was fixed
|
||||
* a runtime error with CPU enumeration (and NO_AFFINITY not set)
|
||||
on glibc 2.6 was fixed
|
||||
* NO_AFFINITY was added to the CMAKE options (and defaults to being
|
||||
active on Linux, as in the gmake builds)
|
||||
* having the gmake special variables TARGET_ARCH or TARGET_MACH
|
||||
defined no longer causes build failures in ctest or utest
|
||||
* defining NO_AFFINITY or USE_TLS to 0 in gmake builds no longer
|
||||
has the same effect as setting them to 1
|
||||
* a new test program was added to allow checking the library for
|
||||
thread safety
|
||||
* a new option USE_LOCKING was added to ensure thread safety when
|
||||
OpenBLAS itself is built without multithreading but will be
|
||||
called from multiple threads.
|
||||
* a build failure on Linux with glibc versions earlier than 2.5
|
||||
was fixed
|
||||
* a runtime error with CPU enumeration (and NO_AFFINITY not set)
|
||||
on glibc 2.6 was fixed
|
||||
* NO_AFFINITY was added to the CMAKE options (and defaults to being
|
||||
active on Linux, as in the gmake builds)
|
||||
|
||||
x86_64:
|
||||
* the build-time logic for detection of AVX512 availability in
|
||||
the processor and compiler was fixed
|
||||
* gmake builds on OSX now set the internal name of the library to
|
||||
libopenblas.0.dylib (consistent with CMAKE)
|
||||
* the Haswell DGEMM kernel received a significant speedup through
|
||||
improved prefetch and load instructions
|
||||
* performance of DGEMM, DTRMM, DTRSM and ZDOT on Zen/Zen2 was markedly
|
||||
increased by avoiding vpermpd instructions
|
||||
* the SKYLAKEX (AVX512) DGEMM helper functions have now been disabled
|
||||
to fix remaining errors in DGEMM, DSYMM and DTRMM
|
||||
* the build-time logic for detection of AVX512 availability in
|
||||
the processor and compiler was fixed
|
||||
* gmake builds on OSX now set the internal name of the library to
|
||||
libopenblas.0.dylib (consistent with CMAKE)
|
||||
* the Haswell DGEMM kernel received a significant speedup through
|
||||
improved prefetch and load instructions
|
||||
* performance of DGEMM, DTRMM, DTRSM and ZDOT on Zen/Zen2 was markedly
|
||||
increased by avoiding vpermpd instructions
|
||||
* the SKYLAKEX (AVX512) DGEMM helper functions have now been disabled
|
||||
to fix remaining errors in DGEMM, DSYMM and DTRMM
|
||||
|
||||
## POWER:
|
||||
* added support for building on FreeBSD/powerpc64 and FreeBSD/ppc970
|
||||
* added optimized kernels for POWER9 single and double precision complex BLAS3
|
||||
* added optimized kernels for POWER9 SGEMM and STRMM
|
||||
POWER:
|
||||
* added support for building on FreeBSD/powerpc64 and FreeBSD/ppc970
|
||||
* added optimized kernels for POWER9 SGEMM and STRMM
|
||||
|
||||
## ARMV7:
|
||||
* fixed the softfp implementations of xAMAX and IxAMAX
|
||||
* removed the predefined -march= flags on both ARMV5 and ARMV6 as
|
||||
they were appropriate for only a subset of platforms
|
||||
ARMV7:
|
||||
* fixed the softfp implementations of xAMAX and IxAMAX
|
||||
* removed the predefined -march= flags on both ARMV5 and ARMV6 as
|
||||
they were appropriate for only a subset of platforms
|
||||
|
||||
====================================================================
|
||||
Version 0.3.6
|
||||
|
|
20
Makefile
20
Makefile
|
@ -247,21 +247,21 @@ prof_lapack : lapack_prebuild
|
|||
|
||||
lapack_prebuild :
|
||||
ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
|
||||
-@echo "FORTRAN = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "OPTS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "FC = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "FFLAGS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "POPTS = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "NOOPT = -O0 $(LAPACK_NOOPT)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "FFLAGS_NOOPT = -O0 $(LAPACK_NOOPT)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "PNOOPT = $(LAPACK_FPFLAGS) -O0" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "LOADOPTS = $(FFLAGS) $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "LDFLAGS = $(FFLAGS) $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "CC = $(CC)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "override CFLAGS = $(LAPACK_CFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "override ARCH = $(AR)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "ARCHFLAGS = $(ARFLAGS) -ru" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "AR = $(AR)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "ARFLAGS = $(ARFLAGS) -ru" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "RANLIB = $(RANLIB)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "LAPACKLIB = ../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "TMGLIB = ../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "LAPACKLIB = ../../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "TMGLIB = ../../../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "BLASLIB = ../../../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "LAPACKELIB = ../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "LAPACKELIB = ../../../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "LAPACKLIB_P = ../$(LIBNAME_P)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "SUFFIX = $(SUFFIX)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
-@echo "PSUFFIX = $(PSUFFIX)" >> $(NETLIB_LAPACK_DIR)/make.inc
|
||||
|
@ -319,7 +319,7 @@ lapack-test :
|
|||
ifneq ($(CROSS), 1)
|
||||
( cd $(NETLIB_LAPACK_DIR)/INSTALL; make all; ./testlsame; ./testslamch; ./testdlamch; \
|
||||
./testsecond; ./testdsecnd; ./testieee; ./testversion )
|
||||
(cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r )
|
||||
(cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r -b TESTING)
|
||||
endif
|
||||
|
||||
lapack-runtest:
|
||||
|
|
|
@ -39,7 +39,10 @@ CCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99
|
|||
FCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99
|
||||
endif
|
||||
|
||||
ifeq ($(GCCVERSIONGTEQ9), 1)
|
||||
ifeq ($(CORE), TSV110)
|
||||
CCOMMON_OPT += -march=armv8.2-a -mtune=tsv110
|
||||
FCOMMON_OPT += -march=armv8.2-a -mtune=tsv110
|
||||
endif
|
||||
endif
|
||||
|
||||
|
|
|
@ -51,6 +51,7 @@ endif
|
|||
ifneq ($(OSNAME), AIX)
|
||||
ifndef NO_LAPACKE
|
||||
@echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
|
||||
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapack.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapack.h"
|
||||
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h"
|
||||
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_config.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h"
|
||||
@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h.in "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h"
|
||||
|
@ -100,6 +101,7 @@ else
|
|||
#install on AIX has different options syntax
|
||||
ifndef NO_LAPACKE
|
||||
@echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
|
||||
@-installbsd -c -m 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapack.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapack.h"
|
||||
@-installbsd -c -m 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h"
|
||||
@-installbsd -c -m 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_config.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h"
|
||||
@-installbsd -c -m 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h.in "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h"
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
#
|
||||
|
||||
# This library's version
|
||||
VERSION = 0.3.7
|
||||
VERSION = 0.3.8
|
||||
|
||||
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
|
||||
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
|
||||
|
|
|
@ -9,9 +9,11 @@ ifndef TOPDIR
|
|||
TOPDIR = .
|
||||
endif
|
||||
|
||||
# If ARCH is not set, we use the host system's architecture.
|
||||
# If ARCH is not set, we use the host system's architecture for getarch compile options.
|
||||
ifndef ARCH
|
||||
ARCH := $(shell uname -m)
|
||||
HOSTARCH := $(shell uname -m)
|
||||
else
|
||||
HOSTARCH = $(ARCH)
|
||||
endif
|
||||
|
||||
# Catch conflicting usage of ARCH in some BSD environments
|
||||
|
@ -23,6 +25,8 @@ else ifeq ($(ARCH), i386)
|
|||
override ARCH=x86
|
||||
else ifeq ($(ARCH), aarch64)
|
||||
override ARCH=arm64
|
||||
else ifeq ($(ARCH), zarch)
|
||||
override ARCH=zarch
|
||||
endif
|
||||
|
||||
NETLIB_LAPACK_DIR = $(TOPDIR)/lapack-netlib
|
||||
|
@ -142,9 +146,9 @@ endif
|
|||
endif
|
||||
|
||||
|
||||
# On x86_64 build getarch with march=native. This is required to detect AVX512 support in getarch.
|
||||
ifeq ($(ARCH), x86_64)
|
||||
ifneq ($(C_COMPILER), PGI)
|
||||
# On x86_64 build getarch with march=native unless the compiler is PGI. This is required to detect AVX512 support in getarch.
|
||||
ifeq ($(HOSTARCH), x86_64)
|
||||
ifeq ($(findstring pgcc,$(HOSTCC)),)
|
||||
GETARCH_FLAGS += -march=native
|
||||
endif
|
||||
endif
|
||||
|
@ -320,12 +324,14 @@ CCOMMON_OPT += -DMS_ABI
|
|||
endif
|
||||
|
||||
ifeq ($(C_COMPILER), GCC)
|
||||
#Test for supporting MS_ABI
|
||||
#Version tests for supporting specific features (MS_ABI, POWER9 intrinsics)
|
||||
GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4)
|
||||
GCCVERSIONGT4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 4)
|
||||
GCCVERSIONGT5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 5)
|
||||
GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9)
|
||||
GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7)
|
||||
ifeq ($(GCCVERSIONGT4), 1)
|
||||
# GCC Majar version > 4
|
||||
# GCC Major version > 4
|
||||
# It is compatible with MSVC ABI.
|
||||
CCOMMON_OPT += -DMS_ABI
|
||||
endif
|
||||
|
@ -544,16 +550,35 @@ endif
|
|||
|
||||
ifeq ($(ARCH), arm64)
|
||||
DYNAMIC_CORE = ARMV8
|
||||
DYNAMIC_CORE += CORTEXA53
|
||||
DYNAMIC_CORE += CORTEXA57
|
||||
DYNAMIC_CORE += CORTEXA72
|
||||
DYNAMIC_CORE += CORTEXA73
|
||||
DYNAMIC_CORE += FALKOR
|
||||
DYNAMIC_CORE += THUNDERX
|
||||
DYNAMIC_CORE += THUNDERX2T99
|
||||
DYNAMIC_CORE += TSV110
|
||||
endif
|
||||
|
||||
ifeq ($(ARCH), zarch)
|
||||
DYNAMIC_CORE = Z13
|
||||
DYNAMIC_CORE += Z14
|
||||
endif
|
||||
|
||||
ifeq ($(ARCH), power)
|
||||
DYNAMIC_CORE = POWER6
|
||||
DYNAMIC_CORE += POWER8
|
||||
ifneq ($(C_COMPILER), GCC)
|
||||
DYNAMIC_CORE += POWER9
|
||||
endif
|
||||
ifeq ($(C_COMPILER), GCC)
|
||||
ifeq ($(GCCVERSIONGT5), 1)
|
||||
DYNAMIC_CORE += POWER9
|
||||
else
|
||||
$(info, OpenBLAS: Your gcc version is too old to build the POWER9 kernels.)
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
# If DYNAMIC_CORE is not set, DYNAMIC_ARCH cannot do anything, so force it to empty
|
||||
ifndef DYNAMIC_CORE
|
||||
|
@ -697,7 +722,7 @@ endif
|
|||
|
||||
ifeq ($(C_COMPILER), PGI)
|
||||
ifdef BINARY64
|
||||
CCOMMON_OPT += -tp p7-64
|
||||
CCOMMON_OPT += -tp p7-64 -D__MMX__ -Mnollvm
|
||||
else
|
||||
CCOMMON_OPT += -tp p7
|
||||
endif
|
||||
|
@ -757,6 +782,9 @@ else
|
|||
FCOMMON_OPT += -m32
|
||||
endif
|
||||
endif
|
||||
ifneq ($(NO_LAPACKE), 1)
|
||||
FCOMMON_OPT += -fno-second-underscore
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
|
|
43
README.md
43
README.md
|
@ -26,6 +26,8 @@ You can download them from [file hosting on sourceforge.net](https://sourceforge
|
|||
|
||||
Download from project homepage, https://xianyi.github.com/OpenBLAS/, or check out the code
|
||||
using Git from https://github.com/xianyi/OpenBLAS.git.
|
||||
Buildtime parameters can be chosen in Makefile.rule, see there for a short description of each option.
|
||||
Most can also be given directly on the make or cmake command line.
|
||||
|
||||
### Dependencies
|
||||
|
||||
|
@ -101,7 +103,7 @@ The default installation directory is `/opt/OpenBLAS`.
|
|||
|
||||
## Supported CPUs and Operating Systems
|
||||
|
||||
Please read `GotoBLAS_01Readme.txt`.
|
||||
Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by the 2010 GotoBLAS.
|
||||
|
||||
### Additional supported CPUs
|
||||
|
||||
|
@ -109,8 +111,8 @@ Please read `GotoBLAS_01Readme.txt`.
|
|||
|
||||
- **Intel Xeon 56xx (Westmere)**: Used GotoBLAS2 Nehalem codes.
|
||||
- **Intel Sandy Bridge**: Optimized Level-3 and Level-2 BLAS with AVX on x86-64.
|
||||
- **Intel Haswell**: Optimized Level-3 and Level-2 BLAS with AVX2 and FMA on x86-64.
|
||||
- **Intel Skylake**: Optimized Level-3 and Level-2 BLAS with AVX512 and FMA on x86-64.
|
||||
- **Intel Haswell**: Optimized Level-3 and Level-2 BLAS with AVX2 and FMA on x86-64.
|
||||
- **Intel Skylake-X**: Optimized Level-3 and Level-2 BLAS with AVX512 and FMA on x86-64.
|
||||
- **AMD Bobcat**: Used GotoBLAS2 Barcelona codes.
|
||||
- **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thanks to Werner Saar)
|
||||
- **AMD PILEDRIVER**: Uses Bulldozer codes with some optimizations.
|
||||
|
@ -129,8 +131,15 @@ Please read `GotoBLAS_01Readme.txt`.
|
|||
|
||||
#### ARM64
|
||||
|
||||
- **ARMv8**: Experimental
|
||||
- **ARM Cortex-A57**: Experimental
|
||||
- **ARMv8**: Basic ARMV8 with small caches, optimized Level-3 and Level-2 BLAS
|
||||
- **Cortex-A53**: same as ARMV8 (different cpu specifications)
|
||||
- **Cortex A57**: Optimized Level-3 and Level-2 functions
|
||||
- **Cortex A72**: same as A57 (different cpu specifications)
|
||||
- **Cortex A73**: same as A57 (different cpu specifications)
|
||||
- **Falkor**: same as A57 (different cpu specifications)
|
||||
- **ThunderX**: Optimized some Level-1 functions
|
||||
- **ThunderX2T99**: Optimized Level-3 BLAS and parts of Levels 1 and 2
|
||||
- **TSV110**: Optimized some Level-3 helper functions
|
||||
|
||||
#### PPC/PPC64
|
||||
|
||||
|
@ -139,18 +148,34 @@ Please read `GotoBLAS_01Readme.txt`.
|
|||
|
||||
#### IBM zEnterprise System
|
||||
|
||||
- **Z13**: Optimized Level-3 BLAS and Level-1,2 (double precision)
|
||||
- **Z14**: Optimized Level-3 BLAS and Level-1,2 (single precision)
|
||||
- **Z13**: Optimized Level-3 BLAS and Level-1,2
|
||||
- **Z14**: Optimized Level-3 BLAS and (single precision) Level-1,2
|
||||
|
||||
### Support for multiple targets in a single library
|
||||
|
||||
OpenBLAS can be built for multiple targets with runtime detection of the target cpu by specifying DYNAMIC_ARCH=1 in Makefile.rule, on the gmake command line or as -DDYNAMIC_ARCH=TRUE in cmake.
|
||||
For **x86_64**, the list of targets this activates contains Prescott, Core2, Nehalem, Barcelona, Sandybridge, Bulldozer, Piledriver, Steamroller, Excavator, Haswell, Zen, SkylakeX. For cpu generations not included in this list, the corresponding older model is used. If you also specify DYNAMIC_OLDER=1, specific support for Penryn, Dunnington, Opteron, Opteron/SSE3, Bobcat, Atom and Nano is added. Finally, there is an option DYNAMIC_LIST that allows specifying an individual list of targets to include instead of the default.
|
||||
DYNAMIC_ARCH is also supported on **x86**, where it translates to Katmai, Coppermine, Northwood, Prescott, Banias,
|
||||
Core2, Penryn, Dunnington, Nehalem, Athlon, Opteron, Opteron_SSE3, Barcelona, Bobcat, Atom and Nano.
|
||||
On **ARMV8**, it enables support for CortexA53, CortexA57, CortexA72, CortexA73, Falkor, ThunderX, ThunderX2T99, TSV110 as well as generic ARMV8 cpus.
|
||||
For **POWER**, the list encompasses POWER6, POWER8 and POWER9, on **ZARCH** it comprises Z13 and Z14.
|
||||
The TARGET option can be used in conjunction with DYNAMIC_ARCH=1 to specify which cpu model should be assumed for all the
|
||||
common code in the library; usually you will want to set this to the oldest model you expect to encounter.
|
||||
Please note that it is not possible to combine support for different architectures, so no combined 32 and 64 bit or x86_64 and arm64 in the same library.
|
||||
|
||||
### Supported OS
|
||||
|
||||
- **GNU/Linux**
|
||||
- **MinGW or Visual Studio (CMake)/Windows**: Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-in-Microsoft-Visual-Studio>.
|
||||
- **Darwin/macOS**: Experimental. Although GotoBLAS2 supports Darwin, we are not macOS experts.
|
||||
- **Darwin/macOS/OSX/iOS**: Experimental. Although GotoBLAS2 already supports Darwin, we are not OSX/iOS experts.
|
||||
- **FreeBSD**: Supported by the community. We don't actively test the library on this OS.
|
||||
- **OpenBSD**: Supported by the community. We don't actively test the library on this OS.
|
||||
- **NetBSD**: Supported by the community. We don't actively test the library on this OS.
|
||||
- **DragonFly BSD**: Supported by the community. We don't actively test the library on this OS.
|
||||
- **Android**: Supported by the community. Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-build-OpenBLAS-for-Android>.
|
||||
- **AIX**: Supported on PPC up to POWER8
|
||||
- **Haiku**: Supported by the community. We don't actively test the library on this OS.
|
||||
- **SunOS**: Supported by the community. We don't actively test the library on this OS.
|
||||
|
||||
## Usage
|
||||
|
||||
|
@ -205,7 +230,7 @@ Please see Changelog.txt to view the differences between OpenBLAS and GotoBLAS2
|
|||
* Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture.
|
||||
Clang 3.0 will generate the wrong AVX binary code.
|
||||
* Please use GCC version 6 or LLVM version 6 and above to compile Skylake AVX512 kernels.
|
||||
* The number of CPUs/cores should less than or equal to 256. On Linux `x86_64` (`amd64`),
|
||||
* The number of CPUs/cores should be less than or equal to 256. On Linux `x86_64` (`amd64`),
|
||||
there is experimental support for up to 1024 CPUs/cores and 128 numa nodes if you build
|
||||
the library with `BIGNUMA=1`.
|
||||
* OpenBLAS does not set processor affinity by default.
|
||||
|
|
|
@ -38,7 +38,8 @@ environment:
|
|||
- COMPILER: MinGW64-gcc-7.2.0-mingw
|
||||
DYNAMIC_ARCH: OFF
|
||||
WITH_FORTRAN: ignore
|
||||
- COMPILER: MinGW64-gcc-7.2.0
|
||||
- APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015
|
||||
COMPILER: MinGW-gcc-6.3.0-32
|
||||
- APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015
|
||||
COMPILER: MinGW-gcc-5.3.0
|
||||
WITH_FORTRAN: ignore
|
||||
|
@ -62,10 +63,10 @@ before_build:
|
|||
- set PATH=%PATH:C:\Program Files\Git\usr\bin;=%
|
||||
- if [%COMPILER%]==[MinGW-gcc-5.3.0] set PATH=C:\MinGW\bin;C:\msys64\usr\bin;C:\mingw-w64\x86_64-7.2.0-posix-seh-rt_v5-rev1\mingw64\bin;%PATH%
|
||||
- if [%COMPILER%]==[MinGW64-gcc-7.2.0-mingw] set PATH=C:\MinGW\bin;C:\mingw-w64\x86_64-7.2.0-posix-seh-rt_v5-rev1\mingw64\bin;%PATH%
|
||||
- if [%COMPILER%]==[MinGW64-gcc-7.2.0] set PATH=C:\msys64\usr\bin;C:\mingw-w64\x86_64-7.2.0-posix-seh-rt_v5-rev1\mingw64\bin;%PATH%
|
||||
- if [%COMPILER%]==[MinGW-gcc-6.3.0-32] set PATH=C:\msys64\usr\bin;C:\mingw-w64\i686-6.3.0-posix-dwarf-rt_v5-rev1\mingw64\bin;%PATH%
|
||||
- if [%COMPILER%]==[cl] cmake -G "Visual Studio 15 2017 Win64" ..
|
||||
- if [%COMPILER%]==[MinGW64-gcc-7.2.0-mingw] cmake -G "MinGW Makefiles" -DNOFORTRAN=1 ..
|
||||
- if [%COMPILER%]==[MinGW64-gcc-7.2.0] cmake -G "MSYS Makefiles" -DBINARY=32 -DNOFORTRAN=1 ..
|
||||
- if [%COMPILER%]==[MinGW-gcc-6.3.0-32] cmake -G "MSYS Makefiles" -DNOFORTRAN=1 ..
|
||||
- if [%COMPILER%]==[MinGW-gcc-5.3.0] cmake -G "MSYS Makefiles" -DNOFORTRAN=1 ..
|
||||
- if [%WITH_FORTRAN%]==[no] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DMSVC_STATIC_CRT=ON ..
|
||||
- if [%WITH_FORTRAN%]==[yes] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DBUILD_WITHOUT_LAPACK=no -DNOFORTRAN=0 ..
|
||||
|
|
|
@ -197,7 +197,7 @@ int main(int argc, char *argv[]){
|
|||
fprintf(stderr, " %6dx%d : ", (int)m,(int)n);
|
||||
for(j = 0; j < m; j++){
|
||||
for(i = 0; i < n * COMPSIZE; i++){
|
||||
a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
a[j + i * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -208,7 +208,7 @@ int main(int argc, char *argv[]){
|
|||
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
}
|
||||
|
||||
for(i = 0; i < n * COMPSIZE * abs(inc_y); i++){
|
||||
for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
|
||||
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
}
|
||||
gettimeofday( &start, (struct timezone *)0);
|
||||
|
@ -234,7 +234,7 @@ int main(int argc, char *argv[]){
|
|||
fprintf(stderr, " %6dx%d : ", (int)m,(int)n);
|
||||
for(j = 0; j < m; j++){
|
||||
for(i = 0; i < n * COMPSIZE; i++){
|
||||
a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
a[j + i * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -245,7 +245,7 @@ int main(int argc, char *argv[]){
|
|||
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
}
|
||||
|
||||
for(i = 0; i < n * COMPSIZE * abs(inc_y); i++){
|
||||
for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
|
||||
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
}
|
||||
gettimeofday( &start, (struct timezone *)0);
|
||||
|
|
25
c_check
25
c_check
|
@ -188,13 +188,13 @@ if (($architecture eq "mips") || ($architecture eq "mips64")) {
|
|||
if ($@){
|
||||
warn "could not load PERL module File::Temp, so could not check MSA capatibility";
|
||||
} else {
|
||||
$tmpf = new File::Temp( UNLINK => 1 );
|
||||
$tmpf = new File::Temp( SUFFIX => '.c' , UNLINK => 1 );
|
||||
$code = '"addvi.b $w0, $w1, 1"';
|
||||
$msa_flags = "-mmsa -mfp64 -msched-weight -mload-store-pairs";
|
||||
print $tmpf "#include <msa.h>\n\n";
|
||||
print $tmpf "void main(void){ __asm__ volatile($code); }\n";
|
||||
|
||||
$args = "$msa_flags -o $tmpf.o -x c $tmpf";
|
||||
$args = "$msa_flags -o $tmpf.o $tmpf";
|
||||
my @cmd = ("$compiler_name $args");
|
||||
system(@cmd) == 0;
|
||||
if ($? != 0) {
|
||||
|
@ -229,10 +229,13 @@ if (($architecture eq "x86") || ($architecture eq "x86_64")) {
|
|||
$no_avx512 = 0;
|
||||
} else {
|
||||
# $tmpf = new File::Temp( UNLINK => 1 );
|
||||
($fh,$tmpf) = tempfile( UNLINK => 1 );
|
||||
($fh,$tmpf) = tempfile( SUFFIX => '.c' , UNLINK => 1 );
|
||||
$code = '"vbroadcastss -4 * 4(%rsi), %zmm2"';
|
||||
print $tmpf "#include <immintrin.h>\n\nint main(void){ __asm__ volatile($code); }\n";
|
||||
$args = " -march=skylake-avx512 -c -o $tmpf.o -x c $tmpf";
|
||||
$args = " -march=skylake-avx512 -c -o $tmpf.o $tmpf";
|
||||
if ($compiler eq "PGI") {
|
||||
$args = " -tp skylake -c -o $tmpf.o $tmpf";
|
||||
}
|
||||
my @cmd = ("$compiler_name $args >/dev/null 2>/dev/null");
|
||||
system(@cmd) == 0;
|
||||
if ($? != 0) {
|
||||
|
@ -260,6 +263,19 @@ if ($architecture ne $hostarch) {
|
|||
|
||||
$cross = 1 if ($os ne $hostos);
|
||||
|
||||
# rework cross suffix and architecture if we are on OSX cross-compiling for ARMV8-based IOS
|
||||
# the initial autodetection will have been confused by the command-line arguments to clang
|
||||
# and the cross-compiler apparently still claims to build for x86_64 in its CC -E output
|
||||
if (($os eq "Darwin") && ($cross_suffix ne "")) {
|
||||
my $tmpnam = `xcrun --sdk iphoneos --find clang`;
|
||||
$cross_suffix = substr($tmpnam, 0, rindex($tmpnam, "/")+1 );
|
||||
# this should produce something like $cross_suffix="/Applications/Xcode-10.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/";
|
||||
$cross =1;
|
||||
$architecture = arm64;
|
||||
}
|
||||
|
||||
|
||||
|
||||
$openmp = "" if $ENV{USE_OPENMP} != 1;
|
||||
|
||||
$linker_L = "";
|
||||
|
@ -305,6 +321,7 @@ $linker_a = "";
|
|||
&& ($flags !~ /kernel32/)
|
||||
&& ($flags !~ /advapi32/)
|
||||
&& ($flags !~ /shell32/)
|
||||
&& ($flags !~ /omp/)
|
||||
) {
|
||||
$linker_l .= $flags . " "
|
||||
}
|
||||
|
|
|
@ -45,7 +45,11 @@ endif ()
|
|||
|
||||
if (DYNAMIC_ARCH)
|
||||
if (ARM64)
|
||||
set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99)
|
||||
set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110)
|
||||
endif ()
|
||||
|
||||
if (POWER)
|
||||
set(DYNAMIC_CORE POWER6 POWER8 POWER9)
|
||||
endif ()
|
||||
|
||||
if (X86)
|
||||
|
@ -73,7 +77,7 @@ if (DYNAMIC_ARCH)
|
|||
endif ()
|
||||
if (NOT NO_AVX512)
|
||||
set(DYNAMIC_CORE ${DYNAMIC_CORE} SKYLAKEX)
|
||||
string(REGEX REPLACE "-march=native" "" CMAKE_C_FLAGS ${CMAKE_C_FLAGS})
|
||||
string(REGEX REPLACE "-march=native" "" CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
|
||||
endif ()
|
||||
if (DYNAMIC_LIST)
|
||||
set(DYNAMIC_CORE PRESCOTT ${DYNAMIC_LIST})
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
## Description: Ported from portion of OpenBLAS/Makefile.system
|
||||
## Sets C related variables.
|
||||
|
||||
if (${CMAKE_C_COMPILER} STREQUAL "GNU" OR ${CMAKE_C_COMPILER} STREQUAL "LSB" OR ${CMAKE_C_COMPILER} STREQUAL "Clang")
|
||||
if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "LSB" OR ${CMAKE_C_COMPILER_ID} MATCHES "Clang")
|
||||
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -Wall")
|
||||
set(COMMON_PROF "${COMMON_PROF} -fno-inline")
|
||||
|
@ -43,7 +43,7 @@ if (${CMAKE_C_COMPILER} STREQUAL "GNU" OR ${CMAKE_C_COMPILER} STREQUAL "LSB" OR
|
|||
endif ()
|
||||
endif ()
|
||||
|
||||
if (${CMAKE_C_COMPILER} STREQUAL "PGI")
|
||||
if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI")
|
||||
if (BINARY64)
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -tp p7-64")
|
||||
else ()
|
||||
|
@ -51,7 +51,7 @@ if (${CMAKE_C_COMPILER} STREQUAL "PGI")
|
|||
endif ()
|
||||
endif ()
|
||||
|
||||
if (${CMAKE_C_COMPILER} STREQUAL "PATHSCALE")
|
||||
if (${CMAKE_C_COMPILER_ID} STREQUAL "PATHSCALE")
|
||||
if (BINARY64)
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -m64")
|
||||
else ()
|
||||
|
@ -59,7 +59,7 @@ if (${CMAKE_C_COMPILER} STREQUAL "PATHSCALE")
|
|||
endif ()
|
||||
endif ()
|
||||
|
||||
if (${CMAKE_C_COMPILER} STREQUAL "OPEN64")
|
||||
if (${CMAKE_C_COMPILER_ID} STREQUAL "OPEN64")
|
||||
|
||||
if (MIPS64)
|
||||
|
||||
|
@ -87,7 +87,7 @@ if (${CMAKE_C_COMPILER} STREQUAL "OPEN64")
|
|||
endif ()
|
||||
endif ()
|
||||
|
||||
if (${CMAKE_C_COMPILER} STREQUAL "SUN")
|
||||
if (${CMAKE_C_COMPILER_ID} STREQUAL "SUN")
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -w")
|
||||
if (X86)
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -m32")
|
||||
|
@ -96,3 +96,10 @@ if (${CMAKE_C_COMPILER} STREQUAL "SUN")
|
|||
endif ()
|
||||
endif ()
|
||||
|
||||
# For a fixed (non-DYNAMIC_ARCH) SKYLAKEX build with AVX512 not disabled,
# append -march=skylake-avx512 to the common C compiler options.
if (${CORE} STREQUAL "SKYLAKEX")
|
||||
if (NOT DYNAMIC_ARCH)
|
||||
if (NOT NO_AVX512)
|
||||
# set() takes no '=' sign: a stray '=' would be stored as the first list
# element of CCOMMON_OPT and corrupt the flags passed to the compiler.
set (CCOMMON_OPT "${CCOMMON_OPT} -march=skylake-avx512")
|
||||
endif ()
|
||||
endif ()
|
||||
endif ()
|
||||
|
|
|
@ -115,7 +115,9 @@ set(SLASRC
|
|||
stplqt.f stplqt2.f stpmlqt.f
|
||||
ssytrd_2stage.f ssytrd_sy2sb.f ssytrd_sb2st.F ssb2st_kernels.f
|
||||
ssyevd_2stage.f ssyev_2stage.f ssyevx_2stage.f ssyevr_2stage.f
|
||||
ssbev_2stage.f ssbevx_2stage.f ssbevd_2stage.f ssygv_2stage.f)
|
||||
ssbev_2stage.f ssbevx_2stage.f ssbevd_2stage.f ssygv_2stage.f
|
||||
scombssq.f sgesvdq.f slaorhr_col_getrfnp.f
|
||||
slaorhr_col_getrfnp2.f sorgtsqr.f sorhr_col.f )
|
||||
|
||||
set(SXLASRC sgesvxx.f sgerfsx.f sla_gerfsx_extended.f sla_geamv.f
|
||||
sla_gercond.f sla_gerpvgrw.f ssysvxx.f ssyrfsx.f
|
||||
|
@ -210,7 +212,9 @@ set(CLASRC
|
|||
ctplqt.f ctplqt2.f ctpmlqt.f
|
||||
chetrd_2stage.f chetrd_he2hb.f chetrd_hb2st.F chb2st_kernels.f
|
||||
cheevd_2stage.f cheev_2stage.f cheevx_2stage.f cheevr_2stage.f
|
||||
chbev_2stage.f chbevx_2stage.f chbevd_2stage.f chegv_2stage.f)
|
||||
chbev_2stage.f chbevx_2stage.f chbevd_2stage.f chegv_2stage.f
|
||||
cgesvdq.f claunhr_col_getrfnp.f claunhr_col_getrfnp2.f
|
||||
cungtsqr.f cunhr_col.f )
|
||||
|
||||
set(CXLASRC cgesvxx.f cgerfsx.f cla_gerfsx_extended.f cla_geamv.f
|
||||
cla_gercond_c.f cla_gercond_x.f cla_gerpvgrw.f
|
||||
|
@ -299,7 +303,9 @@ set(DLASRC
|
|||
dtplqt.f dtplqt2.f dtpmlqt.f
|
||||
dsytrd_2stage.f dsytrd_sy2sb.f dsytrd_sb2st.F dsb2st_kernels.f
|
||||
dsyevd_2stage.f dsyev_2stage.f dsyevx_2stage.f dsyevr_2stage.f
|
||||
dsbev_2stage.f dsbevx_2stage.f dsbevd_2stage.f dsygv_2stage.f)
|
||||
dsbev_2stage.f dsbevx_2stage.f dsbevd_2stage.f dsygv_2stage.f
|
||||
dcombssq.f dgesvdq.f dlaorhr_col_getrfnp.f
|
||||
dlaorhr_col_getrfnp2.f dorgtsqr.f dorhr_col.f )
|
||||
|
||||
set(DXLASRC dgesvxx.f dgerfsx.f dla_gerfsx_extended.f dla_geamv.f
|
||||
dla_gercond.f dla_gerpvgrw.f dsysvxx.f dsyrfsx.f
|
||||
|
@ -398,7 +404,9 @@ set(ZLASRC
|
|||
zgelq.f zlaswlq.f zlamswlq.f zgemlq.f
|
||||
zhetrd_2stage.f zhetrd_he2hb.f zhetrd_hb2st.F zhb2st_kernels.f
|
||||
zheevd_2stage.f zheev_2stage.f zheevx_2stage.f zheevr_2stage.f
|
||||
zhbev_2stage.f zhbevx_2stage.f zhbevd_2stage.f zhegv_2stage.f)
|
||||
zhbev_2stage.f zhbevx_2stage.f zhbevd_2stage.f zhegv_2stage.f
|
||||
zgesvdq.f zlaunhr_col_getrfnp.f zlaunhr_col_getrfnp2.f
|
||||
zungtsqr.f zunhr_col.f)
|
||||
|
||||
set(ZXLASRC zgesvxx.f zgerfsx.f zla_gerfsx_extended.f zla_geamv.f
|
||||
zla_gercond_c.f zla_gercond_x.f zla_gerpvgrw.f zsysvxx.f zsyrfsx.f
|
||||
|
|
|
@ -715,6 +715,8 @@ set(DSRC
|
|||
lapacke_dgesv_work.c
|
||||
lapacke_dgesvd.c
|
||||
lapacke_dgesvd_work.c
|
||||
lapacke_dgesvdq.c
|
||||
lapacke_dgesvdq_work.c
|
||||
lapacke_dgesvdx.c
|
||||
lapacke_dgesvdx_work.c
|
||||
lapacke_dgesvj.c
|
||||
|
@ -1287,6 +1289,8 @@ set(SSRC
|
|||
lapacke_sgesv_work.c
|
||||
lapacke_sgesvd.c
|
||||
lapacke_sgesvd_work.c
|
||||
lapacke_sgesvdq.c
|
||||
lapacke_sgesvdq_work.c
|
||||
lapacke_sgesvdx.c
|
||||
lapacke_sgesvdx_work.c
|
||||
lapacke_sgesvj.c
|
||||
|
@ -1853,6 +1857,8 @@ set(ZSRC
|
|||
lapacke_zgesv_work.c
|
||||
lapacke_zgesvd.c
|
||||
lapacke_zgesvd_work.c
|
||||
lapacke_zgesvdq.c
|
||||
lapacke_zgesvdq_work.c
|
||||
lapacke_zgesvdx.c
|
||||
lapacke_zgesvdx_work.c
|
||||
lapacke_zgesvj.c
|
||||
|
|
|
@ -105,8 +105,39 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS
|
|||
# Perhaps this should be inside a different file as it grows larger
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define ${TCORE}\n"
|
||||
"#define CORE_${TCORE}\n"
|
||||
"#define CHAR_CORENAME \"${TCORE}\"\n")
|
||||
if ("${TCORE}" STREQUAL "ARMV7")
|
||||
if ("${TCORE}" STREQUAL "CORE2")
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define L1_DATA_SIZE\t32768\n"
|
||||
"#define L1_DATA_LINESIZE\t64\n"
|
||||
"#define L2_SIZE\t1048576\n"
|
||||
"#define L2_LINESIZE\t64\n"
|
||||
"#define DTB_DEFAULT_ENTRIES\t256\n"
|
||||
"#define DTB_SIZE\t4096\n"
|
||||
"#define HAVE_CMOV\n"
|
||||
"#define HAVE_MMX\n"
|
||||
"#define HAVE_SSE\n"
|
||||
"#define HAVE_SSE2\n"
|
||||
"#define HAVE_SSE3\n"
|
||||
"#define HAVE_SSSE3\n"
|
||||
"#define SLOCAL_BUFFER_SIZE\t16384\n"
|
||||
"#define DLOCAL_BUFFER_SIZE\t16384\n"
|
||||
"#define CLOCAL_BUFFER_SIZE\t16384\n"
|
||||
"#define ZLOCAL_BUFFER_SIZE\t16384\n")
|
||||
set(SGEMM_UNROLL_M 8)
|
||||
set(SGEMM_UNROLL_N 4)
|
||||
set(DGEMM_UNROLL_M 4)
|
||||
set(DGEMM_UNROLL_N 4)
|
||||
set(CGEMM_UNROLL_M 4)
|
||||
set(CGEMM_UNROLL_N 2)
|
||||
set(ZGEMM_UNROLL_M 2)
|
||||
set(ZGEMM_UNROLL_N 2)
|
||||
set(CGEMM3M_UNROLL_M 8)
|
||||
set(CGEMM3M_UNROLL_N 4)
|
||||
set(ZGEMM3M_UNROLL_M 4)
|
||||
set(ZGEMM3M_UNROLL_N 4)
|
||||
elseif ("${TCORE}" STREQUAL "ARMV7")
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define L1_DATA_SIZE\t65536\n"
|
||||
"#define L1_DATA_LINESIZE\t32\n"
|
||||
|
@ -121,6 +152,10 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS
|
|||
set(SGEMM_UNROLL_N 4)
|
||||
set(DGEMM_UNROLL_M 4)
|
||||
set(DGEMM_UNROLL_N 4)
|
||||
set(CGEMM_UNROLL_M 2)
|
||||
set(CGEMM_UNROLL_N 2)
|
||||
set(ZGEMM_UNROLL_M 2)
|
||||
set(ZGEMM_UNROLL_N 2)
|
||||
elseif ("${TCORE}" STREQUAL "ARMV8")
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define L1_DATA_SIZE\t32768\n"
|
||||
|
@ -274,6 +309,83 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS
|
|||
set(ZGEMM_UNROLL_M 4)
|
||||
set(ZGEMM_UNROLL_N 4)
|
||||
set(SYMV_P 16)
|
||||
elseif ("${TCORE}" STREQUAL "TSV110")
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define ARMV8\n"
|
||||
"#define L1_CODE_SIZE\t65536\n"
|
||||
"#define L1_CODE_LINESIZE\t64\n"
|
||||
"#define L1_CODE_ASSOCIATIVE\t4\n"
|
||||
"#define L1_DATA_SIZE\t65536\n"
|
||||
"#define L1_DATA_LINESIZE\t64\n"
|
||||
"#define L1_DATA_ASSOCIATIVE\t4\n"
|
||||
"#define L2_SIZE\t524288\n"
|
||||
"#define L2_LINESIZE\t64\n"
|
||||
"#define L2_ASSOCIATIVE\t8\n"
|
||||
"#define DTB_DEFAULT_ENTRIES\t64\n"
|
||||
"#define DTB_SIZE\t4096\n")
|
||||
set(SGEMM_UNROLL_M 16)
|
||||
set(SGEMM_UNROLL_N 4)
|
||||
set(DGEMM_UNROLL_M 8)
|
||||
set(DGEMM_UNROLL_N 4)
|
||||
set(CGEMM_UNROLL_M 8)
|
||||
set(CGEMM_UNROLL_N 4)
|
||||
set(ZGEMM_UNROLL_M 4)
|
||||
set(ZGEMM_UNROLL_N 4)
|
||||
set(SYMV_P 16)
|
||||
elseif ("${TCORE}" STREQUAL "POWER6")
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define L1_DATA_SIZE 32768\n"
|
||||
"#define L1_DATA_LINESIZE 128\n"
|
||||
"#define L2_SIZE 524288\n"
|
||||
"#define L2_LINESIZE 128 \n"
|
||||
"#define DTB_DEFAULT_ENTRIES 128\n"
|
||||
"#define DTB_SIZE 4096\n"
|
||||
"#define L2_ASSOCIATIVE 8\n")
|
||||
set(SGEMM_UNROLL_M 4)
|
||||
set(SGEMM_UNROLL_N 4)
|
||||
set(DGEMM_UNROLL_M 4)
|
||||
set(DGEMM_UNROLL_N 4)
|
||||
set(CGEMM_UNROLL_M 2)
|
||||
set(CGEMM_UNROLL_N 4)
|
||||
set(ZGEMM_UNROLL_M 2)
|
||||
set(ZGEMM_UNROLL_N 4)
|
||||
set(SYMV_P 8)
|
||||
elseif ("${TCORE}" STREQUAL "POWER8")
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define L1_DATA_SIZE 32768\n"
|
||||
"#define L1_DATA_LINESIZE 128\n"
|
||||
"#define L2_SIZE 524288\n"
|
||||
"#define L2_LINESIZE 128 \n"
|
||||
"#define DTB_DEFAULT_ENTRIES 128\n"
|
||||
"#define DTB_SIZE 4096\n"
|
||||
"#define L2_ASSOCIATIVE 8\n")
|
||||
set(SGEMM_UNROLL_M 16)
|
||||
set(SGEMM_UNROLL_N 8)
|
||||
set(DGEMM_UNROLL_M 16)
|
||||
set(DGEMM_UNROLL_N 4)
|
||||
set(CGEMM_UNROLL_M 8)
|
||||
set(CGEMM_UNROLL_N 4)
|
||||
set(ZGEMM_UNROLL_M 8)
|
||||
set(ZGEMM_UNROLL_N 2)
|
||||
set(SYMV_P 8)
|
||||
elseif ("${TCORE}" STREQUAL "POWER9")
|
||||
file(APPEND ${TARGET_CONF_TEMP}
|
||||
"#define L1_DATA_SIZE 32768\n"
|
||||
"#define L1_DATA_LINESIZE 128\n"
|
||||
"#define L2_SIZE 524288\n"
|
||||
"#define L2_LINESIZE 128 \n"
|
||||
"#define DTB_DEFAULT_ENTRIES 128\n"
|
||||
"#define DTB_SIZE 4096\n"
|
||||
"#define L2_ASSOCIATIVE 8\n")
|
||||
set(SGEMM_UNROLL_M 16)
|
||||
set(SGEMM_UNROLL_N 8)
|
||||
set(DGEMM_UNROLL_M 16)
|
||||
set(DGEMM_UNROLL_N 4)
|
||||
set(CGEMM_UNROLL_M 8)
|
||||
set(CGEMM_UNROLL_N 4)
|
||||
set(ZGEMM_UNROLL_M 8)
|
||||
set(ZGEMM_UNROLL_N 2)
|
||||
set(SYMV_P 8)
|
||||
endif()
|
||||
|
||||
# Or should this actually be NUM_CORES?
|
||||
|
@ -309,6 +421,9 @@ else(NOT CMAKE_CROSSCOMPILING)
|
|||
set(GETARCH_FLAGS ${GETARCH_FLAGS} -DFORCE_GENERIC)
|
||||
else()
|
||||
list(APPEND GETARCH_SRC ${PROJECT_SOURCE_DIR}/cpuid.S)
|
||||
if (DEFINED TARGET_CORE)
|
||||
set(GETARCH_FLAGS ${GETARCH_FLAGS} -DFORCE_${TARGET_CORE})
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
if ("${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore")
|
||||
|
|
|
@ -66,7 +66,7 @@ if (DEFINED TARGET)
|
|||
endif ()
|
||||
|
||||
# On x86_64 build getarch with march=native. This is required to detect AVX512 support in getarch.
|
||||
if (X86_64)
|
||||
if (X86_64 AND NOT ${CMAKE_C_COMPILER_ID} STREQUAL "PGI")
|
||||
set(GETARCH_FLAGS "${GETARCH_FLAGS} -march=native")
|
||||
endif ()
|
||||
|
||||
|
|
|
@ -39,10 +39,18 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc.*|power.*|Power.*")
|
|||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "mips64.*")
|
||||
set(MIPS64 1)
|
||||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*")
|
||||
if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
|
||||
set(X86_64 1)
|
||||
if (NOT BINARY)
|
||||
if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
|
||||
set(X86_64 1)
|
||||
else()
|
||||
set(X86 1)
|
||||
endif()
|
||||
else()
|
||||
set(X86 1)
|
||||
if (${BINARY} EQUAL "64")
|
||||
set(X86_64 1)
|
||||
else ()
|
||||
set(X86 1)
|
||||
endif()
|
||||
endif()
|
||||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "i686.*|i386.*|x86.*|amd64.*|AMD64.*")
|
||||
set(X86 1)
|
||||
|
@ -54,6 +62,22 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64.*|AARCH64.*)")
|
|||
else()
|
||||
set(ARM 1)
|
||||
endif()
|
||||
elseif (${CMAKE_CROSSCOMPILING})
|
||||
if (${TARGET} STREQUAL "CORE2")
|
||||
if (NOT BINARY)
|
||||
set(X86 1)
|
||||
elseif (${BINARY} EQUAL "64")
|
||||
set(X86_64 1)
|
||||
else ()
|
||||
set(X86 1)
|
||||
endif()
|
||||
elseif (${TARGET} STREQUAL "ARMV7")
|
||||
set(ARM 1)
|
||||
else()
|
||||
set(ARM64 1)
|
||||
endif ()
|
||||
else ()
|
||||
message(WARNING "Target ARCH could not be determined, got \"${CMAKE_SYSTEM_PROCESSOR}\"")
|
||||
endif()
|
||||
|
||||
if (X86_64)
|
||||
|
@ -92,4 +116,3 @@ set (CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX512")
|
|||
endif()
|
||||
file(REMOVE "avx512.tmp" "avx512.o")
|
||||
endif()
|
||||
|
||||
|
|
|
@ -78,7 +78,18 @@ static void __inline blas_lock(volatile BLASULONG *address){
|
|||
|
||||
#define BLAS_LOCK_DEFINED
|
||||
|
||||
/* The counter-based rpcc() below is compiled only when neither OS_DARWIN nor
 * OS_ANDROID is defined.
 * NOTE(review): the reason for excluding those platforms is not visible in
 * this hunk - presumably the counter register is not usable there; confirm. */
#if !defined(OS_DARWIN) && !defined (OS_ANDROID)
|
||||
/* rpcc: return a timestamp value read from the AArch64 cntvct_el0 system
 * register. Takes no arguments; the result is the raw register value as a
 * BLASULONG (0 only if the register itself reads as 0). */
static __inline BLASULONG rpcc(void){
|
||||
BLASULONG ret = 0;
|
||||
|
||||
/* "isb" is issued before the "mrs" read of cntvct_el0; presumably this keeps
 * the counter from being sampled ahead of preceding instructions - confirm
 * against the Arm architecture manual. */
__asm__ __volatile__ ("isb; mrs %0,cntvct_el0":"=r"(ret));
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* Marker macros announcing that rpcc() exists here and (per the name) that
 * its value is 64-bit wide.
 * NOTE(review): the consumers of these macros are outside this hunk - verify. */
#define RPCC_DEFINED
|
||||
#define RPCC64BIT
|
||||
#endif
|
||||
|
||||
static inline int blas_quickdivide(blasint x, blasint y){
|
||||
return x / y;
|
||||
|
@ -103,12 +114,16 @@ static inline int blas_quickdivide(blasint x, blasint y){
|
|||
|
||||
#if defined(ASSEMBLER) && !defined(NEEDPARAM)
|
||||
|
||||
#define PROLOGUE \
|
||||
.text ;\
|
||||
.align 4 ;\
|
||||
.global REALNAME ;\
|
||||
.type REALNAME, %function ;\
|
||||
.macro PROLOGUE
|
||||
.text ;
|
||||
.p2align 2 ;
|
||||
.global REALNAME ;
|
||||
#ifndef __APPLE__
|
||||
.type REALNAME, %function ;
|
||||
#endif
|
||||
REALNAME:
|
||||
.endm
|
||||
|
||||
|
||||
#define EPILOGUE
|
||||
|
||||
|
|
146
common_lapack.h
146
common_lapack.h
|
@ -293,4 +293,150 @@ blasint zlarf_R(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLO
|
|||
blasint xlarf_L(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xlarf_R(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
|
||||
blasint strtrs_UNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint strtrs_UNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint strtrs_UTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint strtrs_UTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint strtrs_LNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint strtrs_LNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint strtrs_LTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint strtrs_LTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint dtrtrs_UNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint dtrtrs_UNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint dtrtrs_UTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint dtrtrs_UTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint dtrtrs_LNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint dtrtrs_LNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint dtrtrs_LTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint dtrtrs_LTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint qtrtrs_UNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint qtrtrs_UNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint qtrtrs_UTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint qtrtrs_UTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint qtrtrs_LNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint qtrtrs_LNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint qtrtrs_LTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint qtrtrs_LTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint ctrtrs_UNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_UNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_UTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_UTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_URU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_URN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_UCU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_UCN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_LNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_LNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_LTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_LTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_LRU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_LRN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_LCU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_LCN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ztrtrs_UNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_UNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_UTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_UTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_URU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_URN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_UCU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_UCN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_LNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_LNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_LTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_LTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_LRU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_LRN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_LCU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_LCN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint xtrtrs_UNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_UNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_UTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_UTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_URU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_URN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_UCU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_UCN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_LNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_LNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_LTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_LTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_LRU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_LRN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_LCU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_LCN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
|
||||
blasint strtrs_UNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint strtrs_UNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint strtrs_UTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint strtrs_UTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint strtrs_LNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint strtrs_LNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint strtrs_LTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint strtrs_LTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint dtrtrs_UNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint dtrtrs_UNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint dtrtrs_UTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint dtrtrs_UTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint dtrtrs_LNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint dtrtrs_LNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint dtrtrs_LTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint dtrtrs_LTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint qtrtrs_UNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint qtrtrs_UNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint qtrtrs_UTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint qtrtrs_UTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint qtrtrs_LNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint qtrtrs_LNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint qtrtrs_LTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint qtrtrs_LTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint ctrtrs_UNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_UNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_UTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_UTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_URU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_URN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_UCU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_UCN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_LNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_LNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_LTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_LTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_LRU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_LRN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_LCU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ctrtrs_LCN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
blasint ztrtrs_UNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_UNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_UTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_UTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_URU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_URN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_UCU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_UCN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_LNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_LNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_LTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_LTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_LRU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_LRN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_LCU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint ztrtrs_LCN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
|
||||
blasint xtrtrs_UNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_UNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_UTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_UTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_URU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_URN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_UCU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_UCN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_LNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_LNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_LTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_LTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_LRU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_LRN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_LCU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
blasint xtrtrs_LCN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
|
||||
#endif
|
||||
|
|
165
common_macro.h
165
common_macro.h
|
@ -641,7 +641,7 @@
|
|||
#define IMATCOPY_K_CT DIMATCOPY_K_CT
|
||||
#define IMATCOPY_K_RT DIMATCOPY_K_RT
|
||||
|
||||
#define GEADD_K DGEADD_K
|
||||
#define GEADD_K DGEADD_K
|
||||
#else
|
||||
|
||||
#define AMAX_K SAMAX_K
|
||||
|
@ -944,7 +944,7 @@
|
|||
#define IMATCOPY_K_CT SIMATCOPY_K_CT
|
||||
#define IMATCOPY_K_RT SIMATCOPY_K_RT
|
||||
|
||||
#define GEADD_K SGEADD_K
|
||||
#define GEADD_K SGEADD_K
|
||||
#endif
|
||||
#else
|
||||
#ifdef XDOUBLE
|
||||
|
@ -1770,7 +1770,7 @@
|
|||
#define IMATCOPY_K_CTC ZIMATCOPY_K_CTC
|
||||
#define IMATCOPY_K_RTC ZIMATCOPY_K_RTC
|
||||
|
||||
#define GEADD_K ZGEADD_K
|
||||
#define GEADD_K ZGEADD_K
|
||||
|
||||
#else
|
||||
|
||||
|
@ -2193,7 +2193,7 @@
|
|||
#define IMATCOPY_K_CTC CIMATCOPY_K_CTC
|
||||
#define IMATCOPY_K_RTC CIMATCOPY_K_RTC
|
||||
|
||||
#define GEADD_K CGEADD_K
|
||||
#define GEADD_K CGEADD_K
|
||||
|
||||
#endif
|
||||
#endif
|
||||
|
@ -2806,3 +2806,160 @@ typedef struct {
|
|||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
#ifndef COMPLEX
|
||||
#ifdef XDOUBLE
|
||||
#define TRTRS_UNU_SINGLE qtrtrs_UNU_single
|
||||
#define TRTRS_UNN_SINGLE qtrtrs_UNN_single
|
||||
#define TRTRS_UTU_SINGLE qtrtrs_UTU_single
|
||||
#define TRTRS_UTN_SINGLE qtrtrs_UTN_single
|
||||
#define TRTRS_LNU_SINGLE qtrtrs_LNU_single
|
||||
#define TRTRS_LNN_SINGLE qtrtrs_LNN_single
|
||||
#define TRTRS_LTU_SINGLE qtrtrs_LTU_single
|
||||
#define TRTRS_LTN_SINGLE qtrtrs_LTN_single
|
||||
#define TRTRS_UNU_PARALLEL qtrtrs_UNU_parallel
|
||||
#define TRTRS_UNN_PARALLEL qtrtrs_UNN_parallel
|
||||
#define TRTRS_UTU_PARALLEL qtrtrs_UTU_parallel
|
||||
#define TRTRS_UTN_PARALLEL qtrtrs_UTN_parallel
|
||||
#define TRTRS_LNU_PARALLEL qtrtrs_LNU_parallel
|
||||
#define TRTRS_LNN_PARALLEL qtrtrs_LNN_parallel
|
||||
#define TRTRS_LTU_PARALLEL qtrtrs_LTU_parallel
|
||||
#define TRTRS_LTN_PARALLEL qtrtrs_LTN_parallel
|
||||
|
||||
#elif defined(DOUBLE)
|
||||
#define TRTRS_UNU_SINGLE dtrtrs_UNU_single
|
||||
#define TRTRS_UNN_SINGLE dtrtrs_UNN_single
|
||||
#define TRTRS_UTU_SINGLE dtrtrs_UTU_single
|
||||
#define TRTRS_UTN_SINGLE dtrtrs_UTN_single
|
||||
#define TRTRS_LNU_SINGLE dtrtrs_LNU_single
|
||||
#define TRTRS_LNN_SINGLE dtrtrs_LNN_single
|
||||
#define TRTRS_LTU_SINGLE dtrtrs_LTU_single
|
||||
#define TRTRS_LTN_SINGLE dtrtrs_LTN_single
|
||||
#define TRTRS_UNU_PARALLEL dtrtrs_UNU_parallel
|
||||
#define TRTRS_UNN_PARALLEL dtrtrs_UNN_parallel
|
||||
#define TRTRS_UTU_PARALLEL dtrtrs_UTU_parallel
|
||||
#define TRTRS_UTN_PARALLEL dtrtrs_UTN_parallel
|
||||
#define TRTRS_LNU_PARALLEL dtrtrs_LNU_parallel
|
||||
#define TRTRS_LNN_PARALLEL dtrtrs_LNN_parallel
|
||||
#define TRTRS_LTU_PARALLEL dtrtrs_LTU_parallel
|
||||
#define TRTRS_LTN_PARALLEL dtrtrs_LTN_parallel
|
||||
#else
|
||||
#define TRTRS_UNU_SINGLE strtrs_UNU_single
|
||||
#define TRTRS_UNN_SINGLE strtrs_UNN_single
|
||||
#define TRTRS_UTU_SINGLE strtrs_UTU_single
|
||||
#define TRTRS_UTN_SINGLE strtrs_UTN_single
|
||||
#define TRTRS_LNU_SINGLE strtrs_LNU_single
|
||||
#define TRTRS_LNN_SINGLE strtrs_LNN_single
|
||||
#define TRTRS_LTU_SINGLE strtrs_LTU_single
|
||||
#define TRTRS_LTN_SINGLE strtrs_LTN_single
|
||||
#define TRTRS_UNU_PARALLEL strtrs_UNU_parallel
|
||||
#define TRTRS_UNN_PARALLEL strtrs_UNN_parallel
|
||||
#define TRTRS_UTU_PARALLEL strtrs_UTU_parallel
|
||||
#define TRTRS_UTN_PARALLEL strtrs_UTN_parallel
|
||||
#define TRTRS_LNU_PARALLEL strtrs_LNU_parallel
|
||||
#define TRTRS_LNN_PARALLEL strtrs_LNN_parallel
|
||||
#define TRTRS_LTU_PARALLEL strtrs_LTU_parallel
|
||||
#define TRTRS_LTN_PARALLEL strtrs_LTN_parallel
|
||||
#endif
|
||||
#else
|
||||
#ifdef XDOUBLE
|
||||
#define TRTRS_UNU_SINGLE xtrtrs_UNU_single
|
||||
#define TRTRS_UNN_SINGLE xtrtrs_UNN_single
|
||||
#define TRTRS_UTU_SINGLE xtrtrs_UTU_single
|
||||
#define TRTRS_UTN_SINGLE xtrtrs_UTN_single
|
||||
#define TRTRS_URU_SINGLE xtrtrs_URU_single
|
||||
#define TRTRS_URN_SINGLE xtrtrs_URN_single
|
||||
#define TRTRS_UCU_SINGLE xtrtrs_UCU_single
|
||||
#define TRTRS_UCN_SINGLE xtrtrs_UCN_single
|
||||
#define TRTRS_LNU_SINGLE xtrtrs_LNU_single
|
||||
#define TRTRS_LNN_SINGLE xtrtrs_LNN_single
|
||||
#define TRTRS_LTU_SINGLE xtrtrs_LTU_single
|
||||
#define TRTRS_LTN_SINGLE xtrtrs_LTN_single
|
||||
#define TRTRS_LRU_SINGLE xtrtrs_LRU_single
|
||||
#define TRTRS_LRN_SINGLE xtrtrs_LRN_single
|
||||
#define TRTRS_LCU_SINGLE xtrtrs_LCU_single
|
||||
#define TRTRS_LCN_SINGLE xtrtrs_LCN_single
|
||||
#define TRTRS_UNU_PARALLEL xtrtrs_UNU_parallel
|
||||
#define TRTRS_UNN_PARALLEL xtrtrs_UNN_parallel
|
||||
#define TRTRS_UTU_PARALLEL xtrtrs_UTU_parallel
|
||||
#define TRTRS_UTN_PARALLEL xtrtrs_UTN_parallel
|
||||
#define TRTRS_URU_PARALLEL xtrtrs_URU_parallel
|
||||
#define TRTRS_URN_PARALLEL xtrtrs_URN_parallel
|
||||
#define TRTRS_UCU_PARALLEL xtrtrs_UCU_parallel
|
||||
#define TRTRS_UCN_PARALLEL xtrtrs_UCN_parallel
|
||||
#define TRTRS_LNU_PARALLEL xtrtrs_LNU_parallel
|
||||
#define TRTRS_LNN_PARALLEL xtrtrs_LNN_parallel
|
||||
#define TRTRS_LTU_PARALLEL xtrtrs_LTU_parallel
|
||||
#define TRTRS_LTN_PARALLEL xtrtrs_LTN_parallel
|
||||
#define TRTRS_LRU_PARALLEL xtrtrs_LRU_parallel
|
||||
#define TRTRS_LRN_PARALLEL xtrtrs_LRN_parallel
|
||||
#define TRTRS_LCU_PARALLEL xtrtrs_LCU_parallel
|
||||
#define TRTRS_LCN_PARALLEL xtrtrs_LCN_parallel
|
||||
#elif defined(DOUBLE)
|
||||
#define TRTRS_UNU_SINGLE ztrtrs_UNU_single
|
||||
#define TRTRS_UNN_SINGLE ztrtrs_UNN_single
|
||||
#define TRTRS_UTU_SINGLE ztrtrs_UTU_single
|
||||
#define TRTRS_UTN_SINGLE ztrtrs_UTN_single
|
||||
#define TRTRS_URU_SINGLE ztrtrs_URU_single
|
||||
#define TRTRS_URN_SINGLE ztrtrs_URN_single
|
||||
#define TRTRS_UCU_SINGLE ztrtrs_UCU_single
|
||||
#define TRTRS_UCN_SINGLE ztrtrs_UCN_single
|
||||
#define TRTRS_LNU_SINGLE ztrtrs_LNU_single
|
||||
#define TRTRS_LNN_SINGLE ztrtrs_LNN_single
|
||||
#define TRTRS_LTU_SINGLE ztrtrs_LTU_single
|
||||
#define TRTRS_LTN_SINGLE ztrtrs_LTN_single
|
||||
#define TRTRS_LRU_SINGLE ztrtrs_LRU_single
|
||||
#define TRTRS_LRN_SINGLE ztrtrs_LRN_single
|
||||
#define TRTRS_LCU_SINGLE ztrtrs_LCU_single
|
||||
#define TRTRS_LCN_SINGLE ztrtrs_LCN_single
|
||||
#define TRTRS_UNU_PARALLEL ztrtrs_UNU_parallel
|
||||
#define TRTRS_UNN_PARALLEL ztrtrs_UNN_parallel
|
||||
#define TRTRS_UTU_PARALLEL ztrtrs_UTU_parallel
|
||||
#define TRTRS_UTN_PARALLEL ztrtrs_UTN_parallel
|
||||
#define TRTRS_URU_PARALLEL ztrtrs_URU_parallel
|
||||
#define TRTRS_URN_PARALLEL ztrtrs_URN_parallel
|
||||
#define TRTRS_UCU_PARALLEL ztrtrs_UCU_parallel
|
||||
#define TRTRS_UCN_PARALLEL ztrtrs_UCN_parallel
|
||||
#define TRTRS_LNU_PARALLEL ztrtrs_LNU_parallel
|
||||
#define TRTRS_LNN_PARALLEL ztrtrs_LNN_parallel
|
||||
#define TRTRS_LTU_PARALLEL ztrtrs_LTU_parallel
|
||||
#define TRTRS_LTN_PARALLEL ztrtrs_LTN_parallel
|
||||
#define TRTRS_LRU_PARALLEL ztrtrs_LRU_parallel
|
||||
#define TRTRS_LRN_PARALLEL ztrtrs_LRN_parallel
|
||||
#define TRTRS_LCU_PARALLEL ztrtrs_LCU_parallel
|
||||
#define TRTRS_LCN_PARALLEL ztrtrs_LCN_parallel
|
||||
#else
|
||||
#define TRTRS_UNU_SINGLE ctrtrs_UNU_single
|
||||
#define TRTRS_UNN_SINGLE ctrtrs_UNN_single
|
||||
#define TRTRS_UTU_SINGLE ctrtrs_UTU_single
|
||||
#define TRTRS_UTN_SINGLE ctrtrs_UTN_single
|
||||
#define TRTRS_URU_SINGLE ctrtrs_URU_single
|
||||
#define TRTRS_URN_SINGLE ctrtrs_URN_single
|
||||
#define TRTRS_UCU_SINGLE ctrtrs_UCU_single
|
||||
#define TRTRS_UCN_SINGLE ctrtrs_UCN_single
|
||||
#define TRTRS_LNU_SINGLE ctrtrs_LNU_single
|
||||
#define TRTRS_LNN_SINGLE ctrtrs_LNN_single
|
||||
#define TRTRS_LTU_SINGLE ctrtrs_LTU_single
|
||||
#define TRTRS_LTN_SINGLE ctrtrs_LTN_single
|
||||
#define TRTRS_LRU_SINGLE ctrtrs_LRU_single
|
||||
#define TRTRS_LRN_SINGLE ctrtrs_LRN_single
|
||||
#define TRTRS_LCU_SINGLE ctrtrs_LCU_single
|
||||
#define TRTRS_LCN_SINGLE ctrtrs_LCN_single
|
||||
#define TRTRS_UNU_PARALLEL ctrtrs_UNU_parallel
|
||||
#define TRTRS_UNN_PARALLEL ctrtrs_UNN_parallel
|
||||
#define TRTRS_UTU_PARALLEL ctrtrs_UTU_parallel
|
||||
#define TRTRS_UTN_PARALLEL ctrtrs_UTN_parallel
|
||||
#define TRTRS_URU_PARALLEL ctrtrs_URU_parallel
|
||||
#define TRTRS_URN_PARALLEL ctrtrs_URN_parallel
|
||||
#define TRTRS_UCU_PARALLEL ctrtrs_UCU_parallel
|
||||
#define TRTRS_UCN_PARALLEL ctrtrs_UCN_parallel
|
||||
#define TRTRS_LNU_PARALLEL ctrtrs_LNU_parallel
|
||||
#define TRTRS_LNN_PARALLEL ctrtrs_LNN_parallel
|
||||
#define TRTRS_LTU_PARALLEL ctrtrs_LTU_parallel
|
||||
#define TRTRS_LTN_PARALLEL ctrtrs_LTN_parallel
|
||||
#define TRTRS_LRU_PARALLEL ctrtrs_LRU_parallel
|
||||
#define TRTRS_LRN_PARALLEL ctrtrs_LRN_parallel
|
||||
#define TRTRS_LCU_PARALLEL ctrtrs_LCU_parallel
|
||||
#define TRTRS_LCN_PARALLEL ctrtrs_LCN_parallel
|
||||
#endif
|
||||
#endif
|
||||
|
|
|
@ -39,6 +39,35 @@
|
|||
#ifndef COMMON_POWER
|
||||
#define COMMON_POWER
|
||||
|
||||
#define str(x) #x
|
||||
|
||||
/*
 * Wrappers for VSX doubleword permute/move mnemonics.
 *
 * When OS_AIX is defined, the splat/merge/swap operations are spelled out as
 * xxpermdi with an explicit binary selector and the move as xvcpsgndp;
 * otherwise the xxspltd/xxmrghd/xxmrgld/xxswapd/xvmovdp mnemonics are emitted
 * directly.
 * NOTE(review): presumably the AIX assembler does not accept the extended
 * mnemonics - confirm.
 *
 * The plain macros expand to bare instruction text; the *_S variants expand
 * to string literals terminated by " \n\t" (built with the str() stringizer
 * defined above), presumably for use inside inline-asm templates.
 * No XVMOVDP_S variant is defined in either branch.
 */
#ifdef OS_AIX
|
||||
#define XXSPLTD(T,A,z) xxpermdi T, A, A, 0b##z##z
|
||||
#define XXMRGHD(T,A,B) xxpermdi T, A, B, 0b00
|
||||
#define XXMRGLD(T,A,B) xxpermdi T, A, B, 0b11
|
||||
#define XXSWAPD(T,A) xxpermdi T, A, A, 0b10
|
||||
#define XVMOVDP(T,A) xvcpsgndp T, A, A
|
||||
|
||||
/* String-literal forms of the xxpermdi spellings above. */
#define XXSPLTD_S(T,A,z) "xxpermdi " str(T) ", " str(A) ", " str(A) ", 0b" str(z ## z) " \n\t"
|
||||
#define XXMRGHD_S(T,A,B) "xxpermdi " str(T) ", " str(A) ", " str(B) ", 0b00 \n\t"
|
||||
#define XXMRGLD_S(T,A,B) "xxpermdi " str(T) ", " str(A) ", " str(B) ", 0b11 \n\t"
|
||||
#define XXSWAPD_S(T,A) "xxpermdi " str(T) ", " str(A) ", " str(A) ", 0b10 \n\t"
|
||||
|
||||
#else
|
||||
#define XXSPLTD(T,A,z) xxspltd T, A, z
|
||||
#define XXMRGHD(T,A,B) xxmrghd T, A, B
|
||||
#define XXMRGLD(T,A,B) xxmrgld T, A, B
|
||||
#define XXSWAPD(T,A) xxswapd T, A
|
||||
#define XVMOVDP(T,A) xvmovdp T, A
|
||||
|
||||
/* String-literal forms of the direct mnemonics above. */
#define XXSPLTD_S(T,A,z) "xxspltd " str(T) ", " str(A) ", " str(z)" \n\t"
|
||||
#define XXMRGHD_S(T,A,B) "xxmrghd " str(T) ", " str(A) ", " str(B)" \n\t"
|
||||
#define XXMRGLD_S(T,A,B) "xxmrgld " str(T) ", " str(A) ", " str(B)" \n\t"
|
||||
#define XXSWAPD_S(T,A) "xxswapd " str(T) ", " str(A) " \n\t"
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
#if defined(POWER8) || defined(POWER9)
|
||||
#define MB __asm__ __volatile__ ("eieio":::"memory")
|
||||
#define WMB __asm__ __volatile__ ("eieio":::"memory")
|
||||
|
@ -241,7 +270,7 @@ static inline int blas_quickdivide(blasint x, blasint y){
|
|||
#define HAVE_PREFETCH
|
||||
#endif
|
||||
|
||||
#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8) || defined(POWER9) || ( defined(PPC970) && ( defined(OS_DARWIN) || defined(OS_FREEBSD) ) )
|
||||
#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8) || defined(POWER9) || defined(PPC970)
|
||||
#define DCBT_ARG 0
|
||||
#else
|
||||
#define DCBT_ARG 8
|
||||
|
|
|
@ -194,10 +194,6 @@ int trsm_thread(int mode, BLASLONG m, BLASLONG n,
|
|||
|
||||
int syrk_thread(int mode, blas_arg_t *, BLASLONG *, BLASLONG *, int (*function)(), void *, void *, BLASLONG);
|
||||
|
||||
int beta_thread(int mode, BLASLONG m, BLASLONG n,
|
||||
double alpha_r, double alpha_i,
|
||||
void *c, BLASLONG ldc, int (*fuction)());
|
||||
|
||||
int getrf_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k,
|
||||
void *offsetA, BLASLONG lda,
|
||||
void *offsetB, BLASLONG jb,
|
||||
|
|
|
@ -206,6 +206,33 @@ void get_subdirname(void)
|
|||
printf("arm64");
|
||||
}
|
||||
|
||||
/*
 * Print a "#define NUM_CORES <n>" line to stdout, where <n> is the number of
 * lines in /proc/cpuinfo that begin with "processor".
 *
 * Only active on Linux builds; on other systems the function prints nothing.
 * If /proc/cpuinfo cannot be opened, nothing is printed either, rather than
 * passing a NULL stream to fgets()/fclose().
 */
void get_cpucount(void)
{
/* "linux" is not predefined in strict ISO modes (-std=c99/c11), so also
   accept the reserved-namespace spelling. */
#if defined(linux) || defined(__linux__)
	FILE *infile;
	char buffer[2048];
	int n = 0;

	infile = fopen("/proc/cpuinfo", "r");
	if (infile == NULL)
		return;		/* no procfs available: leave NUM_CORES undefined */

	while (fgets(buffer, sizeof(buffer), infile)) {
		/* one "processor : N" entry per logical CPU */
		if (!strncmp("processor", buffer, 9))
			n++;
	}

	fclose(infile);

	printf("#define NUM_CORES %d\n", n);
#endif
}
|
||||
|
||||
|
||||
|
||||
void get_cpuconfig(void)
|
||||
{
|
||||
|
||||
|
@ -309,6 +336,7 @@ void get_cpuconfig(void)
|
|||
printf("#define DTB_SIZE 4096 \n");
|
||||
break;
|
||||
}
|
||||
get_cpucount();
|
||||
}
|
||||
|
||||
|
||||
|
@ -351,5 +379,3 @@ void get_features(void)
|
|||
#endif
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
|
|
18
cpuid_x86.c
18
cpuid_x86.c
|
@ -1197,7 +1197,11 @@ int get_cpuname(void){
|
|||
case 3:
|
||||
case 5:
|
||||
case 6:
|
||||
#if defined(__x86_64__) || defined(__amd64__)
|
||||
return CPUTYPE_CORE2;
|
||||
#else
|
||||
return CPUTYPE_PENTIUM2;
|
||||
#endif
|
||||
case 7:
|
||||
case 8:
|
||||
case 10:
|
||||
|
@ -1379,6 +1383,8 @@ int get_cpuname(void){
|
|||
break;
|
||||
case 7: // family 6 exmodel 7
|
||||
switch (model) {
|
||||
case 10: // Goldmont Plus
|
||||
return CPUTYPE_NEHALEM;
|
||||
case 14: // Ice Lake
|
||||
if(support_avx512())
|
||||
return CPUTYPE_SKYLAKEX;
|
||||
|
@ -1425,7 +1431,11 @@ int get_cpuname(void){
|
|||
case 0x5:
|
||||
return CPUTYPE_AMDK6;
|
||||
case 0x6:
|
||||
#if defined(__x86_64__) || defined(__amd64__)
|
||||
return CPUTYPE_BARCELONA;
|
||||
#else
|
||||
return CPUTYPE_ATHLON;
|
||||
#endif
|
||||
case 0xf:
|
||||
switch (exfamily) {
|
||||
case 0:
|
||||
|
@ -1808,7 +1818,11 @@ int get_coretype(void){
|
|||
case 4:
|
||||
case 5:
|
||||
case 6:
|
||||
#if defined(__x86_64__) || defined(__amd64__)
|
||||
return CORE_CORE2;
|
||||
#else
|
||||
return CORE_P6;
|
||||
#endif
|
||||
case 7:
|
||||
return CORE_KATMAI;
|
||||
case 8:
|
||||
|
@ -2015,7 +2029,11 @@ int get_coretype(void){
|
|||
|
||||
if (vendor == VENDOR_AMD){
|
||||
if (family <= 0x5) return CORE_80486;
|
||||
#if defined(__x86_64__) || defined(__amd64__)
|
||||
if (family <= 0xe) return CORE_BARCELONA;
|
||||
#else
|
||||
if (family <= 0xe) return CORE_ATHLON;
|
||||
#endif
|
||||
if (family == 0xf){
|
||||
if ((exfamily == 0) || (exfamily == 2)) return CORE_OPTERON;
|
||||
else if (exfamily == 5) return CORE_BOBCAT;
|
||||
|
|
|
@ -30,17 +30,20 @@
|
|||
#define CPU_GENERIC 0
|
||||
#define CPU_Z13 1
|
||||
#define CPU_Z14 2
|
||||
#define CPU_Z15 3
|
||||
|
||||
static char *cpuname[] = {
|
||||
"ZARCH_GENERIC",
|
||||
"Z13",
|
||||
"Z14"
|
||||
"Z14",
|
||||
"Z15"
|
||||
};
|
||||
|
||||
static char *cpuname_lower[] = {
|
||||
"zarch_generic",
|
||||
"z13",
|
||||
"z14"
|
||||
"z14",
|
||||
"z15"
|
||||
};
|
||||
|
||||
int detect(void)
|
||||
|
@ -66,6 +69,8 @@ int detect(void)
|
|||
if (strstr(p, "2965")) return CPU_Z13;
|
||||
if (strstr(p, "3906")) return CPU_Z14;
|
||||
if (strstr(p, "3907")) return CPU_Z14;
|
||||
if (strstr(p, "8561")) return CPU_Z14; // fallback z15 to z14
|
||||
if (strstr(p, "8562")) return CPU_Z14; // fallback z15 to z14
|
||||
|
||||
return CPU_GENERIC;
|
||||
}
|
||||
|
|
|
@ -1503,6 +1503,8 @@ C $ ' .' )
|
|||
NC = 0
|
||||
RESET = .TRUE.
|
||||
ERRMAX = RZERO
|
||||
RALS = RONE
|
||||
RBETS = RONE
|
||||
*
|
||||
DO 100 IN = 1, NIDIM
|
||||
N = IDIM( IN )
|
||||
|
|
|
@ -1504,6 +1504,8 @@ C $ ' .' )
|
|||
NC = 0
|
||||
RESET = .TRUE.
|
||||
ERRMAX = RZERO
|
||||
RALS = RONE
|
||||
RBETS = RONE
|
||||
*
|
||||
DO 100 IN = 1, NIDIM
|
||||
N = IDIM( IN )
|
||||
|
|
|
@ -5,7 +5,7 @@ T LOGICAL FLAG, T TO STOP ON FAILURES.
|
|||
T LOGICAL FLAG, T TO TEST ERROR EXITS.
|
||||
2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
|
||||
16.0 THRESHOLD VALUE OF TEST RATIO
|
||||
6 NUMBER OF VALUES OF N
|
||||
7 NUMBER OF VALUES OF N
|
||||
1 2 3 5 7 9 35 VALUES OF N
|
||||
3 NUMBER OF VALUES OF ALPHA
|
||||
0.0 1.0 0.7 VALUES OF ALPHA
|
||||
|
|
|
@ -5,7 +5,7 @@ T LOGICAL FLAG, T TO STOP ON FAILURES.
|
|||
T LOGICAL FLAG, T TO TEST ERROR EXITS.
|
||||
2 0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
|
||||
16.0 THRESHOLD VALUE OF TEST RATIO
|
||||
6 NUMBER OF VALUES OF N
|
||||
7 NUMBER OF VALUES OF N
|
||||
0 1 2 3 5 9 35 VALUES OF N
|
||||
3 NUMBER OF VALUES OF ALPHA
|
||||
0.0 1.0 0.7 VALUES OF ALPHA
|
||||
|
|
|
@ -338,7 +338,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
|||
|
||||
for(jjs = js; jjs < js + min_j; jjs += min_jj){
|
||||
min_jj = min_j + js - jjs;
|
||||
if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N;
|
||||
if (min_jj > GEMM3M_UNROLL_N*3) min_jj = GEMM3M_UNROLL_N*3;
|
||||
|
||||
START_RPCC();
|
||||
|
||||
|
@ -398,7 +398,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
|||
|
||||
for(jjs = js; jjs < js + min_j; jjs += min_jj){
|
||||
min_jj = min_j + js - jjs;
|
||||
if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N;
|
||||
if (min_jj > GEMM3M_UNROLL_N*3) min_jj = GEMM3M_UNROLL_N*3;
|
||||
|
||||
START_RPCC();
|
||||
|
||||
|
@ -463,7 +463,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
|||
|
||||
for(jjs = js; jjs < js + min_j; jjs += min_jj){
|
||||
min_jj = min_j + js - jjs;
|
||||
if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N;
|
||||
if (min_jj > GEMM3M_UNROLL_N*3) min_jj = GEMM3M_UNROLL_N*3;
|
||||
|
||||
START_RPCC();
|
||||
|
||||
|
|
|
@ -332,13 +332,16 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
|||
#else
|
||||
for(jjs = js; jjs < js + min_j; jjs += min_jj){
|
||||
min_jj = min_j + js - jjs;
|
||||
|
||||
#ifdef SKYLAKEX
|
||||
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve best performance */
|
||||
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
|
||||
#else
|
||||
if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N;
|
||||
else
|
||||
if (min_jj >= 2*GEMM_UNROLL_N) min_jj = 2*GEMM_UNROLL_N;
|
||||
else
|
||||
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
START_RPCC();
|
||||
|
|
|
@ -104,7 +104,7 @@ typedef struct {
|
|||
#define BETA_OPERATION(M_FROM, M_TO, N_FROM, N_TO, BETA, C, LDC) \
|
||||
GEMM_BETA((M_TO) - (M_FROM), (N_TO - N_FROM), 0, \
|
||||
BETA[0], BETA[1], NULL, 0, NULL, 0, \
|
||||
(FLOAT *)(C) + (M_FROM) + (N_FROM) * (LDC) * COMPSIZE, LDC)
|
||||
(FLOAT *)(C) + ((M_FROM) + (N_FROM) * (LDC)) * COMPSIZE, LDC)
|
||||
#endif
|
||||
|
||||
#ifndef ICOPYB_OPERATION
|
||||
|
@ -408,13 +408,13 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
|||
|
||||
/* Make sure if no one is using another buffer */
|
||||
for (i = 0; i < args -> nthreads; i++)
|
||||
while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;};
|
||||
while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;};
|
||||
|
||||
STOP_RPCC(waiting1);
|
||||
|
||||
for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){
|
||||
min_jj = MIN(n_to, xxx + div_n) - jjs;
|
||||
if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N;
|
||||
if (min_jj > GEMM3M_UNROLL_N*3) min_jj = GEMM3M_UNROLL_N*3;
|
||||
|
||||
START_RPCC();
|
||||
|
||||
|
@ -441,7 +441,8 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
|||
|
||||
for (i = 0; i < args -> nthreads; i++)
|
||||
job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
|
||||
}
|
||||
WMB;
|
||||
}
|
||||
|
||||
current = mypos;
|
||||
|
||||
|
@ -458,7 +459,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
|||
START_RPCC();
|
||||
|
||||
/* thread has to wait */
|
||||
while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;};
|
||||
while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;MB;};
|
||||
|
||||
STOP_RPCC(waiting2);
|
||||
|
||||
|
@ -477,6 +478,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
|||
|
||||
if (m_to - m_from == min_i) {
|
||||
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
|
||||
WMB;
|
||||
}
|
||||
}
|
||||
} while (current != mypos);
|
||||
|
@ -517,6 +519,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
|||
if (is + min_i >= m_to) {
|
||||
/* Thread doesn't need this buffer any more */
|
||||
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
|
||||
WMB;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -541,13 +544,13 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
|||
|
||||
/* Make sure if no one is using another buffer */
|
||||
for (i = 0; i < args -> nthreads; i++)
|
||||
while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;};
|
||||
while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;};
|
||||
|
||||
STOP_RPCC(waiting1);
|
||||
|
||||
for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){
|
||||
min_jj = MIN(n_to, xxx + div_n) - jjs;
|
||||
if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N;
|
||||
if (min_jj > GEMM3M_UNROLL_N*3) min_jj = GEMM3M_UNROLL_N*3;
|
||||
|
||||
START_RPCC();
|
||||
|
||||
|
@ -595,7 +598,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
|||
START_RPCC();
|
||||
|
||||
/* thread has to wait */
|
||||
while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;};
|
||||
while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;MB;};
|
||||
|
||||
STOP_RPCC(waiting2);
|
||||
|
||||
|
@ -613,6 +616,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
|||
|
||||
if (m_to - m_from == min_i) {
|
||||
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
|
||||
WMB;
|
||||
}
|
||||
}
|
||||
} while (current != mypos);
|
||||
|
@ -677,13 +681,13 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
|||
|
||||
/* Make sure if no one is using another buffer */
|
||||
for (i = 0; i < args -> nthreads; i++)
|
||||
while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;};
|
||||
while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;};
|
||||
|
||||
STOP_RPCC(waiting1);
|
||||
|
||||
for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){
|
||||
min_jj = MIN(n_to, xxx + div_n) - jjs;
|
||||
if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N;
|
||||
if (min_jj > GEMM3M_UNROLL_N*3) min_jj = GEMM3M_UNROLL_N*3;
|
||||
|
||||
START_RPCC();
|
||||
|
||||
|
@ -731,7 +735,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
|||
START_RPCC();
|
||||
|
||||
/* thread has to wait */
|
||||
while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;};
|
||||
while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;MB;};
|
||||
|
||||
STOP_RPCC(waiting2);
|
||||
|
||||
|
@ -748,8 +752,9 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
|||
}
|
||||
|
||||
if (m_to - m_from == min_i) {
|
||||
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
|
||||
}
|
||||
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
|
||||
WMB;
|
||||
}
|
||||
}
|
||||
} while (current != mypos);
|
||||
|
||||
|
@ -787,7 +792,8 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
|||
#endif
|
||||
if (is + min_i >= m_to) {
|
||||
/* Thread doesn't need this buffer any more */
|
||||
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
|
||||
job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
|
||||
WMB;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -804,7 +810,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
|||
|
||||
for (i = 0; i < args -> nthreads; i++) {
|
||||
for (xxx = 0; xxx < DIVIDE_RATE; xxx++) {
|
||||
while (job[mypos].working[i][CACHE_LINE_SIZE * xxx] ) {YIELDING;};
|
||||
while (job[mypos].working[i][CACHE_LINE_SIZE * xxx] ) {YIELDING;MB;};
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -840,6 +846,15 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
|||
static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
|
||||
*range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){
|
||||
|
||||
#ifndef USE_OPENMP
|
||||
#ifndef OS_WINDOWS
|
||||
static pthread_mutex_t level3_lock = PTHREAD_MUTEX_INITIALIZER;
|
||||
#else
|
||||
CRITICAL_SECTION level3_lock;
|
||||
InitializeCriticalSection((PCRITICAL_SECTION)&level3_lock);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
blas_arg_t newarg;
|
||||
|
||||
blas_queue_t queue[MAX_CPU_NUMBER];
|
||||
|
@ -869,6 +884,14 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
|
|||
mode = BLAS_SINGLE | BLAS_REAL | BLAS_NODE;
|
||||
#endif
|
||||
|
||||
#ifndef USE_OPENMP
|
||||
#ifndef OS_WINDOWS
|
||||
pthread_mutex_lock(&level3_lock);
|
||||
#else
|
||||
EnterCriticalSection((PCRITICAL_SECTION)&level3_lock);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
newarg.m = args -> m;
|
||||
newarg.n = args -> n;
|
||||
newarg.k = args -> k;
|
||||
|
@ -973,6 +996,14 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
|
|||
free(job);
|
||||
#endif
|
||||
|
||||
#ifndef USE_OPENMP
|
||||
#ifndef OS_WINDOWS
|
||||
pthread_mutex_unlock(&level3_lock);
|
||||
#else
|
||||
LeaveCriticalSection((PCRITICAL_SECTION)&level3_lock);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
|
|
@ -365,12 +365,16 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
|||
/* Split local region of B into parts */
|
||||
for(jjs = js; jjs < MIN(n_to, js + div_n); jjs += min_jj){
|
||||
min_jj = MIN(n_to, js + div_n) - jjs;
|
||||
#ifdef SKYLAKEX
|
||||
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
|
||||
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
|
||||
#else
|
||||
if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N;
|
||||
else
|
||||
if (min_jj >= 2*GEMM_UNROLL_N) min_jj = 2*GEMM_UNROLL_N;
|
||||
else
|
||||
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
||||
|
||||
#endif
|
||||
/* Copy part of local region of B into workspace */
|
||||
START_RPCC();
|
||||
OCOPY_OPERATION(min_l, min_jj, b, ldb, ls, jjs,
|
||||
|
|
|
@ -135,10 +135,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
|||
|
||||
for(jjs = js; jjs < js + min_j; jjs += min_jj){
|
||||
min_jj = min_j + js - jjs;
|
||||
#ifdef SKYLAKEX
|
||||
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
|
||||
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
|
||||
#else
|
||||
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
|
||||
else
|
||||
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
||||
|
||||
#endif
|
||||
START_RPCC();
|
||||
|
||||
GEMM_ONCOPY(min_l, min_jj, b + (jjs * ldb) * COMPSIZE, ldb, sb + min_l * (jjs - js) * COMPSIZE);
|
||||
|
@ -201,10 +205,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
|||
|
||||
for(jjs = js; jjs < js + min_j; jjs += min_jj){
|
||||
min_jj = min_j + js - jjs;
|
||||
#ifdef SKYLAKEX
|
||||
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
|
||||
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
|
||||
#else
|
||||
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
|
||||
else
|
||||
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
||||
|
||||
#endif
|
||||
START_RPCC();
|
||||
|
||||
GEMM_ONCOPY(min_l, min_jj, b + (ls + jjs * ldb) * COMPSIZE, ldb, sb + min_l * (jjs - js) * COMPSIZE);
|
||||
|
@ -292,10 +300,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
|||
|
||||
for(jjs = js; jjs < js + min_j; jjs += min_jj){
|
||||
min_jj = min_j + js - jjs;
|
||||
#ifdef SKYLAKEX
|
||||
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
|
||||
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
|
||||
#else
|
||||
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
|
||||
else
|
||||
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
||||
|
||||
#endif
|
||||
START_RPCC();
|
||||
|
||||
GEMM_ONCOPY(min_l, min_jj, b + (m - min_l + jjs * ldb) * COMPSIZE, ldb,
|
||||
|
@ -358,10 +370,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
|||
|
||||
for(jjs = js; jjs < js + min_j; jjs += min_jj){
|
||||
min_jj = min_j + js - jjs;
|
||||
#ifdef SKYLAKEX
|
||||
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
|
||||
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
|
||||
#else
|
||||
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
|
||||
else
|
||||
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
||||
|
||||
#endif
|
||||
START_RPCC();
|
||||
|
||||
GEMM_ONCOPY(min_l, min_jj, b + (ls - min_l + jjs * ldb) * COMPSIZE, ldb,
|
||||
|
|
|
@ -122,10 +122,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
|||
|
||||
for(jjs = 0; jjs < ls - js; jjs += min_jj){
|
||||
min_jj = ls - js - jjs;
|
||||
#ifdef SKYLAKEX
|
||||
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
|
||||
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
|
||||
#else
|
||||
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
|
||||
else
|
||||
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
||||
|
||||
#endif
|
||||
#ifndef TRANSA
|
||||
GEMM_ONCOPY(min_l, min_jj, a + (ls + (js + jjs) * lda) * COMPSIZE, lda, sb + min_l * jjs * COMPSIZE);
|
||||
#else
|
||||
|
@ -142,10 +146,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
|||
|
||||
for(jjs = 0; jjs < min_l; jjs += min_jj){
|
||||
min_jj = min_l - jjs;
|
||||
#ifdef SKYLAKEX
|
||||
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
|
||||
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
|
||||
#else
|
||||
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
|
||||
else
|
||||
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
||||
|
||||
#endif
|
||||
#ifndef TRANSA
|
||||
TRMM_OLNCOPY(min_l, min_jj, a, lda, ls, ls + jjs, sb + min_l * (ls - js + jjs) * COMPSIZE);
|
||||
#else
|
||||
|
@ -195,10 +203,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
|||
|
||||
for(jjs = js; jjs < js + min_j; jjs += min_jj){
|
||||
min_jj = min_j + js - jjs;
|
||||
#ifdef SKYLAKEX
|
||||
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
|
||||
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
|
||||
#else
|
||||
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
|
||||
else
|
||||
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
||||
|
||||
#endif
|
||||
#ifndef TRANSA
|
||||
GEMM_ONCOPY(min_l, min_jj, a + (ls + jjs * lda) * COMPSIZE, lda, sb + min_l * (jjs - js) * COMPSIZE);
|
||||
#else
|
||||
|
@ -246,10 +258,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
|||
|
||||
for(jjs = 0; jjs < min_l; jjs += min_jj){
|
||||
min_jj = min_l - jjs;
|
||||
#ifdef SKYLAKEX
|
||||
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
|
||||
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
|
||||
#else
|
||||
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
|
||||
else
|
||||
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
||||
|
||||
#endif
|
||||
#ifndef TRANSA
|
||||
TRMM_OUNCOPY(min_l, min_jj, a, lda, ls, ls + jjs, sb + min_l * jjs * COMPSIZE);
|
||||
#else
|
||||
|
@ -267,10 +283,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
|||
|
||||
for(jjs = 0; jjs < js - ls - min_l; jjs += min_jj){
|
||||
min_jj = js - ls - min_l - jjs;
|
||||
#ifdef SKYLAKEX
|
||||
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
|
||||
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
|
||||
#else
|
||||
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
|
||||
else
|
||||
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
||||
|
||||
#endif
|
||||
#ifndef TRANSA
|
||||
GEMM_ONCOPY(min_l, min_jj, a + (ls + (ls + min_l + jjs) * lda) * COMPSIZE, lda,
|
||||
sb + min_l * (min_l + jjs) * COMPSIZE);
|
||||
|
@ -324,10 +344,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
|||
|
||||
for(jjs = js; jjs < js + min_j; jjs += min_jj){
|
||||
min_jj = min_j + js - jjs;
|
||||
#ifdef SKYLAKEX
|
||||
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
|
||||
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
|
||||
#else
|
||||
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
|
||||
else
|
||||
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
|
||||
|
||||
#endif
|
||||
#ifndef TRANSA
|
||||
GEMM_ONCOPY(min_l, min_jj, a + (ls + (jjs - min_j) * lda) * COMPSIZE, lda, sb + min_l * (jjs - js) * COMPSIZE);
|
||||
#else
|
||||
|
|
|
@ -21,9 +21,13 @@ else
|
|||
ifeq ($(ARCH),power)
|
||||
COMMONOBJS += dynamic_power.$(SUFFIX)
|
||||
else
|
||||
ifeq ($(ARCH),zarch)
|
||||
COMMONOBJS += dynamic_zarch.$(SUFFIX)
|
||||
else
|
||||
COMMONOBJS += dynamic.$(SUFFIX)
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
else
|
||||
COMMONOBJS += parameter.$(SUFFIX)
|
||||
endif
|
||||
|
@ -85,9 +89,13 @@ else
|
|||
ifeq ($(ARCH),power)
|
||||
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_power.$(SUFFIX)
|
||||
else
|
||||
ifeq ($(ARCH),zarch)
|
||||
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_zarch.$(SUFFIX)
|
||||
else
|
||||
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic.$(SUFFIX)
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
else
|
||||
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) parameter.$(SUFFIX)
|
||||
endif
|
||||
|
|
|
@ -462,11 +462,15 @@ int BLASFUNC(blas_thread_shutdown)(void){
|
|||
|
||||
for(i = 0; i < blas_num_threads - 1; i++){
|
||||
// Could also just use WaitForMultipleObjects
|
||||
WaitForSingleObject(blas_threads[i], 5); //INFINITE);
|
||||
DWORD wait_thread_value = WaitForSingleObject(blas_threads[i], 50);
|
||||
|
||||
#ifndef OS_WINDOWSSTORE
|
||||
// TerminateThread is only available with WINAPI_DESKTOP and WINAPI_SYSTEM not WINAPI_APP in UWP
|
||||
TerminateThread(blas_threads[i],0);
|
||||
// TerminateThread is only available with WINAPI_DESKTOP and WINAPI_SYSTEM not WINAPI_APP in UWP
|
||||
if (WAIT_OBJECT_0 != wait_thread_value) {
|
||||
TerminateThread(blas_threads[i],0);
|
||||
}
|
||||
#endif
|
||||
|
||||
CloseHandle(blas_threads[i]);
|
||||
}
|
||||
|
||||
|
|
|
@ -329,7 +329,7 @@ int support_avx512(){
|
|||
if (!support_avx())
|
||||
return 0;
|
||||
cpuid(7, &eax, &ebx, &ecx, &edx);
|
||||
if((ebx & (1<<7)) != 1){
|
||||
if((ebx & (1<<7)) == 0){
|
||||
ret=0; //OS does not even support AVX2
|
||||
}
|
||||
if((ebx & (1<<31)) != 0){
|
||||
|
@ -586,6 +586,8 @@ static gotoblas_t *get_coretype(void){
|
|||
}
|
||||
return NULL;
|
||||
case 7:
|
||||
if (model == 10) // Goldmont Plus
|
||||
return &gotoblas_NEHALEM;
|
||||
if (model == 14) {
|
||||
// Ice Lake
|
||||
if (support_avx512())
|
||||
|
|
|
@ -37,17 +37,24 @@
|
|||
/*********************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#if (defined OS_LINUX || defined OS_ANDROID)
|
||||
#include <asm/hwcap.h>
|
||||
#include <sys/auxv.h>
|
||||
#endif
|
||||
|
||||
extern gotoblas_t gotoblas_ARMV8;
|
||||
extern gotoblas_t gotoblas_CORTEXA53;
|
||||
extern gotoblas_t gotoblas_CORTEXA57;
|
||||
extern gotoblas_t gotoblas_CORTEXA72;
|
||||
extern gotoblas_t gotoblas_CORTEXA73;
|
||||
extern gotoblas_t gotoblas_FALKOR;
|
||||
extern gotoblas_t gotoblas_THUNDERX;
|
||||
extern gotoblas_t gotoblas_THUNDERX2T99;
|
||||
extern gotoblas_t gotoblas_TSV110;
|
||||
|
||||
extern void openblas_warning(int verbose, const char * msg);
|
||||
|
||||
#define NUM_CORETYPES 4
|
||||
#define NUM_CORETYPES 9
|
||||
|
||||
/*
|
||||
* In case asm/hwcap.h is outdated on the build system, make sure
|
||||
|
@ -63,17 +70,27 @@ extern void openblas_warning(int verbose, const char * msg);
|
|||
|
||||
static char *corename[] = {
|
||||
"armv8",
|
||||
"cortexa53",
|
||||
"cortexa57",
|
||||
"cortexa72",
|
||||
"cortexa73",
|
||||
"falkor",
|
||||
"thunderx",
|
||||
"thunderx2t99",
|
||||
"tsv110",
|
||||
"unknown"
|
||||
};
|
||||
|
||||
char *gotoblas_corename(void) {
|
||||
if (gotoblas == &gotoblas_ARMV8) return corename[ 0];
|
||||
if (gotoblas == &gotoblas_CORTEXA57) return corename[ 1];
|
||||
if (gotoblas == &gotoblas_THUNDERX) return corename[ 2];
|
||||
if (gotoblas == &gotoblas_THUNDERX2T99) return corename[ 3];
|
||||
if (gotoblas == &gotoblas_CORTEXA53) return corename[ 1];
|
||||
if (gotoblas == &gotoblas_CORTEXA57) return corename[ 2];
|
||||
if (gotoblas == &gotoblas_CORTEXA72) return corename[ 3];
|
||||
if (gotoblas == &gotoblas_CORTEXA73) return corename[ 4];
|
||||
if (gotoblas == &gotoblas_FALKOR) return corename[ 5];
|
||||
if (gotoblas == &gotoblas_THUNDERX) return corename[ 6];
|
||||
if (gotoblas == &gotoblas_THUNDERX2T99) return corename[ 7];
|
||||
if (gotoblas == &gotoblas_TSV110) return corename[ 8];
|
||||
return corename[NUM_CORETYPES];
|
||||
}
|
||||
|
||||
|
@ -94,9 +111,14 @@ static gotoblas_t *force_coretype(char *coretype) {
|
|||
switch (found)
|
||||
{
|
||||
case 0: return (&gotoblas_ARMV8);
|
||||
case 1: return (&gotoblas_CORTEXA57);
|
||||
case 2: return (&gotoblas_THUNDERX);
|
||||
case 3: return (&gotoblas_THUNDERX2T99);
|
||||
case 1: return (&gotoblas_CORTEXA53);
|
||||
case 2: return (&gotoblas_CORTEXA57);
|
||||
case 3: return (&gotoblas_CORTEXA72);
|
||||
case 4: return (&gotoblas_CORTEXA73);
|
||||
case 5: return (&gotoblas_FALKOR);
|
||||
case 6: return (&gotoblas_THUNDERX);
|
||||
case 7: return (&gotoblas_THUNDERX2T99);
|
||||
case 8: return (&gotoblas_TSV110);
|
||||
}
|
||||
snprintf(message, 128, "Core not found: %s\n", coretype);
|
||||
openblas_warning(1, message);
|
||||
|
@ -105,13 +127,17 @@ static gotoblas_t *force_coretype(char *coretype) {
|
|||
|
||||
static gotoblas_t *get_coretype(void) {
|
||||
int implementer, variant, part, arch, revision, midr_el1;
|
||||
|
||||
|
||||
#if (defined OS_LINUX || defined OS_ANDROID)
|
||||
if (!(getauxval(AT_HWCAP) & HWCAP_CPUID)) {
|
||||
char coremsg[128];
|
||||
snprintf(coremsg, 128, "Kernel lacks cpuid feature support. Auto detection of core type failed !!!\n");
|
||||
openblas_warning(1, coremsg);
|
||||
return NULL;
|
||||
}
|
||||
#else
|
||||
return NULL;
|
||||
#endif
|
||||
|
||||
get_cpu_ftr(MIDR_EL1, midr_el1);
|
||||
/*
|
||||
|
@ -130,10 +156,14 @@ static gotoblas_t *get_coretype(void) {
|
|||
case 0x41: // ARM
|
||||
switch (part)
|
||||
{
|
||||
case 0xd07: // Cortex A57
|
||||
case 0xd08: // Cortex A72
|
||||
case 0xd03: // Cortex A53
|
||||
return &gotoblas_CORTEXA53;
|
||||
case 0xd07: // Cortex A57
|
||||
return &gotoblas_CORTEXA57;
|
||||
case 0xd08: // Cortex A72
|
||||
return &gotoblas_CORTEXA72;
|
||||
case 0xd09: // Cortex A73
|
||||
return &gotoblas_CORTEXA73;
|
||||
}
|
||||
break;
|
||||
case 0x42: // Broadcom
|
||||
|
@ -152,6 +182,20 @@ static gotoblas_t *get_coretype(void) {
|
|||
return &gotoblas_THUNDERX2T99;
|
||||
}
|
||||
break;
|
||||
case 0x48: // HiSilicon
|
||||
switch (part)
|
||||
{
|
||||
case 0xd01: // tsv110
|
||||
return &gotoblas_TSV110;
|
||||
}
|
||||
break;
|
||||
case 0x51: // Qualcomm
|
||||
switch (part)
|
||||
{
|
||||
case 0xc00: // Falkor
|
||||
return &gotoblas_FALKOR;
|
||||
}
|
||||
break;
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
|
|
@ -3,7 +3,9 @@
|
|||
|
||||
extern gotoblas_t gotoblas_POWER6;
|
||||
extern gotoblas_t gotoblas_POWER8;
|
||||
#if (!defined C_GCC) || (GCC_VERSION >= 60000)
|
||||
extern gotoblas_t gotoblas_POWER9;
|
||||
#endif
|
||||
|
||||
extern void openblas_warning(int verbose, const char *msg);
|
||||
|
||||
|
@ -19,7 +21,9 @@ static char *corename[] = {
|
|||
char *gotoblas_corename(void) {
|
||||
if (gotoblas == &gotoblas_POWER6) return corename[1];
|
||||
if (gotoblas == &gotoblas_POWER8) return corename[2];
|
||||
#if (!defined C_GCC) || (GCC_VERSION >= 60000)
|
||||
if (gotoblas == &gotoblas_POWER9) return corename[3];
|
||||
#endif
|
||||
return corename[0];
|
||||
}
|
||||
|
||||
|
@ -29,8 +33,10 @@ static gotoblas_t *get_coretype(void) {
|
|||
return &gotoblas_POWER6;
|
||||
if (__builtin_cpu_is("power8"))
|
||||
return &gotoblas_POWER8;
|
||||
#if (!defined C_GCC) || (GCC_VERSION >= 60000)
|
||||
if (__builtin_cpu_is("power9"))
|
||||
return &gotoblas_POWER9;
|
||||
#endif
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
@ -53,7 +59,9 @@ static gotoblas_t *force_coretype(char * coretype) {
|
|||
{
|
||||
case 1: return (&gotoblas_POWER6);
|
||||
case 2: return (&gotoblas_POWER8);
|
||||
#if (!defined C_GCC) || (GCC_VERSION >= 60000)
|
||||
case 3: return (&gotoblas_POWER9);
|
||||
#endif
|
||||
default: return NULL;
|
||||
}
|
||||
snprintf(message, 128, "Core not found: %s\n", coretype);
|
||||
|
|
|
@ -0,0 +1,131 @@
|
|||
|
||||
#include "common.h"
|
||||
|
||||
extern gotoblas_t gotoblas_Z13;
|
||||
extern gotoblas_t gotoblas_Z14;
|
||||
extern gotoblas_t gotoblas_Z15;
|
||||
//#if (!defined C_GCC) || (GCC_VERSION >= 60000)
|
||||
//extern gotoblas_t gotoblas_Z14;
|
||||
//#endif
|
||||
|
||||
#define NUM_CORETYPES 5
|
||||
|
||||
extern void openblas_warning(int verbose, const char* msg);
|
||||
|
||||
/* Printable core names; slot 0 is the fallback that gotoblas_corename()
 * returns when the active kernel table is not recognised. */
static char* corename[] = {
	[0] = "unknown",
	[1] = "Z13",
	[2] = "Z14",
	[3] = "Z15",
	[4] = "ZARCH_GENERIC"
};
|
||||
|
||||
char* gotoblas_corename(void) {
|
||||
if (gotoblas == &gotoblas_Z13) return corename[1];
|
||||
if (gotoblas == &gotoblas_Z14) return corename[2];
|
||||
if (gotoblas == &gotoblas_Z15) return corename[3];
|
||||
//#if (!defined C_GCC) || (GCC_VERSION >= 60000)
|
||||
// if (gotoblas == &gotoblas_POWER9) return corename[3];
|
||||
//#endif
|
||||
return corename[0]; // try generic?
|
||||
}
|
||||
|
||||
// __builtin_cpu_is is not supported by zarch
|
||||
static gotolabs_t* get_coretype(void) {
|
||||
FILE* infile;
|
||||
char buffer[512], * p;
|
||||
|
||||
p = (char*)NULL;
|
||||
infile = fopen("/proc/sysinfo", "r");
|
||||
while (fgets(buffer, sizeof(buffer), infile)) {
|
||||
if (!strncmp("Type", buffer, 4)) {
|
||||
p = strchr(buffer, ':') + 2;
|
||||
#if 0
|
||||
fprintf(stderr, "%s\n", p);
|
||||
#endif
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
fclose(infile);
|
||||
|
||||
if (strstr(p, "2964")) return &gotoblas_Z13;
|
||||
if (strstr(p, "2965")) return &gotoblas_Z13;
|
||||
if (strstr(p, "3906")) return &gotoblas_Z14;
|
||||
if (strstr(p, "3907")) return &gotoblas_Z14;
|
||||
if (strstr(p, "8561")) return &gotoblas_Z14; // fallback z15 to z14
|
||||
if (strstr(p, "8562")) return &gotoblas_Z14; // fallback z15 to z14
|
||||
|
||||
return NULL; // should be ZARCH_GENERIC
|
||||
}
|
||||
|
||||
static gotoblas_t* force_coretype(char* coretype) {
|
||||
|
||||
int i;
|
||||
int found = -1;
|
||||
char message[128];
|
||||
|
||||
for (i = 0; i < NUM_CORETYPES; i++)
|
||||
{
|
||||
if (!strncasecmp(coretype, corename[i], 20))
|
||||
{
|
||||
found = i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
switch (found)
|
||||
{
|
||||
case 1: return (&gotoblas_Z13);
|
||||
case 2: return (&gotoblas_Z14);
|
||||
case 3: return (&gotoblas_Z15);
|
||||
//#if (!defined C_GCC) || (GCC_VERSION >= 60000)
|
||||
// case 3: return (&gotoblas_POWER9);
|
||||
//#endif
|
||||
default: return NULL;
|
||||
}
|
||||
snprintf(message, 128, "Core not found: %s\n", coretype);
|
||||
openblas_warning(1, message);
|
||||
}
|
||||
|
||||
void gotoblas_dynamic_init(void) {
|
||||
|
||||
char coremsg[128];
|
||||
char coren[22];
|
||||
char* p;
|
||||
|
||||
|
||||
if (gotoblas) return;
|
||||
|
||||
p = getenv("OPENBLAS_CORETYPE");
|
||||
if (p)
|
||||
{
|
||||
gotoblas = force_coretype(p);
|
||||
}
|
||||
else
|
||||
{
|
||||
gotoblas = get_coretype();
|
||||
}
|
||||
|
||||
if (gotoblas == NULL)
|
||||
{
|
||||
snprintf(coremsg, 128, "Falling back to Z14 core\n");
|
||||
openblas_warning(1, coremsg);
|
||||
gotoblas = &gotoblas_Z14;
|
||||
}
|
||||
|
||||
if (gotoblas && gotoblas->init) {
|
||||
strncpy(coren, gotoblas_corename(), 20);
|
||||
sprintf(coremsg, "Core: %s\n", coren);
|
||||
openblas_warning(2, coremsg);
|
||||
gotoblas->init();
|
||||
}
|
||||
else {
|
||||
openblas_warning(0, "OpenBLAS : Architecture Initialization failed. No initialization function found.\n");
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
void gotoblas_dynamic_quit(void) {
|
||||
gotoblas = NULL;
|
||||
}
|
|
@ -129,7 +129,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#include <unistd.h>
|
||||
#endif
|
||||
|
||||
#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN)
|
||||
#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN)
|
||||
#include <sys/sysctl.h>
|
||||
#include <sys/resource.h>
|
||||
#endif
|
||||
|
@ -192,7 +192,7 @@ void goto_set_num_threads(int num_threads) {};
|
|||
|
||||
#else
|
||||
|
||||
#if defined(OS_LINUX) || defined(OS_SUNOS) || defined(OS_NETBSD)
|
||||
#if defined(OS_LINUX) || defined(OS_SUNOS)
|
||||
#ifndef NO_AFFINITY
|
||||
int get_num_procs(void);
|
||||
#else
|
||||
|
@ -312,7 +312,7 @@ int get_num_procs(void) {
|
|||
|
||||
#endif
|
||||
|
||||
#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY)
|
||||
#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY)
|
||||
|
||||
int get_num_procs(void) {
|
||||
|
||||
|
@ -404,7 +404,7 @@ extern int openblas_goto_num_threads_env();
|
|||
extern int openblas_omp_num_threads_env();
|
||||
|
||||
int blas_get_cpu_number(void){
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
int max_num;
|
||||
#endif
|
||||
int blas_goto_num = 0;
|
||||
|
@ -412,7 +412,7 @@ int blas_get_cpu_number(void){
|
|||
|
||||
if (blas_num_threads) return blas_num_threads;
|
||||
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
max_num = get_num_procs();
|
||||
#endif
|
||||
|
||||
|
@ -436,7 +436,7 @@ int blas_get_cpu_number(void){
|
|||
else if (blas_omp_num > 0) blas_num_threads = blas_omp_num;
|
||||
else blas_num_threads = MAX_CPU_NUMBER;
|
||||
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
if (blas_num_threads > max_num) blas_num_threads = max_num;
|
||||
#endif
|
||||
|
||||
|
@ -822,7 +822,7 @@ static void *alloc_qalloc(void *address){
|
|||
|
||||
static void alloc_windows_free(struct alloc_t *alloc_info){
|
||||
|
||||
VirtualFree(alloc_info, allocation_block_size, MEM_DECOMMIT);
|
||||
VirtualFree(alloc_info, 0, MEM_RELEASE);
|
||||
|
||||
}
|
||||
|
||||
|
@ -935,7 +935,7 @@ static void alloc_hugetlb_free(struct alloc_t *alloc_info){
|
|||
|
||||
#ifdef OS_WINDOWS
|
||||
|
||||
VirtualFree(alloc_info, allocation_block_size, MEM_LARGE_PAGES | MEM_DECOMMIT);
|
||||
VirtualFree(alloc_info, 0, MEM_LARGE_PAGES | MEM_RELEASE);
|
||||
|
||||
#endif
|
||||
|
||||
|
@ -1673,7 +1673,7 @@ void gotoblas_dummy_for_PGI(void) {
|
|||
#include <sys/resource.h>
|
||||
#endif
|
||||
|
||||
#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN)
|
||||
#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN)
|
||||
#include <sys/sysctl.h>
|
||||
#include <sys/resource.h>
|
||||
#endif
|
||||
|
@ -1736,7 +1736,7 @@ void goto_set_num_threads(int num_threads) {};
|
|||
|
||||
#else
|
||||
|
||||
#if defined(OS_LINUX) || defined(OS_SUNOS) || defined(OS_NETBSD)
|
||||
#if defined(OS_LINUX) || defined(OS_SUNOS)
|
||||
#ifndef NO_AFFINITY
|
||||
int get_num_procs(void);
|
||||
#else
|
||||
|
@ -1855,7 +1855,7 @@ int get_num_procs(void) {
|
|||
|
||||
#endif
|
||||
|
||||
#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY)
|
||||
#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY)
|
||||
|
||||
int get_num_procs(void) {
|
||||
|
||||
|
@ -1945,7 +1945,7 @@ extern int openblas_goto_num_threads_env();
|
|||
extern int openblas_omp_num_threads_env();
|
||||
|
||||
int blas_get_cpu_number(void){
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
int max_num;
|
||||
#endif
|
||||
int blas_goto_num = 0;
|
||||
|
@ -1953,7 +1953,7 @@ int blas_get_cpu_number(void){
|
|||
|
||||
if (blas_num_threads) return blas_num_threads;
|
||||
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
max_num = get_num_procs();
|
||||
#endif
|
||||
|
||||
|
@ -1977,7 +1977,7 @@ int blas_get_cpu_number(void){
|
|||
else if (blas_omp_num > 0) blas_num_threads = blas_omp_num;
|
||||
else blas_num_threads = MAX_CPU_NUMBER;
|
||||
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
if (blas_num_threads > max_num) blas_num_threads = max_num;
|
||||
#endif
|
||||
|
||||
|
@ -2310,7 +2310,7 @@ static void *alloc_qalloc(void *address){
|
|||
|
||||
static void alloc_windows_free(struct release_t *release){
|
||||
|
||||
VirtualFree(release -> address, BUFFER_SIZE, MEM_DECOMMIT);
|
||||
VirtualFree(release -> address, 0, MEM_RELEASE);
|
||||
|
||||
}
|
||||
|
||||
|
@ -2432,7 +2432,7 @@ static void alloc_hugetlb_free(struct release_t *release){
|
|||
|
||||
#ifdef OS_WINDOWS
|
||||
|
||||
VirtualFree(release -> address, BUFFER_SIZE, MEM_LARGE_PAGES | MEM_DECOMMIT);
|
||||
VirtualFree(release -> address, 0, MEM_LARGE_PAGES | MEM_RELEASE);
|
||||
|
||||
#endif
|
||||
|
||||
|
|
|
@ -38,21 +38,29 @@
|
|||
|
||||
#include <stdio.h>
|
||||
#include "common.h"
|
||||
|
||||
#ifndef SMP
|
||||
#define blas_cpu_number 1
|
||||
#else
|
||||
|
||||
int blas_cpu_number = 1;
|
||||
|
||||
int blas_get_cpu_number(void){
|
||||
|
||||
return blas_cpu_number;
|
||||
}
|
||||
#ifdef OS_LINUX
|
||||
#include <sys/sysinfo.h>
|
||||
#include <sched.h>
|
||||
#include <errno.h>
|
||||
#include <linux/unistd.h>
|
||||
#include <sys/syscall.h>
|
||||
#include <sys/time.h>
|
||||
#include <sys/resource.h>
|
||||
#endif
|
||||
|
||||
#ifdef OS_HAIKU
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
|
||||
#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN)
|
||||
#include <sys/sysctl.h>
|
||||
#include <sys/resource.h>
|
||||
#endif
|
||||
|
||||
|
||||
#define FIXED_PAGESIZE 4096
|
||||
|
||||
|
||||
void *sa = NULL;
|
||||
void *sb = NULL;
|
||||
static double static_buffer[BUFFER_SIZE/sizeof(double)];
|
||||
|
@ -60,7 +68,7 @@ static double static_buffer[BUFFER_SIZE/sizeof(double)];
|
|||
void *blas_memory_alloc(int numproc){
|
||||
|
||||
if (sa == NULL){
|
||||
#if 1
|
||||
#if 0
|
||||
sa = (void *)qalloc(QFAST, BUFFER_SIZE);
|
||||
#else
|
||||
sa = (void *)malloc(BUFFER_SIZE);
|
||||
|
@ -75,3 +83,296 @@ void blas_memory_free(void *free_area){
|
|||
return;
|
||||
}
|
||||
|
||||
|
||||
|
||||
extern void openblas_warning(int verbose, const char * msg);
|
||||
|
||||
#ifndef SMP
|
||||
|
||||
#define blas_cpu_number 1
|
||||
#define blas_num_threads 1
|
||||
|
||||
/* Dummy Function */
|
||||
/* Single-threaded build: always reports one processor. */
int goto_get_num_procs (void) { return 1; }
|
||||
/* Single-threaded build: the requested thread count is ignored. */
void goto_set_num_threads(int num_threads) { (void)num_threads; }
|
||||
|
||||
#else
|
||||
|
||||
#if defined(OS_LINUX) || defined(OS_SUNOS)
|
||||
#ifndef NO_AFFINITY
|
||||
int get_num_procs(void);
|
||||
#else
|
||||
/*
 * Processor count for builds with NO_AFFINITY defined.
 *
 * The configured-processor count from sysconf() is fetched once and cached.
 * Systems other than Linux return that count; on Linux this variant always
 * reports a single processor.
 */
int get_num_procs(void) {

  static int nums = 0;

  if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);

#if !defined(OS_LINUX)
  return nums;
#else
  return 1;
#endif
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef OS_ANDROID
|
||||
/* Android: configured-processor count from sysconf(), looked up on the first
 * call and served from a static cache afterwards. */
int get_num_procs(void) {
  static int cached = 0;

  if (cached != 0) {
    return cached;
  }

  cached = sysconf(_SC_NPROCESSORS_CONF);
  return cached;
}
|
||||
#endif
|
||||
|
||||
#ifdef OS_HAIKU
|
||||
/* Haiku: configured-processor count from sysconf(), looked up on the first
 * call and served from a static cache afterwards. */
int get_num_procs(void) {
  static int cached = 0;

  if (cached != 0) {
    return cached;
  }

  cached = sysconf(_SC_NPROCESSORS_CONF);
  return cached;
}
|
||||
#endif
|
||||
|
||||
#ifdef OS_AIX
|
||||
/* AIX: configured-processor count from sysconf(), looked up on the first
 * call and served from a static cache afterwards. */
int get_num_procs(void) {
  static int cached = 0;

  if (cached != 0) {
    return cached;
  }

  cached = sysconf(_SC_NPROCESSORS_CONF);
  return cached;
}
|
||||
#endif
|
||||
|
||||
#ifdef OS_WINDOWS
|
||||
|
||||
int get_num_procs(void) {
|
||||
|
||||
static int nums = 0;
|
||||
|
||||
if (nums == 0) {
|
||||
|
||||
SYSTEM_INFO sysinfo;
|
||||
|
||||
GetSystemInfo(&sysinfo);
|
||||
|
||||
nums = sysinfo.dwNumberOfProcessors;
|
||||
}
|
||||
|
||||
return nums;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY)
|
||||
|
||||
int get_num_procs(void) {
|
||||
|
||||
static int nums = 0;
|
||||
|
||||
int m[2];
|
||||
size_t len;
|
||||
|
||||
if (nums == 0) {
|
||||
m[0] = CTL_HW;
|
||||
m[1] = HW_NCPU;
|
||||
len = sizeof(int);
|
||||
sysctl(m, 2, &nums, &len, NULL, 0);
|
||||
}
|
||||
|
||||
return nums;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(OS_DARWIN)
|
||||
/*
 * Darwin: processor count from the "hw.physicalcpu" sysctl, cached after the
 * first call. If the query fails or reports less than one processor, one
 * processor is assumed so callers never see a count of zero.
 */
int get_num_procs(void) {
  static int nums = 0;

  if (nums == 0) {
    int count = 0;
    size_t len = sizeof(count);

    if (sysctlbyname("hw.physicalcpu", &count, &len, NULL, 0) != 0 || count < 1) {
      count = 1;
    }
    nums = count;
  }
  return nums;
}
|
||||
/*
|
||||
void set_stack_limit(int limitMB){
|
||||
int result=0;
|
||||
struct rlimit rl;
|
||||
rlim_t StackSize;
|
||||
|
||||
StackSize=limitMB*1024*1024;
|
||||
result=getrlimit(RLIMIT_STACK, &rl);
|
||||
if(result==0){
|
||||
if(rl.rlim_cur < StackSize){
|
||||
rl.rlim_cur=StackSize;
|
||||
result=setrlimit(RLIMIT_STACK, &rl);
|
||||
if(result !=0){
|
||||
fprintf(stderr, "OpenBLAS: set stack limit error =%d\n", result);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
*/
|
||||
#endif
|
||||
|
||||
|
||||
/*
|
||||
OpenBLAS uses the number of CPU cores for multithreading.
It can be set with openblas_set_num_threads(int num_threads);
|
||||
*/
|
||||
int blas_cpu_number = 0;
|
||||
/*
|
||||
The number of threads in the thread pool.
This value is equal to or larger than blas_cpu_number, which means some threads are sleeping.
|
||||
*/
|
||||
int blas_num_threads = 0;
|
||||
|
||||
int goto_get_num_procs (void) {
|
||||
return blas_cpu_number;
|
||||
}
|
||||
|
||||
/*
 * Install blas_thread_shutdown as the pthread_atfork() prepare handler and
 * warn if the registration fails. Compiled to a no-op unless SMP_SERVER is
 * defined, and on native Windows and Android.
 */
void openblas_fork_handler(void)
{
  // This handler shuts down the OpenBLAS-managed PTHREAD pool when OpenBLAS is
  // built with "make USE_OPENMP=0".
  // Hanging can still happen when OpenBLAS is built against the libgomp
  // implementation of OpenMP. The problem is tracked at:
  //   http://gcc.gnu.org/bugzilla/show_bug.cgi?id=60035
  // In the meantime build with USE_OPENMP=0 or link against another
  // implementation of OpenMP.
#if !((defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT)) || defined(OS_ANDROID)) && defined(SMP_SERVER)
  int err;

  err = pthread_atfork ((void (*)(void)) BLASFUNC(blas_thread_shutdown), NULL, NULL);
  if (err != 0) {
    openblas_warning(0, "OpenBLAS Warning ... cannot install fork handler. You may meet hang after fork.\n");
  }
#endif
}
|
||||
|
||||
extern int openblas_num_threads_env();
|
||||
extern int openblas_goto_num_threads_env();
|
||||
extern int openblas_omp_num_threads_env();
|
||||
|
||||
int blas_get_cpu_number(void){
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
int max_num;
|
||||
#endif
|
||||
int blas_goto_num = 0;
|
||||
int blas_omp_num = 0;
|
||||
|
||||
if (blas_num_threads) return blas_num_threads;
|
||||
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
max_num = get_num_procs();
|
||||
#endif
|
||||
|
||||
// blas_goto_num = 0;
|
||||
#ifndef USE_OPENMP
|
||||
blas_goto_num=openblas_num_threads_env();
|
||||
if (blas_goto_num < 0) blas_goto_num = 0;
|
||||
|
||||
if (blas_goto_num == 0) {
|
||||
blas_goto_num=openblas_goto_num_threads_env();
|
||||
if (blas_goto_num < 0) blas_goto_num = 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
// blas_omp_num = 0;
|
||||
blas_omp_num=openblas_omp_num_threads_env();
|
||||
if (blas_omp_num < 0) blas_omp_num = 0;
|
||||
|
||||
if (blas_goto_num > 0) blas_num_threads = blas_goto_num;
|
||||
else if (blas_omp_num > 0) blas_num_threads = blas_omp_num;
|
||||
else blas_num_threads = MAX_CPU_NUMBER;
|
||||
|
||||
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
|
||||
if (blas_num_threads > max_num) blas_num_threads = max_num;
|
||||
#endif
|
||||
|
||||
if (blas_num_threads > MAX_CPU_NUMBER) blas_num_threads = MAX_CPU_NUMBER;
|
||||
|
||||
#ifdef DEBUG
|
||||
printf( "Adjusted number of threads : %3d\n", blas_num_threads);
|
||||
#endif
|
||||
|
||||
blas_cpu_number = blas_num_threads;
|
||||
|
||||
return blas_num_threads;
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
/* Processor count as reported by get_num_procs(); builds without SMP always
 * report 1. */
int openblas_get_num_procs(void) {
#ifdef SMP
  return get_num_procs();
#else
  return 1;
#endif
}
|
||||
|
||||
/* Current thread count; builds without SMP always report 1. */
int openblas_get_num_threads(void) {
#ifdef SMP
  /* make sure blas_cpu_number has been computed before it is reported */
  blas_get_cpu_number();
  return blas_cpu_number;
#else
  return 1;
#endif
}
|
||||
|
|
|
@ -78,10 +78,10 @@ char tmpstr[20];
|
|||
#ifdef DYNAMIC_ARCH
|
||||
strcat(tmp_config_str, gotoblas_corename());
|
||||
#endif
|
||||
if (openblas_get_parallel() == 0)
|
||||
sprintf(tmpstr, " SINGLE_THREADED");
|
||||
else
|
||||
snprintf(tmpstr,19," MAX_THREADS=%d",MAX_CPU_NUMBER);
|
||||
if (openblas_get_parallel() == 0)
|
||||
sprintf(tmpstr, " SINGLE_THREADED");
|
||||
else
|
||||
snprintf(tmpstr,19," MAX_THREADS=%d",MAX_CPU_NUMBER);
|
||||
strcat(tmp_config_str, tmpstr);
|
||||
return tmp_config_str;
|
||||
}
|
||||
|
|
|
@ -50,7 +50,10 @@ BOOL APIENTRY DllMain(HINSTANCE hInst, DWORD reason, LPVOID reserved) {
|
|||
gotoblas_init();
|
||||
break;
|
||||
case DLL_PROCESS_DETACH:
|
||||
gotoblas_quit();
|
||||
// If the process is about to exit, don't bother releasing any resources
|
||||
// The kernel is much better at bulk releasing then.
|
||||
if (!reserved)
|
||||
gotoblas_quit();
|
||||
break;
|
||||
case DLL_THREAD_ATTACH:
|
||||
break;
|
||||
|
|
|
@ -618,19 +618,6 @@
|
|||
# functions added for lapack-3.7.0
|
||||
|
||||
slarfy,
|
||||
slasyf_rk,
|
||||
ssyconvf_rook,
|
||||
ssytf2_rk,
|
||||
ssytrf_rk,
|
||||
ssytrs_3,
|
||||
ssytri_3,
|
||||
ssytri_3x,
|
||||
ssycon_3,
|
||||
ssysv_rk,
|
||||
slasyf_aa,
|
||||
ssysv_aa,
|
||||
ssytrf_aa,
|
||||
ssytrs_aa,
|
||||
strevc3,
|
||||
sgelqt,
|
||||
sgelqt3,
|
||||
|
@ -647,33 +634,8 @@
|
|||
stplqt,
|
||||
stplqt2,
|
||||
stpmlqt,
|
||||
ssytrd_2stage,
|
||||
ssytrd_sy2sb,
|
||||
ssytrd_sb2st,
|
||||
ssb2st_kernels,
|
||||
ssyevd_2stage,
|
||||
ssyev_2stage,
|
||||
ssyevx_2stage,
|
||||
ssyevr_2stage,
|
||||
ssbev_2stage,
|
||||
ssbevx_2stage,
|
||||
ssbevd_2stage,
|
||||
ssygv_2stage,
|
||||
dlarfy,
|
||||
dlasyf_rk,
|
||||
dsyconvf,
|
||||
dsyconvf_rook,
|
||||
dsytf2_rk,
|
||||
dsytrf_rk,
|
||||
dsytrs_3,
|
||||
dsytri_3,
|
||||
dsytri_3x,
|
||||
dsycon_3,
|
||||
dsysv_rk,
|
||||
dlasyf_aa,
|
||||
dsysv_aa,
|
||||
dsytrf_aa,
|
||||
dsytrs_aa,
|
||||
dtrevc3,
|
||||
dgelqt,
|
||||
dgelqt3,
|
||||
|
@ -690,45 +652,8 @@
|
|||
dtplqt,
|
||||
dtplqt2,
|
||||
dtpmlqt,
|
||||
dsytrd_2stage,
|
||||
dsytrd_sy2sb,
|
||||
dsytrd_sb2st,
|
||||
dsb2st_kernels,
|
||||
dsyevd_2stage,
|
||||
dsyev_2stage,
|
||||
dsyevx_2stage,
|
||||
dsyevr_2stage,
|
||||
dsbev_2stage,
|
||||
dsbevx_2stage,
|
||||
dsbevd_2stage,
|
||||
dsygv_2stage,
|
||||
chetf2_rk,
|
||||
chetrf_rk,
|
||||
chetri_3,
|
||||
chetri_3x,
|
||||
chetrs_3,
|
||||
checon_3,
|
||||
chesv_rk,
|
||||
chesv_aa,
|
||||
chetrf_aa,
|
||||
chetrs_aa,
|
||||
clahef_aa,
|
||||
clahef_rk,
|
||||
clarfy,
|
||||
clasyf_rk,
|
||||
clasyf_aa,
|
||||
csyconvf,
|
||||
csyconvf_rook,
|
||||
csytf2_rk,
|
||||
csytrf_rk,
|
||||
csytrf_aa,
|
||||
csytrs_3,
|
||||
csytrs_aa,
|
||||
csytri_3,
|
||||
csytri_3x,
|
||||
csycon_3,
|
||||
csysv_rk,
|
||||
csysv_aa,
|
||||
ctrevc3,
|
||||
cgelqt,
|
||||
cgelqt3,
|
||||
|
@ -745,45 +670,8 @@
|
|||
ctplqt,
|
||||
ctplqt2,
|
||||
ctpmlqt,
|
||||
chetrd_2stage,
|
||||
chetrd_he2hb,
|
||||
chetrd_hb2st,
|
||||
chb2st_kernels,
|
||||
cheevd_2stage,
|
||||
cheev_2stage,
|
||||
cheevx_2stage,
|
||||
cheevr_2stage,
|
||||
chbev_2stage,
|
||||
chbevx_2stage,
|
||||
chbevd_2stage,
|
||||
chegv_2stage,
|
||||
zhetf2_rk,
|
||||
zhetrf_rk,
|
||||
zhetri_3,
|
||||
zhetri_3x,
|
||||
zhetrs_3,
|
||||
zhecon_3,
|
||||
zhesv_rk,
|
||||
zhesv_aa,
|
||||
zhetrf_aa,
|
||||
zhetrs_aa,
|
||||
zlahef_aa,
|
||||
zlahef_rk,
|
||||
zlarfy,
|
||||
zlasyf_rk,
|
||||
zlasyf_aa,
|
||||
zsyconvf,
|
||||
zsyconvf_rook,
|
||||
zsytrs_aa,
|
||||
zsytf2_rk,
|
||||
zsytrf_rk,
|
||||
zsytrf_aa,
|
||||
zsytrs_3,
|
||||
zsytri_3,
|
||||
zsytri_3x,
|
||||
zsycon_3,
|
||||
zsysv_rk,
|
||||
zsysv_aa,
|
||||
ztrevc3,
|
||||
ztplqt,
|
||||
ztplqt2,
|
||||
|
@ -800,18 +688,6 @@
|
|||
zlaswlq,
|
||||
zlamswlq,
|
||||
zgemlq,
|
||||
zhetrd_2stage,
|
||||
zhetrd_he2hb,
|
||||
zhetrd_hb2st,
|
||||
zhb2st_kernels,
|
||||
zheevd_2stage,
|
||||
zheev_2stage,
|
||||
zheevx_2stage,
|
||||
zheevr_2stage,
|
||||
zhbev_2stage,
|
||||
zhbevx_2stage,
|
||||
zhbevd_2stage,
|
||||
zhegv_2stage,
|
||||
sladiv1,
|
||||
dladiv1,
|
||||
iparam2stage,
|
||||
|
@ -819,24 +695,18 @@
|
|||
# functions added for lapack-3.8.0
|
||||
|
||||
ilaenv2stage,
|
||||
ssysv_aa_2stage,
|
||||
ssytrf_aa_2stage,
|
||||
ssytrs_aa_2stage,
|
||||
chesv_aa_2stage,
|
||||
chetrf_aa_2stage,
|
||||
chetrs_aa_2stage,
|
||||
csysv_aa_2stage,
|
||||
csytrf_aa_2stage,
|
||||
csytrs_aa_2stage,
|
||||
dsysv_aa_2stage,
|
||||
dsytrf_aa_2stage,
|
||||
dsytrs_aa_2stage,
|
||||
zhesv_aa_2stage,
|
||||
zhetrf_aa_2stage,
|
||||
zhetrs_aa_2stage,
|
||||
zsysv_aa_2stage,
|
||||
zsytrf_aa_2stage,
|
||||
zsytrs_aa_2stage
|
||||
|
||||
# functions added for lapack-3.9.0
|
||||
cgesvdq,
|
||||
cungtsqr,
|
||||
dcombssq,
|
||||
dgesvdq,
|
||||
dorgtsqr,
|
||||
scombssq,
|
||||
sgesvdq,
|
||||
sorgtsqr,
|
||||
zgesvdq,
|
||||
zungtsqr
|
||||
);
|
||||
|
||||
@lapack_extendedprecision_objs = (
|
||||
|
@ -3489,6 +3359,15 @@
|
|||
LAPACKE_zsytrf_aa_2stage_work,
|
||||
LAPACKE_zsytrs_aa_2stage,
|
||||
LAPACKE_zsytrs_aa_2stage_work,
|
||||
|
||||
# new functions from 3.9.0
|
||||
LAPACKE_dgesvdq,
|
||||
LAPACKE_dgesvdq_work,
|
||||
LAPACKE_sgesvdq,
|
||||
LAPACKE_sgesvdq_work,
|
||||
LAPACKE_zgesvdq,
|
||||
LAPACKE_zgesvdq_work
|
||||
|
||||
);
|
||||
|
||||
#These function may need 2 underscores.
|
||||
|
@ -3509,6 +3388,65 @@
|
|||
zlahef_rook, zlasyf_rook,
|
||||
zsytf2_rook, zsytrf_rook, zsytrs_rook,
|
||||
zsytri_rook, zsycon_rook, zsysv_rook,
|
||||
# 3.7.0
|
||||
slasyf_rk, ssyconvf_rook, ssytf2_rk,
|
||||
ssytrf_rk, ssytrs_3, ssytri_3,
|
||||
ssytri_3x, ssycon_3, ssysv_rk,
|
||||
slasyf_aa, ssysv_aa, ssytrf_aa,
|
||||
ssytrs_aa, ssytrd_2stage, ssytrd_sy2sb,
|
||||
ssytrd_sb2st, ssb2st_kernels, ssyevd_2stage,
|
||||
ssyev_2stage, ssyevx_2stage, ssyevr_2stage,
|
||||
ssbev_2stage, ssbevx_2stage, ssbevd_2stage,
|
||||
ssygv_2stage, dlasyf_rk, dsyconvf_rook,
|
||||
dsytf2_rk, dsytrf_rk, dsytrs_3,
|
||||
dsytri_3, dsytri_3x, dsycon_3,
|
||||
dsysv_rk, dlasyf_aa, dsysv_aa,
|
||||
dsytrf_aa, dsytrs_aa, dsytrd_2stage,
|
||||
dsytrd_sy2sb, dsytrd_sb2st, dsb2st_kernels,
|
||||
dsyevd_2stage, dsyev_2stage, dsyevx_2stage,
|
||||
dsyevr_2stage, dsbev_2stage, dsbevx_2stage,
|
||||
dsbevd_2stage, dsygv_2stage, chetf2_rk,
|
||||
chetrf_rk, chetri_3, chetri_3x,
|
||||
chetrs_3, checon_3, chesv_rk,
|
||||
chesv_aa, chetrf_aa, chetrs_aa,
|
||||
clahef_aa, clahef_rk, clasyf_rk,
|
||||
clasyf_aa, csytf2_rk, csytrf_rk,
|
||||
csytrf_aa, csytrs_3, csytrs_aa,
|
||||
csytri_3, csytri_3x, csycon_3,
|
||||
csysv_rk, csysv_aa, csyconvf_rook,
|
||||
chetrd_2stage, chetrd_he2hb, chetrd_hb2st,
|
||||
chb2st_kernels, cheevd_2stage, cheev_2stage,
|
||||
cheevx_2stage, cheevr_2stage, chbev_2stage,
|
||||
chbevx_2stage, chbevd_2stage, chegv_2stage,
|
||||
zhetf2_rk, zhetrf_rk, zhetri_3,
|
||||
zhetri_3x, zhetrs_3, zhecon_3,
|
||||
zhesv_rk, zhesv_aa, zhetrf_aa,
|
||||
zhetrs_aa, zlahef_aa, zlahef_rk,
|
||||
zlasyf_rk, zlasyf_aa, zsyconvf_rook,
|
||||
zsytrs_aa, zsytf2_rk, zsytrf_rk,
|
||||
zsytrf_aa, zsytrs_3, zsytri_3,
|
||||
zsytri_3x, zsycon_3, zsysv_rk,
|
||||
zsysv_aa, zhetrd_2stage, zhetrd_he2hb,
|
||||
zhetrd_hb2st, zhb2st_kernels, zheevd_2stage,
|
||||
zheev_2stage, zheevx_2stage, zheevr_2stage,
|
||||
zhbev_2stage, zhbevx_2stage, zhbevd_2stage,
|
||||
zhegv_2stage,
|
||||
# 3.8.0
|
||||
ssysv_aa_2stage, ssytrf_aa_2stage,
|
||||
ssytrs_aa_2stage, chesv_aa_2stage,
|
||||
chetrf_aa_2stage, chetrs_aa_2stage,
|
||||
csysv_aa_2stage, csytrf_aa_2stage,
|
||||
csytrs_aa_2stage, dsysv_aa_2stage,
|
||||
dsytrf_aa_2stage, dsytrs_aa_2stage,
|
||||
zhesv_aa_2stage, zhetrf_aa_2stage,
|
||||
zhetrs_aa_2stage, zsysv_aa_2stage,
|
||||
zsytrf_aa_2stage, zsytrs_aa_2stage,
|
||||
# 3.9.0
|
||||
claunhr_col_getrfnp, claunhr_col_getrfnp2, cunhr_col,
|
||||
dlaorhr_col_getrfnp, dlaorhr_col_getrfnp2, dorhr_col,
|
||||
slaorhr_col_getrfnp, slaorhr_col_getrfnp2, sorhr_col,
|
||||
zlaunhr_col_getrfnp, zlaunhr_col_getrfnp2, zunhr_col
|
||||
|
||||
);
|
||||
|
||||
|
||||
|
|
12
f_check
12
f_check
|
@ -19,7 +19,7 @@ $nofortran = 0;
|
|||
|
||||
$compiler = join(" ", @ARGV);
|
||||
$compiler_bin = shift(@ARGV);
|
||||
|
||||
|
||||
# f77 is too ambiguous
|
||||
$compiler = "" if $compiler eq "f77";
|
||||
|
||||
|
@ -71,7 +71,7 @@ if ($compiler eq "") {
|
|||
|
||||
if ($data =~ /GNU/) {
|
||||
|
||||
$data =~ /(\d)\.(\d).(\d)/;
|
||||
$data =~ /(\d+)\.(\d+).(\d+)/;
|
||||
$major = $1;
|
||||
$minor = $2;
|
||||
|
||||
|
@ -130,6 +130,11 @@ if ($compiler eq "") {
|
|||
if ($data =~ / zho_ge__/) {
|
||||
$need2bu = 1;
|
||||
}
|
||||
if ($vendor =~ /G95/) {
|
||||
if ($ENV{NO_LAPACKE} != 1) {
|
||||
$need2bu = "";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if ($vendor eq "") {
|
||||
|
@ -277,6 +282,8 @@ $linker_a = "";
|
|||
if ($link ne "") {
|
||||
|
||||
$link =~ s/\-Y\sP\,/\-Y/g;
|
||||
|
||||
$link =~ s/\-R\s*/\-rpath\@/g;
|
||||
|
||||
$link =~ s/\-rpath\s+/\-rpath\@/g;
|
||||
|
||||
|
@ -327,6 +334,7 @@ if ($link ne "") {
|
|||
&& ($flags !~ /kernel32/)
|
||||
&& ($flags !~ /advapi32/)
|
||||
&& ($flags !~ /shell32/)
|
||||
&& ($flags !~ /omp/)
|
||||
&& ($flags !~ /^\-l$/)
|
||||
) {
|
||||
$linker_l .= $flags . " ";
|
||||
|
|
|
@ -82,7 +82,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#ifdef OS_WINDOWS
|
||||
#include <windows.h>
|
||||
#endif
|
||||
#if defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__DragonFly__) || defined(__APPLE__)
|
||||
#if defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__) || defined(__DragonFly__) || defined(__APPLE__)
|
||||
#include <sys/types.h>
|
||||
#include <sys/sysctl.h>
|
||||
#endif
|
||||
|
@ -1201,7 +1201,7 @@ static int get_num_cores(void) {
|
|||
|
||||
#ifdef OS_WINDOWS
|
||||
SYSTEM_INFO sysinfo;
|
||||
#elif defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__DragonFly__) || defined(__APPLE__)
|
||||
#elif defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__) || defined(__DragonFly__) || defined(__APPLE__)
|
||||
int m[2], count;
|
||||
size_t len;
|
||||
#endif
|
||||
|
@ -1215,7 +1215,7 @@ static int get_num_cores(void) {
|
|||
GetSystemInfo(&sysinfo);
|
||||
return sysinfo.dwNumberOfProcessors;
|
||||
|
||||
#elif defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__DragonFly__) || defined(__APPLE__)
|
||||
#elif defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__) || defined(__DragonFly__) || defined(__APPLE__)
|
||||
m[0] = CTL_HW;
|
||||
m[1] = HW_NCPU;
|
||||
len = sizeof(int);
|
||||
|
|
|
@ -394,7 +394,7 @@ XBLASOBJS = $(XBLAS1OBJS) $(XBLAS2OBJS) $(XBLAS3OBJS)
|
|||
SLAPACKOBJS = \
|
||||
sgetrf.$(SUFFIX) sgetrs.$(SUFFIX) spotrf.$(SUFFIX) sgetf2.$(SUFFIX) \
|
||||
spotf2.$(SUFFIX) slaswp.$(SUFFIX) sgesv.$(SUFFIX) slauu2.$(SUFFIX) \
|
||||
slauum.$(SUFFIX) strti2.$(SUFFIX) strtri.$(SUFFIX)
|
||||
slauum.$(SUFFIX) strti2.$(SUFFIX) strtri.$(SUFFIX) strtrs.$(SUFFIX)
|
||||
|
||||
|
||||
#DLAPACKOBJS = \
|
||||
|
@ -405,14 +405,14 @@ SLAPACKOBJS = \
|
|||
DLAPACKOBJS = \
|
||||
dgetrf.$(SUFFIX) dgetrs.$(SUFFIX) dpotrf.$(SUFFIX) dgetf2.$(SUFFIX) \
|
||||
dpotf2.$(SUFFIX) dlaswp.$(SUFFIX) dgesv.$(SUFFIX) dlauu2.$(SUFFIX) \
|
||||
dlauum.$(SUFFIX) dtrti2.$(SUFFIX) dtrtri.$(SUFFIX)
|
||||
dlauum.$(SUFFIX) dtrti2.$(SUFFIX) dtrtri.$(SUFFIX) dtrtrs.$(SUFFIX)
|
||||
|
||||
|
||||
QLAPACKOBJS = \
|
||||
qgetf2.$(SUFFIX) qgetrf.$(SUFFIX) qlauu2.$(SUFFIX) qlauum.$(SUFFIX) \
|
||||
qpotf2.$(SUFFIX) qpotrf.$(SUFFIX) qtrti2.$(SUFFIX) qtrtri.$(SUFFIX) \
|
||||
qlaswp.$(SUFFIX) qgetrs.$(SUFFIX) qgesv.$(SUFFIX) qpotri.$(SUFFIX) \
|
||||
|
||||
qlaswp.$(SUFFIX) qtrtrs.$(SUFFIX) qgesv.$(SUFFIX) qpotri.$(SUFFIX) \
|
||||
qtrtrs.$(SUFFIX)
|
||||
|
||||
#CLAPACKOBJS = \
|
||||
# cgetrf.$(SUFFIX) cgetrs.$(SUFFIX) cpotrf.$(SUFFIX) cgetf2.$(SUFFIX) \
|
||||
|
@ -423,7 +423,7 @@ QLAPACKOBJS = \
|
|||
CLAPACKOBJS = \
|
||||
cgetrf.$(SUFFIX) cgetrs.$(SUFFIX) cpotrf.$(SUFFIX) cgetf2.$(SUFFIX) \
|
||||
cpotf2.$(SUFFIX) claswp.$(SUFFIX) cgesv.$(SUFFIX) clauu2.$(SUFFIX) \
|
||||
clauum.$(SUFFIX) ctrti2.$(SUFFIX) ctrtri.$(SUFFIX)
|
||||
clauum.$(SUFFIX) ctrti2.$(SUFFIX) ctrtri.$(SUFFIX) ctrtrs.$(SUFFIX)
|
||||
|
||||
|
||||
#ZLAPACKOBJS = \
|
||||
|
@ -435,13 +435,14 @@ CLAPACKOBJS = \
|
|||
ZLAPACKOBJS = \
|
||||
zgetrf.$(SUFFIX) zgetrs.$(SUFFIX) zpotrf.$(SUFFIX) zgetf2.$(SUFFIX) \
|
||||
zpotf2.$(SUFFIX) zlaswp.$(SUFFIX) zgesv.$(SUFFIX) zlauu2.$(SUFFIX) \
|
||||
zlauum.$(SUFFIX) ztrti2.$(SUFFIX) ztrtri.$(SUFFIX)
|
||||
zlauum.$(SUFFIX) ztrti2.$(SUFFIX) ztrtri.$(SUFFIX) ztrtrs.$(SUFFIX)
|
||||
|
||||
|
||||
XLAPACKOBJS = \
|
||||
xgetf2.$(SUFFIX) xgetrf.$(SUFFIX) xlauu2.$(SUFFIX) xlauum.$(SUFFIX) \
|
||||
xpotf2.$(SUFFIX) xpotrf.$(SUFFIX) xtrti2.$(SUFFIX) xtrtri.$(SUFFIX) \
|
||||
xlaswp.$(SUFFIX) xgetrs.$(SUFFIX) xgesv.$(SUFFIX) xpotri.$(SUFFIX) \
|
||||
xlaswp.$(SUFFIX) xtrtrs.$(SUFFIX) xgesv.$(SUFFIX) xpotri.$(SUFFIX) \
|
||||
xtrtrs.$(SUFFIX)
|
||||
|
||||
ifneq ($(NO_LAPACK), 1)
|
||||
SBLASOBJS += $(SLAPACKOBJS)
|
||||
|
@ -2031,7 +2032,7 @@ sgetrs.$(SUFFIX) sgetrs.$(PSUFFIX) : lapack/getrs.c
|
|||
dgetrs.$(SUFFIX) dgetrs.$(PSUFFIX) : lapack/getrs.c
|
||||
$(CC) -c $(CFLAGS) $< -o $(@F)
|
||||
|
||||
qgetrs.$(SUFFIX) qgetrs.$(PSUFFIX) : getrs.c
|
||||
qgetrs.$(SUFFIX) qgetrs.$(PSUFFIX) : lapack/getrs.c
|
||||
$(CC) -c $(CFLAGS) $< -o $(@F)
|
||||
|
||||
cgetrs.$(SUFFIX) cgetrs.$(PSUFFIX) : lapack/zgetrs.c
|
||||
|
@ -2040,7 +2041,25 @@ cgetrs.$(SUFFIX) cgetrs.$(PSUFFIX) : lapack/zgetrs.c
|
|||
zgetrs.$(SUFFIX) zgetrs.$(PSUFFIX) : lapack/zgetrs.c
|
||||
$(CC) -c $(CFLAGS) $< -o $(@F)
|
||||
|
||||
xgetrs.$(SUFFIX) xgetrs.$(PSUFFIX) : zgetrs.c
|
||||
xgetrs.$(SUFFIX) xgetrs.$(PSUFFIX) : lapack/zgetrs.c
|
||||
$(CC) -c $(CFLAGS) $< -o $(@F)
|
||||
|
||||
strtrs.$(SUFFIX) strtrs.$(PSUFFIX) : lapack/trtrs.c
|
||||
$(CC) -c $(CFLAGS) $< -o $(@F)
|
||||
|
||||
dtrtrs.$(SUFFIX) dtrtrs.$(PSUFFIX) : lapack/trtrs.c
|
||||
$(CC) -c $(CFLAGS) $< -o $(@F)
|
||||
|
||||
qtrtrs.$(SUFFIX) qtrtrs.$(PSUFFIX) : lapack/trtrs.c
|
||||
$(CC) -c $(CFLAGS) $< -o $(@F)
|
||||
|
||||
ctrtrs.$(SUFFIX) ctrtrs.$(PSUFFIX) : lapack/ztrtrs.c
|
||||
$(CC) -c $(CFLAGS) $< -o $(@F)
|
||||
|
||||
ztrtrs.$(SUFFIX) ztrtrs.$(PSUFFIX) : lapack/ztrtrs.c
|
||||
$(CC) -c $(CFLAGS) $< -o $(@F)
|
||||
|
||||
xtrtrs.$(SUFFIX) xtrtrs.$(PSUFFIX) : lapack/ztrtrs.c
|
||||
$(CC) -c $(CFLAGS) $< -o $(@F)
|
||||
|
||||
sgesv.$(SUFFIX) sgesv.$(PSUFFIX) : lapack/gesv.c
|
||||
|
|
|
@ -44,19 +44,19 @@
|
|||
|
||||
#ifndef COMPLEX
|
||||
#ifdef XDOUBLE
|
||||
#define ERROR_NAME "QGESV "
|
||||
#define ERROR_NAME "QGESV"
|
||||
#elif defined(DOUBLE)
|
||||
#define ERROR_NAME "DGESV "
|
||||
#define ERROR_NAME "DGESV"
|
||||
#else
|
||||
#define ERROR_NAME "SGESV "
|
||||
#define ERROR_NAME "SGESV"
|
||||
#endif
|
||||
#else
|
||||
#ifdef XDOUBLE
|
||||
#define ERROR_NAME "XGESV "
|
||||
#define ERROR_NAME "XGESV"
|
||||
#elif defined(DOUBLE)
|
||||
#define ERROR_NAME "ZGESV "
|
||||
#define ERROR_NAME "ZGESV"
|
||||
#else
|
||||
#define ERROR_NAME "CGESV "
|
||||
#define ERROR_NAME "CGESV"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
@ -89,7 +89,7 @@ int NAME(blasint *N, blasint *NRHS, FLOAT *a, blasint *ldA, blasint *ipiv,
|
|||
if (args.m < 0) info = 1;
|
||||
|
||||
if (info) {
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
|
||||
*Info = - info;
|
||||
return 0;
|
||||
}
|
||||
|
|
|
@ -74,7 +74,7 @@ int NAME(blasint *M, blasint *N, FLOAT *a, blasint *ldA, blasint *ipiv, blasint
|
|||
if (args.n < 0) info = 2;
|
||||
if (args.m < 0) info = 1;
|
||||
if (info) {
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
|
||||
*Info = - info;
|
||||
return 0;
|
||||
}
|
||||
|
|
|
@ -74,7 +74,7 @@ int NAME(blasint *M, blasint *N, FLOAT *a, blasint *ldA, blasint *ipiv, blasint
|
|||
if (args.n < 0) info = 2;
|
||||
if (args.m < 0) info = 1;
|
||||
if (info) {
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
|
||||
*Info = - info;
|
||||
return 0;
|
||||
}
|
||||
|
|
|
@ -102,7 +102,7 @@ int NAME(char *TRANS, blasint *N, blasint *NRHS, FLOAT *a, blasint *ldA,
|
|||
if (trans < 0) info = 1;
|
||||
|
||||
if (info != 0) {
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
|
|
@ -90,7 +90,7 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){
|
|||
if (args.n < 0) info = 2;
|
||||
if (uplo < 0) info = 1;
|
||||
if (info) {
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
|
||||
*Info = - info;
|
||||
return 0;
|
||||
}
|
||||
|
|
|
@ -90,7 +90,7 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){
|
|||
if (args.n < 0) info = 2;
|
||||
if (uplo < 0) info = 1;
|
||||
if (info) {
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
|
||||
*Info = - info;
|
||||
return 0;
|
||||
}
|
||||
|
|
|
@ -90,7 +90,7 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){
|
|||
if (args.n < 0) info = 2;
|
||||
if (uplo < 0) info = 1;
|
||||
if (info) {
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
|
||||
*Info = - info;
|
||||
return 0;
|
||||
}
|
||||
|
|
|
@ -90,7 +90,7 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){
|
|||
if (args.n < 0) info = 2;
|
||||
if (uplo < 0) info = 1;
|
||||
if (info) {
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
|
||||
*Info = - info;
|
||||
return 0;
|
||||
}
|
||||
|
|
|
@ -99,7 +99,7 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){
|
|||
if (uplo < 0) info = 1;
|
||||
|
||||
if (info) {
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
|
||||
*Info = - info;
|
||||
return 0;
|
||||
}
|
||||
|
|
|
@ -96,7 +96,7 @@ int NAME(char *UPLO, char *DIAG, blasint *N, FLOAT *a, blasint *ldA, blasint *In
|
|||
if (diag < 0) info = 2;
|
||||
if (uplo < 0) info = 1;
|
||||
if (info) {
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
|
||||
*Info = - info;
|
||||
return 0;
|
||||
}
|
||||
|
|
|
@ -99,7 +99,7 @@ int NAME(char *UPLO, char *DIAG, blasint *N, FLOAT *a, blasint *ldA, blasint *In
|
|||
if (diag < 0) info = 2;
|
||||
if (uplo < 0) info = 1;
|
||||
if (info) {
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
|
||||
*Info = - info;
|
||||
return 0;
|
||||
}
|
||||
|
|
|
@ -0,0 +1,171 @@
|
|||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include "common.h"
|
||||
#ifdef FUNCTION_PROFILE
|
||||
#include "functable.h"
|
||||
#endif
|
||||
|
||||
#ifdef XDOUBLE
|
||||
#define ERROR_NAME "QTRTRS"
|
||||
#elif defined(DOUBLE)
|
||||
#define ERROR_NAME "DTRTRS"
|
||||
#else
|
||||
#define ERROR_NAME "STRTRS"
|
||||
#endif
|
||||
|
||||
/*
 * Single-threaded TRTRS kernels.  NAME() below selects an entry with
 *   index = (uplo << 2) | (trans << 1) | diag
 * where uplo: 0 = 'U', 1 = 'L'; trans: 0 = 'N'/'R', 1 = 'T'/'C';
 * diag: 0 = 'U' (unit), 1 = 'N' (non-unit).
 */
static blasint (*trtrs_single[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = {
  /* uplo = U */
  TRTRS_UNU_SINGLE, TRTRS_UNN_SINGLE,
  TRTRS_UTU_SINGLE, TRTRS_UTN_SINGLE,
  /* uplo = L */
  TRTRS_LNU_SINGLE, TRTRS_LNN_SINGLE,
  TRTRS_LTU_SINGLE, TRTRS_LTN_SINGLE,
};
|
||||
|
||||
#ifdef SMP
/*
 * Multi-threaded TRTRS kernels, laid out exactly like trtrs_single:
 *   index = (uplo << 2) | (trans << 1) | diag
 */
static blasint (*trtrs_parallel[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = {
  /* uplo = U */
  TRTRS_UNU_PARALLEL, TRTRS_UNN_PARALLEL,
  TRTRS_UTU_PARALLEL, TRTRS_UTN_PARALLEL,
  /* uplo = L */
  TRTRS_LNU_PARALLEL, TRTRS_LNN_PARALLEL,
  TRTRS_LTU_PARALLEL, TRTRS_LTN_PARALLEL,
};
#endif
|
||||
|
||||
/*
 * Fortran-callable entry point for the real ?TRTRS routine
 * (ERROR_NAME is STRTRS / DTRTRS / QTRTRS depending on precision).
 *
 * Arguments follow the LAPACK ?TRTRS calling sequence:
 *   UPLO  (1)  'U' or 'L'
 *   TRANS (2)  'N', 'T', 'R' or 'C' ('R' is treated as 'N', 'C' as 'T')
 *   DIAG  (3)  'U' (unit diagonal) or 'N' (non-unit diagonal)
 *   N     (4)  order of the triangular matrix
 *   NRHS  (5)  number of right-hand sides
 *   a,ldA (6,7) triangular matrix and its leading dimension
 *   b,ldB (8,9) right-hand sides and their leading dimension
 *   Info       output status
 *
 * On an invalid argument xerbla is called with the 1-based position of the
 * first offending argument and *Info is set to its negative.  For a non-unit
 * diagonal containing an exact zero, *Info is set to the (1-based) index
 * reported by IAMIN_K and nothing is solved.  Otherwise *Info is 0.
 * The function itself always returns 0.
 */
int NAME(char *UPLO, char* TRANS, char* DIAG, blasint *N, blasint *NRHS, FLOAT *a, blasint *ldA,
	 FLOAT *b, blasint *ldB, blasint *Info){

  char uplo_arg  = *UPLO;
  char trans_arg = *TRANS;
  char diag_arg  = *DIAG;

  blas_arg_t args;

  blasint info;
  int uplo, trans, diag;
  FLOAT *buffer;
#ifdef PPC440
  extern
#endif
  FLOAT *sa, *sb;

  PRINT_DEBUG_NAME;

  args.m   = *N;
  args.n   = *NRHS;
  args.a   = (void *)a;
  args.lda = *ldA;
  args.b   = (void *)b;
  args.ldb = *ldB;

  info = 0;

  /* All three option characters are case-insensitive in LAPACK, so
     normalise every one of them, not only TRANS. */
  TOUPPER(uplo_arg);
  TOUPPER(trans_arg);
  TOUPPER(diag_arg);

  trans = -1;
  if (trans_arg == 'N') trans = 0;
  if (trans_arg == 'T') trans = 1;
  if (trans_arg == 'R') trans = 0;
  if (trans_arg == 'C') trans = 1;

  uplo = -1;
  if (uplo_arg == 'U') uplo = 0;
  if (uplo_arg == 'L') uplo = 1;

  diag = -1;
  if (diag_arg == 'U') diag = 0;
  if (diag_arg == 'N') diag = 1;

  /* Checks run from the last argument to the first so that the lowest
     argument position (the first invalid argument) is the one reported. */
  if (args.ldb < MAX(1, args.m)) info = 9;
  if (args.lda < MAX(1, args.m)) info = 7;
  if (args.n < 0)                info = 5;
  if (args.m < 0)                info = 4;
  if (diag  < 0)                 info = 3;
  if (trans < 0)                 info = 2;
  if (uplo  < 0)                 info = 1;

  if (info != 0) {
    BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
    *Info = - info;
    return 0;
  }

  args.alpha = NULL;
  args.beta  = NULL;

  *Info = 0;

  /* Quick return for an empty system. */
  if (args.m == 0) return 0;

  /* Non-unit diagonal: scan the diagonal (stride lda + 1) for an exact
     zero before any work buffer is allocated. */
  if (diag) {
    if (AMIN_K(args.m, args.a, args.lda + 1) == ZERO) {
      *Info = IAMIN_K(args.m, args.a, args.lda + 1);
      return 0;
    }
  }

  IDEBUG_START;

  FUNCTION_PROFILE_START();

#ifndef PPC440
  buffer = (FLOAT *)blas_memory_alloc(1);

  /* Carve the two packing areas handed to the kernel out of the buffer. */
  sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A);
  sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
#endif

#ifdef SMP
  args.common = NULL;
  args.nthreads = num_cpu_avail(4);

  if (args.nthreads == 1) {
#endif

    (trtrs_single[(uplo << 2) | (trans << 1) | diag])(&args, NULL, NULL, sa, sb, 0);

#ifdef SMP
  } else {
    (trtrs_parallel[(uplo << 2) | (trans << 1) | diag])(&args, NULL, NULL, sa, sb, 0);
  }
#endif

#ifndef PPC440
  blas_memory_free(buffer);
#endif

  FUNCTION_PROFILE_END(COMPSIZE * COMPSIZE, args.m * args.n, 2 * args.m * args.m * args.n);

  IDEBUG_END;

  return 0;

}
|
|
@ -74,7 +74,7 @@ int NAME(blasint *M, blasint *N, FLOAT *a, blasint *ldA, blasint *ipiv, blasint
|
|||
if (args.n < 0) info = 2;
|
||||
if (args.m < 0) info = 1;
|
||||
if (info) {
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
|
||||
*Info = - info;
|
||||
return 0;
|
||||
}
|
||||
|
|
|
@ -74,7 +74,7 @@ int NAME(blasint *M, blasint *N, FLOAT *a, blasint *ldA, blasint *ipiv, blasint
|
|||
if (args.n < 0) info = 2;
|
||||
if (args.m < 0) info = 1;
|
||||
if (info) {
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
|
||||
*Info = - info;
|
||||
return 0;
|
||||
}
|
||||
|
|
|
@ -102,7 +102,7 @@ int NAME(char *TRANS, blasint *N, blasint *NRHS, FLOAT *a, blasint *ldA,
|
|||
if (trans < 0) info = 1;
|
||||
|
||||
if (info != 0) {
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
|
|
@ -91,7 +91,7 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){
|
|||
if (args.n < 0) info = 2;
|
||||
if (uplo < 0) info = 1;
|
||||
if (info) {
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
|
||||
*Info = - info;
|
||||
return 0;
|
||||
}
|
||||
|
|
|
@ -91,7 +91,7 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){
|
|||
if (args.n < 0) info = 2;
|
||||
if (uplo < 0) info = 1;
|
||||
if (info) {
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
|
||||
*Info = - info;
|
||||
return 0;
|
||||
}
|
||||
|
|
|
@ -90,7 +90,7 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){
|
|||
if (args.n < 0) info = 2;
|
||||
if (uplo < 0) info = 1;
|
||||
if (info) {
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
|
||||
*Info = - info;
|
||||
return 0;
|
||||
}
|
||||
|
|
|
@ -99,7 +99,7 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){
|
|||
if (uplo < 0) info = 1;
|
||||
|
||||
if (info) {
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
|
||||
*Info = - info;
|
||||
return 0;
|
||||
}
|
||||
|
|
|
@ -96,7 +96,7 @@ int NAME(char *UPLO, char *DIAG, blasint *N, FLOAT *a, blasint *ldA, blasint *In
|
|||
if (diag < 0) info = 2;
|
||||
if (uplo < 0) info = 1;
|
||||
if (info) {
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
|
||||
*Info = - info;
|
||||
return 0;
|
||||
}
|
||||
|
|
|
@ -96,7 +96,7 @@ int NAME(char *UPLO, char *DIAG, blasint *N, FLOAT *a, blasint *ldA, blasint *In
|
|||
if (diag < 0) info = 2;
|
||||
if (uplo < 0) info = 1;
|
||||
if (info) {
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
|
||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
|
||||
*Info = - info;
|
||||
return 0;
|
||||
}
|
||||
|
|
|
@ -0,0 +1,171 @@
|
|||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
/* without modification, are permitted provided that the following */
|
||||
/* conditions are met: */
|
||||
/* */
|
||||
/* 1. Redistributions of source code must retain the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer. */
|
||||
/* */
|
||||
/* 2. Redistributions in binary form must reproduce the above */
|
||||
/* copyright notice, this list of conditions and the following */
|
||||
/* disclaimer in the documentation and/or other materials */
|
||||
/* provided with the distribution. */
|
||||
/* */
|
||||
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
|
||||
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
|
||||
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
|
||||
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
|
||||
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
|
||||
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
|
||||
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
|
||||
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
|
||||
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
|
||||
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
|
||||
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
|
||||
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
|
||||
/* POSSIBILITY OF SUCH DAMAGE. */
|
||||
/* */
|
||||
/* The views and conclusions contained in the software and */
|
||||
/* documentation are those of the authors and should not be */
|
||||
/* interpreted as representing official policies, either expressed */
|
||||
/* or implied, of The University of Texas at Austin. */
|
||||
/*********************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include "common.h"
|
||||
#ifdef FUNCTION_PROFILE
|
||||
#include "functable.h"
|
||||
#endif
|
||||
|
||||
#ifdef XDOUBLE
|
||||
#define ERROR_NAME "XTRTRS"
|
||||
#elif defined(DOUBLE)
|
||||
#define ERROR_NAME "ZTRTRS"
|
||||
#else
|
||||
#define ERROR_NAME "CTRTRS"
|
||||
#endif
|
||||
|
||||
/*
 * Single-threaded complex TRTRS kernels.  NAME() below selects an entry with
 *   index = (uplo << 3) | (trans << 1) | diag
 * where uplo: 0 = 'U', 1 = 'L'; trans: 0 = 'N', 1 = 'T', 2 = 'R', 3 = 'C';
 * diag: 0 = 'U' (unit), 1 = 'N' (non-unit).
 */
static blasint (*trtrs_single[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = {
  /* uplo = U */
  TRTRS_UNU_SINGLE, TRTRS_UNN_SINGLE,
  TRTRS_UTU_SINGLE, TRTRS_UTN_SINGLE,
  TRTRS_URU_SINGLE, TRTRS_URN_SINGLE,
  TRTRS_UCU_SINGLE, TRTRS_UCN_SINGLE,
  /* uplo = L */
  TRTRS_LNU_SINGLE, TRTRS_LNN_SINGLE,
  TRTRS_LTU_SINGLE, TRTRS_LTN_SINGLE,
  TRTRS_LRU_SINGLE, TRTRS_LRN_SINGLE,
  TRTRS_LCU_SINGLE, TRTRS_LCN_SINGLE,
};
|
||||
|
||||
#ifdef SMP
/*
 * Multi-threaded complex TRTRS kernels, laid out exactly like trtrs_single:
 *   index = (uplo << 3) | (trans << 1) | diag
 */
static blasint (*trtrs_parallel[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = {
  /* uplo = U */
  TRTRS_UNU_PARALLEL, TRTRS_UNN_PARALLEL,
  TRTRS_UTU_PARALLEL, TRTRS_UTN_PARALLEL,
  TRTRS_URU_PARALLEL, TRTRS_URN_PARALLEL,
  TRTRS_UCU_PARALLEL, TRTRS_UCN_PARALLEL,
  /* uplo = L */
  TRTRS_LNU_PARALLEL, TRTRS_LNN_PARALLEL,
  TRTRS_LTU_PARALLEL, TRTRS_LTN_PARALLEL,
  TRTRS_LRU_PARALLEL, TRTRS_LRN_PARALLEL,
  TRTRS_LCU_PARALLEL, TRTRS_LCN_PARALLEL,
};
#endif
|
||||
|
||||
/*
 * Fortran-callable entry point for the complex ?TRTRS routine
 * (ERROR_NAME is CTRTRS / ZTRTRS / XTRTRS depending on precision).
 *
 * Arguments follow the LAPACK ?TRTRS calling sequence:
 *   UPLO  (1)  'U' or 'L'
 *   TRANS (2)  'N', 'T', 'R' or 'C' (mapped to kernel variants 0..3)
 *   DIAG  (3)  'U' (unit diagonal) or 'N' (non-unit diagonal)
 *   N     (4)  order of the triangular matrix
 *   NRHS  (5)  number of right-hand sides
 *   a,ldA (6,7) triangular matrix and its leading dimension
 *   b,ldB (8,9) right-hand sides and their leading dimension
 *   Info       output status
 *
 * On an invalid argument xerbla is called with the 1-based position of the
 * first offending argument and *Info is set to its negative.  For a non-unit
 * diagonal for which AMIN_K reports ZERO, *Info is set to the (1-based)
 * index reported by IAMIN_K and nothing is solved.  Otherwise *Info is 0.
 * The function itself always returns 0.
 */
int NAME(char *UPLO, char* TRANS, char* DIAG, blasint *N, blasint *NRHS, FLOAT *a, blasint *ldA,
	 FLOAT *b, blasint *ldB, blasint *Info){

  char uplo_arg  = *UPLO;
  char trans_arg = *TRANS;
  char diag_arg  = *DIAG;

  blas_arg_t args;

  blasint info;
  int uplo, trans, diag;
  FLOAT *buffer;
#ifdef PPC440
  extern
#endif
  FLOAT *sa, *sb;

  PRINT_DEBUG_NAME;

  args.m   = *N;
  args.n   = *NRHS;
  args.a   = (void *)a;
  args.lda = *ldA;
  args.b   = (void *)b;
  args.ldb = *ldB;

  info = 0;

  /* All three option characters are case-insensitive in LAPACK, so
     normalise every one of them, not only TRANS. */
  TOUPPER(uplo_arg);
  TOUPPER(trans_arg);
  TOUPPER(diag_arg);

  trans = -1;
  if (trans_arg == 'N') trans = 0;
  if (trans_arg == 'T') trans = 1;
  if (trans_arg == 'R') trans = 2;
  if (trans_arg == 'C') trans = 3;

  uplo = -1;
  if (uplo_arg == 'U') uplo = 0;
  if (uplo_arg == 'L') uplo = 1;

  diag = -1;
  if (diag_arg == 'U') diag = 0;
  if (diag_arg == 'N') diag = 1;

  /* Checks run from the last argument to the first so that the lowest
     argument position (the first invalid argument) is the one reported. */
  if (args.ldb < MAX(1, args.m)) info = 9;
  if (args.lda < MAX(1, args.m)) info = 7;
  if (args.n < 0)                info = 5;
  if (args.m < 0)                info = 4;
  if (diag  < 0)                 info = 3;
  if (trans < 0)                 info = 2;
  if (uplo  < 0)                 info = 1;

  if (info != 0) {
    BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
    *Info = - info;
    return 0;
  }

  args.alpha = NULL;
  args.beta  = NULL;

  *Info = 0;

  /* Quick return for an empty system. */
  if (args.m == 0) return 0;

  /* Non-unit diagonal: scan the diagonal (stride lda + 1) for a zero
     entry before any work buffer is allocated. */
  if (diag) {
    if (AMIN_K(args.m, args.a, args.lda + 1) == ZERO) {
      *Info = IAMIN_K(args.m, args.a, args.lda + 1);
      return 0;
    }
  }

  IDEBUG_START;

  FUNCTION_PROFILE_START();

#ifndef PPC440
  buffer = (FLOAT *)blas_memory_alloc(1);

  /* Carve the two packing areas handed to the kernel out of the buffer. */
  sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A);
  sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
#endif

#ifdef SMP
  args.common = NULL;
  args.nthreads = num_cpu_avail(4);

  if (args.nthreads == 1) {
#endif

    (trtrs_single[(uplo << 3) | (trans << 1) | diag])(&args, NULL, NULL, sa, sb, 0);

#ifdef SMP
  } else {
    (trtrs_parallel[(uplo << 3) | (trans << 1) | diag])(&args, NULL, NULL, sa, sb, 0);
  }
#endif

#ifndef PPC440
  blas_memory_free(buffer);
#endif

  FUNCTION_PROFILE_END(COMPSIZE * COMPSIZE, args.m * args.n, 2 * args.m * args.m * args.n);

  IDEBUG_END;

  return 0;

}
|
|
@ -5,6 +5,11 @@ endif
|
|||
TOPDIR = ..
|
||||
include $(TOPDIR)/Makefile.system
|
||||
|
||||
|
||||
ifeq ($(C_COMPILER), GCC)
|
||||
GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9)
|
||||
endif
|
||||
|
||||
AVX2OPT =
|
||||
ifeq ($(C_COMPILER), GCC)
|
||||
# AVX2 support was added in 4.7.0
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
USE_GEMM3M = 0
|
||||
OS := $(shell uname)
|
||||
|
||||
ifeq ($(ARCH), x86)
|
||||
USE_GEMM3M = 1
|
||||
|
@ -24,9 +25,11 @@ ifeq ($(TARGET), LOONGSON3B)
|
|||
USE_TRMM = 1
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), GENERIC)
|
||||
ifneq ($(DYNAMIC_ARCH), 1)
|
||||
ifeq ($(TARGET), GENERIC)
|
||||
USE_TRMM = 1
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), HASWELL)
|
||||
USE_TRMM = 1
|
||||
|
@ -57,8 +60,6 @@ USE_TRMM = 1
|
|||
endif
|
||||
|
||||
|
||||
|
||||
|
||||
SKERNELOBJS += \
|
||||
sgemm_kernel$(TSUFFIX).$(SUFFIX) \
|
||||
$(SGEMMINCOPYOBJ) $(SGEMMITCOPYOBJ) \
|
||||
|
@ -436,7 +437,15 @@ $(KDIR)$(SGEMMONCOPYOBJ) : $(KERNELDIR)/$(SGEMMONCOPY)
|
|||
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
|
||||
|
||||
$(KDIR)$(SGEMMOTCOPYOBJ) : $(KERNELDIR)/$(SGEMMOTCOPY)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -UDOUBLE -UCOMPLEX $< -o sgemmotcopy.s
|
||||
m4 sgemmotcopy.s > sgemmotcopy_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemmotcopy_nomacros.s -o $@
|
||||
rm sgemmotcopy.s sgemmotcopy_nomacros.s
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
|
||||
endif
|
||||
|
||||
|
||||
ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
|
||||
|
||||
|
@ -444,12 +453,26 @@ $(KDIR)$(SGEMMINCOPYOBJ) : $(KERNELDIR)/$(SGEMMINCOPY)
|
|||
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
|
||||
|
||||
$(KDIR)$(SGEMMITCOPYOBJ) : $(KERNELDIR)/$(SGEMMITCOPY)
|
||||
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
|
||||
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -UDOUBLE -UCOMPLEX $< -o sgemmitcopy.s
|
||||
m4 sgemmitcopy.s > sgemmitcopy_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemmitcopy_nomacros.s -o $@
|
||||
rm sgemmitcopy.s sgemmitcopy_nomacros.s
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
|
||||
endif
|
||||
|
||||
endif
|
||||
|
||||
$(KDIR)$(DGEMMONCOPYOBJ) : $(KERNELDIR)/$(DGEMMONCOPY)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o dgemm_ncopy.s
|
||||
m4 dgemm_ncopy.s > dgemm_ncopy_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_ncopy_nomacros.s -o $@
|
||||
rm dgemm_ncopy.s dgemm_ncopy_nomacros.s
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)$(DGEMMOTCOPYOBJ) : $(KERNELDIR)/$(DGEMMOTCOPY)
|
||||
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
|
||||
|
@ -460,7 +483,14 @@ $(KDIR)$(DGEMMINCOPYOBJ) : $(KERNELDIR)/$(DGEMMINCOPY)
|
|||
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
|
||||
|
||||
$(KDIR)$(DGEMMITCOPYOBJ) : $(KERNELDIR)/$(DGEMMITCOPY)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o dgemm_itcopy.s
|
||||
m4 dgemm_itcopy.s > dgemm_itcopy_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_itcopy_nomacros.s -o $@
|
||||
rm dgemm_itcopy.s dgemm_itcopy_nomacros.s
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
|
||||
endif
|
||||
|
||||
endif
|
||||
|
||||
|
@ -496,7 +526,14 @@ $(KDIR)$(CGEMMINCOPYOBJ) : $(KERNELDIR)/$(CGEMMINCOPY)
|
|||
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
|
||||
|
||||
$(KDIR)$(CGEMMITCOPYOBJ) : $(KERNELDIR)/$(CGEMMITCOPY)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -UDOUBLE -UCOMPLEX -E $< -o cgemm_itcopy.s
|
||||
m4 cgemm_itcopy.s > cgemm_itcopy_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX cgemm_itcopy_nomacros.s -o $@
|
||||
rm cgemm_itcopy.s cgemm_itcopy_nomacros.s
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
|
||||
endif
|
||||
|
||||
endif
|
||||
|
||||
|
@ -512,7 +549,14 @@ $(KDIR)$(ZGEMMINCOPYOBJ) : $(KERNELDIR)/$(ZGEMMINCOPY)
|
|||
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
|
||||
|
||||
$(KDIR)$(ZGEMMITCOPYOBJ) : $(KERNELDIR)/$(ZGEMMITCOPY)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o zgemm_itcopy.s
|
||||
m4 zgemm_itcopy.s > zgemm_itcopy_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX zgemm_itcopy_nomacros.s -o $@
|
||||
rm zgemm_itcopy.s zgemm_itcopy_nomacros.s
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
|
||||
endif
|
||||
|
||||
endif
|
||||
|
||||
|
@ -537,37 +581,107 @@ endif
|
|||
endif
|
||||
|
||||
$(KDIR)sgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(SGEMMDEPEND)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -UDOUBLE -UCOMPLEX $< -o sgemm_kernel$(TSUFFIX).s
|
||||
m4 sgemm_kernel$(TSUFFIX).s > sgemm_kernel$(TSUFFIX)_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemm_kernel$(TSUFFIX)_nomacros.s -o $@
|
||||
rm sgemm_kernel$(TSUFFIX).s sgemm_kernel$(TSUFFIX)_nomacros.s
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)dgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) $(DGEMMDEPEND)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o dgemm_kernel$(TSUFFIX).s
|
||||
m4 dgemm_kernel$(TSUFFIX).s > dgemm_kernel$(TSUFFIX)_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_kernel$(TSUFFIX)_nomacros.s -o $@
|
||||
rm dgemm_kernel$(TSUFFIX).s dgemm_kernel$(TSUFFIX)_nomacros.s
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)qgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) $(QGEMMDEPEND)
|
||||
$(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@
|
||||
|
||||
$(KDIR)cgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DNN $< -o cgemm_kernel_n.s
|
||||
m4 cgemm_kernel_n.s > cgemm_kernel_n_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN cgemm_kernel_n_nomacros.s -o $@
|
||||
rm cgemm_kernel_n.s cgemm_kernel_n_nomacros.s
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)cgemm_kernel_l$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DCN $< -o cgemm_kernel_l.s
|
||||
m4 cgemm_kernel_l.s > cgemm_kernel_l_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCN cgemm_kernel_l_nomacros.s -o $@
|
||||
rm cgemm_kernel_l.s cgemm_kernel_l_nomacros.s
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)cgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DNC $< -o cgemm_kernel_r.s
|
||||
m4 cgemm_kernel_r.s > cgemm_kernel_r_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC cgemm_kernel_r_nomacros.s -o $@
|
||||
rm cgemm_kernel_r.s cgemm_kernel_r_nomacros.s
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)cgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DCC $< -o cgemm_kernel_b.s
|
||||
m4 cgemm_kernel_b.s > cgemm_kernel_b_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCC cgemm_kernel_b_nomacros.s -o $@
|
||||
rm cgemm_kernel_b.s cgemm_kernel_b_nomacros.s
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)zgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DNN $< -o zgemm_kernel_n.s
|
||||
m4 zgemm_kernel_n.s > zgemm_kernel_n_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN zgemm_kernel_n_nomacros.s -o $@
|
||||
rm zgemm_kernel_n.s zgemm_kernel_n_nomacros.s
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)zgemm_kernel_l$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DCN $< -o zgemm_kernel_l.s
|
||||
m4 zgemm_kernel_l.s > zgemm_kernel_l_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN zgemm_kernel_l_nomacros.s -o $@
|
||||
rm zgemm_kernel_l.s zgemm_kernel_l_nomacros.s
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)zgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DNC $< -o zgemm_kernel_r.s
|
||||
m4 zgemm_kernel_r.s > zgemm_kernel_r_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC zgemm_kernel_r_nomacros.s -o $@
|
||||
rm zgemm_kernel_r.s zgemm_kernel_r_nomacros.s
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)zgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DCC $< -o zgemm_kernel_b.s
|
||||
m4 zgemm_kernel_b.s > zgemm_kernel_b_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC zgemm_kernel_b_nomacros.s -o $@
|
||||
rm zgemm_kernel_b.s zgemm_kernel_b_nomacros.s
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)xgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND)
|
||||
$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DNN $< -o $@
|
||||
|
@ -584,28 +698,84 @@ $(KDIR)xgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMD
|
|||
|
||||
ifdef USE_TRMM
|
||||
$(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o strmmkernel_ln.s
|
||||
m4 strmmkernel_ln.s > strmmkernel_ln_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA strmmkernel_ln_nomacros.s -o $@
|
||||
rm strmmkernel_ln.s strmmkernel_ln_nomacros.s
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)strmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o strmmkernel_lt.s
|
||||
m4 strmmkernel_lt.s > strmmkernel_lt_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA strmmkernel_lt_nomacros.s -o $@
|
||||
rm strmmkernel_lt.s strmmkernel_lt_nomacros.s
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o strmmkernel_rn.s
|
||||
m4 strmmkernel_rn.s > strmmkernel_rn_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA strmmkernel_rn_nomacros.s -o $@
|
||||
rm strmmkernel_rn.s strmmkernel_rn_nomacros.s
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o strmm_kernel_rt.s
|
||||
m4 strmm_kernel_rt.s > strmm_kernel_rt_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@
|
||||
rm strmm_kernel_rt.s strmm_kernel_rt_nomacros.s
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o dtrmm_kernel_ln.s
|
||||
m4 dtrmm_kernel_ln.s > dtrmm_kernel_ln_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA dtrmm_kernel_ln_nomacros.s -o $@
|
||||
rm dtrmm_kernel_ln.s dtrmm_kernel_ln_nomacros.s
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o dtrmm_kernel_lt.s
|
||||
m4 dtrmm_kernel_lt.s > dtrmm_kernel_lt_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA dtrmm_kernel_lt_nomacros.s -o $@
|
||||
rm dtrmm_kernel_lt.s dtrmm_kernel_lt_nomacros.s
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o dtrmm_kernel_rn.s
|
||||
m4 dtrmm_kernel_rn.s > dtrmm_kernel_rn_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA dtrmm_kernel_rn_nomacros.s -o $@
|
||||
rm dtrmm_kernel_rn.s dtrmm_kernel_rn_nomacros.s
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o dtrmm_kernel_rt.s
|
||||
m4 dtrmm_kernel_rt.s > dtrmm_kernel_rt_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA dtrmm_kernel_rt_nomacros.s -o $@
|
||||
rm dtrmm_kernel_rt.s dtrmm_kernel_rt_nomacros.s
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)qtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
|
||||
|
@ -620,52 +790,165 @@ $(KDIR)qtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
|
|||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
|
||||
|
||||
$(KDIR)ctrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o ctrmm_kernel_ln.s
|
||||
m4 ctrmm_kernel_ln.s > ctrmm_kernel_ln_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN ctrmm_kernel_ln_nomacros.s -o $@
|
||||
rm ctrmm_kernel_ln.s ctrmm_kernel_ln_nomacros.s
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)ctrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o ctrmm_kernel_lt.s
|
||||
m4 ctrmm_kernel_lt.s > ctrmm_kernel_lt_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN ctrmm_kernel_lt_nomacros.s -o $@
|
||||
rm ctrmm_kernel_lt.s ctrmm_kernel_lt_nomacros.s
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)ctrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o ctrmm_kernel_lr.s
|
||||
m4 ctrmm_kernel_lr.s > ctrmm_kernel_lr_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN ctrmm_kernel_lr_nomacros.s -o $@
|
||||
rm ctrmm_kernel_lr.s ctrmm_kernel_lr_nomacros.s
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)ctrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o ctrmm_kernel_lc.s
|
||||
m4 ctrmm_kernel_lc.s > ctrmm_kernel_lc_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN ctrmm_kernel_lc_nomacros.s -o $@
|
||||
rm ctrmm_kernel_lc_nomacros.s ctrmm_kernel_lc.s
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)ctrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o ctrmm_kernel_rn.s
|
||||
m4 ctrmm_kernel_rn.s > ctrmm_kernel_rn_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN ctrmm_kernel_rn_nomacros.s -o $@
|
||||
rm ctrmm_kernel_rn.s ctrmm_kernel_rn_nomacros.s
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)ctrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o ctrmm_kernel_rt.s
|
||||
m4 ctrmm_kernel_rt.s > ctrmm_kernel_rt_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN ctrmm_kernel_rt_nomacros.s -o $@
|
||||
rm ctrmm_kernel_rt.s ctrmm_kernel_rt_nomacros.s
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)ctrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o ctrmm_kernel_rr.s
|
||||
m4 ctrmm_kernel_rr.s > ctrmm_kernel_rr_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC ctrmm_kernel_rr_nomacros.s -o $@
|
||||
rm ctrmm_kernel_rr.s ctrmm_kernel_rr_nomacros.s
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o ctrmm_kernel_RC.s
|
||||
m4 ctrmm_kernel_RC.s > ctrmm_kernel_RC_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC ctrmm_kernel_RC_nomacros.s -o $@
|
||||
rm ctrmm_kernel_RC.s ctrmm_kernel_RC_nomacros.s
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o ztrmm_kernel_ln.s
|
||||
m4 ztrmm_kernel_ln.s > ztrmm_kernel_ln_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN ztrmm_kernel_ln_nomacros.s -o $@
|
||||
rm ztrmm_kernel_ln.s ztrmm_kernel_ln_nomacros.s
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)ztrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o ztrmm_kernel_lt.s
|
||||
m4 ztrmm_kernel_lt.s > ztrmm_kernel_lt_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN ztrmm_kernel_lt_nomacros.s -o $@
|
||||
rm ztrmm_kernel_lt.s ztrmm_kernel_lt_nomacros.s
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)ztrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o ztrmm_kernel_lr.s
|
||||
m4 ztrmm_kernel_lr.s > ztrmm_kernel_lr_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN ztrmm_kernel_lr_nomacros.s -o $@
|
||||
rm ztrmm_kernel_lr.s ztrmm_kernel_lr_nomacros.s
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)ztrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o ztrmm_kernel_lc.s
|
||||
m4 ztrmm_kernel_lc.s >ztrmm_kernel_lc_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN ztrmm_kernel_lc_nomacros.s -o $@
|
||||
rm ztrmm_kernel_lc.s ztrmm_kernel_lc_nomacros.s
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)ztrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o ztrmm_kernel_rn.s
|
||||
m4 ztrmm_kernel_rn.s > ztrmm_kernel_rn_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN ztrmm_kernel_rn_nomacros.s -o $@
|
||||
rm ztrmm_kernel_rn.s ztrmm_kernel_rn_nomacros.s
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)ztrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o ztrmm_kernel_rt.s
|
||||
m4 ztrmm_kernel_rt.s > ztrmm_kernel_rt_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN ztrmm_kernel_rt_nomacros.s -o $@
|
||||
rm ztrmm_kernel_rt.s ztrmm_kernel_rt_nomacros.s
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o ztrmm_kernel_rr.s
|
||||
m4 ztrmm_kernel_rr.s > ztrmm_kernel_rr_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC ztrmm_kernel_rr_nomacros.s -o $@
|
||||
rm ztrmm_kernel_rr.s ztrmm_kernel_rr_nomacros.s
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o ztrmm_kernel_rc.s
|
||||
m4 ztrmm_kernel_rc.s > ztrmm_kernel_rc_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC ztrmm_kernel_rc_nomacros.s -o $@
|
||||
rm ztrmm_kernel_rc.s ztrmm_kernel_rc_nomacros.s
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
|
||||
endif
|
||||
|
||||
else
|
||||
$(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
|
||||
|
@ -677,7 +960,14 @@ $(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
|
|||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
|
||||
|
||||
$(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o strmm_kernel_rt.s
|
||||
m4 strmm_kernel_rt.s > strmm_kernel_rt_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@
|
||||
rm strmm_kernel_rt.s strmm_kernel_rt_nomacros.s
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL)
|
||||
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
|
||||
|
@ -804,7 +1094,14 @@ $(KDIR)dtrsm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_LN) $(DT
|
|||
$(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -DUPPER -DLN -UCONJ $< -o $@
|
||||
|
||||
$(KDIR)dtrsm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_LT) $(DTRSMDEPEND)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -E -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ $< -o dtrsm_kernel_lt.s
|
||||
m4 dtrsm_kernel_lt.s > dtrsm_kernel_lt_nomacros.s
|
||||
$(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ dtrsm_kernel_lt_nomacros.s -o $@
|
||||
rm dtrsm_kernel_lt.s dtrsm_kernel_lt_nomacros.s
|
||||
else
|
||||
$(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)dtrsm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_RN) $(DTRSMDEPEND)
|
||||
$(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -DUPPER -DRN -UCONJ $< -o $@
|
||||
|
@ -1940,7 +2237,7 @@ $(SGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(SGEMMITCOPY)
|
|||
|
||||
endif
|
||||
|
||||
$(DGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(DGEMMONCOPY)
|
||||
# Builds the $(PFLAGS) variant of the DGEMM "oncopy" object from $(DGEMMONCOPY).
# The target variable must be spelled $(DGEMMONCOPYOBJ_P), like the sibling
# $(DGEMMOTCOPYOBJ_P) rule; a misspelled name is an undefined variable and
# leaves the rule without a target.
$(DGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(DGEMMONCOPY)
|
||||
$(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
|
||||
|
||||
$(DGEMMOTCOPYOBJ_P) : $(KERNELDIR)/$(DGEMMOTCOPY)
|
||||
|
@ -2044,7 +2341,14 @@ $(KDIR)cgemm_kernel_l$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMM
|
|||
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $@
|
||||
|
||||
# $(PSUFFIX) build of the single-precision complex GEMM kernel, -DNC variant.
# On AIX the source is preprocessed (-E), run through m4, and the m4 output is
# compiled; the two intermediate files are removed afterwards.
# Both branches use $(PFLAGS), like every other $(PSUFFIX) rule in this file.
$(KDIR)cgemm_kernel_r$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
|
||||
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DNC $< -o $@
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(PFLAGS) -E -UDOUBLE -DCOMPLEX -DNC $< -o cgemm_kernel_r.s
|
||||
m4 cgemm_kernel_r.s > cgemm_kernel_r_nomacros.s
|
||||
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DNC cgemm_kernel_r_nomacros.s -o $@
|
||||
rm cgemm_kernel_r.s cgemm_kernel_r_nomacros.s
|
||||
else
|
||||
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DNC $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)cgemm_kernel_b$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
|
||||
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $@
|
||||
|
@ -2083,7 +2387,14 @@ $(KDIR)strmm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
|
|||
$(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
|
||||
|
||||
# $(PSUFFIX) build of the single-precision TRMM kernel, -ULEFT -DTRANSA variant,
# compiled from $(SGEMMKERNEL) with -DTRMMKERNEL.
# On AIX the source is preprocessed (-E) into strmm_kernel_rt.s, that file is
# run through m4, and the m4 output is compiled; both intermediates are removed.
# m4 must read the file produced by the -E step of this rule, and every compiler
# invocation uses $(PFLAGS) so the preprocessed and compiled code see the same flags.
$(KDIR)strmm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
|
||||
$(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(PFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o strmm_kernel_rt.s
|
||||
m4 strmm_kernel_rt.s > strmm_kernel_rt_nomacros.s
|
||||
$(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@
|
||||
rm strmm_kernel_rt.s strmm_kernel_rt_nomacros.s
|
||||
else
|
||||
$(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL)
|
||||
$(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
|
||||
|
|
|
@ -91,12 +91,10 @@ IDAMAXKERNEL = iamax.S
|
|||
ICAMAXKERNEL = izamax.S
|
||||
IZAMAXKERNEL = izamax.S
|
||||
|
||||
ifneq ($(OS_DARWIN)$(CROSS),11)
|
||||
SNRM2KERNEL = nrm2.S
|
||||
DNRM2KERNEL = nrm2.S
|
||||
CNRM2KERNEL = znrm2.S
|
||||
ZNRM2KERNEL = znrm2.S
|
||||
endif
|
||||
|
||||
DDOTKERNEL = dot.S
|
||||
SDOTKERNEL = dot.S
|
||||
|
@ -104,48 +102,35 @@ CDOTKERNEL = zdot.S
|
|||
ZDOTKERNEL = zdot.S
|
||||
DSDOTKERNEL = dot.S
|
||||
|
||||
ifeq ($(OS_DARWIN)$(CROSS),11)
|
||||
DGEMM_BETA = dgemm_beta.S
|
||||
SGEMM_BETA = sgemm_beta.S
|
||||
|
||||
STRMMKERNEL = ../generic/trmmkernel_2x2.c
|
||||
DTRMMKERNEL = ../generic/trmmkernel_2x2.c
|
||||
CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
|
||||
ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
|
||||
|
||||
SGEMMKERNEL = ../generic/gemmkernel_2x2.c
|
||||
SGEMMONCOPY = ../generic/gemm_ncopy_2.c
|
||||
SGEMMOTCOPY = ../generic/gemm_tcopy_2.c
|
||||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
DGEMMKERNEL = ../generic/gemmkernel_2x2.c
|
||||
DGEMMONCOPY = ../generic/gemm_ncopy_2.c
|
||||
DGEMMOTCOPY = ../generic/gemm_tcopy_2.c
|
||||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
CGEMMKERNEL = ../generic/zgemmkernel_2x2.c
|
||||
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
||||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c
|
||||
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
||||
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
else
|
||||
SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
|
||||
STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
|
||||
ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
|
||||
SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
|
||||
ifeq ($(SGEMM_UNROLL_M), 16)
|
||||
SGEMMITCOPY = sgemm_tcopy_$(SGEMM_UNROLL_M).S
|
||||
else
|
||||
SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
|
||||
endif
|
||||
ifeq ($(SGEMM_UNROLL_M), 4)
|
||||
SGEMMINCOPY = sgemm_ncopy_$(SGEMM_UNROLL_M).S
|
||||
else
|
||||
SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
|
||||
endif
|
||||
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c
|
||||
ifeq ($(SGEMM_UNROLL_N), 16)
|
||||
SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S
|
||||
else
|
||||
SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c
|
||||
endif
|
||||
ifeq ($(SGEMM_UNROLL_N), 4)
|
||||
SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S
|
||||
else
|
||||
SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c
|
||||
endif
|
||||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
|
@ -202,5 +187,3 @@ ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
|
|||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
|
||||
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
endif
|
||||
|
|
|
@ -109,13 +109,29 @@ ZGEMVTKERNEL = zgemv_t.S
|
|||
SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
|
||||
STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
|
||||
ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
|
||||
SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
|
||||
ifeq ($(SGEMM_UNROLL_M), 16)
|
||||
SGEMMITCOPY = sgemm_tcopy_$(SGEMM_UNROLL_M).S
|
||||
else
|
||||
SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
|
||||
endif
|
||||
ifeq ($(SGEMM_UNROLL_M), 4)
|
||||
SGEMMINCOPY = sgemm_ncopy_$(SGEMM_UNROLL_M).S
|
||||
else
|
||||
SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
|
||||
endif
|
||||
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c
|
||||
ifeq ($(SGEMM_UNROLL_N), 16)
|
||||
SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S
|
||||
else
|
||||
SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c
|
||||
endif
|
||||
ifeq ($(SGEMM_UNROLL_N), 4)
|
||||
SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S
|
||||
else
|
||||
SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c
|
||||
endif
|
||||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
|
|
|
@ -0,0 +1,252 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
||||
/* Register and constant names used by the routine below. */
/* M: element count handled per outer iteration (M / 16 blocks plus M % 16 leftovers). */
#define M x0
|
||||
/* N: outer loop trip count. */
#define N x1
|
||||
/* BETA: scale factor as received; compared against 0.0 to select the code path. */
#define BETA d0
|
||||
/* LDC: loaded from [sp] on entry, then shifted left by 3 and used as the byte step between outer iterations. */
#define LDC x6
|
||||
/* C00: start address of the next outer iteration; advanced by LDC each time. */
#define C00 x7
|
||||
|
||||
/* A01..A04: working pointers inside the current outer iteration. */
#define A01 x8
|
||||
#define A02 x9
|
||||
#define A03 x10
|
||||
#define A04 x11
|
||||
|
||||
/* beta0 / betaV0: copy of BETA kept in register 11, in scalar and by-element form. */
#define beta0 d11
|
||||
#define betaV0 v11.d[0]
|
||||
/* I: inner loop counter. */
#define I x16
|
||||
|
||||
/* prfm_size: byte offset used by the prfm instructions in the main loop. */
#define prfm_size 640
|
||||
/* calc_size: bytes by which A01..A04 advance per main loop iteration. */
#define calc_size 128
|
||||
|
||||
/**************************************************************************************
|
||||
* Macro definitions
|
||||
**************************************************************************************/
|
||||
|
||||
.macro SAVE_REGS
	// Open a 176-byte (11 * 16) frame below sp and spill d8-d17 and
	// x18-x28 into it. RESTORE_REGS reads the same slots back.
	sub	sp, sp, #(11 * 16)
	stp	d8,  d9,  [sp, #0]
	stp	d10, d11, [sp, #16]
	stp	d12, d13, [sp, #32]
	stp	d14, d15, [sp, #48]
	stp	d16, d17, [sp, #64]
	stp	x18, x19, [sp, #80]
	stp	x20, x21, [sp, #96]
	stp	x22, x23, [sp, #112]
	stp	x24, x25, [sp, #128]
	stp	x26, x27, [sp, #144]
	str	x28,      [sp, #160]
.endm
|
||||
|
||||
.macro RESTORE_REGS
	// Reload d8-d17 and x18-x28 from the frame written by SAVE_REGS,
	// then release the 176-byte (11 * 16) frame.
	ldp	d8,  d9,  [sp, #0]
	ldp	d10, d11, [sp, #16]
	ldp	d12, d13, [sp, #32]
	ldp	d14, d15, [sp, #48]
	ldp	d16, d17, [sp, #64]
	ldp	x18, x19, [sp, #80]
	ldp	x20, x21, [sp, #96]
	ldp	x22, x23, [sp, #112]
	ldp	x24, x25, [sp, #128]
	ldp	x26, x27, [sp, #144]
	ldr	x28,      [sp, #160]
	add	sp, sp, #(11 * 16)
.endm
|
||||
|
||||
.macro INIT_ZERO
	// Clear v0-v7 so that the store-only (beta == 0) loop writes zeros.
	//
	// The registers are cleared outright instead of being multiplied by
	// beta: v0-v7 hold arbitrary values at this point, and 0 * NaN or
	// 0 * Inf yields NaN, which would then be written to memory in place
	// of zero.
	movi	v0.16b, #0
	movi	v1.16b, #0
	movi	v2.16b, #0
	movi	v3.16b, #0
	movi	v4.16b, #0
	movi	v5.16b, #0
	movi	v6.16b, #0
	movi	v7.16b, #0
.endm
|
||||
|
||||
/**************************************************************************************
|
||||
* End of macro definitions
|
||||
**************************************************************************************/
|
||||
|
||||
PROLOGUE

	.align 5

	// LDC is fetched from the caller stack slot at [sp]; this has to
	// happen before SAVE_REGS lowers sp.
	ldr	LDC, [sp]
	SAVE_REGS

.Lgemm_beta_BEGIN:

	fmov	beta0, BETA			// private copy of beta; v0 is overwritten by the loads below
	cmp	N, #0
	b.le	.Lgemm_beta_L999		// N <= 0: return immediately

	fcmp	BETA, #0.0
	b.eq	.Lgemm_beta_zero_01		// beta == 0: store-only path, memory is never loaded

.Lgemm_beta_01:

	lsl	LDC, LDC, #3			// LDC * 8: element step -> byte step

	.align 5
.Lgemm_beta_02:					// ---- scaling path, one pass per N ----

	mov	A01, C00			// A01 walks the current run of M elements
	add	C00, C00, LDC			// C00 now addresses the next run
	asr	I, M, #4			// I = M / 16
	cmp	I, #0
	b.le	.Lgemm_beta_04
	add	A02, A01, #32			// A01..A04 address four consecutive 32-byte slices
	add	A03, A02, #32
	add	A04, A03, #32

	.align 5
.Lgemm_beta_03:					// 16 doubles (calc_size bytes) per iteration

	ldp	q0, q1, [A01]
	ldp	q2, q3, [A02]
	ldp	q4, q5, [A03]
	ldp	q6, q7, [A04]

	fmul	v0.2d, v0.2d, betaV0
	fmul	v1.2d, v1.2d, betaV0

	fmul	v2.2d, v2.2d, betaV0
	fmul	v3.2d, v3.2d, betaV0

	prfm	PLDL1KEEP, [A01, prfm_size]

	fmul	v4.2d, v4.2d, betaV0
	fmul	v5.2d, v5.2d, betaV0

	prfm	PLDL1KEEP, [A03, prfm_size]

	fmul	v6.2d, v6.2d, betaV0
	fmul	v7.2d, v7.2d, betaV0

	st1	{v0.2d, v1.2d}, [A01]
	add	A01, A01, calc_size
	st1	{v2.2d, v3.2d}, [A02]
	add	A02, A02, calc_size
	st1	{v4.2d, v5.2d}, [A03]
	add	A03, A03, calc_size
	st1	{v6.2d, v7.2d}, [A04]
	add	A04, A04, calc_size

	subs	I, I, #1
	b.ne	.Lgemm_beta_03

	.align 5
.Lgemm_beta_04:					// leftovers: M % 16 elements, one at a time

	and	I, M, #15
	cmp	I, #0
	b.le	.Lgemm_beta_06

	.align 5
.Lgemm_beta_05:

	ldr	d12, [A01]
	fmul	d12, d12, beta0
	str	d12, [A01]
	add	A01, A01, #8

	subs	I, I, #1
	b.ne	.Lgemm_beta_05

	.align 5
.Lgemm_beta_06:

	subs	N, N, #1			// next outer iteration
	b.ne	.Lgemm_beta_02

	.align 5
.Lgemm_beta_L999:

	mov	x0, #0				// return value 0
	RESTORE_REGS
	ret

.Lgemm_beta_zero_01:				// ---- beta == 0 path ----
	INIT_ZERO
	lsl	LDC, LDC, #3			// LDC * 8: element step -> byte step

	.align 5
.Lgemm_beta_zero_02:
	mov	A01, C00
	add	C00, C00, LDC

	asr	I, M, #4			// I = M / 16
	cmp	I, #0
	b.le	.Lgemm_beta_zero_04

	add	A02, A01, #64			// second 64-byte half of each 128-byte block

	.align 5
.Lgemm_beta_zero_03:				// store v0-v7 over 16 doubles per iteration

	st1	{v0.2d, v1.2d, v2.2d, v3.2d}, [A01]
	add	A01, A01, calc_size
	st1	{v4.2d, v5.2d, v6.2d, v7.2d}, [A02]
	add	A02, A02, calc_size

	subs	I, I, #1
	b.ne	.Lgemm_beta_zero_03

	.align 5
.Lgemm_beta_zero_04:				// leftovers: M % 16 elements

	and	I, M, #15
	cmp	I, #0
	b.le	.Lgemm_beta_zero_06

	.align 5
.Lgemm_beta_zero_05:

	str	beta0, [A01]			// beta compared equal to 0.0 on this path
	add	A01, A01, #8

	subs	I, I, #1
	b.ne	.Lgemm_beta_zero_05

	.align 5
.Lgemm_beta_zero_06:

	subs	N, N, #1
	b.ne	.Lgemm_beta_zero_02

	.align 5
.Lgemm_beta_zero_L999:

	mov	x0, #0				// return value 0
	RESTORE_REGS
	ret

EPILOGUE
|
|
@ -54,37 +54,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#if !defined(DOUBLE)
|
||||
ldr s4, [X], #4
|
||||
fcmp s4, REGZERO
|
||||
beq KERNEL_F1_NEXT_\@
|
||||
beq 2f /* KERNEL_F1_NEXT_\@ */
|
||||
fabs s4, s4
|
||||
fcmp SCALE, s4
|
||||
bge KERNEL_F1_SCALE_GE_X_\@
|
||||
bge 1f /* KERNEL_F1_SCALE_GE_X_\@ */
|
||||
fdiv s2, SCALE, s4
|
||||
fmul s2, s2, s2
|
||||
fmul s3, SSQ, s2
|
||||
fadd SSQ, REGONE, s3
|
||||
fmov SCALE, s4
|
||||
b KERNEL_F1_NEXT_\@
|
||||
KERNEL_F1_SCALE_GE_X_\@:
|
||||
b 2f /* KERNEL_F1_NEXT_\@ */
|
||||
1: /* KERNEL_F1_SCALE_GE_X_\@: */
|
||||
fdiv s2, s4, SCALE
|
||||
fmla SSQ, s2, v2.s[0]
|
||||
#else
|
||||
ldr d4, [X], #8
|
||||
fcmp d4, REGZERO
|
||||
beq KERNEL_F1_NEXT_\@
|
||||
beq 2f /* KERNEL_F1_NEXT_\@ */
|
||||
fabs d4, d4
|
||||
fcmp SCALE, d4
|
||||
bge KERNEL_F1_SCALE_GE_X_\@
|
||||
bge 1f /* KERNEL_F1_SCALE_GE_X_\@ */
|
||||
fdiv d2, SCALE, d4
|
||||
fmul d2, d2, d2
|
||||
fmul d3, SSQ, d2
|
||||
fadd SSQ, REGONE, d3
|
||||
fmov SCALE, d4
|
||||
b KERNEL_F1_NEXT_\@
|
||||
KERNEL_F1_SCALE_GE_X_\@:
|
||||
b 2f /* KERNEL_F1_NEXT_\@ */
|
||||
1: /* KERNEL_F1_SCALE_GE_X_\@: */
|
||||
fdiv d2, d4, SCALE
|
||||
fmla SSQ, d2, v2.d[0]
|
||||
#endif
|
||||
KERNEL_F1_NEXT_\@:
|
||||
2: /* KERNEL_F1_NEXT_\@: */
|
||||
.endm
|
||||
|
||||
.macro KERNEL_S1
|
||||
|
|
|
@ -0,0 +1,259 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
||||
/* Register and constant names used by the routine below. */
/* M: element count handled per outer iteration (M / 32 blocks plus M % 32 leftovers). */
#define M x0
|
||||
/* N: outer loop trip count. */
#define N x1
|
||||
/* BETA: scale factor as received; compared against 0.0 to select the code path. */
#define BETA s0
|
||||
/* LDC: loaded from [sp] on entry, then shifted left by 2 and used as the byte step between outer iterations. */
#define LDC x6
|
||||
/* C00: start address of the next outer iteration; advanced by LDC each time. */
#define C00 x7
|
||||
|
||||
/* A01..A04: working pointers inside the current outer iteration. */
#define A01 x8
|
||||
#define A02 x9
|
||||
#define A03 x10
|
||||
#define A04 x11
|
||||
/* I: inner loop counter. */
#define I x12
|
||||
|
||||
/* beta0 / betaV0: copy of BETA kept in register 11, in scalar and by-element form. */
#define beta0 s11
|
||||
#define betaV0 v11.s[0]
|
||||
|
||||
/* prfm_size: byte offset used by the prfm instructions in the main loop. */
#define prfm_size 640
|
||||
/* calc_size: bytes by which A01..A04 advance per main loop iteration. */
#define calc_size 128
|
||||
|
||||
/**************************************************************************************
|
||||
* Macro definitions
|
||||
**************************************************************************************/
|
||||
|
||||
.macro SAVE_REGS
	// Open a 176-byte (11 * 16) frame below sp and spill d8-d17 and
	// x18-x28 into it. RESTORE_REGS reads the same slots back.
	sub	sp, sp, #(11 * 16)
	stp	d8,  d9,  [sp, #0]
	stp	d10, d11, [sp, #16]
	stp	d12, d13, [sp, #32]
	stp	d14, d15, [sp, #48]
	stp	d16, d17, [sp, #64]
	stp	x18, x19, [sp, #80]
	stp	x20, x21, [sp, #96]
	stp	x22, x23, [sp, #112]
	stp	x24, x25, [sp, #128]
	stp	x26, x27, [sp, #144]
	str	x28,      [sp, #160]
.endm
|
||||
|
||||
.macro RESTORE_REGS
	// Reload d8-d17 and x18-x28 from the frame written by SAVE_REGS,
	// then release the 176-byte (11 * 16) frame.
	ldp	d8,  d9,  [sp, #0]
	ldp	d10, d11, [sp, #16]
	ldp	d12, d13, [sp, #32]
	ldp	d14, d15, [sp, #48]
	ldp	d16, d17, [sp, #64]
	ldp	x18, x19, [sp, #80]
	ldp	x20, x21, [sp, #96]
	ldp	x22, x23, [sp, #112]
	ldp	x24, x25, [sp, #128]
	ldp	x26, x27, [sp, #144]
	ldr	x28,      [sp, #160]
	add	sp, sp, #(11 * 16)
.endm
|
||||
|
||||
.macro INIT_ZERO
	// Clear v0-v7 so that the store-only (beta == 0) loop writes zeros.
	//
	// The registers are cleared outright instead of being multiplied by
	// beta: v0-v7 hold arbitrary values at this point, and 0 * NaN or
	// 0 * Inf yields NaN, which would then be written to memory in place
	// of zero.
	movi	v0.16b, #0
	movi	v1.16b, #0
	movi	v2.16b, #0
	movi	v3.16b, #0
	movi	v4.16b, #0
	movi	v5.16b, #0
	movi	v6.16b, #0
	movi	v7.16b, #0
.endm
|
||||
|
||||
/**************************************************************************************
|
||||
* End of macro definitions
|
||||
**************************************************************************************/
|
||||
|
||||
PROLOGUE

	.align 5

	// LDC is fetched from the caller stack slot at [sp]; this has to
	// happen before SAVE_REGS lowers sp.
	ldr	LDC, [sp]
	SAVE_REGS

.Lgemm_beta_BEGIN:

	fmov	beta0, BETA			// private copy of beta; v0 is overwritten by the loads below
	cmp	N, #0
	b.le	.Lgemm_beta_L999		// N <= 0: return immediately

	fcmp	BETA, #0.0
	b.eq	.Lgemm_beta_zero_01		// beta == 0: store-only path, memory is never loaded

.Lgemm_beta_01:

	lsl	LDC, LDC, #2			// LDC * 4: element step -> byte step

	.align 5
.Lgemm_beta_02:					// ---- scaling path, one pass per N ----

	mov	A01, C00			// A01 walks the current run of M elements
	add	C00, C00, LDC			// C00 now addresses the next run
	asr	I, M, #5			// I = M / 32
	cmp	I, #0
	b.le	.Lgemm_beta_04
	add	A02, A01, #32			// A01..A04 address four consecutive 32-byte slices
	add	A03, A02, #32
	add	A04, A03, #32

	.align 5
.Lgemm_beta_03:					// 32 floats (calc_size bytes) per iteration

	prfm	PLDL1KEEP, [A01, prfm_size]

	ldp	q0, q1, [A01]
	ldp	q2, q3, [A02]
	ldp	q4, q5, [A03]
	ldp	q6, q7, [A04]

	fmul	v0.4s, v0.4s, betaV0
	fmul	v1.4s, v1.4s, betaV0

	fmul	v2.4s, v2.4s, betaV0
	fmul	v3.4s, v3.4s, betaV0

	fmul	v4.4s, v4.4s, betaV0
	fmul	v5.4s, v5.4s, betaV0

	fmul	v6.4s, v6.4s, betaV0
	fmul	v7.4s, v7.4s, betaV0

	prfm	PLDL1KEEP, [A01, prfm_size + 64]

	st1	{v0.4s, v1.4s}, [A01]
	add	A01, A01, calc_size
	st1	{v2.4s, v3.4s}, [A02]
	add	A02, A02, calc_size
	st1	{v4.4s, v5.4s}, [A03]
	add	A03, A03, calc_size
	st1	{v6.4s, v7.4s}, [A04]
	add	A04, A04, calc_size

	subs	I, I, #1
	b.ne	.Lgemm_beta_03

	.align 5
.Lgemm_beta_04:					// leftovers: M % 32 elements, one at a time

	and	I, M, #31
	cmp	I, #0
	b.le	.Lgemm_beta_06

	.align 5
.Lgemm_beta_05:

	ldr	s12, [A01]
	fmul	s12, s12, beta0
	str	s12, [A01]
	add	A01, A01, #4

	subs	I, I, #1
	b.ne	.Lgemm_beta_05

	.align 5
.Lgemm_beta_06:

	subs	N, N, #1			// next outer iteration
	b.ne	.Lgemm_beta_02

	.align 5
.Lgemm_beta_L999:

	mov	x0, #0				// return value 0
	RESTORE_REGS
	ret

	.align 5
.Lgemm_beta_zero_01:				// ---- beta == 0 path ----

	INIT_ZERO
	lsl	LDC, LDC, #2			// LDC * 4: element step -> byte step

	.align 5
.Lgemm_beta_zero_02:

	mov	A01, C00
	add	C00, C00, LDC

	asr	I, M, #5			// I = M / 32
	cmp	I, #0
	b.le	.Lgemm_beta_zero_04
	add	A02, A01, #32
	add	A03, A02, #32
	add	A04, A03, #32

	.align 5
.Lgemm_beta_zero_03:				// store v0-v7 over 32 floats per iteration

	st1	{v0.4s, v1.4s}, [A01]
	add	A01, A01, calc_size
	st1	{v2.4s, v3.4s}, [A02]
	add	A02, A02, calc_size
	st1	{v4.4s, v5.4s}, [A03]
	add	A03, A03, calc_size
	st1	{v6.4s, v7.4s}, [A04]
	add	A04, A04, calc_size

	subs	I, I, #1
	b.ne	.Lgemm_beta_zero_03

	.align 5
.Lgemm_beta_zero_04:				// leftovers: M % 32 elements

	and	I, M, #31
	cmp	I, #0
	b.le	.Lgemm_beta_zero_06

	.align 5
.Lgemm_beta_zero_05:

	str	beta0, [A01]			// beta compared equal to 0.0 on this path
	add	A01, A01, #4

	subs	I, I, #1
	b.ne	.Lgemm_beta_zero_05

	.align 5
.Lgemm_beta_zero_06:

	subs	N, N, #1
	b.ne	.Lgemm_beta_zero_02

	.align 5
.Lgemm_beta_zero_L999:
	mov	x0, #0				// return value 0
	RESTORE_REGS
	ret

EPILOGUE
|
|
@ -0,0 +1,333 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
||||
/* Register and constant names used by the copy macros and the routine below. */
/* M: inner extent; split into M / 4 four-element steps plus M & 3 single steps. */
#define M x0
|
||||
/* N: outer extent; handled as N / 4 groups of four, then an optional pair, then an optional single. */
#define N x1
|
||||
/* A00: source address of the next group; advanced past each group by multiples of LDA. */
#define A00 x2
|
||||
/* LDA: shifted left by 2 on entry, then used as the byte distance between A01..A04. */
#define LDA x3
|
||||
/* B00: destination pointer, advanced by every COPY macro. */
#define B00 x4
|
||||
|
||||
/* A01..A04: source pointers of the current group, LDA bytes apart. */
#define A01 x5
|
||||
#define A02 x6
|
||||
#define A03 x7
|
||||
#define A04 x8
|
||||
|
||||
/* I: inner loop counter. */
#define I x9
|
||||
/* J: outer loop counter (N / 4). */
#define J x10
|
||||
|
||||
/* TEMP1 / TEMP2: not referenced by the code visible in this file section. */
#define TEMP1 x11
|
||||
#define TEMP2 x12
|
||||
|
||||
/* A_PREFETCH: byte offset used by the prfm instructions in the COPY macros. */
#define A_PREFETCH 2560
|
||||
|
||||
/**************************************************************************************
|
||||
* Macro definitions
|
||||
**************************************************************************************/
|
||||
|
||||
.macro SAVE_REGS
	// Open a 176-byte (11 * 16) frame below sp and spill d8-d17 and
	// x18-x28 into it. RESTORE_REGS reads the same slots back.
	sub	sp, sp, #(11 * 16)
	stp	d8,  d9,  [sp, #0]
	stp	d10, d11, [sp, #16]
	stp	d12, d13, [sp, #32]
	stp	d14, d15, [sp, #48]
	stp	d16, d17, [sp, #64]
	stp	x18, x19, [sp, #80]
	stp	x20, x21, [sp, #96]
	stp	x22, x23, [sp, #112]
	stp	x24, x25, [sp, #128]
	stp	x26, x27, [sp, #144]
	str	x28,      [sp, #160]
.endm
|
||||
|
||||
.macro RESTORE_REGS
	// Reload d8-d17 and x18-x28 from the frame written by SAVE_REGS,
	// then release the 176-byte (11 * 16) frame.
	ldp	d8,  d9,  [sp, #0]
	ldp	d10, d11, [sp, #16]
	ldp	d12, d13, [sp, #32]
	ldp	d14, d15, [sp, #48]
	ldp	d16, d17, [sp, #64]
	ldp	x18, x19, [sp, #80]
	ldp	x20, x21, [sp, #96]
	ldp	x22, x23, [sp, #112]
	ldp	x24, x25, [sp, #128]
	ldp	x26, x27, [sp, #144]
	ldr	x28,      [sp, #160]
	add	sp, sp, #(11 * 16)
.endm
|
||||
|
||||
.macro COPY4x4
	// Read four consecutive 32-bit elements from each of A01..A04,
	// regroup them so that output vector k (v8..v11) holds element k of
	// all four inputs, and write the 64 bytes to B00.
	// A01..A04 advance by 16 bytes, B00 by 64.
	prfm	PLDL1KEEP, [A01, #A_PREFETCH]
	prfm	PLDL1KEEP, [A02, #A_PREFETCH]
	prfm	PLDL1KEEP, [A03, #A_PREFETCH]
	prfm	PLDL1KEEP, [A04, #A_PREFETCH]

	ldr	q0, [A01], #16
	mov	v8.s[0],  v0.s[0]
	mov	v9.s[0],  v0.s[1]
	mov	v10.s[0], v0.s[2]
	mov	v11.s[0], v0.s[3]

	ldr	q1, [A02], #16
	mov	v8.s[1],  v1.s[0]
	mov	v9.s[1],  v1.s[1]
	mov	v10.s[1], v1.s[2]
	mov	v11.s[1], v1.s[3]

	ldr	q2, [A03], #16
	mov	v8.s[2],  v2.s[0]
	mov	v9.s[2],  v2.s[1]
	mov	v10.s[2], v2.s[2]
	mov	v11.s[2], v2.s[3]

	ldr	q3, [A04], #16
	mov	v8.s[3],  v3.s[0]
	mov	v9.s[3],  v3.s[1]
	mov	v10.s[3], v3.s[2]
	mov	v11.s[3], v3.s[3]

	st1	{v8.4s, v9.4s, v10.4s, v11.4s}, [B00], #64
.endm
|
||||
|
||||
.macro COPY1x4
	// Read one 32-bit element from each of A01..A04 and write the four
	// values back to back at B00.
	// A01..A04 advance by 4 bytes, B00 by 16.
	prfm	PLDL1KEEP, [A01, #A_PREFETCH]
	prfm	PLDL1KEEP, [A02, #A_PREFETCH]
	prfm	PLDL1KEEP, [A03, #A_PREFETCH]
	prfm	PLDL1KEEP, [A04, #A_PREFETCH]

	ldr	s0, [A01], #4
	ldr	s1, [A02], #4
	ldr	s2, [A03], #4
	ldr	s3, [A04], #4

	stp	s0, s1, [B00], #8
	stp	s2, s3, [B00], #8
.endm
|
||||
|
||||
.macro COPY4x2
	// Read four consecutive 32-bit elements from each of A01 and A02 and
	// write them pairwise interleaved (element k of A01, element k of
	// A02) as 32 bytes at B00.
	// A01 and A02 advance by 16 bytes, B00 by 32.
	prfm	PLDL1KEEP, [A01, #A_PREFETCH]
	prfm	PLDL1KEEP, [A02, #A_PREFETCH]

	ldr	q0, [A01], #16
	mov	v8.s[0],  v0.s[0]
	mov	v9.s[0],  v0.s[1]
	mov	v10.s[0], v0.s[2]
	mov	v11.s[0], v0.s[3]

	ldr	q1, [A02], #16
	mov	v8.s[1],  v1.s[0]
	mov	v9.s[1],  v1.s[1]
	mov	v10.s[1], v1.s[2]
	mov	v11.s[1], v1.s[3]

	st1	{v8.2s, v9.2s, v10.2s, v11.2s}, [B00], #32
.endm
|
||||
|
||||
|
||||
.macro COPY1x2
	// Read one 32-bit element from each of A01 and A02 and write the
	// pair at B00. A01 and A02 advance by 4 bytes, B00 by 8.
	prfm	PLDL1KEEP, [A01, #A_PREFETCH]
	prfm	PLDL1KEEP, [A02, #A_PREFETCH]

	ldr	s0, [A01], #4
	ldr	s1, [A02], #4

	stp	s0, s1, [B00], #8
.endm
|
||||
|
||||
.macro COPY4x1
	// Move 16 bytes (four 32-bit elements) from A01 to B00 and advance
	// both pointers by 16.
	prfm	PLDL1KEEP, [A01, #A_PREFETCH]

	ldr	q0, [A01], #16
	str	q0, [B00], #16
.endm
|
||||
|
||||
|
||||
.macro COPY1x1
	// Move one 32-bit element from A01 to B00 and advance both pointers
	// by 4.
	prfm	PLDL1KEEP, [A01, #A_PREFETCH]

	ldr	s0, [A01], #4
	str	s0, [B00], #4
.endm
|
||||
|
||||
/**************************************************************************************
|
||||
* End of macro definitions
|
||||
**************************************************************************************/
|
||||
|
||||
PROLOGUE

	.align 5

	SAVE_REGS

	lsl	LDA, LDA, #2			// LDA * 4: element step -> byte step

.Ldgemm_ncopy_L4_BEGIN:				// ---- groups of four (N / 4 of them) ----

	asr	J, N, #2
	cmp	J, #0
	b.le	.Ldgemm_ncopy_L2_BEGIN

	.align 5
.Ldgemm_ncopy_L4_M4_BEGIN:

	mov	A01, A00			// four source pointers, LDA bytes apart
	add	A02, A01, LDA
	add	A03, A02, LDA
	add	A04, A03, LDA
	add	A00, A04, LDA			// A00 now addresses the next group

	asr	I, M, #2			// I = M / 4
	cmp	I, #0
	b.le	.Ldgemm_ncopy_L4_M4_40

	.align 5
.Ldgemm_ncopy_L4_M4_20:

	COPY4x4

	subs	I, I, #1
	b.ne	.Ldgemm_ncopy_L4_M4_20

.Ldgemm_ncopy_L4_M4_40:

	and	I, M, #3			// leftover M % 4 elements
	cmp	I, #0
	b.le	.Ldgemm_ncopy_L4_M4_END

	.align 5
.Ldgemm_ncopy_L4_M4_60:

	COPY1x4

	subs	I, I, #1
	b.ne	.Ldgemm_ncopy_L4_M4_60

.Ldgemm_ncopy_L4_M4_END:

	subs	J, J, #1
	b.ne	.Ldgemm_ncopy_L4_M4_BEGIN

/*********************************************************************************************/

.Ldgemm_ncopy_L2_BEGIN:				// ---- optional group of two ----

	tst	N, #3
	b.le	.Ldgemm_ncopy_L999		// taken when (N & 3) == 0

	tst	N, #2
	b.le	.Ldgemm_ncopy_L1_BEGIN		// taken when (N & 2) == 0

.Ldgemm_ncopy_L2_M4_BEGIN:
	mov	A01, A00
	add	A02, A01, LDA
	add	A00, A02, LDA

	asr	I, M, #2			// I = M / 4
	cmp	I, #0
	b.le	.Ldgemm_ncopy_L2_M4_40

	.align 5
.Ldgemm_ncopy_L2_M4_20:

	COPY4x2

	subs	I, I, #1
	b.ne	.Ldgemm_ncopy_L2_M4_20

.Ldgemm_ncopy_L2_M4_40:

	and	I, M, #3			// leftover M % 4 elements
	cmp	I, #0
	b.le	.Ldgemm_ncopy_L2_M4_END

	.align 5
.Ldgemm_ncopy_L2_M4_60:

	COPY1x2

	subs	I, I, #1
	b.ne	.Ldgemm_ncopy_L2_M4_60

.Ldgemm_ncopy_L2_M4_END:

/*********************************************************************************************/

.Ldgemm_ncopy_L1_BEGIN:				// ---- optional single ----

	tst	N, #1
	b.le	.Ldgemm_ncopy_L999		// taken when (N & 1) == 0

.Ldgemm_ncopy_L1_M4_BEGIN:

	mov	A01, A00

	asr	I, M, #2			// I = M / 4
	cmp	I, #0
	b.le	.Ldgemm_ncopy_L1_M4_40

	.align 5
.Ldgemm_ncopy_L1_M4_20:

	COPY4x1

	subs	I, I, #1
	b.ne	.Ldgemm_ncopy_L1_M4_20

.Ldgemm_ncopy_L1_M4_40:

	and	I, M, #3			// leftover M % 4 elements
	cmp	I, #0
	b.le	.Ldgemm_ncopy_L1_M4_END

	.align 5
.Ldgemm_ncopy_L1_M4_60:

	COPY1x1

	subs	I, I, #1
	b.ne	.Ldgemm_ncopy_L1_M4_60

.Ldgemm_ncopy_L1_M4_END:

.Ldgemm_ncopy_L999:

	mov	x0, #0				// return value 0
	RESTORE_REGS
	ret

EPILOGUE
|
||||
|
|
@ -0,0 +1,824 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2019, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
*****************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
||||
/* Register and constant names used by the copy macros below. */
/* M, N, A, LDA, B: not referenced by the macros in this part of the file; */
/* presumably the incoming arguments (extents, source, stride, destination) - TODO confirm against the routine body. */
#define M x0
|
||||
#define N x1
|
||||
#define A x2
|
||||
#define LDA x3
|
||||
#define B x4
|
||||
|
||||
/* M8: byte step added to B00 after each 16-element copy. */
#define M8 x5
|
||||
|
||||
/* A01..A08: source pointers read and advanced by the COPY macros. */
#define A01 x6
|
||||
#define A02 x7
|
||||
#define A03 x8
|
||||
#define A04 x9
|
||||
#define A05 x10
|
||||
#define A06 x11
|
||||
#define A07 x12
|
||||
#define A08 x13
|
||||
|
||||
/* B01..B04: destination pointers of the 8-, 4-, 2- and 1-element copy macros respectively. */
#define B01 x14
|
||||
#define B02 x15
|
||||
#define B03 x16
|
||||
#define B04 x17
|
||||
/* B00: destination pointer of the 16-element copy macros. */
#define B00 x22
|
||||
|
||||
|
||||
/* I, J: not referenced by the macros in this part of the file; presumably loop counters. */
/* NOTE(review): x18 is the AAPCS64 platform register and is reserved on some targets - confirm it is safe to use as I. */
#define I x18
|
||||
#define J x19
|
||||
|
||||
/* TEMP1: running store address inside the 16-element copy macros. */
#define TEMP1 x20
|
||||
|
||||
/* A_PREFETCH: byte offset used by the prfm instructions in the COPY macros. */
#define A_PREFETCH 256
|
||||
|
||||
/**************************************************************************************
|
||||
* Macro definitions
|
||||
**************************************************************************************/
|
||||
.macro SAVE_REGS
	// Open a 176-byte (11 * 16) frame below sp and spill d8-d17 and
	// x18-x28 into it. RESTORE_REGS reads the same slots back.
	sub	sp, sp, #(11 * 16)
	stp	d8,  d9,  [sp, #0]
	stp	d10, d11, [sp, #16]
	stp	d12, d13, [sp, #32]
	stp	d14, d15, [sp, #48]
	stp	d16, d17, [sp, #64]
	stp	x18, x19, [sp, #80]
	stp	x20, x21, [sp, #96]
	stp	x22, x23, [sp, #112]
	stp	x24, x25, [sp, #128]
	stp	x26, x27, [sp, #144]
	str	x28,      [sp, #160]
.endm
|
||||
|
||||
.macro RESTORE_REGS
	// Reload d8-d17 and x18-x28 from the frame written by SAVE_REGS,
	// then release the 176-byte (11 * 16) frame.
	ldp	d8,  d9,  [sp, #0]
	ldp	d10, d11, [sp, #16]
	ldp	d12, d13, [sp, #32]
	ldp	d14, d15, [sp, #48]
	ldp	d16, d17, [sp, #64]
	ldp	x18, x19, [sp, #80]
	ldp	x20, x21, [sp, #96]
	ldp	x22, x23, [sp, #112]
	ldp	x24, x25, [sp, #128]
	ldp	x26, x27, [sp, #144]
	ldr	x28,      [sp, #160]
	add	sp, sp, #(11 * 16)
.endm
|
||||
|
||||
/*************************************************************************************************************************/
|
||||
|
||||
.macro COPY16x8
	// Copy 64 bytes (sixteen 32-bit elements) from each of A01..A08 into
	// one contiguous 512-byte block starting at B00, then step B00 by M8.
	// A01..A08 advance by 64 bytes; TEMP1 is the running store address.
	prfm	PLDL1KEEP, [A01, #A_PREFETCH]
	prfm	PLDL1KEEP, [A02, #A_PREFETCH]
	prfm	PLDL1KEEP, [A03, #A_PREFETCH]
	prfm	PLDL1KEEP, [A04, #A_PREFETCH]
	prfm	PLDL1KEEP, [A05, #A_PREFETCH]
	prfm	PLDL1KEEP, [A06, #A_PREFETCH]
	prfm	PLDL1KEEP, [A07, #A_PREFETCH]
	prfm	PLDL1KEEP, [A08, #A_PREFETCH]

	ld1	{v0.4s, v1.4s, v2.4s, v3.4s}, [A01], #64
	st1	{v0.4s, v1.4s, v2.4s, v3.4s}, [B00]
	add	TEMP1, B00, #64

	ld1	{v4.4s, v5.4s, v6.4s, v7.4s}, [A02], #64
	st1	{v4.4s, v5.4s, v6.4s, v7.4s}, [TEMP1], #64

	ld1	{v8.4s, v9.4s, v10.4s, v11.4s}, [A03], #64
	st1	{v8.4s, v9.4s, v10.4s, v11.4s}, [TEMP1], #64

	ld1	{v12.4s, v13.4s, v14.4s, v15.4s}, [A04], #64
	st1	{v12.4s, v13.4s, v14.4s, v15.4s}, [TEMP1], #64

	ld1	{v16.4s, v17.4s, v18.4s, v19.4s}, [A05], #64
	st1	{v16.4s, v17.4s, v18.4s, v19.4s}, [TEMP1], #64

	ld1	{v20.4s, v21.4s, v22.4s, v23.4s}, [A06], #64
	st1	{v20.4s, v21.4s, v22.4s, v23.4s}, [TEMP1], #64

	ld1	{v24.4s, v25.4s, v26.4s, v27.4s}, [A07], #64
	st1	{v24.4s, v25.4s, v26.4s, v27.4s}, [TEMP1], #64

	ld1	{v28.4s, v29.4s, v30.4s, v31.4s}, [A08], #64
	st1	{v28.4s, v29.4s, v30.4s, v31.4s}, [TEMP1], #64

	add	B00, B00, M8
.endm
|
||||
|
||||
.macro COPY8x8
	// Copy 32 bytes (eight 32-bit elements) from each of A01..A08 into a
	// contiguous 256-byte block at B01.
	// A01..A08 advance by 32 bytes, B01 by 256.
	prfm	PLDL1KEEP, [A01, #A_PREFETCH]
	prfm	PLDL1KEEP, [A02, #A_PREFETCH]
	prfm	PLDL1KEEP, [A03, #A_PREFETCH]
	prfm	PLDL1KEEP, [A04, #A_PREFETCH]
	prfm	PLDL1KEEP, [A05, #A_PREFETCH]
	prfm	PLDL1KEEP, [A06, #A_PREFETCH]
	prfm	PLDL1KEEP, [A07, #A_PREFETCH]
	prfm	PLDL1KEEP, [A08, #A_PREFETCH]

	ldp	q0, q1, [A01], #32
	ldp	q2, q3, [A02], #32
	st1	{v0.4s, v1.4s, v2.4s, v3.4s}, [B01], #64

	ldp	q4, q5, [A03], #32
	ldp	q6, q7, [A04], #32
	st1	{v4.4s, v5.4s, v6.4s, v7.4s}, [B01], #64

	ldp	q8, q9, [A05], #32
	ldp	q10, q11, [A06], #32
	st1	{v8.4s, v9.4s, v10.4s, v11.4s}, [B01], #64

	ldp	q12, q13, [A07], #32
	ldp	q14, q15, [A08], #32
	st1	{v12.4s, v13.4s, v14.4s, v15.4s}, [B01], #64
.endm
|
||||
|
||||
.macro COPY4x8
	// Copy 16 bytes (four 32-bit elements) from each of A01..A08 into a
	// contiguous 128-byte block at B02. No prefetch is issued here.
	// A01..A08 advance by 16 bytes, B02 by 128.
	ldr	q0, [A01], #16
	ldr	q1, [A02], #16
	ldr	q2, [A03], #16
	ldr	q3, [A04], #16
	st1	{v0.4s, v1.4s, v2.4s, v3.4s}, [B02], #64

	ldr	q4, [A05], #16
	ldr	q5, [A06], #16
	ldr	q6, [A07], #16
	ldr	q7, [A08], #16
	st1	{v4.4s, v5.4s, v6.4s, v7.4s}, [B02], #64
.endm
|
||||
|
||||
.macro COPY2x8
	// Copy 8 bytes (two 32-bit elements) from each of A01..A08 into a
	// contiguous 64-byte block at B03. No prefetch is issued here.
	// A01..A08 advance by 8 bytes, B03 by 64.
	ldr	d0, [A01], #8
	ldr	d1, [A02], #8
	ldr	d2, [A03], #8
	ldr	d3, [A04], #8

	stp	d0, d1, [B03], #16
	stp	d2, d3, [B03], #16

	ldr	d4, [A05], #8
	ldr	d5, [A06], #8
	ldr	d6, [A07], #8
	ldr	d7, [A08], #8

	stp	d4, d5, [B03], #16
	stp	d6, d7, [B03], #16
.endm
|
||||
|
||||
.macro COPY1x8
	// Copy one 32-bit element from each of A01..A08 into eight
	// consecutive slots at B04. No prefetch is issued here.
	// A01..A08 advance by 4 bytes (one element), B04 by 32.
	//
	// A05..A08 are read with 32-bit loads and stepped by 4, the same as
	// A01..A04. A 64-bit post-indexed reload of d4-d7 would read 4 bytes
	// past the element being copied and would step those pointers by 8.
	ldr	s0, [A01], #4
	ldr	s1, [A02], #4
	ldr	s2, [A03], #4
	ldr	s3, [A04], #4

	stp	s0, s1, [B04], #8
	stp	s2, s3, [B04], #8

	ldr	s4, [A05], #4
	ldr	s5, [A06], #4
	ldr	s6, [A07], #4
	ldr	s7, [A08], #4

	stp	s4, s5, [B04], #8
	stp	s6, s7, [B04], #8
.endm
|
||||
|
||||
/*************************************************************************************************************************/
|
||||
.macro COPY16x4
	// Copy 64 bytes (sixteen 32-bit elements) from each of A01..A04 into
	// one contiguous 256-byte block starting at B00, then step B00 by M8.
	// A01..A04 advance by 64 bytes; TEMP1 is the running store address
	// and is left pointing at the last 64-byte slot.
	prfm	PLDL1KEEP, [A01, #A_PREFETCH]
	prfm	PLDL1KEEP, [A02, #A_PREFETCH]
	prfm	PLDL1KEEP, [A03, #A_PREFETCH]
	prfm	PLDL1KEEP, [A04, #A_PREFETCH]

	ld1	{v0.4s, v1.4s, v2.4s, v3.4s}, [A01], #64
	st1	{v0.4s, v1.4s, v2.4s, v3.4s}, [B00]
	add	TEMP1, B00, #64

	ld1	{v4.4s, v5.4s, v6.4s, v7.4s}, [A02], #64
	st1	{v4.4s, v5.4s, v6.4s, v7.4s}, [TEMP1], #64

	ld1	{v8.4s, v9.4s, v10.4s, v11.4s}, [A03], #64
	st1	{v8.4s, v9.4s, v10.4s, v11.4s}, [TEMP1], #64

	ld1	{v12.4s, v13.4s, v14.4s, v15.4s}, [A04], #64
	st1	{v12.4s, v13.4s, v14.4s, v15.4s}, [TEMP1]

	add	B00, B00, M8
.endm
|
||||
|
||||
.macro COPY8x4
	// Copy 32 bytes (eight 32-bit elements) from each of A01..A04 into a
	// contiguous 128-byte block at B01.
	// A01..A04 advance by 32 bytes, B01 by 128.
	prfm	PLDL1KEEP, [A01, #A_PREFETCH]
	prfm	PLDL1KEEP, [A02, #A_PREFETCH]
	prfm	PLDL1KEEP, [A03, #A_PREFETCH]
	prfm	PLDL1KEEP, [A04, #A_PREFETCH]

	ldp	q0, q1, [A01], #32
	ldp	q2, q3, [A02], #32
	st1	{v0.4s, v1.4s, v2.4s, v3.4s}, [B01], #64

	ldp	q4, q5, [A03], #32
	ldp	q6, q7, [A04], #32
	st1	{v4.4s, v5.4s, v6.4s, v7.4s}, [B01], #64
.endm
|
||||
|
||||
.macro COPY4x4
	// Copy 16 bytes (four 32-bit elements) from each of A01..A04 into a
	// contiguous 64-byte block at B02. No prefetch is issued here.
	// A01..A04 advance by 16 bytes, B02 by 64.
	ldr	q0, [A01], #16
	ldr	q1, [A02], #16
	ldr	q2, [A03], #16
	ldr	q3, [A04], #16

	st1	{v0.4s, v1.4s, v2.4s, v3.4s}, [B02], #64
.endm
|
||||
|
||||
// COPY2x4: copy two 32-bit elements from each of four source pointers.
// Reads 8 bytes at A01..A04 and appends them, in that order, as 32
// contiguous bytes at B03. A01..A04 advance by 8 bytes, B03 by 32 bytes.
// The prefetches are disabled (commented out) in this variant.
.macro COPY2x4
|
||||
//prfm PLDL1KEEP, [A01, #A_PREFETCH]
|
||||
//prfm PLDL1KEEP, [A02, #A_PREFETCH]
|
||||
//prfm PLDL1KEEP, [A03, #A_PREFETCH]
|
||||
//prfm PLDL1KEEP, [A04, #A_PREFETCH]
|
||||
|
||||
ldr d0, [A01]
|
||||
ldr d1, [A02]
|
||||
ldr d2, [A03]
|
||||
ldr d3, [A04]
|
||||
|
||||
add A01, A01, #8
|
||||
add A02, A02, #8
|
||||
add A03, A03, #8
|
||||
add A04, A04, #8
|
||||
|
||||
stp d0, d1, [B03] // pairs from A01 and A02
|
||||
add B03, B03, #16
|
||||
stp d2, d3, [B03] // pairs from A03 and A04
|
||||
|
||||
add B03, B03, #16
|
||||
.endm
|
||||
|
||||
// COPY1x4: copy one 32-bit element from each of four source pointers.
// Reads 4 bytes at A01..A04 and appends them, in that order, as 16
// contiguous bytes at B04. A01..A04 advance by 4 bytes, B04 by 16 bytes.
// The prefetches are disabled (commented out) in this variant.
.macro COPY1x4
|
||||
//prfm PLDL1KEEP, [A01, #A_PREFETCH]
|
||||
//prfm PLDL1KEEP, [A02, #A_PREFETCH]
|
||||
//prfm PLDL1KEEP, [A03, #A_PREFETCH]
|
||||
//prfm PLDL1KEEP, [A04, #A_PREFETCH]
|
||||
|
||||
ldr s0, [A01]
|
||||
ldr s1, [A02]
|
||||
ldr s2, [A03]
|
||||
ldr s3, [A04]
|
||||
|
||||
add A01, A01, #4
|
||||
add A02, A02, #4
|
||||
add A03, A03, #4
|
||||
add A04, A04, #4
|
||||
|
||||
stp s0, s1, [B04] // elements from A01 and A02
|
||||
add B04, B04, #8
|
||||
stp s2, s3, [B04] // elements from A03 and A04
|
||||
add B04, B04, #8
|
||||
|
||||
.endm
|
||||
|
||||
/*************************************************************************************************************************/
|
||||
|
||||
// COPY16x2: copy sixteen 32-bit elements from each of two source pointers.
// Reads 64 bytes at A01 and A02 and writes them as two consecutive 64-byte
// groups starting at B00 (TEMP1 addresses the second group).
// On exit A01/A02 have advanced by 64 bytes and B00 has advanced by M8.
.macro COPY16x2
|
||||
prfm PLDL1KEEP, [A01, #A_PREFETCH]
|
||||
prfm PLDL1KEEP, [A02, #A_PREFETCH]
|
||||
|
||||
ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [A01]
|
||||
add A01, A01, #64
|
||||
|
||||
ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [A02]
|
||||
add A02, A02, #64
|
||||
|
||||
st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B00] // destination bytes 0..63
|
||||
add TEMP1, B00, #64
|
||||
st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [TEMP1] // destination bytes 64..127
|
||||
add B00, B00, M8 // step B00 by the M8 stride, not by the 128 bytes just written
|
||||
.endm
|
||||
|
||||
// COPY8x2: copy eight 32-bit elements from each of two source pointers.
// Reads 32 bytes at A01 and A02 and appends them, in that order, as 64
// contiguous bytes at B01. A01/A02 advance by 32 bytes, B01 by 64 bytes.
.macro COPY8x2
|
||||
prfm PLDL1KEEP, [A01, #A_PREFETCH]
|
||||
prfm PLDL1KEEP, [A02, #A_PREFETCH]
|
||||
|
||||
ld1 {v0.4s, v1.4s}, [A01]
|
||||
ld1 {v2.4s, v3.4s}, [A02]
|
||||
add A01, A01, #32
|
||||
add A02, A02, #32
|
||||
|
||||
st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B01] // A01 data followed by A02 data
|
||||
add B01, B01, #64
|
||||
.endm
|
||||
|
||||
// COPY4x2: copy four 32-bit elements from each of two source pointers.
// Reads 16 bytes at A01 and A02 and appends them, in that order, as 32
// contiguous bytes at B02. A01/A02 advance by 16 bytes, B02 by 32 bytes.
// The prefetches are disabled (commented out) in this variant.
.macro COPY4x2
|
||||
//prfm PLDL1KEEP, [A01, #A_PREFETCH]
|
||||
//prfm PLDL1KEEP, [A02, #A_PREFETCH]
|
||||
|
||||
ldr q0, [A01]
|
||||
ldr q1, [A02]
|
||||
add A01, A01, #16
|
||||
add A02, A02, #16
|
||||
|
||||
stp q0, q1, [B02] // A01 data followed by A02 data
|
||||
add B02, B02, #32
|
||||
.endm
|
||||
|
||||
// COPY2x2: copy two 32-bit elements from each of two source pointers.
// Reads 8 bytes at A01 and A02 and appends them, in that order, as 16
// contiguous bytes at B03. A01/A02 advance by 8 bytes, B03 by 16 bytes.
// The prefetches are disabled (commented out) in this variant.
.macro COPY2x2
|
||||
//prfm PLDL1KEEP, [A01, #A_PREFETCH]
|
||||
//prfm PLDL1KEEP, [A02, #A_PREFETCH]
|
||||
|
||||
ldr d0, [A01]
|
||||
ldr d1, [A02]
|
||||
|
||||
add A01, A01, #8
|
||||
add A02, A02, #8
|
||||
|
||||
stp d0, d1, [B03] // A01 pair followed by A02 pair
|
||||
add B03, B03, #16
|
||||
.endm
|
||||
|
||||
// COPY1x2: copy one 32-bit element from each of two source pointers.
// Reads 4 bytes at A01 and A02 and appends them, in that order, as 8
// contiguous bytes at B04. A01/A02 advance by 4 bytes, B04 by 8 bytes.
// The prefetches are disabled (commented out) in this variant.
.macro COPY1x2
|
||||
//prfm PLDL1KEEP, [A01, #A_PREFETCH]
|
||||
//prfm PLDL1KEEP, [A02, #A_PREFETCH]
|
||||
|
||||
ldr s0, [A01]
|
||||
ldr s1, [A02]
|
||||
|
||||
add A01, A01, #4
|
||||
add A02, A02, #4
|
||||
|
||||
stp s0, s1, [B04] // A01 element followed by A02 element
|
||||
|
||||
add B04, B04, #8
|
||||
.endm
|
||||
|
||||
/*************************************************************************************************************************/
|
||||
|
||||
// COPY16x1: copy sixteen 32-bit elements (64 bytes) from A01 to B00.
// On exit A01 has advanced by 64 bytes and B00 has advanced by M8.
.macro COPY16x1
|
||||
prfm PLDL1KEEP, [A01, #A_PREFETCH]
|
||||
|
||||
ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [A01]
|
||||
add A01, A01, #64
|
||||
|
||||
st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B00]
|
||||
add B00, B00, M8 // step B00 by the M8 stride, not by the 64 bytes just written
|
||||
.endm
|
||||
|
||||
// COPY8x1: copy eight 32-bit elements (32 bytes) from A01 to B01.
// Both A01 and B01 advance by 32 bytes.
.macro COPY8x1
|
||||
prfm PLDL1KEEP, [A01, #A_PREFETCH]
|
||||
|
||||
ldp q0, q1, [A01]
|
||||
add A01, A01, #32
|
||||
stp q0, q1, [B01]
|
||||
|
||||
add B01, B01, #32
|
||||
.endm
|
||||
|
||||
// COPY4x1: copy four 32-bit elements (16 bytes) from A01 to B02.
// Both A01 and B02 advance by 16 bytes. The prefetch is disabled here.
.macro COPY4x1
|
||||
//prfm PLDL1KEEP, [A01, #A_PREFETCH]
|
||||
|
||||
ldr q0, [A01]
|
||||
add A01, A01, #16
|
||||
str q0, [B02]
|
||||
|
||||
add B02, B02, #16
|
||||
.endm
|
||||
|
||||
// COPY2x1: copy two 32-bit elements (8 bytes) from A01 to B03.
// Both A01 and B03 advance by 8 bytes. The prefetch is disabled here.
.macro COPY2x1
|
||||
//prfm PLDL1KEEP, [A01, #A_PREFETCH]
|
||||
|
||||
ldr d0, [A01]
|
||||
add A01, A01, #8
|
||||
str d0, [B03]
|
||||
|
||||
add B03, B03, #8
|
||||
.endm
|
||||
|
||||
// COPY1x1: copy a single 32-bit element (4 bytes) from A01 to B04.
// Both A01 and B04 advance by 4 bytes. The prefetch is disabled here.
.macro COPY1x1
|
||||
//prfm PLDL1KEEP, [A01, #A_PREFETCH]
|
||||
|
||||
ldr s0, [A01]
|
||||
add A01, A01, #4
|
||||
str s0, [B04]
|
||||
|
||||
add B04, B04, #4
|
||||
.endm
|
||||
|
||||
/**************************************************************************************
|
||||
* End of macro definitions
|
||||
**************************************************************************************/
|
||||
|
||||
// Routine entry: convert LDA to bytes and precompute the four secondary
// destination cursors B01..B04 plus the M8 stride used by the COPY16xN macros.
PROLOGUE
|
||||
|
||||
.align 5
|
||||
|
||||
SAVE_REGS
|
||||
|
||||
lsl LDA, LDA, #2 // LDA = LDA * SIZE
|
||||
|
||||
lsl TEMP1, M, #2 // TEMP1 = M * SIZE
|
||||
|
||||
and B01 , N , #-16 // N rounded down to a multiple of 16
|
||||
and B02 , N , #-8 // N rounded down to a multiple of 8
|
||||
and B03 , N , #-4 // N rounded down to a multiple of 4
|
||||
and B04 , N , #-2 // N rounded down to a multiple of 2
|
||||
|
||||
mul B01, B01, TEMP1 // byte offset = rounded N * M * SIZE
|
||||
mul B02, B02, TEMP1
|
||||
mul B03, B03, TEMP1
|
||||
mul B04, B04, TEMP1
|
||||
|
||||
add B01 , B01, B // B01: written by COPY8x4 / COPY8x2 / COPY8x1
|
||||
add B02 , B02, B // B02: written by COPY4x4 / COPY4x2 / COPY4x1
|
||||
add B03 , B03, B // B03: written by COPY2x4 / COPY2x2 / COPY2x1
|
||||
add B04 , B04, B // B04: written by COPY1x4 / COPY1x2 / COPY1x1
|
||||
|
||||
lsl M8, M, #6 // M8 = M * 16 * SIZE
|
||||
|
||||
// Main loop: J = M / 8 iterations, each handling eight source pointers
// A01..A08 spaced LDA bytes apart. Within an iteration, N is consumed in
// chunks of 16, then 8, 4, 2 and 1 according to its low bits.
// Note on the "tst ...; ble ..." pairs below: tst clears V and the masked
// result is never negative, so ble is taken exactly when the tested bits are 0.
.Lsgemm_tcopy_L8_BEGIN:
|
||||
asr J, M, #3 // J = M / 8
|
||||
cmp J, #0
|
||||
ble .Lsgemm_tcopy_L4_BEGIN
|
||||
|
||||
.align 5
|
||||
.Lsgemm_tcopy_L8_M16_BEGIN:
|
||||
|
||||
mov A01, A
|
||||
add A02, A01, LDA
|
||||
add A03, A02, LDA
|
||||
add A04, A03, LDA
|
||||
add A05, A04, LDA
|
||||
add A06, A05, LDA
|
||||
add A07, A06, LDA
|
||||
add A08, A07, LDA
|
||||
add A, A08, LDA // A now points past the eight lines just set up
|
||||
|
||||
mov B00, B
|
||||
add B, B00, #512 // B = B + 8 * 16 * SIZE
|
||||
|
||||
asr I, N, #4 // I = N / 16
|
||||
cmp I, #0
|
||||
ble .Lsgemm_tcopy_L8_M16_40
|
||||
|
||||
.align 5
|
||||
.Lsgemm_tcopy_L8_M16_20:
|
||||
|
||||
COPY16x8
|
||||
|
||||
subs I , I , #1
|
||||
bne .Lsgemm_tcopy_L8_M16_20
|
||||
|
||||
.Lsgemm_tcopy_L8_M16_40:
|
||||
tst N , #8 // is bit 3 of N set?
|
||||
ble .Lsgemm_tcopy_L8_M16_60 // skip when it is clear
|
||||
|
||||
COPY8x8
|
||||
|
||||
.Lsgemm_tcopy_L8_M16_60:
|
||||
tst N , #4 // is bit 2 of N set?
|
||||
ble .Lsgemm_tcopy_L8_M16_80
|
||||
|
||||
COPY4x8
|
||||
|
||||
.Lsgemm_tcopy_L8_M16_80:
|
||||
|
||||
tst N , #2 // is bit 1 of N set?
|
||||
ble .Lsgemm_tcopy_L8_M16_100
|
||||
|
||||
COPY2x8
|
||||
|
||||
.Lsgemm_tcopy_L8_M16_100:
|
||||
|
||||
tst N, #1 // is bit 0 of N set?
|
||||
ble .Lsgemm_tcopy_L8_M16_END
|
||||
|
||||
COPY1x8
|
||||
|
||||
.Lsgemm_tcopy_L8_M16_END:
|
||||
|
||||
subs J , J, #1 // j--
|
||||
bne .Lsgemm_tcopy_L8_M16_BEGIN
|
||||
|
||||
/*********************************************************************************************/
|
||||
|
||||
// Tail for M & 4: runs once over four source pointers A01..A04 when bit 2 of
// M is set. Exits the routine early when M has no remainder modulo 8.
// Note on the "tst ...; ble ..." pairs below: tst clears V and the masked
// result is never negative, so ble is taken exactly when the tested bits are 0.
.Lsgemm_tcopy_L4_BEGIN:
|
||||
tst M, #7 // any of the low three bits of M set?
|
||||
ble .Lsgemm_tcopy_L999 // none set: nothing left to copy
|
||||
|
||||
tst M, #4
|
||||
ble .Lsgemm_tcopy_L2_BEGIN
|
||||
|
||||
.Lsgemm_tcopy_L4_M16_BEGIN:
|
||||
|
||||
mov A01, A
|
||||
add A02, A01, LDA
|
||||
add A03, A02, LDA
|
||||
add A04, A03, LDA
|
||||
add A, A04, LDA // A now points past the four lines just set up
|
||||
|
||||
mov B00, B
|
||||
add B, B00, #256 // B = B + 4 * 16 * SIZE
|
||||
|
||||
asr I, N, #4 // I = N / 16
|
||||
cmp I, #0
|
||||
ble .Lsgemm_tcopy_L4_M16_40
|
||||
|
||||
.align 5
|
||||
.Lsgemm_tcopy_L4_M16_20:
|
||||
|
||||
COPY16x4
|
||||
|
||||
subs I , I , #1
|
||||
bne .Lsgemm_tcopy_L4_M16_20
|
||||
|
||||
.Lsgemm_tcopy_L4_M16_40:
|
||||
tst N , #8 // is bit 3 of N set?
|
||||
ble .Lsgemm_tcopy_L4_M16_60
|
||||
|
||||
COPY8x4
|
||||
|
||||
.Lsgemm_tcopy_L4_M16_60:
|
||||
tst N , #4 // is bit 2 of N set?
|
||||
ble .Lsgemm_tcopy_L4_M16_80
|
||||
|
||||
COPY4x4
|
||||
|
||||
.Lsgemm_tcopy_L4_M16_80:
|
||||
|
||||
tst N , #2 // is bit 1 of N set?
|
||||
ble .Lsgemm_tcopy_L4_M16_100
|
||||
|
||||
COPY2x4
|
||||
|
||||
|
||||
.Lsgemm_tcopy_L4_M16_100:
|
||||
|
||||
tst N, #1 // is bit 0 of N set?
|
||||
ble .Lsgemm_tcopy_L4_M16_END
|
||||
|
||||
COPY1x4
|
||||
|
||||
|
||||
.Lsgemm_tcopy_L4_M16_END:
|
||||
|
||||
/*********************************************************************************************/
|
||||
|
||||
// Tail for M & 2: runs once over two source pointers A01/A02 when bit 1 of
// M is set. Exits the routine early when M has no remainder modulo 4.
// Note on the "tst ...; ble ..." pairs below: tst clears V and the masked
// result is never negative, so ble is taken exactly when the tested bits are 0.
.Lsgemm_tcopy_L2_BEGIN:
|
||||
|
||||
tst M, #3 // any of the low two bits of M set?
|
||||
ble .Lsgemm_tcopy_L999 // none set: nothing left to copy
|
||||
|
||||
tst M, #2
|
||||
ble .Lsgemm_tcopy_L1_BEGIN
|
||||
|
||||
.Lsgemm_tcopy_L2_M16_BEGIN:
|
||||
mov A01, A
|
||||
add A02, A01, LDA
|
||||
add A, A02, LDA // A now points past the two lines just set up
|
||||
|
||||
mov B00, B
|
||||
add B, B00, #128 // B = B + 2 * 16 * SIZE
|
||||
|
||||
asr I, N, #4 // I = N / 16
|
||||
cmp I, #0
|
||||
ble .Lsgemm_tcopy_L2_M16_40
|
||||
|
||||
.align 5
|
||||
.Lsgemm_tcopy_L2_M16_20:
|
||||
|
||||
COPY16x2
|
||||
|
||||
subs I , I , #1
|
||||
bne .Lsgemm_tcopy_L2_M16_20
|
||||
|
||||
.Lsgemm_tcopy_L2_M16_40:
|
||||
tst N , #8 // is bit 3 of N set?
|
||||
ble .Lsgemm_tcopy_L2_M16_60
|
||||
|
||||
COPY8x2
|
||||
|
||||
.Lsgemm_tcopy_L2_M16_60:
|
||||
tst N , #4 // is bit 2 of N set?
|
||||
ble .Lsgemm_tcopy_L2_M16_80
|
||||
|
||||
COPY4x2
|
||||
|
||||
.Lsgemm_tcopy_L2_M16_80:
|
||||
|
||||
tst N , #2 // is bit 1 of N set?
|
||||
ble .Lsgemm_tcopy_L2_M16_100
|
||||
|
||||
COPY2x2
|
||||
|
||||
.Lsgemm_tcopy_L2_M16_100:
|
||||
|
||||
tst N , #1 // is bit 0 of N set?
|
||||
ble .Lsgemm_tcopy_L2_M16_END
|
||||
|
||||
COPY1x2
|
||||
|
||||
.Lsgemm_tcopy_L2_M16_END:
|
||||
|
||||
/*********************************************************************************************/
|
||||
|
||||
// Tail for M & 1: runs once over the single remaining source pointer A01 when
// bit 0 of M is set, then falls into the common exit, which returns 0 in x0.
// Note on the "tst ...; ble ..." pairs below: tst clears V and the masked
// result is never negative, so ble is taken exactly when the tested bit is 0.
.Lsgemm_tcopy_L1_BEGIN:
|
||||
|
||||
tst M, #1
|
||||
ble .Lsgemm_tcopy_L999
|
||||
|
||||
|
||||
.Lsgemm_tcopy_L1_M16_BEGIN:
|
||||
|
||||
mov A01, A // A01 = A
|
||||
mov B00, B
|
||||
|
||||
asr I, N, #4 // I = N / 16
|
||||
cmp I, #0
|
||||
ble .Lsgemm_tcopy_L1_M16_40
|
||||
|
||||
.align 5
|
||||
.Lsgemm_tcopy_L1_M16_20:
|
||||
|
||||
COPY16x1
|
||||
|
||||
subs I , I , #1
|
||||
bne .Lsgemm_tcopy_L1_M16_20
|
||||
|
||||
.Lsgemm_tcopy_L1_M16_40:
|
||||
tst N , #8 // is bit 3 of N set?
|
||||
ble .Lsgemm_tcopy_L1_M16_60
|
||||
|
||||
COPY8x1
|
||||
|
||||
.Lsgemm_tcopy_L1_M16_60:
|
||||
tst N , #4 // is bit 2 of N set?
|
||||
ble .Lsgemm_tcopy_L1_M16_80
|
||||
|
||||
COPY4x1
|
||||
|
||||
.Lsgemm_tcopy_L1_M16_80:
|
||||
|
||||
tst N , #2 // is bit 1 of N set?
|
||||
ble .Lsgemm_tcopy_L1_M16_100
|
||||
|
||||
COPY2x1
|
||||
|
||||
.Lsgemm_tcopy_L1_M16_100:
|
||||
|
||||
tst N , #1 // is bit 0 of N set?
|
||||
ble .Lsgemm_tcopy_L1_M16_END
|
||||
|
||||
COPY1x1
|
||||
|
||||
|
||||
.Lsgemm_tcopy_L1_M16_END:
|
||||
|
||||
.Lsgemm_tcopy_L999:
|
||||
mov x0, #0 // set return value
|
||||
RESTORE_REGS
|
||||
ret
|
||||
|
||||
EPILOGUE
|
||||
|
||||
|
|
@ -54,138 +54,138 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#if !defined(DOUBLE)
|
||||
ldr s4, [X], #4
|
||||
fcmp s4, REGZERO
|
||||
beq KERNEL_F1_NEXT_\@
|
||||
beq 2f /* KERNEL_F1_NEXT_\@ */
|
||||
fabs s4, s4
|
||||
fcmp SCALE, s4
|
||||
bge KERNEL_F1_SCALE_GE_XR_\@
|
||||
bge 1f /* KERNEL_F1_SCALE_GE_XR_\@ */
|
||||
fdiv s2, SCALE, s4
|
||||
fmul s2, s2, s2
|
||||
fmul s3, SSQ, s2
|
||||
fadd SSQ, REGONE, s3
|
||||
fmov SCALE, s4
|
||||
b KERNEL_F1_NEXT_\@
|
||||
KERNEL_F1_SCALE_GE_XR_\@:
|
||||
b 2f /* KERNEL_F1_NEXT_\@ */
|
||||
1: /* KERNEL_F1_SCALE_GE_XR_\@: */
|
||||
fdiv s2, s4, SCALE
|
||||
fmla SSQ, s2, v2.s[0]
|
||||
KERNEL_F1_NEXT_\@:
|
||||
2: /* KERNEL_F1_NEXT_\@: */
|
||||
ldr s5, [X], #4
|
||||
fcmp s5, REGZERO
|
||||
beq KERNEL_F1_END_\@
|
||||
beq 4f /* KERNEL_F1_END_\@ */
|
||||
fabs s5, s5
|
||||
fcmp SCALE, s5
|
||||
bge KERNEL_F1_SCALE_GE_XI_\@
|
||||
bge 3f /* KERNEL_F1_SCALE_GE_XI_\@ */
|
||||
fdiv s2, SCALE, s5
|
||||
fmul s2, s2, s2
|
||||
fmul s3, SSQ, s2
|
||||
fadd SSQ, REGONE, s3
|
||||
fmov SCALE, s5
|
||||
b KERNEL_F1_END_\@
|
||||
KERNEL_F1_SCALE_GE_XI_\@:
|
||||
b 4f /* KERNEL_F1_END_\@ */
|
||||
3: /* KERNEL_F1_SCALE_GE_XI_\@: */
|
||||
fdiv s2, s5, SCALE
|
||||
fmla SSQ, s2, v2.s[0]
|
||||
#else
|
||||
ldr d4, [X], #8
|
||||
fcmp d4, REGZERO
|
||||
beq KERNEL_F1_NEXT_\@
|
||||
beq 2f /* KERNEL_F1_NEXT_\@ */
|
||||
fabs d4, d4
|
||||
fcmp SCALE, d4
|
||||
bge KERNEL_F1_SCALE_GE_XR_\@
|
||||
bge 1f /* KERNEL_F1_SCALE_GE_XR_\@ */
|
||||
fdiv d2, SCALE, d4
|
||||
fmul d2, d2, d2
|
||||
fmul d3, SSQ, d2
|
||||
fadd SSQ, REGONE, d3
|
||||
fmov SCALE, d4
|
||||
b KERNEL_F1_NEXT_\@
|
||||
KERNEL_F1_SCALE_GE_XR_\@:
|
||||
b 2f /* KERNEL_F1_NEXT_\@ */
|
||||
1: /* KERNEL_F1_SCALE_GE_XR_\@: */
|
||||
fdiv d2, d4, SCALE
|
||||
fmla SSQ, d2, v2.d[0]
|
||||
KERNEL_F1_NEXT_\@:
|
||||
2: /* KERNEL_F1_NEXT_\@: */
|
||||
ldr d5, [X], #8
|
||||
fcmp d5, REGZERO
|
||||
beq KERNEL_F1_END_\@
|
||||
beq 4f /* KERNEL_F1_END_\@ */
|
||||
fabs d5, d5
|
||||
fcmp SCALE, d5
|
||||
bge KERNEL_F1_SCALE_GE_XI_\@
|
||||
bge 3f /* KERNEL_F1_SCALE_GE_XI_\@ */
|
||||
fdiv d2, SCALE, d5
|
||||
fmul d2, d2, d2
|
||||
fmul d3, SSQ, d2
|
||||
fadd SSQ, REGONE, d3
|
||||
fmov SCALE, d5
|
||||
b KERNEL_F1_END_\@
|
||||
KERNEL_F1_SCALE_GE_XI_\@:
|
||||
b 4f /* KERNEL_F1_END_\@ */
|
||||
3: /* KERNEL_F1_SCALE_GE_XI_\@: */
|
||||
fdiv d2, d5, SCALE
|
||||
fmla SSQ, d2, v2.d[0]
|
||||
#endif
|
||||
KERNEL_F1_END_\@:
|
||||
4: /* KERNEL_F1_END_\@: */
|
||||
.endm
|
||||
|
||||
.macro KERNEL_S1
|
||||
#if !defined(DOUBLE)
|
||||
ldr s4, [X]
|
||||
fcmp s4, REGZERO
|
||||
beq KERNEL_S1_NEXT_\@
|
||||
beq KERNEL_S1_NEXT
|
||||
fabs s4, s4
|
||||
fcmp SCALE, s4
|
||||
bge KERNEL_S1_SCALE_GE_XR_\@
|
||||
bge KERNEL_S1_SCALE_GE_XR
|
||||
fdiv s2, SCALE, s4
|
||||
fmul s2, s2, s2
|
||||
fmul s3, SSQ, s2
|
||||
fadd SSQ, REGONE, s3
|
||||
fmov SCALE, s4
|
||||
b KERNEL_S1_NEXT_\@
|
||||
KERNEL_S1_SCALE_GE_XR_\@:
|
||||
b KERNEL_S1_NEXT
|
||||
KERNEL_S1_SCALE_GE_XR:
|
||||
fdiv s2, s4, SCALE
|
||||
fmla SSQ, s2, v2.s[0]
|
||||
KERNEL_S1_NEXT_\@:
|
||||
KERNEL_S1_NEXT:
|
||||
ldr s5, [X, #4]
|
||||
fcmp s5, REGZERO
|
||||
beq KERNEL_S1_END_\@
|
||||
beq KERNEL_S1_END
|
||||
fabs s5, s5
|
||||
fcmp SCALE, s5
|
||||
bge KERNEL_S1_SCALE_GE_XI_\@
|
||||
bge KERNEL_S1_SCALE_GE_XI
|
||||
fdiv s2, SCALE, s5
|
||||
fmul s2, s2, s2
|
||||
fmul s3, SSQ, s2
|
||||
fadd SSQ, REGONE, s3
|
||||
fmov SCALE, s5
|
||||
b KERNEL_S1_END_\@
|
||||
KERNEL_S1_SCALE_GE_XI_\@:
|
||||
b KERNEL_S1_END
|
||||
KERNEL_S1_SCALE_GE_XI:
|
||||
fdiv s2, s5, SCALE
|
||||
fmla SSQ, s2, v2.s[0]
|
||||
#else
|
||||
ldr d4, [X]
|
||||
fcmp d4, REGZERO
|
||||
beq KERNEL_S1_NEXT_\@
|
||||
beq KERNEL_S1_NEXT
|
||||
fabs d4, d4
|
||||
fcmp SCALE, d4
|
||||
bge KERNEL_S1_SCALE_GE_XR_\@
|
||||
bge KERNEL_S1_SCALE_GE_XR
|
||||
fdiv d2, SCALE, d4
|
||||
fmul d2, d2, d2
|
||||
fmul d3, SSQ, d2
|
||||
fadd SSQ, REGONE, d3
|
||||
fmov SCALE, d4
|
||||
b KERNEL_S1_NEXT_\@
|
||||
KERNEL_S1_SCALE_GE_XR_\@:
|
||||
b KERNEL_S1_NEXT
|
||||
KERNEL_S1_SCALE_GE_XR:
|
||||
fdiv d2, d4, SCALE
|
||||
fmla SSQ, d2, v2.d[0]
|
||||
KERNEL_S1_NEXT_\@:
|
||||
KERNEL_S1_NEXT:
|
||||
ldr d5, [X, #8]
|
||||
fcmp d5, REGZERO
|
||||
beq KERNEL_S1_END_\@
|
||||
beq KERNEL_S1_END
|
||||
fabs d5, d5
|
||||
fcmp SCALE, d5
|
||||
bge KERNEL_S1_SCALE_GE_XI_\@
|
||||
bge KERNEL_S1_SCALE_GE_XI
|
||||
fdiv d2, SCALE, d5
|
||||
fmul d2, d2, d2
|
||||
fmul d3, SSQ, d2
|
||||
fadd SSQ, REGONE, d3
|
||||
fmov SCALE, d5
|
||||
b KERNEL_S1_END_\@
|
||||
KERNEL_S1_SCALE_GE_XI_\@:
|
||||
b KERNEL_S1_END
|
||||
KERNEL_S1_SCALE_GE_XI:
|
||||
fdiv d2, d5, SCALE
|
||||
fmla SSQ, d2, v2.d[0]
|
||||
#endif
|
||||
KERNEL_S1_END_\@:
|
||||
KERNEL_S1_END:
|
||||
add X, X, INC_X
|
||||
.endm
|
||||
|
||||
|
|
|
@ -42,101 +42,53 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta,
|
|||
FLOAT *dummy2, BLASLONG dummy3, FLOAT *dummy4, BLASLONG dummy5,
|
||||
FLOAT *c, BLASLONG ldc){
|
||||
|
||||
|
||||
BLASLONG i, j;
|
||||
BLASLONG chunk, remain;
|
||||
FLOAT *c_offset1, *c_offset;
|
||||
FLOAT ctemp1, ctemp2, ctemp3, ctemp4;
|
||||
FLOAT ctemp5, ctemp6, ctemp7, ctemp8;
|
||||
|
||||
c_offset = c;
|
||||
|
||||
chunk = m >> 3;
|
||||
remain = m & 7;
|
||||
if (beta == ZERO){
|
||||
|
||||
j = n;
|
||||
do {
|
||||
c_offset1 = c_offset;
|
||||
c_offset += ldc;
|
||||
|
||||
i = (m >> 3);
|
||||
if (i > 0){
|
||||
do {
|
||||
*(c_offset1 + 0) = ZERO;
|
||||
*(c_offset1 + 1) = ZERO;
|
||||
*(c_offset1 + 2) = ZERO;
|
||||
*(c_offset1 + 3) = ZERO;
|
||||
*(c_offset1 + 4) = ZERO;
|
||||
*(c_offset1 + 5) = ZERO;
|
||||
*(c_offset1 + 6) = ZERO;
|
||||
*(c_offset1 + 7) = ZERO;
|
||||
c_offset1 += 8;
|
||||
i --;
|
||||
} while (i > 0);
|
||||
}
|
||||
|
||||
i = (m & 7);
|
||||
if (i > 0){
|
||||
do {
|
||||
*c_offset1 = ZERO;
|
||||
c_offset1 ++;
|
||||
i --;
|
||||
} while (i > 0);
|
||||
}
|
||||
j --;
|
||||
} while (j > 0);
|
||||
|
||||
for(j=n; j>0; j--){
|
||||
c_offset1 = c_offset;
|
||||
c_offset += ldc;
|
||||
for(i=chunk; i>0; i--){
|
||||
*(c_offset1 + 0) = ZERO;
|
||||
*(c_offset1 + 1) = ZERO;
|
||||
*(c_offset1 + 2) = ZERO;
|
||||
*(c_offset1 + 3) = ZERO;
|
||||
*(c_offset1 + 4) = ZERO;
|
||||
*(c_offset1 + 5) = ZERO;
|
||||
*(c_offset1 + 6) = ZERO;
|
||||
*(c_offset1 + 7) = ZERO;
|
||||
c_offset1 += 8;
|
||||
}
|
||||
for(i=remain; i>0; i--){
|
||||
*c_offset1 = ZERO;
|
||||
c_offset1 ++;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
|
||||
j = n;
|
||||
do {
|
||||
c_offset1 = c_offset;
|
||||
c_offset += ldc;
|
||||
|
||||
i = (m >> 3);
|
||||
if (i > 0){
|
||||
do {
|
||||
ctemp1 = *(c_offset1 + 0);
|
||||
ctemp2 = *(c_offset1 + 1);
|
||||
ctemp3 = *(c_offset1 + 2);
|
||||
ctemp4 = *(c_offset1 + 3);
|
||||
ctemp5 = *(c_offset1 + 4);
|
||||
ctemp6 = *(c_offset1 + 5);
|
||||
ctemp7 = *(c_offset1 + 6);
|
||||
ctemp8 = *(c_offset1 + 7);
|
||||
|
||||
ctemp1 *= beta;
|
||||
ctemp2 *= beta;
|
||||
ctemp3 *= beta;
|
||||
ctemp4 *= beta;
|
||||
ctemp5 *= beta;
|
||||
ctemp6 *= beta;
|
||||
ctemp7 *= beta;
|
||||
ctemp8 *= beta;
|
||||
|
||||
*(c_offset1 + 0) = ctemp1;
|
||||
*(c_offset1 + 1) = ctemp2;
|
||||
*(c_offset1 + 2) = ctemp3;
|
||||
*(c_offset1 + 3) = ctemp4;
|
||||
*(c_offset1 + 4) = ctemp5;
|
||||
*(c_offset1 + 5) = ctemp6;
|
||||
*(c_offset1 + 6) = ctemp7;
|
||||
*(c_offset1 + 7) = ctemp8;
|
||||
c_offset1 += 8;
|
||||
i --;
|
||||
} while (i > 0);
|
||||
}
|
||||
|
||||
i = (m & 7);
|
||||
if (i > 0){
|
||||
do {
|
||||
ctemp1 = *c_offset1;
|
||||
ctemp1 *= beta;
|
||||
*c_offset1 = ctemp1;
|
||||
c_offset1 ++;
|
||||
i --;
|
||||
} while (i > 0);
|
||||
}
|
||||
j --;
|
||||
} while (j > 0);
|
||||
|
||||
for(j=n; j>0; j--){
|
||||
c_offset1 = c_offset;
|
||||
c_offset += ldc;
|
||||
for(i=chunk; i>0; i--){
|
||||
*(c_offset1 + 0) *= beta;
|
||||
*(c_offset1 + 1) *= beta;
|
||||
*(c_offset1 + 2) *= beta;
|
||||
*(c_offset1 + 3) *= beta;
|
||||
*(c_offset1 + 4) *= beta;
|
||||
*(c_offset1 + 5) *= beta;
|
||||
*(c_offset1 + 6) *= beta;
|
||||
*(c_offset1 + 7) *= beta;
|
||||
c_offset1 += 8;
|
||||
}
|
||||
for(i=remain; i>0; i--){
|
||||
*c_offset1 *= beta;
|
||||
c_offset1 ++;
|
||||
}
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
};
|
||||
|
|
|
@ -89,14 +89,30 @@ ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
|||
#SMINKERNEL = ../arm/min.c
|
||||
#DMINKERNEL = ../arm/min.c
|
||||
#
|
||||
ifneq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__))
|
||||
ISAMAXKERNEL = isamax_power8.S
|
||||
else
|
||||
ISAMAXKERNEL = isamax.c
|
||||
endif
|
||||
IDAMAXKERNEL = idamax.c
|
||||
ifneq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__))
|
||||
ICAMAXKERNEL = icamax_power8.S
|
||||
else
|
||||
ICAMAXKERNEL = icamax.c
|
||||
endif
|
||||
IZAMAXKERNEL = izamax.c
|
||||
#
|
||||
ifneq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__))
|
||||
ISAMINKERNEL = isamin_power8.S
|
||||
else
|
||||
ISAMINKERNEL = isamin.c
|
||||
endif
|
||||
IDAMINKERNEL = idamin.c
|
||||
ifneq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__))
|
||||
ICAMINKERNEL = icamin_power8.S
|
||||
else
|
||||
ICAMINKERNEL = icamin.c
|
||||
endif
|
||||
IZAMINKERNEL = izamin.c
|
||||
#
|
||||
#ISMAXKERNEL = ../arm/imax.c
|
||||
|
@ -112,7 +128,11 @@ ZASUMKERNEL = zasum.c
|
|||
#
|
||||
SAXPYKERNEL = saxpy.c
|
||||
DAXPYKERNEL = daxpy.c
|
||||
ifneq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__))
|
||||
CAXPYKERNEL = caxpy_power8.S
|
||||
else
|
||||
CAXPYKERNEL = caxpy.c
|
||||
endif
|
||||
ZAXPYKERNEL = zaxpy.c
|
||||
#
|
||||
SCOPYKERNEL = scopy.c
|
||||
|
|
|
@ -1,184 +1,208 @@
|
|||
#SGEMM_BETA = ../generic/gemm_beta.c
|
||||
#DGEMM_BETA = ../generic/gemm_beta.c
|
||||
#CGEMM_BETA = ../generic/zgemm_beta.c
|
||||
#ZGEMM_BETA = ../generic/zgemm_beta.c
|
||||
|
||||
STRMMKERNEL = sgemm_kernel_power9.S
|
||||
DTRMMKERNEL = dgemm_kernel_power9.S
|
||||
CTRMMKERNEL = cgemm_kernel_power9.S
|
||||
ZTRMMKERNEL = zgemm_kernel_power9.S
|
||||
|
||||
SGEMMKERNEL = sgemm_kernel_power9.S
|
||||
SGEMMINCOPY = ../generic/gemm_ncopy_16.c
|
||||
SGEMMITCOPY = sgemm_tcopy_16_power8.S
|
||||
SGEMMONCOPY = ../generic/gemm_ncopy_8.c
|
||||
SGEMMOTCOPY = sgemm_tcopy_8_power8.S
|
||||
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
DGEMMKERNEL = dgemm_kernel_power9.S
|
||||
DGEMMINCOPY = ../generic/gemm_ncopy_16.c
|
||||
DGEMMITCOPY = dgemm_tcopy_16_power8.S
|
||||
DGEMMONCOPY = dgemm_ncopy_4_power8.S
|
||||
DGEMMOTCOPY = ../generic/gemm_tcopy_4.c
|
||||
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
CGEMMKERNEL = cgemm_kernel_power9.S
|
||||
CGEMMINCOPY = ../generic/zgemm_ncopy_8.c
|
||||
CGEMMITCOPY = ../generic/zgemm_tcopy_8.c
|
||||
CGEMMONCOPY = ../generic/zgemm_ncopy_4.c
|
||||
CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c
|
||||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
ZGEMMKERNEL = zgemm_kernel_power9.S
|
||||
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
||||
ZGEMMINCOPY = ../generic/zgemm_ncopy_8.c
|
||||
ZGEMMITCOPY = zgemm_tcopy_8_power8.S
|
||||
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
DTRSMKERNEL_LT = dtrsm_kernel_LT_16x4_power8.S
|
||||
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
#Todo: CGEMM3MKERNEL should be 4x4 blocksizes.
|
||||
#CGEMM3MKERNEL = zgemm3m_kernel_8x4_sse3.S
|
||||
#ZGEMM3MKERNEL = zgemm3m_kernel_4x4_sse3.S
|
||||
|
||||
#Pure C for other kernels
|
||||
#SAMAXKERNEL = ../arm/amax.c
|
||||
#DAMAXKERNEL = ../arm/amax.c
|
||||
#CAMAXKERNEL = ../arm/zamax.c
|
||||
#ZAMAXKERNEL = ../arm/zamax.c
|
||||
#
|
||||
#SAMINKERNEL = ../arm/amin.c
|
||||
#DAMINKERNEL = ../arm/amin.c
|
||||
#CAMINKERNEL = ../arm/zamin.c
|
||||
#ZAMINKERNEL = ../arm/zamin.c
|
||||
#
|
||||
#SMAXKERNEL = ../arm/max.c
|
||||
#DMAXKERNEL = ../arm/max.c
|
||||
#
|
||||
#SMINKERNEL = ../arm/min.c
|
||||
#DMINKERNEL = ../arm/min.c
|
||||
#
|
||||
ISAMAXKERNEL = isamax.c
|
||||
IDAMAXKERNEL = idamax.c
|
||||
ICAMAXKERNEL = icamax.c
|
||||
IZAMAXKERNEL = izamax.c
|
||||
#
|
||||
ISAMINKERNEL = isamin.c
|
||||
IDAMINKERNEL = idamin.c
|
||||
ICAMINKERNEL = icamin.c
|
||||
IZAMINKERNEL = izamin.c
|
||||
#
|
||||
#ISMAXKERNEL = ../arm/imax.c
|
||||
#IDMAXKERNEL = ../arm/imax.c
|
||||
#
|
||||
#ISMINKERNEL = ../arm/imin.c
|
||||
#IDMINKERNEL = ../arm/imin.c
|
||||
#
|
||||
SASUMKERNEL = sasum.c
|
||||
DASUMKERNEL = dasum.c
|
||||
CASUMKERNEL = casum.c
|
||||
ZASUMKERNEL = zasum.c
|
||||
#
|
||||
SAXPYKERNEL = saxpy.c
|
||||
DAXPYKERNEL = daxpy.c
|
||||
CAXPYKERNEL = caxpy.c
|
||||
ZAXPYKERNEL = zaxpy.c
|
||||
#
|
||||
SCOPYKERNEL = scopy.c
|
||||
DCOPYKERNEL = dcopy.c
|
||||
CCOPYKERNEL = ccopy.c
|
||||
ZCOPYKERNEL = zcopy.c
|
||||
#
|
||||
SDOTKERNEL = sdot.c
|
||||
DDOTKERNEL = ddot.c
|
||||
DSDOTKERNEL = sdot.c
|
||||
CDOTKERNEL = cdot.c
|
||||
ZDOTKERNEL = zdot.c
|
||||
#
|
||||
SNRM2KERNEL = ../arm/nrm2.c
|
||||
DNRM2KERNEL = ../arm/nrm2.c
|
||||
CNRM2KERNEL = ../arm/znrm2.c
|
||||
ZNRM2KERNEL = ../arm/znrm2.c
|
||||
#
|
||||
SROTKERNEL = srot.c
|
||||
DROTKERNEL = drot.c
|
||||
CROTKERNEL = crot.c
|
||||
ZROTKERNEL = zrot.c
|
||||
#
|
||||
SSCALKERNEL = sscal.c
|
||||
DSCALKERNEL = dscal.c
|
||||
CSCALKERNEL = zscal.c
|
||||
ZSCALKERNEL = zscal.c
|
||||
#
|
||||
SSWAPKERNEL = sswap.c
|
||||
DSWAPKERNEL = dswap.c
|
||||
CSWAPKERNEL = cswap.c
|
||||
ZSWAPKERNEL = zswap.c
|
||||
#
|
||||
|
||||
SGEMVNKERNEL = sgemv_n.c
|
||||
DGEMVNKERNEL = dgemv_n.c
|
||||
CGEMVNKERNEL = cgemv_n.c
|
||||
ZGEMVNKERNEL = zgemv_n_4.c
|
||||
#
|
||||
SGEMVTKERNEL = sgemv_t.c
|
||||
DGEMVTKERNEL = dgemv_t.c
|
||||
CGEMVTKERNEL = cgemv_t.c
|
||||
ZGEMVTKERNEL = zgemv_t_4.c
|
||||
|
||||
|
||||
#SSYMV_U_KERNEL = ../generic/symv_k.c
|
||||
#SSYMV_L_KERNEL = ../generic/symv_k.c
|
||||
#DSYMV_U_KERNEL = ../generic/symv_k.c
|
||||
#DSYMV_L_KERNEL = ../generic/symv_k.c
|
||||
#QSYMV_U_KERNEL = ../generic/symv_k.c
|
||||
#QSYMV_L_KERNEL = ../generic/symv_k.c
|
||||
#CSYMV_U_KERNEL = ../generic/zsymv_k.c
|
||||
#CSYMV_L_KERNEL = ../generic/zsymv_k.c
|
||||
#ZSYMV_U_KERNEL = ../generic/zsymv_k.c
|
||||
#ZSYMV_L_KERNEL = ../generic/zsymv_k.c
|
||||
#XSYMV_U_KERNEL = ../generic/zsymv_k.c
|
||||
#XSYMV_L_KERNEL = ../generic/zsymv_k.c
|
||||
|
||||
#ZHEMV_U_KERNEL = ../generic/zhemv_k.c
|
||||
#ZHEMV_L_KERNEL = ../generic/zhemv_k.c
|
||||
|
||||
LSAME_KERNEL = ../generic/lsame.c
|
||||
SCABS_KERNEL = ../generic/cabs.c
|
||||
DCABS_KERNEL = ../generic/cabs.c
|
||||
QCABS_KERNEL = ../generic/cabs.c
|
||||
|
||||
#Dump kernel
|
||||
CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c
|
||||
ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c
|
||||
#SGEMM_BETA = ../generic/gemm_beta.c
|
||||
#DGEMM_BETA = ../generic/gemm_beta.c
|
||||
#CGEMM_BETA = ../generic/zgemm_beta.c
|
||||
#ZGEMM_BETA = ../generic/zgemm_beta.c
|
||||
|
||||
STRMMKERNEL = sgemm_kernel_power9.S
|
||||
DTRMMKERNEL = dgemm_kernel_power9.S
|
||||
CTRMMKERNEL = cgemm_kernel_power9.S
|
||||
ZTRMMKERNEL = zgemm_kernel_power9.S
|
||||
|
||||
SGEMMKERNEL = sgemm_kernel_power9.S
|
||||
SGEMMINCOPY = ../generic/gemm_ncopy_16.c
|
||||
SGEMMITCOPY = sgemm_tcopy_16_power8.S
|
||||
SGEMMONCOPY = ../generic/gemm_ncopy_8.c
|
||||
SGEMMOTCOPY = sgemm_tcopy_8_power8.S
|
||||
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
DGEMMKERNEL = dgemm_kernel_power9.S
|
||||
DGEMMINCOPY = ../generic/gemm_ncopy_16.c
|
||||
DGEMMITCOPY = dgemm_tcopy_16_power8.S
|
||||
DGEMMONCOPY = dgemm_ncopy_4_power8.S
|
||||
DGEMMOTCOPY = ../generic/gemm_tcopy_4.c
|
||||
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
CGEMMKERNEL = cgemm_kernel_power9.S
|
||||
CGEMMINCOPY = ../generic/zgemm_ncopy_8.c
|
||||
CGEMMITCOPY = ../generic/zgemm_tcopy_8.c
|
||||
CGEMMONCOPY = ../generic/zgemm_ncopy_4.c
|
||||
CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c
|
||||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
ZGEMMKERNEL = zgemm_kernel_power9.S
|
||||
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
||||
ZGEMMINCOPY = ../generic/zgemm_ncopy_8.c
|
||||
ZGEMMITCOPY = zgemm_tcopy_8_power8.S
|
||||
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
DTRSMKERNEL_LT = dtrsm_kernel_LT_16x4_power8.S
|
||||
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
#Todo: CGEMM3MKERNEL should be 4x4 blocksizes.
|
||||
#CGEMM3MKERNEL = zgemm3m_kernel_8x4_sse3.S
|
||||
#ZGEMM3MKERNEL = zgemm3m_kernel_4x4_sse3.S
|
||||
|
||||
#Pure C for other kernels
|
||||
#SAMAXKERNEL = ../arm/amax.c
|
||||
#DAMAXKERNEL = ../arm/amax.c
|
||||
#CAMAXKERNEL = ../arm/zamax.c
|
||||
#ZAMAXKERNEL = ../arm/zamax.c
|
||||
#
|
||||
#SAMINKERNEL = ../arm/amin.c
|
||||
#DAMINKERNEL = ../arm/amin.c
|
||||
#CAMINKERNEL = ../arm/zamin.c
|
||||
#ZAMINKERNEL = ../arm/zamin.c
|
||||
#
|
||||
#SMAXKERNEL = ../arm/max.c
|
||||
#DMAXKERNEL = ../arm/max.c
|
||||
#
|
||||
#SMINKERNEL = ../arm/min.c
|
||||
#DMINKERNEL = ../arm/min.c
|
||||
#
|
||||
ifneq ($(GCCVERSIONGTEQ9),1)
|
||||
ISAMAXKERNEL = isamax_power9.S
|
||||
else
|
||||
ISAMAXKERNEL = isamax.c
|
||||
endif
|
||||
IDAMAXKERNEL = idamax.c
|
||||
ifneq ($(GCCVERSIONGTEQ9),1)
|
||||
ICAMAXKERNEL = icamax_power9.S
|
||||
else
|
||||
ICAMAXKERNEL = icamax.c
|
||||
endif
|
||||
IZAMAXKERNEL = izamax.c
|
||||
#
|
||||
ifneq ($(GCCVERSIONGTEQ9),1)
|
||||
ISAMINKERNEL = isamin_power9.S
|
||||
else
|
||||
ISAMINKERNEL = isamin.c
|
||||
endif
|
||||
IDAMINKERNEL = idamin.c
|
||||
ifneq ($(GCCVERSIONGTEQ9),1)
|
||||
ICAMINKERNEL = icamin_power9.S
|
||||
else
|
||||
ICAMINKERNEL = icamin.c
|
||||
endif
|
||||
IZAMINKERNEL = izamin.c
|
||||
#
|
||||
#ISMAXKERNEL = ../arm/imax.c
|
||||
#IDMAXKERNEL = ../arm/imax.c
|
||||
#
|
||||
#ISMINKERNEL = ../arm/imin.c
|
||||
#IDMINKERNEL = ../arm/imin.c
|
||||
#
|
||||
SASUMKERNEL = sasum.c
|
||||
DASUMKERNEL = dasum.c
|
||||
CASUMKERNEL = casum.c
|
||||
ZASUMKERNEL = zasum.c
|
||||
#
|
||||
SAXPYKERNEL = saxpy.c
|
||||
DAXPYKERNEL = daxpy.c
|
||||
ifneq ($(GCCVERSIONGTEQ9),1)
|
||||
CAXPYKERNEL = caxpy_power9.S
|
||||
else
|
||||
CAXPYKERNEL = caxpy.c
|
||||
endif
|
||||
ZAXPYKERNEL = zaxpy.c
|
||||
#
|
||||
SCOPYKERNEL = scopy.c
|
||||
DCOPYKERNEL = dcopy.c
|
||||
CCOPYKERNEL = ccopy.c
|
||||
ZCOPYKERNEL = zcopy.c
|
||||
#
|
||||
SDOTKERNEL = sdot.c
|
||||
DDOTKERNEL = ddot.c
|
||||
DSDOTKERNEL = sdot.c
|
||||
ifneq ($(GCCVERSIONGTEQ9),1)
|
||||
CDOTKERNEL = cdot_power9.S
|
||||
else
|
||||
CDOTKERNEL = cdot.c
|
||||
endif
|
||||
ZDOTKERNEL = zdot.c
|
||||
#
|
||||
SNRM2KERNEL = ../arm/nrm2.c
|
||||
DNRM2KERNEL = ../arm/nrm2.c
|
||||
CNRM2KERNEL = ../arm/znrm2.c
|
||||
ZNRM2KERNEL = ../arm/znrm2.c
|
||||
#
|
||||
SROTKERNEL = srot.c
|
||||
DROTKERNEL = drot.c
|
||||
CROTKERNEL = crot.c
|
||||
ZROTKERNEL = zrot.c
|
||||
#
|
||||
SSCALKERNEL = sscal.c
|
||||
DSCALKERNEL = dscal.c
|
||||
CSCALKERNEL = zscal.c
|
||||
ZSCALKERNEL = zscal.c
|
||||
#
|
||||
SSWAPKERNEL = sswap.c
|
||||
DSWAPKERNEL = dswap.c
|
||||
CSWAPKERNEL = cswap.c
|
||||
ZSWAPKERNEL = zswap.c
|
||||
#
|
||||
|
||||
SGEMVNKERNEL = sgemv_n.c
|
||||
DGEMVNKERNEL = dgemv_n.c
|
||||
CGEMVNKERNEL = cgemv_n.c
|
||||
ZGEMVNKERNEL = zgemv_n_4.c
|
||||
#
|
||||
SGEMVTKERNEL = sgemv_t.c
|
||||
DGEMVTKERNEL = dgemv_t.c
|
||||
CGEMVTKERNEL = cgemv_t.c
|
||||
ZGEMVTKERNEL = zgemv_t_4.c
|
||||
|
||||
|
||||
#SSYMV_U_KERNEL = ../generic/symv_k.c
|
||||
#SSYMV_L_KERNEL = ../generic/symv_k.c
|
||||
#DSYMV_U_KERNEL = ../generic/symv_k.c
|
||||
#DSYMV_L_KERNEL = ../generic/symv_k.c
|
||||
#QSYMV_U_KERNEL = ../generic/symv_k.c
|
||||
#QSYMV_L_KERNEL = ../generic/symv_k.c
|
||||
#CSYMV_U_KERNEL = ../generic/zsymv_k.c
|
||||
#CSYMV_L_KERNEL = ../generic/zsymv_k.c
|
||||
#ZSYMV_U_KERNEL = ../generic/zsymv_k.c
|
||||
#ZSYMV_L_KERNEL = ../generic/zsymv_k.c
|
||||
#XSYMV_U_KERNEL = ../generic/zsymv_k.c
|
||||
#XSYMV_L_KERNEL = ../generic/zsymv_k.c
|
||||
|
||||
#ZHEMV_U_KERNEL = ../generic/zhemv_k.c
|
||||
#ZHEMV_L_KERNEL = ../generic/zhemv_k.c
|
||||
|
||||
LSAME_KERNEL = ../generic/lsame.c
|
||||
SCABS_KERNEL = ../generic/cabs.c
|
||||
DCABS_KERNEL = ../generic/cabs.c
|
||||
QCABS_KERNEL = ../generic/cabs.c
|
||||
|
||||
#Dump kernel
|
||||
CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c
|
||||
ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c
|
||||
|
|
|
@ -15,13 +15,23 @@ ZASUMKERNEL = zasum_ppc440.S
|
|||
|
||||
SAXPYKERNEL = axpy_ppc440.S
|
||||
DAXPYKERNEL = axpy_ppc440.S
|
||||
ifneq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__))
|
||||
CAXPYKERNEL = ../arm/zaxpy.c
|
||||
ZAXPYKERNEL = ../arm/zaxpy.c
|
||||
else
|
||||
CAXPYKERNEL = zaxpy_ppc440.S
|
||||
ZAXPYKERNEL = zaxpy_ppc440.S
|
||||
endif
|
||||
|
||||
SDOTKERNEL = dot_ppc440.S
|
||||
DDOTKERNEL = dot_ppc440.S
|
||||
ifneq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__))
|
||||
CDOTKERNEL = zdot_ppc440.S
|
||||
ZDOTKERNEL = zdot_ppc440.S
|
||||
else
|
||||
CDOTKERNEL = ../arm/zdot.c
|
||||
ZDOTKERNEL = ../arm/zdot.c
|
||||
endif
|
||||
|
||||
ISAMAXKERNEL = iamax_ppc440.S
|
||||
IDAMAXKERNEL = iamax_ppc440.S
|
||||
|
@ -52,8 +62,13 @@ ZNRM2KERNEL = znrm2_ppc440.S
|
|||
|
||||
SROTKERNEL = rot_ppc440.S
|
||||
DROTKERNEL = rot_ppc440.S
|
||||
ifneq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__))
|
||||
CROTKERNEL = zrot_ppc440.S
|
||||
ZROTKERNEL = zrot_ppc440.S
|
||||
else
|
||||
CROTKERNEL = ../arm/zrot.c
|
||||
ZROTKERNEL = ../arm/zrot.c
|
||||
endif
|
||||
|
||||
SSCALKERNEL = scal_ppc440.S
|
||||
DSCALKERNEL = scal_ppc440.S
|
||||
|
@ -116,3 +131,15 @@ ZTRSMKERNEL_LN = ztrsm_kernel_ppc440_LN.S
|
|||
ZTRSMKERNEL_LT = ztrsm_kernel_ppc440_LT.S
|
||||
ZTRSMKERNEL_RN = ztrsm_kernel_ppc440_LT.S
|
||||
ZTRSMKERNEL_RT = ztrsm_kernel_ppc440_RT.S
|
||||
|
||||
ifeq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__))
|
||||
SGEMVNKERNEL = ../arm/gemv_n.c
|
||||
DGEMVNKERNEL = ../arm/gemv_n.c
|
||||
SGEMVTKERNEL = ../arm/gemv_t.c
|
||||
DGEMVTKERNEL = ../arm/gemv_t.c
|
||||
CGEMVNKERNEL = ../arm/zgemv_n.c
|
||||
ZGEMVNKERNEL = ../arm/zgemv_n.c
|
||||
CGEMVTKERNEL = ../arm/zgemv_t.c
|
||||
ZGEMVTKERNEL = ../arm/zgemv_t.c
|
||||
endif
|
||||
|
||||
|
|
|
@ -1,3 +1,14 @@
|
|||
ifeq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__))
|
||||
SGEMMKERNEL = gemm_kernel.S
|
||||
SGEMMINCOPY =
|
||||
SGEMMITCOPY =
|
||||
SGEMMONCOPY = ../generic/gemm_ncopy_4.c
|
||||
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
|
||||
SGEMMINCOPYOBJ =
|
||||
SGEMMITCOPYOBJ =
|
||||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
else
|
||||
SGEMMKERNEL = gemm_kernel_altivec.S
|
||||
SGEMMINCOPY = ../generic/gemm_ncopy_16.c
|
||||
SGEMMITCOPY = ../generic/gemm_tcopy_16.c
|
||||
|
@ -7,6 +18,8 @@ SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
|
|||
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
|
||||
DGEMMKERNEL = gemm_kernel.S
|
||||
DGEMMINCOPY =
|
||||
DGEMMITCOPY =
|
||||
|
@ -16,6 +29,18 @@ DGEMMINCOPYOBJ =
|
|||
DGEMMITCOPYOBJ =
|
||||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
ifeq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__))
|
||||
CGEMMKERNEL = zgemm_kernel.S
|
||||
CGEMMINCOPY =
|
||||
CGEMMITCOPY =
|
||||
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
||||
CGEMMINCOPYOBJ =
|
||||
CGEMMITCOPYOBJ =
|
||||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
else
|
||||
CGEMMKERNEL = zgemm_kernel_altivec.S
|
||||
CGEMMINCOPY = ../generic/zgemm_ncopy_8.c
|
||||
CGEMMITCOPY = ../generic/zgemm_tcopy_8.c
|
||||
|
@ -25,6 +50,8 @@ CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
|
|||
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
|
||||
ZGEMMKERNEL = zgemm_kernel.S
|
||||
ZGEMMINCOPY =
|
||||
ZGEMMITCOPY =
|
||||
|
@ -35,22 +62,30 @@ ZGEMMITCOPYOBJ =
|
|||
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
#STRSMKERNEL_LN = trsm_kernel_LN.S
|
||||
#STRSMKERNEL_LT = trsm_kernel_LT.S
|
||||
#STRSMKERNEL_RN = trsm_kernel_LT.S
|
||||
#STRSMKERNEL_RT = trsm_kernel_RT.S
|
||||
|
||||
DTRSMKERNEL_LN = trsm_kernel_LN.S
|
||||
DTRSMKERNEL_LT = trsm_kernel_LT.S
|
||||
DTRSMKERNEL_RN = trsm_kernel_LT.S
|
||||
DTRSMKERNEL_RT = trsm_kernel_RT.S
|
||||
|
||||
#CTRSMKERNEL_LN = ztrsm_kernel_LN.S
|
||||
#CTRSMKERNEL_LT = ztrsm_kernel_LT.S
|
||||
#CTRSMKERNEL_RN = ztrsm_kernel_LT.S
|
||||
#CTRSMKERNEL_RT = ztrsm_kernel_RT.S
|
||||
|
||||
ZTRSMKERNEL_LN = ztrsm_kernel_LN.S
|
||||
ZTRSMKERNEL_LT = ztrsm_kernel_LT.S
|
||||
ZTRSMKERNEL_RN = ztrsm_kernel_LT.S
|
||||
ZTRSMKERNEL_RT = ztrsm_kernel_RT.S
|
||||
|
||||
ifeq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__))
|
||||
STRSMKERNEL_LN = trsm_kernel_LN.S
|
||||
STRSMKERNEL_LT = trsm_kernel_LT.S
|
||||
STRSMKERNEL_RN = trsm_kernel_LT.S
|
||||
STRSMKERNEL_RT = trsm_kernel_RT.S
|
||||
|
||||
CTRSMKERNEL_LN = ztrsm_kernel_LN.S
|
||||
CTRSMKERNEL_LT = ztrsm_kernel_LT.S
|
||||
CTRSMKERNEL_RN = ztrsm_kernel_LT.S
|
||||
CTRSMKERNEL_RT = ztrsm_kernel_RT.S
|
||||
|
||||
|
||||
SROTKERNEL = ../arm/rot.c
|
||||
DROTKERNEL = ../arm/rot.c
|
||||
CROTKERNEL = ../arm/zrot.c
|
||||
ZROTKERNEL = ../arm/zrot.c
|
||||
endif
|
||||
|
|
|
@ -68,10 +68,10 @@ static float casum_kernel_16 (long n, float *x)
|
|||
"addi %2, %2, 128 \n\t"
|
||||
|
||||
"addic. %1, %1, -16 \n\t"
|
||||
"ble 2f \n\t"
|
||||
"ble two%= \n\t"
|
||||
|
||||
".p2align 5 \n"
|
||||
"1: \n\t"
|
||||
".align 5 \n"
|
||||
"one%=: \n\t"
|
||||
|
||||
"xvabssp 48, 40 \n\t"
|
||||
"xvabssp 49, 41 \n\t"
|
||||
|
@ -108,9 +108,9 @@ static float casum_kernel_16 (long n, float *x)
|
|||
"xvaddsp 38, 38, %x5 \n\t"
|
||||
"xvaddsp 39, 39, %x6 \n\t"
|
||||
|
||||
"bgt 1b \n"
|
||||
"bgt one%= \n"
|
||||
|
||||
"2: \n\t"
|
||||
"two%=: \n\t"
|
||||
|
||||
"xvabssp 48, 40 \n\t"
|
||||
"xvabssp 49, 41 \n\t"
|
||||
|
|
|
@ -24,12 +24,21 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
|||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
|
||||
#ifndef HAVE_ASM_KERNEL
|
||||
#include <altivec.h>
|
||||
|
||||
#define offset_0 0
|
||||
#define offset_1 16
|
||||
#define offset_2 32
|
||||
#define offset_3 48
|
||||
#define offset_4 64
|
||||
#define offset_5 80
|
||||
#define offset_6 96
|
||||
#define offset_7 112
|
||||
|
||||
static const unsigned char __attribute__((aligned(16))) swap_mask_arr[]={ 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11};
|
||||
|
||||
static void caxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i)
|
||||
{
|
||||
|
||||
|
@ -43,28 +52,29 @@ static void caxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT
|
|||
register __vector float valpha_i = {alpha_i, alpha_i,alpha_i, alpha_i};
|
||||
#endif
|
||||
|
||||
__vector unsigned char swap_mask = { 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11};
|
||||
register __vector float *vy = (__vector float *) y;
|
||||
register __vector float *vx = (__vector float *) x;
|
||||
__vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr);
|
||||
register __vector float *vptr_y = (__vector float *) y;
|
||||
register __vector float *vptr_x = (__vector float *) x;
|
||||
BLASLONG i=0;
|
||||
for (; i < n/2; i += 8) {
|
||||
for(;i<n/2;i+=8){
|
||||
|
||||
register __vector float vy_0 = vy[i];
|
||||
register __vector float vy_1 = vy[i + 1];
|
||||
register __vector float vy_2 = vy[i + 2];
|
||||
register __vector float vy_3 = vy[i + 3];
|
||||
register __vector float vy_4 = vy[i + 4];
|
||||
register __vector float vy_5 = vy[i + 5];
|
||||
register __vector float vy_6 = vy[i + 6];
|
||||
register __vector float vy_7 = vy[i + 7];
|
||||
register __vector float vx_0 = vx[i];
|
||||
register __vector float vx_1 = vx[i + 1];
|
||||
register __vector float vx_2 = vx[i + 2];
|
||||
register __vector float vx_3 = vx[i + 3];
|
||||
register __vector float vx_4 = vx[i + 4];
|
||||
register __vector float vx_5 = vx[i + 5];
|
||||
register __vector float vx_6 = vx[i + 6];
|
||||
register __vector float vx_7 = vx[i + 7];
|
||||
register __vector float vy_0 = vec_vsx_ld( offset_0 ,vptr_y ) ;
|
||||
register __vector float vy_1 = vec_vsx_ld( offset_1 ,vptr_y ) ;
|
||||
register __vector float vy_2 = vec_vsx_ld( offset_2 ,vptr_y ) ;
|
||||
register __vector float vy_3 = vec_vsx_ld( offset_3 ,vptr_y ) ;
|
||||
register __vector float vy_4 = vec_vsx_ld( offset_4 ,vptr_y ) ;
|
||||
register __vector float vy_5 = vec_vsx_ld( offset_5 ,vptr_y ) ;
|
||||
register __vector float vy_6 = vec_vsx_ld( offset_6 ,vptr_y ) ;
|
||||
register __vector float vy_7 = vec_vsx_ld( offset_7 ,vptr_y ) ;
|
||||
|
||||
register __vector float vx_0 = vec_vsx_ld( offset_0 ,vptr_x ) ;
|
||||
register __vector float vx_1 = vec_vsx_ld( offset_1 ,vptr_x ) ;
|
||||
register __vector float vx_2 = vec_vsx_ld( offset_2 ,vptr_x ) ;
|
||||
register __vector float vx_3 = vec_vsx_ld( offset_3 ,vptr_x ) ;
|
||||
register __vector float vx_4 = vec_vsx_ld( offset_4 ,vptr_x ) ;
|
||||
register __vector float vx_5 = vec_vsx_ld( offset_5 ,vptr_x ) ;
|
||||
register __vector float vx_6 = vec_vsx_ld( offset_6 ,vptr_x ) ;
|
||||
register __vector float vx_7 = vec_vsx_ld( offset_7 ,vptr_x ) ;
|
||||
vy_0 += vx_0*valpha_r;
|
||||
vy_1 += vx_1*valpha_r;
|
||||
vy_2 += vx_2*valpha_r;
|
||||
|
@ -89,15 +99,17 @@ static void caxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT
|
|||
vy_5 += vx_5*valpha_i;
|
||||
vy_6 += vx_6*valpha_i;
|
||||
vy_7 += vx_7*valpha_i;
|
||||
vy[i] = vy_0;
|
||||
vy[i + 1] = vy_1;
|
||||
vy[i + 2] = vy_2;
|
||||
vy[i + 3] = vy_3;
|
||||
vy[i + 4] = vy_4;
|
||||
vy[i + 5] = vy_5 ;
|
||||
vy[i + 6] = vy_6 ;
|
||||
vy[i + 7] = vy_7 ;
|
||||
vec_vsx_st( vy_0, offset_0 ,vptr_y ) ;
|
||||
vec_vsx_st( vy_1, offset_1 ,vptr_y ) ;
|
||||
vec_vsx_st( vy_2, offset_2 ,vptr_y ) ;
|
||||
vec_vsx_st( vy_3, offset_3 ,vptr_y ) ;
|
||||
vec_vsx_st( vy_4, offset_4 ,vptr_y ) ;
|
||||
vec_vsx_st( vy_5, offset_5 ,vptr_y ) ;
|
||||
vec_vsx_st( vy_6, offset_6 ,vptr_y ) ;
|
||||
vec_vsx_st( vy_7, offset_7 ,vptr_y ) ;
|
||||
|
||||
vptr_x+=8;
|
||||
vptr_y+=8;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
|
|
@ -0,0 +1,590 @@
|
|||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
/*
|
||||
.file "caxpy.c"
|
||||
.abiversion 2
|
||||
.section ".text"
|
||||
.align 2
|
||||
.p2align 4,,15
|
||||
.globl caxpy_k
|
||||
.type caxpy_k, @function
|
||||
*/
|
||||
|
||||
PROLOGUE
|
||||
|
||||
.LCF0:
|
||||
0: addis 2,12,.TOC.-.LCF0@ha
|
||||
addi 2,2,.TOC.-.LCF0@l
|
||||
#if _CALL_ELF ==2
|
||||
#ifdef CONJ
|
||||
.localentry caxpyc_k,.-caxpyc_k
|
||||
#else
|
||||
.localentry caxpy_k,.-caxpy_k
|
||||
#endif
|
||||
#endif
|
||||
mr. 7,3
|
||||
ble 0,.L33
|
||||
cmpdi 7,9,1
|
||||
beq 7,.L41
|
||||
.L3:
|
||||
mtctr 7
|
||||
ld 7,96(1)
|
||||
sldi 9,9,3
|
||||
sldi 7,7,3
|
||||
.p2align 4,,15
|
||||
.L14:
|
||||
lfs 10,4(8)
|
||||
lfs 11,0(8)
|
||||
lfs 12,0(10)
|
||||
lfs 0,4(10)
|
||||
fmuls 10,2,10
|
||||
#ifdef CONJ
|
||||
fmadds 11,11,1,10
|
||||
#else
|
||||
fmsubs 11,11,1,10
|
||||
#endif
|
||||
fadds 12,12,11
|
||||
stfs 12,0(10)
|
||||
lfs 11,0(8)
|
||||
lfs 12,4(8)
|
||||
add 8,8,9
|
||||
fmuls 11,2,11
|
||||
#ifdef CONJ
|
||||
fmsubs 12,12,1,11
|
||||
fsubs 0,0,12
|
||||
#else
|
||||
fmadds 12,12,1,11
|
||||
fadds 0,0,12
|
||||
#endif
|
||||
stfs 0,4(10)
|
||||
add 10,10,7
|
||||
bdnz .L14
|
||||
.L33:
|
||||
li 3,0
|
||||
blr
|
||||
.p2align 4,,15
|
||||
.L41:
|
||||
ld 6,96(1)
|
||||
cmpdi 7,6,1
|
||||
bne 7,.L3
|
||||
rldicr. 4,7,0,59
|
||||
std 31,-8(1)
|
||||
li 11,0
|
||||
bne 0,.L42
|
||||
.L4:
|
||||
addi 6,11,8
|
||||
subf 0,4,7
|
||||
sldi 6,6,2
|
||||
addi 9,6,-32
|
||||
add 5,10,6
|
||||
add 3,8,9
|
||||
add 6,8,6
|
||||
subfc 5,5,3
|
||||
add 9,10,9
|
||||
subfe 5,5,5
|
||||
subfc 6,6,9
|
||||
subfe 31,31,31
|
||||
addi 6,5,1
|
||||
addi 5,31,1
|
||||
or 6,6,5
|
||||
rlwinm 6,6,0,0xff
|
||||
cmpwi 7,6,0
|
||||
beq 7,.L7
|
||||
sradi 6,4,63
|
||||
srdi 5,7,63
|
||||
subfc 31,7,4
|
||||
adde 6,5,6
|
||||
subfic 31,0,3
|
||||
subfe 31,31,31
|
||||
xori 6,6,0x1
|
||||
neg 31,31
|
||||
and 6,6,31
|
||||
rlwinm 6,6,0,0xff
|
||||
cmpwi 7,6,0
|
||||
beq 7,.L7
|
||||
cmpd 7,4,7
|
||||
li 6,1
|
||||
blt 7,.L43
|
||||
.L9:
|
||||
addi 0,7,-1
|
||||
subf 0,4,0
|
||||
subfic 0,0,3
|
||||
subfe 31,31,31
|
||||
addi 0,31,1
|
||||
rlwinm 0,0,0,0xff
|
||||
cmpwi 7,0,0
|
||||
bne 7,.L10
|
||||
sradi 0,4,63
|
||||
subfc 31,7,4
|
||||
adde 5,5,0
|
||||
rlwinm 5,5,0,0xff
|
||||
cmpwi 7,5,0
|
||||
bne 7,.L10
|
||||
addi 0,6,-1
|
||||
addis 31,2,.LC3@toc@ha
|
||||
std 30,-16(1)
|
||||
xscvdpspn 12,1
|
||||
xscvdpspn 11,2
|
||||
srdi. 30,0,2
|
||||
addis 6,2,.LC2@toc@ha
|
||||
addi 6,6,.LC2@toc@l
|
||||
mtctr 30
|
||||
addi 31,31,.LC3@toc@l
|
||||
lxvd2x 42,0,6
|
||||
li 5,16
|
||||
li 6,0
|
||||
lxvd2x 41,0,31
|
||||
xxspltw 12,12,0
|
||||
xxspltw 11,11,0
|
||||
xxpermdi 42,42,42,2
|
||||
xxpermdi 41,41,41,2
|
||||
beq 0,.L44
|
||||
.p2align 4,,15
|
||||
.L11:
|
||||
#ifdef CONJ
|
||||
lxvd2x 44,3,6
|
||||
lxvd2x 45,3,5
|
||||
lxvd2x 33,9,6
|
||||
lxvd2x 0,9,5
|
||||
xxpermdi 44,44,44,2
|
||||
xxpermdi 45,45,45,2
|
||||
xxpermdi 32,33,33,2
|
||||
xxpermdi 33,0,0,2
|
||||
vperm 11,13,12,10
|
||||
vperm 13,13,12,9
|
||||
vperm 12,1,0,10
|
||||
vperm 1,1,0,9
|
||||
xvmulsp 0,11,43
|
||||
xvmulsp 32,11,45
|
||||
xvmsubmsp 45,12,0
|
||||
xvmaddasp 32,12,43
|
||||
xvaddsp 44,32,44
|
||||
xvsubsp 32,33,45
|
||||
vmrglw 1,0,12
|
||||
vmrghw 0,0,12
|
||||
#else
|
||||
lxvd2x 45,3,6
|
||||
lxvd2x 33,3,5
|
||||
lxvd2x 43,9,6
|
||||
lxvd2x 0,9,5
|
||||
xxpermdi 45,45,45,2
|
||||
xxpermdi 33,33,33,2
|
||||
xxpermdi 32,43,43,2
|
||||
xxpermdi 43,0,0,2
|
||||
vperm 12,1,13,10
|
||||
vperm 1,1,13,9
|
||||
vperm 13,11,0,10
|
||||
vperm 11,11,0,9
|
||||
xvmulsp 0,11,44
|
||||
xvmulsp 32,11,33
|
||||
xvmaddmsp 33,12,0
|
||||
xvmsubasp 32,12,44
|
||||
xvaddsp 45,32,45
|
||||
xvaddsp 32,33,43
|
||||
vmrglw 1,0,13
|
||||
vmrghw 0,0,13
|
||||
#endif
|
||||
xxpermdi 0,33,33,2
|
||||
xxpermdi 32,32,32,2
|
||||
stxvd2x 0,9,6
|
||||
addi 6,6,32
|
||||
stxvd2x 32,9,5
|
||||
addi 5,5,32
|
||||
bdnz .L11
|
||||
rldicr 0,0,0,61
|
||||
ld 30,-16(1)
|
||||
sldi 9,0,1
|
||||
add 4,4,0
|
||||
add 11,11,9
|
||||
.L10:
|
||||
sldi 6,11,2
|
||||
addi 9,4,1
|
||||
addi 5,6,4
|
||||
cmpd 7,7,9
|
||||
lfsx 12,8,6
|
||||
lfsx 0,10,6
|
||||
addi 9,11,2
|
||||
lfsx 11,8,5
|
||||
fmuls 11,2,11
|
||||
#ifdef CONJ
|
||||
fmadds 12,12,1,11
|
||||
#else
|
||||
fmsubs 12,12,1,11
|
||||
#endif
|
||||
fadds 0,0,12
|
||||
stfsx 0,10,6
|
||||
lfsx 11,8,6
|
||||
lfsx 12,8,5
|
||||
lfsx 0,10,5
|
||||
fmuls 11,2,11
|
||||
#ifdef CONJ
|
||||
fmsubs 12,12,1,11
|
||||
fsubs 0,0,12
|
||||
#else
|
||||
fmadds 12,12,1,11
|
||||
fadds 0,0,12
|
||||
#endif
|
||||
stfsx 0,10,5
|
||||
ble 7,.L39
|
||||
sldi 9,9,2
|
||||
addi 6,4,2
|
||||
addi 5,9,4
|
||||
cmpd 7,7,6
|
||||
lfsx 12,8,9
|
||||
lfsx 0,10,9
|
||||
addi 6,11,4
|
||||
lfsx 11,8,5
|
||||
fmuls 11,2,11
|
||||
#ifdef CONJ
|
||||
fmadds 12,1,12,11
|
||||
#else
|
||||
fmsubs 12,1,12,11
|
||||
#endif
|
||||
fadds 0,0,12
|
||||
stfsx 0,10,9
|
||||
lfsx 11,8,9
|
||||
lfsx 12,8,5
|
||||
lfsx 0,10,5
|
||||
fmuls 11,2,11
|
||||
#ifdef CONJ
|
||||
fmsubs 12,1,12,11
|
||||
fsubs 0,0,12
|
||||
#else
|
||||
fmadds 12,1,12,11
|
||||
fadds 0,0,12
|
||||
#endif
|
||||
stfsx 0,10,5
|
||||
ble 7,.L39
|
||||
sldi 6,6,2
|
||||
addi 4,4,3
|
||||
addi 5,6,4
|
||||
cmpd 7,7,4
|
||||
lfsx 12,8,6
|
||||
lfsx 0,10,6
|
||||
addi 9,11,6
|
||||
lfsx 11,8,5
|
||||
fmuls 11,2,11
|
||||
#ifdef CONJ
|
||||
fmadds 12,1,12,11
|
||||
#else
|
||||
fmsubs 12,1,12,11
|
||||
#endif
|
||||
fadds 0,0,12
|
||||
stfsx 0,10,6
|
||||
lfsx 11,8,6
|
||||
lfsx 12,8,5
|
||||
lfsx 0,10,5
|
||||
fmuls 11,2,11
|
||||
#ifdef CONJ
|
||||
fmsubs 12,1,12,11
|
||||
fsubs 0,0,12
|
||||
#else
|
||||
fmadds 12,1,12,11
|
||||
fadds 0,0,12
|
||||
#endif
|
||||
stfsx 0,10,5
|
||||
ble 7,.L39
|
||||
sldi 9,9,2
|
||||
ld 31,-8(1)
|
||||
addi 7,9,4
|
||||
lfsx 12,8,9
|
||||
lfsx 0,10,9
|
||||
lfsx 11,8,7
|
||||
fmuls 11,2,11
|
||||
#ifdef CONJ
|
||||
fmadds 12,1,12,11
|
||||
#else
|
||||
fmsubs 12,1,12,11
|
||||
#endif
|
||||
fadds 0,0,12
|
||||
stfsx 0,10,9
|
||||
lfsx 11,8,9
|
||||
lfsx 12,8,7
|
||||
lfsx 0,10,7
|
||||
fmuls 2,2,11
|
||||
#ifdef CONJ
|
||||
fmsubs 1,1,12,2
|
||||
fsubs 1,0,1
|
||||
#else
|
||||
fmadds 1,1,12,2
|
||||
fadds 1,0,1
|
||||
#endif
|
||||
stfsx 1,10,7
|
||||
b .L33
|
||||
.L43:
|
||||
mr 6,0
|
||||
b .L9
|
||||
.L7:
|
||||
addi 10,4,1
|
||||
cmpd 7,10,7
|
||||
subf 10,4,7
|
||||
mtctr 10
|
||||
bgt 7,.L26
|
||||
li 10,-1
|
||||
rldicr 10,10,0,0
|
||||
cmpd 7,7,10
|
||||
beq 7,.L26
|
||||
.p2align 4,,15
|
||||
.L13:
|
||||
lfs 10,4(3)
|
||||
lfs 11,0(3)
|
||||
addi 9,9,8
|
||||
addi 3,3,8
|
||||
lfs 12,-8(9)
|
||||
lfs 0,-4(9)
|
||||
fmuls 10,2,10
|
||||
#ifdef CONJ
|
||||
fmadds 11,1,11,10
|
||||
#else
|
||||
fmsubs 11,1,11,10
|
||||
#endif
|
||||
fadds 12,12,11
|
||||
stfs 12,-8(9)
|
||||
lfs 11,-8(3)
|
||||
lfs 12,-4(3)
|
||||
fmuls 11,2,11
|
||||
#ifdef CONJ
|
||||
fmsubs 12,1,12,11
|
||||
fsubs 0,0,12
|
||||
#else
|
||||
fmadds 12,1,12,11
|
||||
fadds 0,0,12
|
||||
#endif
|
||||
stfs 0,-4(9)
|
||||
bdnz .L13
|
||||
.L39:
|
||||
ld 31,-8(1)
|
||||
b .L33
|
||||
.L42:
|
||||
#ifdef CONJ
|
||||
fneg 0,1
|
||||
xxpermdi 32,1,1,0
|
||||
addis 9,2,.LANCHOR0@toc@ha
|
||||
std 28,-32(1)
|
||||
sradi. 28,4,1
|
||||
addi 9,9,.LANCHOR0@toc@l
|
||||
xscvdpspn 5,2
|
||||
xvcvdpsp 32,32
|
||||
lxvd2x 12,0,9
|
||||
xxpermdi 39,0,0,0
|
||||
xxspltw 5,5,0
|
||||
xvcvdpsp 39,39
|
||||
#else
|
||||
fneg 0,2
|
||||
xxpermdi 39,2,2,0
|
||||
addis 9,2,.LANCHOR0@toc@ha
|
||||
std 28,-32(1)
|
||||
sradi. 28,4,1
|
||||
addi 9,9,.LANCHOR0@toc@l
|
||||
xscvdpspn 5,1
|
||||
xvcvdpsp 39,39
|
||||
lxvd2x 12,0,9
|
||||
xxpermdi 32,0,0,0
|
||||
xxspltw 5,5,0
|
||||
xvcvdpsp 32,32
|
||||
#endif
|
||||
xxpermdi 12,12,12,2
|
||||
vmrgew 7,7,0
|
||||
beq 0,.L5
|
||||
xxlnor 38,12,12
|
||||
std 29,-24(1)
|
||||
std 30,-16(1)
|
||||
mr 6,8
|
||||
mr 9,10
|
||||
li 29,0
|
||||
li 30,16
|
||||
li 31,32
|
||||
li 12,48
|
||||
li 0,64
|
||||
li 11,80
|
||||
li 3,96
|
||||
li 5,112
|
||||
.p2align 4,,15
|
||||
.L6:
|
||||
lxvd2x 6,0,9
|
||||
lxvd2x 40,0,6
|
||||
addi 29,29,8
|
||||
lxvd2x 41,6,30
|
||||
lxvd2x 42,6,31
|
||||
cmpd 7,28,29
|
||||
lxvd2x 43,6,12
|
||||
lxvd2x 44,6,0
|
||||
lxvd2x 45,6,11
|
||||
lxvd2x 33,6,3
|
||||
lxvd2x 32,6,5
|
||||
lxvd2x 7,9,30
|
||||
addi 6,6,128
|
||||
lxvd2x 8,9,31
|
||||
lxvd2x 9,9,12
|
||||
xxpermdi 40,40,40,2
|
||||
xxpermdi 6,6,6,2
|
||||
lxvd2x 10,9,0
|
||||
lxvd2x 11,9,11
|
||||
xxpermdi 41,41,41,2
|
||||
xxpermdi 42,42,42,2
|
||||
lxvd2x 12,9,3
|
||||
lxvd2x 0,9,5
|
||||
xxpermdi 43,43,43,2
|
||||
xxpermdi 44,44,44,2
|
||||
xxpermdi 45,45,45,2
|
||||
xxpermdi 33,33,33,2
|
||||
xxpermdi 32,32,32,2
|
||||
xxpermdi 7,7,7,2
|
||||
xxpermdi 8,8,8,2
|
||||
xxpermdi 9,9,9,2
|
||||
xxpermdi 10,10,10,2
|
||||
xxpermdi 11,11,11,2
|
||||
xxpermdi 12,12,12,2
|
||||
xxpermdi 0,0,0,2
|
||||
#ifndef CONJ
|
||||
xvmaddasp 6,5,40
|
||||
xvmaddasp 7,5,41
|
||||
xvmaddasp 8,5,42
|
||||
xvmaddasp 9,5,43
|
||||
xvmaddasp 10,5,44
|
||||
xvmaddasp 11,5,45
|
||||
xvmaddasp 12,5,33
|
||||
xvmaddasp 0,5,32
|
||||
vperm 8,8,8,6
|
||||
vperm 9,9,9,6
|
||||
vperm 10,10,10,6
|
||||
vperm 11,11,11,6
|
||||
vperm 12,12,12,6
|
||||
vperm 13,13,13,6
|
||||
vperm 1,1,1,6
|
||||
vperm 0,0,0,6
|
||||
#endif
|
||||
xvmaddasp 6,39,40
|
||||
xvmaddasp 7,39,41
|
||||
xvmaddasp 8,39,42
|
||||
xvmaddasp 9,39,43
|
||||
xvmaddasp 10,39,44
|
||||
xvmaddasp 11,39,45
|
||||
xvmaddasp 12,39,33
|
||||
xvmaddasp 0,39,32
|
||||
#ifdef CONJ
|
||||
vperm 8,8,8,6
|
||||
vperm 9,9,9,6
|
||||
vperm 10,10,10,6
|
||||
vperm 11,11,11,6
|
||||
vperm 12,12,12,6
|
||||
vperm 13,13,13,6
|
||||
vperm 1,1,1,6
|
||||
vperm 0,0,0,6
|
||||
xvmaddasp 6,5,40
|
||||
xvmaddasp 7,5,41
|
||||
xvmaddasp 8,5,42
|
||||
xvmaddasp 9,5,43
|
||||
xvmaddasp 10,5,44
|
||||
xvmaddasp 11,5,45
|
||||
xvmaddasp 12,5,33
|
||||
xvmaddasp 0,5,32
|
||||
#endif
|
||||
xxpermdi 6,6,6,2
|
||||
xxpermdi 7,7,7,2
|
||||
xxpermdi 8,8,8,2
|
||||
xxpermdi 9,9,9,2
|
||||
stxvd2x 6,0,9
|
||||
xxpermdi 10,10,10,2
|
||||
stxvd2x 7,9,30
|
||||
xxpermdi 11,11,11,2
|
||||
stxvd2x 8,9,31
|
||||
xxpermdi 12,12,12,2
|
||||
stxvd2x 9,9,12
|
||||
xxpermdi 0,0,0,2
|
||||
stxvd2x 10,9,0
|
||||
stxvd2x 11,9,11
|
||||
stxvd2x 12,9,3
|
||||
stxvd2x 0,9,5
|
||||
addi 9,9,128
|
||||
bgt 7,.L6
|
||||
ld 29,-24(1)
|
||||
ld 30,-16(1)
|
||||
.L5:
|
||||
cmpd 7,7,4
|
||||
ble 7,.L36
|
||||
sldi 11,4,1
|
||||
ld 28,-32(1)
|
||||
b .L4
|
||||
.L36:
|
||||
ld 28,-32(1)
|
||||
ld 31,-8(1)
|
||||
b .L33
|
||||
.L44:
|
||||
li 31,1
|
||||
mtctr 31
|
||||
b .L11
|
||||
.L26:
|
||||
li 10,1
|
||||
mtctr 10
|
||||
b .L13
|
||||
.long 0
|
||||
.byte 0,0,0,0,0,4,0,0
|
||||
#if _CALL_ELF ==2
|
||||
#ifdef CONJ
|
||||
.size caxpyc_k,.-caxpyc_k
|
||||
#else
|
||||
.size caxpy_k,.-caxpy_k
|
||||
#endif
|
||||
#endif
|
||||
.section .rodata
|
||||
.align 4
|
||||
.set .LANCHOR0,. + 0
|
||||
.type swap_mask_arr, @object
|
||||
.size swap_mask_arr, 16
|
||||
swap_mask_arr:
|
||||
.byte 4
|
||||
.byte 5
|
||||
.byte 6
|
||||
.byte 7
|
||||
.byte 0
|
||||
.byte 1
|
||||
.byte 2
|
||||
.byte 3
|
||||
.byte 12
|
||||
.byte 13
|
||||
.byte 14
|
||||
.byte 15
|
||||
.byte 8
|
||||
.byte 9
|
||||
.byte 10
|
||||
.byte 11
|
||||
.section .rodata.cst16,"aM",@progbits,16
|
||||
.align 4
|
||||
.LC2:
|
||||
.byte 31
|
||||
.byte 30
|
||||
.byte 29
|
||||
.byte 28
|
||||
.byte 23
|
||||
.byte 22
|
||||
.byte 21
|
||||
.byte 20
|
||||
.byte 15
|
||||
.byte 14
|
||||
.byte 13
|
||||
.byte 12
|
||||
.byte 7
|
||||
.byte 6
|
||||
.byte 5
|
||||
.byte 4
|
||||
.LC3:
|
||||
.byte 27
|
||||
.byte 26
|
||||
.byte 25
|
||||
.byte 24
|
||||
.byte 19
|
||||
.byte 18
|
||||
.byte 17
|
||||
.byte 16
|
||||
.byte 11
|
||||
.byte 10
|
||||
.byte 9
|
||||
.byte 8
|
||||
.byte 3
|
||||
.byte 2
|
||||
.byte 1
|
||||
.byte 0
|
||||
.ident "GCC: (SUSE Linux) 7.3.1 20180323 [gcc-7-branch revision 258812]"
|
||||
.gnu_attribute 4, 1
|
||||
.section .note.GNU-stack,"",@progbits
|
|
@ -0,0 +1,546 @@
|
|||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
||||
/*
|
||||
.file "caxpy.c"
|
||||
.abiversion 2
|
||||
.section ".text"
|
||||
.align 2
|
||||
.p2align 4,,15
|
||||
.globl caxpy_k
|
||||
.type caxpy_k, @function
|
||||
*/
|
||||
|
||||
PROLOGUE
|
||||
|
||||
caxpy_k:
|
||||
.LCF0:
|
||||
0: addis 2,12,.TOC.-.LCF0@ha
|
||||
addi 2,2,.TOC.-.LCF0@l
|
||||
#ifdef CONJ
|
||||
.localentry caxpyc_k,.-caxpyc_k
|
||||
#else
|
||||
.localentry caxpy_k,.-caxpy_k
|
||||
#endif
|
||||
mr. 7,3
|
||||
ble 0,.L33
|
||||
cmpdi 7,9,1
|
||||
beq 7,.L37
|
||||
.L3:
|
||||
mtctr 7
|
||||
ld 7,96(1)
|
||||
sldi 9,9,3
|
||||
sldi 7,7,3
|
||||
.p2align 4,,15
|
||||
.L14:
|
||||
lfs 10,4(8)
|
||||
lfs 11,0(8)
|
||||
lfs 12,0(10)
|
||||
lfs 0,4(10)
|
||||
fmuls 10,2,10
|
||||
#ifdef CONJ
|
||||
fmadds 11,11,1,10
|
||||
#else
|
||||
fmsubs 11,11,1,10
|
||||
#endif
|
||||
fadds 12,12,11
|
||||
stfs 12,0(10)
|
||||
lfs 11,0(8)
|
||||
lfs 12,4(8)
|
||||
add 8,8,9
|
||||
fmuls 11,2,11
|
||||
#ifdef CONJ
|
||||
fmsubs 12,12,1,11
|
||||
fsubs 0,0,12
|
||||
#else
|
||||
fmadds 12,12,1,11
|
||||
fadds 0,0,12
|
||||
#endif
|
||||
stfs 0,4(10)
|
||||
add 10,10,7
|
||||
bdnz .L14
|
||||
.L33:
|
||||
li 3,0
|
||||
blr
|
||||
.p2align 4,,15
|
||||
.L37:
|
||||
ld 6,96(1)
|
||||
cmpdi 7,6,1
|
||||
bne 7,.L3
|
||||
rldicr. 4,7,0,59
|
||||
li 11,0
|
||||
bne 0,.L38
|
||||
.L4:
|
||||
addi 6,11,8
|
||||
subf 0,4,7
|
||||
sldi 6,6,2
|
||||
addi 9,6,-32
|
||||
add 5,10,6
|
||||
add 6,8,6
|
||||
add 3,8,9
|
||||
add 9,10,9
|
||||
subfc 5,5,3
|
||||
subfe 5,5,5
|
||||
subfc 6,6,9
|
||||
subfe 12,12,12
|
||||
addi 6,5,1
|
||||
addi 5,12,1
|
||||
or 6,6,5
|
||||
rlwinm 6,6,0,0xff
|
||||
cmpwi 7,6,0
|
||||
beq 7,.L7
|
||||
sradi 6,4,63
|
||||
srdi 5,7,63
|
||||
subfc 12,7,4
|
||||
adde 6,5,6
|
||||
subfic 12,0,4
|
||||
subfe 12,12,12
|
||||
xori 6,6,0x1
|
||||
neg 12,12
|
||||
and 6,6,12
|
||||
rlwinm 6,6,0,0xff
|
||||
cmpwi 7,6,0
|
||||
beq 7,.L7
|
||||
cmpd 7,4,7
|
||||
li 6,1
|
||||
blt 7,.L39
|
||||
.L9:
|
||||
addi 0,7,-1
|
||||
subf 0,4,0
|
||||
subfic 0,0,3
|
||||
subfe 12,12,12
|
||||
addi 0,12,1
|
||||
rlwinm 0,0,0,0xff
|
||||
cmpwi 7,0,0
|
||||
bne 7,.L10
|
||||
sradi 0,4,63
|
||||
subfc 12,7,4
|
||||
adde 5,5,0
|
||||
rlwinm 5,5,0,0xff
|
||||
cmpwi 7,5,0
|
||||
bne 7,.L10
|
||||
xscvdpspn 0,1
|
||||
xscvdpspn 12,2
|
||||
addi 0,6,-1
|
||||
std 31,-8(1)
|
||||
addis 12,2,.LC2@toc@ha
|
||||
addis 6,2,.LC3@toc@ha
|
||||
li 5,16
|
||||
srdi. 31,0,2
|
||||
addi 6,6,.LC3@toc@l
|
||||
addi 12,12,.LC2@toc@l
|
||||
mtctr 31
|
||||
lxv 41,0(6)
|
||||
lxv 42,0(12)
|
||||
li 6,0
|
||||
xxspltw 0,0,0
|
||||
xxspltw 12,12,0
|
||||
beq 0,.L40
|
||||
.p2align 4,,15
|
||||
.L11:
|
||||
#ifdef CONJ
|
||||
lxvx 33,3,5
|
||||
lxvx 44,3,6
|
||||
lxvx 43,9,6
|
||||
lxvx 32,9,5
|
||||
vperm 13,1,12,10
|
||||
vperm 12,1,12,9
|
||||
vperm 8,0,11,10
|
||||
vperm 0,0,11,9
|
||||
xvmulsp 33,12,44
|
||||
xvmulsp 11,12,45
|
||||
xvmaddasp 33,0,45
|
||||
xvmsubmsp 44,0,11
|
||||
xvaddsp 33,33,40
|
||||
xvsubsp 32,32,44
|
||||
#else
|
||||
lxvx 33,3,6
|
||||
lxvx 32,3,5
|
||||
lxvx 43,9,6
|
||||
lxvx 44,9,5
|
||||
vperm 13,0,1,10
|
||||
vperm 0,0,1,9
|
||||
vperm 8,12,11,10
|
||||
vperm 12,12,11,9
|
||||
xvmulsp 33,12,32
|
||||
xvmulsp 11,12,45
|
||||
xvmsubasp 33,0,45
|
||||
xvmaddmsp 32,0,11
|
||||
xvaddsp 33,33,40
|
||||
xvaddsp 32,32,44
|
||||
#endif
|
||||
vmrglw 13,0,1
|
||||
vmrghw 0,0,1
|
||||
stxvx 45,9,6
|
||||
stxvx 32,9,5
|
||||
addi 6,6,32
|
||||
addi 5,5,32
|
||||
bdnz .L11
|
||||
rldicr 0,0,0,61
|
||||
ld 31,-8(1)
|
||||
sldi 9,0,1
|
||||
add 4,4,0
|
||||
add 11,11,9
|
||||
.L10:
|
||||
sldi 5,11,2
|
||||
addi 6,4,1
|
||||
addi 9,11,2
|
||||
addi 3,5,4
|
||||
lfsx 12,8,5
|
||||
cmpd 7,7,6
|
||||
lfsx 0,10,5
|
||||
lfsx 11,8,3
|
||||
fmuls 11,2,11
|
||||
#ifdef CONJ
|
||||
fmadds 12,12,1,11
|
||||
#else
|
||||
fmsubs 12,12,1,11
|
||||
#endif
|
||||
fadds 0,0,12
|
||||
stfsx 0,10,5
|
||||
lfsx 11,8,5
|
||||
lfsx 12,8,3
|
||||
lfsx 0,10,3
|
||||
fmuls 11,2,11
|
||||
#ifdef CONJ
|
||||
fmsubs 12,12,1,11
|
||||
fsubs 0,0,12
|
||||
#else
|
||||
fmadds 12,12,1,11
|
||||
fadds 0,0,12
|
||||
#endif
|
||||
stfsx 0,10,3
|
||||
ble 7,.L33
|
||||
sldi 9,9,2
|
||||
addi 5,4,2
|
||||
addi 6,11,4
|
||||
addi 3,9,4
|
||||
lfsx 12,8,9
|
||||
cmpd 7,7,5
|
||||
lfsx 0,10,9
|
||||
lfsx 11,8,3
|
||||
fmuls 11,2,11
|
||||
#ifdef CONJ
|
||||
fmadds 12,1,12,11
|
||||
#else
|
||||
fmsubs 12,1,12,11
|
||||
#endif
|
||||
fadds 0,0,12
|
||||
stfsx 0,10,9
|
||||
lfsx 11,8,9
|
||||
lfsx 12,8,3
|
||||
lfsx 0,10,3
|
||||
fmuls 11,2,11
|
||||
#ifdef CONJ
|
||||
fmsubs 12,1,12,11
|
||||
fsubs 0,0,12
|
||||
#else
|
||||
fmadds 12,1,12,11
|
||||
fadds 0,0,12
|
||||
#endif
|
||||
stfsx 0,10,3
|
||||
ble 7,.L33
|
||||
sldi 6,6,2
|
||||
addi 4,4,3
|
||||
addi 9,11,6
|
||||
addi 5,6,4
|
||||
lfsx 12,8,6
|
||||
cmpd 7,7,4
|
||||
lfsx 0,10,6
|
||||
lfsx 11,8,5
|
||||
fmuls 11,2,11
|
||||
#ifdef CONJ
|
||||
fmadds 12,1,12,11
|
||||
#else
|
||||
fmsubs 12,1,12,11
|
||||
#endif
|
||||
fadds 0,0,12
|
||||
stfsx 0,10,6
|
||||
lfsx 11,8,6
|
||||
lfsx 12,8,5
|
||||
lfsx 0,10,5
|
||||
fmuls 11,2,11
|
||||
#ifdef CONJ
|
||||
fmsubs 12,1,12,11
|
||||
fsubs 0,0,12
|
||||
#else
|
||||
fmadds 12,1,12,11
|
||||
fadds 0,0,12
|
||||
#endif
|
||||
stfsx 0,10,5
|
||||
ble 7,.L33
|
||||
sldi 9,9,2
|
||||
addi 7,9,4
|
||||
lfsx 12,8,9
|
||||
lfsx 0,10,9
|
||||
lfsx 11,8,7
|
||||
fmuls 11,2,11
|
||||
#ifdef CONJ
|
||||
fmadds 12,1,12,11
|
||||
#else
|
||||
fmsubs 12,1,12,11
|
||||
#endif
|
||||
fadds 0,0,12
|
||||
stfsx 0,10,9
|
||||
lfsx 11,8,9
|
||||
lfsx 12,8,7
|
||||
lfsx 0,10,7
|
||||
fmuls 2,2,11
|
||||
#ifdef CONJ
|
||||
fmsubs 1,1,12,2
|
||||
fsubs 1,0,1
|
||||
#else
|
||||
fmadds 1,1,12,2
|
||||
fadds 1,0,1
|
||||
#endif
|
||||
stfsx 1,10,7
|
||||
b .L33
|
||||
.L39:
|
||||
mr 6,0
|
||||
b .L9
|
||||
.L38:
|
||||
#ifdef CONJ
|
||||
fneg 0,1
|
||||
xxpermdi 45,1,1,0
|
||||
xscvdpspn 12,2
|
||||
addis 9,2,.LANCHOR0@toc@ha
|
||||
sradi. 3,4,1
|
||||
xxpermdi 44,0,0,0
|
||||
addi 9,9,.LANCHOR0@toc@l
|
||||
xvcvdpsp 45,45
|
||||
lxv 33,0(9)
|
||||
xvcvdpsp 32,44
|
||||
xxspltw 12,12,0
|
||||
#else
|
||||
fneg 12,2
|
||||
xxpermdi 32,2,2,0
|
||||
xscvdpspn 0,1
|
||||
addis 9,2,.LANCHOR0@toc@ha
|
||||
sradi. 3,4,1
|
||||
xxpermdi 45,12,12,0
|
||||
addi 9,9,.LANCHOR0@toc@l
|
||||
xvcvdpsp 32,32
|
||||
lxv 33,0(9)
|
||||
xvcvdpsp 45,45
|
||||
xxspltw 0,0,0
|
||||
#endif
|
||||
vmrgew 0,0,13
|
||||
beq 0,.L5
|
||||
mr 6,8
|
||||
mr 9,10
|
||||
li 5,0
|
||||
.p2align 4,,15
|
||||
.L6:
|
||||
lxv 38,16(6)
|
||||
lxv 11,16(9)
|
||||
addi 5,5,8
|
||||
addi 6,6,128
|
||||
addi 9,9,128
|
||||
lxv 39,-96(6)
|
||||
lxv 40,-80(6)
|
||||
lxv 41,-64(6)
|
||||
lxv 42,-48(6)
|
||||
cmpd 7,3,5
|
||||
lxv 43,-32(6)
|
||||
lxv 45,-128(6)
|
||||
lxv 44,-16(6)
|
||||
#ifdef CONJ
|
||||
lxv 0,-128(9)
|
||||
vpermr 17,6,6,1
|
||||
xvmaddmsp 38,32,11
|
||||
lxv 11,-96(9)
|
||||
vpermr 18,7,7,1
|
||||
vpermr 19,8,8,1
|
||||
vpermr 2,9,9,1
|
||||
vpermr 3,10,10,1
|
||||
vpermr 4,11,11,1
|
||||
xvmaddasp 0,32,45
|
||||
vpermr 5,12,12,1
|
||||
xvmaddmsp 39,32,11
|
||||
lxv 11,-80(9)
|
||||
vpermr 13,13,13,1
|
||||
xvmaddasp 38,12,49
|
||||
xvmaddmsp 40,32,11
|
||||
lxv 11,-64(9)
|
||||
xvmaddmsp 45,12,0
|
||||
xvmaddasp 39,12,50
|
||||
stxv 38,-112(9)
|
||||
xvmaddmsp 41,32,11
|
||||
lxv 11,-48(9)
|
||||
xvmaddasp 40,12,51
|
||||
stxv 45,-128(9)
|
||||
stxv 39,-96(9)
|
||||
xvmaddmsp 42,32,11
|
||||
lxv 11,-32(9)
|
||||
xvmaddasp 41,12,34
|
||||
stxv 40,-80(9)
|
||||
xvmaddmsp 43,32,11
|
||||
lxv 11,-16(9)
|
||||
xvmaddasp 42,12,35
|
||||
stxv 41,-64(9)
|
||||
xvmaddmsp 44,32,11
|
||||
xvmaddasp 43,12,36
|
||||
stxv 42,-48(9)
|
||||
xvmaddasp 44,12,37
|
||||
#else
|
||||
lxv 12,-128(9)
|
||||
vpermr 17,6,6,1
|
||||
xvmaddmsp 38,0,11
|
||||
lxv 11,-96(9)
|
||||
vpermr 18,7,7,1
|
||||
vpermr 19,8,8,1
|
||||
vpermr 2,9,9,1
|
||||
vpermr 3,10,10,1
|
||||
vpermr 4,11,11,1
|
||||
xvmaddasp 12,0,45
|
||||
vpermr 5,12,12,1
|
||||
xvmaddmsp 39,0,11
|
||||
lxv 11,-80(9)
|
||||
vpermr 13,13,13,1
|
||||
xvmaddasp 38,32,49
|
||||
xvmaddmsp 40,0,11
|
||||
lxv 11,-64(9)
|
||||
xvmaddmsp 45,32,12
|
||||
xvmaddasp 39,32,50
|
||||
stxv 38,-112(9)
|
||||
xvmaddmsp 41,0,11
|
||||
lxv 11,-48(9)
|
||||
xvmaddasp 40,32,51
|
||||
stxv 45,-128(9)
|
||||
stxv 39,-96(9)
|
||||
xvmaddmsp 42,0,11
|
||||
lxv 11,-32(9)
|
||||
xvmaddasp 41,32,34
|
||||
stxv 40,-80(9)
|
||||
xvmaddmsp 43,0,11
|
||||
lxv 11,-16(9)
|
||||
xvmaddasp 42,32,35
|
||||
stxv 41,-64(9)
|
||||
xvmaddmsp 44,0,11
|
||||
xvmaddasp 43,32,36
|
||||
stxv 42,-48(9)
|
||||
xvmaddasp 44,32,37
|
||||
#endif
|
||||
stxv 43,-32(9)
|
||||
stxv 44,-16(9)
|
||||
bgt 7,.L6
|
||||
.L5:
|
||||
cmpd 7,7,4
|
||||
ble 7,.L33
|
||||
sldi 11,4,1
|
||||
b .L4
|
||||
.L7:
|
||||
addi 10,4,1
|
||||
subf 8,4,7
|
||||
cmpd 7,10,7
|
||||
mtctr 8
|
||||
bgt 7,.L26
|
||||
li 10,-1
|
||||
rldicr 10,10,0,0
|
||||
cmpd 7,7,10
|
||||
beq 7,.L26
|
||||
.p2align 4,,15
|
||||
.L13:
|
||||
lfs 10,4(3)
|
||||
lfs 11,0(3)
|
||||
lfs 12,0(9)
|
||||
lfs 0,4(9)
|
||||
addi 3,3,8
|
||||
addi 9,9,8
|
||||
fmuls 10,2,10
|
||||
#ifdef CONJ
|
||||
fmadds 11,1,11,10
|
||||
#else
|
||||
fmsubs 11,1,11,10
|
||||
#endif
|
||||
fadds 12,12,11
|
||||
stfs 12,-8(9)
|
||||
lfs 11,-8(3)
|
||||
lfs 12,-4(3)
|
||||
fmuls 11,2,11
|
||||
#ifdef CONJ
|
||||
fmsubs 12,1,12,11
|
||||
fsubs 0,0,12
|
||||
#else
|
||||
fmadds 12,1,12,11
|
||||
fadds 0,0,12
|
||||
#endif
|
||||
stfs 0,-4(9)
|
||||
bdnz .L13
|
||||
b .L33
|
||||
.L40:
|
||||
li 31,1
|
||||
mtctr 31
|
||||
b .L11
|
||||
.L26:
|
||||
li 10,1
|
||||
mtctr 10
|
||||
b .L13
|
||||
.long 0
|
||||
.byte 0,0,0,0,0,1,0,0
|
||||
#ifdef CONJ
|
||||
.size caxpyc_k,.-caxpyc_k
|
||||
#else
|
||||
.size caxpy_k,.-caxpy_k
|
||||
#endif
|
||||
/*
 * swap_mask_arr: 16-byte read-only table, 16-byte aligned (.align 4), also
 * reachable through the .LANCHOR0 alias defined just before it.
 * The byte values are 4,5,6,7,0,1,2,3 / 12,13,14,15,8,9,10,11: read as
 * permute indices they exchange the two 4-byte words inside each 8-byte group.
 * NOTE(review): the code that loads this table is outside this excerpt;
 * presumably it is the control vector of the vpermr instructions in the
 * kernel above -- confirm.
 */
.section .rodata
|
||||
.align 4
|
||||
.set .LANCHOR0,. + 0
|
||||
.type swap_mask_arr, @object
|
||||
.size swap_mask_arr, 16
|
||||
swap_mask_arr:
|
||||
.byte 4
|
||||
.byte 5
|
||||
.byte 6
|
||||
.byte 7
|
||||
.byte 0
|
||||
.byte 1
|
||||
.byte 2
|
||||
.byte 3
|
||||
.byte 12
|
||||
.byte 13
|
||||
.byte 14
|
||||
.byte 15
|
||||
.byte 8
|
||||
.byte 9
|
||||
.byte 10
|
||||
.byte 11
|
||||
/*
 * .LC2 / .LC3: two 16-byte constants placed in a mergeable constant section
 * with 16-byte entries.
 *   .LC2 lists bytes 4..7 of every 8-byte group of a 32-byte range,
 *        highest index first (31..28, 23..20, 15..12, 7..4).
 *   .LC3 lists bytes 0..3 of every 8-byte group, highest index first
 *        (27..24, 19..16, 11..8, 3..0).
 * NOTE(review): the instructions that load these constants are outside this
 * excerpt; the values look like permute controls that gather every other
 * 4-byte word from a pair of vectors -- confirm against the users.
 */
.section .rodata.cst16,"aM",@progbits,16
|
||||
.align 4
|
||||
.LC2:
|
||||
.byte 31
|
||||
.byte 30
|
||||
.byte 29
|
||||
.byte 28
|
||||
.byte 23
|
||||
.byte 22
|
||||
.byte 21
|
||||
.byte 20
|
||||
.byte 15
|
||||
.byte 14
|
||||
.byte 13
|
||||
.byte 12
|
||||
.byte 7
|
||||
.byte 6
|
||||
.byte 5
|
||||
.byte 4
|
||||
.LC3:
|
||||
.byte 27
|
||||
.byte 26
|
||||
.byte 25
|
||||
.byte 24
|
||||
.byte 19
|
||||
.byte 18
|
||||
.byte 17
|
||||
.byte 16
|
||||
.byte 11
|
||||
.byte 10
|
||||
.byte 9
|
||||
.byte 8
|
||||
.byte 3
|
||||
.byte 2
|
||||
.byte 1
|
||||
.byte 0
|
||||
.ident "GCC: (SUSE Linux) 7.3.1 20180323 [gcc-7-branch revision 258812]"
|
||||
.gnu_attribute 4, 1
|
||||
.section .note.GNU-stack,"",@progbits
|
|
@ -62,10 +62,10 @@ static void ccopy_kernel_32 (long n, float *x, float *y)
|
|||
"addi %2, %2, 128 \n\t"
|
||||
|
||||
"addic. %1, %1, -32 \n\t"
|
||||
"ble 2f \n\t"
|
||||
"ble two%= \n\t"
|
||||
|
||||
".p2align 5 \n"
|
||||
"1: \n\t"
|
||||
".align 5 \n"
|
||||
"one%=: \n\t"
|
||||
|
||||
"stxvd2x 32, 0, %3 \n\t"
|
||||
"stxvd2x 33, %5, %3 \n\t"
|
||||
|
@ -108,9 +108,9 @@ static void ccopy_kernel_32 (long n, float *x, float *y)
|
|||
"addi %2, %2, 128 \n\t"
|
||||
|
||||
"addic. %1, %1, -32 \n\t"
|
||||
"bgt 1b \n"
|
||||
"bgt one%= \n"
|
||||
|
||||
"2: \n\t"
|
||||
"two%=: \n\t"
|
||||
|
||||
"stxvd2x 32, 0, %3 \n\t"
|
||||
"stxvd2x 33, %5, %3 \n\t"
|
||||
|
|
|
@ -25,15 +25,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#ifndef HAVE_KERNEL_8
|
||||
#include <altivec.h>
|
||||
|
||||
#define offset_0 0
|
||||
#define offset_1 16
|
||||
#define offset_2 32
|
||||
#define offset_3 48
|
||||
|
||||
|
||||
|
||||
static const unsigned char __attribute__((aligned(16))) swap_mask_arr[]={ 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11};
|
||||
static void cdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, float *dot)
|
||||
{
|
||||
__vector unsigned char swap_mask = { 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11};
|
||||
register __vector float *vy = (__vector float *) y;
|
||||
register __vector float *vx = (__vector float *) x;
|
||||
BLASLONG i = 0;
|
||||
__vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr);
|
||||
register __vector float *vptr_y = (__vector float *) y;
|
||||
register __vector float *vptr_x = (__vector float *) x;
|
||||
register __vector float vd_0 = { 0 };
|
||||
register __vector float vd_1 = { 0 };
|
||||
register __vector float vd_2 = { 0 };
|
||||
|
@ -41,26 +48,23 @@ static void cdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, float *dot)
|
|||
register __vector float vdd_0 = { 0 };
|
||||
register __vector float vdd_1 = { 0 };
|
||||
register __vector float vdd_2 = { 0 };
|
||||
register __vector float vdd_3 = { 0 };
|
||||
for (; i < n/2; i += 4) {
|
||||
register __vector float vdd_3 = { 0 };
|
||||
BLASLONG i=0;
|
||||
for(;i<n/2;i+=4){
|
||||
|
||||
register __vector float vyy_0 ;
|
||||
register __vector float vyy_1 ;
|
||||
register __vector float vyy_2 ;
|
||||
register __vector float vyy_3 ;
|
||||
register __vector float vy_0 = vec_vsx_ld( offset_0 ,vptr_y ) ;
|
||||
register __vector float vy_1 = vec_vsx_ld( offset_1 ,vptr_y ) ;
|
||||
register __vector float vy_2 = vec_vsx_ld( offset_2 ,vptr_y ) ;
|
||||
register __vector float vy_3 = vec_vsx_ld( offset_3 ,vptr_y ) ;
|
||||
|
||||
register __vector float vy_0 = vy[i];
|
||||
register __vector float vy_1 = vy[i + 1];
|
||||
register __vector float vy_2 = vy[i + 2];
|
||||
register __vector float vy_3 = vy[i + 3];
|
||||
register __vector float vx_0= vx[i];
|
||||
register __vector float vx_1 = vx[i + 1];
|
||||
register __vector float vx_2 = vx[i + 2];
|
||||
register __vector float vx_3 = vx[i + 3];
|
||||
vyy_0 = vec_perm(vy_0, vy_0, swap_mask);
|
||||
vyy_1 = vec_perm(vy_1, vy_1, swap_mask);
|
||||
vyy_2 = vec_perm(vy_2, vy_2, swap_mask);
|
||||
vyy_3 = vec_perm(vy_3, vy_3, swap_mask);
|
||||
register __vector float vx_0 = vec_vsx_ld( offset_0 ,vptr_x ) ;
|
||||
register __vector float vx_1 = vec_vsx_ld( offset_1 ,vptr_x ) ;
|
||||
register __vector float vyy_0 = vec_perm(vy_0, vy_0, swap_mask);
|
||||
register __vector float vyy_1 = vec_perm(vy_1, vy_1, swap_mask);
|
||||
register __vector float vx_2 = vec_vsx_ld( offset_2 ,vptr_x ) ;
|
||||
register __vector float vx_3 = vec_vsx_ld( offset_3 ,vptr_x ) ;
|
||||
register __vector float vyy_2 = vec_perm(vy_2, vy_2, swap_mask);
|
||||
register __vector float vyy_3 = vec_perm(vy_3, vy_3, swap_mask);
|
||||
|
||||
vd_0 += vx_0 * vy_0;
|
||||
vd_1 += vx_1 * vy_1;
|
||||
|
@ -72,6 +76,8 @@ static void cdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, float *dot)
|
|||
vdd_2 += vx_2 * vyy_2;
|
||||
vdd_3 += vx_3 * vyy_3;
|
||||
|
||||
vptr_x+=4;
|
||||
vptr_y+=4;
|
||||
|
||||
}
|
||||
//aggregate
|
||||
|
@ -96,7 +102,7 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA
|
|||
BLASLONG i = 0;
|
||||
BLASLONG ix=0, iy=0;
|
||||
OPENBLAS_COMPLEX_FLOAT result;
|
||||
FLOAT dot[4] __attribute__ ((aligned(16))) = {0.0, 0.0, 0.0, 0.0};
|
||||
FLOAT dot[4] __attribute__((aligned(16))) = {0.0, 0.0, 0.0, 0.0};
|
||||
|
||||
if (n <= 0) {
|
||||
CREAL(result) = 0.0;
|
||||
|
|
|
@ -0,0 +1,245 @@
|
|||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
/*
|
||||
.file "cdot.c"
|
||||
.abiversion 2
|
||||
.section ".text"
|
||||
.align 2
|
||||
.p2align 4,,15
|
||||
.globl cdot_k
|
||||
.type cdot_k, @function
|
||||
*/
|
||||
PROLOGUE
|
||||
|
||||
/*
 * cdot_k: dot product of two arrays of single-precision pairs, accumulated
 * in four partial sums and returned as the pair f1/f2.
 *
 * Register use as read from the code below. The argument meaning is presumed
 * from the ELFv2 calling convention and the C prototype of cdot.c
 * (n, x, inc_x, y, inc_y) -- TODO confirm:
 *   r3 = element count (copied to r9)
 *   r4 = first array,  r5 = its stride in 8-byte elements
 *   r6 = second array, r7 = its stride in 8-byte elements
 * Each element is two floats, at byte offsets 0 and 4 (presumably real and
 * imaginary part). With a,b the two floats of an r4 element and c,d those of
 * an r6 element, the result is
 *   CONJ defined:     f1 = sum(a*c) + sum(b*d)   f2 = sum(a*d) - sum(c*b)
 *   CONJ not defined: f1 = sum(a*c) - sum(b*d)   f2 = sum(c*b) + sum(a*d)
 *
 * Paths:
 *   .L10      count <= 0, returns f1 = f2 = 0
 *   .L3/.L9   scalar loop with arbitrary strides (used unless both strides are 1)
 *   .L19/.L6  unit strides: VSX loop over the count rounded down to a multiple of 8
 *   .L4/.L8   unit strides: scalar loop over the remaining elements
 *   .L7       common epilogue combining the four partial sums
 */
cdot_k:
|
||||
.LCF0:
|
||||
/* r9 = r3; the record form also compares the value with zero in CR0. */
0: mr. 9,3
|
||||
/* Count <= 0: return zeros. */
ble 0,.L10
|
||||
/* r5 == 1: go and check r7 for the contiguous path. */
cmpdi 7,5,1
|
||||
beq 7,.L18
|
||||
/* Strided scalar path: CTR = r9 iterations; r5 and r7 are shifted left by 3
 * and then used as byte increments for r4 and r6. */
.L3:
|
||||
mtctr 9
|
||||
/* Clear the scalar accumulators: f2 = 0, then copies of it. */
xxlxor 2,2,2
|
||||
sldi 5,5,3
|
||||
sldi 7,7,3
|
||||
#ifdef CONJ
|
||||
fmr 12,2
|
||||
#endif
|
||||
fmr 8,2
|
||||
#ifndef CONJ
|
||||
fmr 9,2
|
||||
#endif
|
||||
fmr 1,2
|
||||
.p2align 4,,15
|
||||
/* One element per iteration.
 * With a = 0(r4), b = 4(r4), c = 0(r6), d = 4(r6):
 *   CONJ:     f1 += a*c   f12 += a*d   f8 += b*d   f2 += c*b
 *   non-CONJ: f1 += a*c   f8  += a*d   f9 += b*d   f2 += c*b */
.L9:
|
||||
#ifdef CONJ
|
||||
lfs 9,0(4)
|
||||
lfs 11,0(6)
|
||||
lfs 10,4(6)
|
||||
lfs 0,4(4)
|
||||
add 6,6,7
|
||||
add 4,4,5
|
||||
fmadds 1,9,11,1
|
||||
fmadds 12,9,10,12
|
||||
fmadds 8,0,10,8
|
||||
fmadds 2,11,0,2
|
||||
#else
|
||||
lfs 10,0(4)
|
||||
lfs 12,0(6)
|
||||
lfs 11,4(6)
|
||||
lfs 0,4(4)
|
||||
add 6,6,7
|
||||
add 4,4,5
|
||||
fmadds 1,10,12,1
|
||||
fmadds 8,10,11,8
|
||||
fmadds 9,0,11,9
|
||||
fmadds 2,12,0,2
|
||||
#endif
|
||||
bdnz .L9
|
||||
/* Common epilogue: combine the partial sums into the result pair.
 *   CONJ:     f2 = f12 - f2,  f1 = f1 + f8
 *   non-CONJ: f2 = f2 + f8,   f1 = f1 - f9 */
.L7:
|
||||
#ifdef CONJ
|
||||
fsubs 2,12,2
|
||||
fadds 1,1,8
|
||||
#else
|
||||
fadds 2,2,8
|
||||
fsubs 1,1,9
|
||||
#endif
|
||||
blr
|
||||
.p2align 4,,15
|
||||
/* r5 == 1 here; unless r7 is 1 as well, use the strided path. */
.L18:
|
||||
cmpdi 7,7,1
|
||||
bne 7,.L3
|
||||
/* r10 = r9 with its three low bits cleared (count rounded down to a multiple
 * of 8); a non-zero result means there is work for the vector loop. */
rldicr. 10,9,0,60
|
||||
bne 0,.L19
|
||||
/* Fewer than 8 elements: clear the scalar accumulators, set the tail offset
 * r8 to 0 and fall into the scalar tail. */
xxlxor 2,2,2
|
||||
li 8,0
|
||||
#ifdef CONJ
|
||||
fmr 12,2
|
||||
#endif
|
||||
fmr 8,2
|
||||
#ifndef CONJ
|
||||
fmr 9,2
|
||||
#endif
|
||||
fmr 1,2
|
||||
/* Scalar tail setup: CTR = r9 - r10 (elements not handled by the vector
 * loop); r4 and r6 are advanced by r8 << 2 bytes.
 * NOTE(review): the two checks that branch to .L16 (which forces CTR = 1)
 * look like compiler-generated trip-count guards -- confirm. */
.L4:
|
||||
addi 7,10,1
|
||||
sldi 8,8,2
|
||||
subf 10,10,9
|
||||
cmpd 7,7,9
|
||||
mtctr 10
|
||||
add 4,4,8
|
||||
add 6,6,8
|
||||
bgt 7,.L16
|
||||
li 10,-1
|
||||
rldicr 10,10,0,0
|
||||
cmpd 7,9,10
|
||||
beq 7,.L16
|
||||
.p2align 4,,15
|
||||
/* Same per-element update as .L9, with a fixed 8-byte step for both pointers. */
.L8:
|
||||
#ifdef CONJ
|
||||
lfs 9,0(4)
|
||||
lfs 11,0(6)
|
||||
lfs 10,4(6)
|
||||
lfs 0,4(4)
|
||||
addi 6,6,8
|
||||
addi 4,4,8
|
||||
fmadds 1,9,11,1
|
||||
fmadds 12,9,10,12
|
||||
fmadds 8,0,10,8
|
||||
fmadds 2,11,0,2
|
||||
#else
|
||||
lfs 10,0(4)
|
||||
lfs 12,0(6)
|
||||
lfs 11,4(6)
|
||||
lfs 0,4(4)
|
||||
addi 6,6,8
|
||||
addi 4,4,8
|
||||
fmadds 1,10,12,1
|
||||
fmadds 8,10,11,8
|
||||
fmadds 9,0,11,9
|
||||
fmadds 2,12,0,2
|
||||
#endif
|
||||
bdnz .L8
|
||||
b .L7
|
||||
.p2align 4,,15
|
||||
/* Non-positive count: return f1 = f2 = 0. */
.L10:
|
||||
xxlxor 1,1,1
|
||||
fmr 2,1
|
||||
blr
|
||||
/* Vector path setup:
 *   r8   -> .LANCHOR0 (swap_mask_arr), addressed relative to r2 (@toc)
 *   vs32 =  the 16-byte mask loaded from there
 *   r3   =  r10 >> 1, the number of 16-byte vectors to process
 *   vs42 =  0 */
.L19:
|
||||
addis 8,2,.LANCHOR0@toc@ha
|
||||
sradi. 3,10,1
|
||||
xxspltib 42,0
|
||||
addi 8,8,.LANCHOR0@toc@l
|
||||
lxv 32,0(8)
|
||||
beq 0,.L12
|
||||
/* Zero the other seven vector accumulators; r7 / r8 become the running
 * pointers (copies of r4 / r6) and r5 the vector counter. */
xxlor 6,42,42
|
||||
xxlor 4,42,42
|
||||
xxlor 0,42,42
|
||||
xxlor 7,42,42
|
||||
xxlor 5,42,42
|
||||
xxlor 3,42,42
|
||||
xxlor 12,42,42
|
||||
mr 7,4
|
||||
mr 8,6
|
||||
li 5,0
|
||||
.p2align 4,,15
|
||||
/* Per iteration: four 16-byte vectors from r8 (vs43, vs44, vs45, vs33) and
 * four from r7 (vs8..vs11), i.e. 64 bytes from each array; r5 advances by 4.
 * vpermr with the mask in vs32 builds copies of the r8 vectors with the two
 * 4-byte words of each 8-byte group exchanged (vs38..vs41).
 *   vs12, vs3, vs5, vs7  += r7 vector * r8 vector
 *   vs0,  vs4, vs6, vs42 += r7 vector * word-swapped r8 vector */
.L6:
|
||||
lxv 43,0(8)
|
||||
lxv 44,16(8)
|
||||
addi 5,5,4
|
||||
addi 8,8,64
|
||||
addi 7,7,64
|
||||
lxv 45,-32(8)
|
||||
lxv 33,-16(8)
|
||||
lxv 8,-64(7)
|
||||
lxv 9,-48(7)
|
||||
cmpd 7,3,5
|
||||
lxv 10,-32(7)
|
||||
lxv 11,-16(7)
|
||||
vpermr 6,11,11,0
|
||||
vpermr 7,12,12,0
|
||||
vpermr 8,13,13,0
|
||||
vpermr 9,1,1,0
|
||||
xvmaddasp 12,43,8
|
||||
xvmaddasp 3,44,9
|
||||
xvmaddasp 0,8,38
|
||||
xvmaddasp 4,9,39
|
||||
xvmaddasp 6,10,40
|
||||
xvmaddasp 5,45,10
|
||||
xvmaddasp 42,11,41
|
||||
xvmaddasp 7,33,11
|
||||
bgt 7,.L6
|
||||
/* Fold the four accumulators of each kind:
 *   vs12 = vs12 + vs3 + vs5 + vs7,   vs42 = vs0 + vs4 + vs6 + vs42 */
xvaddsp 12,12,3
|
||||
xvaddsp 0,0,4
|
||||
xvaddsp 12,12,5
|
||||
xvaddsp 0,0,6
|
||||
xvaddsp 12,12,7
|
||||
xvaddsp 42,0,42
|
||||
/* Horizontal reduction of vs12 and vs42 into the scalar accumulators that
 * .L8 and .L7 use (CONJ: f1, f8 from vs12 and f12, f2 from vs42;
 * non-CONJ: f1, f9 from vs12 and f8, f2 from vs42).
 * Interleaved with it: cmpd compares r9 (count) with r10 (vectorised count)
 * for the branch below, and r8 = r10 << 1 is the tail offset consumed at .L4. */
.L5:
|
||||
#ifdef CONJ
|
||||
xxpermdi 8,12,12,2
|
||||
xxpermdi 0,42,42,2
|
||||
cmpd 7,9,10
|
||||
sldi 8,10,1
|
||||
xvaddsp 8,8,12
|
||||
xvaddsp 0,0,42
|
||||
xxsldwi 1,8,8,3
|
||||
xxsldwi 12,0,0,3
|
||||
xxsldwi 8,8,8,2
|
||||
xxsldwi 0,0,0,2
|
||||
xscvspdp 1,1
|
||||
xscvspdp 12,12
|
||||
xscvspdp 8,8
|
||||
#else
|
||||
xxpermdi 9,12,12,2
|
||||
xxpermdi 0,42,42,2
|
||||
cmpd 7,9,10
|
||||
sldi 8,10,1
|
||||
xvaddsp 9,9,12
|
||||
xvaddsp 0,0,42
|
||||
xxsldwi 1,9,9,3
|
||||
xxsldwi 2,0,0,3
|
||||
xxsldwi 9,9,9,2
|
||||
xxsldwi 0,0,0,2
|
||||
xscvspdp 8,2
|
||||
xscvspdp 1,1
|
||||
xscvspdp 9,9
|
||||
#endif
|
||||
xscvspdp 2,0
|
||||
/* Elements left over (r9 > r10): run the scalar tail; otherwise finish. */
bgt 7,.L4
|
||||
b .L7
|
||||
/* r3 was 0: no vector iterations; vs12 = 0 (vs42 is already 0). */
.L12:
|
||||
xxlor 12,42,42
|
||||
b .L5
|
||||
/* Force CTR = 1 and enter the scalar tail loop. */
.L16:
|
||||
li 9,1
|
||||
mtctr 9
|
||||
b .L8
|
||||
.long 0
|
||||
.byte 0,0,0,0,0,0,0,0
|
||||
.size cdot_k,.-cdot_k
|
||||
/*
 * swap_mask_arr: 16-byte read-only permute control, 16-byte aligned
 * (.align 4). cdot_k addresses it through the .LANCHOR0 alias defined just
 * before it, loads it with "lxv 32,0(8)" and uses it as the control operand
 * of its vpermr instructions.
 * The indices 4,5,6,7,0,1,2,3 / 12,13,14,15,8,9,10,11 exchange the two
 * 4-byte words inside each 8-byte group (presumably the real and imaginary
 * float of one complex element -- confirm).
 */
.section .rodata
|
||||
.align 4
|
||||
.set .LANCHOR0,. + 0
|
||||
.type swap_mask_arr, @object
|
||||
.size swap_mask_arr, 16
|
||||
swap_mask_arr:
|
||||
.byte 4
|
||||
.byte 5
|
||||
.byte 6
|
||||
.byte 7
|
||||
.byte 0
|
||||
.byte 1
|
||||
.byte 2
|
||||
.byte 3
|
||||
.byte 12
|
||||
.byte 13
|
||||
.byte 14
|
||||
.byte 15
|
||||
.byte 8
|
||||
.byte 9
|
||||
.byte 10
|
||||
.byte 11
|
||||
.ident "GCC: (SUSE Linux) 7.3.1 20180323 [gcc-7-branch revision 258812]"
|
||||
.section .note.GNU-stack,"",@progbits
|
File diff suppressed because it is too large
Load Diff
|
@ -38,7 +38,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
* Macros for N=4 and M=8
|
||||
**********************************************************************************************/
|
||||
|
||||
#if defined(_AIX)
|
||||
define(`COPY_4x8', `
|
||||
#else
|
||||
.macro COPY_4x8
|
||||
#endif
|
||||
|
||||
lxvw4x vs32, o0, A0
|
||||
lxvw4x vs33, o16, A0
|
||||
|
@ -93,13 +97,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
stxvw4x vs46, o32, T1
|
||||
stxvw4x vs47, o48, T1
|
||||
|
||||
#if defined(_AIX)
|
||||
')
|
||||
#else
|
||||
.endm
|
||||
#endif
|
||||
|
||||
/**********************************************************************************************
|
||||
* Macros for N=4 and M=4
|
||||
**********************************************************************************************/
|
||||
|
||||
#if defined(_AIX)
|
||||
define(`COPY_4x4', `
|
||||
#else
|
||||
.macro COPY_4x4
|
||||
#endif
|
||||
|
||||
lxvw4x vs32, o0, A0
|
||||
lxvw4x vs33, o16, A0
|
||||
|
@ -133,13 +145,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
stxvw4x vs38, o32, T1
|
||||
stxvw4x vs39, o48, T1
|
||||
|
||||
#if defined(_AIX)
|
||||
')
|
||||
#else
|
||||
.endm
|
||||
#endif
|
||||
|
||||
/**********************************************************************************************
|
||||
* Macros for N=4 and M=2
|
||||
**********************************************************************************************/
|
||||
|
||||
#if defined(_AIX)
|
||||
define(`COPY_4x2', `
|
||||
#else
|
||||
.macro COPY_4x2
|
||||
#endif
|
||||
|
||||
lxvw4x vs32, o0, A0
|
||||
addi A0, A0, 16
|
||||
|
@ -163,13 +183,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
stxvw4x vs35, o48, T1
|
||||
|
||||
#if defined(_AIX)
|
||||
')
|
||||
#else
|
||||
.endm
|
||||
#endif
|
||||
|
||||
/**********************************************************************************************
|
||||
* Macros for N=4 and M=1
|
||||
**********************************************************************************************/
|
||||
|
||||
#if defined(_AIX)
|
||||
define(`COPY_4x1', `
|
||||
#else
|
||||
.macro COPY_4x1
|
||||
#endif
|
||||
|
||||
lxsspx vs32, o0, A0
|
||||
lxsspx vs33, o4, A0
|
||||
|
@ -207,13 +235,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
stxsspx vs38, o0, T1
|
||||
stxsspx vs39, o4, T1
|
||||
|
||||
#if defined(_AIX)
|
||||
')
|
||||
#else
|
||||
.endm
|
||||
#endif
|
||||
|
||||
/**********************************************************************************************
|
||||
* Macros for N=2 and M=8
|
||||
**********************************************************************************************/
|
||||
|
||||
#if defined(_AIX)
|
||||
define(`COPY_2x8', `
|
||||
#else
|
||||
.macro COPY_2x8
|
||||
#endif
|
||||
|
||||
lxvw4x vs32, o0, A0
|
||||
lxvw4x vs33, o16, A0
|
||||
|
@ -241,13 +277,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
stxvw4x vs38, o32, T1
|
||||
stxvw4x vs39, o48, T1
|
||||
|
||||
#if defined(_AIX)
|
||||
')
|
||||
#else
|
||||
.endm
|
||||
#endif
|
||||
|
||||
/**********************************************************************************************
|
||||
* Macros for N=2 and M=4
|
||||
**********************************************************************************************/
|
||||
|
||||
#if defined(_AIX)
|
||||
define(`COPY_2x4', `
|
||||
#else
|
||||
.macro COPY_2x4
|
||||
#endif
|
||||
|
||||
lxvw4x vs32, o0, A0
|
||||
lxvw4x vs33, o16, A0
|
||||
|
@ -265,13 +309,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
stxvw4x vs34, o32, T1
|
||||
stxvw4x vs35, o48, T1
|
||||
|
||||
#if defined(_AIX)
|
||||
')
|
||||
#else
|
||||
.endm
|
||||
#endif
|
||||
|
||||
/**********************************************************************************************
|
||||
* Macros for N=2 and M=2
|
||||
**********************************************************************************************/
|
||||
|
||||
#if defined(_AIX)
|
||||
define(`COPY_2x2', `
|
||||
#else
|
||||
.macro COPY_2x2
|
||||
#endif
|
||||
|
||||
lxvw4x vs32, o0, A0
|
||||
addi A0, A0, 16
|
||||
|
@ -285,13 +337,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
stxvw4x vs33, o16, T1
|
||||
|
||||
#if defined(_AIX)
|
||||
')
|
||||
#else
|
||||
.endm
|
||||
#endif
|
||||
|
||||
/**********************************************************************************************
|
||||
* Macros for N=2 and M=1
|
||||
**********************************************************************************************/
|
||||
|
||||
#if defined(_AIX)
|
||||
define(`COPY_2x1', `
|
||||
#else
|
||||
.macro COPY_2x1
|
||||
#endif
|
||||
|
||||
lxsspx vs32, o0, A0
|
||||
lxsspx vs33, o4, A0
|
||||
|
@ -311,13 +371,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
stxsspx vs34, o0, T1
|
||||
stxsspx vs35, o4, T1
|
||||
|
||||
#if defined(_AIX)
|
||||
')
|
||||
#else
|
||||
.endm
|
||||
#endif
|
||||
|
||||
/**********************************************************************************************
|
||||
* Macros for N=1 and M=8
|
||||
**********************************************************************************************/
|
||||
|
||||
#if defined(_AIX)
|
||||
define(`COPY_1x8', `
|
||||
#else
|
||||
.macro COPY_1x8
|
||||
#endif
|
||||
|
||||
lxvw4x vs32, o0, A0
|
||||
lxvw4x vs33, o16, A0
|
||||
|
@ -332,13 +400,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
stxvw4x vs34, o32, T1
|
||||
stxvw4x vs35, o48, T1
|
||||
|
||||
#if defined(_AIX)
|
||||
')
|
||||
#else
|
||||
.endm
|
||||
#endif
|
||||
|
||||
/**********************************************************************************************
|
||||
* Macros for N=1 and M=4
|
||||
**********************************************************************************************/
|
||||
|
||||
#if defined(_AIX)
|
||||
define(`COPY_1x4', `
|
||||
#else
|
||||
.macro COPY_1x4
|
||||
#endif
|
||||
|
||||
lxvw4x vs32, o0, A0
|
||||
lxvw4x vs33, o16, A0
|
||||
|
@ -349,13 +425,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
stxvw4x vs32, o0, T1
|
||||
stxvw4x vs33, o16, T1
|
||||
|
||||
#if defined(_AIX)
|
||||
')
|
||||
#else
|
||||
.endm
|
||||
#endif
|
||||
|
||||
/**********************************************************************************************
|
||||
* Macros for N=1 and M=2
|
||||
**********************************************************************************************/
|
||||
|
||||
#if defined(_AIX)
|
||||
define(`COPY_1x2', `
|
||||
#else
|
||||
.macro COPY_1x2
|
||||
#endif
|
||||
|
||||
lxvw4x vs32, o0, A0
|
||||
addi A0, A0, 16
|
||||
|
@ -364,13 +448,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
stxvw4x vs32, o0, T1
|
||||
|
||||
#if defined(_AIX)
|
||||
')
|
||||
#else
|
||||
.endm
|
||||
#endif
|
||||
|
||||
/**********************************************************************************************
|
||||
* Macros for N=1 and M=1
|
||||
**********************************************************************************************/
|
||||
|
||||
#if defined(_AIX)
|
||||
define(`COPY_1x1', `
|
||||
#else
|
||||
.macro COPY_1x1
|
||||
#endif
|
||||
|
||||
lxsspx vs32, o0, A0
|
||||
lxsspx vs33, o4, A0
|
||||
|
@ -381,5 +473,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
stxsspx vs32, o0, T1
|
||||
stxsspx vs33, o4, T1
|
||||
|
||||
#if defined(_AIX)
|
||||
')
|
||||
#else
|
||||
.endm
|
||||
#endif
|
||||
|
||||
|
|
|
@ -32,7 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define NBMAX 1024
|
||||
|
||||
|
||||
static const unsigned char swap_mask_arr[]={ 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11};
|
||||
static const unsigned char __attribute__((aligned(16))) swap_mask_arr[]={ 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11};
|
||||
|
||||
|
||||
static void cgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) {
|
||||
|
@ -62,23 +62,24 @@ static void cgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA
|
|||
register __vector float vx3_r = {x[6], -x[6],x[6], -x[6]};
|
||||
register __vector float vx3_i = {x[7], x[7],x[7], x[7]};
|
||||
#endif
|
||||
register __vector float *vy = (__vector float *) y;
|
||||
register __vector float *vptr_y = (__vector float *) y;
|
||||
register __vector float *vptr_a0 = (__vector float *) a0;
|
||||
register __vector float *vptr_a1 = (__vector float *) a1;
|
||||
register __vector float *vptr_a2 = (__vector float *) a2;
|
||||
register __vector float *vptr_a3 = (__vector float *) a3;
|
||||
BLASLONG i = 0;
|
||||
for (;i< n / 2; i+=2) {
|
||||
register __vector float vy_0 = vy[i];
|
||||
register __vector float vy_1 = vy[i + 1];
|
||||
register __vector float va0 = vptr_a0[i];
|
||||
register __vector float va1 = vptr_a1[i];
|
||||
register __vector float va2 = vptr_a2[i];
|
||||
register __vector float va3 = vptr_a3[i];
|
||||
register __vector float va0_1 = vptr_a0[i + 1];
|
||||
register __vector float va1_1 = vptr_a1[i + 1];
|
||||
register __vector float va2_1 = vptr_a2[i + 1];
|
||||
register __vector float va3_1 = vptr_a3[i + 1];
|
||||
BLASLONG i2=16;
|
||||
for (;i< n * 8; i+=32,i2+=32) {
|
||||
register __vector float vy_0 = vec_vsx_ld(i,vptr_y);
|
||||
register __vector float vy_1 = vec_vsx_ld(i2,vptr_y);
|
||||
register __vector float va0 = vec_vsx_ld(i,vptr_a0);
|
||||
register __vector float va1 = vec_vsx_ld(i, vptr_a1);
|
||||
register __vector float va2 = vec_vsx_ld(i ,vptr_a2);
|
||||
register __vector float va3 = vec_vsx_ld(i ,vptr_a3);
|
||||
register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0);
|
||||
register __vector float va1_1 = vec_vsx_ld(i2 ,vptr_a1);
|
||||
register __vector float va2_1 = vec_vsx_ld(i2 ,vptr_a2);
|
||||
register __vector float va3_1 = vec_vsx_ld(i2 ,vptr_a3);
|
||||
|
||||
vy_0 += va0*vx0_r + va1*vx1_r + va2*vx2_r + va3*vx3_r;
|
||||
vy_1 += va0_1*vx0_r + va1_1*vx1_r + va2_1*vx2_r + va3_1*vx3_r;
|
||||
|
@ -93,8 +94,8 @@ static void cgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA
|
|||
vy_0 += va0*vx0_i + va1*vx1_i + va2*vx2_i + va3*vx3_i;
|
||||
vy_1 += va0_1*vx0_i + va1_1*vx1_i + va2_1*vx2_i + va3_1*vx3_i;
|
||||
|
||||
vy[i] = vy_0;
|
||||
vy[i + 1] = vy_1;
|
||||
vec_vsx_st(vy_0 ,i, vptr_y);
|
||||
vec_vsx_st(vy_1,i2,vptr_y);
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -118,17 +119,19 @@ static void cgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA
|
|||
register __vector float vx1_r = {x[2], -x[2],x[2], -x[2]};
|
||||
register __vector float vx1_i = {x[3], x[3],x[3], x[3]};
|
||||
#endif
|
||||
register __vector float *vy = (__vector float *) y;
|
||||
register __vector float *vptr_y = (__vector float *) y;
|
||||
register __vector float *vptr_a0 = (__vector float *) a0;
|
||||
register __vector float *vptr_a1 = (__vector float *) a1;
|
||||
BLASLONG i = 0;
|
||||
for (;i< n / 2; i+=2) {
|
||||
register __vector float vy_0 = vy[i];
|
||||
register __vector float vy_1 = vy[i + 1];
|
||||
register __vector float va0 = vptr_a0[i];
|
||||
register __vector float va1 = vptr_a1[i];
|
||||
register __vector float va0_1 = vptr_a0[i + 1];
|
||||
register __vector float va1_1 = vptr_a1[i + 1];
|
||||
BLASLONG i = 0;
|
||||
BLASLONG i2 = 16;
|
||||
for (;i< n * 8; i+=32, i2+=32) {
|
||||
register __vector float vy_0 = vec_vsx_ld(i,vptr_y);
|
||||
register __vector float vy_1 = vec_vsx_ld(i2,vptr_y);
|
||||
register __vector float va0 = vec_vsx_ld(i,vptr_a0);
|
||||
register __vector float va1 = vec_vsx_ld(i, vptr_a1);
|
||||
register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0);
|
||||
register __vector float va1_1 = vec_vsx_ld(i2 ,vptr_a1);
|
||||
|
||||
register __vector float va0x = vec_perm(va0, va0,swap_mask);
|
||||
register __vector float va0x_1 = vec_perm(va0_1, va0_1,swap_mask);
|
||||
register __vector float va1x = vec_perm(va1, va1,swap_mask);
|
||||
|
@ -136,8 +139,8 @@ static void cgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA
|
|||
vy_0 += va0*vx0_r + va1*vx1_r + va0x*vx0_i + va1x*vx1_i;
|
||||
vy_1 += va0_1*vx0_r + va1_1*vx1_r + va0x_1*vx0_i + va1x_1*vx1_i;
|
||||
|
||||
vy[i] = vy_0;
|
||||
vy[i + 1] = vy_1;
|
||||
vec_vsx_st(vy_0 ,i, vptr_y);
|
||||
vec_vsx_st(vy_1,i2,vptr_y);
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -154,21 +157,23 @@ static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) {
|
|||
register __vector float vx0_r = {x[0], -x[0],x[0], -x[0]};
|
||||
register __vector float vx0_i = {x[1], x[1],x[1], x[1]};
|
||||
#endif
|
||||
register __vector float *vy = (__vector float *) y;
|
||||
register __vector float *vptr_y = (__vector float *) y;
|
||||
register __vector float *vptr_a0 = (__vector float *) ap;
|
||||
BLASLONG i = 0;
|
||||
for (;i< n / 2; i+=2) {
|
||||
register __vector float vy_0 = vy[i];
|
||||
register __vector float vy_1 = vy[i + 1];
|
||||
register __vector float va0 = vptr_a0[i];
|
||||
register __vector float va0_1 = vptr_a0[i + 1];
|
||||
BLASLONG i2 = 16;
|
||||
for (;i< n * 8; i+=32, i2+=32) {
|
||||
register __vector float vy_0 = vec_vsx_ld(i,vptr_y);
|
||||
register __vector float vy_1 = vec_vsx_ld(i2,vptr_y);
|
||||
register __vector float va0 = vec_vsx_ld(i,vptr_a0);
|
||||
register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0);
|
||||
|
||||
register __vector float va0x = vec_perm(va0, va0,swap_mask);
|
||||
register __vector float va0x_1 = vec_perm(va0_1, va0_1,swap_mask);
|
||||
vy_0 += va0*vx0_r + va0x*vx0_i;
|
||||
vy_1 += va0_1*vx0_r + va0x_1*vx0_i;
|
||||
|
||||
vy[i] = vy_0;
|
||||
vy[i + 1] = vy_1;
|
||||
vec_vsx_st(vy_0 ,i, vptr_y);
|
||||
vec_vsx_st(vy_1,i2,vptr_y);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -176,7 +181,7 @@ static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) {
|
|||
|
||||
|
||||
static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT alpha_r, FLOAT alpha_i) {
|
||||
BLASLONG i;
|
||||
BLASLONG i=0;
|
||||
|
||||
|
||||
if (inc_dest != 2) {
|
||||
|
@ -213,20 +218,24 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT
|
|||
|
||||
register __vector float *vptr_src = (__vector float *) src;
|
||||
register __vector float *vptr_y = (__vector float *) dest;
|
||||
for (i = 0; i < n/2; i += 2 ){
|
||||
|
||||
register __vector float vy_0 = vptr_y[i];
|
||||
register __vector float vy_1 = vptr_y[i +1];
|
||||
BLASLONG i2 = 16;
|
||||
for (;i< n * 8; i+=32, i2+=32) {
|
||||
register __vector float vy_0 = vec_vsx_ld(i,vptr_y);
|
||||
register __vector float vy_1 = vec_vsx_ld(i2,vptr_y);
|
||||
|
||||
register __vector float vsrc = vptr_src[i];
|
||||
register __vector float vsrc_1 = vptr_src[i + 1];
|
||||
register __vector float vsrcx = vec_perm(vsrc, vsrc, swap_mask);
|
||||
register __vector float vsrcx_1 = vec_perm(vsrc_1, vsrc_1, swap_mask);
|
||||
|
||||
vy_0 += vsrc*valpha_r + vsrcx*valpha_i;
|
||||
vy_1 += vsrc_1*valpha_r + vsrcx_1*valpha_i;
|
||||
vptr_y[i] = vy_0;
|
||||
vptr_y[i+1 ] = vy_1;
|
||||
register __vector float vsrc = vec_vsx_ld(i,vptr_src);
|
||||
register __vector float vsrc_1 = vec_vsx_ld(i2,vptr_src);
|
||||
|
||||
register __vector float vsrcx = vec_perm(vsrc, vsrc, swap_mask);
|
||||
register __vector float vsrcx_1 = vec_perm(vsrc_1, vsrc_1, swap_mask);
|
||||
|
||||
vy_0 += vsrc*valpha_r + vsrcx*valpha_i;
|
||||
vy_1 += vsrc_1*valpha_r + vsrcx_1*valpha_i;
|
||||
|
||||
vec_vsx_st(vy_0 ,i, vptr_y);
|
||||
vec_vsx_st(vy_1,i2,vptr_y);
|
||||
|
||||
}
|
||||
|
||||
|
@ -237,7 +246,7 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT
|
|||
|
||||
|
||||
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT * buffer) {
|
||||
BLASLONG i;
|
||||
BLASLONG i=0;
|
||||
FLOAT *a_ptr;
|
||||
FLOAT *x_ptr;
|
||||
FLOAT *y_ptr;
|
||||
|
@ -247,8 +256,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
|
|||
BLASLONG m2;
|
||||
BLASLONG m3;
|
||||
BLASLONG n2;
|
||||
|
||||
FLOAT xbuffer[8], *ybuffer;
|
||||
FLOAT xbuffer[8] __attribute__((aligned(16)));
|
||||
FLOAT *ybuffer;
|
||||
|
||||
if (m < 1) return (0);
|
||||
if (n < 1) return (0);
|
||||
|
|
|
@ -29,10 +29,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#define NBMAX 1024
|
||||
#include <altivec.h>
|
||||
static const unsigned char swap_mask_arr[]={ 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11};
|
||||
static const unsigned char __attribute__((aligned(16))) swap_mask_arr[]={ 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11};
|
||||
|
||||
static void cgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) {
|
||||
BLASLONG i;
|
||||
|
||||
FLOAT *a0, *a1, *a2, *a3;
|
||||
a0 = ap;
|
||||
a1 = ap + lda;
|
||||
|
@ -48,26 +48,39 @@ static void cgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA
|
|||
register __vector float vtemp2_r = {0.0, 0.0,0.0,0.0};
|
||||
register __vector float vtemp3_p = {0.0, 0.0,0.0,0.0};
|
||||
register __vector float vtemp3_r = {0.0, 0.0,0.0,0.0};
|
||||
__vector float* va0 = (__vector float*) a0;
|
||||
__vector float* va1 = (__vector float*) a1;
|
||||
__vector float* va2 = (__vector float*) a2;
|
||||
__vector float* va3 = (__vector float*) a3;
|
||||
__vector float* vptr_a0 = (__vector float*) a0;
|
||||
__vector float* vptr_a1 = (__vector float*) a1;
|
||||
__vector float* vptr_a2 = (__vector float*) a2;
|
||||
__vector float* vptr_a3 = (__vector float*) a3;
|
||||
__vector float* v_x = (__vector float*) x;
|
||||
|
||||
for (i = 0; i < n / 2; i+=2) {
|
||||
register __vector float vx_0 = v_x[i];
|
||||
register __vector float vx_1 = v_x[i+1];
|
||||
BLASLONG i = 0;
|
||||
BLASLONG i2 = 16;
|
||||
for (;i< n * 8; i+=32, i2+=32) {
|
||||
register __vector float vx_0 = vec_vsx_ld( i,v_x) ;
|
||||
register __vector float vx_1 = vec_vsx_ld(i2, v_x);
|
||||
|
||||
register __vector float vxr_0 = vec_perm(vx_0, vx_0, swap_mask);
|
||||
register __vector float vxr_1 = vec_perm(vx_1, vx_1, swap_mask);
|
||||
|
||||
vtemp0_p += vx_0*va0[i] + vx_1*va0[i+1] ;
|
||||
vtemp0_r += vxr_0*va0[i] + vxr_1*va0[i+1];
|
||||
vtemp1_p += vx_0*va1[i] + vx_1*va1[i+1];
|
||||
vtemp1_r += vxr_0*va1[i] + vxr_1*va1[i+1];
|
||||
vtemp2_p += vx_0*va2[i] + vx_1*va2[i+1];
|
||||
vtemp2_r += vxr_0*va2[i] + vxr_1*va2[i+1];
|
||||
vtemp3_p += vx_0*va3[i] + vx_1*va3[i+1];
|
||||
vtemp3_r += vxr_0*va3[i] + vxr_1*va3[i+1];
|
||||
register __vector float va0 = vec_vsx_ld(i,vptr_a0);
|
||||
register __vector float va1 = vec_vsx_ld(i, vptr_a1);
|
||||
register __vector float va2 = vec_vsx_ld(i ,vptr_a2);
|
||||
register __vector float va3 = vec_vsx_ld(i ,vptr_a3);
|
||||
register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0);
|
||||
register __vector float va1_1 = vec_vsx_ld(i2 ,vptr_a1);
|
||||
register __vector float va2_1 = vec_vsx_ld(i2 ,vptr_a2);
|
||||
register __vector float va3_1 = vec_vsx_ld(i2 ,vptr_a3);
|
||||
|
||||
|
||||
vtemp0_p += vx_0*va0 + vx_1*va0_1 ;
|
||||
vtemp0_r += vxr_0*va0 + vxr_1*va0_1;
|
||||
vtemp1_p += vx_0*va1 + vx_1*va1_1;
|
||||
vtemp1_r += vxr_0*va1 + vxr_1*va1_1;
|
||||
vtemp2_p += vx_0*va2 + vx_1*va2_1;
|
||||
vtemp2_r += vxr_0*va2 + vxr_1*va2_1;
|
||||
vtemp3_p += vx_0*va3 + vx_1*va3_1;
|
||||
vtemp3_r += vxr_0*va3 + vxr_1*va3_1;
|
||||
|
||||
}
|
||||
|
||||
|
@ -128,7 +141,7 @@ static void cgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA
|
|||
|
||||
|
||||
static void cgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) {
|
||||
BLASLONG i;
|
||||
|
||||
FLOAT *a0, *a1;
|
||||
a0 = ap;
|
||||
a1 = ap + lda;
|
||||
|
@ -138,23 +151,33 @@ static void cgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA
|
|||
register __vector float vtemp0_r = {0.0, 0.0,0.0,0.0};
|
||||
register __vector float vtemp1_p = {0.0, 0.0,0.0,0.0};
|
||||
register __vector float vtemp1_r = {0.0, 0.0,0.0,0.0};
|
||||
__vector float* va0 = (__vector float*) a0;
|
||||
__vector float* va1 = (__vector float*) a1;
|
||||
|
||||
|
||||
__vector float* vptr_a0 = (__vector float*) a0;
|
||||
__vector float* vptr_a1 = (__vector float*) a1;
|
||||
__vector float* v_x = (__vector float*) x;
|
||||
|
||||
for (i = 0; i < n / 2; i+=2) {
|
||||
register __vector float vx_0 = v_x[i];
|
||||
register __vector float vx_1 = v_x[i+1];
|
||||
BLASLONG i = 0;
|
||||
BLASLONG i2 = 16;
|
||||
for (;i< n * 8; i+=32, i2+=32) {
|
||||
register __vector float vx_0 = vec_vsx_ld( i,v_x) ;
|
||||
register __vector float vx_1 = vec_vsx_ld(i2, v_x);
|
||||
|
||||
register __vector float vxr_0 = vec_perm(vx_0, vx_0, swap_mask);
|
||||
register __vector float vxr_1 = vec_perm(vx_1, vx_1, swap_mask);
|
||||
|
||||
vtemp0_p += vx_0*va0[i] + vx_1*va0[i+1] ;
|
||||
vtemp0_r += vxr_0*va0[i] + vxr_1*va0[i+1];
|
||||
vtemp1_p += vx_0*va1[i] + vx_1*va1[i+1];
|
||||
vtemp1_r += vxr_0*va1[i] + vxr_1*va1[i+1];
|
||||
register __vector float va0 = vec_vsx_ld(i,vptr_a0);
|
||||
register __vector float va1 = vec_vsx_ld(i, vptr_a1);
|
||||
register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0);
|
||||
register __vector float va1_1 = vec_vsx_ld(i2 ,vptr_a1);
|
||||
|
||||
|
||||
vtemp0_p += vx_0*va0 + vx_1*va0_1 ;
|
||||
vtemp0_r += vxr_0*va0 + vxr_1*va0_1;
|
||||
vtemp1_p += vx_0*va1 + vx_1*va1_1;
|
||||
vtemp1_r += vxr_0*va1 + vxr_1*va1_1;
|
||||
|
||||
}
|
||||
|
||||
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
||||
|
||||
register FLOAT temp_r0 = vtemp0_p[0] - vtemp0_p[1] + vtemp0_p[2] - vtemp0_p[3];
|
||||
|
@ -193,23 +216,27 @@ static void cgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA
|
|||
|
||||
|
||||
static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) {
|
||||
BLASLONG i;
|
||||
|
||||
__vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr);
|
||||
//p for positive(real*real,image*image,real*real,image*image) r for image (real*image,image*real,real*image,image*real)
|
||||
register __vector float vtemp0_p = {0.0, 0.0,0.0,0.0};
|
||||
register __vector float vtemp0_r = {0.0, 0.0,0.0,0.0};
|
||||
__vector float* va0 = (__vector float*) ap;
|
||||
__vector float* vptr_a0 = (__vector float*) ap;
|
||||
__vector float* v_x = (__vector float*) x;
|
||||
|
||||
for (i = 0; i < n / 2; i+=2) {
|
||||
register __vector float vx_0 = v_x[i];
|
||||
register __vector float vx_1 = v_x[i+1];
|
||||
BLASLONG i = 0;
|
||||
BLASLONG i2 = 16;
|
||||
for (;i< n * 8; i+=32, i2+=32) {
|
||||
register __vector float vx_0 = vec_vsx_ld( i,v_x) ;
|
||||
register __vector float vx_1 = vec_vsx_ld(i2, v_x);
|
||||
|
||||
register __vector float vxr_0 = vec_perm(vx_0, vx_0, swap_mask);
|
||||
register __vector float vxr_1 = vec_perm(vx_1, vx_1, swap_mask);
|
||||
|
||||
vtemp0_p += vx_0*va0[i] + vx_1*va0[i+1] ;
|
||||
vtemp0_r += vxr_0*va0[i] + vxr_1*va0[i+1];
|
||||
register __vector float va0 = vec_vsx_ld(i,vptr_a0);
|
||||
register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0);
|
||||
|
||||
vtemp0_p += vx_0*va0 + vx_1*va0_1 ;
|
||||
vtemp0_r += vxr_0*va0 + vxr_1*va0_1;
|
||||
}
|
||||
|
||||
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
||||
|
@ -249,8 +276,8 @@ static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) {
|
|||
}
|
||||
|
||||
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) {
|
||||
BLASLONG i;
|
||||
BLASLONG j;
|
||||
BLASLONG i=0;
|
||||
BLASLONG j=0;
|
||||
FLOAT *a_ptr;
|
||||
FLOAT *x_ptr;
|
||||
FLOAT *y_ptr;
|
||||
|
@ -260,8 +287,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
|
|||
BLASLONG m2;
|
||||
BLASLONG m3;
|
||||
BLASLONG n2;
|
||||
|
||||
FLOAT ybuffer[8], *xbuffer;
|
||||
FLOAT ybuffer[8] __attribute__((aligned(16)));
|
||||
FLOAT *xbuffer;
|
||||
|
||||
if (m < 1) return (0);
|
||||
if (n < 1) return (0);
|
||||
|
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue