Merge pull request #3150 from xianyi/develop

Update branch from develop for 0.3.14 release
This commit is contained in:
Martin Kroeker 2021-03-17 20:21:42 +01:00 committed by GitHub
commit 2f6d35c3d4
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
102 changed files with 4895 additions and 433 deletions

View File

@ -190,3 +190,27 @@ steps:
- make -C ctest $COMMON_FLAGS
- make -C utest $COMMON_FLAGS
- make -C cpp_thread_test dgemm_tester
---
kind: pipeline
name: arm64_gcc10
platform:
os: linux
arch: arm64
steps:
- name: Build and Test
image: ubuntu:20.04
environment:
CC: gcc-10
FC: gfortran-10
COMMON_FLAGS: 'TARGET=ARMV8 DYNAMIC_ARCH=1'
commands:
- echo "MAKE_FLAGS:= $COMMON_FLAGS"
- apt-get update -y
- apt-get install -y make $CC gfortran-10 perl python g++
- $CC --version
- make QUIET_MAKE=1 $COMMON_FLAGS
- make -C utest $COMMON_FLAGS
- make -C test $COMMON_FLAGS

View File

@ -44,6 +44,11 @@ jobs:
if: github.event_name != 'pull_request'
run: brew update || true
- name: unlink installed gcc to allow updating
run: |
brew unlink gcc@8
brew unlink gcc@9
- name: Install prerequisites
run: brew install --fetch-HEAD --HEAD --only-dependencies --keep-tmp openblas

2
.gitignore vendored
View File

@ -89,5 +89,7 @@ build.*
*.swp
benchmark/*.goto
benchmark/smallscaling
.vscode
CMakeCache.txt
CMakeFiles/*
.vscode

View File

@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5)
project(OpenBLAS C ASM)
set(OpenBLAS_MAJOR_VERSION 0)
set(OpenBLAS_MINOR_VERSION 3)
set(OpenBLAS_PATCH_VERSION 13)
set(OpenBLAS_PATCH_VERSION 14)
set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
# Adhere to GNU filesystem layout conventions
@ -14,6 +14,9 @@ include(GNUInstallDirs)
include(CMakePackageConfigHelpers)
if(MSVC AND NOT DEFINED NOFORTRAN)
set(NOFORTRAN ON)
endif()
#######
if(MSVC)
@ -229,7 +232,7 @@ if (NOT NO_CBLAS)
add_subdirectory(utest)
endif()
if (NOT MSVC AND NOT NOFORTRAN)
if (NOT NOFORTRAN)
# Build test and ctest
add_subdirectory(test)
if(NOT NO_CBLAS)

View File

@ -1,4 +1,52 @@
OpenBLAS ChangeLog
====================================================================
Version 0.3.14
17-Mar-2021
common:
* Fixed a race condition on thread shutdown in non-OpenMP builds
* Fixed custom BUFFERSIZE option getting ignored in gmake builds
* Fixed CMake compilation of the TRMM kernels for GENERIC platforms
* Added CBLAS interfaces for CROTG, ZROTG, CSROT and ZDROT
* Improved performance of OMATCOPY_RT across all platforms
* Changed perl scripts to use env instead of a hardcoded /usr/bin/perl
* Fixed potential misreading of the GCC compiler version in the build scripts
* Fixed convergence problems in LAPACK complex GGEV/GGES (Reference-LAPACK #477)
* Reduced the stacksize requirements for running the LAPACK testsuite (Reference-LAPACK #335)
RISCV:
* Fixed compilation on RISCV (missing entry in getarch)
POWER:
* Fixed compilation for DYNAMIC_ARCH with clang and with old gcc versions
* Added support for compilation on FreeBSD/ppc64le
* Added optimized POWER10 kernels for SSCAL, DSCAL, CSCAL, ZSCAL
* Added optimized POWER10 kernels for SROT, DROT, CDOT, SASUM, DASUM
* Improved SSWAP, DSWAP, CSWAP, ZSWAP performance on POWER10
* Improved SCOPY and CCOPY performance on POWER10
* Improved SGEMM and DGEMM performance on POWER10
* Added support for compilation with the NVIDIA HPC compiler
x86_64:
* Added an optimized bfloat16 GEMM kernel for Cooperlake
* Added CPUID autodetection for Intel Rocket Lake and Tiger Lake CPUs
* Improved the performance of SASUM, DASUM, SROT, DROT on AMD Ryzen CPUs
* Added support for compilation with the NAG Fortran compiler
* Fixed recognition of the AMD AOCC compiler
* Fixed compilation for DYNAMIC_ARCH with clang on Windows
* Added support for running the BLAS/CBLAS tests on Windows
* Fixed signatures of the tls callback functions for Windows x64
* Fixed various issues with fma intrinsics support handling
ARM:
* Added support for embedded Cortex M targets via a new option EMBEDDED
ARMV8:
* Fixed the THUNDERX2T99 and NEOVERSEN1 DNRM2/ZNRM2 kernels for inputs with Inf
* Added support for the DYNAMIC_LIST option
* Added support for compilation with the NVIDIA HPC compiler
* Added support for compiling with the NAG Fortran compiler
====================================================================
Version 0.3.13
12-Dec-2020

View File

@ -59,6 +59,9 @@ endif
@$(CC) --version > /dev/null 2>&1;\
if [ $$? -eq 0 ]; then \
cverinfo=`$(CC) --version | sed -n '1p'`; \
if [ -z "$${cverinfo}" ]; then \
cverinfo=`$(CC) --version | sed -n '2p'`; \
fi; \
echo " C compiler ... $(C_COMPILER) (cmd & version : $${cverinfo})";\
else \
echo " C compiler ... $(C_COMPILER) (command line : $(CC))";\
@ -67,6 +70,9 @@ ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
@$(FC) --version > /dev/null 2>&1;\
if [ $$? -eq 0 ]; then \
fverinfo=`$(FC) --version | sed -n '1p'`; \
if [ -z "$${fverinfo}" ]; then \
fverinfo=`$(FC) --version | sed -n '2p'`; \
fi; \
echo " Fortran compiler ... $(F_COMPILER) (cmd & version : $${fverinfo})";\
else \
echo " Fortran compiler ... $(F_COMPILER) (command line : $(FC))";\

View File

@ -1,28 +1,38 @@
ifneq ($(C_COMPILER), PGI)
ifeq ($(CORE), ARMV8)
CCOMMON_OPT += -march=armv8-a
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=armv8-a
endif
endif
ifeq ($(CORE), CORTEXA53)
CCOMMON_OPT += -march=armv8-a -mtune=cortex-a53
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=armv8-a -mtune=cortex-a53
endif
endif
ifeq ($(CORE), CORTEXA57)
CCOMMON_OPT += -march=armv8-a -mtune=cortex-a57
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=armv8-a -mtune=cortex-a57
endif
endif
ifeq ($(CORE), CORTEXA72)
CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
endif
endif
ifeq ($(CORE), CORTEXA73)
CCOMMON_OPT += -march=armv8-a -mtune=cortex-a73
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=armv8-a -mtune=cortex-a73
endif
endif
# Use a72 tunings because Neoverse-N1 is only available
# in GCC>=9
@ -30,51 +40,71 @@ ifeq ($(CORE), NEOVERSEN1)
ifeq ($(GCCVERSIONGTEQ7), 1)
ifeq ($(GCCVERSIONGTEQ9), 1)
CCOMMON_OPT += -march=armv8.2-a -mtune=neoverse-n1
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=armv8.2-a -mtune=neoverse-n1
endif
else
CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72
endif
endif
else
CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
endif
endif
endif
ifeq ($(CORE), THUNDERX)
CCOMMON_OPT += -march=armv8-a -mtune=thunderx
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=armv8-a -mtune=thunderx
endif
endif
ifeq ($(CORE), FALKOR)
CCOMMON_OPT += -march=armv8-a -mtune=falkor
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=armv8-a -mtune=falkor
endif
endif
ifeq ($(CORE), THUNDERX2T99)
CCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99
endif
endif
ifeq ($(CORE), THUNDERX3T110)
ifeq ($(GCCVERSIONGTEQ10), 1)
CCOMMON_OPT += -march=armv8.3-a -mtune=thunderx3t110
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=armv8.3-a -mtune=thunderx3t110
endif
else
CCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99
endif
endif
endif
ifeq ($(CORE), VORTEX)
CCOMMON_OPT += -march=armv8.3-a
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=armv8.3-a
endif
endif
ifeq ($(GCCVERSIONGTEQ9), 1)
ifeq ($(CORE), TSV110)
CCOMMON_OPT += -march=armv8.2-a -mtune=tsv110
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=armv8.2-a -mtune=tsv110
endif
endif
endif
endif

View File

@ -10,9 +10,11 @@ USE_OPENMP = 1
endif
ifeq ($(CORE), POWER10)
ifneq ($(C_COMPILER), PGI)
CCOMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math
FCOMMON_OPT += -O2 -frecursive -mcpu=power10 -mtune=power10 -fno-fast-math
endif
endif
ifeq ($(CORE), POWER9)
ifneq ($(C_COMPILER), PGI)

View File

@ -3,7 +3,7 @@
#
# This library's version
VERSION = 0.3.13
VERSION = 0.3.14
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library

View File

@ -21,6 +21,8 @@ ifeq ($(ARCH), amd64)
override ARCH=x86_64
else ifeq ($(ARCH), powerpc64)
override ARCH=power
else ifeq ($(ARCH), powerpc64le)
override ARCH=power
else ifeq ($(ARCH), powerpc)
override ARCH=power
else ifeq ($(ARCH), i386)
@ -181,7 +183,7 @@ endif
# On x86_64 build getarch with march=native unless the compiler is PGI. This is required to detect AVX512 support in getarch.
ifeq ($(HOSTARCH), x86_64)
ifeq ($(findstring pgcc,$(HOSTCC)),)
ifeq ($(findstring pgcc,$(HOSTCC))$(findstring nvc,$(HOSTCC)),)
GETARCH_FLAGS += -march=native
endif
endif
@ -623,6 +625,11 @@ DYNAMIC_CORE += THUNDERX2T99
DYNAMIC_CORE += TSV110
DYNAMIC_CORE += EMAG8180
DYNAMIC_CORE += THUNDERX3T110
ifdef DYNAMIC_LIST
override DYNAMIC_CORE = ARMV8 $(DYNAMIC_LIST)
XCCOMMON_OPT = -DDYNAMIC_LIST -DDYN_ARMV8
XCCOMMON_OPT += $(foreach dcore,$(DYNAMIC_LIST),-DDYN_$(dcore))
endif
endif
ifeq ($(ARCH), mips64)
@ -663,6 +670,7 @@ endif
endif # ARCH zarch
ifeq ($(ARCH), power)
ifneq ($(C_COMPILER), PGI)
DYNAMIC_CORE = POWER6
DYNAMIC_CORE += POWER8
ifneq ($(C_COMPILER), GCC)
@ -689,6 +697,10 @@ else
# Note: no comma after the function name. GNU make only recognises a function
# call when the name is followed by whitespace; "$(info, ...)" is parsed as a
# reference to an (empty) variable, so the message would never be shown.
$(info OpenBLAS: Your gcc version is too old to build the POWER10 kernels.)
endif
endif
else
DYNAMIC_CORE = POWER8
DYNAMIC_CORE += POWER9
endif
endif
# If DYNAMIC_CORE is not set, DYNAMIC_ARCH cannot do anything, so force it to empty
@ -847,9 +859,19 @@ endif
endif
ifeq ($(C_COMPILER), PGI)
# Classify the PGI / NVIDIA HPC compiler release. The version number is taken
# from the second line of `$(CC) --version` after stripping everything except
# digits and dots; the major release is the field before the first dot and the
# minor release is read from character columns 4-5 of that string.
#   PGCVERSIONGT20      1 if major  > 20, else 0
#   PGCVERSIONGTEQ20    1 if major >= 20, else 0 (kept for any other users)
#   PGCVERSIONEQ20      1 if major == 20, else 0
#   PGCMINORVERSIONGE11 1 if minor == 11, else 0 (equality test despite the name)
PGCVERSIONGT20 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -d "." -f 1` \> 20)
PGCVERSIONGTEQ20 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -d "." -f 1` \>= 20)
PGCVERSIONEQ20 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -d "." -f 1` == 20)
PGCMINORVERSIONGE11 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -c 4-5` == 11)
# Three-character key "<GT20><EQ20><MINOR11>". PGCVERSIONEQ20 must be defined
# above, otherwise the key collapses to two characters and never matches.
PGCVERSIONCHECK := $(PGCVERSIONGT20)$(PGCVERSIONEQ20)$(PGCMINORVERSIONGE11)
# NEWPGI is set for any release after 20.x (keys 100 and 101) and for 20.11
# (key 011). GT20 and EQ20 are mutually exclusive, so 110/111 cannot occur.
ifeq ($(PGCVERSIONCHECK), $(filter $(PGCVERSIONCHECK), 100 101 011))
NEWPGI := 1
endif
ifdef BINARY64
ifeq ($(ARCH), x86_64)
CCOMMON_OPT += -tp p7-64 -D__MMX__ -Mnollvm
CCOMMON_OPT += -tp p7-64
ifneq ($(NEWPGI),1)
CCOMMON_OPT += -D__MMX__ -Mnollvm
endif
else
ifeq ($(ARCH), power)
ifeq ($(CORE), POWER8)
@ -877,13 +899,25 @@ endif
# Fortran Compiler dependent settings
#
ifeq ($(F_COMPILER), NAG)
FCOMMON_OPT += -dcfuns -recursive -ieee=full -w=obs -thread_safe
ifdef INTERFACE64
ifneq ($(INTERFACE64), 0)
FCOMMON_OPT += -i8
endif
endif
ifeq ($(USE_OPENMP), 1)
FCOMMON_OPT += -openmp
endif
endif
ifeq ($(F_COMPILER), FLANG)
CCOMMON_OPT += -DF_INTERFACE_FLANG
FCOMMON_OPT += -Mrecursive -Kieee
ifeq ($(OSNAME), Linux)
ifeq ($(ARCH), x86_64)
FLANG_VENDOR := $(shell `$(FC) --version|cut -f 1 -d "."|head -1`)
ifeq ($(FLANG_VENDOR),AOCC)
FLANG_VENDOR := $(shell $(FC) --version|head -1 |cut -f 1 -d " ")
ifeq ($(FLANG_VENDOR), AMD)
FCOMMON_OPT += -fno-unroll-loops
endif
endif
@ -1029,18 +1063,24 @@ ifeq ($(ARCH), x86_64)
FCOMMON_OPT += -tp p7-64
else
ifeq ($(ARCH), power)
ifeq ($(CORE), POWER6)
$(warning NVIDIA HPC compilers do not support POWER6.)
endif
ifeq ($(CORE), POWER8)
FCOMMON_OPT += -tp pwr8
endif
ifeq ($(CORE), POWER9)
FCOMMON_OPT += -tp pwr9
endif
ifeq ($(CORE), POWER10)
$(warning NVIDIA HPC compilers do not support POWER10.)
endif
endif
endif
else
FCOMMON_OPT += -tp p7
endif
FCOMMON_OPT += -Mrecursive
FCOMMON_OPT += -Mrecursive -Kieee
ifeq ($(USE_OPENMP), 1)
FCOMMON_OPT += -mp
endif
@ -1179,6 +1219,8 @@ CCOMMON_OPT += -fPIC
endif
ifeq ($(F_COMPILER), SUN)
FCOMMON_OPT += -pic
else ifeq ($(F_COMPILER), NAG)
FCOMMON_OPT += -PIC
else
FCOMMON_OPT += -fPIC
endif
@ -1256,6 +1298,10 @@ CCOMMON_OPT += -DUSE_PAPI
EXTRALIB += -lpapi -lperfctr
endif
ifdef BUFFERSIZE
CCOMMON_OPT += -DBUFFERSIZE=$(BUFFERSIZE)
endif
ifdef DYNAMIC_THREADS
CCOMMON_OPT += -DDYNAMIC_THREADS
endif
@ -1433,6 +1479,10 @@ LAPACK_FFLAGS := $(FFLAGS)
LAPACK_FPFLAGS := $(FPFLAGS)
endif
ifeq ($(F_COMPILER),NAG)
LAPACK_FFLAGS := $(filter-out -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS))
endif
LAPACK_CFLAGS = $(CFLAGS)
LAPACK_CFLAGS += -DHAVE_LAPACK_CONFIG_H
ifdef INTERFACE64

View File

@ -10,40 +10,46 @@ endif
ifdef HAVE_SSE3
CCOMMON_OPT += -msse3
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -msse3
endif
endif
ifdef HAVE_SSSE3
CCOMMON_OPT += -mssse3
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -mssse3
endif
endif
ifdef HAVE_SSE4_1
CCOMMON_OPT += -msse4.1
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -msse4.1
endif
endif
ifndef OLDGCC
ifdef HAVE_AVX
CCOMMON_OPT += -mavx
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -mavx
endif
endif
endif
ifndef NO_AVX2
ifdef HAVE_AVX2
CCOMMON_OPT += -mavx2
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -mavx2
endif
endif
ifndef OLDGCC
ifdef HAVE_FMA3
CCOMMON_OPT += -mfma
FCOMMON_OPT += -mfma
endif
endif
ifeq ($(CORE), SKYLAKEX)
ifndef DYNAMIC_ARCH
ifndef NO_AVX512
CCOMMON_OPT += -march=skylake-avx512
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=skylake-avx512
endif
ifeq ($(OSNAME), CYGWIN_NT)
CCOMMON_OPT += -fno-asynchronous-unwind-tables
FCOMMON_OPT += -fno-asynchronous-unwind-tables
@ -65,9 +71,11 @@ ifeq ($(C_COMPILER), GCC)
# cooperlake support was added in 10.1
ifeq ($(GCCVERSIONGTEQ10)$(GCCMINORVERSIONGTEQ1), 11)
CCOMMON_OPT += -march=cooperlake
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=cooperlake
endif
endif
endif
ifeq ($(OSNAME), CYGWIN_NT)
CCOMMON_OPT += -fno-asynchronous-unwind-tables
FCOMMON_OPT += -fno-asynchronous-unwind-tables

View File

@ -13,10 +13,14 @@ Drone CI: [![Build Status](https://cloud.drone.io/api/badges/xianyi/OpenBLAS/sta
## Introduction
OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version.
OpenBLAS is an optimized BLAS (Basic Linear Algebra Subprograms) library based on GotoBLAS2 1.13 BSD version.
Please read the documentation on the OpenBLAS wiki pages: <https://github.com/xianyi/OpenBLAS/wiki>.
For a general introduction to the BLAS routines, please refer to the extensive documentation of their reference implementation hosted at netlib:
<https://www.netlib.org/blas>. On that site you will likewise find documentation for the reference implementation of the higher-level library LAPACK - the **L**inear **A**lgebra **Pack**age that comes included with OpenBLAS. If you are looking for a general primer or refresher on Linear Algebra, the set of six
20-minute lecture videos by Prof. Gilbert Strang on either MIT OpenCourseWare <https://ocw.mit.edu/resources/res-18-010-a-2020-vision-of-linear-algebra-spring-2020/> or Youtube <https://www.youtube.com/playlist?list=PLUl4u3cNGP61iQEFiWLE21EJCxwmWvvek> may be helpful.
## Binary Packages
We provide official binary packages for the following platform:
@ -208,7 +212,8 @@ Please note that it is not possible to combine support for different architectur
- **Android**: Supported by the community. Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-build-OpenBLAS-for-Android>.
- **AIX**: Supported on PPC up to POWER8
- **Haiku**: Supported by the community. We don't actively test the library on this OS.
- **SunOS**: Supported by the community. We don't actively test the library on this OS:
- **SunOS**: Supported by the community. We don't actively test the library on this OS.
- **Cortex-M**: Supported by the community. Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-on-Cortex-M>.
## Usage

View File

@ -30,10 +30,10 @@ environment:
CONDA_INSTALL_LOCN: C:\\Miniconda36-x64
matrix:
- COMPILER: clang-cl
WITH_FORTRAN: yes
WITH_FORTRAN: ON
- COMPILER: clang-cl
DYNAMIC_ARCH: ON
WITH_FORTRAN: no
WITH_FORTRAN: OFF
- COMPILER: cl
- COMPILER: MinGW64-gcc-7.2.0-mingw
DYNAMIC_ARCH: OFF
@ -47,12 +47,7 @@ environment:
install:
- if [%COMPILER%]==[clang-cl] call %CONDA_INSTALL_LOCN%\Scripts\activate.bat
- if [%COMPILER%]==[clang-cl] conda config --add channels conda-forge --force
- if [%COMPILER%]==[clang-cl] conda install --yes --quiet clangdev cmake
- if [%WITH_FORTRAN%]==[no] conda install --yes --quiet ninja
- if [%WITH_FORTRAN%]==[yes] conda install --yes --quiet -c isuruf kitware-ninja
- if [%WITH_FORTRAN%]==[yes] conda install --yes --quiet flang
- if [%COMPILER%]==[clang-cl] conda install --yes --quiet clangdev cmake ninja flang=11.0.1
- if [%COMPILER%]==[clang-cl] call "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvarsall.bat" x64
- if [%COMPILER%]==[clang-cl] set "LIB=%CONDA_INSTALL_LOCN%\Library\lib;%LIB%"
- if [%COMPILER%]==[clang-cl] set "CPATH=%CONDA_INSTALL_LOCN%\Library\include;%CPATH%"
@ -68,15 +63,14 @@ before_build:
- if [%COMPILER%]==[MinGW64-gcc-7.2.0-mingw] cmake -G "MinGW Makefiles" -DNOFORTRAN=1 ..
- if [%COMPILER%]==[MinGW-gcc-6.3.0-32] cmake -G "MSYS Makefiles" -DNOFORTRAN=1 ..
- if [%COMPILER%]==[MinGW-gcc-5.3.0] cmake -G "MSYS Makefiles" -DNOFORTRAN=1 ..
- if [%WITH_FORTRAN%]==[no] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DMSVC_STATIC_CRT=ON ..
- if [%WITH_FORTRAN%]==[yes] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DBUILD_WITHOUT_LAPACK=no -DNOFORTRAN=0 ..
- if [%WITH_FORTRAN%]==[OFF] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DMSVC_STATIC_CRT=ON ..
- if [%WITH_FORTRAN%]==[ON] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DBUILD_WITHOUT_LAPACK=no -DNOFORTRAN=0 ..
- if [%USE_OPENMP%]==[ON] cmake -DUSE_OPENMP=ON ..
- if [%DYNAMIC_ARCH%]==[ON] cmake -DDYNAMIC_ARCH=ON -DDYNAMIC_LIST='CORE2;NEHALEM;SANDYBRIDGE;BULLDOZER;HASWELL' ..
build_script:
- cmake --build .
test_script:
- echo Running Test
- cd utest
- openblas_utest
- ctest -j2

View File

@ -74,6 +74,9 @@ static void *huge_malloc(BLASLONG size){
#if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS)
struct timeval start, stop;
#elif defined(__APPLE__)
mach_timebase_info_data_t info;
uint64_t start = 0, stop = 0;
#else
struct timespec start = { 0, 0 }, stop = { 0, 0 };
#endif
@ -82,6 +85,9 @@ double getsec()
{
#if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS)
return (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
#elif defined(__APPLE__)
mach_timebase_info(&info);
return (double)(((stop - start) * info.numer)/info.denom) * 1.e-9;
#else
return (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_nsec - start.tv_nsec)) * 1.e-9;
#endif
@ -90,6 +96,8 @@ double getsec()
void begin() {
#if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS)
gettimeofday( &start, (struct timezone *)0);
#elif defined(__APPLE__)
start = clock_gettime_nsec_np(CLOCK_UPTIME_RAW);
#else
clock_gettime(CLOCK_REALTIME, &start);
#endif
@ -98,6 +106,8 @@ void begin() {
void end() {
#if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS)
gettimeofday( &stop, (struct timezone *)0);
#elif defined(__APPLE__)
stop = clock_gettime_nsec_np(CLOCK_UPTIME_RAW);
#else
clock_gettime(CLOCK_REALTIME, &stop);
#endif

View File

@ -1,11 +1,11 @@
#!/usr/bin/perl
#!/usr/bin/env perl
#use File::Basename;
# use File::Temp qw(tempfile);
# Checking cross compile
$hostos = `uname -s | sed -e s/\-.*//`; chop($hostos);
$hostarch = `uname -m | sed -e s/i.86/x86/`;chop($hostarch);
$hostarch = `uname -m | sed -e s/i.86/x86/`;
$hostarch = `uname -p` if ($hostos eq "AIX" || $hostos eq "SunOS");
chop($hostarch);
$hostarch = "x86_64" if ($hostarch eq "amd64");

View File

@ -125,9 +125,14 @@ void cblas_zswap(OPENBLAS_CONST blasint n, void *x, OPENBLAS_CONST blasint incx,
void cblas_srot(OPENBLAS_CONST blasint N, float *X, OPENBLAS_CONST blasint incX, float *Y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST float c, OPENBLAS_CONST float s);
void cblas_drot(OPENBLAS_CONST blasint N, double *X, OPENBLAS_CONST blasint incX, double *Y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST double c, OPENBLAS_CONST double s);
void cblas_csrot(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST float c, OPENBLAS_CONST float s);
void cblas_zdrot(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST double c, OPENBLAS_CONST double s);
void cblas_srotg(float *a, float *b, float *c, float *s);
void cblas_drotg(double *a, double *b, double *c, double *s);
void cblas_crotg(void *a, void *b, float *c, void *s);
void cblas_zrotg(void *a, void *b, double *c, void *s);
void cblas_srotm(OPENBLAS_CONST blasint N, float *X, OPENBLAS_CONST blasint incX, float *Y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST float *P);
void cblas_drotm(OPENBLAS_CONST blasint N, double *X, OPENBLAS_CONST blasint incX, double *Y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST double *P);

View File

@ -45,6 +45,9 @@ endif ()
if (DYNAMIC_ARCH)
if (ARM64)
set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110)
if (DYNAMIC_LIST)
set(DYNAMIC_CORE ARMV8 ${DYNAMIC_LIST})
endif ()
endif ()
if (POWER)

View File

@ -2499,6 +2499,5 @@ foreach (Utils_FILE ${Utils_SRC})
endforeach ()
set(lapacke_include_dir "${NETLIB_LAPACK_DIR}/LAPACKE/include")
configure_file("${lapacke_include_dir}/lapacke_mangling_with_flags.h.in" "${lapacke_include_dir}/lapacke_mangling.h" COPYONLY)
include_directories(${lapacke_include_dir})
set_source_files_properties(${LAPACKE_SOURCES} PROPERTIES COMPILE_FLAGS "${LAPACK_CFLAGS}")

View File

@ -148,16 +148,20 @@ endif ()
include("${PROJECT_SOURCE_DIR}/cmake/prebuild.cmake")
if (DEFINED TARGET)
if (${TARGET} STREQUAL COOPERLAKE AND NOT NO_AVX512)
# if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU")
if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU")
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
if (${GCC_VERSION} VERSION_GREATER 10.1 OR ${GCC_VERSION} VERSION_EQUAL 10.1)
if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 10.09)
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=cooperlake")
else()
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512")
endif()
# elseif (${CMAKE_C_COMPILER_ID} STREQUAL "CLANG")
# set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2")
# endif()
elseif (${CMAKE_C_COMPILER_ID} STREQUAL "Clang" OR ${CMAKE_C_COMPILER_ID} STREQUAL "AppleClang")
if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 8.99)
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=cooperlake")
else()
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512")
endif()
endif()
endif()
if (${TARGET} STREQUAL SKYLAKEX AND NOT NO_AVX512)
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512")
@ -233,6 +237,11 @@ if (BINARY64)
endif ()
endif ()
if(EMBEDDED)
set(CCOMMON_OPT "${CCOMMON_OPT} -DOS_EMBEDDED")
set(CCOMMON_OPT "${CCOMMON_OPT} -mthumb -mcpu=cortex-m4 -mfloat-abi=hard -mfpu=fpv4-sp-d16")
endif()
if (NEED_PIC)
if (${CMAKE_C_COMPILER} STREQUAL "IBM")
set(CCOMMON_OPT "${CCOMMON_OPT} -qpic=large")

View File

@ -74,6 +74,9 @@ macro(ParseMakefileVars MAKEFILE_IN)
string(REGEX MATCH "ifneq \\(\\$\\(([_A-Z]+)\\),[ \t]*([0-9_A-Z]+)\\)" line_match "${makefile_line}")
if (NOT "${line_match}" STREQUAL "")
# message(STATUS "IFNEQ: ${line_match} first: ${CMAKE_MATCH_1} second: ${CMAKE_MATCH_2}")
if ( ${CMAKE_MATCH_1} STREQUAL C_COMPILER)
set (CMAKE_MATCH_1 CMAKE_C_COMPILER)
endif ()
if (NOT ( ${${CMAKE_MATCH_1}} STREQUAL ${CMAKE_MATCH_2}))
# message (STATUS "condition is true")
set (IfElse 1)

View File

@ -122,7 +122,7 @@ extern "C" {
#define ATOM GOTO_ATOM
#undef GOTO_ATOM
#endif
#else
#elif !defined(OS_EMBEDDED)
#include <sys/mman.h>
#ifndef NO_SYSV_IPC
#include <sys/shm.h>
@ -134,6 +134,9 @@ extern "C" {
#if defined(SMP) || defined(USE_LOCKING)
#include <pthread.h>
#endif
#else
#include <time.h>
#include <math.h>
#endif
#if defined(OS_SUNOS)
@ -488,10 +491,12 @@ static inline unsigned long long rpcc(void){
struct timespec ts;
clock_gettime(CLOCK_MONOTONIC, &ts);
return (unsigned long long)ts.tv_sec * 1000000000ull + ts.tv_nsec;
#else
#elif !defined(OS_EMBEDDED)
struct timeval tv;
gettimeofday(&tv,NULL);
return (unsigned long long)tv.tv_sec * 1000000000ull + tv.tv_usec * 1000;
#else
return 0;
#endif
}
#define RPCC_DEFINED
@ -521,6 +526,10 @@ static void __inline blas_lock(volatile BLASULONG *address){
#include "common_linux.h"
#endif
#ifdef OS_EMBEDDED
#define DTB_DEFAULT_ENTRIES 64
#endif
#define MMAP_ACCESS (PROT_READ | PROT_WRITE)
#ifdef __NetBSD__

View File

@ -39,7 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define INLINE inline
#ifdef F_INTERFACE_FLANG
#if defined( F_INTERFACE_FLANG) || defined(F_INTERFACE_PGI)
#define RETURN_BY_STACK
#else
#define RETURN_BY_COMPLEX

View File

@ -1418,6 +1418,15 @@ int get_cpuname(void){
case 9:
case 8:
switch (model) {
case 12: // Tiger Lake
if(support_avx512())
return CPUTYPE_SKYLAKEX;
if(support_avx2())
return CPUTYPE_HASWELL;
if(support_avx())
return CPUTYPE_SANDYBRIDGE;
else
return CPUTYPE_NEHALEM;
case 14: // Kaby Lake and refreshes
if(support_avx2())
return CPUTYPE_HASWELL;
@ -1436,6 +1445,15 @@ int get_cpuname(void){
return CPUTYPE_SANDYBRIDGE;
else
return CPUTYPE_NEHALEM;
case 7: // Rocket Lake
if(support_avx512())
return CPUTYPE_SKYLAKEX;
if(support_avx2())
return CPUTYPE_HASWELL;
if(support_avx())
return CPUTYPE_SANDYBRIDGE;
else
return CPUTYPE_NEHALEM;
}
break;
}
@ -2014,6 +2032,19 @@ int get_coretype(void){
#endif
else
return CORE_NEHALEM;
case 7:// Rocket Lake
#ifndef NO_AVX512
if(support_avx512())
return CORE_SKYLAKEX;
#endif
#ifndef NO_AVX2
if(support_avx2())
return CORE_HASWELL;
#endif
if(support_avx())
return CORE_SANDYBRIDGE;
else
return CORE_NEHALEM;
}
case 5:
switch (model) {
@ -2102,6 +2133,16 @@ int get_coretype(void){
break;
case 9:
case 8:
if (model == 12) { // Tiger Lake
  /* This branch lives in get_coretype(), so it must hand back CORE_*
     identifiers (as the other branches of this function do), not the
     CPUTYPE_* values used by get_cpuname(). The NO_AVX512 / NO_AVX2 guards
     mirror the Rocket Lake case so that builds which disable those
     instruction sets fall through to the next best core type. */
#ifndef NO_AVX512
  if(support_avx512())
    return CORE_SKYLAKEX;
#endif
#ifndef NO_AVX2
  if(support_avx2())
    return CORE_HASWELL;
#endif
  if(support_avx())
    return CORE_SANDYBRIDGE;
  else
    return CORE_NEHALEM;
}
if (model == 14) { // Kaby Lake
if(support_avx())
#ifndef NO_AVX2

View File

@ -5,9 +5,18 @@ enable_language(Fortran)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DADD${BU} -DCBLAS")
# add_test() cannot redirect stdin, so generate a small helper script that
# runs the test executable (first argument) with the given input file
# (second argument) fed to its standard input. `test_helper` holds the
# command prefix to put in front of "<executable> <input file>".
if(WIN32)
  file(WRITE "${CMAKE_CURRENT_BINARY_DIR}/test_cblas_helper.ps1"
    "$ErrorActionPreference = \"Stop\"\n"
    "Get-Content $args[1] | & $args[0]\n"
  )
  set(test_helper powershell -ExecutionPolicy Bypass "${CMAKE_CURRENT_BINARY_DIR}/test_cblas_helper.ps1")
else()
  # Quote both positional parameters so that build or source directories
  # containing spaces do not get word-split by the shell.
  file(WRITE "${CMAKE_CURRENT_BINARY_DIR}/test_cblas_helper.sh"
    "\"$1\" < \"$2\"\n"
  )
  set(test_helper sh "${CMAKE_CURRENT_BINARY_DIR}/test_cblas_helper.sh")
endif()
foreach(float_type ${FLOAT_TYPES})
string(SUBSTRING ${float_type} 0 1 float_char_upper)
@ -21,7 +30,7 @@ foreach(float_type ${FLOAT_TYPES})
c_${float_char}blas1.c)
target_link_libraries(x${float_char}cblat1 ${OpenBLAS_LIBNAME})
add_test(NAME "x${float_char}cblat1"
COMMAND "${CMAKE_CURRENT_BINARY_DIR}/x${float_char}cblat1")
COMMAND $<TARGET_FILE:x${float_char}cblat1>)
#level2
add_executable(x${float_char}cblat2
@ -33,7 +42,7 @@ foreach(float_type ${FLOAT_TYPES})
constant.c)
target_link_libraries(x${float_char}cblat2 ${OpenBLAS_LIBNAME})
add_test(NAME "x${float_char}cblat2"
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_cblas_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/x${float_char}cblat2" "${PROJECT_SOURCE_DIR}/ctest/${float_char}in2")
COMMAND ${test_helper} $<TARGET_FILE:x${float_char}cblat2> "${PROJECT_SOURCE_DIR}/ctest/${float_char}in2")
#level3
add_executable(x${float_char}cblat3
@ -45,6 +54,6 @@ foreach(float_type ${FLOAT_TYPES})
constant.c)
target_link_libraries(x${float_char}cblat3 ${OpenBLAS_LIBNAME})
add_test(NAME "x${float_char}cblat3"
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_cblas_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/x${float_char}cblat3" "${PROJECT_SOURCE_DIR}/ctest/${float_char}in3")
COMMAND ${test_helper} $<TARGET_FILE:x${float_char}cblat3> "${PROJECT_SOURCE_DIR}/ctest/${float_char}in3")
endforeach()

View File

@ -212,6 +212,9 @@ ifeq ($(C_COMPILER), CLANG)
CEXTRALIB = -lomp
endif
endif
ifeq ($(F_COMPILER), NAG)
CEXTRALIB = -lgomp
endif
endif
ifeq ($(BUILD_SINGLE),1)

View File

@ -1024,10 +1024,10 @@ int BLASFUNC(blas_thread_shutdown)(void){
int i;
if (!blas_server_avail) return 0;
LOCK_COMMAND(&server_lock);
if (blas_server_avail) {
for (i = 0; i < blas_num_threads - 1; i++) {
@ -1051,11 +1051,12 @@ int BLASFUNC(blas_thread_shutdown)(void){
}
#ifdef NEED_STACKATTR
pthread_attr_destory(&attr);
pthread_attr_destroy(&attr);
#endif
blas_server_avail = 0;
}
UNLOCK_COMMAND(&server_lock);
return 0;

View File

@ -644,6 +644,21 @@ static gotoblas_t *get_coretype(void){
return NULL;
case 9:
case 8:
if (model == 12) { // Tiger Lake
if (support_avx512())
return &gotoblas_SKYLAKEX;
if(support_avx2()){
openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK);
return &gotoblas_HASWELL;
}
if(support_avx()) {
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
return &gotoblas_SANDYBRIDGE;
} else {
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
return &gotoblas_NEHALEM;
}
}
if (model == 14 ) { // Kaby Lake, Coffee Lake
if(support_avx2())
return &gotoblas_HASWELL;
@ -667,6 +682,19 @@ static gotoblas_t *get_coretype(void){
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
}
}
if (model == 7) { // Rocket Lake
  /* Prefer the SkylakeX (AVX512) kernels; otherwise step down through
     Haswell (AVX2), Sandy Bridge (AVX) and Nehalem, emitting the matching
     fallback notice at each step in the same way as the Tiger Lake branch
     of this function. */
  if (support_avx512())
    return &gotoblas_SKYLAKEX;
  if(support_avx2()){
    openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK);
    return &gotoblas_HASWELL;
  }
  if(support_avx()) {
    openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
    return &gotoblas_SANDYBRIDGE;
  } else {
    openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
    return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
  }
}
return NULL;
}
case 0xf:

View File

@ -43,6 +43,63 @@
#endif
extern gotoblas_t gotoblas_ARMV8;
#ifdef DYNAMIC_LIST
#ifdef DYN_CORTEXA53
extern gotoblas_t gotoblas_CORTEXA53;
#else
#define gotoblas_CORTEXA53 gotoblas_ARMV8
#endif
#ifdef DYN_CORTEXA57
extern gotoblas_t gotoblas_CORTEXA57;
#else
#define gotoblas_CORTEXA57 gotoblas_ARMV8
#endif
#ifdef DYN_CORTEXA72
extern gotoblas_t gotoblas_CORTEXA72;
#else
#define gotoblas_CORTEXA72 gotoblas_ARMV8
#endif
#ifdef DYN_CORTEXA73
extern gotoblas_t gotoblas_CORTEXA73;
#else
#define gotoblas_CORTEXA73 gotoblas_ARMV8
#endif
#ifdef DYN_FALKOR
extern gotoblas_t gotoblas_FALKOR;
#else
#define gotoblas_FALKOR gotoblas_ARMV8
#endif
#ifdef DYN_TSV110
extern gotoblas_t gotoblas_TSV110;
#else
#define gotoblas_TSV110 gotoblas_ARMV8
#endif
#ifdef DYN_THUNDERX
extern gotoblas_t gotoblas_THUNDERX;
#else
#define gotoblas_THUNDERX gotoblas_ARMV8
#endif
#ifdef DYN_THUNDERX2T99
extern gotoblas_t gotoblas_THUNDERX2T99;
#else
#define gotoblas_THUNDERX2T99 gotoblas_ARMV8
#endif
#ifdef DYN_THUNDERX3T110
extern gotoblas_t gotoblas_THUNDERX3T110;
#else
#define gotoblas_THUNDERX3T110 gotoblas_ARMV8
#endif
#ifdef DYN_EMAG8180
extern gotoblas_t gotoblas_EMAG8180;
#else
#define gotoblas_EMAG8180 gotoblas_ARMV8
#endif
#ifdef DYN_NEOVERSEN1
extern gotoblas_t gotoblas_NEOVERSEN1;
#else
#define gotoblas_NEOVERSEN1 gotoblas_ARMV8
#endif
#else
extern gotoblas_t gotoblas_CORTEXA53;
extern gotoblas_t gotoblas_CORTEXA57;
extern gotoblas_t gotoblas_CORTEXA72;
@ -54,6 +111,7 @@ extern gotoblas_t gotoblas_TSV110;
extern gotoblas_t gotoblas_EMAG8180;
extern gotoblas_t gotoblas_NEOVERSEN1;
extern gotoblas_t gotoblas_THUNDERX3T110;
#endif
extern void openblas_warning(int verbose, const char * msg);

View File

@ -27,7 +27,9 @@ static char *corename[] = {
#define NUM_CORETYPES 4
char *gotoblas_corename(void) {
#ifndef C_PGI
if (gotoblas == &gotoblas_POWER6) return corename[1];
#endif
if (gotoblas == &gotoblas_POWER8) return corename[2];
#if (!defined __GNUC__) || ( __GNUC__ >= 6)
if (gotoblas == &gotoblas_POWER9) return corename[3];
@ -38,10 +40,164 @@ char *gotoblas_corename(void) {
return corename[0];
}
#if defined(__clang__)
static int __builtin_cpu_supports(char* arg)
{
return 0;
}
#endif
#if defined(C_PGI) || defined(__clang__)
/*
* NV HPC compilers do not yet implement __builtin_cpu_is().
* Fake a version here for use in the CPU detection code below.
*
* Strategy here is to first check the CPU to see what it actually is,
* and then test the input to see if what the CPU actually is matches
* what was requested.
*/
#include <string.h>
/*
* Define POWER processor version table.
*
* NOTE NV HPC SDK compilers only support POWER8 and POWER9 at this time
*/
/* Processor generation codes stored in the cpu_type column of pvrPOWER. */
#define CPU_UNKNOWN 0
#define CPU_POWER5  5
#define CPU_POWER6  6
#define CPU_POWER8  8
#define CPU_POWER9  9
#define CPU_POWER10 10

/*
 * Processor Version Register lookup table.
 *
 * A row matches when (pvr & pvr_mask) == pvr_value.  The rows are scanned
 * from the top and the first match is used, so the order is significant:
 * the two POWER9 rows with the narrower mask come before the generic
 * POWER9 row, and the all-zero row at the bottom matches every PVR and
 * therefore acts as the "Unknown" terminator.
 */
static struct {
    uint32_t pvr_mask;
    uint32_t pvr_value;
    const char* cpu_name;
    uint32_t cpu_type;
} pvrPOWER [] = {
    /* pvr_mask    pvr_value   cpu_name           cpu_type    */
    { 0xffffffff, 0x0f000001, "POWER5+",         CPU_POWER5  }, /* POWER6 in P5+ mode; 2.04-compliant processor */
    { 0xffff0000, 0x003e0000, "POWER6 (raw)",    CPU_POWER6  }, /* Power6 aka POWER6X */
    { 0xffff0000, 0x003f0000, "POWER7 (raw)",    CPU_POWER6  }, /* Power7 */
    { 0xffff0000, 0x004A0000, "POWER7+ (raw)",   CPU_POWER6  }, /* Power7+ */
    { 0xffff0000, 0x004b0000, "POWER8E (raw)",   CPU_POWER8  }, /* Power8E */
    { 0xffff0000, 0x004c0000, "POWER8NVL (raw)", CPU_POWER8  }, /* Power8NVL */
    { 0xffff0000, 0x004d0000, "POWER8 (raw)",    CPU_POWER8  }, /* Power8 */
    { 0xffffefff, 0x004e0200, "POWER9 (raw)",    CPU_POWER9  }, /* Power9 DD2.0 */
    { 0xffffefff, 0x004e0201, "POWER9 (raw)",    CPU_POWER9  }, /* Power9 DD 2.1 */
    { 0xffff0000, 0x004e0000, "POWER9 (raw)",    CPU_POWER9  }, /* Power9 DD2.2 or later */
    { 0xffff0000, 0x00800000, "POWER10 (raw)",   CPU_POWER10 }, /* Power10 */
    { 0x0,        0x0,        "Unknown",         CPU_UNKNOWN }, /* End of table, pvr_mask and pvr_value must be zero */
};
/*
 * Substitute for the compiler builtin of the same name.
 *
 * Reads the Processor Version Register, looks it up in pvrPOWER (first
 * matching row wins; the final all-zero row matches anything, so the
 * search always ends on a valid index) and compares the resulting CPU
 * generation with the requested name.
 *
 * cpu: name of the processor to test for.  Only "power8" and "power9"
 *      are recognised; every other name yields 0.
 *
 * Returns 1 if the running CPU is of the requested generation, else 0.
 */
static int __builtin_cpu_is(const char *cpu) {
    const size_t entries = sizeof pvrPOWER / sizeof *pvrPOWER;
    size_t i;   /* unsigned, to match the type of the sizeof-based bound */
    uint32_t pvr;
    uint32_t cpu_type;

    asm("mfpvr %0" : "=r"(pvr));

    for (i = 0 ; i < entries ; ++i) {
        if ((pvr & pvrPOWER[i].pvr_mask) == pvrPOWER[i].pvr_value) {
            break;
        }
    }

#if defined(DEBUG)
    /* cpu_type is an integer code, so print it with %u rather than %p */
    printf("%s: returning CPU=%s, cpu_type=%u\n", __func__,
           pvrPOWER[i].cpu_name, (unsigned int) pvrPOWER[i].cpu_type);
#endif
    cpu_type = pvrPOWER[i].cpu_type;

    if (!strcmp(cpu, "power8"))
        return cpu_type == CPU_POWER8;
    if (!strcmp(cpu, "power9"))
        return cpu_type == CPU_POWER9;
    return 0;
}
#endif /* C_PGI || __clang__ */
static gotoblas_t *get_coretype(void) {
#ifndef C_PGI
if (__builtin_cpu_is("power6") || __builtin_cpu_is("power6x"))
return &gotoblas_POWER6;
#endif
if (__builtin_cpu_is("power8"))
return &gotoblas_POWER8;
#if (!defined __GNUC__) || ( __GNUC__ >= 6)
@ -53,7 +209,7 @@ static gotoblas_t *get_coretype(void) {
return &gotoblas_POWER10;
#endif
/* Fall back to the POWER9 implementation if the toolchain is too old or the MMA feature is not set */
#if (!defined __GNUC__) || ( __GNUC__ >= 6)
#if (!defined __GNUC__) || ( __GNUC__ >= 11) || (__GNUC__ == 10 && __GNUC_MINOR__ >= 2)
if (__builtin_cpu_is("power10"))
return &gotoblas_POWER9;
#endif
@ -77,7 +233,9 @@ static gotoblas_t *force_coretype(char * coretype) {
switch (found)
{
#ifndef C_PGI
case 1: return (&gotoblas_POWER6);
#endif
case 2: return (&gotoblas_POWER8);
#if (!defined __GNUC__) || ( __GNUC__ >= 6)
case 3: return (&gotoblas_POWER9);

View File

@ -222,11 +222,11 @@ int get_num_procs(void);
#else
int get_num_procs(void) {
static int nums = 0;
#if defined(__GLIBC_PREREQ)
cpu_set_t cpuset,*cpusetp;
size_t size;
int ret;
#if defined(__GLIBC_PREREQ)
#if !__GLIBC_PREREQ(2, 7)
int i;
#if !__GLIBC_PREREQ(2, 6)
@ -1241,7 +1241,7 @@ UNLOCK_COMMAND(&alloc_lock);
func = &memoryalloc[0];
while ((func != NULL) && (map_address == (void *) -1)) {
while ((*func != NULL) && (map_address == (void *) -1)) {
map_address = (*func)((void *)base_address);
@ -1619,10 +1619,12 @@ static int on_process_term(void)
#else
#pragma data_seg(".CRT$XLB")
#endif
static void (APIENTRY *dll_callback)(HINSTANCE h, DWORD ul_reason_for_call, PVOID pv) = DllMain;
#ifdef _WIN64
static const PIMAGE_TLS_CALLBACK dll_callback(HINSTANCE h, DWORD ul_reason_for_call, PVOID pv) = DllMain;
#pragma const_seg()
#else
static void (APIENTRY *dll_callback)(HINSTANCE h, DWORD ul_reason_for_call, PVOID pv) = DllMain;
#pragma data_seg()
#endif
@ -1631,10 +1633,12 @@ static void (APIENTRY *dll_callback)(HINSTANCE h, DWORD ul_reason_for_call, PVOI
#else
#pragma data_seg(".CRT$XTU")
#endif
static int(*p_process_term)(void) = on_process_term;
#ifdef _WIN64
static const int(*p_process_term)(void) = on_process_term;
#pragma const_seg()
#else
static int(*p_process_term)(void) = on_process_term;
#pragma data_seg()
#endif
#endif
@ -1668,16 +1672,23 @@ void gotoblas_dummy_for_PGI(void) {
#ifndef MEM_LARGE_PAGES
#define MEM_LARGE_PAGES 0x20000000
#endif
#else
#elif !defined(OS_EMBEDDED)
#define ALLOC_MMAP
#define ALLOC_MALLOC
#else
#define ALLOC_MALLOC
inline int puts(const char *str) { return 0; }
inline int printf(const char *format, ...) { return 0; }
inline char *getenv(const char *name) { return ""; }
inline int atoi(const char *str) { return 0; }
#endif
#include <stdlib.h>
#include <stdio.h>
#include <fcntl.h>
#if !defined(OS_WINDOWS) || defined(OS_CYGWIN_NT)
#if (!defined(OS_WINDOWS) || defined(OS_CYGWIN_NT)) && !defined(OS_EMBEDDED)
#include <sys/mman.h>
#ifndef NO_SYSV_IPC
#include <sys/shm.h>

View File

@ -1,4 +1,4 @@
#!/usr/bin/perl
#!/usr/bin/env perl
# Changelog
# 2017/09/03 staticfloat

52
f_check
View File

@ -1,4 +1,4 @@
#!/usr/bin/perl
#!/usr/bin/env perl
$hostos = `uname -s | sed -e s/\-.*//`; chop($hostos);
@ -32,9 +32,9 @@ if ($compiler eq "") {
"xlf95", "xlf90", "xlf",
"ppuf77", "ppuf95", "ppuf90", "ppuxlf",
"pathf90", "pathf95",
"pgf95", "pgf90", "pgf77",
"pgf95", "pgf90", "pgf77", "pgfortran", "nvfortran",
"flang", "egfortran",
"ifort");
"ifort", "nagfor");
OUTER:
foreach $lists (@lists) {
@ -64,7 +64,9 @@ if ($compiler eq "") {
if (!$?) {
$data = `$compiler -O2 -S ftest.f > /dev/null 2>&1 && cat ftest.s && rm -f ftest.s`;
if ($data eq "") {
$data = `$compiler -O2 -S ftest.f > /dev/null 2>&1 && cat ftest.c && rm -f ftest.c`;
}
if ($data =~ /zhoge_/) {
$bu = "_";
}
@ -76,6 +78,7 @@ if ($compiler eq "") {
} elsif ($data =~ /GNU/ || $data =~ /GCC/ ) {
$data =~ s/\(+.*?\)+//g;
$data =~ /(\d+)\.(\d+).(\d+)/;
$major = $1;
$minor = $2;
@ -87,7 +90,7 @@ if ($compiler eq "") {
if ($compiler =~ /flang/) {
$vendor = FLANG;
$openmp = "-fopenmp";
} elsif ($compiler =~ /pgf/) {
} elsif ($compiler =~ /pgf/ || $compiler =~ /nvf/) {
$vendor = PGI;
$openmp = "-mp";
} else {
@ -123,7 +126,7 @@ if ($compiler eq "") {
$openmp = "-mp";
}
if ($data =~ /PGF/) {
if ($data =~ /PGF/ || $data =~ /NVF/) {
$vendor = PGI;
$openmp = "-mp";
}
@ -133,8 +136,16 @@ if ($compiler eq "") {
$openmp = "-openmp";
}
if ($data =~ /NAG/) {
$vendor = NAG;
$openmp = "-openmp";
}
# for embedded underscore name, e.g. zho_ge, it may append 2 underscores.
$data = `$compiler -O2 -S ftest3.f > /dev/null 2>&1 && cat ftest3.s && rm -f ftest3.s`;
if ($data eq "") {
$data = `$compiler -O2 -S ftest3.f > /dev/null 2>&1 && cat ftest3.c && rm -f ftest3.c`;
}
if ($data =~ / zho_ge__/) {
$need2bu = 1;
}
@ -177,7 +188,7 @@ if ($compiler eq "") {
$openmp = "-mp";
}
if ($compiler =~ /pgf/) {
if ($compiler =~ /pgf/ || $compiler =~ /nvf/) {
$vendor = PGI;
$bu = "_";
$openmp = "-mp";
@ -222,6 +233,12 @@ if ($compiler eq "") {
$openmp = "-fopenmp";
}
if ($compiler =~ /nagfor/) {
$vendor = NAG;
$bu = "_";
$openmp = "-openmp";
}
if ($vendor eq "") {
$nofortran = 1;
$compiler = "gfortran";
@ -275,14 +292,20 @@ if (!$?) {
if ($?) {
$link = `$compiler $openmp -mabi=64 -v ftest2.f 2>&1 && rm -f a.out a.exe`;
}
#For nagfor
if ($?) {
$link = `$compiler $openmp -dryrun ftest2.f 2>&1 && rm -f a.out a.exe`;
}
$binary = "" if ($?);
}
if ($binary eq "") {
$link = `$compiler $openmp -v ftest2.f 2>&1 && rm -f a.out a.exe`;
}
}
if ( $vendor eq "NAG") {
$link = `$compiler $openmp -dryrun ftest2.f 2>&1 && rm -f a.out a.exe`;
}
$linker_L = "";
$linker_l = "";
$linker_a = "";
@ -330,12 +353,13 @@ if ($link ne "") {
$flags =~ s/\@/\,/g;
$linker_L .= "-Wl,". $flags . " " ;
}
if ($flags =~ /-lgomp/ && $CC =~ /clang/) {
if ($flags =~ /-lgomp/ && $ENV{"CC"} =~ /clang/) {
$flags = "-lomp";
}
if (
($flags =~ /^\-l/)
&& ($flags !~ /ibrary/)
&& ($flags !~ /gfortranbegin/)
&& ($flags !~ /frtbegin/)
&& ($flags !~ /pathfstart/)
@ -352,6 +376,16 @@ if ($link ne "") {
$linker_l .= $flags . " ";
}
# Keep the quickfit.o / safefit.o / thsafe.o objects from the link line,
# but only when the detected Fortran vendor is NAG.
# The vendor has to be compared as a string: with the numeric "==" both
# operands evaluate to 0, so the test used to succeed for every vendor.
if ($vendor eq "NAG" && $flags =~ /(?:quickfit|safefit|thsafe)\.o/) {
    $linker_l .= $flags . " ";
}
$linker_a .= $flags . " " if $flags =~ /\.a$/;
}

View File

@ -1375,6 +1375,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifdef __riscv
#include "cpuid_riscv64.c"
#define OPENBLAS_SUPPORTED
#endif
#ifdef __arm__

View File

@ -4,7 +4,7 @@
#else
#include "config_kernel.h"
#endif
#include "param.h"
#include "common.h"
int main(int argc, char **argv) {

View File

@ -316,7 +316,7 @@ CCBLAS1OBJS = \
cblas_cscal.$(SUFFIX) cblas_csscal.$(SUFFIX) \
cblas_cswap.$(SUFFIX) cblas_scnrm2.$(SUFFIX) \
cblas_caxpby.$(SUFFIX) \
cblas_icmin.$(SUFFIX) cblas_icmax.$(SUFFIX) cblas_scsum.$(SUFFIX)
cblas_icmin.$(SUFFIX) cblas_icmax.$(SUFFIX) cblas_scsum.$(SUFFIX) cblas_csrot.$(SUFFIX) cblas_crotg.$(SUFFIX)
CCBLAS2OBJS = \
cblas_cgemv.$(SUFFIX) cblas_cgerc.$(SUFFIX) cblas_cgeru.$(SUFFIX) \
@ -346,7 +346,7 @@ CZBLAS1OBJS = \
cblas_zscal.$(SUFFIX) cblas_zdscal.$(SUFFIX) \
cblas_zswap.$(SUFFIX) cblas_dznrm2.$(SUFFIX) \
cblas_zaxpby.$(SUFFIX) \
cblas_izmin.$(SUFFIX) cblas_izmax.$(SUFFIX) cblas_dzsum.$(SUFFIX)
cblas_izmin.$(SUFFIX) cblas_izmax.$(SUFFIX) cblas_dzsum.$(SUFFIX) cblas_zdrot.$(SUFFIX) cblas_zrotg.$(SUFFIX)
CZBLAS2OBJS = \
@ -1634,6 +1634,12 @@ cblas_srotg.$(SUFFIX) cblas_srotg.$(PSUFFIX): rotg.c
cblas_drotg.$(SUFFIX) cblas_drotg.$(PSUFFIX): rotg.c
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
cblas_crotg.$(SUFFIX) cblas_crotg.$(PSUFFIX): zrotg.c
$(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F)
cblas_zrotg.$(SUFFIX) cblas_zrotg.$(PSUFFIX): zrotg.c
$(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F)
cblas_srotm.$(SUFFIX) cblas_srotm.$(PSUFFIX): rotm.c
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
@ -1664,6 +1670,12 @@ cblas_csscal.$(SUFFIX) cblas_csscal.$(PSUFFIX) : zscal.c
cblas_zdscal.$(SUFFIX) cblas_zdscal.$(PSUFFIX) : zscal.c
$(CC) $(CFLAGS) -DCBLAS -c -DSSCAL $< -o $(@F)
cblas_csrot.$(SUFFIX) cblas_csrot.$(PSUFFIX) : zrot.c
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
cblas_zdrot.$(SUFFIX) cblas_zdrot.$(PSUFFIX) : zrot.c
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
ifeq ($(BUILD_BFLOAT16),1)
cblas_sbgemv.$(SUFFIX) cblas_sbgemv.$(PSUFFIX) : sbgemv.c
$(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F)

View File

@ -1,4 +1,4 @@
#!/usr/bin/perl
#!/usr/bin/env perl
$count = 0;

View File

@ -246,6 +246,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
#ifdef SMP
double MNK;
#if defined(USE_SIMPLE_THREADED_LEVEL3) || !defined(NO_AFFINITY)
#ifndef COMPLEX
#ifdef XDOUBLE
int mode = BLAS_XDOUBLE | BLAS_REAL;
@ -264,6 +265,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
#endif
#endif
#endif
#endif
#if defined(SMP) && !defined(NO_AFFINITY) && !defined(USE_SIMPLE_THREADED_LEVEL3)
int nodes;
@ -417,8 +419,10 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
sb = (XFLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
#ifdef SMP
#if defined(USE_SIMPLE_THREADED_LEVEL3) || !defined(NO_AFFINITY)
mode |= (transa << BLAS_TRANSA_SHIFT);
mode |= (transb << BLAS_TRANSB_SHIFT);
#endif
MNK = (double) args.m * (double) args.n * (double) args.k;
if ( MNK <= (SMP_THRESHOLD_MIN * (double) GEMM_MULTITHREAD_THRESHOLD) )

View File

@ -107,7 +107,6 @@ void CNAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT dy1, FLOAT *dparam){
dq1 = dp1 * *dx1;
if(ABS(dq1) > ABS(dq2))
{
dflag = ZERO;
dh11 = ONE;
dh22 = ONE;
dh21 = - dy1 / *dx1;

View File

@ -187,10 +187,11 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
endif ()
# Makefile.L3
set(USE_TRMM false)
if (ARM OR ARM64 OR (TARGET_CORE MATCHES LONGSOON3B) OR (TARGET_CORE MATCHES GENERIC) OR (TARGET_CORE MATCHES HASWELL) OR (TARGET_CORE MATCHES ZEN) OR (TARGET_CORE MATCHES SKYLAKEX) OR (TARGET_CORE MATCHES COOPERLAKE))
string(TOUPPER ${TARGET_CORE} UC_TARGET_CORE)
if (ARM OR ARM64 OR (UC_TARGET_CORE MATCHES LONGSOON3B) OR (UC_TARGET_CORE MATCHES GENERIC) OR (UC_TARGET_CORE MATCHES HASWELL) OR (UC_TARGET_CORE MATCHES ZEN) OR (UC_TARGET_CORE MATCHES SKYLAKEX) OR (UC_TARGET_CORE MATCHES COOPERLAKE))
set(USE_TRMM true)
endif ()
if (ZARCH OR (TARGET_CORE MATCHES POWER8) OR (TARGET_CORE MATCHES POWER9) OR (TARGET_CORE MATCHES POWER10))
if (ZARCH OR (UC_TARGET_CORE MATCHES POWER8) OR (UC_TARGET_CORE MATCHES POWER9) OR (UC_TARGET_CORE MATCHES POWER10))
set(USE_TRMM true)
endif ()

View File

@ -36,7 +36,7 @@ ifeq ($(TARGET_CORE), COOPERLAKE)
ifeq ($(GCCVERSIONGTEQ10), 1)
override CFLAGS += -march=cooperlake
else
override CFLAGS += -march=skylake-avx512
override CFLAGS += -march=skylake-avx512 -mavx512f
endif
ifeq ($(OSNAME), CYGWIN_NT)
override CFLAGS += -fno-asynchronous-unwind-tables
@ -47,7 +47,7 @@ ifeq ($(TARGET_CORE), COOPERLAKE)
endif
endif
else ifeq ($(TARGET_CORE), SKYLAKEX)
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -march=skylake-avx512
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -march=skylake-avx512 -mavx512f
ifeq ($(OSNAME), CYGWIN_NT)
override CFLAGS += -fno-asynchronous-unwind-tables
endif

View File

@ -1,3 +1,11 @@
# FMAFLAG: extra compiler option added to the srot_k/drot_k kernel rules
# further down in this file.  It stays empty unless HAVE_FMA3 is defined
# and OLDGCC is not, in which case it becomes -mfma.
FMAFLAG=
ifndef OLDGCC
ifdef HAVE_FMA3
FMAFLAG = -mfma
endif
endif
### AMAX ###
ifndef SAMAXKERNEL
@ -828,10 +836,10 @@ $(KDIR)xnrm2_k$(TSUFFIX).$(SUFFIX) $(KDIR)xnrm2_k$(TPSUFFIX).$(PSUFFIX) : $(KE
$(CC) $(CFLAGS) -DCOMPLEX -c -DXDOUBLE $< -o $@
$(KDIR)srot_k$(TSUFFIX).$(SUFFIX) $(KDIR)srot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SROTKERNEL)
$(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -UDOUBLE $< -o $@
$(CC) -c $(CFLAGS) $(FMAFLAG) -UCOMPLEX -UCOMPLEX -UDOUBLE $< -o $@
$(KDIR)drot_k$(TSUFFIX).$(SUFFIX) $(KDIR)drot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DROTKERNEL)
$(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -DDOUBLE $< -o $@
$(CC) -c $(CFLAGS) $(FMAFLAG) -UCOMPLEX -UCOMPLEX -DDOUBLE $< -o $@
$(KDIR)qrot_k$(TSUFFIX).$(SUFFIX) $(KDIR)qrot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QROTKERNEL)
$(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -DXDOUBLE $< -o $@

View File

@ -1,5 +1,5 @@
/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
@ -27,36 +27,208 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
/*****************************************************
* 2014/06/09 Saar
*
* Order rowMajor
* Trans
*
******************************************************/
int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG ldb)
{
BLASLONG i, j;
FLOAT *aptr,*bptr;
FLOAT *a_offset, *a_offset1, *a_offset2, *a_offset3, *a_offset4;
FLOAT *b_offset, *b_offset1, *b_offset2, *b_offset3, *b_offset4;
if ( rows <= 0 ) return(0);
if ( cols <= 0 ) return(0);
if (rows <= 0) return 0;
if (cols <= 0) return 0;
aptr = a;
a_offset = a;
b_offset = b;
i = (rows >> 2);
if (i > 0) {
do {
a_offset1 = a_offset;
a_offset2 = a_offset1 + lda;
a_offset3 = a_offset2 + lda;
a_offset4 = a_offset3 + lda;
a_offset += 4 * lda;
b_offset1 = b_offset;
b_offset2 = b_offset1 + ldb;
b_offset3 = b_offset2 + ldb;
b_offset4 = b_offset3 + ldb;
b_offset += 4;
j = (cols >> 2);
if (j > 0) {
do {
/* Column 1 of MAT_B */
*(b_offset1 + 0) = *(a_offset1 + 0)*alpha; // Row 1 of MAT_A
*(b_offset2 + 0) = *(a_offset1 + 1)*alpha;
*(b_offset3 + 0) = *(a_offset1 + 2)*alpha;
*(b_offset4 + 0) = *(a_offset1 + 3)*alpha;
/* Column 2 of MAT_B */
*(b_offset1 + 1) = *(a_offset2 + 0)*alpha; // Row 2 of MAT_A
*(b_offset2 + 1) = *(a_offset2 + 1)*alpha;
*(b_offset3 + 1) = *(a_offset2 + 2)*alpha;
*(b_offset4 + 1) = *(a_offset2 + 3)*alpha;
/* Column 3 of MAT_B */
*(b_offset1 + 2) = *(a_offset3 + 0)*alpha; // Row 3 of MAT_A
*(b_offset2 + 2) = *(a_offset3 + 1)*alpha;
*(b_offset3 + 2) = *(a_offset3 + 2)*alpha;
*(b_offset4 + 2) = *(a_offset3 + 3)*alpha;
/* Column 4 of MAT_B */
*(b_offset1 + 3) = *(a_offset4 + 0)*alpha; // Row 4 of MAT_A
*(b_offset2 + 3) = *(a_offset4 + 1)*alpha;
*(b_offset3 + 3) = *(a_offset4 + 2)*alpha;
*(b_offset4 + 3) = *(a_offset4 + 3)*alpha;
a_offset1 += 4;
a_offset2 += 4;
a_offset3 += 4;
a_offset4 += 4;
b_offset1 += ldb * 4;
b_offset2 += ldb * 4;
b_offset3 += ldb * 4;
b_offset4 += ldb * 4;
j--;
} while (j > 0);
} // if(j > 0)
if (cols & 2) {
*(b_offset1 + 0) = *(a_offset1 + 0)*alpha;
*(b_offset2 + 0) = *(a_offset1 + 1)*alpha;
*(b_offset1 + 1) = *(a_offset2 + 0)*alpha;
*(b_offset2 + 1) = *(a_offset2 + 1)*alpha;
*(b_offset1 + 2) = *(a_offset3 + 0)*alpha;
*(b_offset2 + 2) = *(a_offset3 + 1)*alpha;
*(b_offset1 + 3) = *(a_offset4 + 0)*alpha;
*(b_offset2 + 3) = *(a_offset4 + 1)*alpha;
a_offset1 += 2;
a_offset2 += 2;
a_offset3 += 2;
a_offset4 += 2;
b_offset1 += ldb*2;
for ( i=0; i<rows ; i++ )
{
bptr = &b[i];
for(j=0; j<cols; j++)
{
bptr[j*ldb] = alpha * aptr[j];
}
aptr += lda;
}
return(0);
if (cols & 1) {
*(b_offset1 + 0) = *(a_offset1 + 0)*alpha;
*(b_offset1 + 1) = *(a_offset2 + 0)*alpha;
*(b_offset1 + 2) = *(a_offset3 + 0)*alpha;
*(b_offset1 + 3) = *(a_offset4 + 0)*alpha;
}
i--;
} while (i > 0);
}
if (rows & 2) {
a_offset1 = a_offset;
a_offset2 = a_offset1 + lda;
a_offset += 2 * lda;
b_offset1 = b_offset;
b_offset2 = b_offset1 + ldb;
b_offset3 = b_offset2 + ldb;
b_offset4 = b_offset3 + ldb;
b_offset += 2;
j = (cols >> 2);
if (j > 0){
do {
*(b_offset1 + 0) = *(a_offset1 + 0)*alpha;
*(b_offset2 + 0) = *(a_offset1 + 1)*alpha;
*(b_offset3 + 0) = *(a_offset1 + 2)*alpha;
*(b_offset4 + 0) = *(a_offset1 + 3)*alpha;
*(b_offset1 + 1) = *(a_offset2 + 0)*alpha;
*(b_offset2 + 1) = *(a_offset2 + 1)*alpha;
*(b_offset3 + 1) = *(a_offset2 + 2)*alpha;
*(b_offset4 + 1) = *(a_offset2 + 3)*alpha;
a_offset1 += 4;
a_offset2 += 4;
b_offset1 += ldb * 4;
b_offset2 += ldb * 4;
b_offset3 += ldb * 4;
b_offset4 += ldb * 4;
j--;
} while (j > 0);
}
if (cols & 2){
*(b_offset1 + 0) = *(a_offset1 + 0)*alpha;
*(b_offset2 + 0) = *(a_offset1 + 1)*alpha;
*(b_offset1 + 1) = *(a_offset2 + 0)*alpha;
*(b_offset2 + 1) = *(a_offset2 + 1)*alpha;
a_offset1 += 2;
a_offset2 += 2;
b_offset1 += ldb*2;
}
if (cols & 1){
*(b_offset1 + 0) = *(a_offset1 + 0)*alpha;
*(b_offset1 + 1) = *(a_offset2 + 0)*alpha;
}
} // if (rows & 2)
if (rows & 1) {
a_offset1 = a_offset;
a_offset += lda;
b_offset1 = b_offset;
b_offset2 = b_offset1 + ldb;
b_offset3 = b_offset2 + ldb;
b_offset4 = b_offset3 + ldb;
j = (cols >> 2);
if (j > 0){
do {
*(b_offset1 + 0) = *(a_offset1 + 0)*alpha;
*(b_offset2 + 0) = *(a_offset1 + 1)*alpha;
*(b_offset3 + 0) = *(a_offset1 + 2)*alpha;
*(b_offset4 + 0) = *(a_offset1 + 3)*alpha;
a_offset1 += 4;
b_offset1 += ldb * 4;
b_offset2 += ldb * 4;
b_offset3 += ldb * 4;
b_offset4 += ldb * 4;
j--;
} while (j > 0);
}
if (cols & 2){
*(b_offset1 + 0) = *(a_offset1 + 0)*alpha;
*(b_offset2 + 0) = *(a_offset1 + 1)*alpha;
a_offset1 += 2;
b_offset1 += ldb * 2;
}
if (cols & 1){
*(b_offset1 + 0) = *(a_offset1 + 0)*alpha;
}
}
return 0;
}

View File

@ -48,7 +48,7 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA
dot[0]=0.0;
dot[1]=0.0;
#if !defined(__PPC__) && !defined(__SunOS)
#if !defined(__PPC__) && !defined(__SunOS) && !defined(__PGI)
CREAL(result) = 0.0 ;
CIMAG(result) = 0.0 ;
#else
@ -73,7 +73,7 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA
i++ ;
}
#if !defined(__PPC__) && !defined(__SunOS)
#if !defined(__PPC__) && !defined(__SunOS) && !defined(__PGI)
CREAL(result) = dot[0];
CIMAG(result) = dot[1];
#else

View File

@ -97,9 +97,18 @@ CNRM2KERNEL = znrm2.S
ZNRM2KERNEL = znrm2.S
DDOTKERNEL = dot.S
ifneq ($(C_COMPILER), PGI)
SDOTKERNEL = ../generic/dot.c
else
SDOTKERNEL = dot.S
endif
ifneq ($(C_COMPILER), PGI)
CDOTKERNEL = zdot.S
ZDOTKERNEL = zdot.S
else
CDOTKERNEL = ../arm/zdot.c
ZDOTKERNEL = ../arm/zdot.c
endif
DSDOTKERNEL = dot.S
DGEMM_BETA = dgemm_beta.S

View File

@ -96,10 +96,19 @@ DNRM2KERNEL = nrm2.S
CNRM2KERNEL = znrm2.S
ZNRM2KERNEL = znrm2.S
DDOTKERNEL = dot.S
ifneq ($(C_COMPILER), PGI)
SDOTKERNEL = ../generic/dot.c
else
SDOTKERNEL = dot.S
endif
DDOTKERNEL = dot.S
ifneq ($(C_COMPILER), PGI)
CDOTKERNEL = zdot.S
ZDOTKERNEL = zdot.S
else
CDOTKERNEL = ../arm/zdot.c
ZDOTKERNEL = ../arm/zdot.c
endif
DSDOTKERNEL = dot.S
DGEMM_BETA = dgemm_beta.S

View File

@ -70,10 +70,19 @@ DCOPYKERNEL = copy.S
CCOPYKERNEL = copy.S
ZCOPYKERNEL = copy.S
ifneq ($(C_COMPILER), PGI)
SDOTKERNEL = ../generic/dot.c
else
SDOTKERNEL = dot.S
endif
DDOTKERNEL = dot.S
ifneq ($(C_COMPILER), PGI)
CDOTKERNEL = zdot.S
ZDOTKERNEL = zdot.S
else
CDOTKERNEL = ../arm/zdot.c
ZDOTKERNEL = ../arm/zdot.c
endif
DSDOTKERNEL = dot.S
SNRM2KERNEL = nrm2.S

View File

@ -47,8 +47,13 @@ ZCOPYKERNEL = copy.S
SDOTKERNEL = dot_thunderx.c
DDOTKERNEL = ddot_thunderx.c
ifneq ($(C_COMPILER), PGI)
CDOTKERNEL = zdot.S
ZDOTKERNEL = zdot.S
else
CDOTKERNEL = ../arm/zdot.c
ZDOTKERNEL = ../arm/zdot.c
endif
DSDOTKERNEL = dot.S
SNRM2KERNEL = nrm2.S

View File

@ -72,8 +72,13 @@ ZCOPYKERNEL = copy.S
SDOTKERNEL = dot.S
DDOTKERNEL = dot.S
ifneq ($(C_COMPILER), PGI)
CDOTKERNEL = zdot.S
ZDOTKERNEL = zdot.S
else
CDOTKERNEL = ../arm/zdot.c
ZDOTKERNEL = ../arm/zdot.c
endif
DSDOTKERNEL = dot.S
SNRM2KERNEL = nrm2.S

View File

@ -58,6 +58,7 @@ extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n
#define CUR_MAXINV "d8"
#define CUR_MAXINV_V "v8.2d"
#define CUR_MAX_V "v8.2d"
#define REGINF "d9"
static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x,
double *ssq, double *scale)
@ -79,8 +80,10 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x,
" ble 9f //nrm2_kernel_L999 \n"
"1: //nrm2_kernel_F_BEGIN: \n"
" mov x6, #0x7FF0000000000000 //+Infinity \n"
" fmov "REGZERO", xzr \n"
" fmov "REGONE", #1.0 \n"
" fmov "REGINF", x6 \n"
" lsl "INC_X", "INC_X", #"INC_SHIFT" \n"
" mov "J", "N" \n"
" cmp "J", xzr \n"
@ -104,6 +107,8 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x,
" ldr d4, ["X"] \n"
" fabs d4, d4 \n"
" fmax "CUR_MAX", "SCALE", d4 \n"
" fcmp "CUR_MAX", "REGINF" \n"
" beq 10f \n"
" fdiv "SCALE", "SCALE", "CUR_MAX" \n"
" fmul "SCALE", "SCALE", "SCALE" \n"
" fmul "SSQ", "SSQ", "SCALE" \n"
@ -116,6 +121,8 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x,
" ldr d3, ["X", #8] \n"
" fabs d3, d3 \n"
" fmax "CUR_MAX", "SCALE", d3 \n"
" fcmp "CUR_MAX", "REGINF" \n"
" beq 10f \n"
" fdiv "SCALE", "SCALE", "CUR_MAX" \n"
" fmul "SCALE", "SCALE", "SCALE" \n"
" fmul "SSQ", "SSQ", "SCALE" \n"
@ -158,6 +165,8 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x,
" fmaxp v24.2d, v24.2d, v26.2d \n"
" fmaxp v24.2d, v24.2d, v24.2d \n"
" fmax "CUR_MAX", "SCALE", d24 \n"
" fcmp "CUR_MAX", "REGINF" \n"
" beq 10f \n"
" fdiv "CUR_MAXINV", "REGONE", "CUR_MAX" \n"
" //dup "CUR_MAX_V", v7.d[0] \n"
" fdiv "SCALE", "SCALE", "CUR_MAX" \n"
@ -217,6 +226,8 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x,
" fmaxp v24.2d, v24.2d, v26.2d \n"
" fmaxp v24.2d, v24.2d, v24.2d \n"
" fmax "CUR_MAX", "SCALE", d24 \n"
" fcmp "CUR_MAX", "REGINF" \n"
" beq 10f \n"
" fdiv "CUR_MAXINV", "REGONE", "CUR_MAX" \n"
" //dup "CUR_MAX_V", v7.d[0] \n"
" fdiv "SCALE", "SCALE", "CUR_MAX" \n"
@ -265,6 +276,8 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x,
" ldr d4, ["X"] \n"
" fabs d4, d4 \n"
" fmax "CUR_MAX", "SCALE", d4 \n"
" fcmp "CUR_MAX", "REGINF" \n"
" beq 10f \n"
" fdiv "SCALE", "SCALE", "CUR_MAX" \n"
" fmul "SCALE", "SCALE", "SCALE" \n"
" fmul "SSQ", "SSQ", "SCALE" \n"
@ -276,6 +289,8 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x,
" ldr d3, ["X", #8] \n"
" fabs d3, d3 \n"
" fmax "CUR_MAX", "SCALE", d3 \n"
" fcmp "CUR_MAX", "REGINF" \n"
" beq 10f \n"
" fdiv "SCALE", "SCALE", "CUR_MAX" \n"
" fmul "SCALE", "SCALE", "SCALE" \n"
" fmul "SSQ", "SSQ", "SCALE" \n"
@ -291,6 +306,11 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x,
"9: //nrm2_kernel_L999: \n"
" str "SSQ", [%[SSQ_]] \n"
" str "SCALE", [%[SCALE_]] \n"
" b 11f \n"
"10: \n"
" str "REGINF", [%[SSQ_]] \n"
" str "REGINF", [%[SCALE_]] \n"
"11: \n"
:
: [SSQ_] "r" (ssq), //%0
@ -300,7 +320,7 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x,
[INCX_] "r" (inc_x) //%4
: "cc",
"memory",
"x0", "x1", "x2", "x3", "x4", "x5",
"x0", "x1", "x2", "x3", "x4", "x5", "x6",
/* d9 is REGINF: the asm loads +Infinity into it, so it must be listed as clobbered as well */
"d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8", "d9"
);
@ -359,6 +379,12 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
cur_ssq = *ptr;
cur_scale = *(ptr + 1);
if (cur_ssq == INFINITY) {
ssq = INFINITY;
scale = INFINITY;
break;
}
if (cur_scale != 0) {
if (cur_scale > scale) {
scale = (scale / cur_scale);

View File

@ -154,11 +154,7 @@ ZCOPYKERNEL = zcopy_power10.c
SDOTKERNEL = sdot_power10.c
DDOTKERNEL = ddot_power10.c
DSDOTKERNEL = sdot_power10.c
ifneq ($(GCCVERSIONGTEQ9),1)
CDOTKERNEL = cdot_power9.S
else
CDOTKERNEL = cdot.c
endif
ZDOTKERNEL = zdot.c
#
SNRM2KERNEL = ../arm/nrm2.c

View File

@ -0,0 +1,115 @@
/***************************************************************************
Copyright (c) 2020, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/* Tells the including source file that an optimized copy_kernel is provided here. */
#define HAVE_KERNEL 1
/*
 * Software-pipelined block copy from x to y.
 *
 * n: element count; it is decremented by 32 for every 256-byte block, and
 *    the first block is loaded before n is tested, so at least one full
 *    block is always copied.  The caller is therefore expected to pass a
 *    positive multiple of 32.
 *    NOTE(review): 256 bytes per 32 elements implies 8-byte elements
 *    (e.g. single-precision complex) - confirm against FLOAT/COMPSIZE.
 * x: source pointer (advanced inside the asm).
 * y: destination pointer (advanced inside the asm).
 *
 * Registers vs32-vs47 are used as the staging buffer and are declared
 * clobbered, together with cr0 which the addic. instructions update.
 */
static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y)
{
__asm__
(
/* Prologue: load the first 256-byte block of x into vs32..vs47. */
"lxvp 32, 0(%2) \n\t"
"lxvp 34, 32(%2) \n\t"
"lxvp 36, 64(%2) \n\t"
"lxvp 38, 96(%2) \n\t"
"lxvp 40, 128(%2) \n\t"
"lxvp 42, 160(%2) \n\t"
"lxvp 44, 192(%2) \n\t"
"lxvp 46, 224(%2) \n\t"
"addi %2, %2, 256 \n\t"
/* n -= 32; if nothing is left, skip the loop and just flush the block. */
"addic. %1, %1, -32 \n\t"
"ble two%= \n\t"
".align 5 \n"
/* Main loop: store the block already in registers while loading the next one. */
"one%=: \n\t"
/* Within each register pair the odd register is stored first.
   NOTE(review): presumably the odd register of an lxvp pair holds the
   lower-addressed 16 bytes on little-endian - confirm against the ISA. */
"stxv 33, 0(%3) \n\t"
"stxv 32, 16(%3) \n\t"
"stxv 35, 32(%3) \n\t"
"stxv 34, 48(%3) \n\t"
"stxv 37, 64(%3) \n\t"
"stxv 36, 80(%3) \n\t"
"stxv 39, 96(%3) \n\t"
"stxv 38, 112(%3) \n\t"
/* vs32..vs39 have been written out; refill them from the next block. */
"lxvp 32, 0(%2) \n\t"
"lxvp 34, 32(%2) \n\t"
"lxvp 36, 64(%2) \n\t"
"lxvp 38, 96(%2) \n\t"
"stxv 41, 128(%3) \n\t"
"stxv 40, 144(%3) \n\t"
"stxv 43, 160(%3) \n\t"
"stxv 42, 176(%3) \n\t"
"stxv 45, 192(%3) \n\t"
"stxv 44, 208(%3) \n\t"
"stxv 47, 224(%3) \n\t"
"stxv 46, 240(%3) \n\t"
/* vs40..vs47 have been written out; refill them as well. */
"lxvp 40, 128(%2) \n\t"
"lxvp 42, 160(%2) \n\t"
"lxvp 44, 192(%2) \n\t"
"lxvp 46, 224(%2) \n\t"
"addi %3, %3, 256 \n\t"
"addi %2, %2, 256 \n\t"
"addic. %1, %1, -32 \n\t"
"bgt one%= \n"
/* Epilogue: store the last block that is still held in vs32..vs47. */
"two%=: \n\t"
"stxv 33, 0(%3) \n\t"
"stxv 32, 16(%3) \n\t"
"stxv 35, 32(%3) \n\t"
"stxv 34, 48(%3) \n\t"
"stxv 37, 64(%3) \n\t"
"stxv 36, 80(%3) \n\t"
"stxv 39, 96(%3) \n\t"
"stxv 38, 112(%3) \n\t"
"stxv 41, 128(%3) \n\t"
"stxv 40, 144(%3) \n\t"
"stxv 43, 160(%3) \n\t"
"stxv 42, 176(%3) \n\t"
"stxv 45, 192(%3) \n\t"
"stxv 44, 208(%3) \n\t"
"stxv 47, 224(%3) \n\t"
"stxv 46, 240(%3) \n\t"
"#n=%1 x=%4=%2 y=%0=%3"
:
"=m" (*y),
"+r" (n), // 1
"+b" (x), // 2
"+b" (y) // 3
:
"m" (*x)
:
"cr0",
"vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
"vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47"
);
}

View File

@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
#if defined(__VEC__) || defined(__ALTIVEC__)
#include "copy_microk_power10.c"
#include "ccopy_microk_power10.c"
#endif
#ifndef HAVE_KERNEL
@ -86,7 +86,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
if ( (inc_x == 1) && (inc_y == 1 ))
{
BLASLONG n1 = n & -64;
BLASLONG n1 = n & -32;
if ( n1 > 0 )
{
copy_kernel(n1, x, y);

View File

@ -28,6 +28,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#else
#include "common.h"
#if defined(POWER10)
#include "cdot_microk_power10.c"
#else
#ifndef HAVE_KERNEL_8
#include <altivec.h>
@ -99,6 +102,7 @@ static void cdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, float *dot)
}
#endif
#endif
OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {
@ -116,7 +120,11 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA
if ((inc_x == 1) && (inc_y == 1)) {
#if defined(POWER10)
BLASLONG n1 = n & -16;
#else
BLASLONG n1 = n & -8;
#endif
BLASLONG j=0;
if (n1){

View File

@ -0,0 +1,177 @@
/***************************************************************************
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define HAVE_KERNEL_8 1
/*
 * POWER10 micro-kernel for the single-precision complex dot product.
 *
 *   n   - number of complex elements to process.  The kernel preloads 16
 *         elements unconditionally and counts n down in steps of 16, so n
 *         must be a positive multiple of 16 (the caller passes n & -16).
 *   x,y - interleaved (real, imag) float arrays holding n complex elements.
 *   dot - receives four floats (16 bytes, written with a single stxv).
 *         They are the reduced partial sums of the real*real, imag*imag,
 *         real*imag and imag*real products (see the per-instruction
 *         comments).  NOTE(review): the lane order inside dot[] is not
 *         evident from this block alone - confirm against the caller.
 *
 * x and y are advanced inside the asm; being by-value parameters this is
 * invisible to the caller.
 */
static void cdot_kernel_8 (long n, float *x, float *y, float *dot)
{
  /* xxperm control vector: applied to every y vector to obtain the element
     order used for the cross products (x_r * y_i, x_i * y_r). */
  __vector unsigned char mask = { 11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4};
  __asm__
  (
    "dcbt 0, %2 \n\t"
    "dcbt 0, %3 \n\t"
    /* vs32..vs39: eight accumulators, cleared to zero. */
    "xxlxor 32, 32, 32 \n\t"
    "xxlxor 33, 33, 33 \n\t"
    "xxlxor 34, 34, 34 \n\t"
    "xxlxor 35, 35, 35 \n\t"
    "xxlxor 36, 36, 36 \n\t"
    "xxlxor 37, 37, 37 \n\t"
    "xxlxor 38, 38, 38 \n\t"
    "xxlxor 39, 39, 39 \n\t"
    /* Preload the first 128 bytes of x (vs40..vs47) and y (vs48..vs55). */
    "lxvp 40, 0(%2) \n\t"
    "lxvp 42, 32(%2) \n\t"
    "lxvp 44, 64(%2) \n\t"
    "lxvp 46, 96(%2) \n\t"
    "lxvp 48, 0(%3) \n\t"
    "lxvp 50, 32(%3) \n\t"
    "lxvp 52, 64(%3) \n\t"
    "lxvp 54, 96(%3) \n\t"
    /* vs56..vs63: permuted copies of the y vectors. */
    "xxperm 56, 48, %x7 \n\t"
    "xxperm 57, 49, %x7 \n\t"
    "xxperm 58, 50, %x7 \n\t"
    "xxperm 59, 51, %x7 \n\t"
    "xxperm 60, 52, %x7 \n\t"
    "xxperm 61, 53, %x7 \n\t"
    "xxperm 62, 54, %x7 \n\t"
    "xxperm 63, 55, %x7 \n\t"
    "addi %2, %2, 128 \n\t"
    "addi %3, %3, 128 \n\t"
    "addic. %1, %1, -16 \n\t"
    /* Only one chunk of 16: skip the pipelined loop, go to the tail. */
    "ble two%= \n\t"
    ".align 5 \n"
    /* Main loop: consume the chunk already in registers while loading the
       next one; each register is reloaded only after its last use. */
    "one%=: \n\t"
    "xvmaddasp 32, 40, 48 \n\t" // x0_r * y0_r , x0_i * y0_i
    "xvmaddasp 34, 41, 49 \n\t" // x1_r * y1_r , x1_i * y1_i
    "lxvp 48, 0(%3) \n\t"
    "xvmaddasp 36, 42, 50 \n\t" // x2_r * y2_r , x2_i * y2_i
    "xvmaddasp 38, 43, 51 \n\t" // x3_r * y3_r , x3_i * y3_i
    "lxvp 50, 32(%3) \n\t"
    "xvmaddasp 33, 40, 56 \n\t" // x0_r * y0_i , x0_i * y0_r
    "xvmaddasp 35, 41, 57 \n\t" // x1_r * y1_i , x1_i * y1_r
    "lxvp 40, 0(%2) \n\t"
    "xvmaddasp 37, 42, 58 \n\t" // x2_r * y2_i , x2_i * y2_r
    "xvmaddasp 39, 43, 59 \n\t" // x3_r * y3_i , x3_i * y3_r
    "lxvp 42, 32(%2) \n\t"
    "xxperm 56, 48, %x7 \n\t"
    "xxperm 57, 49, %x7 \n\t"
    "xxperm 58, 50, %x7 \n\t"
    "xxperm 59, 51, %x7 \n\t"
    "xvmaddasp 32, 44, 52 \n\t" // x0_r * y0_r , x0_i * y0_i
    "xvmaddasp 34, 45, 53 \n\t" // x1_r * y1_r , x1_i * y1_i
    "lxvp 52, 64(%3) \n\t"
    "xvmaddasp 36, 46, 54 \n\t" // x2_r * y2_r , x2_i * y2_i
    "xvmaddasp 38, 47, 55 \n\t" // x3_r * y3_r , x3_i * y3_i
    "lxvp 54, 96(%3) \n\t"
    "xvmaddasp 33, 44, 60 \n\t" // x0_r * y0_i , x0_i * y0_r
    "xvmaddasp 35, 45, 61 \n\t" // x1_r * y1_i , x1_i * y1_r
    "lxvp 44, 64(%2) \n\t"
    "xvmaddasp 37, 46, 62 \n\t" // x2_r * y2_i , x2_i * y2_r
    "xvmaddasp 39, 47, 63 \n\t" // x3_r * y3_i , x3_i * y3_r
    "lxvp 46, 96(%2) \n\t"
    "xxperm 60, 52, %x7 \n\t"
    "xxperm 61, 53, %x7 \n\t"
    "xxperm 62, 54, %x7 \n\t"
    "xxperm 63, 55, %x7 \n\t"
    "addi %2, %2, 128 \n\t"
    "addi %3, %3, 128 \n\t"
    "addic. %1, %1, -16 \n\t"
    "bgt one%= \n"
    /* Tail: accumulate the last chunk that is already in registers. */
    "two%=: \n\t"
    "xvmaddasp 32, 40, 48 \n\t" // x0_r * y0_r , x0_i * y0_i
    "xvmaddasp 34, 41, 49 \n\t" // x1_r * y1_r , x1_i * y1_i
    "xvmaddasp 36, 42, 50 \n\t" // x2_r * y2_r , x2_i * y2_i
    "xvmaddasp 38, 43, 51 \n\t" // x3_r * y3_r , x3_i * y3_i
    "xvmaddasp 33, 40, 56 \n\t" // x0_r * y0_i , x0_i * y0_r
    "xvmaddasp 35, 41, 57 \n\t" // x1_r * y1_i , x1_i * y1_r
    "xvmaddasp 37, 42, 58 \n\t" // x2_r * y2_i , x2_i * y2_r
    "xvmaddasp 39, 43, 59 \n\t" // x3_r * y3_i , x3_i * y3_r
    "xvmaddasp 32, 44, 52 \n\t" // x0_r * y0_r , x0_i * y0_i
    "xvmaddasp 34, 45, 53 \n\t" // x1_r * y1_r , x1_i * y1_i
    "xvmaddasp 36, 46, 54 \n\t" // x2_r * y2_r , x2_i * y2_i
    "xvmaddasp 38, 47, 55 \n\t" // x3_r * y3_r , x3_i * y3_i
    "xvmaddasp 33, 44, 60 \n\t" // x0_r * y0_i , x0_i * y0_r
    "xvmaddasp 35, 45, 61 \n\t" // x1_r * y1_i , x1_i * y1_r
    "xvmaddasp 37, 46, 62 \n\t" // x2_r * y2_i , x2_i * y2_r
    "xvmaddasp 39, 47, 63 \n\t" // x3_r * y3_i , x3_i * y3_r
    /* Reduce the eight accumulators to one vector and store it to dot. */
    "xvaddsp 32, 32, 34 \n\t"
    "xvaddsp 36, 36, 38 \n\t"
    "xvaddsp 33, 33, 35 \n\t"
    "xvaddsp 37, 37, 39 \n\t"
    "xvaddsp 35, 32, 36 \n\t"
    "xvaddsp 34, 33, 37 \n\t"
    "xxswapd 32, 35 \n\t"
    "xxswapd 33, 34 \n\t"
    "xvaddsp 35, 35, 32 \n\t"
    "xvaddsp 34, 34, 33 \n\t"
    "xxpermdi 34, 34, 35, 2 \n\t"
    "stxv 34, 0(%6) \n\t"
    "#n=%1 x=%4=%2 y=%5=%3 dot=%0=%6"
    :
    "=m" (*dot),
    "+r" (n), // 1
    "+b" (x), // 2
    "+b" (y) // 3
    :
    "m" (*x),
    "m" (*y),
    "b" (dot), // 6
    "wa" (mask)
    :
    "cr0",
    /* The "m" operands above describe a single float each, but the asm
       reads 2*n floats from x and y and stores four floats to dot; the
       "memory" clobber makes those accesses visible to the compiler. */
    "memory",
    "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
    "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
    "vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55",
    "vs56","vs57","vs58","vs59","vs60","vs61","vs62","vs63"
  );
}

View File

@ -62,38 +62,39 @@ static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y)
"one%=: \n\t"
"stxvp 32, 0(%3) \n\t"
"lxvp 32, 0(%2) \n\t"
"stxvp 34, 32(%3) \n\t"
"lxvp 34, 32(%2) \n\t"
"stxvp 36, 64(%3) \n\t"
"lxvp 36, 64(%2) \n\t"
"stxvp 38, 96(%3) \n\t"
"lxvp 32, 0(%2) \n\t"
"lxvp 34, 32(%2) \n\t"
"lxvp 36, 64(%2) \n\t"
"lxvp 38, 96(%2) \n\t"
"stxvp 40, 128(%3) \n\t"
"lxvp 40, 128(%2) \n\t"
"stxvp 42, 160(%3) \n\t"
"lxvp 42, 160(%2) \n\t"
"stxvp 44, 192(%3) \n\t"
"lxvp 44, 192(%2) \n\t"
"stxvp 46, 224(%3) \n\t"
"lxvp 40, 128(%2) \n\t"
"lxvp 42, 160(%2) \n\t"
"lxvp 44, 192(%2) \n\t"
"lxvp 46, 224(%2) \n\t"
"stxvp 48, 256(%3) \n\t"
"lxvp 48, 256(%2) \n\t"
"stxvp 50, 288(%3) \n\t"
"lxvp 50, 288(%2) \n\t"
"stxvp 52, 320(%3) \n\t"
"lxvp 52, 320(%2) \n\t"
"stxvp 54, 352(%3) \n\t"
"lxvp 48, 256(%2) \n\t"
"lxvp 50, 288(%2) \n\t"
"lxvp 52, 320(%2) \n\t"
"lxvp 54, 352(%2) \n\t"
"stxvp 56, 384(%3) \n\t"
"lxvp 56, 384(%2) \n\t"
"stxvp 58, 416(%3) \n\t"
"lxvp 58, 416(%2) \n\t"
"stxvp 60, 448(%3) \n\t"
"lxvp 60, 448(%2) \n\t"
"stxvp 62, 480(%3) \n\t"
"lxvp 56, 384(%2) \n\t"
"lxvp 58, 416(%2) \n\t"
"lxvp 60, 448(%2) \n\t"
"lxvp 62, 480(%2) \n\t"
"addi %3, %3, 512 \n\t"

View File

@ -0,0 +1,176 @@
/***************************************************************************
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define HAVE_KERNEL_8 1
/*
 * POWER10 micro-kernel: in-place scaling of a single-precision complex
 * vector by the complex scalar (alpha_r, alpha_i).
 *
 *   n       - number of complex elements.  The kernel preloads 16 elements
 *             unconditionally and counts n down in steps of 16, so n must be
 *             a positive multiple of 16.
 *   x       - interleaved (real, imag) float array, updated in place.
 *   alpha_r - real part of the scale factor.
 *   alpha_i - imaginary part of the scale factor.
 *
 * Per element: x_r' = x_r*alpha_r - x_i*alpha_i,
 *              x_i' = x_i*alpha_r + x_r*alpha_i.
 */
static void zscal_kernel_8 (long n, float *x, float alpha_r, float alpha_i)
{
  /* Multiplier for the permuted input: yields (-x_i*alpha_i, x_r*alpha_i). */
  __vector float t0 = {-alpha_i, alpha_i, -alpha_i, alpha_i};
  /* xxperm control vector used to build the permuted copy of each x vector. */
  __vector unsigned char mask = { 11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4};
  __asm__
  (
    "dcbt 0, %2 \n\t"
    /* vs32 = alpha_r converted to single precision and splatted. */
    "xscvdpspn 32, %x3 \n\t"
    "xxspltw 32, 32, 0 \n\t"
    /* Preload the first 128 bytes of x into vs40..vs47. */
    "lxvp 40, 0(%2) \n\t"
    "lxvp 42, 32(%2) \n\t"
    "lxvp 44, 64(%2) \n\t"
    "lxvp 46, 96(%2) \n\t"
    "addic. %1, %1, -16 \n\t"
    /* Only one chunk of 16: skip the pipelined loop, go to the tail. */
    "ble two%= \n\t"
    ".align 5 \n"
    /* Main loop: scale the chunk held in registers, load the next chunk
       from offset 128, store the results over the current chunk. */
    "one%=: \n\t"
    "xvmulsp 48, 40, 32 \n\t" // x0_r * alpha_r, x0_i * alpha_r
    "xvmulsp 49, 41, 32 \n\t"
    "xvmulsp 50, 42, 32 \n\t"
    "xvmulsp 51, 43, 32 \n\t"
    "xvmulsp 52, 44, 32 \n\t"
    "xvmulsp 53, 45, 32 \n\t"
    "xvmulsp 54, 46, 32 \n\t"
    "xvmulsp 55, 47, 32 \n\t"
    "xxperm 34, 40, %x5 \n\t"
    "xxperm 35, 41, %x5 \n\t"
    "xxperm 36, 42, %x5 \n\t"
    "xxperm 37, 43, %x5 \n\t"
    "xxperm 38, 44, %x5 \n\t"
    "xxperm 39, 45, %x5 \n\t"
    "xxperm 56, 46, %x5 \n\t"
    "xxperm 57, 47, %x5 \n\t"
    "xvmulsp 34, 34, %x4 \n\t" // x0_i * -alpha_i, x0_r * alpha_i
    "xvmulsp 35, 35, %x4 \n\t"
    "lxvp 40, 128(%2) \n\t"
    "xvmulsp 36, 36, %x4 \n\t"
    "xvmulsp 37, 37, %x4 \n\t"
    "lxvp 42, 160(%2) \n\t"
    "xvmulsp 38, 38, %x4 \n\t"
    "xvmulsp 39, 39, %x4 \n\t"
    "lxvp 44, 192(%2) \n\t"
    "xvmulsp 56, 56, %x4 \n\t"
    "xvmulsp 57, 57, %x4 \n\t"
    "lxvp 46, 224(%2) \n\t"
    "xvaddsp 48, 48, 34 \n\t"
    "xvaddsp 49, 49, 35 \n\t"
    "xvaddsp 50, 50, 36 \n\t"
    "xvaddsp 51, 51, 37 \n\t"
    "stxvp 48, 0(%2) \n\t"
    "xvaddsp 52, 52, 38 \n\t"
    "xvaddsp 53, 53, 39 \n\t"
    "stxvp 50, 32(%2) \n\t"
    "xvaddsp 54, 54, 56 \n\t"
    "xvaddsp 55, 55, 57 \n\t"
    "stxvp 52, 64(%2) \n\t"
    "stxvp 54, 96(%2) \n\t"
    "addi %2, %2, 128 \n\t"
    "addic. %1, %1, -16 \n\t"
    "bgt one%= \n"
    /* Tail: scale and store the last chunk that is already in registers. */
    "two%=: \n\t"
    "xvmulsp 48, 40, 32 \n\t" // x0_r * alpha_r, x0_i * alpha_r
    "xvmulsp 49, 41, 32 \n\t"
    "xvmulsp 50, 42, 32 \n\t"
    "xvmulsp 51, 43, 32 \n\t"
    "xvmulsp 52, 44, 32 \n\t"
    "xvmulsp 53, 45, 32 \n\t"
    "xvmulsp 54, 46, 32 \n\t"
    "xvmulsp 55, 47, 32 \n\t"
    "xxperm 34, 40, %x5 \n\t"
    "xxperm 35, 41, %x5 \n\t"
    "xxperm 36, 42, %x5 \n\t"
    "xxperm 37, 43, %x5 \n\t"
    "xxperm 38, 44, %x5 \n\t"
    "xxperm 39, 45, %x5 \n\t"
    "xxperm 56, 46, %x5 \n\t"
    "xxperm 57, 47, %x5 \n\t"
    "xvmulsp 34, 34, %x4 \n\t" // x0_i * -alpha_i, x0_r * alpha_i
    "xvmulsp 35, 35, %x4 \n\t"
    "xvmulsp 36, 36, %x4 \n\t"
    "xvmulsp 37, 37, %x4 \n\t"
    "xvmulsp 38, 38, %x4 \n\t"
    "xvmulsp 39, 39, %x4 \n\t"
    "xvmulsp 56, 56, %x4 \n\t"
    "xvmulsp 57, 57, %x4 \n\t"
    "xvaddsp 48, 48, 34 \n\t"
    "xvaddsp 49, 49, 35 \n\t"
    "xvaddsp 50, 50, 36 \n\t"
    "xvaddsp 51, 51, 37 \n\t"
    "stxvp 48, 0(%2) \n\t"
    "xvaddsp 52, 52, 38 \n\t"
    "xvaddsp 53, 53, 39 \n\t"
    "stxvp 50, 32(%2) \n\t"
    "xvaddsp 54, 54, 56 \n\t"
    "xvaddsp 55, 55, 57 \n\t"
    "stxvp 52, 64(%2) \n\t"
    "stxvp 54, 96(%2) \n\t"
    "#n=%1 x=%0=%2 alpha=(%3,%4)\n"
    :
    "+m" (*x),
    "+r" (n), // 1
    "+b" (x) // 2
    :
    "f" (alpha_r), // 3
    "wa" (t0), // 4
    "wa" (mask) // 5
    :
    "cr0",
    /* "+m" (*x) covers only the first float, while the asm reads and
       rewrites 2*n floats; "memory" tells the compiler about the rest. */
    "memory",
    "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
    "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
    "vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55",
    "vs56","vs57"
  );
}

View File

@ -36,9 +36,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
#if defined(__VEC__) || defined(__ALTIVEC__)
#if defined(POWER8) || defined(POWER9)
#include "cswap_microk_power8.c"
#elif defined(POWER10)
#include "cswap_microk_power10.c"
#endif
#endif

View File

@ -0,0 +1,127 @@
/***************************************************************************
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#if defined(DOUBLE)
#define HAVE_KERNEL_16 1
static void zswap_kernel_16 (long n, double *x, double *y)
#else
#define HAVE_KERNEL_32 1
static void cswap_kernel_32 (long n, float *x, float *y)
#endif
{
__asm__
(
".align 5 \n"
"one%=: \n\t"
"lxvp 32, 0(%4) \n\t"
"lxvp 34, 32(%4) \n\t"
"lxvp 36, 64(%4) \n\t"
"lxvp 38, 96(%4) \n\t"
"lxvp 40, 128(%4) \n\t"
"lxvp 42, 160(%4) \n\t"
"lxvp 44, 192(%4) \n\t"
"lxvp 46, 224(%4) \n\t"
"lxvp 48, 0(%3) \n\t"
"lxvp 50, 32(%3) \n\t"
"lxvp 52, 64(%3) \n\t"
"lxvp 54, 96(%3) \n\t"
"lxvp 56, 128(%3) \n\t"
"lxvp 58, 160(%3) \n\t"
"lxvp 60, 192(%3) \n\t"
"lxvp 62, 224(%3) \n\t"
"stxv 33, 0(%3) \n\t"
"stxv 32, 16(%3) \n\t"
"stxv 35, 32(%3) \n\t"
"stxv 34, 48(%3) \n\t"
"stxv 37, 64(%3) \n\t"
"stxv 36, 80(%3) \n\t"
"stxv 39, 96(%3) \n\t"
"stxv 38, 112(%3) \n\t"
"addi %3, %3, 128 \n\t"
"stxv 41, 0(%3) \n\t"
"stxv 40, 16(%3) \n\t"
"stxv 43, 32(%3) \n\t"
"stxv 42, 48(%3) \n\t"
"stxv 45, 64(%3) \n\t"
"stxv 44, 80(%3) \n\t"
"stxv 47, 96(%3) \n\t"
"stxv 46, 112(%3) \n\t"
"addi %3, %3, 128 \n\t"
"stxv 49, 0(%4) \n\t"
"stxv 48, 16(%4) \n\t"
"stxv 51, 32(%4) \n\t"
"stxv 50, 48(%4) \n\t"
"stxv 53, 64(%4) \n\t"
"stxv 52, 80(%4) \n\t"
"stxv 55, 96(%4) \n\t"
"stxv 54, 112(%4) \n\t"
"addi %4, %4, 128 \n\t"
"stxv 57, 0(%4) \n\t"
"stxv 56, 16(%4) \n\t"
"stxv 59, 32(%4) \n\t"
"stxv 58, 48(%4) \n\t"
"stxv 61, 64(%4) \n\t"
"stxv 60, 80(%4) \n\t"
"stxv 63, 96(%4) \n\t"
"stxv 62, 112(%4) \n\t"
"addi %4, %4, 128 \n\t"
#if defined(DOUBLE)
"addic. %2, %2, -16 \n\t"
#else
"addic. %2, %2, -32 \n\t"
#endif
"bgt one%= \n"
"#n=%2 x=%0=%3 y=%1=%4"
:
"+m" (*x),
"+m" (*y),
"+r" (n), // 2
"+b" (x), // 3
"+b" (y) // 4
:
:
"cr0",
"vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
"vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
"vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55",
"vs56","vs57","vs58","vs59","vs60","vs61","vs62","vs63"
);
}

View File

@ -46,9 +46,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
#if defined(__VEC__) || defined(__ALTIVEC__)
#if defined(POWER8) || defined(POWER9)
#include "dasum_microk_power8.c"
#elif defined(POWER10)
#include "dasum_microk_power10.c"
#endif
#endif
@ -110,6 +112,21 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
if ( inc_x == 1 )
{
#if defined(POWER10)
if ( n >= 16 )
{
BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3;
for (i = 0; i < align; i++) {
sumf += ABS(x[i]);
}
}
n1 = (n-i) & -16;
if ( n1 > 0 )
{
sumf += dasum_kernel_16(n1, &x[i]);
i+=n1;
}
#else
n1 = n & -16;
if ( n1 > 0 )
{
@ -117,6 +134,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
sumf = dasum_kernel_16(n1, x);
i=n1;
}
#endif
while(i < n)
{

View File

@ -0,0 +1,152 @@
/***************************************************************************
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define HAVE_KERNEL_16 1
/*
 * POWER10 micro-kernel: returns the sum of the absolute values of n doubles.
 *
 *   n - number of doubles.  The kernel preloads 16 values unconditionally
 *       and counts n down in steps of 16, so n must be a positive multiple
 *       of 16.
 *   x - input array (read only).
 *
 * Eight vector accumulators (vs32..vs39) are kept and folded into a single
 * scalar at the end.
 */
static double dasum_kernel_16 (long n, double *x)
{
  double sum;
  /* Scratch vector registers chosen by the compiler; they hold four of the
     eight absolute-value temporaries and are not used after the asm. */
  __vector double t0;
  __vector double t1;
  __vector double t2;
  __vector double t3;
  __asm__
  (
    "dcbt 0, %2 \n\t"
    /* Clear the accumulators. */
    "xxlxor 32, 32, 32 \n\t"
    "xxlxor 33, 33, 33 \n\t"
    "xxlxor 34, 34, 34 \n\t"
    "xxlxor 35, 35, 35 \n\t"
    "xxlxor 36, 36, 36 \n\t"
    "xxlxor 37, 37, 37 \n\t"
    "xxlxor 38, 38, 38 \n\t"
    "xxlxor 39, 39, 39 \n\t"
    /* Preload the first 16 doubles into vs40..vs47. */
    "lxvp 40, 0(%2) \n\t"
    "lxvp 42, 32(%2) \n\t"
    "lxvp 44, 64(%2) \n\t"
    "lxvp 46, 96(%2) \n\t"
    "addi %2, %2, 128 \n\t"
    "addic. %1, %1, -16 \n\t"
    /* Only one chunk of 16: skip the pipelined loop, go to the tail. */
    "ble two%= \n\t"
    ".align 5 \n"
    /* Main loop: take |x| of the chunk in registers while loading the next
       chunk; each input register is reloaded only after it has been read. */
    "one%=: \n\t"
    "xvabsdp 48, 40 \n\t"
    "xvabsdp 49, 41 \n\t"
    "xvabsdp 50, 42 \n\t"
    "xvabsdp 51, 43 \n\t"
    "lxvp 40, 0(%2) \n\t"
    "xvabsdp %x3, 44 \n\t"
    "xvabsdp %x4, 45 \n\t"
    "lxvp 42, 32(%2) \n\t"
    "xvabsdp %x5, 46 \n\t"
    "xvabsdp %x6, 47 \n\t"
    "lxvp 44, 64(%2) \n\t"
    "xvadddp 32, 32, 48 \n\t"
    "xvadddp 33, 33, 49 \n\t"
    "lxvp 46, 96(%2) \n\t"
    "xvadddp 34, 34, 50 \n\t"
    "xvadddp 35, 35, 51 \n\t"
    "addi %2, %2, 128 \n\t"
    "xvadddp 36, 36, %x3 \n\t"
    "xvadddp 37, 37, %x4 \n\t"
    "addic. %1, %1, -16 \n\t"
    "xvadddp 38, 38, %x5 \n\t"
    "xvadddp 39, 39, %x6 \n\t"
    "bgt one%= \n"
    /* Tail: accumulate the last chunk that is already in registers. */
    "two%=: \n\t"
    "xvabsdp 48, 40 \n\t"
    "xvabsdp 49, 41 \n\t"
    "xvabsdp 50, 42 \n\t"
    "xvabsdp 51, 43 \n\t"
    "xvabsdp %x3, 44 \n\t"
    "xvabsdp %x4, 45 \n\t"
    "xvabsdp %x5, 46 \n\t"
    "xvabsdp %x6, 47 \n\t"
    "xvadddp 32, 32, 48 \n\t"
    "xvadddp 33, 33, 49 \n\t"
    "xvadddp 34, 34, 50 \n\t"
    "xvadddp 35, 35, 51 \n\t"
    "xvadddp 36, 36, %x3 \n\t"
    "xvadddp 37, 37, %x4 \n\t"
    "xvadddp 38, 38, %x5 \n\t"
    "xvadddp 39, 39, %x6 \n\t"
    /* Fold the eight accumulators into vs32, then add its two halves. */
    "xvadddp 32, 32, 33 \n\t"
    "xvadddp 34, 34, 35 \n\t"
    "xvadddp 36, 36, 37 \n\t"
    "xvadddp 38, 38, 39 \n\t"
    "xvadddp 32, 32, 34 \n\t"
    "xvadddp 36, 36, 38 \n\t"
    "xvadddp 32, 32, 36 \n\t"
    XXSWAPD_S(33,32)
    "xsadddp %x0, 32, 33 \n"
    /* Annotation only: %7 is the "m" (*x) operand (%3 is the t0 scratch). */
    "#n=%1 x=%7=%2 sum=%0\n"
    "#t0=%x3 t1=%x4 t2=%x5 t3=%x6"
    :
    "=d" (sum), // 0
    "+r" (n), // 1
    "+b" (x), // 2
    "=wa" (t0), // 3
    "=wa" (t1), // 4
    "=wa" (t2), // 5
    "=wa" (t3) // 6
    :
    "m" (*x)
    :
    "cr0",
    /* "m" (*x) describes only the first double, but n doubles are read;
       "memory" keeps the compiler from reordering stores to x past the asm. */
    "memory",
    "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
    "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
    "vs48","vs49","vs50","vs51"
  );
  return sum;
}

View File

@ -85,12 +85,18 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
if ( (inc_x == 1) && (inc_y == 1 ))
{
BLASLONG n1 = n & -64;
if ( n1 > 0 )
if ( n >= 64 )
{
copy_kernel(n1, x, y);
i=n1;
BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 3) & 0x3;
for (i = 0; i < align; i++) {
y[i] = x[i] ;
}
}
BLASLONG n1 = (n-i) & -64;
if ( n1 )
{
copy_kernel(n1, &x[i], &y[i]);
i += n1;
}
while(i < n)

View File

@ -29,7 +29,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
typedef __vector unsigned char vec_t;
typedef FLOAT v4sf_t __attribute__ ((vector_size (16)));
typedef FLOAT v2sf_t __attribute__ ((vector_size (8)));
#if !__has_builtin(__builtin_vsx_assemble_pair)
#define __builtin_vsx_assemble_pair __builtin_mma_assemble_pair
#endif
#if !__has_builtin(__builtin_vsx_disassemble_pair)
#define __builtin_vsx_disassemble_pair __builtin_mma_disassemble_pair
#endif
#ifdef TRMMKERNEL
#define SAVE_ACC(ACC, J) \
@ -186,8 +192,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
vec_t *rowA = (vec_t *) & AO[0];
vec_t *rb = (vec_t *) & BO[0];
__vector_pair rowB, rowB1;
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
__builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]);
__builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
__builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]);
__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
__builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]);
__builtin_mma_xvf64ger (&acc2, rowB, rowA[1]);
@ -200,8 +206,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
{
rowA = (vec_t *) & AO[l << 3];
rb = (vec_t *) & BO[l << 3];
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
__builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]);
__builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
__builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]);
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
__builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]);
__builtin_mma_xvf64gerpp (&acc2, rowB, rowA[1]);
@ -242,8 +248,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
vec_t *rowA = (vec_t *) & AO[0];
__vector_pair rowB, rowB1;
vec_t *rb = (vec_t *) & BO[0];
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
__builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]);
__builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
__builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]);
__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
__builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]);
__builtin_mma_xvf64ger (&acc2, rowB, rowA[1]);
@ -252,8 +258,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
{
rowA = (vec_t *) & AO[l << 2];
rb = (vec_t *) & BO[l << 3];
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
__builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]);
__builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
__builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]);
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
__builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]);
__builtin_mma_xvf64gerpp (&acc2, rowB, rowA[1]);
@ -286,16 +292,16 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
vec_t *rowA = (vec_t *) & AO[0];
__vector_pair rowB, rowB1;
vec_t *rb = (vec_t *) & BO[0];
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
__builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]);
__builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
__builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]);
__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
__builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]);
for (l = 1; l < temp; l++)
{
rowA = (vec_t *) & AO[l << 1];
rb = (vec_t *) & BO[l << 3];
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
__builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]);
__builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
__builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]);
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
__builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]);
}
@ -398,7 +404,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
vec_t *rowA = (vec_t *) & AO[0];
__vector_pair rowB;
vec_t *rb = (vec_t *) & BO[0];
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
__builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
__builtin_mma_xvf64ger (&acc1, rowB, rowA[1]);
__builtin_mma_xvf64ger (&acc2, rowB, rowA[2]);
@ -407,7 +413,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
{
rowA = (vec_t *) & AO[l << 3];
rb = (vec_t *) & BO[l << 2];
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
__builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
__builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
__builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]);
@ -440,14 +446,14 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
vec_t *rowA = (vec_t *) & AO[0];
__vector_pair rowB;
vec_t *rb = (vec_t *) & BO[0];
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
__builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
__builtin_mma_xvf64ger (&acc1, rowB, rowA[1]);
for (l = 1; l < temp; l++)
{
rowA = (vec_t *) & AO[l << 2];
rb = (vec_t *) & BO[l << 2];
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
__builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
__builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
}
@ -476,13 +482,13 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
vec_t *rowA = (vec_t *) & AO[0];
__vector_pair rowB;
vec_t *rb = (vec_t *) & BO[0];
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
__builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
for (l = 1; l < temp; l++)
{
rowA = (vec_t *) & AO[l << 1];
rb = (vec_t *) & BO[l << 2];
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
__builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
}
SAVE_ACC (&acc0, 0);
@ -562,11 +568,9 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
v4sf_t result[4];
__vector_quad acc0, acc1, acc2, acc3;
BLASLONG l = 0;
FLOAT t[4] = { 0, 0, 0, 0 };
t[0] = BO[0], t[1] = BO[1];
__vector_pair rowB;
vec_t *rb = (vec_t *) & t[0];
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
vec_t *rb = (vec_t *) & BO[0];
__builtin_vsx_assemble_pair (&rowB, rb[0], rb[0]);
vec_t *rowA = (vec_t *) & AO[0];
__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
__builtin_mma_xvf64ger (&acc1, rowB, rowA[1]);
@ -574,9 +578,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
__builtin_mma_xvf64ger (&acc3, rowB, rowA[3]);
for (l = 1; l < temp; l++)
{
t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1];
rb = (vec_t *) & t[0];
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
rb = (vec_t *) & BO[l << 1];
__builtin_vsx_assemble_pair (&rowB, rb[0], rb[0]);
rowA = (vec_t *) & AO[l << 3];
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
__builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
@ -607,19 +610,16 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
v4sf_t result[4];
__vector_quad acc0, acc1;
BLASLONG l = 0;
FLOAT t[4] = { 0, 0, 0, 0 };
t[0] = BO[0], t[1] = BO[1];
__vector_pair rowB;
vec_t *rb = (vec_t *) & t[0];
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
vec_t *rb = (vec_t *) & BO[0];
__builtin_vsx_assemble_pair (&rowB, rb[0], rb[0]);
vec_t *rowA = (vec_t *) & AO[0];
__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
__builtin_mma_xvf64ger (&acc1, rowB, rowA[1]);
for (l = 1; l < temp; l++)
{
t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1];
rb = (vec_t *) & t[0];
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
rb = (vec_t *) & BO[l << 1];
__builtin_vsx_assemble_pair (&rowB, rb[0], rb[0]);
rowA = (vec_t *) & AO[l << 2];
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
__builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
@ -646,18 +646,15 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
v4sf_t result[4];
__vector_quad acc0;
BLASLONG l = 0;
FLOAT t[4] = { 0, 0, 0, 0 };
t[0] = BO[0], t[1] = BO[1];
__vector_pair rowB;
vec_t *rb = (vec_t *) & t[0];
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
vec_t *rb = (vec_t *) & BO[0];
__builtin_vsx_assemble_pair (&rowB, rb[0], rb[0]);
vec_t *rowA = (vec_t *) & AO[0];
__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
for (l = 1; l < temp; l++)
{
t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1];
rb = (vec_t *) & t[0];
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
rb = (vec_t *) & BO[l << 1];
__builtin_vsx_assemble_pair (&rowB, rb[0], rb[0]);
rowA = (vec_t *) & AO[l << 1];
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
}

View File

@ -39,9 +39,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma GCC optimize "O1"
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
#if defined(__VEC__) || defined(__ALTIVEC__)
#if defined(POWER8) || defined(POWER9)
#include "drot_microk_power8.c"
#elif defined(POWER10)
#include "drot_microk_power10.c"
#endif
#endif
@ -115,12 +117,30 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
if ( (inc_x == 1) && (inc_y == 1) )
{
#if defined(POWER10)
if ( n >= 16 )
{
BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 3) & 0x3;
for (i = 0; i < align; i++) {
temp = c*x[i] + s*y[i] ;
y[i] = c*y[i] - s*x[i] ;
x[i] = temp ;
}
}
BLASLONG n1 = (n-i) & -16;
if ( n1 > 0 )
{
drot_kernel_16(n1,&x[i], &y[i], c, s);
i+=n1;
}
#else
BLASLONG n1 = n & -16;
if ( n1 > 0 )
{
drot_kernel_16(n1, x1, y1, c, s);
i=n1;
}
#endif
while(i < n)
{

View File

@ -0,0 +1,148 @@
/***************************************************************************
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define HAVE_KERNEL_16 1
/*
 * POWER10 micro-kernel applying a plane rotation to two double vectors:
 *
 *     x[i] = c * x[i] + s * y[i]
 *     y[i] = c * y[i] - s * x[i]     (using the original x[i])
 *
 *   n   - number of elements.  The kernel preloads 8 values of each vector
 *         unconditionally and counts n down in steps of 8, so n must be a
 *         positive multiple of 8 (the caller passes a multiple of 16).
 *   x,y - the two vectors, both updated in place.
 *   c,s - rotation coefficients.
 */
static void drot_kernel_16 (long n, double *x, double *y, double c, double s)
{
  __asm__
  (
    XXSPLTD_S(36,%x5,0) // load c to both dwords
    XXSPLTD_S(37,%x6,0) // load s to both dwords
    "lxvp 32, 0(%3) \n\t" // load x
    "lxvp 34, 32(%3) \n\t"
    "lxvp 48, 0(%4) \n\t" // load y
    "lxvp 50, 32(%4) \n\t"
    "addic. %2, %2, -8 \n\t"
    /* Only one chunk of 8: skip the pipelined loop, go to the tail. */
    "ble two%= \n\t"
    ".align 5 \n"
    /* Main loop: rotate the chunk held in registers, load the next chunk
       from offset 64, store the results over the current chunk. */
    "one%=: \n\t"
    "xvmuldp 40, 32, 36 \n\t" // c * x
    "xvmuldp 41, 33, 36 \n\t"
    "xvmuldp 42, 34, 36 \n\t"
    "xvmuldp 43, 35, 36 \n\t"
    "xvmuldp 44, 32, 37 \n\t" // s * x
    "xvmuldp 45, 33, 37 \n\t"
    "xvmuldp 46, 34, 37 \n\t"
    "xvmuldp 47, 35, 37 \n\t"
    "lxvp 32, 64(%3) \n\t" // load x
    "lxvp 34, 96(%3) \n\t"
    "xvmuldp 52, 48, 36 \n\t" // c * y
    "xvmuldp 53, 49, 36 \n\t"
    "xvmuldp 54, 50, 36 \n\t"
    "xvmuldp 55, 51, 36 \n\t"
    "xvmuldp 38, 48, 37 \n\t" // s * y
    "xvmuldp 39, 49, 37 \n\t"
    "xvmuldp 56, 50, 37 \n\t"
    "xvmuldp 57, 51, 37 \n\t"
    "lxvp 48, 64(%4) \n\t" // load y
    "lxvp 50, 96(%4) \n\t"
    "xvadddp 40, 40, 38 \n\t" // c * x + s * y
    "xvadddp 41, 41, 39 \n\t" // c * x + s * y
    "xvadddp 42, 42, 56 \n\t" // c * x + s * y
    "xvadddp 43, 43, 57 \n\t" // c * x + s * y
    "stxvp 40, 0(%3) \n\t" // store x
    "stxvp 42, 32(%3) \n\t"
    "xvsubdp 52, 52, 44 \n\t" // c * y - s * x
    "xvsubdp 53, 53, 45 \n\t" // c * y - s * x
    "xvsubdp 54, 54, 46 \n\t" // c * y - s * x
    "xvsubdp 55, 55, 47 \n\t" // c * y - s * x
    "stxvp 52, 0(%4) \n\t" // store y
    "stxvp 54, 32(%4) \n\t"
    "addi %3, %3, 64 \n\t"
    "addi %4, %4, 64 \n\t"
    "addic. %2, %2, -8 \n\t"
    "bgt one%= \n"
    /* Tail: rotate and store the last chunk that is already in registers. */
    "two%=: \n\t"
    "xvmuldp 40, 32, 36 \n\t" // c * x
    "xvmuldp 41, 33, 36 \n\t"
    "xvmuldp 42, 34, 36 \n\t"
    "xvmuldp 43, 35, 36 \n\t"
    "xvmuldp 52, 48, 36 \n\t" // c * y
    "xvmuldp 53, 49, 36 \n\t"
    "xvmuldp 54, 50, 36 \n\t"
    "xvmuldp 55, 51, 36 \n\t"
    "xvmuldp 44, 32, 37 \n\t" // s * x
    "xvmuldp 45, 33, 37 \n\t"
    "xvmuldp 46, 34, 37 \n\t"
    "xvmuldp 47, 35, 37 \n\t"
    "xvmuldp 38, 48, 37 \n\t" // s * y
    "xvmuldp 39, 49, 37 \n\t"
    "xvmuldp 56, 50, 37 \n\t"
    "xvmuldp 57, 51, 37 \n\t"
    "xvadddp 40, 40, 38 \n\t" // c * x + s * y
    "xvadddp 41, 41, 39 \n\t" // c * x + s * y
    "xvadddp 42, 42, 56 \n\t" // c * x + s * y
    "xvadddp 43, 43, 57 \n\t" // c * x + s * y
    "stxvp 40, 0(%3) \n\t" // store x
    "stxvp 42, 32(%3) \n\t"
    "xvsubdp 52, 52, 44 \n\t" // c * y - s * x
    "xvsubdp 53, 53, 45 \n\t" // c * y - s * x
    "xvsubdp 54, 54, 46 \n\t" // c * y - s * x
    "xvsubdp 55, 55, 47 \n\t" // c * y - s * x
    "stxvp 52, 0(%4) \n\t" // store y
    "stxvp 54, 32(%4) \n\t"
    "#n=%2 x=%0=%3 y=%1=%4 c=%5 s=%6\n"
    :
    "+m" (*x),
    "+m" (*y),
    "+r" (n), // 2
    "+b" (x), // 3
    "+b" (y) // 4
    :
    "d" (c), // 5
    "d" (s) // 6
    :
    "cr0",
    /* The "+m" operands cover one double each, but the asm rewrites all n
       elements of both vectors; "memory" makes that visible to the compiler. */
    "memory",
    "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
    "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
    "vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55",
    "vs56","vs57"
  );
}

View File

@ -35,9 +35,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
#if defined(__VEC__) || defined(__ALTIVEC__)
#if defined(POWER8) || defined(POWER9)
#include "dscal_microk_power8.c"
#elif defined(POWER10)
#include "dscal_microk_power10.c"
#endif
#endif
@ -100,12 +102,28 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
if ( da == 0.0 )
{
#if defined(POWER10)
if ( n >= 16 )
{
BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3;
for (j = 0; j < align; j++) {
x[j] = 0.0;
}
}
BLASLONG n1 = (n-j) & -16;
if ( n1 > 0 )
{
dscal_kernel_8_zero(n1, &x[j]);
j+=n1;
}
#else
BLASLONG n1 = n & -16;
if ( n1 > 0 )
{
dscal_kernel_8_zero(n1, x);
j=n1;
}
#endif
while(j < n)
{
@ -118,12 +136,28 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
else
{
#if defined(POWER10)
if ( n >= 16 )
{
BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3;
for (j = 0; j < align; j++) {
x[j] = da * x[j];
}
}
BLASLONG n1 = (n-j) & -16;
if ( n1 > 0 )
{
dscal_kernel_8(n1, &x[j], da);
j+=n1;
}
#else
BLASLONG n1 = n & -16;
if ( n1 > 0 )
{
dscal_kernel_8(n1, x, da);
j=n1;
}
#endif
while(j < n)
{

View File

@ -0,0 +1,134 @@
/***************************************************************************
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define HAVE_KERNEL_8 1
/*
 * dscal_kernel_8: in-place scaling x[i] = alpha * x[i] for n contiguous
 * doubles, using paired vector loads/stores (lxvp/stxvp).
 *
 *   n     - element count. Each pass handles 128 bytes (16 doubles) and the
 *           first group is loaded unconditionally, so n is expected to be a
 *           positive multiple of 16.
 *   x     - vector to scale; overwritten with the result.
 *   alpha - scale factor.
 *
 * The loop is software pipelined: the next 16 elements are loaded while the
 * current 16 are being multiplied; the code after label "two" drains the
 * last group without loading past the end of the data.
 */
static void dscal_kernel_8 (long n, double *x, double alpha)
{
  __asm__
    (
       "dcbt 0, %2 \n\t"                  // cache-touch hint for the start of x
       // XXSPLTD_S is defined outside this file; presumably it emits xxspltd
       // so that both doublewords of vs48 hold alpha - TODO confirm.
       XXSPLTD_S(48,%x3,0)

       "lxvp 32, 0(%2) \n\t"              // preload the first 16 doubles
       "lxvp 34, 32(%2) \n\t"
       "lxvp 36, 64(%2) \n\t"
       "lxvp 38, 96(%2) \n\t"

       "addic. %1, %1, -16 \n\t"
       "ble two%= \n\t"                   // only one group: skip the pipelined loop

       ".align 5 \n"
     "one%=: \n\t"

       "xvmuldp 40, 32, 48 \n\t"          // alpha * x, first half of the group
       "xvmuldp 41, 33, 48 \n\t"
       "xvmuldp 42, 34, 48 \n\t"
       "xvmuldp 43, 35, 48 \n\t"

       "lxvp 32, 128(%2) \n\t"            // start loading the next group
       "lxvp 34, 160(%2) \n\t"

       "xvmuldp 44, 36, 48 \n\t"          // alpha * x, second half of the group
       "xvmuldp 45, 37, 48 \n\t"
       "xvmuldp 46, 38, 48 \n\t"
       "xvmuldp 47, 39, 48 \n\t"

       "lxvp 36, 192(%2) \n\t"
       "lxvp 38, 224(%2) \n\t"

       "stxvp 40, 0(%2) \n\t"             // write the scaled group back
       "stxvp 42, 32(%2) \n\t"
       "stxvp 44, 64(%2) \n\t"
       "stxvp 46, 96(%2) \n\t"

       "addi %2, %2, 128 \n\t"

       "addic. %1, %1, -16 \n\t"
       "bgt one%= \n"

     "two%=: \n\t"                        // drain: scale and store the last group

       "xvmuldp 40, 32, 48 \n\t"
       "xvmuldp 41, 33, 48 \n\t"
       "xvmuldp 42, 34, 48 \n\t"
       "xvmuldp 43, 35, 48 \n\t"

       "xvmuldp 44, 36, 48 \n\t"
       "xvmuldp 45, 37, 48 \n\t"
       "xvmuldp 46, 38, 48 \n\t"
       "xvmuldp 47, 39, 48 \n\t"

       "stxvp 40, 0(%2) \n\t"
       "stxvp 42, 32(%2) \n\t"
       "stxvp 44, 64(%2) \n\t"
       "stxvp 46, 96(%2) \n\t"

     "#n=%1 alpha=%3 x=%0=%2"
     :
       "+m" (*x),
       "+r" (n),	// 1
       "+b" (x)		// 2
     :
       "d" (alpha)	// 3
     :
       "cr0",
       // "+m" (*x) only describes x[0]; the asm touches x[0..n-1], so the
       // remaining accesses are declared with a "memory" clobber.
       "memory",
       "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
       "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47","vs48"
     );
}
static void dscal_kernel_8_zero (long n, double *x)
{
__asm__
(
"xxlxor 32, 32, 32 \n\t"
"xxlxor 33, 33, 33 \n\t"
".align 5 \n"
"one%=: \n\t"
"stxvp 32, 0(%2) \n\t"
"stxvp 32, 32(%2) \n\t"
"stxvp 32, 64(%2) \n\t"
"stxvp 32, 96(%2) \n\t"
"addi %2, %2, 128 \n\t"
"addic. %1, %1, -16 \n\t"
"bgt one%= \n"
"#n=%1 x=%0=%2 "
:
"=m" (*x),
"+r" (n), // 1
"+b" (x) // 2
:
:
"cr0","vs32","vs33"
);
}

View File

@ -35,9 +35,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
#if defined(__VEC__) || defined(__ALTIVEC__)
#if defined(POWER8) || defined(POWER9)
#include "dswap_microk_power8.c"
#elif defined(POWER10)
#include "swap_microk_power10.c"
#endif
#endif
@ -115,12 +117,30 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x,
if ( (inc_x == 1) && (inc_y == 1 ))
{
#if defined(POWER10)
if ( n >= 32 )
{
BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 3) & 0x3;
for (i = 0; i < align; i++) {
temp = y[i];
y[i] = x[i];
x[i] = temp;
}
}
BLASLONG n1 = (n-i) & -32;
if ( n1 > 0 )
{
dswap_kernel_32(n1,&x[i], &y[i]);
i+=n1;
}
#else
BLASLONG n1 = n & -32;
if ( n1 > 0 )
{
dswap_kernel_32(n1, x, y);
i=n1;
}
#endif
while(i < n)
{

View File

@ -46,9 +46,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
#if defined(__VEC__) || defined(__ALTIVEC__)
#if defined(POWER8) || defined(POWER9)
#include "sasum_microk_power8.c"
#elif defined(POWER10)
#include "sasum_microk_power10.c"
#endif
#endif
@ -110,6 +112,21 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
if ( inc_x == 1 )
{
#if defined(POWER10)
if ( n >= 32 )
{
BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 2) & 0x7;
for (i = 0; i < align; i++) {
sumf += ABS(x[i]);
}
}
n1 = (n-i) & -32;
if ( n1 > 0 )
{
sumf += sasum_kernel_32(n1, &x[i]);
i+=n1;
}
#else
n1 = n & -32;
if ( n1 > 0 )
{
@ -117,6 +134,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
sumf = sasum_kernel_32(n1, x);
i=n1;
}
#endif
while(i < n)
{

View File

@ -0,0 +1,153 @@
/***************************************************************************
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define HAVE_KERNEL_32 1
/*
 * sasum_kernel_32: returns the sum of absolute values of n contiguous floats.
 *
 *   n - element count. Each pass consumes 128 bytes (32 floats) and the first
 *       group is loaded unconditionally, so n is expected to be a positive
 *       multiple of 32.
 *   x - input vector (read only).
 *
 * Eight vector accumulators (vs32..vs39) are kept in parallel and reduced to
 * a single scalar at the end. t0..t3 are scratch vector registers chosen by
 * the compiler.
 */
static float sasum_kernel_32 (long n, float *x)
{
  float sum;
  __vector float t0;
  __vector float t1;
  __vector float t2;
  __vector float t3;

  __asm__
    (
       "dcbt 0, %2 \n\t"                  // cache-touch hint for the start of x

       "xxlxor 32, 32, 32 \n\t"           // clear the eight accumulators
       "xxlxor 33, 33, 33 \n\t"
       "xxlxor 34, 34, 34 \n\t"
       "xxlxor 35, 35, 35 \n\t"
       "xxlxor 36, 36, 36 \n\t"
       "xxlxor 37, 37, 37 \n\t"
       "xxlxor 38, 38, 38 \n\t"
       "xxlxor 39, 39, 39 \n\t"

       "lxvp 40, 0(%2) \n\t"              // preload the first 32 floats
       "lxvp 42, 32(%2) \n\t"
       "lxvp 44, 64(%2) \n\t"
       "lxvp 46, 96(%2) \n\t"

       "addi %2, %2, 128 \n\t"

       "addic. %1, %1, -32 \n\t"
       "ble two%= \n\t"                   // only one group: skip the pipelined loop

       ".align 5 \n"
     "one%=: \n\t"

       "xvabssp 48, 40 \n\t"              // |x| of the current group, interleaved
       "xvabssp 49, 41 \n\t"              // with the loads of the next group
       "xvabssp 50, 42 \n\t"
       "xvabssp 51, 43 \n\t"
       "lxvp 40, 0(%2) \n\t"

       "xvabssp %x3, 44 \n\t"
       "xvabssp %x4, 45 \n\t"
       "lxvp 42, 32(%2) \n\t"

       "xvabssp %x5, 46 \n\t"
       "xvabssp %x6, 47 \n\t"
       "lxvp 44, 64(%2) \n\t"

       "xvaddsp 32, 32, 48 \n\t"          // accumulate
       "xvaddsp 33, 33, 49 \n\t"
       "lxvp 46, 96(%2) \n\t"

       "xvaddsp 34, 34, 50 \n\t"
       "xvaddsp 35, 35, 51 \n\t"
       "addi %2, %2, 128 \n\t"
       "xvaddsp 36, 36, %x3 \n\t"
       "xvaddsp 37, 37, %x4 \n\t"
       "addic. %1, %1, -32 \n\t"
       "xvaddsp 38, 38, %x5 \n\t"
       "xvaddsp 39, 39, %x6 \n\t"

       "bgt one%= \n"

     "two%=: \n\t"                        // drain: last group, no further loads

       "xvabssp 48, 40 \n\t"
       "xvabssp 49, 41 \n\t"
       "xvabssp 50, 42 \n\t"
       "xvabssp 51, 43 \n\t"
       "xvabssp %x3, 44 \n\t"
       "xvabssp %x4, 45 \n\t"
       "xvabssp %x5, 46 \n\t"
       "xvabssp %x6, 47 \n\t"

       "xvaddsp 32, 32, 48 \n\t"
       "xvaddsp 33, 33, 49 \n\t"
       "xvaddsp 34, 34, 50 \n\t"
       "xvaddsp 35, 35, 51 \n\t"
       "xvaddsp 36, 36, %x3 \n\t"
       "xvaddsp 37, 37, %x4 \n\t"
       "xvaddsp 38, 38, %x5 \n\t"
       "xvaddsp 39, 39, %x6 \n\t"

       "xvaddsp 32, 32, 33 \n\t"          // tree-reduce 8 accumulators into vs32
       "xvaddsp 34, 34, 35 \n\t"
       "xvaddsp 36, 36, 37 \n\t"
       "xvaddsp 38, 38, 39 \n\t"

       "xvaddsp 32, 32, 34 \n\t"
       "xvaddsp 36, 36, 38 \n\t"

       "xvaddsp 32, 32, 36 \n\t"

       "xxsldwi 33, 32, 32, 2 \n\t"       // horizontal add of the four words
       "xvaddsp 32, 32, 33 \n\t"

       "xxsldwi 33, 32, 32, 1 \n\t"
       "xvaddsp 32, 32, 33 \n\t"

       "xscvspdp %x0, 32 \n"              // convert the single-precision total for the scalar result

     // The memory input is operand 7 (operand 3 is the scratch register t0).
     "#n=%1 x=%7=%2 sum=%0\n"
     "#t0=%x3 t1=%x4 t2=%x5 t3=%x6"
     :
       "=f" (sum),	// 0
       "+r" (n),	// 1
       "+b" (x),	// 2
       "=wa" (t0),	// 3
       "=wa" (t1),	// 4
       "=wa" (t2),	// 5
       "=wa" (t3)	// 6
     :
       "m" (*x)		// 7
     :
       "cr0",
       // "m" (*x) only describes x[0]; the reads of x[1..n-1] are declared
       // with the "memory" clobber so pending stores are not reordered
       // across the asm.
       "memory",
       "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
       "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
       "vs48","vs49","vs50","vs51"
     );

  return sum;
}

View File

@ -86,11 +86,18 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
if ( (inc_x == 1) && (inc_y == 1 ))
{
BLASLONG n1 = n & -128;
if ( n1 > 0 )
if ( n >= 128 )
{
copy_kernel (n1, x, y);
i=n1;
BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 2) & 0x7;
for (i = 0; i < align; i++) {
y[i] = x[i] ;
}
}
BLASLONG n1 = (n-i) & -128;
if ( n1 )
{
copy_kernel(n1, &x[i], &y[i]);
i += n1;
}
while(i < n)

View File

@ -39,9 +39,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma GCC optimize "O1"
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
#if defined(__VEC__) || defined(__ALTIVEC__)
#if defined(POWER8) || defined(POWER9)
#include "srot_microk_power8.c"
#elif defined(POWER10)
#include "srot_microk_power10.c"
#endif
#endif
@ -115,6 +117,23 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
if ( (inc_x == 1) && (inc_y == 1) )
{
#if defined(POWER10)
if ( n >= 16 )
{
BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 2) & 0x7;
for (i = 0; i < align; i++) {
temp = c*x[i] + s*y[i] ;
y[i] = c*y[i] - s*x[i] ;
x[i] = temp ;
}
}
BLASLONG n1 = (n-i) & -16;
if ( n1 > 0 )
{
srot_kernel_16(n1, &x1[i], &y1[i], c, s);
i+=n1;
}
#else
BLASLONG n1 = n & -16;
if ( n1 > 0 )
{
@ -122,6 +141,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
i=n1;
}
#endif
while(i < n)
{
temp = c*x[i] + s*y[i] ;

View File

@ -0,0 +1,151 @@
/***************************************************************************
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define HAVE_KERNEL_16 1
static void srot_kernel_16 (long n, float *x, float *y, float c, float s)
{
__asm__
(
"xscvdpspn 36, %x5 \n\t" // load c to all words
"xxspltw 36, 36, 0 \n\t"
"xscvdpspn 37, %x6 \n\t" // load s to all words
"xxspltw 37, 37, 0 \n\t"
"lxvp 32, 0(%3) \n\t" // load x
"lxvp 34, 32(%3) \n\t"
"lxvp 48, 0(%4) \n\t" // load y
"lxvp 50, 32(%4) \n\t"
"addic. %2, %2, -16 \n\t"
"ble two%= \n\t"
".align 5 \n"
"one%=: \n\t"
"xvmulsp 40, 32, 36 \n\t" // c * x
"xvmulsp 41, 33, 36 \n\t"
"xvmulsp 42, 34, 36 \n\t"
"xvmulsp 43, 35, 36 \n\t"
"xvmulsp 44, 32, 37 \n\t" // s * x
"xvmulsp 45, 33, 37 \n\t"
"xvmulsp 46, 34, 37 \n\t"
"xvmulsp 47, 35, 37 \n\t"
"lxvp 32, 64(%3) \n\t" // load x
"lxvp 34, 96(%3) \n\t"
"xvmulsp 52, 48, 36 \n\t" // c * y
"xvmulsp 53, 49, 36 \n\t"
"xvmulsp 54, 50, 36 \n\t"
"xvmulsp 55, 51, 36 \n\t"
"xvmulsp 38, 48, 37 \n\t" // s * y
"xvmulsp 39, 49, 37 \n\t"
"xvmulsp 56, 50, 37 \n\t"
"xvmulsp 57, 51, 37 \n\t"
"lxvp 48, 64(%4) \n\t" // load y
"lxvp 50, 96(%4) \n\t"
"xvaddsp 40, 40, 38 \n\t" // c * x + s * y
"xvaddsp 41, 41, 39 \n\t" // c * x + s * y
"xvaddsp 42, 42, 56 \n\t" // c * x + s * y
"xvaddsp 43, 43, 57 \n\t" // c * x + s * y
"stxvp 40, 0(%3) \n\t" // store x
"stxvp 42, 32(%3) \n\t"
"xvsubsp 52, 52, 44 \n\t" // c * y - s * x
"xvsubsp 53, 53, 45 \n\t" // c * y - s * x
"xvsubsp 54, 54, 46 \n\t" // c * y - s * x
"xvsubsp 55, 55, 47 \n\t" // c * y - s * x
"stxvp 52, 0(%4) \n\t" // store y
"stxvp 54, 32(%4) \n\t"
"addi %3, %3, 64 \n\t"
"addi %4, %4, 64 \n\t"
"addic. %2, %2, -16 \n\t"
"bgt one%= \n"
"two%=: \n\t"
"xvmulsp 40, 32, 36 \n\t" // c * x
"xvmulsp 41, 33, 36 \n\t"
"xvmulsp 42, 34, 36 \n\t"
"xvmulsp 43, 35, 36 \n\t"
"xvmulsp 52, 48, 36 \n\t" // c * y
"xvmulsp 53, 49, 36 \n\t"
"xvmulsp 54, 50, 36 \n\t"
"xvmulsp 55, 51, 36 \n\t"
"xvmulsp 44, 32, 37 \n\t" // s * x
"xvmulsp 45, 33, 37 \n\t"
"xvmulsp 46, 34, 37 \n\t"
"xvmulsp 47, 35, 37 \n\t"
"xvmulsp 38, 48, 37 \n\t" // s * y
"xvmulsp 39, 49, 37 \n\t"
"xvmulsp 56, 50, 37 \n\t"
"xvmulsp 57, 51, 37 \n\t"
"xvaddsp 40, 40, 38 \n\t" // c * x + s * y
"xvaddsp 41, 41, 39 \n\t" // c * x + s * y
"xvaddsp 42, 42, 56 \n\t" // c * x + s * y
"xvaddsp 43, 43, 57 \n\t" // c * x + s * y
"stxvp 40, 0(%3) \n\t" // store x
"stxvp 42, 32(%3) \n\t"
"xvsubsp 52, 52, 44 \n\t" // c * y - s * x
"xvsubsp 53, 53, 45 \n\t" // c * y - s * x
"xvsubsp 54, 54, 46 \n\t" // c * y - s * x
"xvsubsp 55, 55, 47 \n\t" // c * y - s * x
"stxvp 52, 0(%4) \n\t" // store y
"stxvp 54, 32(%4) \n\t"
"#n=%2 x=%0=%3 y=%1=%4 c=%5 s=%6\n"
:
"+m" (*x),
"+m" (*y),
"+r" (n), // 2
"+b" (x), // 3
"+b" (y) // 4
:
"f" (c), // 5
"f" (s) // 6
:
"cr0",
"vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
"vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
"vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55",
"vs56","vs57"
);
}

View File

@ -35,9 +35,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
#if defined(__VEC__) || defined(__ALTIVEC__)
#if defined(POWER8) || defined(POWER9)
#include "sscal_microk_power8.c"
#elif defined(POWER10)
#include "sscal_microk_power10.c"
#endif
#endif
@ -102,12 +104,28 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
if ( da == 0.0 )
{
#if defined(POWER10)
if ( n >= 32 )
{
BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 2) & 0x7;
for (j = 0; j < align; j++) {
x[j] = 0.0;
}
}
BLASLONG n1 = (n-j) & -32;
if ( n1 > 0 )
{
sscal_kernel_16_zero(n1, &x[j]);
j+=n1;
}
#else
BLASLONG n1 = n & -32;
if ( n1 > 0 )
{
sscal_kernel_16_zero(n1, x);
j=n1;
}
#endif
while(j < n)
{
@ -120,12 +138,28 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
else
{
#if defined(POWER10)
if ( n >= 32 )
{
BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 2) & 0x7;
for (j = 0; j < align; j++) {
x[j] = da * x[j];
}
}
BLASLONG n1 = (n-j) & -32;
if ( n1 > 0 )
{
sscal_kernel_16(n1, &x[j], da);
j+=n1;
}
#else
BLASLONG n1 = n & -32;
if ( n1 > 0 )
{
sscal_kernel_16(n1, x, da);
j=n1;
}
#endif
while(j < n)
{

View File

@ -0,0 +1,135 @@
/***************************************************************************
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define HAVE_KERNEL_16 1
static void sscal_kernel_16 (long n, float *x, float alpha)
{
__asm__
(
"dcbt 0, %2 \n\t"
"xscvdpspn 48, %x3 \n\t"
"xxspltw 48, 48, 0 \n\t"
"lxvp 32, 0(%2) \n\t"
"lxvp 34, 32(%2) \n\t"
"lxvp 36, 64(%2) \n\t"
"lxvp 38, 96(%2) \n\t"
"addic. %1, %1, -32 \n\t"
"ble two%= \n\t"
".align 5 \n"
"one%=: \n\t"
"xvmulsp 40, 32, 48 \n\t"
"xvmulsp 41, 33, 48 \n\t"
"xvmulsp 42, 34, 48 \n\t"
"xvmulsp 43, 35, 48 \n\t"
"lxvp 32, 128(%2) \n\t"
"lxvp 34, 160(%2) \n\t"
"xvmulsp 44, 36, 48 \n\t"
"xvmulsp 45, 37, 48 \n\t"
"xvmulsp 46, 38, 48 \n\t"
"xvmulsp 47, 39, 48 \n\t"
"lxvp 36, 192(%2) \n\t"
"lxvp 38, 224(%2) \n\t"
"stxvp 40, 0(%2) \n\t"
"stxvp 42, 32(%2) \n\t"
"stxvp 44, 64(%2) \n\t"
"stxvp 46, 96(%2) \n\t"
"addi %2, %2, 128 \n\t"
"addic. %1, %1, -32 \n\t"
"bgt one%= \n"
"two%=: \n\t"
"xvmulsp 40, 32, 48 \n\t"
"xvmulsp 41, 33, 48 \n\t"
"xvmulsp 42, 34, 48 \n\t"
"xvmulsp 43, 35, 48 \n\t"
"xvmulsp 44, 36, 48 \n\t"
"xvmulsp 45, 37, 48 \n\t"
"xvmulsp 46, 38, 48 \n\t"
"xvmulsp 47, 39, 48 \n\t"
"stxvp 40, 0(%2) \n\t"
"stxvp 42, 32(%2) \n\t"
"stxvp 44, 64(%2) \n\t"
"stxvp 46, 96(%2) \n\t"
"#n=%1 alpha=%3 x=%0=%2"
:
"+m" (*x),
"+r" (n), // 1
"+b" (x) // 2
:
"f" (alpha) // 3
:
"cr0",
"vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
"vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47","vs48"
);
}
static void sscal_kernel_16_zero (long n, float *x)
{
__asm__
(
"xxlxor 32, 32, 32 \n\t"
"xxlxor 33, 33, 33 \n\t"
".align 5 \n"
"one%=: \n\t"
"stxvp 32, 0(%2) \n\t"
"stxvp 32, 32(%2) \n\t"
"stxvp 32, 64(%2) \n\t"
"stxvp 32, 96(%2) \n\t"
"addi %2, %2, 128 \n\t"
"addic. %1, %1, -32 \n\t"
"bgt one%= \n"
"#n=%1 x=%0=%2 "
:
"=m" (*x),
"+r" (n), // 1
"+b" (x) // 2
:
:
"cr0","vs32","vs33"
);
}

View File

@ -35,9 +35,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
#if defined(__VEC__) || defined(__ALTIVEC__)
#if defined(POWER8) || defined(POWER9)
#include "sswap_microk_power8.c"
#elif defined(POWER10)
#include "swap_microk_power10.c"
#endif
#endif
@ -115,12 +117,30 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x,
if ( (inc_x == 1) && (inc_y == 1 ))
{
#if defined(POWER10)
if ( n >= 64 )
{
BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 2) & 0x7;
for (i = 0; i < align; i++) {
temp = y[i];
y[i] = x[i];
x[i] = temp;
}
}
BLASLONG n1 = (n-i) & -64;
if ( n1 > 0 )
{
sswap_kernel_32(n1,&x[i], &y[i]);
i+=n1;
}
#else
BLASLONG n1 = n & -32;
if ( n1 > 0 )
{
sswap_kernel_32(n1, x, y);
i=n1;
}
#endif
while(i < n)
{

View File

@ -0,0 +1,105 @@
/***************************************************************************
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define HAVE_KERNEL_32 1
#if defined(DOUBLE)
static void dswap_kernel_32 (long n, double *x, double *y)
#else
static void sswap_kernel_32 (long n, float *x, float *y)
#endif
{
__asm__
(
".align 5 \n"
"one%=: \n\t"
"lxvp 32, 0(%4) \n\t"
"lxvp 34, 32(%4) \n\t"
"lxvp 36, 64(%4) \n\t"
"lxvp 38, 96(%4) \n\t"
"lxvp 40, 128(%4) \n\t"
"lxvp 42, 160(%4) \n\t"
"lxvp 44, 192(%4) \n\t"
"lxvp 46, 224(%4) \n\t"
"lxvp 48, 0(%3) \n\t"
"lxvp 50, 32(%3) \n\t"
"lxvp 52, 64(%3) \n\t"
"lxvp 54, 96(%3) \n\t"
"lxvp 56, 128(%3) \n\t"
"lxvp 58, 160(%3) \n\t"
"lxvp 60, 192(%3) \n\t"
"lxvp 62, 224(%3) \n\t"
"stxvp 32, 0(%3) \n\t"
"stxvp 34, 32(%3) \n\t"
"stxvp 36, 64(%3) \n\t"
"stxvp 38, 96(%3) \n\t"
"stxvp 40, 128(%3) \n\t"
"stxvp 42, 160(%3) \n\t"
"stxvp 44, 192(%3) \n\t"
"stxvp 46, 224(%3) \n\t"
"stxvp 48, 0(%4) \n\t"
"stxvp 50, 32(%4) \n\t"
"stxvp 52, 64(%4) \n\t"
"stxvp 54, 96(%4) \n\t"
"stxvp 56, 128(%4) \n\t"
"stxvp 58, 160(%4) \n\t"
"stxvp 60, 192(%4) \n\t"
"stxvp 62, 224(%4) \n\t"
"addi %4, %4, 256 \n\t"
"addi %3, %3, 256 \n\t"
#if defined(DOUBLE)
"addic. %2, %2, -32 \n\t"
#else
"addic. %2, %2, -64 \n\t"
#endif
"bgt one%= \n"
"#n=%2 x=%0=%3 y=%1=%4"
:
"+m" (*x),
"+m" (*y),
"+r" (n), // 2
"+b" (x), // 3
"+b" (y) // 4
:
:
"cr0",
"vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
"vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
"vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55",
"vs56","vs57","vs58","vs59","vs60","vs61","vs62","vs63"
);
}

View File

@ -38,11 +38,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#pragma GCC optimize "O1"
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
#if defined(__VEC__) || defined(__ALTIVEC__)
#if defined(POWER8) || defined(POWER9)
#if defined(DOUBLE)
#include "zscal_microk_power8.c"
#endif
#elif defined(POWER10)
#if defined(DOUBLE)
#include "zscal_microk_power10.c"
#else
#include "cscal_microk_power10.c"
#endif
#endif
#endif
@ -145,7 +151,11 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F
{
#if defined(DOUBLE)
n1 = n & -8;
#else
n1 = n & -16;
#endif
if ( n1 > 0 )
{
zscal_kernel_8(n1, x, da_r, da_i);

View File

@ -0,0 +1,195 @@
/***************************************************************************
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define HAVE_KERNEL_8 1
/*
 * zscal_kernel_8: in-place complex scaling x[k] = alpha * x[k] for n
 * contiguous double-precision complex numbers stored as (real, imag) pairs.
 *
 *   n       - number of complex elements. Each pass handles 128 bytes
 *             (8 complex doubles) and the first group is loaded
 *             unconditionally, so n is expected to be a positive multiple
 *             of 8.
 *   x       - interleaved real/imaginary data; overwritten with the result.
 *   alpha_r - real part of the scale factor.
 *   alpha_i - imaginary part of the scale factor.
 *
 * Scheme: every vector register holds one complex number. The product is
 * formed as  x * (alpha_r, alpha_r)  +  swap(x) * (-alpha_i, alpha_i),
 * where swap() exchanges the real and imaginary halves. t0..t5 are scratch
 * vector registers chosen by the compiler.
 *
 * XXSPLTD_S, XXMRGHD_S and XXSWAPD_S are macros defined outside this file;
 * they presumably emit the xxspltd / xxmrghd / xxswapd instructions -
 * TODO confirm.
 */
static void zscal_kernel_8 (long n, double *x, double alpha_r, double alpha_i)
{
  __vector double t0;
  __vector double t1;
  __vector double t2;
  __vector double t3;
  __vector double t4;
  __vector double t5;

  __asm__
    (
       "dcbt 0, %2 \n\t"                  // cache-touch hint for the start of x

       "xsnegdp 33, %x10 \n\t"	// -alpha_i
       XXSPLTD_S(32,%x9,0)	// alpha_r , alpha_r
       XXMRGHD_S(33,%x10, 33)	// -alpha_i , alpha_i

       "lxvp 40, 0(%2) \n\t"              // preload the first 8 complex values
       "lxvp 42, 32(%2) \n\t"
       "lxvp 44, 64(%2) \n\t"
       "lxvp 46, 96(%2) \n\t"

       "addic. %1, %1, -8 \n\t"
       "ble two%= \n\t"                   // only one group: skip the pipelined loop

       ".align 5 \n"
     "one%=: \n\t"

       "xvmuldp 48, 40, 32 \n\t"	// x0_r * alpha_r, x0_i * alpha_r
       "xvmuldp 49, 41, 32 \n\t"
       "xvmuldp 50, 42, 32 \n\t"
       "xvmuldp 51, 43, 32 \n\t"
       "xvmuldp 34, 44, 32 \n\t"
       "xvmuldp 35, 45, 32 \n\t"
       "xvmuldp 36, 46, 32 \n\t"
       "xvmuldp 37, 47, 32 \n\t"

       XXSWAPD_S(38,40)                   // exchange real and imaginary halves
       XXSWAPD_S(39,41)
       XXSWAPD_S(%x3,42)
       XXSWAPD_S(%x4,43)
       XXSWAPD_S(%x5,44)
       XXSWAPD_S(%x6,45)
       XXSWAPD_S(%x7,46)
       XXSWAPD_S(%x8,47)

       "xvmuldp 38, 38, 33 \n\t"	// x0_i * -alpha_i, x0_r * alpha_i
       "xvmuldp 39, 39, 33 \n\t"

       "xvmuldp %x3, %x3, 33 \n\t"
       "xvmuldp %x4, %x4, 33 \n\t"

       "lxvp 40, 128(%2) \n\t"            // start loading the next group
       "lxvp 42, 160(%2) \n\t"
       "xvmuldp %x5, %x5, 33 \n\t"
       "xvmuldp %x6, %x6, 33 \n\t"

       "xvmuldp %x7, %x7, 33 \n\t"
       "xvmuldp %x8, %x8, 33 \n\t"
       "lxvp 44, 192(%2) \n\t"
       "lxvp 46, 224(%2) \n\t"

       "xvadddp 48, 48, 38 \n\t"          // combine the two partial products
       "xvadddp 49, 49, 39 \n\t"
       "xvadddp 50, 50, %x3 \n\t"
       "xvadddp 51, 51, %x4 \n\t"
       // Results are written with single-vector stores, odd register of each
       // pair at the lower address. NOTE(review): this looks tied to the
       // register order lxvp produces on little-endian - confirm before
       // building for big-endian.
       "stxv 49, 0(%2) \n\t"
       "stxv 48, 16(%2) \n\t"
       "stxv 51, 32(%2) \n\t"
       "stxv 50, 48(%2) \n\t"

       "xvadddp 34, 34, %x5 \n\t"
       "xvadddp 35, 35, %x6 \n\t"

       "xvadddp 36, 36, %x7 \n\t"
       "xvadddp 37, 37, %x8 \n\t"

       "stxv 35, 64(%2) \n\t"
       "stxv 34, 80(%2) \n\t"
       "stxv 37, 96(%2) \n\t"
       "stxv 36, 112(%2) \n\t"

       "addi %2, %2, 128 \n\t"

       "addic. %1, %1, -8 \n\t"
       "bgt one%= \n"

     "two%=: \n\t"                        // drain: last group, no further loads

       "xvmuldp 48, 40, 32 \n\t"	// x0_r * alpha_r, x0_i * alpha_r
       "xvmuldp 49, 41, 32 \n\t"
       "xvmuldp 50, 42, 32 \n\t"
       "xvmuldp 51, 43, 32 \n\t"
       "xvmuldp 34, 44, 32 \n\t"
       "xvmuldp 35, 45, 32 \n\t"
       "xvmuldp 36, 46, 32 \n\t"
       "xvmuldp 37, 47, 32 \n\t"

       XXSWAPD_S(38,40)
       XXSWAPD_S(39,41)
       XXSWAPD_S(%x3,42)
       XXSWAPD_S(%x4,43)
       XXSWAPD_S(%x5,44)
       XXSWAPD_S(%x6,45)
       XXSWAPD_S(%x7,46)
       XXSWAPD_S(%x8,47)

       "xvmuldp 38, 38, 33 \n\t"	// x0_i * -alpha_i, x0_r * alpha_i
       "xvmuldp 39, 39, 33 \n\t"
       "xvmuldp %x3, %x3, 33 \n\t"
       "xvmuldp %x4, %x4, 33 \n\t"
       "xvmuldp %x5, %x5, 33 \n\t"
       "xvmuldp %x6, %x6, 33 \n\t"
       "xvmuldp %x7, %x7, 33 \n\t"
       "xvmuldp %x8, %x8, 33 \n\t"

       "xvadddp 48, 48, 38 \n\t"
       "xvadddp 49, 49, 39 \n\t"

       "xvadddp 50, 50, %x3 \n\t"
       "xvadddp 51, 51, %x4 \n\t"

       "stxv 49, 0(%2) \n\t"
       "stxv 48, 16(%2) \n\t"
       "stxv 51, 32(%2) \n\t"
       "stxv 50, 48(%2) \n\t"

       "xvadddp 34, 34, %x5 \n\t"
       "xvadddp 35, 35, %x6 \n\t"

       "xvadddp 36, 36, %x7 \n\t"
       "xvadddp 37, 37, %x8 \n\t"

       "stxv 35, 64(%2) \n\t"
       "stxv 34, 80(%2) \n\t"
       "stxv 37, 96(%2) \n\t"
       "stxv 36, 112(%2) \n\t"

     "#n=%1 x=%0=%2 alpha=(%9,%10) \n"
     :
       "+m" (*x),
       "+r" (n),	// 1
       "+b" (x),	// 2
       "=wa" (t0),	// 3
       "=wa" (t1),	// 4
       "=wa" (t2),	// 5
       "=wa" (t3),	// 6
       "=wa" (t4),	// 7
       "=wa" (t5)	// 8
     :
       "d" (alpha_r),	// 9
       "d" (alpha_i)	// 10
     :
       "cr0",
       // "+m" (*x) only describes x[0]; the asm touches all 2*n doubles, so
       // the remaining accesses are declared with a "memory" clobber.
       "memory",
       "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
       "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
       "vs48","vs49","vs50","vs51"
     );
}

View File

@ -36,9 +36,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
#if defined(__VEC__) || defined(__ALTIVEC__)
#if defined(POWER8) || defined(POWER9)
#include "zswap_microk_power8.c"
#elif defined(POWER10)
#include "cswap_microk_power10.c"
#endif
#endif

View File

@ -489,3 +489,6 @@ XGEMM3MKERNEL = xgemm3m_kernel_2x2.S
SSUMKERNEL = ../arm/sum.c
DSUMKERNEL = ../arm/sum.c
SOMATCOPY_RT = omatcopy_rt.c
DOMATCOPY_RT = omatcopy_rt.c

View File

@ -97,3 +97,5 @@ ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
CGEMM3MKERNEL = cgemm3m_kernel_8x4_haswell.c
ZGEMM3MKERNEL = zgemm3m_kernel_4x4_haswell.c
SROTKERNEL = srot.c
DROTKERNEL = drot.c

View File

@ -6,7 +6,7 @@
#if defined(SKYLAKEX)
#include "dasum_microk_skylakex-2.c"
#elif defined(HASWELL)
#elif defined(HASWELL) || defined(ZEN)
#include "dasum_microk_haswell-2.c"
#endif
@ -93,7 +93,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
#if defined(SMP)
int nthreads;
FLOAT dummy_alpha;
FLOAT * dummy_b;
#endif
FLOAT sumf = 0.0;
@ -115,7 +114,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
#else
mode = BLAS_DOUBLE | BLAS_REAL;
#endif
blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, x, inc_x, dummy_b, 0, result, 0, (void *)asum_thread_function, nthreads);
blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, x, inc_x, NULL, 0, result, 0, (void *)asum_thread_function, nthreads);
ptr = (FLOAT *)result;
for (i = 0; i < nthreads; i++) {
sumf += (*ptr);

View File

@ -2,7 +2,7 @@
#if defined(SKYLAKEX)
#include "drot_microk_skylakex-2.c"
#elif defined(HASWELL)
#elif defined(HASWELL) || defined(ZEN)
#include "drot_microk_haswell-2.c"
#endif

373
kernel/x86_64/omatcopy_rt.c Normal file
View File

@ -0,0 +1,373 @@
/***************************************************************************
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#ifdef HAVE_AVX
/* Upper bound on the number of source rows CNAME hands to the asm kernels
 * in one pass of its outer loop. */
#define ROWS_OF_BLOCK 384
/* Operand numbering shared by every asm fragment below (bound in COMPUTE): */
/* +r: %0 = src, %1 = dst, %2 = src_ld, %3 = dst_ld, %4 = dst_tmp */
/* m: %5 = num_rows, %6 = alpha */
/* xmm15 = alpha */
/* TRANS_4x4: transpose the 4x4 tile of 32-bit elements held in xmm<a1..a4>
 * (one source row per register).  xmm<t1..t4> are scratch; the transposed
 * tile is left in xmm<a1..a4>. */
#define TRANS_4x4(a1_no,a2_no,a3_no,a4_no,t1_no,t2_no,t3_no,t4_no)\
"vunpcklps %%xmm"#a2_no",%%xmm"#a1_no",%%xmm"#t1_no"; vunpckhps %%xmm"#a2_no",%%xmm"#a1_no",%%xmm"#t2_no";"\
"vunpcklps %%xmm"#a4_no",%%xmm"#a3_no",%%xmm"#t3_no"; vunpckhps %%xmm"#a4_no",%%xmm"#a3_no",%%xmm"#t4_no";"\
"vunpcklpd %%xmm"#t3_no",%%xmm"#t1_no",%%xmm"#a1_no"; vunpckhpd %%xmm"#t3_no",%%xmm"#t1_no",%%xmm"#a2_no";"\
"vunpcklpd %%xmm"#t4_no",%%xmm"#t2_no",%%xmm"#a3_no"; vunpckhpd %%xmm"#t4_no",%%xmm"#t2_no",%%xmm"#a4_no";"
/* TRANS_4x8: the same instruction sequence on ymm registers.  The unpack
 * instructions work independently on each 128-bit lane, so this performs two
 * 4x4 transposes at once: source columns 0-3 in the low lanes, 4-7 in the
 * high lanes. */
#define TRANS_4x8(a1_no,a2_no,a3_no,a4_no,t1_no,t2_no,t3_no,t4_no)\
"vunpcklps %%ymm"#a2_no",%%ymm"#a1_no",%%ymm"#t1_no"; vunpckhps %%ymm"#a2_no",%%ymm"#a1_no",%%ymm"#t2_no";"\
"vunpcklps %%ymm"#a4_no",%%ymm"#a3_no",%%ymm"#t3_no"; vunpckhps %%ymm"#a4_no",%%ymm"#a3_no",%%ymm"#t4_no";"\
"vunpcklpd %%ymm"#t3_no",%%ymm"#t1_no",%%ymm"#a1_no"; vunpckhpd %%ymm"#t3_no",%%ymm"#t1_no",%%ymm"#a2_no";"\
"vunpcklpd %%ymm"#t4_no",%%ymm"#t2_no",%%ymm"#a3_no"; vunpckhpd %%ymm"#t4_no",%%ymm"#t2_no",%%ymm"#a4_no";"
/* SAVE_4x4: store xmm<b1..b4> (16 bytes each) to four consecutive destination
 * rows starting at %4, leaving %4 advanced by four destination rows. */
#define SAVE_4x4(b1_no,b2_no,b3_no,b4_no)\
"vmovups %%xmm"#b1_no",(%4); vmovups %%xmm"#b2_no",(%4,%3,1); leaq (%4,%3,2),%4;"\
"vmovups %%xmm"#b3_no",(%4); vmovups %%xmm"#b4_no",(%4,%3,1); leaq (%4,%3,2),%4;"
/* SAVE_4x8: SAVE_4x4 for the low 128-bit halves, then the high halves of
 * ymm<b1..b4> go to the next four destination rows (%4 advances by eight rows
 * in total). */
#define SAVE_4x8(b1_no,b2_no,b3_no,b4_no) SAVE_4x4(b1_no,b2_no,b3_no,b4_no)\
"vextractf128 $1,%%ymm"#b1_no",(%4); vextractf128 $1,%%ymm"#b2_no",(%4,%3,1); leaq (%4,%3,2),%4;"\
"vextractf128 $1,%%ymm"#b3_no",(%4); vextractf128 $1,%%ymm"#b4_no",(%4,%3,1); leaq (%4,%3,2),%4;"
/* COPY_4x<N>: consume four source rows of N 32-bit elements each, multiply by
 * alpha (xmm15/ymm15), transpose, and write N destination rows of four
 * elements.  On exit %0 (src) has moved down four source rows and %1 (dst)
 * has moved right by 16 bytes. */
/* COPY_4x16: %4 is set to the current dst and walked down 16 destination rows
 * by the SAVE macros. */
#define COPY_4x16 "movq %1,%4; addq $16,%1;"\
"vmulps (%0),%%ymm15,%%ymm0; vmulps 32(%0),%%ymm15,%%ymm4; vmulps (%0,%2,1),%%ymm15,%%ymm1; vmulps 32(%0,%2,1),%%ymm15,%%ymm5; leaq (%0,%2,2),%0;"\
"vmulps (%0),%%ymm15,%%ymm2; vmulps 32(%0),%%ymm15,%%ymm6; vmulps (%0,%2,1),%%ymm15,%%ymm3; vmulps 32(%0,%2,1),%%ymm15,%%ymm7; leaq (%0,%2,2),%0;"\
TRANS_4x8(0,1,2,3,8,9,10,11) SAVE_4x8(0,1,2,3)\
TRANS_4x8(4,5,6,7,8,9,10,11) SAVE_4x8(4,5,6,7)
/* COPY_4x8: one ymm per source row, eight destination rows written. */
#define COPY_4x8 "movq %1,%4; addq $16,%1;"\
"vmulps (%0),%%ymm15,%%ymm0; vmulps (%0,%2,1),%%ymm15,%%ymm1; leaq (%0,%2,2),%0;"\
"vmulps (%0),%%ymm15,%%ymm2; vmulps (%0,%2,1),%%ymm15,%%ymm3; leaq (%0,%2,2),%0;"\
TRANS_4x8(0,1,2,3,8,9,10,11) SAVE_4x8(0,1,2,3)
/* COPY_4x4: one xmm per source row, four destination rows written. */
#define COPY_4x4 "movq %1,%4; addq $16,%1;"\
"vmulps (%0),%%xmm15,%%xmm0; vmulps (%0,%2,1),%%xmm15,%%xmm1; leaq (%0,%2,2),%0;"\
"vmulps (%0),%%xmm15,%%xmm2; vmulps (%0,%2,1),%%xmm15,%%xmm3; leaq (%0,%2,2),%0;"\
TRANS_4x4(0,1,2,3,8,9,10,11) SAVE_4x4(0,1,2,3)
/* COPY_4x2: two source rows are packed per xmm (8-byte loads); vpermilps $216
 * reorders elements to 0,2,1,3 so the 64-bit unpacks yield the two output
 * rows, which are stored directly through %1 (no use of %4). */
#define COPY_4x2 \
"vmovsd (%0),%%xmm0; vmovhpd (%0,%2,1),%%xmm0,%%xmm0; vmulps %%xmm15,%%xmm0,%%xmm0; leaq (%0,%2,2),%0;"\
"vmovsd (%0),%%xmm1; vmovhpd (%0,%2,1),%%xmm1,%%xmm1; vmulps %%xmm15,%%xmm1,%%xmm1; leaq (%0,%2,2),%0;"\
"vpermilps $216,%%xmm0,%%xmm0; vpermilps $216,%%xmm1,%%xmm1; vunpcklpd %%xmm1,%%xmm0,%%xmm2; vunpckhpd %%xmm1,%%xmm0,%%xmm3;"\
"vmovups %%xmm2,(%1); vmovups %%xmm3,(%1,%3,1); addq $16,%1;"
/* COPY_4x1: gather one element from each of four source rows into a single
 * xmm, scale it, and store it as one destination row. */
#define COPY_4x1 \
"vmovss (%0),%%xmm0; vinsertps $16,(%0,%2,1),%%xmm0,%%xmm0; leaq (%0,%2,2),%0;"\
"vinsertps $32,(%0),%%xmm0,%%xmm0; vinsertps $48,(%0,%2,1),%%xmm0,%%xmm0; leaq (%0,%2,2),%0;"\
"vmulps %%xmm15,%%xmm0,%%xmm0; vmovups %%xmm0,(%1); addq $16,%1;"
/* SAVE_2x4: xmm<c1> and xmm<c2> each hold four elements of one source row.
 * Interleave them, scale by alpha, and write four destination rows of two
 * elements each through %4 (advanced by four destination rows).
 * xmm<t1>/xmm<t2> are scratch. */
#define SAVE_2x4(c1_no,c2_no,t1_no,t2_no) \
"vunpcklps %%xmm"#c2_no",%%xmm"#c1_no",%%xmm"#t1_no"; vmulps %%xmm15,%%xmm"#t1_no",%%xmm"#t1_no";"\
"vmovsd %%xmm"#t1_no",(%4); vmovhpd %%xmm"#t1_no",(%4,%3,1); leaq (%4,%3,2),%4;"\
"vunpckhps %%xmm"#c2_no",%%xmm"#c1_no",%%xmm"#t2_no"; vmulps %%xmm15,%%xmm"#t2_no",%%xmm"#t2_no";"\
"vmovsd %%xmm"#t2_no",(%4); vmovhpd %%xmm"#t2_no",(%4,%3,1); leaq (%4,%3,2),%4;"
/* COPY_2x<N>: consume two source rows of N elements and write N destination
 * rows of two elements.  %0 moves down two source rows, %1 right by 8 bytes. */
/* COPY_2x16: the ymm loads are split into 128-bit quarters so SAVE_2x4 can be
 * applied to source columns 0-3, 4-7, 8-11 and 12-15 in that order. */
#define COPY_2x16 "movq %1,%4; addq $8,%1;"\
"vmovups (%0),%%ymm0; vmovups 32(%0),%%ymm2; vmovups (%0,%2,1),%%ymm1; vmovups 32(%0,%2,1),%%ymm3; leaq (%0,%2,2),%0;"\
"vextractf128 $1,%%ymm0,%%xmm4; vextractf128 $1,%%ymm2,%%xmm6; vextractf128 $1,%%ymm1,%%xmm5; vextractf128 $1,%%ymm3,%%xmm7;"\
SAVE_2x4(0,1,8,9) SAVE_2x4(4,5,8,9) SAVE_2x4(2,3,8,9) SAVE_2x4(6,7,8,9)
#define COPY_2x8 "movq %1,%4; addq $8,%1;"\
"vmovups (%0),%%ymm0; vmovups (%0,%2,1),%%ymm1; leaq (%0,%2,2),%0;"\
"vextractf128 $1,%%ymm0,%%xmm2; vextractf128 $1,%%ymm1,%%xmm3;"\
SAVE_2x4(0,1,4,5) SAVE_2x4(2,3,4,5)
#define COPY_2x4 "movq %1,%4; addq $8,%1;"\
"vmovups (%0),%%xmm0; vmovups (%0,%2,1),%%xmm1; leaq (%0,%2,2),%0;"\
SAVE_2x4(0,1,4,5)
/* COPY_2x2 / COPY_2x1: small tiles stored directly through %1 (no %4). */
#define COPY_2x2 \
"vmovsd (%0),%%xmm0; vmovhpd (%0,%2,1),%%xmm0,%%xmm0; vmulps %%xmm15,%%xmm0,%%xmm0; leaq (%0,%2,2),%0; vpermilps $216,%%xmm0,%%xmm0;"\
"vmovsd %%xmm0,(%1); vmovhpd %%xmm0,(%1,%3,1); addq $8,%1;"
#define COPY_2x1 \
"vmovss (%0),%%xmm0; vinsertps $16,(%0,%2,1),%%xmm0,%%xmm0; vmulps %%xmm15,%%xmm0,%%xmm0; leaq (%0,%2,2),%0; vmovsd %%xmm0,(%1); addq $8,%1;"
/* SAVE_1x4: scale the four elements in xmm<c1> by alpha and scatter them, one
 * element per destination row, to four consecutive rows through %4 (advanced
 * by four destination rows). */
#define SAVE_1x4(c1_no)\
"vmulps %%xmm15,%%xmm"#c1_no",%%xmm"#c1_no"; vmovss %%xmm"#c1_no",(%4); vextractps $1,%%xmm"#c1_no",(%4,%3,1); leaq (%4,%3,2),%4;"\
"vextractps $2,%%xmm"#c1_no",(%4); vextractps $3,%%xmm"#c1_no",(%4,%3,1); leaq (%4,%3,2),%4;"
/* COPY_1x<N>: consume one source row of N elements and write N destination
 * rows of one element.  %0 moves down one source row, %1 right by 4 bytes. */
#define COPY_1x16 "movq %1,%4; addq $4,%1;"\
"vmovups (%0),%%xmm1;" SAVE_1x4(1) "vmovups 16(%0),%%xmm2;" SAVE_1x4(2)\
"vmovups 32(%0),%%xmm1;" SAVE_1x4(1) "vmovups 48(%0),%%xmm2;" SAVE_1x4(2) "addq %2,%0;"
#define COPY_1x8 "movq %1,%4; addq $4,%1;"\
"vmovups (%0),%%xmm1;" SAVE_1x4(1) "vmovups 16(%0),%%xmm2;" SAVE_1x4(2) "addq %2,%0;"
#define COPY_1x4 "movq %1,%4; addq $4,%1; vmovups (%0),%%xmm1;" SAVE_1x4(1) "addq %2,%0;"
/* The 1x2 and 1x1 cases store directly through %1 and do not touch %4. */
#define COPY_1x2 "vmovsd (%0),%%xmm1; addq %2,%0; vmulps %%xmm15,%%xmm1,%%xmm1; vmovss %%xmm1,(%1); vextractps $1,%%xmm1,(%1,%3,1); addq $4,%1;"
#define COPY_1x1 "vmulss (%0),%%xmm15,%%xmm1; vmovss %%xmm1,(%1); addq %2,%0; addq $4,%1;"
/* COMPUTE(ndim): transpose-and-scale one strip that is ndim source columns
 * wide and num_rows source rows tall.
 *
 * Expands to a statement block that relies on the caller's locals src,
 * src_base, dst, dst_base, dst_tmp, src_ld_bytes, dst_ld_bytes, num_rows and
 * ALPHA.  src/dst are reset from src_base/dst_base first; the asm then
 * broadcasts ALPHA into ymm15 and keeps the remaining row count in r11:
 *   <ndim>31: loop running COPY_4x<ndim> while at least four rows remain
 *   <ndim>32: one COPY_2x<ndim> if at least two rows remain
 *   <ndim>33: one COPY_1x<ndim> if a single row remains
 *   <ndim>34: exit
 * ndim is pasted into the numeric label names so that every expansion of the
 * macro defines its own set of labels. */
#define COMPUTE(ndim){\
src = src_base; dst = dst_base;\
__asm__ __volatile__(\
"vbroadcastss %6,%%ymm15; movq %5,%%r11; cmpq $4,%%r11; jb "#ndim"32f;"\
#ndim"31:\n\t"\
COPY_4x##ndim "subq $4,%%r11; cmpq $4,%%r11; jnb "#ndim"31b;"\
#ndim"32:\n\t"\
"cmpq $2,%%r11; jb "#ndim"33f;"\
COPY_2x##ndim "subq $2,%%r11;"\
#ndim"33:\n\t"\
"testq %%r11,%%r11; jz "#ndim"34f;"\
COPY_1x##ndim "subq $1,%%r11;"\
#ndim"34:\n\t"\
:"+r"(src),"+r"(dst),"+r"(src_ld_bytes),"+r"(dst_ld_bytes),"+r"(dst_tmp):"m"(num_rows),"m"(ALPHA):"r11","cc","memory"\
,"xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15");\
}
/* Scaled out-of-place transpose:
 *   b[j*ldb + i] = alpha * a[i*lda + j]   for 0 <= i < rows, 0 <= j < cols.
 * alpha == 0 zero-fills the destination without reading the source.
 * Returns 0 in all cases; non-positive rows or cols is a no-op.
 *
 * The COPY_* asm kernels above load, multiply and store 4-byte elements only
 * (vmulps/vmovss, byte strides computed with sizeof(float)), so they are used
 * only when FLOAT is float.  When this file is compiled with DOUBLE defined
 * the work is done by portable C loops instead of feeding doubles to the
 * single-precision asm. */
int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG ldb){
#if defined(DOUBLE)
  BLASLONG i, j;
  if(rows <= 0 || cols <= 0) return 0;
  if(alpha == 0.0){
    for(j=0;j<cols;j++) for(i=0;i<rows;i++) b[j * ldb + i] = 0.0;
    return 0;
  }
  /* read the source row-wise (contiguous), write the destination strided */
  for(i=0;i<rows;i++) for(j=0;j<cols;j++) b[j * ldb + i] = alpha * a[i * lda + j];
  return 0;
#else
  /* dst_tmp is only scratch for the asm, but it is bound as a "+r" (read-write)
   * operand, so it must hold a defined value before the first COMPUTE. */
  float *src, *dst, *dst_tmp = NULL, *src_base, *dst_base;
  uint64_t src_ld_bytes = (uint64_t)lda * sizeof(float), dst_ld_bytes = (uint64_t)ldb * sizeof(float), num_rows = 0;
  BLASLONG cols_left, rows_done; float ALPHA = alpha;
  /* reject empty/negative sizes before rows is converted to an unsigned byte count */
  if(rows <= 0 || cols <= 0) return 0;
  if(ALPHA==0.0){
    dst_base = b;
    for(cols_left=cols;cols_left>0;cols_left--) {memset(dst_base,0,rows*sizeof(float)); dst_base += ldb;}
    return 0;
  }
  /* walk the source in horizontal bands of at most ROWS_OF_BLOCK rows */
  for(rows_done=0;rows_done<rows;rows_done+=num_rows){
    num_rows = rows-rows_done;
    if(num_rows > ROWS_OF_BLOCK) num_rows = ROWS_OF_BLOCK;
    cols_left = cols; src_base = a + (int64_t)lda * (int64_t)rows_done; dst_base = b + rows_done;
    /* The 16-column kernel is skipped when ldb is within 3 elements of a
     * multiple of 1024; presumably this avoids cache-set conflicts between
     * the 16 strided destination rows -- TODO confirm. */
    if(ldb%1024>3 && ldb%1024<1021) for(;cols_left>15;cols_left-=16){COMPUTE(16) src_base += 16; dst_base += 16 * ldb;}
    for(;cols_left>7;cols_left-=8){COMPUTE(8) src_base += 8; dst_base += 8 * ldb;}
    for(;cols_left>3;cols_left-=4){COMPUTE(4) src_base += 4; dst_base += 4 * ldb;}
    for(;cols_left>1;cols_left-=2){COMPUTE(2) src_base += 2; dst_base += 2 * ldb;}
    if(cols_left>0){COMPUTE(1) src_base ++; dst_base += ldb;}
  }
  return 0;
#endif
}
#else
/* Portable scaled out-of-place transpose (used when HAVE_AVX is not defined):
 *   b[j*ldb + i] = alpha * a[i*lda + j]   for 0 <= i < rows, 0 <= j < cols.
 * alpha == 0 zero-fills the destination without reading the source, matching
 * the AVX implementation of this routine.  Returns 0 in all cases;
 * non-positive rows or cols is a no-op. */
int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG ldb)
{
  BLASLONG i, j, ii, jj, i_end, j_end;

  if (rows <= 0) return 0;
  if (cols <= 0) return 0;

  if (alpha == 0.0) {
    for (j = 0; j < cols; j++) {
      for (i = 0; i < rows; i++) b[j * ldb + i] = 0.0;
    }
    return 0;
  }

  /* Walk the source in 4x4 tiles (clipped at the right and bottom edges) so
   * that both the row-wise reads and the strided writes stay within a small
   * working set. */
  for (i = 0; i < rows; i += 4) {
    i_end = i + 4;
    if (i_end > rows) i_end = rows;
    for (j = 0; j < cols; j += 4) {
      j_end = j + 4;
      if (j_end > cols) j_end = cols;
      for (ii = i; ii < i_end; ii++) {
        for (jj = j; jj < j_end; jj++) {
          b[jj * ldb + ii] = a[ii * lda + jj] * alpha;
        }
      }
    }
  }
  return 0;
}
#endif

View File

@ -11,7 +11,7 @@
#if defined(SKYLAKEX)
#include "sasum_microk_skylakex-2.c"
#elif defined(HASWELL)
#elif defined(HASWELL) || defined(ZEN)
#include "sasum_microk_haswell-2.c"
#endif

View File

@ -0,0 +1,426 @@
#include "sbgemm.h"
#include <immintrin.h>
// Work around intrinsics that some compilers do not provide
#define MM256_LOADU_EPI16(addr) \
_mm256_maskz_loadu_epi16(~0, (addr))
#define MM256_STOREU_EPI16(addr, reg) \
_mm256_mask_storeu_epi16((addr), ~0, (reg))
#include <stdio.h>
// Debug helper: dump an m x n block of bf16 values (stored row after row with
// a row length of n) to stdout as hexadecimal words.
void print_block(BLASLONG m, BLASLONG n, bfloat16 * mat)
{
    // BLASLONG is not guaranteed to be `long`; convert explicitly so the
    // arguments always match the %ld conversions.
    printf("---- BLOCK %ld x %ld ----\n", (long)m, (long)n);
    for (BLASLONG i=0; i<m; i++) {
        for (BLASLONG j=0; j<n; j++) {
            // %X expects an unsigned int
            printf("%-4X ", (unsigned int)mat[i*n + j]);
        }
        printf("\n");
    }
    printf("---- End of BLOCK ----\n");
}
// Pack k vectors of 32 bf16 values (vector i starts at A + i*lda) into block_A.
// Vectors are taken two at a time and interleaved 16-bit-wise with
// unpacklo/unpackhi (which operate per 128-bit lane); each pair produces 64
// output values.  When k is odd the last vector is paired with zeros.
void COL_MAJOR_INCOPY_KERNEL_Kx32(BLASLONG k, bfloat16 * A, BLASLONG lda, bfloat16 * block_A)
{
    const BLASLONG paired_k = k & (~1);
    BLASLONG kk;
    __m512i vec_a, vec_b;

    for (kk = 0; kk < paired_k; kk += 2) {
        vec_a = _mm512_loadu_si512(&A[kk * lda]);
        vec_b = _mm512_loadu_si512(&A[kk * lda + lda]);

        _mm512_storeu_si512(&block_A[kk * 32],      _mm512_unpacklo_epi16(vec_a, vec_b));
        _mm512_storeu_si512(&block_A[kk * 32 + 32], _mm512_unpackhi_epi16(vec_a, vec_b));
    }

    // kk now indexes the first vector not consumed by the paired loop
    if (paired_k != k) {
        const __m512i zeros = _mm512_setzero_si512();

        vec_a = _mm512_loadu_si512(&A[kk * lda]);
        _mm512_storeu_si512(&block_A[kk * 32],      _mm512_unpacklo_epi16(vec_a, zeros));
        _mm512_storeu_si512(&block_A[kk * 32 + 32], _mm512_unpackhi_epi16(vec_a, zeros));
    }

#ifdef DEBUG_PROFILE
    print_block(BF16_BLOCK_THRES_K, BF16_BLOCK_THRES_M, block_A);
#endif
}
// Masked variant of the Kx32 pack: only the first m of the 32 values in each
// source vector are read (the remaining lanes are loaded as zero); full
// 64-value pairs are still written to block_A.
// The mask is 0xffffffff >> (32-m), so m is expected to be in 1..32.
void COL_MAJOR_INCOPY_KERNEL_Kx32m(BLASLONG k, BLASLONG m, bfloat16 * A, BLASLONG lda, bfloat16 * block_A)
{
    const BLASLONG paired_k = k & (~1);
    const __mmask32 lane_mask = (__mmask32)(((unsigned int)0xffffffff) >> (32-m));
    BLASLONG kk;
    __m512i vec_a, vec_b;

    for (kk = 0; kk < paired_k; kk += 2) {
        vec_a = _mm512_maskz_loadu_epi16(lane_mask, &A[kk * lda]);
        vec_b = _mm512_maskz_loadu_epi16(lane_mask, &A[kk * lda + lda]);

        _mm512_storeu_si512(&block_A[kk * 32],      _mm512_unpacklo_epi16(vec_a, vec_b));
        _mm512_storeu_si512(&block_A[kk * 32 + 32], _mm512_unpackhi_epi16(vec_a, vec_b));
    }

    // odd k: pair the last vector with zeros
    if (paired_k != k) {
        const __m512i zeros = _mm512_setzero_si512();

        vec_a = _mm512_maskz_loadu_epi16(lane_mask, &A[kk * lda]);
        _mm512_storeu_si512(&block_A[kk * 32],      _mm512_unpacklo_epi16(vec_a, zeros));
        _mm512_storeu_si512(&block_A[kk * 32 + 32], _mm512_unpackhi_epi16(vec_a, zeros));
    }

#ifdef DEBUG_PROFILE
    print_block(BF16_BLOCK_THRES_K, BF16_BLOCK_THRES_M, block_A);
#endif
}
// 16-wide pack: k vectors of 16 bf16 values (vector i starts at A + i*lda)
// are interleaved pairwise into block_A, 32 output values per pair.  When k
// is odd the last vector is paired with zeros.  The m argument is not used.
void COL_MAJOR_INCOPY_KERNEL_Kx16(BLASLONG k, BLASLONG m, bfloat16 * A, BLASLONG lda, bfloat16 * block_A)
{
    const BLASLONG paired_k = k & (~1);
    BLASLONG kk;
    __m256i vec_a, vec_b;

    for (kk = 0; kk < paired_k; kk += 2) {
        vec_a = MM256_LOADU_EPI16(&A[kk * lda]);
        vec_b = MM256_LOADU_EPI16(&A[kk * lda + lda]);

        // both halves of the interleaved pair go to one 32-value row of block_A
        MM256_STOREU_EPI16(&block_A[kk * 16],      _mm256_unpacklo_epi16(vec_a, vec_b));
        MM256_STOREU_EPI16(&block_A[kk * 16 + 16], _mm256_unpackhi_epi16(vec_a, vec_b));
    }

    if (paired_k != k) {
        const __m256i zeros = _mm256_setzero_si256();

        vec_a = MM256_LOADU_EPI16(&A[kk * lda]);
        MM256_STOREU_EPI16(&block_A[kk * 16],      _mm256_unpacklo_epi16(vec_a, zeros));
        MM256_STOREU_EPI16(&block_A[kk * 16 + 16], _mm256_unpackhi_epi16(vec_a, zeros));
    }

#ifdef DEBUG_PROFILE
    print_block(BF16_BLOCK_THRES_K, BF16_BLOCK_THRES_M, block_A);
#endif
}
// Masked variant of the Kx16 pack: only the first m of the 16 values in each
// source vector are read (the remaining lanes are loaded as zero); full
// 32-value pairs are still written to block_A.
// The mask is 0xffff >> (16-m), so m is expected to be at most 16.
void COL_MAJOR_INCOPY_KERNEL_Kx16m(BLASLONG k, BLASLONG m, bfloat16 * A, BLASLONG lda, bfloat16 * block_A)
{
    const BLASLONG paired_k = k & (~1);
    const __mmask16 lane_mask = (__mmask16)(((unsigned short)0xffff) >> (16-m));
    BLASLONG kk;
    __m256i vec_a, vec_b;

    for (kk = 0; kk < paired_k; kk += 2) {
        vec_a = _mm256_maskz_loadu_epi16(lane_mask, &A[kk * lda]);
        vec_b = _mm256_maskz_loadu_epi16(lane_mask, &A[kk * lda + lda]);

        // both halves of the interleaved pair go to one 32-value row of block_A
        MM256_STOREU_EPI16(&block_A[kk * 16],      _mm256_unpacklo_epi16(vec_a, vec_b));
        MM256_STOREU_EPI16(&block_A[kk * 16 + 16], _mm256_unpackhi_epi16(vec_a, vec_b));
    }

    if (paired_k != k) {
        const __m256i zeros = _mm256_setzero_si256();

        vec_a = _mm256_maskz_loadu_epi16(lane_mask, &A[kk * lda]);
        MM256_STOREU_EPI16(&block_A[kk * 16],      _mm256_unpacklo_epi16(vec_a, zeros));
        MM256_STOREU_EPI16(&block_A[kk * 16 + 16], _mm256_unpackhi_epi16(vec_a, zeros));
    }

#ifdef DEBUG_PROFILE
    print_block(BF16_BLOCK_THRES_K, BF16_BLOCK_THRES_M, block_A);
#endif
}
// Pack eight vectors of B (vector c starts at B + c*ldb, k bf16 values each)
// into block_B in chunks of 32 values: for every chunk the eight 32-value
// pieces are stored back to back (256 values per chunk).  A final partial
// chunk is read with a mask so that lanes past k are stored as zero.
void COL_MAJOR_ONCOPY_KERNEL_8x32(BLASLONG k, bfloat16 * B, BLASLONG ldb, bfloat16 * block_B)
{
    const BLASLONG full_k = k & (~31);
    BLASLONG kk, col;

    for (kk = 0; kk < full_k; kk += 32) {
        for (col = 0; col < 8; col++) {
            _mm512_storeu_si512(&block_B[kk * 8 + 32 * col], _mm512_loadu_si512(&B[col * ldb + kk]));
        }
    }

    // kk * 8 is the output offset reached by the full-chunk loop
    if (full_k != k) {
        const __mmask32 lane_mask = (__mmask32)(((unsigned int)0xffffffff) >> (32-(k-full_k)));

        for (col = 0; col < 8; col++) {
            _mm512_storeu_si512(&block_B[kk * 8 + 32 * col], _mm512_maskz_loadu_epi16(lane_mask, &B[col * ldb + full_k]));
        }
    }

#ifdef DEBUG_PROFILE
    print_block(BF16_BLOCK_THRES_N, BF16_BLOCK_THRES_K, block_B);
#endif
}
// Same packing as the 8x32 kernel but for n vectors of B instead of eight:
// for every 32-value chunk along k, the n pieces (vector c starts at
// B + c*ldb) are appended to block_B one after another.  A final partial
// chunk is read with a mask so that lanes past k are stored as zero.
void COL_MAJOR_ONCOPY_KERNEL_Nx32(BLASLONG n, BLASLONG k, bfloat16 * B, BLASLONG ldb, bfloat16 * block_B)
{
    const BLASLONG full_k   = k & (~31);
    const BLASLONG paired_n = n & (~1);
    BLASLONG kk, col;
    BLASLONG out = 0;   // running write offset into block_B

    for (kk = 0; kk < full_k; kk += 32) {
        for (col = 0; col < paired_n; col++) {
            _mm512_storeu_si512(&block_B[out], _mm512_loadu_si512(&B[col * ldb + kk]));
            out += 32;
        }
        // odd n: one trailing vector
        if (paired_n != n) {
            _mm512_storeu_si512(&block_B[out], _mm512_loadu_si512(&B[col * ldb + kk]));
            out += 32;
        }
    }

    if (full_k != k) {
        const __mmask32 lane_mask = (__mmask32)(((unsigned int)0xffffffff) >> (32-(k-full_k)));

        for (col = 0; col < paired_n; col++) {
            _mm512_storeu_si512(&block_B[out], _mm512_maskz_loadu_epi16(lane_mask, &B[col * ldb + full_k]));
            out += 32;
        }
        if (paired_n != n) {
            _mm512_storeu_si512(&block_B[out], _mm512_maskz_loadu_epi16(lane_mask, &B[col * ldb + full_k]));
        }
    }

#ifdef DEBUG_PROFILE
    print_block(BF16_BLOCK_THRES_N, BF16_BLOCK_THRES_K, block_B);
#endif
}
// Scale the M x N matrix C in place by beta (intended for beta other than
// ZERO or ONE).
//
// Order selects the storage layout and ldc is the distance, in elements,
// between consecutive contiguous lines of C:
//   - column-major: N lines (columns) of M contiguous elements
//   - otherwise (row-major): M lines (rows) of N contiguous elements
// Scaling is element-wise, so both layouts share one loop with the roles of
// M and N swapped.  Non-positive M or N is a no-op.
void sbgemm_scal_operation(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc)
{
    const BLASLONG line_len   = (Order == CblasColMajor) ? M : N;
    const BLASLONG line_count = (Order == CblasColMajor) ? N : M;
    const BLASLONG full_len   = line_len & (~15);
    // low (line_len - full_len) bits set; only used when a partial vector remains
    const __mmask16 tail_mask = (__mmask16)(((unsigned short)0xffff) >> (16 - (line_len - full_len)));
    const __m512 BETAVECTOR = _mm512_set1_ps(beta);
    BLASLONG line, pos;
    float *base;

    if (line_len <= 0 || line_count <= 0) return;

    for (line = 0; line < line_count; line++) {
        // widen before multiplying so large ldc values cannot overflow blasint
        base = C + line * (BLASLONG)ldc;

        for (pos = 0; pos < full_len; pos += 16) {
            _mm512_storeu_ps(base + pos, _mm512_mul_ps(BETAVECTOR, _mm512_loadu_ps(base + pos)));
        }

        if (full_len != line_len) {
            _mm512_mask_storeu_ps(base + full_len, tail_mask,
                                  _mm512_mul_ps(BETAVECTOR, _mm512_maskz_loadu_ps(tail_mask, base + full_len)));
        }
    }
}
// Set every element of the M x N matrix C to zero.
//
// Order selects the storage layout and ldc is the distance, in elements,
// between consecutive contiguous lines of C:
//   - column-major: N lines (columns) of M contiguous elements
//   - otherwise (row-major): M lines (rows) of N contiguous elements
// Elements between the end of a line and the next line start (the ldc
// padding) are left untouched.  Non-positive M or N is a no-op.
void sbgemm_zero_operation(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, float *C, OPENBLAS_CONST blasint ldc)
{
    const BLASLONG line_len   = (Order == CblasColMajor) ? M : N;
    const BLASLONG line_count = (Order == CblasColMajor) ? N : M;
    const BLASLONG full_len   = line_len & (~15);
    // low (line_len - full_len) bits set; only used when a partial vector remains
    const __mmask16 tail_mask = (__mmask16)(((unsigned short)0xffff) >> (16 - (line_len - full_len)));
    const __m512 ZEROVECTOR = _mm512_setzero_ps();
    BLASLONG line, pos;
    float *base;

    if (line_len <= 0 || line_count <= 0) return;

    for (line = 0; line < line_count; line++) {
        // widen before multiplying so large ldc values cannot overflow blasint
        base = C + line * (BLASLONG)ldc;

        for (pos = 0; pos < full_len; pos += 16) {
            _mm512_storeu_ps(base + pos, ZEROVECTOR);
        }

        if (full_len != line_len) {
            _mm512_mask_storeu_ps(base + full_len, tail_mask, ZEROVECTOR);
        }
    }
}

View File

@ -0,0 +1,625 @@
#include "sbgemm.h"
#include "bf16_common_macros.h"
#include <immintrin.h>
// Name selection for the kernels defined in this file.  Each generic macro is
// first #undef'd and then bound to either the "_alpha" or the "_one" flavour
// depending on ONE_ALPHA; presumably this file is included more than once
// with different ONE_ALPHA settings, which is why the previous bindings are
// cleared first -- TODO confirm against the including file.
#undef STORE16_COMPLETE_RESULT
#undef STORE16_MASK_COMPLETE_RESULT
#undef SBGEMM_BLOCK_KERNEL_32x8x32
#undef SBGEMM_BLOCK_KERNEL_16x8x32
#undef SBGEMM_BLOCK_KERNEL_32xNx32
#undef SBGEMM_BLOCK_KERNEL_16xNx32
#undef SBGEMM_BLOCKING_KERNEL_2
// NOTE(review): the general-alpha branch maps the store macros to *_ALPHA_ONE
// and the alpha==1 branch to *_ONE_ONE; the suffix looks like it encodes
// <alpha>_<beta> -- confirm in bf16_common_macros.h.
#ifndef ONE_ALPHA // ALPHA is not ONE
#define STORE16_COMPLETE_RESULT STORE16_COMPLETE_RESULT_ALPHA_ONE
#define STORE16_MASK_COMPLETE_RESULT STORE16_MASK_COMPLETE_RESULT_ALPHA_ONE
#define SBGEMM_BLOCK_KERNEL_32x8x32 sbgemm_block_kernel_32x8x32_alpha
#define SBGEMM_BLOCK_KERNEL_16x8x32 sbgemm_block_kernel_16x8x32_alpha
#define SBGEMM_BLOCK_KERNEL_32xNx32 sbgemm_block_kernel_32xNx32_alpha
#define SBGEMM_BLOCK_KERNEL_16xNx32 sbgemm_block_kernel_16xNx32_alpha
#define SBGEMM_BLOCKING_KERNEL_2 sbgemm_blocking_kernel_2_alpha
#else // ALPHA is ONE
#define STORE16_COMPLETE_RESULT STORE16_COMPLETE_RESULT_ONE_ONE
#define STORE16_MASK_COMPLETE_RESULT STORE16_MASK_COMPLETE_RESULT_ONE_ONE
#define SBGEMM_BLOCK_KERNEL_32x8x32 sbgemm_block_kernel_32x8x32_one
#define SBGEMM_BLOCK_KERNEL_16x8x32 sbgemm_block_kernel_16x8x32_one
#define SBGEMM_BLOCK_KERNEL_32xNx32 sbgemm_block_kernel_32xNx32_one
#define SBGEMM_BLOCK_KERNEL_16xNx32 sbgemm_block_kernel_16xNx32_one
#define SBGEMM_BLOCKING_KERNEL_2 sbgemm_blocking_kernel_2_one
#endif
// SBGEMM Kernel for 16<M<=32, N=8, K can be any number, but the processing will take 32 as a base
//
// Parameters:
//   m     - number of valid rows in this tile; when m != 32 the second 16-float store of every
//           column is masked with (0xffff >> (32-m))
//   k     - depth handled by this call, consumed in chunks of up to 32
//   alpha - scaling factor, broadcast into ALPHAVECTOR when ONE_ALPHA is not defined
//   A     - packed A panel; 64 bf16 values (two 512-bit loads) are read per pair of k indices
//   B     - packed B panel; 8 vectors of 32 bf16 values are read per 32-deep k chunk
//   C     - output tile; column j is addressed as C[ldc*j]
//   ldc   - leading dimension of C
//
// NOTE(review): STORE16_COMPLETE_RESULT / STORE16_MASK_COMPLETE_RESULT are defined outside this
// view. ALPHAVECTOR is not referenced directly in this body, so presumably those macros consume it
// and combine the scaled result with the existing contents of C - confirm against their definition.
#ifndef ONE_ALPHA // ALPHA is not ONE
void sbgemm_block_kernel_32x8x32_alpha(BLASLONG m, BLASLONG k, float alpha, bfloat16 *A, bfloat16 *B, float *C, int ldc)
#else // ALPHA is ONE
void sbgemm_block_kernel_32x8x32_one(BLASLONG m, BLASLONG k, float alpha, bfloat16 *A, bfloat16 *B, float *C, int ldc)
#endif
{
// 0x39 == 0b00111001: as a shuffle immediate it selects source elements 1,2,3,0, i.e. it rotates
// the four elements (32-bit words or 128-bit lanes, depending on the intrinsic) down by one.
int SHUFFLE_MAGIC_NO = 0x39;
// Largest multiple of 32 that is <= k; the k chunk starting here (if any) is the partial one.
BLASLONG tag_k_32x = k & (~31);
BLASLONG idxA_base = 0;
BLASLONG idxB_base = 0;
// Number of k indices processed in the current chunk (32, or the remainder for the last chunk).
BLASLONG width = 32;
#ifndef ONE_ALPHA
__m512 ALPHAVECTOR = _mm512_set1_ps(alpha);
#endif
__m512i arrayA_512_0, arrayA_512_1;
__m512i arrayB_512_0, arrayB_512_1, arrayB_512_2, arrayB_512_3, arrayB_512_4, arrayB_512_5, arrayB_512_6, arrayB_512_7;
// result_512_0..7 accumulate arrayA_512_0 against B vectors 0..7,
// result_512_8..15 accumulate arrayA_512_1 against the same B vectors.
__m512 result_512_0, result_512_1, result_512_2, result_512_3, result_512_4, result_512_5, result_512_6, result_512_7,
result_512_8, result_512_9, result_512_10, result_512_11, result_512_12, result_512_13, result_512_14, result_512_15;
__m512 result_512_tmp_0, result_512_tmp_1, result_512_tmp_2, result_512_tmp_3;
__m512i M512_EPI32_8 = _mm512_set1_epi32(8);
// Permutation indices for _mm512_permutex2var_ps (indices >= 16 pick from the second source):
// base0 yields a[0..3], b[0..3], a[4..7], b[4..7]; base1 (= base0 + 8) yields
// a[8..11], b[8..11], a[12..15], b[12..15].
__m512i shuffle_idx_base0 = _mm512_set_epi32(23, 22, 21, 20, 7, 6, 5, 4, 19, 18, 17, 16, 3, 2, 1, 0);
__m512i shuffle_idx_base1 = _mm512_add_epi32(shuffle_idx_base0, M512_EPI32_8);
result_512_0 = _mm512_setzero_ps();
result_512_1 = _mm512_setzero_ps();
result_512_2 = _mm512_setzero_ps();
result_512_3 = _mm512_setzero_ps();
result_512_4 = _mm512_setzero_ps();
result_512_5 = _mm512_setzero_ps();
result_512_6 = _mm512_setzero_ps();
result_512_7 = _mm512_setzero_ps();
result_512_8 = _mm512_setzero_ps();
result_512_9 = _mm512_setzero_ps();
result_512_10 = _mm512_setzero_ps();
result_512_11 = _mm512_setzero_ps();
result_512_12 = _mm512_setzero_ps();
result_512_13 = _mm512_setzero_ps();
result_512_14 = _mm512_setzero_ps();
result_512_15 = _mm512_setzero_ps();
for (BLASLONG idx_k = 0; idx_k < k; idx_k += 32) {
// Load B with unroll 8
// Eight full 512-bit vectors are loaded even for a partial last chunk.
// NOTE(review): this presumably relies on the packed B buffer being padded to a multiple of 32 in k - confirm.
idxB_base = idx_k << 3;
arrayB_512_0 = _mm512_loadu_si512(&B[idxB_base + 32*0]);
arrayB_512_1 = _mm512_loadu_si512(&B[idxB_base + 32*1]);
arrayB_512_2 = _mm512_loadu_si512(&B[idxB_base + 32*2]);
arrayB_512_3 = _mm512_loadu_si512(&B[idxB_base + 32*3]);
arrayB_512_4 = _mm512_loadu_si512(&B[idxB_base + 32*4]);
arrayB_512_5 = _mm512_loadu_si512(&B[idxB_base + 32*5]);
arrayB_512_6 = _mm512_loadu_si512(&B[idxB_base + 32*6]);
arrayB_512_7 = _mm512_loadu_si512(&B[idxB_base + 32*7]);
// Last (partial) chunk: only the remaining k indices are processed.
if (idx_k == tag_k_32x) {width = k - tag_k_32x;}
for (BLASLONG idx = 0; idx < width;) {
// Each two rows are a group for 32-pair bf16 elements
idxA_base = idx << 5;
arrayA_512_0 = _mm512_loadu_si512(&A[idxA_base]);
arrayA_512_1 = _mm512_loadu_si512(&A[idxA_base + 32]);
// The lowest 32 bits of each B vector (one bf16 pair) are broadcast to all lanes and
// fed to the bf16 dot-product together with the A vectors.
result_512_0 = _mm512_dpbf16_ps(result_512_0, (__m512bh) arrayA_512_0, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_0)));
result_512_1 = _mm512_dpbf16_ps(result_512_1, (__m512bh) arrayA_512_0, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_1)));
result_512_2 = _mm512_dpbf16_ps(result_512_2, (__m512bh) arrayA_512_0, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_2)));
result_512_3 = _mm512_dpbf16_ps(result_512_3, (__m512bh) arrayA_512_0, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_3)));
result_512_4 = _mm512_dpbf16_ps(result_512_4, (__m512bh) arrayA_512_0, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_4)));
result_512_5 = _mm512_dpbf16_ps(result_512_5, (__m512bh) arrayA_512_0, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_5)));
result_512_6 = _mm512_dpbf16_ps(result_512_6, (__m512bh) arrayA_512_0, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_6)));
result_512_7 = _mm512_dpbf16_ps(result_512_7, (__m512bh) arrayA_512_0, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_7)));
result_512_8 = _mm512_dpbf16_ps(result_512_8, (__m512bh) arrayA_512_1, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_0)));
result_512_9 = _mm512_dpbf16_ps(result_512_9, (__m512bh) arrayA_512_1, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_1)));
result_512_10 = _mm512_dpbf16_ps(result_512_10, (__m512bh) arrayA_512_1, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_2)));
result_512_11 = _mm512_dpbf16_ps(result_512_11, (__m512bh) arrayA_512_1, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_3)));
result_512_12 = _mm512_dpbf16_ps(result_512_12, (__m512bh) arrayA_512_1, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_4)));
result_512_13 = _mm512_dpbf16_ps(result_512_13, (__m512bh) arrayA_512_1, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_5)));
result_512_14 = _mm512_dpbf16_ps(result_512_14, (__m512bh) arrayA_512_1, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_6)));
result_512_15 = _mm512_dpbf16_ps(result_512_15, (__m512bh) arrayA_512_1, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_7)));
// Rotate the 32-bit words inside every 128-bit lane so the next bf16 pair becomes the lowest word.
arrayB_512_0 = _mm512_shuffle_epi32(arrayB_512_0, SHUFFLE_MAGIC_NO);
arrayB_512_1 = _mm512_shuffle_epi32(arrayB_512_1, SHUFFLE_MAGIC_NO);
arrayB_512_2 = _mm512_shuffle_epi32(arrayB_512_2, SHUFFLE_MAGIC_NO);
arrayB_512_3 = _mm512_shuffle_epi32(arrayB_512_3, SHUFFLE_MAGIC_NO);
arrayB_512_4 = _mm512_shuffle_epi32(arrayB_512_4, SHUFFLE_MAGIC_NO);
arrayB_512_5 = _mm512_shuffle_epi32(arrayB_512_5, SHUFFLE_MAGIC_NO);
arrayB_512_6 = _mm512_shuffle_epi32(arrayB_512_6, SHUFFLE_MAGIC_NO);
arrayB_512_7 = _mm512_shuffle_epi32(arrayB_512_7, SHUFFLE_MAGIC_NO);
idx += 2;
// Every 4 loops we need to switch to next 128 bits of arrayB registers
// ((idx & ~7) == idx holds exactly when idx is a multiple of 8.)
if ((idx & (~7)) == idx) {
arrayB_512_0 = _mm512_shuffle_i32x4(arrayB_512_0, arrayB_512_0, SHUFFLE_MAGIC_NO);
arrayB_512_1 = _mm512_shuffle_i32x4(arrayB_512_1, arrayB_512_1, SHUFFLE_MAGIC_NO);
arrayB_512_2 = _mm512_shuffle_i32x4(arrayB_512_2, arrayB_512_2, SHUFFLE_MAGIC_NO);
arrayB_512_3 = _mm512_shuffle_i32x4(arrayB_512_3, arrayB_512_3, SHUFFLE_MAGIC_NO);
arrayB_512_4 = _mm512_shuffle_i32x4(arrayB_512_4, arrayB_512_4, SHUFFLE_MAGIC_NO);
arrayB_512_5 = _mm512_shuffle_i32x4(arrayB_512_5, arrayB_512_5, SHUFFLE_MAGIC_NO);
arrayB_512_6 = _mm512_shuffle_i32x4(arrayB_512_6, arrayB_512_6, SHUFFLE_MAGIC_NO);
arrayB_512_7 = _mm512_shuffle_i32x4(arrayB_512_7, arrayB_512_7, SHUFFLE_MAGIC_NO);
}
}
}
// Write-back. For every column j the accumulator pair (result_512_j, result_512_{j+8}) is
// interleaved into two 16-float vectors that go to C[ldc*j] and C[ldc*j+16].
if (m != 32) {
// Partial tile: the first 16 floats of each column are stored unmasked, the second 16 are masked.
// For 16 < m < 32 the mask keeps the low (m-16) lanes.
unsigned short tail_mask_value = (((unsigned short)0xffff) >> (32-m));
__mmask16 tail_mask = *((__mmask16*) &tail_mask_value);
result_512_tmp_0 = _mm512_permutex2var_ps(result_512_0, shuffle_idx_base0, result_512_8);
result_512_tmp_1 = _mm512_permutex2var_ps(result_512_0, shuffle_idx_base1, result_512_8);
result_512_tmp_2 = _mm512_permutex2var_ps(result_512_1, shuffle_idx_base0, result_512_9);
result_512_tmp_3 = _mm512_permutex2var_ps(result_512_1, shuffle_idx_base1, result_512_9);
STORE16_COMPLETE_RESULT(result_512_tmp_0, (&C[ldc*0]))
STORE16_MASK_COMPLETE_RESULT(result_512_tmp_1, (&C[ldc*0+16]), tail_mask)
STORE16_COMPLETE_RESULT(result_512_tmp_2, (&C[ldc*1]))
STORE16_MASK_COMPLETE_RESULT(result_512_tmp_3, (&C[ldc*1+16]), tail_mask)
result_512_tmp_0 = _mm512_permutex2var_ps(result_512_2, shuffle_idx_base0, result_512_10);
result_512_tmp_1 = _mm512_permutex2var_ps(result_512_2, shuffle_idx_base1, result_512_10);
result_512_tmp_2 = _mm512_permutex2var_ps(result_512_3, shuffle_idx_base0, result_512_11);
result_512_tmp_3 = _mm512_permutex2var_ps(result_512_3, shuffle_idx_base1, result_512_11);
STORE16_COMPLETE_RESULT(result_512_tmp_0, (&C[ldc*2]))
STORE16_MASK_COMPLETE_RESULT(result_512_tmp_1, (&C[ldc*2+16]), tail_mask)
STORE16_COMPLETE_RESULT(result_512_tmp_2, (&C[ldc*3]))
STORE16_MASK_COMPLETE_RESULT(result_512_tmp_3, (&C[ldc*3+16]), tail_mask)
result_512_tmp_0 = _mm512_permutex2var_ps(result_512_4, shuffle_idx_base0, result_512_12);
result_512_tmp_1 = _mm512_permutex2var_ps(result_512_4, shuffle_idx_base1, result_512_12);
result_512_tmp_2 = _mm512_permutex2var_ps(result_512_5, shuffle_idx_base0, result_512_13);
result_512_tmp_3 = _mm512_permutex2var_ps(result_512_5, shuffle_idx_base1, result_512_13);
STORE16_COMPLETE_RESULT(result_512_tmp_0, (&C[ldc*4]))
STORE16_MASK_COMPLETE_RESULT(result_512_tmp_1, (&C[ldc*4+16]), tail_mask)
STORE16_COMPLETE_RESULT(result_512_tmp_2, (&C[ldc*5]))
STORE16_MASK_COMPLETE_RESULT(result_512_tmp_3, (&C[ldc*5+16]), tail_mask)
result_512_tmp_0 = _mm512_permutex2var_ps(result_512_6, shuffle_idx_base0, result_512_14);
result_512_tmp_1 = _mm512_permutex2var_ps(result_512_6, shuffle_idx_base1, result_512_14);
result_512_tmp_2 = _mm512_permutex2var_ps(result_512_7, shuffle_idx_base0, result_512_15);
result_512_tmp_3 = _mm512_permutex2var_ps(result_512_7, shuffle_idx_base1, result_512_15);
STORE16_COMPLETE_RESULT(result_512_tmp_0, (&C[ldc*6]))
STORE16_MASK_COMPLETE_RESULT(result_512_tmp_1, (&C[ldc*6+16]), tail_mask)
STORE16_COMPLETE_RESULT(result_512_tmp_2, (&C[ldc*7]))
STORE16_MASK_COMPLETE_RESULT(result_512_tmp_3, (&C[ldc*7+16]), tail_mask)
} else {
// Full 32-row tile: both halves of every column are stored unmasked.
result_512_tmp_0 = _mm512_permutex2var_ps(result_512_0, shuffle_idx_base0, result_512_8);
result_512_tmp_1 = _mm512_permutex2var_ps(result_512_0, shuffle_idx_base1, result_512_8);
result_512_tmp_2 = _mm512_permutex2var_ps(result_512_1, shuffle_idx_base0, result_512_9);
result_512_tmp_3 = _mm512_permutex2var_ps(result_512_1, shuffle_idx_base1, result_512_9);
STORE16_COMPLETE_RESULT(result_512_tmp_0, (&C[ldc*0]))
STORE16_COMPLETE_RESULT(result_512_tmp_1, (&C[ldc*0+16]))
STORE16_COMPLETE_RESULT(result_512_tmp_2, (&C[ldc*1]))
STORE16_COMPLETE_RESULT(result_512_tmp_3, (&C[ldc*1+16]))
result_512_tmp_0 = _mm512_permutex2var_ps(result_512_2, shuffle_idx_base0, result_512_10);
result_512_tmp_1 = _mm512_permutex2var_ps(result_512_2, shuffle_idx_base1, result_512_10);
result_512_tmp_2 = _mm512_permutex2var_ps(result_512_3, shuffle_idx_base0, result_512_11);
result_512_tmp_3 = _mm512_permutex2var_ps(result_512_3, shuffle_idx_base1, result_512_11);
STORE16_COMPLETE_RESULT(result_512_tmp_0, (&C[ldc*2]))
STORE16_COMPLETE_RESULT(result_512_tmp_1, (&C[ldc*2+16]))
STORE16_COMPLETE_RESULT(result_512_tmp_2, (&C[ldc*3]))
STORE16_COMPLETE_RESULT(result_512_tmp_3, (&C[ldc*3+16]))
result_512_tmp_0 = _mm512_permutex2var_ps(result_512_4, shuffle_idx_base0, result_512_12);
result_512_tmp_1 = _mm512_permutex2var_ps(result_512_4, shuffle_idx_base1, result_512_12);
result_512_tmp_2 = _mm512_permutex2var_ps(result_512_5, shuffle_idx_base0, result_512_13);
result_512_tmp_3 = _mm512_permutex2var_ps(result_512_5, shuffle_idx_base1, result_512_13);
STORE16_COMPLETE_RESULT(result_512_tmp_0, (&C[ldc*4]))
STORE16_COMPLETE_RESULT(result_512_tmp_1, (&C[ldc*4+16]))
STORE16_COMPLETE_RESULT(result_512_tmp_2, (&C[ldc*5]))
STORE16_COMPLETE_RESULT(result_512_tmp_3, (&C[ldc*5+16]))
result_512_tmp_0 = _mm512_permutex2var_ps(result_512_6, shuffle_idx_base0, result_512_14);
result_512_tmp_1 = _mm512_permutex2var_ps(result_512_6, shuffle_idx_base1, result_512_14);
result_512_tmp_2 = _mm512_permutex2var_ps(result_512_7, shuffle_idx_base0, result_512_15);
result_512_tmp_3 = _mm512_permutex2var_ps(result_512_7, shuffle_idx_base1, result_512_15);
STORE16_COMPLETE_RESULT(result_512_tmp_0, (&C[ldc*6]))
STORE16_COMPLETE_RESULT(result_512_tmp_1, (&C[ldc*6+16]))
STORE16_COMPLETE_RESULT(result_512_tmp_2, (&C[ldc*7]))
STORE16_COMPLETE_RESULT(result_512_tmp_3, (&C[ldc*7+16]))
}
}
// SBGEMM Kernel for M<=16, N=8, K can be any number, but the processing will take 32 as a base
//
// Parameters:
//   m     - number of valid rows in this tile; when m != 16 every column store is masked with
//           (0xffff >> (16-m)), i.e. only the low m lanes are written
//   k     - depth handled by this call, consumed in chunks of up to 32
//   alpha - scaling factor, broadcast into ALPHAVECTOR when ONE_ALPHA is not defined
//   A     - packed A panel; 32 bf16 values (one 512-bit load) are read per pair of k indices
//   B     - packed B panel; 8 vectors of 32 bf16 values are read per 32-deep k chunk
//   C     - output tile; column j is addressed as C[ldc*j]
//   ldc   - leading dimension of C
//
// NOTE(review): STORE16_COMPLETE_RESULT / STORE16_MASK_COMPLETE_RESULT are defined outside this
// view. ALPHAVECTOR is not referenced directly in this body, so presumably those macros consume it
// and combine the scaled result with the existing contents of C - confirm against their definition.
#ifndef ONE_ALPHA // ALPHA is not ONE
void sbgemm_block_kernel_16x8x32_alpha(BLASLONG m, BLASLONG k, float alpha, bfloat16 *A, bfloat16 *B, float *C, int ldc)
#else // ALPHA is ONE
void sbgemm_block_kernel_16x8x32_one(BLASLONG m, BLASLONG k, float alpha, bfloat16 *A, bfloat16 *B, float *C, int ldc)
#endif
{
// 0x39 == 0b00111001: as a shuffle immediate it selects source elements 1,2,3,0, i.e. it rotates
// the four elements (32-bit words or 128-bit lanes, depending on the intrinsic) down by one.
int SHUFFLE_MAGIC_NO = 0x39;
// Largest multiple of 32 that is <= k; the k chunk starting here (if any) is the partial one.
BLASLONG tag_k_32x = k & (~31);
BLASLONG idxB_base = 0;
// Number of k indices processed in the current chunk (32, or the remainder for the last chunk).
BLASLONG width = 32;
#ifndef ONE_ALPHA
__m512 ALPHAVECTOR = _mm512_set1_ps(alpha);
#endif
__m512i arrayA_512_0;
__m512i arrayB_512_0, arrayB_512_1, arrayB_512_2, arrayB_512_3, arrayB_512_4, arrayB_512_5, arrayB_512_6, arrayB_512_7;
// One accumulator per B vector / output column.
__m512 result_512_0, result_512_1, result_512_2, result_512_3, result_512_4, result_512_5, result_512_6, result_512_7;
result_512_0 = _mm512_setzero_ps();
result_512_1 = _mm512_setzero_ps();
result_512_2 = _mm512_setzero_ps();
result_512_3 = _mm512_setzero_ps();
result_512_4 = _mm512_setzero_ps();
result_512_5 = _mm512_setzero_ps();
result_512_6 = _mm512_setzero_ps();
result_512_7 = _mm512_setzero_ps();
for (BLASLONG idx_k = 0; idx_k < k; idx_k += 32) {
// Load B with unroll 8
// Eight full 512-bit vectors are loaded even for a partial last chunk.
// NOTE(review): this presumably relies on the packed B buffer being padded to a multiple of 32 in k - confirm.
idxB_base = idx_k << 3;
arrayB_512_0 = _mm512_loadu_si512(&B[idxB_base + 32*0]);
arrayB_512_1 = _mm512_loadu_si512(&B[idxB_base + 32*1]);
arrayB_512_2 = _mm512_loadu_si512(&B[idxB_base + 32*2]);
arrayB_512_3 = _mm512_loadu_si512(&B[idxB_base + 32*3]);
arrayB_512_4 = _mm512_loadu_si512(&B[idxB_base + 32*4]);
arrayB_512_5 = _mm512_loadu_si512(&B[idxB_base + 32*5]);
arrayB_512_6 = _mm512_loadu_si512(&B[idxB_base + 32*6]);
arrayB_512_7 = _mm512_loadu_si512(&B[idxB_base + 32*7]);
// Last (partial) chunk: only the remaining k indices are processed.
if (idx_k == tag_k_32x) {width = k - tag_k_32x;}
for (BLASLONG idx = 0; idx < width;) {
// Each two rows are a group for 32-pair bf16 elements
// Load two rows into a 512 register
arrayA_512_0 = _mm512_loadu_si512(&A[idx<<4]);
// The lowest 32 bits of each B vector (one bf16 pair) are broadcast to all lanes and
// fed to the bf16 dot-product together with the A vector.
result_512_0 = _mm512_dpbf16_ps(result_512_0, (__m512bh) arrayA_512_0, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_0)));
result_512_1 = _mm512_dpbf16_ps(result_512_1, (__m512bh) arrayA_512_0, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_1)));
result_512_2 = _mm512_dpbf16_ps(result_512_2, (__m512bh) arrayA_512_0, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_2)));
result_512_3 = _mm512_dpbf16_ps(result_512_3, (__m512bh) arrayA_512_0, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_3)));
result_512_4 = _mm512_dpbf16_ps(result_512_4, (__m512bh) arrayA_512_0, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_4)));
result_512_5 = _mm512_dpbf16_ps(result_512_5, (__m512bh) arrayA_512_0, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_5)));
result_512_6 = _mm512_dpbf16_ps(result_512_6, (__m512bh) arrayA_512_0, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_6)));
result_512_7 = _mm512_dpbf16_ps(result_512_7, (__m512bh) arrayA_512_0, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_7)));
// Rotate the 32-bit words inside every 128-bit lane so the next bf16 pair becomes the lowest word.
arrayB_512_0 = _mm512_shuffle_epi32(arrayB_512_0, SHUFFLE_MAGIC_NO);
arrayB_512_1 = _mm512_shuffle_epi32(arrayB_512_1, SHUFFLE_MAGIC_NO);
arrayB_512_2 = _mm512_shuffle_epi32(arrayB_512_2, SHUFFLE_MAGIC_NO);
arrayB_512_3 = _mm512_shuffle_epi32(arrayB_512_3, SHUFFLE_MAGIC_NO);
arrayB_512_4 = _mm512_shuffle_epi32(arrayB_512_4, SHUFFLE_MAGIC_NO);
arrayB_512_5 = _mm512_shuffle_epi32(arrayB_512_5, SHUFFLE_MAGIC_NO);
arrayB_512_6 = _mm512_shuffle_epi32(arrayB_512_6, SHUFFLE_MAGIC_NO);
arrayB_512_7 = _mm512_shuffle_epi32(arrayB_512_7, SHUFFLE_MAGIC_NO);
idx += 2;
// Every 4 loops we need to switch to next 128 bits of arrayB registers
// ((idx & ~7) == idx holds exactly when idx is a multiple of 8.)
if ((idx & (~7)) == idx) {
arrayB_512_0 = _mm512_shuffle_i32x4(arrayB_512_0, arrayB_512_0, SHUFFLE_MAGIC_NO);
arrayB_512_1 = _mm512_shuffle_i32x4(arrayB_512_1, arrayB_512_1, SHUFFLE_MAGIC_NO);
arrayB_512_2 = _mm512_shuffle_i32x4(arrayB_512_2, arrayB_512_2, SHUFFLE_MAGIC_NO);
arrayB_512_3 = _mm512_shuffle_i32x4(arrayB_512_3, arrayB_512_3, SHUFFLE_MAGIC_NO);
arrayB_512_4 = _mm512_shuffle_i32x4(arrayB_512_4, arrayB_512_4, SHUFFLE_MAGIC_NO);
arrayB_512_5 = _mm512_shuffle_i32x4(arrayB_512_5, arrayB_512_5, SHUFFLE_MAGIC_NO);
arrayB_512_6 = _mm512_shuffle_i32x4(arrayB_512_6, arrayB_512_6, SHUFFLE_MAGIC_NO);
arrayB_512_7 = _mm512_shuffle_i32x4(arrayB_512_7, arrayB_512_7, SHUFFLE_MAGIC_NO);
}
}
}
// Write-back. Immediate 0xd8 (0b11011000) selects 128-bit lanes 0,2,1,3, i.e. it swaps the two
// middle lanes of each accumulator before the store.
if (m != 16) {
// Partial tile: only the low m lanes of every column are stored.
unsigned short tail_mask_value = (((unsigned short)0xffff) >> (16-m));
__mmask16 tail_mask = *((__mmask16*) &tail_mask_value);
result_512_0 = _mm512_shuffle_f32x4(result_512_0, result_512_0, 0xd8);
result_512_1 = _mm512_shuffle_f32x4(result_512_1, result_512_1, 0xd8);
result_512_2 = _mm512_shuffle_f32x4(result_512_2, result_512_2, 0xd8);
result_512_3 = _mm512_shuffle_f32x4(result_512_3, result_512_3, 0xd8);
STORE16_MASK_COMPLETE_RESULT(result_512_0, (&C[ldc*0]), tail_mask)
STORE16_MASK_COMPLETE_RESULT(result_512_1, (&C[ldc*1]), tail_mask)
STORE16_MASK_COMPLETE_RESULT(result_512_2, (&C[ldc*2]), tail_mask)
STORE16_MASK_COMPLETE_RESULT(result_512_3, (&C[ldc*3]), tail_mask)
result_512_4 = _mm512_shuffle_f32x4(result_512_4, result_512_4, 0xd8);
result_512_5 = _mm512_shuffle_f32x4(result_512_5, result_512_5, 0xd8);
result_512_6 = _mm512_shuffle_f32x4(result_512_6, result_512_6, 0xd8);
result_512_7 = _mm512_shuffle_f32x4(result_512_7, result_512_7, 0xd8);
STORE16_MASK_COMPLETE_RESULT(result_512_4, (&C[ldc*4]), tail_mask)
STORE16_MASK_COMPLETE_RESULT(result_512_5, (&C[ldc*5]), tail_mask)
STORE16_MASK_COMPLETE_RESULT(result_512_6, (&C[ldc*6]), tail_mask)
STORE16_MASK_COMPLETE_RESULT(result_512_7, (&C[ldc*7]), tail_mask)
} else {
// Full 16-row tile: unmasked stores.
result_512_0 = _mm512_shuffle_f32x4(result_512_0, result_512_0, 0xd8);
result_512_1 = _mm512_shuffle_f32x4(result_512_1, result_512_1, 0xd8);
result_512_2 = _mm512_shuffle_f32x4(result_512_2, result_512_2, 0xd8);
result_512_3 = _mm512_shuffle_f32x4(result_512_3, result_512_3, 0xd8);
STORE16_COMPLETE_RESULT(result_512_0, (&C[ldc*0]))
STORE16_COMPLETE_RESULT(result_512_1, (&C[ldc*1]))
STORE16_COMPLETE_RESULT(result_512_2, (&C[ldc*2]))
STORE16_COMPLETE_RESULT(result_512_3, (&C[ldc*3]))
result_512_4 = _mm512_shuffle_f32x4(result_512_4, result_512_4, 0xd8);
result_512_5 = _mm512_shuffle_f32x4(result_512_5, result_512_5, 0xd8);
result_512_6 = _mm512_shuffle_f32x4(result_512_6, result_512_6, 0xd8);
result_512_7 = _mm512_shuffle_f32x4(result_512_7, result_512_7, 0xd8);
STORE16_COMPLETE_RESULT(result_512_4, (&C[ldc*4]))
STORE16_COMPLETE_RESULT(result_512_5, (&C[ldc*5]))
STORE16_COMPLETE_RESULT(result_512_6, (&C[ldc*6]))
STORE16_COMPLETE_RESULT(result_512_7, (&C[ldc*7]))
}
}
// SBGEMM micro-kernel for 16 < M <= 32 with a partial column count (N < 8).
// K may be any value; it is consumed in chunks of up to 32.
//
//   m     - number of valid rows; when m != 32 the second 16-float store of each column is masked
//   n     - number of B vectors / C columns handled by this call
//   k     - depth handled by this call
//   alpha - scaling factor, broadcast into ALPHAVECTOR when ONE_ALPHA is not defined
//   A, B  - packed input panels
//   C/ldc - output tile and its leading dimension; column j is addressed as C[ldc*j]
//
// NOTE(review): the STORE16_* macros are defined elsewhere; ALPHAVECTOR is kept under its
// original name because those macros presumably reference it - confirm.
#ifndef ONE_ALPHA      // ALPHA is not ONE
void sbgemm_block_kernel_32xNx32_alpha(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, bfloat16 *A, bfloat16 *B, float *C, int ldc)
#else                  // ALPHA is ONE
void sbgemm_block_kernel_32xNx32_one(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, bfloat16 *A, bfloat16 *B, float *C, int ldc)
#endif
{
    // Shuffle immediate 0b00111001: rotate four elements down by one position.
    int SHUFFLE_MAGIC_NO = 0x39;
    BLASLONG k_aligned = k & (~31);     // start of the partial k chunk, if there is one
    BLASLONG b_offset  = 0;             // running read position inside B
    BLASLONG depth     = 32;            // k indices consumed in the current chunk

#ifndef ONE_ALPHA
    __m512 ALPHAVECTOR = _mm512_set1_ps(alpha);
#endif

    __m512i a_vec[2];
    __m512i b_vec[8];
    __m512  acc[16];                    // acc[j] pairs with a_vec[0], acc[j+8] with a_vec[1]
    __m512  out_lo, out_hi;

    // Indices >= 16 select from the second source of _mm512_permutex2var_ps.
    __m512i perm_lo = _mm512_set_epi32(23, 22, 21, 20, 7, 6, 5, 4, 19, 18, 17, 16, 3, 2, 1, 0);
    __m512i perm_hi = _mm512_add_epi32(perm_lo, _mm512_set1_epi32(8));

    for (int j = 0; j < 16; j++) {
        acc[j] = _mm512_setzero_ps();
    }

    for (BLASLONG k_base = 0; k_base < k; k_base += 32) {
        // Fetch one 32-element bf16 vector of B per column.
        for (int j = 0; j < n; j++, b_offset += 32) {
            b_vec[j] = _mm512_loadu_si512(&B[b_offset]);
        }

        if (k_base == k_aligned) {
            depth = k - k_aligned;
        }

        BLASLONG kk = 0;
        while (kk < depth) {
            // Two k indices are consumed per step; A supplies 64 bf16 values for them.
            BLASLONG a_offset = kk << 5;
            a_vec[0] = _mm512_loadu_si512(&A[a_offset]);
            a_vec[1] = _mm512_loadu_si512(&A[a_offset + 32]);

            for (int j = 0; j < n; j++) {
                // Broadcast the lowest bf16 pair of this column to every lane.
                __m512bh b_pair = (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(b_vec[j]));
                acc[j]     = _mm512_dpbf16_ps(acc[j],     (__m512bh) a_vec[0], b_pair);
                acc[j + 8] = _mm512_dpbf16_ps(acc[j + 8], (__m512bh) a_vec[1], b_pair);
                // Bring the next bf16 pair of each 128-bit lane into the lowest word.
                b_vec[j] = _mm512_shuffle_epi32(b_vec[j], SHUFFLE_MAGIC_NO);
            }

            kk += 2;

            // After every fourth step (kk multiple of 8) move on to the next 128-bit lane of B.
            if ((kk & 7) == 0) {
                for (int j = 0; j < n; j++) {
                    b_vec[j] = _mm512_shuffle_i32x4(b_vec[j], b_vec[j], SHUFFLE_MAGIC_NO);
                }
            }
        }
    }

    if (m == 32) {
        // Full tile: both halves of every column are stored unmasked.
        for (int j = 0; j < n; j++) {
            out_lo = _mm512_permutex2var_ps(acc[j], perm_lo, acc[j + 8]);
            out_hi = _mm512_permutex2var_ps(acc[j], perm_hi, acc[j + 8]);
            STORE16_COMPLETE_RESULT(out_lo, (&C[ldc*j]))
            STORE16_COMPLETE_RESULT(out_hi, (&C[ldc*j+16]))
        }
    } else {
        // Partial tile: the second half of every column is stored through a lane mask.
        __mmask16 row_mask = (__mmask16) (((unsigned short)0xffff) >> (32-m));
        for (int j = 0; j < n; j++) {
            out_lo = _mm512_permutex2var_ps(acc[j], perm_lo, acc[j + 8]);
            out_hi = _mm512_permutex2var_ps(acc[j], perm_hi, acc[j + 8]);
            STORE16_COMPLETE_RESULT(out_lo, (&C[ldc*j]))
            STORE16_MASK_COMPLETE_RESULT(out_hi, (&C[ldc*j+16]), row_mask)
        }
    }
}
// SBGEMM micro-kernel for M <= 16 with a partial column count (N < 8).
// K may be any value; it is consumed in chunks of up to 32.
//
//   m     - number of valid rows; when m != 16 every column store is masked to the low m lanes
//   n     - number of B vectors / C columns handled by this call
//   k     - depth handled by this call
//   alpha - scaling factor, broadcast into ALPHAVECTOR when ONE_ALPHA is not defined
//   A, B  - packed input panels
//   C/ldc - output tile and its leading dimension; column j is addressed as C[ldc*j]
//
// NOTE(review): the STORE16_* macros are defined elsewhere; ALPHAVECTOR is kept under its
// original name because those macros presumably reference it - confirm.
#ifndef ONE_ALPHA      // ALPHA is not ONE
void sbgemm_block_kernel_16xNx32_alpha(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, bfloat16 *A, bfloat16 *B, float *C, int ldc)
#else                  // ALPHA is ONE
void sbgemm_block_kernel_16xNx32_one(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, bfloat16 *A, bfloat16 *B, float *C, int ldc)
#endif
{
    // Shuffle immediate 0b00111001: rotate four elements down by one position.
    int SHUFFLE_MAGIC_NO = 0x39;
    BLASLONG k_aligned = k & (~31);     // start of the partial k chunk, if there is one
    BLASLONG b_offset  = 0;             // running read position inside B
    BLASLONG depth     = 32;            // k indices consumed in the current chunk

#ifndef ONE_ALPHA
    __m512 ALPHAVECTOR = _mm512_set1_ps(alpha);
#endif

    __m512i a_vec;
    __m512i b_vec[8];
    __m512  acc[8];

    for (int j = 0; j < 8; j++) {
        acc[j] = _mm512_setzero_ps();
    }

    for (BLASLONG k_base = 0; k_base < k; k_base += 32) {
        // Fetch one 32-element bf16 vector of B per column.
        for (int j = 0; j < n; j++, b_offset += 32) {
            b_vec[j] = _mm512_loadu_si512(&B[b_offset]);
        }

        if (k_base == k_aligned) {
            depth = k - k_aligned;
        }

        BLASLONG kk = 0;
        while (kk < depth) {
            // Two k indices are consumed per step; A supplies 32 bf16 values for them.
            a_vec = _mm512_loadu_si512(&A[kk << 4]);

            for (int j = 0; j < n; j++) {
                // Broadcast the lowest bf16 pair of this column to every lane.
                __m512bh b_pair = (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(b_vec[j]));
                acc[j] = _mm512_dpbf16_ps(acc[j], (__m512bh) a_vec, b_pair);
                // Bring the next bf16 pair of each 128-bit lane into the lowest word.
                b_vec[j] = _mm512_shuffle_epi32(b_vec[j], SHUFFLE_MAGIC_NO);
            }

            kk += 2;

            // After every fourth step (kk multiple of 8) move on to the next 128-bit lane of B.
            if ((kk & 7) == 0) {
                for (int j = 0; j < n; j++) {
                    b_vec[j] = _mm512_shuffle_i32x4(b_vec[j], b_vec[j], SHUFFLE_MAGIC_NO);
                }
            }
        }
    }

    // Immediate 0xd8 selects 128-bit lanes 0,2,1,3: the two middle lanes are swapped before storing.
    if (m == 16) {
        for (int j = 0; j < n; j++) {
            acc[j] = _mm512_shuffle_f32x4(acc[j], acc[j], 0xd8);
            STORE16_COMPLETE_RESULT(acc[j], (&C[ldc*j]))
        }
    } else {
        // Partial tile: only the low m lanes of every column are written.
        __mmask16 row_mask = (__mmask16) (((unsigned short)0xffff) >> (16-m));
        for (int j = 0; j < n; j++) {
            acc[j] = _mm512_shuffle_f32x4(acc[j], acc[j], 0xd8);
            STORE16_MASK_COMPLETE_RESULT(acc[j], (&C[ldc*j]), row_mask)
        }
    }
}
// Blocked SBGEMM driver (column-major, no transposition): walks C in N blocks of up to
// BF16_BLOCK_THRES_N columns, K blocks of up to BF16_BLOCK_THRES_K, and M tiles of
// BF16_BLOCK_THRES_M rows, packing A into block_A and B into block_B and dispatching to the
// SBGEMM_BLOCK_KERNEL_* micro-kernels.
//
// Parameters:
//   M, N, K          - problem dimensions
//   alpha            - scaling factor forwarded to the micro-kernels
//   A, lda / B, ldb  - input matrices and leading dimensions (accessed through the A()/B() macros)
//   C, ldc           - output matrix and leading dimension (accessed through the C() macro)
//   block_A, block_B - caller-provided scratch buffers for the packed panels
//
// The same C tile is stored once per K block, so the micro-kernels must accumulate into C.
//
// Fix relative to the previous revision: k_step / k_step_round32 are now re-initialised at the
// start of every N block. Previously they were set only once before the loops and were left at 0
// when the K loop finished, so each N block after the first began with a zero-depth pass
// (pack + kernel calls with k == 0) before the step was recomputed.
#ifndef ONE_ALPHA      // ALPHA is not ONE
void sbgemm_blocking_kernel_2_alpha(blasint M, blasint N, blasint K, float alpha, bfloat16 *A, blasint lda, bfloat16 *B, blasint ldb, float *C, blasint ldc, bfloat16 * block_A, bfloat16 * block_B)
#else                  // ALPHA is ONE
void sbgemm_blocking_kernel_2_one(blasint M, blasint N, blasint K, float alpha, bfloat16 *A, blasint lda, bfloat16 *B, blasint ldb, float *C, blasint ldc, bfloat16 * block_A, bfloat16 * block_B)
#endif
{
    BLASLONG m_step, n_step, k_step, k_step_round32;
    // Largest multiple of BF16_BLOCK_THRES_M that is <= M; rows beyond it form the M tail.
    BLASLONG tag_m_Nx = M & (~(BF16_BLOCK_THRES_M-1));

    BLASLONG n_from, n_to;
    BLASLONG tag_n_Nx;

    n_from = 0;
    n_to = (BF16_BLOCK_THRES_N > N) ? N : BF16_BLOCK_THRES_N;
    // Columns [n_from, tag_n_Nx) are handled in full BF16_BLOCK_STEP_N steps, [tag_n_Nx, n_to) is the N tail.
    tag_n_Nx = n_to & (~(BF16_BLOCK_STEP_N-1));

    if (M >= BF16_BLOCK_THRES_M) {
        while (n_from < N) {
            // Start every N block with a fresh first K step (rounded up to a multiple of 32
            // for the stride between packed B panels).
            k_step = (K > BF16_BLOCK_THRES_K) ? BF16_BLOCK_THRES_K : K;
            k_step_round32 = k_step & (~31);
            k_step_round32 = (k_step > k_step_round32) ? (k_step_round32 + 32) : k_step_round32;

            for (BLASLONG idx_k = 0; idx_k < K;) {
                // Use Kx32 kernel when BF16_BLOCK_THRES_M==32, Kx16 kernel when BF16_BLOCK_THRES_M==16, ...
                COL_MAJOR_INCOPY_KERNEL_Kx32(k_step, &A(idx_k, 0), lda, block_A);
                // TODO: MT
                // First M tile: B is packed here and the packed panels are reused by all later M tiles.
                for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) {
                    // Use 8x32 kernel when BF16_BLOCK_THRES_N==8, 4x32 kernel when BF16_BLOCK_THRES_N==4, ...
                    COL_MAJOR_ONCOPY_KERNEL_8x32(k_step, &B(idx_n, idx_k), ldb, block_B + (idx_n-n_from)*k_step_round32);
                    SBGEMM_BLOCK_KERNEL_32x8x32(32, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, 0), ldc);
                }

                if (tag_n_Nx != n_to) {
                    n_step = n_to - tag_n_Nx;
                    COL_MAJOR_ONCOPY_KERNEL_Nx32(n_step, k_step, &B(tag_n_Nx, idx_k), ldb, block_B + (tag_n_Nx-n_from)*k_step_round32);
                    SBGEMM_BLOCK_KERNEL_32xNx32(32, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, 0), ldc);
                }

                // Remaining full M tiles: repack A only.
                for (BLASLONG idx_m = BF16_BLOCK_THRES_M; idx_m < tag_m_Nx; idx_m += BF16_BLOCK_THRES_M) {
                    COL_MAJOR_INCOPY_KERNEL_Kx32(k_step, &A(idx_k, idx_m), lda, block_A);
                    for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) {
                        SBGEMM_BLOCK_KERNEL_32x8x32(32, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, idx_m), ldc);
                    }

                    if (tag_n_Nx != n_to) {
                        n_step = n_to - tag_n_Nx;
                        SBGEMM_BLOCK_KERNEL_32xNx32(32, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, idx_m), ldc);
                    }
                }

                // M tail: pick the packing routine and micro-kernel family by the number of rows left.
                if (tag_m_Nx != M) {
                    m_step = M - tag_m_Nx;
                    if (m_step > 16) {
                        COL_MAJOR_INCOPY_KERNEL_Kx32m(k_step, m_step, &A(idx_k, tag_m_Nx), lda, block_A);
                        for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) {
                            SBGEMM_BLOCK_KERNEL_32x8x32(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, tag_m_Nx), ldc);
                        }

                        if (tag_n_Nx != n_to) {
                            n_step = n_to - tag_n_Nx;
                            SBGEMM_BLOCK_KERNEL_32xNx32(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, tag_m_Nx), ldc);
                        }
                    } else if (m_step == 16) {
                        COL_MAJOR_INCOPY_KERNEL_Kx16(k_step, m_step, &A(idx_k, tag_m_Nx), lda, block_A);
                        for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) {
                            SBGEMM_BLOCK_KERNEL_16x8x32(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, tag_m_Nx), ldc);
                        }

                        if (tag_n_Nx != n_to) {
                            n_step = n_to - tag_n_Nx;
                            SBGEMM_BLOCK_KERNEL_16xNx32(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, tag_m_Nx), ldc);
                        }
                    } else {
                        COL_MAJOR_INCOPY_KERNEL_Kx16m(k_step, m_step, &A(idx_k, tag_m_Nx), lda, block_A);
                        for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) {
                            SBGEMM_BLOCK_KERNEL_16x8x32(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, tag_m_Nx), ldc);
                        }

                        if (tag_n_Nx != n_to) {
                            n_step = n_to - tag_n_Nx;
                            SBGEMM_BLOCK_KERNEL_16xNx32(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, tag_m_Nx), ldc);
                        }
                    }
                }

                // Advance to the next K block and size it (0 once K is exhausted).
                idx_k += k_step;
                k_step = K - idx_k;
                k_step = (k_step > BF16_BLOCK_THRES_K) ? BF16_BLOCK_THRES_K : k_step;
                k_step_round32 = k_step & (~31);
                k_step_round32 = (k_step > k_step_round32) ? (k_step_round32 + 32) : k_step_round32;
            }

            // Advance to the next N block.
            n_from = n_to;
            n_to += BF16_BLOCK_THRES_N;
            n_to = (n_to > N) ? N : n_to;
            tag_n_Nx = n_to & (~(BF16_BLOCK_STEP_N-1));
        }
    } else {
        // M is smaller than one full tile, so the whole of M is a single partial tile.
        // NOTE(review): this path always uses the Kx32m packing and the 32-row kernels, including
        // for m_step <= 16, while those kernels are commented as serving 16 < M <= 32 and store the
        // first 16 rows of each column unmasked. Confirm that callers / buffer padding make this safe.
        m_step = M - tag_m_Nx;
        while (n_from < N) {
            // Start every N block with a fresh first K step (see the fix note in the header).
            k_step = (K > BF16_BLOCK_THRES_K) ? BF16_BLOCK_THRES_K : K;
            k_step_round32 = k_step & (~31);
            k_step_round32 = (k_step > k_step_round32) ? (k_step_round32 + 32) : k_step_round32;

            for (BLASLONG idx_k = 0; idx_k < K;) {
                // Use Kx32 kernel when BF16_BLOCK_THRES_M==32, Kx16 kernel when BF16_BLOCK_THRES_M==16, ...
                COL_MAJOR_INCOPY_KERNEL_Kx32m(k_step, m_step, &A(idx_k, 0), lda, block_A);
                // TODO: MT
                for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) {
                    // Use 8x32 kernel when BF16_BLOCK_THRES_N==8, 4x32 kernel when BF16_BLOCK_THRES_N==4, ...
                    COL_MAJOR_ONCOPY_KERNEL_8x32(k_step, &B(idx_n, idx_k), ldb, block_B + (idx_n-n_from)*k_step_round32);
                    SBGEMM_BLOCK_KERNEL_32x8x32(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, 0), ldc);
                }

                if (tag_n_Nx != n_to) {
                    n_step = n_to - tag_n_Nx;
                    COL_MAJOR_ONCOPY_KERNEL_Nx32(n_step, k_step, &B(tag_n_Nx, idx_k), ldb, block_B + (tag_n_Nx-n_from)*k_step_round32);
                    SBGEMM_BLOCK_KERNEL_32xNx32(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, 0), ldc);
                }

                // Advance to the next K block and size it (0 once K is exhausted).
                idx_k += k_step;
                k_step = K - idx_k;
                k_step = (k_step > BF16_BLOCK_THRES_K) ? BF16_BLOCK_THRES_K : k_step;
                k_step_round32 = k_step & (~31);
                k_step_round32 = (k_step > k_step_round32) ? (k_step_round32 + 32) : k_step_round32;
            }

            // Advance to the next N block.
            n_from = n_to;
            n_to += BF16_BLOCK_THRES_N;
            n_to = (n_to > N) ? N : n_to;
            tag_n_Nx = n_to & (~(BF16_BLOCK_STEP_N-1));
        }
    }
}
// Internal SBGEMM entry point: allocates the packing scratch buffers on the stack and hands the
// problem to the blocking driver. Only column-major order is handled; for any other order the
// function returns without touching C. TransA / TransB are currently not consulted.
#ifndef ONE_ALPHA      // ALPHA is not ONE
void sbgemm_internal_kernel_alpha(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K,
OPENBLAS_CONST float alpha, OPENBLAS_CONST bfloat16 *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST bfloat16 *B, OPENBLAS_CONST blasint ldb, float *C, OPENBLAS_CONST blasint ldc)
#else                  // ALPHA is ONE
void sbgemm_internal_kernel_one(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K,
OPENBLAS_CONST float alpha, OPENBLAS_CONST bfloat16 *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST bfloat16 *B, OPENBLAS_CONST blasint ldb, float *C, OPENBLAS_CONST blasint ldc)
#endif
{
    // Scratch space for one packed A panel and one packed B panel.
    bfloat16 packed_A[BF16_BLOCK_THRES_K * BF16_BLOCK_THRES_M];
    bfloat16 packed_B[BF16_BLOCK_THRES_N * BF16_BLOCK_THRES_K];

    // TODO: assume no trans for both A and B, to complement these scenarios later
    if (Order != CblasColMajor) {
        return;     // row-major is not implemented yet
    }

    SBGEMM_BLOCKING_KERNEL_2(M, N, K, alpha, A, lda, B, ldb, C, ldc, packed_A, packed_B);
}

View File

@ -1,8 +1,11 @@
/* the direct sgemm code written by Arjan van der Ven */
#if defined(SKYLAKEX) || defined (COOPERLAKE)
#include <immintrin.h>
#include "common.h"
#if defined(SKYLAKEX) || defined (COOPERLAKE)
/*
* "Direct sgemm" code. This code operates directly on the inputs and outputs
* of the sgemm call, avoiding the copies, memory realignments and threading,

View File

@ -2,7 +2,7 @@
#if defined(SKYLAKEX)
#include "srot_microk_skylakex-2.c"
#elif defined(HASWELL)
#elif defined(HASWELL) || defined(ZEN)
#include "srot_microk_haswell-2.c"
#endif
@ -13,7 +13,7 @@ static void srot_kernel(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s)
{
BLASLONG i = 0;
#if V_SIMD && (defined(HAVE_FMA3) || V_SIMD > 128)
#if V_SIMD && !defined(C_PGI) && (defined(HAVE_FMA3) || V_SIMD > 128)
const int vstep = v_nlanes_f32;
const int unrollx4 = n & (-vstep * 4);
const int unrollx = n & -vstep;

View File

@ -1,5 +1,4 @@
/* need a new enough GCC for avx512 support */
#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9))
#if defined(HAVE_FMA3) && defined(HAVE_AVX2)
#define HAVE_SROT_KERNEL 1

View File

@ -320,12 +320,13 @@
$ C, SAFMIN, TEMP, TEMP2, TEMPR, ULP
COMPLEX ABI22, AD11, AD12, AD21, AD22, CTEMP, CTEMP2,
$ CTEMP3, ESHIFT, RTDISC, S, SHIFT, SIGNBC, T1,
$ U12, X
$ U12, X, ABI12, Y
* ..
* .. External Functions ..
COMPLEX CLADIV
LOGICAL LSAME
REAL CLANHS, SLAMCH
EXTERNAL LSAME, CLANHS, SLAMCH
EXTERNAL CLADIV, LSAME, CLANHS, SLAMCH
* ..
* .. External Subroutines ..
EXTERNAL CLARTG, CLASET, CROT, CSCAL, XERBLA
@ -729,22 +730,34 @@
AD22 = ( ASCALE*H( ILAST, ILAST ) ) /
$ ( BSCALE*T( ILAST, ILAST ) )
ABI22 = AD22 - U12*AD21
ABI12 = AD12 - U12*AD11
*
T1 = HALF*( AD11+ABI22 )
RTDISC = SQRT( T1**2+AD12*AD21-AD11*AD22 )
TEMP = REAL( T1-ABI22 )*REAL( RTDISC ) +
$ AIMAG( T1-ABI22 )*AIMAG( RTDISC )
IF( TEMP.LE.ZERO ) THEN
SHIFT = T1 + RTDISC
ELSE
SHIFT = T1 - RTDISC
SHIFT = ABI22
CTEMP = SQRT( ABI12 )*SQRT( AD21 )
TEMP = ABS1( CTEMP )
IF( CTEMP.NE.ZERO ) THEN
X = HALF*( AD11-SHIFT )
TEMP2 = ABS1( X )
TEMP = MAX( TEMP, ABS1( X ) )
Y = TEMP*SQRT( ( X / TEMP )**2+( CTEMP / TEMP )**2 )
IF( TEMP2.GT.ZERO ) THEN
IF( REAL( X / TEMP2 )*REAL( Y )+
$ AIMAG( X / TEMP2 )*AIMAG( Y ).LT.ZERO )Y = -Y
END IF
SHIFT = SHIFT - CTEMP*CLADIV( CTEMP, ( X+Y ) )
END IF
ELSE
*
* Exceptional shift. Chosen for no particularly good reason.
*
ESHIFT = ESHIFT + (ASCALE*H(ILAST,ILAST-1))/
$ (BSCALE*T(ILAST-1,ILAST-1))
IF( ( IITER / 20 )*20.EQ.IITER .AND.
$ BSCALE*ABS1(T( ILAST, ILAST )).GT.SAFMIN ) THEN
ESHIFT = ESHIFT + ( ASCALE*H( ILAST,
$ ILAST ) )/( BSCALE*T( ILAST, ILAST ) )
ELSE
ESHIFT = ESHIFT + ( ASCALE*H( ILAST,
$ ILAST-1 ) )/( BSCALE*T( ILAST-1, ILAST-1 ) )
END IF
SHIFT = ESHIFT
END IF
*

View File

@ -320,12 +320,13 @@
$ C, SAFMIN, TEMP, TEMP2, TEMPR, ULP
COMPLEX*16 ABI22, AD11, AD12, AD21, AD22, CTEMP, CTEMP2,
$ CTEMP3, ESHIFT, RTDISC, S, SHIFT, SIGNBC, T1,
$ U12, X
$ U12, X, ABI12, Y
* ..
* .. External Functions ..
COMPLEX*16 ZLADIV
LOGICAL LSAME
DOUBLE PRECISION DLAMCH, ZLANHS
EXTERNAL LSAME, DLAMCH, ZLANHS
EXTERNAL ZLADIV, LSAME, DLAMCH, ZLANHS
* ..
* .. External Subroutines ..
EXTERNAL XERBLA, ZLARTG, ZLASET, ZROT, ZSCAL
@ -730,22 +731,34 @@
AD22 = ( ASCALE*H( ILAST, ILAST ) ) /
$ ( BSCALE*T( ILAST, ILAST ) )
ABI22 = AD22 - U12*AD21
ABI12 = AD12 - U12*AD11
*
T1 = HALF*( AD11+ABI22 )
RTDISC = SQRT( T1**2+AD12*AD21-AD11*AD22 )
TEMP = DBLE( T1-ABI22 )*DBLE( RTDISC ) +
$ DIMAG( T1-ABI22 )*DIMAG( RTDISC )
IF( TEMP.LE.ZERO ) THEN
SHIFT = T1 + RTDISC
ELSE
SHIFT = T1 - RTDISC
SHIFT = ABI22
CTEMP = SQRT( ABI12 )*SQRT( AD21 )
TEMP = ABS1( CTEMP )
IF( CTEMP.NE.ZERO ) THEN
X = HALF*( AD11-SHIFT )
TEMP2 = ABS1( X )
TEMP = MAX( TEMP, ABS1( X ) )
Y = TEMP*SQRT( ( X / TEMP )**2+( CTEMP / TEMP )**2 )
IF( TEMP2.GT.ZERO ) THEN
IF( DBLE( X / TEMP2 )*DBLE( Y )+
$ DIMAG( X / TEMP2 )*DIMAG( Y ).LT.ZERO )Y = -Y
END IF
SHIFT = SHIFT - CTEMP*ZLADIV( CTEMP, ( X+Y ) )
END IF
ELSE
*
* Exceptional shift. Chosen for no particularly good reason.
*
ESHIFT = ESHIFT + (ASCALE*H(ILAST,ILAST-1))/
$ (BSCALE*T(ILAST-1,ILAST-1))
IF( ( IITER / 20 )*20.EQ.IITER .AND.
$ BSCALE*ABS1(T( ILAST, ILAST )).GT.SAFMIN ) THEN
ESHIFT = ESHIFT + ( ASCALE*H( ILAST,
$ ILAST ) )/( BSCALE*T( ILAST, ILAST ) )
ELSE
ESHIFT = ESHIFT + ( ASCALE*H( ILAST,
$ ILAST-1 ) )/( BSCALE*T( ILAST-1, ILAST-1 ) )
END IF
SHIFT = ESHIFT
END IF
*

View File

@ -174,7 +174,20 @@ if(PYTHONINTERP_FOUND)
endif()
if(WIN32)
FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_helper.ps1
"if (Test-Path $args[2]) { Remove-Item -Force $args[2] } \n"
"$ErrorActionPreference = \"Stop\"\n"
"Get-Content $args[1] | & \"$($args[0]).exe\" | Out-File $args[2]\n"
"If ((Get-Content $args[2] | %{$_ -match \"FATAL\"}) -contains $true) {\n"
"echo Error\n"
"exit 1\n"
"} else {\n"
"exit 0\n"
"}\n"
)
set(helper_prefix powershell -ExecutionPolicy Bypass "${CMAKE_CURRENT_BINARY_DIR}/test_helper.ps1")
else()
# $1 exec, $2 input, $3 output_result
FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh
"rm -f $3\n"
@ -187,51 +200,52 @@ FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh
"exit 0\n"
"fi\n"
)
set(helper_prefix sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh")
endif()
add_test(NAME "REAL_LAPACK_linear_equation_routines"
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/stest.in" "${CMAKE_CURRENT_BINARY_DIR}/stest.out"
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/stest.in" "${CMAKE_CURRENT_BINARY_DIR}/stest.out"
)
add_test(NAME "COMPLEX_LAPACK_linear_equation_routines"
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ctest.in" "${CMAKE_CURRENT_BINARY_DIR}/ctest.out"
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ctest.in" "${CMAKE_CURRENT_BINARY_DIR}/ctest.out"
)
add_test(NAME "DOUBLE_PRECISION_LAPACK_linear_equation_routines"
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN//xlintstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dtest.in" "${CMAKE_CURRENT_BINARY_DIR}/dtest.out"
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/LIN//xlintstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dtest.in" "${CMAKE_CURRENT_BINARY_DIR}/dtest.out"
)
add_test(NAME "COMPLEX16_LAPACK_linear_equation_routines"
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN//xlintstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ztest.in" "${CMAKE_CURRENT_BINARY_DIR}/ztest.out"
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/LIN//xlintstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ztest.in" "${CMAKE_CURRENT_BINARY_DIR}/ztest.out"
)
add_test(NAME "SINGLE-DOUBLE_PRECISION_LAPACK_prototype_linear_equation_routines"
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstds" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dstest.in" " ${CMAKE_CURRENT_BINARY_DIR}/dstest.out"
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstds" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dstest.in" " ${CMAKE_CURRENT_BINARY_DIR}/dstest.out"
)
# ======== COMPLEX-COMPLEX16 LIN TESTS ========================
add_test(NAME "Testing_COMPLEX-COMPLEX16_LAPACK_prototype_linear_equation_routines"
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstzc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zctest.in" " ${CMAKE_CURRENT_BINARY_DIR}/zctest.out"
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstzc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zctest.in" " ${CMAKE_CURRENT_BINARY_DIR}/zctest.out"
)
# ======== SINGLE RFP LIN TESTS ========================
add_test(NAME "Testing_REAL_LAPACK_RFP_prototype_linear_equation_routines"
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstrfs" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/stest_rfp.in" "${CMAKE_CURRENT_BINARY_DIR}/stest_rfp.out"
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstrfs" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/stest_rfp.in" "${CMAKE_CURRENT_BINARY_DIR}/stest_rfp.out"
)
# ======== DOUBLE PRECISION RFP LIN TESTS ========================
add_test(NAME "Testing_DOUBLE_PRECISION_LAPACK_RFP_prototype_linear_equation_routines"
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstrfd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dtest_rfp.in" " ${CMAKE_CURRENT_BINARY_DIR}/dtest_rfp.out"
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstrfd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dtest_rfp.in" " ${CMAKE_CURRENT_BINARY_DIR}/dtest_rfp.out"
)
# ======== COMPLEX RFP LIN TESTS ========================
add_test(NAME "Testing_COMPLEX_LAPACK_RFP_prototype_linear_equation_routines"
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstrfc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ctest_rfp.in" " ${CMAKE_CURRENT_BINARY_DIR}/ctest_rfp.out"
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstrfc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ctest_rfp.in" " ${CMAKE_CURRENT_BINARY_DIR}/ctest_rfp.out"
)
# ======== COMPLEX16 RFP LIN TESTS ========================
add_test(NAME "Testing_COMPLEX16_LAPACK_RFP_prototype_linear_equation_routines"
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstrfz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ztest_rfp.in" " ${CMAKE_CURRENT_BINARY_DIR}/ztest_rfp.out"
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstrfz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ztest_rfp.in" " ${CMAKE_CURRENT_BINARY_DIR}/ztest_rfp.out"
)
#
#
@ -239,327 +253,327 @@ add_test(NAME "Testing_COMPLEX16_LAPACK_RFP_prototype_linear_equation_routines"
#
add_test(NAME "SNEP:_Testing_Nonsymmetric_Eigenvalue_Problem_routines"
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/nep.in" " ${CMAKE_CURRENT_BINARY_DIR}/snep.out"
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/nep.in" " ${CMAKE_CURRENT_BINARY_DIR}/snep.out"
)
add_test(NAME "SSEP:_Testing_Symmetric_Eigenvalue_Problem_routines"
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sep.in" " ${CMAKE_CURRENT_BINARY_DIR}/ssep.out"
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sep.in" " ${CMAKE_CURRENT_BINARY_DIR}/ssep.out"
)
add_test(NAME "SSE2:_Testing_Symmetric_Eigenvalue_Problem_routines"
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/se2.in" " ${CMAKE_CURRENT_BINARY_DIR}/sse2.out"
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/se2.in" " ${CMAKE_CURRENT_BINARY_DIR}/sse2.out"
)
add_test(NAME "SSVD:_Testing_Singular_Value_Decomposition_routines"
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/svd.in" " ${CMAKE_CURRENT_BINARY_DIR}/ssvd.out"
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/svd.in" " ${CMAKE_CURRENT_BINARY_DIR}/ssvd.out"
)
add_test(NAME "SSEC:_Testing_REAL_Eigen_Condition_Routines"
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sec.in" " ${CMAKE_CURRENT_BINARY_DIR}/sec.out"
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sec.in" " ${CMAKE_CURRENT_BINARY_DIR}/sec.out"
)
add_test(NAME "SSEV:_Testing_REAL_Nonsymmetric_Eigenvalue_Driver"
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sed.in" " ${CMAKE_CURRENT_BINARY_DIR}/sed.out"
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sed.in" " ${CMAKE_CURRENT_BINARY_DIR}/sed.out"
)
add_test(NAME "SGG:_Testing_REAL_Nonsymmetric_Generalized_Eigenvalue_Problem_routines"
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sgg.in" " ${CMAKE_CURRENT_BINARY_DIR}/sgg.out"
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sgg.in" " ${CMAKE_CURRENT_BINARY_DIR}/sgg.out"
)
add_test(NAME "SGD:_Testing_REAL_Nonsymmetric_Generalized_Eigenvalue_Problem_driver_routines"
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sgd.in" " ${CMAKE_CURRENT_BINARY_DIR}/sgd.out"
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sgd.in" " ${CMAKE_CURRENT_BINARY_DIR}/sgd.out"
)
add_test(NAME "SSB:_Testing_REAL_Symmetric_Eigenvalue_Problem_routines"
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ssb.in" " ${CMAKE_CURRENT_BINARY_DIR}/ssb.out"
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ssb.in" " ${CMAKE_CURRENT_BINARY_DIR}/ssb.out"
)
add_test(NAME "SSG:_Testing_REAL_Symmetric_Generalized_Eigenvalue_Problem_routines"
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ssg.in" " ${CMAKE_CURRENT_BINARY_DIR}/ssg.out"
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ssg.in" " ${CMAKE_CURRENT_BINARY_DIR}/ssg.out"
)
add_test(NAME "SGEBAL:_Testing_the_balancing_of_a_REAL_general_matrix"
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/sbal.out"
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/sbal.out"
)
add_test(NAME "SGEBAK:_Testing_the_back_transformation_of_a_REAL_balanced_matrix"
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/sbak.out"
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/sbak.out"
)
add_test(NAME "SGGBAL:_Testing_the_balancing_of_a_pair_of_REAL_general_matrices"
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sgbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/sgbal.out"
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sgbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/sgbal.out"
)
add_test(NAME "SGGBAK:_Testing_the_back_transformation_of_a_pair_of_REAL_balanced_matrices"
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sgbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/sgbak.out"
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sgbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/sgbak.out"
)
add_test(NAME "SBB:_Testing_banded_Singular_Value_Decomposition_routines"
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sbb.in" " ${CMAKE_CURRENT_BINARY_DIR}/sbb.out"
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sbb.in" " ${CMAKE_CURRENT_BINARY_DIR}/sbb.out"
)
add_test(NAME "SGLM:_Testing_Generalized_Linear_Regression_Model_routines"
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/glm.in" " ${CMAKE_CURRENT_BINARY_DIR}/sglm.out"
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/glm.in" " ${CMAKE_CURRENT_BINARY_DIR}/sglm.out"
)
add_test(NAME "SGQR:_Testing_Generalized_QR_and_RQ_factorization_routines"
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gqr.in" " ${CMAKE_CURRENT_BINARY_DIR}/sgqr.out"
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gqr.in" " ${CMAKE_CURRENT_BINARY_DIR}/sgqr.out"
)
add_test(NAME "SGSV:_Testing_Generalized_Singular_Value_Decomposition_routines"
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gsv.in" "${CMAKE_CURRENT_BINARY_DIR}/sgsv.out"
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gsv.in" "${CMAKE_CURRENT_BINARY_DIR}/sgsv.out"
)
add_test(NAME "SCSD:_Testing_CS_Decomposition_routines"
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csd.in" " ${CMAKE_CURRENT_BINARY_DIR}/scsd.out"
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csd.in" " ${CMAKE_CURRENT_BINARY_DIR}/scsd.out"
)
add_test(NAME "SLSE:_Testing_Constrained_Linear_Least_Squares_routines"
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/lse.in" " ${CMAKE_CURRENT_BINARY_DIR}/slse.out"
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/lse.in" " ${CMAKE_CURRENT_BINARY_DIR}/slse.out"
)
# ======== COMPLEX EIG TESTS ===========================
add_test(NAME "CNEP:_Testing_Nonsymmetric_Eigenvalue_Problem_routines"
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/nep.in" " ${CMAKE_CURRENT_BINARY_DIR}/cnep.out"
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/nep.in" " ${CMAKE_CURRENT_BINARY_DIR}/cnep.out"
)
add_test(NAME "CSEP:_Testing_Symmetric_Eigenvalue_Problem_routines"
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sep.in" " ${CMAKE_CURRENT_BINARY_DIR}/csep.out"
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sep.in" " ${CMAKE_CURRENT_BINARY_DIR}/csep.out"
)
add_test(NAME "CSE2:_Testing_Symmetric_Eigenvalue_Problem_routines"
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/se2.in" " ${CMAKE_CURRENT_BINARY_DIR}/cse2.out"
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/se2.in" " ${CMAKE_CURRENT_BINARY_DIR}/cse2.out"
)
add_test(NAME "CSVD:_Testing_Singular_Value_Decomposition_routines"
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/svd.in" " ${CMAKE_CURRENT_BINARY_DIR}/csvd.out"
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/svd.in" " ${CMAKE_CURRENT_BINARY_DIR}/csvd.out"
)
add_test(NAME "CEC:_Testing_COMPLEX_Eigen_Condition_Routines"
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cec.in" " ${CMAKE_CURRENT_BINARY_DIR}/cec.out"
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cec.in" " ${CMAKE_CURRENT_BINARY_DIR}/cec.out"
)
add_test(NAME "CES:_Testing_COMPLEX_Nonsymmetric_Schur_Form_Driver"
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ced.in" " ${CMAKE_CURRENT_BINARY_DIR}/ced.out"
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ced.in" " ${CMAKE_CURRENT_BINARY_DIR}/ced.out"
)
add_test(NAME "CGG:_Testing_COMPLEX_Nonsymmetric_Generalized_Eigenvalue_Problem_routines"
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cgg.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgg.out"
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cgg.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgg.out"
)
add_test(NAME "CGD:_Testing_COMPLEX_Nonsymmetric_Generalized_Eigenvalue_Problem_driver_routines"
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cgd.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgd.out"
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cgd.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgd.out"
)
add_test(NAME "CHB:_Testing_Hermitian_Eigenvalue_Problem_routines"
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csb.in" " ${CMAKE_CURRENT_BINARY_DIR}/csb.out"
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csb.in" " ${CMAKE_CURRENT_BINARY_DIR}/csb.out"
)
add_test(NAME "CSG:_Testing_Symmetric_Generalized_Eigenvalue_Problem_routines"
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csg.in" " ${CMAKE_CURRENT_BINARY_DIR}/csg.out"
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csg.in" " ${CMAKE_CURRENT_BINARY_DIR}/csg.out"
)
add_test(NAME "CGEBAL:_Testing_the_balancing_of_a_COMPLEX_general_matrix"
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/cbal.out"
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/cbal.out"
)
add_test(NAME "CGEBAK:_Testing_the_back_transformation_of_a_COMPLEX_balanced_matrix"
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/cbak.out"
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/cbak.out"
)
add_test(NAME "CGGBAL:_Testing_the_balancing_of_a_pair_of_COMPLEX_general_matrices"
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cgbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgbal.out"
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cgbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgbal.out"
)
add_test(NAME "CGGBAK:_Testing_the_back_transformation_of_a_pair_of_COMPLEX_balanced_matrices"
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cgbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgbak.out"
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cgbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgbak.out"
)
add_test(NAME "CBB:_Testing_banded_Singular_Value_Decomposition_routines"
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cbb.in" " ${CMAKE_CURRENT_BINARY_DIR}/cbb.out"
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cbb.in" " ${CMAKE_CURRENT_BINARY_DIR}/cbb.out"
)
add_test(NAME "CGLM:_Testing_Generalized_Linear_Regression_Model_routines"
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/glm.in" " ${CMAKE_CURRENT_BINARY_DIR}/cglm.out"
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/glm.in" " ${CMAKE_CURRENT_BINARY_DIR}/cglm.out"
)
add_test(NAME "CGQR:_Testing_Generalized_QR_and_RQ_factorization_routines"
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gqr.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgqr.out"
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gqr.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgqr.out"
)
add_test(NAME "CGSV:_Testing_Generalized_Singular_Value_Decomposition_routines"
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gsv.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgsv.out"
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gsv.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgsv.out"
)
add_test(NAME "CCSD:_Testing_CS_Decomposition_routines"
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csd.in" " ${CMAKE_CURRENT_BINARY_DIR}/ccsd.out"
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csd.in" " ${CMAKE_CURRENT_BINARY_DIR}/ccsd.out"
)
add_test(NAME "CLSE:_Testing_Constrained_Linear_Least_Squares_routines"
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/lse.in" " ${CMAKE_CURRENT_BINARY_DIR}/clse.out"
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/lse.in" " ${CMAKE_CURRENT_BINARY_DIR}/clse.out"
)
# ======== DOUBLE EIG TESTS ===========================
add_test(NAME "DNEP:_Testing_Nonsymmetric_Eigenvalue_Problem_routines"
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/nep.in" " ${CMAKE_CURRENT_BINARY_DIR}/dnep.out"
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/nep.in" " ${CMAKE_CURRENT_BINARY_DIR}/dnep.out"
)
add_test(NAME "DSEP:_Testing_Symmetric_Eigenvalue_Problem_routines"
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sep.in" " ${CMAKE_CURRENT_BINARY_DIR}/dsep.out"
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sep.in" " ${CMAKE_CURRENT_BINARY_DIR}/dsep.out"
)
add_test(NAME "DSE2:_Testing_Symmetric_Eigenvalue_Problem_routines"
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/se2.in" " ${CMAKE_CURRENT_BINARY_DIR}/dse2.out"
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/se2.in" " ${CMAKE_CURRENT_BINARY_DIR}/dse2.out"
)
add_test(NAME "DSVD:_Testing_Singular_Value_Decomposition_routines"
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/svd.in" " ${CMAKE_CURRENT_BINARY_DIR}/dsvd.out"
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/svd.in" " ${CMAKE_CURRENT_BINARY_DIR}/dsvd.out"
)
add_test(NAME "DEC:_Testing_DOUBLE_PRECISION_Eigen_Condition_Routines"
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dec.in" " ${CMAKE_CURRENT_BINARY_DIR}/dec.out"
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dec.in" " ${CMAKE_CURRENT_BINARY_DIR}/dec.out"
)
add_test(NAME "DEV:_Testing_DOUBLE_PRECISION_Nonsymmetric_Eigenvalue_Driver"
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ded.in" " ${CMAKE_CURRENT_BINARY_DIR}/ded.out"
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ded.in" " ${CMAKE_CURRENT_BINARY_DIR}/ded.out"
)
add_test(NAME "DGG:_Testing_DOUBLE_PRECISION_Nonsymmetric_Generalized_Eigenvalue_Problem_routines"
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dgg.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgg.out"
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dgg.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgg.out"
)
add_test(NAME "DGD:_Testing_DOUBLE_PRECISION_Nonsymmetric_Generalized_Eigenvalue_Problem_driver_routines"
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dgd.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgd.out"
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dgd.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgd.out"
)
add_test(NAME "DSB:_Testing_DOUBLE_PRECISION_Symmetric_Eigenvalue_Problem_routines"
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dsb.in" " ${CMAKE_CURRENT_BINARY_DIR}/dsb.out"
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dsb.in" " ${CMAKE_CURRENT_BINARY_DIR}/dsb.out"
)
add_test(NAME "DSG:_Testing_DOUBLE_PRECISION_Symmetric_Generalized_Eigenvalue_Problem_routines"
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dsg.in" " ${CMAKE_CURRENT_BINARY_DIR}/dsg.out"
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dsg.in" " ${CMAKE_CURRENT_BINARY_DIR}/dsg.out"
)
add_test(NAME "DGEBAL:_Testing_the_balancing_of_a_DOUBLE_PRECISION_general_matrix"
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/dbal.out"
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/dbal.out"
)
add_test(NAME "DGEBAK:_Testing_the_back_transformation_of_a_DOUBLE_PRECISION_balanced_matrix"
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/dbak.out"
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/dbak.out"
)
add_test(NAME "DGGBAL:_Testing_the_balancing_of_a_pair_of_DOUBLE_PRECISION_general_matrices"
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dgbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgbal.out"
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dgbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgbal.out"
)
add_test(NAME "DGGBAK:_Testing_the_back_transformation_of_a_pair_of_DOUBLE_PRECISION_balanced_matrices"
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dgbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgbak.out"
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dgbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgbak.out"
)
add_test(NAME "DBB:_Testing_banded_Singular_Value_Decomposition_routines"
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dbb.in" " ${CMAKE_CURRENT_BINARY_DIR}/dbb.out"
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dbb.in" " ${CMAKE_CURRENT_BINARY_DIR}/dbb.out"
)
add_test(NAME "DGLM:_Testing_Generalized_Linear_Regression_Model_routines"
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/glm.in" " ${CMAKE_CURRENT_BINARY_DIR}/dglm.out"
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/glm.in" " ${CMAKE_CURRENT_BINARY_DIR}/dglm.out"
)
add_test(NAME "DGQR:_Testing_Generalized_QR_and_RQ_factorization_routines"
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gqr.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgqr.out"
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gqr.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgqr.out"
)
add_test(NAME "DGSV:_Testing_Generalized_Singular_Value_Decomposition_routines"
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gsv.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgsv.out"
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gsv.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgsv.out"
)
add_test(NAME "DCSD:_Testing_CS_Decomposition_routines"
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csd.in" " ${CMAKE_CURRENT_BINARY_DIR}/dcsd.out"
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csd.in" " ${CMAKE_CURRENT_BINARY_DIR}/dcsd.out"
)
add_test(NAME "DLSE:_Testing_Constrained_Linear_Least_Squares_routines"
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/lse.in" " ${CMAKE_CURRENT_BINARY_DIR}/dlse.out"
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/lse.in" " ${CMAKE_CURRENT_BINARY_DIR}/dlse.out"
)
# ======== COMPLEX16 EIG TESTS ===========================
add_test(NAME "ZNEP:_Testing_Nonsymmetric_Eigenvalue_Problem_routines"
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/nep.in" " ${CMAKE_CURRENT_BINARY_DIR}/znep.out"
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/nep.in" " ${CMAKE_CURRENT_BINARY_DIR}/znep.out"
)
add_test(NAME "ZSEP:_Testing_Symmetric_Eigenvalue_Problem_routines"
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sep.in" " ${CMAKE_CURRENT_BINARY_DIR}/zsep.out"
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sep.in" " ${CMAKE_CURRENT_BINARY_DIR}/zsep.out"
)
add_test(NAME "ZSE2:_Testing_Symmetric_Eigenvalue_Problem_routines"
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/se2.in" " ${CMAKE_CURRENT_BINARY_DIR}/zse2.out"
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/se2.in" " ${CMAKE_CURRENT_BINARY_DIR}/zse2.out"
)
add_test(NAME "ZSVD:_Testing_Singular_Value_Decomposition_routines"
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/svd.in" " ${CMAKE_CURRENT_BINARY_DIR}/zsvd.out"
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/svd.in" " ${CMAKE_CURRENT_BINARY_DIR}/zsvd.out"
)
add_test(NAME "ZEC:_Testing_COMPLEX16_Eigen_Condition_Routines"
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zec.in" " ${CMAKE_CURRENT_BINARY_DIR}/zec.out"
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zec.in" " ${CMAKE_CURRENT_BINARY_DIR}/zec.out"
)
add_test(NAME "ZES:_Testing_COMPLEX16_Nonsymmetric_Schur_Form_Driver"
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zed.in" " ${CMAKE_CURRENT_BINARY_DIR}/zed.out"
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zed.in" " ${CMAKE_CURRENT_BINARY_DIR}/zed.out"
)
add_test(NAME "ZGG:_Testing_COMPLEX16_Nonsymmetric_Generalized_Eigenvalue_Problem_routines"
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zgg.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgg.out"
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zgg.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgg.out"
)
add_test(NAME "ZGD:_Testing_COMPLEX16_Nonsymmetric_Generalized_Eigenvalue_Problem_driver_routines"
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zgd.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgd.out"
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zgd.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgd.out"
)
add_test(NAME "ZHB:_Testing_Hermitian_Eigenvalue_Problem_routines"
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zsb.in" " ${CMAKE_CURRENT_BINARY_DIR}/zsb.out"
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zsb.in" " ${CMAKE_CURRENT_BINARY_DIR}/zsb.out"
)
add_test(NAME "ZSG:_Testing_Symmetric_Generalized_Eigenvalue_Problem_routines"
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zsg.in" " ${CMAKE_CURRENT_BINARY_DIR}/zsg.out"
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zsg.in" " ${CMAKE_CURRENT_BINARY_DIR}/zsg.out"
)
add_test(NAME "ZGEBAL:_Testing_the_balancing_of_a_COMPLEX16_general_matrix"
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/zbal.out"
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/zbal.out"
)
add_test(NAME "ZGEBAK:_Testing_the_back_transformation_of_a_COMPLEX16_balanced_matrix"
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/zbak.out"
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/zbak.out"
)
add_test(NAME "ZGGBAL:_Testing_the_balancing_of_a_pair_of_COMPLEX_general_matrices"
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zgbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgbal.out"
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zgbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgbal.out"
)
add_test(NAME "ZGGBAK:_Testing_the_back_transformation_of_a_pair_of_COMPLEX16_balanced_matrices"
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zgbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgbak.out"
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zgbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgbak.out"
)
add_test(NAME "ZBB:_Testing_banded_Singular_Value_Decomposition_routines"
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zbb.in" " ${CMAKE_CURRENT_BINARY_DIR}/zbb.out"
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zbb.in" " ${CMAKE_CURRENT_BINARY_DIR}/zbb.out"
)
add_test(NAME "ZGLM:_Testing_Generalized_Linear_Regression_Model_routines"
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/glm.in" " ${CMAKE_CURRENT_BINARY_DIR}/zglm.out"
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/glm.in" " ${CMAKE_CURRENT_BINARY_DIR}/zglm.out"
)
add_test(NAME "ZGQR:_Testing_Generalized_QR_and_RQ_factorization_routines"
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gqr.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgqr.out"
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gqr.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgqr.out"
)
add_test(NAME "ZGSV:_Testing_Generalized_Singular_Value_Decomposition_routines"
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gsv.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgsv.out"
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gsv.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgsv.out"
)
add_test(NAME "ZCSD:_Testing_CS_Decomposition_routines"
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csd.in" " ${CMAKE_CURRENT_BINARY_DIR}/zcsd.out"
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csd.in" " ${CMAKE_CURRENT_BINARY_DIR}/zcsd.out"
)
add_test(NAME "Constrained_Linear_Least_Squares_routines"
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/lse.in" " ${CMAKE_CURRENT_BINARY_DIR}/zlse.out"
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/lse.in" " ${CMAKE_CURRENT_BINARY_DIR}/zlse.out"
)

View File

@ -25,7 +25,7 @@ set(AEIGTST
set(SCIGTST slafts.f slahd2.f slasum.f slatb9.f sstech.f sstect.f
ssvdch.f ssvdct.f ssxt1.f)
set(SEIGTST schkee.f
set(SEIGTST schkee.F
sbdt01.f sbdt02.f sbdt03.f sbdt04.f sbdt05.f
schkbb.f schkbd.f schkbk.f schkbl.f schkec.f
schkgg.f schkgk.f schkgl.f schkhs.f schksb.f schkst.f schkst2stg.f schksb2stg.f
@ -42,7 +42,7 @@ set(SEIGTST schkee.f
sort03.f ssbt21.f ssgt01.f sslect.f sspt21.f sstt21.f
sstt22.f ssyt21.f ssyt22.f)
set(CEIGTST cchkee.f
set(CEIGTST cchkee.F
cbdt01.f cbdt02.f cbdt03.f cbdt05.f
cchkbb.f cchkbd.f cchkbk.f cchkbl.f cchkec.f
cchkgg.f cchkgk.f cchkgl.f cchkhb.f cchkhs.f cchkst.f cchkst2stg.f cchkhb2stg.f
@ -62,7 +62,7 @@ set(CEIGTST cchkee.f
set(DZIGTST dlafts.f dlahd2.f dlasum.f dlatb9.f dstech.f dstect.f
dsvdch.f dsvdct.f dsxt1.f)
set(DEIGTST dchkee.f
set(DEIGTST dchkee.F
dbdt01.f dbdt02.f dbdt03.f dbdt04.f dbdt05.f
dchkbb.f dchkbd.f dchkbk.f dchkbl.f dchkec.f
dchkgg.f dchkgk.f dchkgl.f dchkhs.f dchksb.f dchkst.f dchkst2stg.f dchksb2stg.f
@ -79,7 +79,7 @@ set(DEIGTST dchkee.f
dort03.f dsbt21.f dsgt01.f dslect.f dspt21.f dstt21.f
dstt22.f dsyt21.f dsyt22.f)
set(ZEIGTST zchkee.f
set(ZEIGTST zchkee.F
zbdt01.f zbdt02.f zbdt03.f zbdt05.f
zchkbb.f zchkbd.f zchkbk.f zchkbl.f zchkec.f
zchkgg.f zchkgk.f zchkgl.f zchkhb.f zchkhs.f zchkst.f zchkst2stg.f zchkhb2stg.f

View File

@ -157,11 +157,11 @@ cleanobj:
cleanexe:
rm -f xeigtst*
schkee.o: schkee.f
schkee.o: schkee.F
$(FC) $(FFLAGS_DRV) -c -o $@ $<
dchkee.o: dchkee.f
dchkee.o: dchkee.F
$(FC) $(FFLAGS_DRV) -c -o $@ $<
cchkee.o: cchkee.f
cchkee.o: cchkee.F
$(FC) $(FFLAGS_DRV) -c -o $@ $<
zchkee.o: zchkee.f
zchkee.o: zchkee.F
$(FC) $(FFLAGS_DRV) -c -o $@ $<

View File

@ -1034,6 +1034,10 @@
* =====================================================================
PROGRAM CCHKEE
*
#if defined(_OPENMP)
use omp_lib
#endif
*
* -- LAPACK test routine (version 3.7.0) --
* -- LAPACK is a software package provided by Univ. of Tennessee, --
* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
@ -1071,7 +1075,7 @@
CHARACTER*80 LINE
INTEGER I, I1, IC, INFO, ITMP, K, LENP, MAXTYP, NEWSD,
$ NK, NN, NPARMS, NRHS, NTYPES,
$ VERS_MAJOR, VERS_MINOR, VERS_PATCH
$ VERS_MAJOR, VERS_MINOR, VERS_PATCH, N_THREADS
REAL EPS, S1, S2, THRESH, THRSHN
* ..
* .. Local Arrays ..
@ -1084,12 +1088,16 @@
INTEGER INMIN( MAXIN ), INWIN( MAXIN ), INIBL( MAXIN ),
$ ISHFTS( MAXIN ), IACC22( MAXIN )
REAL ALPHA( NMAX ), BETA( NMAX ), DR( NMAX, 12 ),
$ RESULT( 500 ), RWORK( LWORK ), S( NMAX*NMAX )
COMPLEX A( NMAX*NMAX, NEED ), B( NMAX*NMAX, 5 ),
$ C( NCMAX*NCMAX, NCMAX*NCMAX ), DC( NMAX, 6 ),
$ TAUA( NMAX ), TAUB( NMAX ), WORK( LWORK ),
$ RESULT( 500 )
COMPLEX DC( NMAX, 6 ), TAUA( NMAX ), TAUB( NMAX ),
$ X( 5*NMAX )
* ..
* .. Allocatable Arrays ..
INTEGER AllocateStatus
REAL, DIMENSION(:), ALLOCATABLE :: RWORK, S
COMPLEX, DIMENSION(:), ALLOCATABLE :: WORK
COMPLEX, DIMENSION(:,:), ALLOCATABLE :: A, B, C
* ..
* .. External Functions ..
LOGICAL LSAMEN
REAL SECOND, SLAMCH
@ -1130,6 +1138,21 @@
DATA INTSTR / '0123456789' /
DATA IOLDSD / 0, 0, 0, 1 /
* ..
* .. Allocate memory dynamically ..
*
ALLOCATE ( S(NMAX*NMAX), STAT = AllocateStatus )
IF (AllocateStatus /= 0) STOP "*** Not enough memory ***"
ALLOCATE ( A(NMAX*NMAX,NEED), STAT = AllocateStatus )
IF (AllocateStatus /= 0) STOP "*** Not enough memory ***"
ALLOCATE ( B(NMAX*NMAX,5), STAT = AllocateStatus )
IF (AllocateStatus /= 0) STOP "*** Not enough memory ***"
ALLOCATE ( C(NCMAX*NCMAX,NCMAX*NCMAX), STAT = AllocateStatus )
IF (AllocateStatus /= 0) STOP "*** Not enough memory ***"
ALLOCATE ( RWORK(LWORK), STAT = AllocateStatus )
IF (AllocateStatus /= 0) STOP "*** Not enough memory ***"
ALLOCATE ( WORK(LWORK), STAT = AllocateStatus )
IF (AllocateStatus /= 0) STOP "*** Not enough memory ***"
* ..
* .. Executable Statements ..
*
A = 0.0
@ -1846,8 +1869,16 @@
CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT )
CALL XLAENV( 1, 1 )
CALL XLAENV( 9, 25 )
IF( TSTERR )
$ CALL CERRST( 'CST', NOUT )
*
*        Run the error-exit tests for the CST path on a single OpenMP
*        thread, then restore the thread count that was in effect.
*
IF( TSTERR ) THEN
#if defined(_OPENMP)
*        Save the thread count with OMP_GET_MAX_THREADS: it reports how
*        many threads the next parallel region would use.
*        OMP_GET_NUM_THREADS reports the size of the current team, which
*        is 1 in serial code, so restoring from it would leave the rest
*        of the run single-threaded.
N_THREADS = OMP_GET_MAX_THREADS()
CALL OMP_SET_NUM_THREADS(1)
#endif
CALL CERRST( 'CST', NOUT )
#if defined(_OPENMP)
*        Restore the thread count saved above.
CALL OMP_SET_NUM_THREADS(N_THREADS)
#endif
END IF
DO 290 I = 1, NPARMS
CALL XLAENV( 1, NBVAL( I ) )
CALL XLAENV( 2, NBMIN( I ) )
@ -2305,8 +2336,16 @@
MAXTYP = 15
NTYPES = MIN( MAXTYP, NTYPES )
CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT )
IF( TSTERR )
$ CALL CERRST( 'CHB', NOUT )
*
*        Run the error-exit tests for the CHB path on a single OpenMP
*        thread, then restore the thread count that was in effect.
*
IF( TSTERR ) THEN
#if defined(_OPENMP)
*        Save the thread count with OMP_GET_MAX_THREADS: it reports how
*        many threads the next parallel region would use.
*        OMP_GET_NUM_THREADS reports the size of the current team, which
*        is 1 in serial code, so restoring from it would leave the rest
*        of the run single-threaded.
N_THREADS = OMP_GET_MAX_THREADS()
CALL OMP_SET_NUM_THREADS(1)
#endif
CALL CERRST( 'CHB', NOUT )
#if defined(_OPENMP)
*        Restore the thread count saved above.
CALL OMP_SET_NUM_THREADS(N_THREADS)
#endif
END IF
* CALL CCHKHB( NN, NVAL, NK, KVAL, MAXTYP, DOTYPE, ISEED, THRESH,
* $ NOUT, A( 1, 1 ), NMAX, DR( 1, 1 ), DR( 1, 2 ),
* $ A( 1, 2 ), NMAX, WORK, LWORK, RWORK, RESULT,
@ -2437,6 +2476,13 @@
WRITE( NOUT, FMT = 9994 )
S2 = SECOND( )
WRITE( NOUT, FMT = 9993 )S2 - S1
*
DEALLOCATE (S, STAT = AllocateStatus)
DEALLOCATE (A, STAT = AllocateStatus)
DEALLOCATE (B, STAT = AllocateStatus)
DEALLOCATE (C, STAT = AllocateStatus)
DEALLOCATE (RWORK, STAT = AllocateStatus)
DEALLOCATE (WORK, STAT = AllocateStatus)
*
9999 FORMAT( / ' Execution not attempted due to input errors' )
9997 FORMAT( / / 1X, A3, ': NB =', I4, ', NBMIN =', I4, ', NX =', I4 )

View File

@ -1040,6 +1040,10 @@
* =====================================================================
PROGRAM DCHKEE
*
#if defined(_OPENMP)
use omp_lib
#endif
*
* -- LAPACK test routine (version 3.7.0) --
* -- LAPACK is a software package provided by Univ. of Tennessee, --
* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
@ -1077,7 +1081,7 @@
CHARACTER*80 LINE
INTEGER I, I1, IC, INFO, ITMP, K, LENP, MAXTYP, NEWSD,
$ NK, NN, NPARMS, NRHS, NTYPES,
$ VERS_MAJOR, VERS_MINOR, VERS_PATCH
$ VERS_MAJOR, VERS_MINOR, VERS_PATCH, N_THREADS
DOUBLE PRECISION EPS, S1, S2, THRESH, THRSHN
* ..
* .. Local Arrays ..
@ -1089,10 +1093,13 @@
$ PVAL( MAXIN )
INTEGER INMIN( MAXIN ), INWIN( MAXIN ), INIBL( MAXIN ),
$ ISHFTS( MAXIN ), IACC22( MAXIN )
DOUBLE PRECISION A( NMAX*NMAX, NEED ), B( NMAX*NMAX, 5 ),
$ C( NCMAX*NCMAX, NCMAX*NCMAX ), D( NMAX, 12 ),
$ RESULT( 500 ), TAUA( NMAX ), TAUB( NMAX ),
$ WORK( LWORK ), X( 5*NMAX )
DOUBLE PRECISION D( NMAX, 12 ), RESULT( 500 ), TAUA( NMAX ),
$ TAUB( NMAX ), X( 5*NMAX )
* ..
* .. Allocatable Arrays ..
INTEGER AllocateStatus
DOUBLE PRECISION, DIMENSION(:), ALLOCATABLE :: WORK
DOUBLE PRECISION, DIMENSION(:,:), ALLOCATABLE :: A, B, C
* ..
* .. External Functions ..
LOGICAL LSAMEN
@ -1134,6 +1141,17 @@
DATA INTSTR / '0123456789' /
DATA IOLDSD / 0, 0, 0, 1 /
* ..
* .. Allocate memory dynamically ..
*
ALLOCATE ( A(NMAX*NMAX,NEED), STAT = AllocateStatus )
IF (AllocateStatus /= 0) STOP "*** Not enough memory ***"
ALLOCATE ( B(NMAX*NMAX,5), STAT = AllocateStatus )
IF (AllocateStatus /= 0) STOP "*** Not enough memory ***"
ALLOCATE ( C(NCMAX*NCMAX,NCMAX*NCMAX), STAT = AllocateStatus )
IF (AllocateStatus /= 0) STOP "*** Not enough memory ***"
ALLOCATE ( WORK(LWORK), STAT = AllocateStatus )
IF (AllocateStatus /= 0) STOP "*** Not enough memory ***"
* ..
* .. Executable Statements ..
*
A = 0.0
@ -1856,8 +1874,16 @@
CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT )
CALL XLAENV( 1, 1 )
CALL XLAENV( 9, 25 )
IF( TSTERR )
$ CALL DERRST( 'DST', NOUT )
*
*        Run the error-exit tests for the DST path on a single OpenMP
*        thread, then restore the thread count that was in effect.
*
IF( TSTERR ) THEN
#if defined(_OPENMP)
*        Save the thread count with OMP_GET_MAX_THREADS: it reports how
*        many threads the next parallel region would use.
*        OMP_GET_NUM_THREADS reports the size of the current team, which
*        is 1 in serial code, so restoring from it would leave the rest
*        of the run single-threaded.
N_THREADS = OMP_GET_MAX_THREADS()
CALL OMP_SET_NUM_THREADS(1)
#endif
CALL DERRST( 'DST', NOUT )
#if defined(_OPENMP)
*        Restore the thread count saved above.
CALL OMP_SET_NUM_THREADS(N_THREADS)
#endif
END IF
DO 290 I = 1, NPARMS
CALL XLAENV( 1, NBVAL( I ) )
CALL XLAENV( 2, NBMIN( I ) )
@ -2437,6 +2463,11 @@
WRITE( NOUT, FMT = 9994 )
S2 = DSECND( )
WRITE( NOUT, FMT = 9993 )S2 - S1
*
DEALLOCATE (A, STAT = AllocateStatus)
DEALLOCATE (B, STAT = AllocateStatus)
DEALLOCATE (C, STAT = AllocateStatus)
DEALLOCATE (WORK, STAT = AllocateStatus)
*
9999 FORMAT( / ' Execution not attempted due to input errors' )
9997 FORMAT( / / 1X, A3, ': NB =', I4, ', NBMIN =', I4, ', NX =', I4 )

View File

@ -1040,6 +1040,10 @@
* =====================================================================
PROGRAM SCHKEE
*
#if defined(_OPENMP)
use omp_lib
#endif
*
* -- LAPACK test routine (version 3.7.0) --
* -- LAPACK is a software package provided by Univ. of Tennessee, --
* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
@ -1077,7 +1081,7 @@
CHARACTER*80 LINE
INTEGER I, I1, IC, INFO, ITMP, K, LENP, MAXTYP, NEWSD,
$ NK, NN, NPARMS, NRHS, NTYPES,
$ VERS_MAJOR, VERS_MINOR, VERS_PATCH
$ VERS_MAJOR, VERS_MINOR, VERS_PATCH, N_THREADS
REAL EPS, S1, S2, THRESH, THRSHN
* ..
* .. Local Arrays ..
@ -1089,10 +1093,13 @@
$ PVAL( MAXIN )
INTEGER INMIN( MAXIN ), INWIN( MAXIN ), INIBL( MAXIN ),
$ ISHFTS( MAXIN ), IACC22( MAXIN )
REAL A( NMAX*NMAX, NEED ), B( NMAX*NMAX, 5 ),
$ C( NCMAX*NCMAX, NCMAX*NCMAX ), D( NMAX, 12 ),
$ RESULT( 500 ), TAUA( NMAX ), TAUB( NMAX ),
$ WORK( LWORK ), X( 5*NMAX )
REAL D( NMAX, 12 ), RESULT( 500 ), TAUA( NMAX ),
$ TAUB( NMAX ), X( 5*NMAX )
* ..
* .. Allocatable Arrays ..
INTEGER AllocateStatus
REAL, DIMENSION(:), ALLOCATABLE :: WORK
REAL, DIMENSION(:,:), ALLOCATABLE :: A, B, C
* ..
* .. External Functions ..
LOGICAL LSAMEN
@ -1134,6 +1141,17 @@
DATA INTSTR / '0123456789' /
DATA IOLDSD / 0, 0, 0, 1 /
* ..
* .. Allocate memory dynamically ..
*
ALLOCATE ( A(NMAX*NMAX,NEED), STAT = AllocateStatus )
IF (AllocateStatus /= 0) STOP "*** Not enough memory ***"
ALLOCATE ( B(NMAX*NMAX,5), STAT = AllocateStatus )
IF (AllocateStatus /= 0) STOP "*** Not enough memory ***"
ALLOCATE ( C(NCMAX*NCMAX,NCMAX*NCMAX), STAT = AllocateStatus )
IF (AllocateStatus /= 0) STOP "*** Not enough memory ***"
ALLOCATE ( WORK(LWORK), STAT = AllocateStatus )
IF (AllocateStatus /= 0) STOP "*** Not enough memory ***"
* ..
* .. Executable Statements ..
*
A = 0.0
@ -1857,8 +1875,16 @@
CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT )
CALL XLAENV( 1, 1 )
CALL XLAENV( 9, 25 )
IF( TSTERR )
$ CALL SERRST( 'SST', NOUT )
*
*        Run the error-exit tests for the SST path on a single OpenMP
*        thread, then restore the thread count that was in effect.
*
IF( TSTERR ) THEN
#if defined(_OPENMP)
*        Save the thread count with OMP_GET_MAX_THREADS: it reports how
*        many threads the next parallel region would use.
*        OMP_GET_NUM_THREADS reports the size of the current team, which
*        is 1 in serial code, so restoring from it would leave the rest
*        of the run single-threaded.
N_THREADS = OMP_GET_MAX_THREADS()
CALL OMP_SET_NUM_THREADS(1)
#endif
CALL SERRST( 'SST', NOUT )
#if defined(_OPENMP)
*        Restore the thread count saved above.
CALL OMP_SET_NUM_THREADS(N_THREADS)
#endif
END IF
DO 290 I = 1, NPARMS
CALL XLAENV( 1, NBVAL( I ) )
CALL XLAENV( 2, NBMIN( I ) )
@ -2440,6 +2466,11 @@
WRITE( NOUT, FMT = 9994 )
S2 = SECOND( )
WRITE( NOUT, FMT = 9993 )S2 - S1
*
DEALLOCATE (A, STAT = AllocateStatus)
DEALLOCATE (B, STAT = AllocateStatus)
DEALLOCATE (C, STAT = AllocateStatus)
DEALLOCATE (WORK, STAT = AllocateStatus)
*
9999 FORMAT( / ' Execution not attempted due to input errors' )
9997 FORMAT( / / 1X, A3, ': NB =', I4, ', NBMIN =', I4, ', NX =', I4 )

View File

@ -1034,6 +1034,10 @@
* =====================================================================
PROGRAM ZCHKEE
*
#if defined(_OPENMP)
use omp_lib
#endif
*
* -- LAPACK test routine (version 3.7.0) --
* -- LAPACK is a software package provided by Univ. of Tennessee, --
* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
@ -1071,7 +1075,7 @@
CHARACTER*80 LINE
INTEGER I, I1, IC, INFO, ITMP, K, LENP, MAXTYP, NEWSD,
$ NK, NN, NPARMS, NRHS, NTYPES,
$ VERS_MAJOR, VERS_MINOR, VERS_PATCH
$ VERS_MAJOR, VERS_MINOR, VERS_PATCH, N_THREADS
DOUBLE PRECISION EPS, S1, S2, THRESH, THRSHN
* ..
* .. Local Arrays ..
@ -1084,12 +1088,16 @@
INTEGER INMIN( MAXIN ), INWIN( MAXIN ), INIBL( MAXIN ),
$ ISHFTS( MAXIN ), IACC22( MAXIN )
DOUBLE PRECISION ALPHA( NMAX ), BETA( NMAX ), DR( NMAX, 12 ),
$ RESULT( 500 ), RWORK( LWORK ), S( NMAX*NMAX )
COMPLEX*16 A( NMAX*NMAX, NEED ), B( NMAX*NMAX, 5 ),
$ C( NCMAX*NCMAX, NCMAX*NCMAX ), DC( NMAX, 6 ),
$ TAUA( NMAX ), TAUB( NMAX ), WORK( LWORK ),
$ RESULT( 500 )
COMPLEX*16 DC( NMAX, 6 ), TAUA( NMAX ), TAUB( NMAX ),
$ X( 5*NMAX )
* ..
* .. Allocatable Arrays ..
INTEGER AllocateStatus
DOUBLE PRECISION, DIMENSION(:), ALLOCATABLE :: RWORK, S
COMPLEX*16, DIMENSION(:), ALLOCATABLE :: WORK
COMPLEX*16, DIMENSION(:,:), ALLOCATABLE :: A, B, C
* ..
* .. External Functions ..
LOGICAL LSAMEN
DOUBLE PRECISION DLAMCH, DSECND
@ -1130,6 +1138,21 @@
DATA INTSTR / '0123456789' /
DATA IOLDSD / 0, 0, 0, 1 /
* ..
* .. Allocate memory dynamically ..
*
ALLOCATE ( S(NMAX*NMAX), STAT = AllocateStatus )
IF (AllocateStatus /= 0) STOP "*** Not enough memory ***"
ALLOCATE ( A(NMAX*NMAX,NEED), STAT = AllocateStatus )
IF (AllocateStatus /= 0) STOP "*** Not enough memory ***"
ALLOCATE ( B(NMAX*NMAX,5), STAT = AllocateStatus )
IF (AllocateStatus /= 0) STOP "*** Not enough memory ***"
ALLOCATE ( C(NCMAX*NCMAX,NCMAX*NCMAX), STAT = AllocateStatus )
IF (AllocateStatus /= 0) STOP "*** Not enough memory ***"
ALLOCATE ( RWORK(LWORK), STAT = AllocateStatus )
IF (AllocateStatus /= 0) STOP "*** Not enough memory ***"
ALLOCATE ( WORK(LWORK), STAT = AllocateStatus )
IF (AllocateStatus /= 0) STOP "*** Not enough memory ***"
* ..
* .. Executable Statements ..
*
A = 0.0
@ -1846,8 +1869,16 @@
CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT )
CALL XLAENV( 1, 1 )
CALL XLAENV( 9, 25 )
IF( TSTERR )
$ CALL ZERRST( 'ZST', NOUT )
*
*        Run the error-exit tests for the ZST path on a single OpenMP
*        thread, then restore the thread count that was in effect.
*
IF( TSTERR ) THEN
#if defined(_OPENMP)
*        Save the thread count with OMP_GET_MAX_THREADS: it reports how
*        many threads the next parallel region would use.
*        OMP_GET_NUM_THREADS reports the size of the current team, which
*        is 1 in serial code, so restoring from it would leave the rest
*        of the run single-threaded.
N_THREADS = OMP_GET_MAX_THREADS()
CALL OMP_SET_NUM_THREADS(1)
#endif
CALL ZERRST( 'ZST', NOUT )
#if defined(_OPENMP)
*        Restore the thread count saved above.
CALL OMP_SET_NUM_THREADS(N_THREADS)
#endif
END IF
DO 290 I = 1, NPARMS
CALL XLAENV( 1, NBVAL( I ) )
CALL XLAENV( 2, NBMIN( I ) )
@ -2303,8 +2334,16 @@
MAXTYP = 15
NTYPES = MIN( MAXTYP, NTYPES )
CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT )
IF( TSTERR )
$ CALL ZERRST( 'ZHB', NOUT )
*
*        Run the error-exit tests for the ZHB path on a single OpenMP
*        thread, then restore the thread count that was in effect.
*
IF( TSTERR ) THEN
#if defined(_OPENMP)
*        Save the thread count with OMP_GET_MAX_THREADS: it reports how
*        many threads the next parallel region would use.
*        OMP_GET_NUM_THREADS reports the size of the current team, which
*        is 1 in serial code, so restoring from it would leave the rest
*        of the run single-threaded.
N_THREADS = OMP_GET_MAX_THREADS()
CALL OMP_SET_NUM_THREADS(1)
#endif
CALL ZERRST( 'ZHB', NOUT )
#if defined(_OPENMP)
*        Restore the thread count saved above.
CALL OMP_SET_NUM_THREADS(N_THREADS)
#endif
END IF
* CALL ZCHKHB( NN, NVAL, NK, KVAL, MAXTYP, DOTYPE, ISEED, THRESH,
* $ NOUT, A( 1, 1 ), NMAX, DR( 1, 1 ), DR( 1, 2 ),
* $ A( 1, 2 ), NMAX, WORK, LWORK, RWORK, RESULT,
@ -2435,6 +2474,13 @@
WRITE( NOUT, FMT = 9994 )
S2 = DSECND( )
WRITE( NOUT, FMT = 9993 )S2 - S1
*
DEALLOCATE (S, STAT = AllocateStatus)
DEALLOCATE (A, STAT = AllocateStatus)
DEALLOCATE (B, STAT = AllocateStatus)
DEALLOCATE (C, STAT = AllocateStatus)
DEALLOCATE (RWORK, STAT = AllocateStatus)
DEALLOCATE (WORK, STAT = AllocateStatus)
*
9999 FORMAT( / ' Execution not attempted due to input errors' )
9997 FORMAT( / / 1X, A3, ': NB =', I4, ', NBMIN =', I4, ', NX =', I4 )

126
param.h
View File

@ -72,6 +72,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifndef PARAM_H
#define PARAM_H
#define LONGCAST (BLASLONG)
#if defined(__BYTE_ORDER__)
#if __GNUC__ < 9
#undef LONGCAST
#define LONGCAST
#endif
#endif
#define SBGEMM_DEFAULT_UNROLL_N 4
#define SBGEMM_DEFAULT_UNROLL_M 8
#define SBGEMM_DEFAULT_UNROLL_MN 32
@ -85,7 +93,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define GEMM_DEFAULT_OFFSET_A 64
#define GEMM_DEFAULT_OFFSET_B 256
#define GEMM_DEFAULT_ALIGN 0x01ffffUL
#define GEMM_DEFAULT_ALIGN (BLASLONG)0x01ffffUL
#define SGEMM_DEFAULT_UNROLL_N 4
#define DGEMM_DEFAULT_UNROLL_N 4
@ -157,7 +165,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define GEMM_DEFAULT_OFFSET_A 64
#define GEMM_DEFAULT_OFFSET_B 832
#define GEMM_DEFAULT_ALIGN 0x0fffUL
#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0fffUL
#define SGEMM_DEFAULT_UNROLL_N 4
#define DGEMM_DEFAULT_UNROLL_N 4
@ -237,7 +245,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define GEMM_DEFAULT_OFFSET_A 64
#define GEMM_DEFAULT_OFFSET_B 832
#define GEMM_DEFAULT_ALIGN 0x0fffUL
#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0fffUL
@ -330,7 +338,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define GEMM_DEFAULT_OFFSET_A 64
#define GEMM_DEFAULT_OFFSET_B 832
#define GEMM_DEFAULT_ALIGN 0x0fffUL
#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0fffUL
@ -422,7 +430,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define GEMM_DEFAULT_OFFSET_A 64
#define GEMM_DEFAULT_OFFSET_B 832
#define GEMM_DEFAULT_ALIGN 0x0fffUL
#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0fffUL
@ -515,7 +523,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define GEMM_DEFAULT_OFFSET_A 64
#define GEMM_DEFAULT_OFFSET_B 832
#define GEMM_DEFAULT_ALIGN 0x0fffUL
#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0fffUL
@ -607,7 +615,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define GEMM_DEFAULT_OFFSET_A 0
#define GEMM_DEFAULT_OFFSET_B 0
#define GEMM_DEFAULT_ALIGN 0x03fffUL
#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL
#define SYMV_P 8
@ -726,7 +734,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define GEMM_DEFAULT_OFFSET_A 0
#define GEMM_DEFAULT_OFFSET_B 384
#define GEMM_DEFAULT_ALIGN 0x0ffffUL
#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL
#define SGEMM_DEFAULT_UNROLL_N 4
#define DGEMM_DEFAULT_UNROLL_N 4
@ -774,7 +782,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define GEMM_DEFAULT_OFFSET_A 0
#define GEMM_DEFAULT_OFFSET_B 256
#define GEMM_DEFAULT_ALIGN 0x0ffffUL
#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL
#define SGEMM_DEFAULT_UNROLL_N 4
#define DGEMM_DEFAULT_UNROLL_N 4
@ -821,7 +829,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define GEMM_DEFAULT_OFFSET_A 64
#define GEMM_DEFAULT_OFFSET_B 256
#define GEMM_DEFAULT_ALIGN 0x01ffffUL
#define GEMM_DEFAULT_ALIGN (BLASLONG)0x01ffffUL
#ifdef ARCH_X86
#define SGEMM_DEFAULT_UNROLL_N 4
@ -890,7 +898,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define GEMM_DEFAULT_OFFSET_A 0
#define GEMM_DEFAULT_OFFSET_B 0
#define GEMM_DEFAULT_ALIGN 0x0ffffUL
#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL
#ifdef HAVE_SSE
#define SGEMM_DEFAULT_UNROLL_M 8
@ -945,7 +953,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define GEMM_DEFAULT_OFFSET_A 0
#define GEMM_DEFAULT_OFFSET_B 0
#define GEMM_DEFAULT_ALIGN 0x0ffffUL
#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL
#ifdef CORE_YONAH
#define SGEMM_DEFAULT_UNROLL_M 4
@ -1011,7 +1019,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define GEMM_DEFAULT_OFFSET_A 0
#define GEMM_DEFAULT_OFFSET_B 32
#define GEMM_DEFAULT_ALIGN 0x0ffffUL
#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL
#define SYMV_P 8
@ -1068,7 +1076,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define GEMM_DEFAULT_OFFSET_B 256
#endif
#define GEMM_DEFAULT_ALIGN 0x0ffffUL
#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL
#define SYMV_P 8
@ -1128,7 +1136,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define GEMM_DEFAULT_OFFSET_A 448
#define GEMM_DEFAULT_OFFSET_B 128
#define GEMM_DEFAULT_ALIGN 0x03fffUL
#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL
#define SYMV_P 8
@ -1201,7 +1209,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define GEMM_DEFAULT_OFFSET_A 128
#define GEMM_DEFAULT_OFFSET_B 0
#define GEMM_DEFAULT_ALIGN 0x03fffUL
#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL
#define SYMV_P 8
@ -1272,7 +1280,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define GEMM_DEFAULT_OFFSET_A 128
#define GEMM_DEFAULT_OFFSET_B 0
#define GEMM_DEFAULT_ALIGN 0x03fffUL
#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL
#define SYMV_P 8
@ -1344,7 +1352,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define GEMM_DEFAULT_OFFSET_A 32
#define GEMM_DEFAULT_OFFSET_B 0
#define GEMM_DEFAULT_ALIGN 0x03fffUL
#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL
#define SYMV_P 8
@ -1417,7 +1425,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define GEMM_DEFAULT_OFFSET_A 0
#define GEMM_DEFAULT_OFFSET_B 0
#define GEMM_DEFAULT_ALIGN 0x03fffUL
#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL
#define SYMV_P 8
@ -1510,7 +1518,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define GEMM_DEFAULT_OFFSET_A 0
#define GEMM_DEFAULT_OFFSET_B 0
#define GEMM_DEFAULT_ALIGN 0x03fffUL
#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL
#define SYMV_P 8
@ -1636,7 +1644,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define GEMM_DEFAULT_OFFSET_A 0
#define GEMM_DEFAULT_OFFSET_B 0
#define GEMM_DEFAULT_ALIGN 0x03fffUL
#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL
#define SYMV_P 8
@ -1877,7 +1885,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define GEMM_DEFAULT_OFFSET_A 64
#define GEMM_DEFAULT_OFFSET_B 0
#define GEMM_DEFAULT_ALIGN 0x0ffffUL
#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL
#define SYMV_P 8
@ -1939,7 +1947,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define GEMM_DEFAULT_OFFSET_A 0
#define GEMM_DEFAULT_OFFSET_B 128
#define GEMM_DEFAULT_ALIGN 0x03fffUL
#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL
#define SGEMM_DEFAULT_UNROLL_M 8
#define SGEMM_DEFAULT_UNROLL_N 8
@ -1993,7 +2001,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define GEMM_DEFAULT_OFFSET_A 512
#define GEMM_DEFAULT_OFFSET_B 512
#define GEMM_DEFAULT_ALIGN 0x0ffffUL
#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL
#define SGEMM_DEFAULT_UNROLL_M 4
#define SGEMM_DEFAULT_UNROLL_N 4
@ -2061,7 +2069,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define GEMM_DEFAULT_OFFSET_A 0
#define GEMM_DEFAULT_OFFSET_B 8192
#define GEMM_DEFAULT_ALIGN 0x0ffffUL
#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL
#define SGEMM_DEFAULT_UNROLL_M 16
#define SGEMM_DEFAULT_UNROLL_N 4
@ -2088,7 +2096,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifdef PPCG4
#define GEMM_DEFAULT_OFFSET_A 0
#define GEMM_DEFAULT_OFFSET_B 1024
#define GEMM_DEFAULT_ALIGN 0x0ffffUL
#define GEMM_DEFAULT_ALIGN LONGCAST 0x0ffffUL
#define SGEMM_DEFAULT_UNROLL_M 16
#define SGEMM_DEFAULT_UNROLL_N 4
@ -2119,7 +2127,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define GEMM_DEFAULT_OFFSET_A 2688
#define GEMM_DEFAULT_OFFSET_B 3072
#define GEMM_DEFAULT_ALIGN 0x03fffUL
#define GEMM_DEFAULT_ALIGN LONGCAST 0x03fffUL
#if defined(__BYTE_ORDER__)&&(__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
#define SGEMM_DEFAULT_UNROLL_M 4
@ -2168,7 +2176,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define GEMM_DEFAULT_OFFSET_A (32 * 0)
#define GEMM_DEFAULT_OFFSET_B (32 * 0)
#define GEMM_DEFAULT_ALIGN 0x0ffffUL
#define GEMM_DEFAULT_ALIGN LONGCAST 0x0ffffUL
#define SGEMM_DEFAULT_UNROLL_M 4
#define SGEMM_DEFAULT_UNROLL_N 4
@ -2204,7 +2212,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define GEMM_DEFAULT_OFFSET_A (32 * 0)
#define GEMM_DEFAULT_OFFSET_B (32 * 0)
#define GEMM_DEFAULT_ALIGN 0x0ffffUL
#define GEMM_DEFAULT_ALIGN LONGCAST 0x0ffffUL
#define SGEMM_DEFAULT_UNROLL_M 8
#define SGEMM_DEFAULT_UNROLL_N 4
@ -2239,7 +2247,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(POWER3) || defined(POWER4) || defined(POWER5)
#define GEMM_DEFAULT_OFFSET_A 0
#define GEMM_DEFAULT_OFFSET_B 2048
#define GEMM_DEFAULT_ALIGN 0x0ffffUL
#define GEMM_DEFAULT_ALIGN LONGCAST 0x0ffffUL
#define SGEMM_DEFAULT_UNROLL_M 4
#define SGEMM_DEFAULT_UNROLL_N 4
@ -2312,7 +2320,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define GEMM_DEFAULT_OFFSET_A 384
#define GEMM_DEFAULT_OFFSET_B 1024
#define GEMM_DEFAULT_ALIGN 0x03fffUL
#define GEMM_DEFAULT_ALIGN LONGCAST 0x03fffUL
#define SGEMM_DEFAULT_UNROLL_M 4
#define SGEMM_DEFAULT_UNROLL_N 4
@ -2344,7 +2352,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define GEMM_DEFAULT_OFFSET_A 0
#define GEMM_DEFAULT_OFFSET_B 65536
#define GEMM_DEFAULT_ALIGN 0x0ffffUL
#define GEMM_DEFAULT_ALIGN LONGCAST 0x0ffffUL
#if defined(__32BIT__)
#warning using BINARY32==POWER6
#define SGEMM_DEFAULT_UNROLL_M 4
@ -2397,7 +2406,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define GEMM_DEFAULT_OFFSET_A 0
#define GEMM_DEFAULT_OFFSET_B 65536
#define GEMM_DEFAULT_ALIGN 0x0ffffUL
#define GEMM_DEFAULT_ALIGN LONGCAST 0x0ffffUL
#define SWITCH_RATIO 16
#define GEMM_PREFERED_SIZE 16
#define SGEMM_DEFAULT_UNROLL_M 16
#define SGEMM_DEFAULT_UNROLL_N 8
@ -2433,24 +2445,32 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define GEMM_DEFAULT_OFFSET_A 0
#define GEMM_DEFAULT_OFFSET_B 65536
#define GEMM_DEFAULT_ALIGN 0x0ffffUL
#define GEMM_DEFAULT_ALIGN LONGCAST 0x0ffffUL
#define SWITCH_RATIO 16
#define GEMM_PREFERED_SIZE 16
#define SGEMM_DEFAULT_UNROLL_M 16
#define SGEMM_DEFAULT_UNROLL_N 8
#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
#define DGEMM_DEFAULT_UNROLL_M 16
#define DGEMM_DEFAULT_UNROLL_N 4
#else
#define DGEMM_DEFAULT_UNROLL_M 8
#define DGEMM_DEFAULT_UNROLL_N 8
#endif
#define CGEMM_DEFAULT_UNROLL_M 8
#define CGEMM_DEFAULT_UNROLL_N 4
#define ZGEMM_DEFAULT_UNROLL_M 8
#define ZGEMM_DEFAULT_UNROLL_N 2
#define SGEMM_DEFAULT_P 832
#define DGEMM_DEFAULT_P 320
#define SGEMM_DEFAULT_P 512
#define DGEMM_DEFAULT_P 384
#define CGEMM_DEFAULT_P 512
#define ZGEMM_DEFAULT_P 256
#define SGEMM_DEFAULT_Q 1026
#define DGEMM_DEFAULT_Q 960
#define SGEMM_DEFAULT_Q 512
#define DGEMM_DEFAULT_Q 512
#define CGEMM_DEFAULT_Q 1026
#define ZGEMM_DEFAULT_Q 1026
@ -2480,7 +2500,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define GEMM_DEFAULT_OFFSET_A 0
#define GEMM_DEFAULT_OFFSET_B 2048
#define GEMM_DEFAULT_ALIGN 0x03fffUL
#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL
#define SGEMM_DEFAULT_UNROLL_M 2
#define SGEMM_DEFAULT_UNROLL_N 8
@ -2512,7 +2532,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define GEMM_DEFAULT_OFFSET_A 0
#define GEMM_DEFAULT_OFFSET_B 2048
#define GEMM_DEFAULT_ALIGN 0x03fffUL
#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL
#define SGEMM_DEFAULT_UNROLL_M 4
#define SGEMM_DEFAULT_UNROLL_N 4
@ -2543,7 +2563,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define GEMM_DEFAULT_OFFSET_A 0
#define GEMM_DEFAULT_OFFSET_B 0
#define GEMM_DEFAULT_ALIGN 0x03fffUL
#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL
#define SGEMM_DEFAULT_UNROLL_M 2
#define SGEMM_DEFAULT_UNROLL_N 8
@ -2578,7 +2598,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define GEMM_DEFAULT_OFFSET_A 0
#define GEMM_DEFAULT_OFFSET_B 0
#define GEMM_DEFAULT_ALIGN 0x03fffUL
#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL
#ifdef HAVE_MSA
#define SGEMM_DEFAULT_UNROLL_M 8
@ -2634,7 +2654,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define GEMM_DEFAULT_OFFSET_A 0
#define GEMM_DEFAULT_OFFSET_B 0
#define GEMM_DEFAULT_ALIGN 0x03fffUL
#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL
#define SGEMM_DEFAULT_UNROLL_M 8
#define SGEMM_DEFAULT_UNROLL_N 4
@ -2675,7 +2695,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define GEMM_DEFAULT_OFFSET_A 0
#define GEMM_DEFAULT_OFFSET_B 0
#define GEMM_DEFAULT_ALIGN 0x03fffUL
#define GEMM_DEFAULT_ALIGN (BLASLONG) 0x03fffUL
#ifdef HAVE_MSA
#define SGEMM_DEFAULT_UNROLL_M 8
@ -2724,7 +2744,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifdef RISCV64_GENERIC
#define GEMM_DEFAULT_OFFSET_A 0
#define GEMM_DEFAULT_OFFSET_B 0
#define GEMM_DEFAULT_ALIGN 0x03fffUL
#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL
#define SGEMM_DEFAULT_UNROLL_M 2
#define SGEMM_DEFAULT_UNROLL_N 2
@ -2805,7 +2825,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define GEMM_DEFAULT_OFFSET_A 0
#define GEMM_DEFAULT_OFFSET_B 0
#define GEMM_DEFAULT_ALIGN 0x03fffUL
#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL
#define SGEMM_DEFAULT_UNROLL_M 4
#define SGEMM_DEFAULT_UNROLL_N 4
@ -2846,7 +2866,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define GEMM_DEFAULT_OFFSET_A 0
#define GEMM_DEFAULT_OFFSET_B 0
#define GEMM_DEFAULT_ALIGN 0x03fffUL
#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL
#define SGEMM_DEFAULT_UNROLL_M 4
#define SGEMM_DEFAULT_UNROLL_N 2
@ -3121,7 +3141,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d
#define GEMM_DEFAULT_OFFSET_A 0
#define GEMM_DEFAULT_OFFSET_B 0
#define GEMM_DEFAULT_ALIGN 0x03fffUL
#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL
#define SGEMM_DEFAULT_UNROLL_M 2
#define SGEMM_DEFAULT_UNROLL_N 2
@ -3162,7 +3182,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d
#define GEMM_DEFAULT_OFFSET_A 0
#define GEMM_DEFAULT_OFFSET_B 0
#define GEMM_DEFAULT_ALIGN 0x03fffUL
#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL
#define SGEMM_DEFAULT_UNROLL_M 4
#define SGEMM_DEFAULT_UNROLL_N 4
@ -3203,7 +3223,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d
#define GEMM_DEFAULT_OFFSET_A 0
#define GEMM_DEFAULT_OFFSET_B 0
#define GEMM_DEFAULT_ALIGN 0x03fffUL
#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL
#define SGEMM_DEFAULT_UNROLL_M 4
#define SGEMM_DEFAULT_UNROLL_N 4
@ -3244,7 +3264,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d
#define GEMM_DEFAULT_OFFSET_A 0
#define GEMM_DEFAULT_OFFSET_B 0
#define GEMM_DEFAULT_ALIGN 0x03fffUL
#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL
#define SGEMM_DEFAULT_UNROLL_M 2
#define SGEMM_DEFAULT_UNROLL_N 2
@ -3283,7 +3303,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d
#define GEMM_DEFAULT_OFFSET_A 0
#define GEMM_DEFAULT_OFFSET_B 0
#define GEMM_DEFAULT_ALIGN 0x03fffUL
#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL
#define SGEMM_DEFAULT_UNROLL_M 8
#define SGEMM_DEFAULT_UNROLL_N 4
@ -3365,7 +3385,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d
#define GEMM_DEFAULT_OFFSET_A 0
#define GEMM_DEFAULT_OFFSET_B 0
#define GEMM_DEFAULT_ALIGN 0x0ffffUL
#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL
#define SGEMM_DEFAULT_UNROLL_N 2
#define DGEMM_DEFAULT_UNROLL_N 2

Some files were not shown because too many files have changed in this diff Show More