Merge pull request #3150 from xianyi/develop
Update branch from develop for 0.3.14 release
This commit is contained in:
commit
2f6d35c3d4
24
.drone.yml
24
.drone.yml
|
@ -190,3 +190,27 @@ steps:
|
|||
- make -C ctest $COMMON_FLAGS
|
||||
- make -C utest $COMMON_FLAGS
|
||||
- make -C cpp_thread_test dgemm_tester
|
||||
---
|
||||
kind: pipeline
|
||||
name: arm64_gcc10
|
||||
|
||||
platform:
|
||||
os: linux
|
||||
arch: arm64
|
||||
|
||||
steps:
|
||||
- name: Build and Test
|
||||
image: ubuntu:20.04
|
||||
environment:
|
||||
CC: gcc-10
|
||||
FC: gfortran-10
|
||||
COMMON_FLAGS: 'TARGET=ARMV8 DYNAMIC_ARCH=1'
|
||||
commands:
|
||||
- echo "MAKE_FLAGS:= $COMMON_FLAGS"
|
||||
- apt-get update -y
|
||||
- apt-get install -y make $CC gfortran-10 perl python g++
|
||||
- $CC --version
|
||||
- make QUIET_MAKE=1 $COMMON_FLAGS
|
||||
- make -C utest $COMMON_FLAGS
|
||||
- make -C test $COMMON_FLAGS
|
||||
|
||||
|
|
|
@ -44,6 +44,11 @@ jobs:
|
|||
if: github.event_name != 'pull_request'
|
||||
run: brew update || true
|
||||
|
||||
- name: unlink installed gcc to allow updating
|
||||
run: |
|
||||
brew unlink gcc@8
|
||||
brew unlink gcc@9
|
||||
|
||||
- name: Install prerequisites
|
||||
run: brew install --fetch-HEAD --HEAD --only-dependencies --keep-tmp openblas
|
||||
|
||||
|
|
|
@ -89,5 +89,7 @@ build.*
|
|||
*.swp
|
||||
benchmark/*.goto
|
||||
benchmark/smallscaling
|
||||
.vscode
|
||||
CMakeCache.txt
|
||||
CMakeFiles/*
|
||||
.vscode
|
||||
|
|
|
@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5)
|
|||
project(OpenBLAS C ASM)
|
||||
set(OpenBLAS_MAJOR_VERSION 0)
|
||||
set(OpenBLAS_MINOR_VERSION 3)
|
||||
set(OpenBLAS_PATCH_VERSION 13)
|
||||
set(OpenBLAS_PATCH_VERSION 14)
|
||||
set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
|
||||
|
||||
# Adhere to GNU filesystem layout conventions
|
||||
|
@ -14,6 +14,9 @@ include(GNUInstallDirs)
|
|||
|
||||
include(CMakePackageConfigHelpers)
|
||||
|
||||
if(MSVC AND NOT DEFINED NOFORTRAN)
|
||||
set(NOFORTRAN ON)
|
||||
endif()
|
||||
|
||||
#######
|
||||
if(MSVC)
|
||||
|
@ -229,7 +232,7 @@ if (NOT NO_CBLAS)
|
|||
add_subdirectory(utest)
|
||||
endif()
|
||||
|
||||
if (NOT MSVC AND NOT NOFORTRAN)
|
||||
if (NOT NOFORTRAN)
|
||||
# Build test and ctest
|
||||
add_subdirectory(test)
|
||||
if(NOT NO_CBLAS)
|
||||
|
|
|
@ -1,4 +1,52 @@
|
|||
OpenBLAS ChangeLog
|
||||
====================================================================
|
||||
Version 0.3.14
|
||||
17-Mar-2021
|
||||
|
||||
common:
|
||||
* Fixed a race condition on thread shutdown in non-OpenMP builds
|
||||
* Fixed custom BUFFERSIZE option getting ignored in gmake builds
|
||||
* Fixed CMake compilation of the TRMM kernels for GENERIC platforms
|
||||
* Added CBLAS interfaces for CROTG, ZROTG, CSROT and ZDROT
|
||||
* Improved performance of OMATCOPY_RT across all platforms
|
||||
* Changed perl scripts to use env instead of a hardcoded /usr/bin/perl
|
||||
* Fixed potential misreading of the GCC compiler version in the build scripts
|
||||
* Fixed convergence problems in LAPACK complex GGEV/GGES (Reference-LAPACK #477)
|
||||
* Reduced the stacksize requirements for running the LAPACK testsuite (Reference-LAPACK #335)
|
||||
|
||||
RISCV:
|
||||
* Fixed compilation on RISCV (missing entry in getarch)
|
||||
|
||||
POWER:
|
||||
* Fixed compilation for DYNAMIC_ARCH with clang and with old gcc versions
|
||||
* Added support for compilation on FreeBSD/ppc64le
|
||||
* Added optimized POWER10 kernels for SSCAL, DSCAL, CSCAL, ZSCAL
|
||||
* Added optimized POWER10 kernels for SROT, DROT, CDOT, SASUM, DASUM
|
||||
* Improved SSWAP, DSWAP, CSWAP, ZSWAP performance on POWER10
|
||||
* Improved SCOPY and CCOPY performance on POWER10
|
||||
* Improved SGEMM and DGEMM performance on POWER10
|
||||
* Added support for compilation with the NVIDIA HPC compiler
|
||||
|
||||
x86_64:
|
||||
* Added an optimized bfloat16 GEMM kernel for Cooperlake
|
||||
* Added CPUID autodetection for Intel Rocket Lake and Tiger Lake CPUs
|
||||
* Improved the performance of SASUM, DASUM, SROT, DROT on AMD Ryzen CPUs
|
||||
* Added support for compilation with the NAG Fortran compiler
|
||||
* Fixed recognition of the AMD AOCC compiler
|
||||
* Fixed compilation for DYNAMIC_ARCH with clang on Windows
|
||||
* Added support for running the BLAS/CBLAS tests on Windows
|
||||
* Fixed signatures of the tls callback functions for Windows x64
|
||||
* Fixed various issues with fma intrinsics support handling
|
||||
|
||||
ARM:
|
||||
* Added support for embedded Cortex M targets via a new option EMBEDDED
|
||||
|
||||
ARMV8:
|
||||
* Fixed the THUNDERX2T99 and NEOVERSEN1 DNRM2/ZNRM2 kernels for inputs with Inf
|
||||
* Added support for the DYNAMIC_LIST option
|
||||
* Added support for compilation with the NVIDIA HPC compiler
|
||||
* Added support for compiling with the NAG Fortran compiler
|
||||
|
||||
====================================================================
|
||||
Version 0.3.13
|
||||
12-Dec-2020
|
||||
|
|
6
Makefile
6
Makefile
|
@ -59,6 +59,9 @@ endif
|
|||
@$(CC) --version > /dev/null 2>&1;\
|
||||
if [ $$? -eq 0 ]; then \
|
||||
cverinfo=`$(CC) --version | sed -n '1p'`; \
|
||||
if [ -z "$${cverinfo}" ]; then \
|
||||
cverinfo=`$(CC) --version | sed -n '2p'`; \
|
||||
fi; \
|
||||
echo " C compiler ... $(C_COMPILER) (cmd & version : $${cverinfo})";\
|
||||
else \
|
||||
echo " C compiler ... $(C_COMPILER) (command line : $(CC))";\
|
||||
|
@ -67,6 +70,9 @@ ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
|
|||
@$(FC) --version > /dev/null 2>&1;\
|
||||
if [ $$? -eq 0 ]; then \
|
||||
fverinfo=`$(FC) --version | sed -n '1p'`; \
|
||||
if [ -z "$${fverinfo}" ]; then \
|
||||
fverinfo=`$(FC) --version | sed -n '2p'`; \
|
||||
fi; \
|
||||
echo " Fortran compiler ... $(F_COMPILER) (cmd & version : $${fverinfo})";\
|
||||
else \
|
||||
echo " Fortran compiler ... $(F_COMPILER) (command line : $(FC))";\
|
||||
|
|
|
@ -1,28 +1,38 @@
|
|||
|
||||
ifneq ($(C_COMPILER), PGI)
|
||||
ifeq ($(CORE), ARMV8)
|
||||
CCOMMON_OPT += -march=armv8-a
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8-a
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), CORTEXA53)
|
||||
CCOMMON_OPT += -march=armv8-a -mtune=cortex-a53
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8-a -mtune=cortex-a53
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), CORTEXA57)
|
||||
CCOMMON_OPT += -march=armv8-a -mtune=cortex-a57
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8-a -mtune=cortex-a57
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), CORTEXA72)
|
||||
CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), CORTEXA73)
|
||||
CCOMMON_OPT += -march=armv8-a -mtune=cortex-a73
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8-a -mtune=cortex-a73
|
||||
endif
|
||||
endif
|
||||
|
||||
# Use a72 tunings because Neoverse-N1 is only available
|
||||
# in GCC>=9
|
||||
|
@ -30,51 +40,71 @@ ifeq ($(CORE), NEOVERSEN1)
|
|||
ifeq ($(GCCVERSIONGTEQ7), 1)
|
||||
ifeq ($(GCCVERSIONGTEQ9), 1)
|
||||
CCOMMON_OPT += -march=armv8.2-a -mtune=neoverse-n1
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8.2-a -mtune=neoverse-n1
|
||||
endif
|
||||
else
|
||||
CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72
|
||||
endif
|
||||
endif
|
||||
else
|
||||
CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), THUNDERX)
|
||||
CCOMMON_OPT += -march=armv8-a -mtune=thunderx
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8-a -mtune=thunderx
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), FALKOR)
|
||||
CCOMMON_OPT += -march=armv8-a -mtune=falkor
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8-a -mtune=falkor
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), THUNDERX2T99)
|
||||
CCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), THUNDERX3T110)
|
||||
ifeq ($(GCCVERSIONGTEQ10), 1)
|
||||
CCOMMON_OPT += -march=armv8.3-a -mtune=thunderx3t110
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8.3-a -mtune=thunderx3t110
|
||||
endif
|
||||
else
|
||||
CCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), VORTEX)
|
||||
CCOMMON_OPT += -march=armv8.3-a
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8.3-a
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(GCCVERSIONGTEQ9), 1)
|
||||
ifeq ($(CORE), TSV110)
|
||||
CCOMMON_OPT += -march=armv8.2-a -mtune=tsv110
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=armv8.2-a -mtune=tsv110
|
||||
endif
|
||||
endif
|
||||
|
||||
endif
|
||||
endif
|
||||
|
|
|
@ -10,9 +10,11 @@ USE_OPENMP = 1
|
|||
endif
|
||||
|
||||
ifeq ($(CORE), POWER10)
|
||||
ifneq ($(C_COMPILER), PGI)
|
||||
CCOMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math
|
||||
FCOMMON_OPT += -O2 -frecursive -mcpu=power10 -mtune=power10 -fno-fast-math
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), POWER9)
|
||||
ifneq ($(C_COMPILER), PGI)
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
#
|
||||
|
||||
# This library's version
|
||||
VERSION = 0.3.13
|
||||
VERSION = 0.3.14
|
||||
|
||||
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
|
||||
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
|
||||
|
|
|
@ -21,6 +21,8 @@ ifeq ($(ARCH), amd64)
|
|||
override ARCH=x86_64
|
||||
else ifeq ($(ARCH), powerpc64)
|
||||
override ARCH=power
|
||||
else ifeq ($(ARCH), powerpc64le)
|
||||
override ARCH=power
|
||||
else ifeq ($(ARCH), powerpc)
|
||||
override ARCH=power
|
||||
else ifeq ($(ARCH), i386)
|
||||
|
@ -181,7 +183,7 @@ endif
|
|||
|
||||
# On x86_64 build getarch with -march=native unless the compiler is PGI or NVIDIA HPC (pgcc/nvc). This is required to detect AVX512 support in getarch.
|
||||
ifeq ($(HOSTARCH), x86_64)
|
||||
ifeq ($(findstring pgcc,$(HOSTCC)),)
|
||||
ifeq ($(findstring pgcc,$(HOSTCC))$(findstring nvc,$(HOSTCC)),)
|
||||
GETARCH_FLAGS += -march=native
|
||||
endif
|
||||
endif
|
||||
|
@ -623,6 +625,11 @@ DYNAMIC_CORE += THUNDERX2T99
|
|||
DYNAMIC_CORE += TSV110
|
||||
DYNAMIC_CORE += EMAG8180
|
||||
DYNAMIC_CORE += THUNDERX3T110
|
||||
ifdef DYNAMIC_LIST
|
||||
override DYNAMIC_CORE = ARMV8 $(DYNAMIC_LIST)
|
||||
XCCOMMON_OPT = -DDYNAMIC_LIST -DDYN_ARMV8
|
||||
XCCOMMON_OPT += $(foreach dcore,$(DYNAMIC_LIST),-DDYN_$(dcore))
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(ARCH), mips64)
|
||||
|
@ -663,6 +670,7 @@ endif
|
|||
endif # ARCH zarch
|
||||
|
||||
ifeq ($(ARCH), power)
|
||||
ifneq ($(C_COMPILER), PGI)
|
||||
DYNAMIC_CORE = POWER6
|
||||
DYNAMIC_CORE += POWER8
|
||||
ifneq ($(C_COMPILER), GCC)
|
||||
|
@ -689,6 +697,10 @@ else
|
|||
$(info, OpenBLAS: Your gcc version is too old to build the POWER10 kernels.)
|
||||
endif
|
||||
endif
|
||||
else
|
||||
DYNAMIC_CORE = POWER8
|
||||
DYNAMIC_CORE += POWER9
|
||||
endif
|
||||
endif
|
||||
|
||||
# If DYNAMIC_CORE is not set, DYNAMIC_ARCH cannot do anything, so force it to empty
|
||||
|
@ -847,9 +859,19 @@ endif
|
|||
endif
|
||||
|
||||
ifeq ($(C_COMPILER), PGI)
|
||||
PGCVERSIONGT20 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -d "." -f 1` \> 20)
|
||||
PGCVERSIONGTEQ20 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -d "." -f 1` \>= 20)
|
||||
PGCMINORVERSIONGE11 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -c 4-5` == 11)
|
||||
# Three-digit key tested against "110 111 011" right below:
#   digit 1 = major version  > 20
#   digit 2 = major version >= 20
#   digit 3 = minor version == 11
# The middle digit has to come from PGCVERSIONGTEQ20 (the variable computed
# above). PGCVERSIONEQ20 is not defined in this section, so referencing it
# yielded a two-digit key that could never match and NEWPGI was never set.
PGCVERSIONCHECK := $(PGCVERSIONGT20)$(PGCVERSIONGTEQ20)$(PGCMINORVERSIONGE11)
|
||||
ifeq ($(PGCVERSIONCHECK), $(filter $(PGCVERSIONCHECK), 110 111 011))
|
||||
NEWPGI := 1
|
||||
endif
|
||||
ifdef BINARY64
|
||||
ifeq ($(ARCH), x86_64)
|
||||
CCOMMON_OPT += -tp p7-64 -D__MMX__ -Mnollvm
|
||||
CCOMMON_OPT += -tp p7-64
|
||||
ifneq ($(NEWPGI),1)
|
||||
CCOMMON_OPT += -D__MMX__ -Mnollvm
|
||||
endif
|
||||
else
|
||||
ifeq ($(ARCH), power)
|
||||
ifeq ($(CORE), POWER8)
|
||||
|
@ -877,13 +899,25 @@ endif
|
|||
# Fortran Compiler dependent settings
|
||||
#
|
||||
|
||||
ifeq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -dcfuns -recursive -ieee=full -w=obs -thread_safe
|
||||
ifdef INTERFACE64
|
||||
ifneq ($(INTERFACE64), 0)
|
||||
FCOMMON_OPT += -i8
|
||||
endif
|
||||
endif
|
||||
ifeq ($(USE_OPENMP), 1)
|
||||
FCOMMON_OPT += -openmp
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(F_COMPILER), FLANG)
|
||||
CCOMMON_OPT += -DF_INTERFACE_FLANG
|
||||
FCOMMON_OPT += -Mrecursive -Kieee
|
||||
ifeq ($(OSNAME), Linux)
|
||||
ifeq ($(ARCH), x86_64)
|
||||
FLANG_VENDOR := $(shell `$(FC) --version|cut -f 1 -d "."|head -1`)
|
||||
ifeq ($(FLANG_VENDOR),AOCC)
|
||||
FLANG_VENDOR := $(shell $(FC) --version|head -1 |cut -f 1 -d " ")
|
||||
ifeq ($(FLANG_VENDOR), AMD)
|
||||
FCOMMON_OPT += -fno-unroll-loops
|
||||
endif
|
||||
endif
|
||||
|
@ -1029,18 +1063,24 @@ ifeq ($(ARCH), x86_64)
|
|||
FCOMMON_OPT += -tp p7-64
|
||||
else
|
||||
ifeq ($(ARCH), power)
|
||||
ifeq ($(CORE), POWER6)
|
||||
$(warning NVIDIA HPC compilers do not support POWER6.)
|
||||
endif
|
||||
ifeq ($(CORE), POWER8)
|
||||
FCOMMON_OPT += -tp pwr8
|
||||
endif
|
||||
ifeq ($(CORE), POWER9)
|
||||
FCOMMON_OPT += -tp pwr9
|
||||
endif
|
||||
ifeq ($(CORE), POWER10)
|
||||
$(warning NVIDIA HPC compilers do not support POWER10.)
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
else
|
||||
FCOMMON_OPT += -tp p7
|
||||
endif
|
||||
FCOMMON_OPT += -Mrecursive
|
||||
FCOMMON_OPT += -Mrecursive -Kieee
|
||||
ifeq ($(USE_OPENMP), 1)
|
||||
FCOMMON_OPT += -mp
|
||||
endif
|
||||
|
@ -1179,6 +1219,8 @@ CCOMMON_OPT += -fPIC
|
|||
endif
|
||||
ifeq ($(F_COMPILER), SUN)
|
||||
FCOMMON_OPT += -pic
|
||||
else ifeq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -PIC
|
||||
else
|
||||
FCOMMON_OPT += -fPIC
|
||||
endif
|
||||
|
@ -1256,6 +1298,10 @@ CCOMMON_OPT += -DUSE_PAPI
|
|||
EXTRALIB += -lpapi -lperfctr
|
||||
endif
|
||||
|
||||
ifdef BUFFERSIZE
|
||||
CCOMMON_OPT += -DBUFFERSIZE=$(BUFFERSIZE)
|
||||
endif
|
||||
|
||||
ifdef DYNAMIC_THREADS
|
||||
CCOMMON_OPT += -DDYNAMIC_THREADS
|
||||
endif
|
||||
|
@ -1433,6 +1479,10 @@ LAPACK_FFLAGS := $(FFLAGS)
|
|||
LAPACK_FPFLAGS := $(FPFLAGS)
|
||||
endif
|
||||
|
||||
ifeq ($(F_COMPILER),NAG)
|
||||
LAPACK_FFLAGS := $(filter-out -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mskylake-avx512 ,$(FFLAGS))
|
||||
endif
|
||||
|
||||
LAPACK_CFLAGS = $(CFLAGS)
|
||||
LAPACK_CFLAGS += -DHAVE_LAPACK_CONFIG_H
|
||||
ifdef INTERFACE64
|
||||
|
|
|
@ -10,40 +10,46 @@ endif
|
|||
|
||||
ifdef HAVE_SSE3
|
||||
CCOMMON_OPT += -msse3
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -msse3
|
||||
endif
|
||||
endif
|
||||
ifdef HAVE_SSSE3
|
||||
CCOMMON_OPT += -mssse3
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -mssse3
|
||||
endif
|
||||
endif
|
||||
ifdef HAVE_SSE4_1
|
||||
CCOMMON_OPT += -msse4.1
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -msse4.1
|
||||
endif
|
||||
endif
|
||||
ifndef OLDGCC
|
||||
ifdef HAVE_AVX
|
||||
CCOMMON_OPT += -mavx
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -mavx
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
ifndef NO_AVX2
|
||||
ifdef HAVE_AVX2
|
||||
CCOMMON_OPT += -mavx2
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -mavx2
|
||||
endif
|
||||
endif
|
||||
ifndef OLDGCC
|
||||
ifdef HAVE_FMA3
|
||||
CCOMMON_OPT += -mfma
|
||||
FCOMMON_OPT += -mfma
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), SKYLAKEX)
|
||||
ifndef DYNAMIC_ARCH
|
||||
ifndef NO_AVX512
|
||||
CCOMMON_OPT += -march=skylake-avx512
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=skylake-avx512
|
||||
endif
|
||||
ifeq ($(OSNAME), CYGWIN_NT)
|
||||
CCOMMON_OPT += -fno-asynchronous-unwind-tables
|
||||
FCOMMON_OPT += -fno-asynchronous-unwind-tables
|
||||
|
@ -65,9 +71,11 @@ ifeq ($(C_COMPILER), GCC)
|
|||
# cooperlake support was added in 10.1
|
||||
ifeq ($(GCCVERSIONGTEQ10)$(GCCMINORVERSIONGTEQ1), 11)
|
||||
CCOMMON_OPT += -march=cooperlake
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=cooperlake
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
ifeq ($(OSNAME), CYGWIN_NT)
|
||||
CCOMMON_OPT += -fno-asynchronous-unwind-tables
|
||||
FCOMMON_OPT += -fno-asynchronous-unwind-tables
|
||||
|
|
|
@ -13,10 +13,14 @@ Drone CI: [ library based on GotoBLAS2 1.13 BSD version.
|
||||
|
||||
Please read the documentation on the OpenBLAS wiki pages: <https://github.com/xianyi/OpenBLAS/wiki>.
|
||||
|
||||
For a general introduction to the BLAS routines, please refer to the extensive documentation of their reference implementation hosted at netlib:
|
||||
<https://www.netlib.org/blas>. On that site you will likewise find documentation for the reference implementation of the higher-level library LAPACK - the **L**inear **A**lgebra **Pack**age that comes included with OpenBLAS. If you are looking for a general primer or refresher on Linear Algebra, the set of six
|
||||
20-minute lecture videos by Prof. Gilbert Strang on either MIT OpenCourseWare <https://ocw.mit.edu/resources/res-18-010-a-2020-vision-of-linear-algebra-spring-2020/> or YouTube <https://www.youtube.com/playlist?list=PLUl4u3cNGP61iQEFiWLE21EJCxwmWvvek> may be helpful.
|
||||
|
||||
## Binary Packages
|
||||
|
||||
We provide official binary packages for the following platform:
|
||||
|
@ -208,7 +212,8 @@ Please note that it is not possible to combine support for different architectur
|
|||
- **Android**: Supported by the community. Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-build-OpenBLAS-for-Android>.
|
||||
- **AIX**: Supported on PPC up to POWER8
|
||||
- **Haiku**: Supported by the community. We don't actively test the library on this OS.
|
||||
- **SunOS**: Supported by the community. We don't actively test the library on this OS:
|
||||
- **SunOS**: Supported by the community. We don't actively test the library on this OS.
|
||||
- **Cortex-M**: Supported by the community. Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-on-Cortex-M>.
|
||||
|
||||
## Usage
|
||||
|
||||
|
|
20
appveyor.yml
20
appveyor.yml
|
@ -30,10 +30,10 @@ environment:
|
|||
CONDA_INSTALL_LOCN: C:\\Miniconda36-x64
|
||||
matrix:
|
||||
- COMPILER: clang-cl
|
||||
WITH_FORTRAN: yes
|
||||
WITH_FORTRAN: ON
|
||||
- COMPILER: clang-cl
|
||||
DYNAMIC_ARCH: ON
|
||||
WITH_FORTRAN: no
|
||||
WITH_FORTRAN: OFF
|
||||
- COMPILER: cl
|
||||
- COMPILER: MinGW64-gcc-7.2.0-mingw
|
||||
DYNAMIC_ARCH: OFF
|
||||
|
@ -47,12 +47,7 @@ environment:
|
|||
install:
|
||||
- if [%COMPILER%]==[clang-cl] call %CONDA_INSTALL_LOCN%\Scripts\activate.bat
|
||||
- if [%COMPILER%]==[clang-cl] conda config --add channels conda-forge --force
|
||||
- if [%COMPILER%]==[clang-cl] conda install --yes --quiet clangdev cmake
|
||||
|
||||
- if [%WITH_FORTRAN%]==[no] conda install --yes --quiet ninja
|
||||
- if [%WITH_FORTRAN%]==[yes] conda install --yes --quiet -c isuruf kitware-ninja
|
||||
- if [%WITH_FORTRAN%]==[yes] conda install --yes --quiet flang
|
||||
|
||||
- if [%COMPILER%]==[clang-cl] conda install --yes --quiet clangdev cmake ninja flang=11.0.1
|
||||
- if [%COMPILER%]==[clang-cl] call "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvarsall.bat" x64
|
||||
- if [%COMPILER%]==[clang-cl] set "LIB=%CONDA_INSTALL_LOCN%\Library\lib;%LIB%"
|
||||
- if [%COMPILER%]==[clang-cl] set "CPATH=%CONDA_INSTALL_LOCN%\Library\include;%CPATH%"
|
||||
|
@ -68,15 +63,14 @@ before_build:
|
|||
- if [%COMPILER%]==[MinGW64-gcc-7.2.0-mingw] cmake -G "MinGW Makefiles" -DNOFORTRAN=1 ..
|
||||
- if [%COMPILER%]==[MinGW-gcc-6.3.0-32] cmake -G "MSYS Makefiles" -DNOFORTRAN=1 ..
|
||||
- if [%COMPILER%]==[MinGW-gcc-5.3.0] cmake -G "MSYS Makefiles" -DNOFORTRAN=1 ..
|
||||
- if [%WITH_FORTRAN%]==[no] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DMSVC_STATIC_CRT=ON ..
|
||||
- if [%WITH_FORTRAN%]==[yes] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DBUILD_WITHOUT_LAPACK=no -DNOFORTRAN=0 ..
|
||||
- if [%WITH_FORTRAN%]==[OFF] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DMSVC_STATIC_CRT=ON ..
|
||||
- if [%WITH_FORTRAN%]==[ON] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DBUILD_WITHOUT_LAPACK=no -DNOFORTRAN=0 ..
|
||||
- if [%USE_OPENMP%]==[ON] cmake -DUSE_OPENMP=ON ..
|
||||
- if [%DYNAMIC_ARCH%]==[ON] cmake -DDYNAMIC_ARCH=ON -DDYNAMIC_LIST='CORE2;NEHALEM;SANDYBRIDGE;BULLDOZER;HASWELL' ..
|
||||
|
||||
build_script:
|
||||
- cmake --build .
|
||||
|
||||
test_script:
|
||||
- echo Running Test
|
||||
- cd utest
|
||||
- openblas_utest
|
||||
- ctest -j2
|
||||
|
||||
|
|
|
@ -74,6 +74,9 @@ static void *huge_malloc(BLASLONG size){
|
|||
|
||||
#if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS)
|
||||
struct timeval start, stop;
|
||||
#elif defined(__APPLE__)
|
||||
mach_timebase_info_data_t info;
|
||||
uint64_t start = 0, stop = 0;
|
||||
#else
|
||||
struct timespec start = { 0, 0 }, stop = { 0, 0 };
|
||||
#endif
|
||||
|
@ -82,6 +85,9 @@ double getsec()
|
|||
{
|
||||
#if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS)
|
||||
return (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
|
||||
#elif defined(__APPLE__)
|
||||
mach_timebase_info(&info);
|
||||
return (double)(((stop - start) * info.numer)/info.denom) * 1.e-9;
|
||||
#else
|
||||
return (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_nsec - start.tv_nsec)) * 1.e-9;
|
||||
#endif
|
||||
|
@ -90,6 +96,8 @@ double getsec()
|
|||
void begin() {
|
||||
#if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS)
|
||||
gettimeofday( &start, (struct timezone *)0);
|
||||
#elif defined(__APPLE__)
|
||||
start = clock_gettime_nsec_np(CLOCK_UPTIME_RAW);
|
||||
#else
|
||||
clock_gettime(CLOCK_REALTIME, &start);
|
||||
#endif
|
||||
|
@ -98,6 +106,8 @@ void begin() {
|
|||
void end() {
|
||||
#if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS)
|
||||
gettimeofday( &stop, (struct timezone *)0);
|
||||
#elif defined(__APPLE__)
|
||||
stop = clock_gettime_nsec_np(CLOCK_UPTIME_RAW);
|
||||
#else
|
||||
clock_gettime(CLOCK_REALTIME, &stop);
|
||||
#endif
|
||||
|
|
4
c_check
4
c_check
|
@ -1,11 +1,11 @@
|
|||
#!/usr/bin/perl
|
||||
#!/usr/bin/env perl
|
||||
|
||||
#use File::Basename;
|
||||
# use File::Temp qw(tempfile);
|
||||
|
||||
# Checking cross compile
|
||||
$hostos = `uname -s | sed -e s/\-.*//`; chop($hostos);
|
||||
$hostarch = `uname -m | sed -e s/i.86/x86/`;chop($hostarch);
|
||||
$hostarch = `uname -m | sed -e s/i.86/x86/`;
|
||||
$hostarch = `uname -p` if ($hostos eq "AIX" || $hostos eq "SunOS");
|
||||
chop($hostarch);
|
||||
$hostarch = "x86_64" if ($hostarch eq "amd64");
|
||||
|
|
5
cblas.h
5
cblas.h
|
@ -125,9 +125,14 @@ void cblas_zswap(OPENBLAS_CONST blasint n, void *x, OPENBLAS_CONST blasint incx,
|
|||
|
||||
void cblas_srot(OPENBLAS_CONST blasint N, float *X, OPENBLAS_CONST blasint incX, float *Y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST float c, OPENBLAS_CONST float s);
|
||||
void cblas_drot(OPENBLAS_CONST blasint N, double *X, OPENBLAS_CONST blasint incX, double *Y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST double c, OPENBLAS_CONST double s);
|
||||
void cblas_csrot(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST float c, OPENBLAS_CONST float s);
|
||||
void cblas_zdrot(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST double c, OPENBLAS_CONST double s);
|
||||
|
||||
void cblas_srotg(float *a, float *b, float *c, float *s);
|
||||
void cblas_drotg(double *a, double *b, double *c, double *s);
|
||||
void cblas_crotg(void *a, void *b, float *c, void *s);
|
||||
void cblas_zrotg(void *a, void *b, double *c, void *s);
|
||||
|
||||
|
||||
void cblas_srotm(OPENBLAS_CONST blasint N, float *X, OPENBLAS_CONST blasint incX, float *Y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST float *P);
|
||||
void cblas_drotm(OPENBLAS_CONST blasint N, double *X, OPENBLAS_CONST blasint incX, double *Y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST double *P);
|
||||
|
|
|
@ -45,6 +45,9 @@ endif ()
|
|||
if (DYNAMIC_ARCH)
|
||||
if (ARM64)
|
||||
set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110)
|
||||
if (DYNAMIC_LIST)
|
||||
set(DYNAMIC_CORE ARMV8 ${DYNAMIC_LIST})
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
if (POWER)
|
||||
|
|
|
@ -2499,6 +2499,5 @@ foreach (Utils_FILE ${Utils_SRC})
|
|||
endforeach ()
|
||||
|
||||
set(lapacke_include_dir "${NETLIB_LAPACK_DIR}/LAPACKE/include")
|
||||
configure_file("${lapacke_include_dir}/lapacke_mangling_with_flags.h.in" "${lapacke_include_dir}/lapacke_mangling.h" COPYONLY)
|
||||
include_directories(${lapacke_include_dir})
|
||||
set_source_files_properties(${LAPACKE_SOURCES} PROPERTIES COMPILE_FLAGS "${LAPACK_CFLAGS}")
|
||||
|
|
|
@ -148,16 +148,20 @@ endif ()
|
|||
include("${PROJECT_SOURCE_DIR}/cmake/prebuild.cmake")
|
||||
if (DEFINED TARGET)
|
||||
if (${TARGET} STREQUAL COOPERLAKE AND NOT NO_AVX512)
|
||||
# if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU")
|
||||
if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU")
|
||||
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
|
||||
if (${GCC_VERSION} VERSION_GREATER 10.1 OR ${GCC_VERSION} VERSION_EQUAL 10.1)
|
||||
if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 10.09)
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=cooperlake")
|
||||
else()
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512")
|
||||
endif()
|
||||
# elseif (${CMAKE_C_COMPILER_ID} STREQUAL "CLANG")
|
||||
# set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2")
|
||||
# endif()
|
||||
elseif (${CMAKE_C_COMPILER_ID} STREQUAL "Clang" OR ${CMAKE_C_COMPILER_ID} STREQUAL "AppleClang")
|
||||
if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 8.99)
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=cooperlake")
|
||||
else()
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512")
|
||||
endif()
|
||||
endif()
|
||||
endif()
|
||||
if (${TARGET} STREQUAL SKYLAKEX AND NOT NO_AVX512)
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512")
|
||||
|
@ -233,6 +237,11 @@ if (BINARY64)
|
|||
endif ()
|
||||
endif ()
|
||||
|
||||
if(EMBEDDED)
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DOS_EMBEDDED")
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -mthumb -mcpu=cortex-m4 -mfloat-abi=hard -mfpu=fpv4-sp-d16")
|
||||
endif()
|
||||
|
||||
if (NEED_PIC)
|
||||
if (${CMAKE_C_COMPILER} STREQUAL "IBM")
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -qpic=large")
|
||||
|
|
|
@ -74,6 +74,9 @@ macro(ParseMakefileVars MAKEFILE_IN)
|
|||
string(REGEX MATCH "ifneq \\(\\$\\(([_A-Z]+)\\),[ \t]*([0-9_A-Z]+)\\)" line_match "${makefile_line}")
|
||||
if (NOT "${line_match}" STREQUAL "")
|
||||
# message(STATUS "IFNEQ: ${line_match} first: ${CMAKE_MATCH_1} second: ${CMAKE_MATCH_2}")
|
||||
if ( ${CMAKE_MATCH_1} STREQUAL C_COMPILER)
|
||||
set (CMAKE_MATCH_1 CMAKE_C_COMPILER)
|
||||
endif ()
|
||||
if (NOT ( ${${CMAKE_MATCH_1}} STREQUAL ${CMAKE_MATCH_2}))
|
||||
# message (STATUS "condition is true")
|
||||
set (IfElse 1)
|
||||
|
|
13
common.h
13
common.h
|
@ -122,7 +122,7 @@ extern "C" {
|
|||
#define ATOM GOTO_ATOM
|
||||
#undef GOTO_ATOM
|
||||
#endif
|
||||
#else
|
||||
#elif !defined(OS_EMBEDDED)
|
||||
#include <sys/mman.h>
|
||||
#ifndef NO_SYSV_IPC
|
||||
#include <sys/shm.h>
|
||||
|
@ -134,6 +134,9 @@ extern "C" {
|
|||
#if defined(SMP) || defined(USE_LOCKING)
|
||||
#include <pthread.h>
|
||||
#endif
|
||||
#else
|
||||
#include <time.h>
|
||||
#include <math.h>
|
||||
#endif
|
||||
|
||||
#if defined(OS_SUNOS)
|
||||
|
@ -488,10 +491,12 @@ static inline unsigned long long rpcc(void){
|
|||
struct timespec ts;
|
||||
clock_gettime(CLOCK_MONOTONIC, &ts);
|
||||
return (unsigned long long)ts.tv_sec * 1000000000ull + ts.tv_nsec;
|
||||
#else
|
||||
#elif !defined(OS_EMBEDDED)
|
||||
struct timeval tv;
|
||||
gettimeofday(&tv,NULL);
|
||||
return (unsigned long long)tv.tv_sec * 1000000000ull + tv.tv_usec * 1000;
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
#define RPCC_DEFINED
|
||||
|
@ -521,6 +526,10 @@ static void __inline blas_lock(volatile BLASULONG *address){
|
|||
#include "common_linux.h"
|
||||
#endif
|
||||
|
||||
#ifdef OS_EMBEDDED
|
||||
#define DTB_DEFAULT_ENTRIES 64
|
||||
#endif
|
||||
|
||||
#define MMAP_ACCESS (PROT_READ | PROT_WRITE)
|
||||
|
||||
#ifdef __NetBSD__
|
||||
|
|
|
@ -39,7 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#define INLINE inline
|
||||
|
||||
#ifdef F_INTERFACE_FLANG
|
||||
#if defined( F_INTERFACE_FLANG) || defined(F_INTERFACE_PGI)
|
||||
#define RETURN_BY_STACK
|
||||
#else
|
||||
#define RETURN_BY_COMPLEX
|
||||
|
|
41
cpuid_x86.c
41
cpuid_x86.c
|
@ -1418,6 +1418,15 @@ int get_cpuname(void){
|
|||
case 9:
|
||||
case 8:
|
||||
switch (model) {
|
||||
case 12: // Tiger Lake
|
||||
if(support_avx512())
|
||||
return CPUTYPE_SKYLAKEX;
|
||||
if(support_avx2())
|
||||
return CPUTYPE_HASWELL;
|
||||
if(support_avx())
|
||||
return CPUTYPE_SANDYBRIDGE;
|
||||
else
|
||||
return CPUTYPE_NEHALEM;
|
||||
case 14: // Kaby Lake and refreshes
|
||||
if(support_avx2())
|
||||
return CPUTYPE_HASWELL;
|
||||
|
@ -1436,6 +1445,15 @@ int get_cpuname(void){
|
|||
return CPUTYPE_SANDYBRIDGE;
|
||||
else
|
||||
return CPUTYPE_NEHALEM;
|
||||
case 7: // Rocket Lake
|
||||
if(support_avx512())
|
||||
return CPUTYPE_SKYLAKEX;
|
||||
if(support_avx2())
|
||||
return CPUTYPE_HASWELL;
|
||||
if(support_avx())
|
||||
return CPUTYPE_SANDYBRIDGE;
|
||||
else
|
||||
return CPUTYPE_NEHALEM;
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
@ -2014,6 +2032,19 @@ int get_coretype(void){
|
|||
#endif
|
||||
else
|
||||
return CORE_NEHALEM;
|
||||
case 7:// Rocket Lake
|
||||
#ifndef NO_AVX512
|
||||
if(support_avx512())
|
||||
return CORE_SKYLAKEX;
|
||||
#endif
|
||||
#ifndef NO_AVX2
|
||||
if(support_avx2())
|
||||
return CORE_HASWELL;
|
||||
#endif
|
||||
if(support_avx())
|
||||
return CORE_SANDYBRIDGE;
|
||||
else
|
||||
return CORE_NEHALEM;
|
||||
}
|
||||
case 5:
|
||||
switch (model) {
|
||||
|
@ -2102,6 +2133,16 @@ int get_coretype(void){
|
|||
break;
|
||||
case 9:
|
||||
case 8:
|
||||
if (model == 12) { // Tiger Lake
  /* This branch lives in get_coretype(), so it must report CORE_* values;
     the CPUTYPE_* constants are what get_cpuname() returns. Pick the most
     capable core type the CPU supports, honouring the NO_AVX512 / NO_AVX2
     build switches the same way the Rocket Lake branch of this function
     does. */
#ifndef NO_AVX512
  if(support_avx512())
    return CORE_SKYLAKEX;
#endif
#ifndef NO_AVX2
  if(support_avx2())
    return CORE_HASWELL;
#endif
  if(support_avx())
    return CORE_SANDYBRIDGE;
  else
    return CORE_NEHALEM;
}
|
||||
if (model == 14) { // Kaby Lake
|
||||
if(support_avx())
|
||||
#ifndef NO_AVX2
|
||||
|
|
|
@ -5,9 +5,18 @@ enable_language(Fortran)
|
|||
|
||||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DADD${BU} -DCBLAS")
|
||||
|
||||
if(WIN32)
|
||||
FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_cblas_helper.ps1
|
||||
"$ErrorActionPreference = \"Stop\"\n"
|
||||
"Get-Content $args[1] | & $args[0]\n"
|
||||
)
|
||||
set(test_helper powershell -ExecutionPolicy Bypass "${CMAKE_CURRENT_BINARY_DIR}/test_cblas_helper.ps1")
|
||||
else()
|
||||
FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_cblas_helper.sh
|
||||
"$1 < $2\n"
|
||||
)
|
||||
set(test_helper sh "${CMAKE_CURRENT_BINARY_DIR}/test_cblas_helper.sh")
|
||||
endif()
|
||||
|
||||
foreach(float_type ${FLOAT_TYPES})
|
||||
string(SUBSTRING ${float_type} 0 1 float_char_upper)
|
||||
|
@ -21,7 +30,7 @@ foreach(float_type ${FLOAT_TYPES})
|
|||
c_${float_char}blas1.c)
|
||||
target_link_libraries(x${float_char}cblat1 ${OpenBLAS_LIBNAME})
|
||||
add_test(NAME "x${float_char}cblat1"
|
||||
COMMAND "${CMAKE_CURRENT_BINARY_DIR}/x${float_char}cblat1")
|
||||
COMMAND $<TARGET_FILE:x${float_char}cblat1>)
|
||||
|
||||
#level2
|
||||
add_executable(x${float_char}cblat2
|
||||
|
@ -33,7 +42,7 @@ foreach(float_type ${FLOAT_TYPES})
|
|||
constant.c)
|
||||
target_link_libraries(x${float_char}cblat2 ${OpenBLAS_LIBNAME})
|
||||
add_test(NAME "x${float_char}cblat2"
|
||||
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_cblas_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/x${float_char}cblat2" "${PROJECT_SOURCE_DIR}/ctest/${float_char}in2")
|
||||
COMMAND ${test_helper} $<TARGET_FILE:x${float_char}cblat2> "${PROJECT_SOURCE_DIR}/ctest/${float_char}in2")
|
||||
|
||||
#level3
|
||||
add_executable(x${float_char}cblat3
|
||||
|
@ -45,6 +54,6 @@ foreach(float_type ${FLOAT_TYPES})
|
|||
constant.c)
|
||||
target_link_libraries(x${float_char}cblat3 ${OpenBLAS_LIBNAME})
|
||||
add_test(NAME "x${float_char}cblat3"
|
||||
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_cblas_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/x${float_char}cblat3" "${PROJECT_SOURCE_DIR}/ctest/${float_char}in3")
|
||||
COMMAND ${test_helper} $<TARGET_FILE:x${float_char}cblat3> "${PROJECT_SOURCE_DIR}/ctest/${float_char}in3")
|
||||
|
||||
endforeach()
|
||||
|
|
|
@ -212,6 +212,9 @@ ifeq ($(C_COMPILER), CLANG)
|
|||
CEXTRALIB = -lomp
|
||||
endif
|
||||
endif
|
||||
ifeq ($(F_COMPILER), NAG)
|
||||
CEXTRALIB = -lgomp
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(BUILD_SINGLE),1)
|
||||
|
|
|
@ -1024,38 +1024,39 @@ int BLASFUNC(blas_thread_shutdown)(void){
|
|||
|
||||
int i;
|
||||
|
||||
if (!blas_server_avail) return 0;
|
||||
|
||||
LOCK_COMMAND(&server_lock);
|
||||
|
||||
for (i = 0; i < blas_num_threads - 1; i++) {
|
||||
if (blas_server_avail) {
|
||||
|
||||
for (i = 0; i < blas_num_threads - 1; i++) {
|
||||
|
||||
|
||||
pthread_mutex_lock (&thread_status[i].lock);
|
||||
pthread_mutex_lock (&thread_status[i].lock);
|
||||
|
||||
atomic_store_queue(&thread_status[i].queue, (blas_queue_t *)-1);
|
||||
thread_status[i].status = THREAD_STATUS_WAKEUP;
|
||||
pthread_cond_signal (&thread_status[i].wakeup);
|
||||
atomic_store_queue(&thread_status[i].queue, (blas_queue_t *)-1);
|
||||
thread_status[i].status = THREAD_STATUS_WAKEUP;
|
||||
pthread_cond_signal (&thread_status[i].wakeup);
|
||||
|
||||
pthread_mutex_unlock(&thread_status[i].lock);
|
||||
pthread_mutex_unlock(&thread_status[i].lock);
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
for(i = 0; i < blas_num_threads - 1; i++){
|
||||
pthread_join(blas_threads[i], NULL);
|
||||
}
|
||||
for(i = 0; i < blas_num_threads - 1; i++){
|
||||
pthread_join(blas_threads[i], NULL);
|
||||
}
|
||||
|
||||
for(i = 0; i < blas_num_threads - 1; i++){
|
||||
pthread_mutex_destroy(&thread_status[i].lock);
|
||||
pthread_cond_destroy (&thread_status[i].wakeup);
|
||||
}
|
||||
for(i = 0; i < blas_num_threads - 1; i++){
|
||||
pthread_mutex_destroy(&thread_status[i].lock);
|
||||
pthread_cond_destroy (&thread_status[i].wakeup);
|
||||
}
|
||||
|
||||
#ifdef NEED_STACKATTR
|
||||
pthread_attr_destory(&attr);
|
||||
pthread_attr_destroy(&attr);
|
||||
#endif
|
||||
|
||||
blas_server_avail = 0;
|
||||
blas_server_avail = 0;
|
||||
|
||||
}
|
||||
UNLOCK_COMMAND(&server_lock);
|
||||
|
||||
return 0;
|
||||
|
|
|
@ -644,6 +644,21 @@ static gotoblas_t *get_coretype(void){
|
|||
return NULL;
|
||||
case 9:
|
||||
case 8:
|
||||
if (model == 12) { // Tiger Lake
|
||||
if (support_avx512())
|
||||
return &gotoblas_SKYLAKEX;
|
||||
if(support_avx2()){
|
||||
openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK);
|
||||
return &gotoblas_HASWELL;
|
||||
}
|
||||
if(support_avx()) {
|
||||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
|
||||
return &gotoblas_SANDYBRIDGE;
|
||||
} else {
|
||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
||||
return &gotoblas_NEHALEM;
|
||||
}
|
||||
}
|
||||
if (model == 14 ) { // Kaby Lake, Coffee Lake
|
||||
if(support_avx2())
|
||||
return &gotoblas_HASWELL;
|
||||
|
@ -656,7 +671,7 @@ static gotoblas_t *get_coretype(void){
|
|||
}
|
||||
}
|
||||
case 10:
|
||||
if (model == 5 || model == 6) {
|
||||
if (model == 5 || model == 6) {
|
||||
if(support_avx2())
|
||||
return &gotoblas_HASWELL;
|
||||
if(support_avx()) {
|
||||
|
@ -666,7 +681,20 @@ static gotoblas_t *get_coretype(void){
|
|||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
||||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}
|
||||
}
|
||||
if (model == 7) {
|
||||
if (support_avx512())
|
||||
return &gotoblas_SKYLAKEX;
|
||||
if(support_avx2())
|
||||
return &gotoblas_HASWELL;
|
||||
if(support_avx()) {
|
||||
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
|
||||
return &gotoblas_SANDYBRIDGE;
|
||||
} else {
|
||||
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
|
||||
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
|
||||
}
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
case 0xf:
|
||||
|
|
|
@ -43,6 +43,63 @@
|
|||
#endif
|
||||
|
||||
extern gotoblas_t gotoblas_ARMV8;
|
||||
#ifdef DYNAMIC_LIST
|
||||
#ifdef DYN_CORTEXA53
|
||||
extern gotoblas_t gotoblas_CORTEXA53;
|
||||
#else
|
||||
#define gotoblas_CORTEXA53 gotoblas_ARMV8
|
||||
#endif
|
||||
#ifdef DYN_CORTEXA57
|
||||
extern gotoblas_t gotoblas_CORTEXA57;
|
||||
#else
|
||||
#define gotoblas_CORTEXA57 gotoblas_ARMV8
|
||||
#endif
|
||||
#ifdef DYN_CORTEXA72
|
||||
extern gotoblas_t gotoblas_CORTEXA72;
|
||||
#else
|
||||
#define gotoblas_CORTEXA72 gotoblas_ARMV8
|
||||
#endif
|
||||
#ifdef DYN_CORTEXA73
|
||||
extern gotoblas_t gotoblas_CORTEXA73;
|
||||
#else
|
||||
#define gotoblas_CORTEXA73 gotoblas_ARMV8
|
||||
#endif
|
||||
#ifdef DYN_FALKOR
|
||||
extern gotoblas_t gotoblas_FALKOR;
|
||||
#else
|
||||
#define gotoblas_FALKOR gotoblas_ARMV8
|
||||
#endif
|
||||
#ifdef DYN_TSV110
|
||||
extern gotoblas_t gotoblas_TSV110;
|
||||
#else
|
||||
#define gotoblas_TSV110 gotoblas_ARMV8
|
||||
#endif
|
||||
#ifdef DYN_THUNDERX
|
||||
extern gotoblas_t gotoblas_THUNDERX;
|
||||
#else
|
||||
#define gotoblas_THUNDERX gotoblas_ARMV8
|
||||
#endif
|
||||
#ifdef DYN_THUNDERX2T99
|
||||
extern gotoblas_t gotoblas_THUNDERX2T99;
|
||||
#else
|
||||
#define gotoblas_THUNDERX2T99 gotoblas_ARMV8
|
||||
#endif
|
||||
#ifdef DYN_THUNDERX3T110
|
||||
extern gotoblas_t gotoblas_THUNDERX3T110;
|
||||
#else
|
||||
#define gotoblas_THUNDERX3T110 gotoblas_ARMV8
|
||||
#endif
|
||||
#ifdef DYN_EMAG8180
|
||||
extern gotoblas_t gotoblas_EMAG8180;
|
||||
#else
|
||||
#define gotoblas_EMAG8180 gotoblas_ARMV8
|
||||
#endif
|
||||
#ifdef DYN_NEOVERSEN1
|
||||
extern gotoblas_t gotoblas_NEOVERSEN1;
|
||||
#else
|
||||
#define gotoblas_NEOVERSEN1 gotoblas_ARMV8
|
||||
#endif
|
||||
#else
|
||||
extern gotoblas_t gotoblas_CORTEXA53;
|
||||
extern gotoblas_t gotoblas_CORTEXA57;
|
||||
extern gotoblas_t gotoblas_CORTEXA72;
|
||||
|
@ -54,6 +111,7 @@ extern gotoblas_t gotoblas_TSV110;
|
|||
extern gotoblas_t gotoblas_EMAG8180;
|
||||
extern gotoblas_t gotoblas_NEOVERSEN1;
|
||||
extern gotoblas_t gotoblas_THUNDERX3T110;
|
||||
#endif
|
||||
|
||||
extern void openblas_warning(int verbose, const char * msg);
|
||||
|
||||
|
@ -68,7 +126,7 @@ extern void openblas_warning(int verbose, const char * msg);
|
|||
#endif
|
||||
|
||||
#define get_cpu_ftr(id, var) ({ \
|
||||
__asm__("mrs %0, "#id : "=r" (var)); \
|
||||
__asm__ ("mrs %0, "#id : "=r" (var)); \
|
||||
})
|
||||
|
||||
static char *corename[] = {
|
||||
|
|
|
@ -27,7 +27,9 @@ static char *corename[] = {
|
|||
#define NUM_CORETYPES 4
|
||||
|
||||
char *gotoblas_corename(void) {
|
||||
#ifndef C_PGI
|
||||
if (gotoblas == &gotoblas_POWER6) return corename[1];
|
||||
#endif
|
||||
if (gotoblas == &gotoblas_POWER8) return corename[2];
|
||||
#if (!defined __GNUC__) || ( __GNUC__ >= 6)
|
||||
if (gotoblas == &gotoblas_POWER9) return corename[3];
|
||||
|
@ -38,10 +40,164 @@ char *gotoblas_corename(void) {
|
|||
return corename[0];
|
||||
}
|
||||
|
||||
#if defined(__clang__)
|
||||
/* Stub compiled only under the enclosing `#if defined(__clang__)`: it ignores
 * `arg` and always returns 0, i.e. every queried feature is reported as
 * unsupported. */
static int __builtin_cpu_supports(char* arg)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(C_PGI) || defined(__clang__)
|
||||
/*
|
||||
* NV HPC compilers do not yet implement __builtin_cpu_is().
|
||||
* Fake a version here for use in the CPU detection code below.
|
||||
*
|
||||
* Strategy here is to first check the CPU to see what it actually is,
|
||||
* and then test the input to see if what the CPU actually is matches
|
||||
* what was requested.
|
||||
*/
|
||||
|
||||
#include <string.h>
|
||||
|
||||
/*
|
||||
* Define POWER processor version table.
|
||||
*
|
||||
* NOTE NV HPC SDK compilers only support POWER8 and POWER9 at this time
|
||||
*/
|
||||
|
||||
/* CPU type codes stored in the cpu_type column of pvrPOWER. */
#define CPU_UNKNOWN  0
#define CPU_POWER5   5
#define CPU_POWER6   6
#define CPU_POWER8   8
#define CPU_POWER9   9
#define CPU_POWER10 10

/*
 * Processor version register (PVR) lookup table.
 *
 * A row applies when (pvr & pvr_mask) == pvr_value. Row order matters for a
 * first-match scan: the two revision-specific POWER9 rows come before the
 * generic POWER9 row, and the final all-zero row matches any PVR value.
 * The POWER7 and POWER7+ rows carry CPU_POWER6 (no CPU_POWER7 code is
 * defined above).
 */
static struct {
  uint32_t    pvr_mask;
  uint32_t    pvr_value;
  const char *cpu_name;
  uint32_t    cpu_type;
} pvrPOWER [] = {
  /* pvr_mask    pvr_value   cpu_name            cpu_type */
  { 0xffffffff, 0x0f000001, "POWER5+",          CPU_POWER5  }, /* POWER6 in P5+ mode; 2.04-compliant processor */
  { 0xffff0000, 0x003e0000, "POWER6 (raw)",     CPU_POWER6  }, /* Power6 aka POWER6X */
  { 0xffff0000, 0x003f0000, "POWER7 (raw)",     CPU_POWER6  }, /* Power7 */
  { 0xffff0000, 0x004A0000, "POWER7+ (raw)",    CPU_POWER6  }, /* Power7+ */
  { 0xffff0000, 0x004b0000, "POWER8E (raw)",    CPU_POWER8  }, /* Power8E */
  { 0xffff0000, 0x004c0000, "POWER8NVL (raw)",  CPU_POWER8  }, /* Power8NVL */
  { 0xffff0000, 0x004d0000, "POWER8 (raw)",     CPU_POWER8  }, /* Power8 */
  { 0xffffefff, 0x004e0200, "POWER9 (raw)",     CPU_POWER9  }, /* Power9 DD2.0 */
  { 0xffffefff, 0x004e0201, "POWER9 (raw)",     CPU_POWER9  }, /* Power9 DD 2.1 */
  { 0xffff0000, 0x004e0000, "POWER9 (raw)",     CPU_POWER9  }, /* Power9 DD2.2 or later */
  { 0xffff0000, 0x00800000, "POWER10 (raw)",    CPU_POWER10 }, /* Power10 */
  { 0x0,        0x0,        "Unknown",          CPU_UNKNOWN }, /* End of table, pvr_mask and pvr_value must be zero */
};
|
||||
|
||||
static int __builtin_cpu_is(const char *cpu) {
|
||||
int i;
|
||||
uint32_t pvr;
|
||||
uint32_t cpu_type;
|
||||
|
||||
asm("mfpvr %0" : "=r"(pvr));
|
||||
|
||||
for (i = 0 ; i < sizeof pvrPOWER / sizeof *pvrPOWER ; ++i) {
|
||||
if ((pvr & pvrPOWER[i].pvr_mask) == pvrPOWER[i].pvr_value) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
#if defined(DEBUG)
|
||||
printf("%s: returning CPU=%s, cpu_type=%p\n", __func__,
|
||||
pvrPOWER[i].cpu_name, pvrPOWER[i].cpu_type);
|
||||
#endif
|
||||
cpu_type = pvrPOWER[i].cpu_type;
|
||||
|
||||
if (!strcmp(cpu, "power8"))
|
||||
return cpu_type == CPU_POWER8;
|
||||
if (!strcmp(cpu, "power9"))
|
||||
return cpu_type == CPU_POWER9;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#endif /* C_PGI || __clang__ */
|
||||
|
||||
static gotoblas_t *get_coretype(void) {
|
||||
|
||||
#ifndef C_PGI
|
||||
if (__builtin_cpu_is("power6") || __builtin_cpu_is("power6x"))
|
||||
return &gotoblas_POWER6;
|
||||
#endif
|
||||
if (__builtin_cpu_is("power8"))
|
||||
return &gotoblas_POWER8;
|
||||
#if (!defined __GNUC__) || ( __GNUC__ >= 6)
|
||||
|
@ -53,7 +209,7 @@ static gotoblas_t *get_coretype(void) {
|
|||
return &gotoblas_POWER10;
|
||||
#endif
|
||||
/* Fall back to the POWER9 implementation if the toolchain is too old or the MMA feature is not set */
|
||||
#if (!defined __GNUC__) || ( __GNUC__ >= 6)
|
||||
#if (!defined __GNUC__) || ( __GNUC__ >= 11) || (__GNUC__ == 10 && __GNUC_MINOR__ >= 2)
|
||||
if (__builtin_cpu_is("power10"))
|
||||
return &gotoblas_POWER9;
|
||||
#endif
|
||||
|
@ -77,7 +233,9 @@ static gotoblas_t *force_coretype(char * coretype) {
|
|||
|
||||
switch (found)
|
||||
{
|
||||
#ifndef C_PGI
|
||||
case 1: return (&gotoblas_POWER6);
|
||||
#endif
|
||||
case 2: return (&gotoblas_POWER8);
|
||||
#if (!defined __GNUC__) || ( __GNUC__ >= 6)
|
||||
case 3: return (&gotoblas_POWER9);
|
||||
|
|
|
@ -222,11 +222,11 @@ int get_num_procs(void);
|
|||
#else
|
||||
int get_num_procs(void) {
|
||||
static int nums = 0;
|
||||
|
||||
#if defined(__GLIBC_PREREQ)
|
||||
cpu_set_t cpuset,*cpusetp;
|
||||
size_t size;
|
||||
int ret;
|
||||
|
||||
#if defined(__GLIBC_PREREQ)
|
||||
#if !__GLIBC_PREREQ(2, 7)
|
||||
int i;
|
||||
#if !__GLIBC_PREREQ(2, 6)
|
||||
|
@ -1241,7 +1241,7 @@ UNLOCK_COMMAND(&alloc_lock);
|
|||
|
||||
func = &memoryalloc[0];
|
||||
|
||||
while ((func != NULL) && (map_address == (void *) -1)) {
|
||||
while ((*func != NULL) && (map_address == (void *) -1)) {
|
||||
|
||||
map_address = (*func)((void *)base_address);
|
||||
|
||||
|
@ -1619,10 +1619,12 @@ static int on_process_term(void)
|
|||
#else
|
||||
#pragma data_seg(".CRT$XLB")
|
||||
#endif
|
||||
static void (APIENTRY *dll_callback)(HINSTANCE h, DWORD ul_reason_for_call, PVOID pv) = DllMain;
|
||||
|
||||
#ifdef _WIN64
|
||||
static const PIMAGE_TLS_CALLBACK dll_callback(HINSTANCE h, DWORD ul_reason_for_call, PVOID pv) = DllMain;
|
||||
#pragma const_seg()
|
||||
#else
|
||||
static void (APIENTRY *dll_callback)(HINSTANCE h, DWORD ul_reason_for_call, PVOID pv) = DllMain;
|
||||
#pragma data_seg()
|
||||
#endif
|
||||
|
||||
|
@ -1631,10 +1633,12 @@ static void (APIENTRY *dll_callback)(HINSTANCE h, DWORD ul_reason_for_call, PVOI
|
|||
#else
|
||||
#pragma data_seg(".CRT$XTU")
|
||||
#endif
|
||||
static int(*p_process_term)(void) = on_process_term;
|
||||
|
||||
#ifdef _WIN64
|
||||
static const int(*p_process_term)(void) = on_process_term;
|
||||
#pragma const_seg()
|
||||
#else
|
||||
static int(*p_process_term)(void) = on_process_term;
|
||||
#pragma data_seg()
|
||||
#endif
|
||||
#endif
|
||||
|
@ -1668,16 +1672,23 @@ void gotoblas_dummy_for_PGI(void) {
|
|||
#ifndef MEM_LARGE_PAGES
|
||||
#define MEM_LARGE_PAGES 0x20000000
|
||||
#endif
|
||||
#else
|
||||
#elif !defined(OS_EMBEDDED)
|
||||
#define ALLOC_MMAP
|
||||
#define ALLOC_MALLOC
|
||||
#else
|
||||
#define ALLOC_MALLOC
|
||||
|
||||
inline int puts(const char *str) { return 0; }
|
||||
inline int printf(const char *format, ...) { return 0; }
|
||||
inline char *getenv(const char *name) { return ""; }
|
||||
inline int atoi(const char *str) { return 0; }
|
||||
#endif
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <fcntl.h>
|
||||
|
||||
#if !defined(OS_WINDOWS) || defined(OS_CYGWIN_NT)
|
||||
#if (!defined(OS_WINDOWS) || defined(OS_CYGWIN_NT)) && !defined(OS_EMBEDDED)
|
||||
#include <sys/mman.h>
|
||||
#ifndef NO_SYSV_IPC
|
||||
#include <sys/shm.h>
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
#!/usr/bin/perl
|
||||
#!/usr/bin/env perl
|
||||
|
||||
# Changelog
|
||||
# 2017/09/03 staticfloat
|
||||
|
|
52
f_check
52
f_check
|
@ -1,4 +1,4 @@
|
|||
#!/usr/bin/perl
|
||||
#!/usr/bin/env perl
|
||||
|
||||
$hostos = `uname -s | sed -e s/\-.*//`; chop($hostos);
|
||||
|
||||
|
@ -32,9 +32,9 @@ if ($compiler eq "") {
|
|||
"xlf95", "xlf90", "xlf",
|
||||
"ppuf77", "ppuf95", "ppuf90", "ppuxlf",
|
||||
"pathf90", "pathf95",
|
||||
"pgf95", "pgf90", "pgf77",
|
||||
"pgf95", "pgf90", "pgf77", "pgfortran", "nvfortran",
|
||||
"flang", "egfortran",
|
||||
"ifort");
|
||||
"ifort", "nagfor");
|
||||
|
||||
OUTER:
|
||||
foreach $lists (@lists) {
|
||||
|
@ -64,7 +64,9 @@ if ($compiler eq "") {
|
|||
if (!$?) {
|
||||
|
||||
$data = `$compiler -O2 -S ftest.f > /dev/null 2>&1 && cat ftest.s && rm -f ftest.s`;
|
||||
|
||||
if ($data eq "") {
|
||||
$data = `$compiler -O2 -S ftest.f > /dev/null 2>&1 && cat ftest.c && rm -f ftest.c`;
|
||||
}
|
||||
if ($data =~ /zhoge_/) {
|
||||
$bu = "_";
|
||||
}
|
||||
|
@ -76,6 +78,7 @@ if ($compiler eq "") {
|
|||
|
||||
} elsif ($data =~ /GNU/ || $data =~ /GCC/ ) {
|
||||
|
||||
$data =~ s/\(+.*?\)+//g;
|
||||
$data =~ /(\d+)\.(\d+).(\d+)/;
|
||||
$major = $1;
|
||||
$minor = $2;
|
||||
|
@ -87,7 +90,7 @@ if ($compiler eq "") {
|
|||
if ($compiler =~ /flang/) {
|
||||
$vendor = FLANG;
|
||||
$openmp = "-fopenmp";
|
||||
} elsif ($compiler =~ /pgf/) {
|
||||
} elsif ($compiler =~ /pgf/ || $compiler =~ /nvf/) {
|
||||
$vendor = PGI;
|
||||
$openmp = "-mp";
|
||||
} else {
|
||||
|
@ -123,7 +126,7 @@ if ($compiler eq "") {
|
|||
$openmp = "-mp";
|
||||
}
|
||||
|
||||
if ($data =~ /PGF/) {
|
||||
if ($data =~ /PGF/ || $data =~ /NVF/) {
|
||||
$vendor = PGI;
|
||||
$openmp = "-mp";
|
||||
}
|
||||
|
@ -133,8 +136,16 @@ if ($compiler eq "") {
|
|||
$openmp = "-openmp";
|
||||
}
|
||||
|
||||
if ($data =~ /NAG/) {
|
||||
$vendor = NAG;
|
||||
$openmp = "-openmp";
|
||||
}
|
||||
|
||||
# for embedded underscore name, e.g. zho_ge, it may append 2 underscores.
|
||||
$data = `$compiler -O2 -S ftest3.f > /dev/null 2>&1 && cat ftest3.s && rm -f ftest3.s`;
|
||||
if ($data eq "") {
|
||||
$data = `$compiler -O2 -S ftest3.f > /dev/null 2>&1 && cat ftest3.c && rm -f ftest3.c`;
|
||||
}
|
||||
if ($data =~ / zho_ge__/) {
|
||||
$need2bu = 1;
|
||||
}
|
||||
|
@ -177,7 +188,7 @@ if ($compiler eq "") {
|
|||
$openmp = "-mp";
|
||||
}
|
||||
|
||||
if ($compiler =~ /pgf/) {
|
||||
if ($compiler =~ /pgf/ || $compiler =~ /nvf/) {
|
||||
$vendor = PGI;
|
||||
$bu = "_";
|
||||
$openmp = "-mp";
|
||||
|
@ -222,6 +233,12 @@ if ($compiler eq "") {
|
|||
$openmp = "-fopenmp";
|
||||
}
|
||||
|
||||
if ($compiler =~ /nagfor/) {
|
||||
$vendor = NAG;
|
||||
$bu = "_";
|
||||
$openmp = "-openmp";
|
||||
}
|
||||
|
||||
if ($vendor eq "") {
|
||||
$nofortran = 1;
|
||||
$compiler = "gfortran";
|
||||
|
@ -275,14 +292,20 @@ if (!$?) {
|
|||
if ($?) {
|
||||
$link = `$compiler $openmp -mabi=64 -v ftest2.f 2>&1 && rm -f a.out a.exe`;
|
||||
}
|
||||
#For nagfor
|
||||
if ($?) {
|
||||
$link = `$compiler $openmp -dryrun ftest2.f 2>&1 && rm -f a.out a.exe`;
|
||||
}
|
||||
$binary = "" if ($?);
|
||||
}
|
||||
|
||||
if ($binary eq "") {
|
||||
$link = `$compiler $openmp -v ftest2.f 2>&1 && rm -f a.out a.exe`;
|
||||
}
|
||||
}
|
||||
|
||||
if ( $vendor eq "NAG") {
|
||||
$link = `$compiler $openmp -dryrun ftest2.f 2>&1 && rm -f a.out a.exe`;
|
||||
}
|
||||
$linker_L = "";
|
||||
$linker_l = "";
|
||||
$linker_a = "";
|
||||
|
@ -330,12 +353,13 @@ if ($link ne "") {
|
|||
$flags =~ s/\@/\,/g;
|
||||
$linker_L .= "-Wl,". $flags . " " ;
|
||||
}
|
||||
if ($flags =~ /-lgomp/ && $CC =~ /clang/) {
|
||||
if ($flags =~ /-lgomp/ && $ENV{"CC"} =~ /clang/) {
|
||||
$flags = "-lomp";
|
||||
}
|
||||
|
||||
if (
|
||||
($flags =~ /^\-l/)
|
||||
&& ($flags !~ /ibrary/)
|
||||
&& ($flags !~ /gfortranbegin/)
|
||||
&& ($flags !~ /frtbegin/)
|
||||
&& ($flags !~ /pathfstart/)
|
||||
|
@ -352,6 +376,16 @@ if ($link ne "") {
|
|||
$linker_l .= $flags . " ";
|
||||
}
|
||||
|
||||
# When the Fortran vendor is NAG, pass the quickfit.o / safefit.o / thsafe.o
# objects reported in the link line through to $linker_l.
# The vendor must be compared as a string: with "==" both operands are
# converted to the number 0, which made the test true for every compiler.
if ($vendor eq "NAG" && $flags =~ /quickfit.o|safefit.o|thsafe.o/) {
    $linker_l .= $flags . " ";
}
|
||||
|
||||
$linker_a .= $flags . " " if $flags =~ /\.a$/;
|
||||
}
|
||||
|
||||
|
|
|
@ -1375,6 +1375,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#ifdef __riscv
|
||||
#include "cpuid_riscv64.c"
|
||||
#define OPENBLAS_SUPPORTED
|
||||
#endif
|
||||
|
||||
#ifdef __arm__
|
||||
|
|
|
@ -4,7 +4,7 @@
|
|||
#else
|
||||
#include "config_kernel.h"
|
||||
#endif
|
||||
#include "param.h"
|
||||
#include "common.h"
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
|
||||
|
|
|
@ -316,7 +316,7 @@ CCBLAS1OBJS = \
|
|||
cblas_cscal.$(SUFFIX) cblas_csscal.$(SUFFIX) \
|
||||
cblas_cswap.$(SUFFIX) cblas_scnrm2.$(SUFFIX) \
|
||||
cblas_caxpby.$(SUFFIX) \
|
||||
cblas_icmin.$(SUFFIX) cblas_icmax.$(SUFFIX) cblas_scsum.$(SUFFIX)
|
||||
cblas_icmin.$(SUFFIX) cblas_icmax.$(SUFFIX) cblas_scsum.$(SUFFIX) cblas_csrot.$(SUFFIX) cblas_crotg.$(SUFFIX)
|
||||
|
||||
CCBLAS2OBJS = \
|
||||
cblas_cgemv.$(SUFFIX) cblas_cgerc.$(SUFFIX) cblas_cgeru.$(SUFFIX) \
|
||||
|
@ -346,7 +346,7 @@ CZBLAS1OBJS = \
|
|||
cblas_zscal.$(SUFFIX) cblas_zdscal.$(SUFFIX) \
|
||||
cblas_zswap.$(SUFFIX) cblas_dznrm2.$(SUFFIX) \
|
||||
cblas_zaxpby.$(SUFFIX) \
|
||||
cblas_izmin.$(SUFFIX) cblas_izmax.$(SUFFIX) cblas_dzsum.$(SUFFIX)
|
||||
cblas_izmin.$(SUFFIX) cblas_izmax.$(SUFFIX) cblas_dzsum.$(SUFFIX) cblas_zdrot.$(SUFFIX) cblas_zrotg.$(SUFFIX)
|
||||
|
||||
|
||||
CZBLAS2OBJS = \
|
||||
|
@ -1634,6 +1634,12 @@ cblas_srotg.$(SUFFIX) cblas_srotg.$(PSUFFIX): rotg.c
|
|||
cblas_drotg.$(SUFFIX) cblas_drotg.$(PSUFFIX): rotg.c
|
||||
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
|
||||
|
||||
cblas_crotg.$(SUFFIX) cblas_crotg.$(PSUFFIX): zrotg.c
|
||||
$(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F)
|
||||
|
||||
cblas_zrotg.$(SUFFIX) cblas_zrotg.$(PSUFFIX): zrotg.c
|
||||
$(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F)
|
||||
|
||||
cblas_srotm.$(SUFFIX) cblas_srotm.$(PSUFFIX): rotm.c
|
||||
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
|
||||
|
||||
|
@ -1664,6 +1670,12 @@ cblas_csscal.$(SUFFIX) cblas_csscal.$(PSUFFIX) : zscal.c
|
|||
cblas_zdscal.$(SUFFIX) cblas_zdscal.$(PSUFFIX) : zscal.c
|
||||
$(CC) $(CFLAGS) -DCBLAS -c -DSSCAL $< -o $(@F)
|
||||
|
||||
cblas_csrot.$(SUFFIX) cblas_csrot.$(PSUFFIX) : zrot.c
|
||||
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
|
||||
|
||||
cblas_zdrot.$(SUFFIX) cblas_zdrot.$(PSUFFIX) : zrot.c
|
||||
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
|
||||
|
||||
ifeq ($(BUILD_BFLOAT16),1)
|
||||
cblas_sbgemv.$(SUFFIX) cblas_sbgemv.$(PSUFFIX) : sbgemv.c
|
||||
$(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F)
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
#!/usr/bin/perl
|
||||
#!/usr/bin/env perl
|
||||
|
||||
$count = 0;
|
||||
|
||||
|
|
|
@ -246,6 +246,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
|
|||
|
||||
#ifdef SMP
|
||||
double MNK;
|
||||
#if defined(USE_SIMPLE_THREADED_LEVEL3) || !defined(NO_AFFINITY)
|
||||
#ifndef COMPLEX
|
||||
#ifdef XDOUBLE
|
||||
int mode = BLAS_XDOUBLE | BLAS_REAL;
|
||||
|
@ -264,6 +265,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
|
|||
#endif
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(SMP) && !defined(NO_AFFINITY) && !defined(USE_SIMPLE_THREADED_LEVEL3)
|
||||
int nodes;
|
||||
|
@ -417,8 +419,10 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
|
|||
sb = (XFLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
|
||||
|
||||
#ifdef SMP
|
||||
#if defined(USE_SIMPLE_THREADED_LEVEL3) || !defined(NO_AFFINITY)
|
||||
mode |= (transa << BLAS_TRANSA_SHIFT);
|
||||
mode |= (transb << BLAS_TRANSB_SHIFT);
|
||||
#endif
|
||||
|
||||
MNK = (double) args.m * (double) args.n * (double) args.k;
|
||||
if ( MNK <= (SMP_THRESHOLD_MIN * (double) GEMM_MULTITHREAD_THRESHOLD) )
|
||||
|
|
|
@ -107,7 +107,6 @@ void CNAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT dy1, FLOAT *dparam){
|
|||
dq1 = dp1 * *dx1;
|
||||
if(ABS(dq1) > ABS(dq2))
|
||||
{
|
||||
dflag = ZERO;
|
||||
dh11 = ONE;
|
||||
dh22 = ONE;
|
||||
dh21 = - dy1 / *dx1;
|
||||
|
|
|
@ -187,10 +187,11 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
|
|||
endif ()
|
||||
# Makefile.L3
|
||||
set(USE_TRMM false)
|
||||
if (ARM OR ARM64 OR (TARGET_CORE MATCHES LONGSOON3B) OR (TARGET_CORE MATCHES GENERIC) OR (TARGET_CORE MATCHES HASWELL) OR (TARGET_CORE MATCHES ZEN) OR (TARGET_CORE MATCHES SKYLAKEX) OR (TARGET_CORE MATCHES COOPERLAKE))
|
||||
# Compare core names in upper case so the checks do not depend on how the
# caller spelled TARGET_CORE. The expansion is quoted so that an empty
# TARGET_CORE yields an empty string instead of a malformed string() call.
string(TOUPPER "${TARGET_CORE}" UC_TARGET_CORE)
# Targets that enable USE_TRMM. The Loongson 3B pattern was previously
# misspelled "LONGSOON3B" and therefore could never match that core name.
# NOTE(review): confirm LOONGSON3B is still the name used for this core.
if (ARM OR ARM64 OR (UC_TARGET_CORE MATCHES LOONGSON3B) OR (UC_TARGET_CORE MATCHES GENERIC) OR (UC_TARGET_CORE MATCHES HASWELL) OR (UC_TARGET_CORE MATCHES ZEN) OR (UC_TARGET_CORE MATCHES SKYLAKEX) OR (UC_TARGET_CORE MATCHES COOPERLAKE))
  set(USE_TRMM true)
endif ()
|
||||
if (ZARCH OR (TARGET_CORE MATCHES POWER8) OR (TARGET_CORE MATCHES POWER9) OR (TARGET_CORE MATCHES POWER10))
|
||||
if (ZARCH OR (UC_TARGET_CORE MATCHES POWER8) OR (UC_TARGET_CORE MATCHES POWER9) OR (UC_TARGET_CORE MATCHES POWER10))
|
||||
set(USE_TRMM true)
|
||||
endif ()
|
||||
|
||||
|
|
|
@ -36,7 +36,7 @@ ifeq ($(TARGET_CORE), COOPERLAKE)
|
|||
ifeq ($(GCCVERSIONGTEQ10), 1)
|
||||
override CFLAGS += -march=cooperlake
|
||||
else
|
||||
override CFLAGS += -march=skylake-avx512
|
||||
override CFLAGS += -march=skylake-avx512 -mavx512f
|
||||
endif
|
||||
ifeq ($(OSNAME), CYGWIN_NT)
|
||||
override CFLAGS += -fno-asynchronous-unwind-tables
|
||||
|
@ -47,7 +47,7 @@ ifeq ($(TARGET_CORE), COOPERLAKE)
|
|||
endif
|
||||
endif
|
||||
else ifeq ($(TARGET_CORE), SKYLAKEX)
|
||||
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -march=skylake-avx512
|
||||
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -march=skylake-avx512 -mavx512f
|
||||
ifeq ($(OSNAME), CYGWIN_NT)
|
||||
override CFLAGS += -fno-asynchronous-unwind-tables
|
||||
endif
|
||||
|
|
|
@ -1,3 +1,11 @@
|
|||
FMAFLAG=
|
||||
ifndef OLDGCC
|
||||
ifdef HAVE_FMA3
|
||||
FMAFLAG = -mfma
|
||||
endif
|
||||
endif
|
||||
|
||||
|
||||
### AMAX ###
|
||||
|
||||
ifndef SAMAXKERNEL
|
||||
|
@ -828,10 +836,10 @@ $(KDIR)xnrm2_k$(TSUFFIX).$(SUFFIX) $(KDIR)xnrm2_k$(TPSUFFIX).$(PSUFFIX) : $(KE
|
|||
$(CC) $(CFLAGS) -DCOMPLEX -c -DXDOUBLE $< -o $@
|
||||
|
||||
$(KDIR)srot_k$(TSUFFIX).$(SUFFIX) $(KDIR)srot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SROTKERNEL)
|
||||
$(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -UDOUBLE $< -o $@
|
||||
$(CC) -c $(CFLAGS) $(FMAFLAG) -UCOMPLEX -UCOMPLEX -UDOUBLE $< -o $@
|
||||
|
||||
$(KDIR)drot_k$(TSUFFIX).$(SUFFIX) $(KDIR)drot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DROTKERNEL)
|
||||
$(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -DDOUBLE $< -o $@
|
||||
$(CC) -c $(CFLAGS) $(FMAFLAG) -UCOMPLEX -UCOMPLEX -DDOUBLE $< -o $@
|
||||
|
||||
$(KDIR)qrot_k$(TSUFFIX).$(SUFFIX) $(KDIR)qrot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QROTKERNEL)
|
||||
$(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -DXDOUBLE $< -o $@
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2013, The OpenBLAS Project
|
||||
Copyright (c) 2021, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
|
@ -27,36 +27,208 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#include "common.h"
|
||||
|
||||
/*****************************************************
|
||||
* 2014/06/09 Saar
|
||||
*
|
||||
* Order rowMajor
|
||||
* Trans
|
||||
*
|
||||
******************************************************/
|
||||
|
||||
int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG ldb)
|
||||
{
|
||||
BLASLONG i,j;
|
||||
FLOAT *aptr,*bptr;
|
||||
BLASLONG i, j;
|
||||
FLOAT *a_offset, *a_offset1, *a_offset2, *a_offset3, *a_offset4;
|
||||
FLOAT *b_offset, *b_offset1, *b_offset2, *b_offset3, *b_offset4;
|
||||
|
||||
if ( rows <= 0 ) return(0);
|
||||
if ( cols <= 0 ) return(0);
|
||||
if (rows <= 0) return 0;
|
||||
if (cols <= 0) return 0;
|
||||
|
||||
aptr = a;
|
||||
a_offset = a;
|
||||
b_offset = b;
|
||||
|
||||
for ( i=0; i<rows ; i++ )
|
||||
{
|
||||
bptr = &b[i];
|
||||
for(j=0; j<cols; j++)
|
||||
{
|
||||
bptr[j*ldb] = alpha * aptr[j];
|
||||
}
|
||||
aptr += lda;
|
||||
}
|
||||
i = (rows >> 2);
|
||||
if (i > 0) {
|
||||
do {
|
||||
a_offset1 = a_offset;
|
||||
a_offset2 = a_offset1 + lda;
|
||||
a_offset3 = a_offset2 + lda;
|
||||
a_offset4 = a_offset3 + lda;
|
||||
a_offset += 4 * lda;
|
||||
|
||||
return(0);
|
||||
b_offset1 = b_offset;
|
||||
b_offset2 = b_offset1 + ldb;
|
||||
b_offset3 = b_offset2 + ldb;
|
||||
b_offset4 = b_offset3 + ldb;
|
||||
b_offset += 4;
|
||||
|
||||
j = (cols >> 2);
|
||||
if (j > 0) {
|
||||
do {
|
||||
/* Column 1 of MAT_B */
|
||||
*(b_offset1 + 0) = *(a_offset1 + 0)*alpha; // Row 1 of MAT_A
|
||||
*(b_offset2 + 0) = *(a_offset1 + 1)*alpha;
|
||||
*(b_offset3 + 0) = *(a_offset1 + 2)*alpha;
|
||||
*(b_offset4 + 0) = *(a_offset1 + 3)*alpha;
|
||||
|
||||
/* Column 2 of MAT_B */
|
||||
*(b_offset1 + 1) = *(a_offset2 + 0)*alpha; // Row 2 of MAT_A
|
||||
*(b_offset2 + 1) = *(a_offset2 + 1)*alpha;
|
||||
*(b_offset3 + 1) = *(a_offset2 + 2)*alpha;
|
||||
*(b_offset4 + 1) = *(a_offset2 + 3)*alpha;
|
||||
|
||||
/* Column 3 of MAT_B */
|
||||
*(b_offset1 + 2) = *(a_offset3 + 0)*alpha; // Row 3 of MAT_A
|
||||
*(b_offset2 + 2) = *(a_offset3 + 1)*alpha;
|
||||
*(b_offset3 + 2) = *(a_offset3 + 2)*alpha;
|
||||
*(b_offset4 + 2) = *(a_offset3 + 3)*alpha;
|
||||
|
||||
/* Column 4 of MAT_B */
|
||||
*(b_offset1 + 3) = *(a_offset4 + 0)*alpha; // Row 4 of MAT_A
|
||||
*(b_offset2 + 3) = *(a_offset4 + 1)*alpha;
|
||||
*(b_offset3 + 3) = *(a_offset4 + 2)*alpha;
|
||||
*(b_offset4 + 3) = *(a_offset4 + 3)*alpha;
|
||||
|
||||
a_offset1 += 4;
|
||||
a_offset2 += 4;
|
||||
a_offset3 += 4;
|
||||
a_offset4 += 4;
|
||||
b_offset1 += ldb * 4;
|
||||
b_offset2 += ldb * 4;
|
||||
b_offset3 += ldb * 4;
|
||||
b_offset4 += ldb * 4;
|
||||
|
||||
j--;
|
||||
} while (j > 0);
|
||||
} // if(j > 0)
|
||||
|
||||
|
||||
if (cols & 2) {
|
||||
*(b_offset1 + 0) = *(a_offset1 + 0)*alpha;
|
||||
*(b_offset2 + 0) = *(a_offset1 + 1)*alpha;
|
||||
|
||||
*(b_offset1 + 1) = *(a_offset2 + 0)*alpha;
|
||||
*(b_offset2 + 1) = *(a_offset2 + 1)*alpha;
|
||||
|
||||
*(b_offset1 + 2) = *(a_offset3 + 0)*alpha;
|
||||
*(b_offset2 + 2) = *(a_offset3 + 1)*alpha;
|
||||
|
||||
*(b_offset1 + 3) = *(a_offset4 + 0)*alpha;
|
||||
*(b_offset2 + 3) = *(a_offset4 + 1)*alpha;
|
||||
|
||||
a_offset1 += 2;
|
||||
a_offset2 += 2;
|
||||
a_offset3 += 2;
|
||||
a_offset4 += 2;
|
||||
|
||||
b_offset1 += ldb*2;
|
||||
|
||||
}
|
||||
|
||||
if (cols & 1) {
|
||||
*(b_offset1 + 0) = *(a_offset1 + 0)*alpha;
|
||||
|
||||
*(b_offset1 + 1) = *(a_offset2 + 0)*alpha;
|
||||
|
||||
*(b_offset1 + 2) = *(a_offset3 + 0)*alpha;
|
||||
|
||||
*(b_offset1 + 3) = *(a_offset4 + 0)*alpha;
|
||||
}
|
||||
|
||||
i--;
|
||||
} while (i > 0);
|
||||
}
|
||||
|
||||
|
||||
if (rows & 2) {
|
||||
a_offset1 = a_offset;
|
||||
a_offset2 = a_offset1 + lda;
|
||||
a_offset += 2 * lda;
|
||||
|
||||
b_offset1 = b_offset;
|
||||
b_offset2 = b_offset1 + ldb;
|
||||
b_offset3 = b_offset2 + ldb;
|
||||
b_offset4 = b_offset3 + ldb;
|
||||
b_offset += 2;
|
||||
|
||||
j = (cols >> 2);
|
||||
if (j > 0){
|
||||
do {
|
||||
*(b_offset1 + 0) = *(a_offset1 + 0)*alpha;
|
||||
*(b_offset2 + 0) = *(a_offset1 + 1)*alpha;
|
||||
*(b_offset3 + 0) = *(a_offset1 + 2)*alpha;
|
||||
*(b_offset4 + 0) = *(a_offset1 + 3)*alpha;
|
||||
|
||||
*(b_offset1 + 1) = *(a_offset2 + 0)*alpha;
|
||||
*(b_offset2 + 1) = *(a_offset2 + 1)*alpha;
|
||||
*(b_offset3 + 1) = *(a_offset2 + 2)*alpha;
|
||||
*(b_offset4 + 1) = *(a_offset2 + 3)*alpha;
|
||||
|
||||
a_offset1 += 4;
|
||||
a_offset2 += 4;
|
||||
b_offset1 += ldb * 4;
|
||||
b_offset2 += ldb * 4;
|
||||
b_offset3 += ldb * 4;
|
||||
b_offset4 += ldb * 4;
|
||||
|
||||
j--;
|
||||
} while (j > 0);
|
||||
}
|
||||
|
||||
|
||||
if (cols & 2){
|
||||
*(b_offset1 + 0) = *(a_offset1 + 0)*alpha;
|
||||
*(b_offset2 + 0) = *(a_offset1 + 1)*alpha;
|
||||
|
||||
*(b_offset1 + 1) = *(a_offset2 + 0)*alpha;
|
||||
*(b_offset2 + 1) = *(a_offset2 + 1)*alpha;
|
||||
|
||||
a_offset1 += 2;
|
||||
a_offset2 += 2;
|
||||
b_offset1 += ldb*2;
|
||||
|
||||
}
|
||||
|
||||
|
||||
if (cols & 1){
|
||||
*(b_offset1 + 0) = *(a_offset1 + 0)*alpha;
|
||||
*(b_offset1 + 1) = *(a_offset2 + 0)*alpha;
|
||||
}
|
||||
} // if (rows & 2)
|
||||
|
||||
|
||||
if (rows & 1) {
|
||||
a_offset1 = a_offset;
|
||||
a_offset += lda;
|
||||
|
||||
b_offset1 = b_offset;
|
||||
b_offset2 = b_offset1 + ldb;
|
||||
b_offset3 = b_offset2 + ldb;
|
||||
b_offset4 = b_offset3 + ldb;
|
||||
|
||||
j = (cols >> 2);
|
||||
if (j > 0){
|
||||
do {
|
||||
*(b_offset1 + 0) = *(a_offset1 + 0)*alpha;
|
||||
*(b_offset2 + 0) = *(a_offset1 + 1)*alpha;
|
||||
*(b_offset3 + 0) = *(a_offset1 + 2)*alpha;
|
||||
*(b_offset4 + 0) = *(a_offset1 + 3)*alpha;
|
||||
|
||||
a_offset1 += 4;
|
||||
b_offset1 += ldb * 4;
|
||||
b_offset2 += ldb * 4;
|
||||
b_offset3 += ldb * 4;
|
||||
b_offset4 += ldb * 4;
|
||||
|
||||
j--;
|
||||
} while (j > 0);
|
||||
}
|
||||
|
||||
if (cols & 2){
|
||||
*(b_offset1 + 0) = *(a_offset1 + 0)*alpha;
|
||||
*(b_offset2 + 0) = *(a_offset1 + 1)*alpha;
|
||||
|
||||
a_offset1 += 2;
|
||||
b_offset1 += ldb * 2;
|
||||
}
|
||||
|
||||
if (cols & 1){
|
||||
*(b_offset1 + 0) = *(a_offset1 + 0)*alpha;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -48,7 +48,7 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA
|
|||
|
||||
dot[0]=0.0;
|
||||
dot[1]=0.0;
|
||||
#if !defined(__PPC__) && !defined(__SunOS)
|
||||
#if !defined(__PPC__) && !defined(__SunOS) && !defined(__PGI)
|
||||
CREAL(result) = 0.0 ;
|
||||
CIMAG(result) = 0.0 ;
|
||||
#else
|
||||
|
@ -73,7 +73,7 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA
|
|||
i++ ;
|
||||
|
||||
}
|
||||
#if !defined(__PPC__) && !defined(__SunOS)
|
||||
#if !defined(__PPC__) && !defined(__SunOS) && !defined(__PGI)
|
||||
CREAL(result) = dot[0];
|
||||
CIMAG(result) = dot[1];
|
||||
#else
|
||||
|
|
|
@ -97,9 +97,18 @@ CNRM2KERNEL = znrm2.S
|
|||
ZNRM2KERNEL = znrm2.S
|
||||
|
||||
DDOTKERNEL = dot.S
|
||||
ifneq ($(C_COMPILER), PGI)
|
||||
SDOTKERNEL = ../generic/dot.c
|
||||
else
|
||||
SDOTKERNEL = dot.S
|
||||
endif
|
||||
ifneq ($(C_COMPILER), PGI)
|
||||
CDOTKERNEL = zdot.S
|
||||
ZDOTKERNEL = zdot.S
|
||||
else
|
||||
CDOTKERNEL = ../arm/zdot.c
|
||||
ZDOTKERNEL = ../arm/zdot.c
|
||||
endif
|
||||
DSDOTKERNEL = dot.S
|
||||
|
||||
DGEMM_BETA = dgemm_beta.S
|
||||
|
|
|
@ -96,11 +96,20 @@ DNRM2KERNEL = nrm2.S
|
|||
CNRM2KERNEL = znrm2.S
|
||||
ZNRM2KERNEL = znrm2.S
|
||||
|
||||
DDOTKERNEL = dot.S
|
||||
SDOTKERNEL = ../generic/dot.c
|
||||
CDOTKERNEL = zdot.S
|
||||
ZDOTKERNEL = zdot.S
|
||||
DSDOTKERNEL = dot.S
|
||||
ifneq ($(C_COMPILER), PGI)
|
||||
SDOTKERNEL = ../generic/dot.c
|
||||
else
|
||||
SDOTKERNEL = dot.S
|
||||
endif
|
||||
DDOTKERNEL = dot.S
|
||||
ifneq ($(C_COMPILER), PGI)
|
||||
CDOTKERNEL = zdot.S
|
||||
ZDOTKERNEL = zdot.S
|
||||
else
|
||||
CDOTKERNEL = ../arm/zdot.c
|
||||
ZDOTKERNEL = ../arm/zdot.c
|
||||
endif
|
||||
DSDOTKERNEL = dot.S
|
||||
|
||||
DGEMM_BETA = dgemm_beta.S
|
||||
SGEMM_BETA = sgemm_beta.S
|
||||
|
|
|
@ -70,10 +70,19 @@ DCOPYKERNEL = copy.S
|
|||
CCOPYKERNEL = copy.S
|
||||
ZCOPYKERNEL = copy.S
|
||||
|
||||
ifneq ($(C_COMPILER), PGI)
|
||||
SDOTKERNEL = ../generic/dot.c
|
||||
else
|
||||
SDOTKERNEL = dot.S
|
||||
endif
|
||||
DDOTKERNEL = dot.S
|
||||
ifneq ($(C_COMPILER), PGI)
|
||||
CDOTKERNEL = zdot.S
|
||||
ZDOTKERNEL = zdot.S
|
||||
else
|
||||
CDOTKERNEL = ../arm/zdot.c
|
||||
ZDOTKERNEL = ../arm/zdot.c
|
||||
endif
|
||||
DSDOTKERNEL = dot.S
|
||||
|
||||
SNRM2KERNEL = nrm2.S
|
||||
|
|
|
@ -47,8 +47,13 @@ ZCOPYKERNEL = copy.S
|
|||
|
||||
SDOTKERNEL = dot_thunderx.c
|
||||
DDOTKERNEL = ddot_thunderx.c
|
||||
ifneq ($(C_COMPILER), PGI)
|
||||
CDOTKERNEL = zdot.S
|
||||
ZDOTKERNEL = zdot.S
|
||||
else
|
||||
CDOTKERNEL = ../arm/zdot.c
|
||||
ZDOTKERNEL = ../arm/zdot.c
|
||||
endif
|
||||
DSDOTKERNEL = dot.S
|
||||
|
||||
SNRM2KERNEL = nrm2.S
|
||||
|
|
|
@ -72,8 +72,13 @@ ZCOPYKERNEL = copy.S
|
|||
|
||||
SDOTKERNEL = dot.S
|
||||
DDOTKERNEL = dot.S
|
||||
ifneq ($(C_COMPILER), PGI)
|
||||
CDOTKERNEL = zdot.S
|
||||
ZDOTKERNEL = zdot.S
|
||||
else
|
||||
CDOTKERNEL = ../arm/zdot.c
|
||||
ZDOTKERNEL = ../arm/zdot.c
|
||||
endif
|
||||
DSDOTKERNEL = dot.S
|
||||
|
||||
SNRM2KERNEL = nrm2.S
|
||||
|
|
|
@ -58,6 +58,7 @@ extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n
|
|||
#define CUR_MAXINV "d8"
|
||||
#define CUR_MAXINV_V "v8.2d"
|
||||
#define CUR_MAX_V "v8.2d"
|
||||
#define REGINF "d9"
|
||||
|
||||
static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x,
|
||||
double *ssq, double *scale)
|
||||
|
@ -79,8 +80,10 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x,
|
|||
" ble 9f //nrm2_kernel_L999 \n"
|
||||
|
||||
"1: //nrm2_kernel_F_BEGIN: \n"
|
||||
" mov x6, #0x7FF0000000000000 //+Infinity \n"
|
||||
" fmov "REGZERO", xzr \n"
|
||||
" fmov "REGONE", #1.0 \n"
|
||||
" fmov "REGINF", x6 \n"
|
||||
" lsl "INC_X", "INC_X", #"INC_SHIFT" \n"
|
||||
" mov "J", "N" \n"
|
||||
" cmp "J", xzr \n"
|
||||
|
@ -104,6 +107,8 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x,
|
|||
" ldr d4, ["X"] \n"
|
||||
" fabs d4, d4 \n"
|
||||
" fmax "CUR_MAX", "SCALE", d4 \n"
|
||||
" fcmp "CUR_MAX", "REGINF" \n"
|
||||
" beq 10f \n"
|
||||
" fdiv "SCALE", "SCALE", "CUR_MAX" \n"
|
||||
" fmul "SCALE", "SCALE", "SCALE" \n"
|
||||
" fmul "SSQ", "SSQ", "SCALE" \n"
|
||||
|
@ -116,6 +121,8 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x,
|
|||
" ldr d3, ["X", #8] \n"
|
||||
" fabs d3, d3 \n"
|
||||
" fmax "CUR_MAX", "SCALE", d3 \n"
|
||||
" fcmp "CUR_MAX", "REGINF" \n"
|
||||
" beq 10f \n"
|
||||
" fdiv "SCALE", "SCALE", "CUR_MAX" \n"
|
||||
" fmul "SCALE", "SCALE", "SCALE" \n"
|
||||
" fmul "SSQ", "SSQ", "SCALE" \n"
|
||||
|
@ -158,6 +165,8 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x,
|
|||
" fmaxp v24.2d, v24.2d, v26.2d \n"
|
||||
" fmaxp v24.2d, v24.2d, v24.2d \n"
|
||||
" fmax "CUR_MAX", "SCALE", d24 \n"
|
||||
" fcmp "CUR_MAX", "REGINF" \n"
|
||||
" beq 10f \n"
|
||||
" fdiv "CUR_MAXINV", "REGONE", "CUR_MAX" \n"
|
||||
" //dup "CUR_MAX_V", v7.d[0] \n"
|
||||
" fdiv "SCALE", "SCALE", "CUR_MAX" \n"
|
||||
|
@ -217,6 +226,8 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x,
|
|||
" fmaxp v24.2d, v24.2d, v26.2d \n"
|
||||
" fmaxp v24.2d, v24.2d, v24.2d \n"
|
||||
" fmax "CUR_MAX", "SCALE", d24 \n"
|
||||
" fcmp "CUR_MAX", "REGINF" \n"
|
||||
" beq 10f \n"
|
||||
" fdiv "CUR_MAXINV", "REGONE", "CUR_MAX" \n"
|
||||
" //dup "CUR_MAX_V", v7.d[0] \n"
|
||||
" fdiv "SCALE", "SCALE", "CUR_MAX" \n"
|
||||
|
@ -265,6 +276,8 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x,
|
|||
" ldr d4, ["X"] \n"
|
||||
" fabs d4, d4 \n"
|
||||
" fmax "CUR_MAX", "SCALE", d4 \n"
|
||||
" fcmp "CUR_MAX", "REGINF" \n"
|
||||
" beq 10f \n"
|
||||
" fdiv "SCALE", "SCALE", "CUR_MAX" \n"
|
||||
" fmul "SCALE", "SCALE", "SCALE" \n"
|
||||
" fmul "SSQ", "SSQ", "SCALE" \n"
|
||||
|
@ -276,6 +289,8 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x,
|
|||
" ldr d3, ["X", #8] \n"
|
||||
" fabs d3, d3 \n"
|
||||
" fmax "CUR_MAX", "SCALE", d3 \n"
|
||||
" fcmp "CUR_MAX", "REGINF" \n"
|
||||
" beq 10f \n"
|
||||
" fdiv "SCALE", "SCALE", "CUR_MAX" \n"
|
||||
" fmul "SCALE", "SCALE", "SCALE" \n"
|
||||
" fmul "SSQ", "SSQ", "SCALE" \n"
|
||||
|
@ -291,6 +306,11 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x,
|
|||
"9: //nrm2_kernel_L999: \n"
|
||||
" str "SSQ", [%[SSQ_]] \n"
|
||||
" str "SCALE", [%[SCALE_]] \n"
|
||||
" b 11f \n"
|
||||
"10: \n"
|
||||
" str "REGINF", [%[SSQ_]] \n"
|
||||
" str "REGINF", [%[SCALE_]] \n"
|
||||
"11: \n"
|
||||
|
||||
:
|
||||
: [SSQ_] "r" (ssq), //%0
|
||||
|
@ -300,7 +320,7 @@ static void nrm2_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x,
|
|||
[INCX_] "r" (inc_x) //%4
|
||||
: "cc",
|
||||
"memory",
|
||||
"x0", "x1", "x2", "x3", "x4", "x5",
|
||||
"x0", "x1", "x2", "x3", "x4", "x5", "x6",
|
||||
"d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d8"
|
||||
);
|
||||
|
||||
|
@ -359,6 +379,12 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
|||
cur_ssq = *ptr;
|
||||
cur_scale = *(ptr + 1);
|
||||
|
||||
if (cur_ssq == INFINITY) {
|
||||
ssq = INFINITY;
|
||||
scale = INFINITY;
|
||||
break;
|
||||
}
|
||||
|
||||
if (cur_scale != 0) {
|
||||
if (cur_scale > scale) {
|
||||
scale = (scale / cur_scale);
|
||||
|
|
|
@ -154,11 +154,7 @@ ZCOPYKERNEL = zcopy_power10.c
|
|||
SDOTKERNEL = sdot_power10.c
|
||||
DDOTKERNEL = ddot_power10.c
|
||||
DSDOTKERNEL = sdot_power10.c
|
||||
ifneq ($(GCCVERSIONGTEQ9),1)
|
||||
CDOTKERNEL = cdot_power9.S
|
||||
else
|
||||
CDOTKERNEL = cdot.c
|
||||
endif
|
||||
ZDOTKERNEL = zdot.c
|
||||
#
|
||||
SNRM2KERNEL = ../arm/nrm2.c
|
||||
|
|
|
@ -0,0 +1,115 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2020, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define HAVE_KERNEL 1
|
||||
|
||||
/*
 * POWER10 micro-kernel: copies a contiguous run of memory from x to y using
 * paired vector loads (lxvp) and single vector stores (stxv).
 *
 *   n  element count; reduced by 32 for every 256-byte block that is moved.
 *   x  source pointer (advanced inside the asm).
 *   y  destination pointer (advanced inside the asm).
 *
 * The first block is loaded unconditionally and the final block is stored
 * unconditionally, so one full 256-byte block is always copied.
 * NOTE(review): this assumes callers pass n as a positive multiple of 32 --
 * confirm against the including source.
 */
static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y)
|
||||
{
|
||||
__asm__
|
||||
(
|
||||
// Prologue: preload the first 256 bytes of x into vs32..vs47.
"lxvp 32, 0(%2) \n\t"
|
||||
"lxvp 34, 32(%2) \n\t"
|
||||
"lxvp 36, 64(%2) \n\t"
|
||||
"lxvp 38, 96(%2) \n\t"
|
||||
"lxvp 40, 128(%2) \n\t"
|
||||
"lxvp 42, 160(%2) \n\t"
|
||||
"lxvp 44, 192(%2) \n\t"
|
||||
"lxvp 46, 224(%2) \n\t"
|
||||
|
||||
// x moves on to the next block; y stays put until a block has been stored.
"addi %2, %2, 256 \n\t"
|
||||
"addic. %1, %1, -32 \n\t"
|
||||
// Nothing left beyond the preloaded block: skip the loop and just store it.
"ble two%= \n\t"
|
||||
|
||||
".align 5 \n"
|
||||
"one%=: \n\t"
|
||||
|
||||
// Steady state: store the block loaded on the previous pass and refill each
// half of the register set from x as soon as that half has been written out.
// NOTE(review): the odd register of every pair is stored at the lower
// address; presumably this mirrors how lxvp fills a pair on little-endian
// targets -- confirm.
"stxv 33, 0(%3) \n\t"
|
||||
"stxv 32, 16(%3) \n\t"
|
||||
"stxv 35, 32(%3) \n\t"
|
||||
"stxv 34, 48(%3) \n\t"
|
||||
"stxv 37, 64(%3) \n\t"
|
||||
"stxv 36, 80(%3) \n\t"
|
||||
"stxv 39, 96(%3) \n\t"
|
||||
"stxv 38, 112(%3) \n\t"
|
||||
"lxvp 32, 0(%2) \n\t"
|
||||
"lxvp 34, 32(%2) \n\t"
|
||||
"lxvp 36, 64(%2) \n\t"
|
||||
"lxvp 38, 96(%2) \n\t"
|
||||
|
||||
"stxv 41, 128(%3) \n\t"
|
||||
"stxv 40, 144(%3) \n\t"
|
||||
"stxv 43, 160(%3) \n\t"
|
||||
"stxv 42, 176(%3) \n\t"
|
||||
"stxv 45, 192(%3) \n\t"
|
||||
"stxv 44, 208(%3) \n\t"
|
||||
"stxv 47, 224(%3) \n\t"
|
||||
"stxv 46, 240(%3) \n\t"
|
||||
"lxvp 40, 128(%2) \n\t"
|
||||
"lxvp 42, 160(%2) \n\t"
|
||||
"lxvp 44, 192(%2) \n\t"
|
||||
"lxvp 46, 224(%2) \n\t"
|
||||
|
||||
|
||||
"addi %3, %3, 256 \n\t"
|
||||
"addi %2, %2, 256 \n\t"
|
||||
|
||||
"addic. %1, %1, -32 \n\t"
|
||||
"bgt one%= \n"
|
||||
|
||||
"two%=: \n\t"
|
||||
|
||||
// Epilogue: drain the last block that was loaded.
"stxv 33, 0(%3) \n\t"
|
||||
"stxv 32, 16(%3) \n\t"
|
||||
"stxv 35, 32(%3) \n\t"
|
||||
"stxv 34, 48(%3) \n\t"
|
||||
"stxv 37, 64(%3) \n\t"
|
||||
"stxv 36, 80(%3) \n\t"
|
||||
"stxv 39, 96(%3) \n\t"
|
||||
"stxv 38, 112(%3) \n\t"
|
||||
"stxv 41, 128(%3) \n\t"
|
||||
"stxv 40, 144(%3) \n\t"
|
||||
"stxv 43, 160(%3) \n\t"
|
||||
"stxv 42, 176(%3) \n\t"
|
||||
"stxv 45, 192(%3) \n\t"
|
||||
"stxv 44, 208(%3) \n\t"
|
||||
"stxv 47, 224(%3) \n\t"
|
||||
"stxv 46, 240(%3) \n\t"
|
||||
|
||||
// Assembler-level comment that records the operand mapping in the output.
"#n=%1 x=%4=%2 y=%0=%3"
|
||||
:
|
||||
// Outputs: %0 is the memory form of y; %1..%3 are read-write registers.
"=m" (*y),
|
||||
"+r" (n), // 1
|
||||
"+b" (x), // 2
|
||||
"+b" (y) // 3
|
||||
:
|
||||
// Input: %4 is the memory form of x.
"m" (*x)
|
||||
:
|
||||
// Clobbers: cr0 is written by addic.; vs32..vs47 hold the data in flight.
"cr0",
|
||||
"vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
|
||||
"vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47"
|
||||
);
|
||||
}
|
|
@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#include "common.h"
|
||||
|
||||
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||
#include "copy_microk_power10.c"
|
||||
#include "ccopy_microk_power10.c"
|
||||
#endif
|
||||
|
||||
#ifndef HAVE_KERNEL
|
||||
|
@ -86,7 +86,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
|||
if ( (inc_x == 1) && (inc_y == 1 ))
|
||||
{
|
||||
|
||||
BLASLONG n1 = n & -64;
|
||||
BLASLONG n1 = n & -32;
|
||||
if ( n1 > 0 )
|
||||
{
|
||||
copy_kernel(n1, x, y);
|
||||
|
|
|
@ -28,6 +28,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#else
|
||||
|
||||
#include "common.h"
|
||||
#if defined(POWER10)
|
||||
#include "cdot_microk_power10.c"
|
||||
#else
|
||||
#ifndef HAVE_KERNEL_8
|
||||
#include <altivec.h>
|
||||
|
||||
|
@ -99,6 +102,7 @@ static void cdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, float *dot)
|
|||
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {
|
||||
|
@ -116,7 +120,11 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA
|
|||
|
||||
if ((inc_x == 1) && (inc_y == 1)) {
|
||||
|
||||
#if defined(POWER10)
|
||||
BLASLONG n1 = n & -16;
|
||||
#else
|
||||
BLASLONG n1 = n & -8;
|
||||
#endif
|
||||
BLASLONG j=0;
|
||||
|
||||
if (n1){
|
||||
|
|
|
@ -0,0 +1,177 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2021, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define HAVE_KERNEL_8 1
|
||||
|
||||
/*
 * POWER10 inner kernel for the single-precision complex dot product.
 *
 *   n    element count; reduced by 16 for each pass, and each pass consumes
 *        128 bytes of x and 128 bytes of y.
 *   x,y  input vectors (advanced inside the asm).
 *   dot  receives four partial sums written with one 16-byte stxv.
 *
 * Per the inline comments the partial sums are the accumulated x_r*y_r,
 * x_i*y_i, x_r*y_i and x_i*y_r products; combining them into a real and an
 * imaginary part is not done here. The lane order inside dot[] is not
 * evident from this function alone.
 *
 * One block is always loaded up front and always accumulated after the loop.
 * NOTE(review): this assumes n is a positive multiple of 16 -- confirm
 * against the caller.
 * NOTE(review): the "=m"(*dot) output names only dot[0] although stxv writes
 * 16 bytes; the caller must provide room for four floats -- confirm.
 */
static void cdot_kernel_8 (long n, float *x, float *y, float *dot)
|
||||
{
|
||||
// Byte-permute control used to build the rearranged copy of y for the cross
// products (see the "x?_r * y?_i , x?_i * y?_r" comments below).
__vector unsigned char mask = { 11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4};
|
||||
__asm__
|
||||
(
|
||||
// Cache-touch hints for the start of x and y.
"dcbt 0, %2 \n\t"
|
||||
"dcbt 0, %3 \n\t"
|
||||
|
||||
// Zero the eight accumulators vs32..vs39.
"xxlxor 32, 32, 32 \n\t"
|
||||
"xxlxor 33, 33, 33 \n\t"
|
||||
"xxlxor 34, 34, 34 \n\t"
|
||||
"xxlxor 35, 35, 35 \n\t"
|
||||
"xxlxor 36, 36, 36 \n\t"
|
||||
"xxlxor 37, 37, 37 \n\t"
|
||||
"xxlxor 38, 38, 38 \n\t"
|
||||
"xxlxor 39, 39, 39 \n\t"
|
||||
|
||||
// Prologue: preload 128 bytes of x into vs40..vs47 and of y into vs48..vs55.
"lxvp 40, 0(%2) \n\t"
|
||||
"lxvp 42, 32(%2) \n\t"
|
||||
"lxvp 44, 64(%2) \n\t"
|
||||
"lxvp 46, 96(%2) \n\t"
|
||||
"lxvp 48, 0(%3) \n\t"
|
||||
"lxvp 50, 32(%3) \n\t"
|
||||
"lxvp 52, 64(%3) \n\t"
|
||||
"lxvp 54, 96(%3) \n\t"
|
||||
|
||||
// vs56..vs63 = the y registers permuted by mask.
"xxperm 56, 48, %x7 \n\t"
|
||||
"xxperm 57, 49, %x7 \n\t"
|
||||
"xxperm 58, 50, %x7 \n\t"
|
||||
"xxperm 59, 51, %x7 \n\t"
|
||||
|
||||
"xxperm 60, 52, %x7 \n\t"
|
||||
"xxperm 61, 53, %x7 \n\t"
|
||||
"xxperm 62, 54, %x7 \n\t"
|
||||
"xxperm 63, 55, %x7 \n\t"
|
||||
|
||||
"addi %2, %2, 128 \n\t"
|
||||
"addi %3, %3, 128 \n\t"
|
||||
|
||||
"addic. %1, %1, -16 \n\t"
|
||||
// Only one block to process: go straight to the final accumulation.
"ble two%= \n\t"
|
||||
|
||||
".align 5 \n"
|
||||
"one%=: \n\t"
|
||||
|
||||
// Steady state: accumulate the block already in registers while reloading
// each register pair for the next pass as soon as its last use has issued.
"xvmaddasp 32, 40, 48 \n\t" // x0_r * y0_r , x0_i * y0_i
|
||||
"xvmaddasp 34, 41, 49 \n\t" // x1_r * y1_r , x1_i * y1_i
|
||||
"lxvp 48, 0(%3) \n\t"
|
||||
|
||||
"xvmaddasp 36, 42, 50 \n\t" // x2_r * y2_r , x2_i * y2_i
|
||||
"xvmaddasp 38, 43, 51 \n\t" // x3_r * y3_r , x3_i * y3_i
|
||||
"lxvp 50, 32(%3) \n\t"
|
||||
|
||||
"xvmaddasp 33, 40, 56 \n\t" // x0_r * y0_i , x0_i * y0_r
|
||||
"xvmaddasp 35, 41, 57 \n\t" // x1_r * y1_i , x1_i * y1_r
|
||||
"lxvp 40, 0(%2) \n\t"
|
||||
|
||||
"xvmaddasp 37, 42, 58 \n\t" // x2_r * y2_i , x2_i * y2_r
|
||||
"xvmaddasp 39, 43, 59 \n\t" // x3_r * y3_i , x3_i * y3_r
|
||||
"lxvp 42, 32(%2) \n\t"
|
||||
|
||||
"xxperm 56, 48, %x7 \n\t"
|
||||
"xxperm 57, 49, %x7 \n\t"
|
||||
"xxperm 58, 50, %x7 \n\t"
|
||||
"xxperm 59, 51, %x7 \n\t"
|
||||
|
||||
"xvmaddasp 32, 44, 52 \n\t" // x0_r * y0_r , x0_i * y0_i
|
||||
"xvmaddasp 34, 45, 53 \n\t" // x1_r * y1_r , x1_i * y1_i
|
||||
"lxvp 52, 64(%3) \n\t"
|
||||
|
||||
"xvmaddasp 36, 46, 54 \n\t" // x2_r * y2_r , x2_i * y2_i
|
||||
"xvmaddasp 38, 47, 55 \n\t" // x3_r * y3_r , x3_i * y3_i
|
||||
"lxvp 54, 96(%3) \n\t"
|
||||
|
||||
"xvmaddasp 33, 44, 60 \n\t" // x0_r * y0_i , x0_i * y0_r
|
||||
"xvmaddasp 35, 45, 61 \n\t" // x1_r * y1_i , x1_i * y1_r
|
||||
"lxvp 44, 64(%2) \n\t"
|
||||
"xvmaddasp 37, 46, 62 \n\t" // x2_r * y2_i , x2_i * y2_r
|
||||
"xvmaddasp 39, 47, 63 \n\t" // x3_r * y3_i , x3_i * y3_r
|
||||
"lxvp 46, 96(%2) \n\t"
|
||||
|
||||
"xxperm 60, 52, %x7 \n\t"
|
||||
"xxperm 61, 53, %x7 \n\t"
|
||||
"xxperm 62, 54, %x7 \n\t"
|
||||
"xxperm 63, 55, %x7 \n\t"
|
||||
|
||||
"addi %2, %2, 128 \n\t"
|
||||
"addi %3, %3, 128 \n\t"
|
||||
|
||||
"addic. %1, %1, -16 \n\t"
|
||||
"bgt one%= \n"
|
||||
|
||||
"two%=: \n\t"
|
||||
|
||||
// Epilogue: accumulate the last block that was loaded (no further loads).
"xvmaddasp 32, 40, 48 \n\t" // x0_r * y0_r , x0_i * y0_i
|
||||
"xvmaddasp 34, 41, 49 \n\t" // x1_r * y1_r , x1_i * y1_i
|
||||
"xvmaddasp 36, 42, 50 \n\t" // x2_r * y2_r , x2_i * y2_i
|
||||
"xvmaddasp 38, 43, 51 \n\t" // x3_r * y3_r , x3_i * y3_i
|
||||
|
||||
"xvmaddasp 33, 40, 56 \n\t" // x0_r * y0_i , x0_i * y0_r
|
||||
"xvmaddasp 35, 41, 57 \n\t" // x1_r * y1_i , x1_i * y1_r
|
||||
"xvmaddasp 37, 42, 58 \n\t" // x2_r * y2_i , x2_i * y2_r
|
||||
"xvmaddasp 39, 43, 59 \n\t" // x3_r * y3_i , x3_i * y3_r
|
||||
|
||||
"xvmaddasp 32, 44, 52 \n\t" // x0_r * y0_r , x0_i * y0_i
|
||||
"xvmaddasp 34, 45, 53 \n\t" // x1_r * y1_r , x1_i * y1_i
|
||||
"xvmaddasp 36, 46, 54 \n\t" // x2_r * y2_r , x2_i * y2_i
|
||||
"xvmaddasp 38, 47, 55 \n\t" // x3_r * y3_r , x3_i * y3_i
|
||||
|
||||
"xvmaddasp 33, 44, 60 \n\t" // x0_r * y0_i , x0_i * y0_r
|
||||
"xvmaddasp 35, 45, 61 \n\t" // x1_r * y1_i , x1_i * y1_r
|
||||
"xvmaddasp 37, 46, 62 \n\t" // x2_r * y2_i , x2_i * y2_r
|
||||
"xvmaddasp 39, 47, 63 \n\t" // x3_r * y3_i , x3_i * y3_r
|
||||
|
||||
// Reduction: fold the even accumulators together and the odd accumulators
// together, add the two doubleword halves of each result, then merge one
// doubleword from each into vs34 and store it to dot.
"xvaddsp 32, 32, 34 \n\t"
|
||||
"xvaddsp 36, 36, 38 \n\t"
|
||||
|
||||
"xvaddsp 33, 33, 35 \n\t"
|
||||
"xvaddsp 37, 37, 39 \n\t"
|
||||
|
||||
"xvaddsp 35, 32, 36 \n\t"
|
||||
"xvaddsp 34, 33, 37 \n\t"
|
||||
"xxswapd 32, 35 \n\t"
|
||||
"xxswapd 33, 34 \n\t"
|
||||
"xvaddsp 35, 35, 32 \n\t"
|
||||
"xvaddsp 34, 34, 33 \n\t"
|
||||
"xxpermdi 34, 34, 35, 2 \n\t"
|
||||
"stxv 34, 0(%6) \n\t"
|
||||
|
||||
// Assembler-level comment that records the operand mapping in the output.
"#n=%1 x=%4=%2 y=%5=%3 dot=%0=%6"
|
||||
:
|
||||
// Outputs: %0 is the memory form of dot; %1..%3 are read-write registers.
"=m" (*dot),
|
||||
"+r" (n), // 1
|
||||
"+b" (x), // 2
|
||||
"+b" (y) // 3
|
||||
:
|
||||
// Inputs: %4/%5 are the memory forms of x and y; %7 is the permute control.
"m" (*x),
|
||||
"m" (*y),
|
||||
"b" (dot), // 6
|
||||
"wa" (mask)
|
||||
:
|
||||
// Clobbers: cr0 is written by addic.; vs32..vs63 are all used as scratch.
"cr0",
|
||||
"vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
|
||||
"vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
|
||||
"vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55",
|
||||
"vs56","vs57","vs58","vs59","vs60","vs61","vs62","vs63"
|
||||
);
|
||||
}
|
|
@ -62,38 +62,39 @@ static void copy_kernel (BLASLONG n, FLOAT *x, FLOAT *y)
|
|||
"one%=: \n\t"
|
||||
|
||||
"stxvp 32, 0(%3) \n\t"
|
||||
"lxvp 32, 0(%2) \n\t"
|
||||
"stxvp 34, 32(%3) \n\t"
|
||||
"lxvp 34, 32(%2) \n\t"
|
||||
"stxvp 36, 64(%3) \n\t"
|
||||
"lxvp 36, 64(%2) \n\t"
|
||||
"stxvp 38, 96(%3) \n\t"
|
||||
"lxvp 32, 0(%2) \n\t"
|
||||
"lxvp 34, 32(%2) \n\t"
|
||||
"lxvp 36, 64(%2) \n\t"
|
||||
"lxvp 38, 96(%2) \n\t"
|
||||
|
||||
"stxvp 40, 128(%3) \n\t"
|
||||
"lxvp 40, 128(%2) \n\t"
|
||||
"stxvp 42, 160(%3) \n\t"
|
||||
"lxvp 42, 160(%2) \n\t"
|
||||
"stxvp 44, 192(%3) \n\t"
|
||||
"lxvp 44, 192(%2) \n\t"
|
||||
"stxvp 46, 224(%3) \n\t"
|
||||
"lxvp 40, 128(%2) \n\t"
|
||||
"lxvp 42, 160(%2) \n\t"
|
||||
"lxvp 44, 192(%2) \n\t"
|
||||
"lxvp 46, 224(%2) \n\t"
|
||||
|
||||
"stxvp 48, 256(%3) \n\t"
|
||||
"lxvp 48, 256(%2) \n\t"
|
||||
"stxvp 50, 288(%3) \n\t"
|
||||
"lxvp 50, 288(%2) \n\t"
|
||||
"stxvp 52, 320(%3) \n\t"
|
||||
"lxvp 52, 320(%2) \n\t"
|
||||
"stxvp 54, 352(%3) \n\t"
|
||||
"lxvp 48, 256(%2) \n\t"
|
||||
"lxvp 50, 288(%2) \n\t"
|
||||
"lxvp 52, 320(%2) \n\t"
|
||||
"lxvp 54, 352(%2) \n\t"
|
||||
|
||||
"stxvp 56, 384(%3) \n\t"
|
||||
"lxvp 56, 384(%2) \n\t"
|
||||
"stxvp 58, 416(%3) \n\t"
|
||||
"lxvp 58, 416(%2) \n\t"
|
||||
"stxvp 60, 448(%3) \n\t"
|
||||
"lxvp 60, 448(%2) \n\t"
|
||||
"stxvp 62, 480(%3) \n\t"
|
||||
"lxvp 56, 384(%2) \n\t"
|
||||
"lxvp 58, 416(%2) \n\t"
|
||||
"lxvp 60, 448(%2) \n\t"
|
||||
"lxvp 62, 480(%2) \n\t"
|
||||
|
||||
"addi %3, %3, 512 \n\t"
|
||||
|
|
|
@ -0,0 +1,176 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2021, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define HAVE_KERNEL_8 1
|
||||
|
||||
/*
 * POWER10 kernel that scales single-precision complex data in place.
 *
 *   n        element count; reduced by 16 for each 128-byte pass.
 *   x        data to scale; read and written through the same pointer.
 *   alpha_r  real part of the scale factor.
 *   alpha_i  imaginary part of the scale factor.
 *
 * Each result vector is x*alpha_r plus a permuted copy of x multiplied by
 * {-alpha_i, alpha_i, ...}. Per the inline comments that yields
 * (x_r*alpha_r - x_i*alpha_i, x_i*alpha_r + x_r*alpha_i) per element.
 *
 * One block is always loaded up front and always scaled after the loop.
 * NOTE(review): this assumes n is a positive multiple of 16 -- confirm
 * against the caller.
 * NOTE(review): the name keeps a z prefix although the operands are float;
 * presumably it matches what the including scal source calls -- confirm.
 */
static void zscal_kernel_8 (long n, float *x, float alpha_r, float alpha_i)
|
||||
{
|
||||
// Sign pattern applied to the permuted copy of x.
__vector float t0 = {-alpha_i, alpha_i, -alpha_i, alpha_i};
|
||||
// Byte-permute control used to build that permuted copy.
__vector unsigned char mask = { 11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4};
|
||||
__asm__
|
||||
(
|
||||
"dcbt 0, %2 \n\t"
|
||||
// vs32 = alpha_r converted to single precision and replicated to all lanes.
"xscvdpspn 32, %x3 \n\t"
|
||||
"xxspltw 32, 32, 0 \n\t"
|
||||
|
||||
// Prologue: preload the first 128 bytes of x into vs40..vs47.
"lxvp 40, 0(%2) \n\t"
|
||||
"lxvp 42, 32(%2) \n\t"
|
||||
"lxvp 44, 64(%2) \n\t"
|
||||
"lxvp 46, 96(%2) \n\t"
|
||||
|
||||
"addic. %1, %1, -16 \n\t"
|
||||
// Only one block to process: go straight to the final pass.
"ble two%= \n\t"
|
||||
|
||||
".align 5 \n"
|
||||
"one%=: \n\t"
|
||||
|
||||
"xvmulsp 48, 40, 32 \n\t" // x0_r * alpha_r, x0_i * alpha_r
|
||||
"xvmulsp 49, 41, 32 \n\t"
|
||||
"xvmulsp 50, 42, 32 \n\t"
|
||||
"xvmulsp 51, 43, 32 \n\t"
|
||||
"xvmulsp 52, 44, 32 \n\t"
|
||||
"xvmulsp 53, 45, 32 \n\t"
|
||||
"xvmulsp 54, 46, 32 \n\t"
|
||||
"xvmulsp 55, 47, 32 \n\t"
|
||||
|
||||
// Permuted copies of the inputs go to vs34..vs39 and vs56..vs57.
"xxperm 34, 40, %x5 \n\t"
|
||||
"xxperm 35, 41, %x5 \n\t"
|
||||
"xxperm 36, 42, %x5 \n\t"
|
||||
"xxperm 37, 43, %x5 \n\t"
|
||||
"xxperm 38, 44, %x5 \n\t"
|
||||
"xxperm 39, 45, %x5 \n\t"
|
||||
"xxperm 56, 46, %x5 \n\t"
|
||||
"xxperm 57, 47, %x5 \n\t"
|
||||
|
||||
// The loads for the next pass (offsets 128..224) are interleaved with the
// remaining multiplies; x itself is advanced only after the stores below.
"xvmulsp 34, 34, %x4 \n\t" // x0_i * -alpha_i, x0_r * alpha_i
|
||||
"xvmulsp 35, 35, %x4 \n\t"
|
||||
|
||||
"lxvp 40, 128(%2) \n\t"
|
||||
|
||||
"xvmulsp 36, 36, %x4 \n\t"
|
||||
"xvmulsp 37, 37, %x4 \n\t"
|
||||
|
||||
"lxvp 42, 160(%2) \n\t"
|
||||
|
||||
"xvmulsp 38, 38, %x4 \n\t"
|
||||
"xvmulsp 39, 39, %x4 \n\t"
|
||||
|
||||
"lxvp 44, 192(%2) \n\t"
|
||||
|
||||
"xvmulsp 56, 56, %x4 \n\t"
|
||||
"xvmulsp 57, 57, %x4 \n\t"
|
||||
|
||||
"lxvp 46, 224(%2) \n\t"
|
||||
|
||||
// Sum the two products and write the current 128 bytes back over x.
"xvaddsp 48, 48, 34 \n\t"
|
||||
"xvaddsp 49, 49, 35 \n\t"
|
||||
"xvaddsp 50, 50, 36 \n\t"
|
||||
"xvaddsp 51, 51, 37 \n\t"
|
||||
|
||||
"stxvp 48, 0(%2) \n\t"
|
||||
|
||||
"xvaddsp 52, 52, 38 \n\t"
|
||||
"xvaddsp 53, 53, 39 \n\t"
|
||||
|
||||
"stxvp 50, 32(%2) \n\t"
|
||||
|
||||
"xvaddsp 54, 54, 56 \n\t"
|
||||
"xvaddsp 55, 55, 57 \n\t"
|
||||
|
||||
"stxvp 52, 64(%2) \n\t"
|
||||
"stxvp 54, 96(%2) \n\t"
|
||||
|
||||
"addi %2, %2, 128 \n\t"
|
||||
|
||||
"addic. %1, %1, -16 \n\t"
|
||||
"bgt one%= \n"
|
||||
|
||||
"two%=: \n\t"
|
||||
|
||||
// Epilogue: same arithmetic for the last loaded block, with no further loads.
"xvmulsp 48, 40, 32 \n\t" // x0_r * alpha_r, x0_i * alpha_r
|
||||
"xvmulsp 49, 41, 32 \n\t"
|
||||
"xvmulsp 50, 42, 32 \n\t"
|
||||
"xvmulsp 51, 43, 32 \n\t"
|
||||
"xvmulsp 52, 44, 32 \n\t"
|
||||
"xvmulsp 53, 45, 32 \n\t"
|
||||
"xvmulsp 54, 46, 32 \n\t"
|
||||
"xvmulsp 55, 47, 32 \n\t"
|
||||
|
||||
"xxperm 34, 40, %x5 \n\t"
|
||||
"xxperm 35, 41, %x5 \n\t"
|
||||
"xxperm 36, 42, %x5 \n\t"
|
||||
"xxperm 37, 43, %x5 \n\t"
|
||||
"xxperm 38, 44, %x5 \n\t"
|
||||
"xxperm 39, 45, %x5 \n\t"
|
||||
"xxperm 56, 46, %x5 \n\t"
|
||||
"xxperm 57, 47, %x5 \n\t"
|
||||
|
||||
|
||||
"xvmulsp 34, 34, %x4 \n\t" // x0_i * -alpha_i, x0_r * alpha_i
|
||||
"xvmulsp 35, 35, %x4 \n\t"
|
||||
"xvmulsp 36, 36, %x4 \n\t"
|
||||
"xvmulsp 37, 37, %x4 \n\t"
|
||||
"xvmulsp 38, 38, %x4 \n\t"
|
||||
"xvmulsp 39, 39, %x4 \n\t"
|
||||
"xvmulsp 56, 56, %x4 \n\t"
|
||||
"xvmulsp 57, 57, %x4 \n\t"
|
||||
|
||||
"xvaddsp 48, 48, 34 \n\t"
|
||||
"xvaddsp 49, 49, 35 \n\t"
|
||||
"xvaddsp 50, 50, 36 \n\t"
|
||||
"xvaddsp 51, 51, 37 \n\t"
|
||||
|
||||
"stxvp 48, 0(%2) \n\t"
|
||||
|
||||
"xvaddsp 52, 52, 38 \n\t"
|
||||
"xvaddsp 53, 53, 39 \n\t"
|
||||
|
||||
"stxvp 50, 32(%2) \n\t"
|
||||
|
||||
"xvaddsp 54, 54, 56 \n\t"
|
||||
"xvaddsp 55, 55, 57 \n\t"
|
||||
|
||||
"stxvp 52, 64(%2) \n\t"
|
||||
"stxvp 54, 96(%2) \n\t"
|
||||
|
||||
// Assembler-level comment that records the operand mapping in the output.
"#n=%1 x=%0=%2 alpha=(%3,%4)\n"
|
||||
:
|
||||
// Outputs: %0 is the read-write memory form of x; %1/%2 are read-write registers.
"+m" (*x),
|
||||
"+r" (n), // 1
|
||||
"+b" (x) // 2
|
||||
:
|
||||
"f" (alpha_r), // 3
|
||||
"wa" (t0), // 4
|
||||
"wa" (mask) // 5
|
||||
:
|
||||
// Clobbers: cr0 is written by addic.; vs32..vs57 are used as scratch.
"cr0",
|
||||
"vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
|
||||
"vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
|
||||
"vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55",
|
||||
"vs56","vs57"
|
||||
);
|
||||
}
|
|
@ -36,9 +36,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#include "common.h"
|
||||
|
||||
|
||||
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||
#if defined(POWER8) || defined(POWER9)
|
||||
#include "cswap_microk_power8.c"
|
||||
#elif defined(POWER10)
|
||||
#include "cswap_microk_power10.c"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
|
|
@ -0,0 +1,127 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2021, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
/*
 * POWER10 kernel that exchanges the contents of x and y, 256 bytes from each
 * per pass.
 *
 * Built as zswap_kernel_16 (double operands, n reduced by 16 per pass) when
 * DOUBLE is defined, otherwise as cswap_kernel_32 (float operands, n reduced
 * by 32 per pass).
 *
 *   n    element count.
 *   x,y  buffers to exchange (both advanced inside the asm).
 *
 * There is no entry check: the loop body always runs once and repeats while
 * n remains greater than zero.
 * NOTE(review): this assumes n is a positive multiple of the per-pass count
 * -- confirm against the caller.
 */
#if defined(DOUBLE)
|
||||
#define HAVE_KERNEL_16 1
|
||||
static void zswap_kernel_16 (long n, double *x, double *y)
|
||||
#else
|
||||
#define HAVE_KERNEL_32 1
|
||||
static void cswap_kernel_32 (long n, float *x, float *y)
|
||||
#endif
|
||||
{
|
||||
__asm__
|
||||
(
|
||||
".align 5 \n"
|
||||
"one%=: \n\t"
|
||||
// Read 256 bytes of y into vs32..vs47 and 256 bytes of x into vs48..vs63
// before anything is written, so neither side is overwritten early.
"lxvp 32, 0(%4) \n\t"
|
||||
"lxvp 34, 32(%4) \n\t"
|
||||
"lxvp 36, 64(%4) \n\t"
|
||||
"lxvp 38, 96(%4) \n\t"
|
||||
|
||||
"lxvp 40, 128(%4) \n\t"
|
||||
"lxvp 42, 160(%4) \n\t"
|
||||
"lxvp 44, 192(%4) \n\t"
|
||||
"lxvp 46, 224(%4) \n\t"
|
||||
|
||||
"lxvp 48, 0(%3) \n\t"
|
||||
"lxvp 50, 32(%3) \n\t"
|
||||
"lxvp 52, 64(%3) \n\t"
|
||||
"lxvp 54, 96(%3) \n\t"
|
||||
|
||||
"lxvp 56, 128(%3) \n\t"
|
||||
"lxvp 58, 160(%3) \n\t"
|
||||
"lxvp 60, 192(%3) \n\t"
|
||||
"lxvp 62, 224(%3) \n\t"
|
||||
|
||||
|
||||
// Write the saved y data over x, 128 bytes at a time, advancing x as we go.
// NOTE(review): the odd register of every pair is stored at the lower
// address; presumably this mirrors how lxvp fills a pair on little-endian
// targets -- confirm.
"stxv 33, 0(%3) \n\t"
|
||||
"stxv 32, 16(%3) \n\t"
|
||||
"stxv 35, 32(%3) \n\t"
|
||||
"stxv 34, 48(%3) \n\t"
|
||||
"stxv 37, 64(%3) \n\t"
|
||||
"stxv 36, 80(%3) \n\t"
|
||||
"stxv 39, 96(%3) \n\t"
|
||||
"stxv 38, 112(%3) \n\t"
|
||||
|
||||
"addi %3, %3, 128 \n\t"
|
||||
|
||||
"stxv 41, 0(%3) \n\t"
|
||||
"stxv 40, 16(%3) \n\t"
|
||||
"stxv 43, 32(%3) \n\t"
|
||||
"stxv 42, 48(%3) \n\t"
|
||||
"stxv 45, 64(%3) \n\t"
|
||||
"stxv 44, 80(%3) \n\t"
|
||||
"stxv 47, 96(%3) \n\t"
|
||||
"stxv 46, 112(%3) \n\t"
|
||||
|
||||
"addi %3, %3, 128 \n\t"
|
||||
|
||||
// Write the saved x data over y in the same way, advancing y.
"stxv 49, 0(%4) \n\t"
|
||||
"stxv 48, 16(%4) \n\t"
|
||||
"stxv 51, 32(%4) \n\t"
|
||||
"stxv 50, 48(%4) \n\t"
|
||||
"stxv 53, 64(%4) \n\t"
|
||||
"stxv 52, 80(%4) \n\t"
|
||||
"stxv 55, 96(%4) \n\t"
|
||||
"stxv 54, 112(%4) \n\t"
|
||||
|
||||
"addi %4, %4, 128 \n\t"
|
||||
|
||||
"stxv 57, 0(%4) \n\t"
|
||||
"stxv 56, 16(%4) \n\t"
|
||||
"stxv 59, 32(%4) \n\t"
|
||||
"stxv 58, 48(%4) \n\t"
|
||||
"stxv 61, 64(%4) \n\t"
|
||||
"stxv 60, 80(%4) \n\t"
|
||||
"stxv 63, 96(%4) \n\t"
|
||||
"stxv 62, 112(%4) \n\t"
|
||||
|
||||
"addi %4, %4, 128 \n\t"
|
||||
|
||||
// Elements consumed per pass depend on the element width selected above.
#if defined(DOUBLE)
|
||||
"addic. %2, %2, -16 \n\t"
|
||||
#else
|
||||
"addic. %2, %2, -32 \n\t"
|
||||
#endif
|
||||
"bgt one%= \n"
|
||||
|
||||
// Assembler-level comment that records the operand mapping in the output.
"#n=%2 x=%0=%3 y=%1=%4"
|
||||
:
|
||||
// Outputs: %0/%1 are the read-write memory forms of x and y; %2..%4 are
// read-write registers. There are no pure inputs.
"+m" (*x),
|
||||
"+m" (*y),
|
||||
"+r" (n), // 2
|
||||
"+b" (x), // 3
|
||||
"+b" (y) // 4
|
||||
:
|
||||
:
|
||||
// Clobbers: cr0 is written by addic.; vs32..vs63 hold the data in flight.
"cr0",
|
||||
"vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
|
||||
"vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
|
||||
"vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55",
|
||||
"vs56","vs57","vs58","vs59","vs60","vs61","vs62","vs63"
|
||||
);
|
||||
}
|
|
@ -46,9 +46,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#endif
|
||||
|
||||
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||
#if defined(POWER8) || defined(POWER9)
|
||||
#include "dasum_microk_power8.c"
|
||||
#elif defined(POWER10)
|
||||
#include "dasum_microk_power10.c"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
@ -110,6 +112,21 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
|||
if ( inc_x == 1 )
|
||||
{
|
||||
|
||||
#if defined(POWER10)
|
||||
if ( n >= 16 )
|
||||
{
|
||||
BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3;
|
||||
for (i = 0; i < align; i++) {
|
||||
sumf += ABS(x[i]);
|
||||
}
|
||||
}
|
||||
n1 = (n-i) & -16;
|
||||
if ( n1 > 0 )
|
||||
{
|
||||
sumf += dasum_kernel_16(n1, &x[i]);
|
||||
i+=n1;
|
||||
}
|
||||
#else
|
||||
n1 = n & -16;
|
||||
if ( n1 > 0 )
|
||||
{
|
||||
|
@ -117,6 +134,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
|||
sumf = dasum_kernel_16(n1, x);
|
||||
i=n1;
|
||||
}
|
||||
#endif
|
||||
|
||||
while(i < n)
|
||||
{
|
||||
|
|
|
@ -0,0 +1,152 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2021, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define HAVE_KERNEL_16 1
|
||||
|
||||
/*
 * dasum_kernel_16: returns the sum of the absolute values of n doubles
 * starting at x.
 *
 * Each pass consumes 16 doubles (128 bytes).  The first batch is loaded
 * before n is tested and n is only ever decremented by 16, so n is expected
 * to be a positive multiple of 16.
 *
 * Register use: vs32..vs39 are eight running partial sums, vs40..vs47 hold
 * the batch being processed, vs48..vs51 and t0..t3 hold the absolute values.
 */
static double dasum_kernel_16 (long n, double *x)
|
||||
{
|
||||
double sum;
|
||||
__vector double t0;
|
||||
__vector double t1;
|
||||
__vector double t2;
|
||||
__vector double t3;
|
||||
|
||||
__asm__
|
||||
(
|
||||
// cache-touch hint for the start of x
"dcbt 0, %2 \n\t"
|
||||
|
||||
// clear the eight accumulators vs32..vs39
"xxlxor 32, 32, 32 \n\t"
|
||||
"xxlxor 33, 33, 33 \n\t"
|
||||
"xxlxor 34, 34, 34 \n\t"
|
||||
"xxlxor 35, 35, 35 \n\t"
|
||||
"xxlxor 36, 36, 36 \n\t"
|
||||
"xxlxor 37, 37, 37 \n\t"
|
||||
"xxlxor 38, 38, 38 \n\t"
|
||||
"xxlxor 39, 39, 39 \n\t"
|
||||
|
||||
// preload the first 128 bytes of x into vs40..vs47 (four paired loads)
"lxvp 40, 0(%2) \n\t"
|
||||
"lxvp 42, 32(%2) \n\t"
|
||||
"lxvp 44, 64(%2) \n\t"
|
||||
"lxvp 46, 96(%2) \n\t"
|
||||
|
||||
// advance x past the batch just loaded
"addi %2, %2, 128 \n\t"
|
||||
|
||||
// n -= 16; when nothing remains, skip the loop and only reduce the preloaded batch
"addic. %1, %1, -16 \n\t"
|
||||
"ble two%= \n\t"
|
||||
|
||||
".align 5 \n"
|
||||
// main loop: take |.| of the current batch while loading the next one into the same registers
"one%=: \n\t"
|
||||
|
||||
"xvabsdp 48, 40 \n\t"
|
||||
"xvabsdp 49, 41 \n\t"
|
||||
"xvabsdp 50, 42 \n\t"
|
||||
"xvabsdp 51, 43 \n\t"
|
||||
// vs40/vs41 have been consumed above, so they can be reloaded
"lxvp 40, 0(%2) \n\t"
|
||||
|
||||
|
||||
"xvabsdp %x3, 44 \n\t"
|
||||
"xvabsdp %x4, 45 \n\t"
|
||||
"lxvp 42, 32(%2) \n\t"
|
||||
|
||||
|
||||
"xvabsdp %x5, 46 \n\t"
|
||||
"xvabsdp %x6, 47 \n\t"
|
||||
"lxvp 44, 64(%2) \n\t"
|
||||
|
||||
|
||||
// accumulate the absolute values into the partial sums
"xvadddp 32, 32, 48 \n\t"
|
||||
"xvadddp 33, 33, 49 \n\t"
|
||||
|
||||
"lxvp 46, 96(%2) \n\t"
|
||||
|
||||
"xvadddp 34, 34, 50 \n\t"
|
||||
"xvadddp 35, 35, 51 \n\t"
|
||||
"addi %2, %2, 128 \n\t"
|
||||
"xvadddp 36, 36, %x3 \n\t"
|
||||
"xvadddp 37, 37, %x4 \n\t"
|
||||
// n -= 16; the record form sets cr0, which the bgt below tests
"addic. %1, %1, -16 \n\t"
|
||||
"xvadddp 38, 38, %x5 \n\t"
|
||||
"xvadddp 39, 39, %x6 \n\t"
|
||||
|
||||
"bgt one%= \n"
|
||||
|
||||
// tail: process the last loaded batch without issuing further loads
"two%=: \n\t"
|
||||
|
||||
"xvabsdp 48, 40 \n\t"
|
||||
"xvabsdp 49, 41 \n\t"
|
||||
"xvabsdp 50, 42 \n\t"
|
||||
"xvabsdp 51, 43 \n\t"
|
||||
"xvabsdp %x3, 44 \n\t"
|
||||
"xvabsdp %x4, 45 \n\t"
|
||||
"xvabsdp %x5, 46 \n\t"
|
||||
"xvabsdp %x6, 47 \n\t"
|
||||
|
||||
"xvadddp 32, 32, 48 \n\t"
|
||||
"xvadddp 33, 33, 49 \n\t"
|
||||
"xvadddp 34, 34, 50 \n\t"
|
||||
"xvadddp 35, 35, 51 \n\t"
|
||||
"xvadddp 36, 36, %x3 \n\t"
|
||||
"xvadddp 37, 37, %x4 \n\t"
|
||||
"xvadddp 38, 38, %x5 \n\t"
|
||||
"xvadddp 39, 39, %x6 \n\t"
|
||||
|
||||
// pairwise reduction of the eight accumulators: 8 -> 4
"xvadddp 32, 32, 33 \n\t"
|
||||
"xvadddp 34, 34, 35 \n\t"
|
||||
"xvadddp 36, 36, 37 \n\t"
|
||||
"xvadddp 38, 38, 39 \n\t"
|
||||
|
||||
// 4 -> 2
"xvadddp 32, 32, 34 \n\t"
|
||||
"xvadddp 36, 36, 38 \n\t"
|
||||
|
||||
// 2 -> 1: vs32 now holds the two-lane total
"xvadddp 32, 32, 36 \n\t"
|
||||
|
||||
// XXSWAPD_S is defined outside this view; presumably it puts vs32 with its
// doublewords swapped into vs33 so the scalar add below combines both lanes.
XXSWAPD_S(33,32)
|
||||
"xsadddp %x0, 32, 33 \n"
|
||||
|
||||
// The two "#..." strings below only annotate the generated assembly.
// NOTE(review): the first labels x as %3, but operand 3 is t0; x is operand 2.
"#n=%1 x=%3=%2 sum=%0\n"
|
||||
"#t0=%x3 t1=%x4 t2=%x5 t3=%x6"
|
||||
// outputs
:
|
||||
"=d" (sum), // 0
|
||||
"+r" (n), // 1
|
||||
"+b" (x), // 2
|
||||
"=wa" (t0), // 3
|
||||
"=wa" (t1), // 4
|
||||
"=wa" (t2), // 5
|
||||
"=wa" (t3) // 6
|
||||
// inputs: memory operand for *x
:
|
||||
"m" (*x)
|
||||
// clobbers: cr0 (written by addic.) and every VSX register named explicitly above
:
|
||||
"cr0",
|
||||
"vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
|
||||
"vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
|
||||
"vs48","vs49","vs50","vs51"
|
||||
);
|
||||
|
||||
return sum;
|
||||
}
|
||||
|
||||
|
|
@ -85,12 +85,18 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
|||
|
||||
if ( (inc_x == 1) && (inc_y == 1 ))
|
||||
{
|
||||
|
||||
BLASLONG n1 = n & -64;
|
||||
if ( n1 > 0 )
|
||||
if ( n >= 64 )
|
||||
{
|
||||
copy_kernel(n1, x, y);
|
||||
i=n1;
|
||||
BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 3) & 0x3;
|
||||
for (i = 0; i < align; i++) {
|
||||
y[i] = x[i] ;
|
||||
}
|
||||
}
|
||||
BLASLONG n1 = (n-i) & -64;
|
||||
if ( n1 )
|
||||
{
|
||||
copy_kernel(n1, &x[i], &y[i]);
|
||||
i += n1;
|
||||
}
|
||||
|
||||
while(i < n)
|
||||
|
|
|
@ -29,7 +29,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
typedef __vector unsigned char vec_t;
|
||||
typedef FLOAT v4sf_t __attribute__ ((vector_size (16)));
|
||||
typedef FLOAT v2sf_t __attribute__ ((vector_size (8)));
|
||||
#if !__has_builtin(__builtin_vsx_assemble_pair)
|
||||
#define __builtin_vsx_assemble_pair __builtin_mma_assemble_pair
|
||||
#endif
|
||||
|
||||
#if !__has_builtin(__builtin_vsx_disassemble_pair)
|
||||
#define __builtin_vsx_disassemble_pair __builtin_mma_disassemble_pair
|
||||
#endif
|
||||
|
||||
#ifdef TRMMKERNEL
|
||||
#define SAVE_ACC(ACC, J) \
|
||||
|
@ -186,8 +192,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
|
|||
vec_t *rowA = (vec_t *) & AO[0];
|
||||
vec_t *rb = (vec_t *) & BO[0];
|
||||
__vector_pair rowB, rowB1;
|
||||
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
|
||||
__builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]);
|
||||
__builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
|
||||
__builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]);
|
||||
__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
|
||||
__builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]);
|
||||
__builtin_mma_xvf64ger (&acc2, rowB, rowA[1]);
|
||||
|
@ -200,8 +206,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
|
|||
{
|
||||
rowA = (vec_t *) & AO[l << 3];
|
||||
rb = (vec_t *) & BO[l << 3];
|
||||
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
|
||||
__builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]);
|
||||
__builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
|
||||
__builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]);
|
||||
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
|
||||
__builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]);
|
||||
__builtin_mma_xvf64gerpp (&acc2, rowB, rowA[1]);
|
||||
|
@ -242,8 +248,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
|
|||
vec_t *rowA = (vec_t *) & AO[0];
|
||||
__vector_pair rowB, rowB1;
|
||||
vec_t *rb = (vec_t *) & BO[0];
|
||||
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
|
||||
__builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]);
|
||||
__builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
|
||||
__builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]);
|
||||
__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
|
||||
__builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]);
|
||||
__builtin_mma_xvf64ger (&acc2, rowB, rowA[1]);
|
||||
|
@ -252,8 +258,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
|
|||
{
|
||||
rowA = (vec_t *) & AO[l << 2];
|
||||
rb = (vec_t *) & BO[l << 3];
|
||||
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
|
||||
__builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]);
|
||||
__builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
|
||||
__builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]);
|
||||
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
|
||||
__builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]);
|
||||
__builtin_mma_xvf64gerpp (&acc2, rowB, rowA[1]);
|
||||
|
@ -286,16 +292,16 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
|
|||
vec_t *rowA = (vec_t *) & AO[0];
|
||||
__vector_pair rowB, rowB1;
|
||||
vec_t *rb = (vec_t *) & BO[0];
|
||||
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
|
||||
__builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]);
|
||||
__builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
|
||||
__builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]);
|
||||
__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
|
||||
__builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]);
|
||||
for (l = 1; l < temp; l++)
|
||||
{
|
||||
rowA = (vec_t *) & AO[l << 1];
|
||||
rb = (vec_t *) & BO[l << 3];
|
||||
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
|
||||
__builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]);
|
||||
__builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
|
||||
__builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]);
|
||||
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
|
||||
__builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]);
|
||||
}
|
||||
|
@ -398,7 +404,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
|
|||
vec_t *rowA = (vec_t *) & AO[0];
|
||||
__vector_pair rowB;
|
||||
vec_t *rb = (vec_t *) & BO[0];
|
||||
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
|
||||
__builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
|
||||
__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
|
||||
__builtin_mma_xvf64ger (&acc1, rowB, rowA[1]);
|
||||
__builtin_mma_xvf64ger (&acc2, rowB, rowA[2]);
|
||||
|
@ -407,7 +413,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
|
|||
{
|
||||
rowA = (vec_t *) & AO[l << 3];
|
||||
rb = (vec_t *) & BO[l << 2];
|
||||
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
|
||||
__builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
|
||||
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
|
||||
__builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
|
||||
__builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]);
|
||||
|
@ -440,14 +446,14 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
|
|||
vec_t *rowA = (vec_t *) & AO[0];
|
||||
__vector_pair rowB;
|
||||
vec_t *rb = (vec_t *) & BO[0];
|
||||
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
|
||||
__builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
|
||||
__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
|
||||
__builtin_mma_xvf64ger (&acc1, rowB, rowA[1]);
|
||||
for (l = 1; l < temp; l++)
|
||||
{
|
||||
rowA = (vec_t *) & AO[l << 2];
|
||||
rb = (vec_t *) & BO[l << 2];
|
||||
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
|
||||
__builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
|
||||
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
|
||||
__builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
|
||||
}
|
||||
|
@ -476,13 +482,13 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
|
|||
vec_t *rowA = (vec_t *) & AO[0];
|
||||
__vector_pair rowB;
|
||||
vec_t *rb = (vec_t *) & BO[0];
|
||||
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
|
||||
__builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
|
||||
__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
|
||||
for (l = 1; l < temp; l++)
|
||||
{
|
||||
rowA = (vec_t *) & AO[l << 1];
|
||||
rb = (vec_t *) & BO[l << 2];
|
||||
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
|
||||
__builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]);
|
||||
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
|
||||
}
|
||||
SAVE_ACC (&acc0, 0);
|
||||
|
@ -562,11 +568,9 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
|
|||
v4sf_t result[4];
|
||||
__vector_quad acc0, acc1, acc2, acc3;
|
||||
BLASLONG l = 0;
|
||||
FLOAT t[4] = { 0, 0, 0, 0 };
|
||||
t[0] = BO[0], t[1] = BO[1];
|
||||
__vector_pair rowB;
|
||||
vec_t *rb = (vec_t *) & t[0];
|
||||
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
|
||||
vec_t *rb = (vec_t *) & BO[0];
|
||||
__builtin_vsx_assemble_pair (&rowB, rb[0], rb[0]);
|
||||
vec_t *rowA = (vec_t *) & AO[0];
|
||||
__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
|
||||
__builtin_mma_xvf64ger (&acc1, rowB, rowA[1]);
|
||||
|
@ -574,9 +578,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
|
|||
__builtin_mma_xvf64ger (&acc3, rowB, rowA[3]);
|
||||
for (l = 1; l < temp; l++)
|
||||
{
|
||||
t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1];
|
||||
rb = (vec_t *) & t[0];
|
||||
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
|
||||
rb = (vec_t *) & BO[l << 1];
|
||||
__builtin_vsx_assemble_pair (&rowB, rb[0], rb[0]);
|
||||
rowA = (vec_t *) & AO[l << 3];
|
||||
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
|
||||
__builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
|
||||
|
@ -607,19 +610,16 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
|
|||
v4sf_t result[4];
|
||||
__vector_quad acc0, acc1;
|
||||
BLASLONG l = 0;
|
||||
FLOAT t[4] = { 0, 0, 0, 0 };
|
||||
t[0] = BO[0], t[1] = BO[1];
|
||||
__vector_pair rowB;
|
||||
vec_t *rb = (vec_t *) & t[0];
|
||||
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
|
||||
vec_t *rb = (vec_t *) & BO[0];
|
||||
__builtin_vsx_assemble_pair (&rowB, rb[0], rb[0]);
|
||||
vec_t *rowA = (vec_t *) & AO[0];
|
||||
__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
|
||||
__builtin_mma_xvf64ger (&acc1, rowB, rowA[1]);
|
||||
for (l = 1; l < temp; l++)
|
||||
{
|
||||
t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1];
|
||||
rb = (vec_t *) & t[0];
|
||||
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
|
||||
rb = (vec_t *) & BO[l << 1];
|
||||
__builtin_vsx_assemble_pair (&rowB, rb[0], rb[0]);
|
||||
rowA = (vec_t *) & AO[l << 2];
|
||||
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
|
||||
__builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]);
|
||||
|
@ -646,18 +646,15 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B,
|
|||
v4sf_t result[4];
|
||||
__vector_quad acc0;
|
||||
BLASLONG l = 0;
|
||||
FLOAT t[4] = { 0, 0, 0, 0 };
|
||||
t[0] = BO[0], t[1] = BO[1];
|
||||
__vector_pair rowB;
|
||||
vec_t *rb = (vec_t *) & t[0];
|
||||
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
|
||||
vec_t *rb = (vec_t *) & BO[0];
|
||||
__builtin_vsx_assemble_pair (&rowB, rb[0], rb[0]);
|
||||
vec_t *rowA = (vec_t *) & AO[0];
|
||||
__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]);
|
||||
for (l = 1; l < temp; l++)
|
||||
{
|
||||
t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1];
|
||||
rb = (vec_t *) & t[0];
|
||||
__builtin_mma_assemble_pair (&rowB, rb[1], rb[0]);
|
||||
rb = (vec_t *) & BO[l << 1];
|
||||
__builtin_vsx_assemble_pair (&rowB, rb[0], rb[0]);
|
||||
rowA = (vec_t *) & AO[l << 1];
|
||||
__builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]);
|
||||
}
|
||||
|
|
|
@ -39,9 +39,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#pragma GCC optimize "O1"
|
||||
|
||||
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||
#if defined(POWER8) || defined(POWER9)
|
||||
#include "drot_microk_power8.c"
|
||||
#elif defined(POWER10)
|
||||
#include "drot_microk_power10.c"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
@ -115,12 +117,30 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
|
|||
if ( (inc_x == 1) && (inc_y == 1) )
|
||||
{
|
||||
|
||||
#if defined(POWER10)
|
||||
if ( n >= 16 )
|
||||
{
|
||||
BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 3) & 0x3;
|
||||
for (i = 0; i < align; i++) {
|
||||
temp = c*x[i] + s*y[i] ;
|
||||
y[i] = c*y[i] - s*x[i] ;
|
||||
x[i] = temp ;
|
||||
}
|
||||
}
|
||||
BLASLONG n1 = (n-i) & -16;
|
||||
if ( n1 > 0 )
|
||||
{
|
||||
drot_kernel_16(n1,&x[i], &y[i], c, s);
|
||||
i+=n1;
|
||||
}
|
||||
#else
|
||||
BLASLONG n1 = n & -16;
|
||||
if ( n1 > 0 )
|
||||
{
|
||||
drot_kernel_16(n1, x1, y1, c, s);
|
||||
i=n1;
|
||||
}
|
||||
#endif
|
||||
|
||||
while(i < n)
|
||||
{
|
||||
|
|
|
@ -0,0 +1,148 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2021, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define HAVE_KERNEL_16 1
|
||||
|
||||
/*
 * drot_kernel_16: applies a plane rotation in place to n element pairs:
 *     x[i] = c * x[i] + s * y[i]
 *     y[i] = c * y[i] - s * x[i]      (using the x[i] value from before the update)
 *
 * Despite the _16 suffix, each pass handles 8 doubles from each vector
 * (64 bytes): the counter is decremented by 8 and the pointers advance by 64.
 * The first batch is loaded before n is tested, so n is expected to be a
 * positive multiple of 8.
 *
 * Register use: vs36 = c, vs37 = s, vs32..vs35 = x batch, vs48..vs51 = y
 * batch, vs40..vs43 = c*x (then new x), vs44..vs47 = s*x,
 * vs52..vs55 = c*y (then new y), vs38/vs39/vs56/vs57 = s*y.
 */
static void drot_kernel_16 (long n, double *x, double *y, double c, double s)
|
||||
{
|
||||
__asm__
|
||||
(
|
||||
XXSPLTD_S(36,%x5,0) // load c to both dwords
|
||||
XXSPLTD_S(37,%x6,0) // load s to both dwords
|
||||
"lxvp 32, 0(%3) \n\t" // load x
|
||||
"lxvp 34, 32(%3) \n\t"
|
||||
"lxvp 48, 0(%4) \n\t" // load y
|
||||
"lxvp 50, 32(%4) \n\t"
|
||||
|
||||
// n -= 8; when nothing remains, skip the loop and only finish the preloaded batch
"addic. %2, %2, -8 \n\t"
|
||||
"ble two%= \n\t"
|
||||
|
||||
".align 5 \n"
|
||||
// main loop: compute on the current batch while loading the next one
"one%=: \n\t"
|
||||
|
||||
"xvmuldp 40, 32, 36 \n\t" // c * x
|
||||
"xvmuldp 41, 33, 36 \n\t"
|
||||
"xvmuldp 42, 34, 36 \n\t"
|
||||
"xvmuldp 43, 35, 36 \n\t"
|
||||
|
||||
"xvmuldp 44, 32, 37 \n\t" // s * x
|
||||
"xvmuldp 45, 33, 37 \n\t"
|
||||
"xvmuldp 46, 34, 37 \n\t"
|
||||
"xvmuldp 47, 35, 37 \n\t"
|
||||
|
||||
// both products of the old x are formed, so vs32..vs35 can take the next batch
"lxvp 32, 64(%3) \n\t" // load x
|
||||
"lxvp 34, 96(%3) \n\t"
|
||||
"xvmuldp 52, 48, 36 \n\t" // c * y
|
||||
"xvmuldp 53, 49, 36 \n\t"
|
||||
"xvmuldp 54, 50, 36 \n\t"
|
||||
"xvmuldp 55, 51, 36 \n\t"
|
||||
|
||||
"xvmuldp 38, 48, 37 \n\t" // s * y
|
||||
"xvmuldp 39, 49, 37 \n\t"
|
||||
"xvmuldp 56, 50, 37 \n\t"
|
||||
"xvmuldp 57, 51, 37 \n\t"
|
||||
|
||||
// both products of the old y are formed, so vs48..vs51 can take the next batch
"lxvp 48, 64(%4) \n\t" // load y
|
||||
"lxvp 50, 96(%4) \n\t"
|
||||
|
||||
"xvadddp 40, 40, 38 \n\t" // c * x + s * y
|
||||
"xvadddp 41, 41, 39 \n\t" // c * x + s * y
|
||||
"xvadddp 42, 42, 56 \n\t" // c * x + s * y
|
||||
"xvadddp 43, 43, 57 \n\t" // c * x + s * y
|
||||
|
||||
// the pointers are advanced only after the stores, so offset 0 is still the current batch
"stxvp 40, 0(%3) \n\t" // store x
|
||||
"stxvp 42, 32(%3) \n\t"
|
||||
|
||||
"xvsubdp 52, 52, 44 \n\t" // c * y - s * x
|
||||
"xvsubdp 53, 53, 45 \n\t" // c * y - s * x
|
||||
"xvsubdp 54, 54, 46 \n\t" // c * y - s * x
|
||||
"xvsubdp 55, 55, 47 \n\t" // c * y - s * x
|
||||
|
||||
"stxvp 52, 0(%4) \n\t" // store y
|
||||
"stxvp 54, 32(%4) \n\t"
|
||||
|
||||
// advance both pointers by one batch (64 bytes)
"addi %3, %3, 64 \n\t"
|
||||
"addi %4, %4, 64 \n\t"
|
||||
|
||||
"addic. %2, %2, -8 \n\t"
|
||||
"bgt one%= \n"
|
||||
|
||||
// tail: rotate the last loaded batch without issuing further loads
"two%=: \n\t"
|
||||
|
||||
"xvmuldp 40, 32, 36 \n\t" // c * x
|
||||
"xvmuldp 41, 33, 36 \n\t"
|
||||
"xvmuldp 42, 34, 36 \n\t"
|
||||
"xvmuldp 43, 35, 36 \n\t"
|
||||
|
||||
"xvmuldp 52, 48, 36 \n\t" // c * y
|
||||
"xvmuldp 53, 49, 36 \n\t"
|
||||
"xvmuldp 54, 50, 36 \n\t"
|
||||
"xvmuldp 55, 51, 36 \n\t"
|
||||
|
||||
"xvmuldp 44, 32, 37 \n\t" // s * x
|
||||
"xvmuldp 45, 33, 37 \n\t"
|
||||
"xvmuldp 46, 34, 37 \n\t"
|
||||
"xvmuldp 47, 35, 37 \n\t"
|
||||
|
||||
"xvmuldp 38, 48, 37 \n\t" // s * y
|
||||
"xvmuldp 39, 49, 37 \n\t"
|
||||
"xvmuldp 56, 50, 37 \n\t"
|
||||
"xvmuldp 57, 51, 37 \n\t"
|
||||
|
||||
"xvadddp 40, 40, 38 \n\t" // c * x + s * y
|
||||
"xvadddp 41, 41, 39 \n\t" // c * x + s * y
|
||||
"xvadddp 42, 42, 56 \n\t" // c * x + s * y
|
||||
"xvadddp 43, 43, 57 \n\t" // c * x + s * y
|
||||
|
||||
"stxvp 40, 0(%3) \n\t" // store x
|
||||
"stxvp 42, 32(%3) \n\t"
|
||||
"xvsubdp 52, 52, 44 \n\t" // c * y - s * x
|
||||
"xvsubdp 53, 53, 45 \n\t" // c * y - s * x
|
||||
"xvsubdp 54, 54, 46 \n\t" // c * y - s * x
|
||||
"xvsubdp 55, 55, 47 \n\t" // c * y - s * x
|
||||
|
||||
"stxvp 52, 0(%4) \n\t" // store y
|
||||
"stxvp 54, 32(%4) \n\t"
|
||||
|
||||
// annotation line for the generated assembly only
"#n=%2 x=%0=%3 y=%1=%4 c=%5 s=%6\n"
|
||||
// outputs: operands 0 and 1 are the read-write memory operands for *x and *y
:
|
||||
"+m" (*x),
|
||||
"+m" (*y),
|
||||
"+r" (n), // 2
|
||||
"+b" (x), // 3
|
||||
"+b" (y) // 4
|
||||
// inputs: the rotation coefficients
:
|
||||
"d" (c), // 5
|
||||
"d" (s) // 6
|
||||
// clobbers: cr0 (written by addic.) and every VSX register named explicitly above
:
|
||||
"cr0",
|
||||
"vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
|
||||
"vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
|
||||
"vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55",
|
||||
"vs56","vs57"
|
||||
);
|
||||
}
|
|
@ -35,9 +35,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#include "common.h"
|
||||
|
||||
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||
#if defined(POWER8) || defined(POWER9)
|
||||
#include "dscal_microk_power8.c"
|
||||
#elif defined(POWER10)
|
||||
#include "dscal_microk_power10.c"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
@ -100,12 +102,28 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
|
|||
if ( da == 0.0 )
|
||||
{
|
||||
|
||||
#if defined(POWER10)
|
||||
if ( n >= 16 )
|
||||
{
|
||||
BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3;
|
||||
for (j = 0; j < align; j++) {
|
||||
x[j] = 0.0;
|
||||
}
|
||||
}
|
||||
BLASLONG n1 = (n-j) & -16;
|
||||
if ( n1 > 0 )
|
||||
{
|
||||
dscal_kernel_8_zero(n1, &x[j]);
|
||||
j+=n1;
|
||||
}
|
||||
#else
|
||||
BLASLONG n1 = n & -16;
|
||||
if ( n1 > 0 )
|
||||
{
|
||||
dscal_kernel_8_zero(n1, x);
|
||||
j=n1;
|
||||
}
|
||||
#endif
|
||||
|
||||
while(j < n)
|
||||
{
|
||||
|
@ -118,12 +136,28 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
|
|||
else
|
||||
{
|
||||
|
||||
#if defined(POWER10)
|
||||
if ( n >= 16 )
|
||||
{
|
||||
BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3;
|
||||
for (j = 0; j < align; j++) {
|
||||
x[j] = da * x[j];
|
||||
}
|
||||
}
|
||||
BLASLONG n1 = (n-j) & -16;
|
||||
if ( n1 > 0 )
|
||||
{
|
||||
dscal_kernel_8(n1, &x[j], da);
|
||||
j+=n1;
|
||||
}
|
||||
#else
|
||||
BLASLONG n1 = n & -16;
|
||||
if ( n1 > 0 )
|
||||
{
|
||||
dscal_kernel_8(n1, x, da);
|
||||
j=n1;
|
||||
}
|
||||
#endif
|
||||
while(j < n)
|
||||
{
|
||||
|
||||
|
|
|
@ -0,0 +1,134 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2021, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define HAVE_KERNEL_8 1
|
||||
|
||||
/*
 * dscal_kernel_8: scales n doubles in place, x[i] = alpha * x[i].
 *
 * Despite the _8 suffix, each pass handles 16 doubles (128 bytes): the
 * counter is decremented by 16 and x advances by 128.  The first batch is
 * loaded before n is tested, so n is expected to be a positive multiple of 16.
 *
 * Register use: vs48 = alpha, vs32..vs39 = batch being scaled,
 * vs40..vs47 = scaled results waiting to be stored.
 */
static void dscal_kernel_8 (long n, double *x, double alpha)
|
||||
{
|
||||
__asm__
|
||||
(
|
||||
// cache-touch hint for the start of x
"dcbt 0, %2 \n\t"
|
||||
|
||||
// XXSPLTD_S is defined outside this view; presumably it copies alpha into both doubleword lanes of vs48
XXSPLTD_S(48,%x3,0)
|
||||
|
||||
// preload the first 128 bytes of x into vs32..vs39
"lxvp 32, 0(%2) \n\t"
|
||||
"lxvp 34, 32(%2) \n\t"
|
||||
"lxvp 36, 64(%2) \n\t"
|
||||
"lxvp 38, 96(%2) \n\t"
|
||||
|
||||
// n -= 16; when nothing remains, skip the loop and only finish the preloaded batch
"addic. %1, %1, -16 \n\t"
|
||||
"ble two%= \n\t"
|
||||
|
||||
".align 5 \n"
|
||||
// main loop: scale the current batch while loading the next one from offsets 128..224
"one%=: \n\t"
|
||||
|
||||
"xvmuldp 40, 32, 48 \n\t"
|
||||
"xvmuldp 41, 33, 48 \n\t"
|
||||
"xvmuldp 42, 34, 48 \n\t"
|
||||
"xvmuldp 43, 35, 48 \n\t"
|
||||
"lxvp 32, 128(%2) \n\t"
|
||||
"lxvp 34, 160(%2) \n\t"
|
||||
"xvmuldp 44, 36, 48 \n\t"
|
||||
"xvmuldp 45, 37, 48 \n\t"
|
||||
"xvmuldp 46, 38, 48 \n\t"
|
||||
"xvmuldp 47, 39, 48 \n\t"
|
||||
"lxvp 36, 192(%2) \n\t"
|
||||
"lxvp 38, 224(%2) \n\t"
|
||||
|
||||
// write the scaled batch back over the current 128 bytes (x has not been advanced yet)
"stxvp 40, 0(%2) \n\t"
|
||||
"stxvp 42, 32(%2) \n\t"
|
||||
"stxvp 44, 64(%2) \n\t"
|
||||
"stxvp 46, 96(%2) \n\t"
|
||||
|
||||
"addi %2, %2, 128 \n\t"
|
||||
|
||||
"addic. %1, %1, -16 \n\t"
|
||||
"bgt one%= \n"
|
||||
|
||||
// tail: scale and store the last loaded batch without issuing further loads
"two%=: \n\t"
|
||||
|
||||
"xvmuldp 40, 32, 48 \n\t"
|
||||
"xvmuldp 41, 33, 48 \n\t"
|
||||
"xvmuldp 42, 34, 48 \n\t"
|
||||
"xvmuldp 43, 35, 48 \n\t"
|
||||
|
||||
"xvmuldp 44, 36, 48 \n\t"
|
||||
"xvmuldp 45, 37, 48 \n\t"
|
||||
"xvmuldp 46, 38, 48 \n\t"
|
||||
"xvmuldp 47, 39, 48 \n\t"
|
||||
|
||||
"stxvp 40, 0(%2) \n\t"
|
||||
"stxvp 42, 32(%2) \n\t"
|
||||
"stxvp 44, 64(%2) \n\t"
|
||||
"stxvp 46, 96(%2) \n\t"
|
||||
|
||||
// annotation line for the generated assembly only
"#n=%1 alpha=%3 x=%0=%2"
|
||||
// outputs: operand 0 is the read-write memory operand for *x
:
|
||||
"+m" (*x),
|
||||
"+r" (n), // 1
|
||||
"+b" (x) // 2
|
||||
// inputs
:
|
||||
"d" (alpha) // 3
|
||||
// clobbers: cr0 (written by addic.) and every VSX register named explicitly above
:
|
||||
"cr0",
|
||||
"vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
|
||||
"vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47","vs48"
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
/*
 * dscal_kernel_8_zero: overwrites n doubles starting at x with zero.
 *
 * Each pass stores 128 bytes (16 doubles) and decrements n by 16.  There is
 * no test before the first pass, so the loop body always runs at least once;
 * n is expected to be a positive multiple of 16.
 */
static void dscal_kernel_8_zero (long n, double *x)
|
||||
{
|
||||
|
||||
__asm__
|
||||
(
|
||||
// zero vs32 and vs33; the paired stores below write this pair
"xxlxor 32, 32, 32 \n\t"
|
||||
"xxlxor 33, 33, 33 \n\t"
|
||||
|
||||
".align 5 \n"
|
||||
"one%=: \n\t"
|
||||
|
||||
// four 32-byte stores of the zeroed pair cover 128 bytes
"stxvp 32, 0(%2) \n\t"
|
||||
"stxvp 32, 32(%2) \n\t"
|
||||
"stxvp 32, 64(%2) \n\t"
|
||||
"stxvp 32, 96(%2) \n\t"
|
||||
|
||||
"addi %2, %2, 128 \n\t"
|
||||
|
||||
// n -= 16; loop while elements remain
"addic. %1, %1, -16 \n\t"
|
||||
"bgt one%= \n"
|
||||
|
||||
// annotation line for the generated assembly only
"#n=%1 x=%0=%2 "
|
||||
// outputs: operand 0 is a write-only memory operand for *x (the old contents are never read)
:
|
||||
"=m" (*x),
|
||||
"+r" (n), // 1
|
||||
"+b" (x) // 2
|
||||
// no inputs
:
|
||||
// clobbers: cr0 (written by addic.) and the two zeroed VSX registers
:
|
||||
"cr0","vs32","vs33"
|
||||
);
|
||||
}
|
|
@ -35,9 +35,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#include "common.h"
|
||||
|
||||
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||
#if defined(POWER8) || defined(POWER9)
|
||||
#include "dswap_microk_power8.c"
|
||||
#elif defined(POWER10)
|
||||
#include "swap_microk_power10.c"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
@ -115,12 +117,30 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x,
|
|||
if ( (inc_x == 1) && (inc_y == 1 ))
|
||||
{
|
||||
|
||||
#if defined(POWER10)
|
||||
if ( n >= 32 )
|
||||
{
|
||||
BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 3) & 0x3;
|
||||
for (i = 0; i < align; i++) {
|
||||
temp = y[i];
|
||||
y[i] = x[i];
|
||||
x[i] = temp;
|
||||
}
|
||||
}
|
||||
BLASLONG n1 = (n-i) & -32;
|
||||
if ( n1 > 0 )
|
||||
{
|
||||
dswap_kernel_32(n1,&x[i], &y[i]);
|
||||
i+=n1;
|
||||
}
|
||||
#else
|
||||
BLASLONG n1 = n & -32;
|
||||
if ( n1 > 0 )
|
||||
{
|
||||
dswap_kernel_32(n1, x, y);
|
||||
i=n1;
|
||||
}
|
||||
#endif
|
||||
|
||||
while(i < n)
|
||||
{
|
||||
|
|
|
@ -46,9 +46,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#endif
|
||||
|
||||
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||
#if defined(POWER8) || defined(POWER9)
|
||||
#include "sasum_microk_power8.c"
|
||||
#elif defined(POWER10)
|
||||
#include "sasum_microk_power10.c"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
@ -110,6 +112,21 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
|||
if ( inc_x == 1 )
|
||||
{
|
||||
|
||||
#if defined(POWER10)
|
||||
if ( n >= 32 )
|
||||
{
|
||||
BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 2) & 0x7;
|
||||
for (i = 0; i < align; i++) {
|
||||
sumf += ABS(x[i]);
|
||||
}
|
||||
}
|
||||
n1 = (n-i) & -32;
|
||||
if ( n1 > 0 )
|
||||
{
|
||||
sumf += sasum_kernel_32(n1, &x[i]);
|
||||
i+=n1;
|
||||
}
|
||||
#else
|
||||
n1 = n & -32;
|
||||
if ( n1 > 0 )
|
||||
{
|
||||
|
@ -117,6 +134,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
|||
sumf = sasum_kernel_32(n1, x);
|
||||
i=n1;
|
||||
}
|
||||
#endif
|
||||
|
||||
while(i < n)
|
||||
{
|
||||
|
|
|
@ -0,0 +1,153 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2021, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
|
||||
#define HAVE_KERNEL_32 1
|
||||
|
||||
/*
 * sasum_kernel_32: returns the sum of the absolute values of n floats that
 * start at x, using paired vector loads (lxvp). Each loop iteration consumes
 * 128 bytes (32 floats) and decrements n by 32.
 *
 * n - element count. The first 128 bytes are loaded before n is tested, so
 *     this presumably requires n to be a positive multiple of 32
 *     (TODO confirm against the caller).
 * x - input pointer; it is advanced inside the asm (operand 2, "+b").
 *
 * t0..t3 are scratch vector registers chosen by the compiler (operands 3-6);
 * vs32..vs51 are used by number and listed as clobbers.
 */
static float sasum_kernel_32 (long n, float *x)
|
||||
{
|
||||
float sum;
|
||||
__vector float t0;
|
||||
__vector float t1;
|
||||
__vector float t2;
|
||||
__vector float t3;
|
||||
|
||||
__asm__
|
||||
(
|
||||
"dcbt 0, %2 \n\t"
|
||||
|
||||
// Zero the eight accumulators vs32..vs39.
"xxlxor 32, 32, 32 \n\t"
|
||||
"xxlxor 33, 33, 33 \n\t"
|
||||
"xxlxor 34, 34, 34 \n\t"
|
||||
"xxlxor 35, 35, 35 \n\t"
|
||||
"xxlxor 36, 36, 36 \n\t"
|
||||
"xxlxor 37, 37, 37 \n\t"
|
||||
"xxlxor 38, 38, 38 \n\t"
|
||||
"xxlxor 39, 39, 39 \n\t"
|
||||
|
||||
// Preload the first 128 bytes into vs40..vs47, then step x past them.
"lxvp 40, 0(%2) \n\t"
|
||||
"lxvp 42, 32(%2) \n\t"
|
||||
"lxvp 44, 64(%2) \n\t"
|
||||
"lxvp 46, 96(%2) \n\t"
|
||||
|
||||
"addi %2, %2, 128 \n\t"
|
||||
|
||||
// If nothing remains after the preloaded block, go straight to the tail.
"addic. %1, %1, -32 \n\t"
|
||||
"ble two%= \n\t"
|
||||
|
||||
".align 5 \n"
|
||||
"one%=: \n\t"
|
||||
|
||||
// Main loop: take |x| of the block already in vs40..vs47 while reloading
// those registers with the next block (x was already advanced, so the
// offsets 0..96 address the next 128 bytes), then add into the accumulators.
"xvabssp 48, 40 \n\t"
|
||||
"xvabssp 49, 41 \n\t"
|
||||
"xvabssp 50, 42 \n\t"
|
||||
"xvabssp 51, 43 \n\t"
|
||||
"lxvp 40, 0(%2) \n\t"
|
||||
|
||||
"xvabssp %x3, 44 \n\t"
|
||||
"xvabssp %x4, 45 \n\t"
|
||||
"lxvp 42, 32(%2) \n\t"
|
||||
|
||||
"xvabssp %x5, 46 \n\t"
|
||||
"xvabssp %x6, 47 \n\t"
|
||||
"lxvp 44, 64(%2) \n\t"
|
||||
|
||||
"xvaddsp 32, 32, 48 \n\t"
|
||||
"xvaddsp 33, 33, 49 \n\t"
|
||||
|
||||
"lxvp 46, 96(%2) \n\t"
|
||||
|
||||
"xvaddsp 34, 34, 50 \n\t"
|
||||
"xvaddsp 35, 35, 51 \n\t"
|
||||
"addi %2, %2, 128 \n\t"
|
||||
"xvaddsp 36, 36, %x3 \n\t"
|
||||
"xvaddsp 37, 37, %x4 \n\t"
|
||||
"addic. %1, %1, -32 \n\t"
|
||||
"xvaddsp 38, 38, %x5 \n\t"
|
||||
"xvaddsp 39, 39, %x6 \n\t"
|
||||
|
||||
"bgt one%= \n"
|
||||
|
||||
"two%=: \n\t"
|
||||
|
||||
// Tail: process the last loaded block without issuing further loads.
"xvabssp 48, 40 \n\t"
|
||||
"xvabssp 49, 41 \n\t"
|
||||
"xvabssp 50, 42 \n\t"
|
||||
"xvabssp 51, 43 \n\t"
|
||||
"xvabssp %x3, 44 \n\t"
|
||||
"xvabssp %x4, 45 \n\t"
|
||||
"xvabssp %x5, 46 \n\t"
|
||||
"xvabssp %x6, 47 \n\t"
|
||||
|
||||
"xvaddsp 32, 32, 48 \n\t"
|
||||
"xvaddsp 33, 33, 49 \n\t"
|
||||
"xvaddsp 34, 34, 50 \n\t"
|
||||
"xvaddsp 35, 35, 51 \n\t"
|
||||
"xvaddsp 36, 36, %x3 \n\t"
|
||||
"xvaddsp 37, 37, %x4 \n\t"
|
||||
"xvaddsp 38, 38, %x5 \n\t"
|
||||
"xvaddsp 39, 39, %x6 \n\t"
|
||||
|
||||
// Fold the eight accumulators pairwise down into vs32.
"xvaddsp 32, 32, 33 \n\t"
|
||||
"xvaddsp 34, 34, 35 \n\t"
|
||||
"xvaddsp 36, 36, 37 \n\t"
|
||||
"xvaddsp 38, 38, 39 \n\t"
|
||||
|
||||
"xvaddsp 32, 32, 34 \n\t"
|
||||
"xvaddsp 36, 36, 38 \n\t"
|
||||
|
||||
"xvaddsp 32, 32, 36 \n\t"
|
||||
|
||||
// Horizontal add inside vs32: add a copy shifted by two words, then by one.
"xxsldwi 33, 32, 32, 2 \n\t"
|
||||
"xvaddsp 32, 32, 33 \n\t"
|
||||
|
||||
"xxsldwi 33, 32, 32, 1 \n\t"
|
||||
"xvaddsp 32, 32, 33 \n\t"
|
||||
|
||||
// Convert the single-precision result into the scalar output register.
"xscvspdp %x0, 32 \n"
|
||||
|
||||
// NOTE(review): the two strings below only emit assembler comments. "x=%3"
// prints operand 3 (t0); the "m" (*x) operand is operand 7.
"#n=%1 x=%3=%2 sum=%0\n"
|
||||
"#t0=%x3 t1=%x4 t2=%x5 t3=%x6"
|
||||
:
|
||||
"=f" (sum), // 0
|
||||
"+r" (n), // 1
|
||||
"+b" (x), // 2
|
||||
"=wa" (t0), // 3
|
||||
"=wa" (t1), // 4
|
||||
"=wa" (t2), // 5
|
||||
"=wa" (t3) // 6
|
||||
:
|
||||
"m" (*x)
|
||||
:
|
||||
"cr0",
|
||||
"vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
|
||||
"vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
|
||||
"vs48","vs49","vs50","vs51"
|
||||
);
|
||||
|
||||
return sum;
|
||||
}
|
|
@ -86,11 +86,18 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
|||
if ( (inc_x == 1) && (inc_y == 1 ))
|
||||
{
|
||||
|
||||
BLASLONG n1 = n & -128;
|
||||
if ( n1 > 0 )
|
||||
if ( n >= 128 )
|
||||
{
|
||||
copy_kernel (n1, x, y);
|
||||
i=n1;
|
||||
BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 2) & 0x7;
|
||||
for (i = 0; i < align; i++) {
|
||||
y[i] = x[i] ;
|
||||
}
|
||||
}
|
||||
BLASLONG n1 = (n-i) & -128;
|
||||
if ( n1 )
|
||||
{
|
||||
copy_kernel(n1, &x[i], &y[i]);
|
||||
i += n1;
|
||||
}
|
||||
|
||||
while(i < n)
|
||||
|
|
|
@ -39,9 +39,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#pragma GCC optimize "O1"
|
||||
|
||||
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||
#if defined(POWER8) || defined(POWER9)
|
||||
#include "srot_microk_power8.c"
|
||||
#elif defined(POWER10)
|
||||
#include "srot_microk_power10.c"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
@ -115,6 +117,23 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
|
|||
if ( (inc_x == 1) && (inc_y == 1) )
|
||||
{
|
||||
|
||||
#if defined(POWER10)
|
||||
if ( n >= 16 )
|
||||
{
|
||||
BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 2) & 0x7;
|
||||
for (i = 0; i < align; i++) {
|
||||
temp = c*x[i] + s*y[i] ;
|
||||
y[i] = c*y[i] - s*x[i] ;
|
||||
x[i] = temp ;
|
||||
}
|
||||
}
|
||||
BLASLONG n1 = (n-i) & -16;
|
||||
if ( n1 > 0 )
|
||||
{
|
||||
srot_kernel_16(n1, &x1[i], &y1[i], c, s);
|
||||
i+=n1;
|
||||
}
|
||||
#else
|
||||
BLASLONG n1 = n & -16;
|
||||
if ( n1 > 0 )
|
||||
{
|
||||
|
@ -122,6 +141,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
|
|||
i=n1;
|
||||
}
|
||||
|
||||
#endif
|
||||
while(i < n)
|
||||
{
|
||||
temp = c*x[i] + s*y[i] ;
|
||||
|
|
|
@ -0,0 +1,151 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2021, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define HAVE_KERNEL_16 1
|
||||
|
||||
/*
 * srot_kernel_16: in-place rotation of two float vectors,
 *     x[i] = c*x[i] + s*y[i]
 *     y[i] = c*y[i] - s*x[i]
 * Each loop iteration handles 64 bytes (16 floats) of x and of y and
 * decrements n by 16.
 *
 * n    - element count. The first 64 bytes of x and y are loaded before n is
 *        tested, so this presumably requires n to be a positive multiple of
 *        16 (TODO confirm against the caller).
 * x, y - vectors updated in place; both pointers are advanced inside the asm.
 * c, s - scalar coefficients, each broadcast to every word of a vector
 *        register (vs36 and vs37).
 */
static void srot_kernel_16 (long n, float *x, float *y, float c, float s)
|
||||
{
|
||||
__asm__
|
||||
(
|
||||
"xscvdpspn 36, %x5 \n\t" // load c to all words
|
||||
"xxspltw 36, 36, 0 \n\t"
|
||||
|
||||
"xscvdpspn 37, %x6 \n\t" // load s to all words
|
||||
"xxspltw 37, 37, 0 \n\t"
|
||||
"lxvp 32, 0(%3) \n\t" // load x
|
||||
"lxvp 34, 32(%3) \n\t"
|
||||
"lxvp 48, 0(%4) \n\t" // load y
|
||||
"lxvp 50, 32(%4) \n\t"
|
||||
|
||||
// If only the preloaded block exists, skip the loop and finish at "two".
"addic. %2, %2, -16 \n\t"
|
||||
"ble two%= \n\t"
|
||||
|
||||
".align 5 \n"
|
||||
"one%=: \n\t"
|
||||
|
||||
"xvmulsp 40, 32, 36 \n\t" // c * x
|
||||
"xvmulsp 41, 33, 36 \n\t"
|
||||
"xvmulsp 42, 34, 36 \n\t"
|
||||
"xvmulsp 43, 35, 36 \n\t"
|
||||
|
||||
"xvmulsp 44, 32, 37 \n\t" // s * x
|
||||
"xvmulsp 45, 33, 37 \n\t"
|
||||
"xvmulsp 46, 34, 37 \n\t"
|
||||
"xvmulsp 47, 35, 37 \n\t"
|
||||
|
||||
// The next x block (offsets 64/96) is fetched before the current results
// are stored at offsets 0/32; the pointers only advance at the loop end.
"lxvp 32, 64(%3) \n\t" // load x
|
||||
"lxvp 34, 96(%3) \n\t"
|
||||
"xvmulsp 52, 48, 36 \n\t" // c * y
|
||||
"xvmulsp 53, 49, 36 \n\t"
|
||||
"xvmulsp 54, 50, 36 \n\t"
|
||||
"xvmulsp 55, 51, 36 \n\t"
|
||||
|
||||
"xvmulsp 38, 48, 37 \n\t" // s * y
|
||||
"xvmulsp 39, 49, 37 \n\t"
|
||||
"xvmulsp 56, 50, 37 \n\t"
|
||||
"xvmulsp 57, 51, 37 \n\t"
|
||||
|
||||
"lxvp 48, 64(%4) \n\t" // load y
|
||||
"lxvp 50, 96(%4) \n\t"
|
||||
|
||||
"xvaddsp 40, 40, 38 \n\t" // c * x + s * y
|
||||
"xvaddsp 41, 41, 39 \n\t" // c * x + s * y
|
||||
"xvaddsp 42, 42, 56 \n\t" // c * x + s * y
|
||||
"xvaddsp 43, 43, 57 \n\t" // c * x + s * y
|
||||
|
||||
"stxvp 40, 0(%3) \n\t" // store x
|
||||
"stxvp 42, 32(%3) \n\t"
|
||||
|
||||
"xvsubsp 52, 52, 44 \n\t" // c * y - s * x
|
||||
"xvsubsp 53, 53, 45 \n\t" // c * y - s * x
|
||||
"xvsubsp 54, 54, 46 \n\t" // c * y - s * x
|
||||
"xvsubsp 55, 55, 47 \n\t" // c * y - s * x
|
||||
|
||||
"stxvp 52, 0(%4) \n\t" // store y
|
||||
"stxvp 54, 32(%4) \n\t"
|
||||
|
||||
"addi %3, %3, 64 \n\t"
|
||||
"addi %4, %4, 64 \n\t"
|
||||
|
||||
"addic. %2, %2, -16 \n\t"
|
||||
"bgt one%= \n"
|
||||
|
||||
"two%=: \n\t"
|
||||
|
||||
// Tail: same arithmetic on the last loaded block, with no further loads.
"xvmulsp 40, 32, 36 \n\t" // c * x
|
||||
"xvmulsp 41, 33, 36 \n\t"
|
||||
"xvmulsp 42, 34, 36 \n\t"
|
||||
"xvmulsp 43, 35, 36 \n\t"
|
||||
|
||||
"xvmulsp 52, 48, 36 \n\t" // c * y
|
||||
"xvmulsp 53, 49, 36 \n\t"
|
||||
"xvmulsp 54, 50, 36 \n\t"
|
||||
"xvmulsp 55, 51, 36 \n\t"
|
||||
|
||||
"xvmulsp 44, 32, 37 \n\t" // s * x
|
||||
"xvmulsp 45, 33, 37 \n\t"
|
||||
"xvmulsp 46, 34, 37 \n\t"
|
||||
"xvmulsp 47, 35, 37 \n\t"
|
||||
|
||||
"xvmulsp 38, 48, 37 \n\t" // s * y
|
||||
"xvmulsp 39, 49, 37 \n\t"
|
||||
"xvmulsp 56, 50, 37 \n\t"
|
||||
"xvmulsp 57, 51, 37 \n\t"
|
||||
|
||||
"xvaddsp 40, 40, 38 \n\t" // c * x + s * y
|
||||
"xvaddsp 41, 41, 39 \n\t" // c * x + s * y
|
||||
"xvaddsp 42, 42, 56 \n\t" // c * x + s * y
|
||||
"xvaddsp 43, 43, 57 \n\t" // c * x + s * y
|
||||
|
||||
"stxvp 40, 0(%3) \n\t" // store x
|
||||
"stxvp 42, 32(%3) \n\t"
|
||||
"xvsubsp 52, 52, 44 \n\t" // c * y - s * x
|
||||
"xvsubsp 53, 53, 45 \n\t" // c * y - s * x
|
||||
"xvsubsp 54, 54, 46 \n\t" // c * y - s * x
|
||||
"xvsubsp 55, 55, 47 \n\t" // c * y - s * x
|
||||
|
||||
"stxvp 52, 0(%4) \n\t" // store y
|
||||
"stxvp 54, 32(%4) \n\t"
|
||||
|
||||
// The string below only emits an assembler comment listing the operands.
"#n=%2 x=%0=%3 y=%1=%4 c=%5 s=%6\n"
|
||||
:
|
||||
"+m" (*x),
|
||||
"+m" (*y),
|
||||
"+r" (n), // 2
|
||||
"+b" (x), // 3
|
||||
"+b" (y) // 4
|
||||
:
|
||||
"f" (c), // 5
|
||||
"f" (s) // 6
|
||||
:
|
||||
"cr0",
|
||||
"vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
|
||||
"vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
|
||||
"vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55",
|
||||
"vs56","vs57"
|
||||
);
|
||||
}
|
|
@ -35,9 +35,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#include "common.h"
|
||||
|
||||
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||
#if defined(POWER8) || defined(POWER9)
|
||||
#include "sscal_microk_power8.c"
|
||||
#elif defined(POWER10)
|
||||
#include "sscal_microk_power10.c"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
@ -102,12 +104,28 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
|
|||
if ( da == 0.0 )
|
||||
{
|
||||
|
||||
#if defined(POWER10)
|
||||
if ( n >= 32 )
|
||||
{
|
||||
BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 2) & 0x7;
|
||||
for (j = 0; j < align; j++) {
|
||||
x[j] = 0.0;
|
||||
}
|
||||
}
|
||||
BLASLONG n1 = (n-j) & -32;
|
||||
if ( n1 > 0 )
|
||||
{
|
||||
sscal_kernel_16_zero(n1, &x[j]);
|
||||
j+=n1;
|
||||
}
|
||||
#else
|
||||
BLASLONG n1 = n & -32;
|
||||
if ( n1 > 0 )
|
||||
{
|
||||
sscal_kernel_16_zero(n1, x);
|
||||
j=n1;
|
||||
}
|
||||
#endif
|
||||
|
||||
while(j < n)
|
||||
{
|
||||
|
@ -120,12 +138,28 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
|
|||
else
|
||||
{
|
||||
|
||||
#if defined(POWER10)
|
||||
if ( n >= 32 )
|
||||
{
|
||||
BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 2) & 0x7;
|
||||
for (j = 0; j < align; j++) {
|
||||
x[j] = da * x[j];
|
||||
}
|
||||
}
|
||||
BLASLONG n1 = (n-j) & -32;
|
||||
if ( n1 > 0 )
|
||||
{
|
||||
sscal_kernel_16(n1, &x[j], da);
|
||||
j+=n1;
|
||||
}
|
||||
#else
|
||||
BLASLONG n1 = n & -32;
|
||||
if ( n1 > 0 )
|
||||
{
|
||||
sscal_kernel_16(n1, x, da);
|
||||
j=n1;
|
||||
}
|
||||
#endif
|
||||
while(j < n)
|
||||
{
|
||||
|
||||
|
|
|
@ -0,0 +1,135 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2021, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define HAVE_KERNEL_16 1
|
||||
|
||||
/*
 * sscal_kernel_16: multiplies the floats at x by alpha in place.
 * Despite the "_16" in the name, each loop iteration handles 128 bytes
 * (32 floats) and decrements n by 32.
 *
 * n     - element count. The first 128 bytes are loaded before n is tested,
 *         so this presumably requires n to be a positive multiple of 32
 *         (TODO confirm against the caller).
 * x     - vector scaled in place; the pointer is advanced inside the asm.
 * alpha - scale factor, broadcast to every word of vs48.
 */
static void sscal_kernel_16 (long n, float *x, float alpha)
|
||||
{
|
||||
__asm__
|
||||
(
|
||||
"dcbt 0, %2 \n\t"
|
||||
|
||||
// Convert alpha to single precision and splat it across vs48.
"xscvdpspn 48, %x3 \n\t"
|
||||
"xxspltw 48, 48, 0 \n\t"
|
||||
|
||||
// Preload the first 128 bytes into vs32..vs39.
"lxvp 32, 0(%2) \n\t"
|
||||
"lxvp 34, 32(%2) \n\t"
|
||||
"lxvp 36, 64(%2) \n\t"
|
||||
"lxvp 38, 96(%2) \n\t"
|
||||
|
||||
// If only the preloaded block exists, skip the loop and finish at "two".
"addic. %1, %1, -32 \n\t"
|
||||
"ble two%= \n\t"
|
||||
|
||||
".align 5 \n"
|
||||
"one%=: \n\t"
|
||||
|
||||
// Scale the current block into vs40..vs47 while fetching the next block
// (offsets 128..224) into vs32..vs39, then store the scaled block at 0..96.
"xvmulsp 40, 32, 48 \n\t"
|
||||
"xvmulsp 41, 33, 48 \n\t"
|
||||
"xvmulsp 42, 34, 48 \n\t"
|
||||
"xvmulsp 43, 35, 48 \n\t"
|
||||
"lxvp 32, 128(%2) \n\t"
|
||||
"lxvp 34, 160(%2) \n\t"
|
||||
"xvmulsp 44, 36, 48 \n\t"
|
||||
"xvmulsp 45, 37, 48 \n\t"
|
||||
"xvmulsp 46, 38, 48 \n\t"
|
||||
"xvmulsp 47, 39, 48 \n\t"
|
||||
"lxvp 36, 192(%2) \n\t"
|
||||
"lxvp 38, 224(%2) \n\t"
|
||||
|
||||
"stxvp 40, 0(%2) \n\t"
|
||||
"stxvp 42, 32(%2) \n\t"
|
||||
"stxvp 44, 64(%2) \n\t"
|
||||
"stxvp 46, 96(%2) \n\t"
|
||||
|
||||
"addi %2, %2, 128 \n\t"
|
||||
|
||||
"addic. %1, %1, -32 \n\t"
|
||||
"bgt one%= \n"
|
||||
|
||||
"two%=: \n\t"
|
||||
|
||||
// Tail: scale and store the last loaded block, with no further loads.
"xvmulsp 40, 32, 48 \n\t"
|
||||
"xvmulsp 41, 33, 48 \n\t"
|
||||
"xvmulsp 42, 34, 48 \n\t"
|
||||
"xvmulsp 43, 35, 48 \n\t"
|
||||
|
||||
"xvmulsp 44, 36, 48 \n\t"
|
||||
"xvmulsp 45, 37, 48 \n\t"
|
||||
"xvmulsp 46, 38, 48 \n\t"
|
||||
"xvmulsp 47, 39, 48 \n\t"
|
||||
|
||||
"stxvp 40, 0(%2) \n\t"
|
||||
"stxvp 42, 32(%2) \n\t"
|
||||
"stxvp 44, 64(%2) \n\t"
|
||||
"stxvp 46, 96(%2) \n\t"
|
||||
|
||||
// The string below only emits an assembler comment listing the operands.
"#n=%1 alpha=%3 x=%0=%2"
|
||||
:
|
||||
"+m" (*x),
|
||||
"+r" (n), // 1
|
||||
"+b" (x) // 2
|
||||
:
|
||||
"f" (alpha) // 3
|
||||
:
|
||||
"cr0",
|
||||
"vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
|
||||
"vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47","vs48"
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
/*
 * sscal_kernel_16_zero: overwrites the floats at x with zero.
 * Each loop iteration stores 128 bytes (32 floats) and decrements n by 32.
 *
 * n - element count. The loop has no entry test: one full 128-byte block is
 *     written before n is checked, so this presumably requires n to be a
 *     positive multiple of 32 (TODO confirm against the caller).
 * x - destination; the pointer is advanced inside the asm.
 */
static void sscal_kernel_16_zero (long n, float *x)
|
||||
{
|
||||
|
||||
__asm__
|
||||
(
|
||||
// Zero vs32 and vs33; each stxvp below writes this register pair (32 bytes).
"xxlxor 32, 32, 32 \n\t"
|
||||
"xxlxor 33, 33, 33 \n\t"
|
||||
|
||||
".align 5 \n"
|
||||
"one%=: \n\t"
|
||||
|
||||
"stxvp 32, 0(%2) \n\t"
|
||||
"stxvp 32, 32(%2) \n\t"
|
||||
"stxvp 32, 64(%2) \n\t"
|
||||
"stxvp 32, 96(%2) \n\t"
|
||||
|
||||
"addi %2, %2, 128 \n\t"
|
||||
|
||||
"addic. %1, %1, -32 \n\t"
|
||||
"bgt one%= \n"
|
||||
|
||||
// The string below only emits an assembler comment listing the operands.
"#n=%1 x=%0=%2 "
|
||||
:
|
||||
"=m" (*x),
|
||||
"+r" (n), // 1
|
||||
"+b" (x) // 2
|
||||
:
|
||||
:
|
||||
"cr0","vs32","vs33"
|
||||
);
|
||||
}
|
|
@ -35,9 +35,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#include "common.h"
|
||||
|
||||
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||
#if defined(POWER8) || defined(POWER9)
|
||||
#include "sswap_microk_power8.c"
|
||||
#elif defined(POWER10)
|
||||
#include "swap_microk_power10.c"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
@ -115,12 +117,30 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x,
|
|||
if ( (inc_x == 1) && (inc_y == 1 ))
|
||||
{
|
||||
|
||||
#if defined(POWER10)
|
||||
if ( n >= 64 )
|
||||
{
|
||||
BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 2) & 0x7;
|
||||
for (i = 0; i < align; i++) {
|
||||
temp = y[i];
|
||||
y[i] = x[i];
|
||||
x[i] = temp;
|
||||
}
|
||||
}
|
||||
BLASLONG n1 = (n-i) & -64;
|
||||
if ( n1 > 0 )
|
||||
{
|
||||
sswap_kernel_32(n1,&x[i], &y[i]);
|
||||
i+=n1;
|
||||
}
|
||||
#else
|
||||
BLASLONG n1 = n & -32;
|
||||
if ( n1 > 0 )
|
||||
{
|
||||
sswap_kernel_32(n1, x, y);
|
||||
i=n1;
|
||||
}
|
||||
#endif
|
||||
|
||||
while(i < n)
|
||||
{
|
||||
|
|
|
@ -0,0 +1,105 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2021, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
#define HAVE_KERNEL_32 1
|
||||
|
||||
/*
 * dswap_kernel_32 / sswap_kernel_32: exchanges the contents of x and y.
 * One body serves both precisions; DOUBLE selects the signature and the
 * per-iteration count. Each iteration moves 256 bytes of each vector, and n
 * is decremented by 32 (DOUBLE) or by 64 (otherwise).
 *
 * n    - element count. The loop has no entry test: one full 256-byte block
 *        is swapped before n is checked, so this presumably requires n to be
 *        a positive multiple of the per-iteration count (TODO confirm
 *        against the caller).
 * x, y - vectors swapped in place; both pointers are advanced inside the asm.
 */
#if defined(DOUBLE)
|
||||
static void dswap_kernel_32 (long n, double *x, double *y)
|
||||
#else
|
||||
static void sswap_kernel_32 (long n, float *x, float *y)
|
||||
#endif
|
||||
{
|
||||
__asm__
|
||||
(
|
||||
".align 5 \n"
|
||||
"one%=: \n\t"
|
||||
|
||||
// Load 256 bytes of y (operand 4) into vs32..vs47.
"lxvp 32, 0(%4) \n\t"
|
||||
"lxvp 34, 32(%4) \n\t"
|
||||
"lxvp 36, 64(%4) \n\t"
|
||||
"lxvp 38, 96(%4) \n\t"
|
||||
|
||||
"lxvp 40, 128(%4) \n\t"
|
||||
"lxvp 42, 160(%4) \n\t"
|
||||
"lxvp 44, 192(%4) \n\t"
|
||||
"lxvp 46, 224(%4) \n\t"
|
||||
|
||||
// Load 256 bytes of x (operand 3) into vs48..vs63.
"lxvp 48, 0(%3) \n\t"
|
||||
"lxvp 50, 32(%3) \n\t"
|
||||
"lxvp 52, 64(%3) \n\t"
|
||||
"lxvp 54, 96(%3) \n\t"
|
||||
|
||||
"lxvp 56, 128(%3) \n\t"
|
||||
"lxvp 58, 160(%3) \n\t"
|
||||
"lxvp 60, 192(%3) \n\t"
|
||||
"lxvp 62, 224(%3) \n\t"
|
||||
|
||||
// Write the old y data to x.
"stxvp 32, 0(%3) \n\t"
|
||||
"stxvp 34, 32(%3) \n\t"
|
||||
"stxvp 36, 64(%3) \n\t"
|
||||
"stxvp 38, 96(%3) \n\t"
|
||||
|
||||
"stxvp 40, 128(%3) \n\t"
|
||||
"stxvp 42, 160(%3) \n\t"
|
||||
"stxvp 44, 192(%3) \n\t"
|
||||
"stxvp 46, 224(%3) \n\t"
|
||||
|
||||
// Write the old x data to y.
"stxvp 48, 0(%4) \n\t"
|
||||
"stxvp 50, 32(%4) \n\t"
|
||||
"stxvp 52, 64(%4) \n\t"
|
||||
"stxvp 54, 96(%4) \n\t"
|
||||
|
||||
"stxvp 56, 128(%4) \n\t"
|
||||
"stxvp 58, 160(%4) \n\t"
|
||||
"stxvp 60, 192(%4) \n\t"
|
||||
"stxvp 62, 224(%4) \n\t"
|
||||
|
||||
"addi %4, %4, 256 \n\t"
|
||||
"addi %3, %3, 256 \n\t"
|
||||
|
||||
// 256 bytes is 32 doubles or 64 floats, hence the two decrements.
#if defined(DOUBLE)
|
||||
"addic. %2, %2, -32 \n\t"
|
||||
#else
|
||||
"addic. %2, %2, -64 \n\t"
|
||||
#endif
|
||||
"bgt one%= \n"
|
||||
|
||||
// The string below only emits an assembler comment listing the operands.
"#n=%2 x=%0=%3 y=%1=%4"
|
||||
:
|
||||
"+m" (*x),
|
||||
"+m" (*y),
|
||||
"+r" (n), // 2
|
||||
"+b" (x), // 3
|
||||
"+b" (y) // 4
|
||||
:
|
||||
:
|
||||
"cr0",
|
||||
"vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
|
||||
"vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
|
||||
"vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55",
|
||||
"vs56","vs57","vs58","vs59","vs60","vs61","vs62","vs63"
|
||||
);
|
||||
}
|
|
@ -38,11 +38,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#pragma GCC optimize "O1"
|
||||
|
||||
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||
#if defined(POWER8) || defined(POWER9)
|
||||
#if defined(DOUBLE)
|
||||
#include "zscal_microk_power8.c"
|
||||
#endif
|
||||
#elif defined(POWER10)
|
||||
#if defined(DOUBLE)
|
||||
#include "zscal_microk_power10.c"
|
||||
#else
|
||||
#include "cscal_microk_power10.c"
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
@ -145,7 +151,11 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F
|
|||
{
|
||||
|
||||
|
||||
#if defined(DOUBLE)
|
||||
n1 = n & -8;
|
||||
#else
|
||||
n1 = n & -16;
|
||||
#endif
|
||||
if ( n1 > 0 )
|
||||
{
|
||||
zscal_kernel_8(n1, x, da_r, da_i);
|
||||
|
|
|
@ -0,0 +1,195 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2021, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#define HAVE_KERNEL_8 1
|
||||
|
||||
/*
 * zscal_kernel_8: scales the double-precision complex values at x in place
 * by (alpha_r, alpha_i). Each loop iteration handles 128 bytes (8 complex
 * doubles) and decrements n by 8.
 *
 * n       - element count. The first 128 bytes are loaded before n is
 *           tested, so this presumably requires n to be a positive multiple
 *           of 8 (TODO confirm against the caller).
 * x       - interleaved data updated in place; the pointer is advanced
 *           inside the asm.
 * alpha_r - real part of the scale factor.
 * alpha_i - imaginary part of the scale factor.
 *
 * XXSPLTD_S, XXMRGHD_S and XXSWAPD_S are macros defined outside this block;
 * presumably they expand to the matching xxspltd/xxmrghd/xxswapd instruction
 * strings (verify in the project headers).
 * t0..t5 are scratch vector registers chosen by the compiler (operands 3-8).
 */
static void zscal_kernel_8 (long n, double *x, double alpha_r, double alpha_i)
|
||||
{
|
||||
__vector double t0;
|
||||
__vector double t1;
|
||||
__vector double t2;
|
||||
__vector double t3;
|
||||
__vector double t4;
|
||||
__vector double t5;
|
||||
|
||||
__asm__
|
||||
(
|
||||
"dcbt 0, %2 \n\t"
|
||||
|
||||
// Build the two constant vectors: vs32 holds alpha_r twice, vs33 holds the
// alpha_i pair with one element negated.
"xsnegdp 33, %x10 \n\t" // -alpha_i
|
||||
XXSPLTD_S(32,%x9,0) // alpha_r , alpha_r
|
||||
XXMRGHD_S(33,%x10, 33) // -alpha_i , alpha_i
|
||||
|
||||
// Preload the first 128 bytes into vs40..vs47.
"lxvp 40, 0(%2) \n\t"
|
||||
"lxvp 42, 32(%2) \n\t"
|
||||
"lxvp 44, 64(%2) \n\t"
|
||||
"lxvp 46, 96(%2) \n\t"
|
||||
|
||||
// If only the preloaded block exists, skip the loop and finish at "two".
"addic. %1, %1, -8 \n\t"
|
||||
"ble two%= \n\t"
|
||||
|
||||
".align 5 \n"
|
||||
"one%=: \n\t"
|
||||
|
||||
"xvmuldp 48, 40, 32 \n\t" // x0_r * alpha_r, x0_i * alpha_r
|
||||
"xvmuldp 49, 41, 32 \n\t"
|
||||
"xvmuldp 50, 42, 32 \n\t"
|
||||
"xvmuldp 51, 43, 32 \n\t"
|
||||
"xvmuldp 34, 44, 32 \n\t"
|
||||
"xvmuldp 35, 45, 32 \n\t"
|
||||
"xvmuldp 36, 46, 32 \n\t"
|
||||
"xvmuldp 37, 47, 32 \n\t"
|
||||
|
||||
// Copy each input with its two doubles exchanged, so the cross terms can be
// formed by the multiplies with vs33 below.
XXSWAPD_S(38,40)
|
||||
XXSWAPD_S(39,41)
|
||||
XXSWAPD_S(%x3,42)
|
||||
XXSWAPD_S(%x4,43)
|
||||
XXSWAPD_S(%x5,44)
|
||||
XXSWAPD_S(%x6,45)
|
||||
XXSWAPD_S(%x7,46)
|
||||
XXSWAPD_S(%x8,47)
|
||||
|
||||
"xvmuldp 38, 38, 33 \n\t" // x0_i * -alpha_i, x0_r * alpha_i
|
||||
"xvmuldp 39, 39, 33 \n\t"
|
||||
|
||||
|
||||
"xvmuldp %x3, %x3, 33 \n\t"
|
||||
"xvmuldp %x4, %x4, 33 \n\t"
|
||||
|
||||
|
||||
// vs40..vs47 have been fully consumed above, so the next block (offsets
// 128..224) can be fetched while the remaining multiplies complete.
"lxvp 40, 128(%2) \n\t"
|
||||
"lxvp 42, 160(%2) \n\t"
|
||||
"xvmuldp %x5, %x5, 33 \n\t"
|
||||
"xvmuldp %x6, %x6, 33 \n\t"
|
||||
|
||||
|
||||
"xvmuldp %x7, %x7, 33 \n\t"
|
||||
"xvmuldp %x8, %x8, 33 \n\t"
|
||||
"lxvp 44, 192(%2) \n\t"
|
||||
"lxvp 46, 224(%2) \n\t"
|
||||
|
||||
|
||||
"xvadddp 48, 48, 38 \n\t"
|
||||
"xvadddp 49, 49, 39 \n\t"
|
||||
"xvadddp 50, 50, %x3 \n\t"
|
||||
"xvadddp 51, 51, %x4 \n\t"
|
||||
// NOTE(review): results are stored with stxv in odd/even register order
// (49 then 48, 51 then 50, ...); presumably this mirrors how lxvp placed
// each register pair in memory order on this target - confirm.
"stxv 49, 0(%2) \n\t"
|
||||
"stxv 48, 16(%2) \n\t"
|
||||
"stxv 51, 32(%2) \n\t"
|
||||
"stxv 50, 48(%2) \n\t"
|
||||
|
||||
|
||||
"xvadddp 34, 34, %x5 \n\t"
|
||||
"xvadddp 35, 35, %x6 \n\t"
|
||||
|
||||
|
||||
"xvadddp 36, 36, %x7 \n\t"
|
||||
"xvadddp 37, 37, %x8 \n\t"
|
||||
|
||||
"stxv 35, 64(%2) \n\t"
|
||||
"stxv 34, 80(%2) \n\t"
|
||||
"stxv 37, 96(%2) \n\t"
|
||||
"stxv 36, 112(%2) \n\t"
|
||||
|
||||
"addi %2, %2, 128 \n\t"
|
||||
|
||||
"addic. %1, %1, -8 \n\t"
|
||||
"bgt one%= \n"
|
||||
|
||||
"two%=: \n\t"
|
||||
|
||||
// Tail: same arithmetic on the last loaded block, with no further loads.
"xvmuldp 48, 40, 32 \n\t" // x0_r * alpha_r, x0_i * alpha_r
|
||||
"xvmuldp 49, 41, 32 \n\t"
|
||||
"xvmuldp 50, 42, 32 \n\t"
|
||||
"xvmuldp 51, 43, 32 \n\t"
|
||||
"xvmuldp 34, 44, 32 \n\t"
|
||||
"xvmuldp 35, 45, 32 \n\t"
|
||||
"xvmuldp 36, 46, 32 \n\t"
|
||||
"xvmuldp 37, 47, 32 \n\t"
|
||||
|
||||
XXSWAPD_S(38,40)
|
||||
XXSWAPD_S(39,41)
|
||||
XXSWAPD_S(%x3,42)
|
||||
XXSWAPD_S(%x4,43)
|
||||
XXSWAPD_S(%x5,44)
|
||||
XXSWAPD_S(%x6,45)
|
||||
XXSWAPD_S(%x7,46)
|
||||
XXSWAPD_S(%x8,47)
|
||||
|
||||
|
||||
"xvmuldp 38, 38, 33 \n\t" // x0_i * -alpha_i, x0_r * alpha_i
|
||||
"xvmuldp 39, 39, 33 \n\t"
|
||||
"xvmuldp %x3, %x3, 33 \n\t"
|
||||
"xvmuldp %x4, %x4, 33 \n\t"
|
||||
"xvmuldp %x5, %x5, 33 \n\t"
|
||||
"xvmuldp %x6, %x6, 33 \n\t"
|
||||
"xvmuldp %x7, %x7, 33 \n\t"
|
||||
"xvmuldp %x8, %x8, 33 \n\t"
|
||||
|
||||
"xvadddp 48, 48, 38 \n\t"
|
||||
"xvadddp 49, 49, 39 \n\t"
|
||||
|
||||
"xvadddp 50, 50, %x3 \n\t"
|
||||
"xvadddp 51, 51, %x4 \n\t"
|
||||
"stxv 49, 0(%2) \n\t"
|
||||
"stxv 48, 16(%2) \n\t"
|
||||
"stxv 51, 32(%2) \n\t"
|
||||
"stxv 50, 48(%2) \n\t"
|
||||
|
||||
"xvadddp 34, 34, %x5 \n\t"
|
||||
"xvadddp 35, 35, %x6 \n\t"
|
||||
|
||||
|
||||
"xvadddp 36, 36, %x7 \n\t"
|
||||
"xvadddp 37, 37, %x8 \n\t"
|
||||
|
||||
"stxv 35, 64(%2) \n\t"
|
||||
"stxv 34, 80(%2) \n\t"
|
||||
"stxv 37, 96(%2) \n\t"
|
||||
"stxv 36, 112(%2) \n\t"
|
||||
|
||||
// The string below only emits an assembler comment listing the operands.
"#n=%1 x=%0=%2 alpha=(%9,%10) \n"
|
||||
:
|
||||
"+m" (*x),
|
||||
"+r" (n), // 1
|
||||
"+b" (x), // 2
|
||||
"=wa" (t0), // 3
|
||||
"=wa" (t1), // 4
|
||||
"=wa" (t2), // 5
|
||||
"=wa" (t3), // 6
|
||||
"=wa" (t4), // 7
|
||||
"=wa" (t5) // 8
|
||||
:
|
||||
"d" (alpha_r), // 9
|
||||
"d" (alpha_i) // 10
|
||||
:
|
||||
"cr0",
|
||||
"vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
|
||||
"vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
|
||||
"vs48","vs49","vs50","vs51"
|
||||
);
|
||||
}
|
|
@ -36,9 +36,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#include "common.h"
|
||||
|
||||
|
||||
#if defined(POWER8) || defined(POWER9) || defined(POWER10)
|
||||
#if defined(__VEC__) || defined(__ALTIVEC__)
|
||||
#if defined(POWER8) || defined(POWER9)
|
||||
#include "zswap_microk_power8.c"
|
||||
#elif defined(POWER10)
|
||||
#include "cswap_microk_power10.c"
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
|
|
@ -489,3 +489,6 @@ XGEMM3MKERNEL = xgemm3m_kernel_2x2.S
|
|||
|
||||
SSUMKERNEL = ../arm/sum.c
|
||||
DSUMKERNEL = ../arm/sum.c
|
||||
|
||||
SOMATCOPY_RT = omatcopy_rt.c
|
||||
DOMATCOPY_RT = omatcopy_rt.c
|
||||
|
|
|
@ -97,3 +97,5 @@ ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
|||
CGEMM3MKERNEL = cgemm3m_kernel_8x4_haswell.c
|
||||
ZGEMM3MKERNEL = zgemm3m_kernel_4x4_haswell.c
|
||||
|
||||
SROTKERNEL = srot.c
|
||||
DROTKERNEL = drot.c
|
||||
|
|
|
@ -6,7 +6,7 @@
|
|||
|
||||
#if defined(SKYLAKEX)
|
||||
#include "dasum_microk_skylakex-2.c"
|
||||
#elif defined(HASWELL)
|
||||
#elif defined(HASWELL) || defined(ZEN)
|
||||
#include "dasum_microk_haswell-2.c"
|
||||
#endif
|
||||
|
||||
|
@ -93,7 +93,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
|||
#if defined(SMP)
|
||||
int nthreads;
|
||||
FLOAT dummy_alpha;
|
||||
FLOAT * dummy_b;
|
||||
#endif
|
||||
FLOAT sumf = 0.0;
|
||||
|
||||
|
@ -115,7 +114,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
|||
#else
|
||||
mode = BLAS_DOUBLE | BLAS_REAL;
|
||||
#endif
|
||||
blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, x, inc_x, dummy_b, 0, result, 0, (void *)asum_thread_function, nthreads);
|
||||
blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, x, inc_x, NULL, 0, result, 0, (void *)asum_thread_function, nthreads);
|
||||
ptr = (FLOAT *)result;
|
||||
for (i = 0; i < nthreads; i++) {
|
||||
sumf += (*ptr);
|
||||
|
|
|
@ -2,7 +2,7 @@
|
|||
|
||||
#if defined(SKYLAKEX)
|
||||
#include "drot_microk_skylakex-2.c"
|
||||
#elif defined(HASWELL)
|
||||
#elif defined(HASWELL) || defined(ZEN)
|
||||
#include "drot_microk_haswell-2.c"
|
||||
#endif
|
||||
|
||||
|
|
|
@ -0,0 +1,373 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2021, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#ifdef HAVE_AVX
|
||||
|
||||
#define ROWS_OF_BLOCK 384
|
||||
|
||||
/* +r: %0 = src, %1 = dst, %2 = src_ld, %3 = dst_ld, %4 = dst_tmp */
|
||||
/* m: %5 = num_rows, %6 = alpha */
|
||||
/* xmm15 = alpha */
|
||||
/* Transpose a 4x4 tile of floats held in xmm registers a1..a4 (one source row
 * per register). t1..t4 are scratch registers. On exit a1..a4 hold columns
 * 0..3 of the tile. */
#define TRANS_4x4(a1_no,a2_no,a3_no,a4_no,t1_no,t2_no,t3_no,t4_no) \
  /* 32-bit interleave of row pairs (a1,a2) and (a3,a4) */ \
  "vunpcklps %%xmm"#a2_no",%%xmm"#a1_no",%%xmm"#t1_no";" \
  " vunpckhps %%xmm"#a2_no",%%xmm"#a1_no",%%xmm"#t2_no";" \
  "vunpcklps %%xmm"#a4_no",%%xmm"#a3_no",%%xmm"#t3_no";" \
  " vunpckhps %%xmm"#a4_no",%%xmm"#a3_no",%%xmm"#t4_no";" \
  /* 64-bit interleave of the partial results completes the transpose */ \
  "vunpcklpd %%xmm"#t3_no",%%xmm"#t1_no",%%xmm"#a1_no";" \
  " vunpckhpd %%xmm"#t3_no",%%xmm"#t1_no",%%xmm"#a2_no";" \
  "vunpcklpd %%xmm"#t4_no",%%xmm"#t2_no",%%xmm"#a3_no";" \
  " vunpckhpd %%xmm"#t4_no",%%xmm"#t2_no",%%xmm"#a4_no";"

/* Same instruction sequence as TRANS_4x4 but on ymm registers: the unpack
 * instructions work independently on each 128-bit lane, so the low and the
 * high lane each receive their own transposed 4x4 tile. */
#define TRANS_4x8(a1_no,a2_no,a3_no,a4_no,t1_no,t2_no,t3_no,t4_no) \
  "vunpcklps %%ymm"#a2_no",%%ymm"#a1_no",%%ymm"#t1_no";" \
  " vunpckhps %%ymm"#a2_no",%%ymm"#a1_no",%%ymm"#t2_no";" \
  "vunpcklps %%ymm"#a4_no",%%ymm"#a3_no",%%ymm"#t3_no";" \
  " vunpckhps %%ymm"#a4_no",%%ymm"#a3_no",%%ymm"#t4_no";" \
  "vunpcklpd %%ymm"#t3_no",%%ymm"#t1_no",%%ymm"#a1_no";" \
  " vunpckhpd %%ymm"#t3_no",%%ymm"#t1_no",%%ymm"#a2_no";" \
  "vunpcklpd %%ymm"#t4_no",%%ymm"#t2_no",%%ymm"#a3_no";" \
  " vunpckhpd %%ymm"#t4_no",%%ymm"#t2_no",%%ymm"#a4_no";"

/* Store xmm registers b1..b4 to four consecutive destination rows, starting at
 * %4 (dst_tmp) with a row pitch of %3 bytes (dst_ld). %4 ends up advanced by
 * four rows. */
#define SAVE_4x4(b1_no,b2_no,b3_no,b4_no) \
  "vmovups %%xmm"#b1_no",(%4);" \
  " vmovups %%xmm"#b2_no",(%4,%3,1);" \
  " leaq (%4,%3,2),%4;" \
  "vmovups %%xmm"#b3_no",(%4);" \
  " vmovups %%xmm"#b4_no",(%4,%3,1);" \
  " leaq (%4,%3,2),%4;"

/* Store ymm registers b1..b4 as eight destination rows: first the low 128-bit
 * lanes (via SAVE_4x4), then the high lanes to the following four rows.
 * %4 ends up advanced by eight rows. */
#define SAVE_4x8(b1_no,b2_no,b3_no,b4_no) \
  SAVE_4x4(b1_no,b2_no,b3_no,b4_no) \
  "vextractf128 $1,%%ymm"#b1_no",(%4);" \
  " vextractf128 $1,%%ymm"#b2_no",(%4,%3,1);" \
  " leaq (%4,%3,2),%4;" \
  "vextractf128 $1,%%ymm"#b3_no",(%4);" \
  " vextractf128 $1,%%ymm"#b4_no",(%4,%3,1);" \
  " leaq (%4,%3,2),%4;"
|
||||
|
||||
/* COPY_4x<N>: consume 4 source rows x N columns. Each element is multiplied by
 * alpha (broadcast in xmm15/ymm15), the tile is transposed and written as N
 * destination rows of 4 floats. On exit %0 (src) has advanced by 4 source rows
 * and %1 (dst) by 16 bytes, i.e. 4 floats. */

#define COPY_4x16 \
  "movq %1,%4;" \
  " addq $16,%1;" \
  /* rows 0 and 1: columns 0-7 -> ymm0/ymm1, columns 8-15 -> ymm4/ymm5 */ \
  "vmulps (%0),%%ymm15,%%ymm0;" \
  " vmulps 32(%0),%%ymm15,%%ymm4;" \
  " vmulps (%0,%2,1),%%ymm15,%%ymm1;" \
  " vmulps 32(%0,%2,1),%%ymm15,%%ymm5;" \
  " leaq (%0,%2,2),%0;" \
  /* rows 2 and 3: columns 0-7 -> ymm2/ymm3, columns 8-15 -> ymm6/ymm7 */ \
  "vmulps (%0),%%ymm15,%%ymm2;" \
  " vmulps 32(%0),%%ymm15,%%ymm6;" \
  " vmulps (%0,%2,1),%%ymm15,%%ymm3;" \
  " vmulps 32(%0,%2,1),%%ymm15,%%ymm7;" \
  " leaq (%0,%2,2),%0;" \
  TRANS_4x8(0,1,2,3,8,9,10,11) \
  SAVE_4x8(0,1,2,3) \
  TRANS_4x8(4,5,6,7,8,9,10,11) \
  SAVE_4x8(4,5,6,7)

#define COPY_4x8 \
  "movq %1,%4;" \
  " addq $16,%1;" \
  "vmulps (%0),%%ymm15,%%ymm0;" \
  " vmulps (%0,%2,1),%%ymm15,%%ymm1;" \
  " leaq (%0,%2,2),%0;" \
  "vmulps (%0),%%ymm15,%%ymm2;" \
  " vmulps (%0,%2,1),%%ymm15,%%ymm3;" \
  " leaq (%0,%2,2),%0;" \
  TRANS_4x8(0,1,2,3,8,9,10,11) \
  SAVE_4x8(0,1,2,3)

#define COPY_4x4 \
  "movq %1,%4;" \
  " addq $16,%1;" \
  "vmulps (%0),%%xmm15,%%xmm0;" \
  " vmulps (%0,%2,1),%%xmm15,%%xmm1;" \
  " leaq (%0,%2,2),%0;" \
  "vmulps (%0),%%xmm15,%%xmm2;" \
  " vmulps (%0,%2,1),%%xmm15,%%xmm3;" \
  " leaq (%0,%2,2),%0;" \
  TRANS_4x4(0,1,2,3,8,9,10,11) \
  SAVE_4x4(0,1,2,3)

/* 4 rows x 2 columns: two rows are packed into each xmm register, scaled,
 * reordered with vpermilps so that equal columns are adjacent, then combined
 * into one register per destination row. Stores go straight through %1. */
#define COPY_4x2 \
  "vmovsd (%0),%%xmm0;" \
  " vmovhpd (%0,%2,1),%%xmm0,%%xmm0;" \
  " vmulps %%xmm15,%%xmm0,%%xmm0;" \
  " leaq (%0,%2,2),%0;" \
  "vmovsd (%0),%%xmm1;" \
  " vmovhpd (%0,%2,1),%%xmm1,%%xmm1;" \
  " vmulps %%xmm15,%%xmm1,%%xmm1;" \
  " leaq (%0,%2,2),%0;" \
  "vpermilps $216,%%xmm0,%%xmm0;" \
  " vpermilps $216,%%xmm1,%%xmm1;" \
  " vunpcklpd %%xmm1,%%xmm0,%%xmm2;" \
  " vunpckhpd %%xmm1,%%xmm0,%%xmm3;" \
  "vmovups %%xmm2,(%1);" \
  " vmovups %%xmm3,(%1,%3,1);" \
  " addq $16,%1;"

/* 4 rows x 1 column: gather one float from each of four rows into xmm0,
 * scale it and store it as a single destination row. */
#define COPY_4x1 \
  "vmovss (%0),%%xmm0;" \
  " vinsertps $16,(%0,%2,1),%%xmm0,%%xmm0;" \
  " leaq (%0,%2,2),%0;" \
  "vinsertps $32,(%0),%%xmm0,%%xmm0;" \
  " vinsertps $48,(%0,%2,1),%%xmm0,%%xmm0;" \
  " leaq (%0,%2,2),%0;" \
  "vmulps %%xmm15,%%xmm0,%%xmm0;" \
  " vmovups %%xmm0,(%1);" \
  " addq $16,%1;"
|
||||
|
||||
/* Interleave two source rows held in xmm c1/c2 (4 columns each), scale by
 * alpha (xmm15) and store them as four destination rows of 2 floats through
 * %4 (dst_tmp), which ends up advanced by four rows. t1/t2 are scratch. */
#define SAVE_2x4(c1_no,c2_no,t1_no,t2_no) \
  /* columns 0 and 1 */ \
  "vunpcklps %%xmm"#c2_no",%%xmm"#c1_no",%%xmm"#t1_no";" \
  " vmulps %%xmm15,%%xmm"#t1_no",%%xmm"#t1_no";" \
  "vmovsd %%xmm"#t1_no",(%4);" \
  " vmovhpd %%xmm"#t1_no",(%4,%3,1);" \
  " leaq (%4,%3,2),%4;" \
  /* columns 2 and 3 */ \
  "vunpckhps %%xmm"#c2_no",%%xmm"#c1_no",%%xmm"#t2_no";" \
  " vmulps %%xmm15,%%xmm"#t2_no",%%xmm"#t2_no";" \
  "vmovsd %%xmm"#t2_no",(%4);" \
  " vmovhpd %%xmm"#t2_no",(%4,%3,1);" \
  " leaq (%4,%3,2),%4;"

/* COPY_2x<N>: consume 2 source rows x N columns and write N destination rows
 * of 2 floats. On exit %0 (src) has advanced by 2 source rows and %1 (dst) by
 * 8 bytes, i.e. 2 floats. */

#define COPY_2x16 \
  "movq %1,%4;" \
  " addq $8,%1;" \
  "vmovups (%0),%%ymm0;" \
  " vmovups 32(%0),%%ymm2;" \
  " vmovups (%0,%2,1),%%ymm1;" \
  " vmovups 32(%0,%2,1),%%ymm3;" \
  " leaq (%0,%2,2),%0;" \
  /* split off the upper lanes: columns 4-7 -> xmm4/5, columns 12-15 -> xmm6/7 */ \
  "vextractf128 $1,%%ymm0,%%xmm4;" \
  " vextractf128 $1,%%ymm2,%%xmm6;" \
  " vextractf128 $1,%%ymm1,%%xmm5;" \
  " vextractf128 $1,%%ymm3,%%xmm7;" \
  SAVE_2x4(0,1,8,9) \
  SAVE_2x4(4,5,8,9) \
  SAVE_2x4(2,3,8,9) \
  SAVE_2x4(6,7,8,9)

#define COPY_2x8 \
  "movq %1,%4;" \
  " addq $8,%1;" \
  "vmovups (%0),%%ymm0;" \
  " vmovups (%0,%2,1),%%ymm1;" \
  " leaq (%0,%2,2),%0;" \
  "vextractf128 $1,%%ymm0,%%xmm2;" \
  " vextractf128 $1,%%ymm1,%%xmm3;" \
  SAVE_2x4(0,1,4,5) \
  SAVE_2x4(2,3,4,5)

#define COPY_2x4 \
  "movq %1,%4;" \
  " addq $8,%1;" \
  "vmovups (%0),%%xmm0;" \
  " vmovups (%0,%2,1),%%xmm1;" \
  " leaq (%0,%2,2),%0;" \
  SAVE_2x4(0,1,4,5)

/* 2 rows x 2 columns, stored directly through %1. */
#define COPY_2x2 \
  "vmovsd (%0),%%xmm0;" \
  " vmovhpd (%0,%2,1),%%xmm0,%%xmm0;" \
  " vmulps %%xmm15,%%xmm0,%%xmm0;" \
  " leaq (%0,%2,2),%0;" \
  " vpermilps $216,%%xmm0,%%xmm0;" \
  "vmovsd %%xmm0,(%1);" \
  " vmovhpd %%xmm0,(%1,%3,1);" \
  " addq $8,%1;"

/* 2 rows x 1 column, stored directly through %1. */
#define COPY_2x1 \
  "vmovss (%0),%%xmm0;" \
  " vinsertps $16,(%0,%2,1),%%xmm0,%%xmm0;" \
  " vmulps %%xmm15,%%xmm0,%%xmm0;" \
  " leaq (%0,%2,2),%0;" \
  " vmovsd %%xmm0,(%1);" \
  " addq $8,%1;"
|
||||
|
||||
/* Scale the four floats in xmm c1 by alpha (xmm15) and scatter them, one per
 * destination row, through %4 (dst_tmp), which ends up advanced by four rows. */
#define SAVE_1x4(c1_no) \
  "vmulps %%xmm15,%%xmm"#c1_no",%%xmm"#c1_no";" \
  " vmovss %%xmm"#c1_no",(%4);" \
  " vextractps $1,%%xmm"#c1_no",(%4,%3,1);" \
  " leaq (%4,%3,2),%4;" \
  "vextractps $2,%%xmm"#c1_no",(%4);" \
  " vextractps $3,%%xmm"#c1_no",(%4,%3,1);" \
  " leaq (%4,%3,2),%4;"

/* COPY_1x<N>: consume 1 source row x N columns and write N destination rows of
 * 1 float. On exit %0 (src) has advanced by one source row and %1 (dst) by
 * 4 bytes, i.e. 1 float. */

#define COPY_1x16 \
  "movq %1,%4;" \
  " addq $4,%1;" \
  "vmovups (%0),%%xmm1;" \
  SAVE_1x4(1) \
  "vmovups 16(%0),%%xmm2;" \
  SAVE_1x4(2) \
  "vmovups 32(%0),%%xmm1;" \
  SAVE_1x4(1) \
  "vmovups 48(%0),%%xmm2;" \
  SAVE_1x4(2) \
  "addq %2,%0;"

#define COPY_1x8 \
  "movq %1,%4;" \
  " addq $4,%1;" \
  "vmovups (%0),%%xmm1;" \
  SAVE_1x4(1) \
  "vmovups 16(%0),%%xmm2;" \
  SAVE_1x4(2) \
  "addq %2,%0;"

#define COPY_1x4 \
  "movq %1,%4;" \
  " addq $4,%1;" \
  " vmovups (%0),%%xmm1;" \
  SAVE_1x4(1) \
  "addq %2,%0;"

/* 1 row x 2 columns, stored directly through %1. */
#define COPY_1x2 \
  "vmovsd (%0),%%xmm1;" \
  " addq %2,%0;" \
  " vmulps %%xmm15,%%xmm1,%%xmm1;" \
  " vmovss %%xmm1,(%1);" \
  " vextractps $1,%%xmm1,(%1,%3,1);" \
  " addq $4,%1;"

/* A single element: dst[0] = alpha * src[0]. */
#define COPY_1x1 \
  "vmulss (%0),%%xmm15,%%xmm1;" \
  " vmovss %%xmm1,(%1);" \
  " addq %2,%0;" \
  " addq $4,%1;"
|
||||
|
||||
/* Transpose-and-scale one panel of num_rows source rows x ndim columns.
 *
 * The macro uses these locals of the enclosing function by name: src_base,
 * dst_base (panel origins), src, dst, dst_tmp (working pointers),
 * src_ld_bytes, dst_ld_bytes (row pitches in bytes), num_rows and ALPHA.
 *
 * r11 counts the remaining source rows. Rows are consumed four at a time
 * (COPY_4x<ndim>), then at most one pair (COPY_2x<ndim>), then at most one
 * single row (COPY_1x<ndim>). ndim also prefixes the local numeric labels so
 * that each expansion gets its own label set. */
#define COMPUTE(ndim) { \
  src = src_base; \
  dst = dst_base; \
  __asm__ __volatile__( \
    "vbroadcastss %6,%%ymm15;" \
    " movq %5,%%r11;" \
    " cmpq $4,%%r11;" \
    " jb "#ndim"32f;" \
    #ndim"31:\n\t" \
    COPY_4x##ndim \
    "subq $4,%%r11;" \
    " cmpq $4,%%r11;" \
    " jnb "#ndim"31b;" \
    #ndim"32:\n\t" \
    "cmpq $2,%%r11;" \
    " jb "#ndim"33f;" \
    COPY_2x##ndim \
    "subq $2,%%r11;" \
    #ndim"33:\n\t" \
    "testq %%r11,%%r11;" \
    " jz "#ndim"34f;" \
    COPY_1x##ndim \
    "subq $1,%%r11;" \
    #ndim"34:\n\t" \
    : "+r"(src), "+r"(dst), "+r"(src_ld_bytes), "+r"(dst_ld_bytes), "+r"(dst_tmp) \
    : "m"(num_rows), "m"(ALPHA) \
    : "r11", "cc", "memory", \
      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", \
      "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15"); \
}
|
||||
/* Single-precision out-of-place scaled transpose (AVX path).
 *
 * For 0 <= i < rows and 0 <= j < cols this stores
 *     b[j*ldb + i] = alpha * a[i*lda + j].
 * When alpha is zero the destination is cleared with memset and the source is
 * not read. Always returns 0.
 *
 * The source is processed in blocks of at most ROWS_OF_BLOCK rows. Within a
 * block, column panels of width 16, 8, 4, 2 and 1 are handed to COMPUTE.
 */
int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG ldb){
  /* These names are referenced by the COMPUTE macro and must not be renamed.
   * dst_tmp is initialized because COMPUTE passes it as a read-write ("+r")
   * asm operand; leaving it indeterminate means reading an uninitialized
   * variable whenever the asm block is entered. */
  float *src, *dst, *dst_tmp = NULL, *src_base, *dst_base;
  uint64_t src_ld_bytes = (uint64_t)lda * sizeof(float);
  uint64_t dst_ld_bytes = (uint64_t)ldb * sizeof(float);
  uint64_t num_rows = 0;
  BLASLONG cols_left, rows_done;
  float ALPHA = alpha;

  if(ALPHA==0.0){
    /* alpha == 0: every destination row (one per source column) becomes zero */
    dst_base = b;
    for(cols_left=cols;cols_left>0;cols_left--){
      memset(dst_base,0,rows*sizeof(float));
      dst_base += ldb;
    }
    return 0;
  }

  for(rows_done=0;rows_done<rows;rows_done+=num_rows){
    num_rows = rows-rows_done;
    if(num_rows > ROWS_OF_BLOCK) num_rows = ROWS_OF_BLOCK;

    cols_left = cols;
    src_base = a + (int64_t)lda * (int64_t)rows_done;
    dst_base = b + rows_done;

    /* The 16-column panel is skipped when ldb is within 3 of a multiple of
     * 1024. NOTE(review): presumably this avoids cache-set conflicts between
     * the 16 destination rows; confirm with the author. */
    if(ldb%1024>3 && ldb%1024<1021){
      for(;cols_left>15;cols_left-=16){
        COMPUTE(16)
        src_base += 16;
        dst_base += 16 * ldb;
      }
    }
    for(;cols_left>7;cols_left-=8){
      COMPUTE(8)
      src_base += 8;
      dst_base += 8 * ldb;
    }
    for(;cols_left>3;cols_left-=4){
      COMPUTE(4)
      src_base += 4;
      dst_base += 4 * ldb;
    }
    for(;cols_left>1;cols_left-=2){
      COMPUTE(2)
      src_base += 2;
      dst_base += 2 * ldb;
    }
    if(cols_left>0){
      COMPUTE(1)
      src_base ++;
      dst_base += ldb;
    }
  }
  return 0;
}
|
||||
|
||||
#else
|
||||
|
||||
/* Transpose-and-scale one horizontal strip of `height` source rows:
 * b_strip[col*ldb + r] = a_strip[r*lda + col] * alpha for every column. */
static void somatcopy_rt_strip(BLASLONG height, BLASLONG cols, FLOAT alpha,
                               FLOAT *a_strip, BLASLONG lda,
                               FLOAT *b_strip, BLASLONG ldb)
{
  BLASLONG col, r;

  for (col = 0; col < cols; col++) {
    FLOAT *out = b_strip + col * ldb;   /* destination row for this source column */
    for (r = 0; r < height; r++) {
      *(out + r) = *(a_strip + r * lda + col) * alpha;
    }
  }
}

/* Generic (non-AVX) out-of-place scaled transpose.
 *
 * For 0 <= i < rows and 0 <= j < cols stores b[j*ldb + i] = a[i*lda + j]*alpha.
 * Returns 0 in every case, including the early exit for non-positive rows or
 * cols. The rows are walked in strips of four, followed by at most one strip
 * of two and at most one single row.
 */
int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG ldb)
{
  BLASLONG quad;
  FLOAT *a_strip = a;
  FLOAT *b_strip = b;

  if (rows <= 0 || cols <= 0) return 0;

  /* full strips of four rows */
  for (quad = (rows >> 2); quad > 0; quad--) {
    somatcopy_rt_strip(4, cols, alpha, a_strip, lda, b_strip, ldb);
    a_strip += 4 * lda;
    b_strip += 4;
  }

  /* remaining pair of rows, if any */
  if (rows & 2) {
    somatcopy_rt_strip(2, cols, alpha, a_strip, lda, b_strip, ldb);
    a_strip += 2 * lda;
    b_strip += 2;
  }

  /* remaining single row, if any */
  if (rows & 1) {
    somatcopy_rt_strip(1, cols, alpha, a_strip, lda, b_strip, ldb);
  }

  return 0;
}
|
||||
|
||||
#endif
|
|
@ -11,7 +11,7 @@
|
|||
|
||||
#if defined(SKYLAKEX)
|
||||
#include "sasum_microk_skylakex-2.c"
|
||||
#elif defined(HASWELL)
|
||||
#elif defined(HASWELL) || defined(ZEN)
|
||||
#include "sasum_microk_haswell-2.c"
|
||||
#endif
|
||||
|
||||
|
|
|
@ -0,0 +1,426 @@
|
|||
#include "sbgemm.h"
|
||||
|
||||
#include <immintrin.h>
|
||||
// Work around compilers that do not provide the unmasked 256-bit epi16
// load/store intrinsics: use the masked variants with an all-ones mask.
#define MM256_LOADU_EPI16(addr) _mm256_maskz_loadu_epi16(~0, (addr))
#define MM256_STOREU_EPI16(addr, reg) _mm256_mask_storeu_epi16((addr), ~0, (reg))
|
||||
|
||||
#include <stdio.h>
|
||||
// Debug helper: dump an m x n block of bfloat16 values (stored contiguously,
// n values per row) to stdout as hexadecimal, one block row per output line.
void print_block(BLASLONG m, BLASLONG n, bfloat16 * mat)
{
    // BLASLONG is not `long` on every ABI, so convert explicitly to the type
    // that the "%ld" conversion expects.
    printf("---- BLOCK %ld x %ld ----\n", (long)m, (long)n);
    for (BLASLONG row = 0; row < m; row++) {
        const bfloat16 * line = mat + row*n;
        for (BLASLONG col = 0; col < n; col++) {
            // "%X" expects an unsigned int argument.
            printf("%-4X ", (unsigned int)line[col]);
        }
        printf("\n");
    }
    printf("---- End of BLOCK ----\n");
}
|
||||
|
||||
// Pack a K x 32 panel of A into block_A.
//
// A is read as k vectors of 32 consecutive bfloat16 values, vector i starting
// at A[i*lda]. Two successive vectors are interleaved element-wise with the
// 16-bit unpack instructions (which operate per 128-bit lane) and written as
// 64 consecutive values of block_A: the unpacklo result first, then the
// unpackhi result. If k is odd, the last vector is interleaved with zeros.
void COL_MAJOR_INCOPY_KERNEL_Kx32(BLASLONG k, bfloat16 * A, BLASLONG lda, bfloat16 * block_A)
{
    const BLASLONG k_even = k & (~1);

    for (BLASLONG kk = 0; kk < k_even; kk += 2) {
        const __m512i vec0 = _mm512_loadu_si512(&A[kk*lda]);
        const __m512i vec1 = _mm512_loadu_si512(&A[kk*lda + lda]);
        bfloat16 * out = &block_A[kk*32];    // 64 outputs per pair of inputs

        _mm512_storeu_si512(out,      _mm512_unpacklo_epi16(vec0, vec1));
        _mm512_storeu_si512(out + 32, _mm512_unpackhi_epi16(vec0, vec1));
    }

    if (k_even != k) {
        // Odd k: pair the final vector with zeros.
        const __m512i zeros = _mm512_setzero_si512();
        const __m512i vec0  = _mm512_loadu_si512(&A[k_even*lda]);
        bfloat16 * out = &block_A[k_even*32];

        _mm512_storeu_si512(out,      _mm512_unpacklo_epi16(vec0, zeros));
        _mm512_storeu_si512(out + 32, _mm512_unpackhi_epi16(vec0, zeros));
    }

#ifdef DEBUG_PROFILE
    print_block(BF16_BLOCK_THRES_K, BF16_BLOCK_THRES_M, block_A);
#endif
}
|
||||
|
||||
// Pack a K x m panel of A into block_A; masked variant of the Kx32 kernel.
//
// Only the first m elements of each 32-element vector are loaded (the mask has
// its low m bits set); the remaining lanes read as zero. The packed layout is
// the same as the full-width kernel: every pair of input vectors produces 64
// consecutive bfloat16 values in block_A.
// NOTE(review): the mask expression shifts by (32-m), so m is presumably
// expected to lie in 1..32 - confirm against the callers.
void COL_MAJOR_INCOPY_KERNEL_Kx32m(BLASLONG k, BLASLONG m, bfloat16 * A, BLASLONG lda, bfloat16 * block_A)
{
    const BLASLONG k_even = k & (~1);
    const __mmask32 tail_mask = (__mmask32)(((unsigned int)0xffffffff) >> (32-m));

    for (BLASLONG kk = 0; kk < k_even; kk += 2) {
        const __m512i vec0 = _mm512_maskz_loadu_epi16(tail_mask, &A[kk*lda]);
        const __m512i vec1 = _mm512_maskz_loadu_epi16(tail_mask, &A[kk*lda + lda]);
        bfloat16 * out = &block_A[kk*32];

        _mm512_storeu_si512(out,      _mm512_unpacklo_epi16(vec0, vec1));
        _mm512_storeu_si512(out + 32, _mm512_unpackhi_epi16(vec0, vec1));
    }

    if (k_even != k) {
        // Odd k: pair the final vector with zeros.
        const __m512i zeros = _mm512_setzero_si512();
        const __m512i vec0  = _mm512_maskz_loadu_epi16(tail_mask, &A[k_even*lda]);
        bfloat16 * out = &block_A[k_even*32];

        _mm512_storeu_si512(out,      _mm512_unpacklo_epi16(vec0, zeros));
        _mm512_storeu_si512(out + 32, _mm512_unpackhi_epi16(vec0, zeros));
    }

#ifdef DEBUG_PROFILE
    print_block(BF16_BLOCK_THRES_K, BF16_BLOCK_THRES_M, block_A);
#endif
}
|
||||
|
||||
// Pack a K x 16 panel of A into block_A.
//
// A is read as k vectors of 16 consecutive bfloat16 values, vector i starting
// at A[i*lda]. Each pair of vectors is interleaved element-wise (16-bit
// unpack, per 128-bit lane) and written as 32 consecutive values of block_A.
// If k is odd, the last vector is interleaved with zeros.
// The parameter m is not used by this full-width variant.
void COL_MAJOR_INCOPY_KERNEL_Kx16(BLASLONG k, BLASLONG m, bfloat16 * A, BLASLONG lda, bfloat16 * block_A)
{
    const BLASLONG k_even = k & (~1);

    for (BLASLONG kk = 0; kk < k_even; kk += 2) {
        const __m256i vec0 = MM256_LOADU_EPI16(&A[kk*lda]);
        const __m256i vec1 = MM256_LOADU_EPI16(&A[kk*lda + lda]);
        bfloat16 * out = &block_A[kk*16];    // 32 outputs per pair of inputs

        // One packed row of block_A: low interleave, then high interleave.
        MM256_STOREU_EPI16(out,      _mm256_unpacklo_epi16(vec0, vec1));
        MM256_STOREU_EPI16(out + 16, _mm256_unpackhi_epi16(vec0, vec1));
    }

    if (k_even != k) {
        // Odd k: pair the final vector with zeros.
        const __m256i zeros = _mm256_setzero_si256();
        const __m256i vec0  = MM256_LOADU_EPI16(&A[k_even*lda]);
        bfloat16 * out = &block_A[k_even*16];

        MM256_STOREU_EPI16(out,      _mm256_unpacklo_epi16(vec0, zeros));
        MM256_STOREU_EPI16(out + 16, _mm256_unpackhi_epi16(vec0, zeros));
    }

#ifdef DEBUG_PROFILE
    print_block(BF16_BLOCK_THRES_K, BF16_BLOCK_THRES_M, block_A);
#endif
}
|
||||
|
||||
// Pack a K x m panel of A into block_A; masked variant of the Kx16 kernel.
//
// Only the first m elements of each 16-element vector are loaded (the mask has
// its low m bits set); the remaining lanes read as zero. Every pair of input
// vectors still produces 32 consecutive bfloat16 values in block_A.
void COL_MAJOR_INCOPY_KERNEL_Kx16m(BLASLONG k, BLASLONG m, bfloat16 * A, BLASLONG lda, bfloat16 * block_A)
{
    const BLASLONG k_even = k & (~1);
    const __mmask16 tail_mask = (__mmask16)(((unsigned short)0xffff) >> (16-m));

    for (BLASLONG kk = 0; kk < k_even; kk += 2) {
        const __m256i vec0 = _mm256_maskz_loadu_epi16(tail_mask, &A[kk*lda]);
        const __m256i vec1 = _mm256_maskz_loadu_epi16(tail_mask, &A[kk*lda + lda]);
        bfloat16 * out = &block_A[kk*16];

        // One packed row of block_A: low interleave, then high interleave.
        MM256_STOREU_EPI16(out,      _mm256_unpacklo_epi16(vec0, vec1));
        MM256_STOREU_EPI16(out + 16, _mm256_unpackhi_epi16(vec0, vec1));
    }

    if (k_even != k) {
        // Odd k: pair the final vector with zeros.
        const __m256i zeros = _mm256_setzero_si256();
        const __m256i vec0  = _mm256_maskz_loadu_epi16(tail_mask, &A[k_even*lda]);
        bfloat16 * out = &block_A[k_even*16];

        MM256_STOREU_EPI16(out,      _mm256_unpacklo_epi16(vec0, zeros));
        MM256_STOREU_EPI16(out + 16, _mm256_unpackhi_epi16(vec0, zeros));
    }

#ifdef DEBUG_PROFILE
    print_block(BF16_BLOCK_THRES_K, BF16_BLOCK_THRES_M, block_A);
#endif
}
|
||||
|
||||
// Pack an 8 x K panel of B into block_B.
//
// B is read as 8 lines of k consecutive bfloat16 values, line c starting at
// B[c*ldb]. For every chunk of 32 values along k, the 8 chunks (one per line)
// are stored back to back, so each chunk occupies 8*32 consecutive entries of
// block_B. A final partial chunk is loaded with a mask and zero-padded to 32
// values.
void COL_MAJOR_ONCOPY_KERNEL_8x32(BLASLONG k, bfloat16 * B, BLASLONG ldb, bfloat16 * block_B)
{
    const BLASLONG k_full = k & (~31);

    for (BLASLONG kk = 0; kk < k_full; kk += 32) {
        bfloat16 * out = &block_B[kk*8];     // 8 lines * 32 values per chunk
        for (BLASLONG line = 0; line < 8; line++) {
            _mm512_storeu_si512(&out[32*line], _mm512_loadu_si512(&B[line*ldb + kk]));
        }
    }

    if (k_full != k) {
        // Low (k - k_full) bits set: only the remaining values are loaded.
        const __mmask32 tail_mask = (__mmask32)(((unsigned int)0xffffffff) >> (32-(k-k_full)));
        bfloat16 * out = &block_B[k_full*8];
        for (BLASLONG line = 0; line < 8; line++) {
            _mm512_storeu_si512(&out[32*line], _mm512_maskz_loadu_epi16(tail_mask, &B[line*ldb + k_full]));
        }
    }

#ifdef DEBUG_PROFILE
    print_block(BF16_BLOCK_THRES_N, BF16_BLOCK_THRES_K, block_B);
#endif
}
|
||||
|
||||
// Pack an n x K panel of B into block_B (n lines instead of a fixed 8).
//
// B is read as n lines of k consecutive bfloat16 values, line c starting at
// B[c*ldb]. For every chunk of 32 values along k, the n chunks (one per line)
// are stored back to back in block_B. A final partial chunk is loaded with a
// mask and zero-padded to 32 values.
void COL_MAJOR_ONCOPY_KERNEL_Nx32(BLASLONG n, BLASLONG k, bfloat16 * B, BLASLONG ldb, bfloat16 * block_B)
{
    const BLASLONG k_full = k & (~31);
    BLASLONG out_off = 0;                    // next free slot in block_B

    for (BLASLONG kk = 0; kk < k_full; kk += 32) {
        for (BLASLONG line = 0; line < n; line++) {
            _mm512_storeu_si512(&block_B[out_off], _mm512_loadu_si512(&B[line*ldb + kk]));
            out_off += 32;
        }
    }

    if (k_full != k) {
        // Low (k - k_full) bits set: only the remaining values are loaded.
        const __mmask32 tail_mask = (__mmask32)(((unsigned int)0xffffffff) >> (32-(k-k_full)));
        for (BLASLONG line = 0; line < n; line++) {
            _mm512_storeu_si512(&block_B[out_off], _mm512_maskz_loadu_epi16(tail_mask, &B[line*ldb + k_full]));
            out_off += 32;
        }
    }

#ifdef DEBUG_PROFILE
    print_block(BF16_BLOCK_THRES_N, BF16_BLOCK_THRES_K, block_B);
#endif
}
|
||||
|
||||
// Scale matrix C while beta is not ZERO or ONE
|
||||
void sbgemm_scal_operation(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc)
|
||||
{
|
||||
BLASLONG tag_n_Nx = N & (~3);
|
||||
BLASLONG tag_n_Mx = M & (~15);
|
||||
|
||||
BLASLONG LDC4x = ldc*4;
|
||||
BLASLONG idx_base_0 = 0;
|
||||
BLASLONG idx_base_1 = ldc;
|
||||
BLASLONG idx_base_2 = ldc*2;
|
||||
BLASLONG idx_base_3 = ldc*3;
|
||||
|
||||
unsigned short tail_mask_value = (((unsigned short)0xffff) >> (16-M+tag_n_Mx));
|
||||
__mmask16 tail_mask = *((__mmask16*) &tail_mask_value);
|
||||
|
||||
__m512 array_512_0, array_512_1, array_512_2, array_512_3;
|
||||
|
||||
__m512 BETAVECTOR = _mm512_set1_ps(beta);
|
||||
|
||||
if (Order == CblasColMajor) {
|
||||
for (BLASLONG idx_n = 0; idx_n < tag_n_Nx; idx_n += 4) {
|
||||
for (BLASLONG idx_m = 0; idx_m < tag_n_Mx; idx_m += 16) {
|
||||
array_512_0 = _mm512_loadu_ps(&C[idx_base_0+idx_m]);
|
||||
array_512_1 = _mm512_loadu_ps(&C[idx_base_1+idx_m]);
|
||||
array_512_2 = _mm512_loadu_ps(&C[idx_base_2+idx_m]);
|
||||
array_512_3 = _mm512_loadu_ps(&C[idx_base_3+idx_m]);
|
||||
|
||||
array_512_0 = _mm512_mul_ps(BETAVECTOR, array_512_0);
|
||||
array_512_1 = _mm512_mul_ps(BETAVECTOR, array_512_1);
|
||||
array_512_2 = _mm512_mul_ps(BETAVECTOR, array_512_2);
|
||||
array_512_3 = _mm512_mul_ps(BETAVECTOR, array_512_3);
|
||||
|
||||
_mm512_storeu_ps(&C[idx_base_0+idx_m], array_512_0);
|
||||
_mm512_storeu_ps(&C[idx_base_1+idx_m], array_512_1);
|
||||
_mm512_storeu_ps(&C[idx_base_2+idx_m], array_512_2);
|
||||
_mm512_storeu_ps(&C[idx_base_3+idx_m], array_512_3);
|
||||
}
|
||||
|
||||
if (tag_n_Mx != M) {
|
||||
array_512_0 = _mm512_maskz_loadu_ps(tail_mask, &C[idx_base_0+tag_n_Mx]);
|
||||
array_512_1 = _mm512_maskz_loadu_ps(tail_mask, &C[idx_base_1+tag_n_Mx]);
|
||||
array_512_2 = _mm512_maskz_loadu_ps(tail_mask, &C[idx_base_2+tag_n_Mx]);
|
||||
array_512_3 = _mm512_maskz_loadu_ps(tail_mask, &C[idx_base_3+tag_n_Mx]);
|
||||
|
||||
array_512_0 = _mm512_mul_ps(BETAVECTOR, array_512_0);
|
||||
array_512_1 = _mm512_mul_ps(BETAVECTOR, array_512_1);
|
||||
array_512_2 = _mm512_mul_ps(BETAVECTOR, array_512_2);
|
||||
array_512_3 = _mm512_mul_ps(BETAVECTOR, array_512_3);
|
||||
|
||||
_mm512_mask_storeu_ps(&C[idx_base_0+tag_n_Mx], tail_mask, array_512_0);
|
||||
_mm512_mask_storeu_ps(&C[idx_base_1+tag_n_Mx], tail_mask, array_512_1);
|
||||
_mm512_mask_storeu_ps(&C[idx_base_2+tag_n_Mx], tail_mask, array_512_2);
|
||||
_mm512_mask_storeu_ps(&C[idx_base_3+tag_n_Mx], tail_mask, array_512_3);
|
||||
}
|
||||
|
||||
idx_base_0 += LDC4x;
|
||||
idx_base_1 += LDC4x;
|
||||
idx_base_2 += LDC4x;
|
||||
idx_base_3 += LDC4x;
|
||||
}
|
||||
|
||||
if (tag_n_Nx != N) {
|
||||
for (BLASLONG idx_n = tag_n_Nx; idx_n < N; idx_n++) {
|
||||
for (BLASLONG idx_m = 0; idx_m < tag_n_Mx; idx_m += 16) {
|
||||
array_512_0 = _mm512_loadu_ps(&C[idx_base_0+idx_m]);
|
||||
array_512_0 = _mm512_mul_ps(BETAVECTOR, array_512_0);
|
||||
_mm512_storeu_ps(&C[idx_base_0+idx_m], array_512_0);
|
||||
}
|
||||
|
||||
if (tag_n_Mx != M) {
|
||||
array_512_0 = _mm512_maskz_loadu_ps(tail_mask, &C[idx_base_0+tag_n_Mx]);
|
||||
array_512_0 = _mm512_mul_ps(BETAVECTOR, array_512_0);
|
||||
_mm512_mask_storeu_ps(&C[idx_base_0+tag_n_Mx], tail_mask, array_512_0);
|
||||
}
|
||||
idx_base_0 += ldc;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
// Scale matrix C while beta is not ZERO or ONE
|
||||
void sbgemm_zero_operation(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, float *C, OPENBLAS_CONST blasint ldc)
|
||||
{
|
||||
BLASLONG tag_n_Nx = N & (~3);
|
||||
BLASLONG tag_n_Mx = M & (~15);
|
||||
|
||||
BLASLONG LDC4x = ldc*4;
|
||||
BLASLONG idx_base_0 = 0;
|
||||
BLASLONG idx_base_1 = ldc;
|
||||
BLASLONG idx_base_2 = ldc*2;
|
||||
BLASLONG idx_base_3 = ldc*3;
|
||||
|
||||
unsigned short tail_mask_value = (((unsigned short)0xffff) >> (16-M+tag_n_Mx));
|
||||
__mmask16 tail_mask = *((__mmask16*) &tail_mask_value);
|
||||
|
||||
__m512 ZEROVECTOR = _mm512_setzero_ps();
|
||||
|
||||
if (Order == CblasColMajor) {
|
||||
for (BLASLONG idx_n = 0; idx_n < tag_n_Nx; idx_n += 4) {
|
||||
for (BLASLONG idx_m = 0; idx_m < tag_n_Mx; idx_m += 16) {
|
||||
_mm512_storeu_ps(&C[idx_base_0+idx_m], ZEROVECTOR);
|
||||
_mm512_storeu_ps(&C[idx_base_1+idx_m], ZEROVECTOR);
|
||||
_mm512_storeu_ps(&C[idx_base_2+idx_m], ZEROVECTOR);
|
||||
_mm512_storeu_ps(&C[idx_base_3+idx_m], ZEROVECTOR);
|
||||
}
|
||||
|
||||
if (tag_n_Mx != M) {
|
||||
_mm512_mask_storeu_ps(&C[idx_base_0+tag_n_Mx], tail_mask, ZEROVECTOR);
|
||||
_mm512_mask_storeu_ps(&C[idx_base_1+tag_n_Mx], tail_mask, ZEROVECTOR);
|
||||
_mm512_mask_storeu_ps(&C[idx_base_2+tag_n_Mx], tail_mask, ZEROVECTOR);
|
||||
_mm512_mask_storeu_ps(&C[idx_base_3+tag_n_Mx], tail_mask, ZEROVECTOR);
|
||||
}
|
||||
|
||||
idx_base_0 += LDC4x;
|
||||
idx_base_1 += LDC4x;
|
||||
idx_base_2 += LDC4x;
|
||||
idx_base_3 += LDC4x;
|
||||
}
|
||||
|
||||
if (tag_n_Nx != N) {
|
||||
for (BLASLONG idx_n = tag_n_Nx; idx_n < N; idx_n++) {
|
||||
for (BLASLONG idx_m = 0; idx_m < tag_n_Mx; idx_m += 16) {
|
||||
_mm512_storeu_ps(&C[idx_base_0+idx_m], ZEROVECTOR);
|
||||
}
|
||||
|
||||
if (tag_n_Mx != M) {
|
||||
_mm512_mask_storeu_ps(&C[idx_base_0+tag_n_Mx], tail_mask, ZEROVECTOR);
|
||||
}
|
||||
idx_base_0 += ldc;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
|
||||
}
|
||||
}
|
|
@ -0,0 +1,625 @@
|
|||
#include "sbgemm.h"
|
||||
#include "bf16_common_macros.h"
|
||||
#include <immintrin.h>
|
||||
|
||||
// Drop any earlier definitions of the variant-selection macros before
// redefining them below (presumably so this file can be compiled once per
// ALPHA variant -- confirm against the including file).
#undef STORE16_COMPLETE_RESULT
#undef STORE16_MASK_COMPLETE_RESULT
#undef SBGEMM_BLOCK_KERNEL_32x8x32
#undef SBGEMM_BLOCK_KERNEL_16x8x32
#undef SBGEMM_BLOCK_KERNEL_32xNx32
#undef SBGEMM_BLOCK_KERNEL_16xNx32
#undef SBGEMM_BLOCKING_KERNEL_2

// Map the generic names used in this file onto the variant selected by ONE_ALPHA.
#ifdef ONE_ALPHA       // ALPHA is ONE
#define STORE16_COMPLETE_RESULT       STORE16_COMPLETE_RESULT_ONE_ONE
#define STORE16_MASK_COMPLETE_RESULT  STORE16_MASK_COMPLETE_RESULT_ONE_ONE
#define SBGEMM_BLOCK_KERNEL_32x8x32   sbgemm_block_kernel_32x8x32_one
#define SBGEMM_BLOCK_KERNEL_16x8x32   sbgemm_block_kernel_16x8x32_one
#define SBGEMM_BLOCK_KERNEL_32xNx32   sbgemm_block_kernel_32xNx32_one
#define SBGEMM_BLOCK_KERNEL_16xNx32   sbgemm_block_kernel_16xNx32_one
#define SBGEMM_BLOCKING_KERNEL_2      sbgemm_blocking_kernel_2_one
#else                  // ALPHA is not ONE
#define STORE16_COMPLETE_RESULT       STORE16_COMPLETE_RESULT_ALPHA_ONE
#define STORE16_MASK_COMPLETE_RESULT  STORE16_MASK_COMPLETE_RESULT_ALPHA_ONE
#define SBGEMM_BLOCK_KERNEL_32x8x32   sbgemm_block_kernel_32x8x32_alpha
#define SBGEMM_BLOCK_KERNEL_16x8x32   sbgemm_block_kernel_16x8x32_alpha
#define SBGEMM_BLOCK_KERNEL_32xNx32   sbgemm_block_kernel_32xNx32_alpha
#define SBGEMM_BLOCK_KERNEL_16xNx32   sbgemm_block_kernel_16xNx32_alpha
#define SBGEMM_BLOCKING_KERNEL_2      sbgemm_blocking_kernel_2_alpha
#endif
|
||||
|
||||
|
||||
// SBGEMM Kernel for 16<M<=32, N=8, K can be any number, but the processing will take 32 as a base
|
||||
#ifndef ONE_ALPHA // ALPHA is not ONE
|
||||
void sbgemm_block_kernel_32x8x32_alpha(BLASLONG m, BLASLONG k, float alpha, bfloat16 *A, bfloat16 *B, float *C, int ldc)
|
||||
#else // ALPHA is ONE
|
||||
void sbgemm_block_kernel_32x8x32_one(BLASLONG m, BLASLONG k, float alpha, bfloat16 *A, bfloat16 *B, float *C, int ldc)
|
||||
#endif
|
||||
{
|
||||
int SHUFFLE_MAGIC_NO = 0x39;
|
||||
BLASLONG tag_k_32x = k & (~31);
|
||||
BLASLONG idxA_base = 0;
|
||||
BLASLONG idxB_base = 0;
|
||||
BLASLONG width = 32;
|
||||
|
||||
#ifndef ONE_ALPHA
|
||||
__m512 ALPHAVECTOR = _mm512_set1_ps(alpha);
|
||||
#endif
|
||||
|
||||
__m512i arrayA_512_0, arrayA_512_1;
|
||||
__m512i arrayB_512_0, arrayB_512_1, arrayB_512_2, arrayB_512_3, arrayB_512_4, arrayB_512_5, arrayB_512_6, arrayB_512_7;
|
||||
__m512 result_512_0, result_512_1, result_512_2, result_512_3, result_512_4, result_512_5, result_512_6, result_512_7,
|
||||
result_512_8, result_512_9, result_512_10, result_512_11, result_512_12, result_512_13, result_512_14, result_512_15;
|
||||
__m512 result_512_tmp_0, result_512_tmp_1, result_512_tmp_2, result_512_tmp_3;
|
||||
|
||||
__m512i M512_EPI32_8 = _mm512_set1_epi32(8);
|
||||
__m512i shuffle_idx_base0 = _mm512_set_epi32(23, 22, 21, 20, 7, 6, 5, 4, 19, 18, 17, 16, 3, 2, 1, 0);
|
||||
__m512i shuffle_idx_base1 = _mm512_add_epi32(shuffle_idx_base0, M512_EPI32_8);
|
||||
|
||||
result_512_0 = _mm512_setzero_ps();
|
||||
result_512_1 = _mm512_setzero_ps();
|
||||
result_512_2 = _mm512_setzero_ps();
|
||||
result_512_3 = _mm512_setzero_ps();
|
||||
result_512_4 = _mm512_setzero_ps();
|
||||
result_512_5 = _mm512_setzero_ps();
|
||||
result_512_6 = _mm512_setzero_ps();
|
||||
result_512_7 = _mm512_setzero_ps();
|
||||
result_512_8 = _mm512_setzero_ps();
|
||||
result_512_9 = _mm512_setzero_ps();
|
||||
result_512_10 = _mm512_setzero_ps();
|
||||
result_512_11 = _mm512_setzero_ps();
|
||||
result_512_12 = _mm512_setzero_ps();
|
||||
result_512_13 = _mm512_setzero_ps();
|
||||
result_512_14 = _mm512_setzero_ps();
|
||||
result_512_15 = _mm512_setzero_ps();
|
||||
|
||||
for (BLASLONG idx_k = 0; idx_k < k; idx_k += 32) {
|
||||
// Load B with unroll 8
|
||||
idxB_base = idx_k << 3;
|
||||
arrayB_512_0 = _mm512_loadu_si512(&B[idxB_base + 32*0]);
|
||||
arrayB_512_1 = _mm512_loadu_si512(&B[idxB_base + 32*1]);
|
||||
arrayB_512_2 = _mm512_loadu_si512(&B[idxB_base + 32*2]);
|
||||
arrayB_512_3 = _mm512_loadu_si512(&B[idxB_base + 32*3]);
|
||||
arrayB_512_4 = _mm512_loadu_si512(&B[idxB_base + 32*4]);
|
||||
arrayB_512_5 = _mm512_loadu_si512(&B[idxB_base + 32*5]);
|
||||
arrayB_512_6 = _mm512_loadu_si512(&B[idxB_base + 32*6]);
|
||||
arrayB_512_7 = _mm512_loadu_si512(&B[idxB_base + 32*7]);
|
||||
|
||||
if (idx_k == tag_k_32x) {width = k - tag_k_32x;}
|
||||
|
||||
for (BLASLONG idx = 0; idx < width;) {
|
||||
// Each two rows are a group for 32-pair bf16 elements
|
||||
idxA_base = idx << 5;
|
||||
arrayA_512_0 = _mm512_loadu_si512(&A[idxA_base]);
|
||||
arrayA_512_1 = _mm512_loadu_si512(&A[idxA_base + 32]);
|
||||
|
||||
result_512_0 = _mm512_dpbf16_ps(result_512_0, (__m512bh) arrayA_512_0, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_0)));
|
||||
result_512_1 = _mm512_dpbf16_ps(result_512_1, (__m512bh) arrayA_512_0, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_1)));
|
||||
result_512_2 = _mm512_dpbf16_ps(result_512_2, (__m512bh) arrayA_512_0, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_2)));
|
||||
result_512_3 = _mm512_dpbf16_ps(result_512_3, (__m512bh) arrayA_512_0, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_3)));
|
||||
result_512_4 = _mm512_dpbf16_ps(result_512_4, (__m512bh) arrayA_512_0, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_4)));
|
||||
result_512_5 = _mm512_dpbf16_ps(result_512_5, (__m512bh) arrayA_512_0, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_5)));
|
||||
result_512_6 = _mm512_dpbf16_ps(result_512_6, (__m512bh) arrayA_512_0, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_6)));
|
||||
result_512_7 = _mm512_dpbf16_ps(result_512_7, (__m512bh) arrayA_512_0, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_7)));
|
||||
result_512_8 = _mm512_dpbf16_ps(result_512_8, (__m512bh) arrayA_512_1, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_0)));
|
||||
result_512_9 = _mm512_dpbf16_ps(result_512_9, (__m512bh) arrayA_512_1, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_1)));
|
||||
result_512_10 = _mm512_dpbf16_ps(result_512_10, (__m512bh) arrayA_512_1, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_2)));
|
||||
result_512_11 = _mm512_dpbf16_ps(result_512_11, (__m512bh) arrayA_512_1, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_3)));
|
||||
result_512_12 = _mm512_dpbf16_ps(result_512_12, (__m512bh) arrayA_512_1, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_4)));
|
||||
result_512_13 = _mm512_dpbf16_ps(result_512_13, (__m512bh) arrayA_512_1, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_5)));
|
||||
result_512_14 = _mm512_dpbf16_ps(result_512_14, (__m512bh) arrayA_512_1, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_6)));
|
||||
result_512_15 = _mm512_dpbf16_ps(result_512_15, (__m512bh) arrayA_512_1, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_7)));
|
||||
|
||||
arrayB_512_0 = _mm512_shuffle_epi32(arrayB_512_0, SHUFFLE_MAGIC_NO);
|
||||
arrayB_512_1 = _mm512_shuffle_epi32(arrayB_512_1, SHUFFLE_MAGIC_NO);
|
||||
arrayB_512_2 = _mm512_shuffle_epi32(arrayB_512_2, SHUFFLE_MAGIC_NO);
|
||||
arrayB_512_3 = _mm512_shuffle_epi32(arrayB_512_3, SHUFFLE_MAGIC_NO);
|
||||
arrayB_512_4 = _mm512_shuffle_epi32(arrayB_512_4, SHUFFLE_MAGIC_NO);
|
||||
arrayB_512_5 = _mm512_shuffle_epi32(arrayB_512_5, SHUFFLE_MAGIC_NO);
|
||||
arrayB_512_6 = _mm512_shuffle_epi32(arrayB_512_6, SHUFFLE_MAGIC_NO);
|
||||
arrayB_512_7 = _mm512_shuffle_epi32(arrayB_512_7, SHUFFLE_MAGIC_NO);
|
||||
|
||||
idx += 2;
|
||||
// Every 4 loops we need to switch to next 128 bits of arrayB registers
|
||||
if ((idx & (~7)) == idx) {
|
||||
arrayB_512_0 = _mm512_shuffle_i32x4(arrayB_512_0, arrayB_512_0, SHUFFLE_MAGIC_NO);
|
||||
arrayB_512_1 = _mm512_shuffle_i32x4(arrayB_512_1, arrayB_512_1, SHUFFLE_MAGIC_NO);
|
||||
arrayB_512_2 = _mm512_shuffle_i32x4(arrayB_512_2, arrayB_512_2, SHUFFLE_MAGIC_NO);
|
||||
arrayB_512_3 = _mm512_shuffle_i32x4(arrayB_512_3, arrayB_512_3, SHUFFLE_MAGIC_NO);
|
||||
arrayB_512_4 = _mm512_shuffle_i32x4(arrayB_512_4, arrayB_512_4, SHUFFLE_MAGIC_NO);
|
||||
arrayB_512_5 = _mm512_shuffle_i32x4(arrayB_512_5, arrayB_512_5, SHUFFLE_MAGIC_NO);
|
||||
arrayB_512_6 = _mm512_shuffle_i32x4(arrayB_512_6, arrayB_512_6, SHUFFLE_MAGIC_NO);
|
||||
arrayB_512_7 = _mm512_shuffle_i32x4(arrayB_512_7, arrayB_512_7, SHUFFLE_MAGIC_NO);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (m != 32) {
|
||||
unsigned short tail_mask_value = (((unsigned short)0xffff) >> (32-m));
|
||||
__mmask16 tail_mask = *((__mmask16*) &tail_mask_value);
|
||||
result_512_tmp_0 = _mm512_permutex2var_ps(result_512_0, shuffle_idx_base0, result_512_8);
|
||||
result_512_tmp_1 = _mm512_permutex2var_ps(result_512_0, shuffle_idx_base1, result_512_8);
|
||||
result_512_tmp_2 = _mm512_permutex2var_ps(result_512_1, shuffle_idx_base0, result_512_9);
|
||||
result_512_tmp_3 = _mm512_permutex2var_ps(result_512_1, shuffle_idx_base1, result_512_9);
|
||||
STORE16_COMPLETE_RESULT(result_512_tmp_0, (&C[ldc*0]))
|
||||
STORE16_MASK_COMPLETE_RESULT(result_512_tmp_1, (&C[ldc*0+16]), tail_mask)
|
||||
STORE16_COMPLETE_RESULT(result_512_tmp_2, (&C[ldc*1]))
|
||||
STORE16_MASK_COMPLETE_RESULT(result_512_tmp_3, (&C[ldc*1+16]), tail_mask)
|
||||
result_512_tmp_0 = _mm512_permutex2var_ps(result_512_2, shuffle_idx_base0, result_512_10);
|
||||
result_512_tmp_1 = _mm512_permutex2var_ps(result_512_2, shuffle_idx_base1, result_512_10);
|
||||
result_512_tmp_2 = _mm512_permutex2var_ps(result_512_3, shuffle_idx_base0, result_512_11);
|
||||
result_512_tmp_3 = _mm512_permutex2var_ps(result_512_3, shuffle_idx_base1, result_512_11);
|
||||
STORE16_COMPLETE_RESULT(result_512_tmp_0, (&C[ldc*2]))
|
||||
STORE16_MASK_COMPLETE_RESULT(result_512_tmp_1, (&C[ldc*2+16]), tail_mask)
|
||||
STORE16_COMPLETE_RESULT(result_512_tmp_2, (&C[ldc*3]))
|
||||
STORE16_MASK_COMPLETE_RESULT(result_512_tmp_3, (&C[ldc*3+16]), tail_mask)
|
||||
result_512_tmp_0 = _mm512_permutex2var_ps(result_512_4, shuffle_idx_base0, result_512_12);
|
||||
result_512_tmp_1 = _mm512_permutex2var_ps(result_512_4, shuffle_idx_base1, result_512_12);
|
||||
result_512_tmp_2 = _mm512_permutex2var_ps(result_512_5, shuffle_idx_base0, result_512_13);
|
||||
result_512_tmp_3 = _mm512_permutex2var_ps(result_512_5, shuffle_idx_base1, result_512_13);
|
||||
STORE16_COMPLETE_RESULT(result_512_tmp_0, (&C[ldc*4]))
|
||||
STORE16_MASK_COMPLETE_RESULT(result_512_tmp_1, (&C[ldc*4+16]), tail_mask)
|
||||
STORE16_COMPLETE_RESULT(result_512_tmp_2, (&C[ldc*5]))
|
||||
STORE16_MASK_COMPLETE_RESULT(result_512_tmp_3, (&C[ldc*5+16]), tail_mask)
|
||||
result_512_tmp_0 = _mm512_permutex2var_ps(result_512_6, shuffle_idx_base0, result_512_14);
|
||||
result_512_tmp_1 = _mm512_permutex2var_ps(result_512_6, shuffle_idx_base1, result_512_14);
|
||||
result_512_tmp_2 = _mm512_permutex2var_ps(result_512_7, shuffle_idx_base0, result_512_15);
|
||||
result_512_tmp_3 = _mm512_permutex2var_ps(result_512_7, shuffle_idx_base1, result_512_15);
|
||||
STORE16_COMPLETE_RESULT(result_512_tmp_0, (&C[ldc*6]))
|
||||
STORE16_MASK_COMPLETE_RESULT(result_512_tmp_1, (&C[ldc*6+16]), tail_mask)
|
||||
STORE16_COMPLETE_RESULT(result_512_tmp_2, (&C[ldc*7]))
|
||||
STORE16_MASK_COMPLETE_RESULT(result_512_tmp_3, (&C[ldc*7+16]), tail_mask)
|
||||
} else {
|
||||
result_512_tmp_0 = _mm512_permutex2var_ps(result_512_0, shuffle_idx_base0, result_512_8);
|
||||
result_512_tmp_1 = _mm512_permutex2var_ps(result_512_0, shuffle_idx_base1, result_512_8);
|
||||
result_512_tmp_2 = _mm512_permutex2var_ps(result_512_1, shuffle_idx_base0, result_512_9);
|
||||
result_512_tmp_3 = _mm512_permutex2var_ps(result_512_1, shuffle_idx_base1, result_512_9);
|
||||
STORE16_COMPLETE_RESULT(result_512_tmp_0, (&C[ldc*0]))
|
||||
STORE16_COMPLETE_RESULT(result_512_tmp_1, (&C[ldc*0+16]))
|
||||
STORE16_COMPLETE_RESULT(result_512_tmp_2, (&C[ldc*1]))
|
||||
STORE16_COMPLETE_RESULT(result_512_tmp_3, (&C[ldc*1+16]))
|
||||
result_512_tmp_0 = _mm512_permutex2var_ps(result_512_2, shuffle_idx_base0, result_512_10);
|
||||
result_512_tmp_1 = _mm512_permutex2var_ps(result_512_2, shuffle_idx_base1, result_512_10);
|
||||
result_512_tmp_2 = _mm512_permutex2var_ps(result_512_3, shuffle_idx_base0, result_512_11);
|
||||
result_512_tmp_3 = _mm512_permutex2var_ps(result_512_3, shuffle_idx_base1, result_512_11);
|
||||
STORE16_COMPLETE_RESULT(result_512_tmp_0, (&C[ldc*2]))
|
||||
STORE16_COMPLETE_RESULT(result_512_tmp_1, (&C[ldc*2+16]))
|
||||
STORE16_COMPLETE_RESULT(result_512_tmp_2, (&C[ldc*3]))
|
||||
STORE16_COMPLETE_RESULT(result_512_tmp_3, (&C[ldc*3+16]))
|
||||
result_512_tmp_0 = _mm512_permutex2var_ps(result_512_4, shuffle_idx_base0, result_512_12);
|
||||
result_512_tmp_1 = _mm512_permutex2var_ps(result_512_4, shuffle_idx_base1, result_512_12);
|
||||
result_512_tmp_2 = _mm512_permutex2var_ps(result_512_5, shuffle_idx_base0, result_512_13);
|
||||
result_512_tmp_3 = _mm512_permutex2var_ps(result_512_5, shuffle_idx_base1, result_512_13);
|
||||
STORE16_COMPLETE_RESULT(result_512_tmp_0, (&C[ldc*4]))
|
||||
STORE16_COMPLETE_RESULT(result_512_tmp_1, (&C[ldc*4+16]))
|
||||
STORE16_COMPLETE_RESULT(result_512_tmp_2, (&C[ldc*5]))
|
||||
STORE16_COMPLETE_RESULT(result_512_tmp_3, (&C[ldc*5+16]))
|
||||
result_512_tmp_0 = _mm512_permutex2var_ps(result_512_6, shuffle_idx_base0, result_512_14);
|
||||
result_512_tmp_1 = _mm512_permutex2var_ps(result_512_6, shuffle_idx_base1, result_512_14);
|
||||
result_512_tmp_2 = _mm512_permutex2var_ps(result_512_7, shuffle_idx_base0, result_512_15);
|
||||
result_512_tmp_3 = _mm512_permutex2var_ps(result_512_7, shuffle_idx_base1, result_512_15);
|
||||
STORE16_COMPLETE_RESULT(result_512_tmp_0, (&C[ldc*6]))
|
||||
STORE16_COMPLETE_RESULT(result_512_tmp_1, (&C[ldc*6+16]))
|
||||
STORE16_COMPLETE_RESULT(result_512_tmp_2, (&C[ldc*7]))
|
||||
STORE16_COMPLETE_RESULT(result_512_tmp_3, (&C[ldc*7+16]))
|
||||
}
|
||||
}
|
||||
|
||||
// SBGEMM Kernel for M<=16, N=8, K can be any number, but the processing will take 32 as a base
|
||||
#ifndef ONE_ALPHA // ALPHA is not ONE
|
||||
void sbgemm_block_kernel_16x8x32_alpha(BLASLONG m, BLASLONG k, float alpha, bfloat16 *A, bfloat16 *B, float *C, int ldc)
|
||||
#else // ALPHA is ONE
|
||||
void sbgemm_block_kernel_16x8x32_one(BLASLONG m, BLASLONG k, float alpha, bfloat16 *A, bfloat16 *B, float *C, int ldc)
|
||||
#endif
|
||||
{
|
||||
int SHUFFLE_MAGIC_NO = 0x39;
|
||||
BLASLONG tag_k_32x = k & (~31);
|
||||
BLASLONG idxB_base = 0;
|
||||
BLASLONG width = 32;
|
||||
|
||||
#ifndef ONE_ALPHA
|
||||
__m512 ALPHAVECTOR = _mm512_set1_ps(alpha);
|
||||
#endif
|
||||
|
||||
__m512i arrayA_512_0;
|
||||
__m512i arrayB_512_0, arrayB_512_1, arrayB_512_2, arrayB_512_3, arrayB_512_4, arrayB_512_5, arrayB_512_6, arrayB_512_7;
|
||||
__m512 result_512_0, result_512_1, result_512_2, result_512_3, result_512_4, result_512_5, result_512_6, result_512_7;
|
||||
|
||||
result_512_0 = _mm512_setzero_ps();
|
||||
result_512_1 = _mm512_setzero_ps();
|
||||
result_512_2 = _mm512_setzero_ps();
|
||||
result_512_3 = _mm512_setzero_ps();
|
||||
result_512_4 = _mm512_setzero_ps();
|
||||
result_512_5 = _mm512_setzero_ps();
|
||||
result_512_6 = _mm512_setzero_ps();
|
||||
result_512_7 = _mm512_setzero_ps();
|
||||
|
||||
for (BLASLONG idx_k = 0; idx_k < k; idx_k += 32) {
|
||||
// Load B with unroll 8
|
||||
idxB_base = idx_k << 3;
|
||||
arrayB_512_0 = _mm512_loadu_si512(&B[idxB_base + 32*0]);
|
||||
arrayB_512_1 = _mm512_loadu_si512(&B[idxB_base + 32*1]);
|
||||
arrayB_512_2 = _mm512_loadu_si512(&B[idxB_base + 32*2]);
|
||||
arrayB_512_3 = _mm512_loadu_si512(&B[idxB_base + 32*3]);
|
||||
arrayB_512_4 = _mm512_loadu_si512(&B[idxB_base + 32*4]);
|
||||
arrayB_512_5 = _mm512_loadu_si512(&B[idxB_base + 32*5]);
|
||||
arrayB_512_6 = _mm512_loadu_si512(&B[idxB_base + 32*6]);
|
||||
arrayB_512_7 = _mm512_loadu_si512(&B[idxB_base + 32*7]);
|
||||
|
||||
if (idx_k == tag_k_32x) {width = k - tag_k_32x;}
|
||||
|
||||
for (BLASLONG idx = 0; idx < width;) {
|
||||
// Each two rows are a group for 32-pair bf16 elements
|
||||
// Load two rows into a 512 register
|
||||
arrayA_512_0 = _mm512_loadu_si512(&A[idx<<4]);
|
||||
|
||||
result_512_0 = _mm512_dpbf16_ps(result_512_0, (__m512bh) arrayA_512_0, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_0)));
|
||||
result_512_1 = _mm512_dpbf16_ps(result_512_1, (__m512bh) arrayA_512_0, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_1)));
|
||||
result_512_2 = _mm512_dpbf16_ps(result_512_2, (__m512bh) arrayA_512_0, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_2)));
|
||||
result_512_3 = _mm512_dpbf16_ps(result_512_3, (__m512bh) arrayA_512_0, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_3)));
|
||||
result_512_4 = _mm512_dpbf16_ps(result_512_4, (__m512bh) arrayA_512_0, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_4)));
|
||||
result_512_5 = _mm512_dpbf16_ps(result_512_5, (__m512bh) arrayA_512_0, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_5)));
|
||||
result_512_6 = _mm512_dpbf16_ps(result_512_6, (__m512bh) arrayA_512_0, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_6)));
|
||||
result_512_7 = _mm512_dpbf16_ps(result_512_7, (__m512bh) arrayA_512_0, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512_7)));
|
||||
|
||||
arrayB_512_0 = _mm512_shuffle_epi32(arrayB_512_0, SHUFFLE_MAGIC_NO);
|
||||
arrayB_512_1 = _mm512_shuffle_epi32(arrayB_512_1, SHUFFLE_MAGIC_NO);
|
||||
arrayB_512_2 = _mm512_shuffle_epi32(arrayB_512_2, SHUFFLE_MAGIC_NO);
|
||||
arrayB_512_3 = _mm512_shuffle_epi32(arrayB_512_3, SHUFFLE_MAGIC_NO);
|
||||
arrayB_512_4 = _mm512_shuffle_epi32(arrayB_512_4, SHUFFLE_MAGIC_NO);
|
||||
arrayB_512_5 = _mm512_shuffle_epi32(arrayB_512_5, SHUFFLE_MAGIC_NO);
|
||||
arrayB_512_6 = _mm512_shuffle_epi32(arrayB_512_6, SHUFFLE_MAGIC_NO);
|
||||
arrayB_512_7 = _mm512_shuffle_epi32(arrayB_512_7, SHUFFLE_MAGIC_NO);
|
||||
|
||||
idx += 2;
|
||||
// Every 4 loops we need to switch to next 128 bits of arrayB registers
|
||||
if ((idx & (~7)) == idx) {
|
||||
arrayB_512_0 = _mm512_shuffle_i32x4(arrayB_512_0, arrayB_512_0, SHUFFLE_MAGIC_NO);
|
||||
arrayB_512_1 = _mm512_shuffle_i32x4(arrayB_512_1, arrayB_512_1, SHUFFLE_MAGIC_NO);
|
||||
arrayB_512_2 = _mm512_shuffle_i32x4(arrayB_512_2, arrayB_512_2, SHUFFLE_MAGIC_NO);
|
||||
arrayB_512_3 = _mm512_shuffle_i32x4(arrayB_512_3, arrayB_512_3, SHUFFLE_MAGIC_NO);
|
||||
arrayB_512_4 = _mm512_shuffle_i32x4(arrayB_512_4, arrayB_512_4, SHUFFLE_MAGIC_NO);
|
||||
arrayB_512_5 = _mm512_shuffle_i32x4(arrayB_512_5, arrayB_512_5, SHUFFLE_MAGIC_NO);
|
||||
arrayB_512_6 = _mm512_shuffle_i32x4(arrayB_512_6, arrayB_512_6, SHUFFLE_MAGIC_NO);
|
||||
arrayB_512_7 = _mm512_shuffle_i32x4(arrayB_512_7, arrayB_512_7, SHUFFLE_MAGIC_NO);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (m != 16) {
|
||||
unsigned short tail_mask_value = (((unsigned short)0xffff) >> (16-m));
|
||||
__mmask16 tail_mask = *((__mmask16*) &tail_mask_value);
|
||||
|
||||
result_512_0 = _mm512_shuffle_f32x4(result_512_0, result_512_0, 0xd8);
|
||||
result_512_1 = _mm512_shuffle_f32x4(result_512_1, result_512_1, 0xd8);
|
||||
result_512_2 = _mm512_shuffle_f32x4(result_512_2, result_512_2, 0xd8);
|
||||
result_512_3 = _mm512_shuffle_f32x4(result_512_3, result_512_3, 0xd8);
|
||||
STORE16_MASK_COMPLETE_RESULT(result_512_0, (&C[ldc*0]), tail_mask)
|
||||
STORE16_MASK_COMPLETE_RESULT(result_512_1, (&C[ldc*1]), tail_mask)
|
||||
STORE16_MASK_COMPLETE_RESULT(result_512_2, (&C[ldc*2]), tail_mask)
|
||||
STORE16_MASK_COMPLETE_RESULT(result_512_3, (&C[ldc*3]), tail_mask)
|
||||
result_512_4 = _mm512_shuffle_f32x4(result_512_4, result_512_4, 0xd8);
|
||||
result_512_5 = _mm512_shuffle_f32x4(result_512_5, result_512_5, 0xd8);
|
||||
result_512_6 = _mm512_shuffle_f32x4(result_512_6, result_512_6, 0xd8);
|
||||
result_512_7 = _mm512_shuffle_f32x4(result_512_7, result_512_7, 0xd8);
|
||||
STORE16_MASK_COMPLETE_RESULT(result_512_4, (&C[ldc*4]), tail_mask)
|
||||
STORE16_MASK_COMPLETE_RESULT(result_512_5, (&C[ldc*5]), tail_mask)
|
||||
STORE16_MASK_COMPLETE_RESULT(result_512_6, (&C[ldc*6]), tail_mask)
|
||||
STORE16_MASK_COMPLETE_RESULT(result_512_7, (&C[ldc*7]), tail_mask)
|
||||
} else {
|
||||
result_512_0 = _mm512_shuffle_f32x4(result_512_0, result_512_0, 0xd8);
|
||||
result_512_1 = _mm512_shuffle_f32x4(result_512_1, result_512_1, 0xd8);
|
||||
result_512_2 = _mm512_shuffle_f32x4(result_512_2, result_512_2, 0xd8);
|
||||
result_512_3 = _mm512_shuffle_f32x4(result_512_3, result_512_3, 0xd8);
|
||||
STORE16_COMPLETE_RESULT(result_512_0, (&C[ldc*0]))
|
||||
STORE16_COMPLETE_RESULT(result_512_1, (&C[ldc*1]))
|
||||
STORE16_COMPLETE_RESULT(result_512_2, (&C[ldc*2]))
|
||||
STORE16_COMPLETE_RESULT(result_512_3, (&C[ldc*3]))
|
||||
result_512_4 = _mm512_shuffle_f32x4(result_512_4, result_512_4, 0xd8);
|
||||
result_512_5 = _mm512_shuffle_f32x4(result_512_5, result_512_5, 0xd8);
|
||||
result_512_6 = _mm512_shuffle_f32x4(result_512_6, result_512_6, 0xd8);
|
||||
result_512_7 = _mm512_shuffle_f32x4(result_512_7, result_512_7, 0xd8);
|
||||
STORE16_COMPLETE_RESULT(result_512_4, (&C[ldc*4]))
|
||||
STORE16_COMPLETE_RESULT(result_512_5, (&C[ldc*5]))
|
||||
STORE16_COMPLETE_RESULT(result_512_6, (&C[ldc*6]))
|
||||
STORE16_COMPLETE_RESULT(result_512_7, (&C[ldc*7]))
|
||||
}
|
||||
}
|
||||
|
||||
// SBGEMM Kernel for 16<M<=32, N<8, K can be any number, but the processing will take 32 as a base
|
||||
#ifndef ONE_ALPHA // ALPHA is not ONE
|
||||
void sbgemm_block_kernel_32xNx32_alpha(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, bfloat16 *A, bfloat16 *B, float *C, int ldc)
|
||||
#else // ALPHA is ONE
|
||||
void sbgemm_block_kernel_32xNx32_one(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, bfloat16 *A, bfloat16 *B, float *C, int ldc)
|
||||
#endif
|
||||
{
|
||||
int SHUFFLE_MAGIC_NO = 0x39;
|
||||
BLASLONG tag_k_32x = k & (~31);
|
||||
BLASLONG idxA_base = 0;
|
||||
BLASLONG idxB_base = 0;
|
||||
BLASLONG width = 32;
|
||||
|
||||
#ifndef ONE_ALPHA
|
||||
__m512 ALPHAVECTOR = _mm512_set1_ps(alpha);
|
||||
#endif
|
||||
|
||||
__m512i arrayA_512[2];
|
||||
__m512i arrayB_512[8];
|
||||
__m512 result_512[16];
|
||||
__m512 result_512_tmp_0, result_512_tmp_1;
|
||||
|
||||
__m512i M512_EPI32_8 = _mm512_set1_epi32(8);
|
||||
__m512i shuffle_idx_base0 = _mm512_set_epi32(23, 22, 21, 20, 7, 6, 5, 4, 19, 18, 17, 16, 3, 2, 1, 0);
|
||||
__m512i shuffle_idx_base1 = _mm512_add_epi32(shuffle_idx_base0, M512_EPI32_8);
|
||||
|
||||
for (int i = 0; i < 15; i += 2) {
|
||||
result_512[i] = _mm512_setzero_ps();
|
||||
result_512[i+1] = _mm512_setzero_ps();
|
||||
}
|
||||
|
||||
for (BLASLONG idx_k = 0; idx_k < k; idx_k += 32) {
|
||||
// Load B with unroll n
|
||||
for (int i = 0; i < n; i ++) {
|
||||
arrayB_512[i] = _mm512_loadu_si512(&B[idxB_base]);
|
||||
idxB_base += 32;
|
||||
}
|
||||
|
||||
if (idx_k == tag_k_32x) {width = k - tag_k_32x;}
|
||||
|
||||
for (BLASLONG idx = 0; idx < width;) {
|
||||
// Each two rows are a group for 32-pair bf16 elements
|
||||
idxA_base = idx << 5;
|
||||
arrayA_512[0] = _mm512_loadu_si512(&A[idxA_base]);
|
||||
arrayA_512[1] = _mm512_loadu_si512(&A[idxA_base + 32]);
|
||||
|
||||
for (int i = 0; i < n; i++) {
|
||||
result_512[i] = _mm512_dpbf16_ps(result_512[i] , (__m512bh) arrayA_512[0], (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512[i])));
|
||||
result_512[i+8] = _mm512_dpbf16_ps(result_512[i+8], (__m512bh) arrayA_512[1], (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512[i])));
|
||||
arrayB_512[i] = _mm512_shuffle_epi32(arrayB_512[i], SHUFFLE_MAGIC_NO);
|
||||
}
|
||||
|
||||
idx += 2;
|
||||
// Every 4 loops we need to switch to next 128 bits of arrayB registers
|
||||
if ((idx & (~7)) == idx) {
|
||||
for (int i = 0; i < n; i++) {
|
||||
arrayB_512[i] = _mm512_shuffle_i32x4(arrayB_512[i], arrayB_512[i], SHUFFLE_MAGIC_NO);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (m != 32) {
|
||||
unsigned short tail_mask_value = (((unsigned short)0xffff) >> (32-m));
|
||||
__mmask16 tail_mask = *((__mmask16*) &tail_mask_value);
|
||||
for (int i = 0; i < n; i++) {
|
||||
result_512_tmp_0 = _mm512_permutex2var_ps(result_512[i], shuffle_idx_base0, result_512[i+8]);
|
||||
result_512_tmp_1 = _mm512_permutex2var_ps(result_512[i], shuffle_idx_base1, result_512[i+8]);
|
||||
STORE16_COMPLETE_RESULT(result_512_tmp_0, (&C[ldc*i]))
|
||||
STORE16_MASK_COMPLETE_RESULT(result_512_tmp_1, (&C[ldc*i+16]), tail_mask)
|
||||
}
|
||||
} else {
|
||||
for (int i = 0; i < n; i++) {
|
||||
result_512_tmp_0 = _mm512_permutex2var_ps(result_512[i], shuffle_idx_base0, result_512[i+8]);
|
||||
result_512_tmp_1 = _mm512_permutex2var_ps(result_512[i], shuffle_idx_base1, result_512[i+8]);
|
||||
STORE16_COMPLETE_RESULT(result_512_tmp_0, (&C[ldc*i]))
|
||||
STORE16_COMPLETE_RESULT(result_512_tmp_1, (&C[ldc*i+16]))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// SBGEMM Kernel for 16<=M, N<8, K can be any number, but the processing will take 32 as a base
|
||||
#ifndef ONE_ALPHA // ALPHA is not ONE
|
||||
void sbgemm_block_kernel_16xNx32_alpha(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, bfloat16 *A, bfloat16 *B, float *C, int ldc)
|
||||
#else // ALPHA is ONE
|
||||
void sbgemm_block_kernel_16xNx32_one(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, bfloat16 *A, bfloat16 *B, float *C, int ldc)
|
||||
#endif
|
||||
{
|
||||
int SHUFFLE_MAGIC_NO = 0x39;
|
||||
BLASLONG tag_k_32x = k & (~31);
|
||||
BLASLONG idxB_base = 0;
|
||||
BLASLONG width = 32;
|
||||
|
||||
#ifndef ONE_ALPHA
|
||||
__m512 ALPHAVECTOR = _mm512_set1_ps(alpha);
|
||||
#endif
|
||||
|
||||
__m512i arrayA_512;
|
||||
__m512i arrayB_512[8];
|
||||
__m512 result_512[8];
|
||||
|
||||
for (int i = 0; i < 8; i += 2) {
|
||||
result_512[i] = _mm512_setzero_ps();
|
||||
result_512[i+1] = _mm512_setzero_ps();
|
||||
}
|
||||
|
||||
for (BLASLONG idx_k = 0; idx_k < k; idx_k += 32) {
|
||||
// Load B with unroll n
|
||||
for (int i = 0; i < n; i ++) {
|
||||
arrayB_512[i] = _mm512_loadu_si512(&B[idxB_base]);
|
||||
idxB_base += 32;
|
||||
}
|
||||
|
||||
if (idx_k == tag_k_32x) {width = k - tag_k_32x;}
|
||||
|
||||
for (BLASLONG idx = 0; idx < width;) {
|
||||
// Each two rows are a group for 32-pair bf16 elements
|
||||
// Load two rows into a 512 register
|
||||
arrayA_512 = _mm512_loadu_si512(&A[idx<<4]);
|
||||
|
||||
for (int i = 0; i < n; i ++) {
|
||||
result_512[i] = _mm512_dpbf16_ps(result_512[i], (__m512bh) arrayA_512, (__m512bh) _mm512_broadcastd_epi32(_mm512_castsi512_si128(arrayB_512[i])));
|
||||
arrayB_512[i] = _mm512_shuffle_epi32(arrayB_512[i], SHUFFLE_MAGIC_NO);
|
||||
}
|
||||
|
||||
idx += 2;
|
||||
// Every 4 loops we need to switch to next 128 bits of arrayB registers
|
||||
if ((idx & (~7)) == idx) {
|
||||
for (int i = 0; i < n; i++) {
|
||||
arrayB_512[i] = _mm512_shuffle_i32x4(arrayB_512[i], arrayB_512[i], SHUFFLE_MAGIC_NO);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (m != 16) {
|
||||
unsigned short tail_mask_value = (((unsigned short)0xffff) >> (16-m));
|
||||
__mmask16 tail_mask = *((__mmask16*) &tail_mask_value);
|
||||
for (int i = 0; i < n; i++) {
|
||||
result_512[i] = _mm512_shuffle_f32x4(result_512[i], result_512[i], 0xd8);
|
||||
STORE16_MASK_COMPLETE_RESULT(result_512[i], (&C[ldc*i]), tail_mask)
|
||||
}
|
||||
} else {
|
||||
for (int i = 0; i < n; i++) {
|
||||
result_512[i] = _mm512_shuffle_f32x4(result_512[i], result_512[i], 0xd8);
|
||||
STORE16_COMPLETE_RESULT(result_512[i], (&C[ldc*i]))
|
||||
}
|
||||
}
|
||||
}
|
||||
// Blocked SBGEMM driver (bfloat16 inputs A and B, float output C).
// Compiled in two variants: the function is named ..._alpha unless ONE_ALPHA
// is defined, in which case it is named ..._one.
//
// Loop structure as written below:
//   - N is walked in panels [n_from, n_to) of at most BF16_BLOCK_THRES_N columns;
//   - inside a panel, K is walked in chunks of at most BF16_BLOCK_THRES_K;
//   - inside a chunk, M is walked in blocks of BF16_BLOCK_THRES_M plus a tail.
// block_A and block_B are scratch buffers supplied by the caller; they are
// handed to the INCOPY/ONCOPY kernels and then to the SBGEMM_BLOCK_KERNEL_*
// kernels.
// NOTE(review): A(), B(), C() and all *_KERNEL_* names are defined elsewhere;
// their index order and accumulation semantics are not visible here - confirm
// before relying on them.
#ifndef ONE_ALPHA // ALPHA is not ONE
|
||||
void sbgemm_blocking_kernel_2_alpha(blasint M, blasint N, blasint K, float alpha, bfloat16 *A, blasint lda, bfloat16 *B, blasint ldb, float *C, blasint ldc, bfloat16 * block_A, bfloat16 * block_B)
|
||||
#else // ALPHA is ONE
|
||||
void sbgemm_blocking_kernel_2_one(blasint M, blasint N, blasint K, float alpha, bfloat16 *A, blasint lda, bfloat16 *B, blasint ldb, float *C, blasint ldc, bfloat16 * block_A, bfloat16 * block_B)
|
||||
#endif
|
||||
{
|
||||
BLASLONG m_step, n_step, k_step, k_step_round32;
|
||||
// M with its low bits masked off; equals M rounded down to a multiple of
// BF16_BLOCK_THRES_M provided that threshold is a power of two.
BLASLONG tag_m_Nx = M & (~(BF16_BLOCK_THRES_M-1));
|
||||
|
||||
BLASLONG n_from, n_to;
|
||||
BLASLONG tag_n_Nx;
|
||||
|
||||
// First N panel: [0, min(N, BF16_BLOCK_THRES_N)).
n_from = 0;
|
||||
n_to = (BF16_BLOCK_THRES_N > N) ? N : BF16_BLOCK_THRES_N;
|
||||
// End of the part of the panel covered by whole BF16_BLOCK_STEP_N steps
// (same power-of-two masking assumption as tag_m_Nx).
tag_n_Nx = n_to & (~(BF16_BLOCK_STEP_N-1));
|
||||
|
||||
// Size of the first K chunk, and that size rounded up to a multiple of 32;
// the rounded value is the per-column stride used when indexing block_B.
k_step = (K > BF16_BLOCK_THRES_K) ? BF16_BLOCK_THRES_K : K;
|
||||
k_step_round32 = k_step & (~31);
|
||||
k_step_round32 = (k_step > k_step_round32) ? (k_step_round32 + 32) : k_step_round32;
|
||||
|
||||
// Case 1: at least one full M block exists.
if (M >= BF16_BLOCK_THRES_M) {
|
||||
while (n_from < N) {
|
||||
for (BLASLONG idx_k = 0; idx_k < K;) {
|
||||
// Use Kx32 kernel when BF16_BLOCK_THRES_M==32, Kx16 kernel when BF16_BLOCK_THRES_M==16, ...
|
||||
COL_MAJOR_INCOPY_KERNEL_Kx32(k_step, &A(idx_k, 0), lda, block_A);
|
||||
// TODO: MT
|
||||
// First M block: the B panel is copied into block_B here, step by step,
// right before each kernel call. The later M blocks of this K chunk call
// the kernels with the same block_B offsets and perform no further B copy.
for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) {
|
||||
// Use 8x32 kernel when BF16_BLOCK_THRES_N==8, 4x32 kernel when BF16_BLOCK_THRES_N==4, ...
|
||||
COL_MAJOR_ONCOPY_KERNEL_8x32(k_step, &B(idx_n, idx_k), ldb, block_B + (idx_n-n_from)*k_step_round32);
|
||||
SBGEMM_BLOCK_KERNEL_32x8x32(32, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, 0), ldc);
|
||||
}
|
||||
|
||||
// Leftover columns of the panel (fewer than BF16_BLOCK_STEP_N).
if (tag_n_Nx != n_to) {
|
||||
n_step = n_to - tag_n_Nx;
|
||||
COL_MAJOR_ONCOPY_KERNEL_Nx32(n_step, k_step, &B(tag_n_Nx, idx_k), ldb, block_B + (tag_n_Nx-n_from)*k_step_round32);
|
||||
SBGEMM_BLOCK_KERNEL_32xNx32(32, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, 0), ldc);
|
||||
}
|
||||
|
||||
// Remaining full M blocks: copy A for the block, then run the kernels
// against the block_B contents filled in above.
for (BLASLONG idx_m = BF16_BLOCK_THRES_M; idx_m < tag_m_Nx; idx_m += BF16_BLOCK_THRES_M) {
|
||||
COL_MAJOR_INCOPY_KERNEL_Kx32(k_step, &A(idx_k, idx_m), lda, block_A);
|
||||
for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) {
|
||||
SBGEMM_BLOCK_KERNEL_32x8x32(32, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, idx_m), ldc);
|
||||
}
|
||||
|
||||
if (tag_n_Nx != n_to) {
|
||||
n_step = n_to - tag_n_Nx;
|
||||
SBGEMM_BLOCK_KERNEL_32xNx32(32, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, idx_m), ldc);
|
||||
}
|
||||
}
|
||||
|
||||
// M tail (rows past tag_m_Nx). Three sub-cases pick the copy routine and
// kernel family by tail size: more than 16, exactly 16, fewer than 16.
if (tag_m_Nx != M) {
|
||||
m_step = M - tag_m_Nx;
|
||||
if (m_step > 16) {
|
||||
COL_MAJOR_INCOPY_KERNEL_Kx32m(k_step, m_step, &A(idx_k, tag_m_Nx), lda, block_A);
|
||||
for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) {
|
||||
SBGEMM_BLOCK_KERNEL_32x8x32(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, tag_m_Nx), ldc);
|
||||
}
|
||||
|
||||
if (tag_n_Nx != n_to) {
|
||||
n_step = n_to - tag_n_Nx;
|
||||
SBGEMM_BLOCK_KERNEL_32xNx32(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, tag_m_Nx), ldc);
|
||||
}
|
||||
} else if (m_step == 16) {
|
||||
COL_MAJOR_INCOPY_KERNEL_Kx16(k_step, m_step, &A(idx_k, tag_m_Nx), lda, block_A);
|
||||
for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) {
|
||||
SBGEMM_BLOCK_KERNEL_16x8x32(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, tag_m_Nx), ldc);
|
||||
}
|
||||
|
||||
if (tag_n_Nx != n_to) {
|
||||
n_step = n_to - tag_n_Nx;
|
||||
SBGEMM_BLOCK_KERNEL_16xNx32(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, tag_m_Nx), ldc);
|
||||
}
|
||||
} else {
|
||||
COL_MAJOR_INCOPY_KERNEL_Kx16m(k_step, m_step, &A(idx_k, tag_m_Nx), lda, block_A);
|
||||
for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) {
|
||||
SBGEMM_BLOCK_KERNEL_16x8x32(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, tag_m_Nx), ldc);
|
||||
}
|
||||
|
||||
if (tag_n_Nx != n_to) {
|
||||
n_step = n_to - tag_n_Nx;
|
||||
SBGEMM_BLOCK_KERNEL_16xNx32(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, tag_m_Nx), ldc);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Advance to the next K chunk and recompute its size and 32-rounded size.
// After the last chunk k_step becomes 0; the first chunk of the next N panel
// therefore starts with k_step == 0 unless it is reset elsewhere.
// NOTE(review): k_step is not re-initialised per N panel - confirm intent.
idx_k += k_step;
|
||||
k_step = K - idx_k;
|
||||
k_step = (k_step > BF16_BLOCK_THRES_K) ? BF16_BLOCK_THRES_K : k_step;
|
||||
k_step_round32 = k_step & (~31);
|
||||
k_step_round32 = (k_step > k_step_round32) ? (k_step_round32 + 32) : k_step_round32;
|
||||
}
|
||||
|
||||
// Move to the next N panel, clamped to N.
n_from = n_to;
|
||||
n_to += BF16_BLOCK_THRES_N;
|
||||
n_to = (n_to > N) ? N : n_to;
|
||||
tag_n_Nx = n_to & (~(BF16_BLOCK_STEP_N-1));
|
||||
}
|
||||
// Case 2: M is smaller than one full M block; a single partial block is used.
} else {
|
||||
m_step = M - tag_m_Nx;
|
||||
while (n_from < N) {
|
||||
for (BLASLONG idx_k = 0; idx_k < K;) {
|
||||
// Use Kx32 kernel when BF16_BLOCK_THRES_M==32, Kx16 kernel when BF16_BLOCK_THRES_M==16, ...
|
||||
COL_MAJOR_INCOPY_KERNEL_Kx32m(k_step, m_step, &A(idx_k, 0), lda, block_A);
|
||||
// TODO: MT
|
||||
for (BLASLONG idx_n = n_from; idx_n < tag_n_Nx; idx_n += BF16_BLOCK_STEP_N) {
|
||||
// Use 8x32 kernel when BF16_BLOCK_THRES_N==8, 4x32 kernel when BF16_BLOCK_THRES_N==4, ...
|
||||
COL_MAJOR_ONCOPY_KERNEL_8x32(k_step, &B(idx_n, idx_k), ldb, block_B + (idx_n-n_from)*k_step_round32);
|
||||
SBGEMM_BLOCK_KERNEL_32x8x32(m_step, k_step, alpha, block_A, block_B + (idx_n-n_from)*k_step_round32, &C(idx_n, 0), ldc);
|
||||
}
|
||||
|
||||
if (tag_n_Nx != n_to) {
|
||||
n_step = n_to - tag_n_Nx;
|
||||
COL_MAJOR_ONCOPY_KERNEL_Nx32(n_step, k_step, &B(tag_n_Nx, idx_k), ldb, block_B + (tag_n_Nx-n_from)*k_step_round32);
|
||||
SBGEMM_BLOCK_KERNEL_32xNx32(m_step, n_step, k_step, alpha, block_A, block_B + (tag_n_Nx-n_from)*k_step_round32, &C(tag_n_Nx, 0), ldc);
|
||||
}
|
||||
|
||||
// Same K-chunk advance as in case 1.
idx_k += k_step;
|
||||
k_step = K - idx_k;
|
||||
k_step = (k_step > BF16_BLOCK_THRES_K) ? BF16_BLOCK_THRES_K : k_step;
|
||||
k_step_round32 = k_step & (~31);
|
||||
k_step_round32 = (k_step > k_step_round32) ? (k_step_round32 + 32) : k_step_round32;
|
||||
}
|
||||
n_from = n_to;
|
||||
n_to += BF16_BLOCK_THRES_N;
|
||||
n_to = (n_to > N) ? N : n_to;
|
||||
tag_n_Nx = n_to & (~(BF16_BLOCK_STEP_N-1));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Entry point of the internal SBGEMM path. Named ..._alpha unless ONE_ALPHA
// is defined, in which case it is named ..._one.
// Allocates the two packing buffers as local arrays and forwards to
// SBGEMM_BLOCKING_KERNEL_2 for column-major input.
// TransA and TransB are accepted but not read in this function.
#ifndef ONE_ALPHA // ALPHA is not ONE
|
||||
void sbgemm_internal_kernel_alpha(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K,
|
||||
OPENBLAS_CONST float alpha, OPENBLAS_CONST bfloat16 *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST bfloat16 *B, OPENBLAS_CONST blasint ldb, float *C, OPENBLAS_CONST blasint ldc)
|
||||
#else // ALPHA is ONE
|
||||
void sbgemm_internal_kernel_one(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K,
|
||||
OPENBLAS_CONST float alpha, OPENBLAS_CONST bfloat16 *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST bfloat16 *B, OPENBLAS_CONST blasint ldb, float *C, OPENBLAS_CONST blasint ldc)
|
||||
#endif
|
||||
{
|
||||
// Packing buffers for one A block and one B panel; both are automatic
// (function-local) arrays sized from the BF16_BLOCK_THRES_* constants.
bfloat16 block_A[BF16_BLOCK_THRES_K * BF16_BLOCK_THRES_M];
|
||||
bfloat16 block_B[BF16_BLOCK_THRES_N * BF16_BLOCK_THRES_K];
|
||||
|
||||
// TODO: assume no trans for both A and B, to complement these scenarios later
|
||||
if (Order == CblasColMajor) {
|
||||
SBGEMM_BLOCKING_KERNEL_2(M, N, K, alpha, A, lda, B, ldb, C, ldc, block_A, block_B);
|
||||
} else {
|
||||
// NOTE(review): any other Order value falls through here and the function
// returns without touching C - confirm callers never pass row-major.
|
||||
}
|
||||
}
|
|
@ -1,8 +1,11 @@
|
|||
/* the direct sgemm code written by Arjan van der Ven */
|
||||
|
||||
|
||||
#if defined(SKYLAKEX) || defined (COOPERLAKE)
|
||||
|
||||
#include <immintrin.h>
|
||||
#include "common.h"
|
||||
|
||||
#if defined(SKYLAKEX) || defined (COOPERLAKE)
|
||||
/*
|
||||
* "Direct sgemm" code. This code operates directly on the inputs and outputs
|
||||
* of the sgemm call, avoiding the copies, memory realignments and threading,
|
||||
|
|
|
@ -2,7 +2,7 @@
|
|||
|
||||
#if defined(SKYLAKEX)
|
||||
#include "srot_microk_skylakex-2.c"
|
||||
#elif defined(HASWELL)
|
||||
#elif defined(HASWELL) || defined(ZEN)
|
||||
#include "srot_microk_haswell-2.c"
|
||||
#endif
|
||||
|
||||
|
@ -13,7 +13,7 @@ static void srot_kernel(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s)
|
|||
{
|
||||
BLASLONG i = 0;
|
||||
|
||||
#if V_SIMD && (defined(HAVE_FMA3) || V_SIMD > 128)
|
||||
#if V_SIMD && !defined(C_PGI) && (defined(HAVE_FMA3) || V_SIMD > 128)
|
||||
const int vstep = v_nlanes_f32;
|
||||
const int unrollx4 = n & (-vstep * 4);
|
||||
const int unrollx = n & -vstep;
|
||||
|
|
|
@ -1,5 +1,4 @@
|
|||
/* need FMA3 and AVX2 support */
|
||||
#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9))
|
||||
#if defined(HAVE_FMA3) && defined(HAVE_AVX2)
|
||||
|
||||
#define HAVE_SROT_KERNEL 1
|
||||
|
||||
|
|
|
@ -320,12 +320,13 @@
|
|||
$ C, SAFMIN, TEMP, TEMP2, TEMPR, ULP
|
||||
COMPLEX ABI22, AD11, AD12, AD21, AD22, CTEMP, CTEMP2,
|
||||
$ CTEMP3, ESHIFT, RTDISC, S, SHIFT, SIGNBC, T1,
|
||||
$ U12, X
|
||||
$ U12, X, ABI12, Y
|
||||
* ..
|
||||
* .. External Functions ..
|
||||
COMPLEX CLADIV
|
||||
LOGICAL LSAME
|
||||
REAL CLANHS, SLAMCH
|
||||
EXTERNAL LSAME, CLANHS, SLAMCH
|
||||
EXTERNAL CLADIV, LSAME, CLANHS, SLAMCH
|
||||
* ..
|
||||
* .. External Subroutines ..
|
||||
EXTERNAL CLARTG, CLASET, CROT, CSCAL, XERBLA
|
||||
|
@ -729,22 +730,34 @@
|
|||
AD22 = ( ASCALE*H( ILAST, ILAST ) ) /
|
||||
$ ( BSCALE*T( ILAST, ILAST ) )
|
||||
ABI22 = AD22 - U12*AD21
|
||||
ABI12 = AD12 - U12*AD11
|
||||
*
|
||||
T1 = HALF*( AD11+ABI22 )
|
||||
RTDISC = SQRT( T1**2+AD12*AD21-AD11*AD22 )
|
||||
TEMP = REAL( T1-ABI22 )*REAL( RTDISC ) +
|
||||
$ AIMAG( T1-ABI22 )*AIMAG( RTDISC )
|
||||
IF( TEMP.LE.ZERO ) THEN
|
||||
SHIFT = T1 + RTDISC
|
||||
ELSE
|
||||
SHIFT = T1 - RTDISC
|
||||
SHIFT = ABI22
|
||||
CTEMP = SQRT( ABI12 )*SQRT( AD21 )
|
||||
TEMP = ABS1( CTEMP )
|
||||
IF( CTEMP.NE.ZERO ) THEN
|
||||
X = HALF*( AD11-SHIFT )
|
||||
TEMP2 = ABS1( X )
|
||||
TEMP = MAX( TEMP, ABS1( X ) )
|
||||
Y = TEMP*SQRT( ( X / TEMP )**2+( CTEMP / TEMP )**2 )
|
||||
IF( TEMP2.GT.ZERO ) THEN
|
||||
IF( REAL( X / TEMP2 )*REAL( Y )+
|
||||
$ AIMAG( X / TEMP2 )*AIMAG( Y ).LT.ZERO )Y = -Y
|
||||
END IF
|
||||
SHIFT = SHIFT - CTEMP*CLADIV( CTEMP, ( X+Y ) )
|
||||
END IF
|
||||
ELSE
|
||||
*
|
||||
* Exceptional shift. Chosen for no particularly good reason.
|
||||
*
|
||||
ESHIFT = ESHIFT + (ASCALE*H(ILAST,ILAST-1))/
|
||||
$ (BSCALE*T(ILAST-1,ILAST-1))
|
||||
IF( ( IITER / 20 )*20.EQ.IITER .AND.
|
||||
$ BSCALE*ABS1(T( ILAST, ILAST )).GT.SAFMIN ) THEN
|
||||
ESHIFT = ESHIFT + ( ASCALE*H( ILAST,
|
||||
$ ILAST ) )/( BSCALE*T( ILAST, ILAST ) )
|
||||
ELSE
|
||||
ESHIFT = ESHIFT + ( ASCALE*H( ILAST,
|
||||
$ ILAST-1 ) )/( BSCALE*T( ILAST-1, ILAST-1 ) )
|
||||
END IF
|
||||
SHIFT = ESHIFT
|
||||
END IF
|
||||
*
|
||||
|
|
|
@ -320,12 +320,13 @@
|
|||
$ C, SAFMIN, TEMP, TEMP2, TEMPR, ULP
|
||||
COMPLEX*16 ABI22, AD11, AD12, AD21, AD22, CTEMP, CTEMP2,
|
||||
$ CTEMP3, ESHIFT, RTDISC, S, SHIFT, SIGNBC, T1,
|
||||
$ U12, X
|
||||
$ U12, X, ABI12, Y
|
||||
* ..
|
||||
* .. External Functions ..
|
||||
COMPLEX*16 ZLADIV
|
||||
LOGICAL LSAME
|
||||
DOUBLE PRECISION DLAMCH, ZLANHS
|
||||
EXTERNAL LSAME, DLAMCH, ZLANHS
|
||||
EXTERNAL ZLADIV, LSAME, DLAMCH, ZLANHS
|
||||
* ..
|
||||
* .. External Subroutines ..
|
||||
EXTERNAL XERBLA, ZLARTG, ZLASET, ZROT, ZSCAL
|
||||
|
@ -730,22 +731,34 @@
|
|||
AD22 = ( ASCALE*H( ILAST, ILAST ) ) /
|
||||
$ ( BSCALE*T( ILAST, ILAST ) )
|
||||
ABI22 = AD22 - U12*AD21
|
||||
ABI12 = AD12 - U12*AD11
|
||||
*
|
||||
T1 = HALF*( AD11+ABI22 )
|
||||
RTDISC = SQRT( T1**2+AD12*AD21-AD11*AD22 )
|
||||
TEMP = DBLE( T1-ABI22 )*DBLE( RTDISC ) +
|
||||
$ DIMAG( T1-ABI22 )*DIMAG( RTDISC )
|
||||
IF( TEMP.LE.ZERO ) THEN
|
||||
SHIFT = T1 + RTDISC
|
||||
ELSE
|
||||
SHIFT = T1 - RTDISC
|
||||
SHIFT = ABI22
|
||||
CTEMP = SQRT( ABI12 )*SQRT( AD21 )
|
||||
TEMP = ABS1( CTEMP )
|
||||
IF( CTEMP.NE.ZERO ) THEN
|
||||
X = HALF*( AD11-SHIFT )
|
||||
TEMP2 = ABS1( X )
|
||||
TEMP = MAX( TEMP, ABS1( X ) )
|
||||
Y = TEMP*SQRT( ( X / TEMP )**2+( CTEMP / TEMP )**2 )
|
||||
IF( TEMP2.GT.ZERO ) THEN
|
||||
IF( DBLE( X / TEMP2 )*DBLE( Y )+
|
||||
$ DIMAG( X / TEMP2 )*DIMAG( Y ).LT.ZERO )Y = -Y
|
||||
END IF
|
||||
SHIFT = SHIFT - CTEMP*ZLADIV( CTEMP, ( X+Y ) )
|
||||
END IF
|
||||
ELSE
|
||||
*
|
||||
* Exceptional shift. Chosen for no particularly good reason.
|
||||
*
|
||||
ESHIFT = ESHIFT + (ASCALE*H(ILAST,ILAST-1))/
|
||||
$ (BSCALE*T(ILAST-1,ILAST-1))
|
||||
IF( ( IITER / 20 )*20.EQ.IITER .AND.
|
||||
$ BSCALE*ABS1(T( ILAST, ILAST )).GT.SAFMIN ) THEN
|
||||
ESHIFT = ESHIFT + ( ASCALE*H( ILAST,
|
||||
$ ILAST ) )/( BSCALE*T( ILAST, ILAST ) )
|
||||
ELSE
|
||||
ESHIFT = ESHIFT + ( ASCALE*H( ILAST,
|
||||
$ ILAST-1 ) )/( BSCALE*T( ILAST-1, ILAST-1 ) )
|
||||
END IF
|
||||
SHIFT = ESHIFT
|
||||
END IF
|
||||
*
|
||||
|
|
|
@ -174,7 +174,20 @@ if(PYTHONINTERP_FOUND)
|
|||
endif()
|
||||
|
||||
|
||||
|
||||
if(WIN32)
|
||||
# PowerShell test helper written at configure time.
# $args[0] exec (the script appends ".exe"), $args[1] input, $args[2] output_result
# Removes a stale output file, pipes the input file through the executable into
# the output file, then exits 1 if any output line matches "FATAL", else exits 0.
FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_helper.ps1
|
||||
"if (Test-Path $args[2]) { Remove-Item -Force $args[2] } \n"
|
||||
"$ErrorActionPreference = \"Stop\"\n"
|
||||
"Get-Content $args[1] | & \"$($args[0]).exe\" | Out-File $args[2]\n"
|
||||
"If ((Get-Content $args[2] | %{$_ -match \"FATAL\"}) -contains $true) {\n"
|
||||
"echo Error\n"
|
||||
"exit 1\n"
|
||||
"} else {\n"
|
||||
"exit 0\n"
|
||||
"}\n"
|
||||
)
|
||||
set(helper_prefix powershell -ExecutionPolicy Bypass "${CMAKE_CURRENT_BINARY_DIR}/test_helper.ps1")
|
||||
else()
|
||||
# $1 exec, $2 input, $3 output_result
|
||||
FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh
|
||||
"rm -f $3\n"
|
||||
|
@ -187,51 +200,52 @@ FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh
|
|||
"exit 0\n"
|
||||
"fi\n"
|
||||
)
|
||||
|
||||
set(helper_prefix sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh")
|
||||
endif()
|
||||
|
||||
add_test(NAME "REAL_LAPACK_linear_equation_routines"
|
||||
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/stest.in" "${CMAKE_CURRENT_BINARY_DIR}/stest.out"
|
||||
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/stest.in" "${CMAKE_CURRENT_BINARY_DIR}/stest.out"
|
||||
)
|
||||
add_test(NAME "COMPLEX_LAPACK_linear_equation_routines"
|
||||
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ctest.in" "${CMAKE_CURRENT_BINARY_DIR}/ctest.out"
|
||||
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ctest.in" "${CMAKE_CURRENT_BINARY_DIR}/ctest.out"
|
||||
)
|
||||
add_test(NAME "DOUBLE_PRECISION_LAPACK_linear_equation_routines"
|
||||
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN//xlintstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dtest.in" "${CMAKE_CURRENT_BINARY_DIR}/dtest.out"
|
||||
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/LIN//xlintstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dtest.in" "${CMAKE_CURRENT_BINARY_DIR}/dtest.out"
|
||||
)
|
||||
add_test(NAME "COMPLEX16_LAPACK_linear_equation_routines"
|
||||
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN//xlintstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ztest.in" "${CMAKE_CURRENT_BINARY_DIR}/ztest.out"
|
||||
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/LIN//xlintstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ztest.in" "${CMAKE_CURRENT_BINARY_DIR}/ztest.out"
|
||||
)
|
||||
|
||||
add_test(NAME "SINGLE-DOUBLE_PRECISION_LAPACK_prototype_linear_equation_routines"
|
||||
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstds" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dstest.in" " ${CMAKE_CURRENT_BINARY_DIR}/dstest.out"
|
||||
# Helper arguments: executable, input file, output file (no leading space in the output path).
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstds" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dstest.in" "${CMAKE_CURRENT_BINARY_DIR}/dstest.out"
|
||||
)
|
||||
# ======== COMPLEX-COMPLEX16 LIN TESTS ========================
|
||||
|
||||
add_test(NAME "Testing_COMPLEX-COMPLEX16_LAPACK_prototype_linear_equation_routines"
|
||||
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstzc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zctest.in" " ${CMAKE_CURRENT_BINARY_DIR}/zctest.out"
|
||||
# Helper arguments: executable, input file, output file (no leading space in the output path).
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstzc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zctest.in" "${CMAKE_CURRENT_BINARY_DIR}/zctest.out"
|
||||
)
|
||||
|
||||
# ======== SINGLE RFP LIN TESTS ========================
|
||||
|
||||
add_test(NAME "Testing_REAL_LAPACK_RFP_prototype_linear_equation_routines"
|
||||
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstrfs" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/stest_rfp.in" "${CMAKE_CURRENT_BINARY_DIR}/stest_rfp.out"
|
||||
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstrfs" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/stest_rfp.in" "${CMAKE_CURRENT_BINARY_DIR}/stest_rfp.out"
|
||||
)
|
||||
|
||||
# ======== COMPLEX16 RFP LIN TESTS ========================
|
||||
|
||||
add_test(NAME "Testing_DOUBLE_PRECISION_LAPACK_RFP_prototype_linear_equation_routines"
|
||||
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstrfd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dtest_rfp.in" " ${CMAKE_CURRENT_BINARY_DIR}/dtest_rfp.out"
|
||||
# Helper arguments: executable, input file, output file (no leading space in the output path).
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstrfd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dtest_rfp.in" "${CMAKE_CURRENT_BINARY_DIR}/dtest_rfp.out"
|
||||
)
|
||||
# ======== COMPLEX16 RFP LIN TESTS ========================
|
||||
|
||||
add_test(NAME "Testing_COMPLEX_LAPACK_RFP_prototype_linear_equation_routines"
|
||||
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstrfc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ctest_rfp.in" " ${CMAKE_CURRENT_BINARY_DIR}/ctest_rfp.out"
|
||||
# Helper arguments: executable, input file, output file (no leading space in the output path).
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstrfc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ctest_rfp.in" "${CMAKE_CURRENT_BINARY_DIR}/ctest_rfp.out"
|
||||
)
|
||||
|
||||
# ======== COMPLEX16 RFP LIN TESTS ========================
|
||||
|
||||
add_test(NAME "Testing_COMPLEX16_LAPACK_RFP_prototype_linear_equation_routines"
|
||||
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstrfz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ztest_rfp.in" " ${CMAKE_CURRENT_BINARY_DIR}/ztest_rfp.out"
|
||||
# Helper arguments: executable, input file, output file (no leading space in the output path).
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/LIN/xlintstrfz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ztest_rfp.in" "${CMAKE_CURRENT_BINARY_DIR}/ztest_rfp.out"
|
||||
)
|
||||
#
|
||||
#
|
||||
|
@ -239,327 +253,327 @@ add_test(NAME "Testing_COMPLEX16_LAPACK_RFP_prototype_linear_equation_routines"
|
|||
#
|
||||
|
||||
add_test(NAME "SNEP:_Testing_Nonsymmetric_Eigenvalue_Problem_routines"
|
||||
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/nep.in" " ${CMAKE_CURRENT_BINARY_DIR}/snep.out"
|
||||
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/nep.in" " ${CMAKE_CURRENT_BINARY_DIR}/snep.out"
|
||||
)
|
||||
|
||||
add_test(NAME "SSEP:_Testing_Symmetric_Eigenvalue_Problem_routines"
|
||||
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sep.in" " ${CMAKE_CURRENT_BINARY_DIR}/ssep.out"
|
||||
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sep.in" " ${CMAKE_CURRENT_BINARY_DIR}/ssep.out"
|
||||
)
|
||||
|
||||
add_test(NAME "SSE2:_Testing_Symmetric_Eigenvalue_Problem_routines"
|
||||
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/se2.in" " ${CMAKE_CURRENT_BINARY_DIR}/sse2.out"
|
||||
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/se2.in" " ${CMAKE_CURRENT_BINARY_DIR}/sse2.out"
|
||||
)
|
||||
|
||||
add_test(NAME "SSVD:_Testing_Singular_Value_Decomposition_routines"
|
||||
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/svd.in" " ${CMAKE_CURRENT_BINARY_DIR}/ssvd.out"
|
||||
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/svd.in" " ${CMAKE_CURRENT_BINARY_DIR}/ssvd.out"
|
||||
)
|
||||
|
||||
add_test(NAME "SSEC:_Testing_REAL_Eigen_Condition_Routines"
|
||||
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sec.in" " ${CMAKE_CURRENT_BINARY_DIR}/sec.out"
|
||||
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sec.in" " ${CMAKE_CURRENT_BINARY_DIR}/sec.out"
|
||||
)
|
||||
|
||||
add_test(NAME "SSEV:_Testing_REAL_Nonsymmetric_Eigenvalue_Driver"
|
||||
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sed.in" " ${CMAKE_CURRENT_BINARY_DIR}/sed.out"
|
||||
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sed.in" " ${CMAKE_CURRENT_BINARY_DIR}/sed.out"
|
||||
)
|
||||
|
||||
add_test(NAME "SGG:_Testing_REAL_Nonsymmetric_Generalized_Eigenvalue_Problem_routines"
|
||||
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sgg.in" " ${CMAKE_CURRENT_BINARY_DIR}/sgg.out"
|
||||
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sgg.in" " ${CMAKE_CURRENT_BINARY_DIR}/sgg.out"
|
||||
)
|
||||
|
||||
add_test(NAME "SGD:_Testing_REAL_Nonsymmetric_Generalized_Eigenvalue_Problem_driver_routines"
|
||||
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sgd.in" " ${CMAKE_CURRENT_BINARY_DIR}/sgd.out"
|
||||
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sgd.in" " ${CMAKE_CURRENT_BINARY_DIR}/sgd.out"
|
||||
)
|
||||
|
||||
add_test(NAME "SSB:_Testing_REAL_Symmetric_Eigenvalue_Problem_routines"
|
||||
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ssb.in" " ${CMAKE_CURRENT_BINARY_DIR}/ssb.out"
|
||||
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ssb.in" " ${CMAKE_CURRENT_BINARY_DIR}/ssb.out"
|
||||
)
|
||||
|
||||
add_test(NAME "SSG:_Testing_REAL_Symmetric_Generalized_Eigenvalue_Problem_routines"
|
||||
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ssg.in" " ${CMAKE_CURRENT_BINARY_DIR}/ssg.out"
|
||||
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ssg.in" " ${CMAKE_CURRENT_BINARY_DIR}/ssg.out"
|
||||
)
|
||||
|
||||
add_test(NAME "SGEBAL:_Testing_the_balancing_of_a_REAL_general_matrix"
|
||||
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/sbal.out"
|
||||
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/sbal.out"
|
||||
)
|
||||
|
||||
add_test(NAME "SGEBAK:_Testing_the_back_transformation_of_a_REAL_balanced_matrix"
|
||||
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/sbak.out"
|
||||
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/sbak.out"
|
||||
)
|
||||
|
||||
add_test(NAME "SGGBAL:_Testing_the_balancing_of_a_pair_of_REAL_general_matrices"
|
||||
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sgbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/sgbal.out"
|
||||
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sgbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/sgbal.out"
|
||||
)
|
||||
|
||||
add_test(NAME "SGGBAK:_Testing_the_back_transformation_of_a_pair_of_REAL_balanced_matrices"
|
||||
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sgbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/sgbak.out"
|
||||
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sgbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/sgbak.out"
|
||||
)
|
||||
|
||||
add_test(NAME "SBB:_Testing_banded_Singular_Value_Decomposition_routines"
|
||||
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sbb.in" " ${CMAKE_CURRENT_BINARY_DIR}/sbb.out"
|
||||
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sbb.in" " ${CMAKE_CURRENT_BINARY_DIR}/sbb.out"
|
||||
)
|
||||
|
||||
add_test(NAME "SGLM:_Testing_Generalized_Linear_Regression_Model_routines"
|
||||
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/glm.in" " ${CMAKE_CURRENT_BINARY_DIR}/sglm.out"
|
||||
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/glm.in" " ${CMAKE_CURRENT_BINARY_DIR}/sglm.out"
|
||||
)
|
||||
|
||||
add_test(NAME "SGQR:_Testing_Generalized_QR_and_RQ_factorization_routines"
|
||||
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gqr.in" " ${CMAKE_CURRENT_BINARY_DIR}/sgqr.out"
|
||||
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gqr.in" " ${CMAKE_CURRENT_BINARY_DIR}/sgqr.out"
|
||||
)
|
||||
|
||||
add_test(NAME "SGSV:_Testing_Generalized_Singular_Value_Decomposition_routines"
|
||||
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gsv.in" "${CMAKE_CURRENT_BINARY_DIR}/sgsv.out"
|
||||
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gsv.in" "${CMAKE_CURRENT_BINARY_DIR}/sgsv.out"
|
||||
)
|
||||
|
||||
add_test(NAME "SCSD:_Testing_CS_Decomposition_routines"
|
||||
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csd.in" " ${CMAKE_CURRENT_BINARY_DIR}/scsd.out"
|
||||
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csd.in" " ${CMAKE_CURRENT_BINARY_DIR}/scsd.out"
|
||||
)
|
||||
|
||||
add_test(NAME "SLSE:_Testing_Constrained_Linear_Least_Squares_routines"
|
||||
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/lse.in" " ${CMAKE_CURRENT_BINARY_DIR}/slse.out"
|
||||
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtsts" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/lse.in" " ${CMAKE_CURRENT_BINARY_DIR}/slse.out"
|
||||
)
|
||||
|
||||
# ======== COMPLEX EIG TESTS ===========================
|
||||
|
||||
add_test(NAME "CNEP:_Testing_Nonsymmetric_Eigenvalue_Problem_routines"
|
||||
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/nep.in" " ${CMAKE_CURRENT_BINARY_DIR}/cnep.out"
|
||||
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/nep.in" " ${CMAKE_CURRENT_BINARY_DIR}/cnep.out"
|
||||
)
|
||||
|
||||
add_test(NAME "CSEP:_Testing_Symmetric_Eigenvalue_Problem_routines"
|
||||
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sep.in" " ${CMAKE_CURRENT_BINARY_DIR}/csep.out"
|
||||
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sep.in" " ${CMAKE_CURRENT_BINARY_DIR}/csep.out"
|
||||
)
|
||||
|
||||
add_test(NAME "CSE2:_Testing_Symmetric_Eigenvalue_Problem_routines"
|
||||
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/se2.in" " ${CMAKE_CURRENT_BINARY_DIR}/cse2.out"
|
||||
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/se2.in" " ${CMAKE_CURRENT_BINARY_DIR}/cse2.out"
|
||||
)
|
||||
|
||||
add_test(NAME "CSVD:_Testing_Singular_Value_Decomposition_routines"
|
||||
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/svd.in" " ${CMAKE_CURRENT_BINARY_DIR}/csvd.out"
|
||||
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/svd.in" " ${CMAKE_CURRENT_BINARY_DIR}/csvd.out"
|
||||
)
|
||||
|
||||
add_test(NAME "CEC:_Testing_COMPLEX_Eigen_Condition_Routines"
|
||||
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cec.in" " ${CMAKE_CURRENT_BINARY_DIR}/cec.out"
|
||||
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cec.in" " ${CMAKE_CURRENT_BINARY_DIR}/cec.out"
|
||||
)
|
||||
|
||||
add_test(NAME "CES:_Testing_COMPLEX_Nonsymmetric_Schur_Form_Driver"
|
||||
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ced.in" " ${CMAKE_CURRENT_BINARY_DIR}/ced.out"
|
||||
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ced.in" " ${CMAKE_CURRENT_BINARY_DIR}/ced.out"
|
||||
)
|
||||
|
||||
add_test(NAME "CGG:_Testing_COMPLEX_Nonsymmetric_Generalized_Eigenvalue_Problem_routines"
|
||||
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cgg.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgg.out"
|
||||
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cgg.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgg.out"
|
||||
)
|
||||
|
||||
add_test(NAME "CGD:_Testing_COMPLEX_Nonsymmetric_Generalized_Eigenvalue_Problem_driver_routines"
|
||||
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cgd.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgd.out"
|
||||
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cgd.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgd.out"
|
||||
)
|
||||
|
||||
add_test(NAME "CHB:_Testing_Hermitian_Eigenvalue_Problem_routines"
|
||||
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csb.in" " ${CMAKE_CURRENT_BINARY_DIR}/csb.out"
|
||||
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csb.in" " ${CMAKE_CURRENT_BINARY_DIR}/csb.out"
|
||||
)
|
||||
|
||||
add_test(NAME "CSG:_Testing_Symmetric_Generalized_Eigenvalue_Problem_routines"
|
||||
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csg.in" " ${CMAKE_CURRENT_BINARY_DIR}/csg.out"
|
||||
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csg.in" " ${CMAKE_CURRENT_BINARY_DIR}/csg.out"
|
||||
)
|
||||
|
||||
add_test(NAME "CGEBAL:_Testing_the_balancing_of_a_COMPLEX_general_matrix"
|
||||
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/cbal.out"
|
||||
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/cbal.out"
|
||||
)
|
||||
|
||||
add_test(NAME "CGEBAK:_Testing_the_back_transformation_of_a_COMPLEX_balanced_matrix"
|
||||
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/cbak.out"
|
||||
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/cbak.out"
|
||||
)
|
||||
|
||||
add_test(NAME "CGGBAL:_Testing_the_balancing_of_a_pair_of_COMPLEX_general_matrices"
|
||||
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cgbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgbal.out"
|
||||
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cgbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgbal.out"
|
||||
)
|
||||
|
||||
add_test(NAME "CGGBAK:_Testing_the_back_transformation_of_a_pair_of_COMPLEX_balanced_matrices"
|
||||
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cgbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgbak.out"
|
||||
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cgbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgbak.out"
|
||||
)
|
||||
|
||||
add_test(NAME "CBB:_Testing_banded_Singular_Value_Decomposition_routines"
|
||||
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cbb.in" " ${CMAKE_CURRENT_BINARY_DIR}/cbb.out"
|
||||
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/cbb.in" " ${CMAKE_CURRENT_BINARY_DIR}/cbb.out"
|
||||
)
|
||||
|
||||
add_test(NAME "CGLM:_Testing_Generalized_Linear_Regression_Model_routines"
|
||||
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/glm.in" " ${CMAKE_CURRENT_BINARY_DIR}/cglm.out"
|
||||
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/glm.in" " ${CMAKE_CURRENT_BINARY_DIR}/cglm.out"
|
||||
)
|
||||
|
||||
add_test(NAME "CGQR:_Testing_Generalized_QR_and_RQ_factorization_routines"
|
||||
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gqr.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgqr.out"
|
||||
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gqr.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgqr.out"
|
||||
)
|
||||
|
||||
add_test(NAME "CGSV:_Testing_Generalized_Singular_Value_Decomposition_routines"
|
||||
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gsv.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgsv.out"
|
||||
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gsv.in" " ${CMAKE_CURRENT_BINARY_DIR}/cgsv.out"
|
||||
)
|
||||
|
||||
add_test(NAME "CCSD:_Testing_CS_Decomposition_routines"
|
||||
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csd.in" " ${CMAKE_CURRENT_BINARY_DIR}/ccsd.out"
|
||||
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csd.in" " ${CMAKE_CURRENT_BINARY_DIR}/ccsd.out"
|
||||
)
|
||||
|
||||
add_test(NAME "CLSE:_Testing_Constrained_Linear_Least_Squares_routines"
|
||||
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/lse.in" " ${CMAKE_CURRENT_BINARY_DIR}/clse.out"
|
||||
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstc" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/lse.in" " ${CMAKE_CURRENT_BINARY_DIR}/clse.out"
|
||||
)
|
||||
|
||||
# ======== DOUBLE EIG TESTS ===========================
|
||||
|
||||
add_test(NAME "DNEP:_Testing_Nonsymmetric_Eigenvalue_Problem_routines"
|
||||
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/nep.in" " ${CMAKE_CURRENT_BINARY_DIR}/dnep.out"
|
||||
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/nep.in" " ${CMAKE_CURRENT_BINARY_DIR}/dnep.out"
|
||||
)
|
||||
|
||||
add_test(NAME "DSEP:_Testing_Symmetric_Eigenvalue_Problem_routines"
|
||||
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sep.in" " ${CMAKE_CURRENT_BINARY_DIR}/dsep.out"
|
||||
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sep.in" " ${CMAKE_CURRENT_BINARY_DIR}/dsep.out"
|
||||
)
|
||||
|
||||
add_test(NAME "DSE2:_Testing_Symmetric_Eigenvalue_Problem_routines"
|
||||
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/se2.in" " ${CMAKE_CURRENT_BINARY_DIR}/dse2.out"
|
||||
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/se2.in" " ${CMAKE_CURRENT_BINARY_DIR}/dse2.out"
|
||||
)
|
||||
|
||||
add_test(NAME "DSVD:_Testing_Singular_Value_Decomposition_routines"
|
||||
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/svd.in" " ${CMAKE_CURRENT_BINARY_DIR}/dsvd.out"
|
||||
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/svd.in" " ${CMAKE_CURRENT_BINARY_DIR}/dsvd.out"
|
||||
)
|
||||
|
||||
add_test(NAME "DEC:_Testing_DOUBLE_PRECISION_Eigen_Condition_Routines"
|
||||
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dec.in" " ${CMAKE_CURRENT_BINARY_DIR}/dec.out"
|
||||
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dec.in" " ${CMAKE_CURRENT_BINARY_DIR}/dec.out"
|
||||
)
|
||||
|
||||
add_test(NAME "DEV:_Testing_DOUBLE_PRECISION_Nonsymmetric_Eigenvalue_Driver"
|
||||
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ded.in" " ${CMAKE_CURRENT_BINARY_DIR}/ded.out"
|
||||
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/ded.in" " ${CMAKE_CURRENT_BINARY_DIR}/ded.out"
|
||||
)
|
||||
|
||||
add_test(NAME "DGG:_Testing_DOUBLE_PRECISION_Nonsymmetric_Generalized_Eigenvalue_Problem_routines"
|
||||
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dgg.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgg.out"
|
||||
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dgg.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgg.out"
|
||||
)
|
||||
|
||||
add_test(NAME "DGD:_Testing_DOUBLE_PRECISION_Nonsymmetric_Generalized_Eigenvalue_Problem_driver_routines"
|
||||
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dgd.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgd.out"
|
||||
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dgd.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgd.out"
|
||||
)
|
||||
|
||||
add_test(NAME "DSB:_Testing_DOUBLE_PRECISION_Symmetric_Eigenvalue_Problem_routines"
|
||||
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dsb.in" " ${CMAKE_CURRENT_BINARY_DIR}/dsb.out"
|
||||
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dsb.in" " ${CMAKE_CURRENT_BINARY_DIR}/dsb.out"
|
||||
)
|
||||
|
||||
add_test(NAME "DSG:_Testing_DOUBLE_PRECISION_Symmetric_Generalized_Eigenvalue_Problem_routines"
|
||||
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dsg.in" " ${CMAKE_CURRENT_BINARY_DIR}/dsg.out"
|
||||
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dsg.in" " ${CMAKE_CURRENT_BINARY_DIR}/dsg.out"
|
||||
)
|
||||
|
||||
add_test(NAME "DGEBAL:_Testing_the_balancing_of_a_DOUBLE_PRECISION_general_matrix"
|
||||
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/dbal.out"
|
||||
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/dbal.out"
|
||||
)
|
||||
|
||||
add_test(NAME "DGEBAK:_Testing_the_back_transformation_of_a_DOUBLE_PRECISION_balanced_matrix"
|
||||
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/dbak.out"
|
||||
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/dbak.out"
|
||||
)
|
||||
|
||||
add_test(NAME "DGGBAL:_Testing_the_balancing_of_a_pair_of_DOUBLE_PRECISION_general_matrices"
|
||||
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dgbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgbal.out"
|
||||
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dgbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgbal.out"
|
||||
)
|
||||
|
||||
add_test(NAME "DGGBAK:_Testing_the_back_transformation_of_a_pair_of_DOUBLE_PRECISION_balanced_matrices"
|
||||
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dgbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgbak.out"
|
||||
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dgbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgbak.out"
|
||||
)
|
||||
|
||||
add_test(NAME "DBB:_Testing_banded_Singular_Value_Decomposition_routines"
|
||||
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dbb.in" " ${CMAKE_CURRENT_BINARY_DIR}/dbb.out"
|
||||
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/dbb.in" " ${CMAKE_CURRENT_BINARY_DIR}/dbb.out"
|
||||
)
|
||||
|
||||
add_test(NAME "DGLM:_Testing_Generalized_Linear_Regression_Model_routines"
|
||||
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/glm.in" " ${CMAKE_CURRENT_BINARY_DIR}/dglm.out"
|
||||
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/glm.in" " ${CMAKE_CURRENT_BINARY_DIR}/dglm.out"
|
||||
)
|
||||
|
||||
add_test(NAME "DGQR:_Testing_Generalized_QR_and_RQ_factorization_routines"
|
||||
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gqr.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgqr.out"
|
||||
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gqr.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgqr.out"
|
||||
)
|
||||
|
||||
add_test(NAME "DGSV:_Testing_Generalized_Singular_Value_Decomposition_routines"
|
||||
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gsv.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgsv.out"
|
||||
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gsv.in" " ${CMAKE_CURRENT_BINARY_DIR}/dgsv.out"
|
||||
)
|
||||
|
||||
add_test(NAME "DCSD:_Testing_CS_Decomposition_routines"
|
||||
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csd.in" " ${CMAKE_CURRENT_BINARY_DIR}/dcsd.out"
|
||||
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csd.in" " ${CMAKE_CURRENT_BINARY_DIR}/dcsd.out"
|
||||
)
|
||||
|
||||
add_test(NAME "DLSE:_Testing_Constrained_Linear_Least_Squares_routines"
|
||||
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/lse.in" " ${CMAKE_CURRENT_BINARY_DIR}/dlse.out"
|
||||
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstd" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/lse.in" " ${CMAKE_CURRENT_BINARY_DIR}/dlse.out"
|
||||
)
|
||||
|
||||
# ======== COMPLEX16 EIG TESTS ===========================
|
||||
|
||||
add_test(NAME "ZNEP:_Testing_Nonsymmetric_Eigenvalue_Problem_routines"
|
||||
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/nep.in" " ${CMAKE_CURRENT_BINARY_DIR}/znep.out"
|
||||
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/nep.in" " ${CMAKE_CURRENT_BINARY_DIR}/znep.out"
|
||||
)
|
||||
|
||||
add_test(NAME "ZSEP:_Testing_Symmetric_Eigenvalue_Problem_routines"
|
||||
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sep.in" " ${CMAKE_CURRENT_BINARY_DIR}/zsep.out"
|
||||
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/sep.in" " ${CMAKE_CURRENT_BINARY_DIR}/zsep.out"
|
||||
)
|
||||
|
||||
add_test(NAME "ZSE2:_Testing_Symmetric_Eigenvalue_Problem_routines"
|
||||
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/se2.in" " ${CMAKE_CURRENT_BINARY_DIR}/zse2.out"
|
||||
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/se2.in" " ${CMAKE_CURRENT_BINARY_DIR}/zse2.out"
|
||||
)
|
||||
|
||||
add_test(NAME "ZSVD:_Testing_Singular_Value_Decomposition_routines"
|
||||
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/svd.in" " ${CMAKE_CURRENT_BINARY_DIR}/zsvd.out"
|
||||
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/svd.in" " ${CMAKE_CURRENT_BINARY_DIR}/zsvd.out"
|
||||
)
|
||||
|
||||
add_test(NAME "ZEC:_Testing_COMPLEX16_Eigen_Condition_Routines"
|
||||
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zec.in" " ${CMAKE_CURRENT_BINARY_DIR}/zec.out"
|
||||
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zec.in" " ${CMAKE_CURRENT_BINARY_DIR}/zec.out"
|
||||
)
|
||||
|
||||
add_test(NAME "ZES:_Testing_COMPLEX16_Nonsymmetric_Schur_Form_Driver"
|
||||
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zed.in" " ${CMAKE_CURRENT_BINARY_DIR}/zed.out"
|
||||
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zed.in" " ${CMAKE_CURRENT_BINARY_DIR}/zed.out"
|
||||
)
|
||||
|
||||
add_test(NAME "ZGG:_Testing_COMPLEX16_Nonsymmetric_Generalized_Eigenvalue_Problem_routines"
|
||||
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zgg.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgg.out"
|
||||
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zgg.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgg.out"
|
||||
)
|
||||
|
||||
add_test(NAME "ZGD:_Testing_COMPLEX16_Nonsymmetric_Generalized_Eigenvalue_Problem_driver_routines"
|
||||
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zgd.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgd.out"
|
||||
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zgd.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgd.out"
|
||||
)
|
||||
|
||||
add_test(NAME "ZHB:_Testing_Hermitian_Eigenvalue_Problem_routines"
|
||||
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zsb.in" " ${CMAKE_CURRENT_BINARY_DIR}/zsb.out"
|
||||
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zsb.in" " ${CMAKE_CURRENT_BINARY_DIR}/zsb.out"
|
||||
)
|
||||
|
||||
add_test(NAME "ZSG:_Testing_Symmetric_Generalized_Eigenvalue_Problem_routines"
|
||||
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zsg.in" " ${CMAKE_CURRENT_BINARY_DIR}/zsg.out"
|
||||
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zsg.in" " ${CMAKE_CURRENT_BINARY_DIR}/zsg.out"
|
||||
)
|
||||
|
||||
add_test(NAME "ZGEBAL:_Testing_the_balancing_of_a_COMPLEX16_general_matrix"
|
||||
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/zbal.out"
|
||||
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/zbal.out"
|
||||
)
|
||||
|
||||
add_test(NAME "ZGEBAK:_Testing_the_back_transformation_of_a_COMPLEX16_balanced_matrix"
|
||||
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/zbak.out"
|
||||
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/zbak.out"
|
||||
)
|
||||
|
||||
add_test(NAME "ZGGBAL:_Testing_the_balancing_of_a_pair_of_COMPLEX_general_matrices"
|
||||
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zgbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgbal.out"
|
||||
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zgbal.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgbal.out"
|
||||
)
|
||||
|
||||
add_test(NAME "ZGGBAK:_Testing_the_back_transformation_of_a_pair_of_COMPLEX16_balanced_matrices"
|
||||
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zgbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgbak.out"
|
||||
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zgbak.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgbak.out"
|
||||
)
|
||||
|
||||
add_test(NAME "ZBB:_Testing_banded_Singular_Value_Decomposition_routines"
|
||||
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zbb.in" " ${CMAKE_CURRENT_BINARY_DIR}/zbb.out"
|
||||
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/zbb.in" " ${CMAKE_CURRENT_BINARY_DIR}/zbb.out"
|
||||
)
|
||||
|
||||
add_test(NAME "ZGLM:_Testing_Generalized_Linear_Regression_Model_routines"
|
||||
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/glm.in" " ${CMAKE_CURRENT_BINARY_DIR}/zglm.out"
|
||||
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/glm.in" " ${CMAKE_CURRENT_BINARY_DIR}/zglm.out"
|
||||
)
|
||||
|
||||
add_test(NAME "ZGQR:_Testing_Generalized_QR_and_RQ_factorization_routines"
|
||||
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gqr.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgqr.out"
|
||||
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gqr.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgqr.out"
|
||||
)
|
||||
|
||||
add_test(NAME "ZGSV:_Testing_Generalized_Singular_Value_Decomposition_routines"
|
||||
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gsv.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgsv.out"
|
||||
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/gsv.in" " ${CMAKE_CURRENT_BINARY_DIR}/zgsv.out"
|
||||
)
|
||||
|
||||
add_test(NAME "ZCSD:_Testing_CS_Decomposition_routines"
|
||||
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csd.in" " ${CMAKE_CURRENT_BINARY_DIR}/zcsd.out"
|
||||
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/csd.in" " ${CMAKE_CURRENT_BINARY_DIR}/zcsd.out"
|
||||
)
|
||||
|
||||
add_test(NAME "Constrained_Linear_Least_Squares_routines"
|
||||
COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/lse.in" " ${CMAKE_CURRENT_BINARY_DIR}/zlse.out"
|
||||
COMMAND ${helper_prefix} "${CMAKE_CURRENT_BINARY_DIR}/EIG/xeigtstz" "${PROJECT_SOURCE_DIR}/lapack-netlib/TESTING/lse.in" " ${CMAKE_CURRENT_BINARY_DIR}/zlse.out"
|
||||
)
|
||||
|
|
|
@ -25,7 +25,7 @@ set(AEIGTST
|
|||
set(SCIGTST slafts.f slahd2.f slasum.f slatb9.f sstech.f sstect.f
|
||||
ssvdch.f ssvdct.f ssxt1.f)
|
||||
|
||||
set(SEIGTST schkee.f
|
||||
set(SEIGTST schkee.F
|
||||
sbdt01.f sbdt02.f sbdt03.f sbdt04.f sbdt05.f
|
||||
schkbb.f schkbd.f schkbk.f schkbl.f schkec.f
|
||||
schkgg.f schkgk.f schkgl.f schkhs.f schksb.f schkst.f schkst2stg.f schksb2stg.f
|
||||
|
@ -42,7 +42,7 @@ set(SEIGTST schkee.f
|
|||
sort03.f ssbt21.f ssgt01.f sslect.f sspt21.f sstt21.f
|
||||
sstt22.f ssyt21.f ssyt22.f)
|
||||
|
||||
set(CEIGTST cchkee.f
|
||||
set(CEIGTST cchkee.F
|
||||
cbdt01.f cbdt02.f cbdt03.f cbdt05.f
|
||||
cchkbb.f cchkbd.f cchkbk.f cchkbl.f cchkec.f
|
||||
cchkgg.f cchkgk.f cchkgl.f cchkhb.f cchkhs.f cchkst.f cchkst2stg.f cchkhb2stg.f
|
||||
|
@ -62,7 +62,7 @@ set(CEIGTST cchkee.f
|
|||
set(DZIGTST dlafts.f dlahd2.f dlasum.f dlatb9.f dstech.f dstect.f
|
||||
dsvdch.f dsvdct.f dsxt1.f)
|
||||
|
||||
set(DEIGTST dchkee.f
|
||||
set(DEIGTST dchkee.F
|
||||
dbdt01.f dbdt02.f dbdt03.f dbdt04.f dbdt05.f
|
||||
dchkbb.f dchkbd.f dchkbk.f dchkbl.f dchkec.f
|
||||
dchkgg.f dchkgk.f dchkgl.f dchkhs.f dchksb.f dchkst.f dchkst2stg.f dchksb2stg.f
|
||||
|
@ -79,7 +79,7 @@ set(DEIGTST dchkee.f
|
|||
dort03.f dsbt21.f dsgt01.f dslect.f dspt21.f dstt21.f
|
||||
dstt22.f dsyt21.f dsyt22.f)
|
||||
|
||||
set(ZEIGTST zchkee.f
|
||||
set(ZEIGTST zchkee.F
|
||||
zbdt01.f zbdt02.f zbdt03.f zbdt05.f
|
||||
zchkbb.f zchkbd.f zchkbk.f zchkbl.f zchkec.f
|
||||
zchkgg.f zchkgk.f zchkgl.f zchkhb.f zchkhs.f zchkst.f zchkst2stg.f zchkhb2stg.f
|
||||
|
|
|
@ -157,11 +157,11 @@ cleanobj:
|
|||
cleanexe:
|
||||
rm -f xeigtst*
|
||||
|
||||
schkee.o: schkee.f
|
||||
schkee.o: schkee.F
|
||||
$(FC) $(FFLAGS_DRV) -c -o $@ $<
|
||||
dchkee.o: dchkee.f
|
||||
dchkee.o: dchkee.F
|
||||
$(FC) $(FFLAGS_DRV) -c -o $@ $<
|
||||
cchkee.o: cchkee.f
|
||||
cchkee.o: cchkee.F
|
||||
$(FC) $(FFLAGS_DRV) -c -o $@ $<
|
||||
zchkee.o: zchkee.f
|
||||
zchkee.o: zchkee.F
|
||||
$(FC) $(FFLAGS_DRV) -c -o $@ $<
|
||||
|
|
|
@ -1034,6 +1034,10 @@
|
|||
* =====================================================================
|
||||
PROGRAM CCHKEE
|
||||
*
|
||||
#if defined(_OPENMP)
|
||||
use omp_lib
|
||||
#endif
|
||||
*
|
||||
* -- LAPACK test routine (version 3.7.0) --
|
||||
* -- LAPACK is a software package provided by Univ. of Tennessee, --
|
||||
* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
|
||||
|
@ -1071,7 +1075,7 @@
|
|||
CHARACTER*80 LINE
|
||||
INTEGER I, I1, IC, INFO, ITMP, K, LENP, MAXTYP, NEWSD,
|
||||
$ NK, NN, NPARMS, NRHS, NTYPES,
|
||||
$ VERS_MAJOR, VERS_MINOR, VERS_PATCH
|
||||
$ VERS_MAJOR, VERS_MINOR, VERS_PATCH, N_THREADS
|
||||
REAL EPS, S1, S2, THRESH, THRSHN
|
||||
* ..
|
||||
* .. Local Arrays ..
|
||||
|
@ -1084,12 +1088,16 @@
|
|||
INTEGER INMIN( MAXIN ), INWIN( MAXIN ), INIBL( MAXIN ),
|
||||
$ ISHFTS( MAXIN ), IACC22( MAXIN )
|
||||
REAL ALPHA( NMAX ), BETA( NMAX ), DR( NMAX, 12 ),
|
||||
$ RESULT( 500 ), RWORK( LWORK ), S( NMAX*NMAX )
|
||||
COMPLEX A( NMAX*NMAX, NEED ), B( NMAX*NMAX, 5 ),
|
||||
$ C( NCMAX*NCMAX, NCMAX*NCMAX ), DC( NMAX, 6 ),
|
||||
$ TAUA( NMAX ), TAUB( NMAX ), WORK( LWORK ),
|
||||
$ RESULT( 500 )
|
||||
COMPLEX DC( NMAX, 6 ), TAUA( NMAX ), TAUB( NMAX ),
|
||||
$ X( 5*NMAX )
|
||||
* ..
|
||||
* .. Allocatable Arrays ..
|
||||
INTEGER AllocateStatus
|
||||
REAL, DIMENSION(:), ALLOCATABLE :: RWORK, S
|
||||
COMPLEX, DIMENSION(:), ALLOCATABLE :: WORK
|
||||
COMPLEX, DIMENSION(:,:), ALLOCATABLE :: A, B, C
|
||||
* ..
|
||||
* .. External Functions ..
|
||||
LOGICAL LSAMEN
|
||||
REAL SECOND, SLAMCH
|
||||
|
@ -1130,6 +1138,21 @@
|
|||
DATA INTSTR / '0123456789' /
|
||||
DATA IOLDSD / 0, 0, 0, 1 /
|
||||
* ..
|
||||
* .. Allocate memory dynamically ..
|
||||
*
|
||||
ALLOCATE ( S(NMAX*NMAX), STAT = AllocateStatus )
|
||||
IF (AllocateStatus /= 0) STOP "*** Not enough memory ***"
|
||||
ALLOCATE ( A(NMAX*NMAX,NEED), STAT = AllocateStatus )
|
||||
IF (AllocateStatus /= 0) STOP "*** Not enough memory ***"
|
||||
ALLOCATE ( B(NMAX*NMAX,5), STAT = AllocateStatus )
|
||||
IF (AllocateStatus /= 0) STOP "*** Not enough memory ***"
|
||||
ALLOCATE ( C(NCMAX*NCMAX,NCMAX*NCMAX), STAT = AllocateStatus )
|
||||
IF (AllocateStatus /= 0) STOP "*** Not enough memory ***"
|
||||
ALLOCATE ( RWORK(LWORK), STAT = AllocateStatus )
|
||||
IF (AllocateStatus /= 0) STOP "*** Not enough memory ***"
|
||||
ALLOCATE ( WORK(LWORK), STAT = AllocateStatus )
|
||||
IF (AllocateStatus /= 0) STOP "*** Not enough memory ***"
|
||||
* ..
|
||||
* .. Executable Statements ..
|
||||
*
|
||||
A = 0.0
|
||||
|
@ -1846,8 +1869,16 @@
|
|||
CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT )
|
||||
CALL XLAENV( 1, 1 )
|
||||
CALL XLAENV( 9, 25 )
|
||||
IF( TSTERR )
|
||||
$ CALL CERRST( 'CST', NOUT )
|
||||
IF( TSTERR ) THEN
|
||||
#if defined(_OPENMP)
|
||||
N_THREADS = OMP_GET_NUM_THREADS()
|
||||
CALL OMP_SET_NUM_THREADS(1)
|
||||
#endif
|
||||
CALL CERRST( 'CST', NOUT )
|
||||
#if defined(_OPENMP)
|
||||
CALL OMP_SET_NUM_THREADS(N_THREADS)
|
||||
#endif
|
||||
END IF
|
||||
DO 290 I = 1, NPARMS
|
||||
CALL XLAENV( 1, NBVAL( I ) )
|
||||
CALL XLAENV( 2, NBMIN( I ) )
|
||||
|
@ -2305,8 +2336,16 @@
|
|||
MAXTYP = 15
|
||||
NTYPES = MIN( MAXTYP, NTYPES )
|
||||
CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT )
|
||||
IF( TSTERR )
|
||||
$ CALL CERRST( 'CHB', NOUT )
|
||||
IF( TSTERR ) THEN
|
||||
#if defined(_OPENMP)
|
||||
N_THREADS = OMP_GET_NUM_THREADS()
|
||||
CALL OMP_SET_NUM_THREADS(1)
|
||||
#endif
|
||||
CALL CERRST( 'CHB', NOUT )
|
||||
#if defined(_OPENMP)
|
||||
CALL OMP_SET_NUM_THREADS(N_THREADS)
|
||||
#endif
|
||||
END IF
|
||||
* CALL CCHKHB( NN, NVAL, NK, KVAL, MAXTYP, DOTYPE, ISEED, THRESH,
|
||||
* $ NOUT, A( 1, 1 ), NMAX, DR( 1, 1 ), DR( 1, 2 ),
|
||||
* $ A( 1, 2 ), NMAX, WORK, LWORK, RWORK, RESULT,
|
||||
|
@ -2437,6 +2476,13 @@
|
|||
WRITE( NOUT, FMT = 9994 )
|
||||
S2 = SECOND( )
|
||||
WRITE( NOUT, FMT = 9993 )S2 - S1
|
||||
*
|
||||
DEALLOCATE (S, STAT = AllocateStatus)
|
||||
DEALLOCATE (A, STAT = AllocateStatus)
|
||||
DEALLOCATE (B, STAT = AllocateStatus)
|
||||
DEALLOCATE (C, STAT = AllocateStatus)
|
||||
DEALLOCATE (RWORK, STAT = AllocateStatus)
|
||||
DEALLOCATE (WORK, STAT = AllocateStatus)
|
||||
*
|
||||
9999 FORMAT( / ' Execution not attempted due to input errors' )
|
||||
9997 FORMAT( / / 1X, A3, ': NB =', I4, ', NBMIN =', I4, ', NX =', I4 )
|
|
@ -1040,6 +1040,10 @@
|
|||
* =====================================================================
|
||||
PROGRAM DCHKEE
|
||||
*
|
||||
#if defined(_OPENMP)
|
||||
use omp_lib
|
||||
#endif
|
||||
*
|
||||
* -- LAPACK test routine (version 3.7.0) --
|
||||
* -- LAPACK is a software package provided by Univ. of Tennessee, --
|
||||
* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
|
||||
|
@ -1077,7 +1081,7 @@
|
|||
CHARACTER*80 LINE
|
||||
INTEGER I, I1, IC, INFO, ITMP, K, LENP, MAXTYP, NEWSD,
|
||||
$ NK, NN, NPARMS, NRHS, NTYPES,
|
||||
$ VERS_MAJOR, VERS_MINOR, VERS_PATCH
|
||||
$ VERS_MAJOR, VERS_MINOR, VERS_PATCH, N_THREADS
|
||||
DOUBLE PRECISION EPS, S1, S2, THRESH, THRSHN
|
||||
* ..
|
||||
* .. Local Arrays ..
|
||||
|
@ -1089,10 +1093,13 @@
|
|||
$ PVAL( MAXIN )
|
||||
INTEGER INMIN( MAXIN ), INWIN( MAXIN ), INIBL( MAXIN ),
|
||||
$ ISHFTS( MAXIN ), IACC22( MAXIN )
|
||||
DOUBLE PRECISION A( NMAX*NMAX, NEED ), B( NMAX*NMAX, 5 ),
|
||||
$ C( NCMAX*NCMAX, NCMAX*NCMAX ), D( NMAX, 12 ),
|
||||
$ RESULT( 500 ), TAUA( NMAX ), TAUB( NMAX ),
|
||||
$ WORK( LWORK ), X( 5*NMAX )
|
||||
DOUBLE PRECISION D( NMAX, 12 ), RESULT( 500 ), TAUA( NMAX ),
|
||||
$ TAUB( NMAX ), X( 5*NMAX )
|
||||
* ..
|
||||
* .. Allocatable Arrays ..
|
||||
INTEGER AllocateStatus
|
||||
DOUBLE PRECISION, DIMENSION(:), ALLOCATABLE :: WORK
|
||||
DOUBLE PRECISION, DIMENSION(:,:), ALLOCATABLE :: A, B, C
|
||||
* ..
|
||||
* .. External Functions ..
|
||||
LOGICAL LSAMEN
|
||||
|
@ -1134,6 +1141,17 @@
|
|||
DATA INTSTR / '0123456789' /
|
||||
DATA IOLDSD / 0, 0, 0, 1 /
|
||||
* ..
|
||||
* .. Allocate memory dynamically ..
|
||||
*
|
||||
ALLOCATE ( A(NMAX*NMAX,NEED), STAT = AllocateStatus )
|
||||
IF (AllocateStatus /= 0) STOP "*** Not enough memory ***"
|
||||
ALLOCATE ( B(NMAX*NMAX,5), STAT = AllocateStatus )
|
||||
IF (AllocateStatus /= 0) STOP "*** Not enough memory ***"
|
||||
ALLOCATE ( C(NCMAX*NCMAX,NCMAX*NCMAX), STAT = AllocateStatus )
|
||||
IF (AllocateStatus /= 0) STOP "*** Not enough memory ***"
|
||||
ALLOCATE ( WORK(LWORK), STAT = AllocateStatus )
|
||||
IF (AllocateStatus /= 0) STOP "*** Not enough memory ***"
|
||||
* ..
|
||||
* .. Executable Statements ..
|
||||
*
|
||||
A = 0.0
|
||||
|
@ -1856,8 +1874,16 @@
|
|||
CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT )
|
||||
CALL XLAENV( 1, 1 )
|
||||
CALL XLAENV( 9, 25 )
|
||||
IF( TSTERR )
|
||||
$ CALL DERRST( 'DST', NOUT )
|
||||
IF( TSTERR ) THEN
|
||||
#if defined(_OPENMP)
|
||||
N_THREADS = OMP_GET_NUM_THREADS()
|
||||
CALL OMP_SET_NUM_THREADS(1)
|
||||
#endif
|
||||
CALL DERRST( 'DST', NOUT )
|
||||
#if defined(_OPENMP)
|
||||
CALL OMP_SET_NUM_THREADS(N_THREADS)
|
||||
#endif
|
||||
END IF
|
||||
DO 290 I = 1, NPARMS
|
||||
CALL XLAENV( 1, NBVAL( I ) )
|
||||
CALL XLAENV( 2, NBMIN( I ) )
|
||||
|
@ -2437,6 +2463,11 @@
|
|||
WRITE( NOUT, FMT = 9994 )
|
||||
S2 = DSECND( )
|
||||
WRITE( NOUT, FMT = 9993 )S2 - S1
|
||||
*
|
||||
DEALLOCATE (A, STAT = AllocateStatus)
|
||||
DEALLOCATE (B, STAT = AllocateStatus)
|
||||
DEALLOCATE (C, STAT = AllocateStatus)
|
||||
DEALLOCATE (WORK, STAT = AllocateStatus)
|
||||
*
|
||||
9999 FORMAT( / ' Execution not attempted due to input errors' )
|
||||
9997 FORMAT( / / 1X, A3, ': NB =', I4, ', NBMIN =', I4, ', NX =', I4 )
|
|
@ -1040,6 +1040,10 @@
|
|||
* =====================================================================
|
||||
PROGRAM SCHKEE
|
||||
*
|
||||
#if defined(_OPENMP)
|
||||
use omp_lib
|
||||
#endif
|
||||
*
|
||||
* -- LAPACK test routine (version 3.7.0) --
|
||||
* -- LAPACK is a software package provided by Univ. of Tennessee, --
|
||||
* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
|
||||
|
@ -1077,7 +1081,7 @@
|
|||
CHARACTER*80 LINE
|
||||
INTEGER I, I1, IC, INFO, ITMP, K, LENP, MAXTYP, NEWSD,
|
||||
$ NK, NN, NPARMS, NRHS, NTYPES,
|
||||
$ VERS_MAJOR, VERS_MINOR, VERS_PATCH
|
||||
$ VERS_MAJOR, VERS_MINOR, VERS_PATCH, N_THREADS
|
||||
REAL EPS, S1, S2, THRESH, THRSHN
|
||||
* ..
|
||||
* .. Local Arrays ..
|
||||
|
@ -1089,10 +1093,13 @@
|
|||
$ PVAL( MAXIN )
|
||||
INTEGER INMIN( MAXIN ), INWIN( MAXIN ), INIBL( MAXIN ),
|
||||
$ ISHFTS( MAXIN ), IACC22( MAXIN )
|
||||
REAL A( NMAX*NMAX, NEED ), B( NMAX*NMAX, 5 ),
|
||||
$ C( NCMAX*NCMAX, NCMAX*NCMAX ), D( NMAX, 12 ),
|
||||
$ RESULT( 500 ), TAUA( NMAX ), TAUB( NMAX ),
|
||||
$ WORK( LWORK ), X( 5*NMAX )
|
||||
REAL D( NMAX, 12 ), RESULT( 500 ), TAUA( NMAX ),
|
||||
$ TAUB( NMAX ), X( 5*NMAX )
|
||||
* ..
|
||||
* .. Allocatable Arrays ..
|
||||
INTEGER AllocateStatus
|
||||
REAL, DIMENSION(:), ALLOCATABLE :: WORK
|
||||
REAL, DIMENSION(:,:), ALLOCATABLE :: A, B, C
|
||||
* ..
|
||||
* .. External Functions ..
|
||||
LOGICAL LSAMEN
|
||||
|
@ -1134,6 +1141,17 @@
|
|||
DATA INTSTR / '0123456789' /
|
||||
DATA IOLDSD / 0, 0, 0, 1 /
|
||||
* ..
|
||||
* .. Allocate memory dynamically ..
|
||||
*
|
||||
ALLOCATE ( A(NMAX*NMAX,NEED), STAT = AllocateStatus )
|
||||
IF (AllocateStatus /= 0) STOP "*** Not enough memory ***"
|
||||
ALLOCATE ( B(NMAX*NMAX,5), STAT = AllocateStatus )
|
||||
IF (AllocateStatus /= 0) STOP "*** Not enough memory ***"
|
||||
ALLOCATE ( C(NCMAX*NCMAX,NCMAX*NCMAX), STAT = AllocateStatus )
|
||||
IF (AllocateStatus /= 0) STOP "*** Not enough memory ***"
|
||||
ALLOCATE ( WORK(LWORK), STAT = AllocateStatus )
|
||||
IF (AllocateStatus /= 0) STOP "*** Not enough memory ***"
|
||||
* ..
|
||||
* .. Executable Statements ..
|
||||
*
|
||||
A = 0.0
|
||||
|
@ -1857,8 +1875,16 @@
|
|||
CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT )
|
||||
CALL XLAENV( 1, 1 )
|
||||
CALL XLAENV( 9, 25 )
|
||||
IF( TSTERR )
|
||||
$ CALL SERRST( 'SST', NOUT )
|
||||
IF( TSTERR ) THEN
|
||||
#if defined(_OPENMP)
|
||||
N_THREADS = OMP_GET_NUM_THREADS()
|
||||
CALL OMP_SET_NUM_THREADS(1)
|
||||
#endif
|
||||
CALL SERRST( 'SST', NOUT )
|
||||
#if defined(_OPENMP)
|
||||
CALL OMP_SET_NUM_THREADS(N_THREADS)
|
||||
#endif
|
||||
END IF
|
||||
DO 290 I = 1, NPARMS
|
||||
CALL XLAENV( 1, NBVAL( I ) )
|
||||
CALL XLAENV( 2, NBMIN( I ) )
|
||||
|
@ -2440,6 +2466,11 @@
|
|||
WRITE( NOUT, FMT = 9994 )
|
||||
S2 = SECOND( )
|
||||
WRITE( NOUT, FMT = 9993 )S2 - S1
|
||||
*
|
||||
DEALLOCATE (A, STAT = AllocateStatus)
|
||||
DEALLOCATE (B, STAT = AllocateStatus)
|
||||
DEALLOCATE (C, STAT = AllocateStatus)
|
||||
DEALLOCATE (WORK, STAT = AllocateStatus)
|
||||
*
|
||||
9999 FORMAT( / ' Execution not attempted due to input errors' )
|
||||
9997 FORMAT( / / 1X, A3, ': NB =', I4, ', NBMIN =', I4, ', NX =', I4 )
|
|
@ -1034,6 +1034,10 @@
|
|||
* =====================================================================
|
||||
PROGRAM ZCHKEE
|
||||
*
|
||||
#if defined(_OPENMP)
|
||||
use omp_lib
|
||||
#endif
|
||||
*
|
||||
* -- LAPACK test routine (version 3.7.0) --
|
||||
* -- LAPACK is a software package provided by Univ. of Tennessee, --
|
||||
* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
|
||||
|
@ -1071,7 +1075,7 @@
|
|||
CHARACTER*80 LINE
|
||||
INTEGER I, I1, IC, INFO, ITMP, K, LENP, MAXTYP, NEWSD,
|
||||
$ NK, NN, NPARMS, NRHS, NTYPES,
|
||||
$ VERS_MAJOR, VERS_MINOR, VERS_PATCH
|
||||
$ VERS_MAJOR, VERS_MINOR, VERS_PATCH, N_THREADS
|
||||
DOUBLE PRECISION EPS, S1, S2, THRESH, THRSHN
|
||||
* ..
|
||||
* .. Local Arrays ..
|
||||
|
@ -1084,12 +1088,16 @@
|
|||
INTEGER INMIN( MAXIN ), INWIN( MAXIN ), INIBL( MAXIN ),
|
||||
$ ISHFTS( MAXIN ), IACC22( MAXIN )
|
||||
DOUBLE PRECISION ALPHA( NMAX ), BETA( NMAX ), DR( NMAX, 12 ),
|
||||
$ RESULT( 500 ), RWORK( LWORK ), S( NMAX*NMAX )
|
||||
COMPLEX*16 A( NMAX*NMAX, NEED ), B( NMAX*NMAX, 5 ),
|
||||
$ C( NCMAX*NCMAX, NCMAX*NCMAX ), DC( NMAX, 6 ),
|
||||
$ TAUA( NMAX ), TAUB( NMAX ), WORK( LWORK ),
|
||||
$ RESULT( 500 )
|
||||
COMPLEX*16 DC( NMAX, 6 ), TAUA( NMAX ), TAUB( NMAX ),
|
||||
$ X( 5*NMAX )
|
||||
* ..
|
||||
* .. Allocatable Arrays ..
|
||||
INTEGER AllocateStatus
|
||||
DOUBLE PRECISION, DIMENSION(:), ALLOCATABLE :: RWORK, S
|
||||
COMPLEX*16, DIMENSION(:), ALLOCATABLE :: WORK
|
||||
COMPLEX*16, DIMENSION(:,:), ALLOCATABLE :: A, B, C
|
||||
* ..
|
||||
* .. External Functions ..
|
||||
LOGICAL LSAMEN
|
||||
DOUBLE PRECISION DLAMCH, DSECND
|
||||
|
@ -1130,6 +1138,21 @@
|
|||
DATA INTSTR / '0123456789' /
|
||||
DATA IOLDSD / 0, 0, 0, 1 /
|
||||
* ..
|
||||
* .. Allocate memory dynamically ..
|
||||
*
|
||||
ALLOCATE ( S(NMAX*NMAX), STAT = AllocateStatus )
|
||||
IF (AllocateStatus /= 0) STOP "*** Not enough memory ***"
|
||||
ALLOCATE ( A(NMAX*NMAX,NEED), STAT = AllocateStatus )
|
||||
IF (AllocateStatus /= 0) STOP "*** Not enough memory ***"
|
||||
ALLOCATE ( B(NMAX*NMAX,5), STAT = AllocateStatus )
|
||||
IF (AllocateStatus /= 0) STOP "*** Not enough memory ***"
|
||||
ALLOCATE ( C(NCMAX*NCMAX,NCMAX*NCMAX), STAT = AllocateStatus )
|
||||
IF (AllocateStatus /= 0) STOP "*** Not enough memory ***"
|
||||
ALLOCATE ( RWORK(LWORK), STAT = AllocateStatus )
|
||||
IF (AllocateStatus /= 0) STOP "*** Not enough memory ***"
|
||||
ALLOCATE ( WORK(LWORK), STAT = AllocateStatus )
|
||||
IF (AllocateStatus /= 0) STOP "*** Not enough memory ***"
|
||||
* ..
|
||||
* .. Executable Statements ..
|
||||
*
|
||||
A = 0.0
|
||||
|
@ -1846,8 +1869,16 @@
|
|||
CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT )
|
||||
CALL XLAENV( 1, 1 )
|
||||
CALL XLAENV( 9, 25 )
|
||||
IF( TSTERR )
|
||||
$ CALL ZERRST( 'ZST', NOUT )
|
||||
IF( TSTERR ) THEN
|
||||
#if defined(_OPENMP)
|
||||
N_THREADS = OMP_GET_NUM_THREADS()
|
||||
CALL OMP_SET_NUM_THREADS(1)
|
||||
#endif
|
||||
CALL ZERRST( 'ZST', NOUT )
|
||||
#if defined(_OPENMP)
|
||||
CALL OMP_SET_NUM_THREADS(N_THREADS)
|
||||
#endif
|
||||
END IF
|
||||
DO 290 I = 1, NPARMS
|
||||
CALL XLAENV( 1, NBVAL( I ) )
|
||||
CALL XLAENV( 2, NBMIN( I ) )
|
||||
|
@ -2303,8 +2334,16 @@
|
|||
MAXTYP = 15
|
||||
NTYPES = MIN( MAXTYP, NTYPES )
|
||||
CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT )
|
||||
IF( TSTERR )
|
||||
$ CALL ZERRST( 'ZHB', NOUT )
|
||||
IF( TSTERR ) THEN
|
||||
#if defined(_OPENMP)
|
||||
N_THREADS = OMP_GET_NUM_THREADS()
|
||||
CALL OMP_SET_NUM_THREADS(1)
|
||||
#endif
|
||||
CALL ZERRST( 'ZHB', NOUT )
|
||||
#if defined(_OPENMP)
|
||||
CALL OMP_SET_NUM_THREADS(N_THREADS)
|
||||
#endif
|
||||
END IF
|
||||
* CALL ZCHKHB( NN, NVAL, NK, KVAL, MAXTYP, DOTYPE, ISEED, THRESH,
|
||||
* $ NOUT, A( 1, 1 ), NMAX, DR( 1, 1 ), DR( 1, 2 ),
|
||||
* $ A( 1, 2 ), NMAX, WORK, LWORK, RWORK, RESULT,
|
||||
|
@ -2435,6 +2474,13 @@
|
|||
WRITE( NOUT, FMT = 9994 )
|
||||
S2 = DSECND( )
|
||||
WRITE( NOUT, FMT = 9993 )S2 - S1
|
||||
*
|
||||
DEALLOCATE (S, STAT = AllocateStatus)
|
||||
DEALLOCATE (A, STAT = AllocateStatus)
|
||||
DEALLOCATE (B, STAT = AllocateStatus)
|
||||
DEALLOCATE (C, STAT = AllocateStatus)
|
||||
DEALLOCATE (RWORK, STAT = AllocateStatus)
|
||||
DEALLOCATE (WORK, STAT = AllocateStatus)
|
||||
*
|
||||
9999 FORMAT( / ' Execution not attempted due to input errors' )
|
||||
9997 FORMAT( / / 1X, A3, ': NB =', I4, ', NBMIN =', I4, ', NX =', I4 )
|
126
param.h
126
param.h
|
@ -72,6 +72,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#ifndef PARAM_H
|
||||
#define PARAM_H
|
||||
|
||||
#define LONGCAST (BLASLONG)
|
||||
#if defined(__BYTE_ORDER__)
|
||||
#if __GNUC__ < 9
|
||||
#undef LONGCAST
|
||||
#define LONGCAST
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#define SBGEMM_DEFAULT_UNROLL_N 4
|
||||
#define SBGEMM_DEFAULT_UNROLL_M 8
|
||||
#define SBGEMM_DEFAULT_UNROLL_MN 32
|
||||
|
@ -85,7 +93,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#define GEMM_DEFAULT_OFFSET_A 64
|
||||
#define GEMM_DEFAULT_OFFSET_B 256
|
||||
#define GEMM_DEFAULT_ALIGN 0x01ffffUL
|
||||
#define GEMM_DEFAULT_ALIGN (BLASLONG)0x01ffffUL
|
||||
|
||||
#define SGEMM_DEFAULT_UNROLL_N 4
|
||||
#define DGEMM_DEFAULT_UNROLL_N 4
|
||||
|
@ -157,7 +165,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#define GEMM_DEFAULT_OFFSET_A 64
|
||||
#define GEMM_DEFAULT_OFFSET_B 832
|
||||
#define GEMM_DEFAULT_ALIGN 0x0fffUL
|
||||
#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0fffUL
|
||||
|
||||
#define SGEMM_DEFAULT_UNROLL_N 4
|
||||
#define DGEMM_DEFAULT_UNROLL_N 4
|
||||
|
@ -237,7 +245,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#define GEMM_DEFAULT_OFFSET_A 64
|
||||
#define GEMM_DEFAULT_OFFSET_B 832
|
||||
#define GEMM_DEFAULT_ALIGN 0x0fffUL
|
||||
#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0fffUL
|
||||
|
||||
|
||||
|
||||
|
@ -330,7 +338,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#define GEMM_DEFAULT_OFFSET_A 64
|
||||
#define GEMM_DEFAULT_OFFSET_B 832
|
||||
#define GEMM_DEFAULT_ALIGN 0x0fffUL
|
||||
#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0fffUL
|
||||
|
||||
|
||||
|
||||
|
@ -422,7 +430,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#define GEMM_DEFAULT_OFFSET_A 64
|
||||
#define GEMM_DEFAULT_OFFSET_B 832
|
||||
#define GEMM_DEFAULT_ALIGN 0x0fffUL
|
||||
#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0fffUL
|
||||
|
||||
|
||||
|
||||
|
@ -515,7 +523,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#define GEMM_DEFAULT_OFFSET_A 64
|
||||
#define GEMM_DEFAULT_OFFSET_B 832
|
||||
#define GEMM_DEFAULT_ALIGN 0x0fffUL
|
||||
#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0fffUL
|
||||
|
||||
|
||||
|
||||
|
@ -607,7 +615,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#define GEMM_DEFAULT_OFFSET_A 0
|
||||
#define GEMM_DEFAULT_OFFSET_B 0
|
||||
#define GEMM_DEFAULT_ALIGN 0x03fffUL
|
||||
#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL
|
||||
|
||||
#define SYMV_P 8
|
||||
|
||||
|
@ -726,7 +734,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#define GEMM_DEFAULT_OFFSET_A 0
|
||||
#define GEMM_DEFAULT_OFFSET_B 384
|
||||
#define GEMM_DEFAULT_ALIGN 0x0ffffUL
|
||||
#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL
|
||||
|
||||
#define SGEMM_DEFAULT_UNROLL_N 4
|
||||
#define DGEMM_DEFAULT_UNROLL_N 4
|
||||
|
@ -774,7 +782,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#define GEMM_DEFAULT_OFFSET_A 0
|
||||
#define GEMM_DEFAULT_OFFSET_B 256
|
||||
#define GEMM_DEFAULT_ALIGN 0x0ffffUL
|
||||
#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL
|
||||
|
||||
#define SGEMM_DEFAULT_UNROLL_N 4
|
||||
#define DGEMM_DEFAULT_UNROLL_N 4
|
||||
|
@ -821,7 +829,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#define GEMM_DEFAULT_OFFSET_A 64
|
||||
#define GEMM_DEFAULT_OFFSET_B 256
|
||||
#define GEMM_DEFAULT_ALIGN 0x01ffffUL
|
||||
#define GEMM_DEFAULT_ALIGN (BLASLONG)0x01ffffUL
|
||||
|
||||
#ifdef ARCH_X86
|
||||
#define SGEMM_DEFAULT_UNROLL_N 4
|
||||
|
@ -890,7 +898,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#define GEMM_DEFAULT_OFFSET_A 0
|
||||
#define GEMM_DEFAULT_OFFSET_B 0
|
||||
#define GEMM_DEFAULT_ALIGN 0x0ffffUL
|
||||
#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL
|
||||
|
||||
#ifdef HAVE_SSE
|
||||
#define SGEMM_DEFAULT_UNROLL_M 8
|
||||
|
@ -945,7 +953,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#define GEMM_DEFAULT_OFFSET_A 0
|
||||
#define GEMM_DEFAULT_OFFSET_B 0
|
||||
#define GEMM_DEFAULT_ALIGN 0x0ffffUL
|
||||
#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL
|
||||
|
||||
#ifdef CORE_YONAH
|
||||
#define SGEMM_DEFAULT_UNROLL_M 4
|
||||
|
@ -1011,7 +1019,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define GEMM_DEFAULT_OFFSET_A 0
|
||||
#define GEMM_DEFAULT_OFFSET_B 32
|
||||
|
||||
#define GEMM_DEFAULT_ALIGN 0x0ffffUL
|
||||
#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL
|
||||
|
||||
#define SYMV_P 8
|
||||
|
||||
|
@ -1068,7 +1076,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define GEMM_DEFAULT_OFFSET_B 256
|
||||
#endif
|
||||
|
||||
#define GEMM_DEFAULT_ALIGN 0x0ffffUL
|
||||
#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL
|
||||
|
||||
#define SYMV_P 8
|
||||
|
||||
|
@ -1128,7 +1136,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#define GEMM_DEFAULT_OFFSET_A 448
|
||||
#define GEMM_DEFAULT_OFFSET_B 128
|
||||
#define GEMM_DEFAULT_ALIGN 0x03fffUL
|
||||
#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL
|
||||
|
||||
#define SYMV_P 8
|
||||
|
||||
|
@ -1201,7 +1209,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#define GEMM_DEFAULT_OFFSET_A 128
|
||||
#define GEMM_DEFAULT_OFFSET_B 0
|
||||
#define GEMM_DEFAULT_ALIGN 0x03fffUL
|
||||
#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL
|
||||
|
||||
#define SYMV_P 8
|
||||
|
||||
|
@ -1272,7 +1280,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#define GEMM_DEFAULT_OFFSET_A 128
|
||||
#define GEMM_DEFAULT_OFFSET_B 0
|
||||
#define GEMM_DEFAULT_ALIGN 0x03fffUL
|
||||
#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL
|
||||
|
||||
#define SYMV_P 8
|
||||
|
||||
|
@ -1344,7 +1352,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#define GEMM_DEFAULT_OFFSET_A 32
|
||||
#define GEMM_DEFAULT_OFFSET_B 0
|
||||
#define GEMM_DEFAULT_ALIGN 0x03fffUL
|
||||
#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL
|
||||
|
||||
#define SYMV_P 8
|
||||
|
||||
|
@ -1417,7 +1425,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#define GEMM_DEFAULT_OFFSET_A 0
|
||||
#define GEMM_DEFAULT_OFFSET_B 0
|
||||
#define GEMM_DEFAULT_ALIGN 0x03fffUL
|
||||
#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL
|
||||
|
||||
#define SYMV_P 8
|
||||
|
||||
|
@ -1510,7 +1518,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#define GEMM_DEFAULT_OFFSET_A 0
|
||||
#define GEMM_DEFAULT_OFFSET_B 0
|
||||
#define GEMM_DEFAULT_ALIGN 0x03fffUL
|
||||
#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL
|
||||
|
||||
#define SYMV_P 8
|
||||
|
||||
|
@ -1636,7 +1644,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#define GEMM_DEFAULT_OFFSET_A 0
|
||||
#define GEMM_DEFAULT_OFFSET_B 0
|
||||
#define GEMM_DEFAULT_ALIGN 0x03fffUL
|
||||
#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL
|
||||
|
||||
#define SYMV_P 8
|
||||
|
||||
|
@ -1877,7 +1885,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#define GEMM_DEFAULT_OFFSET_A 64
|
||||
#define GEMM_DEFAULT_OFFSET_B 0
|
||||
#define GEMM_DEFAULT_ALIGN 0x0ffffUL
|
||||
#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL
|
||||
|
||||
#define SYMV_P 8
|
||||
|
||||
|
@ -1939,7 +1947,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#define GEMM_DEFAULT_OFFSET_A 0
|
||||
#define GEMM_DEFAULT_OFFSET_B 128
|
||||
#define GEMM_DEFAULT_ALIGN 0x03fffUL
|
||||
#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL
|
||||
|
||||
#define SGEMM_DEFAULT_UNROLL_M 8
|
||||
#define SGEMM_DEFAULT_UNROLL_N 8
|
||||
|
@ -1993,7 +2001,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#define GEMM_DEFAULT_OFFSET_A 512
|
||||
#define GEMM_DEFAULT_OFFSET_B 512
|
||||
#define GEMM_DEFAULT_ALIGN 0x0ffffUL
|
||||
#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL
|
||||
|
||||
#define SGEMM_DEFAULT_UNROLL_M 4
|
||||
#define SGEMM_DEFAULT_UNROLL_N 4
|
||||
|
@ -2061,7 +2069,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#define GEMM_DEFAULT_OFFSET_A 0
|
||||
#define GEMM_DEFAULT_OFFSET_B 8192
|
||||
#define GEMM_DEFAULT_ALIGN 0x0ffffUL
|
||||
#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL
|
||||
|
||||
#define SGEMM_DEFAULT_UNROLL_M 16
|
||||
#define SGEMM_DEFAULT_UNROLL_N 4
|
||||
|
@ -2088,7 +2096,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#ifdef PPCG4
|
||||
#define GEMM_DEFAULT_OFFSET_A 0
|
||||
#define GEMM_DEFAULT_OFFSET_B 1024
|
||||
#define GEMM_DEFAULT_ALIGN 0x0ffffUL
|
||||
#define GEMM_DEFAULT_ALIGN LONGCAST 0x0ffffUL
|
||||
|
||||
#define SGEMM_DEFAULT_UNROLL_M 16
|
||||
#define SGEMM_DEFAULT_UNROLL_N 4
|
||||
|
@ -2119,7 +2127,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#define GEMM_DEFAULT_OFFSET_A 2688
|
||||
#define GEMM_DEFAULT_OFFSET_B 3072
|
||||
#define GEMM_DEFAULT_ALIGN 0x03fffUL
|
||||
#define GEMM_DEFAULT_ALIGN LONGCAST 0x03fffUL
|
||||
|
||||
#if defined(__BYTE_ORDER__)&&(__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
#define SGEMM_DEFAULT_UNROLL_M 4
|
||||
|
@ -2168,7 +2176,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#define GEMM_DEFAULT_OFFSET_A (32 * 0)
|
||||
#define GEMM_DEFAULT_OFFSET_B (32 * 0)
|
||||
#define GEMM_DEFAULT_ALIGN 0x0ffffUL
|
||||
#define GEMM_DEFAULT_ALIGN LONGCAST 0x0ffffUL
|
||||
|
||||
#define SGEMM_DEFAULT_UNROLL_M 4
|
||||
#define SGEMM_DEFAULT_UNROLL_N 4
|
||||
|
@ -2204,7 +2212,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#define GEMM_DEFAULT_OFFSET_A (32 * 0)
|
||||
#define GEMM_DEFAULT_OFFSET_B (32 * 0)
|
||||
#define GEMM_DEFAULT_ALIGN 0x0ffffUL
|
||||
#define GEMM_DEFAULT_ALIGN LONGCAST 0x0ffffUL
|
||||
|
||||
#define SGEMM_DEFAULT_UNROLL_M 8
|
||||
#define SGEMM_DEFAULT_UNROLL_N 4
|
||||
|
@ -2239,7 +2247,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#if defined(POWER3) || defined(POWER4) || defined(POWER5)
|
||||
#define GEMM_DEFAULT_OFFSET_A 0
|
||||
#define GEMM_DEFAULT_OFFSET_B 2048
|
||||
#define GEMM_DEFAULT_ALIGN 0x0ffffUL
|
||||
#define GEMM_DEFAULT_ALIGN LONGCAST 0x0ffffUL
|
||||
|
||||
#define SGEMM_DEFAULT_UNROLL_M 4
|
||||
#define SGEMM_DEFAULT_UNROLL_N 4
|
||||
|
@ -2312,7 +2320,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#define GEMM_DEFAULT_OFFSET_A 384
|
||||
#define GEMM_DEFAULT_OFFSET_B 1024
|
||||
#define GEMM_DEFAULT_ALIGN 0x03fffUL
|
||||
#define GEMM_DEFAULT_ALIGN LONGCAST 0x03fffUL
|
||||
|
||||
#define SGEMM_DEFAULT_UNROLL_M 4
|
||||
#define SGEMM_DEFAULT_UNROLL_N 4
|
||||
|
@ -2344,7 +2352,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#define GEMM_DEFAULT_OFFSET_A 0
|
||||
#define GEMM_DEFAULT_OFFSET_B 65536
|
||||
#define GEMM_DEFAULT_ALIGN 0x0ffffUL
|
||||
|
||||
#define GEMM_DEFAULT_ALIGN LONGCAST 0x0ffffUL
|
||||
#if defined(__32BIT__)
|
||||
#warning using BINARY32==POWER6
|
||||
#define SGEMM_DEFAULT_UNROLL_M 4
|
||||
|
@ -2397,7 +2406,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#define GEMM_DEFAULT_OFFSET_A 0
|
||||
#define GEMM_DEFAULT_OFFSET_B 65536
|
||||
#define GEMM_DEFAULT_ALIGN 0x0ffffUL
|
||||
#define GEMM_DEFAULT_ALIGN LONGCAST 0x0ffffUL
|
||||
|
||||
#define SWITCH_RATIO 16
|
||||
#define GEMM_PREFERED_SIZE 16
|
||||
|
||||
#define SGEMM_DEFAULT_UNROLL_M 16
|
||||
#define SGEMM_DEFAULT_UNROLL_N 8
|
||||
|
@ -2433,24 +2445,32 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#define GEMM_DEFAULT_OFFSET_A 0
|
||||
#define GEMM_DEFAULT_OFFSET_B 65536
|
||||
#define GEMM_DEFAULT_ALIGN 0x0ffffUL
|
||||
#define GEMM_DEFAULT_ALIGN LONGCAST 0x0ffffUL
|
||||
|
||||
#define SWITCH_RATIO 16
|
||||
#define GEMM_PREFERED_SIZE 16
|
||||
|
||||
#define SGEMM_DEFAULT_UNROLL_M 16
|
||||
#define SGEMM_DEFAULT_UNROLL_N 8
|
||||
#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
|
||||
#define DGEMM_DEFAULT_UNROLL_M 16
|
||||
#define DGEMM_DEFAULT_UNROLL_N 4
|
||||
#else
|
||||
#define DGEMM_DEFAULT_UNROLL_M 8
|
||||
#define DGEMM_DEFAULT_UNROLL_N 8
|
||||
#endif
|
||||
#define CGEMM_DEFAULT_UNROLL_M 8
|
||||
#define CGEMM_DEFAULT_UNROLL_N 4
|
||||
#define ZGEMM_DEFAULT_UNROLL_M 8
|
||||
#define ZGEMM_DEFAULT_UNROLL_N 2
|
||||
|
||||
#define SGEMM_DEFAULT_P 832
|
||||
#define DGEMM_DEFAULT_P 320
|
||||
#define SGEMM_DEFAULT_P 512
|
||||
#define DGEMM_DEFAULT_P 384
|
||||
#define CGEMM_DEFAULT_P 512
|
||||
#define ZGEMM_DEFAULT_P 256
|
||||
|
||||
#define SGEMM_DEFAULT_Q 1026
|
||||
#define DGEMM_DEFAULT_Q 960
|
||||
#define SGEMM_DEFAULT_Q 512
|
||||
#define DGEMM_DEFAULT_Q 512
|
||||
#define CGEMM_DEFAULT_Q 1026
|
||||
#define ZGEMM_DEFAULT_Q 1026
|
||||
|
||||
|
@ -2480,7 +2500,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#define GEMM_DEFAULT_OFFSET_A 0
|
||||
#define GEMM_DEFAULT_OFFSET_B 2048
|
||||
#define GEMM_DEFAULT_ALIGN 0x03fffUL
|
||||
#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL
|
||||
|
||||
#define SGEMM_DEFAULT_UNROLL_M 2
|
||||
#define SGEMM_DEFAULT_UNROLL_N 8
|
||||
|
@ -2512,7 +2532,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#define GEMM_DEFAULT_OFFSET_A 0
|
||||
#define GEMM_DEFAULT_OFFSET_B 2048
|
||||
#define GEMM_DEFAULT_ALIGN 0x03fffUL
|
||||
#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL
|
||||
|
||||
#define SGEMM_DEFAULT_UNROLL_M 4
|
||||
#define SGEMM_DEFAULT_UNROLL_N 4
|
||||
|
@ -2543,7 +2563,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#define GEMM_DEFAULT_OFFSET_A 0
|
||||
#define GEMM_DEFAULT_OFFSET_B 0
|
||||
#define GEMM_DEFAULT_ALIGN 0x03fffUL
|
||||
#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL
|
||||
|
||||
#define SGEMM_DEFAULT_UNROLL_M 2
|
||||
#define SGEMM_DEFAULT_UNROLL_N 8
|
||||
|
@ -2578,7 +2598,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#define GEMM_DEFAULT_OFFSET_A 0
|
||||
#define GEMM_DEFAULT_OFFSET_B 0
|
||||
#define GEMM_DEFAULT_ALIGN 0x03fffUL
|
||||
#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL
|
||||
|
||||
#ifdef HAVE_MSA
|
||||
#define SGEMM_DEFAULT_UNROLL_M 8
|
||||
|
@ -2634,7 +2654,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#define GEMM_DEFAULT_OFFSET_A 0
|
||||
#define GEMM_DEFAULT_OFFSET_B 0
|
||||
#define GEMM_DEFAULT_ALIGN 0x03fffUL
|
||||
#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL
|
||||
|
||||
#define SGEMM_DEFAULT_UNROLL_M 8
|
||||
#define SGEMM_DEFAULT_UNROLL_N 4
|
||||
|
@ -2675,7 +2695,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#define GEMM_DEFAULT_OFFSET_A 0
|
||||
#define GEMM_DEFAULT_OFFSET_B 0
|
||||
#define GEMM_DEFAULT_ALIGN 0x03fffUL
|
||||
#define GEMM_DEFAULT_ALIGN (BLASLONG) 0x03fffUL
|
||||
|
||||
#ifdef HAVE_MSA
|
||||
#define SGEMM_DEFAULT_UNROLL_M 8
|
||||
|
@ -2724,7 +2744,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#ifdef RISCV64_GENERIC
|
||||
#define GEMM_DEFAULT_OFFSET_A 0
|
||||
#define GEMM_DEFAULT_OFFSET_B 0
|
||||
#define GEMM_DEFAULT_ALIGN 0x03fffUL
|
||||
#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL
|
||||
|
||||
#define SGEMM_DEFAULT_UNROLL_M 2
|
||||
#define SGEMM_DEFAULT_UNROLL_N 2
|
||||
|
@ -2805,7 +2825,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#define GEMM_DEFAULT_OFFSET_A 0
|
||||
#define GEMM_DEFAULT_OFFSET_B 0
|
||||
#define GEMM_DEFAULT_ALIGN 0x03fffUL
|
||||
#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL
|
||||
|
||||
#define SGEMM_DEFAULT_UNROLL_M 4
|
||||
#define SGEMM_DEFAULT_UNROLL_N 4
|
||||
|
@ -2846,7 +2866,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#define GEMM_DEFAULT_OFFSET_A 0
|
||||
#define GEMM_DEFAULT_OFFSET_B 0
|
||||
#define GEMM_DEFAULT_ALIGN 0x03fffUL
|
||||
#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL
|
||||
|
||||
#define SGEMM_DEFAULT_UNROLL_M 4
|
||||
#define SGEMM_DEFAULT_UNROLL_N 2
|
||||
|
@ -3121,7 +3141,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d
|
|||
|
||||
#define GEMM_DEFAULT_OFFSET_A 0
|
||||
#define GEMM_DEFAULT_OFFSET_B 0
|
||||
#define GEMM_DEFAULT_ALIGN 0x03fffUL
|
||||
#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL
|
||||
|
||||
#define SGEMM_DEFAULT_UNROLL_M 2
|
||||
#define SGEMM_DEFAULT_UNROLL_N 2
|
||||
|
@ -3162,7 +3182,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d
|
|||
|
||||
#define GEMM_DEFAULT_OFFSET_A 0
|
||||
#define GEMM_DEFAULT_OFFSET_B 0
|
||||
#define GEMM_DEFAULT_ALIGN 0x03fffUL
|
||||
#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL
|
||||
|
||||
#define SGEMM_DEFAULT_UNROLL_M 4
|
||||
#define SGEMM_DEFAULT_UNROLL_N 4
|
||||
|
@ -3203,7 +3223,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d
|
|||
|
||||
#define GEMM_DEFAULT_OFFSET_A 0
|
||||
#define GEMM_DEFAULT_OFFSET_B 0
|
||||
#define GEMM_DEFAULT_ALIGN 0x03fffUL
|
||||
#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL
|
||||
|
||||
#define SGEMM_DEFAULT_UNROLL_M 4
|
||||
#define SGEMM_DEFAULT_UNROLL_N 4
|
||||
|
@ -3244,7 +3264,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d
|
|||
|
||||
#define GEMM_DEFAULT_OFFSET_A 0
|
||||
#define GEMM_DEFAULT_OFFSET_B 0
|
||||
#define GEMM_DEFAULT_ALIGN 0x03fffUL
|
||||
#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL
|
||||
|
||||
#define SGEMM_DEFAULT_UNROLL_M 2
|
||||
#define SGEMM_DEFAULT_UNROLL_N 2
|
||||
|
@ -3283,7 +3303,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d
|
|||
|
||||
#define GEMM_DEFAULT_OFFSET_A 0
|
||||
#define GEMM_DEFAULT_OFFSET_B 0
|
||||
#define GEMM_DEFAULT_ALIGN 0x03fffUL
|
||||
#define GEMM_DEFAULT_ALIGN (BLASLONG)0x03fffUL
|
||||
|
||||
#define SGEMM_DEFAULT_UNROLL_M 8
|
||||
#define SGEMM_DEFAULT_UNROLL_N 4
|
||||
|
@ -3365,7 +3385,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d
|
|||
|
||||
#define GEMM_DEFAULT_OFFSET_A 0
|
||||
#define GEMM_DEFAULT_OFFSET_B 0
|
||||
#define GEMM_DEFAULT_ALIGN 0x0ffffUL
|
||||
#define GEMM_DEFAULT_ALIGN (BLASLONG)0x0ffffUL
|
||||
|
||||
#define SGEMM_DEFAULT_UNROLL_N 2
|
||||
#define DGEMM_DEFAULT_UNROLL_N 2
|
||||
|
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue