Merge branch 'develop' into release-0.3.0

This commit is contained in:
Martin Kroeker 2023-03-26 23:34:17 +02:00 committed by GitHub
commit e46971b9d5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2603 changed files with 155622 additions and 112440 deletions

View File

@ -2,6 +2,9 @@ name: continuous build
on: [push, pull_request]
permissions:
contents: read # to fetch code (actions/checkout)
jobs:
build:
runs-on: ${{ matrix.os }}
@ -34,7 +37,7 @@ jobs:
- name: Install Dependencies
run: |
if [ "$RUNNER_OS" == "Linux" ]; then
sudo apt-get install -y gfortran cmake ccache
sudo apt-get install -y gfortran cmake ccache libtinfo5
elif [ "$RUNNER_OS" == "macOS" ]; then
# It looks like "gfortran" isn't working correctly unless "gcc" is re-installed.
brew reinstall gcc
@ -150,6 +153,7 @@ jobs:
matrix:
msystem: [MINGW64, MINGW32, CLANG64]
idx: [int32, int64]
build-type: [Release]
include:
- msystem: MINGW64
idx: int32
@ -173,6 +177,11 @@ jobs:
idx64-flags: -DBINARY=64 -DINTERFACE64=1
target-prefix: mingw-w64-clang-x86_64
c-lapack-flags: -DC_LAPACK=ON
- msystem: MINGW64
idx: int32
target-prefix: mingw-w64-x86_64
fc-pkg: mingw-w64-x86_64-gcc-fortran
build-type: None
exclude:
- msystem: MINGW32
idx: int64
@ -215,11 +224,11 @@ jobs:
path: C:/msys64/home/runneradmin/.ccache
# We include the commit sha in the cache key, as new cache entries are
# only created if there is no existing entry for the key yet.
key: ccache-msys2-${{ matrix.msystem }}-${{ matrix.idx }}-${{ github.ref }}-${{ github.sha }}
key: ccache-msys2-${{ matrix.msystem }}-${{ matrix.idx }}-${{ matrix.build-type }}-${{ github.ref }}-${{ github.sha }}
# Restore a matching ccache cache entry. Prefer same branch.
restore-keys: |
ccache-msys2-${{ matrix.msystem }}-${{ matrix.idx }}-${{ github.ref }}
ccache-msys2-${{ matrix.msystem }}-${{ matrix.idx }}
ccache-msys2-${{ matrix.msystem }}-${{ matrix.idx }}-${{ matrix.build-type }}-${{ github.ref }}
ccache-msys2-${{ matrix.msystem }}-${{ matrix.idx }}-${{ matrix.build-type }}
- name: Configure ccache
# Limit the maximum size and switch on compression to avoid exceeding the total disk or cache quota.
@ -235,7 +244,8 @@ jobs:
- name: Configure OpenBLAS
run: |
mkdir build && cd build
cmake -DBUILD_SHARED_LIBS=ON \
cmake -DCMAKE_BUILD_TYPE=${{ matrix.build-type }} \
-DBUILD_SHARED_LIBS=ON \
-DBUILD_STATIC_LIBS=ON \
-DDYNAMIC_ARCH=ON \
-DUSE_THREAD=ON \
@ -257,3 +267,54 @@ jobs:
- name: Run tests
timeout-minutes: 60
run: cd build && ctest
cross_build:
runs-on: ubuntu-22.04
strategy:
fail-fast: false
matrix:
include:
- target: mips64el
triple: mips64el-linux-gnuabi64
opts: DYNAMIC_ARCH=1 TARGET=GENERIC
- target: riscv64
triple: riscv64-linux-gnu
opts: TARGET=RISCV64_GENERIC
- target: mipsel
triple: mipsel-linux-gnu
opts: TARGET=MIPS1004K
- target: alpha
triple: alpha-linux-gnu
opts: TARGET=EV4
steps:
- name: Checkout repository
uses: actions/checkout@v3
- name: Install Dependencies
run: |
sudo apt-get install -y ccache gcc-${{ matrix.triple }} gfortran-${{ matrix.triple }} libgomp1-${{ matrix.target }}-cross
- name: Compilation cache
uses: actions/cache@v3
with:
path: ~/.ccache
key: ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }}-${{ github.sha }}
restore-keys: |
ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }}
ccache-${{ runner.os }}-${{ matrix.target }}
- name: Configure ccache
run: |
# Limit the maximum size and switch on compression to avoid exceeding the total disk or cache quota (5 GB).
test -d ~/.ccache || mkdir -p ~/.ccache
echo "max_size = 300M" > ~/.ccache/ccache.conf
echo "compression = true" >> ~/.ccache/ccache.conf
ccache -s
- name: Build OpenBLAS
run: |
make -j$(nproc) HOSTCC="ccache gcc" CC="ccache ${{ matrix.triple }}-gcc" FC="ccache ${{ matrix.triple }}-gfortran" ARCH=${{ matrix.target }} ${{ matrix.opts }}

117
.github/workflows/mips64.yml vendored Normal file
View File

@ -0,0 +1,117 @@
name: mips64 qemu test
on: [push, pull_request]
permissions:
contents: read # to fetch code (actions/checkout)
jobs:
TEST:
runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
include:
- target: MIPS64_GENERIC
triple: mips64el-linux-gnuabi64
opts: NO_SHARED=1 TARGET=MIPS64_GENERIC
- target: SICORTEX
triple: mips64el-linux-gnuabi64
opts: NO_SHARED=1 TARGET=SICORTEX
- target: I6400
triple: mipsisa64r6el-linux-gnuabi64
opts: NO_SHARED=1 TARGET=I6400
- target: P6600
triple: mipsisa64r6el-linux-gnuabi64
opts: NO_SHARED=1 TARGET=P6600
- target: I6500
triple: mipsisa64r6el-linux-gnuabi64
opts: NO_SHARED=1 TARGET=I6500
steps:
- name: Checkout repository
uses: actions/checkout@v3
- name: install build deps
run: |
sudo apt-get update
sudo apt-get install autoconf automake autotools-dev ninja-build make ccache \
gcc-${{ matrix.triple }} gfortran-${{ matrix.triple }} libgomp1-mips64el-cross
- name: checkout qemu
uses: actions/checkout@v3
with:
repository: qemu/qemu
path: qemu
ref: 79dfa177ae348bb5ab5f97c0915359b13d6186e2
- name: build qemu
run: |
cd qemu
./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=mips64el-linux-user --disable-system
make -j$(nproc)
make install
- name: Compilation cache
uses: actions/cache@v3
with:
path: ~/.ccache
key: ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }}-${{ github.sha }}
restore-keys: |
ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }}
ccache-${{ runner.os }}-${{ matrix.target }}
- name: Configure ccache
run: |
test -d ~/.ccache || mkdir -p ~/.ccache
echo "max_size = 300M" > ~/.ccache/ccache.conf
echo "compression = true" >> ~/.ccache/ccache.conf
ccache -s
- name: build OpenBLAS
run: make CC='ccache ${{ matrix.triple }}-gcc -static' FC='ccache ${{ matrix.triple }}-gfortran -static' ${{ matrix.opts }} HOSTCC='ccache gcc' -j$(nproc)
- name: test
run: |
export PATH=$GITHUB_WORKSPACE/qemu-install/bin/:$PATH
qemu-mips64el ./utest/openblas_utest
OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xscblat1
OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xdcblat1
OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xccblat1
OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xzcblat1
OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xscblat2 < ./ctest/sin2
OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xdcblat2 < ./ctest/din2
OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xccblat2 < ./ctest/cin2
OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xzcblat2 < ./ctest/zin2
OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xscblat3 < ./ctest/sin3
OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xdcblat3 < ./ctest/din3
OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xccblat3 < ./ctest/cin3
OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xzcblat3 < ./ctest/zin3
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/sblat1
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/dblat1
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/cblat1
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/zblat1
OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/sblat1
OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/dblat1
OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/cblat1
OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/zblat1
rm -f ./test/?BLAT2.SUMM
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/sblat2 < ./test/sblat2.dat
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/dblat2 < ./test/dblat2.dat
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/cblat2 < ./test/cblat2.dat
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/zblat2 < ./test/zblat2.dat
rm -f ./test/?BLAT2.SUMM
OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/sblat2 < ./test/sblat2.dat
OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/dblat2 < ./test/dblat2.dat
OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/cblat2 < ./test/cblat2.dat
OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/zblat2 < ./test/zblat2.dat
rm -f ./test/?BLAT3.SUMM
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/sblat3 < ./test/sblat3.dat
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/dblat3 < ./test/dblat3.dat
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/cblat3 < ./test/cblat3.dat
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/zblat3 < ./test/zblat3.dat
rm -f ./test/?BLAT3.SUMM
OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/sblat3 < ./test/sblat3.dat
OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/dblat3 < ./test/dblat3.dat
OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/cblat3 < ./test/cblat3.dat
OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/zblat3 < ./test/zblat3.dat

View File

@ -17,6 +17,10 @@ on:
# it only makes sense to test if this file has been changed
name: Nightly-Homebrew-Build
permissions:
contents: read # to fetch code (actions/checkout)
jobs:
build-OpenBLAS-with-Homebrew:
runs-on: macos-latest
@ -28,6 +32,8 @@ jobs:
HOMEBREW_NO_AUTO_UPDATE: "ON"
HOMEBREW_NO_BOTTLE_SOURCE_FALLBACK: "ON"
HOMEBREW_NO_INSTALL_CLEANUP: "ON"
HOMEBREW_NO_INSTALLED_DEPENDENTS_CHECK: "ON"
HOMEBREW_NO_INSTALL_FROM_API: "ON"
steps:
- name: Random delay for cron job

View File

@ -30,7 +30,7 @@ matrix:
before_script: &common-before
- COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=POWER8 NUM_THREADS=32"
script:
- travis_wait 20 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
- travis_wait 50 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
- make -C test $COMMON_FLAGS $BTYPE
- make -C ctest $COMMON_FLAGS $BTYPE
- make -C utest $COMMON_FLAGS $BTYPE
@ -104,7 +104,7 @@ matrix:
- sudo apt-get update
- sudo apt-get install gcc-9 gfortran-9 -y
script:
- travis_wait 20 make QUIET_MAKE=1 BINARY=64 USE_OPENMP=1 CC=gcc-9 FC=gfortran-9
- travis_wait 50 make QUIET_MAKE=1 BINARY=64 USE_OPENMP=1 CC=gcc-9 FC=gfortran-9
- make -C test $COMMON_FLAGS $BTYPE
- make -C ctest $COMMON_FLAGS $BTYPE
- make -C utest $COMMON_FLAGS $BTYPE
@ -121,7 +121,7 @@ matrix:
- sudo apt-get update
- sudo apt-get install gcc-9 gfortran-9 -y
script:
- travis_wait 20 make QUIET_MAKE=1 BUILD_BFLOAT16=1 BINARY=64 USE_OPENMP=1 CC=gcc-9 FC=gfortran-9
- travis_wait 50 make QUIET_MAKE=1 BUILD_BFLOAT16=1 BINARY=64 USE_OPENMP=1 CC=gcc-9 FC=gfortran-9
- make -C test $COMMON_FLAGS $BTYPE
- make -C ctest $COMMON_FLAGS $BTYPE
- make -C utest $COMMON_FLAGS $BTYPE
@ -285,6 +285,25 @@ matrix:
- gfortran
script:
- travis_wait 45 make && make lapack-test
env:
- TARGET_BOX=NEOVERSE_N1
- &test-neon1-gcc8
os: linux
arch: arm64
dist: focal
group: edge
virt: lxd
compiler: gcc
addons:
apt:
packages:
- gcc-8
- gfortran-8
script:
- travis_wait 45 make QUIET_MAKE=1 CC=gcc-8 FC=gfortran-8 DYNAMIC_ARCH=1
env:
- TARGET_BOX=NEOVERSE_N1-GCC8
# whitelist
branches:

View File

@ -8,7 +8,7 @@ project(OpenBLAS C ASM)
set(OpenBLAS_MAJOR_VERSION 0)
set(OpenBLAS_MINOR_VERSION 3)
set(OpenBLAS_PATCH_VERSION 21)
set(OpenBLAS_PATCH_VERSION 22)
set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
@ -36,6 +36,8 @@ option(USE_LOCKING "Use locks even in single-threaded builds to make them callab
option(USE_PERL "Use the older PERL scripts for build preparation instead of universal shell scripts" OFF)
option(NO_WARMUP "Do not run a benchmark on each startup just to find the best location for the memory buffer" ON)
if(${CMAKE_SYSTEM_NAME} MATCHES "Linux")
option(NO_AFFINITY "Disable support for CPU affinity masks to avoid binding processes from e.g. R or numpy/scipy to a single core" ON)
else()
@ -212,10 +214,10 @@ if(NOT NO_LAPACKE)
add_library(LAPACKE OBJECT ${LAPACKE_SOURCES})
list(APPEND TARGET_OBJS "$<TARGET_OBJECTS:LAPACKE>")
endif()
if(BUILD_RELAPACK)
add_library(RELAPACK OBJECT ${RELA_SOURCES})
list(APPEND TARGET_OBJS "$<TARGET_OBJECTS:RELAPACK>")
endif()
#if(BUILD_RELAPACK)
# add_library(RELAPACK OBJECT ${RELA_SOURCES})
# list(APPEND TARGET_OBJS "$<TARGET_OBJECTS:RELAPACK>")
#endif()
set(OpenBLAS_LIBS "")
if(BUILD_STATIC_LIBS)
add_library(${OpenBLAS_LIBNAME}_static STATIC ${TARGET_OBJS} ${OpenBLAS_DEF_FILE})
@ -236,7 +238,7 @@ endif()
set_target_properties(${OpenBLAS_LIBS} PROPERTIES OUTPUT_NAME ${OpenBLAS_LIBNAME})
# Android needs to explicitly link against libm
if(ANDROID)
if (${CMAKE_SYSTEM_NAME} MATCHES "AIX|Android|Linux|FreeBSD|OpenBSD|NetBSD|DragonFly|Darwin")
if(BUILD_STATIC_LIBS)
target_link_libraries(${OpenBLAS_LIBNAME}_static m)
endif()
@ -396,7 +398,7 @@ if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFFIX} STREQUAL "")
message(STATUS "adding suffix ${SYMBOLSUFFIX} to names of exported symbols in ${OpenBLAS_LIBNAME}")
endif()
if (NOT DEFINED USE_PERL)
if (NOT USE_PERL)
add_custom_command(TARGET ${OpenBLAS_LIBNAME}_shared POST_BUILD
COMMAND ${PROJECT_SOURCE_DIR}/exports/gensymbol "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BUILD_LAPACK_DEPRECATED}" "${BUILD_BFLOAT16}" "${BUILD_SINGLE}" "${BUILD_DOUBLE}" "${BUILD_COMPLEX}" "${BUILD_COMPLEX16}" > ${PROJECT_BINARY_DIR}/objcopy.def
COMMAND objcopy -v --redefine-syms ${PROJECT_BINARY_DIR}/objcopy.def ${PROJECT_BINARY_DIR}/lib/lib${OpenBLAS_LIBNAME}.so

View File

@ -211,4 +211,5 @@ In chronological order:
* PLCT Lab, Institute of Software Chinese Academy of Sciences
* [2022-03] Support RISC-V Vector Intrinisc 1.0 version.
* Pablo Romero <https://github.com/pablorcum>
* [2022-08] Fix building from sources for QNX

View File

@ -80,7 +80,7 @@
SUN
Fujitsu
4. Suported precision
4. Supported precision
Now x86/x86_64 version support 80bit FP precision in addition to
normal double presicion and single precision. Currently only

View File

@ -110,6 +110,10 @@ ifeq ($(OSNAME), Darwin)
@echo "\"make PREFIX=/your_installation_path/ install\"."
@echo
@echo "(or set PREFIX in Makefile.rule and run make install."
@echo
@echo "Note that any flags passed to make during build should also be passed to make install"
@echo "to circumvent any install errors."
@echo
@echo "If you want to move the .dylib to a new location later, make sure you change"
@echo "the internal name of the dylib with:"
@echo
@ -118,8 +122,11 @@ endif
@echo
@echo "To install the library, you can run \"make PREFIX=/path/to/your/installation install\"."
@echo
@echo "Note that any flags passed to make during build should also be passed to make install"
@echo "to circumvent any install errors."
@echo
shared :
shared : libs netlib $(RELA)
ifneq ($(NO_SHARED), 1)
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku FreeBSD DragonFly))
@$(MAKE) -C exports so
@ -143,7 +150,7 @@ ifeq ($(OSNAME), CYGWIN_NT)
endif
endif
tests :
tests : libs netlib $(RELA) shared
ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
touch $(LIBNAME)
ifndef NO_FBLAS
@ -271,7 +278,11 @@ prof_lapack : lapack_prebuild
lapack_prebuild :
ifeq ($(NO_LAPACK), $(filter 0,$(NO_LAPACK)))
-@echo "FC = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc
ifeq ($(F_COMPILER), GFORTRAN)
-@echo "override FFLAGS = $(LAPACK_FFLAGS) -fno-tree-vectorize" >> $(NETLIB_LAPACK_DIR)/make.inc
else
-@echo "override FFLAGS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
endif
-@echo "FFLAGS_DRV = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "POPTS = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
-@echo "FFLAGS_NOOPT = -O0 $(LAPACK_NOOPT)" >> $(NETLIB_LAPACK_DIR)/make.inc

5089
Makefile.L3 Normal file

File diff suppressed because it is too large Load Diff

View File

@ -1,42 +1,24 @@
CPP = $(CC) -E
RANLIB = ranlib
ifeq ($(LIBSUBARCH), EV4)
LIBNAME = $(LIBPREFIX)_ev4.a
LIBNAME_P = $(LIBPREFIX)_ev4_p.a
endif
ifeq ($(LIBSUBARCH), EV5)
LIBNAME = $(LIBPREFIX)_ev5.a
LIBNAME_P = $(LIBPREFIX)_ev5_p.a
endif
ifeq ($(LIBSUBARCH), EV6)
LIBNAME = $(LIBPREFIX)_ev6.a
LIBNAME_P = $(LIBPREFIX)_ev6_p.a
endif
ifneq ($(COMPILER), NATIVE)
# GCC User
ifeq ($(LIBSUBARCH), EV4)
OPTION += -DEV4 -mcpu=ev4
ifeq ($(CORE), EV4)
CCOMMON_OPT += -mcpu=ev4
endif
ifeq ($(LIBSUBARCH), EV5)
OPTION += -DEV5 -mcpu=ev5
ifeq ($(CORE), EV5)
CCOMMON_OPT += -mcpu=ev5
endif
ifeq ($(LIBSUBARCH), EV6)
OPTION += -DEV6 -mcpu=ev6
ifeq ($(CORE), EV6)
CCOMMON_OPT += -mcpu=ev6
endif
else
# Compaq Compiler User
ifeq ($(LIBSUBARCH), EV4)
OPTION += -DEV4 -tune ev4 -arch ev4
ifeq ($(CORE), EV4)
CCOMMON_OPT += -tune ev4 -arch ev4
endif
ifeq ($(LIBSUBARCH), EV5)
OPTION += -DEV5 -tune ev5 -arch ev5
ifeq ($(CORE), EV5)
CCOMMON_OPT += -tune ev5 -arch ev5
endif
ifeq ($(LIBSUBARCH), EV6)
OPTION += -DEV6 -tune ev6 -arch ev6
ifeq ($(CORE), EV6)
CCOMMON_OPT += -tune ev6 -arch ev6
endif
endif

View File

@ -89,17 +89,17 @@ endif
endif
# Use a72 tunings because Neoverse-V1 is only available
# in GCC>=9.4
# in GCC>=10.4
ifeq ($(CORE), NEOVERSEV1)
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG)))
ifeq ($(GCCVERSIONGTEQ9), 1)
ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ10)))
CCOMMON_OPT += -march=armv8.4-a -mtune=neoverse-v1
ifeq ($(GCCVERSIONGTEQ10), 1)
ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ11)))
CCOMMON_OPT += -march=armv8.4-a+sve -mtune=neoverse-v1
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=armv8.4-a -mtune=neoverse-v1
endif
else
CCOMMON_OPT += -march=armv8.4-a -mtune=native
CCOMMON_OPT += -march=armv8.4-a+sve -mtune=native
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=armv8.4-a -mtune=native
endif
@ -119,17 +119,21 @@ endif
endif
# Use a72 tunings because Neoverse-N2 is only available
# in GCC>=9.4
# in GCC>=10.4
ifeq ($(CORE), NEOVERSEN2)
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG)))
ifeq ($(GCCVERSIONGTEQ9), 1)
ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ10)))
ifeq ($(GCCVERSIONGTEQ10), 1)
ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ11)))
ifneq ($(OSNAME), Darwin)
CCOMMON_OPT += -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2
else
CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72
endif
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2
endif
else
CCOMMON_OPT += -march=armv8.5-a -mtune=native
CCOMMON_OPT += -march=armv8.5-a+sve -mtune=native
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=armv8.5-a -mtune=native
endif

View File

@ -14,6 +14,11 @@ OPENBLAS_CMAKE_CONFIG := OpenBLASConfig.cmake
OPENBLAS_CMAKE_CONFIG_VERSION := OpenBLASConfigVersion.cmake
OPENBLAS_PKGCONFIG_DIR := $(OPENBLAS_LIBRARY_DIR)/pkgconfig
PKG_EXTRALIB := $(EXTRALIB)
ifeq ($(INTERFACE64),1)
SUFFIX64=64
endif
PKGFILE="$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE)$(SUFFIX64).pc"
ifeq ($(USE_OPENMP), 1)
ifeq ($(C_COMPILER), PGI)
PKG_EXTRALIB += -lomp
@ -150,13 +155,19 @@ endif
endif
#Generating openblas.pc
@echo Generating $(LIBSONAMEBASE).pc in "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)"
@echo 'libdir='$(OPENBLAS_LIBRARY_DIR) > "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE).pc"
@echo 'includedir='$(OPENBLAS_INCLUDE_DIR) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE).pc"
@echo 'openblas_config= USE_64BITINT='$(USE_64BITINT) 'DYNAMIC_ARCH='$(DYNAMIC_ARCH) 'DYNAMIC_OLDER='$(DYNAMIC_OLDER) 'NO_CBLAS='$(NO_CBLAS) 'NO_LAPACK='$(NO_LAPACK) 'NO_LAPACKE='$(NO_LAPACKE) 'NO_AFFINITY='$(NO_AFFINITY) 'USE_OPENMP='$(USE_OPENMP) $(CORE) 'MAX_THREADS='$(NUM_THREADS)>> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE).pc"
@echo 'version='$(VERSION) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE).pc"
@echo 'extralib='$(PKG_EXTRALIB) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE).pc"
@cat openblas.pc.in >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE).pc"
ifeq ($(INTERFACE64),1)
SUFFIX64=64
endif
PKGFILE="$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE)$(SUFFIX64).pc"
@echo Generating $(LIBSONAMEBASE)$(SUFFIX64).pc in "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)"
@echo 'libdir='$(OPENBLAS_LIBRARY_DIR) > "$(PKGFILE)"
@echo 'libsuffix='$(SYMBOLSUFFIX) >> "$(PKGFILE)"
@echo 'includedir='$(OPENBLAS_INCLUDE_DIR) >> "$(PKGFILE)"
@echo 'openblas_config= USE_64BITINT='$(INTERFACE64) 'DYNAMIC_ARCH='$(DYNAMIC_ARCH) 'DYNAMIC_OLDER='$(DYNAMIC_OLDER) 'NO_CBLAS='$(NO_CBLAS) 'NO_LAPACK='$(NO_LAPACK) 'NO_LAPACKE='$(NO_LAPACKE) 'NO_AFFINITY='$(NO_AFFINITY) 'USE_OPENMP='$(USE_OPENMP) $(CORE) 'MAX_THREADS='$(NUM_THREADS)>> "$(PKGFILE)"
@echo 'version='$(VERSION) >> "$(PKGFILE)"
@echo 'extralib='$(PKG_EXTRALIB) >> "$(PKGFILE)"
@cat openblas.pc.in >> "$(PKGFILE)"
#Generating OpenBLASConfig.cmake

View File

@ -1,3 +1,4 @@
MSA_FLAGS = -mmsa -mfp64 -mload-store-pairs
ifdef BINARY64
else
endif

View File

@ -1,3 +1,4 @@
MSA_FLAGS = -mmsa -mfp64 -mload-store-pairs
ifdef BINARY64
else
endif

View File

@ -60,9 +60,9 @@ all: getarch_2nd
./getarch_2nd 1 >> $(TARGET_CONF)
$(TARGET_CONF): c_check$(SCRIPTSUFFIX) f_check$(SCRIPTSUFFIX) getarch
./c_check$(SCRIPTSUFFIX) $(TARGET_MAKE) $(TARGET_CONF) $(CC) $(TARGET_FLAGS) $(CFLAGS)
./c_check$(SCRIPTSUFFIX) $(TARGET_MAKE) $(TARGET_CONF) "$(CC)" $(TARGET_FLAGS) $(CFLAGS)
ifneq ($(ONLY_CBLAS), 1)
./f_check$(SCRIPTSUFFIX) $(TARGET_MAKE) $(TARGET_CONF) $(FC) $(TARGET_FLAGS)
./f_check$(SCRIPTSUFFIX) $(TARGET_MAKE) $(TARGET_CONF) "$(FC)" $(TARGET_FLAGS)
else
#When we only build CBLAS, we set NOFORTRAN=2
echo "NOFORTRAN=2" >> $(TARGET_MAKE)
@ -77,8 +77,8 @@ endif
getarch : getarch.c cpuid.S dummy $(CPUIDEMU)
avx512=$$(./c_check$(SCRIPTSUFFIX) - - $(CC) $(TARGET_FLAGS) $(CFLAGS) | grep NO_AVX512); \
rv64gv=$$(./c_check$(SCRIPTSUFFIX) - - $(CC) $(TARGET_FLAGS) $(CFLAGS) | grep NO_RV64GV); \
avx512=$$(./c_check$(SCRIPTSUFFIX) - - "$(CC)" $(TARGET_FLAGS) $(CFLAGS) | grep NO_AVX512); \
rv64gv=$$(./c_check$(SCRIPTSUFFIX) - - "$(CC)" $(TARGET_FLAGS) $(CFLAGS) | grep NO_RV64GV); \
$(HOSTCC) $(HOST_CFLAGS) $(EXFLAGS) $${avx512:+-D$${avx512}} $${rv64gv:+-D$${rv64gv}} -o $(@F) getarch.c cpuid.S $(CPUIDEMU)
getarch_2nd : getarch_2nd.c $(TARGET_CONF) dummy

View File

@ -3,7 +3,7 @@
#
# This library's version
VERSION = 0.3.21
VERSION = 0.3.22
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
@ -131,6 +131,9 @@ BUILD_LAPACK_DEPRECATED = 1
# Build RecursiveLAPACK on top of LAPACK
# BUILD_RELAPACK = 1
# Have RecursiveLAPACK actually replace standard LAPACK routines instead of
# just adding its equivalents with a RELAPACK_ prefix
# RELAPACK_REPLACE = 1
# If you want to use the legacy threaded Level 3 implementation.
# USE_SIMPLE_THREADED_LEVEL3 = 1
@ -207,7 +210,7 @@ NO_AFFINITY = 1
# to the user space. If bigphysarea is enabled, it will use it.
# DEVICEDRIVER_ALLOCATION = 1
# If you need to synchronize FP CSR between threads (for x86/x86_64 only).
# If you need to synchronize FP CSR between threads (for x86/x86_64 and aarch64 only).
# CONSISTENT_FPCSR = 1
# If any gemm argument m, n or k is less or equal this threshold, gemm will be execute

View File

@ -9,6 +9,10 @@ ifndef TOPDIR
TOPDIR = .
endif
ifndef RELAPACK_REPLACE
RELAPACK_REPLACE=0
endif
# we need to use the host system's architecture for getarch compile options even especially when cross-compiling
HOSTARCH := $(shell uname -m)
ifeq ($(HOSTARCH), amd64)
@ -280,8 +284,10 @@ GETARCH_FLAGS += -DHAVE_GAS=$(HAVE_GAS)
# Generating Makefile.conf and config.h
DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" HOST_CFLAGS="$(GETARCH_FLAGS)" CFLAGS="$(CFLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) DYNAMIC_ARCH=$(DYNAMIC_ARCH) TARGET_CORE=$(TARGET_CORE) ONLY_CBLAS=$(ONLY_CBLAS) TARGET=$(TARGET) all)
endif
ifndef TARGET_CORE
include $(TOPDIR)/Makefile.conf
-include $(TOPDIR)/Makefile.conf
else
HAVE_NEON=
HAVE_VFP=
@ -302,7 +308,6 @@ HAVE_FMA3=
include $(TOPDIR)/Makefile_kernel.conf
endif
endif
ifndef NUM_PARALLEL
NUM_PARALLEL = 1
@ -415,7 +420,7 @@ ifeq ($(OSNAME), AIX)
EXTRALIB += -lm
endif
ifeq ($(OSNAME), FreeBSD)
ifeq ($(OSNAME), $(filter $(OSNAME),FreeBSD OpenBSD NetBSD DragonFly))
ifeq ($(ARCH), $(filter $(ARCH),arm arm64))
EXTRALIB += -lm
endif
@ -660,8 +665,10 @@ DYNAMIC_CORE += CORTEXA57
DYNAMIC_CORE += CORTEXA72
DYNAMIC_CORE += CORTEXA73
DYNAMIC_CORE += NEOVERSEN1
ifneq ($(NO_SVE), 1)
DYNAMIC_CORE += NEOVERSEV1
DYNAMIC_CORE += NEOVERSEN2
endif
DYNAMIC_CORE += CORTEXA55
DYNAMIC_CORE += FALKOR
DYNAMIC_CORE += THUNDERX
@ -677,7 +684,12 @@ endif
endif
ifeq ($(ARCH), mips64)
DYNAMIC_CORE = LOONGSON3R3 LOONGSON3R4
DYNAMIC_CORE = LOONGSON3R3 LOONGSON3R4 MIPS64_GENERIC
ifdef DYNAMIC_LIST
override DYNAMIC_CORE = MIPS64_GENERIC $(DYNAMIC_LIST)
XCCOMMON_OPT = -DDYNAMIC_LIST -DDYN_MIPS64_GENERIC
XCCOMMON_OPT += $(foreach dcore,$(DYNAMIC_LIST),-DDYN_$(dcore))
endif
endif
ifeq ($(ARCH), loongarch64)
@ -818,13 +830,32 @@ endif
ifeq ($(ARCH), riscv64)
NO_BINARY_MODE = 1
BINARY_DEFINED = 1
ifdef INTERFACE64
ifneq ($(INTERFACE64), 0)
ifeq ($(F_COMPILER), GFORTRAN)
FCOMMON_OPT += -fdefault-integer-8
endif
ifeq ($(F_COMPILER), FLANG)
FCOMMON_OPT += -i8
endif
endif
endif
endif
ifeq ($(ARCH), loongarch64)
NO_BINARY_MODE = 1
BINARY_DEFINED = 1
ifdef INTERFACE64
ifneq ($(INTERFACE64), 0)
ifeq ($(F_COMPILER), GFORTRAN)
FCOMMON_OPT += -fdefault-integer-8
endif
ifeq ($(F_COMPILER), FLANG)
FCOMMON_OPT += -i8
endif
endif
endif
endif
#
# C Compiler dependent settings
@ -856,6 +887,11 @@ CCOMMON_OPT += -mabi=32
BINARY_DEFINED = 1
endif
ifneq (, $(filter $(CORE), MIPS64_GENERIC))
CCOMMON_OPT += -DNO_MSA
FCOMMON_OPT += -DNO_MSA
endif
ifneq (, $(filter $(CORE),LOONGSON3R3 LOONGSON3R4))
CCOMMON_OPT += -march=loongson3a
FCOMMON_OPT += -march=loongson3a
@ -932,16 +968,19 @@ endif
endif
ifdef BINARY64
ifeq ($(ARCH), x86_64)
ifeq (,$(findstring tp,$(CFLAGS)))
ifneq ($(NEWPGI2),1)
CCOMMON_OPT += -tp p7-64
else
CCOMMON_OPT += -tp px
endif
endif
ifneq ($(NEWPGI),1)
CCOMMON_OPT += -D__MMX__ -Mnollvm
endif
else
ifeq ($(ARCH), power)
ifeq (,$(findstring tp,$(CFLAGS)))
ifeq ($(CORE), POWER8)
CCOMMON_OPT += -tp pwr8
endif
@ -950,14 +989,17 @@ CCOMMON_OPT += -tp pwr9
endif
endif
endif
endif
else
ifneq ($(NEWPGI2),1)
ifeq (,$(findstring tp,$(CFLAGS)))
CCOMMON_OPT += -tp p7
else
CCOMMON_OPT += -tp px
endif
endif
endif
endif
ifeq ($(C_COMPILER), PATHSCALE)
ifdef BINARY64
@ -1349,6 +1391,10 @@ ifeq ($(NO_AVX512), 1)
CCOMMON_OPT += -DNO_AVX512
endif
ifeq ($(NO_SVE), 1)
CCOMMON_OPT += -DNO_SVE
endif
ifdef SMP
CCOMMON_OPT += -DSMP_SERVER

View File

@ -130,6 +130,28 @@ endif
endif
endif
ifeq ($(CORE), ZEN)
ifdef HAVE_AVX512VL
ifndef NO_AVX512
CCOMMON_OPT += -march=skylake-avx512
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=skylake-avx512
endif
ifeq ($(OSNAME), CYGWIN_NT)
CCOMMON_OPT += -fno-asynchronous-unwind-tables
FCOMMON_OPT += -fno-asynchronous-unwind-tables
endif
ifeq ($(OSNAME), WINNT)
ifeq ($(C_COMPILER), GCC)
CCOMMON_OPT += -fno-asynchronous-unwind-tables
FCOMMON_OPT += -fno-asynchronous-unwind-tables
endif
endif
endif
endif
endif
ifdef HAVE_AVX2
ifndef NO_AVX2
ifeq ($(C_COMPILER), GCC)
@ -143,6 +165,7 @@ ifeq ($(C_COMPILER), CLANG)
CCOMMON_OPT += -mavx2
endif
endif
ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
ifeq ($(F_COMPILER), GFORTRAN)
# AVX2 support was added in 4.7.0
GCCVERSIONGTEQ4 := $(shell expr `$(FC) -dumpversion | cut -f1 -d.` \>= 4)
@ -159,6 +182,7 @@ endif
endif
endif
endif
endif
endif

20
SECURITY.md Normal file
View File

@ -0,0 +1,20 @@
# Security Policy
## Supported Versions
It is generally recommended to use the latest release as this project
does not maintain multiple stable branches and providing packages e.g.
for Linux distributions is outside our scope. In particular, versions
before 0.3.18 can be assumed to carry the out-of-bounds-read error in
the LAPACK ?LARRV family of functions that was the subject of
CVE-2021-4048
## Reporting a Vulnerability
If you suspect that you have found a vulnerability - a defect that could
be abused to compromise the security of a user's code or systems - please
do not use the normal github issue tracker (except perhaps to post a general
warning if you deem that necessary). Instead, please contact the project
maintainers through the email addresses given in their github user profiles.
Defects found in the "lapack-netlib" subtree should ideally be reported to
the maintainers of the reference implementation of LAPACK, lapack@icl.itk.edu

View File

@ -65,6 +65,7 @@ MIPS1004K
MIPS24K
4.MIPS64 CPU:
MIPS64_GENERIC
SICORTEX
LOONGSON3A
LOONGSON3B
@ -128,3 +129,7 @@ LOONGSON2K1000
12. Elbrus E2000:
E2K
13. Alpha
EV4
EV5
EV6

View File

@ -141,7 +141,7 @@ jobs:
- job: OSX_OpenMP
pool:
vmImage: 'macOS-10.15'
vmImage: 'macOS-11'
steps:
- script: |
brew update
@ -151,15 +151,23 @@ jobs:
- job: OSX_GCC_Nothreads
pool:
vmImage: 'macOS-10.15'
vmImage: 'macOS-11'
steps:
- script: |
brew update
make USE_THREADS=0 CC=gcc-10 FC=gfortran-10
- job: OSX_GCC12
pool:
vmImage: 'macOS-latest'
steps:
- script: |
brew update
make CC=gcc-12 FC=gfortran-12
- job: OSX_OpenMP_Clang
pool:
vmImage: 'macOS-10.15'
vmImage: 'macOS-11'
variables:
LD_LIBRARY_PATH: /usr/local/opt/llvm/lib
LIBRARY_PATH: /usr/local/opt/llvm/lib
@ -172,7 +180,7 @@ jobs:
- job: OSX_OpenMP_Clang_cmake
pool:
vmImage: 'macOS-10.15'
vmImage: 'macOS-11'
variables:
LD_LIBRARY_PATH: /usr/local/opt/llvm/lib
LIBRARY_PATH: /usr/local/opt/llvm/lib
@ -188,7 +196,7 @@ jobs:
- job: OSX_dynarch_cmake
pool:
vmImage: 'macOS-10.15'
vmImage: 'macOS-11'
variables:
LD_LIBRARY_PATH: /usr/local/opt/llvm/lib
LIBRARY_PATH: /usr/local/opt/llvm/lib
@ -196,13 +204,13 @@ jobs:
- script: |
mkdir build
cd build
cmake -DTARGET=CORE2 -DDYNAMIC_ARCH=1 -DCMAKE_C_COMPILER=gcc-10 -DCMAKE_Fortran_COMPILER=gfortran-10 -DBUILD_SHARED_LIBS=ON ..
cmake -DTARGET=CORE2 -DDYNAMIC_ARCH=1 -DDYNAMIC_LIST='NEHALEM HASWELL SKYLAKEX' -DCMAKE_C_COMPILER=gcc-10 -DCMAKE_Fortran_COMPILER=gfortran-10 -DBUILD_SHARED_LIBS=ON ..
cmake --build .
ctest
- job: OSX_Ifort_Clang
pool:
vmImage: 'macOS-10.15'
vmImage: 'macOS-11'
variables:
LD_LIBRARY_PATH: /usr/local/opt/llvm/lib
MACOS_HPCKIT_URL: https://registrationcenter-download.intel.com/akdlm/irc_nas/17643/m_HPCKit_p_2021.2.0.2903_offline.dmg
@ -235,7 +243,7 @@ jobs:
- job: OSX_NDK_ARMV7
pool:
vmImage: 'macOS-10.15'
vmImage: 'macOS-11'
steps:
- script: |
brew update
@ -255,7 +263,7 @@ jobs:
- job: OSX_IOS_ARMV7
pool:
vmImage: 'macOS-10.15'
vmImage: 'macOS-11'
variables:
CC: /Applications/Xcode_12.4.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
CFLAGS: -O2 -mno-thumb -Wno-macro-redefined -isysroot /Applications/Xcode_12.4.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS14.4.sdk -arch armv7 -miphoneos-version-min=5.1

View File

@ -74,6 +74,24 @@ static void *huge_malloc(BLASLONG size){
#endif
/* Benchmarks should allocate with cacheline (often 64 bytes) alignment
to avoid unreliable results. This technique, storing the allocated
pointer value just before the aligned memory, doesn't require
C11's aligned_alloc for compatibility with older compilers. */
static void *aligned_alloc_cacheline(size_t n)
{
void *p = malloc((size_t)(void *) + n + L1_DATA_LINESIZE - 1);
if (p) {
void **newp = (void **)
(((uintptr_t)p + L1_DATA_LINESIZE) & (uintptr_t)-L1_DATA_LINESIZE);
newp[-1] = p;
p = newp;
}
return p;
}
#define malloc aligned_alloc_cacheline
#define free(p) free((p) ? ((void **)(p))[-1] : (p))
#if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS)
struct timeval start, stop;
#elif defined(__APPLE__)

View File

@ -45,7 +45,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
int main(int argc, char *argv[]){
FLOAT *x, *y;
FLOAT *x;
FLOAT alpha[2] = { 2.0, 2.0 };
blasint m, i;
blasint inc_x=1,inc_y=1;
@ -74,10 +74,6 @@ int main(int argc, char *argv[]){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef __linux
srandom(getpid());
#endif
@ -91,30 +87,20 @@ int main(int argc, char *argv[]){
fprintf(stderr, " %6d : ", (int)m);
for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
begin();
for (l=0; l<loops; l++)
{
for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
begin();
SCAL (&m, alpha, x, &inc_x);
}
end();
end();
time1 = getsec();
time1 = getsec();
timeg += time1;
}
timeg /= loops;
timeg = time1 / loops;
#ifdef COMPLEX
fprintf(stderr, " %10.2f MFlops %10.6f sec\n", 6. * (double)m / timeg * 1.e-6, timeg);

32
c_check
View File

@ -31,8 +31,8 @@ flags="$*"
cross_suffix=""
if [ "`dirname $compiler_name`" != '.' ]; then
cross_suffix="$cross_suffix`dirname $compiler_name`/"
if [ "`dirname \"$compiler_name\"`" != '.' ]; then
cross_suffix="$cross_suffix`dirname \"$compiler_name\"`/"
fi
bn=`basename $compiler_name`
@ -162,7 +162,7 @@ fi
exit 1
}
have_msa=0
no_msa=0
if [ "$architecture" = "mips" ] || [ "$architecture" = "mips64" ]; then
tmpd="$(mktemp -d)"
tmpf="$tmpd/a.c"
@ -172,11 +172,10 @@ if [ "$architecture" = "mips" ] || [ "$architecture" = "mips64" ]; then
printf "void main(void){ __asm__ volatile(%s); }\n" "$code" >> "$tmpf"
args="$msa_flags -o $tmpf.o $tmpf"
have_msa=1
{
$compiler_name $flags $args >/dev/null 2>&1
} || {
have_msa=0
no_msa=1
}
rm -rf "$tmpd"
@ -240,6 +239,21 @@ if [ "$architecture" = "riscv64" ]; then
rm -rf "$tmpd"
fi
no_sve=0
if [ "$architecture" = "arm64" ]; then
tmpd=`mktemp -d`
tmpf="$tmpd/a.c"
printf "#include <arm_sve.h>\n\n int main(void){}\n">> "$tmpf"
args=" -march=armv8-a+sve -c -o $tmpf.o $tmpf"
no_sve=0
{
$compiler_name $flags $args >/dev/null 2>&1
} || {
no_sve=1
}
rm -rf "$tmpd"
fi
c11_atomics=0
case "$data" in
*HAVE_C11*)
@ -375,10 +389,8 @@ done
printf "CROSS_SUFFIX=%s\n" "$cross_suffix"
[ "$cross" -ne 0 ] && printf "CROSS=1\n"
printf "CEXTRALIB=%s %s %s\n" "$linker_L" "$linker_l" "$linker_a"
[ "$have_msa" -eq 1 ] && {
printf "HAVE_MSA=1\n"
printf "MSA_FLAGS=%s\n" "$msa_flags"
}
[ "$no_msa" -eq 1 ] && printf "NO_MSA=1\n"
[ "$no_sve" -eq 1 ] && printf "NO_SVE=1\n"
[ "$no_rv64gv" -eq 1 ] && printf "NO_RV64GV=1\n"
[ "$no_avx512" -eq 1 ] && printf "NO_AVX512=1\n"
[ "$no_avx2" -eq 1 ] && printf "NO_AVX2=1\n"
@ -396,7 +408,7 @@ compiler=`echo "$compiler" | tr '[[:lower:]]' '[[:upper:]]' `
[ "$binformat" = "bin32" ] && printf "#define __32BIT__\t1\n"
[ "$binformat" = "bin64" ] && printf "#define __64BIT__\t1\n"
[ -n "$need_fu" ] && printf "#define FUNDERSCORE\t%s\n" "$need_fu"
[ "$have_msa" -eq 1 ] && printf "#define HAVE_MSA\t1\n"
[ "$no_msa" -eq 1 ] && printf "#define NO_MSA\t1\n"
[ "$c11_atomics" -eq 1 ] && printf "#define HAVE_C11\t1\n"
} >> "$config"

View File

@ -44,9 +44,12 @@ endif ()
if (DYNAMIC_ARCH)
if (ARM64)
set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA55 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 NEOVERSEV1 NEOVERSEN2 THUNDERX3T110)
set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA55 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110)
if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 9.99)
set(DYNAMIC_CORE "${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2")
endif ()
if (DYNAMIC_LIST)
set(DYNAMIC_CORE ARMV8 ${DYNAMIC_LIST})
set(DYNAMIC_CORE ARMV8 ${DYNAMIC_LIST})
endif ()
endif ()
@ -132,3 +135,8 @@ if (ARM64)
set(BINARY_DEFINED 1)
endif ()
if (${ARCH} STREQUAL "riscv64")
set(NO_BINARY_MODE 1)
set(BINARY_DEFINED 1)
endif ()

View File

@ -144,6 +144,21 @@ if (${CORE} STREQUAL SAPPHIRERAPIDS)
endif ()
endif ()
if (${CORE} STREQUAL ZEN)
if (HAVE_AVX512VL)
if (NOT DYNAMIC_ARCH)
if (NOT NO_AVX512)
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
if (${GCC_VERSION} VERSION_GREATER 13.0 OR ${GCC_VERSION} VERSION_EQUAL 13.0)
set (CCOMMON_OPT "${CCOMMON_OPT} -march=znver4")
else ()
set (CCOMMON_OPT "${CCOMMON_OPT} -march=skylake-avx512")
endif ()
endif ()
endif ()
endif ()
endif ()
if (${CORE} STREQUAL A64FX)
if (NOT DYNAMIC_ARCH)
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
@ -155,6 +170,39 @@ if (${CORE} STREQUAL A64FX)
endif ()
endif ()
if (${CORE} STREQUAL NEOVERSEN2)
if (NOT DYNAMIC_ARCH)
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4)
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2")
else ()
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve")
endif()
endif ()
endif ()
if (${CORE} STREQUAL NEOVERSEV1)
if (NOT DYNAMIC_ARCH)
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4)
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.4-a+sve -mtune=neoverse-v1")
else ()
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve")
endif()
endif ()
endif ()
if (${CORE} STREQUAL NEOVERSEN1)
if (NOT DYNAMIC_ARCH)
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
if (${GCC_VERSION} VERSION_GREATER 9.4 OR ${GCC_VERSION} VERSION_EQUAL 9.4)
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve -mtune=neoverse-n1")
else ()
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve")
endif()
endif ()
endif ()
if (${CORE} STREQUAL ARMV8SVE)
if (NOT DYNAMIC_ARCH)
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve")

View File

@ -46,7 +46,7 @@ if (${F_COMPILER} STREQUAL "GFORTRAN")
set(FCOMMON_OPT "${FCOMMON_OPT} -fno-optimize-sibling-calls")
#Don't include -lgfortran, when NO_LAPACK=1 or lsbcc
if (NOT NO_LAPACK)
set(EXTRALIB "{EXTRALIB} -lgfortran")
set(EXTRALIB "${EXTRALIB} -lgfortran")
endif ()
if (NO_BINARY_MODE)
if (MIPS64)

View File

@ -22,7 +22,7 @@ set(SCLAUX
slasd7.f slasd8.f slasda.f slasdq.f slasdt.f
slaset.f slasq1.f slasq2.f slasq3.f slasq4.f slasq5.f slasq6.f
slasr.f slasrt.f slassq.f90 slasv2.f spttrf.f sstebz.f sstedc.f
ssteqr.f ssterf.f slaisnan.f sisnan.f
ssteqr.f ssterf.f slaisnan.f sisnan.f slarmm.f
slartgp.f slartgs.f ../INSTALL/sroundup_lwork.f
../INSTALL/second_${TIMER}.f)
@ -42,7 +42,7 @@ set(DZLAUX
dlasd7.f dlasd8.f dlasda.f dlasdq.f dlasdt.f
dlasq1.f dlasq2.f dlasq3.f dlasq4.f dlasq5.f dlasq6.f
dlasr.f dlasrt.f dlassq.f90 dlasv2.f dpttrf.f dstebz.f dstedc.f
dsteqr.f dsterf.f dlaisnan.f disnan.f
dsteqr.f dsterf.f dlaisnan.f disnan.f dlarmm.f
dlartgp.f dlartgs.f ../INSTALL/droundup_lwork.f
../INSTALL/dlamch.f ../INSTALL/dsecnd_${TIMER}.f)
@ -123,7 +123,8 @@ set(SLASRC
ssyevd_2stage.f ssyev_2stage.f ssyevx_2stage.f ssyevr_2stage.f
ssbev_2stage.f ssbevx_2stage.f ssbevd_2stage.f ssygv_2stage.f
sgesvdq.f slaorhr_col_getrfnp.f
slaorhr_col_getrfnp2.f sorgtsqr.f sorgtsqr_row.f sorhr_col.f )
slaorhr_col_getrfnp2.f sorgtsqr.f sorgtsqr_row.f sorhr_col.f
slatrs3.f strsyl3.f sgelst.f)
set(SXLASRC sgesvxx.f sgerfsx.f sla_gerfsx_extended.f sla_geamv.f
sla_gercond.f sla_gerpvgrw.f ssysvxx.f ssyrfsx.f
@ -221,7 +222,8 @@ set(CLASRC
cheevd_2stage.f cheev_2stage.f cheevx_2stage.f cheevr_2stage.f
chbev_2stage.f chbevx_2stage.f chbevd_2stage.f chegv_2stage.f
cgesvdq.f claunhr_col_getrfnp.f claunhr_col_getrfnp2.f
cungtsqr.f cungtsqr_row.f cunhr_col.f )
cungtsqr.f cungtsqr_row.f cunhr_col.f
clatrs3.f ctrsyl3.f cgelst.f)
set(CXLASRC cgesvxx.f cgerfsx.f cla_gerfsx_extended.f cla_geamv.f
cla_gercond_c.f cla_gercond_x.f cla_gerpvgrw.f
@ -313,7 +315,8 @@ set(DLASRC
dsyevd_2stage.f dsyev_2stage.f dsyevx_2stage.f dsyevr_2stage.f
dsbev_2stage.f dsbevx_2stage.f dsbevd_2stage.f dsygv_2stage.f
dcombssq.f dgesvdq.f dlaorhr_col_getrfnp.f
dlaorhr_col_getrfnp2.f dorgtsqr.f dorgtsqr_row.f dorhr_col.f )
dlaorhr_col_getrfnp2.f dorgtsqr.f dorgtsqr_row.f dorhr_col.f
dlatrs3.f dtrsyl3.f dgelst.f)
set(DXLASRC dgesvxx.f dgerfsx.f dla_gerfsx_extended.f dla_geamv.f
dla_gercond.f dla_gerpvgrw.f dsysvxx.f dsyrfsx.f
@ -415,7 +418,8 @@ set(ZLASRC
zheevd_2stage.f zheev_2stage.f zheevx_2stage.f zheevr_2stage.f
zhbev_2stage.f zhbevx_2stage.f zhbevd_2stage.f zhegv_2stage.f
zgesvdq.f zlaunhr_col_getrfnp.f zlaunhr_col_getrfnp2.f
zungtsqr.f zungtsqr_row.f zunhr_col.f)
zungtsqr.f zungtsqr_row.f zunhr_col.f
zlatrs3.f ztrsyl3.f zgelst.f)
set(ZXLASRC zgesvxx.f zgerfsx.f zla_gerfsx_extended.f zla_geamv.f
zla_gercond_c.f zla_gercond_x.f zla_gerpvgrw.f zsysvxx.f zsyrfsx.f
@ -519,7 +523,7 @@ set(SCLAUX
slaset.c slasq1.c slasq2.c slasq3.c slasq4.c slasq5.c slasq6.c
slasr.c slasrt.c slassq.c slasv2.c spttrf.c sstebz.c sstedc.c
ssteqr.c ssterf.c slaisnan.c sisnan.c
slartgp.c slartgs.c
slartgp.c slartgs.c slarmm.c
../INSTALL/second_${TIMER}.c)
set(DZLAUX
@ -538,7 +542,7 @@ set(DZLAUX
dlasq1.c dlasq2.c dlasq3.c dlasq4.c dlasq5.c dlasq6.c
dlasr.c dlasrt.c dlassq.c dlasv2.c dpttrf.c dstebz.c dstedc.c
dsteqr.c dsterf.c dlaisnan.c disnan.c
dlartgp.c dlartgs.c
dlartgp.c dlartgs.c dlarmm.c
../INSTALL/dlamch.c ../INSTALL/dsecnd_${TIMER}.c)
set(SLASRC
@ -617,7 +621,8 @@ set(SLASRC
ssyevd_2stage.c ssyev_2stage.c ssyevx_2stage.c ssyevr_2stage.c
ssbev_2stage.c ssbevx_2stage.c ssbevd_2stage.c ssygv_2stage.c
sgesvdq.c slaorhr_col_getrfnp.c
slaorhr_col_getrfnp2.c sorgtsqr.c sorgtsqr_row.c sorhr_col.c )
slaorhr_col_getrfnp2.c sorgtsqr.c sorgtsqr_row.c sorhr_col.c
slatrs3.c strsyl3.c sgelst.c)
set(SXLASRC sgesvxx.c sgerfsx.c sla_gerfsx_extended.c sla_geamv.c
sla_gercond.c sla_gerpvgrw.c ssysvxx.c ssyrfsx.c
@ -714,7 +719,8 @@ set(CLASRC
cheevd_2stage.c cheev_2stage.c cheevx_2stage.c cheevr_2stage.c
chbev_2stage.c chbevx_2stage.c chbevd_2stage.c chegv_2stage.c
cgesvdq.c claunhr_col_getrfnp.c claunhr_col_getrfnp2.c
cungtsqr.c cungtsqr_row.c cunhr_col.c )
cungtsqr.c cungtsqr_row.c cunhr_col.c
clatrs3.c ctrsyl3.c cgelst.c)
set(CXLASRC cgesvxx.c cgerfsx.c cla_gerfsx_extended.c cla_geamv.c
cla_gercond_c.c cla_gercond_x.c cla_gerpvgrw.c
@ -805,7 +811,8 @@ set(DLASRC
dsyevd_2stage.c dsyev_2stage.c dsyevx_2stage.c dsyevr_2stage.c
dsbev_2stage.c dsbevx_2stage.c dsbevd_2stage.c dsygv_2stage.c
dcombssq.c dgesvdq.c dlaorhr_col_getrfnp.c
dlaorhr_col_getrfnp2.c dorgtsqr.c dorgtsqr_row.c dorhr_col.c )
dlaorhr_col_getrfnp2.c dorgtsqr.c dorgtsqr_row.c dorhr_col.c
dlatrs3.c dtrsyl3.c dgelst.c)
set(DXLASRC dgesvxx.c dgerfsx.c dla_gerfsx_extended.c dla_geamv.c
dla_gercond.c dla_gerpvgrw.c dsysvxx.c dsyrfsx.c
@ -906,7 +913,7 @@ set(ZLASRC
zheevd_2stage.c zheev_2stage.c zheevx_2stage.c zheevr_2stage.c
zhbev_2stage.c zhbevx_2stage.c zhbevd_2stage.c zhegv_2stage.c
zgesvdq.c zlaunhr_col_getrfnp.c zlaunhr_col_getrfnp2.c
zungtsqr.c zungtsqr_row.c zunhr_col.c)
zungtsqr.c zungtsqr_row.c zunhr_col.c zlatrs3.c ztrsyl3.c zgelst.c)
set(ZXLASRC zgesvxx.c zgerfsx.c zla_gerfsx_extended.c zla_geamv.c
zla_gercond_c.c zla_gercond_x.c zla_gerpvgrw.c zsysvxx.c zsyrfsx.c
@ -999,6 +1006,9 @@ endforeach ()
if (NOT C_LAPACK)
set_source_files_properties(${LA_SOURCES} PROPERTIES COMPILE_FLAGS "${LAPACK_FFLAGS}")
if (${F_COMPILER} STREQUAL "GFORTRAN")
set_source_files_properties(${LA_SOURCES} PROPERTIES COMPILE_FLAGS "${LAPACK_FFLAGS} -fno-tree-vectorize")
endif()
else ()
set_source_files_properties(${LA_SOURCES} PROPERTIES COMPILE_FLAGS "${LAPACK_CFLAGS}")
endif ()

View File

@ -318,6 +318,8 @@ set(CSRC
lapacke_clacn2.c
lapacke_clag2z.c
lapacke_clag2z_work.c
lapacke_clangb.c
lapacke_clangb_work.c
lapacke_clange.c
lapacke_clange_work.c
lapacke_clanhe.c
@ -803,6 +805,8 @@ set(DSRC
lapacke_dlag2s_work.c
lapacke_dlamch.c
lapacke_dlamch_work.c
lapacke_dlangb.c
lapacke_dlangb_work.c
lapacke_dlange.c
lapacke_dlange_work.c
lapacke_dlansy.c
@ -1381,6 +1385,8 @@ set(SSRC
lapacke_slag2d_work.c
lapacke_slamch.c
lapacke_slamch_work.c
lapacke_slangb.c
lapacke_slangb_work.c
lapacke_slange.c
lapacke_slange_work.c
lapacke_slansy.c
@ -2089,6 +2095,8 @@ set(ZSRC
lapacke_zlacrm_work.c
lapacke_zlag2c.c
lapacke_zlag2c_work.c
lapacke_zlangb.c
lapacke_zlangb_work.c
lapacke_zlange.c
lapacke_zlange_work.c
lapacke_zlanhe.c
@ -2481,6 +2489,8 @@ set(Utils_SRC
lapacke_ctp_nancheck.c lapacke_dtr_trans.c lapacke_str_trans.c lapacke_ztp_trans.c
lapacke_ctp_trans.c lapacke_lsame.c lapacke_xerbla.c lapacke_ztr_nancheck.c
lapacke_ctr_nancheck.c lapacke_make_complex_double.c lapacke_z_nancheck.c lapacke_ztr_trans.c
lapacke_ctz_nancheck.c lapacke_ctz_trans.c lapacke_dtz_nancheck.c lapacke_dtz_trans.c
lapacke_stz_nancheck.c lapacke_stz_trans.c lapacke_ztz_nancheck.c lapacke_ztz_trans.c
)
set(LAPACKE_REL_SRC "")

View File

@ -2,7 +2,7 @@ libdir=@CMAKE_INSTALL_FULL_LIBDIR@
libsuffix=@SUFFIX64_UNDERSCORE@
includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@
openblas_config=USE_64BITINT=@USE_64BITINT@ NO_CBLAS=@NO_CBLAS@ NO_LAPACK=@NO_LAPACK@ NO_LAPACKE=@NO_LAPACKE@ DYNAMIC_ARCH=@DYNAMIC_ARCH@ DYNAMIC_OLDER=@DYNAMIC_OLDER@ NO_AFFINITY=@NO_AFFINITY@ USE_OPENMP=@USE_OPENMP@ @CORE@ MAX_THREADS=@NUM_THREADS@
openblas_config=USE_64BITINT=@INTERFACE64@ NO_CBLAS=@NO_CBLAS@ NO_LAPACK=@NO_LAPACK@ NO_LAPACKE=@NO_LAPACKE@ DYNAMIC_ARCH=@DYNAMIC_ARCH@ DYNAMIC_OLDER=@DYNAMIC_OLDER@ NO_AFFINITY=@NO_AFFINITY@ USE_OPENMP=@USE_OPENMP@ @CORE@ MAX_THREADS=@NUM_THREADS@
Name: OpenBLAS
Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version
Version: @OPENBLAS_VERSION@

View File

@ -823,6 +823,41 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS
set(CGEMM3M_UNROLL_N 4)
set(ZGEMM3M_UNROLL_M 4)
set(ZGEMM3M_UNROLL_N 4)
elseif ("${TCORE}" STREQUAL "ARMV5")
file(APPEND ${TARGET_CONF_TEMP}
"#define L1_DATA_SIZE\t65536\n"
"#define L1_DATA_LINESIZE\t32\n"
"#define L2_SIZE\t512488\n"
"#define L2_LINESIZE\t32\n"
"#define DTB_DEFAULT_ENTRIES\t64\n"
"#define DTB_SIZE\t4096\n"
"#define L2_ASSOCIATIVE\t4\n")
set(SGEMM_UNROLL_M 2)
set(SGEMM_UNROLL_N 2)
set(DGEMM_UNROLL_M 2)
set(DGEMM_UNROLL_N 2)
set(CGEMM_UNROLL_M 2)
set(CGEMM_UNROLL_N 2)
set(ZGEMM_UNROLL_M 2)
set(ZGEMM_UNROLL_N 2)
elseif ("${TCORE}" STREQUAL "ARMV6")
file(APPEND ${TARGET_CONF_TEMP}
"#define L1_DATA_SIZE\t65536\n"
"#define L1_DATA_LINESIZE\t32\n"
"#define L2_SIZE\t512488\n"
"#define L2_LINESIZE\t32\n"
"#define DTB_DEFAULT_ENTRIES\t64\n"
"#define DTB_SIZE\t4096\n"
"#define L2_ASSOCIATIVE\t4\n"
"#define HAVE_VFP\n")
set(SGEMM_UNROLL_M 4)
set(SGEMM_UNROLL_N 2)
set(DGEMM_UNROLL_M 4)
set(DGEMM_UNROLL_N 2)
set(CGEMM_UNROLL_M 2)
set(CGEMM_UNROLL_N 2)
set(ZGEMM_UNROLL_M 2)
set(ZGEMM_UNROLL_N 2)
elseif ("${TCORE}" STREQUAL "ARMV7")
file(APPEND ${TARGET_CONF_TEMP}
"#define L1_DATA_SIZE\t65536\n"
@ -886,7 +921,11 @@ else ()
set(SGEMM_UNROLL_M 8)
set(SGEMM_UNROLL_N 8)
endif ()
if ("${TCORE}" STREQUAL "CORTEXA53")
set(DGEMM_UNROLL_M 4)
else ()
set(DGEMM_UNROLL_M 8)
endif ()
set(DGEMM_UNROLL_N 4)
set(CGEMM_UNROLL_M 8)
set(CGEMM_UNROLL_N 4)
@ -1319,16 +1358,25 @@ else(NOT CMAKE_CROSSCOMPILING)
set(GETARCH_DIR "${PROJECT_BINARY_DIR}/getarch_build")
set(GETARCH_BIN "getarch${CMAKE_EXECUTABLE_SUFFIX}")
file(MAKE_DIRECTORY ${GETARCH_DIR})
configure_file(${TARGET_CONF_TEMP} ${GETARCH_DIR}/${TARGET_CONF} COPYONLY)
file(MAKE_DIRECTORY "${GETARCH_DIR}")
configure_file("${TARGET_CONF_TEMP}" "${GETARCH_DIR}/${TARGET_CONF}" COPYONLY)
if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore")
try_compile(GETARCH_RESULT ${GETARCH_DIR}
SOURCES ${GETARCH_SRC}
COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} -I"${GETARCH_DIR}" -I"${PROJECT_SOURCE_DIR}" -I"${PROJECT_BINARY_DIR}"
OUTPUT_VARIABLE GETARCH_LOG
COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH_BIN}
)
if (CMAKE_ASM_COMPILER_ID STREQUAL "")
try_compile(GETARCH_RESULT "${GETARCH_DIR}"
SOURCES ${GETARCH_SRC}
CMAKE_FLAGS "-DCMAKE_ASM_COMPILER=${CMAKE_C_COMPILER}"
COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} -I"${GETARCH_DIR}" -I"${PROJECT_SOURCE_DIR}" -I"${PROJECT_BINARY_DIR}"
OUTPUT_VARIABLE GETARCH_LOG
COPY_FILE "${PROJECT_BINARY_DIR}/${GETARCH_BIN}"
)
else()
try_compile(GETARCH_RESULT "${GETARCH_DIR}"
SOURCES ${GETARCH_SRC}
COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} -I"${GETARCH_DIR}" -I"${PROJECT_SOURCE_DIR}" -I"${PROJECT_BINARY_DIR}"
OUTPUT_VARIABLE GETARCH_LOG
COPY_FILE "${PROJECT_BINARY_DIR}/${GETARCH_BIN}"
)
endif()
if (NOT ${GETARCH_RESULT})
MESSAGE(FATAL_ERROR "Compiling getarch failed ${GETARCH_LOG}")
endif ()
@ -1357,19 +1405,19 @@ execute_process(COMMAND "${PROJECT_BINARY_DIR}/${GETARCH_BIN}" 1 OUTPUT_VARIABLE
message(STATUS "GETARCH results:\n${GETARCH_MAKE_OUT}")
# append config data from getarch to the TARGET file and read in CMake vars
file(APPEND ${TARGET_CONF_TEMP} ${GETARCH_CONF_OUT})
file(APPEND "${TARGET_CONF_TEMP}" ${GETARCH_CONF_OUT})
ParseGetArchVars(${GETARCH_MAKE_OUT})
set(GETARCH2_DIR "${PROJECT_BINARY_DIR}/getarch2_build")
set(GETARCH2_BIN "getarch_2nd${CMAKE_EXECUTABLE_SUFFIX}")
file(MAKE_DIRECTORY ${GETARCH2_DIR})
configure_file(${TARGET_CONF_TEMP} ${GETARCH2_DIR}/${TARGET_CONF} COPYONLY)
file(MAKE_DIRECTORY "${GETARCH2_DIR}")
configure_file("${TARGET_CONF_TEMP}" "${GETARCH2_DIR}/${TARGET_CONF}" COPYONLY)
if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore")
try_compile(GETARCH2_RESULT ${GETARCH2_DIR}
SOURCES ${PROJECT_SOURCE_DIR}/getarch_2nd.c
try_compile(GETARCH2_RESULT "${GETARCH2_DIR}"
SOURCES "${PROJECT_SOURCE_DIR}/getarch_2nd.c"
COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GETARCH2_FLAGS} -I"${GETARCH2_DIR}" -I"${PROJECT_SOURCE_DIR}" -I"${PROJECT_BINARY_DIR}"
OUTPUT_VARIABLE GETARCH2_LOG
COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH2_BIN}
COPY_FILE "${PROJECT_BINARY_DIR}/${GETARCH2_BIN}"
)
if (NOT ${GETARCH2_RESULT})
@ -1382,9 +1430,9 @@ execute_process(COMMAND "${PROJECT_BINARY_DIR}/${GETARCH2_BIN}" 0 OUTPUT_VARIABL
execute_process(COMMAND "${PROJECT_BINARY_DIR}/${GETARCH2_BIN}" 1 OUTPUT_VARIABLE GETARCH2_CONF_OUT)
# append config data from getarch_2nd to the TARGET file and read in CMake vars
file(APPEND ${TARGET_CONF_TEMP} ${GETARCH2_CONF_OUT})
file(APPEND "${TARGET_CONF_TEMP}" ${GETARCH2_CONF_OUT})
configure_file(${TARGET_CONF_TEMP} ${TARGET_CONF_DIR}/${TARGET_CONF} COPYONLY)
configure_file("${TARGET_CONF_TEMP}" "${TARGET_CONF_DIR}/${TARGET_CONF}" COPYONLY)
ParseGetArchVars(${GETARCH2_MAKE_OUT})

View File

@ -172,9 +172,9 @@ if (DEFINED TARGET)
endif()
elseif (${CMAKE_C_COMPILER_ID} STREQUAL "Clang" OR ${CMAKE_C_COMPILER_ID} STREQUAL "AppleClang")
if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 8.99)
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=cooperlake")
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=cooperlake -exhaustive-register-search")
else()
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512")
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512 -exhaustive-register-search")
endif()
endif()
endif()
@ -188,23 +188,45 @@ if (DEFINED TARGET)
endif()
elseif (${CMAKE_C_COMPILER_ID} STREQUAL "Clang" OR ${CMAKE_C_COMPILER_ID} STREQUAL "AppleClang")
if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 12.0)
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=sapphirerapids")
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=sapphirerapids -exhaustive-register-search")
else()
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512")
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512 -exhaustive-register-search")
endif()
endif()
endif()
if (${TARGET} STREQUAL SKYLAKEX AND NOT NO_AVX512)
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512")
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512")
if (${CMAKE_C_COMPILER_ID} STREQUAL "Clang" OR ${CMAKE_C_COMPILER_ID} STREQUAL "AppleClang")
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -exhaustive-register-search")
endif()
endif()
if (${TARGET} STREQUAL HASWELL AND NOT NO_AVX2)
if (((${TARGET} STREQUAL ZEN) AND HAVE_AVX512VL) AND NOT NO_AVX512)
if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU")
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 12.99)
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=znver4")
else()
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512")
endif()
elseif (${CMAKE_C_COMPILER_ID} STREQUAL "Clang" OR ${CMAKE_C_COMPILER_ID} STREQUAL "AppleClang")
if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 15.99)
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=znver4")
else()
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512")
endif()
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -exhaustive-register-search")
endif()
endif()
if ((${TARGET} STREQUAL HASWELL OR (${TARGET} STREQUAL ZEN AND NOT HAVE_AVX512VL)) AND NOT NO_AVX2)
if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU")
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
if (${GCC_VERSION} VERSION_GREATER 4.7 OR ${GCC_VERSION} VERSION_EQUAL 4.7)
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2")
endif()
elseif (${CMAKE_C_COMPILER_ID} STREQUAL "CLANG")
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2")
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2 -mfma")
endif()
endif()
if (DEFINED HAVE_AVX)

View File

@ -90,7 +90,7 @@ extern "C" {
#endif
#include <time.h>
#ifdef OS_LINUX
#if defined(OS_LINUX) || defined(OS_QNX)
#include <malloc.h>
#include <sched.h>
#endif
@ -107,7 +107,7 @@ extern "C" {
#endif
#endif
#ifdef OS_HAIKU
#if defined(OS_HAIKU) || defined(OS_QNX)
#define NO_SYSV_IPC
#endif
@ -387,6 +387,10 @@ typedef int blasint;
#endif
*/
#ifdef __EMSCRIPTEN__
#define YIELDING
#endif
#ifndef YIELDING
#define YIELDING sched_yield()
#endif

View File

@ -43,7 +43,7 @@
#define MB asm("mb")
#define WMB asm("wmb")
#define RMB asm("rmb")
#define RMB asm("mb")
static void __inline blas_lock(unsigned long *address){
#ifndef __DECC

View File

@ -2612,7 +2612,7 @@
#ifndef ASSEMBLER
#if !defined(DYNAMIC_ARCH) \
&& (defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64) \
|| defined(ARCH_LOONGARCH64) || defined(ARCH_E2K))
|| defined(ARCH_LOONGARCH64) || defined(ARCH_E2K) || defined(ARCH_ALPHA))
extern BLASLONG gemm_offset_a;
extern BLASLONG gemm_offset_b;
extern BLASLONG sbgemm_p;

View File

@ -86,7 +86,9 @@ static inline unsigned int rpcc(void){
//__asm__ __volatile__("dmfc0 %0, $25, 1": "=r"(tmp):: "memory");
//ret=tmp;
__asm__ __volatile__(".set push \n"
#if !defined(__mips_isa_rev) || __mips_isa_rev < 2
".set mips32r2\n"
#endif
"rdhwr %0, $2\n"
".set pop": "=r"(ret):: "memory");
@ -99,7 +101,9 @@ static inline unsigned int rpcc(void){
static inline int WhereAmI(void){
int ret=0;
__asm__ __volatile__(".set push \n"
#if !defined(__mips_isa_rev) || __mips_isa_rev < 2
".set mips32r2\n"
#endif
"rdhwr %0, $0\n"
".set pop": "=r"(ret):: "memory");
return ret;
@ -197,9 +201,15 @@ static inline int blas_quickdivide(blasint x, blasint y){
#if defined(ASSEMBLER) && !defined(NEEDPARAM)
#if defined(__mips_isa_rev) && __mips_isa_rev >= 6
#define ASSEMBLER_ARCH mips64r6
#else
#define ASSEMBLER_ARCH mips64
#endif
#define PROLOGUE \
.text ;\
.set mips64 ;\
.set ASSEMBLER_ARCH ;\
.align 5 ;\
.globl REALNAME ;\
.ent REALNAME ;\

View File

@ -47,9 +47,10 @@ typedef struct {
int dtb_entries;
int offsetA, offsetB, align;
#ifdef BUILD_BFLOAT16
#if BUILD_BFLOAT16 == 1
int sbgemm_p, sbgemm_q, sbgemm_r;
int sbgemm_unroll_m, sbgemm_unroll_n, sbgemm_unroll_mn;
int sbgemm_align_k;
void (*sbstobf16_k) (BLASLONG, float *, BLASLONG, bfloat16 *, BLASLONG);
void (*sbdtobf16_k) (BLASLONG, double *, BLASLONG, bfloat16 *, BLASLONG);
@ -160,51 +161,59 @@ BLASLONG (*isbmin_k) (BLASLONG, float *, BLASLONG);
#endif
#endif
#if defined(BUILD_SINGLE) || defined(BUILD_COMPLEX)
#if (BUILD_SINGLE == 1) || (BUILD_DOUBLE == 1) || (BUILD_COMPLEX == 1) || (BUILD_COMPLEX16 == 1)
int sgemm_p, sgemm_q, sgemm_r;
int sgemm_unroll_m, sgemm_unroll_n, sgemm_unroll_mn;
#endif
int exclusive_cache;
#if defined(BUILD_SINGLE) || defined(BUILD_COMPLEX)
#if (BUILD_SINGLE == 1) || (BUILD_COMPLEX == 1)
float (*samax_k) (BLASLONG, float *, BLASLONG);
float (*samin_k) (BLASLONG, float *, BLASLONG);
float (*smax_k) (BLASLONG, float *, BLASLONG);
float (*smin_k) (BLASLONG, float *, BLASLONG);
#endif
#if (BUILD_SINGLE==1) || (BUILD_DOUBLE ==1) || (BUILD_COMPLEX==1)
BLASLONG (*isamax_k)(BLASLONG, float *, BLASLONG);
#endif
#if (BUILD_SINGLE==1) || (BUILD_COMPLEX==1)
BLASLONG (*isamin_k)(BLASLONG, float *, BLASLONG);
BLASLONG (*ismax_k) (BLASLONG, float *, BLASLONG);
BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG);
float (*snrm2_k) (BLASLONG, float *, BLASLONG);
float (*sasum_k) (BLASLONG, float *, BLASLONG);
#endif
#ifdef BUILD_SINGLE
#if (BUILD_SINGLE==1)
float (*ssum_k) (BLASLONG, float *, BLASLONG);
#endif
#if defined(BUILD_SINGLE) || defined(BUILD_COMPLEX)
#if (BUILD_SINGLE==1) || (BUILD_DOUBLE==1) || (BUILD_COMPLEX==1)
int (*scopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
float (*sdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
//double (*dsdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
int (*srot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float, float);
#endif
#if (BUILD_SINGLE==1) || (BUILD_DOUBLE==1) || (BUILD_COMPLEX==1)
int (*saxpy_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG);
#endif
#if (BUILD_SINGLE==1) || (BUILD_DOUBLE==1) || (BUILD_COMPLEX==1) || (BUILD_COMPLEX16==1)
int (*sscal_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG);
#endif
#if (BUILD_SINGLE==1) || (BUILD_DOUBLE==1) || (BUILD_COMPLEX==1)
int (*sswap_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG);
int (*sgemv_n) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *);
int (*sgemv_t) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *);
#endif
#ifdef BUILD_SINGLE
#if (BUILD_SINGLE==1)
int (*sger_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *);
int (*ssymv_L) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *);
int (*ssymv_U) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *);
#endif
#if defined(BUILD_SINGLE) || defined(BUILD_COMPLEX)
#if (BUILD_SINGLE==1) || (BUILD_DOUBLE==1) || (BUILD_COMPLEX==1)
#ifdef ARCH_X86_64
void (*sgemm_direct) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG , float *, BLASLONG , float * , BLASLONG);
int (*sgemm_direct_performant) (BLASLONG M, BLASLONG N, BLASLONG K);
@ -219,7 +228,7 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG);
int (*sgemm_oncopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *);
int (*sgemm_otcopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *);
#endif
#ifdef BUILD_SINGLE
#if (BUILD_SINGLE==1) || (BUILD_DOUBLE==1) || (BUILD_COMPLEX==1)
#ifdef SMALL_MATRIX_OPT
int (*sgemm_small_matrix_permit)(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float beta);
@ -255,7 +264,8 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG);
int (*strsm_olnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *);
int (*strsm_oltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *);
int (*strsm_oltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *);
#endif
#if (BUILD_SINGLE==1)
int (*strmm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG);
int (*strmm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG);
int (*strmm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG);
@ -287,12 +297,12 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG);
int (*slaswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG, blasint *, float *);
#endif
#if defined(BUILD_DOUBLE) || defined(BUILD_COMPLEX16)
#if (BUILD_DOUBLE==1) || (BUILD_COMPLEX16==1)
int dgemm_p, dgemm_q, dgemm_r;
int dgemm_unroll_m, dgemm_unroll_n, dgemm_unroll_mn;
#endif
#if defined(BUILD_DOUBLE) || defined(BUILD_COMPLEX16)
#if (BUILD_DOUBLE==1) || (BUILD_COMPLEX16==1)
double (*damax_k) (BLASLONG, double *, BLASLONG);
double (*damin_k) (BLASLONG, double *, BLASLONG);
double (*dmax_k) (BLASLONG, double *, BLASLONG);
@ -301,23 +311,21 @@ BLASLONG (*idamax_k)(BLASLONG, double *, BLASLONG);
BLASLONG (*idamin_k)(BLASLONG, double *, BLASLONG);
BLASLONG (*idmax_k) (BLASLONG, double *, BLASLONG);
BLASLONG (*idmin_k) (BLASLONG, double *, BLASLONG);
#endif
#if defined(BUILD_DOUBLE) || defined(BUILD_COMPLEX16)
double (*dnrm2_k) (BLASLONG, double *, BLASLONG);
double (*dasum_k) (BLASLONG, double *, BLASLONG);
#endif
#ifdef BUILD_DOUBLE
#if (BUILD_DOUBLE==1)
double (*dsum_k) (BLASLONG, double *, BLASLONG);
#endif
#if defined(BUILD_DOUBLE) || defined(BUILD_COMPLEX16)
#if (BUILD_DOUBLE==1) || (BUILD_COMPLEX16==1)
int (*dcopy_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG);
double (*ddot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG);
#endif
#if defined (BUILD_SINGLE) || defined(BUILD_DOUBLE)
#if (BUILD_SINGLE==1) || (BUILD_DOUBLE==1)
double (*dsdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
#endif
#if defined(BUILD_DOUBLE) || defined(BUILD_COMPLEX16)
#if (BUILD_DOUBLE==1) || (BUILD_COMPLEX16==1)
int (*drot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG, double, double);
int (*daxpy_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG);
int (*dscal_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG);
@ -325,13 +333,13 @@ BLASLONG (*idmin_k) (BLASLONG, double *, BLASLONG);
int (*dgemv_n) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *);
int (*dgemv_t) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *);
#endif
#ifdef BUILD_DOUBLE
#if (BUILD_DOUBLE==1)
int (*dger_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *);
int (*dsymv_L) (BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *);
int (*dsymv_U) (BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *);
#endif
#if defined(BUILD_DOUBLE) || defined(BUILD_COMPLEX16)
#if (BUILD_DOUBLE==1) || (BUILD_COMPLEX16==1)
int (*dgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG);
int (*dgemm_beta )(BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG);
@ -340,7 +348,7 @@ BLASLONG (*idmin_k) (BLASLONG, double *, BLASLONG);
int (*dgemm_oncopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *);
int (*dgemm_otcopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *);
#endif
#ifdef BUILD_DOUBLE
#if (BUILD_DOUBLE==1) || (BUILD_COMPLEX16==1)
#ifdef SMALL_MATRIX_OPT
int (*dgemm_small_matrix_permit)(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double beta);
@ -354,6 +362,8 @@ BLASLONG (*idmin_k) (BLASLONG, double *, BLASLONG);
int (*dgemm_small_kernel_b0_tn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
int (*dgemm_small_kernel_b0_tt )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
#endif
#endif
#if (BUILD_DOUBLE==1)
int (*dtrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG);
int (*dtrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG);
int (*dtrsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG);
@ -500,23 +510,25 @@ BLASLONG (*iqmin_k) (BLASLONG, xdouble *, BLASLONG);
#endif
#ifdef BUILD_COMPLEX
#if (BUILD_COMPLEX==1)
int cgemm_p, cgemm_q, cgemm_r;
int cgemm_unroll_m, cgemm_unroll_n, cgemm_unroll_mn;
float (*camax_k) (BLASLONG, float *, BLASLONG);
float (*camin_k) (BLASLONG, float *, BLASLONG);
BLASLONG (*icamax_k)(BLASLONG, float *, BLASLONG);
BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG);
float (*cnrm2_k) (BLASLONG, float *, BLASLONG);
float (*casum_k) (BLASLONG, float *, BLASLONG);
float (*csum_k) (BLASLONG, float *, BLASLONG);
int (*ccopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
openblas_complex_float (*cdotu_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
openblas_complex_float (*cdotc_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
int (*csrot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float, float);
int (*csrot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float, float);
int (*caxpy_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG);
int (*caxpyc_k)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG);
int (*cscal_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG);
@ -710,7 +722,7 @@ BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG);
int (*claswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG, blasint *, float *);
#endif
#ifdef BUILD_COMPLEX16
#if (BUILD_COMPLEX16 == 1)
int zgemm_p, zgemm_q, zgemm_r;
int zgemm_unroll_m, zgemm_unroll_n, zgemm_unroll_mn;
@ -1092,34 +1104,34 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG);
void (*init)(void);
int snum_opt, dnum_opt, qnum_opt;
#ifdef BUILD_SINGLE
#if (BUILD_SINGLE==1)
int (*saxpby_k) (BLASLONG, float, float*, BLASLONG,float, float*, BLASLONG);
#endif
#ifdef BUILD_DOUBLE
#if (BUILD_DOUBLE==1)
int (*daxpby_k) (BLASLONG, double, double*, BLASLONG,double, double*, BLASLONG);
#endif
#ifdef BUILD_COMPLEX
#if (BUILD_COMPLEX==1)
int (*caxpby_k) (BLASLONG, float, float, float*, BLASLONG,float,float, float*, BLASLONG);
#endif
#ifdef BUILD_COMPLEX16
#if (BUILD_COMPLEX16==1)
int (*zaxpby_k) (BLASLONG, double, double, double*, BLASLONG,double,double, double*, BLASLONG);
#endif
#ifdef BUILD_SINGLE
#if (BUILD_SINGLE==1)
int (*somatcopy_k_cn) (BLASLONG, BLASLONG, float, float*, BLASLONG, float*, BLASLONG);
int (*somatcopy_k_ct) (BLASLONG, BLASLONG, float, float*, BLASLONG, float*, BLASLONG);
int (*somatcopy_k_rn) (BLASLONG, BLASLONG, float, float*, BLASLONG, float*, BLASLONG);
int (*somatcopy_k_rt) (BLASLONG, BLASLONG, float, float*, BLASLONG, float*, BLASLONG);
#endif
#ifdef BUILD_DOUBLE
#if (BUILD_DOUBLE==1)
int (*domatcopy_k_cn) (BLASLONG, BLASLONG, double, double*, BLASLONG, double*, BLASLONG);
int (*domatcopy_k_ct) (BLASLONG, BLASLONG, double, double*, BLASLONG, double*, BLASLONG);
int (*domatcopy_k_rn) (BLASLONG, BLASLONG, double, double*, BLASLONG, double*, BLASLONG);
int (*domatcopy_k_rt) (BLASLONG, BLASLONG, double, double*, BLASLONG, double*, BLASLONG);
#endif
#ifdef BUILD_COMPLEX
#if (BUILD_COMPLEX==1)
int (*comatcopy_k_cn) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG);
int (*comatcopy_k_ct) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG);
int (*comatcopy_k_rn) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG);
@ -1131,7 +1143,7 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG);
int (*comatcopy_k_rtc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG);
#endif
#ifdef BUILD_COMPLEX16
#if (BUILD_COMPLEX16==1)
int (*zomatcopy_k_cn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG);
int (*zomatcopy_k_ct) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG);
int (*zomatcopy_k_rn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG);
@ -1143,21 +1155,21 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG);
int (*zomatcopy_k_rtc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG);
#endif
#ifdef BUILD_SINGLE
#if (BUILD_SINGLE==1)
int (*simatcopy_k_cn) (BLASLONG, BLASLONG, float, float*, BLASLONG);
int (*simatcopy_k_ct) (BLASLONG, BLASLONG, float, float*, BLASLONG);
int (*simatcopy_k_rn) (BLASLONG, BLASLONG, float, float*, BLASLONG);
int (*simatcopy_k_rt) (BLASLONG, BLASLONG, float, float*, BLASLONG);
#endif
#ifdef BUILD_DOUBLE
#if (BUILD_DOUBLE==1)
int (*dimatcopy_k_cn) (BLASLONG, BLASLONG, double, double*, BLASLONG);
int (*dimatcopy_k_ct) (BLASLONG, BLASLONG, double, double*, BLASLONG);
int (*dimatcopy_k_rn) (BLASLONG, BLASLONG, double, double*, BLASLONG);
int (*dimatcopy_k_rt) (BLASLONG, BLASLONG, double, double*, BLASLONG);
#endif
#ifdef BUILD_COMPLEX
#if (BUILD_COMPLEX==1)
int (*cimatcopy_k_cn) (BLASLONG, BLASLONG, float, float, float*, BLASLONG);
int (*cimatcopy_k_ct) (BLASLONG, BLASLONG, float, float, float*, BLASLONG);
int (*cimatcopy_k_rn) (BLASLONG, BLASLONG, float, float, float*, BLASLONG);
@ -1169,7 +1181,7 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG);
int (*cimatcopy_k_rtc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG);
#endif
#ifdef BUILD_COMPLEX16
#if (BUILD_COMPLEX16==1)
int (*zimatcopy_k_cn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG);
int (*zimatcopy_k_ct) (BLASLONG, BLASLONG, double, double, double*, BLASLONG);
int (*zimatcopy_k_rn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG);
@ -1181,16 +1193,16 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG);
int (*zimatcopy_k_rtc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG);
#endif
#ifdef BUILD_SINGLE
#if (BUILD_SINGLE==1)
int (*sgeadd_k) (BLASLONG, BLASLONG, float, float *, BLASLONG, float, float *, BLASLONG);
#endif
#ifdef BUILD_DOUBLE
#if (BUILD_DOUBLE==1)
int (*dgeadd_k) (BLASLONG, BLASLONG, double, double *, BLASLONG, double, double *, BLASLONG);
#endif
#ifdef BUILD_COMPLEX
#if (BUILD_COMPLEX==1)
int (*cgeadd_k) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float, float, float *, BLASLONG);
#endif
#ifdef BUILD_COMPLEX16
#if (BUILD_COMPLEX16==1)
int (*zgeadd_k) (BLASLONG, BLASLONG, double, double, double *, BLASLONG, double, double, double *, BLASLONG);
#endif
} gotoblas_t;
@ -1206,7 +1218,7 @@ extern gotoblas_t *gotoblas;
#define HAVE_EX_L2 gotoblas -> exclusive_cache
#ifdef BUILD_BFLOAT16
#if (BUILD_BFLOAT16==1)
#define SBGEMM_P gotoblas -> sbgemm_p
#define SBGEMM_Q gotoblas -> sbgemm_q
#define SBGEMM_R gotoblas -> sbgemm_r
@ -1215,7 +1227,7 @@ extern gotoblas_t *gotoblas;
#define SBGEMM_UNROLL_MN gotoblas -> sbgemm_unroll_mn
#endif
#if defined (BUILD_SINGLE)
#if (BUILD_SINGLE==1)
#define SGEMM_P gotoblas -> sgemm_p
#define SGEMM_Q gotoblas -> sgemm_q
#define SGEMM_R gotoblas -> sgemm_r
@ -1224,30 +1236,14 @@ extern gotoblas_t *gotoblas;
#define SGEMM_UNROLL_MN gotoblas -> sgemm_unroll_mn
#endif
#if defined (BUILD_DOUBLE)
#if (BUILD_DOUBLE==1)
#define DGEMM_P gotoblas -> dgemm_p
#define DGEMM_Q gotoblas -> dgemm_q
#define DGEMM_R gotoblas -> dgemm_r
#define DGEMM_UNROLL_M gotoblas -> dgemm_unroll_m
#define DGEMM_UNROLL_N gotoblas -> dgemm_unroll_n
#define DGEMM_UNROLL_MN gotoblas -> dgemm_unroll_mn
#endif
#define QGEMM_P gotoblas -> qgemm_p
#define QGEMM_Q gotoblas -> qgemm_q
#define QGEMM_R gotoblas -> qgemm_r
#define QGEMM_UNROLL_M gotoblas -> qgemm_unroll_m
#define QGEMM_UNROLL_N gotoblas -> qgemm_unroll_n
#define QGEMM_UNROLL_MN gotoblas -> qgemm_unroll_mn
#ifdef BUILD_COMPLEX
#define CGEMM_P gotoblas -> cgemm_p
#define CGEMM_Q gotoblas -> cgemm_q
#define CGEMM_R gotoblas -> cgemm_r
#define CGEMM_UNROLL_M gotoblas -> cgemm_unroll_m
#define CGEMM_UNROLL_N gotoblas -> cgemm_unroll_n
#define CGEMM_UNROLL_MN gotoblas -> cgemm_unroll_mn
#ifndef BUILD_SINGLE
#if (BUILD_SINGLE != 1)
#define SGEMM_P gotoblas -> sgemm_p
#define SGEMM_Q gotoblas -> sgemm_q
#define SGEMM_R 1024
@ -1257,14 +1253,38 @@ extern gotoblas_t *gotoblas;
#endif
#endif
#ifdef BUILD_COMPLEX16
#define QGEMM_P gotoblas -> qgemm_p
#define QGEMM_Q gotoblas -> qgemm_q
#define QGEMM_R gotoblas -> qgemm_r
#define QGEMM_UNROLL_M gotoblas -> qgemm_unroll_m
#define QGEMM_UNROLL_N gotoblas -> qgemm_unroll_n
#define QGEMM_UNROLL_MN gotoblas -> qgemm_unroll_mn
#if (BUILD_COMPLEX==1)
#define CGEMM_P gotoblas -> cgemm_p
#define CGEMM_Q gotoblas -> cgemm_q
#define CGEMM_R gotoblas -> cgemm_r
#define CGEMM_UNROLL_M gotoblas -> cgemm_unroll_m
#define CGEMM_UNROLL_N gotoblas -> cgemm_unroll_n
#define CGEMM_UNROLL_MN gotoblas -> cgemm_unroll_mn
#if (BUILD_SINGLE != 1)
#define SGEMM_P gotoblas -> sgemm_p
#define SGEMM_Q gotoblas -> sgemm_q
#define SGEMM_R 1024
#define SGEMM_UNROLL_M gotoblas -> sgemm_unroll_m
#define SGEMM_UNROLL_N gotoblas -> sgemm_unroll_n
#define SGEMM_UNROLL_MN gotoblas -> sgemm_unroll_mn
#endif
#endif
#if (BUILD_COMPLEX16==1)
#define ZGEMM_P gotoblas -> zgemm_p
#define ZGEMM_Q gotoblas -> zgemm_q
#define ZGEMM_R gotoblas -> zgemm_r
#define ZGEMM_UNROLL_M gotoblas -> zgemm_unroll_m
#define ZGEMM_UNROLL_N gotoblas -> zgemm_unroll_n
#define ZGEMM_UNROLL_MN gotoblas -> zgemm_unroll_mn
#ifndef BUILD_DOUBLE
#if (BUILD_DOUBLE != 1)
#define DGEMM_P gotoblas -> dgemm_p
#define DGEMM_Q gotoblas -> dgemm_q
#define DGEMM_R 1024
@ -1272,6 +1292,14 @@ extern gotoblas_t *gotoblas;
#define DGEMM_UNROLL_N gotoblas -> dgemm_unroll_n
#define DGEMM_UNROLL_MN gotoblas -> dgemm_unroll_mn
#endif
#if (BUILD_COMPLEX != 1)
#define CGEMM_P gotoblas -> cgemm_p
#define CGEMM_Q gotoblas -> cgemm_q
#define CGEMM_R gotoblas -> cgemm_r
#define CGEMM_UNROLL_M gotoblas -> cgemm_unroll_m
#define CGEMM_UNROLL_N gotoblas -> cgemm_unroll_n
#define CGEMM_UNROLL_MN gotoblas -> cgemm_unroll_mn
#endif
#endif
#define XGEMM_P gotoblas -> xgemm_p
@ -1318,7 +1346,7 @@ extern gotoblas_t *gotoblas;
#define HAVE_EX_L2 0
#endif
#ifdef BUILD_BFLOAT16
#if (BUILD_BFLOAT16 == 1)
#define SBGEMM_P SBGEMM_DEFAULT_P
#define SBGEMM_Q SBGEMM_DEFAULT_Q
#define SBGEMM_R SBGEMM_DEFAULT_R

View File

@ -53,6 +53,7 @@ extern void goto_set_num_threads(int nthreads);
/* Global Parameter */
extern int blas_cpu_number;
extern int blas_num_threads;
extern int blas_num_threads_set;
extern int blas_omp_linked;
#define BLAS_LEGACY 0x8000U
@ -139,7 +140,11 @@ extern int blas_server_avail;
static __inline int num_cpu_avail(int level) {
#ifdef USE_OPENMP
int openmp_nthreads=omp_get_max_threads();
int openmp_nthreads;
if (blas_num_threads_set == 0)
openmp_nthreads=omp_get_max_threads();
else
openmp_nthreads=blas_cpu_number;
#endif
#ifndef USE_OPENMP

View File

@ -59,6 +59,11 @@ void get_subarchitecture(void){
printf("ev%d", implver() + 4);
}
void get_corename(void){
printf("EV%d", implver() + 4);
}
void get_subdirname(void){
printf("alpha");
}

View File

@ -202,10 +202,14 @@ int detect(void)
return CPU_CORTEXA510;
else if (strstr(cpu_part, "0xd47"))
return CPU_CORTEXA710;
else if (strstr(cpu_part, "0xd4d")) //A715
return CPU_CORTEXA710;
else if (strstr(cpu_part, "0xd44"))
return CPU_CORTEXX1;
else if (strstr(cpu_part, "0xd4c"))
return CPU_CORTEXX2;
else if (strstr(cpu_part, "0xd4e")) //X3
return CPU_CORTEXX2;
}
// Qualcomm
else if (strstr(cpu_implementer, "0x51") && strstr(cpu_part, "0xc00"))

View File

@ -165,7 +165,9 @@ void get_cpuconfig(void){
}else{
printf("#define UNKNOWN\n");
}
if (!get_feature("msa")) printf("#define NO_MSA\n");
#ifndef NO_MSA
if (get_feature("msa")) printf("#define HAVE_MSA\n");
#endif
}
void get_libname(void){

View File

@ -70,16 +70,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#define CPU_UNKNOWN 0
#define CPU_SICORTEX 1
#define CPU_LOONGSON3R3 2
#define CPU_LOONGSON3R4 3
#define CPU_I6400 4
#define CPU_P6600 5
#define CPU_I6500 6
#define CPU_UNKNOWN 0
#define CPU_MIPS64_GENERIC 1
#define CPU_SICORTEX 2
#define CPU_LOONGSON3R3 3
#define CPU_LOONGSON3R4 4
#define CPU_I6400 5
#define CPU_P6600 6
#define CPU_I6500 7
static char *cpuname[] = {
"UNKNOWN",
"MIPS64_GENERIC"
"SICORTEX",
"LOONGSON3R3",
"LOONGSON3R4",
@ -113,8 +115,11 @@ int detect(void){
return CPU_SICORTEX;
}
}
return CPU_MIPS64_GENERIC;
#else
return CPU_UNKNOWN;
#endif
return CPU_UNKNOWN;
}
char *get_corename(void){
@ -136,8 +141,10 @@ void get_subarchitecture(void){
printf("P6600");
}else if(detect()==CPU_I6500){
printf("I6500");
}else{
}else if(detect()==CPU_SICORTEX){
printf("SICORTEX");
}else{
printf("MIPS64_GENERIC");
}
}
@ -201,7 +208,9 @@ void get_cpuconfig(void){
printf("#define DTB_SIZE 4096\n");
printf("#define L2_ASSOCIATIVE 8\n");
}
if (!get_feature("msa")) printf("#define NO_MSA\n");
#ifndef NO_MSA
if (get_feature("msa")) printf("#define HAVE_MSA\n");
#endif
}
void get_libname(void){
@ -215,8 +224,8 @@ void get_libname(void){
printf("p6600\n");
}else if(detect()==CPU_I6500) {
printf("i6500\n");
}else{
printf("mips64\n");
}else {
printf("mips64_generic\n");
}
}

View File

@ -1544,6 +1544,17 @@ int get_cpuname(void){
return CPUTYPE_NEHALEM;
}
break;
case 11: //family 6 exmodel 11
switch (model) {
case 7: // Raptor Lake
if(support_avx2())
return CPUTYPE_HASWELL;
if(support_avx())
return CPUTYPE_SANDYBRIDGE;
else
return CPUTYPE_NEHALEM;
}
break;
}
break;
case 0x7:
@ -2334,6 +2345,18 @@ int get_coretype(void){
return CORE_NEHALEM;
}
case 11:
switch (model) {
case 7: // Raptor Lake
#ifndef NO_AVX2
if(support_avx2())
return CORE_HASWELL;
#endif
if(support_avx())
return CORE_SANDYBRIDGE;
else
return CORE_NEHALEM;
}
case 15:
if (model <= 0x2) return CORE_NORTHWOOD;
else return CORE_PRESCOTT;

View File

@ -173,3 +173,8 @@ HAVE_C11
ARCH_E2K
#endif
#if defined(__EMSCRIPTEN__)
ARCH_RISCV64
OS_WINDOWS
#endif

View File

@ -40,7 +40,7 @@ else()
c_${float_char}blas1.c)
endif()
target_link_libraries(x${float_char}cblat1 ${OpenBLAS_LIBNAME})
if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD")
if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX")
target_link_libraries(x${float_char}cblat1 m)
endif()
add_test(NAME "x${float_char}cblat1"
@ -65,7 +65,7 @@ else()
constant.c)
endif()
target_link_libraries(x${float_char}cblat2 ${OpenBLAS_LIBNAME})
if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD")
if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX")
target_link_libraries(x${float_char}cblat2 m)
endif()
add_test(NAME "x${float_char}cblat2"
@ -90,7 +90,7 @@ else()
constant.c)
endif()
target_link_libraries(x${float_char}cblat3 ${OpenBLAS_LIBNAME})
if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD")
if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX")
target_link_libraries(x${float_char}cblat3 m)
endif()
add_test(NAME "x${float_char}cblat3"

View File

@ -237,7 +237,7 @@ endif
ifeq ($(BUILD_DOUBLE),1)
# Double real
ifeq ($(NOFORTRAN),0)
ifeq ($(NOFORTRAN), $(filter 0 2,$(NOFORTRAN)))
xdcblat1: $(dtestl1o) c_dblat1.o $(TOPDIR)/$(LIBNAME)
$(FC) $(FLDFLAGS) -o xdcblat1 c_dblat1.o $(dtestl1o) $(LIB) $(EXTRALIB) $(CEXTRALIB)
xdcblat2: $(dtestl2o) c_dblat2.o $(TOPDIR)/$(LIBNAME)
@ -256,7 +256,7 @@ endif
ifeq ($(BUILD_COMPLEX),1)
# Single complex
ifeq ($(NOFORTRAN),0)
ifeq ($(NOFORTRAN), $(filter 0 2,$(NOFORTRAN)))
xccblat1: $(ctestl1o) c_cblat1.o $(TOPDIR)/$(LIBNAME)
$(FC) $(FLDFLAGS) -o xccblat1 c_cblat1.o $(ctestl1o) $(LIB) $(EXTRALIB) $(CEXTRALIB)
xccblat2: $(ctestl2o) c_cblat2.o $(TOPDIR)/$(LIBNAME)
@ -278,7 +278,7 @@ endif
ifeq ($(BUILD_COMPLEX16),1)
# Double complex
ifeq ($(NOFORTRAN),0)
ifeq ($(NOFORTRAN), $(filter 0 2,$(NOFORTRAN)))
xzcblat1: $(ztestl1o) c_zblat1.o $(TOPDIR)/$(LIBNAME)
$(FC) $(FLDFLAGS) -o xzcblat1 c_zblat1.o $(ztestl1o) $(LIB) $(EXTRALIB) $(CEXTRALIB)
xzcblat2: $(ztestl2o) c_zblat2.o $(TOPDIR)/$(LIBNAME)

View File

@ -969,7 +969,7 @@ real *sfac;
1.17 };
/* Local variables */
extern /* Subroutine */ srottest_();
extern /* Subroutine */ void srottest_();
static integer i__, k, ksize;
extern /* Subroutine */ int stest_(), srotmtest_();
static integer ki, kn;

View File

@ -304,6 +304,15 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
while (gemm_p * min_l > l2size) gemm_p -= GEMM_UNROLL_M;
}
BLASLONG pad_min_l = min_l;
#if defined(HALF)
#if defined(DYNAMIC_ARCH)
pad_min_l = (min_l + gotoblas->sbgemm_align_k - 1) & ~(gotoblas->sbgemm_align_k-1);
#else
pad_min_l = (min_l + SBGEMM_ALIGN_K - 1) & ~(SBGEMM_ALIGN_K - 1);;
#endif
#endif
/* First, we have to move data A to L2 cache */
min_i = m_to - m_from;
l1stride = 1;
@ -350,7 +359,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
START_RPCC();
OCOPY_OPERATION(min_l, min_jj, b, ldb, ls, jjs,
sb + min_l * (jjs - js) * COMPSIZE * l1stride);
sb + pad_min_l * (jjs - js) * COMPSIZE * l1stride);
STOP_RPCC(outercost);
@ -358,10 +367,10 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
#if !defined(XDOUBLE) || !defined(QUAD_PRECISION)
KERNEL_OPERATION(min_i, min_jj, min_l, alpha,
sa, sb + min_l * (jjs - js) * COMPSIZE * l1stride, c, ldc, m_from, jjs);
sa, sb + pad_min_l * (jjs - js) * COMPSIZE * l1stride, c, ldc, m_from, jjs);
#else
KERNEL_OPERATION(min_i, min_jj, min_l, (void *)&xalpha,
sa, sb + min_l * (jjs - js) * COMPSIZE * l1stride, c, ldc, m_from, jjs);
sa, sb + pad_min_l * (jjs - js) * COMPSIZE * l1stride, c, ldc, m_from, jjs);
#endif
STOP_RPCC(kernelcost);

View File

@ -325,6 +325,16 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
if (min_l > GEMM_Q) min_l = (min_l + 1) / 2;
}
BLASLONG pad_min_l = min_l;
#if defined(HALF)
#if defined(DYNAMIC_ARCH)
pad_min_l = (min_l + gotoblas->sbgemm_align_k - 1) & ~(gotoblas->sbgemm_align_k-1);
#else
pad_min_l = (min_l + SBGEMM_ALIGN_K - 1) & ~(SBGEMM_ALIGN_K - 1);;
#endif
#endif
/* Determine step size in m
* Note: We are currently on the first step in m
*/
@ -382,13 +392,13 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
/* Copy part of local region of B into workspace */
START_RPCC();
OCOPY_OPERATION(min_l, min_jj, b, ldb, ls, jjs,
buffer[bufferside] + min_l * (jjs - js) * COMPSIZE * l1stride);
buffer[bufferside] + pad_min_l * (jjs - js) * COMPSIZE * l1stride);
STOP_RPCC(copy_B);
/* Apply kernel with local region of A and part of local region of B */
START_RPCC();
KERNEL_OPERATION(min_i, min_jj, min_l, alpha,
sa, buffer[bufferside] + min_l * (jjs - js) * COMPSIZE * l1stride,
sa, buffer[bufferside] + pad_min_l * (jjs - js) * COMPSIZE * l1stride,
c, ldc, m_from, jjs);
STOP_RPCC(kernel);

View File

@ -470,9 +470,13 @@ blas_queue_t *tscq;
#endif
#ifdef CONSISTENT_FPCSR
#ifdef __aarch64__
__asm__ __volatile__ ("msr fpcr, %0" : : "r" (queue -> sse_mode));
#else
__asm__ __volatile__ ("ldmxcsr %0" : : "m" (queue -> sse_mode));
__asm__ __volatile__ ("fldcw %0" : : "m" (queue -> x87_mode));
#endif
#endif
#ifdef MONITOR
main_status[cpu] = MAIN_RUNNING1;
@ -746,9 +750,13 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){
queue -> position = pos;
#ifdef CONSISTENT_FPCSR
#ifdef __aarch64__
__asm__ __volatile__ ("mrs %0, fpcr" : "=r" (queue -> sse_mode));
#else
__asm__ __volatile__ ("fnstcw %0" : "=m" (queue -> x87_mode));
__asm__ __volatile__ ("stmxcsr %0" : "=m" (queue -> sse_mode));
#endif
#endif
#if defined(OS_LINUX) && !defined(NO_AFFINITY) && !defined(PARAMTEST)

View File

@ -69,6 +69,8 @@
int blas_server_avail = 0;
extern int openblas_omp_adaptive_env();
static void * blas_thread_buffer[MAX_PARALLEL_NUMBER][MAX_CPU_NUMBER];
#ifdef HAVE_C11
static atomic_bool blas_buffer_inuse[MAX_PARALLEL_NUMBER];
@ -98,6 +100,8 @@ static void adjust_thread_buffers() {
void goto_set_num_threads(int num_threads) {
blas_num_threads_set = 1;
if (num_threads < 0) blas_num_threads_set = 0;
if (num_threads < 1) num_threads = blas_num_threads;
if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER;
@ -108,8 +112,6 @@ void goto_set_num_threads(int num_threads) {
blas_cpu_number = num_threads;
omp_set_num_threads(blas_cpu_number);
adjust_thread_buffers();
#if defined(ARCH_MIPS64)
//set parameters for different number of threads.
@ -282,8 +284,12 @@ static void exec_threads(blas_queue_t *queue, int buf_index){
sb = queue -> sb;
#ifdef CONSISTENT_FPCSR
#ifdef __aarch64__
__asm__ __volatile__ ("msr fpcr, %0" : : "r" (queue -> sse_mode));
#else
__asm__ __volatile__ ("ldmxcsr %0" : : "m" (queue -> sse_mode));
__asm__ __volatile__ ("fldcw %0" : : "m" (queue -> x87_mode));
#endif
#endif
if ((sa == NULL) && (sb == NULL) && ((queue -> mode & BLAS_PTHREAD) == 0)) {
@ -381,8 +387,12 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){
#ifdef CONSISTENT_FPCSR
for (i = 0; i < num; i ++) {
#ifdef __aarch64__
__asm__ __volatile__ ("mrs %0, fpcr" : "=r" (queue[i].sse_mode));
#else
__asm__ __volatile__ ("fnstcw %0" : "=m" (queue[i].x87_mode));
__asm__ __volatile__ ("stmxcsr %0" : "=m" (queue[i].sse_mode));
#endif
}
#endif

View File

@ -278,12 +278,15 @@ static DWORD WINAPI blas_thread_server(void *arg){
} else
#endif
if ((queue -> mode & BLAS_PREC) == BLAS_DOUBLE){
#ifdef BUILD_DOUBLE
sb = (void *)(((BLASLONG)sa + ((DGEMM_P * DGEMM_Q * sizeof(double)
+ GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
#endif
} else if ((queue -> mode & BLAS_PREC) == BLAS_SINGLE) {
#ifdef BUILD_SINGLE
sb = (void *)(((BLASLONG)sa + ((SGEMM_P * SGEMM_Q * sizeof(float)
+ GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
#endif
} else {
/* Other types in future */
}
@ -295,11 +298,15 @@ static DWORD WINAPI blas_thread_server(void *arg){
} else
#endif
if ((queue -> mode & BLAS_PREC) == BLAS_DOUBLE){
#ifdef BUILD_COMPLEX16
sb = (void *)(((BLASLONG)sa + ((ZGEMM_P * ZGEMM_Q * 2 * sizeof(double)
+ GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
#endif
} else if ((queue -> mode & BLAS_PREC) == BLAS_SINGLE) {
#ifdef BUILD_COMPLEX
sb = (void *)(((BLASLONG)sa + ((CGEMM_P * CGEMM_Q * 2 * sizeof(float)
+ GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
#endif
} else {
/* Other types in future */
}

View File

@ -1018,7 +1018,7 @@ static gotoblas_t *force_coretype(char *coretype){
char message[128];
//char mname[20];
for ( i=1 ; i <= 24; i++)
for ( i=1 ; i <= 25; i++)
{
if (!strncasecmp(coretype,corename[i],20))
{

View File

@ -125,8 +125,13 @@ extern gotoblas_t gotoblas_THUNDERX2T99;
extern gotoblas_t gotoblas_TSV110;
extern gotoblas_t gotoblas_EMAG8180;
extern gotoblas_t gotoblas_NEOVERSEN1;
#ifndef NO_SVE
extern gotoblas_t gotoblas_NEOVERSEV1;
extern gotoblas_t gotoblas_NEOVERSEN2;
#else
#define gotoblas_NEOVERSEV1 gotoblas_ARMV8
#define gotoblas_NEOVERSEN2 gotoblas_ARMV8
#endif
extern gotoblas_t gotoblas_THUNDERX3T110;
extern gotoblas_t gotoblas_CORTEXA55;
#endif
@ -237,7 +242,7 @@ static gotoblas_t *get_coretype(void) {
p = (char *) NULL ;
infile = fopen("/sys/devices/system/cpu/cpu0/regs/identification/midr_el1","r");
if (!infile) return NULL;
fgets(buffer, sizeof(buffer), infile);
(void)fgets(buffer, sizeof(buffer), infile);
midr_el1=strtoul(buffer,NULL,16);
fclose(infile);
#else
@ -274,10 +279,12 @@ static gotoblas_t *get_coretype(void) {
return &gotoblas_CORTEXA73;
case 0xd0c: // Neoverse N1
return &gotoblas_NEOVERSEN1;
#ifndef NO_SVE
case 0xd49:
return &gotoblas_NEOVERSEN2;
case 0xd40:
return &gotoblas_NEOVERSEV1;
#endif
case 0xd05: // Cortex A55
return &gotoblas_CORTEXA55;
}

View File

@ -38,22 +38,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <sys/resource.h>
#include "common.h"
#if (defined OS_LINUX || defined OS_ANDROID)
#include <asm/hwcap.h>
#include <sys/auxv.h>
#ifndef HWCAP_LOONGSON_CPUCFG
#define HWCAP_LOONGSON_CPUCFG (1 << 14)
#endif
#endif
#ifdef DYNAMIC_LIST
extern gotoblas_t gotoblas_MIPS64_GENERIC;
#ifdef DYN_LOONGSON3R3
extern gotoblas_t gotoblas_LOONGSON3R3;
#else
#define gotoblas_LOONGSON3R3 gotoblas_MIPS64_GENERIC
#endif
#ifdef DYN_LOONGSON3R4
extern gotoblas_t gotoblas_LOONGSON3R4;
#else
#define gotoblas_LOONGSON3R4 gotoblas_MIPS64_GENERIC
#endif
#else
extern gotoblas_t gotoblas_LOONGSON3R3;
extern gotoblas_t gotoblas_LOONGSON3R4;
extern gotoblas_t gotoblas_MIPS64_GENERIC;
#endif
extern void openblas_warning(int verbose, const char * msg);
#define NUM_CORETYPES 2
#define NUM_CORETYPES 3
static char *corename[] = {
"MIPS64_GENERIC"
"loongson3r3",
"loongson3r4",
"UNKNOWN"
};
char *gotoblas_corename(void) {
if (gotoblas == &gotoblas_LOONGSON3R3) return corename[0];
if (gotoblas == &gotoblas_LOONGSON3R4) return corename[1];
if (gotoblas == &gotoblas_MIPS64_GENERIC) return corename[0];
if (gotoblas == &gotoblas_LOONGSON3R3) return corename[1];
if (gotoblas == &gotoblas_LOONGSON3R4) return corename[2];
return corename[NUM_CORETYPES];
}
@ -73,77 +99,32 @@ static gotoblas_t *force_coretype(char *coretype) {
switch (found)
{
case 0: return (&gotoblas_LOONGSON3R3);
case 1: return (&gotoblas_LOONGSON3R4);
case 0: return (&gotoblas_MIPS64_GENERIC);
case 1: return (&gotoblas_LOONGSON3R3);
case 2: return (&gotoblas_LOONGSON3R4);
}
snprintf(message, 128, "Core not found: %s\n", coretype);
openblas_warning(1, message);
return NULL;
}
#if (defined OS_LINUX || defined OS_ANDROID)
#define MMI_MASK 0x00000010
#define MSA_MASK 0x00000020
int fd[2];
int support_cpucfg;
static void handler(int signum)
{
close(fd[1]);
exit(1);
}
/* Brief : Function to check if cpucfg supported on loongson
* Return: 1 supported
* 0 not supported
*/
static int cpucfg_test(void) {
pid_t pid;
int status = 0;
support_cpucfg = 0;
pipe(fd);
pid = fork();
if (pid == 0) { /* Subprocess */
struct sigaction act;
close(fd[0]);
/* Set signal action for SIGILL. */
act.sa_handler = handler;
sigaction(SIGILL,&act,NULL);
/* Execute cpucfg in subprocess. */
__asm__ volatile(
".insn \n\t"
".word (0xc8080118) \n\t"
:::
);
support_cpucfg = 1;
write(fd[1],&support_cpucfg,sizeof(support_cpucfg));
close(fd[1]);
exit(0);
} else if (pid > 0){ /* Parent process*/
close(fd[1]);
if ((waitpid(pid,&status,0) <= 0) ||
(read(fd[0],&support_cpucfg,sizeof(support_cpucfg)) <= 0))
support_cpucfg = 0;
close(fd[0]);
} else {
support_cpucfg = 0;
}
return support_cpucfg;
}
static gotoblas_t *get_coretype_from_cpucfg(void) {
int flag = 0;
__asm__ volatile(
".set push \n\t"
".set noat \n\t"
".insn \n\t"
"dli $8, 0x01 \n\t"
".word (0xc9084918) \n\t"
"usw $9, 0x00(%0) \n\t"
"dli $1, 0x01 \n\t"
".word (0xc8080118) \n\t"
"move %0, $1 \n\t"
".set pop \n\t"
: "=r"(flag)
:
:
: "r"(&flag)
: "memory"
);
if (flag & MSA_MASK)
return (&gotoblas_LOONGSON3R4);
@ -153,7 +134,7 @@ static gotoblas_t *get_coretype_from_cpucfg(void) {
}
static gotoblas_t *get_coretype_from_cpuinfo(void) {
#ifdef linux
#ifdef __linux
FILE *infile;
char buffer[512], *p;
@ -176,17 +157,19 @@ static gotoblas_t *get_coretype_from_cpuinfo(void) {
return NULL;
}
#endif
return NULL;
return NULL;
}
#endif
static gotoblas_t *get_coretype(void) {
int ret = 0;
ret = cpucfg_test();
if (ret == 1)
return get_coretype_from_cpucfg();
else
return get_coretype_from_cpuinfo();
#if (!defined OS_LINUX && !defined OS_ANDROID)
return NULL;
#else
if (!(getauxval(AT_HWCAP) & HWCAP_LOONGSON_CPUCFG))
return get_coretype_from_cpucfg();
else
return get_coretype_from_cpuinfo();
#endif
}
void gotoblas_dynamic_init(void) {
@ -208,9 +191,9 @@ void gotoblas_dynamic_init(void) {
if (gotoblas == NULL)
{
snprintf(coremsg, 128, "Falling back to loongson3r3 core\n");
snprintf(coremsg, 128, "Falling back to MIPS64_GENEIRC\n");
openblas_warning(1, coremsg);
gotoblas = &gotoblas_LOONGSON3R3;
gotoblas = &gotoblas_MIPS64_GENERIC;
}
if (gotoblas && gotoblas->init) {

View File

@ -823,6 +823,8 @@ void gotoblas_affinity_init(void) {
if (numprocs == 0) numprocs = readenv_atoi("OMP_NUM_THREADS");
if (numprocs == 0) numprocs = readenv_atoi("OPENBLAS_DEFAULT_NUM_THREADS");
numnodes = 1;
if (numprocs == 1) {

View File

@ -249,8 +249,11 @@ int get_num_procs(void) {
#if defined(USE_OPENMP)
#if _OPENMP >= 201511
int i,n;
n = 0;
ret = omp_get_num_places();
if (ret >0 ) nums = ret;
if (ret > 0) for (i=0; i<ret;i++) n+= omp_get_place_num_procs(i);
if (n > 0) nums = n;
#endif
return (nums > 0 ? nums : 2);
#endif
@ -419,6 +422,8 @@ This value is equal or large than blas_cpu_number. This means some threads are s
*/
int blas_num_threads = 0;
int blas_num_threads_set = 0;
int goto_get_num_procs (void) {
return blas_cpu_number;
}
@ -1820,8 +1825,11 @@ int get_num_procs(void) {
#if defined(USE_OPENMP)
/* if (omp_get_proc_bind() != omp_proc_bind_false) */
#if _OPENMP >= 201511
int i,n;
n = 0;
ret = omp_get_num_places();
if (ret >0 ) nums = ret;
if (ret > 0) for (i=0;i<ret;i++) n+= omp_get_place_num_procs(i);
if (n > 0) nums = n;
#endif
return (nums > 0 ? nums :2);
#endif
@ -1988,6 +1996,8 @@ This value is equal or large than blas_cpu_number. This means some threads are s
*/
int blas_num_threads = 0;
int blas_num_threads_set = 0;
int goto_get_num_procs (void) {
return blas_cpu_number;
}

View File

@ -283,6 +283,7 @@ The numbers of threads in the thread pool.
This value is equal or large than blas_cpu_number. This means some threads are sleep.
*/
int blas_num_threads = 0;
int blas_num_threads_set = 0;
int goto_get_num_procs (void) {
return blas_cpu_number;

View File

@ -67,10 +67,16 @@ void openblas_read_env() {
openblas_env_thread_timeout=(unsigned int)ret;
ret=0;
if (readenv(p,"OPENBLAS_NUM_THREADS")) ret = atoi(p);
if (readenv(p,"OPENBLAS_DEFAULT_NUM_THREADS")) ret = atoi(p);
if(ret<0) ret=0;
openblas_env_openblas_num_threads=ret;
ret=0;
if (readenv(p,"OPENBLAS_NUM_THREADS")) ret = atoi(p);
if(ret<0) ret=0;
if(ret != 0 || openblas_env_openblas_num_threads == 0)
openblas_env_openblas_num_threads=ret;
ret=0;
if (readenv(p,"GOTO_NUM_THREADS")) ret = atoi(p);
if(ret<0) ret=0;

View File

@ -4000,6 +4000,22 @@ case "$p1" in
no_underscore_objs="$no_underscore_objs $misc_common_objs"
printf 'int main(void){\n'
for obj in $underscore_objs; do
[ "$obj" != "xerbla" ] && printf 'extern void %s%s%s%s();\n' \
"$symbolprefix" "$obj" "$bu" "$symbolsuffix"
done
for obj in $need_2underscore_objs; do
printf 'extern void %s%s%s%s%s();\n' \
"$symbolprefix" "$obj" "$bu" "$bu" "$symbolsuffix"
done
for obj in $no_underscore_objs; do
printf 'extern void %s%s%s();\n' \
"$symbolprefix" "$obj" "$symbolsuffix"
done
printf '\n'
for obj in $underscore_objs; do
[ "$obj" != "xerbla" ] && printf '%s%s%s%s();\n' \
"$symbolprefix" "$obj" "$bu" "$symbolsuffix"

View File

@ -3955,6 +3955,18 @@ if ($ARGV[0] eq "linktest") {
@no_underscore_objs = (@no_underscore_objs, @misc_common_objs);
print "int main(void){\n";
foreach $objs (@underscore_objs) {
print "extern void ", $symbolprefix, $objs, $bu, $symbolsuffix, "();\n" if $objs ne "xerbla";
}
foreach $objs (@need_2underscore_objs) {
print "extern void ", $symbolprefix, $objs, $bu, $bu, $symbolsuffix, "();\n";
}
foreach $objs (@no_underscore_objs) {
print "extern void ", $symbolprefix, $objs, $symbolsuffix, "();\n";
}
foreach $objs (@underscore_objs) {
print $symbolprefix, $objs, $bu, $symbolsuffix, "();\n" if $objs ne "xerbla";
}

View File

@ -82,7 +82,7 @@ else
vendor=FUJITSU
openmp='-Kopenmp'
;;
*Cray*)
*Hewlett*)
vendor=CRAY
openmp='-fopenmp'
;;
@ -102,7 +102,7 @@ else
vendor=FLANG
openmp='-fopenmp'
;;
*ifx*)
*ifort*|*ifx*)
vendor=INTEL
openmp='-fopenmp'
;;
@ -117,6 +117,10 @@ else
esac
fi
;;
*Cray*)
vendor=CRAY
openmp='-fopenmp'
;;
*g95*)
vendor=G95
openmp=''

View File

@ -76,11 +76,6 @@ if ($compiler eq "") {
$vendor = FUJITSU;
$openmp = "-Kopenmp";
} elsif ($data =~ /Cray/) {
$vendor = CRAY;
$openmp = "-fopenmp";
} elsif ($data =~ /GNU/ || $data =~ /GCC/ ) {
$data =~ s/\(+.*?\)+//g;
@ -95,7 +90,7 @@ if ($compiler eq "") {
if ($compiler =~ /flang/) {
$vendor = FLANG;
$openmp = "-fopenmp";
} elsif ($compiler =~ /ifx/) {
} elsif ($compiler =~ /ifort/ || $compiler =~ /ifx/) {
$vendor = INTEL;
$openmp = "-fopenmp";
} elsif ($compiler =~ /pgf/ || $compiler =~ /nvf/) {
@ -106,6 +101,10 @@ if ($compiler eq "") {
$openmp = "";
}
}
} elsif ($data =~ /Cray/) {
$vendor = CRAY;
$openmp = "-fopenmp";
}

View File

@ -131,6 +131,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/* #define FORCE_PPC440 */
/* #define FORCE_PPC440FP2 */
/* #define FORCE_CELL */
/* #define FORCE_MIPS64_GENERIC */
/* #define FORCE_SICORTEX */
/* #define FORCE_LOONGSON3R3 */
/* #define FORCE_LOONGSON3R4 */
@ -146,6 +147,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/* #define FORCE_SPARCV7 */
/* #define FORCE_ZARCH_GENERIC */
/* #define FORCE_Z13 */
/* #define FORCE_EV4 */
/* #define FORCE_EV5 */
/* #define FORCE_EV6 */
/* #define FORCE_GENERIC */
#ifdef FORCE_P2
@ -915,6 +919,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CORENAME "CELL"
#endif
#ifdef FORCE_MIPS64_GENERIC
#define FORCE
#define ARCHITECTURE "MIPS"
#define SUBARCHITECTURE "MIPS64_GENERIC"
#define SUBDIRNAME "mips64"
#define ARCHCONFIG "-DMIPS64_GENERIC " \
"-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \
"-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 "
#define LIBNAME "mips64_generic"
#define CORENAME "MIPS64_GENERIC"
#else
#endif
#ifdef FORCE_SICORTEX
#define FORCE
#define ARCHITECTURE "MIPS"
@ -951,7 +969,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ARCHCONFIG "-DLOONGSON3R4 " \
"-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \
"-DL2_SIZE=512488 -DL2_LINESIZE=32 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 "
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 -DHAVE_MSA"
#define LIBNAME "loongson3r4"
#define CORENAME "LOONGSON3R4"
#else
@ -965,7 +983,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ARCHCONFIG "-DLOONGSON3R5 " \
"-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \
"-DL2_SIZE=1048576 -DL2_LINESIZE=64 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=16 "
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=16 -DHAVE_MSA"
#define LIBNAME "loongson3r5"
#define CORENAME "LOONGSON3R5"
#else
@ -979,7 +997,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ARCHCONFIG "-DLOONGSON2K1000 " \
"-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=16 "
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=16 -DHAVE_MSA"
#define LIBNAME "loongson2k1000"
#define CORENAME "LOONGSON2K1000"
#else
@ -993,7 +1011,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ARCHCONFIG "-DLOONGSONGENERIC " \
"-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=16 "
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=16 -DHAVE_MSA"
#define LIBNAME "loongsongeneric"
#define CORENAME "LOONGSONGENERIC"
#else
@ -1007,7 +1025,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ARCHCONFIG "-DI6400 " \
"-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \
"-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 "
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 -DHAVE_MSA "
#define LIBNAME "i6400"
#define CORENAME "I6400"
#else
@ -1035,7 +1053,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ARCHCONFIG "-DP5600 " \
"-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \
"-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 -DNO_MSA"
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8"
#define LIBNAME "p5600"
#define CORENAME "P5600"
#else
@ -1049,7 +1067,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ARCHCONFIG "-DMIPS1004K " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \
"-DL2_SIZE=262144 -DL2_LINESIZE=32 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 -DNO_MSA"
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8"
#define LIBNAME "mips1004K"
#define CORENAME "MIPS1004K"
#else
@ -1063,7 +1081,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ARCHCONFIG "-DMIPS24K " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \
"-DL2_SIZE=32768 -DL2_LINESIZE=32 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 -DNO_MSA"
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8"
#define LIBNAME "mips24K"
#define CORENAME "MIPS24K"
#else
@ -1077,7 +1095,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ARCHCONFIG "-DI6500 " \
"-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \
"-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 "
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 -DHAVE_MSA"
#define LIBNAME "i6500"
#define CORENAME "I6500"
#else
@ -1392,7 +1410,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"-DL2_SIZE=1048576 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DHAVE_SVE -DARMV8 " \
"-march=armv8.4-a -mtune=neoverse-v1"
"-march=armv8.4-a+sve -mtune=neoverse-v1"
#define LIBNAME "neoversev1"
#define CORENAME "NEOVERSEV1"
#endif
@ -1601,6 +1619,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CORENAME "Z14"
#endif
#ifdef FORCE_EV4
#define FORCE
#define ARCHITECTURE "ALPHA"
#define SUBARCHITECTURE "ev4"
#define ARCHCONFIG "-DEV4 " \
"-DL1_DATA_SIZE=16384 -DL1_DATA_LINESIZE=32 " \
"-DL2_SIZE=2097152 -DL2_LINESIZE=32 " \
"-DDTB_DEFAULT_ENTRIES=32 -DDTB_SIZE=8192 "
#define LIBNAME "ev4"
#define CORENAME "EV4"
#endif
#ifdef FORCE_EV5
#define FORCE
#define ARCHITECTURE "ALPHA"
#define SUBARCHITECTURE "ev5"
#define ARCHCONFIG "-DEV5 " \
"-DL1_DATA_SIZE=16384 -DL1_DATA_LINESIZE=32 " \
"-DL2_SIZE=2097152 -DL2_LINESIZE=64 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=8192 "
#define LIBNAME "ev5"
#define CORENAME "EV5"
#endif
#ifdef FORCE_EV6
#define FORCE
#define ARCHITECTURE "ALPHA"
#define SUBARCHITECTURE "ev6"
#define ARCHCONFIG "-DEV6 " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
"-DL2_SIZE=4194304 -DL2_LINESIZE=64 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=8192 "
#define LIBNAME "ev6"
#define CORENAME "EV6"
#endif
#ifdef FORCE_C910V
#define FORCE
#define ARCHITECTURE "RISCV64"
@ -1777,7 +1831,7 @@ int main(int argc, char *argv[]){
#ifdef FORCE
printf("CORE=%s\n", CORENAME);
#else
#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) || defined(__riscv)
#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) || defined(__riscv) || defined(__alpha__)
printf("CORE=%s\n", get_corename());
#endif
#endif

View File

@ -53,7 +53,7 @@ set(BLAS2_COMPLEX_ONLY_MANGLED_SOURCES
# these do not have separate 'z' sources
set(BLAS3_SOURCES
gemm.c symm.c
trsm.c syrk.c syr2k.c
trsm.c syrk.c syr2k.c gemmt.c
)
set(BLAS3_MANGLED_SOURCES
@ -189,7 +189,16 @@ if (NOT DEFINED NO_LAPACK)
)
GenerateNamedObjects("${LAPACK_SOURCES}")
if (NOT RELAPACK_REPLACE)
GenerateNamedObjects("${LAPACK_MANGLED_SOURCES}" "" "" 0 "" "" 0 3)
else ()
GenerateNamedObjects("lapack/getrs.c" "" "" 0 "" "" 0 3)
GenerateNamedObjects("lapack/getf2.c" "" "" 0 "" "" 0 3)
GenerateNamedObjects("lapack/potf2.c" "" "" 0 "" "" 0 3)
GenerateNamedObjects("lapack/laswp.c" "" "" 0 "" "" 0 3)
GenerateNamedObjects("lapack/lauu2.c" "" "" 0 "" "" 0 3)
GenerateNamedObjects("lapack/trti2.c" "" "" 0 "" "" 0 3)
endif()
endif ()
if ( BUILD_COMPLEX AND NOT BUILD_SINGLE)

View File

@ -44,12 +44,12 @@ SBLAS3OBJS = \
sgemm.$(SUFFIX) ssymm.$(SUFFIX) strmm.$(SUFFIX) \
strsm.$(SUFFIX) ssyrk.$(SUFFIX) ssyr2k.$(SUFFIX) \
somatcopy.$(SUFFIX) simatcopy.$(SUFFIX)\
sgeadd.$(SUFFIX)
sgeadd.$(SUFFIX) sgemmt.$(SUFFIX)
ifeq ($(BUILD_BFLOAT16),1)
SBBLAS1OBJS = sbdot.$(SUFFIX)
SBBLAS2OBJS = sbgemv.$(SUFFIX)
SBBLAS3OBJS = sbgemm.$(SUFFIX)
SBBLAS3OBJS = sbgemm.$(SUFFIX) sbgemmt.$(SUFFIX)
SBEXTOBJS = sbstobf16.$(SUFFIX) sbdtobf16.$(SUFFIX) sbf16tos.$(SUFFIX) dbf16tod.$(SUFFIX)
endif
@ -76,7 +76,7 @@ DBLAS3OBJS = \
dgemm.$(SUFFIX) dsymm.$(SUFFIX) dtrmm.$(SUFFIX) \
dtrsm.$(SUFFIX) dsyrk.$(SUFFIX) dsyr2k.$(SUFFIX) \
domatcopy.$(SUFFIX) dimatcopy.$(SUFFIX)\
dgeadd.$(SUFFIX)
dgeadd.$(SUFFIX) dgemmt.$(SUFFIX)
CBLAS1OBJS = \
caxpy.$(SUFFIX) caxpyc.$(SUFFIX) cswap.$(SUFFIX) \
@ -92,8 +92,9 @@ CBLAS2OBJS = \
cgemv.$(SUFFIX) cgeru.$(SUFFIX) cgerc.$(SUFFIX) \
ctrsv.$(SUFFIX) ctrmv.$(SUFFIX) \
csyr2.$(SUFFIX) cgbmv.$(SUFFIX) \
csbmv.$(SUFFIX) \
cspr2.$(SUFFIX) \
csbmv.$(SUFFIX) cspmv.$(SUFFIX) \
cspr.$(SUFFIX) cspr2.$(SUFFIX) \
csymv.$(SUFFIX) csyr.$(SUFFIX) \
ctbsv.$(SUFFIX) ctbmv.$(SUFFIX) \
ctpsv.$(SUFFIX) ctpmv.$(SUFFIX) \
chemv.$(SUFFIX) chbmv.$(SUFFIX) \
@ -105,7 +106,7 @@ CBLAS3OBJS = \
ctrsm.$(SUFFIX) csyrk.$(SUFFIX) csyr2k.$(SUFFIX) \
chemm.$(SUFFIX) cherk.$(SUFFIX) cher2k.$(SUFFIX) \
comatcopy.$(SUFFIX) cimatcopy.$(SUFFIX)\
cgeadd.$(SUFFIX)
cgeadd.$(SUFFIX) cgemmt.$(SUFFIX)
ZBLAS1OBJS = \
zaxpy.$(SUFFIX) zaxpyc.$(SUFFIX) zswap.$(SUFFIX) \
@ -121,8 +122,9 @@ ZBLAS2OBJS = \
zgemv.$(SUFFIX) zgeru.$(SUFFIX) zgerc.$(SUFFIX) \
ztrsv.$(SUFFIX) ztrmv.$(SUFFIX) \
zsyr2.$(SUFFIX) zgbmv.$(SUFFIX) \
zsbmv.$(SUFFIX) \
zspr2.$(SUFFIX) \
zsbmv.$(SUFFIX) zspmv.$(SUFFIX) \
zspr.$(SUFFIX) zspr2.$(SUFFIX) \
zsymv.$(SUFFIX) zsyr.$(SUFFIX) \
ztbsv.$(SUFFIX) ztbmv.$(SUFFIX) \
ztpsv.$(SUFFIX) ztpmv.$(SUFFIX) \
zhemv.$(SUFFIX) zhbmv.$(SUFFIX) \
@ -134,7 +136,7 @@ ZBLAS3OBJS = \
ztrsm.$(SUFFIX) zsyrk.$(SUFFIX) zsyr2k.$(SUFFIX) \
zhemm.$(SUFFIX) zherk.$(SUFFIX) zher2k.$(SUFFIX) \
zomatcopy.$(SUFFIX) zimatcopy.$(SUFFIX)\
zgeadd.$(SUFFIX)
zgeadd.$(SUFFIX) zgemmt.$(SUFFIX)
ifeq ($(SUPPORT_GEMM3M), 1)
@ -281,12 +283,12 @@ CSBLAS2OBJS = \
CSBLAS3OBJS = \
cblas_sgemm.$(SUFFIX) cblas_ssymm.$(SUFFIX) cblas_strmm.$(SUFFIX) cblas_strsm.$(SUFFIX) \
cblas_ssyrk.$(SUFFIX) cblas_ssyr2k.$(SUFFIX) cblas_somatcopy.$(SUFFIX) cblas_simatcopy.$(SUFFIX)\
cblas_sgeadd.$(SUFFIX)
cblas_sgeadd.$(SUFFIX) cblas_sgemmt.$(SUFFIX)
ifeq ($(BUILD_BFLOAT16),1)
CSBBLAS1OBJS = cblas_sbdot.$(SUFFIX)
CSBBLAS2OBJS = cblas_sbgemv.$(SUFFIX)
CSBBLAS3OBJS = cblas_sbgemm.$(SUFFIX)
CSBBLAS3OBJS = cblas_sbgemm.$(SUFFIX) cblas_sbgemmt.$(SUFFIX)
CSBEXTOBJS = cblas_sbstobf16.$(SUFFIX) cblas_sbdtobf16.$(SUFFIX) cblas_sbf16tos.$(SUFFIX) cblas_dbf16tod.$(SUFFIX)
endif
@ -306,7 +308,7 @@ CDBLAS2OBJS = \
CDBLAS3OBJS += \
cblas_dgemm.$(SUFFIX) cblas_dsymm.$(SUFFIX) cblas_dtrmm.$(SUFFIX) cblas_dtrsm.$(SUFFIX) \
cblas_dsyrk.$(SUFFIX) cblas_dsyr2k.$(SUFFIX) cblas_domatcopy.$(SUFFIX) cblas_dimatcopy.$(SUFFIX) \
cblas_dgeadd.$(SUFFIX)
cblas_dgeadd.$(SUFFIX) cblas_dgemmt.$(SUFFIX)
CCBLAS1OBJS = \
cblas_icamax.$(SUFFIX) cblas_icamin.$(SUFFIX) cblas_scasum.$(SUFFIX) cblas_caxpy.$(SUFFIX) \
@ -331,7 +333,7 @@ CCBLAS3OBJS = \
cblas_csyrk.$(SUFFIX) cblas_csyr2k.$(SUFFIX) \
cblas_chemm.$(SUFFIX) cblas_cherk.$(SUFFIX) cblas_cher2k.$(SUFFIX) \
cblas_comatcopy.$(SUFFIX) cblas_cimatcopy.$(SUFFIX)\
cblas_cgeadd.$(SUFFIX)
cblas_cgeadd.$(SUFFIX) cblas_cgemmt.$(SUFFIX)
CXERBLAOBJ = \
cblas_xerbla.$(SUFFIX)
@ -362,7 +364,7 @@ CZBLAS3OBJS = \
cblas_zsyrk.$(SUFFIX) cblas_zsyr2k.$(SUFFIX) \
cblas_zhemm.$(SUFFIX) cblas_zherk.$(SUFFIX) cblas_zher2k.$(SUFFIX)\
cblas_zomatcopy.$(SUFFIX) cblas_zimatcopy.$(SUFFIX) \
cblas_zgeadd.$(SUFFIX)
cblas_zgeadd.$(SUFFIX) cblas_zgemmt.$(SUFFIX)
ifeq ($(SUPPORT_GEMM3M), 1)
@ -1300,6 +1302,8 @@ xhpr2.$(SUFFIX) xhpr2.$(PSUFFIX) : zhpr2.c
ifeq ($(BUILD_BFLOAT16),1)
sbgemm.$(SUFFIX) sbgemm.$(PSUFFIX) : gemm.c ../param.h
$(CC) -c $(CFLAGS) $< -o $(@F)
sbgemmt.$(SUFFIX) sbgemm.$(PSUFFIX) : gemmt.c ../param.h
$(CC) -c $(CFLAGS) $< -o $(@F)
endif
sgemm.$(SUFFIX) sgemm.$(PSUFFIX) : gemm.c ../param.h
@ -1320,6 +1324,24 @@ zgemm.$(SUFFIX) zgemm.$(PSUFFIX) : gemm.c ../param.h
xgemm.$(SUFFIX) xgemm.$(PSUFFIX) : gemm.c ../param.h
$(CC) -c $(CFLAGS) $< -o $(@F)
sgemmt.$(SUFFIX) sgemmt.$(PSUFFIX) : gemmt.c ../param.h
$(CC) -c $(CFLAGS) $< -o $(@F)
dgemmt.$(SUFFIX) dgemmt.$(PSUFFIX) : gemmt.c ../param.h
$(CC) -c $(CFLAGS) $< -o $(@F)
qgemmt.$(SUFFIX) qgemmt.$(PSUFFIX) : gemmt.c ../param.h
$(CC) -c $(CFLAGS) $< -o $(@F)
cgemmt.$(SUFFIX) cgemmt.$(PSUFFIX) : gemmt.c ../param.h
$(CC) -c $(CFLAGS) $< -o $(@F)
zgemmt.$(SUFFIX) zgemmt.$(PSUFFIX) : gemmt.c ../param.h
$(CC) -c $(CFLAGS) $< -o $(@F)
xgemmt.$(SUFFIX) xgemmt.$(PSUFFIX) : gemmt.c ../param.h
$(CC) -c $(CFLAGS) $< -o $(@F)
ssymm.$(SUFFIX) ssymm.$(PSUFFIX) : symm.c
$(CC) -c $(CFLAGS) $< -o $(@F)
@ -1907,6 +1929,23 @@ cblas_cgemm.$(SUFFIX) cblas_cgemm.$(PSUFFIX) : gemm.c ../param.h
cblas_zgemm.$(SUFFIX) cblas_zgemm.$(PSUFFIX) : gemm.c ../param.h
$(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F)
cblas_sgemmt.$(SUFFIX) cblas_sgemmt.$(PSUFFIX) : gemmt.c ../param.h
$(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F)
ifeq ($(BUILD_BFLOAT16),1)
cblas_sbgemmt.$(SUFFIX) cblas_sbgemmt.$(PSUFFIX) : gemmt.c ../param.h
$(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F)
endif
cblas_dgemmt.$(SUFFIX) cblas_dgemmt.$(PSUFFIX) : gemmt.c ../param.h
$(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F)
cblas_cgemmt.$(SUFFIX) cblas_cgemmt.$(PSUFFIX) : gemmt.c ../param.h
$(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F)
cblas_zgemmt.$(SUFFIX) cblas_zgemmt.$(PSUFFIX) : gemmt.c ../param.h
$(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F)
cblas_ssymm.$(SUFFIX) cblas_ssymm.$(PSUFFIX) : symm.c
$(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F)

589
interface/gemmt.c Normal file
View File

@ -0,0 +1,589 @@
/*********************************************************************/
/* Copyright 2022, The OpenBLAS Project. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/*********************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include "common.h"
#ifdef FUNCTION_PROFILE
#include "functable.h"
#endif
#ifndef COMPLEX
#define SMP_THRESHOLD_MIN 65536.0
#ifdef XDOUBLE
#define ERROR_NAME "QGEMT "
#elif defined(DOUBLE)
#define ERROR_NAME "DGEMT "
#elif defined(BFLOAT16)
#define ERROR_NAME "SBGEMT "
#else
#define ERROR_NAME "SGEMT "
#endif
#else
#define SMP_THRESHOLD_MIN 8192.0
#ifdef XDOUBLE
#define ERROR_NAME "XGEMT "
#elif defined(DOUBLE)
#define ERROR_NAME "ZGEMT "
#else
#define ERROR_NAME "CGEMT "
#endif
#endif
#ifndef GEMM_MULTITHREAD_THRESHOLD
#define GEMM_MULTITHREAD_THRESHOLD 4
#endif
#ifndef CBLAS
void NAME(char *UPLO, char *TRANSA, char *TRANSB,
blasint * M, blasint * N, blasint * K,
FLOAT * Alpha,
IFLOAT * a, blasint * ldA,
IFLOAT * b, blasint * ldB, FLOAT * Beta, FLOAT * c, blasint * ldC)
{
blasint m, n, k;
blasint lda, ldb, ldc;
int transa, transb, uplo;
blasint info;
char transA, transB, Uplo;
IFLOAT *buffer;
IFLOAT *aa, *bb;
FLOAT *cc;
#if defined(COMPLEX)
FLOAT alpha_r, alpha_i, beta_r, beta_i;
#else
FLOAT alpha, beta;
#endif
PRINT_DEBUG_NAME;
m = *M;
n = *N;
k = *K;
#if defined(COMPLEX)
FLOAT *alpha = Alpha;
alpha_r = *(Alpha + 0);
alpha_i = *(Alpha + 1);
beta_r = *(Beta + 0);
beta_i = *(Beta + 1);
#else
alpha = *Alpha;
beta = *Beta;
#endif
lda = *ldA;
ldb = *ldB;
ldc = *ldC;
transA = *TRANSA;
transB = *TRANSB;
Uplo = *UPLO;
TOUPPER(transA);
TOUPPER(transB);
TOUPPER(Uplo);
transa = -1;
transb = -1;
uplo = -1;
if (transA == 'N')
transa = 0;
if (transA == 'T')
transa = 1;
#ifndef COMPLEX
if (transA == 'R')
transa = 0;
if (transA == 'C')
transa = 1;
#else
if (transA == 'R')
transa = 2;
if (transA == 'C')
transa = 3;
#endif
if (transB == 'N')
transb = 0;
if (transB == 'T')
transb = 1;
#ifndef COMPLEX
if (transB == 'R')
transb = 0;
if (transB == 'C')
transb = 1;
#else
if (transB == 'R')
transb = 2;
if (transB == 'C')
transb = 3;
#endif
if (Uplo == 'U')
uplo = 0;
if (Uplo == 'L')
uplo = 1;
info = 0;
if (uplo < 0)
info = 14;
if (ldc < m)
info = 13;
if (k < 0)
info = 5;
if (n < 0)
info = 4;
if (m < 0)
info = 3;
if (transb < 0)
info = 2;
if (transa < 0)
info = 1;
if (info) {
BLASFUNC(xerbla) (ERROR_NAME, &info, sizeof(ERROR_NAME));
return;
}
#else
void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M,
blasint N, blasint k,
#ifndef COMPLEX
FLOAT alpha,
IFLOAT * A, blasint LDA,
IFLOAT * B, blasint LDB, FLOAT beta, FLOAT * c, blasint ldc)
{
#else
void *valpha,
void *va, blasint LDA,
void *vb, blasint LDB, void *vbeta, void *vc, blasint ldc)
{
FLOAT *alpha = (FLOAT *) valpha;
FLOAT *beta = (FLOAT *) vbeta;
FLOAT *A = (FLOAT *) va;
FLOAT *B = (FLOAT *) vb;
FLOAT *c = (FLOAT *) vc;
#endif
FLOAT *aa, *bb, *cc;
int transa, transb, uplo;
blasint info;
blasint m, n, lda, ldb;
FLOAT *a, *b;
XFLOAT *buffer;
PRINT_DEBUG_CNAME;
transa = -1;
transb = -1;
info = 0;
if (order == CblasColMajor) {
if (TransA == CblasNoTrans)
transa = 0;
if (TransA == CblasTrans)
transa = 1;
#ifndef COMPLEX
if (TransA == CblasConjNoTrans)
transa = 0;
if (TransA == CblasConjTrans)
transa = 1;
#else
if (TransA == CblasConjNoTrans)
transa = 2;
if (TransA == CblasConjTrans)
transa = 3;
#endif
if (TransB == CblasNoTrans)
transb = 0;
if (TransB == CblasTrans)
transb = 1;
#ifndef COMPLEX
if (TransB == CblasConjNoTrans)
transb = 0;
if (TransB == CblasConjTrans)
transb = 1;
#else
if (TransB == CblasConjNoTrans)
transb = 2;
if (TransB == CblasConjTrans)
transb = 3;
#endif
m = M;
n = N;
a = (void *)A;
b = (void *)B;
lda = LDA;
ldb = LDB;
info = -1;
if (ldc < m)
info = 13;
if (k < 0)
info = 5;
if (n < 0)
info = 4;
if (m < 0)
info = 3;
if (transb < 0)
info = 2;
if (transa < 0)
info = 1;
}
if (order == CblasRowMajor) {
m = N;
n = M;
a = (void *)B;
b = (void *)A;
lda = LDB;
ldb = LDA;
if (TransB == CblasNoTrans)
transa = 0;
if (TransB == CblasTrans)
transa = 1;
#ifndef COMPLEX
if (TransB == CblasConjNoTrans)
transa = 0;
if (TransB == CblasConjTrans)
transa = 1;
#else
if (TransB == CblasConjNoTrans)
transa = 2;
if (TransB == CblasConjTrans)
transa = 3;
#endif
if (TransA == CblasNoTrans)
transb = 0;
if (TransA == CblasTrans)
transb = 1;
#ifndef COMPLEX
if (TransA == CblasConjNoTrans)
transb = 0;
if (TransA == CblasConjTrans)
transb = 1;
#else
if (TransA == CblasConjNoTrans)
transb = 2;
if (TransA == CblasConjTrans)
transb = 3;
#endif
info = -1;
if (ldc < m)
info = 13;
if (k < 0)
info = 5;
if (n < 0)
info = 4;
if (m < 0)
info = 3;
if (transb < 0)
info = 2;
if (transa < 0)
info = 1;
}
uplo = -1;
if (Uplo == CblasUpper)
uplo = 0;
if (Uplo == CblasLower)
uplo = 1;
if (uplo < 0)
info = 14;
if (info >= 0) {
BLASFUNC(xerbla) (ERROR_NAME, &info, sizeof(ERROR_NAME));
return;
}
#if defined(COMPLEX)
FLOAT alpha_r = *(alpha + 0);
FLOAT alpha_i = *(alpha + 1);
FLOAT beta_r = *(beta + 0);
FLOAT beta_i = *(beta + 1);
#endif
#endif
int buffer_size;
blasint l;
blasint i, j;
#ifdef SMP
int nthreads;
#endif
#if defined(COMPLEX)
#ifdef SMP
static int (*gemv_thread[]) (BLASLONG, BLASLONG, FLOAT *, FLOAT *,
BLASLONG, FLOAT *, BLASLONG, FLOAT *,
BLASLONG, FLOAT *, int) = {
#ifdef XDOUBLE
xgemv_thread_n, xgemv_thread_t, xgemv_thread_r, xgemv_thread_c,
xgemv_thread_o, xgemv_thread_u, xgemv_thread_s,
xgemv_thread_d,
#elif defined DOUBLE
zgemv_thread_n, zgemv_thread_t, zgemv_thread_r, zgemv_thread_c,
zgemv_thread_o, zgemv_thread_u, zgemv_thread_s,
zgemv_thread_d,
#else
cgemv_thread_n, cgemv_thread_t, cgemv_thread_r, cgemv_thread_c,
cgemv_thread_o, cgemv_thread_u, cgemv_thread_s,
cgemv_thread_d,
#endif
};
#endif
int (*gemv[]) (BLASLONG, BLASLONG, BLASLONG, FLOAT, FLOAT, FLOAT *,
BLASLONG, FLOAT *, BLASLONG, FLOAT *, BLASLONG,
FLOAT *) = {
GEMV_N, GEMV_T, GEMV_R, GEMV_C, GEMV_O, GEMV_U, GEMV_S, GEMV_D,};
#else
#ifdef SMP
static int (*gemv_thread[]) (BLASLONG, BLASLONG, FLOAT, FLOAT *,
BLASLONG, FLOAT *, BLASLONG, FLOAT *,
BLASLONG, FLOAT *, int) = {
#ifdef XDOUBLE
qgemv_thread_n, qgemv_thread_t,
#elif defined DOUBLE
dgemv_thread_n, dgemv_thread_t,
#else
sgemv_thread_n, sgemv_thread_t,
#endif
};
#endif
int (*gemv[]) (BLASLONG, BLASLONG, BLASLONG, FLOAT, FLOAT *, BLASLONG,
FLOAT *, BLASLONG, FLOAT *, BLASLONG, FLOAT *) = {
GEMV_N, GEMV_T,};
#endif
if ((m == 0) || (n == 0))
return;
IDEBUG_START;
FUNCTION_PROFILE_START();
const blasint incb = (transb == 0) ? 1 : ldb;
if (uplo == 1) {
for (i = 0; i < n; i++) {
j = n - i;
l = j;
#if defined(COMPLEX)
aa = a + i * 2;
bb = b + i * ldb * 2;
if (transa) {
l = k;
aa = a + lda * i * 2;
bb = b + i * 2;
}
cc = c + i * 2 * ldc + i * 2;
#else
aa = a + i;
bb = b + i * ldb;
if (transa) {
l = k;
aa = a + lda * i;
bb = b + i;
}
cc = c + i * ldc + i;
#endif
#if defined(COMPLEX)
if (beta_r != ONE || beta_i != ZERO)
SCAL_K(l, 0, 0, beta_r, beta_i, cc, 1, NULL, 0,
NULL, 0);
if (alpha_r == ZERO && alpha_i == ZERO)
return;
#else
if (beta != ONE)
SCAL_K(l, 0, 0, beta, cc, 1, NULL, 0, NULL, 0);
if (alpha == ZERO)
continue;
#endif
IDEBUG_START;
FUNCTION_PROFILE_START();
buffer_size = j + k + 128 / sizeof(FLOAT);
#ifdef WINDOWS_ABI
buffer_size += 160 / sizeof(FLOAT);
#endif
// for alignment
buffer_size = (buffer_size + 3) & ~3;
STACK_ALLOC(buffer_size, FLOAT, buffer);
#ifdef SMP
if (1L * j * k < 2304L * GEMM_MULTITHREAD_THRESHOLD)
nthreads = 1;
else
nthreads = num_cpu_avail(2);
if (nthreads == 1) {
#endif
#if defined(COMPLEX)
(gemv[(int)transa]) (j, k, 0, alpha_r, alpha_i,
aa, lda, bb, incb, cc, 1,
buffer);
#else
(gemv[(int)transa]) (j, k, 0, alpha, aa, lda,
bb, incb, cc, 1, buffer);
#endif
#ifdef SMP
} else {
(gemv_thread[(int)transa]) (j, k, alpha, aa,
lda, bb, incb, cc,
1, buffer,
nthreads);
}
#endif
STACK_FREE(buffer);
}
} else {
for (i = 0; i < n; i++) {
j = i + 1;
l = j;
#if defined COMPLEX
bb = b + i * ldb * 2;
if (transa) {
l = k;
bb = b + i * 2;
}
cc = c + i * 2 * ldc;
#else
bb = b + i * ldb;
if (transa) {
l = k;
bb = b + i;
}
cc = c + i * ldc;
#endif
#if defined(COMPLEX)
if (beta_r != ONE || beta_i != ZERO)
SCAL_K(l, 0, 0, beta_r, beta_i, cc, 1, NULL, 0,
NULL, 0);
if (alpha_r == ZERO && alpha_i == ZERO)
return;
#else
if (beta != ONE)
SCAL_K(l, 0, 0, beta, cc, 1, NULL, 0, NULL, 0);
if (alpha == ZERO)
continue;
#endif
IDEBUG_START;
FUNCTION_PROFILE_START();
buffer_size = j + k + 128 / sizeof(FLOAT);
#ifdef WINDOWS_ABI
buffer_size += 160 / sizeof(FLOAT);
#endif
// for alignment
buffer_size = (buffer_size + 3) & ~3;
STACK_ALLOC(buffer_size, FLOAT, buffer);
#ifdef SMP
if (1L * j * k < 2304L * GEMM_MULTITHREAD_THRESHOLD)
nthreads = 1;
else
nthreads = num_cpu_avail(2);
if (nthreads == 1) {
#endif
#if defined(COMPLEX)
(gemv[(int)transa]) (j, k, 0, alpha_r, alpha_i,
a, lda, bb, incb, cc, 1,
buffer);
#else
(gemv[(int)transa]) (j, k, 0, alpha, a, lda, bb,
incb, cc, 1, buffer);
#endif
#ifdef SMP
} else {
(gemv_thread[(int)transa]) (j, k, alpha, a, lda,
bb, incb, cc, 1,
buffer, nthreads);
}
#endif
STACK_FREE(buffer);
}
}
FUNCTION_PROFILE_END(COMPSIZE * COMPSIZE,
args.m * args.k + args.k * args.n +
args.m * args.n, 2 * args.m * args.n * args.k);
IDEBUG_END;
return;
}

View File

@ -44,6 +44,7 @@
#endif
#ifndef COMPLEX
#define SMP_THRESHOLD_MIN 65536.
#ifdef XDOUBLE
#define ERROR_NAME "QSYMM "
#elif defined(DOUBLE)
@ -52,6 +53,7 @@
#define ERROR_NAME "SSYMM "
#endif
#else
#define SMP_THRESHOLD_MIN 8192.
#ifndef GEMM3M
#ifndef HEMM
#ifdef XDOUBLE
@ -91,6 +93,10 @@
#endif
#endif
#ifndef GEMM_MULTITHREAD_THRESHOLD
#define GEMM_MULTITHREAD_THRESHOLD 4
#endif
#ifdef SMP
#ifndef COMPLEX
@ -159,7 +165,9 @@ void NAME(char *SIDE, char *UPLO,
#if defined(SMP) && !defined(NO_AFFINITY)
int nodes;
#endif
# if defined(SMP)
int MN;
#endif
blasint info;
int side;
int uplo;
@ -255,6 +263,9 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo,
#if defined(SMP) && !defined(NO_AFFINITY)
int nodes;
#endif
#if defined(SMP)
int MN;
#endif
PRINT_DEBUG_CNAME;
@ -375,15 +386,18 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo,
#ifdef SMP
args.common = NULL;
args.nthreads = num_cpu_avail(3);
MN = 2.* (double) args.m * (double)args.m * (double) args.n;
if (MN <= (SMP_THRESHOLD_MIN * (double) GEMM_MULTITHREAD_THRESHOLD) ) {
args.nthreads = 1;
} else {
args.nthreads = num_cpu_avail(3);
}
if (args.nthreads == 1) {
#endif
(symm[(side << 1) | uplo ])(&args, NULL, NULL, sa, sb, 0);
#ifdef SMP
} else {
#ifndef NO_AFFINITY

View File

@ -180,6 +180,9 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha,
buffer = (FLOAT *)blas_memory_alloc(1);
#ifdef SMP
if (n <200)
nthreads=1;
else
nthreads = num_cpu_avail(2);
if (nthreads == 1) {

View File

@ -368,6 +368,9 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Tr
mode |= (uplo << BLAS_UPLO_SHIFT);
args.common = NULL;
if (args.n*args.k <1000)
args.nthreads =1 ;
else
args.nthreads = num_cpu_avail(3);
if (args.nthreads == 1) {

View File

@ -44,6 +44,7 @@
#endif
#ifndef COMPLEX
#define SMP_THRESHOLD_MIN 109944.
#ifdef XDOUBLE
#define ERROR_NAME "QSYRK "
#elif defined(DOUBLE)
@ -52,6 +53,7 @@
#define ERROR_NAME "SSYRK "
#endif
#else
#define SMP_THRESHOLD_MIN 14824.
#ifndef HEMM
#ifdef XDOUBLE
#define ERROR_NAME "XSYRK "
@ -71,6 +73,10 @@
#endif
#endif
#ifndef GEMM_MULTITHREAD_THRESHOLD
#define GEMM_MULTITHREAD_THRESHOLD 4
#endif
static int (*syrk[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = {
#ifndef HEMM
SYRK_UN, SYRK_UC, SYRK_LN, SYRK_LC,
@ -101,6 +107,7 @@ void NAME(char *UPLO, char *TRANS,
FLOAT *sa, *sb;
#ifdef SMP
int NNK;
#ifdef USE_SIMPLE_THREADED_LEVEL3
#ifndef COMPLEX
#ifdef XDOUBLE
@ -225,6 +232,8 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Tr
FLOAT *sa, *sb;
#ifdef SMP
int NNK;
#ifdef USE_SIMPLE_THREADED_LEVEL3
#ifndef COMPLEX
#ifdef XDOUBLE
@ -354,18 +363,13 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Tr
#endif
args.common = NULL;
#ifndef COMPLEX
#ifdef DOUBLE
if (args.n < 100)
#else
if (args.n < 200)
#endif
#else
if (args.n < 65)
#endif
NNK = (double)(args.n+1)*(double)args.n*(double)args.k;
if (NNK <= (SMP_THRESHOLD_MIN * GEMM_MULTITHREAD_THRESHOLD)) {
args.nthreads = 1;
else
} else {
args.nthreads = num_cpu_avail(3);
}
if (args.nthreads == 1) {
#endif
@ -373,7 +377,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Tr
(syrk[(uplo << 1) | trans ])(&args, NULL, NULL, sa, sb, 0);
#ifdef SMP
} else {
#ifndef USE_SIMPLE_THREADED_LEVEL3

View File

@ -181,7 +181,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, int n, FLOAT alpha, FLO
alpha_i * x[i * 2 + 0] + alpha_r * x[i * 2 + 1],
x, 1, a, 1, NULL, 0);
}
a += lda;
a += lda * 2;
}
} else {
for (i = 0; i < n; i++){
@ -191,7 +191,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, int n, FLOAT alpha, FLO
alpha_i * x[i * 2 + 0] + alpha_r * x[i * 2 + 1],
x + i * 2, 1, a, 1, NULL, 0);
}
a += 2 + lda;
a += 2 + lda * 2;
}
}
return;

View File

@ -237,8 +237,74 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
if (DGEMMOTCOPY)
GenerateNamedObjects("${KERNELDIR}/${DGEMMOTCOPY}" "DOUBLE" "${DGEMMOTCOPYOBJ}" false "" "" true "DOUBLE")
endif ()
GenerateNamedObjects("${KERNELDIR}/${DGEMM_BETA}" "" "gemm_beta" false "" "" false "DOUBLE")
endif ()
GenerateNamedObjects("${KERNELDIR}/${DGEMM_BETA}" "" "gemm_beta" false "" "" false "DOUBLE")
GenerateNamedObjects("generic/neg_tcopy_${DGEMM_UNROLL_M}.c" "" "neg_tcopy" false "" ${TSUFFIX} false "DOUBLE")
GenerateNamedObjects("generic/laswp_ncopy_${DGEMM_UNROLL_N}.c" "" "laswp_ncopy" false "" ${TSUFFIX} false "DOUBLE")
if (SMALL_MATRIX_OPT)
if (NOT DEFINED DGEMM_SMALL_M_PERMIT)
set(DGEMM_SMALL_M_PERMIT ../generic/gemm_small_matrix_permit.c)
endif ()
if (NOT DEFINED DGEMM_SMALL_K_NN)
set(DGEMM_SMALL_K_NN ../generic/gemm_small_matrix_kernel_nn.c)
endif ()
if (NOT DEFINED DGEMM_SMALL_K_NT)
set(DGEMM_SMALL_K_NT ../generic/gemm_small_matrix_kernel_nt.c)
endif ()
if (NOT DEFINED DGEMM_SMALL_K_TN)
set(DGEMM_SMALL_K_TN ../generic/gemm_small_matrix_kernel_tn.c)
endif ()
if (NOT DEFINED DGEMM_SMALL_K_TT)
set(DGEMM_SMALL_K_TT ../generic/gemm_small_matrix_kernel_tt.c)
endif ()
if (NOT DEFINED DGEMM_SMALL_K_B0_NN)
set(DGEMM_SMALL_K_B0_NN ../generic/gemm_small_matrix_kernel_nn.c)
endif ()
if (NOT DEFINED DGEMM_SMALL_K_B0_NT)
set(DGEMM_SMALL_K_B0_NT ../generic/gemm_small_matrix_kernel_nt.c)
endif ()
if (NOT DEFINED DGEMM_SMALL_K_B0_TN)
set(DGEMM_SMALL_K_B0_TN ../generic/gemm_small_matrix_kernel_tn.c)
endif ()
if (NOT DEFINED DGEMM_SMALL_K_B0_TT)
set(DGEMM_SMALL_K_B0_TT ../generic/gemm_small_matrix_kernel_tt.c)
endif ()
GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_M_PERMIT}" "" "gemm_small_matrix_permit" false "" "" false "DOUBLE")
GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_NN}" "NN" "gemm_small_kernel_nn" false "" "" false "DOUBLE")
GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_NN}" "NR" "gemm_small_kernel_nr" false "" "" false "DOUBLE")
GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_NN}" "RN" "gemm_small_kernel_rn" false "" "" false "DOUBLE")
GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_NN}" "RR" "gemm_small_kernel_rr" false "" "" false "DOUBLE")
GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_NT}" "NT" "gemm_small_kernel_nt" false "" "" false "DOUBLE")
GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_NT}" "NC" "gemm_small_kernel_nc" false "" "" false "DOUBLE")
GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_NT}" "RT" "gemm_small_kernel_rt" false "" "" false "DOUBLE")
GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_NT}" "RC" "gemm_small_kernel_rc" false "" "" false "DOUBLE")
GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_TN}" "TN" "gemm_small_kernel_tn" false "" "" false "DOUBLE")
GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_TN}" "TR" "gemm_small_kernel_tr" false "" "" false "DOUBLE")
GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_TN}" "CN" "gemm_small_kernel_cn" false "" "" false "DOUBLE")
GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_TN}" "CR" "gemm_small_kernel_cr" false "" "" false "DOUBLE")
GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_TT}" "TT" "gemm_small_kernel_tt" false "" "" false "DOUBLE")
GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_TT}" "TC" "gemm_small_kernel_tc" false "" "" false "DOUBLE")
GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_TT}" "CT" "gemm_small_kernel_ct" false "" "" false "DOUBLE")
GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_TT}" "CC" "gemm_small_kernel_cc" false "" "" false "DOUBLE")
GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_B0_NN}" "NN;B0" "gemm_small_kernel_b0_nn" false "" "" false "DOUBLE")
GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_B0_NN}" "NR;B0" "gemm_small_kernel_b0_nr" false "" "" false "DOUBLE")
GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_B0_NN}" "RN;B0" "gemm_small_kernel_b0_rn" false "" "" false "DOUBLE")
GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_B0_NN}" "RR;B0" "gemm_small_kernel_b0_rr" false "" "" false "DOUBLE")
GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_B0_NT}" "NT;B0" "gemm_small_kernel_b0_nt" false "" "" false "DOUBLE")
GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_B0_NT}" "NC;B0" "gemm_small_kernel_b0_nc" false "" "" false "DOUBLE")
GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_B0_NT}" "RT;B0" "gemm_small_kernel_b0_rt" false "" "" false "DOUBLE")
GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_B0_NT}" "RC;B0" "gemm_small_kernel_b0_rc" false "" "" false "DOUBLE")
GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_B0_TN}" "TN;B0" "gemm_small_kernel_b0_tn" false "" "" false "DOUBLE")
GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_B0_TN}" "TR;B0" "gemm_small_kernel_b0_tr" false "" "" false "DOUBLE")
GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_B0_TN}" "CN;B0" "gemm_small_kernel_b0_cn" false "" "" false "DOUBLE")
GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_B0_TN}" "CR;B0" "gemm_small_kernel_b0_cr" false "" "" false "DOUBLE")
GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_B0_TT}" "TT;B0" "gemm_small_kernel_b0_tt" false "" "" false "DOUBLE")
GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_B0_TT}" "TC;B0" "gemm_small_kernel_b0_tc" false "" "" false "DOUBLE")
GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_B0_TT}" "CT;B0" "gemm_small_kernel_b0_ct" false "" "" false "DOUBLE")
GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_B0_TT}" "CC;B0" "gemm_small_kernel_b0_cc" false "" "" false "DOUBLE")
endif ()
endif ()
if ((BUILD_DOUBLE OR BUILD_COMPLEX) AND NOT BUILD_SINGLE)
GenerateNamedObjects("${KERNELDIR}/${SGEMMKERNEL}" "" "gemm_kernel" false "" "" false "SINGLE")
if (SGEMMINCOPY)
@ -825,7 +891,7 @@ endif ()
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEADD_KERNEL}" "" "geadd_k" false "" "" false ${float_type})
endforeach ()
if (BUILD_DOUBLE AND NOT BUILD_SINGLE)
if ((BUILD_DOUBLE OR BUILD_COMPLEX) AND NOT BUILD_SINGLE)
GenerateNamedObjects("${KERNELDIR}/${STRSMKERNEL_LN}" "UPPER;LN;TRSMKERNEL" "trsm_kernel_LN" false "" "" false "SINGLE")
GenerateNamedObjects("${KERNELDIR}/${STRSMKERNEL_LT}" "LT;TRSMKERNEL" "trsm_kernel_LT" false "" "" false "SINGLE")
GenerateNamedObjects("${KERNELDIR}/${STRSMKERNEL_RN}" "UPPER;RN;TRSMKERNEL" "trsm_kernel_RN" false "" "" false "SINGLE")
@ -849,6 +915,45 @@ endif ()
GenerateNamedObjects("generic/trsm_ltcopy_${SGEMM_UNROLL_M}.c" "LOWER" "trsm_iltncopy" false "" "" false "SINGLE")
GenerateNamedObjects("generic/trsm_ltcopy_${SGEMM_UNROLL_N}.c" "OUTER;LOWER;UNIT" "trsm_oltucopy" false "" "" false "SINGLE")
GenerateNamedObjects("generic/trsm_ltcopy_${SGEMM_UNROLL_N}.c" "OUTER;LOWER" "trsm_oltncopy" false "" "" false "SINGLE")
if (SMALL_MATRIX_OPT)
if (NOT DEFINED SGEMM_SMALL_M_PERMIT)
set(SGEMM_SMALL_M_PERMIT ../generic/gemm_small_matrix_permit.c)
endif ()
if (NOT DEFINED SGEMM_SMALL_K_NN)
set(SGEMM_SMALL_K_NN ../generic/gemm_small_matrix_kernel_nn.c)
endif ()
if (NOT DEFINED SGEMM_SMALL_K_NT)
set(SGEMM_SMALL_K_NT ../generic/gemm_small_matrix_kernel_nt.c)
endif ()
if (NOT DEFINED SGEMM_SMALL_K_TN)
set(SGEMM_SMALL_K_TN ../generic/gemm_small_matrix_kernel_tn.c)
endif ()
if (NOT DEFINED SGEMM_SMALL_K_TT)
set(SGEMM_SMALL_K_TT ../generic/gemm_small_matrix_kernel_tt.c)
endif ()
if (NOT DEFINED SGEMM_SMALL_K_B0_NN)
set(SGEMM_SMALL_K_B0_NN ../generic/gemm_small_matrix_kernel_nn.c)
endif ()
if (NOT DEFINED SGEMM_SMALL_K_B0_NT)
set(SGEMM_SMALL_K_B0_NT ../generic/gemm_small_matrix_kernel_nt.c)
endif ()
if (NOT DEFINED SGEMM_SMALL_K_B0_TN)
set(SGEMM_SMALL_K_B0_TN ../generic/gemm_small_matrix_kernel_tn.c)
endif ()
if (NOT DEFINED SGEMM_SMALL_K_B0_TT)
set(SGEMM_SMALL_K_B0_TT ../generic/gemm_small_matrix_kernel_tt.c)
endif ()
GenerateNamedObjects("${KERNELDIR}/${SGEMM_SMALL_M_PERMIT}" "" "gemm_small_matrix_permit" false "" "" false "SINGLE")
GenerateNamedObjects("${KERNELDIR}/${SGEMM_SMALL_K_NN}" "" "gemm_small_kernel_nn" false "" "" false "SINGLE")
GenerateNamedObjects("${KERNELDIR}/${SGEMM_SMALL_K_NT}" "" "gemm_small_kernel_nt" false "" "" false "SINGLE")
GenerateNamedObjects("${KERNELDIR}/${SGEMM_SMALL_K_TN}" "" "gemm_small_kernel_tn" false "" "" false "SINGLE")
GenerateNamedObjects("${KERNELDIR}/${SGEMM_SMALL_K_TT}" "" "gemm_small_kernel_tt" false "" "" false "SINGLE")
GenerateNamedObjects("${KERNELDIR}/${SGEMM_SMALL_K_B0_NN}" "B0" "gemm_small_kernel_b0_nn" false "" "" false "SINGLE")
GenerateNamedObjects("${KERNELDIR}/${SGEMM_SMALL_K_B0_NT}" "B0" "gemm_small_kernel_b0_nt" false "" "" false "SINGLE")
GenerateNamedObjects("${KERNELDIR}/${SGEMM_SMALL_K_B0_TN}" "B0" "gemm_small_kernel_b0_tn" false "" "" false "SINGLE")
GenerateNamedObjects("${KERNELDIR}/${SGEMM_SMALL_K_B0_TT}" "B0" "gemm_small_kernel_b0_tt" false "" "" false "SINGLE")
endif ()
endif ()
# Makefile.LA
@ -878,25 +983,25 @@ endif ()
endforeach()
if (BUILD_COMPLEX AND NOT BUILD_SINGLE)
if (NOT DEFINED SNEG_TCOPY)
set(SNEG_TCOPY ../generic/neg_tcopy_${${float_char}GEMM_UNROLL_M}.c)
set(SNEG_TCOPY ../generic/neg_tcopy_${SGEMM_UNROLL_M}.c)
endif ()
if (NOT DEFINED SLASWP_NCOPY)
set(SLASWP_NCOPY ../generic/laswp_ncopy_${${float_char}GEMM_UNROLL_N}.c)
set(SLASWP_NCOPY ../generic/laswp_ncopy_${SGEMM_UNROLL_N}.c)
endif ()
GenerateNamedObjects("${KERNELDIR}/${SNEG_TCOPY}" "" "neg_tcopy" false "" "" false "SINGLE")
GenerateNamedObjects("${KERNELDIR}/${SLASWP_NCOPY}" "" "laswp_ncopy" false "" "" false "SINGLE")
endif()
if (BUILD_COMPLEX16 AND NOT BUILD_DOUBLE)
if (NOT DEFINED DNEG_TCOPY)
set(DNEG_TCOPY ../generic/neg_tcopy_${${float_char}GEMM_UNROLL_M}.c)
set(DNEG_TCOPY ../generic/neg_tcopy_${DGEMM_UNROLL_M}.c)
endif ()
if (NOT DEFINED DLASWP_NCOPY)
set(DLASWP_NCOPY ../generic/laswp_ncopy_${${float_char}GEMM_UNROLL_N}.c)
set(DLASWP_NCOPY ../generic/laswp_ncopy_${DGEMM_UNROLL_N}.c)
endif ()
GenerateNamedObjects("${KERNELDIR}/${DNEG_TCOPY}_${DGEMM_UNROLL_M}.c" "" "neg_tcopy" false "" "" false "DOUBLE")
GenerateNamedObjects("${KERNELDIR}/${DLASWP_NCOPY}_${DGEMM_UNROLL_N}.c" "" "laswp_ncopy" false "" "" false "DOUBLE")
GenerateNamedObjects("${KERNELDIR}/${DNEG_TCOPY}" "" "neg_tcopy" false "" "" false "DOUBLE")
GenerateNamedObjects("${KERNELDIR}/${DLASWP_NCOPY}" "" "laswp_ncopy" false "" "" false "DOUBLE")
endif()
endif()
@ -979,10 +1084,117 @@ endif ()
endif ()
if (BUILD_COMPLEX16 AND NOT BUILD_DOUBLE)
GenerateNamedObjects("${KERNELDIR}/${DAMAXKERNEL}" "USE_ABS" "amax_k" false "" "" false "DOUBLE")
GenerateNamedObjects("${KERNELDIR}/${DAMINKERNEL}" "USE_ABS;USE_MIN" "amin_k" false "" "" false "DOUBLE")
GenerateNamedObjects("${KERNELDIR}/${DASUMKERNEL}" "" "asum_k" false "" "" false "DOUBLE")
if (DEFINED DMAXKERNEL)
GenerateNamedObjects("${KERNELDIR}/${DMAXKERNEL}" "" "max_k" false "" "" false "DOUBLE")
endif ()
if (DEFINED DMINKERNEL)
GenerateNamedObjects("${KERNELDIR}/${DMINKERNEL}" "USE_MIN" "min_k" false "" "" false "DOUBLE")
endif ()
if (DEFINED IDMINKERNEL)
GenerateNamedObjects("${KERNELDIR}/${IDMINKERNEL}" "USE_MIN" "i*min_k" false "" "" false "DOUBLE")
endif ()
if (DEFINED IDMAXKERNEL)
GenerateNamedObjects("${KERNELDIR}/${IDMAXKERNEL}" "" "i*max_k" false "" "" false "DOUBLE")
endif ()
GenerateNamedObjects("${KERNELDIR}/${IDAMAXKERNEL}" "USE_ABS" "i*amax_k" false "" "" false "DOUBLE")
GenerateNamedObjects("${KERNELDIR}/${IDAMINKERNEL}" "USE_ABS;USE_MIN" "i*amin_k" false "" "" false "DOUBLE")
GenerateNamedObjects("${KERNELDIR}/${DSCALKERNEL}" "" "scal_k" false "" "" false "DOUBLE")
GenerateNamedObjects("${KERNELDIR}/${DCOPYKERNEL}" "C_INTERFACE" "copy_k" false "" "" false "DOUBLE")
GenerateNamedObjects("${KERNELDIR}/${DNRM2KERNEL}" "" "nrm2_k" false "" "" false "DOUBLE")
GenerateNamedObjects("${KERNELDIR}/${DROTKERNEL}" "" "rot_k" false "" "" false "DOUBLE")
GenerateNamedObjects("${KERNELDIR}/${DDOTKERNEL}" "" "dot_k" false "" "" false "DOUBLE")
GenerateNamedObjects("${KERNELDIR}/${DSWAPKERNEL}" "" "swap_k" false "" "" false "DOUBLE")
GenerateNamedObjects("${KERNELDIR}/${DAXPYKERNEL}" "" "axpy_k" false "" "" false "DOUBLE")
GenerateNamedObjects("${KERNELDIR}/${DGEMVNKERNEL}" "" "gemv_n" false "" "" false "DOUBLE")
GenerateNamedObjects("${KERNELDIR}/${DGEMVTKERNEL}" "TRANS" "gemv_t" false "" "" false "DOUBLE")
GenerateNamedObjects("${KERNELDIR}/${DGEMMKERNEL}" "" "gemm_kernel" false "" "" false "DOUBLE")
if (DGEMMINCOPY)
GenerateNamedObjects("${KERNELDIR}/${DGEMMINCOPY}" "DOUBLE" "${DGEMMINCOPYOBJ}" false "" "" true "DOUBLE")
endif ()
if (DGEMMITCOPY)
GenerateNamedObjects("${KERNELDIR}/${DGEMMITCOPY}" "DOUBLE" "${DGEMMITCOPYOBJ}" false "" "" true "DOUBLE")
endif ()
if (DGEMMONCOPY)
GenerateNamedObjects("${KERNELDIR}/${DGEMMONCOPY}" "DOUBLE" "${DGEMMONCOPYOBJ}" false "" "" true "DOUBLE")
endif ()
if (DGEMMOTCOPY)
GenerateNamedObjects("${KERNELDIR}/${DGEMMOTCOPY}" "DOUBLE" "${DGEMMOTCOPYOBJ}" false "" "" true "DOUBLE")
endif ()
GenerateNamedObjects("${KERNELDIR}/${DGEMM_BETA}" "" "gemm_beta" false "" "" false "DOUBLE")
GenerateNamedObjects("generic/neg_tcopy_${DGEMM_UNROLL_M}.c" "" "neg_tcopy" false "" ${TSUFFIX} false "DOUBLE")
GenerateNamedObjects("generic/laswp_ncopy_${DGEMM_UNROLL_N}.c" "" "laswp_ncopy" false "" ${TSUFFIX} false "DOUBLE")
if (SMALL_MATRIX_OPT)
if (NOT DEFINED DGEMM_SMALL_M_PERMIT)
set(DGEMM_SMALL_M_PERMIT ../generic/gemm_small_matrix_permit.c)
endif ()
if (NOT DEFINED DGEMM_SMALL_K_NN)
set(DGEMM_SMALL_K_NN ../generic/gemm_small_matrix_kernel_nn.c)
endif ()
if (NOT DEFINED DGEMM_SMALL_K_NT)
set(DGEMM_SMALL_K_NT ../generic/gemm_small_matrix_kernel_nt.c)
endif ()
if (NOT DEFINED DGEMM_SMALL_K_TN)
set(DGEMM_SMALL_K_TN ../generic/gemm_small_matrix_kernel_tn.c)
endif ()
if (NOT DEFINED DGEMM_SMALL_K_TT)
set(DGEMM_SMALL_K_TT ../generic/gemm_small_matrix_kernel_tt.c)
endif ()
if (NOT DEFINED DGEMM_SMALL_K_B0_NN)
set(DGEMM_SMALL_K_B0_NN ../generic/gemm_small_matrix_kernel_nn.c)
endif ()
if (NOT DEFINED DGEMM_SMALL_K_B0_NT)
set(DGEMM_SMALL_K_B0_NT ../generic/gemm_small_matrix_kernel_nt.c)
endif ()
if (NOT DEFINED DGEMM_SMALL_K_B0_TN)
set(DGEMM_SMALL_K_B0_TN ../generic/gemm_small_matrix_kernel_tn.c)
endif ()
if (NOT DEFINED DGEMM_SMALL_K_B0_TT)
set(DGEMM_SMALL_K_B0_TT ../generic/gemm_small_matrix_kernel_tt.c)
endif ()
GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_M_PERMIT}" "" "gemm_small_matrix_permit" false "" "" false "DOUBLE")
GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_NN}" "NN" "gemm_small_kernel_nn" false "" "" false "DOUBLE")
GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_NN}" "NR" "gemm_small_kernel_nr" false "" "" false "DOUBLE")
GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_NN}" "RN" "gemm_small_kernel_rn" false "" "" false "DOUBLE")
GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_NN}" "RR" "gemm_small_kernel_rr" false "" "" false "DOUBLE")
GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_NT}" "NT" "gemm_small_kernel_nt" false "" "" false "DOUBLE")
GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_NT}" "NC" "gemm_small_kernel_nc" false "" "" false "DOUBLE")
GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_NT}" "RT" "gemm_small_kernel_rt" false "" "" false "DOUBLE")
GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_NT}" "RC" "gemm_small_kernel_rc" false "" "" false "DOUBLE")
GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_TN}" "TN" "gemm_small_kernel_tn" false "" "" false "DOUBLE")
GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_TN}" "TR" "gemm_small_kernel_tr" false "" "" false "DOUBLE")
GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_TN}" "CN" "gemm_small_kernel_cn" false "" "" false "DOUBLE")
GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_TN}" "CR" "gemm_small_kernel_cr" false "" "" false "DOUBLE")
GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_TT}" "TT" "gemm_small_kernel_tt" false "" "" false "DOUBLE")
GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_TT}" "TC" "gemm_small_kernel_tc" false "" "" false "DOUBLE")
GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_TT}" "CT" "gemm_small_kernel_ct" false "" "" false "DOUBLE")
GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_TT}" "CC" "gemm_small_kernel_cc" false "" "" false "DOUBLE")
GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_B0_NN}" "NN;B0" "gemm_small_kernel_b0_nn" false "" "" false "DOUBLE")
GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_B0_NN}" "NR;B0" "gemm_small_kernel_b0_nr" false "" "" false "DOUBLE")
GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_B0_NN}" "RN;B0" "gemm_small_kernel_b0_rn" false "" "" false "DOUBLE")
GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_B0_NN}" "RR;B0" "gemm_small_kernel_b0_rr" false "" "" false "DOUBLE")
GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_B0_NT}" "NT;B0" "gemm_small_kernel_b0_nt" false "" "" false "DOUBLE")
GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_B0_NT}" "NC;B0" "gemm_small_kernel_b0_nc" false "" "" false "DOUBLE")
GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_B0_NT}" "RT;B0" "gemm_small_kernel_b0_rt" false "" "" false "DOUBLE")
GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_B0_NT}" "RC;B0" "gemm_small_kernel_b0_rc" false "" "" false "DOUBLE")
GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_B0_TN}" "TN;B0" "gemm_small_kernel_b0_tn" false "" "" false "DOUBLE")
GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_B0_TN}" "TR;B0" "gemm_small_kernel_b0_tr" false "" "" false "DOUBLE")
GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_B0_TN}" "CN;B0" "gemm_small_kernel_b0_cn" false "" "" false "DOUBLE")
GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_B0_TN}" "CR;B0" "gemm_small_kernel_b0_cr" false "" "" false "DOUBLE")
GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_B0_TT}" "TT;B0" "gemm_small_kernel_b0_tt" false "" "" false "DOUBLE")
GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_B0_TT}" "TC;B0" "gemm_small_kernel_b0_tc" false "" "" false "DOUBLE")
GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_B0_TT}" "CT;B0" "gemm_small_kernel_b0_ct" false "" "" false "DOUBLE")
GenerateNamedObjects("${KERNELDIR}/${DGEMM_SMALL_K_B0_TT}" "CC;B0" "gemm_small_kernel_b0_cc" false "" "" false "DOUBLE")
endif ()
endif ()
if (BUILD_COMPLEX16 AND NOT BUILD_COMPLEX)
if (BUILD_COMPLEX16 AND NOT BUILD_SINGLE)
GenerateNamedObjects("${KERNELDIR}/${SSCALKERNEL}" "" "scal_k" false "" "" false "SINGLE")
endif()
if (BUILD_COMPLEX160 AND NOT BUILD_COMPLEX)
GenerateNamedObjects("${KERNELDIR}/${CAMAXKERNEL}" "USE_ABS" "amax_k" false "" "" false "COMPLEX")
GenerateNamedObjects("${KERNELDIR}/${CAMINKERNEL}" "USE_ABS;USE_MIN" "amin_k" false "" "" false "COMPLEX")
if (DEFINED CMAXKERNEL)
@ -1046,7 +1258,69 @@ endif ()
if (CGEMMOTCOPY)
GenerateNamedObjects("${KERNELDIR}/${CGEMMOTCOPY}" "COMPLEX" "${CGEMMOTCOPYOBJ}" false "" "" true "COMPLEX")
endif ()
GenerateNamedObjects("${KERNELDIR}/${CGEMM_BETA}" "" "gemm_beta" false "" "" false "COMPLEX")
GenerateNamedObjects("${KERNELDIR}/${CGEMM_BETA}" "" "gemm_beta" false "" "" false "COMPLEX")
if (SMALL_MATRIX_OPT)
if (NOT DEFINED CGEMM_SMALL_M_PERMIT)
set(CGEMM_SMALL_M_PERMIT ../generic/zgemm_small_matrix_permit)
endif ()
if (NOT DEFINED CGEMM_SMALL_K_NN)
set(CGEMM_SMALL_K_NN ../generic/zgemm_small_matrix_kernel_nn)
endif ()
if (NOT DEFINED CGEMM_SMALL_K_NT)
set(CGEMM_SMALL_K_NT ../generic/zgemm_small_matrix_kernel_nt)
endif ()
if (NOT DEFINED CGEMM_SMALL_K_TN)
set(CGEMM_SMALL_K_TN ../generic/zgemm_small_matrix_kernel_tn)
endif ()
if (NOT DEFINED CGEMM_SMALL_K_TT)
set(CGEMM_SMALL_K_TT ../generic/zgemm_small_matrix_kernel_tt)
endif ()
if (NOT DEFINED CGEMM_SMALL_K_B0_NN)
set(CGEMM_SMALL_K_B0_NN ../generic/zgemm_small_matrix_kernel_nn)
endif ()
if (NOT DEFINED CGEMM_SMALL_K_B0_NT)
set(CGEMM_SMALL_K_B0_NT ../generic/zgemm_small_matrix_kernel_nt)
endif ()
if (NOT DEFINED CGEMM_SMALL_K_B0_TN)
set(CGEMM_SMALL_K_B0_TN ../generic/zgemm_small_matrix_kernel_tn)
endif ()
if (NOT DEFINED CGEMM_SMALL_K_B0_TT)
set(CGEMM_SMALL_K_B0_TT ../generic/zgemm_small_matrix_kernel_tt)
endif ()
GenerateNamedObjects("${KERNELDIR}/${CGEMM_SMALL_M_PERMIT}.c" "" "gemm_small_matrix_permit" false "" "" false "COMPLEX")
GenerateNamedObjects("${KERNELDIR}/${CGEMM_SMALL_K_NN}.c" "NN" "gemm_small_kernel_nn" false "" "" false "COMPLEX")
GenerateNamedObjects("${KERNELDIR}/${CGEMM_SMALL_K_NN}.c" "NR" "gemm_small_kernel_nr" false "" "" false "COMPLEX")
GenerateNamedObjects("${KERNELDIR}/${CGEMM_SMALL_K_NN}.c" "RN" "gemm_small_kernel_rn" false "" "" false "COMPLEX")
GenerateNamedObjects("${KERNELDIR}/${CGEMM_SMALL_K_NN}.c" "RR" "gemm_small_kernel_rr" false "" "" false "COMPLEX")
GenerateNamedObjects("${KERNELDIR}/${CGEMM_SMALL_K_NT}.c" "NT" "gemm_small_kernel_nt" false "" "" false "COMPLEX")
GenerateNamedObjects("${KERNELDIR}/${CGEMM_SMALL_K_NT}.c" "NC" "gemm_small_kernel_nc" false "" "" false "COMPLEX")
GenerateNamedObjects("${KERNELDIR}/${CGEMM_SMALL_K_NT}.c" "RT" "gemm_small_kernel_rt" false "" "" false "COMPLEX")
GenerateNamedObjects("${KERNELDIR}/${CGEMM_SMALL_K_NT}.c" "RC" "gemm_small_kernel_rc" false "" "" false "COMPLEX")
GenerateNamedObjects("${KERNELDIR}/${CGEMM_SMALL_K_TN}.c" "TN" "gemm_small_kernel_tn" false "" "" false "COMPLEX")
GenerateNamedObjects("${KERNELDIR}/${CGEMM_SMALL_K_TN}.c" "TR" "gemm_small_kernel_tr" false "" "" false "COMPLEX")
GenerateNamedObjects("${KERNELDIR}/${CGEMM_SMALL_K_TN}.c" "CN" "gemm_small_kernel_cn" false "" "" false "COMPLEX")
GenerateNamedObjects("${KERNELDIR}/${CGEMM_SMALL_K_TN}.c" "CR" "gemm_small_kernel_cr" false "" "" false "COMPLEX")
GenerateNamedObjects("${KERNELDIR}/${CGEMM_SMALL_K_TT}.c" "TT" "gemm_small_kernel_tt" false "" "" false "COMPLEX")
GenerateNamedObjects("${KERNELDIR}/${CGEMM_SMALL_K_TT}.c" "TC" "gemm_small_kernel_tc" false "" "" false "COMPLEX")
GenerateNamedObjects("${KERNELDIR}/${CGEMM_SMALL_K_TT}.c" "CT" "gemm_small_kernel_ct" false "" "" false "COMPLEX")
GenerateNamedObjects("${KERNELDIR}/${CGEMM_SMALL_K_TT}.c" "CC" "gemm_small_kernel_cc" false "" "" false "COMPLEX")
GenerateNamedObjects("${KERNELDIR}/${CGEMM_SMALL_K_B0_NN}.c" "NN;B0" "gemm_small_kernel_b0_nn" false "" "" false "COMPLEX")
GenerateNamedObjects("${KERNELDIR}/${CGEMM_SMALL_K_B0_NN}.c" "NR;B0" "gemm_small_kernel_b0_nr" false "" "" false "COMPLEX")
GenerateNamedObjects("${KERNELDIR}/${CGEMM_SMALL_K_B0_NN}.c" "RN;B0" "gemm_small_kernel_b0_rn" false "" "" false "COMPLEX")
GenerateNamedObjects("${KERNELDIR}/${CGEMM_SMALL_K_B0_NN}.c" "RR;B0" "gemm_small_kernel_b0_rr" false "" "" false "COMPLEX")
GenerateNamedObjects("${KERNELDIR}/${CGEMM_SMALL_K_B0_NT}.c" "NT;B0" "gemm_small_kernel_b0_nt" false "" "" false "COMPLEX")
GenerateNamedObjects("${KERNELDIR}/${CGEMM_SMALL_K_B0_NT}.c" "NC;B0" "gemm_small_kernel_b0_nc" false "" "" false "COMPLEX")
GenerateNamedObjects("${KERNELDIR}/${CGEMM_SMALL_K_B0_NT}.c" "RT;B0" "gemm_small_kernel_b0_rt" false "" "" false "COMPLEX")
GenerateNamedObjects("${KERNELDIR}/${CGEMM_SMALL_K_B0_NT}.c" "RC;B0" "gemm_small_kernel_b0_rc" false "" "" false "COMPLEX")
GenerateNamedObjects("${KERNELDIR}/${CGEMM_SMALL_K_B0_TN}.c" "TN;B0" "gemm_small_kernel_b0_tn" false "" "" false "COMPLEX")
GenerateNamedObjects("${KERNELDIR}/${CGEMM_SMALL_K_B0_TN}.c" "TR;B0" "gemm_small_kernel_b0_tr" false "" "" false "COMPLEX")
GenerateNamedObjects("${KERNELDIR}/${CGEMM_SMALL_K_B0_TN}.c" "CN;B0" "gemm_small_kernel_b0_cn" false "" "" false "COMPLEX")
GenerateNamedObjects("${KERNELDIR}/${CGEMM_SMALL_K_B0_TN}.c" "CR;B0" "gemm_small_kernel_b0_cr" false "" "" false "COMPLEX")
GenerateNamedObjects("${KERNELDIR}/${CGEMM_SMALL_K_B0_TT}.c" "TT;B0" "gemm_small_kernel_b0_tt" false "" "" false "COMPLEX")
GenerateNamedObjects("${KERNELDIR}/${CGEMM_SMALL_K_B0_TT}.c" "TC;B0" "gemm_small_kernel_b0_tc" false "" "" false "COMPLEX")
GenerateNamedObjects("${KERNELDIR}/${CGEMM_SMALL_K_B0_TT}.c" "CT;B0" "gemm_small_kernel_b0_ct" false "" "" false "COMPLEX")
GenerateNamedObjects("${KERNELDIR}/${CGEMM_SMALL_K_B0_TT}.c" "CC;B0" "gemm_small_kernel_b0_cc" false "" "" false "COMPLEX")
endif ()
GenerateNamedObjects("generic/ztrsm_uncopy_${CGEMM_UNROLL_M}.c" "UNIT" "trsm_iunucopy" false "" ${TSUFFIX} false "COMPLEX")
GenerateNamedObjects("generic/ztrsm_uncopy_${CGEMM_UNROLL_M}.c" "" "trsm_iunncopy" false "" ${TSUFFIX} false "COMPLEX")
GenerateNamedObjects("generic/ztrsm_uncopy_${CGEMM_UNROLL_N}.c" "OUTER;UNIT" "trsm_ounucopy" false "" ${TSUFFIX} false "COMPLEX")

View File

@ -23,7 +23,7 @@ ifeq ($(C_COMPILER), CLANG)
# Any clang posing as gcc 4.2 should be new enough (3.4 or later)
GCCVERSIONCHECK := $(GCCVERSIONGT4)$(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ2)
ifeq ($(GCCVERSIONCHECK), $(filter $(GCCVERSIONCHECK), 011 110 111))
AVX2OPT = -mavx2
AVX2OPT = -mavx2 -mfma
endif
endif
ifdef NO_AVX2
@ -73,6 +73,8 @@ else ifeq ($(TARGET_CORE), SKYLAKEX)
endif
else ifeq ($(TARGET_CORE), HASWELL)
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) $(AVX2OPT)
else ifeq ($(TARGET_CORE), ZEN)
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) $(AVX2OPT)
else ifeq ($(TARGET_CORE), LOONGSON3R4)
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) $(MSA_FLAGS)
else

View File

@ -207,9 +207,12 @@ ifneq "$(or $(BUILD_SINGLE), $(BUILD_DOUBLE), $(BUILD_COMPLEX))" ""
SBLASOBJS += \
sgemv_n$(TSUFFIX).$(SUFFIX) sgemv_t$(TSUFFIX).$(SUFFIX)
endif
ifneq "$(or $(BUILD_SINGLE), $(BUILD_DOUBLE))" ""
SBLASOBJS += \
ssymv_U$(TSUFFIX).$(SUFFIX) ssymv_L$(TSUFFIX).$(SUFFIX)
endif
ifeq ($(BUILD_SINGLE),1)
SBLASOBJS += \
ssymv_U$(TSUFFIX).$(SUFFIX) ssymv_L$(TSUFFIX).$(SUFFIX) \
sger_k$(TSUFFIX).$(SUFFIX)
endif
ifeq ($(BUILD_DOUBLE),1)
@ -359,8 +362,7 @@ $(KDIR)xgemv_d$(TSUFFIX).$(SUFFIX) $(KDIR)xgemv_d$(TSUFFIX).$(PSUFFIX) : $(KERNE
$(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANS -DCONJ -DXCONJ $< -o $@
ifeq ($(BUILD_SINGLE),1)
ifneq "$(or (BUILD_SINGLE),$(BUILD_DOUBLE))" ""
$(KDIR)ssymv_U$(TSUFFIX).$(SUFFIX) $(KDIR)ssymv_U$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SSYMV_U_KERNEL) $(SSYMV_U_PARAM)
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -ULOWER $< -o $@

View File

@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
#include "version.h"
#define N $16
#define X $17

View File

@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
#include "version.h"
#define PREFETCHSIZE 88

View File

@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
#include "version.h"
#define PREFETCHSIZE 40

View File

@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
#include "version.h"
.set noat
.set noreorder

View File

@ -39,7 +39,7 @@
#define ASSEMBLER
#include "common.h"
#include "version.h"
#define PREFETCH_SIZE 80

View File

@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
#include "version.h"
#define N $16
#define X $17

View File

@ -42,7 +42,7 @@
#define ASSEMBLER
#include "common.h"
#include "version.h"
.globl NAME
.ent NAME

View File

@ -39,7 +39,7 @@
#define ASSEMBLER
#include "common.h"
#include "version.h"
#define PREFETCH_SIZE 80

View File

@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
#include "version.h"
#define PREFETCHSIZE 88

View File

@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
#include "version.h"
.set noat
.set noreorder

View File

@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
#include "version.h"
#if !defined(EV4) && !defined(EV5) && !defined(EV6)
#error "Architecture is not specified."

View File

@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
#include "version.h"
#define STACKSIZE 64
#define PREFETCHSIZE 32

View File

@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
#include "version.h"
#define STACKSIZE 64
#define PREFETCHSIZE 32

View File

@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
#include "version.h"
#define N $16
#define X $17

View File

@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
#include "version.h"
#define N $16
#define X $17

View File

@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
#include "version.h"
#define N $16
#define X $17

View File

@ -36,7 +36,7 @@
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#include "version.h"
.set noat
.set noreorder

View File

@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
#include "version.h"
#define N $16
#define X $17

View File

@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
#include "version.h"
#define N $16
#define X $17

View File

@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
#include "version.h"
#define PREFETCHSIZE 88

View File

@ -39,7 +39,7 @@
#define ASSEMBLER
#include "common.h"
#include "version.h"
#define PREFETCH_SIZE 80

View File

@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
#include "version.h"
#define PREFETCHSIZE 88

View File

@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
#include "version.h"
PROLOGUE
PROFCODE

View File

@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
#include "version.h"
#if !defined(EV4) && !defined(EV5) && !defined(EV6)
#error "Architecture is not specified."

View File

@ -38,7 +38,7 @@
#define ASSEMBLER
#include "common.h"
#include "version.h"
#if !defined(EV4) && !defined(EV5) && !defined(EV6)
#error "Architecture is not specified."

Some files were not shown because too many files have changed in this diff Show More