Merge branch 'xianyi:develop' into issue4130
This commit is contained in:
commit
42909ce57d
|
@ -31,6 +31,15 @@ task:
|
|||
- cmake -DTARGET=VORTEX -DCMAKE_C_COMPILER=clang -DBUILD_SHARED_LIBS=ON ..
|
||||
- make
|
||||
|
||||
task:
|
||||
name: AppleM1/GCC/MAKE/OPENMP
|
||||
compile_script:
|
||||
- brew install gcc@11
|
||||
- export PATH=/opt/homebrew/bin:$PATH
|
||||
- export LDFLAGS="-L/opt/homebrew/lib"
|
||||
- export CPPFLAGS="-I/opt/homebrew/include"
|
||||
- make CC=gcc-11 FC=gfortran-11 USE_OPENMP=1
|
||||
|
||||
macos_instance:
|
||||
image: ghcr.io/cirruslabs/macos-monterey-xcode:latest
|
||||
task:
|
||||
|
|
|
@ -151,40 +151,53 @@ jobs:
|
|||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
msystem: [MINGW64, MINGW32, CLANG64]
|
||||
msystem: [MINGW64, MINGW32, CLANG64, CLANG32]
|
||||
idx: [int32, int64]
|
||||
build-type: [Release]
|
||||
include:
|
||||
- msystem: MINGW64
|
||||
idx: int32
|
||||
target-prefix: mingw-w64-x86_64
|
||||
fc-pkg: mingw-w64-x86_64-gcc-fortran
|
||||
fc-pkg: fc
|
||||
- msystem: MINGW32
|
||||
idx: int32
|
||||
target-prefix: mingw-w64-i686
|
||||
fc-pkg: mingw-w64-i686-gcc-fortran
|
||||
fc-pkg: fc
|
||||
- msystem: CLANG64
|
||||
idx: int32
|
||||
target-prefix: mingw-w64-clang-x86_64
|
||||
fc-pkg: fc
|
||||
# Compiling with Flang 16 seems to cause test errors on machines
|
||||
# with AVX512 instructions. Revisit after MSYS2 distributes Flang 17.
|
||||
no-avx512-flags: -DNO_AVX512=1
|
||||
- msystem: CLANG32
|
||||
idx: int32
|
||||
target-prefix: mingw-w64-clang-i686
|
||||
fc-pkg: cc
|
||||
c-lapack-flags: -DC_LAPACK=ON
|
||||
- msystem: MINGW64
|
||||
idx: int64
|
||||
idx64-flags: -DBINARY=64 -DINTERFACE64=1
|
||||
target-prefix: mingw-w64-x86_64
|
||||
fc-pkg: mingw-w64-x86_64-gcc-fortran
|
||||
fc-pkg: fc
|
||||
- msystem: CLANG64
|
||||
idx: int64
|
||||
idx64-flags: -DBINARY=64 -DINTERFACE64=1
|
||||
target-prefix: mingw-w64-clang-x86_64
|
||||
c-lapack-flags: -DC_LAPACK=ON
|
||||
fc-pkg: fc
|
||||
# Compiling with Flang 16 seems to cause test errors on machines
|
||||
# with AVX512 instructions. Revisit after MSYS2 distributes Flang 17.
|
||||
no-avx512-flags: -DNO_AVX512=1
|
||||
- msystem: MINGW64
|
||||
idx: int32
|
||||
target-prefix: mingw-w64-x86_64
|
||||
fc-pkg: mingw-w64-x86_64-gcc-fortran
|
||||
fc-pkg: fc
|
||||
build-type: None
|
||||
exclude:
|
||||
- msystem: MINGW32
|
||||
idx: int64
|
||||
- msystem: CLANG32
|
||||
idx: int64
|
||||
|
||||
defaults:
|
||||
run:
|
||||
|
@ -209,7 +222,7 @@ jobs:
|
|||
install: >-
|
||||
base-devel
|
||||
${{ matrix.target-prefix }}-cc
|
||||
${{ matrix.fc-pkg }}
|
||||
${{ matrix.target-prefix }}-${{ matrix.fc-pkg }}
|
||||
${{ matrix.target-prefix }}-cmake
|
||||
${{ matrix.target-prefix }}-ninja
|
||||
${{ matrix.target-prefix }}-ccache
|
||||
|
@ -261,6 +274,7 @@ jobs:
|
|||
-DTARGET=CORE2 \
|
||||
${{ matrix.idx64-flags }} \
|
||||
${{ matrix.c-lapack-flags }} \
|
||||
${{ matrix.no-avx512-flags }} \
|
||||
-DCMAKE_C_COMPILER_LAUNCHER=ccache \
|
||||
-DCMAKE_Fortran_COMPILER_LAUNCHER=ccache \
|
||||
..
|
||||
|
@ -280,9 +294,22 @@ jobs:
|
|||
key: ${{ steps.ccache-prepare.outputs.key }}
|
||||
|
||||
- name: Run tests
|
||||
id: run-ctest
|
||||
timeout-minutes: 60
|
||||
run: cd build && ctest
|
||||
|
||||
- name: Re-run tests
|
||||
if: always() && (steps.run-ctest.outcome == 'failure')
|
||||
timeout-minutes: 60
|
||||
run: |
|
||||
cd build
|
||||
echo "::group::Re-run ctest"
|
||||
ctest --rerun-failed --output-on-failure || true
|
||||
echo "::endgroup::"
|
||||
echo "::group::Log from these tests"
|
||||
[ ! -f Testing/Temporary/LastTest.log ] || cat Testing/Temporary/LastTest.log
|
||||
echo "::endgroup::"
|
||||
|
||||
|
||||
cross_build:
|
||||
runs-on: ubuntu-22.04
|
||||
|
|
|
@ -0,0 +1,110 @@
|
|||
name: loongarch64 qemu test
|
||||
|
||||
on: [push, pull_request]
|
||||
|
||||
jobs:
|
||||
TEST:
|
||||
runs-on: ubuntu-latest
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
include:
|
||||
- target: LOONGSONGENERIC
|
||||
triple: loongarch64-unknown-linux-gnu
|
||||
opts: NO_SHARED=1 TARGET=LOONGSONGENERIC
|
||||
- target: LOONGSON3R5
|
||||
triple: loongarch64-unknown-linux-gnu
|
||||
opts: NO_SHARED=1 TARGET=LOONGSON3R5
|
||||
- target: LOONGSON2K1000
|
||||
triple: loongarch64-unknown-linux-gnu
|
||||
opts: NO_SHARED=1 TARGET=LOONGSON2K1000
|
||||
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v3
|
||||
|
||||
- name: Install APT deps
|
||||
run: |
|
||||
sudo add-apt-repository ppa:savoury1/virtualisation
|
||||
sudo apt-get update
|
||||
sudo apt-get install autoconf automake autotools-dev ninja-build make ccache \
|
||||
qemu-user-static
|
||||
|
||||
- name: Download and install loongarch64-toolchain
|
||||
run: |
|
||||
wget https://github.com/loongson/build-tools/releases/download/2022.09.06/loongarch64-clfs-7.3-cross-tools-gcc-glibc.tar.xz
|
||||
tar -xf loongarch64-clfs-7.3-cross-tools-gcc-glibc.tar.xz -C /opt
|
||||
|
||||
- name: Set env
|
||||
run: |
|
||||
echo "LD_LIBRARY_PATH=/opt/cross-tools/target/usr/lib64:/opt/cross-tools/loongarch64-unknown-linux-gnu/lib64:$LD_LIBRARY_PATH" >> $GITHUB_ENV
|
||||
echo "PATH=$GITHUB_WORKSPACE:/opt/cross-tools/bin:$PATH" >> $GITHUB_ENV
|
||||
|
||||
- name: Compilation cache
|
||||
uses: actions/cache@v3
|
||||
with:
|
||||
path: ~/.ccache
|
||||
key: ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }}-${{ github.sha }}
|
||||
restore-keys: |
|
||||
ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }}
|
||||
ccache-${{ runner.os }}-${{ matrix.target }}
|
||||
|
||||
- name: Configure ccache
|
||||
run: |
|
||||
test -d ~/.ccache || mkdir -p ~/.ccache
|
||||
echo "max_size = 300M" > ~/.ccache/ccache.conf
|
||||
echo "compression = true" >> ~/.ccache/ccache.conf
|
||||
ccache -s
|
||||
|
||||
- name: Disable utest dsdot:dsdot_n_1
|
||||
run: |
|
||||
echo -n > utest/test_dsdot.c
|
||||
echo "Due to the qemu versions 7.2 causing utest cases to fail,"
|
||||
echo "the utest dsdot:dsdot_n_1 have been temporarily disabled."
|
||||
|
||||
- name: Build OpenBLAS
|
||||
run: make CC='ccache ${{ matrix.triple }}-gcc -static' FC='ccache ${{ matrix.triple }}-gfortran -static' ${{ matrix.opts }} HOSTCC='ccache gcc' -j$(nproc)
|
||||
|
||||
- name: Test
|
||||
run: |
|
||||
qemu-loongarch64-static ./utest/openblas_utest
|
||||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xscblat1
|
||||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xdcblat1
|
||||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xccblat1
|
||||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xzcblat1
|
||||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xscblat2 < ./ctest/sin2
|
||||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xdcblat2 < ./ctest/din2
|
||||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xccblat2 < ./ctest/cin2
|
||||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xzcblat2 < ./ctest/zin2
|
||||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xscblat3 < ./ctest/sin3
|
||||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xdcblat3 < ./ctest/din3
|
||||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xccblat3 < ./ctest/cin3
|
||||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xzcblat3 < ./ctest/zin3
|
||||
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/sblat1
|
||||
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/dblat1
|
||||
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/cblat1
|
||||
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/zblat1
|
||||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/sblat1
|
||||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/dblat1
|
||||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/cblat1
|
||||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/zblat1
|
||||
rm -f ./test/?BLAT2.SUMM
|
||||
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/sblat2 < ./test/sblat2.dat
|
||||
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/dblat2 < ./test/dblat2.dat
|
||||
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/cblat2 < ./test/cblat2.dat
|
||||
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/zblat2 < ./test/zblat2.dat
|
||||
rm -f ./test/?BLAT2.SUMM
|
||||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/sblat2 < ./test/sblat2.dat
|
||||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/dblat2 < ./test/dblat2.dat
|
||||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/cblat2 < ./test/cblat2.dat
|
||||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/zblat2 < ./test/zblat2.dat
|
||||
rm -f ./test/?BLAT3.SUMM
|
||||
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/sblat3 < ./test/sblat3.dat
|
||||
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/dblat3 < ./test/dblat3.dat
|
||||
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/cblat3 < ./test/cblat3.dat
|
||||
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/zblat3 < ./test/zblat3.dat
|
||||
rm -f ./test/?BLAT3.SUMM
|
||||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/sblat3 < ./test/sblat3.dat
|
||||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/dblat3 < ./test/dblat3.dat
|
||||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/cblat3 < ./test/cblat3.dat
|
||||
OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/zblat3 < ./test/zblat3.dat
|
|
@ -72,6 +72,7 @@ test/SBLAT3.SUMM
|
|||
test/ZBLAT2.SUMM
|
||||
test/ZBLAT3.SUMM
|
||||
test/SHBLAT3.SUMM
|
||||
test/SBBLAT3.SUMM
|
||||
test/cblat1
|
||||
test/cblat2
|
||||
test/cblat3
|
||||
|
@ -82,6 +83,7 @@ test/sblat1
|
|||
test/sblat2
|
||||
test/sblat3
|
||||
test/test_shgemm
|
||||
test/test_sbgemm
|
||||
test/zblat1
|
||||
test/zblat2
|
||||
test/zblat3
|
||||
|
|
|
@ -7,7 +7,7 @@ pipeline {
|
|||
stages {
|
||||
stage('Build') {
|
||||
steps {
|
||||
sh 'make'
|
||||
sh 'make clean && make'
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -9,7 +9,7 @@ pipeline {
|
|||
steps {
|
||||
sh 'sudo apt update'
|
||||
sh 'sudo apt install gfortran -y'
|
||||
sh 'make'
|
||||
sh 'make clean && make'
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -384,6 +384,11 @@ GCCMINORVERSIONGTEQ4 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d
|
|||
GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 7)
|
||||
endif
|
||||
|
||||
ifeq ($(C_COMPILER), CLANG)
|
||||
CLANGVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9)
|
||||
CLANGVERSIONGTEQ12 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 12)
|
||||
endif
|
||||
|
||||
#
|
||||
# OS dependent settings
|
||||
#
|
||||
|
@ -668,6 +673,7 @@ DYNAMIC_CORE += NEOVERSEN1
|
|||
ifneq ($(NO_SVE), 1)
|
||||
DYNAMIC_CORE += NEOVERSEV1
|
||||
DYNAMIC_CORE += NEOVERSEN2
|
||||
DYNAMIC_CORE += ARMV8SVE
|
||||
endif
|
||||
DYNAMIC_CORE += CORTEXA55
|
||||
DYNAMIC_CORE += FALKOR
|
||||
|
@ -1086,8 +1092,9 @@ endif
|
|||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(F_COMPILER), GFORTRAN)
|
||||
ifeq ($(F_COMPILER), $(filter $(F_COMPILER),GFORTRAN FLANGNEW))
|
||||
CCOMMON_OPT += -DF_INTERFACE_GFORT
|
||||
ifeq ($(F_COMPILER), GFORTRAN)
|
||||
FCOMMON_OPT += -Wall
|
||||
# make single-threaded LAPACK calls thread-safe #1847
|
||||
FCOMMON_OPT += -frecursive
|
||||
|
@ -1101,6 +1108,7 @@ EXTRALIB += -lgfortran
|
|||
endif
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
ifdef NO_BINARY_MODE
|
||||
ifeq ($(ARCH), $(filter $(ARCH),mips64))
|
||||
ifdef BINARY64
|
||||
|
@ -1767,6 +1775,8 @@ export TARGET_CORE
|
|||
export NO_AVX512
|
||||
export NO_AVX2
|
||||
export BUILD_BFLOAT16
|
||||
export NO_LSX
|
||||
export NO_LASX
|
||||
|
||||
export SBGEMM_UNROLL_M
|
||||
export SBGEMM_UNROLL_N
|
||||
|
|
|
@ -87,6 +87,19 @@ ifneq ($(F_COMPILER), NAG)
|
|||
FCOMMON_OPT += -march=skylake-avx512
|
||||
endif
|
||||
endif
|
||||
else ifeq ($(C_COMPILER), CLANG)
|
||||
# cooperlake support was added in clang 9
|
||||
ifeq ($(CLANGVERSIONGTEQ9), 1)
|
||||
CCOMMON_OPT += -march=cooperlake
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=cooperlake
|
||||
endif
|
||||
else # not supported in clang, fallback to avx512
|
||||
CCOMMON_OPT += -march=skylake-avx512
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=skylake-avx512
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
ifeq ($(OSNAME), CYGWIN_NT)
|
||||
CCOMMON_OPT += -fno-asynchronous-unwind-tables
|
||||
|
@ -116,6 +129,19 @@ ifneq ($(F_COMPILER), NAG)
|
|||
FCOMMON_OPT += -march=skylake-avx512
|
||||
endif
|
||||
endif
|
||||
else ifeq ($(C_COMPILER), CLANG)
|
||||
# cooperlake support was added in clang 12
|
||||
ifeq ($(CLANGVERSIONGTEQ12), 1)
|
||||
CCOMMON_OPT += -march=cooperlake
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=cooperlake
|
||||
endif
|
||||
else # not supported in clang, fallback to avx512
|
||||
CCOMMON_OPT += -march=skylake-avx512
|
||||
ifneq ($(F_COMPILER), NAG)
|
||||
FCOMMON_OPT += -march=skylake-avx512
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
ifeq ($(OSNAME), CYGWIN_NT)
|
||||
CCOMMON_OPT += -fno-asynchronous-unwind-tables
|
||||
|
|
|
@ -271,6 +271,19 @@ jobs:
|
|||
- script: |
|
||||
make TARGET=ARMV7 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1
|
||||
|
||||
- job: OSX_xbuild_DYNAMIC_ARM64
|
||||
pool:
|
||||
vmImage: 'macOS-11'
|
||||
variables:
|
||||
CC: /Applications/Xcode_12.5.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
|
||||
CFLAGS: -O2 -Wno-macro-redefined -isysroot /Applications/Xcode_12.5.1.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX11.3.sdk -arch arm64
|
||||
steps:
|
||||
- script: |
|
||||
ls /Applications/Xcode_12.5.1.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs
|
||||
/Applications/Xcode_12.5.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang -arch arm64 --print-supported-cpus
|
||||
/Applications/Xcode_11.7.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang --version
|
||||
make TARGET=ARMV8 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1
|
||||
|
||||
- job: ALPINE_MUSL
|
||||
pool:
|
||||
vmImage: 'ubuntu-latest'
|
||||
|
|
38
c_check
38
c_check
|
@ -185,6 +185,37 @@ if [ "$architecture" = "mips" ] || [ "$architecture" = "mips64" ]; then
|
|||
rm -rf "$tmpd"
|
||||
fi
|
||||
|
||||
no_lsx=0
|
||||
no_lasx=0
|
||||
if [ "$architecture" = "loongarch64" ]; then
|
||||
tmpd="$(mktemp -d)"
|
||||
tmplsx="$tmpd/lsx.c"
|
||||
codelsx='"vadd.b $vr0, $vr0, $vr0"'
|
||||
lsx_flags='-march=loongarch64 -mlsx'
|
||||
printf "#include <lsxintrin.h>\n\n" >> "$tmplsx"
|
||||
printf "void main(void){ __asm__ volatile(%s);}\n" "$codelsx" >> "$tmplsx"
|
||||
args="$lsx_flags -o $tmplsx.o $tmplsx"
|
||||
{
|
||||
$compiler_name $flags $args >/dev/null 2>&1
|
||||
} || {
|
||||
no_lsx=1
|
||||
}
|
||||
|
||||
tmplasx="$tmpd/lasx.c"
|
||||
codelasx='"xvadd.b $xr0, $xr0, $xr0"'
|
||||
lasx_flags='-march=loongarch64 -mlasx'
|
||||
printf "#include <lasxintrin.h>\n\n" >> "$tmplasx"
|
||||
printf "void main(void){ __asm__ volatile(%s);}\n" "$codelasx" >> "$tmplasx"
|
||||
args="$lasx_flags -o $tmplasx.o $tmplasx"
|
||||
{
|
||||
$compiler_name $flags $args >/dev/null 2>&1
|
||||
} || {
|
||||
no_lasx=1
|
||||
}
|
||||
|
||||
rm -rf "$tmpd"
|
||||
fi
|
||||
|
||||
case "$data" in
|
||||
*ARCH_X86_64*) architecture=x86_64 ;;
|
||||
*ARCH_X86*) architecture=x86 ;;
|
||||
|
@ -252,6 +283,9 @@ if [ "$architecture" = "arm64" ]; then
|
|||
no_sve=0
|
||||
{
|
||||
$compiler_name $flags $args >/dev/null 2>&1
|
||||
} || {
|
||||
args=" -Msve_intrinsics -c -o $tmpf.o $tmpf"
|
||||
$compiler_name $flags $args >/dev/null 2>&1
|
||||
} || {
|
||||
no_sve=1
|
||||
}
|
||||
|
@ -399,6 +433,8 @@ done
|
|||
[ "$no_avx512" -eq 1 ] && printf "NO_AVX512=1\n"
|
||||
[ "$no_avx2" -eq 1 ] && printf "NO_AVX2=1\n"
|
||||
[ "$oldgcc" -eq 1 ] && printf "OLDGCC=1\n"
|
||||
[ "$no_lsx" -eq 1 ] && printf "NO_LSX=1\n"
|
||||
[ "$no_lasx" -eq 1 ] && printf "NO_LASX=1\n"
|
||||
} >> "$makefile"
|
||||
|
||||
os=`echo "$os" | tr '[[:lower:]]' '[[:upper:]]'/ `
|
||||
|
@ -414,6 +450,8 @@ compiler=`echo "$compiler" | tr '[[:lower:]]' '[[:upper:]]' `
|
|||
[ -n "$need_fu" ] && printf "#define FUNDERSCORE\t%s\n" "$need_fu"
|
||||
[ "$no_msa" -eq 1 ] && printf "#define NO_MSA\t1\n"
|
||||
[ "$c11_atomics" -eq 1 ] && printf "#define HAVE_C11\t1\n"
|
||||
[ "$no_lsx" -eq 1 ] && printf "#define NO_LSX\t1\n"
|
||||
[ "$no_lasx" -eq 1 ] && printf "#define NO_LASX\t1\n"
|
||||
} >> "$config"
|
||||
|
||||
|
||||
|
|
45
c_check.pl
45
c_check.pl
|
@ -232,6 +232,47 @@ if (($architecture eq "mips") || ($architecture eq "mips64")) {
|
|||
}
|
||||
}
|
||||
|
||||
$no_lsx = 0;
|
||||
$no_lasx = 0;
|
||||
if (($architecture eq "loongarch64")) {
|
||||
eval "use File::Temp qw(tempfile)";
|
||||
if ($@){
|
||||
warn "could not load PERL module File::Temp, so could not check LSX and LASX capatibility";
|
||||
} else {
|
||||
$tmplsx = new File::Temp( SUFFIX => '.c' , UNLINK => 1 );
|
||||
$codelsx = '"vadd.b $vr0, $vr0, $vr0"';
|
||||
$lsx_flags = "-march=loongarch64 -mlsx";
|
||||
print $tmplsx "#include <lsxintrin.h>\n\n";
|
||||
print $tmplsx "void main(void){ __asm__ volatile($codelsx); }\n";
|
||||
|
||||
$args = "$lsx_flags -o $tmplsx.o $tmplsx";
|
||||
my @cmd = ("$compiler_name $flags $args >/dev/null 2>/dev/null");
|
||||
system(@cmd) == 0;
|
||||
if ($? != 0) {
|
||||
$no_lsx = 1;
|
||||
} else {
|
||||
$no_lsx = 0;
|
||||
}
|
||||
unlink("$tmplsx.o");
|
||||
|
||||
$tmplasx = new File::Temp( SUFFIX => '.c' , UNLINK => 1 );
|
||||
$codelasx = '"xvadd.b $xr0, $xr0, $xr0"';
|
||||
$lasx_flags = "-march=loongarch64 -mlasx";
|
||||
print $tmplasx "#include <lasxintrin.h>\n\n";
|
||||
print $tmplasx "void main(void){ __asm__ volatile($codelasx); }\n";
|
||||
|
||||
$args = "$lasx_flags -o $tmplasx.o $tmplasx";
|
||||
my @cmd = ("$compiler_name $flags $args >/dev/null 2>/dev/null");
|
||||
system(@cmd) == 0;
|
||||
if ($? != 0) {
|
||||
$no_lasx = 1;
|
||||
} else {
|
||||
$no_lasx = 0;
|
||||
}
|
||||
unlink("$tmplasx.o");
|
||||
}
|
||||
}
|
||||
|
||||
$architecture = x86 if ($data =~ /ARCH_X86/);
|
||||
$architecture = x86_64 if ($data =~ /ARCH_X86_64/);
|
||||
$architecture = e2k if ($data =~ /ARCH_E2K/);
|
||||
|
@ -424,6 +465,8 @@ print MAKEFILE "NO_RV64GV=1\n" if $no_rv64gv eq 1;
|
|||
print MAKEFILE "NO_AVX512=1\n" if $no_avx512 eq 1;
|
||||
print MAKEFILE "NO_AVX2=1\n" if $no_avx2 eq 1;
|
||||
print MAKEFILE "OLDGCC=1\n" if $oldgcc eq 1;
|
||||
print MAKEFILE "NO_LSX=1\n" if $no_lsx eq 1;
|
||||
print MAKEFILE "NO_LASX=1\n" if $no_lasx eq 1;
|
||||
|
||||
$os =~ tr/[a-z]/[A-Z]/;
|
||||
$architecture =~ tr/[a-z]/[A-Z]/;
|
||||
|
@ -437,6 +480,8 @@ print CONFFILE "#define __64BIT__\t1\n" if $binformat eq bin64;
|
|||
print CONFFILE "#define FUNDERSCORE\t$need_fu\n" if $need_fu ne "";
|
||||
print CONFFILE "#define HAVE_MSA\t1\n" if $have_msa eq 1;
|
||||
print CONFFILE "#define HAVE_C11\t1\n" if $c11_atomics eq 1;
|
||||
print CONFFILE "#define NO_LSX\t1\n" if $no_lsx eq 1;
|
||||
print CONFFILE "#define NO_LASX\t1\n" if $no_lasx eq 1;
|
||||
|
||||
|
||||
if ($os eq "LINUX") {
|
||||
|
|
|
@ -46,7 +46,7 @@ if (DYNAMIC_ARCH)
|
|||
if (ARM64)
|
||||
set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA55 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110)
|
||||
if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 9.99)
|
||||
set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2)
|
||||
set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2 ARMV8SVE)
|
||||
endif ()
|
||||
if (DYNAMIC_LIST)
|
||||
set(DYNAMIC_CORE ARMV8 ${DYNAMIC_LIST})
|
||||
|
@ -135,7 +135,7 @@ if (ARM64)
|
|||
set(BINARY_DEFINED 1)
|
||||
endif ()
|
||||
|
||||
if (${ARCH} STREQUAL "riscv64")
|
||||
if (RISCV64)
|
||||
set(NO_BINARY_MODE 1)
|
||||
set(BINARY_DEFINED 1)
|
||||
endif ()
|
||||
|
|
|
@ -180,6 +180,9 @@ endif ()
|
|||
|
||||
if (${CORE} STREQUAL NEOVERSEN2)
|
||||
if (NOT DYNAMIC_ARCH)
|
||||
if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE)
|
||||
set (CCOMMON_OPT "${CCOMMON_OPT} -Msve_intrinsics -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2")
|
||||
else ()
|
||||
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
|
||||
if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4)
|
||||
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2")
|
||||
|
@ -188,9 +191,13 @@ if (${CORE} STREQUAL NEOVERSEN2)
|
|||
endif()
|
||||
endif ()
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
if (${CORE} STREQUAL NEOVERSEV1)
|
||||
if (NOT DYNAMIC_ARCH)
|
||||
if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE)
|
||||
set (CCOMMON_OPT "${CCOMMON_OPT} -Msve_intrinsics -march=armv8.4-a+sve -mtune=neoverse-v1")
|
||||
else ()
|
||||
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
|
||||
if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4)
|
||||
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.4-a+sve -mtune=neoverse-v1")
|
||||
|
@ -199,6 +206,7 @@ if (${CORE} STREQUAL NEOVERSEV1)
|
|||
endif()
|
||||
endif()
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
if (${CORE} STREQUAL NEOVERSEN1)
|
||||
if (NOT DYNAMIC_ARCH)
|
||||
|
@ -213,9 +221,13 @@ endif ()
|
|||
|
||||
if (${CORE} STREQUAL ARMV8SVE)
|
||||
if (NOT DYNAMIC_ARCH)
|
||||
if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE)
|
||||
set (CCOMMON_OPT "${CCOMMON_OPT} -Msve_intrinsics -march=armv8-a+sve")
|
||||
else ()
|
||||
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve")
|
||||
endif ()
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
if (${CORE} STREQUAL CORTEXA510)
|
||||
if (NOT DYNAMIC_ARCH)
|
||||
|
|
|
@ -3,7 +3,8 @@
|
|||
## Description: Ported from portion of OpenBLAS/Makefile.system
|
||||
## Sets Fortran related variables.
|
||||
|
||||
if (${F_COMPILER} STREQUAL "FLANG")
|
||||
if (${F_COMPILER} STREQUAL "FLANG" AND NOT CMAKE_Fortran_COMPILER_ID STREQUAL "LLVMFlang")
|
||||
# This is for classic Flang. LLVM Flang is handled with gfortran below.
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_FLANG")
|
||||
if (BINARY64 AND INTERFACE64)
|
||||
set(FCOMMON_OPT "${FCOMMON_OPT} -i8")
|
||||
|
@ -38,16 +39,18 @@ if (${F_COMPILER} STREQUAL "G95")
|
|||
endif ()
|
||||
endif ()
|
||||
|
||||
if (${F_COMPILER} STREQUAL "GFORTRAN" OR ${F_COMPILER} STREQUAL "F95")
|
||||
if (${F_COMPILER} STREQUAL "GFORTRAN" OR ${F_COMPILER} STREQUAL "F95" OR CMAKE_Fortran_COMPILER_ID STREQUAL "LLVMFlang")
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_GFORT")
|
||||
if (NOT CMAKE_Fortran_COMPILER_ID STREQUAL "LLVMFlang")
|
||||
# ensure reentrancy of lapack codes
|
||||
set(FCOMMON_OPT "${FCOMMON_OPT} -Wall -frecursive")
|
||||
# work around ABI violation in passing string arguments from C
|
||||
set(FCOMMON_OPT "${FCOMMON_OPT} -fno-optimize-sibling-calls")
|
||||
#Don't include -lgfortran, when NO_LAPACK=1 or lsbcc
|
||||
if (NOT NO_LAPACK)
|
||||
# Don't include -lgfortran, when NO_LAPACK=1 or lsbcc
|
||||
set(EXTRALIB "${EXTRALIB} -lgfortran")
|
||||
endif ()
|
||||
endif ()
|
||||
if (NO_BINARY_MODE)
|
||||
if (MIPS64)
|
||||
if (BINARY64)
|
||||
|
@ -63,6 +66,13 @@ if (${F_COMPILER} STREQUAL "GFORTRAN" OR ${F_COMPILER} STREQUAL "F95")
|
|||
set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp32")
|
||||
endif ()
|
||||
endif ()
|
||||
if (RISCV64)
|
||||
if (BINARY64)
|
||||
if (INTERFACE64)
|
||||
set(FCOMMON_OPT "${FCOMMON_OPT} -fdefault-integer-8")
|
||||
endif ()
|
||||
endif ()
|
||||
endif ()
|
||||
else ()
|
||||
if (BINARY64)
|
||||
set(FCOMMON_OPT "${FCOMMON_OPT} -m64")
|
||||
|
|
|
@ -282,6 +282,9 @@ if (DEFINED TARGET)
|
|||
endif()
|
||||
|
||||
if (${TARGET} STREQUAL NEOVERSEV1)
|
||||
if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE)
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -Msve_intrinsics -march=armv8.4-a+sve -mtune=neoverse-v1")
|
||||
else ()
|
||||
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
|
||||
if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4)
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv8.4-a+sve -mtune=neoverse-v1")
|
||||
|
@ -289,7 +292,11 @@ if (${TARGET} STREQUAL NEOVERSEV1)
|
|||
message(FATAL_ERROR "Compiler ${CMAKE_C_COMPILER} ${GCC_VERSION} does not support Neoverse V1.")
|
||||
endif()
|
||||
endif()
|
||||
endif()
|
||||
if (${TARGET} STREQUAL NEOVERSEN2)
|
||||
if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE)
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -Msve-intrinsics -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2")
|
||||
else ()
|
||||
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
|
||||
if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4)
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2")
|
||||
|
@ -297,9 +304,14 @@ if (${TARGET} STREQUAL NEOVERSEV1)
|
|||
message(FATAL_ERROR "Compiler $${CMAKE_C_COMPILER} {GCC_VERSION} does not support Neoverse N2.")
|
||||
endif()
|
||||
endif()
|
||||
endif()
|
||||
if (${TARGET} STREQUAL ARMV8SVE)
|
||||
if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE)
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -Msve-intrinsics -march=armv8.2-a+sve")
|
||||
else ()
|
||||
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv8.2-a+sve")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
endif()
|
||||
|
||||
|
|
|
@ -44,6 +44,8 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "mips64.*")
|
|||
set(MIPS64 1)
|
||||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "loongarch64.*")
|
||||
set(LOONGARCH64 1)
|
||||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "riscv64.*")
|
||||
set(RISCV64 1)
|
||||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*")
|
||||
if (NOT BINARY)
|
||||
if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
|
||||
|
@ -60,7 +62,7 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*")
|
|||
endif()
|
||||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "i686.*|i386.*|x86.*|amd64.*|AMD64.*")
|
||||
set(X86 1)
|
||||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64.*|AARCH64.*|arm64.*|ARM64.*)")
|
||||
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64.*|AARCH64.*|arm64.*|ARM64.*|armv8.*)")
|
||||
if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
|
||||
set(ARM64 1)
|
||||
else()
|
||||
|
@ -107,7 +109,7 @@ else()
|
|||
endif ()
|
||||
|
||||
if (NOT BINARY)
|
||||
if (X86_64 OR ARM64 OR POWER OR MIPS64 OR LOONGARCH64)
|
||||
if (X86_64 OR ARM64 OR POWER OR MIPS64 OR LOONGARCH64 OR RISCV64)
|
||||
set(BINARY 64)
|
||||
else ()
|
||||
set(BINARY 32)
|
||||
|
|
|
@ -53,7 +53,6 @@ extern void goto_set_num_threads(int nthreads);
|
|||
/* Global Parameter */
|
||||
extern int blas_cpu_number;
|
||||
extern int blas_num_threads;
|
||||
extern int blas_num_threads_set;
|
||||
extern int blas_omp_linked;
|
||||
|
||||
#define BLAS_LEGACY 0x8000U
|
||||
|
@ -136,15 +135,13 @@ typedef struct blas_queue {
|
|||
#ifdef SMP_SERVER
|
||||
|
||||
extern int blas_server_avail;
|
||||
extern int blas_omp_number_max;
|
||||
|
||||
static __inline int num_cpu_avail(int level) {
|
||||
|
||||
#ifdef USE_OPENMP
|
||||
int openmp_nthreads;
|
||||
if (blas_num_threads_set == 0)
|
||||
openmp_nthreads=omp_get_max_threads();
|
||||
else
|
||||
openmp_nthreads=blas_cpu_number;
|
||||
#endif
|
||||
|
||||
#ifndef USE_OPENMP
|
||||
|
@ -156,6 +153,12 @@ int openmp_nthreads;
|
|||
) return 1;
|
||||
|
||||
#ifdef USE_OPENMP
|
||||
if (openmp_nthreads > blas_omp_number_max){
|
||||
#ifdef DEBUG
|
||||
fprintf(stderr,"WARNING - more OpenMP threads requested (%d) than available (%d)\n",openmp_nthreads,blas_omp_number_max);
|
||||
#endif
|
||||
openmp_nthreads = blas_omp_number_max;
|
||||
}
|
||||
if (blas_cpu_number != openmp_nthreads) {
|
||||
goto_set_num_threads(openmp_nthreads);
|
||||
}
|
||||
|
|
|
@ -32,6 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
**********************************************************************************/
|
||||
|
||||
#include <stdint.h>
|
||||
#include <sys/auxv.h>
|
||||
|
||||
/* If LASX extension instructions supported,
|
||||
* using core LOONGSON3R5
|
||||
|
@ -46,9 +47,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define CPU_LOONGSON3R5 1
|
||||
#define CPU_LOONGSON2K1000 2
|
||||
|
||||
#define LOONGARCH_CFG2 0x02
|
||||
#define LOONGARCH_LASX 1<<7
|
||||
#define LOONGARCH_LSX 1<<6
|
||||
#define LA_HWCAP_LSX (1<<4)
|
||||
#define LA_HWCAP_LASX (1<<5)
|
||||
|
||||
static char *cpuname[] = {
|
||||
"LOONGSONGENERIC",
|
||||
|
@ -64,17 +64,11 @@ static char *cpuname_lower[] = {
|
|||
|
||||
int detect(void) {
|
||||
#ifdef __linux
|
||||
uint32_t reg = 0;
|
||||
int flag = (int)getauxval(AT_HWCAP);
|
||||
|
||||
__asm__ volatile (
|
||||
"cpucfg %0, %1 \n\t"
|
||||
: "+&r"(reg)
|
||||
: "r"(LOONGARCH_CFG2)
|
||||
);
|
||||
|
||||
if (reg & LOONGARCH_LASX)
|
||||
if (flag & LA_HWCAP_LASX)
|
||||
return CPU_LOONGSON3R5;
|
||||
else if (reg & LOONGARCH_LSX)
|
||||
else if (flag & LA_HWCAP_LSX)
|
||||
return CPU_LOONGSON2K1000;
|
||||
else
|
||||
return CPU_GENERIC;
|
||||
|
|
|
@ -1551,6 +1551,7 @@ int get_cpuname(void){
|
|||
case 7: // Raptor Lake
|
||||
case 10:
|
||||
case 15:
|
||||
case 14: // Alder Lake N
|
||||
if(support_avx2())
|
||||
return CPUTYPE_HASWELL;
|
||||
if(support_avx())
|
||||
|
@ -2360,6 +2361,7 @@ int get_coretype(void){
|
|||
case 7: // Raptor Lake
|
||||
case 10:
|
||||
case 15:
|
||||
case 14: // Alder Lake N
|
||||
#ifndef NO_AVX2
|
||||
if(support_avx2())
|
||||
return CORE_HASWELL;
|
||||
|
|
|
@ -208,7 +208,7 @@ FLDFLAGS = $(FFLAGS:-fPIC=) $(LDFLAGS)
|
|||
ifeq ($(USE_OPENMP), 1)
|
||||
ifeq ($(F_COMPILER), GFORTRAN)
|
||||
ifeq ($(C_COMPILER), CLANG)
|
||||
CEXTRALIB = -lomp
|
||||
CEXTRALIB += -lomp
|
||||
endif
|
||||
endif
|
||||
ifeq ($(F_COMPILER), NAG)
|
||||
|
|
|
@ -0,0 +1,270 @@
|
|||
# Guidance for redistributing OpenBLAS
|
||||
|
||||
*We note that this document contains recommendations only - packagers and other
|
||||
redistributors are in charge of how OpenBLAS is built and distributed in their
|
||||
systems, and may have good reasons to deviate from the guidance given on this
|
||||
page. These recommendations are aimed at general packaging systems, with a user
|
||||
base that typically is large, open source (or freely available at least), and
|
||||
doesn't behave uniformly or that the packager is directly connected with.*
|
||||
|
||||
OpenBLAS has a large number of build-time options which can be used to change
|
||||
how it behaves at runtime, how artifacts or symbols are named, etc. Variation
|
||||
in build configuration can be necessary to achieve a given end goal within a
|
||||
distribution or as an end user. However, such variation can also make it more
|
||||
difficult to build on top of OpenBLAS and ship code or other packages in a way
|
||||
that works across many different distros. Here we provide guidance about the
|
||||
most important build options, what effects they may have when changed, and
|
||||
which ones to default to.
|
||||
|
||||
The Make and CMake build systems provide equivalent options and yield more or
|
||||
less the same artifacts, but not exactly (the CMake builds are still
|
||||
experimental). You can choose either one and the options will function in the
|
||||
same way, however the CMake outputs may require some renaming. To review
|
||||
available build options, see `Makefile.rule` or `CMakeLists.txt` in the root of
|
||||
the repository.
|
||||
|
||||
Build options typically fall into two categories: (a) options that affect the
|
||||
user interface, such as library and symbol names or APIs that are made
|
||||
available, and (b) options that affect performance and runtime behavior, such
|
||||
as threading behavior or CPU architecture-specific code paths. The user
|
||||
interface options are more important to keep aligned between distributions,
|
||||
while for the performance-related options there are typically more reasons to
|
||||
make choices that deviate from the defaults.
|
||||
|
||||
Here are recommendations for user interface related packaging choices where it
|
||||
is not likely to be a good idea to deviate (typically these are the default
|
||||
settings):
|
||||
|
||||
1. Include CBLAS. The CBLAS interface is widely used and it doesn't affect
|
||||
binary size much, so don't turn it off.
|
||||
2. Include LAPACK and LAPACKE. The LAPACK interface is also widely used, and
|
||||
while it does make up a significant part of the binary size of the installed
|
||||
library, that does not outweigh the regression in usability when deviating
|
||||
from the default here.[^1]
|
||||
3. Always distribute the pkg-config (`.pc`) and CMake (`.cmake`) dependency
|
||||
detection files. These files are used by build systems when users want to
|
||||
link against OpenBLAS, and there is no benefit of leaving them out.
|
||||
4. Provide the LP64 interface by default, and if in addition to that you choose
|
||||
to provide an ILP64 interface build as well, use a symbol suffix to avoid
|
||||
symbol name clashes (see the next section).
|
||||
|
||||
[^1]: All major distributions do include LAPACK as of mid 2023 as far as we
|
||||
know. Older versions of Arch Linux did not, and that was known to cause
|
||||
problems.
|
||||
|
||||
|
||||
## ILP64 interface builds
|
||||
|
||||
The LP64 (32-bit integer) interface is the default build, and has
|
||||
well-established C and Fortran APIs as determined by the reference (Netlib)
|
||||
BLAS and LAPACK libraries. The ILP64 (64-bit integer) interface however does
|
||||
not have a standard API: symbol names and shared/static library names can be
|
||||
produced in multiple ways, and this tends to make it difficult to use.
|
||||
As of today there is an agreed-upon way of choosing names for OpenBLAS between
|
||||
a number of key users/redistributors, which is the closest thing to a standard
|
||||
that there is now. However, there is an ongoing standardization effort in the
|
||||
reference BLAS and LAPACK libraries, which differs from the current OpenBLAS
|
||||
agreed-upon convention. In this section we'll aim to explain both.
|
||||
|
||||
Those two methods are fairly similar, and have a key thing in common: *using a
|
||||
symbol suffix*. This is good practice; if you distribute an ILP64 build, it is
|
||||
recommended to have it use a symbol suffix containing `64` in the name.
|
||||
This avoids potential symbol clashes when different packages which depend on
|
||||
OpenBLAS load both an LP64 and an ILP64 library into memory at the same time.
|
||||
|
||||
### The current OpenBLAS agreed-upon ILP64 convention
|
||||
|
||||
This convention comprises the shared library name and the symbol suffix in the
|
||||
shared library. The symbol suffix to use is `64_`, implying that the library
|
||||
name will be `libopenblas64_.so` and the symbols in that library end in `64_`.
|
||||
The central issue where this was discussed is
|
||||
[openblas#646](https://github.com/xianyi/OpenBLAS/issues/646), and adopters
|
||||
include Fedora, Julia, NumPy and SciPy - SuiteSparse already used it as well.
|
||||
|
||||
To build shared and static libraries with the currently recommended ILP64
|
||||
conventions with Make:
|
||||
```bash
|
||||
$ make INTERFACE64=1 SYMBOLSUFFIX=64_
|
||||
```
|
||||
|
||||
This will produce libraries named `libopenblas64_.so|a`, a pkg-config file
|
||||
named `openblas64.pc`, and CMake and header files.
|
||||
|
||||
Installing locally and inspecting the output will show a few more details:
|
||||
```bash
|
||||
$ make install PREFIX=$PWD/../openblas/make64 INTERFACE64=1 SYMBOLSUFFIX=64_
|
||||
$ tree . # output slightly edited down
|
||||
.
|
||||
├── include
|
||||
│ ├── cblas.h
|
||||
│ ├── f77blas.h
|
||||
│ ├── lapacke_config.h
|
||||
│ ├── lapacke.h
|
||||
│ ├── lapacke_mangling.h
|
||||
│ ├── lapacke_utils.h
|
||||
│ ├── lapack.h
|
||||
│ └── openblas_config.h
|
||||
└── lib
|
||||
├── cmake
|
||||
│ └── openblas
|
||||
│ ├── OpenBLASConfig.cmake
|
||||
│ └── OpenBLASConfigVersion.cmake
|
||||
├── libopenblas64_.a
|
||||
├── libopenblas64_.so
|
||||
└── pkgconfig
|
||||
└── openblas64.pc
|
||||
```
|
||||
|
||||
A key point is the symbol names. These will equal the LP64 symbol names, then
|
||||
(for Fortran only) the compiler mangling, and then the `64_` symbol suffix.
|
||||
Hence to obtain the final symbol names, we need to take into account which
|
||||
Fortran compiler we are using. For the most common cases (e.g., gfortran, Intel
|
||||
Fortran, or Flang), that means appending a single underscore. In that case, the
|
||||
result is:
|
||||
|
||||
| base API name | binary symbol name | call from Fortran code | call from C code |
|
||||
|---------------|--------------------|------------------------|-----------------------|
|
||||
| `dgemm` | `dgemm_64_` | `dgemm_64(...)` | `dgemm_64_(...)` |
|
||||
| `cblas_dgemm` | `cblas_dgemm64_` | n/a | `cblas_dgemm64_(...)` |
|
||||
|
||||
It is quite useful to have these symbol names be as uniform as possible across
|
||||
different packaging systems.
|
||||
|
||||
The equivalent build options with CMake are:
|
||||
```bash
|
||||
$ mkdir build && cd build
|
||||
$ cmake .. -DINTERFACE64=1 -DSYMBOLSUFFIX=64_ -DBUILD_SHARED_LIBS=ON -DBUILD_STATIC_LIBS=ON
|
||||
$ cmake --build . -j
|
||||
```
|
||||
|
||||
Note that the result is not 100% identical to the Make result. For example, the
|
||||
library name ends in `_64` rather than `64_` - it is recommended to rename them
|
||||
to match the Make library names (also update the `libsuffix` entry in
|
||||
`openblas64.pc` to match that rename).
|
||||
```bash
|
||||
$ cmake --install . --prefix $PWD/../../openblas/cmake64
|
||||
$ tree .
|
||||
.
|
||||
├── include
|
||||
│ └── openblas64
|
||||
│ ├── cblas.h
|
||||
│ ├── f77blas.h
|
||||
│ ├── lapacke_config.h
|
||||
│ ├── lapacke_example_aux.h
|
||||
│ ├── lapacke.h
|
||||
│ ├── lapacke_mangling.h
|
||||
│ ├── lapacke_utils.h
|
||||
│ ├── lapack.h
|
||||
│ ├── openblas64
|
||||
│ │ └── lapacke_mangling.h
|
||||
│ └── openblas_config.h
|
||||
└── lib
|
||||
├── cmake
|
||||
│ └── OpenBLAS64
|
||||
│ ├── OpenBLAS64Config.cmake
|
||||
│ ├── OpenBLAS64ConfigVersion.cmake
|
||||
│ ├── OpenBLAS64Targets.cmake
|
||||
│ └── OpenBLAS64Targets-noconfig.cmake
|
||||
├── libopenblas_64.a
|
||||
├── libopenblas_64.so -> libopenblas_64.so.0
|
||||
└── pkgconfig
|
||||
└── openblas64.pc
|
||||
```
|
||||
|
||||
|
||||
### The upcoming standardized ILP64 convention
|
||||
|
||||
While the `64_` convention above got some adoption, it's slightly hacky and is
|
||||
implemented through the use of `objcopy`. An effort is ongoing for a more
|
||||
broadly adopted convention in the reference BLAS and LAPACK libraries, using
|
||||
(a) the `_64` suffix, and (b) applying that suffix _before_ rather than after
|
||||
Fortran compiler mangling. The central issue for this is
|
||||
[lapack#666](https://github.com/Reference-LAPACK/lapack/issues/666).
|
||||
|
||||
For the most common cases of compiler mangling (a single `_` appended), the end
|
||||
result will be:
|
||||
|
||||
| base API name | binary symbol name | call from Fortran code | call from C code |
|
||||
|---------------|--------------------|------------------------|-----------------------|
|
||||
| `dgemm` | `dgemm_64_` | `dgemm_64(...)` | `dgemm_64_(...)` |
|
||||
| `cblas_dgemm` | `cblas_dgemm_64` | n/a | `cblas_dgemm_64(...)` |
|
||||
|
||||
For other compiler mangling schemes, replace the trailing `_` by the scheme in use.
|
||||
|
||||
The shared library name for this `_64` convention should be `libopenblas_64.so`.
|
||||
|
||||
Note: it is not yet possible to produce an OpenBLAS build which employs this
|
||||
convention! Once reference BLAS and LAPACK with support for `_64` have been
|
||||
released, a future OpenBLAS release will support it. For now, please use the
|
||||
older `64_` scheme and avoid using the name `libopenblas_64.so`; it should be
|
||||
considered reserved for future use of the `_64` standard as prescribed by
|
||||
reference BLAS/LAPACK.
|
||||
|
||||
|
||||
## Performance and runtime behavior related build options
|
||||
|
||||
For these options there are multiple reasonable or common choices.
|
||||
|
||||
### Threading related options
|
||||
|
||||
OpenBLAS can be built as a multi-threaded or single-threaded library, with the
|
||||
default being multi-threaded. It's expected that the default `libopenblas`
|
||||
library is multi-threaded; if you'd like to also distribute single-threaded
|
||||
builds, consider naming them `libopenblas_sequential`.
|
||||
|
||||
OpenBLAS can be built with pthreads or OpenMP as the threading model, with the
|
||||
default being pthreads. Both options are commonly used, and the choice here
|
||||
should not influence the shared library name. The choice will be captured by
|
||||
the `.pc` file. E.g.:
|
||||
```bash
|
||||
$ pkg-config --libs openblas
|
||||
-fopenmp -lopenblas
|
||||
|
||||
$ cat openblas.pc
|
||||
...
|
||||
openblas_config= ... USE_OPENMP=1 MAX_THREADS=24
|
||||
```
|
||||
|
||||
The maximum number of threads users will be able to use is determined at build
|
||||
time by the `NUM_THREADS` build option. It defaults to 24, and there's a wide
|
||||
range of values that are reasonable to use (up to 256). 64 is a typical choice
|
||||
here; there is a memory footprint penalty that is linear in `NUM_THREADS`.
|
||||
Please see `Makefile.rule` for more details.
|
||||
|
||||
### CPU architecture related options
|
||||
|
||||
OpenBLAS contains a lot of CPU architecture-specific optimizations, hence when
|
||||
distributing to a user base with a variety of hardware, it is recommended to
|
||||
enable CPU architecture runtime detection. This will dynamically select
|
||||
optimized kernels for individual APIs. To do this, use the `DYNAMIC_ARCH=1`
|
||||
build option. This is usually done on all common CPU families, except when
|
||||
there are known issues.
|
||||
|
||||
In case the CPU architecture is known (e.g. you're building binaries for macOS
|
||||
M1 users), it is possible to specify the target architecture directly with the
|
||||
`TARGET=` build option.
|
||||
|
||||
`DYNAMIC_ARCH` and `TARGET` are covered in more detail in the main `README.md`
|
||||
in this repository.
|
||||
|
||||
|
||||
## Real-world examples
|
||||
|
||||
OpenBLAS is likely to be distributed in one of these distribution models:
|
||||
|
||||
1. As a standalone package, or multiple packages, in a packaging ecosystem like
|
||||
a Linux distro, Homebrew, conda-forge or MSYS2.
|
||||
2. Vendored as part of a larger package, e.g. in Julia, NumPy, SciPy, or R.
|
||||
3. Locally, e.g. making available as a build on a single HPC cluster.
|
||||
|
||||
The guidance on this page is most important for models (1) and (2). These links
|
||||
to build recipes for a representative selection of packaging systems may be
|
||||
helpful as a reference:
|
||||
|
||||
- [Fedora](https://src.fedoraproject.org/rpms/openblas/blob/rawhide/f/openblas.spec)
|
||||
- [Debian](https://salsa.debian.org/science-team/openblas/-/blob/master/debian/rules)
|
||||
- [Homebrew](https://github.com/Homebrew/homebrew-core/blob/HEAD/Formula/openblas.rb)
|
||||
- [MSYS2](https://github.com/msys2/MINGW-packages/blob/master/mingw-w64-openblas/PKGBUILD)
|
||||
- [conda-forge](https://github.com/conda-forge/openblas-feedstock/blob/main/recipe/build.sh)
|
||||
- [NumPy/SciPy](https://github.com/MacPython/openblas-libs/blob/main/tools/build_openblas.sh)
|
||||
- [Nixpkgs](https://github.com/NixOS/nixpkgs/blob/master/pkgs/development/libraries/science/math/openblas/default.nix)
|
|
@ -973,7 +973,7 @@ void goto_set_num_threads(int num_threads) {
|
|||
|
||||
increased_threads = 1;
|
||||
|
||||
for(i = blas_num_threads - 1; i < num_threads - 1; i++){
|
||||
for(i = (blas_num_threads > 0) ? blas_num_threads - 1 : 0; i < num_threads - 1; i++){
|
||||
|
||||
atomic_store_queue(&thread_status[i].queue, (blas_queue_t *)0);
|
||||
thread_status[i].status = THREAD_STATUS_WAKEUP;
|
||||
|
|
|
@ -68,6 +68,7 @@
|
|||
#endif
|
||||
|
||||
int blas_server_avail = 0;
|
||||
int blas_omp_number_max = 0;
|
||||
|
||||
extern int openblas_omp_adaptive_env();
|
||||
|
||||
|
@ -100,8 +101,6 @@ static void adjust_thread_buffers() {
|
|||
|
||||
void goto_set_num_threads(int num_threads) {
|
||||
|
||||
blas_num_threads_set = 1;
|
||||
if (num_threads < 0) blas_num_threads_set = 0;
|
||||
if (num_threads < 1) num_threads = blas_num_threads;
|
||||
|
||||
if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER;
|
||||
|
@ -125,6 +124,8 @@ void openblas_set_num_threads(int num_threads) {
|
|||
}
|
||||
|
||||
int blas_thread_init(void){
|
||||
if(blas_omp_number_max <= 0)
|
||||
blas_omp_number_max = omp_get_max_threads();
|
||||
|
||||
blas_get_cpu_number();
|
||||
|
||||
|
|
|
@ -568,7 +568,7 @@ void goto_set_num_threads(int num_threads)
|
|||
blas_server_avail = 1;
|
||||
}
|
||||
|
||||
for(i = blas_num_threads - 1; i < num_threads - 1; i++){
|
||||
for(i = (blas_num_threads > 0) ? blas_num_threads - 1 : 0; i < num_threads - 1; i++){
|
||||
|
||||
blas_threads[i] = CreateThread(NULL, 0,
|
||||
blas_thread_server, (void *)i,
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* Copyright 2023 The OpenBLAS Project */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
|
@ -109,6 +110,11 @@ extern gotoblas_t gotoblas_NEOVERSEN2;
|
|||
#else
|
||||
#define gotoblas_NEOVERSEN2 gotoblas_ARMV8
|
||||
#endif
|
||||
#ifdef DYN_ARMV8SVE
|
||||
extern gotoblas_t gotoblas_ARMV8SVE;
|
||||
#else
|
||||
#define gotoblas_ARMV8SVE gotoblas_ARMV8
|
||||
#endif
|
||||
#ifdef DYN_CORTEX_A55
|
||||
extern gotoblas_t gotoblas_CORTEXA55;
|
||||
#else
|
||||
|
@ -128,17 +134,21 @@ extern gotoblas_t gotoblas_NEOVERSEN1;
|
|||
#ifndef NO_SVE
|
||||
extern gotoblas_t gotoblas_NEOVERSEV1;
|
||||
extern gotoblas_t gotoblas_NEOVERSEN2;
|
||||
extern gotoblas_t gotoblas_ARMV8SVE;
|
||||
#else
|
||||
#define gotoblas_NEOVERSEV1 gotoblas_ARMV8
|
||||
#define gotoblas_NEOVERSEN2 gotoblas_ARMV8
|
||||
#define gotoblas_ARMV8SVE gotoblas_ARMV8
|
||||
#endif
|
||||
extern gotoblas_t gotoblas_THUNDERX3T110;
|
||||
extern gotoblas_t gotoblas_CORTEXA55;
|
||||
#endif
|
||||
|
||||
extern void openblas_warning(int verbose, const char * msg);
|
||||
#define FALLBACK_VERBOSE 1
|
||||
#define NEOVERSEN1_FALLBACK "OpenBLAS : Your OS does not support SVE instructions. OpenBLAS is using Neoverse N1 kernels as a fallback, which may give poorer performance.\n"
|
||||
|
||||
#define NUM_CORETYPES 13
|
||||
#define NUM_CORETYPES 16
|
||||
|
||||
/*
|
||||
* In case asm/hwcap.h is outdated on the build system, make sure
|
||||
|
@ -147,6 +157,9 @@ extern void openblas_warning(int verbose, const char * msg);
|
|||
#ifndef HWCAP_CPUID
|
||||
#define HWCAP_CPUID (1 << 11)
|
||||
#endif
|
||||
#ifndef HWCAP_SVE
|
||||
#define HWCAP_SVE (1 << 22)
|
||||
#endif
|
||||
|
||||
#define get_cpu_ftr(id, var) ({ \
|
||||
__asm__ __volatile__ ("mrs %0, "#id : "=r" (var)); \
|
||||
|
@ -168,6 +181,7 @@ static char *corename[] = {
|
|||
"neoversen2",
|
||||
"thunderx3t110",
|
||||
"cortexa55",
|
||||
"armv8sve",
|
||||
"unknown"
|
||||
};
|
||||
|
||||
|
@ -187,6 +201,7 @@ char *gotoblas_corename(void) {
|
|||
if (gotoblas == &gotoblas_NEOVERSEN2) return corename[12];
|
||||
if (gotoblas == &gotoblas_THUNDERX3T110) return corename[13];
|
||||
if (gotoblas == &gotoblas_CORTEXA55) return corename[14];
|
||||
if (gotoblas == &gotoblas_ARMV8SVE) return corename[15];
|
||||
return corename[NUM_CORETYPES];
|
||||
}
|
||||
|
||||
|
@ -221,6 +236,7 @@ static gotoblas_t *force_coretype(char *coretype) {
|
|||
case 12: return (&gotoblas_NEOVERSEN2);
|
||||
case 13: return (&gotoblas_THUNDERX3T110);
|
||||
case 14: return (&gotoblas_CORTEXA55);
|
||||
case 15: return (&gotoblas_ARMV8SVE);
|
||||
}
|
||||
snprintf(message, 128, "Core not found: %s\n", coretype);
|
||||
openblas_warning(1, message);
|
||||
|
@ -281,8 +297,16 @@ static gotoblas_t *get_coretype(void) {
|
|||
return &gotoblas_NEOVERSEN1;
|
||||
#ifndef NO_SVE
|
||||
case 0xd49:
|
||||
if (!(getauxval(AT_HWCAP) & HWCAP_SVE)) {
|
||||
openblas_warning(FALLBACK_VERBOSE, NEOVERSEN1_FALLBACK);
|
||||
return &gotoblas_NEOVERSEN1;
|
||||
} else
|
||||
return &gotoblas_NEOVERSEN2;
|
||||
case 0xd40:
|
||||
if (!(getauxval(AT_HWCAP) & HWCAP_SVE)) {
|
||||
openblas_warning(FALLBACK_VERBOSE, NEOVERSEN1_FALLBACK);
|
||||
return &gotoblas_NEOVERSEN1;
|
||||
}else
|
||||
return &gotoblas_NEOVERSEV1;
|
||||
#endif
|
||||
case 0xd05: // Cortex A55
|
||||
|
@ -332,6 +356,12 @@ static gotoblas_t *get_coretype(void) {
|
|||
snprintf(coremsg, 128, "Unknown CPU model - implementer %x part %x\n",implementer,part);
|
||||
openblas_warning(1, coremsg);
|
||||
}
|
||||
#ifndef NO_SVE
|
||||
if ((getauxval(AT_HWCAP) & HWCAP_SVE)) {
|
||||
return &gotoblas_ARMV8SVE;
|
||||
}
|
||||
#endif
|
||||
|
||||
return NULL;
|
||||
#endif
|
||||
}
|
||||
|
|
|
@ -422,8 +422,6 @@ This value is equal or large than blas_cpu_number. This means some threads are s
|
|||
*/
|
||||
int blas_num_threads = 0;
|
||||
|
||||
int blas_num_threads_set = 0;
|
||||
|
||||
int goto_get_num_procs (void) {
|
||||
return blas_cpu_number;
|
||||
}
|
||||
|
@ -1996,8 +1994,6 @@ This value is equal or large than blas_cpu_number. This means some threads are s
|
|||
*/
|
||||
int blas_num_threads = 0;
|
||||
|
||||
int blas_num_threads_set = 0;
|
||||
|
||||
int goto_get_num_procs (void) {
|
||||
return blas_cpu_number;
|
||||
}
|
||||
|
|
|
@ -283,7 +283,6 @@ The numbers of threads in the thread pool.
|
|||
This value is equal to or larger than blas_cpu_number. This means some threads are sleeping.
|
||||
*/
|
||||
int blas_num_threads = 0;
|
||||
int blas_num_threads_set = 0;
|
||||
|
||||
int goto_get_num_procs (void) {
|
||||
return blas_cpu_number;
|
||||
|
|
7
f_check
7
f_check
|
@ -101,6 +101,13 @@ else
|
|||
*flang*)
|
||||
vendor=FLANG
|
||||
openmp='-fopenmp'
|
||||
data=`$compiler -v 2>&1 > /dev/null `
|
||||
v="${data#*version *}"
|
||||
v="${v%%*.}"
|
||||
major="${v%%.*}"
|
||||
if [ "$major" -ge 17 ]; then
|
||||
vendor=FLANGNEW
|
||||
fi
|
||||
;;
|
||||
*ifort*|*ifx*)
|
||||
vendor=INTEL
|
||||
|
|
|
@ -68,7 +68,7 @@ void NAME(blasint *M, blasint *N, FLOAT *ALPHA, FLOAT *a, blasint *LDA,
|
|||
info = 0;
|
||||
|
||||
|
||||
if (lda < MAX(1, m)) info = 6;
|
||||
if (lda < MAX(1, m)) info = 5;
|
||||
if (ldc < MAX(1, m)) info = 8;
|
||||
|
||||
if (n < 0) info = 2;
|
||||
|
|
|
@ -54,6 +54,21 @@ FLOATRET NAME(blasint *N, FLOAT *x, blasint *INCX){
|
|||
|
||||
if (n <= 0) return 0.;
|
||||
|
||||
#ifndef COMPLEX
|
||||
if (n == 1)
|
||||
#ifdef DOUBLE
|
||||
return fabs(x[0]);
|
||||
#else
|
||||
return fabsf(x[0]);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
if (incx < 0)
|
||||
#ifdef COMPLEX
|
||||
x -= (n - 1) * incx * 2;
|
||||
#else
|
||||
x -= (n - 1) * incx;
|
||||
#endif
|
||||
IDEBUG_START;
|
||||
|
||||
FUNCTION_PROFILE_START();
|
||||
|
@ -82,6 +97,22 @@ FLOAT CNAME(blasint n, FLOAT *x, blasint incx){
|
|||
|
||||
if (n <= 0) return 0.;
|
||||
|
||||
#ifndef COMPLEX
|
||||
if (n == 1)
|
||||
#ifdef DOUBLE
|
||||
return fabs(x[0]);
|
||||
#else
|
||||
return fabsf(x[0]);
|
||||
#endif
|
||||
#endif
|
||||
|
||||
if (incx < 0)
|
||||
#ifdef COMPLEX
|
||||
x -= (n - 1) * incx * 2;
|
||||
#else
|
||||
x -= (n - 1) * incx;
|
||||
#endif
|
||||
|
||||
IDEBUG_START;
|
||||
|
||||
FUNCTION_PROFILE_START();
|
||||
|
|
|
@ -33,7 +33,7 @@ endif
|
|||
ifdef TARGET_CORE
|
||||
ifeq ($(TARGET_CORE), SAPPHIRERAPIDS)
|
||||
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE)
|
||||
ifeq ($(GCCVERSIONGTEQ11), 1)
|
||||
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ11) $(CLANGVERSIONGTEQ12)))
|
||||
override CFLAGS += -march=sapphirerapids
|
||||
else
|
||||
override CFLAGS += -march=skylake-avx512 -mavx512f
|
||||
|
@ -48,7 +48,7 @@ ifeq ($(TARGET_CORE), SAPPHIRERAPIDS)
|
|||
endif
|
||||
else ifeq ($(TARGET_CORE), COOPERLAKE)
|
||||
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE)
|
||||
ifeq ($(GCCVERSIONGTEQ10), 1)
|
||||
ifeq (1, $(filter 1,$(GCCVERSIONGTEQ10) $(CLANGVERSIONGTEQ9)))
|
||||
override CFLAGS += -march=cooperlake
|
||||
else
|
||||
override CFLAGS += -march=skylake-avx512 -mavx512f
|
||||
|
@ -77,6 +77,12 @@ else ifeq ($(TARGET_CORE), ZEN)
|
|||
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) $(AVX2OPT)
|
||||
else ifeq ($(TARGET_CORE), LOONGSON3R4)
|
||||
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) $(MSA_FLAGS)
|
||||
else ifneq ($(filter NEOVERSEN2 NEOVERSEV1, $(TARGET_CORE)),)
|
||||
ifeq ($(C_COMPILER), PGI)
|
||||
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -Msve_intrinsics
|
||||
else
|
||||
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE)
|
||||
endif
|
||||
else
|
||||
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE)
|
||||
endif
|
||||
|
|
|
@ -35,6 +35,12 @@ USE_TRMM = 1
|
|||
endif
|
||||
endif
|
||||
|
||||
ifneq ($(DYNAMIC_ARCH), 1)
|
||||
ifeq ($(TARGET), MIPS64_GENERIC)
|
||||
USE_TRMM = 1
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), HASWELL)
|
||||
USE_TRMM = 1
|
||||
endif
|
||||
|
|
|
@ -57,7 +57,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
|||
FLOAT absxi = 0.0;
|
||||
|
||||
|
||||
if (n <= 0 || inc_x <= 0) return(0.0);
|
||||
if (n <= 0 || inc_x == 0) return(0.0);
|
||||
if ( n == 1 ) return( ABS(x[0]) );
|
||||
|
||||
n *= inc_x;
|
||||
|
|
|
@ -57,7 +57,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
|||
BLASLONG inc_x2;
|
||||
FLOAT temp;
|
||||
|
||||
if (n <= 0 || inc_x <= 0) return(0.0);
|
||||
if (n <= 0 || inc_x == 0) return(0.0);
|
||||
|
||||
inc_x2 = 2 * inc_x;
|
||||
|
||||
|
|
|
@ -57,7 +57,7 @@ CAMAXKERNEL = zamax.S
|
|||
ZAMAXKERNEL = zamax.S
|
||||
|
||||
SAXPYKERNEL = axpy.S
|
||||
DAXPYKERNEL = axpy.S
|
||||
DAXPYKERNEL = daxpy_thunderx2t99.S
|
||||
CAXPYKERNEL = zaxpy.S
|
||||
ZAXPYKERNEL = zaxpy.S
|
||||
|
||||
|
@ -81,45 +81,35 @@ DGEMVTKERNEL = gemv_t.S
|
|||
CGEMVTKERNEL = zgemv_t.S
|
||||
ZGEMVTKERNEL = zgemv_t.S
|
||||
|
||||
SASUMKERNEL = sasum_thunderx2t99.c
|
||||
DASUMKERNEL = dasum_thunderx2t99.c
|
||||
CASUMKERNEL = casum_thunderx2t99.c
|
||||
ZASUMKERNEL = zasum_thunderx2t99.c
|
||||
|
||||
SASUMKERNEL = asum.S
|
||||
DASUMKERNEL = asum.S
|
||||
CASUMKERNEL = casum.S
|
||||
ZASUMKERNEL = zasum.S
|
||||
SCOPYKERNEL = copy_thunderx2t99.c
|
||||
DCOPYKERNEL = copy_thunderx2t99.c
|
||||
CCOPYKERNEL = copy_thunderx2t99.c
|
||||
ZCOPYKERNEL = copy_thunderx2t99.c
|
||||
|
||||
SCOPYKERNEL = copy.S
|
||||
DCOPYKERNEL = copy.S
|
||||
CCOPYKERNEL = copy.S
|
||||
ZCOPYKERNEL = copy.S
|
||||
SSWAPKERNEL = swap_thunderx2t99.S
|
||||
DSWAPKERNEL = swap_thunderx2t99.S
|
||||
CSWAPKERNEL = swap_thunderx2t99.S
|
||||
ZSWAPKERNEL = swap_thunderx2t99.S
|
||||
|
||||
SSWAPKERNEL = swap.S
|
||||
DSWAPKERNEL = swap.S
|
||||
CSWAPKERNEL = swap.S
|
||||
ZSWAPKERNEL = swap.S
|
||||
ISAMAXKERNEL = iamax_thunderx2t99.c
|
||||
IDAMAXKERNEL = iamax_thunderx2t99.c
|
||||
ICAMAXKERNEL = izamax_thunderx2t99.c
|
||||
IZAMAXKERNEL = izamax_thunderx2t99.c
|
||||
|
||||
ISAMAXKERNEL = iamax.S
|
||||
IDAMAXKERNEL = iamax.S
|
||||
ICAMAXKERNEL = izamax.S
|
||||
IZAMAXKERNEL = izamax.S
|
||||
SNRM2KERNEL = scnrm2_thunderx2t99.c
|
||||
DNRM2KERNEL = dznrm2_thunderx2t99.c
|
||||
CNRM2KERNEL = scnrm2_thunderx2t99.c
|
||||
ZNRM2KERNEL = dznrm2_thunderx2t99.c
|
||||
|
||||
SNRM2KERNEL = nrm2.S
|
||||
DNRM2KERNEL = nrm2.S
|
||||
CNRM2KERNEL = znrm2.S
|
||||
ZNRM2KERNEL = znrm2.S
|
||||
|
||||
DDOTKERNEL = dot.S
|
||||
ifneq ($(C_COMPILER), PGI)
|
||||
SDOTKERNEL = ../generic/dot.c
|
||||
else
|
||||
SDOTKERNEL = dot.S
|
||||
endif
|
||||
ifneq ($(C_COMPILER), PGI)
|
||||
CDOTKERNEL = zdot.S
|
||||
ZDOTKERNEL = zdot.S
|
||||
else
|
||||
CDOTKERNEL = ../arm/zdot.c
|
||||
ZDOTKERNEL = ../arm/zdot.c
|
||||
endif
|
||||
DDOTKERNEL = dot.c
|
||||
SDOTKERNEL = dot.c
|
||||
CDOTKERNEL = zdot_thunderx2t99.c
|
||||
ZDOTKERNEL = zdot_thunderx2t99.c
|
||||
DSDOTKERNEL = dot.S
|
||||
|
||||
DGEMM_BETA = dgemm_beta.S
|
||||
|
@ -170,8 +160,8 @@ DSYMMLCOPY_M = symm_lcopy_sve.c
|
|||
CGEMMKERNEL = cgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
|
||||
CTRMMKERNEL = ctrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
|
||||
|
||||
CGEMMINCOPY = cgemm_ncopy_sve_v1.c
|
||||
CGEMMITCOPY = cgemm_tcopy_sve_v1.c
|
||||
CGEMMINCOPY = gemm_ncopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c
|
||||
CGEMMITCOPY = gemm_tcopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c
|
||||
CGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
|
||||
CGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
|
||||
|
||||
|
@ -194,8 +184,8 @@ CSYMMLCOPY_M = zsymm_lcopy_sve.c
|
|||
ZGEMMKERNEL = zgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
|
||||
ZTRMMKERNEL = ztrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
|
||||
|
||||
ZGEMMINCOPY = zgemm_ncopy_sve_v1.c
|
||||
ZGEMMITCOPY = zgemm_tcopy_sve_v1.c
|
||||
ZGEMMINCOPY = gemm_ncopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c
|
||||
ZGEMMITCOPY = gemm_tcopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c
|
||||
ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
|
||||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
|
||||
|
||||
|
|
|
@ -1,98 +1 @@
|
|||
include $(KERNELDIR)/KERNEL.ARMV8SVE
|
||||
|
||||
DAXPYKERNEL = daxpy_thunderx2t99.S
|
||||
|
||||
SASUMKERNEL = sasum_thunderx2t99.c
|
||||
DASUMKERNEL = dasum_thunderx2t99.c
|
||||
CASUMKERNEL = casum_thunderx2t99.c
|
||||
ZASUMKERNEL = zasum_thunderx2t99.c
|
||||
|
||||
SCOPYKERNEL = copy_thunderx2t99.c
|
||||
DCOPYKERNEL = copy_thunderx2t99.c
|
||||
CCOPYKERNEL = copy_thunderx2t99.c
|
||||
ZCOPYKERNEL = copy_thunderx2t99.c
|
||||
|
||||
SSWAPKERNEL = swap_thunderx2t99.S
|
||||
DSWAPKERNEL = swap_thunderx2t99.S
|
||||
CSWAPKERNEL = swap_thunderx2t99.S
|
||||
ZSWAPKERNEL = swap_thunderx2t99.S
|
||||
|
||||
ISAMAXKERNEL = iamax_thunderx2t99.c
|
||||
IDAMAXKERNEL = iamax_thunderx2t99.c
|
||||
ICAMAXKERNEL = izamax_thunderx2t99.c
|
||||
IZAMAXKERNEL = izamax_thunderx2t99.c
|
||||
|
||||
SNRM2KERNEL = scnrm2_thunderx2t99.c
|
||||
DNRM2KERNEL = dznrm2_thunderx2t99.c
|
||||
CNRM2KERNEL = scnrm2_thunderx2t99.c
|
||||
ZNRM2KERNEL = dznrm2_thunderx2t99.c
|
||||
|
||||
DDOTKERNEL = dot.c
|
||||
SDOTKERNEL = dot.c
|
||||
CDOTKERNEL = zdot_thunderx2t99.c
|
||||
ZDOTKERNEL = zdot_thunderx2t99.c
|
||||
|
||||
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
CTRMMUNCOPY_M =
|
||||
CTRMMLNCOPY_M =
|
||||
CTRMMUTCOPY_M =
|
||||
CTRMMLTCOPY_M =
|
||||
CHEMMLTCOPY_M =
|
||||
CHEMMUTCOPY_M =
|
||||
CSYMMUCOPY_M =
|
||||
CSYMMLCOPY_M =
|
||||
|
||||
CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
|
||||
CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
|
||||
ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
|
||||
CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c
|
||||
CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c
|
||||
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
else
|
||||
CGEMMINCOPYOBJ =
|
||||
CGEMMITCOPYOBJ =
|
||||
endif
|
||||
CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c
|
||||
CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c
|
||||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
ZTRSMCOPYLN_M =
|
||||
ZTRSMCOPYLT_M =
|
||||
ZTRSMCOPYUN_M =
|
||||
ZTRSMCOPYUT_M =
|
||||
|
||||
ZTRMMUNCOPY_M =
|
||||
ZTRMMLNCOPY_M =
|
||||
ZTRMMUTCOPY_M =
|
||||
ZTRMMLTCOPY_M =
|
||||
ZHEMMLTCOPY_M =
|
||||
ZHEMMUTCOPY_M =
|
||||
ZSYMMUCOPY_M =
|
||||
ZSYMMLCOPY_M =
|
||||
|
||||
ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
|
||||
ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
|
||||
ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N))
|
||||
ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c
|
||||
ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c
|
||||
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
else
|
||||
ZGEMMINCOPYOBJ =
|
||||
ZGEMMITCOPYOBJ =
|
||||
endif
|
||||
ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
|
||||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
|
||||
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
|
|
@ -240,7 +240,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
add pB, pB, 32
|
||||
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
|
||||
.endm
|
||||
|
||||
.macro KERNELv1x4_M1
|
||||
|
@ -276,9 +275,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1rw z15.s, p0/z, [pB, 28]
|
||||
|
||||
add pB, pB, 32
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
|
||||
.endm
|
||||
|
||||
.macro KERNELv1x4_M2
|
||||
|
@ -313,11 +309,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
OP_ri z23.s, p1/m, z2.s, z15.s
|
||||
ld1rw z15.s, p0/z, [pB, 28]
|
||||
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
|
||||
add pB, pB, 32
|
||||
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
|
||||
.endm
|
||||
|
||||
.macro KERNELv1x4_E
|
||||
|
@ -341,10 +333,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
OP_ii z22.s, p1/m, z3.s, z15.s
|
||||
OP_ri z23.s, p1/m, z2.s, z15.s
|
||||
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
|
||||
|
||||
.endm
|
||||
|
||||
.macro KERNELv1x4_SUB
|
||||
|
@ -383,13 +371,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
OP_ii z22.s, p1/m, z1.s, z15.s
|
||||
OP_ri z23.s, p1/m, z0.s, z15.s
|
||||
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||
.endm
|
||||
|
||||
.macro SAVEv1x4
|
||||
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
|
||||
|
||||
ld2w {z24.s, z25.s}, p1/z, [pCRow0]
|
||||
fmla z24.s, p1/m, z16.s, alphaz_R
|
||||
fmls z24.s, p1/m, z17.s, alphaz_I
|
||||
|
@ -407,8 +391,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
st2w {z26.s, z27.s}, p1, [pCRow1]
|
||||
|
||||
add pCRow1, pCRow1, lanes, lsl #3
|
||||
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
|
||||
|
||||
ld2w {z28.s, z29.s}, p1/z, [pCRow2]
|
||||
fmla z28.s, p1/m, z20.s, alphaz_R
|
||||
fmls z28.s, p1/m, z21.s, alphaz_I
|
||||
|
@ -425,12 +407,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
fmla z31.s, p1/m, z23.s, alphaz_R
|
||||
st2w {z30.s, z31.s}, p1, [pCRow3]
|
||||
|
||||
prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
|
||||
|
||||
add pCRow3, pCRow3, lanes, lsl #3 // pC = pC + lanes * 2 *4
|
||||
|
||||
prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
|
||||
|
||||
.endm
|
||||
|
||||
/******************************************************************************/
|
||||
|
@ -466,8 +444,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro SAVEv1x2
|
||||
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
|
||||
|
||||
ld2w {z24.s, z25.s}, p1/z, [pCRow0]
|
||||
fmla z24.s, p1/m, z16.s, alphaz_R
|
||||
fmls z24.s, p1/m, z17.s, alphaz_I
|
||||
|
@ -485,10 +461,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
st2w {z26.s, z27.s}, p1, [pCRow1]
|
||||
|
||||
add pCRow1, pCRow1, lanes, lsl #3
|
||||
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
|
||||
|
||||
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
|
||||
|
||||
.endm
|
||||
|
||||
/******************************************************************************/
|
||||
|
@ -516,8 +488,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro SAVEv1x1
|
||||
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
|
||||
|
||||
ld2w {z24.s, z25.s}, p1/z, [pCRow0]
|
||||
fmla z24.s, p1/m, z16.s, alphaz_R
|
||||
fmls z24.s, p1/m, z17.s, alphaz_I
|
||||
|
@ -527,8 +497,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 2 *4
|
||||
|
||||
prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
|
||||
|
||||
.endm
|
||||
|
||||
/******************************************************************************/
|
||||
|
@ -553,9 +521,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
stp x26, x27, [sp, #(9 * 16)]
|
||||
str x28, [sp, #(10 * 16)]
|
||||
|
||||
prfm PLDL1KEEP, [origPB]
|
||||
prfm PLDL1KEEP, [origPA]
|
||||
|
||||
fmov alphaR, s0
|
||||
dup alphaz_R, alphaR
|
||||
fmov alphaI, s1
|
||||
|
@ -676,10 +641,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
bne .Lcgemm_kernel_L4_Mv1_46
|
||||
|
||||
.Lcgemm_kernel_L4_Mv1_100:
|
||||
prfm PLDL1KEEP, [pA]
|
||||
prfm PLDL1KEEP, [pA, #64]
|
||||
prfm PLDL1KEEP, [origPB]
|
||||
|
||||
SAVEv1x4
|
||||
|
||||
.Lcgemm_kernel_L4_Mv1_END:
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* Copyright 2023 The OpenBLAS Project */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
|
@ -52,7 +53,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
|
|||
boffset = b;
|
||||
|
||||
j = 0;
|
||||
svbool_t pg = svwhilelt_b32(j, n);
|
||||
svbool_t pg = svwhilelt_b32((uint64_t)j, (uint64_t)n);
|
||||
uint32_t active = svcntp_b32(svptrue_b32(), pg);
|
||||
do {
|
||||
|
||||
|
@ -69,7 +70,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
|
|||
aoffset += active * lda * 2;
|
||||
|
||||
j += svcntw();
|
||||
pg = svwhilelt_b32(j, n);
|
||||
pg = svwhilelt_b32((uint64_t)j, (uint64_t)n);
|
||||
active = svcntp_b32(svptrue_b32(), pg);
|
||||
|
||||
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* Copyright 2023 The OpenBLAS Project */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
|
@ -50,7 +51,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
|
|||
boffset = b;
|
||||
|
||||
j = 0;
|
||||
svbool_t pg = svwhilelt_b32(j, n);
|
||||
svbool_t pg = svwhilelt_b32((uint64_t)j, (uint64_t)n);
|
||||
uint32_t active = svcntp_b32(svptrue_b32(), pg);
|
||||
do {
|
||||
|
||||
|
@ -66,7 +67,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
|
|||
aoffset += active * 2;
|
||||
|
||||
j += svcntw();
|
||||
pg = svwhilelt_b32(j, n);
|
||||
pg = svwhilelt_b32((uint64_t)j, (uint64_t)n);
|
||||
active = svcntp_b32(svptrue_b32(), pg);
|
||||
|
||||
} while (svptest_any(svptrue_b32(), pg));
|
||||
|
|
|
@ -50,8 +50,8 @@ static FLOAT dot_kernel_sve(BLASLONG n, FLOAT *x, FLOAT *y) {
|
|||
BLASLONG sve_width = SVE_WIDTH;
|
||||
|
||||
for (BLASLONG i = 0; i < n; i += sve_width * 2) {
|
||||
svbool_t pg_a = SVE_WHILELT(i, n);
|
||||
svbool_t pg_b = SVE_WHILELT(i + sve_width, n);
|
||||
svbool_t pg_a = SVE_WHILELT((uint64_t)i, (uint64_t)n);
|
||||
svbool_t pg_b = SVE_WHILELT((uint64_t)(i + sve_width), (uint64_t)n);
|
||||
|
||||
SVE_TYPE x_vec_a = svld1(pg_a, &x[i]);
|
||||
SVE_TYPE y_vec_a = svld1(pg_a, &y[i]);
|
||||
|
|
|
@ -0,0 +1,121 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2023, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <arm_sve.h>
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/*
 * Element-width selection for this copy routine.
 *
 * DOUBLE builds use the 64-bit flavour of every SVE type/intrinsic, all other
 * builds the 32-bit flavour.  COUNT is the mnemonic of the element-count
 * instruction that CNAME emits through inline asm.
 */
#if defined(DOUBLE)
#  define COUNT      "cntd"
#  define SV_TYPE    svfloat64_t
#  define SV_INDEX   svuint64_t
#  define SV_INDEXER svindex_u64
#  define SV_TRUE    svptrue_b64
#  define SV_WHILE   svwhilelt_b64
#else
#  define COUNT      "cntw"
#  define SV_TYPE    svfloat32_t
#  define SV_INDEX   svuint32_t
#  define SV_INDEXER svindex_u32
#  define SV_TRUE    svptrue_b32
#  define SV_WHILE   svwhilelt_b32
#endif
|
||||
|
||||
/*
 * INNER_COPY - copy one step of complex values into the packed buffer.
 *
 * Gathers the real parts (starting at a_offset_inner) and the imaginary parts
 * (starting at a_offset_inner + 1) using the per-lane element indices held in
 * `lda_vec`, stores them interleaved at b_offset with svst2, then advances
 * a_offset_inner by 2 elements and b_offset by active * 2 elements.
 *
 *   pg              governing predicate for the gathers and the store
 *   a_offset_inner  source pointer lvalue, advanced by the macro
 *   b_offset        destination pointer lvalue, advanced by the macro
 *   lda             unused here; the stride comes from `lda_vec`
 *   active          number of complex values written per invocation
 *
 * Relies on `a_vec_real`, `a_vec_imag` and `lda_vec` being declared in the
 * invoking scope.  Wrapped in do { } while (0) so that it behaves as a single
 * statement (safe under an unbraced if/else); invoke it with a trailing ';'.
 */
#define INNER_COPY(pg, a_offset_inner, b_offset, lda, active) \
    do { \
        a_vec_real = svld1_gather_index((pg), (a_offset_inner), lda_vec); \
        a_vec_imag = svld1_gather_index((pg), (a_offset_inner) + 1, lda_vec); \
        svst2((pg), (b_offset), svcreate2(a_vec_real, a_vec_imag)); \
        (a_offset_inner) += 2; \
        (b_offset) += (active) * 2; \
    } while (0)
|
||||
|
||||
/*
 * Pack complex data from `a` into the contiguous buffer `b`, one SVE vector
 * of lanes at a time.
 *
 * Lanes are addressed through a gather whose per-lane index step is lda * 2
 * elements; for every group of lanes the routine performs m copy steps, each
 * advancing the source by one complex value (2 elements) and the destination
 * by `active` complex values.  Groups that fill a whole vector are handled
 * first with an all-true predicate; a final partial group, if any, uses a
 * WHILELT predicate covering the remaining lanes.
 *
 *   m    number of copy steps per lane group
 *   n    total number of lanes to pack
 *   a    source buffer (interleaved real/imag)
 *   lda  leading dimension of `a`, in complex values
 *   b    destination buffer (interleaved real/imag)
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) {
    /* Number of elements per SVE vector for the selected precision. */
    uint64_t sve_size;
    asm(COUNT" %[SIZE_]" : [SIZE_] "=r" (sve_size) : : );

    IFLOAT *a_offset, *a_offset_inner, *b_offset;
    a_offset = a;
    b_offset = b;

    /* Per-lane element index: lane k addresses element k * lda * 2. */
    SV_INDEX lda_vec = SV_INDEXER(0LL, lda * 2);
    SV_TYPE a_vec_real;
    SV_TYPE a_vec_imag;
    svbool_t pg_true = SV_TRUE();

    /*
     * Largest multiple of sve_size that does not exceed n.  Computed with a
     * modulo instead of `n & -sve_size` so that the result is also right if
     * the element count is not a power of two; for power-of-two counts both
     * forms yield the same value.
     */
    BLASLONG single_vectors_n = n - (BLASLONG)((uint64_t)n % sve_size);

    /* Full-vector groups. */
    for (BLASLONG j = 0; j < single_vectors_n; j += sve_size) {
        a_offset_inner = a_offset;

        svbool_t pg = pg_true;
        uint64_t active = sve_size;
        uint64_t i_cnt = m >> 2;

        /* m copy steps, unrolled by four, then the 2- and 1-step tails. */
        while (i_cnt--) {
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
        }

        if (m & 2) {
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
        }

        if (m & 1) {
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
        }

        /* Move on to the next group of sve_size lanes. */
        a_offset += sve_size * lda * 2;
    }

    /* Trailing partial group (fewer than sve_size lanes). */
    BLASLONG remaining_n = n - single_vectors_n;
    if (remaining_n) {
        a_offset_inner = a_offset;
        /* Unsigned operands select the unsigned WHILELT overload explicitly. */
        svbool_t pg = SV_WHILE((uint64_t)0L, (uint64_t)remaining_n);
        uint64_t active = remaining_n;
        uint64_t i_cnt = m >> 2;

        while (i_cnt--) {
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
        }

        if (m & 2) {
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
        }

        if (m & 1) {
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
        }
    }

    return 0;
}
|
||||
|
|
@ -107,7 +107,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) {
|
|||
BLASLONG remaining_n = n - single_vectors_n;
|
||||
if (remaining_n) {
|
||||
a_offset_inner = a_offset;
|
||||
svbool_t pg = SV_WHILE(0L, remaining_n);
|
||||
svbool_t pg = SV_WHILE((uint64_t)0L, (uint64_t)remaining_n);
|
||||
uint64_t active = remaining_n;
|
||||
uint64_t i_cnt = m >> 2;
|
||||
while (i_cnt--) {
|
||||
|
|
|
@ -0,0 +1,115 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2023, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdio.h>
|
||||
#include <arm_sve.h>
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/*
 * Element-width selection for this copy routine.
 *
 * DOUBLE builds use the 64-bit two-vector tuple type and predicates, all
 * other builds the 32-bit ones.  COUNT is the mnemonic of the element-count
 * instruction that CNAME emits through inline asm.
 */
#if defined(DOUBLE)
#  define COUNT    "cntd"
#  define SV_TYPE  svfloat64x2_t
#  define SV_TRUE  svptrue_b64
#  define SV_WHILE svwhilelt_b64
#else
#  define COUNT    "cntw"
#  define SV_TYPE  svfloat32x2_t
#  define SV_TRUE  svptrue_b32
#  define SV_WHILE svwhilelt_b32
#endif
|
||||
|
||||
/*
 * INNER_COPY - copy one run of interleaved complex values.
 *
 * Loads a two-vector tuple (svld2) from a_offset_inner under predicate pg,
 * stores it unchanged (svst2) at b_offset, then advances a_offset_inner by
 * lda * 2 elements and b_offset by active * 2 elements.
 *
 *   pg              governing predicate for the load and the store
 *   a_offset_inner  source pointer lvalue, advanced by the macro
 *   b_offset        destination pointer lvalue, advanced by the macro
 *   lda             source step per invocation, in complex values
 *   active          number of complex values written per invocation
 *
 * Relies on `a_vec` being declared in the invoking scope.  Arguments are
 * parenthesized so expression arguments expand with the intended precedence,
 * and the body is wrapped in do { } while (0) so that it behaves as a single
 * statement; invoke it with a trailing ';'.
 */
#define INNER_COPY(pg, a_offset_inner, b_offset, lda, active) \
    do { \
        a_vec = svld2((pg), (a_offset_inner)); \
        svst2((pg), (b_offset), a_vec); \
        (a_offset_inner) += (lda) * 2; \
        (b_offset) += (active) * 2; \
    } while (0)
|
||||
|
||||
/*
 * Pack complex data from `a` into the contiguous buffer `b`, one SVE vector
 * of complex values at a time.
 *
 * Each copy step moves a contiguous run of complex values with svld2/svst2
 * and then advances the source by lda complex values; m such steps are done
 * per group.  Groups that fill a whole vector are handled first with an
 * all-true predicate; a final partial group, if any, uses a WHILELT
 * predicate covering the remaining values.
 *
 *   m    number of copy steps per group
 *   n    total number of complex values per step
 *   a    source buffer (interleaved real/imag)
 *   lda  leading dimension of `a`, in complex values
 *   b    destination buffer (interleaved real/imag)
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
    /*
     * Number of elements per SVE vector for the selected precision.  The
     * value comes solely from the COUNT instruction below; the former
     * `= svcntw()` initializer was overwritten immediately and named the
     * wrong element width for DOUBLE builds, so it has been dropped.
     */
    uint64_t sve_size;
    asm(COUNT" %[SIZE_]" : [SIZE_] "=r" (sve_size) : : );

    IFLOAT *a_offset, *a_offset_inner, *b_offset;
    a_offset = a;
    b_offset = b;

    SV_TYPE a_vec;
    svbool_t pg_true = SV_TRUE();

    /*
     * Largest multiple of sve_size that does not exceed n.  Computed with a
     * modulo instead of `n & -sve_size` so that the result is also right if
     * the element count is not a power of two; for power-of-two counts both
     * forms yield the same value.
     */
    BLASLONG single_vectors_n = n - (BLASLONG)((uint64_t)n % sve_size);

    /* Full-vector groups. */
    for (BLASLONG j = 0; j < single_vectors_n; j += sve_size) {
        a_offset_inner = a_offset;

        svbool_t pg = pg_true;
        uint64_t active = sve_size;
        uint64_t i_cnt = m >> 2;

        /* m copy steps, unrolled by four, then the 2- and 1-step tails. */
        while (i_cnt--) {
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
        }

        if (m & 2) {
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
        }

        if (m & 1) {
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
        }

        /* Move on to the next sve_size complex values of the source. */
        a_offset += sve_size * 2;
    }

    /* Trailing partial group (fewer than sve_size complex values). */
    BLASLONG remaining_n = n - single_vectors_n;
    if (remaining_n) {
        a_offset_inner = a_offset;
        /* Unsigned operands select the unsigned WHILELT overload explicitly. */
        svbool_t pg = SV_WHILE((uint64_t)0L, (uint64_t)remaining_n);
        uint64_t active = remaining_n;
        uint64_t i_cnt = m >> 2;

        while (i_cnt--) {
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
        }

        if (m & 2) {
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
        }

        if (m & 1) {
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
        }
    }

    return 0;
}
|
||||
|
||||
|
|
@ -100,7 +100,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
|
|||
BLASLONG remaining_n = n - single_vectors_n;
|
||||
if (remaining_n) {
|
||||
a_offset_inner = a_offset;
|
||||
svbool_t pg = SV_WHILE(0L, remaining_n);
|
||||
svbool_t pg = SV_WHILE((uint64_t)0L, (uint64_t)remaining_n);
|
||||
uint64_t active = remaining_n;
|
||||
uint64_t i_cnt = m >> 2;
|
||||
while (i_cnt--) {
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* Copyright 2023 The OpenBLAS Project */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
|
@ -52,7 +53,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
|||
svint64_t one_vec = svdup_s64(1LL);
|
||||
|
||||
int64_t j = 0;
|
||||
svbool_t pg = svwhilelt_b64(j, n);
|
||||
svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
|
||||
int64_t active = svcntp_b64(svptrue_b64(), pg);
|
||||
svint64_t index_neg = svindex_s64(0LL, -1LL);
|
||||
svint64_t index = svindex_s64(0LL, 1LL);
|
||||
|
@ -86,7 +87,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
|||
posX += sve_size;
|
||||
posX_vec = svdup_s64(posX);
|
||||
j += sve_size;
|
||||
pg = svwhilelt_b64(j, n);
|
||||
pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
|
||||
active = svcntp_b64(svptrue_b64(), pg);
|
||||
} while (svptest_any(svptrue_b64(), pg));
|
||||
|
||||
|
@ -99,7 +100,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
|||
|
||||
int32_t N = n;
|
||||
int32_t j = 0;
|
||||
svbool_t pg = svwhilelt_b32(j, N);
|
||||
svbool_t pg = svwhilelt_b32((uint32_t)j, (uint32_t)N);
|
||||
int32_t active = svcntp_b32(svptrue_b32(), pg);
|
||||
svint32_t index_neg = svindex_s32(0, -1);
|
||||
svint32_t index = svindex_s32(0, 1);
|
||||
|
@ -133,7 +134,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
|||
posX += sve_size;
|
||||
posX_vec = svdup_s32(posX);
|
||||
j += sve_size;
|
||||
pg = svwhilelt_b32(j, N);
|
||||
pg = svwhilelt_b32((uint32_t)j, (uint32_t)N);
|
||||
active = svcntp_b32(svptrue_b32(), pg);
|
||||
} while (svptest_any(svptrue_b32(), pg));
|
||||
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* Copyright 2023 The OpenBLAS Project */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
|
@ -52,7 +53,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
|||
svint64_t one_vec = svdup_s64(1LL);
|
||||
|
||||
int64_t j = 0;
|
||||
svbool_t pg = svwhilelt_b64(j, n);
|
||||
svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
|
||||
int64_t active = svcntp_b64(svptrue_b64(), pg);
|
||||
svint64_t index_neg = svindex_s64(0LL, -1LL);
|
||||
svint64_t index = svindex_s64(0LL, 1LL);
|
||||
|
@ -86,7 +87,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
|||
posX += sve_size;
|
||||
posX_vec = svdup_s64(posX);
|
||||
j += sve_size;
|
||||
pg = svwhilelt_b64(j, n);
|
||||
pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
|
||||
active = svcntp_b64(svptrue_b64(), pg);
|
||||
} while (svptest_any(svptrue_b64(), pg));
|
||||
|
||||
|
@ -99,7 +100,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
|||
|
||||
int32_t N = n;
|
||||
int32_t j = 0;
|
||||
svbool_t pg = svwhilelt_b32(j, N);
|
||||
svbool_t pg = svwhilelt_b32((uint32_t)j, (uint32_t)N);
|
||||
int32_t active = svcntp_b32(svptrue_b32(), pg);
|
||||
svint32_t index_neg = svindex_s32(0, -1);
|
||||
svint32_t index = svindex_s32(0, 1);
|
||||
|
@ -133,7 +134,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
|||
posX += sve_size;
|
||||
posX_vec = svdup_s32(posX);
|
||||
j += sve_size;
|
||||
pg = svwhilelt_b32(j, N);
|
||||
pg = svwhilelt_b32((uint32_t)j, (uint32_t)N);
|
||||
active = svcntp_b32(svptrue_b32(), pg);
|
||||
} while (svptest_any(svptrue_b32(), pg));
|
||||
|
||||
|
|
|
@ -52,11 +52,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
|||
FLOAT *ao;
|
||||
#ifdef DOUBLE
|
||||
svint64_t index = svindex_s64(0LL, lda);
|
||||
svbool_t pn = svwhilelt_b64(js, n);
|
||||
svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
|
||||
int n_active = svcntp_b64(svptrue_b64(), pn);
|
||||
#else
|
||||
svint32_t index = svindex_s32(0, lda);
|
||||
svbool_t pn = svwhilelt_b32(js, n);
|
||||
svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
|
||||
int n_active = svcntp_b32(svptrue_b32(), pn);
|
||||
#endif
|
||||
do
|
||||
|
@ -123,11 +123,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
|||
posY += n_active;
|
||||
js += n_active;
|
||||
#ifdef DOUBLE
|
||||
pn = svwhilelt_b64(js, n);
|
||||
pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
|
||||
n_active = svcntp_b64(svptrue_b64(), pn);
|
||||
} while (svptest_any(svptrue_b64(), pn));
|
||||
#else
|
||||
pn = svwhilelt_b32(js, n);
|
||||
pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
|
||||
n_active = svcntp_b32(svptrue_b32(), pn);
|
||||
} while (svptest_any(svptrue_b32(), pn));
|
||||
#endif
|
||||
|
|
|
@ -51,10 +51,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
|||
FLOAT *ao;
|
||||
js = 0;
|
||||
#ifdef DOUBLE
|
||||
svbool_t pn = svwhilelt_b64(js, n);
|
||||
svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
|
||||
int n_active = svcntp_b64(svptrue_b64(), pn);
|
||||
#else
|
||||
svbool_t pn = svwhilelt_b32(js, n);
|
||||
svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
|
||||
int n_active = svcntp_b32(svptrue_b32(), pn);
|
||||
#endif
|
||||
do
|
||||
|
@ -122,11 +122,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
|||
posY += n_active;
|
||||
js += n_active;
|
||||
#ifdef DOUBLE
|
||||
pn = svwhilelt_b64(js, n);
|
||||
pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
|
||||
n_active = svcntp_b64(svptrue_b64(), pn);
|
||||
} while (svptest_any(svptrue_b64(), pn));
|
||||
#else
|
||||
pn = svwhilelt_b32(js, n);
|
||||
pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
|
||||
n_active = svcntp_b32(svptrue_b32(), pn);
|
||||
} while (svptest_any(svptrue_b32(), pn));
|
||||
#endif
|
||||
|
|
|
@ -52,11 +52,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
|||
FLOAT *ao;
|
||||
#ifdef DOUBLE
|
||||
svint64_t index = svindex_s64(0LL, lda);
|
||||
svbool_t pn = svwhilelt_b64(js, n);
|
||||
svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
|
||||
int n_active = svcntp_b64(svptrue_b64(), pn);
|
||||
#else
|
||||
svint32_t index = svindex_s32(0, lda);
|
||||
svbool_t pn = svwhilelt_b32(js, n);
|
||||
svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
|
||||
int n_active = svcntp_b32(svptrue_b32(), pn);
|
||||
#endif
|
||||
do
|
||||
|
@ -123,11 +123,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
|||
posY += n_active;
|
||||
js += n_active;
|
||||
#ifdef DOUBLE
|
||||
pn = svwhilelt_b64(js, n);
|
||||
pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
|
||||
n_active = svcntp_b64(svptrue_b64(), pn);
|
||||
} while (svptest_any(svptrue_b64(), pn));
|
||||
#else
|
||||
pn = svwhilelt_b32(js, n);
|
||||
pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
|
||||
n_active = svcntp_b32(svptrue_b32(), pn);
|
||||
} while (svptest_any(svptrue_b32(), pn));
|
||||
#endif
|
||||
|
|
|
@ -51,10 +51,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
|||
FLOAT *ao;
|
||||
js = 0;
|
||||
#ifdef DOUBLE
|
||||
svbool_t pn = svwhilelt_b64(js, n);
|
||||
svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
|
||||
int n_active = svcntp_b64(svptrue_b64(), pn);
|
||||
#else
|
||||
svbool_t pn = svwhilelt_b32(js, n);
|
||||
svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
|
||||
int n_active = svcntp_b32(svptrue_b32(), pn);
|
||||
#endif
|
||||
do
|
||||
|
@ -121,11 +121,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
|||
posY += n_active;
|
||||
js += n_active;
|
||||
#ifdef DOUBLE
|
||||
pn = svwhilelt_b64(js, n);
|
||||
pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
|
||||
n_active = svcntp_b64(svptrue_b64(), pn);
|
||||
} while (svptest_any(svptrue_b64(), pn));
|
||||
#else
|
||||
pn = svwhilelt_b32(js, n);
|
||||
pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
|
||||
n_active = svcntp_b32(svptrue_b32(), pn);
|
||||
} while (svptest_any(svptrue_b32(), pn));
|
||||
#endif
|
||||
|
|
|
@ -56,13 +56,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
|
|||
#ifdef DOUBLE
|
||||
int64_t js = 0;
|
||||
svint64_t index = svindex_s64(0LL, lda);
|
||||
svbool_t pn = svwhilelt_b64(js, n);
|
||||
svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
|
||||
int n_active = svcntp_b64(svptrue_b64(), pn);
|
||||
#else
|
||||
int32_t N = n;
|
||||
int32_t js = 0;
|
||||
svint32_t index = svindex_s32(0, lda);
|
||||
svbool_t pn = svwhilelt_b32(js, N);
|
||||
svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N);
|
||||
int n_active = svcntp_b32(svptrue_b32(), pn);
|
||||
#endif
|
||||
do {
|
||||
|
@ -106,11 +106,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
|
|||
|
||||
js += n_active;
|
||||
#ifdef DOUBLE
|
||||
pn = svwhilelt_b64(js, n);
|
||||
pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
|
||||
n_active = svcntp_b64(svptrue_b64(), pn);
|
||||
} while (svptest_any(svptrue_b64(), pn));
|
||||
#else
|
||||
pn = svwhilelt_b32(js, N);
|
||||
pn = svwhilelt_b32((uint32_t)js, (uint32_t)N);
|
||||
n_active = svcntp_b32(svptrue_b32(), pn);
|
||||
} while (svptest_any(svptrue_b32(), pn));
|
||||
#endif
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* Copyright 2023 The OpenBLAS Project */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
|
@ -55,12 +56,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
|
|||
jj = offset;
|
||||
#ifdef DOUBLE
|
||||
int64_t js = 0;
|
||||
svbool_t pn = svwhilelt_b64(js, n);
|
||||
svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
|
||||
int n_active = svcntp_b64(svptrue_b64(), pn);
|
||||
#else
|
||||
int32_t N = n;
|
||||
int32_t js = 0;
|
||||
svbool_t pn = svwhilelt_b32(js, N);
|
||||
svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N);
|
||||
int n_active = svcntp_b32(svptrue_b32(), pn);
|
||||
#endif
|
||||
do {
|
||||
|
@ -104,11 +105,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
|
|||
|
||||
js += n_active;
|
||||
#ifdef DOUBLE
|
||||
pn = svwhilelt_b64(js, n);
|
||||
pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
|
||||
n_active = svcntp_b64(svptrue_b64(), pn);
|
||||
} while (svptest_any(svptrue_b64(), pn));
|
||||
#else
|
||||
pn = svwhilelt_b32(js, N);
|
||||
pn = svwhilelt_b32((uint32_t)js, (uint32_t)N);
|
||||
n_active = svcntp_b32(svptrue_b32(), pn);
|
||||
} while (svptest_any(svptrue_b32(), pn));
|
||||
#endif
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* Copyright 2023 The OpenBLAS Project */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
|
@ -56,13 +57,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
|
|||
#ifdef DOUBLE
|
||||
int64_t js = 0;
|
||||
svint64_t index = svindex_s64(0LL, lda);
|
||||
svbool_t pn = svwhilelt_b64(js, n);
|
||||
svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
|
||||
int n_active = svcntp_b64(svptrue_b64(), pn);
|
||||
#else
|
||||
int32_t N = n;
|
||||
int32_t js = 0;
|
||||
svint32_t index = svindex_s32(0, lda);
|
||||
svbool_t pn = svwhilelt_b32(js, N);
|
||||
svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N);
|
||||
int n_active = svcntp_b32(svptrue_b32(), pn);
|
||||
#endif
|
||||
do {
|
||||
|
@ -106,11 +107,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
|
|||
|
||||
js += n_active;
|
||||
#ifdef DOUBLE
|
||||
pn = svwhilelt_b64(js, n);
|
||||
pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
|
||||
n_active = svcntp_b64(svptrue_b64(), pn);
|
||||
} while (svptest_any(svptrue_b64(), pn));
|
||||
#else
|
||||
pn = svwhilelt_b32(js, N);
|
||||
pn = svwhilelt_b32((uint32_t)js, (uint32_t)N);
|
||||
n_active = svcntp_b32(svptrue_b32(), pn);
|
||||
} while (svptest_any(svptrue_b32(), pn));
|
||||
#endif
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* Copyright 2023 The OpenBLAS Project */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
|
@ -55,12 +56,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
|
|||
jj = offset;
|
||||
#ifdef DOUBLE
|
||||
int64_t js = 0;
|
||||
svbool_t pn = svwhilelt_b64(js, n);
|
||||
svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
|
||||
int n_active = svcntp_b64(svptrue_b64(), pn);
|
||||
#else
|
||||
int32_t N = n;
|
||||
int32_t js = 0;
|
||||
svbool_t pn = svwhilelt_b32(js, N);
|
||||
svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N);
|
||||
int n_active = svcntp_b32(svptrue_b32(), pn);
|
||||
#endif
|
||||
do {
|
||||
|
@ -104,11 +105,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
|
|||
|
||||
js += n_active;
|
||||
#ifdef DOUBLE
|
||||
pn = svwhilelt_b64(js, n);
|
||||
pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
|
||||
n_active = svcntp_b64(svptrue_b64(), pn);
|
||||
} while (svptest_any(svptrue_b64(), pn));
|
||||
#else
|
||||
pn = svwhilelt_b32(js, N);
|
||||
pn = svwhilelt_b32((uint32_t)js, (uint32_t)N);
|
||||
n_active = svcntp_b32(svptrue_b32(), pn);
|
||||
} while (svptest_any(svptrue_b32(), pn));
|
||||
#endif
|
||||
|
|
|
@ -24,7 +24,12 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
|||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
/*
 * NVIDIA HPC compiler only: encode the compiler version as major * 100 + minor
 * and, for versions below 2309, apply `#pragma opt 1` to this file.
 * NOTE(review): presumably a workaround for an optimizer problem in older
 * releases -- confirm against the originating issue.
 */
#if defined(__NVCOMPILER)
#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
#if NVCOMPVERS < 2309
#pragma opt 1
#endif /* NVCOMPVERS < 2309 */
#endif /* __NVCOMPILER */
|
||||
|
||||
#include "common.h"
|
||||
|
||||
|
|
|
@ -239,8 +239,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1rd z15.d, p0/z, [pB, 56]
|
||||
|
||||
add pB, pB, 64
|
||||
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
|
||||
.endm
|
||||
|
||||
.macro KERNELv1x4_M1
|
||||
|
@ -276,9 +274,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ld1rd z15.d, p0/z, [pB, 56]
|
||||
|
||||
add pB, pB, 64
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
|
||||
.endm
|
||||
|
||||
.macro KERNELv1x4_M2
|
||||
|
@ -313,11 +308,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
OP_ri z23.d, p1/m, z2.d, z15.d
|
||||
ld1rd z15.d, p0/z, [pB, 56]
|
||||
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
|
||||
add pB, pB, 64
|
||||
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
|
||||
.endm
|
||||
|
||||
.macro KERNELv1x4_E
|
||||
|
@ -340,11 +331,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
OP_ir z23.d, p1/m, z3.d, z14.d
|
||||
OP_ii z22.d, p1/m, z3.d, z15.d
|
||||
OP_ri z23.d, p1/m, z2.d, z15.d
|
||||
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
|
||||
|
||||
.endm
|
||||
|
||||
.macro KERNELv1x4_SUB
|
||||
|
@ -382,14 +368,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
OP_ir z23.d, p1/m, z1.d, z14.d
|
||||
OP_ii z22.d, p1/m, z1.d, z15.d
|
||||
OP_ri z23.d, p1/m, z0.d, z15.d
|
||||
|
||||
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
|
||||
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
|
||||
.endm
|
||||
|
||||
.macro SAVEv1x4
|
||||
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
|
||||
|
||||
ld2d {z24.d, z25.d}, p1/z, [pCRow0]
|
||||
fmla z24.d, p1/m, z16.d, alphaz_R
|
||||
fmls z24.d, p1/m, z17.d, alphaz_I
|
||||
|
@ -407,7 +388,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
st2d {z26.d, z27.d}, p1, [pCRow1]
|
||||
|
||||
add pCRow1, pCRow1, lanes, lsl #4
|
||||
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
|
||||
|
||||
ld2d {z28.d, z29.d}, p1/z, [pCRow2]
|
||||
fmla z28.d, p1/m, z20.d, alphaz_R
|
||||
|
@ -425,12 +405,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
fmla z31.d, p1/m, z23.d, alphaz_R
|
||||
st2d {z30.d, z31.d}, p1, [pCRow3]
|
||||
|
||||
prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
|
||||
|
||||
add pCRow3, pCRow3, lanes, lsl #4 // pC = pC + lanes * 2 *8
|
||||
|
||||
prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
|
||||
|
||||
.endm
|
||||
|
||||
/******************************************************************************/
|
||||
|
@ -466,8 +442,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro SAVEv1x2
|
||||
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
|
||||
|
||||
ld2d {z24.d, z25.d}, p1/z, [pCRow0]
|
||||
fmla z24.d, p1/m, z16.d, alphaz_R
|
||||
fmls z24.d, p1/m, z17.d, alphaz_I
|
||||
|
@ -485,10 +459,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
st2d {z26.d, z27.d}, p1, [pCRow1]
|
||||
|
||||
add pCRow1, pCRow1, lanes, lsl #4
|
||||
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
|
||||
|
||||
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
|
||||
|
||||
.endm
|
||||
|
||||
/******************************************************************************/
|
||||
|
@ -516,8 +486,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
.endm
|
||||
|
||||
.macro SAVEv1x1
|
||||
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
|
||||
|
||||
ld2d {z24.d, z25.d}, p1/z, [pCRow0]
|
||||
fmla z24.d, p1/m, z16.d, alphaz_R
|
||||
fmls z24.d, p1/m, z17.d, alphaz_I
|
||||
|
@ -527,8 +495,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
add pCRow0, pCRow0, lanes, lsl #4 // pC = pC + lanes * 2 *8
|
||||
|
||||
prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
|
||||
|
||||
.endm
|
||||
|
||||
/******************************************************************************/
|
||||
|
@ -553,9 +519,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
stp x26, x27, [sp, #(9 * 16)]
|
||||
str x28, [sp, #(10 * 16)]
|
||||
|
||||
prfm PLDL1KEEP, [origPB]
|
||||
prfm PLDL1KEEP, [origPA]
|
||||
|
||||
fmov alphaR, d0
|
||||
dup alphaz_R, alphaR
|
||||
fmov alphaI, d1
|
||||
|
@ -676,10 +639,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
bne .Lzgemm_kernel_L4_Mv1_46
|
||||
|
||||
.Lzgemm_kernel_L4_Mv1_100:
|
||||
prfm PLDL1KEEP, [pA]
|
||||
prfm PLDL1KEEP, [pA, #64]
|
||||
prfm PLDL1KEEP, [origPB]
|
||||
|
||||
SAVEv1x4
|
||||
|
||||
.Lzgemm_kernel_L4_Mv1_END:
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* Copyright 2023 The OpenBLAS Project */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
|
@ -52,7 +53,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
|
|||
boffset = b;
|
||||
|
||||
j = 0;
|
||||
svbool_t pg = svwhilelt_b64(j, n);
|
||||
svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
|
||||
uint64_t active = svcntp_b64(svptrue_b64(), pg);
|
||||
do {
|
||||
|
||||
|
@ -69,7 +70,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
|
|||
aoffset += active * lda * 2;
|
||||
|
||||
j += svcntd();
|
||||
pg = svwhilelt_b64(j, n);
|
||||
pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
|
||||
active = svcntp_b64(svptrue_b64(), pg);
|
||||
|
||||
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* Copyright 2023 The OpenBLAS Project */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
|
@ -50,7 +51,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
|
|||
boffset = b;
|
||||
|
||||
j = 0;
|
||||
svbool_t pg = svwhilelt_b64(j, n);
|
||||
svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
|
||||
uint64_t active = svcntp_b64(svptrue_b64(), pg);
|
||||
do {
|
||||
|
||||
|
@ -66,7 +67,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
|
|||
aoffset += active * 2;
|
||||
|
||||
j += svcntd();
|
||||
pg = svwhilelt_b64(j, n);
|
||||
pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
|
||||
active = svcntp_b64(svptrue_b64(), pg);
|
||||
|
||||
} while (svptest_any(svptrue_b64(), pg));
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* Copyright 2023 The OpenBLAS Project */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
|
@ -54,7 +55,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
|||
svint64_t one_vec = svdup_s64(1LL);
|
||||
|
||||
int64_t j = 0;
|
||||
svbool_t pg = svwhilelt_b64(j, n);
|
||||
svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
|
||||
int64_t active = svcntp_b64(svptrue_b64(), pg);
|
||||
svint64_t index_neg = svindex_s64(0LL, -1LL);
|
||||
svint64_t index = svindex_s64(0LL, 1LL);
|
||||
|
@ -79,7 +80,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
|||
gat_ind = svadd_m(cmp, gat_ind, lda_vec);
|
||||
gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, 2);
|
||||
if (offset <= 0) {
|
||||
svbool_t off_g = svwhilelt_b64(offset, 0LL);
|
||||
svbool_t off_g = svwhilelt_b64((int64_t)offset, (int64_t)0LL);
|
||||
data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag);
|
||||
}
|
||||
|
||||
|
@ -99,7 +100,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
|||
posX += sve_size;
|
||||
posX_vec = svdup_s64(posX);
|
||||
j += sve_size;
|
||||
pg = svwhilelt_b64(j, n);
|
||||
pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
|
||||
active = svcntp_b64(svptrue_b64(), pg);
|
||||
} while (svptest_any(svptrue_b64(), pg));
|
||||
|
||||
|
@ -117,7 +118,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
|||
|
||||
int32_t j = 0;
|
||||
int32_t N = n;
|
||||
svbool_t pg = svwhilelt_b32(j, N);
|
||||
svbool_t pg = svwhilelt_b32((uint32_t)j, (uint32_t)N);
|
||||
int32_t active = svcntp_b32(svptrue_b32(), pg);
|
||||
svint32_t index_neg = svindex_s32(0, -1);
|
||||
svint32_t index = svindex_s32(0, 1);
|
||||
|
@ -142,7 +143,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
|||
gat_ind = svadd_m(cmp, gat_ind, lda_vec);
|
||||
gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, 2);
|
||||
if (offset <= 0) {
|
||||
svbool_t off_g = svwhilelt_b32(offset, 0);
|
||||
svbool_t off_g = svwhilelt_b32((int32_t)offset, (int32_t)0);
|
||||
data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag);
|
||||
}
|
||||
|
||||
|
@ -162,7 +163,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
|||
posX += sve_size;
|
||||
posX_vec = svdup_s32(posX);
|
||||
j += sve_size;
|
||||
pg = svwhilelt_b32(j, N);
|
||||
pg = svwhilelt_b32((uint32_t)j, (uint32_t)N);
|
||||
active = svcntp_b32(svptrue_b32(), pg);
|
||||
} while (svptest_any(svptrue_b32(), pg));
|
||||
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* Copyright 2023 The OpenBLAS Project */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
|
@ -54,7 +55,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
|||
svint64_t one_vec = svdup_s64(1LL);
|
||||
|
||||
int64_t j = 0;
|
||||
svbool_t pg = svwhilelt_b64(j, n);
|
||||
svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
|
||||
int64_t active = svcntp_b64(svptrue_b64(), pg);
|
||||
svint64_t index_neg = svindex_s64(0LL, -1LL);
|
||||
svint64_t index = svindex_s64(0LL, 1LL);
|
||||
|
@ -80,7 +81,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
|||
gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, lda_vec);
|
||||
data_vec_imag = svneg_z(pg, data_vec_imag);
|
||||
if (offset <= 0) {
|
||||
svbool_t off_g = svwhilelt_b64(offset, 0LL);
|
||||
svbool_t off_g = svwhilelt_b64((int64_t)offset, (int64_t)0LL);
|
||||
data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag);
|
||||
}
|
||||
|
||||
|
@ -100,7 +101,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
|||
posX += sve_size;
|
||||
posX_vec = svdup_s64(posX);
|
||||
j += sve_size;
|
||||
pg = svwhilelt_b64(j, n);
|
||||
pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
|
||||
active = svcntp_b64(svptrue_b64(), pg);
|
||||
} while (svptest_any(svptrue_b64(), pg));
|
||||
#else
|
||||
|
@ -116,7 +117,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
|||
|
||||
int32_t j = 0;
|
||||
int32_t N = n;
|
||||
svbool_t pg = svwhilelt_b32(j, N);
|
||||
svbool_t pg = svwhilelt_b32((uint32_t)j, (uint32_t)N);
|
||||
int32_t active = svcntp_b32(svptrue_b32(), pg);
|
||||
svint32_t index_neg = svindex_s32(0, -1);
|
||||
svint32_t index = svindex_s32(0, 1);
|
||||
|
@ -142,7 +143,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
|||
gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, lda_vec);
|
||||
data_vec_imag = svneg_z(pg, data_vec_imag);
|
||||
if (offset <= 0) {
|
||||
svbool_t off_g = svwhilelt_b32(offset, 0);
|
||||
svbool_t off_g = svwhilelt_b32((int32_t)offset, (int32_t)0);
|
||||
data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag);
|
||||
}
|
||||
|
||||
|
@ -162,7 +163,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
|||
posX += sve_size;
|
||||
posX_vec = svdup_s32(posX);
|
||||
j += sve_size;
|
||||
pg = svwhilelt_b32(j, N);
|
||||
pg = svwhilelt_b32((uint32_t)j, (uint32_t)N);
|
||||
active = svcntp_b32(svptrue_b32(), pg);
|
||||
} while (svptest_any(svptrue_b32(), pg));
|
||||
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* Copyright 2023 The OpenBLAS Project */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
|
@ -53,7 +54,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
|||
svint64_t one_vec = svdup_s64(1LL);
|
||||
|
||||
int64_t j = 0;
|
||||
svbool_t pg = svwhilelt_b64(j, n);
|
||||
svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
|
||||
int64_t active = svcntp_b64(svptrue_b64(), pg);
|
||||
svint64_t index_neg = svindex_s64(0LL, -1LL);
|
||||
svint64_t index = svindex_s64(0LL, 1LL);
|
||||
|
@ -90,7 +91,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
|||
posX += sve_size;
|
||||
posX_vec = svdup_s64(posX);
|
||||
j += sve_size;
|
||||
pg = svwhilelt_b64(j, n);
|
||||
pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
|
||||
active = svcntp_b64(svptrue_b64(), pg);
|
||||
} while (svptest_any(svptrue_b64(), pg));
|
||||
|
||||
|
@ -103,7 +104,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
|||
|
||||
int32_t N = n;
|
||||
int32_t j = 0;
|
||||
svbool_t pg = svwhilelt_b32(j, N);
|
||||
svbool_t pg = svwhilelt_b32((uint32_t)j, (uint32_t)N);
|
||||
int32_t active = svcntp_b32(svptrue_b32(), pg);
|
||||
svint32_t index_neg = svindex_s32(0, -1);
|
||||
svint32_t index = svindex_s32(0, 1);
|
||||
|
@ -140,7 +141,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
|||
posX += sve_size;
|
||||
posX_vec = svdup_s32(posX);
|
||||
j += sve_size;
|
||||
pg = svwhilelt_b32(j, N);
|
||||
pg = svwhilelt_b32((uint32_t)j, (uint32_t)N);
|
||||
active = svcntp_b32(svptrue_b32(), pg);
|
||||
} while (svptest_any(svptrue_b32(), pg));
|
||||
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* Copyright 2023 The OpenBLAS Project */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
|
@ -53,7 +54,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
|||
svint64_t one_vec = svdup_s64(1LL);
|
||||
|
||||
int64_t j = 0;
|
||||
svbool_t pg = svwhilelt_b64(j, n);
|
||||
svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
|
||||
int64_t active = svcntp_b64(svptrue_b64(), pg);
|
||||
svint64_t index_neg = svindex_s64(0LL, -1LL);
|
||||
svint64_t index = svindex_s64(0LL, 1LL);
|
||||
|
@ -90,7 +91,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
|||
posX += sve_size;
|
||||
posX_vec = svdup_s64(posX);
|
||||
j += sve_size;
|
||||
pg = svwhilelt_b64(j, n);
|
||||
pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
|
||||
active = svcntp_b64(svptrue_b64(), pg);
|
||||
} while (svptest_any(svptrue_b64(), pg));
|
||||
|
||||
|
@ -103,7 +104,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
|||
|
||||
int32_t N = n;
|
||||
int32_t j = 0;
|
||||
svbool_t pg = svwhilelt_b32(j, N);
|
||||
svbool_t pg = svwhilelt_b32((uint32_t)j, (uint32_t)N);
|
||||
int32_t active = svcntp_b32(svptrue_b32(), pg);
|
||||
svint32_t index_neg = svindex_s32(0, -1);
|
||||
svint32_t index = svindex_s32(0, 1);
|
||||
|
@ -140,7 +141,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
|||
posX += sve_size;
|
||||
posX_vec = svdup_s32(posX);
|
||||
j += sve_size;
|
||||
pg = svwhilelt_b32(j, N);
|
||||
pg = svwhilelt_b32((uint32_t)j, (uint32_t)N);
|
||||
active = svcntp_b32(svptrue_b32(), pg);
|
||||
} while (svptest_any(svptrue_b32(), pg));
|
||||
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* Copyright 2023 The OpenBLAS Project */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
|
@ -54,11 +55,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
|||
FLOAT *ao;
|
||||
#ifdef DOUBLE
|
||||
svint64_t index = svindex_s64(0LL, lda);
|
||||
svbool_t pn = svwhilelt_b64(js, n);
|
||||
svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
|
||||
int n_active = svcntp_b64(svptrue_b64(), pn);
|
||||
#else
|
||||
svint32_t index = svindex_s32(0, lda);
|
||||
svbool_t pn = svwhilelt_b32(js, n);
|
||||
svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
|
||||
int n_active = svcntp_b32(svptrue_b32(), pn);
|
||||
#endif
|
||||
do
|
||||
|
@ -132,11 +133,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
|||
posY += n_active;
|
||||
js += n_active;
|
||||
#ifdef DOUBLE
|
||||
pn = svwhilelt_b64(js, n);
|
||||
pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
|
||||
n_active = svcntp_b64(svptrue_b64(), pn);
|
||||
} while (svptest_any(svptrue_b64(), pn));
|
||||
#else
|
||||
pn = svwhilelt_b32(js, n);
|
||||
pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
|
||||
n_active = svcntp_b32(svptrue_b32(), pn);
|
||||
} while (svptest_any(svptrue_b32(), pn));
|
||||
#endif
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* Copyright 2023 The OpenBLAS Project */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
|
@ -53,10 +54,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
|||
FLOAT *ao;
|
||||
js = 0;
|
||||
#ifdef DOUBLE
|
||||
svbool_t pn = svwhilelt_b64(js, n);
|
||||
svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
|
||||
int n_active = svcntp_b64(svptrue_b64(), pn);
|
||||
#else
|
||||
svbool_t pn = svwhilelt_b32(js, n);
|
||||
svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
|
||||
int n_active = svcntp_b32(svptrue_b32(), pn);
|
||||
#endif
|
||||
do
|
||||
|
@ -129,11 +130,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
|||
posY += n_active;
|
||||
js += n_active;
|
||||
#ifdef DOUBLE
|
||||
pn = svwhilelt_b64(js, n);
|
||||
pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
|
||||
n_active = svcntp_b64(svptrue_b64(), pn);
|
||||
} while (svptest_any(svptrue_b64(), pn));
|
||||
#else
|
||||
pn = svwhilelt_b32(js, n);
|
||||
pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
|
||||
n_active = svcntp_b32(svptrue_b32(), pn);
|
||||
} while (svptest_any(svptrue_b32(), pn));
|
||||
#endif
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* Copyright 2023 The OpenBLAS Project */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
|
@ -54,11 +55,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
|||
FLOAT *ao;
|
||||
#ifdef DOUBLE
|
||||
svint64_t index = svindex_s64(0LL, lda);
|
||||
svbool_t pn = svwhilelt_b64(js, n);
|
||||
svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
|
||||
int n_active = svcntp_b64(svptrue_b64(), pn);
|
||||
#else
|
||||
svint32_t index = svindex_s32(0, lda);
|
||||
svbool_t pn = svwhilelt_b32(js, n);
|
||||
svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
|
||||
int n_active = svcntp_b32(svptrue_b32(), pn);
|
||||
#endif
|
||||
do
|
||||
|
@ -132,11 +133,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
|||
posY += n_active;
|
||||
js += n_active;
|
||||
#ifdef DOUBLE
|
||||
pn = svwhilelt_b64(js, n);
|
||||
pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
|
||||
n_active = svcntp_b64(svptrue_b64(), pn);
|
||||
} while (svptest_any(svptrue_b64(), pn));
|
||||
#else
|
||||
pn = svwhilelt_b32(js, n);
|
||||
pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
|
||||
n_active = svcntp_b32(svptrue_b32(), pn);
|
||||
} while (svptest_any(svptrue_b32(), pn));
|
||||
#endif
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* Copyright 2023 The OpenBLAS Project */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
|
@ -53,10 +54,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
|||
FLOAT *ao;
|
||||
js = 0;
|
||||
#ifdef DOUBLE
|
||||
svbool_t pn = svwhilelt_b64(js, n);
|
||||
svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
|
||||
int n_active = svcntp_b64(svptrue_b64(), pn);
|
||||
#else
|
||||
svbool_t pn = svwhilelt_b32(js, n);
|
||||
svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
|
||||
int n_active = svcntp_b32(svptrue_b32(), pn);
|
||||
#endif
|
||||
do
|
||||
|
@ -128,11 +129,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
|
|||
posY += n_active;
|
||||
js += n_active;
|
||||
#ifdef DOUBLE
|
||||
pn = svwhilelt_b64(js, n);
|
||||
pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
|
||||
n_active = svcntp_b64(svptrue_b64(), pn);
|
||||
} while (svptest_any(svptrue_b64(), pn));
|
||||
#else
|
||||
pn = svwhilelt_b32(js, n);
|
||||
pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
|
||||
n_active = svcntp_b32(svptrue_b32(), pn);
|
||||
} while (svptest_any(svptrue_b32(), pn));
|
||||
#endif
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* Copyright 2023 The OpenBLAS Project */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
|
@ -52,13 +53,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
|
|||
#ifdef DOUBLE
|
||||
int64_t js = 0;
|
||||
svint64_t index = svindex_s64(0LL, lda);
|
||||
svbool_t pn = svwhilelt_b64(js, n);
|
||||
svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
|
||||
int n_active = svcntp_b64(svptrue_b64(), pn);
|
||||
#else
|
||||
int32_t N = n;
|
||||
int32_t js = 0;
|
||||
svint32_t index = svindex_s32(0, lda);
|
||||
svbool_t pn = svwhilelt_b32(js, N);
|
||||
svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N);
|
||||
int n_active = svcntp_b32(svptrue_b32(), pn);
|
||||
#endif
|
||||
do {
|
||||
|
@ -106,11 +107,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
|
|||
|
||||
js += n_active;
|
||||
#ifdef DOUBLE
|
||||
pn = svwhilelt_b64(js, n);
|
||||
pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
|
||||
n_active = svcntp_b64(svptrue_b64(), pn);
|
||||
} while (svptest_any(svptrue_b64(), pn));
|
||||
#else
|
||||
pn = svwhilelt_b32(js, N);
|
||||
pn = svwhilelt_b32((uint32_t)js, (uint32_t)N);
|
||||
n_active = svcntp_b32(svptrue_b32(), pn);
|
||||
} while (svptest_any(svptrue_b32(), pn));
|
||||
#endif
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* Copyright 2023 The OpenBLAS Project */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
|
@ -51,12 +52,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
|
|||
jj = offset;
|
||||
#ifdef DOUBLE
|
||||
int64_t js = 0;
|
||||
svbool_t pn = svwhilelt_b64(js, n);
|
||||
svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
|
||||
int n_active = svcntp_b64(svptrue_b64(), pn);
|
||||
#else
|
||||
int32_t N = n;
|
||||
int32_t js = 0;
|
||||
svbool_t pn = svwhilelt_b32(js, N);
|
||||
svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N);
|
||||
int n_active = svcntp_b32(svptrue_b32(), pn);
|
||||
#endif
|
||||
do {
|
||||
|
@ -102,11 +103,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
|
|||
|
||||
js += n_active;
|
||||
#ifdef DOUBLE
|
||||
pn = svwhilelt_b64(js, n);
|
||||
pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
|
||||
n_active = svcntp_b64(svptrue_b64(), pn);
|
||||
} while (svptest_any(svptrue_b64(), pn));
|
||||
#else
|
||||
pn = svwhilelt_b32(js, N);
|
||||
pn = svwhilelt_b32((uint32_t)js, (uint32_t)N);
|
||||
n_active = svcntp_b32(svptrue_b32(), pn);
|
||||
} while (svptest_any(svptrue_b32(), pn));
|
||||
#endif
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* Copyright 2023 The OpenBLAS Project */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
|
@ -52,13 +53,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
|
|||
#ifdef DOUBLE
|
||||
int64_t js = 0;
|
||||
svint64_t index = svindex_s64(0LL, lda);
|
||||
svbool_t pn = svwhilelt_b64(js, n);
|
||||
svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
|
||||
int n_active = svcntp_b64(svptrue_b64(), pn);
|
||||
#else
|
||||
int32_t N = n;
|
||||
int32_t js = 0;
|
||||
svint32_t index = svindex_s32(0, lda);
|
||||
svbool_t pn = svwhilelt_b32(js, N);
|
||||
svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N);
|
||||
int n_active = svcntp_b32(svptrue_b32(), pn);
|
||||
#endif
|
||||
do {
|
||||
|
@ -106,11 +107,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
|
|||
|
||||
js += n_active;
|
||||
#ifdef DOUBLE
|
||||
pn = svwhilelt_b64(js, n);
|
||||
pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
|
||||
n_active = svcntp_b64(svptrue_b64(), pn);
|
||||
} while (svptest_any(svptrue_b64(), pn));
|
||||
#else
|
||||
pn = svwhilelt_b32(js, N);
|
||||
pn = svwhilelt_b32((uint32_t)js, (uint32_t)N);
|
||||
n_active = svcntp_b32(svptrue_b32(), pn);
|
||||
} while (svptest_any(svptrue_b32(), pn));
|
||||
#endif
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
/*********************************************************************/
|
||||
/* Copyright 2009, 2010 The University of Texas at Austin. */
|
||||
/* Copyright 2023 The OpenBLAS Project */
|
||||
/* All rights reserved. */
|
||||
/* */
|
||||
/* Redistribution and use in source and binary forms, with or */
|
||||
|
@ -51,12 +52,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
|
|||
jj = offset;
|
||||
#ifdef DOUBLE
|
||||
int64_t js = 0;
|
||||
svbool_t pn = svwhilelt_b64(js, n);
|
||||
svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
|
||||
int n_active = svcntp_b64(svptrue_b64(), pn);
|
||||
#else
|
||||
int32_t N = n;
|
||||
int32_t js = 0;
|
||||
svbool_t pn = svwhilelt_b32(js, N);
|
||||
svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N);
|
||||
int n_active = svcntp_b32(svptrue_b32(), pn);
|
||||
#endif
|
||||
do {
|
||||
|
@ -102,11 +103,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
|
|||
|
||||
js += n_active;
|
||||
#ifdef DOUBLE
|
||||
pn = svwhilelt_b64(js, n);
|
||||
pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
|
||||
n_active = svcntp_b64(svptrue_b64(), pn);
|
||||
} while (svptest_any(svptrue_b64(), pn));
|
||||
#else
|
||||
pn = svwhilelt_b32(js, N);
|
||||
pn = svwhilelt_b32((uint32_t)js, (uint32_t)N);
|
||||
n_active = svcntp_b32(svptrue_b32(), pn);
|
||||
} while (svptest_any(svptrue_b32(), pn));
|
||||
#endif
|
||||
|
|
|
@ -43,7 +43,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
|
|||
|
||||
BLASLONG i, ii, j, jj;
|
||||
|
||||
FLOAT data01, data02;
|
||||
FLOAT data01=0.0, data02=0.0;
|
||||
FLOAT *a1;
|
||||
|
||||
lda *= 2;
|
||||
|
|
|
@ -47,6 +47,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
|
|||
FLOAT data05, data06, data07, data08;
|
||||
FLOAT *a1, *a2;
|
||||
|
||||
data01=data02=data07=data08=0.0;
|
||||
lda *= 2;
|
||||
|
||||
jj = offset;
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
ifndef NO_LASX
|
||||
DGEMMKERNEL = dgemm_kernel_16x4.S
|
||||
DGEMMINCOPY = dgemm_ncopy_16.S
|
||||
DGEMMITCOPY = dgemm_tcopy_16.S
|
||||
|
@ -8,7 +9,26 @@ DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
|||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
DGEMVNKERNEL = dgemv_n_8_lasx.S
|
||||
DGEMVTKERNEL = dgemv_t_8_lasx.S
|
||||
|
||||
SGEMMKERNEL = sgemm_kernel_16x8_lasx.S
|
||||
SGEMMINCOPY = sgemm_ncopy_16_lasx.S
|
||||
SGEMMITCOPY = sgemm_tcopy_16_lasx.S
|
||||
SGEMMONCOPY = sgemm_ncopy_8_lasx.S
|
||||
SGEMMOTCOPY = sgemm_tcopy_8_lasx.S
|
||||
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
|
||||
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
|
|
@ -132,12 +132,16 @@ CSWAPKERNEL = ../arm/zswap.c
|
|||
ZSWAPKERNEL = ../arm/zswap.c
|
||||
|
||||
SGEMVNKERNEL = ../arm/gemv_n.c
|
||||
ifndef DGEMVNKERNEL
|
||||
DGEMVNKERNEL = ../arm/gemv_n.c
|
||||
endif
|
||||
CGEMVNKERNEL = ../arm/zgemv_n.c
|
||||
ZGEMVNKERNEL = ../arm/zgemv_n.c
|
||||
|
||||
SGEMVTKERNEL = ../arm/gemv_t.c
|
||||
ifndef DGEMVTKERNEL
|
||||
DGEMVTKERNEL = ../arm/gemv_t.c
|
||||
endif
|
||||
CGEMVTKERNEL = ../arm/zgemv_t.c
|
||||
ZGEMVTKERNEL = ../arm/zgemv_t.c
|
||||
|
||||
|
|
|
@ -61,7 +61,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
fmov.d s2, s1
|
||||
bge $r0, N, .L999
|
||||
slli.d INCX, INCX, ZBASE_SHIFT
|
||||
bge $r0, INCX, .L999
|
||||
beq $r0, INCX, .L999
|
||||
srai.d I, N, 2
|
||||
bge $r0, I, .L25
|
||||
LD a1, X, 0 * SIZE
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,546 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2023, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
#define ASSEMBLER
|
||||
|
||||
#include "common.h"
|
||||
#include "loongarch64_asm.S"
|
||||
|
||||
/*********************************************************************
|
||||
* 2023/07/14 guxiwei
|
||||
* UTEST : OK
|
||||
* CTEST : OK
|
||||
* TEST : OK
|
||||
*
|
||||
*
|
||||
*********************************************************************/
|
||||
|
||||
/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha,
|
||||
* FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
|
||||
*/
|
||||
// Incoming arguments, named after the parameters of the C prototype quoted in
// the comment just above (m, n, alpha, a, lda, x, inc_x, y, inc_y).
#define M $r4
|
||||
#define N $r5
|
||||
#define ALPHA $f0
|
||||
#define A $r7
|
||||
#define LDA $r8
|
||||
#define X $r9
|
||||
#define INC_X $r10
|
||||
#define Y $r11
|
||||
// NOTE(review): $r6 is the third integer argument register, i.e. the slot of
// the dummy1 parameter; inc_y is presumably loaded into it by the function
// prologue, which is not visible in this chunk -- confirm.
#define INC_Y $r6
|
||||
|
||||
// Scalar working registers. Their roles are only suggested by their names in
// this chunk; the code that uses most of them lies further down the file.
#define J $r12
|
||||
#define I $r13
|
||||
#define K $r14
|
||||
#define Y_ORG $r15
|
||||
#define OFFSET $r16
|
||||
#define K_LDA $r17
|
||||
#define M8 $r18
|
||||
// T0 is used as a temporary address register by the *_GAP load macros.
#define T0 $r19
|
||||
// NOTE(review): PA0..PA7 are presumably pointers into A -- confirm against the
// kernel body.
#define PA0 $r20
|
||||
// The numbering jumps from $r20 to $r23: $r21 and $r22 are not used.
// NOTE(review): in the LoongArch psABI $r23..$r29 are callee-saved (s0..s6),
// so the function presumably saves and restores them -- confirm in the
// prologue/epilogue, which are not visible here.
#define PA1 $r23
|
||||
#define PA2 $r24
|
||||
#define PA3 $r25
|
||||
#define PA4 $r26
|
||||
#define PA5 $r27
|
||||
#define PA6 $r28
|
||||
#define PA7 $r29
|
||||
|
||||
// LASX ($xr) vector register aliases.
// VALPHA is the multiplier applied to the loaded x values in the DLOAD_X_*
// macros.
#define VALPHA $xr1
|
||||
// X0..X7: destinations of the DLOAD_X_* macros.
#define X0 $xr2
|
||||
#define X1 $xr3
|
||||
#define X2 $xr4
|
||||
#define X3 $xr5
|
||||
#define X4 $xr6
|
||||
#define X5 $xr7
|
||||
#define X6 $xr8
|
||||
#define X7 $xr9
|
||||
// Y0/Y1: vectors loaded and stored by the DLOAD_Y_* / DSTORE_Y_* macros.
// The scalar forms of those macros address the same registers as $f10/$f11.
#define Y0 $xr10
|
||||
#define Y1 $xr11
|
||||
// A0..A15: sixteen further vector registers.
// NOTE(review): presumably they hold elements of matrix A in the kernel body
// (not visible here) -- confirm. A1..A3 and A5..A7 are also written through
// their scalar views $f13..$f15 / $f17..$f19 by DLOAD_Y_8_GAP.
#define A0 $xr12
|
||||
#define A1 $xr13
|
||||
#define A2 $xr14
|
||||
#define A3 $xr15
|
||||
#define A4 $xr16
|
||||
#define A5 $xr17
|
||||
#define A6 $xr18
|
||||
#define A7 $xr19
|
||||
#define A8 $xr20
|
||||
#define A9 $xr21
|
||||
#define A10 $xr22
|
||||
#define A11 $xr23
|
||||
#define A12 $xr24
|
||||
#define A13 $xr25
|
||||
#define A14 $xr26
|
||||
#define A15 $xr27
|
||||
|
||||
// DLOAD_X_8: fetch the eight doubles at byte offsets 0x00..0x38 from X into
// X0..X7 and multiply each of them by VALPHA.
// NOTE(review): GLDREPL and GMUL are defined outside this view; presumably
// they emit one xvldrepl.d / xvfmul.d per operand triple - confirm.
.macro DLOAD_X_8
    GLDREPL xv, d, \
            X0, X, 0x00, X1, X, 0x08, \
            X2, X, 0x10, X3, X, 0x18, \
            X4, X, 0x20, X5, X, 0x28, \
            X6, X, 0x30, X7, X, 0x38
    GMUL    xvf, d, \
            X0, X0, VALPHA, X1, X1, VALPHA, \
            X2, X2, VALPHA, X3, X3, VALPHA, \
            X4, X4, VALPHA, X5, X5, VALPHA, \
            X6, X6, VALPHA, X7, X7, VALPHA
.endm
|
||||
|
||||
// DLOAD_X_4: fetch the four doubles at byte offsets 0x00..0x18 from X into
// X0..X3 and multiply each of them by VALPHA.
// NOTE(review): GLDREPL/GMUL live outside this view - semantics assumed.
.macro DLOAD_X_4
    GLDREPL xv, d, \
            X0, X, 0x00, X1, X, 0x08, \
            X2, X, 0x10, X3, X, 0x18
    GMUL    xvf, d, \
            X0, X0, VALPHA, X1, X1, VALPHA, \
            X2, X2, VALPHA, X3, X3, VALPHA
.endm
|
||||
|
||||
// DLOAD_X_2: fetch the two doubles at byte offsets 0x00 and 0x08 from X into
// X0/X1 and multiply both by VALPHA.
// NOTE(review): GLDREPL/GMUL live outside this view - semantics assumed.
.macro DLOAD_X_2
    GLDREPL xv, d, \
            X0, X, 0x00, \
            X1, X, 0x08
    GMUL    xvf, d, \
            X0, X0, VALPHA, \
            X1, X1, VALPHA
.endm
|
||||
|
||||
// DLOAD_X_1: fetch the double at X into X0 and multiply it by VALPHA.
// NOTE(review): GLDREPL/GMUL live outside this view - semantics assumed.
.macro DLOAD_X_1
    GLDREPL xv, d, X0, X, 0x00
    GMUL    xvf, d, X0, X0, VALPHA
.endm
|
||||
|
||||
// DLOAD_Y_8: contiguous y - load two 256-bit vectors (byte offsets 0 and
// 0x20 from Y) into Y0 and Y1, i.e. eight doubles.
.macro DLOAD_Y_8
    GLD     xv, , \
            Y0, Y, 0, \
            Y1, Y, 0x20
.endm
|
||||
|
||||
// DLOAD_Y_4: contiguous y - load one 256-bit vector (four doubles) from Y
// into Y0.
.macro DLOAD_Y_4
    GLD     xv, , Y0, Y, 0
.endm
|
||||
|
||||
// DLOAD_Y_1: load a single double from Y into $f10, the scalar register
// that shares its number with Y0 ($xr10).
.macro DLOAD_Y_1
    fld.d   $f10, Y, 0
.endm
|
||||
|
||||
// DSTORE_Y_8: contiguous y - write Y0 and Y1 back to byte offsets 0 and
// 0x20 from Y.
// NOTE(review): GST is defined outside this view; presumably the store
// counterpart of GLD (xvst here) - confirm.
.macro DSTORE_Y_8
    GST     xv, , \
            Y0, Y, 0, \
            Y1, Y, 0x20
.endm
|
||||
|
||||
// DSTORE_Y_4: contiguous y - write Y0 back to Y.
// NOTE(review): GST is defined outside this view; presumably xvst - confirm.
.macro DSTORE_Y_4
    GST     xv, , Y0, Y, 0
.endm
|
||||
|
||||
// DSTORE_Y_1: store the single double held in $f10 to Y.
.macro DSTORE_Y_1
    fst.d   $f10, Y, 0
.endm
|
||||
|
||||
// DLOAD_Y_8_GAP: strided y (INC_Y is a byte stride) - vector loads cannot be
// used, so eight scalars are loaded one by one and then packed into Y0/Y1.
// Scalars land in $f10 (lane 0 of Y0), $f13..$f15 (A1..A3), $f11 (lane 0 of
// Y1) and $f17..$f19 (A5..A7); GINSVE0 then inserts element 0 of each A
// register into lanes 1..3 of Y0 and Y1.
// Clobbers T0, A1..A3 and A5..A7.
.macro DLOAD_Y_8_GAP
    fld.d       $f10, Y, 0
    fldx.d      $f13, Y, INC_Y
    PTR_ALSL    T0, INC_Y, Y, 1         // T0 = Y + 2 * INC_Y
    fld.d       $f14, T0, 0
    fldx.d      $f15, T0, INC_Y
    PTR_ALSL    T0, INC_Y, Y, 2         // T0 = Y + 4 * INC_Y
    fld.d       $f11, T0, 0
    fldx.d      $f17, T0, INC_Y
    PTR_ADD     T0, T0, INC_Y
    PTR_ADD     T0, T0, INC_Y           // T0 = Y + 6 * INC_Y
    fld.d       $f18, T0, 0
    fldx.d      $f19, T0, INC_Y
    GINSVE0     xv, d, \
                Y0, A1, 1, Y0, A2, 2, Y0, A3, 3, \
                Y1, A5, 1, Y1, A6, 2, Y1, A7, 3
.endm
|
||||
|
||||
// DLOAD_Y_4_GAP: strided y - load four scalars (stride INC_Y bytes) into
// $f10 (lane 0 of Y0) and $f13..$f15 (A1..A3), then insert element 0 of
// A1..A3 into lanes 1..3 of Y0.
// Clobbers T0 and A1..A3.
.macro DLOAD_Y_4_GAP
    fld.d       $f10, Y, 0
    fldx.d      $f13, Y, INC_Y
    PTR_ALSL    T0, INC_Y, Y, 1         // T0 = Y + 2 * INC_Y
    fld.d       $f14, T0, 0
    fldx.d      $f15, T0, INC_Y
    GINSVE0     xv, d, \
                Y0, A1, 1, \
                Y0, A2, 2, \
                Y0, A3, 3
.endm
|
||||
|
||||
// DSTORE_Y_8_GAP: strided y - store the four lanes of Y0 followed by the
// four lanes of Y1 to eight addresses spaced INC_Y bytes apart, starting at
// Y. T0 walks the destination addresses; Y itself is left unchanged.
.macro DSTORE_Y_8_GAP
    // lanes 0..3 of Y0
    xvstelm.d   Y0, Y, 0, 0
    PTR_ADD     T0, Y, INC_Y
    xvstelm.d   Y0, T0, 0, 1
    PTR_ADD     T0, T0, INC_Y
    xvstelm.d   Y0, T0, 0, 2
    PTR_ADD     T0, T0, INC_Y
    xvstelm.d   Y0, T0, 0, 3

    // lanes 0..3 of Y1
    PTR_ADD     T0, T0, INC_Y
    xvstelm.d   Y1, T0, 0, 0
    PTR_ADD     T0, T0, INC_Y
    xvstelm.d   Y1, T0, 0, 1
    PTR_ADD     T0, T0, INC_Y
    xvstelm.d   Y1, T0, 0, 2
    PTR_ADD     T0, T0, INC_Y
    xvstelm.d   Y1, T0, 0, 3
.endm
|
||||
|
||||
// DSTORE_Y_4_GAP: strided y - store the four lanes of Y0 to four addresses
// spaced INC_Y bytes apart, starting at Y. T0 walks the destination
// addresses; Y itself is left unchanged.
.macro DSTORE_Y_4_GAP
    xvstelm.d   Y0, Y, 0, 0
    PTR_ADD     T0, Y, INC_Y
    xvstelm.d   Y0, T0, 0, 1
    PTR_ADD     T0, T0, INC_Y
    xvstelm.d   Y0, T0, 0, 2
    PTR_ADD     T0, T0, INC_Y
    xvstelm.d   Y0, T0, 0, 3
.endm
|
||||
|
||||
// DLOAD_X_8_GAP: strided x - load eight doubles spaced INC_X bytes apart,
// each one replicated across a whole vector (xvldrepl.d), into X0..X7, then
// multiply every vector by VALPHA. T0 walks the source addresses; X itself
// is left unchanged.
// NOTE(review): GMUL is defined outside this view - semantics assumed.
.macro DLOAD_X_8_GAP
    xvldrepl.d  X0, X, 0x00
    PTR_ADD     T0, X, INC_X
    xvldrepl.d  X1, T0, 0x00
    PTR_ADD     T0, T0, INC_X
    xvldrepl.d  X2, T0, 0x00
    PTR_ADD     T0, T0, INC_X
    xvldrepl.d  X3, T0, 0x00
    PTR_ADD     T0, T0, INC_X
    xvldrepl.d  X4, T0, 0x00
    PTR_ADD     T0, T0, INC_X
    xvldrepl.d  X5, T0, 0x00
    PTR_ADD     T0, T0, INC_X
    xvldrepl.d  X6, T0, 0x00
    PTR_ADD     T0, T0, INC_X
    xvldrepl.d  X7, T0, 0x00
    GMUL        xvf, d, \
                X0, X0, VALPHA, X1, X1, VALPHA, \
                X2, X2, VALPHA, X3, X3, VALPHA, \
                X4, X4, VALPHA, X5, X5, VALPHA, \
                X6, X6, VALPHA, X7, X7, VALPHA
.endm
|
||||
|
||||
// DLOAD_X_4_GAP: strided x - load four doubles spaced INC_X bytes apart,
// each replicated across a vector, into X0..X3 and multiply them by VALPHA.
// T0 walks the source addresses; X itself is left unchanged.
.macro DLOAD_X_4_GAP
    xvldrepl.d  X0, X, 0x00
    PTR_ADD     T0, X, INC_X
    xvldrepl.d  X1, T0, 0x00
    PTR_ADD     T0, T0, INC_X
    xvldrepl.d  X2, T0, 0x00
    PTR_ADD     T0, T0, INC_X
    xvldrepl.d  X3, T0, 0x00
    GMUL        xvf, d, \
                X0, X0, VALPHA, X1, X1, VALPHA, \
                X2, X2, VALPHA, X3, X3, VALPHA
.endm
|
||||
|
||||
// DLOAD_X_2_GAP: strided x - load the doubles at X and X + INC_X, each
// replicated across a vector, into X0/X1 and multiply both by VALPHA.
.macro DLOAD_X_2_GAP
    xvldrepl.d  X0, X, 0x00
    PTR_ADD     T0, X, INC_X
    xvldrepl.d  X1, T0, 0x00
    GMUL        xvf, d, \
                X0, X0, VALPHA, \
                X1, X1, VALPHA
.endm
|
||||
|
||||
// DGEMV_N_8x8: 8 rows x 8 columns. For each column pointer PA0..PA7 two
// vectors are loaded (A0/A1 .. A14/A15) and accumulated into Y0/Y1 after
// multiplication by the matching X0..X7.
// NOTE(review): GLD_INC and GMADD are defined outside this view; presumably
// GLD_INC advances the pointer by 0x20 after every load (so each PAx moves by
// 0x40 here) and GMADD emits xvfmadd.d out, in0, in1, in2 - confirm.
.macro DGEMV_N_8x8
    GLD_INC xv, , 0x20, \
            A0,  PA0, 0, A1,  PA0, 0, \
            A2,  PA1, 0, A3,  PA1, 0, \
            A4,  PA2, 0, A5,  PA2, 0, \
            A6,  PA3, 0, A7,  PA3, 0, \
            A8,  PA4, 0, A9,  PA4, 0, \
            A10, PA5, 0, A11, PA5, 0, \
            A12, PA6, 0, A13, PA6, 0, \
            A14, PA7, 0, A15, PA7, 0

    GMADD   xvf, d, \
            Y0, A0,  X0, Y0, Y1, A1,  X0, Y1, \
            Y0, A2,  X1, Y0, Y1, A3,  X1, Y1, \
            Y0, A4,  X2, Y0, Y1, A5,  X2, Y1, \
            Y0, A6,  X3, Y0, Y1, A7,  X3, Y1, \
            Y0, A8,  X4, Y0, Y1, A9,  X4, Y1, \
            Y0, A10, X5, Y0, Y1, A11, X5, Y1, \
            Y0, A12, X6, Y0, Y1, A13, X6, Y1, \
            Y0, A14, X7, Y0, Y1, A15, X7, Y1
.endm
|
||||
|
||||
// DGEMV_N_4x8: 4 rows x 8 columns. One vector per column pointer PA0..PA7
// is loaded into the even A registers and accumulated into Y0 after
// multiplication by X0..X7.
// NOTE(review): GLD_INC/GMADD are defined outside this view; presumably each
// PAx is advanced by 0x20 - confirm.
.macro DGEMV_N_4x8
    GLD_INC xv, , 0x20, \
            A0,  PA0, 0, A2,  PA1, 0, \
            A4,  PA2, 0, A6,  PA3, 0, \
            A8,  PA4, 0, A10, PA5, 0, \
            A12, PA6, 0, A14, PA7, 0

    GMADD   xvf, d, \
            Y0, A0,  X0, Y0, Y0, A2,  X1, Y0, \
            Y0, A4,  X2, Y0, Y0, A6,  X3, Y0, \
            Y0, A8,  X4, Y0, Y0, A10, X5, Y0, \
            Y0, A12, X6, Y0, Y0, A14, X7, Y0
.endm
|
||||
|
||||
// DGEMV_N_1x8: 1 row x 8 columns, scalar path. One double is loaded from
// each of PA0..PA7 into $f12, $f14, ..., $f26 and accumulated into $f10
// (the y element) after multiplication by $f2..$f9, the scalar registers
// sharing their numbers with X0..X7.
// NOTE(review): GLD_INC/GMADD are defined outside this view; presumably each
// PAx is advanced by 0x08 and GMADD emits fmadd.d - confirm.
// The operand list previously ended with a dangling comma, which handed an
// empty trailing argument to the variadic GMADD; it is removed here.
.macro DGEMV_N_1x8
    GLD_INC f, d, 0x08, \
            $f12, PA0, 0, $f14, PA1, 0, $f16, PA2, 0, $f18, PA3, 0, \
            $f20, PA4, 0, $f22, PA5, 0, $f24, PA6, 0, $f26, PA7, 0
    GMADD   f, d, \
            $f10, $f12, $f2, $f10, \
            $f10, $f14, $f3, $f10, \
            $f10, $f16, $f4, $f10, \
            $f10, $f18, $f5, $f10, \
            $f10, $f20, $f6, $f10, \
            $f10, $f22, $f7, $f10, \
            $f10, $f24, $f8, $f10, \
            $f10, $f26, $f9, $f10
.endm
|
||||
|
||||
// DGEMV_N_8x4: 8 rows x 4 columns. Two vectors per column pointer PA0..PA3
// are loaded into A0..A7 and accumulated into Y0/Y1 after multiplication by
// X0..X3.
// NOTE(review): GLD_INC/GMADD are defined outside this view - semantics
// assumed (post-increment load, fused multiply-add).
.macro DGEMV_N_8x4
    GLD_INC xv, , 0x20, \
            A0, PA0, 0, A1, PA0, 0, \
            A2, PA1, 0, A3, PA1, 0, \
            A4, PA2, 0, A5, PA2, 0, \
            A6, PA3, 0, A7, PA3, 0

    GMADD   xvf, d, \
            Y0, A0, X0, Y0, Y1, A1, X0, Y1, \
            Y0, A2, X1, Y0, Y1, A3, X1, Y1, \
            Y0, A4, X2, Y0, Y1, A5, X2, Y1, \
            Y0, A6, X3, Y0, Y1, A7, X3, Y1
.endm
|
||||
|
||||
// DGEMV_N_4x4: 4 rows x 4 columns. One vector per column pointer PA0..PA3
// is loaded into A0/A2/A4/A6 and accumulated into Y0 after multiplication by
// X0..X3.
// NOTE(review): GLD_INC/GMADD are defined outside this view - semantics
// assumed.
.macro DGEMV_N_4x4
    GLD_INC xv, , 0x20, \
            A0, PA0, 0, A2, PA1, 0, \
            A4, PA2, 0, A6, PA3, 0

    GMADD   xvf, d, \
            Y0, A0, X0, Y0, \
            Y0, A2, X1, Y0, \
            Y0, A4, X2, Y0, \
            Y0, A6, X3, Y0
.endm
|
||||
|
||||
// DGEMV_N_1x4: 1 row x 4 columns, scalar path. Loads one double from each of
// PA0..PA3 into $f12/$f14/$f16/$f18 and accumulates into $f10 after
// multiplication by $f2..$f5.
// NOTE(review): GLD_INC/GMADD are defined outside this view - semantics
// assumed.
.macro DGEMV_N_1x4
    GLD_INC f, d, 0x08, \
            $f12, PA0, 0, $f14, PA1, 0, \
            $f16, PA2, 0, $f18, PA3, 0
    GMADD   f, d, \
            $f10, $f12, $f2, $f10, \
            $f10, $f14, $f3, $f10, \
            $f10, $f16, $f4, $f10, \
            $f10, $f18, $f5, $f10
.endm
|
||||
|
||||
// DGEMV_N_8x2: 8 rows x 2 columns. Two vectors from each of PA0 and PA1 are
// loaded into A0..A3 and accumulated into Y0/Y1 after multiplication by
// X0/X1.
// NOTE(review): GLD_INC/GMADD are defined outside this view - semantics
// assumed.
.macro DGEMV_N_8x2
    GLD_INC xv, , 0x20, \
            A0, PA0, 0, A1, PA0, 0, \
            A2, PA1, 0, A3, PA1, 0
    GMADD   xvf, d, \
            Y0, A0, X0, Y0, Y1, A1, X0, Y1, \
            Y0, A2, X1, Y0, Y1, A3, X1, Y1
.endm
|
||||
|
||||
// DGEMV_N_4x2: 4 rows x 2 columns. One vector from each of PA0 and PA1 is
// loaded into A0/A2 and accumulated into Y0 after multiplication by X0/X1.
// NOTE(review): GLD_INC/GMADD are defined outside this view - semantics
// assumed.
.macro DGEMV_N_4x2
    GLD_INC xv, , 0x20, \
            A0, PA0, 0, \
            A2, PA1, 0
    GMADD   xvf, d, \
            Y0, A0, X0, Y0, \
            Y0, A2, X1, Y0
.endm
|
||||
|
||||
// DGEMV_N_1x2: 1 row x 2 columns, scalar path. Loads one double from PA0 and
// PA1 into $f12/$f14 and accumulates into $f10 after multiplication by
// $f2/$f3.
// NOTE(review): GLD_INC/GMADD are defined outside this view - semantics
// assumed.
.macro DGEMV_N_1x2
    GLD_INC f, d, 0x08, \
            $f12, PA0, 0, \
            $f14, PA1, 0
    GMADD   f, d, \
            $f10, $f12, $f2, $f10, \
            $f10, $f14, $f3, $f10
.endm
|
||||
|
||||
// DGEMV_N_1x1: 1 row x 1 column, scalar path.
// $f10 = $f12 * $f2 + $f10, with $f12 loaded from PA0; PA0 is then advanced
// by one double (8 bytes).
.macro DGEMV_N_1x1
    fld.d       $f12, PA0, 0
    PTR_ADDI    PA0, PA0, 0x08
    fmadd.d     $f10, $f12, $f2, $f10
.endm
|
||||
|
||||
// DGEMV_N: body of the non-transposed kernel, instantiated once per
// (inc_x, inc_y) combination.
//
// Parameters:
//   XW                 label infix that keeps the local labels of each
//                      instantiation unique
//   X_8, X_4, X_2, X_1 suffixes of the DLOAD_* macro used to load 8/4/2/1
//                      elements of x (already scaled by VALPHA)
//   Y_8, Y_4, Y_1      suffixes of the DLOAD_*/DSTORE_* macros used to load
//                      and store 8/4/1 elements of y
//
// Structure: columns are consumed in groups of 8, then 4, 2 and 1 (bits of
// N). For every column group y is walked from Y_ORG in row blocks of 8, then
// 4, then the remaining M & 3 rows one at a time.
// After a column group the column pointers are advanced by
// K_LDA = group_width * LDA - M8, i.e. past the rows already consumed and on
// to the next group; X advances by group_width * INC_X.
// K is maintained as a row counter but is not read anywhere in this macro.
// Every path ends at .L_END, which is defined by the caller.
.macro DGEMV_N XW:req, X_8:req, X_4:req, X_2:req, X_1:req, Y_8:req, Y_4:req, Y_1:req
    // ---------------- column groups of 8 ----------------
    PTR_SRLI    J, N, 3
    beqz        J, .L_\XW\()_N_7
    PTR_SLLI    K_LDA, LDA, 3
    PTR_SUB     K_LDA, K_LDA, M8
.L_\XW\()_N_L8:
    DLOAD_\X_8
    xor         K, K, K
    move        Y, Y_ORG
    PTR_SRLI    I, M, 3
    beqz        I, .L_\XW\()_M_7
.align 5
.L_\XW\()_M_L8:
    DLOAD_\Y_8
    DGEMV_N_8x8
    DSTORE_\Y_8
    PTR_ADDI    I, I, -1
    PTR_ALSL    Y, INC_Y, Y, 3
    PTR_ADDI    K, K, 8
    bnez        I, .L_\XW\()_M_L8
.L_\XW\()_M_7:
    andi        I, M, 4
    beqz        I, .L_\XW\()_M_3
    DLOAD_\Y_4
    DGEMV_N_4x8
    DSTORE_\Y_4
    PTR_ALSL    Y, INC_Y, Y, 2
    PTR_ADDI    K, K, 4
.L_\XW\()_M_3:
    andi        I, M, 3
    beqz        I, .L_\XW\()_M_END
.align 5
.L_\XW\()_M_L1:
    DLOAD_\Y_1
    DGEMV_N_1x8
    DSTORE_\Y_1
    PTR_ADDI    I, I, -1
    PTR_ADD     Y, Y, INC_Y
    PTR_ADDI    K, K, 1
    bnez        I, .L_\XW\()_M_L1
.L_\XW\()_M_END:
    PTR_ADDI    J, J, -1
#if __loongarch_grlen == 64
    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \
              PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA
#else
    GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \
              PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA
#endif
    PTR_ALSL    X, INC_X, X, 3
    bnez        J, .L_\XW\()_N_L8

    // ---------------- column group of 4 ----------------
.L_\XW\()_N_7:
    andi        J, N, 4
    beqz        J, .L_\XW\()_N_3
    DLOAD_\X_4
    xor         K, K, K
    move        Y, Y_ORG

    PTR_SRLI    I, M, 3
    beqz        I, .L_\XW\()_N_4_M_7
.align 5
.L_\XW\()_N_4_M_L8:
    DLOAD_\Y_8
    DGEMV_N_8x4
    DSTORE_\Y_8
    PTR_ADDI    I, I, -1
    PTR_ADDI    K, K, 8
    PTR_ALSL    Y, INC_Y, Y, 3
    bnez        I, .L_\XW\()_N_4_M_L8
.L_\XW\()_N_4_M_7:
    andi        I, M, 4
    beqz        I, .L_\XW\()_N_4_M_3
    DLOAD_\Y_4
    DGEMV_N_4x4
    DSTORE_\Y_4
    PTR_ALSL    Y, INC_Y, Y, 2
    PTR_ADDI    K, K, 4
.L_\XW\()_N_4_M_3:
    andi        I, M, 3
    beqz        I, .L_\XW\()_N_4_M_END
.align 5
.L_\XW\()_N_4_M_L1:
    DLOAD_\Y_1
    DGEMV_N_1x4
    DSTORE_\Y_1
    PTR_ADDI    I, I, -1
    PTR_ADD     Y, Y, INC_Y
    PTR_ADDI    K, K, 1
    bnez        I, .L_\XW\()_N_4_M_L1
.L_\XW\()_N_4_M_END:
    PTR_SLLI    K_LDA, LDA, 2
    PTR_SUB     K_LDA, K_LDA, M8
    // The non-64-bit branch uses the word-sized add, matching every other
    // grlen-dependent GADD in this file (it previously repeated the "d" form).
#if __loongarch_grlen == 64
    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
#else
    GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
#endif
    PTR_ALSL    X, INC_X, X, 2

    // ---------------- column group of 2 ----------------
.L_\XW\()_N_3:
    andi        J, N, 2
    beqz        J, .L_\XW\()_N_1
    DLOAD_\X_2
    xor         K, K, K
    move        Y, Y_ORG
    PTR_SRLI    I, M, 3
    beqz        I, .L_\XW\()_N_2_M_7
.align 5
.L_\XW\()_N_2_M_L8:
    DLOAD_\Y_8
    DGEMV_N_8x2
    DSTORE_\Y_8
    PTR_ADDI    I, I, -1
    PTR_ADDI    K, K, 8
    PTR_ALSL    Y, INC_Y, Y, 3
    bnez        I, .L_\XW\()_N_2_M_L8
.L_\XW\()_N_2_M_7:
    andi        I, M, 4
    beqz        I, .L_\XW\()_N_2_M_3
    DLOAD_\Y_4
    DGEMV_N_4x2
    DSTORE_\Y_4
    PTR_ALSL    Y, INC_Y, Y, 2
    PTR_ADDI    K, K, 4
.L_\XW\()_N_2_M_3:
    andi        I, M, 3
    beqz        I, .L_\XW\()_N_2_M_END
.align 5
.L_\XW\()_N_2_M_L1:
    DLOAD_\Y_1
    DGEMV_N_1x2
    DSTORE_\Y_1
    PTR_ADDI    I, I, -1
    PTR_ADD     Y, Y, INC_Y
    PTR_ADDI    K, K, 1
    bnez        I, .L_\XW\()_N_2_M_L1
.L_\XW\()_N_2_M_END:
    PTR_SLLI    K_LDA, LDA, 1
    PTR_SUB     K_LDA, K_LDA, M8
    PTR_ADD     PA0, PA0, K_LDA
    PTR_ADD     PA1, PA1, K_LDA
    PTR_ALSL    X, INC_X, X, 1

    // ---------------- last single column ----------------
.L_\XW\()_N_1:
    andi        J, N, 1
    beqz        J, .L_END
    DLOAD_\X_1
    xor         K, K, K
    move        Y, Y_ORG
    move        I, M
    beqz        I, .L_END
.align 5
.L_\XW\()_N_1_M_L1:
    DLOAD_\Y_1
    DGEMV_N_1x1
    DSTORE_\Y_1
    PTR_ADDI    I, I, -1
    PTR_ADD     Y, Y, INC_Y
    PTR_ADDI    K, K, 1
    bnez        I, .L_\XW\()_N_1_M_L1
    b           .L_END
.endm
|
||||
|
||||
PROLOGUE
|
||||
PTR_LD INC_Y, $sp, 0
|
||||
push_if_used 17 + 7, 24 + 4
|
||||
PTR_ADDI K, $r0, 0x01
|
||||
PTR_SUB I, INC_X, K
|
||||
PTR_SUB J, INC_Y, K
|
||||
maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */
|
||||
maskeqz J, K, J /* if(inc_y == 1) j = 0; else j = 1; */
|
||||
PTR_ALSL I, I, J, 1
|
||||
GSLLI , d, LDA, LDA, 3, INC_X, INC_X, 3, INC_Y, INC_Y, 3, M8, M, 3
|
||||
xvreplve0.d VALPHA, $xr0
|
||||
move Y_ORG, Y
|
||||
move PA0, A
|
||||
#if __loongarch_grlen == 64
|
||||
GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \
|
||||
PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA
|
||||
#else
|
||||
GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \
|
||||
PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA
|
||||
#endif
|
||||
la.local T0, .L_GAP_TABLE
|
||||
PTR_ALSL I, I, T0, 1
|
||||
ld.h K, I, 0
|
||||
PTR_ADD T0, T0, K
|
||||
jirl $r0, T0, 0
|
||||
.L_GAP_TABLE:
|
||||
.hword .L_GAP_0_0 - .L_GAP_TABLE
|
||||
.hword .L_GAP_0_1 - .L_GAP_TABLE
|
||||
.hword .L_GAP_1_0 - .L_GAP_TABLE
|
||||
.hword .L_GAP_1_1 - .L_GAP_TABLE
|
||||
.L_GAP_0_0: /* if (inc_x == 1) && (incy == 1) */
|
||||
DGEMV_N GAP_0_0, X_8, X_4, X_2, X_1, Y_8, Y_4, Y_1
|
||||
.L_GAP_0_1: /* if (inc_x == 1) && (incy != 1) */
|
||||
DGEMV_N GAP_0_1, X_8, X_4, X_2, X_1, Y_8_GAP, Y_4_GAP, Y_1
|
||||
.L_GAP_1_0: /* if (inc_x != 1) && (incy == 1) */
|
||||
DGEMV_N GAP_1_0, X_8_GAP, X_4_GAP, X_2_GAP, X_1, Y_8, Y_4, Y_1
|
||||
.L_GAP_1_1: /* if (inc_x != 1) && (incy != 1) */
|
||||
DGEMV_N GAP_1_1, X_8_GAP, X_4_GAP, X_2_GAP, X_1, Y_8_GAP, Y_4_GAP, Y_1
|
||||
.L_END:
|
||||
pop_if_used 17 + 7, 24 + 4
|
||||
jirl $r0, $r1, 0x0
|
||||
EPILOGUE
|
|
@ -0,0 +1,468 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2023, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
#define ASSEMBLER
|
||||
|
||||
#include "common.h"
|
||||
#include "loongarch64_asm.S"
|
||||
|
||||
/*********************************************************************
|
||||
* 2023/07/17 guxiwei
|
||||
* UTEST : OK
|
||||
* CTEST : OK
|
||||
* TEST : OK
|
||||
*
|
||||
*
|
||||
*********************************************************************/
|
||||
|
||||
/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha,
|
||||
* FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
|
||||
*/
|
||||
#define M $r4
|
||||
#define N $r5
|
||||
#define ALPHA $f0
|
||||
#define A $r7
|
||||
#define LDA $r8
|
||||
#define X $r9
|
||||
#define INC_X $r10
|
||||
#define Y $r11
|
||||
#define INC_Y $r6
|
||||
|
||||
#define J $r12
|
||||
#define I $r13
|
||||
#define K $r14
|
||||
#define PY0 $r14
|
||||
#define X_ORG $r15
|
||||
#define PY1 $r16
|
||||
#define K_LDA $r17
|
||||
#define PY2 $r18
|
||||
#define T0 $r19
|
||||
#define PA0 $r20
|
||||
#define PA1 $r23
|
||||
#define PA2 $r24
|
||||
#define PA3 $r25
|
||||
#define PA4 $r26
|
||||
#define PA5 $r27
|
||||
#define PA6 $r28
|
||||
#define PA7 $r29
|
||||
#define M8 $r30
|
||||
|
||||
#define VALPHA $xr0
|
||||
#define X0 $xr1
|
||||
#define X1 $xr2
|
||||
#define A0 $xr3
|
||||
#define A1 $xr4
|
||||
#define A2 $xr5
|
||||
#define A3 $xr6
|
||||
#define A4 $xr7
|
||||
#define A5 $xr8
|
||||
#define A6 $xr9
|
||||
#define A7 $xr10
|
||||
#define A8 $xr11
|
||||
#define A9 $xr12
|
||||
#define A10 $xr13
|
||||
#define A11 $xr14
|
||||
#define A12 $xr15
|
||||
#define A13 $xr16
|
||||
#define A14 $xr17
|
||||
#define A15 $xr18
|
||||
#define TP0 $xr19
|
||||
#define TP1 $xr20
|
||||
#define TP2 $xr21
|
||||
#define TP3 $xr22
|
||||
#define TP4 $xr23
|
||||
#define TP5 $xr24
|
||||
#define TP6 $xr25
|
||||
#define TP7 $xr26
|
||||
#define Y0 $xr3
|
||||
#define Y1 $xr4
|
||||
#define Y2 $xr5
|
||||
#define Y3 $xr6
|
||||
#define Y4 $xr7
|
||||
#define Y5 $xr8
|
||||
#define Y6 $xr9
|
||||
#define Y7 $xr10
|
||||
|
||||
// ZERO_Y8: clear the eight accumulators TP0..TP7 by xor-ing each with
// itself.
// NOTE(review): GXOR is defined outside this view; presumably it emits
// xvxor.v per operand triple - confirm.
.macro ZERO_Y8
    GXOR    xv, v, \
            TP0, TP0, TP0, TP1, TP1, TP1, \
            TP2, TP2, TP2, TP3, TP3, TP3, \
            TP4, TP4, TP4, TP5, TP5, TP5, \
            TP6, TP6, TP6, TP7, TP7, TP7
.endm
|
||||
|
||||
// ZERO_Y4: clear the four accumulators TP0..TP3 by xor-ing each with itself.
// NOTE(review): GXOR is defined outside this view - semantics assumed.
.macro ZERO_Y4
    GXOR    xv, v, \
            TP0, TP0, TP0, TP1, TP1, TP1, \
            TP2, TP2, TP2, TP3, TP3, TP3
.endm
|
||||
|
||||
// ZERO_Y2: clear the two accumulators TP0 and TP1 by xor-ing each with
// itself.
// NOTE(review): GXOR is defined outside this view - semantics assumed.
.macro ZERO_Y2
    GXOR    xv, v, \
            TP0, TP0, TP0, \
            TP1, TP1, TP1
.endm
|
||||
|
||||
// ZERO_Y1: clear the single accumulator TP0 by xor-ing it with itself.
// NOTE(review): GXOR is defined outside this view - semantics assumed.
.macro ZERO_Y1
    GXOR    xv, v, TP0, TP0, TP0
.endm
|
||||
|
||||
// DLOAD_X8: contiguous x - load two vectors (byte offsets 0x00 and 0x20
// from X) into X0 and X1, i.e. eight doubles.
.macro DLOAD_X8
    GLD     xv, , \
            X0, X, 0x00, \
            X1, X, 0x20
.endm
|
||||
|
||||
// DLOAD_X4: contiguous x - load one vector (four doubles) from X into X0.
.macro DLOAD_X4
    GLD     xv, , X0, X, 0x00
.endm
|
||||
|
||||
// DLOAD_X8_GAP: strided x (INC_X is a byte stride) - gather eight doubles
// into X0 and X1.
// First half: scalars go to $f1 (lane 0 of X0), $f2 (X1), $f3 (A0), $f4 (A1)
// and are packed into lanes 1..3 of X0.
// Second half: scalars go to $f2 (lane 0 of X1), $f3 (A0), $f4 (A1), $f5 (A2)
// and are packed into lanes 1..3 of X1.
// Clobbers T0 and A0..A2.
.macro DLOAD_X8_GAP
    fld.d       $f1, X, 0x00
    fldx.d      $f2, X, INC_X
    PTR_ALSL    T0, INC_X, X, 1         // T0 = X + 2 * INC_X
    fld.d       $f3, T0, 0x00
    fldx.d      $f4, T0, INC_X
    GINSVE0     xv, d, \
                X0, X1, 1, \
                X0, A0, 2, \
                X0, A1, 3
    PTR_ALSL    T0, INC_X, X, 2         // T0 = X + 4 * INC_X
    fld.d       $f2, T0, 0x00
    fldx.d      $f3, T0, INC_X
    PTR_ALSL    T0, INC_X, T0, 1        // T0 = X + 6 * INC_X
    fld.d       $f4, T0, 0x00
    fldx.d      $f5, T0, INC_X
    GINSVE0     xv, d, \
                X1, A0, 1, \
                X1, A1, 2, \
                X1, A2, 3
.endm
|
||||
|
||||
// DLOAD_X4_GAP: strided x - gather four doubles into X0. Scalars go to $f1
// (lane 0 of X0), $f2 (X1), $f3 (A0) and $f4 (A1) and are then packed into
// lanes 1..3 of X0.
// Clobbers T0, X1, A0 and A1.
.macro DLOAD_X4_GAP
    fld.d       $f1, X, 0x00
    fldx.d      $f2, X, INC_X
    PTR_ALSL    T0, INC_X, X, 1         // T0 = X + 2 * INC_X
    fld.d       $f3, T0, 0x00
    fldx.d      $f4, T0, INC_X
    GINSVE0     xv, d, \
                X0, X1, 1, \
                X0, A0, 2, \
                X0, A1, 3
.endm
|
||||
|
||||
// DGEMV_T_8x8: 8 columns x 8 rows. Two vectors are loaded from each column
// pointer PA0..PA7 and multiplied by X0/X1; column k accumulates into TPk.
// NOTE(review): GLD_INC and GMADD are defined outside this view; presumably
// GLD_INC advances the pointer by 0x20 after every load and GMADD emits
// xvfmadd.d out, in0, in1, in2 - confirm.
.macro DGEMV_T_8x8
    GLD_INC xv, , 0x20, \
            A0,  PA0, 0, A1,  PA0, 0, \
            A2,  PA1, 0, A3,  PA1, 0, \
            A4,  PA2, 0, A5,  PA2, 0, \
            A6,  PA3, 0, A7,  PA3, 0, \
            A8,  PA4, 0, A9,  PA4, 0, \
            A10, PA5, 0, A11, PA5, 0, \
            A12, PA6, 0, A13, PA6, 0, \
            A14, PA7, 0, A15, PA7, 0

    GMADD   xvf, d, \
            TP0, A0,  X0, TP0, TP0, A1,  X1, TP0, \
            TP1, A2,  X0, TP1, TP1, A3,  X1, TP1, \
            TP2, A4,  X0, TP2, TP2, A5,  X1, TP2, \
            TP3, A6,  X0, TP3, TP3, A7,  X1, TP3, \
            TP4, A8,  X0, TP4, TP4, A9,  X1, TP4, \
            TP5, A10, X0, TP5, TP5, A11, X1, TP5, \
            TP6, A12, X0, TP6, TP6, A13, X1, TP6, \
            TP7, A14, X0, TP7, TP7, A15, X1, TP7
.endm
|
||||
|
||||
// DGEMV_T_8x4: 8 columns x 4 rows. One vector is loaded from each column
// pointer PA0..PA7 and multiplied by X0; column k accumulates into TPk.
// NOTE(review): GLD_INC/GMADD are defined outside this view; presumably each
// PAx is advanced by 0x20 - confirm.
// The operand list previously ended with a dangling comma, which handed an
// empty trailing argument to the variadic GMADD; it is removed here.
.macro DGEMV_T_8x4
    GLD_INC xv, , 0x20, \
            A0,  PA0, 0, A2,  PA1, 0, \
            A4,  PA2, 0, A6,  PA3, 0, \
            A8,  PA4, 0, A10, PA5, 0, \
            A12, PA6, 0, A14, PA7, 0

    GMADD   xvf, d, \
            TP0, A0,  X0, TP0, TP1, A2,  X0, TP1, \
            TP2, A4,  X0, TP2, TP3, A6,  X0, TP3, \
            TP4, A8,  X0, TP4, TP5, A10, X0, TP5, \
            TP6, A12, X0, TP6, TP7, A14, X0, TP7
.endm
|
||||
|
||||
// DGEMV_T_4x8: 4 columns x 8 rows. Two vectors are loaded from each of
// PA0..PA3 and multiplied by X0/X1; column k accumulates into TPk.
// NOTE(review): GLD_INC/GMADD are defined outside this view - semantics
// assumed.
.macro DGEMV_T_4x8
    GLD_INC xv, , 0x20, \
            A0, PA0, 0, A1, PA0, 0, \
            A2, PA1, 0, A3, PA1, 0, \
            A4, PA2, 0, A5, PA2, 0, \
            A6, PA3, 0, A7, PA3, 0

    GMADD   xvf, d, \
            TP0, A0, X0, TP0, TP0, A1, X1, TP0, \
            TP1, A2, X0, TP1, TP1, A3, X1, TP1, \
            TP2, A4, X0, TP2, TP2, A5, X1, TP2, \
            TP3, A6, X0, TP3, TP3, A7, X1, TP3
.endm
|
||||
|
||||
// DGEMV_T_4x4: 4 columns x 4 rows. One vector is loaded from each of
// PA0..PA3 and multiplied by X0; column k accumulates into TPk.
// NOTE(review): GLD_INC/GMADD are defined outside this view - semantics
// assumed.
.macro DGEMV_T_4x4
    GLD_INC xv, , 0x20, \
            A0, PA0, 0, A2, PA1, 0, \
            A4, PA2, 0, A6, PA3, 0

    GMADD   xvf, d, \
            TP0, A0, X0, TP0, \
            TP1, A2, X0, TP1, \
            TP2, A4, X0, TP2, \
            TP3, A6, X0, TP3
.endm
|
||||
|
||||
// DGEMV_T_2x8: 2 columns x 8 rows. Two vectors are loaded from each of PA0
// and PA1 and multiplied by X0/X1; the columns accumulate into TP0 and TP1.
// NOTE(review): GLD_INC/GMADD are defined outside this view - semantics
// assumed.
.macro DGEMV_T_2x8
    GLD_INC xv, , 0x20, \
            A0, PA0, 0, A1, PA0, 0, \
            A2, PA1, 0, A3, PA1, 0

    GMADD   xvf, d, \
            TP0, A0, X0, TP0, TP0, A1, X1, TP0, \
            TP1, A2, X0, TP1, TP1, A3, X1, TP1
.endm
|
||||
|
||||
// DGEMV_T_2x4: 2 columns x 4 rows. One vector is loaded from each of PA0 and
// PA1 and multiplied by X0; the columns accumulate into TP0 and TP1.
// NOTE(review): GLD_INC/GMADD are defined outside this view - semantics
// assumed.
.macro DGEMV_T_2x4
    GLD_INC xv, , 0x20, \
            A0, PA0, 0, \
            A2, PA1, 0

    GMADD   xvf, d, \
            TP0, A0, X0, TP0, \
            TP1, A2, X0, TP1
.endm
|
||||
|
||||
// DGEMV_T: body of the transposed kernel, instantiated once per inc_x case.
//
// Parameters:
//   XW      label infix that keeps the local labels of each instantiation
//           unique
//   X8, X4  suffixes of the DLOAD_* macro used to load 8 / 4 elements of x
//
// Structure: columns are consumed in groups of 8, then 4, 2 and 1 (bits of
// N). For each group the accumulators TPk are cleared, x is walked from
// X_ORG in row blocks of 8 and 4, the vector sums are reduced into Y0..Y7
// with GACC, and the remaining M & 3 rows are added in scalar code on
// $f3..$f10 (the registers sharing their numbers with Y0..Y7). Finally
// y[k] = ALPHA * sum[k] + y[k] is computed and stored with stride INC_Y.
// After a column group the column pointers advance by
// K_LDA = group_width * LDA - M8.
// NOTE(review): GACC is defined outside this view; presumably it reduces
// each TPk horizontally into element 0 of Yk - confirm.
// Every path ends at .L_END, which is defined by the caller.
.macro DGEMV_T XW:req X8:req, X4:req
    // ---------------- column groups of 8 ----------------
    PTR_SRLI    J, N, 3
    beqz        J, .L_\XW\()_N_7
    PTR_SLLI    K_LDA, LDA, 3
    PTR_SUB     K_LDA, K_LDA, M8
.L_\XW\()_N_L8:
    ZERO_Y8
    move        X, X_ORG
    PTR_SRLI    I, M, 3
    beqz        I, .L_\XW\()_M_7
.align 5
.L_\XW\()_M_L8:
    DLOAD_\X8
    DGEMV_T_8x8
    PTR_ADDI    I, I, -1
    PTR_ALSL    X, INC_X, X, 3
    bnez        I, .L_\XW\()_M_L8
.L_\XW\()_M_7:
    andi        I, M, 4
    beqz        I, .L_\XW\()_M_3
    DLOAD_\X4
    DGEMV_T_8x4
    PTR_ALSL    X, INC_X, X, 2
.L_\XW\()_M_3:
    // Fold the vector accumulators into Y0..Y7.
    GACC xvf, d, Y0, TP0, Y1, TP1, Y2, TP2, Y3, TP3, \
                 Y4, TP4, Y5, TP5, Y6, TP6, Y7, TP7
    andi        I, M, 3
    beqz        I, .L_\XW\()_M_END
.align 5
.L_\XW\()_M_L1:
    fld.d       $f1, X, 0x00
    fld.d       $f11, PA0, 0x00
    fld.d       $f12, PA1, 0x00
    fld.d       $f13, PA2, 0x00
    fld.d       $f14, PA3, 0x00
    fld.d       $f15, PA4, 0x00
    fld.d       $f16, PA5, 0x00
    fld.d       $f17, PA6, 0x00
    fld.d       $f18, PA7, 0x00
#if __loongarch_grlen == 64
    GADDI , d, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08, \
               PA4, PA4, 0x08, PA5, PA5, 0x08, PA6, PA6, 0x08, PA7, PA7, 0x08
#else
    GADDI , w, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08, \
               PA4, PA4, 0x08, PA5, PA5, 0x08, PA6, PA6, 0x08, PA7, PA7, 0x08
#endif
    GMADD   f, d, \
            $f3,  $f11, $f1, $f3,  $f4,  $f12, $f1, $f4, \
            $f5,  $f13, $f1, $f5,  $f6,  $f14, $f1, $f6, \
            $f7,  $f15, $f1, $f7,  $f8,  $f16, $f1, $f8, \
            $f9,  $f17, $f1, $f9,  $f10, $f18, $f1, $f10
    PTR_ADDI    I, I, -1
    PTR_ADD     X, X, INC_X
    bnez        I, .L_\XW\()_M_L1
.L_\XW\()_M_END:
    // Load y[0..7] (stride INC_Y); PY0/PY1/PY2 point at y[2]/y[4]/y[6].
    fld.d       $f11, Y, 0x00
    fldx.d      $f12, Y, INC_Y
    PTR_ALSL    PY0, INC_Y, Y, 1
    fld.d       $f13, PY0, 0x00
    fldx.d      $f14, PY0, INC_Y
    PTR_ALSL    PY1, INC_Y, Y, 2
    fld.d       $f15, PY1, 0x00
    fldx.d      $f16, PY1, INC_Y
    PTR_ALSL    PY2, INC_Y, PY1, 1
    fld.d       $f17, PY2, 0x00
    fldx.d      $f18, PY2, INC_Y

    GMADD   f, d, \
            $f11, ALPHA, $f3, $f11, $f12, ALPHA, $f4,  $f12, \
            $f13, ALPHA, $f5, $f13, $f14, ALPHA, $f6,  $f14, \
            $f15, ALPHA, $f7, $f15, $f16, ALPHA, $f8,  $f16, \
            $f17, ALPHA, $f9, $f17, $f18, ALPHA, $f10, $f18

    PTR_ADDI    J, J, -1
#if __loongarch_grlen == 64
    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \
              PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA
#else
    GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \
              PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA
#endif
    fst.d       $f11, Y, 0x00
    fstx.d      $f12, Y, INC_Y
    fst.d       $f13, PY0, 0x00
    fstx.d      $f14, PY0, INC_Y
    fst.d       $f15, PY1, 0x00
    fstx.d      $f16, PY1, INC_Y
    fst.d       $f17, PY2, 0x00
    fstx.d      $f18, PY2, INC_Y
    PTR_ALSL    Y, INC_Y, Y, 3
    bnez        J, .L_\XW\()_N_L8

    // ---------------- column group of 4 ----------------
.L_\XW\()_N_7:
    andi        J, N, 4
    beqz        J, .L_\XW\()_N_3
    ZERO_Y4
    move        X, X_ORG
    PTR_SRLI    I, M, 3
    beqz        I, .L_\XW\()_N_4_M_7
.align 5
.L_\XW\()_N_4_M_L8:
    DLOAD_\X8
    DGEMV_T_4x8
    PTR_ADDI    I, I, -1
    PTR_ALSL    X, INC_X, X, 3
    bnez        I, .L_\XW\()_N_4_M_L8
.L_\XW\()_N_4_M_7:
    andi        I, M, 4
    beqz        I, .L_\XW\()_N_4_M_3
    DLOAD_\X4
    DGEMV_T_4x4
    PTR_ALSL    X, INC_X, X, 2
.L_\XW\()_N_4_M_3:
    // Fold the vector accumulators into Y0..Y3.
    GACC xvf, d, Y0, TP0, Y1, TP1, Y2, TP2, Y3, TP3
    andi        I, M, 3
    beqz        I, .L_\XW\()_N_4_M_END
.align 5
.L_\XW\()_N_4_M_L1:
    fld.d       $f1, X, 0x00
    GLD_INC f, d, 0x08, \
            $f11, PA0, 0x00, $f12, PA1, 0x00, \
            $f13, PA2, 0x00, $f14, PA3, 0x00
    GMADD   f, d, \
            $f3, $f11, $f1, $f3, $f4, $f12, $f1, $f4, \
            $f5, $f13, $f1, $f5, $f6, $f14, $f1, $f6
    PTR_ADDI    I, I, -1
    PTR_ADD     X, X, INC_X
    bnez        I, .L_\XW\()_N_4_M_L1
.L_\XW\()_N_4_M_END:
    fld.d       $f11, Y, 0x00
    fldx.d      $f12, Y, INC_Y
    PTR_ALSL    PY0, INC_Y, Y, 1
    fld.d       $f13, PY0, 0x00
    fldx.d      $f14, PY0, INC_Y

    GMADD   f, d, \
            $f11, ALPHA, $f3, $f11, $f12, ALPHA, $f4, $f12, \
            $f13, ALPHA, $f5, $f13, $f14, ALPHA, $f6, $f14

    PTR_SLLI    K_LDA, LDA, 2
    PTR_SUB     K_LDA, K_LDA, M8

#if __loongarch_grlen == 64
    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
#else
    GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
#endif
    fst.d       $f11, Y, 0x00
    fstx.d      $f12, Y, INC_Y
    fst.d       $f13, PY0, 0x00
    fstx.d      $f14, PY0, INC_Y
    PTR_ALSL    Y, INC_Y, Y, 2

    // ---------------- column group of 2 ----------------
.L_\XW\()_N_3:
    andi        J, N, 2
    beqz        J, .L_\XW\()_N_1
    ZERO_Y2
    move        X, X_ORG
    PTR_SRLI    I, M, 3
    beqz        I, .L_\XW\()_N_2_M_7
.align 5
.L_\XW\()_N_2_M_L8:
    DLOAD_\X8
    DGEMV_T_2x8
    PTR_ADDI    I, I, -1
    PTR_ALSL    X, INC_X, X, 3
    bnez        I, .L_\XW\()_N_2_M_L8
.L_\XW\()_N_2_M_7:
    andi        I, M, 4
    beqz        I, .L_\XW\()_N_2_M_3
    DLOAD_\X4
    DGEMV_T_2x4
    PTR_ALSL    X, INC_X, X, 2
.L_\XW\()_N_2_M_3:
    // Fold the vector accumulators into Y0/Y1.
    GACC xvf, d, Y0, TP0, Y1, TP1
    andi        I, M, 3
    beqz        I, .L_\XW\()_N_2_M_END
.align 5
.L_\XW\()_N_2_M_L1:
    fld.d       $f1, X, 0x00
    GLD_INC f, d, 0x08, $f11, PA0, 0x00, $f12, PA1, 0x00
    GMADD   f, d, $f3, $f11, $f1, $f3, $f4, $f12, $f1, $f4
    PTR_ADDI    I, I, -1
    PTR_ADD     X, X, INC_X
    bnez        I, .L_\XW\()_N_2_M_L1
.L_\XW\()_N_2_M_END:
    fld.d       $f11, Y, 0x00
    fldx.d      $f12, Y, INC_Y

    GMADD   f, d, $f11, ALPHA, $f3, $f11, $f12, ALPHA, $f4, $f12

    PTR_SLLI    K_LDA, LDA, 1
    PTR_SUB     K_LDA, K_LDA, M8

#if __loongarch_grlen == 64
    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA
#else
    GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA
#endif
    fst.d       $f11, Y, 0x00
    fstx.d      $f12, Y, INC_Y
    PTR_ALSL    Y, INC_Y, Y, 1

    // ---------------- last single column ----------------
.L_\XW\()_N_1:
    andi        J, N, 1
    beqz        J, .L_END
    ZERO_Y1
    move        X, X_ORG
    move        I, M
    beqz        I, .L_END
.align 5
.L_\XW\()_N_1_M_L1:
    // Scalar dot product accumulated in $f19 (element 0 of TP0).
    fld.d       $f3, PA0, 0x00
    fld.d       $f1, X, 0x00
    fmadd.d     $f19, $f3, $f1, $f19
    PTR_ADDI    I, I, -1
    PTR_ADD     X, X, INC_X
    PTR_ADDI    PA0, PA0, 0x08
    bnez        I, .L_\XW\()_N_1_M_L1
    fld.d       $f3, Y, 0x00
    fmadd.d     $f3, ALPHA, $f19, $f3
    fst.d       $f3, Y, 0x00
    b           .L_END
.endm
|
||||
|
||||
PROLOGUE
|
||||
PTR_LD INC_Y, $sp, 0
|
||||
push_if_used 17 + 8, 24 + 3
|
||||
PTR_ADDI K, $r0, 0x01
|
||||
PTR_SUB I, INC_X, K
|
||||
maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */
|
||||
GSLLI , d, LDA, LDA, 3, INC_X, INC_X, 3, INC_Y, INC_Y, 3, M8, M, 3
|
||||
xvreplve0.d VALPHA, $xr0
|
||||
move X_ORG, X
|
||||
move PA0, A
|
||||
#if __loongarch_grlen == 64
|
||||
GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \
|
||||
PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA
|
||||
#else
|
||||
GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \
|
||||
PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA
|
||||
#endif
|
||||
la.local T0, .L_GAP_TABLE
|
||||
PTR_ALSL I, I, T0, 1
|
||||
ld.h K, I, 0
|
||||
PTR_ADD T0, T0, K
|
||||
jirl $r0, T0, 0
|
||||
.L_GAP_TABLE:
|
||||
.hword .L_GAP_0 - .L_GAP_TABLE
|
||||
.hword .L_GAP_1 - .L_GAP_TABLE
|
||||
.L_GAP_0: /* if (incx == 1) */
|
||||
DGEMV_T GAP_0, X8, X4
|
||||
.L_GAP_1: /* if (incx != 1) */
|
||||
DGEMV_T GAP_1, X8_GAP, X4_GAP
|
||||
.L_END:
|
||||
pop_if_used 17 + 8, 24 + 3
|
||||
jirl $r0, $r1, 0x0
|
||||
EPILOGUE
|
|
@ -70,7 +70,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
MTC s1, $r0
|
||||
bge $r0, N, .L999
|
||||
slli.d INCX, INCX, BASE_SHIFT
|
||||
bge $r0, INCX, .L999
|
||||
beq $r0, INCX, .L999
|
||||
move XX, X
|
||||
NOP
|
||||
LD a1, X, 0 * SIZE
|
||||
|
|
|
@ -0,0 +1,407 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2023, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
|
||||
#if __loongarch_grlen == 64
|
||||
#define LA_REG int64_t
|
||||
#define REG_SIZE 8
|
||||
#define REG_LOG 3
|
||||
#define PTR_ADDI addi.d
|
||||
#define PTR_ADD add.d
|
||||
#define PTR_SUB sub.d
|
||||
#define PTR_LD ld.d
|
||||
#define PTR_ST st.d
|
||||
#define PTR_SLLI slli.d
|
||||
#define PTR_SRLI srli.d
|
||||
#define PTR_SRAI srai.d
|
||||
#define PTR_MUL mul.d
|
||||
#define PTR_ALSL alsl.d
|
||||
#else
|
||||
#define LA_REG int32_t
|
||||
#define REG_SIZE 4
|
||||
#define REG_LOG 2
|
||||
#define PTR_ADDI addi.w
|
||||
#define PTR_ADD add.w
|
||||
#define PTR_SUB sub.w
|
||||
#define PTR_LD ld.w
|
||||
#define PTR_ST st.w
|
||||
#define PTR_SLLI slli.w
|
||||
#define PTR_SRLI srli.w
|
||||
#define PTR_SRAI srai.w
|
||||
#define PTR_MUL mul.w
|
||||
#define PTR_ALSL alsl.w
|
||||
#endif
|
||||
|
||||
#if __loongarch_frlen == 64
|
||||
#define FREG_SIZE 8
|
||||
#define FREG_LOG 3
|
||||
#define PTR_FLD fld.d
|
||||
#define PTR_FST fst.d
|
||||
#else
|
||||
#define FREG_SIZE 4
|
||||
#define FREG_LOG 2
|
||||
#define PTR_FLD fld.s
|
||||
#define PTR_FST fst.s
|
||||
#endif
|
||||
|
||||
// The max registers available to the user which
|
||||
// do not need to be preserved across calls.
|
||||
// Ref: https://loongson.github.io/LoongArch-Documentation/LoongArch-ELF-ABI-CN.html
|
||||
#define MAX_INT_CALLER_SAVED 17
|
||||
#define MAX_FP_CALLER_SAVED 24
|
||||
|
||||
.altmacro // Enable alternate macro mode
|
||||
|
||||
.macro push_if_used regs, fregs // regs/fregs: count of integer/FP registers the kernel uses
|
||||
.if \regs > MAX_INT_CALLER_SAVED // only registers beyond the caller-saved set are saved
|
||||
PTR_ADDI $sp, $sp, -((\regs - MAX_INT_CALLER_SAVED) << REG_LOG) // reserve one REG_SIZE slot per extra register
|
||||
push_regs 0, \regs - MAX_INT_CALLER_SAVED - 1 // store $s0 .. $s(regs - MAX_INT_CALLER_SAVED - 1)
|
||||
.endif
|
||||
.if \fregs > MAX_FP_CALLER_SAVED
|
||||
PTR_ADDI $sp, $sp, -((\fregs - MAX_FP_CALLER_SAVED) << FREG_LOG) // second area below the integer one, one FREG_SIZE slot each
|
||||
push_fregs 0, \fregs - MAX_FP_CALLER_SAVED - 1 // store $fs0 .. $fs(fregs - MAX_FP_CALLER_SAVED - 1)
|
||||
.endif
|
||||
.endm // End push_if_used; pair with pop_if_used using the same arguments
|
||||
.macro pop_if_used regs, fregs // Undo push_if_used; pass the same regs/fregs counts
|
||||
.if \fregs > MAX_FP_CALLER_SAVED // the FP area was pushed last, so it is restored and released first
|
||||
pop_fregs 0, \fregs - MAX_FP_CALLER_SAVED - 1
|
||||
PTR_ADDI $sp, $sp, (\fregs - MAX_FP_CALLER_SAVED) << FREG_LOG // release the FP save area
|
||||
.endif
|
||||
.if \regs > MAX_INT_CALLER_SAVED
|
||||
pop_regs 0, \regs - MAX_INT_CALLER_SAVED - 1
|
||||
PTR_ADDI $sp, $sp, (\regs - MAX_INT_CALLER_SAVED) << REG_LOG // release the integer save area
|
||||
.endif
|
||||
.endm // End pop_if_used
|
||||
.macro push_regs from, to // Store $s(from) .. $s(to) on the stack, recursing once per register
|
||||
PTR_ST $s\()\from, $sp, \from << REG_LOG // slot offset = index * REG_SIZE
|
||||
.if \to - \from // nonzero while from has not reached to
|
||||
push_regs %from + 1, \to // % makes the assembler evaluate the expression first (needs .altmacro)
|
||||
.endif
|
||||
.endm // End push_regs
|
||||
.macro pop_regs from, to // Reload $s(from) .. $s(to) from the slots written by push_regs
|
||||
PTR_LD $s\()\from, $sp, \from << REG_LOG // slot offset = index * REG_SIZE
|
||||
.if \to - \from // nonzero while from has not reached to
|
||||
pop_regs %from + 1, \to // % makes the assembler evaluate the expression first (needs .altmacro)
|
||||
.endif
|
||||
.endm // End pop_regs
|
||||
.macro push_fregs from, to // Store $fs(from) .. $fs(to) on the stack, recursing once per register
|
||||
PTR_FST $fs\()\from, $sp, \from << FREG_LOG // slot offset = index * FREG_SIZE
|
||||
.if \to - \from // nonzero while from has not reached to
|
||||
push_fregs %from + 1, \to // % makes the assembler evaluate the expression first (needs .altmacro)
|
||||
.endif
|
||||
.endm // End push_fregs
|
||||
.macro pop_fregs from, to // Reload $fs(from) .. $fs(to) from the slots written by push_fregs
|
||||
PTR_FLD $fs\()\from, $sp, \from << FREG_LOG // slot offset = index * FREG_SIZE
|
||||
.if \to - \from // nonzero while from has not reached to
|
||||
pop_fregs %from + 1, \to // % makes the assembler evaluate the expression first (needs .altmacro)
|
||||
.endif
|
||||
.endm // End pop_fregs
|
||||
|
||||
//
|
||||
// Instruction Related Macros
|
||||
//
|
||||
// GLD: emit one load (pre_op + ld, optionally followed by .suf_op) for every
|
||||
// (out, src, offset) triple; an empty or 0 suf_op selects the form without a suffix.
|
||||
.macro GLD pre_op:req, suf_op=0, out:req, src:req, offset:req/* imm */, more:vararg
|
||||
.ifeqs "\suf_op", "0" // no suffix requested (0 is also the default for an empty argument)
|
||||
\pre_op\()ld \out, \src, \offset
|
||||
.else
|
||||
\pre_op\()ld.\suf_op \out, \src, \offset
|
||||
.endif
|
||||
.ifnb \more // further triples were passed
|
||||
GLD \pre_op, \suf_op, \more // recurse on the remaining triples
|
||||
.endif
|
||||
.endm
|
||||
|
||||
//
|
||||
// GLD_INC: same as GLD, but after every load the address register src is
|
||||
// advanced by the immediate inc (the same inc is used for every triple).
|
||||
.macro GLD_INC pre_op:req, suf_op=0, inc:req, out:req, src:req, offset:req/* imm */, more:vararg
|
||||
.ifeqs "\suf_op", "0" // no suffix requested (0 is also the default for an empty argument)
|
||||
\pre_op\()ld \out, \src, \offset
|
||||
.else
|
||||
\pre_op\()ld.\suf_op \out, \src, \offset
|
||||
.endif
|
||||
PTR_ADDI \src, \src, \inc // post-increment the address register
|
||||
.ifnb \more // further triples were passed
|
||||
GLD_INC \pre_op, \suf_op, \inc, \more // recurse on the remaining triples
|
||||
.endif
|
||||
.endm
|
||||
//
|
||||
// GLDX is the same as GLD except that the offset is a register (ldx form)
|
||||
//
|
||||
.macro GLDX pre_op:req, suf_op=0, out:req, src:req, offset:req/* reg */, more:vararg
|
||||
.ifeqs "\suf_op", "0"
|
||||
\pre_op\()ldx \out, \src, \offset
|
||||
.else
|
||||
\pre_op\()ldx.\suf_op \out, \src, \offset
|
||||
.endif
|
||||
.ifnb \more
|
||||
GLDX \pre_op, \suf_op, \more
|
||||
.endif
|
||||
.endm
|
||||
//
|
||||
// GLDREPL
|
||||
//
|
||||
.macro GLDREPL pre_op:req, suf_op:req, out:req, src:req, offset:req/* imm */, more:vararg
|
||||
\pre_op\()ldrepl.\suf_op \out, \src, \offset
|
||||
.ifnb \more
|
||||
GLDREPL \pre_op, \suf_op, \more
|
||||
.endif
|
||||
.endm
|
||||
//
|
||||
// GST: emit one store (pre_op + st, optionally followed by .suf_op) for every
|
||||
// (src, dst, offset) triple; an empty or 0 suf_op selects the form without a suffix.
|
||||
.macro GST pre_op:req, suf_op=0, src:req, dst:req, offset:req/* imm */, more:vararg
|
||||
.ifeqs "\suf_op", "0" // no suffix requested (0 is also the default for an empty argument)
|
||||
\pre_op\()st \src, \dst, \offset
|
||||
.else
|
||||
\pre_op\()st.\suf_op \src, \dst, \offset
|
||||
.endif
|
||||
.ifnb \more // further triples were passed
|
||||
GST \pre_op, \suf_op, \more // recurse on the remaining triples
|
||||
.endif
|
||||
.endm
|
||||
//
|
||||
// GMUL
|
||||
//
|
||||
.macro GMUL pre_op, suf_op:req, out:req, in0:req, in1:req, more:vararg
|
||||
\pre_op\()mul.\suf_op \out, \in0, \in1
|
||||
.ifnb \more
|
||||
GMUL \pre_op, \suf_op, \more
|
||||
.endif
|
||||
.endm
|
||||
//
|
||||
// GMADD
|
||||
//
|
||||
.macro GMADD pre_op, suf_op:req, out:req, in0:req, in1:req, in2:req, more:vararg
|
||||
\pre_op\()madd.\suf_op \out, \in0, \in1, \in2
|
||||
.ifnb \more
|
||||
GMADD \pre_op, \suf_op, \more
|
||||
.endif
|
||||
.endm
|
||||
//
|
||||
// GADD
|
||||
//
|
||||
.macro GADD pre_op, suf_op:req, out:req, in0:req, in1:req, more:vararg
|
||||
\pre_op\()add.\suf_op \out, \in0, \in1
|
||||
.ifnb \more
|
||||
GADD \pre_op, \suf_op, \more
|
||||
.endif
|
||||
.endm
|
||||
//
|
||||
// GADDI
|
||||
//
|
||||
.macro GADDI pre_op, suf_op:req, out:req, in0:req, in1:req, more:vararg
|
||||
\pre_op\()addi.\suf_op \out, \in0, \in1
|
||||
.ifnb \more
|
||||
GADDI \pre_op, \suf_op, \more
|
||||
.endif
|
||||
.endm
|
||||
//
|
||||
// GSUB
|
||||
//
|
||||
.macro GSUB pre_op, suf_op:req, out:req, in0:req, in1:req, more:vararg
|
||||
\pre_op\()sub.\suf_op \out, \in0, \in1
|
||||
.ifnb \more
|
||||
GSUB \pre_op, \suf_op, \more
|
||||
.endif
|
||||
.endm
|
||||
//
|
||||
// GSLLI
|
||||
//
|
||||
.macro GSLLI pre_op, suf_op:req, out:req, in0:req, in1:req, more:vararg
|
||||
\pre_op\()slli.\suf_op \out, \in0, \in1
|
||||
.ifnb \more
|
||||
GSLLI \pre_op, \suf_op, \more
|
||||
.endif
|
||||
.endm
|
||||
//
|
||||
// GINSVE0
|
||||
//
|
||||
.macro GINSVE0 pre_op:req, suf_op:req, out:req, in0:req, in1:req, more:vararg
|
||||
\pre_op\()insve0.\suf_op \out, \in0, \in1
|
||||
.ifnb \more
|
||||
GINSVE0 \pre_op, \suf_op, \more
|
||||
.endif
|
||||
.endm
|
||||
//
|
||||
// GXOR
|
||||
//
|
||||
.macro GXOR pre_op:req, suf_op:req, out:req, in0:req, in1:req, more:vararg
|
||||
\pre_op\()xor.\suf_op \out, \in0, \in1
|
||||
.ifnb \more
|
||||
GXOR \pre_op, \suf_op, \more
|
||||
.endif
|
||||
.endm
|
||||
//
|
||||
// GPERMI
|
||||
//
|
||||
.macro GPERMI pre_op:req, suf_op:req, out:req, in0:req, in1:req, more:vararg
|
||||
\pre_op\()permi.\suf_op \out, \in0, \in1
|
||||
.ifnb \more
|
||||
GPERMI \pre_op, \suf_op, \more
|
||||
.endif
|
||||
.endm
|
||||
//
|
||||
// GNMSUB
|
||||
//
|
||||
.macro GNMSUB pre_op:req, suf_op:req, out:req, in0:req, in1:req, in2:req, more:vararg
|
||||
\pre_op\()nmsub.\suf_op \out, \in0, \in1, \in2
|
||||
.ifnb \more
|
||||
GNMSUB \pre_op, \suf_op, \more
|
||||
.endif
|
||||
.endm
|
||||
//
|
||||
// GPRELD
|
||||
//
|
||||
.macro GPRELD in0:req, in1:req, in2:req, more:vararg
|
||||
preld \in0, \in1, \in2
|
||||
.ifnb \more
|
||||
GPRELD \more
|
||||
.endif
|
||||
.endm
|
||||
|
||||
//
|
||||
// Compound instructions
|
||||
//
|
||||
// GACC: Horizontally accumulate (sum) the elements of vector register in; the final add writes out, and in is overwritten as scratch
|
||||
//
|
||||
.macro GACC pre_op:req, suf_op:req, out:req, in:req, more:vararg
|
||||
.ifeqs "\pre_op", "xvf"
|
||||
xvpermi.q \out, \in, 0x01
|
||||
\pre_op\()add.\suf_op \in, \out, \in
|
||||
xvpackod.d \out, \in, \in
|
||||
\pre_op\()add.\suf_op \out, \out, \in
|
||||
.ifeqs "\suf_op", "s"
|
||||
xvpackod.w \in, \out, \out
|
||||
\pre_op\()add.\suf_op \out, \out, \in
|
||||
.endif
|
||||
.endif
|
||||
|
||||
.ifeqs "\pre_op", "vf"
|
||||
vpackod.d \out, \in, \in
|
||||
\pre_op\()add.\suf_op \out, \out, \in
|
||||
.ifeqs "\suf_op", "s"
|
||||
vpackod.w \in, \out, \out
|
||||
\pre_op\()add.\suf_op \out, \out, \in
|
||||
.endif
|
||||
.endif
|
||||
|
||||
.ifeqs "\pre_op", "xv"
|
||||
xvpermi.q \out, \in, 0x01
|
||||
\pre_op\()add.\suf_op \in, \out, \in
|
||||
xvpackod.d \out, \in, \in
|
||||
\pre_op\()add.\suf_op \out, \out, \in
|
||||
.ifnc "\suf_op", "d"
|
||||
xvpackod.w \in, \out, \out
|
||||
\pre_op\()add.\suf_op \out, \out, \in
|
||||
.ifnc "\suf_op", "w"
|
||||
xvpackod.h \in, \out, \out
|
||||
\pre_op\()add.\suf_op \out, \out, \in
|
||||
.ifnc "\suf_op", "h"
|
||||
xvpackod.b \in, \out, \out
|
||||
\pre_op\()add.\suf_op \out, \out, \in
|
||||
.endif
|
||||
.endif
|
||||
.endif
|
||||
.endif
|
||||
|
||||
.ifeqs "\pre_op", "v"
|
||||
vpackod.d \out, \in, \in
|
||||
\pre_op\()add.\suf_op \out, \out, \in
|
||||
.ifnc "\suf_op", "d"
|
||||
vpackod.w \in, \out, \out
|
||||
\pre_op\()add.\suf_op \out, \out, \in
|
||||
.ifnc "\suf_op", "w"
|
||||
vpackod.h \in, \out, \out
|
||||
\pre_op\()add.\suf_op \out, \out, \in
|
||||
.ifnc "\suf_op", "h"
|
||||
vpackod.b \in, \out, \out
|
||||
\pre_op\()add.\suf_op \out, \out, \in
|
||||
.endif
|
||||
.endif
|
||||
.endif
|
||||
.endif
|
||||
|
||||
.ifnb \more
|
||||
GACC \pre_op, \suf_op, \more
|
||||
.endif
|
||||
.endm
|
||||
//
|
||||
// GMOV: copy vector register in to out for every (out, in) pair. Pairs are
|
||||
// processed left to right, so a later pair sees the result of an earlier one.
|
||||
.macro GMOV pre_op:req, out:req, in:req, more:vararg
|
||||
\pre_op\()or.v \out, \in, \in // in OR in == in, i.e. a plain register move
|
||||
.ifnb \more // further pairs were passed
|
||||
GMOV \pre_op, \more // recurse on the remaining pairs
|
||||
.endif
|
||||
.endm
|
||||
|
||||
//
|
||||
// Media Related Macros
|
||||
//
|
||||
.macro GSBUTTERFLY pre_op, suf_op, out0, out1, in0, in1 // out0 = ilvl(in0, in1), out1 = ilvh(in0, in1)
|
||||
\pre_op\()ilvl.\suf_op \out0, \in0, \in1 // written first: out0 must not alias in0/in1, which are read again below
|
||||
\pre_op\()ilvh.\suf_op \out1, \in0, \in1
|
||||
.endm
|
||||
.macro GINTERLACE pre_op, suf_op, out0, out1, in0, in1 // out0 = pickev(in0, in1), out1 = pickod(in0, in1)
|
||||
\pre_op\()pickev.\suf_op \out0, \in0, \in1 // written first: out0 must not alias in0/in1, which are read again below
|
||||
\pre_op\()pickod.\suf_op \out1, \in0, \in1
|
||||
.endm
|
||||
|
||||
//
|
||||
// GTRANSPOSE4x4_D: Transpose 4x4 block with double-word elements in vectors,
|
||||
// has no pre_op param (always uses xv). 128-bit vector instructions are not supported.
|
||||
// Arguments: four input rows, four output rows, then two scratch vectors.
|
||||
.macro GTRANSPOSE4x4_D in0, in1, in2, in3, out0, out1, out2, out3, \
|
||||
vt0, vt1 // scratch vector registers, overwritten
|
||||
GSBUTTERFLY xv, d, \vt0, \out1, \in1, \in0
|
||||
GSBUTTERFLY xv, d, \vt1, \out3, \in3, \in2
|
||||
GMOV xv, \out0, \vt0, \out2, \vt1, \vt1, \out3 // vt1 is reused to keep out3, which is overwritten below before out1 is built
|
||||
GPERMI xv, q, \out0, \out2, 0x02, \out2, \vt0, 0x31, \out3, \out1, 0x31, \out1, \vt1, 0x02
|
||||
.endm
|
||||
|
||||
.macro GTRANSPOSE8x8_W out0, out1, out2, out3, out4, out5, out6, out7, \
|
||||
in0, in1, in2, in3, in4, in5, in6, in7, \
|
||||
tmp0, tmp1, tmp2, tmp3 // 8x8 transpose of word elements; NOTE: outputs are listed first, unlike GTRANSPOSE4x4_D
|
||||
GSBUTTERFLY xv, w, \tmp0, \tmp2, \in2, \in0 // stage 1: interleave rows in0..in3 into out0..out3
|
||||
GSBUTTERFLY xv, w, \tmp1, \tmp3, \in3, \in1
|
||||
GSBUTTERFLY xv, w, \out0, \out1, \tmp1, \tmp0
|
||||
GSBUTTERFLY xv, w, \out2, \out3, \tmp3, \tmp2
|
||||
|
||||
GSBUTTERFLY xv, w, \tmp0, \tmp2, \in6, \in4 // stage 2: same for rows in4..in7 into out4..out7 (tmp0..tmp3 reused)
|
||||
GSBUTTERFLY xv, w, \tmp1, \tmp3, \in7, \in5
|
||||
GSBUTTERFLY xv, w, \out4, \out5, \tmp1, \tmp0
|
||||
GSBUTTERFLY xv, w, \out6, \out7, \tmp3, \tmp2
|
||||
|
||||
GMOV xv, \tmp0, \out0, \tmp1, \out1, \tmp2, \out2, \tmp3, \out3 // keep out0..out3: the first four permutes below overwrite them
|
||||
|
||||
GPERMI xv, q, \out0, \out4, 0x02, \out1, \out5, 0x02, \
|
||||
\out2, \out6, 0x02, \out3, \out7, 0x02, \
|
||||
\out4, \tmp0, 0x31, \out5, \tmp1, 0x31, \
|
||||
\out6, \tmp2, 0x31, \out7, \tmp3, 0x31 // xvpermi.q: merge the results of the two stages into the final rows
|
||||
.endm
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,463 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2023, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
#define ASSEMBLER
|
||||
|
||||
#include "common.h"
|
||||
#include "loongarch64_asm.S"
|
||||
|
||||
/*********************************************************************
|
||||
* 2023/08/23 guxiwei
|
||||
* UTEST : OK
|
||||
* CTEST : OK
|
||||
* TEST : OK
|
||||
*********************************************************************/
|
||||
|
||||
/* Function parameters */
|
||||
#define M $r4 // param 1: m
|
||||
#define N $r5 // param 2: n
|
||||
#define SRC $r6 // param 3: src
|
||||
#define LDA $r7 // param 4: lda
|
||||
#define DST $r8 // param 5: dst
|
||||
|
||||
#define I $r9
|
||||
#define J $r10
|
||||
#define S1 $r12
|
||||
#define S2 $r13
|
||||
#define S3 $r14
|
||||
#define S4 $r15
|
||||
#define S5 $r16
|
||||
#define S6 $r17
|
||||
#define S7 $r18
|
||||
#define S8 $r19
|
||||
#define S9 $r20
|
||||
#define S10 $r23
|
||||
#define S11 $r24
|
||||
#define S12 $r25
|
||||
#define S13 $r26
|
||||
#define S14 $r27
|
||||
#define S15 $r28
|
||||
#define S16 $r29
|
||||
#define TD $r30
|
||||
#define TS $r31
|
||||
#define TL $r7
|
||||
#define T0 $r6
|
||||
#undef ZERO
|
||||
#define ZERO $r0
|
||||
|
||||
#define F0 $f0
|
||||
#define F1 $f1
|
||||
#define F2 $f2
|
||||
#define F3 $f3
|
||||
#define F4 $f4
|
||||
#define F5 $f5
|
||||
#define F6 $f6
|
||||
#define F7 $f7
|
||||
/* LASX vectors */
|
||||
#define U0 $xr0
|
||||
#define U1 $xr1
|
||||
#define U2 $xr2
|
||||
#define U3 $xr3
|
||||
#define U4 $xr4
|
||||
#define U5 $xr5
|
||||
#define U6 $xr6
|
||||
#define U7 $xr7
|
||||
#define U8 $xr8
|
||||
#define U9 $xr9
|
||||
#define U10 $xr10
|
||||
#define U11 $xr11
|
||||
#define U12 $xr12
|
||||
#define U13 $xr13
|
||||
#define U14 $xr14
|
||||
#define U15 $xr15
|
||||
#define D0 $xr16
|
||||
#define D1 $xr17
|
||||
#define D2 $xr18
|
||||
#define D3 $xr19
|
||||
#define D4 $xr20
|
||||
#define D5 $xr21
|
||||
#define D6 $xr22
|
||||
#define D7 $xr23
|
||||
#define D8 $xr24
|
||||
#define D9 $xr25
|
||||
#define D10 $xr26
|
||||
#define D11 $xr27
|
||||
#define D12 $xr28
|
||||
#define D13 $xr29
|
||||
#define D14 $xr30
|
||||
#define D15 $xr31
|
||||
|
||||
// Loops outline
|
||||
//.L_N16 <-------------------
|
||||
//| .L_M8: |
|
||||
//| .L_M7: | Main Loop
|
||||
//| .L_M1: |
|
||||
//| .L_M0: ---------------
|
||||
//.L_N15:
|
||||
//.L_N8:
|
||||
//| .L_N8_M8:
|
||||
//| .L_N8_M7:
|
||||
//| .L_N8_M1:
|
||||
//.L_N7:
|
||||
//.L_N4:
|
||||
//| .L_N4_M4:
|
||||
//| .L_N4_M3:
|
||||
//| .L_N4_M1:
|
||||
//.L_N3:
|
||||
//.L_N2:
|
||||
//| .L_N2_M2:
|
||||
//| .L_N2_M1:
|
||||
//.L_N1:
|
||||
//| .L_N1_M1:
|
||||
//.L_N0
|
||||
|
||||
PROLOGUE
|
||||
push_if_used 26, 32
|
||||
|
||||
move TD, DST
|
||||
move TS, SRC
|
||||
PTR_SLLI TL, LDA, 0x02
|
||||
PTR_SLLI T0, TL, 0x01
|
||||
PTR_SRAI J, N, 0x04
|
||||
beq J, ZERO, .L_N15
|
||||
.align 5
|
||||
.L_N16:
|
||||
move S1, TS
|
||||
PTR_ADD S2, TS, TL
|
||||
PTR_SRAI I, M, 0x03
|
||||
PTR_ADD S3, S2, TL
|
||||
PTR_ADDI J, J, -1
|
||||
PTR_ADD S4, S3, TL
|
||||
PTR_ADD S5, S3, T0
|
||||
PTR_ADD S6, S4, T0
|
||||
PTR_ADD S7, S5, T0
|
||||
PTR_ADD S8, S6, T0
|
||||
PTR_ADD S9, S7, T0
|
||||
PTR_ADD S10, S8, T0
|
||||
PTR_ADD S11, S9, T0
|
||||
PTR_ADD S12, S10, T0
|
||||
PTR_ADD S13, S11, T0
|
||||
PTR_ADD S14, S12, T0
|
||||
PTR_ADD S15, S13, T0
|
||||
PTR_ADD S16, S14, T0
|
||||
PTR_ADD TS, S15, T0
|
||||
beq I, ZERO, .L_M7
|
||||
.align 5
|
||||
.L_M8:
|
||||
xvld U0, S1, 0x00
|
||||
xvld U1, S2, 0x00
|
||||
xvld U2, S3, 0x00
|
||||
xvld U3, S4, 0x00
|
||||
xvld U4, S5, 0x00
|
||||
xvld U5, S6, 0x00
|
||||
xvld U6, S7, 0x00
|
||||
xvld U7, S8, 0x00
|
||||
xvld U8, S9, 0x00
|
||||
xvld U9, S10, 0x00
|
||||
xvld U10, S11, 0x00
|
||||
xvld U11, S12, 0x00
|
||||
xvld U12, S13, 0x00
|
||||
xvld U13, S14, 0x00
|
||||
xvld U14, S15, 0x00
|
||||
xvld U15, S16, 0x00
|
||||
|
||||
GTRANSPOSE8x8_W D0, D2, D4, D6, D8, D10, D12, D14, \
|
||||
U0, U1, U2, U3, U4, U5, U6, U7, \
|
||||
D1, D3, D5, D7 // As tmp
|
||||
GTRANSPOSE8x8_W D1, D3, D5, D7, D9, D11, D13, D15, \
|
||||
U8, U9, U10, U11, U12, U13, U14, U15, \
|
||||
U0, U1, U2, U3 // As tmp
|
||||
GST xv, , D0, TD, 0x00, D1, TD, 0x20, D2, TD, 0x40, D3, TD, 0x60, \
|
||||
D4, TD, 0x80, D5, TD, 0xA0, D6, TD, 0xC0, D7, TD, 0xE0
|
||||
PTR_ADDI TD, TD, 0x100
|
||||
GST xv, , D8, TD, 0x00, D9, TD, 0x20, D10, TD, 0x40, D11, TD, 0x60, \
|
||||
D12, TD, 0x80, D13, TD, 0xA0, D14, TD, 0xC0, D15, TD, 0xE0
|
||||
PTR_ADDI TD, TD, 0x100
|
||||
PTR_ADDI S1, S1, 0x20
|
||||
PTR_ADDI S2, S2, 0x20
|
||||
PTR_ADDI S3, S3, 0x20
|
||||
PTR_ADDI S4, S4, 0x20
|
||||
PTR_ADDI S5, S5, 0x20
|
||||
PTR_ADDI S6, S6, 0x20
|
||||
PTR_ADDI S7, S7, 0x20
|
||||
PTR_ADDI S8, S8, 0x20
|
||||
PTR_ADDI S9, S9, 0x20
|
||||
PTR_ADDI S10, S10, 0x20
|
||||
PTR_ADDI S11, S11, 0x20
|
||||
PTR_ADDI S12, S12, 0x20
|
||||
PTR_ADDI S13, S13, 0x20
|
||||
PTR_ADDI S14, S14, 0x20
|
||||
PTR_ADDI S15, S15, 0x20
|
||||
PTR_ADDI S16, S16, 0x20
|
||||
|
||||
PTR_ADDI I, I, -1
|
||||
blt ZERO, I, .L_M8
|
||||
.L_M7:
|
||||
andi I, M, 0x07
|
||||
beq I, ZERO, .L_M0
|
||||
.align 5
|
||||
.L_M1:
|
||||
fld.s F0, S1, 0x00
|
||||
fld.s F1, S2, 0x00
|
||||
fld.s F2, S3, 0x00
|
||||
fld.s F3, S4, 0x00
|
||||
fld.s F4, S5, 0x00
|
||||
fld.s F5, S6, 0x00
|
||||
fld.s F6, S7, 0x00
|
||||
fld.s F7, S8, 0x00
|
||||
|
||||
fst.s F0, TD, 0x00
|
||||
fst.s F1, TD, 0x04
|
||||
fst.s F2, TD, 0x08
|
||||
fst.s F3, TD, 0x0C
|
||||
fst.s F4, TD, 0x10
|
||||
fst.s F5, TD, 0x14
|
||||
fst.s F6, TD, 0x18
|
||||
fst.s F7, TD, 0x1C
|
||||
|
||||
PTR_ADDI S1, S1, 0x04
|
||||
PTR_ADDI S2, S2, 0x04
|
||||
PTR_ADDI S3, S3, 0x04
|
||||
PTR_ADDI S4, S4, 0x04
|
||||
PTR_ADDI S5, S5, 0x04
|
||||
PTR_ADDI S6, S6, 0x04
|
||||
PTR_ADDI S7, S7, 0x04
|
||||
PTR_ADDI S8, S8, 0x04
|
||||
PTR_ADDI TD, TD, 0x20
|
||||
|
||||
fld.s F0, S9, 0x00
|
||||
fld.s F1, S10, 0x00
|
||||
fld.s F2, S11, 0x00
|
||||
fld.s F3, S12, 0x00
|
||||
fld.s F4, S13, 0x00
|
||||
fld.s F5, S14, 0x00
|
||||
fld.s F6, S15, 0x00
|
||||
fld.s F7, S16, 0x00
|
||||
|
||||
fst.s F0, TD, 0x00
|
||||
fst.s F1, TD, 0x04
|
||||
fst.s F2, TD, 0x08
|
||||
fst.s F3, TD, 0x0C
|
||||
fst.s F4, TD, 0x10
|
||||
fst.s F5, TD, 0x14
|
||||
fst.s F6, TD, 0x18
|
||||
fst.s F7, TD, 0x1C
|
||||
|
||||
PTR_ADDI S9, S9, 0x04
|
||||
PTR_ADDI S10, S10, 0x04
|
||||
PTR_ADDI S11, S11, 0x04
|
||||
PTR_ADDI S12, S12, 0x04
|
||||
PTR_ADDI S13, S13, 0x04
|
||||
PTR_ADDI S14, S14, 0x04
|
||||
PTR_ADDI S15, S15, 0x04
|
||||
PTR_ADDI S16, S16, 0x04
|
||||
PTR_ADDI TD, TD, 0x20
|
||||
|
||||
PTR_ADDI I, I, -1
|
||||
blt ZERO, I, .L_M1
|
||||
.L_M0:
|
||||
blt ZERO, J, .L_N16
|
||||
.L_N15:
|
||||
andi J, N, 0x0f
|
||||
beq ZERO, J, .L_N0
|
||||
|
||||
andi J, N, 0x08
|
||||
beq ZERO, J, .L_N7
|
||||
.L_N8:
|
||||
move S1, TS
|
||||
PTR_ADD S2, TS, TL
|
||||
PTR_SRAI I, M, 0x03
|
||||
PTR_ADD S3, S2, TL
|
||||
PTR_ADD S4, S2, T0
|
||||
PTR_ADD S5, S3, T0
|
||||
PTR_ADD S6, S4, T0
|
||||
PTR_ADD S7, S5, T0
|
||||
PTR_ADD S8, S6, T0
|
||||
PTR_ADD TS, S7, T0
|
||||
beq I, ZERO, .L_N8_M7
|
||||
.align 5
|
||||
.L_N8_M8:
|
||||
xvld U0, S1, 0x00
|
||||
xvld U1, S2, 0x00
|
||||
xvld U2, S3, 0x00
|
||||
xvld U3, S4, 0x00
|
||||
xvld U4, S5, 0x00
|
||||
xvld U5, S6, 0x00
|
||||
xvld U6, S7, 0x00
|
||||
xvld U7, S8, 0x00
|
||||
|
||||
GTRANSPOSE8x8_W D0, D2, D4, D6, D8, D10, D12, D14, \
|
||||
U0, U1, U2, U3, U4, U5, U6, U7, \
|
||||
D1, D3, D5, D7 // As tmp
|
||||
GST xv, , D0, TD, 0x00, D2, TD, 0x20, D4, TD, 0x40, D6, TD, 0x60, \
|
||||
D8, TD, 0x80, D10, TD, 0xA0, D12, TD, 0xC0, D14, TD, 0xE0
|
||||
PTR_ADDI TD, TD, 0x100
|
||||
PTR_ADDI S1, S1, 0x20
|
||||
PTR_ADDI S2, S2, 0x20
|
||||
PTR_ADDI S3, S3, 0x20
|
||||
PTR_ADDI S4, S4, 0x20
|
||||
PTR_ADDI S5, S5, 0x20
|
||||
PTR_ADDI S6, S6, 0x20
|
||||
PTR_ADDI S7, S7, 0x20
|
||||
PTR_ADDI S8, S8, 0x20
|
||||
|
||||
PTR_ADDI I, I, -1
|
||||
blt ZERO, I, .L_N8_M8
|
||||
.L_N8_M7:
|
||||
andi I, M, 0x07
|
||||
beq I, ZERO, .L_N7
|
||||
.align 5
|
||||
.L_N8_M1:
|
||||
fld.s F0, S1, 0x00
|
||||
fld.s F1, S2, 0x00
|
||||
fld.s F2, S3, 0x00
|
||||
fld.s F3, S4, 0x00
|
||||
fld.s F4, S5, 0x00
|
||||
fld.s F5, S6, 0x00
|
||||
fld.s F6, S7, 0x00
|
||||
fld.s F7, S8, 0x00
|
||||
|
||||
fst.s F0, TD, 0x00
|
||||
PTR_ADDI S1, S1, 0x04
|
||||
fst.s F1, TD, 0x04
|
||||
PTR_ADDI S2, S2, 0x04
|
||||
fst.s F2, TD, 0x08
|
||||
PTR_ADDI S3, S3, 0x04
|
||||
fst.s F3, TD, 0x0C
|
||||
PTR_ADDI S4, S4, 0x04
|
||||
fst.s F4, TD, 0x10
|
||||
PTR_ADDI S5, S5, 0x04
|
||||
fst.s F5, TD, 0x14
|
||||
PTR_ADDI S6, S6, 0x04
|
||||
fst.s F6, TD, 0x18
|
||||
PTR_ADDI S7, S7, 0x04
|
||||
fst.s F7, TD, 0x1C
|
||||
PTR_ADDI S8, S8, 0x04
|
||||
|
||||
PTR_ADDI TD, TD, 0x20
|
||||
PTR_ADDI I, I, -1
|
||||
blt ZERO, I, .L_N8_M1
|
||||
.L_N7:
|
||||
andi J, N, 0x07
|
||||
beq ZERO, J, .L_N0
|
||||
|
||||
andi J, N, 0x04
|
||||
beq ZERO, J, .L_N3
|
||||
.L_N4:
|
||||
move S1, TS
|
||||
PTR_ADD S2, TS, TL
|
||||
PTR_SRAI I, M, 0x02
|
||||
PTR_ADD S3, S2, TL
|
||||
PTR_ADD S4, S2, T0
|
||||
PTR_ADD TS, S3, T0
|
||||
beq I, ZERO, .L_N4_M3
|
||||
.align 5
|
||||
.L_N4_M4:
|
||||
GLD v, , $vr0, S1, 0, $vr1, S2, 0, $vr2, S3, 0, $vr3, S4, 0
|
||||
GSBUTTERFLY v, w, $vr4, $vr5, $vr2, $vr0
|
||||
GSBUTTERFLY v, w, $vr6, $vr7, $vr3, $vr1
|
||||
GSBUTTERFLY v, w, $vr0, $vr1, $vr6, $vr4
|
||||
GSBUTTERFLY v, w, $vr2, $vr3, $vr7, $vr5
|
||||
GST v, , $vr0, TD, 0x00, $vr1, TD, 0x10, $vr2, TD, 0x20, $vr3, TD, 0x30
|
||||
PTR_ADDI S1, S1, 0x10
|
||||
PTR_ADDI S2, S2, 0x10
|
||||
PTR_ADDI S3, S3, 0x10
|
||||
PTR_ADDI S4, S4, 0x10
|
||||
PTR_ADDI TD, TD, 0x40
|
||||
PTR_ADDI I, I, -1
|
||||
blt ZERO, I, .L_N4_M4
|
||||
.L_N4_M3:
|
||||
andi I, M, 0x03
|
||||
beq I, ZERO, .L_N3
|
||||
.align 5
|
||||
.L_N4_M1:
|
||||
fld.s F0, S1, 0x00
|
||||
fld.s F1, S2, 0x00
|
||||
fld.s F2, S3, 0x00
|
||||
fld.s F3, S4, 0x00
|
||||
|
||||
fst.s F0, TD, 0x00
|
||||
PTR_ADDI S1, S1, 0x04
|
||||
fst.s F1, TD, 0x04
|
||||
PTR_ADDI S2, S2, 0x04
|
||||
fst.s F2, TD, 0x08
|
||||
PTR_ADDI S3, S3, 0x04
|
||||
fst.s F3, TD, 0x0C
|
||||
PTR_ADDI S4, S4, 0x04
|
||||
|
||||
PTR_ADDI TD, TD, 0x10
|
||||
PTR_ADDI I, I, -1
|
||||
blt ZERO, I, .L_N4_M1
|
||||
.L_N3:
|
||||
andi J, N, 0x03
|
||||
beq ZERO, J, .L_N0
|
||||
|
||||
andi J, N, 0x02
|
||||
beq ZERO, J, .L_N1
|
||||
.L_N2:
|
||||
move S1, TS
|
||||
PTR_ADD S2, TS, TL
|
||||
PTR_SRAI I, M, 0x01
|
||||
PTR_ADD TS, S2, TL
|
||||
beq I, ZERO, .L_N2_M1
|
||||
.align 5
|
||||
.L_N2_M2:
|
||||
GLD f, d, F0, S1, 0x00, F1, S2, 0x00
|
||||
vilvl.w $vr0, $vr1, $vr0
|
||||
GST v, , $vr0, TD, 0x00
|
||||
PTR_ADDI S1, S1, 0x08
|
||||
PTR_ADDI S2, S2, 0x08
|
||||
PTR_ADDI TD, TD, 0x10
|
||||
|
||||
PTR_ADDI I, I, -1
|
||||
blt ZERO, I, .L_N2_M2
|
||||
.L_N2_M1:
|
||||
andi I, M, 0x01
|
||||
beq I, ZERO, .L_N1
|
||||
|
||||
fld.s F0, S1, 0x00
|
||||
fld.s F1, S2, 0x00
|
||||
|
||||
fst.s F0, TD, 0x00
|
||||
PTR_ADDI S1, S1, 0x04
|
||||
fst.s F1, TD, 0x04
|
||||
PTR_ADDI S2, S2, 0x04
|
||||
PTR_ADDI TD, TD, 0x08
|
||||
.align 5
|
||||
.L_N1:
|
||||
// Last single column: it exists only when N is odd. The two-column tail above
|
||||
// (.L_N2 / .L_N2_M1) also ends up at this label, so the low bit of N is tested
|
||||
// here; otherwise M elements of a column past the input would be copied.
|
||||
andi J, N, 0x01
|
||||
beq ZERO, J, .L_N0
|
||||
move S1, TS
|
||||
beq ZERO, M, .L_N0
|
||||
.L_N1_M1: // copy the column, one single-precision element per iteration
|
||||
fld.s F0, S1, 0x00
|
||||
PTR_ADDI S1, S1, 0x04
|
||||
fst.s F0, TD, 0x00
|
||||
PTR_ADDI TD, TD, 0x04
|
||||
PTR_ADDI M, M, -1 // M is consumed as the loop counter; it is not read again afterwards
|
||||
blt ZERO, M, .L_N1_M1
|
||||
.L_N0:
|
||||
pop_if_used 26, 32
|
||||
jirl $r0, $r1, 0x0
|
||||
EPILOGUE
|
|
@ -0,0 +1,298 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2023, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
#define ASSEMBLER
|
||||
|
||||
#include "common.h"
|
||||
#include "loongarch64_asm.S"
|
||||
|
||||
/*********************************************************************
|
||||
* 2023/08/23 guxiwei
|
||||
* UTEST : OK
|
||||
* CTEST : OK
|
||||
* TEST : OK
|
||||
*********************************************************************/
|
||||
|
||||
/* Function parameters */
|
||||
#define M $r4 // param 1: m
|
||||
#define N $r5 // param 2: n
|
||||
#define SRC $r6 // param 3: src
|
||||
#define LDA $r7 // param 4: lda
|
||||
#define DST $r8 // param 5: dst
|
||||
|
||||
#define I $r9
|
||||
#define J $r10
|
||||
#define S1 $r12
|
||||
#define S2 $r13
|
||||
#define S3 $r14
|
||||
#define S4 $r15
|
||||
#define S5 $r16
|
||||
#define S6 $r17
|
||||
#define S7 $r18
|
||||
#define S8 $r19
|
||||
#define TD $r20
|
||||
#define TS $r11
|
||||
#define TL $r7
|
||||
#define T0 $r6
|
||||
#undef ZERO
|
||||
#define ZERO $r0
|
||||
|
||||
#define F0 $f0
|
||||
#define F1 $f1
|
||||
#define F2 $f2
|
||||
#define F3 $f3
|
||||
#define F4 $f4
|
||||
#define F5 $f5
|
||||
#define F6 $f6
|
||||
#define F7 $f7
|
||||
/* LASX vectors */
|
||||
#define U0 $xr0
|
||||
#define U1 $xr1
|
||||
#define U2 $xr2
|
||||
#define U3 $xr3
|
||||
#define U4 $xr4
|
||||
#define U5 $xr5
|
||||
#define U6 $xr6
|
||||
#define U7 $xr7
|
||||
#define D0 $xr8
|
||||
#define D1 $xr9
|
||||
#define D2 $xr10
|
||||
#define D3 $xr11
|
||||
#define D4 $xr12
|
||||
#define D5 $xr13
|
||||
#define D6 $xr14
|
||||
#define D7 $xr15
|
||||
#define D8 $xr16
|
||||
#define D10 $xr17
|
||||
#define D12 $xr18
|
||||
#define D14 $xr19
|
||||
|
||||
// Loops outline
|
||||
//.L_N8: <----------------
|
||||
//| .L_M8: |
|
||||
//| .L_M7: | Main Loop
|
||||
//| .L_M1: |
|
||||
//| .L_M0:--------------
|
||||
//.L_N7:
|
||||
//.L_N4:
|
||||
//| .L_N4_M4:
|
||||
//| .L_N4_M3:
|
||||
//| .L_N4_M1:
|
||||
//.L_N3:
|
||||
//.L_N2:
|
||||
//| .L_N2_M2:
|
||||
//| .L_N2_M1:
|
||||
//.L_N1:
|
||||
//| .L_N1_M1:
|
||||
//.L_N0
|
||||
|
||||
PROLOGUE
|
||||
push_if_used 17, 20
|
||||
|
||||
move TD, DST
|
||||
move TS, SRC
|
||||
PTR_SLLI TL, LDA, 0x02
|
||||
PTR_SLLI T0, TL, 0x01
|
||||
PTR_SRAI J, N, 0x03
|
||||
beq J, ZERO, .L_N7
|
||||
.align 5
|
||||
.L_N8:
|
||||
move S1, TS
|
||||
PTR_ADD S2, TS, TL
|
||||
PTR_SRAI I, M, 0x03
|
||||
PTR_ADD S3, S2, TL
|
||||
PTR_ADDI J, J, -1
|
||||
PTR_ADD S4, S2, T0
|
||||
PTR_ADD S5, S3, T0
|
||||
PTR_ADD S6, S4, T0
|
||||
PTR_ADD S7, S5, T0
|
||||
PTR_ADD S8, S6, T0
|
||||
PTR_ADD TS, S7, T0
|
||||
beq I, ZERO, .L_M7
|
||||
.align 5
|
||||
.L_M8:
|
||||
xvld U0, S1, 0x00
|
||||
xvld U1, S2, 0x00
|
||||
xvld U2, S3, 0x00
|
||||
xvld U3, S4, 0x00
|
||||
xvld U4, S5, 0x00
|
||||
xvld U5, S6, 0x00
|
||||
xvld U6, S7, 0x00
|
||||
xvld U7, S8, 0x00
|
||||
|
||||
GTRANSPOSE8x8_W D0, D2, D4, D6, D8, D10, D12, D14, \
|
||||
U0, U1, U2, U3, U4, U5, U6, U7, \
|
||||
D1, D3, D5, D7 // As tmp
|
||||
GST xv, , D0, TD, 0x00, D2, TD, 0x20, D4, TD, 0x40, D6, TD, 0x60, \
|
||||
D8, TD, 0x80, D10, TD, 0xA0, D12, TD, 0xC0, D14, TD, 0xE0
|
||||
PTR_ADDI TD, TD, 0x100
|
||||
PTR_ADDI S1, S1, 0x20
|
||||
PTR_ADDI S2, S2, 0x20
|
||||
PTR_ADDI S3, S3, 0x20
|
||||
PTR_ADDI S4, S4, 0x20
|
||||
PTR_ADDI S5, S5, 0x20
|
||||
PTR_ADDI S6, S6, 0x20
|
||||
PTR_ADDI S7, S7, 0x20
|
||||
PTR_ADDI S8, S8, 0x20
|
||||
PTR_ADDI I, I, -1
|
||||
blt ZERO, I, .L_M8
|
||||
.L_M7:
|
||||
andi I, M, 0x07
|
||||
beq I, ZERO, .L_M0
|
||||
.align 5
|
||||
.L_M1:
|
||||
fld.s F0, S1, 0x00
|
||||
fld.s F1, S2, 0x00
|
||||
fld.s F2, S3, 0x00
|
||||
fld.s F3, S4, 0x00
|
||||
fld.s F4, S5, 0x00
|
||||
fld.s F5, S6, 0x00
|
||||
fld.s F6, S7, 0x00
|
||||
fld.s F7, S8, 0x00
|
||||
|
||||
fst.s F0, TD, 0x00
|
||||
PTR_ADDI S1, S1, 0x04
|
||||
fst.s F1, TD, 0x04
|
||||
PTR_ADDI S2, S2, 0x04
|
||||
fst.s F2, TD, 0x08
|
||||
PTR_ADDI S3, S3, 0x04
|
||||
fst.s F3, TD, 0x0C
|
||||
PTR_ADDI S4, S4, 0x04
|
||||
fst.s F4, TD, 0x10
|
||||
PTR_ADDI S5, S5, 0x04
|
||||
fst.s F5, TD, 0x14
|
||||
PTR_ADDI S6, S6, 0x04
|
||||
fst.s F6, TD, 0x18
|
||||
PTR_ADDI S7, S7, 0x04
|
||||
fst.s F7, TD, 0x1C
|
||||
PTR_ADDI S8, S8, 0x04
|
||||
|
||||
PTR_ADDI TD, TD, 0x20
|
||||
PTR_ADDI I, I, -1
|
||||
blt ZERO, I, .L_M1
|
||||
.L_M0:
|
||||
blt ZERO, J, .L_N8
|
||||
.L_N7:
|
||||
andi J, N, 0x07
|
||||
beq ZERO, J, .L_N0
|
||||
|
||||
andi J, N, 0x04
|
||||
beq ZERO, J, .L_N3
|
||||
.L_N4:
|
||||
move S1, TS
|
||||
PTR_ADD S2, TS, TL
|
||||
PTR_SRAI I, M, 0x02
|
||||
PTR_ADD S3, S2, TL
|
||||
PTR_ADD S4, S2, T0
|
||||
PTR_ADD TS, S3, T0
|
||||
beq I, ZERO, .L_N4_M3
|
||||
.align 5
|
||||
.L_N4_M4:
|
||||
GLD v, , $vr0, S1, 0, $vr1, S2, 0, $vr2, S3, 0, $vr3, S4, 0
|
||||
GSBUTTERFLY v, w, $vr4, $vr5, $vr2, $vr0
|
||||
GSBUTTERFLY v, w, $vr6, $vr7, $vr3, $vr1
|
||||
GSBUTTERFLY v, w, $vr0, $vr1, $vr6, $vr4
|
||||
GSBUTTERFLY v, w, $vr2, $vr3, $vr7, $vr5
|
||||
GST v, , $vr0, TD, 0x00, $vr1, TD, 0x10, $vr2, TD, 0x20, $vr3, TD, 0x30
|
||||
PTR_ADDI S1, S1, 0x10
|
||||
PTR_ADDI S2, S2, 0x10
|
||||
PTR_ADDI S3, S3, 0x10
|
||||
PTR_ADDI S4, S4, 0x10
|
||||
PTR_ADDI TD, TD, 0x40
|
||||
PTR_ADDI I, I, -1
|
||||
blt ZERO, I, .L_N4_M4
|
||||
.L_N4_M3:
|
||||
andi I, M, 0x03
|
||||
beq I, ZERO, .L_N3
|
||||
.align 5
|
||||
.L_N4_M1:
|
||||
fld.s F0, S1, 0x00
|
||||
fld.s F1, S2, 0x00
|
||||
fld.s F2, S3, 0x00
|
||||
fld.s F3, S4, 0x00
|
||||
|
||||
fst.s F0, TD, 0x00
|
||||
PTR_ADDI S1, S1, 0x04
|
||||
fst.s F1, TD, 0x04
|
||||
PTR_ADDI S2, S2, 0x04
|
||||
fst.s F2, TD, 0x08
|
||||
PTR_ADDI S3, S3, 0x04
|
||||
fst.s F3, TD, 0x0C
|
||||
PTR_ADDI S4, S4, 0x04
|
||||
|
||||
PTR_ADDI TD, TD, 0x10
|
||||
PTR_ADDI I, I, -1
|
||||
blt ZERO, I, .L_N4_M1
|
||||
.L_N3:
|
||||
andi J, N, 0x03
|
||||
beq ZERO, J, .L_N0
|
||||
|
||||
andi J, N, 0x02
|
||||
beq ZERO, J, .L_N1
|
||||
.L_N2:
|
||||
move S1, TS
|
||||
PTR_ADD S2, TS, TL
|
||||
PTR_SRAI I, M, 0x01
|
||||
PTR_ADD TS, S2, TL
|
||||
beq I, ZERO, .L_N2_M1
|
||||
.align 5
|
||||
.L_N2_M2:
|
||||
GLD f, d, F0, S1, 0x00, F1, S2, 0x00
|
||||
vilvl.w $vr0, $vr1, $vr0
|
||||
GST v, , $vr0, TD, 0x00
|
||||
PTR_ADDI S1, S1, 0x08
|
||||
PTR_ADDI S2, S2, 0x08
|
||||
PTR_ADDI TD, TD, 0x10
|
||||
|
||||
PTR_ADDI I, I, -1
|
||||
blt ZERO, I, .L_N2_M2
|
||||
.L_N2_M1:
|
||||
andi I, M, 0x01
|
||||
beq I, ZERO, .L_N1
|
||||
|
||||
fld.s F0, S1, 0x00
|
||||
fld.s F1, S2, 0x00
|
||||
|
||||
fst.s F0, TD, 0x00
|
||||
PTR_ADDI S1, S1, 0x04
|
||||
fst.s F1, TD, 0x04
|
||||
PTR_ADDI S2, S2, 0x04
|
||||
PTR_ADDI TD, TD, 0x08
|
||||
.align 5
|
||||
.L_N1:
|
||||
// Last single column: it exists only when N is odd. The two-column tail above
|
||||
// (.L_N2 / .L_N2_M1) also ends up at this label, so the low bit of N is tested
|
||||
// here; otherwise M elements of a column past the input would be copied.
|
||||
andi J, N, 0x01
|
||||
beq ZERO, J, .L_N0
|
||||
move S1, TS
|
||||
beq ZERO, M, .L_N0
|
||||
.L_N1_M1: // copy the column, one single-precision element per iteration
|
||||
fld.s F0, S1, 0x00
|
||||
PTR_ADDI S1, S1, 0x04
|
||||
fst.s F0, TD, 0x00
|
||||
PTR_ADDI TD, TD, 0x04
|
||||
PTR_ADDI M, M, -1 // M is consumed as the loop counter; it is not read again afterwards
|
||||
blt ZERO, M, .L_N1_M1
|
||||
.L_N0:
|
||||
pop_if_used 17, 20
|
||||
jirl $r0, $r1, 0x0
|
||||
EPILOGUE
|
|
@ -0,0 +1,526 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2023, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
#define ASSEMBLER
|
||||
|
||||
#include "common.h"
|
||||
#include "loongarch64_asm.S"
|
||||
|
||||
/*********************************************************************
|
||||
* 2023/08/23 guxiwei
|
||||
* UTEST : OK
|
||||
* CTEST : OK
|
||||
* TEST : OK
|
||||
*********************************************************************/
|
||||
|
||||
/* Function parameters */
|
||||
// Register aliases used by the copy routine below.
// The routine walks M source lines that are LDA elements apart and copies
// N consecutive 4-byte elements from each line into DST.
#define M $r4 // param 1: m
|
||||
#define N $r5 // param 2: n
|
||||
#define SRC $r6 // param 3: src
|
||||
#define LDA $r7 // param 4: lda
|
||||
#define DST $r8 // param 5: dst
|
||||
|
||||
// I: inner trip counter and scratch for the N bit tests.
// J: outer counter over groups of source lines, and scratch for the M bit tests.
#define I $r9
|
||||
#define J $r10
|
||||
// S0: start of the next unprocessed group of source lines.
// S1..S8: read cursors, one per source line of the current group.
#define S0 $r11
|
||||
#define S1 $r12
|
||||
#define S2 $r13
|
||||
#define S3 $r14
|
||||
#define S4 $r15
|
||||
#define S5 $r16
|
||||
#define S6 $r17
|
||||
#define S7 $r18
|
||||
#define S8 $r19
|
||||
// P0: destination slot of the next line group inside the 16-element panels.
// P1: write cursor used by the 16-element loops.
// P2..P5: write cursors for the 8-, 4-, 2- and 1-element remainder regions.
#define P0 $r20
|
||||
#define P1 $r23
|
||||
#define P2 $r24
|
||||
#define P3 $r25
|
||||
#define P4 $r26
|
||||
#define P5 $r27
|
||||
// T0/T1: scratch during setup; afterwards T0 = 2*TL and T1 = M*64 bytes.
#define T0 $r28
|
||||
#define T1 $r29
|
||||
// TL shares $r7 with LDA: the setup code rewrites LDA in place as a byte stride.
#define TL $r7
|
||||
// $r0 is used as the constant-zero operand of the compare-and-branch instructions.
#define ZERO $r0
|
||||
|
||||
/* LASX vectors */
|
||||
// U0..U7 name the eight 256-bit registers used as copy buffers.
#define U0 $xr0
|
||||
#define U1 $xr1
|
||||
#define U2 $xr2
|
||||
#define U3 $xr3
|
||||
#define U4 $xr4
|
||||
#define U5 $xr5
|
||||
#define U6 $xr6
|
||||
#define U7 $xr7
|
||||
|
||||
// Loops outline
|
||||
//.L_M8 <-------------------
|
||||
//| .L_N16: |
|
||||
//| .L_N15: |
|
||||
//| .L_N8: |
|
||||
//| .L_N7: | Main Loop
|
||||
//| .L_N4: |
|
||||
//| .L_N3: |
|
||||
//| .L_N2: |
|
||||
//| .L_N1: |
|
||||
//| .L_N0: ---------------
|
||||
//.L_M7
|
||||
//.L_M4
|
||||
//| .L_M4_N16:
|
||||
//| .L_M4_N15:
|
||||
//| .L_M4_N8:
|
||||
//| .L_M4_N7:
|
||||
//| .L_M4_N4:
|
||||
//| .L_M4_N3:
|
||||
//| .L_M4_N2:
|
||||
//| .L_M4_N1:
|
||||
//.L_M3
|
||||
//.L_M2
|
||||
//| .L_M2_N16:
|
||||
//| .L_M2_N15:
|
||||
//| .L_M2_N8:
|
||||
//| .L_M2_N7:
|
||||
//| .L_M2_N4:
|
||||
//| .L_M2_N3:
|
||||
//| .L_M2_N2:
|
||||
//| .L_M2_N1:
|
||||
//.L_M1
|
||||
//| .L_M1_N16:
|
||||
//| .L_M1_N15:
|
||||
//| .L_M1_N8:
|
||||
//| .L_M1_N7:
|
||||
//| .L_M1_N4:
|
||||
//| .L_M1_N3:
|
||||
//| .L_M1_N2:
|
||||
//| .L_M1_N1:
|
||||
//.L_M0
|
||||
|
||||
// Entry and pointer setup for the 16-wide copy routine.
// PROLOGUE and push_if_used are macros from the included headers; presumably
// they emit the function symbol and save callee-saved registers (TODO confirm).
// The matching pop_if_used 24, 8 is at .L_M0.
PROLOGUE
|
||||
push_if_used 24, 8
|
||||
|
||||
// S0 walks the source, P0 walks the 16-element panel area of the destination.
move S0, SRC
|
||||
move P0, DST
|
||||
|
||||
// T0 = N rounded down to a multiple of 16, T1 = N rounded down to a multiple of 8.
PTR_SRAI T0, N, 0x04
|
||||
PTR_SRAI T1, N, 0x03
|
||||
PTR_SLLI T0, T0, 0x04
|
||||
PTR_SLLI T1, T1, 0x03
|
||||
|
||||
// P2 = DST + 4 * M * (N rounded down to 16): start of the 8-element remainder region.
// P3 = DST + 4 * M * (N rounded down to 8):  start of the 4-element remainder region.
PTR_MUL P2, M, T0
|
||||
PTR_MUL P3, M, T1
|
||||
PTR_SLLI P2, P2, 0x02
|
||||
PTR_SLLI P3, P3, 0x02
|
||||
PTR_ADD P2, DST, P2
|
||||
PTR_ADD P3, DST, P3
|
||||
|
||||
// T0 = N rounded down to a multiple of 4, T1 = N rounded down to a multiple of 2.
PTR_SRAI T0, N, 0x02
|
||||
PTR_SRAI T1, N, 0x01
|
||||
PTR_SLLI T0, T0, 0x02
|
||||
PTR_SLLI T1, T1, 0x01
|
||||
// P4 = DST + 4 * M * (N rounded down to 4): start of the 2-element remainder region.
// P5 = DST + 4 * M * (N rounded down to 2): start of the 1-element remainder region.
PTR_MUL P4, M, T0
|
||||
PTR_MUL P5, M, T1
|
||||
PTR_SLLI P4, P4, 0x02
|
||||
PTR_SLLI P5, P5, 0x02
|
||||
PTR_ADD P4, DST, P4
|
||||
PTR_ADD P5, DST, P5
|
||||
|
||||
// TL = LDA * 4: distance in bytes between consecutive source lines (overwrites LDA).
PTR_SLLI TL, LDA, 0x02
|
||||
// J = M / 8: number of full 8-line groups.
PTR_SRAI J, M, 0x03
|
||||
// T0 = 2 * TL, used to step two source lines at a time.
PTR_SLLI T0, TL, 0x01
|
||||
// T1 = M * 64 bytes: distance between consecutive 16-element panels in DST
// (M lines * 16 elements * 4 bytes).
PTR_SLLI T1, M, 0x06
|
||||
// No full 8-line group: go straight to the 4/2/1-line tails.
beq ZERO, J, .L_M7
|
||||
// Main loop of the 16-wide routine: each pass copies one group of 8 source lines.
.align 5
|
||||
.L_M8:
|
||||
// S1..S8 = the 8 lines of this group (S0 + k*TL for k = 0..7);
// S0 then moves on to the next group (S0 + 8*TL).
move S1, S0
|
||||
PTR_ADD S2, S0, TL
|
||||
PTR_ADD S3, S1, T0
|
||||
PTR_ADD S4, S2, T0
|
||||
PTR_ADD S5, S3, T0
|
||||
PTR_ADD S6, S4, T0
|
||||
PTR_ADD S7, S5, T0
|
||||
PTR_ADD S8, S6, T0
|
||||
PTR_ADD S0, S7, T0
|
||||
|
||||
// P1 = write cursor for this group; P0 skips the 0x200 bytes this group
// occupies in one panel (8 lines * 16 elements * 4 bytes).
move P1, P0
|
||||
PTR_ADDI P0, P0, 0x200
|
||||
|
||||
// I = number of full 16-element chunks per line; J counts the remaining groups.
PTR_SRAI I, N, 0x04
|
||||
PTR_ADDI J, J, -1
|
||||
beq ZERO, I, .L_N15
|
||||
.L_N16:
|
||||
// Copy 16 elements (two 32-byte vectors) from each of the 8 lines into
// 0x200 contiguous bytes at P1, line after line.
xvld U0, S1, 0x00
|
||||
xvld U1, S1, 0x20
|
||||
xvld U2, S2, 0x00
|
||||
xvld U3, S2, 0x20
|
||||
|
||||
xvst U0, P1, 0x00
|
||||
xvst U1, P1, 0x20
|
||||
xvst U2, P1, 0x40
|
||||
xvst U3, P1, 0x60
|
||||
|
||||
xvld U4, S3, 0x00
|
||||
xvld U5, S3, 0x20
|
||||
xvld U6, S4, 0x00
|
||||
xvld U7, S4, 0x20
|
||||
|
||||
xvst U4, P1, 0x80
|
||||
xvst U5, P1, 0xA0
|
||||
xvst U6, P1, 0xC0
|
||||
xvst U7, P1, 0xE0
|
||||
|
||||
xvld U0, S5, 0x00
|
||||
xvld U1, S5, 0x20
|
||||
xvld U2, S6, 0x00
|
||||
xvld U3, S6, 0x20
|
||||
|
||||
xvst U0, P1, 0x100
|
||||
xvst U1, P1, 0x120
|
||||
xvst U2, P1, 0x140
|
||||
xvst U3, P1, 0x160
|
||||
|
||||
xvld U4, S7, 0x00
|
||||
xvld U5, S7, 0x20
|
||||
xvld U6, S8, 0x00
|
||||
xvld U7, S8, 0x20
|
||||
|
||||
xvst U4, P1, 0x180
|
||||
xvst U5, P1, 0x1A0
|
||||
xvst U6, P1, 0x1C0
|
||||
xvst U7, P1, 0x1E0
|
||||
|
||||
// Advance every line cursor by 16 elements (0x40 bytes).
PTR_ADDI S1, S1, 0x40
|
||||
PTR_ADDI S2, S2, 0x40
|
||||
PTR_ADDI S3, S3, 0x40
|
||||
PTR_ADDI S4, S4, 0x40
|
||||
PTR_ADDI S5, S5, 0x40
|
||||
PTR_ADDI S6, S6, 0x40
|
||||
PTR_ADDI S7, S7, 0x40
|
||||
PTR_ADDI S8, S8, 0x40
|
||||
|
||||
// Next chunk goes to the same slot of the next panel (P1 += M * 64 bytes).
PTR_ADDI I, I, -1
|
||||
PTR_ADD P1, P1, T1
|
||||
blt ZERO, I, .L_N16
|
||||
.L_N15:
|
||||
// Remainder: bit 3 of N set means one more 8-element chunk per line.
andi I, N, 0x08
|
||||
beq ZERO, I, .L_N7
|
||||
.L_N8:
|
||||
xvld U0, S1, 0x00
|
||||
xvld U1, S2, 0x00
|
||||
xvld U2, S3, 0x00
|
||||
xvld U3, S4, 0x00
|
||||
xvld U4, S5, 0x00
|
||||
xvld U5, S6, 0x00
|
||||
xvld U6, S7, 0x00
|
||||
xvld U7, S8, 0x00
|
||||
|
||||
// GST is a macro from loongarch64_asm.S; presumably it expands to one store
// per (register, base, offset) triple (TODO confirm). 8 x 32 bytes go to P2.
GST xv, , U0, P2, 0x00, U1, P2, 0x20, U2, P2, 0x40, U3, P2, 0x60, \
|
||||
U4, P2, 0x80, U5, P2, 0xA0, U6, P2, 0xC0, U7, P2, 0xE0
|
||||
|
||||
PTR_ADDI S1, S1, 0x20
|
||||
PTR_ADDI S2, S2, 0x20
|
||||
PTR_ADDI S3, S3, 0x20
|
||||
PTR_ADDI S4, S4, 0x20
|
||||
PTR_ADDI S5, S5, 0x20
|
||||
PTR_ADDI S6, S6, 0x20
|
||||
PTR_ADDI S7, S7, 0x20
|
||||
PTR_ADDI S8, S8, 0x20
|
||||
PTR_ADDI P2, P2, 0x100
|
||||
.L_N7:
|
||||
// Bit 2 of N set: one 4-element (16-byte) chunk per line, written at P3.
andi I, N, 0x04
|
||||
beq ZERO, I, .L_N3
|
||||
.L_N4:
|
||||
// GLD is the load counterpart of GST (same macro file, same hedge).
GLD v, , $vr0, S1, 0x00, $vr1, S2, 0x00, $vr2, S3, 0x00, $vr3, S4, 0x00, \
|
||||
$vr4, S5, 0x00, $vr5, S6, 0x00, $vr6, S7, 0x00, $vr7, S8, 0x00
|
||||
GST v, , $vr0, P3, 0x00, $vr1, P3, 0x10, $vr2, P3, 0x20, $vr3, P3, 0x30, \
|
||||
$vr4, P3, 0x40, $vr5, P3, 0x50, $vr6, P3, 0x60, $vr7, P3, 0x70
|
||||
PTR_ADDI S1, S1, 0x10
|
||||
PTR_ADDI S2, S2, 0x10
|
||||
PTR_ADDI S3, S3, 0x10
|
||||
PTR_ADDI S4, S4, 0x10
|
||||
PTR_ADDI S5, S5, 0x10
|
||||
PTR_ADDI S6, S6, 0x10
|
||||
PTR_ADDI S7, S7, 0x10
|
||||
PTR_ADDI S8, S8, 0x10
|
||||
PTR_ADDI P3, P3, 0x80
|
||||
.L_N3:
|
||||
// Bit 1 of N set: one 2-element (8-byte) chunk per line, written at P4.
andi I, N, 0x02
|
||||
beq ZERO, I, .L_N1
|
||||
.L_N2:
|
||||
GLD f, d, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00, \
|
||||
$f4, S5, 0x00, $f5, S6, 0x00, $f6, S7, 0x00, $f7, S8, 0x00
|
||||
GST f, d, $f0, P4, 0x00, $f1, P4, 0x08, $f2, P4, 0x10, $f3, P4, 0x18, \
|
||||
$f4, P4, 0x20, $f5, P4, 0x28, $f6, P4, 0x30, $f7, P4, 0x38
|
||||
PTR_ADDI S1, S1, 0x08
|
||||
PTR_ADDI S2, S2, 0x08
|
||||
PTR_ADDI S3, S3, 0x08
|
||||
PTR_ADDI S4, S4, 0x08
|
||||
PTR_ADDI S5, S5, 0x08
|
||||
PTR_ADDI S6, S6, 0x08
|
||||
PTR_ADDI S7, S7, 0x08
|
||||
PTR_ADDI S8, S8, 0x08
|
||||
PTR_ADDI P4, P4, 0x40
|
||||
.L_N1:
|
||||
// Bit 0 of N set: the last single element of each line, written at P5.
andi I, N, 0x01
|
||||
beq ZERO, I, .L_N0
|
||||
|
||||
GLD f, s, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00, \
|
||||
$f4, S5, 0x00, $f5, S6, 0x00, $f6, S7, 0x00, $f7, S8, 0x00
|
||||
GST f, s, $f0, P5, 0x00, $f1, P5, 0x04, $f2, P5, 0x08, $f3, P5, 0x0C, \
|
||||
$f4, P5, 0x10, $f5, P5, 0x14, $f6, P5, 0x18, $f7, P5, 0x1C
|
||||
PTR_ADDI S1, S1, 0x04
|
||||
PTR_ADDI S2, S2, 0x04
|
||||
PTR_ADDI S3, S3, 0x04
|
||||
PTR_ADDI S4, S4, 0x04
|
||||
PTR_ADDI S5, S5, 0x04
|
||||
PTR_ADDI S6, S6, 0x04
|
||||
PTR_ADDI S7, S7, 0x04
|
||||
PTR_ADDI S8, S8, 0x04
|
||||
PTR_ADDI P5, P5, 0x20
|
||||
.L_N0:
|
||||
// Repeat while 8-line groups remain (J was decremented at the top of the pass).
blt ZERO, J, .L_M8
|
||||
// 4-line tail of the 16-wide routine: runs once when bit 2 of M is set.
.L_M7:
|
||||
andi J, M, 0x04
|
||||
beq ZERO, J, .L_M3
|
||||
.L_M4:
|
||||
// S1..S4 = the 4 lines of this group; S0 moves on by 4 lines.
move S1, S0
|
||||
PTR_ADD S2, S0, TL
|
||||
PTR_ADD S3, S1, T0
|
||||
PTR_ADD S4, S2, T0
|
||||
PTR_ADD S0, S3, T0
|
||||
|
||||
// This group occupies 0x100 bytes per panel (4 lines * 16 elements * 4 bytes).
move P1, P0
|
||||
PTR_ADDI P0, P0, 0x100
|
||||
|
||||
// I = number of full 16-element chunks per line.
PTR_SRAI I, N, 0x04
|
||||
beq ZERO, I, .L_M4_N15
|
||||
.align 5
|
||||
.L_M4_N16:
|
||||
// Copy 16 elements from each of the 4 lines into 0x100 contiguous bytes at P1.
xvld U0, S1, 0x00
|
||||
xvld U1, S1, 0x20
|
||||
xvld U2, S2, 0x00
|
||||
xvld U3, S2, 0x20
|
||||
|
||||
xvst U0, P1, 0x00
|
||||
xvst U1, P1, 0x20
|
||||
xvst U2, P1, 0x40
|
||||
xvst U3, P1, 0x60
|
||||
|
||||
xvld U4, S3, 0x00
|
||||
xvld U5, S3, 0x20
|
||||
xvld U6, S4, 0x00
|
||||
xvld U7, S4, 0x20
|
||||
|
||||
xvst U4, P1, 0x80
|
||||
xvst U5, P1, 0xA0
|
||||
xvst U6, P1, 0xC0
|
||||
xvst U7, P1, 0xE0
|
||||
|
||||
PTR_ADDI S1, S1, 0x40
|
||||
PTR_ADDI S2, S2, 0x40
|
||||
PTR_ADDI S3, S3, 0x40
|
||||
PTR_ADDI S4, S4, 0x40
|
||||
// Step to the same slot in the next 16-element panel.
PTR_ADDI I, I, -1
|
||||
PTR_ADD P1, P1, T1
|
||||
blt ZERO, I, .L_M4_N16
|
||||
.L_M4_N15:
|
||||
// Bit 3 of N set: one 8-element chunk per line, written at P2.
andi I, N, 0x08
|
||||
beq ZERO, I, .L_M4_N7
|
||||
.L_M4_N8:
|
||||
xvld U0, S1, 0x00
|
||||
xvld U1, S2, 0x00
|
||||
xvld U2, S3, 0x00
|
||||
xvld U3, S4, 0x00
|
||||
|
||||
GST xv, , U0, P2, 0x00, U1, P2, 0x20, U2, P2, 0x40, U3, P2, 0x60
|
||||
|
||||
PTR_ADDI S1, S1, 0x20
|
||||
PTR_ADDI S2, S2, 0x20
|
||||
PTR_ADDI S3, S3, 0x20
|
||||
PTR_ADDI S4, S4, 0x20
|
||||
PTR_ADDI P2, P2, 0x80
|
||||
.L_M4_N7:
|
||||
// Bit 2 of N set: one 4-element chunk per line, written at P3.
andi I, N, 0x04
|
||||
beq ZERO, I, .L_M4_N3
|
||||
.L_M4_N4:
|
||||
GLD v, , $vr0, S1, 0x00, $vr1, S2, 0x00, $vr2, S3, 0x00, $vr3, S4, 0x00
|
||||
GST v, , $vr0, P3, 0x00, $vr1, P3, 0x10, $vr2, P3, 0x20, $vr3, P3, 0x30
|
||||
PTR_ADDI S1, S1, 0x10
|
||||
PTR_ADDI S2, S2, 0x10
|
||||
PTR_ADDI S3, S3, 0x10
|
||||
PTR_ADDI S4, S4, 0x10
|
||||
PTR_ADDI P3, P3, 0x40
|
||||
.L_M4_N3:
|
||||
// Bit 1 of N set: one 2-element chunk per line, written at P4.
andi I, N, 0x02
|
||||
beq ZERO, I, .L_M4_N1
|
||||
.L_M4_N2:
|
||||
GLD f, d, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00
|
||||
GST f, d, $f0, P4, 0x00, $f1, P4, 0x08, $f2, P4, 0x10, $f3, P4, 0x18
|
||||
PTR_ADDI S1, S1, 0x08
|
||||
PTR_ADDI S2, S2, 0x08
|
||||
PTR_ADDI S3, S3, 0x08
|
||||
PTR_ADDI S4, S4, 0x08
|
||||
PTR_ADDI P4, P4, 0x20
|
||||
.L_M4_N1:
|
||||
// Bit 0 of N set: the last single element of each line, written at P5.
andi I, N, 0x01
|
||||
beq ZERO, I, .L_M3
|
||||
|
||||
GLD f, s, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00
|
||||
GST f, s, $f0, P5, 0x00, $f1, P5, 0x04, $f2, P5, 0x08, $f3, P5, 0x0C
|
||||
PTR_ADDI S1, S1, 0x04
|
||||
PTR_ADDI S2, S2, 0x04
|
||||
PTR_ADDI S3, S3, 0x04
|
||||
PTR_ADDI S4, S4, 0x04
|
||||
PTR_ADDI P5, P5, 0x10
|
||||
// 2-line tail of the 16-wide routine: runs once when bit 1 of M is set.
.L_M3:
|
||||
andi J, M, 0x02
|
||||
beq ZERO, J, .L_M1
|
||||
.L_M2:
|
||||
// S1/S2 = the 2 lines of this group; S0 moves on by 2 lines (T0 = 2*TL).
move S1, S0
|
||||
PTR_ADD S2, S0, TL
|
||||
PTR_ADD S0, S0, T0
|
||||
|
||||
// This group occupies 0x80 bytes per panel (2 lines * 16 elements * 4 bytes).
move P1, P0
|
||||
PTR_ADDI P0, P0, 0x80
|
||||
|
||||
// I = number of full 16-element chunks per line.
PTR_SRAI I, N, 0x04
|
||||
beq ZERO, I, .L_M2_N15
|
||||
.align 5
|
||||
.L_M2_N16:
|
||||
xvld U0, S1, 0x00
|
||||
xvld U1, S1, 0x20
|
||||
xvld U2, S2, 0x00
|
||||
xvld U3, S2, 0x20
|
||||
|
||||
xvst U0, P1, 0x00
|
||||
xvst U1, P1, 0x20
|
||||
xvst U2, P1, 0x40
|
||||
xvst U3, P1, 0x60
|
||||
|
||||
PTR_ADDI S1, S1, 0x40
|
||||
PTR_ADDI S2, S2, 0x40
|
||||
// Step to the same slot in the next 16-element panel.
PTR_ADDI I, I, -1
|
||||
PTR_ADD P1, P1, T1
|
||||
blt ZERO, I, .L_M2_N16
|
||||
.L_M2_N15:
|
||||
// Bit 3 of N set: one 8-element chunk per line, written at P2.
andi I, N, 0x08
|
||||
beq ZERO, I, .L_M2_N7
|
||||
.L_M2_N8:
|
||||
xvld U0, S1, 0x00
|
||||
xvld U1, S2, 0x00
|
||||
|
||||
GST xv, , U0, P2, 0x00, U1, P2, 0x20
|
||||
|
||||
PTR_ADDI S1, S1, 0x20
|
||||
PTR_ADDI S2, S2, 0x20
|
||||
PTR_ADDI P2, P2, 0x40
|
||||
.L_M2_N7:
|
||||
// Bit 2 of N set: one 4-element chunk per line, written at P3.
andi I, N, 0x04
|
||||
beq ZERO, I, .L_M2_N3
|
||||
.L_M2_N4:
|
||||
GLD v, , $vr0, S1, 0x00, $vr1, S2, 0x00
|
||||
GST v, , $vr0, P3, 0x00, $vr1, P3, 0x10
|
||||
PTR_ADDI S1, S1, 0x10
|
||||
PTR_ADDI S2, S2, 0x10
|
||||
PTR_ADDI P3, P3, 0x20
|
||||
.L_M2_N3:
|
||||
// Bit 1 of N set: one 2-element chunk per line, written at P4.
andi I, N, 0x02
|
||||
beq ZERO, I, .L_M2_N1
|
||||
.L_M2_N2:
|
||||
GLD f, d, $f0, S1, 0x00, $f1, S2, 0x00
|
||||
GST f, d, $f0, P4, 0x00, $f1, P4, 0x08
|
||||
PTR_ADDI S1, S1, 0x08
|
||||
PTR_ADDI S2, S2, 0x08
|
||||
PTR_ADDI P4, P4, 0x10
|
||||
.L_M2_N1:
|
||||
// Bit 0 of N set: the last single element of each line, written at P5.
andi I, N, 0x01
|
||||
beq ZERO, I, .L_M1
|
||||
|
||||
GLD f, s, $f0, S1, 0x00, $f1, S2, 0x00
|
||||
GST f, s, $f0, P5, 0x00, $f1, P5, 0x04
|
||||
PTR_ADDI S1, S1, 0x04
|
||||
PTR_ADDI S2, S2, 0x04
|
||||
PTR_ADDI P5, P5, 0x08
|
||||
// 1-line tail of the 16-wide routine (bit 0 of M), followed by the common exit.
.L_M1:
|
||||
andi J, M, 0x01
|
||||
beq ZERO, J, .L_M0
|
||||
|
||||
move S1, S0
|
||||
// NOTE(review): S2 is computed here but never read in this single-line tail.
PTR_ADD S2, S0, TL
|
||||
|
||||
move P1, P0
|
||||
// NOTE(review): P0 is advanced but not read again before the return.
PTR_ADDI P0, P0, 0x40
|
||||
|
||||
// I = number of full 16-element chunks in the line.
PTR_SRAI I, N, 0x04
|
||||
beq ZERO, I, .L_M1_N15
|
||||
.align 5
|
||||
.L_M1_N16:
|
||||
// Copy 16 elements (0x40 bytes) of the line to P1, then step to the next panel.
xvld U0, S1, 0x00
|
||||
xvld U1, S1, 0x20
|
||||
|
||||
xvst U0, P1, 0x00
|
||||
xvst U1, P1, 0x20
|
||||
|
||||
PTR_ADDI S1, S1, 0x40
|
||||
PTR_ADDI I, I, -1
|
||||
PTR_ADD P1, P1, T1
|
||||
blt ZERO, I, .L_M1_N16
|
||||
.L_M1_N15:
|
||||
// Bit 3 of N set: 8 more elements, written at P2.
andi I, N, 0x08
|
||||
beq ZERO, I, .L_M1_N7
|
||||
.L_M1_N8:
|
||||
xvld U0, S1, 0x00
|
||||
|
||||
GST xv, , U0, P2, 0x00
|
||||
|
||||
PTR_ADDI S1, S1, 0x20
|
||||
PTR_ADDI P2, P2, 0x20
|
||||
.L_M1_N7:
|
||||
// Bit 2 of N set: 4 more elements, written at P3.
andi I, N, 0x04
|
||||
beq ZERO, I, .L_M1_N3
|
||||
.L_M1_N4:
|
||||
GLD v, , $vr0, S1, 0x00
|
||||
GST v, , $vr0, P3, 0x00
|
||||
PTR_ADDI S1, S1, 0x10
|
||||
PTR_ADDI P3, P3, 0x10
|
||||
.L_M1_N3:
|
||||
// Bit 1 of N set: 2 more elements, written at P4.
andi I, N, 0x02
|
||||
beq ZERO, I, .L_M1_N1
|
||||
.L_M1_N2:
|
||||
GLD f, d, $f0, S1, 0x00
|
||||
GST f, d, $f0, P4, 0x00
|
||||
PTR_ADDI S1, S1, 0x08
|
||||
PTR_ADDI P4, P4, 0x08
|
||||
.L_M1_N1:
|
||||
// Bit 0 of N set: the last single element, written at P5.
andi I, N, 0x01
|
||||
beq ZERO, I, .L_M0
|
||||
|
||||
GLD f, s, $f0, S1, 0x00
|
||||
GST f, s, $f0, P5, 0x00
|
||||
PTR_ADDI S1, S1, 0x04
|
||||
PTR_ADDI P5, P5, 0x04
|
||||
.L_M0:
|
||||
// Common exit: pop_if_used mirrors the push_if_used 24, 8 at entry (macro from
// loongarch64_asm.S), then jump to the address in $r1 without linking.
pop_if_used 24, 8
|
||||
jirl $r0, $r1, 0x00
|
||||
EPILOGUE
|
|
@ -0,0 +1,406 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2023, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
#define ASSEMBLER
|
||||
|
||||
#include "common.h"
|
||||
#include "loongarch64_asm.S"
|
||||
|
||||
/*********************************************************************
|
||||
* 2023/08/23 guxiwei
|
||||
* UTEST : OK
|
||||
* CTEST : OK
|
||||
* TEST : OK
|
||||
*********************************************************************/
|
||||
|
||||
/* Function parameters */
|
||||
// Register aliases used by the copy routine below.
// The routine walks M source lines that are LDA elements apart and copies
// N consecutive 4-byte elements from each line into DST.
#define M $r4 // param 1: m
|
||||
#define N $r5 // param 2: n
|
||||
#define SRC $r6 // param 3: src
|
||||
#define LDA $r7 // param 4: lda
|
||||
#define DST $r8 // param 5: dst
|
||||
|
||||
// I: inner trip counter and scratch for the N bit tests.
// J: outer counter over groups of source lines, and scratch for the M bit tests.
#define I $r9
|
||||
#define J $r10
|
||||
// S0: start of the next unprocessed group of source lines.
// S1..S8: read cursors, one per source line of the current group.
#define S0 $r11
|
||||
#define S1 $r12
|
||||
#define S2 $r13
|
||||
#define S3 $r14
|
||||
#define S4 $r15
|
||||
#define S5 $r16
|
||||
#define S6 $r17
|
||||
#define S7 $r18
|
||||
#define S8 $r19
|
||||
// P0: destination slot of the next line group inside the 8-element panels.
// P1: write cursor used by the 8-element loops.
// P2..P4: write cursors for the 4-, 2- and 1-element remainder regions.
#define P0 $r20
|
||||
#define P1 $r23
|
||||
#define P2 $r24
|
||||
#define P3 $r25
|
||||
#define P4 $r26
|
||||
// T0/T1: scratch during setup; afterwards T0 = 2*TL and T1 = M*32 bytes.
#define T0 $r27
|
||||
#define T1 $r28
|
||||
// TL shares $r7 with LDA: the setup code rewrites LDA in place as a byte stride.
#define TL $r7
|
||||
// Drop any earlier ZERO definition before binding ZERO to $r0, which is used
// as the constant-zero operand of the compare-and-branch instructions.
#undef ZERO
|
||||
#define ZERO $r0
|
||||
|
||||
/* LASX vectors */
|
||||
// U0..U7 name the eight 256-bit registers used as copy buffers.
#define U0 $xr0
|
||||
#define U1 $xr1
|
||||
#define U2 $xr2
|
||||
#define U3 $xr3
|
||||
#define U4 $xr4
|
||||
#define U5 $xr5
|
||||
#define U6 $xr6
|
||||
#define U7 $xr7
|
||||
|
||||
// Loops outline
|
||||
//.L_M8 <-------------------
|
||||
//| .L_N8: |
|
||||
//| .L_N7: | Main Loop
|
||||
//| .L_N4: |
|
||||
//| .L_N3: |
|
||||
//| .L_N2: |
|
||||
//| .L_N1: |
|
||||
//| .L_N0: ---------------
|
||||
//.L_M7
|
||||
//.L_M4
|
||||
//| .L_M4_N8:
|
||||
//| .L_M4_N7:
|
||||
//| .L_M4_N4:
|
||||
//| .L_M4_N3:
|
||||
//| .L_M4_N2:
|
||||
//| .L_M4_N1:
|
||||
//.L_M3
|
||||
//.L_M2
|
||||
//| .L_M2_N8:
|
||||
//| .L_M2_N7:
|
||||
//| .L_M2_N4:
|
||||
//| .L_M2_N3:
|
||||
//| .L_M2_N2:
|
||||
//| .L_M2_N1:
|
||||
//.L_M1
|
||||
//| .L_M1_N8:
|
||||
//| .L_M1_N7:
|
||||
//| .L_M1_N4:
|
||||
//| .L_M1_N3:
|
||||
//| .L_M1_N2:
|
||||
//| .L_M1_N1:
|
||||
//.L_M0
|
||||
|
||||
// Entry and pointer setup for the 8-wide copy routine.
// PROLOGUE and push_if_used are macros from the included headers; presumably
// they emit the function symbol and save callee-saved registers (TODO confirm).
// The matching pop_if_used 23, 8 is at .L_M0.
PROLOGUE
|
||||
push_if_used 23, 8
|
||||
|
||||
// S0 walks the source, P0 walks the 8-element panel area of the destination.
move S0, SRC
|
||||
move P0, DST
|
||||
|
||||
// T1 = N rounded down to a multiple of 8.
// NOTE(review): the T0 value computed here (N rounded down to 16) is
// overwritten below before it is ever read.
PTR_SRAI T0, N, 0x04
|
||||
PTR_SRAI T1, N, 0x03
|
||||
PTR_SLLI T0, T0, 0x04
|
||||
PTR_SLLI T1, T1, 0x03
|
||||
|
||||
// P2 = DST + 4 * M * (N rounded down to 8): start of the 4-element remainder region.
PTR_MUL P2, M, T1
|
||||
PTR_SLLI P2, P2, 0x02
|
||||
PTR_ADD P2, DST, P2
|
||||
// T0 = N rounded down to a multiple of 4, T1 = N rounded down to a multiple of 2.
PTR_SRAI T0, N, 0x02
|
||||
PTR_SRAI T1, N, 0x01
|
||||
PTR_SLLI T0, T0, 0x02
|
||||
PTR_SLLI T1, T1, 0x01
|
||||
// P3 = DST + 4 * M * (N rounded down to 4): start of the 2-element remainder region.
// P4 = DST + 4 * M * (N rounded down to 2): start of the 1-element remainder region.
PTR_MUL P3, M, T0
|
||||
PTR_MUL P4, M, T1
|
||||
PTR_SLLI P3, P3, 0x02
|
||||
PTR_SLLI P4, P4, 0x02
|
||||
PTR_ADD P3, DST, P3
|
||||
PTR_ADD P4, DST, P4
|
||||
|
||||
// TL = LDA * 4: distance in bytes between consecutive source lines (overwrites LDA).
PTR_SLLI TL, LDA, 0x02
|
||||
// J = M / 8: number of full 8-line groups.
PTR_SRAI J, M, 0x03
|
||||
// T0 = 2 * TL, used to step two source lines at a time.
PTR_SLLI T0, TL, 0x01
|
||||
// T1 = M * 32 bytes: distance between consecutive 8-element panels in DST
// (M lines * 8 elements * 4 bytes).
PTR_SLLI T1, M, 0x05
|
||||
// No full 8-line group: go straight to the 4/2/1-line tails.
beq ZERO, J, .L_M7
|
||||
// Main loop of the 8-wide routine: each pass copies one group of 8 source lines.
.align 5
|
||||
.L_M8:
|
||||
// S1..S8 = the 8 lines of this group (S0 + k*TL for k = 0..7);
// S0 then moves on to the next group (S0 + 8*TL).
move S1, S0
|
||||
PTR_ADD S2, S0, TL
|
||||
PTR_ADD S3, S1, T0
|
||||
PTR_ADD S4, S2, T0
|
||||
PTR_ADD S5, S3, T0
|
||||
PTR_ADD S6, S4, T0
|
||||
PTR_ADD S7, S5, T0
|
||||
PTR_ADD S8, S6, T0
|
||||
PTR_ADD S0, S7, T0
|
||||
|
||||
// P1 = write cursor for this group; P0 skips the 0x100 bytes this group
// occupies in one panel (8 lines * 8 elements * 4 bytes).
move P1, P0
|
||||
PTR_ADDI P0, P0, 0x100
|
||||
|
||||
// I = number of full 8-element chunks per line; J counts the remaining groups.
PTR_SRAI I, N, 0x03
|
||||
PTR_ADDI J, J, -1
|
||||
beq ZERO, I, .L_N7
|
||||
.L_N8:
|
||||
// Load 8 elements (one 32-byte vector) from each of the 8 lines.
xvld U0, S1, 0x00
|
||||
xvld U1, S2, 0x00
|
||||
xvld U2, S3, 0x00
|
||||
xvld U3, S4, 0x00
|
||||
xvld U4, S5, 0x00
|
||||
xvld U5, S6, 0x00
|
||||
xvld U6, S7, 0x00
|
||||
xvld U7, S8, 0x00
|
||||
|
||||
// GST is a macro from loongarch64_asm.S; presumably it expands to one store
// per (register, base, offset) triple (TODO confirm). 8 x 32 bytes go to P1.
GST xv, , U0, P1, 0x00, U1, P1, 0x20, U2, P1, 0x40, U3, P1, 0x60, \
|
||||
U4, P1, 0x80, U5, P1, 0xA0, U6, P1, 0xC0, U7, P1, 0xE0
|
||||
|
||||
// Advance every line cursor by 8 elements (0x20 bytes).
PTR_ADDI S1, S1, 0x20
|
||||
PTR_ADDI S2, S2, 0x20
|
||||
PTR_ADDI S3, S3, 0x20
|
||||
PTR_ADDI S4, S4, 0x20
|
||||
PTR_ADDI S5, S5, 0x20
|
||||
PTR_ADDI S6, S6, 0x20
|
||||
PTR_ADDI S7, S7, 0x20
|
||||
PTR_ADDI S8, S8, 0x20
|
||||
|
||||
// Next chunk goes to the same slot of the next panel (P1 += M * 32 bytes).
PTR_ADDI I, I, -1
|
||||
PTR_ADD P1, P1, T1
|
||||
blt ZERO, I, .L_N8
|
||||
.L_N7:
|
||||
// Bit 2 of N set: one 4-element (16-byte) chunk per line, written at P2.
andi I, N, 0x04
|
||||
beq ZERO, I, .L_N3
|
||||
.L_N4:
|
||||
// GLD is the load counterpart of GST (same macro file, same hedge).
GLD v, , $vr0, S1, 0x00, $vr1, S2, 0x00, $vr2, S3, 0x00, $vr3, S4, 0x00, \
|
||||
$vr4, S5, 0x00, $vr5, S6, 0x00, $vr6, S7, 0x00, $vr7, S8, 0x00
|
||||
GST v, , $vr0, P2, 0x00, $vr1, P2, 0x10, $vr2, P2, 0x20, $vr3, P2, 0x30, \
|
||||
$vr4, P2, 0x40, $vr5, P2, 0x50, $vr6, P2, 0x60, $vr7, P2, 0x70
|
||||
PTR_ADDI S1, S1, 0x10
|
||||
PTR_ADDI S2, S2, 0x10
|
||||
PTR_ADDI S3, S3, 0x10
|
||||
PTR_ADDI S4, S4, 0x10
|
||||
PTR_ADDI S5, S5, 0x10
|
||||
PTR_ADDI S6, S6, 0x10
|
||||
PTR_ADDI S7, S7, 0x10
|
||||
PTR_ADDI S8, S8, 0x10
|
||||
PTR_ADDI P2, P2, 0x80
|
||||
.L_N3:
|
||||
// Bit 1 of N set: one 2-element (8-byte) chunk per line, written at P3.
andi I, N, 0x02
|
||||
beq ZERO, I, .L_N1
|
||||
.L_N2:
|
||||
GLD f, d, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00, \
|
||||
$f4, S5, 0x00, $f5, S6, 0x00, $f6, S7, 0x00, $f7, S8, 0x00
|
||||
GST f, d, $f0, P3, 0x00, $f1, P3, 0x08, $f2, P3, 0x10, $f3, P3, 0x18, \
|
||||
$f4, P3, 0x20, $f5, P3, 0x28, $f6, P3, 0x30, $f7, P3, 0x38
|
||||
PTR_ADDI S1, S1, 0x08
|
||||
PTR_ADDI S2, S2, 0x08
|
||||
PTR_ADDI S3, S3, 0x08
|
||||
PTR_ADDI S4, S4, 0x08
|
||||
PTR_ADDI S5, S5, 0x08
|
||||
PTR_ADDI S6, S6, 0x08
|
||||
PTR_ADDI S7, S7, 0x08
|
||||
PTR_ADDI S8, S8, 0x08
|
||||
PTR_ADDI P3, P3, 0x40
|
||||
.L_N1:
|
||||
// Bit 0 of N set: the last single element of each line, written at P4.
andi I, N, 0x01
|
||||
beq ZERO, I, .L_N0
|
||||
|
||||
GLD f, s, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00, \
|
||||
$f4, S5, 0x00, $f5, S6, 0x00, $f6, S7, 0x00, $f7, S8, 0x00
|
||||
GST f, s, $f0, P4, 0x00, $f1, P4, 0x04, $f2, P4, 0x08, $f3, P4, 0x0C, \
|
||||
$f4, P4, 0x10, $f5, P4, 0x14, $f6, P4, 0x18, $f7, P4, 0x1C
|
||||
PTR_ADDI S1, S1, 0x04
|
||||
PTR_ADDI S2, S2, 0x04
|
||||
PTR_ADDI S3, S3, 0x04
|
||||
PTR_ADDI S4, S4, 0x04
|
||||
PTR_ADDI S5, S5, 0x04
|
||||
PTR_ADDI S6, S6, 0x04
|
||||
PTR_ADDI S7, S7, 0x04
|
||||
PTR_ADDI S8, S8, 0x04
|
||||
PTR_ADDI P4, P4, 0x20
|
||||
.L_N0:
|
||||
// Repeat while 8-line groups remain (J was decremented at the top of the pass).
blt ZERO, J, .L_M8
|
||||
|
||||
// 4-line tail of the 8-wide routine: runs once when bit 2 of M is set.
.L_M7:
|
||||
andi J, M, 0x04
|
||||
beq ZERO, J, .L_M3
|
||||
.L_M4:
|
||||
// S1..S4 = the 4 lines of this group; S0 moves on by 4 lines.
move S1, S0
|
||||
PTR_ADD S2, S0, TL
|
||||
PTR_ADD S3, S1, T0
|
||||
PTR_ADD S4, S2, T0
|
||||
PTR_ADD S0, S3, T0
|
||||
|
||||
// This group occupies 0x80 bytes per panel (4 lines * 8 elements * 4 bytes).
move P1, P0
|
||||
PTR_ADDI P0, P0, 0x80
|
||||
|
||||
// I = number of full 8-element chunks per line.
PTR_SRAI I, N, 0x03
|
||||
beq ZERO, I, .L_M4_N7
|
||||
.align 5
|
||||
.L_M4_N8:
|
||||
// Copy 8 elements from each of the 4 lines into 0x80 contiguous bytes at P1.
xvld U0, S1, 0x00
|
||||
xvld U1, S2, 0x00
|
||||
xvld U2, S3, 0x00
|
||||
xvld U3, S4, 0x00
|
||||
|
||||
GST xv, , U0, P1, 0x00, U1, P1, 0x20, U2, P1, 0x40, U3, P1, 0x60
|
||||
|
||||
PTR_ADDI S1, S1, 0x20
|
||||
PTR_ADDI S2, S2, 0x20
|
||||
PTR_ADDI S3, S3, 0x20
|
||||
PTR_ADDI S4, S4, 0x20
|
||||
|
||||
// Step to the same slot in the next 8-element panel.
PTR_ADDI I, I, -1
|
||||
PTR_ADD P1, P1, T1
|
||||
blt ZERO, I, .L_M4_N8
|
||||
.L_M4_N7:
|
||||
// Bit 2 of N set: one 4-element chunk per line, written at P2.
andi I, N, 0x04
|
||||
beq ZERO, I, .L_M4_N3
|
||||
.L_M4_N4:
|
||||
GLD v, , $vr0, S1, 0x00, $vr1, S2, 0x00, $vr2, S3, 0x00, $vr3, S4, 0x00
|
||||
GST v, , $vr0, P2, 0x00, $vr1, P2, 0x10, $vr2, P2, 0x20, $vr3, P2, 0x30
|
||||
PTR_ADDI S1, S1, 0x10
|
||||
PTR_ADDI S2, S2, 0x10
|
||||
PTR_ADDI S3, S3, 0x10
|
||||
PTR_ADDI S4, S4, 0x10
|
||||
PTR_ADDI P2, P2, 0x40
|
||||
.L_M4_N3:
|
||||
// Bit 1 of N set: one 2-element chunk per line, written at P3.
andi I, N, 0x02
|
||||
beq ZERO, I, .L_M4_N1
|
||||
.L_M4_N2:
|
||||
GLD f, d, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00
|
||||
GST f, d, $f0, P3, 0x00, $f1, P3, 0x08, $f2, P3, 0x10, $f3, P3, 0x18
|
||||
PTR_ADDI S1, S1, 0x08
|
||||
PTR_ADDI S2, S2, 0x08
|
||||
PTR_ADDI S3, S3, 0x08
|
||||
PTR_ADDI S4, S4, 0x08
|
||||
PTR_ADDI P3, P3, 0x20
|
||||
.L_M4_N1:
|
||||
// Bit 0 of N set: the last single element of each line, written at P4.
andi I, N, 0x01
|
||||
beq ZERO, I, .L_M3
|
||||
|
||||
GLD f, s, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00
|
||||
GST f, s, $f0, P4, 0x00, $f1, P4, 0x04, $f2, P4, 0x08, $f3, P4, 0x0C
|
||||
PTR_ADDI S1, S1, 0x04
|
||||
PTR_ADDI S2, S2, 0x04
|
||||
PTR_ADDI S3, S3, 0x04
|
||||
PTR_ADDI S4, S4, 0x04
|
||||
PTR_ADDI P4, P4, 0x10
|
||||
// 2-line tail of the 8-wide routine: runs once when bit 1 of M is set.
.L_M3:
|
||||
andi J, M, 0x02
|
||||
beq ZERO, J, .L_M1
|
||||
.L_M2:
|
||||
// S1/S2 = the 2 lines of this group; S0 moves on by 2 lines (T0 = 2*TL).
move S1, S0
|
||||
PTR_ADD S2, S0, TL
|
||||
PTR_ADD S0, S0, T0
|
||||
|
||||
// This group occupies 0x40 bytes per panel (2 lines * 8 elements * 4 bytes).
move P1, P0
|
||||
PTR_ADDI P0, P0, 0x40
|
||||
|
||||
// I = number of full 8-element chunks per line.
PTR_SRAI I, N, 0x03
|
||||
beq ZERO, I, .L_M2_N7
|
||||
.align 5
|
||||
.L_M2_N8:
|
||||
xvld U0, S1, 0x00
|
||||
xvld U1, S2, 0x00
|
||||
|
||||
GST xv, , U0, P1, 0x00, U1, P1, 0x20
|
||||
|
||||
PTR_ADDI S1, S1, 0x20
|
||||
PTR_ADDI S2, S2, 0x20
|
||||
// Step to the same slot in the next 8-element panel.
PTR_ADDI I, I, -1
|
||||
PTR_ADD P1, P1, T1
|
||||
blt ZERO, I, .L_M2_N8
|
||||
.L_M2_N7:
|
||||
// Bit 2 of N set: one 4-element chunk per line, written at P2.
andi I, N, 0x04
|
||||
beq ZERO, I, .L_M2_N3
|
||||
.L_M2_N4:
|
||||
GLD v, , $vr0, S1, 0x00, $vr1, S2, 0x00
|
||||
GST v, , $vr0, P2, 0x00, $vr1, P2, 0x10
|
||||
PTR_ADDI S1, S1, 0x10
|
||||
PTR_ADDI S2, S2, 0x10
|
||||
PTR_ADDI P2, P2, 0x20
|
||||
.L_M2_N3:
|
||||
// Bit 1 of N set: one 2-element chunk per line, written at P3.
andi I, N, 0x02
|
||||
beq ZERO, I, .L_M2_N1
|
||||
.L_M2_N2:
|
||||
GLD f, d, $f0, S1, 0x00, $f1, S2, 0x00
|
||||
GST f, d, $f0, P3, 0x00, $f1, P3, 0x08
|
||||
PTR_ADDI S1, S1, 0x08
|
||||
PTR_ADDI S2, S2, 0x08
|
||||
PTR_ADDI P3, P3, 0x10
|
||||
.L_M2_N1:
|
||||
// Bit 0 of N set: the last single element of each line, written at P4.
andi I, N, 0x01
|
||||
beq ZERO, I, .L_M1
|
||||
|
||||
GLD f, s, $f0, S1, 0x00, $f1, S2, 0x00
|
||||
GST f, s, $f0, P4, 0x00, $f1, P4, 0x04
|
||||
PTR_ADDI S1, S1, 0x04
|
||||
PTR_ADDI S2, S2, 0x04
|
||||
PTR_ADDI P4, P4, 0x08
|
||||
// 1-line tail of the 8-wide routine (bit 0 of M), followed by the common exit.
.L_M1:
|
||||
andi J, M, 0x01
|
||||
beq ZERO, J, .L_M0
|
||||
|
||||
move S1, S0
|
||||
// NOTE(review): S2 is computed here but never read in this single-line tail.
PTR_ADD S2, S0, TL
|
||||
|
||||
move P1, P0
|
||||
// NOTE(review): P0 is advanced but not read again before the return.
PTR_ADDI P0, P0, 0x20
|
||||
|
||||
// I = number of full 8-element chunks in the line.
PTR_SRAI I, N, 0x03
|
||||
beq ZERO, I, .L_M1_N7
|
||||
.align 5
|
||||
.L_M1_N8:
|
||||
// Copy 8 elements (0x20 bytes) of the line to P1, then step to the next panel.
xvld U0, S1, 0x00
|
||||
|
||||
GST xv, , U0, P1, 0x00
|
||||
|
||||
PTR_ADDI S1, S1, 0x20
|
||||
|
||||
PTR_ADDI I, I, -1
|
||||
PTR_ADD P1, P1, T1
|
||||
blt ZERO, I, .L_M1_N8
|
||||
.L_M1_N7:
|
||||
// Bit 2 of N set: 4 more elements, written at P2.
andi I, N, 0x04
|
||||
beq ZERO, I, .L_M1_N3
|
||||
.L_M1_N4:
|
||||
GLD v, , $vr0, S1, 0x00
|
||||
GST v, , $vr0, P2, 0x00
|
||||
PTR_ADDI S1, S1, 0x10
|
||||
PTR_ADDI P2, P2, 0x10
|
||||
.L_M1_N3:
|
||||
// Bit 1 of N set: 2 more elements, written at P3.
andi I, N, 0x02
|
||||
beq ZERO, I, .L_M1_N1
|
||||
.L_M1_N2:
|
||||
GLD f, d, $f0, S1, 0x00
|
||||
GST f, d, $f0, P3, 0x00
|
||||
PTR_ADDI S1, S1, 0x08
|
||||
PTR_ADDI P3, P3, 0x08
|
||||
.L_M1_N1:
|
||||
// Bit 0 of N set: the last single element, written at P4.
andi I, N, 0x01
|
||||
beq ZERO, I, .L_M0
|
||||
|
||||
GLD f, s, $f0, S1, 0x00
|
||||
GST f, s, $f0, P4, 0x00
|
||||
PTR_ADDI S1, S1, 0x04
|
||||
PTR_ADDI P4, P4, 0x04
|
||||
.L_M0:
|
||||
// Common exit: pop_if_used mirrors the push_if_used 23, 8 at entry (macro from
// loongarch64_asm.S), then jump to the address in $r1 without linking.
pop_if_used 23, 8
|
||||
jirl $r0, $r1, 0x00
|
||||
EPILOGUE
|
|
@ -61,7 +61,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
fmov.d s2, s1
|
||||
bge $r0, N, .L999
|
||||
slli.d INCX, INCX, BASE_SHIFT
|
||||
bge $r0, INCX, .L999
|
||||
beq $r0, INCX, .L999
|
||||
srai.d I, N, 3
|
||||
bne INCX, TEMP, .L20
|
||||
bge $r0, I, .L15
|
||||
|
|
|
@ -64,7 +64,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
MTC s1, $r0
|
||||
bge $r0, N, .L999
|
||||
slli.d INCX, INCX, ZBASE_SHIFT
|
||||
bge $r0, INCX, .L999
|
||||
beq $r0, INCX, .L999
|
||||
move XX, X
|
||||
MOV s2, s1
|
||||
srai.d I, N, 2
|
||||
|
|
|
@ -57,7 +57,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
|||
FLOAT absxi = 0.0;
|
||||
|
||||
|
||||
if (n <= 0 || inc_x <= 0) return(0.0);
|
||||
if (n <= 0 || inc_x == 0) return(0.0);
|
||||
if ( n == 1 ) return( ABS(x[0]) );
|
||||
|
||||
n *= inc_x;
|
||||
|
|
|
@ -48,7 +48,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
|||
BLASLONG inc_x2;
|
||||
FLOAT temp;
|
||||
|
||||
if (n <= 0 || inc_x <= 0) return(0.0);
|
||||
if (n <= 0 || inc_x == 0) return(0.0);
|
||||
|
||||
inc_x2 = 2 * inc_x;
|
||||
|
||||
|
|
|
@ -77,7 +77,7 @@
|
|||
blez N, .L999
|
||||
mov.d s2, s1
|
||||
|
||||
blez INCX, .L999
|
||||
beqz INCX, .L999
|
||||
dsll INCX, INCX, ZBASE_SHIFT
|
||||
|
||||
dsra I, N, 2
|
||||
|
|
|
@ -81,7 +81,7 @@
|
|||
blez N, .L999
|
||||
MTC $0, s1
|
||||
|
||||
blez INCX, .L999
|
||||
beqz INCX, .L999
|
||||
dsll INCX, INCX, BASE_SHIFT
|
||||
|
||||
move XX, X
|
||||
|
|
|
@ -77,7 +77,7 @@
|
|||
blez N, .L999
|
||||
mov.d s2, s1
|
||||
|
||||
blez INCX, .L999
|
||||
beqz INCX, .L999
|
||||
dsll INCX, INCX, BASE_SHIFT
|
||||
|
||||
bne INCX, TEMP, .L20
|
||||
|
|
|
@ -80,7 +80,7 @@
|
|||
blez N, .L999
|
||||
MTC $0, s1
|
||||
|
||||
blez INCX, .L999
|
||||
beqz INCX, .L999
|
||||
dsll INCX, INCX, ZBASE_SHIFT
|
||||
|
||||
move XX, X
|
||||
|
|
|
@ -99,7 +99,7 @@
|
|||
cmpwi cr0, N, 0
|
||||
ble- LL(9999)
|
||||
cmpwi cr0, INCX, 0
|
||||
ble- LL(9999)
|
||||
beq- LL(9999)
|
||||
|
||||
fmr f0, f1
|
||||
fmr f2, f1
|
||||
|
|
|
@ -119,7 +119,7 @@
|
|||
cmpwi cr0, N, 0
|
||||
ble LL(99)
|
||||
cmpwi cr0, INCX, 0
|
||||
ble LL(99)
|
||||
beq LL(99)
|
||||
|
||||
andi. r0, X, 2 * SIZE - 1
|
||||
bne LL(100)
|
||||
|
|
|
@ -104,7 +104,7 @@
|
|||
cmpwi cr0, N, 0
|
||||
ble- LL(999)
|
||||
cmpwi cr0, INCX, 0
|
||||
ble- LL(999)
|
||||
beq- LL(999)
|
||||
|
||||
fmr f0, f1
|
||||
sub X, X, INCX
|
||||
|
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue