Merge branch 'xianyi:develop' into issue4130

This commit is contained in:
Martin Kroeker 2023-09-01 09:05:58 +02:00 committed by GitHub
commit 42909ce57d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
211 changed files with 8940 additions and 3131 deletions

View File

@ -30,6 +30,15 @@ task:
- cd build - cd build
- cmake -DTARGET=VORTEX -DCMAKE_C_COMPILER=clang -DBUILD_SHARED_LIBS=ON .. - cmake -DTARGET=VORTEX -DCMAKE_C_COMPILER=clang -DBUILD_SHARED_LIBS=ON ..
- make - make
task:
name: AppleM1/GCC/MAKE/OPENMP
compile_script:
- brew install gcc@11
- export PATH=/opt/homebrew/bin:$PATH
- export LDFLAGS="-L/opt/homebrew/lib"
- export CPPFLAGS="-I/opt/homebrew/include"
- make CC=gcc-11 FC=gfortran-11 USE_OPENMP=1
macos_instance: macos_instance:
image: ghcr.io/cirruslabs/macos-monterey-xcode:latest image: ghcr.io/cirruslabs/macos-monterey-xcode:latest

View File

@ -151,40 +151,53 @@ jobs:
strategy: strategy:
fail-fast: false fail-fast: false
matrix: matrix:
msystem: [MINGW64, MINGW32, CLANG64] msystem: [MINGW64, MINGW32, CLANG64, CLANG32]
idx: [int32, int64] idx: [int32, int64]
build-type: [Release] build-type: [Release]
include: include:
- msystem: MINGW64 - msystem: MINGW64
idx: int32 idx: int32
target-prefix: mingw-w64-x86_64 target-prefix: mingw-w64-x86_64
fc-pkg: mingw-w64-x86_64-gcc-fortran fc-pkg: fc
- msystem: MINGW32 - msystem: MINGW32
idx: int32 idx: int32
target-prefix: mingw-w64-i686 target-prefix: mingw-w64-i686
fc-pkg: mingw-w64-i686-gcc-fortran fc-pkg: fc
- msystem: CLANG64 - msystem: CLANG64
idx: int32 idx: int32
target-prefix: mingw-w64-clang-x86_64 target-prefix: mingw-w64-clang-x86_64
fc-pkg: fc
# Compiling with Flang 16 seems to cause test errors on machines
# with AVX512 instructions. Revisit after MSYS2 distributes Flang 17.
no-avx512-flags: -DNO_AVX512=1
- msystem: CLANG32
idx: int32
target-prefix: mingw-w64-clang-i686
fc-pkg: cc
c-lapack-flags: -DC_LAPACK=ON c-lapack-flags: -DC_LAPACK=ON
- msystem: MINGW64 - msystem: MINGW64
idx: int64 idx: int64
idx64-flags: -DBINARY=64 -DINTERFACE64=1 idx64-flags: -DBINARY=64 -DINTERFACE64=1
target-prefix: mingw-w64-x86_64 target-prefix: mingw-w64-x86_64
fc-pkg: mingw-w64-x86_64-gcc-fortran fc-pkg: fc
- msystem: CLANG64 - msystem: CLANG64
idx: int64 idx: int64
idx64-flags: -DBINARY=64 -DINTERFACE64=1 idx64-flags: -DBINARY=64 -DINTERFACE64=1
target-prefix: mingw-w64-clang-x86_64 target-prefix: mingw-w64-clang-x86_64
c-lapack-flags: -DC_LAPACK=ON fc-pkg: fc
# Compiling with Flang 16 seems to cause test errors on machines
# with AVX512 instructions. Revisit after MSYS2 distributes Flang 17.
no-avx512-flags: -DNO_AVX512=1
- msystem: MINGW64 - msystem: MINGW64
idx: int32 idx: int32
target-prefix: mingw-w64-x86_64 target-prefix: mingw-w64-x86_64
fc-pkg: mingw-w64-x86_64-gcc-fortran fc-pkg: fc
build-type: None build-type: None
exclude: exclude:
- msystem: MINGW32 - msystem: MINGW32
idx: int64 idx: int64
- msystem: CLANG32
idx: int64
defaults: defaults:
run: run:
@ -209,7 +222,7 @@ jobs:
install: >- install: >-
base-devel base-devel
${{ matrix.target-prefix }}-cc ${{ matrix.target-prefix }}-cc
${{ matrix.fc-pkg }} ${{ matrix.target-prefix }}-${{ matrix.fc-pkg }}
${{ matrix.target-prefix }}-cmake ${{ matrix.target-prefix }}-cmake
${{ matrix.target-prefix }}-ninja ${{ matrix.target-prefix }}-ninja
${{ matrix.target-prefix }}-ccache ${{ matrix.target-prefix }}-ccache
@ -261,6 +274,7 @@ jobs:
-DTARGET=CORE2 \ -DTARGET=CORE2 \
${{ matrix.idx64-flags }} \ ${{ matrix.idx64-flags }} \
${{ matrix.c-lapack-flags }} \ ${{ matrix.c-lapack-flags }} \
${{ matrix.no-avx512-flags }} \
-DCMAKE_C_COMPILER_LAUNCHER=ccache \ -DCMAKE_C_COMPILER_LAUNCHER=ccache \
-DCMAKE_Fortran_COMPILER_LAUNCHER=ccache \ -DCMAKE_Fortran_COMPILER_LAUNCHER=ccache \
.. ..
@ -280,9 +294,22 @@ jobs:
key: ${{ steps.ccache-prepare.outputs.key }} key: ${{ steps.ccache-prepare.outputs.key }}
- name: Run tests - name: Run tests
id: run-ctest
timeout-minutes: 60 timeout-minutes: 60
run: cd build && ctest run: cd build && ctest
- name: Re-run tests
if: always() && (steps.run-ctest.outcome == 'failure')
timeout-minutes: 60
run: |
cd build
echo "::group::Re-run ctest"
ctest --rerun-failed --output-on-failure || true
echo "::endgroup::"
echo "::group::Log from these tests"
[ ! -f Testing/Temporary/LastTest.log ] || cat Testing/Temporary/LastTest.log
echo "::endgroup::"
cross_build: cross_build:
runs-on: ubuntu-22.04 runs-on: ubuntu-22.04

110
.github/workflows/loongarch64.yml vendored Normal file
View File

@ -0,0 +1,110 @@
# Cross-builds OpenBLAS for LoongArch64 and runs its test programs under
# qemu user-mode emulation, once per TARGET listed in the matrix.
name: loongarch64 qemu test

on: [push, pull_request]

jobs:
  TEST:
    runs-on: ubuntu-latest
    strategy:
      # Let every target finish even when one of them fails.
      fail-fast: false
      matrix:
        include:
          - target: LOONGSONGENERIC
            triple: loongarch64-unknown-linux-gnu
            opts: NO_SHARED=1 TARGET=LOONGSONGENERIC
          - target: LOONGSON3R5
            triple: loongarch64-unknown-linux-gnu
            opts: NO_SHARED=1 TARGET=LOONGSON3R5
          - target: LOONGSON2K1000
            triple: loongarch64-unknown-linux-gnu
            opts: NO_SHARED=1 TARGET=LOONGSON2K1000

    steps:
      - name: Checkout repository
        uses: actions/checkout@v3

      # -y keeps both commands non-interactive even when the runner image
      # does not preconfigure apt to assume "yes".
      - name: Install APT deps
        run: |
          sudo add-apt-repository -y ppa:savoury1/virtualisation
          sudo apt-get update
          sudo apt-get install -y autoconf automake autotools-dev ninja-build make ccache \
            qemu-user-static

      - name: Download and install loongarch64-toolchain
        run: |
          wget https://github.com/loongson/build-tools/releases/download/2022.09.06/loongarch64-clfs-7.3-cross-tools-gcc-glibc.tar.xz
          tar -xf loongarch64-clfs-7.3-cross-tools-gcc-glibc.tar.xz -C /opt

      # Export the cross toolchain locations to all following steps.
      - name: Set env
        run: |
          echo "LD_LIBRARY_PATH=/opt/cross-tools/target/usr/lib64:/opt/cross-tools/loongarch64-unknown-linux-gnu/lib64:$LD_LIBRARY_PATH" >> "$GITHUB_ENV"
          echo "PATH=$GITHUB_WORKSPACE:/opt/cross-tools/bin:$PATH" >> "$GITHUB_ENV"

      - name: Compilation cache
        uses: actions/cache@v3
        with:
          path: ~/.ccache
          key: ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }}-${{ github.sha }}
          restore-keys: |
            ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }}
            ccache-${{ runner.os }}-${{ matrix.target }}

      - name: Configure ccache
        run: |
          test -d ~/.ccache || mkdir -p ~/.ccache
          echo "max_size = 300M" > ~/.ccache/ccache.conf
          echo "compression = true" >> ~/.ccache/ccache.conf
          ccache -s

      # Truncates the dsdot unit-test source so that test is not built.
      - name: Disable utest dsdot:dsdot_n_1
        run: |
          echo -n > utest/test_dsdot.c
          echo "Due to the qemu versions 7.2 causing utest cases to fail,"
          echo "the utest dsdot:dsdot_n_1 have been temporarily disabled."

      # Statically linked with the cross compiler; the host gcc builds the
      # helper programs that run during the build.
      - name: Build OpenBLAS
        run: make CC='ccache ${{ matrix.triple }}-gcc -static' FC='ccache ${{ matrix.triple }}-gfortran -static' ${{ matrix.opts }} HOSTCC='ccache gcc' -j$(nproc)

      # Runs the unit tests, the CBLAS tests and the Fortran BLAS tests
      # (single-threaded and with two threads) under qemu user emulation.
      - name: Test
        run: |
          qemu-loongarch64-static ./utest/openblas_utest
          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xscblat1
          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xdcblat1
          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xccblat1
          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xzcblat1
          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xscblat2 < ./ctest/sin2
          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xdcblat2 < ./ctest/din2
          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xccblat2 < ./ctest/cin2
          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xzcblat2 < ./ctest/zin2
          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xscblat3 < ./ctest/sin3
          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xdcblat3 < ./ctest/din3
          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xccblat3 < ./ctest/cin3
          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xzcblat3 < ./ctest/zin3
          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/sblat1
          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/dblat1
          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/cblat1
          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/zblat1
          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/sblat1
          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/dblat1
          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/cblat1
          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/zblat1
          rm -f ./test/?BLAT2.SUMM
          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/sblat2 < ./test/sblat2.dat
          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/dblat2 < ./test/dblat2.dat
          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/cblat2 < ./test/cblat2.dat
          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/zblat2 < ./test/zblat2.dat
          rm -f ./test/?BLAT2.SUMM
          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/sblat2 < ./test/sblat2.dat
          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/dblat2 < ./test/dblat2.dat
          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/cblat2 < ./test/cblat2.dat
          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/zblat2 < ./test/zblat2.dat
          rm -f ./test/?BLAT3.SUMM
          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/sblat3 < ./test/sblat3.dat
          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/dblat3 < ./test/dblat3.dat
          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/cblat3 < ./test/cblat3.dat
          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/zblat3 < ./test/zblat3.dat
          rm -f ./test/?BLAT3.SUMM
          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/sblat3 < ./test/sblat3.dat
          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/dblat3 < ./test/dblat3.dat
          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/cblat3 < ./test/cblat3.dat
          OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/zblat3 < ./test/zblat3.dat

2
.gitignore vendored
View File

@ -72,6 +72,7 @@ test/SBLAT3.SUMM
test/ZBLAT2.SUMM test/ZBLAT2.SUMM
test/ZBLAT3.SUMM test/ZBLAT3.SUMM
test/SHBLAT3.SUMM test/SHBLAT3.SUMM
test/SBBLAT3.SUMM
test/cblat1 test/cblat1
test/cblat2 test/cblat2
test/cblat3 test/cblat3
@ -82,6 +83,7 @@ test/sblat1
test/sblat2 test/sblat2
test/sblat3 test/sblat3
test/test_shgemm test/test_shgemm
test/test_sbgemm
test/zblat1 test/zblat1
test/zblat2 test/zblat2
test/zblat3 test/zblat3

2
Jenkinsfile vendored
View File

@ -7,7 +7,7 @@ pipeline {
stages { stages {
stage('Build') { stage('Build') {
steps { steps {
sh 'make' sh 'make clean && make'
} }
} }
} }

View File

@ -9,7 +9,7 @@ pipeline {
steps { steps {
sh 'sudo apt update' sh 'sudo apt update'
sh 'sudo apt install gfortran -y' sh 'sudo apt install gfortran -y'
sh 'make' sh 'make clean && make'
} }
} }
} }

View File

@ -384,6 +384,11 @@ GCCMINORVERSIONGTEQ4 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d
GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 7) GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 7)
endif endif
ifeq ($(C_COMPILER), CLANG)
CLANGVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9)
CLANGVERSIONGTEQ12 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 12)
endif
# #
# OS dependent settings # OS dependent settings
# #
@ -668,6 +673,7 @@ DYNAMIC_CORE += NEOVERSEN1
ifneq ($(NO_SVE), 1) ifneq ($(NO_SVE), 1)
DYNAMIC_CORE += NEOVERSEV1 DYNAMIC_CORE += NEOVERSEV1
DYNAMIC_CORE += NEOVERSEN2 DYNAMIC_CORE += NEOVERSEN2
DYNAMIC_CORE += ARMV8SVE
endif endif
DYNAMIC_CORE += CORTEXA55 DYNAMIC_CORE += CORTEXA55
DYNAMIC_CORE += FALKOR DYNAMIC_CORE += FALKOR
@ -1086,8 +1092,9 @@ endif
endif endif
endif endif
ifeq ($(F_COMPILER), GFORTRAN) ifeq ($(F_COMPILER), $(filter $(F_COMPILER),GFORTRAN FLANGNEW))
CCOMMON_OPT += -DF_INTERFACE_GFORT CCOMMON_OPT += -DF_INTERFACE_GFORT
ifeq ($(F_COMPILER), GFORTRAN)
FCOMMON_OPT += -Wall FCOMMON_OPT += -Wall
# make single-threaded LAPACK calls thread-safe #1847 # make single-threaded LAPACK calls thread-safe #1847
FCOMMON_OPT += -frecursive FCOMMON_OPT += -frecursive
@ -1101,6 +1108,7 @@ EXTRALIB += -lgfortran
endif endif
endif endif
endif endif
endif
ifdef NO_BINARY_MODE ifdef NO_BINARY_MODE
ifeq ($(ARCH), $(filter $(ARCH),mips64)) ifeq ($(ARCH), $(filter $(ARCH),mips64))
ifdef BINARY64 ifdef BINARY64
@ -1767,6 +1775,8 @@ export TARGET_CORE
export NO_AVX512 export NO_AVX512
export NO_AVX2 export NO_AVX2
export BUILD_BFLOAT16 export BUILD_BFLOAT16
export NO_LSX
export NO_LASX
export SBGEMM_UNROLL_M export SBGEMM_UNROLL_M
export SBGEMM_UNROLL_N export SBGEMM_UNROLL_N

View File

@ -75,18 +75,31 @@ endif
ifeq ($(CORE), COOPERLAKE) ifeq ($(CORE), COOPERLAKE)
ifndef NO_AVX512 ifndef NO_AVX512
ifeq ($(C_COMPILER), GCC) ifeq ($(C_COMPILER), GCC)
# cooperlake support was added in 10.1 # cooperlake support was added in 10.1
ifeq ($(GCCVERSIONGTEQ10)$(GCCMINORVERSIONGTEQ1), 11) ifeq ($(GCCVERSIONGTEQ10)$(GCCMINORVERSIONGTEQ1), 11)
CCOMMON_OPT += -march=cooperlake CCOMMON_OPT += -march=cooperlake
ifneq ($(F_COMPILER), NAG) ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=cooperlake FCOMMON_OPT += -march=cooperlake
endif endif
else # gcc not support, fallback to avx512 else # gcc not support, fallback to avx512
CCOMMON_OPT += -march=skylake-avx512 CCOMMON_OPT += -march=skylake-avx512
ifneq ($(F_COMPILER), NAG) ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=skylake-avx512 FCOMMON_OPT += -march=skylake-avx512
endif endif
endif endif
else ifeq ($(C_COMPILER), CLANG)
# cooperlake support was added in clang 9
ifeq ($(CLANGVERSIONGTEQ9), 1)
CCOMMON_OPT += -march=cooperlake
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=cooperlake
endif
else # not supported in clang, fallback to avx512
CCOMMON_OPT += -march=skylake-avx512
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=skylake-avx512
endif
endif
endif endif
ifeq ($(OSNAME), CYGWIN_NT) ifeq ($(OSNAME), CYGWIN_NT)
CCOMMON_OPT += -fno-asynchronous-unwind-tables CCOMMON_OPT += -fno-asynchronous-unwind-tables
@ -104,18 +117,31 @@ endif
ifeq ($(CORE), SAPPHIRERAPIDS) ifeq ($(CORE), SAPPHIRERAPIDS)
ifndef NO_AVX512 ifndef NO_AVX512
ifeq ($(C_COMPILER), GCC) ifeq ($(C_COMPILER), GCC)
# sapphire rapids support was added in 11 # sapphire rapids support was added in 11
ifeq ($(GCCVERSIONGTEQ11), 1) ifeq ($(GCCVERSIONGTEQ11), 1)
CCOMMON_OPT += -march=sapphirerapids CCOMMON_OPT += -march=sapphirerapids
ifneq ($(F_COMPILER), NAG) ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=sapphirerapids FCOMMON_OPT += -march=sapphirerapids
endif endif
else # gcc not support, fallback to avx512 else # gcc not support, fallback to avx512
CCOMMON_OPT += -march=skylake-avx512 CCOMMON_OPT += -march=skylake-avx512
ifneq ($(F_COMPILER), NAG) ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=skylake-avx512 FCOMMON_OPT += -march=skylake-avx512
endif endif
endif endif
else ifeq ($(C_COMPILER), CLANG)
# sapphirerapids support was added in clang 12; older clang falls back to
# the generic AVX512 target below.
# NOTE(review): the same -march is handed to the Fortran compiler, as in the
# GCC branch above - confirm the paired Fortran compiler accepts it.
ifeq ($(CLANGVERSIONGTEQ12), 1)
CCOMMON_OPT += -march=sapphirerapids
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=sapphirerapids
endif
else # not supported in clang, fallback to avx512
CCOMMON_OPT += -march=skylake-avx512
ifneq ($(F_COMPILER), NAG)
FCOMMON_OPT += -march=skylake-avx512
endif
endif
endif endif
ifeq ($(OSNAME), CYGWIN_NT) ifeq ($(OSNAME), CYGWIN_NT)
CCOMMON_OPT += -fno-asynchronous-unwind-tables CCOMMON_OPT += -fno-asynchronous-unwind-tables

View File

@ -271,6 +271,19 @@ jobs:
- script: | - script: |
make TARGET=ARMV7 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 make TARGET=ARMV7 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1
- job: OSX_xbuild_DYNAMIC_ARM64
pool:
vmImage: 'macOS-11'
variables:
CC: /Applications/Xcode_12.5.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang
CFLAGS: -O2 -Wno-macro-redefined -isysroot /Applications/Xcode_12.5.1.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX11.3.sdk -arch arm64
steps:
- script: |
ls /Applications/Xcode_12.5.1.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs
/Applications/Xcode_12.5.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang -arch arm64 --print-supported-cpus
/Applications/Xcode_11.7.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang --version
make TARGET=ARMV8 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1
- job: ALPINE_MUSL - job: ALPINE_MUSL
pool: pool:
vmImage: 'ubuntu-latest' vmImage: 'ubuntu-latest'

0
benchmark/spr.c Executable file → Normal file
View File

0
benchmark/spr2.c Executable file → Normal file
View File

38
c_check
View File

@ -185,6 +185,37 @@ if [ "$architecture" = "mips" ] || [ "$architecture" = "mips64" ]; then
rm -rf "$tmpd" rm -rf "$tmpd"
fi fi
# Probe whether the C compiler can build LoongArch LSX (128-bit) and LASX
# (256-bit) SIMD code. Each probe compiles a tiny program that includes the
# intrinsics header and issues one vector instruction; a failed compile sets
# the matching no_* flag, which is reported further down as NO_LSX / NO_LASX.
no_lsx=0
no_lasx=0
if [ "$architecture" = "loongarch64" ]; then
    tmpd="$(mktemp -d)"

    # --- LSX ---
    tmplsx="$tmpd/lsx.c"
    codelsx='"vadd.b $vr0, $vr0, $vr0"'
    lsx_flags='-march=loongarch64 -mlsx'
    {
        printf "#include <lsxintrin.h>\n\n"
        printf "void main(void){ __asm__ volatile(%s);}\n" "$codelsx"
    } >> "$tmplsx"
    args="$lsx_flags -o $tmplsx.o $tmplsx"
    # $flags and $args are left unquoted on purpose so they split into words.
    $compiler_name $flags $args >/dev/null 2>&1 || no_lsx=1

    # --- LASX ---
    tmplasx="$tmpd/lasx.c"
    codelasx='"xvadd.b $xr0, $xr0, $xr0"'
    lasx_flags='-march=loongarch64 -mlasx'
    {
        printf "#include <lasxintrin.h>\n\n"
        printf "void main(void){ __asm__ volatile(%s);}\n" "$codelasx"
    } >> "$tmplasx"
    args="$lasx_flags -o $tmplasx.o $tmplasx"
    $compiler_name $flags $args >/dev/null 2>&1 || no_lasx=1

    rm -rf "$tmpd"
fi
case "$data" in case "$data" in
*ARCH_X86_64*) architecture=x86_64 ;; *ARCH_X86_64*) architecture=x86_64 ;;
*ARCH_X86*) architecture=x86 ;; *ARCH_X86*) architecture=x86 ;;
@ -252,6 +283,9 @@ if [ "$architecture" = "arm64" ]; then
no_sve=0 no_sve=0
{ {
$compiler_name $flags $args >/dev/null 2>&1 $compiler_name $flags $args >/dev/null 2>&1
} || {
args=" -Msve_intrinsics -c -o $tmpf.o $tmpf"
$compiler_name $flags $args >/dev/null 2>&1
} || { } || {
no_sve=1 no_sve=1
} }
@ -399,6 +433,8 @@ done
[ "$no_avx512" -eq 1 ] && printf "NO_AVX512=1\n" [ "$no_avx512" -eq 1 ] && printf "NO_AVX512=1\n"
[ "$no_avx2" -eq 1 ] && printf "NO_AVX2=1\n" [ "$no_avx2" -eq 1 ] && printf "NO_AVX2=1\n"
[ "$oldgcc" -eq 1 ] && printf "OLDGCC=1\n" [ "$oldgcc" -eq 1 ] && printf "OLDGCC=1\n"
[ "$no_lsx" -eq 1 ] && printf "NO_LSX=1\n"
[ "$no_lasx" -eq 1 ] && printf "NO_LASX=1\n"
} >> "$makefile" } >> "$makefile"
os=`echo "$os" | tr '[[:lower:]]' '[[:upper:]]'/ ` os=`echo "$os" | tr '[[:lower:]]' '[[:upper:]]'/ `
@ -414,6 +450,8 @@ compiler=`echo "$compiler" | tr '[[:lower:]]' '[[:upper:]]' `
[ -n "$need_fu" ] && printf "#define FUNDERSCORE\t%s\n" "$need_fu" [ -n "$need_fu" ] && printf "#define FUNDERSCORE\t%s\n" "$need_fu"
[ "$no_msa" -eq 1 ] && printf "#define NO_MSA\t1\n" [ "$no_msa" -eq 1 ] && printf "#define NO_MSA\t1\n"
[ "$c11_atomics" -eq 1 ] && printf "#define HAVE_C11\t1\n" [ "$c11_atomics" -eq 1 ] && printf "#define HAVE_C11\t1\n"
[ "$no_lsx" -eq 1 ] && printf "#define NO_LSX\t1\n"
[ "$no_lasx" -eq 1 ] && printf "#define NO_LASX\t1\n"
} >> "$config" } >> "$config"

View File

@ -232,6 +232,47 @@ if (($architecture eq "mips") || ($architecture eq "mips64")) {
} }
} }
# Probe whether the C compiler can build LoongArch LSX and LASX SIMD code.
# Each probe writes a tiny C file that includes the intrinsics header and
# issues one vector instruction, then tries to compile it; a non-zero exit
# status sets $no_lsx / $no_lasx, which are reported later as NO_LSX / NO_LASX.
$no_lsx = 0;
$no_lasx = 0;
if (($architecture eq "loongarch64")) {
    eval "use File::Temp qw(tempfile)";
    if ($@){
        # Without File::Temp no probe can run; both flags stay 0.
        warn "could not load PERL module File::Temp, so could not check LSX and LASX capability";
    } else {
        # --- LSX ---
        $tmplsx = new File::Temp( SUFFIX => '.c' , UNLINK => 1 );
        $codelsx = '"vadd.b $vr0, $vr0, $vr0"';
        $lsx_flags = "-march=loongarch64 -mlsx";
        print $tmplsx "#include <lsxintrin.h>\n\n";
        print $tmplsx "void main(void){ __asm__ volatile($codelsx); }\n";

        $args = "$lsx_flags -o $tmplsx.o $tmplsx";
        my @cmd = ("$compiler_name $flags $args >/dev/null 2>/dev/null");
        # Only the exit status in $? matters; the return value is not used.
        system(@cmd);
        $no_lsx = ($? != 0) ? 1 : 0;
        unlink("$tmplsx.o");

        # --- LASX ---
        $tmplasx = new File::Temp( SUFFIX => '.c' , UNLINK => 1 );
        $codelasx = '"xvadd.b $xr0, $xr0, $xr0"';
        $lasx_flags = "-march=loongarch64 -mlasx";
        print $tmplasx "#include <lasxintrin.h>\n\n";
        print $tmplasx "void main(void){ __asm__ volatile($codelasx); }\n";

        $args = "$lasx_flags -o $tmplasx.o $tmplasx";
        # Reuse @cmd instead of redeclaring it with a second "my".
        @cmd = ("$compiler_name $flags $args >/dev/null 2>/dev/null");
        system(@cmd);
        $no_lasx = ($? != 0) ? 1 : 0;
        unlink("$tmplasx.o");
    }
}
$architecture = x86 if ($data =~ /ARCH_X86/); $architecture = x86 if ($data =~ /ARCH_X86/);
$architecture = x86_64 if ($data =~ /ARCH_X86_64/); $architecture = x86_64 if ($data =~ /ARCH_X86_64/);
$architecture = e2k if ($data =~ /ARCH_E2K/); $architecture = e2k if ($data =~ /ARCH_E2K/);
@ -424,6 +465,8 @@ print MAKEFILE "NO_RV64GV=1\n" if $no_rv64gv eq 1;
print MAKEFILE "NO_AVX512=1\n" if $no_avx512 eq 1; print MAKEFILE "NO_AVX512=1\n" if $no_avx512 eq 1;
print MAKEFILE "NO_AVX2=1\n" if $no_avx2 eq 1; print MAKEFILE "NO_AVX2=1\n" if $no_avx2 eq 1;
print MAKEFILE "OLDGCC=1\n" if $oldgcc eq 1; print MAKEFILE "OLDGCC=1\n" if $oldgcc eq 1;
print MAKEFILE "NO_LSX=1\n" if $no_lsx eq 1;
print MAKEFILE "NO_LASX=1\n" if $no_lasx eq 1;
$os =~ tr/[a-z]/[A-Z]/; $os =~ tr/[a-z]/[A-Z]/;
$architecture =~ tr/[a-z]/[A-Z]/; $architecture =~ tr/[a-z]/[A-Z]/;
@ -437,6 +480,8 @@ print CONFFILE "#define __64BIT__\t1\n" if $binformat eq bin64;
print CONFFILE "#define FUNDERSCORE\t$need_fu\n" if $need_fu ne ""; print CONFFILE "#define FUNDERSCORE\t$need_fu\n" if $need_fu ne "";
print CONFFILE "#define HAVE_MSA\t1\n" if $have_msa eq 1; print CONFFILE "#define HAVE_MSA\t1\n" if $have_msa eq 1;
print CONFFILE "#define HAVE_C11\t1\n" if $c11_atomics eq 1; print CONFFILE "#define HAVE_C11\t1\n" if $c11_atomics eq 1;
print CONFFILE "#define NO_LSX\t1\n" if $no_lsx eq 1;
print CONFFILE "#define NO_LASX\t1\n" if $no_lasx eq 1;
if ($os eq "LINUX") { if ($os eq "LINUX") {

View File

@ -46,7 +46,7 @@ if (DYNAMIC_ARCH)
if (ARM64) if (ARM64)
set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA55 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110) set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA55 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110)
if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 9.99) if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 9.99)
set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2) set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2 ARMV8SVE)
endif () endif ()
if (DYNAMIC_LIST) if (DYNAMIC_LIST)
set(DYNAMIC_CORE ARMV8 ${DYNAMIC_LIST}) set(DYNAMIC_CORE ARMV8 ${DYNAMIC_LIST})
@ -135,7 +135,7 @@ if (ARM64)
set(BINARY_DEFINED 1) set(BINARY_DEFINED 1)
endif () endif ()
if (${ARCH} STREQUAL "riscv64") if (RISCV64)
set(NO_BINARY_MODE 1) set(NO_BINARY_MODE 1)
set(BINARY_DEFINED 1) set(BINARY_DEFINED 1)
endif () endif ()

View File

@ -180,22 +180,30 @@ endif ()
if (${CORE} STREQUAL NEOVERSEN2) if (${CORE} STREQUAL NEOVERSEN2)
if (NOT DYNAMIC_ARCH) if (NOT DYNAMIC_ARCH)
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE)
if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4) set (CCOMMON_OPT "${CCOMMON_OPT} -Msve_intrinsics -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2")
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2")
else () else ()
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve") execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
endif() if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4)
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2")
else ()
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve")
endif()
endif ()
endif () endif ()
endif () endif ()
if (${CORE} STREQUAL NEOVERSEV1) if (${CORE} STREQUAL NEOVERSEV1)
if (NOT DYNAMIC_ARCH) if (NOT DYNAMIC_ARCH)
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE)
if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4) set (CCOMMON_OPT "${CCOMMON_OPT} -Msve_intrinsics -march=armv8.4-a+sve -mtune=neoverse-v1")
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.4-a+sve -mtune=neoverse-v1")
else () else ()
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve") execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4)
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.4-a+sve -mtune=neoverse-v1")
else ()
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve")
endif()
endif() endif()
endif () endif ()
endif () endif ()
@ -213,7 +221,11 @@ endif ()
if (${CORE} STREQUAL ARMV8SVE) if (${CORE} STREQUAL ARMV8SVE)
if (NOT DYNAMIC_ARCH) if (NOT DYNAMIC_ARCH)
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve") if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE)
set (CCOMMON_OPT "${CCOMMON_OPT} -Msve_intrinsics -march=armv8-a+sve")
else ()
set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve")
endif ()
endif () endif ()
endif () endif ()

View File

@ -3,7 +3,8 @@
## Description: Ported from portion of OpenBLAS/Makefile.system ## Description: Ported from portion of OpenBLAS/Makefile.system
## Sets Fortran related variables. ## Sets Fortran related variables.
if (${F_COMPILER} STREQUAL "FLANG") if (${F_COMPILER} STREQUAL "FLANG" AND NOT CMAKE_Fortran_COMPILER_ID STREQUAL "LLVMFlang")
# This is for classic Flang. LLVM Flang is handled with gfortran below.
set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_FLANG") set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_FLANG")
if (BINARY64 AND INTERFACE64) if (BINARY64 AND INTERFACE64)
set(FCOMMON_OPT "${FCOMMON_OPT} -i8") set(FCOMMON_OPT "${FCOMMON_OPT} -i8")
@ -38,15 +39,17 @@ if (${F_COMPILER} STREQUAL "G95")
endif () endif ()
endif () endif ()
if (${F_COMPILER} STREQUAL "GFORTRAN" OR ${F_COMPILER} STREQUAL "F95") if (${F_COMPILER} STREQUAL "GFORTRAN" OR ${F_COMPILER} STREQUAL "F95" OR CMAKE_Fortran_COMPILER_ID STREQUAL "LLVMFlang")
set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_GFORT") set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_GFORT")
# ensure reentrancy of lapack codes if (NOT CMAKE_Fortran_COMPILER_ID STREQUAL "LLVMFlang")
set(FCOMMON_OPT "${FCOMMON_OPT} -Wall -frecursive") # ensure reentrancy of lapack codes
# work around ABI violation in passing string arguments from C set(FCOMMON_OPT "${FCOMMON_OPT} -Wall -frecursive")
set(FCOMMON_OPT "${FCOMMON_OPT} -fno-optimize-sibling-calls") # work around ABI violation in passing string arguments from C
#Don't include -lgfortran, when NO_LAPACK=1 or lsbcc set(FCOMMON_OPT "${FCOMMON_OPT} -fno-optimize-sibling-calls")
if (NOT NO_LAPACK) if (NOT NO_LAPACK)
set(EXTRALIB "${EXTRALIB} -lgfortran") # Don't include -lgfortran, when NO_LAPACK=1 or lsbcc
set(EXTRALIB "${EXTRALIB} -lgfortran")
endif ()
endif () endif ()
if (NO_BINARY_MODE) if (NO_BINARY_MODE)
if (MIPS64) if (MIPS64)
@ -63,6 +66,13 @@ if (${F_COMPILER} STREQUAL "GFORTRAN" OR ${F_COMPILER} STREQUAL "F95")
set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp32") set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp32")
endif () endif ()
endif () endif ()
if (RISCV64)
if (BINARY64)
if (INTERFACE64)
set(FCOMMON_OPT "${FCOMMON_OPT} -fdefault-integer-8")
endif ()
endif ()
endif ()
else () else ()
if (BINARY64) if (BINARY64)
set(FCOMMON_OPT "${FCOMMON_OPT} -m64") set(FCOMMON_OPT "${FCOMMON_OPT} -m64")

View File

@ -282,23 +282,35 @@ if (DEFINED TARGET)
endif() endif()
if (${TARGET} STREQUAL NEOVERSEV1) if (${TARGET} STREQUAL NEOVERSEV1)
if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE)
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -Msve_intrinsics -march=armv8.4-a+sve -mtune=neoverse-v1")
else ()
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4) if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4)
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv8.4-a+sve -mtune=neoverse-v1") set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv8.4-a+sve -mtune=neoverse-v1")
else () else ()
message(FATAL_ERROR "Compiler ${CMAKE_C_COMPILER} ${GCC_VERSION} does not support Neoverse V1.") message(FATAL_ERROR "Compiler ${CMAKE_C_COMPILER} ${GCC_VERSION} does not support Neoverse V1.")
endif() endif()
endif()
endif() endif()
# Kernel flags for the Neoverse N2 target.
# A compiler identified as "PGI" (unless NO_SVE is set) gets -Msve_intrinsics,
# spelled as in the other SVE targets; any other compiler must report a
# -dumpversion of at least 10.4, otherwise configuration stops.
if (${TARGET} STREQUAL NEOVERSEN2)
  if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE)
    set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -Msve_intrinsics -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2")
  else ()
    execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
    if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4)
      set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2")
    else ()
      # Report the compiler path and the version that was actually detected.
      message(FATAL_ERROR "Compiler ${CMAKE_C_COMPILER} ${GCC_VERSION} does not support Neoverse N2.")
    endif()
  endif()
endif()
# Kernel flags for the generic ARMv8 SVE target.
# A compiler identified as "PGI" (unless NO_SVE is set) additionally gets
# -Msve_intrinsics, spelled as in the other SVE targets.
if (${TARGET} STREQUAL ARMV8SVE)
  if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE)
    set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -Msve_intrinsics -march=armv8.2-a+sve")
  else ()
    set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv8.2-a+sve")
  endif()
endif()
endif() endif()

View File

@ -44,6 +44,8 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "mips64.*")
set(MIPS64 1) set(MIPS64 1)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "loongarch64.*") elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "loongarch64.*")
set(LOONGARCH64 1) set(LOONGARCH64 1)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "riscv64.*")
set(RISCV64 1)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*") elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*")
if (NOT BINARY) if (NOT BINARY)
if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8") if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
@ -60,7 +62,7 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*")
endif() endif()
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "i686.*|i386.*|x86.*|amd64.*|AMD64.*") elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "i686.*|i386.*|x86.*|amd64.*|AMD64.*")
set(X86 1) set(X86 1)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64.*|AARCH64.*|arm64.*|ARM64.*)") elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64.*|AARCH64.*|arm64.*|ARM64.*|armv8.*)")
if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8") if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
set(ARM64 1) set(ARM64 1)
else() else()
@ -107,7 +109,7 @@ else()
endif () endif ()
if (NOT BINARY) if (NOT BINARY)
if (X86_64 OR ARM64 OR POWER OR MIPS64 OR LOONGARCH64) if (X86_64 OR ARM64 OR POWER OR MIPS64 OR LOONGARCH64 OR RISCV64)
set(BINARY 64) set(BINARY 64)
else () else ()
set(BINARY 32) set(BINARY 32)

View File

@ -53,7 +53,6 @@ extern void goto_set_num_threads(int nthreads);
/* Global Parameter */ /* Global Parameter */
extern int blas_cpu_number; extern int blas_cpu_number;
extern int blas_num_threads; extern int blas_num_threads;
extern int blas_num_threads_set;
extern int blas_omp_linked; extern int blas_omp_linked;
#define BLAS_LEGACY 0x8000U #define BLAS_LEGACY 0x8000U
@ -136,15 +135,13 @@ typedef struct blas_queue {
#ifdef SMP_SERVER #ifdef SMP_SERVER
extern int blas_server_avail; extern int blas_server_avail;
extern int blas_omp_number_max;
static __inline int num_cpu_avail(int level) { static __inline int num_cpu_avail(int level) {
#ifdef USE_OPENMP #ifdef USE_OPENMP
int openmp_nthreads; int openmp_nthreads;
if (blas_num_threads_set == 0)
openmp_nthreads=omp_get_max_threads(); openmp_nthreads=omp_get_max_threads();
else
openmp_nthreads=blas_cpu_number;
#endif #endif
#ifndef USE_OPENMP #ifndef USE_OPENMP
@ -156,7 +153,13 @@ int openmp_nthreads;
) return 1; ) return 1;
#ifdef USE_OPENMP #ifdef USE_OPENMP
if (blas_cpu_number != openmp_nthreads) { if (openmp_nthreads > blas_omp_number_max){
#ifdef DEBUG
fprintf(stderr,"WARNING - more OpenMP threads requested (%d) than available (%d)\n",openmp_nthreads,blas_omp_number_max);
#endif
openmp_nthreads = blas_omp_number_max;
}
if (blas_cpu_number != openmp_nthreads) {
goto_set_num_threads(openmp_nthreads); goto_set_num_threads(openmp_nthreads);
} }
#endif #endif

View File

@ -32,6 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/ **********************************************************************************/
#include <stdint.h> #include <stdint.h>
#include <sys/auxv.h>
/* If LASX extension instructions supported, /* If LASX extension instructions supported,
* using core LOONGSON3R5 * using core LOONGSON3R5
@ -46,9 +47,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CPU_LOONGSON3R5 1 #define CPU_LOONGSON3R5 1
#define CPU_LOONGSON2K1000 2 #define CPU_LOONGSON2K1000 2
#define LOONGARCH_CFG2 0x02 #define LA_HWCAP_LSX (1<<4)
#define LOONGARCH_LASX 1<<7 #define LA_HWCAP_LASX (1<<5)
#define LOONGARCH_LSX 1<<6
static char *cpuname[] = { static char *cpuname[] = {
"LOONGSONGENERIC", "LOONGSONGENERIC",
@ -64,17 +64,11 @@ static char *cpuname_lower[] = {
int detect(void) { int detect(void) {
#ifdef __linux #ifdef __linux
uint32_t reg = 0; int flag = (int)getauxval(AT_HWCAP);
__asm__ volatile ( if (flag & LA_HWCAP_LASX)
"cpucfg %0, %1 \n\t"
: "+&r"(reg)
: "r"(LOONGARCH_CFG2)
);
if (reg & LOONGARCH_LASX)
return CPU_LOONGSON3R5; return CPU_LOONGSON3R5;
else if (reg & LOONGARCH_LSX) else if (flag & LA_HWCAP_LSX)
return CPU_LOONGSON2K1000; return CPU_LOONGSON2K1000;
else else
return CPU_GENERIC; return CPU_GENERIC;

View File

@ -1551,6 +1551,7 @@ int get_cpuname(void){
case 7: // Raptor Lake case 7: // Raptor Lake
case 10: case 10:
case 15: case 15:
case 14: // Alder Lake N
if(support_avx2()) if(support_avx2())
return CPUTYPE_HASWELL; return CPUTYPE_HASWELL;
if(support_avx()) if(support_avx())
@ -2360,6 +2361,7 @@ int get_coretype(void){
case 7: // Raptor Lake case 7: // Raptor Lake
case 10: case 10:
case 15: case 15:
case 14: // Alder Lake N
#ifndef NO_AVX2 #ifndef NO_AVX2
if(support_avx2()) if(support_avx2())
return CORE_HASWELL; return CORE_HASWELL;

View File

@ -208,7 +208,7 @@ FLDFLAGS = $(FFLAGS:-fPIC=) $(LDFLAGS)
ifeq ($(USE_OPENMP), 1) ifeq ($(USE_OPENMP), 1)
ifeq ($(F_COMPILER), GFORTRAN) ifeq ($(F_COMPILER), GFORTRAN)
ifeq ($(C_COMPILER), CLANG) ifeq ($(C_COMPILER), CLANG)
CEXTRALIB = -lomp CEXTRALIB += -lomp
endif endif
endif endif
ifeq ($(F_COMPILER), NAG) ifeq ($(F_COMPILER), NAG)

270
docs/distributing.md Normal file
View File

@ -0,0 +1,270 @@
# Guidance for redistributing OpenBLAS
*We note that this document contains recommendations only - packagers and other
redistributors are in charge of how OpenBLAS is built and distributed in their
systems, and may have good reasons to deviate from the guidance given on this
page. These recommendations are aimed at general packaging systems, with a user
base that typically is large, open source (or freely available at least), and
that neither behaves uniformly nor is directly connected with the packager.*
OpenBLAS has a large number of build-time options which can be used to change
how it behaves at runtime, how artifacts or symbols are named, etc. Variation
in build configuration can be necessary to achieve a given end goal within a
distribution or as an end user. However, such variation can also make it more
difficult to build on top of OpenBLAS and ship code or other packages in a way
that works across many different distros. Here we provide guidance about the
most important build options, what effects they may have when changed, and
which ones to default to.
The Make and CMake build systems provide equivalent options and yield more or
less the same artifacts, but not exactly (the CMake builds are still
experimental). You can choose either one and the options will function in the
same way, however the CMake outputs may require some renaming. To review
available build options, see `Makefile.rule` or `CMakeLists.txt` in the root of
the repository.
Build options typically fall into two categories: (a) options that affect the
user interface, such as library and symbol names or APIs that are made
available, and (b) options that affect performance and runtime behavior, such
as threading behavior or CPU architecture-specific code paths. The user
interface options are more important to keep aligned between distributions,
while for the performance-related options there are typically more reasons to
make choices that deviate from the defaults.
Here are recommendations for user interface related packaging choices where it
is not likely to be a good idea to deviate (typically these are the default
settings):
1. Include CBLAS. The CBLAS interface is widely used and it doesn't affect
binary size much, so don't turn it off.
2. Include LAPACK and LAPACKE. The LAPACK interface is also widely used, and
while it does make up a significant part of the binary size of the installed
library, that does not outweigh the regression in usability when deviating
from the default here.[^1]
3. Always distribute the pkg-config (`.pc`) and CMake (`.cmake`) dependency
detection files. These files are used by build systems when users want to
link against OpenBLAS, and there is no benefit of leaving them out.
4. Provide the LP64 interface by default, and if in addition to that you choose
to provide an ILP64 interface build as well, use a symbol suffix to avoid
symbol name clashes (see the next section).
[^1]: All major distributions do include LAPACK as of mid 2023 as far as we
know. Older versions of Arch Linux did not, and that was known to cause
problems.
## ILP64 interface builds
The LP64 (32-bit integer) interface is the default build, and has
well-established C and Fortran APIs as determined by the reference (Netlib)
BLAS and LAPACK libraries. The ILP64 (64-bit integer) interface however does
not have a standard API: symbol names and shared/static library names can be
produced in multiple ways, and this tends to make it difficult to use.
As of today there is an agreed-upon way of choosing names for OpenBLAS between
a number of key users/redistributors, which is the closest thing to a standard
that there is now. However, there is an ongoing standardization effort in the
reference BLAS and LAPACK libraries, which differs from the current OpenBLAS
agreed-upon convention. In this section we'll aim to explain both.
Those two methods are fairly similar, and have a key thing in common: *using a
symbol suffix*. This is good practice; if you distribute an ILP64 build, it is
recommended that it use a symbol suffix containing `64` in the name.
This avoids potential symbol clashes when different packages which depend on
OpenBLAS load both an LP64 and an ILP64 library into memory at the same time.
### The current OpenBLAS agreed-upon ILP64 convention
This convention comprises the shared library name and the symbol suffix in the
shared library. The symbol suffix to use is `64_`, implying that the library
name will be `libopenblas64_.so` and the symbols in that library end in `64_`.
The central issue where this was discussed is
[openblas#646](https://github.com/xianyi/OpenBLAS/issues/646), and adopters
include Fedora, Julia, NumPy and SciPy - SuiteSparse already used it as well.
To build shared and static libraries with the currently recommended ILP64
conventions with Make:
```bash
$ make INTERFACE64=1 SYMBOLSUFFIX=64_
```
This will produce libraries named `libopenblas64_.so|a`, a pkg-config file
named `openblas64.pc`, and CMake and header files.
Installing locally and inspecting the output will show a few more details:
```bash
$ make install PREFIX=$PWD/../openblas/make64 INTERFACE64=1 SYMBOLSUFFIX=64_
$ tree . # output slightly edited down
.
├── include
│   ├── cblas.h
│   ├── f77blas.h
│   ├── lapacke_config.h
│   ├── lapacke.h
│   ├── lapacke_mangling.h
│   ├── lapacke_utils.h
│   ├── lapack.h
│   └── openblas_config.h
└── lib
├── cmake
│   └── openblas
│   ├── OpenBLASConfig.cmake
│   └── OpenBLASConfigVersion.cmake
├── libopenblas64_.a
├── libopenblas64_.so
└── pkgconfig
└── openblas64.pc
```
The symbol names are a key point. These will equal the LP64 symbol names, then
(for Fortran only) the compiler mangling, and then the `64_` symbol suffix.
Hence to obtain the final symbol names, we need to take into account which
Fortran compiler we are using. For the most common cases (e.g., gfortran, Intel
Fortran, or Flang), that means appending a single underscore. In that case, the
result is:
| base API name | binary symbol name | call from Fortran code | call from C code |
|---------------|--------------------|------------------------|-----------------------|
| `dgemm` | `dgemm_64_` | `dgemm_64(...)` | `dgemm_64_(...)` |
| `cblas_dgemm` | `cblas_dgemm64_` | n/a | `cblas_dgemm64_(...)` |
It is quite useful to have these symbol names be as uniform as possible across
different packaging systems.
The equivalent build options with CMake are:
```bash
$ mkdir build && cd build
$ cmake .. -DINTERFACE64=1 -DSYMBOLSUFFIX=64_ -DBUILD_SHARED_LIBS=ON -DBUILD_STATIC_LIBS=ON
$ cmake --build . -j
```
Note that the result is not 100% identical to the Make result. For example, the
library name ends in `_64` rather than `64_` - it is recommended to rename them
to match the Make library names (also update the `libsuffix` entry in
`openblas64.pc` to match that rename).
```bash
$ cmake --install . --prefix $PWD/../../openblas/cmake64
$ tree .
.
├── include
│   └── openblas64
│   ├── cblas.h
│   ├── f77blas.h
│   ├── lapacke_config.h
│   ├── lapacke_example_aux.h
│   ├── lapacke.h
│   ├── lapacke_mangling.h
│   ├── lapacke_utils.h
│   ├── lapack.h
│   ├── openblas64
│   │   └── lapacke_mangling.h
│   └── openblas_config.h
└── lib
├── cmake
│   └── OpenBLAS64
│   ├── OpenBLAS64Config.cmake
│   ├── OpenBLAS64ConfigVersion.cmake
│   ├── OpenBLAS64Targets.cmake
│   └── OpenBLAS64Targets-noconfig.cmake
├── libopenblas_64.a
├── libopenblas_64.so -> libopenblas_64.so.0
└── pkgconfig
└── openblas64.pc
```
### The upcoming standardized ILP64 convention
While the `64_` convention above got some adoption, it's slightly hacky and is
implemented through the use of `objcopy`. An effort is ongoing for a more
broadly adopted convention in the reference BLAS and LAPACK libraries, using
(a) the `_64` suffix, and (b) applying that suffix _before_ rather than after
Fortran compiler mangling. The central issue for this is
[lapack#666](https://github.com/Reference-LAPACK/lapack/issues/666).
For the most common cases of compiler mangling (a single `_` appended), the end
result will be:
| base API name | binary symbol name | call from Fortran code | call from C code |
|---------------|--------------------|------------------------|-----------------------|
| `dgemm` | `dgemm_64_` | `dgemm_64(...)` | `dgemm_64_(...)` |
| `cblas_dgemm` | `cblas_dgemm_64` | n/a | `cblas_dgemm_64(...)` |
For other compiler mangling schemes, replace the trailing `_` by the scheme in use.
The shared library name for this `_64` convention should be `libopenblas_64.so`.
Note: it is not yet possible to produce an OpenBLAS build which employs this
convention! Once reference BLAS and LAPACK with support for `_64` have been
released, a future OpenBLAS release will support it. For now, please use the
older `64_` scheme and avoid using the name `libopenblas_64.so`; it should be
considered reserved for future use of the `_64` standard as prescribed by
reference BLAS/LAPACK.
## Performance and runtime behavior related build options
For these options there are multiple reasonable or common choices.
### Threading related options
OpenBLAS can be built as a multi-threaded or single-threaded library, with the
default being multi-threaded. It's expected that the default `libopenblas`
library is multi-threaded; if you'd like to also distribute single-threaded
builds, consider naming them `libopenblas_sequential`.
OpenBLAS can be built with pthreads or OpenMP as the threading model, with the
default being pthreads. Both options are commonly used, and the choice here
should not influence the shared library name. The choice will be captured by
the `.pc` file, for example:
```bash
$ pkg-config --libs openblas
-fopenmp -lopenblas
$ cat openblas.pc
...
openblas_config= ... USE_OPENMP=0 MAX_THREADS=24
```
The maximum number of threads users will be able to use is determined at build
time by the `NUM_THREADS` build option. It defaults to 24, and there's a wide
range of values that are reasonable to use (up to 256). 64 is a typical choice
here; there is a memory footprint penalty that is linear in `NUM_THREADS`.
Please see `Makefile.rule` for more details.
### CPU architecture related options
OpenBLAS contains a lot of CPU architecture-specific optimizations, hence when
distributing to a user base with a variety of hardware, it is recommended to
enable CPU architecture runtime detection. This will dynamically select
optimized kernels for individual APIs. To do this, use the `DYNAMIC_ARCH=1`
build option. This is usually done on all common CPU families, except when
there are known issues.
In case the CPU architecture is known (e.g. you're building binaries for macOS
M1 users), it is possible to specify the target architecture directly with the
`TARGET=` build option.
`DYNAMIC_ARCH` and `TARGET` are covered in more detail in the main `README.md`
in this repository.
## Real-world examples
OpenBLAS is likely to be distributed in one of these distribution models:
1. As a standalone package, or multiple packages, in a packaging ecosystem like
a Linux distro, Homebrew, conda-forge or MSYS2.
2. Vendored as part of a larger package, e.g. in Julia, NumPy, SciPy, or R.
3. Locally, e.g. making available as a build on a single HPC cluster.
The guidance on this page is most important for models (1) and (2). These links
to build recipes for a representative selection of packaging systems may be
helpful as a reference:
- [Fedora](https://src.fedoraproject.org/rpms/openblas/blob/rawhide/f/openblas.spec)
- [Debian](https://salsa.debian.org/science-team/openblas/-/blob/master/debian/rules)
- [Homebrew](https://github.com/Homebrew/homebrew-core/blob/HEAD/Formula/openblas.rb)
- [MSYS2](https://github.com/msys2/MINGW-packages/blob/master/mingw-w64-openblas/PKGBUILD)
- [conda-forge](https://github.com/conda-forge/openblas-feedstock/blob/main/recipe/build.sh)
- [NumPy/SciPy](https://github.com/MacPython/openblas-libs/blob/main/tools/build_openblas.sh)
- [Nixpkgs](https://github.com/NixOS/nixpkgs/blob/master/pkgs/development/libraries/science/math/openblas/default.nix)

View File

@ -973,7 +973,7 @@ void goto_set_num_threads(int num_threads) {
increased_threads = 1; increased_threads = 1;
for(i = blas_num_threads - 1; i < num_threads - 1; i++){ for(i = (blas_num_threads > 0) ? blas_num_threads - 1 : 0; i < num_threads - 1; i++){
atomic_store_queue(&thread_status[i].queue, (blas_queue_t *)0); atomic_store_queue(&thread_status[i].queue, (blas_queue_t *)0);
thread_status[i].status = THREAD_STATUS_WAKEUP; thread_status[i].status = THREAD_STATUS_WAKEUP;

View File

@ -68,6 +68,7 @@
#endif #endif
int blas_server_avail = 0; int blas_server_avail = 0;
int blas_omp_number_max = 0;
extern int openblas_omp_adaptive_env(); extern int openblas_omp_adaptive_env();
@ -100,8 +101,6 @@ static void adjust_thread_buffers() {
void goto_set_num_threads(int num_threads) { void goto_set_num_threads(int num_threads) {
blas_num_threads_set = 1;
if (num_threads < 0) blas_num_threads_set = 0;
if (num_threads < 1) num_threads = blas_num_threads; if (num_threads < 1) num_threads = blas_num_threads;
if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER; if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER;
@ -125,6 +124,8 @@ void openblas_set_num_threads(int num_threads) {
} }
int blas_thread_init(void){ int blas_thread_init(void){
if(blas_omp_number_max <= 0)
blas_omp_number_max = omp_get_max_threads();
blas_get_cpu_number(); blas_get_cpu_number();

View File

@ -568,7 +568,7 @@ void goto_set_num_threads(int num_threads)
blas_server_avail = 1; blas_server_avail = 1;
} }
for(i = blas_num_threads - 1; i < num_threads - 1; i++){ for(i = (blas_num_threads > 0) ? blas_num_threads - 1 : 0; i < num_threads - 1; i++){
blas_threads[i] = CreateThread(NULL, 0, blas_threads[i] = CreateThread(NULL, 0,
blas_thread_server, (void *)i, blas_thread_server, (void *)i,

View File

@ -1,5 +1,6 @@
/*********************************************************************/ /*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */ /* Copyright 2009, 2010 The University of Texas at Austin. */
/* Copyright 2023 The OpenBLAS Project */
/* All rights reserved. */ /* All rights reserved. */
/* */ /* */
/* Redistribution and use in source and binary forms, with or */ /* Redistribution and use in source and binary forms, with or */
@ -109,6 +110,11 @@ extern gotoblas_t gotoblas_NEOVERSEN2;
#else #else
#define gotoblas_NEOVERSEN2 gotoblas_ARMV8 #define gotoblas_NEOVERSEN2 gotoblas_ARMV8
#endif #endif
#ifdef DYN_ARMV8SVE
extern gotoblas_t gotoblas_ARMV8SVE;
#else
#define gotoblas_ARMV8SVE gotoblas_ARMV8
#endif
#ifdef DYN_CORTEX_A55 #ifdef DYN_CORTEX_A55
extern gotoblas_t gotoblas_CORTEXA55; extern gotoblas_t gotoblas_CORTEXA55;
#else #else
@ -128,17 +134,21 @@ extern gotoblas_t gotoblas_NEOVERSEN1;
#ifndef NO_SVE #ifndef NO_SVE
extern gotoblas_t gotoblas_NEOVERSEV1; extern gotoblas_t gotoblas_NEOVERSEV1;
extern gotoblas_t gotoblas_NEOVERSEN2; extern gotoblas_t gotoblas_NEOVERSEN2;
extern gotoblas_t gotoblas_ARMV8SVE;
#else #else
#define gotoblas_NEOVERSEV1 gotoblas_ARMV8 #define gotoblas_NEOVERSEV1 gotoblas_ARMV8
#define gotoblas_NEOVERSEN2 gotoblas_ARMV8 #define gotoblas_NEOVERSEN2 gotoblas_ARMV8
#define gotoblas_ARMV8SVE gotoblas_ARMV8
#endif #endif
extern gotoblas_t gotoblas_THUNDERX3T110; extern gotoblas_t gotoblas_THUNDERX3T110;
extern gotoblas_t gotoblas_CORTEXA55; extern gotoblas_t gotoblas_CORTEXA55;
#endif #endif
extern void openblas_warning(int verbose, const char * msg); extern void openblas_warning(int verbose, const char * msg);
#define FALLBACK_VERBOSE 1
#define NEOVERSEN1_FALLBACK "OpenBLAS : Your OS does not support SVE instructions. OpenBLAS is using Neoverse N1 kernels as a fallback, which may give poorer performance.\n"
#define NUM_CORETYPES 13 #define NUM_CORETYPES 16
/* /*
* In case asm/hwcap.h is outdated on the build system, make sure * In case asm/hwcap.h is outdated on the build system, make sure
@ -147,6 +157,9 @@ extern void openblas_warning(int verbose, const char * msg);
#ifndef HWCAP_CPUID #ifndef HWCAP_CPUID
#define HWCAP_CPUID (1 << 11) #define HWCAP_CPUID (1 << 11)
#endif #endif
#ifndef HWCAP_SVE
#define HWCAP_SVE (1 << 22)
#endif
#define get_cpu_ftr(id, var) ({ \ #define get_cpu_ftr(id, var) ({ \
__asm__ __volatile__ ("mrs %0, "#id : "=r" (var)); \ __asm__ __volatile__ ("mrs %0, "#id : "=r" (var)); \
@ -168,6 +181,7 @@ static char *corename[] = {
"neoversen2", "neoversen2",
"thunderx3t110", "thunderx3t110",
"cortexa55", "cortexa55",
"armv8sve",
"unknown" "unknown"
}; };
@ -187,6 +201,7 @@ char *gotoblas_corename(void) {
if (gotoblas == &gotoblas_NEOVERSEN2) return corename[12]; if (gotoblas == &gotoblas_NEOVERSEN2) return corename[12];
if (gotoblas == &gotoblas_THUNDERX3T110) return corename[13]; if (gotoblas == &gotoblas_THUNDERX3T110) return corename[13];
if (gotoblas == &gotoblas_CORTEXA55) return corename[14]; if (gotoblas == &gotoblas_CORTEXA55) return corename[14];
if (gotoblas == &gotoblas_ARMV8SVE) return corename[15];
return corename[NUM_CORETYPES]; return corename[NUM_CORETYPES];
} }
@ -221,6 +236,7 @@ static gotoblas_t *force_coretype(char *coretype) {
case 12: return (&gotoblas_NEOVERSEN2); case 12: return (&gotoblas_NEOVERSEN2);
case 13: return (&gotoblas_THUNDERX3T110); case 13: return (&gotoblas_THUNDERX3T110);
case 14: return (&gotoblas_CORTEXA55); case 14: return (&gotoblas_CORTEXA55);
case 15: return (&gotoblas_ARMV8SVE);
} }
snprintf(message, 128, "Core not found: %s\n", coretype); snprintf(message, 128, "Core not found: %s\n", coretype);
openblas_warning(1, message); openblas_warning(1, message);
@ -281,9 +297,17 @@ static gotoblas_t *get_coretype(void) {
return &gotoblas_NEOVERSEN1; return &gotoblas_NEOVERSEN1;
#ifndef NO_SVE #ifndef NO_SVE
case 0xd49: case 0xd49:
return &gotoblas_NEOVERSEN2; if (!(getauxval(AT_HWCAP) & HWCAP_SVE)) {
openblas_warning(FALLBACK_VERBOSE, NEOVERSEN1_FALLBACK);
return &gotoblas_NEOVERSEN1;
} else
return &gotoblas_NEOVERSEN2;
case 0xd40: case 0xd40:
return &gotoblas_NEOVERSEV1; if (!(getauxval(AT_HWCAP) & HWCAP_SVE)) {
openblas_warning(FALLBACK_VERBOSE, NEOVERSEN1_FALLBACK);
return &gotoblas_NEOVERSEN1;
}else
return &gotoblas_NEOVERSEV1;
#endif #endif
case 0xd05: // Cortex A55 case 0xd05: // Cortex A55
return &gotoblas_CORTEXA55; return &gotoblas_CORTEXA55;
@ -332,6 +356,12 @@ static gotoblas_t *get_coretype(void) {
snprintf(coremsg, 128, "Unknown CPU model - implementer %x part %x\n",implementer,part); snprintf(coremsg, 128, "Unknown CPU model - implementer %x part %x\n",implementer,part);
openblas_warning(1, coremsg); openblas_warning(1, coremsg);
} }
#ifndef NO_SVE
if ((getauxval(AT_HWCAP) & HWCAP_SVE)) {
return &gotoblas_ARMV8SVE;
}
#endif
return NULL; return NULL;
#endif #endif
} }

View File

@ -422,8 +422,6 @@ This value is equal or large than blas_cpu_number. This means some threads are s
*/ */
int blas_num_threads = 0; int blas_num_threads = 0;
int blas_num_threads_set = 0;
int goto_get_num_procs (void) { int goto_get_num_procs (void) {
return blas_cpu_number; return blas_cpu_number;
} }
@ -1996,8 +1994,6 @@ This value is equal or large than blas_cpu_number. This means some threads are s
*/ */
int blas_num_threads = 0; int blas_num_threads = 0;
int blas_num_threads_set = 0;
int goto_get_num_procs (void) { int goto_get_num_procs (void) {
return blas_cpu_number; return blas_cpu_number;
} }

View File

@ -283,7 +283,6 @@ The numbers of threads in the thread pool.
This value is equal or large than blas_cpu_number. This means some threads are sleep. This value is equal or large than blas_cpu_number. This means some threads are sleep.
*/ */
int blas_num_threads = 0; int blas_num_threads = 0;
int blas_num_threads_set = 0;
int goto_get_num_procs (void) { int goto_get_num_procs (void) {
return blas_cpu_number; return blas_cpu_number;

View File

@ -101,7 +101,14 @@ else
*flang*) *flang*)
vendor=FLANG vendor=FLANG
openmp='-fopenmp' openmp='-fopenmp'
;; data=`$compiler -v 2>&1 > /dev/null `
v="${data#*version *}"
v="${v%%*.}"
major="${v%%.*}"
if [ "$major" -ge 17 ]; then
vendor=FLANGNEW
fi
;;
*ifort*|*ifx*) *ifort*|*ifx*)
vendor=INTEL vendor=INTEL
openmp='-fopenmp' openmp='-fopenmp'

View File

@ -68,7 +68,7 @@ void NAME(blasint *M, blasint *N, FLOAT *ALPHA, FLOAT *a, blasint *LDA,
info = 0; info = 0;
if (lda < MAX(1, m)) info = 6; if (lda < MAX(1, m)) info = 5;
if (ldc < MAX(1, m)) info = 8; if (ldc < MAX(1, m)) info = 8;
if (n < 0) info = 2; if (n < 0) info = 2;

View File

@ -54,6 +54,21 @@ FLOATRET NAME(blasint *N, FLOAT *x, blasint *INCX){
if (n <= 0) return 0.; if (n <= 0) return 0.;
#ifndef COMPLEX
if (n == 1)
#ifdef DOUBLE
return fabs(x[0]);
#else
return fabsf(x[0]);
#endif
#endif
if (incx < 0)
#ifdef COMPLEX
x -= (n - 1) * incx * 2;
#else
x -= (n - 1) * incx;
#endif
IDEBUG_START; IDEBUG_START;
FUNCTION_PROFILE_START(); FUNCTION_PROFILE_START();
@ -82,6 +97,22 @@ FLOAT CNAME(blasint n, FLOAT *x, blasint incx){
if (n <= 0) return 0.; if (n <= 0) return 0.;
#ifndef COMPLEX
if (n == 1)
#ifdef DOUBLE
return fabs(x[0]);
#else
return fabsf(x[0]);
#endif
#endif
if (incx < 0)
#ifdef COMPLEX
x -= (n - 1) * incx * 2;
#else
x -= (n - 1) * incx;
#endif
IDEBUG_START; IDEBUG_START;
FUNCTION_PROFILE_START(); FUNCTION_PROFILE_START();

View File

@ -33,7 +33,7 @@ endif
ifdef TARGET_CORE ifdef TARGET_CORE
ifeq ($(TARGET_CORE), SAPPHIRERAPIDS) ifeq ($(TARGET_CORE), SAPPHIRERAPIDS)
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE)
ifeq ($(GCCVERSIONGTEQ11), 1) ifeq (1, $(filter 1,$(GCCVERSIONGTEQ11) $(CLANGVERSIONGTEQ12)))
override CFLAGS += -march=sapphirerapids override CFLAGS += -march=sapphirerapids
else else
override CFLAGS += -march=skylake-avx512 -mavx512f override CFLAGS += -march=skylake-avx512 -mavx512f
@ -48,7 +48,7 @@ ifeq ($(TARGET_CORE), SAPPHIRERAPIDS)
endif endif
else ifeq ($(TARGET_CORE), COOPERLAKE) else ifeq ($(TARGET_CORE), COOPERLAKE)
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE)
ifeq ($(GCCVERSIONGTEQ10), 1) ifeq (1, $(filter 1,$(GCCVERSIONGTEQ10) $(CLANGVERSIONGTEQ9)))
override CFLAGS += -march=cooperlake override CFLAGS += -march=cooperlake
else else
override CFLAGS += -march=skylake-avx512 -mavx512f override CFLAGS += -march=skylake-avx512 -mavx512f
@ -77,6 +77,12 @@ else ifeq ($(TARGET_CORE), ZEN)
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) $(AVX2OPT) override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) $(AVX2OPT)
else ifeq ($(TARGET_CORE), LOONGSON3R4) else ifeq ($(TARGET_CORE), LOONGSON3R4)
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) $(MSA_FLAGS) override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) $(MSA_FLAGS)
else ifneq ($(filter NEOVERSEN2 NEOVERSEV1, $(TARGET_CORE)),)
ifeq ($(C_COMPILER), PGI)
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -Msve_intrinsics
else
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE)
endif
else else
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE)
endif endif

View File

@ -35,6 +35,12 @@ USE_TRMM = 1
endif endif
endif endif
ifneq ($(DYNAMIC_ARCH), 1)
ifeq ($(TARGET), MIPS64_GENERIC)
USE_TRMM = 1
endif
endif
ifeq ($(CORE), HASWELL) ifeq ($(CORE), HASWELL)
USE_TRMM = 1 USE_TRMM = 1
endif endif

View File

@ -57,7 +57,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
FLOAT absxi = 0.0; FLOAT absxi = 0.0;
if (n <= 0 || inc_x <= 0) return(0.0); if (n <= 0 || inc_x == 0) return(0.0);
if ( n == 1 ) return( ABS(x[0]) ); if ( n == 1 ) return( ABS(x[0]) );
n *= inc_x; n *= inc_x;

View File

@ -57,7 +57,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
BLASLONG inc_x2; BLASLONG inc_x2;
FLOAT temp; FLOAT temp;
if (n <= 0 || inc_x <= 0) return(0.0); if (n <= 0 || inc_x == 0) return(0.0);
inc_x2 = 2 * inc_x; inc_x2 = 2 * inc_x;

View File

@ -57,7 +57,7 @@ CAMAXKERNEL = zamax.S
ZAMAXKERNEL = zamax.S ZAMAXKERNEL = zamax.S
SAXPYKERNEL = axpy.S SAXPYKERNEL = axpy.S
DAXPYKERNEL = axpy.S DAXPYKERNEL = daxpy_thunderx2t99.S
CAXPYKERNEL = zaxpy.S CAXPYKERNEL = zaxpy.S
ZAXPYKERNEL = zaxpy.S ZAXPYKERNEL = zaxpy.S
@ -81,45 +81,35 @@ DGEMVTKERNEL = gemv_t.S
CGEMVTKERNEL = zgemv_t.S CGEMVTKERNEL = zgemv_t.S
ZGEMVTKERNEL = zgemv_t.S ZGEMVTKERNEL = zgemv_t.S
SASUMKERNEL = sasum_thunderx2t99.c
DASUMKERNEL = dasum_thunderx2t99.c
CASUMKERNEL = casum_thunderx2t99.c
ZASUMKERNEL = zasum_thunderx2t99.c
SASUMKERNEL = asum.S SCOPYKERNEL = copy_thunderx2t99.c
DASUMKERNEL = asum.S DCOPYKERNEL = copy_thunderx2t99.c
CASUMKERNEL = casum.S CCOPYKERNEL = copy_thunderx2t99.c
ZASUMKERNEL = zasum.S ZCOPYKERNEL = copy_thunderx2t99.c
SCOPYKERNEL = copy.S SSWAPKERNEL = swap_thunderx2t99.S
DCOPYKERNEL = copy.S DSWAPKERNEL = swap_thunderx2t99.S
CCOPYKERNEL = copy.S CSWAPKERNEL = swap_thunderx2t99.S
ZCOPYKERNEL = copy.S ZSWAPKERNEL = swap_thunderx2t99.S
SSWAPKERNEL = swap.S ISAMAXKERNEL = iamax_thunderx2t99.c
DSWAPKERNEL = swap.S IDAMAXKERNEL = iamax_thunderx2t99.c
CSWAPKERNEL = swap.S ICAMAXKERNEL = izamax_thunderx2t99.c
ZSWAPKERNEL = swap.S IZAMAXKERNEL = izamax_thunderx2t99.c
ISAMAXKERNEL = iamax.S SNRM2KERNEL = scnrm2_thunderx2t99.c
IDAMAXKERNEL = iamax.S DNRM2KERNEL = dznrm2_thunderx2t99.c
ICAMAXKERNEL = izamax.S CNRM2KERNEL = scnrm2_thunderx2t99.c
IZAMAXKERNEL = izamax.S ZNRM2KERNEL = dznrm2_thunderx2t99.c
SNRM2KERNEL = nrm2.S DDOTKERNEL = dot.c
DNRM2KERNEL = nrm2.S SDOTKERNEL = dot.c
CNRM2KERNEL = znrm2.S CDOTKERNEL = zdot_thunderx2t99.c
ZNRM2KERNEL = znrm2.S ZDOTKERNEL = zdot_thunderx2t99.c
DDOTKERNEL = dot.S
ifneq ($(C_COMPILER), PGI)
SDOTKERNEL = ../generic/dot.c
else
SDOTKERNEL = dot.S
endif
ifneq ($(C_COMPILER), PGI)
CDOTKERNEL = zdot.S
ZDOTKERNEL = zdot.S
else
CDOTKERNEL = ../arm/zdot.c
ZDOTKERNEL = ../arm/zdot.c
endif
DSDOTKERNEL = dot.S DSDOTKERNEL = dot.S
DGEMM_BETA = dgemm_beta.S DGEMM_BETA = dgemm_beta.S
@ -170,8 +160,8 @@ DSYMMLCOPY_M = symm_lcopy_sve.c
CGEMMKERNEL = cgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S CGEMMKERNEL = cgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
CTRMMKERNEL = ctrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S CTRMMKERNEL = ctrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
CGEMMINCOPY = cgemm_ncopy_sve_v1.c CGEMMINCOPY = gemm_ncopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c
CGEMMITCOPY = cgemm_tcopy_sve_v1.c CGEMMITCOPY = gemm_tcopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c
CGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c CGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
CGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c CGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
@ -194,8 +184,8 @@ CSYMMLCOPY_M = zsymm_lcopy_sve.c
ZGEMMKERNEL = zgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S ZGEMMKERNEL = zgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
ZTRMMKERNEL = ztrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S ZTRMMKERNEL = ztrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
ZGEMMINCOPY = zgemm_ncopy_sve_v1.c ZGEMMINCOPY = gemm_ncopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c
ZGEMMITCOPY = zgemm_tcopy_sve_v1.c ZGEMMITCOPY = gemm_tcopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c
ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c

View File

@ -1,98 +1 @@
include $(KERNELDIR)/KERNEL.ARMV8SVE include $(KERNELDIR)/KERNEL.ARMV8SVE
DAXPYKERNEL = daxpy_thunderx2t99.S
SASUMKERNEL = sasum_thunderx2t99.c
DASUMKERNEL = dasum_thunderx2t99.c
CASUMKERNEL = casum_thunderx2t99.c
ZASUMKERNEL = zasum_thunderx2t99.c
SCOPYKERNEL = copy_thunderx2t99.c
DCOPYKERNEL = copy_thunderx2t99.c
CCOPYKERNEL = copy_thunderx2t99.c
ZCOPYKERNEL = copy_thunderx2t99.c
SSWAPKERNEL = swap_thunderx2t99.S
DSWAPKERNEL = swap_thunderx2t99.S
CSWAPKERNEL = swap_thunderx2t99.S
ZSWAPKERNEL = swap_thunderx2t99.S
ISAMAXKERNEL = iamax_thunderx2t99.c
IDAMAXKERNEL = iamax_thunderx2t99.c
ICAMAXKERNEL = izamax_thunderx2t99.c
IZAMAXKERNEL = izamax_thunderx2t99.c
SNRM2KERNEL = scnrm2_thunderx2t99.c
DNRM2KERNEL = dznrm2_thunderx2t99.c
CNRM2KERNEL = scnrm2_thunderx2t99.c
ZNRM2KERNEL = dznrm2_thunderx2t99.c
DDOTKERNEL = dot.c
SDOTKERNEL = dot.c
CDOTKERNEL = zdot_thunderx2t99.c
ZDOTKERNEL = zdot_thunderx2t99.c
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
CTRMMUNCOPY_M =
CTRMMLNCOPY_M =
CTRMMUTCOPY_M =
CTRMMLTCOPY_M =
CHEMMLTCOPY_M =
CHEMMUTCOPY_M =
CSYMMUCOPY_M =
CSYMMLCOPY_M =
CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c
CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
else
CGEMMINCOPYOBJ =
CGEMMITCOPYOBJ =
endif
CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c
CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
ZTRSMCOPYLN_M =
ZTRSMCOPYLT_M =
ZTRSMCOPYUN_M =
ZTRSMCOPYUT_M =
ZTRMMUNCOPY_M =
ZTRMMLNCOPY_M =
ZTRMMUTCOPY_M =
ZTRMMLTCOPY_M =
ZHEMMLTCOPY_M =
ZHEMMUTCOPY_M =
ZSYMMUCOPY_M =
ZSYMMLCOPY_M =
ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N))
ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c
ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
else
ZGEMMINCOPYOBJ =
ZGEMMITCOPYOBJ =
endif
ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)

View File

@ -240,7 +240,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add pB, pB, 32 add pB, pB, 32
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
.endm .endm
.macro KERNELv1x4_M1 .macro KERNELv1x4_M1
@ -276,9 +275,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1rw z15.s, p0/z, [pB, 28] ld1rw z15.s, p0/z, [pB, 28]
add pB, pB, 32 add pB, pB, 32
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
.endm .endm
.macro KERNELv1x4_M2 .macro KERNELv1x4_M2
@ -313,11 +309,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ri z23.s, p1/m, z2.s, z15.s OP_ri z23.s, p1/m, z2.s, z15.s
ld1rw z15.s, p0/z, [pB, 28] ld1rw z15.s, p0/z, [pB, 28]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
add pB, pB, 32 add pB, pB, 32
prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
.endm .endm
.macro KERNELv1x4_E .macro KERNELv1x4_E
@ -341,10 +333,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ii z22.s, p1/m, z3.s, z15.s OP_ii z22.s, p1/m, z3.s, z15.s
OP_ri z23.s, p1/m, z2.s, z15.s OP_ri z23.s, p1/m, z2.s, z15.s
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
.endm .endm
.macro KERNELv1x4_SUB .macro KERNELv1x4_SUB
@ -383,13 +371,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ii z22.s, p1/m, z1.s, z15.s OP_ii z22.s, p1/m, z1.s, z15.s
OP_ri z23.s, p1/m, z0.s, z15.s OP_ri z23.s, p1/m, z0.s, z15.s
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
.endm .endm
.macro SAVEv1x4 .macro SAVEv1x4
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
ld2w {z24.s, z25.s}, p1/z, [pCRow0] ld2w {z24.s, z25.s}, p1/z, [pCRow0]
fmla z24.s, p1/m, z16.s, alphaz_R fmla z24.s, p1/m, z16.s, alphaz_R
fmls z24.s, p1/m, z17.s, alphaz_I fmls z24.s, p1/m, z17.s, alphaz_I
@ -407,8 +391,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
st2w {z26.s, z27.s}, p1, [pCRow1] st2w {z26.s, z27.s}, p1, [pCRow1]
add pCRow1, pCRow1, lanes, lsl #3 add pCRow1, pCRow1, lanes, lsl #3
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
ld2w {z28.s, z29.s}, p1/z, [pCRow2] ld2w {z28.s, z29.s}, p1/z, [pCRow2]
fmla z28.s, p1/m, z20.s, alphaz_R fmla z28.s, p1/m, z20.s, alphaz_R
fmls z28.s, p1/m, z21.s, alphaz_I fmls z28.s, p1/m, z21.s, alphaz_I
@ -425,12 +407,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fmla z31.s, p1/m, z23.s, alphaz_R fmla z31.s, p1/m, z23.s, alphaz_R
st2w {z30.s, z31.s}, p1, [pCRow3] st2w {z30.s, z31.s}, p1, [pCRow3]
prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
add pCRow3, pCRow3, lanes, lsl #3 // pC = pC + lanes * 2 *4 add pCRow3, pCRow3, lanes, lsl #3 // pC = pC + lanes * 2 *4
prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
.endm .endm
/******************************************************************************/ /******************************************************************************/
@ -466,8 +444,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro SAVEv1x2 .macro SAVEv1x2
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
ld2w {z24.s, z25.s}, p1/z, [pCRow0] ld2w {z24.s, z25.s}, p1/z, [pCRow0]
fmla z24.s, p1/m, z16.s, alphaz_R fmla z24.s, p1/m, z16.s, alphaz_R
fmls z24.s, p1/m, z17.s, alphaz_I fmls z24.s, p1/m, z17.s, alphaz_I
@ -485,10 +461,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
st2w {z26.s, z27.s}, p1, [pCRow1] st2w {z26.s, z27.s}, p1, [pCRow1]
add pCRow1, pCRow1, lanes, lsl #3 add pCRow1, pCRow1, lanes, lsl #3
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
.endm .endm
/******************************************************************************/ /******************************************************************************/
@ -516,8 +488,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro SAVEv1x1 .macro SAVEv1x1
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
ld2w {z24.s, z25.s}, p1/z, [pCRow0] ld2w {z24.s, z25.s}, p1/z, [pCRow0]
fmla z24.s, p1/m, z16.s, alphaz_R fmla z24.s, p1/m, z16.s, alphaz_R
fmls z24.s, p1/m, z17.s, alphaz_I fmls z24.s, p1/m, z17.s, alphaz_I
@ -527,8 +497,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 2 *4 add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 2 *4
prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
.endm .endm
/******************************************************************************/ /******************************************************************************/
@ -553,9 +521,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stp x26, x27, [sp, #(9 * 16)] stp x26, x27, [sp, #(9 * 16)]
str x28, [sp, #(10 * 16)] str x28, [sp, #(10 * 16)]
prfm PLDL1KEEP, [origPB]
prfm PLDL1KEEP, [origPA]
fmov alphaR, s0 fmov alphaR, s0
dup alphaz_R, alphaR dup alphaz_R, alphaR
fmov alphaI, s1 fmov alphaI, s1
@ -676,10 +641,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
bne .Lcgemm_kernel_L4_Mv1_46 bne .Lcgemm_kernel_L4_Mv1_46
.Lcgemm_kernel_L4_Mv1_100: .Lcgemm_kernel_L4_Mv1_100:
prfm PLDL1KEEP, [pA]
prfm PLDL1KEEP, [pA, #64]
prfm PLDL1KEEP, [origPB]
SAVEv1x4 SAVEv1x4
.Lcgemm_kernel_L4_Mv1_END: .Lcgemm_kernel_L4_Mv1_END:

View File

@ -1,5 +1,6 @@
/*********************************************************************/ /*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */ /* Copyright 2009, 2010 The University of Texas at Austin. */
/* Copyright 2023 The OpenBLAS Project */
/* All rights reserved. */ /* All rights reserved. */
/* */ /* */
/* Redistribution and use in source and binary forms, with or */ /* Redistribution and use in source and binary forms, with or */
@ -52,7 +53,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
boffset = b; boffset = b;
j = 0; j = 0;
svbool_t pg = svwhilelt_b32(j, n); svbool_t pg = svwhilelt_b32((uint64_t)j, (uint64_t)n);
uint32_t active = svcntp_b32(svptrue_b32(), pg); uint32_t active = svcntp_b32(svptrue_b32(), pg);
do { do {
@ -69,7 +70,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
aoffset += active * lda * 2; aoffset += active * lda * 2;
j += svcntw(); j += svcntw();
pg = svwhilelt_b32(j, n); pg = svwhilelt_b32((uint64_t)j, (uint64_t)n);
active = svcntp_b32(svptrue_b32(), pg); active = svcntp_b32(svptrue_b32(), pg);

View File

@ -1,5 +1,6 @@
/*********************************************************************/ /*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */ /* Copyright 2009, 2010 The University of Texas at Austin. */
/* Copyright 2023 The OpenBLAS Project */
/* All rights reserved. */ /* All rights reserved. */
/* */ /* */
/* Redistribution and use in source and binary forms, with or */ /* Redistribution and use in source and binary forms, with or */
@ -50,7 +51,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
boffset = b; boffset = b;
j = 0; j = 0;
svbool_t pg = svwhilelt_b32(j, n); svbool_t pg = svwhilelt_b32((uint64_t)j, (uint64_t)n);
uint32_t active = svcntp_b32(svptrue_b32(), pg); uint32_t active = svcntp_b32(svptrue_b32(), pg);
do { do {
@ -66,7 +67,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
aoffset += active * 2; aoffset += active * 2;
j += svcntw(); j += svcntw();
pg = svwhilelt_b32(j, n); pg = svwhilelt_b32((uint64_t)j, (uint64_t)n);
active = svcntp_b32(svptrue_b32(), pg); active = svcntp_b32(svptrue_b32(), pg);
} while (svptest_any(svptrue_b32(), pg)); } while (svptest_any(svptrue_b32(), pg));

View File

@ -50,8 +50,8 @@ static FLOAT dot_kernel_sve(BLASLONG n, FLOAT *x, FLOAT *y) {
BLASLONG sve_width = SVE_WIDTH; BLASLONG sve_width = SVE_WIDTH;
for (BLASLONG i = 0; i < n; i += sve_width * 2) { for (BLASLONG i = 0; i < n; i += sve_width * 2) {
svbool_t pg_a = SVE_WHILELT(i, n); svbool_t pg_a = SVE_WHILELT((uint64_t)i, (uint64_t)n);
svbool_t pg_b = SVE_WHILELT(i + sve_width, n); svbool_t pg_b = SVE_WHILELT((uint64_t)(i + sve_width), (uint64_t)n);
SVE_TYPE x_vec_a = svld1(pg_a, &x[i]); SVE_TYPE x_vec_a = svld1(pg_a, &x[i]);
SVE_TYPE y_vec_a = svld1(pg_a, &y[i]); SVE_TYPE y_vec_a = svld1(pg_a, &y[i]);

View File

@ -0,0 +1,121 @@
/***************************************************************************
Copyright (c) 2023, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdint.h>
#include <stdio.h>
#include <arm_sve.h>
#include "common.h"
#ifdef DOUBLE
#define COUNT "cntd"
#define SV_TYPE svfloat64_t
#define SV_INDEX svuint64_t
#define SV_INDEXER svindex_u64
#define SV_TRUE svptrue_b64
#define SV_WHILE svwhilelt_b64
#else
#define COUNT "cntw"
#define SV_TYPE svfloat32_t
#define SV_INDEX svuint32_t
#define SV_INDEXER svindex_u32
#define SV_TRUE svptrue_b32
#define SV_WHILE svwhilelt_b32
#endif
/*
 * Copy one complex element per active lane from A into the packed buffer.
 *
 * Two gather loads, indexed by lda_vec, fetch the values at a_offset_inner
 * (stored in a_vec_real) and at a_offset_inner + 1 (stored in a_vec_imag).
 * svst2 then writes the two vectors interleaved at b_offset under predicate
 * pg.  Afterwards a_offset_inner is advanced by 2 scalars and b_offset by
 * active * 2 scalars.
 *
 * The macro relies on lda_vec, a_vec_real and a_vec_imag being declared in
 * the enclosing scope.  The lda argument is accepted but not referenced by
 * the body (the stride is carried by lda_vec).
 *
 * The body is wrapped in do { } while (0) and every argument is
 * parenthesized so the macro expands to a single statement and stays
 * correct under an unbraced if/else or when given expression arguments.
 * Call sites must supply the terminating semicolon.
 */
#define INNER_COPY(pg, a_offset_inner, b_offset, lda, active) \
    do { \
        a_vec_real = svld1_gather_index((pg), (a_offset_inner), lda_vec); \
        a_vec_imag = svld1_gather_index((pg), (a_offset_inner) + 1, lda_vec); \
        svst2((pg), (b_offset), svcreate2(a_vec_real, a_vec_imag)); \
        (a_offset_inner) += 2; \
        (b_offset) += (active) * 2; \
    } while (0)
/*
 * Pack complex data from a into the contiguous buffer b using SVE gather
 * loads.
 *
 * The n dimension is processed in groups of sve_size lanes, followed by one
 * partially predicated group for whatever is left over.  Within a group,
 * lane k reads from an offset of k * lda * 2 scalars (see lda_vec), and the
 * read cursor is advanced by one complex element m times.  Each step writes
 * one interleaved vector pair to b.
 *
 * m, n  - iteration counts for the inner and outer dimension
 * a     - source data, interleaved real/imaginary scalars
 * lda   - leading dimension of a, counted in complex elements
 * b     - destination buffer, written sequentially
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) {
    /* Lanes per SVE vector for the element type, read with cntw/cntd. */
    uint64_t sve_size;
    asm(COUNT" %[SIZE_]" : [SIZE_] "=r" (sve_size) : : );

    IFLOAT *group_base = a;  /* start of the current lane group inside a */
    IFLOAT *src;             /* read cursor inside the current group */
    IFLOAT *dst = b;         /* write cursor inside the packed buffer */

    /* These three names are referenced directly by INNER_COPY. */
    SV_INDEX lda_vec = SV_INDEXER(0LL, lda * 2);
    SV_TYPE a_vec_real;
    SV_TYPE a_vec_imag;

    svbool_t all_lanes = SV_TRUE();
    uint64_t quads;
    BLASLONG col;

    /* Largest multiple of sve_size not exceeding n (mask form). */
    BLASLONG full_n = n & -sve_size;

    /* Groups that fill a whole vector: every lane is active. */
    for (col = 0; col < full_n; col += sve_size) {
        src = group_base;

        /* Four copies per iteration, then the m % 4 leftover copies. */
        for (quads = m >> 2; quads != 0; quads--) {
            INNER_COPY(all_lanes, src, dst, lda, sve_size);
            INNER_COPY(all_lanes, src, dst, lda, sve_size);
            INNER_COPY(all_lanes, src, dst, lda, sve_size);
            INNER_COPY(all_lanes, src, dst, lda, sve_size);
        }
        switch (m & 3) {
        case 3:
            INNER_COPY(all_lanes, src, dst, lda, sve_size);
            /* fall through */
        case 2:
            INNER_COPY(all_lanes, src, dst, lda, sve_size);
            /* fall through */
        case 1:
            INNER_COPY(all_lanes, src, dst, lda, sve_size);
            break;
        default:
            break;
        }

        group_base += sve_size * lda * 2;
    }

    /* Final partial group: only the first tail_n lanes are active. */
    BLASLONG tail_n = n - full_n;
    if (tail_n != 0) {
        svbool_t tail_pg = SV_WHILE((uint64_t)0L, (uint64_t)tail_n);
        uint64_t tail_lanes = tail_n;

        src = group_base;

        for (quads = m >> 2; quads != 0; quads--) {
            INNER_COPY(tail_pg, src, dst, lda, tail_lanes);
            INNER_COPY(tail_pg, src, dst, lda, tail_lanes);
            INNER_COPY(tail_pg, src, dst, lda, tail_lanes);
            INNER_COPY(tail_pg, src, dst, lda, tail_lanes);
        }
        switch (m & 3) {
        case 3:
            INNER_COPY(tail_pg, src, dst, lda, tail_lanes);
            /* fall through */
        case 2:
            INNER_COPY(tail_pg, src, dst, lda, tail_lanes);
            /* fall through */
        case 1:
            INNER_COPY(tail_pg, src, dst, lda, tail_lanes);
            break;
        default:
            break;
        }
    }

    return 0;
}

View File

@ -107,7 +107,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) {
BLASLONG remaining_n = n - single_vectors_n; BLASLONG remaining_n = n - single_vectors_n;
if (remaining_n) { if (remaining_n) {
a_offset_inner = a_offset; a_offset_inner = a_offset;
svbool_t pg = SV_WHILE(0L, remaining_n); svbool_t pg = SV_WHILE((uint64_t)0L, (uint64_t)remaining_n);
uint64_t active = remaining_n; uint64_t active = remaining_n;
uint64_t i_cnt = m >> 2; uint64_t i_cnt = m >> 2;
while (i_cnt--) { while (i_cnt--) {

View File

@ -0,0 +1,115 @@
/***************************************************************************
Copyright (c) 2023, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdint.h>
#include <stdio.h>
#include <arm_sve.h>
#include "common.h"
#ifdef DOUBLE
#define COUNT "cntd"
#define SV_TYPE svfloat64x2_t
#define SV_TRUE svptrue_b64
#define SV_WHILE svwhilelt_b64
#else
#define COUNT "cntw"
#define SV_TYPE svfloat32x2_t
#define SV_TRUE svptrue_b32
#define SV_WHILE svwhilelt_b32
#endif
/*
 * Copy one run of contiguous complex elements from A into the packed buffer.
 *
 * svld2 loads a de-interleaved vector pair from a_offset_inner under
 * predicate pg into a_vec, and svst2 writes it back interleaved at b_offset.
 * Afterwards a_offset_inner is advanced by lda * 2 scalars and b_offset by
 * active * 2 scalars.
 *
 * The macro relies on a_vec being declared in the enclosing scope.
 *
 * The body is wrapped in do { } while (0) and every argument is
 * parenthesized so the macro expands to a single statement and stays
 * correct under an unbraced if/else or when given expression arguments.
 * Call sites must supply the terminating semicolon.
 */
#define INNER_COPY(pg, a_offset_inner, b_offset, lda, active) \
    do { \
        a_vec = svld2((pg), (a_offset_inner)); \
        svst2((pg), (b_offset), a_vec); \
        (a_offset_inner) += (lda) * 2; \
        (b_offset) += (active) * 2; \
    } while (0)
/*
 * Pack complex data from a into the contiguous buffer b using contiguous
 * SVE structure loads (svld2) and stores (svst2).
 *
 * The n dimension is processed in groups of sve_size complex elements,
 * followed by one partially predicated group for whatever is left over.
 * Within a group the read cursor starts at a_offset and is advanced by
 * lda * 2 scalars after each copy, m times in total.  Each copy appends
 * `active` complex elements to b.
 *
 * m, n  - iteration counts for the inner and outer dimension
 * a     - source data, interleaved real/imaginary scalars
 * lda   - leading dimension of a, counted in complex elements
 * b     - destination buffer, written sequentially
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
    /*
     * Lanes per SVE vector for the element type, read with cntw/cntd.
     * No initializer: the asm below is the only writer, and svcntw() would
     * be the wrong lane count for DOUBLE builds anyway.
     */
    uint64_t sve_size;
    asm(COUNT" %[SIZE_]" : [SIZE_] "=r" (sve_size) : : );

    IFLOAT *a_offset, *a_offset_inner, *b_offset;
    a_offset = a;
    b_offset = b;

    /* Referenced directly by INNER_COPY. */
    SV_TYPE a_vec;
    svbool_t pg_true = SV_TRUE();

    /* Largest multiple of sve_size not exceeding n (mask form). */
    BLASLONG single_vectors_n = n & -sve_size;

    /* Groups that fill a whole vector: every lane is active. */
    for (BLASLONG j = 0; j < single_vectors_n; j += sve_size) {
        a_offset_inner = a_offset;

        svbool_t pg = pg_true;
        uint64_t active = sve_size;

        /* Four copies per iteration, then the m % 4 leftover copies. */
        uint64_t i_cnt = m >> 2;
        while (i_cnt--) {
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
        }

        if (m & 2) {
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
        }

        if (m & 1) {
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
        }

        /* Next group starts sve_size complex elements further on. */
        a_offset += sve_size * 2;
    }

    /* Final partial group: only the first remaining_n lanes are active. */
    BLASLONG remaining_n = n - single_vectors_n;
    if (remaining_n) {
        a_offset_inner = a_offset;
        svbool_t pg = SV_WHILE((uint64_t)0L, (uint64_t)remaining_n);
        uint64_t active = remaining_n;

        uint64_t i_cnt = m >> 2;
        while (i_cnt--) {
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
        }

        if (m & 2) {
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
        }

        if (m & 1) {
            INNER_COPY(pg, a_offset_inner, b_offset, lda, active);
        }
    }

    return 0;
}

View File

@ -100,7 +100,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
BLASLONG remaining_n = n - single_vectors_n; BLASLONG remaining_n = n - single_vectors_n;
if (remaining_n) { if (remaining_n) {
a_offset_inner = a_offset; a_offset_inner = a_offset;
svbool_t pg = SV_WHILE(0L, remaining_n); svbool_t pg = SV_WHILE((uint64_t)0L, (uint64_t)remaining_n);
uint64_t active = remaining_n; uint64_t active = remaining_n;
uint64_t i_cnt = m >> 2; uint64_t i_cnt = m >> 2;
while (i_cnt--) { while (i_cnt--) {

0
kernel/arm64/sgemm_beta.S Executable file → Normal file
View File

View File

@ -1,5 +1,6 @@
/*********************************************************************/ /*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */ /* Copyright 2009, 2010 The University of Texas at Austin. */
/* Copyright 2023 The OpenBLAS Project */
/* All rights reserved. */ /* All rights reserved. */
/* */ /* */
/* Redistribution and use in source and binary forms, with or */ /* Redistribution and use in source and binary forms, with or */
@ -52,7 +53,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
svint64_t one_vec = svdup_s64(1LL); svint64_t one_vec = svdup_s64(1LL);
int64_t j = 0; int64_t j = 0;
svbool_t pg = svwhilelt_b64(j, n); svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
int64_t active = svcntp_b64(svptrue_b64(), pg); int64_t active = svcntp_b64(svptrue_b64(), pg);
svint64_t index_neg = svindex_s64(0LL, -1LL); svint64_t index_neg = svindex_s64(0LL, -1LL);
svint64_t index = svindex_s64(0LL, 1LL); svint64_t index = svindex_s64(0LL, 1LL);
@ -86,7 +87,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
posX += sve_size; posX += sve_size;
posX_vec = svdup_s64(posX); posX_vec = svdup_s64(posX);
j += sve_size; j += sve_size;
pg = svwhilelt_b64(j, n); pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
active = svcntp_b64(svptrue_b64(), pg); active = svcntp_b64(svptrue_b64(), pg);
} while (svptest_any(svptrue_b64(), pg)); } while (svptest_any(svptrue_b64(), pg));
@ -99,7 +100,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
int32_t N = n; int32_t N = n;
int32_t j = 0; int32_t j = 0;
svbool_t pg = svwhilelt_b32(j, N); svbool_t pg = svwhilelt_b32((uint32_t)j, (uint32_t)N);
int32_t active = svcntp_b32(svptrue_b32(), pg); int32_t active = svcntp_b32(svptrue_b32(), pg);
svint32_t index_neg = svindex_s32(0, -1); svint32_t index_neg = svindex_s32(0, -1);
svint32_t index = svindex_s32(0, 1); svint32_t index = svindex_s32(0, 1);
@ -133,7 +134,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
posX += sve_size; posX += sve_size;
posX_vec = svdup_s32(posX); posX_vec = svdup_s32(posX);
j += sve_size; j += sve_size;
pg = svwhilelt_b32(j, N); pg = svwhilelt_b32((uint32_t)j, (uint32_t)N);
active = svcntp_b32(svptrue_b32(), pg); active = svcntp_b32(svptrue_b32(), pg);
} while (svptest_any(svptrue_b32(), pg)); } while (svptest_any(svptrue_b32(), pg));

View File

@ -1,5 +1,6 @@
/*********************************************************************/ /*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */ /* Copyright 2009, 2010 The University of Texas at Austin. */
/* Copyright 2023 The OpenBLAS Project */
/* All rights reserved. */ /* All rights reserved. */
/* */ /* */
/* Redistribution and use in source and binary forms, with or */ /* Redistribution and use in source and binary forms, with or */
@ -52,7 +53,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
svint64_t one_vec = svdup_s64(1LL); svint64_t one_vec = svdup_s64(1LL);
int64_t j = 0; int64_t j = 0;
svbool_t pg = svwhilelt_b64(j, n); svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
int64_t active = svcntp_b64(svptrue_b64(), pg); int64_t active = svcntp_b64(svptrue_b64(), pg);
svint64_t index_neg = svindex_s64(0LL, -1LL); svint64_t index_neg = svindex_s64(0LL, -1LL);
svint64_t index = svindex_s64(0LL, 1LL); svint64_t index = svindex_s64(0LL, 1LL);
@ -86,7 +87,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
posX += sve_size; posX += sve_size;
posX_vec = svdup_s64(posX); posX_vec = svdup_s64(posX);
j += sve_size; j += sve_size;
pg = svwhilelt_b64(j, n); pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
active = svcntp_b64(svptrue_b64(), pg); active = svcntp_b64(svptrue_b64(), pg);
} while (svptest_any(svptrue_b64(), pg)); } while (svptest_any(svptrue_b64(), pg));
@ -99,7 +100,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
int32_t N = n; int32_t N = n;
int32_t j = 0; int32_t j = 0;
svbool_t pg = svwhilelt_b32(j, N); svbool_t pg = svwhilelt_b32((uint32_t)j, (uint32_t)N);
int32_t active = svcntp_b32(svptrue_b32(), pg); int32_t active = svcntp_b32(svptrue_b32(), pg);
svint32_t index_neg = svindex_s32(0, -1); svint32_t index_neg = svindex_s32(0, -1);
svint32_t index = svindex_s32(0, 1); svint32_t index = svindex_s32(0, 1);
@ -133,7 +134,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
posX += sve_size; posX += sve_size;
posX_vec = svdup_s32(posX); posX_vec = svdup_s32(posX);
j += sve_size; j += sve_size;
pg = svwhilelt_b32(j, N); pg = svwhilelt_b32((uint32_t)j, (uint32_t)N);
active = svcntp_b32(svptrue_b32(), pg); active = svcntp_b32(svptrue_b32(), pg);
} while (svptest_any(svptrue_b32(), pg)); } while (svptest_any(svptrue_b32(), pg));

View File

@ -52,11 +52,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
FLOAT *ao; FLOAT *ao;
#ifdef DOUBLE #ifdef DOUBLE
svint64_t index = svindex_s64(0LL, lda); svint64_t index = svindex_s64(0LL, lda);
svbool_t pn = svwhilelt_b64(js, n); svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
int n_active = svcntp_b64(svptrue_b64(), pn); int n_active = svcntp_b64(svptrue_b64(), pn);
#else #else
svint32_t index = svindex_s32(0, lda); svint32_t index = svindex_s32(0, lda);
svbool_t pn = svwhilelt_b32(js, n); svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
int n_active = svcntp_b32(svptrue_b32(), pn); int n_active = svcntp_b32(svptrue_b32(), pn);
#endif #endif
do do
@ -123,11 +123,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
posY += n_active; posY += n_active;
js += n_active; js += n_active;
#ifdef DOUBLE #ifdef DOUBLE
pn = svwhilelt_b64(js, n); pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
n_active = svcntp_b64(svptrue_b64(), pn); n_active = svcntp_b64(svptrue_b64(), pn);
} while (svptest_any(svptrue_b64(), pn)); } while (svptest_any(svptrue_b64(), pn));
#else #else
pn = svwhilelt_b32(js, n); pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
n_active = svcntp_b32(svptrue_b32(), pn); n_active = svcntp_b32(svptrue_b32(), pn);
} while (svptest_any(svptrue_b32(), pn)); } while (svptest_any(svptrue_b32(), pn));
#endif #endif

View File

@ -51,10 +51,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
FLOAT *ao; FLOAT *ao;
js = 0; js = 0;
#ifdef DOUBLE #ifdef DOUBLE
svbool_t pn = svwhilelt_b64(js, n); svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
int n_active = svcntp_b64(svptrue_b64(), pn); int n_active = svcntp_b64(svptrue_b64(), pn);
#else #else
svbool_t pn = svwhilelt_b32(js, n); svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
int n_active = svcntp_b32(svptrue_b32(), pn); int n_active = svcntp_b32(svptrue_b32(), pn);
#endif #endif
do do
@ -122,11 +122,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
posY += n_active; posY += n_active;
js += n_active; js += n_active;
#ifdef DOUBLE #ifdef DOUBLE
pn = svwhilelt_b64(js, n); pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
n_active = svcntp_b64(svptrue_b64(), pn); n_active = svcntp_b64(svptrue_b64(), pn);
} while (svptest_any(svptrue_b64(), pn)); } while (svptest_any(svptrue_b64(), pn));
#else #else
pn = svwhilelt_b32(js, n); pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
n_active = svcntp_b32(svptrue_b32(), pn); n_active = svcntp_b32(svptrue_b32(), pn);
} while (svptest_any(svptrue_b32(), pn)); } while (svptest_any(svptrue_b32(), pn));
#endif #endif

View File

@ -52,11 +52,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
FLOAT *ao; FLOAT *ao;
#ifdef DOUBLE #ifdef DOUBLE
svint64_t index = svindex_s64(0LL, lda); svint64_t index = svindex_s64(0LL, lda);
svbool_t pn = svwhilelt_b64(js, n); svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
int n_active = svcntp_b64(svptrue_b64(), pn); int n_active = svcntp_b64(svptrue_b64(), pn);
#else #else
svint32_t index = svindex_s32(0, lda); svint32_t index = svindex_s32(0, lda);
svbool_t pn = svwhilelt_b32(js, n); svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
int n_active = svcntp_b32(svptrue_b32(), pn); int n_active = svcntp_b32(svptrue_b32(), pn);
#endif #endif
do do
@ -123,11 +123,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
posY += n_active; posY += n_active;
js += n_active; js += n_active;
#ifdef DOUBLE #ifdef DOUBLE
pn = svwhilelt_b64(js, n); pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
n_active = svcntp_b64(svptrue_b64(), pn); n_active = svcntp_b64(svptrue_b64(), pn);
} while (svptest_any(svptrue_b64(), pn)); } while (svptest_any(svptrue_b64(), pn));
#else #else
pn = svwhilelt_b32(js, n); pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
n_active = svcntp_b32(svptrue_b32(), pn); n_active = svcntp_b32(svptrue_b32(), pn);
} while (svptest_any(svptrue_b32(), pn)); } while (svptest_any(svptrue_b32(), pn));
#endif #endif

View File

@ -51,10 +51,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
FLOAT *ao; FLOAT *ao;
js = 0; js = 0;
#ifdef DOUBLE #ifdef DOUBLE
svbool_t pn = svwhilelt_b64(js, n); svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
int n_active = svcntp_b64(svptrue_b64(), pn); int n_active = svcntp_b64(svptrue_b64(), pn);
#else #else
svbool_t pn = svwhilelt_b32(js, n); svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
int n_active = svcntp_b32(svptrue_b32(), pn); int n_active = svcntp_b32(svptrue_b32(), pn);
#endif #endif
do do
@ -121,11 +121,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
posY += n_active; posY += n_active;
js += n_active; js += n_active;
#ifdef DOUBLE #ifdef DOUBLE
pn = svwhilelt_b64(js, n); pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
n_active = svcntp_b64(svptrue_b64(), pn); n_active = svcntp_b64(svptrue_b64(), pn);
} while (svptest_any(svptrue_b64(), pn)); } while (svptest_any(svptrue_b64(), pn));
#else #else
pn = svwhilelt_b32(js, n); pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
n_active = svcntp_b32(svptrue_b32(), pn); n_active = svcntp_b32(svptrue_b32(), pn);
} while (svptest_any(svptrue_b32(), pn)); } while (svptest_any(svptrue_b32(), pn));
#endif #endif

View File

@ -56,13 +56,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
#ifdef DOUBLE #ifdef DOUBLE
int64_t js = 0; int64_t js = 0;
svint64_t index = svindex_s64(0LL, lda); svint64_t index = svindex_s64(0LL, lda);
svbool_t pn = svwhilelt_b64(js, n); svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
int n_active = svcntp_b64(svptrue_b64(), pn); int n_active = svcntp_b64(svptrue_b64(), pn);
#else #else
int32_t N = n; int32_t N = n;
int32_t js = 0; int32_t js = 0;
svint32_t index = svindex_s32(0, lda); svint32_t index = svindex_s32(0, lda);
svbool_t pn = svwhilelt_b32(js, N); svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N);
int n_active = svcntp_b32(svptrue_b32(), pn); int n_active = svcntp_b32(svptrue_b32(), pn);
#endif #endif
do { do {
@ -106,11 +106,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
js += n_active; js += n_active;
#ifdef DOUBLE #ifdef DOUBLE
pn = svwhilelt_b64(js, n); pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
n_active = svcntp_b64(svptrue_b64(), pn); n_active = svcntp_b64(svptrue_b64(), pn);
} while (svptest_any(svptrue_b64(), pn)); } while (svptest_any(svptrue_b64(), pn));
#else #else
pn = svwhilelt_b32(js, N); pn = svwhilelt_b32((uint32_t)js, (uint32_t)N);
n_active = svcntp_b32(svptrue_b32(), pn); n_active = svcntp_b32(svptrue_b32(), pn);
} while (svptest_any(svptrue_b32(), pn)); } while (svptest_any(svptrue_b32(), pn));
#endif #endif

View File

@ -1,5 +1,6 @@
/*********************************************************************/ /*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */ /* Copyright 2009, 2010 The University of Texas at Austin. */
/* Copyright 2023 The OpenBLAS Project */
/* All rights reserved. */ /* All rights reserved. */
/* */ /* */
/* Redistribution and use in source and binary forms, with or */ /* Redistribution and use in source and binary forms, with or */
@ -55,12 +56,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
jj = offset; jj = offset;
#ifdef DOUBLE #ifdef DOUBLE
int64_t js = 0; int64_t js = 0;
svbool_t pn = svwhilelt_b64(js, n); svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
int n_active = svcntp_b64(svptrue_b64(), pn); int n_active = svcntp_b64(svptrue_b64(), pn);
#else #else
int32_t N = n; int32_t N = n;
int32_t js = 0; int32_t js = 0;
svbool_t pn = svwhilelt_b32(js, N); svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N);
int n_active = svcntp_b32(svptrue_b32(), pn); int n_active = svcntp_b32(svptrue_b32(), pn);
#endif #endif
do { do {
@ -104,11 +105,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
js += n_active; js += n_active;
#ifdef DOUBLE #ifdef DOUBLE
pn = svwhilelt_b64(js, n); pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
n_active = svcntp_b64(svptrue_b64(), pn); n_active = svcntp_b64(svptrue_b64(), pn);
} while (svptest_any(svptrue_b64(), pn)); } while (svptest_any(svptrue_b64(), pn));
#else #else
pn = svwhilelt_b32(js, N); pn = svwhilelt_b32((uint32_t)js, (uint32_t)N);
n_active = svcntp_b32(svptrue_b32(), pn); n_active = svcntp_b32(svptrue_b32(), pn);
} while (svptest_any(svptrue_b32(), pn)); } while (svptest_any(svptrue_b32(), pn));
#endif #endif

View File

@ -1,5 +1,6 @@
/*********************************************************************/ /*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */ /* Copyright 2009, 2010 The University of Texas at Austin. */
/* Copyright 2023 The OpenBLAS Project */
/* All rights reserved. */ /* All rights reserved. */
/* */ /* */
/* Redistribution and use in source and binary forms, with or */ /* Redistribution and use in source and binary forms, with or */
@ -56,13 +57,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
#ifdef DOUBLE #ifdef DOUBLE
int64_t js = 0; int64_t js = 0;
svint64_t index = svindex_s64(0LL, lda); svint64_t index = svindex_s64(0LL, lda);
svbool_t pn = svwhilelt_b64(js, n); svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
int n_active = svcntp_b64(svptrue_b64(), pn); int n_active = svcntp_b64(svptrue_b64(), pn);
#else #else
int32_t N = n; int32_t N = n;
int32_t js = 0; int32_t js = 0;
svint32_t index = svindex_s32(0, lda); svint32_t index = svindex_s32(0, lda);
svbool_t pn = svwhilelt_b32(js, N); svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N);
int n_active = svcntp_b32(svptrue_b32(), pn); int n_active = svcntp_b32(svptrue_b32(), pn);
#endif #endif
do { do {
@ -106,11 +107,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
js += n_active; js += n_active;
#ifdef DOUBLE #ifdef DOUBLE
pn = svwhilelt_b64(js, n); pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
n_active = svcntp_b64(svptrue_b64(), pn); n_active = svcntp_b64(svptrue_b64(), pn);
} while (svptest_any(svptrue_b64(), pn)); } while (svptest_any(svptrue_b64(), pn));
#else #else
pn = svwhilelt_b32(js, N); pn = svwhilelt_b32((uint32_t)js, (uint32_t)N);
n_active = svcntp_b32(svptrue_b32(), pn); n_active = svcntp_b32(svptrue_b32(), pn);
} while (svptest_any(svptrue_b32(), pn)); } while (svptest_any(svptrue_b32(), pn));
#endif #endif

View File

@ -1,5 +1,6 @@
/*********************************************************************/ /*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */ /* Copyright 2009, 2010 The University of Texas at Austin. */
/* Copyright 2023 The OpenBLAS Project */
/* All rights reserved. */ /* All rights reserved. */
/* */ /* */
/* Redistribution and use in source and binary forms, with or */ /* Redistribution and use in source and binary forms, with or */
@ -55,12 +56,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
jj = offset; jj = offset;
#ifdef DOUBLE #ifdef DOUBLE
int64_t js = 0; int64_t js = 0;
svbool_t pn = svwhilelt_b64(js, n); svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
int n_active = svcntp_b64(svptrue_b64(), pn); int n_active = svcntp_b64(svptrue_b64(), pn);
#else #else
int32_t N = n; int32_t N = n;
int32_t js = 0; int32_t js = 0;
svbool_t pn = svwhilelt_b32(js, N); svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N);
int n_active = svcntp_b32(svptrue_b32(), pn); int n_active = svcntp_b32(svptrue_b32(), pn);
#endif #endif
do { do {
@ -104,11 +105,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
js += n_active; js += n_active;
#ifdef DOUBLE #ifdef DOUBLE
pn = svwhilelt_b64(js, n); pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
n_active = svcntp_b64(svptrue_b64(), pn); n_active = svcntp_b64(svptrue_b64(), pn);
} while (svptest_any(svptrue_b64(), pn)); } while (svptest_any(svptrue_b64(), pn));
#else #else
pn = svwhilelt_b32(js, N); pn = svwhilelt_b32((uint32_t)js, (uint32_t)N);
n_active = svcntp_b32(svptrue_b32(), pn); n_active = svcntp_b32(svptrue_b32(), pn);
} while (svptest_any(svptrue_b32(), pn)); } while (svptest_any(svptrue_b32(), pn));
#endif #endif

View File

@ -24,7 +24,12 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/ *****************************************************************************/
#ifdef __NVCOMPILER
#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )
#if (NVCOMPVERS < 2309)
#pragma opt 1
#endif
#endif
#include "common.h" #include "common.h"

View File

@ -239,8 +239,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1rd z15.d, p0/z, [pB, 56] ld1rd z15.d, p0/z, [pB, 56]
add pB, pB, 64 add pB, pB, 64
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
.endm .endm
.macro KERNELv1x4_M1 .macro KERNELv1x4_M1
@ -276,9 +274,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld1rd z15.d, p0/z, [pB, 56] ld1rd z15.d, p0/z, [pB, 56]
add pB, pB, 64 add pB, pB, 64
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
.endm .endm
.macro KERNELv1x4_M2 .macro KERNELv1x4_M2
@ -313,11 +308,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ri z23.d, p1/m, z2.d, z15.d OP_ri z23.d, p1/m, z2.d, z15.d
ld1rd z15.d, p0/z, [pB, 56] ld1rd z15.d, p0/z, [pB, 56]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
add pB, pB, 64 add pB, pB, 64
prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
.endm .endm
.macro KERNELv1x4_E .macro KERNELv1x4_E
@ -340,11 +331,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ir z23.d, p1/m, z3.d, z14.d OP_ir z23.d, p1/m, z3.d, z14.d
OP_ii z22.d, p1/m, z3.d, z15.d OP_ii z22.d, p1/m, z3.d, z15.d
OP_ri z23.d, p1/m, z2.d, z15.d OP_ri z23.d, p1/m, z2.d, z15.d
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
.endm .endm
.macro KERNELv1x4_SUB .macro KERNELv1x4_SUB
@ -382,14 +368,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ir z23.d, p1/m, z1.d, z14.d OP_ir z23.d, p1/m, z1.d, z14.d
OP_ii z22.d, p1/m, z1.d, z15.d OP_ii z22.d, p1/m, z1.d, z15.d
OP_ri z23.d, p1/m, z0.d, z15.d OP_ri z23.d, p1/m, z0.d, z15.d
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
.endm .endm
.macro SAVEv1x4 .macro SAVEv1x4
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
ld2d {z24.d, z25.d}, p1/z, [pCRow0] ld2d {z24.d, z25.d}, p1/z, [pCRow0]
fmla z24.d, p1/m, z16.d, alphaz_R fmla z24.d, p1/m, z16.d, alphaz_R
fmls z24.d, p1/m, z17.d, alphaz_I fmls z24.d, p1/m, z17.d, alphaz_I
@ -407,7 +388,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
st2d {z26.d, z27.d}, p1, [pCRow1] st2d {z26.d, z27.d}, p1, [pCRow1]
add pCRow1, pCRow1, lanes, lsl #4 add pCRow1, pCRow1, lanes, lsl #4
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
ld2d {z28.d, z29.d}, p1/z, [pCRow2] ld2d {z28.d, z29.d}, p1/z, [pCRow2]
fmla z28.d, p1/m, z20.d, alphaz_R fmla z28.d, p1/m, z20.d, alphaz_R
@ -425,12 +405,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fmla z31.d, p1/m, z23.d, alphaz_R fmla z31.d, p1/m, z23.d, alphaz_R
st2d {z30.d, z31.d}, p1, [pCRow3] st2d {z30.d, z31.d}, p1, [pCRow3]
prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
add pCRow3, pCRow3, lanes, lsl #4 // pC = pC + lanes * 2 *8 add pCRow3, pCRow3, lanes, lsl #4 // pC = pC + lanes * 2 *8
prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
.endm .endm
/******************************************************************************/ /******************************************************************************/
@ -466,8 +442,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro SAVEv1x2 .macro SAVEv1x2
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
ld2d {z24.d, z25.d}, p1/z, [pCRow0] ld2d {z24.d, z25.d}, p1/z, [pCRow0]
fmla z24.d, p1/m, z16.d, alphaz_R fmla z24.d, p1/m, z16.d, alphaz_R
fmls z24.d, p1/m, z17.d, alphaz_I fmls z24.d, p1/m, z17.d, alphaz_I
@ -485,10 +459,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
st2d {z26.d, z27.d}, p1, [pCRow1] st2d {z26.d, z27.d}, p1, [pCRow1]
add pCRow1, pCRow1, lanes, lsl #4 add pCRow1, pCRow1, lanes, lsl #4
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
.endm .endm
/******************************************************************************/ /******************************************************************************/
@ -516,8 +486,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm .endm
.macro SAVEv1x1 .macro SAVEv1x1
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
ld2d {z24.d, z25.d}, p1/z, [pCRow0] ld2d {z24.d, z25.d}, p1/z, [pCRow0]
fmla z24.d, p1/m, z16.d, alphaz_R fmla z24.d, p1/m, z16.d, alphaz_R
fmls z24.d, p1/m, z17.d, alphaz_I fmls z24.d, p1/m, z17.d, alphaz_I
@ -527,8 +495,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add pCRow0, pCRow0, lanes, lsl #4 // pC = pC + lanes * 2 *8 add pCRow0, pCRow0, lanes, lsl #4 // pC = pC + lanes * 2 *8
prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
.endm .endm
/******************************************************************************/ /******************************************************************************/
@ -553,9 +519,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stp x26, x27, [sp, #(9 * 16)] stp x26, x27, [sp, #(9 * 16)]
str x28, [sp, #(10 * 16)] str x28, [sp, #(10 * 16)]
prfm PLDL1KEEP, [origPB]
prfm PLDL1KEEP, [origPA]
fmov alphaR, d0 fmov alphaR, d0
dup alphaz_R, alphaR dup alphaz_R, alphaR
fmov alphaI, d1 fmov alphaI, d1
@ -676,10 +639,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
bne .Lzgemm_kernel_L4_Mv1_46 bne .Lzgemm_kernel_L4_Mv1_46
.Lzgemm_kernel_L4_Mv1_100: .Lzgemm_kernel_L4_Mv1_100:
prfm PLDL1KEEP, [pA]
prfm PLDL1KEEP, [pA, #64]
prfm PLDL1KEEP, [origPB]
SAVEv1x4 SAVEv1x4
.Lzgemm_kernel_L4_Mv1_END: .Lzgemm_kernel_L4_Mv1_END:

View File

@ -1,5 +1,6 @@
/*********************************************************************/ /*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */ /* Copyright 2009, 2010 The University of Texas at Austin. */
/* Copyright 2023 The OpenBLAS Project */
/* All rights reserved. */ /* All rights reserved. */
/* */ /* */
/* Redistribution and use in source and binary forms, with or */ /* Redistribution and use in source and binary forms, with or */
@ -52,7 +53,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
boffset = b; boffset = b;
j = 0; j = 0;
svbool_t pg = svwhilelt_b64(j, n); svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
uint64_t active = svcntp_b64(svptrue_b64(), pg); uint64_t active = svcntp_b64(svptrue_b64(), pg);
do { do {
@ -69,7 +70,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
aoffset += active * lda * 2; aoffset += active * lda * 2;
j += svcntd(); j += svcntd();
pg = svwhilelt_b64(j, n); pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
active = svcntp_b64(svptrue_b64(), pg); active = svcntp_b64(svptrue_b64(), pg);

View File

@ -1,5 +1,6 @@
/*********************************************************************/ /*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */ /* Copyright 2009, 2010 The University of Texas at Austin. */
/* Copyright 2023 The OpenBLAS Project */
/* All rights reserved. */ /* All rights reserved. */
/* */ /* */
/* Redistribution and use in source and binary forms, with or */ /* Redistribution and use in source and binary forms, with or */
@ -50,7 +51,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
boffset = b; boffset = b;
j = 0; j = 0;
svbool_t pg = svwhilelt_b64(j, n); svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
uint64_t active = svcntp_b64(svptrue_b64(), pg); uint64_t active = svcntp_b64(svptrue_b64(), pg);
do { do {
@ -66,7 +67,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
aoffset += active * 2; aoffset += active * 2;
j += svcntd(); j += svcntd();
pg = svwhilelt_b64(j, n); pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
active = svcntp_b64(svptrue_b64(), pg); active = svcntp_b64(svptrue_b64(), pg);
} while (svptest_any(svptrue_b64(), pg)); } while (svptest_any(svptrue_b64(), pg));

View File

@ -1,5 +1,6 @@
/*********************************************************************/ /*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */ /* Copyright 2009, 2010 The University of Texas at Austin. */
/* Copyright 2023 The OpenBLAS Project */
/* All rights reserved. */ /* All rights reserved. */
/* */ /* */
/* Redistribution and use in source and binary forms, with or */ /* Redistribution and use in source and binary forms, with or */
@ -54,7 +55,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
svint64_t one_vec = svdup_s64(1LL); svint64_t one_vec = svdup_s64(1LL);
int64_t j = 0; int64_t j = 0;
svbool_t pg = svwhilelt_b64(j, n); svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
int64_t active = svcntp_b64(svptrue_b64(), pg); int64_t active = svcntp_b64(svptrue_b64(), pg);
svint64_t index_neg = svindex_s64(0LL, -1LL); svint64_t index_neg = svindex_s64(0LL, -1LL);
svint64_t index = svindex_s64(0LL, 1LL); svint64_t index = svindex_s64(0LL, 1LL);
@ -79,7 +80,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
gat_ind = svadd_m(cmp, gat_ind, lda_vec); gat_ind = svadd_m(cmp, gat_ind, lda_vec);
gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, 2); gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, 2);
if (offset <= 0) { if (offset <= 0) {
svbool_t off_g = svwhilelt_b64(offset, 0LL); svbool_t off_g = svwhilelt_b64((int64_t)offset, (int64_t)0LL);
data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag); data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag);
} }
@ -99,7 +100,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
posX += sve_size; posX += sve_size;
posX_vec = svdup_s64(posX); posX_vec = svdup_s64(posX);
j += sve_size; j += sve_size;
pg = svwhilelt_b64(j, n); pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
active = svcntp_b64(svptrue_b64(), pg); active = svcntp_b64(svptrue_b64(), pg);
} while (svptest_any(svptrue_b64(), pg)); } while (svptest_any(svptrue_b64(), pg));
@ -117,7 +118,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
int32_t j = 0; int32_t j = 0;
int32_t N = n; int32_t N = n;
svbool_t pg = svwhilelt_b32(j, N); svbool_t pg = svwhilelt_b32((uint32_t)j, (uint32_t)N);
int32_t active = svcntp_b32(svptrue_b32(), pg); int32_t active = svcntp_b32(svptrue_b32(), pg);
svint32_t index_neg = svindex_s32(0, -1); svint32_t index_neg = svindex_s32(0, -1);
svint32_t index = svindex_s32(0, 1); svint32_t index = svindex_s32(0, 1);
@ -142,7 +143,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
gat_ind = svadd_m(cmp, gat_ind, lda_vec); gat_ind = svadd_m(cmp, gat_ind, lda_vec);
gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, 2); gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, 2);
if (offset <= 0) { if (offset <= 0) {
svbool_t off_g = svwhilelt_b32(offset, 0); svbool_t off_g = svwhilelt_b32((int32_t)offset, (int32_t)0);
data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag); data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag);
} }
@ -162,7 +163,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
posX += sve_size; posX += sve_size;
posX_vec = svdup_s32(posX); posX_vec = svdup_s32(posX);
j += sve_size; j += sve_size;
pg = svwhilelt_b32(j, N); pg = svwhilelt_b32((uint32_t)j, (uint32_t)N);
active = svcntp_b32(svptrue_b32(), pg); active = svcntp_b32(svptrue_b32(), pg);
} while (svptest_any(svptrue_b32(), pg)); } while (svptest_any(svptrue_b32(), pg));

View File

@ -1,5 +1,6 @@
/*********************************************************************/ /*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */ /* Copyright 2009, 2010 The University of Texas at Austin. */
/* Copyright 2023 The OpenBLAS Project */
/* All rights reserved. */ /* All rights reserved. */
/* */ /* */
/* Redistribution and use in source and binary forms, with or */ /* Redistribution and use in source and binary forms, with or */
@ -54,7 +55,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
svint64_t one_vec = svdup_s64(1LL); svint64_t one_vec = svdup_s64(1LL);
int64_t j = 0; int64_t j = 0;
svbool_t pg = svwhilelt_b64(j, n); svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
int64_t active = svcntp_b64(svptrue_b64(), pg); int64_t active = svcntp_b64(svptrue_b64(), pg);
svint64_t index_neg = svindex_s64(0LL, -1LL); svint64_t index_neg = svindex_s64(0LL, -1LL);
svint64_t index = svindex_s64(0LL, 1LL); svint64_t index = svindex_s64(0LL, 1LL);
@ -80,7 +81,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, lda_vec); gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, lda_vec);
data_vec_imag = svneg_z(pg, data_vec_imag); data_vec_imag = svneg_z(pg, data_vec_imag);
if (offset <= 0) { if (offset <= 0) {
svbool_t off_g = svwhilelt_b64(offset, 0LL); svbool_t off_g = svwhilelt_b64((int64_t)offset, (int64_t)0LL);
data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag); data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag);
} }
@ -100,7 +101,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
posX += sve_size; posX += sve_size;
posX_vec = svdup_s64(posX); posX_vec = svdup_s64(posX);
j += sve_size; j += sve_size;
pg = svwhilelt_b64(j, n); pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
active = svcntp_b64(svptrue_b64(), pg); active = svcntp_b64(svptrue_b64(), pg);
} while (svptest_any(svptrue_b64(), pg)); } while (svptest_any(svptrue_b64(), pg));
#else #else
@ -116,7 +117,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
int32_t j = 0; int32_t j = 0;
int32_t N = n; int32_t N = n;
svbool_t pg = svwhilelt_b32(j, N); svbool_t pg = svwhilelt_b32((uint32_t)j, (uint32_t)N);
int32_t active = svcntp_b32(svptrue_b32(), pg); int32_t active = svcntp_b32(svptrue_b32(), pg);
svint32_t index_neg = svindex_s32(0, -1); svint32_t index_neg = svindex_s32(0, -1);
svint32_t index = svindex_s32(0, 1); svint32_t index = svindex_s32(0, 1);
@ -142,7 +143,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, lda_vec); gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, lda_vec);
data_vec_imag = svneg_z(pg, data_vec_imag); data_vec_imag = svneg_z(pg, data_vec_imag);
if (offset <= 0) { if (offset <= 0) {
svbool_t off_g = svwhilelt_b32(offset, 0); svbool_t off_g = svwhilelt_b32((int32_t)offset, (int32_t)0);
data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag); data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag);
} }
@ -162,7 +163,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
posX += sve_size; posX += sve_size;
posX_vec = svdup_s32(posX); posX_vec = svdup_s32(posX);
j += sve_size; j += sve_size;
pg = svwhilelt_b32(j, N); pg = svwhilelt_b32((uint32_t)j, (uint32_t)N);
active = svcntp_b32(svptrue_b32(), pg); active = svcntp_b32(svptrue_b32(), pg);
} while (svptest_any(svptrue_b32(), pg)); } while (svptest_any(svptrue_b32(), pg));

View File

@ -1,5 +1,6 @@
/*********************************************************************/ /*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */ /* Copyright 2009, 2010 The University of Texas at Austin. */
/* Copyright 2023 The OpenBLAS Project */
/* All rights reserved. */ /* All rights reserved. */
/* */ /* */
/* Redistribution and use in source and binary forms, with or */ /* Redistribution and use in source and binary forms, with or */
@ -53,7 +54,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
svint64_t one_vec = svdup_s64(1LL); svint64_t one_vec = svdup_s64(1LL);
int64_t j = 0; int64_t j = 0;
svbool_t pg = svwhilelt_b64(j, n); svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
int64_t active = svcntp_b64(svptrue_b64(), pg); int64_t active = svcntp_b64(svptrue_b64(), pg);
svint64_t index_neg = svindex_s64(0LL, -1LL); svint64_t index_neg = svindex_s64(0LL, -1LL);
svint64_t index = svindex_s64(0LL, 1LL); svint64_t index = svindex_s64(0LL, 1LL);
@ -90,7 +91,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
posX += sve_size; posX += sve_size;
posX_vec = svdup_s64(posX); posX_vec = svdup_s64(posX);
j += sve_size; j += sve_size;
pg = svwhilelt_b64(j, n); pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
active = svcntp_b64(svptrue_b64(), pg); active = svcntp_b64(svptrue_b64(), pg);
} while (svptest_any(svptrue_b64(), pg)); } while (svptest_any(svptrue_b64(), pg));
@ -103,7 +104,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
int32_t N = n; int32_t N = n;
int32_t j = 0; int32_t j = 0;
svbool_t pg = svwhilelt_b32(j, N); svbool_t pg = svwhilelt_b32((uint32_t)j, (uint32_t)N);
int32_t active = svcntp_b32(svptrue_b32(), pg); int32_t active = svcntp_b32(svptrue_b32(), pg);
svint32_t index_neg = svindex_s32(0, -1); svint32_t index_neg = svindex_s32(0, -1);
svint32_t index = svindex_s32(0, 1); svint32_t index = svindex_s32(0, 1);
@ -140,7 +141,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
posX += sve_size; posX += sve_size;
posX_vec = svdup_s32(posX); posX_vec = svdup_s32(posX);
j += sve_size; j += sve_size;
pg = svwhilelt_b32(j, N); pg = svwhilelt_b32((uint32_t)j, (uint32_t)N);
active = svcntp_b32(svptrue_b32(), pg); active = svcntp_b32(svptrue_b32(), pg);
} while (svptest_any(svptrue_b32(), pg)); } while (svptest_any(svptrue_b32(), pg));

View File

@ -1,5 +1,6 @@
/*********************************************************************/ /*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */ /* Copyright 2009, 2010 The University of Texas at Austin. */
/* Copyright 2023 The OpenBLAS Project */
/* All rights reserved. */ /* All rights reserved. */
/* */ /* */
/* Redistribution and use in source and binary forms, with or */ /* Redistribution and use in source and binary forms, with or */
@ -53,7 +54,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
svint64_t one_vec = svdup_s64(1LL); svint64_t one_vec = svdup_s64(1LL);
int64_t j = 0; int64_t j = 0;
svbool_t pg = svwhilelt_b64(j, n); svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
int64_t active = svcntp_b64(svptrue_b64(), pg); int64_t active = svcntp_b64(svptrue_b64(), pg);
svint64_t index_neg = svindex_s64(0LL, -1LL); svint64_t index_neg = svindex_s64(0LL, -1LL);
svint64_t index = svindex_s64(0LL, 1LL); svint64_t index = svindex_s64(0LL, 1LL);
@ -90,7 +91,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
posX += sve_size; posX += sve_size;
posX_vec = svdup_s64(posX); posX_vec = svdup_s64(posX);
j += sve_size; j += sve_size;
pg = svwhilelt_b64(j, n); pg = svwhilelt_b64((uint64_t)j, (uint64_t)n);
active = svcntp_b64(svptrue_b64(), pg); active = svcntp_b64(svptrue_b64(), pg);
} while (svptest_any(svptrue_b64(), pg)); } while (svptest_any(svptrue_b64(), pg));
@ -103,7 +104,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
int32_t N = n; int32_t N = n;
int32_t j = 0; int32_t j = 0;
svbool_t pg = svwhilelt_b32(j, N); svbool_t pg = svwhilelt_b32((uint32_t)j, (uint32_t)N);
int32_t active = svcntp_b32(svptrue_b32(), pg); int32_t active = svcntp_b32(svptrue_b32(), pg);
svint32_t index_neg = svindex_s32(0, -1); svint32_t index_neg = svindex_s32(0, -1);
svint32_t index = svindex_s32(0, 1); svint32_t index = svindex_s32(0, 1);
@ -140,7 +141,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
posX += sve_size; posX += sve_size;
posX_vec = svdup_s32(posX); posX_vec = svdup_s32(posX);
j += sve_size; j += sve_size;
pg = svwhilelt_b32(j, N); pg = svwhilelt_b32((uint32_t)j, (uint32_t)N);
active = svcntp_b32(svptrue_b32(), pg); active = svcntp_b32(svptrue_b32(), pg);
} while (svptest_any(svptrue_b32(), pg)); } while (svptest_any(svptrue_b32(), pg));

View File

@ -1,5 +1,6 @@
/*********************************************************************/ /*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */ /* Copyright 2009, 2010 The University of Texas at Austin. */
/* Copyright 2023 The OpenBLAS Project */
/* All rights reserved. */ /* All rights reserved. */
/* */ /* */
/* Redistribution and use in source and binary forms, with or */ /* Redistribution and use in source and binary forms, with or */
@ -54,11 +55,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
FLOAT *ao; FLOAT *ao;
#ifdef DOUBLE #ifdef DOUBLE
svint64_t index = svindex_s64(0LL, lda); svint64_t index = svindex_s64(0LL, lda);
svbool_t pn = svwhilelt_b64(js, n); svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
int n_active = svcntp_b64(svptrue_b64(), pn); int n_active = svcntp_b64(svptrue_b64(), pn);
#else #else
svint32_t index = svindex_s32(0, lda); svint32_t index = svindex_s32(0, lda);
svbool_t pn = svwhilelt_b32(js, n); svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
int n_active = svcntp_b32(svptrue_b32(), pn); int n_active = svcntp_b32(svptrue_b32(), pn);
#endif #endif
do do
@ -132,11 +133,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
posY += n_active; posY += n_active;
js += n_active; js += n_active;
#ifdef DOUBLE #ifdef DOUBLE
pn = svwhilelt_b64(js, n); pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
n_active = svcntp_b64(svptrue_b64(), pn); n_active = svcntp_b64(svptrue_b64(), pn);
} while (svptest_any(svptrue_b64(), pn)); } while (svptest_any(svptrue_b64(), pn));
#else #else
pn = svwhilelt_b32(js, n); pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
n_active = svcntp_b32(svptrue_b32(), pn); n_active = svcntp_b32(svptrue_b32(), pn);
} while (svptest_any(svptrue_b32(), pn)); } while (svptest_any(svptrue_b32(), pn));
#endif #endif

View File

@ -1,5 +1,6 @@
/*********************************************************************/ /*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */ /* Copyright 2009, 2010 The University of Texas at Austin. */
/* Copyright 2023 The OpenBLAS Project */
/* All rights reserved. */ /* All rights reserved. */
/* */ /* */
/* Redistribution and use in source and binary forms, with or */ /* Redistribution and use in source and binary forms, with or */
@ -53,10 +54,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
FLOAT *ao; FLOAT *ao;
js = 0; js = 0;
#ifdef DOUBLE #ifdef DOUBLE
svbool_t pn = svwhilelt_b64(js, n); svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
int n_active = svcntp_b64(svptrue_b64(), pn); int n_active = svcntp_b64(svptrue_b64(), pn);
#else #else
svbool_t pn = svwhilelt_b32(js, n); svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
int n_active = svcntp_b32(svptrue_b32(), pn); int n_active = svcntp_b32(svptrue_b32(), pn);
#endif #endif
do do
@ -129,11 +130,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
posY += n_active; posY += n_active;
js += n_active; js += n_active;
#ifdef DOUBLE #ifdef DOUBLE
pn = svwhilelt_b64(js, n); pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
n_active = svcntp_b64(svptrue_b64(), pn); n_active = svcntp_b64(svptrue_b64(), pn);
} while (svptest_any(svptrue_b64(), pn)); } while (svptest_any(svptrue_b64(), pn));
#else #else
pn = svwhilelt_b32(js, n); pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
n_active = svcntp_b32(svptrue_b32(), pn); n_active = svcntp_b32(svptrue_b32(), pn);
} while (svptest_any(svptrue_b32(), pn)); } while (svptest_any(svptrue_b32(), pn));
#endif #endif

View File

@ -1,5 +1,6 @@
/*********************************************************************/ /*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */ /* Copyright 2009, 2010 The University of Texas at Austin. */
/* Copyright 2023 The OpenBLAS Project */
/* All rights reserved. */ /* All rights reserved. */
/* */ /* */
/* Redistribution and use in source and binary forms, with or */ /* Redistribution and use in source and binary forms, with or */
@ -54,11 +55,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
FLOAT *ao; FLOAT *ao;
#ifdef DOUBLE #ifdef DOUBLE
svint64_t index = svindex_s64(0LL, lda); svint64_t index = svindex_s64(0LL, lda);
svbool_t pn = svwhilelt_b64(js, n); svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
int n_active = svcntp_b64(svptrue_b64(), pn); int n_active = svcntp_b64(svptrue_b64(), pn);
#else #else
svint32_t index = svindex_s32(0, lda); svint32_t index = svindex_s32(0, lda);
svbool_t pn = svwhilelt_b32(js, n); svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
int n_active = svcntp_b32(svptrue_b32(), pn); int n_active = svcntp_b32(svptrue_b32(), pn);
#endif #endif
do do
@ -132,11 +133,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
posY += n_active; posY += n_active;
js += n_active; js += n_active;
#ifdef DOUBLE #ifdef DOUBLE
pn = svwhilelt_b64(js, n); pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
n_active = svcntp_b64(svptrue_b64(), pn); n_active = svcntp_b64(svptrue_b64(), pn);
} while (svptest_any(svptrue_b64(), pn)); } while (svptest_any(svptrue_b64(), pn));
#else #else
pn = svwhilelt_b32(js, n); pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
n_active = svcntp_b32(svptrue_b32(), pn); n_active = svcntp_b32(svptrue_b32(), pn);
} while (svptest_any(svptrue_b32(), pn)); } while (svptest_any(svptrue_b32(), pn));
#endif #endif

View File

@ -1,5 +1,6 @@
/*********************************************************************/ /*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */ /* Copyright 2009, 2010 The University of Texas at Austin. */
/* Copyright 2023 The OpenBLAS Project */
/* All rights reserved. */ /* All rights reserved. */
/* */ /* */
/* Redistribution and use in source and binary forms, with or */ /* Redistribution and use in source and binary forms, with or */
@ -53,10 +54,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
FLOAT *ao; FLOAT *ao;
js = 0; js = 0;
#ifdef DOUBLE #ifdef DOUBLE
svbool_t pn = svwhilelt_b64(js, n); svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
int n_active = svcntp_b64(svptrue_b64(), pn); int n_active = svcntp_b64(svptrue_b64(), pn);
#else #else
svbool_t pn = svwhilelt_b32(js, n); svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
int n_active = svcntp_b32(svptrue_b32(), pn); int n_active = svcntp_b32(svptrue_b32(), pn);
#endif #endif
do do
@ -128,11 +129,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
posY += n_active; posY += n_active;
js += n_active; js += n_active;
#ifdef DOUBLE #ifdef DOUBLE
pn = svwhilelt_b64(js, n); pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
n_active = svcntp_b64(svptrue_b64(), pn); n_active = svcntp_b64(svptrue_b64(), pn);
} while (svptest_any(svptrue_b64(), pn)); } while (svptest_any(svptrue_b64(), pn));
#else #else
pn = svwhilelt_b32(js, n); pn = svwhilelt_b32((uint64_t)js, (uint64_t)n);
n_active = svcntp_b32(svptrue_b32(), pn); n_active = svcntp_b32(svptrue_b32(), pn);
} while (svptest_any(svptrue_b32(), pn)); } while (svptest_any(svptrue_b32(), pn));
#endif #endif

View File

@ -1,5 +1,6 @@
/*********************************************************************/ /*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */ /* Copyright 2009, 2010 The University of Texas at Austin. */
/* Copyright 2023 The OpenBLAS Project */
/* All rights reserved. */ /* All rights reserved. */
/* */ /* */
/* Redistribution and use in source and binary forms, with or */ /* Redistribution and use in source and binary forms, with or */
@ -52,13 +53,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
#ifdef DOUBLE #ifdef DOUBLE
int64_t js = 0; int64_t js = 0;
svint64_t index = svindex_s64(0LL, lda); svint64_t index = svindex_s64(0LL, lda);
svbool_t pn = svwhilelt_b64(js, n); svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
int n_active = svcntp_b64(svptrue_b64(), pn); int n_active = svcntp_b64(svptrue_b64(), pn);
#else #else
int32_t N = n; int32_t N = n;
int32_t js = 0; int32_t js = 0;
svint32_t index = svindex_s32(0, lda); svint32_t index = svindex_s32(0, lda);
svbool_t pn = svwhilelt_b32(js, N); svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N);
int n_active = svcntp_b32(svptrue_b32(), pn); int n_active = svcntp_b32(svptrue_b32(), pn);
#endif #endif
do { do {
@ -106,11 +107,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
js += n_active; js += n_active;
#ifdef DOUBLE #ifdef DOUBLE
pn = svwhilelt_b64(js, n); pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
n_active = svcntp_b64(svptrue_b64(), pn); n_active = svcntp_b64(svptrue_b64(), pn);
} while (svptest_any(svptrue_b64(), pn)); } while (svptest_any(svptrue_b64(), pn));
#else #else
pn = svwhilelt_b32(js, N); pn = svwhilelt_b32((uint32_t)js, (uint32_t)N);
n_active = svcntp_b32(svptrue_b32(), pn); n_active = svcntp_b32(svptrue_b32(), pn);
} while (svptest_any(svptrue_b32(), pn)); } while (svptest_any(svptrue_b32(), pn));
#endif #endif

View File

@ -1,5 +1,6 @@
/*********************************************************************/ /*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */ /* Copyright 2009, 2010 The University of Texas at Austin. */
/* Copyright 2023 The OpenBLAS Project */
/* All rights reserved. */ /* All rights reserved. */
/* */ /* */
/* Redistribution and use in source and binary forms, with or */ /* Redistribution and use in source and binary forms, with or */
@ -51,12 +52,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
jj = offset; jj = offset;
#ifdef DOUBLE #ifdef DOUBLE
int64_t js = 0; int64_t js = 0;
svbool_t pn = svwhilelt_b64(js, n); svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
int n_active = svcntp_b64(svptrue_b64(), pn); int n_active = svcntp_b64(svptrue_b64(), pn);
#else #else
int32_t N = n; int32_t N = n;
int32_t js = 0; int32_t js = 0;
svbool_t pn = svwhilelt_b32(js, N); svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N);
int n_active = svcntp_b32(svptrue_b32(), pn); int n_active = svcntp_b32(svptrue_b32(), pn);
#endif #endif
do { do {
@ -102,11 +103,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
js += n_active; js += n_active;
#ifdef DOUBLE #ifdef DOUBLE
pn = svwhilelt_b64(js, n); pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
n_active = svcntp_b64(svptrue_b64(), pn); n_active = svcntp_b64(svptrue_b64(), pn);
} while (svptest_any(svptrue_b64(), pn)); } while (svptest_any(svptrue_b64(), pn));
#else #else
pn = svwhilelt_b32(js, N); pn = svwhilelt_b32((uint32_t)js, (uint32_t)N);
n_active = svcntp_b32(svptrue_b32(), pn); n_active = svcntp_b32(svptrue_b32(), pn);
} while (svptest_any(svptrue_b32(), pn)); } while (svptest_any(svptrue_b32(), pn));
#endif #endif

View File

@ -1,5 +1,6 @@
/*********************************************************************/ /*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */ /* Copyright 2009, 2010 The University of Texas at Austin. */
/* Copyright 2023 The OpenBLAS Project */
/* All rights reserved. */ /* All rights reserved. */
/* */ /* */
/* Redistribution and use in source and binary forms, with or */ /* Redistribution and use in source and binary forms, with or */
@ -52,13 +53,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
#ifdef DOUBLE #ifdef DOUBLE
int64_t js = 0; int64_t js = 0;
svint64_t index = svindex_s64(0LL, lda); svint64_t index = svindex_s64(0LL, lda);
svbool_t pn = svwhilelt_b64(js, n); svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
int n_active = svcntp_b64(svptrue_b64(), pn); int n_active = svcntp_b64(svptrue_b64(), pn);
#else #else
int32_t N = n; int32_t N = n;
int32_t js = 0; int32_t js = 0;
svint32_t index = svindex_s32(0, lda); svint32_t index = svindex_s32(0, lda);
svbool_t pn = svwhilelt_b32(js, N); svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N);
int n_active = svcntp_b32(svptrue_b32(), pn); int n_active = svcntp_b32(svptrue_b32(), pn);
#endif #endif
do { do {
@ -106,11 +107,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
js += n_active; js += n_active;
#ifdef DOUBLE #ifdef DOUBLE
pn = svwhilelt_b64(js, n); pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
n_active = svcntp_b64(svptrue_b64(), pn); n_active = svcntp_b64(svptrue_b64(), pn);
} while (svptest_any(svptrue_b64(), pn)); } while (svptest_any(svptrue_b64(), pn));
#else #else
pn = svwhilelt_b32(js, N); pn = svwhilelt_b32((uint32_t)js, (uint32_t)N);
n_active = svcntp_b32(svptrue_b32(), pn); n_active = svcntp_b32(svptrue_b32(), pn);
} while (svptest_any(svptrue_b32(), pn)); } while (svptest_any(svptrue_b32(), pn));
#endif #endif

View File

@ -1,5 +1,6 @@
/*********************************************************************/ /*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */ /* Copyright 2009, 2010 The University of Texas at Austin. */
/* Copyright 2023 The OpenBLAS Project */
/* All rights reserved. */ /* All rights reserved. */
/* */ /* */
/* Redistribution and use in source and binary forms, with or */ /* Redistribution and use in source and binary forms, with or */
@ -51,12 +52,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
jj = offset; jj = offset;
#ifdef DOUBLE #ifdef DOUBLE
int64_t js = 0; int64_t js = 0;
svbool_t pn = svwhilelt_b64(js, n); svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
int n_active = svcntp_b64(svptrue_b64(), pn); int n_active = svcntp_b64(svptrue_b64(), pn);
#else #else
int32_t N = n; int32_t N = n;
int32_t js = 0; int32_t js = 0;
svbool_t pn = svwhilelt_b32(js, N); svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N);
int n_active = svcntp_b32(svptrue_b32(), pn); int n_active = svcntp_b32(svptrue_b32(), pn);
#endif #endif
do { do {
@ -102,11 +103,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
js += n_active; js += n_active;
#ifdef DOUBLE #ifdef DOUBLE
pn = svwhilelt_b64(js, n); pn = svwhilelt_b64((uint64_t)js, (uint64_t)n);
n_active = svcntp_b64(svptrue_b64(), pn); n_active = svcntp_b64(svptrue_b64(), pn);
} while (svptest_any(svptrue_b64(), pn)); } while (svptest_any(svptrue_b64(), pn));
#else #else
pn = svwhilelt_b32(js, N); pn = svwhilelt_b32((uint32_t)js, (uint32_t)N);
n_active = svcntp_b32(svptrue_b32(), pn); n_active = svcntp_b32(svptrue_b32(), pn);
} while (svptest_any(svptrue_b32(), pn)); } while (svptest_any(svptrue_b32(), pn));
#endif #endif

0
kernel/generic/ztrmmkernel_4x4.c Executable file → Normal file
View File

View File

@ -43,7 +43,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
BLASLONG i, ii, j, jj; BLASLONG i, ii, j, jj;
FLOAT data01, data02; FLOAT data01=0.0, data02=0.0;
FLOAT *a1; FLOAT *a1;
lda *= 2; lda *= 2;

View File

@ -47,6 +47,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
FLOAT data05, data06, data07, data08; FLOAT data05, data06, data07, data08;
FLOAT *a1, *a2; FLOAT *a1, *a2;
data01=data02=data07=data08=0.0;
lda *= 2; lda *= 2;
jj = offset; jj = offset;

View File

@ -1,3 +1,4 @@
ifndef NO_LASX
DGEMMKERNEL = dgemm_kernel_16x4.S DGEMMKERNEL = dgemm_kernel_16x4.S
DGEMMINCOPY = dgemm_ncopy_16.S DGEMMINCOPY = dgemm_ncopy_16.S
DGEMMITCOPY = dgemm_tcopy_16.S DGEMMITCOPY = dgemm_tcopy_16.S
@ -8,7 +9,26 @@ DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
DGEMVNKERNEL = dgemv_n_8_lasx.S
DGEMVTKERNEL = dgemv_t_8_lasx.S
SGEMMKERNEL = sgemm_kernel_16x8_lasx.S
SGEMMINCOPY = sgemm_ncopy_16_lasx.S
SGEMMITCOPY = sgemm_tcopy_16_lasx.S
SGEMMONCOPY = sgemm_ncopy_8_lasx.S
SGEMMOTCOPY = sgemm_tcopy_8_lasx.S
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
endif
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

View File

@ -132,12 +132,16 @@ CSWAPKERNEL = ../arm/zswap.c
ZSWAPKERNEL = ../arm/zswap.c ZSWAPKERNEL = ../arm/zswap.c
SGEMVNKERNEL = ../arm/gemv_n.c SGEMVNKERNEL = ../arm/gemv_n.c
ifndef DGEMVNKERNEL
DGEMVNKERNEL = ../arm/gemv_n.c DGEMVNKERNEL = ../arm/gemv_n.c
endif
CGEMVNKERNEL = ../arm/zgemv_n.c CGEMVNKERNEL = ../arm/zgemv_n.c
ZGEMVNKERNEL = ../arm/zgemv_n.c ZGEMVNKERNEL = ../arm/zgemv_n.c
SGEMVTKERNEL = ../arm/gemv_t.c SGEMVTKERNEL = ../arm/gemv_t.c
ifndef DGEMVTKERNEL
DGEMVTKERNEL = ../arm/gemv_t.c DGEMVTKERNEL = ../arm/gemv_t.c
endif
CGEMVTKERNEL = ../arm/zgemv_t.c CGEMVTKERNEL = ../arm/zgemv_t.c
ZGEMVTKERNEL = ../arm/zgemv_t.c ZGEMVTKERNEL = ../arm/zgemv_t.c

View File

@ -61,7 +61,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fmov.d s2, s1 fmov.d s2, s1
bge $r0, N, .L999 bge $r0, N, .L999
slli.d INCX, INCX, ZBASE_SHIFT slli.d INCX, INCX, ZBASE_SHIFT
bge $r0, INCX, .L999 beq $r0, INCX, .L999
srai.d I, N, 2 srai.d I, N, 2
bge $r0, I, .L25 bge $r0, I, .L25
LD a1, X, 0 * SIZE LD a1, X, 0 * SIZE

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,546 @@
/*******************************************************************************
Copyright (c) 2023, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#define ASSEMBLER
#include "common.h"
#include "loongarch64_asm.S"
/*********************************************************************
* 2023/07/14 guxiwei
* UTEST : OK
* CTEST : OK
* TEST : OK
*
*
*********************************************************************/
/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha,
* FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
*/
/* Incoming arguments, named after the CNAME prototype above.
 * INC_Y does not arrive in $r6: the PROLOGUE loads it from the stack
 * (PTR_LD INC_Y, $sp, 0), reusing what is presumably the dummy1 slot. */
#define M $r4
#define N $r5
#define ALPHA $f0
#define A $r7
#define LDA $r8
#define X $r9
#define INC_X $r10
#define Y $r11
#define INC_Y $r6
/* J: column-block counter, I: row counter (both also PROLOGUE temporaries). */
#define J $r12
#define I $r13
/* K: PROLOGUE temporary; inside DGEMV_N it is cleared and incremented per
 * processed row but never read in this file. */
#define K $r14
/* Y_ORG: start of y, restored into Y at the beginning of every column block. */
#define Y_ORG $r15
/* OFFSET is defined but not referenced anywhere in this file. */
#define OFFSET $r16
/* K_LDA: byte step that moves a column pointer from the end of the current
 * column to the start of the same column in the next block (k * LDA - M8). */
#define K_LDA $r17
/* M8: M * 8, i.e. the byte length of one column (set in the PROLOGUE). */
#define M8 $r18
/* T0: scratch address register. */
#define T0 $r19
/* PA0..PA7: pointers into eight consecutive columns of A. */
#define PA0 $r20
#define PA1 $r23
#define PA2 $r24
#define PA3 $r25
#define PA4 $r26
#define PA5 $r27
#define PA6 $r28
#define PA7 $r29
/* Vector registers. The scalar paths below address the same registers by
 * their $fN names (e.g. $f10 for Y0, $f2..$f9 for X0..X7, $f12.. for A0..). */
/* VALPHA: alpha replicated from $xr0 in the PROLOGUE. */
#define VALPHA $xr1
/* X0..X7: alpha-scaled elements of x for the current column block. */
#define X0 $xr2
#define X1 $xr3
#define X2 $xr4
#define X3 $xr5
#define X4 $xr6
#define X5 $xr7
#define X6 $xr8
#define X7 $xr9
/* Y0/Y1: accumulators holding up to eight elements of y. */
#define Y0 $xr10
#define Y1 $xr11
/* A0..A15: loaded pieces of A (two vectors per column in the 8-row kernels);
 * A1..A3 and A5..A7 double as staging registers in DLOAD_Y_*_GAP. */
#define A0 $xr12
#define A1 $xr13
#define A2 $xr14
#define A3 $xr15
#define A4 $xr16
#define A5 $xr17
#define A6 $xr18
#define A7 $xr19
#define A8 $xr20
#define A9 $xr21
#define A10 $xr22
#define A11 $xr23
#define A12 $xr24
#define A13 $xr25
#define A14 $xr26
#define A15 $xr27
// DLOAD_X_<n>: contiguous-x loaders (used when inc_x == 1).
// Each reads n consecutive doubles at X + 0x00, 0x08, ... into X0..X(n-1)
// through GLDREPL and multiplies every one by VALPHA through GMUL, so the
// kernels can accumulate A * (alpha * x) directly.
// GLDREPL / GMUL come from loongarch64_asm.S; presumably GLDREPL expands to
// xvldrepl.d (load one element replicated to all lanes), matching the
// explicit xvldrepl.d used by the _GAP variants - TODO confirm.
.macro DLOAD_X_8
GLDREPL xv, d, X0, X, 0x00, X1, X, 0x08, X2, X, 0x10, X3, X, 0x18, \
X4, X, 0x20, X5, X, 0x28, X6, X, 0x30, X7, X, 0x38
GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA, X2, X2, VALPHA, X3, X3, VALPHA, \
X4, X4, VALPHA, X5, X5, VALPHA, X6, X6, VALPHA, X7, X7, VALPHA
.endm
// Four elements of x, scaled by alpha.
.macro DLOAD_X_4
GLDREPL xv, d, X0, X, 0x00, X1, X, 0x08, X2, X, 0x10, X3, X, 0x18
GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA, X2, X2, VALPHA, X3, X3, VALPHA
.endm
// Two elements of x, scaled by alpha.
.macro DLOAD_X_2
GLDREPL xv, d, X0, X, 0x00, X1, X, 0x08
GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA
.endm
// One element of x, scaled by alpha. Reads only X + 0, so the driver also
// uses it for the strided case.
.macro DLOAD_X_1
GLDREPL xv, d, X0, X, 0x00
GMUL xvf, d, X0, X0, VALPHA
.endm
// Contiguous-y (inc_y == 1) load/store helpers.
// GLD / GST come from loongarch64_asm.S; with the "xv" prefix they
// presumably expand to 256-bit xvld / xvst - TODO confirm.
// Load y[0..7] into Y0 (offset 0) and Y1 (offset 0x20).
.macro DLOAD_Y_8
GLD xv, , Y0, Y, 0, Y1, Y, 0x20
.endm
// Load y[0..3] into Y0.
.macro DLOAD_Y_4
GLD xv, , Y0, Y, 0
.endm
// Load a single y element into $f10 (the scalar name used for Y0).
// It reads only Y + 0, so it serves both the contiguous and strided cases.
.macro DLOAD_Y_1
fld.d $f10, Y, 0
.endm
// Store Y0 / Y1 back to y[0..7].
.macro DSTORE_Y_8
GST xv, , Y0, Y, 0, Y1, Y, 0x20
.endm
// Store Y0 back to y[0..3].
.macro DSTORE_Y_4
GST xv, , Y0, Y, 0
.endm
// Store the single y element held in $f10.
.macro DSTORE_Y_1
fst.d $f10, Y, 0
.endm
// Strided-y (inc_y != 1) variants: the elements of y are INC_Y bytes apart,
// so full-width vector load/store instructions cannot be used.
// INC_Y is already a byte stride here (the PROLOGUE shifts it left by 3).
//
// DLOAD_Y_8_GAP: gather y[0..7] element by element.
//   y[0] -> $f10 (Y0)   y[1] -> $f13 (A1)   y[2] -> $f14 (A2)   y[3] -> $f15 (A3)
//   y[4] -> $f11 (Y1)   y[5] -> $f17 (A5)   y[6] -> $f18 (A6)   y[7] -> $f19 (A7)
// GINSVE0 then packs the staged scalars into elements 1..3 of Y0 and Y1
// (presumably xvinsve0.d; macro defined in loongarch64_asm.S - TODO confirm).
// Clobbers T0 and A1..A3, A5..A7.
.macro DLOAD_Y_8_GAP
fld.d $f10, Y, 0
fldx.d $f13, Y, INC_Y
// T0 = Y + 2 * INC_Y
PTR_ALSL T0, INC_Y, Y, 1
fld.d $f14, T0, 0
fldx.d $f15, T0, INC_Y
// T0 = Y + 4 * INC_Y
PTR_ALSL T0, INC_Y, Y, 2
fld.d $f11, T0, 0
fldx.d $f17, T0, INC_Y
// T0 = Y + 6 * INC_Y
PTR_ADD T0, T0, INC_Y
PTR_ADD T0, T0, INC_Y
fld.d $f18, T0, 0
fldx.d $f19, T0, INC_Y
GINSVE0 xv, d, Y0, A1, 1, Y0, A2, 2, Y0, A3, 3, Y1, A5, 1, Y1, A6, 2, Y1, A7, 3
.endm
// DLOAD_Y_4_GAP: same gather for y[0..3] only, packed into Y0.
// Clobbers T0 and A1..A3.
.macro DLOAD_Y_4_GAP
fld.d $f10, Y, 0
fldx.d $f13, Y, INC_Y
// T0 = Y + 2 * INC_Y
PTR_ALSL T0, INC_Y, Y, 1
fld.d $f14, T0, 0
fldx.d $f15, T0, INC_Y
GINSVE0 xv, d, Y0, A1, 1, Y0, A2, 2, Y0, A3, 3
.endm
// DSTORE_Y_8_GAP: scatter elements 0..3 of Y0 and then of Y1 to y[0..7],
// advancing T0 by INC_Y between element stores. Clobbers T0.
.macro DSTORE_Y_8_GAP
xvstelm.d Y0, Y, 0, 0
PTR_ADD T0, Y, INC_Y
xvstelm.d Y0, T0, 0, 1
PTR_ADD T0, T0, INC_Y
xvstelm.d Y0, T0, 0, 2
PTR_ADD T0, T0, INC_Y
xvstelm.d Y0, T0, 0, 3
PTR_ADD T0, T0, INC_Y
xvstelm.d Y1, T0, 0, 0
PTR_ADD T0, T0, INC_Y
xvstelm.d Y1, T0, 0, 1
PTR_ADD T0, T0, INC_Y
xvstelm.d Y1, T0, 0, 2
PTR_ADD T0, T0, INC_Y
xvstelm.d Y1, T0, 0, 3
.endm
// DSTORE_Y_4_GAP: scatter elements 0..3 of Y0 to y[0..3]. Clobbers T0.
.macro DSTORE_Y_4_GAP
xvstelm.d Y0, Y, 0, 0
PTR_ADD T0, Y, INC_Y
xvstelm.d Y0, T0, 0, 1
PTR_ADD T0, T0, INC_Y
xvstelm.d Y0, T0, 0, 2
PTR_ADD T0, T0, INC_Y
xvstelm.d Y0, T0, 0, 3
.endm
// Strided-x (inc_x != 1) loaders. INC_X is a byte stride here (shifted left
// by 3 in the PROLOGUE). Each element is fetched with xvldrepl.d from an
// address stepped through T0, then all are scaled by VALPHA, mirroring the
// contiguous DLOAD_X_<n> macros. X itself is not modified; T0 is clobbered.
// Eight elements: x[0], x[inc_x], ..., x[7 * inc_x] -> X0..X7.
.macro DLOAD_X_8_GAP
xvldrepl.d X0, X, 0x00
PTR_ADD T0, X, INC_X
xvldrepl.d X1, T0, 0x00
PTR_ADD T0, T0, INC_X
xvldrepl.d X2, T0, 0x00
PTR_ADD T0, T0, INC_X
xvldrepl.d X3, T0, 0x00
PTR_ADD T0, T0, INC_X
xvldrepl.d X4, T0, 0x00
PTR_ADD T0, T0, INC_X
xvldrepl.d X5, T0, 0x00
PTR_ADD T0, T0, INC_X
xvldrepl.d X6, T0, 0x00
PTR_ADD T0, T0, INC_X
xvldrepl.d X7, T0, 0x00
GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA, X2, X2, VALPHA, X3, X3, VALPHA, \
X4, X4, VALPHA, X5, X5, VALPHA, X6, X6, VALPHA, X7, X7, VALPHA
.endm
// Four strided elements -> X0..X3.
.macro DLOAD_X_4_GAP
xvldrepl.d X0, X, 0x00
PTR_ADD T0, X, INC_X
xvldrepl.d X1, T0, 0x00
PTR_ADD T0, T0, INC_X
xvldrepl.d X2, T0, 0x00
PTR_ADD T0, T0, INC_X
xvldrepl.d X3, T0, 0x00
GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA, X2, X2, VALPHA, X3, X3, VALPHA
.endm
// Two strided elements -> X0, X1.
.macro DLOAD_X_2_GAP
xvldrepl.d X0, X, 0x00
PTR_ADD T0, X, INC_X
xvldrepl.d X1, T0, 0x00
GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA
.endm
// Micro-kernels DGEMV_N_<rows>x<cols>: accumulate a rows-by-cols tile of A
// times the pre-scaled x values (X0..X7) into the y accumulators.
//   8-row kernels: two vectors per column, accumulating into Y0 and Y1.
//   4-row kernels: one vector per column, accumulating into Y0.
//   1-row kernels: scalar path on $f10, using $f2..$f9 (X0..X7) as x.
// GLD_INC and GMADD are defined in loongarch64_asm.S. GLD_INC appears to
// load and then advance the pointer by its step argument (0x20 or 0x08):
// DGEMV_N_1x1 spells the same load + PTR_ADDI out by hand, and the K_LDA
// arithmetic in DGEMV_N assumes each PAx has moved by 8 bytes per row.
// GMADD presumably expands to one fused multiply-add per operand group
// (dst = a * b + c) - TODO confirm against the macro definitions.
//
// 8 rows x 8 columns.
.macro DGEMV_N_8x8
GLD_INC xv, , 0x20, \
A0, PA0, 0, A1, PA0, 0, \
A2, PA1, 0, A3, PA1, 0, \
A4, PA2, 0, A5, PA2, 0, \
A6, PA3, 0, A7, PA3, 0, \
A8, PA4, 0, A9, PA4, 0, \
A10, PA5, 0, A11, PA5, 0, \
A12, PA6, 0, A13, PA6, 0, \
A14, PA7, 0, A15, PA7, 0
GMADD xvf, d, Y0, A0, X0, Y0, Y1, A1, X0, Y1, \
Y0, A2, X1, Y0, Y1, A3, X1, Y1, \
Y0, A4, X2, Y0, Y1, A5, X2, Y1, \
Y0, A6, X3, Y0, Y1, A7, X3, Y1, \
Y0, A8, X4, Y0, Y1, A9, X4, Y1, \
Y0, A10, X5, Y0, Y1, A11, X5, Y1, \
Y0, A12, X6, Y0, Y1, A13, X6, Y1, \
Y0, A14, X7, Y0, Y1, A15, X7, Y1
.endm
// 4 rows x 8 columns.
.macro DGEMV_N_4x8
GLD_INC xv, , 0x20, A0, PA0, 0, \
A2, PA1, 0, \
A4, PA2, 0, \
A6, PA3, 0, \
A8, PA4, 0, \
A10, PA5, 0, \
A12, PA6, 0, \
A14, PA7, 0
GMADD xvf, d, Y0, A0, X0, Y0, \
Y0, A2, X1, Y0, \
Y0, A4, X2, Y0, \
Y0, A6, X3, Y0, \
Y0, A8, X4, Y0, \
Y0, A10, X5, Y0, \
Y0, A12, X6, Y0, \
Y0, A14, X7, Y0
.endm
// 1 row x 8 columns (scalar).
// NOTE(review): the GMADD operand list below ends with a trailing comma;
// confirm the assembler treats the resulting empty vararg as end of list.
.macro DGEMV_N_1x8
GLD_INC f, d, 0x08, $f12, PA0, 0, $f14, PA1, 0, $f16, PA2, 0, $f18, PA3, 0, \
$f20, PA4, 0, $f22, PA5, 0, $f24, PA6, 0, $f26, PA7, 0
GMADD f, d, $f10, $f12, $f2, $f10, \
$f10, $f14, $f3, $f10, \
$f10, $f16, $f4, $f10, \
$f10, $f18, $f5, $f10, \
$f10, $f20, $f6, $f10, \
$f10, $f22, $f7, $f10, \
$f10, $f24, $f8, $f10, \
$f10, $f26, $f9, $f10,
.endm
// 8 rows x 4 columns.
.macro DGEMV_N_8x4
GLD_INC xv, , 0x20, \
A0, PA0, 0, A1, PA0, 0, \
A2, PA1, 0, A3, PA1, 0, \
A4, PA2, 0, A5, PA2, 0, \
A6, PA3, 0, A7, PA3, 0
GMADD xvf, d, Y0, A0, X0, Y0, Y1, A1, X0, Y1, \
Y0, A2, X1, Y0, Y1, A3, X1, Y1, \
Y0, A4, X2, Y0, Y1, A5, X2, Y1, \
Y0, A6, X3, Y0, Y1, A7, X3, Y1
.endm
// 4 rows x 4 columns.
.macro DGEMV_N_4x4
GLD_INC xv, , 0x20, A0, PA0, 0, A2, PA1, 0, A4, PA2, 0, A6, PA3, 0
GMADD xvf, d, Y0, A0, X0, Y0, Y0, A2, X1, Y0, \
Y0, A4, X2, Y0, Y0, A6, X3, Y0
.endm
// 1 row x 4 columns (scalar).
.macro DGEMV_N_1x4
GLD_INC f, d, 0x08, $f12, PA0, 0, $f14, PA1, 0, $f16, PA2, 0, $f18, PA3, 0
GMADD f, d, $f10, $f12, $f2, $f10, $f10, $f14, $f3, $f10, \
$f10, $f16, $f4, $f10, $f10, $f18, $f5, $f10
.endm
// 8 rows x 2 columns.
.macro DGEMV_N_8x2
GLD_INC xv, , 0x20, \
A0, PA0, 0, A1, PA0, 0, \
A2, PA1, 0, A3, PA1, 0
GMADD xvf, d, Y0, A0, X0, Y0, Y1, A1, X0, Y1, \
Y0, A2, X1, Y0, Y1, A3, X1, Y1
.endm
// 4 rows x 2 columns.
.macro DGEMV_N_4x2
GLD_INC xv, , 0x20, A0, PA0, 0, A2, PA1, 0
GMADD xvf, d, Y0, A0, X0, Y0, \
Y0, A2, X1, Y0
.endm
// 1 row x 2 columns (scalar).
.macro DGEMV_N_1x2
GLD_INC f, d, 0x08, $f12, PA0, 0, $f14, PA1, 0
GMADD f, d, $f10, $f12, $f2, $f10, \
$f10, $f14, $f3, $f10
.endm
// 1 row x 1 column (scalar): $f10 += a * (alpha * x), then PA0 += 8 bytes.
.macro DGEMV_N_1x1
fld.d $f12, PA0, 0
PTR_ADDI PA0, PA0, 0x08
fmadd.d $f10, $f12, $f2, $f10
.endm
/*
 * DGEMV_N XW, X_8, X_4, X_2, X_1, Y_8, Y_4, Y_1
 *
 * Complete y += A * (alpha * x) loop nest for one combination of x / y
 * strides. A is walked column by column through PA0..PA7 (columns are LDA
 * bytes apart, rows are contiguous doubles).
 *
 *   XW          unique tag pasted into every local label so the macro can be
 *               expanded several times in one file
 *   X_8..X_1    suffixes selecting the DLOAD_<suffix> macro that loads and
 *               alpha-scales 8 / 4 / 2 / 1 elements of x
 *   Y_8, Y_4,   suffixes selecting the DLOAD_<suffix> / DSTORE_<suffix>
 *   Y_1         macros for 8 / 4 / 1 elements of y
 *
 * Columns are consumed in blocks of 8, then 4, 2 and 1 (bits of N). Inside
 * each column block the rows are consumed in blocks of 8, then 4, then one
 * at a time, restarting from Y_ORG. Every path ends at .L_END.
 *
 * Expects LDA, INC_X, INC_Y and M8 to be byte quantities already and
 * PA0..PA7 to point at the first eight columns (all set up by the caller).
 * K is cleared and incremented as a row counter but is not read anywhere.
 */
.macro DGEMV_N XW:req, X_8:req, X_4:req, X_2:req, X_1:req, Y_8:req, Y_4:req, Y_1:req
// ---------------- column blocks of 8 ----------------
PTR_SRLI J, N, 3
beqz J, .L_\XW\()_N_7
// K_LDA = 8 * LDA - M8: after a full pass over M rows each PAx has advanced
// by M8 bytes, so adding K_LDA lands on the same row 0 eight columns later.
PTR_SLLI K_LDA, LDA, 3
PTR_SUB K_LDA, K_LDA, M8
.L_\XW\()_N_L8:
DLOAD_\X_8
xor K, K, K
move Y, Y_ORG
PTR_SRLI I, M, 3
beqz I, .L_\XW\()_M_7
.align 5
// 8 rows per iteration
.L_\XW\()_M_L8:
DLOAD_\Y_8
DGEMV_N_8x8
DSTORE_\Y_8
PTR_ADDI I, I, -1
PTR_ALSL Y, INC_Y, Y, 3
PTR_ADDI K, K, 8
bnez I, .L_\XW\()_M_L8
// optional block of 4 rows
.L_\XW\()_M_7:
andi I, M, 4
beqz I, .L_\XW\()_M_3
DLOAD_\Y_4
DGEMV_N_4x8
DSTORE_\Y_4
PTR_ALSL Y, INC_Y, Y, 2
PTR_ADDI K, K, 4
// remaining 0..3 rows, one at a time
.L_\XW\()_M_3:
andi I, M, 3
beqz I, .L_\XW\()_M_END
.align 5
.L_\XW\()_M_L1:
DLOAD_\Y_1
DGEMV_N_1x8
DSTORE_\Y_1
PTR_ADDI I, I, -1
PTR_ADD Y, Y, INC_Y
PTR_ADDI K, K, 1
bnez I, .L_\XW\()_M_L1
// move all eight column pointers and x to the next block of 8 columns
.L_\XW\()_M_END:
PTR_ADDI J, J, -1
#if __loongarch_grlen == 64
GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \
PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA
#else
GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \
PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA
#endif
PTR_ALSL X, INC_X, X, 3
bnez J, .L_\XW\()_N_L8
// ---------------- optional block of 4 columns ----------------
.L_\XW\()_N_7:
andi J, N, 4
beqz J, .L_\XW\()_N_3
DLOAD_\X_4
xor K, K, K
move Y, Y_ORG
PTR_SRLI I, M, 3
beqz I, .L_\XW\()_N_4_M_7
.align 5
.L_\XW\()_N_4_M_L8:
DLOAD_\Y_8
DGEMV_N_8x4
DSTORE_\Y_8
PTR_ADDI I, I, -1
PTR_ADDI K, K, 8
PTR_ALSL Y, INC_Y, Y, 3
bnez I, .L_\XW\()_N_4_M_L8
.L_\XW\()_N_4_M_7:
andi I, M, 4
beqz I, .L_\XW\()_N_4_M_3
DLOAD_\Y_4
DGEMV_N_4x4
DSTORE_\Y_4
PTR_ALSL Y, INC_Y, Y, 2
PTR_ADDI K, K, 4
.L_\XW\()_N_4_M_3:
andi I, M, 3
beqz I, .L_\XW\()_N_4_M_END
.align 5
.L_\XW\()_N_4_M_L1:
DLOAD_\Y_1
DGEMV_N_1x4
DSTORE_\Y_1
PTR_ADDI I, I, -1
PTR_ADD Y, Y, INC_Y
PTR_ADDI K, K, 1
bnez I, .L_\XW\()_N_4_M_L1
// K_LDA = 4 * LDA - M8: step PA0..PA3 past the four columns just consumed
.L_\XW\()_N_4_M_END:
PTR_SLLI K_LDA, LDA, 2
PTR_SUB K_LDA, K_LDA, M8
// The 32-bit branch uses the word-sized add, like every other pointer update
// guarded by __loongarch_grlen (it previously repeated the "d" form).
#if __loongarch_grlen == 64
GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
#else
GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
#endif
PTR_ALSL X, INC_X, X, 2
// ---------------- optional block of 2 columns ----------------
.L_\XW\()_N_3:
andi J, N, 2
beqz J, .L_\XW\()_N_1
DLOAD_\X_2
xor K, K, K
move Y, Y_ORG
PTR_SRLI I, M, 3
beqz I, .L_\XW\()_N_2_M_7
.align 5
.L_\XW\()_N_2_M_L8:
DLOAD_\Y_8
DGEMV_N_8x2
DSTORE_\Y_8
PTR_ADDI I, I, -1
PTR_ADDI K, K, 8
PTR_ALSL Y, INC_Y, Y, 3
bnez I, .L_\XW\()_N_2_M_L8
.L_\XW\()_N_2_M_7:
andi I, M, 4
beqz I, .L_\XW\()_N_2_M_3
DLOAD_\Y_4
DGEMV_N_4x2
DSTORE_\Y_4
PTR_ALSL Y, INC_Y, Y, 2
PTR_ADDI K, K, 4
.L_\XW\()_N_2_M_3:
andi I, M, 3
beqz I, .L_\XW\()_N_2_M_END
.align 5
.L_\XW\()_N_2_M_L1:
DLOAD_\Y_1
DGEMV_N_1x2
DSTORE_\Y_1
PTR_ADDI I, I, -1
PTR_ADD Y, Y, INC_Y
PTR_ADDI K, K, 1
bnez I, .L_\XW\()_N_2_M_L1
// K_LDA = 2 * LDA - M8: step PA0 / PA1 past the two columns just consumed
.L_\XW\()_N_2_M_END:
PTR_SLLI K_LDA, LDA, 1
PTR_SUB K_LDA, K_LDA, M8
PTR_ADD PA0, PA0, K_LDA
PTR_ADD PA1, PA1, K_LDA
PTR_ALSL X, INC_X, X, 1
// ---------------- optional last single column ----------------
.L_\XW\()_N_1:
andi J, N, 1
beqz J, .L_END
DLOAD_\X_1
xor K, K, K
move Y, Y_ORG
move I, M
// nothing to do when there are no rows
beqz I, .L_END
.align 5
.L_\XW\()_N_1_M_L1:
DLOAD_\Y_1
DGEMV_N_1x1
DSTORE_\Y_1
PTR_ADDI I, I, -1
PTR_ADD Y, Y, INC_Y
PTR_ADDI K, K, 1
bnez I, .L_\XW\()_N_1_M_L1
b .L_END
.endm
// Kernel entry point: normalises the arguments, sets up the eight column
// pointers and dispatches to one of four DGEMV_N expansions depending on
// whether inc_x and inc_y equal 1.
PROLOGUE
// inc_y is read from the stack, before push_if_used runs
PTR_LD INC_Y, $sp, 0
// push_if_used / pop_if_used are defined elsewhere; presumably they save and
// restore the callee-saved registers this kernel uses - TODO confirm.
push_if_used 17 + 7, 24 + 4
// K = 1, then I = inc_x - 1 and J = inc_y - 1
PTR_ADDI K, $r0, 0x01
PTR_SUB I, INC_X, K
PTR_SUB J, INC_Y, K
maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */
maskeqz J, K, J /* if(inc_y == 1) j = 0; else j = 1; */
// I = 2 * I + J: index 0..3 into .L_GAP_TABLE
PTR_ALSL I, I, J, 1
// Convert element counts to byte counts (shift by 3 = times 8, one double):
// LDA, INC_X and INC_Y in place, and M8 = M * 8.
GSLLI , d, LDA, LDA, 3, INC_X, INC_X, 3, INC_Y, INC_Y, 3, M8, M, 3
// Replicate alpha (element 0 of $xr0, i.e. $f0) across VALPHA
xvreplve0.d VALPHA, $xr0
move Y_ORG, Y
move PA0, A
// PA1..PA7 = the next seven columns, each LDA bytes after the previous one
#if __loongarch_grlen == 64
GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \
PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA
#else
GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \
PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA
#endif
// Table dispatch: each entry is a 16-bit offset from .L_GAP_TABLE, so the
// entry address is T0 + 2 * I and the jump target is T0 + that offset.
la.local T0, .L_GAP_TABLE
PTR_ALSL I, I, T0, 1
ld.h K, I, 0
PTR_ADD T0, T0, K
jirl $r0, T0, 0
.L_GAP_TABLE:
.hword .L_GAP_0_0 - .L_GAP_TABLE
.hword .L_GAP_0_1 - .L_GAP_TABLE
.hword .L_GAP_1_0 - .L_GAP_TABLE
.hword .L_GAP_1_1 - .L_GAP_TABLE
// Every DGEMV_N expansion ends with a branch to .L_END, so the variants do
// not fall through into each other. The single-element X_1 / Y_1 macros are
// shared by all variants because they only touch offset 0.
.L_GAP_0_0: /* if (inc_x == 1) && (incy == 1) */
DGEMV_N GAP_0_0, X_8, X_4, X_2, X_1, Y_8, Y_4, Y_1
.L_GAP_0_1: /* if (inc_x == 1) && (incy != 1) */
DGEMV_N GAP_0_1, X_8, X_4, X_2, X_1, Y_8_GAP, Y_4_GAP, Y_1
.L_GAP_1_0: /* if (inc_x != 1) && (incy == 1) */
DGEMV_N GAP_1_0, X_8_GAP, X_4_GAP, X_2_GAP, X_1, Y_8, Y_4, Y_1
.L_GAP_1_1: /* if (inc_x != 1) && (incy != 1) */
DGEMV_N GAP_1_1, X_8_GAP, X_4_GAP, X_2_GAP, X_1, Y_8_GAP, Y_4_GAP, Y_1
// Common exit: undo push_if_used and return through $r1
.L_END:
pop_if_used 17 + 7, 24 + 4
jirl $r0, $r1, 0x0
EPILOGUE

View File

@ -0,0 +1,468 @@
/*******************************************************************************
Copyright (c) 2023, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#define ASSEMBLER
#include "common.h"
#include "loongarch64_asm.S"
/*********************************************************************
* 2023/07/17 guxiwei
* UTEST : OK
* CTEST : OK
* TEST : OK
*
*
*********************************************************************/
/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha,
* FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
*/
/* Incoming arguments, named after the CNAME prototype above.
 * INC_Y does not arrive in $r6: the PROLOGUE loads it from the stack. */
#define M $r4
#define N $r5
#define ALPHA $f0
#define A $r7
#define LDA $r8
#define X $r9
#define INC_X $r10
#define Y $r11
#define INC_Y $r6
/* J: column-block counter, I: row counter. */
#define J $r12
#define I $r13
/* K and PY0 share $r14: K is only a PROLOGUE temporary, PY0 is used later
 * as a y pointer inside DGEMV_T. */
#define K $r14
#define PY0 $r14
/* X_ORG: start of x, restored into X at the beginning of every column block. */
#define X_ORG $r15
/* PY0..PY2: y + 2, 4 and 6 strides, used for the strided y update. */
#define PY1 $r16
/* K_LDA: k * LDA - M8, the step from the end of a column to the start of
 * the same column in the next block. */
#define K_LDA $r17
#define PY2 $r18
/* T0: scratch address register. */
#define T0 $r19
/* PA0..PA7: pointers into eight consecutive columns of A. */
#define PA0 $r20
#define PA1 $r23
#define PA2 $r24
#define PA3 $r25
#define PA4 $r26
#define PA5 $r27
#define PA6 $r28
#define PA7 $r29
/* M8: M * 8, the byte length of one column (set in the PROLOGUE). */
#define M8 $r30
/* VALPHA is $xr0, the register whose element 0 ($f0) carries ALPHA. */
#define VALPHA $xr0
/* X0/X1: up to eight elements of x. */
#define X0 $xr1
#define X1 $xr2
/* A0..A15: loaded pieces of A; A0..A2 also stage scalars in DLOAD_X*_GAP. */
#define A0 $xr3
#define A1 $xr4
#define A2 $xr5
#define A3 $xr6
#define A4 $xr7
#define A5 $xr8
#define A6 $xr9
#define A7 $xr10
#define A8 $xr11
#define A9 $xr12
#define A10 $xr13
#define A11 $xr14
#define A12 $xr15
#define A13 $xr16
#define A14 $xr17
#define A15 $xr18
/* TP0..TP7: per-column vector accumulators (one per column of the block). */
#define TP0 $xr19
#define TP1 $xr20
#define TP2 $xr21
#define TP3 $xr22
#define TP4 $xr23
#define TP5 $xr24
#define TP6 $xr25
#define TP7 $xr26
/* Y0..Y7 alias A0..A7 ($xr3..$xr10): they receive the reduced per-column
 * sums once the vector loops no longer need the A registers. */
#define Y0 $xr3
#define Y1 $xr4
#define Y2 $xr5
#define Y3 $xr6
#define Y4 $xr7
#define Y5 $xr8
#define Y6 $xr9
#define Y7 $xr10
// ZERO_Y<n>: clear the first n per-column accumulators TP0..TP(n-1) by
// XOR-ing each register with itself (GXOR is defined in loongarch64_asm.S).
.macro ZERO_Y8
GXOR xv, v, TP0, TP0, TP0, TP1, TP1, TP1, TP2, TP2, TP2, TP3, TP3, TP3, \
TP4, TP4, TP4, TP5, TP5, TP5, TP6, TP6, TP6, TP7, TP7, TP7
.endm
.macro ZERO_Y4
GXOR xv, v, TP0, TP0, TP0, TP1, TP1, TP1, TP2, TP2, TP2, TP3, TP3, TP3
.endm
.macro ZERO_Y2
GXOR xv, v, TP0, TP0, TP0, TP1, TP1, TP1
.endm
.macro ZERO_Y1
GXOR xv, v, TP0, TP0, TP0
.endm
// Contiguous-x loaders (inc_x == 1): x[0..7] into X0 / X1, or x[0..3] into X0.
// GLD with the "xv" prefix presumably expands to 256-bit xvld - TODO confirm.
.macro DLOAD_X8
GLD xv, , X0, X, 0x00, X1, X, 0x20
.endm
.macro DLOAD_X4
GLD xv, , X0, X, 0x00
.endm
// Strided-x loaders (inc_x != 1, INC_X already in bytes). Elements are read
// one at a time into scalar registers and packed with GINSVE0 (presumably
// xvinsve0.d - TODO confirm). Clobbers T0 and A0..A2 ($f3..$f5).
//
// DLOAD_X8_GAP, first half:  x0 -> $f1 (X0), x1 -> $f2 (X1), x2 -> $f3 (A0),
//                            x3 -> $f4 (A1), packed into X0.
//               second half: x4 -> $f2 (X1), x5 -> $f3 (A0), x6 -> $f4 (A1),
//                            x7 -> $f5 (A2), packed into X1.
.macro DLOAD_X8_GAP
fld.d $f1, X, 0x00
fldx.d $f2, X, INC_X
// T0 = X + 2 * INC_X
PTR_ALSL T0, INC_X, X, 1
fld.d $f3, T0, 0x00
fldx.d $f4, T0, INC_X
GINSVE0 xv, d, X0, X1, 1, X0, A0, 2, X0, A1, 3
// T0 = X + 4 * INC_X
PTR_ALSL T0, INC_X, X, 2
fld.d $f2, T0, 0x00
fldx.d $f3, T0, INC_X
// T0 = X + 6 * INC_X
PTR_ALSL T0, INC_X, T0, 1
fld.d $f4, T0, 0x00
fldx.d $f5, T0, INC_X
GINSVE0 xv, d, X1, A0, 1, X1, A1, 2, X1, A2, 3
.endm
// DLOAD_X4_GAP: first half of the above only; x0..x3 packed into X0.
.macro DLOAD_X4_GAP
fld.d $f1, X, 0x00
fldx.d $f2, X, INC_X
// T0 = X + 2 * INC_X
PTR_ALSL T0, INC_X, X, 1
fld.d $f3, T0, 0x00
fldx.d $f4, T0, INC_X
GINSVE0 xv, d, X0, X1, 1, X0, A0, 2, X0, A1, 3
.endm
// Micro-kernels DGEMV_T_<cols>x<rows>: for each of <cols> columns, multiply
// <rows> consecutive elements of that column by the matching x elements
// (X0, plus X1 for 8 rows) and add the lane-wise products to the column
// accumulator TPn. The lanes are summed later by GACC in DGEMV_T.
// Note the naming order is columns x rows here (DGEMV_T_8x4 is used for the
// 4-row tail of an 8-column block).
// GLD_INC / GMADD are defined in loongarch64_asm.S; GLD_INC appears to load
// and then advance the pointer by 0x20, and GMADD presumably emits one fused
// multiply-add (dst = a * b + c) per operand group - TODO confirm.
//
// 8 columns x 8 rows.
.macro DGEMV_T_8x8
GLD_INC xv, , 0x20, \
A0, PA0, 0, A1, PA0, 0, \
A2, PA1, 0, A3, PA1, 0, \
A4, PA2, 0, A5, PA2, 0, \
A6, PA3, 0, A7, PA3, 0, \
A8, PA4, 0, A9, PA4, 0, \
A10, PA5, 0, A11, PA5, 0, \
A12, PA6, 0, A13, PA6, 0, \
A14, PA7, 0, A15, PA7, 0
GMADD xvf, d, TP0, A0, X0, TP0, TP0, A1, X1, TP0, \
TP1, A2, X0, TP1, TP1, A3, X1, TP1, \
TP2, A4, X0, TP2, TP2, A5, X1, TP2, \
TP3, A6, X0, TP3, TP3, A7, X1, TP3, \
TP4, A8, X0, TP4, TP4, A9, X1, TP4, \
TP5, A10, X0, TP5, TP5, A11, X1, TP5, \
TP6, A12, X0, TP6, TP6, A13, X1, TP6, \
TP7, A14, X0, TP7, TP7, A15, X1, TP7
.endm
// 8 columns x 4 rows.
// NOTE(review): the GMADD operand list below ends with a trailing comma;
// confirm the assembler treats the resulting empty vararg as end of list.
.macro DGEMV_T_8x4
GLD_INC xv, , 0x20, A0, PA0, 0, A2, PA1, 0, A4, PA2, 0, A6, PA3, 0, \
A8, PA4, 0, A10, PA5, 0, A12, PA6, 0, A14, PA7, 0
GMADD xvf, d, TP0, A0, X0, TP0, TP1, A2, X0, TP1, \
TP2, A4, X0, TP2, TP3, A6, X0, TP3, \
TP4, A8, X0, TP4, TP5, A10, X0, TP5, \
TP6, A12, X0, TP6, TP7, A14, X0, TP7,
.endm
// 4 columns x 8 rows.
.macro DGEMV_T_4x8
GLD_INC xv, , 0x20, \
A0, PA0, 0, A1, PA0, 0, \
A2, PA1, 0, A3, PA1, 0, \
A4, PA2, 0, A5, PA2, 0, \
A6, PA3, 0, A7, PA3, 0
GMADD xvf, d, TP0, A0, X0, TP0, TP0, A1, X1, TP0, \
TP1, A2, X0, TP1, TP1, A3, X1, TP1, \
TP2, A4, X0, TP2, TP2, A5, X1, TP2, \
TP3, A6, X0, TP3, TP3, A7, X1, TP3
.endm
// 4 columns x 4 rows.
.macro DGEMV_T_4x4
GLD_INC xv, , 0x20, A0, PA0, 0, A2, PA1, 0, A4, PA2, 0, A6, PA3, 0
GMADD xvf, d, TP0, A0, X0, TP0, TP1, A2, X0, TP1, \
TP2, A4, X0, TP2, TP3, A6, X0, TP3
.endm
// 2 columns x 8 rows.
.macro DGEMV_T_2x8
GLD_INC xv, , 0x20, A0, PA0, 0, A1, PA0, 0, A2, PA1, 0, A3, PA1, 0
GMADD xvf, d, TP0, A0, X0, TP0, TP0, A1, X1, TP0, \
TP1, A2, X0, TP1, TP1, A3, X1, TP1
.endm
// 2 columns x 4 rows.
.macro DGEMV_T_2x4
GLD_INC xv, , 0x20, A0, PA0, 0, A2, PA1, 0
GMADD xvf, d, TP0, A0, X0, TP0, TP1, A2, X0, TP1
.endm
/*
 * DGEMV_T XW, X8, X4
 *
 * Complete loop nest for the transposed product: for every column j,
 * y[j] += alpha * sum_i A[i][j] * x[i].
 *
 *   XW       unique tag pasted into every local label so the macro can be
 *            expanded once per stride variant
 *   X8, X4   suffixes selecting the DLOAD_<suffix> macro that loads 8 / 4
 *            elements of x (contiguous or _GAP)
 *
 * Columns are consumed in blocks of 8, then 4, 2 and 1 (bits of N). For each
 * block the rows are consumed 8 at a time, then 4, into the vector
 * accumulators TP0..; GACC then folds each TPn into Yn (presumably a
 * horizontal sum - macro defined in loongarch64_asm.S, TODO confirm) and the
 * remaining 0..3 rows are added with scalar fused multiply-adds on
 * $f3..$f10 (the scalar names of Y0..Y7). y is always accessed element by
 * element through INC_Y, so there is no separate y-stride variant.
 *
 * Expects LDA, INC_X, INC_Y and M8 to be byte quantities and PA0..PA7 to
 * point at the first eight columns (set up by the caller).
 */
.macro DGEMV_T XW:req X8:req, X4:req
// ---------------- column blocks of 8 ----------------
PTR_SRLI J, N, 3
beqz J, .L_\XW\()_N_7
// K_LDA = 8 * LDA - M8: each PAx advances by M8 bytes per pass over the rows
PTR_SLLI K_LDA, LDA, 3
PTR_SUB K_LDA, K_LDA, M8
.L_\XW\()_N_L8:
ZERO_Y8
move X, X_ORG
PTR_SRLI I, M, 3
beqz I, .L_\XW\()_M_7
.align 5
// 8 rows per iteration
.L_\XW\()_M_L8:
DLOAD_\X8
DGEMV_T_8x8
PTR_ADDI I, I, -1
PTR_ALSL X, INC_X, X, 3
bnez I, .L_\XW\()_M_L8
// optional block of 4 rows
.L_\XW\()_M_7:
andi I, M, 4
beqz I, .L_\XW\()_M_3
DLOAD_\X4
DGEMV_T_8x4
PTR_ALSL X, INC_X, X, 2
.L_\XW\()_M_3:
// Fold each vector accumulator TPn into the per-column result Yn
GACC xvf, d, Y0, TP0, Y1, TP1, Y2, TP2, Y3, TP3, Y4, TP4, \
Y5, TP5, Y6, TP6, Y7, TP7
andi I, M, 3
beqz I, .L_\XW\()_M_END
.align 5
// remaining 0..3 rows: one x element times one element from each column
.L_\XW\()_M_L1:
fld.d $f1, X, 0x00
fld.d $f11, PA0, 0x00
fld.d $f12, PA1, 0x00
fld.d $f13, PA2, 0x00
fld.d $f14, PA3, 0x00
fld.d $f15, PA4, 0x00
fld.d $f16, PA5, 0x00
fld.d $f17, PA6, 0x00
fld.d $f18, PA7, 0x00
#if __loongarch_grlen == 64
GADDI , d, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08, \
PA4, PA4, 0x08, PA5, PA5, 0x08, PA6, PA6, 0x08, PA7, PA7, 0x08
#else
GADDI , w, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08, \
PA4, PA4, 0x08, PA5, PA5, 0x08, PA6, PA6, 0x08, PA7, PA7, 0x08
#endif
GMADD f, d, $f3, $f11, $f1, $f3, $f4, $f12, $f1, $f4, $f5, $f13, $f1, $f5, $f6, $f14, $f1, $f6, \
$f7, $f15, $f1, $f7, $f8, $f16, $f1, $f8, $f9, $f17, $f1, $f9, $f10, $f18, $f1, $f10
PTR_ADDI I, I, -1
PTR_ADD X, X, INC_X
bnez I, .L_\XW\()_M_L1
// Update eight y elements: load them through Y, PY0 (= Y + 2 strides),
// PY1 (= Y + 4 strides) and PY2 (= Y + 6 strides), add alpha * column sum,
// advance the column pointers, then store back.
.L_\XW\()_M_END:
fld.d $f11, Y, 0x00
fldx.d $f12, Y, INC_Y
PTR_ALSL PY0, INC_Y, Y, 1
fld.d $f13, PY0, 0x00
fldx.d $f14, PY0, INC_Y
PTR_ALSL PY1, INC_Y, Y, 2
fld.d $f15, PY1, 0x00
fldx.d $f16, PY1, INC_Y
PTR_ALSL PY2, INC_Y, PY1, 1
fld.d $f17, PY2, 0x00
fldx.d $f18, PY2, INC_Y
GMADD f, d, $f11, ALPHA, $f3, $f11, $f12, ALPHA, $f4, $f12, $f13, ALPHA, $f5, $f13, $f14, ALPHA, $f6, $f14, \
$f15, ALPHA, $f7, $f15, $f16, ALPHA, $f8, $f16, $f17, ALPHA, $f9, $f17, $f18, ALPHA, $f10, $f18
PTR_ADDI J, J, -1
#if __loongarch_grlen == 64
GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \
PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA
#else
GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \
PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA
#endif
fst.d $f11, Y, 0x00
fstx.d $f12, Y, INC_Y
fst.d $f13, PY0, 0x00
fstx.d $f14, PY0, INC_Y
fst.d $f15, PY1, 0x00
fstx.d $f16, PY1, INC_Y
fst.d $f17, PY2, 0x00
fstx.d $f18, PY2, INC_Y
PTR_ALSL Y, INC_Y, Y, 3
bnez J, .L_\XW\()_N_L8
// ---------------- optional block of 4 columns ----------------
.L_\XW\()_N_7:
andi J, N, 4
beqz J, .L_\XW\()_N_3
ZERO_Y4
move X, X_ORG
PTR_SRLI I, M, 3
beqz I, .L_\XW\()_N_4_M_7
.align 5
.L_\XW\()_N_4_M_L8:
DLOAD_\X8
DGEMV_T_4x8
PTR_ADDI I, I, -1
PTR_ALSL X, INC_X, X, 3
bnez I, .L_\XW\()_N_4_M_L8
.L_\XW\()_N_4_M_7:
andi I, M, 4
beqz I, .L_\XW\()_N_4_M_3
DLOAD_\X4
DGEMV_T_4x4
PTR_ALSL X, INC_X, X, 2
.L_\XW\()_N_4_M_3:
// Fold TP0..TP3 into Y0..Y3
GACC xvf, d, Y0, TP0, Y1, TP1, Y2, TP2, Y3, TP3
andi I, M, 3
beqz I, .L_\XW\()_N_4_M_END
.align 5
// remaining 0..3 rows, scalar
.L_\XW\()_N_4_M_L1:
fld.d $f1, X, 0x00
GLD_INC f, d, 0x08, $f11, PA0, 0x00, $f12, PA1, 0x00, $f13, PA2, 0x00, $f14, PA3, 0x00
GMADD f, d, $f3, $f11, $f1, $f3, $f4, $f12, $f1, $f4, $f5, $f13, $f1, $f5, $f6, $f14, $f1, $f6
PTR_ADDI I, I, -1
PTR_ADD X, X, INC_X
bnez I, .L_\XW\()_N_4_M_L1
// y[0..3] += alpha * column sums; PA0..PA3 step by K_LDA = 4 * LDA - M8
.L_\XW\()_N_4_M_END:
fld.d $f11, Y, 0x00
fldx.d $f12, Y, INC_Y
PTR_ALSL PY0, INC_Y, Y, 1
fld.d $f13, PY0, 0x00
fldx.d $f14, PY0, INC_Y
GMADD f, d, $f11, ALPHA, $f3, $f11, $f12, ALPHA, $f4, $f12, $f13, ALPHA, $f5, $f13, $f14, ALPHA, $f6, $f14
PTR_SLLI K_LDA, LDA, 2
PTR_SUB K_LDA, K_LDA, M8
#if __loongarch_grlen == 64
GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
#else
GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
#endif
fst.d $f11, Y, 0x00
fstx.d $f12, Y, INC_Y
fst.d $f13, PY0, 0x00
fstx.d $f14, PY0, INC_Y
PTR_ALSL Y, INC_Y, Y, 2
// ---------------- optional block of 2 columns ----------------
.L_\XW\()_N_3:
andi J, N, 2
beqz J, .L_\XW\()_N_1
ZERO_Y2
move X, X_ORG
PTR_SRLI I, M, 3
beqz I, .L_\XW\()_N_2_M_7
.align 5
.L_\XW\()_N_2_M_L8:
DLOAD_\X8
DGEMV_T_2x8
PTR_ADDI I, I, -1
PTR_ALSL X, INC_X, X, 3
bnez I, .L_\XW\()_N_2_M_L8
.L_\XW\()_N_2_M_7:
andi I, M, 4
beqz I, .L_\XW\()_N_2_M_3
DLOAD_\X4
DGEMV_T_2x4
PTR_ALSL X, INC_X, X, 2
.L_\XW\()_N_2_M_3:
// Fold TP0 / TP1 into Y0 / Y1
GACC xvf, d, Y0, TP0, Y1, TP1
andi I, M, 3
beqz I, .L_\XW\()_N_2_M_END
.align 5
// remaining 0..3 rows, scalar
.L_\XW\()_N_2_M_L1:
fld.d $f1, X, 0x00
GLD_INC f, d, 0x08, $f11, PA0, 0x00, $f12, PA1, 0x00
GMADD f, d, $f3, $f11, $f1, $f3, $f4, $f12, $f1, $f4
PTR_ADDI I, I, -1
PTR_ADD X, X, INC_X
bnez I, .L_\XW\()_N_2_M_L1
// y[0..1] += alpha * column sums; PA0 / PA1 step by K_LDA = 2 * LDA - M8
.L_\XW\()_N_2_M_END:
fld.d $f11, Y, 0x00
fldx.d $f12, Y, INC_Y
GMADD f, d, $f11, ALPHA, $f3, $f11, $f12, ALPHA, $f4, $f12
PTR_SLLI K_LDA, LDA, 1
PTR_SUB K_LDA, K_LDA, M8
#if __loongarch_grlen == 64
GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA
#else
GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA
#endif
fst.d $f11, Y, 0x00
fstx.d $f12, Y, INC_Y
PTR_ALSL Y, INC_Y, Y, 1
// ---------------- optional last single column ----------------
// Fully scalar: the dot product accumulates in $f19 (the scalar name of TP0,
// cleared by ZERO_Y1), so no GACC step is needed.
.L_\XW\()_N_1:
andi J, N, 1
beqz J, .L_END
ZERO_Y1
move X, X_ORG
move I, M
// with no rows, y is left untouched
beqz I, .L_END
.align 5
.L_\XW\()_N_1_M_L1:
fld.d $f3, PA0, 0x00
fld.d $f1, X, 0x00
fmadd.d $f19, $f3, $f1, $f19
PTR_ADDI I, I, -1
PTR_ADD X, X, INC_X
PTR_ADDI PA0, PA0, 0x08
bnez I, .L_\XW\()_N_1_M_L1
// y[0] += alpha * dot product
fld.d $f3, Y, 0x00
fmadd.d $f3, ALPHA, $f19, $f3
fst.d $f3, Y, 0x00
b .L_END
.endm
// Kernel entry: scales the strides to bytes, derives the column pointers and
// dispatches through a two-entry jump table to the DGEMV_T expansion that
// matches the X stride (unit or non-unit).
PROLOGUE
// INC_Y is fetched from the caller's stack, before $sp is moved below.
PTR_LD INC_Y, $sp, 0
// 17 + 8 / 24 + 3: spill 8 integer ($s0..$s7) and 3 FP ($fs0..$fs2) registers.
push_if_used 17 + 8, 24 + 3
// The stride test uses the unscaled INC_X, so it must precede the GSLLI below.
PTR_ADDI K, $r0, 0x01
PTR_SUB I, INC_X, K
maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */
// Shift LDA, INC_X and INC_Y left by 3 in place; M8 = M << 3.
GSLLI , d, LDA, LDA, 3, INC_X, INC_X, 3, INC_Y, INC_Y, 3, M8, M, 3
// Replicate element 0 of $xr0 across VALPHA (presumably alpha arrives in $f0 - TODO confirm).
xvreplve0.d VALPHA, $xr0
move X_ORG, X
move PA0, A
// PA1..PA7 = PA0 + k * LDA: eight pointers, each one LDA further than the last.
#if __loongarch_grlen == 64
GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \
PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA
#else
GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \
PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA
#endif
// Jump table dispatch: I (0 or 1) indexes a table of 16-bit offsets that are
// relative to .L_GAP_TABLE; the selected offset is added back to the table
// address to form the branch target.
la.local T0, .L_GAP_TABLE
PTR_ALSL I, I, T0, 1
ld.h K, I, 0
PTR_ADD T0, T0, K
jirl $r0, T0, 0
.L_GAP_TABLE:
.hword .L_GAP_0 - .L_GAP_TABLE
.hword .L_GAP_1 - .L_GAP_TABLE
// Each DGEMV_T expansion ends with an unconditional branch to .L_END, so the
// first expansion never falls through into the second.
.L_GAP_0: /* if (incx == 1) */
DGEMV_T GAP_0, X8, X4
.L_GAP_1: /* if (incx != 1) */
DGEMV_T GAP_1, X8_GAP, X4_GAP
.L_END:
// Must mirror the push_if_used arguments above.
pop_if_used 17 + 8, 24 + 3
jirl $r0, $r1, 0x0
EPILOGUE

View File

@ -70,7 +70,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
MTC s1, $r0 MTC s1, $r0
bge $r0, N, .L999 bge $r0, N, .L999
slli.d INCX, INCX, BASE_SHIFT slli.d INCX, INCX, BASE_SHIFT
bge $r0, INCX, .L999 beq $r0, INCX, .L999
move XX, X move XX, X
NOP NOP
LD a1, X, 0 * SIZE LD a1, X, 0 * SIZE

View File

@ -0,0 +1,407 @@
/*******************************************************************************
Copyright (c) 2023, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
// Pointer-width integer instruction aliases, selected by the general-purpose
// register length. REG_SIZE is the register size in bytes and REG_LOG its
// base-2 logarithm (used as a shift amount for stack offsets).
#if __loongarch_grlen == 64
#define LA_REG int64_t
#define REG_SIZE 8
#define REG_LOG 3
#define PTR_ADDI addi.d
#define PTR_ADD add.d
#define PTR_SUB sub.d
#define PTR_LD ld.d
#define PTR_ST st.d
#define PTR_SLLI slli.d
#define PTR_SRLI srli.d
#define PTR_SRAI srai.d
#define PTR_MUL mul.d
#define PTR_ALSL alsl.d
#else
#define LA_REG int32_t
#define REG_SIZE 4
#define REG_LOG 2
#define PTR_ADDI addi.w
#define PTR_ADD add.w
#define PTR_SUB sub.w
#define PTR_LD ld.w
#define PTR_ST st.w
#define PTR_SLLI slli.w
#define PTR_SRLI srli.w
#define PTR_SRAI srai.w
#define PTR_MUL mul.w
#define PTR_ALSL alsl.w
#endif
// Floating-point register spill/reload aliases, selected by the FP register
// length. FREG_SIZE is in bytes, FREG_LOG is its base-2 logarithm.
#if __loongarch_frlen == 64
#define FREG_SIZE 8
#define FREG_LOG 3
#define PTR_FLD fld.d
#define PTR_FST fst.d
#else
#define FREG_SIZE 4
#define FREG_LOG 2
#define PTR_FLD fld.s
#define PTR_FST fst.s
#endif
// The max registers available to the user which
// do not need to be preserved across calls.
// Ref: https://loongson.github.io/LoongArch-Documentation/LoongArch-ELF-ABI-CN.html
// push_if_used / pop_if_used compare their arguments against these limits.
#define MAX_INT_CALLER_SAVED 17
#define MAX_FP_CALLER_SAVED 24
.altmacro // Enable alternate macro mode
// push_if_used regs, fregs
//   regs  - integer register count, compared against MAX_INT_CALLER_SAVED
//   fregs - FP register count, compared against MAX_FP_CALLER_SAVED
// For each count that exceeds its limit, lowers $sp by the size of the excess
// and stores that many registers starting at $s0 (integer) or $fs0 (FP).
// The integer area is reserved first, so the FP area ends up below it.
// Emits nothing when neither limit is exceeded.
.macro push_if_used regs, fregs
.if \regs > MAX_INT_CALLER_SAVED
PTR_ADDI $sp, $sp, -((\regs - MAX_INT_CALLER_SAVED) << REG_LOG)
push_regs 0, \regs - MAX_INT_CALLER_SAVED - 1
.endif
.if \fregs > MAX_FP_CALLER_SAVED
PTR_ADDI $sp, $sp, -((\fregs - MAX_FP_CALLER_SAVED) << FREG_LOG)
push_fregs 0, \fregs - MAX_FP_CALLER_SAVED - 1
.endif
.endm // End push_if_used
// pop_if_used regs, fregs
// Inverse of push_if_used and must be given the same arguments: reloads the
// FP registers first (their area is at the current $sp), releases that area,
// then reloads the integer registers and releases theirs.
.macro pop_if_used regs, fregs
.if \fregs > MAX_FP_CALLER_SAVED
pop_fregs 0, \fregs - MAX_FP_CALLER_SAVED - 1
PTR_ADDI $sp, $sp, (\fregs - MAX_FP_CALLER_SAVED) << FREG_LOG
.endif
.if \regs > MAX_INT_CALLER_SAVED
pop_regs 0, \regs - MAX_INT_CALLER_SAVED - 1
PTR_ADDI $sp, $sp, (\regs - MAX_INT_CALLER_SAVED) << REG_LOG
.endif
.endm // End pop_if_used
// push_regs from, to
// Stores $s<from> .. $s<to> at $sp + (index << REG_LOG), one register per
// recursive expansion. The recursion stops when from equals to. The
// "%from + 1" argument relies on .altmacro to evaluate the expression.
.macro push_regs from, to
PTR_ST $s\()\from, $sp, \from << REG_LOG
.if \to - \from
push_regs %from + 1, \to
.endif
.endm // End push_regs
// pop_regs from, to
// Reloads $s<from> .. $s<to> from $sp + (index << REG_LOG); the layout
// matches the one written by push_regs. $sp itself is not modified here.
.macro pop_regs from, to
PTR_LD $s\()\from, $sp, \from << REG_LOG
.if \to - \from
pop_regs %from + 1, \to
.endif
.endm // End pop_regs
// push_fregs from, to
// Stores $fs<from> .. $fs<to> at $sp + (index << FREG_LOG), recursing until
// from equals to. Uses the scalar FP store selected by PTR_FST.
.macro push_fregs from, to
PTR_FST $fs\()\from, $sp, \from << FREG_LOG
.if \to - \from
push_fregs %from + 1, \to
.endif
.endm // End push_fregs
// pop_fregs from, to
// Reloads $fs<from> .. $fs<to> from $sp + (index << FREG_LOG); the layout
// matches the one written by push_fregs. $sp itself is not modified here.
.macro pop_fregs from, to
PTR_FLD $fs\()\from, $sp, \from << FREG_LOG
.if \to - \from
pop_fregs %from + 1, \to
.endif
.endm // End pop_fregs
//
// Instruction Related Macros
//
// GLD: emit one load per (out, src, offset) triple.
//   pre_op - mnemonic prefix placed before "ld" (e.g. xv, v, f)
//   suf_op - mnemonic suffix placed after "ld."; 0 (the default, also used
//            when the argument is left blank) emits no suffix
//   offset - immediate byte offset
// Further triples may follow and are handled by recursion.
//
.macro GLD pre_op:req, suf_op=0, out:req, src:req, offset:req/* imm */, more:vararg
.ifeqs "\suf_op", "0"
\pre_op\()ld \out, \src, \offset
.else
\pre_op\()ld.\suf_op \out, \src, \offset
.endif
.ifnb \more
GLD \pre_op, \suf_op, \more
.endif
.endm
//
// GLD_INC: like GLD, but after each load the source pointer is advanced by
// the immediate inc (src = src + inc). The same inc is applied to every
// triple in the list.
//
.macro GLD_INC pre_op:req, suf_op=0, inc:req, out:req, src:req, offset:req/* imm */, more:vararg
.ifeqs "\suf_op", "0"
\pre_op\()ld \out, \src, \offset
.else
\pre_op\()ld.\suf_op \out, \src, \offset
.endif
PTR_ADDI \src, \src, \inc
.ifnb \more
GLD_INC \pre_op, \suf_op, \inc, \more
.endif
.endm
//
// GLDX is same as GLD except the stride is a register
// (the third element of each triple is a register and the indexed "ldx"
// form of the load is emitted).
//
.macro GLDX pre_op:req, suf_op=0, out:req, src:req, offset:req/* reg */, more:vararg
.ifeqs "\suf_op", "0"
\pre_op\()ldx \out, \src, \offset
.else
\pre_op\()ldx.\suf_op \out, \src, \offset
.endif
.ifnb \more
GLDX \pre_op, \suf_op, \more
.endif
.endm
//
// GLDREPL: emit "<pre_op>ldrepl.<suf_op> out, src, offset" for each
// (out, src, offset) triple. Unlike GLD, the suffix is mandatory.
//
.macro GLDREPL pre_op:req, suf_op:req, out:req, src:req, offset:req/* imm */, more:vararg
\pre_op\()ldrepl.\suf_op \out, \src, \offset
.ifnb \more
GLDREPL \pre_op, \suf_op, \more
.endif
.endm
//
// GST: emit one store per (src, dst, offset) triple; the store counterpart
// of GLD. A suf_op of 0 (the default, also used when the argument is left
// blank) emits "<pre_op>st" with no suffix.
//
.macro GST pre_op:req, suf_op=0, src:req, dst:req, offset:req/* imm */, more:vararg
.ifeqs "\suf_op", "0"
\pre_op\()st \src, \dst, \offset
.else
\pre_op\()st.\suf_op \src, \dst, \offset
.endif
.ifnb \more
GST \pre_op, \suf_op, \more
.endif
.endm
//
// GMUL: emit "<pre_op>mul.<suf_op> out, in0, in1" for each
// (out, in0, in1) triple. pre_op may be left empty.
//
.macro GMUL pre_op, suf_op:req, out:req, in0:req, in1:req, more:vararg
\pre_op\()mul.\suf_op \out, \in0, \in1
.ifnb \more
GMUL \pre_op, \suf_op, \more
.endif
.endm
//
// GMADD: emit "<pre_op>madd.<suf_op> out, in0, in1, in2" for each
// (out, in0, in1, in2) group of four operands. pre_op may be left empty.
//
.macro GMADD pre_op, suf_op:req, out:req, in0:req, in1:req, in2:req, more:vararg
\pre_op\()madd.\suf_op \out, \in0, \in1, \in2
.ifnb \more
GMADD \pre_op, \suf_op, \more
.endif
.endm
//
// GADD: emit "<pre_op>add.<suf_op> out, in0, in1" for each
// (out, in0, in1) triple. Triples are emitted in order, so a later triple
// may consume the result of an earlier one. pre_op may be left empty.
//
.macro GADD pre_op, suf_op:req, out:req, in0:req, in1:req, more:vararg
\pre_op\()add.\suf_op \out, \in0, \in1
.ifnb \more
GADD \pre_op, \suf_op, \more
.endif
.endm
//
// GADDI: emit "<pre_op>addi.<suf_op> out, in0, in1" for each
// (out, in0, in1) triple. pre_op may be left empty.
//
.macro GADDI pre_op, suf_op:req, out:req, in0:req, in1:req, more:vararg
\pre_op\()addi.\suf_op \out, \in0, \in1
.ifnb \more
GADDI \pre_op, \suf_op, \more
.endif
.endm
//
// GSUB: emit "<pre_op>sub.<suf_op> out, in0, in1" for each
// (out, in0, in1) triple. pre_op may be left empty.
//
.macro GSUB pre_op, suf_op:req, out:req, in0:req, in1:req, more:vararg
\pre_op\()sub.\suf_op \out, \in0, \in1
.ifnb \more
GSUB \pre_op, \suf_op, \more
.endif
.endm
//
// GSLLI: emit "<pre_op>slli.<suf_op> out, in0, in1" for each
// (out, in0, in1) triple, where in1 is the shift amount.
// pre_op may be left empty.
//
.macro GSLLI pre_op, suf_op:req, out:req, in0:req, in1:req, more:vararg
\pre_op\()slli.\suf_op \out, \in0, \in1
.ifnb \more
GSLLI \pre_op, \suf_op, \more
.endif
.endm
//
// GINSVE0: emit "<pre_op>insve0.<suf_op> out, in0, in1" for each
// (out, in0, in1) triple.
//
.macro GINSVE0 pre_op:req, suf_op:req, out:req, in0:req, in1:req, more:vararg
\pre_op\()insve0.\suf_op \out, \in0, \in1
.ifnb \more
GINSVE0 \pre_op, \suf_op, \more
.endif
.endm
//
// GXOR: emit "<pre_op>xor.<suf_op> out, in0, in1" for each
// (out, in0, in1) triple.
//
.macro GXOR pre_op:req, suf_op:req, out:req, in0:req, in1:req, more:vararg
\pre_op\()xor.\suf_op \out, \in0, \in1
.ifnb \more
GXOR \pre_op, \suf_op, \more
.endif
.endm
//
// GPERMI: emit "<pre_op>permi.<suf_op> out, in0, in1" for each
// (out, in0, in1) triple, where in1 is the immediate selector.
//
.macro GPERMI pre_op:req, suf_op:req, out:req, in0:req, in1:req, more:vararg
\pre_op\()permi.\suf_op \out, \in0, \in1
.ifnb \more
GPERMI \pre_op, \suf_op, \more
.endif
.endm
//
// GNMSUB: emit "<pre_op>nmsub.<suf_op> out, in0, in1, in2" for each
// (out, in0, in1, in2) group of four operands.
//
.macro GNMSUB pre_op:req, suf_op:req, out:req, in0:req, in1:req, in2:req, more:vararg
\pre_op\()nmsub.\suf_op \out, \in0, \in1, \in2
.ifnb \more
GNMSUB \pre_op, \suf_op, \more
.endif
.endm
//
// GPRELD: emit "preld in0, in1, in2" for each (in0, in1, in2) triple.
// The operands are passed through to the instruction unchanged.
//
.macro GPRELD in0:req, in1:req, in2:req, more:vararg
preld \in0, \in1, \in2
.ifnb \more
GPRELD \more
.endif
.endm
//
// Compound instructions
//
// GACC: Accumulate the values of vector registers
// Horizontal add across the elements of "in", written to "out", for each
// (out, in) pair. "in" is used as scratch and is overwritten.
// Supported pre_op values: xvf / vf (floating point, suf_op s or d) and
// xv / v (integer, suf_op d, w, h or b). Narrower element types add extra
// pack-odd + add folding steps. Any other pre_op emits nothing.
// NOTE(review): the total presumably ends up in the lowest element of
// "out" - confirm against the callers before relying on other lanes.
//
.macro GACC pre_op:req, suf_op:req, out:req, in:req, more:vararg
// 256-bit floating point: fold the two 128-bit halves, then the 64-bit
// pairs, then (single precision only) the 32-bit pairs.
.ifeqs "\pre_op", "xvf"
xvpermi.q \out, \in, 0x01
\pre_op\()add.\suf_op \in, \out, \in
xvpackod.d \out, \in, \in
\pre_op\()add.\suf_op \out, \out, \in
.ifeqs "\suf_op", "s"
xvpackod.w \in, \out, \out
\pre_op\()add.\suf_op \out, \out, \in
.endif
.endif
// 128-bit floating point: same folding without the 128-bit half step.
.ifeqs "\pre_op", "vf"
vpackod.d \out, \in, \in
\pre_op\()add.\suf_op \out, \out, \in
.ifeqs "\suf_op", "s"
vpackod.w \in, \out, \out
\pre_op\()add.\suf_op \out, \out, \in
.endif
.endif
// 256-bit integer: one more folding step for each narrower element width.
.ifeqs "\pre_op", "xv"
xvpermi.q \out, \in, 0x01
\pre_op\()add.\suf_op \in, \out, \in
xvpackod.d \out, \in, \in
\pre_op\()add.\suf_op \out, \out, \in
.ifnc "\suf_op", "d"
xvpackod.w \in, \out, \out
\pre_op\()add.\suf_op \out, \out, \in
.ifnc "\suf_op", "w"
xvpackod.h \in, \out, \out
\pre_op\()add.\suf_op \out, \out, \in
.ifnc "\suf_op", "h"
xvpackod.b \in, \out, \out
\pre_op\()add.\suf_op \out, \out, \in
.endif
.endif
.endif
.endif
// 128-bit integer.
.ifeqs "\pre_op", "v"
vpackod.d \out, \in, \in
\pre_op\()add.\suf_op \out, \out, \in
.ifnc "\suf_op", "d"
vpackod.w \in, \out, \out
\pre_op\()add.\suf_op \out, \out, \in
.ifnc "\suf_op", "w"
vpackod.h \in, \out, \out
\pre_op\()add.\suf_op \out, \out, \in
.ifnc "\suf_op", "h"
vpackod.b \in, \out, \out
\pre_op\()add.\suf_op \out, \out, \in
.endif
.endif
.endif
.endif
.ifnb \more
GACC \pre_op, \suf_op, \more
.endif
.endm
//
// GMOV: vector register copy, out = in, for each (out, in) pair.
// Implemented as "<pre_op>or.v out, in, in" (a value OR-ed with itself).
// Pairs are emitted in order, so a later pair sees earlier results.
//
.macro GMOV pre_op:req, out:req, in:req, more:vararg
\pre_op\()or.v \out, \in, \in
.ifnb \more
GMOV \pre_op, \more
.endif
.endm
//
// Media Related Macros
//
// GSBUTTERFLY: interleave in0 and in1 element-wise.
//   out0 = <pre_op>ilvl.<suf_op> in0, in1   (interleave low)
//   out1 = <pre_op>ilvh.<suf_op> in0, in1   (interleave high)
// out0 is written before in0/in1 are read for out1, so out0 must not alias
// either input.
.macro GSBUTTERFLY pre_op, suf_op, out0, out1, in0, in1
\pre_op\()ilvl.\suf_op \out0, \in0, \in1
\pre_op\()ilvh.\suf_op \out1, \in0, \in1
.endm
// GINTERLACE: split in0 and in1 by element parity.
//   out0 = <pre_op>pickev.<suf_op> in0, in1   (even-indexed elements)
//   out1 = <pre_op>pickod.<suf_op> in0, in1   (odd-indexed elements)
// out0 is written before in0/in1 are read for out1, so out0 must not alias
// either input.
.macro GINTERLACE pre_op, suf_op, out0, out1, in0, in1
\pre_op\()pickev.\suf_op \out0, \in0, \in1
\pre_op\()pickod.\suf_op \out1, \in0, \in1
.endm
//
// GTRANSPOSE4x4_D: Transpose 4x4 block with double-word elements in vectors,
// has no pre_op param. 128-bit vector instructions are not supported.
// Argument order: the four inputs, then the four outputs, then two scratch
// vectors (vt0, vt1). The scratch vectors are overwritten. Several
// outputs are written before all inputs have been consumed, so
// NOTE(review): callers should presumably keep outputs distinct from inputs.
//
.macro GTRANSPOSE4x4_D in0, in1, in2, in3, out0, out1, out2, out3, \
vt0, vt1
GSBUTTERFLY xv, d, \vt0, \out1, \in1, \in0
GSBUTTERFLY xv, d, \vt1, \out3, \in3, \in2
// GMOV takes (dst, src) pairs: out0 = vt0, out2 = vt1, vt1 = out3.
GMOV xv, \out0, \vt0, \out2, \vt1, \vt1, \out3
GPERMI xv, q, \out0, \out2, 0x02, \out2, \vt0, 0x31, \out3, \out1, 0x31, \out1, \vt1, 0x02
.endm
// GTRANSPOSE8x8_W: Transpose 8x8 block with word elements in 256-bit vectors.
// Argument order differs from GTRANSPOSE4x4_D: the eight outputs come first,
// then the eight inputs, then four scratch vectors (tmp0..tmp3), which are
// overwritten. The inputs are only read. Rows 0-3 and rows 4-7 are first
// transposed as two groups with word interleaves; the final xvpermi.q steps
// combine the 128-bit halves of the two groups.
.macro GTRANSPOSE8x8_W out0, out1, out2, out3, out4, out5, out6, out7, \
in0, in1, in2, in3, in4, in5, in6, in7, \
tmp0, tmp1, tmp2, tmp3
GSBUTTERFLY xv, w, \tmp0, \tmp2, \in2, \in0
GSBUTTERFLY xv, w, \tmp1, \tmp3, \in3, \in1
GSBUTTERFLY xv, w, \out0, \out1, \tmp1, \tmp0
GSBUTTERFLY xv, w, \out2, \out3, \tmp3, \tmp2
GSBUTTERFLY xv, w, \tmp0, \tmp2, \in6, \in4
GSBUTTERFLY xv, w, \tmp1, \tmp3, \in7, \in5
GSBUTTERFLY xv, w, \out4, \out5, \tmp1, \tmp0
GSBUTTERFLY xv, w, \out6, \out7, \tmp3, \tmp2
// Keep copies of out0..out3: they are overwritten by the first four permutes
// but are still needed as sources for out4..out7.
GMOV xv, \tmp0, \out0, \tmp1, \out1, \tmp2, \out2, \tmp3, \out3
GPERMI xv, q, \out0, \out4, 0x02, \out1, \out5, 0x02, \
\out2, \out6, 0x02, \out3, \out7, 0x02, \
\out4, \tmp0, 0x31, \out5, \tmp1, 0x31, \
\out6, \tmp2, 0x31, \out7, \tmp3, 0x31
.endm

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,463 @@
/*******************************************************************************
Copyright (c) 2023, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#define ASSEMBLER
#include "common.h"
#include "loongarch64_asm.S"
/*********************************************************************
* 2023/08/23 guxiwei
* UTEST : OK
* CTEST : OK
* TEST : OK
*********************************************************************/
/* Function parameters */
#define M $r4 // param 1: m
#define N $r5 // param 2: n
#define SRC $r6 // param 3: src
#define LDA $r7 // param 4: lda
#define DST $r8 // param 5: dst
// Loop counters: I counts along M inside a panel, J counts panels along N.
#define I $r9
#define J $r10
// S1..S16: read cursors, one per source vector of the current panel; each
// is one LDA stride further into the source than the previous one.
#define S1 $r12
#define S2 $r13
#define S3 $r14
#define S4 $r15
#define S5 $r16
#define S6 $r17
#define S7 $r18
#define S8 $r19
#define S9 $r20
// From here on the aliases use $r23..$r31, which is why the kernel passes
// an integer register count of 26 to push_if_used / pop_if_used.
#define S10 $r23
#define S11 $r24
#define S12 $r25
#define S13 $r26
#define S14 $r27
#define S15 $r28
#define S16 $r29
// TD: write cursor into the packed destination.
// TS: start of the next source panel.
#define TD $r30
#define TS $r31
// TL and T0 deliberately reuse the LDA ($r7) and SRC ($r6) argument
// registers: TL is LDA scaled to bytes in place, and T0 (2 * TL) is only
// written after SRC has been copied into TS.
#define TL $r7
#define T0 $r6
#undef ZERO
#define ZERO $r0
#define F0 $f0
#define F1 $f1
#define F2 $f2
#define F3 $f3
#define F4 $f4
#define F5 $f5
#define F6 $f6
#define F7 $f7
/* LASX vectors */
#define U0 $xr0
#define U1 $xr1
#define U2 $xr2
#define U3 $xr3
#define U4 $xr4
#define U5 $xr5
#define U6 $xr6
#define U7 $xr7
#define U8 $xr8
#define U9 $xr9
#define U10 $xr10
#define U11 $xr11
#define U12 $xr12
#define U13 $xr13
#define U14 $xr14
#define U15 $xr15
#define D0 $xr16
#define D1 $xr17
#define D2 $xr18
#define D3 $xr19
#define D4 $xr20
#define D5 $xr21
#define D6 $xr22
#define D7 $xr23
#define D8 $xr24
#define D9 $xr25
#define D10 $xr26
#define D11 $xr27
#define D12 $xr28
#define D13 $xr29
#define D14 $xr30
#define D15 $xr31
// Loops outline
//.L_N16 <-------------------
//| .L_M8: |
//| .L_M7: | Main Loop
//| .L_M1: |
//| .L_M0: ---------------
//.L_N15:
//.L_N8:
//| .L_N8_M8:
//| .L_N8_M7:
//| .L_N8_M1:
//.L_N7:
//.L_N4:
//| .L_N4_M4:
//| .L_N4_M3:
//| .L_N4_M1:
//.L_N3:
//.L_N2:
//| .L_N2_M2:
//| .L_N2_M1:
//.L_N1:
//| .L_N1_M1:
//.L_N0
// Packing kernel entry. Source vectors are processed in panels of 16, then
// a remainder of 8, 4, 2 and 1 (see the loops outline above).
PROLOGUE
// 26 integer / 32 FP registers in use: spills $s0..$s8 and $fs0..$fs7.
push_if_used 26, 32
move TD, DST
move TS, SRC
// TL = LDA * 4: stride between source vectors in bytes (4-byte elements).
PTR_SLLI TL, LDA, 0x02
// T0 = 2 * TL. This overwrites the SRC register, already saved in TS.
PTR_SLLI T0, TL, 0x01
// J = N / 16: number of full 16-wide panels.
PTR_SRAI J, N, 0x04
beq J, ZERO, .L_N15
.align 5
.L_N16:
// Start of one 16-wide panel: S(k) = TS + (k - 1) * TL for k = 1..16, built
// from single-stride steps for S1..S4 and double-stride (T0) steps after
// that. The counter updates for I and J are interleaved with the pointer
// arithmetic.
move S1, TS
PTR_ADD S2, TS, TL
// I = M / 8: iterations of the vectorised inner loop.
PTR_SRAI I, M, 0x03
PTR_ADD S3, S2, TL
PTR_ADDI J, J, -1
PTR_ADD S4, S3, TL
PTR_ADD S5, S3, T0
PTR_ADD S6, S4, T0
PTR_ADD S7, S5, T0
PTR_ADD S8, S6, T0
PTR_ADD S9, S7, T0
PTR_ADD S10, S8, T0
PTR_ADD S11, S9, T0
PTR_ADD S12, S10, T0
PTR_ADD S13, S11, T0
PTR_ADD S14, S12, T0
PTR_ADD S15, S13, T0
PTR_ADD S16, S14, T0
// TS = old TS + 16 * TL: start of the next panel.
PTR_ADD TS, S15, T0
beq I, ZERO, .L_M7
.align 5
.L_M8:
// Vectorised body: load 8 elements (32 bytes) from each of the 16 source
// vectors, transpose them as two 8x8 tiles and store 0x200 bytes.
xvld U0, S1, 0x00
xvld U1, S2, 0x00
xvld U2, S3, 0x00
xvld U3, S4, 0x00
xvld U4, S5, 0x00
xvld U5, S6, 0x00
xvld U6, S7, 0x00
xvld U7, S8, 0x00
xvld U8, S9, 0x00
xvld U9, S10, 0x00
xvld U10, S11, 0x00
xvld U11, S12, 0x00
xvld U12, S13, 0x00
xvld U13, S14, 0x00
xvld U14, S15, 0x00
xvld U15, S16, 0x00
// Tile from S1..S8 goes to the even D registers; the odd D registers serve
// as scratch here and are produced for real by the second transpose.
GTRANSPOSE8x8_W D0, D2, D4, D6, D8, D10, D12, D14, \
U0, U1, U2, U3, U4, U5, U6, U7, \
D1, D3, D5, D7 // As tmp
// Tile from S9..S16 goes to the odd D registers; U0..U3 are free by now.
GTRANSPOSE8x8_W D1, D3, D5, D7, D9, D11, D13, D15, \
U8, U9, U10, U11, U12, U13, U14, U15, \
U0, U1, U2, U3 // As tmp
// Storing D0, D1, D2, ... in order places, for each of the 8 positions, the
// 8 values from S1..S8 followed by the 8 values from S9..S16.
GST xv, , D0, TD, 0x00, D1, TD, 0x20, D2, TD, 0x40, D3, TD, 0x60, \
D4, TD, 0x80, D5, TD, 0xA0, D6, TD, 0xC0, D7, TD, 0xE0
PTR_ADDI TD, TD, 0x100
GST xv, , D8, TD, 0x00, D9, TD, 0x20, D10, TD, 0x40, D11, TD, 0x60, \
D12, TD, 0x80, D13, TD, 0xA0, D14, TD, 0xC0, D15, TD, 0xE0
PTR_ADDI TD, TD, 0x100
// Advance every read cursor by the 8 elements just consumed.
PTR_ADDI S1, S1, 0x20
PTR_ADDI S2, S2, 0x20
PTR_ADDI S3, S3, 0x20
PTR_ADDI S4, S4, 0x20
PTR_ADDI S5, S5, 0x20
PTR_ADDI S6, S6, 0x20
PTR_ADDI S7, S7, 0x20
PTR_ADDI S8, S8, 0x20
PTR_ADDI S9, S9, 0x20
PTR_ADDI S10, S10, 0x20
PTR_ADDI S11, S11, 0x20
PTR_ADDI S12, S12, 0x20
PTR_ADDI S13, S13, 0x20
PTR_ADDI S14, S14, 0x20
PTR_ADDI S15, S15, 0x20
PTR_ADDI S16, S16, 0x20
PTR_ADDI I, I, -1
blt ZERO, I, .L_M8
.L_M7:
// Remainder along M for the 16-wide panel: I = M % 8 scalar iterations.
andi I, M, 0x07
beq I, ZERO, .L_M0
.align 5
.L_M1:
// One element from each of S1..S8, stored as 8 consecutive values.
fld.s F0, S1, 0x00
fld.s F1, S2, 0x00
fld.s F2, S3, 0x00
fld.s F3, S4, 0x00
fld.s F4, S5, 0x00
fld.s F5, S6, 0x00
fld.s F6, S7, 0x00
fld.s F7, S8, 0x00
fst.s F0, TD, 0x00
fst.s F1, TD, 0x04
fst.s F2, TD, 0x08
fst.s F3, TD, 0x0C
fst.s F4, TD, 0x10
fst.s F5, TD, 0x14
fst.s F6, TD, 0x18
fst.s F7, TD, 0x1C
PTR_ADDI S1, S1, 0x04
PTR_ADDI S2, S2, 0x04
PTR_ADDI S3, S3, 0x04
PTR_ADDI S4, S4, 0x04
PTR_ADDI S5, S5, 0x04
PTR_ADDI S6, S6, 0x04
PTR_ADDI S7, S7, 0x04
PTR_ADDI S8, S8, 0x04
PTR_ADDI TD, TD, 0x20
// Then one element from each of S9..S16, completing the group of 16.
fld.s F0, S9, 0x00
fld.s F1, S10, 0x00
fld.s F2, S11, 0x00
fld.s F3, S12, 0x00
fld.s F4, S13, 0x00
fld.s F5, S14, 0x00
fld.s F6, S15, 0x00
fld.s F7, S16, 0x00
fst.s F0, TD, 0x00
fst.s F1, TD, 0x04
fst.s F2, TD, 0x08
fst.s F3, TD, 0x0C
fst.s F4, TD, 0x10
fst.s F5, TD, 0x14
fst.s F6, TD, 0x18
fst.s F7, TD, 0x1C
PTR_ADDI S9, S9, 0x04
PTR_ADDI S10, S10, 0x04
PTR_ADDI S11, S11, 0x04
PTR_ADDI S12, S12, 0x04
PTR_ADDI S13, S13, 0x04
PTR_ADDI S14, S14, 0x04
PTR_ADDI S15, S15, 0x04
PTR_ADDI S16, S16, 0x04
PTR_ADDI TD, TD, 0x20
PTR_ADDI I, I, -1
blt ZERO, I, .L_M1
.L_M0:
// J was already decremented at the top of the panel; loop while panels remain.
blt ZERO, J, .L_N16
.L_N15:
// Remainder along N: nothing left if N % 16 == 0; skip this path unless
// bit 3 of N is set (a group of 8 source vectors remains).
andi J, N, 0x0f
beq ZERO, J, .L_N0
andi J, N, 0x08
beq ZERO, J, .L_N7
.L_N8:
// S(k) = TS + (k - 1) * TL for k = 1..8; TS advances by 8 * TL.
move S1, TS
PTR_ADD S2, TS, TL
// I = M / 8: iterations of the vectorised loop.
PTR_SRAI I, M, 0x03
PTR_ADD S3, S2, TL
PTR_ADD S4, S2, T0
PTR_ADD S5, S3, T0
PTR_ADD S6, S4, T0
PTR_ADD S7, S5, T0
PTR_ADD S8, S6, T0
PTR_ADD TS, S7, T0
beq I, ZERO, .L_N8_M7
.align 5
.L_N8_M8:
// Load an 8x8 tile, transpose it and store 0x100 bytes.
xvld U0, S1, 0x00
xvld U1, S2, 0x00
xvld U2, S3, 0x00
xvld U3, S4, 0x00
xvld U4, S5, 0x00
xvld U5, S6, 0x00
xvld U6, S7, 0x00
xvld U7, S8, 0x00
GTRANSPOSE8x8_W D0, D2, D4, D6, D8, D10, D12, D14, \
U0, U1, U2, U3, U4, U5, U6, U7, \
D1, D3, D5, D7 // As tmp
// Only the even D registers hold results; the odd ones were scratch.
GST xv, , D0, TD, 0x00, D2, TD, 0x20, D4, TD, 0x40, D6, TD, 0x60, \
D8, TD, 0x80, D10, TD, 0xA0, D12, TD, 0xC0, D14, TD, 0xE0
PTR_ADDI TD, TD, 0x100
PTR_ADDI S1, S1, 0x20
PTR_ADDI S2, S2, 0x20
PTR_ADDI S3, S3, 0x20
PTR_ADDI S4, S4, 0x20
PTR_ADDI S5, S5, 0x20
PTR_ADDI S6, S6, 0x20
PTR_ADDI S7, S7, 0x20
PTR_ADDI S8, S8, 0x20
PTR_ADDI I, I, -1
blt ZERO, I, .L_N8_M8
.L_N8_M7:
// Remainder along M: I = M % 8 scalar iterations.
andi I, M, 0x07
beq I, ZERO, .L_N7
.align 5
.L_N8_M1:
// One element from each of S1..S8; stores and cursor updates interleaved.
fld.s F0, S1, 0x00
fld.s F1, S2, 0x00
fld.s F2, S3, 0x00
fld.s F3, S4, 0x00
fld.s F4, S5, 0x00
fld.s F5, S6, 0x00
fld.s F6, S7, 0x00
fld.s F7, S8, 0x00
fst.s F0, TD, 0x00
PTR_ADDI S1, S1, 0x04
fst.s F1, TD, 0x04
PTR_ADDI S2, S2, 0x04
fst.s F2, TD, 0x08
PTR_ADDI S3, S3, 0x04
fst.s F3, TD, 0x0C
PTR_ADDI S4, S4, 0x04
fst.s F4, TD, 0x10
PTR_ADDI S5, S5, 0x04
fst.s F5, TD, 0x14
PTR_ADDI S6, S6, 0x04
fst.s F6, TD, 0x18
PTR_ADDI S7, S7, 0x04
fst.s F7, TD, 0x1C
PTR_ADDI S8, S8, 0x04
PTR_ADDI TD, TD, 0x20
PTR_ADDI I, I, -1
blt ZERO, I, .L_N8_M1
.L_N7:
// Nothing left if N % 8 == 0; skip this path unless bit 2 of N is set
// (a group of 4 source vectors remains).
andi J, N, 0x07
beq ZERO, J, .L_N0
andi J, N, 0x04
beq ZERO, J, .L_N3
.L_N4:
// S(k) = TS + (k - 1) * TL for k = 1..4; TS advances by 4 * TL.
move S1, TS
PTR_ADD S2, TS, TL
// I = M / 4: iterations of the 128-bit vector loop.
PTR_SRAI I, M, 0x02
PTR_ADD S3, S2, TL
PTR_ADD S4, S2, T0
PTR_ADD TS, S3, T0
beq I, ZERO, .L_N4_M3
.align 5
.L_N4_M4:
// 4x4 transpose of word elements using two rounds of interleaves.
GLD v, , $vr0, S1, 0, $vr1, S2, 0, $vr2, S3, 0, $vr3, S4, 0
GSBUTTERFLY v, w, $vr4, $vr5, $vr2, $vr0
GSBUTTERFLY v, w, $vr6, $vr7, $vr3, $vr1
GSBUTTERFLY v, w, $vr0, $vr1, $vr6, $vr4
GSBUTTERFLY v, w, $vr2, $vr3, $vr7, $vr5
GST v, , $vr0, TD, 0x00, $vr1, TD, 0x10, $vr2, TD, 0x20, $vr3, TD, 0x30
PTR_ADDI S1, S1, 0x10
PTR_ADDI S2, S2, 0x10
PTR_ADDI S3, S3, 0x10
PTR_ADDI S4, S4, 0x10
PTR_ADDI TD, TD, 0x40
PTR_ADDI I, I, -1
blt ZERO, I, .L_N4_M4
.L_N4_M3:
// Remainder along M: I = M % 4 scalar iterations.
andi I, M, 0x03
beq I, ZERO, .L_N3
.align 5
.L_N4_M1:
fld.s F0, S1, 0x00
fld.s F1, S2, 0x00
fld.s F2, S3, 0x00
fld.s F3, S4, 0x00
fst.s F0, TD, 0x00
PTR_ADDI S1, S1, 0x04
fst.s F1, TD, 0x04
PTR_ADDI S2, S2, 0x04
fst.s F2, TD, 0x08
PTR_ADDI S3, S3, 0x04
fst.s F3, TD, 0x0C
PTR_ADDI S4, S4, 0x04
PTR_ADDI TD, TD, 0x10
PTR_ADDI I, I, -1
blt ZERO, I, .L_N4_M1
.L_N3:
// Nothing left if N % 4 == 0; when bit 1 of N is clear exactly one source
// vector remains, so go straight to .L_N1.
andi J, N, 0x03
beq ZERO, J, .L_N0
andi J, N, 0x02
beq ZERO, J, .L_N1
.L_N2:
// Two source vectors: S1 = TS, S2 = TS + TL; TS advances by 2 * TL.
move S1, TS
PTR_ADD S2, TS, TL
// I = M / 2.
PTR_SRAI I, M, 0x01
PTR_ADD TS, S2, TL
beq I, ZERO, .L_N2_M1
.align 5
.L_N2_M2:
// Each fld.d fetches two adjacent 4-byte elements; vilvl.w interleaves the
// two registers word-wise so the 16-byte store alternates S1 and S2 values
// (this relies on F0/F1 being the low halves of $vr0/$vr1).
GLD f, d, F0, S1, 0x00, F1, S2, 0x00
vilvl.w $vr0, $vr1, $vr0
GST v, , $vr0, TD, 0x00
PTR_ADDI S1, S1, 0x08
PTR_ADDI S2, S2, 0x08
PTR_ADDI TD, TD, 0x10
PTR_ADDI I, I, -1
blt ZERO, I, .L_N2_M2
.L_N2_M1:
// Odd M: copy the final pair of elements. Both exits from this path
// continue at .L_N1.
andi I, M, 0x01
beq I, ZERO, .L_N1
fld.s F0, S1, 0x00
fld.s F1, S2, 0x00
fst.s F0, TD, 0x00
PTR_ADDI S1, S1, 0x04
fst.s F1, TD, 0x04
PTR_ADDI S2, S2, 0x04
PTR_ADDI TD, TD, 0x08
.align 5
.L_N1:
// Pack the last single source vector, but only when N is odd. This label is
// reached both by the branch in .L_N3 (where N & 1 is already known to be
// set) and by fall-through from the two-vector path, where N & 1 may be
// clear. Without this test the fall-through case read M elements beyond the
// last source vector and wrote them past the end of the packed panel.
// J is free to clobber: it is only used as a scratch flag from here on.
andi J, N, 0x01
beq ZERO, J, .L_N0
move S1, TS
beq ZERO, M, .L_N0
.L_N1_M1:
// Scalar copy, one 4-byte element per iteration; M itself is the counter.
fld.s F0, S1, 0x00
PTR_ADDI S1, S1, 0x04
fst.s F0, TD, 0x00
PTR_ADDI TD, TD, 0x04
PTR_ADDI M, M, -1
blt ZERO, M, .L_N1_M1
.L_N0:
// Restore the registers spilled by push_if_used in the prologue and return.
pop_if_used 26, 32
jirl $r0, $r1, 0x0
EPILOGUE

View File

@ -0,0 +1,298 @@
/*******************************************************************************
Copyright (c) 2023, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#define ASSEMBLER
#include "common.h"
#include "loongarch64_asm.S"
/*********************************************************************
* 2023/08/23 guxiwei
* UTEST : OK
* CTEST : OK
* TEST : OK
*********************************************************************/
/* Function parameters */
#define M $r4 // param 1: m
#define N $r5 // param 2: n
#define SRC $r6 // param 3: src
#define LDA $r7 // param 4: lda
#define DST $r8 // param 5: dst
// Loop counters: I counts along M inside a panel, J counts panels along N.
#define I $r9
#define J $r10
// S1..S8: read cursors, one per source vector of the current panel; each is
// one LDA stride further into the source than the previous one.
#define S1 $r12
#define S2 $r13
#define S3 $r14
#define S4 $r15
#define S5 $r16
#define S6 $r17
#define S7 $r18
#define S8 $r19
// TD: write cursor into the packed destination.
// TS: start of the next source panel.
#define TD $r20
#define TS $r11
// TL and T0 deliberately reuse the LDA ($r7) and SRC ($r6) argument
// registers: TL is LDA scaled to bytes in place, and T0 (2 * TL) is only
// written after SRC has been copied into TS.
#define TL $r7
#define T0 $r6
#undef ZERO
#define ZERO $r0
#define F0 $f0
#define F1 $f1
#define F2 $f2
#define F3 $f3
#define F4 $f4
#define F5 $f5
#define F6 $f6
#define F7 $f7
/* LASX vectors */
#define U0 $xr0
#define U1 $xr1
#define U2 $xr2
#define U3 $xr3
#define U4 $xr4
#define U5 $xr5
#define U6 $xr6
#define U7 $xr7
#define D0 $xr8
#define D1 $xr9
#define D2 $xr10
#define D3 $xr11
#define D4 $xr12
#define D5 $xr13
#define D6 $xr14
#define D7 $xr15
#define D8 $xr16
#define D10 $xr17
#define D12 $xr18
#define D14 $xr19
// Loops outline
//.L_N8: <----------------
//| .L_M8: |
//| .L_M7: | Main Loop
//| .L_M1: |
//| .L_M0:--------------
//.L_N7:
//.L_N4:
//| .L_N4_M4:
//| .L_N4_M3:
//| .L_N4_M1:
//.L_N3:
//.L_N2:
//| .L_N2_M2:
//| .L_N2_M1:
//.L_N1:
//| .L_N1_M1:
//.L_N0
// Packing kernel entry. Source vectors are processed in panels of 8, then a
// remainder of 4, 2 and 1 (see the loops outline above).
PROLOGUE
// 17 integer / 20 FP registers: both are within the caller-saved limits, so
// this expands to nothing and is kept for symmetry with pop_if_used.
push_if_used 17, 20
move TD, DST
move TS, SRC
// TL = LDA * 4: stride between source vectors in bytes (4-byte elements).
PTR_SLLI TL, LDA, 0x02
// T0 = 2 * TL. This overwrites the SRC register, already saved in TS.
PTR_SLLI T0, TL, 0x01
// J = N / 8: number of full 8-wide panels.
PTR_SRAI J, N, 0x03
beq J, ZERO, .L_N7
.align 5
.L_N8:
// Start of one 8-wide panel: S(k) = TS + (k - 1) * TL for k = 1..8. The
// counter updates for I and J are interleaved with the pointer arithmetic.
move S1, TS
PTR_ADD S2, TS, TL
// I = M / 8: iterations of the vectorised inner loop.
PTR_SRAI I, M, 0x03
PTR_ADD S3, S2, TL
PTR_ADDI J, J, -1
PTR_ADD S4, S2, T0
PTR_ADD S5, S3, T0
PTR_ADD S6, S4, T0
PTR_ADD S7, S5, T0
PTR_ADD S8, S6, T0
// TS = old TS + 8 * TL: start of the next panel.
PTR_ADD TS, S7, T0
beq I, ZERO, .L_M7
.align 5
.L_M8:
// Load an 8x8 tile of 4-byte elements, transpose it and store 0x100 bytes.
xvld U0, S1, 0x00
xvld U1, S2, 0x00
xvld U2, S3, 0x00
xvld U3, S4, 0x00
xvld U4, S5, 0x00
xvld U5, S6, 0x00
xvld U6, S7, 0x00
xvld U7, S8, 0x00
GTRANSPOSE8x8_W D0, D2, D4, D6, D8, D10, D12, D14, \
U0, U1, U2, U3, U4, U5, U6, U7, \
D1, D3, D5, D7 // As tmp
// Only D0, D2, ..., D14 hold results; D1, D3, D5, D7 were scratch.
GST xv, , D0, TD, 0x00, D2, TD, 0x20, D4, TD, 0x40, D6, TD, 0x60, \
D8, TD, 0x80, D10, TD, 0xA0, D12, TD, 0xC0, D14, TD, 0xE0
PTR_ADDI TD, TD, 0x100
PTR_ADDI S1, S1, 0x20
PTR_ADDI S2, S2, 0x20
PTR_ADDI S3, S3, 0x20
PTR_ADDI S4, S4, 0x20
PTR_ADDI S5, S5, 0x20
PTR_ADDI S6, S6, 0x20
PTR_ADDI S7, S7, 0x20
PTR_ADDI S8, S8, 0x20
PTR_ADDI I, I, -1
blt ZERO, I, .L_M8
.L_M7:
// Remainder along M: I = M % 8 scalar iterations.
andi I, M, 0x07
beq I, ZERO, .L_M0
.align 5
.L_M1:
// One element from each of S1..S8; stores and cursor updates interleaved.
fld.s F0, S1, 0x00
fld.s F1, S2, 0x00
fld.s F2, S3, 0x00
fld.s F3, S4, 0x00
fld.s F4, S5, 0x00
fld.s F5, S6, 0x00
fld.s F6, S7, 0x00
fld.s F7, S8, 0x00
fst.s F0, TD, 0x00
PTR_ADDI S1, S1, 0x04
fst.s F1, TD, 0x04
PTR_ADDI S2, S2, 0x04
fst.s F2, TD, 0x08
PTR_ADDI S3, S3, 0x04
fst.s F3, TD, 0x0C
PTR_ADDI S4, S4, 0x04
fst.s F4, TD, 0x10
PTR_ADDI S5, S5, 0x04
fst.s F5, TD, 0x14
PTR_ADDI S6, S6, 0x04
fst.s F6, TD, 0x18
PTR_ADDI S7, S7, 0x04
fst.s F7, TD, 0x1C
PTR_ADDI S8, S8, 0x04
PTR_ADDI TD, TD, 0x20
PTR_ADDI I, I, -1
blt ZERO, I, .L_M1
.L_M0:
// J was already decremented at the top of the panel; loop while panels remain.
blt ZERO, J, .L_N8
.L_N7:
// Remainder along N: nothing left if N % 8 == 0; skip this path unless
// bit 2 of N is set (a group of 4 source vectors remains).
andi J, N, 0x07
beq ZERO, J, .L_N0
andi J, N, 0x04
beq ZERO, J, .L_N3
.L_N4:
// S(k) = TS + (k - 1) * TL for k = 1..4; TS advances by 4 * TL.
move S1, TS
PTR_ADD S2, TS, TL
// I = M / 4: iterations of the 128-bit vector loop.
PTR_SRAI I, M, 0x02
PTR_ADD S3, S2, TL
PTR_ADD S4, S2, T0
PTR_ADD TS, S3, T0
beq I, ZERO, .L_N4_M3
.align 5
.L_N4_M4:
// 4x4 transpose of word elements using two rounds of interleaves.
GLD v, , $vr0, S1, 0, $vr1, S2, 0, $vr2, S3, 0, $vr3, S4, 0
GSBUTTERFLY v, w, $vr4, $vr5, $vr2, $vr0
GSBUTTERFLY v, w, $vr6, $vr7, $vr3, $vr1
GSBUTTERFLY v, w, $vr0, $vr1, $vr6, $vr4
GSBUTTERFLY v, w, $vr2, $vr3, $vr7, $vr5
GST v, , $vr0, TD, 0x00, $vr1, TD, 0x10, $vr2, TD, 0x20, $vr3, TD, 0x30
PTR_ADDI S1, S1, 0x10
PTR_ADDI S2, S2, 0x10
PTR_ADDI S3, S3, 0x10
PTR_ADDI S4, S4, 0x10
PTR_ADDI TD, TD, 0x40
PTR_ADDI I, I, -1
blt ZERO, I, .L_N4_M4
.L_N4_M3:
// Remainder along M: I = M % 4 scalar iterations.
andi I, M, 0x03
beq I, ZERO, .L_N3
.align 5
.L_N4_M1:
fld.s F0, S1, 0x00
fld.s F1, S2, 0x00
fld.s F2, S3, 0x00
fld.s F3, S4, 0x00
fst.s F0, TD, 0x00
PTR_ADDI S1, S1, 0x04
fst.s F1, TD, 0x04
PTR_ADDI S2, S2, 0x04
fst.s F2, TD, 0x08
PTR_ADDI S3, S3, 0x04
fst.s F3, TD, 0x0C
PTR_ADDI S4, S4, 0x04
PTR_ADDI TD, TD, 0x10
PTR_ADDI I, I, -1
blt ZERO, I, .L_N4_M1
.L_N3:
// Nothing left if N % 4 == 0; when bit 1 of N is clear exactly one source
// vector remains, so go straight to .L_N1.
andi J, N, 0x03
beq ZERO, J, .L_N0
andi J, N, 0x02
beq ZERO, J, .L_N1
.L_N2:
// Two source vectors: S1 = TS, S2 = TS + TL; TS advances by 2 * TL.
move S1, TS
PTR_ADD S2, TS, TL
// I = M / 2.
PTR_SRAI I, M, 0x01
PTR_ADD TS, S2, TL
beq I, ZERO, .L_N2_M1
.align 5
.L_N2_M2:
// Each fld.d fetches two adjacent 4-byte elements; vilvl.w interleaves the
// two registers word-wise so the 16-byte store alternates S1 and S2 values
// (this relies on F0/F1 being the low halves of $vr0/$vr1).
GLD f, d, F0, S1, 0x00, F1, S2, 0x00
vilvl.w $vr0, $vr1, $vr0
GST v, , $vr0, TD, 0x00
PTR_ADDI S1, S1, 0x08
PTR_ADDI S2, S2, 0x08
PTR_ADDI TD, TD, 0x10
PTR_ADDI I, I, -1
blt ZERO, I, .L_N2_M2
.L_N2_M1:
// Odd M: copy the final pair of elements. Both exits from this path
// continue at .L_N1.
andi I, M, 0x01
beq I, ZERO, .L_N1
fld.s F0, S1, 0x00
fld.s F1, S2, 0x00
fst.s F0, TD, 0x00
PTR_ADDI S1, S1, 0x04
fst.s F1, TD, 0x04
PTR_ADDI S2, S2, 0x04
PTR_ADDI TD, TD, 0x08
.align 5
.L_N1:
// Pack the last single source vector, but only when N is odd. This label is
// reached both by the branch in .L_N3 (where N & 1 is already known to be
// set) and by fall-through from the two-vector path, where N & 1 may be
// clear. Without this test the fall-through case read M elements beyond the
// last source vector and wrote them past the end of the packed panel.
// J is free to clobber: it is only used as a scratch flag from here on.
andi J, N, 0x01
beq ZERO, J, .L_N0
move S1, TS
beq ZERO, M, .L_N0
.L_N1_M1:
// Scalar copy, one 4-byte element per iteration; M itself is the counter.
fld.s F0, S1, 0x00
PTR_ADDI S1, S1, 0x04
fst.s F0, TD, 0x00
PTR_ADDI TD, TD, 0x04
PTR_ADDI M, M, -1
blt ZERO, M, .L_N1_M1
.L_N0:
// Counterpart of push_if_used in the prologue (expands to nothing for
// 17 / 20), then return.
pop_if_used 17, 20
jirl $r0, $r1, 0x0
EPILOGUE

View File

@ -0,0 +1,526 @@
/*******************************************************************************
Copyright (c) 2023, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#define ASSEMBLER
#include "common.h"
#include "loongarch64_asm.S"
/*********************************************************************
* 2023/08/23 guxiwei
* UTEST : OK
* CTEST : OK
* TEST : OK
*********************************************************************/
/* Function parameters (integer argument registers) */
#define M $r4 // param 1: m, number of source rows
#define N $r5 // param 2: n, number of 4-byte elements per source row
#define SRC $r6 // param 3: src, first element of the source matrix
#define LDA $r7 // param 4: lda, source row stride in elements
#define DST $r8 // param 5: dst, start of the packed output buffer
/* Loop counters */
#define I $r9 // column direction: 16-column block count, then remainder-bit test
#define J $r10 // row direction: 8-row group count, then remainder-bit test
/* Source pointers */
#define S0 $r11 // first row of the next row group
#define S1 $r12 // row 0 of the current row group
#define S2 $r13 // row 1
#define S3 $r14 // row 2
#define S4 $r15 // row 3
#define S5 $r16 // row 4
#define S6 $r17 // row 5
#define S7 $r18 // row 6
#define S8 $r19 // row 7
/* Destination pointers */
#define P0 $r20 // slot of the next row group inside the first 16-column panel
#define P1 $r23 // write cursor for full 16-column blocks
#define P2 $r24 // write cursor for the 8-column remainder region
#define P3 $r25 // write cursor for the 4-column remainder region
#define P4 $r26 // write cursor for the 2-column remainder region
#define P5 $r27 // write cursor for the 1-column remainder region
/* Temporaries */
#define T0 $r28 // prologue scratch, afterwards 2 * TL (two-row stride in bytes)
#define T1 $r29 // prologue scratch, afterwards M * 64 (bytes per 16-column panel)
#define TL $r7 // row stride in bytes; shares $r7 with LDA, which is dead once TL is set
/* Drop any ZERO inherited from the included headers before aliasing it to
 * the zero register, as the sibling 8-column kernel does. */
#undef ZERO
#define ZERO $r0
/* LASX vectors (256 bit, eight 4-byte elements each) */
#define U0 $xr0
#define U1 $xr1
#define U2 $xr2
#define U3 $xr3
#define U4 $xr4
#define U5 $xr5
#define U6 $xr6
#define U7 $xr7
// Loops outline
//.L_M8 <-------------------
//| .L_N16: |
//| .L_N15: |
//| .L_N8: |
//| .L_N7: | Main Loop
//| .L_N4: |
//| .L_N3: |
//| .L_N2: |
//| .L_N1: |
//| .L_N0: ---------------
//.L_M7
//.L_M4
//| .L_M4_N16:
//| .L_M4_N15:
//| .L_M4_N8:
//| .L_M4_N7:
//| .L_M4_N4:
//| .L_M4_N3:
//| .L_M4_N2:
//| .L_M4_N1:
//.L_M3
//.L_M2
//| .L_M2_N16:
//| .L_M2_N15:
//| .L_M2_N8:
//| .L_M2_N7:
//| .L_M2_N4:
//| .L_M2_N3:
//| .L_M2_N2:
//| .L_M2_N1:
//.L_M1
//| .L_M1_N16:
//| .L_M1_N15:
//| .L_M1_N8:
//| .L_M1_N7:
//| .L_M1_N4:
//| .L_M1_N3:
//| .L_M1_N2:
//| .L_M1_N1:
//.L_M0
/*
 * Transposed-copy (packing) kernel for 4-byte elements, 16-column panels.
 *
 * Reads M rows of N elements from SRC (row stride LDA elements) and writes
 * them to DST as column panels:
 *   - first every full 16-column panel; a panel holds all M rows back to
 *     back, 16 elements per row, so one panel is M * 64 bytes;
 *   - then one region each for the 8-, 4-, 2- and 1-column remainders
 *     selected by bits 3..0 of N, each again holding all M rows back to back.
 * Rows are consumed in groups of 8, then 4, 2 and 1 according to the bits of M.
 *
 * PROLOGUE/EPILOGUE, push_if_used/pop_if_used, PTR_* and GLD/GST come from
 * the included headers. GLD/GST are assumed to emit one load/store per
 * (register, base, offset) triple -- confirm against loongarch64_asm.S.
 */
PROLOGUE
    // Presumably spills the callee-saved registers among the 24 GPRs and
    // 8 FPRs this kernel touches -- see the macro definition.
    push_if_used 24, 8

    move S0, SRC
    move P0, DST

    // P2 = DST + M * (N rounded down to a multiple of 16) * 4
    //      -> start of the 8-column remainder region
    // P3 = DST + M * (N rounded down to a multiple of 8) * 4
    //      -> start of the 4-column remainder region
    PTR_SRAI T0, N, 0x04
    PTR_SRAI T1, N, 0x03
    PTR_SLLI T0, T0, 0x04
    PTR_SLLI T1, T1, 0x03
    PTR_MUL P2, M, T0
    PTR_MUL P3, M, T1
    PTR_SLLI P2, P2, 0x02
    PTR_SLLI P3, P3, 0x02
    PTR_ADD P2, DST, P2
    PTR_ADD P3, DST, P3

    // P4 = DST + M * (N rounded down to a multiple of 4) * 4
    //      -> start of the 2-column remainder region
    // P5 = DST + M * (N rounded down to a multiple of 2) * 4
    //      -> start of the 1-column remainder region
    PTR_SRAI T0, N, 0x02
    PTR_SRAI T1, N, 0x01
    PTR_SLLI T0, T0, 0x02
    PTR_SLLI T1, T1, 0x01
    PTR_MUL P4, M, T0
    PTR_MUL P5, M, T1
    PTR_SLLI P4, P4, 0x02
    PTR_SLLI P5, P5, 0x02
    PTR_ADD P4, DST, P4
    PTR_ADD P5, DST, P5

    PTR_SLLI TL, LDA, 0x02      // TL = row stride in bytes (overwrites LDA)
    PTR_SRAI J, M, 0x03         // J  = number of 8-row groups
    PTR_SLLI T0, TL, 0x01       // T0 = two-row stride in bytes
    PTR_SLLI T1, M, 0x06        // T1 = M * 64, bytes per 16-column panel
    beq ZERO, J, .L_M7
    .align 5

/* ---------------- groups of 8 rows ---------------- */
.L_M8:
    // S1..S8 address the 8 rows of this group; odd and even rows are
    // chained separately with the two-row stride. S0 moves to the next group.
    move S1, S0
    PTR_ADD S2, S0, TL
    PTR_ADD S3, S1, T0
    PTR_ADD S4, S2, T0
    PTR_ADD S5, S3, T0
    PTR_ADD S6, S4, T0
    PTR_ADD S7, S5, T0
    PTR_ADD S8, S6, T0
    PTR_ADD S0, S7, T0

    move P1, P0
    PTR_ADDI P0, P0, 0x200      // 8 rows * 16 elements * 4 bytes

    PTR_SRAI I, N, 0x04         // I = number of full 16-column blocks
    PTR_ADDI J, J, -1
    beq ZERO, I, .L_N15
.L_N16:
    // 8 rows x 16 columns: two 256-bit vectors per row, rows stored
    // consecutively at 0x40-byte intervals.
    xvld U0, S1, 0x00
    xvld U1, S1, 0x20
    xvld U2, S2, 0x00
    xvld U3, S2, 0x20
    xvst U0, P1, 0x00
    xvst U1, P1, 0x20
    xvst U2, P1, 0x40
    xvst U3, P1, 0x60

    xvld U4, S3, 0x00
    xvld U5, S3, 0x20
    xvld U6, S4, 0x00
    xvld U7, S4, 0x20
    xvst U4, P1, 0x80
    xvst U5, P1, 0xA0
    xvst U6, P1, 0xC0
    xvst U7, P1, 0xE0

    xvld U0, S5, 0x00
    xvld U1, S5, 0x20
    xvld U2, S6, 0x00
    xvld U3, S6, 0x20
    xvst U0, P1, 0x100
    xvst U1, P1, 0x120
    xvst U2, P1, 0x140
    xvst U3, P1, 0x160

    xvld U4, S7, 0x00
    xvld U5, S7, 0x20
    xvld U6, S8, 0x00
    xvld U7, S8, 0x20
    xvst U4, P1, 0x180
    xvst U5, P1, 0x1A0
    xvst U6, P1, 0x1C0
    xvst U7, P1, 0x1E0

    PTR_ADDI S1, S1, 0x40
    PTR_ADDI S2, S2, 0x40
    PTR_ADDI S3, S3, 0x40
    PTR_ADDI S4, S4, 0x40
    PTR_ADDI S5, S5, 0x40
    PTR_ADDI S6, S6, 0x40
    PTR_ADDI S7, S7, 0x40
    PTR_ADDI S8, S8, 0x40

    PTR_ADDI I, I, -1
    PTR_ADD P1, P1, T1          // same row slot in the next 16-column panel
    blt ZERO, I, .L_N16
.L_N15:
    andi I, N, 0x08             // 8-column remainder present?
    beq ZERO, I, .L_N7
.L_N8:
    // 8 rows x 8 columns: one 256-bit vector per row.
    xvld U0, S1, 0x00
    xvld U1, S2, 0x00
    xvld U2, S3, 0x00
    xvld U3, S4, 0x00
    xvld U4, S5, 0x00
    xvld U5, S6, 0x00
    xvld U6, S7, 0x00
    xvld U7, S8, 0x00
    GST xv, , U0, P2, 0x00, U1, P2, 0x20, U2, P2, 0x40, U3, P2, 0x60, \
              U4, P2, 0x80, U5, P2, 0xA0, U6, P2, 0xC0, U7, P2, 0xE0
    PTR_ADDI S1, S1, 0x20
    PTR_ADDI S2, S2, 0x20
    PTR_ADDI S3, S3, 0x20
    PTR_ADDI S4, S4, 0x20
    PTR_ADDI S5, S5, 0x20
    PTR_ADDI S6, S6, 0x20
    PTR_ADDI S7, S7, 0x20
    PTR_ADDI S8, S8, 0x20
    PTR_ADDI P2, P2, 0x100      // 8 rows * 8 elements * 4 bytes
.L_N7:
    andi I, N, 0x04             // 4-column remainder present?
    beq ZERO, I, .L_N3
.L_N4:
    // 8 rows x 4 columns: one 128-bit vector per row.
    GLD v, , $vr0, S1, 0x00, $vr1, S2, 0x00, $vr2, S3, 0x00, $vr3, S4, 0x00, \
             $vr4, S5, 0x00, $vr5, S6, 0x00, $vr6, S7, 0x00, $vr7, S8, 0x00
    GST v, , $vr0, P3, 0x00, $vr1, P3, 0x10, $vr2, P3, 0x20, $vr3, P3, 0x30, \
             $vr4, P3, 0x40, $vr5, P3, 0x50, $vr6, P3, 0x60, $vr7, P3, 0x70
    PTR_ADDI S1, S1, 0x10
    PTR_ADDI S2, S2, 0x10
    PTR_ADDI S3, S3, 0x10
    PTR_ADDI S4, S4, 0x10
    PTR_ADDI S5, S5, 0x10
    PTR_ADDI S6, S6, 0x10
    PTR_ADDI S7, S7, 0x10
    PTR_ADDI S8, S8, 0x10
    PTR_ADDI P3, P3, 0x80       // 8 rows * 4 elements * 4 bytes
.L_N3:
    andi I, N, 0x02             // 2-column remainder present?
    beq ZERO, I, .L_N1
.L_N2:
    // 8 rows x 2 columns: each pair of elements moved as one 8-byte value.
    GLD f, d, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00, \
              $f4, S5, 0x00, $f5, S6, 0x00, $f6, S7, 0x00, $f7, S8, 0x00
    GST f, d, $f0, P4, 0x00, $f1, P4, 0x08, $f2, P4, 0x10, $f3, P4, 0x18, \
              $f4, P4, 0x20, $f5, P4, 0x28, $f6, P4, 0x30, $f7, P4, 0x38
    PTR_ADDI S1, S1, 0x08
    PTR_ADDI S2, S2, 0x08
    PTR_ADDI S3, S3, 0x08
    PTR_ADDI S4, S4, 0x08
    PTR_ADDI S5, S5, 0x08
    PTR_ADDI S6, S6, 0x08
    PTR_ADDI S7, S7, 0x08
    PTR_ADDI S8, S8, 0x08
    PTR_ADDI P4, P4, 0x40       // 8 rows * 2 elements * 4 bytes
.L_N1:
    andi I, N, 0x01             // last odd column present?
    beq ZERO, I, .L_N0
    // 8 rows x 1 column.
    GLD f, s, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00, \
              $f4, S5, 0x00, $f5, S6, 0x00, $f6, S7, 0x00, $f7, S8, 0x00
    GST f, s, $f0, P5, 0x00, $f1, P5, 0x04, $f2, P5, 0x08, $f3, P5, 0x0C, \
              $f4, P5, 0x10, $f5, P5, 0x14, $f6, P5, 0x18, $f7, P5, 0x1C
    PTR_ADDI S1, S1, 0x04
    PTR_ADDI S2, S2, 0x04
    PTR_ADDI S3, S3, 0x04
    PTR_ADDI S4, S4, 0x04
    PTR_ADDI S5, S5, 0x04
    PTR_ADDI S6, S6, 0x04
    PTR_ADDI S7, S7, 0x04
    PTR_ADDI S8, S8, 0x04
    PTR_ADDI P5, P5, 0x20       // 8 rows * 1 element * 4 bytes
.L_N0:
    blt ZERO, J, .L_M8

/* ---------------- group of 4 rows (bit 2 of M) ---------------- */
.L_M7:
    andi J, M, 0x04
    beq ZERO, J, .L_M3
.L_M4:
    move S1, S0
    PTR_ADD S2, S0, TL
    PTR_ADD S3, S1, T0
    PTR_ADD S4, S2, T0
    PTR_ADD S0, S3, T0

    move P1, P0
    PTR_ADDI P0, P0, 0x100      // 4 rows * 16 elements * 4 bytes

    PTR_SRAI I, N, 0x04
    beq ZERO, I, .L_M4_N15
    .align 5
.L_M4_N16:
    xvld U0, S1, 0x00
    xvld U1, S1, 0x20
    xvld U2, S2, 0x00
    xvld U3, S2, 0x20
    xvst U0, P1, 0x00
    xvst U1, P1, 0x20
    xvst U2, P1, 0x40
    xvst U3, P1, 0x60

    xvld U4, S3, 0x00
    xvld U5, S3, 0x20
    xvld U6, S4, 0x00
    xvld U7, S4, 0x20
    xvst U4, P1, 0x80
    xvst U5, P1, 0xA0
    xvst U6, P1, 0xC0
    xvst U7, P1, 0xE0

    PTR_ADDI S1, S1, 0x40
    PTR_ADDI S2, S2, 0x40
    PTR_ADDI S3, S3, 0x40
    PTR_ADDI S4, S4, 0x40

    PTR_ADDI I, I, -1
    PTR_ADD P1, P1, T1          // same row slot in the next 16-column panel
    blt ZERO, I, .L_M4_N16
.L_M4_N15:
    andi I, N, 0x08
    beq ZERO, I, .L_M4_N7
.L_M4_N8:
    xvld U0, S1, 0x00
    xvld U1, S2, 0x00
    xvld U2, S3, 0x00
    xvld U3, S4, 0x00
    GST xv, , U0, P2, 0x00, U1, P2, 0x20, U2, P2, 0x40, U3, P2, 0x60
    PTR_ADDI S1, S1, 0x20
    PTR_ADDI S2, S2, 0x20
    PTR_ADDI S3, S3, 0x20
    PTR_ADDI S4, S4, 0x20
    PTR_ADDI P2, P2, 0x80       // 4 rows * 8 elements * 4 bytes
.L_M4_N7:
    andi I, N, 0x04
    beq ZERO, I, .L_M4_N3
.L_M4_N4:
    GLD v, , $vr0, S1, 0x00, $vr1, S2, 0x00, $vr2, S3, 0x00, $vr3, S4, 0x00
    GST v, , $vr0, P3, 0x00, $vr1, P3, 0x10, $vr2, P3, 0x20, $vr3, P3, 0x30
    PTR_ADDI S1, S1, 0x10
    PTR_ADDI S2, S2, 0x10
    PTR_ADDI S3, S3, 0x10
    PTR_ADDI S4, S4, 0x10
    PTR_ADDI P3, P3, 0x40       // 4 rows * 4 elements * 4 bytes
.L_M4_N3:
    andi I, N, 0x02
    beq ZERO, I, .L_M4_N1
.L_M4_N2:
    GLD f, d, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00
    GST f, d, $f0, P4, 0x00, $f1, P4, 0x08, $f2, P4, 0x10, $f3, P4, 0x18
    PTR_ADDI S1, S1, 0x08
    PTR_ADDI S2, S2, 0x08
    PTR_ADDI S3, S3, 0x08
    PTR_ADDI S4, S4, 0x08
    PTR_ADDI P4, P4, 0x20       // 4 rows * 2 elements * 4 bytes
.L_M4_N1:
    andi I, N, 0x01
    beq ZERO, I, .L_M3
    GLD f, s, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00
    GST f, s, $f0, P5, 0x00, $f1, P5, 0x04, $f2, P5, 0x08, $f3, P5, 0x0C
    PTR_ADDI S1, S1, 0x04
    PTR_ADDI S2, S2, 0x04
    PTR_ADDI S3, S3, 0x04
    PTR_ADDI S4, S4, 0x04
    PTR_ADDI P5, P5, 0x10       // 4 rows * 1 element * 4 bytes

/* ---------------- group of 2 rows (bit 1 of M) ---------------- */
.L_M3:
    andi J, M, 0x02
    beq ZERO, J, .L_M1
.L_M2:
    move S1, S0
    PTR_ADD S2, S0, TL
    PTR_ADD S0, S0, T0

    move P1, P0
    PTR_ADDI P0, P0, 0x80       // 2 rows * 16 elements * 4 bytes

    PTR_SRAI I, N, 0x04
    beq ZERO, I, .L_M2_N15
    .align 5
.L_M2_N16:
    xvld U0, S1, 0x00
    xvld U1, S1, 0x20
    xvld U2, S2, 0x00
    xvld U3, S2, 0x20
    xvst U0, P1, 0x00
    xvst U1, P1, 0x20
    xvst U2, P1, 0x40
    xvst U3, P1, 0x60

    PTR_ADDI S1, S1, 0x40
    PTR_ADDI S2, S2, 0x40

    PTR_ADDI I, I, -1
    PTR_ADD P1, P1, T1          // same row slot in the next 16-column panel
    blt ZERO, I, .L_M2_N16
.L_M2_N15:
    andi I, N, 0x08
    beq ZERO, I, .L_M2_N7
.L_M2_N8:
    xvld U0, S1, 0x00
    xvld U1, S2, 0x00
    GST xv, , U0, P2, 0x00, U1, P2, 0x20
    PTR_ADDI S1, S1, 0x20
    PTR_ADDI S2, S2, 0x20
    PTR_ADDI P2, P2, 0x40       // 2 rows * 8 elements * 4 bytes
.L_M2_N7:
    andi I, N, 0x04
    beq ZERO, I, .L_M2_N3
.L_M2_N4:
    GLD v, , $vr0, S1, 0x00, $vr1, S2, 0x00
    GST v, , $vr0, P3, 0x00, $vr1, P3, 0x10
    PTR_ADDI S1, S1, 0x10
    PTR_ADDI S2, S2, 0x10
    PTR_ADDI P3, P3, 0x20       // 2 rows * 4 elements * 4 bytes
.L_M2_N3:
    andi I, N, 0x02
    beq ZERO, I, .L_M2_N1
.L_M2_N2:
    GLD f, d, $f0, S1, 0x00, $f1, S2, 0x00
    GST f, d, $f0, P4, 0x00, $f1, P4, 0x08
    PTR_ADDI S1, S1, 0x08
    PTR_ADDI S2, S2, 0x08
    PTR_ADDI P4, P4, 0x10       // 2 rows * 2 elements * 4 bytes
.L_M2_N1:
    andi I, N, 0x01
    beq ZERO, I, .L_M1
    GLD f, s, $f0, S1, 0x00, $f1, S2, 0x00
    GST f, s, $f0, P5, 0x00, $f1, P5, 0x04
    PTR_ADDI S1, S1, 0x04
    PTR_ADDI S2, S2, 0x04
    PTR_ADDI P5, P5, 0x08       // 2 rows * 1 element * 4 bytes

/* ---------------- last single row (bit 0 of M) ---------------- */
.L_M1:
    andi J, M, 0x01
    beq ZERO, J, .L_M0

    // Only one row remains, so only S1 is needed.
    move S1, S0

    move P1, P0
    PTR_ADDI P0, P0, 0x40       // 1 row * 16 elements * 4 bytes

    PTR_SRAI I, N, 0x04
    beq ZERO, I, .L_M1_N15
    .align 5
.L_M1_N16:
    xvld U0, S1, 0x00
    xvld U1, S1, 0x20
    xvst U0, P1, 0x00
    xvst U1, P1, 0x20

    PTR_ADDI S1, S1, 0x40

    PTR_ADDI I, I, -1
    PTR_ADD P1, P1, T1          // same row slot in the next 16-column panel
    blt ZERO, I, .L_M1_N16
.L_M1_N15:
    andi I, N, 0x08
    beq ZERO, I, .L_M1_N7
.L_M1_N8:
    xvld U0, S1, 0x00
    GST xv, , U0, P2, 0x00
    PTR_ADDI S1, S1, 0x20
    PTR_ADDI P2, P2, 0x20
.L_M1_N7:
    andi I, N, 0x04
    beq ZERO, I, .L_M1_N3
.L_M1_N4:
    GLD v, , $vr0, S1, 0x00
    GST v, , $vr0, P3, 0x00
    PTR_ADDI S1, S1, 0x10
    PTR_ADDI P3, P3, 0x10
.L_M1_N3:
    andi I, N, 0x02
    beq ZERO, I, .L_M1_N1
.L_M1_N2:
    GLD f, d, $f0, S1, 0x00
    GST f, d, $f0, P4, 0x00
    PTR_ADDI S1, S1, 0x08
    PTR_ADDI P4, P4, 0x08
.L_M1_N1:
    andi I, N, 0x01
    beq ZERO, I, .L_M0
    GLD f, s, $f0, S1, 0x00
    GST f, s, $f0, P5, 0x00
    PTR_ADDI S1, S1, 0x04
    PTR_ADDI P5, P5, 0x04
.L_M0:
    pop_if_used 24, 8           // must mirror the push_if_used arguments above
    jirl $r0, $r1, 0x00         // return through $r1, discarding the link value
    EPILOGUE

View File

@ -0,0 +1,406 @@
/*******************************************************************************
Copyright (c) 2023, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#define ASSEMBLER
#include "common.h"
#include "loongarch64_asm.S"
/*********************************************************************
* 2023/08/23 guxiwei
* UTEST : OK
* CTEST : OK
* TEST : OK
*********************************************************************/
/* Function parameters (integer argument registers) */
#define M $r4 // param 1: m, number of source rows
#define N $r5 // param 2: n, number of 4-byte elements per source row
#define SRC $r6 // param 3: src, first element of the source matrix
#define LDA $r7 // param 4: lda, source row stride in elements
#define DST $r8 // param 5: dst, start of the packed output buffer
/* Loop counters */
#define I $r9 // column direction: 8-column block count, then remainder-bit test
#define J $r10 // row direction: 8-row group count, then remainder-bit test
/* Source pointers */
#define S0 $r11 // first row of the next row group
#define S1 $r12 // row 0 of the current row group
#define S2 $r13 // row 1
#define S3 $r14 // row 2
#define S4 $r15 // row 3
#define S5 $r16 // row 4
#define S6 $r17 // row 5
#define S7 $r18 // row 6
#define S8 $r19 // row 7
/* Destination pointers */
#define P0 $r20 // slot of the next row group inside the first 8-column panel
#define P1 $r23 // write cursor for full 8-column blocks
#define P2 $r24 // write cursor for the 4-column remainder region
#define P3 $r25 // write cursor for the 2-column remainder region
#define P4 $r26 // write cursor for the 1-column remainder region
/* Temporaries */
#define T0 $r27 // prologue scratch, afterwards 2 * TL (two-row stride in bytes)
#define T1 $r28 // prologue scratch, afterwards M * 32 (bytes per 8-column panel)
#define TL $r7 // row stride in bytes; shares $r7 with LDA, which is dead once TL is set
/* Drop any ZERO inherited from the included headers, then alias it to $r0. */
#undef ZERO
#define ZERO $r0
/* LASX vectors (256 bit, eight 4-byte elements each) */
#define U0 $xr0
#define U1 $xr1
#define U2 $xr2
#define U3 $xr3
#define U4 $xr4
#define U5 $xr5
#define U6 $xr6
#define U7 $xr7
// Loops outline
//.L_M8 <-------------------
//| .L_N8: |
//| .L_N7: | Main Loop
//| .L_N4: |
//| .L_N3: |
//| .L_N2: |
//| .L_N1: |
//| .L_N0: ---------------
//.L_M7
//.L_M4
//| .L_M4_N8:
//| .L_M4_N7:
//| .L_M4_N4:
//| .L_M4_N3:
//| .L_M4_N2:
//| .L_M4_N1:
//.L_M3
//.L_M2
//| .L_M2_N8:
//| .L_M2_N7:
//| .L_M2_N4:
//| .L_M2_N3:
//| .L_M2_N2:
//| .L_M2_N1:
//.L_M1
//| .L_M1_N8:
//| .L_M1_N7:
//| .L_M1_N4:
//| .L_M1_N3:
//| .L_M1_N2:
//| .L_M1_N1:
//.L_M0
/*
 * Transposed-copy (packing) kernel for 4-byte elements, 8-column panels.
 *
 * Reads M rows of N elements from SRC (row stride LDA elements) and writes
 * them to DST as column panels:
 *   - first every full 8-column panel; a panel holds all M rows back to
 *     back, 8 elements per row, so one panel is M * 32 bytes;
 *   - then one region each for the 4-, 2- and 1-column remainders selected
 *     by bits 2..0 of N, each again holding all M rows back to back.
 * Rows are consumed in groups of 8, then 4, 2 and 1 according to the bits of M.
 *
 * PROLOGUE/EPILOGUE, push_if_used/pop_if_used, PTR_* and GLD/GST come from
 * the included headers. GLD/GST are assumed to emit one load/store per
 * (register, base, offset) triple -- confirm against loongarch64_asm.S.
 */
PROLOGUE
    // Presumably spills the callee-saved registers among the 23 GPRs and
    // 8 FPRs this kernel touches -- see the macro definition.
    push_if_used 23, 8

    move S0, SRC
    move P0, DST

    // P2 = DST + M * (N rounded down to a multiple of 8) * 4
    //      -> start of the 4-column remainder region
    PTR_SRAI T1, N, 0x03
    PTR_SLLI T1, T1, 0x03
    PTR_MUL P2, M, T1
    PTR_SLLI P2, P2, 0x02
    PTR_ADD P2, DST, P2

    // P3 = DST + M * (N rounded down to a multiple of 4) * 4
    //      -> start of the 2-column remainder region
    // P4 = DST + M * (N rounded down to a multiple of 2) * 4
    //      -> start of the 1-column remainder region
    PTR_SRAI T0, N, 0x02
    PTR_SRAI T1, N, 0x01
    PTR_SLLI T0, T0, 0x02
    PTR_SLLI T1, T1, 0x01
    PTR_MUL P3, M, T0
    PTR_MUL P4, M, T1
    PTR_SLLI P3, P3, 0x02
    PTR_SLLI P4, P4, 0x02
    PTR_ADD P3, DST, P3
    PTR_ADD P4, DST, P4

    PTR_SLLI TL, LDA, 0x02      // TL = row stride in bytes (overwrites LDA)
    PTR_SRAI J, M, 0x03         // J  = number of 8-row groups
    PTR_SLLI T0, TL, 0x01       // T0 = two-row stride in bytes
    PTR_SLLI T1, M, 0x05        // T1 = M * 32, bytes per 8-column panel
    beq ZERO, J, .L_M7
    .align 5

/* ---------------- groups of 8 rows ---------------- */
.L_M8:
    // S1..S8 address the 8 rows of this group; odd and even rows are
    // chained separately with the two-row stride. S0 moves to the next group.
    move S1, S0
    PTR_ADD S2, S0, TL
    PTR_ADD S3, S1, T0
    PTR_ADD S4, S2, T0
    PTR_ADD S5, S3, T0
    PTR_ADD S6, S4, T0
    PTR_ADD S7, S5, T0
    PTR_ADD S8, S6, T0
    PTR_ADD S0, S7, T0

    move P1, P0
    PTR_ADDI P0, P0, 0x100      // 8 rows * 8 elements * 4 bytes

    PTR_SRAI I, N, 0x03         // I = number of full 8-column blocks
    PTR_ADDI J, J, -1
    beq ZERO, I, .L_N7
.L_N8:
    // 8 rows x 8 columns: one 256-bit vector per row.
    xvld U0, S1, 0x00
    xvld U1, S2, 0x00
    xvld U2, S3, 0x00
    xvld U3, S4, 0x00
    xvld U4, S5, 0x00
    xvld U5, S6, 0x00
    xvld U6, S7, 0x00
    xvld U7, S8, 0x00
    GST xv, , U0, P1, 0x00, U1, P1, 0x20, U2, P1, 0x40, U3, P1, 0x60, \
              U4, P1, 0x80, U5, P1, 0xA0, U6, P1, 0xC0, U7, P1, 0xE0
    PTR_ADDI S1, S1, 0x20
    PTR_ADDI S2, S2, 0x20
    PTR_ADDI S3, S3, 0x20
    PTR_ADDI S4, S4, 0x20
    PTR_ADDI S5, S5, 0x20
    PTR_ADDI S6, S6, 0x20
    PTR_ADDI S7, S7, 0x20
    PTR_ADDI S8, S8, 0x20

    PTR_ADDI I, I, -1
    PTR_ADD P1, P1, T1          // same row slot in the next 8-column panel
    blt ZERO, I, .L_N8
.L_N7:
    andi I, N, 0x04             // 4-column remainder present?
    beq ZERO, I, .L_N3
.L_N4:
    // 8 rows x 4 columns: one 128-bit vector per row.
    GLD v, , $vr0, S1, 0x00, $vr1, S2, 0x00, $vr2, S3, 0x00, $vr3, S4, 0x00, \
             $vr4, S5, 0x00, $vr5, S6, 0x00, $vr6, S7, 0x00, $vr7, S8, 0x00
    GST v, , $vr0, P2, 0x00, $vr1, P2, 0x10, $vr2, P2, 0x20, $vr3, P2, 0x30, \
             $vr4, P2, 0x40, $vr5, P2, 0x50, $vr6, P2, 0x60, $vr7, P2, 0x70
    PTR_ADDI S1, S1, 0x10
    PTR_ADDI S2, S2, 0x10
    PTR_ADDI S3, S3, 0x10
    PTR_ADDI S4, S4, 0x10
    PTR_ADDI S5, S5, 0x10
    PTR_ADDI S6, S6, 0x10
    PTR_ADDI S7, S7, 0x10
    PTR_ADDI S8, S8, 0x10
    PTR_ADDI P2, P2, 0x80       // 8 rows * 4 elements * 4 bytes
.L_N3:
    andi I, N, 0x02             // 2-column remainder present?
    beq ZERO, I, .L_N1
.L_N2:
    // 8 rows x 2 columns: each pair of elements moved as one 8-byte value.
    GLD f, d, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00, \
              $f4, S5, 0x00, $f5, S6, 0x00, $f6, S7, 0x00, $f7, S8, 0x00
    GST f, d, $f0, P3, 0x00, $f1, P3, 0x08, $f2, P3, 0x10, $f3, P3, 0x18, \
              $f4, P3, 0x20, $f5, P3, 0x28, $f6, P3, 0x30, $f7, P3, 0x38
    PTR_ADDI S1, S1, 0x08
    PTR_ADDI S2, S2, 0x08
    PTR_ADDI S3, S3, 0x08
    PTR_ADDI S4, S4, 0x08
    PTR_ADDI S5, S5, 0x08
    PTR_ADDI S6, S6, 0x08
    PTR_ADDI S7, S7, 0x08
    PTR_ADDI S8, S8, 0x08
    PTR_ADDI P3, P3, 0x40       // 8 rows * 2 elements * 4 bytes
.L_N1:
    andi I, N, 0x01             // last odd column present?
    beq ZERO, I, .L_N0
    // 8 rows x 1 column.
    GLD f, s, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00, \
              $f4, S5, 0x00, $f5, S6, 0x00, $f6, S7, 0x00, $f7, S8, 0x00
    GST f, s, $f0, P4, 0x00, $f1, P4, 0x04, $f2, P4, 0x08, $f3, P4, 0x0C, \
              $f4, P4, 0x10, $f5, P4, 0x14, $f6, P4, 0x18, $f7, P4, 0x1C
    PTR_ADDI S1, S1, 0x04
    PTR_ADDI S2, S2, 0x04
    PTR_ADDI S3, S3, 0x04
    PTR_ADDI S4, S4, 0x04
    PTR_ADDI S5, S5, 0x04
    PTR_ADDI S6, S6, 0x04
    PTR_ADDI S7, S7, 0x04
    PTR_ADDI S8, S8, 0x04
    PTR_ADDI P4, P4, 0x20       // 8 rows * 1 element * 4 bytes
.L_N0:
    blt ZERO, J, .L_M8

/* ---------------- group of 4 rows (bit 2 of M) ---------------- */
.L_M7:
    andi J, M, 0x04
    beq ZERO, J, .L_M3
.L_M4:
    move S1, S0
    PTR_ADD S2, S0, TL
    PTR_ADD S3, S1, T0
    PTR_ADD S4, S2, T0
    PTR_ADD S0, S3, T0

    move P1, P0
    PTR_ADDI P0, P0, 0x80       // 4 rows * 8 elements * 4 bytes

    PTR_SRAI I, N, 0x03
    beq ZERO, I, .L_M4_N7
    .align 5
.L_M4_N8:
    xvld U0, S1, 0x00
    xvld U1, S2, 0x00
    xvld U2, S3, 0x00
    xvld U3, S4, 0x00
    GST xv, , U0, P1, 0x00, U1, P1, 0x20, U2, P1, 0x40, U3, P1, 0x60
    PTR_ADDI S1, S1, 0x20
    PTR_ADDI S2, S2, 0x20
    PTR_ADDI S3, S3, 0x20
    PTR_ADDI S4, S4, 0x20

    PTR_ADDI I, I, -1
    PTR_ADD P1, P1, T1          // same row slot in the next 8-column panel
    blt ZERO, I, .L_M4_N8
.L_M4_N7:
    andi I, N, 0x04
    beq ZERO, I, .L_M4_N3
.L_M4_N4:
    GLD v, , $vr0, S1, 0x00, $vr1, S2, 0x00, $vr2, S3, 0x00, $vr3, S4, 0x00
    GST v, , $vr0, P2, 0x00, $vr1, P2, 0x10, $vr2, P2, 0x20, $vr3, P2, 0x30
    PTR_ADDI S1, S1, 0x10
    PTR_ADDI S2, S2, 0x10
    PTR_ADDI S3, S3, 0x10
    PTR_ADDI S4, S4, 0x10
    PTR_ADDI P2, P2, 0x40       // 4 rows * 4 elements * 4 bytes
.L_M4_N3:
    andi I, N, 0x02
    beq ZERO, I, .L_M4_N1
.L_M4_N2:
    GLD f, d, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00
    GST f, d, $f0, P3, 0x00, $f1, P3, 0x08, $f2, P3, 0x10, $f3, P3, 0x18
    PTR_ADDI S1, S1, 0x08
    PTR_ADDI S2, S2, 0x08
    PTR_ADDI S3, S3, 0x08
    PTR_ADDI S4, S4, 0x08
    PTR_ADDI P3, P3, 0x20       // 4 rows * 2 elements * 4 bytes
.L_M4_N1:
    andi I, N, 0x01
    beq ZERO, I, .L_M3
    GLD f, s, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00
    GST f, s, $f0, P4, 0x00, $f1, P4, 0x04, $f2, P4, 0x08, $f3, P4, 0x0C
    PTR_ADDI S1, S1, 0x04
    PTR_ADDI S2, S2, 0x04
    PTR_ADDI S3, S3, 0x04
    PTR_ADDI S4, S4, 0x04
    PTR_ADDI P4, P4, 0x10       // 4 rows * 1 element * 4 bytes

/* ---------------- group of 2 rows (bit 1 of M) ---------------- */
.L_M3:
    andi J, M, 0x02
    beq ZERO, J, .L_M1
.L_M2:
    move S1, S0
    PTR_ADD S2, S0, TL
    PTR_ADD S0, S0, T0

    move P1, P0
    PTR_ADDI P0, P0, 0x40       // 2 rows * 8 elements * 4 bytes

    PTR_SRAI I, N, 0x03
    beq ZERO, I, .L_M2_N7
    .align 5
.L_M2_N8:
    xvld U0, S1, 0x00
    xvld U1, S2, 0x00
    GST xv, , U0, P1, 0x00, U1, P1, 0x20
    PTR_ADDI S1, S1, 0x20
    PTR_ADDI S2, S2, 0x20

    PTR_ADDI I, I, -1
    PTR_ADD P1, P1, T1          // same row slot in the next 8-column panel
    blt ZERO, I, .L_M2_N8
.L_M2_N7:
    andi I, N, 0x04
    beq ZERO, I, .L_M2_N3
.L_M2_N4:
    GLD v, , $vr0, S1, 0x00, $vr1, S2, 0x00
    GST v, , $vr0, P2, 0x00, $vr1, P2, 0x10
    PTR_ADDI S1, S1, 0x10
    PTR_ADDI S2, S2, 0x10
    PTR_ADDI P2, P2, 0x20       // 2 rows * 4 elements * 4 bytes
.L_M2_N3:
    andi I, N, 0x02
    beq ZERO, I, .L_M2_N1
.L_M2_N2:
    GLD f, d, $f0, S1, 0x00, $f1, S2, 0x00
    GST f, d, $f0, P3, 0x00, $f1, P3, 0x08
    PTR_ADDI S1, S1, 0x08
    PTR_ADDI S2, S2, 0x08
    PTR_ADDI P3, P3, 0x10       // 2 rows * 2 elements * 4 bytes
.L_M2_N1:
    andi I, N, 0x01
    beq ZERO, I, .L_M1
    GLD f, s, $f0, S1, 0x00, $f1, S2, 0x00
    GST f, s, $f0, P4, 0x00, $f1, P4, 0x04
    PTR_ADDI S1, S1, 0x04
    PTR_ADDI S2, S2, 0x04
    PTR_ADDI P4, P4, 0x08       // 2 rows * 1 element * 4 bytes

/* ---------------- last single row (bit 0 of M) ---------------- */
.L_M1:
    andi J, M, 0x01
    beq ZERO, J, .L_M0

    // Only one row remains, so only S1 is needed.
    move S1, S0

    move P1, P0
    PTR_ADDI P0, P0, 0x20       // 1 row * 8 elements * 4 bytes

    PTR_SRAI I, N, 0x03
    beq ZERO, I, .L_M1_N7
    .align 5
.L_M1_N8:
    xvld U0, S1, 0x00
    GST xv, , U0, P1, 0x00
    PTR_ADDI S1, S1, 0x20

    PTR_ADDI I, I, -1
    PTR_ADD P1, P1, T1          // same row slot in the next 8-column panel
    blt ZERO, I, .L_M1_N8
.L_M1_N7:
    andi I, N, 0x04
    beq ZERO, I, .L_M1_N3
.L_M1_N4:
    GLD v, , $vr0, S1, 0x00
    GST v, , $vr0, P2, 0x00
    PTR_ADDI S1, S1, 0x10
    PTR_ADDI P2, P2, 0x10
.L_M1_N3:
    andi I, N, 0x02
    beq ZERO, I, .L_M1_N1
.L_M1_N2:
    GLD f, d, $f0, S1, 0x00
    GST f, d, $f0, P3, 0x00
    PTR_ADDI S1, S1, 0x08
    PTR_ADDI P3, P3, 0x08
.L_M1_N1:
    andi I, N, 0x01
    beq ZERO, I, .L_M0
    GLD f, s, $f0, S1, 0x00
    GST f, s, $f0, P4, 0x00
    PTR_ADDI S1, S1, 0x04
    PTR_ADDI P4, P4, 0x04
.L_M0:
    pop_if_used 23, 8           // must mirror the push_if_used arguments above
    jirl $r0, $r1, 0x00         // return through $r1, discarding the link value
    EPILOGUE

View File

@ -61,7 +61,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fmov.d s2, s1 fmov.d s2, s1
bge $r0, N, .L999 bge $r0, N, .L999
slli.d INCX, INCX, BASE_SHIFT slli.d INCX, INCX, BASE_SHIFT
bge $r0, INCX, .L999 beq $r0, INCX, .L999
srai.d I, N, 3 srai.d I, N, 3
bne INCX, TEMP, .L20 bne INCX, TEMP, .L20
bge $r0, I, .L15 bge $r0, I, .L15

View File

@ -64,7 +64,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
MTC s1, $r0 MTC s1, $r0
bge $r0, N, .L999 bge $r0, N, .L999
slli.d INCX, INCX, ZBASE_SHIFT slli.d INCX, INCX, ZBASE_SHIFT
bge $r0, INCX, .L999 beq $r0, INCX, .L999
move XX, X move XX, X
MOV s2, s1 MOV s2, s1
srai.d I, N, 2 srai.d I, N, 2

View File

@ -57,7 +57,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
FLOAT absxi = 0.0; FLOAT absxi = 0.0;
if (n <= 0 || inc_x <= 0) return(0.0); if (n <= 0 || inc_x == 0) return(0.0);
if ( n == 1 ) return( ABS(x[0]) ); if ( n == 1 ) return( ABS(x[0]) );
n *= inc_x; n *= inc_x;

View File

@ -48,7 +48,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
BLASLONG inc_x2; BLASLONG inc_x2;
FLOAT temp; FLOAT temp;
if (n <= 0 || inc_x <= 0) return(0.0); if (n <= 0 || inc_x == 0) return(0.0);
inc_x2 = 2 * inc_x; inc_x2 = 2 * inc_x;

View File

@ -77,7 +77,7 @@
blez N, .L999 blez N, .L999
mov.d s2, s1 mov.d s2, s1
blez INCX, .L999 beqz INCX, .L999
dsll INCX, INCX, ZBASE_SHIFT dsll INCX, INCX, ZBASE_SHIFT
dsra I, N, 2 dsra I, N, 2

View File

@ -81,7 +81,7 @@
blez N, .L999 blez N, .L999
MTC $0, s1 MTC $0, s1
blez INCX, .L999 beqz INCX, .L999
dsll INCX, INCX, BASE_SHIFT dsll INCX, INCX, BASE_SHIFT
move XX, X move XX, X

View File

@ -77,7 +77,7 @@
blez N, .L999 blez N, .L999
mov.d s2, s1 mov.d s2, s1
blez INCX, .L999 beqz INCX, .L999
dsll INCX, INCX, BASE_SHIFT dsll INCX, INCX, BASE_SHIFT
bne INCX, TEMP, .L20 bne INCX, TEMP, .L20

View File

@ -80,7 +80,7 @@
blez N, .L999 blez N, .L999
MTC $0, s1 MTC $0, s1
blez INCX, .L999 beqz INCX, .L999
dsll INCX, INCX, ZBASE_SHIFT dsll INCX, INCX, ZBASE_SHIFT
move XX, X move XX, X

View File

@ -99,7 +99,7 @@
cmpwi cr0, N, 0 cmpwi cr0, N, 0
ble- LL(9999) ble- LL(9999)
cmpwi cr0, INCX, 0 cmpwi cr0, INCX, 0
ble- LL(9999) beq- LL(9999)
fmr f0, f1 fmr f0, f1
fmr f2, f1 fmr f2, f1

View File

@ -119,7 +119,7 @@
cmpwi cr0, N, 0 cmpwi cr0, N, 0
ble LL(99) ble LL(99)
cmpwi cr0, INCX, 0 cmpwi cr0, INCX, 0
ble LL(99) beq LL(99)
andi. r0, X, 2 * SIZE - 1 andi. r0, X, 2 * SIZE - 1
bne LL(100) bne LL(100)

View File

@ -104,7 +104,7 @@
cmpwi cr0, N, 0 cmpwi cr0, N, 0
ble- LL(999) ble- LL(999)
cmpwi cr0, INCX, 0 cmpwi cr0, INCX, 0
ble- LL(999) beq- LL(999)
fmr f0, f1 fmr f0, f1
sub X, X, INCX sub X, X, INCX

Some files were not shown because too many files have changed in this diff Show More