Merge branch 'develop' into issue4468
This commit is contained in:
70
.cirrus.yml
70
.cirrus.yml
@@ -1,44 +1,44 @@
|
||||
macos_instance:
|
||||
image: ghcr.io/cirruslabs/macos-monterey-xcode:latest
|
||||
|
||||
task:
|
||||
name: AppleM1/LLVM
|
||||
compile_script:
|
||||
- brew install llvm
|
||||
- export PATH=/opt/homebrew/opt/llvm/bin:$PATH
|
||||
- export LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
|
||||
- export CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
|
||||
- make TARGET=VORTEX USE_OPENMP=1 CC=clang
|
||||
#task:
|
||||
# name: AppleM1/LLVM
|
||||
# compile_script:
|
||||
# - brew install llvm
|
||||
# - export PATH=/opt/homebrew/opt/llvm/bin:$PATH
|
||||
# - export LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
|
||||
# - export CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
|
||||
# - make TARGET=VORTEX USE_OPENMP=1 CC=clang
|
||||
|
||||
task:
|
||||
name: AppleM1/LLVM/ILP64
|
||||
compile_script:
|
||||
- brew install llvm
|
||||
- export PATH=/opt/homebrew/opt/llvm/bin:$PATH
|
||||
- export LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
|
||||
- export CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
|
||||
- make TARGET=VORTEX USE_OPENMP=1 CC=clang INTERFACE64=1
|
||||
#task:
|
||||
# name: AppleM1/LLVM/ILP64
|
||||
# compile_script:
|
||||
# - brew install llvm
|
||||
# - export PATH=/opt/homebrew/opt/llvm/bin:$PATH
|
||||
# - export LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
|
||||
# - export CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
|
||||
# - make TARGET=VORTEX USE_OPENMP=1 CC=clang INTERFACE64=1
|
||||
|
||||
task:
|
||||
name: AppleM1/LLVM/CMAKE
|
||||
compile_script:
|
||||
- brew install llvm
|
||||
- export PATH=/opt/homebrew/opt/llvm/bin:$PATH
|
||||
- export LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
|
||||
- export CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
|
||||
- mkdir build
|
||||
- cd build
|
||||
- cmake -DTARGET=VORTEX -DCMAKE_C_COMPILER=clang -DBUILD_SHARED_LIBS=ON ..
|
||||
- make -j 4
|
||||
#task:
|
||||
# name: AppleM1/LLVM/CMAKE
|
||||
# compile_script:
|
||||
# - brew install llvm
|
||||
# - export PATH=/opt/homebrew/opt/llvm/bin:$PATH
|
||||
# - export LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
|
||||
# - export CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
|
||||
# - mkdir build
|
||||
# - cd build
|
||||
# - cmake -DTARGET=VORTEX -DCMAKE_C_COMPILER=clang -DBUILD_SHARED_LIBS=ON ..
|
||||
# - make -j 4
|
||||
|
||||
task:
|
||||
name: AppleM1/GCC/MAKE/OPENMP
|
||||
compile_script:
|
||||
- brew install gcc@11
|
||||
- export PATH=/opt/homebrew/bin:$PATH
|
||||
- export LDFLAGS="-L/opt/homebrew/lib"
|
||||
- export CPPFLAGS="-I/opt/homebrew/include"
|
||||
- make CC=gcc-11 FC=gfortran-11 USE_OPENMP=1
|
||||
#task:
|
||||
# name: AppleM1/GCC/MAKE/OPENMP
|
||||
# compile_script:
|
||||
# - brew install gcc@11
|
||||
# - export PATH=/opt/homebrew/bin:$PATH
|
||||
# - export LDFLAGS="-L/opt/homebrew/lib"
|
||||
# - export CPPFLAGS="-I/opt/homebrew/include"
|
||||
# - make CC=gcc-11 FC=gfortran-11 USE_OPENMP=1
|
||||
|
||||
macos_instance:
|
||||
image: ghcr.io/cirruslabs/macos-monterey-xcode:latest
|
||||
|
||||
149
.github/workflows/apple_m.yml
vendored
Normal file
149
.github/workflows/apple_m.yml
vendored
Normal file
@@ -0,0 +1,149 @@
|
||||
name: apple m
|
||||
|
||||
on: [push, pull_request]
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
permissions:
|
||||
contents: read # to fetch code (actions/checkout)
|
||||
|
||||
jobs:
|
||||
build:
|
||||
if: "github.repository == 'OpenMathLib/OpenBLAS'"
|
||||
runs-on: macos-14
|
||||
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
build: [cmake, make]
|
||||
fortran: [gfortran]
|
||||
openmp: [0, 1]
|
||||
ilp64: [0, 1]
|
||||
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v3
|
||||
|
||||
- name: Print system information
|
||||
run: |
|
||||
if [ "$RUNNER_OS" == "Linux" ]; then
|
||||
cat /proc/cpuinfo
|
||||
elif [ "$RUNNER_OS" == "macOS" ]; then
|
||||
sysctl -a | grep machdep.cpu
|
||||
else
|
||||
echo "::error::$RUNNER_OS not supported"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
- name: Install Dependencies
|
||||
run: |
|
||||
if [ "$RUNNER_OS" == "Linux" ]; then
|
||||
sudo apt-get install -y gfortran cmake ccache libtinfo5
|
||||
elif [ "$RUNNER_OS" == "macOS" ]; then
|
||||
# It looks like "gfortran" isn't working correctly unless "gcc" is re-installed.
|
||||
brew reinstall gcc
|
||||
brew install coreutils cmake ccache
|
||||
brew install llvm
|
||||
else
|
||||
echo "::error::$RUNNER_OS not supported"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
- name: Compilation cache
|
||||
uses: actions/cache@v3
|
||||
with:
|
||||
path: ~/.ccache
|
||||
# We include the commit sha in the cache key, as new cache entries are
|
||||
# only created if there is no existing entry for the key yet.
|
||||
# GNU make and cmake call the compilers differently. It looks like
|
||||
# that causes the cache to mismatch. Keep the ccache for both build
|
||||
# tools separate to avoid polluting each other.
|
||||
key: ccache-${{ runner.os }}-${{ matrix.build }}-${{ matrix.fortran }}-${{ github.ref }}-${{ github.sha }}
|
||||
# Restore a matching ccache cache entry. Prefer same branch and same Fortran compiler.
|
||||
restore-keys: |
|
||||
ccache-${{ runner.os }}-${{ matrix.build }}-${{matrix.fortran }}-${{ github.ref }}
|
||||
ccache-${{ runner.os }}-${{ matrix.build }}-${{matrix.fortran }}
|
||||
ccache-${{ runner.os }}-${{ matrix.build }}
|
||||
|
||||
- name: Configure ccache
|
||||
run: |
|
||||
if [ "${{ matrix.build }}" = "make" ]; then
|
||||
# Add ccache to path
|
||||
if [ "$RUNNER_OS" = "Linux" ]; then
|
||||
echo "/usr/lib/ccache" >> $GITHUB_PATH
|
||||
elif [ "$RUNNER_OS" = "macOS" ]; then
|
||||
echo "$(brew --prefix)/opt/ccache/libexec" >> $GITHUB_PATH
|
||||
echo "/opt/homebrew/opt/llvm/bin" >>$GITHUB_PATH
|
||||
echo "" >>$GITHUB_PATH
|
||||
else
|
||||
echo "::error::$RUNNER_OS not supported"
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
# Limit the maximum size and switch on compression to avoid exceeding the total disk or cache quota (5 GB).
|
||||
test -d ~/.ccache || mkdir -p ~/.ccache
|
||||
echo "max_size = 300M" > ~/.ccache/ccache.conf
|
||||
echo "compression = true" >> ~/.ccache/ccache.conf
|
||||
ccache -s
|
||||
|
||||
- name: Build OpenBLAS
|
||||
run: |
|
||||
export LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
|
||||
export CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
|
||||
export CC="/opt/homebrew/opt/llvm/bin/clang"
|
||||
case "${{ matrix.build }}" in
|
||||
"make")
|
||||
make -j$(nproc) DYNAMIC_ARCH=1 USE_OPENMP=${{matrix.openmp}} INTERFACE64=${{matrix.ilp64}} FC="ccache ${{ matrix.fortran }}"
|
||||
;;
|
||||
"cmake")
|
||||
export LDFLAGS="$LDFLAGS -Wl,-ld_classic"
|
||||
mkdir build && cd build
|
||||
cmake -DDYNAMIC_ARCH=1 \
|
||||
-DUSE_OPENMP=${{matrix.openmp}} \
|
||||
-DINTERFACE64=${{matrix.ilp64}} \
|
||||
-DNOFORTRAN=0 \
|
||||
-DBUILD_WITHOUT_LAPACK=0 \
|
||||
-DCMAKE_VERBOSE_MAKEFILE=ON \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DCMAKE_Fortran_COMPILER=${{ matrix.fortran }} \
|
||||
-DCMAKE_C_COMPILER_LAUNCHER=ccache \
|
||||
-DCMAKE_Fortran_COMPILER_LAUNCHER=ccache \
|
||||
..
|
||||
cmake --build .
|
||||
;;
|
||||
*)
|
||||
echo "::error::Configuration not supported"
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
|
||||
- name: Show ccache status
|
||||
continue-on-error: true
|
||||
run: ccache -s
|
||||
|
||||
- name: Run tests
|
||||
timeout-minutes: 60
|
||||
run: |
|
||||
case "${{ matrix.build }}" in
|
||||
"make")
|
||||
MAKE_FLAGS='DYNAMIC_ARCH=1 USE_OPENMP=0'
|
||||
echo "::group::Tests in 'test' directory"
|
||||
make -C test $MAKE_FLAGS FC="ccache ${{ matrix.fortran }}"
|
||||
echo "::endgroup::"
|
||||
echo "::group::Tests in 'ctest' directory"
|
||||
make -C ctest $MAKE_FLAGS FC="ccache ${{ matrix.fortran }}"
|
||||
echo "::endgroup::"
|
||||
echo "::group::Tests in 'utest' directory"
|
||||
make -C utest $MAKE_FLAGS FC="ccache ${{ matrix.fortran }}"
|
||||
echo "::endgroup::"
|
||||
;;
|
||||
"cmake")
|
||||
cd build && ctest
|
||||
;;
|
||||
*)
|
||||
echo "::error::Configuration not supported"
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
253
.github/workflows/riscv64_vector.yml
vendored
Normal file
253
.github/workflows/riscv64_vector.yml
vendored
Normal file
@@ -0,0 +1,253 @@
|
||||
name: riscv64 zvl256b qemu test
|
||||
|
||||
on: [push, pull_request]
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
permissions:
|
||||
contents: read # to fetch code (actions/checkout)
|
||||
|
||||
jobs:
|
||||
TEST:
|
||||
if: "github.repository == 'OpenMathLib/OpenBLAS'"
|
||||
runs-on: ubuntu-latest
|
||||
env:
|
||||
triple: riscv64-unknown-linux-gnu
|
||||
riscv_gnu_toolchain: https://github.com/riscv-collab/riscv-gnu-toolchain
|
||||
riscv_gnu_toolchain_version: 13.2.0
|
||||
riscv_gnu_toolchain_nightly_download_path: /releases/download/2024.02.02/riscv64-glibc-ubuntu-22.04-llvm-nightly-2024.02.02-nightly.tar.gz
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
include:
|
||||
- target: RISCV64_ZVL128B
|
||||
opts: TARGET=RISCV64_ZVL128B BINARY=64 ARCH=riscv64
|
||||
qemu_cpu: rv64,g=true,c=true,v=true,vext_spec=v1.0,vlen=128,elen=64
|
||||
- target: RISCV64_ZVL256B
|
||||
opts: TARGET=RISCV64_ZVL256B BINARY=64 ARCH=riscv64
|
||||
qemu_cpu: rv64,g=true,c=true,v=true,vext_spec=v1.0,vlen=256,elen=64
|
||||
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v3
|
||||
|
||||
- name: install build deps
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install autoconf automake autotools-dev ninja-build make \
|
||||
libgomp1-riscv64-cross ccache
|
||||
wget ${riscv_gnu_toolchain}/${riscv_gnu_toolchain_nightly_download_path}
|
||||
tar -xvf $(basename ${riscv_gnu_toolchain_nightly_download_path}) -C /opt
|
||||
|
||||
- name: Compilation cache
|
||||
uses: actions/cache@v3
|
||||
with:
|
||||
path: ~/.ccache
|
||||
key: ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }}-${{ github.sha }}
|
||||
restore-keys: |
|
||||
ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }}
|
||||
ccache-${{ runner.os }}-${{ matrix.target }}
|
||||
|
||||
- name: Configure ccache
|
||||
run: |
|
||||
test -d ~/.ccache || mkdir -p ~/.ccache
|
||||
echo "max_size = 300M" > ~/.ccache/ccache.conf
|
||||
echo "compression = true" >> ~/.ccache/ccache.conf
|
||||
ccache -s
|
||||
|
||||
- name: build OpenBLAS libs
|
||||
run: |
|
||||
export PATH="/opt/riscv/bin:$PATH"
|
||||
make TARGET=${{ matrix.target }} CFLAGS="-DTARGET=${{ matrix.target }}" \
|
||||
CC='ccache clang --rtlib=compiler-rt -target ${triple} --sysroot /opt/riscv/sysroot --gcc-toolchain=/opt/riscv/lib/gcc/riscv64-unknown-linux-gnu/${riscv_gnu_toolchain_version}/' \
|
||||
AR='ccache ${triple}-ar' AS='ccache ${triple}-gcc' LD='ccache ${triple}-gcc' \
|
||||
RANLIB='ccache ${triple}-ranlib' \
|
||||
FC='ccache ${triple}-gfortran' ${{ matrix.opts }} \
|
||||
HOSTCC=gcc HOSTFC=gfortran -j$(nproc)
|
||||
|
||||
- name: build OpenBLAS tests
|
||||
run: |
|
||||
export PATH="/opt/riscv/bin:$PATH"
|
||||
make TARGET=${{ matrix.target }} CFLAGS="-DTARGET=${{ matrix.target }}" \
|
||||
CC='${triple}-gcc' \
|
||||
AR='ccache ${triple}-ar' AS='ccache ${triple}-gcc' LD='ccache ${triple}-gcc' \
|
||||
RANLIB='ccache ${triple}-ranlib' \
|
||||
FC='ccache ${triple}-gfortran' ${{ matrix.opts }} \
|
||||
HOSTCC=gcc HOSTFC=gfortran -j$(nproc) tests
|
||||
|
||||
- name: build lapack-netlib tests
|
||||
working-directory: ./lapack-netlib/TESTING
|
||||
run: |
|
||||
export PATH="/opt/riscv/bin:$PATH"
|
||||
make TARGET=${{ matrix.target }} CFLAGS="-DTARGET=${{ matrix.target }}" \
|
||||
CC='${triple}-gcc' \
|
||||
AR='ccache ${triple}-ar' AS='ccache ${triple}-gcc' LD='ccache ${triple}-gcc' \
|
||||
RANLIB='ccache ${triple}-ranlib' \
|
||||
FC='ccache ${triple}-gfortran' ${{ matrix.opts }} \
|
||||
HOSTCC=gcc HOSTFC=gfortran -j$(nproc) \
|
||||
LIN/xlintsts LIN/xlintstc LIN/xlintstd LIN/xlintstz LIN/xlintstrfs \
|
||||
LIN/xlintstrfc LIN/xlintstrfd LIN/xlintstrfz LIN/xlintstds \
|
||||
LIN/xlintstzc EIG/xeigtsts EIG/xeigtstc EIG/xeigtstd EIG/xeigtstz \
|
||||
|
||||
- name: OpenBLAS tests
|
||||
shell: bash
|
||||
run: |
|
||||
export PATH="/opt/riscv/bin:$PATH"
|
||||
export QEMU_CPU=${{ matrix.qemu_cpu }}
|
||||
rm -rf ./test_out
|
||||
mkdir -p ./test_out
|
||||
run_test() { local DIR=$1; local CMD=$2; local DATA=$3; local OUTPUT="./test_out/$DIR.$CMD"; \
|
||||
echo "`pwd`/$DIR/$CMD $DIR/$DATA" >> $OUTPUT; \
|
||||
if [[ -z $DATA ]]; then qemu-riscv64 ./$DIR/$CMD |& tee $OUTPUT ; \
|
||||
else qemu-riscv64 ./$DIR/$CMD < ./$DIR/$DATA |& tee $OUTPUT ; fi ; \
|
||||
RV=$? ; if [[ $RV != 0 ]]; then echo "*** FAIL: nonzero exit code $RV" >> $OUTPUT ; fi \
|
||||
}
|
||||
run_test test cblat1 &
|
||||
run_test test cblat2 cblat2.dat &
|
||||
run_test test cblat3 cblat3.dat &
|
||||
run_test test dblat1 &
|
||||
run_test test dblat2 dblat2.dat &
|
||||
run_test test dblat3 dblat3.dat &
|
||||
run_test test sblat1 &
|
||||
run_test test sblat2 sblat2.dat &
|
||||
run_test test sblat3 sblat3.dat &
|
||||
run_test test zblat1 &
|
||||
run_test test zblat2 zblat2.dat &
|
||||
run_test test zblat3 zblat3.dat &
|
||||
run_test ctest xccblat1 &
|
||||
run_test ctest xccblat2 cin2 &
|
||||
run_test ctest xccblat3 cin3 &
|
||||
run_test ctest xdcblat1 &
|
||||
run_test ctest xdcblat2 din2 &
|
||||
run_test ctest xdcblat3 din3 &
|
||||
run_test ctest xscblat1 &
|
||||
run_test ctest xscblat2 sin2 &
|
||||
run_test ctest xscblat3 sin3 &
|
||||
run_test ctest xzcblat1 &
|
||||
run_test ctest xzcblat2 zin2 &
|
||||
run_test ctest xzcblat3 zin3 &
|
||||
wait
|
||||
while IFS= read -r -d $'\0' LOG; do cat $LOG ; FAILURES=1 ; done < <(grep -lZ FAIL ./test_out/*)
|
||||
if [[ ! -z $FAILURES ]]; then echo "==========" ; echo "== FAIL ==" ; echo "==========" ; echo ; exit 1 ; fi
|
||||
|
||||
- name: netlib tests
|
||||
shell: bash
|
||||
run: |
|
||||
: # these take a very long time
|
||||
echo "Skipping netlib tests in CI"
|
||||
exit 0
|
||||
: # comment out exit above to enable the tests
|
||||
: # probably we want to identify a subset to run in CI
|
||||
export PATH="/opt/riscv/bin:$PATH"
|
||||
export QEMU_CPU=${{ matrix.qemu_cpu }}
|
||||
rm -rf ./test_out
|
||||
mkdir -p ./test_out
|
||||
run_test() { local OUTPUT="./test_out/$1"; local DATA="./lapack-netlib/TESTING/$2"; local CMD="./lapack-netlib/TESTING/$3"; \
|
||||
echo "$4" >> $OUTPUT; \
|
||||
echo "$CMD" >> $OUTPUT; \
|
||||
qemu-riscv64 $CMD < $DATA |& tee $OUTPUT; \
|
||||
RV=$? ; if [[ $RV != 0 ]]; then echo "*** FAIL: nonzero exit code $RV" >> $OUTPUT ; fi; \
|
||||
if grep -q fail $OUTPUT ; then echo "*** FAIL: log contains 'fail'" >> $OUTPUT ; fi ; \
|
||||
if grep rror $OUTPUT | grep -v "passed" | grep -v "largest error" > /dev/null ; then echo "*** FAIL: log contains 'error'" >> $OUTPUT ; fi \
|
||||
}
|
||||
run_test stest.out stest.in LIN/xlintsts "Testing REAL LAPACK linear equation routines" &
|
||||
run_test ctest.out ctest.in LIN/xlintstc "Testing COMPLEX LAPACK linear equation routines" &
|
||||
run_test dtest.out dtest.in LIN/xlintstd "Testing DOUBLE PRECISION LAPACK linear equation routines" &
|
||||
run_test ztest.out ztest.in LIN/xlintstz "Testing COMPLEX16 LAPACK linear equation routines" &
|
||||
run_test dstest.out dstest.in LIN/xlintstds "Testing SINGLE-DOUBLE PRECISION LAPACK prototype linear equation routines" &
|
||||
run_test zctest.out zctest.in LIN/xlintstzc "Testing COMPLEX-COMPLEX16 LAPACK prototype linear equation routines" &
|
||||
run_test stest_rfp.out stest_rfp.in LIN/xlintstrfs "Testing REAL LAPACK RFP prototype linear equation routines" &
|
||||
run_test dtest_rfp.out dtest_rfp.in LIN/xlintstrfd "Testing DOUBLE PRECISION LAPACK RFP prototype linear equation routines" &
|
||||
run_test ctest_rfp.out ctest_rfp.in LIN/xlintstrfc "Testing COMPLEX LAPACK RFP prototype linear equation routines" &
|
||||
run_test ztest_rfp.out ztest_rfp.in LIN/xlintstrfz "Testing COMPLEX16 LAPACK RFP prototype linear equation routines" &
|
||||
run_test snep.out nep.in EIG/xeigtsts "NEP - Testing Nonsymmetric Eigenvalue Problem routines" &
|
||||
run_test ssep.out sep.in EIG/xeigtsts "SEP - Testing Symmetric Eigenvalue Problem routines" &
|
||||
run_test sse2.out se2.in EIG/xeigtsts "SEP - Testing Symmetric Eigenvalue Problem routines" &
|
||||
run_test ssvd.out svd.in EIG/xeigtsts "SVD - Testing Singular Value Decomposition routines" &
|
||||
run_test sec.out sec.in EIG/xeigtsts "SEC - Testing REAL Eigen Condition Routines" &
|
||||
run_test sed.out sed.in EIG/xeigtsts "SEV - Testing REAL Nonsymmetric Eigenvalue Driver" &
|
||||
run_test sgg.out sgg.in EIG/xeigtsts "SGG - Testing REAL Nonsymmetric Generalized Eigenvalue Problem routines" &
|
||||
run_test sgd.out sgd.in EIG/xeigtsts "SGD - Testing REAL Nonsymmetric Generalized Eigenvalue Problem driver routines" &
|
||||
run_test ssb.out ssb.in EIG/xeigtsts "SSB - Testing REAL Symmetric Eigenvalue Problem routines" &
|
||||
run_test ssg.out ssg.in EIG/xeigtsts "SSG - Testing REAL Symmetric Generalized Eigenvalue Problem routines" &
|
||||
run_test sbal.out sbal.in EIG/xeigtsts "SGEBAL - Testing the balancing of a REAL general matrix" &
|
||||
run_test sbak.out sbak.in EIG/xeigtsts "SGEBAK - Testing the back transformation of a REAL balanced matrix" &
|
||||
run_test sgbal.out sgbal.in EIG/xeigtsts "SGGBAL - Testing the balancing of a pair of REAL general matrices" &
|
||||
run_test sgbak.out sgbak.in EIG/xeigtsts "SGGBAK - Testing the back transformation of a pair of REAL balanced matrices" &
|
||||
run_test sbb.out sbb.in EIG/xeigtsts "SBB - Testing banded Singular Value Decomposition routines" &
|
||||
run_test sglm.out glm.in EIG/xeigtsts "GLM - Testing Generalized Linear Regression Model routines" &
|
||||
run_test sgqr.out gqr.in EIG/xeigtsts "GQR - Testing Generalized QR and RQ factorization routines" &
|
||||
run_test sgsv.out gsv.in EIG/xeigtsts "GSV - Testing Generalized Singular Value Decomposition routines" &
|
||||
run_test scsd.out csd.in EIG/xeigtsts "CSD - Testing CS Decomposition routines" &
|
||||
run_test slse.out lse.in EIG/xeigtsts "LSE - Testing Constrained Linear Least Squares routines" &
|
||||
run_test cnep.out nep.in EIG/xeigtstc "NEP - Testing Nonsymmetric Eigenvalue Problem routines" &
|
||||
run_test csep.out sep.in EIG/xeigtstc "SEP - Testing Symmetric Eigenvalue Problem routines" &
|
||||
run_test cse2.out se2.in EIG/xeigtstc "SEP - Testing Symmetric Eigenvalue Problem routines" &
|
||||
run_test csvd.out svd.in EIG/xeigtstc "SVD - Testing Singular Value Decomposition routines" &
|
||||
run_test cec.out cec.in EIG/xeigtstc "CEC - Testing COMPLEX Eigen Condition Routines" &
|
||||
run_test ced.out ced.in EIG/xeigtstc "CES - Testing COMPLEX Nonsymmetric Schur Form Driver" &
|
||||
run_test cgg.out cgg.in EIG/xeigtstc "CGG - Testing COMPLEX Nonsymmetric Generalized Eigenvalue Problem routines" &
|
||||
run_test cgd.out cgd.in EIG/xeigtstc "CGD - Testing COMPLEX Nonsymmetric Generalized Eigenvalue Problem driver routines" &
|
||||
run_test csb.out csb.in EIG/xeigtstc "CHB - Testing Hermitian Eigenvalue Problem routines" &
|
||||
run_test csg.out csg.in EIG/xeigtstc "CSG - Testing Symmetric Generalized Eigenvalue Problem routines" &
|
||||
run_test cbal.out cbal.in EIG/xeigtstc "CGEBAL - Testing the balancing of a COMPLEX general matrix" &
|
||||
run_test cbak.out cbak.in EIG/xeigtstc "CGEBAK - Testing the back transformation of a COMPLEX balanced matrix" &
|
||||
run_test cgbal.out cgbal.in EIG/xeigtstc "CGGBAL - Testing the balancing of a pair of COMPLEX general matrices" &
|
||||
run_test cgbak.out cgbak.in EIG/xeigtstc "CGGBAK - Testing the back transformation of a pair of COMPLEX balanced matrices" &
|
||||
run_test cbb.out cbb.in EIG/xeigtstc "CBB - Testing banded Singular Value Decomposition routines" &
|
||||
run_test cglm.out glm.in EIG/xeigtstc "GLM - Testing Generalized Linear Regression Model routines" &
|
||||
run_test cgqr.out gqr.in EIG/xeigtstc "GQR - Testing Generalized QR and RQ factorization routines" &
|
||||
run_test cgsv.out gsv.in EIG/xeigtstc "GSV - Testing Generalized Singular Value Decomposition routines" &
|
||||
run_test ccsd.out csd.in EIG/xeigtstc "CSD - Testing CS Decomposition routines" &
|
||||
run_test clse.out lse.in EIG/xeigtstc "LSE - Testing Constrained Linear Least Squares routines" &
|
||||
run_test dnep.out nep.in EIG/xeigtstd "NEP - Testing Nonsymmetric Eigenvalue Problem routines" &
|
||||
run_test dsep.out sep.in EIG/xeigtstd "SEP - Testing Symmetric Eigenvalue Problem routines" &
|
||||
run_test dse2.out se2.in EIG/xeigtstd "SEP - Testing Symmetric Eigenvalue Problem routines" &
|
||||
run_test dsvd.out svd.in EIG/xeigtstd "SVD - Testing Singular Value Decomposition routines" &
|
||||
run_test dec.out dec.in EIG/xeigtstd "DEC - Testing DOUBLE PRECISION Eigen Condition Routines" &
|
||||
run_test ded.out ded.in EIG/xeigtstd "DEV - Testing DOUBLE PRECISION Nonsymmetric Eigenvalue Driver" &
|
||||
run_test dgg.out dgg.in EIG/xeigtstd "DGG - Testing DOUBLE PRECISION Nonsymmetric Generalized Eigenvalue Problem routines" &
|
||||
run_test dgd.out dgd.in EIG/xeigtstd "DGD - Testing DOUBLE PRECISION Nonsymmetric Generalized Eigenvalue Problem driver routines" &
|
||||
run_test dsb.out dsb.in EIG/xeigtstd "DSB - Testing DOUBLE PRECISION Symmetric Eigenvalue Problem routines" &
|
||||
run_test dsg.out dsg.in EIG/xeigtstd "DSG - Testing DOUBLE PRECISION Symmetric Generalized Eigenvalue Problem routines" &
|
||||
run_test dbal.out dbal.in EIG/xeigtstd "DGEBAL - Testing the balancing of a DOUBLE PRECISION general matrix" &
|
||||
run_test dbak.out dbak.in EIG/xeigtstd "DGEBAK - Testing the back transformation of a DOUBLE PRECISION balanced matrix" &
|
||||
run_test dgbal.out dgbal.in EIG/xeigtstd "DGGBAL - Testing the balancing of a pair of DOUBLE PRECISION general matrices" &
|
||||
run_test dgbak.out dgbak.in EIG/xeigtstd "DGGBAK - Testing the back transformation of a pair of DOUBLE PRECISION balanced matrices" &
|
||||
run_test dbb.out dbb.in EIG/xeigtstd "DBB - Testing banded Singular Value Decomposition routines" &
|
||||
run_test dglm.out glm.in EIG/xeigtstd "GLM - Testing Generalized Linear Regression Model routines" &
|
||||
run_test dgqr.out gqr.in EIG/xeigtstd "GQR - Testing Generalized QR and RQ factorization routines" &
|
||||
run_test dgsv.out gsv.in EIG/xeigtstd "GSV - Testing Generalized Singular Value Decomposition routines" &
|
||||
run_test dcsd.out csd.in EIG/xeigtstd "CSD - Testing CS Decomposition routines" &
|
||||
run_test dlse.out lse.in EIG/xeigtstd "LSE - Testing Constrained Linear Least Squares routines" &
|
||||
run_test znep.out nep.in EIG/xeigtstz "NEP - Testing Nonsymmetric Eigenvalue Problem routines" &
|
||||
run_test zsep.out sep.in EIG/xeigtstz "SEP - Testing Symmetric Eigenvalue Problem routines" &
|
||||
run_test zse2.out se2.in EIG/xeigtstz "SEP - Testing Symmetric Eigenvalue Problem routines" &
|
||||
run_test zsvd.out svd.in EIG/xeigtstz "SVD - Testing Singular Value Decomposition routines" &
|
||||
run_test zec.out zec.in EIG/xeigtstz "ZEC - Testing COMPLEX16 Eigen Condition Routines" &
|
||||
run_test zed.out zed.in EIG/xeigtstz "ZES - Testing COMPLEX16 Nonsymmetric Schur Form Driver" &
|
||||
run_test zgg.out zgg.in EIG/xeigtstz "ZGG - Testing COMPLEX16 Nonsymmetric Generalized Eigenvalue Problem routines" &
|
||||
run_test zgd.out zgd.in EIG/xeigtstz "ZGD - Testing COMPLEX16 Nonsymmetric Generalized Eigenvalue Problem driver routines" &
|
||||
run_test zsb.out zsb.in EIG/xeigtstz "ZHB - Testing Hermitian Eigenvalue Problem routines" &
|
||||
run_test zsg.out zsg.in EIG/xeigtstz "ZSG - Testing Symmetric Generalized Eigenvalue Problem routines" &
|
||||
run_test zbal.out zbal.in EIG/xeigtstz "ZGEBAL - Testing the balancing of a COMPLEX16 general matrix" &
|
||||
run_test zbak.out zbak.in EIG/xeigtstz "ZGEBAK - Testing the back transformation of a COMPLEX16 balanced matrix" &
|
||||
run_test zgbal.out zgbal.in EIG/xeigtstz "ZGGBAL - Testing the balancing of a pair of COMPLEX general matrices" &
|
||||
run_test zgbak.out zgbak.in EIG/xeigtstz "ZGGBAK - Testing the back transformation of a pair of COMPLEX16 balanced matrices" &
|
||||
run_test zbb.out zbb.in EIG/xeigtstz "ZBB - Testing banded Singular Value Decomposition routines" &
|
||||
run_test zglm.out glm.in EIG/xeigtstz "GLM - Testing Generalized Linear Regression Model routines" &
|
||||
run_test zgqr.out gqr.in EIG/xeigtstz "GQR - Testing Generalized QR and RQ factorization routines" &
|
||||
run_test zgsv.out gsv.in EIG/xeigtstz "GSV - Testing Generalized Singular Value Decomposition routines" &
|
||||
run_test zcsd.out csd.in EIG/xeigtstz "CSD - Testing CS Decomposition routines" &
|
||||
run_test zlse.out lse.in EIG/xeigtstz "LSE - Testing Constrained Linear Least Squares routines" &
|
||||
wait
|
||||
while IFS= read -r -d $'\0' LOG; do cat $LOG ; FAILURES=1 ; done < <(grep -lZ FAIL ./test_out/*)
|
||||
python ./lapack-netlib/lapack_testing.py -d ./test_out -e > netlib_summary
|
||||
TOTALS="$(grep 'ALL PRECISIONS' netlib_summary)"
|
||||
NUMERICAL_ERRORS=-1
|
||||
OTHER_ERRORS=-1
|
||||
: # source NUMERICAL_ERRORS / OTHER_ERRORS from fields 5 and 7 of the 'ALL PRECISIONS' summary row
. <(awk '/ALL PRECISIONS/{printf "NUMERICAL_ERRORS=%s\nOTHER_ERRORS=%s\n", $5, $7}' netlib_summary)
|
||||
if (( NUMERICAL_ERRORS != 0 )) || (( OTHER_ERRORS != 0 )) ; then cat netlib_summary ; FAILURES=1 ; fi
|
||||
if [[ ! -z $FAILURES ]]; then echo "==========" ; echo "== FAIL ==" ; echo "==========" ; echo ; exit 1 ; fi
|
||||
@@ -219,6 +219,7 @@ In chronological order:
|
||||
|
||||
* Mark Seminatore <https://github.com/mseminatore>
|
||||
* [2023-11-09] Improve Windows threading performance scaling
|
||||
* [2024-02-09] Introduce MT_TRACE facility and improve code consistency
|
||||
|
||||
* Dirreke <https://github.com/dirreke>
|
||||
* [2024-01-16] Add basic support for the CSKY architecture
|
||||
|
||||
3
Makefile
3
Makefile
@@ -156,6 +156,9 @@ endif
|
||||
ifeq ($(OSNAME), CYGWIN_NT)
|
||||
@$(MAKE) -C exports dll
|
||||
endif
|
||||
ifeq ($(OSNAME), AIX)
|
||||
@$(MAKE) -C exports so
|
||||
endif
|
||||
endif
|
||||
|
||||
tests : shared
|
||||
|
||||
@@ -1715,11 +1715,7 @@ endif
|
||||
|
||||
LIBDLLNAME = $(LIBPREFIX).dll
|
||||
IMPLIBNAME = lib$(LIBNAMEBASE).dll.a
|
||||
ifneq ($(OSNAME), AIX)
|
||||
LIBSONAME = $(LIBNAME:.$(LIBSUFFIX)=.so)
|
||||
else
|
||||
LIBSONAME = $(LIBNAME:.$(LIBSUFFIX)=.a)
|
||||
endif
|
||||
LIBDYNNAME = $(LIBNAME:.$(LIBSUFFIX)=.dylib)
|
||||
LIBDEFNAME = $(LIBNAME:.$(LIBSUFFIX)=.def)
|
||||
LIBEXPNAME = $(LIBNAME:.$(LIBSUFFIX)=.exp)
|
||||
|
||||
10
README.md
10
README.md
@@ -203,6 +203,16 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th
|
||||
make HOSTCC=gcc TARGET=x280 NUM_THREADS=8 CC=riscv64-unknown-linux-gnu-clang FC=riscv64-unknown-linux-gnu-gfortran
|
||||
```
|
||||
|
||||
- **ZVL???B**: Level-3 BLAS and Level-1,2 including vectorised kernels targeting generic RISCV cores with vector support with registers of at least the corresponding width; ZVL128B and ZVL256B are available.
|
||||
e.g.:
|
||||
```sh
|
||||
make TARGET=RISCV64_ZVL256B CFLAGS="-DTARGET=RISCV64_ZVL256B" \
|
||||
BINARY=64 ARCH=riscv64 CC='clang -target riscv64-unknown-linux-gnu' \
|
||||
AR=riscv64-unknown-linux-gnu-ar AS=riscv64-unknown-linux-gnu-gcc \
|
||||
LD=riscv64-unknown-linux-gnu-gcc FC=riscv64-unknown-linux-gnu-gfortran \
|
||||
HOSTCC=gcc HOSTFC=gfortran -j
|
||||
```
|
||||
|
||||
### Support for multiple targets in a single library
|
||||
|
||||
OpenBLAS can be built for multiple targets with runtime detection of the target cpu by specifying `DYNAMIC_ARCH=1` in Makefile.rule, on the gmake command line or as `-DDYNAMIC_ARCH=TRUE` in cmake.
|
||||
|
||||
@@ -64,6 +64,7 @@ else ()
|
||||
"#define NEEDBUNDERSCORE 1\n")
|
||||
endif()
|
||||
|
||||
if (CMAKE_Fortran_COMPILER)
|
||||
get_filename_component(F_COMPILER ${CMAKE_Fortran_COMPILER} NAME_WE)
|
||||
string(TOUPPER ${F_COMPILER} F_COMPILER)
|
||||
|
||||
endif()
|
||||
|
||||
@@ -6,9 +6,6 @@
|
||||
if (${F_COMPILER} STREQUAL "FLANG" AND NOT CMAKE_Fortran_COMPILER_ID STREQUAL "LLVMFlang")
|
||||
# This is for classic Flang. LLVM Flang is handled with gfortran below.
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_FLANG")
|
||||
if (BINARY64 AND INTERFACE64)
|
||||
set(FCOMMON_OPT "${FCOMMON_OPT} -i8")
|
||||
endif ()
|
||||
if (USE_OPENMP)
|
||||
set(FCOMMON_OPT "${FCOMMON_OPT} -fopenmp")
|
||||
endif ()
|
||||
@@ -55,6 +52,9 @@ if (${F_COMPILER} STREQUAL "GFORTRAN" OR ${F_COMPILER} STREQUAL "F95" OR CMAKE_F
|
||||
if (MIPS64)
|
||||
if (BINARY64)
|
||||
set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=64")
|
||||
if (INTERFACE64)
|
||||
set(FCOMMON_OPT "${FCOMMON_OPT} -fdefault-integer-8")
|
||||
endif ()
|
||||
else ()
|
||||
set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=n32")
|
||||
endif ()
|
||||
@@ -83,6 +83,9 @@ if (${F_COMPILER} STREQUAL "GFORTRAN" OR ${F_COMPILER} STREQUAL "F95" OR CMAKE_F
|
||||
endif ()
|
||||
endif ()
|
||||
endif ()
|
||||
if (ARM64 AND INTERFACE64)
|
||||
set(FCOMMON_OPT "${FCOMMON_OPT} -fdefault-integer-8")
|
||||
endif ()
|
||||
else ()
|
||||
if (BINARY64)
|
||||
set(FCOMMON_OPT "${FCOMMON_OPT} -m64")
|
||||
|
||||
@@ -91,7 +91,7 @@ static inline int blas_quickdivide(blasint x, blasint y){
|
||||
#define BUFFER_SIZE ( 32 << 20)
|
||||
#define SEEK_ADDRESS
|
||||
|
||||
#if defined(C910V) || (defined(RISCV64_ZVL256B) && (defined(__clang__) || defined(RVV_COMPATIBLE_GCC))) || defined(RISCV64_ZVL128B) || defined(x280)
|
||||
#if defined(C910V) || defined(RISCV64_ZVL256B) || defined(RISCV64_ZVL128B) || defined(x280)
|
||||
# include <riscv_vector.h>
|
||||
#endif
|
||||
|
||||
|
||||
@@ -40,6 +40,10 @@ else()
|
||||
c_${float_char}blas1.c)
|
||||
endif()
|
||||
target_link_libraries(x${float_char}cblat1 ${OpenBLAS_LIBNAME})
|
||||
# gfortran + clang with OpenMP: link the LLVM OpenMP runtime explicitly.
# The compiler IDs are quoted so the condition still parses when an ID is empty (e.g. no Fortran compiler).
if (USE_OPENMP AND ("${CMAKE_Fortran_COMPILER_ID}" STREQUAL "GNU") AND ("${CMAKE_C_COMPILER_ID}" STREQUAL "Clang"))
|
||||
string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}")
|
||||
target_link_libraries(x${float_char}cblat1 omp pthread)
|
||||
endif()
|
||||
if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX")
|
||||
target_link_libraries(x${float_char}cblat1 m)
|
||||
endif()
|
||||
@@ -65,6 +69,10 @@ else()
|
||||
constant.c)
|
||||
endif()
|
||||
target_link_libraries(x${float_char}cblat2 ${OpenBLAS_LIBNAME})
|
||||
# gfortran + clang with OpenMP: link the LLVM OpenMP runtime explicitly.
# The compiler IDs are quoted so the condition still parses when an ID is empty (e.g. no Fortran compiler).
if (USE_OPENMP AND ("${CMAKE_Fortran_COMPILER_ID}" STREQUAL "GNU") AND ("${CMAKE_C_COMPILER_ID}" STREQUAL "Clang"))
|
||||
string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}")
|
||||
target_link_libraries(x${float_char}cblat2 omp pthread)
|
||||
endif()
|
||||
if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX")
|
||||
target_link_libraries(x${float_char}cblat2 m)
|
||||
endif()
|
||||
@@ -90,6 +98,10 @@ else()
|
||||
constant.c)
|
||||
endif()
|
||||
target_link_libraries(x${float_char}cblat3 ${OpenBLAS_LIBNAME})
|
||||
# gfortran + clang with OpenMP: link the LLVM OpenMP runtime explicitly.
# The compiler IDs are quoted so the condition still parses when an ID is empty (e.g. no Fortran compiler).
if (USE_OPENMP AND ("${CMAKE_Fortran_COMPILER_ID}" STREQUAL "GNU") AND ("${CMAKE_C_COMPILER_ID}" STREQUAL "Clang"))
|
||||
string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}")
|
||||
target_link_libraries(x${float_char}cblat3 omp pthread)
|
||||
endif()
|
||||
if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX")
|
||||
target_link_libraries(x${float_char}cblat3 m)
|
||||
endif()
|
||||
|
||||
@@ -48,6 +48,12 @@
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef SMP_DEBUG
|
||||
# define MT_TRACE(...) fprintf(stderr, __VA_ARGS__)
|
||||
#else
|
||||
# define MT_TRACE(...)
|
||||
#endif
|
||||
|
||||
/* This is a thread implementation for Win32 lazy implementation */
|
||||
|
||||
/* Thread server common information */
|
||||
@@ -68,19 +74,12 @@ static HANDLE blas_threads [MAX_CPU_NUMBER];
|
||||
static DWORD blas_threads_id[MAX_CPU_NUMBER];
|
||||
static volatile int thread_target; // target num of live threads, volatile for cross-thread reads
|
||||
|
||||
#if defined (__GNUC__) && (__GNUC__ < 6)
|
||||
#define WIN_CAS(dest, exch, comp) __sync_val_compare_and_swap(dest, comp, exch)
|
||||
#else
|
||||
#if defined(_WIN64)
|
||||
#define WIN_CAS(dest, exch, comp) InterlockedCompareExchange64(dest, exch, comp)
|
||||
#else
|
||||
#define WIN_CAS(dest, exch, comp) InterlockedCompareExchange(dest, exch, comp)
|
||||
#endif
|
||||
#endif
|
||||
//
|
||||
// Legacy code path
|
||||
//
|
||||
static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb) {
|
||||
|
||||
static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
|
||||
|
||||
if (!(mode & BLAS_COMPLEX)){
|
||||
if (!(mode & BLAS_COMPLEX)) {
|
||||
#ifdef EXPRECISION
|
||||
if ((mode & BLAS_PREC) == BLAS_XDOUBLE){
|
||||
/* REAL / Extended Double */
|
||||
@@ -95,7 +94,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
|
||||
args -> c, args -> ldc, sb);
|
||||
} else
|
||||
#endif
|
||||
if ((mode & BLAS_PREC) == BLAS_DOUBLE){
|
||||
if ((mode & BLAS_PREC) == BLAS_DOUBLE) {
|
||||
/* REAL / Double */
|
||||
void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double,
|
||||
double *, BLASLONG, double *, BLASLONG,
|
||||
@@ -106,7 +105,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
|
||||
args -> a, args -> lda,
|
||||
args -> b, args -> ldb,
|
||||
args -> c, args -> ldc, sb);
|
||||
} else if ((mode & BLAS_PREC) == BLAS_SINGLE){
|
||||
} else if ((mode & BLAS_PREC) == BLAS_SINGLE) {
|
||||
/* REAL / Single */
|
||||
void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float,
|
||||
float *, BLASLONG, float *, BLASLONG,
|
||||
@@ -118,7 +117,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
|
||||
args -> b, args -> ldb,
|
||||
args -> c, args -> ldc, sb);
|
||||
#ifdef BUILD_BFLOAT16
|
||||
} else if ((mode & BLAS_PREC) == BLAS_BFLOAT16){
|
||||
} else if ((mode & BLAS_PREC) == BLAS_BFLOAT16) {
|
||||
/* REAL / BFLOAT16 */
|
||||
void (*afunc)(BLASLONG, BLASLONG, BLASLONG, bfloat16,
|
||||
bfloat16 *, BLASLONG, bfloat16 *, BLASLONG,
|
||||
@@ -129,7 +128,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
|
||||
args -> a, args -> lda,
|
||||
args -> b, args -> ldb,
|
||||
args -> c, args -> ldc, sb);
|
||||
} else if ((mode & BLAS_PREC) == BLAS_STOBF16){
|
||||
} else if ((mode & BLAS_PREC) == BLAS_STOBF16) {
|
||||
/* REAL / BLAS_STOBF16 */
|
||||
void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float,
|
||||
float *, BLASLONG, bfloat16 *, BLASLONG,
|
||||
@@ -140,7 +139,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
|
||||
args -> a, args -> lda,
|
||||
args -> b, args -> ldb,
|
||||
args -> c, args -> ldc, sb);
|
||||
} else if ((mode & BLAS_PREC) == BLAS_DTOBF16){
|
||||
} else if ((mode & BLAS_PREC) == BLAS_DTOBF16) {
|
||||
/* REAL / BLAS_DTOBF16 */
|
||||
void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double,
|
||||
double *, BLASLONG, bfloat16 *, BLASLONG,
|
||||
@@ -157,7 +156,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
|
||||
}
|
||||
} else {
|
||||
#ifdef EXPRECISION
|
||||
if ((mode & BLAS_PREC) == BLAS_XDOUBLE){
|
||||
if ((mode & BLAS_PREC) == BLAS_XDOUBLE) {
|
||||
/* COMPLEX / Extended Double */
|
||||
void (*afunc)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble,
|
||||
xdouble *, BLASLONG, xdouble *, BLASLONG,
|
||||
@@ -171,7 +170,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
|
||||
args -> c, args -> ldc, sb);
|
||||
} else
|
||||
#endif
|
||||
if ((mode & BLAS_PREC) == BLAS_DOUBLE){
|
||||
if ((mode & BLAS_PREC) == BLAS_DOUBLE) {
|
||||
/* COMPLEX / Double */
|
||||
void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, double,
|
||||
double *, BLASLONG, double *, BLASLONG,
|
||||
@@ -201,10 +200,10 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
|
||||
}
|
||||
}
|
||||
|
||||
/* This is a main routine of threads. Each thread waits until job is */
|
||||
/* queued. */
|
||||
|
||||
static DWORD WINAPI blas_thread_server(void *arg){
|
||||
//
|
||||
// This is the main routine of the worker threads. Each thread waits until a job is queued.
|
||||
//
|
||||
static DWORD WINAPI blas_thread_server(void *arg) {
|
||||
|
||||
/* Thread identifier */
|
||||
BLASLONG cpu = (BLASLONG)arg;
|
||||
@@ -215,31 +214,24 @@ static DWORD WINAPI blas_thread_server(void *arg){
|
||||
/* Each server needs each buffer */
|
||||
buffer = blas_memory_alloc(2);
|
||||
|
||||
#ifdef SMP_DEBUG
|
||||
fprintf(STDERR, "Server[%2ld] Thread is started!\n", cpu);
|
||||
#endif
|
||||
MT_TRACE("Server[%2ld] Thread is started!\n", cpu);
|
||||
|
||||
while (1){
|
||||
while (1) {
|
||||
|
||||
/* Waiting for Queue */
|
||||
|
||||
#ifdef SMP_DEBUG
|
||||
fprintf(STDERR, "Server[%2ld] Waiting for Queue.\n", cpu);
|
||||
#endif
|
||||
// event raised when work is added to the queue
|
||||
WaitForSingleObject(kickoff_event, INFINITE);
|
||||
MT_TRACE("Server[%2ld] Waiting for Queue.\n", cpu);
|
||||
|
||||
if (cpu > thread_target - 2)
|
||||
{
|
||||
//printf("thread [%d] exiting.\n", cpu);
|
||||
break; // excess thread, so worker thread exits
|
||||
}
|
||||
// event raised when work is added to the queue
|
||||
WaitForSingleObject(kickoff_event, INFINITE);
|
||||
|
||||
#ifdef SMP_DEBUG
|
||||
fprintf(STDERR, "Server[%2ld] Got it.\n", cpu);
|
||||
#endif
|
||||
if (cpu > thread_target - 2) {
|
||||
//MT_TRACE("thread [%d] exiting.\n", cpu);
|
||||
break; // excess thread, so worker thread exits
|
||||
}
|
||||
|
||||
MT_TRACE("Server[%2ld] Got it.\n", cpu);
|
||||
|
||||
#if 1
|
||||
EnterCriticalSection(&queue_lock);
|
||||
|
||||
queue = work_queue;
|
||||
@@ -247,53 +239,39 @@ static DWORD WINAPI blas_thread_server(void *arg){
|
||||
work_queue = work_queue->next;
|
||||
|
||||
LeaveCriticalSection(&queue_lock);
|
||||
#else
|
||||
volatile blas_queue_t* queue_next;
|
||||
|
||||
INT_PTR prev_value;
|
||||
do {
|
||||
queue = (volatile blas_queue_t*)work_queue;
|
||||
if (!queue)
|
||||
break;
|
||||
|
||||
queue_next = (volatile blas_queue_t*)queue->next;
|
||||
prev_value = WIN_CAS((INT_PTR*)&work_queue, (INT_PTR)queue_next, (INT_PTR)queue);
|
||||
} while (prev_value != queue);
|
||||
#endif
|
||||
|
||||
if (queue) {
|
||||
if (queue) {
|
||||
int (*routine)(blas_arg_t *, void *, void *, void *, void *, BLASLONG) = queue -> routine;
|
||||
|
||||
sa = queue -> sa;
|
||||
sb = queue -> sb;
|
||||
|
||||
#ifdef CONSISTENT_FPCSR
|
||||
__asm__ __volatile__ ("ldmxcsr %0" : : "m" (queue -> sse_mode));
|
||||
__asm__ __volatile__ ("fldcw %0" : : "m" (queue -> x87_mode));
|
||||
#endif
|
||||
#ifdef CONSISTENT_FPCSR
|
||||
__asm__ __volatile__ ("ldmxcsr %0" : : "m" (queue -> sse_mode));
|
||||
__asm__ __volatile__ ("fldcw %0" : : "m" (queue -> x87_mode));
|
||||
#endif
|
||||
|
||||
#ifdef SMP_DEBUG
|
||||
fprintf(STDERR, "Server[%2ld] Started. Mode = 0x%03x M = %3ld N=%3ld K=%3ld\n",
|
||||
MT_TRACE("Server[%2ld] Started. Mode = 0x%03x M = %3ld N=%3ld K=%3ld\n",
|
||||
cpu, queue->mode, queue-> args ->m, queue->args->n, queue->args->k);
|
||||
#endif
|
||||
|
||||
// fprintf(stderr, "queue start[%ld]!!!\n", cpu);
|
||||
|
||||
#ifdef MONITOR
|
||||
main_status[cpu] = MAIN_RUNNING1;
|
||||
#endif
|
||||
#ifdef MONITOR
|
||||
main_status[cpu] = MAIN_RUNNING1;
|
||||
#endif
|
||||
|
||||
if (sa == NULL) sa = (void *)((BLASLONG)buffer + GEMM_OFFSET_A);
|
||||
if (sa == NULL)
|
||||
sa = (void *)((BLASLONG)buffer + GEMM_OFFSET_A);
|
||||
|
||||
if (sb == NULL) {
|
||||
if (!(queue -> mode & BLAS_COMPLEX)){
|
||||
if (!(queue -> mode & BLAS_COMPLEX)) {
|
||||
#ifdef EXPRECISION
|
||||
if ((queue -> mode & BLAS_PREC) == BLAS_XDOUBLE){
|
||||
if ((queue -> mode & BLAS_PREC) == BLAS_XDOUBLE) {
|
||||
sb = (void *)(((BLASLONG)sa + ((XGEMM_P * XGEMM_Q * sizeof(xdouble)
|
||||
+ GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
|
||||
} else
|
||||
#endif
|
||||
if ((queue -> mode & BLAS_PREC) == BLAS_DOUBLE){
|
||||
if ((queue -> mode & BLAS_PREC) == BLAS_DOUBLE) {
|
||||
#ifdef BUILD_DOUBLE
|
||||
sb = (void *)(((BLASLONG)sa + ((DGEMM_P * DGEMM_Q * sizeof(double)
|
||||
+ GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
|
||||
@@ -327,65 +305,58 @@ static DWORD WINAPI blas_thread_server(void *arg){
|
||||
/* Other types in future */
|
||||
}
|
||||
}
|
||||
queue->sb=sb;
|
||||
queue->sb=sb;
|
||||
}
|
||||
|
||||
#ifdef MONITOR
|
||||
main_status[cpu] = MAIN_RUNNING2;
|
||||
#endif
|
||||
#ifdef MONITOR
|
||||
main_status[cpu] = MAIN_RUNNING2;
|
||||
#endif
|
||||
|
||||
if (!(queue -> mode & BLAS_LEGACY)) {
|
||||
|
||||
(routine)(queue -> args, queue -> range_m, queue -> range_n, sa, sb, queue -> position);
|
||||
(routine)(queue -> args, queue -> range_m, queue -> range_n, sa, sb, queue -> position);
|
||||
} else {
|
||||
legacy_exec(routine, queue -> mode, queue -> args, sb);
|
||||
legacy_exec(routine, queue -> mode, queue -> args, sb);
|
||||
}
|
||||
}else{
|
||||
continue; //if queue == NULL
|
||||
}
|
||||
} else {
|
||||
continue; //if queue == NULL
|
||||
}
|
||||
|
||||
#ifdef SMP_DEBUG
|
||||
fprintf(STDERR, "Server[%2ld] Finished!\n", cpu);
|
||||
#endif
|
||||
MT_TRACE("Server[%2ld] Finished!\n", cpu);
|
||||
|
||||
queue->finished = 1;
|
||||
|
||||
queue->finished = 1;
|
||||
}
|
||||
|
||||
/* Shutdown procedure */
|
||||
|
||||
#ifdef SMP_DEBUG
|
||||
fprintf(STDERR, "Server[%2ld] Shutdown!\n", cpu);
|
||||
#endif
|
||||
MT_TRACE("Server[%2ld] Shutdown!\n", cpu);
|
||||
|
||||
blas_memory_free(buffer);
|
||||
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
/* Initializing routine */
|
||||
int blas_thread_init(void){
|
||||
//
|
||||
// Initializing routine
|
||||
//
|
||||
int blas_thread_init(void) {
|
||||
BLASLONG i;
|
||||
|
||||
if (blas_server_avail || (blas_cpu_number <= 1)) return 0;
|
||||
|
||||
LOCK_COMMAND(&server_lock);
|
||||
|
||||
#ifdef SMP_DEBUG
|
||||
fprintf(STDERR, "Initializing Thread(Num. threads = %d)\n",
|
||||
blas_cpu_number);
|
||||
#endif
|
||||
MT_TRACE("Initializing Thread(Num. threads = %d)\n", blas_cpu_number);
|
||||
|
||||
if (!blas_server_avail){
|
||||
// create the kickoff Event
|
||||
kickoff_event = CreateEvent(NULL, TRUE, FALSE, NULL);
|
||||
if (!blas_server_avail) {
|
||||
// create the kickoff Event
|
||||
kickoff_event = CreateEvent(NULL, TRUE, FALSE, NULL);
|
||||
|
||||
thread_target = blas_cpu_number;
|
||||
thread_target = blas_cpu_number;
|
||||
|
||||
InitializeCriticalSection(&queue_lock);
|
||||
|
||||
for(i = 0; i < blas_cpu_number - 1; i++){
|
||||
//printf("thread_init: creating thread [%d]\n", i);
|
||||
for(i = 0; i < blas_cpu_number - 1; i++) {
|
||||
//MT_TRACE("thread_init: creating thread [%d]\n", i);
|
||||
|
||||
blas_threads[i] = CreateThread(NULL, 0,
|
||||
blas_thread_server, (void *)i,
|
||||
@@ -400,15 +371,12 @@ int blas_thread_init(void){
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
User can call one of two routines.
|
||||
|
||||
exec_blas_async ... immediately returns after jobs are queued.
|
||||
|
||||
exec_blas ... returns after jobs are finished.
|
||||
*/
|
||||
|
||||
int exec_blas_async(BLASLONG pos, blas_queue_t *queue){
|
||||
//
|
||||
// User can call one of two routines.
|
||||
// exec_blas_async ... immediately returns after jobs are queued.
|
||||
// exec_blas ... returns after jobs are finished.
|
||||
//
|
||||
int exec_blas_async(BLASLONG pos, blas_queue_t *queue) {
|
||||
|
||||
#if defined(SMP_SERVER)
|
||||
// Handle lazy re-init of the thread-pool after a POSIX fork
|
||||
@@ -428,7 +396,7 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){
|
||||
__asm__ __volatile__ ("stmxcsr %0" : "=m" (current -> sse_mode));
|
||||
#endif
|
||||
|
||||
current->finished = 0;
|
||||
current->finished = 0;
|
||||
current = current -> next;
|
||||
pos ++;
|
||||
}
|
||||
@@ -437,18 +405,18 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){
|
||||
|
||||
if (!work_queue)
|
||||
{
|
||||
work_queue = queue;
|
||||
work_queue = queue;
|
||||
}
|
||||
else
|
||||
{
|
||||
blas_queue_t *next_item = work_queue;
|
||||
|
||||
// find the end of the work queue
|
||||
while (next_item)
|
||||
next_item = next_item->next;
|
||||
// find the end of the work queue
|
||||
while (next_item)
|
||||
next_item = next_item->next;
|
||||
|
||||
// add new work to the end
|
||||
next_item = queue;
|
||||
// add new work to the end
|
||||
next_item = queue;
|
||||
}
|
||||
|
||||
LeaveCriticalSection(&queue_lock);
|
||||
@@ -458,26 +426,25 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){
|
||||
return 0;
|
||||
}
|
||||
|
||||
int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue){
|
||||
//
|
||||
// Join. Wait for all queued tasks to complete
|
||||
//
|
||||
int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue) {
|
||||
|
||||
#ifdef SMP_DEBUG
|
||||
fprintf(STDERR, "Synchronization Waiting.\n");
|
||||
#endif
|
||||
MT_TRACE("Synchronization Waiting.\n");
|
||||
|
||||
while (num){
|
||||
#ifdef SMP_DEBUG
|
||||
fprintf(STDERR, "Waiting Queue ..\n");
|
||||
#endif
|
||||
while (!queue->finished)
|
||||
YIELDING;
|
||||
while (num) {
|
||||
MT_TRACE("Waiting Queue ..\n");
|
||||
|
||||
queue = queue->next;
|
||||
num--;
|
||||
}
|
||||
while (!queue->finished)
|
||||
YIELDING;
|
||||
|
||||
queue = queue->next;
|
||||
num--;
|
||||
}
|
||||
|
||||
MT_TRACE("Completely Done.\n\n");
|
||||
|
||||
#ifdef SMP_DEBUG
|
||||
fprintf(STDERR, "Completely Done.\n\n");
|
||||
#endif
|
||||
// if work was added to the queue after this batch we can't sleep the worker threads
|
||||
// by resetting the event
|
||||
EnterCriticalSection(&queue_lock);
|
||||
@@ -490,8 +457,10 @@ int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue){
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Execute Threads */
|
||||
int exec_blas(BLASLONG num, blas_queue_t *queue){
|
||||
//
|
||||
// Execute Threads
|
||||
//
|
||||
int exec_blas(BLASLONG num, blas_queue_t *queue) {
|
||||
|
||||
#if defined(SMP_SERVER) && defined(OS_CYGWIN_NT)
|
||||
// Handle lazy re-init of the thread-pool after a POSIX fork
|
||||
@@ -504,29 +473,33 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){
|
||||
|
||||
if ((num <= 0) || (queue == NULL)) return 0;
|
||||
|
||||
if ((num > 1) && queue -> next) exec_blas_async(1, queue -> next);
|
||||
if ((num > 1) && queue -> next)
|
||||
exec_blas_async(1, queue -> next);
|
||||
|
||||
routine = queue -> routine;
|
||||
|
||||
if (queue -> mode & BLAS_LEGACY) {
|
||||
legacy_exec(routine, queue -> mode, queue -> args, queue -> sb);
|
||||
} else
|
||||
} else {
|
||||
if (queue -> mode & BLAS_PTHREAD) {
|
||||
void (*pthreadcompat)(void *) = queue -> routine;
|
||||
(pthreadcompat)(queue -> args);
|
||||
} else
|
||||
(routine)(queue -> args, queue -> range_m, queue -> range_n,
|
||||
queue -> sa, queue -> sb, 0);
|
||||
queue -> sa, queue -> sb, 0);
|
||||
}
|
||||
|
||||
if ((num > 1) && queue -> next) exec_blas_async_wait(num - 1, queue -> next);
|
||||
if ((num > 1) && queue -> next)
|
||||
exec_blas_async_wait(num - 1, queue -> next);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Shutdown procedure, but user don't have to call this routine. The */
|
||||
/* kernel automatically kill threads. */
|
||||
|
||||
int BLASFUNC(blas_thread_shutdown)(void){
|
||||
//
|
||||
// Shutdown procedure, but the user doesn't have to call this routine. The
|
||||
// kernel automatically kills threads.
|
||||
//
|
||||
int BLASFUNC(blas_thread_shutdown)(void) {
|
||||
|
||||
int i;
|
||||
|
||||
@@ -534,9 +507,9 @@ int BLASFUNC(blas_thread_shutdown)(void){
|
||||
|
||||
LOCK_COMMAND(&server_lock);
|
||||
|
||||
if (blas_server_avail){
|
||||
if (blas_server_avail) {
|
||||
|
||||
for(i = 0; i < blas_num_threads - 1; i++){
|
||||
for (i = 0; i < blas_num_threads - 1; i++) {
|
||||
// Could also just use WaitForMultipleObjects
|
||||
DWORD wait_thread_value = WaitForSingleObject(blas_threads[i], 50);
|
||||
|
||||
@@ -558,6 +531,9 @@ int BLASFUNC(blas_thread_shutdown)(void){
|
||||
return 0;
|
||||
}
|
||||
|
||||
//
|
||||
// Legacy function to set number of threads
|
||||
//
|
||||
void goto_set_num_threads(int num_threads)
|
||||
{
|
||||
long i;
|
||||
@@ -571,7 +547,7 @@ void goto_set_num_threads(int num_threads)
|
||||
|
||||
if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER;
|
||||
|
||||
if (blas_server_avail && num_threads < blas_num_threads) {
|
||||
if (blas_server_avail && num_threads < blas_num_threads) {
|
||||
LOCK_COMMAND(&server_lock);
|
||||
|
||||
thread_target = num_threads;
|
||||
@@ -579,11 +555,11 @@ void goto_set_num_threads(int num_threads)
|
||||
SetEvent(kickoff_event);
|
||||
|
||||
for (i = num_threads - 1; i < blas_num_threads - 1; i++) {
|
||||
//printf("set_num_threads: waiting on thread [%d] to quit.\n", i);
|
||||
//MT_TRACE("set_num_threads: waiting on thread [%d] to quit.\n", i);
|
||||
|
||||
WaitForSingleObject(blas_threads[i], INFINITE);
|
||||
|
||||
//printf("set_num_threads: thread [%d] has quit.\n", i);
|
||||
//MT_TRACE("set_num_threads: thread [%d] has quit.\n", i);
|
||||
|
||||
CloseHandle(blas_threads[i]);
|
||||
}
|
||||
@@ -601,8 +577,8 @@ void goto_set_num_threads(int num_threads)
|
||||
|
||||
thread_target = num_threads;
|
||||
|
||||
//increased_threads = 1;
|
||||
if (!blas_server_avail){
|
||||
//increased_threads = 1;
|
||||
if (!blas_server_avail) {
|
||||
// create the kickoff Event
|
||||
kickoff_event = CreateEvent(NULL, TRUE, FALSE, NULL);
|
||||
|
||||
@@ -611,8 +587,8 @@ void goto_set_num_threads(int num_threads)
|
||||
blas_server_avail = 1;
|
||||
}
|
||||
|
||||
for(i = (blas_num_threads > 0) ? blas_num_threads - 1 : 0; i < num_threads - 1; i++){
|
||||
//printf("set_num_threads: creating thread [%d]\n", i);
|
||||
for (i = (blas_num_threads > 0) ? blas_num_threads - 1 : 0; i < num_threads - 1; i++) {
|
||||
//MT_TRACE("set_num_threads: creating thread [%d]\n", i);
|
||||
|
||||
blas_threads[i] = CreateThread(NULL, 0,
|
||||
blas_thread_server, (void *)i,
|
||||
@@ -627,6 +603,9 @@ void goto_set_num_threads(int num_threads)
|
||||
blas_cpu_number = num_threads;
|
||||
}
|
||||
|
||||
//
|
||||
// Openblas function to set thread count
|
||||
//
|
||||
void openblas_set_num_threads(int num)
|
||||
{
|
||||
goto_set_num_threads(num);
|
||||
|
||||
@@ -73,6 +73,10 @@ endif
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(F_COMPILER)$(OSNAME), IBMAIX)
|
||||
EXTRALIB += -lxlf90
|
||||
endif
|
||||
|
||||
ifeq ($(C_COMPILER), PGI)
|
||||
EXTRALIB += -pgf90libs
|
||||
endif
|
||||
@@ -256,6 +260,20 @@ endif
|
||||
|
||||
ifeq ($(OSNAME), AIX)
|
||||
|
||||
so : ../$(LIBSONAME) linktest.c
|
||||
$(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(EXTRALIB) && echo OK.
|
||||
rm -f linktest
|
||||
|
||||
../$(LIBSONAME) : aix.exp
|
||||
$(CC) $(CFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \
|
||||
-Wl,-bE:aix.exp -Wl,-bbigtoc ../$(LIBNAME) $(EXTRALIB)
|
||||
|
||||
aix.exp :
|
||||
/usr/bin/nm -X32_64 -PCpgl ../$(LIBNAME) | /usr/bin/awk '{ if ((($$ 2 == "T") \
|
||||
|| ($$ 2 == "D") || ($$ 2 == "B") || ($$ 2 == "W") || ($$ 2 == "V") || ($$ 2 == "Z")) && (substr($$ 1,1,1) != ".")) \
|
||||
{ if (($$ 2 == "W") || ($$ 2 == "V") || ($$ 2 == "Z")) { print $$ 1 " weak" } else { print $$ 1 } } }' | \
|
||||
/usr/bin/sort -u > aix.exp
|
||||
|
||||
ifeq ($(COMPILER_F77), xlf)
|
||||
|
||||
goto32.$(SUFFIX) : ../$(LIBNAME) aix.def
|
||||
|
||||
10
getarch.c
10
getarch.c
@@ -90,7 +90,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#include <sys/sysinfo.h>
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
#if defined(AIX)
|
||||
#if defined(_AIX)
|
||||
#include <unistd.h>
|
||||
#include <sys/systemcfg.h>
|
||||
#include <sys/sysinfo.h>
|
||||
#endif
|
||||
|
||||
@@ -1870,11 +1872,13 @@ static int get_num_cores(void) {
|
||||
|
||||
return count;
|
||||
|
||||
#elif defined(AIX)
|
||||
#elif defined(_AIX)
|
||||
//returns the number of processors which are currently online
|
||||
count = sysconf(_SC_NPROCESSORS_ONLN);
|
||||
if (count <= 0) count = 2;
|
||||
|
||||
|
||||
return count;
|
||||
|
||||
#else
|
||||
return 2;
|
||||
#endif
|
||||
|
||||
1170
kernel/generic/zhemm_ltcopy_16.c
Normal file
1170
kernel/generic/zhemm_ltcopy_16.c
Normal file
File diff suppressed because it is too large
Load Diff
1168
kernel/generic/zhemm_utcopy_16.c
Normal file
1168
kernel/generic/zhemm_utcopy_16.c
Normal file
File diff suppressed because it is too large
Load Diff
587
kernel/generic/zneg_tcopy_16.c
Normal file
587
kernel/generic/zneg_tcopy_16.c
Normal file
@@ -0,0 +1,587 @@
|
||||
/*******************************************************************************
|
||||
Copyright (c) 2024, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include "common.h"
|
||||
|
||||
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
|
||||
|
||||
BLASLONG i, j;
|
||||
|
||||
FLOAT *aoffset;
|
||||
FLOAT *aoffset1, *aoffset2;
|
||||
|
||||
FLOAT *boffset;
|
||||
|
||||
FLOAT ctemp01, ctemp02, ctemp03, ctemp04;
|
||||
FLOAT ctemp05, ctemp06, ctemp07, ctemp08;
|
||||
FLOAT ctemp09, ctemp10, ctemp11, ctemp12;
|
||||
FLOAT ctemp13, ctemp14, ctemp15, ctemp16;
|
||||
FLOAT ctemp17, ctemp18, ctemp19, ctemp20;
|
||||
FLOAT ctemp21, ctemp22, ctemp23, ctemp24;
|
||||
FLOAT ctemp25, ctemp26, ctemp27, ctemp28;
|
||||
FLOAT ctemp29, ctemp30, ctemp31, ctemp32;
|
||||
|
||||
FLOAT ctemp33, ctemp34, ctemp35, ctemp36;
|
||||
FLOAT ctemp37, ctemp38, ctemp39, ctemp40;
|
||||
FLOAT ctemp41, ctemp42, ctemp43, ctemp44;
|
||||
FLOAT ctemp45, ctemp46, ctemp47, ctemp48;
|
||||
FLOAT ctemp49, ctemp50, ctemp51, ctemp52;
|
||||
FLOAT ctemp53, ctemp54, ctemp55, ctemp56;
|
||||
FLOAT ctemp57, ctemp58, ctemp59, ctemp60;
|
||||
FLOAT ctemp61, ctemp62, ctemp63, ctemp64;
|
||||
|
||||
aoffset = a;
|
||||
boffset = b;
|
||||
lda *= 2;
|
||||
|
||||
#if 0
|
||||
fprintf(stderr, "M = %d N = %d\n", m, n);
|
||||
#endif
|
||||
|
||||
j = (n >> 4);
|
||||
if (j > 0){
|
||||
do{
|
||||
aoffset1 = aoffset;
|
||||
aoffset2 = aoffset + lda;
|
||||
aoffset += 32;
|
||||
|
||||
i = (m >> 1);
|
||||
if (i > 0){
|
||||
do{
|
||||
ctemp01 = *(aoffset1 + 0);
|
||||
ctemp02 = *(aoffset1 + 1);
|
||||
ctemp03 = *(aoffset1 + 2);
|
||||
ctemp04 = *(aoffset1 + 3);
|
||||
ctemp05 = *(aoffset1 + 4);
|
||||
ctemp06 = *(aoffset1 + 5);
|
||||
ctemp07 = *(aoffset1 + 6);
|
||||
ctemp08 = *(aoffset1 + 7);
|
||||
ctemp09 = *(aoffset1 + 8);
|
||||
ctemp10 = *(aoffset1 + 9);
|
||||
ctemp11 = *(aoffset1 + 10);
|
||||
ctemp12 = *(aoffset1 + 11);
|
||||
ctemp13 = *(aoffset1 + 12);
|
||||
ctemp14 = *(aoffset1 + 13);
|
||||
ctemp15 = *(aoffset1 + 14);
|
||||
ctemp16 = *(aoffset1 + 15);
|
||||
ctemp17 = *(aoffset1 + 16);
|
||||
ctemp18 = *(aoffset1 + 17);
|
||||
ctemp19 = *(aoffset1 + 18);
|
||||
ctemp20 = *(aoffset1 + 19);
|
||||
ctemp21 = *(aoffset1 + 20);
|
||||
ctemp22 = *(aoffset1 + 21);
|
||||
ctemp23 = *(aoffset1 + 22);
|
||||
ctemp24 = *(aoffset1 + 23);
|
||||
ctemp25 = *(aoffset1 + 24);
|
||||
ctemp26 = *(aoffset1 + 25);
|
||||
ctemp27 = *(aoffset1 + 26);
|
||||
ctemp28 = *(aoffset1 + 27);
|
||||
ctemp29 = *(aoffset1 + 28);
|
||||
ctemp30 = *(aoffset1 + 29);
|
||||
ctemp31 = *(aoffset1 + 30);
|
||||
ctemp32 = *(aoffset1 + 31);
|
||||
|
||||
ctemp33 = *(aoffset2 + 0);
|
||||
ctemp34 = *(aoffset2 + 1);
|
||||
ctemp35 = *(aoffset2 + 2);
|
||||
ctemp36 = *(aoffset2 + 3);
|
||||
ctemp37 = *(aoffset2 + 4);
|
||||
ctemp38 = *(aoffset2 + 5);
|
||||
ctemp39 = *(aoffset2 + 6);
|
||||
ctemp40 = *(aoffset2 + 7);
|
||||
ctemp41 = *(aoffset2 + 8);
|
||||
ctemp42 = *(aoffset2 + 9);
|
||||
ctemp43 = *(aoffset2 + 10);
|
||||
ctemp44 = *(aoffset2 + 11);
|
||||
ctemp45 = *(aoffset2 + 12);
|
||||
ctemp46 = *(aoffset2 + 13);
|
||||
ctemp47 = *(aoffset2 + 14);
|
||||
ctemp48 = *(aoffset2 + 15);
|
||||
ctemp49 = *(aoffset2 + 16);
|
||||
ctemp50 = *(aoffset2 + 17);
|
||||
ctemp51 = *(aoffset2 + 18);
|
||||
ctemp52 = *(aoffset2 + 19);
|
||||
ctemp53 = *(aoffset2 + 20);
|
||||
ctemp54 = *(aoffset2 + 21);
|
||||
ctemp55 = *(aoffset2 + 22);
|
||||
ctemp56 = *(aoffset2 + 23);
|
||||
ctemp57 = *(aoffset2 + 24);
|
||||
ctemp58 = *(aoffset2 + 25);
|
||||
ctemp59 = *(aoffset2 + 26);
|
||||
ctemp60 = *(aoffset2 + 27);
|
||||
ctemp61 = *(aoffset2 + 28);
|
||||
ctemp62 = *(aoffset2 + 29);
|
||||
ctemp63 = *(aoffset2 + 30);
|
||||
ctemp64 = *(aoffset2 + 31);
|
||||
|
||||
*(boffset + 0) = -ctemp01;
|
||||
*(boffset + 1) = -ctemp02;
|
||||
*(boffset + 2) = -ctemp03;
|
||||
*(boffset + 3) = -ctemp04;
|
||||
*(boffset + 4) = -ctemp05;
|
||||
*(boffset + 5) = -ctemp06;
|
||||
*(boffset + 6) = -ctemp07;
|
||||
*(boffset + 7) = -ctemp08;
|
||||
|
||||
*(boffset + 8) = -ctemp09;
|
||||
*(boffset + 9) = -ctemp10;
|
||||
*(boffset + 10) = -ctemp11;
|
||||
*(boffset + 11) = -ctemp12;
|
||||
*(boffset + 12) = -ctemp13;
|
||||
*(boffset + 13) = -ctemp14;
|
||||
*(boffset + 14) = -ctemp15;
|
||||
*(boffset + 15) = -ctemp16;
|
||||
|
||||
*(boffset + 16) = -ctemp17;
|
||||
*(boffset + 17) = -ctemp18;
|
||||
*(boffset + 18) = -ctemp19;
|
||||
*(boffset + 19) = -ctemp20;
|
||||
*(boffset + 20) = -ctemp21;
|
||||
*(boffset + 21) = -ctemp22;
|
||||
*(boffset + 22) = -ctemp23;
|
||||
*(boffset + 23) = -ctemp24;
|
||||
|
||||
*(boffset + 24) = -ctemp25;
|
||||
*(boffset + 25) = -ctemp26;
|
||||
*(boffset + 26) = -ctemp27;
|
||||
*(boffset + 27) = -ctemp28;
|
||||
*(boffset + 28) = -ctemp29;
|
||||
*(boffset + 29) = -ctemp30;
|
||||
*(boffset + 30) = -ctemp31;
|
||||
*(boffset + 31) = -ctemp32;
|
||||
|
||||
*(boffset + 32) = -ctemp33;
|
||||
*(boffset + 33) = -ctemp34;
|
||||
*(boffset + 34) = -ctemp35;
|
||||
*(boffset + 35) = -ctemp36;
|
||||
*(boffset + 36) = -ctemp37;
|
||||
*(boffset + 37) = -ctemp38;
|
||||
*(boffset + 38) = -ctemp39;
|
||||
*(boffset + 39) = -ctemp40;
|
||||
|
||||
*(boffset + 40) = -ctemp41;
|
||||
*(boffset + 41) = -ctemp42;
|
||||
*(boffset + 42) = -ctemp43;
|
||||
*(boffset + 43) = -ctemp44;
|
||||
*(boffset + 44) = -ctemp45;
|
||||
*(boffset + 45) = -ctemp46;
|
||||
*(boffset + 46) = -ctemp47;
|
||||
*(boffset + 47) = -ctemp48;
|
||||
|
||||
*(boffset + 48) = -ctemp49;
|
||||
*(boffset + 49) = -ctemp50;
|
||||
*(boffset + 50) = -ctemp51;
|
||||
*(boffset + 51) = -ctemp52;
|
||||
*(boffset + 52) = -ctemp53;
|
||||
*(boffset + 53) = -ctemp54;
|
||||
*(boffset + 54) = -ctemp55;
|
||||
*(boffset + 55) = -ctemp56;
|
||||
|
||||
*(boffset + 56) = -ctemp57;
|
||||
*(boffset + 57) = -ctemp58;
|
||||
*(boffset + 58) = -ctemp59;
|
||||
*(boffset + 59) = -ctemp60;
|
||||
*(boffset + 60) = -ctemp61;
|
||||
*(boffset + 61) = -ctemp62;
|
||||
*(boffset + 62) = -ctemp63;
|
||||
*(boffset + 63) = -ctemp64;
|
||||
|
||||
aoffset1 += 2 * lda;
|
||||
aoffset2 += 2 * lda;
|
||||
boffset += 64;
|
||||
|
||||
i --;
|
||||
}while(i > 0);
|
||||
}
|
||||
|
||||
if (m & 1){
|
||||
ctemp01 = *(aoffset1 + 0);
|
||||
ctemp02 = *(aoffset1 + 1);
|
||||
ctemp03 = *(aoffset1 + 2);
|
||||
ctemp04 = *(aoffset1 + 3);
|
||||
ctemp05 = *(aoffset1 + 4);
|
||||
ctemp06 = *(aoffset1 + 5);
|
||||
ctemp07 = *(aoffset1 + 6);
|
||||
ctemp08 = *(aoffset1 + 7);
|
||||
ctemp09 = *(aoffset1 + 8);
|
||||
ctemp10 = *(aoffset1 + 9);
|
||||
ctemp11 = *(aoffset1 + 10);
|
||||
ctemp12 = *(aoffset1 + 11);
|
||||
ctemp13 = *(aoffset1 + 12);
|
||||
ctemp14 = *(aoffset1 + 13);
|
||||
ctemp15 = *(aoffset1 + 14);
|
||||
ctemp16 = *(aoffset1 + 15);
|
||||
ctemp17 = *(aoffset1 + 16);
|
||||
ctemp18 = *(aoffset1 + 17);
|
||||
ctemp19 = *(aoffset1 + 18);
|
||||
ctemp20 = *(aoffset1 + 19);
|
||||
ctemp21 = *(aoffset1 + 20);
|
||||
ctemp22 = *(aoffset1 + 21);
|
||||
ctemp23 = *(aoffset1 + 22);
|
||||
ctemp24 = *(aoffset1 + 23);
|
||||
ctemp25 = *(aoffset1 + 24);
|
||||
ctemp26 = *(aoffset1 + 25);
|
||||
ctemp27 = *(aoffset1 + 26);
|
||||
ctemp28 = *(aoffset1 + 27);
|
||||
ctemp29 = *(aoffset1 + 28);
|
||||
ctemp30 = *(aoffset1 + 29);
|
||||
ctemp31 = *(aoffset1 + 30);
|
||||
ctemp32 = *(aoffset1 + 31);
|
||||
|
||||
*(boffset + 0) = -ctemp01;
|
||||
*(boffset + 1) = -ctemp02;
|
||||
*(boffset + 2) = -ctemp03;
|
||||
*(boffset + 3) = -ctemp04;
|
||||
*(boffset + 4) = -ctemp05;
|
||||
*(boffset + 5) = -ctemp06;
|
||||
*(boffset + 6) = -ctemp07;
|
||||
*(boffset + 7) = -ctemp08;
|
||||
|
||||
*(boffset + 8) = -ctemp09;
|
||||
*(boffset + 9) = -ctemp10;
|
||||
*(boffset + 10) = -ctemp11;
|
||||
*(boffset + 11) = -ctemp12;
|
||||
*(boffset + 12) = -ctemp13;
|
||||
*(boffset + 13) = -ctemp14;
|
||||
*(boffset + 14) = -ctemp15;
|
||||
*(boffset + 15) = -ctemp16;
|
||||
|
||||
*(boffset + 16) = -ctemp17;
|
||||
*(boffset + 17) = -ctemp18;
|
||||
*(boffset + 18) = -ctemp19;
|
||||
*(boffset + 19) = -ctemp20;
|
||||
*(boffset + 20) = -ctemp21;
|
||||
*(boffset + 21) = -ctemp22;
|
||||
*(boffset + 22) = -ctemp23;
|
||||
*(boffset + 23) = -ctemp24;
|
||||
|
||||
*(boffset + 24) = -ctemp25;
|
||||
*(boffset + 25) = -ctemp26;
|
||||
*(boffset + 26) = -ctemp27;
|
||||
*(boffset + 27) = -ctemp28;
|
||||
*(boffset + 28) = -ctemp29;
|
||||
*(boffset + 29) = -ctemp30;
|
||||
*(boffset + 30) = -ctemp31;
|
||||
*(boffset + 31) = -ctemp32;
|
||||
|
||||
boffset += 32;
|
||||
}
|
||||
|
||||
j--;
|
||||
}while(j > 0);
|
||||
} /* end of if(j > 0) */
|
||||
|
||||
if (n & 8){
|
||||
aoffset1 = aoffset;
|
||||
aoffset2 = aoffset + lda;
|
||||
aoffset += 16;
|
||||
|
||||
i = (m >> 1);
|
||||
if (i > 0){
|
||||
do{
|
||||
ctemp01 = *(aoffset1 + 0);
|
||||
ctemp02 = *(aoffset1 + 1);
|
||||
ctemp03 = *(aoffset1 + 2);
|
||||
ctemp04 = *(aoffset1 + 3);
|
||||
ctemp05 = *(aoffset1 + 4);
|
||||
ctemp06 = *(aoffset1 + 5);
|
||||
ctemp07 = *(aoffset1 + 6);
|
||||
ctemp08 = *(aoffset1 + 7);
|
||||
ctemp09 = *(aoffset1 + 8);
|
||||
ctemp10 = *(aoffset1 + 9);
|
||||
ctemp11 = *(aoffset1 + 10);
|
||||
ctemp12 = *(aoffset1 + 11);
|
||||
ctemp13 = *(aoffset1 + 12);
|
||||
ctemp14 = *(aoffset1 + 13);
|
||||
ctemp15 = *(aoffset1 + 14);
|
||||
ctemp16 = *(aoffset1 + 15);
|
||||
|
||||
ctemp17 = *(aoffset2 + 0);
|
||||
ctemp18 = *(aoffset2 + 1);
|
||||
ctemp19 = *(aoffset2 + 2);
|
||||
ctemp20 = *(aoffset2 + 3);
|
||||
ctemp21 = *(aoffset2 + 4);
|
||||
ctemp22 = *(aoffset2 + 5);
|
||||
ctemp23 = *(aoffset2 + 6);
|
||||
ctemp24 = *(aoffset2 + 7);
|
||||
ctemp25 = *(aoffset2 + 8);
|
||||
ctemp26 = *(aoffset2 + 9);
|
||||
ctemp27 = *(aoffset2 + 10);
|
||||
ctemp28 = *(aoffset2 + 11);
|
||||
ctemp29 = *(aoffset2 + 12);
|
||||
ctemp30 = *(aoffset2 + 13);
|
||||
ctemp31 = *(aoffset2 + 14);
|
||||
ctemp32 = *(aoffset2 + 15);
|
||||
|
||||
*(boffset + 0) = -ctemp01;
|
||||
*(boffset + 1) = -ctemp02;
|
||||
*(boffset + 2) = -ctemp03;
|
||||
*(boffset + 3) = -ctemp04;
|
||||
*(boffset + 4) = -ctemp05;
|
||||
*(boffset + 5) = -ctemp06;
|
||||
*(boffset + 6) = -ctemp07;
|
||||
*(boffset + 7) = -ctemp08;
|
||||
|
||||
*(boffset + 8) = -ctemp09;
|
||||
*(boffset + 9) = -ctemp10;
|
||||
*(boffset + 10) = -ctemp11;
|
||||
*(boffset + 11) = -ctemp12;
|
||||
*(boffset + 12) = -ctemp13;
|
||||
*(boffset + 13) = -ctemp14;
|
||||
*(boffset + 14) = -ctemp15;
|
||||
*(boffset + 15) = -ctemp16;
|
||||
|
||||
*(boffset + 16) = -ctemp17;
|
||||
*(boffset + 17) = -ctemp18;
|
||||
*(boffset + 18) = -ctemp19;
|
||||
*(boffset + 19) = -ctemp20;
|
||||
*(boffset + 20) = -ctemp21;
|
||||
*(boffset + 21) = -ctemp22;
|
||||
*(boffset + 22) = -ctemp23;
|
||||
*(boffset + 23) = -ctemp24;
|
||||
|
||||
*(boffset + 24) = -ctemp25;
|
||||
*(boffset + 25) = -ctemp26;
|
||||
*(boffset + 26) = -ctemp27;
|
||||
*(boffset + 27) = -ctemp28;
|
||||
*(boffset + 28) = -ctemp29;
|
||||
*(boffset + 29) = -ctemp30;
|
||||
*(boffset + 30) = -ctemp31;
|
||||
*(boffset + 31) = -ctemp32;
|
||||
|
||||
aoffset1 += 2 * lda;
|
||||
aoffset2 += 2 * lda;
|
||||
boffset += 32;
|
||||
|
||||
i --;
|
||||
}while(i > 0);
|
||||
}
|
||||
|
||||
if (m & 1){
|
||||
ctemp01 = *(aoffset1 + 0);
|
||||
ctemp02 = *(aoffset1 + 1);
|
||||
ctemp03 = *(aoffset1 + 2);
|
||||
ctemp04 = *(aoffset1 + 3);
|
||||
ctemp05 = *(aoffset1 + 4);
|
||||
ctemp06 = *(aoffset1 + 5);
|
||||
ctemp07 = *(aoffset1 + 6);
|
||||
ctemp08 = *(aoffset1 + 7);
|
||||
ctemp09 = *(aoffset1 + 8);
|
||||
ctemp10 = *(aoffset1 + 9);
|
||||
ctemp11 = *(aoffset1 + 10);
|
||||
ctemp12 = *(aoffset1 + 11);
|
||||
ctemp13 = *(aoffset1 + 12);
|
||||
ctemp14 = *(aoffset1 + 13);
|
||||
ctemp15 = *(aoffset1 + 14);
|
||||
ctemp16 = *(aoffset1 + 15);
|
||||
|
||||
*(boffset + 0) = -ctemp01;
|
||||
*(boffset + 1) = -ctemp02;
|
||||
*(boffset + 2) = -ctemp03;
|
||||
*(boffset + 3) = -ctemp04;
|
||||
*(boffset + 4) = -ctemp05;
|
||||
*(boffset + 5) = -ctemp06;
|
||||
*(boffset + 6) = -ctemp07;
|
||||
*(boffset + 7) = -ctemp08;
|
||||
|
||||
*(boffset + 8) = -ctemp09;
|
||||
*(boffset + 9) = -ctemp10;
|
||||
*(boffset + 10) = -ctemp11;
|
||||
*(boffset + 11) = -ctemp12;
|
||||
*(boffset + 12) = -ctemp13;
|
||||
*(boffset + 13) = -ctemp14;
|
||||
*(boffset + 14) = -ctemp15;
|
||||
*(boffset + 15) = -ctemp16;
|
||||
|
||||
boffset += 16;
|
||||
}
|
||||
}
|
||||
|
||||
if (n & 4){
|
||||
aoffset1 = aoffset;
|
||||
aoffset2 = aoffset + lda;
|
||||
aoffset += 8;
|
||||
|
||||
i = (m >> 1);
|
||||
if (i > 0){
|
||||
do{
|
||||
ctemp01 = *(aoffset1 + 0);
|
||||
ctemp02 = *(aoffset1 + 1);
|
||||
ctemp03 = *(aoffset1 + 2);
|
||||
ctemp04 = *(aoffset1 + 3);
|
||||
ctemp05 = *(aoffset1 + 4);
|
||||
ctemp06 = *(aoffset1 + 5);
|
||||
ctemp07 = *(aoffset1 + 6);
|
||||
ctemp08 = *(aoffset1 + 7);
|
||||
|
||||
ctemp09 = *(aoffset2 + 0);
|
||||
ctemp10 = *(aoffset2 + 1);
|
||||
ctemp11 = *(aoffset2 + 2);
|
||||
ctemp12 = *(aoffset2 + 3);
|
||||
ctemp13 = *(aoffset2 + 4);
|
||||
ctemp14 = *(aoffset2 + 5);
|
||||
ctemp15 = *(aoffset2 + 6);
|
||||
ctemp16 = *(aoffset2 + 7);
|
||||
|
||||
*(boffset + 0) = -ctemp01;
|
||||
*(boffset + 1) = -ctemp02;
|
||||
*(boffset + 2) = -ctemp03;
|
||||
*(boffset + 3) = -ctemp04;
|
||||
*(boffset + 4) = -ctemp05;
|
||||
*(boffset + 5) = -ctemp06;
|
||||
*(boffset + 6) = -ctemp07;
|
||||
*(boffset + 7) = -ctemp08;
|
||||
|
||||
*(boffset + 8) = -ctemp09;
|
||||
*(boffset + 9) = -ctemp10;
|
||||
*(boffset + 10) = -ctemp11;
|
||||
*(boffset + 11) = -ctemp12;
|
||||
*(boffset + 12) = -ctemp13;
|
||||
*(boffset + 13) = -ctemp14;
|
||||
*(boffset + 14) = -ctemp15;
|
||||
*(boffset + 15) = -ctemp16;
|
||||
|
||||
aoffset1 += 2 * lda;
|
||||
aoffset2 += 2 * lda;
|
||||
boffset += 16;
|
||||
|
||||
i --;
|
||||
}while(i > 0);
|
||||
}
|
||||
|
||||
if (m & 1){
|
||||
ctemp01 = *(aoffset1 + 0);
|
||||
ctemp02 = *(aoffset1 + 1);
|
||||
ctemp03 = *(aoffset1 + 2);
|
||||
ctemp04 = *(aoffset1 + 3);
|
||||
ctemp05 = *(aoffset1 + 4);
|
||||
ctemp06 = *(aoffset1 + 5);
|
||||
ctemp07 = *(aoffset1 + 6);
|
||||
ctemp08 = *(aoffset1 + 7);
|
||||
|
||||
*(boffset + 0) = -ctemp01;
|
||||
*(boffset + 1) = -ctemp02;
|
||||
*(boffset + 2) = -ctemp03;
|
||||
*(boffset + 3) = -ctemp04;
|
||||
*(boffset + 4) = -ctemp05;
|
||||
*(boffset + 5) = -ctemp06;
|
||||
*(boffset + 6) = -ctemp07;
|
||||
*(boffset + 7) = -ctemp08;
|
||||
|
||||
boffset += 8;
|
||||
}
|
||||
}
|
||||
|
||||
if (n & 2){
|
||||
aoffset1 = aoffset;
|
||||
aoffset2 = aoffset + lda;
|
||||
aoffset += 4;
|
||||
|
||||
i = (m >> 1);
|
||||
if (i > 0){
|
||||
do{
|
||||
ctemp01 = *(aoffset1 + 0);
|
||||
ctemp02 = *(aoffset1 + 1);
|
||||
ctemp03 = *(aoffset1 + 2);
|
||||
ctemp04 = *(aoffset1 + 3);
|
||||
|
||||
ctemp05 = *(aoffset2 + 0);
|
||||
ctemp06 = *(aoffset2 + 1);
|
||||
ctemp07 = *(aoffset2 + 2);
|
||||
ctemp08 = *(aoffset2 + 3);
|
||||
|
||||
*(boffset + 0) = -ctemp01;
|
||||
*(boffset + 1) = -ctemp02;
|
||||
*(boffset + 2) = -ctemp03;
|
||||
*(boffset + 3) = -ctemp04;
|
||||
*(boffset + 4) = -ctemp05;
|
||||
*(boffset + 5) = -ctemp06;
|
||||
*(boffset + 6) = -ctemp07;
|
||||
*(boffset + 7) = -ctemp08;
|
||||
|
||||
aoffset1 += 2 * lda;
|
||||
aoffset2 += 2 * lda;
|
||||
boffset += 8;
|
||||
|
||||
i --;
|
||||
}while(i > 0);
|
||||
}
|
||||
|
||||
if (m & 1){
|
||||
ctemp01 = *(aoffset1 + 0);
|
||||
ctemp02 = *(aoffset1 + 1);
|
||||
ctemp03 = *(aoffset1 + 2);
|
||||
ctemp04 = *(aoffset1 + 3);
|
||||
|
||||
*(boffset + 0) = -ctemp01;
|
||||
*(boffset + 1) = -ctemp02;
|
||||
*(boffset + 2) = -ctemp03;
|
||||
*(boffset + 3) = -ctemp04;
|
||||
|
||||
boffset += 4;
|
||||
}
|
||||
}
|
||||
|
||||
if (n & 1){
|
||||
aoffset1 = aoffset;
|
||||
aoffset2 = aoffset + lda;
|
||||
// aoffset += 2;
|
||||
|
||||
i = (m >> 1);
|
||||
if (i > 0){
|
||||
do{
|
||||
ctemp01 = *(aoffset1 + 0);
|
||||
ctemp02 = *(aoffset1 + 1);
|
||||
ctemp03 = *(aoffset2 + 0);
|
||||
ctemp04 = *(aoffset2 + 1);
|
||||
|
||||
*(boffset + 0) = -ctemp01;
|
||||
*(boffset + 1) = -ctemp02;
|
||||
*(boffset + 2) = -ctemp03;
|
||||
*(boffset + 3) = -ctemp04;
|
||||
|
||||
aoffset1 += 2 * lda;
|
||||
aoffset2 += 2 * lda;
|
||||
boffset += 4;
|
||||
|
||||
i --;
|
||||
}while(i > 0);
|
||||
}
|
||||
|
||||
if (m & 1){
|
||||
ctemp01 = *(aoffset1 + 0);
|
||||
ctemp02 = *(aoffset1 + 1);
|
||||
|
||||
*(boffset + 0) = -ctemp01;
|
||||
*(boffset + 1) = -ctemp02;
|
||||
// boffset += 2;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
333
kernel/generic/zsymm_lcopy_16.c
Normal file
333
kernel/generic/zsymm_lcopy_16.c
Normal file
@@ -0,0 +1,333 @@
|
||||
/*******************************************************************************
|
||||
Copyright (c) 2024, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include "common.h"
|
||||
|
||||
/*
 * Packing routine (zsymm_lcopy, unroll 16): gathers pairs of FLOATs from `a`
 * into the contiguous buffer `b`.
 *
 *   m, n        number of packed rows / number of packed columns
 *   a, lda      source matrix and its leading dimension, counted in pairs
 *               (lda is doubled internally to count single FLOATs)
 *   posX, posY  position of the first packed column / row inside `a`
 *   b           destination; 2 * m * n FLOATs are written
 *
 * The n columns are processed in panels of 16, then one optional panel each of
 * 8, 4, 2 and 1 (selected by the low bits of n).  For every panel, and for
 * each of the m rows, one pair is taken from each of the panel's source
 * pointers and stored side by side in b.
 *
 * Column k of a panel starts at  a + (posX + k) * 2 + posY * lda  and walks
 * with stride lda while (offset > -k); otherwise it starts at
 * a + posY * 2 + (posX + k) * lda  and walks with stride 2.  `offset` begins
 * as posX - posY and drops by one per row, so a column can switch stride
 * part-way through the panel.
 * NOTE(review): presumably each pair is the (real, imag) part of one complex
 * element and the switch happens where the walk crosses the diagonal of a
 * symmetric matrix of which only one triangle is stored -- confirm with the
 * callers.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){

  FLOAT *src[16];   /* one read cursor per column of the current panel */
  FLOAT  pair[32];  /* staging area: one row of the panel, 2 FLOATs per column */

  BLASLONG width, panels, row, k, offset;

  lda *= 2;

  for (width = 16; width > 0; width >>= 1) {

    /* Full 16-wide panels come from n / 16; narrower ones appear at most once. */
    panels = (width == 16) ? (n >> 4) : ((n & width) ? 1 : 0);

    for (; panels > 0; panels--) {

      offset = posX - posY;

      /* Choose the starting address of every column in this panel. */
      for (k = 0; k < width; k++) {
        if (offset > -k) {
          src[k] = a + (posX + k) * 2 + posY * lda;
        } else {
          src[k] = a + posY * 2 + (posX + k) * lda;
        }
      }

      for (row = m; row > 0; row--) {

        /* Load the whole row first, stepping each cursor as it is read ... */
        for (k = 0; k < width; k++) {
          pair[2 * k + 0] = *(src[k] + 0);
          pair[2 * k + 1] = *(src[k] + 1);

          src[k] += (offset > -k) ? lda : 2;
        }

        /* ... and only then store it, so all reads precede all writes. */
        for (k = 0; k < 2 * width; k++) {
          b[k] = pair[k];
        }

        b += 2 * width;
        offset--;
      }

      posX += width;
    }
  }

  return 0;
}
|
||||
332
kernel/generic/zsymm_ucopy_16.c
Normal file
332
kernel/generic/zsymm_ucopy_16.c
Normal file
@@ -0,0 +1,332 @@
|
||||
/*******************************************************************************
|
||||
Copyright (c) 2024, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include "common.h"
|
||||
|
||||
/*
 * Packing routine (zsymm_ucopy, unroll 16): gathers pairs of FLOATs from `a`
 * into the contiguous buffer `b`.
 *
 *   m, n        number of packed rows / number of packed columns
 *   a, lda      source matrix and its leading dimension, counted in pairs
 *               (lda is doubled internally to count single FLOATs)
 *   posX, posY  position of the first packed column / row inside `a`
 *   b           destination; 2 * m * n FLOATs are written
 *
 * The n columns are processed in panels of 16, then one optional panel each of
 * 8, 4, 2 and 1 (selected by the low bits of n).  For every panel, and for
 * each of the m rows, one pair is taken from each of the panel's source
 * pointers and stored side by side in b.
 *
 * Column k of a panel starts at  a + posY * 2 + (posX + k) * lda  and walks
 * with stride 2 while (offset > -k); otherwise it starts at
 * a + (posX + k) * 2 + posY * lda  and walks with stride lda.  `offset` begins
 * as posX - posY and drops by one per row, so a column can switch stride
 * part-way through the panel.
 * NOTE(review): presumably each pair is the (real, imag) part of one complex
 * element and the switch happens where the walk crosses the diagonal of a
 * symmetric matrix of which only one triangle is stored -- confirm with the
 * callers.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){

  FLOAT *src[16];   /* one read cursor per column of the current panel */
  FLOAT  pair[32];  /* staging area: one row of the panel, 2 FLOATs per column */

  BLASLONG width, panels, row, k, offset;

  lda *= 2;

  for (width = 16; width > 0; width >>= 1) {

    /* Full 16-wide panels come from n / 16; narrower ones appear at most once. */
    panels = (width == 16) ? (n >> 4) : ((n & width) ? 1 : 0);

    for (; panels > 0; panels--) {

      offset = posX - posY;

      /* Choose the starting address of every column in this panel. */
      for (k = 0; k < width; k++) {
        if (offset > -k) {
          src[k] = a + posY * 2 + (posX + k) * lda;
        } else {
          src[k] = a + (posX + k) * 2 + posY * lda;
        }
      }

      for (row = m; row > 0; row--) {

        /* Load the whole row first, stepping each cursor as it is read ... */
        for (k = 0; k < width; k++) {
          pair[2 * k + 0] = *(src[k] + 0);
          pair[2 * k + 1] = *(src[k] + 1);

          src[k] += (offset > -k) ? 2 : lda;
        }

        /* ... and only then store it, so all reads precede all writes. */
        for (k = 0; k < 2 * width; k++) {
          b[k] = pair[k];
        }

        b += 2 * width;
        offset--;
      }

      posX += width;
    }
  }

  return 0;
}
|
||||
2310
kernel/generic/ztrmm_lncopy_16.c
Normal file
2310
kernel/generic/ztrmm_lncopy_16.c
Normal file
File diff suppressed because it is too large
Load Diff
2313
kernel/generic/ztrmm_ltcopy_16.c
Normal file
2313
kernel/generic/ztrmm_ltcopy_16.c
Normal file
File diff suppressed because it is too large
Load Diff
2316
kernel/generic/ztrmm_uncopy_16.c
Normal file
2316
kernel/generic/ztrmm_uncopy_16.c
Normal file
File diff suppressed because it is too large
Load Diff
2318
kernel/generic/ztrmm_utcopy_16.c
Normal file
2318
kernel/generic/ztrmm_utcopy_16.c
Normal file
File diff suppressed because it is too large
Load Diff
308
kernel/generic/ztrsm_lncopy_16.c
Normal file
308
kernel/generic/ztrsm_lncopy_16.c
Normal file
@@ -0,0 +1,308 @@
|
||||
/*******************************************************************************
|
||||
Copyright (c) 2024, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include "common.h"
|
||||
|
||||
/*
 * Copies an m-row by n-column region of `a` into the contiguous buffer `b`,
 * one column panel at a time.
 *
 * Columns are consumed in panels of 16 (n >> 4 of them), followed by at most
 * one panel each of width 8, 4, 2 and 1, selected by the low bits of n.
 * Every element occupies two consecutive FLOATs, which is why lda is doubled
 * before use (presumably the real and imaginary parts - confirm).
 *
 * For each row of a panel, 2 * width FLOATs of `b` are reserved.  With
 * d = row - jj, where jj starts at `offset` and grows by the panel width:
 *   d >= width      : all `width` elements of the row are copied;
 *   0 <= d < width  : elements 0 .. d-1 are copied and element d is handed
 *                     to compinv(); the slots after d are not written;
 *   d < 0           : nothing is written for that row.
 *
 * Consecutive elements of a row are lda FLOATs apart in `a`, consecutive
 * rows are 2 FLOATs apart.
 *
 * NOTE(review): compinv() is defined outside this file; it presumably stores
 * the inverse of the diagonal element at the given destination - confirm.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){

  BLASLONG width, panels, row, col, diag;
  BLASLONG jj = offset;

  FLOAT *src;
  FLOAT d0, d1;

  lda *= 2;

  for (width = 16; width >= 1; width >>= 1) {

    /* Only the widest panel can repeat; narrower ones occur at most once. */
    panels = (width == 16) ? (n >> 4) : ((n & width) ? 1 : 0);

    for (; panels > 0; panels --) {

      for (row = 0; row < m; row ++) {

        src  = a + row * 2;   /* element (row, first column of this panel) */
        diag = row - jj;      /* position of the diagonal inside this row  */

        if (diag >= width) {
          /* Diagonal lies beyond the panel: copy the whole row. */
          for (col = 0; col < width; col ++) {
            b[col * 2 + 0] = src[col * lda + 0];
            b[col * 2 + 1] = src[col * lda + 1];
          }
        } else if (diag >= 0) {
          /* Diagonal falls inside the panel: copy what precedes it ... */
          for (col = 0; col < diag; col ++) {
            b[col * 2 + 0] = src[col * lda + 0];
            b[col * 2 + 1] = src[col * lda + 1];
          }

          /* ... and let compinv() produce the diagonal entry. */
          d0 = src[diag * lda + 0];
          d1 = src[diag * lda + 1];

          compinv(b + diag * 2, d0, d1);
        }

        b += width * 2;
      }

      a  += width * lda;
      jj += width;
    }
  }

  return 0;
}
|
||||
264
kernel/generic/ztrsm_ltcopy_16.c
Normal file
264
kernel/generic/ztrsm_ltcopy_16.c
Normal file
@@ -0,0 +1,264 @@
|
||||
/*******************************************************************************
|
||||
Copyright (c) 2024, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include "common.h"
|
||||
|
||||
/*
 * Copies an m-row by n-column region of `a` into the contiguous buffer `b`,
 * one column panel at a time.
 *
 * Columns are consumed in panels of 16 (n >> 4 of them), followed by at most
 * one panel each of width 8, 4, 2 and 1, selected by the low bits of n.
 * Every element occupies two consecutive FLOATs, which is why lda is doubled
 * before use (presumably the real and imaginary parts - confirm).
 *
 * Here the elements of one row are adjacent in `a` (2 FLOATs apart) and
 * successive rows are lda FLOATs apart.
 *
 * For each row of a panel, 2 * width FLOATs of `b` are reserved.  With
 * d = row - jj, where jj starts at `offset` and grows by the panel width:
 *   d < 0           : all `width` elements of the row are copied;
 *   0 <= d < width  : element d is handed to compinv() and elements
 *                     d+1 .. width-1 are copied; the slots before d are
 *                     not written;
 *   d >= width      : nothing is written for that row.
 *
 * NOTE(review): compinv() is defined outside this file; it presumably stores
 * the inverse of the diagonal element at the given destination - confirm.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){

  BLASLONG width, panels, row, col, diag;
  BLASLONG jj = offset;

  FLOAT *src;
  FLOAT d0, d1;

  lda *= 2;

  for (width = 16; width >= 1; width >>= 1) {

    /* Only the widest panel can repeat; narrower ones occur at most once. */
    panels = (width == 16) ? (n >> 4) : ((n & width) ? 1 : 0);

    for (; panels > 0; panels --) {

      src = a;

      for (row = 0; row < m; row ++) {

        diag = row - jj;      /* position of the diagonal inside this row */

        if (diag < 0) {
          /* Diagonal not reached yet: copy the whole row verbatim. */
          for (col = 0; col < width * 2; col ++) {
            b[col] = src[col];
          }
        } else if (diag < width) {
          /* Diagonal falls inside the panel: compinv() first ... */
          d0 = src[diag * 2 + 0];
          d1 = src[diag * 2 + 1];

          compinv(b + diag * 2, d0, d1);

          /* ... then copy everything that follows it. */
          for (col = diag + 1; col < width; col ++) {
            b[col * 2 + 0] = src[col * 2 + 0];
            b[col * 2 + 1] = src[col * 2 + 1];
          }
        }

        b   += width * 2;
        src += lda;
      }

      a  += width * 2;
      jj += width;
    }
  }

  return 0;
}
|
||||
313
kernel/generic/ztrsm_uncopy_16.c
Normal file
313
kernel/generic/ztrsm_uncopy_16.c
Normal file
@@ -0,0 +1,313 @@
|
||||
/*******************************************************************************
|
||||
Copyright (c) 2024, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include "common.h"
|
||||
|
||||
/*
 * Copies an m-row by n-column region of `a` into the contiguous buffer `b`,
 * one column panel at a time.
 *
 * Columns are consumed in panels of 16 (n >> 4 of them), followed by at most
 * one panel each of width 8, 4, 2 and 1, selected by the low bits of n.
 * Every element occupies two consecutive FLOATs, which is why lda is doubled
 * before use (presumably the real and imaginary parts - confirm).
 *
 * Consecutive elements of a row are lda FLOATs apart in `a`, consecutive
 * rows are 2 FLOATs apart.
 *
 * For each row of a panel, 2 * width FLOATs of `b` are reserved.  With
 * d = row - jj, where jj starts at `offset` and grows by the panel width:
 *   d < 0           : all `width` elements of the row are copied;
 *   0 <= d < width  : element d is handed to compinv() and elements
 *                     d+1 .. width-1 are copied; the slots before d are
 *                     not written;
 *   d >= width      : nothing is written for that row.
 *
 * NOTE(review): compinv() is defined outside this file; it presumably stores
 * the inverse of the diagonal element at the given destination - confirm.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){

  BLASLONG width, panels, row, col, diag;
  BLASLONG jj = offset;

  FLOAT *src;
  FLOAT d0, d1;

  lda *= 2;

  for (width = 16; width >= 1; width >>= 1) {

    /* Only the widest panel can repeat; narrower ones occur at most once. */
    panels = (width == 16) ? (n >> 4) : ((n & width) ? 1 : 0);

    for (; panels > 0; panels --) {

      for (row = 0; row < m; row ++) {

        src  = a + row * 2;   /* element (row, first column of this panel) */
        diag = row - jj;      /* position of the diagonal inside this row  */

        if (diag < 0) {
          /* Diagonal not reached yet: copy the whole row. */
          for (col = 0; col < width; col ++) {
            b[col * 2 + 0] = src[col * lda + 0];
            b[col * 2 + 1] = src[col * lda + 1];
          }
        } else if (diag < width) {
          /* Diagonal falls inside the panel: compinv() first ... */
          d0 = src[diag * lda + 0];
          d1 = src[diag * lda + 1];

          compinv(b + diag * 2, d0, d1);

          /* ... then copy everything that follows it. */
          for (col = diag + 1; col < width; col ++) {
            b[col * 2 + 0] = src[col * lda + 0];
            b[col * 2 + 1] = src[col * lda + 1];
          }
        }

        b += width * 2;
      }

      a  += width * lda;
      jj += width;
    }
  }

  return 0;
}
|
||||
261
kernel/generic/ztrsm_utcopy_16.c
Normal file
261
kernel/generic/ztrsm_utcopy_16.c
Normal file
@@ -0,0 +1,261 @@
|
||||
/*******************************************************************************
|
||||
Copyright (c) 2024, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
|
||||
#include <stdio.h>
|
||||
#include "common.h"
|
||||
|
||||
/*
 * Copies an m-row by n-column region of `a` into the contiguous buffer `b`,
 * one column panel at a time.
 *
 * Columns are consumed in panels of 16 (n >> 4 of them), followed by at most
 * one panel each of width 8, 4, 2 and 1, selected by the low bits of n.
 * Every element occupies two consecutive FLOATs, which is why lda is doubled
 * before use (presumably the real and imaginary parts - confirm).
 *
 * Here the elements of one row are adjacent in `a` (2 FLOATs apart) and
 * successive rows are lda FLOATs apart.
 *
 * For each row of a panel, 2 * width FLOATs of `b` are reserved.  With
 * d = row - jj, where jj starts at `offset` and grows by the panel width:
 *   d >= width      : all `width` elements of the row are copied;
 *   0 <= d < width  : elements 0 .. d-1 are copied and element d is handed
 *                     to compinv(); the slots after d are not written;
 *   d < 0           : nothing is written for that row.
 *
 * NOTE(review): compinv() is defined outside this file; it presumably stores
 * the inverse of the diagonal element at the given destination - confirm.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){

  BLASLONG width, panels, row, col, diag;
  BLASLONG jj = offset;

  FLOAT *src;
  FLOAT d0, d1;

  lda *= 2;

  for (width = 16; width >= 1; width >>= 1) {

    /* Only the widest panel can repeat; narrower ones occur at most once. */
    panels = (width == 16) ? (n >> 4) : ((n & width) ? 1 : 0);

    for (; panels > 0; panels --) {

      src = a;

      for (row = 0; row < m; row ++) {

        diag = row - jj;      /* position of the diagonal inside this row */

        if (diag >= width) {
          /* Diagonal lies beyond the panel: copy the whole row verbatim. */
          for (col = 0; col < width * 2; col ++) {
            b[col] = src[col];
          }
        } else if (diag >= 0) {
          /* Diagonal falls inside the panel: copy what precedes it ... */
          for (col = 0; col < diag; col ++) {
            b[col * 2 + 0] = src[col * 2 + 0];
            b[col * 2 + 1] = src[col * 2 + 1];
          }

          /* ... and let compinv() produce the diagonal entry. */
          d0 = src[diag * 2 + 0];
          d1 = src[diag * 2 + 1];

          compinv(b + diag * 2, d0, d1);
        }

        b   += width * 2;
        src += lda;
      }

      a  += width * 2;
      jj += width;
    }
  }

  return 0;
}
|
||||
@@ -111,12 +111,19 @@ SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMVNKERNEL = sgemv_n_8_lasx.S
|
||||
SGEMVTKERNEL = sgemv_t_8_lasx.S
|
||||
|
||||
CGEMMKERNEL = cgemm_kernel_2x2_lsx.S
|
||||
CGEMMONCOPY = cgemm_ncopy_2_lsx.S
|
||||
CGEMMOTCOPY = cgemm_tcopy_2_lsx.S
|
||||
CGEMMKERNEL = cgemm_kernel_16x4_lasx.S
|
||||
CGEMMINCOPY = cgemm_ncopy_16_lasx.S
|
||||
CGEMMITCOPY = cgemm_tcopy_16_lasx.S
|
||||
CGEMMONCOPY = cgemm_ncopy_4_lasx.S
|
||||
CGEMMOTCOPY = cgemm_tcopy_4_lasx.S
|
||||
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
CGEMVNKERNEL = cgemv_n_8_lasx.S
|
||||
CGEMVTKERNEL = cgemv_t_8_lasx.S
|
||||
|
||||
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
@@ -132,6 +139,9 @@ ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
ZGEMVNKERNEL = zgemv_n_4_lasx.S
|
||||
ZGEMVTKERNEL = zgemv_t_4_lasx.S
|
||||
|
||||
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
|
||||
3757
kernel/loongarch64/cgemm_kernel_16x4_lasx.S
Normal file
3757
kernel/loongarch64/cgemm_kernel_16x4_lasx.S
Normal file
File diff suppressed because it is too large
Load Diff
691
kernel/loongarch64/cgemm_ncopy_16_lasx.S
Normal file
691
kernel/loongarch64/cgemm_ncopy_16_lasx.S
Normal file
@@ -0,0 +1,691 @@
|
||||
/*******************************************************************************
|
||||
Copyright (c) 2024, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
#define ASSEMBLER
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/* Function parameters */
|
||||
#define M $r4 // param 1: m
|
||||
#define N $r5 // param 2: n
|
||||
#define SRC $r6 // param 3: src
|
||||
#define LDA $r7 // param 4: lda
|
||||
#define DST $r8 // param 5: dst
|
||||
|
||||
#define I $r9
|
||||
#define J $r10
|
||||
#define S1 $r12
|
||||
#define S2 $r13
|
||||
#define S3 $r14
|
||||
#define S4 $r15
|
||||
#define S5 $r16
|
||||
#define S6 $r17
|
||||
#define S7 $r18
|
||||
#define S8 $r19
|
||||
#define S9 $r20
|
||||
#define S10 $r23
|
||||
#define S11 $r24
|
||||
#define S12 $r25
|
||||
#define S13 $r26
|
||||
#define S14 $r27
|
||||
#define S15 $r28
|
||||
#define S16 $r29
|
||||
#define TD $r30
|
||||
#define TS $r31
|
||||
#define TL $r7
|
||||
#define T0 $r6
|
||||
#define ZERO $r0
|
||||
|
||||
#define F0 $f0
|
||||
#define F1 $f1
|
||||
#define F2 $f2
|
||||
#define F3 $f3
|
||||
#define F4 $f4
|
||||
#define F5 $f5
|
||||
#define F6 $f6
|
||||
#define F7 $f7
|
||||
/* LASX vectors */
|
||||
#define U0 $xr0
|
||||
#define U1 $xr1
|
||||
#define U2 $xr2
|
||||
#define U3 $xr3
|
||||
#define U4 $xr4
|
||||
#define U5 $xr5
|
||||
#define U6 $xr6
|
||||
#define U7 $xr7
|
||||
#define U8 $xr8
|
||||
#define U9 $xr9
|
||||
#define U10 $xr10
|
||||
#define U11 $xr11
|
||||
#define U12 $xr12
|
||||
#define U13 $xr13
|
||||
#define U14 $xr14
|
||||
#define U15 $xr15
|
||||
#define D0 $xr16
|
||||
#define D1 $xr17
|
||||
#define D2 $xr18
|
||||
#define D3 $xr19
|
||||
#define D4 $xr20
|
||||
#define D5 $xr21
|
||||
#define D6 $xr22
|
||||
#define D7 $xr23
|
||||
#define D8 $xr24
|
||||
#define D9 $xr25
|
||||
#define D10 $xr26
|
||||
#define D11 $xr27
|
||||
#define D12 $xr28
|
||||
#define D13 $xr29
|
||||
#define D14 $xr30
|
||||
#define D15 $xr31
|
||||
|
||||
/*
 * Entry of the 16-column packing routine. Arguments arrive in M, N, SRC, LDA
 * and DST as named by the defines above. Source columns are consumed in
 * groups of 16, then 8, 4, 2 and 1; elements are moved as 8-byte units.
 */
PROLOGUE
|
||||
|
||||
// Reserve a 0x90-byte frame: nine 8-byte GPR slots (0x00-0x40) followed by
// nine 8-byte FP slots (0x48-0x88).
addi.d $sp, $sp, -0x90
|
||||
// $r23-$r31 are used below as S10-S16, TD and TS.
SDARG $r23, $sp, 0x00
|
||||
SDARG $r24, $sp, 0x08
|
||||
SDARG $r25, $sp, 0x10
|
||||
SDARG $r26, $sp, 0x18
|
||||
SDARG $r27, $sp, 0x20
|
||||
SDARG $r28, $sp, 0x28
|
||||
SDARG $r29, $sp, 0x30
|
||||
SDARG $r30, $sp, 0x38
|
||||
SDARG $r31, $sp, 0x40
|
||||
// $xr23-$xr31 (D7-D15) are written by the 16-column loop below.
// NOTE(review): ST is a macro from common.h (not visible here). The slots are
// 8 bytes apart; if ST expands to a 4-byte store in this single-precision
// build, only the low half of each FP register would be preserved -- confirm,
// and consider an explicit fst.d here (with fld.d in the epilogue).
ST $f23, $sp, 0x48
|
||||
ST $f24, $sp, 0x50
|
||||
ST $f25, $sp, 0x58
|
||||
ST $f26, $sp, 0x60
|
||||
ST $f27, $sp, 0x68
|
||||
ST $f28, $sp, 0x70
|
||||
ST $f29, $sp, 0x78
|
||||
ST $f30, $sp, 0x80
|
||||
ST $f31, $sp, 0x88
|
||||
|
||||
// Working copies of the destination and source pointers. SRC must be copied
// before T0 is written because T0 is defined as the same register as SRC.
move TD, DST
|
||||
move TS, SRC
|
||||
// TL = LDA * 8 (TL is the same register as LDA, so LDA is scaled in place);
// presumably the byte distance between consecutive columns -- TODO confirm
// lda is counted in 8-byte elements. T0 = 2 * TL.
slli.d TL, LDA, 0x03
|
||||
slli.d T0, TL, 0x01
|
||||
// J = N / 16: number of full 16-column groups; skip to the 8-column tail
// when there are none.
srai.d J, N, 0x04
|
||||
beq J, ZERO, .L_N8
|
||||
|
||||
// Start of one 16-column group. S1..S16 are set to TS + 0*TL .. TS + 15*TL:
// S2-S4 step by TL, the rest are derived two columns apart using T0 (= 2*TL).
// The counter updates for I and J are interleaved with the pointer setup.
.L_J1: /* J-- */
|
||||
move S1, TS
|
||||
add.d S2, TS, TL
|
||||
// I = M / 8: number of 8-row blocks handled by the vector loop.
srai.d I, M, 0x03
|
||||
add.d S3, S2, TL
|
||||
// One 16-column group consumed; J is tested again at .L_I0.
addi.d J, J, -1
|
||||
add.d S4, S3, TL
|
||||
add.d S5, S3, T0
|
||||
add.d S6, S4, T0
|
||||
add.d S7, S5, T0
|
||||
add.d S8, S6, T0
|
||||
add.d S9, S7, T0
|
||||
add.d S10, S8, T0
|
||||
add.d S11, S9, T0
|
||||
add.d S12, S10, T0
|
||||
add.d S13, S11, T0
|
||||
add.d S14, S12, T0
|
||||
add.d S15, S13, T0
|
||||
add.d S16, S14, T0
|
||||
// TS = S15 + 2*TL = old TS + 16*TL: first column of the next group.
add.d TS, S15, T0
|
||||
// Fewer than 8 rows: go straight to the one-row-at-a-time tail.
beq I, ZERO, .L_I7
|
||||
|
||||
.L_I1: /* I-- */
|
||||
xvld U0, S1, 0x00
|
||||
xvld U1, S2, 0x00
|
||||
xvld U2, S3, 0x00
|
||||
xvld U3, S4, 0x00
|
||||
xvld U4, S5, 0x00
|
||||
xvld U5, S6, 0x00
|
||||
xvld U6, S7, 0x00
|
||||
xvld U7, S8, 0x00
|
||||
xvld U8, S9, 0x00
|
||||
xvld U9, S10, 0x00
|
||||
xvld U10, S11, 0x00
|
||||
xvld U11, S12, 0x00
|
||||
xvld U12, S13, 0x00
|
||||
xvld U13, S14, 0x00
|
||||
xvld U14, S15, 0x00
|
||||
xvld U15, S16, 0x00
|
||||
|
||||
xvpackev.d D0, U1, U0
|
||||
xvpackod.d D1, U1, U0
|
||||
xvpackev.d D2, U3, U2
|
||||
xvpackod.d D3, U3, U2
|
||||
xvpackev.d D4, U5, U4
|
||||
xvpackod.d D5, U5, U4
|
||||
xvpackev.d D6, U7, U6
|
||||
xvpackod.d D7, U7, U6
|
||||
|
||||
xvpackev.d D8, U9, U8
|
||||
xvpackod.d D9, U9, U8
|
||||
xvpackev.d D10, U11, U10
|
||||
xvpackod.d D11, U11, U10
|
||||
xvpackev.d D12, U13, U12
|
||||
xvpackod.d D13, U13, U12
|
||||
xvpackev.d D14, U15, U14
|
||||
xvpackod.d D15, U15, U14
|
||||
|
||||
xvand.v U0, D0, D0
|
||||
xvpermi.q D0, D2, 0x02 // 0
|
||||
xvand.v U4, D4, D4
|
||||
xvpermi.q D4, D6, 0x02 // 1
|
||||
xvand.v U1, D1, D1
|
||||
xvpermi.q D1, D3, 0x02 // 4
|
||||
xvand.v U5, D5, D5
|
||||
xvpermi.q D5, D7, 0x02 // 5
|
||||
xvpermi.q D2, U0, 0x31 // 8
|
||||
xvpermi.q D6, U4, 0x31 // 9
|
||||
xvpermi.q D3, U1, 0x31 // 12
|
||||
xvpermi.q D7, U5, 0x31 // 13
|
||||
|
||||
xvand.v U8, D8, D8
|
||||
xvpermi.q D8, D10, 0x02 // 2
|
||||
xvand.v U12, D12, D12
|
||||
xvpermi.q D12, D14, 0x02 // 3
|
||||
xvand.v U9, D9, D9
|
||||
xvpermi.q D9, D11, 0x02 // 6
|
||||
xvand.v U13, D13, D13
|
||||
xvpermi.q D13, D15, 0x02 // 7
|
||||
xvpermi.q D10, U8, 0x31 // 10
|
||||
xvpermi.q D14, U12, 0x31 // 11
|
||||
xvpermi.q D11, U9, 0x31 // 14
|
||||
xvpermi.q D15, U13, 0x31 // 15
|
||||
|
||||
xvst D0, TD, 0x00 // 0
|
||||
xvst D4, TD, 0x20 // 1
|
||||
xvst D8, TD, 0x40 // 2
|
||||
xvst D12, TD, 0x60 // 3
|
||||
xvst D1, TD, 0x80 // 4
|
||||
xvst D5, TD, 0xA0 // 5
|
||||
xvst D9, TD, 0xC0 // 6
|
||||
xvst D13, TD, 0xE0 // 7
|
||||
addi.d TD, TD, 0x100
|
||||
xvst D2, TD, 0x00 // 8
|
||||
xvst D6, TD, 0x20 // 9
|
||||
xvst D10, TD, 0x40 // 10
|
||||
xvst D14, TD, 0x60 // 11
|
||||
xvst D3, TD, 0x80 // 12
|
||||
xvst D7, TD, 0xA0 // 13
|
||||
xvst D11, TD, 0xC0 // 14
|
||||
xvst D15, TD, 0xE0 // 15
|
||||
addi.d TD, TD, 0x100
|
||||
|
||||
xvld U0, S1, 0x20
|
||||
xvld U1, S2, 0x20
|
||||
xvld U2, S3, 0x20
|
||||
xvld U3, S4, 0x20
|
||||
xvld U4, S5, 0x20
|
||||
xvld U5, S6, 0x20
|
||||
xvld U6, S7, 0x20
|
||||
xvld U7, S8, 0x20
|
||||
xvld U8, S9, 0x20
|
||||
xvld U9, S10, 0x20
|
||||
xvld U10, S11, 0x20
|
||||
xvld U11, S12, 0x20
|
||||
xvld U12, S13, 0x20
|
||||
xvld U13, S14, 0x20
|
||||
xvld U14, S15, 0x20
|
||||
xvld U15, S16, 0x20
|
||||
|
||||
xvpackev.d D0, U1, U0
|
||||
xvpackod.d D1, U1, U0
|
||||
xvpackev.d D2, U3, U2
|
||||
xvpackod.d D3, U3, U2
|
||||
xvpackev.d D4, U5, U4
|
||||
xvpackod.d D5, U5, U4
|
||||
xvpackev.d D6, U7, U6
|
||||
xvpackod.d D7, U7, U6
|
||||
|
||||
xvpackev.d D8, U9, U8
|
||||
xvpackod.d D9, U9, U8
|
||||
xvpackev.d D10, U11, U10
|
||||
xvpackod.d D11, U11, U10
|
||||
xvpackev.d D12, U13, U12
|
||||
xvpackod.d D13, U13, U12
|
||||
xvpackev.d D14, U15, U14
|
||||
xvpackod.d D15, U15, U14
|
||||
|
||||
xvand.v U0, D0, D0
|
||||
xvpermi.q D0, D2, 0x02 // 0
|
||||
xvand.v U4, D4, D4
|
||||
xvpermi.q D4, D6, 0x02 // 1
|
||||
xvand.v U1, D1, D1
|
||||
xvpermi.q D1, D3, 0x02 // 4
|
||||
xvand.v U5, D5, D5
|
||||
xvpermi.q D5, D7, 0x02 // 5
|
||||
xvpermi.q D2, U0, 0x31 // 8
|
||||
xvpermi.q D6, U4, 0x31 // 9
|
||||
xvpermi.q D3, U1, 0x31 // 12
|
||||
xvpermi.q D7, U5, 0x31 // 13
|
||||
|
||||
xvand.v U8, D8, D8
|
||||
xvpermi.q D8, D10, 0x02 // 2
|
||||
xvand.v U12, D12, D12
|
||||
xvpermi.q D12, D14, 0x02 // 3
|
||||
xvand.v U9, D9, D9
|
||||
xvpermi.q D9, D11, 0x02 // 6
|
||||
xvand.v U13, D13, D13
|
||||
xvpermi.q D13, D15, 0x02 // 7
|
||||
xvpermi.q D10, U8, 0x31 // 10
|
||||
xvpermi.q D14, U12, 0x31 // 11
|
||||
xvpermi.q D11, U9, 0x31 // 14
|
||||
xvpermi.q D15, U13, 0x31 // 15
|
||||
|
||||
xvst D0, TD, 0x00 // 0
|
||||
xvst D4, TD, 0x20 // 1
|
||||
xvst D8, TD, 0x40 // 2
|
||||
xvst D12, TD, 0x60 // 3
|
||||
xvst D1, TD, 0x80 // 4
|
||||
xvst D5, TD, 0xA0 // 5
|
||||
xvst D9, TD, 0xC0 // 6
|
||||
xvst D13, TD, 0xE0 // 7
|
||||
addi.d TD, TD, 0x100
|
||||
xvst D2, TD, 0x00 // 8
|
||||
xvst D6, TD, 0x20 // 9
|
||||
xvst D10, TD, 0x40 // 10
|
||||
xvst D14, TD, 0x60 // 11
|
||||
xvst D3, TD, 0x80 // 12
|
||||
xvst D7, TD, 0xA0 // 13
|
||||
xvst D11, TD, 0xC0 // 14
|
||||
xvst D15, TD, 0xE0 // 15
|
||||
addi.d TD, TD, 0x100
|
||||
|
||||
|
||||
addi.d S1, S1, 0x40
|
||||
addi.d S2, S2, 0x40
|
||||
addi.d S3, S3, 0x40
|
||||
addi.d S4, S4, 0x40
|
||||
addi.d S5, S5, 0x40
|
||||
addi.d S6, S6, 0x40
|
||||
addi.d S7, S7, 0x40
|
||||
addi.d S8, S8, 0x40
|
||||
addi.d S9, S9, 0x40
|
||||
addi.d S10, S10, 0x40
|
||||
addi.d S11, S11, 0x40
|
||||
addi.d S12, S12, 0x40
|
||||
addi.d S13, S13, 0x40
|
||||
addi.d S14, S14, 0x40
|
||||
addi.d S15, S15, 0x40
|
||||
addi.d S16, S16, 0x40
|
||||
|
||||
addi.d I, I, -1
|
||||
blt ZERO, I, .L_I1
|
||||
|
||||
.L_I7:
|
||||
andi I, M, 0x07
|
||||
beq I, ZERO, .L_I0
|
||||
|
||||
.L_II1: /* I-- */
|
||||
fld.d F0, S1, 0x00
|
||||
fld.d F1, S2, 0x00
|
||||
fld.d F2, S3, 0x00
|
||||
fld.d F3, S4, 0x00
|
||||
fld.d F4, S5, 0x00
|
||||
fld.d F5, S6, 0x00
|
||||
fld.d F6, S7, 0x00
|
||||
fld.d F7, S8, 0x00
|
||||
|
||||
fst.d F0, TD, 0x00
|
||||
addi.d S1, S1, 0x08
|
||||
fst.d F1, TD, 0x08
|
||||
addi.d S2, S2, 0x08
|
||||
fst.d F2, TD, 0x10
|
||||
addi.d S3, S3, 0x08
|
||||
fst.d F3, TD, 0x18
|
||||
addi.d S4, S4, 0x08
|
||||
fst.d F4, TD, 0x20
|
||||
addi.d S5, S5, 0x08
|
||||
fst.d F5, TD, 0x28
|
||||
addi.d S6, S6, 0x08
|
||||
fst.d F6, TD, 0x30
|
||||
addi.d S7, S7, 0x08
|
||||
fst.d F7, TD, 0x38
|
||||
addi.d S8, S8, 0x08
|
||||
addi.d TD, TD, 0x40
|
||||
|
||||
fld.d F0, S9, 0x00
|
||||
fld.d F1, S10, 0x00
|
||||
fld.d F2, S11, 0x00
|
||||
fld.d F3, S12, 0x00
|
||||
fld.d F4, S13, 0x00
|
||||
fld.d F5, S14, 0x00
|
||||
fld.d F6, S15, 0x00
|
||||
fld.d F7, S16, 0x00
|
||||
|
||||
fst.d F0, TD, 0x00
|
||||
addi.d S9, S9, 0x08
|
||||
fst.d F1, TD, 0x08
|
||||
addi.d S10, S10, 0x08
|
||||
fst.d F2, TD, 0x10
|
||||
addi.d S11, S11, 0x08
|
||||
fst.d F3, TD, 0x18
|
||||
addi.d S12, S12, 0x08
|
||||
fst.d F4, TD, 0x20
|
||||
addi.d S13, S13, 0x08
|
||||
fst.d F5, TD, 0x28
|
||||
addi.d S14, S14, 0x08
|
||||
fst.d F6, TD, 0x30
|
||||
addi.d S15, S15, 0x08
|
||||
fst.d F7, TD, 0x38
|
||||
addi.d S16, S16, 0x08
|
||||
addi.d TD, TD, 0x40
|
||||
|
||||
addi.d I, I, -1
|
||||
blt ZERO, I, .L_II1
|
||||
|
||||
.L_I0:
|
||||
blt ZERO, J, .L_J1
|
||||
|
||||
.L_N8:
|
||||
andi J, N, 0x08
|
||||
beq ZERO, J, .L_N4
|
||||
|
||||
move S1, TS
|
||||
add.d S2, TS, TL
|
||||
srai.d I, M, 0x03
|
||||
add.d S3, S2, TL
|
||||
add.d S4, S2, T0
|
||||
add.d S5, S3, T0
|
||||
add.d S6, S4, T0
|
||||
add.d S7, S5, T0
|
||||
add.d S8, S6, T0
|
||||
add.d TS, S7, T0
|
||||
beq I, ZERO, .L_8I3
|
||||
|
||||
.L_8I1: /* I-- */
|
||||
xvld U0, S1, 0x00
|
||||
xvld U1, S2, 0x00
|
||||
xvld U2, S3, 0x00
|
||||
xvld U3, S4, 0x00
|
||||
xvld U4, S5, 0x00
|
||||
xvld U5, S6, 0x00
|
||||
xvld U6, S7, 0x00
|
||||
xvld U7, S8, 0x00
|
||||
|
||||
xvpackev.d D0, U1, U0
|
||||
xvpackod.d D1, U1, U0
|
||||
xvpackev.d D2, U3, U2
|
||||
xvpackod.d D3, U3, U2
|
||||
xvpackev.d D4, U5, U4
|
||||
xvpackod.d D5, U5, U4
|
||||
xvpackev.d D6, U7, U6
|
||||
xvpackod.d D7, U7, U6
|
||||
|
||||
xvand.v U0, D0, D0
|
||||
xvpermi.q D0, D2, 0x02 // 0
|
||||
xvand.v U4, D4, D4
|
||||
xvpermi.q D4, D6, 0x02 // 1
|
||||
xvand.v U1, D1, D1
|
||||
xvpermi.q D1, D3, 0x02 // 2
|
||||
xvand.v U5, D5, D5
|
||||
xvpermi.q D5, D7, 0x02 // 3
|
||||
xvpermi.q D2, U0, 0x31 // 4
|
||||
xvpermi.q D6, U4, 0x31 // 5
|
||||
xvpermi.q D3, U1, 0x31 // 6
|
||||
xvpermi.q D7, U5, 0x31 // 7
|
||||
|
||||
xvst D0, TD, 0x00
|
||||
xvst D4, TD, 0x20
|
||||
xvst D1, TD, 0x40
|
||||
xvst D5, TD, 0x60
|
||||
xvst D2, TD, 0x80
|
||||
xvst D6, TD, 0xA0
|
||||
xvst D3, TD, 0xC0
|
||||
xvst D7, TD, 0xE0
|
||||
addi.d TD, TD, 0x100
|
||||
|
||||
xvld U0, S1, 0x20
|
||||
xvld U1, S2, 0x20
|
||||
xvld U2, S3, 0x20
|
||||
xvld U3, S4, 0x20
|
||||
xvld U4, S5, 0x20
|
||||
xvld U5, S6, 0x20
|
||||
xvld U6, S7, 0x20
|
||||
xvld U7, S8, 0x20
|
||||
|
||||
xvpackev.d D0, U1, U0
|
||||
xvpackod.d D1, U1, U0
|
||||
xvpackev.d D2, U3, U2
|
||||
xvpackod.d D3, U3, U2
|
||||
xvpackev.d D4, U5, U4
|
||||
xvpackod.d D5, U5, U4
|
||||
xvpackev.d D6, U7, U6
|
||||
xvpackod.d D7, U7, U6
|
||||
|
||||
xvand.v U0, D0, D0
|
||||
xvpermi.q D0, D2, 0x02 // 0
|
||||
xvand.v U4, D4, D4
|
||||
xvpermi.q D4, D6, 0x02 // 1
|
||||
xvand.v U1, D1, D1
|
||||
xvpermi.q D1, D3, 0x02 // 2
|
||||
xvand.v U5, D5, D5
|
||||
xvpermi.q D5, D7, 0x02 // 3
|
||||
xvpermi.q D2, U0, 0x31 // 4
|
||||
xvpermi.q D6, U4, 0x31 // 5
|
||||
xvpermi.q D3, U1, 0x31 // 6
|
||||
xvpermi.q D7, U5, 0x31 // 7
|
||||
|
||||
xvst D0, TD, 0x00
|
||||
xvst D4, TD, 0x20
|
||||
xvst D1, TD, 0x40
|
||||
xvst D5, TD, 0x60
|
||||
xvst D2, TD, 0x80
|
||||
xvst D6, TD, 0xA0
|
||||
xvst D3, TD, 0xC0
|
||||
xvst D7, TD, 0xE0
|
||||
addi.d TD, TD, 0x100
|
||||
|
||||
addi.d S1, S1, 0x40
|
||||
addi.d S2, S2, 0x40
|
||||
addi.d S3, S3, 0x40
|
||||
addi.d S4, S4, 0x40
|
||||
addi.d S5, S5, 0x40
|
||||
addi.d S6, S6, 0x40
|
||||
addi.d S7, S7, 0x40
|
||||
addi.d S8, S8, 0x40
|
||||
|
||||
addi.d I, I, -1
|
||||
blt ZERO, I, .L_8I1
|
||||
|
||||
.L_8I3:
|
||||
andi I, M, 0x07
|
||||
beq I, ZERO, .L_N4
|
||||
|
||||
.L_8I11:
|
||||
fld.d F0, S1, 0x00
|
||||
fld.d F1, S2, 0x00
|
||||
fld.d F2, S3, 0x00
|
||||
fld.d F3, S4, 0x00
|
||||
fld.d F4, S5, 0x00
|
||||
fld.d F5, S6, 0x00
|
||||
fld.d F6, S7, 0x00
|
||||
fld.d F7, S8, 0x00
|
||||
|
||||
fst.d F0, TD, 0x00
|
||||
addi.d S1, S1, 0x08
|
||||
fst.d F1, TD, 0x08
|
||||
addi.d S2, S2, 0x08
|
||||
fst.d F2, TD, 0x10
|
||||
addi.d S3, S3, 0x08
|
||||
fst.d F3, TD, 0x18
|
||||
addi.d S4, S4, 0x08
|
||||
fst.d F4, TD, 0x20
|
||||
addi.d S5, S5, 0x08
|
||||
fst.d F5, TD, 0x28
|
||||
addi.d S6, S6, 0x08
|
||||
fst.d F6, TD, 0x30
|
||||
addi.d S7, S7, 0x08
|
||||
fst.d F7, TD, 0x38
|
||||
addi.d S8, S8, 0x08
|
||||
|
||||
addi.d TD, TD, 0x40
|
||||
addi.d I, I, -1
|
||||
blt ZERO, I, .L_8I11
|
||||
|
||||
.L_N4:
|
||||
andi J, N, 0x04
|
||||
beq ZERO, J, .L_N2
|
||||
|
||||
move S1, TS
|
||||
add.d S2, TS, TL
|
||||
srai.d I, M, 0x02
|
||||
add.d S3, S2, TL
|
||||
add.d S4, S2, T0
|
||||
add.d TS, S3, T0
|
||||
beq I, ZERO, .L_I3
|
||||
|
||||
// 4-column group, vector loop: each iteration consumes 32 bytes (four 8-byte
// elements) from each of S1..S4 and writes 0x80 bytes to TD.
.L_4I1: /* I-- */
|
||||
xvld U0, S1, 0x00
|
||||
xvld U1, S2, 0x00
|
||||
xvld U2, S3, 0x00
|
||||
xvld U3, S4, 0x00
|
||||
|
||||
// Interleave the even / odd 8-byte elements of neighbouring columns.
xvpackev.d D0, U1, U0
|
||||
xvpackod.d D1, U1, U0
|
||||
xvpackev.d D2, U3, U2
|
||||
xvpackod.d D3, U3, U2
|
||||
|
||||
// xvand.v with identical sources copies the register; U0/U1 keep the
// pre-permute D0/D1 because xvpermi.q overwrites its destination.
// The trailing numbers are the 32-byte output slots used by the stores below.
// NOTE(review): by the LASX definitions of xvpackev.d / xvpackod.d / xvpermi.q
// (not shown in this file) D0..D3 should end up holding source rows 0..3
// across the four columns -- confirm against the ISA manual.
xvand.v U0, D0, D0
|
||||
xvpermi.q D0, D2, 0x02 // 0
|
||||
xvand.v U1, D1, D1
|
||||
xvpermi.q D1, D3, 0x02 // 1
|
||||
xvpermi.q D2, U0, 0x31 // 2
|
||||
xvpermi.q D3, U1, 0x31 // 3
|
||||
|
||||
xvst D0, TD, 0x00
|
||||
xvst D1, TD, 0x20
|
||||
xvst D2, TD, 0x40
|
||||
xvst D3, TD, 0x60
|
||||
|
||||
// Advance every column by the 0x20 bytes read and TD by the 0x80 written.
addi.d S1, S1, 0x20
|
||||
addi.d S2, S2, 0x20
|
||||
addi.d S3, S3, 0x20
|
||||
addi.d S4, S4, 0x20
|
||||
addi.d TD, TD, 0x80
|
||||
|
||||
addi.d I, I, -1
|
||||
blt ZERO, I, .L_4I1
|
||||
|
||||
.L_I3:
|
||||
andi I, M, 0x03
|
||||
beq I, ZERO, .L_N2
|
||||
|
||||
.L_4II1:
|
||||
fld.d F0, S1, 0x00
|
||||
fld.d F1, S2, 0x00
|
||||
fld.d F2, S3, 0x00
|
||||
fld.d F3, S4, 0x00
|
||||
|
||||
fst.d F0, TD, 0x00
|
||||
addi.d S1, S1, 0x08
|
||||
fst.d F1, TD, 0x08
|
||||
addi.d S2, S2, 0x08
|
||||
fst.d F2, TD, 0x10
|
||||
addi.d S3, S3, 0x08
|
||||
fst.d F3, TD, 0x18
|
||||
addi.d S4, S4, 0x08
|
||||
|
||||
addi.d TD, TD, 0x20
|
||||
addi.d I, I, -1
|
||||
blt ZERO, I, .L_4II1
|
||||
|
||||
.L_N2:
|
||||
andi J, N, 0x02
|
||||
beq ZERO, J, .L_N1
|
||||
|
||||
move S1, TS
|
||||
add.d S2, TS, TL
|
||||
srai.d I, M, 0x01
|
||||
add.d TS, S2, TL
|
||||
beq I, ZERO, .L_NI1
|
||||
|
||||
.L_2I1: /* I-- : two rows of the two-column group per iteration */
    /*
     * Each iteration consumes 0x10 bytes (two 8-byte elements) from S1 and
     * from S2 and emits 0x20 bytes: row 0 of both columns, then row 1.
     *
     * Load exactly those 16 bytes with 128-bit LSX accesses. The previous
     * 256-bit xvld fetched 32 bytes per column while the pointers advance by
     * only 0x10, so the final iteration read 16 bytes beyond the last element
     * it needed -- past the end of the source when S2 is the last column.
     *
     * $vr0/$vr1/$vr16/$vr17 carry the same register numbers as U0/U1/D0/D1,
     * which this loop already used as scratch.
     */
    vld        $vr0,  S1,   0x00      // column S1: rows 0,1
    vld        $vr1,  S2,   0x00      // column S2: rows 0,1

    vpackev.d  $vr16, $vr1, $vr0      // even elements: row 0 of S1, S2
    vpackod.d  $vr17, $vr1, $vr0      // odd elements:  row 1 of S1, S2

    vst        $vr16, TD,   0x00
    vst        $vr17, TD,   0x10

    addi.d     S1,    S1,   0x10
    addi.d     S2,    S2,   0x10
    addi.d     TD,    TD,   0x20

    addi.d     I,     I,    -1
    blt        ZERO,  I,    .L_2I1
|
||||
|
||||
.L_NI1:
|
||||
andi I, M, 0x01
|
||||
beq I, ZERO, .L_N1
|
||||
|
||||
|
||||
fld.d F0, S1, 0x00
|
||||
fld.d F1, S2, 0x00
|
||||
|
||||
fst.d F0, TD, 0x00
|
||||
addi.d S1, S1, 0x08
|
||||
fst.d F1, TD, 0x08
|
||||
addi.d S2, S2, 0x08
|
||||
addi.d TD, TD, 0x10
|
||||
|
||||
.L_N1: /* if (n & 1): one trailing column left to copy */
    /*
     * The 8/4/2-column tails above are each gated on their bit of N; gate the
     * single-column tail on bit 0 in the same way. Without this test the copy
     * below ran for every M > 0, so an even N copied M elements from the
     * column after the panel: an out-of-range read from the source and M
     * extra elements written to the destination. J is free for reuse here.
     */
    andi       J,    N,    0x01
    beq        ZERO, J,    .L_N0

    move       S1,   TS               // TS: first column not yet copied
    beq        ZERO, M,    .L_N0      // nothing to copy when M == 0

.L_M1: /* M-- : one 8-byte element per iteration; M itself is the counter */
    fld.d      F0,   S1,   0x00
    addi.d     S1,   S1,   0x08
    fst.d      F0,   TD,   0x00
    addi.d     TD,   TD,   0x08
    addi.d     M,    M,    -1
    blt        ZERO, M,    .L_M1
|
||||
|
||||
// Common exit: reload the registers spilled at entry from the same frame
// offsets, release the 0x90-byte frame and return.
.L_N0:
|
||||
LDARG $r23, $sp, 0x00
|
||||
LDARG $r24, $sp, 0x08
|
||||
LDARG $r25, $sp, 0x10
|
||||
LDARG $r26, $sp, 0x18
|
||||
LDARG $r27, $sp, 0x20
|
||||
LDARG $r28, $sp, 0x28
|
||||
LDARG $r29, $sp, 0x30
|
||||
LDARG $r30, $sp, 0x38
|
||||
LDARG $r31, $sp, 0x40
|
||||
// NOTE(review): LD is a macro from common.h (not visible here). If it expands
// to a 4-byte load in this single-precision build, only the low half of each
// 8-byte slot is restored -- confirm, and consider an explicit fld.d here
// (with fst.d in the prologue).
LD $f23, $sp, 0x48
|
||||
LD $f24, $sp, 0x50
|
||||
LD $f25, $sp, 0x58
|
||||
LD $f26, $sp, 0x60
|
||||
LD $f27, $sp, 0x68
|
||||
LD $f28, $sp, 0x70
|
||||
LD $f29, $sp, 0x78
|
||||
LD $f30, $sp, 0x80
|
||||
LD $f31, $sp, 0x88
|
||||
addi.d $sp, $sp, 0x90
|
||||
// Jump to the address in $r1, discarding the link value into $r0.
jirl $r0, $r1, 0x00
|
||||
|
||||
EPILOGUE
|
||||
325
kernel/loongarch64/cgemm_ncopy_4_lasx.S
Normal file
325
kernel/loongarch64/cgemm_ncopy_4_lasx.S
Normal file
@@ -0,0 +1,325 @@
|
||||
/*******************************************************************************
|
||||
Copyright (c) 2024, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
#define ASSEMBLER
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/* Function parameters */
|
||||
#define M $r4 // param 1: m
|
||||
#define N $r5 // param 2: n
|
||||
#define SRC $r6 // param 3: src
|
||||
#define LDA $r7 // param 4: lda
|
||||
#define DST $r8 // param 5: dst
|
||||
|
||||
#define I $r9
|
||||
#define J $r10
|
||||
#define S1 $r12
|
||||
#define S2 $r13
|
||||
#define S3 $r14
|
||||
#define S4 $r15
|
||||
#define S5 $r16
|
||||
#define S6 $r17
|
||||
#define S7 $r18
|
||||
#define TD $r20
|
||||
#define TS $r11
|
||||
#define TL $r19
|
||||
#define T0 $r23
|
||||
#define ZERO $r0
|
||||
|
||||
#define F0 $f0
|
||||
#define F1 $f1
|
||||
#define F2 $f2
|
||||
#define F3 $f3
|
||||
#define F4 $f4
|
||||
#define F5 $f5
|
||||
#define F6 $f6
|
||||
#define F7 $f7
|
||||
|
||||
/* LASX vectors */
|
||||
#define U0 $xr0
|
||||
#define U1 $xr1
|
||||
#define U2 $xr2
|
||||
#define U3 $xr3
|
||||
#define U4 $xr4
|
||||
#define U5 $xr5
|
||||
#define U6 $xr6
|
||||
#define U7 $xr7
|
||||
#define D0 $xr8
|
||||
#define D1 $xr9
|
||||
#define D2 $xr10
|
||||
#define D3 $xr11
|
||||
#define D4 $xr12
|
||||
#define D5 $xr13
|
||||
#define D6 $xr14
|
||||
#define D7 $xr15
|
||||
#define D8 $xr16
|
||||
|
||||
/*
 * Entry of the 4-column packing routine. Arguments arrive in M, N, SRC, LDA
 * and DST as named by the defines above. Columns are consumed in groups of
 * 4, then 2, then 1.
 */
PROLOGUE
|
||||
|
||||
// One 8-byte slot to preserve $r23, which this file uses as T0.
// NOTE(review): this moves $sp by 8 only; the LoongArch ABI presumably expects
// 16-byte stack alignment -- confirm this is acceptable for a routine that
// makes no calls.
addi.d $sp, $sp, -8
|
||||
SDARG $r23, $sp, 0
|
||||
|
||||
move TD, DST //boffset
|
||||
move TS, SRC //aoffset
|
||||
|
||||
// TL = LDA << 3 (done as two shifts); presumably the byte distance between
// consecutive columns -- TODO confirm lda is counted in 8-byte elements.
slli.d TL, LDA, 0x02
|
||||
slli.d TL, TL, 0x01
|
||||
|
||||
// J = N / 4: number of full 4-column groups; skip to the n & 2 tail when
// there are none.
srai.d J, N, 0x02
|
||||
beq J, ZERO, .L_N0
|
||||
|
||||
.L_J1: /* J-- */
|
||||
move S1, TS
|
||||
add.d S2, S1, TL
|
||||
add.d S3, S2, TL
|
||||
add.d S4, S3, TL
|
||||
|
||||
slli.d T0, TL, 0x02
|
||||
add.d TS, TS, T0
|
||||
|
||||
srai.d I, M, 0x02
|
||||
beq I, ZERO, .L_I3
|
||||
|
||||
.L_I1: /* I-- */
|
||||
xvld U0, S1, 0x00 //1 2 3 4 5 6 7 8
|
||||
xvld U1, S2, 0x00 //9 10 11 12 13 14 15 16
|
||||
xvld U2, S3, 0x00 //17 18 19 20 21 22 23 24
|
||||
xvld U3, S4, 0x00 //25 26 27 28 29 30 31 32
|
||||
|
||||
xvand.v D0, U0, U0
|
||||
xvand.v D1, U1, U1
|
||||
xvand.v D2, U2, U2
|
||||
xvand.v D3, U3, U3
|
||||
|
||||
xvshuf4i.d D0, U1, 0x88 //1 2 9 10 5 6 13 14
|
||||
xvshuf4i.d D2, U3, 0x88 //17 18 25 26 21 22 29 30
|
||||
xvshuf4i.d D1, U0, 0x77 //3 4 11 12 7 8 15 16
|
||||
xvshuf4i.d D3, U2, 0x77 //19 20 27 28 23 24 31 32
|
||||
|
||||
xvand.v U4, D0, D0
|
||||
xvand.v U5, D1, D1
|
||||
|
||||
xvpermi.q U4, D2, 0x02 //1 2 9 10 17 18 25 26
|
||||
xvpermi.q U5, D3, 0x02 //3 4 11 12 19 20 27 28
|
||||
xvpermi.q D2, D0, 0x31 //5 6 13 14 21 22 29 30
|
||||
xvpermi.q D3, D1, 0x31 //7 8 15 16 23 24 31 32
|
||||
|
||||
xvst U4, TD, 0x00
|
||||
xvst U5, TD, 0x20
|
||||
xvst D2, TD, 0x40
|
||||
xvst D3, TD, 0x60
|
||||
|
||||
addi.d S1, S1, 0x20 // a_offset
|
||||
addi.d S2, S2, 0x20
|
||||
addi.d S3, S3, 0x20
|
||||
addi.d S4, S4, 0x20
|
||||
addi.d TD, TD, 0x80 // b_offset
|
||||
|
||||
addi.d I, I, -1
|
||||
blt ZERO, I, .L_I1
|
||||
|
||||
// Two leftover rows of the current 4-column group, handled with 128-bit LSX
// loads and stores.
.L_I3: /* if(m&2) */
|
||||
andi I, M, 0x02
|
||||
beq I, ZERO, .L_II20
|
||||
|
||||
// 16 bytes (two 8-byte elements) from each of the four column pointers.
vld $vr0, S1, 0x00
|
||||
vld $vr1, S2, 0x00
|
||||
vld $vr2, S3, 0x00
|
||||
vld $vr3, S4, 0x00
|
||||
|
||||
// vand.v with identical sources is a plain register copy: $vr8/$vr9 start as
// the S2 data and $vr10/$vr11 as the S4 data, because vpermi.w below also
// reads its destination register.
vand.v $vr8, $vr1, $vr1
|
||||
vand.v $vr9, $vr1, $vr1
|
||||
vand.v $vr10, $vr3, $vr3
|
||||
vand.v $vr11, $vr3, $vr3
|
||||
|
||||
// NOTE(review): per the LSX definition of vpermi.w (not shown here), 0x44
// presumably pairs the first 8-byte element of the two inputs and 0xee the
// second, giving $vr8 = row 0 of S1,S2; $vr10 = row 0 of S3,S4; and
// $vr9 / $vr11 the same for row 1. Confirm against the ISA manual.
vpermi.w $vr8, $vr0, 0x44
|
||||
vpermi.w $vr10, $vr2, 0x44
|
||||
vpermi.w $vr9, $vr0, 0xee
|
||||
vpermi.w $vr11, $vr2, 0xee
|
||||
|
||||
// Store order: $vr8, $vr10, then $vr9, $vr11.
vst $vr8, TD, 0x00
|
||||
vst $vr10, TD, 0x10
|
||||
vst $vr9, TD, 0x20
|
||||
vst $vr11, TD, 0x30
|
||||
|
||||
// Advance each column by the 0x10 bytes consumed and TD by the 0x40 written.
addi.d S1, S1, 0x10
|
||||
addi.d S2, S2, 0x10
|
||||
addi.d S3, S3, 0x10
|
||||
addi.d S4, S4, 0x10
|
||||
addi.d TD, TD, 0x40
|
||||
|
||||
.L_II20: /* if(m&1) */
|
||||
andi I, M, 0x01
|
||||
beq I, ZERO, .L_J0
|
||||
|
||||
fld.s F0, S1, 0x00
|
||||
fld.s F1, S1, 0x04
|
||||
|
||||
fld.s F2, S2, 0x00
|
||||
fld.s F3, S2, 0x04
|
||||
|
||||
fld.s F4, S3, 0x00
|
||||
fld.s F5, S3, 0x04
|
||||
|
||||
fld.s F6, S4, 0x00
|
||||
fld.s F7, S4, 0x04
|
||||
|
||||
fst.s F0, TD, 0x00
|
||||
fst.s F1, TD, 0x04
|
||||
fst.s F2, TD, 0x08
|
||||
fst.s F3, TD, 0x0c
|
||||
fst.s F4, TD, 0x10
|
||||
fst.s F5, TD, 0x14
|
||||
fst.s F6, TD, 0x18
|
||||
fst.s F7, TD, 0x1c
|
||||
|
||||
addi.d TD, TD, 0x20
|
||||
|
||||
.L_J0:
|
||||
addi.d J, J, -1
|
||||
blt ZERO, J, .L_J1
|
||||
|
||||
.L_N0: /* if(n&2) */
|
||||
andi I, N, 0x02
|
||||
beq ZERO, I, .L_N20
|
||||
|
||||
move S1, TS
|
||||
add.d S2, S1, TL
|
||||
|
||||
slli.d T0, TL, 0x01
|
||||
add.d TS, TS, T0
|
||||
|
||||
srai.d I, M, 0x02
|
||||
beq ZERO, I, .L_N10
|
||||
|
||||
.L_N11: /* if(i>0) */
|
||||
xvld U0, S1, 0x00 //1 2 3 4 5 6 7 8
|
||||
xvld U1, S2, 0x00 //9 10 11 12 13 14 15 16
|
||||
|
||||
xvand.v D0, U0, U0
|
||||
xvand.v D1, U1, U1
|
||||
|
||||
xvshuf4i.d D0, U1, 0x88 //1 2 9 10 5 6 13 14
|
||||
xvshuf4i.d D1, U0, 0x77 //3 4 11 12 7 8 15 16
|
||||
|
||||
xvand.v U4, D0, D0
|
||||
|
||||
xvpermi.q U4, D1, 0x02 //1 2 9 10 3 4 11 12
|
||||
xvpermi.q D1, D0, 0x31 //5 6 13 14 7 8 15 16
|
||||
|
||||
xvst U4, TD, 0x00
|
||||
xvst D1, TD, 0x20
|
||||
|
||||
addi.d S1, S1, 0x20 // a_offset
|
||||
addi.d S2, S2, 0x20
|
||||
addi.d TD, TD, 0x40 // b_offset
|
||||
|
||||
addi.d I, I, -1
|
||||
blt ZERO, I, .L_N11
|
||||
|
||||
.L_N10: /* if(m&2) */
|
||||
andi I, M, 0x02
|
||||
beq I, ZERO, .L_N130
|
||||
|
||||
vld $vr0, S1, 0x00
|
||||
vld $vr1, S2, 0x00
|
||||
vand.v $vr8, $vr1, $vr1
|
||||
|
||||
vpermi.w $vr8, $vr0, 0x44
|
||||
vpermi.w $vr1, $vr0, 0xee
|
||||
|
||||
vst $vr8, TD, 0x00
|
||||
vst $vr1, TD, 0x10
|
||||
|
||||
addi.d S1, S1, 0x10 // a_offset
|
||||
addi.d S2, S2, 0x10
|
||||
addi.d TD, TD, 0x20 // b_offset
|
||||
|
||||
.L_N130: /* if(m&1) */
|
||||
andi I, M, 0x01
|
||||
beq I, ZERO, .L_N20
|
||||
|
||||
fld.s F0, S1, 0x00
|
||||
fld.s F1, S1, 0x04
|
||||
|
||||
fld.s F2, S2, 0x00
|
||||
fld.s F3, S2, 0x04
|
||||
|
||||
fst.s F0, TD, 0x00
|
||||
fst.s F1, TD, 0x04
|
||||
fst.s F2, TD, 0x08
|
||||
fst.s F3, TD, 0x0c
|
||||
|
||||
addi.d TD, TD, 0x10
|
||||
|
||||
.L_N20: /* if(n&1) */
|
||||
andi I, N, 0x01
|
||||
beq I, ZERO, .L_N00
|
||||
|
||||
move S1, TS
|
||||
srai.d I, M, 0x02
|
||||
|
||||
beq I, ZERO, .L_N30
|
||||
|
||||
.L_N21: /* if(i>0) */
|
||||
xvld U0, S1, 0x00
|
||||
|
||||
xvst U0, TD, 0x00
|
||||
|
||||
addi.d S1, S1, 0x20 // aoffset1
|
||||
addi.d TD, TD, 0x20 // b_offset
|
||||
|
||||
addi.d I, I, -1
|
||||
blt ZERO, I, .L_N21
|
||||
|
||||
.L_N30: /* if(m&2) */
|
||||
andi I, M, 0x02
|
||||
beq I, ZERO, .L_N330
|
||||
|
||||
vld $vr0, S1, 0x00
|
||||
|
||||
vst $vr0, TD, 0x00
|
||||
|
||||
addi.d S1, S1, 0x10 // aoffset1
|
||||
addi.d TD, TD, 0x10 // b_offset
|
||||
|
||||
.L_N330: /* if(m&1) */
|
||||
andi I, M, 0x01
|
||||
beq I, ZERO, .L_N00
|
||||
|
||||
fld.s F0, S1, 0x00
|
||||
fld.s F1, S1, 0x04
|
||||
|
||||
fst.s F0, TD, 0x00
|
||||
fst.s F1, TD, 0x04
|
||||
|
||||
.L_N00:
|
||||
LDARG $r23, $sp, 0
|
||||
addi.d $sp, $sp, 8
|
||||
jirl $r0, $r1, 0x00
|
||||
|
||||
EPILOGUE
|
||||
741
kernel/loongarch64/cgemm_tcopy_16_lasx.S
Normal file
741
kernel/loongarch64/cgemm_tcopy_16_lasx.S
Normal file
@@ -0,0 +1,741 @@
|
||||
/*******************************************************************************
|
||||
Copyright (c) 2024, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
#define ASSEMBLER
|
||||
|
||||
#include "common.h"
|
||||
/* Function parameters */
|
||||
#define M $r4 // param 1: m
|
||||
#define N $r5 // param 2: n
|
||||
#define SRC $r6 // param 3: src
|
||||
#define LDA $r7 // param 4: lda
|
||||
#define DST $r8 // param 5: dst
|
||||
|
||||
#define I $r9
|
||||
#define J $r10
|
||||
#define S0 $r11
|
||||
#define S1 $r12
|
||||
#define S2 $r13
|
||||
#define S3 $r14
|
||||
#define S4 $r15
|
||||
#define S5 $r16
|
||||
#define S6 $r17
|
||||
#define S7 $r18
|
||||
#define S8 $r19
|
||||
#define P0 $r20
|
||||
#define P1 $r23
|
||||
#define P2 $r24
|
||||
#define P3 $r25
|
||||
#define P4 $r26
|
||||
#define P5 $r27
|
||||
#define T0 $r28
|
||||
#define T1 $r29
|
||||
#define TL $r7
|
||||
#define ZERO $r0
|
||||
|
||||
#define F0 $f0
|
||||
#define F1 $f1
|
||||
#define F2 $f2
|
||||
#define F3 $f3
|
||||
#define F4 $f4
|
||||
#define F5 $f5
|
||||
#define F6 $f6
|
||||
#define F7 $f7
|
||||
#define F8 $f8
|
||||
#define F9 $f9
|
||||
#define F10 $f10
|
||||
#define F11 $f11
|
||||
#define F12 $f12
|
||||
#define F13 $f13
|
||||
#define F14 $f14
|
||||
#define F15 $f15
|
||||
/* LASX vectors */
|
||||
#define U0 $xr0
|
||||
#define U1 $xr1
|
||||
#define U2 $xr2
|
||||
#define U3 $xr3
|
||||
#define U4 $xr4
|
||||
#define U5 $xr5
|
||||
#define U6 $xr6
|
||||
#define U7 $xr7
|
||||
|
||||
/*
 * Copy routine entry. Arguments, per the #defines above: M, N, SRC, LDA, DST.
 * All element indices below are scaled by 8 bytes (shifts by 0x03).
 * NOTE(review): the 8-byte element is presumably one single-precision complex
 * value (the n&1 tail of the 8-row loop moves it as two fld.s/fst.s) -- confirm.
 */
PROLOGUE
|
||||
|
||||
/* Save $r23-$r29 (aliased as P1-P5, T0, T1) in a 56-byte stack frame. */
addi.d $sp, $sp, -56
|
||||
SDARG $r23, $sp, 0
|
||||
SDARG $r24, $sp, 8
|
||||
SDARG $r25, $sp, 16
|
||||
SDARG $r26, $sp, 24
|
||||
SDARG $r27, $sp, 32
|
||||
SDARG $r28, $sp, 40
|
||||
SDARG $r29, $sp, 48
|
||||
|
||||
/* S0 = running source row pointer, P0 = running destination pointer for the 16-column panels. */
move S0, SRC
|
||||
move P0, DST
|
||||
|
||||
/* P2 = DST + M * (N & ~15) * 8 and P3 = DST + M * (N & ~7) * 8:
   write cursors used by the n&8 and n&4 column tails. */
srai.d T0, N, 0x04
|
||||
srai.d T1, N, 0x03
|
||||
slli.d T0, T0, 0x04
|
||||
slli.d T1, T1, 0x03
|
||||
mul.d P2, M, T0
|
||||
mul.d P3, M, T1
|
||||
slli.d P2, P2, 0x03
|
||||
slli.d P3, P3, 0x03
|
||||
add.d P2, DST, P2
|
||||
add.d P3, DST, P3
|
||||
|
||||
/* P4 = DST + M * (N & ~3) * 8 and P5 = DST + M * (N & ~1) * 8:
   write cursors used by the n&2 and n&1 column tails. */
srai.d T0, N, 0x02
|
||||
srai.d T1, N, 0x01
|
||||
slli.d T0, T0, 0x02
|
||||
slli.d T1, T1, 0x01
|
||||
mul.d P4, M, T0
|
||||
mul.d P5, M, T1
|
||||
slli.d P4, P4, 0x03
|
||||
slli.d P5, P5, 0x03
|
||||
add.d P4, DST, P4
|
||||
add.d P5, DST, P5
|
||||
|
||||
/* TL = source row stride in bytes (TL and LDA are the same register, so LDA is overwritten).
   J  = M / 8 row groups, T0 = 2 * TL, T1 = M * 128 bytes (P1 step between 16-column panels).
   With no full group of 8 rows, go straight to the m&4 tail. */
slli.d TL, LDA, 0x03
|
||||
srai.d J, M, 0x03
|
||||
slli.d T0, TL, 0x01
|
||||
slli.d T1, M, 0x07
|
||||
beq ZERO, J, .L_M7
|
||||
|
||||
/* Outer loop: one pass per group of 8 source rows.
   S1..S8 = the 8 row pointers (TL apart); S0 moves on to the next group (8 * TL). */
.L_J1: /* J-- */
|
||||
move S1, S0
|
||||
add.d S2, S0, TL
|
||||
add.d S3, S1, T0
|
||||
add.d S4, S2, T0
|
||||
add.d S5, S3, T0
|
||||
add.d S6, S4, T0
|
||||
add.d S7, S5, T0
|
||||
add.d S8, S6, T0
|
||||
add.d S0, S7, T0
|
||||
|
||||
/* P1 = write cursor for this row group; P0 advances by 0x400 = 8 rows * 16 elements * 8 bytes. */
move P1, P0
|
||||
addi.d P0, P0, 0x400
|
||||
|
||||
/* I = N / 16 full panels. J is decremented here and tested at .L_N0. */
srai.d I, N, 0x04
|
||||
addi.d J, J, -1
|
||||
beq ZERO, I, .L_N15
|
||||
|
||||
/* Inner loop: copy 16 elements (0x80 bytes = four 32-byte vectors) from each of the
   8 rows into 0x400 contiguous bytes at P1, rows in order S1..S8. */
.L_I1: /* I-- */
|
||||
xvld U0, S1, 0x00
|
||||
xvld U1, S1, 0x20
|
||||
xvld U2, S1, 0x40
|
||||
xvld U3, S1, 0x60
|
||||
xvld U4, S2, 0x00
|
||||
xvld U5, S2, 0x20
|
||||
xvld U6, S2, 0x40
|
||||
xvld U7, S2, 0x60
|
||||
|
||||
xvst U0, P1, 0x00
|
||||
xvst U1, P1, 0x20
|
||||
xvst U2, P1, 0x40
|
||||
xvst U3, P1, 0x60
|
||||
xvst U4, P1, 0x80
|
||||
xvst U5, P1, 0xA0
|
||||
xvst U6, P1, 0xC0
|
||||
xvst U7, P1, 0xE0
|
||||
|
||||
xvld U0, S3, 0x00
|
||||
xvld U1, S3, 0x20
|
||||
xvld U2, S3, 0x40
|
||||
xvld U3, S3, 0x60
|
||||
xvld U4, S4, 0x00
|
||||
xvld U5, S4, 0x20
|
||||
xvld U6, S4, 0x40
|
||||
xvld U7, S4, 0x60
|
||||
|
||||
xvst U0, P1, 0x100
|
||||
xvst U1, P1, 0x120
|
||||
xvst U2, P1, 0x140
|
||||
xvst U3, P1, 0x160
|
||||
xvst U4, P1, 0x180
|
||||
xvst U5, P1, 0x1A0
|
||||
xvst U6, P1, 0x1C0
|
||||
xvst U7, P1, 0x1E0
|
||||
|
||||
xvld U0, S5, 0x00
|
||||
xvld U1, S5, 0x20
|
||||
xvld U2, S5, 0x40
|
||||
xvld U3, S5, 0x60
|
||||
xvld U4, S6, 0x00
|
||||
xvld U5, S6, 0x20
|
||||
xvld U6, S6, 0x40
|
||||
xvld U7, S6, 0x60
|
||||
|
||||
xvst U0, P1, 0x200
|
||||
xvst U1, P1, 0x220
|
||||
xvst U2, P1, 0x240
|
||||
xvst U3, P1, 0x260
|
||||
xvst U4, P1, 0x280
|
||||
xvst U5, P1, 0x2A0
|
||||
xvst U6, P1, 0x2C0
|
||||
xvst U7, P1, 0x2E0
|
||||
|
||||
xvld U0, S7, 0x00
|
||||
xvld U1, S7, 0x20
|
||||
xvld U2, S7, 0x40
|
||||
xvld U3, S7, 0x60
|
||||
xvld U4, S8, 0x00
|
||||
xvld U5, S8, 0x20
|
||||
xvld U6, S8, 0x40
|
||||
xvld U7, S8, 0x60
|
||||
|
||||
xvst U0, P1, 0x300
|
||||
xvst U1, P1, 0x320
|
||||
xvst U2, P1, 0x340
|
||||
xvst U3, P1, 0x360
|
||||
xvst U4, P1, 0x380
|
||||
xvst U5, P1, 0x3A0
|
||||
xvst U6, P1, 0x3C0
|
||||
xvst U7, P1, 0x3E0
|
||||
|
||||
/* Advance every row pointer by 16 elements; step P1 by T1 (M * 128 bytes)
   to reach this row group inside the next 16-column panel. */
addi.d S1, S1, 0x80
|
||||
addi.d S2, S2, 0x80
|
||||
addi.d S3, S3, 0x80
|
||||
addi.d S4, S4, 0x80
|
||||
addi.d S5, S5, 0x80
|
||||
addi.d S6, S6, 0x80
|
||||
addi.d S7, S7, 0x80
|
||||
addi.d S8, S8, 0x80
|
||||
addi.d I, I, -1
|
||||
add.d P1, P1, T1
|
||||
blt ZERO, I, .L_I1
|
||||
|
||||
/* Column tail n&8 for the current 8 rows: 8 elements (0x40 bytes, two vectors) per row,
   written row after row to P2; P2 advances by 0x200 = 8 rows * 0x40. */
.L_N15:
|
||||
andi I, N, 0x08
|
||||
beq ZERO, I, .L_N7
|
||||
|
||||
xvld U0, S1, 0x00
|
||||
xvld U1, S1, 0x20
|
||||
xvld U2, S2, 0x00
|
||||
xvld U3, S2, 0x20
|
||||
xvld U4, S3, 0x00
|
||||
xvld U5, S3, 0x20
|
||||
xvld U6, S4, 0x00
|
||||
xvld U7, S4, 0x20
|
||||
|
||||
xvst U0, P2, 0x00
|
||||
xvst U1, P2, 0x20
|
||||
xvst U2, P2, 0x40
|
||||
xvst U3, P2, 0x60
|
||||
xvst U4, P2, 0x80
|
||||
xvst U5, P2, 0xA0
|
||||
xvst U6, P2, 0xC0
|
||||
xvst U7, P2, 0xE0
|
||||
|
||||
xvld U0, S5, 0x00
|
||||
xvld U1, S5, 0x20
|
||||
xvld U2, S6, 0x00
|
||||
xvld U3, S6, 0x20
|
||||
xvld U4, S7, 0x00
|
||||
xvld U5, S7, 0x20
|
||||
xvld U6, S8, 0x00
|
||||
xvld U7, S8, 0x20
|
||||
|
||||
xvst U0, P2, 0x100
|
||||
xvst U1, P2, 0x120
|
||||
xvst U2, P2, 0x140
|
||||
xvst U3, P2, 0x160
|
||||
xvst U4, P2, 0x180
|
||||
xvst U5, P2, 0x1A0
|
||||
xvst U6, P2, 0x1C0
|
||||
xvst U7, P2, 0x1E0
|
||||
|
||||
addi.d S1, S1, 0x40
|
||||
addi.d S2, S2, 0x40
|
||||
addi.d S3, S3, 0x40
|
||||
addi.d S4, S4, 0x40
|
||||
addi.d S5, S5, 0x40
|
||||
addi.d S6, S6, 0x40
|
||||
addi.d S7, S7, 0x40
|
||||
addi.d S8, S8, 0x40
|
||||
addi.d P2, P2, 0x200
|
||||
|
||||
/* Column tail n&4: 4 elements (one 32-byte vector) per row, written to P3;
   P3 advances by 0x100 = 8 rows * 0x20. */
.L_N7:
|
||||
andi I, N, 0x04
|
||||
beq ZERO, I, .L_N3
|
||||
|
||||
xvld U0, S1, 0x00
|
||||
xvld U1, S2, 0x00
|
||||
xvld U2, S3, 0x00
|
||||
xvld U3, S4, 0x00
|
||||
xvld U4, S5, 0x00
|
||||
xvld U5, S6, 0x00
|
||||
xvld U6, S7, 0x00
|
||||
xvld U7, S8, 0x00
|
||||
|
||||
xvst U0, P3, 0x00
|
||||
xvst U1, P3, 0x20
|
||||
xvst U2, P3, 0x40
|
||||
xvst U3, P3, 0x60
|
||||
xvst U4, P3, 0x80
|
||||
xvst U5, P3, 0xA0
|
||||
xvst U6, P3, 0xC0
|
||||
xvst U7, P3, 0xE0
|
||||
|
||||
addi.d S1, S1, 0x20
|
||||
addi.d S2, S2, 0x20
|
||||
addi.d S3, S3, 0x20
|
||||
addi.d S4, S4, 0x20
|
||||
addi.d S5, S5, 0x20
|
||||
addi.d S6, S6, 0x20
|
||||
addi.d S7, S7, 0x20
|
||||
addi.d S8, S8, 0x20
|
||||
addi.d P3, P3, 0x100
|
||||
|
||||
/* Column tail n&2: 2 elements (16 bytes) per row, written to P4 (0x80 bytes for 8 rows).
   Each xvld fetches 32 bytes but only 16 are kept (row pointers advance by 0x10).
   NOTE(review): xvpermi.q with 0x02 is relied on to pair the low 128 bits of two
   adjacent rows into one vector -- confirm against the LASX manual. */
.L_N3:
|
||||
andi I, N, 0x02
|
||||
beq ZERO, I, .L_N1
|
||||
|
||||
xvld U0, S1, 0x00
|
||||
xvld U1, S2, 0x00
|
||||
xvld U2, S3, 0x00
|
||||
xvld U3, S4, 0x00
|
||||
xvld U4, S5, 0x00
|
||||
xvld U5, S6, 0x00
|
||||
xvld U6, S7, 0x00
|
||||
xvld U7, S8, 0x00
|
||||
|
||||
xvpermi.q U0, U1, 0x02
|
||||
xvpermi.q U2, U3, 0x02
|
||||
xvpermi.q U4, U5, 0x02
|
||||
xvpermi.q U6, U7, 0x02
|
||||
|
||||
xvst U0, P4, 0x00
|
||||
xvst U2, P4, 0x20
|
||||
xvst U4, P4, 0x40
|
||||
xvst U6, P4, 0x60
|
||||
|
||||
addi.d S1, S1, 0x10
|
||||
addi.d S2, S2, 0x10
|
||||
addi.d S3, S3, 0x10
|
||||
addi.d S4, S4, 0x10
|
||||
addi.d S5, S5, 0x10
|
||||
addi.d S6, S6, 0x10
|
||||
addi.d S7, S7, 0x10
|
||||
addi.d S8, S8, 0x10
|
||||
addi.d P4, P4, 0x80
|
||||
|
||||
/* Column tail n&1: one 8-byte element per row, moved as two 4-byte scalars
   (offsets 0x00 and 0x04) and written to P5 (0x40 bytes for 8 rows). */
.L_N1:
|
||||
andi I, N, 0x01
|
||||
beq ZERO, I, .L_N0
|
||||
|
||||
fld.s F0, S1, 0x00
|
||||
fld.s F1, S1, 0x04
|
||||
|
||||
fld.s F2, S2, 0x00
|
||||
fld.s F3, S2, 0x04
|
||||
|
||||
fld.s F4, S3, 0x00
|
||||
fld.s F5, S3, 0x04
|
||||
|
||||
fld.s F6, S4, 0x00
|
||||
fld.s F7, S4, 0x04
|
||||
|
||||
fld.s F8, S5, 0x00
|
||||
fld.s F9, S5, 0x04
|
||||
|
||||
fld.s F10, S6, 0x00
|
||||
fld.s F11, S6, 0x04
|
||||
|
||||
fld.s F12, S7, 0x00
|
||||
fld.s F13, S7, 0x04
|
||||
|
||||
fld.s F14, S8, 0x00
|
||||
fld.s F15, S8, 0x04
|
||||
|
||||
fst.s F0, P5, 0x00
|
||||
fst.s F1, P5, 0x04
|
||||
fst.s F2, P5, 0x08
|
||||
fst.s F3, P5, 0x0c
|
||||
fst.s F4, P5, 0x10
|
||||
fst.s F5, P5, 0x14
|
||||
fst.s F6, P5, 0x18
|
||||
fst.s F7, P5, 0x1c
|
||||
fst.s F8, P5, 0x20
|
||||
fst.s F9, P5, 0x24
|
||||
fst.s F10, P5, 0x28
|
||||
fst.s F11, P5, 0x2c
|
||||
fst.s F12, P5, 0x30
|
||||
fst.s F13, P5, 0x34
|
||||
fst.s F14, P5, 0x38
|
||||
fst.s F15, P5, 0x3c
|
||||
|
||||
addi.d S1, S1, 0x08
|
||||
addi.d S2, S2, 0x08
|
||||
addi.d S3, S3, 0x08
|
||||
addi.d S4, S4, 0x08
|
||||
addi.d S5, S5, 0x08
|
||||
addi.d S6, S6, 0x08
|
||||
addi.d S7, S7, 0x08
|
||||
addi.d S8, S8, 0x08
|
||||
addi.d P5, P5, 0x40
|
||||
|
||||
/* End of one 8-row group: loop while J (already decremented in .L_J1) is still positive. */
.L_N0:
|
||||
blt ZERO, J, .L_J1
|
||||
|
||||
/* Row tail m&4: same scheme as the 8-row loop but for 4 rows (S1..S4).
   S0 advances by 4 * TL; P0 advances by 0x200 = 4 rows * 16 elements * 8 bytes. */
.L_M7:
|
||||
andi J, M, 0x04
|
||||
beq ZERO, J, .L_M3
|
||||
|
||||
move S1, S0
|
||||
add.d S2, S0, TL
|
||||
add.d S3, S1, T0
|
||||
add.d S4, S2, T0
|
||||
add.d S0, S3, T0
|
||||
|
||||
move P1, P0
|
||||
addi.d P0, P0, 0x200
|
||||
|
||||
srai.d I, N, 0x04
|
||||
beq ZERO, I, .L_4N15
|
||||
|
||||
/* 16 elements from each of the 4 rows -> 0x200 contiguous bytes at P1; P1 then steps by T1. */
.L_4I1: /* I-- */
|
||||
xvld U0, S1, 0x00
|
||||
xvld U1, S1, 0x20
|
||||
xvld U2, S1, 0x40
|
||||
xvld U3, S1, 0x60
|
||||
xvld U4, S2, 0x00
|
||||
xvld U5, S2, 0x20
|
||||
xvld U6, S2, 0x40
|
||||
xvld U7, S2, 0x60
|
||||
|
||||
xvst U0, P1, 0x00
|
||||
xvst U1, P1, 0x20
|
||||
xvst U2, P1, 0x40
|
||||
xvst U3, P1, 0x60
|
||||
xvst U4, P1, 0x80
|
||||
xvst U5, P1, 0xA0
|
||||
xvst U6, P1, 0xC0
|
||||
xvst U7, P1, 0xE0
|
||||
|
||||
xvld U0, S3, 0x00
|
||||
xvld U1, S3, 0x20
|
||||
xvld U2, S3, 0x40
|
||||
xvld U3, S3, 0x60
|
||||
xvld U4, S4, 0x00
|
||||
xvld U5, S4, 0x20
|
||||
xvld U6, S4, 0x40
|
||||
xvld U7, S4, 0x60
|
||||
|
||||
xvst U0, P1, 0x100
|
||||
xvst U1, P1, 0x120
|
||||
xvst U2, P1, 0x140
|
||||
xvst U3, P1, 0x160
|
||||
xvst U4, P1, 0x180
|
||||
xvst U5, P1, 0x1A0
|
||||
xvst U6, P1, 0x1C0
|
||||
xvst U7, P1, 0x1E0
|
||||
|
||||
addi.d S1, S1, 0x80
|
||||
addi.d S2, S2, 0x80
|
||||
addi.d S3, S3, 0x80
|
||||
addi.d S4, S4, 0x80
|
||||
addi.d I, I, -1
|
||||
add.d P1, P1, T1
|
||||
blt ZERO, I, .L_4I1
|
||||
|
||||
/* n&8 tail for 4 rows: 8 elements per row -> P2, which advances by 0x100. */
.L_4N15:
|
||||
andi I, N, 0x08
|
||||
beq ZERO, I, .L_4N7
|
||||
|
||||
xvld U0, S1, 0x00
|
||||
xvld U1, S1, 0x20
|
||||
xvld U2, S2, 0x00
|
||||
xvld U3, S2, 0x20
|
||||
xvld U4, S3, 0x00
|
||||
xvld U5, S3, 0x20
|
||||
xvld U6, S4, 0x00
|
||||
xvld U7, S4, 0x20
|
||||
|
||||
xvst U0, P2, 0x00
|
||||
xvst U1, P2, 0x20
|
||||
xvst U2, P2, 0x40
|
||||
xvst U3, P2, 0x60
|
||||
xvst U4, P2, 0x80
|
||||
xvst U5, P2, 0xA0
|
||||
xvst U6, P2, 0xC0
|
||||
xvst U7, P2, 0xE0
|
||||
|
||||
addi.d S1, S1, 0x40
|
||||
addi.d S2, S2, 0x40
|
||||
addi.d S3, S3, 0x40
|
||||
addi.d S4, S4, 0x40
|
||||
addi.d P2, P2, 0x100
|
||||
|
||||
/* n&4 tail for 4 rows: 4 elements per row -> P3, which advances by 0x80. */
.L_4N7:
|
||||
andi I, N, 0x04
|
||||
beq ZERO, I, .L_4N3
|
||||
|
||||
xvld U0, S1, 0x00
|
||||
xvld U1, S2, 0x00
|
||||
xvld U2, S3, 0x00
|
||||
xvld U3, S4, 0x00
|
||||
|
||||
xvst U0, P3, 0x00
|
||||
xvst U1, P3, 0x20
|
||||
xvst U2, P3, 0x40
|
||||
xvst U3, P3, 0x60
|
||||
|
||||
addi.d S1, S1, 0x20
|
||||
addi.d S2, S2, 0x20
|
||||
addi.d S3, S3, 0x20
|
||||
addi.d S4, S4, 0x20
|
||||
addi.d P3, P3, 0x80
|
||||
|
||||
/* n&2 tail for 4 rows: 2 elements per row, two rows packed per vector -> P4 (+0x40).
   Each xvld reads 32 bytes although only 16 are kept. */
.L_4N3:
|
||||
andi I, N, 0x02
|
||||
beq ZERO, I, .L_4N1
|
||||
|
||||
xvld U0, S1, 0x00
|
||||
xvld U1, S2, 0x00
|
||||
xvld U2, S3, 0x00
|
||||
xvld U3, S4, 0x00
|
||||
|
||||
xvpermi.q U0, U1, 0x02
|
||||
xvpermi.q U2, U3, 0x02
|
||||
|
||||
xvst U0, P4, 0x00
|
||||
xvst U2, P4, 0x20
|
||||
|
||||
addi.d S1, S1, 0x10
|
||||
addi.d S2, S2, 0x10
|
||||
addi.d S3, S3, 0x10
|
||||
addi.d S4, S4, 0x10
|
||||
addi.d P4, P4, 0x40
|
||||
|
||||
/* n&1 tail for 4 rows: one 8-byte element per row, moved with a single fld.d/fst.d -> P5 (+0x20). */
.L_4N1:
|
||||
andi I, N, 0x01
|
||||
beq ZERO, I, .L_M3
|
||||
|
||||
fld.d F0, S1, 0x00
|
||||
fld.d F1, S2, 0x00
|
||||
fld.d F2, S3, 0x00
|
||||
fld.d F3, S4, 0x00
|
||||
|
||||
fst.d F0, P5, 0x00
|
||||
fst.d F1, P5, 0x08
|
||||
fst.d F2, P5, 0x10
|
||||
fst.d F3, P5, 0x18
|
||||
|
||||
addi.d S1, S1, 0x08
|
||||
addi.d S2, S2, 0x08
|
||||
addi.d S3, S3, 0x08
|
||||
addi.d S4, S4, 0x08
|
||||
addi.d P5, P5, 0x20
|
||||
|
||||
/* Row tail m&2: two rows (S1, S2). S0 advances by T0 = 2 * TL;
   P0 advances by 0x100 = 2 rows * 16 elements * 8 bytes. */
.L_M3:
|
||||
andi J, M, 0x02
|
||||
beq ZERO, J, .L_M1
|
||||
|
||||
move S1, S0
|
||||
add.d S2, S0, TL
|
||||
add.d S0, S0, T0
|
||||
|
||||
move P1, P0
|
||||
addi.d P0, P0, 0x100
|
||||
|
||||
srai.d I, N, 0x04
|
||||
beq ZERO, I, .L_2N15
|
||||
|
||||
/* 16 elements from each of the 2 rows -> 0x100 contiguous bytes at P1; P1 then steps by T1. */
.L_2I1: /* I-- */
|
||||
xvld U0, S1, 0x00
|
||||
xvld U1, S1, 0x20
|
||||
xvld U2, S1, 0x40
|
||||
xvld U3, S1, 0x60
|
||||
xvld U4, S2, 0x00
|
||||
xvld U5, S2, 0x20
|
||||
xvld U6, S2, 0x40
|
||||
xvld U7, S2, 0x60
|
||||
|
||||
xvst U0, P1, 0x00
|
||||
xvst U1, P1, 0x20
|
||||
xvst U2, P1, 0x40
|
||||
xvst U3, P1, 0x60
|
||||
xvst U4, P1, 0x80
|
||||
xvst U5, P1, 0xA0
|
||||
xvst U6, P1, 0xC0
|
||||
xvst U7, P1, 0xE0
|
||||
|
||||
addi.d S1, S1, 0x80
|
||||
addi.d S2, S2, 0x80
|
||||
addi.d I, I, -1
|
||||
add.d P1, P1, T1
|
||||
blt ZERO, I, .L_2I1
|
||||
|
||||
/* n&8 tail for 2 rows: 8 elements per row -> P2 (+0x80). */
.L_2N15:
|
||||
andi I, N, 0x08
|
||||
beq ZERO, I, .L_2N7
|
||||
|
||||
xvld U0, S1, 0x00
|
||||
xvld U1, S1, 0x20
|
||||
xvld U2, S2, 0x00
|
||||
xvld U3, S2, 0x20
|
||||
|
||||
xvst U0, P2, 0x00
|
||||
xvst U1, P2, 0x20
|
||||
xvst U2, P2, 0x40
|
||||
xvst U3, P2, 0x60
|
||||
|
||||
addi.d S1, S1, 0x40
|
||||
addi.d S2, S2, 0x40
|
||||
addi.d P2, P2, 0x80
|
||||
|
||||
/* n&4 tail for 2 rows: 4 elements per row -> P3 (+0x40). */
.L_2N7:
|
||||
andi I, N, 0x04
|
||||
beq ZERO, I, .L_2N3
|
||||
|
||||
xvld U0, S1, 0x00
|
||||
xvld U1, S2, 0x00
|
||||
|
||||
xvst U0, P3, 0x00
|
||||
xvst U1, P3, 0x20
|
||||
|
||||
addi.d S1, S1, 0x20
|
||||
addi.d S2, S2, 0x20
|
||||
addi.d P3, P3, 0x40
|
||||
|
||||
/* n&2 tail for 2 rows: both rows packed into one vector -> P4 (+0x20).
   Each xvld reads 32 bytes although only 16 are kept. */
.L_2N3:
|
||||
andi I, N, 0x02
|
||||
beq ZERO, I, .L_2N1
|
||||
|
||||
xvld U0, S1, 0x00
|
||||
xvld U1, S2, 0x00
|
||||
|
||||
xvpermi.q U0, U1, 0x02
|
||||
|
||||
xvst U0, P4, 0x00
|
||||
|
||||
addi.d S1, S1, 0x10
|
||||
addi.d S2, S2, 0x10
|
||||
addi.d P4, P4, 0x20
|
||||
|
||||
/* n&1 tail for 2 rows: one 8-byte element per row -> P5 (+0x10). */
.L_2N1:
|
||||
andi I, N, 0x01
|
||||
beq ZERO, I, .L_M1
|
||||
|
||||
fld.d F0, S1, 0x00
|
||||
fld.d F1, S2, 0x00
|
||||
|
||||
fst.d F0, P5, 0x00
|
||||
fst.d F1, P5, 0x08
|
||||
|
||||
addi.d S1, S1, 0x08
|
||||
addi.d S2, S2, 0x08
|
||||
addi.d P5, P5, 0x10
|
||||
|
||||
/* Row tail m&1: the single remaining row (S1).
   S2 is computed below but not read anywhere in this section. */
.L_M1:
|
||||
andi J, M, 0x01
|
||||
beq ZERO, J, .L_M0
|
||||
|
||||
move S1, S0
|
||||
add.d S2, S0, TL
|
||||
|
||||
move P1, P0
|
||||
addi.d P0, P0, 0x80
|
||||
|
||||
srai.d I, N, 0x04
|
||||
beq ZERO, I, .L_1N15
|
||||
|
||||
/* 16 elements (0x80 bytes) of the row -> P1; P1 then steps by T1 to the next panel. */
.L_1I1: /* I-- */
|
||||
xvld U0, S1, 0x00
|
||||
xvld U1, S1, 0x20
|
||||
xvld U2, S1, 0x40
|
||||
xvld U3, S1, 0x60
|
||||
|
||||
xvst U0, P1, 0x00
|
||||
xvst U1, P1, 0x20
|
||||
xvst U2, P1, 0x40
|
||||
xvst U3, P1, 0x60
|
||||
|
||||
addi.d S1, S1, 0x80
|
||||
addi.d I, I, -1
|
||||
add.d P1, P1, T1
|
||||
blt ZERO, I, .L_1I1
|
||||
|
||||
/* n&8 tail: 8 elements -> P2 (+0x40). */
.L_1N15:
|
||||
andi I, N, 0x08
|
||||
beq ZERO, I, .L_1N7
|
||||
|
||||
xvld U0, S1, 0x00
|
||||
xvld U1, S1, 0x20
|
||||
|
||||
xvst U0, P2, 0x00
|
||||
xvst U1, P2, 0x20
|
||||
|
||||
addi.d S1, S1, 0x40
|
||||
addi.d P2, P2, 0x40
|
||||
|
||||
/* n&4 tail: 4 elements -> P3 (+0x20). */
.L_1N7:
|
||||
andi I, N, 0x04
|
||||
beq ZERO, I, .L_1N3
|
||||
|
||||
xvld U0, S1, 0x00
|
||||
|
||||
xvst U0, P3, 0x00
|
||||
|
||||
addi.d S1, S1, 0x20
|
||||
addi.d P3, P3, 0x20
|
||||
|
||||
/* n&2 tail: 2 elements moved as two 8-byte scalars -> P4 (+0x10). */
.L_1N3:
|
||||
andi I, N, 0x02
|
||||
beq ZERO, I, .L_1N1
|
||||
|
||||
fld.d F0, S1, 0x00
|
||||
fld.d F1, S1, 0x08
|
||||
|
||||
fst.d F0, P4, 0x00
|
||||
fst.d F1, P4, 0x08
|
||||
|
||||
addi.d S1, S1, 0x10
|
||||
addi.d P4, P4, 0x10
|
||||
|
||||
/* n&1 tail: the last single element -> P5 (+0x08). */
.L_1N1:
|
||||
andi I, N, 0x01
|
||||
beq ZERO, I, .L_M0
|
||||
|
||||
fld.d F0, S1, 0x00
|
||||
|
||||
fst.d F0, P5, 0x00
|
||||
|
||||
addi.d S1, S1, 0x08
|
||||
addi.d P5, P5, 0x08
|
||||
|
||||
/* Common exit: reload $r23-$r29 from the frame, release the 56 bytes, return through $r1. */
.L_M0:
|
||||
LDARG $r23, $sp, 0
|
||||
LDARG $r24, $sp, 8
|
||||
LDARG $r25, $sp, 16
|
||||
LDARG $r26, $sp, 24
|
||||
LDARG $r27, $sp, 32
|
||||
LDARG $r28, $sp, 40
|
||||
LDARG $r29, $sp, 48
|
||||
addi.d $sp, $sp, 56
|
||||
jirl $r0, $r1, 0x00
|
||||
|
||||
EPILOGUE
|
||||
306
kernel/loongarch64/cgemm_tcopy_4_lasx.S
Normal file
306
kernel/loongarch64/cgemm_tcopy_4_lasx.S
Normal file
@@ -0,0 +1,306 @@
|
||||
/*******************************************************************************
|
||||
Copyright (c) 2024, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
#define ASSEMBLER
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/* Function parameters */
|
||||
#define M $r4 // param 1: m
|
||||
#define N $r5 // param 2: n
|
||||
#define SRC $r6 // param 3: src
|
||||
#define LDA $r7 // param 4: lda
|
||||
#define DST $r8 // param 5: dst
|
||||
|
||||
#define I $r9
|
||||
#define J $r10
|
||||
#define S1 $r12
|
||||
#define S2 $r13
|
||||
#define S3 $r14
|
||||
#define S4 $r15
|
||||
#define TD $r16
|
||||
#define TS $r17
|
||||
#define TL $r18
|
||||
#define T0 $r19
|
||||
#define S8 $r20
|
||||
#define S9 $r23
|
||||
#define S10 $r11
|
||||
#define ZERO $r0
|
||||
|
||||
#define F0 $f0
|
||||
#define F1 $f1
|
||||
#define F2 $f2
|
||||
#define F3 $f3
|
||||
#define F4 $f4
|
||||
#define F5 $f5
|
||||
#define F6 $f6
|
||||
#define F7 $f7
|
||||
|
||||
/* LASX vectors */
|
||||
#define U0 $xr0
|
||||
#define U1 $xr1
|
||||
#define U2 $xr2
|
||||
#define U3 $xr3
|
||||
#define U4 $xr4
|
||||
#define U5 $xr5
|
||||
#define U6 $xr6
|
||||
#define U7 $xr7
|
||||
#define U8 $xr8
|
||||
#define U9 $xr9
|
||||
#define U10 $xr10
|
||||
#define U11 $xr11
|
||||
#define U12 $xr12
|
||||
#define U13 $xr13
|
||||
#define U14 $xr14
|
||||
#define U15 $xr15
|
||||
|
||||
|
||||
/*
 * Copy routine entry. Arguments, per the #defines above: M, N, SRC, LDA, DST.
 * Elements are 8 bytes wide (every index below is scaled by 8).
 * Sets up:
 *   TS  = running source row pointer          (aoffset)
 *   TD  = running destination pointer         (boffset, 4-column panels)
 *   TL  = source row stride in bytes          (LDA * 8)
 *   S9  = DST + M * (N & ~3) * 8              (boffset2, n&2 column tail)
 *   S10 = DST + M * (N & ~1) * 8              (boffset3, n&1 column tail)
 *   J   = M / 4 row groups
 */
PROLOGUE
|
||||
|
||||
/* $r23 is used as S9 below, so its incoming value is kept on the stack. */
addi.d $sp, $sp, -8
|
||||
SDARG $r23, $sp, 0
|
||||
|
||||
move TS, SRC //aoffset
|
||||
move TD, DST //boffset
|
||||
slli.d TL, LDA, 0x02 //lda
|
||||
slli.d TL, TL, 0x01 //lda
|
||||
|
||||
/* T0 = N & ~3. The product with M is taken with mul.d: M and N arrive in full
   64-bit registers and every other address computation here uses .d arithmetic,
   whereas mul.w would keep only the sign-extended low 32 bits of the product. */
ori T0, ZERO, 0x03
|
||||
andn T0, N, T0
|
||||
mul.d T0, M, T0
|
||||
slli.d T0, T0, 0x01
|
||||
slli.d T0, T0, 0x02
|
||||
add.d S9, DST, T0 //boffset2
|
||||
|
||||
/* T0 = N & ~1, same 64-bit product as above. */
ori T0, ZERO, 0x01
|
||||
andn T0, N, T0
|
||||
mul.d T0, M, T0
|
||||
slli.d T0, T0, 0x01
|
||||
slli.d T0, T0, 0x02
|
||||
add.d S10, DST, T0 //boffset3
|
||||
|
||||
srai.d J, M, 0x02 //j
|
||||
|
||||
/* No full group of 4 rows: go straight to the m&2 tail. */
beq J, ZERO, .L_M1
|
||||
|
||||
/* Outer loop: one pass per group of 4 source rows.
   S1..S4 = the 4 row pointers (TL apart); TS advances by 4 * TL.
   S8 = write cursor for this group; TD advances by 0x80 = 4 rows * 4 elements * 8 bytes. */
.L_J1: /* if(j>0) j--*/
|
||||
move S1, TS //aoffset1
|
||||
add.d S2, S1, TL
|
||||
add.d S3, S2, TL
|
||||
add.d S4, S3, TL
|
||||
|
||||
slli.d T0, TL, 0x02
|
||||
add.d TS, TS, T0
|
||||
|
||||
move S8, TD //boffset1
|
||||
addi.d TD, TD, 0x80
|
||||
|
||||
srai.d I, N, 0x02
|
||||
|
||||
beq ZERO, I, .L_JN1
|
||||
|
||||
/* Inner loop over N / 4 panels: 4 elements (one 32-byte vector) from each row
   -> 0x80 contiguous bytes at S8; S8 then steps by M * 32 bytes to the next panel. */
.L_JI1: /* if(i>0) i--*/
|
||||
xvld U0, S1, 0x00
|
||||
xvld U1, S2, 0x00
|
||||
xvld U2, S3, 0x00
|
||||
xvld U3, S4, 0x00
|
||||
|
||||
xvst U0, S8, 0x00
|
||||
xvst U1, S8, 0x20
|
||||
xvst U2, S8, 0x40
|
||||
xvst U3, S8, 0x60
|
||||
|
||||
addi.d S1, S1, 0x20
|
||||
addi.d S2, S2, 0x20
|
||||
addi.d S3, S3, 0x20
|
||||
addi.d S4, S4, 0x20
|
||||
slli.d T0, M, 0x05
|
||||
add.d S8, S8, T0
|
||||
|
||||
addi.d I, I, -1
|
||||
blt ZERO, I, .L_JI1
|
||||
|
||||
/* n&2 tail: 2 elements (16 bytes, one LSX vector) per row -> S9 (+0x40). */
.L_JN1: /* if(n&2) */
|
||||
andi I, N, 0x02
|
||||
beq ZERO, I, .L_JN2
|
||||
|
||||
vld $vr0, S1, 0x00
|
||||
vld $vr1, S2, 0x00
|
||||
vld $vr2, S3, 0x00
|
||||
vld $vr3, S4, 0x00
|
||||
|
||||
vst $vr0, S9, 0x00
|
||||
vst $vr1, S9, 0x10
|
||||
vst $vr2, S9, 0x20
|
||||
vst $vr3, S9, 0x30
|
||||
|
||||
addi.d S1, S1, 0x10
|
||||
addi.d S2, S2, 0x10
|
||||
addi.d S3, S3, 0x10
|
||||
addi.d S4, S4, 0x10
|
||||
addi.d S9, S9, 0x40
|
||||
|
||||
/* n&1 tail: one 8-byte element per row, moved as two 4-byte scalars -> S10 (+0x20). */
.L_JN2: /* if(n&1) */
|
||||
andi I, N, 0x01
|
||||
beq ZERO, I, .L_J0
|
||||
|
||||
fld.s F0, S1, 0x00
|
||||
fld.s F1, S1, 0x04
|
||||
|
||||
fld.s F2, S2, 0x00
|
||||
fld.s F3, S2, 0x04
|
||||
|
||||
fld.s F4, S3, 0x00
|
||||
fld.s F5, S3, 0x04
|
||||
|
||||
fld.s F6, S4, 0x00
|
||||
fld.s F7, S4, 0x04
|
||||
|
||||
fst.s F0, S10, 0x00
|
||||
fst.s F1, S10, 0x04
|
||||
fst.s F2, S10, 0x08
|
||||
fst.s F3, S10, 0x0c
|
||||
fst.s F4, S10, 0x10
|
||||
fst.s F5, S10, 0x14
|
||||
fst.s F6, S10, 0x18
|
||||
fst.s F7, S10, 0x1c
|
||||
|
||||
addi.d S10, S10, 0x20
|
||||
|
||||
/* Next group of 4 rows while J stays positive. */
.L_J0:
|
||||
addi.d J, J, -1
|
||||
blt ZERO, J, .L_J1
|
||||
|
||||
/* Row tail m&2: two rows (S1, S2). TS advances by 2 * TL;
   TD advances by 0x40 = 2 rows * 4 elements * 8 bytes. */
.L_M1: /* if(m&2) */
|
||||
andi I, M, 0x02
|
||||
beq ZERO, I, .L_M2
|
||||
|
||||
move S1, TS //aoffset1
|
||||
add.d S2, S1, TL
|
||||
|
||||
slli.d T0, TL, 0x01
|
||||
add.d TS, TS, T0
|
||||
|
||||
move S8, TD //boffset1
|
||||
addi.d TD, TD, 0x40
|
||||
|
||||
srai.d I, N, 0x02
|
||||
beq ZERO, I, .L_M1N1
|
||||
|
||||
/* N / 4 panels: 4 elements from each of the 2 rows -> S8, which steps by M * 32 bytes. */
.L_M1I1: /* if(i>0) */
|
||||
xvld U0, S1, 0x00
|
||||
xvld U1, S2, 0x00
|
||||
|
||||
xvst U0, S8, 0x00
|
||||
xvst U1, S8, 0x20
|
||||
|
||||
addi.d S1, S1, 0x20
|
||||
addi.d S2, S2, 0x20
|
||||
slli.d T0, M, 0x05
|
||||
add.d S8, S8, T0
|
||||
|
||||
addi.d I, I, -1
|
||||
blt ZERO, I, .L_M1I1
|
||||
|
||||
/* n&2 tail for 2 rows: 16 bytes per row -> S9 (+0x20). */
.L_M1N1: /* if(n&2) */
|
||||
andi I, N, 0x02
|
||||
beq ZERO, I, .L_M1N2
|
||||
|
||||
vld $vr0, S1, 0x00
|
||||
vld $vr1, S2, 0x00
|
||||
|
||||
vst $vr0, S9, 0x00
|
||||
vst $vr1, S9, 0x10
|
||||
|
||||
addi.d S1, S1, 0x10
|
||||
addi.d S2, S2, 0x10
|
||||
addi.d S9, S9, 0x20
|
||||
|
||||
/* n&1 tail for 2 rows: one 8-byte element per row as two 4-byte scalars -> S10 (+0x10). */
.L_M1N2: /* if(n&1) */
|
||||
andi I, N, 0x01
|
||||
beq ZERO, I, .L_M2
|
||||
|
||||
fld.s F0, S1, 0x00
|
||||
fld.s F1, S1, 0x04
|
||||
|
||||
fld.s F2, S2, 0x00
|
||||
fld.s F3, S2, 0x04
|
||||
|
||||
fst.s F0, S10, 0x00
|
||||
fst.s F1, S10, 0x04
|
||||
fst.s F2, S10, 0x08
|
||||
fst.s F3, S10, 0x0c
|
||||
|
||||
addi.d S10, S10, 0x10
|
||||
|
||||
/* Row tail m&1: the single remaining row. TS, TD, S9 and S10 are read here
   but no longer advanced, since nothing follows this section. */
.L_M2: /* if(m&1) */
|
||||
andi I, M, 0x01
|
||||
beq ZERO, I, .L_M0
|
||||
|
||||
move S1, TS //aoffset1
|
||||
move S8, TD //boffset1
|
||||
|
||||
srai.d I, N, 0x02
|
||||
beq ZERO, I, .L_M2N1
|
||||
|
||||
/* N / 4 panels: 4 elements of the row -> S8, which steps by M * 32 bytes. */
.L_M2I1: /* if(i>0) */
|
||||
xvld U0, S1, 0x00
|
||||
|
||||
xvst U0, S8, 0x00
|
||||
|
||||
addi.d S1, S1, 0x20
|
||||
slli.d T0, M, 0x05
|
||||
add.d S8, S8, T0
|
||||
|
||||
addi.d I, I, -1
|
||||
blt ZERO, I, .L_M2I1
|
||||
|
||||
/* n&2 tail: 16 bytes -> S9. */
.L_M2N1: /* if(n&2) */
|
||||
andi I, N, 0x02
|
||||
beq ZERO, I, .L_M2N2
|
||||
|
||||
vld $vr0, S1, 0x00
|
||||
|
||||
vst $vr0, S9, 0x00
|
||||
|
||||
addi.d S1, S1, 0x10
|
||||
|
||||
/* n&1 tail: the last element as two 4-byte scalars -> S10. */
.L_M2N2: /* if(n&1) */
|
||||
andi I, N, 0x01
|
||||
beq ZERO, I, .L_M0
|
||||
|
||||
fld.s F0, S1, 0x00
|
||||
fld.s F1, S1, 0x04
|
||||
|
||||
fst.s F0, S10, 0x00
|
||||
fst.s F1, S10, 0x04
|
||||
|
||||
/* Common exit: reload $r23, release the 8-byte frame, return through $r1. */
.L_M0:
|
||||
LDARG $r23, $sp, 0
|
||||
addi.d $sp, $sp, 8
|
||||
jirl $r0, $r1, 0x00
|
||||
|
||||
EPILOGUE
|
||||
383
kernel/loongarch64/cgemv_n_8_lasx.S
Normal file
383
kernel/loongarch64/cgemv_n_8_lasx.S
Normal file
@@ -0,0 +1,383 @@
|
||||
/*******************************************************************************
|
||||
Copyright (c) 2024, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
#define ASSEMBLER
|
||||
|
||||
#include "common.h"
|
||||
#include "loongarch64_asm.S"
|
||||
|
||||
/*********************************************************************
|
||||
* 2024/02/20 guxiwei
|
||||
* UTEST : OK
|
||||
* CTEST : OK
|
||||
* TEST : OK
|
||||
*
|
||||
*
|
||||
*********************************************************************/
|
||||
|
||||
/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
|
||||
* FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
|
||||
*/
|
||||
#define M $r4
|
||||
#define N $r5
|
||||
#define ALPHA_R $f0
|
||||
#define ALPHA_I $f1
|
||||
#define A $r7
|
||||
#define LDA $r8
|
||||
#define X $r9
|
||||
#define INC_X $r10
|
||||
#define Y $r11
|
||||
#define INC_Y $r6
|
||||
|
||||
#define J $r12
|
||||
#define I $r13
|
||||
#define K $r14
|
||||
#define Y_ORG $r15
|
||||
#define OFFSET $r16
|
||||
#define K_LDA $r17
|
||||
#define M8 $r18
|
||||
#define T0 $r19
|
||||
#define PA0 $r20
|
||||
#define PA1 $r23
|
||||
#define PA2 $r24
|
||||
#define PA3 $r25
|
||||
#define PA4 $r26
|
||||
#define PA5 $r27
|
||||
#define PA6 $r28
|
||||
#define PA7 $r29
|
||||
|
||||
#define VALPHA $xr1
|
||||
#define X0 $xr2
|
||||
#define X1 $xr3
|
||||
#define X2 $xr4
|
||||
#define X3 $xr5
|
||||
#define X4 $xr6
|
||||
#define X5 $xr7
|
||||
#define X6 $xr8
|
||||
#define X7 $xr9
|
||||
#define Y0 $xr10
|
||||
#define Y1 $xr11
|
||||
#define A0 $xr12
|
||||
#define A1 $xr13
|
||||
#define A2 $xr14
|
||||
#define A3 $xr15
|
||||
#define A4 $xr16
|
||||
#define A5 $xr17
|
||||
#define A6 $xr18
|
||||
#define A7 $xr19
|
||||
#define A8 $xr20
|
||||
#define A9 $xr21
|
||||
#define A10 $xr22
|
||||
#define A11 $xr23
|
||||
#define A12 $xr24
|
||||
#define A13 $xr25
|
||||
#define A14 $xr26
|
||||
#define A15 $xr27
|
||||
#define TMP0 $xr28
|
||||
#define TMP1 $xr29
|
||||
#define TMP2 $xr30
|
||||
|
||||
#if !defined(CONJ)
|
||||
#if !defined(XCONJ)
|
||||
#define GXCONJ 0
|
||||
#define GCONJ 0
|
||||
#else
|
||||
#define GXCONJ 1
|
||||
#define GCONJ 0
|
||||
#endif
|
||||
#else
|
||||
#if !defined(XCONJ)
|
||||
#define GXCONJ 0
|
||||
#define GCONJ 1
|
||||
#else
|
||||
#define GXCONJ 1
|
||||
#define GCONJ 1
|
||||
#endif
|
||||
#endif
|
||||
|
||||
/*
 * CLOAD_X_8: fetch the eight 8-byte x elements at X + 0x00 .. X + 0x38
 * into X0..X7, then combine each with VALPHA in place.
 * NOTE(review): GLDREPL and GCOMPLEXMUL are defined outside this view
 * (presumably in loongarch64_asm.S): presumably a replicating load and a
 * complex multiply that uses TMP0..TMP2 as scratch, with GXCONJ selecting
 * the conjugated form -- confirm.
 */
.macro CLOAD_X_8
|
||||
GLDREPL xv, d, X0, X, 0x00, X1, X, 0x08, X2, X, 0x10, X3, X, 0x18, \
|
||||
X4, X, 0x20, X5, X, 0x28, X6, X, 0x30, X7, X, 0x38
|
||||
GCOMPLEXMUL GXCONJ, \
|
||||
xvf, s, X0, X0, VALPHA, TMP0, TMP1, TMP2, \
|
||||
X1, X1, VALPHA, TMP0, TMP1, TMP2, \
|
||||
X2, X2, VALPHA, TMP0, TMP1, TMP2, \
|
||||
X3, X3, VALPHA, TMP0, TMP1, TMP2, \
|
||||
X4, X4, VALPHA, TMP0, TMP1, TMP2, \
|
||||
X5, X5, VALPHA, TMP0, TMP1, TMP2, \
|
||||
X6, X6, VALPHA, TMP0, TMP1, TMP2, \
|
||||
X7, X7, VALPHA, TMP0, TMP1, TMP2
|
||||
.endm
|
||||
|
||||
/*
 * CLOAD_X_8_GAP: strided counterpart of CLOAD_X_8. Broadcasts (xvldrepl.d)
 * the 8-byte element at X, X + INC_X, ..., X + 7 * INC_X into X0..X7, with
 * T0 walking the addresses and INC_X used directly as a byte offset. X itself
 * is not modified. The same GCOMPLEXMUL against VALPHA is then applied.
 * NOTE(review): PTR_ADD and GCOMPLEXMUL are defined outside this view -- confirm
 * their exact expansion.
 */
.macro CLOAD_X_8_GAP
|
||||
xvldrepl.d X0, X, 0x00
|
||||
PTR_ADD T0, X, INC_X
|
||||
xvldrepl.d X1, T0, 0x00
|
||||
PTR_ADD T0, T0, INC_X
|
||||
xvldrepl.d X2, T0, 0x00
|
||||
PTR_ADD T0, T0, INC_X
|
||||
xvldrepl.d X3, T0, 0x00
|
||||
PTR_ADD T0, T0, INC_X
|
||||
xvldrepl.d X4, T0, 0x00
|
||||
PTR_ADD T0, T0, INC_X
|
||||
xvldrepl.d X5, T0, 0x00
|
||||
PTR_ADD T0, T0, INC_X
|
||||
xvldrepl.d X6, T0, 0x00
|
||||
PTR_ADD T0, T0, INC_X
|
||||
xvldrepl.d X7, T0, 0x00
|
||||
|
||||
GCOMPLEXMUL GXCONJ, \
|
||||
xvf, s, X0, X0, VALPHA, TMP0, TMP1, TMP2, \
|
||||
X1, X1, VALPHA, TMP0, TMP1, TMP2, \
|
||||
X2, X2, VALPHA, TMP0, TMP1, TMP2, \
|
||||
X3, X3, VALPHA, TMP0, TMP1, TMP2, \
|
||||
X4, X4, VALPHA, TMP0, TMP1, TMP2, \
|
||||
X5, X5, VALPHA, TMP0, TMP1, TMP2, \
|
||||
X6, X6, VALPHA, TMP0, TMP1, TMP2, \
|
||||
X7, X7, VALPHA, TMP0, TMP1, TMP2
|
||||
.endm
|
||||
|
||||
/*
 * CLOAD_Y_8: load Y0 from Y + 0 and Y1 from Y + 0x20 (contiguous y).
 * NOTE(review): GLD is defined outside this view; with the "xv" prefix it
 * presumably expands to xvld -- confirm.
 */
.macro CLOAD_Y_8
|
||||
GLD xv, , Y0, Y, 0, Y1, Y, 0x20
|
||||
.endm
|
||||
|
||||
/*
 * CLOAD_Y_8_GAP: gather eight strided 8-byte y elements at Y + k * INC_Y
 * (k = 0..7; INC_Y is used as a byte offset).
 * The scalar loads land in $f10, $f13, $f14, $f15 (k = 0..3) and
 * $f11, $f17, $f18, $f19 (k = 4..7). By the #defines above these are the
 * same register numbers as Y0, A1, A2, A3 and Y1, A5, A6, A7, so
 * A1-A3 and A5-A7 are overwritten as staging registers.
 * NOTE(review): GINSVE0 is defined outside this view; it presumably inserts
 * element 0 of each staging register into lanes 1..3 of Y0 / Y1 -- confirm.
 */
.macro CLOAD_Y_8_GAP
|
||||
fld.d $f10, Y, 0
|
||||
fldx.d $f13, Y, INC_Y
|
||||
PTR_ALSL T0, INC_Y, Y, 1
|
||||
fld.d $f14, T0, 0
|
||||
fldx.d $f15, T0, INC_Y
|
||||
PTR_ALSL T0, INC_Y, Y, 2
|
||||
fld.d $f11, T0, 0
|
||||
fldx.d $f17, T0, INC_Y
|
||||
PTR_ADD T0, T0, INC_Y
|
||||
PTR_ADD T0, T0, INC_Y
|
||||
fld.d $f18, T0, 0
|
||||
fldx.d $f19, T0, INC_Y
|
||||
GINSVE0 xv, d, Y0, A1, 1, Y0, A2, 2, Y0, A3, 3, Y1, A5, 1, Y1, A6, 2, Y1, A7, 3
|
||||
.endm
|
||||
|
||||
/*
 * CSTORE_Y_8_GAP: scatter counterpart of CLOAD_Y_8_GAP. Writes the four
 * 64-bit lanes of Y0 and then of Y1 to Y + k * INC_Y for k = 0..7 using
 * xvstelm.d; T0 holds the running address and Y itself is not modified.
 */
.macro CSTORE_Y_8_GAP
|
||||
xvstelm.d Y0, Y, 0, 0
|
||||
PTR_ADD T0, Y, INC_Y
|
||||
xvstelm.d Y0, T0, 0, 1
|
||||
PTR_ADD T0, T0, INC_Y
|
||||
xvstelm.d Y0, T0, 0, 2
|
||||
PTR_ADD T0, T0, INC_Y
|
||||
xvstelm.d Y0, T0, 0, 3
|
||||
|
||||
PTR_ADD T0, T0, INC_Y
|
||||
xvstelm.d Y1, T0, 0, 0
|
||||
PTR_ADD T0, T0, INC_Y
|
||||
xvstelm.d Y1, T0, 0, 1
|
||||
PTR_ADD T0, T0, INC_Y
|
||||
xvstelm.d Y1, T0, 0, 2
|
||||
PTR_ADD T0, T0, INC_Y
|
||||
xvstelm.d Y1, T0, 0, 3
|
||||
.endm
|
||||
|
||||
/*
 * CGEMV_N_8x8: one 8-row by 8-column step.
 * Loads two vectors through each of the column pointers PA0..PA7
 * (A0/A1 via PA0, A2/A3 via PA1, ..., A14/A15 via PA7), then for each
 * column k folds X<k> with A<2k> into Y0 and with A<2k+1> into Y1.
 * NOTE(review): GLD_INC and GCOMPLEXMADD are defined outside this view.
 * GLD_INC presumably advances the pointer by 0x20 after every load (both
 * loads of a pair use offset 0), and GCOMPLEXMADD presumably computes
 * out = x * a + acc in complex arithmetic with TMP0..TMP2 as scratch and
 * GXCONJ / GCONJ selecting conjugation -- confirm.
 */
.macro CGEMV_N_8x8
|
||||
GLD_INC xv, , 0x20, \
|
||||
A0, PA0, 0, A1, PA0, 0, \
|
||||
A2, PA1, 0, A3, PA1, 0, \
|
||||
A4, PA2, 0, A5, PA2, 0, \
|
||||
A6, PA3, 0, A7, PA3, 0, \
|
||||
A8, PA4, 0, A9, PA4, 0, \
|
||||
A10, PA5, 0, A11, PA5, 0, \
|
||||
A12, PA6, 0, A13, PA6, 0, \
|
||||
A14, PA7, 0, A15, PA7, 0
|
||||
|
||||
GCOMPLEXMADD GXCONJ, GCONJ, \
|
||||
xvf, s, Y0, X0, A0, Y0, TMP0, TMP1, TMP2, Y1, X0, A1, Y1, TMP0, TMP1, TMP2, \
|
||||
Y0, X1, A2, Y0, TMP0, TMP1, TMP2, Y1, X1, A3, Y1, TMP0, TMP1, TMP2, \
|
||||
Y0, X2, A4, Y0, TMP0, TMP1, TMP2, Y1, X2, A5, Y1, TMP0, TMP1, TMP2, \
|
||||
Y0, X3, A6, Y0, TMP0, TMP1, TMP2, Y1, X3, A7, Y1, TMP0, TMP1, TMP2, \
|
||||
Y0, X4, A8, Y0, TMP0, TMP1, TMP2, Y1, X4, A9, Y1, TMP0, TMP1, TMP2, \
|
||||
Y0, X5, A10, Y0, TMP0, TMP1, TMP2, Y1, X5, A11, Y1, TMP0, TMP1, TMP2, \
|
||||
Y0, X6, A12, Y0, TMP0, TMP1, TMP2, Y1, X6, A13, Y1, TMP0, TMP1, TMP2, \
|
||||
Y0, X7, A14, Y0, TMP0, TMP1, TMP2, Y1, X7, A15, Y1, TMP0, TMP1, TMP2
|
||||
.endm
|
||||
|
||||
/*
 * CSTORE_Y_8: write Y0 to Y + 0 and Y1 to Y + 0x20 (contiguous y); the
 * mirror of CLOAD_Y_8.
 * NOTE(review): GST is defined outside this view; with the "xv" prefix it
 * presumably expands to xvst -- confirm.
 */
.macro CSTORE_Y_8
|
||||
GST xv, , Y0, Y, 0, Y1, Y, 0x20
|
||||
.endm
|
||||
|
||||
/*
 * CLOAD_X_1: single-column variant of CLOAD_X_8. Fetches the 8-byte element
 * at X into X0 and combines it with VALPHA in place.
 * NOTE(review): GLDREPL / GCOMPLEXMUL are defined outside this view -- see
 * the assumptions noted on their use elsewhere in this file; confirm.
 */
.macro CLOAD_X_1
|
||||
GLDREPL xv, d, X0, X, 0x00
|
||||
GCOMPLEXMUL GXCONJ, \
|
||||
xvf, s, X0, X0, VALPHA, TMP0, TMP1, TMP2
|
||||
.endm
|
||||
|
||||
/*
 * CLOAD_Y_1: load one 8-byte y element from Y into $f10, which carries the
 * same register number as Y0 ($xr10).
 */
.macro CLOAD_Y_1
|
||||
fld.d $f10, Y, 0
|
||||
.endm
|
||||
|
||||
/*
 * CGEMV_N_1x8: one row against eight columns.
 * Loads one 8-byte element through each of PA0..PA7 into
 * $f12, $f14, ..., $f26 (the register numbers of A0, A2, ..., A14) and
 * folds X<k> with A<2k> into Y0 for k = 0..7.
 * NOTE(review): GLD_INC / GCOMPLEXMADD are defined outside this view;
 * GLD_INC presumably advances each pointer by 0x08 after its load -- confirm.
 */
.macro CGEMV_N_1x8
|
||||
GLD_INC f, d, 0x08, $f12, PA0, 0, $f14, PA1, 0, $f16, PA2, 0, $f18, PA3, 0, \
|
||||
$f20, PA4, 0, $f22, PA5, 0, $f24, PA6, 0, $f26, PA7, 0
|
||||
GCOMPLEXMADD GXCONJ, GCONJ, \
|
||||
xvf, s, Y0, X0, A0, Y0, TMP0, TMP1, TMP2, \
|
||||
Y0, X1, A2, Y0, TMP0, TMP1, TMP2, \
|
||||
Y0, X2, A4, Y0, TMP0, TMP1, TMP2, \
|
||||
Y0, X3, A6, Y0, TMP0, TMP1, TMP2, \
|
||||
Y0, X4, A8, Y0, TMP0, TMP1, TMP2, \
|
||||
Y0, X5, A10, Y0, TMP0, TMP1, TMP2, \
|
||||
Y0, X6, A12, Y0, TMP0, TMP1, TMP2, \
|
||||
Y0, X7, A14, Y0, TMP0, TMP1, TMP2
|
||||
.endm
|
||||
|
||||
/*
 * CSTORE_Y_1: store the 8-byte value in $f10 (same register number as Y0)
 * back to Y; the mirror of CLOAD_Y_1.
 */
.macro CSTORE_Y_1
|
||||
fst.d $f10, Y, 0
|
||||
.endm
|
||||
|
||||
/*
 * CGEMV_N_1x1: one row against one column. Loads the 8-byte element at PA0
 * into $f12 (same register number as A0), advances PA0 by 8 bytes, and folds
 * X0 with A0 into Y0.
 * NOTE(review): GCOMPLEXMADD is defined outside this view -- confirm its
 * operand order and conjugation handling.
 */
.macro CGEMV_N_1x1
|
||||
fld.d $f12, PA0, 0
|
||||
PTR_ADDI PA0, PA0, 0x08
|
||||
GCOMPLEXMADD GXCONJ, GCONJ, \
|
||||
xvf, s, Y0, X0, A0, Y0, TMP0, TMP1, TMP2
|
||||
.endm
|
||||
|
||||
/*
 * CGEMV_N_LASX: the complete loop nest, expanded once per access pattern.
 *   XW        - tag spliced into every local label (.L_<XW>_...) so that
 *               several expansions can coexist in one object.
 *   X_8, X_1  - suffixes appended to CLOAD_ to pick the 8-column and
 *               1-column x loaders.
 *   Y_8, Y_1  - suffixes appended to CLOAD_ / CSTORE_ to pick the 8-row and
 *               1-row y load and store macros.
 * Columns are handled 8 at a time (J = N >> 3) and then singly (N & 7);
 * within each, rows are handled 8 at a time (first part only) and singly.
 * Y is reset to Y_ORG at the start of every column pass. Every exit leads
 * to .L_END, which is not defined inside this macro.
 * K is cleared and incremented alongside the row loops but is not read
 * anywhere in this macro.
 * NOTE(review): PTR_* and GADD are helper macros defined outside this view,
 * presumably pointer-width shift/add wrappers -- confirm.
 */
.macro CGEMV_N_LASX XW:req, X_8:req, X_1:req, Y_8:req, Y_1:req
|
||||
PTR_SRLI J, N, 3
|
||||
beqz J, .L_\XW\()_N_7
|
||||
/* K_LDA = 8 * LDA - M8: added to PA0..PA7 after each pass over the rows. */
PTR_SLLI K_LDA, LDA, 3
|
||||
PTR_SUB K_LDA, K_LDA, M8
|
||||
.L_\XW\()_N_L8:
|
||||
CLOAD_\X_8
|
||||
xor K, K, K
|
||||
move Y, Y_ORG
|
||||
PTR_SRLI I, M, 3
|
||||
beqz I, .L_\XW\()_M_7
|
||||
.align 5
|
||||
/* 8 rows x 8 columns per iteration; Y advances by 8 * INC_Y. */
.L_\XW\()_M_L8:
|
||||
CLOAD_\Y_8
|
||||
CGEMV_N_8x8
|
||||
CSTORE_\Y_8
|
||||
PTR_ADDI I, I, -1
|
||||
PTR_ALSL Y, INC_Y, Y, 3
|
||||
PTR_ADDI K, K, 8
|
||||
bnez I, .L_\XW\()_M_L8
|
||||
.L_\XW\()_M_7:
|
||||
andi I, M, 7
|
||||
beqz I, .L_\XW\()_M_END
|
||||
.align 5
|
||||
/* Remaining M & 7 rows, one row x 8 columns per iteration. */
.L_\XW\()_M_L1:
|
||||
CLOAD_\Y_1
|
||||
CGEMV_N_1x8
|
||||
CSTORE_\Y_1
|
||||
PTR_ADDI I, I, -1
|
||||
PTR_ADD Y, Y, INC_Y
|
||||
PTR_ADDI K, K, 1
|
||||
bnez I, .L_\XW\()_M_L1
|
||||
.L_\XW\()_M_END:
|
||||
PTR_ADDI J, J, -1
|
||||
/* Add K_LDA to all eight column pointers; the #else branch repeats the 64-bit form. */
#if __loongarch_grlen == 64
|
||||
GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \
|
||||
PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA
|
||||
#elif __loongarch_grlen == 32
|
||||
GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \
|
||||
PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA
|
||||
#else
|
||||
GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \
|
||||
PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA
|
||||
#endif
|
||||
/* X advances by 8 * INC_Y-style stride: X += 8 * INC_X. */
PTR_ALSL X, INC_X, X, 3
|
||||
bnez J, .L_\XW\()_N_L8
|
||||
/* Remaining N & 7 columns, handled one column at a time through PA0 only. */
.L_\XW\()_N_7:
|
||||
andi J, N, 7
|
||||
beqz J, .L_END
|
||||
.L_\XW\()_N_L1:
|
||||
CLOAD_\X_1
|
||||
xor K, K, K
|
||||
move Y, Y_ORG
|
||||
move I, M
|
||||
beqz I, .L_END
|
||||
.align 5
|
||||
.L_\XW\()_N_1_M_L1:
|
||||
CLOAD_\Y_1
|
||||
CGEMV_N_1x1
|
||||
CSTORE_\Y_1
|
||||
PTR_ADDI I, I, -1
|
||||
PTR_ADD Y, Y, INC_Y
|
||||
PTR_ADDI K, K, 1
|
||||
bnez I, .L_\XW\()_N_1_M_L1
|
||||
.L_\XW\()_N_1_M_END:
|
||||
PTR_ADDI J, J, -1
|
||||
/* K_LDA = LDA - M8 here: PA0 was advanced 8 bytes per row by CGEMV_N_1x1. */
PTR_SUB K_LDA, LDA, M8
|
||||
PTR_ADD PA0, PA0, K_LDA
|
||||
PTR_ADD X, X, INC_X
|
||||
bnez J, .L_\XW\()_N_L1
|
||||
|
||||
b .L_END
|
||||
.endm
|
||||
|
||||
// Kernel entry point (non-transposed single-precision complex GEMV).
// Classifies inc_x / inc_y as unit or non-unit, converts the element counts
// LDA, INC_X, INC_Y and M to byte counts (shift by 3, result for M in M8),
// builds VALPHA from $xr0/$xr1, sets up eight column pointers PA0..PA7 and
// dispatches through a small offset table to one of four CGEMV_N_LASX
// expansions.
PROLOGUE
|
||||
// inc_y is read from the stack at $sp + 0 before push_if_used runs.
PTR_LD INC_Y, $sp, 0
|
||||
push_if_used 17 + 7, 31
|
||||
PTR_ADDI K, $r0, 0x01
|
||||
PTR_SUB I, INC_X, K
|
||||
PTR_SUB J, INC_Y, K
|
||||
maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */
|
||||
maskeqz J, K, J /* if(inc_y == 1) j = 0; else j = 1; */
|
||||
// Table index = 2 * (inc_x != 1) + (inc_y != 1)
PTR_ALSL I, I, J, 1
|
||||
GSLLI , d, LDA, LDA, 3, INC_X, INC_X, 3, INC_Y, INC_Y, 3, M8, M, 3
|
||||
// Init VALPHA
|
||||
// Interleave the low words of $xr0 and $xr1 and replicate the resulting 64-bit
// pair into every lane of VALPHA.
xvpackev.w $xr0, $xr1, $xr0
|
||||
xvreplve0.d VALPHA, $xr0
|
||||
move Y_ORG, Y
|
||||
move PA0, A
|
||||
// PA(k) = PA(k-1) + LDA: pointers to eight consecutive columns.
#if __loongarch_grlen == 64
|
||||
GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \
|
||||
PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA
|
||||
#elif __loongarch_grlen == 32
|
||||
GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \
|
||||
PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA
|
||||
#else
|
||||
GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \
|
||||
PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA
|
||||
#endif
|
||||
// Each table entry is a 16-bit offset relative to .L_GAP_TABLE; read entry I
// and jump to .L_GAP_TABLE + offset.
la.local T0, .L_GAP_TABLE
|
||||
PTR_ALSL I, I, T0, 1
|
||||
ld.h K, I, 0 // Obtain the offset address
|
||||
PTR_ADD T0, T0, K
|
||||
jirl $r0, T0, 0
|
||||
.L_GAP_TABLE:
|
||||
.hword .L_GAP_0_0 - .L_GAP_TABLE
|
||||
.hword .L_GAP_0_1 - .L_GAP_TABLE
|
||||
.hword .L_GAP_1_0 - .L_GAP_TABLE
|
||||
.hword .L_GAP_1_1 - .L_GAP_TABLE
|
||||
.L_GAP_0_0: /* if (inc_x == 1) && (incy == 1) */
|
||||
CGEMV_N_LASX GAP_0_0, X_8, X_1, Y_8, Y_1
|
||||
.L_GAP_0_1: /* if (inc_x == 1) && (incy != 1) */
|
||||
CGEMV_N_LASX GAP_0_1, X_8, X_1, Y_8_GAP, Y_1
|
||||
.L_GAP_1_0: /* if (inc_x != 1) && (incy == 1) */
|
||||
CGEMV_N_LASX GAP_1_0, X_8_GAP, X_1, Y_8, Y_1
|
||||
.L_GAP_1_1: /* if (inc_x != 1) && (incy != 1) */
|
||||
CGEMV_N_LASX GAP_1_1, X_8_GAP, X_1, Y_8_GAP, Y_1
|
||||
.L_END:
|
||||
pop_if_used 17 + 7, 31
|
||||
jirl $r0, $r1, 0x0
|
||||
EPILOGUE
|
||||
342
kernel/loongarch64/cgemv_t_8_lasx.S
Normal file
342
kernel/loongarch64/cgemv_t_8_lasx.S
Normal file
@@ -0,0 +1,342 @@
|
||||
/*******************************************************************************
|
||||
Copyright (c) 2024, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
#define ASSEMBLER
|
||||
|
||||
#include "common.h"
|
||||
#include "loongarch64_asm.S"
|
||||
|
||||
/*********************************************************************
|
||||
* 2024/02/20 guxiwei
|
||||
* UTEST : OK
|
||||
* CTEST : OK
|
||||
* TEST : OK
|
||||
*
|
||||
*
|
||||
*********************************************************************/
|
||||
|
||||
/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
|
||||
* FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
|
||||
*/
|
||||
// Register aliases for the transposed kernel.
// General registers: the first group names the incoming arguments (INC_Y is
// loaded from the stack into $r6 by the prologue); the second group holds loop
// counters and working pointers.
#define M $r4
|
||||
#define N $r5
|
||||
#define ALPHA_R $f0
|
||||
#define ALPHA_I $f1
|
||||
#define A $r7
|
||||
#define LDA $r8
|
||||
#define X $r9
|
||||
#define INC_X $r10
|
||||
#define Y $r11
|
||||
#define INC_Y $r6
|
||||
|
||||
#define J $r12
|
||||
#define I $r13
|
||||
// K and PY0 share $r14: K is only used by the prologue (constant 1 and the
// jump-table offset), PY0 only inside the loop macro.
#define K $r14
|
||||
#define PY0 $r14
|
||||
#define X_ORG $r15
|
||||
#define PY1 $r16
|
||||
#define K_LDA $r17
|
||||
#define PY2 $r18
|
||||
#define T0 $r19
|
||||
#define PA0 $r20
|
||||
#define PA1 $r23
|
||||
#define PA2 $r24
|
||||
#define PA3 $r25
|
||||
#define PA4 $r26
|
||||
#define PA5 $r27
|
||||
#define PA6 $r28
|
||||
#define PA7 $r29
|
||||
#define M8 $r30
|
||||
|
||||
// Vector registers. VALPHA is $xr0 and X0 is $xr1, i.e. the registers whose low
// lanes are ALPHA_R ($f0) and ALPHA_I ($f1); the prologue packs alpha into
// VALPHA before X0 is first written.
#define VALPHA $xr0
|
||||
#define X0 $xr1
|
||||
#define X1 $xr2
|
||||
#define A0 $xr3
|
||||
#define A1 $xr4
|
||||
#define A2 $xr5
|
||||
#define A3 $xr6
|
||||
#define A4 $xr7
|
||||
#define A5 $xr8
|
||||
#define A6 $xr9
|
||||
#define A7 $xr10
|
||||
#define A8 $xr11
|
||||
#define A9 $xr12
|
||||
#define A10 $xr13
|
||||
#define A11 $xr14
|
||||
#define A12 $xr15
|
||||
#define A13 $xr16
|
||||
#define A14 $xr17
|
||||
#define A15 $xr18
|
||||
#define TP0 $xr19
|
||||
#define TP1 $xr20
|
||||
#define TP2 $xr21
|
||||
#define TP3 $xr22
|
||||
#define TP4 $xr23
|
||||
#define TP5 $xr24
|
||||
#define TP6 $xr25
|
||||
#define TP7 $xr26
|
||||
#define TMP0 $xr27
|
||||
#define TMP1 $xr28
|
||||
#define TMP2 $xr29
|
||||
// Y0..Y7 deliberately reuse the registers of A0..A7 ($xr3..$xr10): once the
// 8x8 row loop is finished the A0..A7 tiles are dead and the same registers
// hold the reduced column sums.
#define Y0 $xr3
|
||||
#define Y1 $xr4
|
||||
#define Y2 $xr5
|
||||
#define Y3 $xr6
|
||||
#define Y4 $xr7
|
||||
#define Y5 $xr8
|
||||
#define Y6 $xr9
|
||||
#define Y7 $xr10
|
||||
|
||||
// Conjugation selectors passed to GCOMPLEXMADD as its xconj / conj arguments.
// GXCONJ1/GCONJ1 are used for the A * x products: GXCONJ1 is 0 when CONJ and
// XCONJ are both defined or both undefined and 1 when exactly one of them is
// defined; GCONJ1 is 0 in either case.
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
||||
#define GXCONJ1 0
|
||||
#define GCONJ1 0
|
||||
#else
|
||||
#define GXCONJ1 1
|
||||
#define GCONJ1 0
|
||||
#endif
|
||||
|
||||
// GXCONJ2/GCONJ2 are used for the final y += alpha * sum update: GXCONJ2 is
// always 0, GCONJ2 is 1 exactly when XCONJ is defined.
#if !defined(XCONJ)
|
||||
#define GXCONJ2 0
|
||||
#define GCONJ2 0
|
||||
#else
|
||||
#define GXCONJ2 0
|
||||
#define GCONJ2 1
|
||||
#endif
|
||||
|
||||
// ZERO_Y8: clear the eight partial-sum accumulators TP0..TP7 by XOR-ing each
// register with itself.
.macro ZERO_Y8
|
||||
GXOR xv, v, TP0, TP0, TP0, TP1, TP1, TP1, TP2, TP2, TP2, TP3, TP3, TP3, \
|
||||
TP4, TP4, TP4, TP5, TP5, TP5, TP6, TP6, TP6, TP7, TP7, TP7
|
||||
.endm
|
||||
|
||||
// ZERO_Y1: clear the single accumulator TP0 (used by the one-column tail loop).
.macro ZERO_Y1
|
||||
GXOR xv, v, TP0, TP0, TP0
|
||||
.endm
|
||||
|
||||
// CLOAD_X8: contiguous x. Load two 32-byte vectors from X + 0x00 and X + 0x20
// into X0 and X1, i.e. eight 8-byte elements.
.macro CLOAD_X8
|
||||
GLD xv, , X0, X, 0x00, X1, X, 0x20
|
||||
.endm
|
||||
|
||||
// CLOAD_X8_GAP: strided x. Gather eight 8-byte elements spaced INC_X bytes
// apart into X0 (elements 0..3) and X1 (elements 4..7).
// The scalar loads use raw $f names: $f1/$f2 are the low lanes of X0/X1 and
// $f3/$f4/$f5 those of A0/A1/A2, which serve as staging registers before
// GINSVE0 inserts them into 64-bit lanes 1..3. Clobbers T0, A0, A1, A2.
.macro CLOAD_X8_GAP
|
||||
fld.d $f1, X, 0x00
|
||||
fldx.d $f2, X, INC_X
|
||||
// T0 = X + 2 * INC_X
PTR_ALSL T0, INC_X, X, 1
|
||||
fld.d $f3, T0, 0x00
|
||||
fldx.d $f4, T0, INC_X
|
||||
GINSVE0 xv, d, X0, X1, 1, X0, A0, 2, X0, A1, 3
|
||||
// T0 = X + 4 * INC_X; element 4 lands directly in lane 0 of X1 ($f2)
PTR_ALSL T0, INC_X, X, 2
|
||||
fld.d $f2, T0, 0x00
|
||||
fldx.d $f3, T0, INC_X
|
||||
// T0 = X + 6 * INC_X
PTR_ALSL T0, INC_X, T0, 1
|
||||
fld.d $f4, T0, 0x00
|
||||
fldx.d $f5, T0, INC_X
|
||||
GINSVE0 xv, d, X1, A0, 1, X1, A1, 2, X1, A2, 3
|
||||
.endm
|
||||
|
||||
// CGEMV_T_8x8: one 8-row x 8-column tile of the transposed product.
// For each column pointer PAk two 32-byte vectors are loaded into A(2k) and
// A(2k+1) (GLD_INC with step 0x20 presumably advances the pointer after each
// load, so the pair covers eight consecutive elements of the column).
// Each accumulator then receives two complex multiply-adds:
//   TPk = A(2k) * X0 + TPk;  TPk = A(2k+1) * X1 + TPk
// The per-lane partial sums in TPk are reduced later by GCOMPLEXACC.
.macro CGEMV_T_8x8
|
||||
GLD_INC xv, , 0x20, \
|
||||
A0, PA0, 0, A1, PA0, 0, \
|
||||
A2, PA1, 0, A3, PA1, 0, \
|
||||
A4, PA2, 0, A5, PA2, 0, \
|
||||
A6, PA3, 0, A7, PA3, 0, \
|
||||
A8, PA4, 0, A9, PA4, 0, \
|
||||
A10, PA5, 0, A11, PA5, 0, \
|
||||
A12, PA6, 0, A13, PA6, 0, \
|
||||
A14, PA7, 0, A15, PA7, 0
|
||||
GCOMPLEXMADD GXCONJ1, GCONJ1, \
|
||||
xvf, s, TP0, A0, X0, TP0, TMP0, TMP1, TMP2, TP0, A1, X1, TP0, TMP0, TMP1, TMP2, \
|
||||
TP1, A2, X0, TP1, TMP0, TMP1, TMP2, TP1, A3, X1, TP1, TMP0, TMP1, TMP2, \
|
||||
TP2, A4, X0, TP2, TMP0, TMP1, TMP2, TP2, A5, X1, TP2, TMP0, TMP1, TMP2, \
|
||||
TP3, A6, X0, TP3, TMP0, TMP1, TMP2, TP3, A7, X1, TP3, TMP0, TMP1, TMP2, \
|
||||
TP4, A8, X0, TP4, TMP0, TMP1, TMP2, TP4, A9, X1, TP4, TMP0, TMP1, TMP2, \
|
||||
TP5, A10, X0, TP5, TMP0, TMP1, TMP2, TP5, A11, X1, TP5, TMP0, TMP1, TMP2, \
|
||||
TP6, A12, X0, TP6, TMP0, TMP1, TMP2, TP6, A13, X1, TP6, TMP0, TMP1, TMP2, \
|
||||
TP7, A14, X0, TP7, TMP0, TMP1, TMP2, TP7, A15, X1, TP7, TMP0, TMP1, TMP2
|
||||
.endm
|
||||
|
||||
// CGEMV_T_LASX: loop nest of the transposed kernel.
//   XW - tag pasted into every local label
//   X8 - suffix of the CLOAD_ macro used to load eight x elements
//        (X8 for contiguous x, X8_GAP for strided x)
// Columns are processed in blocks of 8 and then one at a time. For each block
// the dot products with x are accumulated over all M rows (8 rows per
// iteration, then single rows) and finally y[j] = alpha * sum + y[j] is applied
// to the 8 (or 1) affected y elements. LDA, INC_X, INC_Y and M8 are byte
// quantities. Always leaves through .L_END.
.macro CGEMV_T_LASX XW:req, X8:req
|
||||
PTR_SRLI J, N, 3
|
||||
beqz J, .L_\XW\()_N_7
|
||||
// K_LDA = 8 * LDA - M8: moves a column pointer that has walked all M rows to
// the top of the column 8 further on.
PTR_SLLI K_LDA, LDA, 3
|
||||
PTR_SUB K_LDA, K_LDA, M8
|
||||
.L_\XW\()_N_L8:
|
||||
ZERO_Y8
|
||||
move X, X_ORG
|
||||
PTR_SRLI I, M, 3
|
||||
beqz I, .L_\XW\()_M_7
|
||||
.align 5
|
||||
.L_\XW\()_M_L8:
|
||||
CLOAD_\X8
|
||||
CGEMV_T_8x8
|
||||
PTR_ADDI I, I, -1
|
||||
PTR_ALSL X, INC_X, X, 3
|
||||
bnez I, .L_\XW\()_M_L8
|
||||
.L_\XW\()_M_7:
|
||||
// Accumulated
|
||||
// Reduce the per-lane partial sums of TPk into Yk. Yk shares its register with
// Ak, which is free again at this point. Runs even when the 8-row loop was
// skipped; TPk is then still zero.
GCOMPLEXACC xvf, s, Y0, TP0, Y1, TP1, Y2, TP2, Y3, TP3, Y4, TP4, \
|
||||
Y5, TP5, Y6, TP6, Y7, TP7
|
||||
andi I, M, 7
|
||||
beqz I, .L_\XW\()_M_END
|
||||
.align 5
|
||||
.L_\XW\()_M_L1:
|
||||
// One remaining row: $f1 is the low lane of X0, $f11..$f18 those of A8..A15.
fld.d $f1, X, 0x00
|
||||
fld.d $f11, PA0, 0x00
|
||||
fld.d $f12, PA1, 0x00
|
||||
fld.d $f13, PA2, 0x00
|
||||
fld.d $f14, PA3, 0x00
|
||||
fld.d $f15, PA4, 0x00
|
||||
fld.d $f16, PA5, 0x00
|
||||
fld.d $f17, PA6, 0x00
|
||||
fld.d $f18, PA7, 0x00
|
||||
#if __loongarch_grlen == 64
|
||||
GADDI , d, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08, \
|
||||
PA4, PA4, 0x08, PA5, PA5, 0x08, PA6, PA6, 0x08, PA7, PA7, 0x08
|
||||
#elif __loongarch_grlen == 32
|
||||
GADDI , w, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08, \
|
||||
PA4, PA4, 0x08, PA5, PA5, 0x08, PA6, PA6, 0x08, PA7, PA7, 0x08
|
||||
#else
|
||||
GADDI , d, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08, \
|
||||
PA4, PA4, 0x08, PA5, PA5, 0x08, PA6, PA6, 0x08, PA7, PA7, 0x08
|
||||
#endif
|
||||
|
||||
// A(k) (= Yk, the running column sum) += A(8 + k) * X0 for k = 0..7
GCOMPLEXMADD GXCONJ1, GCONJ1, \
|
||||
xvf, s, A0, A8, X0, A0, TMP0, TMP1, TMP2, A1, A9, X0, A1, TMP0, TMP1, TMP2, \
|
||||
A2, A10, X0, A2, TMP0, TMP1, TMP2, A3, A11, X0, A3, TMP0, TMP1, TMP2, \
|
||||
A4, A12, X0, A4, TMP0, TMP1, TMP2, A5, A13, X0, A5, TMP0, TMP1, TMP2, \
|
||||
A6, A14, X0, A6, TMP0, TMP1, TMP2, A7, A15, X0, A7, TMP0, TMP1, TMP2
|
||||
|
||||
PTR_ADDI I, I, -1
|
||||
PTR_ADD X, X, INC_X
|
||||
bnez I, .L_\XW\()_M_L1
|
||||
.L_\XW\()_M_END:
|
||||
// Load the eight y elements of this block into the low lanes of A8..A15
// ($f11..$f18). PY0/PY1/PY2 point at y[2], y[4] and y[6] of the block.
fld.d $f11, Y, 0x00
|
||||
fldx.d $f12, Y, INC_Y
|
||||
PTR_ALSL PY0, INC_Y, Y, 1
|
||||
fld.d $f13, PY0, 0x00
|
||||
fldx.d $f14, PY0, INC_Y
|
||||
PTR_ALSL PY1, INC_Y, Y, 2
|
||||
fld.d $f15, PY1, 0x00
|
||||
fldx.d $f16, PY1, INC_Y
|
||||
PTR_ALSL PY2, INC_Y, PY1, 1
|
||||
fld.d $f17, PY2, 0x00
|
||||
fldx.d $f18, PY2, INC_Y
|
||||
|
||||
// y[k] = VALPHA * sum[k] + y[k]
GCOMPLEXMADD GXCONJ2, GCONJ2, \
|
||||
xvf, s, A8, VALPHA, A0, A8, TMP0, TMP1, TMP2, A9, VALPHA, A1, A9, TMP0, TMP1, TMP2,\
|
||||
A10, VALPHA, A2, A10, TMP0, TMP1, TMP2, A11, VALPHA, A3, A11, TMP0, TMP1, TMP2,\
|
||||
A12, VALPHA, A4, A12, TMP0, TMP1, TMP2, A13, VALPHA, A5, A13, TMP0, TMP1, TMP2,\
|
||||
A14, VALPHA, A6, A14, TMP0, TMP1, TMP2, A15, VALPHA, A7, A15, TMP0, TMP1, TMP2
|
||||
|
||||
PTR_ADDI J, J, -1
|
||||
#if __loongarch_grlen == 64
|
||||
GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \
|
||||
PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA
|
||||
#elif __loongarch_grlen == 32
|
||||
GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \
|
||||
PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA
|
||||
#else
|
||||
GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \
|
||||
PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA
|
||||
#endif
|
||||
fst.d $f11, Y, 0x00
|
||||
fstx.d $f12, Y, INC_Y
|
||||
fst.d $f13, PY0, 0x00
|
||||
fstx.d $f14, PY0, INC_Y
|
||||
fst.d $f15, PY1, 0x00
|
||||
fstx.d $f16, PY1, INC_Y
|
||||
fst.d $f17, PY2, 0x00
|
||||
fstx.d $f18, PY2, INC_Y
|
||||
PTR_ALSL Y, INC_Y, Y, 3
|
||||
bnez J, .L_\XW\()_N_L8
|
||||
.L_\XW\()_N_7:
|
||||
// Remaining N & 7 columns, one per iteration, using PA0 only.
andi J, N, 7
|
||||
beqz J, .L_END
|
||||
PTR_SUB K_LDA, LDA, M8
|
||||
.L_\XW\()_N_1:
|
||||
ZERO_Y1
|
||||
move X, X_ORG
|
||||
move I, M
|
||||
// M == 0: leave the kernel without touching y.
beqz I, .L_END
|
||||
.align 5
|
||||
.L_\XW\()_N_1_M_L1:
|
||||
// $f3 is the low lane of A0, $f1 that of X0; only lane 0 of TP0 carries the
// running sum in this loop.
fld.d $f3, PA0, 0x00
|
||||
fld.d $f1, X, 0x00
|
||||
GCOMPLEXMADD GXCONJ1, GCONJ1, \
|
||||
xvf, s, TP0, A0, X0, TP0, TMP0, TMP1, TMP2
|
||||
PTR_ADDI I, I, -1
|
||||
PTR_ADD X, X, INC_X
|
||||
PTR_ADDI PA0, PA0, 0x08
|
||||
bnez I, .L_\XW\()_N_1_M_L1
|
||||
.L_\XW\()_N_1_M_END:
|
||||
PTR_ADDI J, J, -1
|
||||
// y[j] = VALPHA * TP0 + y[j], with y[j] staged in the low lane of A0 ($f3)
fld.d $f3, Y, 0x00
|
||||
GCOMPLEXMADD GXCONJ2, GCONJ2, \
|
||||
xvf, s, A0, VALPHA, TP0, A0, TMP0, TMP1, TMP2
|
||||
fst.d $f3, Y, 0x00
|
||||
PTR_ADD PA0, PA0, K_LDA
|
||||
PTR_ADD Y, Y, INC_Y
|
||||
bnez J, .L_\XW\()_N_1
|
||||
|
||||
b .L_END
|
||||
.endm
|
||||
|
||||
// Kernel entry point (transposed single-precision complex GEMV).
// Classifies inc_x as unit or non-unit, converts LDA, INC_X, INC_Y and M to
// byte counts (shift by 3, result for M in M8), builds VALPHA, sets up eight
// column pointers and dispatches through a two-entry offset table to the
// contiguous-x or strided-x expansion of CGEMV_T_LASX.
PROLOGUE
|
||||
// inc_y is read from the stack at $sp + 0 before push_if_used runs.
PTR_LD INC_Y, $sp, 0
|
||||
push_if_used 17 + 8, 30
|
||||
PTR_ADDI K, $r0, 0x01
|
||||
PTR_SUB I, INC_X, K
|
||||
maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */
|
||||
GSLLI , d, LDA, LDA, 3, INC_X, INC_X, 3, INC_Y, INC_Y, 3, M8, M, 3
|
||||
// Init VALPHA
|
||||
// Interleave the low words of $xr0 (alpha_r) and $xr1 (alpha_i) and replicate
// the resulting 64-bit pair into every lane of VALPHA.
xvpackev.w $xr0, $xr1, $xr0
|
||||
xvreplve0.d VALPHA, $xr0
|
||||
move X_ORG, X
|
||||
move PA0, A
|
||||
// PA(k) = PA(k-1) + LDA: pointers to eight consecutive columns.
#if __loongarch_grlen == 64
|
||||
GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \
|
||||
PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA
|
||||
#elif __loongarch_grlen == 32
|
||||
GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \
|
||||
PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA
|
||||
#else
|
||||
GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \
|
||||
PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA
|
||||
#endif
|
||||
// Each table entry is a 16-bit offset relative to .L_GAP_TABLE; read entry I
// and jump to .L_GAP_TABLE + offset.
la.local T0, .L_GAP_TABLE
|
||||
PTR_ALSL I, I, T0, 1
|
||||
ld.h K, I, 0
|
||||
PTR_ADD T0, T0, K
|
||||
jirl $r0, T0, 0
|
||||
.L_GAP_TABLE:
|
||||
.hword .L_GAP_0 - .L_GAP_TABLE
|
||||
.hword .L_GAP_1 - .L_GAP_TABLE
|
||||
.L_GAP_0: /* if (incx == 1) */
|
||||
CGEMV_T_LASX GAP_0, X8
|
||||
.L_GAP_1: /* if (incx != 1) */
|
||||
CGEMV_T_LASX GAP_1, X8_GAP
|
||||
.L_END:
|
||||
pop_if_used 17 + 8, 30
|
||||
jirl $r0, $r1, 0x0
|
||||
EPILOGUE
|
||||
@@ -384,6 +384,246 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
.endif
|
||||
.endm
|
||||
|
||||
//
|
||||
// GCOMPLEXACC: Complex accumulate the values of vector registers
|
||||
// Horizontally adds the complex elements held in "in" and leaves the total in
// the lowest complex element of "out". Further out/in pairs may follow; the
// macro recurses over them.
// pre_op: xvf or vf, differentiate between LSX or LASX instruction
|
||||
// suf_op: s or d, differentiate between single precision or double precision complex numbers
|
||||
// Note: When "pre_op = xvf && suf_op = s", in will be modified.
|
||||
// NOTE(review): for "pre_op = vf && suf_op = d" no instruction is emitted, so
// out is left unwritten - confirm no caller expects out = in in that case.
//
|
||||
.macro GCOMPLEXACC pre_op:req, suf_op:req, out:req, in:req, more:vararg
|
||||
.ifeqs "\pre_op", "xvf"
|
||||
// out = in with its two 128-bit halves exchanged (per the LASX definition of
// xvpermi.q with immediate 0x01)
xvpermi.q \out, \in, 0x01
|
||||
.ifeqs "\suf_op", "s"
|
||||
// in = high half + low half, then fold the upper 64 bits onto the lower 64
\pre_op\()add.\suf_op \in, \out, \in
|
||||
xvpackod.d \out, \in, \in
|
||||
\pre_op\()add.\suf_op \out, \out, \in
|
||||
.else
|
||||
\pre_op\()add.\suf_op \out, \out, \in
|
||||
.endif
|
||||
.endif
|
||||
|
||||
.ifeqs "\pre_op", "vf"
|
||||
.ifeqs "\suf_op", "s"
|
||||
vpackod.d \out, \in, \in
|
||||
\pre_op\()add.\suf_op \out, \out, \in
|
||||
.endif
|
||||
.endif
|
||||
|
||||
.ifnb \more
|
||||
GCOMPLEXACC \pre_op, \suf_op, \more
|
||||
.endif
|
||||
.endm
|
||||
|
||||
//
|
||||
// GCOMPLEXMUL: Complex multiplication, out = in0 * in1
|
||||
// xconj: default value 0.
|
||||
// if !(xconj)
|
||||
// out_r = in0_r * in1_r - in0_i * in1_i;
|
||||
// out_i = in0_r * in1_i + in0_i * in1_r;
|
||||
// else
|
||||
// out_r = in0_r * in1_r + in0_i * in1_i;
|
||||
// out_i = in0_r * in1_i - in0_i * in1_r;
|
||||
// pre_op: xvf or vf, differentiate between LSX or LASX instruction
|
||||
// suf_op: s or d, differentiate between single precision or double precision complex numbers
|
||||
// tmp0, tmp1, tmp2: scratch vector registers, overwritten; they must differ
// from in0 and in1.
// more: optional further (out, in0, in1, tmp0, tmp1, tmp2) groups, handled by
// recursion.
// out is only written after in0 and in1 have been fully consumed, so out may be
// the same register as in0 or in1.
//
|
||||
.macro GCOMPLEXMUL xconj=0, pre_op:req, suf_op:req, out:req, in0:req, in1:req, tmp0:req, tmp1:req, tmp2:req, more:vararg
|
||||
// tmp1 = 0; tmp0 = in0 with its real parts duplicated into the imaginary slots
.ifeqs "\pre_op", "xvf"
|
||||
xvxor.v \tmp1, \tmp1, \tmp1
|
||||
.ifeqs "\suf_op", "s"
|
||||
xvpackev.w \tmp0, \in0, \in0
|
||||
.else
|
||||
xvpackev.d \tmp0, \in0, \in0
|
||||
.endif
|
||||
.else
|
||||
vxor.v \tmp1, \tmp1, \tmp1
|
||||
.ifeqs "\suf_op", "s"
|
||||
vpackev.w \tmp0, \in0, \in0
|
||||
.else
|
||||
vpackev.d \tmp0, \in0, \in0
|
||||
.endif
|
||||
.endif
|
||||
|
||||
// tmp1 = -in0
\pre_op\()sub.\suf_op \tmp1, \tmp1, \in0
|
||||
|
||||
// tmp1 = imaginary parts of in0, one slot of each pair negated (which one
// depends on xconj); tmp2 = in1 with real and imaginary parts swapped
.ifeqs "\pre_op", "xvf"
|
||||
.ifeqs "\suf_op", "s"
|
||||
.ifeqs "\xconj", "0"
|
||||
xvpackod.w \tmp1, \in0, \tmp1
|
||||
.else
|
||||
xvpackod.w \tmp1, \tmp1, \in0
|
||||
.endif
|
||||
xvshuf4i.w \tmp2, \in1, 0xb1
|
||||
.else
|
||||
.ifeqs "\xconj", "0"
|
||||
xvpackod.d \tmp1, \in0, \tmp1
|
||||
.else
|
||||
xvpackod.d \tmp1, \tmp1, \in0
|
||||
.endif
|
||||
xvshuf4i.d \tmp2, \in1, 0x0b
|
||||
.endif
|
||||
.else
|
||||
.ifeqs "\suf_op", "s"
|
||||
.ifeqs "\xconj", "0"
|
||||
vpackod.w \tmp1, \in0, \tmp1
|
||||
.else
|
||||
vpackod.w \tmp1, \tmp1, \in0
|
||||
.endif
|
||||
vshuf4i.w \tmp2, \in1, 0xb1
|
||||
.else
|
||||
.ifeqs "\xconj", "0"
|
||||
vpackod.d \tmp1, \in0, \tmp1
|
||||
.else
|
||||
vpackod.d \tmp1, \tmp1, \in0
|
||||
.endif
|
||||
vshuf4i.d \tmp2, \in1, 0x0b
|
||||
.endif
|
||||
.endif
|
||||
|
||||
// out = tmp0 * in1 + tmp1 * tmp2
\pre_op\()mul.\suf_op \out, \tmp0, \in1
|
||||
\pre_op\()madd.\suf_op \out, \tmp1, \tmp2, \out
|
||||
|
||||
.ifnb \more
|
||||
GCOMPLEXMUL \xconj, \pre_op, \suf_op, \more
|
||||
.endif
|
||||
.endm
|
||||
|
||||
//
|
||||
// GCOMPLEXMADD: Complex multiply-accumulate, out = in0 * in1 + in2
|
||||
// xconj: default value 0
|
||||
// conj: default value 0
|
||||
// if !(CONJ)
|
||||
// if !(XCONJ)
|
||||
// out_r = in0_r * in1_r - in0_i * in1_i + in2_r;
|
||||
// out_i = in0_r * in1_i + in0_i * in1_r + in2_i;
|
||||
// else
|
||||
// out_r = in0_r * in1_r + in0_i * in1_i + in2_r;
|
||||
// out_i = in0_r * in1_i - in0_i * in1_r + in2_i;
|
||||
// else
|
||||
// if !(XCONJ)
|
||||
// out_r = in0_r * in1_r + in0_i * in1_i + in2_r;
|
||||
// out_i = in2_i - (in0_r * in1_i - in0_i * in1_r);
|
||||
// else
|
||||
// out_r = in0_r * in1_r - in0_i * in1_i + in2_r;
|
||||
// out_i = in2_i - (in0_r * in1_i + in0_i * in1_r);
|
||||
// pre_op: xvf or vf, differentiate between LSX or LASX instruction
|
||||
// suf_op: s or d, differentiate between single precision or double precision complex numbers
|
||||
// tmp0, tmp1, tmp2: scratch vector registers, overwritten; they must differ
// from in0, in1 and in2.
// more: optional further (out, in0, in1, in2, tmp0, tmp1, tmp2) groups, handled
// by recursion.
// Aliasing: out may be the same register as in2 (the usual accumulate form),
// but it must NOT be the same register as in0 or in1, because out receives the
// first partial result before in0 and in1 are read for the second product.
//
|
||||
.macro GCOMPLEXMADD xconj=0, conj=0, pre_op:req, suf_op:req, out:req, in0:req, in1:req, in2:req, tmp0:req, tmp1:req, tmp2:req, more:vararg
|
||||
// tmp1 = 0; tmp0 = in0 with its real parts duplicated into the imaginary slots
.ifeqs "\pre_op", "xvf"
|
||||
xvxor.v \tmp1, \tmp1, \tmp1
|
||||
.ifeqs "\suf_op", "s"
|
||||
xvpackev.w \tmp0, \in0, \in0
|
||||
.else
|
||||
xvpackev.d \tmp0, \in0, \in0
|
||||
.endif
|
||||
.else
|
||||
vxor.v \tmp1, \tmp1, \tmp1
|
||||
.ifeqs "\suf_op", "s"
|
||||
vpackev.w \tmp0, \in0, \in0
|
||||
.else
|
||||
vpackev.d \tmp0, \in0, \in0
|
||||
.endif
|
||||
.endif
|
||||
|
||||
// tmp2 = tmp0 * in1 + in2 (the in0_r terms added to in2)
\pre_op\()madd.\suf_op \tmp2, \tmp0, \in1, \in2
|
||||
.ifeqs "\conj", "1"
|
||||
// conj: tmp0 = in2 - tmp0 * in1; out takes its real slots from tmp2 and its
// imaginary slots from the swapped tmp0
\pre_op\()nmsub.\suf_op \tmp0, \tmp0, \in1, \in2
|
||||
.ifeqs "\pre_op", "xvf"
|
||||
.ifeqs "\suf_op", "s"
|
||||
xvshuf4i.w \tmp0, \tmp0, 0xb1
|
||||
xvpackev.w \out, \tmp0, \tmp2
|
||||
.else
|
||||
xvshuf4i.d \tmp0, \tmp0, 0x0b
|
||||
xvpackev.d \out, \tmp0, \tmp2
|
||||
.endif
|
||||
.else
|
||||
.ifeqs "\suf_op", "s"
|
||||
vshuf4i.w \tmp0, \tmp0, 0xb1
|
||||
vpackev.w \out, \tmp0, \tmp2
|
||||
.else
|
||||
vshuf4i.d \tmp0, \tmp0, 0x0b
|
||||
vpackev.d \out, \tmp0, \tmp2
|
||||
.endif
|
||||
.endif /* pre_op = xvf */
|
||||
.else
|
||||
// tmp1 is still zero here, so this copies tmp2 into out
\pre_op\()add.\suf_op \out, \tmp2, \tmp1
|
||||
.endif /* conj = 1 */
|
||||
|
||||
// tmp1 = -in0 (in0 is read after out has been written, see aliasing note)
\pre_op\()sub.\suf_op \tmp1, \tmp1, \in0
|
||||
|
||||
// tmp1 = imaginary parts of in0 with the signs selected by conj/xconj;
// tmp2 = in1 with real and imaginary parts swapped
.ifeqs "\pre_op", "xvf"
|
||||
.ifeqs "\suf_op", "s"
|
||||
.ifeqs "\conj", "0"
|
||||
.ifeqs "\xconj", "0"
|
||||
xvpackod.w \tmp1, \in0, \tmp1
|
||||
.else
|
||||
xvpackod.w \tmp1, \tmp1, \in0
|
||||
.endif
|
||||
.else
|
||||
.ifeqs "\xconj", "0"
|
||||
xvpackod.w \tmp1, \in0, \in0
|
||||
.else
|
||||
xvpackod.w \tmp1, \tmp1, \tmp1
|
||||
.endif
|
||||
.endif
|
||||
xvshuf4i.w \tmp2, \in1, 0xb1
|
||||
.else
|
||||
.ifeqs "\conj", "0"
|
||||
.ifeqs "\xconj", "0"
|
||||
xvpackod.d \tmp1, \in0, \tmp1
|
||||
.else
|
||||
xvpackod.d \tmp1, \tmp1, \in0
|
||||
.endif
|
||||
.else
|
||||
.ifeqs "\xconj", "0"
|
||||
xvpackod.d \tmp1, \in0, \in0
|
||||
.else
|
||||
xvpackod.d \tmp1, \tmp1, \tmp1
|
||||
.endif
|
||||
.endif
|
||||
xvshuf4i.d \tmp2, \in1, 0x0b
|
||||
.endif
|
||||
.else
|
||||
.ifeqs "\suf_op", "s"
|
||||
.ifeqs "\conj", "0"
|
||||
.ifeqs "\xconj", "0"
|
||||
vpackod.w \tmp1, \in0, \tmp1
|
||||
.else
|
||||
vpackod.w \tmp1, \tmp1, \in0
|
||||
.endif
|
||||
.else
|
||||
.ifeqs "\xconj", "0"
|
||||
vpackod.w \tmp1, \in0, \in0
|
||||
.else
|
||||
vpackod.w \tmp1, \tmp1, \tmp1
|
||||
.endif
|
||||
.endif
|
||||
vshuf4i.w \tmp2, \in1, 0xb1
|
||||
.else
|
||||
.ifeqs "\conj", "0"
|
||||
.ifeqs "\xconj", "0"
|
||||
vpackod.d \tmp1, \in0, \tmp1
|
||||
.else
|
||||
vpackod.d \tmp1, \tmp1, \in0
|
||||
.endif
|
||||
.else
|
||||
.ifeqs "\xconj", "0"
|
||||
vpackod.d \tmp1, \in0, \in0
|
||||
.else
|
||||
vpackod.d \tmp1, \tmp1, \tmp1
|
||||
.endif
|
||||
.endif
|
||||
vshuf4i.d \tmp2, \in1, 0x0b
|
||||
.endif
|
||||
.endif
|
||||
|
||||
// out += tmp1 * tmp2 (the in0_i terms)
\pre_op\()madd.\suf_op \out, \tmp1, \tmp2, \out
|
||||
|
||||
.ifnb \more
|
||||
GCOMPLEXMADD \xconj, \conj, \pre_op, \suf_op, \more
|
||||
.endif
|
||||
.endm
|
||||
|
||||
//
|
||||
// Media Related Macros
|
||||
//
|
||||
|
||||
343
kernel/loongarch64/zgemv_n_4_lasx.S
Normal file
343
kernel/loongarch64/zgemv_n_4_lasx.S
Normal file
@@ -0,0 +1,343 @@
|
||||
/*******************************************************************************
|
||||
Copyright (c) 2024, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
#define ASSEMBLER
|
||||
|
||||
#include "common.h"
|
||||
#include "loongarch64_asm.S"
|
||||
|
||||
/*********************************************************************
|
||||
* 2024/02/20 guxiwei
|
||||
* UTEST : OK
|
||||
* CTEST : OK
|
||||
* TEST : OK
|
||||
*
|
||||
*
|
||||
*********************************************************************/
|
||||
|
||||
/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
|
||||
* FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
|
||||
*/
|
||||
// Register aliases for the non-transposed double-precision complex kernel.
// General registers: the first group names the incoming arguments (INC_Y is
// loaded from the stack into $r6 by the prologue); the second group holds loop
// counters and working pointers.
#define M $r4
|
||||
#define N $r5
|
||||
#define ALPHA_R $f0
|
||||
#define ALPHA_I $f1
|
||||
#define A $r7
|
||||
#define LDA $r8
|
||||
#define X $r9
|
||||
#define INC_X $r10
|
||||
#define Y $r11
|
||||
#define INC_Y $r6
|
||||
|
||||
#define J $r12
|
||||
#define I $r13
|
||||
#define K $r14
|
||||
#define Y_ORG $r15
|
||||
#define OFFSET $r16
|
||||
#define K_LDA $r17
|
||||
#define M16 $r18
|
||||
#define T0 $r19
|
||||
#define PA0 $r20
|
||||
#define PA1 $r23
|
||||
#define PA2 $r24
|
||||
#define PA3 $r25
|
||||
#define PA4 $r26
|
||||
#define PA5 $r27
|
||||
#define PA6 $r28
|
||||
#define PA7 $r29
|
||||
|
||||
// Vector registers. Several macros in this file address the low 128 bits of
// these registers through the matching raw $vrN names (e.g. $vr10 for Y0,
// $vr12 for A0), so the numbering below must stay in sync with those macros.
#define VALPHA $xr1
|
||||
#define X0 $xr2
|
||||
#define X1 $xr3
|
||||
#define X2 $xr4
|
||||
#define X3 $xr5
|
||||
#define X4 $xr6
|
||||
#define X5 $xr7
|
||||
#define X6 $xr8
|
||||
#define X7 $xr9
|
||||
#define Y0 $xr10
|
||||
#define Y1 $xr11
|
||||
#define A0 $xr12
|
||||
#define A1 $xr13
|
||||
#define A2 $xr14
|
||||
#define A3 $xr15
|
||||
#define A4 $xr16
|
||||
#define A5 $xr17
|
||||
#define A6 $xr18
|
||||
#define A7 $xr19
|
||||
#define A8 $xr20
|
||||
#define A9 $xr21
|
||||
#define A10 $xr22
|
||||
#define A11 $xr23
|
||||
#define A12 $xr24
|
||||
#define A13 $xr25
|
||||
#define A14 $xr26
|
||||
#define A15 $xr27
|
||||
#define TMP0 $xr28
|
||||
#define TMP1 $xr29
|
||||
#define TMP2 $xr30
|
||||
|
||||
// Conjugation selectors handed to GCOMPLEXMUL / GCOMPLEXMADD:
// GXCONJ is 1 exactly when XCONJ is defined, GCONJ is 1 exactly when CONJ is
// defined; the four combinations are spelled out below.
#if !defined(CONJ)
|
||||
#if !defined(XCONJ)
|
||||
#define GXCONJ 0
|
||||
#define GCONJ 0
|
||||
#else
|
||||
#define GXCONJ 1
|
||||
#define GCONJ 0
|
||||
#endif
|
||||
#else
|
||||
#if !defined(XCONJ)
|
||||
#define GXCONJ 0
|
||||
#define GCONJ 1
|
||||
#else
|
||||
#define GXCONJ 1
|
||||
#define GCONJ 1
|
||||
#endif
|
||||
#endif
|
||||
|
||||
// ZLOAD_X_4: contiguous x. Load four consecutive 16-byte complex elements
// x[0..3], broadcast each one to both 128-bit halves of X0..X3 and scale it by
// alpha: Xk = Xk * VALPHA (GXCONJ selects the conjugating form).
//
// Each element is fetched with a 128-bit vld into the low half of its register
// ($vr2..$vr5 are the low halves of X0..X3 = $xr2..$xr5). A 256-bit load is not
// used because only the low 128 bits survive the following xvpermi.q and a
// 32-byte load at X + 0x30 would read 16 bytes past the fourth element, i.e.
// past the end of x for the last column block.
.macro ZLOAD_X_4
|
||||
vld $vr2, X, 0x00
|
||||
vld $vr3, X, 0x10
|
||||
vld $vr4, X, 0x20
|
||||
vld $vr5, X, 0x30
|
||||
// Immediate 0 copies the low 128 bits of the source into both halves.
GPERMI xv, q, X0, X0, 0, X1, X1, 0, X2, X2, 0, X3, X3, 0
|
||||
GCOMPLEXMUL GXCONJ, \
|
||||
xvf, d, X0, X0, VALPHA, TMP0, TMP1, TMP2, \
|
||||
X1, X1, VALPHA, TMP0, TMP1, TMP2, \
|
||||
X2, X2, VALPHA, TMP0, TMP1, TMP2, \
|
||||
X3, X3, VALPHA, TMP0, TMP1, TMP2
|
||||
.endm
|
||||
|
||||
// ZLOAD_X_4_GAP: strided x. Load four 16-byte complex elements spaced INC_X
// bytes apart, broadcast each one to both 128-bit halves of X0..X3 and scale
// it by alpha: Xk = Xk * VALPHA (GXCONJ selects the conjugating form).
// Clobbers T0.
//
// Each element is fetched with a 128-bit vld into the low half of its register
// ($vr2..$vr5 are the low halves of X0..X3 = $xr2..$xr5). A 256-bit load would
// read 16 bytes beyond every element, and beyond the end of x for the last
// one, although only the low 128 bits are used by the following xvpermi.q.
.macro ZLOAD_X_4_GAP
|
||||
vld $vr2, X, 0
|
||||
xvpermi.q X0, X0, 0
|
||||
|
||||
PTR_ADD T0, X, INC_X
|
||||
vld $vr3, T0, 0
|
||||
xvpermi.q X1, X1, 0
|
||||
|
||||
PTR_ADD T0, T0, INC_X
|
||||
vld $vr4, T0, 0
|
||||
xvpermi.q X2, X2, 0
|
||||
|
||||
PTR_ADD T0, T0, INC_X
|
||||
vld $vr5, T0, 0
|
||||
xvpermi.q X3, X3, 0
|
||||
|
||||
GCOMPLEXMUL GXCONJ, \
|
||||
xvf, d, X0, X0, VALPHA, TMP0, TMP1, TMP2, \
|
||||
X1, X1, VALPHA, TMP0, TMP1, TMP2, \
|
||||
X2, X2, VALPHA, TMP0, TMP1, TMP2, \
|
||||
X3, X3, VALPHA, TMP0, TMP1, TMP2
|
||||
.endm
|
||||
|
||||
// ZLOAD_Y_4: contiguous y. Load two 32-byte vectors from Y + 0 and Y + 0x20
// into Y0 and Y1, i.e. four 16-byte complex elements.
.macro ZLOAD_Y_4
|
||||
GLD xv, , Y0, Y, 0, Y1, Y, 0x20
|
||||
.endm
|
||||
|
||||
// ZLOAD_Y_4_GAP: strided y. Gather four 16-byte complex elements spaced INC_Y
// bytes apart into Y0 (elements 0, 1) and Y1 (elements 2, 3).
// $vr10/$vr11 are the low halves of Y0/Y1; $vr13/$vr14 are the low halves of
// A1/A2, used as staging registers. GPERMI with immediate 0x02 keeps the low
// half of the destination and places the low half of the source in its high
// half. Clobbers T0, A1, A2.
.macro ZLOAD_Y_4_GAP
|
||||
vld $vr10, Y, 0
|
||||
vldx $vr13, Y, INC_Y
|
||||
// T0 = Y + 2 * INC_Y
PTR_ALSL T0, INC_Y, Y, 1
|
||||
vld $vr11, T0, 0
|
||||
vldx $vr14, T0, INC_Y
|
||||
GPERMI xv, q, Y0, A1, 0x02, Y1, A2, 0x02
|
||||
.endm
|
||||
|
||||
// ZGEMV_N_4x4: one 4-row x 4-column tile.
// For each column pointer PAk two 32-byte vectors are loaded into A(2k) and
// A(2k+1) (GLD_INC with step 0x20 presumably advances the pointer after each
// load, so the pair covers four consecutive rows of the column).
// Then, for k = 0..3:  Y0 = Xk * A(2k) + Y0;  Y1 = Xk * A(2k+1) + Y1
// where Xk holds alpha * x[k] in both halves.
.macro ZGEMV_N_4x4
|
||||
GLD_INC xv, , 0x20, \
|
||||
A0, PA0, 0, A1, PA0, 0, \
|
||||
A2, PA1, 0, A3, PA1, 0, \
|
||||
A4, PA2, 0, A5, PA2, 0, \
|
||||
A6, PA3, 0, A7, PA3, 0
|
||||
GCOMPLEXMADD GXCONJ, GCONJ, \
|
||||
xvf, d, Y0, X0, A0, Y0, TMP0, TMP1, TMP2, Y1, X0, A1, Y1, TMP0, TMP1, TMP2, \
|
||||
Y0, X1, A2, Y0, TMP0, TMP1, TMP2, Y1, X1, A3, Y1, TMP0, TMP1, TMP2, \
|
||||
Y0, X2, A4, Y0, TMP0, TMP1, TMP2, Y1, X2, A5, Y1, TMP0, TMP1, TMP2, \
|
||||
Y0, X3, A6, Y0, TMP0, TMP1, TMP2, Y1, X3, A7, Y1, TMP0, TMP1, TMP2
|
||||
.endm
|
||||
|
||||
// ZSTORE_Y_4: contiguous y. Write Y0 and Y1 back to Y + 0 and Y + 0x20
// (counterpart of ZLOAD_Y_4).
.macro ZSTORE_Y_4
|
||||
GST xv, , Y0, Y, 0, Y1, Y, 0x20
|
||||
.endm
|
||||
|
||||
// ZSTORE_Y_4_GAP: strided y. Scatter the four complex elements held in Y0 and
// Y1 to addresses spaced INC_Y bytes apart. Each element is written as two
// 64-bit lanes (at offset 0 and offset 8) with xvstelm.d; T0 walks the
// destination addresses. Clobbers T0.
.macro ZSTORE_Y_4_GAP
|
||||
xvstelm.d Y0, Y, 0, 0
|
||||
xvstelm.d Y0, Y, 0x08, 1
|
||||
PTR_ADD T0, Y, INC_Y
|
||||
xvstelm.d Y0, T0, 0, 2
|
||||
xvstelm.d Y0, T0, 0x08, 3
|
||||
PTR_ADD T0, T0, INC_Y
|
||||
xvstelm.d Y1, T0, 0, 0
|
||||
xvstelm.d Y1, T0, 0x08, 1
|
||||
PTR_ADD T0, T0, INC_Y
|
||||
xvstelm.d Y1, T0, 0, 2
|
||||
xvstelm.d Y1, T0, 0x08, 3
|
||||
.endm
|
||||
|
||||
// ZLOAD_Y_1: load one 16-byte complex element at Y into $vr10, the low half
// of Y0 ($xr10).
.macro ZLOAD_Y_1
|
||||
vld $vr10, Y, 0
|
||||
.endm
|
||||
|
||||
// ZGEMV_N_1x4: one row of a 4-column block.
// Loads one 16-byte element from each of PA0..PA3 into $vr12/$vr14/$vr16/$vr18
// (the low halves of A0/A2/A4/A6; GLD_INC with step 0x10 presumably advances
// each pointer by 16 bytes), then accumulates Y0 = Xk * A(2k) + Y0 for
// k = 0..3. The arithmetic is issued on full 256-bit registers, but only the
// low 128 bits of Y0 are stored afterwards by ZSTORE_Y_1.
.macro ZGEMV_N_1x4
|
||||
GLD_INC v, , 0x10, $vr12, PA0, 0, $vr14, PA1, 0, $vr16, PA2, 0, $vr18, PA3, 0
|
||||
GCOMPLEXMADD GXCONJ, GCONJ, \
|
||||
xvf, d, Y0, X0, A0, Y0, TMP0, TMP1, TMP2, \
|
||||
Y0, X1, A2, Y0, TMP0, TMP1, TMP2, \
|
||||
Y0, X2, A4, Y0, TMP0, TMP1, TMP2, \
|
||||
Y0, X3, A6, Y0, TMP0, TMP1, TMP2
|
||||
.endm
|
||||
|
||||
// ZSTORE_Y_1: store the low 128 bits of Y0 ($vr10) to Y (counterpart of
// ZLOAD_Y_1).
.macro ZSTORE_Y_1
|
||||
vst $vr10, Y, 0
|
||||
.endm
|
||||
|
||||
// ZLOAD_X_1: load one 16-byte complex element at X, broadcast it to both
// 128-bit halves of X0 and scale it by alpha: X0 = X0 * VALPHA (GXCONJ selects
// the conjugating form).
//
// The element is fetched with a 128-bit vld into $vr2, the low half of X0
// ($xr2). A 256-bit load is not used because only the low 128 bits survive the
// following xvpermi.q and a 32-byte load would read 16 bytes past the element,
// i.e. past the end of x for the last column.
.macro ZLOAD_X_1
|
||||
vld $vr2, X, 0x00
|
||||
// Immediate 0 copies the low 128 bits of the source into both halves.
GPERMI xv, q, X0, X0, 0
|
||||
GCOMPLEXMUL GXCONJ, \
|
||||
xvf, d, X0, X0, VALPHA, TMP0, TMP1, TMP2
|
||||
.endm
|
||||
|
||||
// ZGEMV_N_1x1: one row of a single column.
// Loads one 16-byte element from PA0 into $vr12 (the low half of A0; GLD_INC
// with step 0x10 presumably advances PA0 by 16 bytes), then accumulates
// Y0 = X0 * A0 + Y0. Only the low 128 bits of Y0 are meaningful here.
.macro ZGEMV_N_1x1
|
||||
GLD_INC v, , 0x10, $vr12, PA0, 0
|
||||
GCOMPLEXMADD GXCONJ, GCONJ, \
|
||||
xvf, d, Y0, X0, A0, Y0, TMP0, TMP1, TMP2
|
||||
.endm
|
||||
|
||||
// ZGEMV_N_LASX: loop nest of the non-transposed double-precision complex
// kernel.
//   XW           - tag pasted into every local label so that several expansions
//                  can coexist in one function
//   X_4, X_1     - suffixes of the ZLOAD_ macros used to load x for a 4-column
//                  block / a single column
//   Y_4, Y_1     - suffixes of the ZLOAD_/ZSTORE_ macros used for 4 rows / 1 row
// Columns are processed in blocks of 4 (J = N >> 2) and then one at a time
// (N & 3); inside each, rows are processed in blocks of 4 (I = M >> 2) and then
// one at a time. LDA, INC_X, INC_Y and M16 are byte quantities here (the caller
// shifts them left by 4 before expanding this macro). Always leaves through
// .L_END.
.macro ZGEMV_N_LASX XW:req, X_4:req, X_1:req, Y_4:req, Y_1:req
|
||||
PTR_SRLI J, N, 2
|
||||
beqz J, .L_\XW\()_N_3
|
||||
// K_LDA = 4 * LDA - M16: added to each column pointer after a full pass over
// the rows to move it to the same row of the column 4 further on.
PTR_SLLI K_LDA, LDA, 2
|
||||
PTR_SUB K_LDA, K_LDA, M16
|
||||
.L_\XW\()_N_L4:
|
||||
ZLOAD_\X_4
|
||||
xor K, K, K
|
||||
move Y, Y_ORG
|
||||
PTR_SRLI I, M, 2
|
||||
beqz I, .L_\XW\()_M_3
|
||||
.align 5
|
||||
.L_\XW\()_M_L4:
|
||||
ZLOAD_\Y_4
|
||||
ZGEMV_N_4x4
|
||||
ZSTORE_\Y_4
|
||||
PTR_ADDI I, I, -1
|
||||
// Y += 4 * INC_Y (INC_Y is in bytes)
PTR_ALSL Y, INC_Y, Y, 2
|
||||
PTR_ADDI K, K, 4
|
||||
bnez I, .L_\XW\()_M_L4
|
||||
.L_\XW\()_M_3:
|
||||
// Remaining M & 3 rows of the current 4-column block, one row per iteration.
andi I, M, 3
|
||||
beqz I, .L_\XW\()_M_END
|
||||
.align 5
|
||||
.L_\XW\()_M_L1:
|
||||
ZLOAD_\Y_1
|
||||
ZGEMV_N_1x4
|
||||
ZSTORE_\Y_1
|
||||
PTR_ADDI I, I, -1
|
||||
PTR_ADD Y, Y, INC_Y
|
||||
PTR_ADDI K, K, 1
|
||||
bnez I, .L_\XW\()_M_L1
|
||||
.L_\XW\()_M_END:
|
||||
PTR_ADDI J, J, -1
|
||||
// Advance the four column pointers to the next block of columns; the add width
// follows the general-register length.
#if __loongarch_grlen == 64
|
||||
GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
|
||||
#elif __loongarch_grlen == 32
|
||||
GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
|
||||
#else
|
||||
GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
|
||||
#endif
|
||||
// X += 4 * INC_X
PTR_ALSL X, INC_X, X, 2
|
||||
bnez J, .L_\XW\()_N_L4
|
||||
.L_\XW\()_N_3:
|
||||
// Remaining N & 3 columns, one column per iteration, using PA0 only.
andi J, N, 3
|
||||
beqz J, .L_END
|
||||
.L_\XW\()_N_L1:
|
||||
ZLOAD_\X_1
|
||||
xor K, K, K
|
||||
move Y, Y_ORG
|
||||
move I, M
|
||||
// M == 0: nothing to accumulate, leave the kernel.
beqz I, .L_END
|
||||
.align 5
|
||||
.L_\XW\()_N_1_M_L1:
|
||||
ZLOAD_\Y_1
|
||||
ZGEMV_N_1x1
|
||||
ZSTORE_\Y_1
|
||||
PTR_ADDI I, I, -1
|
||||
PTR_ADD Y, Y, INC_Y
|
||||
PTR_ADDI K, K, 1
|
||||
bnez I, .L_\XW\()_N_1_M_L1
|
||||
.L_\XW\()_N_1_M_END:
|
||||
PTR_ADDI J, J, -1
|
||||
// PA0 already advanced by M16 bytes in the row loop; LDA - M16 moves it to the
// top of the next column.
PTR_SUB K_LDA, LDA, M16
|
||||
PTR_ADD PA0, PA0, K_LDA
|
||||
PTR_ADD X, X, INC_X
|
||||
bnez J, .L_\XW\()_N_L1
|
||||
|
||||
b .L_END
|
||||
.endm
|
||||
|
||||
// Kernel entry point (non-transposed double-precision complex GEMV).
// Classifies inc_x / inc_y as unit or non-unit, converts LDA, INC_X, INC_Y and
// M to byte counts (shift by 4 = 16 bytes per element, result for M in M16),
// builds VALPHA, sets up four column pointers PA0..PA3 and dispatches through
// a small offset table to one of four ZGEMV_N_LASX expansions.
PROLOGUE
|
||||
// inc_y is read from the stack at $sp + 0 before push_if_used runs.
PTR_LD INC_Y, $sp, 0
|
||||
push_if_used 17 + 7, 31
|
||||
PTR_ADDI K, $r0, 0x01
|
||||
PTR_SUB I, INC_X, K
|
||||
PTR_SUB J, INC_Y, K
|
||||
maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */
|
||||
maskeqz J, K, J /* if(inc_y == 1) j = 0; else j = 1; */
|
||||
// Table index = 2 * (inc_x != 1) + (inc_y != 1)
PTR_ALSL I, I, J, 1
|
||||
GSLLI , d, LDA, LDA, 4, INC_X, INC_X, 4, INC_Y, INC_Y, 4, M16, M, 4
|
||||
// Init VALPHA
|
||||
// Pack the low doubles of $xr0 (alpha_r) and $xr1 (alpha_i) into one 128-bit
// pair and replicate it into both halves of VALPHA.
xvpackev.d $xr0, $xr1, $xr0
|
||||
xvreplve0.q VALPHA, $xr0
|
||||
move Y_ORG, Y
|
||||
move PA0, A
|
||||
// PA(k) = PA(k-1) + LDA: pointers to four consecutive columns.
#if __loongarch_grlen == 64
|
||||
GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA
|
||||
#elif __loongarch_grlen == 32
|
||||
GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA
|
||||
#else
|
||||
GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA
|
||||
#endif
|
||||
// Each table entry is a 16-bit offset relative to .L_GAP_TABLE; read entry I
// and jump to .L_GAP_TABLE + offset.
la.local T0, .L_GAP_TABLE
|
||||
PTR_ALSL I, I, T0, 1
|
||||
ld.h K, I, 0 // Obtain the offset address
|
||||
PTR_ADD T0, T0, K
|
||||
jirl $r0, T0, 0
|
||||
.L_GAP_TABLE:
|
||||
.hword .L_GAP_0_0 - .L_GAP_TABLE
|
||||
.hword .L_GAP_0_1 - .L_GAP_TABLE
|
||||
.hword .L_GAP_1_0 - .L_GAP_TABLE
|
||||
.hword .L_GAP_1_1 - .L_GAP_TABLE
|
||||
.L_GAP_0_0: /* if (inc_x == 1) && (incy == 1) */
|
||||
ZGEMV_N_LASX GAP_0_0, X_4, X_1, Y_4, Y_1
|
||||
.L_GAP_0_1: /* if (inc_x == 1) && (incy != 1) */
|
||||
ZGEMV_N_LASX GAP_0_1, X_4, X_1, Y_4_GAP, Y_1
|
||||
.L_GAP_1_0: /* if (inc_x != 1) && (incy == 1) */
|
||||
ZGEMV_N_LASX GAP_1_0, X_4_GAP, X_1, Y_4, Y_1
|
||||
.L_GAP_1_1: /* if (inc_x != 1) && (incy != 1) */
|
||||
ZGEMV_N_LASX GAP_1_1, X_4_GAP, X_1, Y_4_GAP, Y_1
|
||||
.L_END:
|
||||
pop_if_used 17 + 7, 31
|
||||
jirl $r0, $r1, 0x0
|
||||
EPILOGUE
|
||||
|
||||
299
kernel/loongarch64/zgemv_t_4_lasx.S
Normal file
299
kernel/loongarch64/zgemv_t_4_lasx.S
Normal file
@@ -0,0 +1,299 @@
|
||||
/*******************************************************************************
|
||||
Copyright (c) 2024, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
#define ASSEMBLER
|
||||
|
||||
#include "common.h"
|
||||
#include "loongarch64_asm.S"
|
||||
|
||||
/*********************************************************************
|
||||
* 2024/02/20 guxiwei
|
||||
* UTEST : OK
|
||||
* CTEST : OK
|
||||
* TEST : OK
|
||||
*
|
||||
*
|
||||
*********************************************************************/
|
||||
|
||||
/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
|
||||
* FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
|
||||
*/
|
||||
/* Argument registers, matching the CNAME signature in the comment above.
 * INC_Y is not passed in a register: the prologue loads it from the stack
 * into $r6 (PTR_LD INC_Y, $sp, 0).
 * NOTE(review): $r6 is presumably the slot of the unused dummy1 argument - confirm. */
#define M $r4
|
||||
#define N $r5
|
||||
#define ALPHA_R $f0
|
||||
#define ALPHA_I $f1
|
||||
#define A $r7
|
||||
#define LDA $r8
|
||||
#define X $r9
|
||||
#define INC_X $r10
|
||||
#define Y $r11
|
||||
#define INC_Y $r6
|
||||
|
||||
/* General-purpose working registers.
 * J and I are the column and row loop counters in ZGEMV_T_LASX.
 * K and PY0 are both $r14: K is used only in the prologue, PY0 only in the macro.
 * PA0..PA3 walk four columns of A; M16 holds M << 4; K_LDA is the per-column-group
 * pointer adjustment. PY1, PY2 and PA5..PA7 are defined but not referenced by the
 * code in this file. */
#define J $r12
|
||||
#define I $r13
|
||||
#define K $r14
|
||||
#define PY0 $r14
|
||||
#define X_ORG $r15
|
||||
#define PY1 $r16
|
||||
#define K_LDA $r17
|
||||
#define PY2 $r18
|
||||
#define T0 $r19
|
||||
#define PA0 $r20
|
||||
#define PA1 $r23
|
||||
#define PA2 $r24
|
||||
#define PA3 $r25
|
||||
#define PA4 $r26
|
||||
#define PA5 $r27
|
||||
#define PA6 $r28
|
||||
#define PA7 $r29
|
||||
#define M16 $r30
|
||||
|
||||
/* 256-bit LASX vector registers.
 * VALPHA: broadcast alpha; X0/X1: loaded x values; A0..A15: matrix data and
 * remainder accumulators; TP0..TP7: partial sums; TMP0..TMP2: scratch passed to
 * GCOMPLEXMADD. */
#define VALPHA $xr0
|
||||
#define X0 $xr1
|
||||
#define X1 $xr2
|
||||
#define A0 $xr3
|
||||
#define A1 $xr4
|
||||
#define A2 $xr5
|
||||
#define A3 $xr6
|
||||
#define A4 $xr7
|
||||
#define A5 $xr8
|
||||
#define A6 $xr9
|
||||
#define A7 $xr10
|
||||
#define A8 $xr11
|
||||
#define A9 $xr12
|
||||
#define A10 $xr13
|
||||
#define A11 $xr14
|
||||
#define A12 $xr15
|
||||
#define A13 $xr16
|
||||
#define A14 $xr17
|
||||
#define A15 $xr18
|
||||
#define TP0 $xr19
|
||||
#define TP1 $xr20
|
||||
#define TP2 $xr21
|
||||
#define TP3 $xr22
|
||||
#define TP4 $xr23
|
||||
#define TP5 $xr24
|
||||
#define TP6 $xr25
|
||||
#define TP7 $xr26
|
||||
#define TMP0 $xr27
|
||||
#define TMP1 $xr28
|
||||
#define TMP2 $xr29
|
||||
/* Y0..Y7 are second names for the same registers as A0..A7 ($xr3..$xr10):
 * writing Yn overwrites An. */
#define Y0 $xr3
|
||||
#define Y1 $xr4
|
||||
#define Y2 $xr5
|
||||
#define Y3 $xr6
|
||||
#define Y4 $xr7
|
||||
#define Y5 $xr8
|
||||
#define Y6 $xr9
|
||||
#define Y7 $xr10
|
||||
|
||||
/* Flag pair 1 (GXCONJ1, GCONJ1): passed to GCOMPLEXMADD for the A-times-x products.
 * GXCONJ1 is 0 when CONJ and XCONJ are both undefined or both defined, 1 when
 * exactly one of them is defined. GCONJ1 is 0 in both cases. */
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
||||
#define GXCONJ1 0
|
||||
#define GCONJ1 0
|
||||
#else
|
||||
#define GXCONJ1 1
|
||||
#define GCONJ1 0
|
||||
#endif
|
||||
|
||||
/* Flag pair 2 (GXCONJ2, GCONJ2): passed to GCOMPLEXMADD for the alpha scaling
 * step. GCONJ2 is 1 only when XCONJ is defined. GXCONJ2 is 0 in both cases. */
#if !defined(XCONJ)
|
||||
#define GXCONJ2 0
|
||||
#define GCONJ2 0
|
||||
#else
|
||||
#define GXCONJ2 0
|
||||
#define GCONJ2 1
|
||||
#endif
|
||||
|
||||
// Clears the four partial-sum registers TP0..TP3: each one is passed to GXOR as
// destination and as both sources.
// NOTE(review): GXOR is defined in loongarch64_asm.S; presumably one xvxor.v per triple.
.macro ZERO_Y4
|
||||
GXOR xv, v, TP0, TP0, TP0, TP1, TP1, TP1, TP2, TP2, TP2, TP3, TP3, TP3
|
||||
.endm
|
||||
|
||||
// Clears the single partial-sum register TP0 (used by the one-column tail loop).
// NOTE(review): GXOR is defined in loongarch64_asm.S; presumably emits xvxor.v.
.macro ZERO_Y1
|
||||
GXOR xv, v, TP0, TP0, TP0
|
||||
.endm
|
||||
|
||||
// Unit-stride x load: X0 from X + 0x00 and X1 from X + 0x20, i.e. 64 contiguous
// bytes starting at X. X itself is not modified here.
// NOTE(review): GLD is defined in loongarch64_asm.S; presumably one xvld per triple.
.macro ZLOAD_X4
|
||||
GLD xv, , X0, X, 0x00, X1, X, 0x20
|
||||
.endm
|
||||
|
||||
// Strided x load: gathers the values at X, X + INC_X, X + 2*INC_X and
// X + 3*INC_X into X0 and X1. Clobbers T0 and A0; X itself is not modified.
// NOTE(review): each xvld reads a full 256-bit register although the permute only
// keeps 128 bits of it - confirm reading past the last element is acceptable.
.macro ZLOAD_X4_GAP
|
||||
xvld X0, X, 0
|
||||
|
||||
PTR_ADD T0, X, INC_X
|
||||
xvld A0, T0, 0
|
||||
// Merge 128 bits of A0 into X0; intended result is the value from X in the low
// half of X0 and the value from X + INC_X in the high half.
xvpermi.q X0, A0, 0x02
|
||||
|
||||
PTR_ADD T0, T0, INC_X
|
||||
xvld X1, T0, 0
|
||||
|
||||
PTR_ADD T0, T0, INC_X
|
||||
xvld A0, T0, 0
|
||||
// Same merge for the third and fourth values into X1.
xvpermi.q X1, A0, 0x02
|
||||
.endm
|
||||
|
||||
// One 4-row by 4-column step: loads two vectors from each of the four column
// pointers PA0..PA3 into A0..A7, then accumulates their complex products with
// X0 / X1 into TP0..TP3 (TPn collects column n).
// NOTE(review): GLD_INC is defined in loongarch64_asm.S; presumably it advances the
// base pointer by 0x20 after each load, so each PAx moves 0x40 bytes per step - confirm.
// Each 7-operand GCOMPLEXMADD group names TPn as first and fourth operand together
// with one A register, one X register and the scratch registers TMP0..TMP2.
.macro ZGEMV_T_4x4
|
||||
GLD_INC xv, , 0x20, \
|
||||
A0, PA0, 0, A1, PA0, 0, \
|
||||
A2, PA1, 0, A3, PA1, 0, \
|
||||
A4, PA2, 0, A5, PA2, 0, \
|
||||
A6, PA3, 0, A7, PA3, 0
|
||||
GCOMPLEXMADD GXCONJ1, GCONJ1, \
|
||||
xvf, d, TP0, A0, X0, TP0, TMP0, TMP1, TMP2, TP0, A1, X1, TP0, TMP0, TMP1, TMP2, \
|
||||
TP1, A2, X0, TP1, TMP0, TMP1, TMP2, TP1, A3, X1, TP1, TMP0, TMP1, TMP2, \
|
||||
TP2, A4, X0, TP2, TMP0, TMP1, TMP2, TP2, A5, X1, TP2, TMP0, TMP1, TMP2, \
|
||||
TP3, A6, X0, TP3, TMP0, TMP1, TMP2, TP3, A7, X1, TP3, TMP0, TMP1, TMP2
|
||||
.endm
|
||||
|
||||
// Main body of the kernel, expanded once per x-stride variant.
//   XW : infix that makes the local labels of this expansion unique.
//   X4 : suffix selecting the 4-element x loader (ZLOAD_X4 or ZLOAD_X4_GAP).
// Structure: an outer loop over groups of four columns (J = N >> 2), each with a
// 4-row inner loop (I = M >> 2) and a 1-row remainder loop (I = M & 3), followed
// by a one-column-at-a-time loop for the remaining N & 3 columns.
// Every path leaves through .L_END, which is defined outside the macro.
.macro ZGEMV_T_LASX XW:req, X4:req
|
||||
PTR_SRLI J, N, 2
|
||||
beqz J, .L_\XW\()_N_3
|
||||
// K_LDA = 4 * LDA - M16: added to each PAx after a column group is finished.
// NOTE(review): this assumes the row loops advanced each PAx by exactly M16 bytes.
PTR_SLLI K_LDA, LDA, 2
|
||||
PTR_SUB K_LDA, K_LDA, M16
|
||||
.L_\XW\()_N_L4:
|
||||
ZERO_Y4
|
||||
// Restart x from its original position for every column group.
move X, X_ORG
|
||||
PTR_SRLI I, M, 2
|
||||
beqz I, .L_\XW\()_M_3
|
||||
.align 5
|
||||
.L_\XW\()_M_L4:
|
||||
ZLOAD_\X4
|
||||
ZGEMV_T_4x4
|
||||
PTR_ADDI I, I, -1
|
||||
// X = X + (INC_X << 2): step over the four x values just consumed.
PTR_ALSL X, INC_X, X, 2
|
||||
bnez I, .L_\XW\()_M_L4
|
||||
.L_\XW\()_M_3:
|
||||
// Accumulated
// Fold TP0..TP3 into Y0..Y3. Y0..Y3 are the same registers as A0..A3, which the
// remainder loop and the alpha step below keep using as accumulators.
// NOTE(review): GCOMPLEXACC presumably adds the two 128-bit halves of each TPn - confirm.
GCOMPLEXACC xvf, d, Y0, TP0, Y1, TP1, Y2, TP2, Y3, TP3
|
||||
andi I, M, 3
|
||||
beqz I, .L_\XW\()_M_END
|
||||
.align 5
|
||||
.L_\XW\()_M_L1:
|
||||
// One row per iteration: x value into X0, one value per column into A8..A11.
// NOTE(review): these are 256-bit loads while the pointers advance by 0x10 only;
// confirm the extra 16 bytes read at the end of a buffer are acceptable.
GLD xv, , X0, X, 0x00, A8, PA0, 0x00, A9, PA1, 0x00, A10, PA2, 0x00, A11, PA3, 0x00
|
||||
#if __loongarch_grlen == 64
|
||||
GADDI , d, PA0, PA0, 0x10, PA1, PA1, 0x10, PA2, PA2, 0x10, PA3, PA3, 0x10
|
||||
#elif __loongarch_grlen == 32
|
||||
GADDI , w, PA0, PA0, 0x10, PA1, PA1, 0x10, PA2, PA2, 0x10, PA3, PA3, 0x10
|
||||
#else
|
||||
GADDI , d, PA0, PA0, 0x10, PA1, PA1, 0x10, PA2, PA2, 0x10, PA3, PA3, 0x10
|
||||
#endif
|
||||
|
||||
// Accumulate the four products into A0..A3 (one accumulator per column).
GCOMPLEXMADD GXCONJ1, GCONJ1, \
|
||||
xvf, d, A0, A8, X0, A0, TMP0, TMP1, TMP2, A1, A9, X0, A1, TMP0, TMP1, TMP2, \
|
||||
A2, A10, X0, A2, TMP0, TMP1, TMP2, A3, A11, X0, A3, TMP0, TMP1, TMP2
|
||||
|
||||
PTR_ADDI I, I, -1
|
||||
PTR_ADD X, X, INC_X
|
||||
bnez I, .L_\XW\()_M_L1
|
||||
.L_\XW\()_M_END:
|
||||
// Load the four y values at Y, Y + INC_Y, Y + 2*INC_Y and Y + 3*INC_Y into
// A8..A11. PY0 = Y + (INC_Y << 1) is the base for the third and fourth.
xvld A8, Y, 0x00
|
||||
xvldx A9, Y, INC_Y
|
||||
PTR_ALSL PY0, INC_Y, Y, 1
|
||||
xvld A10, PY0, 0x00
|
||||
xvldx A11, PY0, INC_Y
|
||||
|
||||
// Combine with alpha: each group pairs a loaded y register (A8..A11) with VALPHA
// and the matching column accumulator (A0..A3).
GCOMPLEXMADD GXCONJ2, GCONJ2, \
|
||||
xvf, d, A8, VALPHA, A0, A8, TMP0, TMP1, TMP2, A9, VALPHA, A1, A9, TMP0, TMP1, TMP2,\
|
||||
A10, VALPHA, A2, A10, TMP0, TMP1, TMP2, A11, VALPHA, A3, A11, TMP0, TMP1, TMP2
|
||||
|
||||
PTR_ADDI J, J, -1
|
||||
// Move the four column pointers on to the next column group.
#if __loongarch_grlen == 64
|
||||
GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
|
||||
#elif __loongarch_grlen == 32
|
||||
GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
|
||||
#else
|
||||
GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
|
||||
#endif
|
||||
// Store only the low 128 bits: $vr11..$vr14 are the low halves of $xr11..$xr14,
// i.e. of A8..A11.
vst $vr11, Y, 0x00
|
||||
vstx $vr12, Y, INC_Y
|
||||
vst $vr13, PY0, 0x00
|
||||
vstx $vr14, PY0, INC_Y
|
||||
// Y = Y + (INC_Y << 2): advance past the four y values just written.
PTR_ALSL Y, INC_Y, Y, 2
|
||||
bnez J, .L_\XW\()_N_L4
|
||||
.L_\XW\()_N_3:
|
||||
// Remaining N & 3 columns are handled one column at a time.
andi J, N, 3
|
||||
beqz J, .L_END
|
||||
// Single-column pointer adjustment: K_LDA = LDA - M16.
PTR_SUB K_LDA, LDA, M16
|
||||
.L_\XW\()_N_1:
|
||||
ZERO_Y1
|
||||
move X, X_ORG
|
||||
move I, M
|
||||
// With M == 0 the kernel exits here without touching y.
beqz I, .L_END
|
||||
.align 5
|
||||
.L_\XW\()_N_1_M_L1:
|
||||
// One row per iteration: TP0 accumulates the product of the values at PA0 and X.
GLD xv, , A0, PA0, 0x00, X0, X, 0x00
|
||||
GCOMPLEXMADD GXCONJ1, GCONJ1, \
|
||||
xvf, d, TP0, A0, X0, TP0, TMP0, TMP1, TMP2
|
||||
PTR_ADDI I, I, -1
|
||||
PTR_ADD X, X, INC_X
|
||||
PTR_ADDI PA0, PA0, 0x10
|
||||
bnez I, .L_\XW\()_N_1_M_L1
|
||||
.L_\XW\()_N_1_M_END:
|
||||
PTR_ADDI J, J, -1
|
||||
// Load y, combine it with VALPHA and TP0, and store the low 128 bits back
// ($vr3 is the low half of $xr3, i.e. of A0).
xvld A0, Y, 0x00
|
||||
GCOMPLEXMADD GXCONJ2, GCONJ2, \
|
||||
xvf, d, A0, VALPHA, TP0, A0, TMP0, TMP1, TMP2
|
||||
vst $vr3, Y, 0x00
|
||||
PTR_ADD PA0, PA0, K_LDA
|
||||
PTR_ADD Y, Y, INC_Y
|
||||
bnez J, .L_\XW\()_N_1
|
||||
|
||||
b .L_END
|
||||
.endm
|
||||
|
||||
// Kernel entry point. Loads inc_y from the stack, classifies inc_x as unit or
// non-unit stride, scales LDA, INC_X, INC_Y and M by 16 (shift left by 4), sets
// up the column pointers and dispatches through .L_GAP_TABLE to one of the two
// ZGEMV_T_LASX expansions. Both expansions finish at .L_END.
PROLOGUE
|
||||
// inc_y is read from the caller's stack at offset 0, before push_if_used runs.
// NOTE(review): push_if_used presumably adjusts $sp to save registers - confirm.
PTR_LD INC_Y, $sp, 0
|
||||
push_if_used 17 + 8, 30
|
||||
// K = 1, I = inc_x - 1 (inc_x is still in element units at this point).
PTR_ADDI K, $r0, 0x01
|
||||
PTR_SUB I, INC_X, K
|
||||
maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */
|
||||
// Shift left by 4: LDA, INC_X and INC_Y become multiples of 16; M16 = M * 16.
// NOTE(review): 16 is presumably the byte size of one double-complex element - confirm.
GSLLI , d, LDA, LDA, 4, INC_X, INC_X, 4, INC_Y, INC_Y, 4, M16, M, 4
|
||||
// Init VALPHA
// $f0 / $f1 (ALPHA_R / ALPHA_I) are the low lanes of $xr0 / $xr1: pack them into
// one 128-bit pair, then replicate that pair into both halves of VALPHA.
xvpackev.d $xr0, $xr1, $xr0
|
||||
xvreplve0.q VALPHA, $xr0
|
||||
// Keep the original x pointer; the macro reloads X from X_ORG per column group.
move X_ORG, X
|
||||
// PA0..PA4 point at consecutive columns of A, each LDA apart. PA4 is computed
// here but not referenced by the macros in this file.
move PA0, A
|
||||
#if __loongarch_grlen == 64
|
||||
GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA
|
||||
#elif __loongarch_grlen == 32
|
||||
GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA
|
||||
#else
|
||||
GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA
|
||||
#endif
|
||||
// Table dispatch: I = T0 + 2 * index addresses one 16-bit entry; the entry holds
// (target - .L_GAP_TABLE), which is added back to the table address before jumping.
la.local T0, .L_GAP_TABLE
|
||||
PTR_ALSL I, I, T0, 1
|
||||
ld.h K, I, 0
|
||||
PTR_ADD T0, T0, K
|
||||
// Indirect jump without link ($r0 as link register discards the return address).
jirl $r0, T0, 0
|
||||
// Two halfword offsets (4 bytes) stored inline right after the jump.
.L_GAP_TABLE:
|
||||
.hword .L_GAP_0 - .L_GAP_TABLE
|
||||
.hword .L_GAP_1 - .L_GAP_TABLE
|
||||
.L_GAP_0: /* if (incx == 1) */
|
||||
ZGEMV_T_LASX GAP_0, X4
|
||||
.L_GAP_1: /* if (incx != 1) */
|
||||
ZGEMV_T_LASX GAP_1, X4_GAP
|
||||
// Common exit: undo push_if_used (same arguments) and return through $r1.
.L_END:
|
||||
pop_if_used 17 + 8, 30
|
||||
jirl $r0, $r1, 0x0
|
||||
EPILOGUE
|
||||
@@ -30,19 +30,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#if !defined(DOUBLE)
|
||||
#define VSETVL(n) __riscv_vsetvl_e32m8(n)
|
||||
#define FLOAT_V_T vfloat32m8_t
|
||||
#define FLOAT_V_M1_T vfloat32m1_t
|
||||
#define VLEV_FLOAT __riscv_vle32_v_f32m8
|
||||
#define VLSEV_FLOAT __riscv_vlse32_v_f32m8
|
||||
#define VSEV_FLOAT __riscv_vse32_v_f32m8
|
||||
#define VSEV_FLOAT_M1 __riscv_vse32_v_f32m1
|
||||
#define VSSEV_FLOAT __riscv_vsse32_v_f32m8
|
||||
#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m8
|
||||
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8
|
||||
#define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f32m8_f32m1
|
||||
#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1
|
||||
#else
|
||||
#define VSETVL(n) __riscv_vsetvl_e64m8(n)
|
||||
#define FLOAT_V_T vfloat64m8_t
|
||||
#define FLOAT_V_M1_T vfloat64m1_t
|
||||
#define VLEV_FLOAT __riscv_vle64_v_f64m8
|
||||
#define VLSEV_FLOAT __riscv_vlse64_v_f64m8
|
||||
#define VSEV_FLOAT __riscv_vse64_v_f64m8
|
||||
#define VSEV_FLOAT_M1 __riscv_vse64_v_f64m1
|
||||
#define VSSEV_FLOAT __riscv_vsse64_v_f64m8
|
||||
#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m8
|
||||
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8
|
||||
#define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f64m8_f64m1
|
||||
#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1
|
||||
#endif
|
||||
|
||||
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
|
||||
@@ -76,7 +86,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
|
||||
VSEV_FLOAT(y, vy, vl);
|
||||
}
|
||||
|
||||
} else if (1 == inc_x) {
|
||||
} else if (1 == inc_x && 0 != inc_y) {
|
||||
|
||||
BLASLONG stride_y = inc_y * sizeof(FLOAT);
|
||||
|
||||
@@ -89,8 +99,20 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
|
||||
VSSEV_FLOAT(y, stride_y, vy, vl);
|
||||
}
|
||||
|
||||
} else {
|
||||
} else if( 0 == inc_y ) {
|
||||
BLASLONG stride_x = inc_x * sizeof(FLOAT);
|
||||
size_t in_vl = VSETVL(n);
|
||||
vy = VFMVVF_FLOAT( y[0], in_vl );
|
||||
|
||||
for (size_t vl; n > 0; n -= vl, x += vl*inc_x) {
|
||||
vl = VSETVL(n);
|
||||
vx = VLSEV_FLOAT(x, stride_x, vl);
|
||||
vy = VFMACCVF_FLOAT(vy, da, vx, vl);
|
||||
}
|
||||
FLOAT_V_M1_T vres = VFMVVF_FLOAT_M1( 0.0f, 1 );
|
||||
vres = VFREDSUMVS_FLOAT( vy, vres, in_vl );
|
||||
VSEV_FLOAT_M1(y, vres, 1);
|
||||
} else {
|
||||
BLASLONG stride_x = inc_x * sizeof(FLOAT);
|
||||
BLASLONG stride_y = inc_y * sizeof(FLOAT);
|
||||
|
||||
|
||||
@@ -51,11 +51,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
#define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _)
|
||||
#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _)
|
||||
#define FLOAT_V_M1_T JOIN(vfloat, ELEN, m1, _t, _)
|
||||
#define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL)
|
||||
#define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL)
|
||||
#define VSEV_FLOAT JOIN(RISCV_RVV(vse), ELEN, _v_f, ELEN, LMUL)
|
||||
#define VSSEV_FLOAT JOIN(RISCV_RVV(vsse), ELEN, _v_f, ELEN, LMUL)
|
||||
#define VFMACCVF_FLOAT JOIN(RISCV_RVV(vfmacc), _vf_f, ELEN, LMUL, _)
|
||||
#define VFMVVF_FLOAT JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, LMUL, _)
|
||||
#define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, m1, _)
|
||||
|
||||
#ifdef RISCV_0p10_INTRINSICS
|
||||
#define VFREDSUMVS_FLOAT(va, vb, gvl) JOIN(RISCV_RVV(vfredusum_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1))(v_res, va, vb, gvl)
|
||||
#else
|
||||
#define VFREDSUMVS_FLOAT JOIN(RISCV_RVV(vfredusum_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1))
|
||||
#endif
|
||||
|
||||
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
|
||||
{
|
||||
@@ -123,7 +132,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
|
||||
VSEV_FLOAT(&y[j], vy0, gvl);
|
||||
j += gvl;
|
||||
}
|
||||
}else if(inc_x == 1){
|
||||
} else if (1 == inc_x && 0 != inc_y) {
|
||||
stride_y = inc_y * sizeof(FLOAT);
|
||||
gvl = VSETVL(n);
|
||||
if(gvl <= n/2){
|
||||
@@ -151,6 +160,19 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
|
||||
VSSEV_FLOAT(&y[j*inc_y], stride_y, vy0, gvl);
|
||||
j += gvl;
|
||||
}
|
||||
} else if( 0 == inc_y ) {
|
||||
BLASLONG stride_x = inc_x * sizeof(FLOAT);
|
||||
size_t in_vl = VSETVL(n);
|
||||
vy0 = VFMVVF_FLOAT( y[0], in_vl );
|
||||
|
||||
for (size_t vl; n > 0; n -= vl, x += vl*inc_x) {
|
||||
vl = VSETVL(n);
|
||||
vx0 = VLSEV_FLOAT(x, stride_x, vl);
|
||||
vy0 = VFMACCVF_FLOAT(vy0, da, vx0, vl);
|
||||
}
|
||||
FLOAT_V_M1_T v_res = VFMVVF_FLOAT_M1( 0.0f, 1 );
|
||||
v_res = VFREDSUMVS_FLOAT( vy0, v_res, in_vl );
|
||||
y[0] = EXTRACT_FLOAT(v_res);
|
||||
}else{
|
||||
stride_x = inc_x * sizeof(FLOAT);
|
||||
stride_y = inc_y * sizeof(FLOAT);
|
||||
|
||||
@@ -101,8 +101,10 @@ SCLAUX = la_constants.o \
|
||||
slaset.o slasq1.o slasq2.o slasq3.o slasq4.o slasq5.o slasq6.o \
|
||||
slasr.o slasrt.o slassq.o slasv2.o spttrf.o sstebz.o sstedc.o \
|
||||
ssteqr.o ssterf.o slaisnan.o sisnan.o \
|
||||
slartgp.o slartgs.o scombssq.o ../INSTALL/sroundup_lwork.o \
|
||||
../INSTALL/second_$(TIMER).o
|
||||
slartgp.o slartgs.o scombssq.o ../INSTALL/sroundup_lwork.o
|
||||
ifneq ($(F_COMPILER), IBM)
|
||||
SCLAUX += ../INSTALL/second_$(TIMER).o
|
||||
endif
|
||||
endif
|
||||
|
||||
ifneq "$(or $(BUILD_DOUBLE),$(BUILD_COMPLEX16))" ""
|
||||
@@ -124,7 +126,10 @@ DZLAUX = la_constants.o\
|
||||
dlasr.o dlasrt.o dlassq.o dlasv2.o dpttrf.o dstebz.o dstedc.o \
|
||||
dsteqr.o dsterf.o dlaisnan.o disnan.o \
|
||||
dlartgp.o dlartgs.o ../INSTALL/droundup_lwork.o \
|
||||
../INSTALL/dlamch.o ../INSTALL/dsecnd_$(TIMER).o
|
||||
../INSTALL/dlamch.o
|
||||
ifneq ($(F_COMPILER), IBM)
|
||||
DZLAUX += ../INSTALL/dsecnd_$(TIMER).o
|
||||
endif
|
||||
endif
|
||||
|
||||
#ifeq ($(BUILD_SINGLE),1)
|
||||
|
||||
@@ -107,6 +107,12 @@ set(ZDMDEIGTST zchkdmd.f90)
|
||||
# add_eig_executable(<name> <source>...)
#
# Creates the test executable <name> from the given sources and links it against
# the OpenBLAS library target
# (${LIBNAMEPREFIX}openblas${LIBNAMESUFFIX}${SUFFIX64_UNDERSCORE}).
#
# Deliberately a macro, not a function: the CMAKE_Fortran_FLAGS edit below has to
# take effect in the caller's scope.
macro(add_eig_executable name)
  add_executable(${name} ${ARGN})
  target_link_libraries(${name} ${LIBNAMEPREFIX}openblas${LIBNAMESUFFIX}${SUFFIX64_UNDERSCORE})

  # Mixed toolchain (gfortran + Clang) with OpenMP enabled: strip -fopenmp from the
  # Fortran flags and link omp and pthread explicitly instead.
  # NOTE(review): presumably this keeps gfortran from pulling in its own OpenMP
  # runtime next to the one used by the Clang-built library - confirm.
  # The compiler IDs are quoted so that an empty ID cannot produce a malformed
  # condition and the literals are not dereferenced as variable names.
  if (USE_OPENMP AND ("${CMAKE_Fortran_COMPILER_ID}" STREQUAL "GNU") AND ("${CMAKE_C_COMPILER_ID}" STREQUAL "Clang"))
    string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}")
    target_link_libraries(${name} omp pthread)
  endif()
endmacro()
|
||||
|
||||
|
||||
@@ -240,6 +240,10 @@ set(ZLINTSTRFP zchkrfp.f zdrvrfp.f zdrvrf1.f zdrvrf2.f zdrvrf3.f zdrvrf4.f zerrr
|
||||
# add_lin_executable(<name> <source>...)
#
# Creates the test executable <name> from the given sources and links it against
# the OpenBLAS library target
# (${LIBNAMEPREFIX}openblas${LIBNAMESUFFIX}${SUFFIX64_UNDERSCORE}).
#
# Deliberately a macro, not a function: the CMAKE_Fortran_FLAGS edit below has to
# take effect in the caller's scope.
macro(add_lin_executable name)
  add_executable(${name} ${ARGN})
  target_link_libraries(${name} ${LIBNAMEPREFIX}openblas${LIBNAMESUFFIX}${SUFFIX64_UNDERSCORE})

  # Mixed toolchain (gfortran + Clang) with OpenMP enabled: strip -fopenmp from the
  # Fortran flags and link omp and pthread explicitly instead.
  # NOTE(review): presumably this keeps gfortran from pulling in its own OpenMP
  # runtime next to the one used by the Clang-built library - confirm.
  # The compiler IDs are quoted so that an empty ID cannot produce a malformed
  # condition and the literals are not dereferenced as variable names.
  if (USE_OPENMP AND ("${CMAKE_Fortran_COMPILER_ID}" STREQUAL "GNU") AND ("${CMAKE_C_COMPILER_ID}" STREQUAL "Clang"))
    string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}")
    target_link_libraries(${name} omp pthread)
  endif()
endmacro()
|
||||
|
||||
|
||||
12
param.h
12
param.h
@@ -2845,21 +2845,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#define DGEMM_DEFAULT_UNROLL_M 2
|
||||
#define SGEMM_DEFAULT_UNROLL_N 8
|
||||
#define SGEMM_DEFAULT_UNROLL_M 2
|
||||
#define CGEMM_DEFAULT_UNROLL_N 4
|
||||
#define CGEMM_DEFAULT_UNROLL_M 1
|
||||
#define ZGEMM_DEFAULT_UNROLL_N 4
|
||||
#define ZGEMM_DEFAULT_UNROLL_M 1
|
||||
#else
|
||||
#define DGEMM_DEFAULT_UNROLL_N 4
|
||||
#define DGEMM_DEFAULT_UNROLL_M 16
|
||||
#define SGEMM_DEFAULT_UNROLL_N 8
|
||||
#define SGEMM_DEFAULT_UNROLL_M 16
|
||||
#define CGEMM_DEFAULT_UNROLL_N 4
|
||||
#define CGEMM_DEFAULT_UNROLL_M 16
|
||||
#define ZGEMM_DEFAULT_UNROLL_N 4
|
||||
#define ZGEMM_DEFAULT_UNROLL_M 8
|
||||
#endif
|
||||
|
||||
#define QGEMM_DEFAULT_UNROLL_N 2
|
||||
#define CGEMM_DEFAULT_UNROLL_N 2
|
||||
#define ZGEMM_DEFAULT_UNROLL_N 4
|
||||
#define XGEMM_DEFAULT_UNROLL_N 1
|
||||
|
||||
#define QGEMM_DEFAULT_UNROLL_M 2
|
||||
#define CGEMM_DEFAULT_UNROLL_M 2
|
||||
#define ZGEMM_DEFAULT_UNROLL_M 8
|
||||
#define XGEMM_DEFAULT_UNROLL_M 1
|
||||
|
||||
#define SGEMM_DEFAULT_P 256
|
||||
|
||||
@@ -21,10 +21,14 @@ endif()
|
||||
if (BUILD_COMPLEX16)
|
||||
list (APPEND OpenBLAS_Tests zblat1 zblat2 zblat3)
|
||||
endif()
|
||||
|
||||
# Report the C and Fortran compiler IDs at configure time. The text is passed as
# one quoted string; unquoted, message() joins its arguments without separators.
message(STATUS "CCOMP ${CMAKE_C_COMPILER_ID} FCOMP ${CMAKE_Fortran_COMPILER_ID}")

# Build one executable per entry of OpenBLAS_Tests from <name>.f and link it
# against ${OpenBLAS_LIBNAME}.
foreach(test_bin ${OpenBLAS_Tests})
  add_executable(${test_bin} ${test_bin}.f)
  target_link_libraries(${test_bin} ${OpenBLAS_LIBNAME})

  # Mixed toolchain (gfortran + Clang) with OpenMP enabled: strip -fopenmp from the
  # Fortran flags and link omp and pthread explicitly instead.
  # NOTE(review): presumably this keeps gfortran from pulling in its own OpenMP
  # runtime next to the one used by the Clang-built library - confirm.
  # The compiler IDs are quoted so that an empty ID cannot produce a malformed
  # condition and the literals are not dereferenced as variable names.
  if (USE_OPENMP AND ("${CMAKE_Fortran_COMPILER_ID}" STREQUAL "GNU") AND ("${CMAKE_C_COMPILER_ID}" STREQUAL "Clang"))
    string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}")
    target_link_libraries(${test_bin} omp pthread)
  endif()
endforeach()
|
||||
|
||||
# $1 exec, $2 input, $3 output_result
|
||||
|
||||
Reference in New Issue
Block a user