Merge branch 'develop' into issue4468

This commit is contained in:
Martin Kroeker 2024-02-23 11:39:49 +01:00 committed by GitHub
commit 2e86faa657
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
45 changed files with 22138 additions and 212 deletions

View File

@ -1,44 +1,44 @@
macos_instance: macos_instance:
image: ghcr.io/cirruslabs/macos-monterey-xcode:latest image: ghcr.io/cirruslabs/macos-monterey-xcode:latest
task: #task:
name: AppleM1/LLVM # name: AppleM1/LLVM
compile_script: # compile_script:
- brew install llvm # - brew install llvm
- export PATH=/opt/homebrew/opt/llvm/bin:$PATH # - export PATH=/opt/homebrew/opt/llvm/bin:$PATH
- export LDFLAGS="-L/opt/homebrew/opt/llvm/lib" # - export LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
- export CPPFLAGS="-I/opt/homebrew/opt/llvm/include" # - export CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
- make TARGET=VORTEX USE_OPENMP=1 CC=clang # - make TARGET=VORTEX USE_OPENMP=1 CC=clang
task: #task:
name: AppleM1/LLVM/ILP64 # name: AppleM1/LLVM/ILP64
compile_script: # compile_script:
- brew install llvm # - brew install llvm
- export PATH=/opt/homebrew/opt/llvm/bin:$PATH # - export PATH=/opt/homebrew/opt/llvm/bin:$PATH
- export LDFLAGS="-L/opt/homebrew/opt/llvm/lib" # - export LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
- export CPPFLAGS="-I/opt/homebrew/opt/llvm/include" # - export CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
- make TARGET=VORTEX USE_OPENMP=1 CC=clang INTERFACE64=1 # - make TARGET=VORTEX USE_OPENMP=1 CC=clang INTERFACE64=1
task: #task:
name: AppleM1/LLVM/CMAKE # name: AppleM1/LLVM/CMAKE
compile_script: # compile_script:
- brew install llvm # - brew install llvm
- export PATH=/opt/homebrew/opt/llvm/bin:$PATH # - export PATH=/opt/homebrew/opt/llvm/bin:$PATH
- export LDFLAGS="-L/opt/homebrew/opt/llvm/lib" # - export LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
- export CPPFLAGS="-I/opt/homebrew/opt/llvm/include" # - export CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
- mkdir build # - mkdir build
- cd build # - cd build
- cmake -DTARGET=VORTEX -DCMAKE_C_COMPILER=clang -DBUILD_SHARED_LIBS=ON .. # - cmake -DTARGET=VORTEX -DCMAKE_C_COMPILER=clang -DBUILD_SHARED_LIBS=ON ..
- make -j 4 # - make -j 4
task: #task:
name: AppleM1/GCC/MAKE/OPENMP # name: AppleM1/GCC/MAKE/OPENMP
compile_script: # compile_script:
- brew install gcc@11 # - brew install gcc@11
- export PATH=/opt/homebrew/bin:$PATH # - export PATH=/opt/homebrew/bin:$PATH
- export LDFLAGS="-L/opt/homebrew/lib" # - export LDFLAGS="-L/opt/homebrew/lib"
- export CPPFLAGS="-I/opt/homebrew/include" # - export CPPFLAGS="-I/opt/homebrew/include"
- make CC=gcc-11 FC=gfortran-11 USE_OPENMP=1 # - make CC=gcc-11 FC=gfortran-11 USE_OPENMP=1
macos_instance: macos_instance:
image: ghcr.io/cirruslabs/macos-monterey-xcode:latest image: ghcr.io/cirruslabs/macos-monterey-xcode:latest

149
.github/workflows/apple_m.yml vendored Normal file
View File

@ -0,0 +1,149 @@
name: apple m
on: [push, pull_request]
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
cancel-in-progress: true
permissions:
contents: read # to fetch code (actions/checkout)
jobs:
build:
if: "github.repository == 'OpenMathLib/OpenBLAS'"
runs-on: macos-14
strategy:
fail-fast: false
matrix:
build: [cmake, make]
fortran: [gfortran]
openmp: [0, 1]
ilp64: [0, 1]
steps:
- name: Checkout repository
uses: actions/checkout@v3
- name: Print system information
run: |
if [ "$RUNNER_OS" == "Linux" ]; then
cat /proc/cpuinfo
elif [ "$RUNNER_OS" == "macOS" ]; then
sysctl -a | grep machdep.cpu
else
echo "::error::$RUNNER_OS not supported"
exit 1
fi
- name: Install Dependencies
run: |
if [ "$RUNNER_OS" == "Linux" ]; then
sudo apt-get install -y gfortran cmake ccache libtinfo5
elif [ "$RUNNER_OS" == "macOS" ]; then
# It looks like "gfortran" isn't working correctly unless "gcc" is re-installed.
brew reinstall gcc
brew install coreutils cmake ccache
brew install llvm
else
echo "::error::$RUNNER_OS not supported"
exit 1
fi
- name: Compilation cache
uses: actions/cache@v3
with:
path: ~/.ccache
# We include the commit sha in the cache key, as new cache entries are
# only created if there is no existing entry for the key yet.
# GNU make and cmake call the compilers differently. It looks like
# that causes the cache to mismatch. Keep the ccache for both build
# tools separate to avoid polluting each other.
key: ccache-${{ runner.os }}-${{ matrix.build }}-${{ matrix.fortran }}-${{ github.ref }}-${{ github.sha }}
# Restore a matching ccache cache entry. Prefer same branch and same Fortran compiler.
restore-keys: |
ccache-${{ runner.os }}-${{ matrix.build }}-${{matrix.fortran }}-${{ github.ref }}
ccache-${{ runner.os }}-${{ matrix.build }}-${{matrix.fortran }}
ccache-${{ runner.os }}-${{ matrix.build }}
- name: Configure ccache
run: |
if [ "${{ matrix.build }}" = "make" ]; then
# Add ccache to path
if [ "$RUNNER_OS" = "Linux" ]; then
echo "/usr/lib/ccache" >> $GITHUB_PATH
elif [ "$RUNNER_OS" = "macOS" ]; then
echo "$(brew --prefix)/opt/ccache/libexec" >> $GITHUB_PATH
echo "/opt/homebrew/opt/llvm/bin" >>$GITHUB_PATH
echo "" >>$GITHUB_PATH
else
echo "::error::$RUNNER_OS not supported"
exit 1
fi
fi
# Limit the maximum size and switch on compression to avoid exceeding the total disk or cache quota (5 GB).
test -d ~/.ccache || mkdir -p ~/.ccache
echo "max_size = 300M" > ~/.ccache/ccache.conf
echo "compression = true" >> ~/.ccache/ccache.conf
ccache -s
- name: Build OpenBLAS
run: |
export LDFLAGS="-L/opt/homebrew/opt/llvm/lib"
export CPPFLAGS="-I/opt/homebrew/opt/llvm/include"
export CC="/opt/homebrew/opt/llvm/bin/clang"
case "${{ matrix.build }}" in
"make")
make -j$(nproc) DYNAMIC_ARCH=1 USE_OPENMP=${{matrix.openmp}} INTERFACE64=${{matrix.ilp64}} FC="ccache ${{ matrix.fortran }}"
;;
"cmake")
export LDFLAGS="$LDFLAGS -Wl,-ld_classic"
mkdir build && cd build
cmake -DDYNAMIC_ARCH=1 \
-DUSE_OPENMP=${{matrix.openmp}} \
-DINTERFACE64=${{matrix.ilp64}} \
-DNOFORTRAN=0 \
-DBUILD_WITHOUT_LAPACK=0 \
-DCMAKE_VERBOSE_MAKEFILE=ON \
-DCMAKE_BUILD_TYPE=Release \
-DCMAKE_Fortran_COMPILER=${{ matrix.fortran }} \
-DCMAKE_C_COMPILER_LAUNCHER=ccache \
-DCMAKE_Fortran_COMPILER_LAUNCHER=ccache \
..
cmake --build .
;;
*)
echo "::error::Configuration not supported"
exit 1
;;
esac
- name: Show ccache status
continue-on-error: true
run: ccache -s
# Run the test suites that belong to the build tool selected by the matrix.
- name: Run tests
timeout-minutes: 60
run: |
case "${{ matrix.build }}" in
"make")
# NOTE(review): the build step passes USE_OPENMP and INTERFACE64 from the
# matrix, while the flags below pin USE_OPENMP=0 and do not forward
# INTERFACE64 - confirm this is intended for the openmp=1 / ilp64=1 jobs.
MAKE_FLAGS='DYNAMIC_ARCH=1 USE_OPENMP=0'
# Each suite is wrapped in ::group::/::endgroup:: so its output is folded
# into its own section of the job log.
echo "::group::Tests in 'test' directory"
make -C test $MAKE_FLAGS FC="ccache ${{ matrix.fortran }}"
echo "::endgroup::"
echo "::group::Tests in 'ctest' directory"
make -C ctest $MAKE_FLAGS FC="ccache ${{ matrix.fortran }}"
echo "::endgroup::"
echo "::group::Tests in 'utest' directory"
make -C utest $MAKE_FLAGS FC="ccache ${{ matrix.fortran }}"
echo "::endgroup::"
;;
"cmake")
# The cmake build lives in ./build (created by the build step); run ctest there.
cd build && ctest
;;
*)
# Any other matrix value is reported as a workflow error and fails the step.
echo "::error::Configuration not supported"
exit 1
;;
esac

253
.github/workflows/riscv64_vector.yml vendored Normal file
View File

@ -0,0 +1,253 @@
name: riscv64 zvl256b qemu test
on: [push, pull_request]
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
cancel-in-progress: true
permissions:
contents: read # to fetch code (actions/checkout)
jobs:
TEST:
if: "github.repository == 'OpenMathLib/OpenBLAS'"
runs-on: ubuntu-latest
env:
triple: riscv64-unknown-linux-gnu
riscv_gnu_toolchain: https://github.com/riscv-collab/riscv-gnu-toolchain
riscv_gnu_toolchain_version: 13.2.0
riscv_gnu_toolchain_nightly_download_path: /releases/download/2024.02.02/riscv64-glibc-ubuntu-22.04-llvm-nightly-2024.02.02-nightly.tar.gz
strategy:
fail-fast: false
matrix:
include:
- target: RISCV64_ZVL128B
opts: TARGET=RISCV64_ZVL128B BINARY=64 ARCH=riscv64
qemu_cpu: rv64,g=true,c=true,v=true,vext_spec=v1.0,vlen=128,elen=64
- target: RISCV64_ZVL256B
opts: TARGET=RISCV64_ZVL256B BINARY=64 ARCH=riscv64
qemu_cpu: rv64,g=true,c=true,v=true,vext_spec=v1.0,vlen=256,elen=64
steps:
- name: Checkout repository
uses: actions/checkout@v3
- name: install build deps
run: |
# -y keeps apt-get from ever waiting for a confirmation prompt.
sudo apt-get update
sudo apt-get install -y autoconf automake autotools-dev ninja-build make \
libgomp1-riscv64-cross ccache
# riscv_gnu_toolchain_nightly_download_path already starts with '/', so the
# two variables are joined without an extra slash to form the download URL.
wget "${riscv_gnu_toolchain}${riscv_gnu_toolchain_nightly_download_path}"
# Unpack the downloaded tarball (named after the last path component) into /opt.
tar -xvf "$(basename "${riscv_gnu_toolchain_nightly_download_path}")" -C /opt
- name: Compilation cache
uses: actions/cache@v3
with:
path: ~/.ccache
key: ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }}-${{ github.sha }}
restore-keys: |
ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }}
ccache-${{ runner.os }}-${{ matrix.target }}
- name: Configure ccache
run: |
test -d ~/.ccache || mkdir -p ~/.ccache
echo "max_size = 300M" > ~/.ccache/ccache.conf
echo "compression = true" >> ~/.ccache/ccache.conf
ccache -s
- name: build OpenBLAS libs
run: |
export PATH="/opt/riscv/bin:$PATH"
make TARGET=${{ matrix.target }} CFLAGS="-DTARGET=${{ matrix.target }}" \
CC='ccache clang --rtlib=compiler-rt -target ${triple} --sysroot /opt/riscv/sysroot --gcc-toolchain=/opt/riscv/lib/gcc/riscv64-unknown-linux-gnu/${riscv_gnu_toolchain_version}/' \
AR='ccache ${triple}-ar' AS='ccache ${triple}-gcc' LD='ccache ${triple}-gcc' \
RANLIB='ccache ${triple}-ranlib' \
FC='ccache ${triple}-gfortran' ${{ matrix.opts }} \
HOSTCC=gcc HOSTFC=gfortran -j$(nproc)
- name: build OpenBLAS tests
run: |
export PATH="/opt/riscv/bin:$PATH"
make TARGET=${{ matrix.target }} CFLAGS="-DTARGET=${{ matrix.target }}" \
CC='${triple}-gcc' \
AR='ccache ${triple}-ar' AS='ccache ${triple}-gcc' LD='ccache ${triple}-gcc' \
RANLIB='ccache ${triple}-ranlib' \
FC='ccache ${triple}-gfortran' ${{ matrix.opts }} \
HOSTCC=gcc HOSTFC=gfortran -j$(nproc) tests
- name: build lapack-netlib tests
working-directory: ./lapack-netlib/TESTING
run: |
export PATH="/opt/riscv/bin:$PATH"
make TARGET=${{ matrix.target }} CFLAGS="-DTARGET=${{ matrix.target }}" \
CC='${triple}-gcc' \
AR='ccache ${triple}-ar' AS='ccache ${triple}-gcc' LD='ccache ${triple}-gcc' \
RANLIB='ccache ${triple}-ranlib' \
FC='ccache ${triple}-gfortran' ${{ matrix.opts }} \
HOSTCC=gcc HOSTFC=gfortran -j$(nproc) \
LIN/xlintsts LIN/xlintstc LIN/xlintstd LIN/xlintstz LIN/xlintstrfs \
LIN/xlintstrfc LIN/xlintstrfd LIN/xlintstrfz LIN/xlintstds \
LIN/xlintstzc EIG/xeigtsts EIG/xeigtstc EIG/xeigtstd EIG/xeigtstz \
- name: OpenBLAS tests
shell: bash
run: |
export PATH="/opt/riscv/bin:$PATH"
export QEMU_CPU=${{ matrix.qemu_cpu }}
rm -rf ./test_out
mkdir -p ./test_out
: # run_test DIR CMD [DATA]: run ./DIR/CMD under qemu-riscv64, feeding ./DIR/DATA on stdin when DATA is given.
: # Output is shown on the console and appended (tee -a) to ./test_out/DIR.CMD, so the header line written first is kept.
: # The exit status is captured with "|| RV=$?" because the step runs with errexit: a bare failing pipeline
: # would end the background subshell before the "*** FAIL" marker could be written to the log.
run_test() { local DIR=$1; local CMD=$2; local DATA=$3; local OUTPUT="./test_out/$DIR.$CMD"; local RV=0; \
echo "`pwd`/$DIR/$CMD $DIR/$DATA" >> $OUTPUT; \
if [[ -z $DATA ]]; then qemu-riscv64 ./$DIR/$CMD |& tee -a $OUTPUT || RV=$? ; \
else qemu-riscv64 ./$DIR/$CMD < ./$DIR/$DATA |& tee -a $OUTPUT || RV=$? ; fi ; \
if [[ $RV != 0 ]]; then echo "*** FAIL: nonzero exit code $RV" >> $OUTPUT ; fi \
}
run_test test cblat1 &
run_test test cblat2 cblat2.dat &
run_test test cblat3 cblat3.dat &
run_test test dblat1 &
run_test test dblat2 dblat2.dat &
run_test test dblat3 dblat3.dat &
run_test test sblat1 &
run_test test sblat2 sblat2.dat &
run_test test sblat3 sblat3.dat &
run_test test zblat1 &
run_test test zblat2 zblat2.dat &
run_test test zblat3 zblat3.dat &
run_test ctest xccblat1 &
run_test ctest xccblat2 cin2 &
run_test ctest xccblat3 cin3 &
run_test ctest xdcblat1 &
run_test ctest xdcblat2 din2 &
run_test ctest xdcblat3 din3 &
run_test ctest xscblat1 &
run_test ctest xscblat2 sin2 &
run_test ctest xscblat3 sin3 &
run_test ctest xzcblat1 &
run_test ctest xzcblat2 zin2 &
run_test ctest xzcblat3 zin3 &
wait
while IFS= read -r -d $'\0' LOG; do cat $LOG ; FAILURES=1 ; done < <(grep -lZ FAIL ./test_out/*)
if [[ ! -z $FAILURES ]]; then echo "==========" ; echo "== FAIL ==" ; echo "==========" ; echo ; exit 1 ; fi
- name: netlib tests
shell: bash
run: |
: # these take a very long time
echo "Skipping netlib tests in CI"
exit 0
: # comment out exit above to enable the tests
: # probably we want to identify a subset to run in CI
export PATH="/opt/riscv/bin:$PATH"
export QEMU_CPU=${{ matrix.qemu_cpu }}
rm -rf ./test_out
mkdir -p ./test_out
: # run_test OUT DATA CMD TITLE: run ./lapack-netlib/TESTING/CMD under qemu-riscv64 with
: # ./lapack-netlib/TESTING/DATA on stdin, logging to ./test_out/OUT, then append "*** FAIL" markers.
: # The exit status is captured with "|| RV=$?" because the step runs with errexit: a bare failing pipeline
: # would end the background subshell before any marker could be written.
: # tee truncates OUT, so the TITLE and CMD lines echoed first do not survive in the log.
: # NOTE(review): the last check starts with "grep -q", which prints nothing, so the filters after it
: # see empty input and the 'error' marker can never be written - decide on the intended filter before
: # enabling these tests (the logs are also parsed by lapack_testing.py afterwards).
run_test() { local OUTPUT="./test_out/$1"; local DATA="./lapack-netlib/TESTING/$2"; local CMD="./lapack-netlib/TESTING/$3"; local RV=0; \
echo "$4" >> $OUTPUT; \
echo "$CMD" >> $OUTPUT; \
qemu-riscv64 $CMD < $DATA |& tee $OUTPUT || RV=$? ; \
if [[ $RV != 0 ]]; then echo "*** FAIL: nonzero exit code $RV" >> $OUTPUT ; fi; \
if grep -q fail $OUTPUT ; then echo "*** FAIL: log contains 'fail'" >> $OUTPUT ; fi ; \
if grep -q rror $OUTPUT | grep -v -q "passed" | grep -v "largest error" ; then echo "*** FAIL: log contains 'error'" >> $OUTPUT ; fi \
}
run_test stest.out stest.in LIN/xlintsts "Testing REAL LAPACK linear equation routines" &
run_test ctest.out ctest.in LIN/xlintstc "Testing COMPLEX LAPACK linear equation routines" &
run_test dtest.out dtest.in LIN/xlintstd "Testing DOUBLE PRECISION LAPACK linear equation routines" &
run_test ztest.out ztest.in LIN/xlintstz "Testing COMPLEX16 LAPACK linear equation routines" &
run_test dstest.out dstest.in LIN/xlintstds "Testing SINGLE-DOUBLE PRECISION LAPACK prototype linear equation routines" &
run_test zctest.out zctest.in LIN/xlintstzc "Testing COMPLEX-COMPLEX16 LAPACK prototype linear equation routines" &
run_test stest_rfp.out stest_rfp.in LIN/xlintstrfs "Testing REAL LAPACK RFP prototype linear equation routines" &
run_test dtest_rfp.out dtest_rfp.in LIN/xlintstrfd "Testing DOUBLE PRECISION LAPACK RFP prototype linear equation routines" &
run_test ctest_rfp.out ctest_rfp.in LIN/xlintstrfc "Testing COMPLEX LAPACK RFP prototype linear equation routines" &
run_test ztest_rfp.out ztest_rfp.in LIN/xlintstrfz "Testing COMPLEX16 LAPACK RFP prototype linear equation routines" &
run_test snep.out nep.in EIG/xeigtsts "NEP - Testing Nonsymmetric Eigenvalue Problem routines" &
run_test ssep.out sep.in EIG/xeigtsts "SEP - Testing Symmetric Eigenvalue Problem routines" &
run_test sse2.out se2.in EIG/xeigtsts "SEP - Testing Symmetric Eigenvalue Problem routines" &
run_test ssvd.out svd.in EIG/xeigtsts "SVD - Testing Singular Value Decomposition routines" &
run_test sec.out sec.in EIG/xeigtsts "SEC - Testing REAL Eigen Condition Routines" &
run_test sed.out sed.in EIG/xeigtsts "SEV - Testing REAL Nonsymmetric Eigenvalue Driver" &
run_test sgg.out sgg.in EIG/xeigtsts "SGG - Testing REAL Nonsymmetric Generalized Eigenvalue Problem routines" &
run_test sgd.out sgd.in EIG/xeigtsts "SGD - Testing REAL Nonsymmetric Generalized Eigenvalue Problem driver routines" &
run_test ssb.out ssb.in EIG/xeigtsts "SSB - Testing REAL Symmetric Eigenvalue Problem routines" &
run_test ssg.out ssg.in EIG/xeigtsts "SSG - Testing REAL Symmetric Generalized Eigenvalue Problem routines" &
run_test sbal.out sbal.in EIG/xeigtsts "SGEBAL - Testing the balancing of a REAL general matrix" &
run_test sbak.out sbak.in EIG/xeigtsts "SGEBAK - Testing the back transformation of a REAL balanced matrix" &
run_test sgbal.out sgbal.in EIG/xeigtsts "SGGBAL - Testing the balancing of a pair of REAL general matrices" &
run_test sgbak.out sgbak.in EIG/xeigtsts "SGGBAK - Testing the back transformation of a pair of REAL balanced matrices" &
run_test sbb.out sbb.in EIG/xeigtsts "SBB - Testing banded Singular Value Decomposition routines" &
run_test sglm.out glm.in EIG/xeigtsts "GLM - Testing Generalized Linear Regression Model routines" &
run_test sgqr.out gqr.in EIG/xeigtsts "GQR - Testing Generalized QR and RQ factorization routines" &
run_test sgsv.out gsv.in EIG/xeigtsts "GSV - Testing Generalized Singular Value Decomposition routines" &
run_test scsd.out csd.in EIG/xeigtsts "CSD - Testing CS Decomposition routines" &
run_test slse.out lse.in EIG/xeigtsts "LSE - Testing Constrained Linear Least Squares routines" &
run_test cnep.out nep.in EIG/xeigtstc "NEP - Testing Nonsymmetric Eigenvalue Problem routines" &
run_test csep.out sep.in EIG/xeigtstc "SEP - Testing Symmetric Eigenvalue Problem routines" &
run_test cse2.out se2.in EIG/xeigtstc "SEP - Testing Symmetric Eigenvalue Problem routines" &
run_test csvd.out svd.in EIG/xeigtstc "SVD - Testing Singular Value Decomposition routines" &
run_test cec.out cec.in EIG/xeigtstc "CEC - Testing COMPLEX Eigen Condition Routines" &
run_test ced.out ced.in EIG/xeigtstc "CES - Testing COMPLEX Nonsymmetric Schur Form Driver" &
run_test cgg.out cgg.in EIG/xeigtstc "CGG - Testing COMPLEX Nonsymmetric Generalized Eigenvalue Problem routines" &
run_test cgd.out cgd.in EIG/xeigtstc "CGD - Testing COMPLEX Nonsymmetric Generalized Eigenvalue Problem driver routines" &
run_test csb.out csb.in EIG/xeigtstc "CHB - Testing Hermitian Eigenvalue Problem routines" &
run_test csg.out csg.in EIG/xeigtstc "CSG - Testing Symmetric Generalized Eigenvalue Problem routines" &
run_test cbal.out cbal.in EIG/xeigtstc "CGEBAL - Testing the balancing of a COMPLEX general matrix" &
run_test cbak.out cbak.in EIG/xeigtstc "CGEBAK - Testing the back transformation of a COMPLEX balanced matrix" &
run_test cgbal.out cgbal.in EIG/xeigtstc "CGGBAL - Testing the balancing of a pair of COMPLEX general matrices" &
run_test cgbak.out cgbak.in EIG/xeigtstc "CGGBAK - Testing the back transformation of a pair of COMPLEX balanced matrices" &
run_test cbb.out cbb.in EIG/xeigtstc "CBB - Testing banded Singular Value Decomposition routines" &
run_test cglm.out glm.in EIG/xeigtstc "GLM - Testing Generalized Linear Regression Model routines" &
run_test cgqr.out gqr.in EIG/xeigtstc "GQR - Testing Generalized QR and RQ factorization routines" &
run_test cgsv.out gsv.in EIG/xeigtstc "GSV - Testing Generalized Singular Value Decomposition routines" &
run_test ccsd.out csd.in EIG/xeigtstc "CSD - Testing CS Decomposition routines" &
run_test clse.out lse.in EIG/xeigtstc "LSE - Testing Constrained Linear Least Squares routines" &
run_test dnep.out nep.in EIG/xeigtstd "NEP - Testing Nonsymmetric Eigenvalue Problem routines" &
run_test dsep.out sep.in EIG/xeigtstd "SEP - Testing Symmetric Eigenvalue Problem routines" &
run_test dse2.out se2.in EIG/xeigtstd "SEP - Testing Symmetric Eigenvalue Problem routines" &
run_test dsvd.out svd.in EIG/xeigtstd "SVD - Testing Singular Value Decomposition routines" &
run_test dec.out dec.in EIG/xeigtstd "DEC - Testing DOUBLE PRECISION Eigen Condition Routines" &
run_test ded.out ded.in EIG/xeigtstd "DEV - Testing DOUBLE PRECISION Nonsymmetric Eigenvalue Driver" &
run_test dgg.out dgg.in EIG/xeigtstd "DGG - Testing DOUBLE PRECISION Nonsymmetric Generalized Eigenvalue Problem routines" &
run_test dgd.out dgd.in EIG/xeigtstd "DGD - Testing DOUBLE PRECISION Nonsymmetric Generalized Eigenvalue Problem driver routines" &
run_test dsb.out dsb.in EIG/xeigtstd "DSB - Testing DOUBLE PRECISION Symmetric Eigenvalue Problem routines" &
run_test dsg.out dsg.in EIG/xeigtstd "DSG - Testing DOUBLE PRECISION Symmetric Generalized Eigenvalue Problem routines" &
run_test dbal.out dbal.in EIG/xeigtstd "DGEBAL - Testing the balancing of a DOUBLE PRECISION general matrix" &
run_test dbak.out dbak.in EIG/xeigtstd "DGEBAK - Testing the back transformation of a DOUBLE PRECISION balanced matrix" &
run_test dgbal.out dgbal.in EIG/xeigtstd "DGGBAL - Testing the balancing of a pair of DOUBLE PRECISION general matrices" &
run_test dgbak.out dgbak.in EIG/xeigtstd "DGGBAK - Testing the back transformation of a pair of DOUBLE PRECISION balanced matrices" &
run_test dbb.out dbb.in EIG/xeigtstd "DBB - Testing banded Singular Value Decomposition routines" &
run_test dglm.out glm.in EIG/xeigtstd "GLM - Testing Generalized Linear Regression Model routines" &
run_test dgqr.out gqr.in EIG/xeigtstd "GQR - Testing Generalized QR and RQ factorization routines" &
run_test dgsv.out gsv.in EIG/xeigtstd "GSV - Testing Generalized Singular Value Decomposition routines" &
run_test dcsd.out csd.in EIG/xeigtstd "CSD - Testing CS Decomposition routines" &
run_test dlse.out lse.in EIG/xeigtstd "LSE - Testing Constrained Linear Least Squares routines" &
run_test znep.out nep.in EIG/xeigtstz "NEP - Testing Nonsymmetric Eigenvalue Problem routines" &
run_test zsep.out sep.in EIG/xeigtstz "SEP - Testing Symmetric Eigenvalue Problem routines" &
run_test zse2.out se2.in EIG/xeigtstz "SEP - Testing Symmetric Eigenvalue Problem routines" &
run_test zsvd.out svd.in EIG/xeigtstz "SVD - Testing Singular Value Decomposition routines" &
run_test zec.out zec.in EIG/xeigtstz "ZEC - Testing COMPLEX16 Eigen Condition Routines" &
run_test zed.out zed.in EIG/xeigtstz "ZES - Testing COMPLEX16 Nonsymmetric Schur Form Driver" &
run_test zgg.out zgg.in EIG/xeigtstz "ZGG - Testing COMPLEX16 Nonsymmetric Generalized Eigenvalue Problem routines" &
run_test zgd.out zgd.in EIG/xeigtstz "ZGD - Testing COMPLEX16 Nonsymmetric Generalized Eigenvalue Problem driver routines" &
run_test zsb.out zsb.in EIG/xeigtstz "ZHB - Testing Hermitian Eigenvalue Problem routines" &
run_test zsg.out zsg.in EIG/xeigtstz "ZSG - Testing Symmetric Generalized Eigenvalue Problem routines" &
run_test zbal.out zbal.in EIG/xeigtstz "ZGEBAL - Testing the balancing of a COMPLEX16 general matrix" &
run_test zbak.out zbak.in EIG/xeigtstz "ZGEBAK - Testing the back transformation of a COMPLEX16 balanced matrix" &
run_test zgbal.out zgbal.in EIG/xeigtstz "ZGGBAL - Testing the balancing of a pair of COMPLEX general matrices" &
run_test zgbak.out zgbak.in EIG/xeigtstz "ZGGBAK - Testing the back transformation of a pair of COMPLEX16 balanced matrices" &
run_test zbb.out zbb.in EIG/xeigtstz "ZBB - Testing banded Singular Value Decomposition routines" &
run_test zglm.out glm.in EIG/xeigtstz "GLM - Testing Generalized Linear Regression Model routines" &
run_test zgqr.out gqr.in EIG/xeigtstz "GQR - Testing Generalized QR and RQ factorization routines" &
run_test zgsv.out gsv.in EIG/xeigtstz "GSV - Testing Generalized Singular Value Decomposition routines" &
run_test zcsd.out csd.in EIG/xeigtstz "CSD - Testing CS Decomposition routines" &
run_test zlse.out lse.in EIG/xeigtstz "LSE - Testing Constrained Linear Least Squares routines" &
wait
: # Print every log that carries a FAIL marker and remember that something failed.
while IFS= read -r -d $'\0' LOG; do cat $LOG ; FAILURES=1 ; done < <(grep -lZ FAIL ./test_out/*)
: # Let the netlib helper script summarise all logs in ./test_out.
python ./lapack-netlib/lapack_testing.py -d ./test_out -e > netlib_summary
TOTALS="$(grep 'ALL PRECISIONS' netlib_summary)"
: # Start from -1 so the counters stay non-zero if awk emits no assignments.
NUMERICAL_ERRORS=-1
OTHER_ERRORS=-1
: # Source the assignments built from fields 5 and 7 of the 'ALL PRECISIONS' row.
: # The process substitution is now closed with ')' - it was left unterminated before.
. <(awk '/ALL PRECISIONS/{printf "NUMERICAL_ERRORS=%s\nOTHER_ERRORS=%s\n", $5, $7}' netlib_summary)
if (( NUMERICAL_ERRORS != 0 )) || (( OTHER_ERRORS != 0 )) ; then cat netlib_summary ; FAILURES=1 ; fi
if [[ ! -z $FAILURES ]]; then echo "==========" ; echo "== FAIL ==" ; echo "==========" ; echo ; exit 1 ; fi

View File

@ -219,6 +219,7 @@ In chronological order:
* Mark Seminatore <https://github.com/mseminatore> * Mark Seminatore <https://github.com/mseminatore>
* [2023-11-09] Improve Windows threading performance scaling * [2023-11-09] Improve Windows threading performance scaling
* [2024-02-09] Introduce MT_TRACE facility and improve code consistency
* Dirreke <https://github.com/Dirreke> * Dirreke <https://github.com/Dirreke>
* [2024-01-16] Add basic support for the CSKY architecture * [2024-01-16] Add basic support for the CSKY architecture

View File

@ -156,6 +156,9 @@ endif
ifeq ($(OSNAME), CYGWIN_NT) ifeq ($(OSNAME), CYGWIN_NT)
@$(MAKE) -C exports dll @$(MAKE) -C exports dll
endif endif
ifeq ($(OSNAME), AIX)
@$(MAKE) -C exports so
endif
endif endif
tests : shared tests : shared

View File

@ -1715,11 +1715,7 @@ endif
LIBDLLNAME = $(LIBPREFIX).dll LIBDLLNAME = $(LIBPREFIX).dll
IMPLIBNAME = lib$(LIBNAMEBASE).dll.a IMPLIBNAME = lib$(LIBNAMEBASE).dll.a
ifneq ($(OSNAME), AIX)
LIBSONAME = $(LIBNAME:.$(LIBSUFFIX)=.so) LIBSONAME = $(LIBNAME:.$(LIBSUFFIX)=.so)
else
LIBSONAME = $(LIBNAME:.$(LIBSUFFIX)=.a)
endif
LIBDYNNAME = $(LIBNAME:.$(LIBSUFFIX)=.dylib) LIBDYNNAME = $(LIBNAME:.$(LIBSUFFIX)=.dylib)
LIBDEFNAME = $(LIBNAME:.$(LIBSUFFIX)=.def) LIBDEFNAME = $(LIBNAME:.$(LIBSUFFIX)=.def)
LIBEXPNAME = $(LIBNAME:.$(LIBSUFFIX)=.exp) LIBEXPNAME = $(LIBNAME:.$(LIBSUFFIX)=.exp)

View File

@ -203,6 +203,16 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th
make HOSTCC=gcc TARGET=x280 NUM_THREADS=8 CC=riscv64-unknown-linux-gnu-clang FC=riscv64-unknown-linux-gnu-gfortran make HOSTCC=gcc TARGET=x280 NUM_THREADS=8 CC=riscv64-unknown-linux-gnu-clang FC=riscv64-unknown-linux-gnu-gfortran
``` ```
- **ZVL???B**: Level-1, Level-2 and Level-3 BLAS, including vectorised kernels, targeting generic RISC-V cores with vector support whose vector registers are at least the corresponding width; ZVL128B and ZVL256B are available.
e.g.:
```sh
make TARGET=RISCV64_ZVL256B CFLAGS="-DTARGET=RISCV64_ZVL256B" \
BINARY=64 ARCH=riscv64 CC='clang -target riscv64-unknown-linux-gnu' \
AR=riscv64-unknown-linux-gnu-ar AS=riscv64-unknown-linux-gnu-gcc \
LD=riscv64-unknown-linux-gnu-gcc FC=riscv64-unknown-linux-gnu-gfortran \
HOSTCC=gcc HOSTFC=gfortran -j
```
### Support for multiple targets in a single library ### Support for multiple targets in a single library
OpenBLAS can be built for multiple targets with runtime detection of the target cpu by specifying `DYNAMIC_ARCH=1` in Makefile.rule, on the gmake command line or as `-DDYNAMIC_ARCH=TRUE` in cmake. OpenBLAS can be built for multiple targets with runtime detection of the target cpu by specifying `DYNAMIC_ARCH=1` in Makefile.rule, on the gmake command line or as `-DDYNAMIC_ARCH=TRUE` in cmake.

View File

@ -64,6 +64,7 @@ else ()
"#define NEEDBUNDERSCORE 1\n") "#define NEEDBUNDERSCORE 1\n")
endif() endif()
if (CMAKE_Fortran_COMPILER)
get_filename_component(F_COMPILER ${CMAKE_Fortran_COMPILER} NAME_WE) get_filename_component(F_COMPILER ${CMAKE_Fortran_COMPILER} NAME_WE)
string(TOUPPER ${F_COMPILER} F_COMPILER) string(TOUPPER ${F_COMPILER} F_COMPILER)
endif()

View File

@ -6,9 +6,6 @@
if (${F_COMPILER} STREQUAL "FLANG" AND NOT CMAKE_Fortran_COMPILER_ID STREQUAL "LLVMFlang") if (${F_COMPILER} STREQUAL "FLANG" AND NOT CMAKE_Fortran_COMPILER_ID STREQUAL "LLVMFlang")
# This is for classic Flang. LLVM Flang is handled with gfortran below. # This is for classic Flang. LLVM Flang is handled with gfortran below.
set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_FLANG") set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_FLANG")
if (BINARY64 AND INTERFACE64)
set(FCOMMON_OPT "${FCOMMON_OPT} -i8")
endif ()
if (USE_OPENMP) if (USE_OPENMP)
set(FCOMMON_OPT "${FCOMMON_OPT} -fopenmp") set(FCOMMON_OPT "${FCOMMON_OPT} -fopenmp")
endif () endif ()
@ -55,6 +52,9 @@ if (${F_COMPILER} STREQUAL "GFORTRAN" OR ${F_COMPILER} STREQUAL "F95" OR CMAKE_F
if (MIPS64) if (MIPS64)
if (BINARY64) if (BINARY64)
set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=64") set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=64")
if (INTERFACE64)
set(FCOMMON_OPT "${FCOMMON_OPT} -fdefault-integer-8")
endif ()
else () else ()
set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=n32") set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=n32")
endif () endif ()
@ -83,6 +83,9 @@ if (${F_COMPILER} STREQUAL "GFORTRAN" OR ${F_COMPILER} STREQUAL "F95" OR CMAKE_F
endif () endif ()
endif () endif ()
endif () endif ()
if (ARM64 AND INTERFACE64)
set(FCOMMON_OPT "${FCOMMON_OPT} -fdefault-integer-8")
endif ()
else () else ()
if (BINARY64) if (BINARY64)
set(FCOMMON_OPT "${FCOMMON_OPT} -m64") set(FCOMMON_OPT "${FCOMMON_OPT} -m64")

View File

@ -91,7 +91,7 @@ static inline int blas_quickdivide(blasint x, blasint y){
#define BUFFER_SIZE ( 32 << 20) #define BUFFER_SIZE ( 32 << 20)
#define SEEK_ADDRESS #define SEEK_ADDRESS
#if defined(C910V) || (defined(RISCV64_ZVL256B) && (defined(__clang__) || defined(RVV_COMPATIBLE_GCC))) || defined(RISCV64_ZVL128B) || defined(x280) #if defined(C910V) || defined(RISCV64_ZVL256B) || defined(RISCV64_ZVL128B) || defined(x280)
# include <riscv_vector.h> # include <riscv_vector.h>
#endif #endif

View File

@ -40,6 +40,10 @@ else()
c_${float_char}blas1.c) c_${float_char}blas1.c)
endif() endif()
target_link_libraries(x${float_char}cblat1 ${OpenBLAS_LIBNAME}) target_link_libraries(x${float_char}cblat1 ${OpenBLAS_LIBNAME})
# With OpenMP enabled, a GNU Fortran compiler and a Clang C compiler: strip
# -fopenmp from the Fortran flags and link the test against omp and pthread.
# The compiler IDs are compared by variable name rather than by ${...}
# expansion, so an unset Fortran compiler ID cannot leave STREQUAL without a
# left-hand operand and abort the configure step.
if (USE_OPENMP AND (CMAKE_Fortran_COMPILER_ID STREQUAL "GNU") AND (CMAKE_C_COMPILER_ID STREQUAL "Clang"))
string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}")
target_link_libraries(x${float_char}cblat1 omp pthread)
endif()
if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX") if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX")
target_link_libraries(x${float_char}cblat1 m) target_link_libraries(x${float_char}cblat1 m)
endif() endif()
@ -65,6 +69,10 @@ else()
constant.c) constant.c)
endif() endif()
target_link_libraries(x${float_char}cblat2 ${OpenBLAS_LIBNAME}) target_link_libraries(x${float_char}cblat2 ${OpenBLAS_LIBNAME})
# With OpenMP enabled, a GNU Fortran compiler and a Clang C compiler: strip
# -fopenmp from the Fortran flags and link the test against omp and pthread.
# The compiler IDs are compared by variable name rather than by ${...}
# expansion, so an unset Fortran compiler ID cannot leave STREQUAL without a
# left-hand operand and abort the configure step.
if (USE_OPENMP AND (CMAKE_Fortran_COMPILER_ID STREQUAL "GNU") AND (CMAKE_C_COMPILER_ID STREQUAL "Clang"))
string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}")
target_link_libraries(x${float_char}cblat2 omp pthread)
endif()
if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX") if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX")
target_link_libraries(x${float_char}cblat2 m) target_link_libraries(x${float_char}cblat2 m)
endif() endif()
@ -90,6 +98,10 @@ else()
constant.c) constant.c)
endif() endif()
target_link_libraries(x${float_char}cblat3 ${OpenBLAS_LIBNAME}) target_link_libraries(x${float_char}cblat3 ${OpenBLAS_LIBNAME})
# With OpenMP enabled, a GNU Fortran compiler and a Clang C compiler: strip
# -fopenmp from the Fortran flags and link the test against omp and pthread.
# The compiler IDs are compared by variable name rather than by ${...}
# expansion, so an unset Fortran compiler ID cannot leave STREQUAL without a
# left-hand operand and abort the configure step.
if (USE_OPENMP AND (CMAKE_Fortran_COMPILER_ID STREQUAL "GNU") AND (CMAKE_C_COMPILER_ID STREQUAL "Clang"))
string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}")
target_link_libraries(x${float_char}cblat3 omp pthread)
endif()
if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX") if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD" OR ${CMAKE_SYSTEM_NAME} MATCHES "QNX")
target_link_libraries(x${float_char}cblat3 m) target_link_libraries(x${float_char}cblat3 m)
endif() endif()

View File

@ -48,6 +48,12 @@
#endif #endif
#endif #endif
/* MT_TRACE(...): printf-style trace helper. When SMP_DEBUG is defined the */
/* arguments are passed to fprintf(stderr, ...); otherwise the macro expands */
/* to nothing, so its arguments are not evaluated. */
#ifdef SMP_DEBUG
# define MT_TRACE(...) fprintf(stderr, __VA_ARGS__)
#else
# define MT_TRACE(...)
#endif
/* This is a thread implementation for Win32 lazy implementation */ /* This is a thread implementation for Win32 lazy implementation */
/* Thread server common information */ /* Thread server common information */
@ -68,19 +74,12 @@ static HANDLE blas_threads [MAX_CPU_NUMBER];
static DWORD blas_threads_id[MAX_CPU_NUMBER]; static DWORD blas_threads_id[MAX_CPU_NUMBER];
static volatile int thread_target; // target num of live threads, volatile for cross-thread reads static volatile int thread_target; // target num of live threads, volatile for cross-thread reads
#if defined (__GNUC__) && (__GNUC__ < 6) //
#define WIN_CAS(dest, exch, comp) __sync_val_compare_and_swap(dest, comp, exch) // Legacy code path
#else //
#if defined(_WIN64) static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb) {
#define WIN_CAS(dest, exch, comp) InterlockedCompareExchange64(dest, exch, comp)
#else
#define WIN_CAS(dest, exch, comp) InterlockedCompareExchange(dest, exch, comp)
#endif
#endif
static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ if (!(mode & BLAS_COMPLEX)) {
if (!(mode & BLAS_COMPLEX)){
#ifdef EXPRECISION #ifdef EXPRECISION
if ((mode & BLAS_PREC) == BLAS_XDOUBLE){ if ((mode & BLAS_PREC) == BLAS_XDOUBLE){
/* REAL / Extended Double */ /* REAL / Extended Double */
@ -95,7 +94,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
args -> c, args -> ldc, sb); args -> c, args -> ldc, sb);
} else } else
#endif #endif
if ((mode & BLAS_PREC) == BLAS_DOUBLE){ if ((mode & BLAS_PREC) == BLAS_DOUBLE) {
/* REAL / Double */ /* REAL / Double */
void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double,
double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG,
@ -106,7 +105,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
args -> a, args -> lda, args -> a, args -> lda,
args -> b, args -> ldb, args -> b, args -> ldb,
args -> c, args -> ldc, sb); args -> c, args -> ldc, sb);
} else if ((mode & BLAS_PREC) == BLAS_SINGLE){ } else if ((mode & BLAS_PREC) == BLAS_SINGLE) {
/* REAL / Single */ /* REAL / Single */
void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float,
float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG,
@ -118,7 +117,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
args -> b, args -> ldb, args -> b, args -> ldb,
args -> c, args -> ldc, sb); args -> c, args -> ldc, sb);
#ifdef BUILD_BFLOAT16 #ifdef BUILD_BFLOAT16
} else if ((mode & BLAS_PREC) == BLAS_BFLOAT16){ } else if ((mode & BLAS_PREC) == BLAS_BFLOAT16) {
/* REAL / BFLOAT16 */ /* REAL / BFLOAT16 */
void (*afunc)(BLASLONG, BLASLONG, BLASLONG, bfloat16, void (*afunc)(BLASLONG, BLASLONG, BLASLONG, bfloat16,
bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG,
@ -129,7 +128,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
args -> a, args -> lda, args -> a, args -> lda,
args -> b, args -> ldb, args -> b, args -> ldb,
args -> c, args -> ldc, sb); args -> c, args -> ldc, sb);
} else if ((mode & BLAS_PREC) == BLAS_STOBF16){ } else if ((mode & BLAS_PREC) == BLAS_STOBF16) {
/* REAL / BLAS_STOBF16 */ /* REAL / BLAS_STOBF16 */
void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float,
float *, BLASLONG, bfloat16 *, BLASLONG, float *, BLASLONG, bfloat16 *, BLASLONG,
@ -140,7 +139,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
args -> a, args -> lda, args -> a, args -> lda,
args -> b, args -> ldb, args -> b, args -> ldb,
args -> c, args -> ldc, sb); args -> c, args -> ldc, sb);
} else if ((mode & BLAS_PREC) == BLAS_DTOBF16){ } else if ((mode & BLAS_PREC) == BLAS_DTOBF16) {
/* REAL / BLAS_DTOBF16 */ /* REAL / BLAS_DTOBF16 */
void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double,
double *, BLASLONG, bfloat16 *, BLASLONG, double *, BLASLONG, bfloat16 *, BLASLONG,
@ -157,7 +156,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
} }
} else { } else {
#ifdef EXPRECISION #ifdef EXPRECISION
if ((mode & BLAS_PREC) == BLAS_XDOUBLE){ if ((mode & BLAS_PREC) == BLAS_XDOUBLE) {
/* COMPLEX / Extended Double */ /* COMPLEX / Extended Double */
void (*afunc)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, void (*afunc)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble,
xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG,
@ -171,7 +170,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
args -> c, args -> ldc, sb); args -> c, args -> ldc, sb);
} else } else
#endif #endif
if ((mode & BLAS_PREC) == BLAS_DOUBLE){ if ((mode & BLAS_PREC) == BLAS_DOUBLE) {
/* COMPLEX / Double */ /* COMPLEX / Double */
void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, double, void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, double,
double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG,
@ -201,10 +200,10 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
} }
} }
/* This is a main routine of threads. Each thread waits until job is */ //
/* queued. */ // This is a main routine of threads. Each thread waits until job is queued.
//
static DWORD WINAPI blas_thread_server(void *arg){ static DWORD WINAPI blas_thread_server(void *arg) {
/* Thread identifier */ /* Thread identifier */
BLASLONG cpu = (BLASLONG)arg; BLASLONG cpu = (BLASLONG)arg;
@ -215,31 +214,24 @@ static DWORD WINAPI blas_thread_server(void *arg){
/* Each server needs each buffer */ /* Each server needs each buffer */
buffer = blas_memory_alloc(2); buffer = blas_memory_alloc(2);
#ifdef SMP_DEBUG MT_TRACE("Server[%2ld] Thread is started!\n", cpu);
fprintf(STDERR, "Server[%2ld] Thread is started!\n", cpu);
#endif
while (1){ while (1) {
/* Waiting for Queue */ /* Waiting for Queue */
#ifdef SMP_DEBUG MT_TRACE("Server[%2ld] Waiting for Queue.\n", cpu);
fprintf(STDERR, "Server[%2ld] Waiting for Queue.\n", cpu);
#endif
// event raised when work is added to the queue
WaitForSingleObject(kickoff_event, INFINITE);
if (cpu > thread_target - 2) // event raised when work is added to the queue
{ WaitForSingleObject(kickoff_event, INFINITE);
//printf("thread [%d] exiting.\n", cpu);
break; // excess thread, so worker thread exits
}
#ifdef SMP_DEBUG if (cpu > thread_target - 2) {
fprintf(STDERR, "Server[%2ld] Got it.\n", cpu); //MT_TRACE("thread [%d] exiting.\n", cpu);
#endif break; // excess thread, so worker thread exits
}
MT_TRACE("Server[%2ld] Got it.\n", cpu);
#if 1
EnterCriticalSection(&queue_lock); EnterCriticalSection(&queue_lock);
queue = work_queue; queue = work_queue;
@ -247,53 +239,39 @@ static DWORD WINAPI blas_thread_server(void *arg){
work_queue = work_queue->next; work_queue = work_queue->next;
LeaveCriticalSection(&queue_lock); LeaveCriticalSection(&queue_lock);
#else
volatile blas_queue_t* queue_next;
INT_PTR prev_value; if (queue) {
do {
queue = (volatile blas_queue_t*)work_queue;
if (!queue)
break;
queue_next = (volatile blas_queue_t*)queue->next;
prev_value = WIN_CAS((INT_PTR*)&work_queue, (INT_PTR)queue_next, (INT_PTR)queue);
} while (prev_value != queue);
#endif
if (queue) {
int (*routine)(blas_arg_t *, void *, void *, void *, void *, BLASLONG) = queue -> routine; int (*routine)(blas_arg_t *, void *, void *, void *, void *, BLASLONG) = queue -> routine;
sa = queue -> sa; sa = queue -> sa;
sb = queue -> sb; sb = queue -> sb;
#ifdef CONSISTENT_FPCSR #ifdef CONSISTENT_FPCSR
__asm__ __volatile__ ("ldmxcsr %0" : : "m" (queue -> sse_mode)); __asm__ __volatile__ ("ldmxcsr %0" : : "m" (queue -> sse_mode));
__asm__ __volatile__ ("fldcw %0" : : "m" (queue -> x87_mode)); __asm__ __volatile__ ("fldcw %0" : : "m" (queue -> x87_mode));
#endif #endif
#ifdef SMP_DEBUG MT_TRACE("Server[%2ld] Started. Mode = 0x%03x M = %3ld N=%3ld K=%3ld\n",
fprintf(STDERR, "Server[%2ld] Started. Mode = 0x%03x M = %3ld N=%3ld K=%3ld\n",
cpu, queue->mode, queue-> args ->m, queue->args->n, queue->args->k); cpu, queue->mode, queue-> args ->m, queue->args->n, queue->args->k);
#endif
// fprintf(stderr, "queue start[%ld]!!!\n", cpu); // fprintf(stderr, "queue start[%ld]!!!\n", cpu);
#ifdef MONITOR #ifdef MONITOR
main_status[cpu] = MAIN_RUNNING1; main_status[cpu] = MAIN_RUNNING1;
#endif #endif
if (sa == NULL) sa = (void *)((BLASLONG)buffer + GEMM_OFFSET_A); if (sa == NULL)
sa = (void *)((BLASLONG)buffer + GEMM_OFFSET_A);
if (sb == NULL) { if (sb == NULL) {
if (!(queue -> mode & BLAS_COMPLEX)){ if (!(queue -> mode & BLAS_COMPLEX)) {
#ifdef EXPRECISION #ifdef EXPRECISION
if ((queue -> mode & BLAS_PREC) == BLAS_XDOUBLE){ if ((queue -> mode & BLAS_PREC) == BLAS_XDOUBLE) {
sb = (void *)(((BLASLONG)sa + ((XGEMM_P * XGEMM_Q * sizeof(xdouble) sb = (void *)(((BLASLONG)sa + ((XGEMM_P * XGEMM_Q * sizeof(xdouble)
+ GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
} else } else
#endif #endif
if ((queue -> mode & BLAS_PREC) == BLAS_DOUBLE){ if ((queue -> mode & BLAS_PREC) == BLAS_DOUBLE) {
#ifdef BUILD_DOUBLE #ifdef BUILD_DOUBLE
sb = (void *)(((BLASLONG)sa + ((DGEMM_P * DGEMM_Q * sizeof(double) sb = (void *)(((BLASLONG)sa + ((DGEMM_P * DGEMM_Q * sizeof(double)
+ GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
@ -327,65 +305,58 @@ static DWORD WINAPI blas_thread_server(void *arg){
/* Other types in future */ /* Other types in future */
} }
} }
queue->sb=sb; queue->sb=sb;
} }
#ifdef MONITOR #ifdef MONITOR
main_status[cpu] = MAIN_RUNNING2; main_status[cpu] = MAIN_RUNNING2;
#endif #endif
if (!(queue -> mode & BLAS_LEGACY)) { if (!(queue -> mode & BLAS_LEGACY)) {
(routine)(queue -> args, queue -> range_m, queue -> range_n, sa, sb, queue -> position);
(routine)(queue -> args, queue -> range_m, queue -> range_n, sa, sb, queue -> position);
} else { } else {
legacy_exec(routine, queue -> mode, queue -> args, sb); legacy_exec(routine, queue -> mode, queue -> args, sb);
} }
}else{ } else {
continue; //if queue == NULL continue; //if queue == NULL
} }
#ifdef SMP_DEBUG MT_TRACE("Server[%2ld] Finished!\n", cpu);
fprintf(STDERR, "Server[%2ld] Finished!\n", cpu);
#endif
queue->finished = 1;
queue->finished = 1;
} }
/* Shutdown procedure */ /* Shutdown procedure */
#ifdef SMP_DEBUG MT_TRACE("Server[%2ld] Shutdown!\n", cpu);
fprintf(STDERR, "Server[%2ld] Shutdown!\n", cpu);
#endif
blas_memory_free(buffer); blas_memory_free(buffer);
return 0; return 0;
} }
/* Initializing routine */ //
int blas_thread_init(void){ // Initializing routine
//
int blas_thread_init(void) {
BLASLONG i; BLASLONG i;
if (blas_server_avail || (blas_cpu_number <= 1)) return 0; if (blas_server_avail || (blas_cpu_number <= 1)) return 0;
LOCK_COMMAND(&server_lock); LOCK_COMMAND(&server_lock);
#ifdef SMP_DEBUG MT_TRACE("Initializing Thread(Num. threads = %d)\n", blas_cpu_number);
fprintf(STDERR, "Initializing Thread(Num. threads = %d)\n",
blas_cpu_number);
#endif
if (!blas_server_avail){ if (!blas_server_avail) {
// create the kickoff Event // create the kickoff Event
kickoff_event = CreateEvent(NULL, TRUE, FALSE, NULL); kickoff_event = CreateEvent(NULL, TRUE, FALSE, NULL);
thread_target = blas_cpu_number; thread_target = blas_cpu_number;
InitializeCriticalSection(&queue_lock); InitializeCriticalSection(&queue_lock);
for(i = 0; i < blas_cpu_number - 1; i++){ for(i = 0; i < blas_cpu_number - 1; i++) {
//printf("thread_init: creating thread [%d]\n", i); //MT_TRACE("thread_init: creating thread [%d]\n", i);
blas_threads[i] = CreateThread(NULL, 0, blas_threads[i] = CreateThread(NULL, 0,
blas_thread_server, (void *)i, blas_thread_server, (void *)i,
@ -400,15 +371,12 @@ int blas_thread_init(void){
return 0; return 0;
} }
/* //
User can call one of two routines. // User can call one of two routines.
// exec_blas_async ... immediately returns after jobs are queued.
exec_blas_async ... immediately returns after jobs are queued. // exec_blas ... returns after jobs are finished.
//
exec_blas ... returns after jobs are finished. int exec_blas_async(BLASLONG pos, blas_queue_t *queue) {
*/
int exec_blas_async(BLASLONG pos, blas_queue_t *queue){
#if defined(SMP_SERVER) #if defined(SMP_SERVER)
// Handle lazy re-init of the thread-pool after a POSIX fork // Handle lazy re-init of the thread-pool after a POSIX fork
@ -428,7 +396,7 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){
__asm__ __volatile__ ("stmxcsr %0" : "=m" (current -> sse_mode)); __asm__ __volatile__ ("stmxcsr %0" : "=m" (current -> sse_mode));
#endif #endif
current->finished = 0; current->finished = 0;
current = current -> next; current = current -> next;
pos ++; pos ++;
} }
@ -437,18 +405,18 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){
if (!work_queue) if (!work_queue)
{ {
work_queue = queue; work_queue = queue;
} }
else else
{ {
blas_queue_t *next_item = work_queue; blas_queue_t *next_item = work_queue;
// find the end of the work queue // find the end of the work queue
while (next_item) while (next_item)
next_item = next_item->next; next_item = next_item->next;
// add new work to the end // add new work to the end
next_item = queue; next_item = queue;
} }
LeaveCriticalSection(&queue_lock); LeaveCriticalSection(&queue_lock);
@ -458,26 +426,25 @@ int exec_blas_async(BLASLONG pos, blas_queue_t *queue){
return 0; return 0;
} }
int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue){ //
// Join. Wait for all queued tasks to complete
//
int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue) {
#ifdef SMP_DEBUG MT_TRACE("Synchronization Waiting.\n");
fprintf(STDERR, "Synchronization Waiting.\n");
#endif
while (num){ while (num) {
#ifdef SMP_DEBUG MT_TRACE("Waiting Queue ..\n");
fprintf(STDERR, "Waiting Queue ..\n");
#endif
while (!queue->finished)
YIELDING;
queue = queue->next; while (!queue->finished)
num--; YIELDING;
}
queue = queue->next;
num--;
}
MT_TRACE("Completely Done.\n\n");
#ifdef SMP_DEBUG
fprintf(STDERR, "Completely Done.\n\n");
#endif
// if work was added to the queue after this batch we can't sleep the worker threads // if work was added to the queue after this batch we can't sleep the worker threads
// by resetting the event // by resetting the event
EnterCriticalSection(&queue_lock); EnterCriticalSection(&queue_lock);
@ -490,8 +457,10 @@ int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue){
return 0; return 0;
} }
/* Execute Threads */ //
int exec_blas(BLASLONG num, blas_queue_t *queue){ // Execute Threads
//
int exec_blas(BLASLONG num, blas_queue_t *queue) {
#if defined(SMP_SERVER) && defined(OS_CYGWIN_NT) #if defined(SMP_SERVER) && defined(OS_CYGWIN_NT)
// Handle lazy re-init of the thread-pool after a POSIX fork // Handle lazy re-init of the thread-pool after a POSIX fork
@ -504,29 +473,33 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){
if ((num <= 0) || (queue == NULL)) return 0; if ((num <= 0) || (queue == NULL)) return 0;
if ((num > 1) && queue -> next) exec_blas_async(1, queue -> next); if ((num > 1) && queue -> next)
exec_blas_async(1, queue -> next);
routine = queue -> routine; routine = queue -> routine;
if (queue -> mode & BLAS_LEGACY) { if (queue -> mode & BLAS_LEGACY) {
legacy_exec(routine, queue -> mode, queue -> args, queue -> sb); legacy_exec(routine, queue -> mode, queue -> args, queue -> sb);
} else } else {
if (queue -> mode & BLAS_PTHREAD) { if (queue -> mode & BLAS_PTHREAD) {
void (*pthreadcompat)(void *) = queue -> routine; void (*pthreadcompat)(void *) = queue -> routine;
(pthreadcompat)(queue -> args); (pthreadcompat)(queue -> args);
} else } else
(routine)(queue -> args, queue -> range_m, queue -> range_n, (routine)(queue -> args, queue -> range_m, queue -> range_n,
queue -> sa, queue -> sb, 0); queue -> sa, queue -> sb, 0);
}
if ((num > 1) && queue -> next) exec_blas_async_wait(num - 1, queue -> next); if ((num > 1) && queue -> next)
exec_blas_async_wait(num - 1, queue -> next);
return 0; return 0;
} }
/* Shutdown procedure, but user don't have to call this routine. The */ //
/* kernel automatically kill threads. */ // Shutdown procedure, but user don't have to call this routine. The
// kernel automatically kill threads.
int BLASFUNC(blas_thread_shutdown)(void){ //
int BLASFUNC(blas_thread_shutdown)(void) {
int i; int i;
@ -534,9 +507,9 @@ int BLASFUNC(blas_thread_shutdown)(void){
LOCK_COMMAND(&server_lock); LOCK_COMMAND(&server_lock);
if (blas_server_avail){ if (blas_server_avail) {
for(i = 0; i < blas_num_threads - 1; i++){ for (i = 0; i < blas_num_threads - 1; i++) {
// Could also just use WaitForMultipleObjects // Could also just use WaitForMultipleObjects
DWORD wait_thread_value = WaitForSingleObject(blas_threads[i], 50); DWORD wait_thread_value = WaitForSingleObject(blas_threads[i], 50);
@ -558,6 +531,9 @@ int BLASFUNC(blas_thread_shutdown)(void){
return 0; return 0;
} }
//
// Legacy function to set number of threads
//
void goto_set_num_threads(int num_threads) void goto_set_num_threads(int num_threads)
{ {
long i; long i;
@ -571,7 +547,7 @@ void goto_set_num_threads(int num_threads)
if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER; if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER;
if (blas_server_avail && num_threads < blas_num_threads) { if (blas_server_avail && num_threads < blas_num_threads) {
LOCK_COMMAND(&server_lock); LOCK_COMMAND(&server_lock);
thread_target = num_threads; thread_target = num_threads;
@ -579,11 +555,11 @@ void goto_set_num_threads(int num_threads)
SetEvent(kickoff_event); SetEvent(kickoff_event);
for (i = num_threads - 1; i < blas_num_threads - 1; i++) { for (i = num_threads - 1; i < blas_num_threads - 1; i++) {
//printf("set_num_threads: waiting on thread [%d] to quit.\n", i); //MT_TRACE("set_num_threads: waiting on thread [%d] to quit.\n", i);
WaitForSingleObject(blas_threads[i], INFINITE); WaitForSingleObject(blas_threads[i], INFINITE);
//printf("set_num_threads: thread [%d] has quit.\n", i); //MT_TRACE("set_num_threads: thread [%d] has quit.\n", i);
CloseHandle(blas_threads[i]); CloseHandle(blas_threads[i]);
} }
@ -601,8 +577,8 @@ void goto_set_num_threads(int num_threads)
thread_target = num_threads; thread_target = num_threads;
//increased_threads = 1; //increased_threads = 1;
if (!blas_server_avail){ if (!blas_server_avail) {
// create the kickoff Event // create the kickoff Event
kickoff_event = CreateEvent(NULL, TRUE, FALSE, NULL); kickoff_event = CreateEvent(NULL, TRUE, FALSE, NULL);
@ -611,8 +587,8 @@ void goto_set_num_threads(int num_threads)
blas_server_avail = 1; blas_server_avail = 1;
} }
for(i = (blas_num_threads > 0) ? blas_num_threads - 1 : 0; i < num_threads - 1; i++){ for (i = (blas_num_threads > 0) ? blas_num_threads - 1 : 0; i < num_threads - 1; i++) {
//printf("set_num_threads: creating thread [%d]\n", i); //MT_TRACE("set_num_threads: creating thread [%d]\n", i);
blas_threads[i] = CreateThread(NULL, 0, blas_threads[i] = CreateThread(NULL, 0,
blas_thread_server, (void *)i, blas_thread_server, (void *)i,
@ -627,6 +603,9 @@ void goto_set_num_threads(int num_threads)
blas_cpu_number = num_threads; blas_cpu_number = num_threads;
} }
//
// Openblas function to set thread count
//
void openblas_set_num_threads(int num) void openblas_set_num_threads(int num)
{ {
goto_set_num_threads(num); goto_set_num_threads(num);

View File

@ -73,6 +73,10 @@ endif
endif endif
endif endif
ifeq ($(F_COMPILER)$(OSNAME), IBMAIX)
EXTRALIB += -lxlf90
endif
ifeq ($(C_COMPILER), PGI) ifeq ($(C_COMPILER), PGI)
EXTRALIB += -pgf90libs EXTRALIB += -pgf90libs
endif endif
@ -256,6 +260,20 @@ endif
ifeq ($(OSNAME), AIX) ifeq ($(OSNAME), AIX)
so : ../$(LIBSONAME) linktest.c
$(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(EXTRALIB) && echo OK.
rm -f linktest
../$(LIBSONAME) : aix.exp
$(CC) $(CFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \
-Wl,-bE:aix.exp -Wl,-bbigtoc ../$(LIBNAME) $(EXTRALIB)
aix.exp :
/usr/bin/nm -X32_64 -PCpgl ../$(LIBNAME) | /usr/bin/awk '{ if ((($$ 2 == "T") \
|| ($$ 2 == "D") || ($$ 2 == "B") || ($$ 2 == "W") || ($$ 2 == "V") || ($$ 2 == "Z")) && (substr($$ 1,1,1) != ".")) \
{ if (($$ 2 == "W") || ($$ 2 == "V") || ($$ 2 == "Z")) { print $$ 1 " weak" } else { print $$ 1 } } }' | \
/usr/bin/sort -u > aix.exp
ifeq ($(COMPILER_F77), xlf) ifeq ($(COMPILER_F77), xlf)
goto32.$(SUFFIX) : ../$(LIBNAME) aix.def goto32.$(SUFFIX) : ../$(LIBNAME) aix.def

View File

@ -90,7 +90,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <sys/sysinfo.h> #include <sys/sysinfo.h>
#include <unistd.h> #include <unistd.h>
#endif #endif
#if defined(AIX) #if defined(_AIX)
#include <unistd.h>
#include <sys/systemcfg.h>
#include <sys/sysinfo.h> #include <sys/sysinfo.h>
#endif #endif
@ -1870,11 +1872,13 @@ static int get_num_cores(void) {
return count; return count;
#elif defined(AIX) #elif defined(_AIX)
//returns the number of processors which are currently online //returns the number of processors which are currently online
count = sysconf(_SC_NPROCESSORS_ONLN); count = sysconf(_SC_NPROCESSORS_ONLN);
if (count <= 0) count = 2; if (count <= 0) count = 2;
return count;
#else #else
return 2; return 2;
#endif #endif

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,587 @@
/*******************************************************************************
Copyright (c) 2024, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#include <stdio.h>
#include "common.h"
/*
 * Copy an m-row by n-column block of `a` into `b`, negating every value.
 *
 * Each column occupies two consecutive FLOATs and consecutive rows are
 * `2 * lda` FLOATs apart (presumably interleaved real/imaginary parts of
 * complex data -- TODO confirm against the callers).
 *
 * Columns are consumed in panels: as many 16-column panels as fit
 * (n >> 4), then at most one panel each of 8, 4, 2 and 1 columns, chosen by
 * the corresponding bit of n.  Inside a panel the rows are walked two at a
 * time; both rows of a pair are read in full before anything is stored.  A
 * trailing odd row is handled on its own.  `b` is written strictly
 * sequentially.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){

  /* Panel widths in columns, widest first.  Entries 1..4 double as the bit
     mask tested against n to decide whether that panel exists. */
  static const BLASLONG panel_cols[5] = {16, 8, 4, 2, 1};

  FLOAT     staged[64];            /* two rows of the widest (32-FLOAT) panel */
  FLOAT    *src_panel  = a;        /* first row of the panel being packed     */
  FLOAT    *dst        = b;        /* next free slot in the packed output     */
  FLOAT    *row_lo, *row_hi;       /* the two source rows of the current pair */
  BLASLONG  row_stride = lda * 2;  /* distance between rows, in FLOATs        */
  BLASLONG  p, reps, width, pairs, k;

#if 0
  fprintf(stderr, "M = %d N = %d\n", m, n);
#endif

  for (p = 0; p < 5; p++) {

    /* Two FLOATs per column. */
    width = 2 * panel_cols[p];

    /* Only the widest panel can occur more than once. */
    if (p == 0) {
      reps = (n >> 4);
    } else {
      reps = (n & panel_cols[p]) ? 1 : 0;
    }

    while (reps > 0) {

      row_lo     = src_panel;
      row_hi     = src_panel + row_stride;
      src_panel += width;

      /* Full row pairs: stage both rows, then emit them negated. */
      for (pairs = (m >> 1); pairs > 0; pairs--) {

        for (k = 0; k < width; k++) {
          staged[k] = *(row_lo + k);
        }
        for (k = 0; k < width; k++) {
          staged[width + k] = *(row_hi + k);
        }

        for (k = 0; k < 2 * width; k++) {
          *(dst + k) = -staged[k];
        }

        row_lo += 2 * row_stride;
        row_hi += 2 * row_stride;
        dst    += 2 * width;
      }

      /* Left-over single row when m is odd. */
      if (m & 1) {

        for (k = 0; k < width; k++) {
          staged[k] = *(row_lo + k);
        }
        for (k = 0; k < width; k++) {
          *(dst + k) = -staged[k];
        }

        dst += width;
      }

      reps--;
    }
  }

  return 0;
}

View File

@ -0,0 +1,333 @@
/*******************************************************************************
Copyright (c) 2024, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#include <stdio.h>
#include "common.h"
/*
 * Pack one panel of `width` adjacent source columns (1 <= width <= 16), `m`
 * rows deep, into `b` and return the advanced output pointer.
 *
 * Every element is two consecutive FLOATs (presumably the real and imaginary
 * parts of a complex value); `stride` is the leading dimension already
 * expressed in FLOATs.
 *
 * diag = posX - posY tracks where the current row sits relative to the
 * diagonal.  For panel column c:
 *   - while diag > -c the element is read at a + (posX + c) * 2 + posY * stride
 *     and the read pointer moves by `stride` per row;
 *   - otherwise it is read at the mirrored position
 *     a + posY * 2 + (posX + c) * stride and the pointer moves by 2 per row.
 * diag drops by one after each row, so a column switches from the first mode
 * to the second once the diagonal has been crossed.
 *
 * `width` is a literal at every call site, which leaves the compiler free to
 * specialise the inner loops.
 */
static inline FLOAT *pack_panel_stride_first(BLASLONG m, BLASLONG width, FLOAT *a, BLASLONG stride,
                                             BLASLONG posX, BLASLONG posY, FLOAT *b){
  FLOAT *src[16];     /* one read cursor per panel column          */
  FLOAT stage[32];    /* one packed row: width * 2 FLOATs          */
  BLASLONG diag = posX - posY;
  BLASLONG c, rows;

  for (c = 0; c < width; c++) {
    if (diag > -c) src[c] = a + (posX + c) * 2 + posY * stride;
    else           src[c] = a + posY * 2 + (posX + c) * stride;
  }

  for (rows = m; rows > 0; rows--) {
    /* Load the whole row first, then move the cursors, then store. */
    for (c = 0; c < width; c++) {
      stage[2 * c + 0] = *(src[c] + 0);
      stage[2 * c + 1] = *(src[c] + 1);
    }

    for (c = 0; c < width; c++) {
      if (diag > -c) src[c] += stride; else src[c] += 2;
    }

    for (c = 0; c < width; c++) {
      b[2 * c + 0] = stage[2 * c + 0];
      b[2 * c + 1] = stage[2 * c + 1];
    }

    b += 2 * width;
    diag--;
  }

  return b;
}

/*
 * Pack an m x n block, taken at (posX, posY) from a matrix whose elements are
 * mirrored across the diagonal, into the contiguous buffer `b`.
 *
 *   m, n        rows per panel / total number of columns to pack
 *   a, lda      source matrix and its leading dimension in elements
 *   posX, posY  position of the block inside the matrix
 *   b           destination; receives m * n elements (2 FLOATs each)
 *
 * Columns are consumed in panels of 16, followed by at most one panel each of
 * 8, 4, 2 and 1 columns according to the low bits of n.  Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
  BLASLONG panels;

  lda *= 2;   /* two FLOATs per element */

  for (panels = (n >> 4); panels > 0; panels--) {
    b = pack_panel_stride_first(m, 16, a, lda, posX, posY, b);
    posX += 16;
  }

  if (n & 8) {
    b = pack_panel_stride_first(m, 8, a, lda, posX, posY, b);
    posX += 8;
  }

  if (n & 4) {
    b = pack_panel_stride_first(m, 4, a, lda, posX, posY, b);
    posX += 4;
  }

  if (n & 2) {
    b = pack_panel_stride_first(m, 2, a, lda, posX, posY, b);
    posX += 2;
  }

  if (n & 1) {
    b = pack_panel_stride_first(m, 1, a, lda, posX, posY, b);
  }

  return 0;
}

View File

@ -0,0 +1,332 @@
/*******************************************************************************
Copyright (c) 2024, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#include <stdio.h>
#include "common.h"
/*
 * Pack one panel of `width` adjacent source columns (1 <= width <= 16), `m`
 * rows deep, into `b` and return the advanced output pointer.
 *
 * Every element is two consecutive FLOATs (presumably the real and imaginary
 * parts of a complex value); `stride` is the leading dimension already
 * expressed in FLOATs.
 *
 * diag = posX - posY tracks where the current row sits relative to the
 * diagonal.  For panel column c:
 *   - while diag > -c the element is read at a + posY * 2 + (posX + c) * stride
 *     and the read pointer moves by 2 per row;
 *   - otherwise it is read at the mirrored position
 *     a + (posX + c) * 2 + posY * stride and the pointer moves by `stride`
 *     per row.
 * diag drops by one after each row, so a column switches from the first mode
 * to the second once the diagonal has been crossed.
 *
 * `width` is a literal at every call site, which leaves the compiler free to
 * specialise the inner loops.
 */
static inline FLOAT *pack_panel_unit_first(BLASLONG m, BLASLONG width, FLOAT *a, BLASLONG stride,
                                           BLASLONG posX, BLASLONG posY, FLOAT *b){
  FLOAT *src[16];     /* one read cursor per panel column          */
  FLOAT stage[32];    /* one packed row: width * 2 FLOATs          */
  BLASLONG diag = posX - posY;
  BLASLONG c, rows;

  for (c = 0; c < width; c++) {
    if (diag > -c) src[c] = a + posY * 2 + (posX + c) * stride;
    else           src[c] = a + (posX + c) * 2 + posY * stride;
  }

  for (rows = m; rows > 0; rows--) {
    /* Load the whole row first, then move the cursors, then store. */
    for (c = 0; c < width; c++) {
      stage[2 * c + 0] = *(src[c] + 0);
      stage[2 * c + 1] = *(src[c] + 1);
    }

    for (c = 0; c < width; c++) {
      if (diag > -c) src[c] += 2; else src[c] += stride;
    }

    for (c = 0; c < width; c++) {
      b[2 * c + 0] = stage[2 * c + 0];
      b[2 * c + 1] = stage[2 * c + 1];
    }

    b += 2 * width;
    diag--;
  }

  return b;
}

/*
 * Pack an m x n block, taken at (posX, posY) from a matrix whose elements are
 * mirrored across the diagonal, into the contiguous buffer `b`.
 *
 *   m, n        rows per panel / total number of columns to pack
 *   a, lda      source matrix and its leading dimension in elements
 *   posX, posY  position of the block inside the matrix
 *   b           destination; receives m * n elements (2 FLOATs each)
 *
 * Columns are consumed in panels of 16, followed by at most one panel each of
 * 8, 4, 2 and 1 columns according to the low bits of n.  Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, FLOAT *b){
  BLASLONG panels;

  lda *= 2;   /* two FLOATs per element */

  for (panels = (n >> 4); panels > 0; panels--) {
    b = pack_panel_unit_first(m, 16, a, lda, posX, posY, b);
    posX += 16;
  }

  if (n & 8) {
    b = pack_panel_unit_first(m, 8, a, lda, posX, posY, b);
    posX += 8;
  }

  if (n & 4) {
    b = pack_panel_unit_first(m, 4, a, lda, posX, posY, b);
    posX += 4;
  }

  if (n & 2) {
    b = pack_panel_unit_first(m, 2, a, lda, posX, posY, b);
    posX += 2;
  }

  if (n & 1) {
    b = pack_panel_unit_first(m, 1, a, lda, posX, posY, b);
  }

  return 0;
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,308 @@
/*******************************************************************************
Copyright (c) 2024, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#include <stdio.h>
#include "common.h"
/*
 * Pack one panel of `width` columns (columns are `stride` FLOATs apart, rows
 * are 2 FLOATs apart), `m` rows deep, into `b`; returns the advanced `b`.
 *
 * `jj` is the row index at which this panel's diagonal block begins.  With
 * d = row - jj:
 *   - 0 <= d < width : the row crosses the diagonal block.  Columns 0..d-1 are
 *                      copied and column d is written through compinv();
 *                      columns past d are left untouched in b.
 *   - d >= width     : the row is past the diagonal block; all `width`
 *                      columns are copied.
 *   - d < 0          : nothing is written, but b still advances.
 *
 * `width` is a literal at every call site, which leaves the compiler free to
 * specialise the inner loops.
 */
static inline FLOAT *pack_diag_panel_strided(BLASLONG m, BLASLONG width, FLOAT *a, BLASLONG stride,
                                             BLASLONG jj, FLOAT *b){
  FLOAT *row = a;      /* column 0 of the current row; column c is row + c * stride */
  FLOAT data1, data2;
  BLASLONG r, c, d;

  for (r = 0; r < m; r++) {
    d = r - jj;

    if ((r >= jj) && (d < width)) {
      for (c = 0; c < d; c++) {
        *(b + c * 2 + 0) = *(row + c * stride + 0);
        *(b + c * 2 + 1) = *(row + c * stride + 1);
      }

      data1 = *(row + d * stride + 0);
      data2 = *(row + d * stride + 1);
      compinv(b + d * 2, data1, data2);
    }

    if (d >= width) {
      for (c = 0; c < width; c++) {
        *(b + c * 2 + 0) = *(row + c * stride + 0);
        *(b + c * 2 + 1) = *(row + c * stride + 1);
      }
    }

    row += 2;
    b   += 2 * width;
  }

  return b;
}

/*
 * Pack an m x n triangular operand into `b`, panel by panel, passing each
 * diagonal element through compinv().
 *
 *   m, n    rows / columns to pack
 *   a, lda  source matrix and its leading dimension in elements
 *   offset  row index at which the first panel's diagonal block begins
 *   b       destination buffer (m * n elements of 2 FLOATs each are stepped over)
 *
 * Columns are consumed in panels of 16, followed by at most one panel each of
 * 8, 4, 2 and 1 columns according to the low bits of n.  Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){
  BLASLONG jj = offset;
  BLASLONG blocks;

  lda *= 2;   /* two FLOATs per element */

  for (blocks = (n >> 4); blocks > 0; blocks--) {
    b = pack_diag_panel_strided(m, 16, a, lda, jj, b);
    a  += 16 * lda;
    jj += 16;
  }

  if (n & 8) {
    b = pack_diag_panel_strided(m, 8, a, lda, jj, b);
    a  += 8 * lda;
    jj += 8;
  }

  if (n & 4) {
    b = pack_diag_panel_strided(m, 4, a, lda, jj, b);
    a  += 4 * lda;
    jj += 4;
  }

  if (n & 2) {
    b = pack_diag_panel_strided(m, 2, a, lda, jj, b);
    a  += 2 * lda;
    jj += 2;
  }

  if (n & 1) {
    b = pack_diag_panel_strided(m, 1, a, lda, jj, b);
  }

  return 0;
}

View File

@ -0,0 +1,264 @@
/*******************************************************************************
Copyright (c) 2024, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#include <stdio.h>
#include "common.h"
/*
 * Pack one panel of `width` columns (columns are 2 FLOATs apart, rows are
 * `stride` FLOATs apart), `m` rows deep, into `b`; returns the advanced `b`.
 *
 * `jj` is the row index at which this panel's diagonal block begins.  With
 * d = row - jj:
 *   - 0 <= d < width : the row crosses the diagonal block.  Column d is
 *                      written through compinv() and columns d+1..width-1 are
 *                      copied; columns before d are left untouched in b.
 *   - d < 0          : the row is before the diagonal block; all `width`
 *                      columns (2 * width FLOATs) are copied.
 *   - d >= width     : nothing is written, but b still advances.
 *
 * `width` is a literal at every call site, which leaves the compiler free to
 * specialise the inner loops.
 */
static inline FLOAT *pack_diag_panel_contig(BLASLONG m, BLASLONG width, FLOAT *a, BLASLONG stride,
                                            BLASLONG jj, FLOAT *b){
  FLOAT *row = a;      /* first FLOAT of the current row inside this panel */
  FLOAT data1, data2;
  BLASLONG r, c, d;

  for (r = 0; r < m; r++) {
    d = r - jj;

    if ((r >= jj) && (d < width)) {
      data1 = *(row + d * 2 + 0);
      data2 = *(row + d * 2 + 1);
      compinv(b + d * 2, data1, data2);

      for (c = d + 1; c < width; c++) {
        *(b + c * 2 + 0) = *(row + c * 2 + 0);
        *(b + c * 2 + 1) = *(row + c * 2 + 1);
      }
    }

    if (d < 0) {
      /* plain copy of the whole row of the panel, one FLOAT at a time */
      for (c = 0; c < 2 * width; c++) {
        *(b + c) = *(row + c);
      }
    }

    b   += 2 * width;
    row += stride;
  }

  return b;
}

/*
 * Pack an m x n triangular operand into `b`, panel by panel, passing each
 * diagonal element through compinv().
 *
 *   m, n    rows / columns to pack
 *   a, lda  source matrix and its leading dimension in elements
 *   offset  row index at which the first panel's diagonal block begins
 *   b       destination buffer (m * n elements of 2 FLOATs each are stepped over)
 *
 * Columns are consumed in panels of 16, followed by at most one panel each of
 * 8, 4, 2 and 1 columns according to the low bits of n.  Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){
  BLASLONG jj = offset;
  BLASLONG blocks;

  lda *= 2;   /* two FLOATs per element */

  for (blocks = (n >> 4); blocks > 0; blocks--) {
    b = pack_diag_panel_contig(m, 16, a, lda, jj, b);
    a  += 32;
    jj += 16;
  }

  if (n & 8) {
    b = pack_diag_panel_contig(m, 8, a, lda, jj, b);
    a  += 16;
    jj += 8;
  }

  if (n & 4) {
    b = pack_diag_panel_contig(m, 4, a, lda, jj, b);
    a  += 8;
    jj += 4;
  }

  if (n & 2) {
    b = pack_diag_panel_contig(m, 2, a, lda, jj, b);
    a  += 4;
    jj += 2;
  }

  if (n & 1) {
    b = pack_diag_panel_contig(m, 1, a, lda, jj, b);
  }

  return 0;
}

View File

@ -0,0 +1,313 @@
/*******************************************************************************
Copyright (c) 2024, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#include <stdio.h>
#include "common.h"
/*
 * Packs a triangular operand, read column-panel by column-panel, into the
 * contiguous buffer b.
 *
 * a is consumed in panels of 16, then 8, 4, 2 and 1 columns; consecutive
 * columns are lda complex elements apart and every element is a
 * (real, imag) pair of FLOATs.  For each of the m rows one packed row of
 * `width` complex values is reserved in b:
 *
 *   - rows before the panel's diagonal block (row < diag_base) are copied
 *     in full;
 *   - rows inside the diagonal block pass the diagonal entry through
 *     compinv() (defined outside this function; presumably stores its
 *     reciprocal - confirm) and copy only the entries to its right;
 *   - rows after the diagonal block are not written, but b still advances.
 *
 * offset is the row index at which the first panel's diagonal starts.
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){

  BLASLONG width, panels, row, col, diag, diag_base;
  FLOAT *src;
  FLOAT diag_re, diag_im;

  lda *= 2;                       /* stride in FLOATs: two per complex value */
  diag_base = offset;

  for (width = 16; width >= 1; width >>= 1) {

    /* The 16-wide panel repeats n / 16 times; narrower panels run at most
       once, selected by the matching bit of n. */
    if (width == 16) {
      panels = (n >> 4);
    } else {
      panels = (n & width) ? 1 : 0;
    }

    for (; panels > 0; panels--) {

      src = a;                    /* row 0 of the panel's first column */
      a  += width * lda;

      for (row = 0; row < m; row++) {

        diag = row - diag_base;   /* position of the diagonal within the panel */

        if ((diag >= 0) && (diag < width)) {
          diag_re = src[diag * lda + 0];
          diag_im = src[diag * lda + 1];
          compinv(b + diag * 2, diag_re, diag_im);

          for (col = diag + 1; col < width; col++) {
            b[col * 2 + 0] = src[col * lda + 0];
            b[col * 2 + 1] = src[col * lda + 1];
          }
        }

        if (diag < 0) {
          for (col = 0; col < width; col++) {
            b[col * 2 + 0] = src[col * lda + 0];
            b[col * 2 + 1] = src[col * lda + 1];
          }
        }

        src += 2;                 /* next row of the same columns */
        b   += width * 2;
      }

      diag_base += width;
    }
  }

  return 0;
}

View File

@ -0,0 +1,261 @@
/*******************************************************************************
Copyright (c) 2024, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#include <stdio.h>
#include "common.h"
/*
 * Packs a triangular operand, read row by row, into the contiguous
 * buffer b.
 *
 * a is consumed in panels of 16, then 8, 4, 2 and 1 complex values per row;
 * consecutive rows are lda complex elements apart and every element is a
 * (real, imag) pair of FLOATs.  For each of the m rows one packed row of
 * `width` complex values is reserved in b:
 *
 *   - rows inside the panel's diagonal block copy the entries to the left
 *     of the diagonal and pass the diagonal entry through compinv()
 *     (defined outside this function; presumably stores its reciprocal -
 *     confirm);
 *   - rows past the diagonal block (row - diag_base >= width) are copied
 *     in full;
 *   - rows before the diagonal block are not written, but b still advances.
 *
 * offset is the row index at which the first panel's diagonal starts.
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT *b){

  BLASLONG width, panels, row, k, diag, diag_base;
  FLOAT *src;
  FLOAT diag_re, diag_im;

  lda *= 2;                       /* stride in FLOATs: two per complex value */
  diag_base = offset;

  for (width = 16; width >= 1; width >>= 1) {

    /* The 16-wide panel repeats n / 16 times; narrower panels run at most
       once, selected by the matching bit of n. */
    if (width == 16) {
      panels = (n >> 4);
    } else {
      panels = (n & width) ? 1 : 0;
    }

    for (; panels > 0; panels--) {

      src = a;                    /* first element of the panel in row 0 */
      a  += width * 2;

      for (row = 0; row < m; row++) {

        diag = row - diag_base;   /* position of the diagonal within the panel */

        if ((diag >= 0) && (diag < width)) {
          for (k = 0; k < diag; k++) {
            b[k * 2 + 0] = src[k * 2 + 0];
            b[k * 2 + 1] = src[k * 2 + 1];
          }

          diag_re = src[diag * 2 + 0];
          diag_im = src[diag * 2 + 1];
          compinv(b + diag * 2, diag_re, diag_im);
        }

        if (diag >= width) {
          for (k = 0; k < width * 2; k++) {
            b[k] = src[k];
          }
        }

        b   += width * 2;
        src += lda;               /* next row of the same panel */
      }

      diag_base += width;
    }
  }

  return 0;
}

View File

@ -111,12 +111,19 @@ SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
SGEMVNKERNEL = sgemv_n_8_lasx.S SGEMVNKERNEL = sgemv_n_8_lasx.S
SGEMVTKERNEL = sgemv_t_8_lasx.S SGEMVTKERNEL = sgemv_t_8_lasx.S
CGEMMKERNEL = cgemm_kernel_2x2_lsx.S CGEMMKERNEL = cgemm_kernel_16x4_lasx.S
CGEMMONCOPY = cgemm_ncopy_2_lsx.S CGEMMINCOPY = cgemm_ncopy_16_lasx.S
CGEMMOTCOPY = cgemm_tcopy_2_lsx.S CGEMMITCOPY = cgemm_tcopy_16_lasx.S
CGEMMONCOPY = cgemm_ncopy_4_lasx.S
CGEMMOTCOPY = cgemm_tcopy_4_lasx.S
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
CGEMVNKERNEL = cgemv_n_8_lasx.S
CGEMVTKERNEL = cgemv_t_8_lasx.S
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
@ -132,6 +139,9 @@ ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
ZGEMVNKERNEL = zgemv_n_4_lasx.S
ZGEMVTKERNEL = zgemv_t_4_lasx.S
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,691 @@
/*******************************************************************************
Copyright (c) 2024, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#define ASSEMBLER
#include "common.h"
/* Function parameters */
#define M $r4 // param 1: m
#define N $r5 // param 2: n
#define SRC $r6 // param 3: src
#define LDA $r7 // param 4: lda
#define DST $r8 // param 5: dst
/* Loop counters: I counts row blocks, J counts column groups. */
#define I $r9
#define J $r10
/* S1..S16: one source pointer per column of the current column group. */
#define S1 $r12
#define S2 $r13
#define S3 $r14
#define S4 $r15
#define S5 $r16
#define S6 $r17
#define S7 $r18
#define S8 $r19
#define S9 $r20
/* $r23..$r31 are saved on entry and restored on exit by the routine. */
#define S10 $r23
#define S11 $r24
#define S12 $r25
#define S13 $r26
#define S14 $r27
#define S15 $r28
#define S16 $r29
/* TD: running destination pointer. TS: first column of the next group. */
#define TD $r30
#define TS $r31
/* TL shares $r7 with LDA: it is computed from LDA in place (LDA << 3). */
#define TL $r7
/* T0 shares $r6 with SRC: SRC must be copied to TS before T0 is written. */
#define T0 $r6
/* $r0, used as the constant zero in comparisons. */
#define ZERO $r0
/* Scalar FP temporaries used by the 8-byte-at-a-time remainder loops. */
#define F0 $f0
#define F1 $f1
#define F2 $f2
#define F3 $f3
#define F4 $f4
#define F5 $f5
#define F6 $f6
#define F7 $f7
/* LASX vectors */
/* U0..U15 receive the raw 32-byte loads, one per source column. */
#define U0 $xr0
#define U1 $xr1
#define U2 $xr2
#define U3 $xr3
#define U4 $xr4
#define U5 $xr5
#define U6 $xr6
#define U7 $xr7
#define U8 $xr8
#define U9 $xr9
#define U10 $xr10
#define U11 $xr11
#define U12 $xr12
#define U13 $xr13
#define U14 $xr14
#define U15 $xr15
/* D0..D15 hold the interleaved results that are stored to the destination. */
/* D8..D15 overlay $f24..$f31, which the routine saves and restores. */
#define D0 $xr16
#define D1 $xr17
#define D2 $xr18
#define D3 $xr19
#define D4 $xr20
#define D5 $xr21
#define D6 $xr22
#define D7 $xr23
#define D8 $xr24
#define D9 $xr25
#define D10 $xr26
#define D11 $xr27
#define D12 $xr28
#define D13 $xr29
#define D14 $xr30
#define D15 $xr31
/*
 * Packing ("ncopy") routine for 8-byte elements.
 *
 *   M   - number of rows to copy
 *   N   - number of source columns to copy
 *   SRC - source; consecutive columns are LDA elements (LDA * 8 bytes) apart
 *   DST - contiguous destination buffer
 *
 * The N columns are consumed in groups of 16, then 8, 4, 2 and finally 1.
 * Within a group of width W the output is written row by row: for every
 * row the W elements of that row (one per column) are stored back to back,
 * so a group produces M * W * 8 bytes.  Rows are handled 8 at a time
 * (4 at a time for W = 4, 2 at a time for W = 2) with vector transposes
 * and the remaining rows one at a time with 8-byte scalar moves.
 *
 * Elements are only moved, never interpreted, so each is treated as one
 * 64-bit lane (presumably one single-precision complex value - confirm
 * against the kernel selection in the build files).
 */
PROLOGUE

    addi.d     $sp,  $sp,  -0x90
    SDARG      $r23, $sp,  0x00
    SDARG      $r24, $sp,  0x08
    SDARG      $r25, $sp,  0x10
    SDARG      $r26, $sp,  0x18
    SDARG      $r27, $sp,  0x20
    SDARG      $r28, $sp,  0x28
    SDARG      $r29, $sp,  0x30
    SDARG      $r30, $sp,  0x38
    SDARG      $r31, $sp,  0x40
    // Save the FP registers with explicit 64-bit stores.  The width of the
    // ST/LD macros is chosen outside this file, while the vector code below
    // overwrites the whole registers and the slots are 8 bytes apart.
    fst.d      $f23, $sp,  0x48
    fst.d      $f24, $sp,  0x50
    fst.d      $f25, $sp,  0x58
    fst.d      $f26, $sp,  0x60
    fst.d      $f27, $sp,  0x68
    fst.d      $f28, $sp,  0x70
    fst.d      $f29, $sp,  0x78
    fst.d      $f30, $sp,  0x80
    fst.d      $f31, $sp,  0x88

    move       TD,   DST
    move       TS,   SRC            // must precede the write to T0 (same register as SRC)
    slli.d     TL,   LDA,  0x03     // TL = column stride in bytes (LDA * 8)
    slli.d     T0,   TL,   0x01     // T0 = two column strides
    srai.d     J,    N,    0x04     // J = number of 16-column groups
    beq        J,    ZERO, .L_N8

.L_J1: /* J-- */
    // S1..S16 = the 16 columns of this group, TS = first column of the next.
    move       S1,   TS
    add.d      S2,   TS,   TL
    srai.d     I,    M,    0x03     // I = number of 8-row blocks
    add.d      S3,   S2,   TL
    addi.d     J,    J,    -1
    add.d      S4,   S3,   TL
    add.d      S5,   S3,   T0
    add.d      S6,   S4,   T0
    add.d      S7,   S5,   T0
    add.d      S8,   S6,   T0
    add.d      S9,   S7,   T0
    add.d      S10,  S8,   T0
    add.d      S11,  S9,   T0
    add.d      S12,  S10,  T0
    add.d      S13,  S11,  T0
    add.d      S14,  S12,  T0
    add.d      S15,  S13,  T0
    add.d      S16,  S14,  T0
    add.d      TS,   S15,  T0
    beq        I,    ZERO, .L_I7

.L_I1: /* I-- */
    // Rows 0..3 of the block: 4 elements (32 bytes) from each column.
    xvld       U0,   S1,   0x00
    xvld       U1,   S2,   0x00
    xvld       U2,   S3,   0x00
    xvld       U3,   S4,   0x00
    xvld       U4,   S5,   0x00
    xvld       U5,   S6,   0x00
    xvld       U6,   S7,   0x00
    xvld       U7,   S8,   0x00
    xvld       U8,   S9,   0x00
    xvld       U9,   S10,  0x00
    xvld       U10,  S11,  0x00
    xvld       U11,  S12,  0x00
    xvld       U12,  S13,  0x00
    xvld       U13,  S14,  0x00
    xvld       U14,  S15,  0x00
    xvld       U15,  S16,  0x00

    // Interleave neighbouring columns: even rows land in D(2k), odd rows in D(2k+1).
    xvpackev.d D0,   U1,   U0
    xvpackod.d D1,   U1,   U0
    xvpackev.d D2,   U3,   U2
    xvpackod.d D3,   U3,   U2
    xvpackev.d D4,   U5,   U4
    xvpackod.d D5,   U5,   U4
    xvpackev.d D6,   U7,   U6
    xvpackod.d D7,   U7,   U6
    xvpackev.d D8,   U9,   U8
    xvpackod.d D9,   U9,   U8
    xvpackev.d D10,  U11,  U10
    xvpackod.d D11,  U11,  U10
    xvpackev.d D12,  U13,  U12
    xvpackod.d D13,  U13,  U12
    xvpackev.d D14,  U15,  U14
    xvpackod.d D15,  U15,  U14

    // Regroup 128-bit halves so every register holds four consecutive columns
    // of one row.  xvand.v X, Y, Y keeps a copy of Y before it is overwritten.
    // The trailing number is the index of the 32-byte output slot.
    xvand.v    U0,   D0,   D0
    xvpermi.q  D0,   D2,   0x02 // 0
    xvand.v    U4,   D4,   D4
    xvpermi.q  D4,   D6,   0x02 // 1
    xvand.v    U1,   D1,   D1
    xvpermi.q  D1,   D3,   0x02 // 4
    xvand.v    U5,   D5,   D5
    xvpermi.q  D5,   D7,   0x02 // 5
    xvpermi.q  D2,   U0,   0x31 // 8
    xvpermi.q  D6,   U4,   0x31 // 9
    xvpermi.q  D3,   U1,   0x31 // 12
    xvpermi.q  D7,   U5,   0x31 // 13
    xvand.v    U8,   D8,   D8
    xvpermi.q  D8,   D10,  0x02 // 2
    xvand.v    U12,  D12,  D12
    xvpermi.q  D12,  D14,  0x02 // 3
    xvand.v    U9,   D9,   D9
    xvpermi.q  D9,   D11,  0x02 // 6
    xvand.v    U13,  D13,  D13
    xvpermi.q  D13,  D15,  0x02 // 7
    xvpermi.q  D10,  U8,   0x31 // 10
    xvpermi.q  D14,  U12,  0x31 // 11
    xvpermi.q  D11,  U9,   0x31 // 14
    xvpermi.q  D15,  U13,  0x31 // 15

    xvst       D0,   TD,   0x00 // 0
    xvst       D4,   TD,   0x20 // 1
    xvst       D8,   TD,   0x40 // 2
    xvst       D12,  TD,   0x60 // 3
    xvst       D1,   TD,   0x80 // 4
    xvst       D5,   TD,   0xA0 // 5
    xvst       D9,   TD,   0xC0 // 6
    xvst       D13,  TD,   0xE0 // 7
    addi.d     TD,   TD,   0x100
    xvst       D2,   TD,   0x00 // 8
    xvst       D6,   TD,   0x20 // 9
    xvst       D10,  TD,   0x40 // 10
    xvst       D14,  TD,   0x60 // 11
    xvst       D3,   TD,   0x80 // 12
    xvst       D7,   TD,   0xA0 // 13
    xvst       D11,  TD,   0xC0 // 14
    xvst       D15,  TD,   0xE0 // 15
    addi.d     TD,   TD,   0x100

    // Rows 4..7 of the block: same sequence on the next 32 bytes of each column.
    xvld       U0,   S1,   0x20
    xvld       U1,   S2,   0x20
    xvld       U2,   S3,   0x20
    xvld       U3,   S4,   0x20
    xvld       U4,   S5,   0x20
    xvld       U5,   S6,   0x20
    xvld       U6,   S7,   0x20
    xvld       U7,   S8,   0x20
    xvld       U8,   S9,   0x20
    xvld       U9,   S10,  0x20
    xvld       U10,  S11,  0x20
    xvld       U11,  S12,  0x20
    xvld       U12,  S13,  0x20
    xvld       U13,  S14,  0x20
    xvld       U14,  S15,  0x20
    xvld       U15,  S16,  0x20

    xvpackev.d D0,   U1,   U0
    xvpackod.d D1,   U1,   U0
    xvpackev.d D2,   U3,   U2
    xvpackod.d D3,   U3,   U2
    xvpackev.d D4,   U5,   U4
    xvpackod.d D5,   U5,   U4
    xvpackev.d D6,   U7,   U6
    xvpackod.d D7,   U7,   U6
    xvpackev.d D8,   U9,   U8
    xvpackod.d D9,   U9,   U8
    xvpackev.d D10,  U11,  U10
    xvpackod.d D11,  U11,  U10
    xvpackev.d D12,  U13,  U12
    xvpackod.d D13,  U13,  U12
    xvpackev.d D14,  U15,  U14
    xvpackod.d D15,  U15,  U14

    xvand.v    U0,   D0,   D0
    xvpermi.q  D0,   D2,   0x02 // 0
    xvand.v    U4,   D4,   D4
    xvpermi.q  D4,   D6,   0x02 // 1
    xvand.v    U1,   D1,   D1
    xvpermi.q  D1,   D3,   0x02 // 4
    xvand.v    U5,   D5,   D5
    xvpermi.q  D5,   D7,   0x02 // 5
    xvpermi.q  D2,   U0,   0x31 // 8
    xvpermi.q  D6,   U4,   0x31 // 9
    xvpermi.q  D3,   U1,   0x31 // 12
    xvpermi.q  D7,   U5,   0x31 // 13
    xvand.v    U8,   D8,   D8
    xvpermi.q  D8,   D10,  0x02 // 2
    xvand.v    U12,  D12,  D12
    xvpermi.q  D12,  D14,  0x02 // 3
    xvand.v    U9,   D9,   D9
    xvpermi.q  D9,   D11,  0x02 // 6
    xvand.v    U13,  D13,  D13
    xvpermi.q  D13,  D15,  0x02 // 7
    xvpermi.q  D10,  U8,   0x31 // 10
    xvpermi.q  D14,  U12,  0x31 // 11
    xvpermi.q  D11,  U9,   0x31 // 14
    xvpermi.q  D15,  U13,  0x31 // 15

    xvst       D0,   TD,   0x00 // 0
    xvst       D4,   TD,   0x20 // 1
    xvst       D8,   TD,   0x40 // 2
    xvst       D12,  TD,   0x60 // 3
    xvst       D1,   TD,   0x80 // 4
    xvst       D5,   TD,   0xA0 // 5
    xvst       D9,   TD,   0xC0 // 6
    xvst       D13,  TD,   0xE0 // 7
    addi.d     TD,   TD,   0x100
    xvst       D2,   TD,   0x00 // 8
    xvst       D6,   TD,   0x20 // 9
    xvst       D10,  TD,   0x40 // 10
    xvst       D14,  TD,   0x60 // 11
    xvst       D3,   TD,   0x80 // 12
    xvst       D7,   TD,   0xA0 // 13
    xvst       D11,  TD,   0xC0 // 14
    xvst       D15,  TD,   0xE0 // 15
    addi.d     TD,   TD,   0x100

    // Advance every column pointer by 8 rows (0x40 bytes).
    addi.d     S1,   S1,   0x40
    addi.d     S2,   S2,   0x40
    addi.d     S3,   S3,   0x40
    addi.d     S4,   S4,   0x40
    addi.d     S5,   S5,   0x40
    addi.d     S6,   S6,   0x40
    addi.d     S7,   S7,   0x40
    addi.d     S8,   S8,   0x40
    addi.d     S9,   S9,   0x40
    addi.d     S10,  S10,  0x40
    addi.d     S11,  S11,  0x40
    addi.d     S12,  S12,  0x40
    addi.d     S13,  S13,  0x40
    addi.d     S14,  S14,  0x40
    addi.d     S15,  S15,  0x40
    addi.d     S16,  S16,  0x40
    addi.d     I,    I,    -1
    blt        ZERO, I,    .L_I1

.L_I7:
    // Remaining M & 7 rows of the 16-column group, one row per iteration.
    andi       I,    M,    0x07
    beq        I,    ZERO, .L_I0

.L_II1: /* I-- */
    fld.d      F0,   S1,   0x00
    fld.d      F1,   S2,   0x00
    fld.d      F2,   S3,   0x00
    fld.d      F3,   S4,   0x00
    fld.d      F4,   S5,   0x00
    fld.d      F5,   S6,   0x00
    fld.d      F6,   S7,   0x00
    fld.d      F7,   S8,   0x00
    fst.d      F0,   TD,   0x00
    addi.d     S1,   S1,   0x08
    fst.d      F1,   TD,   0x08
    addi.d     S2,   S2,   0x08
    fst.d      F2,   TD,   0x10
    addi.d     S3,   S3,   0x08
    fst.d      F3,   TD,   0x18
    addi.d     S4,   S4,   0x08
    fst.d      F4,   TD,   0x20
    addi.d     S5,   S5,   0x08
    fst.d      F5,   TD,   0x28
    addi.d     S6,   S6,   0x08
    fst.d      F6,   TD,   0x30
    addi.d     S7,   S7,   0x08
    fst.d      F7,   TD,   0x38
    addi.d     S8,   S8,   0x08
    addi.d     TD,   TD,   0x40
    fld.d      F0,   S9,   0x00
    fld.d      F1,   S10,  0x00
    fld.d      F2,   S11,  0x00
    fld.d      F3,   S12,  0x00
    fld.d      F4,   S13,  0x00
    fld.d      F5,   S14,  0x00
    fld.d      F6,   S15,  0x00
    fld.d      F7,   S16,  0x00
    fst.d      F0,   TD,   0x00
    addi.d     S9,   S9,   0x08
    fst.d      F1,   TD,   0x08
    addi.d     S10,  S10,  0x08
    fst.d      F2,   TD,   0x10
    addi.d     S11,  S11,  0x08
    fst.d      F3,   TD,   0x18
    addi.d     S12,  S12,  0x08
    fst.d      F4,   TD,   0x20
    addi.d     S13,  S13,  0x08
    fst.d      F5,   TD,   0x28
    addi.d     S14,  S14,  0x08
    fst.d      F6,   TD,   0x30
    addi.d     S15,  S15,  0x08
    fst.d      F7,   TD,   0x38
    addi.d     S16,  S16,  0x08
    addi.d     TD,   TD,   0x40
    addi.d     I,    I,    -1
    blt        ZERO, I,    .L_II1

.L_I0:
    blt        ZERO, J,    .L_J1

.L_N8:
    // One group of 8 columns when bit 3 of N is set.
    andi       J,    N,    0x08
    beq        ZERO, J,    .L_N4
    move       S1,   TS
    add.d      S2,   TS,   TL
    srai.d     I,    M,    0x03
    add.d      S3,   S2,   TL
    add.d      S4,   S2,   T0
    add.d      S5,   S3,   T0
    add.d      S6,   S4,   T0
    add.d      S7,   S5,   T0
    add.d      S8,   S6,   T0
    add.d      TS,   S7,   T0
    beq        I,    ZERO, .L_8I3

.L_8I1: /* I-- */
    // Rows 0..3 of the 8-row block.
    xvld       U0,   S1,   0x00
    xvld       U1,   S2,   0x00
    xvld       U2,   S3,   0x00
    xvld       U3,   S4,   0x00
    xvld       U4,   S5,   0x00
    xvld       U5,   S6,   0x00
    xvld       U6,   S7,   0x00
    xvld       U7,   S8,   0x00
    xvpackev.d D0,   U1,   U0
    xvpackod.d D1,   U1,   U0
    xvpackev.d D2,   U3,   U2
    xvpackod.d D3,   U3,   U2
    xvpackev.d D4,   U5,   U4
    xvpackod.d D5,   U5,   U4
    xvpackev.d D6,   U7,   U6
    xvpackod.d D7,   U7,   U6
    xvand.v    U0,   D0,   D0
    xvpermi.q  D0,   D2,   0x02 // 0
    xvand.v    U4,   D4,   D4
    xvpermi.q  D4,   D6,   0x02 // 1
    xvand.v    U1,   D1,   D1
    xvpermi.q  D1,   D3,   0x02 // 2
    xvand.v    U5,   D5,   D5
    xvpermi.q  D5,   D7,   0x02 // 3
    xvpermi.q  D2,   U0,   0x31 // 4
    xvpermi.q  D6,   U4,   0x31 // 5
    xvpermi.q  D3,   U1,   0x31 // 6
    xvpermi.q  D7,   U5,   0x31 // 7
    xvst       D0,   TD,   0x00
    xvst       D4,   TD,   0x20
    xvst       D1,   TD,   0x40
    xvst       D5,   TD,   0x60
    xvst       D2,   TD,   0x80
    xvst       D6,   TD,   0xA0
    xvst       D3,   TD,   0xC0
    xvst       D7,   TD,   0xE0
    addi.d     TD,   TD,   0x100

    // Rows 4..7 of the 8-row block.
    xvld       U0,   S1,   0x20
    xvld       U1,   S2,   0x20
    xvld       U2,   S3,   0x20
    xvld       U3,   S4,   0x20
    xvld       U4,   S5,   0x20
    xvld       U5,   S6,   0x20
    xvld       U6,   S7,   0x20
    xvld       U7,   S8,   0x20
    xvpackev.d D0,   U1,   U0
    xvpackod.d D1,   U1,   U0
    xvpackev.d D2,   U3,   U2
    xvpackod.d D3,   U3,   U2
    xvpackev.d D4,   U5,   U4
    xvpackod.d D5,   U5,   U4
    xvpackev.d D6,   U7,   U6
    xvpackod.d D7,   U7,   U6
    xvand.v    U0,   D0,   D0
    xvpermi.q  D0,   D2,   0x02 // 0
    xvand.v    U4,   D4,   D4
    xvpermi.q  D4,   D6,   0x02 // 1
    xvand.v    U1,   D1,   D1
    xvpermi.q  D1,   D3,   0x02 // 2
    xvand.v    U5,   D5,   D5
    xvpermi.q  D5,   D7,   0x02 // 3
    xvpermi.q  D2,   U0,   0x31 // 4
    xvpermi.q  D6,   U4,   0x31 // 5
    xvpermi.q  D3,   U1,   0x31 // 6
    xvpermi.q  D7,   U5,   0x31 // 7
    xvst       D0,   TD,   0x00
    xvst       D4,   TD,   0x20
    xvst       D1,   TD,   0x40
    xvst       D5,   TD,   0x60
    xvst       D2,   TD,   0x80
    xvst       D6,   TD,   0xA0
    xvst       D3,   TD,   0xC0
    xvst       D7,   TD,   0xE0
    addi.d     TD,   TD,   0x100

    addi.d     S1,   S1,   0x40
    addi.d     S2,   S2,   0x40
    addi.d     S3,   S3,   0x40
    addi.d     S4,   S4,   0x40
    addi.d     S5,   S5,   0x40
    addi.d     S6,   S6,   0x40
    addi.d     S7,   S7,   0x40
    addi.d     S8,   S8,   0x40
    addi.d     I,    I,    -1
    blt        ZERO, I,    .L_8I1

.L_8I3:
    // Remaining M & 7 rows of the 8-column group.
    andi       I,    M,    0x07
    beq        I,    ZERO, .L_N4

.L_8I11:
    fld.d      F0,   S1,   0x00
    fld.d      F1,   S2,   0x00
    fld.d      F2,   S3,   0x00
    fld.d      F3,   S4,   0x00
    fld.d      F4,   S5,   0x00
    fld.d      F5,   S6,   0x00
    fld.d      F6,   S7,   0x00
    fld.d      F7,   S8,   0x00
    fst.d      F0,   TD,   0x00
    addi.d     S1,   S1,   0x08
    fst.d      F1,   TD,   0x08
    addi.d     S2,   S2,   0x08
    fst.d      F2,   TD,   0x10
    addi.d     S3,   S3,   0x08
    fst.d      F3,   TD,   0x18
    addi.d     S4,   S4,   0x08
    fst.d      F4,   TD,   0x20
    addi.d     S5,   S5,   0x08
    fst.d      F5,   TD,   0x28
    addi.d     S6,   S6,   0x08
    fst.d      F6,   TD,   0x30
    addi.d     S7,   S7,   0x08
    fst.d      F7,   TD,   0x38
    addi.d     S8,   S8,   0x08
    addi.d     TD,   TD,   0x40
    addi.d     I,    I,    -1
    blt        ZERO, I,    .L_8I11

.L_N4:
    // One group of 4 columns when bit 2 of N is set.
    andi       J,    N,    0x04
    beq        ZERO, J,    .L_N2
    move       S1,   TS
    add.d      S2,   TS,   TL
    srai.d     I,    M,    0x02     // I = number of 4-row blocks
    add.d      S3,   S2,   TL
    add.d      S4,   S2,   T0
    add.d      TS,   S3,   T0
    beq        I,    ZERO, .L_I3

.L_4I1: /* I-- */
    xvld       U0,   S1,   0x00
    xvld       U1,   S2,   0x00
    xvld       U2,   S3,   0x00
    xvld       U3,   S4,   0x00
    xvpackev.d D0,   U1,   U0
    xvpackod.d D1,   U1,   U0
    xvpackev.d D2,   U3,   U2
    xvpackod.d D3,   U3,   U2
    xvand.v    U0,   D0,   D0
    xvpermi.q  D0,   D2,   0x02 // 0
    xvand.v    U1,   D1,   D1
    xvpermi.q  D1,   D3,   0x02 // 1
    xvpermi.q  D2,   U0,   0x31 // 2
    xvpermi.q  D3,   U1,   0x31 // 3
    xvst       D0,   TD,   0x00
    xvst       D1,   TD,   0x20
    xvst       D2,   TD,   0x40
    xvst       D3,   TD,   0x60
    addi.d     S1,   S1,   0x20
    addi.d     S2,   S2,   0x20
    addi.d     S3,   S3,   0x20
    addi.d     S4,   S4,   0x20
    addi.d     TD,   TD,   0x80
    addi.d     I,    I,    -1
    blt        ZERO, I,    .L_4I1

.L_I3:
    // Remaining M & 3 rows of the 4-column group.
    andi       I,    M,    0x03
    beq        I,    ZERO, .L_N2

.L_4II1:
    fld.d      F0,   S1,   0x00
    fld.d      F1,   S2,   0x00
    fld.d      F2,   S3,   0x00
    fld.d      F3,   S4,   0x00
    fst.d      F0,   TD,   0x00
    addi.d     S1,   S1,   0x08
    fst.d      F1,   TD,   0x08
    addi.d     S2,   S2,   0x08
    fst.d      F2,   TD,   0x10
    addi.d     S3,   S3,   0x08
    fst.d      F3,   TD,   0x18
    addi.d     S4,   S4,   0x08
    addi.d     TD,   TD,   0x20
    addi.d     I,    I,    -1
    blt        ZERO, I,    .L_4II1

.L_N2:
    // One group of 2 columns when bit 1 of N is set.
    andi       J,    N,    0x02
    beq        ZERO, J,    .L_N1
    move       S1,   TS
    add.d      S2,   TS,   TL
    srai.d     I,    M,    0x01     // I = number of 2-row blocks
    add.d      TS,   S2,   TL
    beq        I,    ZERO, .L_NI1

.L_2I1: /* I-- */
    // Each iteration consumes two rows (16 bytes) per column, so load exactly
    // 16 bytes with 128-bit LSX accesses; a 256-bit load would read 16 bytes
    // past the rows being packed.  $vr16/$vr17 are scratch (low halves of D0/D1).
    vld        $vr0,  S1,   0x00
    vld        $vr1,  S2,   0x00
    vpackev.d  $vr16, $vr1, $vr0    // row 0: column 0, column 1
    vpackod.d  $vr17, $vr1, $vr0    // row 1: column 0, column 1
    vst        $vr16, TD,   0x00
    vst        $vr17, TD,   0x10
    addi.d     S1,   S1,   0x10
    addi.d     S2,   S2,   0x10
    addi.d     TD,   TD,   0x20
    addi.d     I,    I,    -1
    blt        ZERO, I,    .L_2I1

.L_NI1:
    // Last row of the 2-column group when M is odd.
    andi       I,    M,    0x01
    beq        I,    ZERO, .L_N1
    fld.d      F0,   S1,   0x00
    fld.d      F1,   S2,   0x00
    fst.d      F0,   TD,   0x00
    addi.d     S1,   S1,   0x08
    fst.d      F1,   TD,   0x08
    addi.d     S2,   S2,   0x08
    addi.d     TD,   TD,   0x10

.L_N1:
    // Final single column: only present when N is odd.  Without this test an
    // even N would copy one column beyond the requested panel and write M
    // extra elements to the destination.
    andi       J,    N,    0x01
    beq        ZERO, J,    .L_N0
    move       S1,   TS
    beq        ZERO, M,    .L_N0

.L_M1:
    // Straight copy of M elements; M itself serves as the counter.
    fld.d      F0,   S1,   0x00
    addi.d     S1,   S1,   0x08
    fst.d      F0,   TD,   0x00
    addi.d     TD,   TD,   0x08
    addi.d     M,    M,    -1
    blt        ZERO, M,    .L_M1

.L_N0:
    LDARG      $r23, $sp,  0x00
    LDARG      $r24, $sp,  0x08
    LDARG      $r25, $sp,  0x10
    LDARG      $r26, $sp,  0x18
    LDARG      $r27, $sp,  0x20
    LDARG      $r28, $sp,  0x28
    LDARG      $r29, $sp,  0x30
    LDARG      $r30, $sp,  0x38
    LDARG      $r31, $sp,  0x40
    // Restore with explicit 64-bit loads, matching the saves in the prologue.
    fld.d      $f23, $sp,  0x48
    fld.d      $f24, $sp,  0x50
    fld.d      $f25, $sp,  0x58
    fld.d      $f26, $sp,  0x60
    fld.d      $f27, $sp,  0x68
    fld.d      $f28, $sp,  0x70
    fld.d      $f29, $sp,  0x78
    fld.d      $f30, $sp,  0x80
    fld.d      $f31, $sp,  0x88
    addi.d     $sp,  $sp,  0x90
    jirl       $r0,  $r1,  0x00

EPILOGUE

View File

@ -0,0 +1,325 @@
/*******************************************************************************
Copyright (c) 2024, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#define ASSEMBLER
#include "common.h"
/* Function parameters */
#define M $r4 // param 1: m
#define N $r5 // param 2: n
#define SRC $r6 // param 3: src
#define LDA $r7 // param 4: lda
#define DST $r8 // param 5: dst
/* Loop counters: I counts row blocks, J counts 4-column groups. */
#define I $r9
#define J $r10
/* Source column pointers; the routine below references only S1..S4. */
#define S1 $r12
#define S2 $r13
#define S3 $r14
#define S4 $r15
#define S5 $r16
#define S6 $r17
#define S7 $r18
/* TD: running destination pointer. TS: first column of the next group. */
#define TD $r20
#define TS $r11
/* TL: column stride in bytes. T0: byte size of one column group. */
#define TL $r19
/* $r23 is saved on entry and restored on exit by the routine. */
#define T0 $r23
/* $r0, used as the constant zero in comparisons. */
#define ZERO $r0
/* Scalar FP temporaries used for the single-row remainders. */
#define F0 $f0
#define F1 $f1
#define F2 $f2
#define F3 $f3
#define F4 $f4
#define F5 $f5
#define F6 $f6
#define F7 $f7
/* LASX vectors */
/* U*: loaded source data and merged results. D*: shuffle work registers. */
/* The routine below references only U0..U5 and D0..D3. */
#define U0 $xr0
#define U1 $xr1
#define U2 $xr2
#define U3 $xr3
#define U4 $xr4
#define U5 $xr5
#define U6 $xr6
#define U7 $xr7
#define D0 $xr8
#define D1 $xr9
#define D2 $xr10
#define D3 $xr11
#define D4 $xr12
#define D5 $xr13
#define D6 $xr14
#define D7 $xr15
#define D8 $xr16
/*
 * Packing ("ncopy") routine for 8-byte elements, each moved as a pair of
 * 32-bit values.
 *
 *   M   - number of rows to copy
 *   N   - number of source columns to copy
 *   SRC - source; consecutive columns are LDA elements (LDA * 8 bytes) apart
 *   DST - contiguous destination buffer
 *
 * Columns are consumed in groups of 4, then 2, then 1.  Within a group the
 * output is written row by row: for every row, that row's element from
 * each column of the group is stored back to back.  Rows are processed
 * 4 at a time, then 2 (if M & 2), then 1 (if M & 1).
 *
 * The numeric comments on the vector instructions label 32-bit lanes:
 * 1..8 are the lanes loaded from the first column, 9..16 the second, etc.
 */
PROLOGUE
// Reserve 8 bytes of stack and save $r23, which is used below as T0.
addi.d $sp, $sp, -8
SDARG $r23, $sp, 0
move TD, DST //boffset
move TS, SRC //aoffset
// TL = LDA << 3: byte distance between consecutive source columns.
slli.d TL, LDA, 0x02
slli.d TL, TL, 0x01
// J = N >> 2: number of full 4-column groups.
srai.d J, N, 0x02
beq J, ZERO, .L_N0
.L_J1: /* J-- */
// S1..S4 = the four columns of this group; TS advances to the next group.
move S1, TS
add.d S2, S1, TL
add.d S3, S2, TL
add.d S4, S3, TL
slli.d T0, TL, 0x02
add.d TS, TS, T0
// I = M >> 2: number of 4-row blocks.
srai.d I, M, 0x02
beq I, ZERO, .L_I3
.L_I1: /* I-- */
xvld U0, S1, 0x00 //1 2 3 4 5 6 7 8
xvld U1, S2, 0x00 //9 10 11 12 13 14 15 16
xvld U2, S3, 0x00 //17 18 19 20 21 22 23 24
xvld U3, S4, 0x00 //25 26 27 28 29 30 31 32
// xvand.v X, Y, Y copies Y; the shuffles below overwrite their destination.
xvand.v D0, U0, U0
xvand.v D1, U1, U1
xvand.v D2, U2, U2
xvand.v D3, U3, U3
// Pair up the same row from two neighbouring columns.
xvshuf4i.d D0, U1, 0x88 //1 2 9 10 5 6 13 14
xvshuf4i.d D2, U3, 0x88 //17 18 25 26 21 22 29 30
xvshuf4i.d D1, U0, 0x77 //3 4 11 12 7 8 15 16
xvshuf4i.d D3, U2, 0x77 //19 20 27 28 23 24 31 32
xvand.v U4, D0, D0
xvand.v U5, D1, D1
// Combine 128-bit halves so each register holds one row across all 4 columns.
xvpermi.q U4, D2, 0x02 //1 2 9 10 17 18 25 26
xvpermi.q U5, D3, 0x02 //3 4 11 12 19 20 27 28
xvpermi.q D2, D0, 0x31 //5 6 13 14 21 22 29 30
xvpermi.q D3, D1, 0x31 //7 8 15 16 23 24 31 32
// Store rows 0..3 of the block, 32 bytes each.
xvst U4, TD, 0x00
xvst U5, TD, 0x20
xvst D2, TD, 0x40
xvst D3, TD, 0x60
addi.d S1, S1, 0x20 // a_offset
addi.d S2, S2, 0x20
addi.d S3, S3, 0x20
addi.d S4, S4, 0x20
addi.d TD, TD, 0x80 // b_offset
addi.d I, I, -1
blt ZERO, I, .L_I1
.L_I3: /* if(m&2) */
andi I, M, 0x02
beq I, ZERO, .L_II20
// Two rows (16 bytes) from each of the four columns, using 128-bit registers.
vld $vr0, S1, 0x00
vld $vr1, S2, 0x00
vld $vr2, S3, 0x00
vld $vr3, S4, 0x00
// Copies of columns 1 and 3; vpermi.w then merges in columns 0 and 2.
vand.v $vr8, $vr1, $vr1
vand.v $vr9, $vr1, $vr1
vand.v $vr10, $vr3, $vr3
vand.v $vr11, $vr3, $vr3
// 0x44 selects the first row's element from both inputs, 0xee the second row's.
vpermi.w $vr8, $vr0, 0x44
vpermi.w $vr10, $vr2, 0x44
vpermi.w $vr9, $vr0, 0xee
vpermi.w $vr11, $vr2, 0xee
// First row (columns 0-1, then 2-3) followed by the second row.
vst $vr8, TD, 0x00
vst $vr10, TD, 0x10
vst $vr9, TD, 0x20
vst $vr11, TD, 0x30
addi.d S1, S1, 0x10
addi.d S2, S2, 0x10
addi.d S3, S3, 0x10
addi.d S4, S4, 0x10
addi.d TD, TD, 0x40
.L_II20: /* if(m&1) */
andi I, M, 0x01
beq I, ZERO, .L_J0
// Last row: one element (two 32-bit values) from each of the four columns.
fld.s F0, S1, 0x00
fld.s F1, S1, 0x04
fld.s F2, S2, 0x00
fld.s F3, S2, 0x04
fld.s F4, S3, 0x00
fld.s F5, S3, 0x04
fld.s F6, S4, 0x00
fld.s F7, S4, 0x04
fst.s F0, TD, 0x00
fst.s F1, TD, 0x04
fst.s F2, TD, 0x08
fst.s F3, TD, 0x0c
fst.s F4, TD, 0x10
fst.s F5, TD, 0x14
fst.s F6, TD, 0x18
fst.s F7, TD, 0x1c
addi.d TD, TD, 0x20
.L_J0:
// Next 4-column group, if any remain.
addi.d J, J, -1
blt ZERO, J, .L_J1
.L_N0: /* if(n&2) */
andi I, N, 0x02
beq ZERO, I, .L_N20
// Two-column group: S1, S2 are its columns; TS advances past them.
move S1, TS
add.d S2, S1, TL
slli.d T0, TL, 0x01
add.d TS, TS, T0
srai.d I, M, 0x02
beq ZERO, I, .L_N10
.L_N11: /* if(i>0) */
xvld U0, S1, 0x00 //1 2 3 4 5 6 7 8
xvld U1, S2, 0x00 //9 10 11 12 13 14 15 16
xvand.v D0, U0, U0
xvand.v D1, U1, U1
xvshuf4i.d D0, U1, 0x88 //1 2 9 10 5 6 13 14
xvshuf4i.d D1, U0, 0x77 //3 4 11 12 7 8 15 16
xvand.v U4, D0, D0
xvpermi.q U4, D1, 0x02 //1 2 9 10 3 4 11 12
xvpermi.q D1, D0, 0x31 //5 6 13 14 7 8 15 16
// Four rows of two columns: 64 bytes in total.
xvst U4, TD, 0x00
xvst D1, TD, 0x20
addi.d S1, S1, 0x20 // a_offset
addi.d S2, S2, 0x20
addi.d TD, TD, 0x40 // b_offset
addi.d I, I, -1
blt ZERO, I, .L_N11
.L_N10: /* if(m&2) */
andi I, M, 0x02
beq I, ZERO, .L_N130
vld $vr0, S1, 0x00
vld $vr1, S2, 0x00
// $vr8 becomes the first row of both columns, $vr1 the second row.
vand.v $vr8, $vr1, $vr1
vpermi.w $vr8, $vr0, 0x44
vpermi.w $vr1, $vr0, 0xee
vst $vr8, TD, 0x00
vst $vr1, TD, 0x10
addi.d S1, S1, 0x10 // a_offset
addi.d S2, S2, 0x10
addi.d TD, TD, 0x20 // b_offset
.L_N130: /* if(m&1) */
andi I, M, 0x01
beq I, ZERO, .L_N20
fld.s F0, S1, 0x00
fld.s F1, S1, 0x04
fld.s F2, S2, 0x00
fld.s F3, S2, 0x04
fst.s F0, TD, 0x00
fst.s F1, TD, 0x04
fst.s F2, TD, 0x08
fst.s F3, TD, 0x0c
addi.d TD, TD, 0x10
.L_N20: /* if(n&1) */
andi I, N, 0x01
beq I, ZERO, .L_N00
// Single remaining column: a straight copy of M elements.
move S1, TS
srai.d I, M, 0x02
beq I, ZERO, .L_N30
.L_N21: /* if(i>0) */
xvld U0, S1, 0x00
xvst U0, TD, 0x00
addi.d S1, S1, 0x20 // aoffset1
addi.d TD, TD, 0x20 // b_offset
addi.d I, I, -1
blt ZERO, I, .L_N21
.L_N30: /* if(m&2) */
andi I, M, 0x02
beq I, ZERO, .L_N330
vld $vr0, S1, 0x00
vst $vr0, TD, 0x00
addi.d S1, S1, 0x10 // aoffset1
addi.d TD, TD, 0x10 // b_offset
.L_N330: /* if(m&1) */
andi I, M, 0x01
beq I, ZERO, .L_N00
fld.s F0, S1, 0x00
fld.s F1, S1, 0x04
fst.s F0, TD, 0x00
fst.s F1, TD, 0x04
.L_N00:
// Restore $r23, release the stack slot and return to the caller.
LDARG $r23, $sp, 0
addi.d $sp, $sp, 8
jirl $r0, $r1, 0x00
EPILOGUE

View File

@ -0,0 +1,741 @@
/*******************************************************************************
Copyright (c) 2024, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#define ASSEMBLER
#include "common.h"
/* Function parameters */
#define M $r4 // param 1: m
#define N $r5 // param 2: n
#define SRC $r6 // param 3: src
#define LDA $r7 // param 4: lda
#define DST $r8 // param 5: dst
/* Loop counters: J counts row groups (derived from M); I counts column
 * chunks (derived from N) and also holds the result of the bit tests. */
#define I $r9
#define J $r10
/* Source pointers: S0 is where the next group of rows starts, S1..S8 walk
 * the (up to eight) rows currently being copied. */
#define S0 $r11
#define S1 $r12
#define S2 $r13
#define S3 $r14
#define S4 $r15
#define S5 $r16
#define S6 $r17
#define S7 $r18
#define S8 $r19
/* Destination pointers:
 *   P0 - start of the next row group inside the 16-wide region
 *   P1 - current store position inside the 16-wide region
 *   P2 - region written by the n&8 tail, P3 - n&4, P4 - n&2, P5 - n&1
 * P1..P5 and T0/T1 live in $r23..$r29, which the prologue saves and the
 * epilogue restores. */
#define P0 $r20
#define P1 $r23
#define P2 $r24
#define P3 $r25
#define P4 $r26
#define P5 $r27
/* Scratch during setup; afterwards T0 = 2 * TL and T1 = M << 7. */
#define T0 $r28
#define T1 $r29
/* TL shares $r7 with LDA: the prologue overwrites lda with lda << 3. */
#define TL $r7
#define ZERO $r0
/* Scalar FP registers used by the narrowest tails. */
#define F0 $f0
#define F1 $f1
#define F2 $f2
#define F3 $f3
#define F4 $f4
#define F5 $f5
#define F6 $f6
#define F7 $f7
#define F8 $f8
#define F9 $f9
#define F10 $f10
#define F11 $f11
#define F12 $f12
#define F13 $f13
#define F14 $f14
#define F15 $f15
/* LASX vectors */
#define U0 $xr0
#define U1 $xr1
#define U2 $xr2
#define U3 $xr3
#define U4 $xr4
#define U5 $xr5
#define U6 $xr6
#define U7 $xr7
PROLOGUE
// Reserve 56 bytes of stack and save $r23..$r29 (P1..P5, T0, T1), which
// this routine overwrites. They are reloaded at .L_M0.
addi.d $sp, $sp, -56
SDARG $r23, $sp, 0
SDARG $r24, $sp, 8
SDARG $r25, $sp, 16
SDARG $r26, $sp, 24
SDARG $r27, $sp, 32
SDARG $r28, $sp, 40
SDARG $r29, $sp, 48
move S0, SRC
move P0, DST
// T0 = N rounded down to a multiple of 16, T1 = N rounded down to a
// multiple of 8 (shift right, then back left).
srai.d T0, N, 0x04
srai.d T1, N, 0x03
slli.d T0, T0, 0x04
slli.d T1, T1, 0x03
// P2 = DST + M * T0 * 8 and P3 = DST + M * T1 * 8: start of the regions
// written by the n&8 and n&4 tails (the shift by 3 scales to 8-byte units).
mul.d P2, M, T0
mul.d P3, M, T1
slli.d P2, P2, 0x03
slli.d P3, P3, 0x03
add.d P2, DST, P2
add.d P3, DST, P3
// Same computation with N rounded down to multiples of 4 and 2, giving
// P4 (n&2 tail region) and P5 (n&1 tail region).
srai.d T0, N, 0x02
srai.d T1, N, 0x01
slli.d T0, T0, 0x02
slli.d T1, T1, 0x01
mul.d P4, M, T0
mul.d P5, M, T1
slli.d P4, P4, 0x03
slli.d P5, P5, 0x03
add.d P4, DST, P4
add.d P5, DST, P5
// TL = lda << 3 (TL and LDA are the same register): row stride scaled by 8.
slli.d TL, LDA, 0x03
// J = number of 8-row groups.
srai.d J, M, 0x03
// T0 = 2 * TL, used to step row pointers two rows at a time.
slli.d T0, TL, 0x01
// T1 = M << 7 = M * 128: distance P1 moves after each 16-wide chunk.
slli.d T1, M, 0x07
// No complete 8-row group: go straight to the m&4 / m&2 / m&1 handling.
beq ZERO, J, .L_M7
.L_J1: /* J-- */
// Point S1..S8 at eight consecutive rows (TL apart; T0 = 2 * TL) and move
// S0 on to the row after them.
move S1, S0
add.d S2, S0, TL
add.d S3, S1, T0
add.d S4, S2, T0
add.d S5, S3, T0
add.d S6, S4, T0
add.d S7, S5, T0
add.d S8, S6, T0
add.d S0, S7, T0
// P1 starts at this group's slot; P0 moves 0x400 bytes (8 rows x 128 bytes)
// to the slot of the next group.
move P1, P0
addi.d P0, P0, 0x400
// I = number of 128-byte chunks per row. J is decremented here and tested
// at .L_N0.
srai.d I, N, 0x04
addi.d J, J, -1
beq ZERO, I, .L_N15
.L_I1: /* I-- */
// Rows 1 and 2: 128 bytes each -> P1 + 0x000 .. 0x0FF
xvld U0, S1, 0x00
xvld U1, S1, 0x20
xvld U2, S1, 0x40
xvld U3, S1, 0x60
xvld U4, S2, 0x00
xvld U5, S2, 0x20
xvld U6, S2, 0x40
xvld U7, S2, 0x60
xvst U0, P1, 0x00
xvst U1, P1, 0x20
xvst U2, P1, 0x40
xvst U3, P1, 0x60
xvst U4, P1, 0x80
xvst U5, P1, 0xA0
xvst U6, P1, 0xC0
xvst U7, P1, 0xE0
// Rows 3 and 4 -> P1 + 0x100 .. 0x1FF
xvld U0, S3, 0x00
xvld U1, S3, 0x20
xvld U2, S3, 0x40
xvld U3, S3, 0x60
xvld U4, S4, 0x00
xvld U5, S4, 0x20
xvld U6, S4, 0x40
xvld U7, S4, 0x60
xvst U0, P1, 0x100
xvst U1, P1, 0x120
xvst U2, P1, 0x140
xvst U3, P1, 0x160
xvst U4, P1, 0x180
xvst U5, P1, 0x1A0
xvst U6, P1, 0x1C0
xvst U7, P1, 0x1E0
// Rows 5 and 6 -> P1 + 0x200 .. 0x2FF
xvld U0, S5, 0x00
xvld U1, S5, 0x20
xvld U2, S5, 0x40
xvld U3, S5, 0x60
xvld U4, S6, 0x00
xvld U5, S6, 0x20
xvld U6, S6, 0x40
xvld U7, S6, 0x60
xvst U0, P1, 0x200
xvst U1, P1, 0x220
xvst U2, P1, 0x240
xvst U3, P1, 0x260
xvst U4, P1, 0x280
xvst U5, P1, 0x2A0
xvst U6, P1, 0x2C0
xvst U7, P1, 0x2E0
// Rows 7 and 8 -> P1 + 0x300 .. 0x3FF
xvld U0, S7, 0x00
xvld U1, S7, 0x20
xvld U2, S7, 0x40
xvld U3, S7, 0x60
xvld U4, S8, 0x00
xvld U5, S8, 0x20
xvld U6, S8, 0x40
xvld U7, S8, 0x60
xvst U0, P1, 0x300
xvst U1, P1, 0x320
xvst U2, P1, 0x340
xvst U3, P1, 0x360
xvst U4, P1, 0x380
xvst U5, P1, 0x3A0
xvst U6, P1, 0x3C0
xvst U7, P1, 0x3E0
// Advance every row pointer past the 128 bytes just copied.
addi.d S1, S1, 0x80
addi.d S2, S2, 0x80
addi.d S3, S3, 0x80
addi.d S4, S4, 0x80
addi.d S5, S5, 0x80
addi.d S6, S6, 0x80
addi.d S7, S7, 0x80
addi.d S8, S8, 0x80
addi.d I, I, -1
// Next chunk of the same rows lands T1 = M * 128 bytes further on.
add.d P1, P1, T1
blt ZERO, I, .L_I1
.L_N15:
// if(n&8): copy 64 bytes from each of the eight rows into the P2 region.
andi I, N, 0x08
beq ZERO, I, .L_N7
// Rows 1-4 -> P2 + 0x000 .. 0x0FF
xvld U0, S1, 0x00
xvld U1, S1, 0x20
xvld U2, S2, 0x00
xvld U3, S2, 0x20
xvld U4, S3, 0x00
xvld U5, S3, 0x20
xvld U6, S4, 0x00
xvld U7, S4, 0x20
xvst U0, P2, 0x00
xvst U1, P2, 0x20
xvst U2, P2, 0x40
xvst U3, P2, 0x60
xvst U4, P2, 0x80
xvst U5, P2, 0xA0
xvst U6, P2, 0xC0
xvst U7, P2, 0xE0
// Rows 5-8 -> P2 + 0x100 .. 0x1FF
xvld U0, S5, 0x00
xvld U1, S5, 0x20
xvld U2, S6, 0x00
xvld U3, S6, 0x20
xvld U4, S7, 0x00
xvld U5, S7, 0x20
xvld U6, S8, 0x00
xvld U7, S8, 0x20
xvst U0, P2, 0x100
xvst U1, P2, 0x120
xvst U2, P2, 0x140
xvst U3, P2, 0x160
xvst U4, P2, 0x180
xvst U5, P2, 0x1A0
xvst U6, P2, 0x1C0
xvst U7, P2, 0x1E0
addi.d S1, S1, 0x40
addi.d S2, S2, 0x40
addi.d S3, S3, 0x40
addi.d S4, S4, 0x40
addi.d S5, S5, 0x40
addi.d S6, S6, 0x40
addi.d S7, S7, 0x40
addi.d S8, S8, 0x40
// P2 is persistent across row groups: 8 rows x 64 bytes consumed.
addi.d P2, P2, 0x200
.L_N7:
// if(n&4): copy 32 bytes (one LASX vector) from each row into the P3 region.
andi I, N, 0x04
beq ZERO, I, .L_N3
xvld U0, S1, 0x00
xvld U1, S2, 0x00
xvld U2, S3, 0x00
xvld U3, S4, 0x00
xvld U4, S5, 0x00
xvld U5, S6, 0x00
xvld U6, S7, 0x00
xvld U7, S8, 0x00
xvst U0, P3, 0x00
xvst U1, P3, 0x20
xvst U2, P3, 0x40
xvst U3, P3, 0x60
xvst U4, P3, 0x80
xvst U5, P3, 0xA0
xvst U6, P3, 0xC0
xvst U7, P3, 0xE0
addi.d S1, S1, 0x20
addi.d S2, S2, 0x20
addi.d S3, S3, 0x20
addi.d S4, S4, 0x20
addi.d S5, S5, 0x20
addi.d S6, S6, 0x20
addi.d S7, S7, 0x20
addi.d S8, S8, 0x20
// 8 rows x 32 bytes consumed.
addi.d P3, P3, 0x100
.L_N3:
// if(n&2): copy 16 bytes (two 8-byte elements) from each of the eight rows
// into the P4 region, rows packed back to back.
andi I, N, 0x02
beq ZERO, I, .L_N1
// Use 128-bit LSX loads: only 16 bytes per row belong to this tail, so a
// 256-bit load would read up to 16 bytes past the end of the row (and, for
// the last row, possibly past the end of the source buffer).
vld $vr0, S1, 0x00
vld $vr1, S2, 0x00
vld $vr2, S3, 0x00
vld $vr3, S4, 0x00
vld $vr4, S5, 0x00
vld $vr5, S6, 0x00
vld $vr6, S7, 0x00
vld $vr7, S8, 0x00
// Row k goes to P4 + 16 * (k - 1), the same layout the paired 256-bit
// stores produced.
vst $vr0, P4, 0x00
vst $vr1, P4, 0x10
vst $vr2, P4, 0x20
vst $vr3, P4, 0x30
vst $vr4, P4, 0x40
vst $vr5, P4, 0x50
vst $vr6, P4, 0x60
vst $vr7, P4, 0x70
addi.d S1, S1, 0x10
addi.d S2, S2, 0x10
addi.d S3, S3, 0x10
addi.d S4, S4, 0x10
addi.d S5, S5, 0x10
addi.d S6, S6, 0x10
addi.d S7, S7, 0x10
addi.d S8, S8, 0x10
// 8 rows x 16 bytes consumed.
addi.d P4, P4, 0x80
.L_N1:
// if(n&1): copy the last 8-byte element of each row into the P5 region,
// moved as two 4-byte floats per row.
andi I, N, 0x01
beq ZERO, I, .L_N0
fld.s F0, S1, 0x00
fld.s F1, S1, 0x04
fld.s F2, S2, 0x00
fld.s F3, S2, 0x04
fld.s F4, S3, 0x00
fld.s F5, S3, 0x04
fld.s F6, S4, 0x00
fld.s F7, S4, 0x04
fld.s F8, S5, 0x00
fld.s F9, S5, 0x04
fld.s F10, S6, 0x00
fld.s F11, S6, 0x04
fld.s F12, S7, 0x00
fld.s F13, S7, 0x04
fld.s F14, S8, 0x00
fld.s F15, S8, 0x04
fst.s F0, P5, 0x00
fst.s F1, P5, 0x04
fst.s F2, P5, 0x08
fst.s F3, P5, 0x0c
fst.s F4, P5, 0x10
fst.s F5, P5, 0x14
fst.s F6, P5, 0x18
fst.s F7, P5, 0x1c
fst.s F8, P5, 0x20
fst.s F9, P5, 0x24
fst.s F10, P5, 0x28
fst.s F11, P5, 0x2c
fst.s F12, P5, 0x30
fst.s F13, P5, 0x34
fst.s F14, P5, 0x38
fst.s F15, P5, 0x3c
addi.d S1, S1, 0x08
addi.d S2, S2, 0x08
addi.d S3, S3, 0x08
addi.d S4, S4, 0x08
addi.d S5, S5, 0x08
addi.d S6, S6, 0x08
addi.d S7, S7, 0x08
addi.d S8, S8, 0x08
// 8 rows x 8 bytes consumed.
addi.d P5, P5, 0x40
.L_N0:
// J was already decremented at the top of .L_J1; loop while groups remain.
blt ZERO, J, .L_J1
.L_M7:
// if(m&4): same copy as the 8-row loop, for a single group of four rows.
andi J, M, 0x04
beq ZERO, J, .L_M3
// S1..S4 = four consecutive rows; S0 moves four rows on.
move S1, S0
add.d S2, S0, TL
add.d S3, S1, T0
add.d S4, S2, T0
add.d S0, S3, T0
// 4 rows x 128 bytes = 0x200 per chunk in the 16-wide region.
move P1, P0
addi.d P0, P0, 0x200
srai.d I, N, 0x04
beq ZERO, I, .L_4N15
.L_4I1: /* I-- */
// Rows 1 and 2 -> P1 + 0x000 .. 0x0FF
xvld U0, S1, 0x00
xvld U1, S1, 0x20
xvld U2, S1, 0x40
xvld U3, S1, 0x60
xvld U4, S2, 0x00
xvld U5, S2, 0x20
xvld U6, S2, 0x40
xvld U7, S2, 0x60
xvst U0, P1, 0x00
xvst U1, P1, 0x20
xvst U2, P1, 0x40
xvst U3, P1, 0x60
xvst U4, P1, 0x80
xvst U5, P1, 0xA0
xvst U6, P1, 0xC0
xvst U7, P1, 0xE0
// Rows 3 and 4 -> P1 + 0x100 .. 0x1FF
xvld U0, S3, 0x00
xvld U1, S3, 0x20
xvld U2, S3, 0x40
xvld U3, S3, 0x60
xvld U4, S4, 0x00
xvld U5, S4, 0x20
xvld U6, S4, 0x40
xvld U7, S4, 0x60
xvst U0, P1, 0x100
xvst U1, P1, 0x120
xvst U2, P1, 0x140
xvst U3, P1, 0x160
xvst U4, P1, 0x180
xvst U5, P1, 0x1A0
xvst U6, P1, 0x1C0
xvst U7, P1, 0x1E0
addi.d S1, S1, 0x80
addi.d S2, S2, 0x80
addi.d S3, S3, 0x80
addi.d S4, S4, 0x80
addi.d I, I, -1
// Next chunk lands T1 = M * 128 bytes further on.
add.d P1, P1, T1
blt ZERO, I, .L_4I1
.L_4N15:
// if(n&8), four rows: 64 bytes per row into the P2 region.
andi I, N, 0x08
beq ZERO, I, .L_4N7
xvld U0, S1, 0x00
xvld U1, S1, 0x20
xvld U2, S2, 0x00
xvld U3, S2, 0x20
xvld U4, S3, 0x00
xvld U5, S3, 0x20
xvld U6, S4, 0x00
xvld U7, S4, 0x20
xvst U0, P2, 0x00
xvst U1, P2, 0x20
xvst U2, P2, 0x40
xvst U3, P2, 0x60
xvst U4, P2, 0x80
xvst U5, P2, 0xA0
xvst U6, P2, 0xC0
xvst U7, P2, 0xE0
addi.d S1, S1, 0x40
addi.d S2, S2, 0x40
addi.d S3, S3, 0x40
addi.d S4, S4, 0x40
// 4 rows x 64 bytes consumed.
addi.d P2, P2, 0x100
.L_4N7:
// if(n&4), four rows: 32 bytes per row into the P3 region.
andi I, N, 0x04
beq ZERO, I, .L_4N3
xvld U0, S1, 0x00
xvld U1, S2, 0x00
xvld U2, S3, 0x00
xvld U3, S4, 0x00
xvst U0, P3, 0x00
xvst U1, P3, 0x20
xvst U2, P3, 0x40
xvst U3, P3, 0x60
addi.d S1, S1, 0x20
addi.d S2, S2, 0x20
addi.d S3, S3, 0x20
addi.d S4, S4, 0x20
// 4 rows x 32 bytes consumed.
addi.d P3, P3, 0x80
.L_4N3:
// if(n&2), four rows: 16 bytes (two 8-byte elements) per row into the P4
// region, rows packed back to back.
andi I, N, 0x02
beq ZERO, I, .L_4N1
// 128-bit LSX loads: a 256-bit load would read up to 16 bytes beyond the
// data that belongs to this tail, possibly past the end of the source.
vld $vr0, S1, 0x00
vld $vr1, S2, 0x00
vld $vr2, S3, 0x00
vld $vr3, S4, 0x00
// Row k goes to P4 + 16 * (k - 1).
vst $vr0, P4, 0x00
vst $vr1, P4, 0x10
vst $vr2, P4, 0x20
vst $vr3, P4, 0x30
addi.d S1, S1, 0x10
addi.d S2, S2, 0x10
addi.d S3, S3, 0x10
addi.d S4, S4, 0x10
// 4 rows x 16 bytes consumed.
addi.d P4, P4, 0x40
.L_4N1:
// if(n&1), four rows: one 8-byte element per row into the P5 region,
// moved with a single 64-bit FP load/store per row.
andi I, N, 0x01
beq ZERO, I, .L_M3
fld.d F0, S1, 0x00
fld.d F1, S2, 0x00
fld.d F2, S3, 0x00
fld.d F3, S4, 0x00
fst.d F0, P5, 0x00
fst.d F1, P5, 0x08
fst.d F2, P5, 0x10
fst.d F3, P5, 0x18
addi.d S1, S1, 0x08
addi.d S2, S2, 0x08
addi.d S3, S3, 0x08
addi.d S4, S4, 0x08
// 4 rows x 8 bytes consumed.
addi.d P5, P5, 0x20
.L_M3:
// if(m&2): copy one group of two rows.
andi J, M, 0x02
beq ZERO, J, .L_M1
// S1, S2 = the two rows; S0 moves two rows on (T0 = 2 * TL).
move S1, S0
add.d S2, S0, TL
add.d S0, S0, T0
// 2 rows x 128 bytes = 0x100 per chunk in the 16-wide region.
move P1, P0
addi.d P0, P0, 0x100
srai.d I, N, 0x04
beq ZERO, I, .L_2N15
.L_2I1: /* I-- */
// 128 bytes from each row -> P1 + 0x00 .. 0xFF
xvld U0, S1, 0x00
xvld U1, S1, 0x20
xvld U2, S1, 0x40
xvld U3, S1, 0x60
xvld U4, S2, 0x00
xvld U5, S2, 0x20
xvld U6, S2, 0x40
xvld U7, S2, 0x60
xvst U0, P1, 0x00
xvst U1, P1, 0x20
xvst U2, P1, 0x40
xvst U3, P1, 0x60
xvst U4, P1, 0x80
xvst U5, P1, 0xA0
xvst U6, P1, 0xC0
xvst U7, P1, 0xE0
addi.d S1, S1, 0x80
addi.d S2, S2, 0x80
addi.d I, I, -1
// Next chunk lands T1 = M * 128 bytes further on.
add.d P1, P1, T1
blt ZERO, I, .L_2I1
.L_2N15:
// if(n&8), two rows: 64 bytes per row into the P2 region.
andi I, N, 0x08
beq ZERO, I, .L_2N7
xvld U0, S1, 0x00
xvld U1, S1, 0x20
xvld U2, S2, 0x00
xvld U3, S2, 0x20
xvst U0, P2, 0x00
xvst U1, P2, 0x20
xvst U2, P2, 0x40
xvst U3, P2, 0x60
addi.d S1, S1, 0x40
addi.d S2, S2, 0x40
addi.d P2, P2, 0x80
.L_2N7:
// if(n&4), two rows: 32 bytes per row into the P3 region.
andi I, N, 0x04
beq ZERO, I, .L_2N3
xvld U0, S1, 0x00
xvld U1, S2, 0x00
xvst U0, P3, 0x00
xvst U1, P3, 0x20
addi.d S1, S1, 0x20
addi.d S2, S2, 0x20
addi.d P3, P3, 0x40
.L_2N3:
// if(n&2), two rows: 16 bytes (two 8-byte elements) per row into the P4
// region, row 1 followed by row 2.
andi I, N, 0x02
beq ZERO, I, .L_2N1
// 128-bit LSX loads: a 256-bit load would read up to 16 bytes beyond the
// data that belongs to this tail, possibly past the end of the source.
vld $vr0, S1, 0x00
vld $vr1, S2, 0x00
vst $vr0, P4, 0x00
vst $vr1, P4, 0x10
addi.d S1, S1, 0x10
addi.d S2, S2, 0x10
// 2 rows x 16 bytes consumed.
addi.d P4, P4, 0x20
.L_2N1:
// if(n&1), two rows: one 8-byte element per row into the P5 region.
andi I, N, 0x01
beq ZERO, I, .L_M1
fld.d F0, S1, 0x00
fld.d F1, S2, 0x00
fst.d F0, P5, 0x00
fst.d F1, P5, 0x08
addi.d S1, S1, 0x08
addi.d S2, S2, 0x08
// 2 rows x 8 bytes consumed.
addi.d P5, P5, 0x10
.L_M1:
// if(m&1): copy the final single row.
andi J, M, 0x01
beq ZERO, J, .L_M0
move S1, S0
// NOTE(review): S2 is computed here but not read anywhere in this
// single-row section.
add.d S2, S0, TL
move P1, P0
addi.d P0, P0, 0x80
srai.d I, N, 0x04
beq ZERO, I, .L_1N15
.L_1I1: /* I-- */
// 128 bytes of the row -> P1 + 0x00 .. 0x7F
xvld U0, S1, 0x00
xvld U1, S1, 0x20
xvld U2, S1, 0x40
xvld U3, S1, 0x60
xvst U0, P1, 0x00
xvst U1, P1, 0x20
xvst U2, P1, 0x40
xvst U3, P1, 0x60
addi.d S1, S1, 0x80
addi.d I, I, -1
// Next chunk lands T1 = M * 128 bytes further on.
add.d P1, P1, T1
blt ZERO, I, .L_1I1
.L_1N15:
// if(n&8): 64 bytes into the P2 region.
andi I, N, 0x08
beq ZERO, I, .L_1N7
xvld U0, S1, 0x00
xvld U1, S1, 0x20
xvst U0, P2, 0x00
xvst U1, P2, 0x20
addi.d S1, S1, 0x40
addi.d P2, P2, 0x40
.L_1N7:
// if(n&4): 32 bytes into the P3 region.
andi I, N, 0x04
beq ZERO, I, .L_1N3
xvld U0, S1, 0x00
xvst U0, P3, 0x00
addi.d S1, S1, 0x20
addi.d P3, P3, 0x20
.L_1N3:
// if(n&2): 16 bytes into the P4 region, as two 64-bit FP moves.
andi I, N, 0x02
beq ZERO, I, .L_1N1
fld.d F0, S1, 0x00
fld.d F1, S1, 0x08
fst.d F0, P4, 0x00
fst.d F1, P4, 0x08
addi.d S1, S1, 0x10
addi.d P4, P4, 0x10
.L_1N1:
// if(n&1): the last 8-byte element into the P5 region.
andi I, N, 0x01
beq ZERO, I, .L_M0
fld.d F0, S1, 0x00
fst.d F0, P5, 0x00
addi.d S1, S1, 0x08
addi.d P5, P5, 0x08
.L_M0:
// Restore the registers saved in the prologue, release the 56-byte frame
// and return through $r1.
LDARG $r23, $sp, 0
LDARG $r24, $sp, 8
LDARG $r25, $sp, 16
LDARG $r26, $sp, 24
LDARG $r27, $sp, 32
LDARG $r28, $sp, 40
LDARG $r29, $sp, 48
addi.d $sp, $sp, 56
jirl $r0, $r1, 0x00
EPILOGUE

View File

@ -0,0 +1,306 @@
/*******************************************************************************
Copyright (c) 2024, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#define ASSEMBLER
#include "common.h"
/* Function parameters */
#define M $r4 // param 1: m
#define N $r5 // param 2: n
#define SRC $r6 // param 3: src
#define LDA $r7 // param 4: lda
#define DST $r8 // param 5: dst
/* I: inner counter / bit-test result, J: count of 4-row groups. */
#define I $r9
#define J $r10
/* S1..S4: the (up to four) source rows currently being copied. */
#define S1 $r12
#define S2 $r13
#define S3 $r14
#define S4 $r15
/* TD: start of the next row group in the 4-wide destination region.
 * TS: start of the next source row group.
 * TL: lda scaled to bytes (lda * 8).
 * T0: scratch. */
#define TD $r16
#define TS $r17
#define TL $r18
#define T0 $r19
/* S8: current store position in the 4-wide region (boffset1).
 * S9: store position for the n&2 tail (boffset2); lives in $r23, which the
 *     prologue saves and the epilogue restores.
 * S10: store position for the n&1 tail (boffset3). */
#define S8 $r20
#define S9 $r23
#define S10 $r11
#define ZERO $r0
/* Scalar FP registers used by the n&1 tails. */
#define F0 $f0
#define F1 $f1
#define F2 $f2
#define F3 $f3
#define F4 $f4
#define F5 $f5
#define F6 $f6
#define F7 $f7
/* LASX vectors */
#define U0 $xr0
#define U1 $xr1
#define U2 $xr2
#define U3 $xr3
#define U4 $xr4
#define U5 $xr5
#define U6 $xr6
#define U7 $xr7
#define U8 $xr8
#define U9 $xr9
#define U10 $xr10
#define U11 $xr11
#define U12 $xr12
#define U13 $xr13
#define U14 $xr14
#define U15 $xr15
PROLOGUE
// Save $r23 (used as S9); it is restored at .L_M0.
addi.d $sp, $sp, -8
SDARG $r23, $sp, 0
move TS, SRC //aoffset
move TD, DST //boffset
// TL = lda * 8: row stride in bytes (shift by 2, then by 1).
slli.d TL, LDA, 0x02 //lda
slli.d TL, TL, 0x01 //lda
// S9 = DST + m * (n & ~3) * 8: start of the region written by the n&2 tail.
// mul.d keeps the full 64-bit product; m and n are 64-bit values, and a
// 32-bit mul.w would truncate the offset for large m * n.
ori T0, ZERO, 0x03
andn T0, N, T0
mul.d T0, M, T0
slli.d T0, T0, 0x01
slli.d T0, T0, 0x02
add.d S9, DST, T0 //boffset2
// S10 = DST + m * (n & ~1) * 8: start of the region written by the n&1 tail.
ori T0, ZERO, 0x01
andn T0, N, T0
mul.d T0, M, T0
slli.d T0, T0, 0x01
slli.d T0, T0, 0x02
add.d S10, DST, T0 //boffset3
// J = number of complete 4-row groups; skip the main loop when there are none.
srai.d J, M, 0x02 //j
beq J, ZERO, .L_M1
.L_J1: /* if(j>0) j--*/
// S1..S4 = four consecutive source rows, TL bytes apart.
move S1, TS //aoffset1
add.d S2, S1, TL
add.d S3, S2, TL
add.d S4, S3, TL
// TS moves four rows on.
slli.d T0, TL, 0x02
add.d TS, TS, T0
// S8 starts at this group's slot; TD moves 0x80 bytes (4 rows x 32 bytes)
// to the slot of the next group.
move S8, TD //boffset1
addi.d TD, TD, 0x80
// I = number of 32-byte chunks per row.
srai.d I, N, 0x02
beq ZERO, I, .L_JN1
.L_JI1: /* if(i>0) i--*/
// One 32-byte vector from each of the four rows, stored back to back.
xvld U0, S1, 0x00
xvld U1, S2, 0x00
xvld U2, S3, 0x00
xvld U3, S4, 0x00
xvst U0, S8, 0x00
xvst U1, S8, 0x20
xvst U2, S8, 0x40
xvst U3, S8, 0x60
addi.d S1, S1, 0x20
addi.d S2, S2, 0x20
addi.d S3, S3, 0x20
addi.d S4, S4, 0x20
// Next chunk of the same rows lands m * 32 bytes further on.
slli.d T0, M, 0x05
add.d S8, S8, T0
addi.d I, I, -1
blt ZERO, I, .L_JI1
.L_JN1: /* if(n&2) */
andi I, N, 0x02
beq ZERO, I, .L_JN2
// 16 bytes per row (128-bit LSX moves) into the S9 region.
vld $vr0, S1, 0x00
vld $vr1, S2, 0x00
vld $vr2, S3, 0x00
vld $vr3, S4, 0x00
vst $vr0, S9, 0x00
vst $vr1, S9, 0x10
vst $vr2, S9, 0x20
vst $vr3, S9, 0x30
addi.d S1, S1, 0x10
addi.d S2, S2, 0x10
addi.d S3, S3, 0x10
addi.d S4, S4, 0x10
addi.d S9, S9, 0x40
.L_JN2: /* if(n&1) */
andi I, N, 0x01
beq ZERO, I, .L_J0
// 8 bytes per row, moved as two 4-byte floats, into the S10 region.
fld.s F0, S1, 0x00
fld.s F1, S1, 0x04
fld.s F2, S2, 0x00
fld.s F3, S2, 0x04
fld.s F4, S3, 0x00
fld.s F5, S3, 0x04
fld.s F6, S4, 0x00
fld.s F7, S4, 0x04
fst.s F0, S10, 0x00
fst.s F1, S10, 0x04
fst.s F2, S10, 0x08
fst.s F3, S10, 0x0c
fst.s F4, S10, 0x10
fst.s F5, S10, 0x14
fst.s F6, S10, 0x18
fst.s F7, S10, 0x1c
addi.d S10, S10, 0x20
.L_J0:
// Next 4-row group, if any.
addi.d J, J, -1
blt ZERO, J, .L_J1
.L_M1: /* if(m&2) */
andi I, M, 0x02
beq ZERO, I, .L_M2
// S1, S2 = two consecutive rows; TS moves two rows on.
move S1, TS //aoffset1
add.d S2, S1, TL
slli.d T0, TL, 0x01
add.d TS, TS, T0
// 2 rows x 32 bytes = 0x40 per chunk in the 4-wide region.
move S8, TD //boffset1
addi.d TD, TD, 0x40
srai.d I, N, 0x02
beq ZERO, I, .L_M1N1
.L_M1I1: /* if(i>0) */
xvld U0, S1, 0x00
xvld U1, S2, 0x00
xvst U0, S8, 0x00
xvst U1, S8, 0x20
addi.d S1, S1, 0x20
addi.d S2, S2, 0x20
// Next chunk lands m * 32 bytes further on.
slli.d T0, M, 0x05
add.d S8, S8, T0
addi.d I, I, -1
blt ZERO, I, .L_M1I1
.L_M1N1: /* if(n&2) */
andi I, N, 0x02
beq ZERO, I, .L_M1N2
// 16 bytes per row into the S9 region.
vld $vr0, S1, 0x00
vld $vr1, S2, 0x00
vst $vr0, S9, 0x00
vst $vr1, S9, 0x10
addi.d S1, S1, 0x10
addi.d S2, S2, 0x10
addi.d S9, S9, 0x20
.L_M1N2: /* if(n&1) */
andi I, N, 0x01
beq ZERO, I, .L_M2
// 8 bytes per row into the S10 region.
fld.s F0, S1, 0x00
fld.s F1, S1, 0x04
fld.s F2, S2, 0x00
fld.s F3, S2, 0x04
fst.s F0, S10, 0x00
fst.s F1, S10, 0x04
fst.s F2, S10, 0x08
fst.s F3, S10, 0x0c
addi.d S10, S10, 0x10
.L_M2: /* if(m&1) */
andi I, M, 0x01
beq ZERO, I, .L_M0
// Final single row; TS and TD are not advanced because nothing follows.
move S1, TS //aoffset1
move S8, TD //boffset1
srai.d I, N, 0x02
beq ZERO, I, .L_M2N1
.L_M2I1: /* if(i>0) */
xvld U0, S1, 0x00
xvst U0, S8, 0x00
addi.d S1, S1, 0x20
// Next chunk lands m * 32 bytes further on.
slli.d T0, M, 0x05
add.d S8, S8, T0
addi.d I, I, -1
blt ZERO, I, .L_M2I1
.L_M2N1: /* if(n&2) */
andi I, N, 0x02
beq ZERO, I, .L_M2N2
// 16 bytes into the S9 region (S9 is not advanced: last use).
vld $vr0, S1, 0x00
vst $vr0, S9, 0x00
addi.d S1, S1, 0x10
.L_M2N2: /* if(n&1) */
andi I, N, 0x01
beq ZERO, I, .L_M0
// Last 8 bytes into the S10 region.
fld.s F0, S1, 0x00
fld.s F1, S1, 0x04
fst.s F0, S10, 0x00
fst.s F1, S10, 0x04
.L_M0:
// Restore $r23, release the 8-byte frame and return through $r1.
LDARG $r23, $sp, 0
addi.d $sp, $sp, 8
jirl $r0, $r1, 0x00
EPILOGUE

View File

@ -0,0 +1,383 @@
/*******************************************************************************
Copyright (c) 2024, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#define ASSEMBLER
#include "common.h"
#include "loongarch64_asm.S"
/*********************************************************************
* 2024/02/20 guxiwei
* UTEST : OK
* CTEST : OK
* TEST : OK
*
*
*********************************************************************/
/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
* FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
*/
/* Arguments as they arrive (see the prototype above). inc_y is not in a
 * register on entry: the entry code loads it from the stack into $r6. */
#define M $r4
#define N $r5
#define ALPHA_R $f0
#define ALPHA_I $f1
#define A $r7
#define LDA $r8
#define X $r9
#define INC_X $r10
#define Y $r11
#define INC_Y $r6
/* J: column-loop counter, I: row-loop counter (also used for the
 * dispatch index at entry), K: scratch / running row count. */
#define J $r12
#define I $r13
#define K $r14
/* Y_ORG: original y pointer, reloaded at the start of every column pass. */
#define Y_ORG $r15
#define OFFSET $r16
/* K_LDA: amount added to the column pointers after a pass over the rows. */
#define K_LDA $r17
/* M8: set from M with a shift of 3 at entry. */
#define M8 $r18
#define T0 $r19
/* PA0..PA7: pointers into eight consecutive columns of A. */
#define PA0 $r20
#define PA1 $r23
#define PA2 $r24
#define PA3 $r25
#define PA4 $r26
#define PA5 $r27
#define PA6 $r28
#define PA7 $r29
/* Vector registers. Several macros below address Y0 and A0..A15 through
 * raw $fN names ($f10, $f12, $f13, ...), so the numbering here must stay
 * in step with those macros. */
#define VALPHA $xr1
#define X0 $xr2
#define X1 $xr3
#define X2 $xr4
#define X3 $xr5
#define X4 $xr6
#define X5 $xr7
#define X6 $xr8
#define X7 $xr9
#define Y0 $xr10
#define Y1 $xr11
#define A0 $xr12
#define A1 $xr13
#define A2 $xr14
#define A3 $xr15
#define A4 $xr16
#define A5 $xr17
#define A6 $xr18
#define A7 $xr19
#define A8 $xr20
#define A9 $xr21
#define A10 $xr22
#define A11 $xr23
#define A12 $xr24
#define A13 $xr25
#define A14 $xr26
#define A15 $xr27
#define TMP0 $xr28
#define TMP1 $xr29
#define TMP2 $xr30
/* Translate the build-time CONJ / XCONJ switches into the two 0/1 flags
 * passed to GCOMPLEXMUL / GCOMPLEXMADD below:
 *   GXCONJ = 1 when XCONJ is defined, GCONJ = 1 when CONJ is defined. */
#if !defined(CONJ)
#if !defined(XCONJ)
#define GXCONJ 0
#define GCONJ 0
#else
#define GXCONJ 1
#define GCONJ 0
#endif
#else
#if !defined(XCONJ)
#define GXCONJ 0
#define GCONJ 1
#else
#define GXCONJ 1
#define GCONJ 1
#endif
#endif
// CLOAD_X_8: contiguous-x variant. Loads the eight 8-byte x elements at
// X + 0x00 .. 0x38 into X0..X7 via GLDREPL (presumably a replicating load
// like the xvldrepl.d used in CLOAD_X_8_GAP - confirm in
// loongarch64_asm.S), then replaces each Xn with the GCOMPLEXMUL result of
// Xn and VALPHA. TMP0..TMP2 are passed as scratch.
.macro CLOAD_X_8
GLDREPL xv, d, X0, X, 0x00, X1, X, 0x08, X2, X, 0x10, X3, X, 0x18, \
X4, X, 0x20, X5, X, 0x28, X6, X, 0x30, X7, X, 0x38
GCOMPLEXMUL GXCONJ, \
xvf, s, X0, X0, VALPHA, TMP0, TMP1, TMP2, \
X1, X1, VALPHA, TMP0, TMP1, TMP2, \
X2, X2, VALPHA, TMP0, TMP1, TMP2, \
X3, X3, VALPHA, TMP0, TMP1, TMP2, \
X4, X4, VALPHA, TMP0, TMP1, TMP2, \
X5, X5, VALPHA, TMP0, TMP1, TMP2, \
X6, X6, VALPHA, TMP0, TMP1, TMP2, \
X7, X7, VALPHA, TMP0, TMP1, TMP2
.endm
// CLOAD_X_8_GAP: strided-x variant of CLOAD_X_8. Walks T0 from X in steps
// of INC_X and replicates each 8-byte element into X0..X7 with
// xvldrepl.d, then applies the same GCOMPLEXMUL by VALPHA. X itself is
// left unchanged; T0 is clobbered.
.macro CLOAD_X_8_GAP
xvldrepl.d X0, X, 0x00
PTR_ADD T0, X, INC_X
xvldrepl.d X1, T0, 0x00
PTR_ADD T0, T0, INC_X
xvldrepl.d X2, T0, 0x00
PTR_ADD T0, T0, INC_X
xvldrepl.d X3, T0, 0x00
PTR_ADD T0, T0, INC_X
xvldrepl.d X4, T0, 0x00
PTR_ADD T0, T0, INC_X
xvldrepl.d X5, T0, 0x00
PTR_ADD T0, T0, INC_X
xvldrepl.d X6, T0, 0x00
PTR_ADD T0, T0, INC_X
xvldrepl.d X7, T0, 0x00
GCOMPLEXMUL GXCONJ, \
xvf, s, X0, X0, VALPHA, TMP0, TMP1, TMP2, \
X1, X1, VALPHA, TMP0, TMP1, TMP2, \
X2, X2, VALPHA, TMP0, TMP1, TMP2, \
X3, X3, VALPHA, TMP0, TMP1, TMP2, \
X4, X4, VALPHA, TMP0, TMP1, TMP2, \
X5, X5, VALPHA, TMP0, TMP1, TMP2, \
X6, X6, VALPHA, TMP0, TMP1, TMP2, \
X7, X7, VALPHA, TMP0, TMP1, TMP2
.endm
// CLOAD_Y_8: contiguous-y load of Y0 from Y + 0 and Y1 from Y + 0x20
// through the GLD helper (presumably two 256-bit vector loads - confirm in
// loongarch64_asm.S).
.macro CLOAD_Y_8
GLD xv, , Y0, Y, 0, Y1, Y, 0x20
.endm
// CLOAD_Y_8_GAP: strided-y gather of eight 8-byte elements into Y0/Y1.
// Elements 0 and 4 are loaded straight into $f10 / $f11 (the low lanes of
// Y0 = $xr10 and Y1 = $xr11). The other six go to $f13..$f15 and
// $f17..$f19 (low lanes of A1..A3 and A5..A7) and are then moved into
// doubleword lanes 1..3 of Y0 and Y1 by GINSVE0. Clobbers T0, A1..A3 and
// A5..A7.
.macro CLOAD_Y_8_GAP
fld.d $f10, Y, 0
fldx.d $f13, Y, INC_Y
// T0 = Y + 2 * INC_Y
PTR_ALSL T0, INC_Y, Y, 1
fld.d $f14, T0, 0
fldx.d $f15, T0, INC_Y
// T0 = Y + 4 * INC_Y
PTR_ALSL T0, INC_Y, Y, 2
fld.d $f11, T0, 0
fldx.d $f17, T0, INC_Y
// T0 = Y + 6 * INC_Y
PTR_ADD T0, T0, INC_Y
PTR_ADD T0, T0, INC_Y
fld.d $f18, T0, 0
fldx.d $f19, T0, INC_Y
GINSVE0 xv, d, Y0, A1, 1, Y0, A2, 2, Y0, A3, 3, Y1, A5, 1, Y1, A6, 2, Y1, A7, 3
.endm
// CSTORE_Y_8_GAP: strided-y scatter, the inverse of CLOAD_Y_8_GAP. Stores
// doubleword lanes 0..3 of Y0 and then of Y1 to eight addresses starting
// at Y and spaced INC_Y apart. Y is left unchanged; T0 is clobbered.
.macro CSTORE_Y_8_GAP
xvstelm.d Y0, Y, 0, 0
PTR_ADD T0, Y, INC_Y
xvstelm.d Y0, T0, 0, 1
PTR_ADD T0, T0, INC_Y
xvstelm.d Y0, T0, 0, 2
PTR_ADD T0, T0, INC_Y
xvstelm.d Y0, T0, 0, 3
PTR_ADD T0, T0, INC_Y
xvstelm.d Y1, T0, 0, 0
PTR_ADD T0, T0, INC_Y
xvstelm.d Y1, T0, 0, 1
PTR_ADD T0, T0, INC_Y
xvstelm.d Y1, T0, 0, 2
PTR_ADD T0, T0, INC_Y
xvstelm.d Y1, T0, 0, 3
.endm
// CGEMV_N_8x8: one 8-row by 8-column update. Loads two vectors from each
// column pointer PA0..PA7 into A0..A15 via GLD_INC (presumably
// post-incrementing the pointer by 0x20 after each load - confirm in
// loongarch64_asm.S), then accumulates into Y0 (from the even-numbered A
// registers) and Y1 (from the odd-numbered ones) via GCOMPLEXMADD, pairing
// column k with the pre-scaled x value Xk. TMP0..TMP2 are scratch.
.macro CGEMV_N_8x8
GLD_INC xv, , 0x20, \
A0, PA0, 0, A1, PA0, 0, \
A2, PA1, 0, A3, PA1, 0, \
A4, PA2, 0, A5, PA2, 0, \
A6, PA3, 0, A7, PA3, 0, \
A8, PA4, 0, A9, PA4, 0, \
A10, PA5, 0, A11, PA5, 0, \
A12, PA6, 0, A13, PA6, 0, \
A14, PA7, 0, A15, PA7, 0
GCOMPLEXMADD GXCONJ, GCONJ, \
xvf, s, Y0, X0, A0, Y0, TMP0, TMP1, TMP2, Y1, X0, A1, Y1, TMP0, TMP1, TMP2, \
Y0, X1, A2, Y0, TMP0, TMP1, TMP2, Y1, X1, A3, Y1, TMP0, TMP1, TMP2, \
Y0, X2, A4, Y0, TMP0, TMP1, TMP2, Y1, X2, A5, Y1, TMP0, TMP1, TMP2, \
Y0, X3, A6, Y0, TMP0, TMP1, TMP2, Y1, X3, A7, Y1, TMP0, TMP1, TMP2, \
Y0, X4, A8, Y0, TMP0, TMP1, TMP2, Y1, X4, A9, Y1, TMP0, TMP1, TMP2, \
Y0, X5, A10, Y0, TMP0, TMP1, TMP2, Y1, X5, A11, Y1, TMP0, TMP1, TMP2, \
Y0, X6, A12, Y0, TMP0, TMP1, TMP2, Y1, X6, A13, Y1, TMP0, TMP1, TMP2, \
Y0, X7, A14, Y0, TMP0, TMP1, TMP2, Y1, X7, A15, Y1, TMP0, TMP1, TMP2
.endm
// CSTORE_Y_8: contiguous-y store of Y0 to Y + 0 and Y1 to Y + 0x20 through
// the GST helper (the counterpart of CLOAD_Y_8).
.macro CSTORE_Y_8
GST xv, , Y0, Y, 0, Y1, Y, 0x20
.endm
// CLOAD_X_1: single-column variant of CLOAD_X_8. Loads the 8-byte x
// element at X into X0 via GLDREPL and replaces it with the GCOMPLEXMUL
// result of X0 and VALPHA.
.macro CLOAD_X_1
GLDREPL xv, d, X0, X, 0x00
GCOMPLEXMUL GXCONJ, \
xvf, s, X0, X0, VALPHA, TMP0, TMP1, TMP2
.endm
// CLOAD_Y_1: load one 8-byte y element into $f10, the low lane of
// Y0 ($xr10).
.macro CLOAD_Y_1
fld.d $f10, Y, 0
.endm
// CGEMV_N_1x8: one row across eight columns. Loads one 8-byte element from
// each of PA0..PA7 into $f12, $f14, ..., $f26 (the low lanes of the
// even-numbered A0, A2, ..., A14) via GLD_INC (presumably post-incrementing
// each pointer by 0x08 - confirm in loongarch64_asm.S), then accumulates
// them with X0..X7 into Y0 via GCOMPLEXMADD.
.macro CGEMV_N_1x8
GLD_INC f, d, 0x08, $f12, PA0, 0, $f14, PA1, 0, $f16, PA2, 0, $f18, PA3, 0, \
$f20, PA4, 0, $f22, PA5, 0, $f24, PA6, 0, $f26, PA7, 0
GCOMPLEXMADD GXCONJ, GCONJ, \
xvf, s, Y0, X0, A0, Y0, TMP0, TMP1, TMP2, \
Y0, X1, A2, Y0, TMP0, TMP1, TMP2, \
Y0, X2, A4, Y0, TMP0, TMP1, TMP2, \
Y0, X3, A6, Y0, TMP0, TMP1, TMP2, \
Y0, X4, A8, Y0, TMP0, TMP1, TMP2, \
Y0, X5, A10, Y0, TMP0, TMP1, TMP2, \
Y0, X6, A12, Y0, TMP0, TMP1, TMP2, \
Y0, X7, A14, Y0, TMP0, TMP1, TMP2
.endm
// CSTORE_Y_1: store the low 8 bytes of Y0 ($f10) back to Y.
.macro CSTORE_Y_1
fst.d $f10, Y, 0
.endm
// CGEMV_N_1x1: one row, one column. Loads one 8-byte element from PA0 into
// $f12 (low lane of A0), advances PA0 by 8 bytes and accumulates it with
// X0 into Y0 via GCOMPLEXMADD.
.macro CGEMV_N_1x1
fld.d $f12, PA0, 0
PTR_ADDI PA0, PA0, 0x08
GCOMPLEXMADD GXCONJ, GCONJ, \
xvf, s, Y0, X0, A0, Y0, TMP0, TMP1, TMP2
.endm
// CGEMV_N_LASX: full driver loop, instantiated once per stride combination.
//   XW        - tag spliced into every local label so that the four
//               expansions do not collide
//   X_8, X_1  - suffixes selecting the CLOAD_<suffix> macros for x
//   Y_8, Y_1  - suffixes selecting the CLOAD_<suffix> / CSTORE_<suffix>
//               macros for y
// Columns are processed eight at a time and then one at a time; within
// each column pass, rows are processed eight at a time and then one at a
// time. Every path ends at .L_END.
.macro CGEMV_N_LASX XW:req, X_8:req, X_1:req, Y_8:req, Y_1:req
// J = number of 8-column blocks.
PTR_SRLI J, N, 3
beqz J, .L_\XW\()_N_7
// K_LDA = 8 * LDA - M8: added to the column pointers after each pass over
// the rows to reach the next block of eight columns.
PTR_SLLI K_LDA, LDA, 3
PTR_SUB K_LDA, K_LDA, M8
.L_\XW\()_N_L8:
// Load and pre-scale eight x values, then restart at the top of y.
CLOAD_\X_8
xor K, K, K
move Y, Y_ORG
// I = number of 8-row blocks.
PTR_SRLI I, M, 3
beqz I, .L_\XW\()_M_7
.align 5
.L_\XW\()_M_L8:
CLOAD_\Y_8
CGEMV_N_8x8
CSTORE_\Y_8
PTR_ADDI I, I, -1
// Y += 8 * INC_Y
PTR_ALSL Y, INC_Y, Y, 3
PTR_ADDI K, K, 8
bnez I, .L_\XW\()_M_L8
.L_\XW\()_M_7:
// Remaining M & 7 rows, one at a time.
andi I, M, 7
beqz I, .L_\XW\()_M_END
.align 5
.L_\XW\()_M_L1:
CLOAD_\Y_1
CGEMV_N_1x8
CSTORE_\Y_1
PTR_ADDI I, I, -1
PTR_ADD Y, Y, INC_Y
PTR_ADDI K, K, 1
bnez I, .L_\XW\()_M_L1
.L_\XW\()_M_END:
PTR_ADDI J, J, -1
// Add K_LDA to all eight column pointers. The three preprocessor branches
// differ only in the width suffix (d or w) handed to GADD.
#if __loongarch_grlen == 64
GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \
PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA
#elif __loongarch_grlen == 32
GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \
PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA
#else
GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \
PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA
#endif
// X += 8 * INC_X
PTR_ALSL X, INC_X, X, 3
bnez J, .L_\XW\()_N_L8
.L_\XW\()_N_7:
// Remaining N & 7 columns, one at a time, using only PA0.
andi J, N, 7
beqz J, .L_END
.L_\XW\()_N_L1:
CLOAD_\X_1
xor K, K, K
move Y, Y_ORG
move I, M
beqz I, .L_END
.align 5
.L_\XW\()_N_1_M_L1:
CLOAD_\Y_1
CGEMV_N_1x1
CSTORE_\Y_1
PTR_ADDI I, I, -1
PTR_ADD Y, Y, INC_Y
PTR_ADDI K, K, 1
bnez I, .L_\XW\()_N_1_M_L1
.L_\XW\()_N_1_M_END:
PTR_ADDI J, J, -1
// PA0 advanced by 8 bytes per row above; LDA - M8 moves it to the next
// column.
PTR_SUB K_LDA, LDA, M8
PTR_ADD PA0, PA0, K_LDA
PTR_ADD X, X, INC_X
bnez J, .L_\XW\()_N_L1
b .L_END
.endm
PROLOGUE
// inc_y is fetched from the top of the incoming stack before
// push_if_used touches the frame.
PTR_LD INC_Y, $sp, 0
// push_if_used / pop_if_used come from loongarch64_asm.S; presumably they
// save and restore the callee-saved registers this routine uses - confirm
// there.
push_if_used 17 + 7, 31
// Build a 2-bit dispatch index in I: bit 1 = (inc_x != 1),
// bit 0 = (inc_y != 1).
PTR_ADDI K, $r0, 0x01
PTR_SUB I, INC_X, K
PTR_SUB J, INC_Y, K
maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */
maskeqz J, K, J /* if(inc_y == 1) j = 0; else j = 1; */
PTR_ALSL I, I, J, 1
// Shift LDA, INC_X and INC_Y by 3 and set M8 from M with the same shift
// (GSLLI helper; 8 bytes per element).
GSLLI , d, LDA, LDA, 3, INC_X, INC_X, 3, INC_Y, INC_Y, 3, M8, M, 3
// Init VALPHA
// Pack the low words of $xr0 (alpha_r) and $xr1 (alpha_i) side by side,
// then replicate that 64-bit pair across VALPHA.
xvpackev.w $xr0, $xr1, $xr0
xvreplve0.d VALPHA, $xr0
move Y_ORG, Y
// PA0..PA7 = the first eight columns of A, LDA apart. The three
// preprocessor branches differ only in the width suffix handed to GADD.
move PA0, A
#if __loongarch_grlen == 64
GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \
PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA
#elif __loongarch_grlen == 32
GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \
PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA
#else
GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \
PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA
#endif
// Jump through a table of 16-bit offsets relative to .L_GAP_TABLE:
// I * 2 indexes the halfword entry, and the loaded offset is added back to
// the table address to form the target.
la.local T0, .L_GAP_TABLE
PTR_ALSL I, I, T0, 1
ld.h K, I, 0 // Obtain the offset address
PTR_ADD T0, T0, K
jirl $r0, T0, 0
.L_GAP_TABLE:
.hword .L_GAP_0_0 - .L_GAP_TABLE
.hword .L_GAP_0_1 - .L_GAP_TABLE
.hword .L_GAP_1_0 - .L_GAP_TABLE
.hword .L_GAP_1_1 - .L_GAP_TABLE
// Each expansion below ends with a branch to .L_END, so control never
// falls through from one variant into the next.
.L_GAP_0_0: /* if (inc_x == 1) && (incy == 1) */
CGEMV_N_LASX GAP_0_0, X_8, X_1, Y_8, Y_1
.L_GAP_0_1: /* if (inc_x == 1) && (incy != 1) */
CGEMV_N_LASX GAP_0_1, X_8, X_1, Y_8_GAP, Y_1
.L_GAP_1_0: /* if (inc_x != 1) && (incy == 1) */
CGEMV_N_LASX GAP_1_0, X_8_GAP, X_1, Y_8, Y_1
.L_GAP_1_1: /* if (inc_x != 1) && (incy != 1) */
CGEMV_N_LASX GAP_1_1, X_8_GAP, X_1, Y_8_GAP, Y_1
.L_END:
pop_if_used 17 + 7, 31
jirl $r0, $r1, 0x0
EPILOGUE

View File

@ -0,0 +1,342 @@
/*******************************************************************************
Copyright (c) 2024, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#define ASSEMBLER
#include "common.h"
#include "loongarch64_asm.S"
/*********************************************************************
* 2024/02/20 guxiwei
* UTEST : OK
* CTEST : OK
* TEST : OK
*
*
*********************************************************************/
/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
* FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
*/
/* Arguments (see the prototype above). INC_Y is assigned $r6 here rather
 * than an incoming argument register of its own. */
#define M $r4
#define N $r5
#define ALPHA_R $f0
#define ALPHA_I $f1
#define A $r7
#define LDA $r8
#define X $r9
#define INC_X $r10
#define Y $r11
#define INC_Y $r6
/* J / I: loop counters. K and PY0 are two names for the same register
 * ($r14). */
#define J $r12
#define I $r13
#define K $r14
#define PY0 $r14
/* X_ORG: original x pointer, reloaded at the start of every column pass. */
#define X_ORG $r15
#define PY1 $r16
#define K_LDA $r17
#define PY2 $r18
#define T0 $r19
/* PA0..PA7: pointers into eight columns of A. */
#define PA0 $r20
#define PA1 $r23
#define PA2 $r24
#define PA3 $r25
#define PA4 $r26
#define PA5 $r27
#define PA6 $r28
#define PA7 $r29
#define M8 $r30
/* Vector registers. CLOAD_X8_GAP addresses X0, X1 and A0..A2 through raw
 * $f1..$f5 names, so the numbering here must stay in step with it. */
#define VALPHA $xr0
#define X0 $xr1
#define X1 $xr2
#define A0 $xr3
#define A1 $xr4
#define A2 $xr5
#define A3 $xr6
#define A4 $xr7
#define A5 $xr8
#define A6 $xr9
#define A7 $xr10
#define A8 $xr11
#define A9 $xr12
#define A10 $xr13
#define A11 $xr14
#define A12 $xr15
#define A13 $xr16
#define A14 $xr17
#define A15 $xr18
/* TP0..TP7: accumulators cleared by ZERO_Y8 / ZERO_Y1 and updated by
 * CGEMV_T_8x8. */
#define TP0 $xr19
#define TP1 $xr20
#define TP2 $xr21
#define TP3 $xr22
#define TP4 $xr23
#define TP5 $xr24
#define TP6 $xr25
#define TP7 $xr26
#define TMP0 $xr27
#define TMP1 $xr28
#define TMP2 $xr29
/* Y0..Y7 reuse the registers of A0..A7 ($xr3..$xr10). */
#define Y0 $xr3
#define Y1 $xr4
#define Y2 $xr5
#define Y3 $xr6
#define Y4 $xr7
#define Y5 $xr8
#define Y6 $xr9
#define Y7 $xr10
/* Flag pair 1 (passed to GCOMPLEXMADD in CGEMV_T_8x8): GXCONJ1 is 0 when
 * CONJ and XCONJ are both defined or both undefined, and 1 when exactly
 * one of them is defined. GCONJ1 is always 0. */
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
#define GXCONJ1 0
#define GCONJ1 0
#else
#define GXCONJ1 1
#define GCONJ1 0
#endif
/* Flag pair 2: GXCONJ2 is always 0; GCONJ2 is 1 only when XCONJ is
 * defined. */
#if !defined(XCONJ)
#define GXCONJ2 0
#define GCONJ2 0
#else
#define GXCONJ2 0
#define GCONJ2 1
#endif
/* Clear the eight column accumulators TP0..TP7 by XOR-ing each with itself.
 * GXOR is defined outside this view; presumably it emits one xvxor.v per
 * (dst, src, src) triple -- confirm in loongarch64_asm.S. */
.macro ZERO_Y8
GXOR xv, v, TP0, TP0, TP0, TP1, TP1, TP1, TP2, TP2, TP2, TP3, TP3, TP3, \
TP4, TP4, TP4, TP5, TP5, TP5, TP6, TP6, TP6, TP7, TP7, TP7
.endm
/* Clear the single accumulator TP0 used by the one-column-at-a-time loop. */
.macro ZERO_Y1
GXOR xv, v, TP0, TP0, TP0
.endm
/* Contiguous x: fetch 64 bytes starting at X into X0 (offset 0x00) and X1
 * (offset 0x20), i.e. eight 8-byte elements. X itself is not advanced here. */
.macro CLOAD_X8
GLD xv, , X0, X, 0x00, X1, X, 0x20
.endm
/* Strided x: gather eight 8-byte elements spaced INC_X bytes apart into X0/X1.
 * Each element is loaded with fld.d into the low 64 bits of a vector register
 * ($f1 = X0, $f2 = X1, $f3 = A0, $f4 = A1, $f5 = A2) and then inserted into
 * its slot with GINSVE0. A0..A2 are used as staging registers and are
 * overwritten. T0 is clobbered; X is not advanced. */
.macro CLOAD_X8_GAP
// Elements 0..3 -> X0 slots 0..3.
fld.d $f1, X, 0x00
fldx.d $f2, X, INC_X
PTR_ALSL T0, INC_X, X, 1
fld.d $f3, T0, 0x00
fldx.d $f4, T0, INC_X
GINSVE0 xv, d, X0, X1, 1, X0, A0, 2, X0, A1, 3
// Elements 4..7 -> X1 slots 0..3 (element 4 is loaded straight into the low part of X1).
PTR_ALSL T0, INC_X, X, 2
fld.d $f2, T0, 0x00
fldx.d $f3, T0, INC_X
PTR_ALSL T0, INC_X, T0, 1
fld.d $f4, T0, 0x00
fldx.d $f5, T0, INC_X
GINSVE0 xv, d, X1, A0, 1, X1, A1, 2, X1, A2, 3
.endm
/* One 8-row by 8-column step: load two vectors from each of the eight column
 * pointers and accumulate (column data) * (X0, X1) into TP0..TP7, one
 * accumulator per column. GLD_INC is defined outside this view; since both
 * loads of a column use offset 0 from the same pointer, it presumably advances
 * the pointer by 0x20 after each load -- confirm in loongarch64_asm.S. */
.macro CGEMV_T_8x8
GLD_INC xv, , 0x20, \
A0, PA0, 0, A1, PA0, 0, \
A2, PA1, 0, A3, PA1, 0, \
A4, PA2, 0, A5, PA2, 0, \
A6, PA3, 0, A7, PA3, 0, \
A8, PA4, 0, A9, PA4, 0, \
A10, PA5, 0, A11, PA5, 0, \
A12, PA6, 0, A13, PA6, 0, \
A14, PA7, 0, A15, PA7, 0
// TPn = A(2n) * X0 + TPn, then TPn = A(2n+1) * X1 + TPn, for n = 0..7.
GCOMPLEXMADD GXCONJ1, GCONJ1, \
xvf, s, TP0, A0, X0, TP0, TMP0, TMP1, TMP2, TP0, A1, X1, TP0, TMP0, TMP1, TMP2, \
TP1, A2, X0, TP1, TMP0, TMP1, TMP2, TP1, A3, X1, TP1, TMP0, TMP1, TMP2, \
TP2, A4, X0, TP2, TMP0, TMP1, TMP2, TP2, A5, X1, TP2, TMP0, TMP1, TMP2, \
TP3, A6, X0, TP3, TMP0, TMP1, TMP2, TP3, A7, X1, TP3, TMP0, TMP1, TMP2, \
TP4, A8, X0, TP4, TMP0, TMP1, TMP2, TP4, A9, X1, TP4, TMP0, TMP1, TMP2, \
TP5, A10, X0, TP5, TMP0, TMP1, TMP2, TP5, A11, X1, TP5, TMP0, TMP1, TMP2, \
TP6, A12, X0, TP6, TMP0, TMP1, TMP2, TP6, A13, X1, TP6, TMP0, TMP1, TMP2, \
TP7, A14, X0, TP7, TMP0, TMP1, TMP2, TP7, A15, X1, TP7, TMP0, TMP1, TMP2
.endm
/*
 * CGEMV_T_LASX: complete kernel body for one x-stride variant.
 *   XW - tag spliced into every local label, so the macro can be expanded
 *        once per variant without label clashes.
 *   X8 - suffix of the CLOAD_ macro that fetches eight x elements
 *        (X8 for contiguous x, X8_GAP for strided x).
 * Columns are handled eight at a time through PA0..PA7, then one at a time
 * through PA0. Inside a column group, rows are handled eight at a time, then
 * one at a time. Every path ends by branching to .L_END.
 */
.macro CGEMV_T_LASX XW:req, X8:req
// J = N / 8 column groups; go straight to the single-column code if there are none.
PTR_SRLI J, N, 3
beqz J, .L_\XW\()_N_7
// K_LDA = 8 * LDA - M8: takes a column pointer that has walked M8 bytes down
// its column to the same row position eight columns further on.
PTR_SLLI K_LDA, LDA, 3
PTR_SUB K_LDA, K_LDA, M8
.L_\XW\()_N_L8:
ZERO_Y8
move X, X_ORG
// I = M / 8 row groups.
PTR_SRLI I, M, 3
beqz I, .L_\XW\()_M_7
.align 5
.L_\XW\()_M_L8:
CLOAD_\X8
CGEMV_T_8x8
PTR_ADDI I, I, -1
// X += 8 * INC_X (INC_X is already a byte stride).
PTR_ALSL X, INC_X, X, 3
bnez I, .L_\XW\()_M_L8
.L_\XW\()_M_7:
// Accumulated
// Reduce each TPn into the low element of Yn. Yn aliases A(n), which is free
// now that the 8x8 loop is over.
GCOMPLEXACC xvf, s, Y0, TP0, Y1, TP1, Y2, TP2, Y3, TP3, Y4, TP4, \
Y5, TP5, Y6, TP6, Y7, TP7
// Remaining M % 8 rows, one row per iteration.
andi I, M, 7
beqz I, .L_\XW\()_M_END
.align 5
.L_\XW\()_M_L1:
// One x element into the low part of X0 and one element of each column into
// the low parts of A8..A15 ($f11..$f18).
fld.d $f1, X, 0x00
fld.d $f11, PA0, 0x00
fld.d $f12, PA1, 0x00
fld.d $f13, PA2, 0x00
fld.d $f14, PA3, 0x00
fld.d $f15, PA4, 0x00
fld.d $f16, PA5, 0x00
fld.d $f17, PA6, 0x00
fld.d $f18, PA7, 0x00
// Advance all eight column pointers by one 8-byte element.
#if __loongarch_grlen == 64
GADDI , d, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08, \
PA4, PA4, 0x08, PA5, PA5, 0x08, PA6, PA6, 0x08, PA7, PA7, 0x08
#elif __loongarch_grlen == 32
GADDI , w, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08, \
PA4, PA4, 0x08, PA5, PA5, 0x08, PA6, PA6, 0x08, PA7, PA7, 0x08
#else
GADDI , d, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08, \
PA4, PA4, 0x08, PA5, PA5, 0x08, PA6, PA6, 0x08, PA7, PA7, 0x08
#endif
// A(n) (= Yn) += A(8+n) * X0; only the low 64 bits carry the result that is stored later.
GCOMPLEXMADD GXCONJ1, GCONJ1, \
xvf, s, A0, A8, X0, A0, TMP0, TMP1, TMP2, A1, A9, X0, A1, TMP0, TMP1, TMP2, \
A2, A10, X0, A2, TMP0, TMP1, TMP2, A3, A11, X0, A3, TMP0, TMP1, TMP2, \
A4, A12, X0, A4, TMP0, TMP1, TMP2, A5, A13, X0, A5, TMP0, TMP1, TMP2, \
A6, A14, X0, A6, TMP0, TMP1, TMP2, A7, A15, X0, A7, TMP0, TMP1, TMP2
PTR_ADDI I, I, -1
PTR_ADD X, X, INC_X
bnez I, .L_\XW\()_M_L1
.L_\XW\()_M_END:
// Load the eight y elements of this column group into A8..A15 ($f11..$f18).
// PY0 = Y + 2 * INC_Y, PY1 = Y + 4 * INC_Y, PY2 = Y + 6 * INC_Y.
fld.d $f11, Y, 0x00
fldx.d $f12, Y, INC_Y
PTR_ALSL PY0, INC_Y, Y, 1
fld.d $f13, PY0, 0x00
fldx.d $f14, PY0, INC_Y
PTR_ALSL PY1, INC_Y, Y, 2
fld.d $f15, PY1, 0x00
fldx.d $f16, PY1, INC_Y
PTR_ALSL PY2, INC_Y, PY1, 1
fld.d $f17, PY2, 0x00
fldx.d $f18, PY2, INC_Y
// y(n) = alpha * sum(n) + y(n).
GCOMPLEXMADD GXCONJ2, GCONJ2, \
xvf, s, A8, VALPHA, A0, A8, TMP0, TMP1, TMP2, A9, VALPHA, A1, A9, TMP0, TMP1, TMP2,\
A10, VALPHA, A2, A10, TMP0, TMP1, TMP2, A11, VALPHA, A3, A11, TMP0, TMP1, TMP2,\
A12, VALPHA, A4, A12, TMP0, TMP1, TMP2, A13, VALPHA, A5, A13, TMP0, TMP1, TMP2,\
A14, VALPHA, A6, A14, TMP0, TMP1, TMP2, A15, VALPHA, A7, A15, TMP0, TMP1, TMP2
PTR_ADDI J, J, -1
// Move every column pointer to the next group of eight columns.
#if __loongarch_grlen == 64
GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \
PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA
#elif __loongarch_grlen == 32
GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \
PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA
#else
GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \
PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA
#endif
// Write the eight updated y elements back to the addresses they came from.
fst.d $f11, Y, 0x00
fstx.d $f12, Y, INC_Y
fst.d $f13, PY0, 0x00
fstx.d $f14, PY0, INC_Y
fst.d $f15, PY1, 0x00
fstx.d $f16, PY1, INC_Y
fst.d $f17, PY2, 0x00
fstx.d $f18, PY2, INC_Y
// Y += 8 * INC_Y.
PTR_ALSL Y, INC_Y, Y, 3
bnez J, .L_\XW\()_N_L8
.L_\XW\()_N_7:
// Remaining N % 8 columns, one column per iteration, using PA0 only.
andi J, N, 7
beqz J, .L_END
// K_LDA = LDA - M8: from the end of one column to the start of the next.
PTR_SUB K_LDA, LDA, M8
.L_\XW\()_N_1:
ZERO_Y1
move X, X_ORG
move I, M
// M == 0: leave immediately without touching y.
beqz I, .L_END
.align 5
.L_\XW\()_N_1_M_L1:
// $f3 is the low part of A0, $f1 the low part of X0.
fld.d $f3, PA0, 0x00
fld.d $f1, X, 0x00
GCOMPLEXMADD GXCONJ1, GCONJ1, \
xvf, s, TP0, A0, X0, TP0, TMP0, TMP1, TMP2
PTR_ADDI I, I, -1
PTR_ADD X, X, INC_X
PTR_ADDI PA0, PA0, 0x08
bnez I, .L_\XW\()_N_1_M_L1
.L_\XW\()_N_1_M_END:
PTR_ADDI J, J, -1
// y = alpha * TP0 + y for this column; only the low 64 bits are stored.
fld.d $f3, Y, 0x00
GCOMPLEXMADD GXCONJ2, GCONJ2, \
xvf, s, A0, VALPHA, TP0, A0, TMP0, TMP1, TMP2
fst.d $f3, Y, 0x00
PTR_ADD PA0, PA0, K_LDA
PTR_ADD Y, Y, INC_Y
bnez J, .L_\XW\()_N_1
b .L_END
.endm
/* Kernel entry: scale the strides to bytes, build VALPHA and the eight column
 * pointers, then jump to the variant of CGEMV_T_LASX that matches inc_x. */
PROLOGUE
// inc_y is the first stack argument; read it before push_if_used runs.
PTR_LD INC_Y, $sp, 0
// push_if_used / pop_if_used are defined outside this view; presumably they
// save and restore the callee-saved registers this kernel uses -- confirm.
push_if_used 17 + 8, 30
PTR_ADDI K, $r0, 0x01
PTR_SUB I, INC_X, K
maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */
// Convert LDA, INC_X and INC_Y to byte strides and set M8 = M << 3.
GSLLI , d, LDA, LDA, 3, INC_X, INC_X, 3, INC_Y, INC_Y, 3, M8, M, 3
// Init VALPHA
// Interleave $f0 (alpha_r) and $f1 (alpha_i) into one 64-bit pair, then
// replicate that pair across VALPHA.
xvpackev.w $xr0, $xr1, $xr0
xvreplve0.d VALPHA, $xr0
move X_ORG, X
move PA0, A
// PA(n+1) = PA(n) + LDA: pointers to eight consecutive columns.
#if __loongarch_grlen == 64
GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \
PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA
#elif __loongarch_grlen == 32
GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \
PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA
#else
GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \
PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA
#endif
// Dispatch through a table of 16-bit offsets relative to .L_GAP_TABLE:
// entry I (0 or 1) is read with ld.h, added to the table address and jumped to.
la.local T0, .L_GAP_TABLE
PTR_ALSL I, I, T0, 1
ld.h K, I, 0
PTR_ADD T0, T0, K
jirl $r0, T0, 0
.L_GAP_TABLE:
.hword .L_GAP_0 - .L_GAP_TABLE
.hword .L_GAP_1 - .L_GAP_TABLE
.L_GAP_0: /* if (incx == 1) */
CGEMV_T_LASX GAP_0, X8
.L_GAP_1: /* if (incx != 1) */
CGEMV_T_LASX GAP_1, X8_GAP
// Each macro expansion ends with "b .L_END", so the variants never fall through into each other.
.L_END:
pop_if_used 17 + 8, 30
jirl $r0, $r1, 0x0
EPILOGUE

View File

@ -384,6 +384,246 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endif .endif
.endm .endm
//
// GCOMPLEXACC: Complex accumulate the values of vector registers
// After the macro, the lowest complex element of out holds the sum of all
// complex elements that were in in. The other elements of out are not
// meaningful for the caller.
// pre_op: xvf or vf, differentiate between LSX or LASX instruction
// suf_op: s or d, differentiate between single precision or double precision complex numbers
// out, in: must be different registers (out is overwritten before in is fully consumed).
// more: further (out, in) pairs, processed one after another.
// Note: When "pre_op = xvf && suf_op = s", in will be modified.
// Note: When "pre_op = vf && suf_op = d" the register holds a single complex
//       number, so the sum is in itself and out simply receives a copy of in.
//       (Previously this combination emitted no instruction and left out
//       unwritten.)
//
.macro GCOMPLEXACC pre_op:req, suf_op:req, out:req, in:req, more:vararg
.ifeqs "\pre_op", "xvf"
    // out = in with its two 128-bit halves swapped.
    xvpermi.q \out, \in, 0x01
.ifeqs "\suf_op", "s"
    // in = high half + low half (two complex floats per half remain).
    \pre_op\()add.\suf_op \in, \out, \in
    // out = odd complex element of each half, duplicated; adding in folds
    // the last two partial sums into the low complex element.
    xvpackod.d \out, \in, \in
    \pre_op\()add.\suf_op \out, \out, \in
.else
    // One complex double per half: swapped + original sums both.
    \pre_op\()add.\suf_op \out, \out, \in
.endif
.endif
.ifeqs "\pre_op", "vf"
.ifeqs "\suf_op", "s"
    // Two complex floats: bring the odd one down and add.
    vpackod.d \out, \in, \in
    \pre_op\()add.\suf_op \out, \out, \in
.else
    // A single complex double: nothing to add, just deliver it in out.
    vor.v \out, \in, \in
.endif
.endif
.ifnb \more
    GCOMPLEXACC \pre_op, \suf_op, \more
.endif
.endm
//
// GCOMPLEXMUL: Complex multiplication, out = in0 * in1
// xconj: default value 0.
// if !(xconj)
// out_r = in0_r * in1_r - in0_i * in1_i;
// out_i = in0_r * in1_i + in0_i * in1_r;
// else
// out_r = in0_r * in1_r + in0_i * in1_i;
// out_i = in0_r * in1_i - in0_i * in1_r;
// pre_op: xvf or vf, differentiate between LSX or LASX instruction
// suf_op: s or d, differentiate between single precision or double precision complex numbers
// tmp0, tmp1, tmp2: scratch registers, overwritten.
// more: further (out, in0, in1, tmp0, tmp1, tmp2) groups, processed in order.
// in0 is fully consumed before out is written, so out may be the same
// register as in0 (the zgemv kernels call it that way). in1 is still read
// by the final multiply-add, so it must not be out.
//
.macro GCOMPLEXMUL xconj=0, pre_op:req, suf_op:req, out:req, in0:req, in1:req, tmp0:req, tmp1:req, tmp2:req, more:vararg
// tmp1 = 0, tmp0 = (in0_r, in0_r) for every complex element.
.ifeqs "\pre_op", "xvf"
xvxor.v \tmp1, \tmp1, \tmp1
.ifeqs "\suf_op", "s"
xvpackev.w \tmp0, \in0, \in0
.else
xvpackev.d \tmp0, \in0, \in0
.endif
.else
vxor.v \tmp1, \tmp1, \tmp1
.ifeqs "\suf_op", "s"
vpackev.w \tmp0, \in0, \in0
.else
vpackev.d \tmp0, \in0, \in0
.endif
.endif
// tmp1 = -in0.
\pre_op\()sub.\suf_op \tmp1, \tmp1, \in0
// tmp1 = (-in0_i, in0_i) when xconj is 0, (in0_i, -in0_i) otherwise;
// tmp2 = in1 with real and imaginary parts swapped.
.ifeqs "\pre_op", "xvf"
.ifeqs "\suf_op", "s"
.ifeqs "\xconj", "0"
xvpackod.w \tmp1, \in0, \tmp1
.else
xvpackod.w \tmp1, \tmp1, \in0
.endif
xvshuf4i.w \tmp2, \in1, 0xb1
.else
.ifeqs "\xconj", "0"
xvpackod.d \tmp1, \in0, \tmp1
.else
xvpackod.d \tmp1, \tmp1, \in0
.endif
xvshuf4i.d \tmp2, \in1, 0x0b
.endif
.else
.ifeqs "\suf_op", "s"
.ifeqs "\xconj", "0"
vpackod.w \tmp1, \in0, \tmp1
.else
vpackod.w \tmp1, \tmp1, \in0
.endif
vshuf4i.w \tmp2, \in1, 0xb1
.else
.ifeqs "\xconj", "0"
vpackod.d \tmp1, \in0, \tmp1
.else
vpackod.d \tmp1, \tmp1, \in0
.endif
vshuf4i.d \tmp2, \in1, 0x0b
.endif
.endif
// out = in0_r * in1, then out += tmp1 * tmp2 (the in0_i cross terms).
\pre_op\()mul.\suf_op \out, \tmp0, \in1
\pre_op\()madd.\suf_op \out, \tmp1, \tmp2, \out
.ifnb \more
GCOMPLEXMUL \xconj, \pre_op, \suf_op, \more
.endif
.endm
//
// GCOMPLEXMADD: Complex multiply-accumulate, out = in0 * in1 + in2
// xconj: default value 0
// conj: default value 0
// if !(CONJ)
// if !(XCONJ)
// out_r = in0_r * in1_r - in0_i * in1_i + in2_r;
// out_i = in0_r * in1_i + in0_i * in1_r + in2_i;
// else
// out_r = in0_r * in1_r + in0_i * in1_i + in2_r;
// out_i = in0_r * in1_i - in0_i * in1_r + in2_i;
// else
// if !(XCONJ)
// out_r = in0_r * in1_r + in0_i * in1_i + in2_r;
// out_i = in2_i - (in0_r * in1_i - in0_i * in1_r);
// else
// out_r = in0_r * in1_r - in0_i * in1_i + in2_r;
// out_i = in2_i - (in0_r * in1_i + in0_i * in1_r);
// pre_op: xvf or vf, differentiate between LSX or LASX instruction
// suf_op: s or d, differentiate between single precision or double precision complex numbers
// tmp0, tmp1, tmp2: scratch registers, overwritten.
// more: further (out, in0, in1, in2, tmp0, tmp1, tmp2) groups, processed in order.
// out may be the same register as in2 (the gemv kernels accumulate that way).
// in0 and in1 are still read after out has been written, so neither of them
// may be the same register as out.
//
.macro GCOMPLEXMADD xconj=0, conj=0, pre_op:req, suf_op:req, out:req, in0:req, in1:req, in2:req, tmp0:req, tmp1:req, tmp2:req, more:vararg
// tmp1 = 0, tmp0 = (in0_r, in0_r) for every complex element.
.ifeqs "\pre_op", "xvf"
xvxor.v \tmp1, \tmp1, \tmp1
.ifeqs "\suf_op", "s"
xvpackev.w \tmp0, \in0, \in0
.else
xvpackev.d \tmp0, \in0, \in0
.endif
.else
vxor.v \tmp1, \tmp1, \tmp1
.ifeqs "\suf_op", "s"
vpackev.w \tmp0, \in0, \in0
.else
vpackev.d \tmp0, \in0, \in0
.endif
.endif
// tmp2 = in0_r * in1 + in2.
\pre_op\()madd.\suf_op \tmp2, \tmp0, \in1, \in2
.ifeqs "\conj", "1"
// conj: the imaginary part needs in2_i - in0_r * in1_i instead. Compute
// in2 - in0_r * in1 into tmp0, swap its parts so the imaginary result sits in
// the even slot, then merge: real part from tmp2, imaginary part from tmp0.
\pre_op\()nmsub.\suf_op \tmp0, \tmp0, \in1, \in2
.ifeqs "\pre_op", "xvf"
.ifeqs "\suf_op", "s"
xvshuf4i.w \tmp0, \tmp0, 0xb1
xvpackev.w \out, \tmp0, \tmp2
.else
xvshuf4i.d \tmp0, \tmp0, 0x0b
xvpackev.d \out, \tmp0, \tmp2
.endif
.else
.ifeqs "\suf_op", "s"
vshuf4i.w \tmp0, \tmp0, 0xb1
vpackev.w \out, \tmp0, \tmp2
.else
vshuf4i.d \tmp0, \tmp0, 0x0b
vpackev.d \out, \tmp0, \tmp2
.endif
.endif /* pre_op = xvf */
.else
// No conj: out = tmp2 (tmp1 is still zero at this point).
\pre_op\()add.\suf_op \out, \tmp2, \tmp1
.endif /* conj = 1 */
// tmp1 = -in0.
\pre_op\()sub.\suf_op \tmp1, \tmp1, \in0
// Build the in0_i factors in tmp1 with the signs required by conj/xconj:
//   conj 0, xconj 0: (-in0_i,  in0_i)    conj 0, xconj 1: ( in0_i, -in0_i)
//   conj 1, xconj 0: ( in0_i,  in0_i)    conj 1, xconj 1: (-in0_i, -in0_i)
// and tmp2 = in1 with real and imaginary parts swapped.
.ifeqs "\pre_op", "xvf"
.ifeqs "\suf_op", "s"
.ifeqs "\conj", "0"
.ifeqs "\xconj", "0"
xvpackod.w \tmp1, \in0, \tmp1
.else
xvpackod.w \tmp1, \tmp1, \in0
.endif
.else
.ifeqs "\xconj", "0"
xvpackod.w \tmp1, \in0, \in0
.else
xvpackod.w \tmp1, \tmp1, \tmp1
.endif
.endif
xvshuf4i.w \tmp2, \in1, 0xb1
.else
.ifeqs "\conj", "0"
.ifeqs "\xconj", "0"
xvpackod.d \tmp1, \in0, \tmp1
.else
xvpackod.d \tmp1, \tmp1, \in0
.endif
.else
.ifeqs "\xconj", "0"
xvpackod.d \tmp1, \in0, \in0
.else
xvpackod.d \tmp1, \tmp1, \tmp1
.endif
.endif
xvshuf4i.d \tmp2, \in1, 0x0b
.endif
.else
.ifeqs "\suf_op", "s"
.ifeqs "\conj", "0"
.ifeqs "\xconj", "0"
vpackod.w \tmp1, \in0, \tmp1
.else
vpackod.w \tmp1, \tmp1, \in0
.endif
.else
.ifeqs "\xconj", "0"
vpackod.w \tmp1, \in0, \in0
.else
vpackod.w \tmp1, \tmp1, \tmp1
.endif
.endif
vshuf4i.w \tmp2, \in1, 0xb1
.else
.ifeqs "\conj", "0"
.ifeqs "\xconj", "0"
vpackod.d \tmp1, \in0, \tmp1
.else
vpackod.d \tmp1, \tmp1, \in0
.endif
.else
.ifeqs "\xconj", "0"
vpackod.d \tmp1, \in0, \in0
.else
vpackod.d \tmp1, \tmp1, \tmp1
.endif
.endif
vshuf4i.d \tmp2, \in1, 0x0b
.endif
.endif
// out += tmp1 * tmp2 (the in0_i cross terms).
\pre_op\()madd.\suf_op \out, \tmp1, \tmp2, \out
.ifnb \more
GCOMPLEXMADD \xconj, \conj, \pre_op, \suf_op, \more
.endif
.endm
// //
// Media Related Macros // Media Related Macros
// //

View File

@ -0,0 +1,343 @@
/*******************************************************************************
Copyright (c) 2024, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#define ASSEMBLER
#include "common.h"
#include "loongarch64_asm.S"
/*********************************************************************
* 2024/02/20 guxiwei
* UTEST : OK
* CTEST : OK
* TEST : OK
*
*
*********************************************************************/
/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
* FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
*/
/* Integer registers, named after the parameters of the CNAME prototype above. */
#define M $r4
#define N $r5
/* alpha arrives in two FP registers; the prologue packs both into VALPHA. */
#define ALPHA_R $f0
#define ALPHA_I $f1
#define A $r7
#define LDA $r8
#define X $r9
#define INC_X $r10
#define Y $r11
/* INC_Y is fetched from the stack by the first instruction of the prologue. */
#define INC_Y $r6
/* Loop counters: J counts columns (or column groups), I counts rows (or row groups). */
#define J $r12
#define I $r13
/* Scratch for the prologue and its jump-table dispatch. */
#define K $r14
/* Copy of the incoming Y pointer; Y is reset from it for every column (group). */
#define Y_ORG $r15
/* NOTE(review): OFFSET is not referenced by any code visible in this file. */
#define OFFSET $r16
/* Byte offset added to the column pointers after a column (group) is finished. */
#define K_LDA $r17
/* M shifted left by 4 in the prologue (M times the 16-byte element size). */
#define M16 $r18
/* Scratch address register. */
#define T0 $r19
/* Column pointers. The visible code of this kernel only uses PA0..PA3. */
#define PA0 $r20
#define PA1 $r23
#define PA2 $r24
#define PA3 $r25
#define PA4 $r26
#define PA5 $r27
#define PA6 $r28
#define PA7 $r29
/* Vector registers. $vrN names used below refer to the low 128 bits of $xrN. */
#define VALPHA $xr1
/* x elements of the current columns, each broadcast to both 128-bit halves
 * and pre-multiplied by alpha (see the ZLOAD_X_* macros). */
#define X0 $xr2
#define X1 $xr3
#define X2 $xr4
#define X3 $xr5
#define X4 $xr6
#define X5 $xr7
#define X6 $xr8
#define X7 $xr9
/* Four y elements per pass: Y0 holds two, Y1 the next two. */
#define Y0 $xr10
#define Y1 $xr11
/* Matrix data (also used as staging registers by ZLOAD_Y_4_GAP). */
#define A0 $xr12
#define A1 $xr13
#define A2 $xr14
#define A3 $xr15
#define A4 $xr16
#define A5 $xr17
#define A6 $xr18
#define A7 $xr19
#define A8 $xr20
#define A9 $xr21
#define A10 $xr22
#define A11 $xr23
#define A12 $xr24
#define A13 $xr25
#define A14 $xr26
#define A15 $xr27
/* Scratch registers handed to GCOMPLEXMUL / GCOMPLEXMADD. */
#define TMP0 $xr28
#define TMP1 $xr29
#define TMP2 $xr30
/* Map the build-time CONJ / XCONJ switches onto the xconj / conj arguments
 * of GCOMPLEXMUL and GCOMPLEXMADD: GXCONJ is 1 exactly when XCONJ is defined,
 * GCONJ is 1 exactly when CONJ is defined. */
#if !defined(CONJ)
#if !defined(XCONJ)
#define GXCONJ 0
#define GCONJ 0
#else
#define GXCONJ 1
#define GCONJ 0
#endif
#else
#if !defined(XCONJ)
#define GXCONJ 0
#define GCONJ 1
#else
#define GXCONJ 1
#define GCONJ 1
#endif
#endif
/* Contiguous x: load four consecutive 16-byte x elements, broadcast each one
 * to both 128-bit halves of X0..X3 and multiply it by alpha.
 * Each element is loaded with a 128-bit vld ($vr2..$vr5 are the low halves of
 * X0..X3) so that exactly 16 bytes per element are read; a 256-bit load would
 * read 16 bytes beyond x[3], i.e. past the end of x for the last column
 * group. The upper halves do not matter because GPERMI overwrites them. */
.macro ZLOAD_X_4
    vld         $vr2, X, 0x00
    vld         $vr3, X, 0x10
    vld         $vr4, X, 0x20
    vld         $vr5, X, 0x30
    // Copy the low 128 bits of each register into its high 128 bits.
    GPERMI xv, q, X0, X0, 0, X1, X1, 0, X2, X2, 0, X3, X3, 0
    // Xn = Xn * alpha.
    GCOMPLEXMUL GXCONJ, \
                xvf, d, X0, X0, VALPHA, TMP0, TMP1, TMP2, \
                X1, X1, VALPHA, TMP0, TMP1, TMP2, \
                X2, X2, VALPHA, TMP0, TMP1, TMP2, \
                X3, X3, VALPHA, TMP0, TMP1, TMP2
.endm
/* Strided x: load four 16-byte x elements spaced INC_X bytes apart, broadcast
 * each one to both 128-bit halves of X0..X3 and multiply it by alpha.
 * Each element is loaded with a 128-bit vld ($vr2..$vr5 are the low halves of
 * X0..X3) so that no bytes beyond the element are read; a 256-bit load would
 * read 16 bytes past the last element of x. T0 is clobbered. */
.macro ZLOAD_X_4_GAP
    vld         $vr2, X, 0
    xvpermi.q   X0, X0, 0
    PTR_ADD     T0, X, INC_X
    vld         $vr3, T0, 0
    xvpermi.q   X1, X1, 0
    PTR_ADD     T0, T0, INC_X
    vld         $vr4, T0, 0
    xvpermi.q   X2, X2, 0
    PTR_ADD     T0, T0, INC_X
    vld         $vr5, T0, 0
    xvpermi.q   X3, X3, 0
    // Xn = Xn * alpha.
    GCOMPLEXMUL GXCONJ, \
                xvf, d, X0, X0, VALPHA, TMP0, TMP1, TMP2, \
                X1, X1, VALPHA, TMP0, TMP1, TMP2, \
                X2, X2, VALPHA, TMP0, TMP1, TMP2, \
                X3, X3, VALPHA, TMP0, TMP1, TMP2
.endm
/* Contiguous y: fetch 64 bytes (four 16-byte elements) starting at Y into Y0 and Y1. */
.macro ZLOAD_Y_4
GLD xv, , Y0, Y, 0, Y1, Y, 0x20
.endm
/* Strided y: gather four 16-byte elements spaced INC_Y bytes apart into Y0/Y1.
 * Elements 0 and 2 go straight into the low halves of Y0 and Y1 ($vr10,
 * $vr11); elements 1 and 3 are staged in the low halves of A1 and A2 ($vr13,
 * $vr14) and then moved into the high halves of Y0 and Y1 by GPERMI.
 * A1, A2 and T0 are clobbered. */
.macro ZLOAD_Y_4_GAP
vld $vr10, Y, 0
vldx $vr13, Y, INC_Y
PTR_ALSL T0, INC_Y, Y, 1
vld $vr11, T0, 0
vldx $vr14, T0, INC_Y
GPERMI xv, q, Y0, A1, 0x02, Y1, A2, 0x02
.endm
/* One 4-row by 4-column step: load two vectors (four elements) from each of
 * PA0..PA3 and accumulate X(c) * column(c) into Y0 (rows 0-1) and Y1 (rows
 * 2-3). GLD_INC is defined outside this view; since both loads of a column use
 * offset 0 from the same pointer, it presumably advances the pointer by 0x20
 * after each load -- confirm in loongarch64_asm.S. */
.macro ZGEMV_N_4x4
GLD_INC xv, , 0x20, \
A0, PA0, 0, A1, PA0, 0, \
A2, PA1, 0, A3, PA1, 0, \
A4, PA2, 0, A5, PA2, 0, \
A6, PA3, 0, A7, PA3, 0
GCOMPLEXMADD GXCONJ, GCONJ, \
xvf, d, Y0, X0, A0, Y0, TMP0, TMP1, TMP2, Y1, X0, A1, Y1, TMP0, TMP1, TMP2, \
Y0, X1, A2, Y0, TMP0, TMP1, TMP2, Y1, X1, A3, Y1, TMP0, TMP1, TMP2, \
Y0, X2, A4, Y0, TMP0, TMP1, TMP2, Y1, X2, A5, Y1, TMP0, TMP1, TMP2, \
Y0, X3, A6, Y0, TMP0, TMP1, TMP2, Y1, X3, A7, Y1, TMP0, TMP1, TMP2
.endm
/* Contiguous y: write Y0 and Y1 (four 16-byte elements) back to Y and Y + 0x20. */
.macro ZSTORE_Y_4
GST xv, , Y0, Y, 0, Y1, Y, 0x20
.endm
/* Strided y: scatter the four elements held in Y0/Y1 to addresses spaced
 * INC_Y bytes apart. Each element is written as two 64-bit lanes (offsets 0
 * and 8) with xvstelm.d: lanes 0-1 and 2-3 of Y0, then lanes 0-1 and 2-3 of
 * Y1. T0 is clobbered; Y is not advanced. */
.macro ZSTORE_Y_4_GAP
xvstelm.d Y0, Y, 0, 0
xvstelm.d Y0, Y, 0x08, 1
PTR_ADD T0, Y, INC_Y
xvstelm.d Y0, T0, 0, 2
xvstelm.d Y0, T0, 0x08, 3
PTR_ADD T0, T0, INC_Y
xvstelm.d Y1, T0, 0, 0
xvstelm.d Y1, T0, 0x08, 1
PTR_ADD T0, T0, INC_Y
xvstelm.d Y1, T0, 0, 2
xvstelm.d Y1, T0, 0x08, 3
.endm
/* Load one 16-byte y element into the low half of Y0 ($vr10). */
.macro ZLOAD_Y_1
vld $vr10, Y, 0
.endm
/* One row by four columns: load one 16-byte element from each of PA0..PA3
 * into the low halves of A0, A2, A4 and A6 ($vr12, $vr14, $vr16, $vr18) and
 * accumulate X(c) * element(c) into Y0. Only the low half of Y0 is stored
 * afterwards (ZSTORE_Y_1). GLD_INC presumably advances each pointer by 0x10
 * after its load -- confirm in loongarch64_asm.S. */
.macro ZGEMV_N_1x4
GLD_INC v, , 0x10, $vr12, PA0, 0, $vr14, PA1, 0, $vr16, PA2, 0, $vr18, PA3, 0
GCOMPLEXMADD GXCONJ, GCONJ, \
xvf, d, Y0, X0, A0, Y0, TMP0, TMP1, TMP2, \
Y0, X1, A2, Y0, TMP0, TMP1, TMP2, \
Y0, X2, A4, Y0, TMP0, TMP1, TMP2, \
Y0, X3, A6, Y0, TMP0, TMP1, TMP2
.endm
/* Store the low half of Y0 ($vr10), one 16-byte y element, back to Y. */
.macro ZSTORE_Y_1
vst $vr10, Y, 0
.endm
/* Load one 16-byte x element, broadcast it to both 128-bit halves of X0 and
 * multiply it by alpha.
 * The element is loaded with a 128-bit vld ($vr2 is the low half of X0) so
 * that exactly 16 bytes are read; a 256-bit load would read 16 bytes past the
 * end of x when this is the last element. The upper half does not matter
 * because GPERMI overwrites it. */
.macro ZLOAD_X_1
    vld         $vr2, X, 0x00
    GPERMI xv, q, X0, X0, 0
    GCOMPLEXMUL GXCONJ, \
                xvf, d, X0, X0, VALPHA, TMP0, TMP1, TMP2
.endm
/* One row by one column: load one 16-byte element from PA0 into the low half
 * of A0 ($vr12) and accumulate X0 * element into Y0. Only the low half of Y0
 * is stored afterwards (ZSTORE_Y_1). */
.macro ZGEMV_N_1x1
GLD_INC v, , 0x10, $vr12, PA0, 0
GCOMPLEXMADD GXCONJ, GCONJ, \
xvf, d, Y0, X0, A0, Y0, TMP0, TMP1, TMP2
.endm
/*
 * ZGEMV_N_LASX: complete kernel body for one (x-stride, y-stride) variant.
 *   XW  - tag spliced into every local label, so the macro can be expanded
 *         once per variant without label clashes.
 *   X_4 - suffix of the ZLOAD_ macro that fetches four x elements.
 *   X_1 - suffix of the ZLOAD_ macro that fetches one x element.
 *   Y_4 - suffix of the ZLOAD_ / ZSTORE_ macros that move four y elements.
 *   Y_1 - suffix of the ZLOAD_ / ZSTORE_ macros that move one y element.
 * Columns are handled four at a time through PA0..PA3, then one at a time
 * through PA0. For every column (group) the whole y vector is read, updated
 * and written back, four rows at a time and then one row at a time.
 * Every path ends by branching to .L_END.
 *
 * Compared with the earlier version, the K row counter has been dropped (it
 * was cleared and incremented but never read) and the single-column K_LDA is
 * computed once before the loop instead of once per column.
 */
.macro ZGEMV_N_LASX XW:req, X_4:req, X_1:req, Y_4:req, Y_1:req
    // J = N / 4 column groups; go straight to the single-column code if there are none.
    PTR_SRLI    J, N, 2
    beqz        J, .L_\XW\()_N_3
    // K_LDA = 4 * LDA - M16: takes a column pointer that has walked M16 bytes
    // down its column to the same row position four columns further on.
    PTR_SLLI    K_LDA, LDA, 2
    PTR_SUB     K_LDA, K_LDA, M16
.L_\XW\()_N_L4:
    ZLOAD_\X_4
    move        Y, Y_ORG
    // I = M / 4 row groups.
    PTR_SRLI    I, M, 2
    beqz        I, .L_\XW\()_M_3
.align 5
.L_\XW\()_M_L4:
    ZLOAD_\Y_4
    ZGEMV_N_4x4
    ZSTORE_\Y_4
    PTR_ADDI    I, I, -1
    // Y += 4 * INC_Y (INC_Y is already a byte stride).
    PTR_ALSL    Y, INC_Y, Y, 2
    bnez        I, .L_\XW\()_M_L4
.L_\XW\()_M_3:
    // Remaining M % 4 rows, one row per iteration.
    andi        I, M, 3
    beqz        I, .L_\XW\()_M_END
.align 5
.L_\XW\()_M_L1:
    ZLOAD_\Y_1
    ZGEMV_N_1x4
    ZSTORE_\Y_1
    PTR_ADDI    I, I, -1
    PTR_ADD     Y, Y, INC_Y
    bnez        I, .L_\XW\()_M_L1
.L_\XW\()_M_END:
    PTR_ADDI    J, J, -1
    // Move the four column pointers to the next group of four columns.
#if __loongarch_grlen == 64
    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
#elif __loongarch_grlen == 32
    GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
#else
    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
#endif
    // X += 4 * INC_X.
    PTR_ALSL    X, INC_X, X, 2
    bnez        J, .L_\XW\()_N_L4
.L_\XW\()_N_3:
    // Remaining N % 4 columns, one column per iteration, using PA0 only.
    andi        J, N, 3
    beqz        J, .L_END
    // K_LDA = LDA - M16: from the end of one column to the start of the next.
    PTR_SUB     K_LDA, LDA, M16
.L_\XW\()_N_L1:
    ZLOAD_\X_1
    move        Y, Y_ORG
    move        I, M
    // M == 0: leave immediately without touching y.
    beqz        I, .L_END
.align 5
.L_\XW\()_N_1_M_L1:
    ZLOAD_\Y_1
    ZGEMV_N_1x1
    ZSTORE_\Y_1
    PTR_ADDI    I, I, -1
    PTR_ADD     Y, Y, INC_Y
    bnez        I, .L_\XW\()_N_1_M_L1
.L_\XW\()_N_1_M_END:
    PTR_ADDI    J, J, -1
    PTR_ADD     PA0, PA0, K_LDA
    PTR_ADD     X, X, INC_X
    bnez        J, .L_\XW\()_N_L1
    b           .L_END
.endm
/* Kernel entry: scale the strides to bytes, build VALPHA and the four column
 * pointers, then jump to the variant of ZGEMV_N_LASX that matches inc_x and
 * inc_y. */
PROLOGUE
// inc_y is the first stack argument; read it before push_if_used runs.
PTR_LD INC_Y, $sp, 0
// push_if_used / pop_if_used are defined outside this view; presumably they
// save and restore the callee-saved registers this kernel uses -- confirm.
push_if_used 17 + 7, 31
PTR_ADDI K, $r0, 0x01
PTR_SUB I, INC_X, K
PTR_SUB J, INC_Y, K
maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */
maskeqz J, K, J /* if(inc_y == 1) j = 0; else j = 1; */
// Table index = 2 * I + J.
PTR_ALSL I, I, J, 1
// Convert LDA, INC_X and INC_Y to byte strides and set M16 = M << 4.
GSLLI , d, LDA, LDA, 4, INC_X, INC_X, 4, INC_Y, INC_Y, 4, M16, M, 4
// Init VALPHA
// Put $f0 (alpha_r) and $f1 (alpha_i) side by side in one 128-bit pair, then
// replicate that pair across VALPHA.
xvpackev.d $xr0, $xr1, $xr0
xvreplve0.q VALPHA, $xr0
move Y_ORG, Y
move PA0, A
// PA(n+1) = PA(n) + LDA: pointers to four consecutive columns.
#if __loongarch_grlen == 64
GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA
#elif __loongarch_grlen == 32
GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA
#else
GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA
#endif
// Dispatch through a table of 16-bit offsets relative to .L_GAP_TABLE.
la.local T0, .L_GAP_TABLE
PTR_ALSL I, I, T0, 1
ld.h K, I, 0 // Obtain the offset address
PTR_ADD T0, T0, K
jirl $r0, T0, 0
.L_GAP_TABLE:
.hword .L_GAP_0_0 - .L_GAP_TABLE
.hword .L_GAP_0_1 - .L_GAP_TABLE
.hword .L_GAP_1_0 - .L_GAP_TABLE
.hword .L_GAP_1_1 - .L_GAP_TABLE
.L_GAP_0_0: /* if (inc_x == 1) && (incy == 1) */
ZGEMV_N_LASX GAP_0_0, X_4, X_1, Y_4, Y_1
.L_GAP_0_1: /* if (inc_x == 1) && (incy != 1) */
ZGEMV_N_LASX GAP_0_1, X_4, X_1, Y_4_GAP, Y_1
.L_GAP_1_0: /* if (inc_x != 1) && (incy == 1) */
ZGEMV_N_LASX GAP_1_0, X_4_GAP, X_1, Y_4, Y_1
.L_GAP_1_1: /* if (inc_x != 1) && (incy != 1) */
ZGEMV_N_LASX GAP_1_1, X_4_GAP, X_1, Y_4_GAP, Y_1
// Each macro expansion ends with "b .L_END", so the variants never fall through into each other.
.L_END:
pop_if_used 17 + 7, 31
jirl $r0, $r1, 0x0
EPILOGUE

View File

@ -0,0 +1,299 @@
/*******************************************************************************
Copyright (c) 2024, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#define ASSEMBLER
#include "common.h"
#include "loongarch64_asm.S"
/*********************************************************************
* 2024/02/20 guxiwei
* UTEST : OK
* CTEST : OK
* TEST : OK
*
*
*********************************************************************/
/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
* FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
*/
/* Integer registers, named after the parameters of the CNAME prototype above. */
#define M $r4
#define N $r5
/* alpha arrives in two FP registers; the prologue packs both into VALPHA. */
#define ALPHA_R $f0
#define ALPHA_I $f1
#define A $r7
#define LDA $r8
#define X $r9
#define INC_X $r10
#define Y $r11
/* INC_Y is fetched from the stack by the first instruction of the prologue. */
#define INC_Y $r6
/* Loop counters: J counts columns (or column groups), I counts rows (or row groups). */
#define J $r12
#define I $r13
/* K and PY0 share $r14: K is only used by the prologue and its jump-table
 * dispatch, PY0 only while y is read and written in ZGEMV_T_LASX. */
#define K $r14
#define PY0 $r14
/* Copy of the incoming X pointer; X is reset from it for every column (group). */
#define X_ORG $r15
#define PY1 $r16
/* Byte offset added to the column pointers after a column (group) is finished. */
#define K_LDA $r17
#define PY2 $r18
/* Scratch address register. */
#define T0 $r19
/* Column pointers. The macros of this kernel only use PA0..PA3. */
#define PA0 $r20
#define PA1 $r23
#define PA2 $r24
#define PA3 $r25
#define PA4 $r26
#define PA5 $r27
#define PA6 $r28
#define PA7 $r29
/* M shifted left by 4 in the prologue (M times the 16-byte element size). */
#define M16 $r30
/* Vector registers. $vrN names used below refer to the low 128 bits of $xrN. */
#define VALPHA $xr0
/* Four x elements per pass: X0 holds two, X1 the next two. */
#define X0 $xr1
#define X1 $xr2
/* Matrix data: two vectors per column in the 4x4 step (A0/A1 for PA0, ... A6/A7 for PA3). */
#define A0 $xr3
#define A1 $xr4
#define A2 $xr5
#define A3 $xr6
#define A4 $xr7
#define A5 $xr8
#define A6 $xr9
#define A7 $xr10
#define A8 $xr11
#define A9 $xr12
#define A10 $xr13
#define A11 $xr14
#define A12 $xr15
#define A13 $xr16
#define A14 $xr17
#define A15 $xr18
/* Per-column accumulators, cleared by ZERO_Y4 / ZERO_Y1. */
#define TP0 $xr19
#define TP1 $xr20
#define TP2 $xr21
#define TP3 $xr22
#define TP4 $xr23
#define TP5 $xr24
#define TP6 $xr25
#define TP7 $xr26
/* Scratch registers handed to GCOMPLEXMADD. */
#define TMP0 $xr27
#define TMP1 $xr28
#define TMP2 $xr29
/* Y0..Y7 are the same registers as A0..A7 ($xr3..$xr10): once the 4x4 loop is
 * done the A registers are reused for the reduced per-column sums. */
#define Y0 $xr3
#define Y1 $xr4
#define Y2 $xr5
#define Y3 $xr6
#define Y4 $xr7
#define Y5 $xr8
#define Y6 $xr9
#define Y7 $xr10
/* xconj/conj arguments passed to GCOMPLEXMADD while accumulating a * x:
 * (0, 0) when CONJ and XCONJ are both undefined or both defined, (1, 0) otherwise. */
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
#define GXCONJ1 0
#define GCONJ1 0
#else
#define GXCONJ1 1
#define GCONJ1 0
#endif
/* xconj/conj arguments passed to GCOMPLEXMADD for the final y += alpha * sum step:
 * (0, 0) without XCONJ, (0, 1) with XCONJ. */
#if !defined(XCONJ)
#define GXCONJ2 0
#define GCONJ2 0
#else
#define GXCONJ2 0
#define GCONJ2 1
#endif
/* Clear the four column accumulators TP0..TP3 by XOR-ing each with itself.
 * GXOR is defined outside this view; presumably it emits one xvxor.v per
 * (dst, src, src) triple -- confirm in loongarch64_asm.S. */
.macro ZERO_Y4
GXOR xv, v, TP0, TP0, TP0, TP1, TP1, TP1, TP2, TP2, TP2, TP3, TP3, TP3
.endm
/* Clear the single accumulator TP0 used by the one-column-at-a-time loop. */
.macro ZERO_Y1
GXOR xv, v, TP0, TP0, TP0
.endm
/* Contiguous x: fetch 64 bytes starting at X into X0 (offset 0x00) and X1
 * (offset 0x20), i.e. four 16-byte elements. X itself is not advanced here. */
.macro ZLOAD_X4
GLD xv, , X0, X, 0x00, X1, X, 0x20
.endm
/* Strided x: gather four 16-byte elements spaced INC_X bytes apart into X0
 * (elements 0 and 1) and X1 (elements 2 and 3).
 * Every element is loaded with a 128-bit vld ($vr1 = low half of X0, $vr2 =
 * low half of X1, $vr3 = low half of A0) so that no bytes beyond the element
 * are read; a 256-bit load would read 16 bytes past the last element of x.
 * xvpermi.q with 0x02 keeps the low half of the destination and places the
 * low half of A0 in its high half, so the upper halves left by vld do not
 * matter. A0 and T0 are clobbered; X is not advanced. */
.macro ZLOAD_X4_GAP
    vld         $vr1, X, 0
    PTR_ADD     T0, X, INC_X
    vld         $vr3, T0, 0
    xvpermi.q   X0, A0, 0x02
    PTR_ADD     T0, T0, INC_X
    vld         $vr2, T0, 0
    PTR_ADD     T0, T0, INC_X
    vld         $vr3, T0, 0
    xvpermi.q   X1, A0, 0x02
.endm
/* One 4-row by 4-column step: load two vectors (four elements) from each of
 * PA0..PA3 and accumulate (column data) * (X0, X1) into TP0..TP3, one
 * accumulator per column. GLD_INC is defined outside this view; since both
 * loads of a column use offset 0 from the same pointer, it presumably advances
 * the pointer by 0x20 after each load -- confirm in loongarch64_asm.S. */
.macro ZGEMV_T_4x4
GLD_INC xv, , 0x20, \
A0, PA0, 0, A1, PA0, 0, \
A2, PA1, 0, A3, PA1, 0, \
A4, PA2, 0, A5, PA2, 0, \
A6, PA3, 0, A7, PA3, 0
// TPn = A(2n) * X0 + TPn, then TPn = A(2n+1) * X1 + TPn, for n = 0..3.
GCOMPLEXMADD GXCONJ1, GCONJ1, \
xvf, d, TP0, A0, X0, TP0, TMP0, TMP1, TMP2, TP0, A1, X1, TP0, TMP0, TMP1, TMP2, \
TP1, A2, X0, TP1, TMP0, TMP1, TMP2, TP1, A3, X1, TP1, TMP0, TMP1, TMP2, \
TP2, A4, X0, TP2, TMP0, TMP1, TMP2, TP2, A5, X1, TP2, TMP0, TMP1, TMP2, \
TP3, A6, X0, TP3, TMP0, TMP1, TMP2, TP3, A7, X1, TP3, TMP0, TMP1, TMP2
.endm
/*
 * ZGEMV_T_LASX: complete kernel body for one x-stride variant.
 *   XW - tag spliced into every local label, so the macro can be expanded
 *        once per variant without label clashes.
 *   X4 - suffix of the ZLOAD_ macro that fetches four x elements
 *        (X4 for contiguous x, X4_GAP for strided x).
 * Columns are handled four at a time through PA0..PA3, then one at a time
 * through PA0. Inside a column group, rows are handled four at a time, then
 * one at a time. Every path ends by branching to .L_END.
 *
 * All single-element accesses (row tail, single-column loop, y) use 128-bit
 * vld/vldx on the $vrN low halves of the corresponding $xrN registers. Only
 * the low 128 bits of those registers are ever stored, and the 256-bit loads
 * used previously read 16 bytes beyond each element, i.e. past the end of x,
 * A and y for the last element.
 */
.macro ZGEMV_T_LASX XW:req, X4:req
    // J = N / 4 column groups; go straight to the single-column code if there are none.
    PTR_SRLI    J, N, 2
    beqz        J, .L_\XW\()_N_3
    // K_LDA = 4 * LDA - M16: takes a column pointer that has walked M16 bytes
    // down its column to the same row position four columns further on.
    PTR_SLLI    K_LDA, LDA, 2
    PTR_SUB     K_LDA, K_LDA, M16
.L_\XW\()_N_L4:
    ZERO_Y4
    move        X, X_ORG
    // I = M / 4 row groups.
    PTR_SRLI    I, M, 2
    beqz        I, .L_\XW\()_M_3
.align 5
.L_\XW\()_M_L4:
    ZLOAD_\X4
    ZGEMV_T_4x4
    PTR_ADDI    I, I, -1
    // X += 4 * INC_X (INC_X is already a byte stride).
    PTR_ALSL    X, INC_X, X, 2
    bnez        I, .L_\XW\()_M_L4
.L_\XW\()_M_3:
    // Accumulated
    // Reduce each TPn into the low element of Yn. Yn aliases A(n), which is
    // free now that the 4x4 loop is over.
    GCOMPLEXACC xvf, d, Y0, TP0, Y1, TP1, Y2, TP2, Y3, TP3
    // Remaining M % 4 rows, one row per iteration.
    andi        I, M, 3
    beqz        I, .L_\XW\()_M_END
.align 5
.L_\XW\()_M_L1:
    // One x element into the low half of X0 ($vr1) and one element of each
    // column into the low halves of A8..A11 ($vr11..$vr14).
    vld         $vr1,  X,   0x00
    vld         $vr11, PA0, 0x00
    vld         $vr12, PA1, 0x00
    vld         $vr13, PA2, 0x00
    vld         $vr14, PA3, 0x00
    // Advance the four column pointers by one 16-byte element.
#if __loongarch_grlen == 64
    GADDI , d, PA0, PA0, 0x10, PA1, PA1, 0x10, PA2, PA2, 0x10, PA3, PA3, 0x10
#elif __loongarch_grlen == 32
    GADDI , w, PA0, PA0, 0x10, PA1, PA1, 0x10, PA2, PA2, 0x10, PA3, PA3, 0x10
#else
    GADDI , d, PA0, PA0, 0x10, PA1, PA1, 0x10, PA2, PA2, 0x10, PA3, PA3, 0x10
#endif
    // A(n) (= Yn) += A(8+n) * X0; only the low 128 bits carry the result that is stored later.
    GCOMPLEXMADD GXCONJ1, GCONJ1, \
                 xvf, d, A0, A8, X0, A0, TMP0, TMP1, TMP2, A1, A9, X0, A1, TMP0, TMP1, TMP2, \
                 A2, A10, X0, A2, TMP0, TMP1, TMP2, A3, A11, X0, A3, TMP0, TMP1, TMP2
    PTR_ADDI    I, I, -1
    PTR_ADD     X, X, INC_X
    bnez        I, .L_\XW\()_M_L1
.L_\XW\()_M_END:
    // Load the four y elements of this column group into the low halves of
    // A8..A11 ($vr11..$vr14). PY0 = Y + 2 * INC_Y.
    vld         $vr11, Y,   0x00
    vldx        $vr12, Y,   INC_Y
    PTR_ALSL    PY0, INC_Y, Y, 1
    vld         $vr13, PY0, 0x00
    vldx        $vr14, PY0, INC_Y
    // y(n) = alpha * sum(n) + y(n).
    GCOMPLEXMADD GXCONJ2, GCONJ2, \
                 xvf, d, A8, VALPHA, A0, A8, TMP0, TMP1, TMP2, A9, VALPHA, A1, A9, TMP0, TMP1, TMP2,\
                 A10, VALPHA, A2, A10, TMP0, TMP1, TMP2, A11, VALPHA, A3, A11, TMP0, TMP1, TMP2
    PTR_ADDI    J, J, -1
    // Move the four column pointers to the next group of four columns.
#if __loongarch_grlen == 64
    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
#elif __loongarch_grlen == 32
    GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
#else
    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
#endif
    // Write the four updated y elements back to the addresses they came from.
    vst         $vr11, Y,   0x00
    vstx        $vr12, Y,   INC_Y
    vst         $vr13, PY0, 0x00
    vstx        $vr14, PY0, INC_Y
    // Y += 4 * INC_Y.
    PTR_ALSL    Y, INC_Y, Y, 2
    bnez        J, .L_\XW\()_N_L4
.L_\XW\()_N_3:
    // Remaining N % 4 columns, one column per iteration, using PA0 only.
    andi        J, N, 3
    beqz        J, .L_END
    // K_LDA = LDA - M16: from the end of one column to the start of the next.
    PTR_SUB     K_LDA, LDA, M16
.L_\XW\()_N_1:
    ZERO_Y1
    move        X, X_ORG
    move        I, M
    // M == 0: leave immediately without touching y.
    beqz        I, .L_END
.align 5
.L_\XW\()_N_1_M_L1:
    // $vr3 is the low half of A0, $vr1 the low half of X0.
    vld         $vr3, PA0, 0x00
    vld         $vr1, X,   0x00
    GCOMPLEXMADD GXCONJ1, GCONJ1, \
                 xvf, d, TP0, A0, X0, TP0, TMP0, TMP1, TMP2
    PTR_ADDI    I, I, -1
    PTR_ADD     X, X, INC_X
    PTR_ADDI    PA0, PA0, 0x10
    bnez        I, .L_\XW\()_N_1_M_L1
.L_\XW\()_N_1_M_END:
    PTR_ADDI    J, J, -1
    // y = alpha * TP0 + y for this column; only the low 128 bits are stored.
    vld         $vr3, Y, 0x00
    GCOMPLEXMADD GXCONJ2, GCONJ2, \
                 xvf, d, A0, VALPHA, TP0, A0, TMP0, TMP1, TMP2
    vst         $vr3, Y, 0x00
    PTR_ADD     PA0, PA0, K_LDA
    PTR_ADD     Y, Y, INC_Y
    bnez        J, .L_\XW\()_N_1
    b           .L_END
.endm
/* Kernel entry: scale the strides to bytes, build VALPHA and the column
 * pointers, then jump to the variant of ZGEMV_T_LASX that matches inc_x. */
PROLOGUE
// inc_y is the first stack argument; read it before push_if_used runs.
PTR_LD INC_Y, $sp, 0
// push_if_used / pop_if_used are defined outside this view; presumably they
// save and restore the callee-saved registers this kernel uses -- confirm.
push_if_used 17 + 8, 30
PTR_ADDI K, $r0, 0x01
PTR_SUB I, INC_X, K
maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */
// Convert LDA, INC_X and INC_Y to byte strides and set M16 = M << 4.
GSLLI , d, LDA, LDA, 4, INC_X, INC_X, 4, INC_Y, INC_Y, 4, M16, M, 4
// Init VALPHA
// Put $f0 (alpha_r) and $f1 (alpha_i) side by side in one 128-bit pair, then
// replicate that pair across VALPHA.
xvpackev.d $xr0, $xr1, $xr0
xvreplve0.q VALPHA, $xr0
move X_ORG, X
move PA0, A
// PA(n+1) = PA(n) + LDA.
// NOTE(review): PA4 is computed here, but the macros of this kernel only use PA0..PA3.
#if __loongarch_grlen == 64
GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA
#elif __loongarch_grlen == 32
GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA
#else
GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA
#endif
// Dispatch through a table of 16-bit offsets relative to .L_GAP_TABLE:
// entry I (0 or 1) is read with ld.h, added to the table address and jumped to.
la.local T0, .L_GAP_TABLE
PTR_ALSL I, I, T0, 1
ld.h K, I, 0
PTR_ADD T0, T0, K
jirl $r0, T0, 0
.L_GAP_TABLE:
.hword .L_GAP_0 - .L_GAP_TABLE
.hword .L_GAP_1 - .L_GAP_TABLE
.L_GAP_0: /* if (incx == 1) */
ZGEMV_T_LASX GAP_0, X4
.L_GAP_1: /* if (incx != 1) */
ZGEMV_T_LASX GAP_1, X4_GAP
// Each macro expansion ends with "b .L_END", so the variants never fall through into each other.
.L_END:
pop_if_used 17 + 8, 30
jirl $r0, $r1, 0x0
EPILOGUE

View File

@ -30,19 +30,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if !defined(DOUBLE) #if !defined(DOUBLE)
#define VSETVL(n) __riscv_vsetvl_e32m8(n) #define VSETVL(n) __riscv_vsetvl_e32m8(n)
#define FLOAT_V_T vfloat32m8_t #define FLOAT_V_T vfloat32m8_t
#define FLOAT_V_M1_T vfloat32m1_t
#define VLEV_FLOAT __riscv_vle32_v_f32m8 #define VLEV_FLOAT __riscv_vle32_v_f32m8
#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 #define VLSEV_FLOAT __riscv_vlse32_v_f32m8
#define VSEV_FLOAT __riscv_vse32_v_f32m8 #define VSEV_FLOAT __riscv_vse32_v_f32m8
#define VSEV_FLOAT_M1 __riscv_vse32_v_f32m1
#define VSSEV_FLOAT __riscv_vsse32_v_f32m8 #define VSSEV_FLOAT __riscv_vsse32_v_f32m8
#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m8 #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m8
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m8
#define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f32m8_f32m1
#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1
#else #else
#define VSETVL(n) __riscv_vsetvl_e64m8(n) #define VSETVL(n) __riscv_vsetvl_e64m8(n)
#define FLOAT_V_T vfloat64m8_t #define FLOAT_V_T vfloat64m8_t
#define FLOAT_V_M1_T vfloat64m1_t
#define VLEV_FLOAT __riscv_vle64_v_f64m8 #define VLEV_FLOAT __riscv_vle64_v_f64m8
#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 #define VLSEV_FLOAT __riscv_vlse64_v_f64m8
#define VSEV_FLOAT __riscv_vse64_v_f64m8 #define VSEV_FLOAT __riscv_vse64_v_f64m8
#define VSEV_FLOAT_M1 __riscv_vse64_v_f64m1
#define VSSEV_FLOAT __riscv_vsse64_v_f64m8 #define VSSEV_FLOAT __riscv_vsse64_v_f64m8
#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m8 #define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m8
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m8
#define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f64m8_f64m1
#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1
#endif #endif
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
@ -76,7 +86,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
VSEV_FLOAT(y, vy, vl); VSEV_FLOAT(y, vy, vl);
} }
} else if (1 == inc_x) { } else if (1 == inc_x && 0 != inc_y) {
BLASLONG stride_y = inc_y * sizeof(FLOAT); BLASLONG stride_y = inc_y * sizeof(FLOAT);
@ -89,8 +99,20 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
VSSEV_FLOAT(y, stride_y, vy, vl); VSSEV_FLOAT(y, stride_y, vy, vl);
} }
} else { } else if( 0 == inc_y ) {
/* inc_y == 0: every update targets y[0], so the result must be
 * y[0] + da * sum(x[i * inc_x]). */
BLASLONG stride_x = inc_x * sizeof(FLOAT);
size_t in_vl = VSETVL(n);
/* Start the per-lane partial sums at zero. Seeding every lane with y[0]
 * would add y[0] once per lane when the lanes are reduced below. */
vy = VFMVVF_FLOAT( 0.0, in_vl );
for (size_t vl; n > 0; n -= vl, x += vl*inc_x) {
    vl = VSETVL(n);
    vx = VLSEV_FLOAT(x, stride_x, vl);
    vy = VFMACCVF_FLOAT(vy, da, vx, vl);
}
/* NOTE(review): the last pass may run with vl < in_vl; the reduction below
 * relies on lanes vl..in_vl-1 of vy keeping their earlier partial sums.
 * Confirm the tail policy of the vfmacc intrinsic guarantees that. */
/* The reduction adds element 0 of its scalar operand to the lane sum, so
 * y[0] is contributed exactly once here. */
FLOAT_V_M1_T vres = VFMVVF_FLOAT_M1( y[0], 1 );
vres = VFREDSUMVS_FLOAT( vy, vres, in_vl );
VSEV_FLOAT_M1(y, vres, 1);
} else {
BLASLONG stride_x = inc_x * sizeof(FLOAT); BLASLONG stride_x = inc_x * sizeof(FLOAT);
BLASLONG stride_y = inc_y * sizeof(FLOAT); BLASLONG stride_y = inc_y * sizeof(FLOAT);

View File

@ -51,11 +51,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _) #define VSETVL JOIN(RISCV_RVV(vsetvl), _e, ELEN, LMUL, _)
#define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _) #define FLOAT_V_T JOIN(vfloat, ELEN, LMUL, _t, _)
#define FLOAT_V_M1_T JOIN(vfloat, ELEN, m1, _t, _)
#define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL) #define VLEV_FLOAT JOIN(RISCV_RVV(vle), ELEN, _v_f, ELEN, LMUL)
#define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL) #define VLSEV_FLOAT JOIN(RISCV_RVV(vlse), ELEN, _v_f, ELEN, LMUL)
#define VSEV_FLOAT JOIN(RISCV_RVV(vse), ELEN, _v_f, ELEN, LMUL) #define VSEV_FLOAT JOIN(RISCV_RVV(vse), ELEN, _v_f, ELEN, LMUL)
#define VSSEV_FLOAT JOIN(RISCV_RVV(vsse), ELEN, _v_f, ELEN, LMUL) #define VSSEV_FLOAT JOIN(RISCV_RVV(vsse), ELEN, _v_f, ELEN, LMUL)
#define VFMACCVF_FLOAT JOIN(RISCV_RVV(vfmacc), _vf_f, ELEN, LMUL, _) #define VFMACCVF_FLOAT JOIN(RISCV_RVV(vfmacc), _vf_f, ELEN, LMUL, _)
#define VFMVVF_FLOAT JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, LMUL, _)
#define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv), _v_f_f, ELEN, m1, _)
#ifdef RISCV_0p10_INTRINSICS
#define VFREDSUMVS_FLOAT(va, vb, gvl) JOIN(RISCV_RVV(vfredusum_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1))(v_res, va, vb, gvl)
#else
#define VFREDSUMVS_FLOAT JOIN(RISCV_RVV(vfredusum_vs_f), ELEN, LMUL, _f, JOIN2( ELEN, m1))
#endif
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{ {
@ -123,7 +132,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
VSEV_FLOAT(&y[j], vy0, gvl); VSEV_FLOAT(&y[j], vy0, gvl);
j += gvl; j += gvl;
} }
}else if(inc_x == 1){ } else if (1 == inc_x && 0 != inc_y) {
stride_y = inc_y * sizeof(FLOAT); stride_y = inc_y * sizeof(FLOAT);
gvl = VSETVL(n); gvl = VSETVL(n);
if(gvl <= n/2){ if(gvl <= n/2){
@ -151,6 +160,19 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
VSSEV_FLOAT(&y[j*inc_y], stride_y, vy0, gvl); VSSEV_FLOAT(&y[j*inc_y], stride_y, vy0, gvl);
j += gvl; j += gvl;
} }
} else if( 0 == inc_y ) {
BLASLONG stride_x = inc_x * sizeof(FLOAT);
size_t in_vl = VSETVL(n);
vy0 = VFMVVF_FLOAT( y[0], in_vl );
for (size_t vl; n > 0; n -= vl, x += vl*inc_x) {
vl = VSETVL(n);
vx0 = VLSEV_FLOAT(x, stride_x, vl);
vy0 = VFMACCVF_FLOAT(vy0, da, vx0, vl);
}
FLOAT_V_M1_T v_res = VFMVVF_FLOAT_M1( 0.0f, 1 );
v_res = VFREDSUMVS_FLOAT( vy0, v_res, in_vl );
y[0] = EXTRACT_FLOAT(v_res);
}else{ }else{
stride_x = inc_x * sizeof(FLOAT); stride_x = inc_x * sizeof(FLOAT);
stride_y = inc_y * sizeof(FLOAT); stride_y = inc_y * sizeof(FLOAT);

View File

@ -101,8 +101,10 @@ SCLAUX = la_constants.o \
slaset.o slasq1.o slasq2.o slasq3.o slasq4.o slasq5.o slasq6.o \ slaset.o slasq1.o slasq2.o slasq3.o slasq4.o slasq5.o slasq6.o \
slasr.o slasrt.o slassq.o slasv2.o spttrf.o sstebz.o sstedc.o \ slasr.o slasrt.o slassq.o slasv2.o spttrf.o sstebz.o sstedc.o \
ssteqr.o ssterf.o slaisnan.o sisnan.o \ ssteqr.o ssterf.o slaisnan.o sisnan.o \
slartgp.o slartgs.o scombssq.o ../INSTALL/sroundup_lwork.o \ slartgp.o slartgs.o scombssq.o ../INSTALL/sroundup_lwork.o
../INSTALL/second_$(TIMER).o ifneq ($(F_COMPILER), IBM)
SCLAUX += ../INSTALL/second_$(TIMER).o
endif
endif endif
ifneq "$(or $(BUILD_DOUBLE),$(BUILD_COMPLEX16))" "" ifneq "$(or $(BUILD_DOUBLE),$(BUILD_COMPLEX16))" ""
@ -124,7 +126,10 @@ DZLAUX = la_constants.o\
dlasr.o dlasrt.o dlassq.o dlasv2.o dpttrf.o dstebz.o dstedc.o \ dlasr.o dlasrt.o dlassq.o dlasv2.o dpttrf.o dstebz.o dstedc.o \
dsteqr.o dsterf.o dlaisnan.o disnan.o \ dsteqr.o dsterf.o dlaisnan.o disnan.o \
dlartgp.o dlartgs.o ../INSTALL/droundup_lwork.o \ dlartgp.o dlartgs.o ../INSTALL/droundup_lwork.o \
../INSTALL/dlamch.o ../INSTALL/dsecnd_$(TIMER).o ../INSTALL/dlamch.o
ifneq ($(F_COMPILER), IBM)
DZLAUX += ../INSTALL/dsecnd_$(TIMER).o
endif
endif endif
#ifeq ($(BUILD_SINGLE),1) #ifeq ($(BUILD_SINGLE),1)

View File

@ -107,6 +107,12 @@ set(ZDMDEIGTST zchkdmd.f90)
# add_eig_executable(<name> <source>...)
#
# Creates the executable <name> from the sources passed in ARGN and links it
# against the OpenBLAS library target.
#
# When OpenMP is enabled and a GNU Fortran compiler is combined with a Clang
# C compiler, "-fopenmp" is stripped from CMAKE_Fortran_FLAGS and the
# executable is linked against omp and pthread explicitly instead.
# NOTE(review): presumably this avoids mixing the GNU and LLVM OpenMP
# runtimes in one binary - confirm.
#
# This has to stay a macro(): it rewrites CMAKE_Fortran_FLAGS in the caller's
# scope, which a function() would not do without PARENT_SCOPE.
macro(add_eig_executable name)
  add_executable(${name} ${ARGN})
  target_link_libraries(${name} ${LIBNAMEPREFIX}openblas${LIBNAMESUFFIX}${SUFFIX64_UNDERSCORE})
  # The compiler IDs are quoted so that an empty ID (language not enabled)
  # yields a false comparison instead of a malformed if() expression, and so
  # that the ID is never dereferenced a second time as a variable name.
  if (USE_OPENMP AND ("${CMAKE_Fortran_COMPILER_ID}" STREQUAL "GNU") AND ("${CMAKE_C_COMPILER_ID}" STREQUAL "Clang"))
    string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}")
    # Plain (keyword-less) signature on purpose: it must match the
    # target_link_libraries() call above, the two forms cannot be mixed.
    target_link_libraries(${name} omp pthread)
  endif()
  #${TMGLIB} ../${LAPACK_LIBRARIES} ${BLAS_LIBRARIES})
endmacro()

View File

@ -240,6 +240,10 @@ set(ZLINTSTRFP zchkrfp.f zdrvrfp.f zdrvrf1.f zdrvrf2.f zdrvrf3.f zdrvrf4.f zerrr
# add_lin_executable(<name> <source>...)
#
# Creates the executable <name> from the sources passed in ARGN and links it
# against the OpenBLAS library target.
#
# When OpenMP is enabled and a GNU Fortran compiler is combined with a Clang
# C compiler, "-fopenmp" is stripped from CMAKE_Fortran_FLAGS and the
# executable is linked against omp and pthread explicitly instead.
# NOTE(review): presumably this avoids mixing the GNU and LLVM OpenMP
# runtimes in one binary - confirm.
#
# This has to stay a macro(): it rewrites CMAKE_Fortran_FLAGS in the caller's
# scope, which a function() would not do without PARENT_SCOPE.
macro(add_lin_executable name)
  add_executable(${name} ${ARGN})
  target_link_libraries(${name} ${LIBNAMEPREFIX}openblas${LIBNAMESUFFIX}${SUFFIX64_UNDERSCORE})
  # The compiler IDs are quoted so that an empty ID (language not enabled)
  # yields a false comparison instead of a malformed if() expression, and so
  # that the ID is never dereferenced a second time as a variable name.
  if (USE_OPENMP AND ("${CMAKE_Fortran_COMPILER_ID}" STREQUAL "GNU") AND ("${CMAKE_C_COMPILER_ID}" STREQUAL "Clang"))
    string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}")
    # Plain (keyword-less) signature on purpose: it must match the
    # target_link_libraries() call above, the two forms cannot be mixed.
    target_link_libraries(${name} omp pthread)
  endif()
  #${TMGLIB} ${LAPACK_LIBRARIES} ${BLAS_LIBRARIES})
endmacro()

12
param.h
View File

@ -2845,21 +2845,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define DGEMM_DEFAULT_UNROLL_M 2 #define DGEMM_DEFAULT_UNROLL_M 2
#define SGEMM_DEFAULT_UNROLL_N 8 #define SGEMM_DEFAULT_UNROLL_N 8
#define SGEMM_DEFAULT_UNROLL_M 2 #define SGEMM_DEFAULT_UNROLL_M 2
#define CGEMM_DEFAULT_UNROLL_N 4
#define CGEMM_DEFAULT_UNROLL_M 1
#define ZGEMM_DEFAULT_UNROLL_N 4
#define ZGEMM_DEFAULT_UNROLL_M 1
#else #else
#define DGEMM_DEFAULT_UNROLL_N 4 #define DGEMM_DEFAULT_UNROLL_N 4
#define DGEMM_DEFAULT_UNROLL_M 16 #define DGEMM_DEFAULT_UNROLL_M 16
#define SGEMM_DEFAULT_UNROLL_N 8 #define SGEMM_DEFAULT_UNROLL_N 8
#define SGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_M 16
#define CGEMM_DEFAULT_UNROLL_N 4
#define CGEMM_DEFAULT_UNROLL_M 16
#define ZGEMM_DEFAULT_UNROLL_N 4
#define ZGEMM_DEFAULT_UNROLL_M 8
#endif #endif
#define QGEMM_DEFAULT_UNROLL_N 2 #define QGEMM_DEFAULT_UNROLL_N 2
#define CGEMM_DEFAULT_UNROLL_N 2
#define ZGEMM_DEFAULT_UNROLL_N 4
#define XGEMM_DEFAULT_UNROLL_N 1 #define XGEMM_DEFAULT_UNROLL_N 1
#define QGEMM_DEFAULT_UNROLL_M 2 #define QGEMM_DEFAULT_UNROLL_M 2
#define CGEMM_DEFAULT_UNROLL_M 2
#define ZGEMM_DEFAULT_UNROLL_M 8
#define XGEMM_DEFAULT_UNROLL_M 1 #define XGEMM_DEFAULT_UNROLL_M 1
#define SGEMM_DEFAULT_P 256 #define SGEMM_DEFAULT_P 256

View File

@ -21,10 +21,14 @@ endif()
if (BUILD_COMPLEX16) if (BUILD_COMPLEX16)
list (APPEND OpenBLAS_Tests zblat1 zblat2 zblat3) list (APPEND OpenBLAS_Tests zblat1 zblat2 zblat3)
endif() endif()
# Report the detected C and Fortran compiler IDs. The text is passed as one
# quoted argument because message() joins separate arguments without any
# separator, which would print e.g. "CCOMPClangFCOMPGNU".
message(STATUS "CCOMP ${CMAKE_C_COMPILER_ID} FCOMP ${CMAKE_Fortran_COMPILER_ID}")

# Build one executable per entry in OpenBLAS_Tests from <test_bin>.f and link
# it against the OpenBLAS library.
foreach(test_bin ${OpenBLAS_Tests})
  add_executable(${test_bin} ${test_bin}.f)
  target_link_libraries(${test_bin} ${OpenBLAS_LIBNAME})
  # With OpenMP enabled and GNU Fortran combined with a Clang C compiler,
  # drop "-fopenmp" from the Fortran flags and link omp/pthread explicitly.
  # The compiler IDs are quoted so an empty ID compares as false instead of
  # producing a malformed if() expression.
  if (USE_OPENMP AND ("${CMAKE_Fortran_COMPILER_ID}" STREQUAL "GNU") AND ("${CMAKE_C_COMPILER_ID}" STREQUAL "Clang"))
    string(REGEX REPLACE "-fopenmp" "" CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS}")
    # Plain (keyword-less) signature to match the target_link_libraries()
    # call above; the keyword and plain forms cannot be mixed on one target.
    target_link_libraries(${test_bin} omp pthread)
  endif()
endforeach()
# $1 exec, $2 input, $3 output_result # $1 exec, $2 input, $3 output_result