diff --git a/.cirrus.yml b/.cirrus.yml index 8a1c4a0a8..02cd40997 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -30,6 +30,15 @@ task: - cd build - cmake -DTARGET=VORTEX -DCMAKE_C_COMPILER=clang -DBUILD_SHARED_LIBS=ON .. - make + +task: + name: AppleM1/GCC/MAKE/OPENMP + compile_script: + - brew install gcc@11 + - export PATH=/opt/homebrew/bin:$PATH + - export LDFLAGS="-L/opt/homebrew/lib" + - export CPPFLAGS="-I/opt/homebrew/include" + - make CC=gcc-11 FC=gfortran-11 USE_OPENMP=1 macos_instance: image: ghcr.io/cirruslabs/macos-monterey-xcode:latest diff --git a/.github/workflows/dynamic_arch.yml b/.github/workflows/dynamic_arch.yml index c16f87954..4fe6e63fc 100644 --- a/.github/workflows/dynamic_arch.yml +++ b/.github/workflows/dynamic_arch.yml @@ -151,40 +151,53 @@ jobs: strategy: fail-fast: false matrix: - msystem: [MINGW64, MINGW32, CLANG64] + msystem: [MINGW64, MINGW32, CLANG64, CLANG32] idx: [int32, int64] build-type: [Release] include: - msystem: MINGW64 idx: int32 target-prefix: mingw-w64-x86_64 - fc-pkg: mingw-w64-x86_64-gcc-fortran + fc-pkg: fc - msystem: MINGW32 idx: int32 target-prefix: mingw-w64-i686 - fc-pkg: mingw-w64-i686-gcc-fortran + fc-pkg: fc - msystem: CLANG64 idx: int32 target-prefix: mingw-w64-clang-x86_64 + fc-pkg: fc + # Compiling with Flang 16 seems to cause test errors on machines + # with AVX512 instructions. Revisit after MSYS2 distributes Flang 17. + no-avx512-flags: -DNO_AVX512=1 + - msystem: CLANG32 + idx: int32 + target-prefix: mingw-w64-clang-i686 + fc-pkg: cc c-lapack-flags: -DC_LAPACK=ON - msystem: MINGW64 idx: int64 idx64-flags: -DBINARY=64 -DINTERFACE64=1 target-prefix: mingw-w64-x86_64 - fc-pkg: mingw-w64-x86_64-gcc-fortran + fc-pkg: fc - msystem: CLANG64 idx: int64 idx64-flags: -DBINARY=64 -DINTERFACE64=1 target-prefix: mingw-w64-clang-x86_64 - c-lapack-flags: -DC_LAPACK=ON + fc-pkg: fc + # Compiling with Flang 16 seems to cause test errors on machines + # with AVX512 instructions. Revisit after MSYS2 distributes Flang 17. + no-avx512-flags: -DNO_AVX512=1 - msystem: MINGW64 idx: int32 target-prefix: mingw-w64-x86_64 - fc-pkg: mingw-w64-x86_64-gcc-fortran + fc-pkg: fc build-type: None exclude: - msystem: MINGW32 idx: int64 + - msystem: CLANG32 + idx: int64 defaults: run: @@ -209,7 +222,7 @@ jobs: install: >- base-devel ${{ matrix.target-prefix }}-cc - ${{ matrix.fc-pkg }} + ${{ matrix.target-prefix }}-${{ matrix.fc-pkg }} ${{ matrix.target-prefix }}-cmake ${{ matrix.target-prefix }}-ninja ${{ matrix.target-prefix }}-ccache @@ -261,6 +274,7 @@ jobs: -DTARGET=CORE2 \ ${{ matrix.idx64-flags }} \ ${{ matrix.c-lapack-flags }} \ + ${{ matrix.no-avx512-flags }} \ -DCMAKE_C_COMPILER_LAUNCHER=ccache \ -DCMAKE_Fortran_COMPILER_LAUNCHER=ccache \ .. @@ -280,9 +294,22 @@ jobs: key: ${{ steps.ccache-prepare.outputs.key }} - name: Run tests + id: run-ctest timeout-minutes: 60 run: cd build && ctest + - name: Re-run tests + if: always() && (steps.run-ctest.outcome == 'failure') + timeout-minutes: 60 + run: | + cd build + echo "::group::Re-run ctest" + ctest --rerun-failed --output-on-failure || true + echo "::endgroup::" + echo "::group::Log from these tests" + [ ! 
-f Testing/Temporary/LastTest.log ] || cat Testing/Temporary/LastTest.log + echo "::endgroup::" + cross_build: runs-on: ubuntu-22.04 diff --git a/.github/workflows/loongarch64.yml b/.github/workflows/loongarch64.yml new file mode 100644 index 000000000..5501e98e0 --- /dev/null +++ b/.github/workflows/loongarch64.yml @@ -0,0 +1,110 @@ +name: loongarch64 qemu test + +on: [push, pull_request] + +jobs: + TEST: + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + include: + - target: LOONGSONGENERIC + triple: loongarch64-unknown-linux-gnu + opts: NO_SHARED=1 TARGET=LOONGSONGENERIC + - target: LOONGSON3R5 + triple: loongarch64-unknown-linux-gnu + opts: NO_SHARED=1 TARGET=LOONGSON3R5 + - target: LOONGSON2K1000 + triple: loongarch64-unknown-linux-gnu + opts: NO_SHARED=1 TARGET=LOONGSON2K1000 + + steps: + - name: Checkout repository + uses: actions/checkout@v3 + + - name: Install APT deps + run: | + sudo add-apt-repository ppa:savoury1/virtualisation + sudo apt-get update + sudo apt-get install autoconf automake autotools-dev ninja-build make ccache \ + qemu-user-static + + - name: Download and install loongarch64-toolchain + run: | + wget https://github.com/loongson/build-tools/releases/download/2022.09.06/loongarch64-clfs-7.3-cross-tools-gcc-glibc.tar.xz + tar -xf loongarch64-clfs-7.3-cross-tools-gcc-glibc.tar.xz -C /opt + + - name: Set env + run: | + echo "LD_LIBRARY_PATH=/opt/cross-tools/target/usr/lib64:/opt/cross-tools/loongarch64-unknown-linux-gnu/lib64:$LD_LIBRARY_PATH" >> $GITHUB_ENV + echo "PATH=$GITHUB_WORKSPACE:/opt/cross-tools/bin:$PATH" >> $GITHUB_ENV + + - name: Compilation cache + uses: actions/cache@v3 + with: + path: ~/.ccache + key: ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }}-${{ github.sha }} + restore-keys: | + ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }} + ccache-${{ runner.os }}-${{ matrix.target }} + + - name: Configure ccache + run: | + test -d ~/.ccache || mkdir -p ~/.ccache + echo "max_size = 300M" > ~/.ccache/ccache.conf + echo "compression = true" >> ~/.ccache/ccache.conf + ccache -s + + - name: Disable utest dsdot:dsdot_n_1 + run: | + echo -n > utest/test_dsdot.c + echo "Due to the qemu versions 7.2 causing utest cases to fail," + echo "the utest dsdot:dsdot_n_1 have been temporarily disabled." 
+ + - name: Build OpenBLAS + run: make CC='ccache ${{ matrix.triple }}-gcc -static' FC='ccache ${{ matrix.triple }}-gfortran -static' ${{ matrix.opts }} HOSTCC='ccache gcc' -j$(nproc) + + - name: Test + run: | + qemu-loongarch64-static ./utest/openblas_utest + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xscblat1 + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xdcblat1 + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xccblat1 + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xzcblat1 + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xscblat2 < ./ctest/sin2 + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xdcblat2 < ./ctest/din2 + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xccblat2 < ./ctest/cin2 + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xzcblat2 < ./ctest/zin2 + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xscblat3 < ./ctest/sin3 + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xdcblat3 < ./ctest/din3 + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xccblat3 < ./ctest/cin3 + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xzcblat3 < ./ctest/zin3 + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/sblat1 + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/dblat1 + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/cblat1 + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/zblat1 + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/sblat1 + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/dblat1 + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/cblat1 + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/zblat1 + rm -f ./test/?BLAT2.SUMM + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/sblat2 < ./test/sblat2.dat + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/dblat2 < ./test/dblat2.dat + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/cblat2 < ./test/cblat2.dat + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/zblat2 < ./test/zblat2.dat + rm -f ./test/?BLAT2.SUMM + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/sblat2 < ./test/sblat2.dat + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/dblat2 < ./test/dblat2.dat + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/cblat2 < ./test/cblat2.dat + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/zblat2 < ./test/zblat2.dat + rm -f ./test/?BLAT3.SUMM + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/sblat3 < ./test/sblat3.dat + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/dblat3 < ./test/dblat3.dat + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/cblat3 < ./test/cblat3.dat + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/zblat3 < ./test/zblat3.dat + rm -f ./test/?BLAT3.SUMM + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/sblat3 < ./test/sblat3.dat + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/dblat3 < ./test/dblat3.dat + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/cblat3 < ./test/cblat3.dat + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/zblat3 < ./test/zblat3.dat diff --git a/.gitignore b/.gitignore index 1195bc9b8..8b27325db 100644 --- a/.gitignore +++ b/.gitignore @@ -72,6 +72,7 @@ test/SBLAT3.SUMM test/ZBLAT2.SUMM test/ZBLAT3.SUMM test/SHBLAT3.SUMM +test/SBBLAT3.SUMM test/cblat1 
test/cblat2 test/cblat3 @@ -82,6 +83,7 @@ test/sblat1 test/sblat2 test/sblat3 test/test_shgemm +test/test_sbgemm test/zblat1 test/zblat2 test/zblat3 diff --git a/Jenkinsfile b/Jenkinsfile index 5fad6a95b..baeeee59f 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -7,7 +7,7 @@ pipeline { stages { stage('Build') { steps { - sh 'make' + sh 'make clean && make' } } } diff --git a/Jenkinsfile.pwr b/Jenkinsfile.pwr index d141ed8a5..96e18b8ad 100644 --- a/Jenkinsfile.pwr +++ b/Jenkinsfile.pwr @@ -9,7 +9,7 @@ pipeline { steps { sh 'sudo apt update' sh 'sudo apt install gfortran -y' - sh 'make' + sh 'make clean && make' } } } diff --git a/Makefile.system b/Makefile.system index 7d26eccc3..b3968d739 100644 --- a/Makefile.system +++ b/Makefile.system @@ -384,6 +384,11 @@ GCCMINORVERSIONGTEQ4 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 7) endif +ifeq ($(C_COMPILER), CLANG) +CLANGVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9) +CLANGVERSIONGTEQ12 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 12) +endif + # # OS dependent settings # @@ -668,6 +673,7 @@ DYNAMIC_CORE += NEOVERSEN1 ifneq ($(NO_SVE), 1) DYNAMIC_CORE += NEOVERSEV1 DYNAMIC_CORE += NEOVERSEN2 +DYNAMIC_CORE += ARMV8SVE endif DYNAMIC_CORE += CORTEXA55 DYNAMIC_CORE += FALKOR @@ -1086,8 +1092,9 @@ endif endif endif -ifeq ($(F_COMPILER), GFORTRAN) +ifeq ($(F_COMPILER), $(filter $(F_COMPILER),GFORTRAN FLANGNEW)) CCOMMON_OPT += -DF_INTERFACE_GFORT +ifeq ($(F_COMPILER), GFORTRAN) FCOMMON_OPT += -Wall # make single-threaded LAPACK calls thread-safe #1847 FCOMMON_OPT += -frecursive @@ -1101,6 +1108,7 @@ EXTRALIB += -lgfortran endif endif endif +endif ifdef NO_BINARY_MODE ifeq ($(ARCH), $(filter $(ARCH),mips64)) ifdef BINARY64 @@ -1767,6 +1775,8 @@ export TARGET_CORE export NO_AVX512 export NO_AVX2 export BUILD_BFLOAT16 +export NO_LSX +export NO_LASX export SBGEMM_UNROLL_M export SBGEMM_UNROLL_N diff --git a/Makefile.x86_64 b/Makefile.x86_64 index 7ab331b1f..702447ace 100644 --- a/Makefile.x86_64 +++ b/Makefile.x86_64 @@ -75,18 +75,31 @@ endif ifeq ($(CORE), COOPERLAKE) ifndef NO_AVX512 ifeq ($(C_COMPILER), GCC) -# cooperlake support was added in 10.1 -ifeq ($(GCCVERSIONGTEQ10)$(GCCMINORVERSIONGTEQ1), 11) -CCOMMON_OPT += -march=cooperlake -ifneq ($(F_COMPILER), NAG) -FCOMMON_OPT += -march=cooperlake -endif -else # gcc not support, fallback to avx512 -CCOMMON_OPT += -march=skylake-avx512 -ifneq ($(F_COMPILER), NAG) -FCOMMON_OPT += -march=skylake-avx512 -endif -endif + # cooperlake support was added in 10.1 + ifeq ($(GCCVERSIONGTEQ10)$(GCCMINORVERSIONGTEQ1), 11) + CCOMMON_OPT += -march=cooperlake + ifneq ($(F_COMPILER), NAG) + FCOMMON_OPT += -march=cooperlake + endif + else # gcc not support, fallback to avx512 + CCOMMON_OPT += -march=skylake-avx512 + ifneq ($(F_COMPILER), NAG) + FCOMMON_OPT += -march=skylake-avx512 + endif + endif +else ifeq ($(C_COMPILER), CLANG) + # cooperlake support was added in clang 9 + ifeq ($(CLANGVERSIONGTEQ9), 1) + CCOMMON_OPT += -march=cooperlake + ifneq ($(F_COMPILER), NAG) + FCOMMON_OPT += -march=cooperlake + endif + else # not supported in clang, fallback to avx512 + CCOMMON_OPT += -march=skylake-avx512 + ifneq ($(F_COMPILER), NAG) + FCOMMON_OPT += -march=skylake-avx512 + endif + endif endif ifeq ($(OSNAME), CYGWIN_NT) CCOMMON_OPT += -fno-asynchronous-unwind-tables @@ -104,18 +117,31 @@ endif ifeq ($(CORE), SAPPHIRERAPIDS) ifndef NO_AVX512 ifeq ($(C_COMPILER), GCC) -# sapphire rapids support was 
added in 11 -ifeq ($(GCCVERSIONGTEQ11), 1) -CCOMMON_OPT += -march=sapphirerapids -ifneq ($(F_COMPILER), NAG) -FCOMMON_OPT += -march=sapphirerapids -endif -else # gcc not support, fallback to avx512 -CCOMMON_OPT += -march=skylake-avx512 -ifneq ($(F_COMPILER), NAG) -FCOMMON_OPT += -march=skylake-avx512 -endif -endif + # sapphire rapids support was added in 11 + ifeq ($(GCCVERSIONGTEQ11), 1) + CCOMMON_OPT += -march=sapphirerapids + ifneq ($(F_COMPILER), NAG) + FCOMMON_OPT += -march=sapphirerapids + endif + else # gcc not support, fallback to avx512 + CCOMMON_OPT += -march=skylake-avx512 + ifneq ($(F_COMPILER), NAG) + FCOMMON_OPT += -march=skylake-avx512 + endif + endif +else ifeq ($(C_COMPILER), CLANG) + # cooperlake support was added in clang 12 + ifeq ($(CLANGVERSIONGTEQ12), 1) + CCOMMON_OPT += -march=cooperlake + ifneq ($(F_COMPILER), NAG) + FCOMMON_OPT += -march=cooperlake + endif + else # not supported in clang, fallback to avx512 + CCOMMON_OPT += -march=skylake-avx512 + ifneq ($(F_COMPILER), NAG) + FCOMMON_OPT += -march=skylake-avx512 + endif + endif endif ifeq ($(OSNAME), CYGWIN_NT) CCOMMON_OPT += -fno-asynchronous-unwind-tables diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 65ef538e9..ff56ad00b 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -271,6 +271,19 @@ jobs: - script: | make TARGET=ARMV7 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 +- job: OSX_xbuild_DYNAMIC_ARM64 + pool: + vmImage: 'macOS-11' + variables: + CC: /Applications/Xcode_12.5.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang + CFLAGS: -O2 -Wno-macro-redefined -isysroot /Applications/Xcode_12.5.1.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX11.3.sdk -arch arm64 + steps: + - script: | + ls /Applications/Xcode_12.5.1.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs + /Applications/Xcode_12.5.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang -arch arm64 --print-supported-cpus + /Applications/Xcode_11.7.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang --version + make TARGET=ARMV8 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 + - job: ALPINE_MUSL pool: vmImage: 'ubuntu-latest' diff --git a/benchmark/spr.c b/benchmark/spr.c old mode 100755 new mode 100644 diff --git a/benchmark/spr2.c b/benchmark/spr2.c old mode 100755 new mode 100644 diff --git a/c_check b/c_check index 7c8494e4a..4d12c1674 100755 --- a/c_check +++ b/c_check @@ -185,6 +185,37 @@ if [ "$architecture" = "mips" ] || [ "$architecture" = "mips64" ]; then rm -rf "$tmpd" fi +no_lsx=0 +no_lasx=0 +if [ "$architecture" = "loongarch64" ]; then + tmpd="$(mktemp -d)" + tmplsx="$tmpd/lsx.c" + codelsx='"vadd.b $vr0, $vr0, $vr0"' + lsx_flags='-march=loongarch64 -mlsx' + printf "#include \n\n" >> "$tmplsx" + printf "void main(void){ __asm__ volatile(%s);}\n" "$codelsx" >> "$tmplsx" + args="$lsx_flags -o $tmplsx.o $tmplsx" + { + $compiler_name $flags $args >/dev/null 2>&1 + } || { + no_lsx=1 + } + + tmplasx="$tmpd/lasx.c" + codelasx='"xvadd.b $xr0, $xr0, $xr0"' + lasx_flags='-march=loongarch64 -mlasx' + printf "#include \n\n" >> "$tmplasx" + printf "void main(void){ __asm__ volatile(%s);}\n" "$codelasx" >> "$tmplasx" + args="$lasx_flags -o $tmplasx.o $tmplasx" + { + $compiler_name $flags $args >/dev/null 2>&1 + } || { + no_lasx=1 + } + + rm -rf "$tmpd" +fi + case "$data" in *ARCH_X86_64*) architecture=x86_64 ;; *ARCH_X86*) architecture=x86 ;; @@ -252,6 +283,9 @@ if [ "$architecture" = "arm64" ]; then no_sve=0 { 
$compiler_name $flags $args >/dev/null 2>&1 + } || { + args=" -Msve_intrinsics -c -o $tmpf.o $tmpf" + $compiler_name $flags $args >/dev/null 2>&1 } || { no_sve=1 } @@ -399,6 +433,8 @@ done [ "$no_avx512" -eq 1 ] && printf "NO_AVX512=1\n" [ "$no_avx2" -eq 1 ] && printf "NO_AVX2=1\n" [ "$oldgcc" -eq 1 ] && printf "OLDGCC=1\n" + [ "$no_lsx" -eq 1 ] && printf "NO_LSX=1\n" + [ "$no_lasx" -eq 1 ] && printf "NO_LASX=1\n" } >> "$makefile" os=`echo "$os" | tr '[[:lower:]]' '[[:upper:]]'/ ` @@ -414,6 +450,8 @@ compiler=`echo "$compiler" | tr '[[:lower:]]' '[[:upper:]]' ` [ -n "$need_fu" ] && printf "#define FUNDERSCORE\t%s\n" "$need_fu" [ "$no_msa" -eq 1 ] && printf "#define NO_MSA\t1\n" [ "$c11_atomics" -eq 1 ] && printf "#define HAVE_C11\t1\n" + [ "$no_lsx" -eq 1 ] && printf "#define NO_LSX\t1\n" + [ "$no_lasx" -eq 1 ] && printf "#define NO_LASX\t1\n" } >> "$config" diff --git a/c_check.pl b/c_check.pl index 6ce28e11b..7a860a211 100644 --- a/c_check.pl +++ b/c_check.pl @@ -232,6 +232,47 @@ if (($architecture eq "mips") || ($architecture eq "mips64")) { } } +$no_lsx = 0; +$no_lasx = 0; +if (($architecture eq "loongarch64")) { + eval "use File::Temp qw(tempfile)"; + if ($@){ + warn "could not load PERL module File::Temp, so could not check LSX and LASX capatibility"; + } else { + $tmplsx = new File::Temp( SUFFIX => '.c' , UNLINK => 1 ); + $codelsx = '"vadd.b $vr0, $vr0, $vr0"'; + $lsx_flags = "-march=loongarch64 -mlsx"; + print $tmplsx "#include \n\n"; + print $tmplsx "void main(void){ __asm__ volatile($codelsx); }\n"; + + $args = "$lsx_flags -o $tmplsx.o $tmplsx"; + my @cmd = ("$compiler_name $flags $args >/dev/null 2>/dev/null"); + system(@cmd) == 0; + if ($? != 0) { + $no_lsx = 1; + } else { + $no_lsx = 0; + } + unlink("$tmplsx.o"); + + $tmplasx = new File::Temp( SUFFIX => '.c' , UNLINK => 1 ); + $codelasx = '"xvadd.b $xr0, $xr0, $xr0"'; + $lasx_flags = "-march=loongarch64 -mlasx"; + print $tmplasx "#include \n\n"; + print $tmplasx "void main(void){ __asm__ volatile($codelasx); }\n"; + + $args = "$lasx_flags -o $tmplasx.o $tmplasx"; + my @cmd = ("$compiler_name $flags $args >/dev/null 2>/dev/null"); + system(@cmd) == 0; + if ($? 
!= 0) { + $no_lasx = 1; + } else { + $no_lasx = 0; + } + unlink("$tmplasx.o"); + } +} + $architecture = x86 if ($data =~ /ARCH_X86/); $architecture = x86_64 if ($data =~ /ARCH_X86_64/); $architecture = e2k if ($data =~ /ARCH_E2K/); @@ -424,6 +465,8 @@ print MAKEFILE "NO_RV64GV=1\n" if $no_rv64gv eq 1; print MAKEFILE "NO_AVX512=1\n" if $no_avx512 eq 1; print MAKEFILE "NO_AVX2=1\n" if $no_avx2 eq 1; print MAKEFILE "OLDGCC=1\n" if $oldgcc eq 1; +print MAKEFILE "NO_LSX=1\n" if $no_lsx eq 1; +print MAKEFILE "NO_LASX=1\n" if $no_lasx eq 1; $os =~ tr/[a-z]/[A-Z]/; $architecture =~ tr/[a-z]/[A-Z]/; @@ -437,6 +480,8 @@ print CONFFILE "#define __64BIT__\t1\n" if $binformat eq bin64; print CONFFILE "#define FUNDERSCORE\t$need_fu\n" if $need_fu ne ""; print CONFFILE "#define HAVE_MSA\t1\n" if $have_msa eq 1; print CONFFILE "#define HAVE_C11\t1\n" if $c11_atomics eq 1; +print CONFFILE "#define NO_LSX\t1\n" if $no_lsx eq 1; +print CONFFILE "#define NO_LASX\t1\n" if $no_lasx eq 1; if ($os eq "LINUX") { diff --git a/cmake/arch.cmake b/cmake/arch.cmake index e6e434a0a..ebdc5a833 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -46,7 +46,7 @@ if (DYNAMIC_ARCH) if (ARM64) set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA55 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110) if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 9.99) - set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2) + set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2 ARMV8SVE) endif () if (DYNAMIC_LIST) set(DYNAMIC_CORE ARMV8 ${DYNAMIC_LIST}) @@ -135,7 +135,7 @@ if (ARM64) set(BINARY_DEFINED 1) endif () -if (${ARCH} STREQUAL "riscv64") +if (RISCV64) set(NO_BINARY_MODE 1) set(BINARY_DEFINED 1) endif () diff --git a/cmake/cc.cmake b/cmake/cc.cmake index aeaa76710..7b4ef8947 100644 --- a/cmake/cc.cmake +++ b/cmake/cc.cmake @@ -180,22 +180,30 @@ endif () if (${CORE} STREQUAL NEOVERSEN2) if (NOT DYNAMIC_ARCH) - execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) - if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4) - set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2") + if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE) + set (CCOMMON_OPT "${CCOMMON_OPT} -Msve_intrinsics -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2") else () - set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve") - endif() + execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) + if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4) + set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2") + else () + set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve") + endif() + endif () endif () endif () if (${CORE} STREQUAL NEOVERSEV1) if (NOT DYNAMIC_ARCH) - execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) - if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4) - set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.4-a+sve -mtune=neoverse-v1") + if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE) + set (CCOMMON_OPT "${CCOMMON_OPT} -Msve_intrinsics -march=armv8.4-a+sve -mtune=neoverse-v1") else () - set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve") + execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) + if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4) + set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.4-a+sve 
-mtune=neoverse-v1") + else () + set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve") + endif() endif() endif () endif () @@ -213,7 +221,11 @@ endif () if (${CORE} STREQUAL ARMV8SVE) if (NOT DYNAMIC_ARCH) - set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve") + if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE) + set (CCOMMON_OPT "${CCOMMON_OPT} -Msve_intrinsics -march=armv8-a+sve") + else () + set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve") + endif () endif () endif () diff --git a/cmake/fc.cmake b/cmake/fc.cmake index a67760885..c496f6368 100644 --- a/cmake/fc.cmake +++ b/cmake/fc.cmake @@ -3,7 +3,8 @@ ## Description: Ported from portion of OpenBLAS/Makefile.system ## Sets Fortran related variables. -if (${F_COMPILER} STREQUAL "FLANG") +if (${F_COMPILER} STREQUAL "FLANG" AND NOT CMAKE_Fortran_COMPILER_ID STREQUAL "LLVMFlang") + # This is for classic Flang. LLVM Flang is handled with gfortran below. set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_FLANG") if (BINARY64 AND INTERFACE64) set(FCOMMON_OPT "${FCOMMON_OPT} -i8") @@ -38,15 +39,17 @@ if (${F_COMPILER} STREQUAL "G95") endif () endif () -if (${F_COMPILER} STREQUAL "GFORTRAN" OR ${F_COMPILER} STREQUAL "F95") +if (${F_COMPILER} STREQUAL "GFORTRAN" OR ${F_COMPILER} STREQUAL "F95" OR CMAKE_Fortran_COMPILER_ID STREQUAL "LLVMFlang") set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_GFORT") - # ensure reentrancy of lapack codes - set(FCOMMON_OPT "${FCOMMON_OPT} -Wall -frecursive") - # work around ABI violation in passing string arguments from C - set(FCOMMON_OPT "${FCOMMON_OPT} -fno-optimize-sibling-calls") - #Don't include -lgfortran, when NO_LAPACK=1 or lsbcc - if (NOT NO_LAPACK) - set(EXTRALIB "${EXTRALIB} -lgfortran") + if (NOT CMAKE_Fortran_COMPILER_ID STREQUAL "LLVMFlang") + # ensure reentrancy of lapack codes + set(FCOMMON_OPT "${FCOMMON_OPT} -Wall -frecursive") + # work around ABI violation in passing string arguments from C + set(FCOMMON_OPT "${FCOMMON_OPT} -fno-optimize-sibling-calls") + if (NOT NO_LAPACK) + # Don't include -lgfortran, when NO_LAPACK=1 or lsbcc + set(EXTRALIB "${EXTRALIB} -lgfortran") + endif () endif () if (NO_BINARY_MODE) if (MIPS64) @@ -63,6 +66,13 @@ if (${F_COMPILER} STREQUAL "GFORTRAN" OR ${F_COMPILER} STREQUAL "F95") set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp32") endif () endif () + if (RISCV64) + if (BINARY64) + if (INTERFACE64) + set(FCOMMON_OPT "${FCOMMON_OPT} -fdefault-integer-8") + endif () + endif () + endif () else () if (BINARY64) set(FCOMMON_OPT "${FCOMMON_OPT} -m64") diff --git a/cmake/system.cmake b/cmake/system.cmake index 414193ec8..bc87f7b44 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -282,23 +282,35 @@ if (DEFINED TARGET) endif() if (${TARGET} STREQUAL NEOVERSEV1) + if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -Msve_intrinsics -march=armv8.4-a+sve -mtune=neoverse-v1") + else () execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4) set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv8.4-a+sve -mtune=neoverse-v1") else () message(FATAL_ERROR "Compiler ${CMAKE_C_COMPILER} ${GCC_VERSION} does not support Neoverse V1.") endif() + endif() endif() if (${TARGET} STREQUAL NEOVERSEN2) + if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -Msve-intrinsics -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2") + else () execute_process(COMMAND 
${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) if (${GCC_VERSION} VERSION_GREATER 10.4 OR ${GCC_VERSION} VERSION_EQUAL 10.4) set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2") else () message(FATAL_ERROR "Compiler $${CMAKE_C_COMPILER} {GCC_VERSION} does not support Neoverse N2.") endif() + endif() endif() if (${TARGET} STREQUAL ARMV8SVE) + if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI" AND NOT NO_SVE) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -Msve-intrinsics -march=armv8.2-a+sve") + else () set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=armv8.2-a+sve") + endif() endif() endif() diff --git a/cmake/system_check.cmake b/cmake/system_check.cmake index c59e85d54..49b9863e3 100644 --- a/cmake/system_check.cmake +++ b/cmake/system_check.cmake @@ -44,6 +44,8 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "mips64.*") set(MIPS64 1) elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "loongarch64.*") set(LOONGARCH64 1) +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "riscv64.*") + set(RISCV64 1) elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*") if (NOT BINARY) if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8") @@ -60,7 +62,7 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*") endif() elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "i686.*|i386.*|x86.*|amd64.*|AMD64.*") set(X86 1) -elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64.*|AARCH64.*|arm64.*|ARM64.*)") +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64.*|AARCH64.*|arm64.*|ARM64.*|armv8.*)") if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8") set(ARM64 1) else() @@ -107,7 +109,7 @@ else() endif () if (NOT BINARY) - if (X86_64 OR ARM64 OR POWER OR MIPS64 OR LOONGARCH64) + if (X86_64 OR ARM64 OR POWER OR MIPS64 OR LOONGARCH64 OR RISCV64) set(BINARY 64) else () set(BINARY 32) diff --git a/common_thread.h b/common_thread.h index 05e1d5489..06a7a1a38 100644 --- a/common_thread.h +++ b/common_thread.h @@ -53,7 +53,6 @@ extern void goto_set_num_threads(int nthreads); /* Global Parameter */ extern int blas_cpu_number; extern int blas_num_threads; -extern int blas_num_threads_set; extern int blas_omp_linked; #define BLAS_LEGACY 0x8000U @@ -136,15 +135,13 @@ typedef struct blas_queue { #ifdef SMP_SERVER extern int blas_server_avail; +extern int blas_omp_number_max; static __inline int num_cpu_avail(int level) { #ifdef USE_OPENMP int openmp_nthreads; - if (blas_num_threads_set == 0) openmp_nthreads=omp_get_max_threads(); - else - openmp_nthreads=blas_cpu_number; #endif #ifndef USE_OPENMP @@ -156,7 +153,13 @@ int openmp_nthreads; ) return 1; #ifdef USE_OPENMP - if (blas_cpu_number != openmp_nthreads) { + if (openmp_nthreads > blas_omp_number_max){ +#ifdef DEBUG + fprintf(stderr,"WARNING - more OpenMP threads requested (%d) than available (%d)\n",openmp_nthreads,blas_omp_number_max); +#endif + openmp_nthreads = blas_omp_number_max; + } + if (blas_cpu_number != openmp_nthreads) { goto_set_num_threads(openmp_nthreads); } #endif diff --git a/cpuid_loongarch64.c b/cpuid_loongarch64.c index ca07c7ffb..7c389db27 100644 --- a/cpuid_loongarch64.c +++ b/cpuid_loongarch64.c @@ -32,6 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. **********************************************************************************/ #include +#include /* If LASX extension instructions supported, * using core LOONGSON3R5 @@ -46,9 +47,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define CPU_LOONGSON3R5 1 #define CPU_LOONGSON2K1000 2 -#define LOONGARCH_CFG2 0x02 -#define LOONGARCH_LASX 1<<7 -#define LOONGARCH_LSX 1<<6 +#define LA_HWCAP_LSX (1<<4) +#define LA_HWCAP_LASX (1<<5) static char *cpuname[] = { "LOONGSONGENERIC", @@ -64,17 +64,11 @@ static char *cpuname_lower[] = { int detect(void) { #ifdef __linux - uint32_t reg = 0; + int flag = (int)getauxval(AT_HWCAP); - __asm__ volatile ( - "cpucfg %0, %1 \n\t" - : "+&r"(reg) - : "r"(LOONGARCH_CFG2) - ); - - if (reg & LOONGARCH_LASX) + if (flag & LA_HWCAP_LASX) return CPU_LOONGSON3R5; - else if (reg & LOONGARCH_LSX) + else if (flag & LA_HWCAP_LSX) return CPU_LOONGSON2K1000; else return CPU_GENERIC; diff --git a/cpuid_x86.c b/cpuid_x86.c index c2486e380..c485f3ddf 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -1551,6 +1551,7 @@ int get_cpuname(void){ case 7: // Raptor Lake case 10: case 15: + case 14: // Alder Lake N if(support_avx2()) return CPUTYPE_HASWELL; if(support_avx()) @@ -2360,6 +2361,7 @@ int get_coretype(void){ case 7: // Raptor Lake case 10: case 15: + case 14: // Alder Lake N #ifndef NO_AVX2 if(support_avx2()) return CORE_HASWELL; diff --git a/ctest/Makefile b/ctest/Makefile index 0fb2450d2..9e85d23b9 100644 --- a/ctest/Makefile +++ b/ctest/Makefile @@ -208,7 +208,7 @@ FLDFLAGS = $(FFLAGS:-fPIC=) $(LDFLAGS) ifeq ($(USE_OPENMP), 1) ifeq ($(F_COMPILER), GFORTRAN) ifeq ($(C_COMPILER), CLANG) -CEXTRALIB = -lomp +CEXTRALIB += -lomp endif endif ifeq ($(F_COMPILER), NAG) diff --git a/docs/distributing.md b/docs/distributing.md new file mode 100644 index 000000000..1e6372a28 --- /dev/null +++ b/docs/distributing.md @@ -0,0 +1,270 @@ +# Guidance for redistributing OpenBLAS + +*We note that this document contains recommendations only - packagers and other +redistributors are in charge of how OpenBLAS is built and distributed in their +systems, and may have good reasons to deviate from the guidance given on this +page. These recommendations are aimed at general packaging systems, with a user +base that typically is large, open source (or freely available at least), and +doesn't behave uniformly or have a direct connection with the packager.* + +OpenBLAS has a large number of build-time options which can be used to change +how it behaves at runtime, how artifacts or symbols are named, etc. Variation +in build configuration can be necessary to achieve a given end goal within a +distribution or as an end user. However, such variation can also make it more +difficult to build on top of OpenBLAS and ship code or other packages in a way +that works across many different distros. Here we provide guidance about the +most important build options, what effects they may have when changed, and +which ones to default to. + +The Make and CMake build systems provide equivalent options and yield more or +less the same artifacts, but not exactly (the CMake builds are still +experimental). You can choose either one and the options will function in the +same way; however, the CMake outputs may require some renaming. To review +available build options, see `Makefile.rule` or `CMakeLists.txt` in the root of +the repository. + +Build options typically fall into two categories: (a) options that affect the +user interface, such as library and symbol names or APIs that are made +available, and (b) options that affect performance and runtime behavior, such +as threading behavior or CPU architecture-specific code paths.
The user +interface options are more important to keep aligned between distributions, +while for the performance-related options there are typically more reasons to +make choices that deviate from the defaults. + +Here are recommendations for user-interface-related packaging choices where it +is not likely to be a good idea to deviate (typically these are the default +settings): + +1. Include CBLAS. The CBLAS interface is widely used and it doesn't affect + binary size much, so don't turn it off. +2. Include LAPACK and LAPACKE. The LAPACK interface is also widely used, and + while it does make up a significant part of the binary size of the installed + library, that does not outweigh the regression in usability when deviating + from the default here.[^1] +3. Always distribute the pkg-config (`.pc`) and CMake (`.cmake`) dependency + detection files. These files are used by build systems when users want to + link against OpenBLAS, and there is no benefit to leaving them out. +4. Provide the LP64 interface by default, and if in addition to that you choose + to provide an ILP64 interface build as well, use a symbol suffix to avoid + symbol name clashes (see the next section). + +[^1]: All major distributions do include LAPACK as of mid 2023 as far as we +know. Older versions of Arch Linux did not, and that was known to cause +problems. + + +## ILP64 interface builds + +The LP64 (32-bit integer) interface is the default build, and has +well-established C and Fortran APIs as determined by the reference (Netlib) +BLAS and LAPACK libraries. The ILP64 (64-bit integer) interface, however, does +not have a standard API: symbol names and shared/static library names can be +produced in multiple ways, and this tends to make it difficult to use. +As of today there is an agreed-upon way of choosing names for OpenBLAS between +a number of key users/redistributors, which is the closest thing to a standard +that there is now. However, there is an ongoing standardization effort in the +reference BLAS and LAPACK libraries, which differs from the current OpenBLAS +agreed-upon convention. In this section we'll aim to explain both. + +Those two methods are fairly similar, and have a key thing in common: *using a +symbol suffix*. This is good practice; if you distribute +an ILP64 build, it is recommended to have it use a symbol suffix containing `64` in the name. +This avoids potential symbol clashes when different packages which depend on +OpenBLAS load both an LP64 and an ILP64 library into memory at the same time. + +### The current OpenBLAS agreed-upon ILP64 convention + +This convention comprises the shared library name and the symbol suffix in the +shared library. The symbol suffix to use is `64_`, implying that the library +name will be `libopenblas64_.so` and the symbols in that library end in `64_`. +The central issue where this was discussed is +[openblas#646](https://github.com/xianyi/OpenBLAS/issues/646), and adopters +include Fedora, Julia, NumPy and SciPy - SuiteSparse already used it as well. + +To build shared and static libraries with the currently recommended ILP64 +conventions with Make: +```bash +$ make INTERFACE64=1 SYMBOLSUFFIX=64_ +``` + +This will produce libraries named `libopenblas64_.so|a`, a pkg-config file +named `openblas64.pc`, and CMake and header files. + +Installing locally and inspecting the output will show a few more details: +```bash +$ make install PREFIX=$PWD/../openblas/make64 INTERFACE64=1 SYMBOLSUFFIX=64_ +$ tree . # output slightly edited down +.
+├── include +│   ├── cblas.h +│   ├── f77blas.h +│   ├── lapacke_config.h +│   ├── lapacke.h +│   ├── lapacke_mangling.h +│   ├── lapacke_utils.h +│   ├── lapack.h +│   └── openblas_config.h +└── lib + ├── cmake + │   └── openblas + │   ├── OpenBLASConfig.cmake + │   └── OpenBLASConfigVersion.cmake + ├── libopenblas64_.a + ├── libopenblas64_.so + └── pkgconfig + └── openblas64.pc +``` + +A key point are the symbol names. These will equal the LP64 symbol names, then +(for Fortran only) the compiler mangling, and then the `64_` symbol suffix. +Hence to obtain the final symbol names, we need to take into account which +Fortran compiler we are using. For the most common cases (e.g., gfortran, Intel +Fortran, or Flang), that means appending a single underscore. In that case, the +result is: + +| base API name | binary symbol name | call from Fortran code | call from C code | +|---------------|--------------------|------------------------|-----------------------| +| `dgemm` | `dgemm_64_` | `dgemm_64(...)` | `dgemm_64_(...)` | +| `cblas_dgemm` | `cblas_dgemm64_` | n/a | `cblas_dgemm64_(...)` | + +It is quite useful to have these symbol names be as uniform as possible across +different packaging systems. + +The equivalent build options with CMake are: +```bash +$ mkdir build && cd build +$ cmake .. -DINTERFACE64=1 -DSYMBOLSUFFIX=64_ -DBUILD_SHARED_LIBS=ON -DBUILD_STATIC_LIBS=ON +$ cmake --build . -j +``` + +Note that the result is not 100% identical to the Make result. For example, the +library name ends in `_64` rather than `64_` - it is recommended to rename them +to match the Make library names (also update the `libsuffix` entry in +`openblas64.pc` to match that rename). +```bash +$ cmake --install . --prefix $PWD/../../openblas/cmake64 +$ tree . +. +├── include +│   └── openblas64 +│   ├── cblas.h +│   ├── f77blas.h +│   ├── lapacke_config.h +│   ├── lapacke_example_aux.h +│   ├── lapacke.h +│   ├── lapacke_mangling.h +│   ├── lapacke_utils.h +│   ├── lapack.h +│   ├── openblas64 +│   │   └── lapacke_mangling.h +│   └── openblas_config.h +└── lib + ├── cmake + │   └── OpenBLAS64 + │   ├── OpenBLAS64Config.cmake + │   ├── OpenBLAS64ConfigVersion.cmake + │   ├── OpenBLAS64Targets.cmake + │   └── OpenBLAS64Targets-noconfig.cmake + ├── libopenblas_64.a + ├── libopenblas_64.so -> libopenblas_64.so.0 + └── pkgconfig + └── openblas64.pc +``` + + +### The upcoming standardized ILP64 convention + +While the `64_` convention above got some adoption, it's slightly hacky and is +implemented through the use of `objcopy`. An effort is ongoing for a more +broadly adopted convention in the reference BLAS and LAPACK libraries, using +(a) the `_64` suffix, and (b) applying that suffix _before_ rather than after +Fortran compiler mangling. The central issue for this is +[lapack#666](https://github.com/Reference-LAPACK/lapack/issues/666). + +For the most common cases of compiler mangling (a single `_` appended), the end +result will be: + +| base API name | binary symbol name | call from Fortran code | call from C code | +|---------------|--------------------|------------------------|-----------------------| +| `dgemm` | `dgemm_64_` | `dgemm_64(...)` | `dgemm_64_(...)` | +| `cblas_dgemm` | `cblas_dgemm_64` | n/a | `cblas_dgemm_64(...)` | + +For other compiler mangling schemes, replace the trailing `_` by the scheme in use. + +The shared library name for this `_64` convention should be `libopenblas_64.so`. + +Note: it is not yet possible to produce an OpenBLAS build which employs this +convention! 
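In the meantime, if it is unclear which suffix convention an existing ILP64 binary follows, the exported symbols can be inspected directly. The sketch below assumes a library built with the `64_` scheme shown above; the library name/path is only an example, not something produced by this patch.

```bash
# List the exported DGEMM symbols of an ILP64 build to see which suffix scheme it uses.
# Per the tables above, the current 64_ scheme exports dgemm_64_ and cblas_dgemm64_,
# while the future _64 scheme would export cblas_dgemm_64 instead.
nm -D libopenblas64_.so | grep -i 'dgemm'
```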
Once reference BLAS and LAPACK with support for `_64` have been +released, a future OpenBLAS release will support it. For now, please use the +older `64_` scheme and avoid using the name `libopenblas_64.so`; it should be +considered reserved for future use of the `_64` standard as prescribed by +reference BLAS/LAPACK. + + +## Performance and runtime behavior related build options + +For these options there are multiple reasonable or common choices. + +### Threading related options + +OpenBLAS can be built as a multi-threaded or single-threaded library, with the +default being multi-threaded. It's expected that the default `libopenblas` +library is multi-threaded; if you'd like to also distribute single-threaded +builds, consider naming them `libopenblas_sequential`. + +OpenBLAS can be built with pthreads or OpenMP as the threading model, with the +default being pthreads. Both options are commonly used, and the choice here +should not influence the shared library name. The choice will be captured by +the `.pc` file. E.g.,: +```bash +$ pkg-config --libs openblas +-fopenmp -lopenblas + +$ cat openblas.pc +... +openblas_config= ... USE_OPENMP=0 MAX_THREADS=24 +``` + +The maximum number of threads users will be able to use is determined at build +time by the `NUM_THREADS` build option. It defaults to 24, and there's a wide +range of values that are reasonable to use (up to 256). 64 is a typical choice +here; there is a memory footprint penalty that is linear in `NUM_THREADS`. +Please see `Makefile.rule` for more details. + +### CPU architecture related options + +OpenBLAS contains a lot of CPU architecture-specific optimizations, hence when +distributing to a user base with a variety of hardware, it is recommended to +enable CPU architecture runtime detection. This will dynamically select +optimized kernels for individual APIs. To do this, use the `DYNAMIC_ARCH=1` +build option. This is usually done on all common CPU families, except when +there are known issues. + +In case the CPU architecture is known (e.g. you're building binaries for macOS +M1 users), it is possible to specify the target architecture directly with the +`TARGET=` build option. + +`DYNAMIC_ARCH` and `TARGET` are covered in more detail in the main `README.md` +in this repository. + + +## Real-world examples + +OpenBLAS is likely to be distributed in one of these distribution models: + +1. As a standalone package, or multiple packages, in a packaging ecosystem like + a Linux distro, Homebrew, conda-forge or MSYS2. +2. Vendored as part of a larger package, e.g. in Julia, NumPy, SciPy, or R. +3. Locally, e.g. making available as a build on a single HPC cluster. + +The guidance on this page is most important for models (1) and (2). 
These links +to build recipes for a representative selection of packaging systems may be +helpful as a reference: + +- [Fedora](https://src.fedoraproject.org/rpms/openblas/blob/rawhide/f/openblas.spec) +- [Debian](https://salsa.debian.org/science-team/openblas/-/blob/master/debian/rules) +- [Homebrew](https://github.com/Homebrew/homebrew-core/blob/HEAD/Formula/openblas.rb) +- [MSYS2](https://github.com/msys2/MINGW-packages/blob/master/mingw-w64-openblas/PKGBUILD) +- [conda-forge](https://github.com/conda-forge/openblas-feedstock/blob/main/recipe/build.sh) +- [NumPy/SciPy](https://github.com/MacPython/openblas-libs/blob/main/tools/build_openblas.sh) +- [Nixpkgs](https://github.com/NixOS/nixpkgs/blob/master/pkgs/development/libraries/science/math/openblas/default.nix) diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c index 051513f27..a8a84acbb 100644 --- a/driver/others/blas_server.c +++ b/driver/others/blas_server.c @@ -973,7 +973,7 @@ void goto_set_num_threads(int num_threads) { increased_threads = 1; - for(i = blas_num_threads - 1; i < num_threads - 1; i++){ + for(i = (blas_num_threads > 0) ? blas_num_threads - 1 : 0; i < num_threads - 1; i++){ atomic_store_queue(&thread_status[i].queue, (blas_queue_t *)0); thread_status[i].status = THREAD_STATUS_WAKEUP; diff --git a/driver/others/blas_server_omp.c b/driver/others/blas_server_omp.c index 2e0c0f38c..fe6b4a7c0 100644 --- a/driver/others/blas_server_omp.c +++ b/driver/others/blas_server_omp.c @@ -68,6 +68,7 @@ #endif int blas_server_avail = 0; +int blas_omp_number_max = 0; extern int openblas_omp_adaptive_env(); @@ -100,8 +101,6 @@ static void adjust_thread_buffers() { void goto_set_num_threads(int num_threads) { - blas_num_threads_set = 1; - if (num_threads < 0) blas_num_threads_set = 0; if (num_threads < 1) num_threads = blas_num_threads; if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER; @@ -125,6 +124,8 @@ void openblas_set_num_threads(int num_threads) { } int blas_thread_init(void){ +if(blas_omp_number_max <= 0) + blas_omp_number_max = omp_get_max_threads(); blas_get_cpu_number(); diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c index afa33cccc..5bdfc1276 100644 --- a/driver/others/blas_server_win32.c +++ b/driver/others/blas_server_win32.c @@ -568,7 +568,7 @@ void goto_set_num_threads(int num_threads) blas_server_avail = 1; } - for(i = blas_num_threads - 1; i < num_threads - 1; i++){ + for(i = (blas_num_threads > 0) ? blas_num_threads - 1 : 0; i < num_threads - 1; i++){ blas_threads[i] = CreateThread(NULL, 0, blas_thread_server, (void *)i, diff --git a/driver/others/dynamic_arm64.c b/driver/others/dynamic_arm64.c index 0f47b287c..530d18115 100644 --- a/driver/others/dynamic_arm64.c +++ b/driver/others/dynamic_arm64.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. 
*/ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -109,6 +110,11 @@ extern gotoblas_t gotoblas_NEOVERSEN2; #else #define gotoblas_NEOVERSEN2 gotoblas_ARMV8 #endif +#ifdef DYN_ARMV8SVE +extern gotoblas_t gotoblas_ARMV8SVE; +#else +#define gotoblas_ARMV8SVE gotoblas_ARMV8 +#endif #ifdef DYN_CORTEX_A55 extern gotoblas_t gotoblas_CORTEXA55; #else @@ -128,17 +134,21 @@ extern gotoblas_t gotoblas_NEOVERSEN1; #ifndef NO_SVE extern gotoblas_t gotoblas_NEOVERSEV1; extern gotoblas_t gotoblas_NEOVERSEN2; +extern gotoblas_t gotoblas_ARMV8SVE; #else #define gotoblas_NEOVERSEV1 gotoblas_ARMV8 #define gotoblas_NEOVERSEN2 gotoblas_ARMV8 +#define gotoblas_ARMV8SVE gotoblas_ARMV8 #endif extern gotoblas_t gotoblas_THUNDERX3T110; extern gotoblas_t gotoblas_CORTEXA55; #endif extern void openblas_warning(int verbose, const char * msg); +#define FALLBACK_VERBOSE 1 +#define NEOVERSEN1_FALLBACK "OpenBLAS : Your OS does not support SVE instructions. OpenBLAS is using Neoverse N1 kernels as a fallback, which may give poorer performance.\n" -#define NUM_CORETYPES 13 +#define NUM_CORETYPES 16 /* * In case asm/hwcap.h is outdated on the build system, make sure @@ -147,6 +157,9 @@ extern void openblas_warning(int verbose, const char * msg); #ifndef HWCAP_CPUID #define HWCAP_CPUID (1 << 11) #endif +#ifndef HWCAP_SVE +#define HWCAP_SVE (1 << 22) +#endif #define get_cpu_ftr(id, var) ({ \ __asm__ __volatile__ ("mrs %0, "#id : "=r" (var)); \ @@ -168,6 +181,7 @@ static char *corename[] = { "neoversen2", "thunderx3t110", "cortexa55", + "armv8sve", "unknown" }; @@ -187,6 +201,7 @@ char *gotoblas_corename(void) { if (gotoblas == &gotoblas_NEOVERSEN2) return corename[12]; if (gotoblas == &gotoblas_THUNDERX3T110) return corename[13]; if (gotoblas == &gotoblas_CORTEXA55) return corename[14]; + if (gotoblas == &gotoblas_ARMV8SVE) return corename[15]; return corename[NUM_CORETYPES]; } @@ -221,6 +236,7 @@ static gotoblas_t *force_coretype(char *coretype) { case 12: return (&gotoblas_NEOVERSEN2); case 13: return (&gotoblas_THUNDERX3T110); case 14: return (&gotoblas_CORTEXA55); + case 15: return (&gotoblas_ARMV8SVE); } snprintf(message, 128, "Core not found: %s\n", coretype); openblas_warning(1, message); @@ -281,9 +297,17 @@ static gotoblas_t *get_coretype(void) { return &gotoblas_NEOVERSEN1; #ifndef NO_SVE case 0xd49: - return &gotoblas_NEOVERSEN2; + if (!(getauxval(AT_HWCAP) & HWCAP_SVE)) { + openblas_warning(FALLBACK_VERBOSE, NEOVERSEN1_FALLBACK); + return &gotoblas_NEOVERSEN1; + } else + return &gotoblas_NEOVERSEN2; case 0xd40: - return &gotoblas_NEOVERSEV1; + if (!(getauxval(AT_HWCAP) & HWCAP_SVE)) { + openblas_warning(FALLBACK_VERBOSE, NEOVERSEN1_FALLBACK); + return &gotoblas_NEOVERSEN1; + }else + return &gotoblas_NEOVERSEV1; #endif case 0xd05: // Cortex A55 return &gotoblas_CORTEXA55; @@ -332,6 +356,12 @@ static gotoblas_t *get_coretype(void) { snprintf(coremsg, 128, "Unknown CPU model - implementer %x part %x\n",implementer,part); openblas_warning(1, coremsg); } +#ifndef NO_SVE + if ((getauxval(AT_HWCAP) & HWCAP_SVE)) { + return &gotoblas_ARMV8SVE; + } +#endif + return NULL; #endif } diff --git a/driver/others/memory.c b/driver/others/memory.c index 3cbd17bc2..4fceae754 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -422,8 +422,6 @@ This value is equal or large than blas_cpu_number. 
This means some threads are s */ int blas_num_threads = 0; -int blas_num_threads_set = 0; - int goto_get_num_procs (void) { return blas_cpu_number; } @@ -1996,8 +1994,6 @@ This value is equal or large than blas_cpu_number. This means some threads are s */ int blas_num_threads = 0; -int blas_num_threads_set = 0; - int goto_get_num_procs (void) { return blas_cpu_number; } diff --git a/driver/others/memory_qalloc.c b/driver/others/memory_qalloc.c index 0b38d1887..6174d9b75 100644 --- a/driver/others/memory_qalloc.c +++ b/driver/others/memory_qalloc.c @@ -283,7 +283,6 @@ The numbers of threads in the thread pool. This value is equal or large than blas_cpu_number. This means some threads are sleep. */ int blas_num_threads = 0; -int blas_num_threads_set = 0; int goto_get_num_procs (void) { return blas_cpu_number; diff --git a/f_check b/f_check index d071e016e..526c41dc6 100755 --- a/f_check +++ b/f_check @@ -101,7 +101,14 @@ else *flang*) vendor=FLANG openmp='-fopenmp' - ;; + data=`$compiler -v 2>&1 > /dev/null ` + v="${data#*version *}" + v="${v%%*.}" + major="${v%%.*}" + if [ "$major" -ge 17 ]; then + vendor=FLANGNEW + fi + ;; *ifort*|*ifx*) vendor=INTEL openmp='-fopenmp' diff --git a/interface/geadd.c b/interface/geadd.c index f0befa14a..3a0ea015d 100644 --- a/interface/geadd.c +++ b/interface/geadd.c @@ -68,7 +68,7 @@ void NAME(blasint *M, blasint *N, FLOAT *ALPHA, FLOAT *a, blasint *LDA, info = 0; - if (lda < MAX(1, m)) info = 6; + if (lda < MAX(1, m)) info = 5; if (ldc < MAX(1, m)) info = 8; if (n < 0) info = 2; diff --git a/interface/nrm2.c b/interface/nrm2.c index dc8c08e9a..331ebc3d0 100644 --- a/interface/nrm2.c +++ b/interface/nrm2.c @@ -54,6 +54,21 @@ FLOATRET NAME(blasint *N, FLOAT *x, blasint *INCX){ if (n <= 0) return 0.; +#ifndef COMPLEX + if (n == 1) +#ifdef DOUBLE + return fabs(x[0]); +#else + return fabsf(x[0]); +#endif +#endif + + if (incx < 0) +#ifdef COMPLEX + x -= (n - 1) * incx * 2; +#else + x -= (n - 1) * incx; +#endif IDEBUG_START; FUNCTION_PROFILE_START(); @@ -82,6 +97,22 @@ FLOAT CNAME(blasint n, FLOAT *x, blasint incx){ if (n <= 0) return 0.; + #ifndef COMPLEX + if (n == 1) +#ifdef DOUBLE + return fabs(x[0]); +#else + return fabsf(x[0]); +#endif +#endif + + if (incx < 0) +#ifdef COMPLEX + x -= (n - 1) * incx * 2; +#else + x -= (n - 1) * incx; +#endif + IDEBUG_START; FUNCTION_PROFILE_START(); diff --git a/kernel/Makefile b/kernel/Makefile index d426a1bdb..1e0a0074f 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -33,7 +33,7 @@ endif ifdef TARGET_CORE ifeq ($(TARGET_CORE), SAPPHIRERAPIDS) override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) - ifeq ($(GCCVERSIONGTEQ11), 1) + ifeq (1, $(filter 1,$(GCCVERSIONGTEQ11) $(CLANGVERSIONGTEQ12))) override CFLAGS += -march=sapphirerapids else override CFLAGS += -march=skylake-avx512 -mavx512f @@ -48,7 +48,7 @@ ifeq ($(TARGET_CORE), SAPPHIRERAPIDS) endif else ifeq ($(TARGET_CORE), COOPERLAKE) override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) - ifeq ($(GCCVERSIONGTEQ10), 1) + ifeq (1, $(filter 1,$(GCCVERSIONGTEQ10) $(CLANGVERSIONGTEQ9))) override CFLAGS += -march=cooperlake else override CFLAGS += -march=skylake-avx512 -mavx512f @@ -77,6 +77,12 @@ else ifeq ($(TARGET_CORE), ZEN) override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) $(AVX2OPT) else ifeq ($(TARGET_CORE), LOONGSON3R4) override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) $(MSA_FLAGS) +else ifneq ($(filter NEOVERSEN2 NEOVERSEV1, $(TARGET_CORE)),) + ifeq ($(C_COMPILER), PGI) + override CFLAGS 
+= -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -Msve_intrinsics + else + override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) + endif else override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) endif diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index bea6cb048..174a1d41b 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -35,6 +35,12 @@ USE_TRMM = 1 endif endif +ifneq ($(DYNAMIC_ARCH), 1) +ifeq ($(TARGET), MIPS64_GENERIC) +USE_TRMM = 1 +endif +endif + ifeq ($(CORE), HASWELL) USE_TRMM = 1 endif diff --git a/kernel/arm/nrm2.c b/kernel/arm/nrm2.c index fcff09337..8cc189fe3 100644 --- a/kernel/arm/nrm2.c +++ b/kernel/arm/nrm2.c @@ -57,7 +57,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT absxi = 0.0; - if (n <= 0 || inc_x <= 0) return(0.0); + if (n <= 0 || inc_x == 0) return(0.0); if ( n == 1 ) return( ABS(x[0]) ); n *= inc_x; diff --git a/kernel/arm/znrm2.c b/kernel/arm/znrm2.c index fc1c8b54a..28bb0eda5 100644 --- a/kernel/arm/znrm2.c +++ b/kernel/arm/znrm2.c @@ -57,7 +57,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) BLASLONG inc_x2; FLOAT temp; - if (n <= 0 || inc_x <= 0) return(0.0); + if (n <= 0 || inc_x == 0) return(0.0); inc_x2 = 2 * inc_x; diff --git a/kernel/arm64/KERNEL.ARMV8SVE b/kernel/arm64/KERNEL.ARMV8SVE index 07393624c..ccbce27e1 100644 --- a/kernel/arm64/KERNEL.ARMV8SVE +++ b/kernel/arm64/KERNEL.ARMV8SVE @@ -57,7 +57,7 @@ CAMAXKERNEL = zamax.S ZAMAXKERNEL = zamax.S SAXPYKERNEL = axpy.S -DAXPYKERNEL = axpy.S +DAXPYKERNEL = daxpy_thunderx2t99.S CAXPYKERNEL = zaxpy.S ZAXPYKERNEL = zaxpy.S @@ -81,45 +81,35 @@ DGEMVTKERNEL = gemv_t.S CGEMVTKERNEL = zgemv_t.S ZGEMVTKERNEL = zgemv_t.S +SASUMKERNEL = sasum_thunderx2t99.c +DASUMKERNEL = dasum_thunderx2t99.c +CASUMKERNEL = casum_thunderx2t99.c +ZASUMKERNEL = zasum_thunderx2t99.c -SASUMKERNEL = asum.S -DASUMKERNEL = asum.S -CASUMKERNEL = casum.S -ZASUMKERNEL = zasum.S +SCOPYKERNEL = copy_thunderx2t99.c +DCOPYKERNEL = copy_thunderx2t99.c +CCOPYKERNEL = copy_thunderx2t99.c +ZCOPYKERNEL = copy_thunderx2t99.c -SCOPYKERNEL = copy.S -DCOPYKERNEL = copy.S -CCOPYKERNEL = copy.S -ZCOPYKERNEL = copy.S +SSWAPKERNEL = swap_thunderx2t99.S +DSWAPKERNEL = swap_thunderx2t99.S +CSWAPKERNEL = swap_thunderx2t99.S +ZSWAPKERNEL = swap_thunderx2t99.S -SSWAPKERNEL = swap.S -DSWAPKERNEL = swap.S -CSWAPKERNEL = swap.S -ZSWAPKERNEL = swap.S +ISAMAXKERNEL = iamax_thunderx2t99.c +IDAMAXKERNEL = iamax_thunderx2t99.c +ICAMAXKERNEL = izamax_thunderx2t99.c +IZAMAXKERNEL = izamax_thunderx2t99.c -ISAMAXKERNEL = iamax.S -IDAMAXKERNEL = iamax.S -ICAMAXKERNEL = izamax.S -IZAMAXKERNEL = izamax.S +SNRM2KERNEL = scnrm2_thunderx2t99.c +DNRM2KERNEL = dznrm2_thunderx2t99.c +CNRM2KERNEL = scnrm2_thunderx2t99.c +ZNRM2KERNEL = dznrm2_thunderx2t99.c -SNRM2KERNEL = nrm2.S -DNRM2KERNEL = nrm2.S -CNRM2KERNEL = znrm2.S -ZNRM2KERNEL = znrm2.S - -DDOTKERNEL = dot.S -ifneq ($(C_COMPILER), PGI) -SDOTKERNEL = ../generic/dot.c -else -SDOTKERNEL = dot.S -endif -ifneq ($(C_COMPILER), PGI) -CDOTKERNEL = zdot.S -ZDOTKERNEL = zdot.S -else -CDOTKERNEL = ../arm/zdot.c -ZDOTKERNEL = ../arm/zdot.c -endif +DDOTKERNEL = dot.c +SDOTKERNEL = dot.c +CDOTKERNEL = zdot_thunderx2t99.c +ZDOTKERNEL = zdot_thunderx2t99.c DSDOTKERNEL = dot.S DGEMM_BETA = dgemm_beta.S @@ -170,8 +160,8 @@ DSYMMLCOPY_M = symm_lcopy_sve.c CGEMMKERNEL = cgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S CTRMMKERNEL = ctrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S -CGEMMINCOPY = cgemm_ncopy_sve_v1.c -CGEMMITCOPY = cgemm_tcopy_sve_v1.c +CGEMMINCOPY = 
gemm_ncopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c +CGEMMITCOPY = gemm_tcopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c CGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c CGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c @@ -194,8 +184,8 @@ CSYMMLCOPY_M = zsymm_lcopy_sve.c ZGEMMKERNEL = zgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S ZTRMMKERNEL = ztrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S -ZGEMMINCOPY = zgemm_ncopy_sve_v1.c -ZGEMMITCOPY = zgemm_tcopy_sve_v1.c +ZGEMMINCOPY = gemm_ncopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c +ZGEMMITCOPY = gemm_tcopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c diff --git a/kernel/arm64/KERNEL.NEOVERSEV1 b/kernel/arm64/KERNEL.NEOVERSEV1 index d6617e8a4..bc5999097 100644 --- a/kernel/arm64/KERNEL.NEOVERSEV1 +++ b/kernel/arm64/KERNEL.NEOVERSEV1 @@ -1,98 +1 @@ include $(KERNELDIR)/KERNEL.ARMV8SVE - -DAXPYKERNEL = daxpy_thunderx2t99.S - -SASUMKERNEL = sasum_thunderx2t99.c -DASUMKERNEL = dasum_thunderx2t99.c -CASUMKERNEL = casum_thunderx2t99.c -ZASUMKERNEL = zasum_thunderx2t99.c - -SCOPYKERNEL = copy_thunderx2t99.c -DCOPYKERNEL = copy_thunderx2t99.c -CCOPYKERNEL = copy_thunderx2t99.c -ZCOPYKERNEL = copy_thunderx2t99.c - -SSWAPKERNEL = swap_thunderx2t99.S -DSWAPKERNEL = swap_thunderx2t99.S -CSWAPKERNEL = swap_thunderx2t99.S -ZSWAPKERNEL = swap_thunderx2t99.S - -ISAMAXKERNEL = iamax_thunderx2t99.c -IDAMAXKERNEL = iamax_thunderx2t99.c -ICAMAXKERNEL = izamax_thunderx2t99.c -IZAMAXKERNEL = izamax_thunderx2t99.c - -SNRM2KERNEL = scnrm2_thunderx2t99.c -DNRM2KERNEL = dznrm2_thunderx2t99.c -CNRM2KERNEL = scnrm2_thunderx2t99.c -ZNRM2KERNEL = dznrm2_thunderx2t99.c - -DDOTKERNEL = dot.c -SDOTKERNEL = dot.c -CDOTKERNEL = zdot_thunderx2t99.c -ZDOTKERNEL = zdot_thunderx2t99.c - -CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -CTRMMUNCOPY_M = -CTRMMLNCOPY_M = -CTRMMUTCOPY_M = -CTRMMLTCOPY_M = -CHEMMLTCOPY_M = -CHEMMUTCOPY_M = -CSYMMUCOPY_M = -CSYMMLCOPY_M = - -CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S -CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S -ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) -CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c -CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c -CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) -CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) -else -CGEMMINCOPYOBJ = -CGEMMITCOPYOBJ = -endif -CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c -CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c -CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) -CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) - -ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -ZTRSMCOPYLN_M = -ZTRSMCOPYLT_M = -ZTRSMCOPYUN_M = -ZTRSMCOPYUT_M = - -ZTRMMUNCOPY_M = -ZTRMMLNCOPY_M = -ZTRMMUTCOPY_M = -ZTRMMLTCOPY_M = -ZHEMMLTCOPY_M = -ZHEMMUTCOPY_M = -ZSYMMUCOPY_M = -ZSYMMLCOPY_M = - -ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S -ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S -ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) -ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c -ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c -ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) -ZGEMMITCOPYOBJ = 
zgemm_itcopy$(TSUFFIX).$(SUFFIX) -else -ZGEMMINCOPYOBJ = -ZGEMMITCOPYOBJ = -endif -ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c -ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c -ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) -ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) diff --git a/kernel/arm64/cgemm_kernel_sve_v1x4.S b/kernel/arm64/cgemm_kernel_sve_v1x4.S index 38770f66b..2136ebbee 100644 --- a/kernel/arm64/cgemm_kernel_sve_v1x4.S +++ b/kernel/arm64/cgemm_kernel_sve_v1x4.S @@ -240,7 +240,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add pB, pB, 32 - prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] .endm .macro KERNELv1x4_M1 @@ -276,9 +275,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1rw z15.s, p0/z, [pB, 28] add pB, pB, 32 - prfm PLDL1KEEP, [pA, #A_PRE_SIZE] - - prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] .endm .macro KERNELv1x4_M2 @@ -313,11 +309,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. OP_ri z23.s, p1/m, z2.s, z15.s ld1rw z15.s, p0/z, [pB, 28] - prfm PLDL1KEEP, [pB, #B_PRE_SIZE] - add pB, pB, 32 - - prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] .endm .macro KERNELv1x4_E @@ -341,10 +333,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. OP_ii z22.s, p1/m, z3.s, z15.s OP_ri z23.s, p1/m, z2.s, z15.s - prfm PLDL1KEEP, [pB, #B_PRE_SIZE] - - prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] - .endm .macro KERNELv1x4_SUB @@ -383,13 +371,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. OP_ii z22.s, p1/m, z1.s, z15.s OP_ri z23.s, p1/m, z0.s, z15.s - prfm PLDL1KEEP, [pB, #B_PRE_SIZE] - prfm PLDL1KEEP, [pA, #A_PRE_SIZE] .endm .macro SAVEv1x4 - prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] - ld2w {z24.s, z25.s}, p1/z, [pCRow0] fmla z24.s, p1/m, z16.s, alphaz_R fmls z24.s, p1/m, z17.s, alphaz_I @@ -407,8 +391,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. st2w {z26.s, z27.s}, p1, [pCRow1] add pCRow1, pCRow1, lanes, lsl #3 - prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] - ld2w {z28.s, z29.s}, p1/z, [pCRow2] fmla z28.s, p1/m, z20.s, alphaz_R fmls z28.s, p1/m, z21.s, alphaz_I @@ -425,12 +407,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmla z31.s, p1/m, z23.s, alphaz_R st2w {z30.s, z31.s}, p1, [pCRow3] - prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] - add pCRow3, pCRow3, lanes, lsl #3 // pC = pC + lanes * 2 *4 - prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] - .endm /******************************************************************************/ @@ -466,8 +444,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVEv1x2 - prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] - ld2w {z24.s, z25.s}, p1/z, [pCRow0] fmla z24.s, p1/m, z16.s, alphaz_R fmls z24.s, p1/m, z17.s, alphaz_I @@ -485,10 +461,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. st2w {z26.s, z27.s}, p1, [pCRow1] add pCRow1, pCRow1, lanes, lsl #3 - prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] - - prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] - .endm /******************************************************************************/ @@ -516,8 +488,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVEv1x1 - prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] - ld2w {z24.s, z25.s}, p1/z, [pCRow0] fmla z24.s, p1/m, z16.s, alphaz_R fmls z24.s, p1/m, z17.s, alphaz_I @@ -527,8 +497,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 2 *4 - prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] - .endm /******************************************************************************/ @@ -553,9 +521,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stp x26, x27, [sp, #(9 * 16)] str x28, [sp, #(10 * 16)] - prfm PLDL1KEEP, [origPB] - prfm PLDL1KEEP, [origPA] - fmov alphaR, s0 dup alphaz_R, alphaR fmov alphaI, s1 @@ -676,10 +641,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. bne .Lcgemm_kernel_L4_Mv1_46 .Lcgemm_kernel_L4_Mv1_100: - prfm PLDL1KEEP, [pA] - prfm PLDL1KEEP, [pA, #64] - prfm PLDL1KEEP, [origPB] - SAVEv1x4 .Lcgemm_kernel_L4_Mv1_END: diff --git a/kernel/arm64/cgemm_ncopy_sve_v1.c b/kernel/arm64/cgemm_ncopy_sve_v1.c index 6aa44a8f6..2fdaf5fcd 100644 --- a/kernel/arm64/cgemm_ncopy_sve_v1.c +++ b/kernel/arm64/cgemm_ncopy_sve_v1.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -52,7 +53,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ boffset = b; j = 0; - svbool_t pg = svwhilelt_b32(j, n); + svbool_t pg = svwhilelt_b32((uint64_t)j, (uint64_t)n); uint32_t active = svcntp_b32(svptrue_b32(), pg); do { @@ -69,7 +70,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ aoffset += active * lda * 2; j += svcntw(); - pg = svwhilelt_b32(j, n); + pg = svwhilelt_b32((uint64_t)j, (uint64_t)n); active = svcntp_b32(svptrue_b32(), pg); diff --git a/kernel/arm64/cgemm_tcopy_sve_v1.c b/kernel/arm64/cgemm_tcopy_sve_v1.c index 748cd954e..086a2fed1 100644 --- a/kernel/arm64/cgemm_tcopy_sve_v1.c +++ b/kernel/arm64/cgemm_tcopy_sve_v1.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. 
*/ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -50,7 +51,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ boffset = b; j = 0; - svbool_t pg = svwhilelt_b32(j, n); + svbool_t pg = svwhilelt_b32((uint64_t)j, (uint64_t)n); uint32_t active = svcntp_b32(svptrue_b32(), pg); do { @@ -66,7 +67,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ aoffset += active * 2; j += svcntw(); - pg = svwhilelt_b32(j, n); + pg = svwhilelt_b32((uint64_t)j, (uint64_t)n); active = svcntp_b32(svptrue_b32(), pg); } while (svptest_any(svptrue_b32(), pg)); diff --git a/kernel/arm64/dot_kernel_sve.c b/kernel/arm64/dot_kernel_sve.c index 8460e0d5e..9c057551e 100644 --- a/kernel/arm64/dot_kernel_sve.c +++ b/kernel/arm64/dot_kernel_sve.c @@ -50,8 +50,8 @@ static FLOAT dot_kernel_sve(BLASLONG n, FLOAT *x, FLOAT *y) { BLASLONG sve_width = SVE_WIDTH; for (BLASLONG i = 0; i < n; i += sve_width * 2) { - svbool_t pg_a = SVE_WHILELT(i, n); - svbool_t pg_b = SVE_WHILELT(i + sve_width, n); + svbool_t pg_a = SVE_WHILELT((uint64_t)i, (uint64_t)n); + svbool_t pg_b = SVE_WHILELT((uint64_t)(i + sve_width), (uint64_t)n); SVE_TYPE x_vec_a = svld1(pg_a, &x[i]); SVE_TYPE y_vec_a = svld1(pg_a, &y[i]); diff --git a/kernel/arm64/gemm_ncopy_complex_sve_v1x4.c b/kernel/arm64/gemm_ncopy_complex_sve_v1x4.c new file mode 100644 index 000000000..90f867b44 --- /dev/null +++ b/kernel/arm64/gemm_ncopy_complex_sve_v1x4.c @@ -0,0 +1,121 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A00 PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include +#include +#include + +#include "common.h" + +#ifdef DOUBLE +#define COUNT "cntd" +#define SV_TYPE svfloat64_t +#define SV_INDEX svuint64_t +#define SV_INDEXER svindex_u64 +#define SV_TRUE svptrue_b64 +#define SV_WHILE svwhilelt_b64 +#else +#define COUNT "cntw" +#define SV_TYPE svfloat32_t +#define SV_INDEX svuint32_t +#define SV_INDEXER svindex_u32 +#define SV_TRUE svptrue_b32 +#define SV_WHILE svwhilelt_b32 +#endif + +#define INNER_COPY(pg, a_offset_inner, b_offset, lda, active) \ + a_vec_real = svld1_gather_index(pg, a_offset_inner, lda_vec); \ + a_vec_imag = svld1_gather_index(pg, a_offset_inner + 1, lda_vec); \ + svst2(pg, b_offset, svcreate2(a_vec_real, a_vec_imag)); \ + a_offset_inner += 2; \ + b_offset += active * 2; + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) { + uint64_t sve_size; + asm(COUNT" %[SIZE_]" : [SIZE_] "=r" (sve_size) : : ); + + IFLOAT *a_offset, *a_offset_inner, *b_offset; + a_offset = a; + b_offset = b; + + SV_INDEX lda_vec = SV_INDEXER(0LL, lda * 2); + SV_TYPE a_vec_real; + SV_TYPE a_vec_imag; + svbool_t pg_true = SV_TRUE(); + + BLASLONG single_vectors_n = n & -sve_size; + for (BLASLONG j = 0; j < single_vectors_n; j += sve_size) { + a_offset_inner = a_offset; + + svbool_t pg = pg_true; + uint64_t active = sve_size; + uint64_t i_cnt = m >> 2; + while (i_cnt--) { + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + } + + if (m & 2) { + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + } + + if (m & 1) { + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + } + + a_offset += sve_size * lda * 2; + } + + BLASLONG remaining_n = n - single_vectors_n; + if (remaining_n) { + a_offset_inner = a_offset; + svbool_t pg = SV_WHILE((uint64_t)0L, (uint64_t)remaining_n); + uint64_t active = remaining_n; + uint64_t i_cnt = m >> 2; + while (i_cnt--) { + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + } + + if (m & 2) { + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + } + + if (m & 1) { + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + } + } + + return 0; +} + diff --git a/kernel/arm64/gemm_ncopy_sve_v1x8.c b/kernel/arm64/gemm_ncopy_sve_v1x8.c index 113b1ee40..7b2a2e767 100644 --- a/kernel/arm64/gemm_ncopy_sve_v1x8.c +++ b/kernel/arm64/gemm_ncopy_sve_v1x8.c @@ -107,7 +107,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) { BLASLONG remaining_n = n - single_vectors_n; if (remaining_n) { a_offset_inner = a_offset; - svbool_t pg = SV_WHILE(0L, remaining_n); + svbool_t pg = SV_WHILE((uint64_t)0L, (uint64_t)remaining_n); uint64_t active = remaining_n; uint64_t i_cnt = m >> 2; while (i_cnt--) { diff --git a/kernel/arm64/gemm_tcopy_complex_sve_v1x4.c b/kernel/arm64/gemm_tcopy_complex_sve_v1x4.c new file mode 100644 index 000000000..975166a2e --- /dev/null +++ b/kernel/arm64/gemm_tcopy_complex_sve_v1x4.c @@ -0,0 +1,115 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights 
reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A00 PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include +#include + +#include "common.h" + +#ifdef DOUBLE +#define COUNT "cntd" +#define SV_TYPE svfloat64x2_t +#define SV_TRUE svptrue_b64 +#define SV_WHILE svwhilelt_b64 +#else +#define COUNT "cntw" +#define SV_TYPE svfloat32x2_t +#define SV_TRUE svptrue_b32 +#define SV_WHILE svwhilelt_b32 +#endif + +#define INNER_COPY(pg, a_offset_inner, b_offset, lda, active) \ + a_vec = svld2(pg, a_offset_inner); \ + svst2(pg, b_offset, a_vec); \ + a_offset_inner += lda * 2; \ + b_offset += active * 2; + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ + uint64_t sve_size = svcntw(); + asm(COUNT" %[SIZE_]" : [SIZE_] "=r" (sve_size) : : ); + + IFLOAT *a_offset, *a_offset_inner, *b_offset; + a_offset = a; + b_offset = b; + + SV_TYPE a_vec; + svbool_t pg_true = SV_TRUE(); + + BLASLONG single_vectors_n = n & -sve_size; + for (BLASLONG j = 0; j < single_vectors_n; j += sve_size) { + a_offset_inner = a_offset; + + svbool_t pg = pg_true; + uint64_t active = sve_size; + uint64_t i_cnt = m >> 2; + while (i_cnt--) { + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + } + + if (m & 2) { + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + } + + if (m & 1) { + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + } + + a_offset += sve_size * 2; + } + + BLASLONG remaining_n = n - single_vectors_n; + if (remaining_n) { + a_offset_inner = a_offset; + svbool_t pg = SV_WHILE((uint64_t)0L, (uint64_t)remaining_n); + uint64_t active = remaining_n; + uint64_t i_cnt = m >> 2; + while (i_cnt--) { + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + } + + if (m & 2) { 
+ INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + } + + if (m & 1) { + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + } + } + + return 0; +} + + diff --git a/kernel/arm64/gemm_tcopy_sve_v1x8.c b/kernel/arm64/gemm_tcopy_sve_v1x8.c index 68a2cc07c..9a93b6cb7 100644 --- a/kernel/arm64/gemm_tcopy_sve_v1x8.c +++ b/kernel/arm64/gemm_tcopy_sve_v1x8.c @@ -100,7 +100,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ BLASLONG remaining_n = n - single_vectors_n; if (remaining_n) { a_offset_inner = a_offset; - svbool_t pg = SV_WHILE(0L, remaining_n); + svbool_t pg = SV_WHILE((uint64_t)0L, (uint64_t)remaining_n); uint64_t active = remaining_n; uint64_t i_cnt = m >> 2; while (i_cnt--) { diff --git a/kernel/arm64/sgemm_beta.S b/kernel/arm64/sgemm_beta.S old mode 100755 new mode 100644 diff --git a/kernel/arm64/symm_lcopy_sve.c b/kernel/arm64/symm_lcopy_sve.c index 6ba4afc8b..e138f0647 100644 --- a/kernel/arm64/symm_lcopy_sve.c +++ b/kernel/arm64/symm_lcopy_sve.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -52,7 +53,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON svint64_t one_vec = svdup_s64(1LL); int64_t j = 0; - svbool_t pg = svwhilelt_b64(j, n); + svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); int64_t active = svcntp_b64(svptrue_b64(), pg); svint64_t index_neg = svindex_s64(0LL, -1LL); svint64_t index = svindex_s64(0LL, 1LL); @@ -86,7 +87,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON posX += sve_size; posX_vec = svdup_s64(posX); j += sve_size; - pg = svwhilelt_b64(j, n); + pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); active = svcntp_b64(svptrue_b64(), pg); } while (svptest_any(svptrue_b64(), pg)); @@ -99,7 +100,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON int32_t N = n; int32_t j = 0; - svbool_t pg = svwhilelt_b32(j, N); + svbool_t pg = svwhilelt_b32((uint32_t)j, (uint32_t)N); int32_t active = svcntp_b32(svptrue_b32(), pg); svint32_t index_neg = svindex_s32(0, -1); svint32_t index = svindex_s32(0, 1); @@ -133,7 +134,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON posX += sve_size; posX_vec = svdup_s32(posX); j += sve_size; - pg = svwhilelt_b32(j, N); + pg = svwhilelt_b32((uint32_t)j, (uint32_t)N); active = svcntp_b32(svptrue_b32(), pg); } while (svptest_any(svptrue_b32(), pg)); diff --git a/kernel/arm64/symm_ucopy_sve.c b/kernel/arm64/symm_ucopy_sve.c index 32da5bd16..9a4cb6d4f 100644 --- a/kernel/arm64/symm_ucopy_sve.c +++ b/kernel/arm64/symm_ucopy_sve.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. 
*/ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -52,7 +53,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON svint64_t one_vec = svdup_s64(1LL); int64_t j = 0; - svbool_t pg = svwhilelt_b64(j, n); + svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); int64_t active = svcntp_b64(svptrue_b64(), pg); svint64_t index_neg = svindex_s64(0LL, -1LL); svint64_t index = svindex_s64(0LL, 1LL); @@ -86,7 +87,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON posX += sve_size; posX_vec = svdup_s64(posX); j += sve_size; - pg = svwhilelt_b64(j, n); + pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); active = svcntp_b64(svptrue_b64(), pg); } while (svptest_any(svptrue_b64(), pg)); @@ -99,7 +100,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON int32_t N = n; int32_t j = 0; - svbool_t pg = svwhilelt_b32(j, N); + svbool_t pg = svwhilelt_b32((uint32_t)j, (uint32_t)N); int32_t active = svcntp_b32(svptrue_b32(), pg); svint32_t index_neg = svindex_s32(0, -1); svint32_t index = svindex_s32(0, 1); @@ -133,7 +134,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON posX += sve_size; posX_vec = svdup_s32(posX); j += sve_size; - pg = svwhilelt_b32(j, N); + pg = svwhilelt_b32((uint32_t)j, (uint32_t)N); active = svcntp_b32(svptrue_b32(), pg); } while (svptest_any(svptrue_b32(), pg)); diff --git a/kernel/arm64/trmm_lncopy_sve_v1.c b/kernel/arm64/trmm_lncopy_sve_v1.c index 918e945ac..c7f79e3fd 100644 --- a/kernel/arm64/trmm_lncopy_sve_v1.c +++ b/kernel/arm64/trmm_lncopy_sve_v1.c @@ -52,11 +52,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON FLOAT *ao; #ifdef DOUBLE svint64_t index = svindex_s64(0LL, lda); - svbool_t pn = svwhilelt_b64(js, n); + svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); int n_active = svcntp_b64(svptrue_b64(), pn); #else svint32_t index = svindex_s32(0, lda); - svbool_t pn = svwhilelt_b32(js, n); + svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); int n_active = svcntp_b32(svptrue_b32(), pn); #endif do @@ -123,11 +123,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON posY += n_active; js += n_active; #ifdef DOUBLE - pn = svwhilelt_b64(js, n); + pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); n_active = svcntp_b64(svptrue_b64(), pn); } while (svptest_any(svptrue_b64(), pn)); #else - pn = svwhilelt_b32(js, n); + pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); n_active = svcntp_b32(svptrue_b32(), pn); } while (svptest_any(svptrue_b32(), pn)); #endif diff --git a/kernel/arm64/trmm_ltcopy_sve_v1.c b/kernel/arm64/trmm_ltcopy_sve_v1.c index b76cc56de..b3ba68973 100644 --- a/kernel/arm64/trmm_ltcopy_sve_v1.c +++ b/kernel/arm64/trmm_ltcopy_sve_v1.c @@ -51,10 +51,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON FLOAT *ao; js = 0; #ifdef DOUBLE - svbool_t pn = svwhilelt_b64(js, n); + svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); int n_active = svcntp_b64(svptrue_b64(), pn); #else - svbool_t pn = svwhilelt_b32(js, n); + svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); int n_active = svcntp_b32(svptrue_b32(), pn); #endif do @@ -122,11 +122,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON posY += n_active; js += n_active; #ifdef DOUBLE - pn = svwhilelt_b64(js, n); + pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); n_active = svcntp_b64(svptrue_b64(), pn); } while 
(svptest_any(svptrue_b64(), pn)); #else - pn = svwhilelt_b32(js, n); + pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); n_active = svcntp_b32(svptrue_b32(), pn); } while (svptest_any(svptrue_b32(), pn)); #endif diff --git a/kernel/arm64/trmm_uncopy_sve_v1.c b/kernel/arm64/trmm_uncopy_sve_v1.c index 75fa163ae..a47d2096c 100644 --- a/kernel/arm64/trmm_uncopy_sve_v1.c +++ b/kernel/arm64/trmm_uncopy_sve_v1.c @@ -52,11 +52,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON FLOAT *ao; #ifdef DOUBLE svint64_t index = svindex_s64(0LL, lda); - svbool_t pn = svwhilelt_b64(js, n); + svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); int n_active = svcntp_b64(svptrue_b64(), pn); #else svint32_t index = svindex_s32(0, lda); - svbool_t pn = svwhilelt_b32(js, n); + svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); int n_active = svcntp_b32(svptrue_b32(), pn); #endif do @@ -123,11 +123,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON posY += n_active; js += n_active; #ifdef DOUBLE - pn = svwhilelt_b64(js, n); + pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); n_active = svcntp_b64(svptrue_b64(), pn); } while (svptest_any(svptrue_b64(), pn)); #else - pn = svwhilelt_b32(js, n); + pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); n_active = svcntp_b32(svptrue_b32(), pn); } while (svptest_any(svptrue_b32(), pn)); #endif diff --git a/kernel/arm64/trmm_utcopy_sve_v1.c b/kernel/arm64/trmm_utcopy_sve_v1.c index 36a03242a..c5188beb4 100644 --- a/kernel/arm64/trmm_utcopy_sve_v1.c +++ b/kernel/arm64/trmm_utcopy_sve_v1.c @@ -51,10 +51,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON FLOAT *ao; js = 0; #ifdef DOUBLE - svbool_t pn = svwhilelt_b64(js, n); + svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); int n_active = svcntp_b64(svptrue_b64(), pn); #else - svbool_t pn = svwhilelt_b32(js, n); + svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); int n_active = svcntp_b32(svptrue_b32(), pn); #endif do @@ -121,11 +121,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON posY += n_active; js += n_active; #ifdef DOUBLE - pn = svwhilelt_b64(js, n); + pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); n_active = svcntp_b64(svptrue_b64(), pn); } while (svptest_any(svptrue_b64(), pn)); #else - pn = svwhilelt_b32(js, n); + pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); n_active = svcntp_b32(svptrue_b32(), pn); } while (svptest_any(svptrue_b32(), pn)); #endif diff --git a/kernel/arm64/trsm_lncopy_sve.c b/kernel/arm64/trsm_lncopy_sve.c index 5a9d4194a..2895eb85d 100644 --- a/kernel/arm64/trsm_lncopy_sve.c +++ b/kernel/arm64/trsm_lncopy_sve.c @@ -56,13 +56,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT #ifdef DOUBLE int64_t js = 0; svint64_t index = svindex_s64(0LL, lda); - svbool_t pn = svwhilelt_b64(js, n); + svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); int n_active = svcntp_b64(svptrue_b64(), pn); #else int32_t N = n; int32_t js = 0; svint32_t index = svindex_s32(0, lda); - svbool_t pn = svwhilelt_b32(js, N); + svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); int n_active = svcntp_b32(svptrue_b32(), pn); #endif do { @@ -106,11 +106,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT js += n_active; #ifdef DOUBLE - pn = svwhilelt_b64(js, n); + pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); n_active = svcntp_b64(svptrue_b64(), pn); } while (svptest_any(svptrue_b64(), pn)); #else - pn 
= svwhilelt_b32(js, N); + pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); n_active = svcntp_b32(svptrue_b32(), pn); } while (svptest_any(svptrue_b32(), pn)); #endif diff --git a/kernel/arm64/trsm_ltcopy_sve.c b/kernel/arm64/trsm_ltcopy_sve.c index ac4019e26..fdda992e0 100644 --- a/kernel/arm64/trsm_ltcopy_sve.c +++ b/kernel/arm64/trsm_ltcopy_sve.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -55,12 +56,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT jj = offset; #ifdef DOUBLE int64_t js = 0; - svbool_t pn = svwhilelt_b64(js, n); + svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); int n_active = svcntp_b64(svptrue_b64(), pn); #else int32_t N = n; int32_t js = 0; - svbool_t pn = svwhilelt_b32(js, N); + svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); int n_active = svcntp_b32(svptrue_b32(), pn); #endif do { @@ -104,11 +105,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT js += n_active; #ifdef DOUBLE - pn = svwhilelt_b64(js, n); + pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); n_active = svcntp_b64(svptrue_b64(), pn); } while (svptest_any(svptrue_b64(), pn)); #else - pn = svwhilelt_b32(js, N); + pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); n_active = svcntp_b32(svptrue_b32(), pn); } while (svptest_any(svptrue_b32(), pn)); #endif diff --git a/kernel/arm64/trsm_uncopy_sve.c b/kernel/arm64/trsm_uncopy_sve.c index 8fdcd0f4b..1a03aa974 100644 --- a/kernel/arm64/trsm_uncopy_sve.c +++ b/kernel/arm64/trsm_uncopy_sve.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -56,13 +57,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT #ifdef DOUBLE int64_t js = 0; svint64_t index = svindex_s64(0LL, lda); - svbool_t pn = svwhilelt_b64(js, n); + svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); int n_active = svcntp_b64(svptrue_b64(), pn); #else int32_t N = n; int32_t js = 0; svint32_t index = svindex_s32(0, lda); - svbool_t pn = svwhilelt_b32(js, N); + svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); int n_active = svcntp_b32(svptrue_b32(), pn); #endif do { @@ -106,11 +107,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT js += n_active; #ifdef DOUBLE - pn = svwhilelt_b64(js, n); + pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); n_active = svcntp_b64(svptrue_b64(), pn); } while (svptest_any(svptrue_b64(), pn)); #else - pn = svwhilelt_b32(js, N); + pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); n_active = svcntp_b32(svptrue_b32(), pn); } while (svptest_any(svptrue_b32(), pn)); #endif diff --git a/kernel/arm64/trsm_utcopy_sve.c b/kernel/arm64/trsm_utcopy_sve.c index 0f5f0dccd..b06166f36 100644 --- a/kernel/arm64/trsm_utcopy_sve.c +++ b/kernel/arm64/trsm_utcopy_sve.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. 
*/ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -55,12 +56,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT jj = offset; #ifdef DOUBLE int64_t js = 0; - svbool_t pn = svwhilelt_b64(js, n); + svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); int n_active = svcntp_b64(svptrue_b64(), pn); #else int32_t N = n; int32_t js = 0; - svbool_t pn = svwhilelt_b32(js, N); + svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); int n_active = svcntp_b32(svptrue_b32(), pn); #endif do { @@ -104,11 +105,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT js += n_active; #ifdef DOUBLE - pn = svwhilelt_b64(js, n); + pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); n_active = svcntp_b64(svptrue_b64(), pn); } while (svptest_any(svptrue_b64(), pn)); #else - pn = svwhilelt_b32(js, N); + pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); n_active = svcntp_b32(svptrue_b32(), pn); } while (svptest_any(svptrue_b32(), pn)); #endif diff --git a/kernel/arm64/zdot_thunderx2t99.c b/kernel/arm64/zdot_thunderx2t99.c index 728f97fb3..6f65e5cfd 100644 --- a/kernel/arm64/zdot_thunderx2t99.c +++ b/kernel/arm64/zdot_thunderx2t99.c @@ -24,7 +24,12 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ - +#ifdef __NVCOMPILER +#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) +#if (NVCOMPVERS < 2309) +#pragma opt 1 +#endif +#endif #include "common.h" diff --git a/kernel/arm64/zgemm_kernel_sve_v1x4.S b/kernel/arm64/zgemm_kernel_sve_v1x4.S index d5b35775c..a043948d6 100644 --- a/kernel/arm64/zgemm_kernel_sve_v1x4.S +++ b/kernel/arm64/zgemm_kernel_sve_v1x4.S @@ -239,8 +239,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1rd z15.d, p0/z, [pB, 56] add pB, pB, 64 - - prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] .endm .macro KERNELv1x4_M1 @@ -276,9 +274,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1rd z15.d, p0/z, [pB, 56] add pB, pB, 64 - prfm PLDL1KEEP, [pA, #A_PRE_SIZE] - - prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] .endm .macro KERNELv1x4_M2 @@ -313,11 +308,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. OP_ri z23.d, p1/m, z2.d, z15.d ld1rd z15.d, p0/z, [pB, 56] - prfm PLDL1KEEP, [pB, #B_PRE_SIZE] - add pB, pB, 64 - - prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] .endm .macro KERNELv1x4_E @@ -340,11 +331,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. OP_ir z23.d, p1/m, z3.d, z14.d OP_ii z22.d, p1/m, z3.d, z15.d OP_ri z23.d, p1/m, z2.d, z15.d - - prfm PLDL1KEEP, [pB, #B_PRE_SIZE] - - prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] - .endm .macro KERNELv1x4_SUB @@ -382,14 +368,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. OP_ir z23.d, p1/m, z1.d, z14.d OP_ii z22.d, p1/m, z1.d, z15.d OP_ri z23.d, p1/m, z0.d, z15.d - - prfm PLDL1KEEP, [pB, #B_PRE_SIZE] - prfm PLDL1KEEP, [pA, #A_PRE_SIZE] .endm .macro SAVEv1x4 - prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] - ld2d {z24.d, z25.d}, p1/z, [pCRow0] fmla z24.d, p1/m, z16.d, alphaz_R fmls z24.d, p1/m, z17.d, alphaz_I @@ -407,7 +388,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
st2d {z26.d, z27.d}, p1, [pCRow1] add pCRow1, pCRow1, lanes, lsl #4 - prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] ld2d {z28.d, z29.d}, p1/z, [pCRow2] fmla z28.d, p1/m, z20.d, alphaz_R @@ -425,12 +405,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmla z31.d, p1/m, z23.d, alphaz_R st2d {z30.d, z31.d}, p1, [pCRow3] - prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] - add pCRow3, pCRow3, lanes, lsl #4 // pC = pC + lanes * 2 *8 - prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] - .endm /******************************************************************************/ @@ -466,8 +442,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVEv1x2 - prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] - ld2d {z24.d, z25.d}, p1/z, [pCRow0] fmla z24.d, p1/m, z16.d, alphaz_R fmls z24.d, p1/m, z17.d, alphaz_I @@ -485,10 +459,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. st2d {z26.d, z27.d}, p1, [pCRow1] add pCRow1, pCRow1, lanes, lsl #4 - prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] - - prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] - .endm /******************************************************************************/ @@ -516,8 +486,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVEv1x1 - prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] - ld2d {z24.d, z25.d}, p1/z, [pCRow0] fmla z24.d, p1/m, z16.d, alphaz_R fmls z24.d, p1/m, z17.d, alphaz_I @@ -527,8 +495,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add pCRow0, pCRow0, lanes, lsl #4 // pC = pC + lanes * 2 *8 - prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] - .endm /******************************************************************************/ @@ -553,9 +519,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stp x26, x27, [sp, #(9 * 16)] str x28, [sp, #(10 * 16)] - prfm PLDL1KEEP, [origPB] - prfm PLDL1KEEP, [origPA] - fmov alphaR, d0 dup alphaz_R, alphaR fmov alphaI, d1 @@ -676,10 +639,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. bne .Lzgemm_kernel_L4_Mv1_46 .Lzgemm_kernel_L4_Mv1_100: - prfm PLDL1KEEP, [pA] - prfm PLDL1KEEP, [pA, #64] - prfm PLDL1KEEP, [origPB] - SAVEv1x4 .Lzgemm_kernel_L4_Mv1_END: diff --git a/kernel/arm64/zgemm_ncopy_sve_v1.c b/kernel/arm64/zgemm_ncopy_sve_v1.c index 8f9b4268a..6b8c93baf 100644 --- a/kernel/arm64/zgemm_ncopy_sve_v1.c +++ b/kernel/arm64/zgemm_ncopy_sve_v1.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. 
*/ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -52,7 +53,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ boffset = b; j = 0; - svbool_t pg = svwhilelt_b64(j, n); + svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); uint64_t active = svcntp_b64(svptrue_b64(), pg); do { @@ -69,7 +70,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ aoffset += active * lda * 2; j += svcntd(); - pg = svwhilelt_b64(j, n); + pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); active = svcntp_b64(svptrue_b64(), pg); diff --git a/kernel/arm64/zgemm_tcopy_sve_v1.c b/kernel/arm64/zgemm_tcopy_sve_v1.c index c6e50bc1c..fd8d2190f 100644 --- a/kernel/arm64/zgemm_tcopy_sve_v1.c +++ b/kernel/arm64/zgemm_tcopy_sve_v1.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -50,7 +51,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ boffset = b; j = 0; - svbool_t pg = svwhilelt_b64(j, n); + svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); uint64_t active = svcntp_b64(svptrue_b64(), pg); do { @@ -66,7 +67,7 @@ int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ aoffset += active * 2; j += svcntd(); - pg = svwhilelt_b64(j, n); + pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); active = svcntp_b64(svptrue_b64(), pg); } while (svptest_any(svptrue_b64(), pg)); diff --git a/kernel/arm64/zhemm_ltcopy_sve.c b/kernel/arm64/zhemm_ltcopy_sve.c index 37dbfe4e1..fcf7e7073 100644 --- a/kernel/arm64/zhemm_ltcopy_sve.c +++ b/kernel/arm64/zhemm_ltcopy_sve.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. 
*/ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -54,7 +55,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON svint64_t one_vec = svdup_s64(1LL); int64_t j = 0; - svbool_t pg = svwhilelt_b64(j, n); + svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); int64_t active = svcntp_b64(svptrue_b64(), pg); svint64_t index_neg = svindex_s64(0LL, -1LL); svint64_t index = svindex_s64(0LL, 1LL); @@ -79,7 +80,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON gat_ind = svadd_m(cmp, gat_ind, lda_vec); gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, 2); if (offset <= 0) { - svbool_t off_g = svwhilelt_b64(offset, 0LL); + svbool_t off_g = svwhilelt_b64((int64_t)offset, (int64_t)0LL); data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag); } @@ -99,7 +100,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON posX += sve_size; posX_vec = svdup_s64(posX); j += sve_size; - pg = svwhilelt_b64(j, n); + pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); active = svcntp_b64(svptrue_b64(), pg); } while (svptest_any(svptrue_b64(), pg)); @@ -117,7 +118,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON int32_t j = 0; int32_t N = n; - svbool_t pg = svwhilelt_b32(j, N); + svbool_t pg = svwhilelt_b32((uint32_t)j, (uint32_t)N); int32_t active = svcntp_b32(svptrue_b32(), pg); svint32_t index_neg = svindex_s32(0, -1); svint32_t index = svindex_s32(0, 1); @@ -142,7 +143,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON gat_ind = svadd_m(cmp, gat_ind, lda_vec); gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, 2); if (offset <= 0) { - svbool_t off_g = svwhilelt_b32(offset, 0); + svbool_t off_g = svwhilelt_b32((int32_t)offset, (int32_t)0); data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag); } @@ -162,7 +163,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON posX += sve_size; posX_vec = svdup_s32(posX); j += sve_size; - pg = svwhilelt_b32(j, N); + pg = svwhilelt_b32((uint32_t)j, (uint32_t)N); active = svcntp_b32(svptrue_b32(), pg); } while (svptest_any(svptrue_b32(), pg)); diff --git a/kernel/arm64/zhemm_utcopy_sve.c b/kernel/arm64/zhemm_utcopy_sve.c index 21e03b7be..056c9824e 100644 --- a/kernel/arm64/zhemm_utcopy_sve.c +++ b/kernel/arm64/zhemm_utcopy_sve.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. 
*/ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -54,7 +55,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON svint64_t one_vec = svdup_s64(1LL); int64_t j = 0; - svbool_t pg = svwhilelt_b64(j, n); + svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); int64_t active = svcntp_b64(svptrue_b64(), pg); svint64_t index_neg = svindex_s64(0LL, -1LL); svint64_t index = svindex_s64(0LL, 1LL); @@ -80,7 +81,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, lda_vec); data_vec_imag = svneg_z(pg, data_vec_imag); if (offset <= 0) { - svbool_t off_g = svwhilelt_b64(offset, 0LL); + svbool_t off_g = svwhilelt_b64((int64_t)offset, (int64_t)0LL); data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag); } @@ -100,7 +101,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON posX += sve_size; posX_vec = svdup_s64(posX); j += sve_size; - pg = svwhilelt_b64(j, n); + pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); active = svcntp_b64(svptrue_b64(), pg); } while (svptest_any(svptrue_b64(), pg)); #else @@ -116,7 +117,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON int32_t j = 0; int32_t N = n; - svbool_t pg = svwhilelt_b32(j, N); + svbool_t pg = svwhilelt_b32((uint32_t)j, (uint32_t)N); int32_t active = svcntp_b32(svptrue_b32(), pg); svint32_t index_neg = svindex_s32(0, -1); svint32_t index = svindex_s32(0, 1); @@ -142,7 +143,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON gat_ind = svadd_m(svnot_z(pg, cmp) , gat_ind, lda_vec); data_vec_imag = svneg_z(pg, data_vec_imag); if (offset <= 0) { - svbool_t off_g = svwhilelt_b32(offset, 0); + svbool_t off_g = svwhilelt_b32((int32_t)offset, (int32_t)0); data_vec_imag = svneg_m(data_vec_imag, off_g, data_vec_imag); } @@ -162,7 +163,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON posX += sve_size; posX_vec = svdup_s32(posX); j += sve_size; - pg = svwhilelt_b32(j, N); + pg = svwhilelt_b32((uint32_t)j, (uint32_t)N); active = svcntp_b32(svptrue_b32(), pg); } while (svptest_any(svptrue_b32(), pg)); diff --git a/kernel/arm64/zsymm_lcopy_sve.c b/kernel/arm64/zsymm_lcopy_sve.c index 6f18aa956..5a17d3b19 100644 --- a/kernel/arm64/zsymm_lcopy_sve.c +++ b/kernel/arm64/zsymm_lcopy_sve.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. 
*/ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -53,7 +54,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON svint64_t one_vec = svdup_s64(1LL); int64_t j = 0; - svbool_t pg = svwhilelt_b64(j, n); + svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); int64_t active = svcntp_b64(svptrue_b64(), pg); svint64_t index_neg = svindex_s64(0LL, -1LL); svint64_t index = svindex_s64(0LL, 1LL); @@ -90,7 +91,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON posX += sve_size; posX_vec = svdup_s64(posX); j += sve_size; - pg = svwhilelt_b64(j, n); + pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); active = svcntp_b64(svptrue_b64(), pg); } while (svptest_any(svptrue_b64(), pg)); @@ -103,7 +104,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON int32_t N = n; int32_t j = 0; - svbool_t pg = svwhilelt_b32(j, N); + svbool_t pg = svwhilelt_b32((uint32_t)j, (uint32_t)N); int32_t active = svcntp_b32(svptrue_b32(), pg); svint32_t index_neg = svindex_s32(0, -1); svint32_t index = svindex_s32(0, 1); @@ -140,7 +141,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON posX += sve_size; posX_vec = svdup_s32(posX); j += sve_size; - pg = svwhilelt_b32(j, N); + pg = svwhilelt_b32((uint32_t)j, (uint32_t)N); active = svcntp_b32(svptrue_b32(), pg); } while (svptest_any(svptrue_b32(), pg)); diff --git a/kernel/arm64/zsymm_ucopy_sve.c b/kernel/arm64/zsymm_ucopy_sve.c index 6be48cdaf..06989e3aa 100644 --- a/kernel/arm64/zsymm_ucopy_sve.c +++ b/kernel/arm64/zsymm_ucopy_sve.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. 
*/ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -53,7 +54,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON svint64_t one_vec = svdup_s64(1LL); int64_t j = 0; - svbool_t pg = svwhilelt_b64(j, n); + svbool_t pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); int64_t active = svcntp_b64(svptrue_b64(), pg); svint64_t index_neg = svindex_s64(0LL, -1LL); svint64_t index = svindex_s64(0LL, 1LL); @@ -90,7 +91,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON posX += sve_size; posX_vec = svdup_s64(posX); j += sve_size; - pg = svwhilelt_b64(j, n); + pg = svwhilelt_b64((uint64_t)j, (uint64_t)n); active = svcntp_b64(svptrue_b64(), pg); } while (svptest_any(svptrue_b64(), pg)); @@ -103,7 +104,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON int32_t N = n; int32_t j = 0; - svbool_t pg = svwhilelt_b32(j, N); + svbool_t pg = svwhilelt_b32((uint32_t)j, (uint32_t)N); int32_t active = svcntp_b32(svptrue_b32(), pg); svint32_t index_neg = svindex_s32(0, -1); svint32_t index = svindex_s32(0, 1); @@ -140,7 +141,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON posX += sve_size; posX_vec = svdup_s32(posX); j += sve_size; - pg = svwhilelt_b32(j, N); + pg = svwhilelt_b32((uint32_t)j, (uint32_t)N); active = svcntp_b32(svptrue_b32(), pg); } while (svptest_any(svptrue_b32(), pg)); diff --git a/kernel/arm64/ztrmm_lncopy_sve_v1.c b/kernel/arm64/ztrmm_lncopy_sve_v1.c index d34f607ab..5a7171d9d 100644 --- a/kernel/arm64/ztrmm_lncopy_sve_v1.c +++ b/kernel/arm64/ztrmm_lncopy_sve_v1.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -54,11 +55,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON FLOAT *ao; #ifdef DOUBLE svint64_t index = svindex_s64(0LL, lda); - svbool_t pn = svwhilelt_b64(js, n); + svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); int n_active = svcntp_b64(svptrue_b64(), pn); #else svint32_t index = svindex_s32(0, lda); - svbool_t pn = svwhilelt_b32(js, n); + svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); int n_active = svcntp_b32(svptrue_b32(), pn); #endif do @@ -132,11 +133,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON posY += n_active; js += n_active; #ifdef DOUBLE - pn = svwhilelt_b64(js, n); + pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); n_active = svcntp_b64(svptrue_b64(), pn); } while (svptest_any(svptrue_b64(), pn)); #else - pn = svwhilelt_b32(js, n); + pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); n_active = svcntp_b32(svptrue_b32(), pn); } while (svptest_any(svptrue_b32(), pn)); #endif diff --git a/kernel/arm64/ztrmm_ltcopy_sve_v1.c b/kernel/arm64/ztrmm_ltcopy_sve_v1.c index 7f34c9857..3a88f26b2 100644 --- a/kernel/arm64/ztrmm_ltcopy_sve_v1.c +++ b/kernel/arm64/ztrmm_ltcopy_sve_v1.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. 
*/ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -53,10 +54,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON FLOAT *ao; js = 0; #ifdef DOUBLE - svbool_t pn = svwhilelt_b64(js, n); + svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); int n_active = svcntp_b64(svptrue_b64(), pn); #else - svbool_t pn = svwhilelt_b32(js, n); + svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); int n_active = svcntp_b32(svptrue_b32(), pn); #endif do @@ -129,11 +130,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON posY += n_active; js += n_active; #ifdef DOUBLE - pn = svwhilelt_b64(js, n); + pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); n_active = svcntp_b64(svptrue_b64(), pn); } while (svptest_any(svptrue_b64(), pn)); #else - pn = svwhilelt_b32(js, n); + pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); n_active = svcntp_b32(svptrue_b32(), pn); } while (svptest_any(svptrue_b32(), pn)); #endif diff --git a/kernel/arm64/ztrmm_uncopy_sve_v1.c b/kernel/arm64/ztrmm_uncopy_sve_v1.c index 7eb9452c9..c3dbdcbe3 100644 --- a/kernel/arm64/ztrmm_uncopy_sve_v1.c +++ b/kernel/arm64/ztrmm_uncopy_sve_v1.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -54,11 +55,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON FLOAT *ao; #ifdef DOUBLE svint64_t index = svindex_s64(0LL, lda); - svbool_t pn = svwhilelt_b64(js, n); + svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); int n_active = svcntp_b64(svptrue_b64(), pn); #else svint32_t index = svindex_s32(0, lda); - svbool_t pn = svwhilelt_b32(js, n); + svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); int n_active = svcntp_b32(svptrue_b32(), pn); #endif do @@ -132,11 +133,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON posY += n_active; js += n_active; #ifdef DOUBLE - pn = svwhilelt_b64(js, n); + pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); n_active = svcntp_b64(svptrue_b64(), pn); } while (svptest_any(svptrue_b64(), pn)); #else - pn = svwhilelt_b32(js, n); + pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); n_active = svcntp_b32(svptrue_b32(), pn); } while (svptest_any(svptrue_b32(), pn)); #endif diff --git a/kernel/arm64/ztrmm_utcopy_sve_v1.c b/kernel/arm64/ztrmm_utcopy_sve_v1.c index 60c8ff3b4..ddfa7ba4e 100644 --- a/kernel/arm64/ztrmm_utcopy_sve_v1.c +++ b/kernel/arm64/ztrmm_utcopy_sve_v1.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. 
*/ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -53,10 +54,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON FLOAT *ao; js = 0; #ifdef DOUBLE - svbool_t pn = svwhilelt_b64(js, n); + svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); int n_active = svcntp_b64(svptrue_b64(), pn); #else - svbool_t pn = svwhilelt_b32(js, n); + svbool_t pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); int n_active = svcntp_b32(svptrue_b32(), pn); #endif do @@ -128,11 +129,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON posY += n_active; js += n_active; #ifdef DOUBLE - pn = svwhilelt_b64(js, n); + pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); n_active = svcntp_b64(svptrue_b64(), pn); } while (svptest_any(svptrue_b64(), pn)); #else - pn = svwhilelt_b32(js, n); + pn = svwhilelt_b32((uint64_t)js, (uint64_t)n); n_active = svcntp_b32(svptrue_b32(), pn); } while (svptest_any(svptrue_b32(), pn)); #endif diff --git a/kernel/arm64/ztrsm_lncopy_sve.c b/kernel/arm64/ztrsm_lncopy_sve.c index eb7cd0294..f81ba14c2 100644 --- a/kernel/arm64/ztrsm_lncopy_sve.c +++ b/kernel/arm64/ztrsm_lncopy_sve.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -52,13 +53,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT #ifdef DOUBLE int64_t js = 0; svint64_t index = svindex_s64(0LL, lda); - svbool_t pn = svwhilelt_b64(js, n); + svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); int n_active = svcntp_b64(svptrue_b64(), pn); #else int32_t N = n; int32_t js = 0; svint32_t index = svindex_s32(0, lda); - svbool_t pn = svwhilelt_b32(js, N); + svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); int n_active = svcntp_b32(svptrue_b32(), pn); #endif do { @@ -106,11 +107,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT js += n_active; #ifdef DOUBLE - pn = svwhilelt_b64(js, n); + pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); n_active = svcntp_b64(svptrue_b64(), pn); } while (svptest_any(svptrue_b64(), pn)); #else - pn = svwhilelt_b32(js, N); + pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); n_active = svcntp_b32(svptrue_b32(), pn); } while (svptest_any(svptrue_b32(), pn)); #endif diff --git a/kernel/arm64/ztrsm_ltcopy_sve.c b/kernel/arm64/ztrsm_ltcopy_sve.c index 34dbf8a30..46a11abed 100644 --- a/kernel/arm64/ztrsm_ltcopy_sve.c +++ b/kernel/arm64/ztrsm_ltcopy_sve.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. 
*/ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -51,12 +52,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT jj = offset; #ifdef DOUBLE int64_t js = 0; - svbool_t pn = svwhilelt_b64(js, n); + svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); int n_active = svcntp_b64(svptrue_b64(), pn); #else int32_t N = n; int32_t js = 0; - svbool_t pn = svwhilelt_b32(js, N); + svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); int n_active = svcntp_b32(svptrue_b32(), pn); #endif do { @@ -102,11 +103,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT js += n_active; #ifdef DOUBLE - pn = svwhilelt_b64(js, n); + pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); n_active = svcntp_b64(svptrue_b64(), pn); } while (svptest_any(svptrue_b64(), pn)); #else - pn = svwhilelt_b32(js, N); + pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); n_active = svcntp_b32(svptrue_b32(), pn); } while (svptest_any(svptrue_b32(), pn)); #endif diff --git a/kernel/arm64/ztrsm_uncopy_sve.c b/kernel/arm64/ztrsm_uncopy_sve.c index 92e086b75..436112130 100644 --- a/kernel/arm64/ztrsm_uncopy_sve.c +++ b/kernel/arm64/ztrsm_uncopy_sve.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -52,13 +53,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT #ifdef DOUBLE int64_t js = 0; svint64_t index = svindex_s64(0LL, lda); - svbool_t pn = svwhilelt_b64(js, n); + svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); int n_active = svcntp_b64(svptrue_b64(), pn); #else int32_t N = n; int32_t js = 0; svint32_t index = svindex_s32(0, lda); - svbool_t pn = svwhilelt_b32(js, N); + svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); int n_active = svcntp_b32(svptrue_b32(), pn); #endif do { @@ -106,11 +107,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT js += n_active; #ifdef DOUBLE - pn = svwhilelt_b64(js, n); + pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); n_active = svcntp_b64(svptrue_b64(), pn); } while (svptest_any(svptrue_b64(), pn)); #else - pn = svwhilelt_b32(js, N); + pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); n_active = svcntp_b32(svptrue_b32(), pn); } while (svptest_any(svptrue_b32(), pn)); #endif diff --git a/kernel/arm64/ztrsm_utcopy_sve.c b/kernel/arm64/ztrsm_utcopy_sve.c index ccb942e1b..ddf3e265f 100644 --- a/kernel/arm64/ztrsm_utcopy_sve.c +++ b/kernel/arm64/ztrsm_utcopy_sve.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2023 The OpenBLAS Project */ /* All rights reserved. 
*/ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -51,12 +52,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT jj = offset; #ifdef DOUBLE int64_t js = 0; - svbool_t pn = svwhilelt_b64(js, n); + svbool_t pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); int n_active = svcntp_b64(svptrue_b64(), pn); #else int32_t N = n; int32_t js = 0; - svbool_t pn = svwhilelt_b32(js, N); + svbool_t pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); int n_active = svcntp_b32(svptrue_b32(), pn); #endif do { @@ -102,11 +103,11 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT js += n_active; #ifdef DOUBLE - pn = svwhilelt_b64(js, n); + pn = svwhilelt_b64((uint64_t)js, (uint64_t)n); n_active = svcntp_b64(svptrue_b64(), pn); } while (svptest_any(svptrue_b64(), pn)); #else - pn = svwhilelt_b32(js, N); + pn = svwhilelt_b32((uint32_t)js, (uint32_t)N); n_active = svcntp_b32(svptrue_b32(), pn); } while (svptest_any(svptrue_b32(), pn)); #endif diff --git a/kernel/generic/ztrmmkernel_4x4.c b/kernel/generic/ztrmmkernel_4x4.c old mode 100755 new mode 100644 diff --git a/kernel/generic/ztrsm_utcopy_1.c b/kernel/generic/ztrsm_utcopy_1.c index 08f85e891..5833a64ef 100644 --- a/kernel/generic/ztrsm_utcopy_1.c +++ b/kernel/generic/ztrsm_utcopy_1.c @@ -43,7 +43,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT BLASLONG i, ii, j, jj; - FLOAT data01, data02; + FLOAT data01=0.0, data02=0.0; FLOAT *a1; lda *= 2; diff --git a/kernel/generic/ztrsm_utcopy_2.c b/kernel/generic/ztrsm_utcopy_2.c index 387bb2532..bc495f7c6 100644 --- a/kernel/generic/ztrsm_utcopy_2.c +++ b/kernel/generic/ztrsm_utcopy_2.c @@ -47,6 +47,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT FLOAT data05, data06, data07, data08; FLOAT *a1, *a2; + data01=data02=data07=data08=0.0; lda *= 2; jj = offset; diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index cda359040..67d1fd11c 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -1,3 +1,4 @@ +ifndef NO_LASX DGEMMKERNEL = dgemm_kernel_16x4.S DGEMMINCOPY = dgemm_ncopy_16.S DGEMMITCOPY = dgemm_tcopy_16.S @@ -8,7 +9,26 @@ DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +DGEMVNKERNEL = dgemv_n_8_lasx.S +DGEMVTKERNEL = dgemv_t_8_lasx.S + +SGEMMKERNEL = sgemm_kernel_16x8_lasx.S +SGEMMINCOPY = sgemm_ncopy_16_lasx.S +SGEMMITCOPY = sgemm_tcopy_16_lasx.S +SGEMMONCOPY = sgemm_ncopy_8_lasx.S +SGEMMOTCOPY = sgemm_tcopy_8_lasx.S +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +endif + DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c diff --git a/kernel/loongarch64/KERNEL.generic b/kernel/loongarch64/KERNEL.generic index b772a6f82..213add9ee 100644 --- a/kernel/loongarch64/KERNEL.generic +++ b/kernel/loongarch64/KERNEL.generic @@ -132,12 +132,16 @@ CSWAPKERNEL = ../arm/zswap.c ZSWAPKERNEL = ../arm/zswap.c SGEMVNKERNEL = 
../arm/gemv_n.c +ifndef DGEMVNKERNEL DGEMVNKERNEL = ../arm/gemv_n.c +endif CGEMVNKERNEL = ../arm/zgemv_n.c ZGEMVNKERNEL = ../arm/zgemv_n.c SGEMVTKERNEL = ../arm/gemv_t.c +ifndef DGEMVTKERNEL DGEMVTKERNEL = ../arm/gemv_t.c +endif CGEMVTKERNEL = ../arm/zgemv_t.c ZGEMVTKERNEL = ../arm/zgemv_t.c diff --git a/kernel/loongarch64/cnrm2.S b/kernel/loongarch64/cnrm2.S index 9d27987e1..41667485a 100644 --- a/kernel/loongarch64/cnrm2.S +++ b/kernel/loongarch64/cnrm2.S @@ -61,7 +61,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmov.d s2, s1 bge $r0, N, .L999 slli.d INCX, INCX, ZBASE_SHIFT - bge $r0, INCX, .L999 + beq $r0, INCX, .L999 srai.d I, N, 2 bge $r0, I, .L25 LD a1, X, 0 * SIZE diff --git a/kernel/loongarch64/dgemm_kernel_16x4.S b/kernel/loongarch64/dgemm_kernel_16x4.S index 13faa977e..f8e26fda2 100644 --- a/kernel/loongarch64/dgemm_kernel_16x4.S +++ b/kernel/loongarch64/dgemm_kernel_16x4.S @@ -28,6 +28,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" +/********************************************************************* +* 2023/06/28 guxiwei +* UTEST : OK +* CTEST : OK +* TEST : OK +* +* +* 2023/06/28 guxiwei +* Parameter: +* DGEMM_DEFAULT_UNROLL_N 4 +* DGEMM_DEFAULT_UNROLL_M 16 +* DGEMM_DEFAULT_P 32 +* DGEMM_DEFAULT_Q 152 +* DGEMM_DEFAULT_R 858 +* A_PR1 1024 +* B_PR1 256 +* +* +* Performance at Loongson 3A5000 2.5GHz with 5000x5000x5000: +* 1 thread: 36.0 GFLOPS +* 2 threads: 71.6 GFLOPS +* 3 threads: 101.5 GFLOPS +* 4 threads: 132.8 GFLOPS +*********************************************************************/ + /* Function parameters */ #define M $r4 // param 1: bm #define N $r5 // param 2: bn @@ -68,31 +93,1005 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define U4 $xr4 #define U5 $xr5 #define U6 $xr6 -#define D0 $xr7 -#define D1 $xr8 -#define D2 $xr9 -#define D3 $xr10 -#define D4 $xr11 -#define D5 $xr12 -#define D6 $xr13 -#define D7 $xr14 -#define D8 $xr15 -#define D9 $xr16 -#define D10 $xr17 -#define D11 $xr18 -#define D12 $xr19 -#define D13 $xr20 -#define D14 $xr21 -#define D15 $xr22 -#define VALPHA $xr23 +#define U7 $xr7 +#define U8 $xr8 +#define U9 $xr9 +#define U10 $xr10 +#define U11 $xr11 +#define U12 $xr12 +#define U13 $xr13 +#define U14 $xr14 +#define U15 $xr15 +#define D0 $xr16 +#define D1 $xr17 +#define D2 $xr18 +#define D3 $xr19 +#define D4 $xr20 +#define D5 $xr21 +#define D6 $xr22 +#define D7 $xr23 +#define D8 $xr24 +#define D9 $xr25 +#define D10 $xr26 +#define D11 $xr27 +#define D12 $xr28 +#define D13 $xr29 +#define D14 $xr30 +#define D15 $xr31 +#define VALPHA $xr15 /* Prefetch interval */ -#define A_PRE 0x200 +#define A_PRE 0x400 #define B_PRE 0x100 +.macro KERNEL2x16x4 + xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 + + xvld U1, A0, 0x20 + xvfmadd.d D2, U10, U12, D2 + xvfmadd.d D3, U11, U12, D3 + + xvld U2, A0, 0x40 + xvfmadd.d D4, U8, U13, D4 + xvfmadd.d D5, U9, U13, D5 + + xvld U3, A0, 0x60 + xvfmadd.d D6, U10, U13, D6 + xvfmadd.d D7, U11, U13, D7 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D8, U8, U14, D8 + xvfmadd.d D9, U9, U14, D9 + + preld 0, B0, B_PRE + xvldrepl.d U5, B0, 0x08 + xvfmadd.d D10, U10, U14, D10 + xvfmadd.d D11, U11, U14, D11 + + preld 0, A0, A_PRE + xvldrepl.d U6, B0, 0x10 + xvfmadd.d D12, U8, U15, D12 + xvfmadd.d D13, U9, U15, D13 + + preld 0, A0, A_PRE + 0x40 + xvldrepl.d U7, B0, 0x18 + xvfmadd.d D14, U10, U15, D14 + xvfmadd.d D15, U11, U15, D15 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x20 + + xvld U8, A0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvld U9, A0, 0x20 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvld U10, A0, 0x40 + xvfmadd.d D4, U0, U5, D4 + xvfmadd.d D5, U1, U5, D5 + + xvld U11, A0, 0x60 + xvfmadd.d D6, U2, U5, D6 + xvfmadd.d D7, U3, U5, D7 + + xvldrepl.d U12, B0, 0x00 + xvfmadd.d D8, U0, U6, D8 + xvfmadd.d D9, U1, U6, D9 + + preld 0, B0, B_PRE + xvldrepl.d U13, B0, 0x08 + xvfmadd.d D10, U2, U6, D10 + xvfmadd.d D11, U3, U6, D11 + + preld 0, A0, A_PRE + xvldrepl.d U14, B0, 0x10 + xvfmadd.d D12, U0, U7, D12 + xvfmadd.d D13, U1, U7, D13 + + preld 0, A0, A_PRE + 0x40 + xvldrepl.d U15, B0, 0x18 + xvfmadd.d D14, U2, U7, D14 + xvfmadd.d D15, U3, U7, D15 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x20 +.endm + +.macro KERNEL2x16x4_END + xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 + + xvld U1, A0, 0x20 + xvfmadd.d D2, U10, U12, D2 + xvfmadd.d D3, U11, U12, D3 + + xvld U2, A0, 0x40 + xvfmadd.d D4, U8, U13, D4 + xvfmadd.d D5, U9, U13, D5 + + xvld U3, A0, 0x60 + xvfmadd.d D6, U10, U13, D6 + xvfmadd.d D7, U11, U13, D7 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D8, U8, U14, D8 + xvfmadd.d D9, U9, U14, D9 + + preld 0, B0, B_PRE + xvldrepl.d U5, B0, 0x08 + xvfmadd.d D10, U10, U14, D10 + xvfmadd.d D11, U11, U14, D11 + + preld 0, A0, A_PRE + xvldrepl.d U6, B0, 0x10 + xvfmadd.d D12, U8, U15, D12 + xvfmadd.d D13, U9, U15, D13 + + preld 0, A0, A_PRE + 0x40 + xvldrepl.d U7, B0, 0x18 + xvfmadd.d D14, U10, U15, D14 + xvfmadd.d D15, U11, U15, D15 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x20 + + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvfmadd.d D4, U0, U5, D4 + xvfmadd.d D5, U1, U5, D5 + + xvfmadd.d D6, U2, U5, D6 + xvfmadd.d D7, U3, U5, D7 + + xvfmadd.d D8, 
U0, U6, D8 + xvfmadd.d D9, U1, U6, D9 + + preld 0, B0, B_PRE + xvfmadd.d D10, U2, U6, D10 + xvfmadd.d D11, U3, U6, D11 + + preld 0, A0, A_PRE + xvfmadd.d D12, U0, U7, D12 + xvfmadd.d D13, U1, U7, D13 + + preld 0, A0, A_PRE + 0x40 + xvfmadd.d D14, U2, U7, D14 + xvfmadd.d D15, U3, U7, D15 +.endm + +.macro KERNEL8x16x4 +.rept 4 + KERNEL2x16x4 +.endr +.endm + +.macro KERNEL8x16x4_END +.rept 3 + KERNEL2x16x4 +.endr + KERNEL2x16x4_END +.endm + +.macro KERNEL2x8x4 + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 + + xvldrepl.d U5, B0, 0x08 + xvfmadd.d D4, U8, U13, D4 + xvfmadd.d D5, U9, U13, D5 + + xvldrepl.d U6, B0, 0x10 + xvfmadd.d D8, U8, U14, D8 + xvfmadd.d D9, U9, U14, D9 + + xvldrepl.d U7, B0, 0x18 + xvfmadd.d D12, U8, U15, D12 + xvfmadd.d D13, U9, U15, D13 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + + xvld U8, A0, 0x00 + xvld U9, A0, 0x20 + + xvldrepl.d U12, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvldrepl.d U13, B0, 0x08 + xvfmadd.d D4, U0, U5, D4 + xvfmadd.d D5, U1, U5, D5 + + xvldrepl.d U14, B0, 0x10 + xvfmadd.d D8, U0, U6, D8 + xvfmadd.d D9, U1, U6, D9 + + xvldrepl.d U15, B0, 0x18 + xvfmadd.d D12, U0, U7, D12 + xvfmadd.d D13, U1, U7, D13 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 +.endm + +.macro KERNEL2x8x4_END + xvld U0, A0, 0x00 + xvld U1, A0, 0x20 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 + + xvldrepl.d U5, B0, 0x08 + xvfmadd.d D4, U8, U13, D4 + xvfmadd.d D5, U9, U13, D5 + + xvldrepl.d U6, B0, 0x10 + xvfmadd.d D8, U8, U14, D8 + xvfmadd.d D9, U9, U14, D9 + + xvldrepl.d U7, B0, 0x18 + xvfmadd.d D12, U8, U15, D12 + xvfmadd.d D13, U9, U15, D13 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvfmadd.d D4, U0, U5, D4 + xvfmadd.d D5, U1, U5, D5 + + xvfmadd.d D8, U0, U6, D8 + xvfmadd.d D9, U1, U6, D9 + + xvfmadd.d D12, U0, U7, D12 + xvfmadd.d D13, U1, U7, D13 +.endm + +.macro KERNEL8x8x4 +.rept 4 + KERNEL2x8x4 +.endr +.endm + +.macro KERNEL8x8x4_END +.rept 3 + KERNEL2x8x4 +.endr + KERNEL2x8x4_END +.endm + +.macro KERNEL2x4x4 + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U8, U12, D0 + + xvldrepl.d U5, B0, 0x08 + xvfmadd.d D4, U8, U13, D4 + + xvldrepl.d U6, B0, 0x10 + xvfmadd.d D8, U8, U14, D8 + + xvldrepl.d U7, B0, 0x18 + xvfmadd.d D12, U8, U15, D12 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + + xvld U8, A0, 0x00 + + xvldrepl.d U12, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + + xvldrepl.d U13, B0, 0x08 + xvfmadd.d D4, U0, U5, D4 + + xvldrepl.d U14, B0, 0x10 + xvfmadd.d D8, U0, U6, D8 + + xvldrepl.d U15, B0, 0x18 + xvfmadd.d D12, U0, U7, D12 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 +.endm + +.macro KERNEL2x4x4_END + xvld U0, A0, 0x00 + + xvldrepl.d U4, B0, 0x00 + xvfmadd.d D0, U8, U12, D0 + + xvldrepl.d U5, B0, 0x08 + xvfmadd.d D4, U8, U13, D4 + + xvldrepl.d U6, B0, 0x10 + xvfmadd.d D8, U8, U14, D8 + + xvldrepl.d U7, B0, 0x18 + xvfmadd.d D12, U8, U15, D12 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D4, U0, U5, D4 + xvfmadd.d D8, U0, U6, D8 + xvfmadd.d D12, U0, U7, D12 +.endm + +.macro KERNEL8x4x4 +.rept 4 + KERNEL2x4x4 +.endr +.endm + +.macro KERNEL8x4x4_END +.rept 3 + KERNEL2x4x4 +.endr + KERNEL2x4x4_END +.endm + +.macro KERNEL2x2x4 + xvldrepl.d U0, A0, 0x00 + xvldrepl.d U1, A0, 0x08 + + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 + + xvld U4, B0, 0x00 + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + + xvldrepl.d U8, 
A0, 0x00 + xvldrepl.d U9, A0, 0x08 + + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvld U12, B0, 0x00 + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 +.endm + +.macro KERNEL2x2x4_END + xvldrepl.d U0, A0, 0x00 + xvldrepl.d U1, A0, 0x08 + + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 + + xvld U4, B0, 0x00 + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 +.endm + +.macro KERNEL8x2x4 +.rept 4 + KERNEL2x2x4 +.endr +.endm + +.macro KERNEL8x2x4_END +.rept 3 + KERNEL2x2x4 +.endr + KERNEL2x2x4_END +.endm + +.macro KERNEL2x1x4 + xvldrepl.d U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvld U4, B0, 0x00 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + + xvldrepl.d U8, A0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvld U12, B0, 0x00 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 +.endm + +.macro KERNEL2x1x4_END + xvldrepl.d U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvld U4, B0, 0x00 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + + xvfmadd.d D0, U0, U4, D0 +.endm + +.macro KERNEL8x1x4 +.rept 4 + KERNEL2x1x4 +.endr +.endm + +.macro KERNEL8x1x4_END +.rept 3 + KERNEL2x1x4 +.endr + KERNEL2x1x4_END +.endm + +.macro KERNEL2x16x2 + xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 + + xvld U1, A0, 0x20 + xvfmadd.d D2, U10, U12, D2 + xvfmadd.d D3, U11, U12, D3 + + xvld U2, A0, 0x40 + xvfmadd.d D4, U8, U13, D4 + xvfmadd.d D5, U9, U13, D5 + + xvld U3, A0, 0x60 + xvfmadd.d D6, U10, U13, D6 + xvfmadd.d D7, U11, U13, D7 + + xvldrepl.d U4, B0, 0x00 + xvldrepl.d U5, B0, 0x08 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 + + xvld U8, A0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvld U9, A0, 0x20 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvld U10, A0, 0x40 + xvfmadd.d D4, U0, U5, D4 + xvfmadd.d D5, U1, U5, D5 + + xvld U11, A0, 0x60 + xvfmadd.d D6, U2, U5, D6 + xvfmadd.d D7, U3, U5, D7 + + xvldrepl.d U12, B0, 0x00 + xvldrepl.d U13, B0, 0x08 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 +.endm + +.macro KERNEL2x16x2_END + xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 + + xvld U1, A0, 0x20 + xvfmadd.d D2, U10, U12, D2 + xvfmadd.d D3, U11, U12, D3 + + xvld U2, A0, 0x40 + xvfmadd.d D4, U8, U13, D4 + xvfmadd.d D5, U9, U13, D5 + + xvld U3, A0, 0x60 + xvfmadd.d D6, U10, U13, D6 + xvfmadd.d D7, U11, U13, D7 + + xvldrepl.d U4, B0, 0x00 + xvldrepl.d U5, B0, 0x08 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 + + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvfmadd.d D4, U0, U5, D4 + xvfmadd.d D5, U1, U5, D5 + + xvfmadd.d D6, U2, U5, D6 + xvfmadd.d D7, U3, U5, D7 +.endm + +.macro KERNEL8x16x2 +.rept 4 + KERNEL2x16x2 +.endr +.endm + +.macro KERNEL8x16x2_END +.rept 3 + KERNEL2x16x2 +.endr + KERNEL2x16x2_END +.endm + +.macro KERNEL2x8x2 + xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 + + xvld U1, A0, 0x20 + xvfmadd.d D4, U8, U13, D4 + xvfmadd.d D5, U9, U13, D5 + + xvldrepl.d U4, B0, 0x00 + xvldrepl.d U5, B0, 0x08 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + + xvld U8, A0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvld U9, A0, 0x20 + xvfmadd.d D4, U0, U5, D4 + xvfmadd.d D5, U1, U5, D5 + + xvldrepl.d U12, B0, 0x00 + xvldrepl.d U13, B0, 0x08 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 +.endm + +.macro KERNEL2x8x2_END + xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 + + xvld U1, A0, 0x20 + xvfmadd.d D4, U8, U13, D4 + xvfmadd.d D5, U9, U13, D5 + + 
xvldrepl.d U4, B0, 0x00 + xvldrepl.d U5, B0, 0x08 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvfmadd.d D4, U0, U5, D4 + xvfmadd.d D5, U1, U5, D5 +.endm + +.macro KERNEL8x8x2 +.rept 4 + KERNEL2x8x2 +.endr +.endm + +.macro KERNEL8x8x2_END +.rept 3 + KERNEL2x8x2 + .endr + KERNEL2x8x2_END +.endm + +.macro KERNEL2x4x2 + xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D4, U8, U13, D4 + + xvldrepl.d U4, B0, 0x00 + xvldrepl.d U5, B0, 0x08 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + + xvld U8, A0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D4, U0, U5, D4 + + xvldrepl.d U12, B0, 0x00 + xvldrepl.d U13, B0, 0x08 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 +.endm + +.macro KERNEL2x4x2_END + xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D4, U8, U13, D4 + + xvldrepl.d U4, B0, 0x00 + xvldrepl.d U5, B0, 0x08 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D4, U0, U5, D4 +.endm + +.macro KERNEL8x4x2 +.rept 4 + KERNEL2x4x2 +.endr +.endm + +.macro KERNEL8x4x2_END +.rept 3 + KERNEL2x4x2 +.endr + KERNEL2x4x2_END +.endm + +.macro KERNEL2x2x2 + xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D4, U8, U13, D4 + + xvldrepl.d U4, B0, 0x00 + xvldrepl.d U5, B0, 0x08 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + + xvld U8, A0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D4, U0, U5, D4 + + xvldrepl.d U12, B0, 0x00 + xvldrepl.d U13, B0, 0x08 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 +.endm + +.macro KERNEL2x2x2_END + xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D4, U8, U13, D4 + + xvldrepl.d U4, B0, 0x00 + xvldrepl.d U5, B0, 0x08 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D4, U0, U5, D4 +.endm + +.macro KERNEL8x2x2 +.rept 4 + KERNEL2x2x2 +.endr +.endm + +.macro KERNEL8x2x2_END +.rept 3 + KERNEL2x2x2 +.endr + KERNEL2x2x2_END +.endm + +.macro KERNEL2x1x2 + xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D4, U8, U13, D4 + + xvldrepl.d U4, B0, 0x00 + xvldrepl.d U5, B0, 0x08 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + + xvld U8, A0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D4, U0, U5, D4 + + xvldrepl.d U12, B0, 0x00 + xvldrepl.d U13, B0, 0x08 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 +.endm + +.macro KERNEL2x1x2_END + xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D4, U8, U13, D4 + + xvldrepl.d U4, B0, 0x00 + xvldrepl.d U5, B0, 0x08 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D4, U0, U5, D4 +.endm + +.macro KERNEL8x1x2 +.rept 4 + KERNEL2x1x2 +.endr +.endm + +.macro KERNEL8x1x2_END +.rept 3 + KERNEL2x1x2 +.endr + KERNEL2x1x2_END +.endm + +.macro KERNEL2x16x1 + xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 + + xvld U1, A0, 0x20 + xvfmadd.d D2, U10, U12, D2 + xvfmadd.d D3, U11, U12, D3 + + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x08 + + xvld U8, A0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvld U9, A0, 0x20 + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 + + xvld U10, A0, 0x40 + xvld U11, A0, 0x60 + + xvldrepl.d U12, B0, 0x00 + + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x08 +.endm + +.macro KERNEL2x16x1_END + xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 + + xvld U1, A0, 0x20 + xvfmadd.d D2, U10, U12, D2 + xvfmadd.d D3, U11, U12, D3 + + xvld U2, A0, 0x40 + xvld U3, A0, 0x60 + + xvldrepl.d U4, B0, 0x00 + + 
addi.d A0, A0, 0x80 + addi.d B0, B0, 0x08 + + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + + xvfmadd.d D2, U2, U4, D2 + xvfmadd.d D3, U3, U4, D3 +.endm + +.macro KERNEL8x16x1 +.rept 4 + KERNEL2x16x1 +.endr +.endm + +.macro KERNEL8x16x1_END +.rept 3 + KERNEL2x16x1 +.endr + KERNEL2x16x1_END +.endm + +.macro KERNEL2x8x1 + xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 + xvld U1, A0, 0x20 + xvldrepl.d U4, B0, 0x00 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + + xvld U8, A0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 + xvld U9, A0, 0x20 + xvldrepl.d U12, B0, 0x00 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 +.endm + +.macro KERNEL2x8x1_END + xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvfmadd.d D1, U9, U12, D1 + xvld U1, A0, 0x20 + xvldrepl.d U4, B0, 0x00 + + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 +.endm + +.macro KERNEL8x8x1 +.rept 4 + KERNEL2x8x1 +.endr +.endm + +.macro KERNEL8x8x1_END +.rept 3 + KERNEL2x8x1 +.endr + KERNEL2x8x1_END +.endm + +.macro KERNEL2x4x1 + xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvldrepl.d U4, B0, 0x00 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 + + xvld U8, A0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvldrepl.d U12, B0, 0x00 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 +.endm + +.macro KERNEL2x4x1_END + xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvldrepl.d U4, B0, 0x00 + + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 + + xvfmadd.d D0, U0, U4, D0 +.endm + +.macro KERNEL8x4x1 +.rept 4 + KERNEL2x4x1 +.endr +.endm + +.macro KERNEL8x4x1_END +.rept 3 + KERNEL2x4x1 +.endr + KERNEL2x4x1_END +.endm + +.macro KERNEL2x2x1 + xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvldrepl.d U4, B0, 0x00 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + + xvld U8, A0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvldrepl.d U12, B0, 0x00 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 +.endm + +.macro KERNEL2x2x1_END + xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvldrepl.d U4, B0, 0x00 + + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + + xvfmadd.d D0, U0, U4, D0 +.endm + +.macro KERNEL8x2x1 +.rept 4 + KERNEL2x2x1 +.endr +.endm + +.macro KERNEL8x2x1_END +.rept 3 + KERNEL2x2x1 +.endr + KERNEL2x2x1_END +.endm + +.macro KERNEL2x1x1 + xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvldrepl.d U4, B0, 0x00 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + + xvld U8, A0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvldrepl.d U12, B0, 0x00 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 +.endm + +.macro KERNEL2x1x1_END + xvld U0, A0, 0x00 + xvfmadd.d D0, U8, U12, D0 + xvldrepl.d U4, B0, 0x00 + + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + + xvfmadd.d D0, U0, U4, D0 +.endm + +.macro KERNEL8x1x1 +.rept 4 + KERNEL2x1x1 +.endr +.endm + +.macro KERNEL8x1x1_END +.rept 3 + KERNEL2x1x1 +.endr + KERNEL2x1x1_END +.endm + + PROLOGUE - addi.d $sp, $sp, -56 + addi.d $sp, $sp, -120 /* Store regs */ SDARG $r23, $sp, 0 SDARG $r24, $sp, 8 @@ -100,11 +1099,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
SDARG $r26, $sp, 24 SDARG $r27, $sp, 32 ST $f23, $sp, 40 - ST ALPHA, $sp, 48 - - /* VALPHA = {ALPHA, ALPHA, ALPHA, ALPHA} */ - xvld VALPHA, $sp, 48 - xvreplve0.d VALPHA, VALPHA + ST $f24, $sp, 48 + ST $f25, $sp, 56 + ST $f26, $sp, 64 + ST $f27, $sp, 72 + ST $f28, $sp, 80 + ST $f29, $sp, 88 + ST $f30, $sp, 96 + ST $f31, $sp, 104 + ST ALPHA, $sp, 112 #if defined (TRMMKERNEL) && !defined(LEFT) sub.d OFF, ZERO, OFFSET @@ -115,6 +1118,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* if (!(N >> 2)) goto L_N3 */ srai.d J, N, 2 /* J = bn >> 2 */ andi N, N, 0x03 + xvldrepl.d VALPHA, $sp, 112 /* When N < 4, VALPHA will not changed */ beq ZERO, J, .L_N3 .L_J1: /* J-- && This loop include Condition 1 */ @@ -183,32 +1187,32 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvfmul.d D2, U2, U4 xvfmul.d D3, U3, U4 - xvldrepl.d U4, B0, 0x08 + xvldrepl.d U5, B0, 0x08 preld 0, C1, 0x00 /* line 2 */ - xvfmul.d D4, U0, U4 - xvfmul.d D5, U1, U4 + xvfmul.d D4, U0, U5 + xvfmul.d D5, U1, U5 preld 0, C1, 0x40 - xvfmul.d D6, U2, U4 - xvfmul.d D7, U3, U4 + xvfmul.d D6, U2, U5 + xvfmul.d D7, U3, U5 - xvldrepl.d U4, B0, 0x10 + xvldrepl.d U6, B0, 0x10 preld 0, C2, 0x00 /* line 3 */ - xvfmul.d D8, U0, U4 - xvfmul.d D9, U1, U4 + xvfmul.d D8, U0, U6 + xvfmul.d D9, U1, U6 preld 0, C2, 0x40 - xvfmul.d D10, U2, U4 - xvfmul.d D11, U3, U4 + xvfmul.d D10, U2, U6 + xvfmul.d D11, U3, U6 - xvldrepl.d U4, B0, 0x18 + xvldrepl.d U7, B0, 0x18 preld 0, C3, 0x00 /* line 4 */ - xvfmul.d D12, U0, U4 - xvfmul.d D13, U1, U4 + xvfmul.d D12, U0, U7 + xvfmul.d D13, U1, U7 preld 0, C3, 0x40 - xvfmul.d D14, U2, U4 - xvfmul.d D15, U3, U4 + xvfmul.d D14, U2, U7 + xvfmul.d D15, U3, U7 /* Add stride for A0 and B0 */ addi.d A0, A0, 0x80 @@ -219,315 +1223,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
/* if (TL < 1) goto L_L7 */ beq ZERO,TL, .L_L7 - /* Calculate 8 sets of D0~D15 */ + xvld U8, A0, 0x00 + xvld U9, A0, 0x20 + xvld U10, A0, 0x40 + xvld U11, A0, 0x60 + + addi.d TL, TL, -1 + + xvldrepl.d U12, B0, 0x00 + xvldrepl.d U13, B0, 0x08 + xvldrepl.d U14, B0, 0x10 + xvldrepl.d U15, B0, 0x18 + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x20 + + beq ZERO, TL, .L_TL1_END .L_TL1: /* TL-- */ - /***8-1***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - - /* Cumulative D0~D15 */ - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 - preld 0, B0, B_PRE - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - xvfmadd.d D6, U2, U4, D6 - xvfmadd.d D7, U3, U4, D7 - preld 0, A0, A_PRE - - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - xvfmadd.d D9, U1, U4, D9 - xvfmadd.d D10, U2, U4, D10 - xvfmadd.d D11, U3, U4, D11 - preld 0, A0, A_PRE + 0x40 - - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - xvfmadd.d D13, U1, U4, D13 - xvfmadd.d D14, U2, U4, D14 - xvfmadd.d D15, U3, U4, D15 - - addi.d A0, A0, 0x80 - addi.d B0, B0, 0x20 - - /***8-2***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - - /* Cumulative D0~D15 */ - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 - preld 0, B0, B_PRE - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - xvfmadd.d D6, U2, U4, D6 - xvfmadd.d D7, U3, U4, D7 - preld 0, A0, A_PRE - - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - xvfmadd.d D9, U1, U4, D9 - xvfmadd.d D10, U2, U4, D10 - xvfmadd.d D11, U3, U4, D11 - preld 0, A0, A_PRE + 0x40 - - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - xvfmadd.d D13, U1, U4, D13 - xvfmadd.d D14, U2, U4, D14 - xvfmadd.d D15, U3, U4, D15 - - addi.d A0, A0, 0x80 - addi.d B0, B0, 0x20 - - /***8-3***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - - /* Cumulative D0~D15 */ - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 - preld 0, B0, B_PRE - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - xvfmadd.d D6, U2, U4, D6 - xvfmadd.d D7, U3, U4, D7 - preld 0, A0, A_PRE - - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - xvfmadd.d D9, U1, U4, D9 - xvfmadd.d D10, U2, U4, D10 - xvfmadd.d D11, U3, U4, D11 - preld 0, A0, A_PRE + 0x40 - - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - xvfmadd.d D13, U1, U4, D13 - xvfmadd.d D14, U2, U4, D14 - xvfmadd.d D15, U3, U4, D15 - - addi.d A0, A0, 0x80 - addi.d B0, B0, 0x20 - - /***8-4***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - - /* Cumulative D0~D15 */ - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 - preld 0, B0, B_PRE - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - xvfmadd.d D6, U2, U4, D6 - xvfmadd.d D7, U3, U4, D7 - preld 0, A0, A_PRE - - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - xvfmadd.d D9, U1, U4, D9 - xvfmadd.d D10, U2, U4, D10 - xvfmadd.d D11, U3, U4, D11 - preld 0, A0, A_PRE + 0x40 - - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - 
xvfmadd.d D13, U1, U4, D13 - xvfmadd.d D14, U2, U4, D14 - xvfmadd.d D15, U3, U4, D15 - - addi.d A0, A0, 0x80 - addi.d B0, B0, 0x20 - - /***8-5***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - - /* Cumulative D0~D15 */ - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 - preld 0, B0, B_PRE - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - xvfmadd.d D6, U2, U4, D6 - xvfmadd.d D7, U3, U4, D7 - preld 0, A0, A_PRE - - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - xvfmadd.d D9, U1, U4, D9 - xvfmadd.d D10, U2, U4, D10 - xvfmadd.d D11, U3, U4, D11 - preld 0, A0, A_PRE + 0x40 - - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - xvfmadd.d D13, U1, U4, D13 - xvfmadd.d D14, U2, U4, D14 - xvfmadd.d D15, U3, U4, D15 - - addi.d A0, A0, 0x80 - addi.d B0, B0, 0x20 - - /***8-6***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - - /* Cumulative D0~D15 */ - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 - preld 0, B0, B_PRE - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - xvfmadd.d D6, U2, U4, D6 - xvfmadd.d D7, U3, U4, D7 - preld 0, A0, A_PRE - - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - xvfmadd.d D9, U1, U4, D9 - xvfmadd.d D10, U2, U4, D10 - xvfmadd.d D11, U3, U4, D11 - preld 0, A0, A_PRE + 0x40 - - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - xvfmadd.d D13, U1, U4, D13 - xvfmadd.d D14, U2, U4, D14 - xvfmadd.d D15, U3, U4, D15 - - addi.d A0, A0, 0x80 - addi.d B0, B0, 0x20 - - /***8-7***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - - /* Cumulative D0~D15 */ - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 - preld 0, B0, B_PRE - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - xvfmadd.d D6, U2, U4, D6 - xvfmadd.d D7, U3, U4, D7 - preld 0, A0, A_PRE - - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - xvfmadd.d D9, U1, U4, D9 - xvfmadd.d D10, U2, U4, D10 - xvfmadd.d D11, U3, U4, D11 - preld 0, A0, A_PRE + 0x40 - - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - xvfmadd.d D13, U1, U4, D13 - xvfmadd.d D14, U2, U4, D14 - xvfmadd.d D15, U3, U4, D15 - - addi.d A0, A0, 0x80 - addi.d B0, B0, 0x20 - - /***8-8***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - - /* Cumulative D0~D15 */ - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 - preld 0, B0, B_PRE - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - xvfmadd.d D6, U2, U4, D6 - xvfmadd.d D7, U3, U4, D7 - preld 0, A0, A_PRE - - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - xvfmadd.d D9, U1, U4, D9 - xvfmadd.d D10, U2, U4, D10 - xvfmadd.d D11, U3, U4, D11 - preld 0, A0, A_PRE + 0x40 - - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - xvfmadd.d D13, U1, U4, D13 - xvfmadd.d D14, U2, U4, D14 - xvfmadd.d D15, U3, U4, D15 - - addi.d A0, A0, 0x80 - addi.d B0, B0, 0x20 - + KERNEL8x16x4 addi.d TL, TL, -1 /* TL-- */ blt ZERO,TL, .L_TL1 +.L_TL1_END: + KERNEL8x16x4_END + /* Maybe we need calculate the 
last * 7 sets of D0~D15? */ @@ -550,23 +1268,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvfmadd.d D2, U2, U4, D2 xvfmadd.d D3, U3, U4, D3 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - xvfmadd.d D6, U2, U4, D6 - xvfmadd.d D7, U3, U4, D7 + xvldrepl.d U5, B0, 0x08 + xvfmadd.d D4, U0, U5, D4 + xvfmadd.d D5, U1, U5, D5 + xvfmadd.d D6, U2, U5, D6 + xvfmadd.d D7, U3, U5, D7 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - xvfmadd.d D9, U1, U4, D9 - xvfmadd.d D10, U2, U4, D10 - xvfmadd.d D11, U3, U4, D11 + xvldrepl.d U6, B0, 0x10 + xvfmadd.d D8, U0, U6, D8 + xvfmadd.d D9, U1, U6, D9 + xvfmadd.d D10, U2, U6, D10 + xvfmadd.d D11, U3, U6, D11 - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - xvfmadd.d D13, U1, U4, D13 - xvfmadd.d D14, U2, U4, D14 - xvfmadd.d D15, U3, U4, D15 + xvldrepl.d U7, B0, 0x18 + xvfmadd.d D12, U0, U7, D12 + xvfmadd.d D13, U1, U7, D13 + xvfmadd.d D14, U2, U7, D14 + xvfmadd.d D15, U3, U7, D15 /* Add stride for A0, B0 */ addi.d A0, A0, 0x80 @@ -576,6 +1294,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. blt ZERO,TL, .L_L71 .L_L0: + xvldrepl.d VALPHA, $sp, 112 #if defined(TRMMKERNEL) xvfmul.d D0, D0, VALPHA xvfmul.d D1, D1, VALPHA @@ -605,24 +1324,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvfmadd.d D3, D3, VALPHA, U3 /* Load C1 */ - xvld U0, C1, 0x00 - xvld U1, C1, 0x20 - xvld U2, C1, 0x40 - xvld U3, C1, 0x60 - xvfmadd.d D4, D4, VALPHA, U0 - xvfmadd.d D5, D5, VALPHA, U1 - xvfmadd.d D6, D6, VALPHA, U2 - xvfmadd.d D7, D7, VALPHA, U3 + xvld U4, C1, 0x00 + xvld U5, C1, 0x20 + xvld U6, C1, 0x40 + xvld U7, C1, 0x60 + xvfmadd.d D4, D4, VALPHA, U4 + xvfmadd.d D5, D5, VALPHA, U5 + xvfmadd.d D6, D6, VALPHA, U6 + xvfmadd.d D7, D7, VALPHA, U7 /* Load C2 */ - xvld U0, C2, 0x00 - xvld U1, C2, 0x20 - xvld U2, C2, 0x40 - xvld U3, C2, 0x60 - xvfmadd.d D8, D8, VALPHA, U0 - xvfmadd.d D9, D9, VALPHA, U1 - xvfmadd.d D10, D10, VALPHA, U2 - xvfmadd.d D11, D11, VALPHA, U3 + xvld U8, C2, 0x00 + xvld U9, C2, 0x20 + xvld U10, C2, 0x40 + xvld U11, C2, 0x60 + xvfmadd.d D8, D8, VALPHA, U8 + xvfmadd.d D9, D9, VALPHA, U9 + xvfmadd.d D10, D10, VALPHA, U10 + xvfmadd.d D11, D11, VALPHA, U11 /* Load C3 */ xvld U0, C3, 0x00 @@ -727,20 +1446,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvfmul.d D0, U0, U4 xvfmul.d D1, U1, U4 - xvldrepl.d U4, B0, 0x08 + xvldrepl.d U5, B0, 0x08 /* line 2 */ - xvfmul.d D4, U0, U4 - xvfmul.d D5, U1, U4 + xvfmul.d D4, U0, U5 + xvfmul.d D5, U1, U5 - xvldrepl.d U4, B0, 0x10 + xvldrepl.d U6, B0, 0x10 /* line 3 */ - xvfmul.d D8, U0, U4 - xvfmul.d D9, U1, U4 + xvfmul.d D8, U0, U6 + xvfmul.d D9, U1, U6 - xvldrepl.d U4, B0, 0x18 + xvldrepl.d U7, B0, 0x18 /* line 4 */ - xvfmul.d D12, U0, U4 - xvfmul.d D13, U1, U4 + xvfmul.d D12, U0, U7 + xvfmul.d D13, U1, U7 /* Add stride for A0 and B0 */ addi.d A0, A0, 0x40 @@ -751,195 +1470,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
/* if (TL < 1) goto L_M8_L7 */ beq ZERO,TL, .L_M8_L7 + xvld U8, A0, 0x00 + xvld U9, A0, 0x20 + + addi.d TL, TL, -1 + + xvldrepl.d U12, B0, 0x00 + xvldrepl.d U13, B0, 0x08 + xvldrepl.d U14, B0, 0x10 + xvldrepl.d U15, B0, 0x18 + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x20 + + beq ZERO, TL, .L_M8_TL1_END + .L_M8_TL1: /* TL-- */ - /***8-1***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - xvfmadd.d D9, U1, U4, D9 - - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - xvfmadd.d D13, U1, U4, D13 - - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x20 - - /***8-2***/ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - xvfmadd.d D9, U1, U4, D9 - - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - xvfmadd.d D13, U1, U4, D13 - - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x20 - - /***8-3***/ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - xvfmadd.d D9, U1, U4, D9 - - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - xvfmadd.d D13, U1, U4, D13 - - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x20 - - /***8-4***/ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - xvfmadd.d D9, U1, U4, D9 - - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - xvfmadd.d D13, U1, U4, D13 - - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x20 - - /***8-5***/ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - xvfmadd.d D9, U1, U4, D9 - - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - xvfmadd.d D13, U1, U4, D13 - - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x20 - - /***8-6***/ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - xvfmadd.d D9, U1, U4, D9 - - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - xvfmadd.d D13, U1, U4, D13 - - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x20 - - /***8-7***/ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - xvfmadd.d D9, U1, U4, D9 - - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - xvfmadd.d D13, U1, U4, D13 - - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x20 - - /***8-8***/ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - - xvldrepl.d U4, B0, 
0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - xvfmadd.d D9, U1, U4, D9 - - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - xvfmadd.d D13, U1, U4, D13 - - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x20 + KERNEL8x8x4 addi.d TL, TL, -1 /* TL-- */ blt ZERO,TL, .L_M8_TL1 +.L_M8_TL1_END: + KERNEL8x8x4_END + .L_M8_L7: /* if (!(L & 7)) goto L_M8_L0 */ andi TL, L, 7 @@ -953,17 +1506,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvfmadd.d D0, U0, U4, D0 xvfmadd.d D1, U1, U4, D1 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 + xvldrepl.d U5, B0, 0x08 + xvfmadd.d D4, U0, U5, D4 + xvfmadd.d D5, U1, U5, D5 - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - xvfmadd.d D9, U1, U4, D9 + xvldrepl.d U6, B0, 0x10 + xvfmadd.d D8, U0, U6, D8 + xvfmadd.d D9, U1, U6, D9 - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - xvfmadd.d D13, U1, U4, D13 + xvldrepl.d U7, B0, 0x18 + xvfmadd.d D12, U0, U7, D12 + xvfmadd.d D13, U1, U7, D13 /* Add stride for A0, B0 */ addi.d A0, A0, 0x40 @@ -973,6 +1526,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. blt ZERO,TL, .L_M8_L71 .L_M8_L0: + xvldrepl.d VALPHA, $sp, 112 #if defined(TRMMKERNEL) xvfmul.d D0, D0, VALPHA xvfmul.d D1, D1, VALPHA @@ -990,22 +1544,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvfmadd.d D1, D1, VALPHA, U1 /* Load C1 */ - xvld U0, C1, 0x00 - xvld U1, C1, 0x20 - xvfmadd.d D4, D4, VALPHA, U0 - xvfmadd.d D5, D5, VALPHA, U1 + xvld U2, C1, 0x00 + xvld U3, C1, 0x20 + xvfmadd.d D4, D4, VALPHA, U2 + xvfmadd.d D5, D5, VALPHA, U3 /* Load C2 */ - xvld U0, C2, 0x00 - xvld U1, C2, 0x20 - xvfmadd.d D8, D8, VALPHA, U0 - xvfmadd.d D9, D9, VALPHA, U1 + xvld U4, C2, 0x00 + xvld U5, C2, 0x20 + xvfmadd.d D8, D8, VALPHA, U4 + xvfmadd.d D9, D9, VALPHA, U5 /* Load C3 */ - xvld U0, C3, 0x00 - xvld U1, C3, 0x20 - xvfmadd.d D12, D12, VALPHA, U0 - xvfmadd.d D13, D13, VALPHA, U1 + xvld U6, C3, 0x00 + xvld U7, C3, 0x20 + xvfmadd.d D12, D12, VALPHA, U6 + xvfmadd.d D13, D13, VALPHA, U7 #endif // #if defined(TRMMKERNEL) /* Store C0 */ @@ -1085,17 +1639,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* line 1 */ xvfmul.d D0, U0, U4 - xvldrepl.d U4, B0, 0x08 + xvldrepl.d U5, B0, 0x08 /* line 2 */ - xvfmul.d D4, U0, U4 + xvfmul.d D4, U0, U5 - xvldrepl.d U4, B0, 0x10 + xvldrepl.d U6, B0, 0x10 /* line 3 */ - xvfmul.d D8, U0, U4 + xvfmul.d D8, U0, U6 - xvldrepl.d U4, B0, 0x18 + xvldrepl.d U7, B0, 0x18 /* line 4 */ - xvfmul.d D12, U0, U4 + xvfmul.d D12, U0, U7 /* Add stride for A0 and B0 */ addi.d A0, A0, 0x20 @@ -1106,153 +1660,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
/* if (TL < 1) goto L_M4_L7 */ beq ZERO,TL, .L_M4_L7 + xvld U8, A0, 0x00 + + addi.d TL, TL, -1 + + xvldrepl.d U12, B0, 0x00 + xvldrepl.d U13, B0, 0x08 + xvldrepl.d U14, B0, 0x10 + xvldrepl.d U15, B0, 0x18 + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x20 + + beq ZERO, TL, .L_M4_TL1_END + .L_M4_TL1: /* TL-- */ - /***8-1***/ - xvld U0, A0, 0x00 + KERNEL8x4x4 - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 + addi.d TL, TL, -1 + blt ZERO,TL, .L_M4_TL1 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x20 - - /***8-2***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x20 - - /***8-3***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x20 - - /***8-4***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x20 - - /***8-5***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x20 - - /***8-6***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x20 - - /***8-7***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x20 - - /***8-8***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x20 - - addi.d TL, TL, -1 /* TL-- */ - blt ZERO,TL, .L_M4_TL1 +.L_M4_TL1_END: + KERNEL8x4x4_END .L_M4_L7: /* if (!(L & 7)) goto L_M4_L0 */ @@ -1282,6 +1710,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. blt ZERO,TL, .L_M4_L71 .L_M4_L0: + xvldrepl.d VALPHA, $sp, 112 #if defined(TRMMKERNEL) xvfmul.d D0, D0, VALPHA xvfmul.d D4, D4, VALPHA @@ -1293,16 +1722,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ /* Load C1 */ - xvld U0, C1, 0x00 - xvfmadd.d D4, D4, VALPHA, U0 + xvld U1, C1, 0x00 + xvfmadd.d D4, D4, VALPHA, U1 /* Load C2 */ - xvld U0, C2, 0x00 - xvfmadd.d D8, D8, VALPHA, U0 + xvld U2, C2, 0x00 + xvfmadd.d D8, D8, VALPHA, U2 /* Load C3 */ - xvld U0, C3, 0x00 - xvfmadd.d D12, D12, VALPHA, U0 + xvld U3, C3, 0x00 + xvfmadd.d D12, D12, VALPHA, U3 #endif // #if defined(TRMMKERNEL) /* Store C0 */ @@ -1372,23 +1801,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif /* Load 2 * 64 from A0 */ - xvld U0, A0, 0x00 + xvldrepl.d U0, A0, 0x00 + xvldrepl.d U1, A0, 0x08 - xvldrepl.d U4, B0, 0x00 - /* line 1 */ - xvfmul.d D0, U0, U4 + xvld U4, B0, 0x00 - xvldrepl.d U4, B0, 0x08 - /* line 2 */ - xvfmul.d D4, U0, U4 - - xvldrepl.d U4, B0, 0x10 - /* line 3 */ - xvfmul.d D8, U0, U4 - - xvldrepl.d U4, B0, 0x18 - /* line 4 */ - xvfmul.d D12, U0, U4 + xvfmul.d D0, U0, U4 + xvfmul.d D1, U1, U4 /* Add stride for A0 and B0 */ addi.d A0, A0, 0x10 @@ -1399,154 +1818,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* if (TL < 1) goto L_M2_L7 */ beq ZERO,TL, .L_M2_L7 + xvldrepl.d U8, A0, 0x00 + xvldrepl.d U9, A0, 0x08 + + addi.d TL, TL, -1 + + xvld U12, B0, 0x00 + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x20 + + beq ZERO, TL, .L_M2_TL1_END .L_M2_TL1: /* TL-- */ - /***8-1***/ - /* Load 2 * 64 from A0 */ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - - addi.d A0, A0, 0x10 - addi.d B0, B0, 0x20 - - /***8-2***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - - addi.d A0, A0, 0x10 - addi.d B0, B0, 0x20 - - /***8-3***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - - addi.d A0, A0, 0x10 - addi.d B0, B0, 0x20 - - /***8-4***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - - addi.d A0, A0, 0x10 - addi.d B0, B0, 0x20 - - /***8-5***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - - addi.d A0, A0, 0x10 - addi.d B0, B0, 0x20 - - /***8-6***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - - addi.d A0, A0, 0x10 - addi.d B0, B0, 0x20 - - /***8-7***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - - addi.d A0, A0, 0x10 - addi.d B0, B0, 0x20 - - /***8-8***/ - 
xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - - addi.d A0, A0, 0x10 - addi.d B0, B0, 0x20 + KERNEL8x2x4 addi.d TL, TL, -1 /* TL-- */ blt ZERO,TL, .L_M2_TL1 +.L_M2_TL1_END: + KERNEL8x2x4_END .L_M2_L7: /* if (!(L & 7)) goto L_M2_L0 */ @@ -1554,20 +1842,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. beq TL, ZERO,.L_M2_L0 .L_M2_L71: - xvld U0, A0, 0x00 + xvldrepl.d U0, A0, 0x00 + xvldrepl.d U1, A0, 0x08 - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 + xvld U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 + xvfmadd.d D1, U1, U4, D1 /* Add stride for A0, B0 */ addi.d A0, A0, 0x10 addi.d B0, B0, 0x20 @@ -1576,37 +1857,44 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. blt ZERO,TL, .L_M2_L71 .L_M2_L0: + xvldrepl.d VALPHA, $sp, 112 #if defined(TRMMKERNEL) xvfmul.d D0, D0, VALPHA - xvfmul.d D4, D4, VALPHA - xvfmul.d D8, D8, VALPHA - xvfmul.d D12, D12, VALPHA -#else - /* Load C0 */ - xvld U0, C0, 0x00 - xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ - - /* Load C1 */ - xvld U0, C1, 0x00 - xvfmadd.d D4, D4, VALPHA, U0 - - /* Load C2 */ - xvld U0, C2, 0x00 - xvfmadd.d D8, D8, VALPHA, U0 - - /* Load C3 */ - xvld U0, C3, 0x00 - xvfmadd.d D12, D12, VALPHA, U0 -#endif // #if defined(TRMMKERNEL) + xvfmul.d D1, D1, VALPHA xvstelm.d D0, C0, 0x00, 0x00 - xvstelm.d D4, C1, 0x00, 0x00 - xvstelm.d D8, C2, 0x00, 0x00 - xvstelm.d D12, C3, 0x00, 0x00 - xvstelm.d D0, C0, 0x08, 0x01 - xvstelm.d D4, C1, 0x08, 0x01 - xvstelm.d D8, C2, 0x08, 0x01 - xvstelm.d D12, C3, 0x08, 0x01 + xvstelm.d D0, C1, 0x00, 0x01 + xvstelm.d D0, C2, 0x00, 0x02 + xvstelm.d D0, C3, 0x00, 0x03 + xvstelm.d D1, C0, 0x08, 0x00 + xvstelm.d D1, C1, 0x08, 0x01 + xvstelm.d D1, C2, 0x08, 0x02 + xvstelm.d D1, C3, 0x08, 0x03 +#else + xvpackev.d D4, D1, D0 + xvpackod.d D5, D1, D0 + /* Load C0 */ + xvld U0, C0, 0x00 + /* Load C1 */ + xvld U1, C1, 0x00 + /* Load C2 */ + xvld U2, C2, 0x00 + /* Load C3 */ + xvld U3, C3, 0x00 + + xvpermi.q U2, U0, 0x20 + xvpermi.q U3, U1, 0x20 + + xvfmadd.d D0, D4, VALPHA, U2 + xvfmadd.d D1, D5, VALPHA, U3 + + vst $vr16, C0, 0x00 + vst $vr17, C1, 0x00 + xvstelm.d D0, C2, 0x00, 0x02 + xvstelm.d D1, C3, 0x00, 0x02 + xvstelm.d D0, C2, 0x08, 0x03 + xvstelm.d D1, C3, 0x08, 0x03 +#endif // #if defined(TRMMKERNEL) /* Add stride for C */ addi.d C0, C0, 0x10 @@ -1666,24 +1954,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. move L, K /* L = bk */ #endif - /* Load 1 * 64 from A0 */ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - /* line 1 */ - xvfmul.d D0, U0, U4 - - xvldrepl.d U4, B0, 0x08 - /* line 2 */ - xvfmul.d D4, U0, U4 - - xvldrepl.d U4, B0, 0x10 - /* line 3 */ - xvfmul.d D8, U0, U4 - - xvldrepl.d U4, B0, 0x18 - /* line 4 */ - xvfmul.d D12, U0, U4 + xvldrepl.d U0, A0, 0x00 + xvld U4, B0, 0x00 + xvfmul.d D0, U0, U4 /* Add stride for A0 and B0 */ addi.d A0, A0, 0x08 @@ -1694,154 +1967,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
/* if (TL < 1) goto L_M1_L7 */ beq ZERO,TL, .L_M1_L7 + xvldrepl.d U8, A0, 0x00 + + addi.d TL, TL, -1 + xvld U12, B0, 0x00 + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x20 + + beq ZERO, TL, .L_M1_TL1_END + .L_M1_TL1: /* TL-- */ - /***8-1***/ - /* Load 1 * 64 from A0 */ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - - addi.d A0, A0, 0x08 - addi.d B0, B0, 0x20 - - /***8-2***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - - addi.d A0, A0, 0x08 - addi.d B0, B0, 0x20 - - /***8-3***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - - addi.d A0, A0, 0x08 - addi.d B0, B0, 0x20 - - /***8-4***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - - addi.d A0, A0, 0x08 - addi.d B0, B0, 0x20 - - /***8-5***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - - addi.d A0, A0, 0x08 - addi.d B0, B0, 0x20 - - /***8-6***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - - addi.d A0, A0, 0x08 - addi.d B0, B0, 0x20 - - /***8-7***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - - addi.d A0, A0, 0x08 - addi.d B0, B0, 0x20 - - /***8-8***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 - - addi.d A0, A0, 0x08 - addi.d B0, B0, 0x20 + KERNEL8x1x4 addi.d TL, TL, -1 /* TL-- */ blt ZERO,TL, .L_M1_TL1 +.L_M1_TL1_END: + KERNEL8x1x4_END .L_M1_L7: /* if (!(L & 7)) goto L_M1_L0 */ @@ -1849,19 +1990,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. beq TL, ZERO,.L_M1_L0 .L_M1_L71: - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - xvldrepl.d U4, B0, 0x10 - xvfmadd.d D8, U0, U4, D8 - - xvldrepl.d U4, B0, 0x18 - xvfmadd.d D12, U0, U4, D12 + xvldrepl.d U0, A0, 0x00 + xvld U4, B0, 0x00 + xvfmadd.d D0, U0, U4, D0 /* Add stride for A0, B0 */ addi.d A0, A0, 0x08 @@ -1871,33 +2002,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
blt ZERO,TL, .L_M1_L71 .L_M1_L0: + xvldrepl.d VALPHA, $sp, 112 #if defined(TRMMKERNEL) xvfmul.d D0, D0, VALPHA - xvfmul.d D4, D4, VALPHA - xvfmul.d D8, D8, VALPHA - xvfmul.d D12, D12, VALPHA -#else - /* Load C0 */ - xvld U0, C0, 0x00 - xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ - - /* Load C1 */ - xvld U0, C1, 0x00 - xvfmadd.d D4, D4, VALPHA, U0 - - /* Load C2 */ - xvld U0, C2, 0x00 - xvfmadd.d D8, D8, VALPHA, U0 - - /* Load C3 */ - xvld U0, C3, 0x00 - xvfmadd.d D12, D12, VALPHA, U0 -#endif // #if defined(TRMMKERNEL) xvstelm.d D0, C0, 0x00, 0x00 - xvstelm.d D4, C1, 0x00, 0x00 - xvstelm.d D8, C2, 0x00, 0x00 - xvstelm.d D12, C3, 0x00, 0x00 + xvstelm.d D0, C1, 0x00, 0x01 + xvstelm.d D0, C2, 0x00, 0x02 + xvstelm.d D0, C3, 0x00, 0x03 +#else + /* Load C0 */ + xvldrepl.d U0, C0, 0x00 + xvfmadd.d D4, D0, VALPHA, U0 + + /* Load C1 */ + xvldrepl.d U1, C1, 0x00 + xvfmadd.d D5, D0, VALPHA, U1 + + /* Load C2 */ + xvldrepl.d U2, C2, 0x00 + xvfmadd.d D6, D0, VALPHA, U2 + + /* Load C3 */ + xvldrepl.d U3, C3, 0x00 + xvfmadd.d D7, D0, VALPHA, U3 + + xvstelm.d D4, C0, 0x00, 0x00 + xvstelm.d D5, C1, 0x00, 0x01 + xvstelm.d D6, C2, 0x00, 0x02 + xvstelm.d D7, C3, 0x00, 0x03 +#endif // #if defined(TRMMKERNEL) /* Add stride for C */ addi.d C0, C0, 0x08 @@ -1952,6 +2086,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ///////////////////////////////////////////////// /************************ Condition 1 if((N >> 2) && (M >> 4)) END !!! ************************/ + xvldrepl.d VALPHA, $sp, 112 + .L_N3: andi J, N, 2 beq ZERO, J, .L_N1 @@ -2015,12 +2151,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvfmul.d D2, U2, U4 xvfmul.d D3, U3, U4 - xvldrepl.d U4, B0, 0x08 + xvldrepl.d U5, B0, 0x08 /* line 2 */ - xvfmul.d D4, U0, U4 - xvfmul.d D5, U1, U4 - xvfmul.d D6, U2, U4 - xvfmul.d D7, U3, U4 + xvfmul.d D4, U0, U5 + xvfmul.d D5, U1, U5 + xvfmul.d D6, U2, U5 + xvfmul.d D7, U3, U5 /* Add stride for A0 and B0 */ addi.d A0, A0, 0x80 @@ -2031,185 +2167,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
/* if (TL < 1) goto L_N3_L7 */ beq ZERO,TL, .L_N3_L7 + xvld U8, A0, 0x00 + xvld U9, A0, 0x20 + xvld U10, A0, 0x40 + xvld U11, A0, 0x60 + + addi.d TL, TL, -1 + + xvldrepl.d U12, B0, 0x00 + xvldrepl.d U13, B0, 0x08 + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x10 + + beq ZERO, TL, .L_N3_TL1_END + .L_N3_TL1: /* TL-- */ - /***8-1***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - xvfmadd.d D6, U2, U4, D6 - xvfmadd.d D7, U3, U4, D7 - - addi.d A0, A0, 0x80 - addi.d B0, B0, 0x10 - - /***8-2***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - xvfmadd.d D6, U2, U4, D6 - xvfmadd.d D7, U3, U4, D7 - - addi.d A0, A0, 0x80 - addi.d B0, B0, 0x10 - - /***8-3***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - xvfmadd.d D6, U2, U4, D6 - xvfmadd.d D7, U3, U4, D7 - - addi.d A0, A0, 0x80 - addi.d B0, B0, 0x10 - - /***8-4***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - xvfmadd.d D6, U2, U4, D6 - xvfmadd.d D7, U3, U4, D7 - - addi.d A0, A0, 0x80 - addi.d B0, B0, 0x10 - - /***8-5***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - xvfmadd.d D6, U2, U4, D6 - xvfmadd.d D7, U3, U4, D7 - - addi.d A0, A0, 0x80 - addi.d B0, B0, 0x10 - - /***8-6***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - xvfmadd.d D6, U2, U4, D6 - xvfmadd.d D7, U3, U4, D7 - - addi.d A0, A0, 0x80 - addi.d B0, B0, 0x10 - - /***8-7***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - xvfmadd.d D6, U2, U4, D6 - xvfmadd.d D7, U3, U4, D7 - - addi.d A0, A0, 0x80 - addi.d B0, B0, 0x10 - - /***8-8***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - - xvldrepl.d 
U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - xvfmadd.d D6, U2, U4, D6 - xvfmadd.d D7, U3, U4, D7 - - addi.d A0, A0, 0x80 - addi.d B0, B0, 0x10 + KERNEL8x16x2 addi.d TL, TL, -1 /* TL-- */ blt ZERO,TL, .L_N3_TL1 +.L_N3_TL1_END: + KERNEL8x16x2_END .L_N3_L7: /* if (!(L & 7)) goto L_N3_L0 */ @@ -2229,12 +2207,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvfmadd.d D2, U2, U4, D2 xvfmadd.d D3, U3, U4, D3 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - xvfmadd.d D6, U2, U4, D6 - xvfmadd.d D7, U3, U4, D7 - + xvldrepl.d U5, B0, 0x08 + xvfmadd.d D4, U0, U5, D4 + xvfmadd.d D5, U1, U5, D5 + xvfmadd.d D6, U2, U5, D6 + xvfmadd.d D7, U3, U5, D7 /* Add stride for A0, B0 */ addi.d A0, A0, 0x80 addi.d B0, B0, 0x10 @@ -2264,14 +2241,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvfmadd.d D3, D3, VALPHA, U3 /* Load C1 */ - xvld U0, C1, 0x00 - xvld U1, C1, 0x20 - xvld U2, C1, 0x40 - xvld U3, C1, 0x60 - xvfmadd.d D4, D4, VALPHA, U0 - xvfmadd.d D5, D5, VALPHA, U1 - xvfmadd.d D6, D6, VALPHA, U2 - xvfmadd.d D7, D7, VALPHA, U3 + xvld U4, C1, 0x00 + xvld U5, C1, 0x20 + xvld U6, C1, 0x40 + xvld U7, C1, 0x60 + xvfmadd.d D4, D4, VALPHA, U4 + xvfmadd.d D5, D5, VALPHA, U5 + xvfmadd.d D6, D6, VALPHA, U6 + xvfmadd.d D7, D7, VALPHA, U7 #endif // #if defined(TRMMKERNEL) /* Store C0 */ @@ -2352,10 +2329,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvfmul.d D0, U0, U4 xvfmul.d D1, U1, U4 - xvldrepl.d U4, B0, 0x08 + xvldrepl.d U5, B0, 0x08 /* line 2 */ - xvfmul.d D4, U0, U4 - xvfmul.d D5, U1, U4 + xvfmul.d D4, U0, U5 + xvfmul.d D5, U1, U5 /* Add stride for A0 and B0 */ addi.d A0, A0, 0x40 @@ -2366,131 +2343,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
/* if (TL < 1) goto L_N3_M8_L7 */ beq ZERO,TL, .L_N3_M8_L7 + xvld U8, A0, 0x00 + xvld U9, A0, 0x20 + + addi.d TL, TL, -1 + + xvldrepl.d U12, B0, 0x00 + xvldrepl.d U13, B0, 0x08 + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x10 + + beq ZERO, TL, .L_N3_M8_TL1_END + .L_N3_M8_TL1: /* TL-- */ - /***8-1***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x10 - - /***8-2***/ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x10 - - /***8-3***/ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x10 - - /***8-4***/ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x10 - - /***8-5***/ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - - /* Cumulative D0~D15 */ - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x10 - - /***8-6***/ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x10 - - /***8-7***/ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x10 - - /***8-8***/ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 - - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x10 + KERNEL8x8x2 addi.d TL, TL, -1 /* TL-- */ blt ZERO,TL, .L_N3_M8_TL1 +.L_N3_M8_TL1_END: + KERNEL8x8x2_END .L_N3_M8_L7: /* if (!(L & 7)) goto L_N3_M8_L0 */ @@ -2505,9 +2376,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvfmadd.d D0, U0, U4, D0 xvfmadd.d D1, U1, U4, D1 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - xvfmadd.d D5, U1, U4, D5 + xvldrepl.d U5, B0, 0x08 + xvfmadd.d D4, U0, U5, D4 + xvfmadd.d D5, U1, U5, D5 /* Add stride for A0, B0 */ addi.d A0, A0, 0x40 @@ -2530,10 +2401,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvfmadd.d D1, D1, VALPHA, U1 /* Load C1 */ - xvld U0, C1, 0x00 - xvld U1, C1, 0x20 - xvfmadd.d D4, D4, VALPHA, U0 - xvfmadd.d D5, D5, VALPHA, U1 + xvld U2, C1, 0x00 + xvld U3, C1, 0x20 + xvfmadd.d D4, D4, VALPHA, U2 + xvfmadd.d D5, D5, VALPHA, U3 #endif // #if defined(TRMMKERNEL) /* Store C0 */ @@ -2603,9 +2474,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
/* line 1 */ xvfmul.d D0, U0, U4 - xvldrepl.d U4, B0, 0x08 + xvldrepl.d U5, B0, 0x08 /* line 2 */ - xvfmul.d D4, U0, U4 + xvfmul.d D4, U0, U5 /* Add stride for A0 and B0 */ addi.d A0, A0, 0x20 @@ -2616,107 +2487,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* if (TL < 1) goto L_N3_M4_L7 */ beq ZERO,TL, .L_N3_M4_L7 + xvld U8, A0, 0x00 + + addi.d TL, TL, -1 + + xvldrepl.d U12, B0, 0x00 + xvldrepl.d U13, B0, 0x08 + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x10 + + beq ZERO, TL, .L_N3_M4_TL1_END + .L_N3_M4_TL1: /* TL-- */ - /***8-1***/ - /* Load 8 * 64 from A0 */ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x10 - - /***8-2***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x10 - - /***8-3***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x10 - - /***8-4***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x10 - - /***8-5***/ - xvld U0, A0, 0x00 - - /* Cumulative D0~D15 */ - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x10 - - /***8-6***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x10 - - /***8-7***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x10 - - /***8-8***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x10 + KERNEL8x4x2 addi.d TL, TL, -1 /* TL-- */ blt ZERO,TL, .L_N3_M4_TL1 +.L_N3_M4_TL1_END: + KERNEL8x4x2_END .L_N3_M4_L7: /* if (!(L & 7)) goto L_N3_M4_L0 */ @@ -2729,8 +2517,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvldrepl.d U4, B0, 0x00 xvfmadd.d D0, U0, U4, D0 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 + xvldrepl.d U5, B0, 0x08 + xvfmadd.d D4, U0, U5, D4 /* Add stride for A0, B0 */ addi.d A0, A0, 0x20 @@ -2749,8 +2537,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ /* Load C1 */ - xvld U0, C1, 0x00 - xvfmadd.d D4, D4, VALPHA, U0 + xvld U1, C1, 0x00 + xvfmadd.d D4, D4, VALPHA, U1 #endif // #if defined(TRMMKERNEL) /* Store C0 */ @@ -2830,106 +2618,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
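For orientation in the remaining .L_N3_M* hunks: these loops all update two columns of C (C0 and C1), so every k-step consumes two doubles of B (offsets 0x00 and 0x08, with B0 advancing by 0x10) and one row block of packed A, with A0 advancing by the block width in bytes: 0x80 for 16 rows, 0x40 for 8, 0x20 for 4, 0x10 for 2, and 0x08 for 1. Only these strides and the number of U/D registers differ between the M16, M8, M4, M2 and M1 variants; the pipelined structure is identical.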
/* if (TL < 1) goto L_N3_M2_L7 */ beq ZERO,TL, .L_N3_M2_L7 + xvld U8, A0, 0x00 + + addi.d TL, TL, -1 + + xvldrepl.d U12, B0, 0x00 + xvldrepl.d U13, B0, 0x08 + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x10 + + beq ZERO, TL, .L_N3_M2_TL1_END + .L_N3_M2_TL1: /* TL-- */ - /***8-1***/ - /* Load 2 * 64 from A0 */ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - addi.d A0, A0, 0x10 - addi.d B0, B0, 0x10 - - /***8-2***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - addi.d A0, A0, 0x10 - addi.d B0, B0, 0x10 - - /***8-3***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - addi.d A0, A0, 0x10 - addi.d B0, B0, 0x10 - - /***8-4***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - addi.d A0, A0, 0x10 - addi.d B0, B0, 0x10 - - /***8-5***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - addi.d A0, A0, 0x10 - addi.d B0, B0, 0x10 - - /***8-6***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - addi.d A0, A0, 0x10 - addi.d B0, B0, 0x10 - - /***8-7***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - addi.d A0, A0, 0x10 - addi.d B0, B0, 0x10 - - /***8-8***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - addi.d A0, A0, 0x10 - addi.d B0, B0, 0x10 + KERNEL8x2x2 addi.d TL, TL, -1 /* TL-- */ blt ZERO,TL, .L_N3_M2_TL1 +.L_N3_M2_TL1_END: + KERNEL8x2x2_END .L_N3_M2_L7: /* if (!(L & 7)) goto L_N3_M2_L0 */ @@ -2942,8 +2648,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvldrepl.d U4, B0, 0x00 xvfmadd.d D0, U0, U4, D0 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 + xvldrepl.d U5, B0, 0x08 + xvfmadd.d D4, U0, U5, D4 /* Add stride for A0, B0 */ addi.d A0, A0, 0x10 @@ -2962,8 +2668,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ /* Load C1 */ - xvld U0, C1, 0x00 - xvfmadd.d D4, D4, VALPHA, U0 + xvld U1, C1, 0x00 + xvfmadd.d D4, D4, VALPHA, U1 #endif // #if defined(TRMMKERNEL) xvstelm.d D0, C0, 0x00, 0x00 @@ -3043,106 +2749,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
/* if (TL < 1) goto L_N3_M1_L7 */ beq ZERO,TL, .L_N3_M1_L7 + xvld U8, A0, 0x00 + + addi.d TL, TL, -1 + + xvldrepl.d U12, B0, 0x00 + xvldrepl.d U13, B0, 0x08 + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x10 + + beq ZERO, TL, .L_N3_M1_TL1_END + .L_N3_M1_TL1: /* TL-- */ - /***8-1***/ - /* Load 1 * 64 from A0 */ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - addi.d A0, A0, 0x08 - addi.d B0, B0, 0x10 - - /***8-2***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - addi.d A0, A0, 0x08 - addi.d B0, B0, 0x10 - - /***8-3***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - addi.d A0, A0, 0x08 - addi.d B0, B0, 0x10 - - /***8-4***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - addi.d A0, A0, 0x08 - addi.d B0, B0, 0x10 - - /***8-5***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - addi.d A0, A0, 0x08 - addi.d B0, B0, 0x10 - - /***8-6***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - addi.d A0, A0, 0x08 - addi.d B0, B0, 0x10 - - /***8-7***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - addi.d A0, A0, 0x08 - addi.d B0, B0, 0x10 - - /***8-8***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 - - addi.d A0, A0, 0x08 - addi.d B0, B0, 0x10 + KERNEL8x1x2 addi.d TL, TL, -1 /* TL-- */ blt ZERO,TL, .L_N3_M1_TL1 +.L_N3_M1_TL1_END: + KERNEL8x1x2_END .L_N3_M1_L7: /* if (!(L & 7)) goto L_N3_M1_L0 */ @@ -3155,8 +2779,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvldrepl.d U4, B0, 0x00 xvfmadd.d D0, U0, U4, D0 - xvldrepl.d U4, B0, 0x08 - xvfmadd.d D4, U0, U4, D4 + xvldrepl.d U5, B0, 0x08 + xvfmadd.d D4, U0, U5, D4 /* Add stride for A0, B0 */ addi.d A0, A0, 0x08 @@ -3175,8 +2799,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvfmadd.d D0, D0, VALPHA, U0 /* D0 = U0 + (D0 * VALPHA) */ /* Load C1 */ - xvld U0, C1, 0x00 - xvfmadd.d D4, D4, VALPHA, U0 + xvld U1, C1, 0x00 + xvfmadd.d D4, D4, VALPHA, U1 #endif // #if defined(TRMMKERNEL) xvstelm.d D0, C0, 0x00, 0x00 @@ -3300,137 +2924,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
/* if (TL < 1) goto L_N1_L7 */ beq ZERO,TL, .L_N1_L7 + xvld U8, A0, 0x00 + xvld U9, A0, 0x20 + xvld U10, A0, 0x40 + xvld U11, A0, 0x60 + + addi.d TL, TL, -1 + + xvldrepl.d U12, B0, 0x00 + addi.d A0, A0, 0x80 + addi.d B0, B0, 0x08 + + beq ZERO, TL, .L_N1_TL1_END .L_N1_TL1: /* TL-- */ - /***8-1***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 - - addi.d A0, A0, 0x80 - addi.d B0, B0, 0x08 - - /***8-2***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 - - addi.d A0, A0, 0x80 - addi.d B0, B0, 0x08 - - /***8-3***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 - - addi.d A0, A0, 0x80 - addi.d B0, B0, 0x08 - - /***8-4***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 - - addi.d A0, A0, 0x80 - addi.d B0, B0, 0x08 - - /***8-5***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 - - addi.d A0, A0, 0x80 - addi.d B0, B0, 0x08 - - /***8-6***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 - - addi.d A0, A0, 0x80 - addi.d B0, B0, 0x08 - - /***8-7***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 - - addi.d A0, A0, 0x80 - addi.d B0, B0, 0x08 - - /***8-8***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - xvld U2, A0, 0x40 - xvld U3, A0, 0x60 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - xvfmadd.d D2, U2, U4, D2 - xvfmadd.d D3, U3, U4, D3 - - addi.d A0, A0, 0x80 - addi.d B0, B0, 0x08 + KERNEL8x16x1 addi.d TL, TL, -1 /* TL-- */ blt ZERO,TL, .L_N1_TL1 +.L_N1_TL1_END: + KERNEL8x16x1_END .L_N1_L7: /* if (!(L & 7)) goto L_N1_L0 */ @@ -3556,99 +3068,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
/* if (TL < 1) goto L_N1_M8_L7 */ beq ZERO,TL, .L_N1_M8_L7 + xvld U8, A0, 0x00 + xvld U9, A0, 0x20 + + addi.d TL, TL, -1 + + xvldrepl.d U12, B0, 0x00 + addi.d A0, A0, 0x40 + addi.d B0, B0, 0x08 + + beq ZERO, TL, .L_N1_M8_TL1_END .L_N1_M8_TL1: /* TL-- */ - /***8-1***/ - /* Load 16 * 64 from A0 */ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x08 - - /***8-2***/ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x08 - - /***8-3***/ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x08 - - /***8-4***/ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x08 - - /***8-5***/ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x08 - - /***8-6***/ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x08 - - /***8-7***/ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x08 - - /***8-8***/ - xvld U0, A0, 0x00 - xvld U1, A0, 0x20 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - xvfmadd.d D1, U1, U4, D1 - - addi.d A0, A0, 0x40 - addi.d B0, B0, 0x08 + KERNEL8x8x1 addi.d TL, TL, -1 /* TL-- */ blt ZERO,TL, .L_N1_M8_TL1 +.L_N1_M8_TL1_END: + KERNEL8x8x1_END + .L_N1_M8_L7: /* if (!(L & 7)) goto L_N1_M8_L0 */ andi TL, L, 7 @@ -3753,81 +3191,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
/* if (TL < 1) goto L_N1_M4_L7 */ beq ZERO,TL, .L_N1_M4_L7 + xvld U8, A0, 0x00 + + addi.d TL, TL, -1 + + xvldrepl.d U12, B0, 0x00 + addi.d A0, A0, 0x20 + addi.d B0, B0, 0x08 + + beq ZERO, TL, .L_N1_M4_TL1_END + .L_N1_M4_TL1: /* TL-- */ - /***8-1***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x08 - - /***8-2***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x08 - - /***8-3***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x08 - - /***8-4***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x08 - - /***8-5***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x08 - - /***8-6***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x08 - - /***8-7***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x08 - - /***8-8***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - addi.d A0, A0, 0x20 - addi.d B0, B0, 0x08 + KERNEL8x4x1 addi.d TL, TL, -1 /* TL-- */ blt ZERO,TL, .L_N1_M4_TL1 +.L_N1_M4_TL1_END: + KERNEL8x4x1_END .L_N1_M4_L7: /* if (!(L & 7)) goto L_N1_M4_L0 */ @@ -3927,82 +3307,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* if (TL < 1) goto L_N1_M2_L7 */ beq ZERO,TL, .L_N1_M2_L7 + xvld U8, A0, 0x00 + + addi.d TL, TL, -1 + + xvldrepl.d U12, B0, 0x00 + addi.d A0, A0, 0x10 + addi.d B0, B0, 0x08 + + beq ZERO, TL, .L_N1_M2_TL1_END + .L_N1_M2_TL1: /* TL-- */ - /***8-1***/ - /* Load 2 * 64 from A0 */ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - addi.d A0, A0, 0x10 - addi.d B0, B0, 0x08 - - /***8-2***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - addi.d A0, A0, 0x10 - addi.d B0, B0, 0x08 - - /***8-3***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - addi.d A0, A0, 0x10 - addi.d B0, B0, 0x08 - - /***8-4***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - addi.d A0, A0, 0x10 - addi.d B0, B0, 0x08 - - /***8-5***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - addi.d A0, A0, 0x10 - addi.d B0, B0, 0x08 - - /***8-6***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - addi.d A0, A0, 0x10 - addi.d B0, B0, 0x08 - - /***8-7***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - addi.d A0, A0, 0x10 - addi.d B0, B0, 0x08 - - /***8-8***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - addi.d A0, A0, 0x10 - addi.d B0, B0, 0x08 + KERNEL8x2x1 addi.d TL, TL, -1 /* TL-- */ blt ZERO,TL, .L_N1_M2_TL1 +.L_N1_M2_TL1_END: + KERNEL8x2x1_END .L_N1_M2_L7: /* if (!(L & 7)) goto L_N1_M2_L0 */ @@ -4101,82 +3422,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
/* if (TL < 1) goto L_N1_M1_L7 */ beq ZERO,TL, .L_N1_M1_L7 + xvld U8, A0, 0x00 + + addi.d TL, TL, -1 + + xvldrepl.d U12, B0, 0x00 + addi.d A0, A0, 0x08 + addi.d B0, B0, 0x08 + + beq ZERO, TL, .L_N1_M1_TL1_END + .L_N1_M1_TL1: /* TL-- */ - /***8-1***/ - /* Load 1 * 64 from A0 */ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - addi.d A0, A0, 0x08 - addi.d B0, B0, 0x08 - - /***8-2***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - addi.d A0, A0, 0x08 - addi.d B0, B0, 0x08 - - /***8-3***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - addi.d A0, A0, 0x08 - addi.d B0, B0, 0x08 - - /***8-4***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - addi.d A0, A0, 0x08 - addi.d B0, B0, 0x08 - - /***8-5***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - addi.d A0, A0, 0x08 - addi.d B0, B0, 0x08 - - /***8-6***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - addi.d A0, A0, 0x08 - addi.d B0, B0, 0x08 - - /***8-7***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - addi.d A0, A0, 0x08 - addi.d B0, B0, 0x08 - - /***8-8***/ - xvld U0, A0, 0x00 - - xvldrepl.d U4, B0, 0x00 - xvfmadd.d D0, U0, U4, D0 - - addi.d A0, A0, 0x08 - addi.d B0, B0, 0x08 + KERNEL8x1x1 addi.d TL, TL, -1 /* TL-- */ blt ZERO,TL, .L_N1_M1_TL1 +.L_N1_M1_TL1_END: + KERNEL8x1x1_END .L_N1_M1_L7: /* if (!(L & 7)) goto L_N1_M1_L0 */ @@ -4243,7 +3505,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. LDARG $r26, $sp, 24 LDARG $r27, $sp, 32 LD $f23, $sp, 40 - addi.d $sp, $sp, 56 + LD $f24, $sp, 48 + LD $f25, $sp, 56 + LD $f26, $sp, 64 + LD $f27, $sp, 72 + LD $f28, $sp, 80 + LD $f29, $sp, 88 + LD $f30, $sp, 96 + LD $f31, $sp, 104 + addi.d $sp, $sp, 120 jirl $r0, $r1, 0x0 diff --git a/kernel/loongarch64/dgemv_n_8_lasx.S b/kernel/loongarch64/dgemv_n_8_lasx.S new file mode 100644 index 000000000..940d27569 --- /dev/null +++ b/kernel/loongarch64/dgemv_n_8_lasx.S @@ -0,0 +1,546 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" + +/********************************************************************* +* 2023/07/14 guxiwei +* UTEST : OK +* CTEST : OK +* TEST : OK +* +* +*********************************************************************/ + +/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, + * FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) + */ +#define M $r4 +#define N $r5 +#define ALPHA $f0 +#define A $r7 +#define LDA $r8 +#define X $r9 +#define INC_X $r10 +#define Y $r11 +#define INC_Y $r6 + +#define J $r12 +#define I $r13 +#define K $r14 +#define Y_ORG $r15 +#define OFFSET $r16 +#define K_LDA $r17 +#define M8 $r18 +#define T0 $r19 +#define PA0 $r20 +#define PA1 $r23 +#define PA2 $r24 +#define PA3 $r25 +#define PA4 $r26 +#define PA5 $r27 +#define PA6 $r28 +#define PA7 $r29 + +#define VALPHA $xr1 +#define X0 $xr2 +#define X1 $xr3 +#define X2 $xr4 +#define X3 $xr5 +#define X4 $xr6 +#define X5 $xr7 +#define X6 $xr8 +#define X7 $xr9 +#define Y0 $xr10 +#define Y1 $xr11 +#define A0 $xr12 +#define A1 $xr13 +#define A2 $xr14 +#define A3 $xr15 +#define A4 $xr16 +#define A5 $xr17 +#define A6 $xr18 +#define A7 $xr19 +#define A8 $xr20 +#define A9 $xr21 +#define A10 $xr22 +#define A11 $xr23 +#define A12 $xr24 +#define A13 $xr25 +#define A14 $xr26 +#define A15 $xr27 + +.macro DLOAD_X_8 + GLDREPL xv, d, X0, X, 0x00, X1, X, 0x08, X2, X, 0x10, X3, X, 0x18, \ + X4, X, 0x20, X5, X, 0x28, X6, X, 0x30, X7, X, 0x38 + GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA, X2, X2, VALPHA, X3, X3, VALPHA, \ + X4, X4, VALPHA, X5, X5, VALPHA, X6, X6, VALPHA, X7, X7, VALPHA +.endm + +.macro DLOAD_X_4 + GLDREPL xv, d, X0, X, 0x00, X1, X, 0x08, X2, X, 0x10, X3, X, 0x18 + GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA, X2, X2, VALPHA, X3, X3, VALPHA +.endm + +.macro DLOAD_X_2 + GLDREPL xv, d, X0, X, 0x00, X1, X, 0x08 + GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA +.endm + +.macro DLOAD_X_1 + GLDREPL xv, d, X0, X, 0x00 + GMUL xvf, d, X0, X0, VALPHA +.endm + +.macro DLOAD_Y_8 + GLD xv, , Y0, Y, 0, Y1, Y, 0x20 +.endm + +.macro DLOAD_Y_4 + GLD xv, , Y0, Y, 0 +.endm + +.macro DLOAD_Y_1 + fld.d $f10, Y, 0 +.endm + +.macro DSTORE_Y_8 + GST xv, , Y0, Y, 0, Y1, Y, 0x20 +.endm + +.macro DSTORE_Y_4 + GST xv, , Y0, Y, 0 +.endm + +.macro DSTORE_Y_1 + fst.d $f10, Y, 0 +.endm + +// Unable to use vector load/store ins +.macro DLOAD_Y_8_GAP + fld.d $f10, Y, 0 + fldx.d $f13, Y, INC_Y + PTR_ALSL T0, INC_Y, Y, 1 + fld.d $f14, T0, 0 + fldx.d $f15, T0, INC_Y + PTR_ALSL T0, INC_Y, Y, 2 + fld.d $f11, T0, 0 + fldx.d $f17, T0, INC_Y + PTR_ADD T0, T0, INC_Y + PTR_ADD T0, T0, INC_Y + fld.d $f18, T0, 0 + fldx.d $f19, T0, INC_Y + GINSVE0 xv, d, Y0, A1, 1, Y0, A2, 2, Y0, A3, 3, Y1, A5, 1, Y1, A6, 2, Y1, A7, 3 +.endm + +.macro DLOAD_Y_4_GAP + fld.d $f10, Y, 0 + fldx.d $f13, Y, INC_Y + PTR_ALSL T0, INC_Y, Y, 1 + fld.d $f14, T0, 0 + fldx.d $f15, T0, INC_Y + GINSVE0 
xv, d, Y0, A1, 1, Y0, A2, 2, Y0, A3, 3 +.endm + +.macro DSTORE_Y_8_GAP + xvstelm.d Y0, Y, 0, 0 + PTR_ADD T0, Y, INC_Y + xvstelm.d Y0, T0, 0, 1 + PTR_ADD T0, T0, INC_Y + xvstelm.d Y0, T0, 0, 2 + PTR_ADD T0, T0, INC_Y + xvstelm.d Y0, T0, 0, 3 + + PTR_ADD T0, T0, INC_Y + xvstelm.d Y1, T0, 0, 0 + PTR_ADD T0, T0, INC_Y + xvstelm.d Y1, T0, 0, 1 + PTR_ADD T0, T0, INC_Y + xvstelm.d Y1, T0, 0, 2 + PTR_ADD T0, T0, INC_Y + xvstelm.d Y1, T0, 0, 3 +.endm + +.macro DSTORE_Y_4_GAP + xvstelm.d Y0, Y, 0, 0 + PTR_ADD T0, Y, INC_Y + xvstelm.d Y0, T0, 0, 1 + PTR_ADD T0, T0, INC_Y + xvstelm.d Y0, T0, 0, 2 + PTR_ADD T0, T0, INC_Y + xvstelm.d Y0, T0, 0, 3 +.endm + +.macro DLOAD_X_8_GAP + xvldrepl.d X0, X, 0x00 + PTR_ADD T0, X, INC_X + xvldrepl.d X1, T0, 0x00 + PTR_ADD T0, T0, INC_X + xvldrepl.d X2, T0, 0x00 + PTR_ADD T0, T0, INC_X + xvldrepl.d X3, T0, 0x00 + PTR_ADD T0, T0, INC_X + xvldrepl.d X4, T0, 0x00 + PTR_ADD T0, T0, INC_X + xvldrepl.d X5, T0, 0x00 + PTR_ADD T0, T0, INC_X + xvldrepl.d X6, T0, 0x00 + PTR_ADD T0, T0, INC_X + xvldrepl.d X7, T0, 0x00 + GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA, X2, X2, VALPHA, X3, X3, VALPHA, \ + X4, X4, VALPHA, X5, X5, VALPHA, X6, X6, VALPHA, X7, X7, VALPHA +.endm + +.macro DLOAD_X_4_GAP + xvldrepl.d X0, X, 0x00 + PTR_ADD T0, X, INC_X + xvldrepl.d X1, T0, 0x00 + PTR_ADD T0, T0, INC_X + xvldrepl.d X2, T0, 0x00 + PTR_ADD T0, T0, INC_X + xvldrepl.d X3, T0, 0x00 + GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA, X2, X2, VALPHA, X3, X3, VALPHA +.endm + +.macro DLOAD_X_2_GAP + xvldrepl.d X0, X, 0x00 + PTR_ADD T0, X, INC_X + xvldrepl.d X1, T0, 0x00 + GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA +.endm + +.macro DGEMV_N_8x8 + GLD_INC xv, , 0x20, \ + A0, PA0, 0, A1, PA0, 0, \ + A2, PA1, 0, A3, PA1, 0, \ + A4, PA2, 0, A5, PA2, 0, \ + A6, PA3, 0, A7, PA3, 0, \ + A8, PA4, 0, A9, PA4, 0, \ + A10, PA5, 0, A11, PA5, 0, \ + A12, PA6, 0, A13, PA6, 0, \ + A14, PA7, 0, A15, PA7, 0 + + GMADD xvf, d, Y0, A0, X0, Y0, Y1, A1, X0, Y1, \ + Y0, A2, X1, Y0, Y1, A3, X1, Y1, \ + Y0, A4, X2, Y0, Y1, A5, X2, Y1, \ + Y0, A6, X3, Y0, Y1, A7, X3, Y1, \ + Y0, A8, X4, Y0, Y1, A9, X4, Y1, \ + Y0, A10, X5, Y0, Y1, A11, X5, Y1, \ + Y0, A12, X6, Y0, Y1, A13, X6, Y1, \ + Y0, A14, X7, Y0, Y1, A15, X7, Y1 +.endm + +.macro DGEMV_N_4x8 + GLD_INC xv, , 0x20, A0, PA0, 0, \ + A2, PA1, 0, \ + A4, PA2, 0, \ + A6, PA3, 0, \ + A8, PA4, 0, \ + A10, PA5, 0, \ + A12, PA6, 0, \ + A14, PA7, 0 + + GMADD xvf, d, Y0, A0, X0, Y0, \ + Y0, A2, X1, Y0, \ + Y0, A4, X2, Y0, \ + Y0, A6, X3, Y0, \ + Y0, A8, X4, Y0, \ + Y0, A10, X5, Y0, \ + Y0, A12, X6, Y0, \ + Y0, A14, X7, Y0 +.endm + +.macro DGEMV_N_1x8 + GLD_INC f, d, 0x08, $f12, PA0, 0, $f14, PA1, 0, $f16, PA2, 0, $f18, PA3, 0, \ + $f20, PA4, 0, $f22, PA5, 0, $f24, PA6, 0, $f26, PA7, 0 + GMADD f, d, $f10, $f12, $f2, $f10, \ + $f10, $f14, $f3, $f10, \ + $f10, $f16, $f4, $f10, \ + $f10, $f18, $f5, $f10, \ + $f10, $f20, $f6, $f10, \ + $f10, $f22, $f7, $f10, \ + $f10, $f24, $f8, $f10, \ + $f10, $f26, $f9, $f10, +.endm + +.macro DGEMV_N_8x4 + GLD_INC xv, , 0x20, \ + A0, PA0, 0, A1, PA0, 0, \ + A2, PA1, 0, A3, PA1, 0, \ + A4, PA2, 0, A5, PA2, 0, \ + A6, PA3, 0, A7, PA3, 0 + + GMADD xvf, d, Y0, A0, X0, Y0, Y1, A1, X0, Y1, \ + Y0, A2, X1, Y0, Y1, A3, X1, Y1, \ + Y0, A4, X2, Y0, Y1, A5, X2, Y1, \ + Y0, A6, X3, Y0, Y1, A7, X3, Y1 +.endm + +.macro DGEMV_N_4x4 + GLD_INC xv, , 0x20, A0, PA0, 0, A2, PA1, 0, A4, PA2, 0, A6, PA3, 0 + + GMADD xvf, d, Y0, A0, X0, Y0, Y0, A2, X1, Y0, \ + Y0, A4, X2, Y0, Y0, A6, X3, Y0 +.endm + +.macro DGEMV_N_1x4 + GLD_INC f, d, 0x08, $f12, PA0, 0, $f14, PA1, 0, $f16, PA2, 0, 
$f18, PA3, 0 + GMADD f, d, $f10, $f12, $f2, $f10, $f10, $f14, $f3, $f10, \ + $f10, $f16, $f4, $f10, $f10, $f18, $f5, $f10 +.endm + +.macro DGEMV_N_8x2 + GLD_INC xv, , 0x20, \ + A0, PA0, 0, A1, PA0, 0, \ + A2, PA1, 0, A3, PA1, 0 + GMADD xvf, d, Y0, A0, X0, Y0, Y1, A1, X0, Y1, \ + Y0, A2, X1, Y0, Y1, A3, X1, Y1 +.endm + +.macro DGEMV_N_4x2 + GLD_INC xv, , 0x20, A0, PA0, 0, A2, PA1, 0 + GMADD xvf, d, Y0, A0, X0, Y0, \ + Y0, A2, X1, Y0 +.endm + +.macro DGEMV_N_1x2 + GLD_INC f, d, 0x08, $f12, PA0, 0, $f14, PA1, 0 + GMADD f, d, $f10, $f12, $f2, $f10, \ + $f10, $f14, $f3, $f10 +.endm + +.macro DGEMV_N_1x1 + fld.d $f12, PA0, 0 + PTR_ADDI PA0, PA0, 0x08 + fmadd.d $f10, $f12, $f2, $f10 +.endm + +.macro DGEMV_N XW:req, X_8:req, X_4:req, X_2:req, X_1:req, Y_8:req, Y_4:req, Y_1:req + PTR_SRLI J, N, 3 + beqz J, .L_\XW\()_N_7 + PTR_SLLI K_LDA, LDA, 3 + PTR_SUB K_LDA, K_LDA, M8 +.L_\XW\()_N_L8: + DLOAD_\X_8 + xor K, K, K + move Y, Y_ORG + PTR_SRLI I, M, 3 + beqz I, .L_\XW\()_M_7 +.align 5 +.L_\XW\()_M_L8: + DLOAD_\Y_8 + DGEMV_N_8x8 + DSTORE_\Y_8 + PTR_ADDI I, I, -1 + PTR_ALSL Y, INC_Y, Y, 3 + PTR_ADDI K, K, 8 + bnez I, .L_\XW\()_M_L8 +.L_\XW\()_M_7: + andi I, M, 4 + beqz I, .L_\XW\()_M_3 + DLOAD_\Y_4 + DGEMV_N_4x8 + DSTORE_\Y_4 + PTR_ALSL Y, INC_Y, Y, 2 + PTR_ADDI K, K, 4 +.L_\XW\()_M_3: + andi I, M, 3 + beqz I, .L_\XW\()_M_END +.align 5 +.L_\XW\()_M_L1: + DLOAD_\Y_1 + DGEMV_N_1x8 + DSTORE_\Y_1 + PTR_ADDI I, I, -1 + PTR_ADD Y, Y, INC_Y + PTR_ADDI K, K, 1 + bnez I, .L_\XW\()_M_L1 +.L_\XW\()_M_END: + PTR_ADDI J, J, -1 +#if __loongarch_grlen == 64 + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ + PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA +#else + GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ + PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA +#endif + PTR_ALSL X, INC_X, X, 3 + bnez J, .L_\XW\()_N_L8 +.L_\XW\()_N_7: + andi J, N, 4 + beqz J, .L_\XW\()_N_3 + DLOAD_\X_4 + xor K, K, K + move Y, Y_ORG + + PTR_SRLI I, M, 3 + beqz I, .L_\XW\()_N_4_M_7 +.align 5 +.L_\XW\()_N_4_M_L8: + DLOAD_\Y_8 + DGEMV_N_8x4 + DSTORE_\Y_8 + PTR_ADDI I, I, -1 + PTR_ADDI K, K, 8 + PTR_ALSL Y, INC_Y, Y, 3 + bnez I, .L_\XW\()_N_4_M_L8 +.L_\XW\()_N_4_M_7: + andi I, M, 4 + beqz I, .L_\XW\()_N_4_M_3 + DLOAD_\Y_4 + DGEMV_N_4x4 + DSTORE_\Y_4 + PTR_ALSL Y, INC_Y, Y, 2 + PTR_ADDI K, K, 4 +.L_\XW\()_N_4_M_3: + andi I, M, 3 + beqz I, .L_\XW\()_N_4_M_END +.align 5 +.L_\XW\()_N_4_M_L1: + DLOAD_\Y_1 + DGEMV_N_1x4 + DSTORE_\Y_1 + PTR_ADDI I, I, -1 + PTR_ADD Y, Y, INC_Y + PTR_ADDI K, K, 1 + bnez I, .L_\XW\()_N_4_M_L1 +.L_\XW\()_N_4_M_END: + PTR_SLLI K_LDA, LDA, 2 + PTR_SUB K_LDA, K_LDA, M8 +#if __loongarch_grlen == 64 + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA +#else + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA +#endif + PTR_ALSL X, INC_X, X, 2 +.L_\XW\()_N_3: + andi J, N, 2 + beqz J, .L_\XW\()_N_1 + DLOAD_\X_2 + xor K, K, K + move Y, Y_ORG + PTR_SRLI I, M, 3 + beqz I, .L_\XW\()_N_2_M_7 +.align 5 +.L_\XW\()_N_2_M_L8: + DLOAD_\Y_8 + DGEMV_N_8x2 + DSTORE_\Y_8 + PTR_ADDI I, I, -1 + PTR_ADDI K, K, 8 + PTR_ALSL Y, INC_Y, Y, 3 + bnez I, .L_\XW\()_N_2_M_L8 +.L_\XW\()_N_2_M_7: + andi I, M, 4 + beqz I, .L_\XW\()_N_2_M_3 + DLOAD_\Y_4 + DGEMV_N_4x2 + DSTORE_\Y_4 + PTR_ALSL Y, INC_Y, Y, 2 + PTR_ADDI K, K, 4 +.L_\XW\()_N_2_M_3: + andi I, M, 3 + beqz I, .L_\XW\()_N_2_M_END +.align 5 +.L_\XW\()_N_2_M_L1: + DLOAD_\Y_1 + DGEMV_N_1x2 + DSTORE_\Y_1 + PTR_ADDI I, I, -1 + PTR_ADD Y, Y, INC_Y + 
PTR_ADDI K, K, 1 + bnez I, .L_\XW\()_N_2_M_L1 +.L_\XW\()_N_2_M_END: + PTR_SLLI K_LDA, LDA, 1 + PTR_SUB K_LDA, K_LDA, M8 + PTR_ADD PA0, PA0, K_LDA + PTR_ADD PA1, PA1, K_LDA + PTR_ALSL X, INC_X, X, 1 +.L_\XW\()_N_1: + andi J, N, 1 + beqz J, .L_END + DLOAD_\X_1 + xor K, K, K + move Y, Y_ORG + move I, M + beqz I, .L_END +.align 5 +.L_\XW\()_N_1_M_L1: + DLOAD_\Y_1 + DGEMV_N_1x1 + DSTORE_\Y_1 + PTR_ADDI I, I, -1 + PTR_ADD Y, Y, INC_Y + PTR_ADDI K, K, 1 + bnez I, .L_\XW\()_N_1_M_L1 + b .L_END +.endm + + PROLOGUE + PTR_LD INC_Y, $sp, 0 + push_if_used 17 + 7, 24 + 4 + PTR_ADDI K, $r0, 0x01 + PTR_SUB I, INC_X, K + PTR_SUB J, INC_Y, K + maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */ + maskeqz J, K, J /* if(inc_y == 1) j = 0; else j = 1; */ + PTR_ALSL I, I, J, 1 + GSLLI , d, LDA, LDA, 3, INC_X, INC_X, 3, INC_Y, INC_Y, 3, M8, M, 3 + xvreplve0.d VALPHA, $xr0 + move Y_ORG, Y + move PA0, A +#if __loongarch_grlen == 64 + GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ + PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA +#else + GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ + PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA +#endif + la.local T0, .L_GAP_TABLE + PTR_ALSL I, I, T0, 1 + ld.h K, I, 0 + PTR_ADD T0, T0, K + jirl $r0, T0, 0 +.L_GAP_TABLE: + .hword .L_GAP_0_0 - .L_GAP_TABLE + .hword .L_GAP_0_1 - .L_GAP_TABLE + .hword .L_GAP_1_0 - .L_GAP_TABLE + .hword .L_GAP_1_1 - .L_GAP_TABLE +.L_GAP_0_0: /* if (inc_x == 1) && (incy == 1) */ + DGEMV_N GAP_0_0, X_8, X_4, X_2, X_1, Y_8, Y_4, Y_1 +.L_GAP_0_1: /* if (inc_x == 1) && (incy != 1) */ + DGEMV_N GAP_0_1, X_8, X_4, X_2, X_1, Y_8_GAP, Y_4_GAP, Y_1 +.L_GAP_1_0: /* if (inc_x != 1) && (incy == 1) */ + DGEMV_N GAP_1_0, X_8_GAP, X_4_GAP, X_2_GAP, X_1, Y_8, Y_4, Y_1 +.L_GAP_1_1: /* if (inc_x != 1) && (incy != 1) */ + DGEMV_N GAP_1_1, X_8_GAP, X_4_GAP, X_2_GAP, X_1, Y_8_GAP, Y_4_GAP, Y_1 +.L_END: + pop_if_used 17 + 7, 24 + 4 + jirl $r0, $r1, 0x0 + EPILOGUE diff --git a/kernel/loongarch64/dgemv_t_8_lasx.S b/kernel/loongarch64/dgemv_t_8_lasx.S new file mode 100644 index 000000000..be90cb1af --- /dev/null +++ b/kernel/loongarch64/dgemv_t_8_lasx.S @@ -0,0 +1,468 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" + +/********************************************************************* +* 2023/07/17 guxiwei +* UTEST : OK +* CTEST : OK +* TEST : OK +* +* +*********************************************************************/ + +/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, + * FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) + */ +#define M $r4 +#define N $r5 +#define ALPHA $f0 +#define A $r7 +#define LDA $r8 +#define X $r9 +#define INC_X $r10 +#define Y $r11 +#define INC_Y $r6 + +#define J $r12 +#define I $r13 +#define K $r14 +#define PY0 $r14 +#define X_ORG $r15 +#define PY1 $r16 +#define K_LDA $r17 +#define PY2 $r18 +#define T0 $r19 +#define PA0 $r20 +#define PA1 $r23 +#define PA2 $r24 +#define PA3 $r25 +#define PA4 $r26 +#define PA5 $r27 +#define PA6 $r28 +#define PA7 $r29 +#define M8 $r30 + +#define VALPHA $xr0 +#define X0 $xr1 +#define X1 $xr2 +#define A0 $xr3 +#define A1 $xr4 +#define A2 $xr5 +#define A3 $xr6 +#define A4 $xr7 +#define A5 $xr8 +#define A6 $xr9 +#define A7 $xr10 +#define A8 $xr11 +#define A9 $xr12 +#define A10 $xr13 +#define A11 $xr14 +#define A12 $xr15 +#define A13 $xr16 +#define A14 $xr17 +#define A15 $xr18 +#define TP0 $xr19 +#define TP1 $xr20 +#define TP2 $xr21 +#define TP3 $xr22 +#define TP4 $xr23 +#define TP5 $xr24 +#define TP6 $xr25 +#define TP7 $xr26 +#define Y0 $xr3 +#define Y1 $xr4 +#define Y2 $xr5 +#define Y3 $xr6 +#define Y4 $xr7 +#define Y5 $xr8 +#define Y6 $xr9 +#define Y7 $xr10 + +.macro ZERO_Y8 + GXOR xv, v, TP0, TP0, TP0, TP1, TP1, TP1, TP2, TP2, TP2, TP3, TP3, TP3, \ + TP4, TP4, TP4, TP5, TP5, TP5, TP6, TP6, TP6, TP7, TP7, TP7 +.endm + +.macro ZERO_Y4 + GXOR xv, v, TP0, TP0, TP0, TP1, TP1, TP1, TP2, TP2, TP2, TP3, TP3, TP3 +.endm + +.macro ZERO_Y2 + GXOR xv, v, TP0, TP0, TP0, TP1, TP1, TP1 +.endm + +.macro ZERO_Y1 + GXOR xv, v, TP0, TP0, TP0 +.endm + +.macro DLOAD_X8 + GLD xv, , X0, X, 0x00, X1, X, 0x20 +.endm + +.macro DLOAD_X4 + GLD xv, , X0, X, 0x00 +.endm + +.macro DLOAD_X8_GAP + fld.d $f1, X, 0x00 + fldx.d $f2, X, INC_X + PTR_ALSL T0, INC_X, X, 1 + fld.d $f3, T0, 0x00 + fldx.d $f4, T0, INC_X + GINSVE0 xv, d, X0, X1, 1, X0, A0, 2, X0, A1, 3 + PTR_ALSL T0, INC_X, X, 2 + fld.d $f2, T0, 0x00 + fldx.d $f3, T0, INC_X + PTR_ALSL T0, INC_X, T0, 1 + fld.d $f4, T0, 0x00 + fldx.d $f5, T0, INC_X + GINSVE0 xv, d, X1, A0, 1, X1, A1, 2, X1, A2, 3 +.endm + +.macro DLOAD_X4_GAP + fld.d $f1, X, 0x00 + fldx.d $f2, X, INC_X + PTR_ALSL T0, INC_X, X, 1 + fld.d $f3, T0, 0x00 + fldx.d $f4, T0, INC_X + GINSVE0 xv, d, X0, X1, 1, X0, A0, 2, X0, A1, 3 +.endm + +.macro DGEMV_T_8x8 + GLD_INC xv, , 0x20, \ + A0, PA0, 0, A1, PA0, 0, \ + A2, PA1, 0, A3, PA1, 0, \ + A4, PA2, 0, A5, PA2, 0, \ + A6, PA3, 0, A7, PA3, 0, \ + A8, PA4, 0, A9, PA4, 0, \ + A10, PA5, 0, A11, PA5, 0, \ + A12, PA6, 0, A13, PA6, 0, \ + A14, PA7, 0, A15, PA7, 0 + + GMADD xvf, d, TP0, A0, X0, 
TP0, TP0, A1, X1, TP0, \ + TP1, A2, X0, TP1, TP1, A3, X1, TP1, \ + TP2, A4, X0, TP2, TP2, A5, X1, TP2, \ + TP3, A6, X0, TP3, TP3, A7, X1, TP3, \ + TP4, A8, X0, TP4, TP4, A9, X1, TP4, \ + TP5, A10, X0, TP5, TP5, A11, X1, TP5, \ + TP6, A12, X0, TP6, TP6, A13, X1, TP6, \ + TP7, A14, X0, TP7, TP7, A15, X1, TP7 +.endm + +.macro DGEMV_T_8x4 + GLD_INC xv, , 0x20, A0, PA0, 0, A2, PA1, 0, A4, PA2, 0, A6, PA3, 0, \ + A8, PA4, 0, A10, PA5, 0, A12, PA6, 0, A14, PA7, 0 + + GMADD xvf, d, TP0, A0, X0, TP0, TP1, A2, X0, TP1, \ + TP2, A4, X0, TP2, TP3, A6, X0, TP3, \ + TP4, A8, X0, TP4, TP5, A10, X0, TP5, \ + TP6, A12, X0, TP6, TP7, A14, X0, TP7, +.endm + +.macro DGEMV_T_4x8 + GLD_INC xv, , 0x20, \ + A0, PA0, 0, A1, PA0, 0, \ + A2, PA1, 0, A3, PA1, 0, \ + A4, PA2, 0, A5, PA2, 0, \ + A6, PA3, 0, A7, PA3, 0 + + GMADD xvf, d, TP0, A0, X0, TP0, TP0, A1, X1, TP0, \ + TP1, A2, X0, TP1, TP1, A3, X1, TP1, \ + TP2, A4, X0, TP2, TP2, A5, X1, TP2, \ + TP3, A6, X0, TP3, TP3, A7, X1, TP3 +.endm + +.macro DGEMV_T_4x4 + GLD_INC xv, , 0x20, A0, PA0, 0, A2, PA1, 0, A4, PA2, 0, A6, PA3, 0 + + GMADD xvf, d, TP0, A0, X0, TP0, TP1, A2, X0, TP1, \ + TP2, A4, X0, TP2, TP3, A6, X0, TP3 +.endm + +.macro DGEMV_T_2x8 + GLD_INC xv, , 0x20, A0, PA0, 0, A1, PA0, 0, A2, PA1, 0, A3, PA1, 0 + + GMADD xvf, d, TP0, A0, X0, TP0, TP0, A1, X1, TP0, \ + TP1, A2, X0, TP1, TP1, A3, X1, TP1 +.endm + +.macro DGEMV_T_2x4 + GLD_INC xv, , 0x20, A0, PA0, 0, A2, PA1, 0 + + GMADD xvf, d, TP0, A0, X0, TP0, TP1, A2, X0, TP1 +.endm + +.macro DGEMV_T XW:req X8:req, X4:req + PTR_SRLI J, N, 3 + beqz J, .L_\XW\()_N_7 + PTR_SLLI K_LDA, LDA, 3 + PTR_SUB K_LDA, K_LDA, M8 +.L_\XW\()_N_L8: + ZERO_Y8 + move X, X_ORG + PTR_SRLI I, M, 3 + beqz I, .L_\XW\()_M_7 +.align 5 +.L_\XW\()_M_L8: + DLOAD_\X8 + DGEMV_T_8x8 + PTR_ADDI I, I, -1 + PTR_ALSL X, INC_X, X, 3 + bnez I, .L_\XW\()_M_L8 +.L_\XW\()_M_7: + andi I, M, 4 + beqz I, .L_\XW\()_M_3 + DLOAD_\X4 + DGEMV_T_8x4 + PTR_ALSL X, INC_X, X, 2 +.L_\XW\()_M_3: + // Accumulated + GACC xvf, d, Y0, TP0, Y1, TP1, Y2, TP2, Y3, TP3, Y4, TP4, \ + Y5, TP5, Y6, TP6, Y7, TP7 + andi I, M, 3 + beqz I, .L_\XW\()_M_END +.align 5 +.L_\XW\()_M_L1: + fld.d $f1, X, 0x00 + fld.d $f11, PA0, 0x00 + fld.d $f12, PA1, 0x00 + fld.d $f13, PA2, 0x00 + fld.d $f14, PA3, 0x00 + fld.d $f15, PA4, 0x00 + fld.d $f16, PA5, 0x00 + fld.d $f17, PA6, 0x00 + fld.d $f18, PA7, 0x00 +#if __loongarch_grlen == 64 + GADDI , d, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08, \ + PA4, PA4, 0x08, PA5, PA5, 0x08, PA6, PA6, 0x08, PA7, PA7, 0x08 +#else + GADDI , w, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08, \ + PA4, PA4, 0x08, PA5, PA5, 0x08, PA6, PA6, 0x08, PA7, PA7, 0x08 +#endif + GMADD f, d, $f3, $f11, $f1, $f3, $f4, $f12, $f1, $f4, $f5, $f13, $f1, $f5, $f6, $f14, $f1, $f6, \ + $f7, $f15, $f1, $f7, $f8, $f16, $f1, $f8, $f9, $f17, $f1, $f9, $f10, $f18, $f1, $f10 + PTR_ADDI I, I, -1 + PTR_ADD X, X, INC_X + bnez I, .L_\XW\()_M_L1 +.L_\XW\()_M_END: + fld.d $f11, Y, 0x00 + fldx.d $f12, Y, INC_Y + PTR_ALSL PY0, INC_Y, Y, 1 + fld.d $f13, PY0, 0x00 + fldx.d $f14, PY0, INC_Y + PTR_ALSL PY1, INC_Y, Y, 2 + fld.d $f15, PY1, 0x00 + fldx.d $f16, PY1, INC_Y + PTR_ALSL PY2, INC_Y, PY1, 1 + fld.d $f17, PY2, 0x00 + fldx.d $f18, PY2, INC_Y + + GMADD f, d, $f11, ALPHA, $f3, $f11, $f12, ALPHA, $f4, $f12, $f13, ALPHA, $f5, $f13, $f14, ALPHA, $f6, $f14, \ + $f15, ALPHA, $f7, $f15, $f16, ALPHA, $f8, $f16, $f17, ALPHA, $f9, $f17, $f18, ALPHA, $f10, $f18 + + PTR_ADDI J, J, -1 +#if __loongarch_grlen == 64 + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, 
K_LDA, PA3, PA3, K_LDA, \ + PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA +#else + GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \ + PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA +#endif + fst.d $f11, Y, 0x00 + fstx.d $f12, Y, INC_Y + fst.d $f13, PY0, 0x00 + fstx.d $f14, PY0, INC_Y + fst.d $f15, PY1, 0x00 + fstx.d $f16, PY1, INC_Y + fst.d $f17, PY2, 0x00 + fstx.d $f18, PY2, INC_Y + PTR_ALSL Y, INC_Y, Y, 3 + bnez J, .L_\XW\()_N_L8 +.L_\XW\()_N_7: + andi J, N, 4 + beqz J, .L_\XW\()_N_3 + ZERO_Y4 + move X, X_ORG + PTR_SRLI I, M, 3 + beqz I, .L_\XW\()_N_4_M_7 +.align 5 +.L_\XW\()_N_4_M_L8: + DLOAD_\X8 + DGEMV_T_4x8 + PTR_ADDI I, I, -1 + PTR_ALSL X, INC_X, X, 3 + bnez I, .L_\XW\()_N_4_M_L8 +.L_\XW\()_N_4_M_7: + andi I, M, 4 + beqz I, .L_\XW\()_N_4_M_3 + DLOAD_\X4 + DGEMV_T_4x4 + PTR_ALSL X, INC_X, X, 2 +.L_\XW\()_N_4_M_3: + // Accumulated + GACC xvf, d, Y0, TP0, Y1, TP1, Y2, TP2, Y3, TP3 + andi I, M, 3 + beqz I, .L_\XW\()_N_4_M_END +.align 5 +.L_\XW\()_N_4_M_L1: + fld.d $f1, X, 0x00 + GLD_INC f, d, 0x08, $f11, PA0, 0x00, $f12, PA1, 0x00, $f13, PA2, 0x00, $f14, PA3, 0x00 + GMADD f, d, $f3, $f11, $f1, $f3, $f4, $f12, $f1, $f4, $f5, $f13, $f1, $f5, $f6, $f14, $f1, $f6 + PTR_ADDI I, I, -1 + PTR_ADD X, X, INC_X + bnez I, .L_\XW\()_N_4_M_L1 +.L_\XW\()_N_4_M_END: + fld.d $f11, Y, 0x00 + fldx.d $f12, Y, INC_Y + PTR_ALSL PY0, INC_Y, Y, 1 + fld.d $f13, PY0, 0x00 + fldx.d $f14, PY0, INC_Y + + GMADD f, d, $f11, ALPHA, $f3, $f11, $f12, ALPHA, $f4, $f12, $f13, ALPHA, $f5, $f13, $f14, ALPHA, $f6, $f14 + + PTR_SLLI K_LDA, LDA, 2 + PTR_SUB K_LDA, K_LDA, M8 + +#if __loongarch_grlen == 64 + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA +#else + GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA +#endif + fst.d $f11, Y, 0x00 + fstx.d $f12, Y, INC_Y + fst.d $f13, PY0, 0x00 + fstx.d $f14, PY0, INC_Y + PTR_ALSL Y, INC_Y, Y, 2 +.L_\XW\()_N_3: + andi J, N, 2 + beqz J, .L_\XW\()_N_1 + ZERO_Y2 + move X, X_ORG + PTR_SRLI I, M, 3 + beqz I, .L_\XW\()_N_2_M_7 +.align 5 +.L_\XW\()_N_2_M_L8: + DLOAD_\X8 + DGEMV_T_2x8 + PTR_ADDI I, I, -1 + PTR_ALSL X, INC_X, X, 3 + bnez I, .L_\XW\()_N_2_M_L8 +.L_\XW\()_N_2_M_7: + andi I, M, 4 + beqz I, .L_\XW\()_N_2_M_3 + DLOAD_\X4 + DGEMV_T_2x4 + PTR_ALSL X, INC_X, X, 2 +.L_\XW\()_N_2_M_3: + // Accumulated + GACC xvf, d, Y0, TP0, Y1, TP1 + andi I, M, 3 + beqz I, .L_\XW\()_N_2_M_END +.align 5 +.L_\XW\()_N_2_M_L1: + fld.d $f1, X, 0x00 + GLD_INC f, d, 0x08, $f11, PA0, 0x00, $f12, PA1, 0x00 + GMADD f, d, $f3, $f11, $f1, $f3, $f4, $f12, $f1, $f4 + PTR_ADDI I, I, -1 + PTR_ADD X, X, INC_X + bnez I, .L_\XW\()_N_2_M_L1 +.L_\XW\()_N_2_M_END: + fld.d $f11, Y, 0x00 + fldx.d $f12, Y, INC_Y + + GMADD f, d, $f11, ALPHA, $f3, $f11, $f12, ALPHA, $f4, $f12 + + PTR_SLLI K_LDA, LDA, 1 + PTR_SUB K_LDA, K_LDA, M8 + +#if __loongarch_grlen == 64 + GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA +#else + GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA +#endif + fst.d $f11, Y, 0x00 + fstx.d $f12, Y, INC_Y + PTR_ALSL Y, INC_Y, Y, 1 +.L_\XW\()_N_1: + andi J, N, 1 + beqz J, .L_END + ZERO_Y1 + move X, X_ORG + move I, M + beqz I, .L_END +.align 5 +.L_\XW\()_N_1_M_L1: + fld.d $f3, PA0, 0x00 + fld.d $f1, X, 0x00 + fmadd.d $f19, $f3, $f1, $f19 + PTR_ADDI I, I, -1 + PTR_ADD X, X, INC_X + PTR_ADDI PA0, PA0, 0x08 + bnez I, .L_\XW\()_N_1_M_L1 + fld.d $f3, Y, 0x00 + fmadd.d $f3, ALPHA, $f19, $f3 + fst.d $f3, Y, 0x00 + b .L_END +.endm + + PROLOGUE + PTR_LD INC_Y, $sp, 0 + push_if_used 17 + 8, 24 + 3 + PTR_ADDI K, 
$r0, 0x01 + PTR_SUB I, INC_X, K + maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */ + GSLLI , d, LDA, LDA, 3, INC_X, INC_X, 3, INC_Y, INC_Y, 3, M8, M, 3 + xvreplve0.d VALPHA, $xr0 + move X_ORG, X + move PA0, A +#if __loongarch_grlen == 64 + GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ + PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA +#else + GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \ + PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA +#endif + la.local T0, .L_GAP_TABLE + PTR_ALSL I, I, T0, 1 + ld.h K, I, 0 + PTR_ADD T0, T0, K + jirl $r0, T0, 0 +.L_GAP_TABLE: + .hword .L_GAP_0 - .L_GAP_TABLE + .hword .L_GAP_1 - .L_GAP_TABLE +.L_GAP_0: /* if (incx == 1) */ + DGEMV_T GAP_0, X8, X4 +.L_GAP_1: /* if (incx != 1) */ + DGEMV_T GAP_1, X8_GAP, X4_GAP +.L_END: + pop_if_used 17 + 8, 24 + 3 + jirl $r0, $r1, 0x0 + EPILOGUE diff --git a/kernel/loongarch64/dnrm2.S b/kernel/loongarch64/dnrm2.S index ff937ae53..2160b93a6 100644 --- a/kernel/loongarch64/dnrm2.S +++ b/kernel/loongarch64/dnrm2.S @@ -70,7 +70,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. MTC s1, $r0 bge $r0, N, .L999 slli.d INCX, INCX, BASE_SHIFT - bge $r0, INCX, .L999 + beq $r0, INCX, .L999 move XX, X NOP LD a1, X, 0 * SIZE diff --git a/kernel/loongarch64/loongarch64_asm.S b/kernel/loongarch64/loongarch64_asm.S new file mode 100644 index 000000000..89243c620 --- /dev/null +++ b/kernel/loongarch64/loongarch64_asm.S @@ -0,0 +1,407 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#if __loongarch_grlen == 64 +#define LA_REG int64_t +#define REG_SIZE 8 +#define REG_LOG 3 +#define PTR_ADDI addi.d +#define PTR_ADD add.d +#define PTR_SUB sub.d +#define PTR_LD ld.d +#define PTR_ST st.d +#define PTR_SLLI slli.d +#define PTR_SRLI srli.d +#define PTR_SRAI srai.d +#define PTR_MUL mul.d +#define PTR_ALSL alsl.d +#else +#define LA_REG int32_t +#define REG_SIZE 4 +#define REG_LOG 2 +#define PTR_ADDI addi.w +#define PTR_ADD add.w +#define PTR_SUB sub.w +#define PTR_LD ld.w +#define PTR_ST st.w +#define PTR_SLLI slli.w +#define PTR_SRLI srli.w +#define PTR_SRAI srai.w +#define PTR_MUL mul.w +#define PTR_ALSL alsl.w +#endif + +#if __loongarch_frlen == 64 +#define FREG_SIZE 8 +#define FREG_LOG 3 +#define PTR_FLD fld.d +#define PTR_FST fst.d +#else +#define FREG_SIZE 4 +#define FREG_LOG 2 +#define PTR_FLD fld.s +#define PTR_FST fst.s +#endif + +// The max registers available to the user which +// do not need to be preserved across calls. +// Ref: https://loongson.github.io/LoongArch-Documentation/LoongArch-ELF-ABI-CN.html +#define MAX_INT_CALLER_SAVED 17 +#define MAX_FP_CALLER_SAVED 24 + +.altmacro // Enable alternate macro mode + +.macro push_if_used regs, fregs +.if \regs > MAX_INT_CALLER_SAVED + PTR_ADDI $sp, $sp, -((\regs - MAX_INT_CALLER_SAVED) << REG_LOG) + push_regs 0, \regs - MAX_INT_CALLER_SAVED - 1 +.endif +.if \fregs > MAX_FP_CALLER_SAVED + PTR_ADDI $sp, $sp, -((\fregs - MAX_FP_CALLER_SAVED) << FREG_LOG) + push_fregs 0, \fregs - MAX_FP_CALLER_SAVED - 1 +.endif +.endm // End push_if_used +.macro pop_if_used regs, fregs +.if \fregs > MAX_FP_CALLER_SAVED + pop_fregs 0, \fregs - MAX_FP_CALLER_SAVED - 1 + PTR_ADDI $sp, $sp, (\fregs - MAX_FP_CALLER_SAVED) << FREG_LOG +.endif +.if \regs > MAX_INT_CALLER_SAVED + pop_regs 0, \regs - MAX_INT_CALLER_SAVED - 1 + PTR_ADDI $sp, $sp, (\regs - MAX_INT_CALLER_SAVED) << REG_LOG +.endif +.endm // End pop_if_used +.macro push_regs from, to + PTR_ST $s\()\from, $sp, \from << REG_LOG +.if \to - \from + push_regs %from + 1, \to +.endif +.endm // End push_regs +.macro pop_regs from, to + PTR_LD $s\()\from, $sp, \from << REG_LOG +.if \to - \from + pop_regs %from + 1, \to +.endif +.endm // End pop_regs +.macro push_fregs from, to + PTR_FST $fs\()\from, $sp, \from << FREG_LOG +.if \to - \from + push_fregs %from + 1, \to +.endif +.endm // End push_fregs +.macro pop_fregs from, to + PTR_FLD $fs\()\from, $sp, \from << FREG_LOG +.if \to - \from + pop_fregs %from + 1, \to +.endif +.endm // End pop_fregs + +// +// Instruction Related Macros +// +// GLD +// +.macro GLD pre_op:req, suf_op=0, out:req, src:req, offset:req/* imm */, more:vararg +.ifeqs "\suf_op", "0" + \pre_op\()ld \out, \src, \offset +.else + \pre_op\()ld.\suf_op \out, \src, \offset +.endif +.ifnb \more + GLD \pre_op, \suf_op, \more +.endif +.endm + +// +// GLD_INC +// +.macro GLD_INC pre_op:req, suf_op=0, inc:req, out:req, src:req, offset:req/* imm */, more:vararg +.ifeqs "\suf_op", "0" + \pre_op\()ld \out, \src, \offset +.else + \pre_op\()ld.\suf_op \out, \src, \offset +.endif + PTR_ADDI \src, \src, \inc +.ifnb \more + GLD_INC \pre_op, \suf_op, \inc, \more +.endif +.endm +// +// GLDX is same as GLD except the stride is a register +// +.macro GLDX pre_op:req, suf_op=0, out:req, src:req, offset:req/* reg */, more:vararg +.ifeqs "\suf_op", "0" + \pre_op\()ldx \out, \src, \offset +.else + \pre_op\()ldx.\suf_op \out, \src, \offset +.endif +.ifnb \more + GLDX \pre_op, \suf_op, \more +.endif +.endm +// +// 
GLDREPL +// +.macro GLDREPL pre_op:req, suf_op:req, out:req, src:req, offset:req/* imm */, more:vararg + \pre_op\()ldrepl.\suf_op \out, \src, \offset +.ifnb \more + GLDREPL \pre_op, \suf_op, \more +.endif +.endm +// +// GST +// +.macro GST pre_op:req, suf_op=0, src:req, dst:req, offset:req/* imm */, more:vararg +.ifeqs "\suf_op", "0" + \pre_op\()st \src, \dst, \offset +.else + \pre_op\()st.\suf_op \src, \dst, \offset +.endif +.ifnb \more + GST \pre_op, \suf_op, \more +.endif +.endm +// +// GMUL +// +.macro GMUL pre_op, suf_op:req, out:req, in0:req, in1:req, more:vararg + \pre_op\()mul.\suf_op \out, \in0, \in1 +.ifnb \more + GMUL \pre_op, \suf_op, \more +.endif +.endm +// +// GMADD +// +.macro GMADD pre_op, suf_op:req, out:req, in0:req, in1:req, in2:req, more:vararg + \pre_op\()madd.\suf_op \out, \in0, \in1, \in2 +.ifnb \more + GMADD \pre_op, \suf_op, \more +.endif +.endm +// +// GADD +// +.macro GADD pre_op, suf_op:req, out:req, in0:req, in1:req, more:vararg + \pre_op\()add.\suf_op \out, \in0, \in1 +.ifnb \more + GADD \pre_op, \suf_op, \more +.endif +.endm +// +// GADDI +// +.macro GADDI pre_op, suf_op:req, out:req, in0:req, in1:req, more:vararg + \pre_op\()addi.\suf_op \out, \in0, \in1 +.ifnb \more + GADDI \pre_op, \suf_op, \more +.endif +.endm +// +// GSUB +// +.macro GSUB pre_op, suf_op:req, out:req, in0:req, in1:req, more:vararg + \pre_op\()sub.\suf_op \out, \in0, \in1 +.ifnb \more + GSUB \pre_op, \suf_op, \more +.endif +.endm +// +// GSLLI +// +.macro GSLLI pre_op, suf_op:req, out:req, in0:req, in1:req, more:vararg + \pre_op\()slli.\suf_op \out, \in0, \in1 +.ifnb \more + GSLLI \pre_op, \suf_op, \more +.endif +.endm +// +// GINSVE0 +// +.macro GINSVE0 pre_op:req, suf_op:req, out:req, in0:req, in1:req, more:vararg + \pre_op\()insve0.\suf_op \out, \in0, \in1 +.ifnb \more + GINSVE0 \pre_op, \suf_op, \more +.endif +.endm +// +// GXOR +// +.macro GXOR pre_op:req, suf_op:req, out:req, in0:req, in1:req, more:vararg + \pre_op\()xor.\suf_op \out, \in0, \in1 +.ifnb \more + GXOR \pre_op, \suf_op, \more +.endif +.endm +// +// GPERMI +// +.macro GPERMI pre_op:req, suf_op:req, out:req, in0:req, in1:req, more:vararg + \pre_op\()permi.\suf_op \out, \in0, \in1 +.ifnb \more + GPERMI \pre_op, \suf_op, \more +.endif +.endm +// +// GNMSUB +// +.macro GNMSUB pre_op:req, suf_op:req, out:req, in0:req, in1:req, in2:req, more:vararg + \pre_op\()nmsub.\suf_op \out, \in0, \in1, \in2 +.ifnb \more + GNMSUB \pre_op, \suf_op, \more +.endif +.endm +// +// GPRELD +// +.macro GPRELD in0:req, in1:req, in2:req, more:vararg + preld \in0, \in1, \in2 +.ifnb \more + GPRELD \more +.endif +.endm + +// +// Compound instructions +// +// GACC: Accumulate the values of vector registers +// +.macro GACC pre_op:req, suf_op:req, out:req, in:req, more:vararg +.ifeqs "\pre_op", "xvf" + xvpermi.q \out, \in, 0x01 + \pre_op\()add.\suf_op \in, \out, \in + xvpackod.d \out, \in, \in + \pre_op\()add.\suf_op \out, \out, \in +.ifeqs "\suf_op", "s" + xvpackod.w \in, \out, \out + \pre_op\()add.\suf_op \out, \out, \in +.endif +.endif + +.ifeqs "\pre_op", "vf" + vpackod.d \out, \in, \in + \pre_op\()add.\suf_op \out, \out, \in +.ifeqs "\suf_op", "s" + vpackod.w \in, \out, \out + \pre_op\()add.\suf_op \out, \out, \in +.endif +.endif + +.ifeqs "\pre_op", "xv" + xvpermi.q \out, \in, 0x01 + \pre_op\()add.\suf_op \in, \out, \in + xvpackod.d \out, \in, \in + \pre_op\()add.\suf_op \out, \out, \in +.ifnc "\suf_op", "d" + xvpackod.w \in, \out, \out + \pre_op\()add.\suf_op \out, \out, \in +.ifnc "\suf_op", "w" + xvpackod.h \in, \out, \out + 
\pre_op\()add.\suf_op \out, \out, \in +.ifnc "\suf_op", "h" + xvpackod.b \in, \out, \out + \pre_op\()add.\suf_op \out, \out, \in +.endif +.endif +.endif +.endif + +.ifeqs "\pre_op", "v" + vpackod.d \out, \in, \in + \pre_op\()add.\suf_op \out, \out, \in +.ifnc "\suf_op", "d" + vpackod.w \in, \out, \out + \pre_op\()add.\suf_op \out, \out, \in +.ifnc "\suf_op", "w" + vpackod.h \in, \out, \out + \pre_op\()add.\suf_op \out, \out, \in +.ifnc "\suf_op", "h" + vpackod.b \in, \out, \out + \pre_op\()add.\suf_op \out, \out, \in +.endif +.endif +.endif +.endif + +.ifnb \more + GACC \pre_op, \suf_op, \more +.endif +.endm +// +// GMOV +// +.macro GMOV pre_op:req, out:req, in:req, more:vararg + \pre_op\()or.v \out, \in, \in +.ifnb \more + GMOV \pre_op, \more +.endif +.endm + +// +// Media Related Macros +// +.macro GSBUTTERFLY pre_op, suf_op, out0, out1, in0, in1 + \pre_op\()ilvl.\suf_op \out0, \in0, \in1 + \pre_op\()ilvh.\suf_op \out1, \in0, \in1 +.endm +.macro GINTERLACE pre_op, suf_op, out0, out1, in0, in1 + \pre_op\()pickev.\suf_op \out0, \in0, \in1 + \pre_op\()pickod.\suf_op \out1, \in0, \in1 +.endm + +// +// TRANSPOSE4x4_D: Transpose 4x4 block with double-word elements in vectors, +// has no pre_op param. 128-bit vector instructions are not supported. +// +.macro GTRANSPOSE4x4_D in0, in1, in2, in3, out0, out1, out2, out3, \ + vt0, vt1 + GSBUTTERFLY xv, d, \vt0, \out1, \in1, \in0 + GSBUTTERFLY xv, d, \vt1, \out3, \in3, \in2 + GMOV xv, \out0, \vt0, \out2, \vt1, \vt1, \out3 + GPERMI xv, q, \out0, \out2, 0x02, \out2, \vt0, 0x31, \out3, \out1, 0x31, \out1, \vt1, 0x02 +.endm + +.macro GTRANSPOSE8x8_W out0, out1, out2, out3, out4, out5, out6, out7, \ + in0, in1, in2, in3, in4, in5, in6, in7, \ + tmp0, tmp1, tmp2, tmp3 + GSBUTTERFLY xv, w, \tmp0, \tmp2, \in2, \in0 + GSBUTTERFLY xv, w, \tmp1, \tmp3, \in3, \in1 + GSBUTTERFLY xv, w, \out0, \out1, \tmp1, \tmp0 + GSBUTTERFLY xv, w, \out2, \out3, \tmp3, \tmp2 + + GSBUTTERFLY xv, w, \tmp0, \tmp2, \in6, \in4 + GSBUTTERFLY xv, w, \tmp1, \tmp3, \in7, \in5 + GSBUTTERFLY xv, w, \out4, \out5, \tmp1, \tmp0 + GSBUTTERFLY xv, w, \out6, \out7, \tmp3, \tmp2 + + GMOV xv, \tmp0, \out0, \tmp1, \out1, \tmp2, \out2, \tmp3, \out3 + + GPERMI xv, q, \out0, \out4, 0x02, \out1, \out5, 0x02, \ + \out2, \out6, 0x02, \out3, \out7, 0x02, \ + \out4, \tmp0, 0x31, \out5, \tmp1, 0x31, \ + \out6, \tmp2, 0x31, \out7, \tmp3, 0x31 +.endm diff --git a/kernel/loongarch64/sgemm_kernel_16x8_lasx.S b/kernel/loongarch64/sgemm_kernel_16x8_lasx.S new file mode 100644 index 000000000..254dbe052 --- /dev/null +++ b/kernel/loongarch64/sgemm_kernel_16x8_lasx.S @@ -0,0 +1,2325 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+#include "loongarch64_asm.S"
+
+/*********************************************************************
+* 2023/08/23 guxiwei
+* UTEST : OK
+* CTEST : OK
+* TEST : OK
+*
+*
+* 2023/08/23 guxiwei
+* Parameter:
+* SGEMM_DEFAULT_UNROLL_N 8
+* SGEMM_DEFAULT_UNROLL_M 16
+* SGEMM_DEFAULT_P 256
+* SGEMM_DEFAULT_Q 256
+* SGEMM_DEFAULT_R 1024
+* A_PRE 1024
+* B_PRE 256 // Enabling prefetching for B results in a performance decrease, so it is temporarily disabled.
+*
+*
+* Performance on Loongson 3A5000 2.5GHz with 5000x5000x5000:
+* 1 thread: 71.7 GFLOPS
+* 2 threads: 142.6 GFLOPS
+* 3 threads: 211.5 GFLOPS
+* 4 threads: 265.0 GFLOPS
+*********************************************************************/
+
+/* Function parameters */
+#define M      $r4   // param 1: bm
+#define N      $r5   // param 2: bn
+#define K      $r6   // param 3: bk
+#define ALPHA  $f0   // param 4: alpha
+#define A      $r7   // param 5: ba
+#define B      $r8   // param 6: bb
+#define C      $r9   // param 7: bc
+#define LDC    $r10  // param 8: ldc
+
+#ifdef TRMMKERNEL
+#define OFFSET $r11  // param 9: offset
+#endif
+#define OFF    $r12
+
+/* Cycle control parameters */
+#define I      $r13
+#define J      $r14
+#define L      $r15
+#define TL     $r16
+/* Matrix address */
+#define A0     $r17
+#define B0     $r18
+#define C0     $r19
+#define C1     $r20
+#define C2     $r23
+#define C3     $r24
+#define C4     $r25
+#define C5     $r26
+#define C6     $r27
+#define C7     $r28
+#define T0     $r29
+#define T1     $r30
+#undef ZERO
+#define ZERO   $r0
+
+/* LASX Vectors
+ * Store 16 32-bit elements of A using U0 and U1, with each register holding 8 elements.
+ * Use X0 through X7 to store 8 32-bit elements of B, with each register holding the broadcast value of a single element.
+ * Use D0 to D15 to store intermediate values of the computation.
+ * Use VALPHA to store the broadcast value of alpha.
+ */
+#define U0     $xr0
+#define U1     $xr1
+#define X0     $xr2
+#define X1     $xr3
+#define X2     $xr4
+#define X3     $xr5
+#define X4     $xr6
+#define X5     $xr7
+#define X6     $xr8
+#define X7     $xr9
+#define D0     $xr10
+#define D1     $xr11
+#define D2     $xr12
+#define D3     $xr13
+#define D4     $xr14
+#define D5     $xr15
+#define D6     $xr16
+#define D7     $xr17
+#define D8     $xr18
+#define D9     $xr19
+#define D10    $xr20
+#define D11    $xr21
+#define D12    $xr22
+#define D13    $xr23
+#define D14    $xr24
+#define D15    $xr25
+#define VALPHA $xr26
+
+/* Prefetch interval */
+#define A_PRE  0x400
+#define B_PRE  0x100
+
+// Loops outline:
+// .L_N8 <-------------------------------------------------------------------------------------------- /* if N >> 3 == 0, goto .L_N7; else, enter .L_N8. */
+// |        .L_M16 <---------------------                                                             | /* if M >> 4 == 0, goto .L_M8; Otherwise, enter .L_M16.
*/ +// | | .L_M16_TL1 | | +// | | .L_M16_L7 | The entire core loop of the function, KERNEK16x8 | +// | | .L_M16_L71 | | +// | | .L_M16_L0 ---------------- | +// | .L_M8 | +// | | .L_M8_TL1 | | +// | | .L_M8_L7 | KERNEK8x8 | +// | | .L_M8_L71 | | +// | | .L_M8_L0 | | +// | .L_M4 | +// | | .L_M4_TL1 | | +// | | .L_M4_L7 | KERNEK4x8 | +// | | .L_M4_L71 | | +// | | .L_M4_L0 | | +// | .L_M2 | +// | | .L_M2_TL1 | | +// | | .L_M2_L7 | KERNEK2x8 | +// | | .L_M2_L71 | | +// | | .L_M2_L0 | | +// | .L_M1 | +// | | .L_M1_TL1 | | +// | | .L_M1_L7 | KERNEK1x8 | +// | | .L_M1_L71 | | +// | | .L_M1_L0 | | +// | .L_M0------------------------------------------------------------------------------------------ +// .L_N7 /* if N & 7 == 0, goto .L_N0; else, enter .L_N4 */ +// .L_N4 +// | .L_N4_M16 <--------------------- +// | | .L_N4_M16_TL1 | +// | | .L_N4_M16_L7 | KERNEL16x4 +// | | .L_N4_M16_L71 | +// | | .L_N4_M16_L0 ---------------- +// | .L_N4_M8 +// | | .L_N4_M8_TL1 | +// | | .L_N4_M8_L7 | KERNEL8x4 +// | | .L_N4_M8_L71 | +// | | .L_N4_M8_L0 | +// | .L_N4_M4 +// | | .L_N4_M4_TL1 | +// | | .L_N4_M4_L7 | KERNEL4x4 +// | | .L_N4_M4_L71 | +// | | .L_N4_M4_L0 | +// | .L_N4_M2 +// | | .L_N4_M2_TL1 | +// | | .L_N4_M2_L7 | KERNEL2x4 +// | | .L_N4_M2_L71 | +// | | .L_N4_M2_L0 | +// | .L_N4_M1 +// | | .L_N4_M1_TL1 | +// | | .L_N4_M1_L7 | KERNEL1x4 +// | | .L_N4_M1_L71 | +// | | .L_N4_M1_L0 | +// | .L_N4_M0 +// .L_N3 /* if N & 2 == 0, goto .L_N1; else enter .L_N2 */ +// .L_N2 +// | .L_N2_M16 <--------------------- +// | | .L_N2_M16_TL1 | +// | | .L_N2_M16_L7 | KERNEL16x2 +// | | .L_N2_M16_L71 | +// | | .L_N2_M16_L0 ---------------- +// | .L_N2_M8 +// | | .L_N2_M8_TL1 | +// | | .L_N2_M8_L7 | KERNEL8x2 +// | | .L_N2_M8_L71 | +// | | .L_N2_M8_L0 | +// | .L_N2_M4 +// | | .L_N2_M4_TL1 | +// | | .L_N2_M4_L7 | KERNEL4x2 +// | | .L_N2_M4_L71 | +// | | .L_N2_M4_L0 | +// | .L_N2_M2 +// | | .L_N2_M2_TL1 | +// | | .L_N2_M2_L7 | KERNEL2x2 +// | | .L_N2_M2_L71 | +// | | .L_N2_M2_L0 | +// | .L_N2_M1 +// | | .L_N2_M1_TL1 | +// | | .L_N2_M1_L7 | KERNEL1x2 +// | | .L_N2_M1_L71 | +// | | .L_N2_M1_L0 | +// | .L_N2_M0 +// .L_N1 +// | .L_N1_M16 <--------------------- +// | | .L_N1_M16_TL1 | +// | | .L_N1_M16_L7 | KERNEL16x1 +// | | .L_N1_M16_L71 | +// | | .L_N1_M16_L0 ---------------- +// | .L_N1_M8 +// | | .L_N1_M8_TL1 | +// | | .L_N1_M8_L7 | KERNEL8x1 +// | | .L_N1_M8_L71 | +// | | .L_N1_M8_L0 | +// | .L_N1_M4 +// | | .L_N1_M4_TL1 | +// | | .L_N1_M4_L7 | KERNEL4x1 +// | | .L_N1_M4_L71 | +// | | .L_N1_M4_L0 | +// | .L_N1_M2 +// | | .L_N1_M2_TL1 | +// | | .L_N1_M2_L7 | KERNEL2x1 +// | | .L_N1_M2_L71 | +// | | .L_N1_M2_L0 | +// | .L_N1_M1 +// | | .L_N1_M1_TL1 | +// | | .L_N1_M1_L7 | KERNEL1x1 +// | | .L_N1_M1_L71 | +// | | .L_N1_M1_L0 | +// | .L_N1_M0 +// .L_N0 + +/*************** sgemm_kernel_macros ***************/ +.macro KERNEL1x16x8_START + GLD xv, , U0, A0, 0x00, U1, A0, 0x20 + + GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04, X2, B0, 0x08, X3, B0, 0x0C + GMUL xvf, s, D0, U0, X0, D1, U1, X0 + preld 0, C0, 0x00 + GMUL xvf, s, D2, U0, X1, D3, U1, X1 + preld 0, C1, 0x00 + GMUL xvf, s, D4, U0, X2, D5, U1, X2 + preld 0, C2, 0x00 + GMUL xvf, s, D6, U0, X3, D7, U1, X3 + preld 0, C3, 0x00 + GLDREPL xv, w, X4, B0, 0x10, X5, B0, 0x14, X6, B0, 0x18, X7, B0, 0x1C + GMUL xvf, s, D8, U0, X4, D9, U1, X4 + preld 0, C4, 0x00 + GMUL xvf, s, D10, U0, X5, D11, U1, X5 + preld 0, C5, 0x00 + GMUL xvf, s, D12, U0, X6, D13, U1, X6 + preld 0, C6, 0x00 + GMUL xvf, s, D14, U0, X7, D15, U1, X7 + preld 0, C7, 0x00 + PTR_ADDI A0, A0, 0x40 + PTR_ADDI B0, B0, 0x20 
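+    // At this point D0..D15 hold the first rank-1 update of the 16x8 C tile:
+    // D[2j]   = A0[0..7]  * B0[j]
+    // D[2j+1] = A0[8..15] * B0[j]   (j = 0..7)
+    // In C-like pseudocode (illustrative sketch, names are placeholders):
+    //   for (j = 0; j < 8; j++)
+    //     for (i = 0; i < 16; i++)
+    //       acc[j][i] = a[i] * b[j];
+    // The following KERNEL1x16x8 steps accumulate into the same registers
+    // via GMADD, roughly acc[j][i] += a[i] * b[j] for each k step.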
+.endm + +.macro KERNEL1x16x8 + GLD xv, , U0, A0, 0x00, U1, A0, 0x20 + + GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04, X2, B0, 0x08, X3, B0, 0x0C + GMADD xvf, s, D0, U0, X0, D0, D1, U1, X0, D1, \ + D2, U0, X1, D2, D3, U1, X1, D3 + preld 0, A0, A_PRE + GMADD xvf, s, D4, U0, X2, D4, D5, U1, X2, D5, \ + D6, U0, X3, D6, D7, U1, X3 D7 + GLDREPL xv, w, X4, B0, 0x10, X5, B0, 0x14, X6, B0, 0x18, X7, B0, 0x1C + GMADD xvf, s, D8, U0, X4, D8, D9, U1, X4, D9, \ + D10, U0, X5, D10, D11, U1, X5, D11 + //preld 0, B0, B_PRE + GMADD xvf, s, D12, U0, X6, D12, D13, U1, X6, D13, \ + D14, U0, X7, D14, D15, U1, X7 D15 + PTR_ADDI A0, A0, 0x40 + PTR_ADDI B0, B0, 0x20 +.endm + +.macro KERNEL8x16x8 +.rept 8 + KERNEL1x16x8 +.endr +.endm + +.macro SAVE16x8 +#if defined(TRMMKERNEL) + GMUL xvf, s, D0, D0, VALPHA, D1, D1, VALPHA, D2, D2, VALPHA, D3, D3, VALPHA, \ + D4, D4, VALPHA, D5, D5, VALPHA, D6, D6, VALPHA, D7, D7, VALPHA, \ + D8, D8, VALPHA, D9, D9, VALPHA, D10, D10, VALPHA, D11, D11, VALPHA, \ + D12, D12, VALPHA, D13, D13, VALPHA, D14, D14, VALPHA, D15, D15, VALPHA +#else + /* Load C0 */ + GLD xv, , X0, C0, 0x00, X1, C0, 0x20 + GMADD xvf, s, D0, D0, VALPHA, X0, D1, D1, VALPHA, X1 + /* Load C1 */ + GLD xv, , X2, C1, 0x00, X3, C1, 0x20 + GMADD xvf, s, D2, D2, VALPHA, X2, D3, D3, VALPHA, X3 + /* Load C2 */ + GLD xv, , X4, C2, 0x00, X5, C2, 0x20 + GMADD xvf, s, D4, D4, VALPHA, X4, D5, D5, VALPHA, X5 + /* Load C3 */ + GLD xv, , X6, C3, 0x00, X7, C3, 0x20 + GMADD xvf, s, D6, D6, VALPHA, X6, D7, D7, VALPHA, X7 + /* Load C4 */ + GLD xv, , X0, C4, 0x00, X1, C4, 0x20 + GMADD xvf, s, D8, D8, VALPHA, X0, D9, D9, VALPHA, X1 + /* Load C5 */ + GLD xv, , X2, C5, 0x00, X3, C5, 0x20 + GMADD xvf, s, D10, D10, VALPHA, X2, D11, D11, VALPHA, X3 + /* Load C6 */ + GLD xv, , X4, C6, 0x00, X5, C6, 0x20 + GMADD xvf, s, D12, D12, VALPHA, X4, D13, D13, VALPHA, X5 + /* Load C7 */ + GLD xv, , X6, C7, 0x00, X7, C7, 0x20 + GMADD xvf, s, D14, D14, VALPHA, X6, D15, D15, VALPHA, X7 +#endif // #if defined(TRMMKERNEL) + GST xv, , D0, C0, 0x00, D1, C0, 0x20, \ + D2, C1, 0x00, D3, C1, 0x20, \ + D4, C2, 0x00, D5, C2, 0x20, \ + D6, C3, 0x00, D7, C3, 0x20, \ + D8, C4, 0x00, D9, C4, 0x20, \ + D10, C5, 0x00, D11, C5, 0x20, \ + D12, C6, 0x00, D13, C6, 0x20, \ + D14, C7, 0x00, D15, C7, 0x20 +#if __loongarch_grlen == 64 + GADDI , d, C0, C0, 0x40, C1, C1, 0x40, C2, C2, 0x40, C3, C3, 0x40, \ + C4, C4, 0x40, C5, C5, 0x40, C6, C6, 0x40, C7, C7, 0x40 +#else + GADDI , w, C0, C0, 0x40, C1, C1, 0x40, C2, C2, 0x40, C3, C3, 0x40, \ + C4, C4, 0x40, C5, C5, 0x40, C6, C6, 0x40, C7, C7, 0x40 +#endif +.endm + +// m = 8, 4, 2, 1 +// stride = 0x20, 0x10, 0x08, 0x04 +.macro KERNEL1xMx8_START m, stride +.if \m == 8 + GLD xv, , U0, A0, 0x00 +.elseif \m == 4 + GLD v, , $vr0, A0, 0x00 +.elseif \m ==2 + GLD f, d, $f0, A0, 0x00 +.elseif \m ==1 + GLD f, s, $f0, A0, 0x00 +.endif + GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04, X2, B0, 0x08, X3, B0, 0x0C + GMUL xvf, s, D0, U0, X0, D2, U0, X1, \ + D4, U0, X2, D6, U0, X3 + GLDREPL xv, w, X4, B0, 0x10, X5, B0, 0x14, X6, B0, 0x18, X7, B0, 0x1C + GMUL xvf, s, D8, U0, X4, D10, U0, X5, \ + D12, U0, X6, D14, U0, X7 + PTR_ADDI A0, A0, \stride + PTR_ADDI B0, B0, 0x20 +.endm + +.macro KERNEL1xMx8 m, stride +.if \m == 8 + GLD xv, , U0, A0, 0x00 +.elseif \m == 4 + GLD v, , $vr0, A0, 0x00 +.elseif \m ==2 + GLD f, d, $f0, A0, 0x00 +.elseif \m ==1 + GLD f, s, $f0, A0, 0x00 +.endif + + GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04, X2, B0, 0x08, X3, B0, 0x0C + GMADD xvf, s, D0, U0, X0, D0, D2, U0, X1, D2, \ + D4, U0, X2, D4, D6, U0, X3, D6 + GLDREPL xv, w, X4, B0, 
0x10, X5, B0, 0x14, X6, B0, 0x18, X7, B0, 0x1C + GMADD xvf, s, D8, U0, X4, D8, D10, U0, X5, D10, \ + D12, U0, X6, D12, D14, U0, X7, D14 + PTR_ADDI A0, A0, \stride + PTR_ADDI B0, B0, 0x20 +.endm + +.macro KERNEL8xMx8 m, stride +.rept 8 + KERNEL1xMx8 \m, \stride +.endr +.endm + +.macro SAVEMx8 m, stride +#if defined(TRMMKERNEL) + GMUL xvf, s, D0, D0, VALPHA, D2, D2, VALPHA, \ + D4, D4, VALPHA, D6, D6, VALPHA, \ + D8, D8, VALPHA, D10, D10, VALPHA, \ + D12, D12, VALPHA, D14, D14, VALPHA +#else + /* Load C0, C1, C2, C3, C4, C5, C6, C7 */ + .if \m == 8 + GLD xv, , X0, C0, 0x00, X2, C1, 0x00, X4, C2, 0x00, X6, C3, 0x00 + .elseif \m == 4 + GLD v, , $vr2, C0, 0x00, $vr4, C1, 0x00, $vr6, C2, 0x00, $vr8, C3, 0x00 +.elseif \m == 2 + GLD f, d, $f2, C0, 0x00, $f4, C1, 0x00, $f6, C2, 0x00, $f8, C3, 0x00 +.elseif \m == 1 + GLD f, s, $f2, C0, 0x00, $f4, C1, 0x00, $f6, C2, 0x00, $f8, C3, 0x00 + .endif + GMADD xvf, s, D0, D0, VALPHA, X0, D2, D2, VALPHA, X2, \ + D4, D4, VALPHA, X4, D6, D6, VALPHA, X6 +.if \m == 8 + GLD xv, , X0, C4, 0x00, X2, C5, 0x00, X4, C6, 0x00, X6, C7, 0x00 +.elseif \m == 4 + GLD v, , $vr2, C4, 0x00, $vr4, C5, 0x00, $vr6, C6, 0x00, $vr8, C7, 0x00 +.elseif \m == 2 + GLD f, d, $f2, C4, 0x00, $f4, C5, 0x00, $f6, C6, 0x00, $f8, C7, 0x00 +.elseif \m == 1 + GLD f, s, $f2, C4, 0x00, $f4, C5, 0x00, $f6, C6, 0x00, $f8, C7, 0x00 +.endif + GMADD xvf, s, D8, D8, VALPHA, X0, D10, D10, VALPHA, X2, \ + D12, D12, VALPHA, X4, D14, D14, VALPHA, X6 +#endif // #if defined(TRMMKERNEL) +.if \m == 8 + GST xv, , D0, C0, 0x00, D2, C1, 0x00, \ + D4, C2, 0x00, D6, C3, 0x00, \ + D8, C4, 0x00, D10, C5, 0x00, \ + D12, C6, 0x00, D14, C7, 0x00 +.elseif \m == 4 + GST v, , $vr10, C0, 0x00, $vr12, C1, 0x00, \ + $vr14, C2, 0x00, $vr16, C3, 0x00, \ + $vr18, C4, 0x00, $vr20, C5, 0x00, \ + $vr22, C6, 0x00, $vr24, C7, 0x00 +.elseif \m == 2 + GST f, d, $f10, C0, 0x00, $f12, C1, 0x00, \ + $f14, C2, 0x00, $f16, C3, 0x00, \ + $f18, C4, 0x00, $f20, C5, 0x00, \ + $f22, C6, 0x00, $f24, C7, 0x00 +.elseif \m == 1 + GST f, s, $f10, C0, 0x00, $f12, C1, 0x00, \ + $f14, C2, 0x00, $f16, C3, 0x00, \ + $f18, C4, 0x00, $f20, C5, 0x00, \ + $f22, C6, 0x00, $f24, C7, 0x00 +.endif +#if __loongarch_grlen == 64 + GADDI , d, C0, C0, \stride, C1, C1, \stride, C2, C2, \stride, C3, C3, \stride, \ + C4, C4, \stride, C5, C5, \stride, C6, C6, \stride, C7, C7, \stride +#else + GADDI , w, C0, C0, \stride, C1, C1, \stride, C2, C2, \stride, C3, C3, \stride, \ + C4, C4, \stride, C5, C5, \stride, C6, C6, \stride, C7, C7, \stride +#endif +.endm + +.macro KERNEL1x16x4_START + GLD xv, , U0, A0, 0x00, U1, A0, 0x20 + + GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04, X2, B0, 0x08, X3, B0, 0x0C + GMUL xvf, s, D0, U0, X0, D1, U1, X0, \ + D2, U0, X1, D3, U1, X1, \ + D4, U0, X2, D5, U1, X2, \ + D6, U0, X3, D7, U1, X3 + PTR_ADDI A0, A0, 0x40 + PTR_ADDI B0, B0, 0x10 +.endm + +.macro KERNEL1x16x4 + GLD xv, , U0, A0, 0x00, U1, A0, 0x20 + + GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04, X2, B0, 0x08, X3, B0, 0x0C + GMADD xvf, s, D0, U0, X0, D0, D1, U1, X0, D1, \ + D2, U0, X1, D2, D3, U1, X1, D3, \ + D4, U0, X2, D4, D5, U1, X2, D5, \ + D6, U0, X3, D6, D7, U1, X3 D7 + PTR_ADDI A0, A0, 0x40 + PTR_ADDI B0, B0, 0x10 +.endm + +.macro KERNEL8x16x4 +.rept 8 + KERNEL1x16x4 +.endr +.endm + +.macro SAVE16x4 +#if defined(TRMMKERNEL) + GMUL xvf, s, D0, D0, VALPHA, D1, D1, VALPHA, D2, D2, VALPHA, D3, D3, VALPHA, \ + D4, D4, VALPHA, D5, D5, VALPHA, D6, D6, VALPHA, D7, D7, VALPHA +#else + /* Load C0 */ + GLD xv, , X0, C0, 0x00, X1, C0, 0x20 + GMADD xvf, s, D0, D0, VALPHA, X0, D1, D1, VALPHA, X1 + /* 
Load C1 */ + GLD xv, , X2, C1, 0x00, X3, C1, 0x20 + GMADD xvf, s, D2, D2, VALPHA, X2, D3, D3, VALPHA, X3 + /* Load C2 */ + GLD xv, , X4, C2, 0x00, X5, C2, 0x20 + GMADD xvf, s, D4, D4, VALPHA, X4, D5, D5, VALPHA, X5 + /* Load C3 */ + GLD xv, , X6, C3, 0x00, X7, C3, 0x20 + GMADD xvf, s, D6, D6, VALPHA, X6, D7, D7, VALPHA, X7 +#endif // #if defined(TRMMKERNEL) + GST xv, , D0, C0, 0x00, D1, C0, 0x20, \ + D2, C1, 0x00, D3, C1, 0x20, \ + D4, C2, 0x00, D5, C2, 0x20, \ + D6, C3, 0x00, D7, C3, 0x20 +#if __loongarch_grlen == 64 + GADDI , d, C0, C0, 0x40, C1, C1, 0x40, C2, C2, 0x40, C3, C3, 0x40 +#else + GADDI , w, C0, C0, 0x40, C1, C1, 0x40, C2, C2, 0x40, C3, C3, 0x40 +#endif +.endm + +// m = 8, 4, 2, 1 +// stride = 0x20, 0x10, 0x08, 0x04 +.macro KERNEL1xMx4_START m, stride +.if \m == 8 + GLD xv, , U0, A0, 0x00 +.elseif \m == 4 + GLD v, , $vr0, A0, 0x00 +.elseif \m ==2 + GLD f, d, $f0, A0, 0x00 +.elseif \m ==1 + GLD f, s, $f0, A0, 0x00 +.endif + GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04, X2, B0, 0x08, X3, B0, 0x0C + GMUL xvf, s, D0, U0, X0, D2, U0, X1, \ + D4, U0, X2, D6, U0, X3 + PTR_ADDI A0, A0, \stride + PTR_ADDI B0, B0, 0x10 +.endm + +.macro KERNEL1xMx4 m, stride +.if \m == 8 + GLD xv, , U0, A0, 0x00 +.elseif \m == 4 + GLD v, , $vr0, A0, 0x00 +.elseif \m ==2 + GLD f, d, $f0, A0, 0x00 +.elseif \m ==1 + GLD f, s, $f0, A0, 0x00 +.endif + GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04, X2, B0, 0x08, X3, B0, 0x0C + GMADD xvf, s, D0, U0, X0, D0, D2, U0, X1, D2, \ + D4, U0, X2, D4, D6, U0, X3, D6 + PTR_ADDI A0, A0, \stride + PTR_ADDI B0, B0, 0x10 +.endm + +.macro KERNEL8xMx4 m, stride +.rept 8 + KERNEL1xMx4 \m, \stride +.endr +.endm + +.macro SAVEMx4 m, stride +#if defined(TRMMKERNEL) + GMUL xvf, s, D0, D0, VALPHA, D2, D2, VALPHA, \ + D4, D4, VALPHA, D6, D6, VALPHA +#else + /* Load C0, C1, C2, C3 */ + .if \m == 8 + GLD xv, , X0, C0, 0x00, X2, C1, 0x00, X4, C2, 0x00, X6, C3, 0x00 + .elseif \m == 4 + GLD v, , $vr2, C0, 0x00, $vr4, C1, 0x00, $vr6, C2, 0x00, $vr8, C3, 0x00 +.elseif \m == 2 + GLD f, d, $f2, C0, 0x00, $f4, C1, 0x00, $f6, C2, 0x00, $f8, C3, 0x00 +.elseif \m == 1 + GLD f, s, $f2, C0, 0x00, $f4, C1, 0x00, $f6, C2, 0x00, $f8, C3, 0x00 + .endif + GMADD xvf, s, D0, D0, VALPHA, X0, D2, D2, VALPHA, X2, \ + D4, D4, VALPHA, X4, D6, D6, VALPHA, X6 +#endif // #if defined(TRMMKERNEL) +.if \m == 8 + GST xv, , D0, C0, 0x00, D2, C1, 0x00, \ + D4, C2, 0x00, D6, C3, 0x00 +.elseif \m == 4 + GST v, , $vr10, C0, 0x00, $vr12, C1, 0x00, \ + $vr14, C2, 0x00, $vr16, C3, 0x00 +.elseif \m == 2 + GST f, d, $f10, C0, 0x00, $f12, C1, 0x00, \ + $f14, C2, 0x00, $f16, C3, 0x00 +.elseif \m == 1 + GST f, s, $f10, C0, 0x00, $f12, C1, 0x00, \ + $f14, C2, 0x00, $f16, C3, 0x00 +.endif +#if __loongarch_grlen == 64 + GADDI , d, C0, C0, \stride, C1, C1, \stride, C2, C2, \stride, C3, C3, \stride +#else + GADDI , w, C0, C0, \stride, C1, C1, \stride, C2, C2, \stride, C3, C3, \stride +#endif +.endm + +.macro KERNEL1x16x2_START + GLD xv, , U0, A0, 0x00, U1, A0, 0x20 + + GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04 + GMUL xvf, s, D0, U0, X0, D1, U1, X0, \ + D2, U0, X1, D3, U1, X1 + PTR_ADDI A0, A0, 0x40 + PTR_ADDI B0, B0, 0x08 +.endm + +.macro KERNEL1x16x2 + GLD xv, , U0, A0, 0x00, U1, A0, 0x20 + + GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04 + GMADD xvf, s, D0, U0, X0, D0, D1, U1, X0, D1, \ + D2, U0, X1, D2, D3, U1, X1, D3 + PTR_ADDI A0, A0, 0x40 + PTR_ADDI B0, B0, 0x08 +.endm + +.macro KERNEL8x16x2 +.rept 8 + KERNEL1x16x2 +.endr +.endm + +.macro SAVE16x2 +#if defined(TRMMKERNEL) + GMUL xvf, s, D0, D0, VALPHA, D1, D1, VALPHA, D2, D2, VALPHA, D3, 
D3, VALPHA +#else + /* Load C0 */ + GLD xv, , X0, C0, 0x00, X1, C0, 0x20 + GMADD xvf, s, D0, D0, VALPHA, X0, D1, D1, VALPHA, X1 + /* Load C1 */ + GLD xv, , X2, C1, 0x00, X3, C1, 0x20 + GMADD xvf, s, D2, D2, VALPHA, X2, D3, D3, VALPHA, X3 +#endif // #if defined(TRMMKERNEL) + GST xv, , D0, C0, 0x00, D1, C0, 0x20, \ + D2, C1, 0x00, D3, C1, 0x20 +#if __loongarch_grlen == 64 + GADDI , d, C0, C0, 0x40, C1, C1, 0x40 +#else + GADDI , w, C0, C0, 0x40, C1, C1, 0x40 +#endif +.endm + +// m = 8, 4, 2, 1 +// stride = 0x20, 0x10, 0x08, 0x04 +.macro KERNEL1xMx2_START m, stride +.if \m == 8 + GLD xv, , U0, A0, 0x00 +.elseif \m == 4 + GLD v, , $vr0, A0, 0x00 +.elseif \m ==2 + GLD f, d, $f0, A0, 0x00 +.elseif \m ==1 + GLD f, s, $f0, A0, 0x00 +.endif + GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04 + GMUL xvf, s, D0, U0, X0, D2, U0, X1 + PTR_ADDI A0, A0, \stride + PTR_ADDI B0, B0, 0x08 +.endm + +.macro KERNEL1xMx2 m, stride +.if \m == 8 + GLD xv, , U0, A0, 0x00 +.elseif \m == 4 + GLD v, , $vr0, A0, 0x00 +.elseif \m ==2 + GLD f, d, $f0, A0, 0x00 +.elseif \m ==1 + GLD f, s, $f0, A0, 0x00 +.endif + GLDREPL xv, w, X0, B0, 0x00, X1, B0, 0x04 + GMADD xvf, s, D0, U0, X0, D0, D2, U0, X1, D2 + PTR_ADDI A0, A0, \stride + PTR_ADDI B0, B0, 0x08 +.endm + +.macro KERNEL8xMx2 m, stride +.rept 8 + KERNEL1xMx2 \m, \stride +.endr +.endm + +.macro SAVEMx2 m, stride +#if defined(TRMMKERNEL) + GMUL xvf, s, D0, D0, VALPHA, D2, D2, VALPHA +#else + /* Load C0, C1 */ + .if \m == 8 + GLD xv, , X0, C0, 0x00, X2, C1, 0x00 + .elseif \m == 4 + GLD v, , $vr2, C0, 0x00, $vr4, C1, 0x00 +.elseif \m == 2 + GLD f, d, $f2, C0, 0x00, $f4, C1, 0x00 +.elseif \m == 1 + GLD f, s, $f2, C0, 0x00, $f4, C1, 0x00 + .endif + GMADD xvf, s, D0, D0, VALPHA, X0, D2, D2, VALPHA, X2 +#endif // #if defined(TRMMKERNEL) +.if \m == 8 + GST xv, , D0, C0, 0x00, D2, C1, 0x00 +.elseif \m == 4 + GST v, , $vr10, C0, 0x00, $vr12, C1, 0x00 +.elseif \m == 2 + GST f, d, $f10, C0, 0x00, $f12, C1, 0x00 +.elseif \m == 1 + GST f, s, $f10, C0, 0x00, $f12, C1, 0x00 +.endif +#if __loongarch_grlen == 64 + GADDI , d, C0, C0, \stride, C1, C1, \stride +#else + GADDI , w, C0, C0, \stride, C1, C1, \stride +#endif +.endm + +.macro KERNEL1x16x1_START + GLD xv, , U0, A0, 0x00, U1, A0, 0x20 + GLDREPL xv, w, X0, B0, 0x00 + GMUL xvf, s, D0, U0, X0, D1, U1, X0 + PTR_ADDI A0, A0, 0x40 + PTR_ADDI B0, B0, 0x04 +.endm + +.macro KERNEL1x16x1 + GLD xv, , U0, A0, 0x00, U1, A0, 0x20 + GLDREPL xv, w, X0, B0, 0x00 + GMADD xvf, s, D0, U0, X0, D0, D1, U1, X0, D1 + PTR_ADDI A0, A0, 0x40 + PTR_ADDI B0, B0, 0x04 +.endm + +.macro KERNEL8x16x1 +.rept 8 + KERNEL1x16x1 +.endr +.endm + +.macro SAVE16x1 +#if defined(TRMMKERNEL) + GMUL xvf, s, D0, D0, VALPHA, D1, D1, VALPHA +#else + /* Load C0 */ + GLD xv, , X0, C0, 0x00, X1, C0, 0x20 + GMADD xvf, s, D0, D0, VALPHA, X0, D1, D1, VALPHA, X1 +#endif // #if defined(TRMMKERNEL) + GST xv, , D0, C0, 0x00, D1, C0, 0x20 +#if __loongarch_grlen == 64 + GADDI , d, C0, C0, 0x40 +#else + GADDI , w, C0, C0, 0x40 +#endif +.endm + +// m = 8, 4, 2, 1 +// stride = 0x20, 0x10, 0x08, 0x04 +.macro KERNEL1xMx1_START m, stride +.if \m == 8 + GLD xv, , U0, A0, 0x00 +.elseif \m == 4 + GLD v, , $vr0, A0, 0x00 +.elseif \m ==2 + GLD f, d, $f0, A0, 0x00 +.elseif \m ==1 + GLD f, s, $f0, A0, 0x00 +.endif + GLDREPL xv, w, X0, B0, 0x00 + GMUL xvf, s, D0, U0, X0 + PTR_ADDI A0, A0, \stride + PTR_ADDI B0, B0, 0x04 +.endm + +.macro KERNEL1xMx1 m, stride +.if \m == 8 + GLD xv, , U0, A0, 0x00 +.elseif \m == 4 + GLD v, , $vr0, A0, 0x00 +.elseif \m ==2 + GLD f, d, $f0, A0, 0x00 +.elseif \m ==1 + GLD f, s, $f0, 
A0, 0x00 +.endif + GLDREPL xv, w, X0, B0, 0x00 + GMADD xvf, s, D0, U0, X0, D0 + PTR_ADDI A0, A0, \stride + PTR_ADDI B0, B0, 0x04 +.endm + +.macro KERNEL8xMx1 m, stride +.rept 8 + KERNEL1xMx1 \m, \stride +.endr +.endm + +.macro SAVEMx1 m, stride +#if defined(TRMMKERNEL) + GMUL xvf, s, D0, D0, VALPHA +#else + /* Load C0, C1 */ + .if \m == 8 + GLD xv, , X0, C0, 0x00 + .elseif \m == 4 + GLD v, , $vr2, C0, 0x00 +.elseif \m == 2 + GLD f, d, $f2, C0, 0x00 +.elseif \m == 1 + GLD f, s, $f2, C0, 0x00 + .endif + GMADD xvf, s, D0, D0, VALPHA, X0 +#endif // #if defined(TRMMKERNEL) +.if \m == 8 + GST xv, , D0, C0, 0x00 +.elseif \m == 4 + GST v, , $vr10, C0, 0x00 +.elseif \m == 2 + GST f, d, $f10, C0, 0x00 +.elseif \m == 1 + GST f, s, $f10, C0, 0x00 +.endif +#if __loongarch_grlen == 64 + GADDI , d, C0, C0, \stride +#else + GADDI , w, C0, C0, \stride +#endif +.endm + + PROLOGUE + push_if_used 26, 32 + xvreplve0.w VALPHA, $xr0 +#if defined (TRMMKERNEL) && !defined(LEFT) + PTR_SUB OFF, ZERO, OFFSET +#else + xor OFF, OFF, OFF +#endif + /* if (!(N >> 3)) goto L_N7 */ + PTR_SRAI J, N, 3 /* J = bn >> 3 */ + andi N, N, 0x07 + beq ZERO, J, .L_N7 +.L_N8: /* J -- */ + move C0, C + move A0, A + PTR_SLLI T0, LDC, 2 + PTR_ADDI J, J, -1 /* J-- */ +#if __loongarch_grlen == 64 + GADD , d, C1, C0, T0, C2, C1, T0, C3, C2, T0, C4, C3, T0, C5, C4, T0, \ + C6, C5, T0, C7, C6, T0 +#else + GADD , w, C1, C0, T0, C2, C1, T0, C3, C2, T0, C4, C3, T0, C5, C4, T0, \ + C6, C5, T0, C7, C6, T0 +#endif +#if defined(TRMMKERNEL) && defined(LEFT) + move OFF, OFFSET +#endif + /* if (!(M >> 4)) goto L_M8 */ + PTR_SRAI I, M, 4 /* I = bm >> 4 */ + beq ZERO, I, .L_M8 +.align 5 +.L_M16: /* I-- */ +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + PTR_SLLI T0, OFF, 0x06 + PTR_ADD A0, A0, T0 /* A0 += 16 * OFF */ + PTR_SLLI T0, OFF, 0x05 + PTR_ADD B0, B, T0 /* B0 = B + 8 * OFF */ +#endif +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + PTR_SUB L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + PTR_ADDI L, OFF, 16 +#else + /* number of values in B */ + PTR_ADDI L, OFF, 8 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + KERNEL1x16x8_START + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_M16_L7 */ + beq ZERO,TL, .L_M16_L7 +.align 5 +.L_M16_TL1: + KERNEL8x16x8 + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_M16_TL1 +.L_M16_L7: + andi TL, L, 7 + beq TL, ZERO,.L_M16_L0 +.align 5 +.L_M16_L71: + KERNEL1x16x8 + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_M16_L71 +.L_M16_L0: + SAVE16x8 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + PTR_SUB L, K, OFF +#ifdef LEFT + /* number of values in A */ + PTR_ADDI L, L, -16 +#else + /* number of values in B */ + PTR_ADDI L, L, -8 +#endif + PTR_SLLI T0, L, 0x06 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, L, 0x05 + PTR_ADD B0, B0, T0 +#endif + +#ifdef LEFT + PTR_ADDI OFF, OFF, 0x10 /* number of values in A */ +#endif +#endif // #if defined(TRMMKERNEL) + + PTR_ADDI I, I, -1 /* I-- */ + blt ZERO,I, .L_M16 +.L_M8: + /* We have done M & 16, considering M=8/4/2/1 */ + andi I, M, 15 + beq ZERO,I, .L_M0 + + andi I, M, 8 + beq ZERO,I, .L_M4 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + PTR_SLLI T0, OFF, 0x05 + PTR_ADD A0, A0, T0 /* A0 += 8 * OFF */ + PTR_SLLI T0, OFF, 0x05 + PTR_ADD B0, 
B, T0 /* B0 = B + 8 * OFF */ +#endif +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + PTR_SUB L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + PTR_ADDI L, OFF, 8 +#else + /* number of values in B */ + PTR_ADDI L, OFF, 8 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif // #if defined(TRMMKERNEL) + KERNEL1xMx8_START 8, 0x20 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_M8_L7 */ + beq ZERO,TL, .L_M8_L7 +.align 5 +.L_M8_TL1: + KERNEL8xMx8 8, 0x20 + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_M8_TL1 +.L_M8_L7: + /* if (!(L & 7)) goto L_M8_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_M8_L0 +.align 5 +.L_M8_L71: + KERNEL1xMx8 8, 0x20 + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_M8_L71 +.L_M8_L0: + SAVEMx8 8, 0x20 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + PTR_SUB L, K, OFF +#ifdef LEFT + /* number of values in A */ + PTR_ADDI L, L, -8 +#else + /* number of values in B */ + PTR_ADDI L, L, -8 +#endif + PTR_SLLI T0, L, 0x05 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, L, 0x05 + PTR_ADD B0, B0, T0 +#endif + +#ifdef LEFT + /* number of values in A */ + PTR_ADDI OFF, OFF, 0x08 +#endif +#endif // #if defined(TRMMKERNEL) +.L_M4: + andi I, M, 4 + beq ZERO,I, .L_M2 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + PTR_SLLI T0, OFF, 0x04 + PTR_ADD A0, A0, T0 /* A0 += 4 * OFF */ + PTR_SLLI T0, OFF, 0x05 + PTR_ADD B0, B, T0 /* B0 = B + 8 * OFF */ +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + PTR_SUB L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + PTR_ADDI L, OFF, 4 +#else + /* number of values in B */ + PTR_ADDI L, OFF, 8 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + KERNEL1xMx8_START 4, 0x10 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_M4_L7 */ + beq ZERO,TL, .L_M4_L7 +.align 5 +.L_M4_TL1: + KERNEL8xMx8 4, 0x10 + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_M4_TL1 +.L_M4_L7: + /* if (!(L & 7)) goto L_M4_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_M4_L0 +.L_M4_L71: + KERNEL1xMx8 4, 0x10 + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_M4_L71 +.L_M4_L0: + SAVEMx8 4, 0x10 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + PTR_SUB L, K, OFF +#ifdef LEFT + /* number of values in A */ + PTR_ADDI L, L, -4 +#else + /* number of values in B */ + PTR_ADDI L, L, -8 +#endif + PTR_SLLI T0, L, 0x04 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, L, 0x05 + PTR_ADD B0, B0, T0 +#endif + +#ifdef LEFT + /* number of values in A */ + PTR_ADDI OFF, OFF, 0x04 +#endif +#endif // #if defined(TRMMKERNEL) +.L_M2: + andi I, M, 2 + beq ZERO,I, .L_M1 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + PTR_SLLI T0, OFF, 0x03 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, OFF, 0x05 + PTR_ADD B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + PTR_SUB L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + PTR_ADDI L, OFF, 2 +#else + /* number of values in B */ + PTR_ADDI L, OFF, 8 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + KERNEL1xMx8_START 2, 0x08 + + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) 
>> 3 */ + /* if (TL < 1) goto L_M2_L7 */ + beq ZERO,TL, .L_M2_L7 +.align 5 +.L_M2_TL1: + KERNEL8xMx8 2, 0x08 + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_M2_TL1 +.L_M2_L7: + /* if (!(L & 7)) goto L_M2_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_M2_L0 +.align 5 +.L_M2_L71: + KERNEL1xMx8 2, 0x08 + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_M2_L71 +.L_M2_L0: + SAVEMx8 2, 0x08 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + PTR_SUB L, K, OFF +#ifdef LEFT + /* number of values in A */ + PTR_ADDI L, L, -2 +#else + /* number of values in B */ + PTR_ADDI L, L, -8 +#endif + PTR_SLLI T0, L, 0x03 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, L, 0x05 + PTR_ADD B0, B0, T0 +#endif + +#ifdef LEFT + /* number of values in A */ + PTR_ADDI OFF, OFF, 0x02 +#endif +#endif // #if defined(TRMMKERNEL) +.L_M1: + andi I, M, 1 + beq ZERO,I, .L_M0 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + PTR_SLLI T0, OFF, 0x02 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, OFF, 0x05 + PTR_ADD B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + PTR_SUB L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + PTR_ADDI L, OFF, 1 +#else + /* number of values in B */ + PTR_ADDI L, OFF, 8 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + KERNEL1xMx8_START 1, 0x04 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_M1_L7 */ + beq ZERO,TL, .L_M1_L7 +.align 5 +.L_M1_TL1: + KERNEL8xMx8 1, 0x04 + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_M1_TL1 +.L_M1_L7: + /* if (!(L & 7)) goto L_M1_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_M1_L0 +.align 5 +.L_M1_L71: + KERNEL1xMx8 1, 0x04 + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_M1_L71 +.L_M1_L0: + SAVEMx8 1, 0x04 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + PTR_SUB L, K, OFF +#ifdef LEFT + /* number of values in A */ + PTR_ADDI L, L, -1 +#else + /* number of values in B */ + PTR_ADDI L, L, -8 +#endif + PTR_SLLI T0, L, 0x02 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, L, 0x05 + PTR_ADD B0, B0, T0 +#endif + +#ifdef LEFT + /* number of values in A */ + PTR_ADDI OFF, OFF, 0x01 +#endif +#endif // #if defined(TRMMKERNEL) + +.L_M0: + /* Add stride for B and C + * B += (K * 32) + * C += (LDC * 32) + */ + PTR_SLLI T0, K, 5 + PTR_SLLI T1, LDC, 5 + PTR_ADD B, B, T0 + PTR_ADD C, C, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + PTR_ADDI OFF, OFF, 0x08 /* number of values in B */ +#endif + blt ZERO, J, .L_N8 + +.L_N7: + andi J, N, 4 + beq ZERO, J, .L_N3 +.L_N4: + move C0, C + move A0, A + PTR_SLLI T0, LDC, 2 +#if __loongarch_grlen == 64 + GADD , d, C1, C0, T0, C2, C1, T0, C3, C2, T0 +#else + GADD , w, C1, C0, T0, C2, C1, T0, C3, C2, T0 +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + move OFF, OFFSET +#endif + + /* if (!(M >> 4)) goto L_N4_M8 */ + PTR_SRAI I, M, 4 /* I = bm >> 4 */ + beq ZERO, I, .L_N4_M8 +.align 5 +.L_N4_M16: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + PTR_SLLI T0, OFF, 0x06 + PTR_ADD A0, A0, T0 /* A0 += 16 * OFF */ + PTR_SLLI T0, OFF, 0x04 + PTR_ADD B0, B, T0 /* B0 += 4 * OFF */ +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + PTR_SUB L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + PTR_ADDI L, OFF, 16 +#else + /* number of values in B 
*/ + PTR_ADDI L, OFF, 4 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + KERNEL1x16x4_START + + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N4_L7 */ + beq ZERO,TL, .L_N4_M16_L7 +.align 5 +.L_N4_M16_TL1: /* TL-- */ + KERNEL8x16x4 + + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N4_M16_TL1 +.L_N4_M16_L7: + /* if (!(L & 7)) goto L_N4_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N4_M16_L0 +.align 5 +.L_N4_M16_L71: + KERNEL1x16x4 + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_N4_M16_L71 +.L_N4_M16_L0: + SAVE16x4 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + PTR_SUB L, K, OFF +#ifdef LEFT + PTR_ADDI L, L, -16 +#else + PTR_ADDI L, L, -4 +#endif + PTR_SLLI T0, L, 0x06 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, L, 0x04 + PTR_ADD B0, B0, T0 +#endif + +#ifdef LEFT + PTR_ADDI OFF, OFF, 0x10 +#endif +#endif // #if defined(TRMMKERNEL) + + PTR_ADDI I, I, -1 /* I-- */ + blt ZERO,I, .L_N4_M16 +.L_N4_M8: + /* We have done M & 16, considering M=8/4/2/1 */ + andi I, M, 15 + beq ZERO,I, .L_N4_M0 + + andi I, M, 8 + beq ZERO,I, .L_N4_M4 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + PTR_SLLI T0, OFF, 0x05 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, OFF, 0x04 + PTR_ADD B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + PTR_SUB L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + PTR_ADDI L, OFF, 8 +#else + /* number of values in B */ + PTR_ADDI L, OFF, 4 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + KERNEL1xMx4_START 8, 0x20 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N4_M8_L7 */ + beq ZERO,TL, .L_N4_M8_L7 +.align 5 +.L_N4_M8_TL1: /* TL-- */ + KERNEL8xMx4 8, 0x20 + + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N4_M8_TL1 +.L_N4_M8_L7: + /* if (!(L & 7)) goto L_N4_M8_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N4_M8_L0 +.align 5 +.L_N4_M8_L71: + KERNEL1xMx4 8, 0x20 + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_N4_M8_L71 +.L_N4_M8_L0: + SAVEMx4 8, 0x20 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + PTR_SUB L, K, OFF +#ifdef LEFT + PTR_ADDI L, L, -8 +#else + PTR_ADDI L, L, -4 +#endif + PTR_SLLI T0, L, 0x05 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, L, 0x04 + PTR_ADD B0, B0, T0 +#endif + +#ifdef LEFT + PTR_ADDI OFF, OFF, 0x08 +#endif +#endif // #if defined(TRMMKERNEL) +.L_N4_M4: + andi I, M, 4 + beq ZERO,I, .L_N4_M2 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + PTR_SLLI T0, OFF, 0x04 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, OFF, 0x04 + PTR_ADD B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + PTR_SUB L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + PTR_ADDI L, OFF, 4 +#else + /* number of values in B */ + PTR_ADDI L, OFF, 4 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + KERNEL1xMx4_START 4, 0x10 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N4_M4_L7 */ + beq ZERO,TL, .L_N4_M4_L7 +.align 5 +.L_N4_M4_TL1: /* TL-- */ + KERNEL8xMx4 4, 0x10 + + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N4_M4_TL1 +.L_N4_M4_L7: + /* if (!(L & 7)) goto L_N4_M4_L0 */ + 
andi TL, L, 7 + beq TL, ZERO,.L_N4_M4_L0 +.align 5 +.L_N4_M4_L71: + KERNEL1xMx4 4, 0x10 + + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_N4_M4_L71 +.L_N4_M4_L0: + SAVEMx4 4, 0x10 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + PTR_SUB L, K, OFF +#ifdef LEFT + PTR_ADDI L, L, -4 +#else + PTR_ADDI L, L, -4 +#endif + PTR_SLLI T0, L, 0x04 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, L, 0x04 + PTR_ADD B0, B0, T0 +#endif + +#ifdef LEFT + PTR_ADDI OFF, OFF, 0x04 +#endif +#endif // #if defined(TRMMKERNEL) +.L_N4_M2: + andi I, M, 2 + beq ZERO,I, .L_N4_M1 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + PTR_SLLI T0, OFF, 0x03 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, OFF, 0x04 + PTR_ADD B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + PTR_SUB L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + PTR_ADDI L, OFF, 2 +#else + /* number of values in B */ + PTR_ADDI L, OFF, 4 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + KERNEL1xMx4_START 2, 0x08 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N4_M2_L7 */ + beq ZERO,TL, .L_N4_M2_L7 +.align 5 +.L_N4_M2_TL1: /* TL-- */ + KERNEL8xMx4 2, 0x08 + + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N4_M2_TL1 +.L_N4_M2_L7: + /* if (!(L & 7)) goto L_N4_M2_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N4_M2_L0 +.align 5 +.L_N4_M2_L71: + KERNEL1xMx4 2, 0x08 + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_N4_M2_L71 +.L_N4_M2_L0: + SAVEMx4 2, 0x08 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + PTR_SUB L, K, OFF +#ifdef LEFT + PTR_ADDI L, L, -2 +#else + PTR_ADDI L, L, -4 +#endif + PTR_SLLI T0, L, 0x03 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, L, 0x04 + PTR_ADD B0, B0, T0 +#endif + +#ifdef LEFT + PTR_ADDI OFF, OFF, 0x02 +#endif +#endif // #if defined(TRMMKERNEL) +.L_N4_M1: + andi I, M, 1 + beq ZERO,I, .L_N4_M0 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + PTR_SLLI T0, OFF, 0x02 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, OFF, 0x04 + PTR_ADD B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + PTR_SUB L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + PTR_ADDI L, OFF, 1 +#else + /* number of values in B */ + PTR_ADDI L, OFF, 4 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + KERNEL1xMx4_START 1, 0x04 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N4_M1_L7 */ + beq ZERO,TL, .L_N4_M1_L7 +.align 5 +.L_N4_M1_TL1: /* TL-- */ + KERNEL8xMx4 1, 0x04 + + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N4_M1_TL1 +.L_N4_M1_L7: + /* if (!(L & 7)) goto L_N4_M1_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N4_M1_L0 +.align 5 +.L_N4_M1_L71: + KERNEL1xMx4 1, 0x04 + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_N4_M1_L71 +.L_N4_M1_L0: + SAVEMx4 1, 0x04 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + PTR_SUB L, K, OFF +#ifdef LEFT + PTR_ADDI L, L, -1 +#else + PTR_ADDI L, L, -4 +#endif + PTR_SLLI T0, L, 0x02 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, L, 0x04 + PTR_ADD B0, B0, T0 +#endif + +#ifdef LEFT + PTR_ADDI OFF, OFF, 0x01 +#endif +#endif // #if defined(TRMMKERNEL) +.L_N4_M0: + /* Add stride for B and C 
+ * B += 4 * K + * C += 4 * LDC + */ + PTR_SLLI T0, K, 4 + PTR_SLLI T1, LDC, 4 + PTR_ADD B, B, T0 + PTR_ADD C, C, T1 + +#if defined(TRMMKERNEL) && !defined(LEFT) + PTR_ADDI OFF, OFF, 0x04 +#endif + /* We must reinit I */ + PTR_SRAI I, M, 4 /* I = bm >> 4 */ +.L_N3: + andi J, N, 2 + beq ZERO, J, .L_N1 + +.L_N2: + move C0, C + move A0, A + PTR_SLLI T0, LDC, 2 + PTR_ADD C1, C0, T0 + +#if defined(TRMMKERNEL) && defined(LEFT) + move OFF, OFFSET +#endif + + /* if (!(M >> 4)) goto L_N2_M8 */ + PTR_SRAI I, M, 4 /* I = bm >> 4 */ + beq ZERO, I, .L_N2_M8 +.align 5 +.L_N2_M16: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + PTR_SLLI T0, OFF, 0x06 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, OFF, 0x03 + PTR_ADD B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + PTR_SUB L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + PTR_ADDI L, OFF, 16 +#else + /* number of values in B */ + PTR_ADDI L, OFF, 2 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + KERNEL1x16x2_START + + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N2_M16_L7 */ + beq ZERO,TL, .L_N2_M16_L7 +.align 5 +.L_N2_M16_TL1: /* TL-- */ + KERNEL8x16x2 + + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N2_M16_TL1 +.L_N2_M16_L7: + /* if (!(L & 7)) goto L_N2_M16_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N2_M16_L0 +.align 5 +.L_N2_M16_L71: + KERNEL1x16x2 + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_N2_M16_L71 +.L_N2_M16_L0: + SAVE16x2 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + PTR_SUB L, K, OFF +#ifdef LEFT + PTR_ADDI L, L, -16 +#else + PTR_ADDI L, L, -2 +#endif + PTR_SLLI T0, L, 0x06 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, L, 0x03 + PTR_ADD B0, B0, T0 +#endif + +#ifdef LEFT + PTR_ADDI OFF, OFF, 0x10 +#endif +#endif // #if defined(TRMMKERNEL) + + PTR_ADDI I, I, -1 /* I-- */ + blt ZERO,I, .L_N2_M16 +.L_N2_M8: + /* We have done M & 16, considering M=8/4/2/1 */ + andi I, M, 15 + beq ZERO,I, .L_N2_M0 + + andi I, M, 8 + beq ZERO,I, .L_N2_M4 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + PTR_SLLI T0, OFF, 0x05 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, OFF, 0x03 + PTR_ADD B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + PTR_SUB L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + PTR_ADDI L, OFF, 8 +#else + /* number of values in B */ + PTR_ADDI L, OFF, 2 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + KERNEL1xMx2_START 8, 0x20 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N2_M8_L7 */ + beq ZERO,TL, .L_N2_M8_L7 +.align 5 +.L_N2_M8_TL1: /* TL-- */ + KERNEL8xMx2 8, 0x20 + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N2_M8_TL1 +.L_N2_M8_L7: + /* if (!(L & 7)) goto L_N2_M8_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N2_M8_L0 +.align 5 +.L_N2_M8_L71: + KERNEL1xMx2 8, 0x20 + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_N2_M8_L71 +.L_N2_M8_L0: + SAVEMx2 8, 0x20 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + PTR_SUB L, K, OFF +#ifdef LEFT + PTR_ADDI L, L, -8 +#else + PTR_ADDI L, L, -2 +#endif + PTR_SLLI T0, L, 0x05 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, L, 0x03 + PTR_ADD B0, B0, T0 +#endif + +#ifdef LEFT + 
PTR_ADDI OFF, OFF, 0x08 +#endif +#endif // #if defined(TRMMKERNEL) +.L_N2_M4: + andi I, M, 4 + beq ZERO,I, .L_N2_M2 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + PTR_SLLI T0, OFF, 0x04 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, OFF, 0x03 + PTR_ADD B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + PTR_SUB L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + PTR_ADDI L, OFF, 4 +#else + /* number of values in B */ + PTR_ADDI L, OFF, 2 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + KERNEL1xMx2_START 4, 0x10 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N2_M4_L7 */ + beq ZERO,TL, .L_N2_M4_L7 +.align 5 +.L_N2_M4_TL1: /* TL-- */ + KERNEL8xMx2 4, 0x10 + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N2_M4_TL1 +.L_N2_M4_L7: + /* if (!(L & 7)) goto L_N2_M4_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N2_M4_L0 +.align 5 +.L_N2_M4_L71: + KERNEL1xMx2 4, 0x10 + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_N2_M4_L71 +.L_N2_M4_L0: + SAVEMx2 4, 0x10 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + PTR_SUB L, K, OFF +#ifdef LEFT + PTR_ADDI L, L, -4 +#else + PTR_ADDI L, L, -2 +#endif + PTR_SLLI T0, L, 0x04 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, L, 0x03 + PTR_ADD B0, B0, T0 +#endif + +#ifdef LEFT + PTR_ADDI OFF, OFF, 0x04 +#endif +#endif // #if defined(TRMMKERNEL) +.L_N2_M2: + andi I, M, 2 + beq ZERO,I, .L_N2_M1 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + PTR_SLLI T0, OFF, 0x03 + PTR_ADD A0, A0, T0 + PTR_ADD B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + PTR_SUB L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + PTR_ADDI L, OFF, 2 +#else + /* number of values in B */ + PTR_ADDI L, OFF, 2 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + KERNEL1xMx2_START 2, 0x08 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N2_M2_L7 */ + beq ZERO,TL, .L_N2_M2_L7 +.align 5 +.L_N2_M2_TL1: /* TL-- */ + KERNEL8xMx2 2, 0x08 + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N2_M2_TL1 +.L_N2_M2_L7: + /* if (!(L & 7)) goto L_N2_M2_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N2_M2_L0 +.align 5 +.L_N2_M2_L71: + KERNEL1xMx2 2, 0x08 + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_N2_M2_L71 +.L_N2_M2_L0: + SAVEMx2 2, 0x08 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + PTR_SUB L, K, OFF +#ifdef LEFT + PTR_ADDI L, L, -2 +#else + PTR_ADDI L, L, -2 +#endif + PTR_SLLI T0, L, 0x03 + PTR_ADD A0, A0, T0 + PTR_ADD B0, B0, T0 +#endif + +#ifdef LEFT + PTR_ADDI OFF, OFF, 0x02 +#endif +#endif // #if defined(TRMMKERNEL) +.L_N2_M1: + andi I, M, 1 + beq ZERO,I, .L_N2_M0 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + PTR_SLLI T0, OFF, 0x02 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, OFF, 0x03 + PTR_ADD B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + PTR_SUB L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + PTR_ADDI L, OFF, 1 +#else + /* number of values in B */ + PTR_ADDI L, OFF, 2 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move 
L, K /* L = bk */ +#endif + KERNEL1xMx2_START 1, 0x04 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N2_M1_L7 */ + beq ZERO,TL, .L_N2_M1_L7 +.align 5 +.L_N2_M1_TL1: /* TL-- */ + KERNEL8xMx2 1, 0x04 + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N2_M1_TL1 +.L_N2_M1_L7: + /* if (!(L & 7)) goto L_N2_M1_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N2_M1_L0 +.align 5 +.L_N2_M1_L71: + KERNEL1xMx2 1, 0x04 + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_N2_M1_L71 +.L_N2_M1_L0: + SAVEMx2 1, 0x04 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + PTR_SUB L, K, OFF +#ifdef LEFT + PTR_ADDI L, L, -1 +#else + PTR_ADDI L, L, -2 +#endif + PTR_SLLI T0, L, 0x02 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, L, 0x03 + PTR_ADD B0, B0, T0 +#endif + +#ifdef LEFT + PTR_ADDI OFF, OFF, 0x01 +#endif +#endif // #if defined(TRMMKERNEL) +.L_N2_M0: + /* Add stride for B and C + * B += 2 * K + * C += 2 * LDC + */ + PTR_SLLI T0, K, 3 + PTR_SLLI T1, LDC, 3 + PTR_ADD B, B, T0 + PTR_ADD C, C, T1 +#if defined(TRMMKERNEL) && !defined(LEFT) + PTR_ADDI OFF, OFF, 0x02 +#endif + /* We must reinit I */ + PTR_SRAI I, M, 4 /* I = bm >> 4 */ +.L_N1: + andi J, N, 1 + beq ZERO, J, .L_N0 + move C0, C + move A0, A + +#if defined(TRMMKERNEL) && defined(LEFT) + move OFF, OFFSET +#endif + /* if (!(M >> 4)) goto L_N1_M8 */ + PTR_SRAI I, M, 4 /* I = bm >> 4 */ + beq ZERO, I, .L_N1_M8 +.L_N1_M16: +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + PTR_SLLI T0, OFF, 0x06 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, OFF, 0x02 + PTR_ADD B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + PTR_SUB L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + PTR_ADDI L, OFF, 16 +#else + /* number of values in B */ + PTR_ADDI L, OFF, 1 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + KERNEL1x16x1_START + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N1_M16_L7 */ + beq ZERO,TL, .L_N1_M16_L7 +.align 5 +.L_N1_M16_TL1: /* TL-- */ + KERNEL8x16x1 + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N1_M16_TL1 +.L_N1_M16_L7: + /* if (!(L & 7)) goto L_N1_M16_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N1_M16_L0 +.align 5 +.L_N1_M16_L71: + KERNEL1x16x1 + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_N1_M16_L71 +.L_N1_M16_L0: + SAVE16x1 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + PTR_SUB L, K, OFF +#ifdef LEFT + PTR_ADDI L, L, -16 +#else + PTR_ADDI L, L, -1 +#endif + PTR_SLLI T0, L, 0x06 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, L, 0x02 + PTR_ADD B0, B0, T0 +#endif + +#ifdef LEFT + PTR_ADDI OFF, OFF, 0x10 +#endif +#endif // #if defined(TRMMKERNEL) + + PTR_ADDI I, I, -1 /* I-- */ + blt ZERO,I, .L_N1_M16 +.L_N1_M8: + /* We have done M & 16, considering M=8/4/2/1 */ + andi I, M, 15 + beq ZERO,I, .L_N1_M0 + + andi I, M, 8 + beq ZERO,I, .L_N1_M4 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + PTR_SLLI T0, OFF, 0x05 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, OFF, 0x02 + PTR_ADD B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + PTR_SUB L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + PTR_ADDI L, OFF, 8 +#else + /* number of values in B */ + PTR_ADDI L, OFF, 1 +#endif +#else // #if 
!defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + KERNEL1xMx1_START 8, 0x20 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N1_M8_L7 */ + beq ZERO,TL, .L_N1_M8_L7 +.align 5 +.L_N1_M8_TL1: /* TL-- */ + KERNEL8xMx1 8, 0x20 + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N1_M8_TL1 +.L_N1_M8_L7: + /* if (!(L & 7)) goto L_N1_M8_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N1_M8_L0 +.align 5 +.L_N1_M8_L71: + KERNEL1xMx1 8, 0x20 + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_N1_M8_L71 +.L_N1_M8_L0: + SAVEMx1 8, 0x20 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + PTR_SUB L, K, OFF +#ifdef LEFT + PTR_ADDI L, L, -8 +#else + PTR_ADDI L, L, -1 +#endif + PTR_SLLI T0, L, 0x05 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, L, 0x02 + PTR_ADD B0, B0, T0 +#endif + +#ifdef LEFT + PTR_ADDI OFF, OFF, 0x08 +#endif +#endif // #if defined(TRMMKERNEL) +.L_N1_M4: + andi I, M, 4 + beq ZERO,I, .L_N1_M2 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + PTR_SLLI T0, OFF, 0x04 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, OFF, 0x02 + PTR_ADD B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + PTR_SUB L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + PTR_ADDI L, OFF, 4 +#else + /* number of values in B */ + PTR_ADDI L, OFF, 1 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + KERNEL1xMx1_START 4, 0x10 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N1_M4_L7 */ + beq ZERO,TL, .L_N1_M4_L7 +.align 5 +.L_N1_M4_TL1: /* TL-- */ + KERNEL8xMx1 4, 0x10 + + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N1_M4_TL1 +.L_N1_M4_L7: + /* if (!(L & 7)) goto L_N1_M4_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N1_M4_L0 +.align 5 +.L_N1_M4_L71: + KERNEL1xMx1 4, 0x10 + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_N1_M4_L71 +.L_N1_M4_L0: + SAVEMx1 4, 0x10 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + PTR_SUB L, K, OFF +#ifdef LEFT + PTR_ADDI L, L, -4 +#else + PTR_ADDI L, L, -1 +#endif + PTR_SLLI T0, L, 0x04 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, L, 0x02 + PTR_ADD B0, B0, T0 +#endif + +#ifdef LEFT + PTR_ADDI OFF, OFF, 0x04 +#endif +#endif // #if defined(TRMMKERNEL) +.L_N1_M2: + andi I, M, 2 + beq ZERO,I, .L_N1_M1 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + PTR_SLLI T0, OFF, 0x03 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, OFF, 0x02 + PTR_ADD B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + PTR_SUB L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + PTR_ADDI L, OFF, 2 +#else + /* number of values in B */ + PTR_ADDI L, OFF, 1 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + KERNEL1xMx1_START 2, 0x08 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N1_M2_L7 */ + beq ZERO,TL, .L_N1_M2_L7 +.align 5 +.L_N1_M2_TL1: /* TL-- */ + KERNEL8xMx1 2, 0x08 + + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N1_M2_TL1 +.L_N1_M2_L7: + /* if (!(L & 7)) goto L_N1_M2_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N1_M2_L0 +.align 5 +.L_N1_M2_L71: + KERNEL1xMx1 2, 0x08 + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_N1_M2_L71 +.L_N1_M2_L0: + SAVEMx1 2, 0x08 +#if 
defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + PTR_SUB L, K, OFF +#ifdef LEFT + PTR_ADDI L, L, -2 +#else + PTR_ADDI L, L, -1 +#endif + PTR_SLLI T0, L, 0x03 + PTR_ADD A0, A0, T0 + PTR_SLLI T0, L, 0x02 + PTR_ADD B0, B0, T0 +#endif + +#ifdef LEFT + PTR_ADDI OFF, OFF, 0x02 +#endif +#endif // #if defined(TRMMKERNEL) + +.L_N1_M1: + andi I, M, 1 + beq ZERO,I, .L_N1_M0 + +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + move B0, B +#else + PTR_SLLI T0, OFF, 0x02 + PTR_ADD A0, A0, T0 + PTR_ADD B0, B, T0 +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + PTR_SUB L, K, OFF +#elif defined(LEFT) + /* number of values in A */ + PTR_ADDI L, OFF, 1 +#else + /* number of values in B */ + PTR_ADDI L, OFF, 1 +#endif +#else // #if !defined(TRMMKERNEL) + move B0, B + move L, K /* L = bk */ +#endif + KERNEL1xMx1_START 1, 0x04 + /* Reduce L */ + PTR_ADDI L, L, -1 + PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */ + /* if (TL < 1) goto L_N1_M1_L7 */ + beq ZERO,TL, .L_N1_M1_L7 +.align 5 +.L_N1_M1_TL1: /* TL-- */ + KERNEL8xMx1 1, 0x04 + + PTR_ADDI TL, TL, -1 /* TL-- */ + blt ZERO,TL, .L_N1_M1_TL1 +.L_N1_M1_L7: + /* if (!(L & 7)) goto L_N1_M1_L0 */ + andi TL, L, 7 + beq TL, ZERO,.L_N1_M1_L0 +.align 5 +.L_N1_M1_L71: + KERNEL1xMx1 1, 0x04 + PTR_ADDI TL, TL, -1 + blt ZERO,TL, .L_N1_M1_L71 +.L_N1_M1_L0: + SAVEMx1 1, 0x04 +#if defined(TRMMKERNEL) +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + PTR_SUB L, K, OFF +#ifdef LEFT + PTR_ADDI L, L, -1 +#else + PTR_ADDI L, L, -1 +#endif + PTR_SLLI T0, L, 0x02 + PTR_ADD A0, A0, T0 + PTR_ADD B0, B0, T0 +#endif + +#ifdef LEFT + PTR_ADDI OFF, OFF, 0x01 +#endif +#endif // #if defined(TRMMKERNEL) +.L_N1_M0: +.L_N0: + pop_if_used 26, 32 + jirl $r0, $r1, 0x0 + EPILOGUE diff --git a/kernel/loongarch64/sgemm_ncopy_16_lasx.S b/kernel/loongarch64/sgemm_ncopy_16_lasx.S new file mode 100644 index 000000000..266c07c5c --- /dev/null +++ b/kernel/loongarch64/sgemm_ncopy_16_lasx.S @@ -0,0 +1,463 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" + +/********************************************************************* +* 2023/08/23 guxiwei +* UTEST : OK +* CTEST : OK +* TEST : OK +*********************************************************************/ + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define S8 $r19 +#define S9 $r20 +#define S10 $r23 +#define S11 $r24 +#define S12 $r25 +#define S13 $r26 +#define S14 $r27 +#define S15 $r28 +#define S16 $r29 +#define TD $r30 +#define TS $r31 +#define TL $r7 +#define T0 $r6 +#undef ZERO +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 +/* LASX vectors */ +#define U0 $xr0 +#define U1 $xr1 +#define U2 $xr2 +#define U3 $xr3 +#define U4 $xr4 +#define U5 $xr5 +#define U6 $xr6 +#define U7 $xr7 +#define U8 $xr8 +#define U9 $xr9 +#define U10 $xr10 +#define U11 $xr11 +#define U12 $xr12 +#define U13 $xr13 +#define U14 $xr14 +#define U15 $xr15 +#define D0 $xr16 +#define D1 $xr17 +#define D2 $xr18 +#define D3 $xr19 +#define D4 $xr20 +#define D5 $xr21 +#define D6 $xr22 +#define D7 $xr23 +#define D8 $xr24 +#define D9 $xr25 +#define D10 $xr26 +#define D11 $xr27 +#define D12 $xr28 +#define D13 $xr29 +#define D14 $xr30 +#define D15 $xr31 + +// Loops outline +//.L_N16 <------------------- +//| .L_M8: | +//| .L_M7: | Main Loop +//| .L_M1: | +//| .L_M0: --------------- +//.L_N15: +//.L_N8: +//| .L_N8_M8: +//| .L_N8_M7: +//| .L_N8_M1: +//.L_N7: +//.L_N4: +//| .L_N4_M4: +//| .L_N4_M3: +//| .L_N4_M1: +//.L_N3: +//.L_N2: +//| .L_N2_M2: +//| .L_N2_M1: +//.L_N1: +//| .L_N1_M1: +//.L_N0 + + PROLOGUE + push_if_used 26, 32 + + move TD, DST + move TS, SRC + PTR_SLLI TL, LDA, 0x02 + PTR_SLLI T0, TL, 0x01 + PTR_SRAI J, N, 0x04 + beq J, ZERO, .L_N15 +.align 5 +.L_N16: + move S1, TS + PTR_ADD S2, TS, TL + PTR_SRAI I, M, 0x03 + PTR_ADD S3, S2, TL + PTR_ADDI J, J, -1 + PTR_ADD S4, S3, TL + PTR_ADD S5, S3, T0 + PTR_ADD S6, S4, T0 + PTR_ADD S7, S5, T0 + PTR_ADD S8, S6, T0 + PTR_ADD S9, S7, T0 + PTR_ADD S10, S8, T0 + PTR_ADD S11, S9, T0 + PTR_ADD S12, S10, T0 + PTR_ADD S13, S11, T0 + PTR_ADD S14, S12, T0 + PTR_ADD S15, S13, T0 + PTR_ADD S16, S14, T0 + PTR_ADD TS, S15, T0 + beq I, ZERO, .L_M7 +.align 5 +.L_M8: + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + xvld U4, S5, 0x00 + xvld U5, S6, 0x00 + xvld U6, S7, 0x00 + xvld U7, S8, 0x00 + xvld U8, S9, 0x00 + xvld U9, S10, 0x00 + xvld U10, S11, 0x00 + xvld U11, S12, 0x00 + xvld U12, S13, 0x00 + xvld U13, S14, 0x00 + xvld U14, S15, 0x00 + xvld U15, S16, 0x00 + + GTRANSPOSE8x8_W D0, D2, D4, D6, D8, D10, 
D12, D14, \ + U0, U1, U2, U3, U4, U5, U6, U7, \ + D1, D3, D5, D7 // As tmp + GTRANSPOSE8x8_W D1, D3, D5, D7, D9, D11, D13, D15, \ + U8, U9, U10, U11, U12, U13, U14, U15, \ + U0, U1, U2, U3 // As tmp + GST xv, , D0, TD, 0x00, D1, TD, 0x20, D2, TD, 0x40, D3, TD, 0x60, \ + D4, TD, 0x80, D5, TD, 0xA0, D6, TD, 0xC0, D7, TD, 0xE0 + PTR_ADDI TD, TD, 0x100 + GST xv, , D8, TD, 0x00, D9, TD, 0x20, D10, TD, 0x40, D11, TD, 0x60, \ + D12, TD, 0x80, D13, TD, 0xA0, D14, TD, 0xC0, D15, TD, 0xE0 + PTR_ADDI TD, TD, 0x100 + PTR_ADDI S1, S1, 0x20 + PTR_ADDI S2, S2, 0x20 + PTR_ADDI S3, S3, 0x20 + PTR_ADDI S4, S4, 0x20 + PTR_ADDI S5, S5, 0x20 + PTR_ADDI S6, S6, 0x20 + PTR_ADDI S7, S7, 0x20 + PTR_ADDI S8, S8, 0x20 + PTR_ADDI S9, S9, 0x20 + PTR_ADDI S10, S10, 0x20 + PTR_ADDI S11, S11, 0x20 + PTR_ADDI S12, S12, 0x20 + PTR_ADDI S13, S13, 0x20 + PTR_ADDI S14, S14, 0x20 + PTR_ADDI S15, S15, 0x20 + PTR_ADDI S16, S16, 0x20 + + PTR_ADDI I, I, -1 + blt ZERO, I, .L_M8 +.L_M7: + andi I, M, 0x07 + beq I, ZERO, .L_M0 +.align 5 +.L_M1: + fld.s F0, S1, 0x00 + fld.s F1, S2, 0x00 + fld.s F2, S3, 0x00 + fld.s F3, S4, 0x00 + fld.s F4, S5, 0x00 + fld.s F5, S6, 0x00 + fld.s F6, S7, 0x00 + fld.s F7, S8, 0x00 + + fst.s F0, TD, 0x00 + fst.s F1, TD, 0x04 + fst.s F2, TD, 0x08 + fst.s F3, TD, 0x0C + fst.s F4, TD, 0x10 + fst.s F5, TD, 0x14 + fst.s F6, TD, 0x18 + fst.s F7, TD, 0x1C + + PTR_ADDI S1, S1, 0x04 + PTR_ADDI S2, S2, 0x04 + PTR_ADDI S3, S3, 0x04 + PTR_ADDI S4, S4, 0x04 + PTR_ADDI S5, S5, 0x04 + PTR_ADDI S6, S6, 0x04 + PTR_ADDI S7, S7, 0x04 + PTR_ADDI S8, S8, 0x04 + PTR_ADDI TD, TD, 0x20 + + fld.s F0, S9, 0x00 + fld.s F1, S10, 0x00 + fld.s F2, S11, 0x00 + fld.s F3, S12, 0x00 + fld.s F4, S13, 0x00 + fld.s F5, S14, 0x00 + fld.s F6, S15, 0x00 + fld.s F7, S16, 0x00 + + fst.s F0, TD, 0x00 + fst.s F1, TD, 0x04 + fst.s F2, TD, 0x08 + fst.s F3, TD, 0x0C + fst.s F4, TD, 0x10 + fst.s F5, TD, 0x14 + fst.s F6, TD, 0x18 + fst.s F7, TD, 0x1C + + PTR_ADDI S9, S9, 0x04 + PTR_ADDI S10, S10, 0x04 + PTR_ADDI S11, S11, 0x04 + PTR_ADDI S12, S12, 0x04 + PTR_ADDI S13, S13, 0x04 + PTR_ADDI S14, S14, 0x04 + PTR_ADDI S15, S15, 0x04 + PTR_ADDI S16, S16, 0x04 + PTR_ADDI TD, TD, 0x20 + + PTR_ADDI I, I, -1 + blt ZERO, I, .L_M1 +.L_M0: + blt ZERO, J, .L_N16 +.L_N15: + andi J, N, 0x0f + beq ZERO, J, .L_N0 + + andi J, N, 0x08 + beq ZERO, J, .L_N7 +.L_N8: + move S1, TS + PTR_ADD S2, TS, TL + PTR_SRAI I, M, 0x03 + PTR_ADD S3, S2, TL + PTR_ADD S4, S2, T0 + PTR_ADD S5, S3, T0 + PTR_ADD S6, S4, T0 + PTR_ADD S7, S5, T0 + PTR_ADD S8, S6, T0 + PTR_ADD TS, S7, T0 + beq I, ZERO, .L_N8_M7 +.align 5 +.L_N8_M8: + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + xvld U4, S5, 0x00 + xvld U5, S6, 0x00 + xvld U6, S7, 0x00 + xvld U7, S8, 0x00 + + GTRANSPOSE8x8_W D0, D2, D4, D6, D8, D10, D12, D14, \ + U0, U1, U2, U3, U4, U5, U6, U7, \ + D1, D3, D5, D7 // As tmp + GST xv, , D0, TD, 0x00, D2, TD, 0x20, D4, TD, 0x40, D6, TD, 0x60, \ + D8, TD, 0x80, D10, TD, 0xA0, D12, TD, 0xC0, D14, TD, 0xE0 + PTR_ADDI TD, TD, 0x100 + PTR_ADDI S1, S1, 0x20 + PTR_ADDI S2, S2, 0x20 + PTR_ADDI S3, S3, 0x20 + PTR_ADDI S4, S4, 0x20 + PTR_ADDI S5, S5, 0x20 + PTR_ADDI S6, S6, 0x20 + PTR_ADDI S7, S7, 0x20 + PTR_ADDI S8, S8, 0x20 + + PTR_ADDI I, I, -1 + blt ZERO, I, .L_N8_M8 +.L_N8_M7: + andi I, M, 0x07 + beq I, ZERO, .L_N7 +.align 5 +.L_N8_M1: + fld.s F0, S1, 0x00 + fld.s F1, S2, 0x00 + fld.s F2, S3, 0x00 + fld.s F3, S4, 0x00 + fld.s F4, S5, 0x00 + fld.s F5, S6, 0x00 + fld.s F6, S7, 0x00 + fld.s F7, S8, 0x00 + + fst.s F0, TD, 0x00 + PTR_ADDI S1, S1, 0x04 + fst.s F1, TD, 0x04 + 
PTR_ADDI S2, S2, 0x04 + fst.s F2, TD, 0x08 + PTR_ADDI S3, S3, 0x04 + fst.s F3, TD, 0x0C + PTR_ADDI S4, S4, 0x04 + fst.s F4, TD, 0x10 + PTR_ADDI S5, S5, 0x04 + fst.s F5, TD, 0x14 + PTR_ADDI S6, S6, 0x04 + fst.s F6, TD, 0x18 + PTR_ADDI S7, S7, 0x04 + fst.s F7, TD, 0x1C + PTR_ADDI S8, S8, 0x04 + + PTR_ADDI TD, TD, 0x20 + PTR_ADDI I, I, -1 + blt ZERO, I, .L_N8_M1 +.L_N7: + andi J, N, 0x07 + beq ZERO, J, .L_N0 + + andi J, N, 0x04 + beq ZERO, J, .L_N3 +.L_N4: + move S1, TS + PTR_ADD S2, TS, TL + PTR_SRAI I, M, 0x02 + PTR_ADD S3, S2, TL + PTR_ADD S4, S2, T0 + PTR_ADD TS, S3, T0 + beq I, ZERO, .L_N4_M3 +.align 5 +.L_N4_M4: + GLD v, , $vr0, S1, 0, $vr1, S2, 0, $vr2, S3, 0, $vr3, S4, 0 + GSBUTTERFLY v, w, $vr4, $vr5, $vr2, $vr0 + GSBUTTERFLY v, w, $vr6, $vr7, $vr3, $vr1 + GSBUTTERFLY v, w, $vr0, $vr1, $vr6, $vr4 + GSBUTTERFLY v, w, $vr2, $vr3, $vr7, $vr5 + GST v, , $vr0, TD, 0x00, $vr1, TD, 0x10, $vr2, TD, 0x20, $vr3, TD, 0x30 + PTR_ADDI S1, S1, 0x10 + PTR_ADDI S2, S2, 0x10 + PTR_ADDI S3, S3, 0x10 + PTR_ADDI S4, S4, 0x10 + PTR_ADDI TD, TD, 0x40 + PTR_ADDI I, I, -1 + blt ZERO, I, .L_N4_M4 +.L_N4_M3: + andi I, M, 0x03 + beq I, ZERO, .L_N3 +.align 5 +.L_N4_M1: + fld.s F0, S1, 0x00 + fld.s F1, S2, 0x00 + fld.s F2, S3, 0x00 + fld.s F3, S4, 0x00 + + fst.s F0, TD, 0x00 + PTR_ADDI S1, S1, 0x04 + fst.s F1, TD, 0x04 + PTR_ADDI S2, S2, 0x04 + fst.s F2, TD, 0x08 + PTR_ADDI S3, S3, 0x04 + fst.s F3, TD, 0x0C + PTR_ADDI S4, S4, 0x04 + + PTR_ADDI TD, TD, 0x10 + PTR_ADDI I, I, -1 + blt ZERO, I, .L_N4_M1 +.L_N3: + andi J, N, 0x03 + beq ZERO, J, .L_N0 + + andi J, N, 0x02 + beq ZERO, J, .L_N1 +.L_N2: + move S1, TS + PTR_ADD S2, TS, TL + PTR_SRAI I, M, 0x01 + PTR_ADD TS, S2, TL + beq I, ZERO, .L_N2_M1 +.align 5 +.L_N2_M2: + GLD f, d, F0, S1, 0x00, F1, S2, 0x00 + vilvl.w $vr0, $vr1, $vr0 + GST v, , $vr0, TD, 0x00 + PTR_ADDI S1, S1, 0x08 + PTR_ADDI S2, S2, 0x08 + PTR_ADDI TD, TD, 0x10 + + PTR_ADDI I, I, -1 + blt ZERO, I, .L_N2_M2 +.L_N2_M1: + andi I, M, 0x01 + beq I, ZERO, .L_N1 + + fld.s F0, S1, 0x00 + fld.s F1, S2, 0x00 + + fst.s F0, TD, 0x00 + PTR_ADDI S1, S1, 0x04 + fst.s F1, TD, 0x04 + PTR_ADDI S2, S2, 0x04 + PTR_ADDI TD, TD, 0x08 +.align 5 +.L_N1: + move S1, TS + beq ZERO, M, .L_N0 +.L_N1_M1: + fld.s F0, S1, 0x00 + PTR_ADDI S1, S1, 0x04 + fst.s F0, TD, 0x00 + PTR_ADDI TD, TD, 0x04 + PTR_ADDI M, M, -1 + blt ZERO, M, .L_N1_M1 +.L_N0: + pop_if_used 26, 32 + jirl $r0, $r1, 0x0 + EPILOGUE diff --git a/kernel/loongarch64/sgemm_ncopy_8_lasx.S b/kernel/loongarch64/sgemm_ncopy_8_lasx.S new file mode 100644 index 000000000..5c173568b --- /dev/null +++ b/kernel/loongarch64/sgemm_ncopy_8_lasx.S @@ -0,0 +1,298 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
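The scalar tails of the ncopy kernels above (.L_M1, .L_N8_M1, .L_N4_M1) all follow the same pattern: take one float from each remaining source column, store the group contiguously, then advance every column pointer by one element; the main loops do the same thing eight rows at a time through the GTRANSPOSE8x8_W tile transpose. A minimal C sketch of that packing order, with illustrative names (cols, m_tail, dst are assumptions, not the kernel's actual interface):

#include <stddef.h>

/* Pack the leftover rows of an 8-column panel: one element per column,
 * written back to back, mirroring the fld.s/fst.s pairs in .L_M1. */
static void ncopy8_scalar_tail(size_t m_tail, const float *cols[8], float *dst)
{
    for (size_t i = 0; i < m_tail; i++) {
        for (int c = 0; c < 8; c++)
            dst[c] = cols[c][i];
        dst += 8;
    }
}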
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" + +/********************************************************************* +* 2023/08/23 guxiwei +* UTEST : OK +* CTEST : OK +* TEST : OK +*********************************************************************/ + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define S8 $r19 +#define TD $r20 +#define TS $r11 +#define TL $r7 +#define T0 $r6 +#undef ZERO +#define ZERO $r0 + +#define F0 $f0 +#define F1 $f1 +#define F2 $f2 +#define F3 $f3 +#define F4 $f4 +#define F5 $f5 +#define F6 $f6 +#define F7 $f7 +/* LASX vectors */ +#define U0 $xr0 +#define U1 $xr1 +#define U2 $xr2 +#define U3 $xr3 +#define U4 $xr4 +#define U5 $xr5 +#define U6 $xr6 +#define U7 $xr7 +#define D0 $xr8 +#define D1 $xr9 +#define D2 $xr10 +#define D3 $xr11 +#define D4 $xr12 +#define D5 $xr13 +#define D6 $xr14 +#define D7 $xr15 +#define D8 $xr16 +#define D10 $xr17 +#define D12 $xr18 +#define D14 $xr19 + +// Loops outline +//.L_N8: <---------------- +//| .L_M8: | +//| .L_M7: | Main Loop +//| .L_M1: | +//| .L_M0:-------------- +//.L_N7: +//.L_N4: +//| .L_N4_M4: +//| .L_N4_M3: +//| .L_N4_M1: +//.L_N3: +//.L_N2: +//| .L_N2_M2: +//| .L_N2_M1: +//.L_N1: +//| .L_N1_M1: +//.L_N0 + + PROLOGUE + push_if_used 17, 20 + + move TD, DST + move TS, SRC + PTR_SLLI TL, LDA, 0x02 + PTR_SLLI T0, TL, 0x01 + PTR_SRAI J, N, 0x03 + beq J, ZERO, .L_N7 +.align 5 +.L_N8: + move S1, TS + PTR_ADD S2, TS, TL + PTR_SRAI I, M, 0x03 + PTR_ADD S3, S2, TL + PTR_ADDI J, J, -1 + PTR_ADD S4, S2, T0 + PTR_ADD S5, S3, T0 + PTR_ADD S6, S4, T0 + PTR_ADD S7, S5, T0 + PTR_ADD S8, S6, T0 + PTR_ADD TS, S7, T0 + beq I, ZERO, .L_M7 +.align 5 +.L_M8: + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + xvld U4, S5, 0x00 + xvld U5, S6, 0x00 + xvld U6, S7, 0x00 + xvld U7, S8, 0x00 + + GTRANSPOSE8x8_W D0, D2, D4, D6, D8, D10, D12, D14, \ + U0, U1, U2, U3, U4, U5, U6, U7, \ + D1, D3, D5, D7 // As tmp + GST xv, , D0, TD, 0x00, D2, TD, 0x20, D4, TD, 0x40, D6, TD, 0x60, \ + D8, TD, 0x80, D10, TD, 0xA0, D12, TD, 0xC0, D14, TD, 0xE0 + PTR_ADDI TD, TD, 0x100 + PTR_ADDI S1, S1, 0x20 + PTR_ADDI S2, S2, 0x20 + PTR_ADDI S3, S3, 0x20 + PTR_ADDI S4, S4, 0x20 + PTR_ADDI S5, S5, 0x20 + PTR_ADDI S6, S6, 0x20 + PTR_ADDI S7, S7, 0x20 + PTR_ADDI S8, S8, 0x20 + PTR_ADDI I, I, -1 + blt ZERO, I, .L_M8 +.L_M7: + andi I, M, 0x07 + beq I, ZERO, .L_M0 +.align 5 +.L_M1: + fld.s F0, S1, 0x00 + 
fld.s F1, S2, 0x00 + fld.s F2, S3, 0x00 + fld.s F3, S4, 0x00 + fld.s F4, S5, 0x00 + fld.s F5, S6, 0x00 + fld.s F6, S7, 0x00 + fld.s F7, S8, 0x00 + + fst.s F0, TD, 0x00 + PTR_ADDI S1, S1, 0x04 + fst.s F1, TD, 0x04 + PTR_ADDI S2, S2, 0x04 + fst.s F2, TD, 0x08 + PTR_ADDI S3, S3, 0x04 + fst.s F3, TD, 0x0C + PTR_ADDI S4, S4, 0x04 + fst.s F4, TD, 0x10 + PTR_ADDI S5, S5, 0x04 + fst.s F5, TD, 0x14 + PTR_ADDI S6, S6, 0x04 + fst.s F6, TD, 0x18 + PTR_ADDI S7, S7, 0x04 + fst.s F7, TD, 0x1C + PTR_ADDI S8, S8, 0x04 + + PTR_ADDI TD, TD, 0x20 + PTR_ADDI I, I, -1 + blt ZERO, I, .L_M1 +.L_M0: + blt ZERO, J, .L_N8 +.L_N7: + andi J, N, 0x07 + beq ZERO, J, .L_N0 + + andi J, N, 0x04 + beq ZERO, J, .L_N3 +.L_N4: + move S1, TS + PTR_ADD S2, TS, TL + PTR_SRAI I, M, 0x02 + PTR_ADD S3, S2, TL + PTR_ADD S4, S2, T0 + PTR_ADD TS, S3, T0 + beq I, ZERO, .L_N4_M3 +.align 5 +.L_N4_M4: + GLD v, , $vr0, S1, 0, $vr1, S2, 0, $vr2, S3, 0, $vr3, S4, 0 + GSBUTTERFLY v, w, $vr4, $vr5, $vr2, $vr0 + GSBUTTERFLY v, w, $vr6, $vr7, $vr3, $vr1 + GSBUTTERFLY v, w, $vr0, $vr1, $vr6, $vr4 + GSBUTTERFLY v, w, $vr2, $vr3, $vr7, $vr5 + GST v, , $vr0, TD, 0x00, $vr1, TD, 0x10, $vr2, TD, 0x20, $vr3, TD, 0x30 + PTR_ADDI S1, S1, 0x10 + PTR_ADDI S2, S2, 0x10 + PTR_ADDI S3, S3, 0x10 + PTR_ADDI S4, S4, 0x10 + PTR_ADDI TD, TD, 0x40 + PTR_ADDI I, I, -1 + blt ZERO, I, .L_N4_M4 +.L_N4_M3: + andi I, M, 0x03 + beq I, ZERO, .L_N3 +.align 5 +.L_N4_M1: + fld.s F0, S1, 0x00 + fld.s F1, S2, 0x00 + fld.s F2, S3, 0x00 + fld.s F3, S4, 0x00 + + fst.s F0, TD, 0x00 + PTR_ADDI S1, S1, 0x04 + fst.s F1, TD, 0x04 + PTR_ADDI S2, S2, 0x04 + fst.s F2, TD, 0x08 + PTR_ADDI S3, S3, 0x04 + fst.s F3, TD, 0x0C + PTR_ADDI S4, S4, 0x04 + + PTR_ADDI TD, TD, 0x10 + PTR_ADDI I, I, -1 + blt ZERO, I, .L_N4_M1 +.L_N3: + andi J, N, 0x03 + beq ZERO, J, .L_N0 + + andi J, N, 0x02 + beq ZERO, J, .L_N1 +.L_N2: + move S1, TS + PTR_ADD S2, TS, TL + PTR_SRAI I, M, 0x01 + PTR_ADD TS, S2, TL + beq I, ZERO, .L_N2_M1 +.align 5 +.L_N2_M2: + GLD f, d, F0, S1, 0x00, F1, S2, 0x00 + vilvl.w $vr0, $vr1, $vr0 + GST v, , $vr0, TD, 0x00 + PTR_ADDI S1, S1, 0x08 + PTR_ADDI S2, S2, 0x08 + PTR_ADDI TD, TD, 0x10 + + PTR_ADDI I, I, -1 + blt ZERO, I, .L_N2_M2 +.L_N2_M1: + andi I, M, 0x01 + beq I, ZERO, .L_N1 + + fld.s F0, S1, 0x00 + fld.s F1, S2, 0x00 + + fst.s F0, TD, 0x00 + PTR_ADDI S1, S1, 0x04 + fst.s F1, TD, 0x04 + PTR_ADDI S2, S2, 0x04 + PTR_ADDI TD, TD, 0x08 +.align 5 +.L_N1: + move S1, TS + beq ZERO, M, .L_N0 +.L_N1_M1: + fld.s F0, S1, 0x00 + PTR_ADDI S1, S1, 0x04 + fst.s F0, TD, 0x00 + PTR_ADDI TD, TD, 0x04 + PTR_ADDI M, M, -1 + blt ZERO, M, .L_N1_M1 +.L_N0: + pop_if_used 17, 20 + jirl $r0, $r1, 0x0 + EPILOGUE diff --git a/kernel/loongarch64/sgemm_tcopy_16_lasx.S b/kernel/loongarch64/sgemm_tcopy_16_lasx.S new file mode 100644 index 000000000..d9789bdcd --- /dev/null +++ b/kernel/loongarch64/sgemm_tcopy_16_lasx.S @@ -0,0 +1,526 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" + +/********************************************************************* +* 2023/08/23 guxiwei +* UTEST : OK +* CTEST : OK +* TEST : OK +*********************************************************************/ + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S0 $r11 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define S8 $r19 +#define P0 $r20 +#define P1 $r23 +#define P2 $r24 +#define P3 $r25 +#define P4 $r26 +#define P5 $r27 +#define T0 $r28 +#define T1 $r29 +#define TL $r7 +#define ZERO $r0 + +/* LASX vectors */ +#define U0 $xr0 +#define U1 $xr1 +#define U2 $xr2 +#define U3 $xr3 +#define U4 $xr4 +#define U5 $xr5 +#define U6 $xr6 +#define U7 $xr7 + +// Loops outline +//.L_M8 <------------------- +//| .L_N16: | +//| .L_N15: | +//| .L_N8: | +//| .L_N7: | Main Loop +//| .L_N4: | +//| .L_N3: | +//| .L_N2: | +//| .L_N1: | +//| .L_N0: --------------- +//.L_M7 +//.L_M4 +//| .L_M4_N16: +//| .L_M4_N15: +//| .L_M4_N8: +//| .L_M4_N7: +//| .L_M4_N4: +//| .L_M4_N3: +//| .L_M4_N2: +//| .L_M4_N1: +//.L_M3 +//.L_M2 +//| .L_M2_N16: +//| .L_M2_N15: +//| .L_M2_N8: +//| .L_M2_N7: +//| .L_M2_N4: +//| .L_M2_N3: +//| .L_M2_N2: +//| .L_M2_N1: +//.L_M1 +//| .L_M1_N16: +//| .L_M1_N15: +//| .L_M1_N8: +//| .L_M1_N7: +//| .L_M1_N4: +//| .L_M1_N3: +//| .L_M1_N2: +//| .L_M1_N1: +//.L_M0 + + PROLOGUE + push_if_used 24, 8 + + move S0, SRC + move P0, DST + + PTR_SRAI T0, N, 0x04 + PTR_SRAI T1, N, 0x03 + PTR_SLLI T0, T0, 0x04 + PTR_SLLI T1, T1, 0x03 + + PTR_MUL P2, M, T0 + PTR_MUL P3, M, T1 + PTR_SLLI P2, P2, 0x02 + PTR_SLLI P3, P3, 0x02 + PTR_ADD P2, DST, P2 + PTR_ADD P3, DST, P3 + + PTR_SRAI T0, N, 0x02 + PTR_SRAI T1, N, 0x01 + PTR_SLLI T0, T0, 0x02 + PTR_SLLI T1, T1, 0x01 + PTR_MUL P4, M, T0 + PTR_MUL P5, M, T1 + PTR_SLLI P4, P4, 0x02 + PTR_SLLI P5, P5, 0x02 + PTR_ADD P4, DST, P4 + PTR_ADD P5, DST, P5 + + PTR_SLLI TL, LDA, 0x02 + PTR_SRAI J, M, 0x03 + PTR_SLLI T0, TL, 0x01 + PTR_SLLI T1, M, 0x06 + beq ZERO, J, .L_M7 +.align 5 +.L_M8: + move S1, S0 + PTR_ADD S2, S0, TL + PTR_ADD S3, S1, T0 + PTR_ADD S4, S2, T0 + PTR_ADD S5, S3, T0 + PTR_ADD S6, S4, T0 + PTR_ADD S7, S5, T0 + PTR_ADD S8, S6, T0 + PTR_ADD S0, S7, T0 + + move P1, P0 + PTR_ADDI P0, P0, 0x200 + + PTR_SRAI I, N, 0x04 + 
PTR_ADDI J, J, -1 + beq ZERO, I, .L_N15 +.L_N16: + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S2, 0x00 + xvld U3, S2, 0x20 + + xvst U0, P1, 0x00 + xvst U1, P1, 0x20 + xvst U2, P1, 0x40 + xvst U3, P1, 0x60 + + xvld U4, S3, 0x00 + xvld U5, S3, 0x20 + xvld U6, S4, 0x00 + xvld U7, S4, 0x20 + + xvst U4, P1, 0x80 + xvst U5, P1, 0xA0 + xvst U6, P1, 0xC0 + xvst U7, P1, 0xE0 + + xvld U0, S5, 0x00 + xvld U1, S5, 0x20 + xvld U2, S6, 0x00 + xvld U3, S6, 0x20 + + xvst U0, P1, 0x100 + xvst U1, P1, 0x120 + xvst U2, P1, 0x140 + xvst U3, P1, 0x160 + + xvld U4, S7, 0x00 + xvld U5, S7, 0x20 + xvld U6, S8, 0x00 + xvld U7, S8, 0x20 + + xvst U4, P1, 0x180 + xvst U5, P1, 0x1A0 + xvst U6, P1, 0x1C0 + xvst U7, P1, 0x1E0 + + PTR_ADDI S1, S1, 0x40 + PTR_ADDI S2, S2, 0x40 + PTR_ADDI S3, S3, 0x40 + PTR_ADDI S4, S4, 0x40 + PTR_ADDI S5, S5, 0x40 + PTR_ADDI S6, S6, 0x40 + PTR_ADDI S7, S7, 0x40 + PTR_ADDI S8, S8, 0x40 + + PTR_ADDI I, I, -1 + PTR_ADD P1, P1, T1 + blt ZERO, I, .L_N16 +.L_N15: + andi I, N, 0x08 + beq ZERO, I, .L_N7 +.L_N8: + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + xvld U4, S5, 0x00 + xvld U5, S6, 0x00 + xvld U6, S7, 0x00 + xvld U7, S8, 0x00 + + GST xv, , U0, P2, 0x00, U1, P2, 0x20, U2, P2, 0x40, U3, P2, 0x60, \ + U4, P2, 0x80, U5, P2, 0xA0, U6, P2, 0xC0, U7, P2, 0xE0 + + PTR_ADDI S1, S1, 0x20 + PTR_ADDI S2, S2, 0x20 + PTR_ADDI S3, S3, 0x20 + PTR_ADDI S4, S4, 0x20 + PTR_ADDI S5, S5, 0x20 + PTR_ADDI S6, S6, 0x20 + PTR_ADDI S7, S7, 0x20 + PTR_ADDI S8, S8, 0x20 + PTR_ADDI P2, P2, 0x100 +.L_N7: + andi I, N, 0x04 + beq ZERO, I, .L_N3 +.L_N4: + GLD v, , $vr0, S1, 0x00, $vr1, S2, 0x00, $vr2, S3, 0x00, $vr3, S4, 0x00, \ + $vr4, S5, 0x00, $vr5, S6, 0x00, $vr6, S7, 0x00, $vr7, S8, 0x00 + GST v, , $vr0, P3, 0x00, $vr1, P3, 0x10, $vr2, P3, 0x20, $vr3, P3, 0x30, \ + $vr4, P3, 0x40, $vr5, P3, 0x50, $vr6, P3, 0x60, $vr7, P3, 0x70 + PTR_ADDI S1, S1, 0x10 + PTR_ADDI S2, S2, 0x10 + PTR_ADDI S3, S3, 0x10 + PTR_ADDI S4, S4, 0x10 + PTR_ADDI S5, S5, 0x10 + PTR_ADDI S6, S6, 0x10 + PTR_ADDI S7, S7, 0x10 + PTR_ADDI S8, S8, 0x10 + PTR_ADDI P3, P3, 0x80 +.L_N3: + andi I, N, 0x02 + beq ZERO, I, .L_N1 +.L_N2: + GLD f, d, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00, \ + $f4, S5, 0x00, $f5, S6, 0x00, $f6, S7, 0x00, $f7, S8, 0x00 + GST f, d, $f0, P4, 0x00, $f1, P4, 0x08, $f2, P4, 0x10, $f3, P4, 0x18, \ + $f4, P4, 0x20, $f5, P4, 0x28, $f6, P4, 0x30, $f7, P4, 0x38 + PTR_ADDI S1, S1, 0x08 + PTR_ADDI S2, S2, 0x08 + PTR_ADDI S3, S3, 0x08 + PTR_ADDI S4, S4, 0x08 + PTR_ADDI S5, S5, 0x08 + PTR_ADDI S6, S6, 0x08 + PTR_ADDI S7, S7, 0x08 + PTR_ADDI S8, S8, 0x08 + PTR_ADDI P4, P4, 0x40 +.L_N1: + andi I, N, 0x01 + beq ZERO, I, .L_N0 + + GLD f, s, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00, \ + $f4, S5, 0x00, $f5, S6, 0x00, $f6, S7, 0x00, $f7, S8, 0x00 + GST f, s, $f0, P5, 0x00, $f1, P5, 0x04, $f2, P5, 0x08, $f3, P5, 0x0C, \ + $f4, P5, 0x10, $f5, P5, 0x14, $f6, P5, 0x18, $f7, P5, 0x1C + PTR_ADDI S1, S1, 0x04 + PTR_ADDI S2, S2, 0x04 + PTR_ADDI S3, S3, 0x04 + PTR_ADDI S4, S4, 0x04 + PTR_ADDI S5, S5, 0x04 + PTR_ADDI S6, S6, 0x04 + PTR_ADDI S7, S7, 0x04 + PTR_ADDI S8, S8, 0x04 + PTR_ADDI P5, P5, 0x20 +.L_N0: + blt ZERO, J, .L_M8 +.L_M7: + andi J, M, 0x04 + beq ZERO, J, .L_M3 +.L_M4: + move S1, S0 + PTR_ADD S2, S0, TL + PTR_ADD S3, S1, T0 + PTR_ADD S4, S2, T0 + PTR_ADD S0, S3, T0 + + move P1, P0 + PTR_ADDI P0, P0, 0x100 + + PTR_SRAI I, N, 0x04 + beq ZERO, I, .L_M4_N15 +.align 5 +.L_M4_N16: + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S2, 0x00 + xvld U3, S2, 0x20 + + xvst U0, 
P1, 0x00 + xvst U1, P1, 0x20 + xvst U2, P1, 0x40 + xvst U3, P1, 0x60 + + xvld U4, S3, 0x00 + xvld U5, S3, 0x20 + xvld U6, S4, 0x00 + xvld U7, S4, 0x20 + + xvst U4, P1, 0x80 + xvst U5, P1, 0xA0 + xvst U6, P1, 0xC0 + xvst U7, P1, 0xE0 + + PTR_ADDI S1, S1, 0x40 + PTR_ADDI S2, S2, 0x40 + PTR_ADDI S3, S3, 0x40 + PTR_ADDI S4, S4, 0x40 + PTR_ADDI I, I, -1 + PTR_ADD P1, P1, T1 + blt ZERO, I, .L_M4_N16 +.L_M4_N15: + andi I, N, 0x08 + beq ZERO, I, .L_M4_N7 +.L_M4_N8: + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + + GST xv, , U0, P2, 0x00, U1, P2, 0x20, U2, P2, 0x40, U3, P2, 0x60 + + PTR_ADDI S1, S1, 0x20 + PTR_ADDI S2, S2, 0x20 + PTR_ADDI S3, S3, 0x20 + PTR_ADDI S4, S4, 0x20 + PTR_ADDI P2, P2, 0x80 +.L_M4_N7: + andi I, N, 0x04 + beq ZERO, I, .L_M4_N3 +.L_M4_N4: + GLD v, , $vr0, S1, 0x00, $vr1, S2, 0x00, $vr2, S3, 0x00, $vr3, S4, 0x00 + GST v, , $vr0, P3, 0x00, $vr1, P3, 0x10, $vr2, P3, 0x20, $vr3, P3, 0x30 + PTR_ADDI S1, S1, 0x10 + PTR_ADDI S2, S2, 0x10 + PTR_ADDI S3, S3, 0x10 + PTR_ADDI S4, S4, 0x10 + PTR_ADDI P3, P3, 0x40 +.L_M4_N3: + andi I, N, 0x02 + beq ZERO, I, .L_M4_N1 +.L_M4_N2: + GLD f, d, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00 + GST f, d, $f0, P4, 0x00, $f1, P4, 0x08, $f2, P4, 0x10, $f3, P4, 0x18 + PTR_ADDI S1, S1, 0x08 + PTR_ADDI S2, S2, 0x08 + PTR_ADDI S3, S3, 0x08 + PTR_ADDI S4, S4, 0x08 + PTR_ADDI P4, P4, 0x20 +.L_M4_N1: + andi I, N, 0x01 + beq ZERO, I, .L_M3 + + GLD f, s, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00 + GST f, s, $f0, P5, 0x00, $f1, P5, 0x04, $f2, P5, 0x08, $f3, P5, 0x0C + PTR_ADDI S1, S1, 0x04 + PTR_ADDI S2, S2, 0x04 + PTR_ADDI S3, S3, 0x04 + PTR_ADDI S4, S4, 0x04 + PTR_ADDI P5, P5, 0x10 +.L_M3: + andi J, M, 0x02 + beq ZERO, J, .L_M1 +.L_M2: + move S1, S0 + PTR_ADD S2, S0, TL + PTR_ADD S0, S0, T0 + + move P1, P0 + PTR_ADDI P0, P0, 0x80 + + PTR_SRAI I, N, 0x04 + beq ZERO, I, .L_M2_N15 +.align 5 +.L_M2_N16: + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + xvld U2, S2, 0x00 + xvld U3, S2, 0x20 + + xvst U0, P1, 0x00 + xvst U1, P1, 0x20 + xvst U2, P1, 0x40 + xvst U3, P1, 0x60 + + PTR_ADDI S1, S1, 0x40 + PTR_ADDI S2, S2, 0x40 + PTR_ADDI I, I, -1 + PTR_ADD P1, P1, T1 + blt ZERO, I, .L_M2_N16 +.L_M2_N15: + andi I, N, 0x08 + beq ZERO, I, .L_M2_N7 +.L_M2_N8: + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + + GST xv, , U0, P2, 0x00, U1, P2, 0x20 + + PTR_ADDI S1, S1, 0x20 + PTR_ADDI S2, S2, 0x20 + PTR_ADDI P2, P2, 0x40 +.L_M2_N7: + andi I, N, 0x04 + beq ZERO, I, .L_M2_N3 +.L_M2_N4: + GLD v, , $vr0, S1, 0x00, $vr1, S2, 0x00 + GST v, , $vr0, P3, 0x00, $vr1, P3, 0x10 + PTR_ADDI S1, S1, 0x10 + PTR_ADDI S2, S2, 0x10 + PTR_ADDI P3, P3, 0x20 +.L_M2_N3: + andi I, N, 0x02 + beq ZERO, I, .L_M2_N1 +.L_M2_N2: + GLD f, d, $f0, S1, 0x00, $f1, S2, 0x00 + GST f, d, $f0, P4, 0x00, $f1, P4, 0x08 + PTR_ADDI S1, S1, 0x08 + PTR_ADDI S2, S2, 0x08 + PTR_ADDI P4, P4, 0x10 +.L_M2_N1: + andi I, N, 0x01 + beq ZERO, I, .L_M1 + + GLD f, s, $f0, S1, 0x00, $f1, S2, 0x00 + GST f, s, $f0, P5, 0x00, $f1, P5, 0x04 + PTR_ADDI S1, S1, 0x04 + PTR_ADDI S2, S2, 0x04 + PTR_ADDI P5, P5, 0x08 +.L_M1: + andi J, M, 0x01 + beq ZERO, J, .L_M0 + + move S1, S0 + PTR_ADD S2, S0, TL + + move P1, P0 + PTR_ADDI P0, P0, 0x40 + + PTR_SRAI I, N, 0x04 + beq ZERO, I, .L_M1_N15 +.align 5 +.L_M1_N16: + xvld U0, S1, 0x00 + xvld U1, S1, 0x20 + + xvst U0, P1, 0x00 + xvst U1, P1, 0x20 + + PTR_ADDI S1, S1, 0x40 + PTR_ADDI I, I, -1 + PTR_ADD P1, P1, T1 + blt ZERO, I, .L_M1_N16 +.L_M1_N15: + andi I, N, 0x08 + beq ZERO, I, .L_M1_N7 +.L_M1_N8: + xvld U0, S1, 0x00 + + GST xv, , U0, P2, 0x00 + + 
PTR_ADDI S1, S1, 0x20 + PTR_ADDI P2, P2, 0x20 +.L_M1_N7: + andi I, N, 0x04 + beq ZERO, I, .L_M1_N3 +.L_M1_N4: + GLD v, , $vr0, S1, 0x00 + GST v, , $vr0, P3, 0x00 + PTR_ADDI S1, S1, 0x10 + PTR_ADDI P3, P3, 0x10 +.L_M1_N3: + andi I, N, 0x02 + beq ZERO, I, .L_M1_N1 +.L_M1_N2: + GLD f, d, $f0, S1, 0x00 + GST f, d, $f0, P4, 0x00 + PTR_ADDI S1, S1, 0x08 + PTR_ADDI P4, P4, 0x08 +.L_M1_N1: + andi I, N, 0x01 + beq ZERO, I, .L_M0 + + GLD f, s, $f0, S1, 0x00 + GST f, s, $f0, P5, 0x00 + PTR_ADDI S1, S1, 0x04 + PTR_ADDI P5, P5, 0x04 +.L_M0: + pop_if_used 24, 8 + jirl $r0, $r1, 0x00 + EPILOGUE diff --git a/kernel/loongarch64/sgemm_tcopy_8_lasx.S b/kernel/loongarch64/sgemm_tcopy_8_lasx.S new file mode 100644 index 000000000..725a47a60 --- /dev/null +++ b/kernel/loongarch64/sgemm_tcopy_8_lasx.S @@ -0,0 +1,406 @@ +/******************************************************************************* +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
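Before its main loop, the 16-wide tcopy kernel above splits the destination buffer into fixed regions: full 16-column panels are written from DST onward, and the 8-, 4-, 2- and 1-column remainders of N each get their own region whose base offset is M times the corresponding rounded-down width of N. A C sketch of that address arithmetic, assuming illustrative variable names (pointer arithmetic on float* is in elements, so the byte shifts in the assembly become plain element offsets):

/* Region bases as computed in the PROLOGUE (P0 and P2..P5 in the assembly). */
float *p16 = dst;                          /* full 16-wide panels        */
float *p8  = dst + (size_t)m * (n & ~15);  /* 8-wide remainder panel     */
float *p4  = dst + (size_t)m * (n & ~7);   /* 4-wide remainder panel     */
float *p2  = dst + (size_t)m * (n & ~3);   /* 2-wide remainder panel     */
float *p1  = dst + (size_t)m * (n & ~1);   /* final single column        */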
+*******************************************************************************/ +#define ASSEMBLER + +#include "common.h" +#include "loongarch64_asm.S" + +/********************************************************************* +* 2023/08/23 guxiwei +* UTEST : OK +* CTEST : OK +* TEST : OK +*********************************************************************/ + +/* Function parameters */ +#define M $r4 // param 1: m +#define N $r5 // param 2: n +#define SRC $r6 // param 3: src +#define LDA $r7 // param 4: lda +#define DST $r8 // param 5: dst + +#define I $r9 +#define J $r10 +#define S0 $r11 +#define S1 $r12 +#define S2 $r13 +#define S3 $r14 +#define S4 $r15 +#define S5 $r16 +#define S6 $r17 +#define S7 $r18 +#define S8 $r19 +#define P0 $r20 +#define P1 $r23 +#define P2 $r24 +#define P3 $r25 +#define P4 $r26 +#define T0 $r27 +#define T1 $r28 +#define TL $r7 +#undef ZERO +#define ZERO $r0 + +/* LASX vectors */ +#define U0 $xr0 +#define U1 $xr1 +#define U2 $xr2 +#define U3 $xr3 +#define U4 $xr4 +#define U5 $xr5 +#define U6 $xr6 +#define U7 $xr7 + +// Loops outline +//.L_M8 <------------------- +//| .L_N8: | +//| .L_N7: | Main Loop +//| .L_N4: | +//| .L_N3: | +//| .L_N2: | +//| .L_N1: | +//| .L_N0: --------------- +//.L_M7 +//.L_M4 +//| .L_M4_N8: +//| .L_M4_N7: +//| .L_M4_N4: +//| .L_M4_N3: +//| .L_M4_N2: +//| .L_M4_N1: +//.L_M3 +//.L_M2 +//| .L_M2_N8: +//| .L_M2_N7: +//| .L_M2_N4: +//| .L_M2_N3: +//| .L_M2_N2: +//| .L_M2_N1: +//.L_M1 +//| .L_M1_N8: +//| .L_M1_N7: +//| .L_M1_N4: +//| .L_M1_N3: +//| .L_M1_N2: +//| .L_M1_N1: +//.L_M0 + + PROLOGUE + push_if_used 23, 8 + + move S0, SRC + move P0, DST + + PTR_SRAI T0, N, 0x04 + PTR_SRAI T1, N, 0x03 + PTR_SLLI T0, T0, 0x04 + PTR_SLLI T1, T1, 0x03 + + PTR_MUL P2, M, T1 + PTR_SLLI P2, P2, 0x02 + PTR_ADD P2, DST, P2 + PTR_SRAI T0, N, 0x02 + PTR_SRAI T1, N, 0x01 + PTR_SLLI T0, T0, 0x02 + PTR_SLLI T1, T1, 0x01 + PTR_MUL P3, M, T0 + PTR_MUL P4, M, T1 + PTR_SLLI P3, P3, 0x02 + PTR_SLLI P4, P4, 0x02 + PTR_ADD P3, DST, P3 + PTR_ADD P4, DST, P4 + + PTR_SLLI TL, LDA, 0x02 + PTR_SRAI J, M, 0x03 + PTR_SLLI T0, TL, 0x01 + PTR_SLLI T1, M, 0x05 + beq ZERO, J, .L_M7 +.align 5 +.L_M8: + move S1, S0 + PTR_ADD S2, S0, TL + PTR_ADD S3, S1, T0 + PTR_ADD S4, S2, T0 + PTR_ADD S5, S3, T0 + PTR_ADD S6, S4, T0 + PTR_ADD S7, S5, T0 + PTR_ADD S8, S6, T0 + PTR_ADD S0, S7, T0 + + move P1, P0 + PTR_ADDI P0, P0, 0x100 + + PTR_SRAI I, N, 0x03 + PTR_ADDI J, J, -1 + beq ZERO, I, .L_N7 +.L_N8: + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + xvld U4, S5, 0x00 + xvld U5, S6, 0x00 + xvld U6, S7, 0x00 + xvld U7, S8, 0x00 + + GST xv, , U0, P1, 0x00, U1, P1, 0x20, U2, P1, 0x40, U3, P1, 0x60, \ + U4, P1, 0x80, U5, P1, 0xA0, U6, P1, 0xC0, U7, P1, 0xE0 + + PTR_ADDI S1, S1, 0x20 + PTR_ADDI S2, S2, 0x20 + PTR_ADDI S3, S3, 0x20 + PTR_ADDI S4, S4, 0x20 + PTR_ADDI S5, S5, 0x20 + PTR_ADDI S6, S6, 0x20 + PTR_ADDI S7, S7, 0x20 + PTR_ADDI S8, S8, 0x20 + + PTR_ADDI I, I, -1 + PTR_ADD P1, P1, T1 + blt ZERO, I, .L_N8 +.L_N7: + andi I, N, 0x04 + beq ZERO, I, .L_N3 +.L_N4: + GLD v, , $vr0, S1, 0x00, $vr1, S2, 0x00, $vr2, S3, 0x00, $vr3, S4, 0x00, \ + $vr4, S5, 0x00, $vr5, S6, 0x00, $vr6, S7, 0x00, $vr7, S8, 0x00 + GST v, , $vr0, P2, 0x00, $vr1, P2, 0x10, $vr2, P2, 0x20, $vr3, P2, 0x30, \ + $vr4, P2, 0x40, $vr5, P2, 0x50, $vr6, P2, 0x60, $vr7, P2, 0x70 + PTR_ADDI S1, S1, 0x10 + PTR_ADDI S2, S2, 0x10 + PTR_ADDI S3, S3, 0x10 + PTR_ADDI S4, S4, 0x10 + PTR_ADDI S5, S5, 0x10 + PTR_ADDI S6, S6, 0x10 + PTR_ADDI S7, S7, 0x10 + PTR_ADDI S8, S8, 0x10 + PTR_ADDI P2, P2, 0x80 +.L_N3: + 
andi I, N, 0x02 + beq ZERO, I, .L_N1 +.L_N2: + GLD f, d, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00, \ + $f4, S5, 0x00, $f5, S6, 0x00, $f6, S7, 0x00, $f7, S8, 0x00 + GST f, d, $f0, P3, 0x00, $f1, P3, 0x08, $f2, P3, 0x10, $f3, P3, 0x18, \ + $f4, P3, 0x20, $f5, P3, 0x28, $f6, P3, 0x30, $f7, P3, 0x38 + PTR_ADDI S1, S1, 0x08 + PTR_ADDI S2, S2, 0x08 + PTR_ADDI S3, S3, 0x08 + PTR_ADDI S4, S4, 0x08 + PTR_ADDI S5, S5, 0x08 + PTR_ADDI S6, S6, 0x08 + PTR_ADDI S7, S7, 0x08 + PTR_ADDI S8, S8, 0x08 + PTR_ADDI P3, P3, 0x40 +.L_N1: + andi I, N, 0x01 + beq ZERO, I, .L_N0 + + GLD f, s, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00, \ + $f4, S5, 0x00, $f5, S6, 0x00, $f6, S7, 0x00, $f7, S8, 0x00 + GST f, s, $f0, P4, 0x00, $f1, P4, 0x04, $f2, P4, 0x08, $f3, P4, 0x0C, \ + $f4, P4, 0x10, $f5, P4, 0x14, $f6, P4, 0x18, $f7, P4, 0x1C + PTR_ADDI S1, S1, 0x04 + PTR_ADDI S2, S2, 0x04 + PTR_ADDI S3, S3, 0x04 + PTR_ADDI S4, S4, 0x04 + PTR_ADDI S5, S5, 0x04 + PTR_ADDI S6, S6, 0x04 + PTR_ADDI S7, S7, 0x04 + PTR_ADDI S8, S8, 0x04 + PTR_ADDI P4, P4, 0x20 +.L_N0: + blt ZERO, J, .L_M8 + +.L_M7: + andi J, M, 0x04 + beq ZERO, J, .L_M3 +.L_M4: + move S1, S0 + PTR_ADD S2, S0, TL + PTR_ADD S3, S1, T0 + PTR_ADD S4, S2, T0 + PTR_ADD S0, S3, T0 + + move P1, P0 + PTR_ADDI P0, P0, 0x80 + + PTR_SRAI I, N, 0x03 + beq ZERO, I, .L_M4_N7 +.align 5 +.L_M4_N8: + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + xvld U2, S3, 0x00 + xvld U3, S4, 0x00 + + GST xv, , U0, P1, 0x00, U1, P1, 0x20, U2, P1, 0x40, U3, P1, 0x60 + + PTR_ADDI S1, S1, 0x20 + PTR_ADDI S2, S2, 0x20 + PTR_ADDI S3, S3, 0x20 + PTR_ADDI S4, S4, 0x20 + + PTR_ADDI I, I, -1 + PTR_ADD P1, P1, T1 + blt ZERO, I, .L_M4_N8 +.L_M4_N7: + andi I, N, 0x04 + beq ZERO, I, .L_M4_N3 +.L_M4_N4: + GLD v, , $vr0, S1, 0x00, $vr1, S2, 0x00, $vr2, S3, 0x00, $vr3, S4, 0x00 + GST v, , $vr0, P2, 0x00, $vr1, P2, 0x10, $vr2, P2, 0x20, $vr3, P2, 0x30 + PTR_ADDI S1, S1, 0x10 + PTR_ADDI S2, S2, 0x10 + PTR_ADDI S3, S3, 0x10 + PTR_ADDI S4, S4, 0x10 + PTR_ADDI P2, P2, 0x40 +.L_M4_N3: + andi I, N, 0x02 + beq ZERO, I, .L_M4_N1 +.L_M4_N2: + GLD f, d, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00 + GST f, d, $f0, P3, 0x00, $f1, P3, 0x08, $f2, P3, 0x10, $f3, P3, 0x18 + PTR_ADDI S1, S1, 0x08 + PTR_ADDI S2, S2, 0x08 + PTR_ADDI S3, S3, 0x08 + PTR_ADDI S4, S4, 0x08 + PTR_ADDI P3, P3, 0x20 +.L_M4_N1: + andi I, N, 0x01 + beq ZERO, I, .L_M3 + + GLD f, s, $f0, S1, 0x00, $f1, S2, 0x00, $f2, S3, 0x00, $f3, S4, 0x00 + GST f, s, $f0, P4, 0x00, $f1, P4, 0x04, $f2, P4, 0x08, $f3, P4, 0x0C + PTR_ADDI S1, S1, 0x04 + PTR_ADDI S2, S2, 0x04 + PTR_ADDI S3, S3, 0x04 + PTR_ADDI S4, S4, 0x04 + PTR_ADDI P4, P4, 0x10 +.L_M3: + andi J, M, 0x02 + beq ZERO, J, .L_M1 +.L_M2: + move S1, S0 + PTR_ADD S2, S0, TL + PTR_ADD S0, S0, T0 + + move P1, P0 + PTR_ADDI P0, P0, 0x40 + + PTR_SRAI I, N, 0x03 + beq ZERO, I, .L_M2_N7 +.align 5 +.L_M2_N8: + xvld U0, S1, 0x00 + xvld U1, S2, 0x00 + + GST xv, , U0, P1, 0x00, U1, P1, 0x20 + + PTR_ADDI S1, S1, 0x20 + PTR_ADDI S2, S2, 0x20 + PTR_ADDI I, I, -1 + PTR_ADD P1, P1, T1 + blt ZERO, I, .L_M2_N8 +.L_M2_N7: + andi I, N, 0x04 + beq ZERO, I, .L_M2_N3 +.L_M2_N4: + GLD v, , $vr0, S1, 0x00, $vr1, S2, 0x00 + GST v, , $vr0, P2, 0x00, $vr1, P2, 0x10 + PTR_ADDI S1, S1, 0x10 + PTR_ADDI S2, S2, 0x10 + PTR_ADDI P2, P2, 0x20 +.L_M2_N3: + andi I, N, 0x02 + beq ZERO, I, .L_M2_N1 +.L_M2_N2: + GLD f, d, $f0, S1, 0x00, $f1, S2, 0x00 + GST f, d, $f0, P3, 0x00, $f1, P3, 0x08 + PTR_ADDI S1, S1, 0x08 + PTR_ADDI S2, S2, 0x08 + PTR_ADDI P3, P3, 0x10 +.L_M2_N1: + andi I, N, 0x01 + beq ZERO, I, .L_M1 + + 
GLD f, s, $f0, S1, 0x00, $f1, S2, 0x00 + GST f, s, $f0, P4, 0x00, $f1, P4, 0x04 + PTR_ADDI S1, S1, 0x04 + PTR_ADDI S2, S2, 0x04 + PTR_ADDI P4, P4, 0x08 +.L_M1: + andi J, M, 0x01 + beq ZERO, J, .L_M0 + + move S1, S0 + PTR_ADD S2, S0, TL + + move P1, P0 + PTR_ADDI P0, P0, 0x20 + + PTR_SRAI I, N, 0x03 + beq ZERO, I, .L_M1_N7 +.align 5 +.L_M1_N8: + xvld U0, S1, 0x00 + + GST xv, , U0, P1, 0x00 + + PTR_ADDI S1, S1, 0x20 + + PTR_ADDI I, I, -1 + PTR_ADD P1, P1, T1 + blt ZERO, I, .L_M1_N8 +.L_M1_N7: + andi I, N, 0x04 + beq ZERO, I, .L_M1_N3 +.L_M1_N4: + GLD v, , $vr0, S1, 0x00 + GST v, , $vr0, P2, 0x00 + PTR_ADDI S1, S1, 0x10 + PTR_ADDI P2, P2, 0x10 +.L_M1_N3: + andi I, N, 0x02 + beq ZERO, I, .L_M1_N1 +.L_M1_N2: + GLD f, d, $f0, S1, 0x00 + GST f, d, $f0, P3, 0x00 + PTR_ADDI S1, S1, 0x08 + PTR_ADDI P3, P3, 0x08 +.L_M1_N1: + andi I, N, 0x01 + beq ZERO, I, .L_M0 + + GLD f, s, $f0, S1, 0x00 + GST f, s, $f0, P4, 0x00 + PTR_ADDI S1, S1, 0x04 + PTR_ADDI P4, P4, 0x04 +.L_M0: + pop_if_used 23, 8 + jirl $r0, $r1, 0x00 + EPILOGUE diff --git a/kernel/loongarch64/snrm2.S b/kernel/loongarch64/snrm2.S index 57c21a017..8c5c91ade 100644 --- a/kernel/loongarch64/snrm2.S +++ b/kernel/loongarch64/snrm2.S @@ -61,7 +61,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmov.d s2, s1 bge $r0, N, .L999 slli.d INCX, INCX, BASE_SHIFT - bge $r0, INCX, .L999 + beq $r0, INCX, .L999 srai.d I, N, 3 bne INCX, TEMP, .L20 bge $r0, I, .L15 diff --git a/kernel/loongarch64/znrm2.S b/kernel/loongarch64/znrm2.S index 49f640268..8e2165ab7 100644 --- a/kernel/loongarch64/znrm2.S +++ b/kernel/loongarch64/znrm2.S @@ -64,7 +64,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. MTC s1, $r0 bge $r0, N, .L999 slli.d INCX, INCX, ZBASE_SHIFT - bge $r0, INCX, .L999 + beq $r0, INCX, .L999 move XX, X MOV s2, s1 srai.d I, N, 2 diff --git a/kernel/mips/nrm2.c b/kernel/mips/nrm2.c index fcff09337..8cc189fe3 100644 --- a/kernel/mips/nrm2.c +++ b/kernel/mips/nrm2.c @@ -57,7 +57,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT absxi = 0.0; - if (n <= 0 || inc_x <= 0) return(0.0); + if (n <= 0 || inc_x == 0) return(0.0); if ( n == 1 ) return( ABS(x[0]) ); n *= inc_x; diff --git a/kernel/mips/znrm2.c b/kernel/mips/znrm2.c index 85be39cd1..d11a6bd4a 100644 --- a/kernel/mips/znrm2.c +++ b/kernel/mips/znrm2.c @@ -48,7 +48,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) BLASLONG inc_x2; FLOAT temp; - if (n <= 0 || inc_x <= 0) return(0.0); + if (n <= 0 || inc_x == 0) return(0.0); inc_x2 = 2 * inc_x; diff --git a/kernel/mips64/cnrm2.S b/kernel/mips64/cnrm2.S index 76fa9c295..159f9bea9 100644 --- a/kernel/mips64/cnrm2.S +++ b/kernel/mips64/cnrm2.S @@ -77,7 +77,7 @@ blez N, .L999 mov.d s2, s1 - blez INCX, .L999 + beqz INCX, .L999 dsll INCX, INCX, ZBASE_SHIFT dsra I, N, 2 diff --git a/kernel/mips64/dnrm2.S b/kernel/mips64/dnrm2.S index cd40414a2..1b55d9fc3 100644 --- a/kernel/mips64/dnrm2.S +++ b/kernel/mips64/dnrm2.S @@ -81,7 +81,7 @@ blez N, .L999 MTC $0, s1 - blez INCX, .L999 + beqz INCX, .L999 dsll INCX, INCX, BASE_SHIFT move XX, X diff --git a/kernel/mips64/snrm2.S b/kernel/mips64/snrm2.S index 1ba061a7d..f18151b5c 100644 --- a/kernel/mips64/snrm2.S +++ b/kernel/mips64/snrm2.S @@ -77,7 +77,7 @@ blez N, .L999 mov.d s2, s1 - blez INCX, .L999 + beqz INCX, .L999 dsll INCX, INCX, BASE_SHIFT bne INCX, TEMP, .L20 diff --git a/kernel/mips64/znrm2.S b/kernel/mips64/znrm2.S index 1c247bca9..d33284a47 100644 --- a/kernel/mips64/znrm2.S +++ b/kernel/mips64/znrm2.S @@ -80,7 +80,7 @@ blez N, .L999 MTC 
$0, s1 - blez INCX, .L999 + beqz INCX, .L999 dsll INCX, INCX, ZBASE_SHIFT move XX, X diff --git a/kernel/power/cnrm2.S b/kernel/power/cnrm2.S index c115650fd..74117a831 100644 --- a/kernel/power/cnrm2.S +++ b/kernel/power/cnrm2.S @@ -99,7 +99,7 @@ cmpwi cr0, N, 0 ble- LL(9999) cmpwi cr0, INCX, 0 - ble- LL(9999) + beq- LL(9999) fmr f0, f1 fmr f2, f1 diff --git a/kernel/power/cnrm2_hummer.S b/kernel/power/cnrm2_hummer.S index 46c29c654..0d036b32f 100644 --- a/kernel/power/cnrm2_hummer.S +++ b/kernel/power/cnrm2_hummer.S @@ -119,7 +119,7 @@ cmpwi cr0, N, 0 ble LL(99) cmpwi cr0, INCX, 0 - ble LL(99) + beq LL(99) andi. r0, X, 2 * SIZE - 1 bne LL(100) diff --git a/kernel/power/cnrm2_ppc440.S b/kernel/power/cnrm2_ppc440.S index c71c34b7c..8e3abf9f9 100644 --- a/kernel/power/cnrm2_ppc440.S +++ b/kernel/power/cnrm2_ppc440.S @@ -104,7 +104,7 @@ cmpwi cr0, N, 0 ble- LL(999) cmpwi cr0, INCX, 0 - ble- LL(999) + beq- LL(999) fmr f0, f1 sub X, X, INCX diff --git a/kernel/power/dgemm_small_kernel_nn_power10.c b/kernel/power/dgemm_small_kernel_nn_power10.c index ecdc3e5c6..73f6d5b99 100644 --- a/kernel/power/dgemm_small_kernel_nn_power10.c +++ b/kernel/power/dgemm_small_kernel_nn_power10.c @@ -167,7 +167,7 @@ typedef __vector unsigned char vec_t; #define INIT_1ACC() __builtin_mma_xxsetaccz(&acc0); -#if (defined(__GNUC__) && (__GNUC__ == 10)) +#if (defined(__GNUC__) && (__GNUC__ == 10 || (__GNUC__ == 11 && __GNUC_MINOR__ <= 2))) #if defined(_AIX) #define LOAD_PAIR(pair, v0, v1) \ __builtin_vsx_assemble_pair(&pair, (vec_t)v0, (vec_t)v1); diff --git a/kernel/power/dgemm_small_kernel_tn_power10.c b/kernel/power/dgemm_small_kernel_tn_power10.c index 93a942b02..426948185 100644 --- a/kernel/power/dgemm_small_kernel_tn_power10.c +++ b/kernel/power/dgemm_small_kernel_tn_power10.c @@ -167,7 +167,7 @@ typedef __vector unsigned char vec_t; #define INIT_1ACC() __builtin_mma_xxsetaccz(&acc0); -#if (defined(__GNUC__) && (__GNUC__ == 10)) +#if (defined(__GNUC__) && (__GNUC__ == 10 || (__GNUC__ == 11 && __GNUC_MINOR__ <= 2))) #if defined(_AIX) #define LOAD_PAIR(pair, v0, v1) \ __builtin_vsx_assemble_pair(&pair, (vec_t)v0, (vec_t)v1); diff --git a/kernel/power/dnrm2_hummer.S b/kernel/power/dnrm2_hummer.S index 4931f5ab1..8638ca424 100644 --- a/kernel/power/dnrm2_hummer.S +++ b/kernel/power/dnrm2_hummer.S @@ -134,7 +134,7 @@ cmpwi cr0, N, 0 ble LL(99) cmpwi cr0, INCX, 0 - ble LL(99) + beq LL(99) mr XX, X diff --git a/kernel/power/dnrm2_ppc440.S b/kernel/power/dnrm2_ppc440.S index 849ca1f35..529f6adf0 100644 --- a/kernel/power/dnrm2_ppc440.S +++ b/kernel/power/dnrm2_ppc440.S @@ -111,7 +111,7 @@ cmpwi cr0, N, 0 ble- LL(999) cmpwi cr0, INCX, 0 - ble- LL(999) + beq- LL(999) mr NN, N mr XX, X diff --git a/kernel/power/nrm2.S b/kernel/power/nrm2.S index d9e1f4e9a..880b5d1b4 100644 --- a/kernel/power/nrm2.S +++ b/kernel/power/nrm2.S @@ -113,7 +113,7 @@ cmpwi cr0, N, 0 ble- LL(9999) cmpwi cr0, INCX, 0 - ble- LL(9999) + beq- LL(9999) mr NN, N mr XX, X diff --git a/kernel/power/snrm2.S b/kernel/power/snrm2.S index be974cc48..696d404bb 100644 --- a/kernel/power/snrm2.S +++ b/kernel/power/snrm2.S @@ -97,7 +97,7 @@ cmpwi cr0, N, 0 ble- LL(9999) cmpwi cr0, INCX, 0 - ble- LL(9999) + beq- LL(9999) fmr f0, f1 fmr f2, f1 diff --git a/kernel/power/snrm2_hummer.S b/kernel/power/snrm2_hummer.S index a0ff3d1b2..a4292af78 100644 --- a/kernel/power/snrm2_hummer.S +++ b/kernel/power/snrm2_hummer.S @@ -119,7 +119,7 @@ cmpwi cr0, N, 0 ble LL(99) cmpwi cr0, INCX, 0 - ble LL(99) + beq LL(99) cmpwi cr0, INCX, SIZE bne LL(100) diff --git 
a/kernel/power/snrm2_ppc440.S b/kernel/power/snrm2_ppc440.S index 0a80d1224..3547d7f47 100644 --- a/kernel/power/snrm2_ppc440.S +++ b/kernel/power/snrm2_ppc440.S @@ -105,7 +105,7 @@ cmpwi cr0, N, 0 ble- LL(999) cmpwi cr0, INCX, 0 - ble- LL(999) + beq- LL(999) fmr f0, f1 fmr f2, f1 diff --git a/kernel/power/znrm2.S b/kernel/power/znrm2.S index 60f379d25..3048e3480 100644 --- a/kernel/power/znrm2.S +++ b/kernel/power/znrm2.S @@ -105,7 +105,7 @@ cmpwi cr0, N, 0 ble- LL(9999) cmpwi cr0, INCX, 0 - ble- LL(9999) + beq- LL(9999) mr NN, N mr XX, X diff --git a/kernel/power/znrm2_hummer.S b/kernel/power/znrm2_hummer.S index 1d0c598f8..4ef2212df 100644 --- a/kernel/power/znrm2_hummer.S +++ b/kernel/power/znrm2_hummer.S @@ -134,7 +134,7 @@ cmpwi cr0, N, 0 ble LL(99) cmpwi cr0, INCX, 0 - ble LL(99) + beq LL(99) mr XX, X diff --git a/kernel/power/znrm2_ppc440.S b/kernel/power/znrm2_ppc440.S index 778b805de..f775c3e62 100644 --- a/kernel/power/znrm2_ppc440.S +++ b/kernel/power/znrm2_ppc440.S @@ -112,7 +112,7 @@ cmpwi cr0, N, 0 ble- LL(999) cmpwi cr0, INCX, 0 - ble- LL(999) + beq- LL(999) mr NN, N mr XX, X diff --git a/kernel/riscv64/KERNEL.C910V b/kernel/riscv64/KERNEL.C910V index 18cb3bafd..e6f2b3314 100644 --- a/kernel/riscv64/KERNEL.C910V +++ b/kernel/riscv64/KERNEL.C910V @@ -59,6 +59,7 @@ SDOTKERNEL = dot_vector.c DDOTKERNEL = dot_vector.c CDOTKERNEL = zdot_vector.c ZDOTKERNEL = zdot_vector.c +DSDOTKERNEL = ../generic/dot.c SNRM2KERNEL = nrm2_vector.c DNRM2KERNEL = nrm2_vector.c diff --git a/kernel/riscv64/nrm2.c b/kernel/riscv64/nrm2.c index fcff09337..8cc189fe3 100644 --- a/kernel/riscv64/nrm2.c +++ b/kernel/riscv64/nrm2.c @@ -57,7 +57,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT absxi = 0.0; - if (n <= 0 || inc_x <= 0) return(0.0); + if (n <= 0 || inc_x == 0) return(0.0); if ( n == 1 ) return( ABS(x[0]) ); n *= inc_x; diff --git a/kernel/riscv64/znrm2.c b/kernel/riscv64/znrm2.c index fc1c8b54a..28bb0eda5 100644 --- a/kernel/riscv64/znrm2.c +++ b/kernel/riscv64/znrm2.c @@ -57,7 +57,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) BLASLONG inc_x2; FLOAT temp; - if (n <= 0 || inc_x <= 0) return(0.0); + if (n <= 0 || inc_x == 0) return(0.0); inc_x2 = 2 * inc_x; diff --git a/kernel/sparc/cnrm2.S b/kernel/sparc/cnrm2.S index 8dc4b56b6..0840c8848 100644 --- a/kernel/sparc/cnrm2.S +++ b/kernel/sparc/cnrm2.S @@ -76,7 +76,7 @@ FMOV c1, t4 cmp INCX, 0 - ble .LL20 + beq .LL20 sll INCX, ZBASE_SHIFT, INCX cmp N, 0 diff --git a/kernel/sparc/dnrm2.S b/kernel/sparc/dnrm2.S index cf7522953..41e993440 100644 --- a/kernel/sparc/dnrm2.S +++ b/kernel/sparc/dnrm2.S @@ -107,7 +107,7 @@ FMOV fzero, c1 cmp INCX, 0 - ble .LL99 + beq .LL99 sll INCX, BASE_SHIFT, INCX add %sp, -8, %sp diff --git a/kernel/sparc/snrm2.S b/kernel/sparc/snrm2.S index a80247259..a7405b6e1 100644 --- a/kernel/sparc/snrm2.S +++ b/kernel/sparc/snrm2.S @@ -76,7 +76,7 @@ FMOV c1, t4 cmp INCX, 0 - ble .LL20 + beq .LL20 sll INCX, BASE_SHIFT, INCX cmp N, 0 diff --git a/kernel/sparc/znrm2.S b/kernel/sparc/znrm2.S index 065d22784..dae53ffe7 100644 --- a/kernel/sparc/znrm2.S +++ b/kernel/sparc/znrm2.S @@ -107,7 +107,7 @@ FMOV fzero, c1 cmp INCX, 0 - ble .LL99 + beq .LL99 sll INCX, ZBASE_SHIFT, INCX add %sp, -8, %sp diff --git a/kernel/x86/nrm2.S b/kernel/x86/nrm2.S index 7a14da862..3a6417462 100644 --- a/kernel/x86/nrm2.S +++ b/kernel/x86/nrm2.S @@ -78,7 +78,7 @@ testl M, M jle .L999 testl INCX, INCX - jle .L999 + je .L999 sall $BASE_SHIFT, INCX fldz diff --git a/kernel/x86/nrm2_sse.S b/kernel/x86/nrm2_sse.S index 0f174c408..129b41a03 
100644 --- a/kernel/x86/nrm2_sse.S +++ b/kernel/x86/nrm2_sse.S @@ -69,7 +69,7 @@ jle .L999 pxor %xmm1, %xmm1 testl INCX, INCX - jle .L999 + je .L999 leal (, INCX, SIZE), INCX cmpl $SIZE, INCX diff --git a/kernel/x86/znrm2.S b/kernel/x86/znrm2.S index 263612e9a..7a65df77a 100644 --- a/kernel/x86/znrm2.S +++ b/kernel/x86/znrm2.S @@ -78,7 +78,7 @@ testl M, M jle .L999 testl INCX, INCX - jle .L999 + je .L999 sall $ZBASE_SHIFT, INCX fldz diff --git a/kernel/x86/znrm2_sse.S b/kernel/x86/znrm2_sse.S index bbc3677ae..4ad326120 100644 --- a/kernel/x86/znrm2_sse.S +++ b/kernel/x86/znrm2_sse.S @@ -69,7 +69,7 @@ jle .L999 pxor %xmm1, %xmm1 testl INCX, INCX - jle .L999 + je .L999 sall $ZBASE_SHIFT, INCX diff --git a/kernel/x86_64/casum_microk_skylakex-2.c b/kernel/x86_64/casum_microk_skylakex-2.c index b398aa6e1..d261962de 100644 --- a/kernel/x86_64/casum_microk_skylakex-2.c +++ b/kernel/x86_64/casum_microk_skylakex-2.c @@ -1,5 +1,10 @@ /* need a new enough GCC for avx512 support */ -#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) +#ifdef __NVCOMPILER +#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) +#endif +#if ((( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && (__clang_major__ >= 9 &&__clang_major__ !=17)) || ( defined(__NVCOMPILER) && NVCOMPVERS >= 2309))) + +#if (!(defined(__NVCOMPILER) && NVCOMPVERS < 2309)) #define HAVE_CASUM_KERNEL 1 @@ -347,3 +352,4 @@ static FLOAT casum_kernel(BLASLONG n, FLOAT *x) return sumf; } #endif +#endif diff --git a/kernel/x86_64/cscal_microk_skylakex-2.c b/kernel/x86_64/cscal_microk_skylakex-2.c index 8a622427b..a6c012a4c 100644 --- a/kernel/x86_64/cscal_microk_skylakex-2.c +++ b/kernel/x86_64/cscal_microk_skylakex-2.c @@ -26,7 +26,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
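The nrm2/znrm2 hunks above (MIPS, MIPS64, POWER, SPARC, x86 and the C fallbacks) all make the same change: the early return now fires only for a zero increment (blez/ble/jle replaced by beqz/beq/je, and `inc_x <= 0` becoming `inc_x == 0`), so a negative stride falls through to the computation. For a Euclidean norm that is safe, because the sum of squares covers the same elements whichever direction the vector is walked. A hedged C sketch of the adjusted guard, assuming the usual BLAS convention that with a negative increment x still points at the lowest-addressed element (the real kernels also rescale to avoid overflow, omitted here):

#include <math.h>

static float snrm2_sketch(int n, const float *x, int inc_x)
{
    if (n <= 0 || inc_x == 0) return 0.0f;    /* previously: inc_x <= 0 */
    int stride = inc_x < 0 ? -inc_x : inc_x;  /* traversal direction is irrelevant */
    double ssq = 0.0;
    for (int i = 0; i < n; i++)
        ssq += (double)x[i * stride] * (double)x[i * stride];
    return (float)sqrt(ssq);
}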
*****************************************************************************/ /* need a new enough GCC for avx512 support */ -#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) +#ifdef __NVCOMPILER +#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) +#endif +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) || ( defined(__NVCOMPILER) && NVCOMPVERS >= 2203 ) #include diff --git a/kernel/x86_64/dasum_microk_haswell-2.c b/kernel/x86_64/dasum_microk_haswell-2.c index fd9da7ebe..bc27c7647 100644 --- a/kernel/x86_64/dasum_microk_haswell-2.c +++ b/kernel/x86_64/dasum_microk_haswell-2.c @@ -1,4 +1,7 @@ -#if (( defined(__GNUC__) && __GNUC__ > 6 ) || (defined(__clang__) && __clang_major__ >= 6)) && defined(__AVX2__) +#ifdef __NVCOMPILER +#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) +#endif +#if (( defined(__GNUC__) && __GNUC__ > 6) || (defined(__clang__) && __clang_major__ >= 6)) && defined(__AVX2__) || ( defined(__NVCOMPILER) && NVCOMPVERS >= 2203 ) #define HAVE_DASUM_KERNEL diff --git a/kernel/x86_64/dasum_microk_skylakex-2.c b/kernel/x86_64/dasum_microk_skylakex-2.c index 83bc078b3..76b9fbef0 100644 --- a/kernel/x86_64/dasum_microk_skylakex-2.c +++ b/kernel/x86_64/dasum_microk_skylakex-2.c @@ -1,5 +1,8 @@ /* need a new enough GCC for avx512 support */ -#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) +#ifdef __NVCOMPILER +#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) +#endif +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || ( defined(__NVCOMPILER) && NVCOMPVERS >= 2203 ) #define HAVE_DASUM_KERNEL 1 diff --git a/kernel/x86_64/daxpy_microk_skylakex-2.c b/kernel/x86_64/daxpy_microk_skylakex-2.c index e785a39f1..5b9147d10 100644 --- a/kernel/x86_64/daxpy_microk_skylakex-2.c +++ b/kernel/x86_64/daxpy_microk_skylakex-2.c @@ -27,7 +27,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* need a new enough GCC for avx512 support */ -#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) +#ifdef __NVCOMPILER +#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) +#endif +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) || ( defined(__NVCOMPILER) && NVCOMPVERS >= 2203 ) #include diff --git a/kernel/x86_64/ddot_microk_skylakex-2.c b/kernel/x86_64/ddot_microk_skylakex-2.c index 8eabf225a..f076862f7 100644 --- a/kernel/x86_64/ddot_microk_skylakex-2.c +++ b/kernel/x86_64/ddot_microk_skylakex-2.c @@ -26,7 +26,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ /* need a new enough GCC for avx512 support */ -#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) +#ifdef __NVCOMPILER +#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) +#endif +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) || ( defined(__NVCOMPILER) && NVCOMPVERS >= 2203 ) #define HAVE_KERNEL_8 1 diff --git a/kernel/x86_64/dgemm_small_kernel_nn_skylakex.c b/kernel/x86_64/dgemm_small_kernel_nn_skylakex.c index a98772b94..da57a18a7 100644 --- a/kernel/x86_64/dgemm_small_kernel_nn_skylakex.c +++ b/kernel/x86_64/dgemm_small_kernel_nn_skylakex.c @@ -24,7 +24,10 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) +#ifdef __NVCOMPILER +#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) +#endif +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 ) #include #include "common.h" diff --git a/kernel/x86_64/dgemm_small_kernel_tn_skylakex.c b/kernel/x86_64/dgemm_small_kernel_tn_skylakex.c index 37d1ca497..69ad6d94e 100644 --- a/kernel/x86_64/dgemm_small_kernel_tn_skylakex.c +++ b/kernel/x86_64/dgemm_small_kernel_tn_skylakex.c @@ -24,7 +24,10 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) +#ifdef __NVCOMPILER +#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) +#endif +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 ) #include #include "common.h" diff --git a/kernel/x86_64/dgemv_n_microk_skylakex-4.c b/kernel/x86_64/dgemv_n_microk_skylakex-4.c index 4030399ab..4e8739864 100644 --- a/kernel/x86_64/dgemv_n_microk_skylakex-4.c +++ b/kernel/x86_64/dgemv_n_microk_skylakex-4.c @@ -26,7 +26,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ /* need a new enough GCC for avx512 support */ -#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) +#ifdef __NVCOMPILER +#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) +#endif +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 ) #define HAVE_KERNEL_4x4 1 diff --git a/kernel/x86_64/drot_microk_skylakex-2.c b/kernel/x86_64/drot_microk_skylakex-2.c index 4e862e663..bf9c044d4 100644 --- a/kernel/x86_64/drot_microk_skylakex-2.c +++ b/kernel/x86_64/drot_microk_skylakex-2.c @@ -1,5 +1,8 @@ /* need a new enough GCC for avx512 support */ -#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) +#ifdef __NVCOMPILER +#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) +#endif +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 ) #define HAVE_DROT_KERNEL 1 diff --git a/kernel/x86_64/dscal_microk_skylakex-2.c b/kernel/x86_64/dscal_microk_skylakex-2.c index e0598272e..381136414 100644 --- a/kernel/x86_64/dscal_microk_skylakex-2.c +++ b/kernel/x86_64/dscal_microk_skylakex-2.c @@ -26,7 +26,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /* need a new enough GCC for avx512 support */ -#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) +#ifdef __NVCOMPILER +#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) +#endif +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 ) #include diff --git a/kernel/x86_64/dsymv_L_microk_skylakex-2.c b/kernel/x86_64/dsymv_L_microk_skylakex-2.c index f0df5aaa8..ca4773a4b 100644 --- a/kernel/x86_64/dsymv_L_microk_skylakex-2.c +++ b/kernel/x86_64/dsymv_L_microk_skylakex-2.c @@ -27,7 +27,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* need a new enough GCC for avx512 support */ -#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) +#ifdef __NVCOMPILER +#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) +#endif +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 ) #include diff --git a/kernel/x86_64/dtobf16_microk_cooperlake.c b/kernel/x86_64/dtobf16_microk_cooperlake.c index 9b8ac4714..b713b39be 100644 --- a/kernel/x86_64/dtobf16_microk_cooperlake.c +++ b/kernel/x86_64/dtobf16_microk_cooperlake.c @@ -26,7 +26,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ /* need a new enough GCC for avx512 support */ -#if (( defined(__GNUC__) && __GNUC__ >= 10 && defined(__AVX512BF16__)) || (defined(__clang__) && __clang_major__ >= 9)) +#ifdef __NVCOMPILER +#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) +#endif +#if (( defined(__GNUC__) && __GNUC__ >= 10 && defined(__AVX512BF16__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 ) #define HAVE_TOBF16_ACCL_KERNEL 1 #include "common.h" diff --git a/kernel/x86_64/nrm2.S b/kernel/x86_64/nrm2.S index b79ac2adb..61cf8c452 100644 --- a/kernel/x86_64/nrm2.S +++ b/kernel/x86_64/nrm2.S @@ -58,7 +58,7 @@ testq M, M jle .L999 testq INCX, INCX - jle .L999 + je .L999 salq $BASE_SHIFT, INCX diff --git a/kernel/x86_64/nrm2_sse.S b/kernel/x86_64/nrm2_sse.S index 33b1ee496..c1f3a45fc 100644 --- a/kernel/x86_64/nrm2_sse.S +++ b/kernel/x86_64/nrm2_sse.S @@ -57,7 +57,7 @@ jle .L999 pxor %xmm1, %xmm1 testq INCX, INCX - jle .L999 + je .L999 pxor %xmm2, %xmm2 leaq (, INCX, SIZE), INCX diff --git a/kernel/x86_64/sasum_microk_haswell-2.c b/kernel/x86_64/sasum_microk_haswell-2.c index 2eb5b9538..3b4d65cfc 100644 --- a/kernel/x86_64/sasum_microk_haswell-2.c +++ b/kernel/x86_64/sasum_microk_haswell-2.c @@ -1,4 +1,7 @@ -#if (( defined(__GNUC__) && __GNUC__ > 6 ) || (defined(__clang__) && __clang_major__ >= 6)) && defined(__AVX2__) +#ifdef __NVCOMPILER +#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) +#endif +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 ) #define HAVE_SASUM_KERNEL 1 diff --git a/kernel/x86_64/sasum_microk_skylakex-2.c b/kernel/x86_64/sasum_microk_skylakex-2.c index fbc91b558..f193053ee 100644 --- a/kernel/x86_64/sasum_microk_skylakex-2.c +++ b/kernel/x86_64/sasum_microk_skylakex-2.c @@ -1,5 +1,8 @@ /* need a new enough GCC for avx512 support */ -#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) +#ifdef __NVCOMPILER +#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) +#endif +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 ) #define HAVE_SASUM_KERNEL 1 diff --git a/kernel/x86_64/saxpy_microk_skylakex-2.c b/kernel/x86_64/saxpy_microk_skylakex-2.c index 950f10ba2..bbe4d2bc5 100644 --- a/kernel/x86_64/saxpy_microk_skylakex-2.c +++ b/kernel/x86_64/saxpy_microk_skylakex-2.c @@ -26,7 +26,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
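/* Editor's note (not part of the patch): in nrm2.S / nrm2_sse.S above, "testq INCX, INCX"
 * followed by "jle .L999" takes the early exit for INCX <= 0, so a negative stride made the
 * routine return without looking at the data; "je .L999" bails out only when the stride is
 * exactly zero and lets negative strides fall through to the normal stride setup
 * ("salq $BASE_SHIFT, INCX").  A minimal C analogue of the intended check follows; it is
 * illustrative only, not the kernel itself. */

#include <math.h>

static double nrm2_sketch(long n, const double *x, long incx)
{
    double sum = 0.0;
    if (n <= 0 || incx == 0)          /* mirrors "je .L999": only a zero stride exits early */
        return 0.0;
    if (incx < 0)                     /* BLAS convention: a negative increment visits the   */
        x += (n - 1) * (-incx);       /* same n elements, starting from the far end         */
    for (long i = 0; i < n; i++, x += incx)
        sum += x[0] * x[0];
    return sqrt(sum);
}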
*****************************************************************************/ /* need a new enough GCC for avx512 support */ -#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) +#ifdef __NVCOMPILER +#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) +#endif +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 ) #define HAVE_KERNEL_16 1 diff --git a/kernel/x86_64/sbdot_microk_cooperlake.c b/kernel/x86_64/sbdot_microk_cooperlake.c index 2aefe46ff..ccec98e34 100644 --- a/kernel/x86_64/sbdot_microk_cooperlake.c +++ b/kernel/x86_64/sbdot_microk_cooperlake.c @@ -26,7 +26,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /* need a new enough GCC for avx512 support */ -#if (( defined(__GNUC__) && __GNUC__ >= 10 && defined(__AVX512BF16__)) || (defined(__clang__) && __clang_major__ >= 9)) +#ifdef __NVCOMPILER +#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) +#endif +#if (( defined(__GNUC__) && __GNUC__ >= 10 && defined(__AVX512BF16__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 ) #define HAVE_SBDOT_ACCL_KERNEL 1 #include "common.h" diff --git a/kernel/x86_64/sbgemv_n_microk_cooperlake.c b/kernel/x86_64/sbgemv_n_microk_cooperlake.c index d875e0d96..c87f9fa5b 100644 --- a/kernel/x86_64/sbgemv_n_microk_cooperlake.c +++ b/kernel/x86_64/sbgemv_n_microk_cooperlake.c @@ -26,7 +26,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /* need a new enough GCC for avx512 support */ -#if (( defined(__GNUC__) && __GNUC__ >= 10 && defined(__AVX512BF16__)) || (defined(__clang__) && __clang_major__ >= 9)) +#ifdef __NVCOMPILER +#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) +#endif +#if (( defined(__GNUC__) && __GNUC__ >= 10 && defined(__AVX512BF16__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 ) #define HAVE_SBGEMV_N_ACCL_KERNEL 1 #include "common.h" diff --git a/kernel/x86_64/sbgemv_t_microk_cooperlake.c b/kernel/x86_64/sbgemv_t_microk_cooperlake.c index 23da2e809..5b7a2e147 100644 --- a/kernel/x86_64/sbgemv_t_microk_cooperlake.c +++ b/kernel/x86_64/sbgemv_t_microk_cooperlake.c @@ -26,7 +26,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /* need a new enough GCC for avx512 support */ -#if (( defined(__GNUC__) && __GNUC__ >= 10 && defined(__AVX512BF16__)) || (defined(__clang__) && __clang_major__ >= 9)) +#ifdef __NVCOMPILER +#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) +#endif +#if (( defined(__GNUC__) && __GNUC__ >= 10 && defined(__AVX512BF16__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 ) #define HAVE_SBGEMV_T_ACCL_KERNEL 1 diff --git a/kernel/x86_64/sdot_microk_skylakex-2.c b/kernel/x86_64/sdot_microk_skylakex-2.c index 1fcb7f27c..f14632f94 100644 --- a/kernel/x86_64/sdot_microk_skylakex-2.c +++ b/kernel/x86_64/sdot_microk_skylakex-2.c @@ -26,7 +26,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
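/* Editor's note (not part of the patch): the HAVE_* macros guarded above (HAVE_KERNEL_16,
 * HAVE_SBDOT_ACCL_KERNEL, HAVE_SBGEMV_N_ACCL_KERNEL, ...) are how these micro-kernel files
 * signal to the generic driver that an accelerated path was actually compiled: the driver
 * includes the micro-kernel source and supplies a portable fallback when the macro never got
 * defined.  Rough shape of that arrangement, with invented file and function names (a sketch
 * of the convention, not code from this patch): */

/* --- accl_microk_example.c : defines the fast path only when the compiler can build it --- */
#if defined(__AVX512BF16__)
#define HAVE_ACCL_KERNEL 1
static void copy_accl(long n, const float *x, float *y)
{
    for (long i = 0; i < n; i++) y[i] = x[i];   /* placeholder for the intrinsics loop */
}
#endif

/* --- driver_example.c : always compiles; uses the fast path only if the gate fired --- */
#include "accl_microk_example.c"
#ifndef HAVE_ACCL_KERNEL
static void copy_accl(long n, const float *x, float *y)
{
    for (long i = 0; i < n; i++) y[i] = x[i];   /* portable fallback */
}
#endif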
*****************************************************************************/ /* need a new enough GCC for avx512 support */ -#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) +#ifdef __NVCOMPILER +#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) +#endif +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 ) #define HAVE_KERNEL_16 1 diff --git a/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c b/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c index 2366fe3aa..6f4309c30 100644 --- a/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c +++ b/kernel/x86_64/sgemm_small_kernel_nn_skylakex.c @@ -24,7 +24,11 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) +#ifdef __NVCOMPILER +#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) +#endif +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 ) + #include #include "common.h" diff --git a/kernel/x86_64/sgemm_small_kernel_tn_skylakex.c b/kernel/x86_64/sgemm_small_kernel_tn_skylakex.c index 308f5e35e..987b090ba 100644 --- a/kernel/x86_64/sgemm_small_kernel_tn_skylakex.c +++ b/kernel/x86_64/sgemm_small_kernel_tn_skylakex.c @@ -24,7 +24,11 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) +#ifdef __NVCOMPILER +#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) +#endif +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 ) + #include #include "common.h" diff --git a/kernel/x86_64/sgemv_n_microk_skylakex-8.c b/kernel/x86_64/sgemv_n_microk_skylakex-8.c index fba9cedcd..199621712 100644 --- a/kernel/x86_64/sgemv_n_microk_skylakex-8.c +++ b/kernel/x86_64/sgemv_n_microk_skylakex-8.c @@ -26,7 +26,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
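/* Editor's note (not part of the patch): the small-kernel guards above hinge on which ISA
 * macros the compiler predefines; __AVX2__, __AVX512CD__ and __AVX512BF16__ come from the
 * -m / -march flags in effect.  A throwaway probe like the following, built with the same
 * flags as the build under investigation, shows directly which branch of those #if chains
 * will be taken (illustrative helper only): */

#include <stdio.h>

int main(void)
{
#ifdef __AVX2__
    puts("__AVX2__ is defined");
#endif
#ifdef __AVX512CD__
    puts("__AVX512CD__ is defined");
#endif
#ifdef __AVX512BF16__
    puts("__AVX512BF16__ is defined");
#endif
#if defined(__GNUC__) && !defined(__clang__)
    printf("GCC %d.%d\n", __GNUC__, __GNUC_MINOR__);
#endif
#ifdef __clang_major__
    printf("clang major version %d\n", __clang_major__);
#endif
#ifdef __NVCOMPILER
    printf("NVHPC %d.%d\n", __NVCOMPILER_MAJOR__, __NVCOMPILER_MINOR__);
#endif
    return 0;
}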
*****************************************************************************/ /* need a new enough GCC for avx512 support */ -#if (( defined(__GNUC__) && __GNUC__ >= 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 6)) +#ifdef __NVCOMPILER +#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) +#endif +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 ) + #define HAVE_SGEMV_N_SKYLAKE_KERNEL 1 #include "common.h" @@ -255,4 +259,4 @@ static int sgemv_kernel_n_64(BLASLONG m, BLASLONG n, float alpha, float *a, BLAS } -#endif \ No newline at end of file +#endif diff --git a/kernel/x86_64/sgemv_t_microk_skylakex.c b/kernel/x86_64/sgemv_t_microk_skylakex.c index dca12acfc..d4f675a1e 100644 --- a/kernel/x86_64/sgemv_t_microk_skylakex.c +++ b/kernel/x86_64/sgemv_t_microk_skylakex.c @@ -26,7 +26,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /* need a new enough GCC for avx512 support */ -#if (( defined(__GNUC__) && __GNUC__ >= 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 6)) +#ifdef __NVCOMPILER +#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) +#endif +#if (( defined(__GNUC__) && __GNUC__ >= 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 ) #define HAVE_SGEMV_T_SKYLAKE_KERNEL 1 #include "common.h" diff --git a/kernel/x86_64/srot_microk_skylakex-2.c b/kernel/x86_64/srot_microk_skylakex-2.c index a21d1cf64..aec25ac56 100644 --- a/kernel/x86_64/srot_microk_skylakex-2.c +++ b/kernel/x86_64/srot_microk_skylakex-2.c @@ -1,5 +1,8 @@ /* need a new enough GCC for avx512 support */ -#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) +#ifdef __NVCOMPILER +#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) +#endif +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 ) #define HAVE_SROT_KERNEL 1 diff --git a/kernel/x86_64/sscal_microk_skylakex-2.c b/kernel/x86_64/sscal_microk_skylakex-2.c index c4fa160f0..5c13cba55 100644 --- a/kernel/x86_64/sscal_microk_skylakex-2.c +++ b/kernel/x86_64/sscal_microk_skylakex-2.c @@ -26,7 +26,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ /* need a new enough GCC for avx512 support */ -#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) +#ifdef __NVCOMPILER +#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) +#endif +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 ) + #include diff --git a/kernel/x86_64/stobf16_microk_cooperlake.c b/kernel/x86_64/stobf16_microk_cooperlake.c index 2756a6934..e7d20ddfa 100644 --- a/kernel/x86_64/stobf16_microk_cooperlake.c +++ b/kernel/x86_64/stobf16_microk_cooperlake.c @@ -26,7 +26,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/ /* need a new enough GCC for avx512 support */ -#if (( defined(__GNUC__) && __GNUC__ >= 10 && defined(__AVX512BF16__)) || (defined(__clang__) && __clang_major__ >= 9)) +#ifdef __NVCOMPILER +#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) +#endif +#if (( defined(__GNUC__) && __GNUC__ >= 10 && defined(__AVX512BF16__)) || (defined(__clang__) && __clang_major__ >= 9)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 ) + #define HAVE_TOBF16_ACCL_KERNEL 1 #include "common.h" diff --git a/kernel/x86_64/zasum_microk_skylakex-2.c b/kernel/x86_64/zasum_microk_skylakex-2.c index e257a5456..dddf03fe2 100644 --- a/kernel/x86_64/zasum_microk_skylakex-2.c +++ b/kernel/x86_64/zasum_microk_skylakex-2.c @@ -1,5 +1,10 @@ /* need a new enough GCC for avx512 support */ -#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) +#ifdef __NVCOMPILER +#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) +#endif +#if ((( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && ( __clang_major__ >= 9 && __clang_major__ != 17)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2309))) + +#if (!(defined(__NVCOMPILER) && NVCOMPVERS < 2309)) #define HAVE_ZASUM_KERNEL 1 @@ -338,3 +343,4 @@ static FLOAT zasum_kernel(BLASLONG n, FLOAT *x) return sumf; } #endif +#endif diff --git a/kernel/x86_64/zaxpy_sse2.S b/kernel/x86_64/zaxpy_sse2.S index a7dd054fb..3776c8910 100644 --- a/kernel/x86_64/zaxpy_sse2.S +++ b/kernel/x86_64/zaxpy_sse2.S @@ -1418,10 +1418,10 @@ movq M, %rax //If incx==0 || incy==0, avoid unloop and jump to end. cmpq $0, INCX - je .L58 + jne .L59 cmpq $0, INCY je .L58 - +.L59: sarq $3, %rax jle .L55 diff --git a/kernel/x86_64/znrm2.S b/kernel/x86_64/znrm2.S index 0d2aa3480..748fde310 100644 --- a/kernel/x86_64/znrm2.S +++ b/kernel/x86_64/znrm2.S @@ -58,7 +58,7 @@ testq M, M jle .L999 testq INCX, INCX - jle .L999 + je .L999 salq $ZBASE_SHIFT, INCX diff --git a/kernel/x86_64/znrm2_sse.S b/kernel/x86_64/znrm2_sse.S index f78b83f7e..2274f2e98 100644 --- a/kernel/x86_64/znrm2_sse.S +++ b/kernel/x86_64/znrm2_sse.S @@ -58,7 +58,7 @@ jle .L999 pxor %xmm1, %xmm1 testq INCX, INCX - jle .L999 + je .L999 xorq FLAG, FLAG diff --git a/kernel/x86_64/zscal_microk_skylakex-2.c b/kernel/x86_64/zscal_microk_skylakex-2.c index f9e05e333..29dc4f6df 100644 --- a/kernel/x86_64/zscal_microk_skylakex-2.c +++ b/kernel/x86_64/zscal_microk_skylakex-2.c @@ -26,7 +26,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
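/* Editor's note (not part of the patch): unlike the other gates in this section, the zasum hunk
 * above also excludes one specific compiler release (__clang_major__ != 17) and raises the NVHPC
 * floor to 23.9 (NVCOMPVERS >= 2309); the extra nested #if, whose matching #endif is appended at
 * the bottom of the file, re-states the NVHPC requirement inside the outer block.  A stripped-down
 * illustration of that double gate, with invented macro names (the stated reasons are a reading
 * of the guard, not documented rationale): */

#ifdef __NVCOMPILER
#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ )   /* 23.9 -> 2309 */
#endif

#if (defined(__clang__) && __clang_major__ >= 9 && __clang_major__ != 17) || \
    (defined(__NVCOMPILER) && NVCOMPVERS >= 2309)
/* outer gate: compiler versions expected to produce sound AVX-512 code for this kernel */
#if !(defined(__NVCOMPILER) && NVCOMPVERS < 2309)
/* inner gate: keeps the kernel off for older NVHPC even if the outer test is later relaxed */
#define HAVE_FAST_ZASUM 1
#endif
#endif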
*****************************************************************************/ /* need a new enough GCC for avx512 support */ -#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) +#ifdef __NVCOMPILER +#define NVCOMPVERS ( __NVCOMPILER_MAJOR__ * 100 + __NVCOMPILER_MINOR__ ) +#endif +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) || (defined(__NVCOMPILER) && NVCOMPVERS >= 2203 ) + #include diff --git a/lapack-netlib/INSTALL/lsametst.c b/lapack-netlib/INSTALL/lsametst.c index 4b46115fc..631733841 100644 --- a/lapack-netlib/INSTALL/lsametst.c +++ b/lapack-netlib/INSTALL/lsametst.c @@ -426,7 +426,7 @@ static integer c__3 = 3; /* December 2016 */ /* ===================================================================== */ -/* Main program */ main(void) +/* Main program */ int main(void) { /* Format strings */ static char fmt_9999[] = "(\002 *** Error: LSAME( \002,a1,\002, \002," diff --git a/lapack-netlib/INSTALL/secondtst.c b/lapack-netlib/INSTALL/secondtst.c index 694679bb5..03e7814e9 100644 --- a/lapack-netlib/INSTALL/secondtst.c +++ b/lapack-netlib/INSTALL/secondtst.c @@ -422,7 +422,7 @@ static integer c__1000 = 1000; /* ===================================================================== */ -/* Main program */ main(void) +/* Main program */ int main(void) { /* Format strings */ static char fmt_9999[] = "(\002 Time for \002,g10.3,\002 SAXPY ops = " diff --git a/lapack-netlib/LAPACKE/include/lapack.h b/lapack-netlib/LAPACKE/include/lapack.h index f510c8c80..28f8ad655 100644 --- a/lapack-netlib/LAPACKE/include/lapack.h +++ b/lapack-netlib/LAPACKE/include/lapack.h @@ -3323,68 +3323,107 @@ void LAPACK_zgesdd_base( #define LAPACK_zgesdd(...) LAPACK_zgesdd_base(__VA_ARGS__) #endif -#define LAPACK_cgedmd LAPACK_GLOBAL(cgedmd,CGEDMD) -void LAPACK_cgedmd( - char const* jobs, char const* jobz, char const* jobf, +#define LAPACK_cgedmd_base LAPACK_GLOBAL(cgedmd,CGEDMD) +void LAPACK_cgedmd_base( + char const* jobs, char const* jobz, char const* jobr, char const* jobf, lapack_int const* whtsvd, lapack_int const* m, lapack_int const* n, lapack_complex_float* x, lapack_int const* ldx, - lapack_complex_float* y, lapack_int const* ldy, lapack_int const* k, - lapack_complex_float* reig, lapack_complex_float* imeig, - lapack_complex_float* z, lapack_int const* ldz, lapack_complex_float* res, + lapack_complex_float* y, lapack_int const* ldy, lapack_int const* nrnk, + const float* tol, lapack_int* k, lapack_complex_float* eigs, + lapack_complex_float* z, lapack_int const* ldz, float* res, lapack_complex_float* b, lapack_int const* ldb, lapack_complex_float* w, lapack_int const* ldw, lapack_complex_float* s, lapack_int const* lds, - lapack_complex_float* work, lapack_int const* lwork, + lapack_complex_float* zwork, lapack_int const* lzwork, + float* work, lapack_int const* lwork, lapack_int* iwork, lapack_int const* liwork, - lapack_int* info ); + lapack_int* info +#ifdef LAPACK_FORTRAN_STRLEN_END + , size_t, size_t, size_t, size_t +#endif +); +#ifdef LAPACK_FORTRAN_STRLEN_END + #define LAPACK_cgedmd(...) LAPACK_cgedmd_base(__VA_ARGS__, 1, 1, 1, 1) +#else + #define LAPACK_cgedmd(...) 
LAPACK_cgedmd_base(__VA_ARGS__) +#endif -#define LAPACK_dgedmd LAPACK_GLOBAL(dgedmd,DGEDMD) -void LAPACK_dgedmd( - char const* jobs, char const* jobz, char const* jobf, + +#define LAPACK_dgedmd_base LAPACK_GLOBAL(dgedmd,DGEDMD) +void LAPACK_dgedmd_base( + char const* jobs, char const* jobz, char const* jobr, char const* jobf, lapack_int const* whtsvd, lapack_int const* m, lapack_int const* n, double* x, lapack_int const* ldx, - double* y, lapack_int const* ldy, lapack_int const* k, - double* reig, double* imeig, + double* y, lapack_int const* ldy, lapack_int const* nrnk, + const double* tol, lapack_int* k, double* reig, double* imeig, double* z, lapack_int const* ldz, double* res, double* b, lapack_int const* ldb, double* w, lapack_int const* ldw, double* s, lapack_int const* lds, double* work, lapack_int const* lwork, lapack_int* iwork, lapack_int const* liwork, - lapack_int* info ); + lapack_int* info +#ifdef LAPACK_FORTRAN_STRLEN_END + , size_t, size_t, size_t, size_t +#endif +); +#ifdef LAPACK_FORTRAN_STRLEN_END + #define LAPACK_dgedmd(...) LAPACK_dgedmd_base(__VA_ARGS__, 1, 1, 1, 1) +#else + #define LAPACK_dgedmd(...) LAPACK_dgedmd_base(__VA_ARGS__) +#endif -#define LAPACK_sgedmd LAPACK_GLOBAL(sgedmd,SGEDMD) -void LAPACK_sgedmd( - char const* jobs, char const* jobz, char const* jobf, +#define LAPACK_sgedmd_base LAPACK_GLOBAL(sgedmd,SGEDMD) +void LAPACK_sgedmd_base( + char const* jobs, char const* jobz, char const* jobr, char const* jobf, lapack_int const* whtsvd, lapack_int const* m, lapack_int const* n, float* x, lapack_int const* ldx, - float* y, lapack_int const* ldy, lapack_int const* k, - float* reig, float* imeig, + float* y, lapack_int const* ldy, lapack_int const* nrnk, + const float* tol, lapack_int* k, float* reig, float *imeig, float* z, lapack_int const* ldz, float* res, float* b, lapack_int const* ldb, float* w, lapack_int const* ldw, float* s, lapack_int const* lds, float* work, lapack_int const* lwork, lapack_int* iwork, lapack_int const* liwork, - lapack_int* info ); + lapack_int* info +#ifdef LAPACK_FORTRAN_STRLEN_END + , size_t, size_t, size_t, size_t +#endif +); +#ifdef LAPACK_FORTRAN_STRLEN_END + #define LAPACK_sgedmd(...) LAPACK_sgedmd_base(__VA_ARGS__, 1, 1, 1, 1) +#else + #define LAPACK_sgedmd(...) 
LAPACK_sgedmd_base(__VA_ARGS__) +#endif -#define LAPACK_zgedmd LAPACK_GLOBAL(zgedmd,ZGEDMD) -void LAPACK_zgedmd( - char const* jobs, char const* jobz, char const* jobf, +#define LAPACK_zgedmd_base LAPACK_GLOBAL(zgedmd,ZGEDMD) +void LAPACK_zgedmd_base( + char const* jobs, char const* jobz, char const* jobr, char const* jobf, lapack_int const* whtsvd, lapack_int const* m, lapack_int const* n, lapack_complex_double* x, lapack_int const* ldx, - lapack_complex_double* y, lapack_int const* ldy, lapack_int const* k, - lapack_complex_double* reig, lapack_complex_double* imeig, - lapack_complex_double* z, lapack_int const* ldz, lapack_complex_double* res, + lapack_complex_double* y, lapack_int const* ldy, lapack_int const* nrnk, + const double* tol, lapack_int *k, lapack_complex_double* eigs, + lapack_complex_double* z, lapack_int const* ldz, double* res, lapack_complex_double* b, lapack_int const* ldb, lapack_complex_double* w, lapack_int const* ldw, lapack_complex_double* s, lapack_int const* lds, - lapack_complex_double* work, lapack_int const* lwork, + lapack_complex_double* zwork, lapack_int const* lzwork, + double* rwork, lapack_int const* lrwork, lapack_int* iwork, lapack_int const* liwork, - lapack_int* info ); + lapack_int* info +#ifdef LAPACK_FORTRAN_STRLEN_END + , size_t, size_t, size_t, size_t +#endif +); +#ifdef LAPACK_FORTRAN_STRLEN_END + #define LAPACK_zgedmd(...) LAPACK_zgedmd_base(__VA_ARGS__, 1, 1, 1, 1) +#else + #define LAPACK_zgedmd(...) LAPACK_zgedmd_base(__VA_ARGS__) +#endif -#define LAPACK_cgedmdq LAPACK_GLOBAL(cgedmdq,CGEDMDQ) -void LAPACK_cgedmdq( +#define LAPACK_cgedmdq_base LAPACK_GLOBAL(cgedmdq,CGEDMDQ) +void LAPACK_cgedmdq_base( char const* jobs, char const* jobz, char const* jobr, char const* jobq, char const* jobt, char const* jobf, lapack_int const* whtsvd, lapack_int const* m, lapack_int const* n, @@ -3392,35 +3431,54 @@ void LAPACK_cgedmdq( lapack_complex_float* x, lapack_int const* ldx, lapack_complex_float* y, lapack_int const* ldy, lapack_int const* nrnk, float const* tol, lapack_int const* k, - lapack_complex_float* reig, lapack_complex_float* imeig, - lapack_complex_float* z, lapack_int const* ldz, lapack_complex_float* res, + lapack_complex_float* eigs, + lapack_complex_float* z, lapack_int const* ldz, float* res, lapack_complex_float* b, lapack_int const* ldb, lapack_complex_float* v, lapack_int const* ldv, lapack_complex_float* s, lapack_int const* lds, - lapack_complex_float* work, lapack_int const* lwork, + lapack_complex_float* zwork, lapack_int const* lzwork, + float* work, lapack_int const* lwork, lapack_int* iwork, lapack_int const* liwork, - lapack_int* info ); + lapack_int* info +#ifdef LAPACK_FORTRAN_STRLEN_END + , size_t, size_t, size_t, size_t, size_t, size_t +#endif +); +#ifdef LAPACK_FORTRAN_STRLEN_END + #define LAPACK_cgedmdq(...) LAPACK_cgedmdq_base(__VA_ARGS__, 1, 1, 1, 1, 1, 1) +#else + #define LAPACK_cgedmdq(...) 
LAPACK_cgedmdq_base(__VA_ARGS__) +#endif -#define LAPACK_dgedmdq LAPACK_GLOBAL(dgedmdq,DGEDMDQ) -void LAPACK_dgedmdq( +#define LAPACK_dgedmdq_base LAPACK_GLOBAL(dgedmdq,DGEDMDQ) +void LAPACK_dgedmdq_base( char const* jobs, char const* jobz, char const* jobr, char const* jobq, char const* jobt, char const* jobf, lapack_int const* whtsvd, lapack_int const* m, lapack_int const* n, double* f, lapack_int const* ldf, double* x, lapack_int const* ldx, double* y, lapack_int const* ldy, lapack_int const* nrnk, - double const* tol, lapack_int const* k, - double* reig, double* imeig, + double const* tol, lapack_int* k, + double* reig, double *imeig, double* z, lapack_int const* ldz, double* res, double* b, lapack_int const* ldb, double* v, lapack_int const* ldv, double* s, lapack_int const* lds, double* work, lapack_int const* lwork, lapack_int* iwork, lapack_int const* liwork, - lapack_int* info ); + lapack_int* info +#ifdef LAPACK_FORTRAN_STRLEN_END + , size_t, size_t, size_t, size_t, size_t, size_t +#endif +); +#ifdef LAPACK_FORTRAN_STRLEN_END + #define LAPACK_dgedmdq(...) LAPACK_dgedmdq_base(__VA_ARGS__, 1, 1, 1, 1, 1, 1) +#else + #define LAPACK_dgedmdq(...) LAPACK_dgedmdq_base(__VA_ARGS__) +#endif -#define LAPACK_sgedmdq LAPACK_GLOBAL(sgedmdq,SGEDMDQ) -void LAPACK_sgedmdq( +#define LAPACK_sgedmdq_base LAPACK_GLOBAL(sgedmdq,SGEDMDQ) +void LAPACK_sgedmdq_base( char const* jobs, char const* jobz, char const* jobr, char const* jobq, char const* jobt, char const* jobf, lapack_int const* whtsvd, lapack_int const* m, lapack_int const* n, @@ -3435,10 +3493,19 @@ void LAPACK_sgedmdq( float* s, lapack_int const* lds, float* work, lapack_int const* lwork, lapack_int* iwork, lapack_int const* liwork, - lapack_int* info ); + lapack_int* info +#ifdef LAPACK_FORTRAN_STRLEN_END + , size_t, size_t, size_t, size_t, size_t, size_t +#endif +); +#ifdef LAPACK_FORTRAN_STRLEN_END + #define LAPACK_sgedmdq(...) LAPACK_sgedmdq_base(__VA_ARGS__, 1, 1, 1, 1, 1, 1) +#else + #define LAPACK_sgedmdq(...) LAPACK_sgedmdq_base(__VA_ARGS__) +#endif -#define LAPACK_zgedmdq LAPACK_GLOBAL(zgedmdq,ZGEDMDQ) -void LAPACK_zgedmdq( +#define LAPACK_zgedmdq_base LAPACK_GLOBAL(zgedmdq,ZGEDMDQ) +void LAPACK_zgedmdq_base( char const* jobs, char const* jobz, char const* jobr, char const* jobq, char const* jobt, char const* jobf, lapack_int const* whtsvd, lapack_int const* m, lapack_int const* n, @@ -3446,14 +3513,25 @@ void LAPACK_zgedmdq( lapack_complex_double* x, lapack_int const* ldx, lapack_complex_double* y, lapack_int const* ldy, lapack_int const* nrnk, double const* tol, lapack_int const* k, - lapack_complex_double* reig, lapack_complex_double* imeig, - lapack_complex_double* z, lapack_int const* ldz, lapack_complex_double* res, + lapack_complex_double* eigs, + lapack_complex_double* z, lapack_int const* ldz, double* res, lapack_complex_double* b, lapack_int const* ldb, lapack_complex_double* v, lapack_int const* ldv, lapack_complex_double* s, lapack_int const* lds, - lapack_complex_double* work, lapack_int const* lwork, + lapack_complex_double* zwork, lapack_int const* lzwork, + double* work, lapack_int const* lwork, lapack_int* iwork, lapack_int const* liwork, - lapack_int* info ); + lapack_int* info + +#ifdef LAPACK_FORTRAN_STRLEN_END + , size_t, size_t, size_t, size_t, size_t, size_t +#endif +); +#ifdef LAPACK_FORTRAN_STRLEN_END + #define LAPACK_zgedmdq(...) LAPACK_zgedmdq_base(__VA_ARGS__, 1, 1, 1, 1, 1, 1) +#else + #define LAPACK_zgedmdq(...) 
LAPACK_zgedmdq_base(__VA_ARGS__) +#endif #define LAPACK_cgesv LAPACK_GLOBAL(cgesv,CGESV) lapack_int LAPACK_cgesv( @@ -21649,7 +21727,7 @@ void LAPACK_ztrevc_base( #endif ); #ifdef LAPACK_FORTRAN_STRLEN_END - #define LAPACK_ztrevc(...) LAPACK_ztrevc_base(__VA_ARGS__, 1, 1) + #define LAPACK_ztrevc(...) LAPACK_ztrevc_base(__VA_ARGS__, (size_t)1, 1) #else #define LAPACK_ztrevc(...) LAPACK_ztrevc_base(__VA_ARGS__) #endif diff --git a/lapack-netlib/LAPACKE/include/lapacke.h b/lapack-netlib/LAPACKE/include/lapacke.h index 9a9ab4753..377e2a6bb 100644 --- a/lapack-netlib/LAPACKE/include/lapacke.h +++ b/lapack-netlib/LAPACKE/include/lapacke.h @@ -5713,55 +5713,59 @@ lapack_int LAPACKE_zgesdd_work( int matrix_layout, char jobz, lapack_int m, double* rwork, lapack_int* iwork ); lapack_int LAPACKE_sgedmd_work( int matrix_layout, char jobs, char jobz, - char jobf, lapack_int whtsvd, lapack_int m, - lapack_int n, float* x, lapack_int ldx, - float* y, lapack_int ldy, lapack_int k, - float* reig, float* imeig, float* z, - lapack_int ldz, float* res, float* b, - lapack_int ldb, float* w, lapack_int ldw, - float* s, lapack_int lds, float* work, - lapack_int lwork, lapack_int* iwork, - lapack_int liwork ); + char jobr, char jobf, lapack_int whtsvd, + lapack_int m, lapack_int n, float* x, + lapack_int ldx, float* y, lapack_int ldy, + lapack_int nrnk, float* tol, lapack_int k, + float* reig, float* imeig, + float* z, lapack_int ldz, float* res, + float* b, lapack_int ldb, float* w, + lapack_int ldw, float* s, lapack_int lds, + float* work, lapack_int lwork, + lapack_int* iwork, lapack_int liwork ); lapack_int LAPACKE_dgedmd_work( int matrix_layout, char jobs, char jobz, - char jobf, lapack_int whtsvd, lapack_int m, - lapack_int n, double* x, lapack_int ldx, - double* y, lapack_int ldy, lapack_int k, - double* reig, double* imeig, double* z, - lapack_int ldz, double* res, double* b, - lapack_int ldb, double* w, lapack_int ldw, - double* s, lapack_int lds, double* work, - lapack_int lwork, lapack_int* iwork, - lapack_int liwork ); + char jobr, char jobf, lapack_int whtsvd, + lapack_int m, lapack_int n, double* x, + lapack_int ldx, double* y, lapack_int ldy, + lapack_int nrnk, double* tol, lapack_int k, + double* reig, double *imeig, + double* z, lapack_int ldz, double* res, + double* b, lapack_int ldb, double* w, + lapack_int ldw, double* s, lapack_int lds, + double* work, lapack_int lwork, + lapack_int* iwork, lapack_int liwork ); lapack_int LAPACKE_cgedmd_work( int matrix_layout, char jobs, char jobz, - char jobf, lapack_int whtsvd, lapack_int m, - lapack_int n, lapack_complex_float* x, - lapack_int ldx, lapack_complex_float* y, - lapack_int ldy, lapack_int k, - lapack_complex_float* reig, - lapack_complex_float* imeig, + char jobr, char jobf, lapack_int whtsvd, + lapack_int m, lapack_int n, + lapack_complex_float* x, lapack_int ldx, + lapack_complex_float* y, lapack_int ldy, + lapack_int nrnk, float* tol, lapack_int k, + lapack_complex_float* eigs, lapack_complex_float* z, lapack_int ldz, - lapack_complex_float* res, + float* res, lapack_complex_float* b, lapack_int ldb, lapack_complex_float* w, lapack_int ldw, lapack_complex_float* s, lapack_int lds, - lapack_complex_float* work, lapack_int lwork, + lapack_complex_float* zwork, lapack_int lzwork, + float* work, lapack_int lwork, lapack_int* iwork, lapack_int liwork ); lapack_int LAPACKE_zgedmd_work( int matrix_layout, char jobs, char jobz, - char jobf, lapack_int whtsvd, lapack_int m, - lapack_int n, lapack_complex_double* x, - lapack_int ldx, 
lapack_complex_double* y, - lapack_int ldy, lapack_int k, - lapack_complex_double* reig, - lapack_complex_double* imeig, + char jobr, char jobf, lapack_int whtsvd, + lapack_int m, lapack_int n, + lapack_complex_double* x, lapack_int ldx, + lapack_complex_double* y, lapack_int ldy, + lapack_int nrnk, double* tol, lapack_int k, + lapack_complex_double* eigs, lapack_complex_double* z, lapack_int ldz, - lapack_complex_double* res, + double* res, lapack_complex_double* b, lapack_int ldb, lapack_complex_double* w, lapack_int ldw, lapack_complex_double* s, lapack_int lds, - lapack_complex_double* work, lapack_int lwork, + lapack_complex_double* zwork, lapack_int lzwork, + double* work, lapack_int lwork, lapack_int* iwork, lapack_int liwork ); lapack_int LAPACKE_sgedmdq_work( int matrix_layout, char jobs, char jobz, @@ -5769,8 +5773,8 @@ lapack_int LAPACKE_sgedmdq_work( int matrix_layout, char jobs, char jobz, lapack_int whtsvd, lapack_int m, lapack_int n, float* f, lapack_int ldf, float* x, lapack_int ldx, float* y, lapack_int ldy, - lapack_int nrnk, float tol, lapack_int k, - float* reig, float* imeig, float* z, + lapack_int nrnk, float* tol, lapack_int k, + float* reig, float *imeig, float* z, lapack_int ldz, float* res, float* b, lapack_int ldb, float* v, lapack_int ldv, float* s, lapack_int lds, float* work, @@ -5782,8 +5786,8 @@ lapack_int LAPACKE_dgedmdq_work( int matrix_layout, char jobs, char jobz, lapack_int whtsvd, lapack_int m, lapack_int n, double* f, lapack_int ldf, double* x, lapack_int ldx, double* y, lapack_int ldy, - lapack_int nrnk, double tol, lapack_int k, - double* reig, double* imeig, double* z, + lapack_int nrnk, double* tol, lapack_int k, + double* reig, double* imeig, double* z, lapack_int ldz, double* res, double* b, lapack_int ldb, double* v, lapack_int ldv, double* s, lapack_int lds, double* work, @@ -5796,17 +5800,16 @@ lapack_int LAPACKE_cgedmdq_work( int matrix_layout, char jobs, char jobz, lapack_complex_float* f, lapack_int ldf, lapack_complex_float* x, lapack_int ldx, lapack_complex_float* y, lapack_int ldy, - lapack_int nrnk, float tol, lapack_int k, - lapack_complex_float* reig, - lapack_complex_float* imeig, + lapack_int nrnk, float* tol, lapack_int k, + lapack_complex_float* eigs, lapack_complex_float* z, lapack_int ldz, - lapack_complex_float* res, + float* res, lapack_complex_float* b, lapack_int ldb, lapack_complex_float* v, lapack_int ldv, lapack_complex_float* s, lapack_int lds, - lapack_complex_float* work, lapack_int lwork, - lapack_int* iwork, - lapack_int liwork ); + lapack_complex_float* zwork, lapack_int lzwork, + float* work, lapack_int lwork, + lapack_int* iwork, lapack_int liwork); lapack_int LAPACKE_zgedmdq_work( int matrix_layout, char jobs, char jobz, char jobr, char jobq, char jobt, char jobf, @@ -5814,17 +5817,16 @@ lapack_int LAPACKE_zgedmdq_work( int matrix_layout, char jobs, char jobz, lapack_complex_double* f, lapack_int ldf, lapack_complex_double* x, lapack_int ldx, lapack_complex_double* y, lapack_int ldy, - lapack_int nrnk, double tol, lapack_int k, - lapack_complex_double* reig, - lapack_complex_double* imeig, + lapack_int nrnk, double* tol, lapack_int k, + lapack_complex_double* eigs, lapack_complex_double* z, lapack_int ldz, - lapack_complex_double* res, + double* res, lapack_complex_double* b, lapack_int ldb, lapack_complex_double* v, lapack_int ldv, lapack_complex_double* s, lapack_int lds, - lapack_complex_double* work, lapack_int lwork, - lapack_int* iwork, - lapack_int liwork ); + lapack_complex_double* zwork, lapack_int 
lzwork, + double* work, lapack_int lwork, + lapack_int* iwork, lapack_int liwork); lapack_int LAPACKE_sgesv_work( int matrix_layout, lapack_int n, lapack_int nrhs, float* a, lapack_int lda, lapack_int* ipiv, diff --git a/lapack-netlib/LAPACKE/src/lapacke_cgedmd.c b/lapack-netlib/LAPACKE/src/lapacke_cgedmd.c index a269b0daf..6c77e199e 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_cgedmd.c +++ b/lapack-netlib/LAPACKE/src/lapacke_cgedmd.c @@ -32,22 +32,26 @@ #include "lapacke_utils.h" -lapack_int LAPACKE_cgedmd( int matrix_layout, char jobs, char jobz, char jobf, - lapack_int whtsvd, lapack_int m, lapack_int n, - lapack_complex_float* x, lapack_int ldx, - lapack_complex_float* y, lapack_int ldy, lapack_int k, - lapack_complex_float* reig, lapack_complex_float* imeig, +lapack_int LAPACKE_cgedmd( int matrix_layout, char jobs, char jobz, char jobr, + char jobf, lapack_int whtsvd, lapack_int m, + lapack_int n, lapack_complex_float* x, + lapack_int ldx, lapack_complex_float* y, + lapack_int ldy, lapack_int nrnk, float* tol, + lapack_int k, lapack_complex_float* eigs, lapack_complex_float* z, lapack_int ldz, - lapack_complex_float* res, lapack_complex_float* b, + float* res, lapack_complex_float* b, lapack_int ldb, lapack_complex_float* w, lapack_int ldw, lapack_complex_float* s, lapack_int lds) { lapack_int info = 0; lapack_int lwork = -1; lapack_int liwork = -1; - lapack_complex_float* work = NULL; + lapack_int lzwork = -1; + lapack_complex_float* zwork = NULL; + float* work = NULL; lapack_int* iwork = NULL; - lapack_complex_float work_query; + lapack_complex_float zwork_query; + float work_query; lapack_int iwork_query; if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { LAPACKE_xerbla( "LAPACKE_cgedmd", -1 ); @@ -77,36 +81,44 @@ lapack_int LAPACKE_cgedmd( int matrix_layout, char jobs, char jobz, char jobf, } #endif /* Query optimal working array(s) size */ - info = LAPACKE_cgedmd_work( matrix_layout, jobs, jobz, jobf, whtsvd, m, n, - x, ldx, y, ldy, k, reig, imeig, z, ldz, res, - b, ldb, w, ldw, s, lds, &work_query, lwork, - &iwork_query, liwork ); + info = LAPACKE_cgedmd_work( matrix_layout, jobs, jobz, jobr, jobf, whtsvd, + m, n, x, ldx, y, ldy, nrnk, tol, k, eigs, z, ldz, + res, b, ldb, w, ldw, s, lds, &zwork_query, + lzwork, &work_query, lwork, &iwork_query, liwork ); if( info != 0 ) { goto exit_level_0; } + lzwork = LAPACK_C2INT( zwork_query ); lwork = LAPACK_C2INT( work_query ); liwork = iwork_query; /* Allocate memory for work arrays */ - work = (lapack_complex_float*)LAPACKE_malloc( sizeof(lapack_complex_float) * lwork ); - if( work == NULL ) { + zwork = (lapack_complex_float*)LAPACKE_malloc( sizeof(lapack_complex_float) * lzwork ); + if( zwork == NULL ) { info = LAPACK_WORK_MEMORY_ERROR; goto exit_level_0; } + work = (float*)LAPACKE_malloc( sizeof(float) * lwork ); + if( work == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_1; + } iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork ); if( iwork == NULL ) { info = LAPACK_WORK_MEMORY_ERROR; - goto exit_level_1; + goto exit_level_2; } /* Call middle-level interface */ - info = LAPACKE_cgedmd_work( matrix_layout, jobs, jobz, jobf, whtsvd, m, n, - x, ldx, y, ldy, k, reig, imeig, z, ldz, res, - b, ldb, w, ldw, s, lds, work, lwork, iwork, - liwork ); + info = LAPACKE_cgedmd_work( matrix_layout, jobs, jobz, jobr, jobf, whtsvd, + m, n, x, ldx, y, ldy, nrnk, tol, k, eigs, z, ldz, + res, b, ldb, w, ldw, s, lds, zwork, lzwork, + work, lwork, iwork, liwork ); /* Release memory and exit */ 
LAPACKE_free( iwork ); -exit_level_1: +exit_level_2: LAPACKE_free( work ); +exit_level_1: + LAPACKE_free( zwork ); exit_level_0: if( info == LAPACK_WORK_MEMORY_ERROR ) { LAPACKE_xerbla( "LAPACKE_cgedmd", info ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_cgedmd_work.c b/lapack-netlib/LAPACKE/src/lapacke_cgedmd_work.c index 534934efb..08d8b91f5 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_cgedmd_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_cgedmd_work.c @@ -33,23 +33,25 @@ #include "lapacke_utils.h" lapack_int LAPACKE_cgedmd_work( int matrix_layout, char jobs, char jobz, - char jobf, lapack_int whtsvd, lapack_int m, + char jobr, char jobf, lapack_int whtsvd, lapack_int m, lapack_int n, lapack_complex_float* x, lapack_int ldx, - lapack_complex_float* y, lapack_int ldy, lapack_int k, - lapack_complex_float* reig, lapack_complex_float* imeig, + lapack_complex_float* y, lapack_int ldy, lapack_int nrnk, + float* tol, lapack_int k, lapack_complex_float* eigs, lapack_complex_float* z, lapack_int ldz, - lapack_complex_float* res, lapack_complex_float* b, + float* res, lapack_complex_float* b, lapack_int ldb, lapack_complex_float* w, lapack_int ldw, lapack_complex_float* s, lapack_int lds, - lapack_complex_float* work, lapack_int lwork, + lapack_complex_float* zwork, lapack_int lzwork, + float* work, lapack_int lwork, lapack_int* iwork, lapack_int liwork ) { lapack_int info = 0; if( matrix_layout == LAPACK_COL_MAJOR ) { /* Call LAPACK function and adjust info */ - LAPACK_cgedmd( &jobs, &jobz, &jobf, &whtsvd, &m, &n, x, &ldx, y, &ldy, - &k, reig, imeig, z, &ldz, res, b, &ldb, w, &ldw, s, &lds, - work, &lwork, iwork, &liwork, &info ); + LAPACK_cgedmd( &jobs, &jobz, &jobr, &jobf, &whtsvd, &m, &n, x, &ldx, y, + &ldy, &nrnk, tol, &k, eigs, z, &ldz, res, b, &ldb, w, &ldw, + s, &lds, zwork, &lzwork, work, &lwork, iwork, &liwork, + &info ); if( info < 0 ) { info = info - 1; } @@ -99,9 +101,10 @@ lapack_int LAPACKE_cgedmd_work( int matrix_layout, char jobs, char jobz, } /* Query optimal working array(s) size if requested */ if( lwork == -1 ) { - LAPACK_cgedmd( &jobs, &jobz, &jobf, &whtsvd, &m, &n, x, &ldx, y, &ldy, - &k, reig, imeig, z, &ldz, res, b, &ldb, w, &ldw, s, &lds, - work, &lwork, iwork, &liwork, &info ); + LAPACK_cgedmd( &jobs, &jobz, &jobr, &jobf, &whtsvd, &m, &n, x, + &ldx, y, &ldy, &nrnk, tol, &k, eigs, z, &ldz, res, b, + &ldb, w, &ldw, s, &lds, zwork, &lzwork, + work, &lwork, iwork, &liwork, &info ); return (info < 0) ? 
(info - 1) : info; } /* Allocate memory for temporary array(s) */ @@ -143,9 +146,10 @@ lapack_int LAPACKE_cgedmd_work( int matrix_layout, char jobs, char jobz, LAPACKE_cge_trans( matrix_layout, m, n, w, ldw, w_t, ldw_t ); LAPACKE_cge_trans( matrix_layout, m, n, s, lds, s_t, lds_t ); /* Call LAPACK function and adjust info */ - LAPACK_cgedmd( &jobs, &jobz, &jobf, &whtsvd, &m, &n, x_t, &ldx_t, y_t, - &ldy_t, &k, reig, imeig, z_t, &ldz_t, res, b_t, &ldb_t, - w_t, &ldw_t, s_t, &lds_t, work, &lwork, iwork, &liwork, &info ); + LAPACK_cgedmd( &jobs, &jobz, &jobr, &jobf, &whtsvd, &m, &n, x_t, + &ldx_t, y_t, &ldy_t, &nrnk, tol, &k, eigs, z_t, &ldz_t, + res, b_t, &ldb_t, w_t, &ldw_t, s_t, &lds_t, zwork, + &lzwork, work, &lwork, iwork, &liwork, &info ); if( info < 0 ) { info = info - 1; } diff --git a/lapack-netlib/LAPACKE/src/lapacke_cgedmdq.c b/lapack-netlib/LAPACKE/src/lapacke_cgedmdq.c index 60e83729b..b0b258f97 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_cgedmdq.c +++ b/lapack-netlib/LAPACKE/src/lapacke_cgedmdq.c @@ -37,20 +37,22 @@ lapack_int LAPACKE_cgedmdq( int matrix_layout, char jobs, char jobz, char jobr, lapack_int m, lapack_int n, lapack_complex_float* f, lapack_int ldf, lapack_complex_float* x, lapack_int ldx, lapack_complex_float* y, - lapack_int ldy, lapack_int nrnk, float tol, - lapack_int k, lapack_complex_float* reig, - lapack_complex_float* imeig, + lapack_int ldy, lapack_int nrnk, float* tol, + lapack_int k, lapack_complex_float* eigs, lapack_complex_float* z, lapack_int ldz, - lapack_complex_float* res, lapack_complex_float* b, + float* res, lapack_complex_float* b, lapack_int ldb, lapack_complex_float* v, lapack_int ldv, lapack_complex_float* s, lapack_int lds) { lapack_int info = 0; lapack_int lwork = -1; lapack_int liwork = -1; - lapack_complex_float* work = NULL; + lapack_int lzwork = -1; + lapack_complex_float* zwork = NULL; + float* work = NULL; lapack_int* iwork = NULL; - lapack_complex_float work_query; + lapack_complex_float zwork_query; + float work_query; lapack_int iwork_query; if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { LAPACKE_xerbla( "LAPACKE_cgedmdq", -1 ); @@ -85,36 +87,44 @@ lapack_int LAPACKE_cgedmdq( int matrix_layout, char jobs, char jobz, char jobr, /* Query optimal working array(s) size */ info = LAPACKE_cgedmdq_work( matrix_layout, jobs, jobz, jobr, jobq, jobt, jobf, whtsvd, m, n, f, ldf, x, ldx, y, ldy, - nrnk, tol, k, reig, imeig, z, ldz, res, - b, ldb, v, ldv, s, lds, &work_query, lwork, - &iwork_query, liwork ); + nrnk, tol, k, eigs, z, ldz, res, + b, ldb, v, ldv, s, lds, &zwork_query, lzwork, + &work_query, lwork, &iwork_query, liwork ); if( info != 0 ) { goto exit_level_0; } + lzwork = LAPACK_C2INT( zwork_query ); lwork = LAPACK_C2INT( work_query ); liwork = iwork_query; /* Allocate memory for work arrays */ - work = (lapack_complex_float*)LAPACKE_malloc( sizeof(lapack_complex_float) * lwork ); - if( work == NULL ) { + zwork = (lapack_complex_float*)LAPACKE_malloc( sizeof(lapack_complex_float) * lzwork ); + if( zwork == NULL ) { info = LAPACK_WORK_MEMORY_ERROR; goto exit_level_0; } + work = (float*)LAPACKE_malloc( sizeof(lapack_complex_float) * lwork ); + if( work == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_1; + } iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork ); if( iwork == NULL ) { info = LAPACK_WORK_MEMORY_ERROR; - goto exit_level_1; + goto exit_level_2; } /* Call middle-level interface */ info = LAPACKE_cgedmdq_work( matrix_layout, jobs, jobz, jobr, jobq, jobt, jobf, 
whtsvd, m, n, f, ldf, x, ldx, y, ldy, - nrnk, tol, k, reig, imeig, z, ldz, res, - b, ldb, v, ldv, s, lds, work, lwork, iwork, - liwork ); + nrnk, tol, k, eigs, z, ldz, res, + b, ldb, v, ldv, s, lds, zwork, lzwork, + work, lwork, iwork, liwork ); /* Release memory and exit */ LAPACKE_free( iwork ); -exit_level_1: +exit_level_2: LAPACKE_free( work ); +exit_level_1: + LAPACKE_free( zwork ); exit_level_0: if( info == LAPACK_WORK_MEMORY_ERROR ) { LAPACKE_xerbla( "LAPACKE_cgedmdq", info ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_cgedmdq_work.c b/lapack-netlib/LAPACKE/src/lapacke_cgedmdq_work.c index 5bdbd3f56..05287c1bc 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_cgedmdq_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_cgedmdq_work.c @@ -38,15 +38,15 @@ lapack_int LAPACKE_cgedmdq_work( int matrix_layout, char jobs, char jobz, lapack_complex_float* f, lapack_int ldf, lapack_complex_float* x, lapack_int ldx, lapack_complex_float* y, lapack_int ldy, - lapack_int nrnk, float tol, lapack_int k, - lapack_complex_float* reig, - lapack_complex_float* imeig, + lapack_int nrnk, float* tol, lapack_int k, + lapack_complex_float* eigs, lapack_complex_float* z, - lapack_int ldz, lapack_complex_float* res, + lapack_int ldz, float* res, lapack_complex_float* b, lapack_int ldb, lapack_complex_float* v, lapack_int ldv, lapack_complex_float* s, - lapack_int lds, lapack_complex_float* work, + lapack_int lds, lapack_complex_float *zwork, + lapack_int lzwork, float* work, lapack_int lwork, lapack_int* iwork, lapack_int liwork ) { @@ -54,9 +54,9 @@ lapack_int LAPACKE_cgedmdq_work( int matrix_layout, char jobs, char jobz, if( matrix_layout == LAPACK_COL_MAJOR ) { /* Call LAPACK function and adjust info */ LAPACK_cgedmdq( &jobs, &jobz, &jobr, &jobq, &jobt, &jobf, &whtsvd, &m, - &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, &tol, &k, reig, - imeig, z, &ldz, res, b, &ldb, v, &ldv, s, &lds, - work, &lwork, iwork, &liwork, &info ); + &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, tol, &k, eigs, + z, &ldz, res, b, &ldb, v, &ldv, s, &lds, + zwork, &lzwork, work, &lwork, iwork, &liwork, &info ); if( info < 0 ) { info = info - 1; } @@ -112,11 +112,11 @@ lapack_int LAPACKE_cgedmdq_work( int matrix_layout, char jobs, char jobz, return info; } /* Query optimal working array(s) size if requested */ - if( lwork == -1 || liwork == -1 ) { + if( lzwork == -1 || lwork == -1 || liwork == -1 ) { LAPACK_cgedmdq( &jobs, &jobz, &jobr, &jobq, &jobt, &jobf, &whtsvd, &m, - &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, &tol, &k, reig, - imeig, z, &ldz, res, b, &ldb, v, &ldv, s, &lds, - work, &lwork, iwork, &liwork, &info ); + &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, tol, &k, eigs, + z, &ldz, res, b, &ldb, v, &ldv, s, &lds, + zwork, &lzwork, work, &lwork, iwork, &liwork, &info ); return (info < 0) ? 
(info - 1) : info; } /* Allocate memory for temporary array(s) */ @@ -165,9 +165,9 @@ lapack_int LAPACKE_cgedmdq_work( int matrix_layout, char jobs, char jobz, LAPACKE_cge_trans( matrix_layout, m, n, s, lds, s_t, lds_t ); /* Call LAPACK function and adjust info */ LAPACK_cgedmdq( &jobs, &jobz, &jobr, &jobq, &jobt, &jobf, &whtsvd, &m, - &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, &tol, &k, reig, - imeig, z, &ldz, res, b, &ldb, v, &ldv, s, &lds, - work, &lwork, iwork, &liwork, &info ); + &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, tol, &k, eigs, + z, &ldz, res, b, &ldb, v, &ldv, s, &lds, + zwork, &lzwork, work, &lwork, iwork, &liwork, &info ); if( info < 0 ) { info = info - 1; } diff --git a/lapack-netlib/LAPACKE/src/lapacke_dgedmd.c b/lapack-netlib/LAPACKE/src/lapacke_dgedmd.c index 246d7f649..6802378da 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dgedmd.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dgedmd.c @@ -32,11 +32,13 @@ #include "lapacke_utils.h" -lapack_int LAPACKE_dgedmd( int matrix_layout, char jobs, char jobz, char jobf, - lapack_int whtsvd, lapack_int m, lapack_int n, - double* x, lapack_int ldx, double* y, lapack_int ldy, - lapack_int k, double* reig, double* imeig, double* z, - lapack_int ldz, double* res, double* b, lapack_int ldb, +lapack_int LAPACKE_dgedmd( int matrix_layout, char jobs, char jobz, char jobr, + char jobf, lapack_int whtsvd, lapack_int m, + lapack_int n, double* x, lapack_int ldx, double* y, + lapack_int ldy, lapack_int nrnk, double* tol, + lapack_int k, double* reig, double* imeig, + double* z, lapack_int ldz, + double* res, double* b, lapack_int ldb, double* w, lapack_int ldw, double* s, lapack_int lds) { lapack_int info = 0; @@ -74,10 +76,10 @@ lapack_int LAPACKE_dgedmd( int matrix_layout, char jobs, char jobz, char jobf, } #endif /* Query optimal working array(s) size */ - info = LAPACKE_dgedmd_work( matrix_layout, jobs, jobz, jobf, whtsvd, m, n, - x, ldx, y, ldy, k, reig, imeig, z, ldz, res, - b, ldb, w, ldw, s, lds, &work_query, lwork, - &iwork_query, liwork ); + info = LAPACKE_dgedmd_work( matrix_layout, jobs, jobz, jobr, jobf, whtsvd, + m, n, x, ldx, y, ldy, nrnk, tol, k, reig, imeig, z, ldz, + res, b, ldb, w, ldw, s, lds, &work_query, + lwork, &iwork_query, liwork ); if( info != 0 ) { goto exit_level_0; @@ -96,10 +98,10 @@ lapack_int LAPACKE_dgedmd( int matrix_layout, char jobs, char jobz, char jobf, goto exit_level_1; } /* Call middle-level interface */ - info = LAPACKE_dgedmd_work( matrix_layout, jobs, jobz, jobf, whtsvd, m, n, - x, ldx, y, ldy, k, reig, imeig, z, ldz, res, - b, ldb, w, ldw, s, lds, work, lwork, iwork, - liwork ); + info = LAPACKE_dgedmd_work( matrix_layout, jobs, jobz, jobr, jobf, whtsvd, + m, n, x, ldx, y, ldy, nrnk, tol, k, reig, imeig, z, ldz, + res, b, ldb, w, ldw, s, lds, work, lwork, + iwork, liwork ); /* Release memory and exit */ LAPACKE_free( iwork ); exit_level_1: diff --git a/lapack-netlib/LAPACKE/src/lapacke_dgedmd_work.c b/lapack-netlib/LAPACKE/src/lapacke_dgedmd_work.c index 4d1169de9..987709a1b 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dgedmd_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dgedmd_work.c @@ -33,22 +33,23 @@ #include "lapacke_utils.h" lapack_int LAPACKE_dgedmd_work( int matrix_layout, char jobs, char jobz, - char jobf, lapack_int whtsvd, lapack_int m, - lapack_int n, double* x, lapack_int ldx, - double* y, lapack_int ldy, lapack_int k, - double* reig, double* imeig, double* z, - lapack_int ldz, double* res, double* b, - lapack_int ldb, double* w, lapack_int ldw, - double* s, lapack_int lds, double* work, - 
lapack_int lwork, lapack_int* iwork, - lapack_int liwork ) + char jobr, char jobf, lapack_int whtsvd, + lapack_int m, lapack_int n, double* x, + lapack_int ldx, double* y, lapack_int ldy, + lapack_int nrnk, double* tol, lapack_int k, + double* reig, double* imeig, + double* z, lapack_int ldz, double* res, + double* b, lapack_int ldb, double* w, + lapack_int ldw, double* s, lapack_int lds, + double* work, lapack_int lwork, + lapack_int* iwork, lapack_int liwork ) { lapack_int info = 0; if( matrix_layout == LAPACK_COL_MAJOR ) { /* Call LAPACK function and adjust info */ - LAPACK_dgedmd( &jobs, &jobz, &jobf, &whtsvd, &m, &n, x, &ldx, y, &ldy, - &k, reig, imeig, z, &ldz, res, b, &ldb, w, &ldw, s, &lds, - work, &lwork, iwork, &liwork, &info ); + LAPACK_dgedmd( &jobs, &jobz, &jobr, &jobf, &whtsvd, &m, &n, x, &ldx, y, + &ldy, &nrnk, tol, &k, reig, imeig, z, &ldz, res, b, &ldb, w, &ldw, + s, &lds, work, &lwork, iwork, &liwork, &info ); if( info < 0 ) { info = info - 1; } @@ -98,9 +99,9 @@ lapack_int LAPACKE_dgedmd_work( int matrix_layout, char jobs, char jobz, } /* Query optimal working array(s) size if requested */ if( lwork == -1 ) { - LAPACK_dgedmd( &jobs, &jobz, &jobf, &whtsvd, &m, &n, x, &ldx, y, &ldy, - &k, reig, imeig, z, &ldz, res, b, &ldb, w, &ldw, s, &lds, - work, &lwork, iwork, &liwork, &info ); + LAPACK_dgedmd( &jobs, &jobz, &jobr, &jobf, &whtsvd, &m, &n, x, &ldx, + y, &ldy, &nrnk, tol, &k, reig, imeig, z, &ldz, res, b, &ldb, w, + &ldw, s, &lds, work, &lwork, iwork, &liwork, &info ); return (info < 0) ? (info - 1) : info; } /* Allocate memory for temporary array(s) */ @@ -142,9 +143,10 @@ lapack_int LAPACKE_dgedmd_work( int matrix_layout, char jobs, char jobz, LAPACKE_dge_trans( matrix_layout, m, n, w, ldw, w_t, ldw_t ); LAPACKE_dge_trans( matrix_layout, m, n, s, lds, s_t, lds_t ); /* Call LAPACK function and adjust info */ - LAPACK_dgedmd( &jobs, &jobz, &jobf, &whtsvd, &m, &n, x_t, &ldx_t, y_t, - &ldy_t, &k, reig, imeig, z_t, &ldz_t, res, b_t, &ldb_t, - w_t, &ldw_t, s_t, &lds_t, work, &lwork, iwork, &liwork, &info ); + LAPACK_dgedmd( &jobs, &jobz, &jobr, &jobf, &whtsvd, &m, &n, x_t, &ldx_t, + y_t, &ldy_t, &nrnk, tol, &k, reig, imeig, z_t, &ldz_t, res, b_t, + &ldb_t, w_t, &ldw_t, s_t, &lds_t, work, &lwork, + iwork, &liwork, &info ); if( info < 0 ) { info = info - 1; } diff --git a/lapack-netlib/LAPACKE/src/lapacke_dgedmdq.c b/lapack-netlib/LAPACKE/src/lapacke_dgedmdq.c index f3d621ba9..5c3c39308 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dgedmdq.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dgedmdq.c @@ -36,7 +36,7 @@ lapack_int LAPACKE_dgedmdq( int matrix_layout, char jobs, char jobz, char jobr, char jobq, char jobt, char jobf, lapack_int whtsvd, lapack_int m, lapack_int n, double* f, lapack_int ldf, double* x, lapack_int ldx, double* y, lapack_int ldy, - lapack_int nrnk, double tol, lapack_int k, + lapack_int nrnk, double* tol, lapack_int k, double* reig, double* imeig, double* z, lapack_int ldz, double* res, double* b, lapack_int ldb, double* v, lapack_int ldv, double* s, lapack_int lds) diff --git a/lapack-netlib/LAPACKE/src/lapacke_dgedmdq_work.c b/lapack-netlib/LAPACKE/src/lapacke_dgedmdq_work.c index 51b2a66d8..149e6d24f 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dgedmdq_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dgedmdq_work.c @@ -37,7 +37,7 @@ lapack_int LAPACKE_dgedmdq_work( int matrix_layout, char jobs, char jobz, lapack_int whtsvd, lapack_int m, lapack_int n, double* f, lapack_int ldf, double* x, lapack_int ldx, double* y, lapack_int ldy, - lapack_int nrnk, double 
tol, lapack_int k, + lapack_int nrnk, double* tol, lapack_int k, double* reig, double* imeig, double* z, lapack_int ldz, double* res, double* b, lapack_int ldb, double* v, lapack_int ldv, @@ -49,8 +49,8 @@ lapack_int LAPACKE_dgedmdq_work( int matrix_layout, char jobs, char jobz, if( matrix_layout == LAPACK_COL_MAJOR ) { /* Call LAPACK function and adjust info */ LAPACK_dgedmdq( &jobs, &jobz, &jobr, &jobq, &jobt, &jobf, &whtsvd, &m, - &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, &tol, &k, reig, - imeig, z, &ldz, res, b, &ldb, v, &ldv, s, &lds, + &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, tol, &k, reig, imeig, + z, &ldz, res, b, &ldb, v, &ldv, s, &lds, work, &lwork, iwork, &liwork, &info ); if( info < 0 ) { info = info - 1; @@ -109,8 +109,8 @@ lapack_int LAPACKE_dgedmdq_work( int matrix_layout, char jobs, char jobz, /* Query optimal working array(s) size if requested */ if( lwork == -1 || liwork == -1 ) { LAPACK_dgedmdq( &jobs, &jobz, &jobr, &jobq, &jobt, &jobf, &whtsvd, &m, - &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, &tol, &k, reig, - imeig, z, &ldz, res, b, &ldb, v, &ldv, s, &lds, + &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, tol, &k, reig, imeig, + z, &ldz, res, b, &ldb, v, &ldv, s, &lds, work, &lwork, iwork, &liwork, &info ); return (info < 0) ? (info - 1) : info; } @@ -160,8 +160,8 @@ lapack_int LAPACKE_dgedmdq_work( int matrix_layout, char jobs, char jobz, LAPACKE_dge_trans( matrix_layout, m, n, s, lds, s_t, lds_t ); /* Call LAPACK function and adjust info */ LAPACK_dgedmdq( &jobs, &jobz, &jobr, &jobq, &jobt, &jobf, &whtsvd, &m, - &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, &tol, &k, reig, - imeig, z, &ldz, res, b, &ldb, v, &ldv, s, &lds, + &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, tol, &k, reig, imeig, + z, &ldz, res, b, &ldb, v, &ldv, s, &lds, work, &lwork, iwork, &liwork, &info ); if( info < 0 ) { info = info - 1; diff --git a/lapack-netlib/LAPACKE/src/lapacke_sgedmd.c b/lapack-netlib/LAPACKE/src/lapacke_sgedmd.c index 879631b1d..6865fcf65 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_sgedmd.c +++ b/lapack-netlib/LAPACKE/src/lapacke_sgedmd.c @@ -32,12 +32,14 @@ #include "lapacke_utils.h" -lapack_int LAPACKE_sgedmd( int matrix_layout, char jobs, char jobz, char jobf, - lapack_int whtsvd, lapack_int m, lapack_int n, - float* x, lapack_int ldx, float* y, lapack_int ldy, - lapack_int k, float* reig, float* imeig, float* z, - lapack_int ldz, float* res, float* b, lapack_int ldb, - float* w, lapack_int ldw, float* s, lapack_int lds) +lapack_int LAPACKE_sgedmd( int matrix_layout, char jobs, char jobz, char jobr, + char jobf, lapack_int whtsvd, lapack_int m, + lapack_int n, float* x, lapack_int ldx, float* y, + lapack_int ldy, lapack_int nrnk, float* tol, + lapack_int k, float* reig, float* imeig, + float* z, lapack_int ldz, float* res, + float* b, lapack_int ldb, float* w, lapack_int ldw, + float* s, lapack_int lds) { lapack_int info = 0; lapack_int lwork = -1; @@ -74,10 +76,10 @@ lapack_int LAPACKE_sgedmd( int matrix_layout, char jobs, char jobz, char jobf, } #endif /* Query optimal working array(s) size */ - info = LAPACKE_sgedmd_work( matrix_layout, jobs, jobz, jobf, whtsvd, m, n, - x, ldx, y, ldy, k, reig, imeig, z, ldz, res, - b, ldb, w, ldw, s, lds, &work_query, lwork, - &iwork_query, liwork ); + info = LAPACKE_sgedmd_work( matrix_layout, jobs, jobz, jobr, jobf, whtsvd, + m, n, x, ldx, y, ldy, nrnk, tol, k, reig, imeig, z, ldz, + res, b, ldb, w, ldw, s, lds, &work_query, + lwork, &iwork_query, liwork ); if( info != 0 ) { goto exit_level_0; @@ -96,10 +98,10 @@ lapack_int LAPACKE_sgedmd( int matrix_layout, char 
jobs, char jobz, char jobf, goto exit_level_1; } /* Call middle-level interface */ - info = LAPACKE_sgedmd_work( matrix_layout, jobs, jobz, jobf, whtsvd, m, n, - x, ldx, y, ldy, k, reig, imeig, z, ldz, res, - b, ldb, w, ldw, s, lds, work, lwork, iwork, - liwork ); + info = LAPACKE_sgedmd_work( matrix_layout, jobs, jobz, jobr, jobf, whtsvd, + m, n, x, ldx, y, ldy, nrnk, tol, k, reig, imeig, z, ldz, + res, b, ldb, w, ldw, s, lds, work, lwork, + iwork, liwork ); /* Release memory and exit */ LAPACKE_free( iwork ); exit_level_1: diff --git a/lapack-netlib/LAPACKE/src/lapacke_sgedmd_work.c b/lapack-netlib/LAPACKE/src/lapacke_sgedmd_work.c index 762a9b271..5b24152da 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_sgedmd_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_sgedmd_work.c @@ -33,22 +33,23 @@ #include "lapacke_utils.h" lapack_int LAPACKE_sgedmd_work( int matrix_layout, char jobs, char jobz, - char jobf, lapack_int whtsvd, lapack_int m, - lapack_int n, float* x, lapack_int ldx, - float* y, lapack_int ldy, lapack_int k, - float* reig, float* imeig, float* z, - lapack_int ldz, float* res, float* b, - lapack_int ldb, float* w, lapack_int ldw, - float* s, lapack_int lds, float* work, - lapack_int lwork, lapack_int* iwork, - lapack_int liwork ) + char jobr, char jobf, lapack_int whtsvd, + lapack_int m, lapack_int n, float* x, + lapack_int ldx, float* y, lapack_int ldy, + lapack_int nrnk, float* tol, lapack_int k, + float* reig, float* imeig, + float* z, lapack_int ldz, float* res, + float* b, lapack_int ldb, float* w, + lapack_int ldw, float* s, lapack_int lds, + float* work, lapack_int lwork, + lapack_int* iwork, lapack_int liwork ) { lapack_int info = 0; if( matrix_layout == LAPACK_COL_MAJOR ) { /* Call LAPACK function and adjust info */ - LAPACK_sgedmd( &jobs, &jobz, &jobf, &whtsvd, &m, &n, x, &ldx, y, &ldy, - &k, reig, imeig, z, &ldz, res, b, &ldb, w, &ldw, s, &lds, - work, &lwork, iwork, &liwork, &info ); + LAPACK_sgedmd( &jobs, &jobz, &jobr, &jobf, &whtsvd, &m, &n, x, &ldx, y, + &ldy, &nrnk, tol, &k, reig, imeig, z, &ldz, res, b, &ldb, w, &ldw, + s, &lds, work, &lwork, iwork, &liwork, &info ); if( info < 0 ) { info = info - 1; } @@ -98,9 +99,10 @@ lapack_int LAPACKE_sgedmd_work( int matrix_layout, char jobs, char jobz, } /* Query optimal working array(s) size if requested */ if( lwork == -1 ) { - LAPACK_sgedmd( &jobs, &jobz, &jobf, &whtsvd, &m, &n, x, &ldx, y, &ldy, - &k, reig, imeig, z, &ldz, res, b, &ldb, w, &ldw, s, &lds, - work, &lwork, iwork, &liwork, &info ); + LAPACK_sgedmd( &jobs, &jobz, &jobr, &jobf, &whtsvd, &m, &n, x, + &ldx, y, &ldy, &nrnk, tol, &k, reig, imeig, z, &ldz, res, b, + &ldb, w, &ldw, s, &lds, work, &lwork, iwork, + &liwork, &info ); return (info < 0) ? 
(info - 1) : info; } /* Allocate memory for temporary array(s) */ @@ -142,9 +144,10 @@ lapack_int LAPACKE_sgedmd_work( int matrix_layout, char jobs, char jobz, LAPACKE_sge_trans( matrix_layout, m, n, w, ldw, w_t, ldw_t ); LAPACKE_sge_trans( matrix_layout, m, n, s, lds, s_t, lds_t ); /* Call LAPACK function and adjust info */ - LAPACK_sgedmd( &jobs, &jobz, &jobf, &whtsvd, &m, &n, x_t, &ldx_t, y_t, - &ldy_t, &k, reig, imeig, z_t, &ldz_t, res, b_t, &ldb_t, - w_t, &ldw_t, s_t, &lds_t, work, &lwork, iwork, &liwork, &info ); + LAPACK_sgedmd( &jobs, &jobz, &jobr, &jobf, &whtsvd, &m, &n, x_t, + &ldx_t, y_t, &ldy_t, &nrnk, tol, &k, reig, imeig, z_t, &ldz_t, + res, b_t, &ldb_t, w_t, &ldw_t, s_t, &lds_t, work, + &lwork, iwork, &liwork, &info ); if( info < 0 ) { info = info - 1; } diff --git a/lapack-netlib/LAPACKE/src/lapacke_sgedmdq.c b/lapack-netlib/LAPACKE/src/lapacke_sgedmdq.c index e202d7fbd..e65c2094f 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_sgedmdq.c +++ b/lapack-netlib/LAPACKE/src/lapacke_sgedmdq.c @@ -36,7 +36,7 @@ lapack_int LAPACKE_sgedmdq( int matrix_layout, char jobs, char jobz, char jobr, char jobq, char jobt, char jobf, lapack_int whtsvd, lapack_int m, lapack_int n, float* f, lapack_int ldf, float* x, lapack_int ldx, float* y, lapack_int ldy, - lapack_int nrnk, float tol, lapack_int k, + lapack_int nrnk, float* tol, lapack_int k, float* reig, float* imeig, float* z, lapack_int ldz, float* res, float* b, lapack_int ldb, float* v, lapack_int ldv, float* s, lapack_int lds) diff --git a/lapack-netlib/LAPACKE/src/lapacke_sgedmdq_work.c b/lapack-netlib/LAPACKE/src/lapacke_sgedmdq_work.c index 9039898d2..e1c1f5c98 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_sgedmdq_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_sgedmdq_work.c @@ -37,7 +37,7 @@ lapack_int LAPACKE_sgedmdq_work( int matrix_layout, char jobs, char jobz, lapack_int whtsvd, lapack_int m, lapack_int n, float* f, lapack_int ldf, float* x, lapack_int ldx, float* y, lapack_int ldy, - lapack_int nrnk, float tol, lapack_int k, + lapack_int nrnk, float* tol, lapack_int k, float* reig, float* imeig, float* z, lapack_int ldz, float* res, float* b, lapack_int ldb, float* v, lapack_int ldv, @@ -49,8 +49,8 @@ lapack_int LAPACKE_sgedmdq_work( int matrix_layout, char jobs, char jobz, if( matrix_layout == LAPACK_COL_MAJOR ) { /* Call LAPACK function and adjust info */ LAPACK_sgedmdq( &jobs, &jobz, &jobr, &jobq, &jobt, &jobf, &whtsvd, &m, - &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, &tol, &k, reig, - imeig, z, &ldz, res, b, &ldb, v, &ldv, s, &lds, + &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, tol, &k, reig, imeig, + z, &ldz, res, b, &ldb, v, &ldv, s, &lds, work, &lwork, iwork, &liwork, &info ); if( info < 0 ) { info = info - 1; @@ -109,8 +109,8 @@ lapack_int LAPACKE_sgedmdq_work( int matrix_layout, char jobs, char jobz, /* Query optimal working array(s) size if requested */ if( lwork == -1 || liwork == -1 ) { LAPACK_sgedmdq( &jobs, &jobz, &jobr, &jobq, &jobt, &jobf, &whtsvd, &m, - &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, &tol, &k, reig, - imeig, z, &ldz, res, b, &ldb, v, &ldv, s, &lds, + &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, tol, &k, reig, imeig, + z, &ldz, res, b, &ldb, v, &ldv, s, &lds, work, &lwork, iwork, &liwork, &info ); return (info < 0) ? 
(info - 1) : info; } @@ -160,8 +160,8 @@ lapack_int LAPACKE_sgedmdq_work( int matrix_layout, char jobs, char jobz, LAPACKE_sge_trans( matrix_layout, m, n, s, lds, s_t, lds_t ); /* Call LAPACK function and adjust info */ LAPACK_sgedmdq( &jobs, &jobz, &jobr, &jobq, &jobt, &jobf, &whtsvd, &m, - &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, &tol, &k, reig, - imeig, z, &ldz, res, b, &ldb, v, &ldv, s, &lds, + &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, tol, &k, reig, imeig, + z, &ldz, res, b, &ldb, v, &ldv, s, &lds, work, &lwork, iwork, &liwork, &info ); if( info < 0 ) { info = info - 1; diff --git a/lapack-netlib/LAPACKE/src/lapacke_zgedmd.c b/lapack-netlib/LAPACKE/src/lapacke_zgedmd.c index f3f421c54..e4ea4fe10 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zgedmd.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zgedmd.c @@ -32,24 +32,28 @@ #include "lapacke_utils.h" -lapack_int LAPACKE_zgedmd( int matrix_layout, char jobs, char jobz, char jobf, - lapack_int whtsvd, lapack_int m, lapack_int n, - lapack_complex_double* x, lapack_int ldx, - lapack_complex_double* y, lapack_int ldy, - lapack_int k, lapack_complex_double* reig, - lapack_complex_double* imeig, lapack_complex_double* z, - lapack_int ldz, lapack_complex_double* res, +lapack_int LAPACKE_zgedmd( int matrix_layout, char jobs, char jobz, char jobr, + char jobf, lapack_int whtsvd, lapack_int m, + lapack_int n, lapack_complex_double* x, + lapack_int ldx, lapack_complex_double* y, + lapack_int ldy, lapack_int nrnk, double *tol, lapack_int k, + lapack_complex_double* eigs, lapack_complex_double* z, + lapack_int ldz, double* res, lapack_complex_double* b, lapack_int ldb, + lapack_complex_double* zw, lapack_int lzw, lapack_complex_double* w, lapack_int ldw, lapack_complex_double* s, lapack_int lds) { lapack_int info = 0; lapack_int lwork = -1; lapack_int liwork = -1; - lapack_complex_double* work = NULL; + lapack_int lzwork = -1; + lapack_complex_double* zwork = NULL; + double* work = NULL; lapack_int* iwork = NULL; - lapack_complex_double work_query; + double work_query; lapack_int iwork_query; + lapack_complex_double zwork_query; if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { LAPACKE_xerbla( "LAPACKE_zgedmd", -1 ); return -1; @@ -78,36 +82,44 @@ lapack_int LAPACKE_zgedmd( int matrix_layout, char jobs, char jobz, char jobf, } #endif /* Query optimal working array(s) size */ - info = LAPACKE_zgedmd_work( matrix_layout, jobs, jobz, jobf, whtsvd, m, n, - x, ldx, y, ldy, k, reig, imeig, z, ldz, res, - b, ldb, w, ldw, s, lds, &work_query, lwork, - &iwork_query, liwork ); + info = LAPACKE_zgedmd_work( matrix_layout, jobs, jobz, jobr, jobf, whtsvd, + m, n, x, ldx, y, ldy, nrnk, tol, k, eigs, z, ldz, + res, b, ldb, w, ldw, s, lds, &zwork_query, lzwork, + &work_query, lwork, &iwork_query, liwork ); if( info != 0 ) { goto exit_level_0; } lwork = LAPACK_Z2INT( work_query ); liwork = iwork_query; + lzwork = LAPACK_Z2INT( zwork_query ); /* Allocate memory for work arrays */ - work = (lapack_complex_double*)LAPACKE_malloc( sizeof(lapack_complex_double) * lwork ); - if( work == NULL ) { + zwork = (lapack_complex_double*)LAPACKE_malloc( sizeof(lapack_complex_double) * lzwork ); + if( zwork == NULL ) { info = LAPACK_WORK_MEMORY_ERROR; goto exit_level_0; } + work = (double*)LAPACKE_malloc( sizeof(double) * lwork ); + if( work == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_1; + } iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork ); if( iwork == NULL ) { info = LAPACK_WORK_MEMORY_ERROR; - goto exit_level_1; + goto 
exit_level_2; } /* Call middle-level interface */ - info = LAPACKE_zgedmd_work( matrix_layout, jobs, jobz, jobf, whtsvd, m, n, - x, ldx, y, ldy, k, reig, imeig, z, ldz, res, - b, ldb, w, ldw, s, lds, work, lwork, iwork, - liwork ); + info = LAPACKE_zgedmd_work( matrix_layout, jobs, jobz, jobr, jobf, whtsvd, + m, n, x, ldx, y, ldy, nrnk, tol, k, eigs, z, ldz, + res, b, ldb, w, ldw, s, lds, zwork, lzwork, + work, lwork, iwork, liwork ); /* Release memory and exit */ LAPACKE_free( iwork ); -exit_level_1: +exit_level_2: LAPACKE_free( work ); +exit_level_1: + LAPACKE_free( zwork ); exit_level_0: if( info == LAPACK_WORK_MEMORY_ERROR ) { LAPACKE_xerbla( "LAPACKE_zgedmd", info ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_zgedmd_work.c b/lapack-netlib/LAPACKE/src/lapacke_zgedmd_work.c index 2554411ec..ebacfaa94 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zgedmd_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zgedmd_work.c @@ -33,25 +33,27 @@ #include "lapacke_utils.h" lapack_int LAPACKE_zgedmd_work( int matrix_layout, char jobs, char jobz, - char jobf, lapack_int whtsvd, lapack_int m, - lapack_int n, lapack_complex_double* x, + char jobr, char jobf, lapack_int whtsvd, + lapack_int m, lapack_int n, + lapack_complex_double* x, lapack_int ldx, lapack_complex_double* y, - lapack_int ldy, lapack_int k, - lapack_complex_double* reig, - lapack_complex_double* imeig, lapack_complex_double* z, - lapack_int ldz, lapack_complex_double* res, + lapack_int ldy, lapack_int nrnk, double *tol, lapack_int k, + lapack_complex_double* eigs, lapack_complex_double* z, + lapack_int ldz, double* res, lapack_complex_double* b, lapack_int ldb, lapack_complex_double* w, lapack_int ldw, lapack_complex_double* s, lapack_int lds, - lapack_complex_double* work, lapack_int lwork, + lapack_complex_double* zwork, lapack_int lzwork, + double* work, lapack_int lwork, lapack_int* iwork, lapack_int liwork ) { lapack_int info = 0; if( matrix_layout == LAPACK_COL_MAJOR ) { /* Call LAPACK function and adjust info */ - LAPACK_zgedmd( &jobs, &jobz, &jobf, &whtsvd, &m, &n, x, &ldx, y, &ldy, - &k, reig, imeig, z, &ldz, res, b, &ldb, w, &ldw, s, &lds, - work, &lwork, iwork, &liwork, &info ); + LAPACK_zgedmd( &jobs, &jobz, &jobr, &jobf, &whtsvd, &m, &n, x, &ldx, + y, &ldy, &nrnk, tol, &k, eigs, z, &ldz, res, b, &ldb, w, + &ldw, s, &lds, zwork, &lzwork, work, &lwork, iwork, + &liwork, &info ); if( info < 0 ) { info = info - 1; } @@ -101,9 +103,10 @@ lapack_int LAPACKE_zgedmd_work( int matrix_layout, char jobs, char jobz, } /* Query optimal working array(s) size if requested */ if( lwork == -1 ) { - LAPACK_zgedmd( &jobs, &jobz, &jobf, &whtsvd, &m, &n, x, &ldx, y, &ldy, - &k, reig, imeig, z, &ldz, res, b, &ldb, w, &ldw, s, &lds, - work, &lwork, iwork, &liwork, &info ); + LAPACK_zgedmd( &jobs, &jobz, &jobr, &jobf, &whtsvd, &m, &n, x, + &ldx, y, &ldy, &nrnk, tol, &k, eigs, z, &ldz, res, b, + &ldb, w, &ldw, s, &lds, zwork, &lzwork, work, + &lwork, iwork, &liwork, &info ); return (info < 0) ? 
(info - 1) : info; } /* Allocate memory for temporary array(s) */ @@ -145,9 +148,10 @@ lapack_int LAPACKE_zgedmd_work( int matrix_layout, char jobs, char jobz, LAPACKE_zge_trans( matrix_layout, m, n, w, ldw, w_t, ldw_t ); LAPACKE_zge_trans( matrix_layout, m, n, s, lds, s_t, lds_t ); /* Call LAPACK function and adjust info */ - LAPACK_zgedmd( &jobs, &jobz, &jobf, &whtsvd, &m, &n, x_t, &ldx_t, y_t, - &ldy_t, &k, reig, imeig, z_t, &ldz_t, res, b_t, &ldb_t, - w_t, &ldw_t, s_t, &lds_t, work, &lwork, iwork, &liwork, &info ); + LAPACK_zgedmd( &jobs, &jobz, &jobr, &jobf, &whtsvd, &m, &n, x_t, + &ldx_t, y_t, &ldy_t, &nrnk, tol, &k, eigs, z_t, &ldz_t, + res, b_t, &ldb_t, w_t, &ldw_t, s_t, &lds_t, zwork, + &lzwork, work, &lwork, iwork, &liwork, &info ); if( info < 0 ) { info = info - 1; } diff --git a/lapack-netlib/LAPACKE/src/lapacke_zgedmdq.c b/lapack-netlib/LAPACKE/src/lapacke_zgedmdq.c index 3648ffdf2..368d48e20 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zgedmdq.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zgedmdq.c @@ -37,23 +37,25 @@ lapack_int LAPACKE_zgedmdq( int matrix_layout, char jobs, char jobz, char jobr, lapack_int m, lapack_int n, lapack_complex_double* f, lapack_int ldf, lapack_complex_double* x, lapack_int ldx, lapack_complex_double* y, - lapack_int ldy, lapack_int nrnk, double tol, - lapack_int k, lapack_complex_double* reig, - lapack_complex_double* imeig, + lapack_int ldy, lapack_int nrnk, double* tol, + lapack_int k, lapack_complex_double* eigs, lapack_complex_double* z, lapack_int ldz, - lapack_complex_double* res, lapack_complex_double* b, + double* res, lapack_complex_double* b, lapack_int ldb, lapack_complex_double* v, lapack_int ldv, lapack_complex_double* s, lapack_int lds) { lapack_int info = 0; lapack_int lwork = -1; lapack_int liwork = -1; - lapack_complex_double* work = NULL; + lapack_int lzwork = -1; + lapack_complex_double* zwork = NULL; + double* work = NULL; lapack_int* iwork = NULL; - lapack_complex_double work_query; + double work_query; + lapack_complex_double zwork_query; lapack_int iwork_query; if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { - LAPACKE_xerbla( "LAPACKE_cgedmdq", -1 ); + LAPACKE_xerbla( "LAPACKE_zgedmdq", -1 ); return -1; } #ifndef LAPACK_DISABLE_NAN_CHECK @@ -85,36 +87,44 @@ lapack_int LAPACKE_zgedmdq( int matrix_layout, char jobs, char jobz, char jobr, /* Query optimal working array(s) size */ info = LAPACKE_zgedmdq_work( matrix_layout, jobs, jobz, jobr, jobq, jobt, jobf, whtsvd, m, n, f, ldf, x, ldx, y, ldy, - nrnk, tol, k, reig, imeig, z, ldz, res, - b, ldb, v, ldv, s, lds, &work_query, lwork, - &iwork_query, liwork ); + nrnk, tol, k, eigs, z, ldz, res, + b, ldb, v, ldv, s, lds, &zwork_query, lzwork, + &work_query, lwork, &iwork_query, liwork ); if( info != 0 ) { goto exit_level_0; } lwork = LAPACK_Z2INT( work_query ); + lzwork = LAPACK_Z2INT( zwork_query ); liwork = iwork_query; /* Allocate memory for work arrays */ - work = (lapack_complex_double*)LAPACKE_malloc( sizeof(lapack_complex_double) * lwork ); - if( work == NULL ) { + zwork = (lapack_complex_double*)LAPACKE_malloc( sizeof(lapack_complex_double) * lzwork ); + if( zwork == NULL ) { info = LAPACK_WORK_MEMORY_ERROR; goto exit_level_0; } + work = (double*)LAPACKE_malloc( sizeof(lapack_complex_double) * lwork ); + if( work == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_1; + } iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork ); if( iwork == NULL ) { info = LAPACK_WORK_MEMORY_ERROR; - goto exit_level_1; + goto exit_level_2; 
} /* Call middle-level interface */ info = LAPACKE_zgedmdq_work( matrix_layout, jobs, jobz, jobr, jobq, jobt, jobf, whtsvd, m, n, f, ldf, x, ldx, y, ldy, - nrnk, tol, k, reig, imeig, z, ldz, res, - b, ldb, v, ldv, s, lds, work, lwork, iwork, - liwork ); + nrnk, tol, k, eigs, z, ldz, res, + b, ldb, v, ldv, s, lds, zwork, lzwork, + work, lwork, iwork, liwork ); /* Release memory and exit */ LAPACKE_free( iwork ); -exit_level_1: +exit_level_2: LAPACKE_free( work ); +exit_level_1: + LAPACKE_free( zwork ); exit_level_0: if( info == LAPACK_WORK_MEMORY_ERROR ) { LAPACKE_xerbla( "LAPACKE_zgedmdq", info ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_zgedmdq_work.c b/lapack-netlib/LAPACKE/src/lapacke_zgedmdq_work.c index 9afceba07..131e4f9ad 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zgedmdq_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zgedmdq_work.c @@ -38,15 +38,15 @@ lapack_int LAPACKE_zgedmdq_work( int matrix_layout, char jobs, char jobz, lapack_complex_double* f, lapack_int ldf, lapack_complex_double* x, lapack_int ldx, lapack_complex_double* y, lapack_int ldy, - lapack_int nrnk, double tol, lapack_int k, - lapack_complex_double* reig, - lapack_complex_double* imeig, + lapack_int nrnk, double* tol, lapack_int k, + lapack_complex_double* eigs, lapack_complex_double* z, - lapack_int ldz, lapack_complex_double* res, + lapack_int ldz, double* res, lapack_complex_double* b, lapack_int ldb, lapack_complex_double* v, lapack_int ldv, lapack_complex_double* s, - lapack_int lds, lapack_complex_double* work, + lapack_int lds, lapack_complex_double* zwork, + lapack_int lzwork, double* work, lapack_int lwork, lapack_int* iwork, lapack_int liwork ) { @@ -54,9 +54,9 @@ lapack_int LAPACKE_zgedmdq_work( int matrix_layout, char jobs, char jobz, if( matrix_layout == LAPACK_COL_MAJOR ) { /* Call LAPACK function and adjust info */ LAPACK_zgedmdq( &jobs, &jobz, &jobr, &jobq, &jobt, &jobf, &whtsvd, &m, - &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, &tol, &k, reig, - imeig, z, &ldz, res, b, &ldb, v, &ldv, s, &lds, - work, &lwork, iwork, &liwork, &info ); + &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, tol, &k, eigs, + z, &ldz, res, b, &ldb, v, &ldv, s, &lds, + zwork, &lzwork, work, &lwork, iwork, &liwork, &info ); if( info < 0 ) { info = info - 1; } @@ -114,9 +114,9 @@ lapack_int LAPACKE_zgedmdq_work( int matrix_layout, char jobs, char jobz, /* Query optimal working array(s) size if requested */ if( lwork == -1 || liwork == -1 ) { LAPACK_zgedmdq( &jobs, &jobz, &jobr, &jobq, &jobt, &jobf, &whtsvd, &m, - &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, &tol, &k, reig, - imeig, z, &ldz, res, b, &ldb, v, &ldv, s, &lds, - work, &lwork, iwork, &liwork, &info ); + &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, tol, &k, eigs, + z, &ldz, res, b, &ldb, v, &ldv, s, &lds, + zwork, &lzwork, work, &lwork, iwork, &liwork, &info ); return (info < 0) ? 
(info - 1) : info; } /* Allocate memory for temporary array(s) */ @@ -165,9 +165,9 @@ lapack_int LAPACKE_zgedmdq_work( int matrix_layout, char jobs, char jobz, LAPACKE_zge_trans( matrix_layout, m, n, s, lds, s_t, lds_t ); /* Call LAPACK function and adjust info */ LAPACK_zgedmdq( &jobs, &jobz, &jobr, &jobq, &jobt, &jobf, &whtsvd, &m, - &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, &tol, &k, reig, - imeig, z, &ldz, res, b, &ldb, v, &ldv, s, &lds, - work, &lwork, iwork, &liwork, &info ); + &n, f, &ldf, x, &ldx, y, &ldy, &nrnk, tol, &k, eigs, + z, &ldz, res, b, &ldb, v, &ldv, s, &lds, + zwork, &lzwork, work, &lwork, iwork, &liwork, &info ); if( info < 0 ) { info = info - 1; } diff --git a/lapack-netlib/SRC/cgelqt3.f b/lapack-netlib/SRC/cgelqt3.f index 1dfbd3f2b..553087bf4 100644 --- a/lapack-netlib/SRC/cgelqt3.f +++ b/lapack-netlib/SRC/cgelqt3.f @@ -159,7 +159,8 @@ * * Compute Householder transform when M=1 * - CALL CLARFG( N, A, A( 1, MIN( 2, N ) ), LDA, T ) + CALL CLARFG( N, A( 1, 1 ), A( 1, MIN( 2, N ) ), LDA, + & T( 1, 1 ) ) T(1,1)=CONJG(T(1,1)) * ELSE diff --git a/lapack-netlib/SRC/csytrf.f b/lapack-netlib/SRC/csytrf.f index ebf228f18..951196b83 100644 --- a/lapack-netlib/SRC/csytrf.f +++ b/lapack-netlib/SRC/csytrf.f @@ -232,7 +232,7 @@ * Determine the block size * NB = ILAENV( 1, 'CSYTRF', UPLO, N, -1, -1, -1 ) - LWKOPT = N*NB + LWKOPT = MAX( 1, N*NB ) WORK( 1 ) = LWKOPT END IF * diff --git a/lapack-netlib/SRC/csytrf_rk.f b/lapack-netlib/SRC/csytrf_rk.f index 9c2b7182f..996801e7d 100644 --- a/lapack-netlib/SRC/csytrf_rk.f +++ b/lapack-netlib/SRC/csytrf_rk.f @@ -310,7 +310,7 @@ * Determine the block size * NB = ILAENV( 1, 'CSYTRF_RK', UPLO, N, -1, -1, -1 ) - LWKOPT = N*NB + LWKOPT = MAX( 1, N*NB ) WORK( 1 ) = LWKOPT END IF * diff --git a/lapack-netlib/SRC/ctrevc3.f b/lapack-netlib/SRC/ctrevc3.f index 0f58696b2..11b32104d 100644 --- a/lapack-netlib/SRC/ctrevc3.f +++ b/lapack-netlib/SRC/ctrevc3.f @@ -321,9 +321,9 @@ * INFO = 0 NB = ILAENV( 1, 'CTREVC', SIDE // HOWMNY, N, -1, -1, -1 ) - MAXWRK = N + 2*N*NB + MAXWRK = MAX( 1, N + 2*N*NB ) WORK(1) = MAXWRK - RWORK(1) = N + RWORK(1) = MAX( 1, N ) LQUERY = ( LWORK.EQ.-1 .OR. LRWORK.EQ.-1 ) IF( .NOT.RIGHTV .AND. .NOT.LEFTV ) THEN INFO = -1 diff --git a/lapack-netlib/SRC/dgelqt3.f b/lapack-netlib/SRC/dgelqt3.f index 5bcc06a80..ee3bdceb4 100644 --- a/lapack-netlib/SRC/dgelqt3.f +++ b/lapack-netlib/SRC/dgelqt3.f @@ -173,7 +173,8 @@ * * Compute Householder transform when M=1 * - CALL DLARFG( N, A, A( 1, MIN( 2, N ) ), LDA, T ) + CALL DLARFG( N, A ( 1, 1 ), A( 1, MIN( 2, N ) ), LDA, + & T( 1, 1) ) * ELSE * diff --git a/lapack-netlib/SRC/dlatrs.f b/lapack-netlib/SRC/dlatrs.f index be156bee2..b282f4227 100644 --- a/lapack-netlib/SRC/dlatrs.f +++ b/lapack-netlib/SRC/dlatrs.f @@ -261,6 +261,9 @@ DOUBLE PRECISION BIGNUM, GROW, REC, SMLNUM, SUMJ, TJJ, TJJS, $ TMAX, TSCAL, USCAL, XBND, XJ, XMAX * .. +* .. Local Arrays .. + DOUBLE PRECISION WORK(1) +* .. * .. External Functions .. LOGICAL LSAME INTEGER IDAMAX @@ -362,7 +365,7 @@ * A is upper triangular. 
* DO J = 2, N - TMAX = MAX( DLANGE( 'M', J-1, 1, A( 1, J ), 1, SUMJ ), + TMAX = MAX( DLANGE( 'M', J-1, 1, A( 1, J ), 1, WORK ), $ TMAX ) END DO ELSE @@ -371,7 +374,7 @@ * DO J = 1, N - 1 TMAX = MAX( DLANGE( 'M', N-J, 1, A( J+1, J ), 1, - $ SUMJ ), TMAX ) + $ WORK ), TMAX ) END DO END IF * diff --git a/lapack-netlib/SRC/dsytrf.f b/lapack-netlib/SRC/dsytrf.f index a39b03283..aee9b3f6a 100644 --- a/lapack-netlib/SRC/dsytrf.f +++ b/lapack-netlib/SRC/dsytrf.f @@ -232,7 +232,7 @@ * Determine the block size * NB = ILAENV( 1, 'DSYTRF', UPLO, N, -1, -1, -1 ) - LWKOPT = N*NB + LWKOPT = MAX( 1, N*NB ) WORK( 1 ) = LWKOPT END IF * diff --git a/lapack-netlib/SRC/dsytrf_rk.f b/lapack-netlib/SRC/dsytrf_rk.f index 7341b9263..086586968 100644 --- a/lapack-netlib/SRC/dsytrf_rk.f +++ b/lapack-netlib/SRC/dsytrf_rk.f @@ -310,7 +310,7 @@ * Determine the block size * NB = ILAENV( 1, 'DSYTRF_RK', UPLO, N, -1, -1, -1 ) - LWKOPT = N*NB + LWKOPT = MAX( 1, N*NB ) WORK( 1 ) = LWKOPT END IF * diff --git a/lapack-netlib/SRC/dtrevc3.f b/lapack-netlib/SRC/dtrevc3.f index a4651e788..c8c04ad13 100644 --- a/lapack-netlib/SRC/dtrevc3.f +++ b/lapack-netlib/SRC/dtrevc3.f @@ -298,7 +298,7 @@ * INFO = 0 NB = ILAENV( 1, 'DTREVC', SIDE // HOWMNY, N, -1, -1, -1 ) - MAXWRK = N + 2*N*NB + MAXWRK = MAX( 1, N + 2*N*NB ) WORK(1) = MAXWRK LQUERY = ( LWORK.EQ.-1 ) IF( .NOT.RIGHTV .AND. .NOT.LEFTV ) THEN diff --git a/lapack-netlib/SRC/dtrsyl3.f b/lapack-netlib/SRC/dtrsyl3.f index c44ec3808..31a5230ba 100644 --- a/lapack-netlib/SRC/dtrsyl3.f +++ b/lapack-netlib/SRC/dtrsyl3.f @@ -1220,7 +1220,7 @@ * SCALOC = MIN( BIGNUM / SCAL, ONE / BUF ) BUF = BUF * SCALOC - CALL DLASCL( 'G', -1, -1, ONE, SCALOC, M, N, C, LDC, IWORK ) + CALL DLASCL( 'G', -1, -1, ONE, SCALOC, M, N, C, LDC, IWORK(1) ) END IF * * Combine with buffer scaling factor. SCALE will be flushed if diff --git a/lapack-netlib/SRC/sgelqt3.f b/lapack-netlib/SRC/sgelqt3.f index 23816b4c8..82f5c1cf4 100644 --- a/lapack-netlib/SRC/sgelqt3.f +++ b/lapack-netlib/SRC/sgelqt3.f @@ -158,7 +158,8 @@ * * Compute Householder transform when M=1 * - CALL SLARFG( N, A, A( 1, MIN( 2, N ) ), LDA, T ) + CALL SLARFG( N, A( 1, 1 ), A( 1, MIN( 2, N ) ), LDA, + & T( 1, 1 ) ) * ELSE * diff --git a/lapack-netlib/SRC/slatrs.f b/lapack-netlib/SRC/slatrs.f index 0761d656f..9765ea3d7 100644 --- a/lapack-netlib/SRC/slatrs.f +++ b/lapack-netlib/SRC/slatrs.f @@ -261,6 +261,9 @@ REAL BIGNUM, GROW, REC, SMLNUM, SUMJ, TJJ, TJJS, $ TMAX, TSCAL, USCAL, XBND, XJ, XMAX * .. +* .. Local Arrays .. + REAL WORK (1) +* .. * .. External Functions .. LOGICAL LSAME INTEGER ISAMAX @@ -362,7 +365,7 @@ * A is upper triangular. 
* DO J = 2, N - TMAX = MAX( SLANGE( 'M', J-1, 1, A( 1, J ), 1, SUMJ ), + TMAX = MAX( SLANGE( 'M', J-1, 1, A( 1, J ), 1, WORK ), $ TMAX ) END DO ELSE @@ -371,7 +374,7 @@ * DO J = 1, N - 1 TMAX = MAX( SLANGE( 'M', N-J, 1, A( J+1, J ), 1, - $ SUMJ ), TMAX ) + $ WORK ), TMAX ) END DO END IF * diff --git a/lapack-netlib/SRC/ssytrf.f b/lapack-netlib/SRC/ssytrf.f index d188589b9..31e38e466 100644 --- a/lapack-netlib/SRC/ssytrf.f +++ b/lapack-netlib/SRC/ssytrf.f @@ -232,7 +232,7 @@ * Determine the block size * NB = ILAENV( 1, 'SSYTRF', UPLO, N, -1, -1, -1 ) - LWKOPT = N*NB + LWKOPT = MAX( 1, N*NB ) WORK( 1 ) = LWKOPT END IF * diff --git a/lapack-netlib/SRC/ssytrf_rk.f b/lapack-netlib/SRC/ssytrf_rk.f index ec84fcb1b..8e1ef460a 100644 --- a/lapack-netlib/SRC/ssytrf_rk.f +++ b/lapack-netlib/SRC/ssytrf_rk.f @@ -310,7 +310,7 @@ * Determine the block size * NB = ILAENV( 1, 'SSYTRF_RK', UPLO, N, -1, -1, -1 ) - LWKOPT = N*NB + LWKOPT = MAX( 1, N*NB ) WORK( 1 ) = LWKOPT END IF * diff --git a/lapack-netlib/SRC/strevc3.f b/lapack-netlib/SRC/strevc3.f index 5af57123b..253cbc24c 100644 --- a/lapack-netlib/SRC/strevc3.f +++ b/lapack-netlib/SRC/strevc3.f @@ -298,7 +298,7 @@ * INFO = 0 NB = ILAENV( 1, 'STREVC', SIDE // HOWMNY, N, -1, -1, -1 ) - MAXWRK = N + 2*N*NB + MAXWRK = MAX( 1, N + 2*N*NB ) WORK(1) = MAXWRK LQUERY = ( LWORK.EQ.-1 ) IF( .NOT.RIGHTV .AND. .NOT.LEFTV ) THEN diff --git a/lapack-netlib/SRC/strsyl3.f b/lapack-netlib/SRC/strsyl3.f index 28762c2ed..ef3f2da83 100644 --- a/lapack-netlib/SRC/strsyl3.f +++ b/lapack-netlib/SRC/strsyl3.f @@ -1223,7 +1223,7 @@ * SCALOC = MIN( BIGNUM / SCAL, ONE / BUF ) BUF = BUF * SCALOC - CALL SLASCL( 'G', -1, -1, ONE, SCALOC, M, N, C, LDC, IWORK ) + CALL SLASCL( 'G', -1, -1, ONE, SCALOC, M, N, C, LDC, IWORK(1) ) END IF * * Combine with buffer scaling factor. SCALE will be flushed if diff --git a/lapack-netlib/SRC/zgelqt3.f b/lapack-netlib/SRC/zgelqt3.f index 629a09472..1a71dc44e 100644 --- a/lapack-netlib/SRC/zgelqt3.f +++ b/lapack-netlib/SRC/zgelqt3.f @@ -174,7 +174,8 @@ * * Compute Householder transform when M=1 * - CALL ZLARFG( N, A, A( 1, MIN( 2, N ) ), LDA, T ) + CALL ZLARFG( N, A( 1, 1 ), A( 1, MIN( 2, N ) ), LDA, + & T( 1, 1 ) ) T(1,1)=CONJG(T(1,1)) * ELSE diff --git a/lapack-netlib/SRC/zsytrf.f b/lapack-netlib/SRC/zsytrf.f index a775a8758..dc9016c69 100644 --- a/lapack-netlib/SRC/zsytrf.f +++ b/lapack-netlib/SRC/zsytrf.f @@ -232,7 +232,7 @@ * Determine the block size * NB = ILAENV( 1, 'ZSYTRF', UPLO, N, -1, -1, -1 ) - LWKOPT = N*NB + LWKOPT = MAX( 1, N*NB ) WORK( 1 ) = LWKOPT END IF * diff --git a/lapack-netlib/SRC/zsytrf_rk.f b/lapack-netlib/SRC/zsytrf_rk.f index 3b398ce6c..af8b8d501 100644 --- a/lapack-netlib/SRC/zsytrf_rk.f +++ b/lapack-netlib/SRC/zsytrf_rk.f @@ -310,7 +310,7 @@ * Determine the block size * NB = ILAENV( 1, 'ZSYTRF_RK', UPLO, N, -1, -1, -1 ) - LWKOPT = N*NB + LWKOPT = MAX( 1, N*NB ) WORK( 1 ) = LWKOPT END IF * diff --git a/lapack-netlib/SRC/ztrevc3.f b/lapack-netlib/SRC/ztrevc3.f index 6300e80ae..8fb144e0c 100644 --- a/lapack-netlib/SRC/ztrevc3.f +++ b/lapack-netlib/SRC/ztrevc3.f @@ -321,9 +321,9 @@ * INFO = 0 NB = ILAENV( 1, 'ZTREVC', SIDE // HOWMNY, N, -1, -1, -1 ) - MAXWRK = N + 2*N*NB + MAXWRK = MAX( 1, N + 2*N*NB ) WORK(1) = MAXWRK - RWORK(1) = N + RWORK(1) = MAX( 1, N ) LQUERY = ( LWORK.EQ.-1 .OR. LRWORK.EQ.-1 ) IF( .NOT.RIGHTV .AND. 
.NOT.LEFTV ) THEN INFO = -1 diff --git a/lapack-netlib/TESTING/EIG/cerrst.f b/lapack-netlib/TESTING/EIG/cerrst.f index ba97afbe5..1748a2aad 100644 --- a/lapack-netlib/TESTING/EIG/cerrst.f +++ b/lapack-netlib/TESTING/EIG/cerrst.f @@ -160,13 +160,13 @@ * SRNAMT = 'CHETD2' INFOT = 1 - CALL CHETD2( '/', 0, A, 1, D, E, TAU, W, 1, INFO ) + CALL CHETD2( '/', 0, A, 1, D, E, TAU, INFO ) CALL CHKXER( 'CHETD2', INFOT, NOUT, LERR, OK ) INFOT = 2 - CALL CHETD2( 'U', -1, A, 1, D, E, TAU, W, 1, INFO ) + CALL CHETD2( 'U', -1, A, 1, D, E, TAU, INFO ) CALL CHKXER( 'CHETD2', INFOT, NOUT, LERR, OK ) INFOT = 4 - CALL CHETD2( 'U', 2, A, 1, D, E, TAU, W, 1, INFO ) + CALL CHETD2( 'U', 2, A, 1, D, E, TAU, INFO ) CALL CHKXER( 'CHETD2', INFOT, NOUT, LERR, OK ) NT = NT + 3 * diff --git a/lapack-netlib/TESTING/EIG/derrst.f b/lapack-netlib/TESTING/EIG/derrst.f index a55b6eea9..059538644 100644 --- a/lapack-netlib/TESTING/EIG/derrst.f +++ b/lapack-netlib/TESTING/EIG/derrst.f @@ -161,13 +161,13 @@ * SRNAMT = 'DSYTD2' INFOT = 1 - CALL DSYTD2( '/', 0, A, 1, D, E, TAU, W, 1, INFO ) + CALL DSYTD2( '/', 0, A, 1, D, E, TAU, INFO ) CALL CHKXER( 'DSYTD2', INFOT, NOUT, LERR, OK ) INFOT = 2 - CALL DSYTD2( 'U', -1, A, 1, D, E, TAU, W, 1, INFO ) + CALL DSYTD2( 'U', -1, A, 1, D, E, TAU, INFO ) CALL CHKXER( 'DSYTD2', INFOT, NOUT, LERR, OK ) INFOT = 4 - CALL DSYTD2( 'U', 2, A, 1, D, E, TAU, W, 1, INFO ) + CALL DSYTD2( 'U', 2, A, 1, D, E, TAU, INFO ) CALL CHKXER( 'DSYTD2', INFOT, NOUT, LERR, OK ) NT = NT + 3 * diff --git a/lapack-netlib/TESTING/EIG/serrst.f b/lapack-netlib/TESTING/EIG/serrst.f index 8c9c0f306..b87fc42ef 100644 --- a/lapack-netlib/TESTING/EIG/serrst.f +++ b/lapack-netlib/TESTING/EIG/serrst.f @@ -161,13 +161,13 @@ * SRNAMT = 'SSYTD2' INFOT = 1 - CALL SSYTD2( '/', 0, A, 1, D, E, TAU, W, 1, INFO ) + CALL SSYTD2( '/', 0, A, 1, D, E, TAU, INFO ) CALL CHKXER( 'SSYTD2', INFOT, NOUT, LERR, OK ) INFOT = 2 - CALL SSYTD2( 'U', -1, A, 1, D, E, TAU, W, 1, INFO ) + CALL SSYTD2( 'U', -1, A, 1, D, E, TAU, INFO ) CALL CHKXER( 'SSYTD2', INFOT, NOUT, LERR, OK ) INFOT = 4 - CALL SSYTD2( 'U', 2, A, 1, D, E, TAU, W, 1, INFO ) + CALL SSYTD2( 'U', 2, A, 1, D, E, TAU, INFO ) CALL CHKXER( 'SSYTD2', INFOT, NOUT, LERR, OK ) NT = NT + 3 * diff --git a/lapack-netlib/TESTING/EIG/zerrst.f b/lapack-netlib/TESTING/EIG/zerrst.f index 948f94bc2..d7b41c053 100644 --- a/lapack-netlib/TESTING/EIG/zerrst.f +++ b/lapack-netlib/TESTING/EIG/zerrst.f @@ -160,13 +160,13 @@ * SRNAMT = 'ZHETD2' INFOT = 1 - CALL ZHETD2( '/', 0, A, 1, D, E, TAU, W, 1, INFO ) + CALL ZHETD2( '/', 0, A, 1, D, E, TAU, INFO ) CALL CHKXER( 'ZHETD2', INFOT, NOUT, LERR, OK ) INFOT = 2 - CALL ZHETD2( 'U', -1, A, 1, D, E, TAU, W, 1, INFO ) + CALL ZHETD2( 'U', -1, A, 1, D, E, TAU, INFO ) CALL CHKXER( 'ZHETD2', INFOT, NOUT, LERR, OK ) INFOT = 4 - CALL ZHETD2( 'U', 2, A, 1, D, E, TAU, W, 1, INFO ) + CALL ZHETD2( 'U', 2, A, 1, D, E, TAU, INFO ) CALL CHKXER( 'ZHETD2', INFOT, NOUT, LERR, OK ) NT = NT + 3 * diff --git a/lapack-netlib/TESTING/LIN/cchktr.f b/lapack-netlib/TESTING/LIN/cchktr.f index 4b09361d8..2953a2bd5 100644 --- a/lapack-netlib/TESTING/LIN/cchktr.f +++ b/lapack-netlib/TESTING/LIN/cchktr.f @@ -201,7 +201,8 @@ * .. Local Arrays .. CHARACTER TRANSS( NTRAN ), UPLOS( 2 ) INTEGER ISEED( 4 ), ISEEDY( 4 ) - REAL RESULT( NTESTS ), SCALE3( 2 ) + REAL RESULT( NTESTS ), RWORK2( 2*NMAX ), + $ SCALE3( 2 ) * .. * .. External Functions .. 
LOGICAL LSAME @@ -542,10 +543,10 @@ SRNAMT = 'CLATRS3' CALL CCOPY( N, X, 1, B, 1 ) CALL CCOPY( N, X, 1, B( N+1 ), 1 ) - CALL CSCAL( N, BIGNUM, B( N+1 ), 1 ) + CALL CSSCAL( N, BIGNUM, B( N+1 ), 1 ) CALL CLATRS3( UPLO, TRANS, DIAG, 'N', N, 2, A, LDA, - $ B, MAX(1, N), SCALE3, RWORK, WORK, NMAX, - $ INFO ) + $ B, MAX(1, N), SCALE3, RWORK, RWORK2, + $ 2*NMAX, INFO ) * * Check error code from CLATRS3. * diff --git a/lapack-netlib/TESTING/LIN/cerrtr.f b/lapack-netlib/TESTING/LIN/cerrtr.f index 9ba784f62..ab83357f8 100644 --- a/lapack-netlib/TESTING/LIN/cerrtr.f +++ b/lapack-netlib/TESTING/LIN/cerrtr.f @@ -70,7 +70,7 @@ * .. Local Scalars .. CHARACTER*2 C2 INTEGER INFO - REAL RCOND, SCALE + REAL RCOND, SCALE, SCALES(0) * .. * .. Local Arrays .. REAL R1( NMAX ), R2( NMAX ), RW( NMAX ) @@ -245,40 +245,40 @@ * SRNAMT = 'CLATRS3' INFOT = 1 - CALL CLATRS3( '/', 'N', 'N', 'N', 0, 0, A, 1, X, 1, SCALE, RW, - $ RW( 2 ), 1, INFO ) + CALL CLATRS3( '/', 'N', 'N', 'N', 0, 0, A, 1, X, 1, SCALES, + $ RW, RW( 2 ), 1, INFO ) CALL CHKXER( 'CLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 2 - CALL CLATRS3( 'U', '/', 'N', 'N', 0, 0, A, 1, X, 1, SCALE, RW, - $ RW( 2 ), 1, INFO ) + CALL CLATRS3( 'U', '/', 'N', 'N', 0, 0, A, 1, X, 1, SCALES, + $ RW, RW( 2 ), 1, INFO ) CALL CHKXER( 'CLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 3 - CALL CLATRS3( 'U', 'N', '/', 'N', 0, 0, A, 1, X, 1, SCALE, RW, - $ RW( 2 ), 1, INFO ) + CALL CLATRS3( 'U', 'N', '/', 'N', 0, 0, A, 1, X, 1, SCALES, + $ RW, RW( 2 ), 1, INFO ) CALL CHKXER( 'CLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 4 - CALL CLATRS3( 'U', 'N', 'N', '/', 0, 0, A, 1, X, 1, SCALE, RW, - $ RW( 2 ), 1, INFO ) + CALL CLATRS3( 'U', 'N', 'N', '/', 0, 0, A, 1, X, 1, SCALES, + $ RW, RW( 2 ), 1, INFO ) CALL CHKXER( 'CLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 5 - CALL CLATRS3( 'U', 'N', 'N', 'N', -1, 0, A, 1, X, 1, SCALE, RW, - $ RW( 2 ), 1, INFO ) + CALL CLATRS3( 'U', 'N', 'N', 'N', -1, 0, A, 1, X, 1, SCALES, + $ RW, RW( 2 ), 1, INFO ) CALL CHKXER( 'CLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 6 - CALL CLATRS3( 'U', 'N', 'N', 'N', 0, -1, A, 1, X, 1, SCALE, RW, - $ RW( 2 ), 1, INFO ) + CALL CLATRS3( 'U', 'N', 'N', 'N', 0, -1, A, 1, X, 1, SCALES, + $ RW, RW( 2 ), 1, INFO ) CALL CHKXER( 'CLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 8 - CALL CLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 1, X, 1, SCALE, RW, - $ RW( 2 ), 1, INFO ) + CALL CLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 1, X, 1, SCALES, + $ RW, RW( 2 ), 1, INFO ) CALL CHKXER( 'CLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 10 - CALL CLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 2, X, 1, SCALE, RW, - $ RW( 2 ), 1, INFO ) + CALL CLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 2, X, 1, SCALES, + $ RW, RW( 2 ), 1, INFO ) CALL CHKXER( 'CLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 14 - CALL CLATRS3( 'U', 'N', 'N', 'N', 1, 0, A, 1, X, 1, SCALE, RW, - $ RW( 2 ), 0, INFO ) + CALL CLATRS3( 'U', 'N', 'N', 'N', 1, 0, A, 1, X, 1, SCALES, + $ RW, RW( 2 ), 0, INFO ) CALL CHKXER( 'CLATRS3', INFOT, NOUT, LERR, OK ) * * Test error exits for the packed triangular routines. diff --git a/lapack-netlib/TESTING/LIN/derrtr.f b/lapack-netlib/TESTING/LIN/derrtr.f index d0580497d..878d9070c 100644 --- a/lapack-netlib/TESTING/LIN/derrtr.f +++ b/lapack-netlib/TESTING/LIN/derrtr.f @@ -71,7 +71,7 @@ * .. Local Scalars .. CHARACTER*2 C2 INTEGER INFO - DOUBLE PRECISION RCOND, SCALE + DOUBLE PRECISION RCOND, SCALE, SCALES(0) * .. * .. Local Arrays .. 
INTEGER IW( NMAX ) @@ -250,40 +250,40 @@ * SRNAMT = 'DLATRS3' INFOT = 1 - CALL DLATRS3( '/', 'N', 'N', 'N', 0, 0, A, 1, X, 1, SCALE, W, - $ W( 2 ), 1, INFO ) + CALL DLATRS3( '/', 'N', 'N', 'N', 0, 0, A, 1, X, 1, SCALES, + $ W, W( 2 ), 1, INFO ) CALL CHKXER( 'DLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 2 - CALL DLATRS3( 'U', '/', 'N', 'N', 0, 0, A, 1, X, 1, SCALE, W, - $ W( 2 ), 1, INFO ) + CALL DLATRS3( 'U', '/', 'N', 'N', 0, 0, A, 1, X, 1, SCALES, + $ W, W( 2 ), 1, INFO ) CALL CHKXER( 'DLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 3 - CALL DLATRS3( 'U', 'N', '/', 'N', 0, 0, A, 1, X, 1, SCALE, W, - $ W( 2 ), 1, INFO ) + CALL DLATRS3( 'U', 'N', '/', 'N', 0, 0, A, 1, X, 1, SCALES, + $ W, W( 2 ), 1, INFO ) CALL CHKXER( 'DLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 4 - CALL DLATRS3( 'U', 'N', 'N', '/', 0, 0, A, 1, X, 1, SCALE, W, - $ W( 2 ), 1, INFO ) + CALL DLATRS3( 'U', 'N', 'N', '/', 0, 0, A, 1, X, 1, SCALES, + $ W, W( 2 ), 1, INFO ) CALL CHKXER( 'DLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 5 - CALL DLATRS3( 'U', 'N', 'N', 'N', -1, 0, A, 1, X, 1, SCALE, W, - $ W( 2 ), 1, INFO ) + CALL DLATRS3( 'U', 'N', 'N', 'N', -1, 0, A, 1, X, 1, SCALES, + $ W, W( 2 ), 1, INFO ) CALL CHKXER( 'DLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 6 - CALL DLATRS3( 'U', 'N', 'N', 'N', 0, -1, A, 1, X, 1, SCALE, W, - $ W( 2 ), 1, INFO ) + CALL DLATRS3( 'U', 'N', 'N', 'N', 0, -1, A, 1, X, 1, SCALES, + $ W, W( 2 ), 1, INFO ) CALL CHKXER( 'DLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 8 - CALL DLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 1, X, 1, SCALE, W, - $ W( 2 ), 1, INFO ) + CALL DLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 1, X, 1, SCALES, + $ W, W( 2 ), 1, INFO ) CALL CHKXER( 'DLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 10 - CALL DLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 2, X, 1, SCALE, W, - $ W( 2 ), 1, INFO ) + CALL DLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 2, X, 1, SCALES, + $ W, W( 2 ), 1, INFO ) CALL CHKXER( 'DLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 14 - CALL DLATRS3( 'U', 'N', 'N', 'N', 1, 0, A, 1, X, 1, SCALE, W, - $ W( 2 ), 0, INFO ) + CALL DLATRS3( 'U', 'N', 'N', 'N', 1, 0, A, 1, X, 1, SCALES, + $ W, W( 2 ), 0, INFO ) CALL CHKXER( 'DLATRS3', INFOT, NOUT, LERR, OK ) * ELSE IF( LSAMEN( 2, C2, 'TP' ) ) THEN diff --git a/lapack-netlib/TESTING/LIN/serrtr.f b/lapack-netlib/TESTING/LIN/serrtr.f index af1ce0a8e..391b54c3f 100644 --- a/lapack-netlib/TESTING/LIN/serrtr.f +++ b/lapack-netlib/TESTING/LIN/serrtr.f @@ -71,7 +71,7 @@ * .. Local Scalars .. CHARACTER*2 C2 INTEGER INFO - REAL RCOND, SCALE + REAL RCOND, SCALE, SCALES(0) * .. * .. Local Arrays .. 
INTEGER IW( NMAX ) @@ -250,40 +250,40 @@ * SRNAMT = 'SLATRS3' INFOT = 1 - CALL SLATRS3( '/', 'N', 'N', 'N', 0, 0, A, 1, X, 1, SCALE, W, - $ W( 2 ), 1, INFO ) + CALL SLATRS3( '/', 'N', 'N', 'N', 0, 0, A, 1, X, 1, SCALES, + $ W, W( 2 ), 1, INFO ) CALL CHKXER( 'SLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 2 - CALL SLATRS3( 'U', '/', 'N', 'N', 0, 0, A, 1, X, 1, SCALE, W, - $ W( 2 ), 1, INFO ) + CALL SLATRS3( 'U', '/', 'N', 'N', 0, 0, A, 1, X, 1, SCALES, + $ W, W( 2 ), 1, INFO ) CALL CHKXER( 'SLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 3 - CALL SLATRS3( 'U', 'N', '/', 'N', 0, 0, A, 1, X, 1, SCALE, W, - $ W( 2 ), 1, INFO ) + CALL SLATRS3( 'U', 'N', '/', 'N', 0, 0, A, 1, X, 1, SCALES, + $ W, W( 2 ), 1, INFO ) CALL CHKXER( 'SLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 4 - CALL SLATRS3( 'U', 'N', 'N', '/', 0, 0, A, 1, X, 1, SCALE, W, - $ W( 2 ), 1, INFO ) + CALL SLATRS3( 'U', 'N', 'N', '/', 0, 0, A, 1, X, 1, SCALES, + $ W, W( 2 ), 1, INFO ) CALL CHKXER( 'SLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 5 - CALL SLATRS3( 'U', 'N', 'N', 'N', -1, 0, A, 1, X, 1, SCALE, W, - $ W( 2 ), 1, INFO ) + CALL SLATRS3( 'U', 'N', 'N', 'N', -1, 0, A, 1, X, 1, SCALES, + $ W, W( 2 ), 1, INFO ) CALL CHKXER( 'SLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 6 - CALL SLATRS3( 'U', 'N', 'N', 'N', 0, -1, A, 1, X, 1, SCALE, W, - $ W( 2 ), 1, INFO ) + CALL SLATRS3( 'U', 'N', 'N', 'N', 0, -1, A, 1, X, 1, SCALES, + $ W, W( 2 ), 1, INFO ) CALL CHKXER( 'SLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 8 - CALL SLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 1, X, 1, SCALE, W, - $ W( 2 ), 1, INFO ) + CALL SLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 1, X, 1, SCALES, + $ W, W( 2 ), 1, INFO ) CALL CHKXER( 'SLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 10 - CALL SLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 2, X, 1, SCALE, W, - $ W( 2 ), 1, INFO ) + CALL SLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 2, X, 1, SCALES, + $ W, W( 2 ), 1, INFO ) CALL CHKXER( 'SLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 14 - CALL SLATRS3( 'U', 'N', 'N', 'N', 1, 0, A, 1, X, 1, SCALE, W, - $ W( 2 ), 0, INFO ) + CALL SLATRS3( 'U', 'N', 'N', 'N', 1, 0, A, 1, X, 1, SCALES, + $ W, W( 2 ), 0, INFO ) CALL CHKXER( 'SLATRS3', INFOT, NOUT, LERR, OK ) * ELSE IF( LSAMEN( 2, C2, 'TP' ) ) THEN diff --git a/lapack-netlib/TESTING/LIN/zchktr.f b/lapack-netlib/TESTING/LIN/zchktr.f index 275ca2857..4af538124 100644 --- a/lapack-netlib/TESTING/LIN/zchktr.f +++ b/lapack-netlib/TESTING/LIN/zchktr.f @@ -201,7 +201,8 @@ * .. Local Arrays .. CHARACTER TRANSS( NTRAN ), UPLOS( 2 ) INTEGER ISEED( 4 ), ISEEDY( 4 ) - DOUBLE PRECISION RESULT( NTESTS ), SCALE3( 2 ) + DOUBLE PRECISION RESULT( NTESTS ), RWORK2( 2*NMAX), + $ SCALE3( 2 ) * .. * .. External Functions .. LOGICAL LSAME @@ -544,8 +545,8 @@ CALL ZCOPY( N, X, 1, B( N+1 ), 1 ) CALL ZDSCAL( N, BIGNUM, B( N+1 ), 1 ) CALL ZLATRS3( UPLO, TRANS, DIAG, 'N', N, 2, A, LDA, - $ B, MAX(1, N), SCALE3, RWORK, WORK, NMAX, - $ INFO ) + $ B, MAX(1, N), SCALE3, RWORK, RWORK2, + $ 2*NMAX, INFO ) * * Check error code from ZLATRS3. * diff --git a/lapack-netlib/TESTING/LIN/zerrtr.f b/lapack-netlib/TESTING/LIN/zerrtr.f index 211b92154..640c39793 100644 --- a/lapack-netlib/TESTING/LIN/zerrtr.f +++ b/lapack-netlib/TESTING/LIN/zerrtr.f @@ -70,7 +70,7 @@ * .. Local Scalars .. CHARACTER*2 C2 INTEGER INFO - DOUBLE PRECISION RCOND, SCALE + DOUBLE PRECISION RCOND, SCALE, SCALES(0) * .. * .. Local Arrays .. 
DOUBLE PRECISION R1( NMAX ), R2( NMAX ), RW( NMAX ) @@ -245,40 +245,40 @@ * SRNAMT = 'ZLATRS3' INFOT = 1 - CALL ZLATRS3( '/', 'N', 'N', 'N', 0, 0, A, 1, X, 1, SCALE, RW, - $ RW( 2 ), 1, INFO ) + CALL ZLATRS3( '/', 'N', 'N', 'N', 0, 0, A, 1, X, 1, SCALES, + $ RW, RW( 2 ), 1, INFO ) CALL CHKXER( 'ZLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 2 - CALL ZLATRS3( 'U', '/', 'N', 'N', 0, 0, A, 1, X, 1, SCALE, RW, - $ RW( 2 ), 1, INFO ) + CALL ZLATRS3( 'U', '/', 'N', 'N', 0, 0, A, 1, X, 1, SCALES, + $ RW, RW( 2 ), 1, INFO ) CALL CHKXER( 'ZLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 3 - CALL ZLATRS3( 'U', 'N', '/', 'N', 0, 0, A, 1, X, 1, SCALE, RW, - $ RW( 2 ), 1, INFO ) + CALL ZLATRS3( 'U', 'N', '/', 'N', 0, 0, A, 1, X, 1, SCALES, + $ RW, RW( 2 ), 1, INFO ) CALL CHKXER( 'ZLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 4 - CALL ZLATRS3( 'U', 'N', 'N', '/', 0, 0, A, 1, X, 1, SCALE, RW, - $ RW( 2 ), 1, INFO ) + CALL ZLATRS3( 'U', 'N', 'N', '/', 0, 0, A, 1, X, 1, SCALES, + $ RW, RW( 2 ), 1, INFO ) CALL CHKXER( 'ZLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 5 - CALL ZLATRS3( 'U', 'N', 'N', 'N', -1, 0, A, 1, X, 1, SCALE, RW, - $ RW( 2 ), 1, INFO ) + CALL ZLATRS3( 'U', 'N', 'N', 'N', -1, 0, A, 1, X, 1, SCALES, + $ RW, RW( 2 ), 1, INFO ) CALL CHKXER( 'ZLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 6 - CALL ZLATRS3( 'U', 'N', 'N', 'N', 0, -1, A, 1, X, 1, SCALE, RW, - $ RW( 2 ), 1, INFO ) + CALL ZLATRS3( 'U', 'N', 'N', 'N', 0, -1, A, 1, X, 1, SCALES, + $ RW, RW( 2 ), 1, INFO ) CALL CHKXER( 'ZLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 8 - CALL ZLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 1, X, 1, SCALE, RW, - $ RW( 2 ), 1, INFO ) + CALL ZLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 1, X, 1, SCALES, + $ RW, RW( 2 ), 1, INFO ) CALL CHKXER( 'ZLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 10 - CALL ZLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 2, X, 1, SCALE, RW, - $ RW( 2 ), 1, INFO ) + CALL ZLATRS3( 'U', 'N', 'N', 'N', 2, 0, A, 2, X, 1, SCALES, + $ RW, RW( 2 ), 1, INFO ) CALL CHKXER( 'ZLATRS3', INFOT, NOUT, LERR, OK ) INFOT = 14 - CALL ZLATRS3( 'U', 'N', 'N', 'N', 1, 0, A, 1, X, 1, SCALE, RW, - $ RW( 2 ), 0, INFO ) + CALL ZLATRS3( 'U', 'N', 'N', 'N', 1, 0, A, 1, X, 1, SCALES, + $ RW, RW( 2 ), 0, INFO ) CALL CHKXER( 'ZLATRS3', INFOT, NOUT, LERR, OK ) * * Test error exits for the packed triangular routines. 
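Note on the reworked ?gedmd interface above: the high-level and middle-level LAPACKE wrappers now take the additional jobr flag and the rank-truncation pair nrnk/tol, with tol passed by pointer, and the complex variants split the workspace into a complex zwork plus a real work array, each sized through the usual lwork = -1 query. The sketch below is a minimal, illustrative call of the single-precision wrapper as declared above; the job codes, the whtsvd value, nrnk = -1, and the problem sizes are placeholder assumptions (consult the Fortran ?gedmd documentation for the accepted values and the exact role of each argument), not part of this patch.

/* Illustrative use of the updated LAPACKE_sgedmd signature (assumed values). */
#include <stdio.h>
#include <lapacke.h>

int main(void) {
    lapack_int m = 16, n = 8;          /* snapshot dimensions (illustrative) */
    lapack_int nrnk = -1;              /* rank truncation control (assumed: no fixed cap) */
    float tol = 1.0e-6f;               /* truncation tolerance, now passed by pointer */
    lapack_int k = 0;                  /* number of computed Ritz pairs (output in the Fortran routine) */

    float x[16 * 8] = {0}, y[16 * 8] = {0};   /* snapshot data, filled by the application */
    float z[16 * 8], b[16 * 8], w[8 * 8], s[8 * 8];
    float reig[8], imeig[8], res[8];

    lapack_int info = LAPACKE_sgedmd(LAPACK_COL_MAJOR, 'S', 'V', 'R', 'N',
                                     1 /* whtsvd */, m, n, x, m, y, m,
                                     nrnk, &tol, k, reig, imeig, z, m,
                                     res, b, m, w, n, s, n);
    printf("LAPACKE_sgedmd: info = %d\n", (int)info);
    return 0;
}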
diff --git a/lapack/getf2/getf2_k.c b/lapack/getf2/getf2_k.c index 80c66dd7a..5795797d3 100644 --- a/lapack/getf2/getf2_k.c +++ b/lapack/getf2/getf2_k.c @@ -95,7 +95,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, GEMV_N(m - j, j, 0, dm1, a + j, lda, b, 1, b + j, 1, sb); jp = j + IAMAX_K(m - j, b + j, 1); - if (jp>m) jp = m; //avoid out of boundary + if (jp>m) jp = m; //avoid out of boundary when the iamax kernel does not cope with NaN in input, see gh issue 723 ipiv[j + offset] = jp + offset; jp--; temp1 = *(b + jp); diff --git a/lapack/getf2/zgetf2_k.c b/lapack/getf2/zgetf2_k.c index e3d53c96f..6a2137b3e 100644 --- a/lapack/getf2/zgetf2_k.c +++ b/lapack/getf2/zgetf2_k.c @@ -99,7 +99,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, GEMV_N(m - j, j, 0, dm1, ZERO, a + j * 2, lda, b, 1, b + j * 2, 1, sb); jp = j + IAMAX_K(m - j, b + j * 2, 1); - if (jp>m) jp = m; //avoid out of boundary + if (jp>m) jp = m; //avoid out of boundary when the iamax kernel does not cope with NaN in input, see gh issue 723 ipiv[j + offset] = jp + offset; jp--; diff --git a/param.h b/param.h index aa193a284..03bf3624f 100644 --- a/param.h +++ b/param.h @@ -2845,31 +2845,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_B 0 #define GEMM_DEFAULT_ALIGN 0x0ffffUL +#if defined(NO_LASX) +#define DGEMM_DEFAULT_UNROLL_N 8 +#define DGEMM_DEFAULT_UNROLL_M 2 #define SGEMM_DEFAULT_UNROLL_N 8 +#define SGEMM_DEFAULT_UNROLL_M 2 +#else #define DGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_M 16 +#define SGEMM_DEFAULT_UNROLL_N 8 +#define SGEMM_DEFAULT_UNROLL_M 16 +#endif + #define QGEMM_DEFAULT_UNROLL_N 2 #define CGEMM_DEFAULT_UNROLL_N 4 #define ZGEMM_DEFAULT_UNROLL_N 4 #define XGEMM_DEFAULT_UNROLL_N 1 -#define SGEMM_DEFAULT_UNROLL_M 2 -#define DGEMM_DEFAULT_UNROLL_M 16 #define QGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_M 1 #define ZGEMM_DEFAULT_UNROLL_M 1 #define XGEMM_DEFAULT_UNROLL_M 1 -#define SGEMM_DEFAULT_P 512 +#define SGEMM_DEFAULT_P 256 #define DGEMM_DEFAULT_P 32 #define CGEMM_DEFAULT_P 128 #define ZGEMM_DEFAULT_P 128 -#define SGEMM_DEFAULT_R 12288 +#define SGEMM_DEFAULT_R 1024 #define DGEMM_DEFAULT_R 858 #define CGEMM_DEFAULT_R 4096 #define ZGEMM_DEFAULT_R 4096 -#define SGEMM_DEFAULT_Q 128 +#define SGEMM_DEFAULT_Q 256 #define DGEMM_DEFAULT_Q 152 #define CGEMM_DEFAULT_Q 128 #define ZGEMM_DEFAULT_Q 128 @@ -3371,7 +3379,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define CGEMM_DEFAULT_R 4096 #define ZGEMM_DEFAULT_R 4096 -#elif defined(NEOVERSEV1) +#elif defined(NEOVERSEV1) // 256-bit SVE #if defined(XDOUBLE) || defined(DOUBLE) #define SWITCH_RATIO 8 @@ -3385,11 +3393,13 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define DGEMM_DEFAULT_UNROLL_M 4 // Actually 2VL (8) but kept separate to keep copies separate #define DGEMM_DEFAULT_UNROLL_N 8 -#define CGEMM_DEFAULT_UNROLL_M 8 +#define CGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_N 4 +#define CGEMM_DEFAULT_UNROLL_MN 16 -#define ZGEMM_DEFAULT_UNROLL_M 4 +#define ZGEMM_DEFAULT_UNROLL_M 2 #define ZGEMM_DEFAULT_UNROLL_N 4 +#define ZGEMM_DEFAULT_UNROLL_MN 16 #define SGEMM_DEFAULT_P 128 #define DGEMM_DEFAULT_P 160 @@ -3449,7 +3459,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define CGEMM_DEFAULT_R 4096 #define ZGEMM_DEFAULT_R 4096 -#elif defined(ARMV8SVE) || defined(A64FX) || defined(ARMV9) || 
defined(CORTEXA510)|| defined(CORTEXA710) || defined(CORTEXX2) +#elif defined(A64FX) // 512-bit SVE /* When all BLAS3 routines are implemeted with SVE, SGEMM_DEFAULT_UNROLL_M should be "sve_vl". Until then, just keep it different than DGEMM_DEFAULT_UNROLL_N to keep copy routines in both directions seperated. */ @@ -3490,6 +3500,43 @@ Until then, just keep it different than DGEMM_DEFAULT_UNROLL_N to keep copy rout #define CGEMM_DEFAULT_R 4096 #define ZGEMM_DEFAULT_R 4096 +#elif defined(ARMV8SVE) || defined(ARMV9) || defined(CORTEXA510)|| defined(CORTEXA710) || defined(CORTEXX2) // 128-bit SVE + +#if defined(XDOUBLE) || defined(DOUBLE) +#define SWITCH_RATIO 8 +#else +#define SWITCH_RATIO 16 +#endif + +#define SGEMM_DEFAULT_UNROLL_M 4 // Actually 1VL (8) but kept seperate to keep copies seperate +#define SGEMM_DEFAULT_UNROLL_N 8 + +#define DGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_N 8 + +#define CGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_N 4 +#define CGEMM_DEFAULT_UNROLL_MN 16 + +#define ZGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_N 4 +#define ZGEMM_DEFAULT_UNROLL_MN 16 + +#define SGEMM_DEFAULT_P 128 +#define DGEMM_DEFAULT_P 160 +#define CGEMM_DEFAULT_P 128 +#define ZGEMM_DEFAULT_P 128 + +#define SGEMM_DEFAULT_Q 352 +#define DGEMM_DEFAULT_Q 128 +#define CGEMM_DEFAULT_Q 224 +#define ZGEMM_DEFAULT_Q 112 + +#define SGEMM_DEFAULT_R 4096 +#define DGEMM_DEFAULT_R 4096 +#define CGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R 4096 + #else /* Other/undetected ARMv8 cores */ #define SGEMM_DEFAULT_UNROLL_M 16 diff --git a/test/Makefile b/test/Makefile index 923f1537c..46a7b1158 100644 --- a/test/Makefile +++ b/test/Makefile @@ -265,7 +265,7 @@ FLDFLAGS = $(FFLAGS:-fPIC=) $(LDFLAGS) ifeq ($(USE_OPENMP), 1) ifeq ($(F_COMPILER), GFORTRAN) ifeq ($(C_COMPILER), CLANG) -CEXTRALIB = -lomp +CEXTRALIB += -lomp endif endif ifeq ($(F_COMPILER), NAG) diff --git a/utest/test_axpy.c b/utest/test_axpy.c index 5fd7c1b04..26005e70f 100644 --- a/utest/test_axpy.c +++ b/utest/test_axpy.c @@ -74,6 +74,26 @@ CTEST(axpy,zaxpy_inc_0) ASSERT_DBL_NEAR_TOL(y2[i], y1[i], DOUBLE_EPS); } } + +CTEST(axpy,zaxpy_incx_0) +{ + blasint i; + blasint N=4,incX=0,incY=1; + double a[2]={0.25,0.5}; + double x1[]={1.0,3.0,5.0,7.0,1.0,3.0,5.0,7.0}; + double y1[]={2.0,4.0,6.0,8.0,2.0,4.0,6.0,8.0}; + double x2[]={1.0,3.0,5.0,7.0,1.0,3.0,5.0,7.0}; + double y2[]={0.75,5.25,4.75,9.25,0.75,5.25,4.75,9.25}; + + //OpenBLAS + BLASFUNC(zaxpy)(&N,a,x1,&incX,y1,&incY); + + for(i=0; i<2*N; i++){ + ASSERT_DBL_NEAR_TOL(x2[i], x1[i], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(y2[i], y1[i], DOUBLE_EPS); + } +} + #endif #ifdef BUILD_SINGLE @@ -116,5 +136,24 @@ CTEST(axpy,caxpy_inc_0) ASSERT_DBL_NEAR_TOL(y2[i], y1[i], DOUBLE_EPS); } } + +CTEST(axpy,caxpy_incx_0) +{ + blasint i; + blasint N=4,incX=0,incY=1; + float a[2]={0.25,0.5}; + float x1[]={1.0,3.0,5.0,7.0,1.0,3.0,5.0,7.0}; + float y1[]={2.0,4.0,6.0,8.0,2.0,4.0,6.0,8.0}; + double x2[]={1.0,3.0,5.0,7.0,1.0,3.0,5.0,7.0}; + double y2[]={0.75,5.25,4.75,9.25,0.75,5.25,4.75,9.25}; + + //OpenBLAS + BLASFUNC(caxpy)(&N,a,x1,&incX,y1,&incY); + + for(i=0; i<2*N; i++){ + ASSERT_DBL_NEAR_TOL(x2[i], x1[i], DOUBLE_EPS); + ASSERT_DBL_NEAR_TOL(y2[i], y1[i], DOUBLE_EPS); + } +} #endif
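The expected values in the new caxpy_incx_0 and zaxpy_incx_0 cases follow directly from the reference AXPY update y := alpha*x + y: with incX = 0 the x index never advances, so every element of y is updated with the same first complex entry of x. For alpha = 0.25 + 0.5i and x[0] = 1 + 3i, alpha*x[0] = (0.25*1 - 0.5*3) + (0.25*3 + 0.5*1)i = -1.25 + 1.25i, which turns the starting pairs (2,4), (6,8), (2,4), (6,8) into (0.75, 5.25), (4.75, 9.25), (0.75, 5.25), (4.75, 9.25) as listed in y2[], while x1 must come back unchanged. A standalone cross-check of that arithmetic, independent of the BLAS build (not part of the test suite):

/* Cross-check of the zaxpy_incx_0 expectation: with incX = 0, every update reuses x[0]. */
#include <stdio.h>
#include <complex.h>

int main(void) {
    double complex a  = 0.25 + 0.5 * I;
    double complex x0 = 1.0 + 3.0 * I;                   /* the single reused x element */
    double complex y[4] = { 2.0 + 4.0 * I, 6.0 + 8.0 * I,
                            2.0 + 4.0 * I, 6.0 + 8.0 * I };
    for (int i = 0; i < 4; i++) {
        y[i] += a * x0;                                   /* y := a*x + y with x fixed at x[0] */
        printf("(%g, %g)\n", creal(y[i]), cimag(y[i]));   /* expect (0.75,5.25), (4.75,9.25), ... */
    }
    return 0;
}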