diff --git a/.github/workflows/loongarch64.yml b/.github/workflows/loongarch64.yml new file mode 100644 index 000000000..5501e98e0 --- /dev/null +++ b/.github/workflows/loongarch64.yml @@ -0,0 +1,110 @@ +name: loongarch64 qemu test + +on: [push, pull_request] + +jobs: + TEST: + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + include: + - target: LOONGSONGENERIC + triple: loongarch64-unknown-linux-gnu + opts: NO_SHARED=1 TARGET=LOONGSONGENERIC + - target: LOONGSON3R5 + triple: loongarch64-unknown-linux-gnu + opts: NO_SHARED=1 TARGET=LOONGSON3R5 + - target: LOONGSON2K1000 + triple: loongarch64-unknown-linux-gnu + opts: NO_SHARED=1 TARGET=LOONGSON2K1000 + + steps: + - name: Checkout repository + uses: actions/checkout@v3 + + - name: Install APT deps + run: | + sudo add-apt-repository ppa:savoury1/virtualisation + sudo apt-get update + sudo apt-get install autoconf automake autotools-dev ninja-build make ccache \ + qemu-user-static + + - name: Download and install loongarch64-toolchain + run: | + wget https://github.com/loongson/build-tools/releases/download/2022.09.06/loongarch64-clfs-7.3-cross-tools-gcc-glibc.tar.xz + tar -xf loongarch64-clfs-7.3-cross-tools-gcc-glibc.tar.xz -C /opt + + - name: Set env + run: | + echo "LD_LIBRARY_PATH=/opt/cross-tools/target/usr/lib64:/opt/cross-tools/loongarch64-unknown-linux-gnu/lib64:$LD_LIBRARY_PATH" >> $GITHUB_ENV + echo "PATH=$GITHUB_WORKSPACE:/opt/cross-tools/bin:$PATH" >> $GITHUB_ENV + + - name: Compilation cache + uses: actions/cache@v3 + with: + path: ~/.ccache + key: ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }}-${{ github.sha }} + restore-keys: | + ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }} + ccache-${{ runner.os }}-${{ matrix.target }} + + - name: Configure ccache + run: | + test -d ~/.ccache || mkdir -p ~/.ccache + echo "max_size = 300M" > ~/.ccache/ccache.conf + echo "compression = true" >> ~/.ccache/ccache.conf + ccache -s + + - name: Disable utest dsdot:dsdot_n_1 + run: | + echo -n > utest/test_dsdot.c + echo "Due to the qemu versions 7.2 causing utest cases to fail," + echo "the utest dsdot:dsdot_n_1 have been temporarily disabled." + + - name: Build OpenBLAS + run: make CC='ccache ${{ matrix.triple }}-gcc -static' FC='ccache ${{ matrix.triple }}-gfortran -static' ${{ matrix.opts }} HOSTCC='ccache gcc' -j$(nproc) + + - name: Test + run: | + qemu-loongarch64-static ./utest/openblas_utest + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xscblat1 + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xdcblat1 + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xccblat1 + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xzcblat1 + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xscblat2 < ./ctest/sin2 + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xdcblat2 < ./ctest/din2 + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xccblat2 < ./ctest/cin2 + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xzcblat2 < ./ctest/zin2 + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xscblat3 < ./ctest/sin3 + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xdcblat3 < ./ctest/din3 + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xccblat3 < ./ctest/cin3 + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./ctest/xzcblat3 < ./ctest/zin3 + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/sblat1 + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/dblat1 + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/cblat1 + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/zblat1 + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/sblat1 + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/dblat1 + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/cblat1 + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/zblat1 + rm -f ./test/?BLAT2.SUMM + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/sblat2 < ./test/sblat2.dat + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/dblat2 < ./test/dblat2.dat + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/cblat2 < ./test/cblat2.dat + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/zblat2 < ./test/zblat2.dat + rm -f ./test/?BLAT2.SUMM + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/sblat2 < ./test/sblat2.dat + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/dblat2 < ./test/dblat2.dat + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/cblat2 < ./test/cblat2.dat + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/zblat2 < ./test/zblat2.dat + rm -f ./test/?BLAT3.SUMM + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/sblat3 < ./test/sblat3.dat + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/dblat3 < ./test/dblat3.dat + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/cblat3 < ./test/cblat3.dat + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-loongarch64-static ./test/zblat3 < ./test/zblat3.dat + rm -f ./test/?BLAT3.SUMM + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/sblat3 < ./test/sblat3.dat + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/dblat3 < ./test/dblat3.dat + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/cblat3 < ./test/cblat3.dat + OPENBLAS_NUM_THREADS=2 qemu-loongarch64-static ./test/zblat3 < ./test/zblat3.dat diff --git a/Makefile.system b/Makefile.system index 71535b0cb..3c1648dc7 100644 --- a/Makefile.system +++ b/Makefile.system @@ -1770,6 +1770,8 @@ export TARGET_CORE export NO_AVX512 export NO_AVX2 export BUILD_BFLOAT16 +export NO_LSX +export NO_LASX export SBGEMM_UNROLL_M export SBGEMM_UNROLL_N diff --git a/c_check b/c_check index 7c8494e4a..7ee183163 100755 --- a/c_check +++ b/c_check @@ -185,6 +185,37 @@ if [ "$architecture" = "mips" ] || [ "$architecture" = "mips64" ]; then rm -rf "$tmpd" fi +no_lsx=0 +no_lasx=0 +if [ "$architecture" = "loongarch64" ]; then + tmpd="$(mktemp -d)" + tmplsx="$tmpd/lsx.c" + codelsx='"vadd.b $vr0, $vr0, $vr0"' + lsx_flags='-march=loongarch64 -mlsx' + printf "#include \n\n" >> "$tmplsx" + printf "void main(void){ __asm__ volatile(%s);}\n" "$codelsx" >> "$tmplsx" + args="$lsx_flags -o $tmplsx.o $tmplsx" + { + $compiler_name $flags $args >/dev/null 2>&1 + } || { + no_lsx=1 + } + + tmplasx="$tmpd/lasx.c" + codelasx='"xvadd.b $xr0, $xr0, $xr0"' + lasx_flags='-march=loongarch64 -mlasx' + printf "#include \n\n" >> "$tmplasx" + printf "void main(void){ __asm__ volatile(%s);}\n" "$codelasx" >> "$tmplasx" + args="$lasx_flags -o $tmplasx.o $tmplasx" + { + $compiler_name $flags $args >/dev/null 2>&1 + } || { + no_lasx=1 + } + + rm -rf "$tmpd" +fi + case "$data" in *ARCH_X86_64*) architecture=x86_64 ;; *ARCH_X86*) architecture=x86 ;; @@ -399,6 +430,8 @@ done [ "$no_avx512" -eq 1 ] && printf "NO_AVX512=1\n" [ "$no_avx2" -eq 1 ] && printf "NO_AVX2=1\n" [ "$oldgcc" -eq 1 ] && printf "OLDGCC=1\n" + [ "$no_lsx" -eq 1 ] && printf "NO_LSX=1\n" + [ "$no_lasx" -eq 1 ] && printf "NO_LASX=1\n" } >> "$makefile" os=`echo "$os" | tr '[[:lower:]]' '[[:upper:]]'/ ` @@ -414,6 +447,8 @@ compiler=`echo "$compiler" | tr '[[:lower:]]' '[[:upper:]]' ` [ -n "$need_fu" ] && printf "#define FUNDERSCORE\t%s\n" "$need_fu" [ "$no_msa" -eq 1 ] && printf "#define NO_MSA\t1\n" [ "$c11_atomics" -eq 1 ] && printf "#define HAVE_C11\t1\n" + [ "$no_lsx" -eq 1 ] && printf "#define NO_LSX\t1\n" + [ "$no_lasx" -eq 1 ] && printf "#define NO_LASX\t1\n" } >> "$config" diff --git a/c_check.pl b/c_check.pl index 6ce28e11b..7a860a211 100644 --- a/c_check.pl +++ b/c_check.pl @@ -232,6 +232,47 @@ if (($architecture eq "mips") || ($architecture eq "mips64")) { } } +$no_lsx = 0; +$no_lasx = 0; +if (($architecture eq "loongarch64")) { + eval "use File::Temp qw(tempfile)"; + if ($@){ + warn "could not load PERL module File::Temp, so could not check LSX and LASX capatibility"; + } else { + $tmplsx = new File::Temp( SUFFIX => '.c' , UNLINK => 1 ); + $codelsx = '"vadd.b $vr0, $vr0, $vr0"'; + $lsx_flags = "-march=loongarch64 -mlsx"; + print $tmplsx "#include \n\n"; + print $tmplsx "void main(void){ __asm__ volatile($codelsx); }\n"; + + $args = "$lsx_flags -o $tmplsx.o $tmplsx"; + my @cmd = ("$compiler_name $flags $args >/dev/null 2>/dev/null"); + system(@cmd) == 0; + if ($? != 0) { + $no_lsx = 1; + } else { + $no_lsx = 0; + } + unlink("$tmplsx.o"); + + $tmplasx = new File::Temp( SUFFIX => '.c' , UNLINK => 1 ); + $codelasx = '"xvadd.b $xr0, $xr0, $xr0"'; + $lasx_flags = "-march=loongarch64 -mlasx"; + print $tmplasx "#include \n\n"; + print $tmplasx "void main(void){ __asm__ volatile($codelasx); }\n"; + + $args = "$lasx_flags -o $tmplasx.o $tmplasx"; + my @cmd = ("$compiler_name $flags $args >/dev/null 2>/dev/null"); + system(@cmd) == 0; + if ($? != 0) { + $no_lasx = 1; + } else { + $no_lasx = 0; + } + unlink("$tmplasx.o"); + } +} + $architecture = x86 if ($data =~ /ARCH_X86/); $architecture = x86_64 if ($data =~ /ARCH_X86_64/); $architecture = e2k if ($data =~ /ARCH_E2K/); @@ -424,6 +465,8 @@ print MAKEFILE "NO_RV64GV=1\n" if $no_rv64gv eq 1; print MAKEFILE "NO_AVX512=1\n" if $no_avx512 eq 1; print MAKEFILE "NO_AVX2=1\n" if $no_avx2 eq 1; print MAKEFILE "OLDGCC=1\n" if $oldgcc eq 1; +print MAKEFILE "NO_LSX=1\n" if $no_lsx eq 1; +print MAKEFILE "NO_LASX=1\n" if $no_lasx eq 1; $os =~ tr/[a-z]/[A-Z]/; $architecture =~ tr/[a-z]/[A-Z]/; @@ -437,6 +480,8 @@ print CONFFILE "#define __64BIT__\t1\n" if $binformat eq bin64; print CONFFILE "#define FUNDERSCORE\t$need_fu\n" if $need_fu ne ""; print CONFFILE "#define HAVE_MSA\t1\n" if $have_msa eq 1; print CONFFILE "#define HAVE_C11\t1\n" if $c11_atomics eq 1; +print CONFFILE "#define NO_LSX\t1\n" if $no_lsx eq 1; +print CONFFILE "#define NO_LASX\t1\n" if $no_lasx eq 1; if ($os eq "LINUX") { diff --git a/cpuid_loongarch64.c b/cpuid_loongarch64.c index ca07c7ffb..7c389db27 100644 --- a/cpuid_loongarch64.c +++ b/cpuid_loongarch64.c @@ -32,6 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. **********************************************************************************/ #include +#include /* If LASX extension instructions supported, * using core LOONGSON3R5 @@ -46,9 +47,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CPU_LOONGSON3R5 1 #define CPU_LOONGSON2K1000 2 -#define LOONGARCH_CFG2 0x02 -#define LOONGARCH_LASX 1<<7 -#define LOONGARCH_LSX 1<<6 +#define LA_HWCAP_LSX (1<<4) +#define LA_HWCAP_LASX (1<<5) static char *cpuname[] = { "LOONGSONGENERIC", @@ -64,17 +64,11 @@ static char *cpuname_lower[] = { int detect(void) { #ifdef __linux - uint32_t reg = 0; + int flag = (int)getauxval(AT_HWCAP); - __asm__ volatile ( - "cpucfg %0, %1 \n\t" - : "+&r"(reg) - : "r"(LOONGARCH_CFG2) - ); - - if (reg & LOONGARCH_LASX) + if (flag & LA_HWCAP_LASX) return CPU_LOONGSON3R5; - else if (reg & LOONGARCH_LSX) + else if (flag & LA_HWCAP_LSX) return CPU_LOONGSON2K1000; else return CPU_GENERIC; diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index cda359040..253aa2464 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -1,3 +1,4 @@ +ifndef NO_LASX DGEMMKERNEL = dgemm_kernel_16x4.S DGEMMINCOPY = dgemm_ncopy_16.S DGEMMITCOPY = dgemm_tcopy_16.S @@ -7,6 +8,7 @@ DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +endif DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c diff --git a/param.h b/param.h index 8fb4bcc48..547463b2f 100644 --- a/param.h +++ b/param.h @@ -2845,15 +2845,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_B 0 #define GEMM_DEFAULT_ALIGN 0x0ffffUL -#define SGEMM_DEFAULT_UNROLL_N 8 +#if defined(NO_LASX) +#define DGEMM_DEFAULT_UNROLL_N 8 +#define DGEMM_DEFAULT_UNROLL_M 2 +#else #define DGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_M 16 +#endif + +#define SGEMM_DEFAULT_UNROLL_N 8 #define QGEMM_DEFAULT_UNROLL_N 2 #define CGEMM_DEFAULT_UNROLL_N 4 #define ZGEMM_DEFAULT_UNROLL_N 4 #define XGEMM_DEFAULT_UNROLL_N 1 #define SGEMM_DEFAULT_UNROLL_M 2 -#define DGEMM_DEFAULT_UNROLL_M 16 #define QGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_M 1 #define ZGEMM_DEFAULT_UNROLL_M 1