diff --git a/.github/workflows/mips64.yml b/.github/workflows/mips64.yml new file mode 100644 index 000000000..a5bd7b84b --- /dev/null +++ b/.github/workflows/mips64.yml @@ -0,0 +1,114 @@ +name: mips64 qemu test + +on: [push, pull_request] + +jobs: + TEST: + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + include: + - target: MIPS64_GENERIC + triple: mips64el-linux-gnuabi64 + opts: NO_SHARED=1 TARGET=MIPS64_GENERIC + - target: SICORTEX + triple: mips64el-linux-gnuabi64 + opts: NO_SHARED=1 TARGET=SICORTEX + - target: I6400 + triple: mipsisa64r6el-linux-gnuabi64 + opts: NO_SHARED=1 TARGET=I6400 + - target: P6600 + triple: mipsisa64r6el-linux-gnuabi64 + opts: NO_SHARED=1 TARGET=P6600 + - target: I6500 + triple: mipsisa64r6el-linux-gnuabi64 + opts: NO_SHARED=1 TARGET=I6500 + + steps: + - name: Checkout repository + uses: actions/checkout@v3 + + - name: install build deps + run: | + sudo apt-get update + sudo apt-get install autoconf automake autotools-dev ninja-build make ccache \ + gcc-${{ matrix.triple }} gfortran-${{ matrix.triple }} libgomp1-mips64el-cross + + - name: checkout qemu + uses: actions/checkout@v3 + with: + repository: qemu/qemu + path: qemu + ref: 79dfa177ae348bb5ab5f97c0915359b13d6186e2 + + - name: build qemu + run: | + cd qemu + ./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=mips64el-linux-user --disable-system + make -j$(nproc) + make install + + - name: Compilation cache + uses: actions/cache@v3 + with: + path: ~/.ccache + key: ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }}-${{ github.sha }} + restore-keys: | + ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }} + ccache-${{ runner.os }}-${{ matrix.target }} + + - name: Configure ccache + run: | + test -d ~/.ccache || mkdir -p ~/.ccache + echo "max_size = 300M" > ~/.ccache/ccache.conf + echo "compression = true" >> ~/.ccache/ccache.conf + ccache -s + + - name: build OpenBLAS + run: make CC='ccache ${{ matrix.triple }}-gcc -static' FC='ccache ${{ matrix.triple }}-gfortran -static' ${{ matrix.opts }} HOSTCC='ccache gcc' -j$(nproc) + + - name: test + run: | + export PATH=$GITHUB_WORKSPACE/qemu-install/bin/:$PATH + qemu-mips64el ./utest/openblas_utest + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xscblat1 + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xdcblat1 + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xccblat1 + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xzcblat1 + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xscblat2 < ./ctest/sin2 + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xdcblat2 < ./ctest/din2 + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xccblat2 < ./ctest/cin2 + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xzcblat2 < ./ctest/zin2 + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xscblat3 < ./ctest/sin3 + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xdcblat3 < ./ctest/din3 + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xccblat3 < ./ctest/cin3 + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xzcblat3 < ./ctest/zin3 + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/sblat1 + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/dblat1 + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/cblat1 + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/zblat1 + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/sblat1 + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/dblat1 + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/cblat1 + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/zblat1 + rm -f ./test/?BLAT2.SUMM + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/sblat2 < ./test/sblat2.dat + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/dblat2 < ./test/dblat2.dat + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/cblat2 < ./test/cblat2.dat + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/zblat2 < ./test/zblat2.dat + rm -f ./test/?BLAT2.SUMM + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/sblat2 < ./test/sblat2.dat + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/dblat2 < ./test/dblat2.dat + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/cblat2 < ./test/cblat2.dat + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/zblat2 < ./test/zblat2.dat + rm -f ./test/?BLAT3.SUMM + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/sblat3 < ./test/sblat3.dat + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/dblat3 < ./test/dblat3.dat + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/cblat3 < ./test/cblat3.dat + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/zblat3 < ./test/zblat3.dat + rm -f ./test/?BLAT3.SUMM + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/sblat3 < ./test/sblat3.dat + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/dblat3 < ./test/dblat3.dat + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/cblat3 < ./test/cblat3.dat + OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/zblat3 < ./test/zblat3.dat diff --git a/kernel/mips/sdot_msa.c b/kernel/mips/sdot_msa.c index e02e10c61..8c250d401 100644 --- a/kernel/mips/sdot_msa.c +++ b/kernel/mips/sdot_msa.c @@ -39,10 +39,19 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) FLOAT x0, x1, x2, x3, y0, y1, y2, y3; v4f32 vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7; v4f32 vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7; +#if defined(DSDOT) + v2f64 dvx0, dvx1, dvx2, dvx3, dvx4, dvx5, dvx6, dvx7; + v2f64 dvy0, dvy1, dvy2, dvy3, dvy4, dvy5, dvy6, dvy7; + v2f64 dot0 = {0, 0}; + v2f64 dot1 = {0, 0}; + v2f64 dot2 = {0, 0}; + v2f64 dot3 = {0, 0}; +#else v4f32 dot0 = {0, 0, 0, 0}; v4f32 dot1 = {0, 0, 0, 0}; v4f32 dot2 = {0, 0, 0, 0}; v4f32 dot3 = {0, 0, 0, 0}; +#endif if (n < 1) return (dot); @@ -83,6 +92,61 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) x_pref += 32; y_pref += 32; +#if defined(DSDOT) + /* Extend single precision to double precision */ + dvy0 = __msa_fexupr_d(vy0); + dvy1 = __msa_fexupr_d(vy1); + dvy2 = __msa_fexupr_d(vy2); + dvy3 = __msa_fexupr_d(vy3); + dvy4 = __msa_fexupr_d(vy4); + dvy5 = __msa_fexupr_d(vy5); + dvy6 = __msa_fexupr_d(vy6); + dvy7 = __msa_fexupr_d(vy7); + + vy0 = (v4f32)__msa_fexupl_d(vy0); + vy1 = (v4f32)__msa_fexupl_d(vy1); + vy2 = (v4f32)__msa_fexupl_d(vy2); + vy3 = (v4f32)__msa_fexupl_d(vy3); + vy4 = (v4f32)__msa_fexupl_d(vy4); + vy5 = (v4f32)__msa_fexupl_d(vy5); + vy6 = (v4f32)__msa_fexupl_d(vy6); + vy7 = (v4f32)__msa_fexupl_d(vy7); + + dvx0 = __msa_fexupr_d(vx0); + dvx1 = __msa_fexupr_d(vx1); + dvx2 = __msa_fexupr_d(vx2); + dvx3 = __msa_fexupr_d(vx3); + dvx4 = __msa_fexupr_d(vx4); + dvx5 = __msa_fexupr_d(vx5); + dvx6 = __msa_fexupr_d(vx6); + dvx7 = __msa_fexupr_d(vx7); + + vx0 = (v4f32)__msa_fexupl_d(vx0); + vx1 = (v4f32)__msa_fexupl_d(vx1); + vx2 = (v4f32)__msa_fexupl_d(vx2); + vx3 = (v4f32)__msa_fexupl_d(vx3); + vx4 = (v4f32)__msa_fexupl_d(vx4); + vx5 = (v4f32)__msa_fexupl_d(vx5); + vx6 = (v4f32)__msa_fexupl_d(vx6); + vx7 = (v4f32)__msa_fexupl_d(vx7); + + dot0 += (dvy0 * dvx0); + dot1 += (dvy1 * dvx1); + dot2 += (dvy2 * dvx2); + dot3 += (dvy3 * dvx3); + dot0 += (dvy4 * dvx4); + dot1 += (dvy5 * dvx5); + dot2 += (dvy6 * dvx6); + dot3 += (dvy7 * dvx7); + dot0 += ((v2f64)vy0 * (v2f64)vx0); + dot1 += ((v2f64)vy1 * (v2f64)vx1); + dot2 += ((v2f64)vy2 * (v2f64)vx2); + dot3 += ((v2f64)vy3 * (v2f64)vx3); + dot0 += ((v2f64)vy4 * (v2f64)vx4); + dot1 += ((v2f64)vy5 * (v2f64)vx5); + dot2 += ((v2f64)vy6 * (v2f64)vx6); + dot3 += ((v2f64)vy7 * (v2f64)vx7); +#else dot0 += (vy0 * vx0); dot1 += (vy1 * vx1); dot2 += (vy2 * vx2); @@ -91,6 +155,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) dot1 += (vy5 * vx5); dot2 += (vy6 * vx6); dot3 += (vy7 * vx7); +#endif } if (n & 31) @@ -100,10 +165,41 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) LD_SP4_INC(x, 4, vx0, vx1, vx2, vx3); LD_SP4_INC(y, 4, vy0, vy1, vy2, vy3); +#if defined(DSDOT) + dvy0 = __msa_fexupr_d(vy0); + dvy1 = __msa_fexupr_d(vy1); + dvy2 = __msa_fexupr_d(vy2); + dvy3 = __msa_fexupr_d(vy3); + + vy0 = (v4f32)__msa_fexupl_d(vy0); + vy1 = (v4f32)__msa_fexupl_d(vy1); + vy2 = (v4f32)__msa_fexupl_d(vy2); + vy3 = (v4f32)__msa_fexupl_d(vy3); + + dvx0 = __msa_fexupr_d(vx0); + dvx1 = __msa_fexupr_d(vx1); + dvx2 = __msa_fexupr_d(vx2); + dvx3 = __msa_fexupr_d(vx3); + + vx0 = (v4f32)__msa_fexupl_d(vx0); + vx1 = (v4f32)__msa_fexupl_d(vx1); + vx2 = (v4f32)__msa_fexupl_d(vx2); + vx3 = (v4f32)__msa_fexupl_d(vx3); + + dot0 += (dvy0 * dvx0); + dot1 += (dvy1 * dvx1); + dot2 += (dvy2 * dvx2); + dot3 += (dvy3 * dvx3); + dot0 += ((v2f64)vy0 * (v2f64)vx0); + dot1 += ((v2f64)vy1 * (v2f64)vx1); + dot2 += ((v2f64)vy2 * (v2f64)vx2); + dot3 += ((v2f64)vy3 * (v2f64)vx3); +#else dot0 += (vy0 * vx0); dot1 += (vy1 * vx1); dot2 += (vy2 * vx2); dot3 += (vy3 * vx3); +#endif } if (n & 8) @@ -111,8 +207,27 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) LD_SP2_INC(x, 4, vx0, vx1); LD_SP2_INC(y, 4, vy0, vy1); +#if defined(DSDOT) + dvy0 = __msa_fexupr_d(vy0); + dvy1 = __msa_fexupr_d(vy1); + + vy0 = (v4f32)__msa_fexupl_d(vy0); + vy1 = (v4f32)__msa_fexupl_d(vy1); + + dvx0 = __msa_fexupr_d(vx0); + dvx1 = __msa_fexupr_d(vx1); + + vx0 = (v4f32)__msa_fexupl_d(vx0); + vx1 = (v4f32)__msa_fexupl_d(vx1); + + dot0 += (dvy0 * dvx0); + dot1 += (dvy1 * dvx1); + dot0 += ((v2f64)vy0 * (v2f64)vx0); + dot1 += ((v2f64)vy1 * (v2f64)vx1); +#else dot0 += (vy0 * vx0); dot1 += (vy1 * vx1); +#endif } if (n & 4) @@ -120,7 +235,16 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) vx0 = LD_SP(x); x += 4; vy0 = LD_SP(y); y += 4; +#if defined(DSDOT) + dvy0 = __msa_fexupr_d(vy0); + vy0 = (v4f32)__msa_fexupl_d(vy0); + dvx0 = __msa_fexupr_d(vx0); + vx0 = (v4f32)__msa_fexupl_d(vx0); + dot0 += (dvy0 * dvx0); + dot0 += ((v2f64)vy0 * (v2f64)vx0); +#else dot0 += (vy0 * vx0); +#endif } if (n & 2) @@ -128,8 +252,13 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) LD_GP2_INC(x, 1, x0, x1); LD_GP2_INC(y, 1, y0, y1); +#if defined(DSDOT) + dot += ((double)y0 * (double)x0); + dot += ((double)y1 * (double)x1); +#else dot += (y0 * x0); dot += (y1 * x1); +#endif } if (n & 1) @@ -137,7 +266,11 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) x0 = *x; y0 = *y; +#if defined(DSDOT) + dot += ((double)y0 * (double)x0); +#else dot += (y0 * x0); +#endif } } @@ -145,8 +278,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) dot += dot0[0]; dot += dot0[1]; +#if !defined(DSDOT) dot += dot0[2]; dot += dot0[3]; +#endif } else { @@ -155,10 +290,17 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) LD_GP4_INC(x, inc_x, x0, x1, x2, x3); LD_GP4_INC(y, inc_y, y0, y1, y2, y3); +#if defined(DSDOT) + dot += ((double)y0 * (double)x0); + dot += ((double)y1 * (double)x1); + dot += ((double)y2 * (double)x2); + dot += ((double)y3 * (double)x3); +#else dot += (y0 * x0); dot += (y1 * x1); dot += (y2 * x2); dot += (y3 * x3); +#endif } if (n & 2) @@ -166,8 +308,13 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) LD_GP2_INC(x, inc_x, x0, x1); LD_GP2_INC(y, inc_y, y0, y1); +#if defined(DSDOT) + dot += ((double)y0 * (double)x0); + dot += ((double)y1 * (double)x1); +#else dot += (y0 * x0); dot += (y1 * x1); +#endif } if (n & 1) @@ -175,7 +322,11 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) x0 = *x; y0 = *y; +#if defined(DSDOT) + dot += ((double)y0 * (double)x0); +#else dot += (y0 * x0); +#endif } }