Merge branch 'xianyi:develop' into travispytorch

This commit is contained in:
Martin Kroeker 2022-09-21 14:40:36 +02:00 committed by GitHub
commit db1c6a0b0f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
34 changed files with 506 additions and 68 deletions

114
.github/workflows/mips64.yml vendored Normal file
View File

@ -0,0 +1,114 @@
# CI workflow: cross-compiles OpenBLAS (static) for several MIPS64 targets and
# runs the resulting test binaries under a user-mode qemu that is built from a
# pinned qemu revision.
name: mips64 qemu test

on: [push, pull_request]

jobs:
  TEST:
    runs-on: ubuntu-latest
    strategy:
      # Let every target run to completion even if another one fails.
      fail-fast: false
      matrix:
        # One entry per OpenBLAS TARGET; `triple` selects the cross toolchain
        # packages and compiler prefix, `opts` is passed verbatim to make.
        include:
          - target: MIPS64_GENERIC
            triple: mips64el-linux-gnuabi64
            opts: NO_SHARED=1 TARGET=MIPS64_GENERIC
          - target: SICORTEX
            triple: mips64el-linux-gnuabi64
            opts: NO_SHARED=1 TARGET=SICORTEX
          - target: I6400
            triple: mipsisa64r6el-linux-gnuabi64
            opts: NO_SHARED=1 TARGET=I6400
          - target: P6600
            triple: mipsisa64r6el-linux-gnuabi64
            opts: NO_SHARED=1 TARGET=P6600
          - target: I6500
            triple: mipsisa64r6el-linux-gnuabi64
            opts: NO_SHARED=1 TARGET=I6500

    steps:
      - name: Checkout repository
        uses: actions/checkout@v3

      # `-y` keeps the install non-interactive on any runner instead of
      # depending on the runner image's apt defaults.
      - name: install build deps
        run: |
          sudo apt-get update
          sudo apt-get install -y autoconf automake autotools-dev ninja-build make ccache \
          gcc-${{ matrix.triple }} gfortran-${{ matrix.triple }} libgomp1-mips64el-cross

      # qemu is checked out at a fixed commit into ./qemu.
      - name: checkout qemu
        uses: actions/checkout@v3
        with:
          repository: qemu/qemu
          path: qemu
          ref: 79dfa177ae348bb5ab5f97c0915359b13d6186e2

      # Only the mips64el user-mode emulator is built; it is installed under
      # $GITHUB_WORKSPACE/qemu-install.
      - name: build qemu
        run: |
          cd qemu
          ./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=mips64el-linux-user --disable-system
          make -j$(nproc)
          make install

      # The key includes the commit SHA, so each run saves a new cache entry;
      # restore-keys fall back to the newest cache for the same ref, then for
      # the same target.
      - name: Compilation cache
        uses: actions/cache@v3
        with:
          path: ~/.ccache
          key: ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }}-${{ github.sha }}
          restore-keys: |
            ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }}
            ccache-${{ runner.os }}-${{ matrix.target }}

      - name: Configure ccache
        run: |
          test -d ~/.ccache || mkdir -p ~/.ccache
          echo "max_size = 300M" > ~/.ccache/ccache.conf
          echo "compression = true" >> ~/.ccache/ccache.conf
          ccache -s

      # Cross compilers link statically so the binaries can run under
      # user-mode qemu; HOSTCC is the native gcc.
      - name: build OpenBLAS
        run: make CC='ccache ${{ matrix.triple }}-gcc -static' FC='ccache ${{ matrix.triple }}-gfortran -static' ${{ matrix.opts }} HOSTCC='ccache gcc' -j$(nproc)

      # Runs utest, the ctest drivers and the test/ drivers under qemu. The
      # test/ drivers run once with a single thread and once with two threads;
      # the ?BLAT2/?BLAT3 summary files are removed before each pass.
      - name: test
        run: |
          export PATH=$GITHUB_WORKSPACE/qemu-install/bin/:$PATH
          qemu-mips64el ./utest/openblas_utest
          OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xscblat1
          OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xdcblat1
          OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xccblat1
          OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xzcblat1
          OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xscblat2 < ./ctest/sin2
          OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xdcblat2 < ./ctest/din2
          OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xccblat2 < ./ctest/cin2
          OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xzcblat2 < ./ctest/zin2
          OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xscblat3 < ./ctest/sin3
          OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xdcblat3 < ./ctest/din3
          OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xccblat3 < ./ctest/cin3
          OPENBLAS_NUM_THREADS=2 qemu-mips64el ./ctest/xzcblat3 < ./ctest/zin3
          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/sblat1
          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/dblat1
          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/cblat1
          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/zblat1
          OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/sblat1
          OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/dblat1
          OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/cblat1
          OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/zblat1
          rm -f ./test/?BLAT2.SUMM
          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/sblat2 < ./test/sblat2.dat
          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/dblat2 < ./test/dblat2.dat
          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/cblat2 < ./test/cblat2.dat
          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/zblat2 < ./test/zblat2.dat
          rm -f ./test/?BLAT2.SUMM
          OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/sblat2 < ./test/sblat2.dat
          OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/dblat2 < ./test/dblat2.dat
          OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/cblat2 < ./test/cblat2.dat
          OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/zblat2 < ./test/zblat2.dat
          rm -f ./test/?BLAT3.SUMM
          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/sblat3 < ./test/sblat3.dat
          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/dblat3 < ./test/dblat3.dat
          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/cblat3 < ./test/cblat3.dat
          OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 qemu-mips64el ./test/zblat3 < ./test/zblat3.dat
          rm -f ./test/?BLAT3.SUMM
          OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/sblat3 < ./test/sblat3.dat
          OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/dblat3 < ./test/dblat3.dat
          OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/cblat3 < ./test/cblat3.dat
          OPENBLAS_NUM_THREADS=2 qemu-mips64el ./test/zblat3 < ./test/zblat3.dat

View File

@ -197,14 +197,14 @@ if (DEFINED TARGET)
if (${TARGET} STREQUAL SKYLAKEX AND NOT NO_AVX512)
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512")
endif()
if (${TARGET} STREQUAL HASWELL AND NOT NO_AVX2)
if ((${TARGET} STREQUAL HASWELL OR ${TARGET} STREQUAL ZEN) AND NOT NO_AVX2)
if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU")
execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION)
if (${GCC_VERSION} VERSION_GREATER 4.7 OR ${GCC_VERSION} VERSION_EQUAL 4.7)
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2")
endif()
elseif (${CMAKE_C_COMPILER_ID} STREQUAL "CLANG")
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2")
set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2 -mfma")
endif()
endif()
if (DEFINED HAVE_AVX)

View File

@ -387,6 +387,10 @@ typedef int blasint;
#endif
*/
#ifdef __EMSCRIPTEN__
#define YIELDING
#endif
#ifndef YIELDING
#define YIELDING sched_yield()
#endif

View File

@ -173,3 +173,8 @@ HAVE_C11
ARCH_E2K
#endif
#if defined(__EMSCRIPTEN__)
ARCH_RISCV64
OS_WINDOWS
#endif

View File

@ -969,7 +969,7 @@ real *sfac;
1.17 };
/* Local variables */
extern /* Subroutine */ srottest_();
extern /* Subroutine */ void srottest_();
static integer i__, k, ksize;
extern /* Subroutine */ int stest_(), srotmtest_();
static integer ki, kn;

View File

@ -69,6 +69,8 @@
int blas_server_avail = 0;
extern int openblas_omp_adaptive_env();
static void * blas_thread_buffer[MAX_PARALLEL_NUMBER][MAX_CPU_NUMBER];
#ifdef HAVE_C11
static atomic_bool blas_buffer_inuse[MAX_PARALLEL_NUMBER];

View File

@ -23,7 +23,7 @@ ifeq ($(C_COMPILER), CLANG)
# Any clang posing as gcc 4.2 should be new enough (3.4 or later)
GCCVERSIONCHECK := $(GCCVERSIONGT4)$(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ2)
ifeq ($(GCCVERSIONCHECK), $(filter $(GCCVERSIONCHECK), 011 110 111))
AVX2OPT = -mavx2
AVX2OPT = -mavx2 -mfma
endif
endif
ifdef NO_AVX2
@ -73,6 +73,8 @@ else ifeq ($(TARGET_CORE), SKYLAKEX)
endif
else ifeq ($(TARGET_CORE), HASWELL)
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) $(AVX2OPT)
else ifeq ($(TARGET_CORE), ZEN)
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) $(AVX2OPT)
else ifeq ($(TARGET_CORE), LOONGSON3R4)
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) $(MSA_FLAGS)
else

View File

@ -39,10 +39,19 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
FLOAT x0, x1, x2, x3, y0, y1, y2, y3;
v4f32 vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7;
v4f32 vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7;
#if defined(DSDOT)
v2f64 dvx0, dvx1, dvx2, dvx3, dvx4, dvx5, dvx6, dvx7;
v2f64 dvy0, dvy1, dvy2, dvy3, dvy4, dvy5, dvy6, dvy7;
v2f64 dot0 = {0, 0};
v2f64 dot1 = {0, 0};
v2f64 dot2 = {0, 0};
v2f64 dot3 = {0, 0};
#else
v4f32 dot0 = {0, 0, 0, 0};
v4f32 dot1 = {0, 0, 0, 0};
v4f32 dot2 = {0, 0, 0, 0};
v4f32 dot3 = {0, 0, 0, 0};
#endif
if (n < 1) return (dot);
@ -83,6 +92,61 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
x_pref += 32;
y_pref += 32;
#if defined(DSDOT)
/* Extend single precision to double precision */
dvy0 = __msa_fexupr_d(vy0);
dvy1 = __msa_fexupr_d(vy1);
dvy2 = __msa_fexupr_d(vy2);
dvy3 = __msa_fexupr_d(vy3);
dvy4 = __msa_fexupr_d(vy4);
dvy5 = __msa_fexupr_d(vy5);
dvy6 = __msa_fexupr_d(vy6);
dvy7 = __msa_fexupr_d(vy7);
vy0 = (v4f32)__msa_fexupl_d(vy0);
vy1 = (v4f32)__msa_fexupl_d(vy1);
vy2 = (v4f32)__msa_fexupl_d(vy2);
vy3 = (v4f32)__msa_fexupl_d(vy3);
vy4 = (v4f32)__msa_fexupl_d(vy4);
vy5 = (v4f32)__msa_fexupl_d(vy5);
vy6 = (v4f32)__msa_fexupl_d(vy6);
vy7 = (v4f32)__msa_fexupl_d(vy7);
dvx0 = __msa_fexupr_d(vx0);
dvx1 = __msa_fexupr_d(vx1);
dvx2 = __msa_fexupr_d(vx2);
dvx3 = __msa_fexupr_d(vx3);
dvx4 = __msa_fexupr_d(vx4);
dvx5 = __msa_fexupr_d(vx5);
dvx6 = __msa_fexupr_d(vx6);
dvx7 = __msa_fexupr_d(vx7);
vx0 = (v4f32)__msa_fexupl_d(vx0);
vx1 = (v4f32)__msa_fexupl_d(vx1);
vx2 = (v4f32)__msa_fexupl_d(vx2);
vx3 = (v4f32)__msa_fexupl_d(vx3);
vx4 = (v4f32)__msa_fexupl_d(vx4);
vx5 = (v4f32)__msa_fexupl_d(vx5);
vx6 = (v4f32)__msa_fexupl_d(vx6);
vx7 = (v4f32)__msa_fexupl_d(vx7);
dot0 += (dvy0 * dvx0);
dot1 += (dvy1 * dvx1);
dot2 += (dvy2 * dvx2);
dot3 += (dvy3 * dvx3);
dot0 += (dvy4 * dvx4);
dot1 += (dvy5 * dvx5);
dot2 += (dvy6 * dvx6);
dot3 += (dvy7 * dvx7);
dot0 += ((v2f64)vy0 * (v2f64)vx0);
dot1 += ((v2f64)vy1 * (v2f64)vx1);
dot2 += ((v2f64)vy2 * (v2f64)vx2);
dot3 += ((v2f64)vy3 * (v2f64)vx3);
dot0 += ((v2f64)vy4 * (v2f64)vx4);
dot1 += ((v2f64)vy5 * (v2f64)vx5);
dot2 += ((v2f64)vy6 * (v2f64)vx6);
dot3 += ((v2f64)vy7 * (v2f64)vx7);
#else
dot0 += (vy0 * vx0);
dot1 += (vy1 * vx1);
dot2 += (vy2 * vx2);
@ -91,6 +155,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
dot1 += (vy5 * vx5);
dot2 += (vy6 * vx6);
dot3 += (vy7 * vx7);
#endif
}
if (n & 31)
@ -100,10 +165,41 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
LD_SP4_INC(x, 4, vx0, vx1, vx2, vx3);
LD_SP4_INC(y, 4, vy0, vy1, vy2, vy3);
#if defined(DSDOT)
dvy0 = __msa_fexupr_d(vy0);
dvy1 = __msa_fexupr_d(vy1);
dvy2 = __msa_fexupr_d(vy2);
dvy3 = __msa_fexupr_d(vy3);
vy0 = (v4f32)__msa_fexupl_d(vy0);
vy1 = (v4f32)__msa_fexupl_d(vy1);
vy2 = (v4f32)__msa_fexupl_d(vy2);
vy3 = (v4f32)__msa_fexupl_d(vy3);
dvx0 = __msa_fexupr_d(vx0);
dvx1 = __msa_fexupr_d(vx1);
dvx2 = __msa_fexupr_d(vx2);
dvx3 = __msa_fexupr_d(vx3);
vx0 = (v4f32)__msa_fexupl_d(vx0);
vx1 = (v4f32)__msa_fexupl_d(vx1);
vx2 = (v4f32)__msa_fexupl_d(vx2);
vx3 = (v4f32)__msa_fexupl_d(vx3);
dot0 += (dvy0 * dvx0);
dot1 += (dvy1 * dvx1);
dot2 += (dvy2 * dvx2);
dot3 += (dvy3 * dvx3);
dot0 += ((v2f64)vy0 * (v2f64)vx0);
dot1 += ((v2f64)vy1 * (v2f64)vx1);
dot2 += ((v2f64)vy2 * (v2f64)vx2);
dot3 += ((v2f64)vy3 * (v2f64)vx3);
#else
dot0 += (vy0 * vx0);
dot1 += (vy1 * vx1);
dot2 += (vy2 * vx2);
dot3 += (vy3 * vx3);
#endif
}
if (n & 8)
@ -111,8 +207,27 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
LD_SP2_INC(x, 4, vx0, vx1);
LD_SP2_INC(y, 4, vy0, vy1);
#if defined(DSDOT)
dvy0 = __msa_fexupr_d(vy0);
dvy1 = __msa_fexupr_d(vy1);
vy0 = (v4f32)__msa_fexupl_d(vy0);
vy1 = (v4f32)__msa_fexupl_d(vy1);
dvx0 = __msa_fexupr_d(vx0);
dvx1 = __msa_fexupr_d(vx1);
vx0 = (v4f32)__msa_fexupl_d(vx0);
vx1 = (v4f32)__msa_fexupl_d(vx1);
dot0 += (dvy0 * dvx0);
dot1 += (dvy1 * dvx1);
dot0 += ((v2f64)vy0 * (v2f64)vx0);
dot1 += ((v2f64)vy1 * (v2f64)vx1);
#else
dot0 += (vy0 * vx0);
dot1 += (vy1 * vx1);
#endif
}
if (n & 4)
@ -120,7 +235,16 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
vx0 = LD_SP(x); x += 4;
vy0 = LD_SP(y); y += 4;
#if defined(DSDOT)
dvy0 = __msa_fexupr_d(vy0);
vy0 = (v4f32)__msa_fexupl_d(vy0);
dvx0 = __msa_fexupr_d(vx0);
vx0 = (v4f32)__msa_fexupl_d(vx0);
dot0 += (dvy0 * dvx0);
dot0 += ((v2f64)vy0 * (v2f64)vx0);
#else
dot0 += (vy0 * vx0);
#endif
}
if (n & 2)
@ -128,8 +252,13 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
LD_GP2_INC(x, 1, x0, x1);
LD_GP2_INC(y, 1, y0, y1);
#if defined(DSDOT)
dot += ((double)y0 * (double)x0);
dot += ((double)y1 * (double)x1);
#else
dot += (y0 * x0);
dot += (y1 * x1);
#endif
}
if (n & 1)
@ -137,7 +266,11 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
x0 = *x;
y0 = *y;
#if defined(DSDOT)
dot += ((double)y0 * (double)x0);
#else
dot += (y0 * x0);
#endif
}
}
@ -145,8 +278,10 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
dot += dot0[0];
dot += dot0[1];
#if !defined(DSDOT)
dot += dot0[2];
dot += dot0[3];
#endif
}
else
{
@ -155,10 +290,17 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
LD_GP4_INC(x, inc_x, x0, x1, x2, x3);
LD_GP4_INC(y, inc_y, y0, y1, y2, y3);
#if defined(DSDOT)
dot += ((double)y0 * (double)x0);
dot += ((double)y1 * (double)x1);
dot += ((double)y2 * (double)x2);
dot += ((double)y3 * (double)x3);
#else
dot += (y0 * x0);
dot += (y1 * x1);
dot += (y2 * x2);
dot += (y3 * x3);
#endif
}
if (n & 2)
@ -166,8 +308,13 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
LD_GP2_INC(x, inc_x, x0, x1);
LD_GP2_INC(y, inc_y, y0, y1);
#if defined(DSDOT)
dot += ((double)y0 * (double)x0);
dot += ((double)y1 * (double)x1);
#else
dot += (y0 * x0);
dot += (y1 * x1);
#endif
}
if (n & 1)
@ -175,7 +322,11 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
x0 = *x;
y0 = *y;
#if defined(DSDOT)
dot += ((double)y0 * (double)x0);
#else
dot += (y0 * x0);
#endif
}
}

View File

@ -0,0 +1,160 @@
# Kernel source selection: each variable names the source file used for one
# kernel. Everything here points at plain C sources in ../generic or ../mips.

# GEMM beta kernels (real and complex variants) from ../generic.
SGEMM_BETA = ../generic/gemm_beta.c
DGEMM_BETA = ../generic/gemm_beta.c
CGEMM_BETA = ../generic/zgemm_beta.c
ZGEMM_BETA = ../generic/zgemm_beta.c
# TRMM kernels: generic sources named *_2x2.
STRMMKERNEL = ../generic/trmmkernel_2x2.c
DTRMMKERNEL = ../generic/trmmkernel_2x2.c
CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
# GEMM kernels, each with its ncopy/tcopy sources and the object names
# assigned to those copy routines. S and D share the real sources, C and Z
# share the z* sources.
SGEMMKERNEL = ../generic/gemmkernel_2x2.c
SGEMMONCOPY = ../generic/gemm_ncopy_2.c
SGEMMOTCOPY = ../generic/gemm_tcopy_2.c
SGEMMONCOPYOBJ = sgemm_oncopy.o
SGEMMOTCOPYOBJ = sgemm_otcopy.o
DGEMMKERNEL = ../generic/gemmkernel_2x2.c
DGEMMONCOPY = ../generic/gemm_ncopy_2.c
DGEMMOTCOPY = ../generic/gemm_tcopy_2.c
DGEMMONCOPYOBJ = dgemm_oncopy.o
DGEMMOTCOPYOBJ = dgemm_otcopy.o
CGEMMKERNEL = ../generic/zgemmkernel_2x2.c
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
CGEMMONCOPYOBJ = cgemm_oncopy.o
CGEMMOTCOPYOBJ = cgemm_otcopy.o
ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
ZGEMMONCOPYOBJ = zgemm_oncopy.o
ZGEMMOTCOPYOBJ = zgemm_otcopy.o
# TRSM kernels (LN/LT/RN/RT): all four precisions use the same generic
# trsm_kernel_* sources.
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
#Pure C for other kernels
# The kernels from here down to the GEMV entries come from ../mips; S/D use
# the real source and C/Z use the z-prefixed source of the same name.
# AMAX / AMIN
SAMAXKERNEL = ../mips/amax.c
DAMAXKERNEL = ../mips/amax.c
CAMAXKERNEL = ../mips/zamax.c
ZAMAXKERNEL = ../mips/zamax.c
SAMINKERNEL = ../mips/amin.c
DAMINKERNEL = ../mips/amin.c
CAMINKERNEL = ../mips/zamin.c
ZAMINKERNEL = ../mips/zamin.c
# MAX / MIN (real precisions only)
SMAXKERNEL = ../mips/max.c
DMAXKERNEL = ../mips/max.c
SMINKERNEL = ../mips/min.c
DMINKERNEL = ../mips/min.c
# IAMAX / IAMIN
ISAMAXKERNEL = ../mips/iamax.c
IDAMAXKERNEL = ../mips/iamax.c
ICAMAXKERNEL = ../mips/izamax.c
IZAMAXKERNEL = ../mips/izamax.c
ISAMINKERNEL = ../mips/iamin.c
IDAMINKERNEL = ../mips/iamin.c
ICAMINKERNEL = ../mips/izamin.c
IZAMINKERNEL = ../mips/izamin.c
# IMAX / IMIN (real precisions only)
ISMAXKERNEL = ../mips/imax.c
IDMAXKERNEL = ../mips/imax.c
ISMINKERNEL = ../mips/imin.c
IDMINKERNEL = ../mips/imin.c
# ASUM / SUM
SASUMKERNEL = ../mips/asum.c
DASUMKERNEL = ../mips/asum.c
CASUMKERNEL = ../mips/zasum.c
ZASUMKERNEL = ../mips/zasum.c
SSUMKERNEL = ../mips/sum.c
DSUMKERNEL = ../mips/sum.c
CSUMKERNEL = ../mips/zsum.c
ZSUMKERNEL = ../mips/zsum.c
# AXPY
SAXPYKERNEL = ../mips/axpy.c
DAXPYKERNEL = ../mips/axpy.c
CAXPYKERNEL = ../mips/zaxpy.c
ZAXPYKERNEL = ../mips/zaxpy.c
# COPY
SCOPYKERNEL = ../mips/copy.c
DCOPYKERNEL = ../mips/copy.c
CCOPYKERNEL = ../mips/zcopy.c
ZCOPYKERNEL = ../mips/zcopy.c
# DOT
SDOTKERNEL = ../mips/dot.c
DDOTKERNEL = ../mips/dot.c
CDOTKERNEL = ../mips/zdot.c
ZDOTKERNEL = ../mips/zdot.c
# NRM2
SNRM2KERNEL = ../mips/nrm2.c
DNRM2KERNEL = ../mips/nrm2.c
CNRM2KERNEL = ../mips/znrm2.c
ZNRM2KERNEL = ../mips/znrm2.c
# ROT
SROTKERNEL = ../mips/rot.c
DROTKERNEL = ../mips/rot.c
CROTKERNEL = ../mips/zrot.c
ZROTKERNEL = ../mips/zrot.c
# SCAL
SSCALKERNEL = ../mips/scal.c
DSCALKERNEL = ../mips/scal.c
CSCALKERNEL = ../mips/zscal.c
ZSCALKERNEL = ../mips/zscal.c
# SWAP
SSWAPKERNEL = ../mips/swap.c
DSWAPKERNEL = ../mips/swap.c
CSWAPKERNEL = ../mips/zswap.c
ZSWAPKERNEL = ../mips/zswap.c
# GEMV (N and T variants)
SGEMVNKERNEL = ../mips/gemv_n.c
DGEMVNKERNEL = ../mips/gemv_n.c
CGEMVNKERNEL = ../mips/zgemv_n.c
ZGEMVNKERNEL = ../mips/zgemv_n.c
SGEMVTKERNEL = ../mips/gemv_t.c
DGEMVTKERNEL = ../mips/gemv_t.c
CGEMVTKERNEL = ../mips/zgemv_t.c
ZGEMVTKERNEL = ../mips/zgemv_t.c
# SYMV (U and L variants) from ../generic; S/D/Q use symv_k.c and C/Z/X use
# zsymv_k.c.
SSYMV_U_KERNEL = ../generic/symv_k.c
SSYMV_L_KERNEL = ../generic/symv_k.c
DSYMV_U_KERNEL = ../generic/symv_k.c
DSYMV_L_KERNEL = ../generic/symv_k.c
QSYMV_U_KERNEL = ../generic/symv_k.c
QSYMV_L_KERNEL = ../generic/symv_k.c
CSYMV_U_KERNEL = ../generic/zsymv_k.c
CSYMV_L_KERNEL = ../generic/zsymv_k.c
ZSYMV_U_KERNEL = ../generic/zsymv_k.c
ZSYMV_L_KERNEL = ../generic/zsymv_k.c
XSYMV_U_KERNEL = ../generic/zsymv_k.c
XSYMV_L_KERNEL = ../generic/zsymv_k.c
# HEMV: only the Z variants are assigned in this file.
ZHEMV_U_KERNEL = ../generic/zhemv_k.c
ZHEMV_L_KERNEL = ../generic/zhemv_k.c
# GEMM3M kernels from ../generic.
CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c
ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c

View File

@ -90,7 +90,7 @@
//Init INF
lui TEMP, 0x7FF0
dsll TEMP, TEMP, 32
MTC1 TEMP, INF
MTC TEMP, INF
LD a1, 0 * SIZE(X)
daddiu N, N, -1

View File

@ -52,18 +52,18 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict a, BLASLONG lda, FLOAT * __
FLOAT ctemp05, ctemp06, ctemp07, ctemp08;
FLOAT ctemp09, ctemp10, ctemp11, ctemp12;
FLOAT ctemp13, ctemp14, ctemp15, ctemp16;
FLOAT ctemp17, ctemp18, ctemp19, ctemp20;
FLOAT ctemp21, ctemp22, ctemp23, ctemp24;
FLOAT ctemp25, ctemp26, ctemp27, ctemp28;
FLOAT ctemp29, ctemp30, ctemp31, ctemp32;
FLOAT ctemp33, ctemp34, ctemp35, ctemp36;
FLOAT ctemp37, ctemp38, ctemp39, ctemp40;
FLOAT ctemp41, ctemp42, ctemp43, ctemp44;
FLOAT ctemp45, ctemp46, ctemp47, ctemp48;
FLOAT ctemp49, ctemp50, ctemp51, ctemp52;
FLOAT ctemp53, ctemp54, ctemp55, ctemp56;
FLOAT ctemp57, ctemp58, ctemp59, ctemp60;
FLOAT ctemp61, ctemp62, ctemp63, ctemp64;
FLOAT ctemp17 /*, ctemp18, ctemp19, ctemp20*/ ;
FLOAT /*ctemp21, ctemp22,*/ ctemp23, ctemp24;
FLOAT ctemp25 /*, ctemp26, ctemp27, ctemp28*/ ;
FLOAT /*ctemp29, ctemp30,*/ ctemp31, ctemp32;
FLOAT ctemp33 /*, ctemp34, ctemp35, ctemp36*/ ;
FLOAT /*ctemp37, ctemp38,*/ ctemp39, ctemp40;
FLOAT ctemp41 /*, ctemp42, ctemp43, ctemp44*/ ;
FLOAT /*ctemp45, ctemp46,*/ ctemp47, ctemp48;
FLOAT ctemp49 /*, ctemp50, ctemp51, ctemp52*/ ;
FLOAT /*ctemp53, ctemp54,*/ ctemp55, ctemp56;
FLOAT ctemp57 /*, ctemp58, ctemp59, ctemp60*/ ;
FLOAT /*ctemp61, ctemp62,*/ ctemp63, ctemp64;
aoffset = a;

View File

@ -142,7 +142,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
,"xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15");\
}
int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG ldb){
float *src, *dst, *dst_tmp, *src_base, *dst_base;
float *src, *dst, *dst_tmp=0, *src_base, *dst_base;
uint64_t src_ld_bytes = (uint64_t)lda * sizeof(float), dst_ld_bytes = (uint64_t)ldb * sizeof(float), num_rows = 0;
BLASLONG cols_left, rows_done; float ALPHA = alpha;
if(ALPHA==0.0){

View File

@ -796,10 +796,10 @@ L10:
temp = log((real) (*n)) / log(2.f);
lgn = (integer) temp;
if (pow_ii(&c__2, &lgn) < *n) {
if (pow_ii(c__2, lgn) < *n) {
++lgn;
}
if (pow_ii(&c__2, &lgn) < *n) {
if (pow_ii(c__2, lgn) < *n) {
++lgn;
}
iprmpt = indxq + *n + 1;

View File

@ -864,11 +864,11 @@ f"> */
/* Form the z-vector which consists of the last row of Q_1 and the */
/* first row of Q_2. */
ptr = pow_ii(&c__2, tlvls) + 1;
ptr = pow_ii(c__2, *tlvls) + 1;
i__1 = *curlvl - 1;
for (i__ = 1; i__ <= i__1; ++i__) {
i__2 = *tlvls - i__;
ptr += pow_ii(&c__2, &i__2);
ptr += pow_ii(c__2, i__2);
/* L10: */
}
curr = ptr + *curpbm;

View File

@ -1051,7 +1051,7 @@ f"> */
/* Finally go through the left singular vector matrices of all */
/* the other subproblems bottom-up on the tree. */
j = pow_ii(&c__2, &nlvl);
j = pow_ii(c__2, nlvl);
sqre = 0;
for (lvl = nlvl; lvl >= 1; --lvl) {
@ -1065,7 +1065,7 @@ f"> */
ll = 1;
} else {
i__1 = lvl - 1;
lf = pow_ii(&c__2, &i__1);
lf = pow_ii(c__2, i__1);
ll = (lf << 1) - 1;
}
i__1 = ll;
@ -1110,7 +1110,7 @@ L170:
ll = 1;
} else {
i__2 = lvl - 1;
lf = pow_ii(&c__2, &i__2);
lf = pow_ii(c__2, i__2);
ll = (lf << 1) - 1;
}
i__2 = lf;

View File

@ -836,10 +836,10 @@ f"> */
lrwmin = *n - 1 << 1;
} else if (icompz == 1) {
lgn = (integer) (log((real) (*n)) / log(2.f));
if (pow_ii(&c__2, &lgn) < *n) {
if (pow_ii(c__2, lgn) < *n) {
++lgn;
}
if (pow_ii(&c__2, &lgn) < *n) {
if (pow_ii(c__2, lgn) < *n) {
++lgn;
}
lwmin = *n * *n;

View File

@ -827,10 +827,10 @@ L10:
temp = log((doublereal) (*n)) / log(2.);
lgn = (integer) temp;
if (pow_ii(&c__2, &lgn) < *n) {
if (pow_ii(c__2, lgn) < *n) {
++lgn;
}
if (pow_ii(&c__2, &lgn) < *n) {
if (pow_ii(c__2, lgn) < *n) {
++lgn;
}
iprmpt = indxq + *n + 1;

View File

@ -885,11 +885,11 @@ f"> */
/* Form the z-vector which consists of the last row of Q_1 and the */
/* first row of Q_2. */
ptr = pow_ii(&c__2, tlvls) + 1;
ptr = pow_ii(c__2, *tlvls) + 1;
i__1 = *curlvl - 1;
for (i__ = 1; i__ <= i__1; ++i__) {
i__2 = *tlvls - i__;
ptr += pow_ii(&c__2, &i__2);
ptr += pow_ii(c__2, i__2);
/* L10: */
}
curr = ptr + *curpbm;

View File

@ -754,7 +754,7 @@ f"> */
/* scheme */
i__1 = *curlvl - 1;
curr = ptr + *curpbm * pow_ii(&c__2, curlvl) + pow_ii(&c__2, &i__1) - 1;
curr = ptr + *curpbm * pow_ii(c__2, *curlvl) + pow_ii(c__2, i__1) - 1;
/* Determine size of these matrices. We add HALF to the value of */
/* the SQRT in case the machine underestimates one of these square */
@ -781,12 +781,12 @@ f"> */
/* rotations and permutation and then multiplying the center matrices */
/* against the current Z. */
ptr = pow_ii(&c__2, tlvls) + 1;
ptr = pow_ii(c__2, *tlvls) + 1;
i__1 = *curlvl - 1;
for (k = 1; k <= i__1; ++k) {
i__2 = *curlvl - k;
i__3 = *curlvl - k - 1;
curr = ptr + *curpbm * pow_ii(&c__2, &i__2) + pow_ii(&c__2, &i__3) -
curr = ptr + *curpbm * pow_ii(c__2, i__2) + pow_ii(c__2, i__3) -
1;
psiz1 = prmptr[curr + 1] - prmptr[curr];
psiz2 = prmptr[curr + 2] - prmptr[curr + 1];
@ -847,7 +847,7 @@ f"> */
c__1);
i__2 = *tlvls - k;
ptr += pow_ii(&c__2, &i__2);
ptr += pow_ii(c__2, i__2);
/* L70: */
}

View File

@ -951,7 +951,7 @@ f"> */
/* Finally go through the left singular vector matrices of all */
/* the other subproblems bottom-up on the tree. */
j = pow_ii(&c__2, &nlvl);
j = pow_ii(c__2, nlvl);
sqre = 0;
for (lvl = nlvl; lvl >= 1; --lvl) {
@ -965,7 +965,7 @@ f"> */
ll = 1;
} else {
i__1 = lvl - 1;
lf = pow_ii(&c__2, &i__1);
lf = pow_ii(c__2, i__1);
ll = (lf << 1) - 1;
}
i__1 = ll;
@ -1010,7 +1010,7 @@ L50:
ll = 1;
} else {
i__2 = lvl - 1;
lf = pow_ii(&c__2, &i__2);
lf = pow_ii(c__2, i__2);
ll = (lf << 1) - 1;
}
i__2 = lf;

View File

@ -824,7 +824,7 @@ f"> */
ll = 1;
} else {
i__1 = lvl - 1;
lf = pow_ii(&c__2, &i__1);
lf = pow_ii(c__2, i__1);
ll = (lf << 1) - 1;
}
i__1 = ll;

View File

@ -1027,7 +1027,7 @@ f"> */
/* Now conquer each subproblem bottom-up. */
j = pow_ii(&c__2, &nlvl);
j = pow_ii(c__2, nlvl);
for (lvl = nlvl; lvl >= 1; --lvl) {
lvl2 = (lvl << 1) - 1;
@ -1039,7 +1039,7 @@ f"> */
ll = 1;
} else {
i__1 = lvl - 1;
lf = pow_ii(&c__2, &i__1);
lf = pow_ii(c__2, i__1);
ll = (lf << 1) - 1;
}
i__1 = ll;

View File

@ -806,10 +806,10 @@ f"> */
lwmin = *n - 1 << 1;
} else {
lgn = (integer) (log((doublereal) (*n)) / log(2.));
if (pow_ii(&c__2, &lgn) < *n) {
if (pow_ii(c__2, lgn) < *n) {
++lgn;
}
if (pow_ii(&c__2, &lgn) < *n) {
if (pow_ii(c__2, lgn) < *n) {
++lgn;
}
if (icompz == 1) {

View File

@ -823,10 +823,10 @@ L10:
temp = log((real) (*n)) / log(2.f);
lgn = (integer) temp;
if (pow_ii(&c__2, &lgn) < *n) {
if (pow_ii(c__2, lgn) < *n) {
++lgn;
}
if (pow_ii(&c__2, &lgn) < *n) {
if (pow_ii(c__2, lgn) < *n) {
++lgn;
}
iprmpt = indxq + *n + 1;

View File

@ -883,11 +883,11 @@ f"> */
/* Form the z-vector which consists of the last row of Q_1 and the */
/* first row of Q_2. */
ptr = pow_ii(&c__2, tlvls) + 1;
ptr = pow_ii(c__2, *tlvls) + 1;
i__1 = *curlvl - 1;
for (i__ = 1; i__ <= i__1; ++i__) {
i__2 = *tlvls - i__;
ptr += pow_ii(&c__2, &i__2);
ptr += pow_ii(c__2, i__2);
/* L10: */
}
curr = ptr + *curpbm;

View File

@ -753,7 +753,7 @@ f"> */
/* scheme */
i__1 = *curlvl - 1;
curr = ptr + *curpbm * pow_ii(&c__2, curlvl) + pow_ii(&c__2, &i__1) - 1;
curr = ptr + *curpbm * pow_ii(c__2, *curlvl) + pow_ii(c__2, i__1) - 1;
/* Determine size of these matrices. We add HALF to the value of */
/* the SQRT in case the machine underestimates one of these square */
@ -779,12 +779,12 @@ f"> */
/* rotations and permutation and then multiplying the center matrices */
/* against the current Z. */
ptr = pow_ii(&c__2, tlvls) + 1;
ptr = pow_ii(c__2, *tlvls) + 1;
i__1 = *curlvl - 1;
for (k = 1; k <= i__1; ++k) {
i__2 = *curlvl - k;
i__3 = *curlvl - k - 1;
curr = ptr + *curpbm * pow_ii(&c__2, &i__2) + pow_ii(&c__2, &i__3) -
curr = ptr + *curpbm * pow_ii(c__2, i__2) + pow_ii(c__2, i__3) -
1;
psiz1 = prmptr[curr + 1] - prmptr[curr];
psiz2 = prmptr[curr + 2] - prmptr[curr + 1];
@ -844,7 +844,7 @@ f"> */
c__1);
i__2 = *tlvls - k;
ptr += pow_ii(&c__2, &i__2);
ptr += pow_ii(c__2, i__2);
/* L70: */
}

View File

@ -946,7 +946,7 @@ f"> */
/* Finally go through the left singular vector matrices of all */
/* the other subproblems bottom-up on the tree. */
j = pow_ii(&c__2, &nlvl);
j = pow_ii(c__2, nlvl);
sqre = 0;
for (lvl = nlvl; lvl >= 1; --lvl) {
@ -960,7 +960,7 @@ f"> */
ll = 1;
} else {
i__1 = lvl - 1;
lf = pow_ii(&c__2, &i__1);
lf = pow_ii(c__2, i__1);
ll = (lf << 1) - 1;
}
i__1 = ll;
@ -1005,7 +1005,7 @@ L50:
ll = 1;
} else {
i__2 = lvl - 1;
lf = pow_ii(&c__2, &i__2);
lf = pow_ii(c__2, i__2);
ll = (lf << 1) - 1;
}
i__2 = lf;

View File

@ -821,7 +821,7 @@ f"> */
ll = 1;
} else {
i__1 = lvl - 1;
lf = pow_ii(&c__2, &i__1);
lf = pow_ii(c__2, i__1);
ll = (lf << 1) - 1;
}
i__1 = ll;

View File

@ -1023,7 +1023,7 @@ f"> */
/* Now conquer each subproblem bottom-up. */
j = pow_ii(&c__2, &nlvl);
j = pow_ii(c__2, nlvl);
for (lvl = nlvl; lvl >= 1; --lvl) {
lvl2 = (lvl << 1) - 1;
@ -1035,7 +1035,7 @@ f"> */
ll = 1;
} else {
i__1 = lvl - 1;
lf = pow_ii(&c__2, &i__1);
lf = pow_ii(c__2, i__1);
ll = (lf << 1) - 1;
}
i__1 = ll;

View File

@ -804,10 +804,10 @@ f"> */
lwmin = *n - 1 << 1;
} else {
lgn = (integer) (log((real) (*n)) / log(2.f));
if (pow_ii(&c__2, &lgn) < *n) {
if (pow_ii(c__2, lgn) < *n) {
++lgn;
}
if (pow_ii(&c__2, &lgn) < *n) {
if (pow_ii(c__2, lgn) < *n) {
++lgn;
}
if (icompz == 1) {

View File

@ -793,10 +793,10 @@ L10:
temp = log((doublereal) (*n)) / log(2.);
lgn = (integer) temp;
if (pow_ii(&c__2, &lgn) < *n) {
if (pow_ii(c__2, lgn) < *n) {
++lgn;
}
if (pow_ii(&c__2, &lgn) < *n) {
if (pow_ii(c__2, lgn) < *n) {
++lgn;
}
iprmpt = indxq + *n + 1;

View File

@ -864,11 +864,11 @@ f"> */
/* Form the z-vector which consists of the last row of Q_1 and the */
/* first row of Q_2. */
ptr = pow_ii(&c__2, tlvls) + 1;
ptr = pow_ii(c__2, *tlvls) + 1;
i__1 = *curlvl - 1;
for (i__ = 1; i__ <= i__1; ++i__) {
i__2 = *tlvls - i__;
ptr += pow_ii(&c__2, &i__2);
ptr += pow_ii(c__2, i__2);
/* L10: */
}
curr = ptr + *curpbm;

View File

@ -1051,7 +1051,7 @@ f"> */
/* Finally go through the left singular vector matrices of all */
/* the other subproblems bottom-up on the tree. */
j = pow_ii(&c__2, &nlvl);
j = pow_ii(c__2, nlvl);
sqre = 0;
for (lvl = nlvl; lvl >= 1; --lvl) {
@ -1065,7 +1065,7 @@ f"> */
ll = 1;
} else {
i__1 = lvl - 1;
lf = pow_ii(&c__2, &i__1);
lf = pow_ii(c__2, i__1);
ll = (lf << 1) - 1;
}
i__1 = ll;
@ -1110,7 +1110,7 @@ L170:
ll = 1;
} else {
i__2 = lvl - 1;
lf = pow_ii(&c__2, &i__2);
lf = pow_ii(c__2, i__2);
ll = (lf << 1) - 1;
}
i__2 = lf;

View File

@ -836,10 +836,10 @@ f"> */
lrwmin = *n - 1 << 1;
} else if (icompz == 1) {
lgn = (integer) (log((doublereal) (*n)) / log(2.));
if (pow_ii(&c__2, &lgn) < *n) {
if (pow_ii(c__2, lgn) < *n) {
++lgn;
}
if (pow_ii(&c__2, &lgn) < *n) {
if (pow_ii(c__2, lgn) < *n) {
++lgn;
}
lwmin = *n * *n;