diff --git a/kernel/mips64/KERNEL.LOONGSON3A b/kernel/mips64/KERNEL.LOONGSON3A
index 91f2e7dd1..fc247e473 100644
--- a/kernel/mips64/KERNEL.LOONGSON3A
+++ b/kernel/mips64/KERNEL.LOONGSON3A
@@ -1,6 +1,16 @@
 SAXPYKERNEL=axpy_loongson3a.S
 DAXPYKERNEL=daxpy_loongson3a_simd.S
 
+SGEMVNKERNEL = gemv_n_loongson3a.c
+SGEMVTKERNEL = gemv_t_loongson3a.c
+DGEMVNKERNEL = gemv_n_loongson3a.c
+DGEMVTKERNEL = gemv_t_loongson3a.c
+CGEMVNKERNEL = zgemv_n_loongson3a.c
+CGEMVTKERNEL = zgemv_t_loongson3a.c
+ZGEMVNKERNEL = zgemv_n_loongson3a.c
+ZGEMVTKERNEL = zgemv_t_loongson3a.c
+
+
 SGEMMKERNEL = sgemm_kernel_8x4_ps.S
 SGEMMINCOPY = ../generic/gemm_ncopy_8.c
 SGEMMITCOPY = ../generic/gemm_tcopy_8.c
diff --git a/kernel/mips64/gemv_n_loongson3a.c b/kernel/mips64/gemv_n_loongson3a.c
new file mode 100644
index 000000000..bb27379f5
--- /dev/null
+++ b/kernel/mips64/gemv_n_loongson3a.c
@@ -0,0 +1,100 @@
+#include "common.h"
+
+//These are auto-tuning codes on Loongson-3A platform.
+//Real GEMV without transposition: for every column j < N and row i < M,
+//Y[i * INCY] += ALPHA * A[LDA * j + i] * X[j * INCX].
+
+//prefetch() emits an `ld` whose destination is register $0.
+#define prefetch(x) __asm__ __volatile__("ld $0, %0"::"m"(x))
+#define likely(x) __builtin_expect(!!(x), 1)
+#define unlikely(x) __builtin_expect(!!(x), 0)
+
+//One element update each. "packed" steps index Y by the row counter (INCY == 1),
+//"strided" steps walk Y with their own cursor; "unit" steps omit the ALPHA multiply.
+#define unit_packed do {Y[row] += A[base + row] * X[xpos]; row++;} while(0)
+#define scaled_packed do {Y[row] += ALPHA * A[base + row] * X[xpos]; row++;} while(0)
+#define unit_strided do {Y[ypos] += A[base + row] * X[xpos]; row++; ypos += INCY;} while(0)
+#define scaled_strided do {Y[ypos] += ALPHA * A[base + row] * X[xpos]; row++; ypos += INCY;} while(0)
+
+int CNAME(BLASLONG M, BLASLONG N, BLASLONG UNUSED, FLOAT ALPHA, FLOAT *A, BLASLONG LDA, FLOAT *X, BLASLONG INCX, FLOAT *Y, BLASLONG INCY, FLOAT *BUFFER)
+{
+    //A zero ALPHA leaves Y untouched.
+    if(ALPHA == 0)
+        return 0;
+
+    //NOTE(review): only the magnitude of each stride is kept - confirm callers expect that.
+    INCX = (INCX < 0) ? -INCX : INCX;
+    INCY = (INCY < 0) ? -INCY : INCY;
+
+    const BLASLONG lookahead = 30;          //prefetch distance, in elements
+    const BLASLONG unroll = 4;              //updates per unrolled iteration
+    const BLASLONG bulk = M - M % unroll;   //rows covered by the unrolled loop
+    BLASLONG col, xpos;
+
+    if(ALPHA == 1) {
+        if(INCY == 1) {
+            for(col = 0, xpos = 0; likely(col < N); col++, xpos += INCX) {
+                const BLASLONG base = LDA * col;
+                BLASLONG row = 0;
+                while(likely(row < bulk)) {
+                    prefetch(A[base + row + lookahead]);
+                    prefetch(Y[row + lookahead]);
+                    /*loop_mark*/ unit_packed;
+                    /*loop_mark*/ unit_packed;
+                    /*loop_mark*/ unit_packed;
+                    /*loop_mark*/ unit_packed;
+                }
+                while(likely(row < M))
+                    unit_packed;
+            }
+        } else {
+            for(col = 0, xpos = 0; likely(col < N); col++, xpos += INCX) {
+                const BLASLONG base = LDA * col;
+                BLASLONG row = 0, ypos = 0;
+                while(likely(row < bulk)) {
+                    prefetch(A[base + row + lookahead]);
+                    prefetch(Y[ypos + lookahead]);
+                    /*loop_mark*/ unit_strided;
+                    /*loop_mark*/ unit_strided;
+                    /*loop_mark*/ unit_strided;
+                    /*loop_mark*/ unit_strided;
+                }
+                while(likely(row < M))
+                    unit_strided;
+            }
+        }
+    } else {
+        if(INCY == 1) {
+            for(col = 0, xpos = 0; likely(col < N); col++, xpos += INCX) {
+                const BLASLONG base = LDA * col;
+                BLASLONG row = 0;
+                while(likely(row < bulk)) {
+                    prefetch(A[base + row + lookahead]);
+                    prefetch(Y[row + lookahead]);
+                    /*loop_mark*/ scaled_packed;
+                    /*loop_mark*/ scaled_packed;
+                    /*loop_mark*/ scaled_packed;
+                    /*loop_mark*/ scaled_packed;
+                }
+                while(likely(row < M))
+                    scaled_packed;
+            }
+        } else {
+            for(col = 0, xpos = 0; likely(col < N); col++, xpos += INCX) {
+                const BLASLONG base = LDA * col;
+                BLASLONG row = 0, ypos = 0;
+                while(likely(row < bulk)) {
+                    prefetch(A[base + row + lookahead]);
+                    prefetch(Y[ypos + lookahead]);
+                    /*loop_mark*/ scaled_strided;
+                    /*loop_mark*/ scaled_strided;
+                    /*loop_mark*/ scaled_strided;
+                    /*loop_mark*/ scaled_strided;
+                }
+                while(likely(row < M))
+                    scaled_strided;
+            }
+        }
+    }
+    return 0;
+}
diff --git a/kernel/mips64/gemv_t_loongson3a.c b/kernel/mips64/gemv_t_loongson3a.c
new file mode 100644
index 000000000..5c6c8389e
--- /dev/null
+++ b/kernel/mips64/gemv_t_loongson3a.c
@@ -0,0 +1,93 @@
+#include "common.h"
+
+//These are auto-tuning codes on Loongson-3A platform.
+
+//Real GEMV with transposition: Y[j * INCY] += ALPHA * A[LDA * j + i] * X[i * INCX] for all j < N, i < M.
+//spec_* steps assume INCX == 1; norm_* steps walk X with cursor h; *_alpha1 steps omit the ALPHA multiply.
+#define prefetch(x) __asm__ __volatile__("ld $0, %0"::"m"(x))
+#define likely(x) __builtin_expect(!!(x), 1)
+#define unlikely(x) __builtin_expect(!!(x), 0)
+
+#define spec_loop_alpha1 do {Y[k] += A[LDA * j + i] * X[i]; i++;} while(0)
+#define spec_loop do {Y[k] += ALPHA * A[LDA * j + i] * X[i]; i++;} while(0)
+#define norm_loop_alpha1 do {Y[k] += A[LDA * j + i] * X[h]; i++; h += INCX;} while(0)
+#define norm_loop do {Y[k] += ALPHA * A[LDA * j + i] * X[h]; i++; h += INCX;} while(0)
+
+int CNAME(BLASLONG M, BLASLONG N, BLASLONG UNUSED, FLOAT ALPHA, FLOAT *A, BLASLONG LDA, FLOAT *X, BLASLONG INCX, FLOAT *Y, BLASLONG INCY, FLOAT *BUFFER) {
+    //ALPHA == 0 adds nothing to Y.
+    if(!ALPHA)
+        return 0;
+    //Only the magnitude of each stride is used below.
+    if(INCX < 0)
+        INCX = -INCX;
+    if(INCY < 0)
+        INCY = -INCY;
+    //fahead: prefetch distance in elements; tMQ: rows covered by the 3-way unrolled loop.
+    BLASLONG fahead = 30;
+    BLASLONG spec_unroll = 3;
+    BLASLONG tMQ = M - M % spec_unroll;
+    BLASLONG j = 0, k = 0;
+
+    if(ALPHA == 1) {
+        if(INCX == 1) {
+            for(; likely(j < N); j++, k += INCY) {
+                BLASLONG i = 0;
+                for(; likely(i < tMQ);) {
+                    prefetch(A[LDA * j + i + fahead]);
+                    prefetch(X[i + fahead]);
+                    /*loop_mark*/ spec_loop_alpha1;
+                    /*loop_mark*/ spec_loop_alpha1;
+                    /*loop_mark*/ spec_loop_alpha1;
+                }
+                for(; likely(i < M);) {
+                    spec_loop_alpha1;
+                }
+            }
+        } else {
+            for(; likely(j < N); j++, k += INCY) {
+                BLASLONG i = 0, h = 0;
+                for(; likely(i < tMQ);) {
+                    prefetch(A[LDA * j + i + fahead]);
+                    prefetch(X[h + fahead]);
+                    /*loop_mark*/ norm_loop_alpha1;
+                    /*loop_mark*/ norm_loop_alpha1;
+                    /*loop_mark*/ norm_loop_alpha1;
+                }
+                for(; likely(i < M);) {
+                    norm_loop_alpha1;
+                }
+            }
+        }
+    } else {
+        if(INCX == 1) {
+            for(; likely(j < N); j++, k += INCY) {
+                BLASLONG i = 0;
+                for(; likely(i < tMQ);) {
+                    prefetch(A[LDA * j + i + fahead]);
+                    prefetch(X[i + fahead]);
+                    /*loop_mark*/ spec_loop;
+                    /*loop_mark*/ spec_loop;
+                    /*loop_mark*/ spec_loop;
+                }
+                for(; likely(i < M);) {
+                    spec_loop;
+                }
+            }
+        } else {
+            for(; likely(j < N); j++, k += INCY) {
+                BLASLONG i = 0, h = 0;
+                for(; likely(i < tMQ);) {
+                    prefetch(A[LDA * j + i + fahead]);
+                    prefetch(X[h + fahead]);
+                    /*loop_mark*/ norm_loop;
+                    /*loop_mark*/ norm_loop;
+                    /*loop_mark*/ norm_loop;
+                }
+                for(; likely(i < M);) {
+                    norm_loop;
+                }
+            }
+        }
+    }
+    return 0;
+}
diff --git a/kernel/mips64/zgemv_n_loongson3a.c b/kernel/mips64/zgemv_n_loongson3a.c
new file mode 100644
index 000000000..f8275c371
--- /dev/null
+++ b/kernel/mips64/zgemv_n_loongson3a.c
@@ -0,0 +1,92 @@
+#include "common.h"
+
+//These are auto-tuning codes on Loongson-3A platform.
+//Complex GEMV without transposition on interleaved (re, im) data: for all j < N, i < M,
+//y[i] += alpha * a[i][j] * x[j], where alpha has real part rALPHA and imaginary part iALPHA.
+
+#define prefetch(x) __asm__ __volatile__("ld $0, %0"::"m"(x))
+#define likely(x) __builtin_expect(!!(x), 1)
+#define unlikely(x) __builtin_expect(!!(x), 0)
+
+//Each step handles one complex element (A[jj + ii], A[jj + ii + 1]) and advances ii by 2; norm_* also advance iii.
+#define spec_loop_alpha1 do {Y[ii] += A[jj + ii] * X[k]; Y[ii + 1] += A[jj + ii + 1] * X[k]; Y[ii + 1] += A[jj + ii] * X[k + 1]; Y[ii] -= A[jj + ii + 1] * X[k + 1]; ii += 2;} while(0)
+#define spec_loop do {rTmp = A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; iTmp = A[jj + ii] * X[k + 1] + A[jj + ii + 1] * X[k]; Y[ii] += rTmp * rALPHA - iTmp * iALPHA; Y[ii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0)
+#define norm_loop_alpha1 do {Y[iii] += A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; Y[iii + 1] += A[jj + ii] * X[k + 1] + A[jj + ii + 1] * X[k]; ii += 2; iii += INCY * 2;} while(0)
+#define norm_loop do {rTmp = A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; iTmp = A[jj + ii] * X[k + 1] + A[jj + ii + 1] * X[k]; Y[iii] += rTmp * rALPHA - iTmp * iALPHA; Y[iii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCY * 2;} while(0)
+
+int CNAME(BLASLONG M, BLASLONG N, BLASLONG UNUSED, FLOAT rALPHA, FLOAT iALPHA, FLOAT *A, BLASLONG LDA, FLOAT *X, BLASLONG INCX, FLOAT *Y, BLASLONG INCY, FLOAT *BUFFER) {
+    //Skip the work only when alpha is exactly zero, i.e. BOTH parts are zero (a purely imaginary alpha must still be applied).
+    if(!rALPHA && !iALPHA)
+        return 0;
+
+    if(INCX < 0)
+        INCX = -INCX;
+    if(INCY < 0)
+        INCY = -INCY;
+
+    BLASLONG fahead = 60;
+    BLASLONG spec_unroll = 2;
+    BLASLONG tMQ = M - M % spec_unroll;
+    BLASLONG j = 0, k = 0, jj=0;
+
+    //alpha == 1 + 0i takes the paths that omit the alpha multiply.
+    if(rALPHA == 1 && iALPHA == 0) {
+        if(INCY == 1) {
+            for(; likely(j < N); j++, k += INCX * 2, jj += LDA * 2) {
+                BLASLONG i = 0, ii = 0;
+                for(; likely(i < tMQ); i += spec_unroll) {
+                    prefetch(A[jj + ii + fahead]);
+                    prefetch(Y[ii + fahead]);
+                    /*loop_mark*/ spec_loop_alpha1;
+                    /*loop_mark*/ spec_loop_alpha1;
+                }
+                for(; likely(i < M); i++) {
+                    spec_loop_alpha1;
+                }
+            }
+        } else {
+            for(; likely(j < N); j++, k += INCX * 2, jj += LDA * 2) {
+                BLASLONG i = 0, ii = 0, iii = 0;
+                for(; likely(i < tMQ); i += spec_unroll) {
+                    prefetch(A[jj + ii + fahead]);
+                    prefetch(Y[iii + fahead]);
+                    /*loop_mark*/ norm_loop_alpha1;
+                    /*loop_mark*/ norm_loop_alpha1;
+                }
+                for(; likely(i < M); i++) {
+                    norm_loop_alpha1;
+                }
+            }
+        }
+    } else {
+        FLOAT rTmp, iTmp;
+        if(INCY == 1) {
+            for(; likely(j < N); j++, k += INCX * 2, jj += LDA * 2) {
+                BLASLONG i = 0, ii = 0;
+                for(; likely(i < tMQ); i += spec_unroll) {
+                    prefetch(A[jj + ii + fahead]);
+                    prefetch(Y[ii + fahead]);
+                    /*loop_mark*/ spec_loop;
+                    /*loop_mark*/ spec_loop;
+                }
+                for(; likely(i < M); i++) {
+                    spec_loop;
+                }
+            }
+        } else {
+            for(; likely(j < N); j++, k += INCX * 2, jj += LDA * 2) {
+                BLASLONG i = 0, ii = 0, iii = 0;
+                for(; likely(i < tMQ); i += spec_unroll) {
+                    prefetch(A[jj + ii + fahead]);
+                    prefetch(Y[iii + fahead]);
+                    /*loop_mark*/ norm_loop;
+                    /*loop_mark*/ norm_loop;
+                }
+                for(; likely(i < M); i++) {
+                    norm_loop;
+                }
+            }
+        }
+    }
+    return 0;
+}
diff --git a/kernel/mips64/zgemv_t_loongson3a.c b/kernel/mips64/zgemv_t_loongson3a.c
new file mode 100644
index 000000000..4b2c2b6b5
--- /dev/null
+++ b/kernel/mips64/zgemv_t_loongson3a.c
@@ -0,0 +1,91 @@
+#include "common.h"
+
+//These are auto-tuning codes on Loongson-3A platform.
+//Complex GEMV with transposition on interleaved (re, im) data: for all j < N, i < M,
+//y[j] += alpha * a[i][j] * x[i], where alpha has real part rALPHA and imaginary part iALPHA.
+#define prefetch(x) __asm__ __volatile__("ld $0, %0"::"m"(x))
+#define likely(x) __builtin_expect(!!(x), 1)
+#define unlikely(x) __builtin_expect(!!(x), 0)
+
+//Each step handles one complex element (A[jj + ii], A[jj + ii + 1]) and advances ii by 2; norm_* also advance iii.
+#define spec_loop_alpha1 do {Y[k] += A[jj + ii] * X[ii]; Y[k + 1] += A[jj + ii + 1] * X[ii]; Y[k + 1] += A[jj + ii] * X[ii + 1]; Y[k] -= A[jj + ii + 1] * X[ii + 1]; ii += 2;} while(0)
+#define spec_loop do {rTmp = A[jj + ii] * X[ii] - A[jj + ii + 1] * X[ii + 1]; iTmp = A[jj + ii] * X[ii + 1] + A[jj + ii + 1] * X[ii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0)
+#define norm_loop_alpha1 do {Y[k] += A[jj + ii] * X[iii] - A[jj + ii + 1] * X[iii + 1]; Y[k + 1] += A[jj + ii] * X[iii + 1] + A[jj + ii + 1] * X[iii]; ii += 2; iii += INCX * 2;} while(0)
+#define norm_loop do {rTmp = A[jj + ii] * X[iii] - A[jj + ii + 1] * X[iii + 1]; iTmp = A[jj + ii] * X[iii + 1] + A[jj + ii + 1] * X[iii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCX * 2;} while(0)
+
+int CNAME(BLASLONG M, BLASLONG N, BLASLONG UNUSED, FLOAT rALPHA, FLOAT iALPHA, FLOAT *A, BLASLONG LDA, FLOAT *X, BLASLONG INCX, FLOAT *Y, BLASLONG INCY, FLOAT *BUFFER) {
+    //Skip the work only when alpha is exactly zero, i.e. BOTH parts are zero (a purely imaginary alpha must still be applied).
+    if(!rALPHA && !iALPHA)
+        return 0;
+
+    if(INCX < 0)
+        INCX = -INCX;
+    if(INCY < 0)
+        INCY = -INCY;
+
+    BLASLONG fahead = 30;
+    BLASLONG spec_unroll = 2;
+    BLASLONG tMQ = M - M % spec_unroll;
+    BLASLONG j = 0, k = 0, jj=0;
+
+    //alpha == 1 + 0i takes the paths that omit the alpha multiply.
+    if(rALPHA == 1 && iALPHA == 0) {
+        if(INCX == 1) {
+            for(; likely(j < N); j++, k += INCY * 2, jj += LDA * 2) {
+                BLASLONG i = 0, ii = 0;
+                for(; likely(i < tMQ); i += spec_unroll) {
+                    prefetch(A[jj + ii + fahead]);
+                    prefetch(X[ii + fahead]);
+                    /*loop_mark*/ spec_loop_alpha1;
+                    /*loop_mark*/ spec_loop_alpha1;
+                }
+                for(; likely(i < M); i++) {
+                    spec_loop_alpha1;
+                }
+            }
+        } else {
+            for(; likely(j < N); j++, k += INCY * 2, jj += LDA * 2) {
+                BLASLONG i = 0, ii = 0, iii = 0;
+                for(; likely(i < tMQ); i += spec_unroll) {
+                    prefetch(A[jj + ii + fahead]);
+                    prefetch(X[iii + fahead]);
+                    /*loop_mark*/ norm_loop_alpha1;
+                    /*loop_mark*/ norm_loop_alpha1;
+                }
+                for(; likely(i < M); i++) {
+                    norm_loop_alpha1;
+                }
+            }
+        }
+    } else {
+        FLOAT rTmp, iTmp;
+        if(INCX == 1) {
+            for(; likely(j < N); j++, k += INCY * 2, jj += LDA * 2) {
+                BLASLONG i = 0, ii = 0;
+                for(; likely(i < tMQ); i += spec_unroll) {
+                    prefetch(A[jj + ii + fahead]);
+                    prefetch(X[ii + fahead]);
+                    /*loop_mark*/ spec_loop;
+                    /*loop_mark*/ spec_loop;
+                }
+                for(; likely(i < M); i++) {
+                    spec_loop;
+                }
+            }
+        } else {
+            for(; likely(j < N); j++, k += INCY * 2, jj += LDA * 2) {
+                BLASLONG i = 0, ii = 0, iii = 0;
+                for(; likely(i < tMQ); i += spec_unroll) {
+                    prefetch(A[jj + ii + fahead]);
+                    prefetch(X[iii + fahead]);
+                    /*loop_mark*/ norm_loop;
+                    /*loop_mark*/ norm_loop;
+                }
+                for(; likely(i < M); i++) {
+                    norm_loop;
+                }
+            }
+        }
+    }
+    return 0;
+}