Merge branch 'loongson3a' into release-0.1.0
This commit is contained in:
commit
83ecfbb9b3
1
README
1
README
|
@ -72,6 +72,7 @@ Please see Changelog.txt to obtain the differences between GotoBLAS2 1.13 BSD ve
|
|||
9.Known Issues
|
||||
* The number of CPUs/Cores should be less than or equal to 8*sizeof(unsigned long). On 64 bits, the limit
|
||||
is 64. On 32 bits, it is 32.
|
||||
* On Loongson 3A, "make test" may fail because of a pthread_create error (the error code is EAGAIN). However, the same test case runs fine when it is started directly from the shell, so this does not appear to be a bug in OpenBLAS.
|
||||
|
||||
10. Specification of Git Branches
|
||||
We used the git branching model in this article (http://nvie.com/posts/a-successful-git-branching-model/).
|
||||
|
|
|
@ -2127,7 +2127,9 @@
|
|||
#endif
|
||||
|
||||
#ifndef ASSEMBLER
|
||||
#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64)
|
||||
#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64)
|
||||
extern BLASLONG gemm_offset_a;
|
||||
extern BLASLONG gemm_offset_b;
|
||||
extern BLASLONG sgemm_p;
|
||||
extern BLASLONG sgemm_q;
|
||||
extern BLASLONG sgemm_r;
|
||||
|
|
|
@ -152,6 +152,7 @@ static inline int blas_quickdivide(blasint x, blasint y){
|
|||
#define CMPEQ c.eq.d
|
||||
#define CMPLE c.le.d
|
||||
#define CMPLT c.lt.d
|
||||
#define NEG neg.d
|
||||
#else
|
||||
#define LD lwc1
|
||||
#define ST swc1
|
||||
|
@ -170,6 +171,14 @@ static inline int blas_quickdivide(blasint x, blasint y){
|
|||
#define CMPEQ c.eq.s
|
||||
#define CMPLE c.le.s
|
||||
#define CMPLT c.lt.s
|
||||
#define PLU plu.ps
|
||||
#define PLL pll.ps
|
||||
#define PUU puu.ps
|
||||
#define PUL pul.ps
|
||||
#define MADPS madd.ps
|
||||
#define CVTU cvt.s.pu
|
||||
#define CVTL cvt.s.pl
|
||||
#define NEG neg.s
|
||||
#endif
|
||||
|
||||
#if defined(__64BIT__) && defined(USE64BITINT)
|
||||
|
@ -218,7 +227,7 @@ REALNAME: ;\
|
|||
|
||||
#define SEEK_ADDRESS
|
||||
|
||||
#define BUFFER_SIZE ( 8 << 20)
|
||||
#define BUFFER_SIZE ( 32 << 20)
|
||||
|
||||
#if defined(LOONGSON3A)
|
||||
#define PAGESIZE (16UL << 10)
|
||||
|
|
|
@ -71,16 +71,25 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (
|
|||
queue[num_cpu].args = arg;
|
||||
queue[num_cpu].range_m = range_m;
|
||||
queue[num_cpu].range_n = &range[num_cpu];
|
||||
queue[num_cpu].sa = NULL;
|
||||
#if defined(LOONGSON3A)
|
||||
queue[num_cpu].sa = sa + GEMM_OFFSET_A1 * num_cpu;
|
||||
queue[num_cpu].sb = queue[num_cpu].sa + GEMM_OFFSET_A1 * 5;
|
||||
#else
|
||||
queue[num_cpu].sa = NULL;
|
||||
queue[num_cpu].sb = NULL;
|
||||
#endif
|
||||
queue[num_cpu].next = &queue[num_cpu + 1];
|
||||
num_cpu ++;
|
||||
}
|
||||
|
||||
if (num_cpu) {
|
||||
#if defined(LOONGSON3A)
|
||||
queue[0].sa = sa;
|
||||
queue[0].sb = sb;
|
||||
|
||||
queue[0].sb = sa + GEMM_OFFSET_A1 * 5;
|
||||
#else
|
||||
queue[0].sa = sa;
|
||||
queue[0].sb = sb;
|
||||
#endif
|
||||
queue[num_cpu - 1].next = NULL;
|
||||
|
||||
exec_blas(num_cpu,
|
||||
|
|
|
@ -500,6 +500,7 @@ static int blas_monitor(void *arg){
|
|||
/* Initializing routine */
|
||||
int blas_thread_init(void){
|
||||
BLASLONG i;
|
||||
int ret;
|
||||
#ifdef NEED_STACKATTR
|
||||
pthread_attr_t attr;
|
||||
#endif
|
||||
|
@ -545,12 +546,16 @@ int blas_thread_init(void){
|
|||
pthread_cond_init (&thread_status[i].wakeup, NULL);
|
||||
|
||||
#ifdef NEED_STACKATTR
|
||||
pthread_create(&blas_threads[i], &attr,
|
||||
ret=pthread_create(&blas_threads[i], &attr,
|
||||
(void *)&blas_thread_server, (void *)i);
|
||||
#else
|
||||
pthread_create(&blas_threads[i], NULL,
|
||||
ret=pthread_create(&blas_threads[i], NULL,
|
||||
(void *)&blas_thread_server, (void *)i);
|
||||
#endif
|
||||
if(ret!=0){
|
||||
fprintf(STDERR,"OpenBLAS: pthread_creat error in blas_thread_init function. Error code:%d\n",ret);
|
||||
exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef MONITOR
|
||||
|
@ -797,6 +802,11 @@ void goto_set_num_threads(int num_threads) {
|
|||
|
||||
blas_cpu_number = num_threads;
|
||||
|
||||
#if defined(ARCH_MIPS64)
|
||||
//set parameters for different number of threads.
|
||||
blas_set_parameter();
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
void openblas_set_num_threads(int num_threads) {
|
||||
|
|
|
@ -63,6 +63,11 @@ void goto_set_num_threads(int num_threads) {
|
|||
|
||||
omp_set_num_threads(blas_cpu_number);
|
||||
|
||||
#if defined(ARCH_MIPS64)
|
||||
//set parameters for different number of threads.
|
||||
blas_set_parameter();
|
||||
#endif
|
||||
|
||||
}
|
||||
void openblas_set_num_threads(int num_threads) {
|
||||
|
||||
|
|
|
@ -884,7 +884,7 @@ void *blas_memory_alloc(int procpos){
|
|||
if (!blas_num_threads) blas_cpu_number = blas_get_cpu_number();
|
||||
#endif
|
||||
|
||||
#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64)
|
||||
#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64)
|
||||
#ifndef DYNAMIC_ARCH
|
||||
blas_set_parameter();
|
||||
#endif
|
||||
|
|
|
@ -45,8 +45,22 @@ int get_L2_size(void);
|
|||
#define DEFAULT_GEMM_P 128
|
||||
#define DEFAULT_GEMM_Q 128
|
||||
#define DEFAULT_GEMM_R 128
|
||||
#define DEFAULT_GEMM_OFFSET_A 0
|
||||
#define DEFAULT_GEMM_OFFSET_B 0
|
||||
|
||||
/* Global Parameter */
|
||||
#if GEMM_OFFSET_A == gemm_offset_a
|
||||
BLASLONG gemm_offset_a = DEFAULT_GEMM_OFFSET_A;
|
||||
#else
|
||||
BLASLONG gemm_offset_a = GEMM_OFFSET_A;
|
||||
#endif
|
||||
|
||||
#if GEMM_OFFSET_B == gemm_offset_b
|
||||
BLASLONG gemm_offset_b = DEFAULT_GEMM_OFFSET_B;
|
||||
#else
|
||||
BLASLONG gemm_offset_b = GEMM_OFFSET_B;
|
||||
#endif
|
||||
|
||||
#if SGEMM_P == sgemm_p
|
||||
BLASLONG sgemm_p = DEFAULT_GEMM_P;
|
||||
#else
|
||||
|
@ -666,3 +680,21 @@ void blas_set_parameter(void){
|
|||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(ARCH_MIPS64)
|
||||
/*
 * Selects the run-time DGEMM R blocking value (dgemm_r) for MIPS64 builds.
 *
 * Only LOONGSON3A builds set anything; for other MIPS64 targets the body is
 * empty. In an SMP build the value depends on blas_num_threads: 1024 when
 * exactly one thread is configured, 200 otherwise. A non-SMP build always
 * uses the single-thread value.
 */
void blas_set_parameter(void){
#if defined(LOONGSON3A)
#if defined(SMP)
  /* single thread -> 1024, multi thread -> 200 */
  dgemm_r = (blas_num_threads == 1) ? 1024 : 200;
#else
  /* single thread */
  dgemm_r = 1024;
#endif
#endif
}
|
||||
#endif
|
||||
|
|
|
@ -136,6 +136,7 @@ void NAME(char *SIDE, char *UPLO,
|
|||
FLOAT *sa, *sb;
|
||||
|
||||
#ifdef SMP
|
||||
#ifndef COMPLEX
|
||||
#ifdef XDOUBLE
|
||||
int mode = BLAS_XDOUBLE | BLAS_REAL;
|
||||
#elif defined(DOUBLE)
|
||||
|
@ -143,6 +144,15 @@ void NAME(char *SIDE, char *UPLO,
|
|||
#else
|
||||
int mode = BLAS_SINGLE | BLAS_REAL;
|
||||
#endif
|
||||
#else
|
||||
#ifdef XDOUBLE
|
||||
int mode = BLAS_XDOUBLE | BLAS_COMPLEX;
|
||||
#elif defined(DOUBLE)
|
||||
int mode = BLAS_DOUBLE | BLAS_COMPLEX;
|
||||
#else
|
||||
int mode = BLAS_SINGLE | BLAS_COMPLEX;
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(SMP) && !defined(NO_AFFINITY)
|
||||
|
@ -237,6 +247,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo,
|
|||
FLOAT *sa, *sb;
|
||||
|
||||
#ifdef SMP
|
||||
#ifndef COMPLEX
|
||||
#ifdef XDOUBLE
|
||||
int mode = BLAS_XDOUBLE | BLAS_REAL;
|
||||
#elif defined(DOUBLE)
|
||||
|
@ -244,6 +255,15 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo,
|
|||
#else
|
||||
int mode = BLAS_SINGLE | BLAS_REAL;
|
||||
#endif
|
||||
#else
|
||||
#ifdef XDOUBLE
|
||||
int mode = BLAS_XDOUBLE | BLAS_COMPLEX;
|
||||
#elif defined(DOUBLE)
|
||||
int mode = BLAS_DOUBLE | BLAS_COMPLEX;
|
||||
#else
|
||||
int mode = BLAS_SINGLE | BLAS_COMPLEX;
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(SMP) && !defined(NO_AFFINITY)
|
||||
|
|
|
@ -123,15 +123,37 @@ ifndef DTRSMKERNEL_RT
|
|||
DTRSMKERNEL_RT = trsm_kernel_RT.S
|
||||
endif
|
||||
|
||||
ifndef CTRSMKERNEL_LN
|
||||
CTRSMKERNEL_LN = ztrsm_kernel_LT.S
|
||||
CTRSMKERNEL_LT = ztrsm_kernel_LT.S
|
||||
CTRSMKERNEL_RN = ztrsm_kernel_LT.S
|
||||
CTRSMKERNEL_RT = ztrsm_kernel_RT.S
|
||||
endif
|
||||
|
||||
ifndef CTRSMKERNEL_LT
|
||||
CTRSMKERNEL_LT = ztrsm_kernel_LT.S
|
||||
endif
|
||||
|
||||
ifndef CTRSMKERNEL_RN
|
||||
CTRSMKERNEL_RN = ztrsm_kernel_LT.S
|
||||
endif
|
||||
|
||||
ifndef CTRSMKERNEL_RT
|
||||
CTRSMKERNEL_RT = ztrsm_kernel_RT.S
|
||||
endif
|
||||
|
||||
ifndef ZTRSMKERNEL_LN
|
||||
ZTRSMKERNEL_LN = ztrsm_kernel_LT.S
|
||||
endif
|
||||
|
||||
ifndef ZTRSMKERNEL_LT
|
||||
ZTRSMKERNEL_LT = ztrsm_kernel_LT.S
|
||||
endif
|
||||
|
||||
ifndef ZTRSMKERNEL_RN
|
||||
ZTRSMKERNEL_RN = ztrsm_kernel_LT.S
|
||||
endif
|
||||
|
||||
ifndef ZTRSMKERNEL_RT
|
||||
ZTRSMKERNEL_RT = ztrsm_kernel_RT.S
|
||||
endif
|
||||
|
||||
CGEMM3MKERNEL = zgemm3m_kernel.S
|
||||
ZGEMM3MKERNEL = zgemm3m_kernel.S
|
||||
|
|
|
@ -1,18 +1,48 @@
|
|||
SAXPYKERNEL=axpy_loongson3a.S
|
||||
DAXPYKERNEL=daxpy_loongson3a_simd.S
|
||||
|
||||
SGEMMKERNEL = sgemm_kernel_loongson3a.S
|
||||
SGEMMONCOPY = ../generic/gemm_ncopy_4.c
|
||||
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
|
||||
SGEMVNKERNEL = gemv_n_loongson3a.c
|
||||
SGEMVTKERNEL = gemv_t_loongson3a.c
|
||||
DGEMVNKERNEL = gemv_n_loongson3a.c
|
||||
DGEMVTKERNEL = gemv_t_loongson3a.c
|
||||
CGEMVNKERNEL = zgemv_n_loongson3a.c
|
||||
CGEMVTKERNEL = zgemv_t_loongson3a.c
|
||||
ZGEMVNKERNEL = zgemv_n_loongson3a.c
|
||||
ZGEMVTKERNEL = zgemv_t_loongson3a.c
|
||||
|
||||
|
||||
SGEMMKERNEL = sgemm_kernel_8x4_ps.S
|
||||
SGEMMINCOPY = ../generic/gemm_ncopy_8.c
|
||||
SGEMMITCOPY = ../generic/gemm_tcopy_8.c
|
||||
SGEMMONCOPY = ../generic/gemm_ncopy_4.c
|
||||
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
|
||||
SGEMMINCOPYOBJ = sgemm_incopy.o
|
||||
SGEMMITCOPYOBJ = sgemm_itcopy.o
|
||||
SGEMMONCOPYOBJ = sgemm_oncopy.o
|
||||
SGEMMOTCOPYOBJ = sgemm_otcopy.o
|
||||
|
||||
DGEMMKERNEL = gemm_kernel_loongson3a.S
|
||||
DGEMMKERNEL = dgemm_kernel_loongson3a_4x4.S
|
||||
DGEMMONCOPY = ../generic/gemm_ncopy_4.c
|
||||
DGEMMOTCOPY = ../generic/gemm_tcopy_4.c
|
||||
DGEMMONCOPYOBJ = dgemm_oncopy.o
|
||||
DGEMMOTCOPYOBJ = dgemm_otcopy.o
|
||||
|
||||
CGEMMKERNEL = cgemm_kernel_loongson3a_4x2_ps.S
|
||||
CGEMMINCOPY = ../generic/zgemm_ncopy_4.c
|
||||
CGEMMITCOPY = ../generic/zgemm_tcopy_4.c
|
||||
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
||||
CGEMMINCOPYOBJ = cgemm_incopy.o
|
||||
CGEMMITCOPYOBJ = cgemm_itcopy.o
|
||||
CGEMMONCOPYOBJ = cgemm_oncopy.o
|
||||
CGEMMOTCOPYOBJ = cgemm_otcopy.o
|
||||
|
||||
ZGEMMKERNEL = zgemm_kernel_loongson3a_2x2.S
|
||||
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
||||
ZGEMMONCOPYOBJ = zgemm_oncopy.o
|
||||
ZGEMMOTCOPYOBJ = zgemm_otcopy.o
|
||||
|
||||
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
|
@ -22,3 +52,17 @@ DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
|||
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,101 @@
|
|||
#include "common.h"
|
||||
|
||||
//These are auto-tuning codes on Loongson-3A platform.
|
||||
|
||||
//#define prefetch(x) __builtin_prefetch(x)
|
||||
//#define prefetch(x) do {_mm_prefetch((char *)(x), _MM_HINT_T0);} while(0)
|
||||
/* Issues a load of the memory operand x into register $0 (the result is not
 * kept); presumably this acts as the prefetch primitive on Loongson-3A, as the
 * macro name and the disabled alternatives above suggest. */
#define prefetch(x) __asm__ __volatile__("ld $0, %0"::"m"(x))
|
||||
/* Branch hints: tell the compiler the condition is expected to be true (likely)
 * or false (unlikely). They do not change the value of the condition. */
#define likely(x) __builtin_expect(!!(x), 1)
|
||||
#define unlikely(x) __builtin_expect(!!(x), 0)
|
||||
|
||||
/* Loop-body macros for the kernel below. They are NOT self-contained: each one
 * reads/updates names of the enclosing function (Y, A, X, LDA, ALPHA, INCY and
 * the locals i, j, k, h). Each performs one multiply-accumulate into Y and
 * advances the row index i.
 *   spec_*   : Y is indexed by i directly (used when INCY == 1).
 *   norm_*   : Y is indexed by h, which is advanced by INCY.
 *   *_alpha1 : the multiplication by ALPHA is omitted (used when ALPHA == 1). */
#define spec_loop_alpha1 do {Y[i] += A[LDA * j + i] * X[k]; i++;} while(0)
|
||||
#define spec_loop do {Y[i] += ALPHA * A[LDA * j + i] * X[k]; i++;} while(0)
|
||||
#define norm_loop_alpha1 do {Y[h] += A[LDA * j + i] * X[k]; i++; h += INCY;} while(0)
|
||||
#define norm_loop do {Y[h] += ALPHA * A[LDA * j + i] * X[k]; i++; h += INCY;} while(0)
|
||||
|
||||
/*
 * Real GEMV kernel, non-transposed form: Y += ALPHA * A * X.
 *
 * A is walked column by column (element (i, j) is A[LDA * j + i]); column j is
 * scaled by the X element at index k, where k starts at 0 and advances by INCX
 * per column, and is accumulated into the M elements of Y.
 *
 * Four loop variants are selected at run time:
 *   - INCY == 1 uses the spec_* macros (Y indexed by the row index i),
 *     otherwise the norm_* macros (Y indexed by h, stepping by INCY);
 *   - ALPHA == 1 uses the *_alpha1 macros, which skip the multiply by ALPHA.
 * The main inner loop is unrolled four times and prefetches A and Y a fixed
 * distance ahead; a scalar tail loop handles the remaining M % 4 rows.
 *
 * UNUSED and BUFFER are not referenced. No start-offset or sign adjustment is
 * made for negative INCX/INCY. Always returns 0.
 *
 * Note: the names i, j, k and h must not be renamed; the loop-body macros
 * defined above this function refer to them.
 */
int CNAME(BLASLONG M, BLASLONG N, BLASLONG UNUSED, FLOAT ALPHA, FLOAT *A, BLASLONG LDA, FLOAT *X, BLASLONG INCX, FLOAT *Y, BLASLONG INCY, FLOAT *BUFFER)
{
	BLASLONG j = 0, k = 0;

	/* A zero scale factor contributes nothing to Y. */
	if (ALPHA == 0)
		return 0;

	const BLASLONG lookahead = 30;          /* prefetch distance, in elements */
	const BLASLONG unroll = 4;              /* macro calls per main-loop trip */
	const BLASLONG body_end = M - M % unroll;

	if (INCY == 1) {
		if (ALPHA == 1) {
			/* contiguous Y, no scaling */
			for (; likely(j < N); j++, k += INCX) {
				BLASLONG i = 0;

				while (likely(i < body_end)) {
					prefetch(A[LDA * j + i + lookahead]);
					prefetch(Y[i + lookahead]);
					/*loop_mark*/ spec_loop_alpha1;
					/*loop_mark*/ spec_loop_alpha1;
					/*loop_mark*/ spec_loop_alpha1;
					/*loop_mark*/ spec_loop_alpha1;
				}
				while (likely(i < M)) {
					spec_loop_alpha1;
				}
			}
		} else {
			/* contiguous Y, scaled by ALPHA */
			for (; likely(j < N); j++, k += INCX) {
				BLASLONG i = 0;

				while (likely(i < body_end)) {
					prefetch(A[LDA * j + i + lookahead]);
					prefetch(Y[i + lookahead]);
					/*loop_mark*/ spec_loop;
					/*loop_mark*/ spec_loop;
					/*loop_mark*/ spec_loop;
					/*loop_mark*/ spec_loop;
				}
				while (likely(i < M)) {
					spec_loop;
				}
			}
		}
	} else {
		if (ALPHA == 1) {
			/* strided Y, no scaling */
			for (; likely(j < N); j++, k += INCX) {
				BLASLONG i = 0, h = 0;

				while (likely(i < body_end)) {
					prefetch(A[LDA * j + i + lookahead]);
					prefetch(Y[h + lookahead]);
					/*loop_mark*/ norm_loop_alpha1;
					/*loop_mark*/ norm_loop_alpha1;
					/*loop_mark*/ norm_loop_alpha1;
					/*loop_mark*/ norm_loop_alpha1;
				}
				while (likely(i < M)) {
					norm_loop_alpha1;
				}
			}
		} else {
			/* strided Y, scaled by ALPHA */
			for (; likely(j < N); j++, k += INCX) {
				BLASLONG i = 0, h = 0;

				while (likely(i < body_end)) {
					prefetch(A[LDA * j + i + lookahead]);
					prefetch(Y[h + lookahead]);
					/*loop_mark*/ norm_loop;
					/*loop_mark*/ norm_loop;
					/*loop_mark*/ norm_loop;
					/*loop_mark*/ norm_loop;
				}
				while (likely(i < M)) {
					norm_loop;
				}
			}
		}
	}

	return 0;
}
|
|
@ -0,0 +1,93 @@
|
|||
#include "common.h"
|
||||
|
||||
//These are auto-tuning codes on Loongson-3A platform.
|
||||
|
||||
//#define prefetch(x) __builtin_prefetch(x)
|
||||
//#define prefetch(x) do {_mm_prefetch((char *)(x), _MM_HINT_T0);} while(0)
|
||||
/* Issues a load of the memory operand x into register $0 (the result is not
 * kept); presumably this acts as the prefetch primitive on Loongson-3A, as the
 * macro name and the disabled alternatives above suggest. */
#define prefetch(x) __asm__ __volatile__("ld $0, %0"::"m"(x))
|
||||
/* Branch hints: tell the compiler the condition is expected to be true (likely)
 * or false (unlikely). They do not change the value of the condition. */
#define likely(x) __builtin_expect(!!(x), 1)
|
||||
#define unlikely(x) __builtin_expect(!!(x), 0)
|
||||
|
||||
/* Loop-body macros for the kernel below. They are NOT self-contained: each one
 * reads/updates names of the enclosing function (Y, A, X, LDA, ALPHA, INCX and
 * the locals i, j, k, h). Each accumulates one product into Y[k] and advances
 * the row index i.
 *   spec_*   : X is indexed by i directly (used when INCX == 1).
 *   norm_*   : X is indexed by h, which is advanced by INCX.
 *   *_alpha1 : the multiplication by ALPHA is omitted (used when ALPHA == 1). */
#define spec_loop_alpha1 do {Y[k] += A[LDA * j + i] * X[i]; i++;} while(0)
|
||||
#define spec_loop do {Y[k] += ALPHA * A[LDA * j + i] * X[i]; i++;} while(0)
|
||||
#define norm_loop_alpha1 do {Y[k] += A[LDA * j + i] * X[h]; i++; h += INCX;} while(0)
|
||||
#define norm_loop do {Y[k] += ALPHA * A[LDA * j + i] * X[h]; i++; h += INCX;} while(0)
|
||||
|
||||
/*
 * Real GEMV kernel, transposed form: Y += ALPHA * A^T * X.
 *
 * For every column j of A (element (i, j) is A[LDA * j + i]) the products of
 * the M column entries with the matching X entries are accumulated into the
 * single output element Y[k]; k starts at 0 and advances by INCY per column.
 *
 * Four loop variants are selected at run time:
 *   - INCX == 1 uses the spec_* macros (X indexed by the row index i),
 *     otherwise the norm_* macros (X indexed by h, stepping by INCX);
 *   - ALPHA == 1 uses the *_alpha1 macros, which skip the multiply by ALPHA.
 * The main inner loop is unrolled three times and prefetches A and X a fixed
 * distance ahead; a scalar tail loop handles the remaining M % 3 rows.
 *
 * UNUSED and BUFFER are not referenced. No adjustment is made for negative
 * INCX/INCY. Always returns 0.
 *
 * Note: the names i, j, k and h must not be renamed; the loop-body macros
 * defined above this function refer to them.
 */
int CNAME(BLASLONG M, BLASLONG N, BLASLONG UNUSED, FLOAT ALPHA, FLOAT *A, BLASLONG LDA, FLOAT *X, BLASLONG INCX, FLOAT *Y, BLASLONG INCY, FLOAT *BUFFER) {

	BLASLONG j = 0, k = 0;

	/* A zero scale factor contributes nothing to Y. */
	if (ALPHA == 0)
		return 0;

	const BLASLONG lookahead = 30;          /* prefetch distance, in elements */
	const BLASLONG unroll = 3;              /* macro calls per main-loop trip */
	const BLASLONG body_end = M - M % unroll;

	if (INCX == 1) {
		if (ALPHA == 1) {
			/* contiguous X, no scaling */
			for (; likely(j < N); j++, k += INCY) {
				BLASLONG i = 0;

				while (likely(i < body_end)) {
					prefetch(A[LDA * j + i + lookahead]);
					prefetch(X[i + lookahead]);
					/*loop_mark*/ spec_loop_alpha1;
					/*loop_mark*/ spec_loop_alpha1;
					/*loop_mark*/ spec_loop_alpha1;
				}
				while (likely(i < M)) {
					spec_loop_alpha1;
				}
			}
		} else {
			/* contiguous X, scaled by ALPHA */
			for (; likely(j < N); j++, k += INCY) {
				BLASLONG i = 0;

				while (likely(i < body_end)) {
					prefetch(A[LDA * j + i + lookahead]);
					prefetch(X[i + lookahead]);
					/*loop_mark*/ spec_loop;
					/*loop_mark*/ spec_loop;
					/*loop_mark*/ spec_loop;
				}
				while (likely(i < M)) {
					spec_loop;
				}
			}
		}
	} else {
		if (ALPHA == 1) {
			/* strided X, no scaling */
			for (; likely(j < N); j++, k += INCY) {
				BLASLONG i = 0, h = 0;

				while (likely(i < body_end)) {
					prefetch(A[LDA * j + i + lookahead]);
					prefetch(X[h + lookahead]);
					/*loop_mark*/ norm_loop_alpha1;
					/*loop_mark*/ norm_loop_alpha1;
					/*loop_mark*/ norm_loop_alpha1;
				}
				while (likely(i < M)) {
					norm_loop_alpha1;
				}
			}
		} else {
			/* strided X, scaled by ALPHA */
			for (; likely(j < N); j++, k += INCY) {
				BLASLONG i = 0, h = 0;

				while (likely(i < body_end)) {
					prefetch(A[LDA * j + i + lookahead]);
					prefetch(X[h + lookahead]);
					/*loop_mark*/ norm_loop;
					/*loop_mark*/ norm_loop;
					/*loop_mark*/ norm_loop;
				}
				while (likely(i < M)) {
					norm_loop;
				}
			}
		}
	}

	return 0;
}
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,139 @@
|
|||
#include "common.h"
|
||||
|
||||
//typedef int BLASLONG;
|
||||
//typedef double FLOAT;
|
||||
|
||||
#define prefetch(x) __asm__ __volatile__("ld $0, %0"::"m"(x))
|
||||
#define likely(x) __builtin_expect(!!(x), 1)
|
||||
#define unlikely(x) __builtin_expect(!!(x), 0)
|
||||
|
||||
#if !defined(CONJ) && !defined(XCONJ)
|
||||
#define spec_loop_alpha1 spec_loop_alpha1_0
|
||||
#define spec_loop spec_loop_0
|
||||
#define norm_loop_alpha1 norm_loop_alpha1_0
|
||||
#define norm_loop norm_loop_0
|
||||
#endif
|
||||
|
||||
#if defined(CONJ) && !defined(XCONJ)
|
||||
#define spec_loop_alpha1 spec_loop_alpha1_1
|
||||
#define spec_loop spec_loop_1
|
||||
#define norm_loop_alpha1 norm_loop_alpha1_1
|
||||
#define norm_loop norm_loop_1
|
||||
#endif
|
||||
|
||||
#if !defined(CONJ) && defined(XCONJ)
|
||||
#define spec_loop_alpha1 spec_loop_alpha1_2
|
||||
#define spec_loop spec_loop_2
|
||||
#define norm_loop_alpha1 norm_loop_alpha1_2
|
||||
#define norm_loop norm_loop_2
|
||||
#endif
|
||||
|
||||
#if defined(CONJ) && defined(XCONJ)
|
||||
#define spec_loop_alpha1 spec_loop_alpha1_3
|
||||
#define spec_loop spec_loop_3
|
||||
#define norm_loop_alpha1 norm_loop_alpha1_3
|
||||
#define norm_loop norm_loop_3
|
||||
#endif
|
||||
|
||||
#define spec_loop_alpha1_0 do {Y[ii] += A[jj + ii] * X[k]; Y[ii + 1] += A[jj + ii + 1] * X[k]; Y[ii + 1] += A[jj + ii] * X[k + 1]; Y[ii] -= A[jj + ii + 1] * X[k + 1]; ii += 2;} while(0)
|
||||
|
||||
#define spec_loop_alpha1_1 do {Y[ii] += A[jj + ii] * X[k]; Y[ii + 1] -= A[jj + ii + 1] * X[k]; Y[ii + 1] += A[jj + ii] * X[k + 1]; Y[ii] += A[jj + ii + 1] * X[k + 1]; ii += 2;} while(0)
|
||||
|
||||
#define spec_loop_alpha1_2 do {Y[ii] += A[jj + ii] * X[k]; Y[ii + 1] += A[jj + ii + 1] * X[k]; Y[ii + 1] -= A[jj + ii] * X[k + 1]; Y[ii] += A[jj + ii + 1] * X[k + 1]; ii += 2;} while(0)
|
||||
|
||||
#define spec_loop_alpha1_3 do {Y[ii] += A[jj + ii] * X[k]; Y[ii + 1] -= A[jj + ii + 1] * X[k]; Y[ii + 1] -= A[jj + ii] * X[k + 1]; Y[ii] -= A[jj + ii + 1] * X[k + 1]; ii += 2;} while(0)
|
||||
|
||||
#define spec_loop_0 do {rTmp = A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; iTmp = A[jj + ii] * X[k + 1] + A[jj + ii + 1] * X[k]; Y[ii] += rTmp * rALPHA - iTmp * iALPHA; Y[ii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0)
|
||||
|
||||
#define spec_loop_1 do {rTmp = A[jj + ii] * X[k] + A[jj + ii + 1] * X[k + 1]; iTmp = A[jj + ii] * X[k + 1] - A[jj + ii + 1] * X[k]; Y[ii] += rTmp * rALPHA - iTmp * iALPHA; Y[ii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0)
|
||||
|
||||
#define spec_loop_2 do {rTmp = A[jj + ii] * X[k] + A[jj + ii + 1] * X[k + 1]; iTmp = -A[jj + ii] * X[k + 1] + A[jj + ii + 1] * X[k]; Y[ii] += rTmp * rALPHA - iTmp * iALPHA; Y[ii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0)
|
||||
|
||||
#define spec_loop_3 do {rTmp = A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; iTmp = -A[jj + ii] * X[k + 1] - A[jj + ii + 1] * X[k]; Y[ii] += rTmp * rALPHA - iTmp * iALPHA; Y[ii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0)
|
||||
|
||||
#define norm_loop_alpha1_0 do {Y[iii] += A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; Y[iii + 1] += A[jj + ii] * X[k + 1] + A[jj + ii + 1] * X[k]; ii += 2; iii += INCY * 2;} while(0)
|
||||
|
||||
#define norm_loop_alpha1_1 do {Y[iii] += A[jj + ii] * X[k] + A[jj + ii + 1] * X[k + 1]; Y[iii + 1] += A[jj + ii] * X[k + 1] - A[jj + ii + 1] * X[k]; ii += 2; iii += INCY * 2;} while(0)
|
||||
|
||||
#define norm_loop_alpha1_2 do {Y[iii] += A[jj + ii] * X[k] + A[jj + ii + 1] * X[k + 1]; Y[iii + 1] += -A[jj + ii] * X[k + 1] + A[jj + ii + 1] * X[k]; ii += 2; iii += INCY * 2;} while(0)
|
||||
|
||||
#define norm_loop_alpha1_3 do {Y[iii] += A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; Y[iii + 1] += -A[jj + ii] * X[k + 1] - A[jj + ii + 1] * X[k]; ii += 2; iii += INCY * 2;} while(0)
|
||||
|
||||
#define norm_loop_0 do {rTmp = A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; iTmp = A[jj + ii] * X[k + 1] + A[jj + ii + 1] * X[k]; Y[iii] += rTmp * rALPHA - iTmp * iALPHA; Y[iii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCY * 2;} while(0)
|
||||
|
||||
#define norm_loop_1 do {rTmp = A[jj + ii] * X[k] + A[jj + ii + 1] * X[k + 1]; iTmp = A[jj + ii] * X[k + 1] - A[jj + ii + 1] * X[k]; Y[iii] += rTmp * rALPHA - iTmp * iALPHA; Y[iii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCY * 2;} while(0)
|
||||
|
||||
#define norm_loop_2 do {rTmp = A[jj + ii] * X[k] + A[jj + ii + 1] * X[k + 1]; iTmp = -A[jj + ii] * X[k + 1] + A[jj + ii + 1] * X[k]; Y[iii] += rTmp * rALPHA - iTmp * iALPHA; Y[iii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCY * 2;} while(0)
|
||||
|
||||
#define norm_loop_3 do {rTmp = A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; iTmp = -A[jj + ii] * X[k + 1] - A[jj + ii + 1] * X[k]; Y[iii] += rTmp * rALPHA - iTmp * iALPHA; Y[iii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCY * 2;} while(0)
|
||||
|
||||
/*
 * Complex GEMV kernel, non-transposed form: Y += alpha * op(A) * op(X), with
 * alpha = rALPHA + i*iALPHA.
 *
 * A, X and Y hold complex values as interleaved (real, imaginary) pairs, which
 * is why every index and stride below is counted in FLOATs and multiplied by
 * two. Column j of A starts at A[jj] (jj advances by LDA * 2 per column) and is
 * combined with the X element at X[k], X[k + 1] (k advances by INCX * 2).
 *
 * The conjugation variant is fixed at compile time: the spec_* / norm_* loop-body
 * macros are mapped by the CONJ / XCONJ selection above this function.
 * Run-time variants:
 *   - INCY == 1 uses the spec_* macros (Y indexed by ii),
 *     otherwise the norm_* macros (Y indexed by iii, stepping by INCY * 2);
 *   - alpha == 1 + 0i uses the *_alpha1 macros, which skip the scaling.
 * The main inner loop handles two complex rows per trip and prefetches A and Y
 * a fixed distance ahead; a tail loop handles an odd last row.
 *
 * UNUSED and BUFFER are not referenced. Always returns 0.
 *
 * Note: the names ii, iii, jj, k, rTmp and iTmp must not be renamed; the
 * loop-body macros refer to them.
 */
int CNAME(BLASLONG M, BLASLONG N, BLASLONG UNUSED, FLOAT rALPHA, FLOAT iALPHA, FLOAT *A, BLASLONG LDA, FLOAT *X, BLASLONG INCX, FLOAT *Y, BLASLONG INCY, FLOAT *BUFFER) {

	/*
	 * Quick return only when alpha is exactly zero, i.e. BOTH parts are zero.
	 * The previous test (!rALPHA && iALPHA) returned early for a purely
	 * imaginary alpha, leaving Y without the required update.
	 */
	if(rALPHA == 0 && iALPHA == 0)
		return 0;

	BLASLONG fahead = 60;                 /* prefetch distance, in FLOATs */
	BLASLONG spec_unroll = 2;             /* complex rows per main-loop trip */
	BLASLONG tMQ = M - M % spec_unroll;   /* end of the unrolled part */
	BLASLONG j = 0, k = 0, jj = 0;

	if(rALPHA == 1 && iALPHA == 0) {
		/* alpha == 1: accumulate the products without scaling. */
		if(INCY == 1) {
			for(; likely(j < N); j++, k += INCX * 2, jj += LDA * 2) {
				BLASLONG i = 0, ii = 0;
				for(; likely(i < tMQ); i += spec_unroll) {
					prefetch(A[jj + ii + fahead]);
					prefetch(Y[ii + fahead]);
					/*loop_mark*/ spec_loop_alpha1;
					/*loop_mark*/ spec_loop_alpha1;
				}
				for(; likely(i < M); i++) {
					spec_loop_alpha1;
				}
			}
		} else {
			for(; likely(j < N); j++, k += INCX * 2, jj += LDA * 2) {
				BLASLONG i = 0, ii = 0, iii = 0;
				for(; likely(i < tMQ); i += spec_unroll) {
					prefetch(A[jj + ii + fahead]);
					prefetch(Y[iii + fahead]);
					/*loop_mark*/ norm_loop_alpha1;
					/*loop_mark*/ norm_loop_alpha1;
				}
				for(; likely(i < M); i++) {
					norm_loop_alpha1;
				}
			}
		}
	} else {
		/* General alpha: the macros form the product in rTmp/iTmp, then scale. */
		FLOAT rTmp, iTmp;
		if(INCY == 1) {
			for(; likely(j < N); j++, k += INCX * 2, jj += LDA * 2) {
				BLASLONG i = 0, ii = 0;
				for(; likely(i < tMQ); i += spec_unroll) {
					prefetch(A[jj + ii + fahead]);
					prefetch(Y[ii + fahead]);
					/*loop_mark*/ spec_loop;
					/*loop_mark*/ spec_loop;
				}
				for(; likely(i < M); i++) {
					spec_loop;
				}
			}
		} else {
			for(; likely(j < N); j++, k += INCX * 2, jj += LDA * 2) {
				BLASLONG i = 0, ii = 0, iii = 0;
				for(; likely(i < tMQ); i += spec_unroll) {
					prefetch(A[jj + ii + fahead]);
					prefetch(Y[iii + fahead]);
					/*loop_mark*/ norm_loop;
					/*loop_mark*/ norm_loop;
				}
				for(; likely(i < M); i++) {
					norm_loop;
				}
			}
		}
	}
	return 0;
}
|
|
@ -0,0 +1,125 @@
|
|||
#include "common.h"
|
||||
|
||||
#define prefetch(x) __asm__ __volatile__("ld $0, %0"::"m"(x))
|
||||
#define likely(x) __builtin_expect(!!(x), 1)
|
||||
#define unlikely(x) __builtin_expect(!!(x), 0)
|
||||
|
||||
#if !defined(CONJ) && !defined(XCONJ)
|
||||
#define spec_loop_alpha1 spec_loop_alpha1_0
|
||||
#define spec_loop spec_loop_0
|
||||
#define norm_loop_alpha1 norm_loop_alpha1_0
|
||||
#define norm_loop norm_loop_0
|
||||
#endif
|
||||
|
||||
#if defined(CONJ) && !defined(XCONJ)
|
||||
#define spec_loop_alpha1 spec_loop_alpha1_1
|
||||
#define spec_loop spec_loop_1
|
||||
#define norm_loop_alpha1 norm_loop_alpha1_1
|
||||
#define norm_loop norm_loop_1
|
||||
#endif
|
||||
|
||||
#if !defined(CONJ) && defined(XCONJ)
|
||||
#define spec_loop_alpha1 spec_loop_alpha1_2
|
||||
#define spec_loop spec_loop_2
|
||||
#define norm_loop_alpha1 norm_loop_alpha1_2
|
||||
#define norm_loop norm_loop_2
|
||||
#endif
|
||||
|
||||
#if defined(CONJ) && defined(XCONJ)
|
||||
#define spec_loop_alpha1 spec_loop_alpha1_3
|
||||
#define spec_loop spec_loop_3
|
||||
#define norm_loop_alpha1 norm_loop_alpha1_3
|
||||
#define norm_loop norm_loop_3
|
||||
#endif
|
||||
|
||||
|
||||
#define spec_loop_alpha1_0 do {Y[k] += A[jj + ii] * X[ii]; Y[k + 1] += A[jj + ii + 1] * X[ii]; Y[k + 1] += A[jj + ii] * X[ii + 1]; Y[k] -= A[jj + ii + 1] * X[ii + 1]; ii += 2;} while(0)
|
||||
#define spec_loop_alpha1_1 do {Y[k] += A[jj + ii] * X[ii]; Y[k + 1] -= A[jj + ii + 1] * X[ii]; Y[k + 1] += A[jj + ii] * X[ii + 1]; Y[k] += A[jj + ii + 1] * X[ii + 1]; ii += 2;} while(0)
|
||||
#define spec_loop_alpha1_2 do {Y[k] += A[jj + ii] * X[ii]; Y[k + 1] += A[jj + ii + 1] * X[ii]; Y[k + 1] -= A[jj + ii] * X[ii + 1]; Y[k] += A[jj + ii + 1] * X[ii + 1]; ii += 2;} while(0)
|
||||
#define spec_loop_alpha1_3 do {Y[k] += A[jj + ii] * X[ii]; Y[k + 1] -= A[jj + ii + 1] * X[ii]; Y[k + 1] -= A[jj + ii] * X[ii + 1]; Y[k] -= A[jj + ii + 1] * X[ii + 1]; ii += 2;} while(0)
|
||||
|
||||
#define spec_loop_0 do {rTmp = A[jj + ii] * X[ii] - A[jj + ii + 1] * X[ii + 1]; iTmp = A[jj + ii] * X[ii + 1] + A[jj + ii + 1] * X[ii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0)
|
||||
#define spec_loop_1 do {rTmp = A[jj + ii] * X[ii] + A[jj + ii + 1] * X[ii + 1]; iTmp = A[jj + ii] * X[ii + 1] - A[jj + ii + 1] * X[ii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0)
|
||||
#define spec_loop_2 do {rTmp = A[jj + ii] * X[ii] + A[jj + ii + 1] * X[ii + 1]; iTmp = -A[jj + ii] * X[ii + 1] + A[jj + ii + 1] * X[ii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0)
|
||||
#define spec_loop_3 do {rTmp = A[jj + ii] * X[ii] - A[jj + ii + 1] * X[ii + 1]; iTmp = -A[jj + ii] * X[ii + 1] - A[jj + ii + 1] * X[ii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0)
|
||||
|
||||
#define norm_loop_alpha1_0 do {Y[k] += A[jj + ii] * X[iii] - A[jj + ii + 1] * X[iii + 1]; Y[k + 1] += A[jj + ii] * X[iii + 1] + A[jj + ii + 1] * X[iii]; ii += 2; iii += INCX * 2;} while(0)
|
||||
#define norm_loop_alpha1_1 do {Y[k] += A[jj + ii] * X[iii] + A[jj + ii + 1] * X[iii + 1]; Y[k + 1] += A[jj + ii] * X[iii + 1] - A[jj + ii + 1] * X[iii]; ii += 2; iii += INCX * 2;} while(0)
|
||||
#define norm_loop_alpha1_2 do {Y[k] += A[jj + ii] * X[iii] + A[jj + ii + 1] * X[iii + 1]; Y[k + 1] += -A[jj + ii] * X[iii + 1] + A[jj + ii + 1] * X[iii]; ii += 2; iii += INCX * 2;} while(0)
|
||||
#define norm_loop_alpha1_3 do {Y[k] += A[jj + ii] * X[iii] - A[jj + ii + 1] * X[iii + 1]; Y[k + 1] += -A[jj + ii] * X[iii + 1] - A[jj + ii + 1] * X[iii]; ii += 2; iii += INCX * 2;} while(0)
|
||||
|
||||
#define norm_loop_0 do {rTmp = A[jj + ii] * X[iii] - A[jj + ii + 1] * X[iii + 1]; iTmp = A[jj + ii] * X[iii + 1] + A[jj + ii + 1] * X[iii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCX * 2;} while(0)
|
||||
#define norm_loop_1 do {rTmp = A[jj + ii] * X[iii] + A[jj + ii + 1] * X[iii + 1]; iTmp = A[jj + ii] * X[iii + 1] - A[jj + ii + 1] * X[iii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCX * 2;} while(0)
|
||||
#define norm_loop_2 do {rTmp = A[jj + ii] * X[iii] + A[jj + ii + 1] * X[iii + 1]; iTmp = -A[jj + ii] * X[iii + 1] + A[jj + ii + 1] * X[iii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCX * 2;} while(0)
|
||||
#define norm_loop_3 do {rTmp = A[jj + ii] * X[iii] - A[jj + ii + 1] * X[iii + 1]; iTmp = -A[jj + ii] * X[iii + 1] - A[jj + ii + 1] * X[iii]; Y[k] += rTmp * rALPHA - iTmp * iALPHA; Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCX * 2;} while(0)
|
||||
|
||||
/*
 * Complex GEMV kernel, transposed form: Y += alpha * op(A)^T * op(X), with
 * alpha = rALPHA + i*iALPHA.
 *
 * A, X and Y hold complex values as interleaved (real, imaginary) pairs, which
 * is why every index and stride below is counted in FLOATs and multiplied by
 * two. Column j of A starts at A[jj] (jj advances by LDA * 2 per column); its
 * M entries are multiplied with the matching X entries and accumulated into the
 * single output element Y[k], Y[k + 1] (k advances by INCY * 2 per column).
 *
 * The conjugation variant is fixed at compile time: the spec_* / norm_* loop-body
 * macros are mapped by the CONJ / XCONJ selection above this function.
 * Run-time variants:
 *   - INCX == 1 uses the spec_* macros (X indexed by ii),
 *     otherwise the norm_* macros (X indexed by iii, stepping by INCX * 2);
 *   - alpha == 1 + 0i uses the *_alpha1 macros, which skip the scaling.
 * The main inner loop handles two complex rows per trip and prefetches A and X
 * a fixed distance ahead; a tail loop handles an odd last row.
 *
 * UNUSED and BUFFER are not referenced. Always returns 0.
 *
 * Note: the names ii, iii, jj, k, rTmp and iTmp must not be renamed; the
 * loop-body macros refer to them.
 */
int CNAME(BLASLONG M, BLASLONG N, BLASLONG UNUSED, FLOAT rALPHA, FLOAT iALPHA, FLOAT *A, BLASLONG LDA, FLOAT *X, BLASLONG INCX, FLOAT *Y, BLASLONG INCY, FLOAT *BUFFER) {

	/*
	 * Quick return only when alpha is exactly zero, i.e. BOTH parts are zero.
	 * The previous test (!rALPHA && iALPHA) returned early for a purely
	 * imaginary alpha, leaving Y without the required update.
	 */
	if(rALPHA == 0 && iALPHA == 0)
		return 0;

	BLASLONG fahead = 30;                 /* prefetch distance, in FLOATs */
	BLASLONG spec_unroll = 2;             /* complex rows per main-loop trip */
	BLASLONG tMQ = M - M % spec_unroll;   /* end of the unrolled part */
	BLASLONG j = 0, k = 0, jj = 0;

	if(rALPHA == 1 && iALPHA == 0) {
		/* alpha == 1: accumulate the products without scaling. */
		if(INCX == 1) {
			for(; likely(j < N); j++, k += INCY * 2, jj += LDA * 2) {
				BLASLONG i = 0, ii = 0;
				for(; likely(i < tMQ); i += spec_unroll) {
					prefetch(A[jj + ii + fahead]);
					prefetch(X[ii + fahead]);
					/*loop_mark*/ spec_loop_alpha1;
					/*loop_mark*/ spec_loop_alpha1;
				}
				for(; likely(i < M); i++) {
					spec_loop_alpha1;
				}
			}
		} else {
			for(; likely(j < N); j++, k += INCY * 2, jj += LDA * 2) {
				BLASLONG i = 0, ii = 0, iii = 0;
				for(; likely(i < tMQ); i += spec_unroll) {
					prefetch(A[jj + ii + fahead]);
					prefetch(X[iii + fahead]);
					/*loop_mark*/ norm_loop_alpha1;
					/*loop_mark*/ norm_loop_alpha1;
				}
				for(; likely(i < M); i++) {
					norm_loop_alpha1;
				}
			}
		}
	} else {
		/* General alpha: the macros form the product in rTmp/iTmp, then scale. */
		FLOAT rTmp, iTmp;
		if(INCX == 1) {
			for(; likely(j < N); j++, k += INCY * 2, jj += LDA * 2) {
				BLASLONG i = 0, ii = 0;
				for(; likely(i < tMQ); i += spec_unroll) {
					prefetch(A[jj + ii + fahead]);
					prefetch(X[ii + fahead]);
					/*loop_mark*/ spec_loop;
					/*loop_mark*/ spec_loop;
				}
				for(; likely(i < M); i++) {
					spec_loop;
				}
			}
		} else {
			for(; likely(j < N); j++, k += INCY * 2, jj += LDA * 2) {
				BLASLONG i = 0, ii = 0, iii = 0;
				for(; likely(i < tMQ); i += spec_unroll) {
					prefetch(A[jj + ii + fahead]);
					prefetch(X[iii + fahead]);
					/*loop_mark*/ norm_loop;
					/*loop_mark*/ norm_loop;
				}
				for(; likely(i < M); i++) {
					norm_loop;
				}
			}
		}
	}
	return 0;
}
|
38
param.h
38
param.h
|
@ -1480,31 +1480,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define GEMM_DEFAULT_OFFSET_B 0
|
||||
#define GEMM_DEFAULT_ALIGN 0x03fffUL
|
||||
|
||||
#define SGEMM_DEFAULT_UNROLL_M 4
|
||||
#define SGEMM_DEFAULT_UNROLL_M 8
|
||||
#define SGEMM_DEFAULT_UNROLL_N 4
|
||||
|
||||
#define DGEMM_DEFAULT_UNROLL_M 4
|
||||
#define DGEMM_DEFAULT_UNROLL_N 4
|
||||
|
||||
#define CGEMM_DEFAULT_UNROLL_M 1
|
||||
#define CGEMM_DEFAULT_UNROLL_N 4
|
||||
#define ZGEMM_DEFAULT_UNROLL_M 1
|
||||
#define ZGEMM_DEFAULT_UNROLL_N 4
|
||||
#define CGEMM_DEFAULT_UNROLL_M 4
|
||||
#define CGEMM_DEFAULT_UNROLL_N 2
|
||||
|
||||
#define SGEMM_DEFAULT_P 32
|
||||
#define DGEMM_DEFAULT_P 32
|
||||
#define CGEMM_DEFAULT_P 108
|
||||
#define ZGEMM_DEFAULT_P 112
|
||||
#define ZGEMM_DEFAULT_UNROLL_M 2
|
||||
#define ZGEMM_DEFAULT_UNROLL_N 2
|
||||
|
||||
#define SGEMM_DEFAULT_Q 116
|
||||
#define DGEMM_DEFAULT_Q 116
|
||||
#define CGEMM_DEFAULT_Q 144
|
||||
#define ZGEMM_DEFAULT_Q 72
|
||||
#define SGEMM_DEFAULT_P 64
|
||||
#define DGEMM_DEFAULT_P 44
|
||||
#define CGEMM_DEFAULT_P 64
|
||||
#define ZGEMM_DEFAULT_P 32
|
||||
|
||||
#define SGEMM_DEFAULT_R 1000
|
||||
#define DGEMM_DEFAULT_R 1000
|
||||
#define CGEMM_DEFAULT_R 2000
|
||||
#define ZGEMM_DEFAULT_R 2000
|
||||
#define SGEMM_DEFAULT_Q 192
|
||||
#define DGEMM_DEFAULT_Q 92
|
||||
#define CGEMM_DEFAULT_Q 128
|
||||
#define ZGEMM_DEFAULT_Q 80
|
||||
|
||||
#define SGEMM_DEFAULT_R 1024
|
||||
#define DGEMM_DEFAULT_R dgemm_r
|
||||
#define CGEMM_DEFAULT_R 1024
|
||||
#define ZGEMM_DEFAULT_R 1024
|
||||
|
||||
#define GEMM_OFFSET_A1 0x10000
|
||||
#define GEMM_OFFSET_B1 0x100000
|
||||
|
||||
#define SYMV_P 16
|
||||
#endif
|
||||
|
|
|
@ -1301,6 +1301,8 @@
|
|||
NC = 0
|
||||
RESET = .TRUE.
|
||||
ERRMAX = RZERO
|
||||
RALS = RONE
|
||||
RBETS = RONE
|
||||
*
|
||||
DO 100 IN = 1, NIDIM
|
||||
N = IDIM( IN )
|
||||
|
|
|
@ -1303,6 +1303,8 @@
|
|||
NC = 0
|
||||
RESET = .TRUE.
|
||||
ERRMAX = RZERO
|
||||
RALS = RONE
|
||||
RBETS = RONE
|
||||
*
|
||||
DO 100 IN = 1, NIDIM
|
||||
N = IDIM( IN )
|
||||
|
|
Loading…
Reference in New Issue