Merge branch 'loongson3a' into release-0.1.0
This commit is contained in:
commit
83ecfbb9b3
1
README
1
README
|
@ -72,6 +72,7 @@ Please see Changelog.txt to obtain the differences between GotoBLAS2 1.13 BSD ve
|
||||||
9.Known Issues
|
9.Known Issues
|
||||||
* The number of CPUs/Cores should be less than or equal to 8*sizeof(unsigned long). On 64 bits, the limit
|
* The number of CPUs/Cores should be less than or equal to 8*sizeof(unsigned long). On 64 bits, the limit
|
||||||
is 64. On 32 bits, it is 32.
|
is 64. On 32 bits, it is 32.
|
||||||
|
* On Loongson 3A, "make test" may fail because pthread_create returns an error (error code EAGAIN). However, the same test case passes when it is run directly from the shell, so this does not appear to be a bug in OpenBLAS.
|
||||||
|
|
||||||
10. Specification of Git Branches
|
10. Specification of Git Branches
|
||||||
We used the git branching model in this article (http://nvie.com/posts/a-successful-git-branching-model/).
|
We used the git branching model in this article (http://nvie.com/posts/a-successful-git-branching-model/).
|
||||||
|
|
|
@ -2127,7 +2127,9 @@
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifndef ASSEMBLER
|
#ifndef ASSEMBLER
|
||||||
#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64)
|
#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64)
|
||||||
|
extern BLASLONG gemm_offset_a;
|
||||||
|
extern BLASLONG gemm_offset_b;
|
||||||
extern BLASLONG sgemm_p;
|
extern BLASLONG sgemm_p;
|
||||||
extern BLASLONG sgemm_q;
|
extern BLASLONG sgemm_q;
|
||||||
extern BLASLONG sgemm_r;
|
extern BLASLONG sgemm_r;
|
||||||
|
|
|
@ -152,6 +152,7 @@ static inline int blas_quickdivide(blasint x, blasint y){
|
||||||
#define CMPEQ c.eq.d
|
#define CMPEQ c.eq.d
|
||||||
#define CMPLE c.le.d
|
#define CMPLE c.le.d
|
||||||
#define CMPLT c.lt.d
|
#define CMPLT c.lt.d
|
||||||
|
#define NEG neg.d
|
||||||
#else
|
#else
|
||||||
#define LD lwc1
|
#define LD lwc1
|
||||||
#define ST swc1
|
#define ST swc1
|
||||||
|
@ -170,6 +171,14 @@ static inline int blas_quickdivide(blasint x, blasint y){
|
||||||
#define CMPEQ c.eq.s
|
#define CMPEQ c.eq.s
|
||||||
#define CMPLE c.le.s
|
#define CMPLE c.le.s
|
||||||
#define CMPLT c.lt.s
|
#define CMPLT c.lt.s
|
||||||
|
#define PLU plu.ps
|
||||||
|
#define PLL pll.ps
|
||||||
|
#define PUU puu.ps
|
||||||
|
#define PUL pul.ps
|
||||||
|
#define MADPS madd.ps
|
||||||
|
#define CVTU cvt.s.pu
|
||||||
|
#define CVTL cvt.s.pl
|
||||||
|
#define NEG neg.s
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(__64BIT__) && defined(USE64BITINT)
|
#if defined(__64BIT__) && defined(USE64BITINT)
|
||||||
|
@ -218,7 +227,7 @@ REALNAME: ;\
|
||||||
|
|
||||||
#define SEEK_ADDRESS
|
#define SEEK_ADDRESS
|
||||||
|
|
||||||
#define BUFFER_SIZE ( 8 << 20)
|
#define BUFFER_SIZE ( 32 << 20)
|
||||||
|
|
||||||
#if defined(LOONGSON3A)
|
#if defined(LOONGSON3A)
|
||||||
#define PAGESIZE (16UL << 10)
|
#define PAGESIZE (16UL << 10)
|
||||||
|
|
|
@ -71,16 +71,25 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (
|
||||||
queue[num_cpu].args = arg;
|
queue[num_cpu].args = arg;
|
||||||
queue[num_cpu].range_m = range_m;
|
queue[num_cpu].range_m = range_m;
|
||||||
queue[num_cpu].range_n = &range[num_cpu];
|
queue[num_cpu].range_n = &range[num_cpu];
|
||||||
queue[num_cpu].sa = NULL;
|
#if defined(LOONGSON3A)
|
||||||
|
queue[num_cpu].sa = sa + GEMM_OFFSET_A1 * num_cpu;
|
||||||
|
queue[num_cpu].sb = queue[num_cpu].sa + GEMM_OFFSET_A1 * 5;
|
||||||
|
#else
|
||||||
|
queue[num_cpu].sa = NULL;
|
||||||
queue[num_cpu].sb = NULL;
|
queue[num_cpu].sb = NULL;
|
||||||
|
#endif
|
||||||
queue[num_cpu].next = &queue[num_cpu + 1];
|
queue[num_cpu].next = &queue[num_cpu + 1];
|
||||||
num_cpu ++;
|
num_cpu ++;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (num_cpu) {
|
if (num_cpu) {
|
||||||
|
#if defined(LOONGSON3A)
|
||||||
queue[0].sa = sa;
|
queue[0].sa = sa;
|
||||||
queue[0].sb = sb;
|
queue[0].sb = sa + GEMM_OFFSET_A1 * 5;
|
||||||
|
#else
|
||||||
|
queue[0].sa = sa;
|
||||||
|
queue[0].sb = sb;
|
||||||
|
#endif
|
||||||
queue[num_cpu - 1].next = NULL;
|
queue[num_cpu - 1].next = NULL;
|
||||||
|
|
||||||
exec_blas(num_cpu,
|
exec_blas(num_cpu,
|
||||||
|
|
|
@ -500,6 +500,7 @@ static int blas_monitor(void *arg){
|
||||||
/* Initializing routine */
|
/* Initializing routine */
|
||||||
int blas_thread_init(void){
|
int blas_thread_init(void){
|
||||||
BLASLONG i;
|
BLASLONG i;
|
||||||
|
int ret;
|
||||||
#ifdef NEED_STACKATTR
|
#ifdef NEED_STACKATTR
|
||||||
pthread_attr_t attr;
|
pthread_attr_t attr;
|
||||||
#endif
|
#endif
|
||||||
|
@ -545,12 +546,16 @@ int blas_thread_init(void){
|
||||||
pthread_cond_init (&thread_status[i].wakeup, NULL);
|
pthread_cond_init (&thread_status[i].wakeup, NULL);
|
||||||
|
|
||||||
#ifdef NEED_STACKATTR
|
#ifdef NEED_STACKATTR
|
||||||
pthread_create(&blas_threads[i], &attr,
|
ret=pthread_create(&blas_threads[i], &attr,
|
||||||
(void *)&blas_thread_server, (void *)i);
|
(void *)&blas_thread_server, (void *)i);
|
||||||
#else
|
#else
|
||||||
pthread_create(&blas_threads[i], NULL,
|
ret=pthread_create(&blas_threads[i], NULL,
|
||||||
(void *)&blas_thread_server, (void *)i);
|
(void *)&blas_thread_server, (void *)i);
|
||||||
#endif
|
#endif
|
||||||
|
if(ret!=0){
|
||||||
|
fprintf(STDERR,"OpenBLAS: pthread_creat error in blas_thread_init function. Error code:%d\n",ret);
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef MONITOR
|
#ifdef MONITOR
|
||||||
|
@ -797,6 +802,11 @@ void goto_set_num_threads(int num_threads) {
|
||||||
|
|
||||||
blas_cpu_number = num_threads;
|
blas_cpu_number = num_threads;
|
||||||
|
|
||||||
|
#if defined(ARCH_MIPS64)
|
||||||
|
//set parameters for different number of threads.
|
||||||
|
blas_set_parameter();
|
||||||
|
#endif
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void openblas_set_num_threads(int num_threads) {
|
void openblas_set_num_threads(int num_threads) {
|
||||||
|
|
|
@ -63,6 +63,11 @@ void goto_set_num_threads(int num_threads) {
|
||||||
|
|
||||||
omp_set_num_threads(blas_cpu_number);
|
omp_set_num_threads(blas_cpu_number);
|
||||||
|
|
||||||
|
#if defined(ARCH_MIPS64)
|
||||||
|
//set parameters for different number of threads.
|
||||||
|
blas_set_parameter();
|
||||||
|
#endif
|
||||||
|
|
||||||
}
|
}
|
||||||
void openblas_set_num_threads(int num_threads) {
|
void openblas_set_num_threads(int num_threads) {
|
||||||
|
|
||||||
|
|
|
@ -884,7 +884,7 @@ void *blas_memory_alloc(int procpos){
|
||||||
if (!blas_num_threads) blas_cpu_number = blas_get_cpu_number();
|
if (!blas_num_threads) blas_cpu_number = blas_get_cpu_number();
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64)
|
#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64)
|
||||||
#ifndef DYNAMIC_ARCH
|
#ifndef DYNAMIC_ARCH
|
||||||
blas_set_parameter();
|
blas_set_parameter();
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -45,8 +45,22 @@ int get_L2_size(void);
|
||||||
#define DEFAULT_GEMM_P 128
|
#define DEFAULT_GEMM_P 128
|
||||||
#define DEFAULT_GEMM_Q 128
|
#define DEFAULT_GEMM_Q 128
|
||||||
#define DEFAULT_GEMM_R 128
|
#define DEFAULT_GEMM_R 128
|
||||||
|
#define DEFAULT_GEMM_OFFSET_A 0
|
||||||
|
#define DEFAULT_GEMM_OFFSET_B 0
|
||||||
|
|
||||||
/* Global Parameter */
|
/* Global Parameter */
|
||||||
|
#if GEMM_OFFSET_A == gemm_offset_a
|
||||||
|
BLASLONG gemm_offset_a = DEFAULT_GEMM_OFFSET_A;
|
||||||
|
#else
|
||||||
|
BLASLONG gemm_offset_a = GEMM_OFFSET_A;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if GEMM_OFFSET_B == gemm_offset_b
|
||||||
|
BLASLONG gemm_offset_b = DEFAULT_GEMM_OFFSET_B;
|
||||||
|
#else
|
||||||
|
BLASLONG gemm_offset_b = GEMM_OFFSET_B;
|
||||||
|
#endif
|
||||||
|
|
||||||
#if SGEMM_P == sgemm_p
|
#if SGEMM_P == sgemm_p
|
||||||
BLASLONG sgemm_p = DEFAULT_GEMM_P;
|
BLASLONG sgemm_p = DEFAULT_GEMM_P;
|
||||||
#else
|
#else
|
||||||
|
@ -666,3 +680,21 @@ void blas_set_parameter(void){
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#if defined(ARCH_MIPS64)
|
||||||
|
void blas_set_parameter(void){
#if defined(LOONGSON3A)
  /*
   * Select the dgemm_r value for Loongson-3A.  Without SMP the
   * single-thread value 1024 is always used; with SMP it is used only
   * when blas_num_threads is exactly 1, and 200 is used otherwise.
   */
#ifdef SMP
  dgemm_r = (blas_num_threads == 1) ? 1024 : 200;
#else
  dgemm_r = 1024;
#endif
#endif
}
|
||||||
|
#endif
|
||||||
|
|
|
@ -136,6 +136,7 @@ void NAME(char *SIDE, char *UPLO,
|
||||||
FLOAT *sa, *sb;
|
FLOAT *sa, *sb;
|
||||||
|
|
||||||
#ifdef SMP
|
#ifdef SMP
|
||||||
|
#ifndef COMPLEX
|
||||||
#ifdef XDOUBLE
|
#ifdef XDOUBLE
|
||||||
int mode = BLAS_XDOUBLE | BLAS_REAL;
|
int mode = BLAS_XDOUBLE | BLAS_REAL;
|
||||||
#elif defined(DOUBLE)
|
#elif defined(DOUBLE)
|
||||||
|
@ -143,6 +144,15 @@ void NAME(char *SIDE, char *UPLO,
|
||||||
#else
|
#else
|
||||||
int mode = BLAS_SINGLE | BLAS_REAL;
|
int mode = BLAS_SINGLE | BLAS_REAL;
|
||||||
#endif
|
#endif
|
||||||
|
#else
|
||||||
|
#ifdef XDOUBLE
|
||||||
|
int mode = BLAS_XDOUBLE | BLAS_COMPLEX;
|
||||||
|
#elif defined(DOUBLE)
|
||||||
|
int mode = BLAS_DOUBLE | BLAS_COMPLEX;
|
||||||
|
#else
|
||||||
|
int mode = BLAS_SINGLE | BLAS_COMPLEX;
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(SMP) && !defined(NO_AFFINITY)
|
#if defined(SMP) && !defined(NO_AFFINITY)
|
||||||
|
@ -237,6 +247,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo,
|
||||||
FLOAT *sa, *sb;
|
FLOAT *sa, *sb;
|
||||||
|
|
||||||
#ifdef SMP
|
#ifdef SMP
|
||||||
|
#ifndef COMPLEX
|
||||||
#ifdef XDOUBLE
|
#ifdef XDOUBLE
|
||||||
int mode = BLAS_XDOUBLE | BLAS_REAL;
|
int mode = BLAS_XDOUBLE | BLAS_REAL;
|
||||||
#elif defined(DOUBLE)
|
#elif defined(DOUBLE)
|
||||||
|
@ -244,6 +255,15 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo,
|
||||||
#else
|
#else
|
||||||
int mode = BLAS_SINGLE | BLAS_REAL;
|
int mode = BLAS_SINGLE | BLAS_REAL;
|
||||||
#endif
|
#endif
|
||||||
|
#else
|
||||||
|
#ifdef XDOUBLE
|
||||||
|
int mode = BLAS_XDOUBLE | BLAS_COMPLEX;
|
||||||
|
#elif defined(DOUBLE)
|
||||||
|
int mode = BLAS_DOUBLE | BLAS_COMPLEX;
|
||||||
|
#else
|
||||||
|
int mode = BLAS_SINGLE | BLAS_COMPLEX;
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(SMP) && !defined(NO_AFFINITY)
|
#if defined(SMP) && !defined(NO_AFFINITY)
|
||||||
|
|
|
@ -123,15 +123,37 @@ ifndef DTRSMKERNEL_RT
|
||||||
DTRSMKERNEL_RT = trsm_kernel_RT.S
|
DTRSMKERNEL_RT = trsm_kernel_RT.S
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
ifndef CTRSMKERNEL_LN
|
||||||
CTRSMKERNEL_LN = ztrsm_kernel_LT.S
|
CTRSMKERNEL_LN = ztrsm_kernel_LT.S
|
||||||
CTRSMKERNEL_LT = ztrsm_kernel_LT.S
|
endif
|
||||||
CTRSMKERNEL_RN = ztrsm_kernel_LT.S
|
|
||||||
CTRSMKERNEL_RT = ztrsm_kernel_RT.S
|
|
||||||
|
|
||||||
|
ifndef CTRSMKERNEL_LT
|
||||||
|
CTRSMKERNEL_LT = ztrsm_kernel_LT.S
|
||||||
|
endif
|
||||||
|
|
||||||
|
ifndef CTRSMKERNEL_RN
|
||||||
|
CTRSMKERNEL_RN = ztrsm_kernel_LT.S
|
||||||
|
endif
|
||||||
|
|
||||||
|
ifndef CTRSMKERNEL_RT
|
||||||
|
CTRSMKERNEL_RT = ztrsm_kernel_RT.S
|
||||||
|
endif
|
||||||
|
|
||||||
|
ifndef ZTRSMKERNEL_LN
|
||||||
ZTRSMKERNEL_LN = ztrsm_kernel_LT.S
|
ZTRSMKERNEL_LN = ztrsm_kernel_LT.S
|
||||||
|
endif
|
||||||
|
|
||||||
|
ifndef ZTRSMKERNEL_LT
|
||||||
ZTRSMKERNEL_LT = ztrsm_kernel_LT.S
|
ZTRSMKERNEL_LT = ztrsm_kernel_LT.S
|
||||||
|
endif
|
||||||
|
|
||||||
|
ifndef ZTRSMKERNEL_RN
|
||||||
ZTRSMKERNEL_RN = ztrsm_kernel_LT.S
|
ZTRSMKERNEL_RN = ztrsm_kernel_LT.S
|
||||||
|
endif
|
||||||
|
|
||||||
|
ifndef ZTRSMKERNEL_RT
|
||||||
ZTRSMKERNEL_RT = ztrsm_kernel_RT.S
|
ZTRSMKERNEL_RT = ztrsm_kernel_RT.S
|
||||||
|
endif
|
||||||
|
|
||||||
CGEMM3MKERNEL = zgemm3m_kernel.S
|
CGEMM3MKERNEL = zgemm3m_kernel.S
|
||||||
ZGEMM3MKERNEL = zgemm3m_kernel.S
|
ZGEMM3MKERNEL = zgemm3m_kernel.S
|
||||||
|
|
|
@ -1,18 +1,48 @@
|
||||||
SAXPYKERNEL=axpy_loongson3a.S
|
SAXPYKERNEL=axpy_loongson3a.S
|
||||||
DAXPYKERNEL=daxpy_loongson3a_simd.S
|
DAXPYKERNEL=daxpy_loongson3a_simd.S
|
||||||
|
|
||||||
SGEMMKERNEL = sgemm_kernel_loongson3a.S
|
SGEMVNKERNEL = gemv_n_loongson3a.c
|
||||||
SGEMMONCOPY = ../generic/gemm_ncopy_4.c
|
SGEMVTKERNEL = gemv_t_loongson3a.c
|
||||||
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
|
DGEMVNKERNEL = gemv_n_loongson3a.c
|
||||||
|
DGEMVTKERNEL = gemv_t_loongson3a.c
|
||||||
|
CGEMVNKERNEL = zgemv_n_loongson3a.c
|
||||||
|
CGEMVTKERNEL = zgemv_t_loongson3a.c
|
||||||
|
ZGEMVNKERNEL = zgemv_n_loongson3a.c
|
||||||
|
ZGEMVTKERNEL = zgemv_t_loongson3a.c
|
||||||
|
|
||||||
|
|
||||||
|
SGEMMKERNEL = sgemm_kernel_8x4_ps.S
|
||||||
|
SGEMMINCOPY = ../generic/gemm_ncopy_8.c
|
||||||
|
SGEMMITCOPY = ../generic/gemm_tcopy_8.c
|
||||||
|
SGEMMONCOPY = ../generic/gemm_ncopy_4.c
|
||||||
|
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
|
||||||
|
SGEMMINCOPYOBJ = sgemm_incopy.o
|
||||||
|
SGEMMITCOPYOBJ = sgemm_itcopy.o
|
||||||
SGEMMONCOPYOBJ = sgemm_oncopy.o
|
SGEMMONCOPYOBJ = sgemm_oncopy.o
|
||||||
SGEMMOTCOPYOBJ = sgemm_otcopy.o
|
SGEMMOTCOPYOBJ = sgemm_otcopy.o
|
||||||
|
|
||||||
DGEMMKERNEL = gemm_kernel_loongson3a.S
|
DGEMMKERNEL = dgemm_kernel_loongson3a_4x4.S
|
||||||
DGEMMONCOPY = ../generic/gemm_ncopy_4.c
|
DGEMMONCOPY = ../generic/gemm_ncopy_4.c
|
||||||
DGEMMOTCOPY = ../generic/gemm_tcopy_4.c
|
DGEMMOTCOPY = ../generic/gemm_tcopy_4.c
|
||||||
DGEMMONCOPYOBJ = dgemm_oncopy.o
|
DGEMMONCOPYOBJ = dgemm_oncopy.o
|
||||||
DGEMMOTCOPYOBJ = dgemm_otcopy.o
|
DGEMMOTCOPYOBJ = dgemm_otcopy.o
|
||||||
|
|
||||||
|
CGEMMKERNEL = cgemm_kernel_loongson3a_4x2_ps.S
|
||||||
|
CGEMMINCOPY = ../generic/zgemm_ncopy_4.c
|
||||||
|
CGEMMITCOPY = ../generic/zgemm_tcopy_4.c
|
||||||
|
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||||
|
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
||||||
|
CGEMMINCOPYOBJ = cgemm_incopy.o
|
||||||
|
CGEMMITCOPYOBJ = cgemm_itcopy.o
|
||||||
|
CGEMMONCOPYOBJ = cgemm_oncopy.o
|
||||||
|
CGEMMOTCOPYOBJ = cgemm_otcopy.o
|
||||||
|
|
||||||
|
ZGEMMKERNEL = zgemm_kernel_loongson3a_2x2.S
|
||||||
|
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||||
|
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
||||||
|
ZGEMMONCOPYOBJ = zgemm_oncopy.o
|
||||||
|
ZGEMMOTCOPYOBJ = zgemm_otcopy.o
|
||||||
|
|
||||||
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||||
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||||
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||||
|
@ -22,3 +52,17 @@ DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||||
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||||
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||||
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||||
|
|
||||||
|
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||||
|
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||||
|
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||||
|
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||||
|
|
||||||
|
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||||
|
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||||
|
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||||
|
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,101 @@
|
||||||
|
#include "common.h"
|
||||||
|
|
||||||
|
//These are auto-tuning codes on Loongson-3A platform.
|
||||||
|
|
||||||
|
//#define prefetch(x) __builtin_prefetch(x)
|
||||||
|
//#define prefetch(x) do {_mm_prefetch((char *)(x), _MM_HINT_T0);} while(0)
|
||||||
|
#define prefetch(x) __asm__ __volatile__("ld $0, %0"::"m"(x))
|
||||||
|
#define likely(x) __builtin_expect(!!(x), 1)
|
||||||
|
#define unlikely(x) __builtin_expect(!!(x), 0)
|
||||||
|
|
||||||
|
#define spec_loop_alpha1 do {Y[i] += A[LDA * j + i] * X[k]; i++;} while(0)
|
||||||
|
#define spec_loop do {Y[i] += ALPHA * A[LDA * j + i] * X[k]; i++;} while(0)
|
||||||
|
#define norm_loop_alpha1 do {Y[h] += A[LDA * j + i] * X[k]; i++; h += INCY;} while(0)
|
||||||
|
#define norm_loop do {Y[h] += ALPHA * A[LDA * j + i] * X[k]; i++; h += INCY;} while(0)
|
||||||
|
|
||||||
|
int CNAME(BLASLONG M, BLASLONG N, BLASLONG UNUSED, FLOAT ALPHA, FLOAT *A, BLASLONG LDA, FLOAT *X, BLASLONG INCX, FLOAT *Y, BLASLONG INCY, FLOAT *BUFFER)
{
	/*
	 * Real GEMV kernel, non-transposed form.  For every column j in [0, N)
	 * and every row i in [0, M) it performs
	 *     Y[i * INCY] += ALPHA * A[LDA * j + i] * X[j * INCX]
	 * The arithmetic itself lives in the spec_loop / norm_loop macros, which
	 * read and advance the locals i, j, k and h by name, so those names must
	 * not change.  UNUSED and BUFFER are not referenced.  Always returns 0.
	 */
	BLASLONG j, k;

	/* A zero ALPHA adds nothing to Y. */
	if (ALPHA == 0) {
		return 0;
	}

	/* How many elements ahead of the current row the prefetches point. */
	const BLASLONG lookahead = 30;
	/* Rows handled per pass of the unrolled loops below (four macro uses). */
	const BLASLONG unroll = 4;
	/* Largest multiple of the unroll factor that does not exceed M. */
	const BLASLONG blocked_rows = M - M % unroll;

	if (ALPHA == 1) {
		if (INCY == 1) {
			/* Unit ALPHA, contiguous Y. */
			for (j = 0, k = 0; likely(j < N); j++, k += INCX) {
				BLASLONG i = 0;

				while (likely(i < blocked_rows)) {
					prefetch(A[LDA * j + i + lookahead]);
					prefetch(Y[i + lookahead]);
					spec_loop_alpha1;
					spec_loop_alpha1;
					spec_loop_alpha1;
					spec_loop_alpha1;
				}
				/* Rows left over after the unrolled part. */
				while (likely(i < M)) {
					spec_loop_alpha1;
				}
			}
			return 0;
		}

		/* Unit ALPHA, strided Y: h walks Y in steps of INCY. */
		for (j = 0, k = 0; likely(j < N); j++, k += INCX) {
			BLASLONG i = 0, h = 0;

			while (likely(i < blocked_rows)) {
				prefetch(A[LDA * j + i + lookahead]);
				prefetch(Y[h + lookahead]);
				norm_loop_alpha1;
				norm_loop_alpha1;
				norm_loop_alpha1;
				norm_loop_alpha1;
			}
			while (likely(i < M)) {
				norm_loop_alpha1;
			}
		}
		return 0;
	}

	if (INCY == 1) {
		/* General ALPHA, contiguous Y. */
		for (j = 0, k = 0; likely(j < N); j++, k += INCX) {
			BLASLONG i = 0;

			while (likely(i < blocked_rows)) {
				prefetch(A[LDA * j + i + lookahead]);
				prefetch(Y[i + lookahead]);
				spec_loop;
				spec_loop;
				spec_loop;
				spec_loop;
			}
			while (likely(i < M)) {
				spec_loop;
			}
		}
		return 0;
	}

	/* General ALPHA, strided Y. */
	for (j = 0, k = 0; likely(j < N); j++, k += INCX) {
		BLASLONG i = 0, h = 0;

		while (likely(i < blocked_rows)) {
			prefetch(A[LDA * j + i + lookahead]);
			prefetch(Y[h + lookahead]);
			norm_loop;
			norm_loop;
			norm_loop;
			norm_loop;
		}
		while (likely(i < M)) {
			norm_loop;
		}
	}
	return 0;
}
|
|
@ -0,0 +1,93 @@
|
||||||
|
#include "common.h"
|
||||||
|
|
||||||
|
//These are auto-tuning codes on Loongson-3A platform.
|
||||||
|
|
||||||
|
//#define prefetch(x) __builtin_prefetch(x)
|
||||||
|
//#define prefetch(x) do {_mm_prefetch((char *)(x), _MM_HINT_T0);} while(0)
|
||||||
|
#define prefetch(x) __asm__ __volatile__("ld $0, %0"::"m"(x))
|
||||||
|
#define likely(x) __builtin_expect(!!(x), 1)
|
||||||
|
#define unlikely(x) __builtin_expect(!!(x), 0)
|
||||||
|
|
||||||
|
#define spec_loop_alpha1 do {Y[k] += A[LDA * j + i] * X[i]; i++;} while(0)
|
||||||
|
#define spec_loop do {Y[k] += ALPHA * A[LDA * j + i] * X[i]; i++;} while(0)
|
||||||
|
#define norm_loop_alpha1 do {Y[k] += A[LDA * j + i] * X[h]; i++; h += INCX;} while(0)
|
||||||
|
#define norm_loop do {Y[k] += ALPHA * A[LDA * j + i] * X[h]; i++; h += INCX;} while(0)
|
||||||
|
|
||||||
|
int CNAME(BLASLONG M, BLASLONG N, BLASLONG UNUSED, FLOAT ALPHA, FLOAT *A, BLASLONG LDA, FLOAT *X, BLASLONG INCX, FLOAT *Y, BLASLONG INCY, FLOAT *BUFFER) {
	/*
	 * Real GEMV kernel, transposed form.  For every column j in [0, N) it
	 * accumulates, over the rows i in [0, M),
	 *     Y[j * INCY] += ALPHA * A[LDA * j + i] * X[i * INCX]
	 * The arithmetic itself lives in the spec_loop / norm_loop macros, which
	 * read and advance the locals i, j, k and h by name, so those names must
	 * not change.  UNUSED and BUFFER are not referenced.  Always returns 0.
	 */
	BLASLONG j, k;

	/* A zero ALPHA adds nothing to Y. */
	if (ALPHA == 0) {
		return 0;
	}

	/* How many elements ahead of the current row the prefetches point. */
	const BLASLONG lookahead = 30;
	/* Rows handled per pass of the unrolled loops below (three macro uses). */
	const BLASLONG unroll = 3;
	/* Largest multiple of the unroll factor that does not exceed M. */
	const BLASLONG blocked_rows = M - M % unroll;

	if (ALPHA == 1) {
		if (INCX == 1) {
			/* Unit ALPHA, contiguous X. */
			for (j = 0, k = 0; likely(j < N); j++, k += INCY) {
				BLASLONG i = 0;

				while (likely(i < blocked_rows)) {
					prefetch(A[LDA * j + i + lookahead]);
					prefetch(X[i + lookahead]);
					spec_loop_alpha1;
					spec_loop_alpha1;
					spec_loop_alpha1;
				}
				/* Rows left over after the unrolled part. */
				while (likely(i < M)) {
					spec_loop_alpha1;
				}
			}
			return 0;
		}

		/* Unit ALPHA, strided X: h walks X in steps of INCX. */
		for (j = 0, k = 0; likely(j < N); j++, k += INCY) {
			BLASLONG i = 0, h = 0;

			while (likely(i < blocked_rows)) {
				prefetch(A[LDA * j + i + lookahead]);
				prefetch(X[h + lookahead]);
				norm_loop_alpha1;
				norm_loop_alpha1;
				norm_loop_alpha1;
			}
			while (likely(i < M)) {
				norm_loop_alpha1;
			}
		}
		return 0;
	}

	if (INCX == 1) {
		/* General ALPHA, contiguous X. */
		for (j = 0, k = 0; likely(j < N); j++, k += INCY) {
			BLASLONG i = 0;

			while (likely(i < blocked_rows)) {
				prefetch(A[LDA * j + i + lookahead]);
				prefetch(X[i + lookahead]);
				spec_loop;
				spec_loop;
				spec_loop;
			}
			while (likely(i < M)) {
				spec_loop;
			}
		}
		return 0;
	}

	/* General ALPHA, strided X. */
	for (j = 0, k = 0; likely(j < N); j++, k += INCY) {
		BLASLONG i = 0, h = 0;

		while (likely(i < blocked_rows)) {
			prefetch(A[LDA * j + i + lookahead]);
			prefetch(X[h + lookahead]);
			norm_loop;
			norm_loop;
			norm_loop;
		}
		while (likely(i < M)) {
			norm_loop;
		}
	}
	return 0;
}
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,139 @@
|
||||||
|
#include "common.h"
|
||||||
|
|
||||||
|
//typedef int BLASLONG;
|
||||||
|
//typedef double FLOAT;
|
||||||
|
|
||||||
|
#define prefetch(x) __asm__ __volatile__("ld $0, %0"::"m"(x))
|
||||||
|
#define likely(x) __builtin_expect(!!(x), 1)
|
||||||
|
#define unlikely(x) __builtin_expect(!!(x), 0)
|
||||||
|
|
||||||
|
#if !defined(CONJ) && !defined(XCONJ)
|
||||||
|
#define spec_loop_alpha1 spec_loop_alpha1_0
|
||||||
|
#define spec_loop spec_loop_0
|
||||||
|
#define norm_loop_alpha1 norm_loop_alpha1_0
|
||||||
|
#define norm_loop norm_loop_0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(CONJ) && !defined(XCONJ)
|
||||||
|
#define spec_loop_alpha1 spec_loop_alpha1_1
|
||||||
|
#define spec_loop spec_loop_1
|
||||||
|
#define norm_loop_alpha1 norm_loop_alpha1_1
|
||||||
|
#define norm_loop norm_loop_1
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if !defined(CONJ) && defined(XCONJ)
|
||||||
|
#define spec_loop_alpha1 spec_loop_alpha1_2
|
||||||
|
#define spec_loop spec_loop_2
|
||||||
|
#define norm_loop_alpha1 norm_loop_alpha1_2
|
||||||
|
#define norm_loop norm_loop_2
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(CONJ) && defined(XCONJ)
|
||||||
|
#define spec_loop_alpha1 spec_loop_alpha1_3
|
||||||
|
#define spec_loop spec_loop_3
|
||||||
|
#define norm_loop_alpha1 norm_loop_alpha1_3
|
||||||
|
#define norm_loop norm_loop_3
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#define spec_loop_alpha1_0 do {Y[ii] += A[jj + ii] * X[k]; Y[ii + 1] += A[jj + ii + 1] * X[k]; Y[ii + 1] += A[jj + ii] * X[k + 1]; Y[ii] -= A[jj + ii + 1] * X[k + 1]; ii += 2;} while(0)
|
||||||
|
|
||||||
|
#define spec_loop_alpha1_1 do {Y[ii] += A[jj + ii] * X[k]; Y[ii + 1] -= A[jj + ii + 1] * X[k]; Y[ii + 1] += A[jj + ii] * X[k + 1]; Y[ii] += A[jj + ii + 1] * X[k + 1]; ii += 2;} while(0)
|
||||||
|
|
||||||
|
#define spec_loop_alpha1_2 do {Y[ii] += A[jj + ii] * X[k]; Y[ii + 1] += A[jj + ii + 1] * X[k]; Y[ii + 1] -= A[jj + ii] * X[k + 1]; Y[ii] += A[jj + ii + 1] * X[k + 1]; ii += 2;} while(0)
|
||||||
|
|
||||||
|
#define spec_loop_alpha1_3 do {Y[ii] += A[jj + ii] * X[k]; Y[ii + 1] -= A[jj + ii + 1] * X[k]; Y[ii + 1] -= A[jj + ii] * X[k + 1]; Y[ii] -= A[jj + ii + 1] * X[k + 1]; ii += 2;} while(0)
|
||||||
|
|
||||||
|
#define spec_loop_0 do {rTmp = A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; iTmp = A[jj + ii] * X[k + 1] + A[jj + ii + 1] * X[k]; Y[ii] += rTmp * rALPHA - iTmp * iALPHA; Y[ii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0)
|
||||||
|
|
||||||
|
#define spec_loop_1 do {rTmp = A[jj + ii] * X[k] + A[jj + ii + 1] * X[k + 1]; iTmp = A[jj + ii] * X[k + 1] - A[jj + ii + 1] * X[k]; Y[ii] += rTmp * rALPHA - iTmp * iALPHA; Y[ii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0)
|
||||||
|
|
||||||
|
#define spec_loop_2 do {rTmp = A[jj + ii] * X[k] + A[jj + ii + 1] * X[k + 1]; iTmp = -A[jj + ii] * X[k + 1] + A[jj + ii + 1] * X[k]; Y[ii] += rTmp * rALPHA - iTmp * iALPHA; Y[ii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0)
|
||||||
|
|
||||||
|
#define spec_loop_3 do {rTmp = A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; iTmp = -A[jj + ii] * X[k + 1] - A[jj + ii + 1] * X[k]; Y[ii] += rTmp * rALPHA - iTmp * iALPHA; Y[ii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2;} while(0)
|
||||||
|
|
||||||
|
#define norm_loop_alpha1_0 do {Y[iii] += A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; Y[iii + 1] += A[jj + ii] * X[k + 1] + A[jj + ii + 1] * X[k]; ii += 2; iii += INCY * 2;} while(0)
|
||||||
|
|
||||||
|
#define norm_loop_alpha1_1 do {Y[iii] += A[jj + ii] * X[k] + A[jj + ii + 1] * X[k + 1]; Y[iii + 1] += A[jj + ii] * X[k + 1] - A[jj + ii + 1] * X[k]; ii += 2; iii += INCY * 2;} while(0)
|
||||||
|
|
||||||
|
#define norm_loop_alpha1_2 do {Y[iii] += A[jj + ii] * X[k] + A[jj + ii + 1] * X[k + 1]; Y[iii + 1] += -A[jj + ii] * X[k + 1] + A[jj + ii + 1] * X[k]; ii += 2; iii += INCY * 2;} while(0)
|
||||||
|
|
||||||
|
#define norm_loop_alpha1_3 do {Y[iii] += A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; Y[iii + 1] += -A[jj + ii] * X[k + 1] - A[jj + ii + 1] * X[k]; ii += 2; iii += INCY * 2;} while(0)
|
||||||
|
|
||||||
|
#define norm_loop_0 do {rTmp = A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; iTmp = A[jj + ii] * X[k + 1] + A[jj + ii + 1] * X[k]; Y[iii] += rTmp * rALPHA - iTmp * iALPHA; Y[iii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCY * 2;} while(0)
|
||||||
|
|
||||||
|
#define norm_loop_1 do {rTmp = A[jj + ii] * X[k] + A[jj + ii + 1] * X[k + 1]; iTmp = A[jj + ii] * X[k + 1] - A[jj + ii + 1] * X[k]; Y[iii] += rTmp * rALPHA - iTmp * iALPHA; Y[iii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCY * 2;} while(0)
|
||||||
|
|
||||||
|
#define norm_loop_2 do {rTmp = A[jj + ii] * X[k] + A[jj + ii + 1] * X[k + 1]; iTmp = -A[jj + ii] * X[k + 1] + A[jj + ii + 1] * X[k]; Y[iii] += rTmp * rALPHA - iTmp * iALPHA; Y[iii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCY * 2;} while(0)
|
||||||
|
|
||||||
|
#define norm_loop_3 do {rTmp = A[jj + ii] * X[k] - A[jj + ii + 1] * X[k + 1]; iTmp = -A[jj + ii] * X[k + 1] - A[jj + ii + 1] * X[k]; Y[iii] += rTmp * rALPHA - iTmp * iALPHA; Y[iii + 1] += rTmp * iALPHA + iTmp * rALPHA; ii += 2; iii += INCY * 2;} while(0)
|
||||||
|
|
||||||
|
/*
 * Complex GEMV kernel, non-transposed form: adds alpha * A * X into Y.
 *
 * Complex values are stored as interleaved (real, imaginary) FLOAT pairs.
 * Column j of A starts at A[2 * LDA * j], element j of X is at
 * X[2 * INCX * j] and element i of Y is at Y[2 * INCY * i].  The per-element
 * arithmetic, including whichever conjugation the CONJ / XCONJ build flags
 * select, lives in the spec_loop* / norm_loop* macros.  Those macros read and
 * advance the locals ii, iii, jj, k, rTmp and iTmp by name, so these names
 * must not change.
 *
 * M, N            number of rows and columns of A
 * rALPHA, iALPHA  real and imaginary parts of alpha
 * UNUSED, BUFFER  not referenced by this kernel
 *
 * Always returns 0.
 */
int CNAME(BLASLONG M, BLASLONG N, BLASLONG UNUSED, FLOAT rALPHA, FLOAT iALPHA, FLOAT *A, BLASLONG LDA, FLOAT *X, BLASLONG INCX, FLOAT *Y, BLASLONG INCY, FLOAT *BUFFER) {

	/*
	 * Nothing to add when alpha is zero.  Both parts have to be zero for
	 * that: a purely imaginary alpha (rALPHA == 0, iALPHA != 0) is a valid
	 * non-zero scale factor and must go through the general path below.
	 */
	if(rALPHA == 0 && iALPHA == 0)
		return 0;

	/* Prefetch distance, counted in FLOAT elements. */
	BLASLONG fahead = 60;
	/* Complex rows handled per pass of the unrolled loops (two macro uses). */
	BLASLONG spec_unroll = 2;
	/* Largest multiple of spec_unroll that does not exceed M. */
	BLASLONG tMQ = M - M % spec_unroll;
	/* j: column index; k: FLOAT offset into X; jj: FLOAT offset of column j in A. */
	BLASLONG j = 0, k = 0, jj = 0;

	if(rALPHA == 1 && iALPHA == 0) {
		/* alpha == 1: the macros skip the multiplication by alpha. */
		if(INCY == 1) {
			for(; likely(j < N); j++, k += INCX * 2, jj += LDA * 2) {
				/* i counts complex rows; ii is the FLOAT offset within A's column and Y. */
				BLASLONG i = 0, ii = 0;
				for(; likely(i < tMQ); i += spec_unroll) {
					prefetch(A[jj + ii + fahead]);
					prefetch(Y[ii + fahead]);
					/*loop_mark*/ spec_loop_alpha1;
					/*loop_mark*/ spec_loop_alpha1;
				}
				/* Row left over when M is odd. */
				for(; likely(i < M); i++) {
					spec_loop_alpha1;
				}
			}
		} else {
			for(; likely(j < N); j++, k += INCX * 2, jj += LDA * 2) {
				/* iii is the FLOAT offset into the strided Y. */
				BLASLONG i = 0, ii = 0, iii = 0;
				for(; likely(i < tMQ); i += spec_unroll) {
					prefetch(A[jj + ii + fahead]);
					prefetch(Y[iii + fahead]);
					/*loop_mark*/ norm_loop_alpha1;
					/*loop_mark*/ norm_loop_alpha1;
				}
				for(; likely(i < M); i++) {
					norm_loop_alpha1;
				}
			}
		}
	} else {
		/* General alpha: the macros form the product in rTmp/iTmp, then scale it. */
		FLOAT rTmp, iTmp;
		if(INCY == 1) {
			for(; likely(j < N); j++, k += INCX * 2, jj += LDA * 2) {
				BLASLONG i = 0, ii = 0;
				for(; likely(i < tMQ); i += spec_unroll) {
					prefetch(A[jj + ii + fahead]);
					prefetch(Y[ii + fahead]);
					/*loop_mark*/ spec_loop;
					/*loop_mark*/ spec_loop;
				}
				for(; likely(i < M); i++) {
					spec_loop;
				}
			}
		} else {
			for(; likely(j < N); j++, k += INCX * 2, jj += LDA * 2) {
				BLASLONG i = 0, ii = 0, iii = 0;
				for(; likely(i < tMQ); i += spec_unroll) {
					prefetch(A[jj + ii + fahead]);
					prefetch(Y[iii + fahead]);
					/*loop_mark*/ norm_loop;
					/*loop_mark*/ norm_loop;
				}
				for(; likely(i < M); i++) {
					norm_loop;
				}
			}
		}
	}
	return 0;
}
|
|
@ -0,0 +1,125 @@
|
||||||
|
#include "common.h"
|
||||||
|
|
||||||
|
#define prefetch(x) __asm__ __volatile__("ld $0, %0"::"m"(x))
|
||||||
|
#define likely(x) __builtin_expect(!!(x), 1)
|
||||||
|
#define unlikely(x) __builtin_expect(!!(x), 0)
|
||||||
|
|
||||||
|
#if !defined(CONJ) && !defined(XCONJ)
|
||||||
|
#define spec_loop_alpha1 spec_loop_alpha1_0
|
||||||
|
#define spec_loop spec_loop_0
|
||||||
|
#define norm_loop_alpha1 norm_loop_alpha1_0
|
||||||
|
#define norm_loop norm_loop_0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(CONJ) && !defined(XCONJ)
|
||||||
|
#define spec_loop_alpha1 spec_loop_alpha1_1
|
||||||
|
#define spec_loop spec_loop_1
|
||||||
|
#define norm_loop_alpha1 norm_loop_alpha1_1
|
||||||
|
#define norm_loop norm_loop_1
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if !defined(CONJ) && defined(XCONJ)
|
||||||
|
#define spec_loop_alpha1 spec_loop_alpha1_2
|
||||||
|
#define spec_loop spec_loop_2
|
||||||
|
#define norm_loop_alpha1 norm_loop_alpha1_2
|
||||||
|
#define norm_loop norm_loop_2
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(CONJ) && defined(XCONJ)
|
||||||
|
#define spec_loop_alpha1 spec_loop_alpha1_3
|
||||||
|
#define spec_loop spec_loop_3
|
||||||
|
#define norm_loop_alpha1 norm_loop_alpha1_3
|
||||||
|
#define norm_loop norm_loop_3
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
/*
 * Loop-body macros.  Each invocation consumes ONE complex element: it reads
 * the (real, imag) pair A[jj + ii], A[jj + ii + 1] and one (real, imag) pair
 * of X, accumulates into Y[k] (real) and Y[k + 1] (imag), then advances the
 * running indices.  They rely on these names being in scope at the call site:
 * A, X, Y, jj, ii, k, and where used iii, INCX, rALPHA, iALPHA, rTmp, iTmp.
 *
 *   spec_*   : X is indexed with ii (same index as A's row offset); ii += 2.
 *   norm_*   : X is indexed with a separate iii; ii += 2, iii += INCX * 2.
 *   *_alpha1 : the product is added to Y directly, with no alpha scaling.
 *   others   : the product (rTmp, iTmp) is multiplied by the complex scalar
 *              (rALPHA, iALPHA) before being added to Y.
 *
 * Suffix _0.._3 only changes the signs of the cross terms:
 *   _0: a * x,  _1: conj(a) * x,  _2: a * conj(x),  _3: conj(a) * conj(x).
 */

/* ---- X indexed by ii, no alpha scaling ---- */
#define spec_loop_alpha1_0 do { \
    Y[k] += A[jj + ii] * X[ii]; \
    Y[k + 1] += A[jj + ii + 1] * X[ii]; \
    Y[k + 1] += A[jj + ii] * X[ii + 1]; \
    Y[k] -= A[jj + ii + 1] * X[ii + 1]; \
    ii += 2; \
} while(0)

#define spec_loop_alpha1_1 do { \
    Y[k] += A[jj + ii] * X[ii]; \
    Y[k + 1] -= A[jj + ii + 1] * X[ii]; \
    Y[k + 1] += A[jj + ii] * X[ii + 1]; \
    Y[k] += A[jj + ii + 1] * X[ii + 1]; \
    ii += 2; \
} while(0)

#define spec_loop_alpha1_2 do { \
    Y[k] += A[jj + ii] * X[ii]; \
    Y[k + 1] += A[jj + ii + 1] * X[ii]; \
    Y[k + 1] -= A[jj + ii] * X[ii + 1]; \
    Y[k] += A[jj + ii + 1] * X[ii + 1]; \
    ii += 2; \
} while(0)

#define spec_loop_alpha1_3 do { \
    Y[k] += A[jj + ii] * X[ii]; \
    Y[k + 1] -= A[jj + ii + 1] * X[ii]; \
    Y[k + 1] -= A[jj + ii] * X[ii + 1]; \
    Y[k] -= A[jj + ii + 1] * X[ii + 1]; \
    ii += 2; \
} while(0)

/* ---- X indexed by ii, scaled by (rALPHA, iALPHA) ---- */
#define spec_loop_0 do { \
    rTmp = A[jj + ii] * X[ii] - A[jj + ii + 1] * X[ii + 1]; \
    iTmp = A[jj + ii] * X[ii + 1] + A[jj + ii + 1] * X[ii]; \
    Y[k] += rTmp * rALPHA - iTmp * iALPHA; \
    Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; \
    ii += 2; \
} while(0)

#define spec_loop_1 do { \
    rTmp = A[jj + ii] * X[ii] + A[jj + ii + 1] * X[ii + 1]; \
    iTmp = A[jj + ii] * X[ii + 1] - A[jj + ii + 1] * X[ii]; \
    Y[k] += rTmp * rALPHA - iTmp * iALPHA; \
    Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; \
    ii += 2; \
} while(0)

#define spec_loop_2 do { \
    rTmp = A[jj + ii] * X[ii] + A[jj + ii + 1] * X[ii + 1]; \
    iTmp = -A[jj + ii] * X[ii + 1] + A[jj + ii + 1] * X[ii]; \
    Y[k] += rTmp * rALPHA - iTmp * iALPHA; \
    Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; \
    ii += 2; \
} while(0)

#define spec_loop_3 do { \
    rTmp = A[jj + ii] * X[ii] - A[jj + ii + 1] * X[ii + 1]; \
    iTmp = -A[jj + ii] * X[ii + 1] - A[jj + ii + 1] * X[ii]; \
    Y[k] += rTmp * rALPHA - iTmp * iALPHA; \
    Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; \
    ii += 2; \
} while(0)

/* ---- X indexed by iii (stride INCX), no alpha scaling ---- */
#define norm_loop_alpha1_0 do { \
    Y[k] += A[jj + ii] * X[iii] - A[jj + ii + 1] * X[iii + 1]; \
    Y[k + 1] += A[jj + ii] * X[iii + 1] + A[jj + ii + 1] * X[iii]; \
    ii += 2; \
    iii += INCX * 2; \
} while(0)

#define norm_loop_alpha1_1 do { \
    Y[k] += A[jj + ii] * X[iii] + A[jj + ii + 1] * X[iii + 1]; \
    Y[k + 1] += A[jj + ii] * X[iii + 1] - A[jj + ii + 1] * X[iii]; \
    ii += 2; \
    iii += INCX * 2; \
} while(0)

#define norm_loop_alpha1_2 do { \
    Y[k] += A[jj + ii] * X[iii] + A[jj + ii + 1] * X[iii + 1]; \
    Y[k + 1] += -A[jj + ii] * X[iii + 1] + A[jj + ii + 1] * X[iii]; \
    ii += 2; \
    iii += INCX * 2; \
} while(0)

#define norm_loop_alpha1_3 do { \
    Y[k] += A[jj + ii] * X[iii] - A[jj + ii + 1] * X[iii + 1]; \
    Y[k + 1] += -A[jj + ii] * X[iii + 1] - A[jj + ii + 1] * X[iii]; \
    ii += 2; \
    iii += INCX * 2; \
} while(0)

/* ---- X indexed by iii (stride INCX), scaled by (rALPHA, iALPHA) ---- */
#define norm_loop_0 do { \
    rTmp = A[jj + ii] * X[iii] - A[jj + ii + 1] * X[iii + 1]; \
    iTmp = A[jj + ii] * X[iii + 1] + A[jj + ii + 1] * X[iii]; \
    Y[k] += rTmp * rALPHA - iTmp * iALPHA; \
    Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; \
    ii += 2; \
    iii += INCX * 2; \
} while(0)

#define norm_loop_1 do { \
    rTmp = A[jj + ii] * X[iii] + A[jj + ii + 1] * X[iii + 1]; \
    iTmp = A[jj + ii] * X[iii + 1] - A[jj + ii + 1] * X[iii]; \
    Y[k] += rTmp * rALPHA - iTmp * iALPHA; \
    Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; \
    ii += 2; \
    iii += INCX * 2; \
} while(0)

#define norm_loop_2 do { \
    rTmp = A[jj + ii] * X[iii] + A[jj + ii + 1] * X[iii + 1]; \
    iTmp = -A[jj + ii] * X[iii + 1] + A[jj + ii + 1] * X[iii]; \
    Y[k] += rTmp * rALPHA - iTmp * iALPHA; \
    Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; \
    ii += 2; \
    iii += INCX * 2; \
} while(0)

#define norm_loop_3 do { \
    rTmp = A[jj + ii] * X[iii] - A[jj + ii + 1] * X[iii + 1]; \
    iTmp = -A[jj + ii] * X[iii + 1] - A[jj + ii + 1] * X[iii]; \
    Y[k] += rTmp * rALPHA - iTmp * iALPHA; \
    Y[k + 1] += rTmp * iALPHA + iTmp * rALPHA; \
    ii += 2; \
    iii += INCX * 2; \
} while(0)
|
||||||
|
|
||||||
|
/*
 * Complex GEMV (transposed) kernel.
 *
 * For each of the N columns j, walks M complex elements of A starting at
 * A[2 * LDA * j] together with M complex elements of X, and accumulates
 * alpha * (sum of per-element products) into the complex entry
 * Y[2 * INCY * j], Y[2 * INCY * j + 1].  Whether A and/or X are conjugated
 * in the product is decided by the spec_loop / norm_loop macros, which are
 * bound at compile time via CONJ / XCONJ.
 *
 * Parameters:
 *   M, N            number of complex elements per column / number of columns
 *   UNUSED          not referenced
 *   rALPHA, iALPHA  real and imaginary part of the scalar alpha
 *   A, LDA          matrix data (interleaved re/im) and column stride in
 *                   complex elements
 *   X, INCX         input vector (interleaved re/im) and its stride in
 *                   complex elements
 *   Y, INCY         output vector, updated in place, and its stride in
 *                   complex elements
 *   BUFFER          not referenced
 *
 * Always returns 0.
 *
 * Four code paths are kept separate so the inner loops stay branch-free:
 * alpha == 1 vs. general alpha, and INCX == 1 vs. strided X.
 */
int CNAME(BLASLONG M, BLASLONG N, BLASLONG UNUSED, FLOAT rALPHA, FLOAT iALPHA, FLOAT *A, BLASLONG LDA, FLOAT *X, BLASLONG INCX, FLOAT *Y, BLASLONG INCY, FLOAT *BUFFER) {

    /*
     * alpha == 0 adds nothing to Y, so there is no work to do.
     * The previous guard was `!rALPHA && iALPHA`, which returned early for a
     * purely imaginary, non-zero alpha and silently skipped the update.
     */
    if(rALPHA == 0 && iALPHA == 0)
        return 0;

    BLASLONG fahead = 30;               /* prefetch distance, in FLOAT elements */
    BLASLONG spec_unroll = 2;           /* complex elements per unrolled iteration */
    BLASLONG tMQ = M - M % spec_unroll; /* largest multiple of spec_unroll <= M */
    BLASLONG j = 0, k = 0, jj = 0;      /* column, Y offset, A column offset */

    /*
     * NOTE(review): the prefetch operands below are formed from indices up to
     * fahead elements past the current position and are not bounds-checked.
     */
    if(rALPHA == 1 && iALPHA == 0) {
        /* alpha == 1: accumulate the products directly, no scaling. */
        if(INCX == 1) {
            for(; likely(j < N); j++, k += INCY * 2, jj += LDA * 2) {
                BLASLONG i = 0, ii = 0;

                /* Main part: two complex elements per iteration. */
                for(; likely(i < tMQ); i += spec_unroll) {
                    prefetch(A[jj + ii + fahead]);
                    prefetch(X[ii + fahead]);
                    spec_loop_alpha1;
                    spec_loop_alpha1;
                }

                /* Remainder when M is odd. */
                for(; likely(i < M); i++) {
                    spec_loop_alpha1;
                }
            }
        } else {
            for(; likely(j < N); j++, k += INCY * 2, jj += LDA * 2) {
                BLASLONG i = 0, ii = 0, iii = 0;

                for(; likely(i < tMQ); i += spec_unroll) {
                    prefetch(A[jj + ii + fahead]);
                    prefetch(X[iii + fahead]);
                    norm_loop_alpha1;
                    norm_loop_alpha1;
                }

                for(; likely(i < M); i++) {
                    norm_loop_alpha1;
                }
            }
        }
    } else {
        /* General alpha: each product is scaled by (rALPHA, iALPHA). */
        FLOAT rTmp, iTmp;

        if(INCX == 1) {
            for(; likely(j < N); j++, k += INCY * 2, jj += LDA * 2) {
                BLASLONG i = 0, ii = 0;

                for(; likely(i < tMQ); i += spec_unroll) {
                    prefetch(A[jj + ii + fahead]);
                    prefetch(X[ii + fahead]);
                    spec_loop;
                    spec_loop;
                }

                for(; likely(i < M); i++) {
                    spec_loop;
                }
            }
        } else {
            for(; likely(j < N); j++, k += INCY * 2, jj += LDA * 2) {
                BLASLONG i = 0, ii = 0, iii = 0;

                for(; likely(i < tMQ); i += spec_unroll) {
                    prefetch(A[jj + ii + fahead]);
                    prefetch(X[iii + fahead]);
                    norm_loop;
                    norm_loop;
                }

                for(; likely(i < M); i++) {
                    norm_loop;
                }
            }
        }
    }

    return 0;
}
|
38
param.h
38
param.h
|
@ -1480,31 +1480,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#define GEMM_DEFAULT_OFFSET_B 0
|
#define GEMM_DEFAULT_OFFSET_B 0
|
||||||
#define GEMM_DEFAULT_ALIGN 0x03fffUL
|
#define GEMM_DEFAULT_ALIGN 0x03fffUL
|
||||||
|
|
||||||
#define SGEMM_DEFAULT_UNROLL_M 4
|
#define SGEMM_DEFAULT_UNROLL_M 8
|
||||||
#define SGEMM_DEFAULT_UNROLL_N 4
|
#define SGEMM_DEFAULT_UNROLL_N 4
|
||||||
|
|
||||||
#define DGEMM_DEFAULT_UNROLL_M 4
|
#define DGEMM_DEFAULT_UNROLL_M 4
|
||||||
#define DGEMM_DEFAULT_UNROLL_N 4
|
#define DGEMM_DEFAULT_UNROLL_N 4
|
||||||
|
|
||||||
#define CGEMM_DEFAULT_UNROLL_M 1
|
#define CGEMM_DEFAULT_UNROLL_M 4
|
||||||
#define CGEMM_DEFAULT_UNROLL_N 4
|
#define CGEMM_DEFAULT_UNROLL_N 2
|
||||||
#define ZGEMM_DEFAULT_UNROLL_M 1
|
|
||||||
#define ZGEMM_DEFAULT_UNROLL_N 4
|
|
||||||
|
|
||||||
#define SGEMM_DEFAULT_P 32
|
#define ZGEMM_DEFAULT_UNROLL_M 2
|
||||||
#define DGEMM_DEFAULT_P 32
|
#define ZGEMM_DEFAULT_UNROLL_N 2
|
||||||
#define CGEMM_DEFAULT_P 108
|
|
||||||
#define ZGEMM_DEFAULT_P 112
|
|
||||||
|
|
||||||
#define SGEMM_DEFAULT_Q 116
|
#define SGEMM_DEFAULT_P 64
|
||||||
#define DGEMM_DEFAULT_Q 116
|
#define DGEMM_DEFAULT_P 44
|
||||||
#define CGEMM_DEFAULT_Q 144
|
#define CGEMM_DEFAULT_P 64
|
||||||
#define ZGEMM_DEFAULT_Q 72
|
#define ZGEMM_DEFAULT_P 32
|
||||||
|
|
||||||
#define SGEMM_DEFAULT_R 1000
|
#define SGEMM_DEFAULT_Q 192
|
||||||
#define DGEMM_DEFAULT_R 1000
|
#define DGEMM_DEFAULT_Q 92
|
||||||
#define CGEMM_DEFAULT_R 2000
|
#define CGEMM_DEFAULT_Q 128
|
||||||
#define ZGEMM_DEFAULT_R 2000
|
#define ZGEMM_DEFAULT_Q 80
|
||||||
|
|
||||||
|
#define SGEMM_DEFAULT_R 1024
|
||||||
|
#define DGEMM_DEFAULT_R dgemm_r
|
||||||
|
#define CGEMM_DEFAULT_R 1024
|
||||||
|
#define ZGEMM_DEFAULT_R 1024
|
||||||
|
|
||||||
|
#define GEMM_OFFSET_A1 0x10000
|
||||||
|
#define GEMM_OFFSET_B1 0x100000
|
||||||
|
|
||||||
#define SYMV_P 16
|
#define SYMV_P 16
|
||||||
#endif
|
#endif
|
||||||
|
|
|
@ -1301,6 +1301,8 @@
|
||||||
NC = 0
|
NC = 0
|
||||||
RESET = .TRUE.
|
RESET = .TRUE.
|
||||||
ERRMAX = RZERO
|
ERRMAX = RZERO
|
||||||
|
RALS = RONE
|
||||||
|
RBETS = RONE
|
||||||
*
|
*
|
||||||
DO 100 IN = 1, NIDIM
|
DO 100 IN = 1, NIDIM
|
||||||
N = IDIM( IN )
|
N = IDIM( IN )
|
||||||
|
|
|
@ -1303,6 +1303,8 @@
|
||||||
NC = 0
|
NC = 0
|
||||||
RESET = .TRUE.
|
RESET = .TRUE.
|
||||||
ERRMAX = RZERO
|
ERRMAX = RZERO
|
||||||
|
RALS = RONE
|
||||||
|
RBETS = RONE
|
||||||
*
|
*
|
||||||
DO 100 IN = 1, NIDIM
|
DO 100 IN = 1, NIDIM
|
||||||
N = IDIM( IN )
|
N = IDIM( IN )
|
||||||
|
|
Loading…
Reference in New Issue