Compare commits

...

10 Commits

Author SHA1 Message Date
Zhang Xianyi 77460ac255 Fix gemm_batch bug for SMALL_MATRIX_OPT=1. 2020-12-14 09:50:47 +08:00
Zhang Xianyi 88e6806e3f Init cblas_?gemm_batch implementation. 2020-12-14 09:50:32 +08:00
Xianyi Zhang 4130d1732e Refs #2587 fix small matrix c/zgemm bug. 2020-08-28 22:36:36 +08:00
Xianyi Zhang 255b6dd0fa Merge branch 'develop' into small_matrices 2020-08-28 21:38:58 +08:00
Xianyi Zhang 741d6c5cb8 Refs #2587 Add small matrix optimization reference kernel for c/zgemm. 2020-08-28 21:00:54 +08:00
Xianyi Zhang 712ca43069 Change a1b0 gemm to b0 gemm. 2020-08-28 07:55:27 +08:00
Xianyi Zhang 9d3a317abc Refs #2587 Fix typos. 2020-04-29 00:19:19 +08:00
Xianyi Zhang 92372c70fc Fix gemm interface bug for small matrix. 2020-04-28 23:15:20 +08:00
Xianyi Zhang 43bef4aaac Add alpha=1.0 beta=0.0 for small gemm. 2020-04-28 22:35:36 +08:00
Xianyi Zhang aae6af94bb Add small matrix optimization kernel interface.
make SMALL_MATRIX_OPT=1
2020-04-28 19:02:41 +08:00
31 changed files with 2410 additions and 11 deletions

View File

@ -224,6 +224,11 @@ else
ONLY_CBLAS = 0 ONLY_CBLAS = 0
endif endif
# Small matrix optimization: building with SMALL_MATRIX_OPT=1 appends
# -DSMALL_MATRIX_OPT to the common compiler options (CCOMMON_OPT).
ifeq ($(SMALL_MATRIX_OPT), 1)
CCOMMON_OPT += -DSMALL_MATRIX_OPT
endif
# This operation is expensive, so execution should be once. # This operation is expensive, so execution should be once.
ifndef GOTOBLAS_MAKEFILE ifndef GOTOBLAS_MAKEFILE
export GOTOBLAS_MAKEFILE = 1 export GOTOBLAS_MAKEFILE = 1

11
cblas.h
View File

@ -382,6 +382,17 @@ void cblas_cgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint
void cblas_zgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double *calpha, double *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST double *cbeta, void cblas_zgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double *calpha, double *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST double *cbeta,
double *c, OPENBLAS_CONST blasint cldc); double *c, OPENBLAS_CONST blasint cldc);
/*
 * Batched GEMM entry points, one per precision (s, d, c, z).
 *
 * Transposes, dimensions, scalars, matrix pointers and leading dimensions
 * are all passed through pointer arguments named *_array; group_count and
 * group_size describe how the batch is partitioned.
 * The real variants take typed float/double pointers; the complex variants
 * take void pointers for alpha, beta and the matrix pointer arrays.
 *
 * NOTE(review): presumably each *_array holds one entry per group and
 * group_size[i] gives the number of matrices in group i - confirm against
 * the implementation.
 */
void cblas_sgemm_batch(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE * TransA_array, OPENBLAS_CONST enum CBLAS_TRANSPOSE * TransB_array, OPENBLAS_CONST blasint * M_array, OPENBLAS_CONST blasint * N_array, OPENBLAS_CONST blasint * K_array,
OPENBLAS_CONST float * alpha_array, OPENBLAS_CONST float ** A_array, OPENBLAS_CONST blasint * lda_array, OPENBLAS_CONST float ** B_array, OPENBLAS_CONST blasint * ldb_array, OPENBLAS_CONST float * beta_array, float ** C_array, OPENBLAS_CONST blasint * ldc_array, OPENBLAS_CONST blasint group_count, OPENBLAS_CONST blasint * group_size);
void cblas_dgemm_batch(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE * TransA_array, OPENBLAS_CONST enum CBLAS_TRANSPOSE * TransB_array, OPENBLAS_CONST blasint * M_array, OPENBLAS_CONST blasint * N_array, OPENBLAS_CONST blasint * K_array,
OPENBLAS_CONST double * alpha_array, OPENBLAS_CONST double ** A_array, OPENBLAS_CONST blasint * lda_array, OPENBLAS_CONST double ** B_array, OPENBLAS_CONST blasint * ldb_array, OPENBLAS_CONST double * beta_array, double ** C_array, OPENBLAS_CONST blasint * ldc_array, OPENBLAS_CONST blasint group_count, OPENBLAS_CONST blasint * group_size);
void cblas_cgemm_batch(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE * TransA_array, OPENBLAS_CONST enum CBLAS_TRANSPOSE * TransB_array, OPENBLAS_CONST blasint * M_array, OPENBLAS_CONST blasint * N_array, OPENBLAS_CONST blasint * K_array,
OPENBLAS_CONST void * alpha_array, OPENBLAS_CONST void ** A_array, OPENBLAS_CONST blasint * lda_array, OPENBLAS_CONST void ** B_array, OPENBLAS_CONST blasint * ldb_array, OPENBLAS_CONST void * beta_array, void ** C_array, OPENBLAS_CONST blasint * ldc_array, OPENBLAS_CONST blasint group_count, OPENBLAS_CONST blasint * group_size);
void cblas_zgemm_batch(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE * TransA_array, OPENBLAS_CONST enum CBLAS_TRANSPOSE * TransB_array, OPENBLAS_CONST blasint * M_array, OPENBLAS_CONST blasint * N_array, OPENBLAS_CONST blasint * K_array,
OPENBLAS_CONST void * alpha_array, OPENBLAS_CONST void ** A_array, OPENBLAS_CONST blasint * lda_array, OPENBLAS_CONST void ** B_array, OPENBLAS_CONST blasint * ldb_array, OPENBLAS_CONST void * beta_array, void ** C_array, OPENBLAS_CONST blasint * ldc_array, OPENBLAS_CONST blasint group_count, OPENBLAS_CONST blasint * group_size);
#ifdef __cplusplus #ifdef __cplusplus
} }

View File

@ -232,6 +232,46 @@
#define CGEADD_K cgeadd_k #define CGEADD_K cgeadd_k
/*
 * Small-matrix GEMM kernel names for the C (cgemm) prefix: each upper-case
 * macro expands directly to the lower-case function symbol of the same name.
 * The first 16 entries are the general kernels, the last 16 the _B0 variants.
 * NOTE(review): the two-letter suffix presumably encodes the A and B operand
 * modes (N, T, R, C) - confirm against the kernel implementations.
 */
#define CGEMM_SMALL_KERNEL_NN cgemm_small_kernel_nn
#define CGEMM_SMALL_KERNEL_NT cgemm_small_kernel_nt
#define CGEMM_SMALL_KERNEL_NR cgemm_small_kernel_nr
#define CGEMM_SMALL_KERNEL_NC cgemm_small_kernel_nc
#define CGEMM_SMALL_KERNEL_TN cgemm_small_kernel_tn
#define CGEMM_SMALL_KERNEL_TT cgemm_small_kernel_tt
#define CGEMM_SMALL_KERNEL_TR cgemm_small_kernel_tr
#define CGEMM_SMALL_KERNEL_TC cgemm_small_kernel_tc
#define CGEMM_SMALL_KERNEL_RN cgemm_small_kernel_rn
#define CGEMM_SMALL_KERNEL_RT cgemm_small_kernel_rt
#define CGEMM_SMALL_KERNEL_RR cgemm_small_kernel_rr
#define CGEMM_SMALL_KERNEL_RC cgemm_small_kernel_rc
#define CGEMM_SMALL_KERNEL_CN cgemm_small_kernel_cn
#define CGEMM_SMALL_KERNEL_CT cgemm_small_kernel_ct
#define CGEMM_SMALL_KERNEL_CR cgemm_small_kernel_cr
#define CGEMM_SMALL_KERNEL_CC cgemm_small_kernel_cc
#define CGEMM_SMALL_KERNEL_B0_NN cgemm_small_kernel_b0_nn
#define CGEMM_SMALL_KERNEL_B0_NT cgemm_small_kernel_b0_nt
#define CGEMM_SMALL_KERNEL_B0_NR cgemm_small_kernel_b0_nr
#define CGEMM_SMALL_KERNEL_B0_NC cgemm_small_kernel_b0_nc
#define CGEMM_SMALL_KERNEL_B0_TN cgemm_small_kernel_b0_tn
#define CGEMM_SMALL_KERNEL_B0_TT cgemm_small_kernel_b0_tt
#define CGEMM_SMALL_KERNEL_B0_TR cgemm_small_kernel_b0_tr
#define CGEMM_SMALL_KERNEL_B0_TC cgemm_small_kernel_b0_tc
#define CGEMM_SMALL_KERNEL_B0_RN cgemm_small_kernel_b0_rn
#define CGEMM_SMALL_KERNEL_B0_RT cgemm_small_kernel_b0_rt
#define CGEMM_SMALL_KERNEL_B0_RR cgemm_small_kernel_b0_rr
#define CGEMM_SMALL_KERNEL_B0_RC cgemm_small_kernel_b0_rc
#define CGEMM_SMALL_KERNEL_B0_CN cgemm_small_kernel_b0_cn
#define CGEMM_SMALL_KERNEL_B0_CT cgemm_small_kernel_b0_ct
#define CGEMM_SMALL_KERNEL_B0_CR cgemm_small_kernel_b0_cr
#define CGEMM_SMALL_KERNEL_B0_CC cgemm_small_kernel_b0_cc
#else #else
#define CAMAX_K gotoblas -> camax_k #define CAMAX_K gotoblas -> camax_k

View File

@ -157,6 +157,17 @@
#define DIMATCOPY_K_RT dimatcopy_k_rt #define DIMATCOPY_K_RT dimatcopy_k_rt
#define DGEADD_K dgeadd_k #define DGEADD_K dgeadd_k
/*
 * Small-matrix GEMM kernel names for the D (dgemm) prefix: each upper-case
 * macro expands directly to the lower-case function symbol of the same name.
 * Four general kernels (NN, NT, TN, TT) followed by their _B0 variants.
 */
#define DGEMM_SMALL_KERNEL_NN dgemm_small_kernel_nn
#define DGEMM_SMALL_KERNEL_NT dgemm_small_kernel_nt
#define DGEMM_SMALL_KERNEL_TN dgemm_small_kernel_tn
#define DGEMM_SMALL_KERNEL_TT dgemm_small_kernel_tt
#define DGEMM_SMALL_KERNEL_B0_NN dgemm_small_kernel_b0_nn
#define DGEMM_SMALL_KERNEL_B0_NT dgemm_small_kernel_b0_nt
#define DGEMM_SMALL_KERNEL_B0_TN dgemm_small_kernel_b0_tn
#define DGEMM_SMALL_KERNEL_B0_TT dgemm_small_kernel_b0_tt
#else #else
#define DAMAX_K gotoblas -> damax_k #define DAMAX_K gotoblas -> damax_k

View File

@ -515,6 +515,109 @@ int qgemm_kernel(BLASLONG, BLASLONG, BLASLONG, xidouble *, xidouble *, xidouble
int qgemm_kernel(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG); int qgemm_kernel(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG);
#endif #endif
/*
 * Prototypes of the small-matrix GEMM kernels; only declared when
 * SMALL_MATRIX_OPT is defined.
 *
 * All kernels take the dimensions (m, n, k), the A/B/C pointers and their
 * leading dimensions (lda, ldb, ldc).
 *  - Real kernels (s/d) take alpha and beta as single scalars.
 *  - Complex kernels (c/z) take alpha and beta as two scalars each
 *    (alpha0/alpha1, beta0/beta1).
 *  - The _b0_ variants have the same signature minus the beta argument(s).
 *    NOTE(review): presumably these implement the beta == 0 case - confirm
 *    against the kernel sources.
 */
#ifdef SMALL_MATRIX_OPT
/* Real single/double precision kernels, with beta. */
int sgemm_small_kernel_nn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc);
int sgemm_small_kernel_nt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc);
int sgemm_small_kernel_tn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc);
int sgemm_small_kernel_tt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc);
int dgemm_small_kernel_nn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc);
int dgemm_small_kernel_nt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc);
int dgemm_small_kernel_tn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc);
int dgemm_small_kernel_tt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc);
/* Real single/double precision kernels, _b0_ variants (no beta argument). */
int sgemm_small_kernel_b0_nn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
int sgemm_small_kernel_b0_nt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
int sgemm_small_kernel_b0_tn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
int sgemm_small_kernel_b0_tt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
int dgemm_small_kernel_b0_nn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
int dgemm_small_kernel_b0_nt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
int dgemm_small_kernel_b0_tn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
int dgemm_small_kernel_b0_tt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
/* Complex single precision kernels, with beta (16 suffix combinations). */
int cgemm_small_kernel_nn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
int cgemm_small_kernel_nt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
int cgemm_small_kernel_nr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
int cgemm_small_kernel_nc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
int cgemm_small_kernel_tn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
int cgemm_small_kernel_tt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
int cgemm_small_kernel_tr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
int cgemm_small_kernel_tc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
int cgemm_small_kernel_rn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
int cgemm_small_kernel_rt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
int cgemm_small_kernel_rr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
int cgemm_small_kernel_rc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
int cgemm_small_kernel_cn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
int cgemm_small_kernel_ct(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
int cgemm_small_kernel_cr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
int cgemm_small_kernel_cc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc);
/* Complex double precision kernels, with beta (16 suffix combinations). */
int zgemm_small_kernel_nn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
int zgemm_small_kernel_nt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
int zgemm_small_kernel_nr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
int zgemm_small_kernel_nc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
int zgemm_small_kernel_tn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
int zgemm_small_kernel_tt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
int zgemm_small_kernel_tr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
int zgemm_small_kernel_tc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
int zgemm_small_kernel_rn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
int zgemm_small_kernel_rt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
int zgemm_small_kernel_rr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
int zgemm_small_kernel_rc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
int zgemm_small_kernel_cn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
int zgemm_small_kernel_ct(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
int zgemm_small_kernel_cr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
int zgemm_small_kernel_cc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc);
/* Complex single precision kernels, _b0_ variants (no beta arguments). */
int cgemm_small_kernel_b0_nn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
int cgemm_small_kernel_b0_nt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
int cgemm_small_kernel_b0_nr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
int cgemm_small_kernel_b0_nc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
int cgemm_small_kernel_b0_tn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
int cgemm_small_kernel_b0_tt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
int cgemm_small_kernel_b0_tr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
int cgemm_small_kernel_b0_tc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
int cgemm_small_kernel_b0_rn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
int cgemm_small_kernel_b0_rt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
int cgemm_small_kernel_b0_rr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
int cgemm_small_kernel_b0_rc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
int cgemm_small_kernel_b0_cn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
int cgemm_small_kernel_b0_ct(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
int cgemm_small_kernel_b0_cr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
int cgemm_small_kernel_b0_cc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc);
/* Complex double precision kernels, _b0_ variants (no beta arguments). */
int zgemm_small_kernel_b0_nn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
int zgemm_small_kernel_b0_nt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
int zgemm_small_kernel_b0_nr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
int zgemm_small_kernel_b0_nc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
int zgemm_small_kernel_b0_tn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
int zgemm_small_kernel_b0_tt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
int zgemm_small_kernel_b0_tr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
int zgemm_small_kernel_b0_tc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
int zgemm_small_kernel_b0_rn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
int zgemm_small_kernel_b0_rt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
int zgemm_small_kernel_b0_rr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
int zgemm_small_kernel_b0_rc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
int zgemm_small_kernel_b0_cn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
int zgemm_small_kernel_b0_ct(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
int zgemm_small_kernel_b0_cr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
int zgemm_small_kernel_b0_cc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc);
#endif
int cgemm_kernel_n(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); int cgemm_kernel_n(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG);
int cgemm_kernel_l(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); int cgemm_kernel_l(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG);
int cgemm_kernel_r(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); int cgemm_kernel_r(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG);
@ -1816,6 +1919,10 @@ int dgeadd_k(BLASLONG, BLASLONG, double, double*, BLASLONG, double, double *, BL
int cgeadd_k(BLASLONG, BLASLONG, float, float, float*, BLASLONG, float, float, float *, BLASLONG); int cgeadd_k(BLASLONG, BLASLONG, float, float, float*, BLASLONG, float, float, float *, BLASLONG);
int zgeadd_k(BLASLONG, BLASLONG, double,double, double*, BLASLONG, double, double, double *, BLASLONG); int zgeadd_k(BLASLONG, BLASLONG, double,double, double*, BLASLONG, double, double, double *, BLASLONG);
/*
 * Batch GEMM thread drivers, one per precision (s, d, c, z).
 * NOTE(review): presumably `queue` points to `nums` blas_arg_t work items,
 * one per matrix product - confirm against the implementation.
 */
int sgemm_batch_thread(blas_arg_t * queue, BLASLONG nums);
int dgemm_batch_thread(blas_arg_t * queue, BLASLONG nums);
int cgemm_batch_thread(blas_arg_t * queue, BLASLONG nums);
int zgemm_batch_thread(blas_arg_t * queue, BLASLONG nums);
#ifdef __CUDACC__ #ifdef __CUDACC__
} }

View File

@ -644,6 +644,16 @@
#define GEADD_K DGEADD_K #define GEADD_K DGEADD_K
/* In this branch the precision-neutral GEMM_SMALL_KERNEL_* names resolve to
 * the D-prefixed (dgemm) small-kernel macros. */
#define GEMM_SMALL_KERNEL_NN DGEMM_SMALL_KERNEL_NN
#define GEMM_SMALL_KERNEL_NT DGEMM_SMALL_KERNEL_NT
#define GEMM_SMALL_KERNEL_TN DGEMM_SMALL_KERNEL_TN
#define GEMM_SMALL_KERNEL_TT DGEMM_SMALL_KERNEL_TT
#define GEMM_SMALL_KERNEL_B0_NN DGEMM_SMALL_KERNEL_B0_NN
#define GEMM_SMALL_KERNEL_B0_NT DGEMM_SMALL_KERNEL_B0_NT
#define GEMM_SMALL_KERNEL_B0_TN DGEMM_SMALL_KERNEL_B0_TN
#define GEMM_SMALL_KERNEL_B0_TT DGEMM_SMALL_KERNEL_B0_TT
#elif defined(HALF) #elif defined(HALF)
#define AMAX_K SAMAX_K #define AMAX_K SAMAX_K
@ -923,6 +933,16 @@
#define GEADD_K SGEADD_K #define GEADD_K SGEADD_K
/* In this branch the precision-neutral GEMM_SMALL_KERNEL_* names resolve to
 * the S-prefixed (sgemm) small-kernel macros. */
#define GEMM_SMALL_KERNEL_NN SGEMM_SMALL_KERNEL_NN
#define GEMM_SMALL_KERNEL_NT SGEMM_SMALL_KERNEL_NT
#define GEMM_SMALL_KERNEL_TN SGEMM_SMALL_KERNEL_TN
#define GEMM_SMALL_KERNEL_TT SGEMM_SMALL_KERNEL_TT
#define GEMM_SMALL_KERNEL_B0_NN SGEMM_SMALL_KERNEL_B0_NN
#define GEMM_SMALL_KERNEL_B0_NT SGEMM_SMALL_KERNEL_B0_NT
#define GEMM_SMALL_KERNEL_B0_TN SGEMM_SMALL_KERNEL_B0_TN
#define GEMM_SMALL_KERNEL_B0_TT SGEMM_SMALL_KERNEL_B0_TT
#endif #endif
#else #else
@ -1228,6 +1248,17 @@
#define IMATCOPY_K_RT SIMATCOPY_K_RT #define IMATCOPY_K_RT SIMATCOPY_K_RT
#define GEADD_K SGEADD_K #define GEADD_K SGEADD_K
/* In this branch the precision-neutral GEMM_SMALL_KERNEL_* names resolve to
 * the S-prefixed (sgemm) small-kernel macros. */
#define GEMM_SMALL_KERNEL_NN SGEMM_SMALL_KERNEL_NN
#define GEMM_SMALL_KERNEL_NT SGEMM_SMALL_KERNEL_NT
#define GEMM_SMALL_KERNEL_TN SGEMM_SMALL_KERNEL_TN
#define GEMM_SMALL_KERNEL_TT SGEMM_SMALL_KERNEL_TT
#define GEMM_SMALL_KERNEL_B0_NN SGEMM_SMALL_KERNEL_B0_NN
#define GEMM_SMALL_KERNEL_B0_NT SGEMM_SMALL_KERNEL_B0_NT
#define GEMM_SMALL_KERNEL_B0_TN SGEMM_SMALL_KERNEL_B0_TN
#define GEMM_SMALL_KERNEL_B0_TT SGEMM_SMALL_KERNEL_B0_TT
#endif #endif
#else #else
#ifdef XDOUBLE #ifdef XDOUBLE
@ -2055,6 +2086,46 @@
#define GEADD_K ZGEADD_K #define GEADD_K ZGEADD_K
/* In this branch the precision-neutral GEMM_SMALL_KERNEL_* names resolve to
 * the Z-prefixed (zgemm) small-kernel macros: 16 general kernels followed by
 * the 16 _B0 variants. */
#define GEMM_SMALL_KERNEL_NN ZGEMM_SMALL_KERNEL_NN
#define GEMM_SMALL_KERNEL_NT ZGEMM_SMALL_KERNEL_NT
#define GEMM_SMALL_KERNEL_NR ZGEMM_SMALL_KERNEL_NR
#define GEMM_SMALL_KERNEL_NC ZGEMM_SMALL_KERNEL_NC
#define GEMM_SMALL_KERNEL_TN ZGEMM_SMALL_KERNEL_TN
#define GEMM_SMALL_KERNEL_TT ZGEMM_SMALL_KERNEL_TT
#define GEMM_SMALL_KERNEL_TR ZGEMM_SMALL_KERNEL_TR
#define GEMM_SMALL_KERNEL_TC ZGEMM_SMALL_KERNEL_TC
#define GEMM_SMALL_KERNEL_RN ZGEMM_SMALL_KERNEL_RN
#define GEMM_SMALL_KERNEL_RT ZGEMM_SMALL_KERNEL_RT
#define GEMM_SMALL_KERNEL_RR ZGEMM_SMALL_KERNEL_RR
#define GEMM_SMALL_KERNEL_RC ZGEMM_SMALL_KERNEL_RC
#define GEMM_SMALL_KERNEL_CN ZGEMM_SMALL_KERNEL_CN
#define GEMM_SMALL_KERNEL_CT ZGEMM_SMALL_KERNEL_CT
#define GEMM_SMALL_KERNEL_CR ZGEMM_SMALL_KERNEL_CR
#define GEMM_SMALL_KERNEL_CC ZGEMM_SMALL_KERNEL_CC
#define GEMM_SMALL_KERNEL_B0_NN ZGEMM_SMALL_KERNEL_B0_NN
#define GEMM_SMALL_KERNEL_B0_NT ZGEMM_SMALL_KERNEL_B0_NT
#define GEMM_SMALL_KERNEL_B0_NR ZGEMM_SMALL_KERNEL_B0_NR
#define GEMM_SMALL_KERNEL_B0_NC ZGEMM_SMALL_KERNEL_B0_NC
#define GEMM_SMALL_KERNEL_B0_TN ZGEMM_SMALL_KERNEL_B0_TN
#define GEMM_SMALL_KERNEL_B0_TT ZGEMM_SMALL_KERNEL_B0_TT
#define GEMM_SMALL_KERNEL_B0_TR ZGEMM_SMALL_KERNEL_B0_TR
#define GEMM_SMALL_KERNEL_B0_TC ZGEMM_SMALL_KERNEL_B0_TC
#define GEMM_SMALL_KERNEL_B0_RN ZGEMM_SMALL_KERNEL_B0_RN
#define GEMM_SMALL_KERNEL_B0_RT ZGEMM_SMALL_KERNEL_B0_RT
#define GEMM_SMALL_KERNEL_B0_RR ZGEMM_SMALL_KERNEL_B0_RR
#define GEMM_SMALL_KERNEL_B0_RC ZGEMM_SMALL_KERNEL_B0_RC
#define GEMM_SMALL_KERNEL_B0_CN ZGEMM_SMALL_KERNEL_B0_CN
#define GEMM_SMALL_KERNEL_B0_CT ZGEMM_SMALL_KERNEL_B0_CT
#define GEMM_SMALL_KERNEL_B0_CR ZGEMM_SMALL_KERNEL_B0_CR
#define GEMM_SMALL_KERNEL_B0_CC ZGEMM_SMALL_KERNEL_B0_CC
#else #else
#define AMAX_K CAMAX_K #define AMAX_K CAMAX_K
@ -2478,6 +2549,46 @@
#define GEADD_K CGEADD_K #define GEADD_K CGEADD_K
/* In this branch the precision-neutral GEMM_SMALL_KERNEL_* names resolve to
 * the C-prefixed (cgemm) small-kernel macros: 16 general kernels followed by
 * the 16 _B0 variants. */
#define GEMM_SMALL_KERNEL_NN CGEMM_SMALL_KERNEL_NN
#define GEMM_SMALL_KERNEL_NT CGEMM_SMALL_KERNEL_NT
#define GEMM_SMALL_KERNEL_NR CGEMM_SMALL_KERNEL_NR
#define GEMM_SMALL_KERNEL_NC CGEMM_SMALL_KERNEL_NC
#define GEMM_SMALL_KERNEL_TN CGEMM_SMALL_KERNEL_TN
#define GEMM_SMALL_KERNEL_TT CGEMM_SMALL_KERNEL_TT
#define GEMM_SMALL_KERNEL_TR CGEMM_SMALL_KERNEL_TR
#define GEMM_SMALL_KERNEL_TC CGEMM_SMALL_KERNEL_TC
#define GEMM_SMALL_KERNEL_RN CGEMM_SMALL_KERNEL_RN
#define GEMM_SMALL_KERNEL_RT CGEMM_SMALL_KERNEL_RT
#define GEMM_SMALL_KERNEL_RR CGEMM_SMALL_KERNEL_RR
#define GEMM_SMALL_KERNEL_RC CGEMM_SMALL_KERNEL_RC
#define GEMM_SMALL_KERNEL_CN CGEMM_SMALL_KERNEL_CN
#define GEMM_SMALL_KERNEL_CT CGEMM_SMALL_KERNEL_CT
#define GEMM_SMALL_KERNEL_CR CGEMM_SMALL_KERNEL_CR
#define GEMM_SMALL_KERNEL_CC CGEMM_SMALL_KERNEL_CC
#define GEMM_SMALL_KERNEL_B0_NN CGEMM_SMALL_KERNEL_B0_NN
#define GEMM_SMALL_KERNEL_B0_NT CGEMM_SMALL_KERNEL_B0_NT
#define GEMM_SMALL_KERNEL_B0_NR CGEMM_SMALL_KERNEL_B0_NR
#define GEMM_SMALL_KERNEL_B0_NC CGEMM_SMALL_KERNEL_B0_NC
#define GEMM_SMALL_KERNEL_B0_TN CGEMM_SMALL_KERNEL_B0_TN
#define GEMM_SMALL_KERNEL_B0_TT CGEMM_SMALL_KERNEL_B0_TT
#define GEMM_SMALL_KERNEL_B0_TR CGEMM_SMALL_KERNEL_B0_TR
#define GEMM_SMALL_KERNEL_B0_TC CGEMM_SMALL_KERNEL_B0_TC
#define GEMM_SMALL_KERNEL_B0_RN CGEMM_SMALL_KERNEL_B0_RN
#define GEMM_SMALL_KERNEL_B0_RT CGEMM_SMALL_KERNEL_B0_RT
#define GEMM_SMALL_KERNEL_B0_RR CGEMM_SMALL_KERNEL_B0_RR
#define GEMM_SMALL_KERNEL_B0_RC CGEMM_SMALL_KERNEL_B0_RC
#define GEMM_SMALL_KERNEL_B0_CN CGEMM_SMALL_KERNEL_B0_CN
#define GEMM_SMALL_KERNEL_B0_CT CGEMM_SMALL_KERNEL_B0_CT
#define GEMM_SMALL_KERNEL_B0_CR CGEMM_SMALL_KERNEL_B0_CR
#define GEMM_SMALL_KERNEL_B0_CC CGEMM_SMALL_KERNEL_B0_CC
#endif #endif
#endif #endif
@ -2525,7 +2636,17 @@ typedef struct {
BLASLONG prea, preb, prec, pred; BLASLONG prea, preb, prec, pred;
#endif #endif
//for gemm_batch
void * routine;
int routine_mode;
} blas_arg_t; } blas_arg_t;
/*
 * Flag values for the small-matrix paths; defined only when SMALL_MATRIX_OPT
 * is set. BLAS_SMALL_B0_OPT (0x30000) contains the BLAS_SMALL_OPT bit
 * (0x10000) plus 0x20000, so a bitwise test for BLAS_SMALL_OPT also matches
 * the B0 value.
 * NOTE(review): presumably OR-ed into a mode word alongside other mode
 * flags - confirm against the users of these macros.
 */
#ifdef SMALL_MATRIX_OPT
#define BLAS_SMALL_OPT 0x10000U
#define BLAS_SMALL_B0_OPT 0x30000U
#endif
#endif #endif
#ifdef XDOUBLE #ifdef XDOUBLE

View File

@ -164,6 +164,16 @@
#define SGEADD_K sgeadd_k #define SGEADD_K sgeadd_k
/*
 * Small-matrix GEMM kernel names for the S (sgemm) prefix: each upper-case
 * macro expands directly to the lower-case function symbol of the same name.
 * Four general kernels (NN, NT, TN, TT) followed by their _B0 variants.
 */
#define SGEMM_SMALL_KERNEL_NN sgemm_small_kernel_nn
#define SGEMM_SMALL_KERNEL_NT sgemm_small_kernel_nt
#define SGEMM_SMALL_KERNEL_TN sgemm_small_kernel_tn
#define SGEMM_SMALL_KERNEL_TT sgemm_small_kernel_tt
#define SGEMM_SMALL_KERNEL_B0_NN sgemm_small_kernel_b0_nn
#define SGEMM_SMALL_KERNEL_B0_NT sgemm_small_kernel_b0_nt
#define SGEMM_SMALL_KERNEL_B0_TN sgemm_small_kernel_b0_tn
#define SGEMM_SMALL_KERNEL_B0_TT sgemm_small_kernel_b0_tt
#else #else
#define SAMAX_K gotoblas -> samax_k #define SAMAX_K gotoblas -> samax_k

View File

@ -232,6 +232,46 @@
#define ZGEADD_K zgeadd_k #define ZGEADD_K zgeadd_k
/*
 * Small-matrix GEMM kernel names for the Z (zgemm) prefix: each upper-case
 * macro expands directly to the lower-case function symbol of the same name.
 * The first 16 entries are the general kernels, the last 16 the _B0 variants.
 * NOTE(review): the two-letter suffix presumably encodes the A and B operand
 * modes (N, T, R, C) - confirm against the kernel implementations.
 */
#define ZGEMM_SMALL_KERNEL_NN zgemm_small_kernel_nn
#define ZGEMM_SMALL_KERNEL_NT zgemm_small_kernel_nt
#define ZGEMM_SMALL_KERNEL_NR zgemm_small_kernel_nr
#define ZGEMM_SMALL_KERNEL_NC zgemm_small_kernel_nc
#define ZGEMM_SMALL_KERNEL_TN zgemm_small_kernel_tn
#define ZGEMM_SMALL_KERNEL_TT zgemm_small_kernel_tt
#define ZGEMM_SMALL_KERNEL_TR zgemm_small_kernel_tr
#define ZGEMM_SMALL_KERNEL_TC zgemm_small_kernel_tc
#define ZGEMM_SMALL_KERNEL_RN zgemm_small_kernel_rn
#define ZGEMM_SMALL_KERNEL_RT zgemm_small_kernel_rt
#define ZGEMM_SMALL_KERNEL_RR zgemm_small_kernel_rr
#define ZGEMM_SMALL_KERNEL_RC zgemm_small_kernel_rc
#define ZGEMM_SMALL_KERNEL_CN zgemm_small_kernel_cn
#define ZGEMM_SMALL_KERNEL_CT zgemm_small_kernel_ct
#define ZGEMM_SMALL_KERNEL_CR zgemm_small_kernel_cr
#define ZGEMM_SMALL_KERNEL_CC zgemm_small_kernel_cc
#define ZGEMM_SMALL_KERNEL_B0_NN zgemm_small_kernel_b0_nn
#define ZGEMM_SMALL_KERNEL_B0_NT zgemm_small_kernel_b0_nt
#define ZGEMM_SMALL_KERNEL_B0_NR zgemm_small_kernel_b0_nr
#define ZGEMM_SMALL_KERNEL_B0_NC zgemm_small_kernel_b0_nc
#define ZGEMM_SMALL_KERNEL_B0_TN zgemm_small_kernel_b0_tn
#define ZGEMM_SMALL_KERNEL_B0_TT zgemm_small_kernel_b0_tt
#define ZGEMM_SMALL_KERNEL_B0_TR zgemm_small_kernel_b0_tr
#define ZGEMM_SMALL_KERNEL_B0_TC zgemm_small_kernel_b0_tc
#define ZGEMM_SMALL_KERNEL_B0_RN zgemm_small_kernel_b0_rn
#define ZGEMM_SMALL_KERNEL_B0_RT zgemm_small_kernel_b0_rt
#define ZGEMM_SMALL_KERNEL_B0_RR zgemm_small_kernel_b0_rr
#define ZGEMM_SMALL_KERNEL_B0_RC zgemm_small_kernel_b0_rc
#define ZGEMM_SMALL_KERNEL_B0_CN zgemm_small_kernel_b0_cn
#define ZGEMM_SMALL_KERNEL_B0_CT zgemm_small_kernel_b0_ct
#define ZGEMM_SMALL_KERNEL_B0_CR zgemm_small_kernel_b0_cr
#define ZGEMM_SMALL_KERNEL_B0_CC zgemm_small_kernel_b0_cc
#else #else
#define ZAMAX_K gotoblas -> zamax_k #define ZAMAX_K gotoblas -> zamax_k

View File

@ -37,7 +37,7 @@ SBLASOBJS += \
ssyrk_UN.$(SUFFIX) ssyrk_UT.$(SUFFIX) ssyrk_LN.$(SUFFIX) ssyrk_LT.$(SUFFIX) \ ssyrk_UN.$(SUFFIX) ssyrk_UT.$(SUFFIX) ssyrk_LN.$(SUFFIX) ssyrk_LT.$(SUFFIX) \
ssyr2k_UN.$(SUFFIX) ssyr2k_UT.$(SUFFIX) ssyr2k_LN.$(SUFFIX) ssyr2k_LT.$(SUFFIX) \ ssyr2k_UN.$(SUFFIX) ssyr2k_UT.$(SUFFIX) ssyr2k_LN.$(SUFFIX) ssyr2k_LT.$(SUFFIX) \
ssyrk_kernel_U.$(SUFFIX) ssyrk_kernel_L.$(SUFFIX) \ ssyrk_kernel_U.$(SUFFIX) ssyrk_kernel_L.$(SUFFIX) \
ssyr2k_kernel_U.$(SUFFIX) ssyr2k_kernel_L.$(SUFFIX) ssyr2k_kernel_U.$(SUFFIX) ssyr2k_kernel_L.$(SUFFIX) sgemm_batch_thread.$(SUFFIX)
DBLASOBJS += \ DBLASOBJS += \
dgemm_nn.$(SUFFIX) dgemm_nt.$(SUFFIX) dgemm_tn.$(SUFFIX) dgemm_tt.$(SUFFIX) \ dgemm_nn.$(SUFFIX) dgemm_nt.$(SUFFIX) dgemm_tn.$(SUFFIX) dgemm_tt.$(SUFFIX) \
@ -53,7 +53,7 @@ DBLASOBJS += \
dsyrk_UN.$(SUFFIX) dsyrk_UT.$(SUFFIX) dsyrk_LN.$(SUFFIX) dsyrk_LT.$(SUFFIX) \ dsyrk_UN.$(SUFFIX) dsyrk_UT.$(SUFFIX) dsyrk_LN.$(SUFFIX) dsyrk_LT.$(SUFFIX) \
dsyr2k_UN.$(SUFFIX) dsyr2k_UT.$(SUFFIX) dsyr2k_LN.$(SUFFIX) dsyr2k_LT.$(SUFFIX) \ dsyr2k_UN.$(SUFFIX) dsyr2k_UT.$(SUFFIX) dsyr2k_LN.$(SUFFIX) dsyr2k_LT.$(SUFFIX) \
dsyrk_kernel_U.$(SUFFIX) dsyrk_kernel_L.$(SUFFIX) \ dsyrk_kernel_U.$(SUFFIX) dsyrk_kernel_L.$(SUFFIX) \
dsyr2k_kernel_U.$(SUFFIX) dsyr2k_kernel_L.$(SUFFIX) dsyr2k_kernel_U.$(SUFFIX) dsyr2k_kernel_L.$(SUFFIX) dgemm_batch_thread.$(SUFFIX)
QBLASOBJS += \ QBLASOBJS += \
qgemm_nn.$(SUFFIX) qgemm_nt.$(SUFFIX) qgemm_tn.$(SUFFIX) qgemm_tt.$(SUFFIX) \ qgemm_nn.$(SUFFIX) qgemm_nt.$(SUFFIX) qgemm_tn.$(SUFFIX) qgemm_tt.$(SUFFIX) \
@ -103,7 +103,7 @@ CBLASOBJS += \
cherk_kernel_LN.$(SUFFIX) cherk_kernel_LC.$(SUFFIX) \ cherk_kernel_LN.$(SUFFIX) cherk_kernel_LC.$(SUFFIX) \
csyr2k_kernel_U.$(SUFFIX) csyr2k_kernel_L.$(SUFFIX) \ csyr2k_kernel_U.$(SUFFIX) csyr2k_kernel_L.$(SUFFIX) \
cher2k_kernel_UN.$(SUFFIX) cher2k_kernel_UC.$(SUFFIX) \ cher2k_kernel_UN.$(SUFFIX) cher2k_kernel_UC.$(SUFFIX) \
cher2k_kernel_LN.$(SUFFIX) cher2k_kernel_LC.$(SUFFIX) cher2k_kernel_LN.$(SUFFIX) cher2k_kernel_LC.$(SUFFIX) cgemm_batch_thread.$(SUFFIX)
ZBLASOBJS += \ ZBLASOBJS += \
zgemm_nn.$(SUFFIX) zgemm_cn.$(SUFFIX) zgemm_tn.$(SUFFIX) zgemm_nc.$(SUFFIX) \ zgemm_nn.$(SUFFIX) zgemm_cn.$(SUFFIX) zgemm_tn.$(SUFFIX) zgemm_nc.$(SUFFIX) \
@ -137,7 +137,7 @@ ZBLASOBJS += \
zherk_kernel_LN.$(SUFFIX) zherk_kernel_LC.$(SUFFIX) \ zherk_kernel_LN.$(SUFFIX) zherk_kernel_LC.$(SUFFIX) \
zsyr2k_kernel_U.$(SUFFIX) zsyr2k_kernel_L.$(SUFFIX) \ zsyr2k_kernel_U.$(SUFFIX) zsyr2k_kernel_L.$(SUFFIX) \
zher2k_kernel_UN.$(SUFFIX) zher2k_kernel_UC.$(SUFFIX) \ zher2k_kernel_UN.$(SUFFIX) zher2k_kernel_UC.$(SUFFIX) \
zher2k_kernel_LN.$(SUFFIX) zher2k_kernel_LC.$(SUFFIX) zher2k_kernel_LN.$(SUFFIX) zher2k_kernel_LC.$(SUFFIX) zgemm_batch_thread.$(SUFFIX)
XBLASOBJS += \ XBLASOBJS += \
@ -2888,6 +2888,18 @@ gemm_thread_variable.$(PSUFFIX) : gemm_thread_variable.c ../../common.h
beta_thread.$(PSUFFIX) : beta_thread.c ../../common.h beta_thread.$(PSUFFIX) : beta_thread.c ../../common.h
$(CC) -c $(PFLAGS) $< -o $(@F) $(CC) -c $(PFLAGS) $< -o $(@F)
sgemm_batch_thread.$(SUFFIX) : gemm_batch_thread.c ../../common.h
$(CC) -c $(CFLAGS) $< -o $(@F)
dgemm_batch_thread.$(SUFFIX) : gemm_batch_thread.c ../../common.h
$(CC) -c $(CFLAGS) $< -o $(@F)
cgemm_batch_thread.$(SUFFIX) : gemm_batch_thread.c ../../common.h
$(CC) -c $(CFLAGS) $< -o $(@F)
zgemm_batch_thread.$(SUFFIX) : gemm_batch_thread.c ../../common.h
$(CC) -c $(CFLAGS) $< -o $(@F)
shgemm_thread_nn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h shgemm_thread_nn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHALF -UDOUBLE -UCOMPLEX -DNN $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHALF -UDOUBLE -UCOMPLEX -DNN $< -o $(@F)

View File

@ -0,0 +1,153 @@
/*****************************************************************************
Copyright (c) 2020, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written
permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/
#include "common.h"
void openblas_warning(int verbose, const char * msg);
#ifdef SMALL_MATRIX_OPT
/*
 * Queue adapter for the small-matrix GEMM kernels.
 *
 * The signature matches the regular level-3 routines so the function can be
 * stored in a blas_queue_t entry, but only `args` is used: the kernel pointer
 * held in args->routine is called directly on args->a / args->b / args->c
 * with the scalar(s) read through args->alpha (and args->beta for the kernel
 * that takes one).  range_m, range_n, sa, sb and mypos are ignored.
 *
 * args->routine_mode selects which kernel signature args->routine has:
 *   BLAS_SMALL_B0_OPT - kernel without a beta argument
 *   BLAS_SMALL_OPT    - kernel taking both alpha and beta
 * Any other mode does nothing.  The kernel's return value is discarded and
 * the function always returns 0.
 */
static int inner_small_matrix_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, IFLOAT *sa, IFLOAT *sb, BLASLONG mypos){
  const int routine_mode = args->routine_mode;

  /*
   * The B0 flag is matched against its complete mask rather than with a bare
   * `&`.  The flag values are defined outside this file; if BLAS_SMALL_B0_OPT
   * shares a bit with BLAS_SMALL_OPT, a bare `&` would send every small-matrix
   * call to the beta-less kernel signature.  With disjoint masks the two
   * forms are equivalent.
   */
  if((routine_mode & BLAS_SMALL_B0_OPT) == BLAS_SMALL_B0_OPT){
#ifndef COMPLEX
    int (*kernel_b0)(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG);

    kernel_b0 = args->routine;
    kernel_b0(args->m, args->n, args->k,
              args->a, args->lda,
              *(FLOAT *)(args->alpha),
              args->b, args->ldb,
              args->c, args->ldc);
#else
    int (*kernel_b0)(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT , FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG);
    FLOAT *alpha = (FLOAT *)(args->alpha);   /* [0] = real part, [1] = imaginary part */

    kernel_b0 = args->routine;
    kernel_b0(args->m, args->n, args->k,
              args->a, args->lda,
              alpha[0], alpha[1],
              args->b, args->ldb,
              args->c, args->ldc);
#endif
  }else if(routine_mode & BLAS_SMALL_OPT){
#ifndef COMPLEX
    int (*kernel)(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT ,FLOAT *, BLASLONG, FLOAT, FLOAT *, BLASLONG);

    kernel = args->routine;
    kernel(args->m, args->n, args->k,
           args->a, args->lda,
           *(FLOAT *)(args->alpha),
           args->b, args->ldb,
           *(FLOAT *)(args->beta),
           args->c, args->ldc);
#else
    int (*kernel)(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT , FLOAT, FLOAT *, BLASLONG, FLOAT , FLOAT, FLOAT *, BLASLONG);
    FLOAT *alpha = (FLOAT *)(args->alpha);   /* [0] = real part, [1] = imaginary part */
    FLOAT *beta  = (FLOAT *)(args->beta);

    kernel = args->routine;
    kernel(args->m, args->n, args->k,
           args->a, args->lda,
           alpha[0], alpha[1],
           args->b, args->ldb,
           beta[0], beta[1],
           args->c, args->ldc);
#endif
  }

  return 0;
}
#endif
/*
 * Executes a batch of independent GEMM problems.
 *
 * args_array - `nums` fully populated argument blocks; each one carries its
 *              own routine pointer in .routine and its mode in .routine_mode.
 * nums       - number of entries; nothing is done when nums <= 0.
 *
 * Without SMP, or when num_cpu_avail() reports a single thread, the entries
 * are run one after another on the calling thread.  Otherwise one queue node
 * is built per entry and the nodes are handed to exec_blas() in slices of at
 * most `nthreads` entries.
 *
 * Entries flagged as small-matrix work are routed through
 * inner_small_matrix_thread() because their .routine is a raw kernel with a
 * different signature than the regular level-3 routines.
 *
 * Returns 0.  Terminates the process via exit(1) if the queue cannot be
 * allocated.
 */
int CNAME(blas_arg_t * args_array, BLASLONG nums){
  XFLOAT *buffer;
  XFLOAT *sa, *sb;
  int (*routine)(blas_arg_t *, void *, void *, XFLOAT *, XFLOAT *, BLASLONG);
  /* Same width as `nums`, so the counters cannot truncate on large batches. */
  BLASLONG i=0;
#ifdef SMP
  int nthreads=1;
  BLASLONG current_nums;
  blas_queue_t * queue=NULL;
#endif

  if(nums <=0 ) return 0;

  /* Scratch area for the packed A (sa) and B (sb) panels of the regular routines. */
  buffer = (XFLOAT *)blas_memory_alloc(0);
  sa = (XFLOAT *)((BLASLONG)buffer +GEMM_OFFSET_A);
  sb = (XFLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);

#ifdef SMP
  nthreads=num_cpu_avail(3);

  if(nthreads==1){
#endif
    /* single thread */
    for(i=0; i<nums; i++){
#ifdef SMALL_MATRIX_OPT
      /*
       * Either small-matrix flag means .routine is a raw kernel.  Both masks
       * are tested so an entry carrying only the B0 flag is never called
       * through the regular routine signature (flag values are defined
       * outside this file).
       */
      if(args_array[i].routine_mode & (BLAS_SMALL_OPT | BLAS_SMALL_B0_OPT)){
        inner_small_matrix_thread(&args_array[i], NULL, NULL, NULL, NULL, 0);
      }else{
#endif
        routine=args_array[i].routine;
        routine(&args_array[i], NULL, NULL, sa, sb, 0);
#ifdef SMALL_MATRIX_OPT
      }
#endif
    }
#ifdef SMP
  } else {
    /* multi thread */
    /* One spare node so that &queue[i+1] is always inside the allocation. */
    queue=(blas_queue_t *)malloc((nums+1) * sizeof(blas_queue_t));
    if(queue == NULL){
      openblas_warning(0, "memory alloc failed!\n");
      exit(1);
    }

    for(i=0; i<nums; i++){
      queue[i].args=&args_array[i];
      queue[i].range_m=NULL;
      queue[i].range_n=NULL;
      queue[i].sa=NULL;
      queue[i].sb=NULL;
      queue[i].next=&queue[i+1];

      queue[i].mode=args_array[i].routine_mode;
      queue[i].routine=args_array[i].routine;

#ifdef SMALL_MATRIX_OPT
      if(args_array[i].routine_mode & (BLAS_SMALL_OPT | BLAS_SMALL_B0_OPT)){
        queue[i].routine=inner_small_matrix_thread;
      }
#endif
    }

    /* Submit the chain in slices of at most nthreads nodes. */
    for(i=0; i<nums; i+=nthreads){
      current_nums=((nums-i)>nthreads)? nthreads: (nums-i);

      /* Only the first node of a slice is given the pre-allocated panels. */
      queue[i].sa=sa;
      queue[i].sb=sb;
      /* Cut the chain at the end of this slice. */
      queue[i+current_nums-1].next=NULL;

      exec_blas(current_nums, &queue[i]);
    }
    free(queue);
  }
#endif

  blas_memory_free(buffer);
  return 0;
}

View File

@ -81,6 +81,7 @@
cblas_ismin, cblas_idmin, cblas_icmin, cblas_izmin, cblas_ismin, cblas_idmin, cblas_icmin, cblas_izmin,
cblas_ismax, cblas_idmax, cblas_icmax, cblas_izmax, cblas_ismax, cblas_idmax, cblas_icmax, cblas_izmax,
cblas_ssum, cblas_dsum, cblas_scsum, cblas_dzsum, cblas_ssum, cblas_dsum, cblas_scsum, cblas_dzsum,
cblas_sgemm_batch, cblas_dgemm_batch, cblas_cgemm_batch, cblas_zgemm_batch,
cblas_xerbla cblas_xerbla
); );

View File

@ -278,7 +278,7 @@ CSBLAS2OBJS = \
CSBLAS3OBJS = \ CSBLAS3OBJS = \
cblas_sgemm.$(SUFFIX) cblas_ssymm.$(SUFFIX) cblas_strmm.$(SUFFIX) cblas_strsm.$(SUFFIX) \ cblas_sgemm.$(SUFFIX) cblas_ssymm.$(SUFFIX) cblas_strmm.$(SUFFIX) cblas_strsm.$(SUFFIX) \
cblas_ssyrk.$(SUFFIX) cblas_ssyr2k.$(SUFFIX) cblas_somatcopy.$(SUFFIX) cblas_simatcopy.$(SUFFIX)\ cblas_ssyrk.$(SUFFIX) cblas_ssyr2k.$(SUFFIX) cblas_somatcopy.$(SUFFIX) cblas_simatcopy.$(SUFFIX)\
cblas_sgeadd.$(SUFFIX) cblas_sgeadd.$(SUFFIX) cblas_sgemm_batch.$(SUFFIX)
ifeq ($(BUILD_HALF),1) ifeq ($(BUILD_HALF),1)
CSHBLAS3OBJS = cblas_shgemm.$(SUFFIX) CSHBLAS3OBJS = cblas_shgemm.$(SUFFIX)
@ -300,7 +300,7 @@ CDBLAS2OBJS = \
CDBLAS3OBJS += \ CDBLAS3OBJS += \
cblas_dgemm.$(SUFFIX) cblas_dsymm.$(SUFFIX) cblas_dtrmm.$(SUFFIX) cblas_dtrsm.$(SUFFIX) \ cblas_dgemm.$(SUFFIX) cblas_dsymm.$(SUFFIX) cblas_dtrmm.$(SUFFIX) cblas_dtrsm.$(SUFFIX) \
cblas_dsyrk.$(SUFFIX) cblas_dsyr2k.$(SUFFIX) cblas_domatcopy.$(SUFFIX) cblas_dimatcopy.$(SUFFIX) \ cblas_dsyrk.$(SUFFIX) cblas_dsyr2k.$(SUFFIX) cblas_domatcopy.$(SUFFIX) cblas_dimatcopy.$(SUFFIX) \
cblas_dgeadd.$(SUFFIX) cblas_dgeadd.$(SUFFIX) cblas_dgemm_batch.$(SUFFIX)
CCBLAS1OBJS = \ CCBLAS1OBJS = \
cblas_icamax.$(SUFFIX) cblas_icamin.$(SUFFIX) cblas_scasum.$(SUFFIX) cblas_caxpy.$(SUFFIX) \ cblas_icamax.$(SUFFIX) cblas_icamin.$(SUFFIX) cblas_scasum.$(SUFFIX) cblas_caxpy.$(SUFFIX) \
@ -325,7 +325,7 @@ CCBLAS3OBJS = \
cblas_csyrk.$(SUFFIX) cblas_csyr2k.$(SUFFIX) \ cblas_csyrk.$(SUFFIX) cblas_csyr2k.$(SUFFIX) \
cblas_chemm.$(SUFFIX) cblas_cherk.$(SUFFIX) cblas_cher2k.$(SUFFIX) \ cblas_chemm.$(SUFFIX) cblas_cherk.$(SUFFIX) cblas_cher2k.$(SUFFIX) \
cblas_comatcopy.$(SUFFIX) cblas_cimatcopy.$(SUFFIX)\ cblas_comatcopy.$(SUFFIX) cblas_cimatcopy.$(SUFFIX)\
cblas_cgeadd.$(SUFFIX) cblas_xerbla.$(SUFFIX) cblas_cgeadd.$(SUFFIX) cblas_xerbla.$(SUFFIX) cblas_cgemm_batch.$(SUFFIX)
@ -353,7 +353,7 @@ CZBLAS3OBJS = \
cblas_zsyrk.$(SUFFIX) cblas_zsyr2k.$(SUFFIX) \ cblas_zsyrk.$(SUFFIX) cblas_zsyr2k.$(SUFFIX) \
cblas_zhemm.$(SUFFIX) cblas_zherk.$(SUFFIX) cblas_zher2k.$(SUFFIX)\ cblas_zhemm.$(SUFFIX) cblas_zherk.$(SUFFIX) cblas_zher2k.$(SUFFIX)\
cblas_zomatcopy.$(SUFFIX) cblas_zimatcopy.$(SUFFIX) \ cblas_zomatcopy.$(SUFFIX) cblas_zimatcopy.$(SUFFIX) \
cblas_zgeadd.$(SUFFIX) cblas_zgeadd.$(SUFFIX) cblas_zgemm_batch.$(SUFFIX)
ifeq ($(SUPPORT_GEMM3M), 1) ifeq ($(SUPPORT_GEMM3M), 1)
@ -2236,3 +2236,15 @@ cblas_zgeadd.$(SUFFIX) cblas_zgeadd.$(PSUFFIX) : zgeadd.c
cblas_xerbla.$(SUFFIX) cblas_xerbla.$(PSUFFIX) : xerbla.c cblas_xerbla.$(SUFFIX) cblas_xerbla.$(PSUFFIX) : xerbla.c
$(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F) $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F)
cblas_sgemm_batch.$(SUFFIX) cblas_sgemm_batch.$(PSUFFIX) : gemm_batch.c ../param.h
$(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F)
cblas_dgemm_batch.$(SUFFIX) cblas_dgemm_batch.$(PSUFFIX) : gemm_batch.c ../param.h
$(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F)
cblas_cgemm_batch.$(SUFFIX) cblas_cgemm_batch.$(PSUFFIX) : gemm_batch.c ../param.h
$(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F)
cblas_zgemm_batch.$(SUFFIX) cblas_zgemm_batch.$(PSUFFIX) : gemm_batch.c ../param.h
$(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F)

View File

@ -103,6 +103,45 @@ static int (*gemm[])(blas_arg_t *, BLASLONG *, BLASLONG *, IFLOAT *, IFLOAT *, B
#endif #endif
}; };
#ifdef SMALL_MATRIX_OPT
/*
 * Dispatch tables for the small-matrix GEMM kernels.
 *
 * The code in this file selects an entry with index (transb << 2) | transa,
 * so each row of four entries shares one transb code and the position inside
 * the row is the transa code.  In the macro names the first suffix letter
 * follows transa and the second follows transb.
 *
 * The *_b0 tables hold the kernels that take no beta argument; they are used
 * when beta is zero.  The real tables have only two rows and leave the
 * positions for transa codes 2 and 3 as NULL.
 */
#ifndef COMPLEX
/* Real kernels: alpha and beta are passed as single FLOAT values. */
static int (*gemm_small_kernel[])(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT ,FLOAT *, BLASLONG, FLOAT, FLOAT *, BLASLONG) = {
#ifndef GEMM3M
GEMM_SMALL_KERNEL_NN, GEMM_SMALL_KERNEL_TN, NULL, NULL,
GEMM_SMALL_KERNEL_NT, GEMM_SMALL_KERNEL_TT, NULL, NULL,
#endif
};
/* Real kernels without a beta argument. */
static int (*gemm_small_kernel_b0[])(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG) = {
#ifndef GEMM3M
GEMM_SMALL_KERNEL_B0_NN, GEMM_SMALL_KERNEL_B0_TN, NULL, NULL,
GEMM_SMALL_KERNEL_B0_NT, GEMM_SMALL_KERNEL_B0_TT, NULL, NULL,
#endif
};
#else
/* Complex kernels: alpha and beta are each passed as two FLOAT values. */
static int (*zgemm_small_kernel[])(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT , FLOAT, FLOAT *, BLASLONG, FLOAT , FLOAT, FLOAT *, BLASLONG) = {
#ifndef GEMM3M
GEMM_SMALL_KERNEL_NN, GEMM_SMALL_KERNEL_TN, GEMM_SMALL_KERNEL_RN, GEMM_SMALL_KERNEL_CN,
GEMM_SMALL_KERNEL_NT, GEMM_SMALL_KERNEL_TT, GEMM_SMALL_KERNEL_RT, GEMM_SMALL_KERNEL_CT,
GEMM_SMALL_KERNEL_NR, GEMM_SMALL_KERNEL_TR, GEMM_SMALL_KERNEL_RR, GEMM_SMALL_KERNEL_CR,
GEMM_SMALL_KERNEL_NC, GEMM_SMALL_KERNEL_TC, GEMM_SMALL_KERNEL_RC, GEMM_SMALL_KERNEL_CC,
#endif
};
/* Complex kernels without a beta argument. */
static int (*zgemm_small_kernel_b0[])(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT , FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG) = {
#ifndef GEMM3M
GEMM_SMALL_KERNEL_B0_NN, GEMM_SMALL_KERNEL_B0_TN, GEMM_SMALL_KERNEL_B0_RN, GEMM_SMALL_KERNEL_B0_CN,
GEMM_SMALL_KERNEL_B0_NT, GEMM_SMALL_KERNEL_B0_TT, GEMM_SMALL_KERNEL_B0_RT, GEMM_SMALL_KERNEL_B0_CT,
GEMM_SMALL_KERNEL_B0_NR, GEMM_SMALL_KERNEL_B0_TR, GEMM_SMALL_KERNEL_B0_RR, GEMM_SMALL_KERNEL_B0_CR,
GEMM_SMALL_KERNEL_B0_NC, GEMM_SMALL_KERNEL_B0_TC, GEMM_SMALL_KERNEL_B0_RC, GEMM_SMALL_KERNEL_B0_CC,
#endif
};
#endif
#endif
#ifndef CBLAS #ifndef CBLAS
void NAME(char *TRANSA, char *TRANSB, void NAME(char *TRANSA, char *TRANSB,
@ -122,8 +161,11 @@ void NAME(char *TRANSA, char *TRANSB,
IFLOAT *buffer; IFLOAT *buffer;
IFLOAT *sa, *sb; IFLOAT *sa, *sb;
#ifdef SMP #if defined (SMP) || defined(SMALL_MATRIX_OPT)
double MNK; double MNK;
#endif
#ifdef SMP
#ifndef COMPLEX #ifndef COMPLEX
#ifdef XDOUBLE #ifdef XDOUBLE
int mode = BLAS_XDOUBLE | BLAS_REAL; int mode = BLAS_XDOUBLE | BLAS_REAL;
@ -244,8 +286,11 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
XFLOAT *buffer; XFLOAT *buffer;
XFLOAT *sa, *sb; XFLOAT *sa, *sb;
#ifdef SMP #if defined (SMP) || defined(SMALL_MATRIX_OPT)
double MNK; double MNK;
#endif
#ifdef SMP
#ifndef COMPLEX #ifndef COMPLEX
#ifdef XDOUBLE #ifdef XDOUBLE
int mode = BLAS_XDOUBLE | BLAS_REAL; int mode = BLAS_XDOUBLE | BLAS_REAL;
@ -411,6 +456,32 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
FUNCTION_PROFILE_START(); FUNCTION_PROFILE_START();
#if defined(SMP) || defined(SMALL_MATRIX_OPT)
MNK = (double) args.m * (double) args.n * (double) args.k;
#endif
#ifdef SMALL_MATRIX_OPT
//need to tune small matrices cases.
if(MNK <= 100.0*100.0*100.0){
#if !defined(COMPLEX)
if(*(FLOAT *)(args.beta) == 0.0){
(gemm_small_kernel_b0[(transb << 2) | transa])(args.m, args.n, args.k, args.a, args.lda, *(FLOAT *)(args.alpha), args.b, args.ldb, args.c, args.ldc);
}else{
(gemm_small_kernel[(transb << 2) | transa])(args.m, args.n, args.k, args.a, args.lda, *(FLOAT *)(args.alpha), args.b, args.ldb, *(FLOAT *)(args.beta), args.c, args.ldc);
}
#else
if(beta[0] == 0.0 && beta[1] == 0.0){
(zgemm_small_kernel_b0[(transb << 2) | transa])(args.m, args.n, args.k, args.a, args.lda, alpha[0], alpha[1], args.b, args.ldb, args.c, args.ldc);
}else{
(zgemm_small_kernel[(transb << 2) | transa])(args.m, args.n, args.k, args.a, args.lda, alpha[0], alpha[1], args.b, args.ldb, beta[0], beta[1], args.c, args.ldc);
}
#endif
return;
}
#endif
buffer = (XFLOAT *)blas_memory_alloc(0); buffer = (XFLOAT *)blas_memory_alloc(0);
sa = (XFLOAT *)((BLASLONG)buffer +GEMM_OFFSET_A); sa = (XFLOAT *)((BLASLONG)buffer +GEMM_OFFSET_A);
@ -420,7 +491,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
mode |= (transa << BLAS_TRANSA_SHIFT); mode |= (transa << BLAS_TRANSA_SHIFT);
mode |= (transb << BLAS_TRANSB_SHIFT); mode |= (transb << BLAS_TRANSB_SHIFT);
MNK = (double) args.m * (double) args.n * (double) args.k;
if ( MNK <= (SMP_THRESHOLD_MIN * (double) GEMM_MULTITHREAD_THRESHOLD) ) if ( MNK <= (SMP_THRESHOLD_MIN * (double) GEMM_MULTITHREAD_THRESHOLD) )
args.nthreads = 1; args.nthreads = 1;
else else

358
interface/gemm_batch.c Normal file
View File

@ -0,0 +1,358 @@
/*****************************************************************************
Copyright (c) 2020, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written
permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include "common.h"
void openblas_warning(int verbose, const char * msg);
/*
 * Per-precision names.
 *   ERROR_NAME        - routine name passed to xerbla when an argument is rejected.
 *   GEMM_BATCH_THREAD - batch driver that executes the collected argument blocks.
 *
 * NOTE(review): the XDOUBLE branches (real and complex) define ERROR_NAME
 * only; no GEMM_BATCH_THREAD is defined here for those builds - confirm that
 * this file is never compiled with XDOUBLE.
 */
#ifndef COMPLEX
#ifdef XDOUBLE
#define ERROR_NAME "QGEMM_BATCH "
#elif defined(DOUBLE)
#define ERROR_NAME "DGEMM_BATCH "
#define GEMM_BATCH_THREAD dgemm_batch_thread
#else
#define ERROR_NAME "SGEMM_BATCH "
#define GEMM_BATCH_THREAD sgemm_batch_thread
#endif
#else
#ifdef XDOUBLE
#define ERROR_NAME "XGEMM_BATCH "
#elif defined(DOUBLE)
#define ERROR_NAME "ZGEMM_BATCH "
#define GEMM_BATCH_THREAD zgemm_batch_thread
#else
#define ERROR_NAME "CGEMM_BATCH "
#define GEMM_BATCH_THREAD cgemm_batch_thread
#endif
#endif
/*
 * Regular (non small-matrix) GEMM routines, selected with index
 * (transb << 2) | transa: each row of four shares one transb code and the
 * position inside the row is the transa code (0 = no transpose,
 * 1 = transpose, 2 = conjugate without transpose, 3 = conjugate transpose).
 */
static int (*gemm[])(blas_arg_t *, BLASLONG *, BLASLONG *, IFLOAT *, IFLOAT *, BLASLONG) = {
GEMM_NN, GEMM_TN, GEMM_RN, GEMM_CN,
GEMM_NT, GEMM_TT, GEMM_RT, GEMM_CT,
GEMM_NR, GEMM_TR, GEMM_RR, GEMM_CR,
GEMM_NC, GEMM_TC, GEMM_RC, GEMM_CC,
};
#ifdef SMALL_MATRIX_OPT
/*
 * Small-matrix kernel tables, indexed exactly like gemm[] above with
 * (transb << 2) | transa.
 *
 * The *_b0 variants take no beta argument and are chosen when the group's
 * beta is zero.  Real builds only ever produce transpose codes 0 and 1, so
 * the real tables have two rows and NULL in the positions for transa codes
 * 2 and 3.
 */
#ifndef COMPLEX
/* Real kernels: alpha and beta are passed as single FLOAT values. */
static int (*gemm_small_kernel[])(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT ,FLOAT *, BLASLONG, FLOAT, FLOAT *, BLASLONG) = {
#ifndef GEMM3M
GEMM_SMALL_KERNEL_NN, GEMM_SMALL_KERNEL_TN, NULL, NULL,
GEMM_SMALL_KERNEL_NT, GEMM_SMALL_KERNEL_TT, NULL, NULL,
#endif
};
/* Real kernels without a beta argument. */
static int (*gemm_small_kernel_b0[])(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG) = {
#ifndef GEMM3M
GEMM_SMALL_KERNEL_B0_NN, GEMM_SMALL_KERNEL_B0_TN, NULL, NULL,
GEMM_SMALL_KERNEL_B0_NT, GEMM_SMALL_KERNEL_B0_TT, NULL, NULL,
#endif
};
#else
/* Complex kernels: alpha and beta are each passed as two FLOAT values. */
static int (*zgemm_small_kernel[])(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT , FLOAT, FLOAT *, BLASLONG, FLOAT , FLOAT, FLOAT *, BLASLONG) = {
#ifndef GEMM3M
GEMM_SMALL_KERNEL_NN, GEMM_SMALL_KERNEL_TN, GEMM_SMALL_KERNEL_RN, GEMM_SMALL_KERNEL_CN,
GEMM_SMALL_KERNEL_NT, GEMM_SMALL_KERNEL_TT, GEMM_SMALL_KERNEL_RT, GEMM_SMALL_KERNEL_CT,
GEMM_SMALL_KERNEL_NR, GEMM_SMALL_KERNEL_TR, GEMM_SMALL_KERNEL_RR, GEMM_SMALL_KERNEL_CR,
GEMM_SMALL_KERNEL_NC, GEMM_SMALL_KERNEL_TC, GEMM_SMALL_KERNEL_RC, GEMM_SMALL_KERNEL_CC,
#endif
};
/* Complex kernels without a beta argument. */
static int (*zgemm_small_kernel_b0[])(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT , FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG) = {
#ifndef GEMM3M
GEMM_SMALL_KERNEL_B0_NN, GEMM_SMALL_KERNEL_B0_TN, GEMM_SMALL_KERNEL_B0_RN, GEMM_SMALL_KERNEL_B0_CN,
GEMM_SMALL_KERNEL_B0_NT, GEMM_SMALL_KERNEL_B0_TT, GEMM_SMALL_KERNEL_B0_RT, GEMM_SMALL_KERNEL_B0_CT,
GEMM_SMALL_KERNEL_B0_NR, GEMM_SMALL_KERNEL_B0_TR, GEMM_SMALL_KERNEL_B0_RR, GEMM_SMALL_KERNEL_B0_CR,
GEMM_SMALL_KERNEL_B0_NC, GEMM_SMALL_KERNEL_B0_TC, GEMM_SMALL_KERNEL_B0_RC, GEMM_SMALL_KERNEL_B0_CC,
#endif
};
#endif
#endif
void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE * transa_array, enum CBLAS_TRANSPOSE * transb_array,
blasint * m_array, blasint * n_array, blasint * k_array,
#ifndef COMPLEX
FLOAT * alpha_array,
FLOAT ** a_array, blasint * lda_array,
FLOAT ** b_array, blasint * ldb_array,
FLOAT * beta_array,
FLOAT ** c_array, blasint * ldc_array, blasint group_count, blasint * group_size) {
#else
void * valpha_array,
void ** va_array, blasint * lda_array,
void ** vb_array, blasint * ldb_array,
void * vbeta_array,
void ** vc_array, blasint * ldc_array, blasint group_count, blasint * group_size) {
FLOAT * alpha_array=(FLOAT *)valpha_array;
FLOAT * beta_array=(FLOAT *)vbeta_array;
FLOAT ** a_array=(FLOAT**)va_array;
FLOAT ** b_array=(FLOAT**)vb_array;
FLOAT ** c_array=(FLOAT**)vc_array;
#endif
blas_arg_t * args_array=NULL;
int mode=0, group_mode=0;
blasint total_num=0;
blasint i=0, j=0, matrix_idx=0, count=0;
int group_transa, group_transb;
BLASLONG group_nrowa, group_nrowb;
blasint info;
void * group_alpha, * group_beta;
BLASLONG group_m, group_n, group_k;
BLASLONG group_lda, group_ldb, group_ldc;
void * group_routine=NULL;
#ifdef SMALL_MATRIX_OPT
void * group_small_matrix_opt_routine=NULL;
#endif
#if defined (SMP) || defined(SMALL_MATRIX_OPT)
double MNK;
#endif
PRINT_DEBUG_CNAME;
for(i=0; i<group_count; i++){
total_num+=group_size[i];
}
args_array=(blas_arg_t *)malloc(total_num * sizeof(blas_arg_t));
if(args_array == NULL){
openblas_warning(0, "memory alloc failed!\n");
exit(1);
}
#ifdef SMP
#ifndef COMPLEX
#ifdef XDOUBLE
mode = BLAS_XDOUBLE | BLAS_REAL;
#elif defined(DOUBLE)
mode = BLAS_DOUBLE | BLAS_REAL;
#else
mode = BLAS_SINGLE | BLAS_REAL;
#endif
#else
#ifdef XDOUBLE
mode = BLAS_XDOUBLE | BLAS_COMPLEX;
#elif defined(DOUBLE)
mode = BLAS_DOUBLE | BLAS_COMPLEX;
#else
mode = BLAS_SINGLE | BLAS_COMPLEX;
#endif
#endif
#endif
for(i=0; i<group_count; matrix_idx+=group_size[i], i++){
group_alpha = (void *)&alpha_array[i * COMPSIZE];
group_beta = (void *)&beta_array[i * COMPSIZE];
group_transa = -1;
group_transb = -1;
info = 0;
if (order == CblasColMajor) {
group_m = m_array[i];
group_n = n_array[i];
group_k = k_array[i];
group_lda = lda_array[i];
group_ldb = ldb_array[i];
group_ldc = ldc_array[i];
if (transa_array[i] == CblasNoTrans) group_transa = 0;
if (transa_array[i] == CblasTrans) group_transa = 1;
#ifndef COMPLEX
if (transa_array[i] == CblasConjNoTrans) group_transa = 0;
if (transa_array[i] == CblasConjTrans) group_transa = 1;
#else
if (transa_array[i] == CblasConjNoTrans) group_transa = 2;
if (transa_array[i] == CblasConjTrans) group_transa = 3;
#endif
if (transb_array[i] == CblasNoTrans) group_transb = 0;
if (transb_array[i] == CblasTrans) group_transb = 1;
#ifndef COMPLEX
if (transb_array[i] == CblasConjNoTrans) group_transb = 0;
if (transb_array[i] == CblasConjTrans) group_transb = 1;
#else
if (transb_array[i] == CblasConjNoTrans) group_transb = 2;
if (transb_array[i] == CblasConjTrans) group_transb = 3;
#endif
group_nrowa = group_m;
if (group_transa & 1) group_nrowa = group_k;
group_nrowb = group_k;
if (group_transb & 1) group_nrowb = group_n;
info=-1;
if (group_ldc < group_m) info = 13;
if (group_ldb < group_nrowb) info = 10;
if (group_lda < group_nrowa) info = 8;
if (group_k < 0) info = 5;
if (group_n < 0) info = 4;
if (group_m < 0) info = 3;
if (group_transb < 0) info = 2;
if (group_transa < 0) info = 1;
}else if (order == CblasRowMajor) {
group_m = n_array[i];
group_n = m_array[i];
group_k = k_array[i];
group_lda = ldb_array[i];
group_ldb = lda_array[i];
group_ldc = ldc_array[i];
if (transb_array[i] == CblasNoTrans) group_transa = 0;
if (transb_array[i] == CblasTrans) group_transa = 1;
#ifndef COMPLEX
if (transb_array[i] == CblasConjNoTrans) group_transa = 0;
if (transb_array[i] == CblasConjTrans) group_transa = 1;
#else
if (transb_array[i] == CblasConjNoTrans) group_transa = 2;
if (transb_array[i] == CblasConjTrans) group_transa = 3;
#endif
if (transa_array[i] == CblasNoTrans) group_transb = 0;
if (transa_array[i] == CblasTrans) group_transb = 1;
#ifndef COMPLEX
if (transa_array[i] == CblasConjNoTrans) group_transb = 0;
if (transa_array[i] == CblasConjTrans) group_transb = 1;
#else
if (transa_array[i] == CblasConjNoTrans) group_transb = 2;
if (transa_array[i] == CblasConjTrans) group_transb = 3;
#endif
group_nrowa = group_m;
if (group_transa & 1) group_nrowa = group_k;
group_nrowb = group_k;
if (group_transb & 1) group_nrowb = group_n;
info=-1;
if (group_ldc < group_m) info = 13;
if (group_ldb < group_nrowb) info = 10;
if (group_lda < group_nrowa) info = 8;
if (group_k < 0) info = 5;
if (group_n < 0) info = 4;
if (group_m < 0) info = 3;
if (group_transb < 0) info = 2;
if (group_transa < 0) info = 1;
}
if (info >= 0) {
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
free(args_array);
return;
}
if (group_m == 0 || group_n == 0) continue;
group_mode=mode;
#if defined(SMP) || defined(SMALL_MATRIX_OPT)
MNK = (double) group_m * (double) group_n * (double) group_k;
#endif
#ifdef SMALL_MATRIX_OPT
if(MNK <= 100.0*100.0*100.0){
group_routine=NULL;
#if !defined(COMPLEX)
if(*(FLOAT *)(group_beta) == 0.0){
group_mode=mode | BLAS_SMALL_B0_OPT;
group_small_matrix_opt_routine=(void *)(gemm_small_kernel_b0[(group_transb<<2)|group_transa]);
}else{
group_mode=mode | BLAS_SMALL_OPT;
group_small_matrix_opt_routine=(void *)(gemm_small_kernel[(group_transb<<2)|group_transa]);
}
#else
if(((FLOAT *)(group_beta))[0] == 0.0 && ((FLOAT *)(group_beta))[1] == 0.0){
group_mode=mode | BLAS_SMALL_B0_OPT;
group_small_matrix_opt_routine=(void *)(zgemm_small_kernel_b0[(group_transb<<2)|group_transa]);
}else{
group_mode=mode | BLAS_SMALL_OPT;
group_small_matrix_opt_routine=(void *)(zgemm_small_kernel[(group_transb<<2)|group_transa]);
}
#endif
}else{
#endif
group_routine=(void*)(gemm[(group_transb<<2)|group_transa]);
#ifdef SMALL_MATRIX_OPT
}
#endif
for(j=0; j<group_size[i]; j++){
args_array[count].m=group_m;
args_array[count].n=group_n;
args_array[count].k=group_k;
args_array[count].lda=group_lda;
args_array[count].ldb=group_ldb;
args_array[count].ldc=group_ldc;
args_array[count].alpha=group_alpha;
args_array[count].beta=group_beta;
if (order == CblasColMajor) {
args_array[count].a=(a_array[matrix_idx+j]);
args_array[count].b=(b_array[matrix_idx+j]);
}else if(order == CblasRowMajor){
args_array[count].a=(b_array[matrix_idx+j]);
args_array[count].b=(a_array[matrix_idx+j]);
}
args_array[count].c=(c_array[matrix_idx+j]);
args_array[count].routine_mode=group_mode;
args_array[count].routine=group_routine;
#ifdef SMALL_MATRIX_OPT
args_array[count].routine=group_small_matrix_opt_routine;
#endif
count++;
}
}
if(count>0){
GEMM_BATCH_THREAD(args_array,count);
}
free(args_array);
}

View File

@ -405,6 +405,59 @@ XBLASOBJS += \
endif endif
###### BLAS small matrix optimization #####
ifeq ($(SMALL_MATRIX_OPT), 1)
SBLASOBJS += \
sgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) sgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) \
sgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) sgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) \
sgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) sgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) \
sgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) sgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX)
DBLASOBJS += \
dgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) dgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) \
dgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) dgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) \
dgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) dgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) \
dgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) dgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX)
CBLASOBJS += \
cgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) \
cgemm_small_kernel_nr$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_nc$(TSUFFIX).$(SUFFIX) \
cgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) \
cgemm_small_kernel_tr$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_tc$(TSUFFIX).$(SUFFIX) \
cgemm_small_kernel_rn$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_rt$(TSUFFIX).$(SUFFIX) \
cgemm_small_kernel_rr$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_rc$(TSUFFIX).$(SUFFIX) \
cgemm_small_kernel_cn$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_ct$(TSUFFIX).$(SUFFIX) \
cgemm_small_kernel_cr$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_cc$(TSUFFIX).$(SUFFIX) \
cgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) \
cgemm_small_kernel_b0_nr$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_b0_nc$(TSUFFIX).$(SUFFIX) \
cgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) \
cgemm_small_kernel_b0_tr$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_b0_tc$(TSUFFIX).$(SUFFIX) \
cgemm_small_kernel_b0_rn$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_b0_rt$(TSUFFIX).$(SUFFIX) \
cgemm_small_kernel_b0_rr$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_b0_rc$(TSUFFIX).$(SUFFIX) \
cgemm_small_kernel_b0_cn$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_b0_ct$(TSUFFIX).$(SUFFIX) \
cgemm_small_kernel_b0_cr$(TSUFFIX).$(SUFFIX) cgemm_small_kernel_b0_cc$(TSUFFIX).$(SUFFIX)
ZBLASOBJS += \
zgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) \
zgemm_small_kernel_nr$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_nc$(TSUFFIX).$(SUFFIX) \
zgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) \
zgemm_small_kernel_tr$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_tc$(TSUFFIX).$(SUFFIX) \
zgemm_small_kernel_rn$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_rt$(TSUFFIX).$(SUFFIX) \
zgemm_small_kernel_rr$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_rc$(TSUFFIX).$(SUFFIX) \
zgemm_small_kernel_cn$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_ct$(TSUFFIX).$(SUFFIX) \
zgemm_small_kernel_cr$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_cc$(TSUFFIX).$(SUFFIX) \
zgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) \
zgemm_small_kernel_b0_nr$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_b0_nc$(TSUFFIX).$(SUFFIX) \
zgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) \
zgemm_small_kernel_b0_tr$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_b0_tc$(TSUFFIX).$(SUFFIX) \
zgemm_small_kernel_b0_rn$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_b0_rt$(TSUFFIX).$(SUFFIX) \
zgemm_small_kernel_b0_rr$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_b0_rc$(TSUFFIX).$(SUFFIX) \
zgemm_small_kernel_b0_cn$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_b0_ct$(TSUFFIX).$(SUFFIX) \
zgemm_small_kernel_b0_cr$(TSUFFIX).$(SUFFIX) zgemm_small_kernel_b0_cc$(TSUFFIX).$(SUFFIX)
endif
###### BLAS extensions ##### ###### BLAS extensions #####
SBLASOBJS += \ SBLASOBJS += \
somatcopy_k_cn$(TSUFFIX).$(SUFFIX) somatcopy_k_rn$(TSUFFIX).$(SUFFIX) \ somatcopy_k_cn$(TSUFFIX).$(SUFFIX) somatcopy_k_rn$(TSUFFIX).$(SUFFIX) \
@ -4137,3 +4190,376 @@ endif
$(KDIR)zgeadd_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEADD_K) $(KDIR)zgeadd_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEADD_K)
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -UROWM $< -o $@ $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -UROWM $< -o $@
###### BLAS small matrix optimization #####
# Double-precision real small-matrix GEMM kernels (general form).
# Each DGEMM_SMALL_K_xx names the source file, relative to $(KERNELDIR), for one
# transpose combination (NN, NT, TN, TT). The generic C kernel is used only when
# the variable has not already been given a value.
ifndef DGEMM_SMALL_K_NN
DGEMM_SMALL_K_NN = ../generic/gemm_small_matrix_kernel_nn.c
endif
ifndef DGEMM_SMALL_K_NT
DGEMM_SMALL_K_NT = ../generic/gemm_small_matrix_kernel_nt.c
endif
ifndef DGEMM_SMALL_K_TN
DGEMM_SMALL_K_TN = ../generic/gemm_small_matrix_kernel_tn.c
endif
ifndef DGEMM_SMALL_K_TT
DGEMM_SMALL_K_TT = ../generic/gemm_small_matrix_kernel_tt.c
endif
# One object per transpose combination, compiled with DOUBLE defined and
# COMPLEX undefined.
$(KDIR)dgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_NN)
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
$(KDIR)dgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_NT)
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
$(KDIR)dgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_TN)
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
$(KDIR)dgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_TT)
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
# Double-precision real small-matrix GEMM kernels, "_b0" form.
# Same layout as the general kernels: one overridable source variable per
# transpose combination, defaulting to the generic C implementation.
# NOTE(review): "b0" presumably means the beta == 0 variant - confirm.
ifndef DGEMM_SMALL_K_B0_NN
DGEMM_SMALL_K_B0_NN = ../generic/gemm_small_matrix_kernel_b0_nn.c
endif
ifndef DGEMM_SMALL_K_B0_NT
DGEMM_SMALL_K_B0_NT = ../generic/gemm_small_matrix_kernel_b0_nt.c
endif
ifndef DGEMM_SMALL_K_B0_TN
DGEMM_SMALL_K_B0_TN = ../generic/gemm_small_matrix_kernel_b0_tn.c
endif
ifndef DGEMM_SMALL_K_B0_TT
DGEMM_SMALL_K_B0_TT = ../generic/gemm_small_matrix_kernel_b0_tt.c
endif
# Build rules: DOUBLE defined, COMPLEX undefined.
$(KDIR)dgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_B0_NN)
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
$(KDIR)dgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_B0_NT)
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
$(KDIR)dgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_B0_TN)
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
$(KDIR)dgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SMALL_K_B0_TT)
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
# Single-precision real small-matrix GEMM kernels (general form).
# The default sources are the same generic files used for the double-precision
# kernels; the precision is chosen by the compile flags in the rules below.
ifndef SGEMM_SMALL_K_NN
SGEMM_SMALL_K_NN = ../generic/gemm_small_matrix_kernel_nn.c
endif
ifndef SGEMM_SMALL_K_NT
SGEMM_SMALL_K_NT = ../generic/gemm_small_matrix_kernel_nt.c
endif
ifndef SGEMM_SMALL_K_TN
SGEMM_SMALL_K_TN = ../generic/gemm_small_matrix_kernel_tn.c
endif
ifndef SGEMM_SMALL_K_TT
SGEMM_SMALL_K_TT = ../generic/gemm_small_matrix_kernel_tt.c
endif
# Build rules: both DOUBLE and COMPLEX are explicitly undefined.
$(KDIR)sgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_NN)
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
$(KDIR)sgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_NT)
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
$(KDIR)sgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_TN)
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
$(KDIR)sgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_TT)
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
# Single-precision real small-matrix GEMM kernels, "_b0" form.
# One overridable source variable per transpose combination, defaulting to the
# generic C implementation.
# NOTE(review): "b0" presumably means the beta == 0 variant - confirm.
ifndef SGEMM_SMALL_K_B0_NN
SGEMM_SMALL_K_B0_NN = ../generic/gemm_small_matrix_kernel_b0_nn.c
endif
ifndef SGEMM_SMALL_K_B0_NT
SGEMM_SMALL_K_B0_NT = ../generic/gemm_small_matrix_kernel_b0_nt.c
endif
ifndef SGEMM_SMALL_K_B0_TN
SGEMM_SMALL_K_B0_TN = ../generic/gemm_small_matrix_kernel_b0_tn.c
endif
ifndef SGEMM_SMALL_K_B0_TT
SGEMM_SMALL_K_B0_TT = ../generic/gemm_small_matrix_kernel_b0_tt.c
endif
# Build rules: both DOUBLE and COMPLEX are explicitly undefined.
$(KDIR)sgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_B0_NN)
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
$(KDIR)sgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_B0_NT)
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
$(KDIR)sgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_B0_TN)
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
$(KDIR)sgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SMALL_K_B0_TT)
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
# Single-precision complex small-matrix GEMM kernels (general form).
# Only four source variables exist (NN, NT, TN, TT); each source is compiled
# four times with a different -D<XY> flag to produce the 16 objects below.
# The defaults point at the generic zgemm_* sources, which are shared with the
# double-precision complex kernels.
ifndef CGEMM_SMALL_K_NN
CGEMM_SMALL_K_NN = ../generic/zgemm_small_matrix_kernel_nn.c
endif
ifndef CGEMM_SMALL_K_NT
CGEMM_SMALL_K_NT = ../generic/zgemm_small_matrix_kernel_nt.c
endif
ifndef CGEMM_SMALL_K_TN
CGEMM_SMALL_K_TN = ../generic/zgemm_small_matrix_kernel_tn.c
endif
ifndef CGEMM_SMALL_K_TT
CGEMM_SMALL_K_TT = ../generic/zgemm_small_matrix_kernel_tt.c
endif
# NN source -> nn, nr, rn, rr objects.
$(KDIR)cgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NN)
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN $< -o $@
$(KDIR)cgemm_small_kernel_nr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NN)
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNR $< -o $@
$(KDIR)cgemm_small_kernel_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NN)
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRN $< -o $@
$(KDIR)cgemm_small_kernel_rr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NN)
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRR $< -o $@
# NT source -> nt, nc, rt, rc objects.
$(KDIR)cgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NT)
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNT $< -o $@
$(KDIR)cgemm_small_kernel_nc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NT)
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC $< -o $@
$(KDIR)cgemm_small_kernel_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NT)
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRT $< -o $@
$(KDIR)cgemm_small_kernel_rc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_NT)
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRC $< -o $@
# TN source -> tn, tr, cn, cr objects.
$(KDIR)cgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TN)
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTN $< -o $@
$(KDIR)cgemm_small_kernel_tr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TN)
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTR $< -o $@
$(KDIR)cgemm_small_kernel_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TN)
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $@
$(KDIR)cgemm_small_kernel_cr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TN)
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCR $< -o $@
# TT source -> tt, tc, ct, cc objects.
$(KDIR)cgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TT)
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTT $< -o $@
$(KDIR)cgemm_small_kernel_tc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TT)
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTC $< -o $@
$(KDIR)cgemm_small_kernel_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TT)
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $@
$(KDIR)cgemm_small_kernel_cc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_TT)
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $@
# Single-precision complex small-matrix GEMM kernels, "_b0" form.
# Four overridable source variables (NN, NT, TN, TT); each source is compiled
# four times with a different -D<XY> flag to produce the 16 objects below.
# NOTE(review): "b0" presumably means the beta == 0 variant - confirm.
ifndef CGEMM_SMALL_K_B0_NN
CGEMM_SMALL_K_B0_NN = ../generic/zgemm_small_matrix_kernel_b0_nn.c
endif
ifndef CGEMM_SMALL_K_B0_NT
CGEMM_SMALL_K_B0_NT = ../generic/zgemm_small_matrix_kernel_b0_nt.c
endif
ifndef CGEMM_SMALL_K_B0_TN
CGEMM_SMALL_K_B0_TN = ../generic/zgemm_small_matrix_kernel_b0_tn.c
endif
ifndef CGEMM_SMALL_K_B0_TT
CGEMM_SMALL_K_B0_TT = ../generic/zgemm_small_matrix_kernel_b0_tt.c
endif
# B0_NN source -> nn, nr, rn, rr objects.
$(KDIR)cgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NN)
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN $< -o $@
$(KDIR)cgemm_small_kernel_b0_nr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NN)
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNR $< -o $@
$(KDIR)cgemm_small_kernel_b0_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NN)
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRN $< -o $@
$(KDIR)cgemm_small_kernel_b0_rr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NN)
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRR $< -o $@
# B0_NT source -> nt, nc, rt, rc objects.
$(KDIR)cgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NT)
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNT $< -o $@
$(KDIR)cgemm_small_kernel_b0_nc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NT)
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC $< -o $@
$(KDIR)cgemm_small_kernel_b0_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NT)
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRT $< -o $@
$(KDIR)cgemm_small_kernel_b0_rc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_NT)
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DRC $< -o $@
# B0_TN source -> tn, tr, cn, cr objects.
$(KDIR)cgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TN)
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTN $< -o $@
$(KDIR)cgemm_small_kernel_b0_tr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TN)
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTR $< -o $@
$(KDIR)cgemm_small_kernel_b0_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TN)
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $@
$(KDIR)cgemm_small_kernel_b0_cr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TN)
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCR $< -o $@
# B0_TT source -> tt, tc, ct, cc objects.
$(KDIR)cgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TT)
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTT $< -o $@
$(KDIR)cgemm_small_kernel_b0_tc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TT)
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DTC $< -o $@
$(KDIR)cgemm_small_kernel_b0_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TT)
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCT $< -o $@
$(KDIR)cgemm_small_kernel_b0_cc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM_SMALL_K_B0_TT)
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $@
# Double-precision complex small-matrix GEMM kernels (general form).
# Four overridable source variables (NN, NT, TN, TT); each source is compiled
# four times with a different -D<XY> flag to produce the 16 objects below.
ifndef ZGEMM_SMALL_K_NN
ZGEMM_SMALL_K_NN = ../generic/zgemm_small_matrix_kernel_nn.c
endif
ifndef ZGEMM_SMALL_K_NT
ZGEMM_SMALL_K_NT = ../generic/zgemm_small_matrix_kernel_nt.c
endif
ifndef ZGEMM_SMALL_K_TN
ZGEMM_SMALL_K_TN = ../generic/zgemm_small_matrix_kernel_tn.c
endif
ifndef ZGEMM_SMALL_K_TT
ZGEMM_SMALL_K_TT = ../generic/zgemm_small_matrix_kernel_tt.c
endif
# NN source -> nn, nr, rn, rr objects.
$(KDIR)zgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NN)
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN $< -o $@
$(KDIR)zgemm_small_kernel_nr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NN)
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNR $< -o $@
$(KDIR)zgemm_small_kernel_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NN)
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRN $< -o $@
$(KDIR)zgemm_small_kernel_rr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NN)
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRR $< -o $@
# NT source -> nt, nc, rt, rc objects.
$(KDIR)zgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NT)
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNT $< -o $@
$(KDIR)zgemm_small_kernel_nc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NT)
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC $< -o $@
$(KDIR)zgemm_small_kernel_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NT)
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRT $< -o $@
$(KDIR)zgemm_small_kernel_rc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_NT)
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRC $< -o $@
# TN source -> tn, tr, cn, cr objects.
$(KDIR)zgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TN)
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTN $< -o $@
$(KDIR)zgemm_small_kernel_tr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TN)
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTR $< -o $@
$(KDIR)zgemm_small_kernel_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TN)
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $@
$(KDIR)zgemm_small_kernel_cr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TN)
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCR $< -o $@
# TT source -> tt, tc, ct, cc objects.
$(KDIR)zgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TT)
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTT $< -o $@
$(KDIR)zgemm_small_kernel_tc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TT)
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTC $< -o $@
$(KDIR)zgemm_small_kernel_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TT)
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $@
$(KDIR)zgemm_small_kernel_cc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_TT)
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $@
# Double-precision complex small-matrix GEMM kernels, "_b0" form.
# Four overridable source variables (NN, NT, TN, TT); each source is compiled
# four times with a different -D<XY> flag to produce the 16 objects below.
# NOTE(review): "b0" presumably means the beta == 0 variant - confirm.
ifndef ZGEMM_SMALL_K_B0_NN
ZGEMM_SMALL_K_B0_NN = ../generic/zgemm_small_matrix_kernel_b0_nn.c
endif
ifndef ZGEMM_SMALL_K_B0_NT
ZGEMM_SMALL_K_B0_NT = ../generic/zgemm_small_matrix_kernel_b0_nt.c
endif
ifndef ZGEMM_SMALL_K_B0_TN
ZGEMM_SMALL_K_B0_TN = ../generic/zgemm_small_matrix_kernel_b0_tn.c
endif
ifndef ZGEMM_SMALL_K_B0_TT
ZGEMM_SMALL_K_B0_TT = ../generic/zgemm_small_matrix_kernel_b0_tt.c
endif
# B0_NN source -> nn, nr, rn, rr objects.
$(KDIR)zgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NN)
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN $< -o $@
$(KDIR)zgemm_small_kernel_b0_nr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NN)
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNR $< -o $@
$(KDIR)zgemm_small_kernel_b0_rn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NN)
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRN $< -o $@
$(KDIR)zgemm_small_kernel_b0_rr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NN)
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRR $< -o $@
# B0_NT source -> nt, nc, rt, rc objects.
$(KDIR)zgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NT)
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNT $< -o $@
$(KDIR)zgemm_small_kernel_b0_nc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NT)
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC $< -o $@
$(KDIR)zgemm_small_kernel_b0_rt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NT)
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRT $< -o $@
$(KDIR)zgemm_small_kernel_b0_rc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_NT)
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DRC $< -o $@
# B0_TN source -> tn, tr, cn, cr objects.
$(KDIR)zgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TN)
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTN $< -o $@
$(KDIR)zgemm_small_kernel_b0_tr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TN)
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTR $< -o $@
$(KDIR)zgemm_small_kernel_b0_cn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TN)
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $@
$(KDIR)zgemm_small_kernel_b0_cr$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TN)
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCR $< -o $@
# B0_TT source -> tt, tc, ct, cc objects.
$(KDIR)zgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TT)
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTT $< -o $@
$(KDIR)zgemm_small_kernel_b0_tc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TT)
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DTC $< -o $@
$(KDIR)zgemm_small_kernel_b0_ct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TT)
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCT $< -o $@
$(KDIR)zgemm_small_kernel_b0_cc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_SMALL_K_B0_TT)
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $@

View File

@ -0,0 +1,49 @@
/***************************************************************************
Copyright (c) 2020, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb,FLOAT * C, BLASLONG ldc)
{
	/*
	 * Naive reference kernel, column-major storage, no beta term:
	 *
	 *     C(row, col) = alpha * sum_p A(row, p) * B(p, col)
	 *
	 * A is indexed as M x K with leading dimension lda and B as K x N with
	 * leading dimension ldb; neither operand is transposed. C (leading
	 * dimension ldc) is overwritten without being read. Always returns 0.
	 */
	BLASLONG row, col, p;

	for (row = 0; row < M; row++) {
		for (col = 0; col < N; col++) {
			FLOAT dot = 0.0;

			for (p = 0; p < K; p++) {
				dot += A[row + p*lda] * B[p + col*ldb];
			}

			C[row + col*ldc] = alpha * dot;
		}
	}

	return 0;
}

View File

@ -0,0 +1,49 @@
/***************************************************************************
Copyright (c) 2020, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc)
{
	/*
	 * Naive reference kernel, column-major storage, no beta term, with B
	 * used transposed:
	 *
	 *     C(row, col) = alpha * sum_p A(row, p) * B(col, p)
	 *
	 * A is indexed as M x K (leading dimension lda); B is indexed as N x K
	 * (leading dimension ldb). C (leading dimension ldc) is overwritten
	 * without being read. Always returns 0.
	 */
	BLASLONG row, col, p;

	for (row = 0; row < M; row++) {
		for (col = 0; col < N; col++) {
			FLOAT dot = 0.0;

			for (p = 0; p < K; p++) {
				dot += A[row + p*lda] * B[p*ldb + col];
			}

			C[row + col*ldc] = alpha * dot;
		}
	}

	return 0;
}

View File

@ -0,0 +1,49 @@
/***************************************************************************
Copyright (c) 2020, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb,FLOAT * C, BLASLONG ldc)
{
	/*
	 * Naive reference kernel, column-major storage, no beta term, with A
	 * used transposed:
	 *
	 *     C(row, col) = alpha * sum_p A(p, row) * B(p, col)
	 *
	 * A is indexed as K x M (leading dimension lda); B is indexed as K x N
	 * (leading dimension ldb). C (leading dimension ldc) is overwritten
	 * without being read. Always returns 0.
	 */
	BLASLONG row, col, p;

	for (row = 0; row < M; row++) {
		for (col = 0; col < N; col++) {
			FLOAT dot = 0.0;

			for (p = 0; p < K; p++) {
				dot += A[row*lda + p] * B[p + col*ldb];
			}

			C[row + col*ldc] = alpha * dot;
		}
	}

	return 0;
}

View File

@ -0,0 +1,49 @@
/***************************************************************************
Copyright (c) 2020, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc)
{
	/*
	 * Naive reference kernel, column-major storage, no beta term, with both
	 * operands used transposed:
	 *
	 *     C(row, col) = alpha * sum_p A(p, row) * B(col, p)
	 *
	 * A is indexed as K x M (leading dimension lda); B is indexed as N x K
	 * (leading dimension ldb). C (leading dimension ldc) is overwritten
	 * without being read. Always returns 0.
	 */
	BLASLONG row, col, p;

	for (row = 0; row < M; row++) {
		for (col = 0; col < N; col++) {
			FLOAT dot = 0.0;

			for (p = 0; p < K; p++) {
				dot += A[row*lda + p] * B[p*ldb + col];
			}

			C[row + col*ldc] = alpha * dot;
		}
	}

	return 0;
}

View File

@ -0,0 +1,49 @@
/***************************************************************************
Copyright (c) 2020, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc)
{
	/*
	 * Naive reference kernel, column-major storage:
	 *
	 *     C(row, col) = beta * C(row, col) + alpha * sum_p A(row, p) * B(p, col)
	 *
	 * A is indexed as M x K (leading dimension lda) and B as K x N (leading
	 * dimension ldb); neither operand is transposed. C (leading dimension
	 * ldc) is updated in place. Always returns 0.
	 */
	BLASLONG row, col, p;

	for (row = 0; row < M; row++) {
		for (col = 0; col < N; col++) {
			FLOAT dot = 0.0;

			for (p = 0; p < K; p++) {
				dot += A[row + p*lda] * B[p + col*ldb];
			}

			C[row + col*ldc] = C[row + col*ldc] * beta + alpha * dot;
		}
	}

	return 0;
}

View File

@ -0,0 +1,49 @@
/***************************************************************************
Copyright (c) 2020, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc)
{
	/*
	 * Naive reference kernel, column-major storage, with B used transposed:
	 *
	 *     C(row, col) = beta * C(row, col) + alpha * sum_p A(row, p) * B(col, p)
	 *
	 * A is indexed as M x K (leading dimension lda); B is indexed as N x K
	 * (leading dimension ldb). C (leading dimension ldc) is updated in
	 * place. Always returns 0.
	 */
	BLASLONG row, col, p;

	for (row = 0; row < M; row++) {
		for (col = 0; col < N; col++) {
			FLOAT dot = 0.0;

			for (p = 0; p < K; p++) {
				dot += A[row + p*lda] * B[p*ldb + col];
			}

			C[row + col*ldc] = C[row + col*ldc] * beta + alpha * dot;
		}
	}

	return 0;
}

View File

@ -0,0 +1,49 @@
/***************************************************************************
Copyright (c) 2020, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc)
{
	/*
	 * Naive reference kernel, column-major storage, with A used transposed:
	 *
	 *     C(row, col) = beta * C(row, col) + alpha * sum_p A(p, row) * B(p, col)
	 *
	 * A is indexed as K x M (leading dimension lda); B is indexed as K x N
	 * (leading dimension ldb). C (leading dimension ldc) is updated in
	 * place. Always returns 0.
	 */
	BLASLONG row, col, p;

	for (row = 0; row < M; row++) {
		for (col = 0; col < N; col++) {
			FLOAT dot = 0.0;

			for (p = 0; p < K; p++) {
				dot += A[row*lda + p] * B[p + col*ldb];
			}

			C[row + col*ldc] = C[row + col*ldc] * beta + alpha * dot;
		}
	}

	return 0;
}

View File

@ -0,0 +1,49 @@
/***************************************************************************
Copyright (c) 2020, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc)
{
	/*
	 * Naive reference kernel, column-major storage, with both operands used
	 * transposed:
	 *
	 *     C(row, col) = beta * C(row, col) + alpha * sum_p A(p, row) * B(col, p)
	 *
	 * A is indexed as K x M (leading dimension lda); B is indexed as N x K
	 * (leading dimension ldb). C (leading dimension ldc) is updated in
	 * place. Always returns 0.
	 */
	BLASLONG row, col, p;

	for (row = 0; row < M; row++) {
		for (col = 0; col < N; col++) {
			FLOAT dot = 0.0;

			for (p = 0; p < K; p++) {
				dot += A[row*lda + p] * B[p*ldb + col];
			}

			C[row + col*ldc] = C[row + col*ldc] * beta + alpha * dot;
		}
	}

	return 0;
}

View File

@ -0,0 +1,74 @@
/***************************************************************************
Copyright (c) 2020, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
/*
 * Reference small-matrix complex GEMM kernel without a beta term:
 *
 *     C := alpha * op(A) * op(B)
 *
 * C is overwritten; its previous contents are never read.
 *
 * All matrices hold complex values as interleaved (real, imaginary) FLOAT
 * pairs and are indexed column-major; lda, ldb and ldc are counted in
 * complex elements. A is read as an M x K array and B as a K x N array,
 * neither transposed. The macro defined at build time selects conjugation:
 *
 *     NN: A * B              NR: A * conj(B)
 *     RN: conj(A) * B        RR: conj(A) * conj(B)
 *
 * If none of these macros is defined the accumulators stay zero.
 *
 * alpha0 / alpha1 are the real / imaginary parts of alpha.
 * Always returns 0.
 */
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha0, FLOAT alpha1, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc)
{
	FLOAT real, imag;
	/* Same type as the dimensions so counters and offsets never narrow to int. */
	BLASLONG i, j, l;

	for(i = 0; i < M; i++){
		for(j = 0; j < N; j++){
			/* Offset of the real part of C(i,j); the imaginary part follows it. */
			const BLASLONG ic = j*2*ldc + 2*i;

			real = 0;
			imag = 0;

			for(l = 0; l < K; l++){
				/* Offsets of the real parts of A(i,l) and B(l,j). */
				const BLASLONG ia = l*2*lda + 2*i;
				const BLASLONG ib = j*2*ldb + 2*l;
#if defined(NN)
				real += (A[ia]*B[ib]
					-A[ia + 1] * B[ib + 1]);
				imag += (A[ia] * B[ib + 1]
					+ A[ia + 1] * B[ib]);
#elif defined(NR)
				real += (A[ia]*B[ib]
					+A[ia + 1] * B[ib + 1]);
				imag += (-A[ia] * B[ib + 1]
					+ A[ia + 1] * B[ib]);
#elif defined(RN)
				real += (A[ia]*B[ib]
					+A[ia + 1] * B[ib + 1]);
				imag += (A[ia] * B[ib + 1]
					- A[ia + 1] * B[ib]);
#elif defined(RR)
				real += (A[ia]*B[ib]
					-A[ia + 1] * B[ib + 1]);
				imag += (-A[ia] * B[ib + 1]
					- A[ia + 1] * B[ib]);
#else
				/* No variant selected: nothing is accumulated. */
				(void)ia;
				(void)ib;
#endif
			}

			/* C(i,j) = alpha * (real + i*imag) */
			C[ic] = alpha0*real - alpha1*imag;
			C[ic + 1] = alpha0*imag + real*alpha1;
		}
	}

	return 0;
}

View File

@ -0,0 +1,77 @@
/***************************************************************************
Copyright (c) 2020, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
/*
 * Reference small-matrix complex GEMM kernel without a beta term:
 *
 *     C := alpha * op(A) * op(B)
 *
 * C is overwritten; its previous contents are never read.
 *
 * All matrices hold complex values as interleaved (real, imaginary) FLOAT
 * pairs and are indexed column-major; lda, ldb and ldc are counted in
 * complex elements. A is read untransposed as an M x K array; B is indexed
 * as an N x K array and used transposed. The macro defined at build time
 * selects conjugation:
 *
 *     NT: A * B^T            NC: A * conj(B)^T
 *     RT: conj(A) * B^T      RC: conj(A) * conj(B)^T
 *
 * If none of these macros is defined the accumulators stay zero.
 *
 * alpha0 / alpha1 are the real / imaginary parts of alpha.
 * Always returns 0.
 */
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha0, FLOAT alpha1, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc)
{
	FLOAT real, imag;
	/* Same type as the dimensions so counters and offsets never narrow to int. */
	BLASLONG i, j, l;

	for(i = 0; i < M; i++){
		for(j = 0; j < N; j++){
			/* Offset of the real part of C(i,j); the imaginary part follows it. */
			const BLASLONG ic = j*2*ldc + 2*i;

			real = 0;
			imag = 0;

			for(l = 0; l < K; l++){
				/* Offsets of the real parts of A(i,l) and of B stored at (j,l). */
				const BLASLONG ia = l*2*lda + 2*i;
				const BLASLONG ib = l*2*ldb + 2*j;
#if defined(NT)
				real += (A[ia]*B[ib]
					-A[ia + 1] * B[ib + 1]);
				imag += (A[ia] * B[ib + 1]
					+ A[ia + 1] * B[ib]);
#elif defined(NC)
				real += (A[ia]*B[ib]
					+A[ia + 1] * B[ib + 1]);
				imag += (-A[ia] * B[ib + 1]
					+ A[ia + 1] * B[ib]);
#elif defined(RT)
				real += (A[ia]*B[ib]
					+A[ia + 1] * B[ib + 1]);
				imag += (A[ia] * B[ib + 1]
					- A[ia + 1] * B[ib]);
#elif defined(RC)
				real += (A[ia]*B[ib]
					-A[ia + 1] * B[ib + 1]);
				imag += (-A[ia] * B[ib + 1]
					- A[ia + 1] * B[ib]);
#else
				/* No variant selected: nothing is accumulated. */
				(void)ia;
				(void)ib;
#endif
			}

			/* C(i,j) = alpha * (real + i*imag) */
			C[ic] = alpha0*real - alpha1*imag;
			C[ic + 1] = alpha0*imag + real*alpha1;
		}
	}

	return 0;
}

View File

@ -0,0 +1,77 @@
/***************************************************************************
Copyright (c) 2020, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
/*
 * Reference small-matrix complex GEMM kernel without a beta term:
 *
 *     C := alpha * op(A) * op(B)
 *
 * C is overwritten; its previous contents are never read.
 *
 * All matrices hold complex values as interleaved (real, imaginary) FLOAT
 * pairs and are indexed column-major; lda, ldb and ldc are counted in
 * complex elements. A is indexed as a K x M array and used transposed; B is
 * read untransposed as a K x N array. The macro defined at build time
 * selects conjugation:
 *
 *     TN: A^T * B            TR: A^T * conj(B)
 *     CN: conj(A)^T * B      CR: conj(A)^T * conj(B)
 *
 * If none of these macros is defined the accumulators stay zero.
 *
 * alpha0 / alpha1 are the real / imaginary parts of alpha.
 * Always returns 0.
 */
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha0, FLOAT alpha1, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc)
{
	FLOAT real, imag;
	/* Same type as the dimensions so counters and offsets never narrow to int. */
	BLASLONG i, j, l;

	for(i = 0; i < M; i++){
		for(j = 0; j < N; j++){
			/* Offset of the real part of C(i,j); the imaginary part follows it. */
			const BLASLONG ic = j*2*ldc + 2*i;

			real = 0;
			imag = 0;

			for(l = 0; l < K; l++){
				/* Offsets of the real parts of A stored at (l,i) and of B(l,j). */
				const BLASLONG ia = i*2*lda + 2*l;
				const BLASLONG ib = j*2*ldb + 2*l;
#if defined(TN)
				real += (A[ia]*B[ib]
					-A[ia + 1] * B[ib + 1]);
				imag += (A[ia] * B[ib + 1]
					+ A[ia + 1] * B[ib]);
#elif defined(TR)
				real += (A[ia]*B[ib]
					+A[ia + 1] * B[ib + 1]);
				imag += (-A[ia] * B[ib + 1]
					+ A[ia + 1] * B[ib]);
#elif defined(CN)
				real += (A[ia]*B[ib]
					+A[ia + 1] * B[ib + 1]);
				imag += (A[ia] * B[ib + 1]
					- A[ia + 1] * B[ib]);
#elif defined(CR)
				real += (A[ia]*B[ib]
					-A[ia + 1] * B[ib + 1]);
				imag += (-A[ia] * B[ib + 1]
					- A[ia + 1] * B[ib]);
#else
				/* No variant selected: nothing is accumulated. */
				(void)ia;
				(void)ib;
#endif
			}

			/* C(i,j) = alpha * (real + i*imag) */
			C[ic] = alpha0*real - alpha1*imag;
			C[ic + 1] = alpha0*imag + real*alpha1;
		}
	}

	return 0;
}

View File

@ -0,0 +1,77 @@
/***************************************************************************
Copyright (c) 2020, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
/*
 * Reference small-matrix complex GEMM kernel without a beta term:
 *
 *     C := alpha * op(A) * op(B)
 *
 * C is overwritten; its previous contents are never read.
 *
 * All matrices hold complex values as interleaved (real, imaginary) FLOAT
 * pairs and are indexed column-major; lda, ldb and ldc are counted in
 * complex elements. A is indexed as a K x M array and B as an N x K array;
 * both are used transposed. The macro defined at build time selects
 * conjugation:
 *
 *     TT: A^T * B^T          TC: A^T * conj(B)^T
 *     CT: conj(A)^T * B^T    CC: conj(A)^T * conj(B)^T
 *
 * If none of these macros is defined the accumulators stay zero.
 *
 * alpha0 / alpha1 are the real / imaginary parts of alpha.
 * Always returns 0.
 */
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha0, FLOAT alpha1, FLOAT * B, BLASLONG ldb, FLOAT * C, BLASLONG ldc)
{
	FLOAT real, imag;
	/* Same type as the dimensions so counters and offsets never narrow to int. */
	BLASLONG i, j, l;

	for(i = 0; i < M; i++){
		for(j = 0; j < N; j++){
			/* Offset of the real part of C(i,j); the imaginary part follows it. */
			const BLASLONG ic = j*2*ldc + 2*i;

			real = 0;
			imag = 0;

			for(l = 0; l < K; l++){
				/* Offsets of the real parts of A stored at (l,i) and B stored at (j,l). */
				const BLASLONG ia = i*2*lda + 2*l;
				const BLASLONG ib = l*2*ldb + 2*j;
#if defined(TT)
				real += (A[ia]*B[ib]
					-A[ia + 1] * B[ib + 1]);
				imag += (A[ia] * B[ib + 1]
					+ A[ia + 1] * B[ib]);
#elif defined(TC)
				real += (A[ia]*B[ib]
					+A[ia + 1] * B[ib + 1]);
				imag += (-A[ia] * B[ib + 1]
					+ A[ia + 1] * B[ib]);
#elif defined(CT)
				real += (A[ia]*B[ib]
					+A[ia + 1] * B[ib + 1]);
				imag += (A[ia] * B[ib + 1]
					- A[ia + 1] * B[ib]);
#elif defined(CC)
				real += (A[ia]*B[ib]
					-A[ia + 1] * B[ib + 1]);
				imag += (-A[ia] * B[ib + 1]
					- A[ia + 1] * B[ib]);
#else
				/* No variant selected: nothing is accumulated. */
				(void)ia;
				(void)ib;
#endif
			}

			/* C(i,j) = alpha * (real + i*imag) */
			C[ic] = alpha0*real - alpha1*imag;
			C[ic + 1] = alpha0*imag + real*alpha1;
		}
	}

	return 0;
}

View File

@ -0,0 +1,78 @@
/***************************************************************************
Copyright (c) 2020, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
/*
 * Reference small-matrix complex GEMM kernel with accumulation:
 *
 *     C := alpha * op(A) * op(B) + beta * C
 *
 * The previous contents of C are always read, including when beta is zero.
 *
 * All matrices hold complex values as interleaved (real, imaginary) FLOAT
 * pairs and are indexed column-major; lda, ldb and ldc are counted in
 * complex elements. A is read as an M x K array and B as a K x N array,
 * neither transposed. The macro defined at build time selects conjugation:
 *
 *     NN: A * B              NR: A * conj(B)
 *     RN: conj(A) * B        RR: conj(A) * conj(B)
 *
 * If none of these macros is defined the accumulators stay zero and only
 * the beta * C term contributes.
 *
 * alpha0 / alpha1 and beta0 / beta1 are the real / imaginary parts of
 * alpha and beta. Always returns 0.
 */
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha0, FLOAT alpha1, FLOAT * B, BLASLONG ldb, FLOAT beta0, FLOAT beta1, FLOAT * C, BLASLONG ldc)
{
	FLOAT real, imag;
	FLOAT tmp0, tmp1;
	/* Same type as the dimensions so counters and offsets never narrow to int. */
	BLASLONG i, j, l;

	for(i = 0; i < M; i++){
		for(j = 0; j < N; j++){
			/* Offset of the real part of C(i,j); the imaginary part follows it. */
			const BLASLONG ic = j*2*ldc + 2*i;

			real = 0;
			imag = 0;

			for(l = 0; l < K; l++){
				/* Offsets of the real parts of A(i,l) and B(l,j). */
				const BLASLONG ia = l*2*lda + 2*i;
				const BLASLONG ib = j*2*ldb + 2*l;
#if defined(NN)
				real += (A[ia]*B[ib]
					-A[ia + 1] * B[ib + 1]);
				imag += (A[ia] * B[ib + 1]
					+ A[ia + 1] * B[ib]);
#elif defined(NR)
				real += (A[ia]*B[ib]
					+A[ia + 1] * B[ib + 1]);
				imag += (-A[ia] * B[ib + 1]
					+ A[ia + 1] * B[ib]);
#elif defined(RN)
				real += (A[ia]*B[ib]
					+A[ia + 1] * B[ib + 1]);
				imag += (A[ia] * B[ib + 1]
					- A[ia + 1] * B[ib]);
#elif defined(RR)
				real += (A[ia]*B[ib]
					-A[ia + 1] * B[ib + 1]);
				imag += (-A[ia] * B[ib + 1]
					- A[ia + 1] * B[ib]);
#else
				/* No variant selected: nothing is accumulated. */
				(void)ia;
				(void)ib;
#endif
			}

			/* beta * C(i,j), computed from the old value before it is overwritten. */
			tmp0 = beta0*C[ic] - beta1*C[ic + 1];
			tmp1 = beta0*C[ic + 1] + beta1*C[ic];

			/* C(i,j) = beta * C(i,j) + alpha * (real + i*imag) */
			C[ic] = tmp0 + alpha0*real - alpha1*imag;
			C[ic + 1] = tmp1 + alpha0*imag + real*alpha1;
		}
	}

	return 0;
}

View File

@ -0,0 +1,82 @@
/***************************************************************************
Copyright (c) 2020, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
/*
 * Reference small-matrix complex GEMM kernel with accumulation:
 *
 *     C := alpha * op(A) * op(B) + beta * C
 *
 * The previous contents of C are always read, including when beta is zero.
 *
 * All matrices hold complex values as interleaved (real, imaginary) FLOAT
 * pairs and are indexed column-major; lda, ldb and ldc are counted in
 * complex elements. A is read untransposed as an M x K array; B is indexed
 * as an N x K array and used transposed. The macro defined at build time
 * selects conjugation:
 *
 *     NT: A * B^T            NC: A * conj(B)^T
 *     RT: conj(A) * B^T      RC: conj(A) * conj(B)^T
 *
 * If none of these macros is defined the accumulators stay zero and only
 * the beta * C term contributes.
 *
 * alpha0 / alpha1 and beta0 / beta1 are the real / imaginary parts of
 * alpha and beta. Always returns 0.
 */
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha0, FLOAT alpha1, FLOAT * B, BLASLONG ldb, FLOAT beta0, FLOAT beta1, FLOAT * C, BLASLONG ldc)
{
	FLOAT real, imag;
	FLOAT tmp0, tmp1;
	/* Same type as the dimensions so counters and offsets never narrow to int. */
	BLASLONG i, j, l;

	for(i = 0; i < M; i++){
		for(j = 0; j < N; j++){
			/* Offset of the real part of C(i,j); the imaginary part follows it. */
			const BLASLONG ic = j*2*ldc + 2*i;

			real = 0;
			imag = 0;

			for(l = 0; l < K; l++){
				/* Offsets of the real parts of A(i,l) and of B stored at (j,l). */
				const BLASLONG ia = l*2*lda + 2*i;
				const BLASLONG ib = l*2*ldb + 2*j;
#if defined(NT)
				real += (A[ia]*B[ib]
					-A[ia + 1] * B[ib + 1]);
				imag += (A[ia] * B[ib + 1]
					+ A[ia + 1] * B[ib]);
#elif defined(NC)
				real += (A[ia]*B[ib]
					+A[ia + 1] * B[ib + 1]);
				imag += (-A[ia] * B[ib + 1]
					+ A[ia + 1] * B[ib]);
#elif defined(RT)
				real += (A[ia]*B[ib]
					+A[ia + 1] * B[ib + 1]);
				imag += (A[ia] * B[ib + 1]
					- A[ia + 1] * B[ib]);
#elif defined(RC)
				real += (A[ia]*B[ib]
					-A[ia + 1] * B[ib + 1]);
				imag += (-A[ia] * B[ib + 1]
					- A[ia + 1] * B[ib]);
#else
				/* No variant selected: nothing is accumulated. */
				(void)ia;
				(void)ib;
#endif
			}

			/* beta * C(i,j), computed from the old value before it is overwritten. */
			tmp0 = beta0*C[ic] - beta1*C[ic + 1];
			tmp1 = beta0*C[ic + 1] + beta1*C[ic];

			/* C(i,j) = beta * C(i,j) + alpha * (real + i*imag) */
			C[ic] = tmp0 + alpha0*real - alpha1*imag;
			C[ic + 1] = tmp1 + alpha0*imag + real*alpha1;
		}
	}

	return 0;
}

View File

@ -0,0 +1,82 @@
/***************************************************************************
Copyright (c) 2020, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
/*
 * Reference small-matrix complex GEMM kernel with accumulation:
 *
 *     C := alpha * op(A) * op(B) + beta * C
 *
 * The previous contents of C are always read, including when beta is zero.
 *
 * All matrices hold complex values as interleaved (real, imaginary) FLOAT
 * pairs and are indexed column-major; lda, ldb and ldc are counted in
 * complex elements. A is indexed as a K x M array and used transposed; B is
 * read untransposed as a K x N array. The macro defined at build time
 * selects conjugation:
 *
 *     TN: A^T * B            TR: A^T * conj(B)
 *     CN: conj(A)^T * B      CR: conj(A)^T * conj(B)
 *
 * If none of these macros is defined the accumulators stay zero and only
 * the beta * C term contributes.
 *
 * alpha0 / alpha1 and beta0 / beta1 are the real / imaginary parts of
 * alpha and beta. Always returns 0.
 */
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha0, FLOAT alpha1, FLOAT * B, BLASLONG ldb, FLOAT beta0, FLOAT beta1, FLOAT * C, BLASLONG ldc)
{
	FLOAT real, imag;
	FLOAT tmp0, tmp1;
	/* Same type as the dimensions so counters and offsets never narrow to int. */
	BLASLONG i, j, l;

	for(i = 0; i < M; i++){
		for(j = 0; j < N; j++){
			/* Offset of the real part of C(i,j); the imaginary part follows it. */
			const BLASLONG ic = j*2*ldc + 2*i;

			real = 0;
			imag = 0;

			for(l = 0; l < K; l++){
				/* Offsets of the real parts of A stored at (l,i) and of B(l,j). */
				const BLASLONG ia = i*2*lda + 2*l;
				const BLASLONG ib = j*2*ldb + 2*l;
#if defined(TN)
				real += (A[ia]*B[ib]
					-A[ia + 1] * B[ib + 1]);
				imag += (A[ia] * B[ib + 1]
					+ A[ia + 1] * B[ib]);
#elif defined(TR)
				real += (A[ia]*B[ib]
					+A[ia + 1] * B[ib + 1]);
				imag += (-A[ia] * B[ib + 1]
					+ A[ia + 1] * B[ib]);
#elif defined(CN)
				real += (A[ia]*B[ib]
					+A[ia + 1] * B[ib + 1]);
				imag += (A[ia] * B[ib + 1]
					- A[ia + 1] * B[ib]);
#elif defined(CR)
				real += (A[ia]*B[ib]
					-A[ia + 1] * B[ib + 1]);
				imag += (-A[ia] * B[ib + 1]
					- A[ia + 1] * B[ib]);
#else
				/* No variant selected: nothing is accumulated. */
				(void)ia;
				(void)ib;
#endif
			}

			/* beta * C(i,j), computed from the old value before it is overwritten. */
			tmp0 = beta0*C[ic] - beta1*C[ic + 1];
			tmp1 = beta0*C[ic + 1] + beta1*C[ic];

			/* C(i,j) = beta * C(i,j) + alpha * (real + i*imag) */
			C[ic] = tmp0 + alpha0*real - alpha1*imag;
			C[ic + 1] = tmp1 + alpha0*imag + real*alpha1;
		}
	}

	return 0;
}

View File

@ -0,0 +1,82 @@
/***************************************************************************
Copyright (c) 2020, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
/*
 * Reference small-matrix complex GEMM kernel with accumulation:
 *
 *     C := alpha * op(A) * op(B) + beta * C
 *
 * The previous contents of C are always read, including when beta is zero.
 *
 * All matrices hold complex values as interleaved (real, imaginary) FLOAT
 * pairs and are indexed column-major; lda, ldb and ldc are counted in
 * complex elements. A is indexed as a K x M array and B as an N x K array;
 * both are used transposed. The macro defined at build time selects
 * conjugation:
 *
 *     TT: A^T * B^T          TC: A^T * conj(B)^T
 *     CT: conj(A)^T * B^T    CC: conj(A)^T * conj(B)^T
 *
 * If none of these macros is defined the accumulators stay zero and only
 * the beta * C term contributes.
 *
 * alpha0 / alpha1 and beta0 / beta1 are the real / imaginary parts of
 * alpha and beta. Always returns 0.
 */
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha0, FLOAT alpha1, FLOAT * B, BLASLONG ldb, FLOAT beta0, FLOAT beta1, FLOAT * C, BLASLONG ldc)
{
	FLOAT real, imag;
	FLOAT tmp0, tmp1;
	/* Same type as the dimensions so counters and offsets never narrow to int. */
	BLASLONG i, j, l;

	for(i = 0; i < M; i++){
		for(j = 0; j < N; j++){
			/* Offset of the real part of C(i,j); the imaginary part follows it. */
			const BLASLONG ic = j*2*ldc + 2*i;

			real = 0;
			imag = 0;

			for(l = 0; l < K; l++){
				/* Offsets of the real parts of A stored at (l,i) and B stored at (j,l). */
				const BLASLONG ia = i*2*lda + 2*l;
				const BLASLONG ib = l*2*ldb + 2*j;
#if defined(TT)
				real += (A[ia]*B[ib]
					-A[ia + 1] * B[ib + 1]);
				imag += (A[ia] * B[ib + 1]
					+ A[ia + 1] * B[ib]);
#elif defined(TC)
				real += (A[ia]*B[ib]
					+A[ia + 1] * B[ib + 1]);
				imag += (-A[ia] * B[ib + 1]
					+ A[ia + 1] * B[ib]);
#elif defined(CT)
				real += (A[ia]*B[ib]
					+A[ia + 1] * B[ib + 1]);
				imag += (A[ia] * B[ib + 1]
					- A[ia + 1] * B[ib]);
#elif defined(CC)
				real += (A[ia]*B[ib]
					-A[ia + 1] * B[ib + 1]);
				imag += (-A[ia] * B[ib + 1]
					- A[ia + 1] * B[ib]);
#else
				/* No variant selected: nothing is accumulated. */
				(void)ia;
				(void)ib;
#endif
			}

			/* beta * C(i,j), computed from the old value before it is overwritten. */
			tmp0 = beta0*C[ic] - beta1*C[ic + 1];
			tmp1 = beta0*C[ic + 1] + beta1*C[ic];

			/* C(i,j) = beta * C(i,j) + alpha * (real + i*imag) */
			C[ic] = tmp0 + alpha0*real - alpha1*imag;
			C[ic + 1] = tmp1 + alpha0*imag + real*alpha1;
		}
	}

	return 0;
}