diff --git a/driver/level3/Makefile b/driver/level3/Makefile index b8465d4ed..c30483842 100644 --- a/driver/level3/Makefile +++ b/driver/level3/Makefile @@ -37,7 +37,7 @@ SBLASOBJS += \ ssyrk_UN.$(SUFFIX) ssyrk_UT.$(SUFFIX) ssyrk_LN.$(SUFFIX) ssyrk_LT.$(SUFFIX) \ ssyr2k_UN.$(SUFFIX) ssyr2k_UT.$(SUFFIX) ssyr2k_LN.$(SUFFIX) ssyr2k_LT.$(SUFFIX) \ ssyrk_kernel_U.$(SUFFIX) ssyrk_kernel_L.$(SUFFIX) \ - ssyr2k_kernel_U.$(SUFFIX) ssyr2k_kernel_L.$(SUFFIX) + ssyr2k_kernel_U.$(SUFFIX) ssyr2k_kernel_L.$(SUFFIX) sgemm_batch_thread.$(SUFFIX) DBLASOBJS += \ dgemm_nn.$(SUFFIX) dgemm_nt.$(SUFFIX) dgemm_tn.$(SUFFIX) dgemm_tt.$(SUFFIX) \ @@ -53,7 +53,7 @@ DBLASOBJS += \ dsyrk_UN.$(SUFFIX) dsyrk_UT.$(SUFFIX) dsyrk_LN.$(SUFFIX) dsyrk_LT.$(SUFFIX) \ dsyr2k_UN.$(SUFFIX) dsyr2k_UT.$(SUFFIX) dsyr2k_LN.$(SUFFIX) dsyr2k_LT.$(SUFFIX) \ dsyrk_kernel_U.$(SUFFIX) dsyrk_kernel_L.$(SUFFIX) \ - dsyr2k_kernel_U.$(SUFFIX) dsyr2k_kernel_L.$(SUFFIX) + dsyr2k_kernel_U.$(SUFFIX) dsyr2k_kernel_L.$(SUFFIX) dgemm_batch_thread.$(SUFFIX) QBLASOBJS += \ qgemm_nn.$(SUFFIX) qgemm_nt.$(SUFFIX) qgemm_tn.$(SUFFIX) qgemm_tt.$(SUFFIX) \ @@ -103,7 +103,7 @@ CBLASOBJS += \ cherk_kernel_LN.$(SUFFIX) cherk_kernel_LC.$(SUFFIX) \ csyr2k_kernel_U.$(SUFFIX) csyr2k_kernel_L.$(SUFFIX) \ cher2k_kernel_UN.$(SUFFIX) cher2k_kernel_UC.$(SUFFIX) \ - cher2k_kernel_LN.$(SUFFIX) cher2k_kernel_LC.$(SUFFIX) + cher2k_kernel_LN.$(SUFFIX) cher2k_kernel_LC.$(SUFFIX) cgemm_batch_thread.$(SUFFIX) ZBLASOBJS += \ zgemm_nn.$(SUFFIX) zgemm_cn.$(SUFFIX) zgemm_tn.$(SUFFIX) zgemm_nc.$(SUFFIX) \ @@ -137,7 +137,7 @@ ZBLASOBJS += \ zherk_kernel_LN.$(SUFFIX) zherk_kernel_LC.$(SUFFIX) \ zsyr2k_kernel_U.$(SUFFIX) zsyr2k_kernel_L.$(SUFFIX) \ zher2k_kernel_UN.$(SUFFIX) zher2k_kernel_UC.$(SUFFIX) \ - zher2k_kernel_LN.$(SUFFIX) zher2k_kernel_LC.$(SUFFIX) + zher2k_kernel_LN.$(SUFFIX) zher2k_kernel_LC.$(SUFFIX) zgemm_batch_thread.$(SUFFIX) XBLASOBJS += \ @@ -2942,6 +2942,21 @@ gemm_thread_variable.$(PSUFFIX) : gemm_thread_variable.c ../../common.h 
beta_thread.$(PSUFFIX) : beta_thread.c ../../common.h $(CC) -c $(PFLAGS) $< -o $(@F) +sbgemm_batch_thread.$(SUFFIX) : gemm_batch_thread.c ../../common.h + $(CC) -c $(CFLAGS) $< -o $(@F) + +sgemm_batch_thread.$(SUFFIX) : gemm_batch_thread.c ../../common.h + $(CC) -c $(CFLAGS) $< -o $(@F) + +dgemm_batch_thread.$(SUFFIX) : gemm_batch_thread.c ../../common.h + $(CC) -c $(CFLAGS) $< -o $(@F) + +cgemm_batch_thread.$(SUFFIX) : gemm_batch_thread.c ../../common.h + $(CC) -c $(CFLAGS) $< -o $(@F) + +zgemm_batch_thread.$(SUFFIX) : gemm_batch_thread.c ../../common.h + $(CC) -c $(CFLAGS) $< -o $(@F) + sbgemm_thread_nn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHALF -UDOUBLE -UCOMPLEX -DNN $< -o $(@F) diff --git a/driver/level3/gemm_batch_thread.c b/driver/level3/gemm_batch_thread.c new file mode 100644 index 000000000..45d6977ba --- /dev/null +++ b/driver/level3/gemm_batch_thread.c @@ -0,0 +1,156 @@ +/***************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "common.h" + +void openblas_warning(int verbose, const char * msg); +/* inner_small_matrix_thread: calls the small-matrix GEMM kernel stored in args->routine, choosing the function-pointer type from args->routine_mode and passing m, n, k, a, lda, alpha, b, ldb, (beta,) c, ldc taken from args. range_m, range_n, sa, sb and mypos are not referenced in the body. Returns 0 after a kernel call, 1 when no small-matrix flag is set in routine_mode. */ +#ifdef SMALL_MATRIX_OPT +static int inner_small_matrix_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, IFLOAT *sa, IFLOAT *sb, BLASLONG mypos){ + int routine_mode; +#ifndef COMPLEX /* real build: kernel pointer types take alpha and beta as single FLOAT values */ + int (*gemm_small_kernel)(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT ,FLOAT *, BLASLONG, FLOAT, FLOAT *, BLASLONG); + int (*gemm_small_kernel_b0)(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG); +#else /* complex build: kernel pointer types take each scalar as two FLOAT arguments */ + int (*zgemm_small_kernel)(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT , FLOAT, FLOAT *, BLASLONG, FLOAT , FLOAT, FLOAT *, BLASLONG); + int (*zgemm_small_kernel_b0)(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT , FLOAT, FLOAT *, BLASLONG, FLOAT *, BLASLONG); + FLOAT alpha[2], beta[2]; +#endif + routine_mode=args->routine_mode; + if((routine_mode & BLAS_SMALL_B0_OPT) == BLAS_SMALL_B0_OPT){ /* every BLAS_SMALL_B0_OPT bit is set: use the kernel form that takes no beta argument */ +#ifndef COMPLEX + gemm_small_kernel_b0=args->routine; + gemm_small_kernel_b0(args->m, args->n, args->k, args->a, args->lda, *(FLOAT *)(args->alpha), args->b, args->ldb, args->c, args->ldc);
+#else + zgemm_small_kernel_b0=args->routine; + alpha[0] = *((FLOAT *)args -> alpha + 0); + alpha[1] = *((FLOAT *)args -> alpha + 1); /* alpha is copied from two consecutive FLOAT slots of args->alpha (presumably real then imaginary part; confirm) */ + zgemm_small_kernel_b0(args->m, args->n, args->k, args->a, args->lda, alpha[0], alpha[1], args->b, args->ldb, args->c, args->ldc); +#endif + return(0); + }else if(routine_mode & BLAS_SMALL_OPT){ /* otherwise any BLAS_SMALL_OPT bit: use the kernel form that also receives beta */ +#ifndef COMPLEX + gemm_small_kernel=args->routine; + gemm_small_kernel(args->m, args->n, args->k, args->a, args->lda, *(FLOAT *)(args->alpha), args->b, args->ldb, *(FLOAT *)(args->beta), args->c, args->ldc); +#else + zgemm_small_kernel=args->routine; + alpha[0] = *((FLOAT *)args -> alpha + 0); + alpha[1] = *((FLOAT *)args -> alpha + 1); + beta[0] = *((FLOAT *)args -> beta + 0); + beta[1] = *((FLOAT *)args -> beta + 1); + zgemm_small_kernel(args->m, args->n, args->k, args->a, args->lda, alpha[0], alpha[1], args->b, args->ldb, beta[0], beta[1], args->c, args->ldc); +#endif + return(0); + } + return(1); /* neither flag matched, so no kernel was called */ +} +#endif /* SMALL_MATRIX_OPT */ +/* CNAME: batch driver entry point taking args_array (presumably an array of nums argument records; confirm against the caller). Returns 0 at once when nums <= 0; otherwise takes a buffer from blas_memory_alloc(0), derives the sa and sb pointers from it, runs the batch, releases the buffer with blas_memory_free and returns 0. With SMP defined the thread count comes from num_cpu_avail(3), and in the branch visible here queue entries are handed to exec_blas in groups of current_nums. NOTE(review): this copy is damaged between the header of the single-thread for loop and the current_nums assignment (a span of text is missing), so the loop bodies and the queue setup cannot be documented from here; restore them from the upstream file before applying this hunk. */ +int CNAME(blas_arg_t * args_array, BLASLONG nums){ + XFLOAT *buffer; + XFLOAT *sa, *sb; + int nthreads=1; + int (*routine)(blas_arg_t *, void *, void *, XFLOAT *, XFLOAT *, BLASLONG); + int i=0, /*j,*/ current_nums; + +#ifdef SMP + blas_queue_t * queue=NULL; +#endif + + if(nums <=0 ) return 0; /* empty batch: return before any buffer is allocated */ + + buffer = (XFLOAT *)blas_memory_alloc(0); + sa = (XFLOAT *)((BLASLONG)buffer +GEMM_OFFSET_A); /* sa starts GEMM_OFFSET_A bytes into buffer (integer address arithmetic) */ + sb = (XFLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); /* sb follows sa by GEMM_P*GEMM_Q*COMPSIZE*SIZE bytes rounded using GEMM_ALIGN as a mask, plus GEMM_OFFSET_B */ + +#ifdef SMP + nthreads=num_cpu_avail(3); + + if(nthreads==1){ + +#endif + //single thread + for(i=0; inthreads)? nthreads: (nums-i); + + queue[i].sa=sa; + queue[i].sb=sb; /* only the first entry of the group receives sa and sb */ + queue[i+current_nums-1].next=NULL; /* end the chain at the last entry of this group */ + + exec_blas(current_nums, &queue[i]); + } + free(queue); + } +#endif + blas_memory_free(buffer); + return 0; +}