Add small marix optimization kernel interface.

make SMALL_MATRIX_OPT=1
This commit is contained in:
Xianyi Zhang 2020-04-28 19:01:36 +08:00
parent 57549f5c92
commit aae6af94bb
11 changed files with 340 additions and 1 deletions

View File

@ -209,6 +209,11 @@ else
ONLY_CBLAS = 0
endif
#For small matrix optimization
ifeq ($(SMALL_MATRIX_OPT), 1)
CCOMMON_OPT += -DSMALL_MATRIX_OPT
endif
# This operation is expensive, so execution should be once.
ifndef GOTOBLAS_MAKEFILE
export GOTOBLAS_MAKEFILE = 1

View File

@ -157,6 +157,12 @@
#define DIMATCOPY_K_RT dimatcopy_k_rt
#define DGEADD_K dgeadd_k
#define DGEMM_SMALL_KERNEL_NN dgemm_small_kernel_nn
#define DGEMM_SMALL_KERNEL_NT dgemm_small_kernel_nt
#define DGEMM_SMALL_KERNEL_TN dgemm_small_kernel_tn
#define DGEMM_SMALL_KERNEL_TT dgemm_small_kernel_tt
#else
#define DAMAX_K gotoblas -> damax_k

View File

@ -515,6 +515,18 @@ int qgemm_kernel(BLASLONG, BLASLONG, BLASLONG, xidouble *, xidouble *, xidouble
int qgemm_kernel(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG);
#endif
#ifdef SMALL_MATRIX_OPT
int sgemm_small_kernel_nn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc);
int sgemm_small_kernel_nt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc);
int sgemm_small_kernel_tn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc);
int sgemm_small_kernel_tt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc);
int dgemm_small_kernel_nn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc);
int dgemm_small_kernel_nt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc);
int dgemm_small_kernel_tn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc);
int dgemm_small_kernel_tt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc);
#endif
int cgemm_kernel_n(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG);
int cgemm_kernel_l(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG);
int cgemm_kernel_r(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG);

View File

@ -644,6 +644,11 @@
#define GEADD_K DGEADD_K
#define GEMM_SMALL_KERNEL_NN DGEMM_SMALL_KERNEL_NN
#define GEMM_SMALL_KERNEL_NT DGEMM_SMALL_KERNEL_NT
#define GEMM_SMALL_KERNEL_TN DGEMM_SMALL_KERNEL_TN
#define GEMM_SMALL_KERNEL_TT DGEMM_SMALL_KERNEL_TT
#elif defined(HALF)
#define AMAX_K SAMAX_K
@ -923,6 +928,11 @@
#define GEADD_K SGEADD_K
#define GEMM_SMALL_KERNEL_NN SGEMM_SMALL_KERNEL_NN
#define GEMM_SMALL_KERNEL_NT SGEMM_SMALL_KERNEL_NT
#define GEMM_SMALL_KERNEL_TN SGEMM_SMALL_KERNEL_TN
#define GEMM_SMALL_KERNEL_TT SGEMM_SMALL_KERNEL_TT
#endif
#else
@ -1228,6 +1238,12 @@
#define IMATCOPY_K_RT SIMATCOPY_K_RT
#define GEADD_K SGEADD_K
#define GEMM_SMALL_KERNEL_NN SGEMM_SMALL_KERNEL_NN
#define GEMM_SMALL_KERNEL_NT SGEMM_SMALL_KERNEL_NT
#define GEMM_SMALL_KERNEL_TN SGEMM_SMALL_KERNEL_TN
#define GEMM_SMALL_KERNEL_TT SGEMM_SMALL_KERNEL_TT
#endif
#else
#ifdef XDOUBLE

View File

@ -160,6 +160,11 @@
#define SGEADD_K sgeadd_k
#define SGEMM_SMALL_KERNEL_NN sgemm_small_kernel_nn
#define SGEMM_SMALL_KERNEL_NT sgemm_small_kernel_nt
#define SGEMM_SMALL_KERNEL_TN sgemm_small_kernel_tn
#define SGEMM_SMALL_KERNEL_TT sgemm_small_kernel_tt
#else
#define SAMAX_K gotoblas -> samax_k

View File

@ -103,6 +103,18 @@ static int (*gemm[])(blas_arg_t *, BLASLONG *, BLASLONG *, IFLOAT *, IFLOAT *, B
#endif
};
#ifdef SMALL_MATRIX_OPT
//Only support s/dgemm small matrix optimiztion so far.
static int (*gemm_small_kernel[])(BLASLONG, BLASLONG, BLASLONG, FLOAT *, BLASLONG, FLOAT ,FLOAT *, BLASLONG, FLOAT, FLOAT *, BLASLONG) = {
#ifndef GEMM3M
#ifndef COMPLEX
GEMM_SMALL_KERNEL_NN, GEMM_SMALL_KERNEL_TN, NULL, NULL,
GEMM_SMALL_KERNEL_NT, GEMM_SMALL_KERNEL_TT, NULL, NULL,
#endif
#endif
};
#endif
#ifndef CBLAS
void NAME(char *TRANSA, char *TRANSB,
@ -411,6 +423,20 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
FUNCTION_PROFILE_START();
MNK = (double) args.m * (double) args.n * (double) args.k;
#ifdef SMALL_MATRIX_OPT
#if !defined(COMPLEX)
//need to tune small matrices cases.
if(MNK <= 100.0*100.0*100.0){
(gemm_small_kernel[(transb << 2) | transa])(args.m, args.n, args.k, args.a, args.lda, *(FLOAT *)(args.alpha), args.b,
args.ldb, *(FLOAT *)(args.beta), args.c, args.ldc);
return;
}
#endif
#endif
buffer = (XFLOAT *)blas_memory_alloc(0);
sa = (XFLOAT *)((BLASLONG)buffer +GEMM_OFFSET_A);
@ -420,7 +446,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
mode |= (transa << BLAS_TRANSA_SHIFT);
mode |= (transb << BLAS_TRANSB_SHIFT);
MNK = (double) args.m * (double) args.n * (double) args.k;
if ( MNK <= (SMP_THRESHOLD_MIN * (double) GEMM_MULTITHREAD_THRESHOLD) )
args.nthreads = 1;
else

View File

@ -371,6 +371,19 @@ XBLASOBJS += \
endif
###### BLAS small matrix optimization #####
ifeq ($(SMALL_MATRIX_OPT), 1)
SBLASOBJS += \
sgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) sgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) \
sgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) sgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX)
DBLASOBJS += \
dgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) dgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) \
dgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) dgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX)
endif
###### BLAS extensions #####
SBLASOBJS += \
somatcopy_k_cn$(TSUFFIX).$(SUFFIX) somatcopy_k_rn$(TSUFFIX).$(SUFFIX) \
@ -4075,3 +4088,63 @@ endif
$(KDIR)zgeadd_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEADD_K)
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -UROWM $< -o $@
###### BLAS small matrix optimization #####
ifndef DGEMM_SAMLL_K_NN
DGEMM_SAMLL_K_NN = ../generic/gemm_small_matrix_kernel_nn.c
endif
ifndef DGEMM_SAMLL_K_NT
DGEMM_SAMLL_K_NT = ../generic/gemm_small_matrix_kernel_nt.c
endif
ifndef DGEMM_SAMLL_K_TN
DGEMM_SAMLL_K_TN = ../generic/gemm_small_matrix_kernel_tn.c
endif
ifndef DGEMM_SAMLL_K_TT
DGEMM_SAMLL_K_TT = ../generic/gemm_small_matrix_kernel_tt.c
endif
$(KDIR)dgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SAMLL_K_NN)
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
$(KDIR)dgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SAMLL_K_NT)
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
$(KDIR)dgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SAMLL_K_TN)
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
$(KDIR)dgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMM_SAMLL_K_TT)
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
ifndef SGEMM_SAMLL_K_NN
SGEMM_SAMLL_K_NN = ../generic/gemm_small_matrix_kernel_nn.c
endif
ifndef SGEMM_SAMLL_K_NT
SGEMM_SAMLL_K_NT = ../generic/gemm_small_matrix_kernel_nt.c
endif
ifndef SGEMM_SAMLL_K_TN
SGEMM_SAMLL_K_TN = ../generic/gemm_small_matrix_kernel_tn.c
endif
ifndef SGEMM_SAMLL_K_TT
SGEMM_SAMLL_K_TT = ../generic/gemm_small_matrix_kernel_tt.c
endif
$(KDIR)sgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SAMLL_K_NN)
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
$(KDIR)sgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SAMLL_K_NT)
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
$(KDIR)sgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SAMLL_K_TN)
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
$(KDIR)sgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_SAMLL_K_TT)
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@

View File

@ -0,0 +1,49 @@
/***************************************************************************
Copyright (c) 2020, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc)
{
//naive implemtation
//Column major
BLASLONG i,j,k;
FLOAT result=0.0;
for(i=0; i<M; i++){
for(j=0; j<N; j++){
result=0.0;
for(k=0; k<K; k++){
result += A[i+k*lda] * B[k+j*ldb];
}
C[i+j*ldc]=C[i+j*ldc] * beta + alpha * result;
}
}
return 0;
}

View File

@ -0,0 +1,49 @@
/***************************************************************************
Copyright (c) 2020, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc)
{
//naive implemtation
//Column major
BLASLONG i,j,k;
FLOAT result=0.0;
for(i=0; i<M; i++){
for(j=0; j<N; j++){
result=0.0;
for(k=0; k<K; k++){
result += A[i+k*lda] * B[k*ldb+j];
}
C[i+j*ldc]=C[i+j*ldc] * beta + alpha * result;
}
}
return 0;
}

View File

@ -0,0 +1,49 @@
/***************************************************************************
Copyright (c) 2020, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc)
{
//naive implemtation
//Column major
BLASLONG i,j,k;
FLOAT result=0.0;
for(i=0; i<M; i++){
for(j=0; j<N; j++){
result=0.0;
for(k=0; k<K; k++){
result += A[i*lda+k] * B[k+j*ldb];
}
C[i+j*ldc]=C[i+j*ldc] * beta + alpha * result;
}
}
return 0;
}

View File

@ -0,0 +1,49 @@
/***************************************************************************
Copyright (c) 2020, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT * A, BLASLONG lda, FLOAT alpha, FLOAT * B, BLASLONG ldb, FLOAT beta, FLOAT * C, BLASLONG ldc)
{
//naive implemtation
//Column major
BLASLONG i,j,k;
FLOAT result=0.0;
for(i=0; i<M; i++){
for(j=0; j<N; j++){
result=0.0;
for(k=0; k<K; k++){
result += A[i*lda+k] * B[k*ldb+j];
}
C[i+j*ldc]=C[i+j*ldc] * beta + alpha * result;
}
}
return 0;
}