Merge pull request #2686 from RajalakshmiSR/p10_shgemm
powerpc: Optimized SHGEMM kernel for POWER10
This commit is contained in:
commit
c2467c9619
|
@ -39,24 +39,24 @@
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
|
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
|
||||||
BLASLONG i, j;
|
BLASLONG i, j;
|
||||||
|
|
||||||
FLOAT *aoffset;
|
IFLOAT *aoffset;
|
||||||
FLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4;
|
IFLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4;
|
||||||
FLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8;
|
IFLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8;
|
||||||
FLOAT *aoffset9, *aoffset10, *aoffset11, *aoffset12;
|
IFLOAT *aoffset9, *aoffset10, *aoffset11, *aoffset12;
|
||||||
FLOAT *aoffset13, *aoffset14, *aoffset15, *aoffset16;
|
IFLOAT *aoffset13, *aoffset14, *aoffset15, *aoffset16;
|
||||||
|
|
||||||
FLOAT *boffset;
|
IFLOAT *boffset;
|
||||||
FLOAT ctemp01, ctemp02, ctemp03, ctemp04;
|
IFLOAT ctemp01, ctemp02, ctemp03, ctemp04;
|
||||||
FLOAT ctemp05, ctemp06, ctemp07, ctemp08;
|
IFLOAT ctemp05, ctemp06, ctemp07, ctemp08;
|
||||||
FLOAT ctemp09, ctemp10, ctemp11, ctemp12;
|
IFLOAT ctemp09, ctemp10, ctemp11, ctemp12;
|
||||||
FLOAT ctemp13, ctemp14, ctemp15, ctemp16;
|
IFLOAT ctemp13, ctemp14, ctemp15, ctemp16;
|
||||||
FLOAT ctemp17, ctemp18, ctemp19, ctemp20;
|
IFLOAT ctemp17, ctemp18, ctemp19, ctemp20;
|
||||||
FLOAT ctemp21, ctemp22, ctemp23, ctemp24;
|
IFLOAT ctemp21, ctemp22, ctemp23, ctemp24;
|
||||||
FLOAT ctemp25, ctemp26, ctemp27, ctemp28;
|
IFLOAT ctemp25, ctemp26, ctemp27, ctemp28;
|
||||||
FLOAT ctemp29, ctemp30, ctemp31, ctemp32;
|
IFLOAT ctemp29, ctemp30, ctemp31, ctemp32;
|
||||||
|
|
||||||
aoffset = a;
|
aoffset = a;
|
||||||
boffset = b;
|
boffset = b;
|
||||||
|
|
|
@ -39,30 +39,30 @@
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
|
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
|
||||||
BLASLONG i, j;
|
BLASLONG i, j;
|
||||||
|
|
||||||
FLOAT *aoffset;
|
IFLOAT *aoffset;
|
||||||
FLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4;
|
IFLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4;
|
||||||
FLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8;
|
IFLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8;
|
||||||
|
|
||||||
FLOAT *boffset;
|
IFLOAT *boffset;
|
||||||
FLOAT ctemp01, ctemp02, ctemp03, ctemp04;
|
IFLOAT ctemp01, ctemp02, ctemp03, ctemp04;
|
||||||
FLOAT ctemp05, ctemp06, ctemp07, ctemp08;
|
IFLOAT ctemp05, ctemp06, ctemp07, ctemp08;
|
||||||
FLOAT ctemp09, ctemp10, ctemp11, ctemp12;
|
IFLOAT ctemp09, ctemp10, ctemp11, ctemp12;
|
||||||
FLOAT ctemp13, ctemp14, ctemp15, ctemp16;
|
IFLOAT ctemp13, ctemp14, ctemp15, ctemp16;
|
||||||
FLOAT ctemp17, ctemp18, ctemp19, ctemp20;
|
IFLOAT ctemp17, ctemp18, ctemp19, ctemp20;
|
||||||
FLOAT ctemp21, ctemp22, ctemp23, ctemp24;
|
IFLOAT ctemp21, ctemp22, ctemp23, ctemp24;
|
||||||
FLOAT ctemp25, ctemp26, ctemp27, ctemp28;
|
IFLOAT ctemp25, ctemp26, ctemp27, ctemp28;
|
||||||
FLOAT ctemp29, ctemp30, ctemp31, ctemp32;
|
IFLOAT ctemp29, ctemp30, ctemp31, ctemp32;
|
||||||
FLOAT ctemp33, ctemp34, ctemp35, ctemp36;
|
IFLOAT ctemp33, ctemp34, ctemp35, ctemp36;
|
||||||
FLOAT ctemp37, ctemp38, ctemp39, ctemp40;
|
IFLOAT ctemp37, ctemp38, ctemp39, ctemp40;
|
||||||
FLOAT ctemp41, ctemp42, ctemp43, ctemp44;
|
IFLOAT ctemp41, ctemp42, ctemp43, ctemp44;
|
||||||
FLOAT ctemp45, ctemp46, ctemp47, ctemp48;
|
IFLOAT ctemp45, ctemp46, ctemp47, ctemp48;
|
||||||
FLOAT ctemp49, ctemp50, ctemp51, ctemp52;
|
IFLOAT ctemp49, ctemp50, ctemp51, ctemp52;
|
||||||
FLOAT ctemp53, ctemp54, ctemp55, ctemp56;
|
IFLOAT ctemp53, ctemp54, ctemp55, ctemp56;
|
||||||
FLOAT ctemp57, ctemp58, ctemp59, ctemp60;
|
IFLOAT ctemp57, ctemp58, ctemp59, ctemp60;
|
||||||
FLOAT ctemp61, ctemp62, ctemp63, ctemp64;
|
IFLOAT ctemp61, ctemp62, ctemp63, ctemp64;
|
||||||
|
|
||||||
|
|
||||||
aoffset = a;
|
aoffset = a;
|
||||||
|
|
|
@ -39,22 +39,22 @@
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
|
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
|
||||||
|
|
||||||
BLASLONG i, j;
|
BLASLONG i, j;
|
||||||
|
|
||||||
FLOAT *aoffset;
|
IFLOAT *aoffset;
|
||||||
FLOAT *aoffset1, *aoffset2;
|
IFLOAT *aoffset1, *aoffset2;
|
||||||
FLOAT *boffset;
|
IFLOAT *boffset;
|
||||||
|
|
||||||
FLOAT ctemp01, ctemp02, ctemp03, ctemp04;
|
IFLOAT ctemp01, ctemp02, ctemp03, ctemp04;
|
||||||
FLOAT ctemp05, ctemp06, ctemp07, ctemp08;
|
IFLOAT ctemp05, ctemp06, ctemp07, ctemp08;
|
||||||
FLOAT ctemp09, ctemp10, ctemp11, ctemp12;
|
IFLOAT ctemp09, ctemp10, ctemp11, ctemp12;
|
||||||
FLOAT ctemp13, ctemp14, ctemp15, ctemp16;
|
IFLOAT ctemp13, ctemp14, ctemp15, ctemp16;
|
||||||
FLOAT ctemp17, ctemp18, ctemp19, ctemp20;
|
IFLOAT ctemp17, ctemp18, ctemp19, ctemp20;
|
||||||
FLOAT ctemp21, ctemp22, ctemp23, ctemp24;
|
IFLOAT ctemp21, ctemp22, ctemp23, ctemp24;
|
||||||
FLOAT ctemp25, ctemp26, ctemp27, ctemp28;
|
IFLOAT ctemp25, ctemp26, ctemp27, ctemp28;
|
||||||
FLOAT ctemp29, ctemp30, ctemp31, ctemp32;
|
IFLOAT ctemp29, ctemp30, ctemp31, ctemp32;
|
||||||
|
|
||||||
aoffset = a;
|
aoffset = a;
|
||||||
boffset = b;
|
boffset = b;
|
||||||
|
|
|
@ -39,32 +39,32 @@
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){
|
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
|
||||||
|
|
||||||
BLASLONG i, j;
|
BLASLONG i, j;
|
||||||
|
|
||||||
FLOAT *aoffset;
|
IFLOAT *aoffset;
|
||||||
FLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4;
|
IFLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4;
|
||||||
FLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8;
|
IFLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8;
|
||||||
|
|
||||||
FLOAT *boffset, *boffset1, *boffset2, *boffset3, *boffset4;
|
IFLOAT *boffset, *boffset1, *boffset2, *boffset3, *boffset4;
|
||||||
|
|
||||||
FLOAT ctemp01, ctemp02, ctemp03, ctemp04;
|
IFLOAT ctemp01, ctemp02, ctemp03, ctemp04;
|
||||||
FLOAT ctemp05, ctemp06, ctemp07, ctemp08;
|
IFLOAT ctemp05, ctemp06, ctemp07, ctemp08;
|
||||||
FLOAT ctemp09, ctemp10, ctemp11, ctemp12;
|
IFLOAT ctemp09, ctemp10, ctemp11, ctemp12;
|
||||||
FLOAT ctemp13, ctemp14, ctemp15, ctemp16;
|
IFLOAT ctemp13, ctemp14, ctemp15, ctemp16;
|
||||||
FLOAT ctemp17, ctemp18, ctemp19, ctemp20;
|
IFLOAT ctemp17, ctemp18, ctemp19, ctemp20;
|
||||||
FLOAT ctemp21, ctemp22, ctemp23, ctemp24;
|
IFLOAT ctemp21, ctemp22, ctemp23, ctemp24;
|
||||||
FLOAT ctemp25, ctemp26, ctemp27, ctemp28;
|
IFLOAT ctemp25, ctemp26, ctemp27, ctemp28;
|
||||||
FLOAT ctemp29, ctemp30, ctemp31, ctemp32;
|
IFLOAT ctemp29, ctemp30, ctemp31, ctemp32;
|
||||||
FLOAT ctemp33, ctemp34, ctemp35, ctemp36;
|
IFLOAT ctemp33, ctemp34, ctemp35, ctemp36;
|
||||||
FLOAT ctemp37, ctemp38, ctemp39, ctemp40;
|
IFLOAT ctemp37, ctemp38, ctemp39, ctemp40;
|
||||||
FLOAT ctemp41, ctemp42, ctemp43, ctemp44;
|
IFLOAT ctemp41, ctemp42, ctemp43, ctemp44;
|
||||||
FLOAT ctemp45, ctemp46, ctemp47, ctemp48;
|
IFLOAT ctemp45, ctemp46, ctemp47, ctemp48;
|
||||||
FLOAT ctemp49, ctemp50, ctemp51, ctemp52;
|
IFLOAT ctemp49, ctemp50, ctemp51, ctemp52;
|
||||||
FLOAT ctemp53, ctemp54, ctemp55, ctemp56;
|
IFLOAT ctemp53, ctemp54, ctemp55, ctemp56;
|
||||||
FLOAT ctemp57, ctemp58, ctemp59, ctemp60;
|
IFLOAT ctemp57, ctemp58, ctemp59, ctemp60;
|
||||||
FLOAT ctemp61, ctemp62, ctemp63, ctemp64;
|
IFLOAT ctemp61, ctemp62, ctemp63, ctemp64;
|
||||||
|
|
||||||
aoffset = a;
|
aoffset = a;
|
||||||
boffset = b;
|
boffset = b;
|
||||||
|
|
|
@ -7,6 +7,17 @@ else
|
||||||
#CGEMM_BETA = ../generic/zgemm_beta.c
|
#CGEMM_BETA = ../generic/zgemm_beta.c
|
||||||
#ZGEMM_BETA = ../generic/zgemm_beta.c
|
#ZGEMM_BETA = ../generic/zgemm_beta.c
|
||||||
|
|
||||||
|
SHGEMM_BETA = ../generic/gemm_beta.c
|
||||||
|
SHGEMMKERNEL = shgemm_kernel_power10.c
|
||||||
|
SHGEMMINCOPY = ../generic/gemm_ncopy_16.c
|
||||||
|
SHGEMMITCOPY = ../generic/gemm_tcopy_16.c
|
||||||
|
SHGEMMONCOPY = ../generic/gemm_ncopy_8.c
|
||||||
|
SHGEMMOTCOPY = ../generic/gemm_tcopy_8.c
|
||||||
|
SHGEMMINCOPYOBJ = shgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
SHGEMMITCOPYOBJ = shgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
SHGEMMONCOPYOBJ = shgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
SHGEMMOTCOPYOBJ = shgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
|
||||||
STRMMKERNEL = sgemm_kernel_power10.c
|
STRMMKERNEL = sgemm_kernel_power10.c
|
||||||
DTRMMKERNEL = dgemm_kernel_power10.c
|
DTRMMKERNEL = dgemm_kernel_power10.c
|
||||||
CTRMMKERNEL = cgemm_kernel_power10.S
|
CTRMMKERNEL = cgemm_kernel_power10.S
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
13
param.h
13
param.h
|
@ -2297,6 +2297,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#if defined(POWER10)
|
||||||
|
#undef SHGEMM_DEFAULT_UNROLL_N
|
||||||
|
#undef SHGEMM_DEFAULT_UNROLL_M
|
||||||
|
#undef SHGEMM_DEFAULT_P
|
||||||
|
#undef SHGEMM_DEFAULT_R
|
||||||
|
#undef SHGEMM_DEFAULT_Q
|
||||||
|
#define SHGEMM_DEFAULT_UNROLL_M 16
|
||||||
|
#define SHGEMM_DEFAULT_UNROLL_N 8
|
||||||
|
#define SHGEMM_DEFAULT_P 832
|
||||||
|
#define SHGEMM_DEFAULT_Q 1026
|
||||||
|
#define SHGEMM_DEFAULT_R 4096
|
||||||
|
#endif
|
||||||
|
|
||||||
#if defined(SPARC) && defined(V7)
|
#if defined(SPARC) && defined(V7)
|
||||||
|
|
||||||
#define SNUMOPT 4
|
#define SNUMOPT 4
|
||||||
|
|
Loading…
Reference in New Issue