From ff40a4e7262d0e401c8c57032b0caf1058043d94 Mon Sep 17 00:00:00 2001 From: "jayfely@qq.com" Date: Tue, 10 Mar 2020 14:22:18 +0800 Subject: [PATCH 01/19] Add benchmark for SPMV --- benchmark/Makefile | 740 ++++++--------------------------------------- benchmark/spmv.c | 219 ++++++++++++++ interface/Makefile | 4 +- 3 files changed, 321 insertions(+), 642 deletions(-) create mode 100644 benchmark/spmv.c diff --git a/benchmark/Makefile b/benchmark/Makefile index 2db873e95..ccddb55e8 100644 --- a/benchmark/Makefile +++ b/benchmark/Makefile @@ -12,9 +12,9 @@ include $(TOPDIR)/Makefile.system # ACML 6.1 custom ACML=/home/saar/acml6.1/gfortran64_mp/lib LIBACML = -fopenmp $(ACML)/libacml_mp.so -lgfortran -lm + - -# Atlas Ubuntu +# Atlas Ubuntu #ATLAS=/usr/lib/atlas-base #LIBATLAS = -fopenmp $(ATLAS)/liblapack_atlas.a $(ATLAS)/libptcblas.a $(ATLAS)/libptf77blas.a $(ATLAS)/libatlas.a -lgfortran -lm @@ -56,16 +56,11 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ sgemm.goto dgemm.goto cgemm.goto zgemm.goto \ strmm.goto dtrmm.goto ctrmm.goto ztrmm.goto \ strsm.goto dtrsm.goto ctrsm.goto ztrsm.goto \ - sspr.goto dspr.goto \ - sspr2.goto dspr2.goto \ - ssyr.goto dsyr.goto \ - ssyr2.goto dsyr2.goto \ ssyrk.goto dsyrk.goto csyrk.goto zsyrk.goto \ ssyr2k.goto dsyr2k.goto csyr2k.goto zsyr2k.goto \ sger.goto dger.goto cger.goto zger.goto \ sdot.goto ddot.goto \ - srot.goto drot.goto csrot.goto zdrot.goto \ - srotm.goto drotm.goto \ + srot.goto drot.goto \ saxpy.goto daxpy.goto caxpy.goto zaxpy.goto \ scopy.goto dcopy.goto ccopy.goto zcopy.goto \ sswap.goto dswap.goto cswap.goto zswap.goto \ @@ -73,36 +68,26 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ sasum.goto dasum.goto casum.goto zasum.goto \ ssymv.goto dsymv.goto csymv.goto zsymv.goto \ chemv.goto zhemv.goto \ - chbmv.goto zhbmv.goto \ - chpmv.goto zhpmv.goto \ chemm.goto zhemm.goto \ cherk.goto zherk.goto \ cher2k.goto zher2k.goto \ sgemv.goto dgemv.goto cgemv.goto zgemv.goto \ - strmv.goto dtrmv.goto ctrmv.goto ztrmv.goto \ - strsv.goto dtrsv.goto ctrsv.goto ztrsv.goto \ + sspmv.goto dspmv.goto cspmv.goto zspmv.goto \ sgeev.goto dgeev.goto cgeev.goto zgeev.goto \ sgesv.goto dgesv.goto cgesv.goto zgesv.goto \ sgetri.goto dgetri.goto cgetri.goto zgetri.goto \ spotrf.goto dpotrf.goto cpotrf.goto zpotrf.goto \ - ssymm.goto dsymm.goto csymm.goto zsymm.goto \ - saxpby.goto daxpby.goto caxpby.goto zaxpby.goto + ssymm.goto dsymm.goto csymm.goto zsymm.goto acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ scholesky.acml dcholesky.acml ccholesky.acml zcholesky.acml \ sgemm.acml dgemm.acml cgemm.acml zgemm.acml \ strmm.acml dtrmm.acml ctrmm.acml ztrmm.acml \ strsm.acml dtrsm.acml ctrsm.acml ztrsm.acml \ - sspr.acml dspr.acml \ - sspr2.acml dspr2.acml \ - ssyr.acml dsyr.acml \ - ssyr2.acml dsyr2.acml \ ssyrk.acml dsyrk.acml csyrk.acml zsyrk.acml \ ssyr2k.acml dsyr2k.acml csyr2k.acml zsyr2k.acml \ sger.acml dger.acml cger.acml zger.acml \ sdot.acml ddot.acml \ - srot.acml drot.acml csrot.acml zdrot.acml \ - srotm.acml drotm.acml \ saxpy.acml daxpy.acml caxpy.acml zaxpy.acml \ scopy.acml dcopy.acml ccopy.acml zcopy.acml \ sswap.acml dswap.acml cswap.acml zswap.acml \ @@ -110,36 +95,26 @@ acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ sasum.acml dasum.acml casum.acml zasum.acml \ ssymv.acml dsymv.acml csymv.acml zsymv.acml \ chemv.acml zhemv.acml \ - chbmv.acml zhbmv.acml \ - chpmv.acml zhpmv.acml \ chemm.acml zhemm.acml \ cherk.acml zherk.acml \ cher2k.acml zher2k.acml \ sgemv.acml dgemv.acml cgemv.acml zgemv.acml \ - strmv.acml dtrmv.acml ctrmv.acml ztrmv.acml \ - strsv.acml dtrsv.acml ctrsv.acml ztrsv.acml \ + sspmv.acml dspmv.acml cspmv.acml zspmv.acml \ sgeev.acml dgeev.acml cgeev.acml zgeev.acml \ sgesv.acml dgesv.acml cgesv.acml zgesv.acml \ sgetri.acml dgetri.acml cgetri.acml zgetri.acml \ spotrf.acml dpotrf.acml cpotrf.acml zpotrf.acml \ - ssymm.acml dsymm.acml csymm.acml zsymm.acml \ - saxpby.acml daxpby.acml caxpby.acml zaxpby.acml + ssymm.acml dsymm.acml csymm.acml zsymm.acml atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ scholesky.atlas dcholesky.atlas ccholesky.atlas zcholesky.atlas \ sgemm.atlas dgemm.atlas cgemm.atlas zgemm.atlas \ strmm.atlas dtrmm.atlas ctrmm.atlas ztrmm.atlas \ strsm.atlas dtrsm.atlas ctrsm.atlas ztrsm.atlas \ - sspr.atlas dspr.atlas \ - sspr2.atlas dspr2.atlas \ - ssyr.atlas dsyr.atlas \ - ssyr2.atlas dsyr2.atlas \ ssyrk.atlas dsyrk.atlas csyrk.atlas zsyrk.atlas \ ssyr2k.atlas dsyr2k.atlas csyr2k.atlas zsyr2k.atlas \ sger.atlas dger.atlas cger.atlas zger.atlas\ sdot.atlas ddot.atlas \ - srot.atlas drot.atlas csrot.atlas zdrot.atlas \ - srotm.atlas drotm.atlas \ saxpy.atlas daxpy.atlas caxpy.atlas zaxpy.atlas \ scopy.atlas dcopy.atlas ccopy.atlas zcopy.atlas \ sswap.atlas dswap.atlas cswap.atlas zswap.atlas \ @@ -147,37 +122,27 @@ atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ sasum.atlas dasum.atlas casum.atlas zasum.atlas \ ssymv.atlas dsymv.atlas csymv.atlas zsymv.atlas \ chemv.atlas zhemv.atlas \ - chbmv.atlas zhbmv.atlas \ - chpmv.atlas zhpmv.atlas \ chemm.acml zhemm.acml \ chemm.atlas zhemm.atlas \ cherk.atlas zherk.atlas \ cher2k.atlas zher2k.atlas \ sgemv.atlas dgemv.atlas cgemv.atlas zgemv.atlas \ - strmv.atlas dtrmv.atlas ctrmv.atlas ztrmv.atlas \ - strsv.atlas dtrsv.atlas ctrsv.atlas ztrsv.atlas \ + sspmv.atlas dspmv.atlas cspmv.atlas zspmv.atlas \ sgeev.atlas dgeev.atlas cgeev.atlas zgeev.atlas \ sgesv.atlas dgesv.atlas cgesv.atlas zgesv.atlas \ sgetri.atlas dgetri.atlas cgetri.atlas zgetri.atlas \ spotrf.atlas dpotrf.atlas cpotrf.atlas zpotrf.atlas \ - ssymm.atlas dsymm.atlas csymm.atlas zsymm.atlas \ - saxpby.atlas daxpby.atlas caxpby.atlas zaxpby.atlas + ssymm.atlas dsymm.atlas csymm.atlas zsymm.atlas mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ scholesky.mkl dcholesky.mkl ccholesky.mkl zcholesky.mkl \ sgemm.mkl dgemm.mkl cgemm.mkl zgemm.mkl \ strmm.mkl dtrmm.mkl ctrmm.mkl ztrmm.mkl \ strsm.mkl dtrsm.mkl ctrsm.mkl ztrsm.mkl \ - sspr.mkl dspr.mkl \ - sspr2.mkl dspr2.mkl \ - ssyr.mkl dsyr.mkl \ - ssyr2.mkl dsyr2.mkl \ ssyrk.mkl dsyrk.mkl csyrk.mkl zsyrk.mkl \ ssyr2k.mkl dsyr2k.mkl csyr2k.mkl zsyr2k.mkl \ sger.mkl dger.mkl cger.mkl zger.mkl \ sdot.mkl ddot.mkl \ - srot.mkl drot.mkl csrot.mkl zdrot.mkl \ - srotm.mkl drotm.mkl \ saxpy.mkl daxpy.mkl caxpy.mkl zaxpy.mkl \ scopy.mkl dcopy.mkl ccopy.mkl zcopy.mkl \ sswap.mkl dswap.mkl cswap.mkl zswap.mkl \ @@ -185,36 +150,27 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ sasum.mkl dasum.mkl casum.mkl zasum.mkl \ ssymv.mkl dsymv.mkl csymv.mkl zsymv.mkl \ chemv.mkl zhemv.mkl \ - chbmv.mkl zhbmv.mkl \ - chpmv.mkl zhpmv.mkl \ chemm.mkl zhemm.mkl \ cherk.mkl zherk.mkl \ cher2k.mkl zher2k.mkl \ sgemv.mkl dgemv.mkl cgemv.mkl zgemv.mkl \ - strmv.mkl dtrmv.mkl ctrmv.mkl ztrmv.mkl \ - strsv.mkl dtrsv.mkl ctrsv.mkl ztrsv.mkl \ + sspmv.mkl dspmv.mkl cspmv.mkl zspmv.mkl \ sgeev.mkl dgeev.mkl cgeev.mkl zgeev.mkl \ sgesv.mkl dgesv.mkl cgesv.mkl zgesv.mkl \ sgetri.mkl dgetri.mkl cgetri.mkl zgetri.mkl \ spotrf.mkl dpotrf.mkl cpotrf.mkl zpotrf.mkl \ - ssymm.mkl dsymm.mkl csymm.mkl zsymm.mkl \ - saxpby.mkl daxpby.mkl caxpby.mkl zaxpby.mkl + ssymm.mkl dsymm.mkl csymm.mkl zsymm.mkl else goto :: sgemm.goto dgemm.goto cgemm.goto zgemm.goto \ strmm.goto dtrmm.goto ctrmm.goto ztrmm.goto \ strsm.goto dtrsm.goto ctrsm.goto ztrsm.goto \ - sspr.goto dspr.goto \ - sspr2.goto dspr2.goto \ - ssyr.goto dsyr.goto \ - ssyr2.goto dsyr2.goto \ ssyrk.goto dsyrk.goto csyrk.goto zsyrk.goto \ ssyr2k.goto dsyr2k.goto csyr2k.goto zsyr2k.goto \ sger.goto dger.goto cger.goto zger.goto \ sdot.goto ddot.goto cdot.goto zdot.goto \ - srot.goto drot.goto csrot.goto zdrot.goto \ - srotm.goto drotm.goto \ + srot.goto drot.goto \ saxpy.goto daxpy.goto caxpy.goto zaxpy.goto \ scopy.goto dcopy.goto ccopy.goto zcopy.goto \ sswap.goto dswap.goto cswap.goto zswap.goto \ @@ -222,14 +178,11 @@ goto :: sgemm.goto dgemm.goto cgemm.goto zgemm.goto \ sasum.goto dasum.goto casum.goto zasum.goto \ ssymv.goto dsymv.goto \ chemv.goto zhemv.goto \ - chbmv.goto zhbmv.goto \ - chpmv.goto zhpmv.goto \ chemm.goto zhemm.goto \ cherk.goto zherk.goto \ cher2k.goto zher2k.goto \ + sspmv.goto dspmv.goto cspmv.goto zspmv.goto \ sgemv.goto dgemv.goto cgemv.goto zgemv.goto \ - strmv.goto dtrmv.goto ctrmv.goto ztrmv.goto \ - strsv.goto dtrsv.goto ctrsv.goto ztrsv.goto \ ssymm.goto dsymm.goto csymm.goto zsymm.goto \ smallscaling \ isamax.goto idamax.goto icamax.goto izamax.goto \ @@ -240,7 +193,6 @@ goto :: sgemm.goto dgemm.goto cgemm.goto zgemm.goto \ smax.goto dmax.goto \ samin.goto damin.goto camin.goto zamin.goto \ smin.goto dmin.goto \ - saxpby.goto daxpby.goto caxpby.goto zaxpby.goto \ snrm2.goto dnrm2.goto scnrm2.goto dznrm2.goto $(GOTO_LAPACK_TARGETS) acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ @@ -248,16 +200,10 @@ acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ sgemm.acml dgemm.acml cgemm.acml zgemm.acml \ strmm.acml dtrmm.acml ctrmm.acml ztrmm.acml \ strsm.acml dtrsm.acml ctrsm.acml ztrsm.acml \ - sspr.acml dspr.acml \ - sspr2.acml dspr2.acml \ - ssyr.acml dsyr.acml \ - ssyr2.acml dsyr2.acml \ ssyrk.acml dsyrk.acml csyrk.acml zsyrk.acml \ ssyr2k.acml dsyr2k.acml csyr2k.acml zsyr2k.acml \ sger.acml dger.acml cger.acml zger.acml \ sdot.acml ddot.acml \ - srot.acml drot.acml csrot.acml zdrot.acml \ - srotm.acml drotm.acml \ saxpy.acml daxpy.acml caxpy.acml zaxpy.acml \ scopy.acml dcopy.acml ccopy.acml zcopy.acml \ sswap.acml dswap.acml cswap.acml zswap.acml \ @@ -265,36 +211,26 @@ acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ sasum.acml dasum.acml casum.acml zasum.acml \ ssymv.acml dsymv.acml csymv.acml zsymv.acml \ chemv.acml zhemv.acml \ - chbmv.acml zhbmv.acml \ - chpmv.acml zhpmv.acml \ chemm.acml zhemm.acml \ cherk.acml zherk.acml \ cher2k.acml zher2k.acml \ sgemv.acml dgemv.acml cgemv.acml zgemv.acml \ - strmv.acml dtrmv.acml ctrmv.acml ztrmv.acml \ - strsv.acml dtrsv.acml ctrsv.acml ztrsv.acml \ + sspmv.acml dspmv.acml cspmv.acml zspmv.acml \ sgeev.acml dgeev.acml cgeev.acml zgeev.acml \ sgesv.acml dgesv.acml cgesv.acml zgesv.acml \ sgetri.acml dgetri.acml cgetri.acml zgetri.acml \ spotrf.acml dpotrf.acml cpotrf.acml zpotrf.acml \ - ssymm.acml dsymm.acml csymm.acml zsymm.acml \ - saxpby.acml daxpby.acml caxpby.acml zaxpby.acml + ssymm.acml dsymm.acml csymm.acml zsymm.acml atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ scholesky.atlas dcholesky.atlas ccholesky.atlas zcholesky.atlas \ sgemm.atlas dgemm.atlas cgemm.atlas zgemm.atlas \ strmm.atlas dtrmm.atlas ctrmm.atlas ztrmm.atlas \ strsm.atlas dtrsm.atlas ctrsm.atlas ztrsm.atlas \ - sspr.atlas dspr.atlas \ - sspr2.atlas dspr2.atlas \ - ssyr.atlas dsyr.atlas \ - ssyr2.atlas dsyr2.atlas \ ssyrk.atlas dsyrk.atlas csyrk.atlas zsyrk.atlas \ ssyr2k.atlas dsyr2k.atlas csyr2k.atlas zsyr2k.atlas \ sger.atlas dger.atlas cger.atlas zger.atlas\ sdot.atlas ddot.atlas \ - srot.atlas drot.atlas csrot.atlas zdrot.atlas \ - srotm.atlas drotm.atlas \ saxpy.atlas daxpy.atlas caxpy.atlas zaxpy.atlas \ scopy.atlas dcopy.atlas ccopy.atlas zcopy.atlas \ sswap.atlas dswap.atlas cswap.atlas zswap.atlas \ @@ -302,39 +238,29 @@ atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ sasum.atlas dasum.atlas casum.atlas zasum.atlas \ ssymv.atlas dsymv.atlas csymv.atlas zsymv.atlas \ chemv.atlas zhemv.atlas \ - chbmv.atlas zhbmv.atlas \ - chpmv.atlas zhpmv.atlas \ chemm.acml zhemm.acml \ chemm.atlas zhemm.atlas \ cherk.atlas zherk.atlas \ cher2k.atlas zher2k.atlas \ sgemv.atlas dgemv.atlas cgemv.atlas zgemv.atlas \ - strmv.atlas dtrmv.atlas ctrmv.atlas ztrmv.atlas \ - strsv.atlas dtrsv.atlas ctrsv.atlas ztrsv.atlas \ + sspmv.atlas dspmv.atlas cspmv.atlas zspmv.atlas \ sgeev.atlas dgeev.atlas cgeev.atlas zgeev.atlas \ sgesv.atlas dgesv.atlas cgesv.atlas zgesv.atlas \ sgetri.atlas dgetri.atlas cgetri.atlas zgetri.atlas \ spotrf.atlas dpotrf.atlas cpotrf.atlas zpotrf.atlas \ ssymm.atlas dsymm.atlas csymm.atlas zsymm.atlas \ isamax.atlas idamax.atlas icamax.atlas izamax.atlas \ - snrm2.atlas dnrm2.atlas scnrm2.atlas dznrm2.atlas \ - saxpby.atlas daxpby.atlas caxpby.atlas zaxpby.atlas + snrm2.goto dnrm2.goto scnrm2.goto dznrm2.goto mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ scholesky.mkl dcholesky.mkl ccholesky.mkl zcholesky.mkl \ sgemm.mkl dgemm.mkl cgemm.mkl zgemm.mkl \ strmm.mkl dtrmm.mkl ctrmm.mkl ztrmm.mkl \ strsm.mkl dtrsm.mkl ctrsm.mkl ztrsm.mkl \ - sspr.mkl dspr.mkl \ - sspr2.mkl dspr2.mkl \ - ssyr.mkl dsyr.mkl \ - ssyr2.mkl dsyr2.mkl \ ssyrk.mkl dsyrk.mkl csyrk.mkl zsyrk.mkl \ ssyr2k.mkl dsyr2k.mkl csyr2k.mkl zsyr2k.mkl \ sger.mkl dger.mkl cger.mkl zger.mkl \ sdot.mkl ddot.mkl cdot.mkl zdot.mkl \ - srot.atlas drot.atlas csrot.atlas zdrot.atlas \ - srotm.atlas drotm.atlas \ saxpy.mkl daxpy.mkl caxpy.mkl zaxpy.mkl \ scopy.mkl dcopy.mkl ccopy.mkl zcopy.mkl \ sswap.mkl dswap.mkl cswap.mkl zswap.mkl \ @@ -342,20 +268,16 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ sasum.mkl dasum.mkl casum.mkl zasum.mkl \ ssymv.mkl dsymv.mkl csymv.mkl zsymv.mkl \ chemv.mkl zhemv.mkl \ - chbmv.mkl zhbmv.mkl \ - chpmv.mkl zhpmv.mkl \ chemm.mkl zhemm.mkl \ cherk.mkl zherk.mkl \ cher2k.mkl zher2k.mkl \ sgemv.mkl dgemv.mkl cgemv.mkl zgemv.mkl \ - strmv.mkl dtrmv.mkl ctrmv.mkl ztrmv.mkl \ - strsv.mkl dtrsv.mkl ctrsv.mkl ztrsv.mkl \ + sspmv.mkl dspmv.mkl cspmv.mkl zspmv.mkl \ sgeev.mkl dgeev.mkl cgeev.mkl zgeev.mkl \ sgesv.mkl dgesv.mkl cgesv.mkl zgesv.mkl \ sgetri.mkl dgetri.mkl cgetri.mkl zgetri.mkl \ spotrf.mkl dpotrf.mkl cpotrf.mkl zpotrf.mkl \ - ssymm.mkl dsymm.mkl csymm.mkl zsymm.mkl \ - saxpby.mkl daxpby.mkl caxpby.mkl zaxpby.mkl + ssymm.mkl dsymm.mkl csymm.mkl zsymm.mkl @@ -373,16 +295,10 @@ veclib :: slinpack.veclib dlinpack.veclib clinpack.veclib zlinpack.veclib \ sgemm.veclib dgemm.veclib cgemm.veclib zgemm.veclib \ strmm.veclib dtrmm.veclib ctrmm.veclib ztrmm.veclib \ strsm.veclib dtrsm.veclib ctrsm.veclib ztrsm.veclib \ - sspr.veclib dspr.veclib \ - sspr2.veclib dspr2.veclib \ - ssyr.veclib dsyr.veclib \ - ssyr2.veclib dsyr2.veclib \ ssyrk.veclib dsyrk.veclib csyrk.veclib zsyrk.veclib \ ssyr2k.veclib dsyr2k.veclib csyr2k.veclib zsyr2k.veclib \ sger.veclib dger.veclib cger.veclib zger.veclib \ sdot.veclib ddot.veclib cdot.veclib zdot.veclib \ - srot.veclib drot.veclib csrot.veclib zdrot.veclib \ - srotm.veclib drotm.veclib \ saxpy.veclib daxpy.veclib caxpy.veclib zaxpy.veclib \ scopy.veclib dcopy.veclib ccopy.veclib zcopy.veclib \ sswap.veclib dswap.veclib cswap.veclib zswap.veclib \ @@ -390,20 +306,16 @@ veclib :: slinpack.veclib dlinpack.veclib clinpack.veclib zlinpack.veclib \ sasum.veclib dasum.veclib casum.veclib zasum.veclib \ ssymv.veclib dsymv.veclib csymv.veclib zsymv.veclib \ chemv.veclib zhemv.veclib \ - chbmv.veclib zhbmv.veclib \ - chpmv.veclib zhpmv.veclib \ chemm.veclib zhemm.veclib \ cherk.veclib zherk.veclib \ cher2k.veclib zher2k.veclib \ sgemv.veclib dgemv.veclib cgemv.veclib zgemv.veclib \ - strmv.veclib dtrmv.veclib ctrmv.veclib ztrmv.veclib \ - strsv.veclib dtrsv.veclib ctrsv.veclib ztrsv.veclib \ + sspmv.veclib dspmv.veclib cspmv.veclib zspmv.veclib \ sgeev.veclib dgeev.veclib cgeev.veclib zgeev.veclib \ sgesv.veclib dgesv.veclib cgesv.veclib zgesv.veclib \ sgetri.veclib dgetri.veclib cgetri.veclib zgetri.veclib \ spotrf.veclib dpotrf.veclib cpotrf.veclib zpotrf.veclib \ - ssymm.veclib dsymm.veclib csymm.veclib zsymm.veclib \ - saxpby.veclib daxpby.veclib caxpby.veclib zaxpby.veclib + ssymm.veclib dsymm.veclib csymm.veclib zsymm.veclib goto_3m :: cgemm3m.goto zgemm3m.goto @@ -872,130 +784,6 @@ ztrsm.veclib : ztrsm.$(SUFFIX) ztrsm.essl : ztrsm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -##################################### Ssyr #################################################### -ssyr.goto : ssyr.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -ssyr.acml : ssyr.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ssyr.atlas : ssyr.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ssyr.mkl : ssyr.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ssyr.veclib : ssyr.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -##################################### Dsyr #################################################### -dsyr.goto : dsyr.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dsyr.acml : dsyr.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dsyr.atlas : dsyr.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dsyr.mkl : dsyr.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dsyr.veclib : dsyr.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Sspr #################################################### -sspr.goto : sspr.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -sspr.acml : sspr.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sspr.atlas : sspr.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sspr.mkl : sspr.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sspr.veclib : sspr.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dspr #################################################### -dspr.goto : dspr.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dspr.acml : dspr.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dspr.atlas : dspr.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dspr.mkl : dspr.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dspr.veclib : dspr.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Sspr2 #################################################### -sspr2.goto : sspr2.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -sspr2.acml : sspr2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sspr2.atlas : sspr2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sspr2.mkl : sspr2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sspr2.veclib : sspr2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dspr2 #################################################### -dspr2.goto : dspr2.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dspr2.acml : dspr2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dspr2.atlas : dspr2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dspr2.mkl : dspr2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dspr2.veclib : dspr2.$(SUFFIX) - -##################################### Ssyr2 #################################################### -ssyr2.goto : ssyr2.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -ssyr2.acml : ssyr2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ssyr2.atlas : ssyr2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ssyr2.mkl : ssyr2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ssyr2.veclib : ssyr2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -##################################### Dsyr2 #################################################### -dsyr2.goto : dsyr2.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dsyr2.acml : dsyr2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dsyr2.atlas : dsyr2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dsyr2.mkl : dsyr2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dsyr2.veclib : dsyr2.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Ssyrk #################################################### ssyrk.goto : ssyrk.$(SUFFIX) ../$(LIBNAME) @@ -1231,6 +1019,71 @@ zher2k.mkl : zher2k.$(SUFFIX) zher2k.veclib : zher2k.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### Sspmv #################################################### +sspmv.goto : sspmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +sspmv.acml : sspmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sspmv.atlas : sspmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sspmv.mkl : sspmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sspmv.veclib : sspmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dspmv #################################################### +dspmv.goto : dspmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dspmv.acml : dspmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dspmv.atlas : dspmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dspmv.mkl : dspmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dspmv.veclib : dspmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Cspmv #################################################### +cspmv.goto : cspmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +cspmv.acml : cspmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cspmv.atlas : cspmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cspmv.mkl : cspmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cspmv.veclib : cspmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zspmv #################################################### +zspmv.goto : zspmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zspmv.acml : zspmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zspmv.atlas : zspmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zspmv.mkl : zspmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zspmv.veclib : zspmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + + ##################################### Sgemv #################################################### sgemv.goto : sgemv.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm @@ -1297,138 +1150,6 @@ zgemv.mkl : zgemv.$(SUFFIX) zgemv.veclib : zgemv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -##################################### Strmv #################################################### -strmv.goto : strmv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -strmv.acml : strmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -strmv.atlas : strmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -strmv.mkl : strmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -strmv.veclib : strmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dtrmv #################################################### -dtrmv.goto : dtrmv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dtrmv.acml : dtrmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dtrmv.atlas : dtrmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dtrmv.mkl : dtrmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dtrmv.veclib : dtrmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Ctrmv #################################################### - -ctrmv.goto : ctrmv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -ctrmv.acml : ctrmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ctrmv.atlas : ctrmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ctrmv.mkl : ctrmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ctrmv.veclib : ctrmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Ztrmv #################################################### - -ztrmv.goto : ztrmv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -ztrmv.acml : ztrmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ztrmv.atlas : ztrmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ztrmv.mkl : ztrmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ztrmv.veclib : ztrmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Strsv #################################################### -strsv.goto : strsv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -strsv.acml : strsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -strsv.atlas : strsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -strsv.mkl : strsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -strsv.veclib : strsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dtrsv #################################################### -dtrsv.goto : dtrsv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dtrsv.acml : dtrsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dtrsv.atlas : dtrsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dtrsv.mkl : dtrsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dtrsv.veclib : dtrsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Ctrsv #################################################### - -ctrsv.goto : ctrsv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -ctrsv.acml : ctrsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ctrsv.atlas : ctrsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ctrsv.mkl : ctrsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ctrsv.veclib : ctrsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Ztrsv #################################################### - -ztrsv.goto : ztrsv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -ztrsv.acml : ztrsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ztrsv.atlas : ztrsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ztrsv.mkl : ztrsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -ztrsv.veclib : ztrsv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - ##################################### Sger #################################################### sger.goto : sger.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm @@ -1788,70 +1509,7 @@ zhemv.mkl : zhemv.$(SUFFIX) zhemv.veclib : zhemv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -##################################### Chbmv #################################################### -chbmv.goto : chbmv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -chbmv.acml : chbmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -chbmv.atlas : chbmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -chbmv.mkl : chbmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -chbmv.veclib : chbmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -##################################### Zhbmv #################################################### - -zhbmv.goto : zhbmv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zhbmv.acml : zhbmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zhbmv.atlas : zhbmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zhbmv.mkl : zhbmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zhbmv.veclib : zhbmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -##################################### Chpmv #################################################### - -chpmv.goto : chpmv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -chpmv.acml : chpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -chpmv.atlas : chpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -chpmv.mkl : chpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -chpmv.veclib : chpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -##################################### Zhpmv #################################################### - -zhpmv.goto : zhpmv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zhpmv.acml : zhpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zhpmv.atlas : zhpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zhpmv.mkl : zhpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zhpmv.veclib : zhpmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Sdot #################################################### sdot.goto : sdot.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm @@ -1948,69 +1606,6 @@ drot.mkl : drot.$(SUFFIX) drot.veclib : drot.$(SUFFIX) $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -##################################### csrot #################################################### -csrot.goto : csrot.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -csrot.acml : csrot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -csrot.atlas : csrot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -csrot.mkl : csrot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -csrot.veclib : csrot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### zdrot #################################################### -zdrot.goto : zdrot.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zdrot.acml : zdrot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zdrot.atlas : zdrot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zdrot.mkl : zdrot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zdrot.veclib : zdrot.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### srotm #################################################### -srotm.goto : srotm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -srotm.acml : srotm.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -srotm.atlas : srotm.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -srotm.mkl : srotm.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -srotm.veclib : srotm.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### drotm #################################################### -drotm.goto : drotm.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -drotm.acml : drotm.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -drotm.atlas : drotm.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -drotm.mkl : drotm.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -drotm.veclib : drotm.$(SUFFIX) - $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Saxpy #################################################### saxpy.goto : saxpy.$(SUFFIX) ../$(LIBNAME) @@ -2078,72 +1673,7 @@ zaxpy.mkl : zaxpy.$(SUFFIX) zaxpy.veclib : zaxpy.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -##################################### Saxpby #################################################### -saxpby.goto : saxpby.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm -saxpby.acml : saxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -saxpby.atlas : saxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -saxpby.mkl : saxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -saxpby.veclib : saxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Daxpby #################################################### -daxpby.goto : daxpby.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -daxpby.acml : daxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -daxpby.atlas : daxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -daxpby.mkl : daxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -daxpby.veclib : daxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Caxpby #################################################### - -caxpby.goto : caxpby.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -caxpby.acml : caxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -caxpby.atlas : caxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -caxpby.mkl : caxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -caxpby.veclib : caxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zaxpby #################################################### - -zaxpby.goto : zaxpby.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zaxpby.acml : zaxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zaxpby.atlas : zaxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zaxpby.mkl : zaxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zaxpby.veclib : zaxpby.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - ##################################### Scopy #################################################### scopy.goto : scopy.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm @@ -2533,7 +2063,7 @@ ismax.goto : ismax.$(SUFFIX) ../$(LIBNAME) ############################################## IDMAX ############################################## idmax.goto : idmax.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - + ############################################## ISAMIN ############################################## isamin.goto : isamin.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm @@ -2709,30 +2239,6 @@ ctrsm.$(SUFFIX) : trsm.c ztrsm.$(SUFFIX) : trsm.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ -ssyr.$(SUFFIX) : syr.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dsyr.$(SUFFIX) : syr.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -sspr.$(SUFFIX) : spr.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dspr.$(SUFFIX) : spr.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -sspr2.$(SUFFIX) : spr2.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dspr2.$(SUFFIX) : spr2.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -ssyr2.$(SUFFIX) : syr2.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dsyr2.$(SUFFIX) : syr2.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - ssyrk.$(SUFFIX) : syrk.c $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ @@ -2775,6 +2281,18 @@ cher2k.$(SUFFIX) : her2k.c zher2k.$(SUFFIX) : her2k.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ +sspmv.$(SUFFIX) : spmv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dspmv.$(SUFFIX) : spmv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +cspmv.$(SUFFIX) : spmv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zspmv.$(SUFFIX) : spmv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + sgemv.$(SUFFIX) : gemv.c $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ @@ -2787,30 +2305,6 @@ cgemv.$(SUFFIX) : gemv.c zgemv.$(SUFFIX) : gemv.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ -strmv.$(SUFFIX) : trmv.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dtrmv.$(SUFFIX) : trmv.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -ctrmv.$(SUFFIX) : trmv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -ztrmv.$(SUFFIX) : trmv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -strsv.$(SUFFIX) : trsv.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dtrsv.$(SUFFIX) : trsv.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -ctrsv.$(SUFFIX) : trsv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -ztrsv.$(SUFFIX) : trsv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - sger.$(SUFFIX) : ger.c $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ @@ -2878,18 +2372,6 @@ chemv.$(SUFFIX) : hemv.c zhemv.$(SUFFIX) : hemv.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ -chbmv.$(SUFFIX) : hbmv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zhbmv.$(SUFFIX) : hbmv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -chpmv.$(SUFFIX) : hpmv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zhpmv.$(SUFFIX) : hpmv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - sdot.$(SUFFIX) : dot.c $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ @@ -2922,18 +2404,6 @@ caxpy.$(SUFFIX) : axpy.c zaxpy.$(SUFFIX) : axpy.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ -saxpby.$(SUFFIX) : axpby.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -daxpby.$(SUFFIX) : axpby.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -caxpby.$(SUFFIX) : axpby.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zaxpby.$(SUFFIX) : axpby.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - scopy.$(SUFFIX) : copy.c $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ @@ -3003,17 +2473,7 @@ srot.$(SUFFIX) : rot.c drot.$(SUFFIX) : rot.c $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ -csrot.$(SUFFIX) : rot.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ -zdrot.$(SUFFIX) : rot.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -srotm.$(SUFFIX) : rotm.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -drotm.$(SUFFIX) : rotm.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ @@ -3096,7 +2556,6 @@ camin.$(SUFFIX) : amin.c zamin.$(SUFFIX) : amin.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - smin.$(SUFFIX) : min.c $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ @@ -3104,6 +2563,7 @@ dmin.$(SUFFIX) : min.c $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + snrm2.$(SUFFIX) : nrm2.c $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ diff --git a/benchmark/spmv.c b/benchmark/spmv.c new file mode 100644 index 000000000..12a33e298 --- /dev/null +++ b/benchmark/spmv.c @@ -0,0 +1,219 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + + +#undef SPMV + + +#ifndef COMPLEX + +#ifdef DOUBLE +#define SPMV BLASFUNC(dspmv) +#else +#define SPMV BLASFUNC(sspmv) +#endif + +#else + +#ifdef DOUBLE +#define SPMV BLASFUNC(zspmv) +#else +#define SPMV BLASFUNC(cspmv) +#endif + +#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *a, *x, *y; + FLOAT alpha[] = {1.0, 1.0}; + FLOAT beta [] = {1.0, 1.0}; + char uplo='L'; + blasint m, i, j; + blasint inc_x=1,inc_y=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); + if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p; + + fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = '%c' Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,uplo,inc_x,inc_y,loops); + + if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + + fprintf(stderr, " %6dx%d : ", (int)m,(int)m); + + for(j = 0; j < m; j++){ + for(i = 0; i < m * COMPSIZE; i++){ + a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + } + + + for (l=0; l Date: Tue, 10 Mar 2020 14:32:18 +0800 Subject: [PATCH 02/19] Modify Makefile in Benchmark --- benchmark/Makefile | 794 ++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 709 insertions(+), 85 deletions(-) diff --git a/benchmark/Makefile b/benchmark/Makefile index ccddb55e8..fdc57819f 100644 --- a/benchmark/Makefile +++ b/benchmark/Makefile @@ -12,9 +12,9 @@ include $(TOPDIR)/Makefile.system # ACML 6.1 custom ACML=/home/saar/acml6.1/gfortran64_mp/lib LIBACML = -fopenmp $(ACML)/libacml_mp.so -lgfortran -lm - -# Atlas Ubuntu + +# Atlas Ubuntu #ATLAS=/usr/lib/atlas-base #LIBATLAS = -fopenmp $(ATLAS)/liblapack_atlas.a $(ATLAS)/libptcblas.a $(ATLAS)/libptf77blas.a $(ATLAS)/libatlas.a -lgfortran -lm @@ -56,11 +56,16 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ sgemm.goto dgemm.goto cgemm.goto zgemm.goto \ strmm.goto dtrmm.goto ctrmm.goto ztrmm.goto \ strsm.goto dtrsm.goto ctrsm.goto ztrsm.goto \ + sspr.goto dspr.goto \ + sspr2.goto dspr2.goto \ + ssyr.goto dsyr.goto \ + ssyr2.goto dsyr2.goto \ ssyrk.goto dsyrk.goto csyrk.goto zsyrk.goto \ ssyr2k.goto dsyr2k.goto csyr2k.goto zsyr2k.goto \ sger.goto dger.goto cger.goto zger.goto \ sdot.goto ddot.goto \ - srot.goto drot.goto \ + srot.goto drot.goto csrot.goto zdrot.goto \ + srotm.goto drotm.goto \ saxpy.goto daxpy.goto caxpy.goto zaxpy.goto \ scopy.goto dcopy.goto ccopy.goto zcopy.goto \ sswap.goto dswap.goto cswap.goto zswap.goto \ @@ -68,26 +73,37 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ sasum.goto dasum.goto casum.goto zasum.goto \ ssymv.goto dsymv.goto csymv.goto zsymv.goto \ chemv.goto zhemv.goto \ + chbmv.goto zhbmv.goto \ + chpmv.goto zhpmv.goto \ chemm.goto zhemm.goto \ cherk.goto zherk.goto \ cher2k.goto zher2k.goto \ sgemv.goto dgemv.goto cgemv.goto zgemv.goto \ sspmv.goto dspmv.goto cspmv.goto zspmv.goto \ + strmv.goto dtrmv.goto ctrmv.goto ztrmv.goto \ + strsv.goto dtrsv.goto ctrsv.goto ztrsv.goto \ sgeev.goto dgeev.goto cgeev.goto zgeev.goto \ sgesv.goto dgesv.goto cgesv.goto zgesv.goto \ sgetri.goto dgetri.goto cgetri.goto zgetri.goto \ spotrf.goto dpotrf.goto cpotrf.goto zpotrf.goto \ - ssymm.goto dsymm.goto csymm.goto zsymm.goto + ssymm.goto dsymm.goto csymm.goto zsymm.goto \ + saxpby.goto daxpby.goto caxpby.goto zaxpby.goto acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ scholesky.acml dcholesky.acml ccholesky.acml zcholesky.acml \ sgemm.acml dgemm.acml cgemm.acml zgemm.acml \ strmm.acml dtrmm.acml ctrmm.acml ztrmm.acml \ strsm.acml dtrsm.acml ctrsm.acml ztrsm.acml \ + sspr.acml dspr.acml \ + sspr2.acml dspr2.acml \ + ssyr.acml dsyr.acml \ + ssyr2.acml dsyr2.acml \ ssyrk.acml dsyrk.acml csyrk.acml zsyrk.acml \ ssyr2k.acml dsyr2k.acml csyr2k.acml zsyr2k.acml \ sger.acml dger.acml cger.acml zger.acml \ sdot.acml ddot.acml \ + srot.acml drot.acml csrot.acml zdrot.acml \ + srotm.acml drotm.acml \ saxpy.acml daxpy.acml caxpy.acml zaxpy.acml \ scopy.acml dcopy.acml ccopy.acml zcopy.acml \ sswap.acml dswap.acml cswap.acml zswap.acml \ @@ -95,26 +111,37 @@ acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ sasum.acml dasum.acml casum.acml zasum.acml \ ssymv.acml dsymv.acml csymv.acml zsymv.acml \ chemv.acml zhemv.acml \ + chbmv.acml zhbmv.acml \ + chpmv.acml zhpmv.acml \ chemm.acml zhemm.acml \ cherk.acml zherk.acml \ cher2k.acml zher2k.acml \ sgemv.acml dgemv.acml cgemv.acml zgemv.acml \ sspmv.acml dspmv.acml cspmv.acml zspmv.acml \ + strmv.acml dtrmv.acml ctrmv.acml ztrmv.acml \ + strsv.acml dtrsv.acml ctrsv.acml ztrsv.acml \ sgeev.acml dgeev.acml cgeev.acml zgeev.acml \ sgesv.acml dgesv.acml cgesv.acml zgesv.acml \ sgetri.acml dgetri.acml cgetri.acml zgetri.acml \ spotrf.acml dpotrf.acml cpotrf.acml zpotrf.acml \ - ssymm.acml dsymm.acml csymm.acml zsymm.acml + ssymm.acml dsymm.acml csymm.acml zsymm.acml \ + saxpby.acml daxpby.acml caxpby.acml zaxpby.acml atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ scholesky.atlas dcholesky.atlas ccholesky.atlas zcholesky.atlas \ sgemm.atlas dgemm.atlas cgemm.atlas zgemm.atlas \ strmm.atlas dtrmm.atlas ctrmm.atlas ztrmm.atlas \ strsm.atlas dtrsm.atlas ctrsm.atlas ztrsm.atlas \ + sspr.atlas dspr.atlas \ + sspr2.atlas dspr2.atlas \ + ssyr.atlas dsyr.atlas \ + ssyr2.atlas dsyr2.atlas \ ssyrk.atlas dsyrk.atlas csyrk.atlas zsyrk.atlas \ ssyr2k.atlas dsyr2k.atlas csyr2k.atlas zsyr2k.atlas \ sger.atlas dger.atlas cger.atlas zger.atlas\ sdot.atlas ddot.atlas \ + srot.atlas drot.atlas csrot.atlas zdrot.atlas \ + srotm.atlas drotm.atlas \ saxpy.atlas daxpy.atlas caxpy.atlas zaxpy.atlas \ scopy.atlas dcopy.atlas ccopy.atlas zcopy.atlas \ sswap.atlas dswap.atlas cswap.atlas zswap.atlas \ @@ -122,27 +149,38 @@ atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ sasum.atlas dasum.atlas casum.atlas zasum.atlas \ ssymv.atlas dsymv.atlas csymv.atlas zsymv.atlas \ chemv.atlas zhemv.atlas \ + chbmv.atlas zhbmv.atlas \ + chpmv.atlas zhpmv.atlas \ chemm.acml zhemm.acml \ chemm.atlas zhemm.atlas \ cherk.atlas zherk.atlas \ cher2k.atlas zher2k.atlas \ sgemv.atlas dgemv.atlas cgemv.atlas zgemv.atlas \ sspmv.atlas dspmv.atlas cspmv.atlas zspmv.atlas \ + strmv.atlas dtrmv.atlas ctrmv.atlas ztrmv.atlas \ + strsv.atlas dtrsv.atlas ctrsv.atlas ztrsv.atlas \ sgeev.atlas dgeev.atlas cgeev.atlas zgeev.atlas \ sgesv.atlas dgesv.atlas cgesv.atlas zgesv.atlas \ sgetri.atlas dgetri.atlas cgetri.atlas zgetri.atlas \ spotrf.atlas dpotrf.atlas cpotrf.atlas zpotrf.atlas \ - ssymm.atlas dsymm.atlas csymm.atlas zsymm.atlas + ssymm.atlas dsymm.atlas csymm.atlas zsymm.atlas \ + saxpby.atlas daxpby.atlas caxpby.atlas zaxpby.atlas mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ scholesky.mkl dcholesky.mkl ccholesky.mkl zcholesky.mkl \ sgemm.mkl dgemm.mkl cgemm.mkl zgemm.mkl \ strmm.mkl dtrmm.mkl ctrmm.mkl ztrmm.mkl \ strsm.mkl dtrsm.mkl ctrsm.mkl ztrsm.mkl \ + sspr.mkl dspr.mkl \ + sspr2.mkl dspr2.mkl \ + ssyr.mkl dsyr.mkl \ + ssyr2.mkl dsyr2.mkl \ ssyrk.mkl dsyrk.mkl csyrk.mkl zsyrk.mkl \ ssyr2k.mkl dsyr2k.mkl csyr2k.mkl zsyr2k.mkl \ sger.mkl dger.mkl cger.mkl zger.mkl \ sdot.mkl ddot.mkl \ + srot.mkl drot.mkl csrot.mkl zdrot.mkl \ + srotm.mkl drotm.mkl \ saxpy.mkl daxpy.mkl caxpy.mkl zaxpy.mkl \ scopy.mkl dcopy.mkl ccopy.mkl zcopy.mkl \ sswap.mkl dswap.mkl cswap.mkl zswap.mkl \ @@ -150,27 +188,37 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ sasum.mkl dasum.mkl casum.mkl zasum.mkl \ ssymv.mkl dsymv.mkl csymv.mkl zsymv.mkl \ chemv.mkl zhemv.mkl \ + chbmv.mkl zhbmv.mkl \ + chpmv.mkl zhpmv.mkl \ chemm.mkl zhemm.mkl \ cherk.mkl zherk.mkl \ cher2k.mkl zher2k.mkl \ sgemv.mkl dgemv.mkl cgemv.mkl zgemv.mkl \ sspmv.mkl dspmv.mkl cspmv.mkl zspmv.mkl \ + strmv.mkl dtrmv.mkl ctrmv.mkl ztrmv.mkl \ + strsv.mkl dtrsv.mkl ctrsv.mkl ztrsv.mkl \ sgeev.mkl dgeev.mkl cgeev.mkl zgeev.mkl \ sgesv.mkl dgesv.mkl cgesv.mkl zgesv.mkl \ sgetri.mkl dgetri.mkl cgetri.mkl zgetri.mkl \ spotrf.mkl dpotrf.mkl cpotrf.mkl zpotrf.mkl \ - ssymm.mkl dsymm.mkl csymm.mkl zsymm.mkl + ssymm.mkl dsymm.mkl csymm.mkl zsymm.mkl \ + saxpby.mkl daxpby.mkl caxpby.mkl zaxpby.mkl else goto :: sgemm.goto dgemm.goto cgemm.goto zgemm.goto \ strmm.goto dtrmm.goto ctrmm.goto ztrmm.goto \ strsm.goto dtrsm.goto ctrsm.goto ztrsm.goto \ + sspr.goto dspr.goto \ + sspr2.goto dspr2.goto \ + ssyr.goto dsyr.goto \ + ssyr2.goto dsyr2.goto \ ssyrk.goto dsyrk.goto csyrk.goto zsyrk.goto \ ssyr2k.goto dsyr2k.goto csyr2k.goto zsyr2k.goto \ sger.goto dger.goto cger.goto zger.goto \ sdot.goto ddot.goto cdot.goto zdot.goto \ - srot.goto drot.goto \ + srot.goto drot.goto csrot.goto zdrot.goto \ + srotm.goto drotm.goto \ saxpy.goto daxpy.goto caxpy.goto zaxpy.goto \ scopy.goto dcopy.goto ccopy.goto zcopy.goto \ sswap.goto dswap.goto cswap.goto zswap.goto \ @@ -178,11 +226,15 @@ goto :: sgemm.goto dgemm.goto cgemm.goto zgemm.goto \ sasum.goto dasum.goto casum.goto zasum.goto \ ssymv.goto dsymv.goto \ chemv.goto zhemv.goto \ + chbmv.goto zhbmv.goto \ + chpmv.goto zhpmv.goto \ chemm.goto zhemm.goto \ cherk.goto zherk.goto \ cher2k.goto zher2k.goto \ - sspmv.goto dspmv.goto cspmv.goto zspmv.goto \ sgemv.goto dgemv.goto cgemv.goto zgemv.goto \ + sspmv.goto dspmv.goto cspmv.goto zspmv.goto \ + strmv.goto dtrmv.goto ctrmv.goto ztrmv.goto \ + strsv.goto dtrsv.goto ctrsv.goto ztrsv.goto \ ssymm.goto dsymm.goto csymm.goto zsymm.goto \ smallscaling \ isamax.goto idamax.goto icamax.goto izamax.goto \ @@ -193,6 +245,7 @@ goto :: sgemm.goto dgemm.goto cgemm.goto zgemm.goto \ smax.goto dmax.goto \ samin.goto damin.goto camin.goto zamin.goto \ smin.goto dmin.goto \ + saxpby.goto daxpby.goto caxpby.goto zaxpby.goto \ snrm2.goto dnrm2.goto scnrm2.goto dznrm2.goto $(GOTO_LAPACK_TARGETS) acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ @@ -200,10 +253,16 @@ acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ sgemm.acml dgemm.acml cgemm.acml zgemm.acml \ strmm.acml dtrmm.acml ctrmm.acml ztrmm.acml \ strsm.acml dtrsm.acml ctrsm.acml ztrsm.acml \ + sspr.acml dspr.acml \ + sspr2.acml dspr2.acml \ + ssyr.acml dsyr.acml \ + ssyr2.acml dsyr2.acml \ ssyrk.acml dsyrk.acml csyrk.acml zsyrk.acml \ ssyr2k.acml dsyr2k.acml csyr2k.acml zsyr2k.acml \ sger.acml dger.acml cger.acml zger.acml \ sdot.acml ddot.acml \ + srot.acml drot.acml csrot.acml zdrot.acml \ + srotm.acml drotm.acml \ saxpy.acml daxpy.acml caxpy.acml zaxpy.acml \ scopy.acml dcopy.acml ccopy.acml zcopy.acml \ sswap.acml dswap.acml cswap.acml zswap.acml \ @@ -211,26 +270,37 @@ acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ sasum.acml dasum.acml casum.acml zasum.acml \ ssymv.acml dsymv.acml csymv.acml zsymv.acml \ chemv.acml zhemv.acml \ + chbmv.acml zhbmv.acml \ + chpmv.acml zhpmv.acml \ chemm.acml zhemm.acml \ cherk.acml zherk.acml \ cher2k.acml zher2k.acml \ sgemv.acml dgemv.acml cgemv.acml zgemv.acml \ sspmv.acml dspmv.acml cspmv.acml zspmv.acml \ + strmv.acml dtrmv.acml ctrmv.acml ztrmv.acml \ + strsv.acml dtrsv.acml ctrsv.acml ztrsv.acml \ sgeev.acml dgeev.acml cgeev.acml zgeev.acml \ sgesv.acml dgesv.acml cgesv.acml zgesv.acml \ sgetri.acml dgetri.acml cgetri.acml zgetri.acml \ spotrf.acml dpotrf.acml cpotrf.acml zpotrf.acml \ - ssymm.acml dsymm.acml csymm.acml zsymm.acml + ssymm.acml dsymm.acml csymm.acml zsymm.acml \ + saxpby.acml daxpby.acml caxpby.acml zaxpby.acml atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ scholesky.atlas dcholesky.atlas ccholesky.atlas zcholesky.atlas \ sgemm.atlas dgemm.atlas cgemm.atlas zgemm.atlas \ strmm.atlas dtrmm.atlas ctrmm.atlas ztrmm.atlas \ strsm.atlas dtrsm.atlas ctrsm.atlas ztrsm.atlas \ + sspr.atlas dspr.atlas \ + sspr2.atlas dspr2.atlas \ + ssyr.atlas dsyr.atlas \ + ssyr2.atlas dsyr2.atlas \ ssyrk.atlas dsyrk.atlas csyrk.atlas zsyrk.atlas \ ssyr2k.atlas dsyr2k.atlas csyr2k.atlas zsyr2k.atlas \ sger.atlas dger.atlas cger.atlas zger.atlas\ sdot.atlas ddot.atlas \ + srot.atlas drot.atlas csrot.atlas zdrot.atlas \ + srotm.atlas drotm.atlas \ saxpy.atlas daxpy.atlas caxpy.atlas zaxpy.atlas \ scopy.atlas dcopy.atlas ccopy.atlas zcopy.atlas \ sswap.atlas dswap.atlas cswap.atlas zswap.atlas \ @@ -238,29 +308,40 @@ atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ sasum.atlas dasum.atlas casum.atlas zasum.atlas \ ssymv.atlas dsymv.atlas csymv.atlas zsymv.atlas \ chemv.atlas zhemv.atlas \ + chbmv.atlas zhbmv.atlas \ + chpmv.atlas zhpmv.atlas \ chemm.acml zhemm.acml \ chemm.atlas zhemm.atlas \ cherk.atlas zherk.atlas \ cher2k.atlas zher2k.atlas \ sgemv.atlas dgemv.atlas cgemv.atlas zgemv.atlas \ sspmv.atlas dspmv.atlas cspmv.atlas zspmv.atlas \ + strmv.atlas dtrmv.atlas ctrmv.atlas ztrmv.atlas \ + strsv.atlas dtrsv.atlas ctrsv.atlas ztrsv.atlas \ sgeev.atlas dgeev.atlas cgeev.atlas zgeev.atlas \ sgesv.atlas dgesv.atlas cgesv.atlas zgesv.atlas \ sgetri.atlas dgetri.atlas cgetri.atlas zgetri.atlas \ spotrf.atlas dpotrf.atlas cpotrf.atlas zpotrf.atlas \ ssymm.atlas dsymm.atlas csymm.atlas zsymm.atlas \ isamax.atlas idamax.atlas icamax.atlas izamax.atlas \ - snrm2.goto dnrm2.goto scnrm2.goto dznrm2.goto + snrm2.atlas dnrm2.atlas scnrm2.atlas dznrm2.atlas \ + saxpby.atlas daxpby.atlas caxpby.atlas zaxpby.atlas mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ scholesky.mkl dcholesky.mkl ccholesky.mkl zcholesky.mkl \ sgemm.mkl dgemm.mkl cgemm.mkl zgemm.mkl \ strmm.mkl dtrmm.mkl ctrmm.mkl ztrmm.mkl \ strsm.mkl dtrsm.mkl ctrsm.mkl ztrsm.mkl \ + sspr.mkl dspr.mkl \ + sspr2.mkl dspr2.mkl \ + ssyr.mkl dsyr.mkl \ + ssyr2.mkl dsyr2.mkl \ ssyrk.mkl dsyrk.mkl csyrk.mkl zsyrk.mkl \ ssyr2k.mkl dsyr2k.mkl csyr2k.mkl zsyr2k.mkl \ sger.mkl dger.mkl cger.mkl zger.mkl \ sdot.mkl ddot.mkl cdot.mkl zdot.mkl \ + srot.atlas drot.atlas csrot.atlas zdrot.atlas \ + srotm.atlas drotm.atlas \ saxpy.mkl daxpy.mkl caxpy.mkl zaxpy.mkl \ scopy.mkl dcopy.mkl ccopy.mkl zcopy.mkl \ sswap.mkl dswap.mkl cswap.mkl zswap.mkl \ @@ -268,16 +349,21 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ sasum.mkl dasum.mkl casum.mkl zasum.mkl \ ssymv.mkl dsymv.mkl csymv.mkl zsymv.mkl \ chemv.mkl zhemv.mkl \ + chbmv.mkl zhbmv.mkl \ + chpmv.mkl zhpmv.mkl \ chemm.mkl zhemm.mkl \ cherk.mkl zherk.mkl \ cher2k.mkl zher2k.mkl \ sgemv.mkl dgemv.mkl cgemv.mkl zgemv.mkl \ sspmv.mkl dspmv.mkl cspmv.mkl zspmv.mkl \ + strmv.mkl dtrmv.mkl ctrmv.mkl ztrmv.mkl \ + strsv.mkl dtrsv.mkl ctrsv.mkl ztrsv.mkl \ sgeev.mkl dgeev.mkl cgeev.mkl zgeev.mkl \ sgesv.mkl dgesv.mkl cgesv.mkl zgesv.mkl \ sgetri.mkl dgetri.mkl cgetri.mkl zgetri.mkl \ spotrf.mkl dpotrf.mkl cpotrf.mkl zpotrf.mkl \ - ssymm.mkl dsymm.mkl csymm.mkl zsymm.mkl + ssymm.mkl dsymm.mkl csymm.mkl zsymm.mkl \ + saxpby.mkl daxpby.mkl caxpby.mkl zaxpby.mkl @@ -295,10 +381,16 @@ veclib :: slinpack.veclib dlinpack.veclib clinpack.veclib zlinpack.veclib \ sgemm.veclib dgemm.veclib cgemm.veclib zgemm.veclib \ strmm.veclib dtrmm.veclib ctrmm.veclib ztrmm.veclib \ strsm.veclib dtrsm.veclib ctrsm.veclib ztrsm.veclib \ + sspr.veclib dspr.veclib \ + sspr2.veclib dspr2.veclib \ + ssyr.veclib dsyr.veclib \ + ssyr2.veclib dsyr2.veclib \ ssyrk.veclib dsyrk.veclib csyrk.veclib zsyrk.veclib \ ssyr2k.veclib dsyr2k.veclib csyr2k.veclib zsyr2k.veclib \ sger.veclib dger.veclib cger.veclib zger.veclib \ sdot.veclib ddot.veclib cdot.veclib zdot.veclib \ + srot.veclib drot.veclib csrot.veclib zdrot.veclib \ + srotm.veclib drotm.veclib \ saxpy.veclib daxpy.veclib caxpy.veclib zaxpy.veclib \ scopy.veclib dcopy.veclib ccopy.veclib zcopy.veclib \ sswap.veclib dswap.veclib cswap.veclib zswap.veclib \ @@ -306,16 +398,21 @@ veclib :: slinpack.veclib dlinpack.veclib clinpack.veclib zlinpack.veclib \ sasum.veclib dasum.veclib casum.veclib zasum.veclib \ ssymv.veclib dsymv.veclib csymv.veclib zsymv.veclib \ chemv.veclib zhemv.veclib \ + chbmv.veclib zhbmv.veclib \ + chpmv.veclib zhpmv.veclib \ chemm.veclib zhemm.veclib \ cherk.veclib zherk.veclib \ cher2k.veclib zher2k.veclib \ sgemv.veclib dgemv.veclib cgemv.veclib zgemv.veclib \ sspmv.veclib dspmv.veclib cspmv.veclib zspmv.veclib \ + strmv.veclib dtrmv.veclib ctrmv.veclib ztrmv.veclib \ + strsv.veclib dtrsv.veclib ctrsv.veclib ztrsv.veclib \ sgeev.veclib dgeev.veclib cgeev.veclib zgeev.veclib \ sgesv.veclib dgesv.veclib cgesv.veclib zgesv.veclib \ sgetri.veclib dgetri.veclib cgetri.veclib zgetri.veclib \ spotrf.veclib dpotrf.veclib cpotrf.veclib zpotrf.veclib \ - ssymm.veclib dsymm.veclib csymm.veclib zsymm.veclib + ssymm.veclib dsymm.veclib csymm.veclib zsymm.veclib \ + saxpby.veclib daxpby.veclib caxpby.veclib zaxpby.veclib goto_3m :: cgemm3m.goto zgemm3m.goto @@ -784,6 +881,130 @@ ztrsm.veclib : ztrsm.$(SUFFIX) ztrsm.essl : ztrsm.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### Ssyr #################################################### +ssyr.goto : ssyr.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ssyr.acml : ssyr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssyr.atlas : ssyr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssyr.mkl : ssyr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssyr.veclib : ssyr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### Dsyr #################################################### +dsyr.goto : dsyr.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dsyr.acml : dsyr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsyr.atlas : dsyr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsyr.mkl : dsyr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsyr.veclib : dsyr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Sspr #################################################### +sspr.goto : sspr.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +sspr.acml : sspr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sspr.atlas : sspr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sspr.mkl : sspr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sspr.veclib : sspr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dspr #################################################### +dspr.goto : dspr.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dspr.acml : dspr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dspr.atlas : dspr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dspr.mkl : dspr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dspr.veclib : dspr.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Sspr2 #################################################### +sspr2.goto : sspr2.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +sspr2.acml : sspr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sspr2.atlas : sspr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sspr2.mkl : sspr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sspr2.veclib : sspr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dspr2 #################################################### +dspr2.goto : dspr2.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dspr2.acml : dspr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dspr2.atlas : dspr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dspr2.mkl : dspr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dspr2.veclib : dspr2.$(SUFFIX) + +##################################### Ssyr2 #################################################### +ssyr2.goto : ssyr2.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ssyr2.acml : ssyr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssyr2.atlas : ssyr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssyr2.mkl : ssyr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ssyr2.veclib : ssyr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### Dsyr2 #################################################### +dsyr2.goto : dsyr2.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dsyr2.acml : dsyr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsyr2.atlas : dsyr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsyr2.mkl : dsyr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dsyr2.veclib : dsyr2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Ssyrk #################################################### ssyrk.goto : ssyrk.$(SUFFIX) ../$(LIBNAME) @@ -1019,71 +1240,6 @@ zher2k.mkl : zher2k.$(SUFFIX) zher2k.veclib : zher2k.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -##################################### Sspmv #################################################### -sspmv.goto : sspmv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -sspmv.acml : sspmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sspmv.atlas : sspmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sspmv.mkl : sspmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sspmv.veclib : sspmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Dspmv #################################################### -dspmv.goto : dspmv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -dspmv.acml : dspmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dspmv.atlas : dspmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dspmv.mkl : dspmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dspmv.veclib : dspmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Cspmv #################################################### -cspmv.goto : cspmv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -cspmv.acml : cspmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cspmv.atlas : cspmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cspmv.mkl : cspmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cspmv.veclib : cspmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zspmv #################################################### -zspmv.goto : zspmv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zspmv.acml : zspmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zspmv.atlas : zspmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zspmv.mkl : zspmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zspmv.veclib : zspmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - - ##################################### Sgemv #################################################### sgemv.goto : sgemv.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm @@ -1150,6 +1306,202 @@ zgemv.mkl : zgemv.$(SUFFIX) zgemv.veclib : zgemv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### Sspmv #################################################### +sspmv.goto : sspmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +sspmv.acml : sspmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sspmv.atlas : sspmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sspmv.mkl : sspmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +sspmv.veclib : sspmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dspmv #################################################### +dspmv.goto : dspmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dspmv.acml : dspmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dspmv.atlas : dspmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dspmv.mkl : dspmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dspmv.veclib : dspmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Cspmv #################################################### +cspmv.goto : cspmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +cspmv.acml : cspmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cspmv.atlas : cspmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cspmv.mkl : cspmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cspmv.veclib : cspmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zspmv #################################################### +zspmv.goto : zspmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zspmv.acml : zspmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zspmv.atlas : zspmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zspmv.mkl : zspmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zspmv.veclib : zspmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Strmv #################################################### +strmv.goto : strmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +strmv.acml : strmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +strmv.atlas : strmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +strmv.mkl : strmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +strmv.veclib : strmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dtrmv #################################################### +dtrmv.goto : dtrmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dtrmv.acml : dtrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtrmv.atlas : dtrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtrmv.mkl : dtrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtrmv.veclib : dtrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ctrmv #################################################### + +ctrmv.goto : ctrmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ctrmv.acml : ctrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctrmv.atlas : ctrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctrmv.mkl : ctrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctrmv.veclib : ctrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ztrmv #################################################### + +ztrmv.goto : ztrmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ztrmv.acml : ztrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztrmv.atlas : ztrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztrmv.mkl : ztrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztrmv.veclib : ztrmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Strsv #################################################### +strsv.goto : strsv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +strsv.acml : strsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +strsv.atlas : strsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +strsv.mkl : strsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +strsv.veclib : strsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dtrsv #################################################### +dtrsv.goto : dtrsv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dtrsv.acml : dtrsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtrsv.atlas : dtrsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtrsv.mkl : dtrsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtrsv.veclib : dtrsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ctrsv #################################################### + +ctrsv.goto : ctrsv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ctrsv.acml : ctrsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctrsv.atlas : ctrsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctrsv.mkl : ctrsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctrsv.veclib : ctrsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ztrsv #################################################### + +ztrsv.goto : ztrsv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ztrsv.acml : ztrsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztrsv.atlas : ztrsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztrsv.mkl : ztrsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztrsv.veclib : ztrsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Sger #################################################### sger.goto : sger.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm @@ -1509,7 +1861,70 @@ zhemv.mkl : zhemv.$(SUFFIX) zhemv.veclib : zhemv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### Chbmv #################################################### +chbmv.goto : chbmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +chbmv.acml : chbmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +chbmv.atlas : chbmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +chbmv.mkl : chbmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +chbmv.veclib : chbmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### Zhbmv #################################################### + +zhbmv.goto : zhbmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zhbmv.acml : zhbmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zhbmv.atlas : zhbmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zhbmv.mkl : zhbmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zhbmv.veclib : zhbmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### Chpmv #################################################### + +chpmv.goto : chpmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +chpmv.acml : chpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +chpmv.atlas : chpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +chpmv.mkl : chpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +chpmv.veclib : chpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### Zhpmv #################################################### + +zhpmv.goto : zhpmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zhpmv.acml : zhpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zhpmv.atlas : zhpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zhpmv.mkl : zhpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zhpmv.veclib : zhpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Sdot #################################################### sdot.goto : sdot.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm @@ -1606,6 +2021,69 @@ drot.mkl : drot.$(SUFFIX) drot.veclib : drot.$(SUFFIX) $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### csrot #################################################### +csrot.goto : csrot.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +csrot.acml : csrot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +csrot.atlas : csrot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +csrot.mkl : csrot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +csrot.veclib : csrot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### zdrot #################################################### +zdrot.goto : zdrot.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zdrot.acml : zdrot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zdrot.atlas : zdrot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zdrot.mkl : zdrot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zdrot.veclib : zdrot.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### srotm #################################################### +srotm.goto : srotm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +srotm.acml : srotm.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +srotm.atlas : srotm.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +srotm.mkl : srotm.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +srotm.veclib : srotm.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### drotm #################################################### +drotm.goto : drotm.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +drotm.acml : drotm.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +drotm.atlas : drotm.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +drotm.mkl : drotm.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +drotm.veclib : drotm.$(SUFFIX) + $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Saxpy #################################################### saxpy.goto : saxpy.$(SUFFIX) ../$(LIBNAME) @@ -1673,7 +2151,72 @@ zaxpy.mkl : zaxpy.$(SUFFIX) zaxpy.veclib : zaxpy.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### Saxpby #################################################### +saxpby.goto : saxpby.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm +saxpby.acml : saxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +saxpby.atlas : saxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +saxpby.mkl : saxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +saxpby.veclib : saxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Daxpby #################################################### +daxpby.goto : daxpby.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +daxpby.acml : daxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +daxpby.atlas : daxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +daxpby.mkl : daxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +daxpby.veclib : daxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Caxpby #################################################### + +caxpby.goto : caxpby.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +caxpby.acml : caxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +caxpby.atlas : caxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +caxpby.mkl : caxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +caxpby.veclib : caxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zaxpby #################################################### + +zaxpby.goto : zaxpby.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zaxpby.acml : zaxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zaxpby.atlas : zaxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zaxpby.mkl : zaxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zaxpby.veclib : zaxpby.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Scopy #################################################### scopy.goto : scopy.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm @@ -2063,7 +2606,7 @@ ismax.goto : ismax.$(SUFFIX) ../$(LIBNAME) ############################################## IDMAX ############################################## idmax.goto : idmax.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - + ############################################## ISAMIN ############################################## isamin.goto : isamin.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm @@ -2239,6 +2782,30 @@ ctrsm.$(SUFFIX) : trsm.c ztrsm.$(SUFFIX) : trsm.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ +ssyr.$(SUFFIX) : syr.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dsyr.$(SUFFIX) : syr.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +sspr.$(SUFFIX) : spr.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dspr.$(SUFFIX) : spr.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +sspr2.$(SUFFIX) : spr2.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dspr2.$(SUFFIX) : spr2.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +ssyr2.$(SUFFIX) : syr2.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dsyr2.$(SUFFIX) : syr2.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + ssyrk.$(SUFFIX) : syrk.c $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ @@ -2281,6 +2848,18 @@ cher2k.$(SUFFIX) : her2k.c zher2k.$(SUFFIX) : her2k.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ +sgemv.$(SUFFIX) : gemv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dgemv.$(SUFFIX) : gemv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +cgemv.$(SUFFIX) : gemv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zgemv.$(SUFFIX) : gemv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + sspmv.$(SUFFIX) : spmv.c $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ @@ -2293,16 +2872,28 @@ cspmv.$(SUFFIX) : spmv.c zspmv.$(SUFFIX) : spmv.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ -sgemv.$(SUFFIX) : gemv.c +strmv.$(SUFFIX) : trmv.c $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ -dgemv.$(SUFFIX) : gemv.c +dtrmv.$(SUFFIX) : trmv.c $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ -cgemv.$(SUFFIX) : gemv.c +ctrmv.$(SUFFIX) : trmv.c $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ -zgemv.$(SUFFIX) : gemv.c +ztrmv.$(SUFFIX) : trmv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +strsv.$(SUFFIX) : trsv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dtrsv.$(SUFFIX) : trsv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +ctrsv.$(SUFFIX) : trsv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +ztrsv.$(SUFFIX) : trsv.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ sger.$(SUFFIX) : ger.c @@ -2372,6 +2963,18 @@ chemv.$(SUFFIX) : hemv.c zhemv.$(SUFFIX) : hemv.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ +chbmv.$(SUFFIX) : hbmv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zhbmv.$(SUFFIX) : hbmv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +chpmv.$(SUFFIX) : hpmv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zhpmv.$(SUFFIX) : hpmv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + sdot.$(SUFFIX) : dot.c $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ @@ -2404,6 +3007,18 @@ caxpy.$(SUFFIX) : axpy.c zaxpy.$(SUFFIX) : axpy.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ +saxpby.$(SUFFIX) : axpby.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +daxpby.$(SUFFIX) : axpby.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +caxpby.$(SUFFIX) : axpby.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zaxpby.$(SUFFIX) : axpby.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + scopy.$(SUFFIX) : copy.c $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ @@ -2473,7 +3088,17 @@ srot.$(SUFFIX) : rot.c drot.$(SUFFIX) : rot.c $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ +csrot.$(SUFFIX) : rot.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ +zdrot.$(SUFFIX) : rot.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +srotm.$(SUFFIX) : rotm.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +drotm.$(SUFFIX) : rotm.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ @@ -2556,6 +3181,7 @@ camin.$(SUFFIX) : amin.c zamin.$(SUFFIX) : amin.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + smin.$(SUFFIX) : min.c $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ @@ -2563,7 +3189,6 @@ dmin.$(SUFFIX) : min.c $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - snrm2.$(SUFFIX) : nrm2.c $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ @@ -2584,4 +3209,3 @@ clean :: @rm -f *.goto *.mkl *.acml *.atlas *.veclib *.essl smallscaling include $(TOPDIR)/Makefile.tail - From 3f7f7ab7e266cb0e951528d699fa5910af0908fe Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 10 Mar 2020 12:51:07 +0100 Subject: [PATCH 03/19] Restore INTERFACE64 for arm64 --- Makefile.system | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/Makefile.system b/Makefile.system index 829c08f16..11cb5b3a0 100644 --- a/Makefile.system +++ b/Makefile.system @@ -635,6 +635,16 @@ endif ifeq ($(ARCH), arm64) NO_BINARY_MODE = 1 BINARY_DEFINED = 1 +ifdef INTERFACE64 +ifneq ($(INTERFACE64), 0) +ifeq ($(F_COMPILER), GFORTRAN) +FCOMMON_OPT += -fdefault-integer-8 +endif +ifeq ($(F_COMPILER), FLANG) +FCOMMON_OPT += -i8 +endif +endif +endif endif From b25ae1fc602923cbc134338e619606330fe5fd7f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 10 Mar 2020 13:37:41 +0100 Subject: [PATCH 04/19] Apply fix for Reference-LAPACK issue 394 reference to XERBLA extending beyond column 72, breaking builds with compilers that default to traditional punch card format --- lapack-netlib/SRC/sorhr_col.f | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/lapack-netlib/SRC/sorhr_col.f b/lapack-netlib/SRC/sorhr_col.f index 38976245c..9aef57b26 100644 --- a/lapack-netlib/SRC/sorhr_col.f +++ b/lapack-netlib/SRC/sorhr_col.f @@ -282,7 +282,8 @@ $ NPLUSONE * .. * .. External Subroutines .. - EXTERNAL SCOPY, SLAORHR_COL_GETRFNP, SSCAL, STRSM, XERBLA + EXTERNAL SCOPY, SLAORHR_COL_GETRFNP, SSCAL, STRSM, + $XERBLA * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN @@ -436,4 +437,4 @@ * * End of SORHR_COL * - END \ No newline at end of file + END From cd8871f1a1d6022d83bb8190c17779658f6a0e5f Mon Sep 17 00:00:00 2001 From: s00527847 Date: Tue, 10 Mar 2020 19:26:06 -0400 Subject: [PATCH 05/19] Use the correct unit of measure --- benchmark/spr.c | 2 +- benchmark/spr2.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmark/spr.c b/benchmark/spr.c index 61a972c08..c91e587b1 100755 --- a/benchmark/spr.c +++ b/benchmark/spr.c @@ -187,7 +187,7 @@ int main(int argc, char *argv[]){ timeg /= loops; fprintf(stderr, - " %10.2f MBytes %10.6f sec\n", + " %10.2f MFlops %10.6f sec\n", COMPSIZE * COMPSIZE * 1. * (double)m * (double)m / timeg * 1.e-6, timeg); } diff --git a/benchmark/spr2.c b/benchmark/spr2.c index 251543fa9..e8ee345d7 100755 --- a/benchmark/spr2.c +++ b/benchmark/spr2.c @@ -196,7 +196,7 @@ int main(int argc, char *argv[]){ timeg /= loops; fprintf(stderr, - " %10.2f MBytes %10.6f sec\n", + " %10.2f MFlops %10.6f sec\n", COMPSIZE * COMPSIZE * 2. * (double)m * (double)m / timeg * 1.e-6, timeg); } From 2f4c5bb3a98eb1956cc24221d909b1547a7e4eff Mon Sep 17 00:00:00 2001 From: "jayfely@qq.com" Date: Wed, 11 Mar 2020 10:30:09 +0800 Subject: [PATCH 06/19] Update spmv.c: solve segmentation fault when m and n are larger than 50000 --- benchmark/spmv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmark/spmv.c b/benchmark/spmv.c index 12a33e298..2a26c9416 100644 --- a/benchmark/spmv.c +++ b/benchmark/spmv.c @@ -178,7 +178,7 @@ int main(int argc, char *argv[]){ for(j = 0; j < m; j++){ for(i = 0; i < m * COMPSIZE; i++){ - a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } } From 3e8f1c6cc5586da2e399b8f811ec24129aa21304 Mon Sep 17 00:00:00 2001 From: wuanjun 00447568 Date: Wed, 11 Mar 2020 12:31:48 +0800 Subject: [PATCH 07/19] [OpenBlas]:Add benchmark tpmv.c and modify Makefile [Description]:Solve the problem of missing tpmv.c benchmark file --- benchmark/Makefile | 87 +++++++++++++++++++++++ benchmark/tpmv.c | 172 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 259 insertions(+) create mode 100644 benchmark/tpmv.c diff --git a/benchmark/Makefile b/benchmark/Makefile index 2db873e95..5e0c96c25 100644 --- a/benchmark/Makefile +++ b/benchmark/Makefile @@ -80,6 +80,7 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ cher2k.goto zher2k.goto \ sgemv.goto dgemv.goto cgemv.goto zgemv.goto \ strmv.goto dtrmv.goto ctrmv.goto ztrmv.goto \ + stpmv.goto dtpmv.goto ctpmv.goto ztpmv.goto \ strsv.goto dtrsv.goto ctrsv.goto ztrsv.goto \ sgeev.goto dgeev.goto cgeev.goto zgeev.goto \ sgesv.goto dgesv.goto cgesv.goto zgesv.goto \ @@ -117,6 +118,7 @@ acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ cher2k.acml zher2k.acml \ sgemv.acml dgemv.acml cgemv.acml zgemv.acml \ strmv.acml dtrmv.acml ctrmv.acml ztrmv.acml \ + stpmv.acml dtpmv.acml ctpmv.acml ztpmv.acml \ strsv.acml dtrsv.acml ctrsv.acml ztrsv.acml \ sgeev.acml dgeev.acml cgeev.acml zgeev.acml \ sgesv.acml dgesv.acml cgesv.acml zgesv.acml \ @@ -155,6 +157,7 @@ atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ cher2k.atlas zher2k.atlas \ sgemv.atlas dgemv.atlas cgemv.atlas zgemv.atlas \ strmv.atlas dtrmv.atlas ctrmv.atlas ztrmv.atlas \ + stpmv.atlas dtpmv.atlas ctpmv.atlas ztpmv.atlas \ strsv.atlas dtrsv.atlas ctrsv.atlas ztrsv.atlas \ sgeev.atlas dgeev.atlas cgeev.atlas zgeev.atlas \ sgesv.atlas dgesv.atlas cgesv.atlas zgesv.atlas \ @@ -192,6 +195,7 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ cher2k.mkl zher2k.mkl \ sgemv.mkl dgemv.mkl cgemv.mkl zgemv.mkl \ strmv.mkl dtrmv.mkl ctrmv.mkl ztrmv.mkl \ + stpmv.mkl dtpmv.mkl ctpmv.mkl ztpmv.mkl \ strsv.mkl dtrsv.mkl ctrsv.mkl ztrsv.mkl \ sgeev.mkl dgeev.mkl cgeev.mkl zgeev.mkl \ sgesv.mkl dgesv.mkl cgesv.mkl zgesv.mkl \ @@ -229,6 +233,7 @@ goto :: sgemm.goto dgemm.goto cgemm.goto zgemm.goto \ cher2k.goto zher2k.goto \ sgemv.goto dgemv.goto cgemv.goto zgemv.goto \ strmv.goto dtrmv.goto ctrmv.goto ztrmv.goto \ + stpmv.goto dtpmv.goto ctpmv.goto ztpmv.goto \ strsv.goto dtrsv.goto ctrsv.goto ztrsv.goto \ ssymm.goto dsymm.goto csymm.goto zsymm.goto \ smallscaling \ @@ -272,6 +277,7 @@ acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ cher2k.acml zher2k.acml \ sgemv.acml dgemv.acml cgemv.acml zgemv.acml \ strmv.acml dtrmv.acml ctrmv.acml ztrmv.acml \ + stpmv.acml dtpmv.acml ctpmv.acml ztpmv.acml \ strsv.acml dtrsv.acml ctrsv.acml ztrsv.acml \ sgeev.acml dgeev.acml cgeev.acml zgeev.acml \ sgesv.acml dgesv.acml cgesv.acml zgesv.acml \ @@ -310,6 +316,7 @@ atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ cher2k.atlas zher2k.atlas \ sgemv.atlas dgemv.atlas cgemv.atlas zgemv.atlas \ strmv.atlas dtrmv.atlas ctrmv.atlas ztrmv.atlas \ + stpmv.atlas dtpmv.atlas ctpmv.atlas ztpmv.atlas \ strsv.atlas dtrsv.atlas ctrsv.atlas ztrsv.atlas \ sgeev.atlas dgeev.atlas cgeev.atlas zgeev.atlas \ sgesv.atlas dgesv.atlas cgesv.atlas zgesv.atlas \ @@ -349,6 +356,7 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ cher2k.mkl zher2k.mkl \ sgemv.mkl dgemv.mkl cgemv.mkl zgemv.mkl \ strmv.mkl dtrmv.mkl ctrmv.mkl ztrmv.mkl \ + stpmv.mkl dtpmv.mkl ctpmv.mkl ztpmv.mkl \ strsv.mkl dtrsv.mkl ctrsv.mkl ztrsv.mkl \ sgeev.mkl dgeev.mkl cgeev.mkl zgeev.mkl \ sgesv.mkl dgesv.mkl cgesv.mkl zgesv.mkl \ @@ -397,6 +405,7 @@ veclib :: slinpack.veclib dlinpack.veclib clinpack.veclib zlinpack.veclib \ cher2k.veclib zher2k.veclib \ sgemv.veclib dgemv.veclib cgemv.veclib zgemv.veclib \ strmv.veclib dtrmv.veclib ctrmv.veclib ztrmv.veclib \ + stpmv.veclib dtpmv.veclib ctpmv.veclib ztpmv.veclib \ strsv.veclib dtrsv.veclib ctrsv.veclib ztrsv.veclib \ sgeev.veclib dgeev.veclib cgeev.veclib zgeev.veclib \ sgesv.veclib dgesv.veclib cgesv.veclib zgesv.veclib \ @@ -1363,6 +1372,72 @@ ztrmv.mkl : ztrmv.$(SUFFIX) ztrmv.veclib : ztrmv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Stpmv #################################################### +stpmv.goto : stpmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +stpmv.acml : stpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +stpmv.atlas : stpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +stpmv.mkl : stpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +stpmv.veclib : stpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dtpmv #################################################### +dtpmv.goto : dtpmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dtpmv.acml : dtpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtpmv.atlas : dtpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtpmv.mkl : dtpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtpmv.veclib : dtpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ctpmv #################################################### + +ctpmv.goto : ctpmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ctpmv.acml : ctpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctpmv.atlas : ctpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctpmv.mkl : ctpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctpmv.veclib : ctpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ztpmv #################################################### + +ztpmv.goto : ztpmv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ztpmv.acml : ztpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztpmv.atlas : ztpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztpmv.mkl : ztpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztpmv.veclib : ztpmv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Strsv #################################################### strsv.goto : strsv.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm @@ -2799,6 +2874,18 @@ ctrmv.$(SUFFIX) : trmv.c ztrmv.$(SUFFIX) : trmv.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ +stpmv.$(SUFFIX) : tpmv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dtpmv.$(SUFFIX) : tpmv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +ctpmv.$(SUFFIX) : tpmv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +ztpmv.$(SUFFIX) : tpmv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + strsv.$(SUFFIX) : trsv.c $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ diff --git a/benchmark/tpmv.c b/benchmark/tpmv.c new file mode 100644 index 000000000..ee5b97f24 --- /dev/null +++ b/benchmark/tpmv.c @@ -0,0 +1,172 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + +#undef TPMV + +#ifndef COMPLEX + +#ifdef DOUBLE +#define TPMV BLASFUNC(dtpmv) +#else +#define TPMV BLASFUNC(stpmv) +#endif + +#else + +#ifdef DOUBLE +#define TPMV BLASFUNC(ztpmv) +#else +#define TPMV BLASFUNC(ctpmv) +#endif + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size) +{ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1) { + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]) +{ + + FLOAT *a, *x; + char *p; + + char uplo ='U'; + char trans='N'; + char diag ='U'; + + int loops = 1; + int l; + blasint inc_x=1; + + if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p; + if ((p = getenv("OPENBLAS_TRANS"))) trans=*p; + if ((p = getenv("OPENBLAS_DIAG"))) diag=*p; + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + + blasint n, i, j; + + int from = 1; + int to = 200; + int step = 1; + + struct timespec start = { 0, 0 }, stop = { 0, 0 }; + double time1, timeg; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = %c Trans = %c Diag = %c Loops=%d Inc_x=%d\n", from, + to, step, uplo, trans, diag, loops, inc_x); + + if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL) { + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops\n"); + + for(n = from; n <= to; n += step) { + timeg=0; + + fprintf(stderr, " %6d : ", (int)n); + for(j = 0; j < n; j++) { + for(i = 0; i < n * COMPSIZE; i++) { + a[(long)i + (long)j * (long)n * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + } + + for (i = 0; i < n * COMPSIZE * abs(inc_x); i++) { + x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + + for (l = 0; l < loops; l++) { + clock_gettime(CLOCK_REALTIME, &start); + TPMV (&uplo, &trans, &diag, &n, a, x, &inc_x); + clock_gettime(CLOCK_REALTIME, &stop); + + time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_nsec - start.tv_nsec)) / 1.e9; + timeg += time1; + } + + timeg /= loops; + fprintf(stderr, " %10.2f MFlops %12.9f sec\n", + COMPSIZE * COMPSIZE * 1. * (double)n * (double)n / timeg / 1.e6, timeg); + } + + return 0; +} + +// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); From 649733ff151f148513e4da2b6d15af856bfe3c33 Mon Sep 17 00:00:00 2001 From: "jayfely@qq.com" Date: Wed, 11 Mar 2020 15:48:58 +0800 Subject: [PATCH 08/19] Only keep spmv.goto and spmv.atlas --- benchmark/Makefile | 41 ----------------------------------------- 1 file changed, 41 deletions(-) diff --git a/benchmark/Makefile b/benchmark/Makefile index fdc57819f..03145d1ad 100644 --- a/benchmark/Makefile +++ b/benchmark/Makefile @@ -117,7 +117,6 @@ acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ cherk.acml zherk.acml \ cher2k.acml zher2k.acml \ sgemv.acml dgemv.acml cgemv.acml zgemv.acml \ - sspmv.acml dspmv.acml cspmv.acml zspmv.acml \ strmv.acml dtrmv.acml ctrmv.acml ztrmv.acml \ strsv.acml dtrsv.acml ctrsv.acml ztrsv.acml \ sgeev.acml dgeev.acml cgeev.acml zgeev.acml \ @@ -194,7 +193,6 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ cherk.mkl zherk.mkl \ cher2k.mkl zher2k.mkl \ sgemv.mkl dgemv.mkl cgemv.mkl zgemv.mkl \ - sspmv.mkl dspmv.mkl cspmv.mkl zspmv.mkl \ strmv.mkl dtrmv.mkl ctrmv.mkl ztrmv.mkl \ strsv.mkl dtrsv.mkl ctrsv.mkl ztrsv.mkl \ sgeev.mkl dgeev.mkl cgeev.mkl zgeev.mkl \ @@ -276,7 +274,6 @@ acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ cherk.acml zherk.acml \ cher2k.acml zher2k.acml \ sgemv.acml dgemv.acml cgemv.acml zgemv.acml \ - sspmv.acml dspmv.acml cspmv.acml zspmv.acml \ strmv.acml dtrmv.acml ctrmv.acml ztrmv.acml \ strsv.acml dtrsv.acml ctrsv.acml ztrsv.acml \ sgeev.acml dgeev.acml cgeev.acml zgeev.acml \ @@ -355,7 +352,6 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ cherk.mkl zherk.mkl \ cher2k.mkl zher2k.mkl \ sgemv.mkl dgemv.mkl cgemv.mkl zgemv.mkl \ - sspmv.mkl dspmv.mkl cspmv.mkl zspmv.mkl \ strmv.mkl dtrmv.mkl ctrmv.mkl ztrmv.mkl \ strsv.mkl dtrsv.mkl ctrsv.mkl ztrsv.mkl \ sgeev.mkl dgeev.mkl cgeev.mkl zgeev.mkl \ @@ -404,7 +400,6 @@ veclib :: slinpack.veclib dlinpack.veclib clinpack.veclib zlinpack.veclib \ cherk.veclib zherk.veclib \ cher2k.veclib zher2k.veclib \ sgemv.veclib dgemv.veclib cgemv.veclib zgemv.veclib \ - sspmv.veclib dspmv.veclib cspmv.veclib zspmv.veclib \ strmv.veclib dtrmv.veclib ctrmv.veclib ztrmv.veclib \ strsv.veclib dtrsv.veclib ctrsv.veclib ztrsv.veclib \ sgeev.veclib dgeev.veclib cgeev.veclib zgeev.veclib \ @@ -1310,66 +1305,30 @@ zgemv.veclib : zgemv.$(SUFFIX) sspmv.goto : sspmv.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm -sspmv.acml : sspmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - sspmv.atlas : sspmv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -sspmv.mkl : sspmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -sspmv.veclib : sspmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - ##################################### Dspmv #################################################### dspmv.goto : dspmv.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm -dspmv.acml : dspmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - dspmv.atlas : dspmv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -dspmv.mkl : dspmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -dspmv.veclib : dspmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - ##################################### Cspmv #################################################### cspmv.goto : cspmv.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm -cspmv.acml : cspmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - cspmv.atlas : cspmv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -cspmv.mkl : cspmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -cspmv.veclib : cspmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - ##################################### Zspmv #################################################### zspmv.goto : zspmv.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm -zspmv.acml : zspmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - zspmv.atlas : zspmv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -zspmv.mkl : zspmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -zspmv.veclib : zspmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - ##################################### Strmv #################################################### strmv.goto : strmv.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm From 83ecf9fea7c4e727042181ed5b985d5be05775f7 Mon Sep 17 00:00:00 2001 From: "jayfely@qq.com" Date: Wed, 11 Mar 2020 16:36:45 +0800 Subject: [PATCH 09/19] Modify Makefile in interface to remove the error occured in travis CI --- interface/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/interface/Makefile b/interface/Makefile index e25e5ccfc..3f0dcca28 100644 --- a/interface/Makefile +++ b/interface/Makefile @@ -86,7 +86,7 @@ CBLAS2OBJS = \ cgemv.$(SUFFIX) cgeru.$(SUFFIX) cgerc.$(SUFFIX) \ ctrsv.$(SUFFIX) ctrmv.$(SUFFIX) \ csyr2.$(SUFFIX) cgbmv.$(SUFFIX) \ - csbmv.$(SUFFIX) cspmv.$(SUFFIX) \ + csbmv.$(SUFFIX) \ cspr2.$(SUFFIX) \ ctbsv.$(SUFFIX) ctbmv.$(SUFFIX) \ ctpsv.$(SUFFIX) ctpmv.$(SUFFIX) \ @@ -115,7 +115,7 @@ ZBLAS2OBJS = \ zgemv.$(SUFFIX) zgeru.$(SUFFIX) zgerc.$(SUFFIX) \ ztrsv.$(SUFFIX) ztrmv.$(SUFFIX) \ zsyr2.$(SUFFIX) zgbmv.$(SUFFIX) \ - zsbmv.$(SUFFIX) zspmv.$(SUFFIX) \ + zsbmv.$(SUFFIX) \ zspr2.$(SUFFIX) \ ztbsv.$(SUFFIX) ztbmv.$(SUFFIX) \ ztpsv.$(SUFFIX) ztpmv.$(SUFFIX) \ From ae3f2c2e491193de9082001ef4c6a870863fa7d9 Mon Sep 17 00:00:00 2001 From: "jayfely@qq.com" Date: Wed, 11 Mar 2020 17:02:34 +0800 Subject: [PATCH 10/19] Remove cspmv and zspmv to remove the error occured in travis CI --- benchmark/Makefile | 28 ++++------------------------ 1 file changed, 4 insertions(+), 24 deletions(-) diff --git a/benchmark/Makefile b/benchmark/Makefile index 03145d1ad..2528e74ad 100644 --- a/benchmark/Makefile +++ b/benchmark/Makefile @@ -79,7 +79,7 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ cherk.goto zherk.goto \ cher2k.goto zher2k.goto \ sgemv.goto dgemv.goto cgemv.goto zgemv.goto \ - sspmv.goto dspmv.goto cspmv.goto zspmv.goto \ + sspmv.goto dspmv.goto \ strmv.goto dtrmv.goto ctrmv.goto ztrmv.goto \ strsv.goto dtrsv.goto ctrsv.goto ztrsv.goto \ sgeev.goto dgeev.goto cgeev.goto zgeev.goto \ @@ -155,7 +155,7 @@ atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ cherk.atlas zherk.atlas \ cher2k.atlas zher2k.atlas \ sgemv.atlas dgemv.atlas cgemv.atlas zgemv.atlas \ - sspmv.atlas dspmv.atlas cspmv.atlas zspmv.atlas \ + sspmv.atlas dspmv.atlas \ strmv.atlas dtrmv.atlas ctrmv.atlas ztrmv.atlas \ strsv.atlas dtrsv.atlas ctrsv.atlas ztrsv.atlas \ sgeev.atlas dgeev.atlas cgeev.atlas zgeev.atlas \ @@ -230,7 +230,7 @@ goto :: sgemm.goto dgemm.goto cgemm.goto zgemm.goto \ cherk.goto zherk.goto \ cher2k.goto zher2k.goto \ sgemv.goto dgemv.goto cgemv.goto zgemv.goto \ - sspmv.goto dspmv.goto cspmv.goto zspmv.goto \ + sspmv.goto dspmv.goto \ strmv.goto dtrmv.goto ctrmv.goto ztrmv.goto \ strsv.goto dtrsv.goto ctrsv.goto ztrsv.goto \ ssymm.goto dsymm.goto csymm.goto zsymm.goto \ @@ -312,7 +312,7 @@ atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ cherk.atlas zherk.atlas \ cher2k.atlas zher2k.atlas \ sgemv.atlas dgemv.atlas cgemv.atlas zgemv.atlas \ - sspmv.atlas dspmv.atlas cspmv.atlas zspmv.atlas \ + sspmv.atlas dspmv.atlas \ strmv.atlas dtrmv.atlas ctrmv.atlas ztrmv.atlas \ strsv.atlas dtrsv.atlas ctrsv.atlas ztrsv.atlas \ sgeev.atlas dgeev.atlas cgeev.atlas zgeev.atlas \ @@ -1315,20 +1315,6 @@ dspmv.goto : dspmv.$(SUFFIX) ../$(LIBNAME) dspmv.atlas : dspmv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -##################################### Cspmv #################################################### -cspmv.goto : cspmv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -cspmv.atlas : cspmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - -##################################### Zspmv #################################################### -zspmv.goto : zspmv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -zspmv.atlas : zspmv.$(SUFFIX) - -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) - ##################################### Strmv #################################################### strmv.goto : strmv.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm @@ -2825,12 +2811,6 @@ sspmv.$(SUFFIX) : spmv.c dspmv.$(SUFFIX) : spmv.c $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ -cspmv.$(SUFFIX) : spmv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zspmv.$(SUFFIX) : spmv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - strmv.$(SUFFIX) : trmv.c $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ From a0a3bf7c81ed7bbd98c8e6effd0af2ecfc15577b Mon Sep 17 00:00:00 2001 From: l00546269 Date: Fri, 13 Mar 2020 10:58:39 +0800 Subject: [PATCH 11/19] [OpenBLAS]:fix the iamax benchmark error [Description]:the result for i?amax is not MFlops, it is MBytes --- benchmark/iamax.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmark/iamax.c b/benchmark/iamax.c index 034e24ea9..736f02b89 100644 --- a/benchmark/iamax.c +++ b/benchmark/iamax.c @@ -181,7 +181,7 @@ int main(int argc, char *argv[]){ timeg /= loops; fprintf(stderr, - " %10.2f MFlops %10.6f sec\n", + " %10.2f MBytes %10.6f sec\n", COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg); } From c436e8af7b0e985a3131075071e023d4a0efea8b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 13 Mar 2020 20:10:26 +0100 Subject: [PATCH 12/19] Do not attempt to run ctest without fortran The main Makefile takes care of this in the build process, but users or CI jobs may try to run this directly --- ctest/Makefile | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/ctest/Makefile b/ctest/Makefile index f562c9bb3..6f5b65142 100644 --- a/ctest/Makefile +++ b/ctest/Makefile @@ -40,8 +40,11 @@ ztestl3o = c_zblas3.o c_z3chke.o auxiliary.o c_xerbla.o constant.o ztestl3o_3m = c_zblas3_3m.o c_z3chke_3m.o auxiliary.o c_xerbla.o constant.o - +ifeq ($(NOFORTRAN),1) +all :: +else all :: all1 all2 all3 +endif all1: xscblat1 xdcblat1 xccblat1 xzcblat1 ifndef CROSS From 2d8781b0dc1af15eeb73dfff92e5617088a1fb1f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 13 Mar 2020 20:11:19 +0100 Subject: [PATCH 13/19] Do not attempt to run test without fortran --- test/Makefile | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/test/Makefile b/test/Makefile index 074411b05..7a873b7e5 100644 --- a/test/Makefile +++ b/test/Makefile @@ -1,7 +1,12 @@ TOPDIR = .. include ../Makefile.system + +ifeq ($(NOFORTRAN),1) +all :: +else all :: level1 level2 level3 +endif level1 : sblat1 dblat1 cblat1 zblat1 ifndef CROSS From ee2e758278b5d82b7242f505ea694f082ef65879 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 13 Mar 2020 20:34:13 +0100 Subject: [PATCH 14/19] Move declarations of lapack_complex_custom types outside the extern C fixes #2510 --- lapack-netlib/LAPACKE/include/lapack.h | 44 ++++++++++++++------------ 1 file changed, 23 insertions(+), 21 deletions(-) diff --git a/lapack-netlib/LAPACKE/include/lapack.h b/lapack-netlib/LAPACKE/include/lapack.h index 0a6226fe4..36e53ec24 100644 --- a/lapack-netlib/LAPACKE/include/lapack.h +++ b/lapack-netlib/LAPACKE/include/lapack.h @@ -12,27 +12,6 @@ #include -#ifdef __cplusplus -extern "C" { -#endif - -/*----------------------------------------------------------------------------*/ -#ifndef lapack_int -#define lapack_int int -#endif - -#ifndef lapack_logical -#define lapack_logical lapack_int -#endif - -/* f2c, hence clapack and MacOS Accelerate, returns double instead of float - * for sdot, slange, clange, etc. */ -#if defined(LAPACK_F2C) - typedef double lapack_float_return; -#else - typedef float lapack_float_return; -#endif - /* Complex types are structures equivalent to the * Fortran complex types COMPLEX(4) and COMPLEX(8). * @@ -88,6 +67,29 @@ extern "C" { #endif /* LAPACK_COMPLEX_CUSTOM */ + +#ifdef __cplusplus +extern "C" { +#endif + +/*----------------------------------------------------------------------------*/ +#ifndef lapack_int +#define lapack_int int +#endif + +#ifndef lapack_logical +#define lapack_logical lapack_int +#endif + +/* f2c, hence clapack and MacOS Accelerate, returns double instead of float + * for sdot, slange, clange, etc. */ +#if defined(LAPACK_F2C) + typedef double lapack_float_return; +#else + typedef float lapack_float_return; +#endif + + /* Callback logical functions of one, two, or three arguments are used * to select eigenvalues to sort to the top left of the Schur form. * The value is selected if function returns TRUE (non-zero). */ From 2428dc9fd3a032738f6dbe5728caadd55278765e Mon Sep 17 00:00:00 2001 From: wuanjun 00447568 Date: Sat, 14 Mar 2020 09:11:08 +0800 Subject: [PATCH 15/19] [OpenBlas]: Add benchmark tpsv file and modify benchmark/Makefile [Description]: Solve lack of tpsv benchmark. --- benchmark/Makefile | 88 +++++++++++++++++++++++ benchmark/tpsv.c | 172 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 260 insertions(+) create mode 100644 benchmark/tpsv.c diff --git a/benchmark/Makefile b/benchmark/Makefile index 5e0c96c25..116515b01 100644 --- a/benchmark/Makefile +++ b/benchmark/Makefile @@ -81,6 +81,7 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ sgemv.goto dgemv.goto cgemv.goto zgemv.goto \ strmv.goto dtrmv.goto ctrmv.goto ztrmv.goto \ stpmv.goto dtpmv.goto ctpmv.goto ztpmv.goto \ + stpsv.goto dtpsv.goto ctpsv.goto ztpsv.goto \ strsv.goto dtrsv.goto ctrsv.goto ztrsv.goto \ sgeev.goto dgeev.goto cgeev.goto zgeev.goto \ sgesv.goto dgesv.goto cgesv.goto zgesv.goto \ @@ -119,6 +120,7 @@ acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ sgemv.acml dgemv.acml cgemv.acml zgemv.acml \ strmv.acml dtrmv.acml ctrmv.acml ztrmv.acml \ stpmv.acml dtpmv.acml ctpmv.acml ztpmv.acml \ + stpsv.acml dtpsv.acml ctpsv.acml ztpsv.acml \ strsv.acml dtrsv.acml ctrsv.acml ztrsv.acml \ sgeev.acml dgeev.acml cgeev.acml zgeev.acml \ sgesv.acml dgesv.acml cgesv.acml zgesv.acml \ @@ -158,6 +160,7 @@ atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ sgemv.atlas dgemv.atlas cgemv.atlas zgemv.atlas \ strmv.atlas dtrmv.atlas ctrmv.atlas ztrmv.atlas \ stpmv.atlas dtpmv.atlas ctpmv.atlas ztpmv.atlas \ + stpsv.atlas dtpsv.atlas ctpsv.atlas ztpsv.atlas \ strsv.atlas dtrsv.atlas ctrsv.atlas ztrsv.atlas \ sgeev.atlas dgeev.atlas cgeev.atlas zgeev.atlas \ sgesv.atlas dgesv.atlas cgesv.atlas zgesv.atlas \ @@ -196,6 +199,7 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ sgemv.mkl dgemv.mkl cgemv.mkl zgemv.mkl \ strmv.mkl dtrmv.mkl ctrmv.mkl ztrmv.mkl \ stpmv.mkl dtpmv.mkl ctpmv.mkl ztpmv.mkl \ + stpsv.mkl dtpsv.mkl ctpsv.mkl ztpsv.mkl \ strsv.mkl dtrsv.mkl ctrsv.mkl ztrsv.mkl \ sgeev.mkl dgeev.mkl cgeev.mkl zgeev.mkl \ sgesv.mkl dgesv.mkl cgesv.mkl zgesv.mkl \ @@ -234,6 +238,7 @@ goto :: sgemm.goto dgemm.goto cgemm.goto zgemm.goto \ sgemv.goto dgemv.goto cgemv.goto zgemv.goto \ strmv.goto dtrmv.goto ctrmv.goto ztrmv.goto \ stpmv.goto dtpmv.goto ctpmv.goto ztpmv.goto \ + stpsv.goto dtpsv.goto ctpsv.goto ztpsv.goto \ strsv.goto dtrsv.goto ctrsv.goto ztrsv.goto \ ssymm.goto dsymm.goto csymm.goto zsymm.goto \ smallscaling \ @@ -278,6 +283,7 @@ acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ sgemv.acml dgemv.acml cgemv.acml zgemv.acml \ strmv.acml dtrmv.acml ctrmv.acml ztrmv.acml \ stpmv.acml dtpmv.acml ctpmv.acml ztpmv.acml \ + stpsv.acml dtpsv.acml ctpsv.acml ztpsv.acml \ strsv.acml dtrsv.acml ctrsv.acml ztrsv.acml \ sgeev.acml dgeev.acml cgeev.acml zgeev.acml \ sgesv.acml dgesv.acml cgesv.acml zgesv.acml \ @@ -317,6 +323,7 @@ atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ sgemv.atlas dgemv.atlas cgemv.atlas zgemv.atlas \ strmv.atlas dtrmv.atlas ctrmv.atlas ztrmv.atlas \ stpmv.atlas dtpmv.atlas ctpmv.atlas ztpmv.atlas \ + stpsv.atlas dtpsv.atlas ctpsv.atlas ztpsv.atlas \ strsv.atlas dtrsv.atlas ctrsv.atlas ztrsv.atlas \ sgeev.atlas dgeev.atlas cgeev.atlas zgeev.atlas \ sgesv.atlas dgesv.atlas cgesv.atlas zgesv.atlas \ @@ -357,6 +364,7 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ sgemv.mkl dgemv.mkl cgemv.mkl zgemv.mkl \ strmv.mkl dtrmv.mkl ctrmv.mkl ztrmv.mkl \ stpmv.mkl dtpmv.mkl ctpmv.mkl ztpmv.mkl \ + stpsv.mkl dtpsv.mkl ctpsv.mkl ztpsv.mkl \ strsv.mkl dtrsv.mkl ctrsv.mkl ztrsv.mkl \ sgeev.mkl dgeev.mkl cgeev.mkl zgeev.mkl \ sgesv.mkl dgesv.mkl cgesv.mkl zgesv.mkl \ @@ -406,6 +414,7 @@ veclib :: slinpack.veclib dlinpack.veclib clinpack.veclib zlinpack.veclib \ sgemv.veclib dgemv.veclib cgemv.veclib zgemv.veclib \ strmv.veclib dtrmv.veclib ctrmv.veclib ztrmv.veclib \ stpmv.veclib dtpmv.veclib ctpmv.veclib ztpmv.veclib \ + stpsv.veclib dtpsv.veclib ctpsv.veclib ztpsv.veclib \ strsv.veclib dtrsv.veclib ctrsv.veclib ztrsv.veclib \ sgeev.veclib dgeev.veclib cgeev.veclib zgeev.veclib \ sgesv.veclib dgesv.veclib cgesv.veclib zgesv.veclib \ @@ -1438,6 +1447,73 @@ ztpmv.mkl : ztpmv.$(SUFFIX) ztpmv.veclib : ztpmv.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Stpsv #################################################### +stpsv.goto : stpsv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +stpsv.acml : stpsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +stpsv.atlas : stpsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +stpsv.mkl : stpsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +stpsv.veclib : stpsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Dtpsv #################################################### +dtpsv.goto : dtpsv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +dtpsv.acml : dtpsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtpsv.atlas : dtpsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtpsv.mkl : dtpsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +dtpsv.veclib : dtpsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ctpsv #################################################### + +ctpsv.goto : ctpsv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ctpsv.acml : ctpsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctpsv.atlas : ctpsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctpsv.mkl : ctpsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ctpsv.veclib : ctpsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Ztpsv #################################################### + +ztpsv.goto : ztpsv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +ztpsv.acml : ztpsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztpsv.atlas : ztpsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztpsv.mkl : ztpsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +ztpsv.veclib : ztpsv.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Strsv #################################################### strsv.goto : strsv.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm @@ -2886,6 +2962,18 @@ ctpmv.$(SUFFIX) : tpmv.c ztpmv.$(SUFFIX) : tpmv.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ +stpsv.$(SUFFIX) : tpsv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dtpsv.$(SUFFIX) : tpsv.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +ctpsv.$(SUFFIX) : tpsv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +ztpsv.$(SUFFIX) : tpsv.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + strsv.$(SUFFIX) : trsv.c $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ diff --git a/benchmark/tpsv.c b/benchmark/tpsv.c new file mode 100644 index 000000000..46d78fd17 --- /dev/null +++ b/benchmark/tpsv.c @@ -0,0 +1,172 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + +#undef TPSV + +#ifndef COMPLEX + +#ifdef DOUBLE +#define TPSV BLASFUNC(dtpsv) +#else +#define TPSV BLASFUNC(stpsv) +#endif + +#else + +#ifdef DOUBLE +#define TPSV BLASFUNC(ztpsv) +#else +#define TPSV BLASFUNC(ctpsv) +#endif + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size) +{ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1) { + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]) +{ + + FLOAT *a, *x; + char *p; + + char uplo ='U'; + char trans='N'; + char diag ='U'; + + int loops = 1; + int l; + blasint inc_x=1; + + if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p; + if ((p = getenv("OPENBLAS_TRANS"))) trans=*p; + if ((p = getenv("OPENBLAS_DIAG"))) diag=*p; + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + + blasint n, i, j; + + int from = 1; + int to = 200; + int step = 1; + + struct timespec start = { 0, 0 }, stop = { 0, 0 }; + double time1, timeg; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = %c Trans = %c Diag = %c Loops=%d Inc_x=%d\n", from, + to, step, uplo, trans, diag, loops, inc_x); + + if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL) { + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops\n"); + + for(n = from; n <= to; n += step) { + timeg=0; + + fprintf(stderr, " %6d : ", (int)n); + for(j = 0; j < n; j++) { + for(i = 0; i < n * COMPSIZE; i++) { + a[(long)i + (long)j * (long)n * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + } + + for (i = 0; i < n * COMPSIZE * abs(inc_x); i++) { + x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + + for (l = 0; l < loops; l++) { + clock_gettime(CLOCK_REALTIME, &start); + TPSV (&uplo, &trans, &diag, &n, a, x, &inc_x); + clock_gettime(CLOCK_REALTIME, &stop); + + time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_nsec - start.tv_nsec)) / 1.e9; + timeg += time1; + } + + timeg /= loops; + fprintf(stderr, " %10.2f MFlops %12.9f sec\n", + COMPSIZE * COMPSIZE * 1. * (double)n * (double)n / timeg / 1.e6, timeg); + } + + return 0; +} + +// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); From d45c53ecf1f9aef0833643f7c16140ffa4d6b60d Mon Sep 17 00:00:00 2001 From: l00536773 Date: Mon, 16 Mar 2020 11:19:05 +0800 Subject: [PATCH 16/19] [OpenBLAS]: benchmark for her/her2 LEVEL2 functions [description]: benchmark for her/her2 [solution]: added benchmark for her/her2, modified makefile in benchmark [dts]: --- benchmark/Makefile | 98 +++++++++++++++++++++++ benchmark/her.c | 186 ++++++++++++++++++++++++++++++++++++++++++++ benchmark/her2.c | 190 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 474 insertions(+) create mode 100644 benchmark/her.c create mode 100644 benchmark/her2.c diff --git a/benchmark/Makefile b/benchmark/Makefile index df17366d7..efb4da5ea 100644 --- a/benchmark/Makefile +++ b/benchmark/Makefile @@ -78,6 +78,8 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ chemm.goto zhemm.goto \ cherk.goto zherk.goto \ cher2k.goto zher2k.goto \ + cher.goto zher.goto \ + cher2.goto zher2.goto \ sgemv.goto dgemv.goto cgemv.goto zgemv.goto \ sspmv.goto dspmv.goto \ strmv.goto dtrmv.goto ctrmv.goto ztrmv.goto \ @@ -117,6 +119,8 @@ acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ chemm.acml zhemm.acml \ cherk.acml zherk.acml \ cher2k.acml zher2k.acml \ + cher.acml zher.acml \ + cher2.acml zher2.acml \ sgemv.acml dgemv.acml cgemv.acml zgemv.acml \ strmv.acml dtrmv.acml ctrmv.acml ztrmv.acml \ stpmv.acml dtpmv.acml ctpmv.acml ztpmv.acml \ @@ -156,6 +160,8 @@ atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ chemm.atlas zhemm.atlas \ cherk.atlas zherk.atlas \ cher2k.atlas zher2k.atlas \ + cher.atlas zher.atlas \ + cher2.atlas zher2.atlas \ sgemv.atlas dgemv.atlas cgemv.atlas zgemv.atlas \ sspmv.atlas dspmv.atlas \ strmv.atlas dtrmv.atlas ctrmv.atlas ztrmv.atlas \ @@ -195,6 +201,8 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ chemm.mkl zhemm.mkl \ cherk.mkl zherk.mkl \ cher2k.mkl zher2k.mkl \ + cher.mkl zher.mkl \ + cher2.mkl zher2.mkl \ sgemv.mkl dgemv.mkl cgemv.mkl zgemv.mkl \ strmv.mkl dtrmv.mkl ctrmv.mkl ztrmv.mkl \ stpmv.mkl dtpmv.mkl ctpmv.mkl ztpmv.mkl \ @@ -233,6 +241,8 @@ goto :: sgemm.goto dgemm.goto cgemm.goto zgemm.goto \ chemm.goto zhemm.goto \ cherk.goto zherk.goto \ cher2k.goto zher2k.goto \ + cher.goto zher.goto \ + cher2.goto zher2.goto \ sgemv.goto dgemv.goto cgemv.goto zgemv.goto \ sspmv.goto dspmv.goto \ strmv.goto dtrmv.goto ctrmv.goto ztrmv.goto \ @@ -278,6 +288,8 @@ acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ chemm.acml zhemm.acml \ cherk.acml zherk.acml \ cher2k.acml zher2k.acml \ + cher.acml zher.acml \ + cher2.acml zher2.acml \ sgemv.acml dgemv.acml cgemv.acml zgemv.acml \ strmv.acml dtrmv.acml ctrmv.acml ztrmv.acml \ stpmv.acml dtpmv.acml ctpmv.acml ztpmv.acml \ @@ -317,6 +329,8 @@ atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ chemm.atlas zhemm.atlas \ cherk.atlas zherk.atlas \ cher2k.atlas zher2k.atlas \ + cher.atlas zher.atlas \ + cher2.atlas zher2.atlas \ sgemv.atlas dgemv.atlas cgemv.atlas zgemv.atlas \ sspmv.atlas dspmv.atlas \ strmv.atlas dtrmv.atlas ctrmv.atlas ztrmv.atlas \ @@ -358,6 +372,8 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ chemm.mkl zhemm.mkl \ cherk.mkl zherk.mkl \ cher2k.mkl zher2k.mkl \ + cher.mkl zher.mkl \ + cher2.mkl zher2.mkl \ sgemv.mkl dgemv.mkl cgemv.mkl zgemv.mkl \ strmv.mkl dtrmv.mkl ctrmv.mkl ztrmv.mkl \ stpmv.mkl dtpmv.mkl ctpmv.mkl ztpmv.mkl \ @@ -407,6 +423,8 @@ veclib :: slinpack.veclib dlinpack.veclib clinpack.veclib zlinpack.veclib \ chemm.veclib zhemm.veclib \ cherk.veclib zherk.veclib \ cher2k.veclib zher2k.veclib \ + cher.veclib zher.veclib \ + cher2.veclib zher2.veclib \ sgemv.veclib dgemv.veclib cgemv.veclib zgemv.veclib \ strmv.veclib dtrmv.veclib ctrmv.veclib ztrmv.veclib \ stpmv.veclib dtpmv.veclib ctpmv.veclib ztpmv.veclib \ @@ -1244,6 +1262,74 @@ zher2k.mkl : zher2k.$(SUFFIX) zher2k.veclib : zher2k.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) +##################################### Cher #################################################### + +cher.goto : cher.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +cher.acml : cher.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cher.atlas : cher.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cher.mkl : cher.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cher.veclib : cher.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zher #################################################### + +zher.goto : zher.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zher.acml : zher.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zher.atlas : zher.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zher.mkl : zher.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zher.veclib : zher.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Cher2 #################################################### + +cher2.goto : cher2.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +cher2.acml : cher2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cher2.atlas : cher2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cher2.mkl : cher2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +cher2.veclib : cher2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +##################################### Zher2 #################################################### + +zher2.goto : zher2.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +zher2.acml : zher2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zher2.atlas : zher2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zher2.mkl : zher2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + +zher2.veclib : zher2.$(SUFFIX) + -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) + ##################################### Sgemv #################################################### sgemv.goto : sgemv.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm @@ -2868,6 +2954,18 @@ cher2k.$(SUFFIX) : her2k.c zher2k.$(SUFFIX) : her2k.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ +cher.$(SUFFIX) : her.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zher.$(SUFFIX) : her.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +cher2.$(SUFFIX) : her2.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zher2.$(SUFFIX) : her2.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + sgemv.$(SUFFIX) : gemv.c $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ diff --git a/benchmark/her.c b/benchmark/her.c new file mode 100644 index 000000000..f4e10b684 --- /dev/null +++ b/benchmark/her.c @@ -0,0 +1,186 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + + +#undef HER + + +#ifdef DOUBLE +#define HER BLASFUNC(zher) +#else +#define HER BLASFUNC(cher) +#endif + + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *a, *x; + FLOAT alpha[] = {1.0, 1.0}; + blasint incx = 1; + char *p; + + char uplo='U'; + char trans='N'; + + if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p; + if ((p = getenv("OPENBLAS_TRANS"))) trans=*p; + + blasint m, i, j; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = %c Trans = %c\n", from, to, step,uplo,trans); + + + if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops\n"); + + for(m = from; m <= to; m += step) + { + fprintf(stderr, " %6d : ", (int)m); + + for(j = 0; j < m; j++){ + for(i = 0; i < m * COMPSIZE; i++){ + a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + x[ (long)j * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + + gettimeofday( &start, (struct timezone *)0); + + HER (&uplo, &m, alpha, x, &incx, a, &m ); + + gettimeofday( &stop, (struct timezone *)0); + + time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + + gettimeofday( &start, (struct timezone *)0); + + fprintf(stderr, + " %10.2f MFlops\n", + COMPSIZE * COMPSIZE * 1. * (double)m * (double)m / time1 * 1.e-6); + + } + + return 0; +} diff --git a/benchmark/her2.c b/benchmark/her2.c new file mode 100644 index 000000000..e10b7e98e --- /dev/null +++ b/benchmark/her2.c @@ -0,0 +1,190 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + + +#undef HER2 + + +#ifdef DOUBLE +#define HER2 BLASFUNC(zher2) +#else +#define HER2 BLASFUNC(cher2) +#endif + + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *a, *x, *y; + FLOAT alpha[] = {1.0, 1.0}; + blasint inc = 1; + char *p; + + char uplo='U'; + char trans='N'; + + if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p; + if ((p = getenv("OPENBLAS_TRANS"))) trans=*p; + + blasint m, i, j; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = %c Trans = %c\n", from, to, step,uplo,trans); + + + if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops\n"); + + for(m = from; m <= to; m += step) + { + fprintf(stderr, " %6d : ", (int)m); + + for(j = 0; j < m; j++){ + for(i = 0; i < m * COMPSIZE; i++){ + a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + x[ (long)j * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + y[ (long)j * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + + gettimeofday( &start, (struct timezone *)0); + + + HER2 (&uplo, &m, alpha, x, &inc, y, &inc, a, &m ); + + gettimeofday( &stop, (struct timezone *)0); + + time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + + gettimeofday( &start, (struct timezone *)0); + + fprintf(stderr, + " %10.2f MFlops\n", + COMPSIZE * COMPSIZE * 2. * (double)m * (double)m / time1 * 1.e-6); + + } + + return 0; +} From fa049d49c2b8aa26e9de09d328a0f31c3810d145 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Tue, 17 Mar 2020 00:34:08 +0800 Subject: [PATCH 17/19] AVX2 STRSM kernel --- kernel/x86_64/KERNEL.HASWELL | 8 +- kernel/x86_64/strsm_kernel_8x4_haswell_LN.c | 240 +++++++++++++++ kernel/x86_64/strsm_kernel_8x4_haswell_LT.c | 228 ++++++++++++++ .../strsm_kernel_8x4_haswell_L_common.h | 187 ++++++++++++ kernel/x86_64/strsm_kernel_8x4_haswell_RN.c | 279 +++++++++++++++++ kernel/x86_64/strsm_kernel_8x4_haswell_RT.c | 281 ++++++++++++++++++ .../strsm_kernel_8x4_haswell_R_common.h | 226 ++++++++++++++ 7 files changed, 1445 insertions(+), 4 deletions(-) create mode 100644 kernel/x86_64/strsm_kernel_8x4_haswell_LN.c create mode 100644 kernel/x86_64/strsm_kernel_8x4_haswell_LT.c create mode 100644 kernel/x86_64/strsm_kernel_8x4_haswell_L_common.h create mode 100644 kernel/x86_64/strsm_kernel_8x4_haswell_RN.c create mode 100644 kernel/x86_64/strsm_kernel_8x4_haswell_RT.c create mode 100644 kernel/x86_64/strsm_kernel_8x4_haswell_R_common.h diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL index f6ca5c2d5..ef8b36a57 100644 --- a/kernel/x86_64/KERNEL.HASWELL +++ b/kernel/x86_64/KERNEL.HASWELL @@ -77,10 +77,10 @@ ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) -STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +STRSMKERNEL_LN = strsm_kernel_8x4_haswell_LN.c +STRSMKERNEL_LT = strsm_kernel_8x4_haswell_LT.c +STRSMKERNEL_RN = strsm_kernel_8x4_haswell_RN.c +STRSMKERNEL_RT = strsm_kernel_8x4_haswell_RT.c DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c diff --git a/kernel/x86_64/strsm_kernel_8x4_haswell_LN.c b/kernel/x86_64/strsm_kernel_8x4_haswell_LN.c new file mode 100644 index 000000000..4131debb1 --- /dev/null +++ b/kernel/x86_64/strsm_kernel_8x4_haswell_LN.c @@ -0,0 +1,240 @@ +#include "common.h" +#include +#include "strsm_kernel_8x4_haswell_L_common.h" + +#define SOLVE_LN_m1n4 \ + "subq $4,%2; movq %2,%3;" GEMM_SUM_REORDER_1x4(4)\ + SOLVE_m1n4(-4,4) SAVE_b_m1n4(-16,4)\ + "movq %2,%3;" save_c_m1n4(4) + +#define SOLVE_LN_m1n8 \ + "subq $4,%2; movq %2,%3;" GEMM_SUM_REORDER_1x4(4) GEMM_SUM_REORDER_1x4(5)\ + SOLVE_m1n8(-4,4,5) SAVE_b_m1n8(-16,4,5)\ + "movq %2,%3;" save_c_m1n4(4) save_c_m1n4(5) + +#define SOLVE_LN_m1n12 \ + "subq $4,%2; movq %2,%3;" GEMM_SUM_REORDER_1x4(4) GEMM_SUM_REORDER_1x4(5) GEMM_SUM_REORDER_1x4(6)\ + SOLVE_m1n12(-4,4,5,6) SAVE_b_m1n12(-16,4,5,6)\ + "movq %2,%3;" save_c_m1n4(4) save_c_m1n4(5) save_c_m1n4(6) + +#define SOLVE_LN_m2n4 \ + "subq $8,%2; movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5,4)\ + SOLVE_loup_m2n4(-8,4)\ + SOLVE_up_m2n4(-16,4) SAVE_b_m2n4(-32,4)\ + "movq %2,%3;" save_c_m2n4(4) + +#define SOLVE_LN_m2n8 \ + "subq $8,%2; movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5,4) GEMM_SUM_REORDER_2x4(6,7,5)\ + SOLVE_loup_m2n8(-8,4,5)\ + SOLVE_up_m2n8(-16,4,5) SAVE_b_m2n8(-32,4,5)\ + "movq %2,%3;" save_c_m2n4(4) save_c_m2n4(5) + +#define SOLVE_LN_m2n12 \ + "subq $8,%2; movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5,4) GEMM_SUM_REORDER_2x4(6,7,5) GEMM_SUM_REORDER_2x4(8,9,6)\ + SOLVE_loup_m2n12(-8,4,5,6)\ + SOLVE_up_m2n12(-16,4,5,6) SAVE_b_m2n12(-32,4,5,6)\ + "movq %2,%3;" save_c_m2n4(4) save_c_m2n4(5) save_c_m2n4(6) + +#define SOLVE_LN_m4n4 \ + "subq $16,%2; movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5)\ +\ + SOLVE_loup_m2n4(-8,5) SUBTRACT_m2n4(-16,4)\ + SOLVE_up_m2n4(-24,5) SUBTRACT_m2n4(-32,4) SAVE_b_m2n4(-32,5)\ +\ + SOLVE_loup_m2n4(-48,4)\ + SOLVE_up_m2n4(-64,4) SAVE_b_m2n4(-64,4)\ +\ + "movq %2,%3;" save_c_m4n4(4,5) + +#define SOLVE_LN_m4n8 \ + "subq $16,%2; movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) GEMM_SUM_REORDER_4x4(8,9,10,11,6,7)\ +\ + SOLVE_loup_m2n8(-8,5,7) SUBTRACT_m2n8(-16,4,6)\ + SOLVE_up_m2n8(-24,5,7) SUBTRACT_m2n8(-32,4,6) SAVE_b_m2n8(-32,5,7)\ +\ + SOLVE_loup_m2n8(-48,4,6)\ + SOLVE_up_m2n8(-64,4,6) SAVE_b_m2n8(-64,4,6)\ +\ + "movq %2,%3;" save_c_m4n4(4,5) save_c_m4n4(6,7) + +#define SOLVE_LN_m4n12 \ + "subq $16,%2; movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) GEMM_SUM_REORDER_4x4(8,9,10,11,6,7) GEMM_SUM_REORDER_4x4(12,13,14,15,8,9)\ +\ + SOLVE_loup_m2n12(-8,5,7,9) SUBTRACT_m2n12(-16,4,6,8)\ + SOLVE_up_m2n12(-24,5,7,9) SUBTRACT_m2n12(-32,4,6,8) SAVE_b_m2n12(-32,5,7,9)\ +\ + SOLVE_loup_m2n12(-48,4,6,8)\ + SOLVE_up_m2n12(-64,4,6,8) SAVE_b_m2n12(-64,4,6,8)\ +\ + "movq %2,%3;" save_c_m4n4(4,5) save_c_m4n4(6,7) save_c_m4n4(8,9) + +#define SOLVE_LN_m8n4 \ + "subq $32,%2; movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,-32)\ +\ + SOLVE_loup_m2n4(-8,7) SUBTRACT_m2n4(-16,6) SUBTRACT_m2n4(-24,5) SUBTRACT_m2n4(-32,4)\ + SOLVE_up_m2n4(-40,7) SUBTRACT_m2n4(-48,6) SUBTRACT_m2n4(-56,5) SUBTRACT_m2n4(-64,4) SAVE_b_m2n4(-32,7)\ +\ + SOLVE_loup_m2n4(-80,6) SUBTRACT_m2n4(-88,5) SUBTRACT_m2n4(-96,4)\ + SOLVE_up_m2n4(-112,6) SUBTRACT_m2n4(-120,5) SUBTRACT_m2n4(-128,4) SAVE_b_m2n4(-64,6)\ +\ + SOLVE_loup_m2n4(-152,5) SUBTRACT_m2n4(-160,4)\ + SOLVE_up_m2n4(-184,5) SUBTRACT_m2n4(-192,4) SAVE_b_m2n4(-96,5)\ +\ + SOLVE_loup_m2n4(-224,4)\ + SOLVE_up_m2n4(-256,4) SAVE_b_m2n4(-128,4)\ +\ + "movq %2,%3;" save_c_m8n4(4,5,6,7) + +#define SOLVE_LN_m8n8 \ + "subq $32,%2; movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,-32) GEMM_SUM_REORDER_8x4(8,9,10,11,-32)\ +\ + SOLVE_loup_m2n8(-8,7,11) SUBTRACT_m2n8(-16,6,10) SUBTRACT_m2n8(-24,5,9) SUBTRACT_m2n8(-32,4,8)\ + SOLVE_up_m2n8(-40,7,11) SUBTRACT_m2n8(-48,6,10) SUBTRACT_m2n8(-56,5,9) SUBTRACT_m2n8(-64,4,8) SAVE_b_m2n8(-32,7,11)\ +\ + SOLVE_loup_m2n8(-80,6,10) SUBTRACT_m2n8(-88,5,9) SUBTRACT_m2n8(-96,4,8)\ + SOLVE_up_m2n8(-112,6,10) SUBTRACT_m2n8(-120,5,9) SUBTRACT_m2n8(-128,4,8) SAVE_b_m2n8(-64,6,10)\ +\ + SOLVE_loup_m2n8(-152,5,9) SUBTRACT_m2n8(-160,4,8)\ + SOLVE_up_m2n8(-184,5,9) SUBTRACT_m2n8(-192,4,8) SAVE_b_m2n8(-96,5,9)\ +\ + SOLVE_loup_m2n8(-224,4,8)\ + SOLVE_up_m2n8(-256,4,8) SAVE_b_m2n8(-128,4,8)\ +\ + "movq %2,%3;" save_c_m8n4(4,5,6,7) save_c_m8n4(8,9,10,11) + +#define SOLVE_LN_m8n12 \ + "subq $32,%2; movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,-32) GEMM_SUM_REORDER_8x4(8,9,10,11,-32) GEMM_SUM_REORDER_8x4(12,13,14,15,-32)\ +\ + SOLVE_loup_m2n12(-8,7,11,15) SUBTRACT_m2n12(-16,6,10,14) SUBTRACT_m2n12(-24,5,9,13) SUBTRACT_m2n12(-32,4,8,12)\ + SOLVE_up_m2n12(-40,7,11,15) SUBTRACT_m2n12(-48,6,10,14) SUBTRACT_m2n12(-56,5,9,13) SUBTRACT_m2n12(-64,4,8,12) SAVE_b_m2n12(-32,7,11,15)\ +\ + SOLVE_loup_m2n12(-80,6,10,14) SUBTRACT_m2n12(-88,5,9,13) SUBTRACT_m2n12(-96,4,8,12)\ + SOLVE_up_m2n12(-112,6,10,14) SUBTRACT_m2n12(-120,5,9,13) SUBTRACT_m2n12(-128,4,8,12) SAVE_b_m2n12(-64,6,10,14)\ +\ + SOLVE_loup_m2n12(-152,5,9,13) SUBTRACT_m2n12(-160,4,8,12)\ + SOLVE_up_m2n12(-184,5,9,13) SUBTRACT_m2n12(-192,4,8,12) SAVE_b_m2n12(-96,5,9,13)\ +\ + SOLVE_loup_m2n12(-224,4,8,12)\ + SOLVE_up_m2n12(-256,4,8,12) SAVE_b_m2n12(-128,4,8,12)\ +\ + "movq %2,%3;" save_c_m8n4(4,5,6,7) save_c_m8n4(8,9,10,11) save_c_m8n4(12,13,14,15) + +/* r13 = k-kk, r14 = b_tail, r15 = a_tail */ + +#define GEMM_LN_SIMPLE(mdim,ndim) \ + "movq %%r15,%0; negq %%r12; leaq (%%r15,%%r12,"#mdim"),%%r15; negq %%r12;"\ + "movq %%r13,%5; addq $"#mdim",%%r13; movq %%r14,%1;" INIT_m##mdim##n##ndim\ + "testq %5,%5; jz 2"#mdim""#ndim"2f;"\ + "2"#mdim""#ndim"1:\n\t"\ + "subq $16,%1; subq $"#mdim"*4,%0;" GEMM_KERNEL_k1m##mdim##n##ndim "decq %5; jnz 2"#mdim""#ndim"1b;"\ + "2"#mdim""#ndim"2:\n\t" +#define GEMM_LN_m8n4 GEMM_LN_SIMPLE(8,4) +#define GEMM_LN_m8n8 GEMM_LN_SIMPLE(8,8) +#define GEMM_LN_m8n12 \ + "movq %%r15,%0; negq %%r12; leaq (%%r15,%%r12,8),%%r15; negq %%r12; movq %%r13,%5; addq $8,%%r13; movq %%r14,%1;" INIT_m8n12\ + "cmpq $8,%5; jb 28122f;"\ + "28121:\n\t"\ + "prefetcht0 -384(%0); subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ + "subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ + "prefetcht0 -384(%0); subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ + "subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ + "prefetcht0 -384(%0); subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ + "subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ + "prefetcht0 -384(%0); subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ + "subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ + "subq $8,%5; cmpq $8,%5; jnb 28121b;"\ + "28122:\n\t"\ + "testq %5,%5; jz 28124f;"\ + "28123:\n\t"\ + "subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12 "decq %5; jnz 28123b;"\ + "28124:\n\t" +#define GEMM_LN_m4n4 GEMM_LN_SIMPLE(4,4) +#define GEMM_LN_m4n8 GEMM_LN_SIMPLE(4,8) +#define GEMM_LN_m4n12 GEMM_LN_SIMPLE(4,12) +#define GEMM_LN_m2n4 GEMM_LN_SIMPLE(2,4) +#define GEMM_LN_m2n8 GEMM_LN_SIMPLE(2,8) +#define GEMM_LN_m2n12 GEMM_LN_SIMPLE(2,12) +#define GEMM_LN_m1n4 GEMM_LN_SIMPLE(1,4) +#define GEMM_LN_m1n8 GEMM_LN_SIMPLE(1,8) +#define GEMM_LN_m1n12 GEMM_LN_SIMPLE(1,12) + +#define COMPUTE(ndim) {\ + c_ptr += M;\ + __asm__ __volatile__(\ + "movq %0,%%r15; movq %7,%%r13; movq %6,%%r12; salq $2,%%r12; leaq (%1,%%r12,4),%%r14; movq %10,%%r11;"\ + "testq $1,%%r11; jz "#ndim"772f;"\ + #ndim"771:\n\t"\ + GEMM_LN_m1n##ndim SOLVE_LN_m1n##ndim "subq $1,%%r11;"\ + #ndim"772:\n\t"\ + "testq $2,%%r11; jz "#ndim"773f;"\ + GEMM_LN_m2n##ndim SOLVE_LN_m2n##ndim "subq $2,%%r11;"\ + #ndim"773:\n\t"\ + "testq $4,%%r11; jz "#ndim"774f;"\ + GEMM_LN_m4n##ndim SOLVE_LN_m4n##ndim "subq $4,%%r11;"\ + #ndim"774:\n\t"\ + "testq %%r11,%%r11; jz "#ndim"776f;"\ + #ndim"775:\n\t"\ + GEMM_LN_m8n##ndim SOLVE_LN_m8n##ndim "subq $8,%%r11; jnz "#ndim"775b;"\ + #ndim"776:\n\t"\ + "movq %%r15,%0; movq %%r14,%1; vzeroupper;"\ + :"+r"(a_ptr),"+r"(b_ptr),"+r"(c_ptr),"+r"(c_tmp),"+r"(ldc_bytes),"+r"(k_cnt):"m"(K),"m"(kmkkinp),"m"(one[0]),"m"(zero[0]),"m"(M)\ + :"r11","r12","r13","r14","r15","cc","memory",\ + "xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15");\ + a_ptr += M * K; b_ptr += (ndim-4) * K; c_ptr += ldc * ndim;\ +} +static void solve_LN(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + FLOAT a0, b0; + int i, j, k; + for (i=m-1;i>=0;i--) { + a0 = a[i*m+i]; //reciprocal of the original value + for (j=0;j0) GEMM_KERNEL_N(1,n,k-kk,-1.0,a_ptr+kk*1,sb+kk*n,c_ptr,ldc); + solve_LN(1,n,a_ptr+(kk-1)*1,sb+(kk-1)*n,c_ptr,ldc); + kk -= 1; + m_count--; + } + if(m_count&2){ + a_ptr-=k*2; c_ptr-=2; + if(k-kk>0) GEMM_KERNEL_N(2,n,k-kk,-1.0,a_ptr+kk*2,sb+kk*n,c_ptr,ldc); + solve_LN(2,n,a_ptr+(kk-2)*2,sb+(kk-2)*n,c_ptr,ldc); + kk -= 2; + m_count-=2; + } + if(m_count&4){ + a_ptr-=k*4; c_ptr-=4; + if(k-kk>0) GEMM_KERNEL_N(4,n,k-kk,-1.0,a_ptr+kk*4,sb+kk*n,c_ptr,ldc); + solve_LN(4,n,a_ptr+(kk-4)*4,sb+(kk-4)*n,c_ptr,ldc); + kk -= 4; + m_count-=4; + } + for(;m_count>7;m_count-=8){ + a_ptr-=k*8; c_ptr-=8; + if(k-kk>0) GEMM_KERNEL_N(8,n,k-kk,-1.0,a_ptr+kk*8,sb+kk*n,c_ptr,ldc); + solve_LN(8,n,a_ptr+(kk-8)*8,sb+(kk-8)*n,c_ptr,ldc); + kk -= 8; + } +} +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *sa, FLOAT *sb, FLOAT *C, BLASLONG ldc, BLASLONG offset){ + float *a_ptr = sa+m*k, *b_ptr = sb, *c_ptr = C, *c_tmp = C; + float one[8] = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0}; + float zero[8] = {0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0}; + uint64_t ldc_bytes = (uint64_t)ldc * sizeof(float), K = (uint64_t)k, M = (uint64_t)m, kmkkinp = (uint64_t)(k-m-offset), k_cnt = 0; + BLASLONG n_count = n; + for(;n_count>11;n_count-=12) COMPUTE(12) + for(;n_count>7;n_count-=8) COMPUTE(8) + for(;n_count>3;n_count-=4) COMPUTE(4) + for(;n_count>1;n_count-=2) { COMPUTE_EDGE_1_nchunk(m,2,sa,b_ptr,c_ptr,ldc,k,offset); b_ptr += 2*k; c_ptr += ldc*2;} + if(n_count>0) COMPUTE_EDGE_1_nchunk(m,1,sa,b_ptr,c_ptr,ldc,k,offset); + return 0; +} + diff --git a/kernel/x86_64/strsm_kernel_8x4_haswell_LT.c b/kernel/x86_64/strsm_kernel_8x4_haswell_LT.c new file mode 100644 index 000000000..3c7e9e83b --- /dev/null +++ b/kernel/x86_64/strsm_kernel_8x4_haswell_LT.c @@ -0,0 +1,228 @@ +#include "common.h" +#include +#include "strsm_kernel_8x4_haswell_L_common.h" + +#define SOLVE_LT_m1n4 \ + "movq %2,%3;" GEMM_SUM_REORDER_1x4(4)\ + SOLVE_m1n4(0,4) SAVE_b_m1n4(0,4)\ + "movq %2,%3; addq $4,%2;" save_c_m1n4(4) + +#define SOLVE_LT_m1n8 \ + "movq %2,%3;" GEMM_SUM_REORDER_1x4(4) GEMM_SUM_REORDER_1x4(5)\ + SOLVE_m1n8(0,4,5) SAVE_b_m1n8(0,4,5)\ + "movq %2,%3; addq $4,%2;" save_c_m1n4(4) save_c_m1n4(5) + +#define SOLVE_LT_m1n12 \ + "movq %2,%3;" GEMM_SUM_REORDER_1x4(4) GEMM_SUM_REORDER_1x4(5) GEMM_SUM_REORDER_1x4(6)\ + SOLVE_m1n12(0,4,5,6) SAVE_b_m1n12(0,4,5,6)\ + "movq %2,%3; addq $4,%2;" save_c_m1n4(4) save_c_m1n4(5) save_c_m1n4(6) + +#define SOLVE_LT_m2n4 \ + "movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5,4)\ + SOLVE_uplo_m2n4(0,4)\ + SOLVE_lo_m2n4(8,4) SAVE_b_m2n4(0,4)\ + "movq %2,%3; addq $8,%2;" save_c_m2n4(4) + +#define SOLVE_LT_m2n8 \ + "movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5,4) GEMM_SUM_REORDER_2x4(6,7,5)\ + SOLVE_uplo_m2n8(0,4,5)\ + SOLVE_lo_m2n8(8,4,5) SAVE_b_m2n8(0,4,5)\ + "movq %2,%3; addq $8,%2;" save_c_m2n4(4) save_c_m2n4(5) + +#define SOLVE_LT_m2n12 \ + "movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5,4) GEMM_SUM_REORDER_2x4(6,7,5) GEMM_SUM_REORDER_2x4(8,9,6)\ + SOLVE_uplo_m2n12(0,4,5,6)\ + SOLVE_lo_m2n12(8,4,5,6) SAVE_b_m2n12(0,4,5,6)\ + "movq %2,%3; addq $8,%2;" save_c_m2n4(4) save_c_m2n4(5) save_c_m2n4(6) + +#define SOLVE_LT_m4n4 \ + "movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5)\ +\ + SOLVE_uplo_m2n4(0,4) SUBTRACT_m2n4(8,5)\ + SOLVE_lo_m2n4(16,4) SUBTRACT_m2n4(24,5) SAVE_b_m2n4(0,4)\ +\ + SOLVE_uplo_m2n4(40,5)\ + SOLVE_lo_m2n4(56,5) SAVE_b_m2n4(32,5)\ +\ + "movq %2,%3; addq $16,%2;" save_c_m4n4(4,5) + +#define SOLVE_LT_m4n8 \ + "movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) GEMM_SUM_REORDER_4x4(8,9,10,11,6,7)\ +\ + SOLVE_uplo_m2n8(0,4,6) SUBTRACT_m2n8(8,5,7)\ + SOLVE_lo_m2n8(16,4,6) SUBTRACT_m2n8(24,5,7) SAVE_b_m2n8(0,4,6)\ +\ + SOLVE_uplo_m2n8(40,5,7)\ + SOLVE_lo_m2n8(56,5,7) SAVE_b_m2n8(32,5,7)\ +\ + "movq %2,%3; addq $16,%2;" save_c_m4n4(4,5) save_c_m4n4(6,7) + +#define SOLVE_LT_m4n12 \ + "movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) GEMM_SUM_REORDER_4x4(8,9,10,11,6,7) GEMM_SUM_REORDER_4x4(12,13,14,15,8,9)\ +\ + SOLVE_uplo_m2n12(0,4,6,8) SUBTRACT_m2n12(8,5,7,9)\ + SOLVE_lo_m2n12(16,4,6,8) SUBTRACT_m2n12(24,5,7,9) SAVE_b_m2n12(0,4,6,8)\ +\ + SOLVE_uplo_m2n12(40,5,7,9)\ + SOLVE_lo_m2n12(56,5,7,9) SAVE_b_m2n12(32,5,7,9)\ +\ + "movq %2,%3; addq $16,%2;" save_c_m4n4(4,5) save_c_m4n4(6,7) save_c_m4n4(8,9) + +#define SOLVE_LT_m8n4 \ + "movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63)\ +\ + SOLVE_uplo_m2n4(0,4) SUBTRACT_m2n4(8,5) SUBTRACT_m2n4(16,6) SUBTRACT_m2n4(24,7)\ + SOLVE_lo_m2n4(32,4) SUBTRACT_m2n4(40,5) SUBTRACT_m2n4(48,6) SUBTRACT_m2n4(56,7) SAVE_b_m2n4(0,4)\ +\ + SOLVE_uplo_m2n4(72,5) SUBTRACT_m2n4(80,6) SUBTRACT_m2n4(88,7)\ + SOLVE_lo_m2n4(104,5) SUBTRACT_m2n4(112,6) SUBTRACT_m2n4(120,7) SAVE_b_m2n4(32,5)\ +\ + SOLVE_uplo_m2n4(144,6) SUBTRACT_m2n4(152,7)\ + SOLVE_lo_m2n4(176,6) SUBTRACT_m2n4(184,7) SAVE_b_m2n4(64,6)\ +\ + SOLVE_uplo_m2n4(216,7)\ + SOLVE_lo_m2n4(248,7) SAVE_b_m2n4(96,7)\ +\ + "movq %2,%3; addq $32,%2;" save_c_m8n4(4,5,6,7) + +#define SOLVE_LT_m8n8 \ + "movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) GEMM_SUM_REORDER_8x4(8,9,10,11,63)\ +\ + SOLVE_uplo_m2n8(0,4,8) SUBTRACT_m2n8(8,5,9) SUBTRACT_m2n8(16,6,10) SUBTRACT_m2n8(24,7,11)\ + SOLVE_lo_m2n8(32,4,8) SUBTRACT_m2n8(40,5,9) SUBTRACT_m2n8(48,6,10) SUBTRACT_m2n8(56,7,11) SAVE_b_m2n8(0,4,8)\ +\ + SOLVE_uplo_m2n8(72,5,9) SUBTRACT_m2n8(80,6,10) SUBTRACT_m2n8(88,7,11)\ + SOLVE_lo_m2n8(104,5,9) SUBTRACT_m2n8(112,6,10) SUBTRACT_m2n8(120,7,11) SAVE_b_m2n8(32,5,9)\ +\ + SOLVE_uplo_m2n8(144,6,10) SUBTRACT_m2n8(152,7,11)\ + SOLVE_lo_m2n8(176,6,10) SUBTRACT_m2n8(184,7,11) SAVE_b_m2n8(64,6,10)\ +\ + SOLVE_uplo_m2n8(216,7,11)\ + SOLVE_lo_m2n8(248,7,11) SAVE_b_m2n8(96,7,11)\ +\ + "movq %2,%3; addq $32,%2;" save_c_m8n4(4,5,6,7) save_c_m8n4(8,9,10,11) + +#define SOLVE_LT_m8n12 \ + "movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) GEMM_SUM_REORDER_8x4(8,9,10,11,63) GEMM_SUM_REORDER_8x4(12,13,14,15,63)\ +\ + SOLVE_uplo_m2n12(0,4,8,12) SUBTRACT_m2n12(8,5,9,13) SUBTRACT_m2n12(16,6,10,14) SUBTRACT_m2n12(24,7,11,15)\ + SOLVE_lo_m2n12(32,4,8,12) SUBTRACT_m2n12(40,5,9,13) SUBTRACT_m2n12(48,6,10,14) SUBTRACT_m2n12(56,7,11,15) SAVE_b_m2n12(0,4,8,12)\ +\ + SOLVE_uplo_m2n12(72,5,9,13) SUBTRACT_m2n12(80,6,10,14) SUBTRACT_m2n12(88,7,11,15)\ + SOLVE_lo_m2n12(104,5,9,13) SUBTRACT_m2n12(112,6,10,14) SUBTRACT_m2n12(120,7,11,15) SAVE_b_m2n12(32,5,9,13)\ +\ + SOLVE_uplo_m2n12(144,6,10,14) SUBTRACT_m2n12(152,7,11,15)\ + SOLVE_lo_m2n12(176,6,10,14) SUBTRACT_m2n12(184,7,11,15) SAVE_b_m2n12(64,6,10,14)\ +\ + SOLVE_uplo_m2n12(216,7,11,15)\ + SOLVE_lo_m2n12(248,7,11,15) SAVE_b_m2n12(96,7,11,15)\ +\ + "movq %2,%3; addq $32,%2;" save_c_m8n4(4,5,6,7) save_c_m8n4(8,9,10,11) save_c_m8n4(12,13,14,15) + +#define GEMM_LT_SIMPLE(mdim,ndim) \ + "movq %%r15,%0; leaq (%%r15,%%r12,"#mdim"),%%r15; movq %%r13,%5; addq $"#mdim",%%r13; movq %%r14,%1;" INIT_m##mdim##n##ndim\ + "testq %5,%5; jz 1"#mdim""#ndim"2f;"\ + "1"#mdim""#ndim"1:\n\t"\ + GEMM_KERNEL_k1m##mdim##n##ndim "addq $16,%1; addq $"#mdim"*4,%0; decq %5; jnz 1"#mdim""#ndim"1b;"\ + "1"#mdim""#ndim"2:\n\t" +#define GEMM_LT_m8n4 GEMM_LT_SIMPLE(8,4) +#define GEMM_LT_m8n8 GEMM_LT_SIMPLE(8,8) +#define GEMM_LT_m8n12 \ + "movq %%r15,%0; leaq (%%r15,%%r12,8),%%r15; movq %%r13,%5; addq $8,%%r13; movq %%r14,%1;" INIT_m8n12\ + "cmpq $8,%5; jb 18122f;"\ + "18121:\n\t"\ + GEMM_KERNEL_k1m8n12 "prefetcht0 384(%0); addq $32,%0; addq $16,%1;"\ + GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1;"\ + GEMM_KERNEL_k1m8n12 "prefetcht0 384(%0); addq $32,%0; addq $16,%1;"\ + GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1;"\ + GEMM_KERNEL_k1m8n12 "prefetcht0 384(%0); addq $32,%0; addq $16,%1;"\ + GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1;"\ + GEMM_KERNEL_k1m8n12 "prefetcht0 384(%0); addq $32,%0; addq $16,%1;"\ + GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1;"\ + "subq $8,%5; cmpq $8,%5; jnb 18121b;"\ + "18122:\n\t"\ + "testq %5,%5; jz 18124f;"\ + "18123:\n\t"\ + GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1; decq %5; jnz 18123b;"\ + "18124:\n\t" +#define GEMM_LT_m4n4 GEMM_LT_SIMPLE(4,4) +#define GEMM_LT_m4n8 GEMM_LT_SIMPLE(4,8) +#define GEMM_LT_m4n12 GEMM_LT_SIMPLE(4,12) +#define GEMM_LT_m2n4 GEMM_LT_SIMPLE(2,4) +#define GEMM_LT_m2n8 GEMM_LT_SIMPLE(2,8) +#define GEMM_LT_m2n12 GEMM_LT_SIMPLE(2,12) +#define GEMM_LT_m1n4 GEMM_LT_SIMPLE(1,4) +#define GEMM_LT_m1n8 GEMM_LT_SIMPLE(1,8) +#define GEMM_LT_m1n12 GEMM_LT_SIMPLE(1,12) + +#define COMPUTE(ndim) {\ + __asm__ __volatile__(\ + "movq %0,%%r15; movq %1,%%r14; movq %7,%%r13; movq %6,%%r12; salq $2,%%r12; movq %10,%%r11;"\ + "cmpq $8,%%r11; jb "#ndim"772f;"\ + #ndim"771:\n\t"\ + GEMM_LT_m8n##ndim SOLVE_LT_m8n##ndim "subq $8,%%r11; cmpq $8,%%r11; jnb "#ndim"771b;"\ + #ndim"772:\n\t"\ + "testq $4,%%r11; jz "#ndim"773f;"\ + GEMM_LT_m4n##ndim SOLVE_LT_m4n##ndim "subq $4,%%r11;"\ + #ndim"773:\n\t"\ + "testq $2,%%r11; jz "#ndim"774f;"\ + GEMM_LT_m2n##ndim SOLVE_LT_m2n##ndim "subq $2,%%r11;"\ + #ndim"774:\n\t"\ + "testq $1,%%r11; jz "#ndim"775f;"\ + GEMM_LT_m1n##ndim SOLVE_LT_m1n##ndim "subq $1,%%r11;"\ + #ndim"775:\n\t"\ + "movq %%r15,%0; movq %%r14,%1; vzeroupper;"\ + :"+r"(a_ptr),"+r"(b_ptr),"+r"(c_ptr),"+r"(c_tmp),"+r"(ldc_bytes),"+r"(k_cnt):"m"(K),"m"(OFF),"m"(one[0]),"m"(zero[0]),"m"(M)\ + :"r11","r12","r13","r14","r15","cc","memory",\ + "xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15");\ + a_ptr -= M * K; b_ptr += ndim * K; c_ptr += ldc * ndim - M;\ +} +static void solve_LT(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + FLOAT a0, b0; + int i, j, k; + for (i=0;i7;m_count-=8){ + if(kk>0) GEMM_KERNEL_N(8,n,kk,-1.0,a_ptr,sb,c_ptr,ldc); + solve_LT(8,n,a_ptr+kk*8,sb+kk*n,c_ptr,ldc); + kk += 8; a_ptr += k * 8; c_ptr += 8; + } + for(;m_count>3;m_count-=4){ + if(kk>0) GEMM_KERNEL_N(4,n,kk,-1.0,a_ptr,sb,c_ptr,ldc); + solve_LT(4,n,a_ptr+kk*4,sb+kk*n,c_ptr,ldc); + kk += 4; a_ptr += k * 4; c_ptr += 4; + } + for(;m_count>1;m_count-=2){ + if(kk>0) GEMM_KERNEL_N(2,n,kk,-1.0,a_ptr,sb,c_ptr,ldc); + solve_LT(2,n,a_ptr+kk*2,sb+kk*n,c_ptr,ldc); + kk += 2; a_ptr += k * 2; c_ptr += 2; + } + if(m_count>0){ + if(kk>0) GEMM_KERNEL_N(1,n,kk,-1.0,a_ptr,sb,c_ptr,ldc); + solve_LT(1,n,a_ptr+kk*1,sb+kk*n,c_ptr,ldc); + kk += 1; a_ptr += k * 1; c_ptr += 1; + } +} +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *sa, FLOAT *sb, FLOAT *C, BLASLONG ldc, BLASLONG offset){ + float *a_ptr = sa, *b_ptr = sb, *c_ptr = C, *c_tmp = C; + float one[8] = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0}; + float zero[8] = {0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0}; + uint64_t ldc_bytes = (uint64_t)ldc * sizeof(float), K = (uint64_t)k, M = (uint64_t)m, OFF = (uint64_t)offset, k_cnt = 0; + BLASLONG n_count = n; + for(;n_count>11;n_count-=12) COMPUTE(12) + for(;n_count>7;n_count-=8) COMPUTE(8) + for(;n_count>3;n_count-=4) COMPUTE(4) + for(;n_count>1;n_count-=2) { COMPUTE_EDGE_1_nchunk(m,2,a_ptr,b_ptr,c_ptr,ldc,k,offset); b_ptr += 2*k; c_ptr += ldc*2;} + if(n_count>0) COMPUTE_EDGE_1_nchunk(m,1,a_ptr,b_ptr,c_ptr,ldc,k,offset); + return 0; +} + diff --git a/kernel/x86_64/strsm_kernel_8x4_haswell_L_common.h b/kernel/x86_64/strsm_kernel_8x4_haswell_L_common.h new file mode 100644 index 000000000..cfa56da97 --- /dev/null +++ b/kernel/x86_64/strsm_kernel_8x4_haswell_L_common.h @@ -0,0 +1,187 @@ +/* r11 = m_counter, r12 = size_of_k_elements, r13 = kk, r14 = b_head, r15 = a_head */ +/* register i/o: %0 = a_ptr, %1 = b_ptr, %2 = c_ptr, %3 = c_tmp, %4 = ldc, %5 = k_counter */ +/* memory input: %6 = K, %7 = offset, %8 = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0}, %9 = {0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0}, %10 = M */ + +#define init_m8n4(c1,c2,c3,c4)\ + "vpxor %%ymm"#c1",%%ymm"#c1",%%ymm"#c1"; vpxor %%ymm"#c2",%%ymm"#c2",%%ymm"#c2"; vpxor %%ymm"#c3",%%ymm"#c3",%%ymm"#c3"; vpxor %%ymm"#c4",%%ymm"#c4",%%ymm"#c4";" +#define INIT_m8n4 init_m8n4(4,5,6,7) +#define INIT_m8n8 INIT_m8n4 init_m8n4(8,9,10,11) +#define INIT_m8n12 INIT_m8n8 init_m8n4(12,13,14,15) + +#define init_m4n4(c1,c2,c3,c4)\ + "vpxor %%xmm"#c1",%%xmm"#c1",%%xmm"#c1"; vpxor %%xmm"#c2",%%xmm"#c2",%%xmm"#c2"; vpxor %%xmm"#c3",%%xmm"#c3",%%xmm"#c3"; vpxor %%xmm"#c4",%%xmm"#c4",%%xmm"#c4";" +#define INIT_m4n4 init_m4n4(4,5,6,7) +#define INIT_m4n8 INIT_m4n4 init_m4n4(8,9,10,11) +#define INIT_m4n12 INIT_m4n8 init_m4n4(12,13,14,15) + +#define init_m2n4(c1,c2)\ + "vpxor %%xmm"#c1",%%xmm"#c1",%%xmm"#c1"; vpxor %%xmm"#c2",%%xmm"#c2",%%xmm"#c2";" +#define INIT_m2n4 init_m2n4(4,5) +#define INIT_m2n8 INIT_m2n4 init_m2n4(6,7) +#define INIT_m2n12 INIT_m2n8 init_m2n4(8,9) + +#define init_m1n4(c1) "vpxor %%xmm"#c1",%%xmm"#c1",%%xmm"#c1";" +#define INIT_m1n4 init_m1n4(4) +#define INIT_m1n8 INIT_m1n4 init_m1n4(5) +#define INIT_m1n12 INIT_m1n8 init_m1n4(6) + +#define GEMM_KERNEL_k1m8n4 \ + "vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2;"\ + "vbroadcastsd (%1),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm4; vfnmadd231ps %%ymm3,%%ymm2,%%ymm5;"\ + "vbroadcastsd 8(%1),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm6; vfnmadd231ps %%ymm3,%%ymm2,%%ymm7;" +#define GEMM_KERNEL_k1m8n8 GEMM_KERNEL_k1m8n4\ + "vbroadcastsd (%1,%%r12,4),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm8; vfnmadd231ps %%ymm3,%%ymm2,%%ymm9;"\ + "vbroadcastsd 8(%1,%%r12,4),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm10; vfnmadd231ps %%ymm3,%%ymm2,%%ymm11;" +#define GEMM_KERNEL_k1m8n12 GEMM_KERNEL_k1m8n8\ + "vbroadcastsd (%1,%%r12,8),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm12; vfnmadd231ps %%ymm3,%%ymm2,%%ymm13;"\ + "vbroadcastsd 8(%1,%%r12,8),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm14; vfnmadd231ps %%ymm3,%%ymm2,%%ymm15;" + +#define GEMM_KERNEL_k1m4n4 \ + "vmovsldup (%0),%%xmm1; vmovshdup (%0),%%xmm2;"\ + "vmovddup (%1),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm4; vfnmadd231ps %%xmm3,%%xmm2,%%xmm5;"\ + "vmovddup 8(%1),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm6; vfnmadd231ps %%xmm3,%%xmm2,%%xmm7;" +#define GEMM_KERNEL_k1m4n8 GEMM_KERNEL_k1m4n4\ + "vmovddup (%1,%%r12,4),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm8; vfnmadd231ps %%xmm3,%%xmm2,%%xmm9;"\ + "vmovddup 8(%1,%%r12,4),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm10; vfnmadd231ps %%xmm3,%%xmm2,%%xmm11;" +#define GEMM_KERNEL_k1m4n12 GEMM_KERNEL_k1m4n8\ + "vmovddup (%1,%%r12,8),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm12; vfnmadd231ps %%xmm3,%%xmm2,%%xmm13;"\ + "vmovddup 8(%1,%%r12,8),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm14; vfnmadd231ps %%xmm3,%%xmm2,%%xmm15;" + +#define GEMM_KERNEL_k1m2n4 \ + "vbroadcastss (%0),%%xmm1; vbroadcastss 4(%0),%%xmm2;"\ + "vmovups (%1),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm4; vfnmadd231ps %%xmm3,%%xmm2,%%xmm5;" +#define GEMM_KERNEL_k1m2n8 GEMM_KERNEL_k1m2n4\ + "vmovups (%1,%%r12,4),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm6; vfnmadd231ps %%xmm3,%%xmm2,%%xmm7;" +#define GEMM_KERNEL_k1m2n12 GEMM_KERNEL_k1m2n8\ + "vmovups (%1,%%r12,8),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm8; vfnmadd231ps %%xmm3,%%xmm2,%%xmm9;" + +#define GEMM_KERNEL_k1m1n4 "vbroadcastss (%0),%%xmm1; vfnmadd231ps (%1),%%xmm1,%%xmm4;" +#define GEMM_KERNEL_k1m1n8 GEMM_KERNEL_k1m1n4 "vfnmadd231ps (%1,%%r12,4),%%xmm1,%%xmm5;" +#define GEMM_KERNEL_k1m1n12 GEMM_KERNEL_k1m1n8 "vfnmadd231ps (%1,%%r12,8),%%xmm1,%%xmm6;" + +#define GEMM_SUM_REORDER_8x4(c1,c2,c3,c4,prefpos)\ + "vunpcklps %%ymm"#c2",%%ymm"#c1",%%ymm0; vunpckhps %%ymm"#c2",%%ymm"#c1",%%ymm1;"\ + "vunpcklps %%ymm"#c4",%%ymm"#c3",%%ymm2; vunpckhps %%ymm"#c4",%%ymm"#c3",%%ymm3;"\ + "vmovups (%3),%%ymm"#c1"; vmovups (%3,%4,1),%%ymm"#c2"; prefetcht1 "#prefpos"(%3); prefetcht1 "#prefpos"(%3,%4,1);"\ + "vunpcklpd %%ymm"#c2",%%ymm"#c1",%%ymm"#c3"; vunpckhpd %%ymm"#c2",%%ymm"#c1",%%ymm"#c4";"\ + "vaddps %%ymm0,%%ymm"#c3",%%ymm0; vaddps %%ymm1,%%ymm"#c4",%%ymm1;"\ + "leaq (%3,%4,2),%3;"\ + "vmovups (%3),%%ymm"#c1"; vmovups (%3,%4,1),%%ymm"#c2"; prefetcht1 "#prefpos"(%3); prefetcht1 "#prefpos"(%3,%4,1);"\ + "vunpcklpd %%ymm"#c2",%%ymm"#c1",%%ymm"#c3"; vunpckhpd %%ymm"#c2",%%ymm"#c1",%%ymm"#c4";"\ + "vaddps %%ymm2,%%ymm"#c3",%%ymm2; vaddps %%ymm3,%%ymm"#c4",%%ymm3;"\ + "leaq (%3,%4,2),%3;"\ + "vperm2f128 $2,%%ymm0,%%ymm2,%%ymm"#c1"; vperm2f128 $2,%%ymm1,%%ymm3,%%ymm"#c2";"\ + "vperm2f128 $19,%%ymm0,%%ymm2,%%ymm"#c3"; vperm2f128 $19,%%ymm1,%%ymm3,%%ymm"#c4";" + +#define GEMM_SUM_REORDER_4x4(c1,c2,c3,c4,co1,co2)\ + "vunpcklps %%xmm"#c2",%%xmm"#c1",%%xmm0; vunpckhps %%xmm"#c2",%%xmm"#c1",%%xmm1;"\ + "vunpcklps %%xmm"#c4",%%xmm"#c3",%%xmm2; vunpckhps %%xmm"#c4",%%xmm"#c3",%%xmm3;"\ + "vmovups (%3),%%xmm"#c1"; vmovups (%3,%4,1),%%xmm"#c2";"\ + "vunpcklpd %%xmm"#c2",%%xmm"#c1",%%xmm"#c3"; vunpckhpd %%xmm"#c2",%%xmm"#c1",%%xmm"#c4";"\ + "vaddps %%xmm0,%%xmm"#c3",%%xmm0; vaddps %%xmm1,%%xmm"#c4",%%xmm1;"\ + "leaq (%3,%4,2),%3;"\ + "vmovups (%3),%%xmm"#c1"; vmovups (%3,%4,1),%%xmm"#c2";"\ + "vunpcklpd %%xmm"#c2",%%xmm"#c1",%%xmm"#c3"; vunpckhpd %%xmm"#c2",%%xmm"#c1",%%xmm"#c4";"\ + "vaddps %%xmm2,%%xmm"#c3",%%xmm2; vaddps %%xmm3,%%xmm"#c4",%%xmm3;"\ + "leaq (%3,%4,2),%3;"\ + "vperm2f128 $2,%%ymm0,%%ymm2,%%ymm"#co1"; vperm2f128 $2,%%ymm1,%%ymm3,%%ymm"#co2";" + +#define GEMM_SUM_REORDER_2x4(c1,c2,co1)\ + "vunpcklps %%xmm"#c2",%%xmm"#c1",%%xmm0; vunpckhps %%xmm"#c2",%%xmm"#c1",%%xmm1;"\ + "vmovsd (%3),%%xmm2; vmovhpd (%3,%4,1),%%xmm2,%%xmm2; vaddps %%xmm0,%%xmm2,%%xmm0; leaq (%3,%4,2),%3;"\ + "vmovsd (%3),%%xmm2; vmovhpd (%3,%4,1),%%xmm2,%%xmm2; vaddps %%xmm1,%%xmm2,%%xmm1; leaq (%3,%4,2),%3;"\ + "vperm2f128 $2,%%ymm0,%%ymm1,%%ymm"#co1";" + +#define GEMM_SUM_REORDER_1x4(c1)\ + "vmovss (%3),%%xmm1; vinsertps $16,(%3,%4,1),%%xmm1,%%xmm1; leaq (%3,%4,2),%3;"\ + "vinsertps $32,(%3),%%xmm1,%%xmm1; vinsertps $48,(%3,%4,1),%%xmm1,%%xmm1; leaq (%3,%4,2),%3;"\ + "vaddps %%xmm"#c1",%%xmm1,%%xmm"#c1";" + +#define save_c_m8n4(c1,c2,c3,c4)\ + "vunpcklpd %%ymm"#c2",%%ymm"#c1",%%ymm0; vunpckhpd %%ymm"#c2",%%ymm"#c1",%%ymm1;"\ + "vunpcklpd %%ymm"#c4",%%ymm"#c3",%%ymm2; vunpckhpd %%ymm"#c4",%%ymm"#c3",%%ymm3;"\ + "vperm2f128 $2,%%ymm0,%%ymm2,%%ymm"#c1"; vperm2f128 $2,%%ymm1,%%ymm3,%%ymm"#c2";"\ + "vmovups %%ymm"#c1",(%3); vmovups %%ymm"#c2",(%3,%4,1); leaq (%3,%4,2),%3;"\ + "vperm2f128 $19,%%ymm0,%%ymm2,%%ymm"#c3"; vperm2f128 $19,%%ymm1,%%ymm3,%%ymm"#c4";"\ + "vmovups %%ymm"#c3",(%3); vmovups %%ymm"#c4",(%3,%4,1); leaq (%3,%4,2),%3;" + +#define save_c_m4n4(c1,c2)\ + "vunpcklpd %%ymm"#c2",%%ymm"#c1",%%ymm0; vunpckhpd %%ymm"#c2",%%ymm"#c1",%%ymm1;"\ + "vmovups %%xmm0,(%3); vmovups %%xmm1,(%3,%4,1); leaq (%3,%4,2),%3;"\ + "vextractf128 $1,%%ymm0,(%3); vextractf128 $1,%%ymm1,(%3,%4,1); leaq (%3,%4,2),%3;" + +#define save_c_m2n4(c1)\ + "vextractf128 $1,%%ymm"#c1",%%xmm1; vmovsd %%xmm"#c1",(%3); vmovhpd %%xmm"#c1",(%3,%4,1); leaq (%3,%4,2),%3;"\ + "vmovsd %%xmm1,(%3); vmovhpd %%xmm1,(%3,%4,1); leaq (%3,%4,2),%3;" + +#define save_c_m1n4(c1)\ + "vmovss %%xmm"#c1",(%3); vextractps $1,%%xmm"#c1",(%3,%4,1); leaq (%3,%4,2),%3;"\ + "vextractps $2,%%xmm"#c1",(%3); vextractps $3,%%xmm"#c1",(%3,%4,1); leaq (%3,%4,2),%3;" + +#define SOLVE_up_m2n4(a_off,c1)\ + "vbroadcastsd "#a_off"(%0),%%ymm0; vblendps $170,%8,%%ymm0,%%ymm2;"\ + "vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1";"\ + "vmovsldup %%ymm"#c1",%%ymm1;" + +#define SOLVE_up_m2n8(a_off,c1,c2)\ + "vbroadcastsd "#a_off"(%0),%%ymm0; vblendps $170,%8,%%ymm0,%%ymm2;"\ + "vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1"; vmulps %%ymm2,%%ymm"#c2",%%ymm"#c2";"\ + "vmovsldup %%ymm"#c1",%%ymm1; vmovsldup %%ymm"#c2",%%ymm2;" + +#define SOLVE_up_m2n12(a_off,c1,c2,c3)\ + "vbroadcastsd "#a_off"(%0),%%ymm0; vblendps $170,%8,%%ymm0,%%ymm2;"\ + "vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1"; vmulps %%ymm2,%%ymm"#c2",%%ymm"#c2"; vmulps %%ymm2,%%ymm"#c3",%%ymm"#c3";"\ + "vmovsldup %%ymm"#c1",%%ymm1; vmovsldup %%ymm"#c2",%%ymm2; vmovsldup %%ymm"#c3",%%ymm3;" + +#define SOLVE_uplo_m2n4(a_off,c1) SOLVE_up_m2n4(a_off,c1)\ + "vblendps $85,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1";" + +#define SOLVE_uplo_m2n8(a_off,c1,c2) SOLVE_up_m2n8(a_off,c1,c2)\ + "vblendps $85,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1"; vfnmadd231ps %%ymm0,%%ymm2,%%ymm"#c2";" + +#define SOLVE_uplo_m2n12(a_off,c1,c2,c3) SOLVE_up_m2n12(a_off,c1,c2,c3)\ + "vblendps $85,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1"; vfnmadd231ps %%ymm0,%%ymm2,%%ymm"#c2"; vfnmadd231ps %%ymm0,%%ymm3,%%ymm"#c3";" + +#define SOLVE_lo_m2n4(a_off,c1)\ + "vbroadcastsd "#a_off"(%0),%%ymm0; vblendps $85,%8,%%ymm0,%%ymm2;"\ + "vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1";"\ + "vmovshdup %%ymm"#c1",%%ymm1;" + +#define SOLVE_lo_m2n8(a_off,c1,c2)\ + "vbroadcastsd "#a_off"(%0),%%ymm0; vblendps $85,%8,%%ymm0,%%ymm2;"\ + "vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1"; vmulps %%ymm2,%%ymm"#c2",%%ymm"#c2";"\ + "vmovshdup %%ymm"#c1",%%ymm1; vmovshdup %%ymm"#c2",%%ymm2;" + +#define SOLVE_lo_m2n12(a_off,c1,c2,c3)\ + "vbroadcastsd "#a_off"(%0),%%ymm0; vblendps $85,%8,%%ymm0,%%ymm2;"\ + "vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1"; vmulps %%ymm2,%%ymm"#c2",%%ymm"#c2"; vmulps %%ymm2,%%ymm"#c3",%%ymm"#c3";"\ + "vmovshdup %%ymm"#c1",%%ymm1; vmovshdup %%ymm"#c2",%%ymm2; vmovshdup %%ymm"#c3",%%ymm3;" + +#define SOLVE_loup_m2n4(a_off,c1) SOLVE_lo_m2n4(a_off,c1)\ + "vblendps $170,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1";" + +#define SOLVE_loup_m2n8(a_off,c1,c2) SOLVE_lo_m2n8(a_off,c1,c2)\ + "vblendps $170,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1"; vfnmadd231ps %%ymm0,%%ymm2,%%ymm"#c2";" + +#define SOLVE_loup_m2n12(a_off,c1,c2,c3) SOLVE_lo_m2n12(a_off,c1,c2,c3)\ + "vblendps $170,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1"; vfnmadd231ps %%ymm0,%%ymm2,%%ymm"#c2"; vfnmadd231ps %%ymm0,%%ymm3,%%ymm"#c3";" + +#define SOLVE_m1n4(a_off,c1) "vbroadcastss "#a_off"(%0),%%xmm0; vmulps %%xmm0,%%xmm"#c1",%%xmm"#c1";" +#define SOLVE_m1n8(a_off,c1,c2) SOLVE_m1n4(a_off,c1) "vmulps %%xmm0,%%xmm"#c2",%%xmm"#c2";" +#define SOLVE_m1n12(a_off,c1,c2,c3) SOLVE_m1n8(a_off,c1,c2) "vmulps %%xmm0,%%xmm"#c3",%%xmm"#c3";" + +#define SUBTRACT_m2n4(a_off,c1) "vbroadcastsd "#a_off"(%0),%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1";" +#define SUBTRACT_m2n8(a_off,c1,c2) SUBTRACT_m2n4(a_off,c1) "vfnmadd231ps %%ymm0,%%ymm2,%%ymm"#c2";" +#define SUBTRACT_m2n12(a_off,c1,c2,c3) SUBTRACT_m2n8(a_off,c1,c2) "vfnmadd231ps %%ymm0,%%ymm3,%%ymm"#c3";" + +#define save_b_m2n4(c1,tmp,b_off,...)\ + "vpermilps $216,%%ymm"#c1",%%ymm"#tmp"; vpermpd $216,%%ymm"#tmp",%%ymm"#tmp"; vmovups %%ymm"#tmp","#b_off"("#__VA_ARGS__");" + +#define SAVE_b_m2n4(b_off,c1) save_b_m2n4(c1,1,b_off,%1) +#define SAVE_b_m2n8(b_off,c1,c2) SAVE_b_m2n4(b_off,c1) save_b_m2n4(c2,2,b_off,%1,%%r12,4) +#define SAVE_b_m2n12(b_off,c1,c2,c3) SAVE_b_m2n8(b_off,c1,c2) save_b_m2n4(c3,3,b_off,%1,%%r12,8) + +#define SAVE_b_m1n4(b_off,c1) "vmovups %%xmm"#c1","#b_off"(%1);" +#define SAVE_b_m1n8(b_off,c1,c2) SAVE_b_m1n4(b_off,c1) "vmovups %%xmm"#c2","#b_off"(%1,%%r12,4);" +#define SAVE_b_m1n12(b_off,c1,c2,c3) SAVE_b_m1n8(b_off,c1,c2) "vmovups %%xmm"#c3","#b_off"(%1,%%r12,8);" + diff --git a/kernel/x86_64/strsm_kernel_8x4_haswell_RN.c b/kernel/x86_64/strsm_kernel_8x4_haswell_RN.c new file mode 100644 index 000000000..4e2cd4fe6 --- /dev/null +++ b/kernel/x86_64/strsm_kernel_8x4_haswell_RN.c @@ -0,0 +1,279 @@ +#include "common.h" +#include +#include "strsm_kernel_8x4_haswell_R_common.h" + +#define SOLVE_RN_m8n4 \ + "movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) "movq %2,%3; addq $32,%2;"\ + SOLVE_leri_m8n2(0,4,5,%1) SUBTRACT_m8n2(8,6,7,%1)\ + SOLVE_ri_m8n2(16,4,5,%1) SUBTRACT_m8n2(24,6,7,%1)\ + SAVE_SOLUTION_m8n2(4,5,0)\ + SOLVE_leri_m8n2(40,6,7,%1)\ + SOLVE_ri_m8n2(56,6,7,%1)\ + SAVE_SOLUTION_m8n2(6,7,64) + +#define SOLVE_RN_m8n8 \ + "movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) GEMM_SUM_REORDER_8x4(8,9,10,11,63) "movq %2,%3; addq $32,%2;"\ + SOLVE_leri_m8n2(0,4,5,%1) SUBTRACT_m8n2(8,6,7,%1) SUBTRACT_m8n2(0,8,9,%1,%%r12,4) SUBTRACT_m8n2(8,10,11,%1,%%r12,4)\ + SOLVE_ri_m8n2(16,4,5,%1) SUBTRACT_m8n2(24,6,7,%1) SUBTRACT_m8n2(16,8,9,%1,%%r12,4) SUBTRACT_m8n2(24,10,11,%1,%%r12,4)\ + SAVE_SOLUTION_m8n2(4,5,0)\ + SOLVE_leri_m8n2(40,6,7,%1) SUBTRACT_m8n2(32,8,9,%1,%%r12,4) SUBTRACT_m8n2(40,10,11,%1,%%r12,4)\ + SOLVE_ri_m8n2(56,6,7,%1) SUBTRACT_m8n2(48,8,9,%1,%%r12,4) SUBTRACT_m8n2(56,10,11,%1,%%r12,4)\ + SAVE_SOLUTION_m8n2(6,7,64)\ + SOLVE_leri_m8n2(64,8,9,%1,%%r12,4) SUBTRACT_m8n2(72,10,11,%1,%%r12,4)\ + SOLVE_ri_m8n2(80,8,9,%1,%%r12,4) SUBTRACT_m8n2(88,10,11,%1,%%r12,4)\ + SAVE_SOLUTION_m8n2(8,9,128)\ + SOLVE_leri_m8n2(104,10,11,%1,%%r12,4)\ + SOLVE_ri_m8n2(120,10,11,%1,%%r12,4)\ + SAVE_SOLUTION_m8n2(10,11,192) + +#define SOLVE_RN_m8n12 \ + "movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) GEMM_SUM_REORDER_8x4(8,9,10,11,63) GEMM_SUM_REORDER_8x4(12,13,14,15,63) "movq %2,%3; addq $32,%2;"\ + SOLVE_leri_m8n2(0,4,5,%1) SUBTRACT_m8n2(8,6,7,%1) SUBTRACT_m8n2(0,8,9,%1,%%r12,4) SUBTRACT_m8n2(8,10,11,%1,%%r12,4) SUBTRACT_m8n2(0,12,13,%1,%%r12,8) SUBTRACT_m8n2(8,14,15,%1,%%r12,8)\ + SOLVE_ri_m8n2(16,4,5,%1) SUBTRACT_m8n2(24,6,7,%1) SUBTRACT_m8n2(16,8,9,%1,%%r12,4) SUBTRACT_m8n2(24,10,11,%1,%%r12,4) SUBTRACT_m8n2(16,12,13,%1,%%r12,8) SUBTRACT_m8n2(24,14,15,%1,%%r12,8)\ + SAVE_SOLUTION_m8n2(4,5,0)\ + SOLVE_leri_m8n2(40,6,7,%1) SUBTRACT_m8n2(32,8,9,%1,%%r12,4) SUBTRACT_m8n2(40,10,11,%1,%%r12,4) SUBTRACT_m8n2(32,12,13,%1,%%r12,8) SUBTRACT_m8n2(40,14,15,%1,%%r12,8)\ + SOLVE_ri_m8n2(56,6,7,%1) SUBTRACT_m8n2(48,8,9,%1,%%r12,4) SUBTRACT_m8n2(56,10,11,%1,%%r12,4) SUBTRACT_m8n2(48,12,13,%1,%%r12,8) SUBTRACT_m8n2(56,14,15,%1,%%r12,8)\ + SAVE_SOLUTION_m8n2(6,7,64)\ + SOLVE_leri_m8n2(64,8,9,%1,%%r12,4) SUBTRACT_m8n2(72,10,11,%1,%%r12,4) SUBTRACT_m8n2(64,12,13,%1,%%r12,8) SUBTRACT_m8n2(72,14,15,%1,%%r12,8)\ + SOLVE_ri_m8n2(80,8,9,%1,%%r12,4) SUBTRACT_m8n2(88,10,11,%1,%%r12,4) SUBTRACT_m8n2(80,12,13,%1,%%r12,8) SUBTRACT_m8n2(88,14,15,%1,%%r12,8)\ + SAVE_SOLUTION_m8n2(8,9,128)\ + SOLVE_leri_m8n2(104,10,11,%1,%%r12,4) SUBTRACT_m8n2(96,12,13,%1,%%r12,8) SUBTRACT_m8n2(104,14,15,%1,%%r12,8)\ + SOLVE_ri_m8n2(120,10,11,%1,%%r12,4) SUBTRACT_m8n2(112,12,13,%1,%%r12,8) SUBTRACT_m8n2(120,14,15,%1,%%r12,8)\ + SAVE_SOLUTION_m8n2(10,11,192)\ + SOLVE_leri_m8n2(128,12,13,%1,%%r12,8) SUBTRACT_m8n2(136,14,15,%1,%%r12,8)\ + SOLVE_ri_m8n2(144,12,13,%1,%%r12,8) SUBTRACT_m8n2(152,14,15,%1,%%r12,8)\ + SAVE_SOLUTION_m8n2(12,13,256)\ + SOLVE_leri_m8n2(168,14,15,%1,%%r12,8)\ + SOLVE_ri_m8n2(184,14,15,%1,%%r12,8)\ + SAVE_SOLUTION_m8n2(14,15,320) + +#define SOLVE_RN_m4n4 \ + "movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) "movq %2,%3; addq $16,%2;"\ + SOLVE_leri_m4n2(0,4,%1) SUBTRACT_m4n2(8,5,%1)\ + SOLVE_ri_m4n2(16,4,%1) SUBTRACT_m4n2(24,5,%1)\ + SAVE_SOLUTION_m4n2(4,0)\ + SOLVE_leri_m4n2(40,5,%1)\ + SOLVE_ri_m4n2(56,5,%1)\ + SAVE_SOLUTION_m4n2(5,32) + +#define SOLVE_RN_m4n8 \ + "movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) GEMM_SUM_REORDER_4x4(8,9,10,11,6,7) "movq %2,%3; addq $16,%2;"\ + SOLVE_leri_m4n2(0,4,%1) SUBTRACT_m4n2(8,5,%1) SUBTRACT_m4n2(0,6,%1,%%r12,4) SUBTRACT_m4n2(8,7,%1,%%r12,4)\ + SOLVE_ri_m4n2(16,4,%1) SUBTRACT_m4n2(24,5,%1) SUBTRACT_m4n2(16,6,%1,%%r12,4) SUBTRACT_m4n2(24,7,%1,%%r12,4)\ + SAVE_SOLUTION_m4n2(4,0)\ + SOLVE_leri_m4n2(40,5,%1) SUBTRACT_m4n2(32,6,%1,%%r12,4) SUBTRACT_m4n2(40,7,%1,%%r12,4)\ + SOLVE_ri_m4n2(56,5,%1) SUBTRACT_m4n2(48,6,%1,%%r12,4) SUBTRACT_m4n2(56,7,%1,%%r12,4)\ + SAVE_SOLUTION_m4n2(5,32)\ + SOLVE_leri_m4n2(64,6,%1,%%r12,4) SUBTRACT_m4n2(72,7,%1,%%r12,4)\ + SOLVE_ri_m4n2(80,6,%1,%%r12,4) SUBTRACT_m4n2(88,7,%1,%%r12,4)\ + SAVE_SOLUTION_m4n2(6,64)\ + SOLVE_leri_m4n2(104,7,%1,%%r12,4)\ + SOLVE_ri_m4n2(120,7,%1,%%r12,4)\ + SAVE_SOLUTION_m4n2(7,96) + +#define SOLVE_RN_m4n12 \ + "movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) GEMM_SUM_REORDER_4x4(8,9,10,11,6,7) GEMM_SUM_REORDER_4x4(12,13,14,15,8,9) "movq %2,%3; addq $16,%2;"\ + SOLVE_leri_m4n2(0,4,%1) SUBTRACT_m4n2(8,5,%1) SUBTRACT_m4n2(0,6,%1,%%r12,4) SUBTRACT_m4n2(8,7,%1,%%r12,4) SUBTRACT_m4n2(0,8,%1,%%r12,8) SUBTRACT_m4n2(8,9,%1,%%r12,8)\ + SOLVE_ri_m4n2(16,4,%1) SUBTRACT_m4n2(24,5,%1) SUBTRACT_m4n2(16,6,%1,%%r12,4) SUBTRACT_m4n2(24,7,%1,%%r12,4) SUBTRACT_m4n2(16,8,%1,%%r12,8) SUBTRACT_m4n2(24,9,%1,%%r12,8)\ + SAVE_SOLUTION_m4n2(4,0)\ + SOLVE_leri_m4n2(40,5,%1) SUBTRACT_m4n2(32,6,%1,%%r12,4) SUBTRACT_m4n2(40,7,%1,%%r12,4) SUBTRACT_m4n2(32,8,%1,%%r12,8) SUBTRACT_m4n2(40,9,%1,%%r12,8)\ + SOLVE_ri_m4n2(56,5,%1) SUBTRACT_m4n2(48,6,%1,%%r12,4) SUBTRACT_m4n2(56,7,%1,%%r12,4) SUBTRACT_m4n2(48,8,%1,%%r12,8) SUBTRACT_m4n2(56,9,%1,%%r12,8)\ + SAVE_SOLUTION_m4n2(5,32)\ + SOLVE_leri_m4n2(64,6,%1,%%r12,4) SUBTRACT_m4n2(72,7,%1,%%r12,4) SUBTRACT_m4n2(64,8,%1,%%r12,8) SUBTRACT_m4n2(72,9,%1,%%r12,8)\ + SOLVE_ri_m4n2(80,6,%1,%%r12,4) SUBTRACT_m4n2(88,7,%1,%%r12,4) SUBTRACT_m4n2(80,8,%1,%%r12,8) SUBTRACT_m4n2(88,9,%1,%%r12,8)\ + SAVE_SOLUTION_m4n2(6,64)\ + SOLVE_leri_m4n2(104,7,%1,%%r12,4) SUBTRACT_m4n2(96,8,%1,%%r12,8) SUBTRACT_m4n2(104,9,%1,%%r12,8)\ + SOLVE_ri_m4n2(120,7,%1,%%r12,4) SUBTRACT_m4n2(112,8,%1,%%r12,8) SUBTRACT_m4n2(120,9,%1,%%r12,8)\ + SAVE_SOLUTION_m4n2(7,96)\ + SOLVE_leri_m4n2(128,8,%1,%%r12,8) SUBTRACT_m4n2(136,9,%1,%%r12,8)\ + SOLVE_ri_m4n2(144,8,%1,%%r12,8) SUBTRACT_m4n2(152,9,%1,%%r12,8)\ + SAVE_SOLUTION_m4n2(8,128)\ + SOLVE_leri_m4n2(168,9,%1,%%r12,8)\ + SOLVE_ri_m4n2(184,9,%1,%%r12,8)\ + SAVE_SOLUTION_m4n2(9,160) + +#define SOLVE_RN_m2n4 \ + "movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) "movq %2,%3; addq $8,%2;"\ + SOLVE_col1_ltor_m2n4(0,4,5,%1)\ + SOLVE_col2_ltor_m2n4(16,4,5,%1)\ + SOLVE_col3_ltor_m2n4(32,4,5,%1)\ + SOLVE_col4_ltor_m2n4(48,4,5,%1)\ + SAVE_SOLUTION_m2n4(4,5,0) + +#define SOLVE_RN_m2n8 \ + "movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) GEMM_SUM_REORDER_2x4(6,7) "movq %2,%3; addq $8,%2;"\ + SOLVE_col1_ltor_m2n4(0,4,5,%1) SUBTRACT_m2n4(0,6,7,%1,%%r12,4)\ + SOLVE_col2_ltor_m2n4(16,4,5,%1) SUBTRACT_m2n4(16,6,7,%1,%%r12,4)\ + SOLVE_col3_ltor_m2n4(32,4,5,%1) SUBTRACT_m2n4(32,6,7,%1,%%r12,4)\ + SOLVE_col4_ltor_m2n4(48,4,5,%1) SUBTRACT_m2n4(48,6,7,%1,%%r12,4)\ + SAVE_SOLUTION_m2n4(4,5,0)\ + SOLVE_col1_ltor_m2n4(64,6,7,%1,%%r12,4)\ + SOLVE_col2_ltor_m2n4(80,6,7,%1,%%r12,4)\ + SOLVE_col3_ltor_m2n4(96,6,7,%1,%%r12,4)\ + SOLVE_col4_ltor_m2n4(112,6,7,%1,%%r12,4)\ + SAVE_SOLUTION_m2n4(6,7,32) + +#define SOLVE_RN_m2n12 \ + "movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) GEMM_SUM_REORDER_2x4(6,7) GEMM_SUM_REORDER_2x4(8,9) "movq %2,%3; addq $8,%2;"\ + SOLVE_col1_ltor_m2n4(0,4,5,%1) SUBTRACT_m2n4(0,6,7,%1,%%r12,4) SUBTRACT_m2n4(0,8,9,%1,%%r12,8)\ + SOLVE_col2_ltor_m2n4(16,4,5,%1) SUBTRACT_m2n4(16,6,7,%1,%%r12,4) SUBTRACT_m2n4(16,8,9,%1,%%r12,8)\ + SOLVE_col3_ltor_m2n4(32,4,5,%1) SUBTRACT_m2n4(32,6,7,%1,%%r12,4) SUBTRACT_m2n4(32,8,9,%1,%%r12,8)\ + SOLVE_col4_ltor_m2n4(48,4,5,%1) SUBTRACT_m2n4(48,6,7,%1,%%r12,4) SUBTRACT_m2n4(48,8,9,%1,%%r12,8)\ + SAVE_SOLUTION_m2n4(4,5,0)\ + SOLVE_col1_ltor_m2n4(64,6,7,%1,%%r12,4) SUBTRACT_m2n4(64,8,9,%1,%%r12,8)\ + SOLVE_col2_ltor_m2n4(80,6,7,%1,%%r12,4) SUBTRACT_m2n4(80,8,9,%1,%%r12,8)\ + SOLVE_col3_ltor_m2n4(96,6,7,%1,%%r12,4) SUBTRACT_m2n4(96,8,9,%1,%%r12,8)\ + SOLVE_col4_ltor_m2n4(112,6,7,%1,%%r12,4) SUBTRACT_m2n4(112,8,9,%1,%%r12,8)\ + SAVE_SOLUTION_m2n4(6,7,32)\ + SOLVE_col1_ltor_m2n4(128,8,9,%1,%%r12,8)\ + SOLVE_col2_ltor_m2n4(144,8,9,%1,%%r12,8)\ + SOLVE_col3_ltor_m2n4(160,8,9,%1,%%r12,8)\ + SOLVE_col4_ltor_m2n4(176,8,9,%1,%%r12,8)\ + SAVE_SOLUTION_m2n4(8,9,64) + +#define SOLVE_RN_m1n4 \ + "movq %2,%3;" GEMM_SUM_REORDER_1x4(4) "movq %2,%3; addq $4,%2;"\ + SOLVE_col1_ltor_m1n4(0,4,%1)\ + SOLVE_col2_ltor_m1n4(16,4,%1)\ + SOLVE_col3_ltor_m1n4(32,4,%1)\ + SOLVE_col4_ltor_m1n4(48,4,%1)\ + SAVE_SOLUTION_m1n4(4,0) + +#define SOLVE_RN_m1n8 \ + "movq %2,%3;" GEMM_SUM_REORDER_1x4(4) GEMM_SUM_REORDER_1x4(5) "movq %2,%3; addq $4,%2;"\ + SOLVE_col1_ltor_m1n4(0,4,%1) SUBTRACT_m1n4(0,5,%1,%%r12,4)\ + SOLVE_col2_ltor_m1n4(16,4,%1) SUBTRACT_m1n4(16,5,%1,%%r12,4)\ + SOLVE_col3_ltor_m1n4(32,4,%1) SUBTRACT_m1n4(32,5,%1,%%r12,4)\ + SOLVE_col4_ltor_m1n4(48,4,%1) SUBTRACT_m1n4(48,5,%1,%%r12,4)\ + SAVE_SOLUTION_m1n4(4,0)\ + SOLVE_col1_ltor_m1n4(64,5,%1,%%r12,4)\ + SOLVE_col2_ltor_m1n4(80,5,%1,%%r12,4)\ + SOLVE_col3_ltor_m1n4(96,5,%1,%%r12,4)\ + SOLVE_col4_ltor_m1n4(112,5,%1,%%r12,4)\ + SAVE_SOLUTION_m1n4(5,16) + +#define SOLVE_RN_m1n12 \ + "movq %2,%3;" GEMM_SUM_REORDER_1x4(4) GEMM_SUM_REORDER_1x4(5) GEMM_SUM_REORDER_1x4(6) "movq %2,%3; addq $4,%2;"\ + SOLVE_col1_ltor_m1n4(0,4,%1) SUBTRACT_m1n4(0,5,%1,%%r12,4) SUBTRACT_m1n4(0,6,%1,%%r12,8)\ + SOLVE_col2_ltor_m1n4(16,4,%1) SUBTRACT_m1n4(16,5,%1,%%r12,4) SUBTRACT_m1n4(16,6,%1,%%r12,8)\ + SOLVE_col3_ltor_m1n4(32,4,%1) SUBTRACT_m1n4(32,5,%1,%%r12,4) SUBTRACT_m1n4(32,6,%1,%%r12,8)\ + SOLVE_col4_ltor_m1n4(48,4,%1) SUBTRACT_m1n4(48,5,%1,%%r12,4) SUBTRACT_m1n4(48,6,%1,%%r12,8)\ + SAVE_SOLUTION_m1n4(4,0)\ + SOLVE_col1_ltor_m1n4(64,5,%1,%%r12,4) SUBTRACT_m1n4(64,6,%1,%%r12,8)\ + SOLVE_col2_ltor_m1n4(80,5,%1,%%r12,4) SUBTRACT_m1n4(80,6,%1,%%r12,8)\ + SOLVE_col3_ltor_m1n4(96,5,%1,%%r12,4) SUBTRACT_m1n4(96,6,%1,%%r12,8)\ + SOLVE_col4_ltor_m1n4(112,5,%1,%%r12,4) SUBTRACT_m1n4(112,6,%1,%%r12,8)\ + SAVE_SOLUTION_m1n4(5,16)\ + SOLVE_col1_ltor_m1n4(128,6,%1,%%r12,8)\ + SOLVE_col2_ltor_m1n4(144,6,%1,%%r12,8)\ + SOLVE_col3_ltor_m1n4(160,6,%1,%%r12,8)\ + SOLVE_col4_ltor_m1n4(176,6,%1,%%r12,8)\ + SAVE_SOLUTION_m1n4(6,32) + +#define GEMM_RN_SIMPLE(mdim,ndim) \ + "movq %%r15,%0; leaq (%%r15,%%r12,"#mdim"),%%r15; movq %%r13,%5; movq %%r14,%1;" INIT_m##mdim##n##ndim\ + "testq %5,%5; jz 1"#mdim""#ndim"2f;"\ + "1"#mdim""#ndim"1:\n\t"\ + GEMM_KERNEL_k1m##mdim##n##ndim "addq $16,%1; addq $"#mdim"*4,%0; decq %5; jnz 1"#mdim""#ndim"1b;"\ + "1"#mdim""#ndim"2:\n\t" +#define GEMM_RN_m8n4 GEMM_RN_SIMPLE(8,4) +#define GEMM_RN_m8n8 GEMM_RN_SIMPLE(8,8) +#define GEMM_RN_m8n12 \ + "movq %%r15,%0; leaq (%%r15,%%r12,8),%%r15; movq %%r13,%5; movq %%r14,%1;" INIT_m8n12\ + "cmpq $8,%5; jb 18122f;"\ + "18121:\n\t"\ + GEMM_KERNEL_k1m8n12 "prefetcht0 384(%0); addq $32,%0; addq $16,%1;"\ + GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1;"\ + GEMM_KERNEL_k1m8n12 "prefetcht0 384(%0); addq $32,%0; addq $16,%1;"\ + GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1;"\ + GEMM_KERNEL_k1m8n12 "prefetcht0 384(%0); addq $32,%0; addq $16,%1;"\ + GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1;"\ + GEMM_KERNEL_k1m8n12 "prefetcht0 384(%0); addq $32,%0; addq $16,%1;"\ + GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1;"\ + "subq $8,%5; cmpq $8,%5; jnb 18121b;"\ + "18122:\n\t"\ + "testq %5,%5; jz 18124f;"\ + "18123:\n\t"\ + GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1; decq %5; jnz 18123b;"\ + "18124:\n\t" +#define GEMM_RN_m4n4 GEMM_RN_SIMPLE(4,4) +#define GEMM_RN_m4n8 GEMM_RN_SIMPLE(4,8) +#define GEMM_RN_m4n12 GEMM_RN_SIMPLE(4,12) +#define GEMM_RN_m2n4 GEMM_RN_SIMPLE(2,4) +#define GEMM_RN_m2n8 GEMM_RN_SIMPLE(2,8) +#define GEMM_RN_m2n12 GEMM_RN_SIMPLE(2,12) +#define GEMM_RN_m1n4 GEMM_RN_SIMPLE(1,4) +#define GEMM_RN_m1n8 GEMM_RN_SIMPLE(1,8) +#define GEMM_RN_m1n12 GEMM_RN_SIMPLE(1,12) + +#define COMPUTE(ndim) {\ + __asm__ __volatile__(\ + "movq %0,%%r15; movq %1,%%r14; movq %7,%%r13; movq %6,%%r12; salq $2,%%r12; movq %10,%%r11;"\ + "cmpq $8,%%r11; jb "#ndim"772f;"\ + #ndim"771:\n\t"\ + GEMM_RN_m8n##ndim SOLVE_RN_m8n##ndim "subq $8,%%r11; cmpq $8,%%r11; jnb "#ndim"771b;"\ + #ndim"772:\n\t"\ + "testq $4,%%r11; jz "#ndim"773f;"\ + GEMM_RN_m4n##ndim SOLVE_RN_m4n##ndim "subq $4,%%r11;"\ + #ndim"773:\n\t"\ + "testq $2,%%r11; jz "#ndim"774f;"\ + GEMM_RN_m2n##ndim SOLVE_RN_m2n##ndim "subq $2,%%r11;"\ + #ndim"774:\n\t"\ + "testq $1,%%r11; jz "#ndim"775f;"\ + GEMM_RN_m1n##ndim SOLVE_RN_m1n##ndim "subq $1,%%r11;"\ + #ndim"775:\n\t"\ + "movq %%r15,%0; movq %%r14,%1; vzeroupper;"\ + :"+r"(a_ptr),"+r"(b_ptr),"+r"(c_ptr),"+r"(c_tmp),"+r"(ldc_bytes),"+r"(k_cnt):"m"(K),"m"(OFF),"m"(one[0]),"m"(zero[0]),"m"(M)\ + :"r11","r12","r13","r14","r15","cc","memory",\ + "xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15");\ + a_ptr -= M * K; b_ptr += ndim * K; c_ptr += ldc * ndim - M; OFF += ndim;\ +} + +static void solve_RN(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + FLOAT a0, b0; + int i, j, k; + for (i=0; i7;m_count-=8){ + if(kk>0) GEMM_KERNEL_N(8,n,kk,-1.0,a_ptr,sb,c_ptr,ldc); + solve_RN(8,n,a_ptr+kk*8,sb+kk*n,c_ptr,ldc); + a_ptr += k * 8; c_ptr += 8; + } + for(;m_count>3;m_count-=4){ + if(kk>0) GEMM_KERNEL_N(4,n,kk,-1.0,a_ptr,sb,c_ptr,ldc); + solve_RN(4,n,a_ptr+kk*4,sb+kk*n,c_ptr,ldc); + a_ptr += k * 4; c_ptr += 4; + } + for(;m_count>1;m_count-=2){ + if(kk>0) GEMM_KERNEL_N(2,n,kk,-1.0,a_ptr,sb,c_ptr,ldc); + solve_RN(2,n,a_ptr+kk*2,sb+kk*n,c_ptr,ldc); + a_ptr += k * 2; c_ptr += 2; + } + if(m_count>0){ + if(kk>0) GEMM_KERNEL_N(1,n,kk,-1.0,a_ptr,sb,c_ptr,ldc); + solve_RN(1,n,a_ptr+kk*1,sb+kk*n,c_ptr,ldc); + a_ptr += k * 1; c_ptr += 1; + } +} +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *sa, FLOAT *sb, FLOAT *C, BLASLONG ldc, BLASLONG offset){ + float *a_ptr = sa, *b_ptr = sb, *c_ptr = C, *c_tmp = C; + float one[8] = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0}; + float zero[8] = {0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0}; + uint64_t ldc_bytes = (uint64_t)ldc * sizeof(float), K = (uint64_t)k, M = (uint64_t)m, OFF = (uint64_t)-offset, k_cnt = 0; + BLASLONG n_count = n; + for(;n_count>11;n_count-=12) COMPUTE(12) + for(;n_count>7;n_count-=8) COMPUTE(8) + for(;n_count>3;n_count-=4) COMPUTE(4) + for(;n_count>1;n_count-=2) { COMPUTE_EDGE_1_nchunk(m,2,a_ptr,b_ptr,c_ptr,ldc,k,OFF); b_ptr += 2*k; c_ptr += ldc*2; OFF+=2;} + if(n_count>0) COMPUTE_EDGE_1_nchunk(m,1,a_ptr,b_ptr,c_ptr,ldc,k,OFF); + return 0; +} diff --git a/kernel/x86_64/strsm_kernel_8x4_haswell_RT.c b/kernel/x86_64/strsm_kernel_8x4_haswell_RT.c new file mode 100644 index 000000000..ffcbfbbf0 --- /dev/null +++ b/kernel/x86_64/strsm_kernel_8x4_haswell_RT.c @@ -0,0 +1,281 @@ +#include "common.h" +#include +#include "strsm_kernel_8x4_haswell_R_common.h" + +#define SOLVE_RT_m8n4 \ + "movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $32,%2;"\ + SOLVE_rile_m8n2(-8,6,7,%1) SUBTRACT_m8n2(-16,4,5,%1)\ + SOLVE_le_m8n2(-24,6,7,%1) SUBTRACT_m8n2(-32,4,5,%1)\ + SAVE_SOLUTION_m8n2(6,7,-64) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ + SOLVE_rile_m8n2(-48,4,5,%1)\ + SOLVE_le_m8n2(-64,4,5,%1)\ + SAVE_SOLUTION_m8n2(4,5,-128) + +#define SOLVE_RT_m8n8 \ + "movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) GEMM_SUM_REORDER_8x4(8,9,10,11,63) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $32,%2;"\ + SOLVE_rile_m8n2(-8,10,11,%1,%%r12,4) SUBTRACT_m8n2(-16,8,9,%1,%%r12,4) SUBTRACT_m8n2(-8,6,7,%1) SUBTRACT_m8n2(-16,4,5,%1)\ + SOLVE_le_m8n2(-24,10,11,%1,%%r12,4) SUBTRACT_m8n2(-32,8,9,%1,%%r12,4) SUBTRACT_m8n2(-24,6,7,%1) SUBTRACT_m8n2(-32,4,5,%1)\ + SAVE_SOLUTION_m8n2(10,11,-64) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ + SOLVE_rile_m8n2(-48,8,9,%1,%%r12,4) SUBTRACT_m8n2(-40,6,7,%1) SUBTRACT_m8n2(-48,4,5,%1)\ + SOLVE_le_m8n2(-64,8,9,%1,%%r12,4) SUBTRACT_m8n2(-56,6,7,%1) SUBTRACT_m8n2(-64,4,5,%1)\ + SAVE_SOLUTION_m8n2(8,9,-128) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ + SOLVE_rile_m8n2(-72,6,7,%1) SUBTRACT_m8n2(-80,4,5,%1)\ + SOLVE_le_m8n2(-88,6,7,%1) SUBTRACT_m8n2(-96,4,5,%1)\ + SAVE_SOLUTION_m8n2(6,7,-192) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ + SOLVE_rile_m8n2(-112,4,5,%1)\ + SOLVE_le_m8n2(-128,4,5,%1)\ + SAVE_SOLUTION_m8n2(4,5,-256) + +#define SOLVE_RT_m8n12 \ + "movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) GEMM_SUM_REORDER_8x4(8,9,10,11,63) GEMM_SUM_REORDER_8x4(12,13,14,15,63) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $32,%2;"\ + SOLVE_rile_m8n2(-8,14,15,%1,%%r12,8) SUBTRACT_m8n2(-16,12,13,%1,%%r12,8) SUBTRACT_m8n2(-8,10,11,%1,%%r12,4) SUBTRACT_m8n2(-16,8,9,%1,%%r12,4) SUBTRACT_m8n2(-8,6,7,%1) SUBTRACT_m8n2(-16,4,5,%1)\ + SOLVE_le_m8n2(-24,14,15,%1,%%r12,8) SUBTRACT_m8n2(-32,12,13,%1,%%r12,8) SUBTRACT_m8n2(-24,10,11,%1,%%r12,4) SUBTRACT_m8n2(-32,8,9,%1,%%r12,4) SUBTRACT_m8n2(-24,6,7,%1) SUBTRACT_m8n2(-32,4,5,%1)\ + SAVE_SOLUTION_m8n2(14,15,-64) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ + SOLVE_rile_m8n2(-48,12,13,%1,%%r12,8) SUBTRACT_m8n2(-40,10,11,%1,%%r12,4) SUBTRACT_m8n2(-48,8,9,%1,%%r12,4) SUBTRACT_m8n2(-40,6,7,%1) SUBTRACT_m8n2(-48,4,5,%1)\ + SOLVE_le_m8n2(-64,12,13,%1,%%r12,8) SUBTRACT_m8n2(-56,10,11,%1,%%r12,4) SUBTRACT_m8n2(-64,8,9,%1,%%r12,4) SUBTRACT_m8n2(-56,6,7,%1) SUBTRACT_m8n2(-64,4,5,%1)\ + SAVE_SOLUTION_m8n2(12,13,-128) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ + SOLVE_rile_m8n2(-72,10,11,%1,%%r12,4) SUBTRACT_m8n2(-80,8,9,%1,%%r12,4) SUBTRACT_m8n2(-72,6,7,%1) SUBTRACT_m8n2(-80,4,5,%1)\ + SOLVE_le_m8n2(-88,10,11,%1,%%r12,4) SUBTRACT_m8n2(-96,8,9,%1,%%r12,4) SUBTRACT_m8n2(-88,6,7,%1) SUBTRACT_m8n2(-96,4,5,%1)\ + SAVE_SOLUTION_m8n2(10,11,-192) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ + SOLVE_rile_m8n2(-112,8,9,%1,%%r12,4) SUBTRACT_m8n2(-104,6,7,%1) SUBTRACT_m8n2(-112,4,5,%1)\ + SOLVE_le_m8n2(-128,8,9,%1,%%r12,4) SUBTRACT_m8n2(-120,6,7,%1) SUBTRACT_m8n2(-128,4,5,%1)\ + SAVE_SOLUTION_m8n2(8,9,-256) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ + SOLVE_rile_m8n2(-136,6,7,%1) SUBTRACT_m8n2(-144,4,5,%1)\ + SOLVE_le_m8n2(-152,6,7,%1) SUBTRACT_m8n2(-160,4,5,%1)\ + SAVE_SOLUTION_m8n2(6,7,-320) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ + SOLVE_rile_m8n2(-176,4,5,%1)\ + SOLVE_le_m8n2(-192,4,5,%1)\ + SAVE_SOLUTION_m8n2(4,5,-384) + +#define SOLVE_RT_m4n4 \ + "movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $16,%2;"\ + SOLVE_rile_m4n2(-8,5,%1) SUBTRACT_m4n2(-16,4,%1)\ + SOLVE_le_m4n2(-24,5,%1) SUBTRACT_m4n2(-32,4,%1)\ + SAVE_SOLUTION_m4n2(5,-32) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ + SOLVE_rile_m4n2(-48,4,%1)\ + SOLVE_le_m4n2(-64,4,%1)\ + SAVE_SOLUTION_m4n2(4,-64) + +#define SOLVE_RT_m4n8 \ + "movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) GEMM_SUM_REORDER_4x4(8,9,10,11,6,7) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $16,%2;"\ + SOLVE_rile_m4n2(-8,7,%1,%%r12,4) SUBTRACT_m4n2(-16,6,%1,%%r12,4) SUBTRACT_m4n2(-8,5,%1) SUBTRACT_m4n2(-16,4,%1)\ + SOLVE_le_m4n2(-24,7,%1,%%r12,4) SUBTRACT_m4n2(-32,6,%1,%%r12,4) SUBTRACT_m4n2(-24,5,%1) SUBTRACT_m4n2(-32,4,%1)\ + SAVE_SOLUTION_m4n2(7,-32) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ + SOLVE_rile_m4n2(-48,6,%1,%%r12,4) SUBTRACT_m4n2(-40,5,%1) SUBTRACT_m4n2(-48,4,%1)\ + SOLVE_le_m4n2(-64,6,%1,%%r12,4) SUBTRACT_m4n2(-56,5,%1) SUBTRACT_m4n2(-64,4,%1)\ + SAVE_SOLUTION_m4n2(6,-64) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ + SOLVE_rile_m4n2(-72,5,%1) SUBTRACT_m4n2(-80,4,%1)\ + SOLVE_le_m4n2(-88,5,%1) SUBTRACT_m4n2(-96,4,%1)\ + SAVE_SOLUTION_m4n2(5,-96) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ + SOLVE_rile_m4n2(-112,4,%1)\ + SOLVE_le_m4n2(-128,4,%1)\ + SAVE_SOLUTION_m4n2(4,-128) + +#define SOLVE_RT_m4n12 \ + "movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) GEMM_SUM_REORDER_4x4(8,9,10,11,6,7) GEMM_SUM_REORDER_4x4(12,13,14,15,8,9) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $16,%2;"\ + SOLVE_rile_m4n2(-8,9,%1,%%r12,8) SUBTRACT_m4n2(-16,8,%1,%%r12,8) SUBTRACT_m4n2(-8,7,%1,%%r12,4) SUBTRACT_m4n2(-16,6,%1,%%r12,4) SUBTRACT_m4n2(-8,5,%1) SUBTRACT_m4n2(-16,4,%1)\ + SOLVE_le_m4n2(-24,9,%1,%%r12,8) SUBTRACT_m4n2(-32,8,%1,%%r12,8) SUBTRACT_m4n2(-24,7,%1,%%r12,4) SUBTRACT_m4n2(-32,6,%1,%%r12,4) SUBTRACT_m4n2(-24,5,%1) SUBTRACT_m4n2(-32,4,%1)\ + SAVE_SOLUTION_m4n2(9,-32) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ + SOLVE_rile_m4n2(-48,8,%1,%%r12,8) SUBTRACT_m4n2(-40,7,%1,%%r12,4) SUBTRACT_m4n2(-48,6,%1,%%r12,4) SUBTRACT_m4n2(-40,5,%1) SUBTRACT_m4n2(-48,4,%1)\ + SOLVE_le_m4n2(-64,8,%1,%%r12,8) SUBTRACT_m4n2(-56,7,%1,%%r12,4) SUBTRACT_m4n2(-64,6,%1,%%r12,4) SUBTRACT_m4n2(-56,5,%1) SUBTRACT_m4n2(-64,4,%1)\ + SAVE_SOLUTION_m4n2(8,-64) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ + SOLVE_rile_m4n2(-72,7,%1,%%r12,4) SUBTRACT_m4n2(-80,6,%1,%%r12,4) SUBTRACT_m4n2(-72,5,%1) SUBTRACT_m4n2(-80,4,%1)\ + SOLVE_le_m4n2(-88,7,%1,%%r12,4) SUBTRACT_m4n2(-96,6,%1,%%r12,4) SUBTRACT_m4n2(-88,5,%1) SUBTRACT_m4n2(-96,4,%1)\ + SAVE_SOLUTION_m4n2(7,-96) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ + SOLVE_rile_m4n2(-112,6,%1,%%r12,4) SUBTRACT_m4n2(-104,5,%1) SUBTRACT_m4n2(-112,4,%1)\ + SOLVE_le_m4n2(-128,6,%1,%%r12,4) SUBTRACT_m4n2(-120,5,%1) SUBTRACT_m4n2(-128,4,%1)\ + SAVE_SOLUTION_m4n2(6,-128) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ + SOLVE_rile_m4n2(-136,5,%1) SUBTRACT_m4n2(-144,4,%1)\ + SOLVE_le_m4n2(-152,5,%1) SUBTRACT_m4n2(-160,4,%1)\ + SAVE_SOLUTION_m4n2(5,-160) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ + SOLVE_rile_m4n2(-176,4,%1)\ + SOLVE_le_m4n2(-192,4,%1)\ + SAVE_SOLUTION_m4n2(4,-192) + +#define SOLVE_RT_m2n4 \ + "movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $8,%2;"\ + SOLVE_col4_rtol_m2n4(-16,4,5,%1)\ + SOLVE_col3_rtol_m2n4(-32,4,5,%1)\ + SOLVE_col2_rtol_m2n4(-48,4,5,%1)\ + SOLVE_col1_rtol_m2n4(-64,4,5,%1)\ + SAVE_SOLUTION_m2n4(4,5,-32) + +#define SOLVE_RT_m2n8 \ + "movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) GEMM_SUM_REORDER_2x4(6,7) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $8,%2;"\ + SOLVE_col4_rtol_m2n4(-16,6,7,%1,%%r12,4) SUBTRACT_m2n4(-16,4,5,%1)\ + SOLVE_col3_rtol_m2n4(-32,6,7,%1,%%r12,4) SUBTRACT_m2n4(-32,4,5,%1)\ + SOLVE_col2_rtol_m2n4(-48,6,7,%1,%%r12,4) SUBTRACT_m2n4(-48,4,5,%1)\ + SOLVE_col1_rtol_m2n4(-64,6,7,%1,%%r12,4) SUBTRACT_m2n4(-64,4,5,%1)\ + SAVE_SOLUTION_m2n4(6,7,-32) "negq %4; leaq (%3,%4,8),%3; negq %4;"\ + SOLVE_col4_rtol_m2n4(-80,4,5,%1)\ + SOLVE_col3_rtol_m2n4(-96,4,5,%1)\ + SOLVE_col2_rtol_m2n4(-112,4,5,%1)\ + SOLVE_col1_rtol_m2n4(-128,4,5,%1)\ + SAVE_SOLUTION_m2n4(4,5,-64) + +#define SOLVE_RT_m2n12 \ + "movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) GEMM_SUM_REORDER_2x4(6,7) GEMM_SUM_REORDER_2x4(8,9) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $8,%2;"\ + SOLVE_col4_rtol_m2n4(-16,8,9,%1,%%r12,8) SUBTRACT_m2n4(-16,6,7,%1,%%r12,4) SUBTRACT_m2n4(-16,4,5,%1)\ + SOLVE_col3_rtol_m2n4(-32,8,9,%1,%%r12,8) SUBTRACT_m2n4(-32,6,7,%1,%%r12,4) SUBTRACT_m2n4(-32,4,5,%1)\ + SOLVE_col2_rtol_m2n4(-48,8,9,%1,%%r12,8) SUBTRACT_m2n4(-48,6,7,%1,%%r12,4) SUBTRACT_m2n4(-48,4,5,%1)\ + SOLVE_col1_rtol_m2n4(-64,8,9,%1,%%r12,8) SUBTRACT_m2n4(-64,6,7,%1,%%r12,4) SUBTRACT_m2n4(-64,4,5,%1)\ + SAVE_SOLUTION_m2n4(8,9,-32) "negq %4; leaq (%3,%4,8),%3; negq %4;"\ + SOLVE_col4_rtol_m2n4(-80,6,7,%1,%%r12,4) SUBTRACT_m2n4(-80,4,5,%1)\ + SOLVE_col3_rtol_m2n4(-96,6,7,%1,%%r12,4) SUBTRACT_m2n4(-96,4,5,%1)\ + SOLVE_col2_rtol_m2n4(-112,6,7,%1,%%r12,4) SUBTRACT_m2n4(-112,4,5,%1)\ + SOLVE_col1_rtol_m2n4(-128,6,7,%1,%%r12,4) SUBTRACT_m2n4(-128,4,5,%1)\ + SAVE_SOLUTION_m2n4(6,7,-64) "negq %4; leaq (%3,%4,8),%3; negq %4;"\ + SOLVE_col4_rtol_m2n4(-144,4,5,%1)\ + SOLVE_col3_rtol_m2n4(-160,4,5,%1)\ + SOLVE_col2_rtol_m2n4(-176,4,5,%1)\ + SOLVE_col1_rtol_m2n4(-192,4,5,%1)\ + SAVE_SOLUTION_m2n4(4,5,-96) + +#define SOLVE_RT_m1n4 \ + "movq %2,%3;" GEMM_SUM_REORDER_1x4(4) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $4,%2;"\ + SOLVE_col4_rtol_m1n4(-16,4,%1)\ + SOLVE_col3_rtol_m1n4(-32,4,%1)\ + SOLVE_col2_rtol_m1n4(-48,4,%1)\ + SOLVE_col1_rtol_m1n4(-64,4,%1)\ + SAVE_SOLUTION_m1n4(4,-16) + +#define SOLVE_RT_m1n8 \ + "movq %2,%3;" GEMM_SUM_REORDER_1x4(4) GEMM_SUM_REORDER_1x4(5) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $4,%2;"\ + SOLVE_col4_rtol_m1n4(-16,5,%1,%%r12,4) SUBTRACT_m1n4(-16,4,%1)\ + SOLVE_col3_rtol_m1n4(-32,5,%1,%%r12,4) SUBTRACT_m1n4(-32,4,%1)\ + SOLVE_col2_rtol_m1n4(-48,5,%1,%%r12,4) SUBTRACT_m1n4(-48,4,%1)\ + SOLVE_col1_rtol_m1n4(-64,5,%1,%%r12,4) SUBTRACT_m1n4(-64,4,%1)\ + SAVE_SOLUTION_m1n4(5,-16) "negq %4; leaq (%3,%4,8),%3; negq %4;"\ + SOLVE_col4_rtol_m1n4(-80,4,%1)\ + SOLVE_col3_rtol_m1n4(-96,4,%1)\ + SOLVE_col2_rtol_m1n4(-112,4,%1)\ + SOLVE_col1_rtol_m1n4(-128,4,%1)\ + SAVE_SOLUTION_m1n4(4,-32) + +#define SOLVE_RT_m1n12 \ + "movq %2,%3;" GEMM_SUM_REORDER_1x4(4) GEMM_SUM_REORDER_1x4(5) GEMM_SUM_REORDER_1x4(6) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $4,%2;"\ + SOLVE_col4_rtol_m1n4(-16,6,%1,%%r12,8) SUBTRACT_m1n4(-16,5,%1,%%r12,4) SUBTRACT_m1n4(-16,4,%1)\ + SOLVE_col3_rtol_m1n4(-32,6,%1,%%r12,8) SUBTRACT_m1n4(-32,5,%1,%%r12,4) SUBTRACT_m1n4(-32,4,%1)\ + SOLVE_col2_rtol_m1n4(-48,6,%1,%%r12,8) SUBTRACT_m1n4(-48,5,%1,%%r12,4) SUBTRACT_m1n4(-48,4,%1)\ + SOLVE_col1_rtol_m1n4(-64,6,%1,%%r12,8) SUBTRACT_m1n4(-64,5,%1,%%r12,4) SUBTRACT_m1n4(-64,4,%1)\ + SAVE_SOLUTION_m1n4(6,-16) "negq %4; leaq (%3,%4,8),%3; negq %4;"\ + SOLVE_col4_rtol_m1n4(-80,5,%1,%%r12,4) SUBTRACT_m1n4(-80,4,%1)\ + SOLVE_col3_rtol_m1n4(-96,5,%1,%%r12,4) SUBTRACT_m1n4(-96,4,%1)\ + SOLVE_col2_rtol_m1n4(-112,5,%1,%%r12,4) SUBTRACT_m1n4(-112,4,%1)\ + SOLVE_col1_rtol_m1n4(-128,5,%1,%%r12,4) SUBTRACT_m1n4(-128,4,%1)\ + SAVE_SOLUTION_m1n4(5,-32) "negq %4; leaq (%3,%4,8),%3; negq %4;"\ + SOLVE_col4_rtol_m1n4(-144,4,%1)\ + SOLVE_col3_rtol_m1n4(-160,4,%1)\ + SOLVE_col2_rtol_m1n4(-176,4,%1)\ + SOLVE_col1_rtol_m1n4(-192,4,%1)\ + SAVE_SOLUTION_m1n4(4,-48) + +/* r14 = b_tail, r15 = a_tail, r13 = k-kk */ +#define GEMM_RT_SIMPLE(mdim,ndim) \ + "leaq (%%r15,%%r12,"#mdim"),%%r15; movq %%r15,%0; movq %%r13,%5; movq %%r14,%1;" INIT_m##mdim##n##ndim\ + "testq %5,%5; jz 1"#mdim""#ndim"2f;"\ + "1"#mdim""#ndim"1:\n\t"\ + "subq $16,%1; subq $"#mdim"*4,%0;" GEMM_KERNEL_k1m##mdim##n##ndim "decq %5; jnz 1"#mdim""#ndim"1b;"\ + "1"#mdim""#ndim"2:\n\t" +#define GEMM_RT_m8n4 GEMM_RT_SIMPLE(8,4) +#define GEMM_RT_m8n8 GEMM_RT_SIMPLE(8,8) +#define GEMM_RT_m8n12 \ + "leaq (%%r15,%%r12,8),%%r15; movq %%r15,%0; movq %%r13,%5; movq %%r14,%1;" INIT_m8n12\ + "cmpq $8,%5; jb 18122f;"\ + "18121:\n\t"\ + "prefetcht0 -384(%0); subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ + "subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ + "prefetcht0 -384(%0); subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ + "subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ + "prefetcht0 -384(%0); subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ + "subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ + "prefetcht0 -384(%0); subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ + "subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ + "subq $8,%5; cmpq $8,%5; jnb 18121b;"\ + "18122:\n\t"\ + "testq %5,%5; jz 18124f;"\ + "18123:\n\t"\ + "subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12 "decq %5; jnz 18123b;"\ + "18124:\n\t" +#define GEMM_RT_m4n4 GEMM_RT_SIMPLE(4,4) +#define GEMM_RT_m4n8 GEMM_RT_SIMPLE(4,8) +#define GEMM_RT_m4n12 GEMM_RT_SIMPLE(4,12) +#define GEMM_RT_m2n4 GEMM_RT_SIMPLE(2,4) +#define GEMM_RT_m2n8 GEMM_RT_SIMPLE(2,8) +#define GEMM_RT_m2n12 GEMM_RT_SIMPLE(2,12) +#define GEMM_RT_m1n4 GEMM_RT_SIMPLE(1,4) +#define GEMM_RT_m1n8 GEMM_RT_SIMPLE(1,8) +#define GEMM_RT_m1n12 GEMM_RT_SIMPLE(1,12) + +#define COMPUTE(ndim) {\ + b_ptr -= (ndim-4)*K; c_ptr -= ndim * ldc;\ + __asm__ __volatile__(\ + "movq %0,%%r15; movq %6,%%r13; subq %7,%%r13; movq %6,%%r12; salq $2,%%r12; movq %1,%%r14; movq %10,%%r11;"\ + "cmpq $8,%%r11; jb "#ndim"772f;"\ + #ndim"771:\n\t"\ + GEMM_RT_m8n##ndim SOLVE_RT_m8n##ndim "subq $8,%%r11; cmpq $8,%%r11; jnb "#ndim"771b;"\ + #ndim"772:\n\t"\ + "testq $4,%%r11; jz "#ndim"773f;"\ + GEMM_RT_m4n##ndim SOLVE_RT_m4n##ndim "subq $4,%%r11;"\ + #ndim"773:\n\t"\ + "testq $2,%%r11; jz "#ndim"774f;"\ + GEMM_RT_m2n##ndim SOLVE_RT_m2n##ndim "subq $2,%%r11;"\ + #ndim"774:\n\t"\ + "testq $1,%%r11; jz "#ndim"775f;"\ + GEMM_RT_m1n##ndim SOLVE_RT_m1n##ndim "subq $1,%%r11;"\ + #ndim"775:\n\t"\ + "movq %%r15,%0; movq %%r14,%1; vzeroupper;"\ + :"+r"(a_ptr),"+r"(b_ptr),"+r"(c_ptr),"+r"(c_tmp),"+r"(ldc_bytes),"+r"(k_cnt):"m"(K),"m"(OFF),"m"(one[0]),"m"(zero[0]),"m"(M)\ + :"r11","r12","r13","r14","r15","cc","memory",\ + "xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15");\ + a_ptr -= M * K; b_ptr -= 4 * K; c_ptr -= M; OFF -= ndim;\ +} + +static void solve_RT(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc){ + FLOAT a0, b0; + int i, j, k; + for (i=n-1;i>=0;i--) { + b0 = b[i*n+i]; + for (j=0;j7;m_count-=8){ + if(k-kk>0) GEMM_KERNEL_N(8,n,k-kk,-1.0,a_ptr+kk*8,sb+kk*n,c_ptr,ldc); + solve_RT(8,n,a_ptr+(kk-n)*8,sb+(kk-n)*n,c_ptr,ldc); + a_ptr += k * 8; c_ptr += 8; + } + for(;m_count>3;m_count-=4){ + if(k-kk>0) GEMM_KERNEL_N(4,n,k-kk,-1.0,a_ptr+kk*4,sb+kk*n,c_ptr,ldc); + solve_RT(4,n,a_ptr+(kk-n)*4,sb+(kk-n)*n,c_ptr,ldc); + a_ptr += k * 4; c_ptr += 4; + } + for(;m_count>1;m_count-=2){ + if(k-kk>0) GEMM_KERNEL_N(2,n,k-kk,-1.0,a_ptr+kk*2,sb+kk*n,c_ptr,ldc); + solve_RT(2,n,a_ptr+(kk-n)*2,sb+(kk-n)*n,c_ptr,ldc); + a_ptr += k * 2; c_ptr += 2; + } + if(m_count>0){ + if(k-kk>0) GEMM_KERNEL_N(1,n,k-kk,-1.0,a_ptr+kk*1,sb+kk*n,c_ptr,ldc); + solve_RT(1,n,a_ptr+(kk-n)*1,sb+(kk-n)*n,c_ptr,ldc); + a_ptr += k * 1; c_ptr += 1; + } +} +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *sa, FLOAT *sb, FLOAT *C, BLASLONG ldc, BLASLONG offset){ + float *a_ptr = sa, *b_ptr = sb+n*k, *c_ptr = C+n*ldc, *c_tmp = C; + float one[8] = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0}; + float zero[8] = {0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0}; + uint64_t ldc_bytes = (uint64_t)ldc * sizeof(float), K = (uint64_t)k, M = (uint64_t)m, OFF = (uint64_t)(n-offset), k_cnt = 0; + BLASLONG n_count = n; + if(n&1){b_ptr-=k; c_ptr-=ldc; COMPUTE_EDGE_1_nchunk(m,1,a_ptr,b_ptr,c_ptr,ldc,k,OFF); OFF--; n_count--;} + if(n&2){b_ptr-=k*2; c_ptr-=ldc*2; COMPUTE_EDGE_1_nchunk(m,2,a_ptr,b_ptr,c_ptr,ldc,k,OFF); OFF-=2; n_count-=2;} + for(;n_count>11;n_count-=12) COMPUTE(12) + for(;n_count>7;n_count-=8) COMPUTE(8) + for(;n_count>3;n_count-=4) COMPUTE(4) + return 0; +} diff --git a/kernel/x86_64/strsm_kernel_8x4_haswell_R_common.h b/kernel/x86_64/strsm_kernel_8x4_haswell_R_common.h new file mode 100644 index 000000000..36b7aa1a3 --- /dev/null +++ b/kernel/x86_64/strsm_kernel_8x4_haswell_R_common.h @@ -0,0 +1,226 @@ +/* r11 = m_counter, r12 = size_of_k_elements, r13 = kk, r14 = b_head, r15 = a_head */ +/* register i/o: %0 = a_ptr, %1 = b_ptr, %2 = c_ptr, %3 = c_tmp, %4 = ldc, %5 = k_counter */ +/* memory input: %6 = K, %7 = offset, %8 = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0}, %9 = {0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0}, %10 = M */ + +#define init_m8n4(c1,c2,c3,c4)\ + "vpxor %%ymm"#c1",%%ymm"#c1",%%ymm"#c1"; vpxor %%ymm"#c2",%%ymm"#c2",%%ymm"#c2";"\ + "vpxor %%ymm"#c3",%%ymm"#c3",%%ymm"#c3"; vpxor %%ymm"#c4",%%ymm"#c4",%%ymm"#c4";" +#define INIT_m8n4 init_m8n4(4,5,6,7) +#define INIT_m8n8 INIT_m8n4 init_m8n4(8,9,10,11) +#define INIT_m8n12 INIT_m8n8 init_m8n4(12,13,14,15) + +#define init_m4n4(c1,c2,c3,c4)\ + "vpxor %%xmm"#c1",%%xmm"#c1",%%xmm"#c1"; vpxor %%xmm"#c2",%%xmm"#c2",%%xmm"#c2";"\ + "vpxor %%xmm"#c3",%%xmm"#c3",%%xmm"#c3"; vpxor %%xmm"#c4",%%xmm"#c4",%%xmm"#c4";" +#define INIT_m4n4 init_m4n4(4,5,6,7) +#define INIT_m4n8 INIT_m4n4 init_m4n4(8,9,10,11) +#define INIT_m4n12 INIT_m4n8 init_m4n4(12,13,14,15) + +#define init_m2n4(c1,c2)\ + "vpxor %%xmm"#c1",%%xmm"#c1",%%xmm"#c1"; vpxor %%xmm"#c2",%%xmm"#c2",%%xmm"#c2";" +#define INIT_m2n4 init_m2n4(4,5) +#define INIT_m2n8 INIT_m2n4 init_m2n4(6,7) +#define INIT_m2n12 INIT_m2n8 init_m2n4(8,9) + +#define init_m1n4(c1) "vpxor %%xmm"#c1",%%xmm"#c1",%%xmm"#c1";" +#define INIT_m1n4 init_m1n4(4) +#define INIT_m1n8 INIT_m1n4 init_m1n4(5) +#define INIT_m1n12 INIT_m1n8 init_m1n4(6) + +#define GEMM_KERNEL_k1m8n4 \ + "vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2;"\ + "vbroadcastsd (%1),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm4; vfnmadd231ps %%ymm3,%%ymm2,%%ymm5;"\ + "vbroadcastsd 8(%1),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm6; vfnmadd231ps %%ymm3,%%ymm2,%%ymm7;" +#define GEMM_KERNEL_k1m8n8 GEMM_KERNEL_k1m8n4\ + "vbroadcastsd (%1,%%r12,4),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm8; vfnmadd231ps %%ymm3,%%ymm2,%%ymm9;"\ + "vbroadcastsd 8(%1,%%r12,4),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm10; vfnmadd231ps %%ymm3,%%ymm2,%%ymm11;" +#define GEMM_KERNEL_k1m8n12 GEMM_KERNEL_k1m8n8\ + "vbroadcastsd (%1,%%r12,8),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm12; vfnmadd231ps %%ymm3,%%ymm2,%%ymm13;"\ + "vbroadcastsd 8(%1,%%r12,8),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm14; vfnmadd231ps %%ymm3,%%ymm2,%%ymm15;" + +#define GEMM_KERNEL_k1m4n4 \ + "vmovsldup (%0),%%xmm1; vmovshdup (%0),%%xmm2;"\ + "vmovddup (%1),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm4; vfnmadd231ps %%xmm3,%%xmm2,%%xmm5;"\ + "vmovddup 8(%1),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm6; vfnmadd231ps %%xmm3,%%xmm2,%%xmm7;" +#define GEMM_KERNEL_k1m4n8 GEMM_KERNEL_k1m4n4\ + "vmovddup (%1,%%r12,4),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm8; vfnmadd231ps %%xmm3,%%xmm2,%%xmm9;"\ + "vmovddup 8(%1,%%r12,4),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm10; vfnmadd231ps %%xmm3,%%xmm2,%%xmm11;" +#define GEMM_KERNEL_k1m4n12 GEMM_KERNEL_k1m4n8\ + "vmovddup (%1,%%r12,8),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm12; vfnmadd231ps %%xmm3,%%xmm2,%%xmm13;"\ + "vmovddup 8(%1,%%r12,8),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm14; vfnmadd231ps %%xmm3,%%xmm2,%%xmm15;" + +#define GEMM_KERNEL_k1m2n4 \ + "vbroadcastss (%0),%%xmm1; vbroadcastss 4(%0),%%xmm2;"\ + "vmovups (%1),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm4; vfnmadd231ps %%xmm3,%%xmm2,%%xmm5;" +#define GEMM_KERNEL_k1m2n8 GEMM_KERNEL_k1m2n4\ + "vmovups (%1,%%r12,4),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm6; vfnmadd231ps %%xmm3,%%xmm2,%%xmm7;" +#define GEMM_KERNEL_k1m2n12 GEMM_KERNEL_k1m2n8\ + "vmovups (%1,%%r12,8),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm8; vfnmadd231ps %%xmm3,%%xmm2,%%xmm9;" + +#define GEMM_KERNEL_k1m1n4 "vbroadcastss (%0),%%xmm1; vfnmadd231ps (%1),%%xmm1,%%xmm4;" +#define GEMM_KERNEL_k1m1n8 GEMM_KERNEL_k1m1n4 "vfnmadd231ps (%1,%%r12,4),%%xmm1,%%xmm5;" +#define GEMM_KERNEL_k1m1n12 GEMM_KERNEL_k1m1n8 "vfnmadd231ps (%1,%%r12,8),%%xmm1,%%xmm6;" + +#define GEMM_SUM_REORDER_8x4(c1,c2,c3,c4,prefpos)\ + "vmovups (%3),%%ymm0; vmovups (%3,%4,1),%%ymm1; prefetcht1 "#prefpos"(%3); prefetcht1 "#prefpos"(%3,%4,1); leaq (%3,%4,2),%3;"\ + "vunpcklps %%ymm1,%%ymm0,%%ymm2; vunpckhps %%ymm1,%%ymm0,%%ymm3; vunpcklpd %%ymm3,%%ymm2,%%ymm0; vunpckhpd %%ymm3,%%ymm2,%%ymm1;"\ + "vaddps %%ymm0,%%ymm"#c1",%%ymm"#c1"; vaddps %%ymm1,%%ymm"#c2",%%ymm"#c2";"\ + "vmovups (%3),%%ymm0; vmovups (%3,%4,1),%%ymm1; prefetcht1 "#prefpos"(%3); prefetcht1 "#prefpos"(%3,%4,1); leaq (%3,%4,2),%3;"\ + "vunpcklps %%ymm1,%%ymm0,%%ymm2; vunpckhps %%ymm1,%%ymm0,%%ymm3; vunpcklpd %%ymm3,%%ymm2,%%ymm0; vunpckhpd %%ymm3,%%ymm2,%%ymm1;"\ + "vaddps %%ymm0,%%ymm"#c3",%%ymm"#c3"; vaddps %%ymm1,%%ymm"#c4",%%ymm"#c4";" + +#define GEMM_SUM_REORDER_4x4(c1,c2,c3,c4,co1,co2)\ + "vmovups (%3),%%xmm0; vmovups (%3,%4,1),%%xmm1; leaq (%3,%4,2),%3;"\ + "vunpcklps %%xmm1,%%xmm0,%%xmm2; vunpckhps %%xmm1,%%xmm0,%%xmm3;"\ + "vunpcklpd %%xmm"#c2",%%xmm"#c1",%%xmm0; vunpckhpd %%xmm"#c2",%%xmm"#c1",%%xmm1;"\ + "vaddps %%xmm0,%%xmm2,%%xmm"#c1"; vaddps %%xmm1,%%xmm3,%%xmm"#c2";"\ + "vmovups (%3),%%xmm0; vmovups (%3,%4,1),%%xmm1; leaq (%3,%4,2),%3;"\ + "vunpcklps %%xmm1,%%xmm0,%%xmm2; vunpckhps %%xmm1,%%xmm0,%%xmm3;"\ + "vunpcklpd %%xmm"#c4",%%xmm"#c3",%%xmm0; vunpckhpd %%xmm"#c4",%%xmm"#c3",%%xmm1;"\ + "vaddps %%xmm0,%%xmm2,%%xmm"#c3"; vaddps %%xmm1,%%xmm3,%%xmm"#c4";"\ + "vperm2f128 $2,%%ymm"#c1",%%ymm"#c2",%%ymm"#co1"; vperm2f128 $2,%%ymm"#c3",%%ymm"#c4",%%ymm"#co2";" + +#define GEMM_SUM_REORDER_2x4(c1,c2)\ + "vmovsd (%3),%%xmm0; vmovhpd (%3,%4,1),%%xmm0,%%xmm0; leaq (%3,%4,2),%3; vpermilps $216,%%xmm0,%%xmm0;"\ + "vmovsd (%3),%%xmm1; vmovhpd (%3,%4,1),%%xmm1,%%xmm1; leaq (%3,%4,2),%3; vpermilps $216,%%xmm1,%%xmm1;"\ + "vunpcklpd %%xmm1,%%xmm0,%%xmm2; vaddps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\ + "vunpckhpd %%xmm1,%%xmm0,%%xmm3; vaddps %%xmm3,%%xmm"#c2",%%xmm"#c2";"\ + +#define GEMM_SUM_REORDER_1x4(c1)\ + "vmovss (%3),%%xmm1; vinsertps $16,(%3,%4,1),%%xmm1,%%xmm1; leaq (%3,%4,2),%3;"\ + "vinsertps $32,(%3),%%xmm1,%%xmm1; vinsertps $48,(%3,%4,1),%%xmm1,%%xmm1; leaq (%3,%4,2),%3;"\ + "vaddps %%xmm"#c1",%%xmm1,%%xmm"#c1";" + +#define SOLVE_le_m4n2(b_off,c1,...)\ + "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vblendps $170,%8,%%ymm0,%%ymm2;"\ + "vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1";"\ + "vmovsldup %%ymm"#c1",%%ymm1;" + +#define SOLVE_le_m8n2(b_off,c1,c2,...)\ + "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vblendps $170,%8,%%ymm0,%%ymm2;"\ + "vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1"; vmulps %%ymm2,%%ymm"#c2",%%ymm"#c2";"\ + "vmovsldup %%ymm"#c1",%%ymm1; vmovsldup %%ymm"#c2",%%ymm2;" + +#define SOLVE_leri_m4n2(b_off,c1,...) SOLVE_le_m4n2(b_off,c1,__VA_ARGS__)\ + "vblendps $85,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1";" + +#define SOLVE_leri_m8n2(b_off,c1,c2,...) SOLVE_le_m8n2(b_off,c1,c2,__VA_ARGS__)\ + "vblendps $85,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1"; vfnmadd231ps %%ymm0,%%ymm2,%%ymm"#c2";" + +#define SOLVE_ri_m4n2(b_off,c1,...)\ + "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vblendps $85,%8,%%ymm0,%%ymm2;"\ + "vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1";"\ + "vmovshdup %%ymm"#c1",%%ymm1;" + +#define SOLVE_ri_m8n2(b_off,c1,c2,...)\ + "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vblendps $85,%8,%%ymm0,%%ymm2;"\ + "vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1"; vmulps %%ymm2,%%ymm"#c2",%%ymm"#c2";"\ + "vmovshdup %%ymm"#c1",%%ymm1; vmovshdup %%ymm"#c2",%%ymm2;" + +#define SOLVE_rile_m4n2(b_off,c1,...) SOLVE_ri_m4n2(b_off,c1,__VA_ARGS__)\ + "vblendps $170,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1";" + +#define SOLVE_rile_m8n2(b_off,c1,c2,...) SOLVE_ri_m8n2(b_off,c1,c2,__VA_ARGS__)\ + "vblendps $170,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1"; vfnmadd231ps %%ymm0,%%ymm2,%%ymm"#c2";" + +#define SOLVE_col1_rtol_m1n4(b_off,c1,...)\ + "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $14,%8,%%xmm0,%%xmm2;"\ + "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\ + "vpermilps $0,%%xmm"#c1",%%xmm1;" + +#define SOLVE_col1_rtol_m2n4(b_off,c1,c2,...)\ + "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $14,%8,%%xmm0,%%xmm2;"\ + "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1"; vmulps %%xmm2,%%xmm"#c2",%%xmm"#c2";"\ + "vpermilps $0,%%xmm"#c1",%%xmm1; vpermilps $0,%%xmm"#c2",%%xmm2;" + +#define SOLVE_col1_ltor_m1n4(b_off,c1,...) SOLVE_col1_rtol_m1n4(b_off,c1,__VA_ARGS__)\ + "vblendps $1,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" + +#define SOLVE_col1_ltor_m2n4(b_off,c1,c2,...) SOLVE_col1_rtol_m2n4(b_off,c1,c2,__VA_ARGS__)\ + "vblendps $1,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" + +#define SOLVE_col2_mul_m1n4(b_off,c1,...)\ + "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $13,%8,%%xmm0,%%xmm2;"\ + "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\ + "vpermilps $85,%%xmm"#c1",%%xmm1;" + +#define SOLVE_col2_mul_m2n4(b_off,c1,c2,...)\ + "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $13,%8,%%xmm0,%%xmm2;"\ + "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1"; vmulps %%xmm2,%%xmm"#c2",%%xmm"#c2";"\ + "vpermilps $85,%%xmm"#c1",%%xmm1; vpermilps $85,%%xmm"#c2",%%xmm2;" + +#define SOLVE_col2_rtol_m1n4(b_off,c1,...) SOLVE_col2_mul_m1n4(b_off,c1,__VA_ARGS__)\ + "vblendps $14,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" + +#define SOLVE_col2_rtol_m2n4(b_off,c1,c2,...) SOLVE_col2_mul_m2n4(b_off,c1,c2,__VA_ARGS__)\ + "vblendps $14,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" + +#define SOLVE_col2_ltor_m1n4(b_off,c1,...) SOLVE_col2_mul_m1n4(b_off,c1,__VA_ARGS__)\ + "vblendps $3,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" + +#define SOLVE_col2_ltor_m2n4(b_off,c1,c2,...) SOLVE_col2_mul_m2n4(b_off,c1,c2,__VA_ARGS__)\ + "vblendps $3,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" + +#define SOLVE_col3_mul_m1n4(b_off,c1,...)\ + "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $11,%8,%%xmm0,%%xmm2;"\ + "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\ + "vpermilps $170,%%xmm"#c1",%%xmm1;" + +#define SOLVE_col3_mul_m2n4(b_off,c1,c2,...)\ + "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $11,%8,%%xmm0,%%xmm2;"\ + "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1"; vmulps %%xmm2,%%xmm"#c2",%%xmm"#c2";"\ + "vpermilps $170,%%xmm"#c1",%%xmm1; vpermilps $170,%%xmm"#c2",%%xmm2;" + +#define SOLVE_col3_rtol_m1n4(b_off,c1,...) SOLVE_col3_mul_m1n4(b_off,c1,__VA_ARGS__)\ + "vblendps $12,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" + +#define SOLVE_col3_rtol_m2n4(b_off,c1,c2,...) SOLVE_col3_mul_m2n4(b_off,c1,c2,__VA_ARGS__)\ + "vblendps $12,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" + +#define SOLVE_col3_ltor_m1n4(b_off,c1,...) SOLVE_col3_mul_m1n4(b_off,c1,__VA_ARGS__)\ + "vblendps $7,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" + +#define SOLVE_col3_ltor_m2n4(b_off,c1,c2,...) SOLVE_col3_mul_m2n4(b_off,c1,c2,__VA_ARGS__)\ + "vblendps $7,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" + +#define SOLVE_col4_ltor_m1n4(b_off,c1,...)\ + "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $7,%8,%%xmm0,%%xmm2;"\ + "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\ + "vpermilps $255,%%xmm"#c1",%%xmm1;" + +#define SOLVE_col4_ltor_m2n4(b_off,c1,c2,...)\ + "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $7,%8,%%xmm0,%%xmm2;"\ + "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1"; vmulps %%xmm2,%%xmm"#c2",%%xmm"#c2";"\ + "vpermilps $255,%%xmm"#c1",%%xmm1; vpermilps $255,%%xmm"#c2",%%xmm2;" + +#define SOLVE_col4_rtol_m1n4(b_off,c1,...) SOLVE_col4_ltor_m1n4(b_off,c1,__VA_ARGS__)\ + "vblendps $8,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" + +#define SOLVE_col4_rtol_m2n4(b_off,c1,c2,...) SOLVE_col4_ltor_m2n4(b_off,c1,c2,__VA_ARGS__)\ + "vblendps $8,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" + +#define SUBTRACT_m4n2(b_off,c1,...) "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1";" + +#define SUBTRACT_m8n2(b_off,c1,c2,...) SUBTRACT_m4n2(b_off,c1,__VA_ARGS__) "vfnmadd231ps %%ymm0,%%ymm2,%%ymm"#c2";" + +#define SUBTRACT_m1n4(b_off,c1,...) "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" + +#define SUBTRACT_m2n4(b_off,c1,c2,...) SUBTRACT_m1n4(b_off,c1,__VA_ARGS__) "vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" + +#define SAVE_SOLUTION_m8n2(c1,c2,a_off)\ + "vunpcklps %%ymm"#c2",%%ymm"#c1",%%ymm0; vunpckhps %%ymm"#c2",%%ymm"#c1",%%ymm1;"\ + "vunpcklpd %%ymm1,%%ymm0,%%ymm"#c1"; vunpckhpd %%ymm1,%%ymm0,%%ymm"#c2";"\ + "vmovups %%ymm"#c1","#a_off"(%0); vmovups %%ymm"#c2","#a_off"+32(%0);"\ + "vmovups %%ymm"#c1",(%3); vmovups %%ymm"#c2",(%3,%4,1); leaq (%3,%4,2),%3;" + +#define SAVE_SOLUTION_m4n2(c1,a_off)\ + "vpermilps $216,%%ymm"#c1",%%ymm"#c1"; vpermpd $216,%%ymm"#c1",%%ymm"#c1";"\ + "vmovups %%ymm"#c1","#a_off"(%0); vmovups %%xmm"#c1",(%3); vextractf128 $1,%%ymm"#c1",(%3,%4,1); leaq (%3,%4,2),%3;" + +#define SAVE_SOLUTION_m2n4(c1,c2,a_off)\ + "vunpcklps %%xmm"#c2",%%xmm"#c1",%%xmm0; vmovups %%xmm0,"#a_off"(%0); vmovsd %%xmm0,(%3); vmovhpd %%xmm0,(%3,%4,1); leaq (%3,%4,2),%3;"\ + "vunpckhps %%xmm"#c2",%%xmm"#c1",%%xmm0; vmovups %%xmm0,"#a_off"+16(%0); vmovsd %%xmm0,(%3); vmovhpd %%xmm0,(%3,%4,1); leaq (%3,%4,2),%3;" + +#define SAVE_SOLUTION_m1n4(c1,a_off)\ + "vmovups %%xmm"#c1","#a_off"(%0); vmovss %%xmm"#c1",(%3); vextractps $1,%%xmm"#c1",(%3,%4,1); leaq (%3,%4,2),%3;"\ + "vextractps $2,%%xmm"#c1",(%3); vextractps $3,%%xmm"#c1",(%3,%4,1); leaq (%3,%4,2),%3;" From cdc0e9011ea9911edb0027a207c67beeb0afca54 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Mon, 16 Mar 2020 16:39:37 +0000 Subject: [PATCH 18/19] Update KERNEL.ZEN --- kernel/x86_64/KERNEL.ZEN | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/x86_64/KERNEL.ZEN b/kernel/x86_64/KERNEL.ZEN index 1cd02db74..7bb308fea 100644 --- a/kernel/x86_64/KERNEL.ZEN +++ b/kernel/x86_64/KERNEL.ZEN @@ -74,10 +74,10 @@ ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) -STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +STRSMKERNEL_LN = strsm_kernel_8x4_haswell_LN.c +STRSMKERNEL_LT = strsm_kernel_8x4_haswell_LT.c +STRSMKERNEL_RN = strsm_kernel_8x4_haswell_RN.c +STRSMKERNEL_RT = strsm_kernel_8x4_haswell_RT.c DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c From 62b96089864a1951561671704c0de3e40bc7b477 Mon Sep 17 00:00:00 2001 From: wjc404 <52632443+wjc404@users.noreply.github.com> Date: Tue, 17 Mar 2020 12:52:55 +0800 Subject: [PATCH 19/19] Update KERNEL.SKYLAKEX --- kernel/x86_64/KERNEL.SKYLAKEX | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX index 9b3c83e42..333571fd4 100644 --- a/kernel/x86_64/KERNEL.SKYLAKEX +++ b/kernel/x86_64/KERNEL.SKYLAKEX @@ -6,6 +6,10 @@ SGEMMINCOPY = ../generic/gemm_ncopy_16.c SGEMMITCOPY = sgemm_tcopy_16_skylakex.c SGEMMONCOPY = sgemm_ncopy_4_skylakex.c SGEMMOTCOPY = ../generic/gemm_tcopy_4.c +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c DGEMMKERNEL = dgemm_kernel_16x2_skylakex.c DTRMMKERNEL = dgemm_kernel_16x2_skylakex.c