Merge pull request #38 from xianyi/develop

rebase
This commit is contained in:
Martin Kroeker 2020-03-20 01:05:22 +01:00 committed by GitHub
commit c0da205412
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
23 changed files with 2737 additions and 36 deletions

View File

@ -635,6 +635,16 @@ endif
ifeq ($(ARCH), arm64)
NO_BINARY_MODE = 1
BINARY_DEFINED = 1
ifdef INTERFACE64
ifneq ($(INTERFACE64), 0)
ifeq ($(F_COMPILER), GFORTRAN)
FCOMMON_OPT += -fdefault-integer-8
endif
ifeq ($(F_COMPILER), FLANG)
FCOMMON_OPT += -i8
endif
endif
endif
endif

View File

@ -78,8 +78,13 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \
chemm.goto zhemm.goto \
cherk.goto zherk.goto \
cher2k.goto zher2k.goto \
cher.goto zher.goto \
cher2.goto zher2.goto \
sgemv.goto dgemv.goto cgemv.goto zgemv.goto \
sspmv.goto dspmv.goto \
strmv.goto dtrmv.goto ctrmv.goto ztrmv.goto \
stpmv.goto dtpmv.goto ctpmv.goto ztpmv.goto \
stpsv.goto dtpsv.goto ctpsv.goto ztpsv.goto \
strsv.goto dtrsv.goto ctrsv.goto ztrsv.goto \
sgeev.goto dgeev.goto cgeev.goto zgeev.goto \
sgesv.goto dgesv.goto cgesv.goto zgesv.goto \
@ -115,8 +120,12 @@ acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \
chemm.acml zhemm.acml \
cherk.acml zherk.acml \
cher2k.acml zher2k.acml \
cher.acml zher.acml \
cher2.acml zher2.acml \
sgemv.acml dgemv.acml cgemv.acml zgemv.acml \
strmv.acml dtrmv.acml ctrmv.acml ztrmv.acml \
stpmv.acml dtpmv.acml ctpmv.acml ztpmv.acml \
stpsv.acml dtpsv.acml ctpsv.acml ztpsv.acml \
strsv.acml dtrsv.acml ctrsv.acml ztrsv.acml \
sgeev.acml dgeev.acml cgeev.acml zgeev.acml \
sgesv.acml dgesv.acml cgesv.acml zgesv.acml \
@ -153,8 +162,13 @@ atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \
chemm.atlas zhemm.atlas \
cherk.atlas zherk.atlas \
cher2k.atlas zher2k.atlas \
cher.atlas zher.atlas \
cher2.atlas zher2.atlas \
sgemv.atlas dgemv.atlas cgemv.atlas zgemv.atlas \
sspmv.atlas dspmv.atlas \
strmv.atlas dtrmv.atlas ctrmv.atlas ztrmv.atlas \
stpmv.atlas dtpmv.atlas ctpmv.atlas ztpmv.atlas \
stpsv.atlas dtpsv.atlas ctpsv.atlas ztpsv.atlas \
strsv.atlas dtrsv.atlas ctrsv.atlas ztrsv.atlas \
sgeev.atlas dgeev.atlas cgeev.atlas zgeev.atlas \
sgesv.atlas dgesv.atlas cgesv.atlas zgesv.atlas \
@ -190,8 +204,12 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \
chemm.mkl zhemm.mkl \
cherk.mkl zherk.mkl \
cher2k.mkl zher2k.mkl \
cher.mkl zher.mkl \
cher2.mkl zher2.mkl \
sgemv.mkl dgemv.mkl cgemv.mkl zgemv.mkl \
strmv.mkl dtrmv.mkl ctrmv.mkl ztrmv.mkl \
stpmv.mkl dtpmv.mkl ctpmv.mkl ztpmv.mkl \
stpsv.mkl dtpsv.mkl ctpsv.mkl ztpsv.mkl \
strsv.mkl dtrsv.mkl ctrsv.mkl ztrsv.mkl \
sgeev.mkl dgeev.mkl cgeev.mkl zgeev.mkl \
sgesv.mkl dgesv.mkl cgesv.mkl zgesv.mkl \
@ -227,8 +245,13 @@ goto :: sgemm.goto dgemm.goto cgemm.goto zgemm.goto \
chemm.goto zhemm.goto \
cherk.goto zherk.goto \
cher2k.goto zher2k.goto \
cher.goto zher.goto \
cher2.goto zher2.goto \
sgemv.goto dgemv.goto cgemv.goto zgemv.goto \
sspmv.goto dspmv.goto \
strmv.goto dtrmv.goto ctrmv.goto ztrmv.goto \
stpmv.goto dtpmv.goto ctpmv.goto ztpmv.goto \
stpsv.goto dtpsv.goto ctpsv.goto ztpsv.goto \
strsv.goto dtrsv.goto ctrsv.goto ztrsv.goto \
ssymm.goto dsymm.goto csymm.goto zsymm.goto \
smallscaling \
@ -270,8 +293,12 @@ acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \
chemm.acml zhemm.acml \
cherk.acml zherk.acml \
cher2k.acml zher2k.acml \
cher.acml zher.acml \
cher2.acml zher2.acml \
sgemv.acml dgemv.acml cgemv.acml zgemv.acml \
strmv.acml dtrmv.acml ctrmv.acml ztrmv.acml \
stpmv.acml dtpmv.acml ctpmv.acml ztpmv.acml \
stpsv.acml dtpsv.acml ctpsv.acml ztpsv.acml \
strsv.acml dtrsv.acml ctrsv.acml ztrsv.acml \
sgeev.acml dgeev.acml cgeev.acml zgeev.acml \
sgesv.acml dgesv.acml cgesv.acml zgesv.acml \
@ -308,8 +335,13 @@ atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \
chemm.atlas zhemm.atlas \
cherk.atlas zherk.atlas \
cher2k.atlas zher2k.atlas \
cher.atlas zher.atlas \
cher2.atlas zher2.atlas \
sgemv.atlas dgemv.atlas cgemv.atlas zgemv.atlas \
sspmv.atlas dspmv.atlas \
strmv.atlas dtrmv.atlas ctrmv.atlas ztrmv.atlas \
stpmv.atlas dtpmv.atlas ctpmv.atlas ztpmv.atlas \
stpsv.atlas dtpsv.atlas ctpsv.atlas ztpsv.atlas \
strsv.atlas dtrsv.atlas ctrsv.atlas ztrsv.atlas \
sgeev.atlas dgeev.atlas cgeev.atlas zgeev.atlas \
sgesv.atlas dgesv.atlas cgesv.atlas zgesv.atlas \
@ -347,8 +379,12 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \
chemm.mkl zhemm.mkl \
cherk.mkl zherk.mkl \
cher2k.mkl zher2k.mkl \
cher.mkl zher.mkl \
cher2.mkl zher2.mkl \
sgemv.mkl dgemv.mkl cgemv.mkl zgemv.mkl \
strmv.mkl dtrmv.mkl ctrmv.mkl ztrmv.mkl \
stpmv.mkl dtpmv.mkl ctpmv.mkl ztpmv.mkl \
stpsv.mkl dtpsv.mkl ctpsv.mkl ztpsv.mkl \
strsv.mkl dtrsv.mkl ctrsv.mkl ztrsv.mkl \
sgeev.mkl dgeev.mkl cgeev.mkl zgeev.mkl \
sgesv.mkl dgesv.mkl cgesv.mkl zgesv.mkl \
@ -395,8 +431,12 @@ veclib :: slinpack.veclib dlinpack.veclib clinpack.veclib zlinpack.veclib \
chemm.veclib zhemm.veclib \
cherk.veclib zherk.veclib \
cher2k.veclib zher2k.veclib \
cher.veclib zher.veclib \
cher2.veclib zher2.veclib \
sgemv.veclib dgemv.veclib cgemv.veclib zgemv.veclib \
strmv.veclib dtrmv.veclib ctrmv.veclib ztrmv.veclib \
stpmv.veclib dtpmv.veclib ctpmv.veclib ztpmv.veclib \
stpsv.veclib dtpsv.veclib ctpsv.veclib ztpsv.veclib \
strsv.veclib dtrsv.veclib ctrsv.veclib ztrsv.veclib \
sgeev.veclib dgeev.veclib cgeev.veclib zgeev.veclib \
sgesv.veclib dgesv.veclib cgesv.veclib zgesv.veclib \
@ -1231,6 +1271,74 @@ zher2k.mkl : zher2k.$(SUFFIX)
zher2k.veclib : zher2k.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
##################################### Cher ####################################################
cher.goto : cher.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
cher.acml : cher.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
cher.atlas : cher.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
cher.mkl : cher.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
cher.veclib : cher.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
##################################### Zher ####################################################
zher.goto : zher.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
zher.acml : zher.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
zher.atlas : zher.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
zher.mkl : zher.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
zher.veclib : zher.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
##################################### Cher2 ####################################################
cher2.goto : cher2.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
cher2.acml : cher2.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
cher2.atlas : cher2.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
cher2.mkl : cher2.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
cher2.veclib : cher2.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
##################################### Zher2 ####################################################
zher2.goto : zher2.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
zher2.acml : zher2.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
zher2.atlas : zher2.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
zher2.mkl : zher2.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
zher2.veclib : zher2.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
##################################### Sgemv ####################################################
sgemv.goto : sgemv.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
@ -1297,6 +1405,20 @@ zgemv.mkl : zgemv.$(SUFFIX)
zgemv.veclib : zgemv.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
##################################### Sspmv ####################################################
sspmv.goto : sspmv.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
sspmv.atlas : sspmv.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
##################################### Dspmv ####################################################
dspmv.goto : dspmv.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
dspmv.atlas : dspmv.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
##################################### Strmv ####################################################
strmv.goto : strmv.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
@ -1363,6 +1485,139 @@ ztrmv.mkl : ztrmv.$(SUFFIX)
ztrmv.veclib : ztrmv.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
##################################### Stpmv ####################################################
stpmv.goto : stpmv.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
stpmv.acml : stpmv.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
stpmv.atlas : stpmv.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
stpmv.mkl : stpmv.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
stpmv.veclib : stpmv.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
##################################### Dtpmv ####################################################
dtpmv.goto : dtpmv.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
dtpmv.acml : dtpmv.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
dtpmv.atlas : dtpmv.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
dtpmv.mkl : dtpmv.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
dtpmv.veclib : dtpmv.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
##################################### Ctpmv ####################################################
ctpmv.goto : ctpmv.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
ctpmv.acml : ctpmv.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
ctpmv.atlas : ctpmv.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
ctpmv.mkl : ctpmv.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
ctpmv.veclib : ctpmv.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
##################################### Ztpmv ####################################################
ztpmv.goto : ztpmv.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
ztpmv.acml : ztpmv.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
ztpmv.atlas : ztpmv.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
ztpmv.mkl : ztpmv.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
ztpmv.veclib : ztpmv.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
##################################### Stpsv ####################################################
stpsv.goto : stpsv.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
stpsv.acml : stpsv.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
stpsv.atlas : stpsv.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
stpsv.mkl : stpsv.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
stpsv.veclib : stpsv.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
##################################### Dtpsv ####################################################
dtpsv.goto : dtpsv.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
dtpsv.acml : dtpsv.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
dtpsv.atlas : dtpsv.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
dtpsv.mkl : dtpsv.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
dtpsv.veclib : dtpsv.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
##################################### Ctpsv ####################################################
ctpsv.goto : ctpsv.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
ctpsv.acml : ctpsv.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
ctpsv.atlas : ctpsv.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
ctpsv.mkl : ctpsv.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
ctpsv.veclib : ctpsv.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
##################################### Ztpsv ####################################################
ztpsv.goto : ztpsv.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
ztpsv.acml : ztpsv.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
ztpsv.atlas : ztpsv.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
ztpsv.mkl : ztpsv.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
ztpsv.veclib : ztpsv.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
##################################### Strsv ####################################################
strsv.goto : strsv.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
@ -2775,6 +3030,18 @@ cher2k.$(SUFFIX) : her2k.c
zher2k.$(SUFFIX) : her2k.c
$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
cher.$(SUFFIX) : her.c
$(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^
zher.$(SUFFIX) : her.c
$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
cher2.$(SUFFIX) : her2.c
$(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^
zher2.$(SUFFIX) : her2.c
$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
sgemv.$(SUFFIX) : gemv.c
$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
@ -2787,6 +3054,12 @@ cgemv.$(SUFFIX) : gemv.c
zgemv.$(SUFFIX) : gemv.c
$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
sspmv.$(SUFFIX) : spmv.c
$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
dspmv.$(SUFFIX) : spmv.c
$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^
strmv.$(SUFFIX) : trmv.c
$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
@ -2799,6 +3072,30 @@ ctrmv.$(SUFFIX) : trmv.c
ztrmv.$(SUFFIX) : trmv.c
$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
stpmv.$(SUFFIX) : tpmv.c
$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
dtpmv.$(SUFFIX) : tpmv.c
$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^
ctpmv.$(SUFFIX) : tpmv.c
$(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^
ztpmv.$(SUFFIX) : tpmv.c
$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
stpsv.$(SUFFIX) : tpsv.c
$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
dtpsv.$(SUFFIX) : tpsv.c
$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^
ctpsv.$(SUFFIX) : tpsv.c
$(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^
ztpsv.$(SUFFIX) : tpsv.c
$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
strsv.$(SUFFIX) : trsv.c
$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
@ -3124,4 +3421,3 @@ clean ::
@rm -f *.goto *.mkl *.acml *.atlas *.veclib *.essl smallscaling
include $(TOPDIR)/Makefile.tail

186
benchmark/her.c Normal file
View File

@ -0,0 +1,186 @@
/***************************************************************************
Copyright (c) 2020, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef HER
#ifdef DOUBLE
#define HER BLASFUNC(zher)
#else
#define HER BLASFUNC(cher)
#endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]){
FLOAT *a, *x;
FLOAT alpha[] = {1.0, 1.0};
blasint incx = 1;
char *p;
char uplo='U';
char trans='N';
if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p;
if ((p = getenv("OPENBLAS_TRANS"))) trans=*p;
blasint m, i, j;
int from = 1;
int to = 200;
int step = 1;
struct timeval start, stop;
double time1;
argc--;argv++;
if (argc > 0) { from = atol(*argv); argc--; argv++;}
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
if (argc > 0) { step = atol(*argv); argc--; argv++;}
fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = %c Trans = %c\n", from, to, step,uplo,trans);
if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef linux
srandom(getpid());
#endif
fprintf(stderr, " SIZE Flops\n");
for(m = from; m <= to; m += step)
{
fprintf(stderr, " %6d : ", (int)m);
for(j = 0; j < m; j++){
for(i = 0; i < m * COMPSIZE; i++){
a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
x[ (long)j * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
gettimeofday( &start, (struct timezone *)0);
HER (&uplo, &m, alpha, x, &incx, a, &m );
gettimeofday( &stop, (struct timezone *)0);
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
gettimeofday( &start, (struct timezone *)0);
fprintf(stderr,
" %10.2f MFlops\n",
COMPSIZE * COMPSIZE * 1. * (double)m * (double)m / time1 * 1.e-6);
}
return 0;
}

190
benchmark/her2.c Normal file
View File

@ -0,0 +1,190 @@
/***************************************************************************
Copyright (c) 2020, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef HER2
#ifdef DOUBLE
#define HER2 BLASFUNC(zher2)
#else
#define HER2 BLASFUNC(cher2)
#endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]){
FLOAT *a, *x, *y;
FLOAT alpha[] = {1.0, 1.0};
blasint inc = 1;
char *p;
char uplo='U';
char trans='N';
if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p;
if ((p = getenv("OPENBLAS_TRANS"))) trans=*p;
blasint m, i, j;
int from = 1;
int to = 200;
int step = 1;
struct timeval start, stop;
double time1;
argc--;argv++;
if (argc > 0) { from = atol(*argv); argc--; argv++;}
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
if (argc > 0) { step = atol(*argv); argc--; argv++;}
fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = %c Trans = %c\n", from, to, step,uplo,trans);
if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef linux
srandom(getpid());
#endif
fprintf(stderr, " SIZE Flops\n");
for(m = from; m <= to; m += step)
{
fprintf(stderr, " %6d : ", (int)m);
for(j = 0; j < m; j++){
for(i = 0; i < m * COMPSIZE; i++){
a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
x[ (long)j * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
y[ (long)j * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
gettimeofday( &start, (struct timezone *)0);
HER2 (&uplo, &m, alpha, x, &inc, y, &inc, a, &m );
gettimeofday( &stop, (struct timezone *)0);
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
gettimeofday( &start, (struct timezone *)0);
fprintf(stderr,
" %10.2f MFlops\n",
COMPSIZE * COMPSIZE * 2. * (double)m * (double)m / time1 * 1.e-6);
}
return 0;
}

View File

@ -181,7 +181,7 @@ int main(int argc, char *argv[]){
timeg /= loops;
fprintf(stderr,
" %10.2f MFlops %10.6f sec\n",
" %10.2f MBytes %10.6f sec\n",
COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg);
}

219
benchmark/spmv.c Normal file
View File

@ -0,0 +1,219 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef SPMV
#ifndef COMPLEX
#ifdef DOUBLE
#define SPMV BLASFUNC(dspmv)
#else
#define SPMV BLASFUNC(sspmv)
#endif
#else
#ifdef DOUBLE
#define SPMV BLASFUNC(zspmv)
#else
#define SPMV BLASFUNC(cspmv)
#endif
#endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]){
FLOAT *a, *x, *y;
FLOAT alpha[] = {1.0, 1.0};
FLOAT beta [] = {1.0, 1.0};
char uplo='L';
blasint m, i, j;
blasint inc_x=1,inc_y=1;
int loops = 1;
int l;
char *p;
int from = 1;
int to = 200;
int step = 1;
struct timeval start, stop;
double time1,timeg;
argc--;argv++;
if (argc > 0) { from = atol(*argv); argc--; argv++;}
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
if (argc > 0) { step = atol(*argv); argc--; argv++;}
if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p);
if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p;
fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = '%c' Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,uplo,inc_x,inc_y,loops);
if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef linux
srandom(getpid());
#endif
fprintf(stderr, " SIZE Flops\n");
for(m = from; m <= to; m += step)
{
timeg=0;
fprintf(stderr, " %6dx%d : ", (int)m,(int)m);
for(j = 0; j < m; j++){
for(i = 0; i < m * COMPSIZE; i++){
a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
}
for (l=0; l<loops; l++)
{
for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
gettimeofday( &start, (struct timezone *)0);
SPMV (&uplo, &m, alpha, a, x, &inc_x, beta, y, &inc_y );
gettimeofday( &stop, (struct timezone *)0);
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
timeg += time1;
}
timeg /= loops;
fprintf(stderr,
" %10.2f MFlops\n",
COMPSIZE * COMPSIZE * 2. * (double)m * (double)m / timeg * 1.e-6);
}
return 0;
}
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

View File

@ -187,7 +187,7 @@ int main(int argc, char *argv[]){
timeg /= loops;
fprintf(stderr,
" %10.2f MBytes %10.6f sec\n",
" %10.2f MFlops %10.6f sec\n",
COMPSIZE * COMPSIZE * 1. * (double)m * (double)m / timeg * 1.e-6, timeg);
}

View File

@ -196,7 +196,7 @@ int main(int argc, char *argv[]){
timeg /= loops;
fprintf(stderr,
" %10.2f MBytes %10.6f sec\n",
" %10.2f MFlops %10.6f sec\n",
COMPSIZE * COMPSIZE * 2. * (double)m * (double)m / timeg * 1.e-6, timeg);
}

172
benchmark/tpmv.c Normal file
View File

@ -0,0 +1,172 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef TPMV
#ifndef COMPLEX
#ifdef DOUBLE
#define TPMV BLASFUNC(dtpmv)
#else
#define TPMV BLASFUNC(stpmv)
#endif
#else
#ifdef DOUBLE
#define TPMV BLASFUNC(ztpmv)
#else
#define TPMV BLASFUNC(ctpmv)
#endif
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size)
{
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1) {
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[])
{
FLOAT *a, *x;
char *p;
char uplo ='U';
char trans='N';
char diag ='U';
int loops = 1;
int l;
blasint inc_x=1;
if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p;
if ((p = getenv("OPENBLAS_TRANS"))) trans=*p;
if ((p = getenv("OPENBLAS_DIAG"))) diag=*p;
if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
blasint n, i, j;
int from = 1;
int to = 200;
int step = 1;
struct timespec start = { 0, 0 }, stop = { 0, 0 };
double time1, timeg;
argc--;argv++;
if (argc > 0) { from = atol(*argv); argc--; argv++;}
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
if (argc > 0) { step = atol(*argv); argc--; argv++;}
fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = %c Trans = %c Diag = %c Loops=%d Inc_x=%d\n", from,
to, step, uplo, trans, diag, loops, inc_x);
if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL) {
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef linux
srandom(getpid());
#endif
fprintf(stderr, " SIZE Flops\n");
for(n = from; n <= to; n += step) {
timeg=0;
fprintf(stderr, " %6d : ", (int)n);
for(j = 0; j < n; j++) {
for(i = 0; i < n * COMPSIZE; i++) {
a[(long)i + (long)j * (long)n * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
}
for (i = 0; i < n * COMPSIZE * abs(inc_x); i++) {
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
for (l = 0; l < loops; l++) {
clock_gettime(CLOCK_REALTIME, &start);
TPMV (&uplo, &trans, &diag, &n, a, x, &inc_x);
clock_gettime(CLOCK_REALTIME, &stop);
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_nsec - start.tv_nsec)) / 1.e9;
timeg += time1;
}
timeg /= loops;
fprintf(stderr, " %10.2f MFlops %12.9f sec\n",
COMPSIZE * COMPSIZE * 1. * (double)n * (double)n / timeg / 1.e6, timeg);
}
return 0;
}
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

172
benchmark/tpsv.c Normal file
View File

@ -0,0 +1,172 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stdio.h>
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef TPSV
#ifndef COMPLEX
#ifdef DOUBLE
#define TPSV BLASFUNC(dtpsv)
#else
#define TPSV BLASFUNC(stpsv)
#endif
#else
#ifdef DOUBLE
#define TPSV BLASFUNC(ztpsv)
#else
#define TPSV BLASFUNC(ctpsv)
#endif
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size)
{
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1) {
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[])
{
FLOAT *a, *x;
char *p;
char uplo ='U';
char trans='N';
char diag ='U';
int loops = 1;
int l;
blasint inc_x=1;
if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p;
if ((p = getenv("OPENBLAS_TRANS"))) trans=*p;
if ((p = getenv("OPENBLAS_DIAG"))) diag=*p;
if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
blasint n, i, j;
int from = 1;
int to = 200;
int step = 1;
struct timespec start = { 0, 0 }, stop = { 0, 0 };
double time1, timeg;
argc--;argv++;
if (argc > 0) { from = atol(*argv); argc--; argv++;}
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
if (argc > 0) { step = atol(*argv); argc--; argv++;}
fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = %c Trans = %c Diag = %c Loops=%d Inc_x=%d\n", from,
to, step, uplo, trans, diag, loops, inc_x);
if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL) {
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
fprintf(stderr,"Out of Memory!!\n");exit(1);
}
#ifdef linux
srandom(getpid());
#endif
fprintf(stderr, " SIZE Flops\n");
for(n = from; n <= to; n += step) {
timeg=0;
fprintf(stderr, " %6d : ", (int)n);
for(j = 0; j < n; j++) {
for(i = 0; i < n * COMPSIZE; i++) {
a[(long)i + (long)j * (long)n * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
}
for (i = 0; i < n * COMPSIZE * abs(inc_x); i++) {
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
for (l = 0; l < loops; l++) {
clock_gettime(CLOCK_REALTIME, &start);
TPSV (&uplo, &trans, &diag, &n, a, x, &inc_x);
clock_gettime(CLOCK_REALTIME, &stop);
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_nsec - start.tv_nsec)) / 1.e9;
timeg += time1;
}
timeg /= loops;
fprintf(stderr, " %10.2f MFlops %12.9f sec\n",
COMPSIZE * COMPSIZE * 1. * (double)n * (double)n / timeg / 1.e6, timeg);
}
return 0;
}
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));

View File

@ -40,8 +40,11 @@ ztestl3o = c_zblas3.o c_z3chke.o auxiliary.o c_xerbla.o constant.o
ztestl3o_3m = c_zblas3_3m.o c_z3chke_3m.o auxiliary.o c_xerbla.o constant.o
ifeq ($(NOFORTRAN),1)
all ::
else
all :: all1 all2 all3
endif
all1: xscblat1 xdcblat1 xccblat1 xzcblat1
ifndef CROSS

View File

@ -77,10 +77,10 @@ ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
STRSMKERNEL_LN = strsm_kernel_8x4_haswell_LN.c
STRSMKERNEL_LT = strsm_kernel_8x4_haswell_LT.c
STRSMKERNEL_RN = strsm_kernel_8x4_haswell_RN.c
STRSMKERNEL_RT = strsm_kernel_8x4_haswell_RT.c
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c

View File

@ -6,6 +6,10 @@ SGEMMINCOPY = ../generic/gemm_ncopy_16.c
SGEMMITCOPY = sgemm_tcopy_16_skylakex.c
SGEMMONCOPY = sgemm_ncopy_4_skylakex.c
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
DGEMMKERNEL = dgemm_kernel_16x2_skylakex.c
DTRMMKERNEL = dgemm_kernel_16x2_skylakex.c

View File

@ -74,10 +74,10 @@ ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
STRSMKERNEL_LN = strsm_kernel_8x4_haswell_LN.c
STRSMKERNEL_LT = strsm_kernel_8x4_haswell_LT.c
STRSMKERNEL_RN = strsm_kernel_8x4_haswell_RN.c
STRSMKERNEL_RT = strsm_kernel_8x4_haswell_RT.c
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c

View File

@ -0,0 +1,240 @@
#include "common.h"
#include <stdint.h>
#include "strsm_kernel_8x4_haswell_L_common.h"
#define SOLVE_LN_m1n4 \
"subq $4,%2; movq %2,%3;" GEMM_SUM_REORDER_1x4(4)\
SOLVE_m1n4(-4,4) SAVE_b_m1n4(-16,4)\
"movq %2,%3;" save_c_m1n4(4)
#define SOLVE_LN_m1n8 \
"subq $4,%2; movq %2,%3;" GEMM_SUM_REORDER_1x4(4) GEMM_SUM_REORDER_1x4(5)\
SOLVE_m1n8(-4,4,5) SAVE_b_m1n8(-16,4,5)\
"movq %2,%3;" save_c_m1n4(4) save_c_m1n4(5)
#define SOLVE_LN_m1n12 \
"subq $4,%2; movq %2,%3;" GEMM_SUM_REORDER_1x4(4) GEMM_SUM_REORDER_1x4(5) GEMM_SUM_REORDER_1x4(6)\
SOLVE_m1n12(-4,4,5,6) SAVE_b_m1n12(-16,4,5,6)\
"movq %2,%3;" save_c_m1n4(4) save_c_m1n4(5) save_c_m1n4(6)
#define SOLVE_LN_m2n4 \
"subq $8,%2; movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5,4)\
SOLVE_loup_m2n4(-8,4)\
SOLVE_up_m2n4(-16,4) SAVE_b_m2n4(-32,4)\
"movq %2,%3;" save_c_m2n4(4)
#define SOLVE_LN_m2n8 \
"subq $8,%2; movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5,4) GEMM_SUM_REORDER_2x4(6,7,5)\
SOLVE_loup_m2n8(-8,4,5)\
SOLVE_up_m2n8(-16,4,5) SAVE_b_m2n8(-32,4,5)\
"movq %2,%3;" save_c_m2n4(4) save_c_m2n4(5)
#define SOLVE_LN_m2n12 \
"subq $8,%2; movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5,4) GEMM_SUM_REORDER_2x4(6,7,5) GEMM_SUM_REORDER_2x4(8,9,6)\
SOLVE_loup_m2n12(-8,4,5,6)\
SOLVE_up_m2n12(-16,4,5,6) SAVE_b_m2n12(-32,4,5,6)\
"movq %2,%3;" save_c_m2n4(4) save_c_m2n4(5) save_c_m2n4(6)
#define SOLVE_LN_m4n4 \
"subq $16,%2; movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5)\
\
SOLVE_loup_m2n4(-8,5) SUBTRACT_m2n4(-16,4)\
SOLVE_up_m2n4(-24,5) SUBTRACT_m2n4(-32,4) SAVE_b_m2n4(-32,5)\
\
SOLVE_loup_m2n4(-48,4)\
SOLVE_up_m2n4(-64,4) SAVE_b_m2n4(-64,4)\
\
"movq %2,%3;" save_c_m4n4(4,5)
#define SOLVE_LN_m4n8 \
"subq $16,%2; movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) GEMM_SUM_REORDER_4x4(8,9,10,11,6,7)\
\
SOLVE_loup_m2n8(-8,5,7) SUBTRACT_m2n8(-16,4,6)\
SOLVE_up_m2n8(-24,5,7) SUBTRACT_m2n8(-32,4,6) SAVE_b_m2n8(-32,5,7)\
\
SOLVE_loup_m2n8(-48,4,6)\
SOLVE_up_m2n8(-64,4,6) SAVE_b_m2n8(-64,4,6)\
\
"movq %2,%3;" save_c_m4n4(4,5) save_c_m4n4(6,7)
#define SOLVE_LN_m4n12 \
"subq $16,%2; movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) GEMM_SUM_REORDER_4x4(8,9,10,11,6,7) GEMM_SUM_REORDER_4x4(12,13,14,15,8,9)\
\
SOLVE_loup_m2n12(-8,5,7,9) SUBTRACT_m2n12(-16,4,6,8)\
SOLVE_up_m2n12(-24,5,7,9) SUBTRACT_m2n12(-32,4,6,8) SAVE_b_m2n12(-32,5,7,9)\
\
SOLVE_loup_m2n12(-48,4,6,8)\
SOLVE_up_m2n12(-64,4,6,8) SAVE_b_m2n12(-64,4,6,8)\
\
"movq %2,%3;" save_c_m4n4(4,5) save_c_m4n4(6,7) save_c_m4n4(8,9)
#define SOLVE_LN_m8n4 \
"subq $32,%2; movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,-32)\
\
SOLVE_loup_m2n4(-8,7) SUBTRACT_m2n4(-16,6) SUBTRACT_m2n4(-24,5) SUBTRACT_m2n4(-32,4)\
SOLVE_up_m2n4(-40,7) SUBTRACT_m2n4(-48,6) SUBTRACT_m2n4(-56,5) SUBTRACT_m2n4(-64,4) SAVE_b_m2n4(-32,7)\
\
SOLVE_loup_m2n4(-80,6) SUBTRACT_m2n4(-88,5) SUBTRACT_m2n4(-96,4)\
SOLVE_up_m2n4(-112,6) SUBTRACT_m2n4(-120,5) SUBTRACT_m2n4(-128,4) SAVE_b_m2n4(-64,6)\
\
SOLVE_loup_m2n4(-152,5) SUBTRACT_m2n4(-160,4)\
SOLVE_up_m2n4(-184,5) SUBTRACT_m2n4(-192,4) SAVE_b_m2n4(-96,5)\
\
SOLVE_loup_m2n4(-224,4)\
SOLVE_up_m2n4(-256,4) SAVE_b_m2n4(-128,4)\
\
"movq %2,%3;" save_c_m8n4(4,5,6,7)
#define SOLVE_LN_m8n8 \
"subq $32,%2; movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,-32) GEMM_SUM_REORDER_8x4(8,9,10,11,-32)\
\
SOLVE_loup_m2n8(-8,7,11) SUBTRACT_m2n8(-16,6,10) SUBTRACT_m2n8(-24,5,9) SUBTRACT_m2n8(-32,4,8)\
SOLVE_up_m2n8(-40,7,11) SUBTRACT_m2n8(-48,6,10) SUBTRACT_m2n8(-56,5,9) SUBTRACT_m2n8(-64,4,8) SAVE_b_m2n8(-32,7,11)\
\
SOLVE_loup_m2n8(-80,6,10) SUBTRACT_m2n8(-88,5,9) SUBTRACT_m2n8(-96,4,8)\
SOLVE_up_m2n8(-112,6,10) SUBTRACT_m2n8(-120,5,9) SUBTRACT_m2n8(-128,4,8) SAVE_b_m2n8(-64,6,10)\
\
SOLVE_loup_m2n8(-152,5,9) SUBTRACT_m2n8(-160,4,8)\
SOLVE_up_m2n8(-184,5,9) SUBTRACT_m2n8(-192,4,8) SAVE_b_m2n8(-96,5,9)\
\
SOLVE_loup_m2n8(-224,4,8)\
SOLVE_up_m2n8(-256,4,8) SAVE_b_m2n8(-128,4,8)\
\
"movq %2,%3;" save_c_m8n4(4,5,6,7) save_c_m8n4(8,9,10,11)
#define SOLVE_LN_m8n12 \
"subq $32,%2; movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,-32) GEMM_SUM_REORDER_8x4(8,9,10,11,-32) GEMM_SUM_REORDER_8x4(12,13,14,15,-32)\
\
SOLVE_loup_m2n12(-8,7,11,15) SUBTRACT_m2n12(-16,6,10,14) SUBTRACT_m2n12(-24,5,9,13) SUBTRACT_m2n12(-32,4,8,12)\
SOLVE_up_m2n12(-40,7,11,15) SUBTRACT_m2n12(-48,6,10,14) SUBTRACT_m2n12(-56,5,9,13) SUBTRACT_m2n12(-64,4,8,12) SAVE_b_m2n12(-32,7,11,15)\
\
SOLVE_loup_m2n12(-80,6,10,14) SUBTRACT_m2n12(-88,5,9,13) SUBTRACT_m2n12(-96,4,8,12)\
SOLVE_up_m2n12(-112,6,10,14) SUBTRACT_m2n12(-120,5,9,13) SUBTRACT_m2n12(-128,4,8,12) SAVE_b_m2n12(-64,6,10,14)\
\
SOLVE_loup_m2n12(-152,5,9,13) SUBTRACT_m2n12(-160,4,8,12)\
SOLVE_up_m2n12(-184,5,9,13) SUBTRACT_m2n12(-192,4,8,12) SAVE_b_m2n12(-96,5,9,13)\
\
SOLVE_loup_m2n12(-224,4,8,12)\
SOLVE_up_m2n12(-256,4,8,12) SAVE_b_m2n12(-128,4,8,12)\
\
"movq %2,%3;" save_c_m8n4(4,5,6,7) save_c_m8n4(8,9,10,11) save_c_m8n4(12,13,14,15)
/* r13 = k-kk, r14 = b_tail, r15 = a_tail */
#define GEMM_LN_SIMPLE(mdim,ndim) \
"movq %%r15,%0; negq %%r12; leaq (%%r15,%%r12,"#mdim"),%%r15; negq %%r12;"\
"movq %%r13,%5; addq $"#mdim",%%r13; movq %%r14,%1;" INIT_m##mdim##n##ndim\
"testq %5,%5; jz 2"#mdim""#ndim"2f;"\
"2"#mdim""#ndim"1:\n\t"\
"subq $16,%1; subq $"#mdim"*4,%0;" GEMM_KERNEL_k1m##mdim##n##ndim "decq %5; jnz 2"#mdim""#ndim"1b;"\
"2"#mdim""#ndim"2:\n\t"
#define GEMM_LN_m8n4 GEMM_LN_SIMPLE(8,4)
#define GEMM_LN_m8n8 GEMM_LN_SIMPLE(8,8)
#define GEMM_LN_m8n12 \
"movq %%r15,%0; negq %%r12; leaq (%%r15,%%r12,8),%%r15; negq %%r12; movq %%r13,%5; addq $8,%%r13; movq %%r14,%1;" INIT_m8n12\
"cmpq $8,%5; jb 28122f;"\
"28121:\n\t"\
"prefetcht0 -384(%0); subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\
"subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\
"prefetcht0 -384(%0); subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\
"subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\
"prefetcht0 -384(%0); subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\
"subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\
"prefetcht0 -384(%0); subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\
"subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\
"subq $8,%5; cmpq $8,%5; jnb 28121b;"\
"28122:\n\t"\
"testq %5,%5; jz 28124f;"\
"28123:\n\t"\
"subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12 "decq %5; jnz 28123b;"\
"28124:\n\t"
#define GEMM_LN_m4n4 GEMM_LN_SIMPLE(4,4)
#define GEMM_LN_m4n8 GEMM_LN_SIMPLE(4,8)
#define GEMM_LN_m4n12 GEMM_LN_SIMPLE(4,12)
#define GEMM_LN_m2n4 GEMM_LN_SIMPLE(2,4)
#define GEMM_LN_m2n8 GEMM_LN_SIMPLE(2,8)
#define GEMM_LN_m2n12 GEMM_LN_SIMPLE(2,12)
#define GEMM_LN_m1n4 GEMM_LN_SIMPLE(1,4)
#define GEMM_LN_m1n8 GEMM_LN_SIMPLE(1,8)
#define GEMM_LN_m1n12 GEMM_LN_SIMPLE(1,12)
#define COMPUTE(ndim) {\
c_ptr += M;\
__asm__ __volatile__(\
"movq %0,%%r15; movq %7,%%r13; movq %6,%%r12; salq $2,%%r12; leaq (%1,%%r12,4),%%r14; movq %10,%%r11;"\
"testq $1,%%r11; jz "#ndim"772f;"\
#ndim"771:\n\t"\
GEMM_LN_m1n##ndim SOLVE_LN_m1n##ndim "subq $1,%%r11;"\
#ndim"772:\n\t"\
"testq $2,%%r11; jz "#ndim"773f;"\
GEMM_LN_m2n##ndim SOLVE_LN_m2n##ndim "subq $2,%%r11;"\
#ndim"773:\n\t"\
"testq $4,%%r11; jz "#ndim"774f;"\
GEMM_LN_m4n##ndim SOLVE_LN_m4n##ndim "subq $4,%%r11;"\
#ndim"774:\n\t"\
"testq %%r11,%%r11; jz "#ndim"776f;"\
#ndim"775:\n\t"\
GEMM_LN_m8n##ndim SOLVE_LN_m8n##ndim "subq $8,%%r11; jnz "#ndim"775b;"\
#ndim"776:\n\t"\
"movq %%r15,%0; movq %%r14,%1; vzeroupper;"\
:"+r"(a_ptr),"+r"(b_ptr),"+r"(c_ptr),"+r"(c_tmp),"+r"(ldc_bytes),"+r"(k_cnt):"m"(K),"m"(kmkkinp),"m"(one[0]),"m"(zero[0]),"m"(M)\
:"r11","r12","r13","r14","r15","cc","memory",\
"xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15");\
a_ptr += M * K; b_ptr += (ndim-4) * K; c_ptr += ldc * ndim;\
}
static void solve_LN(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
FLOAT a0, b0;
int i, j, k;
for (i=m-1;i>=0;i--) {
a0 = a[i*m+i]; //reciprocal of the original value
for (j=0;j<n;j++) {
b0 = c[j*ldc+i]*a0;
c[j*ldc+i] = b[i*n+j] = b0;
for (k=0;k<i;k++) c[j*ldc+k] -= b0*a[i*m+k];
}
}
}
static void COMPUTE_EDGE_1_nchunk(BLASLONG m, BLASLONG n, FLOAT *sa, FLOAT *sb, FLOAT *C, BLASLONG ldc, BLASLONG k, BLASLONG offset) {
BLASLONG m_count = m, kk = m+offset; FLOAT *a_ptr = sa+m*k, *c_ptr = C+m;
if(m_count&1){
a_ptr-=k; c_ptr--;
if(k-kk>0) GEMM_KERNEL_N(1,n,k-kk,-1.0,a_ptr+kk*1,sb+kk*n,c_ptr,ldc);
solve_LN(1,n,a_ptr+(kk-1)*1,sb+(kk-1)*n,c_ptr,ldc);
kk -= 1;
m_count--;
}
if(m_count&2){
a_ptr-=k*2; c_ptr-=2;
if(k-kk>0) GEMM_KERNEL_N(2,n,k-kk,-1.0,a_ptr+kk*2,sb+kk*n,c_ptr,ldc);
solve_LN(2,n,a_ptr+(kk-2)*2,sb+(kk-2)*n,c_ptr,ldc);
kk -= 2;
m_count-=2;
}
if(m_count&4){
a_ptr-=k*4; c_ptr-=4;
if(k-kk>0) GEMM_KERNEL_N(4,n,k-kk,-1.0,a_ptr+kk*4,sb+kk*n,c_ptr,ldc);
solve_LN(4,n,a_ptr+(kk-4)*4,sb+(kk-4)*n,c_ptr,ldc);
kk -= 4;
m_count-=4;
}
for(;m_count>7;m_count-=8){
a_ptr-=k*8; c_ptr-=8;
if(k-kk>0) GEMM_KERNEL_N(8,n,k-kk,-1.0,a_ptr+kk*8,sb+kk*n,c_ptr,ldc);
solve_LN(8,n,a_ptr+(kk-8)*8,sb+(kk-8)*n,c_ptr,ldc);
kk -= 8;
}
}
int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *sa, FLOAT *sb, FLOAT *C, BLASLONG ldc, BLASLONG offset){
float *a_ptr = sa+m*k, *b_ptr = sb, *c_ptr = C, *c_tmp = C;
float one[8] = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0};
float zero[8] = {0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0};
uint64_t ldc_bytes = (uint64_t)ldc * sizeof(float), K = (uint64_t)k, M = (uint64_t)m, kmkkinp = (uint64_t)(k-m-offset), k_cnt = 0;
BLASLONG n_count = n;
for(;n_count>11;n_count-=12) COMPUTE(12)
for(;n_count>7;n_count-=8) COMPUTE(8)
for(;n_count>3;n_count-=4) COMPUTE(4)
for(;n_count>1;n_count-=2) { COMPUTE_EDGE_1_nchunk(m,2,sa,b_ptr,c_ptr,ldc,k,offset); b_ptr += 2*k; c_ptr += ldc*2;}
if(n_count>0) COMPUTE_EDGE_1_nchunk(m,1,sa,b_ptr,c_ptr,ldc,k,offset);
return 0;
}

View File

@ -0,0 +1,228 @@
#include "common.h"
#include <stdint.h>
#include "strsm_kernel_8x4_haswell_L_common.h"
#define SOLVE_LT_m1n4 \
"movq %2,%3;" GEMM_SUM_REORDER_1x4(4)\
SOLVE_m1n4(0,4) SAVE_b_m1n4(0,4)\
"movq %2,%3; addq $4,%2;" save_c_m1n4(4)
#define SOLVE_LT_m1n8 \
"movq %2,%3;" GEMM_SUM_REORDER_1x4(4) GEMM_SUM_REORDER_1x4(5)\
SOLVE_m1n8(0,4,5) SAVE_b_m1n8(0,4,5)\
"movq %2,%3; addq $4,%2;" save_c_m1n4(4) save_c_m1n4(5)
#define SOLVE_LT_m1n12 \
"movq %2,%3;" GEMM_SUM_REORDER_1x4(4) GEMM_SUM_REORDER_1x4(5) GEMM_SUM_REORDER_1x4(6)\
SOLVE_m1n12(0,4,5,6) SAVE_b_m1n12(0,4,5,6)\
"movq %2,%3; addq $4,%2;" save_c_m1n4(4) save_c_m1n4(5) save_c_m1n4(6)
#define SOLVE_LT_m2n4 \
"movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5,4)\
SOLVE_uplo_m2n4(0,4)\
SOLVE_lo_m2n4(8,4) SAVE_b_m2n4(0,4)\
"movq %2,%3; addq $8,%2;" save_c_m2n4(4)
#define SOLVE_LT_m2n8 \
"movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5,4) GEMM_SUM_REORDER_2x4(6,7,5)\
SOLVE_uplo_m2n8(0,4,5)\
SOLVE_lo_m2n8(8,4,5) SAVE_b_m2n8(0,4,5)\
"movq %2,%3; addq $8,%2;" save_c_m2n4(4) save_c_m2n4(5)
#define SOLVE_LT_m2n12 \
"movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5,4) GEMM_SUM_REORDER_2x4(6,7,5) GEMM_SUM_REORDER_2x4(8,9,6)\
SOLVE_uplo_m2n12(0,4,5,6)\
SOLVE_lo_m2n12(8,4,5,6) SAVE_b_m2n12(0,4,5,6)\
"movq %2,%3; addq $8,%2;" save_c_m2n4(4) save_c_m2n4(5) save_c_m2n4(6)
#define SOLVE_LT_m4n4 \
"movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5)\
\
SOLVE_uplo_m2n4(0,4) SUBTRACT_m2n4(8,5)\
SOLVE_lo_m2n4(16,4) SUBTRACT_m2n4(24,5) SAVE_b_m2n4(0,4)\
\
SOLVE_uplo_m2n4(40,5)\
SOLVE_lo_m2n4(56,5) SAVE_b_m2n4(32,5)\
\
"movq %2,%3; addq $16,%2;" save_c_m4n4(4,5)
#define SOLVE_LT_m4n8 \
"movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) GEMM_SUM_REORDER_4x4(8,9,10,11,6,7)\
\
SOLVE_uplo_m2n8(0,4,6) SUBTRACT_m2n8(8,5,7)\
SOLVE_lo_m2n8(16,4,6) SUBTRACT_m2n8(24,5,7) SAVE_b_m2n8(0,4,6)\
\
SOLVE_uplo_m2n8(40,5,7)\
SOLVE_lo_m2n8(56,5,7) SAVE_b_m2n8(32,5,7)\
\
"movq %2,%3; addq $16,%2;" save_c_m4n4(4,5) save_c_m4n4(6,7)
#define SOLVE_LT_m4n12 \
"movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) GEMM_SUM_REORDER_4x4(8,9,10,11,6,7) GEMM_SUM_REORDER_4x4(12,13,14,15,8,9)\
\
SOLVE_uplo_m2n12(0,4,6,8) SUBTRACT_m2n12(8,5,7,9)\
SOLVE_lo_m2n12(16,4,6,8) SUBTRACT_m2n12(24,5,7,9) SAVE_b_m2n12(0,4,6,8)\
\
SOLVE_uplo_m2n12(40,5,7,9)\
SOLVE_lo_m2n12(56,5,7,9) SAVE_b_m2n12(32,5,7,9)\
\
"movq %2,%3; addq $16,%2;" save_c_m4n4(4,5) save_c_m4n4(6,7) save_c_m4n4(8,9)
#define SOLVE_LT_m8n4 \
"movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63)\
\
SOLVE_uplo_m2n4(0,4) SUBTRACT_m2n4(8,5) SUBTRACT_m2n4(16,6) SUBTRACT_m2n4(24,7)\
SOLVE_lo_m2n4(32,4) SUBTRACT_m2n4(40,5) SUBTRACT_m2n4(48,6) SUBTRACT_m2n4(56,7) SAVE_b_m2n4(0,4)\
\
SOLVE_uplo_m2n4(72,5) SUBTRACT_m2n4(80,6) SUBTRACT_m2n4(88,7)\
SOLVE_lo_m2n4(104,5) SUBTRACT_m2n4(112,6) SUBTRACT_m2n4(120,7) SAVE_b_m2n4(32,5)\
\
SOLVE_uplo_m2n4(144,6) SUBTRACT_m2n4(152,7)\
SOLVE_lo_m2n4(176,6) SUBTRACT_m2n4(184,7) SAVE_b_m2n4(64,6)\
\
SOLVE_uplo_m2n4(216,7)\
SOLVE_lo_m2n4(248,7) SAVE_b_m2n4(96,7)\
\
"movq %2,%3; addq $32,%2;" save_c_m8n4(4,5,6,7)
#define SOLVE_LT_m8n8 \
"movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) GEMM_SUM_REORDER_8x4(8,9,10,11,63)\
\
SOLVE_uplo_m2n8(0,4,8) SUBTRACT_m2n8(8,5,9) SUBTRACT_m2n8(16,6,10) SUBTRACT_m2n8(24,7,11)\
SOLVE_lo_m2n8(32,4,8) SUBTRACT_m2n8(40,5,9) SUBTRACT_m2n8(48,6,10) SUBTRACT_m2n8(56,7,11) SAVE_b_m2n8(0,4,8)\
\
SOLVE_uplo_m2n8(72,5,9) SUBTRACT_m2n8(80,6,10) SUBTRACT_m2n8(88,7,11)\
SOLVE_lo_m2n8(104,5,9) SUBTRACT_m2n8(112,6,10) SUBTRACT_m2n8(120,7,11) SAVE_b_m2n8(32,5,9)\
\
SOLVE_uplo_m2n8(144,6,10) SUBTRACT_m2n8(152,7,11)\
SOLVE_lo_m2n8(176,6,10) SUBTRACT_m2n8(184,7,11) SAVE_b_m2n8(64,6,10)\
\
SOLVE_uplo_m2n8(216,7,11)\
SOLVE_lo_m2n8(248,7,11) SAVE_b_m2n8(96,7,11)\
\
"movq %2,%3; addq $32,%2;" save_c_m8n4(4,5,6,7) save_c_m8n4(8,9,10,11)
#define SOLVE_LT_m8n12 \
"movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) GEMM_SUM_REORDER_8x4(8,9,10,11,63) GEMM_SUM_REORDER_8x4(12,13,14,15,63)\
\
SOLVE_uplo_m2n12(0,4,8,12) SUBTRACT_m2n12(8,5,9,13) SUBTRACT_m2n12(16,6,10,14) SUBTRACT_m2n12(24,7,11,15)\
SOLVE_lo_m2n12(32,4,8,12) SUBTRACT_m2n12(40,5,9,13) SUBTRACT_m2n12(48,6,10,14) SUBTRACT_m2n12(56,7,11,15) SAVE_b_m2n12(0,4,8,12)\
\
SOLVE_uplo_m2n12(72,5,9,13) SUBTRACT_m2n12(80,6,10,14) SUBTRACT_m2n12(88,7,11,15)\
SOLVE_lo_m2n12(104,5,9,13) SUBTRACT_m2n12(112,6,10,14) SUBTRACT_m2n12(120,7,11,15) SAVE_b_m2n12(32,5,9,13)\
\
SOLVE_uplo_m2n12(144,6,10,14) SUBTRACT_m2n12(152,7,11,15)\
SOLVE_lo_m2n12(176,6,10,14) SUBTRACT_m2n12(184,7,11,15) SAVE_b_m2n12(64,6,10,14)\
\
SOLVE_uplo_m2n12(216,7,11,15)\
SOLVE_lo_m2n12(248,7,11,15) SAVE_b_m2n12(96,7,11,15)\
\
"movq %2,%3; addq $32,%2;" save_c_m8n4(4,5,6,7) save_c_m8n4(8,9,10,11) save_c_m8n4(12,13,14,15)
#define GEMM_LT_SIMPLE(mdim,ndim) \
"movq %%r15,%0; leaq (%%r15,%%r12,"#mdim"),%%r15; movq %%r13,%5; addq $"#mdim",%%r13; movq %%r14,%1;" INIT_m##mdim##n##ndim\
"testq %5,%5; jz 1"#mdim""#ndim"2f;"\
"1"#mdim""#ndim"1:\n\t"\
GEMM_KERNEL_k1m##mdim##n##ndim "addq $16,%1; addq $"#mdim"*4,%0; decq %5; jnz 1"#mdim""#ndim"1b;"\
"1"#mdim""#ndim"2:\n\t"
#define GEMM_LT_m8n4 GEMM_LT_SIMPLE(8,4)
#define GEMM_LT_m8n8 GEMM_LT_SIMPLE(8,8)
#define GEMM_LT_m8n12 \
"movq %%r15,%0; leaq (%%r15,%%r12,8),%%r15; movq %%r13,%5; addq $8,%%r13; movq %%r14,%1;" INIT_m8n12\
"cmpq $8,%5; jb 18122f;"\
"18121:\n\t"\
GEMM_KERNEL_k1m8n12 "prefetcht0 384(%0); addq $32,%0; addq $16,%1;"\
GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1;"\
GEMM_KERNEL_k1m8n12 "prefetcht0 384(%0); addq $32,%0; addq $16,%1;"\
GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1;"\
GEMM_KERNEL_k1m8n12 "prefetcht0 384(%0); addq $32,%0; addq $16,%1;"\
GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1;"\
GEMM_KERNEL_k1m8n12 "prefetcht0 384(%0); addq $32,%0; addq $16,%1;"\
GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1;"\
"subq $8,%5; cmpq $8,%5; jnb 18121b;"\
"18122:\n\t"\
"testq %5,%5; jz 18124f;"\
"18123:\n\t"\
GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1; decq %5; jnz 18123b;"\
"18124:\n\t"
#define GEMM_LT_m4n4 GEMM_LT_SIMPLE(4,4)
#define GEMM_LT_m4n8 GEMM_LT_SIMPLE(4,8)
#define GEMM_LT_m4n12 GEMM_LT_SIMPLE(4,12)
#define GEMM_LT_m2n4 GEMM_LT_SIMPLE(2,4)
#define GEMM_LT_m2n8 GEMM_LT_SIMPLE(2,8)
#define GEMM_LT_m2n12 GEMM_LT_SIMPLE(2,12)
#define GEMM_LT_m1n4 GEMM_LT_SIMPLE(1,4)
#define GEMM_LT_m1n8 GEMM_LT_SIMPLE(1,8)
#define GEMM_LT_m1n12 GEMM_LT_SIMPLE(1,12)
#define COMPUTE(ndim) {\
__asm__ __volatile__(\
"movq %0,%%r15; movq %1,%%r14; movq %7,%%r13; movq %6,%%r12; salq $2,%%r12; movq %10,%%r11;"\
"cmpq $8,%%r11; jb "#ndim"772f;"\
#ndim"771:\n\t"\
GEMM_LT_m8n##ndim SOLVE_LT_m8n##ndim "subq $8,%%r11; cmpq $8,%%r11; jnb "#ndim"771b;"\
#ndim"772:\n\t"\
"testq $4,%%r11; jz "#ndim"773f;"\
GEMM_LT_m4n##ndim SOLVE_LT_m4n##ndim "subq $4,%%r11;"\
#ndim"773:\n\t"\
"testq $2,%%r11; jz "#ndim"774f;"\
GEMM_LT_m2n##ndim SOLVE_LT_m2n##ndim "subq $2,%%r11;"\
#ndim"774:\n\t"\
"testq $1,%%r11; jz "#ndim"775f;"\
GEMM_LT_m1n##ndim SOLVE_LT_m1n##ndim "subq $1,%%r11;"\
#ndim"775:\n\t"\
"movq %%r15,%0; movq %%r14,%1; vzeroupper;"\
:"+r"(a_ptr),"+r"(b_ptr),"+r"(c_ptr),"+r"(c_tmp),"+r"(ldc_bytes),"+r"(k_cnt):"m"(K),"m"(OFF),"m"(one[0]),"m"(zero[0]),"m"(M)\
:"r11","r12","r13","r14","r15","cc","memory",\
"xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15");\
a_ptr -= M * K; b_ptr += ndim * K; c_ptr += ldc * ndim - M;\
}
static void solve_LT(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
FLOAT a0, b0;
int i, j, k;
for (i=0;i<m;i++) {
a0 = a[i*m+i];
for (j=0;j<n;j++) {
b0 = c[j*ldc+i] * a0;
b[i*n+j] = c[j*ldc+i] = b0;
for (k=i+1;k<m;k++) c[j*ldc+k] -= b0 * a[i*m+k];
}
}
}
static void COMPUTE_EDGE_1_nchunk(BLASLONG m, BLASLONG n, FLOAT *sa, FLOAT *sb, FLOAT *C, BLASLONG ldc, BLASLONG k, BLASLONG offset) {
BLASLONG m_count = m, kk = offset; FLOAT *a_ptr = sa, *c_ptr = C;
for(;m_count>7;m_count-=8){
if(kk>0) GEMM_KERNEL_N(8,n,kk,-1.0,a_ptr,sb,c_ptr,ldc);
solve_LT(8,n,a_ptr+kk*8,sb+kk*n,c_ptr,ldc);
kk += 8; a_ptr += k * 8; c_ptr += 8;
}
for(;m_count>3;m_count-=4){
if(kk>0) GEMM_KERNEL_N(4,n,kk,-1.0,a_ptr,sb,c_ptr,ldc);
solve_LT(4,n,a_ptr+kk*4,sb+kk*n,c_ptr,ldc);
kk += 4; a_ptr += k * 4; c_ptr += 4;
}
for(;m_count>1;m_count-=2){
if(kk>0) GEMM_KERNEL_N(2,n,kk,-1.0,a_ptr,sb,c_ptr,ldc);
solve_LT(2,n,a_ptr+kk*2,sb+kk*n,c_ptr,ldc);
kk += 2; a_ptr += k * 2; c_ptr += 2;
}
if(m_count>0){
if(kk>0) GEMM_KERNEL_N(1,n,kk,-1.0,a_ptr,sb,c_ptr,ldc);
solve_LT(1,n,a_ptr+kk*1,sb+kk*n,c_ptr,ldc);
kk += 1; a_ptr += k * 1; c_ptr += 1;
}
}
int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *sa, FLOAT *sb, FLOAT *C, BLASLONG ldc, BLASLONG offset){
float *a_ptr = sa, *b_ptr = sb, *c_ptr = C, *c_tmp = C;
float one[8] = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0};
float zero[8] = {0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0};
uint64_t ldc_bytes = (uint64_t)ldc * sizeof(float), K = (uint64_t)k, M = (uint64_t)m, OFF = (uint64_t)offset, k_cnt = 0;
BLASLONG n_count = n;
for(;n_count>11;n_count-=12) COMPUTE(12)
for(;n_count>7;n_count-=8) COMPUTE(8)
for(;n_count>3;n_count-=4) COMPUTE(4)
for(;n_count>1;n_count-=2) { COMPUTE_EDGE_1_nchunk(m,2,a_ptr,b_ptr,c_ptr,ldc,k,offset); b_ptr += 2*k; c_ptr += ldc*2;}
if(n_count>0) COMPUTE_EDGE_1_nchunk(m,1,a_ptr,b_ptr,c_ptr,ldc,k,offset);
return 0;
}

View File

@ -0,0 +1,187 @@
/* r11 = m_counter, r12 = size_of_k_elements, r13 = kk, r14 = b_head, r15 = a_head */
/* register i/o: %0 = a_ptr, %1 = b_ptr, %2 = c_ptr, %3 = c_tmp, %4 = ldc, %5 = k_counter */
/* memory input: %6 = K, %7 = offset, %8 = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0}, %9 = {0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0}, %10 = M */
#define init_m8n4(c1,c2,c3,c4)\
"vpxor %%ymm"#c1",%%ymm"#c1",%%ymm"#c1"; vpxor %%ymm"#c2",%%ymm"#c2",%%ymm"#c2"; vpxor %%ymm"#c3",%%ymm"#c3",%%ymm"#c3"; vpxor %%ymm"#c4",%%ymm"#c4",%%ymm"#c4";"
#define INIT_m8n4 init_m8n4(4,5,6,7)
#define INIT_m8n8 INIT_m8n4 init_m8n4(8,9,10,11)
#define INIT_m8n12 INIT_m8n8 init_m8n4(12,13,14,15)
#define init_m4n4(c1,c2,c3,c4)\
"vpxor %%xmm"#c1",%%xmm"#c1",%%xmm"#c1"; vpxor %%xmm"#c2",%%xmm"#c2",%%xmm"#c2"; vpxor %%xmm"#c3",%%xmm"#c3",%%xmm"#c3"; vpxor %%xmm"#c4",%%xmm"#c4",%%xmm"#c4";"
#define INIT_m4n4 init_m4n4(4,5,6,7)
#define INIT_m4n8 INIT_m4n4 init_m4n4(8,9,10,11)
#define INIT_m4n12 INIT_m4n8 init_m4n4(12,13,14,15)
#define init_m2n4(c1,c2)\
"vpxor %%xmm"#c1",%%xmm"#c1",%%xmm"#c1"; vpxor %%xmm"#c2",%%xmm"#c2",%%xmm"#c2";"
#define INIT_m2n4 init_m2n4(4,5)
#define INIT_m2n8 INIT_m2n4 init_m2n4(6,7)
#define INIT_m2n12 INIT_m2n8 init_m2n4(8,9)
#define init_m1n4(c1) "vpxor %%xmm"#c1",%%xmm"#c1",%%xmm"#c1";"
#define INIT_m1n4 init_m1n4(4)
#define INIT_m1n8 INIT_m1n4 init_m1n4(5)
#define INIT_m1n12 INIT_m1n8 init_m1n4(6)
#define GEMM_KERNEL_k1m8n4 \
"vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2;"\
"vbroadcastsd (%1),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm4; vfnmadd231ps %%ymm3,%%ymm2,%%ymm5;"\
"vbroadcastsd 8(%1),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm6; vfnmadd231ps %%ymm3,%%ymm2,%%ymm7;"
#define GEMM_KERNEL_k1m8n8 GEMM_KERNEL_k1m8n4\
"vbroadcastsd (%1,%%r12,4),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm8; vfnmadd231ps %%ymm3,%%ymm2,%%ymm9;"\
"vbroadcastsd 8(%1,%%r12,4),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm10; vfnmadd231ps %%ymm3,%%ymm2,%%ymm11;"
#define GEMM_KERNEL_k1m8n12 GEMM_KERNEL_k1m8n8\
"vbroadcastsd (%1,%%r12,8),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm12; vfnmadd231ps %%ymm3,%%ymm2,%%ymm13;"\
"vbroadcastsd 8(%1,%%r12,8),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm14; vfnmadd231ps %%ymm3,%%ymm2,%%ymm15;"
#define GEMM_KERNEL_k1m4n4 \
"vmovsldup (%0),%%xmm1; vmovshdup (%0),%%xmm2;"\
"vmovddup (%1),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm4; vfnmadd231ps %%xmm3,%%xmm2,%%xmm5;"\
"vmovddup 8(%1),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm6; vfnmadd231ps %%xmm3,%%xmm2,%%xmm7;"
#define GEMM_KERNEL_k1m4n8 GEMM_KERNEL_k1m4n4\
"vmovddup (%1,%%r12,4),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm8; vfnmadd231ps %%xmm3,%%xmm2,%%xmm9;"\
"vmovddup 8(%1,%%r12,4),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm10; vfnmadd231ps %%xmm3,%%xmm2,%%xmm11;"
#define GEMM_KERNEL_k1m4n12 GEMM_KERNEL_k1m4n8\
"vmovddup (%1,%%r12,8),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm12; vfnmadd231ps %%xmm3,%%xmm2,%%xmm13;"\
"vmovddup 8(%1,%%r12,8),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm14; vfnmadd231ps %%xmm3,%%xmm2,%%xmm15;"
#define GEMM_KERNEL_k1m2n4 \
"vbroadcastss (%0),%%xmm1; vbroadcastss 4(%0),%%xmm2;"\
"vmovups (%1),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm4; vfnmadd231ps %%xmm3,%%xmm2,%%xmm5;"
#define GEMM_KERNEL_k1m2n8 GEMM_KERNEL_k1m2n4\
"vmovups (%1,%%r12,4),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm6; vfnmadd231ps %%xmm3,%%xmm2,%%xmm7;"
#define GEMM_KERNEL_k1m2n12 GEMM_KERNEL_k1m2n8\
"vmovups (%1,%%r12,8),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm8; vfnmadd231ps %%xmm3,%%xmm2,%%xmm9;"
#define GEMM_KERNEL_k1m1n4 "vbroadcastss (%0),%%xmm1; vfnmadd231ps (%1),%%xmm1,%%xmm4;"
#define GEMM_KERNEL_k1m1n8 GEMM_KERNEL_k1m1n4 "vfnmadd231ps (%1,%%r12,4),%%xmm1,%%xmm5;"
#define GEMM_KERNEL_k1m1n12 GEMM_KERNEL_k1m1n8 "vfnmadd231ps (%1,%%r12,8),%%xmm1,%%xmm6;"
#define GEMM_SUM_REORDER_8x4(c1,c2,c3,c4,prefpos)\
"vunpcklps %%ymm"#c2",%%ymm"#c1",%%ymm0; vunpckhps %%ymm"#c2",%%ymm"#c1",%%ymm1;"\
"vunpcklps %%ymm"#c4",%%ymm"#c3",%%ymm2; vunpckhps %%ymm"#c4",%%ymm"#c3",%%ymm3;"\
"vmovups (%3),%%ymm"#c1"; vmovups (%3,%4,1),%%ymm"#c2"; prefetcht1 "#prefpos"(%3); prefetcht1 "#prefpos"(%3,%4,1);"\
"vunpcklpd %%ymm"#c2",%%ymm"#c1",%%ymm"#c3"; vunpckhpd %%ymm"#c2",%%ymm"#c1",%%ymm"#c4";"\
"vaddps %%ymm0,%%ymm"#c3",%%ymm0; vaddps %%ymm1,%%ymm"#c4",%%ymm1;"\
"leaq (%3,%4,2),%3;"\
"vmovups (%3),%%ymm"#c1"; vmovups (%3,%4,1),%%ymm"#c2"; prefetcht1 "#prefpos"(%3); prefetcht1 "#prefpos"(%3,%4,1);"\
"vunpcklpd %%ymm"#c2",%%ymm"#c1",%%ymm"#c3"; vunpckhpd %%ymm"#c2",%%ymm"#c1",%%ymm"#c4";"\
"vaddps %%ymm2,%%ymm"#c3",%%ymm2; vaddps %%ymm3,%%ymm"#c4",%%ymm3;"\
"leaq (%3,%4,2),%3;"\
"vperm2f128 $2,%%ymm0,%%ymm2,%%ymm"#c1"; vperm2f128 $2,%%ymm1,%%ymm3,%%ymm"#c2";"\
"vperm2f128 $19,%%ymm0,%%ymm2,%%ymm"#c3"; vperm2f128 $19,%%ymm1,%%ymm3,%%ymm"#c4";"
#define GEMM_SUM_REORDER_4x4(c1,c2,c3,c4,co1,co2)\
"vunpcklps %%xmm"#c2",%%xmm"#c1",%%xmm0; vunpckhps %%xmm"#c2",%%xmm"#c1",%%xmm1;"\
"vunpcklps %%xmm"#c4",%%xmm"#c3",%%xmm2; vunpckhps %%xmm"#c4",%%xmm"#c3",%%xmm3;"\
"vmovups (%3),%%xmm"#c1"; vmovups (%3,%4,1),%%xmm"#c2";"\
"vunpcklpd %%xmm"#c2",%%xmm"#c1",%%xmm"#c3"; vunpckhpd %%xmm"#c2",%%xmm"#c1",%%xmm"#c4";"\
"vaddps %%xmm0,%%xmm"#c3",%%xmm0; vaddps %%xmm1,%%xmm"#c4",%%xmm1;"\
"leaq (%3,%4,2),%3;"\
"vmovups (%3),%%xmm"#c1"; vmovups (%3,%4,1),%%xmm"#c2";"\
"vunpcklpd %%xmm"#c2",%%xmm"#c1",%%xmm"#c3"; vunpckhpd %%xmm"#c2",%%xmm"#c1",%%xmm"#c4";"\
"vaddps %%xmm2,%%xmm"#c3",%%xmm2; vaddps %%xmm3,%%xmm"#c4",%%xmm3;"\
"leaq (%3,%4,2),%3;"\
"vperm2f128 $2,%%ymm0,%%ymm2,%%ymm"#co1"; vperm2f128 $2,%%ymm1,%%ymm3,%%ymm"#co2";"
#define GEMM_SUM_REORDER_2x4(c1,c2,co1)\
"vunpcklps %%xmm"#c2",%%xmm"#c1",%%xmm0; vunpckhps %%xmm"#c2",%%xmm"#c1",%%xmm1;"\
"vmovsd (%3),%%xmm2; vmovhpd (%3,%4,1),%%xmm2,%%xmm2; vaddps %%xmm0,%%xmm2,%%xmm0; leaq (%3,%4,2),%3;"\
"vmovsd (%3),%%xmm2; vmovhpd (%3,%4,1),%%xmm2,%%xmm2; vaddps %%xmm1,%%xmm2,%%xmm1; leaq (%3,%4,2),%3;"\
"vperm2f128 $2,%%ymm0,%%ymm1,%%ymm"#co1";"
#define GEMM_SUM_REORDER_1x4(c1)\
"vmovss (%3),%%xmm1; vinsertps $16,(%3,%4,1),%%xmm1,%%xmm1; leaq (%3,%4,2),%3;"\
"vinsertps $32,(%3),%%xmm1,%%xmm1; vinsertps $48,(%3,%4,1),%%xmm1,%%xmm1; leaq (%3,%4,2),%3;"\
"vaddps %%xmm"#c1",%%xmm1,%%xmm"#c1";"
#define save_c_m8n4(c1,c2,c3,c4)\
"vunpcklpd %%ymm"#c2",%%ymm"#c1",%%ymm0; vunpckhpd %%ymm"#c2",%%ymm"#c1",%%ymm1;"\
"vunpcklpd %%ymm"#c4",%%ymm"#c3",%%ymm2; vunpckhpd %%ymm"#c4",%%ymm"#c3",%%ymm3;"\
"vperm2f128 $2,%%ymm0,%%ymm2,%%ymm"#c1"; vperm2f128 $2,%%ymm1,%%ymm3,%%ymm"#c2";"\
"vmovups %%ymm"#c1",(%3); vmovups %%ymm"#c2",(%3,%4,1); leaq (%3,%4,2),%3;"\
"vperm2f128 $19,%%ymm0,%%ymm2,%%ymm"#c3"; vperm2f128 $19,%%ymm1,%%ymm3,%%ymm"#c4";"\
"vmovups %%ymm"#c3",(%3); vmovups %%ymm"#c4",(%3,%4,1); leaq (%3,%4,2),%3;"
#define save_c_m4n4(c1,c2)\
"vunpcklpd %%ymm"#c2",%%ymm"#c1",%%ymm0; vunpckhpd %%ymm"#c2",%%ymm"#c1",%%ymm1;"\
"vmovups %%xmm0,(%3); vmovups %%xmm1,(%3,%4,1); leaq (%3,%4,2),%3;"\
"vextractf128 $1,%%ymm0,(%3); vextractf128 $1,%%ymm1,(%3,%4,1); leaq (%3,%4,2),%3;"
#define save_c_m2n4(c1)\
"vextractf128 $1,%%ymm"#c1",%%xmm1; vmovsd %%xmm"#c1",(%3); vmovhpd %%xmm"#c1",(%3,%4,1); leaq (%3,%4,2),%3;"\
"vmovsd %%xmm1,(%3); vmovhpd %%xmm1,(%3,%4,1); leaq (%3,%4,2),%3;"
#define save_c_m1n4(c1)\
"vmovss %%xmm"#c1",(%3); vextractps $1,%%xmm"#c1",(%3,%4,1); leaq (%3,%4,2),%3;"\
"vextractps $2,%%xmm"#c1",(%3); vextractps $3,%%xmm"#c1",(%3,%4,1); leaq (%3,%4,2),%3;"
#define SOLVE_up_m2n4(a_off,c1)\
"vbroadcastsd "#a_off"(%0),%%ymm0; vblendps $170,%8,%%ymm0,%%ymm2;"\
"vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1";"\
"vmovsldup %%ymm"#c1",%%ymm1;"
#define SOLVE_up_m2n8(a_off,c1,c2)\
"vbroadcastsd "#a_off"(%0),%%ymm0; vblendps $170,%8,%%ymm0,%%ymm2;"\
"vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1"; vmulps %%ymm2,%%ymm"#c2",%%ymm"#c2";"\
"vmovsldup %%ymm"#c1",%%ymm1; vmovsldup %%ymm"#c2",%%ymm2;"
#define SOLVE_up_m2n12(a_off,c1,c2,c3)\
"vbroadcastsd "#a_off"(%0),%%ymm0; vblendps $170,%8,%%ymm0,%%ymm2;"\
"vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1"; vmulps %%ymm2,%%ymm"#c2",%%ymm"#c2"; vmulps %%ymm2,%%ymm"#c3",%%ymm"#c3";"\
"vmovsldup %%ymm"#c1",%%ymm1; vmovsldup %%ymm"#c2",%%ymm2; vmovsldup %%ymm"#c3",%%ymm3;"
#define SOLVE_uplo_m2n4(a_off,c1) SOLVE_up_m2n4(a_off,c1)\
"vblendps $85,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1";"
#define SOLVE_uplo_m2n8(a_off,c1,c2) SOLVE_up_m2n8(a_off,c1,c2)\
"vblendps $85,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1"; vfnmadd231ps %%ymm0,%%ymm2,%%ymm"#c2";"
#define SOLVE_uplo_m2n12(a_off,c1,c2,c3) SOLVE_up_m2n12(a_off,c1,c2,c3)\
"vblendps $85,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1"; vfnmadd231ps %%ymm0,%%ymm2,%%ymm"#c2"; vfnmadd231ps %%ymm0,%%ymm3,%%ymm"#c3";"
#define SOLVE_lo_m2n4(a_off,c1)\
"vbroadcastsd "#a_off"(%0),%%ymm0; vblendps $85,%8,%%ymm0,%%ymm2;"\
"vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1";"\
"vmovshdup %%ymm"#c1",%%ymm1;"
#define SOLVE_lo_m2n8(a_off,c1,c2)\
"vbroadcastsd "#a_off"(%0),%%ymm0; vblendps $85,%8,%%ymm0,%%ymm2;"\
"vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1"; vmulps %%ymm2,%%ymm"#c2",%%ymm"#c2";"\
"vmovshdup %%ymm"#c1",%%ymm1; vmovshdup %%ymm"#c2",%%ymm2;"
#define SOLVE_lo_m2n12(a_off,c1,c2,c3)\
"vbroadcastsd "#a_off"(%0),%%ymm0; vblendps $85,%8,%%ymm0,%%ymm2;"\
"vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1"; vmulps %%ymm2,%%ymm"#c2",%%ymm"#c2"; vmulps %%ymm2,%%ymm"#c3",%%ymm"#c3";"\
"vmovshdup %%ymm"#c1",%%ymm1; vmovshdup %%ymm"#c2",%%ymm2; vmovshdup %%ymm"#c3",%%ymm3;"
#define SOLVE_loup_m2n4(a_off,c1) SOLVE_lo_m2n4(a_off,c1)\
"vblendps $170,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1";"
#define SOLVE_loup_m2n8(a_off,c1,c2) SOLVE_lo_m2n8(a_off,c1,c2)\
"vblendps $170,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1"; vfnmadd231ps %%ymm0,%%ymm2,%%ymm"#c2";"
#define SOLVE_loup_m2n12(a_off,c1,c2,c3) SOLVE_lo_m2n12(a_off,c1,c2,c3)\
"vblendps $170,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1"; vfnmadd231ps %%ymm0,%%ymm2,%%ymm"#c2"; vfnmadd231ps %%ymm0,%%ymm3,%%ymm"#c3";"
#define SOLVE_m1n4(a_off,c1) "vbroadcastss "#a_off"(%0),%%xmm0; vmulps %%xmm0,%%xmm"#c1",%%xmm"#c1";"
#define SOLVE_m1n8(a_off,c1,c2) SOLVE_m1n4(a_off,c1) "vmulps %%xmm0,%%xmm"#c2",%%xmm"#c2";"
#define SOLVE_m1n12(a_off,c1,c2,c3) SOLVE_m1n8(a_off,c1,c2) "vmulps %%xmm0,%%xmm"#c3",%%xmm"#c3";"
#define SUBTRACT_m2n4(a_off,c1) "vbroadcastsd "#a_off"(%0),%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1";"
#define SUBTRACT_m2n8(a_off,c1,c2) SUBTRACT_m2n4(a_off,c1) "vfnmadd231ps %%ymm0,%%ymm2,%%ymm"#c2";"
#define SUBTRACT_m2n12(a_off,c1,c2,c3) SUBTRACT_m2n8(a_off,c1,c2) "vfnmadd231ps %%ymm0,%%ymm3,%%ymm"#c3";"
#define save_b_m2n4(c1,tmp,b_off,...)\
"vpermilps $216,%%ymm"#c1",%%ymm"#tmp"; vpermpd $216,%%ymm"#tmp",%%ymm"#tmp"; vmovups %%ymm"#tmp","#b_off"("#__VA_ARGS__");"
#define SAVE_b_m2n4(b_off,c1) save_b_m2n4(c1,1,b_off,%1)
#define SAVE_b_m2n8(b_off,c1,c2) SAVE_b_m2n4(b_off,c1) save_b_m2n4(c2,2,b_off,%1,%%r12,4)
#define SAVE_b_m2n12(b_off,c1,c2,c3) SAVE_b_m2n8(b_off,c1,c2) save_b_m2n4(c3,3,b_off,%1,%%r12,8)
#define SAVE_b_m1n4(b_off,c1) "vmovups %%xmm"#c1","#b_off"(%1);"
#define SAVE_b_m1n8(b_off,c1,c2) SAVE_b_m1n4(b_off,c1) "vmovups %%xmm"#c2","#b_off"(%1,%%r12,4);"
#define SAVE_b_m1n12(b_off,c1,c2,c3) SAVE_b_m1n8(b_off,c1,c2) "vmovups %%xmm"#c3","#b_off"(%1,%%r12,8);"

View File

@ -0,0 +1,279 @@
#include "common.h"
#include <stdint.h>
#include "strsm_kernel_8x4_haswell_R_common.h"
#define SOLVE_RN_m8n4 \
"movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) "movq %2,%3; addq $32,%2;"\
SOLVE_leri_m8n2(0,4,5,%1) SUBTRACT_m8n2(8,6,7,%1)\
SOLVE_ri_m8n2(16,4,5,%1) SUBTRACT_m8n2(24,6,7,%1)\
SAVE_SOLUTION_m8n2(4,5,0)\
SOLVE_leri_m8n2(40,6,7,%1)\
SOLVE_ri_m8n2(56,6,7,%1)\
SAVE_SOLUTION_m8n2(6,7,64)
#define SOLVE_RN_m8n8 \
"movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) GEMM_SUM_REORDER_8x4(8,9,10,11,63) "movq %2,%3; addq $32,%2;"\
SOLVE_leri_m8n2(0,4,5,%1) SUBTRACT_m8n2(8,6,7,%1) SUBTRACT_m8n2(0,8,9,%1,%%r12,4) SUBTRACT_m8n2(8,10,11,%1,%%r12,4)\
SOLVE_ri_m8n2(16,4,5,%1) SUBTRACT_m8n2(24,6,7,%1) SUBTRACT_m8n2(16,8,9,%1,%%r12,4) SUBTRACT_m8n2(24,10,11,%1,%%r12,4)\
SAVE_SOLUTION_m8n2(4,5,0)\
SOLVE_leri_m8n2(40,6,7,%1) SUBTRACT_m8n2(32,8,9,%1,%%r12,4) SUBTRACT_m8n2(40,10,11,%1,%%r12,4)\
SOLVE_ri_m8n2(56,6,7,%1) SUBTRACT_m8n2(48,8,9,%1,%%r12,4) SUBTRACT_m8n2(56,10,11,%1,%%r12,4)\
SAVE_SOLUTION_m8n2(6,7,64)\
SOLVE_leri_m8n2(64,8,9,%1,%%r12,4) SUBTRACT_m8n2(72,10,11,%1,%%r12,4)\
SOLVE_ri_m8n2(80,8,9,%1,%%r12,4) SUBTRACT_m8n2(88,10,11,%1,%%r12,4)\
SAVE_SOLUTION_m8n2(8,9,128)\
SOLVE_leri_m8n2(104,10,11,%1,%%r12,4)\
SOLVE_ri_m8n2(120,10,11,%1,%%r12,4)\
SAVE_SOLUTION_m8n2(10,11,192)
#define SOLVE_RN_m8n12 \
"movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) GEMM_SUM_REORDER_8x4(8,9,10,11,63) GEMM_SUM_REORDER_8x4(12,13,14,15,63) "movq %2,%3; addq $32,%2;"\
SOLVE_leri_m8n2(0,4,5,%1) SUBTRACT_m8n2(8,6,7,%1) SUBTRACT_m8n2(0,8,9,%1,%%r12,4) SUBTRACT_m8n2(8,10,11,%1,%%r12,4) SUBTRACT_m8n2(0,12,13,%1,%%r12,8) SUBTRACT_m8n2(8,14,15,%1,%%r12,8)\
SOLVE_ri_m8n2(16,4,5,%1) SUBTRACT_m8n2(24,6,7,%1) SUBTRACT_m8n2(16,8,9,%1,%%r12,4) SUBTRACT_m8n2(24,10,11,%1,%%r12,4) SUBTRACT_m8n2(16,12,13,%1,%%r12,8) SUBTRACT_m8n2(24,14,15,%1,%%r12,8)\
SAVE_SOLUTION_m8n2(4,5,0)\
SOLVE_leri_m8n2(40,6,7,%1) SUBTRACT_m8n2(32,8,9,%1,%%r12,4) SUBTRACT_m8n2(40,10,11,%1,%%r12,4) SUBTRACT_m8n2(32,12,13,%1,%%r12,8) SUBTRACT_m8n2(40,14,15,%1,%%r12,8)\
SOLVE_ri_m8n2(56,6,7,%1) SUBTRACT_m8n2(48,8,9,%1,%%r12,4) SUBTRACT_m8n2(56,10,11,%1,%%r12,4) SUBTRACT_m8n2(48,12,13,%1,%%r12,8) SUBTRACT_m8n2(56,14,15,%1,%%r12,8)\
SAVE_SOLUTION_m8n2(6,7,64)\
SOLVE_leri_m8n2(64,8,9,%1,%%r12,4) SUBTRACT_m8n2(72,10,11,%1,%%r12,4) SUBTRACT_m8n2(64,12,13,%1,%%r12,8) SUBTRACT_m8n2(72,14,15,%1,%%r12,8)\
SOLVE_ri_m8n2(80,8,9,%1,%%r12,4) SUBTRACT_m8n2(88,10,11,%1,%%r12,4) SUBTRACT_m8n2(80,12,13,%1,%%r12,8) SUBTRACT_m8n2(88,14,15,%1,%%r12,8)\
SAVE_SOLUTION_m8n2(8,9,128)\
SOLVE_leri_m8n2(104,10,11,%1,%%r12,4) SUBTRACT_m8n2(96,12,13,%1,%%r12,8) SUBTRACT_m8n2(104,14,15,%1,%%r12,8)\
SOLVE_ri_m8n2(120,10,11,%1,%%r12,4) SUBTRACT_m8n2(112,12,13,%1,%%r12,8) SUBTRACT_m8n2(120,14,15,%1,%%r12,8)\
SAVE_SOLUTION_m8n2(10,11,192)\
SOLVE_leri_m8n2(128,12,13,%1,%%r12,8) SUBTRACT_m8n2(136,14,15,%1,%%r12,8)\
SOLVE_ri_m8n2(144,12,13,%1,%%r12,8) SUBTRACT_m8n2(152,14,15,%1,%%r12,8)\
SAVE_SOLUTION_m8n2(12,13,256)\
SOLVE_leri_m8n2(168,14,15,%1,%%r12,8)\
SOLVE_ri_m8n2(184,14,15,%1,%%r12,8)\
SAVE_SOLUTION_m8n2(14,15,320)
#define SOLVE_RN_m4n4 \
"movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) "movq %2,%3; addq $16,%2;"\
SOLVE_leri_m4n2(0,4,%1) SUBTRACT_m4n2(8,5,%1)\
SOLVE_ri_m4n2(16,4,%1) SUBTRACT_m4n2(24,5,%1)\
SAVE_SOLUTION_m4n2(4,0)\
SOLVE_leri_m4n2(40,5,%1)\
SOLVE_ri_m4n2(56,5,%1)\
SAVE_SOLUTION_m4n2(5,32)
#define SOLVE_RN_m4n8 \
"movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) GEMM_SUM_REORDER_4x4(8,9,10,11,6,7) "movq %2,%3; addq $16,%2;"\
SOLVE_leri_m4n2(0,4,%1) SUBTRACT_m4n2(8,5,%1) SUBTRACT_m4n2(0,6,%1,%%r12,4) SUBTRACT_m4n2(8,7,%1,%%r12,4)\
SOLVE_ri_m4n2(16,4,%1) SUBTRACT_m4n2(24,5,%1) SUBTRACT_m4n2(16,6,%1,%%r12,4) SUBTRACT_m4n2(24,7,%1,%%r12,4)\
SAVE_SOLUTION_m4n2(4,0)\
SOLVE_leri_m4n2(40,5,%1) SUBTRACT_m4n2(32,6,%1,%%r12,4) SUBTRACT_m4n2(40,7,%1,%%r12,4)\
SOLVE_ri_m4n2(56,5,%1) SUBTRACT_m4n2(48,6,%1,%%r12,4) SUBTRACT_m4n2(56,7,%1,%%r12,4)\
SAVE_SOLUTION_m4n2(5,32)\
SOLVE_leri_m4n2(64,6,%1,%%r12,4) SUBTRACT_m4n2(72,7,%1,%%r12,4)\
SOLVE_ri_m4n2(80,6,%1,%%r12,4) SUBTRACT_m4n2(88,7,%1,%%r12,4)\
SAVE_SOLUTION_m4n2(6,64)\
SOLVE_leri_m4n2(104,7,%1,%%r12,4)\
SOLVE_ri_m4n2(120,7,%1,%%r12,4)\
SAVE_SOLUTION_m4n2(7,96)
#define SOLVE_RN_m4n12 \
"movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) GEMM_SUM_REORDER_4x4(8,9,10,11,6,7) GEMM_SUM_REORDER_4x4(12,13,14,15,8,9) "movq %2,%3; addq $16,%2;"\
SOLVE_leri_m4n2(0,4,%1) SUBTRACT_m4n2(8,5,%1) SUBTRACT_m4n2(0,6,%1,%%r12,4) SUBTRACT_m4n2(8,7,%1,%%r12,4) SUBTRACT_m4n2(0,8,%1,%%r12,8) SUBTRACT_m4n2(8,9,%1,%%r12,8)\
SOLVE_ri_m4n2(16,4,%1) SUBTRACT_m4n2(24,5,%1) SUBTRACT_m4n2(16,6,%1,%%r12,4) SUBTRACT_m4n2(24,7,%1,%%r12,4) SUBTRACT_m4n2(16,8,%1,%%r12,8) SUBTRACT_m4n2(24,9,%1,%%r12,8)\
SAVE_SOLUTION_m4n2(4,0)\
SOLVE_leri_m4n2(40,5,%1) SUBTRACT_m4n2(32,6,%1,%%r12,4) SUBTRACT_m4n2(40,7,%1,%%r12,4) SUBTRACT_m4n2(32,8,%1,%%r12,8) SUBTRACT_m4n2(40,9,%1,%%r12,8)\
SOLVE_ri_m4n2(56,5,%1) SUBTRACT_m4n2(48,6,%1,%%r12,4) SUBTRACT_m4n2(56,7,%1,%%r12,4) SUBTRACT_m4n2(48,8,%1,%%r12,8) SUBTRACT_m4n2(56,9,%1,%%r12,8)\
SAVE_SOLUTION_m4n2(5,32)\
SOLVE_leri_m4n2(64,6,%1,%%r12,4) SUBTRACT_m4n2(72,7,%1,%%r12,4) SUBTRACT_m4n2(64,8,%1,%%r12,8) SUBTRACT_m4n2(72,9,%1,%%r12,8)\
SOLVE_ri_m4n2(80,6,%1,%%r12,4) SUBTRACT_m4n2(88,7,%1,%%r12,4) SUBTRACT_m4n2(80,8,%1,%%r12,8) SUBTRACT_m4n2(88,9,%1,%%r12,8)\
SAVE_SOLUTION_m4n2(6,64)\
SOLVE_leri_m4n2(104,7,%1,%%r12,4) SUBTRACT_m4n2(96,8,%1,%%r12,8) SUBTRACT_m4n2(104,9,%1,%%r12,8)\
SOLVE_ri_m4n2(120,7,%1,%%r12,4) SUBTRACT_m4n2(112,8,%1,%%r12,8) SUBTRACT_m4n2(120,9,%1,%%r12,8)\
SAVE_SOLUTION_m4n2(7,96)\
SOLVE_leri_m4n2(128,8,%1,%%r12,8) SUBTRACT_m4n2(136,9,%1,%%r12,8)\
SOLVE_ri_m4n2(144,8,%1,%%r12,8) SUBTRACT_m4n2(152,9,%1,%%r12,8)\
SAVE_SOLUTION_m4n2(8,128)\
SOLVE_leri_m4n2(168,9,%1,%%r12,8)\
SOLVE_ri_m4n2(184,9,%1,%%r12,8)\
SAVE_SOLUTION_m4n2(9,160)
#define SOLVE_RN_m2n4 \
"movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) "movq %2,%3; addq $8,%2;"\
SOLVE_col1_ltor_m2n4(0,4,5,%1)\
SOLVE_col2_ltor_m2n4(16,4,5,%1)\
SOLVE_col3_ltor_m2n4(32,4,5,%1)\
SOLVE_col4_ltor_m2n4(48,4,5,%1)\
SAVE_SOLUTION_m2n4(4,5,0)
#define SOLVE_RN_m2n8 \
"movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) GEMM_SUM_REORDER_2x4(6,7) "movq %2,%3; addq $8,%2;"\
SOLVE_col1_ltor_m2n4(0,4,5,%1) SUBTRACT_m2n4(0,6,7,%1,%%r12,4)\
SOLVE_col2_ltor_m2n4(16,4,5,%1) SUBTRACT_m2n4(16,6,7,%1,%%r12,4)\
SOLVE_col3_ltor_m2n4(32,4,5,%1) SUBTRACT_m2n4(32,6,7,%1,%%r12,4)\
SOLVE_col4_ltor_m2n4(48,4,5,%1) SUBTRACT_m2n4(48,6,7,%1,%%r12,4)\
SAVE_SOLUTION_m2n4(4,5,0)\
SOLVE_col1_ltor_m2n4(64,6,7,%1,%%r12,4)\
SOLVE_col2_ltor_m2n4(80,6,7,%1,%%r12,4)\
SOLVE_col3_ltor_m2n4(96,6,7,%1,%%r12,4)\
SOLVE_col4_ltor_m2n4(112,6,7,%1,%%r12,4)\
SAVE_SOLUTION_m2n4(6,7,32)
#define SOLVE_RN_m2n12 \
"movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) GEMM_SUM_REORDER_2x4(6,7) GEMM_SUM_REORDER_2x4(8,9) "movq %2,%3; addq $8,%2;"\
SOLVE_col1_ltor_m2n4(0,4,5,%1) SUBTRACT_m2n4(0,6,7,%1,%%r12,4) SUBTRACT_m2n4(0,8,9,%1,%%r12,8)\
SOLVE_col2_ltor_m2n4(16,4,5,%1) SUBTRACT_m2n4(16,6,7,%1,%%r12,4) SUBTRACT_m2n4(16,8,9,%1,%%r12,8)\
SOLVE_col3_ltor_m2n4(32,4,5,%1) SUBTRACT_m2n4(32,6,7,%1,%%r12,4) SUBTRACT_m2n4(32,8,9,%1,%%r12,8)\
SOLVE_col4_ltor_m2n4(48,4,5,%1) SUBTRACT_m2n4(48,6,7,%1,%%r12,4) SUBTRACT_m2n4(48,8,9,%1,%%r12,8)\
SAVE_SOLUTION_m2n4(4,5,0)\
SOLVE_col1_ltor_m2n4(64,6,7,%1,%%r12,4) SUBTRACT_m2n4(64,8,9,%1,%%r12,8)\
SOLVE_col2_ltor_m2n4(80,6,7,%1,%%r12,4) SUBTRACT_m2n4(80,8,9,%1,%%r12,8)\
SOLVE_col3_ltor_m2n4(96,6,7,%1,%%r12,4) SUBTRACT_m2n4(96,8,9,%1,%%r12,8)\
SOLVE_col4_ltor_m2n4(112,6,7,%1,%%r12,4) SUBTRACT_m2n4(112,8,9,%1,%%r12,8)\
SAVE_SOLUTION_m2n4(6,7,32)\
SOLVE_col1_ltor_m2n4(128,8,9,%1,%%r12,8)\
SOLVE_col2_ltor_m2n4(144,8,9,%1,%%r12,8)\
SOLVE_col3_ltor_m2n4(160,8,9,%1,%%r12,8)\
SOLVE_col4_ltor_m2n4(176,8,9,%1,%%r12,8)\
SAVE_SOLUTION_m2n4(8,9,64)
#define SOLVE_RN_m1n4 \
"movq %2,%3;" GEMM_SUM_REORDER_1x4(4) "movq %2,%3; addq $4,%2;"\
SOLVE_col1_ltor_m1n4(0,4,%1)\
SOLVE_col2_ltor_m1n4(16,4,%1)\
SOLVE_col3_ltor_m1n4(32,4,%1)\
SOLVE_col4_ltor_m1n4(48,4,%1)\
SAVE_SOLUTION_m1n4(4,0)
#define SOLVE_RN_m1n8 \
"movq %2,%3;" GEMM_SUM_REORDER_1x4(4) GEMM_SUM_REORDER_1x4(5) "movq %2,%3; addq $4,%2;"\
SOLVE_col1_ltor_m1n4(0,4,%1) SUBTRACT_m1n4(0,5,%1,%%r12,4)\
SOLVE_col2_ltor_m1n4(16,4,%1) SUBTRACT_m1n4(16,5,%1,%%r12,4)\
SOLVE_col3_ltor_m1n4(32,4,%1) SUBTRACT_m1n4(32,5,%1,%%r12,4)\
SOLVE_col4_ltor_m1n4(48,4,%1) SUBTRACT_m1n4(48,5,%1,%%r12,4)\
SAVE_SOLUTION_m1n4(4,0)\
SOLVE_col1_ltor_m1n4(64,5,%1,%%r12,4)\
SOLVE_col2_ltor_m1n4(80,5,%1,%%r12,4)\
SOLVE_col3_ltor_m1n4(96,5,%1,%%r12,4)\
SOLVE_col4_ltor_m1n4(112,5,%1,%%r12,4)\
SAVE_SOLUTION_m1n4(5,16)
#define SOLVE_RN_m1n12 \
"movq %2,%3;" GEMM_SUM_REORDER_1x4(4) GEMM_SUM_REORDER_1x4(5) GEMM_SUM_REORDER_1x4(6) "movq %2,%3; addq $4,%2;"\
SOLVE_col1_ltor_m1n4(0,4,%1) SUBTRACT_m1n4(0,5,%1,%%r12,4) SUBTRACT_m1n4(0,6,%1,%%r12,8)\
SOLVE_col2_ltor_m1n4(16,4,%1) SUBTRACT_m1n4(16,5,%1,%%r12,4) SUBTRACT_m1n4(16,6,%1,%%r12,8)\
SOLVE_col3_ltor_m1n4(32,4,%1) SUBTRACT_m1n4(32,5,%1,%%r12,4) SUBTRACT_m1n4(32,6,%1,%%r12,8)\
SOLVE_col4_ltor_m1n4(48,4,%1) SUBTRACT_m1n4(48,5,%1,%%r12,4) SUBTRACT_m1n4(48,6,%1,%%r12,8)\
SAVE_SOLUTION_m1n4(4,0)\
SOLVE_col1_ltor_m1n4(64,5,%1,%%r12,4) SUBTRACT_m1n4(64,6,%1,%%r12,8)\
SOLVE_col2_ltor_m1n4(80,5,%1,%%r12,4) SUBTRACT_m1n4(80,6,%1,%%r12,8)\
SOLVE_col3_ltor_m1n4(96,5,%1,%%r12,4) SUBTRACT_m1n4(96,6,%1,%%r12,8)\
SOLVE_col4_ltor_m1n4(112,5,%1,%%r12,4) SUBTRACT_m1n4(112,6,%1,%%r12,8)\
SAVE_SOLUTION_m1n4(5,16)\
SOLVE_col1_ltor_m1n4(128,6,%1,%%r12,8)\
SOLVE_col2_ltor_m1n4(144,6,%1,%%r12,8)\
SOLVE_col3_ltor_m1n4(160,6,%1,%%r12,8)\
SOLVE_col4_ltor_m1n4(176,6,%1,%%r12,8)\
SAVE_SOLUTION_m1n4(6,32)
#define GEMM_RN_SIMPLE(mdim,ndim) \
"movq %%r15,%0; leaq (%%r15,%%r12,"#mdim"),%%r15; movq %%r13,%5; movq %%r14,%1;" INIT_m##mdim##n##ndim\
"testq %5,%5; jz 1"#mdim""#ndim"2f;"\
"1"#mdim""#ndim"1:\n\t"\
GEMM_KERNEL_k1m##mdim##n##ndim "addq $16,%1; addq $"#mdim"*4,%0; decq %5; jnz 1"#mdim""#ndim"1b;"\
"1"#mdim""#ndim"2:\n\t"
#define GEMM_RN_m8n4 GEMM_RN_SIMPLE(8,4)
#define GEMM_RN_m8n8 GEMM_RN_SIMPLE(8,8)
#define GEMM_RN_m8n12 \
"movq %%r15,%0; leaq (%%r15,%%r12,8),%%r15; movq %%r13,%5; movq %%r14,%1;" INIT_m8n12\
"cmpq $8,%5; jb 18122f;"\
"18121:\n\t"\
GEMM_KERNEL_k1m8n12 "prefetcht0 384(%0); addq $32,%0; addq $16,%1;"\
GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1;"\
GEMM_KERNEL_k1m8n12 "prefetcht0 384(%0); addq $32,%0; addq $16,%1;"\
GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1;"\
GEMM_KERNEL_k1m8n12 "prefetcht0 384(%0); addq $32,%0; addq $16,%1;"\
GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1;"\
GEMM_KERNEL_k1m8n12 "prefetcht0 384(%0); addq $32,%0; addq $16,%1;"\
GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1;"\
"subq $8,%5; cmpq $8,%5; jnb 18121b;"\
"18122:\n\t"\
"testq %5,%5; jz 18124f;"\
"18123:\n\t"\
GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1; decq %5; jnz 18123b;"\
"18124:\n\t"
#define GEMM_RN_m4n4 GEMM_RN_SIMPLE(4,4)
#define GEMM_RN_m4n8 GEMM_RN_SIMPLE(4,8)
#define GEMM_RN_m4n12 GEMM_RN_SIMPLE(4,12)
#define GEMM_RN_m2n4 GEMM_RN_SIMPLE(2,4)
#define GEMM_RN_m2n8 GEMM_RN_SIMPLE(2,8)
#define GEMM_RN_m2n12 GEMM_RN_SIMPLE(2,12)
#define GEMM_RN_m1n4 GEMM_RN_SIMPLE(1,4)
#define GEMM_RN_m1n8 GEMM_RN_SIMPLE(1,8)
#define GEMM_RN_m1n12 GEMM_RN_SIMPLE(1,12)
#define COMPUTE(ndim) {\
__asm__ __volatile__(\
"movq %0,%%r15; movq %1,%%r14; movq %7,%%r13; movq %6,%%r12; salq $2,%%r12; movq %10,%%r11;"\
"cmpq $8,%%r11; jb "#ndim"772f;"\
#ndim"771:\n\t"\
GEMM_RN_m8n##ndim SOLVE_RN_m8n##ndim "subq $8,%%r11; cmpq $8,%%r11; jnb "#ndim"771b;"\
#ndim"772:\n\t"\
"testq $4,%%r11; jz "#ndim"773f;"\
GEMM_RN_m4n##ndim SOLVE_RN_m4n##ndim "subq $4,%%r11;"\
#ndim"773:\n\t"\
"testq $2,%%r11; jz "#ndim"774f;"\
GEMM_RN_m2n##ndim SOLVE_RN_m2n##ndim "subq $2,%%r11;"\
#ndim"774:\n\t"\
"testq $1,%%r11; jz "#ndim"775f;"\
GEMM_RN_m1n##ndim SOLVE_RN_m1n##ndim "subq $1,%%r11;"\
#ndim"775:\n\t"\
"movq %%r15,%0; movq %%r14,%1; vzeroupper;"\
:"+r"(a_ptr),"+r"(b_ptr),"+r"(c_ptr),"+r"(c_tmp),"+r"(ldc_bytes),"+r"(k_cnt):"m"(K),"m"(OFF),"m"(one[0]),"m"(zero[0]),"m"(M)\
:"r11","r12","r13","r14","r15","cc","memory",\
"xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15");\
a_ptr -= M * K; b_ptr += ndim * K; c_ptr += ldc * ndim - M; OFF += ndim;\
}
static void solve_RN(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
FLOAT a0, b0;
int i, j, k;
for (i=0; i<n; i++) {
b0 = b[i*n+i];
for (j=0; j<m; j++) {
a0 = c[i*ldc+j] * b0;
a[i*m+j] = c[i*ldc+j] = a0;
for (k=i+1; k<n; k++) c[k*ldc+j] -= a0 * b[i*n+k];
}
}
}
static void COMPUTE_EDGE_1_nchunk(BLASLONG m, BLASLONG n, FLOAT *sa, FLOAT *sb, FLOAT *C, BLASLONG ldc, BLASLONG k, BLASLONG offset) {
BLASLONG m_count = m, kk = offset; FLOAT *a_ptr = sa, *c_ptr = C;
for(;m_count>7;m_count-=8){
if(kk>0) GEMM_KERNEL_N(8,n,kk,-1.0,a_ptr,sb,c_ptr,ldc);
solve_RN(8,n,a_ptr+kk*8,sb+kk*n,c_ptr,ldc);
a_ptr += k * 8; c_ptr += 8;
}
for(;m_count>3;m_count-=4){
if(kk>0) GEMM_KERNEL_N(4,n,kk,-1.0,a_ptr,sb,c_ptr,ldc);
solve_RN(4,n,a_ptr+kk*4,sb+kk*n,c_ptr,ldc);
a_ptr += k * 4; c_ptr += 4;
}
for(;m_count>1;m_count-=2){
if(kk>0) GEMM_KERNEL_N(2,n,kk,-1.0,a_ptr,sb,c_ptr,ldc);
solve_RN(2,n,a_ptr+kk*2,sb+kk*n,c_ptr,ldc);
a_ptr += k * 2; c_ptr += 2;
}
if(m_count>0){
if(kk>0) GEMM_KERNEL_N(1,n,kk,-1.0,a_ptr,sb,c_ptr,ldc);
solve_RN(1,n,a_ptr+kk*1,sb+kk*n,c_ptr,ldc);
a_ptr += k * 1; c_ptr += 1;
}
}
int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *sa, FLOAT *sb, FLOAT *C, BLASLONG ldc, BLASLONG offset){
float *a_ptr = sa, *b_ptr = sb, *c_ptr = C, *c_tmp = C;
float one[8] = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0};
float zero[8] = {0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0};
uint64_t ldc_bytes = (uint64_t)ldc * sizeof(float), K = (uint64_t)k, M = (uint64_t)m, OFF = (uint64_t)-offset, k_cnt = 0;
BLASLONG n_count = n;
for(;n_count>11;n_count-=12) COMPUTE(12)
for(;n_count>7;n_count-=8) COMPUTE(8)
for(;n_count>3;n_count-=4) COMPUTE(4)
for(;n_count>1;n_count-=2) { COMPUTE_EDGE_1_nchunk(m,2,a_ptr,b_ptr,c_ptr,ldc,k,OFF); b_ptr += 2*k; c_ptr += ldc*2; OFF+=2;}
if(n_count>0) COMPUTE_EDGE_1_nchunk(m,1,a_ptr,b_ptr,c_ptr,ldc,k,OFF);
return 0;
}

View File

@ -0,0 +1,281 @@
#include "common.h"
#include <stdint.h>
#include "strsm_kernel_8x4_haswell_R_common.h"
#define SOLVE_RT_m8n4 \
"movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $32,%2;"\
SOLVE_rile_m8n2(-8,6,7,%1) SUBTRACT_m8n2(-16,4,5,%1)\
SOLVE_le_m8n2(-24,6,7,%1) SUBTRACT_m8n2(-32,4,5,%1)\
SAVE_SOLUTION_m8n2(6,7,-64) "negq %4; leaq (%3,%4,4),%3; negq %4;"\
SOLVE_rile_m8n2(-48,4,5,%1)\
SOLVE_le_m8n2(-64,4,5,%1)\
SAVE_SOLUTION_m8n2(4,5,-128)
#define SOLVE_RT_m8n8 \
"movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) GEMM_SUM_REORDER_8x4(8,9,10,11,63) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $32,%2;"\
SOLVE_rile_m8n2(-8,10,11,%1,%%r12,4) SUBTRACT_m8n2(-16,8,9,%1,%%r12,4) SUBTRACT_m8n2(-8,6,7,%1) SUBTRACT_m8n2(-16,4,5,%1)\
SOLVE_le_m8n2(-24,10,11,%1,%%r12,4) SUBTRACT_m8n2(-32,8,9,%1,%%r12,4) SUBTRACT_m8n2(-24,6,7,%1) SUBTRACT_m8n2(-32,4,5,%1)\
SAVE_SOLUTION_m8n2(10,11,-64) "negq %4; leaq (%3,%4,4),%3; negq %4;"\
SOLVE_rile_m8n2(-48,8,9,%1,%%r12,4) SUBTRACT_m8n2(-40,6,7,%1) SUBTRACT_m8n2(-48,4,5,%1)\
SOLVE_le_m8n2(-64,8,9,%1,%%r12,4) SUBTRACT_m8n2(-56,6,7,%1) SUBTRACT_m8n2(-64,4,5,%1)\
SAVE_SOLUTION_m8n2(8,9,-128) "negq %4; leaq (%3,%4,4),%3; negq %4;"\
SOLVE_rile_m8n2(-72,6,7,%1) SUBTRACT_m8n2(-80,4,5,%1)\
SOLVE_le_m8n2(-88,6,7,%1) SUBTRACT_m8n2(-96,4,5,%1)\
SAVE_SOLUTION_m8n2(6,7,-192) "negq %4; leaq (%3,%4,4),%3; negq %4;"\
SOLVE_rile_m8n2(-112,4,5,%1)\
SOLVE_le_m8n2(-128,4,5,%1)\
SAVE_SOLUTION_m8n2(4,5,-256)
#define SOLVE_RT_m8n12 \
"movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) GEMM_SUM_REORDER_8x4(8,9,10,11,63) GEMM_SUM_REORDER_8x4(12,13,14,15,63) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $32,%2;"\
SOLVE_rile_m8n2(-8,14,15,%1,%%r12,8) SUBTRACT_m8n2(-16,12,13,%1,%%r12,8) SUBTRACT_m8n2(-8,10,11,%1,%%r12,4) SUBTRACT_m8n2(-16,8,9,%1,%%r12,4) SUBTRACT_m8n2(-8,6,7,%1) SUBTRACT_m8n2(-16,4,5,%1)\
SOLVE_le_m8n2(-24,14,15,%1,%%r12,8) SUBTRACT_m8n2(-32,12,13,%1,%%r12,8) SUBTRACT_m8n2(-24,10,11,%1,%%r12,4) SUBTRACT_m8n2(-32,8,9,%1,%%r12,4) SUBTRACT_m8n2(-24,6,7,%1) SUBTRACT_m8n2(-32,4,5,%1)\
SAVE_SOLUTION_m8n2(14,15,-64) "negq %4; leaq (%3,%4,4),%3; negq %4;"\
SOLVE_rile_m8n2(-48,12,13,%1,%%r12,8) SUBTRACT_m8n2(-40,10,11,%1,%%r12,4) SUBTRACT_m8n2(-48,8,9,%1,%%r12,4) SUBTRACT_m8n2(-40,6,7,%1) SUBTRACT_m8n2(-48,4,5,%1)\
SOLVE_le_m8n2(-64,12,13,%1,%%r12,8) SUBTRACT_m8n2(-56,10,11,%1,%%r12,4) SUBTRACT_m8n2(-64,8,9,%1,%%r12,4) SUBTRACT_m8n2(-56,6,7,%1) SUBTRACT_m8n2(-64,4,5,%1)\
SAVE_SOLUTION_m8n2(12,13,-128) "negq %4; leaq (%3,%4,4),%3; negq %4;"\
SOLVE_rile_m8n2(-72,10,11,%1,%%r12,4) SUBTRACT_m8n2(-80,8,9,%1,%%r12,4) SUBTRACT_m8n2(-72,6,7,%1) SUBTRACT_m8n2(-80,4,5,%1)\
SOLVE_le_m8n2(-88,10,11,%1,%%r12,4) SUBTRACT_m8n2(-96,8,9,%1,%%r12,4) SUBTRACT_m8n2(-88,6,7,%1) SUBTRACT_m8n2(-96,4,5,%1)\
SAVE_SOLUTION_m8n2(10,11,-192) "negq %4; leaq (%3,%4,4),%3; negq %4;"\
SOLVE_rile_m8n2(-112,8,9,%1,%%r12,4) SUBTRACT_m8n2(-104,6,7,%1) SUBTRACT_m8n2(-112,4,5,%1)\
SOLVE_le_m8n2(-128,8,9,%1,%%r12,4) SUBTRACT_m8n2(-120,6,7,%1) SUBTRACT_m8n2(-128,4,5,%1)\
SAVE_SOLUTION_m8n2(8,9,-256) "negq %4; leaq (%3,%4,4),%3; negq %4;"\
SOLVE_rile_m8n2(-136,6,7,%1) SUBTRACT_m8n2(-144,4,5,%1)\
SOLVE_le_m8n2(-152,6,7,%1) SUBTRACT_m8n2(-160,4,5,%1)\
SAVE_SOLUTION_m8n2(6,7,-320) "negq %4; leaq (%3,%4,4),%3; negq %4;"\
SOLVE_rile_m8n2(-176,4,5,%1)\
SOLVE_le_m8n2(-192,4,5,%1)\
SAVE_SOLUTION_m8n2(4,5,-384)
#define SOLVE_RT_m4n4 \
"movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $16,%2;"\
SOLVE_rile_m4n2(-8,5,%1) SUBTRACT_m4n2(-16,4,%1)\
SOLVE_le_m4n2(-24,5,%1) SUBTRACT_m4n2(-32,4,%1)\
SAVE_SOLUTION_m4n2(5,-32) "negq %4; leaq (%3,%4,4),%3; negq %4;"\
SOLVE_rile_m4n2(-48,4,%1)\
SOLVE_le_m4n2(-64,4,%1)\
SAVE_SOLUTION_m4n2(4,-64)
#define SOLVE_RT_m4n8 \
"movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) GEMM_SUM_REORDER_4x4(8,9,10,11,6,7) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $16,%2;"\
SOLVE_rile_m4n2(-8,7,%1,%%r12,4) SUBTRACT_m4n2(-16,6,%1,%%r12,4) SUBTRACT_m4n2(-8,5,%1) SUBTRACT_m4n2(-16,4,%1)\
SOLVE_le_m4n2(-24,7,%1,%%r12,4) SUBTRACT_m4n2(-32,6,%1,%%r12,4) SUBTRACT_m4n2(-24,5,%1) SUBTRACT_m4n2(-32,4,%1)\
SAVE_SOLUTION_m4n2(7,-32) "negq %4; leaq (%3,%4,4),%3; negq %4;"\
SOLVE_rile_m4n2(-48,6,%1,%%r12,4) SUBTRACT_m4n2(-40,5,%1) SUBTRACT_m4n2(-48,4,%1)\
SOLVE_le_m4n2(-64,6,%1,%%r12,4) SUBTRACT_m4n2(-56,5,%1) SUBTRACT_m4n2(-64,4,%1)\
SAVE_SOLUTION_m4n2(6,-64) "negq %4; leaq (%3,%4,4),%3; negq %4;"\
SOLVE_rile_m4n2(-72,5,%1) SUBTRACT_m4n2(-80,4,%1)\
SOLVE_le_m4n2(-88,5,%1) SUBTRACT_m4n2(-96,4,%1)\
SAVE_SOLUTION_m4n2(5,-96) "negq %4; leaq (%3,%4,4),%3; negq %4;"\
SOLVE_rile_m4n2(-112,4,%1)\
SOLVE_le_m4n2(-128,4,%1)\
SAVE_SOLUTION_m4n2(4,-128)
#define SOLVE_RT_m4n12 \
"movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) GEMM_SUM_REORDER_4x4(8,9,10,11,6,7) GEMM_SUM_REORDER_4x4(12,13,14,15,8,9) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $16,%2;"\
SOLVE_rile_m4n2(-8,9,%1,%%r12,8) SUBTRACT_m4n2(-16,8,%1,%%r12,8) SUBTRACT_m4n2(-8,7,%1,%%r12,4) SUBTRACT_m4n2(-16,6,%1,%%r12,4) SUBTRACT_m4n2(-8,5,%1) SUBTRACT_m4n2(-16,4,%1)\
SOLVE_le_m4n2(-24,9,%1,%%r12,8) SUBTRACT_m4n2(-32,8,%1,%%r12,8) SUBTRACT_m4n2(-24,7,%1,%%r12,4) SUBTRACT_m4n2(-32,6,%1,%%r12,4) SUBTRACT_m4n2(-24,5,%1) SUBTRACT_m4n2(-32,4,%1)\
SAVE_SOLUTION_m4n2(9,-32) "negq %4; leaq (%3,%4,4),%3; negq %4;"\
SOLVE_rile_m4n2(-48,8,%1,%%r12,8) SUBTRACT_m4n2(-40,7,%1,%%r12,4) SUBTRACT_m4n2(-48,6,%1,%%r12,4) SUBTRACT_m4n2(-40,5,%1) SUBTRACT_m4n2(-48,4,%1)\
SOLVE_le_m4n2(-64,8,%1,%%r12,8) SUBTRACT_m4n2(-56,7,%1,%%r12,4) SUBTRACT_m4n2(-64,6,%1,%%r12,4) SUBTRACT_m4n2(-56,5,%1) SUBTRACT_m4n2(-64,4,%1)\
SAVE_SOLUTION_m4n2(8,-64) "negq %4; leaq (%3,%4,4),%3; negq %4;"\
SOLVE_rile_m4n2(-72,7,%1,%%r12,4) SUBTRACT_m4n2(-80,6,%1,%%r12,4) SUBTRACT_m4n2(-72,5,%1) SUBTRACT_m4n2(-80,4,%1)\
SOLVE_le_m4n2(-88,7,%1,%%r12,4) SUBTRACT_m4n2(-96,6,%1,%%r12,4) SUBTRACT_m4n2(-88,5,%1) SUBTRACT_m4n2(-96,4,%1)\
SAVE_SOLUTION_m4n2(7,-96) "negq %4; leaq (%3,%4,4),%3; negq %4;"\
SOLVE_rile_m4n2(-112,6,%1,%%r12,4) SUBTRACT_m4n2(-104,5,%1) SUBTRACT_m4n2(-112,4,%1)\
SOLVE_le_m4n2(-128,6,%1,%%r12,4) SUBTRACT_m4n2(-120,5,%1) SUBTRACT_m4n2(-128,4,%1)\
SAVE_SOLUTION_m4n2(6,-128) "negq %4; leaq (%3,%4,4),%3; negq %4;"\
SOLVE_rile_m4n2(-136,5,%1) SUBTRACT_m4n2(-144,4,%1)\
SOLVE_le_m4n2(-152,5,%1) SUBTRACT_m4n2(-160,4,%1)\
SAVE_SOLUTION_m4n2(5,-160) "negq %4; leaq (%3,%4,4),%3; negq %4;"\
SOLVE_rile_m4n2(-176,4,%1)\
SOLVE_le_m4n2(-192,4,%1)\
SAVE_SOLUTION_m4n2(4,-192)
#define SOLVE_RT_m2n4 \
"movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $8,%2;"\
SOLVE_col4_rtol_m2n4(-16,4,5,%1)\
SOLVE_col3_rtol_m2n4(-32,4,5,%1)\
SOLVE_col2_rtol_m2n4(-48,4,5,%1)\
SOLVE_col1_rtol_m2n4(-64,4,5,%1)\
SAVE_SOLUTION_m2n4(4,5,-32)
#define SOLVE_RT_m2n8 \
"movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) GEMM_SUM_REORDER_2x4(6,7) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $8,%2;"\
SOLVE_col4_rtol_m2n4(-16,6,7,%1,%%r12,4) SUBTRACT_m2n4(-16,4,5,%1)\
SOLVE_col3_rtol_m2n4(-32,6,7,%1,%%r12,4) SUBTRACT_m2n4(-32,4,5,%1)\
SOLVE_col2_rtol_m2n4(-48,6,7,%1,%%r12,4) SUBTRACT_m2n4(-48,4,5,%1)\
SOLVE_col1_rtol_m2n4(-64,6,7,%1,%%r12,4) SUBTRACT_m2n4(-64,4,5,%1)\
SAVE_SOLUTION_m2n4(6,7,-32) "negq %4; leaq (%3,%4,8),%3; negq %4;"\
SOLVE_col4_rtol_m2n4(-80,4,5,%1)\
SOLVE_col3_rtol_m2n4(-96,4,5,%1)\
SOLVE_col2_rtol_m2n4(-112,4,5,%1)\
SOLVE_col1_rtol_m2n4(-128,4,5,%1)\
SAVE_SOLUTION_m2n4(4,5,-64)
#define SOLVE_RT_m2n12 \
"movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) GEMM_SUM_REORDER_2x4(6,7) GEMM_SUM_REORDER_2x4(8,9) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $8,%2;"\
SOLVE_col4_rtol_m2n4(-16,8,9,%1,%%r12,8) SUBTRACT_m2n4(-16,6,7,%1,%%r12,4) SUBTRACT_m2n4(-16,4,5,%1)\
SOLVE_col3_rtol_m2n4(-32,8,9,%1,%%r12,8) SUBTRACT_m2n4(-32,6,7,%1,%%r12,4) SUBTRACT_m2n4(-32,4,5,%1)\
SOLVE_col2_rtol_m2n4(-48,8,9,%1,%%r12,8) SUBTRACT_m2n4(-48,6,7,%1,%%r12,4) SUBTRACT_m2n4(-48,4,5,%1)\
SOLVE_col1_rtol_m2n4(-64,8,9,%1,%%r12,8) SUBTRACT_m2n4(-64,6,7,%1,%%r12,4) SUBTRACT_m2n4(-64,4,5,%1)\
SAVE_SOLUTION_m2n4(8,9,-32) "negq %4; leaq (%3,%4,8),%3; negq %4;"\
SOLVE_col4_rtol_m2n4(-80,6,7,%1,%%r12,4) SUBTRACT_m2n4(-80,4,5,%1)\
SOLVE_col3_rtol_m2n4(-96,6,7,%1,%%r12,4) SUBTRACT_m2n4(-96,4,5,%1)\
SOLVE_col2_rtol_m2n4(-112,6,7,%1,%%r12,4) SUBTRACT_m2n4(-112,4,5,%1)\
SOLVE_col1_rtol_m2n4(-128,6,7,%1,%%r12,4) SUBTRACT_m2n4(-128,4,5,%1)\
SAVE_SOLUTION_m2n4(6,7,-64) "negq %4; leaq (%3,%4,8),%3; negq %4;"\
SOLVE_col4_rtol_m2n4(-144,4,5,%1)\
SOLVE_col3_rtol_m2n4(-160,4,5,%1)\
SOLVE_col2_rtol_m2n4(-176,4,5,%1)\
SOLVE_col1_rtol_m2n4(-192,4,5,%1)\
SAVE_SOLUTION_m2n4(4,5,-96)
#define SOLVE_RT_m1n4 \
"movq %2,%3;" GEMM_SUM_REORDER_1x4(4) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $4,%2;"\
SOLVE_col4_rtol_m1n4(-16,4,%1)\
SOLVE_col3_rtol_m1n4(-32,4,%1)\
SOLVE_col2_rtol_m1n4(-48,4,%1)\
SOLVE_col1_rtol_m1n4(-64,4,%1)\
SAVE_SOLUTION_m1n4(4,-16)
#define SOLVE_RT_m1n8 \
"movq %2,%3;" GEMM_SUM_REORDER_1x4(4) GEMM_SUM_REORDER_1x4(5) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $4,%2;"\
SOLVE_col4_rtol_m1n4(-16,5,%1,%%r12,4) SUBTRACT_m1n4(-16,4,%1)\
SOLVE_col3_rtol_m1n4(-32,5,%1,%%r12,4) SUBTRACT_m1n4(-32,4,%1)\
SOLVE_col2_rtol_m1n4(-48,5,%1,%%r12,4) SUBTRACT_m1n4(-48,4,%1)\
SOLVE_col1_rtol_m1n4(-64,5,%1,%%r12,4) SUBTRACT_m1n4(-64,4,%1)\
SAVE_SOLUTION_m1n4(5,-16) "negq %4; leaq (%3,%4,8),%3; negq %4;"\
SOLVE_col4_rtol_m1n4(-80,4,%1)\
SOLVE_col3_rtol_m1n4(-96,4,%1)\
SOLVE_col2_rtol_m1n4(-112,4,%1)\
SOLVE_col1_rtol_m1n4(-128,4,%1)\
SAVE_SOLUTION_m1n4(4,-32)
#define SOLVE_RT_m1n12 \
"movq %2,%3;" GEMM_SUM_REORDER_1x4(4) GEMM_SUM_REORDER_1x4(5) GEMM_SUM_REORDER_1x4(6) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $4,%2;"\
SOLVE_col4_rtol_m1n4(-16,6,%1,%%r12,8) SUBTRACT_m1n4(-16,5,%1,%%r12,4) SUBTRACT_m1n4(-16,4,%1)\
SOLVE_col3_rtol_m1n4(-32,6,%1,%%r12,8) SUBTRACT_m1n4(-32,5,%1,%%r12,4) SUBTRACT_m1n4(-32,4,%1)\
SOLVE_col2_rtol_m1n4(-48,6,%1,%%r12,8) SUBTRACT_m1n4(-48,5,%1,%%r12,4) SUBTRACT_m1n4(-48,4,%1)\
SOLVE_col1_rtol_m1n4(-64,6,%1,%%r12,8) SUBTRACT_m1n4(-64,5,%1,%%r12,4) SUBTRACT_m1n4(-64,4,%1)\
SAVE_SOLUTION_m1n4(6,-16) "negq %4; leaq (%3,%4,8),%3; negq %4;"\
SOLVE_col4_rtol_m1n4(-80,5,%1,%%r12,4) SUBTRACT_m1n4(-80,4,%1)\
SOLVE_col3_rtol_m1n4(-96,5,%1,%%r12,4) SUBTRACT_m1n4(-96,4,%1)\
SOLVE_col2_rtol_m1n4(-112,5,%1,%%r12,4) SUBTRACT_m1n4(-112,4,%1)\
SOLVE_col1_rtol_m1n4(-128,5,%1,%%r12,4) SUBTRACT_m1n4(-128,4,%1)\
SAVE_SOLUTION_m1n4(5,-32) "negq %4; leaq (%3,%4,8),%3; negq %4;"\
SOLVE_col4_rtol_m1n4(-144,4,%1)\
SOLVE_col3_rtol_m1n4(-160,4,%1)\
SOLVE_col2_rtol_m1n4(-176,4,%1)\
SOLVE_col1_rtol_m1n4(-192,4,%1)\
SAVE_SOLUTION_m1n4(4,-48)
/* r14 = b_tail, r15 = a_tail, r13 = k-kk */
#define GEMM_RT_SIMPLE(mdim,ndim) \
"leaq (%%r15,%%r12,"#mdim"),%%r15; movq %%r15,%0; movq %%r13,%5; movq %%r14,%1;" INIT_m##mdim##n##ndim\
"testq %5,%5; jz 1"#mdim""#ndim"2f;"\
"1"#mdim""#ndim"1:\n\t"\
"subq $16,%1; subq $"#mdim"*4,%0;" GEMM_KERNEL_k1m##mdim##n##ndim "decq %5; jnz 1"#mdim""#ndim"1b;"\
"1"#mdim""#ndim"2:\n\t"
#define GEMM_RT_m8n4 GEMM_RT_SIMPLE(8,4)
#define GEMM_RT_m8n8 GEMM_RT_SIMPLE(8,8)
#define GEMM_RT_m8n12 \
"leaq (%%r15,%%r12,8),%%r15; movq %%r15,%0; movq %%r13,%5; movq %%r14,%1;" INIT_m8n12\
"cmpq $8,%5; jb 18122f;"\
"18121:\n\t"\
"prefetcht0 -384(%0); subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\
"subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\
"prefetcht0 -384(%0); subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\
"subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\
"prefetcht0 -384(%0); subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\
"subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\
"prefetcht0 -384(%0); subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\
"subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\
"subq $8,%5; cmpq $8,%5; jnb 18121b;"\
"18122:\n\t"\
"testq %5,%5; jz 18124f;"\
"18123:\n\t"\
"subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12 "decq %5; jnz 18123b;"\
"18124:\n\t"
#define GEMM_RT_m4n4 GEMM_RT_SIMPLE(4,4)
#define GEMM_RT_m4n8 GEMM_RT_SIMPLE(4,8)
#define GEMM_RT_m4n12 GEMM_RT_SIMPLE(4,12)
#define GEMM_RT_m2n4 GEMM_RT_SIMPLE(2,4)
#define GEMM_RT_m2n8 GEMM_RT_SIMPLE(2,8)
#define GEMM_RT_m2n12 GEMM_RT_SIMPLE(2,12)
#define GEMM_RT_m1n4 GEMM_RT_SIMPLE(1,4)
#define GEMM_RT_m1n8 GEMM_RT_SIMPLE(1,8)
#define GEMM_RT_m1n12 GEMM_RT_SIMPLE(1,12)
#define COMPUTE(ndim) {\
b_ptr -= (ndim-4)*K; c_ptr -= ndim * ldc;\
__asm__ __volatile__(\
"movq %0,%%r15; movq %6,%%r13; subq %7,%%r13; movq %6,%%r12; salq $2,%%r12; movq %1,%%r14; movq %10,%%r11;"\
"cmpq $8,%%r11; jb "#ndim"772f;"\
#ndim"771:\n\t"\
GEMM_RT_m8n##ndim SOLVE_RT_m8n##ndim "subq $8,%%r11; cmpq $8,%%r11; jnb "#ndim"771b;"\
#ndim"772:\n\t"\
"testq $4,%%r11; jz "#ndim"773f;"\
GEMM_RT_m4n##ndim SOLVE_RT_m4n##ndim "subq $4,%%r11;"\
#ndim"773:\n\t"\
"testq $2,%%r11; jz "#ndim"774f;"\
GEMM_RT_m2n##ndim SOLVE_RT_m2n##ndim "subq $2,%%r11;"\
#ndim"774:\n\t"\
"testq $1,%%r11; jz "#ndim"775f;"\
GEMM_RT_m1n##ndim SOLVE_RT_m1n##ndim "subq $1,%%r11;"\
#ndim"775:\n\t"\
"movq %%r15,%0; movq %%r14,%1; vzeroupper;"\
:"+r"(a_ptr),"+r"(b_ptr),"+r"(c_ptr),"+r"(c_tmp),"+r"(ldc_bytes),"+r"(k_cnt):"m"(K),"m"(OFF),"m"(one[0]),"m"(zero[0]),"m"(M)\
:"r11","r12","r13","r14","r15","cc","memory",\
"xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15");\
a_ptr -= M * K; b_ptr -= 4 * K; c_ptr -= M; OFF -= ndim;\
}
static void solve_RT(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc){
FLOAT a0, b0;
int i, j, k;
for (i=n-1;i>=0;i--) {
b0 = b[i*n+i];
for (j=0;j<m;j++) {
a0 = c[i*ldc+j] * b0;
a[i*m+j] = c[i*ldc+j] = a0;
for (k=0;k<i;k++) c[k*ldc+j] -= a0 * b[i*n+k];
}
}
}
static void COMPUTE_EDGE_1_nchunk(BLASLONG m, BLASLONG n, FLOAT *sa, FLOAT *sb, FLOAT *C, BLASLONG ldc, BLASLONG k, BLASLONG offset) {
BLASLONG m_count = m, kk = offset; FLOAT *a_ptr = sa, *c_ptr = C;
for(;m_count>7;m_count-=8){
if(k-kk>0) GEMM_KERNEL_N(8,n,k-kk,-1.0,a_ptr+kk*8,sb+kk*n,c_ptr,ldc);
solve_RT(8,n,a_ptr+(kk-n)*8,sb+(kk-n)*n,c_ptr,ldc);
a_ptr += k * 8; c_ptr += 8;
}
for(;m_count>3;m_count-=4){
if(k-kk>0) GEMM_KERNEL_N(4,n,k-kk,-1.0,a_ptr+kk*4,sb+kk*n,c_ptr,ldc);
solve_RT(4,n,a_ptr+(kk-n)*4,sb+(kk-n)*n,c_ptr,ldc);
a_ptr += k * 4; c_ptr += 4;
}
for(;m_count>1;m_count-=2){
if(k-kk>0) GEMM_KERNEL_N(2,n,k-kk,-1.0,a_ptr+kk*2,sb+kk*n,c_ptr,ldc);
solve_RT(2,n,a_ptr+(kk-n)*2,sb+(kk-n)*n,c_ptr,ldc);
a_ptr += k * 2; c_ptr += 2;
}
if(m_count>0){
if(k-kk>0) GEMM_KERNEL_N(1,n,k-kk,-1.0,a_ptr+kk*1,sb+kk*n,c_ptr,ldc);
solve_RT(1,n,a_ptr+(kk-n)*1,sb+(kk-n)*n,c_ptr,ldc);
a_ptr += k * 1; c_ptr += 1;
}
}
int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *sa, FLOAT *sb, FLOAT *C, BLASLONG ldc, BLASLONG offset){
float *a_ptr = sa, *b_ptr = sb+n*k, *c_ptr = C+n*ldc, *c_tmp = C;
float one[8] = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0};
float zero[8] = {0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0};
uint64_t ldc_bytes = (uint64_t)ldc * sizeof(float), K = (uint64_t)k, M = (uint64_t)m, OFF = (uint64_t)(n-offset), k_cnt = 0;
BLASLONG n_count = n;
if(n&1){b_ptr-=k; c_ptr-=ldc; COMPUTE_EDGE_1_nchunk(m,1,a_ptr,b_ptr,c_ptr,ldc,k,OFF); OFF--; n_count--;}
if(n&2){b_ptr-=k*2; c_ptr-=ldc*2; COMPUTE_EDGE_1_nchunk(m,2,a_ptr,b_ptr,c_ptr,ldc,k,OFF); OFF-=2; n_count-=2;}
for(;n_count>11;n_count-=12) COMPUTE(12)
for(;n_count>7;n_count-=8) COMPUTE(8)
for(;n_count>3;n_count-=4) COMPUTE(4)
return 0;
}

View File

@ -0,0 +1,226 @@
/* r11 = m_counter, r12 = size_of_k_elements, r13 = kk, r14 = b_head, r15 = a_head */
/* register i/o: %0 = a_ptr, %1 = b_ptr, %2 = c_ptr, %3 = c_tmp, %4 = ldc, %5 = k_counter */
/* memory input: %6 = K, %7 = offset, %8 = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0}, %9 = {0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0}, %10 = M */
#define init_m8n4(c1,c2,c3,c4)\
"vpxor %%ymm"#c1",%%ymm"#c1",%%ymm"#c1"; vpxor %%ymm"#c2",%%ymm"#c2",%%ymm"#c2";"\
"vpxor %%ymm"#c3",%%ymm"#c3",%%ymm"#c3"; vpxor %%ymm"#c4",%%ymm"#c4",%%ymm"#c4";"
#define INIT_m8n4 init_m8n4(4,5,6,7)
#define INIT_m8n8 INIT_m8n4 init_m8n4(8,9,10,11)
#define INIT_m8n12 INIT_m8n8 init_m8n4(12,13,14,15)
#define init_m4n4(c1,c2,c3,c4)\
"vpxor %%xmm"#c1",%%xmm"#c1",%%xmm"#c1"; vpxor %%xmm"#c2",%%xmm"#c2",%%xmm"#c2";"\
"vpxor %%xmm"#c3",%%xmm"#c3",%%xmm"#c3"; vpxor %%xmm"#c4",%%xmm"#c4",%%xmm"#c4";"
#define INIT_m4n4 init_m4n4(4,5,6,7)
#define INIT_m4n8 INIT_m4n4 init_m4n4(8,9,10,11)
#define INIT_m4n12 INIT_m4n8 init_m4n4(12,13,14,15)
#define init_m2n4(c1,c2)\
"vpxor %%xmm"#c1",%%xmm"#c1",%%xmm"#c1"; vpxor %%xmm"#c2",%%xmm"#c2",%%xmm"#c2";"
#define INIT_m2n4 init_m2n4(4,5)
#define INIT_m2n8 INIT_m2n4 init_m2n4(6,7)
#define INIT_m2n12 INIT_m2n8 init_m2n4(8,9)
#define init_m1n4(c1) "vpxor %%xmm"#c1",%%xmm"#c1",%%xmm"#c1";"
#define INIT_m1n4 init_m1n4(4)
#define INIT_m1n8 INIT_m1n4 init_m1n4(5)
#define INIT_m1n12 INIT_m1n8 init_m1n4(6)
#define GEMM_KERNEL_k1m8n4 \
"vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2;"\
"vbroadcastsd (%1),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm4; vfnmadd231ps %%ymm3,%%ymm2,%%ymm5;"\
"vbroadcastsd 8(%1),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm6; vfnmadd231ps %%ymm3,%%ymm2,%%ymm7;"
#define GEMM_KERNEL_k1m8n8 GEMM_KERNEL_k1m8n4\
"vbroadcastsd (%1,%%r12,4),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm8; vfnmadd231ps %%ymm3,%%ymm2,%%ymm9;"\
"vbroadcastsd 8(%1,%%r12,4),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm10; vfnmadd231ps %%ymm3,%%ymm2,%%ymm11;"
#define GEMM_KERNEL_k1m8n12 GEMM_KERNEL_k1m8n8\
"vbroadcastsd (%1,%%r12,8),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm12; vfnmadd231ps %%ymm3,%%ymm2,%%ymm13;"\
"vbroadcastsd 8(%1,%%r12,8),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm14; vfnmadd231ps %%ymm3,%%ymm2,%%ymm15;"
#define GEMM_KERNEL_k1m4n4 \
"vmovsldup (%0),%%xmm1; vmovshdup (%0),%%xmm2;"\
"vmovddup (%1),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm4; vfnmadd231ps %%xmm3,%%xmm2,%%xmm5;"\
"vmovddup 8(%1),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm6; vfnmadd231ps %%xmm3,%%xmm2,%%xmm7;"
#define GEMM_KERNEL_k1m4n8 GEMM_KERNEL_k1m4n4\
"vmovddup (%1,%%r12,4),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm8; vfnmadd231ps %%xmm3,%%xmm2,%%xmm9;"\
"vmovddup 8(%1,%%r12,4),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm10; vfnmadd231ps %%xmm3,%%xmm2,%%xmm11;"
#define GEMM_KERNEL_k1m4n12 GEMM_KERNEL_k1m4n8\
"vmovddup (%1,%%r12,8),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm12; vfnmadd231ps %%xmm3,%%xmm2,%%xmm13;"\
"vmovddup 8(%1,%%r12,8),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm14; vfnmadd231ps %%xmm3,%%xmm2,%%xmm15;"
#define GEMM_KERNEL_k1m2n4 \
"vbroadcastss (%0),%%xmm1; vbroadcastss 4(%0),%%xmm2;"\
"vmovups (%1),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm4; vfnmadd231ps %%xmm3,%%xmm2,%%xmm5;"
#define GEMM_KERNEL_k1m2n8 GEMM_KERNEL_k1m2n4\
"vmovups (%1,%%r12,4),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm6; vfnmadd231ps %%xmm3,%%xmm2,%%xmm7;"
#define GEMM_KERNEL_k1m2n12 GEMM_KERNEL_k1m2n8\
"vmovups (%1,%%r12,8),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm8; vfnmadd231ps %%xmm3,%%xmm2,%%xmm9;"
#define GEMM_KERNEL_k1m1n4 "vbroadcastss (%0),%%xmm1; vfnmadd231ps (%1),%%xmm1,%%xmm4;"
#define GEMM_KERNEL_k1m1n8 GEMM_KERNEL_k1m1n4 "vfnmadd231ps (%1,%%r12,4),%%xmm1,%%xmm5;"
#define GEMM_KERNEL_k1m1n12 GEMM_KERNEL_k1m1n8 "vfnmadd231ps (%1,%%r12,8),%%xmm1,%%xmm6;"
#define GEMM_SUM_REORDER_8x4(c1,c2,c3,c4,prefpos)\
"vmovups (%3),%%ymm0; vmovups (%3,%4,1),%%ymm1; prefetcht1 "#prefpos"(%3); prefetcht1 "#prefpos"(%3,%4,1); leaq (%3,%4,2),%3;"\
"vunpcklps %%ymm1,%%ymm0,%%ymm2; vunpckhps %%ymm1,%%ymm0,%%ymm3; vunpcklpd %%ymm3,%%ymm2,%%ymm0; vunpckhpd %%ymm3,%%ymm2,%%ymm1;"\
"vaddps %%ymm0,%%ymm"#c1",%%ymm"#c1"; vaddps %%ymm1,%%ymm"#c2",%%ymm"#c2";"\
"vmovups (%3),%%ymm0; vmovups (%3,%4,1),%%ymm1; prefetcht1 "#prefpos"(%3); prefetcht1 "#prefpos"(%3,%4,1); leaq (%3,%4,2),%3;"\
"vunpcklps %%ymm1,%%ymm0,%%ymm2; vunpckhps %%ymm1,%%ymm0,%%ymm3; vunpcklpd %%ymm3,%%ymm2,%%ymm0; vunpckhpd %%ymm3,%%ymm2,%%ymm1;"\
"vaddps %%ymm0,%%ymm"#c3",%%ymm"#c3"; vaddps %%ymm1,%%ymm"#c4",%%ymm"#c4";"
#define GEMM_SUM_REORDER_4x4(c1,c2,c3,c4,co1,co2)\
"vmovups (%3),%%xmm0; vmovups (%3,%4,1),%%xmm1; leaq (%3,%4,2),%3;"\
"vunpcklps %%xmm1,%%xmm0,%%xmm2; vunpckhps %%xmm1,%%xmm0,%%xmm3;"\
"vunpcklpd %%xmm"#c2",%%xmm"#c1",%%xmm0; vunpckhpd %%xmm"#c2",%%xmm"#c1",%%xmm1;"\
"vaddps %%xmm0,%%xmm2,%%xmm"#c1"; vaddps %%xmm1,%%xmm3,%%xmm"#c2";"\
"vmovups (%3),%%xmm0; vmovups (%3,%4,1),%%xmm1; leaq (%3,%4,2),%3;"\
"vunpcklps %%xmm1,%%xmm0,%%xmm2; vunpckhps %%xmm1,%%xmm0,%%xmm3;"\
"vunpcklpd %%xmm"#c4",%%xmm"#c3",%%xmm0; vunpckhpd %%xmm"#c4",%%xmm"#c3",%%xmm1;"\
"vaddps %%xmm0,%%xmm2,%%xmm"#c3"; vaddps %%xmm1,%%xmm3,%%xmm"#c4";"\
"vperm2f128 $2,%%ymm"#c1",%%ymm"#c2",%%ymm"#co1"; vperm2f128 $2,%%ymm"#c3",%%ymm"#c4",%%ymm"#co2";"
#define GEMM_SUM_REORDER_2x4(c1,c2)\
"vmovsd (%3),%%xmm0; vmovhpd (%3,%4,1),%%xmm0,%%xmm0; leaq (%3,%4,2),%3; vpermilps $216,%%xmm0,%%xmm0;"\
"vmovsd (%3),%%xmm1; vmovhpd (%3,%4,1),%%xmm1,%%xmm1; leaq (%3,%4,2),%3; vpermilps $216,%%xmm1,%%xmm1;"\
"vunpcklpd %%xmm1,%%xmm0,%%xmm2; vaddps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\
"vunpckhpd %%xmm1,%%xmm0,%%xmm3; vaddps %%xmm3,%%xmm"#c2",%%xmm"#c2";"\
#define GEMM_SUM_REORDER_1x4(c1)\
"vmovss (%3),%%xmm1; vinsertps $16,(%3,%4,1),%%xmm1,%%xmm1; leaq (%3,%4,2),%3;"\
"vinsertps $32,(%3),%%xmm1,%%xmm1; vinsertps $48,(%3,%4,1),%%xmm1,%%xmm1; leaq (%3,%4,2),%3;"\
"vaddps %%xmm"#c1",%%xmm1,%%xmm"#c1";"
#define SOLVE_le_m4n2(b_off,c1,...)\
"vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vblendps $170,%8,%%ymm0,%%ymm2;"\
"vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1";"\
"vmovsldup %%ymm"#c1",%%ymm1;"
#define SOLVE_le_m8n2(b_off,c1,c2,...)\
"vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vblendps $170,%8,%%ymm0,%%ymm2;"\
"vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1"; vmulps %%ymm2,%%ymm"#c2",%%ymm"#c2";"\
"vmovsldup %%ymm"#c1",%%ymm1; vmovsldup %%ymm"#c2",%%ymm2;"
#define SOLVE_leri_m4n2(b_off,c1,...) SOLVE_le_m4n2(b_off,c1,__VA_ARGS__)\
"vblendps $85,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1";"
#define SOLVE_leri_m8n2(b_off,c1,c2,...) SOLVE_le_m8n2(b_off,c1,c2,__VA_ARGS__)\
"vblendps $85,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1"; vfnmadd231ps %%ymm0,%%ymm2,%%ymm"#c2";"
#define SOLVE_ri_m4n2(b_off,c1,...)\
"vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vblendps $85,%8,%%ymm0,%%ymm2;"\
"vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1";"\
"vmovshdup %%ymm"#c1",%%ymm1;"
#define SOLVE_ri_m8n2(b_off,c1,c2,...)\
"vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vblendps $85,%8,%%ymm0,%%ymm2;"\
"vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1"; vmulps %%ymm2,%%ymm"#c2",%%ymm"#c2";"\
"vmovshdup %%ymm"#c1",%%ymm1; vmovshdup %%ymm"#c2",%%ymm2;"
#define SOLVE_rile_m4n2(b_off,c1,...) SOLVE_ri_m4n2(b_off,c1,__VA_ARGS__)\
"vblendps $170,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1";"
#define SOLVE_rile_m8n2(b_off,c1,c2,...) SOLVE_ri_m8n2(b_off,c1,c2,__VA_ARGS__)\
"vblendps $170,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1"; vfnmadd231ps %%ymm0,%%ymm2,%%ymm"#c2";"
#define SOLVE_col1_rtol_m1n4(b_off,c1,...)\
"vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $14,%8,%%xmm0,%%xmm2;"\
"vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\
"vpermilps $0,%%xmm"#c1",%%xmm1;"
#define SOLVE_col1_rtol_m2n4(b_off,c1,c2,...)\
"vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $14,%8,%%xmm0,%%xmm2;"\
"vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1"; vmulps %%xmm2,%%xmm"#c2",%%xmm"#c2";"\
"vpermilps $0,%%xmm"#c1",%%xmm1; vpermilps $0,%%xmm"#c2",%%xmm2;"
#define SOLVE_col1_ltor_m1n4(b_off,c1,...) SOLVE_col1_rtol_m1n4(b_off,c1,__VA_ARGS__)\
"vblendps $1,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";"
#define SOLVE_col1_ltor_m2n4(b_off,c1,c2,...) SOLVE_col1_rtol_m2n4(b_off,c1,c2,__VA_ARGS__)\
"vblendps $1,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";"
#define SOLVE_col2_mul_m1n4(b_off,c1,...)\
"vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $13,%8,%%xmm0,%%xmm2;"\
"vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\
"vpermilps $85,%%xmm"#c1",%%xmm1;"
#define SOLVE_col2_mul_m2n4(b_off,c1,c2,...)\
"vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $13,%8,%%xmm0,%%xmm2;"\
"vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1"; vmulps %%xmm2,%%xmm"#c2",%%xmm"#c2";"\
"vpermilps $85,%%xmm"#c1",%%xmm1; vpermilps $85,%%xmm"#c2",%%xmm2;"
#define SOLVE_col2_rtol_m1n4(b_off,c1,...) SOLVE_col2_mul_m1n4(b_off,c1,__VA_ARGS__)\
"vblendps $14,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";"
#define SOLVE_col2_rtol_m2n4(b_off,c1,c2,...) SOLVE_col2_mul_m2n4(b_off,c1,c2,__VA_ARGS__)\
"vblendps $14,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";"
#define SOLVE_col2_ltor_m1n4(b_off,c1,...) SOLVE_col2_mul_m1n4(b_off,c1,__VA_ARGS__)\
"vblendps $3,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";"
#define SOLVE_col2_ltor_m2n4(b_off,c1,c2,...) SOLVE_col2_mul_m2n4(b_off,c1,c2,__VA_ARGS__)\
"vblendps $3,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";"
#define SOLVE_col3_mul_m1n4(b_off,c1,...)\
"vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $11,%8,%%xmm0,%%xmm2;"\
"vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\
"vpermilps $170,%%xmm"#c1",%%xmm1;"
#define SOLVE_col3_mul_m2n4(b_off,c1,c2,...)\
"vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $11,%8,%%xmm0,%%xmm2;"\
"vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1"; vmulps %%xmm2,%%xmm"#c2",%%xmm"#c2";"\
"vpermilps $170,%%xmm"#c1",%%xmm1; vpermilps $170,%%xmm"#c2",%%xmm2;"
#define SOLVE_col3_rtol_m1n4(b_off,c1,...) SOLVE_col3_mul_m1n4(b_off,c1,__VA_ARGS__)\
"vblendps $12,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";"
#define SOLVE_col3_rtol_m2n4(b_off,c1,c2,...) SOLVE_col3_mul_m2n4(b_off,c1,c2,__VA_ARGS__)\
"vblendps $12,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";"
#define SOLVE_col3_ltor_m1n4(b_off,c1,...) SOLVE_col3_mul_m1n4(b_off,c1,__VA_ARGS__)\
"vblendps $7,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";"
#define SOLVE_col3_ltor_m2n4(b_off,c1,c2,...) SOLVE_col3_mul_m2n4(b_off,c1,c2,__VA_ARGS__)\
"vblendps $7,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";"
#define SOLVE_col4_ltor_m1n4(b_off,c1,...)\
"vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $7,%8,%%xmm0,%%xmm2;"\
"vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\
"vpermilps $255,%%xmm"#c1",%%xmm1;"
#define SOLVE_col4_ltor_m2n4(b_off,c1,c2,...)\
"vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $7,%8,%%xmm0,%%xmm2;"\
"vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1"; vmulps %%xmm2,%%xmm"#c2",%%xmm"#c2";"\
"vpermilps $255,%%xmm"#c1",%%xmm1; vpermilps $255,%%xmm"#c2",%%xmm2;"
#define SOLVE_col4_rtol_m1n4(b_off,c1,...) SOLVE_col4_ltor_m1n4(b_off,c1,__VA_ARGS__)\
"vblendps $8,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";"
#define SOLVE_col4_rtol_m2n4(b_off,c1,c2,...) SOLVE_col4_ltor_m2n4(b_off,c1,c2,__VA_ARGS__)\
"vblendps $8,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";"
#define SUBTRACT_m4n2(b_off,c1,...) "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1";"
#define SUBTRACT_m8n2(b_off,c1,c2,...) SUBTRACT_m4n2(b_off,c1,__VA_ARGS__) "vfnmadd231ps %%ymm0,%%ymm2,%%ymm"#c2";"
#define SUBTRACT_m1n4(b_off,c1,...) "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";"
#define SUBTRACT_m2n4(b_off,c1,c2,...) SUBTRACT_m1n4(b_off,c1,__VA_ARGS__) "vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";"
#define SAVE_SOLUTION_m8n2(c1,c2,a_off)\
"vunpcklps %%ymm"#c2",%%ymm"#c1",%%ymm0; vunpckhps %%ymm"#c2",%%ymm"#c1",%%ymm1;"\
"vunpcklpd %%ymm1,%%ymm0,%%ymm"#c1"; vunpckhpd %%ymm1,%%ymm0,%%ymm"#c2";"\
"vmovups %%ymm"#c1","#a_off"(%0); vmovups %%ymm"#c2","#a_off"+32(%0);"\
"vmovups %%ymm"#c1",(%3); vmovups %%ymm"#c2",(%3,%4,1); leaq (%3,%4,2),%3;"
#define SAVE_SOLUTION_m4n2(c1,a_off)\
"vpermilps $216,%%ymm"#c1",%%ymm"#c1"; vpermpd $216,%%ymm"#c1",%%ymm"#c1";"\
"vmovups %%ymm"#c1","#a_off"(%0); vmovups %%xmm"#c1",(%3); vextractf128 $1,%%ymm"#c1",(%3,%4,1); leaq (%3,%4,2),%3;"
#define SAVE_SOLUTION_m2n4(c1,c2,a_off)\
"vunpcklps %%xmm"#c2",%%xmm"#c1",%%xmm0; vmovups %%xmm0,"#a_off"(%0); vmovsd %%xmm0,(%3); vmovhpd %%xmm0,(%3,%4,1); leaq (%3,%4,2),%3;"\
"vunpckhps %%xmm"#c2",%%xmm"#c1",%%xmm0; vmovups %%xmm0,"#a_off"+16(%0); vmovsd %%xmm0,(%3); vmovhpd %%xmm0,(%3,%4,1); leaq (%3,%4,2),%3;"
#define SAVE_SOLUTION_m1n4(c1,a_off)\
"vmovups %%xmm"#c1","#a_off"(%0); vmovss %%xmm"#c1",(%3); vextractps $1,%%xmm"#c1",(%3,%4,1); leaq (%3,%4,2),%3;"\
"vextractps $2,%%xmm"#c1",(%3); vextractps $3,%%xmm"#c1",(%3,%4,1); leaq (%3,%4,2),%3;"

View File

@ -12,27 +12,6 @@
#include <stdlib.h>
#ifdef __cplusplus
extern "C" {
#endif
/*----------------------------------------------------------------------------*/
#ifndef lapack_int
#define lapack_int int
#endif
#ifndef lapack_logical
#define lapack_logical lapack_int
#endif
/* f2c, hence clapack and MacOS Accelerate, returns double instead of float
* for sdot, slange, clange, etc. */
#if defined(LAPACK_F2C)
typedef double lapack_float_return;
#else
typedef float lapack_float_return;
#endif
/* Complex types are structures equivalent to the
* Fortran complex types COMPLEX(4) and COMPLEX(8).
*
@ -88,6 +67,29 @@ extern "C" {
#endif /* LAPACK_COMPLEX_CUSTOM */
#ifdef __cplusplus
extern "C" {
#endif
/*----------------------------------------------------------------------------*/
#ifndef lapack_int
#define lapack_int int
#endif
#ifndef lapack_logical
#define lapack_logical lapack_int
#endif
/* f2c, hence clapack and MacOS Accelerate, returns double instead of float
* for sdot, slange, clange, etc. */
#if defined(LAPACK_F2C)
typedef double lapack_float_return;
#else
typedef float lapack_float_return;
#endif
/* Callback logical functions of one, two, or three arguments are used
* to select eigenvalues to sort to the top left of the Schur form.
* The value is selected if function returns TRUE (non-zero). */

View File

@ -282,7 +282,8 @@
$ NPLUSONE
* ..
* .. External Subroutines ..
EXTERNAL SCOPY, SLAORHR_COL_GETRFNP, SSCAL, STRSM, XERBLA
EXTERNAL SCOPY, SLAORHR_COL_GETRFNP, SSCAL, STRSM,
$XERBLA
* ..
* .. Intrinsic Functions ..
INTRINSIC MAX, MIN
@ -436,4 +437,4 @@
*
* End of SORHR_COL
*
END
END

View File

@ -1,7 +1,12 @@
TOPDIR = ..
include ../Makefile.system
ifeq ($(NOFORTRAN),1)
all ::
else
all :: level1 level2 level3
endif
level1 : sblat1 dblat1 cblat1 zblat1
ifndef CROSS