commit
c0da205412
|
@ -635,6 +635,16 @@ endif
|
||||||
ifeq ($(ARCH), arm64)
|
ifeq ($(ARCH), arm64)
|
||||||
NO_BINARY_MODE = 1
|
NO_BINARY_MODE = 1
|
||||||
BINARY_DEFINED = 1
|
BINARY_DEFINED = 1
|
||||||
|
# When INTERFACE64 is set to something other than 0, append the Fortran
# compiler's 8-byte default-integer switch to FCOMMON_OPT.
ifdef INTERFACE64
  ifneq ($(INTERFACE64), 0)
    ifeq ($(F_COMPILER), FLANG)
      FCOMMON_OPT += -i8
    endif
    ifeq ($(F_COMPILER), GFORTRAN)
      FCOMMON_OPT += -fdefault-integer-8
    endif
  endif
endif
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -78,8 +78,13 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \
|
||||||
chemm.goto zhemm.goto \
|
chemm.goto zhemm.goto \
|
||||||
cherk.goto zherk.goto \
|
cherk.goto zherk.goto \
|
||||||
cher2k.goto zher2k.goto \
|
cher2k.goto zher2k.goto \
|
||||||
|
cher.goto zher.goto \
|
||||||
|
cher2.goto zher2.goto \
|
||||||
sgemv.goto dgemv.goto cgemv.goto zgemv.goto \
|
sgemv.goto dgemv.goto cgemv.goto zgemv.goto \
|
||||||
|
sspmv.goto dspmv.goto \
|
||||||
strmv.goto dtrmv.goto ctrmv.goto ztrmv.goto \
|
strmv.goto dtrmv.goto ctrmv.goto ztrmv.goto \
|
||||||
|
stpmv.goto dtpmv.goto ctpmv.goto ztpmv.goto \
|
||||||
|
stpsv.goto dtpsv.goto ctpsv.goto ztpsv.goto \
|
||||||
strsv.goto dtrsv.goto ctrsv.goto ztrsv.goto \
|
strsv.goto dtrsv.goto ctrsv.goto ztrsv.goto \
|
||||||
sgeev.goto dgeev.goto cgeev.goto zgeev.goto \
|
sgeev.goto dgeev.goto cgeev.goto zgeev.goto \
|
||||||
sgesv.goto dgesv.goto cgesv.goto zgesv.goto \
|
sgesv.goto dgesv.goto cgesv.goto zgesv.goto \
|
||||||
|
@ -115,8 +120,12 @@ acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \
|
||||||
chemm.acml zhemm.acml \
|
chemm.acml zhemm.acml \
|
||||||
cherk.acml zherk.acml \
|
cherk.acml zherk.acml \
|
||||||
cher2k.acml zher2k.acml \
|
cher2k.acml zher2k.acml \
|
||||||
|
cher.acml zher.acml \
|
||||||
|
cher2.acml zher2.acml \
|
||||||
sgemv.acml dgemv.acml cgemv.acml zgemv.acml \
|
sgemv.acml dgemv.acml cgemv.acml zgemv.acml \
|
||||||
strmv.acml dtrmv.acml ctrmv.acml ztrmv.acml \
|
strmv.acml dtrmv.acml ctrmv.acml ztrmv.acml \
|
||||||
|
stpmv.acml dtpmv.acml ctpmv.acml ztpmv.acml \
|
||||||
|
stpsv.acml dtpsv.acml ctpsv.acml ztpsv.acml \
|
||||||
strsv.acml dtrsv.acml ctrsv.acml ztrsv.acml \
|
strsv.acml dtrsv.acml ctrsv.acml ztrsv.acml \
|
||||||
sgeev.acml dgeev.acml cgeev.acml zgeev.acml \
|
sgeev.acml dgeev.acml cgeev.acml zgeev.acml \
|
||||||
sgesv.acml dgesv.acml cgesv.acml zgesv.acml \
|
sgesv.acml dgesv.acml cgesv.acml zgesv.acml \
|
||||||
|
@ -153,8 +162,13 @@ atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \
|
||||||
chemm.atlas zhemm.atlas \
|
chemm.atlas zhemm.atlas \
|
||||||
cherk.atlas zherk.atlas \
|
cherk.atlas zherk.atlas \
|
||||||
cher2k.atlas zher2k.atlas \
|
cher2k.atlas zher2k.atlas \
|
||||||
|
cher.atlas zher.atlas \
|
||||||
|
cher2.atlas zher2.atlas \
|
||||||
sgemv.atlas dgemv.atlas cgemv.atlas zgemv.atlas \
|
sgemv.atlas dgemv.atlas cgemv.atlas zgemv.atlas \
|
||||||
|
sspmv.atlas dspmv.atlas \
|
||||||
strmv.atlas dtrmv.atlas ctrmv.atlas ztrmv.atlas \
|
strmv.atlas dtrmv.atlas ctrmv.atlas ztrmv.atlas \
|
||||||
|
stpmv.atlas dtpmv.atlas ctpmv.atlas ztpmv.atlas \
|
||||||
|
stpsv.atlas dtpsv.atlas ctpsv.atlas ztpsv.atlas \
|
||||||
strsv.atlas dtrsv.atlas ctrsv.atlas ztrsv.atlas \
|
strsv.atlas dtrsv.atlas ctrsv.atlas ztrsv.atlas \
|
||||||
sgeev.atlas dgeev.atlas cgeev.atlas zgeev.atlas \
|
sgeev.atlas dgeev.atlas cgeev.atlas zgeev.atlas \
|
||||||
sgesv.atlas dgesv.atlas cgesv.atlas zgesv.atlas \
|
sgesv.atlas dgesv.atlas cgesv.atlas zgesv.atlas \
|
||||||
|
@ -190,8 +204,12 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \
|
||||||
chemm.mkl zhemm.mkl \
|
chemm.mkl zhemm.mkl \
|
||||||
cherk.mkl zherk.mkl \
|
cherk.mkl zherk.mkl \
|
||||||
cher2k.mkl zher2k.mkl \
|
cher2k.mkl zher2k.mkl \
|
||||||
|
cher.mkl zher.mkl \
|
||||||
|
cher2.mkl zher2.mkl \
|
||||||
sgemv.mkl dgemv.mkl cgemv.mkl zgemv.mkl \
|
sgemv.mkl dgemv.mkl cgemv.mkl zgemv.mkl \
|
||||||
strmv.mkl dtrmv.mkl ctrmv.mkl ztrmv.mkl \
|
strmv.mkl dtrmv.mkl ctrmv.mkl ztrmv.mkl \
|
||||||
|
stpmv.mkl dtpmv.mkl ctpmv.mkl ztpmv.mkl \
|
||||||
|
stpsv.mkl dtpsv.mkl ctpsv.mkl ztpsv.mkl \
|
||||||
strsv.mkl dtrsv.mkl ctrsv.mkl ztrsv.mkl \
|
strsv.mkl dtrsv.mkl ctrsv.mkl ztrsv.mkl \
|
||||||
sgeev.mkl dgeev.mkl cgeev.mkl zgeev.mkl \
|
sgeev.mkl dgeev.mkl cgeev.mkl zgeev.mkl \
|
||||||
sgesv.mkl dgesv.mkl cgesv.mkl zgesv.mkl \
|
sgesv.mkl dgesv.mkl cgesv.mkl zgesv.mkl \
|
||||||
|
@ -227,8 +245,13 @@ goto :: sgemm.goto dgemm.goto cgemm.goto zgemm.goto \
|
||||||
chemm.goto zhemm.goto \
|
chemm.goto zhemm.goto \
|
||||||
cherk.goto zherk.goto \
|
cherk.goto zherk.goto \
|
||||||
cher2k.goto zher2k.goto \
|
cher2k.goto zher2k.goto \
|
||||||
|
cher.goto zher.goto \
|
||||||
|
cher2.goto zher2.goto \
|
||||||
sgemv.goto dgemv.goto cgemv.goto zgemv.goto \
|
sgemv.goto dgemv.goto cgemv.goto zgemv.goto \
|
||||||
|
sspmv.goto dspmv.goto \
|
||||||
strmv.goto dtrmv.goto ctrmv.goto ztrmv.goto \
|
strmv.goto dtrmv.goto ctrmv.goto ztrmv.goto \
|
||||||
|
stpmv.goto dtpmv.goto ctpmv.goto ztpmv.goto \
|
||||||
|
stpsv.goto dtpsv.goto ctpsv.goto ztpsv.goto \
|
||||||
strsv.goto dtrsv.goto ctrsv.goto ztrsv.goto \
|
strsv.goto dtrsv.goto ctrsv.goto ztrsv.goto \
|
||||||
ssymm.goto dsymm.goto csymm.goto zsymm.goto \
|
ssymm.goto dsymm.goto csymm.goto zsymm.goto \
|
||||||
smallscaling \
|
smallscaling \
|
||||||
|
@ -270,8 +293,12 @@ acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \
|
||||||
chemm.acml zhemm.acml \
|
chemm.acml zhemm.acml \
|
||||||
cherk.acml zherk.acml \
|
cherk.acml zherk.acml \
|
||||||
cher2k.acml zher2k.acml \
|
cher2k.acml zher2k.acml \
|
||||||
|
cher.acml zher.acml \
|
||||||
|
cher2.acml zher2.acml \
|
||||||
sgemv.acml dgemv.acml cgemv.acml zgemv.acml \
|
sgemv.acml dgemv.acml cgemv.acml zgemv.acml \
|
||||||
strmv.acml dtrmv.acml ctrmv.acml ztrmv.acml \
|
strmv.acml dtrmv.acml ctrmv.acml ztrmv.acml \
|
||||||
|
stpmv.acml dtpmv.acml ctpmv.acml ztpmv.acml \
|
||||||
|
stpsv.acml dtpsv.acml ctpsv.acml ztpsv.acml \
|
||||||
strsv.acml dtrsv.acml ctrsv.acml ztrsv.acml \
|
strsv.acml dtrsv.acml ctrsv.acml ztrsv.acml \
|
||||||
sgeev.acml dgeev.acml cgeev.acml zgeev.acml \
|
sgeev.acml dgeev.acml cgeev.acml zgeev.acml \
|
||||||
sgesv.acml dgesv.acml cgesv.acml zgesv.acml \
|
sgesv.acml dgesv.acml cgesv.acml zgesv.acml \
|
||||||
|
@ -308,8 +335,13 @@ atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \
|
||||||
chemm.atlas zhemm.atlas \
|
chemm.atlas zhemm.atlas \
|
||||||
cherk.atlas zherk.atlas \
|
cherk.atlas zherk.atlas \
|
||||||
cher2k.atlas zher2k.atlas \
|
cher2k.atlas zher2k.atlas \
|
||||||
|
cher.atlas zher.atlas \
|
||||||
|
cher2.atlas zher2.atlas \
|
||||||
sgemv.atlas dgemv.atlas cgemv.atlas zgemv.atlas \
|
sgemv.atlas dgemv.atlas cgemv.atlas zgemv.atlas \
|
||||||
|
sspmv.atlas dspmv.atlas \
|
||||||
strmv.atlas dtrmv.atlas ctrmv.atlas ztrmv.atlas \
|
strmv.atlas dtrmv.atlas ctrmv.atlas ztrmv.atlas \
|
||||||
|
stpmv.atlas dtpmv.atlas ctpmv.atlas ztpmv.atlas \
|
||||||
|
stpsv.atlas dtpsv.atlas ctpsv.atlas ztpsv.atlas \
|
||||||
strsv.atlas dtrsv.atlas ctrsv.atlas ztrsv.atlas \
|
strsv.atlas dtrsv.atlas ctrsv.atlas ztrsv.atlas \
|
||||||
sgeev.atlas dgeev.atlas cgeev.atlas zgeev.atlas \
|
sgeev.atlas dgeev.atlas cgeev.atlas zgeev.atlas \
|
||||||
sgesv.atlas dgesv.atlas cgesv.atlas zgesv.atlas \
|
sgesv.atlas dgesv.atlas cgesv.atlas zgesv.atlas \
|
||||||
|
@ -347,8 +379,12 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \
|
||||||
chemm.mkl zhemm.mkl \
|
chemm.mkl zhemm.mkl \
|
||||||
cherk.mkl zherk.mkl \
|
cherk.mkl zherk.mkl \
|
||||||
cher2k.mkl zher2k.mkl \
|
cher2k.mkl zher2k.mkl \
|
||||||
|
cher.mkl zher.mkl \
|
||||||
|
cher2.mkl zher2.mkl \
|
||||||
sgemv.mkl dgemv.mkl cgemv.mkl zgemv.mkl \
|
sgemv.mkl dgemv.mkl cgemv.mkl zgemv.mkl \
|
||||||
strmv.mkl dtrmv.mkl ctrmv.mkl ztrmv.mkl \
|
strmv.mkl dtrmv.mkl ctrmv.mkl ztrmv.mkl \
|
||||||
|
stpmv.mkl dtpmv.mkl ctpmv.mkl ztpmv.mkl \
|
||||||
|
stpsv.mkl dtpsv.mkl ctpsv.mkl ztpsv.mkl \
|
||||||
strsv.mkl dtrsv.mkl ctrsv.mkl ztrsv.mkl \
|
strsv.mkl dtrsv.mkl ctrsv.mkl ztrsv.mkl \
|
||||||
sgeev.mkl dgeev.mkl cgeev.mkl zgeev.mkl \
|
sgeev.mkl dgeev.mkl cgeev.mkl zgeev.mkl \
|
||||||
sgesv.mkl dgesv.mkl cgesv.mkl zgesv.mkl \
|
sgesv.mkl dgesv.mkl cgesv.mkl zgesv.mkl \
|
||||||
|
@ -395,8 +431,12 @@ veclib :: slinpack.veclib dlinpack.veclib clinpack.veclib zlinpack.veclib \
|
||||||
chemm.veclib zhemm.veclib \
|
chemm.veclib zhemm.veclib \
|
||||||
cherk.veclib zherk.veclib \
|
cherk.veclib zherk.veclib \
|
||||||
cher2k.veclib zher2k.veclib \
|
cher2k.veclib zher2k.veclib \
|
||||||
|
cher.veclib zher.veclib \
|
||||||
|
cher2.veclib zher2.veclib \
|
||||||
sgemv.veclib dgemv.veclib cgemv.veclib zgemv.veclib \
|
sgemv.veclib dgemv.veclib cgemv.veclib zgemv.veclib \
|
||||||
strmv.veclib dtrmv.veclib ctrmv.veclib ztrmv.veclib \
|
strmv.veclib dtrmv.veclib ctrmv.veclib ztrmv.veclib \
|
||||||
|
stpmv.veclib dtpmv.veclib ctpmv.veclib ztpmv.veclib \
|
||||||
|
stpsv.veclib dtpsv.veclib ctpsv.veclib ztpsv.veclib \
|
||||||
strsv.veclib dtrsv.veclib ctrsv.veclib ztrsv.veclib \
|
strsv.veclib dtrsv.veclib ctrsv.veclib ztrsv.veclib \
|
||||||
sgeev.veclib dgeev.veclib cgeev.veclib zgeev.veclib \
|
sgeev.veclib dgeev.veclib cgeev.veclib zgeev.veclib \
|
||||||
sgesv.veclib dgesv.veclib cgesv.veclib zgesv.veclib \
|
sgesv.veclib dgesv.veclib cgesv.veclib zgesv.veclib \
|
||||||
|
@ -1231,6 +1271,74 @@ zher2k.mkl : zher2k.$(SUFFIX)
|
||||||
zher2k.veclib : zher2k.$(SUFFIX)
|
zher2k.veclib : zher2k.$(SUFFIX)
|
||||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||||
|
|
||||||
|
##################################### Cher ####################################################

# cher benchmark executables, one per BLAS backend. Only the .goto build lists
# ../$(LIBNAME) as a prerequisite; the other recipes start with '-' so that a
# failed link against an absent vendor library does not stop make.
cher.goto : cher.$(SUFFIX) ../$(LIBNAME)
	$(CC) $(CFLAGS) -o $@ $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

cher.acml : cher.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $@ $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

cher.atlas : cher.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $@ $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

cher.mkl : cher.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $@ $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

cher.veclib : cher.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $@ $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||||
|
|
||||||
|
##################################### Zher ####################################################

# zher benchmark executables, one per BLAS backend. Only the .goto build lists
# ../$(LIBNAME) as a prerequisite; the other recipes start with '-' so that a
# failed link against an absent vendor library does not stop make.
zher.goto : zher.$(SUFFIX) ../$(LIBNAME)
	$(CC) $(CFLAGS) -o $@ $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

zher.acml : zher.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $@ $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

zher.atlas : zher.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $@ $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

zher.mkl : zher.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $@ $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

zher.veclib : zher.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $@ $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||||
|
|
||||||
|
##################################### Cher2 ####################################################

# cher2 benchmark executables, one per BLAS backend. Only the .goto build lists
# ../$(LIBNAME) as a prerequisite; the other recipes start with '-' so that a
# failed link against an absent vendor library does not stop make.
cher2.goto : cher2.$(SUFFIX) ../$(LIBNAME)
	$(CC) $(CFLAGS) -o $@ $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

cher2.acml : cher2.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $@ $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

cher2.atlas : cher2.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $@ $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

cher2.mkl : cher2.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $@ $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

cher2.veclib : cher2.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $@ $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||||
|
|
||||||
|
##################################### Zher2 ####################################################

# zher2 benchmark executables, one per BLAS backend. Only the .goto build lists
# ../$(LIBNAME) as a prerequisite; the other recipes start with '-' so that a
# failed link against an absent vendor library does not stop make.
zher2.goto : zher2.$(SUFFIX) ../$(LIBNAME)
	$(CC) $(CFLAGS) -o $@ $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

zher2.acml : zher2.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $@ $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

zher2.atlas : zher2.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $@ $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

zher2.mkl : zher2.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $@ $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

zher2.veclib : zher2.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $@ $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||||
|
|
||||||
##################################### Sgemv ####################################################
|
##################################### Sgemv ####################################################
|
||||||
sgemv.goto : sgemv.$(SUFFIX) ../$(LIBNAME)
|
sgemv.goto : sgemv.$(SUFFIX) ../$(LIBNAME)
|
||||||
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
|
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
|
||||||
|
@ -1297,6 +1405,20 @@ zgemv.mkl : zgemv.$(SUFFIX)
|
||||||
zgemv.veclib : zgemv.$(SUFFIX)
|
zgemv.veclib : zgemv.$(SUFFIX)
|
||||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||||
|
|
||||||
|
##################################### Sspmv ####################################################

# sspmv benchmark executables. Rules exist for the .goto and .atlas variants
# only; the .atlas recipe starts with '-' so a failed link does not stop make.
sspmv.goto : sspmv.$(SUFFIX) ../$(LIBNAME)
	$(CC) $(CFLAGS) -o $@ $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

sspmv.atlas : sspmv.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $@ $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||||
|
|
||||||
|
##################################### Dspmv ####################################################

# dspmv benchmark executables. Rules exist for the .goto and .atlas variants
# only; the .atlas recipe starts with '-' so a failed link does not stop make.
dspmv.goto : dspmv.$(SUFFIX) ../$(LIBNAME)
	$(CC) $(CFLAGS) -o $@ $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

dspmv.atlas : dspmv.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $@ $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||||
|
|
||||||
##################################### Strmv ####################################################
|
##################################### Strmv ####################################################
|
||||||
strmv.goto : strmv.$(SUFFIX) ../$(LIBNAME)
|
strmv.goto : strmv.$(SUFFIX) ../$(LIBNAME)
|
||||||
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
|
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
|
||||||
|
@ -1363,6 +1485,139 @@ ztrmv.mkl : ztrmv.$(SUFFIX)
|
||||||
ztrmv.veclib : ztrmv.$(SUFFIX)
|
ztrmv.veclib : ztrmv.$(SUFFIX)
|
||||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||||
|
|
||||||
|
|
||||||
|
##################################### Stpmv ####################################################

# stpmv benchmark executables, one per BLAS backend. Only the .goto build lists
# ../$(LIBNAME) as a prerequisite; the other recipes start with '-' so that a
# failed link against an absent vendor library does not stop make.
stpmv.goto : stpmv.$(SUFFIX) ../$(LIBNAME)
	$(CC) $(CFLAGS) -o $@ $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

stpmv.acml : stpmv.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $@ $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

stpmv.atlas : stpmv.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $@ $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

stpmv.mkl : stpmv.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $@ $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

stpmv.veclib : stpmv.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $@ $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||||
|
|
||||||
|
##################################### Dtpmv ####################################################

# dtpmv benchmark executables, one per BLAS backend. Only the .goto build lists
# ../$(LIBNAME) as a prerequisite; the other recipes start with '-' so that a
# failed link against an absent vendor library does not stop make.
dtpmv.goto : dtpmv.$(SUFFIX) ../$(LIBNAME)
	$(CC) $(CFLAGS) -o $@ $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

dtpmv.acml : dtpmv.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $@ $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

dtpmv.atlas : dtpmv.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $@ $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

dtpmv.mkl : dtpmv.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $@ $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

dtpmv.veclib : dtpmv.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $@ $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||||
|
|
||||||
|
##################################### Ctpmv ####################################################

# ctpmv benchmark executables, one per BLAS backend. Only the .goto build lists
# ../$(LIBNAME) as a prerequisite; the other recipes start with '-' so that a
# failed link against an absent vendor library does not stop make.
ctpmv.goto : ctpmv.$(SUFFIX) ../$(LIBNAME)
	$(CC) $(CFLAGS) -o $@ $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

ctpmv.acml : ctpmv.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $@ $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

ctpmv.atlas : ctpmv.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $@ $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

ctpmv.mkl : ctpmv.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $@ $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

ctpmv.veclib : ctpmv.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $@ $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||||
|
|
||||||
|
##################################### Ztpmv ####################################################

# ztpmv benchmark executables, one per BLAS backend. Only the .goto build lists
# ../$(LIBNAME) as a prerequisite; the other recipes start with '-' so that a
# failed link against an absent vendor library does not stop make.
ztpmv.goto : ztpmv.$(SUFFIX) ../$(LIBNAME)
	$(CC) $(CFLAGS) -o $@ $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

ztpmv.acml : ztpmv.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $@ $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

ztpmv.atlas : ztpmv.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $@ $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

ztpmv.mkl : ztpmv.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $@ $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

ztpmv.veclib : ztpmv.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $@ $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||||
|
|
||||||
|
##################################### Stpsv ####################################################

# stpsv benchmark executables, one per BLAS backend. Only the .goto build lists
# ../$(LIBNAME) as a prerequisite; the other recipes start with '-' so that a
# failed link against an absent vendor library does not stop make.
stpsv.goto : stpsv.$(SUFFIX) ../$(LIBNAME)
	$(CC) $(CFLAGS) -o $@ $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

stpsv.acml : stpsv.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $@ $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

stpsv.atlas : stpsv.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $@ $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

stpsv.mkl : stpsv.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $@ $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

stpsv.veclib : stpsv.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $@ $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||||
|
|
||||||
|
##################################### Dtpsv ####################################################

# dtpsv benchmark executables, one per BLAS backend. Only the .goto build lists
# ../$(LIBNAME) as a prerequisite; the other recipes start with '-' so that a
# failed link against an absent vendor library does not stop make.
dtpsv.goto : dtpsv.$(SUFFIX) ../$(LIBNAME)
	$(CC) $(CFLAGS) -o $@ $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

dtpsv.acml : dtpsv.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $@ $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

dtpsv.atlas : dtpsv.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $@ $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

dtpsv.mkl : dtpsv.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $@ $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

dtpsv.veclib : dtpsv.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $@ $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||||
|
|
||||||
|
##################################### Ctpsv ####################################################

# ctpsv benchmark executables, one per BLAS backend. Only the .goto build lists
# ../$(LIBNAME) as a prerequisite; the other recipes start with '-' so that a
# failed link against an absent vendor library does not stop make.
ctpsv.goto : ctpsv.$(SUFFIX) ../$(LIBNAME)
	$(CC) $(CFLAGS) -o $@ $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

ctpsv.acml : ctpsv.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $@ $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

ctpsv.atlas : ctpsv.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $@ $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

ctpsv.mkl : ctpsv.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $@ $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

ctpsv.veclib : ctpsv.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $@ $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||||
|
|
||||||
|
##################################### Ztpsv ####################################################

# ztpsv benchmark executables, one per BLAS backend. Only the .goto build lists
# ../$(LIBNAME) as a prerequisite; the other recipes start with '-' so that a
# failed link against an absent vendor library does not stop make.
ztpsv.goto : ztpsv.$(SUFFIX) ../$(LIBNAME)
	$(CC) $(CFLAGS) -o $@ $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm

ztpsv.acml : ztpsv.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $@ $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

ztpsv.atlas : ztpsv.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $@ $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

ztpsv.mkl : ztpsv.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $@ $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

ztpsv.veclib : ztpsv.$(SUFFIX)
	-$(CC) $(CFLAGS) -o $@ $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||||
|
|
||||||
##################################### Strsv ####################################################
|
##################################### Strsv ####################################################
|
||||||
strsv.goto : strsv.$(SUFFIX) ../$(LIBNAME)
|
strsv.goto : strsv.$(SUFFIX) ../$(LIBNAME)
|
||||||
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
|
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
|
||||||
|
@ -2775,6 +3030,18 @@ cher2k.$(SUFFIX) : her2k.c
|
||||||
zher2k.$(SUFFIX) : her2k.c
|
zher2k.$(SUFFIX) : her2k.c
|
||||||
$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
|
$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
|
||||||
|
|
||||||
|
# Objects for the her benchmark: her.c is compiled twice with COMPLEX defined,
# once with DOUBLE undefined (cher) and once with DOUBLE defined (zher).
cher.$(SUFFIX) : her.c
	$(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $@ $^

zher.$(SUFFIX) : her.c
	$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $@ $^
|
||||||
|
|
||||||
|
# Objects for the her2 benchmark: her2.c is compiled twice with COMPLEX defined,
# once with DOUBLE undefined (cher2) and once with DOUBLE defined (zher2).
cher2.$(SUFFIX) : her2.c
	$(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $@ $^

zher2.$(SUFFIX) : her2.c
	$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $@ $^
|
||||||
|
|
||||||
sgemv.$(SUFFIX) : gemv.c
|
sgemv.$(SUFFIX) : gemv.c
|
||||||
$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
|
$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
|
||||||
|
|
||||||
|
@ -2787,6 +3054,12 @@ cgemv.$(SUFFIX) : gemv.c
|
||||||
zgemv.$(SUFFIX) : gemv.c
|
zgemv.$(SUFFIX) : gemv.c
|
||||||
$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
|
$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
|
||||||
|
|
||||||
|
# Objects for the spmv benchmark: spmv.c is compiled twice with COMPLEX
# undefined, once with DOUBLE undefined (sspmv) and once with it defined (dspmv).
sspmv.$(SUFFIX) : spmv.c
	$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $@ $^

dspmv.$(SUFFIX) : spmv.c
	$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $@ $^
|
||||||
|
|
||||||
strmv.$(SUFFIX) : trmv.c
|
strmv.$(SUFFIX) : trmv.c
|
||||||
$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
|
$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
|
||||||
|
|
||||||
|
@ -2799,6 +3072,30 @@ ctrmv.$(SUFFIX) : trmv.c
|
||||||
ztrmv.$(SUFFIX) : trmv.c
|
ztrmv.$(SUFFIX) : trmv.c
|
||||||
$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
|
$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
|
||||||
|
|
||||||
|
# Objects for the tpmv benchmark: tpmv.c is compiled once for every combination
# of the COMPLEX and DOUBLE macros (s, d, c and z variants).
stpmv.$(SUFFIX) : tpmv.c
	$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $@ $^

dtpmv.$(SUFFIX) : tpmv.c
	$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $@ $^

ctpmv.$(SUFFIX) : tpmv.c
	$(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $@ $^

ztpmv.$(SUFFIX) : tpmv.c
	$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $@ $^
|
||||||
|
|
||||||
|
# Objects for the tpsv benchmark: tpsv.c is compiled once for every combination
# of the COMPLEX and DOUBLE macros (s, d, c and z variants).
stpsv.$(SUFFIX) : tpsv.c
	$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $@ $^

dtpsv.$(SUFFIX) : tpsv.c
	$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $@ $^

ctpsv.$(SUFFIX) : tpsv.c
	$(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $@ $^

ztpsv.$(SUFFIX) : tpsv.c
	$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $@ $^
|
||||||
|
|
||||||
strsv.$(SUFFIX) : trsv.c
|
strsv.$(SUFFIX) : trsv.c
|
||||||
$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
|
$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
|
||||||
|
|
||||||
|
@ -3124,4 +3421,3 @@ clean ::
|
||||||
@rm -f *.goto *.mkl *.acml *.atlas *.veclib *.essl smallscaling
|
@rm -f *.goto *.mkl *.acml *.atlas *.veclib *.essl smallscaling
|
||||||
|
|
||||||
include $(TOPDIR)/Makefile.tail
|
include $(TOPDIR)/Makefile.tail
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,186 @@
|
||||||
|
/***************************************************************************
|
||||||
|
Copyright (c) 2020, The OpenBLAS Project
|
||||||
|
All rights reserved.
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
1. Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in
|
||||||
|
the documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
3. Neither the name of the OpenBLAS project nor the names of
|
||||||
|
its contributors may be used to endorse or promote products
|
||||||
|
derived from this software without specific prior written permission.
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||||
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#ifdef __CYGWIN32__
|
||||||
|
#include <sys/time.h>
|
||||||
|
#endif
|
||||||
|
#include "common.h"
|
||||||
|
|
||||||
|
|
||||||
|
/* HER names the routine under test: zher when DOUBLE is defined, cher otherwise. */
#undef HER

#ifndef DOUBLE
#define HER BLASFUNC(cher)
#else
#define HER BLASFUNC(zher)
#endif
|
||||||
|
|
||||||
|
|
||||||
|
#if defined(__WIN32__) || defined(__WIN64__)

#ifndef DELTA_EPOCH_IN_MICROSECS
/* Offset subtracted from the FILETIME value (after scaling to microseconds)
   to rebase it onto the Unix epoch. */
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif

/*
 * Replacement for gettimeofday() on native Windows builds.
 *
 * tv  receives the current time as seconds and microseconds since the Unix
 *     epoch; may be NULL, in which case nothing is written.
 * tz  ignored.
 *
 * Always returns 0.
 */
int gettimeofday(struct timeval *tv, void *tz){

  FILETIME ft;
  unsigned __int64 tmpres = 0;

  (void)tz;  /* time-zone information is not supported */

  if (NULL != tv)
    {
      GetSystemTimeAsFileTime(&ft);

      /* Assemble the 64-bit tick count from its two 32-bit halves. */
      tmpres |= ft.dwHighDateTime;
      tmpres <<= 32;
      tmpres |= ft.dwLowDateTime;

      /*converting file time to unix epoch*/
      tmpres /= 10;  /*convert into microseconds*/
      tmpres -= DELTA_EPOCH_IN_MICROSECS;
      tv->tv_sec = (long)(tmpres / 1000000UL);
      tv->tv_usec = (long)(tmpres % 1000000UL);
    }

  return 0;
}

#endif
|
||||||
|
|
||||||
|
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0

#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif

/*
 * Allocate memory from a private SysV shared-memory segment requested with
 * SHM_HUGETLB, sized from `size` rounded with HUGE_PAGESIZE. On any failure a
 * message is printed and the process exits with status 1. The segment is
 * marked for removal (IPC_RMID) right after it has been attached.
 *
 * NOTE: the trailing "&& 0" in the guard above compiles this whole section
 * out, including the malloc redirection below.
 */
static void *huge_malloc(BLASLONG size){

  void *base;
  int segment = shmget(IPC_PRIVATE,
                       (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
                       SHM_HUGETLB | IPC_CREAT |0600);

  if (segment < 0) {
    printf( "Memory allocation failed(shmget).\n");
    exit(1);
  }

  base = shmat(segment, NULL, SHM_RND);

  if ((BLASLONG)base == -1) {
    printf( "Memory allocation failed(shmat).\n");
    exit(1);
  }

  shmctl(segment, IPC_RMID, 0);

  return base;
}

/* Route every later malloc() call in this file through huge_malloc(). */
#define malloc huge_malloc

#endif
|
||||||
|
|
||||||
|
int main(int argc, char *argv[]){
|
||||||
|
|
||||||
|
FLOAT *a, *x;
|
||||||
|
FLOAT alpha[] = {1.0, 1.0};
|
||||||
|
blasint incx = 1;
|
||||||
|
char *p;
|
||||||
|
|
||||||
|
char uplo='U';
|
||||||
|
char trans='N';
|
||||||
|
|
||||||
|
if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p;
|
||||||
|
if ((p = getenv("OPENBLAS_TRANS"))) trans=*p;
|
||||||
|
|
||||||
|
blasint m, i, j;
|
||||||
|
|
||||||
|
int from = 1;
|
||||||
|
int to = 200;
|
||||||
|
int step = 1;
|
||||||
|
|
||||||
|
struct timeval start, stop;
|
||||||
|
double time1;
|
||||||
|
|
||||||
|
argc--;argv++;
|
||||||
|
|
||||||
|
if (argc > 0) { from = atol(*argv); argc--; argv++;}
|
||||||
|
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
|
||||||
|
if (argc > 0) { step = atol(*argv); argc--; argv++;}
|
||||||
|
|
||||||
|
fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = %c Trans = %c\n", from, to, step,uplo,trans);
|
||||||
|
|
||||||
|
|
||||||
|
if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
|
||||||
|
fprintf(stderr,"Out of Memory!!\n");exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * COMPSIZE)) == NULL){
|
||||||
|
fprintf(stderr,"Out of Memory!!\n");exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
#ifdef linux
|
||||||
|
srandom(getpid());
|
||||||
|
#endif
|
||||||
|
|
||||||
|
fprintf(stderr, " SIZE Flops\n");
|
||||||
|
|
||||||
|
for(m = from; m <= to; m += step)
|
||||||
|
{
|
||||||
|
fprintf(stderr, " %6d : ", (int)m);
|
||||||
|
|
||||||
|
for(j = 0; j < m; j++){
|
||||||
|
for(i = 0; i < m * COMPSIZE; i++){
|
||||||
|
a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||||
|
}
|
||||||
|
x[ (long)j * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||||
|
}
|
||||||
|
|
||||||
|
gettimeofday( &start, (struct timezone *)0);
|
||||||
|
|
||||||
|
HER (&uplo, &m, alpha, x, &incx, a, &m );
|
||||||
|
|
||||||
|
gettimeofday( &stop, (struct timezone *)0);
|
||||||
|
|
||||||
|
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
|
||||||
|
|
||||||
|
gettimeofday( &start, (struct timezone *)0);
|
||||||
|
|
||||||
|
fprintf(stderr,
|
||||||
|
" %10.2f MFlops\n",
|
||||||
|
COMPSIZE * COMPSIZE * 1. * (double)m * (double)m / time1 * 1.e-6);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
|
@ -0,0 +1,190 @@
|
||||||
|
/***************************************************************************
|
||||||
|
Copyright (c) 2020, The OpenBLAS Project
|
||||||
|
All rights reserved.
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
1. Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in
|
||||||
|
the documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
3. Neither the name of the OpenBLAS project nor the names of
|
||||||
|
its contributors may be used to endorse or promote products
|
||||||
|
derived from this software without specific prior written permission.
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||||
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#ifdef __CYGWIN32__
|
||||||
|
#include <sys/time.h>
|
||||||
|
#endif
|
||||||
|
#include "common.h"
|
||||||
|
|
||||||
|
|
||||||
|
/* Select the routine benchmarked by this file: HER2 expands to
   BLASFUNC(zher2) when DOUBLE is defined and to BLASFUNC(cher2) otherwise. */
#undef HER2

#if defined(DOUBLE)
#  define HER2 BLASFUNC(zher2)
#else
#  define HER2 BLASFUNC(cher2)
#endif
|
||||||
|
|
||||||
|
|
||||||
|
#if defined(__WIN32__) || defined(__WIN64__)
|
||||||
|
|
||||||
|
#ifndef DELTA_EPOCH_IN_MICROSECS
|
||||||
|
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
|
||||||
|
#endif
|
||||||
|
|
||||||
|
int gettimeofday(struct timeval *tv, void *tz){
|
||||||
|
|
||||||
|
FILETIME ft;
|
||||||
|
unsigned __int64 tmpres = 0;
|
||||||
|
static int tzflag;
|
||||||
|
|
||||||
|
if (NULL != tv)
|
||||||
|
{
|
||||||
|
GetSystemTimeAsFileTime(&ft);
|
||||||
|
|
||||||
|
tmpres |= ft.dwHighDateTime;
|
||||||
|
tmpres <<= 32;
|
||||||
|
tmpres |= ft.dwLowDateTime;
|
||||||
|
|
||||||
|
/*converting file time to unix epoch*/
|
||||||
|
tmpres /= 10; /*convert into microseconds*/
|
||||||
|
tmpres -= DELTA_EPOCH_IN_MICROSECS;
|
||||||
|
tv->tv_sec = (long)(tmpres / 1000000UL);
|
||||||
|
tv->tv_usec = (long)(tmpres % 1000000UL);
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
|
||||||
|
|
||||||
|
/*
 * Allocate a buffer from a System V shared-memory segment requested with
 * SHM_HUGETLB.
 *
 * size : number of bytes wanted; the request handed to shmget() is
 *        (size + HUGE_PAGESIZE) masked down to a HUGE_PAGESIZE multiple.
 *
 * Returns the address the segment was attached at.  On any failure a message
 * is printed to stdout and the process exits with status 1, so this function
 * never returns NULL.
 */
static void *huge_malloc(BLASLONG size){

#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif

  int   segment;
  void *mapped;

  segment = shmget(IPC_PRIVATE,
                   (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
                   SHM_HUGETLB | IPC_CREAT |0600);
  if (segment < 0) {
    printf( "Memory allocation failed(shmget).\n");
    exit(1);
  }

  mapped = shmat(segment, NULL, SHM_RND);
  if ((BLASLONG)mapped == -1){
    printf( "Memory allocation failed(shmat).\n");
    exit(1);
  }

  /* IPC_RMID is issued right after attaching; the mapping itself is kept. */
  shmctl(segment, IPC_RMID, 0);

  return mapped;
}
|
||||||
|
|
||||||
|
#define malloc huge_malloc
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
int main(int argc, char *argv[]){
|
||||||
|
|
||||||
|
FLOAT *a, *x, *y;
|
||||||
|
FLOAT alpha[] = {1.0, 1.0};
|
||||||
|
blasint inc = 1;
|
||||||
|
char *p;
|
||||||
|
|
||||||
|
char uplo='U';
|
||||||
|
char trans='N';
|
||||||
|
|
||||||
|
if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p;
|
||||||
|
if ((p = getenv("OPENBLAS_TRANS"))) trans=*p;
|
||||||
|
|
||||||
|
blasint m, i, j;
|
||||||
|
|
||||||
|
int from = 1;
|
||||||
|
int to = 200;
|
||||||
|
int step = 1;
|
||||||
|
|
||||||
|
struct timeval start, stop;
|
||||||
|
double time1;
|
||||||
|
|
||||||
|
argc--;argv++;
|
||||||
|
|
||||||
|
if (argc > 0) { from = atol(*argv); argc--; argv++;}
|
||||||
|
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
|
||||||
|
if (argc > 0) { step = atol(*argv); argc--; argv++;}
|
||||||
|
|
||||||
|
fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = %c Trans = %c\n", from, to, step,uplo,trans);
|
||||||
|
|
||||||
|
|
||||||
|
if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
|
||||||
|
fprintf(stderr,"Out of Memory!!\n");exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * COMPSIZE)) == NULL){
|
||||||
|
fprintf(stderr,"Out of Memory!!\n");exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * COMPSIZE)) == NULL){
|
||||||
|
fprintf(stderr,"Out of Memory!!\n");exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
#ifdef linux
|
||||||
|
srandom(getpid());
|
||||||
|
#endif
|
||||||
|
|
||||||
|
fprintf(stderr, " SIZE Flops\n");
|
||||||
|
|
||||||
|
for(m = from; m <= to; m += step)
|
||||||
|
{
|
||||||
|
fprintf(stderr, " %6d : ", (int)m);
|
||||||
|
|
||||||
|
for(j = 0; j < m; j++){
|
||||||
|
for(i = 0; i < m * COMPSIZE; i++){
|
||||||
|
a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||||
|
}
|
||||||
|
x[ (long)j * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||||
|
y[ (long)j * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||||
|
}
|
||||||
|
|
||||||
|
gettimeofday( &start, (struct timezone *)0);
|
||||||
|
|
||||||
|
|
||||||
|
HER2 (&uplo, &m, alpha, x, &inc, y, &inc, a, &m );
|
||||||
|
|
||||||
|
gettimeofday( &stop, (struct timezone *)0);
|
||||||
|
|
||||||
|
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
|
||||||
|
|
||||||
|
gettimeofday( &start, (struct timezone *)0);
|
||||||
|
|
||||||
|
fprintf(stderr,
|
||||||
|
" %10.2f MFlops\n",
|
||||||
|
COMPSIZE * COMPSIZE * 2. * (double)m * (double)m / time1 * 1.e-6);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
|
@ -181,7 +181,7 @@ int main(int argc, char *argv[]){
|
||||||
timeg /= loops;
|
timeg /= loops;
|
||||||
|
|
||||||
fprintf(stderr,
|
fprintf(stderr,
|
||||||
" %10.2f MFlops %10.6f sec\n",
|
" %10.2f MBytes %10.6f sec\n",
|
||||||
COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg);
|
COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,219 @@
|
||||||
|
/***************************************************************************
|
||||||
|
Copyright (c) 2014, The OpenBLAS Project
|
||||||
|
All rights reserved.
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
1. Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in
|
||||||
|
the documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
3. Neither the name of the OpenBLAS project nor the names of
|
||||||
|
its contributors may be used to endorse or promote products
|
||||||
|
derived from this software without specific prior written permission.
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||||
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#ifdef __CYGWIN32__
|
||||||
|
#include <sys/time.h>
|
||||||
|
#endif
|
||||||
|
#include "common.h"
|
||||||
|
|
||||||
|
|
||||||
|
/* Select the routine benchmarked by this file.  SPMV expands to BLASFUNC() of
   sspmv / dspmv in a non-COMPLEX build and cspmv / zspmv in a COMPLEX build,
   the second name of each pair being used when DOUBLE is defined. */
#undef SPMV

#if !defined(COMPLEX)
#  if defined(DOUBLE)
#    define SPMV BLASFUNC(dspmv)
#  else
#    define SPMV BLASFUNC(sspmv)
#  endif
#else
#  if defined(DOUBLE)
#    define SPMV BLASFUNC(zspmv)
#  else
#    define SPMV BLASFUNC(cspmv)
#  endif
#endif
|
||||||
|
|
||||||
|
#if defined(__WIN32__) || defined(__WIN64__)
|
||||||
|
|
||||||
|
#ifndef DELTA_EPOCH_IN_MICROSECS
|
||||||
|
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
|
||||||
|
#endif
|
||||||
|
|
||||||
|
int gettimeofday(struct timeval *tv, void *tz){
|
||||||
|
|
||||||
|
FILETIME ft;
|
||||||
|
unsigned __int64 tmpres = 0;
|
||||||
|
static int tzflag;
|
||||||
|
|
||||||
|
if (NULL != tv)
|
||||||
|
{
|
||||||
|
GetSystemTimeAsFileTime(&ft);
|
||||||
|
|
||||||
|
tmpres |= ft.dwHighDateTime;
|
||||||
|
tmpres <<= 32;
|
||||||
|
tmpres |= ft.dwLowDateTime;
|
||||||
|
|
||||||
|
/*converting file time to unix epoch*/
|
||||||
|
tmpres /= 10; /*convert into microseconds*/
|
||||||
|
tmpres -= DELTA_EPOCH_IN_MICROSECS;
|
||||||
|
tv->tv_sec = (long)(tmpres / 1000000UL);
|
||||||
|
tv->tv_usec = (long)(tmpres % 1000000UL);
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
|
||||||
|
|
||||||
|
/*
 * Allocate a buffer from a System V shared-memory segment requested with
 * SHM_HUGETLB.
 *
 * size : number of bytes wanted; the request handed to shmget() is
 *        (size + HUGE_PAGESIZE) masked down to a HUGE_PAGESIZE multiple.
 *
 * Returns the address the segment was attached at.  On any failure a message
 * is printed to stdout and the process exits with status 1, so this function
 * never returns NULL.
 */
static void *huge_malloc(BLASLONG size){

#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif

  int   segment;
  void *mapped;

  segment = shmget(IPC_PRIVATE,
                   (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
                   SHM_HUGETLB | IPC_CREAT |0600);
  if (segment < 0) {
    printf( "Memory allocation failed(shmget).\n");
    exit(1);
  }

  mapped = shmat(segment, NULL, SHM_RND);
  if ((BLASLONG)mapped == -1){
    printf( "Memory allocation failed(shmat).\n");
    exit(1);
  }

  /* IPC_RMID is issued right after attaching; the mapping itself is kept. */
  shmctl(segment, IPC_RMID, 0);

  return mapped;
}
|
||||||
|
|
||||||
|
#define malloc huge_malloc
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
int main(int argc, char *argv[]){
|
||||||
|
|
||||||
|
FLOAT *a, *x, *y;
|
||||||
|
FLOAT alpha[] = {1.0, 1.0};
|
||||||
|
FLOAT beta [] = {1.0, 1.0};
|
||||||
|
char uplo='L';
|
||||||
|
blasint m, i, j;
|
||||||
|
blasint inc_x=1,inc_y=1;
|
||||||
|
int loops = 1;
|
||||||
|
int l;
|
||||||
|
char *p;
|
||||||
|
|
||||||
|
int from = 1;
|
||||||
|
int to = 200;
|
||||||
|
int step = 1;
|
||||||
|
|
||||||
|
struct timeval start, stop;
|
||||||
|
double time1,timeg;
|
||||||
|
|
||||||
|
argc--;argv++;
|
||||||
|
|
||||||
|
if (argc > 0) { from = atol(*argv); argc--; argv++;}
|
||||||
|
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
|
||||||
|
if (argc > 0) { step = atol(*argv); argc--; argv++;}
|
||||||
|
|
||||||
|
if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
|
||||||
|
if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
|
||||||
|
if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p);
|
||||||
|
if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p;
|
||||||
|
|
||||||
|
fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = '%c' Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,uplo,inc_x,inc_y,loops);
|
||||||
|
|
||||||
|
if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){
|
||||||
|
fprintf(stderr,"Out of Memory!!\n");exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
|
||||||
|
fprintf(stderr,"Out of Memory!!\n");exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){
|
||||||
|
fprintf(stderr,"Out of Memory!!\n");exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
#ifdef linux
|
||||||
|
srandom(getpid());
|
||||||
|
#endif
|
||||||
|
|
||||||
|
fprintf(stderr, " SIZE Flops\n");
|
||||||
|
|
||||||
|
for(m = from; m <= to; m += step)
|
||||||
|
{
|
||||||
|
|
||||||
|
timeg=0;
|
||||||
|
|
||||||
|
fprintf(stderr, " %6dx%d : ", (int)m,(int)m);
|
||||||
|
|
||||||
|
for(j = 0; j < m; j++){
|
||||||
|
for(i = 0; i < m * COMPSIZE; i++){
|
||||||
|
a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
for (l=0; l<loops; l++)
|
||||||
|
{
|
||||||
|
|
||||||
|
for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
|
||||||
|
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||||
|
}
|
||||||
|
|
||||||
|
for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
|
||||||
|
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||||
|
}
|
||||||
|
gettimeofday( &start, (struct timezone *)0);
|
||||||
|
|
||||||
|
SPMV (&uplo, &m, alpha, a, x, &inc_x, beta, y, &inc_y );
|
||||||
|
|
||||||
|
gettimeofday( &stop, (struct timezone *)0);
|
||||||
|
|
||||||
|
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
|
||||||
|
|
||||||
|
timeg += time1;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
timeg /= loops;
|
||||||
|
|
||||||
|
fprintf(stderr,
|
||||||
|
" %10.2f MFlops\n",
|
||||||
|
COMPSIZE * COMPSIZE * 2. * (double)m * (double)m / timeg * 1.e-6);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
|
@ -187,7 +187,7 @@ int main(int argc, char *argv[]){
|
||||||
timeg /= loops;
|
timeg /= loops;
|
||||||
|
|
||||||
fprintf(stderr,
|
fprintf(stderr,
|
||||||
" %10.2f MBytes %10.6f sec\n",
|
" %10.2f MFlops %10.6f sec\n",
|
||||||
COMPSIZE * COMPSIZE * 1. * (double)m * (double)m / timeg * 1.e-6, timeg);
|
COMPSIZE * COMPSIZE * 1. * (double)m * (double)m / timeg * 1.e-6, timeg);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -196,7 +196,7 @@ int main(int argc, char *argv[]){
|
||||||
timeg /= loops;
|
timeg /= loops;
|
||||||
|
|
||||||
fprintf(stderr,
|
fprintf(stderr,
|
||||||
" %10.2f MBytes %10.6f sec\n",
|
" %10.2f MFlops %10.6f sec\n",
|
||||||
COMPSIZE * COMPSIZE * 2. * (double)m * (double)m / timeg * 1.e-6, timeg);
|
COMPSIZE * COMPSIZE * 2. * (double)m * (double)m / timeg * 1.e-6, timeg);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,172 @@
|
||||||
|
/***************************************************************************
|
||||||
|
Copyright (c) 2014, The OpenBLAS Project
|
||||||
|
All rights reserved.
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
1. Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in
|
||||||
|
the documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
3. Neither the name of the OpenBLAS project nor the names of
|
||||||
|
its contributors may be used to endorse or promote products
|
||||||
|
derived from this software without specific prior written permission.
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||||
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#ifdef __CYGWIN32__
|
||||||
|
#include <sys/time.h>
|
||||||
|
#endif
|
||||||
|
#include "common.h"
|
||||||
|
|
||||||
|
/* Select the routine benchmarked by this file.  TPMV expands to BLASFUNC() of
   stpmv / dtpmv in a non-COMPLEX build and ctpmv / ztpmv in a COMPLEX build,
   the second name of each pair being used when DOUBLE is defined. */
#undef TPMV

#if !defined(COMPLEX)
#  if defined(DOUBLE)
#    define TPMV BLASFUNC(dtpmv)
#  else
#    define TPMV BLASFUNC(stpmv)
#  endif
#else
#  if defined(DOUBLE)
#    define TPMV BLASFUNC(ztpmv)
#  else
#    define TPMV BLASFUNC(ctpmv)
#  endif
#endif
|
||||||
|
|
||||||
|
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
|
||||||
|
|
||||||
|
/*
 * Allocate a buffer from a System V shared-memory segment requested with
 * SHM_HUGETLB.
 *
 * size : number of bytes wanted; the request handed to shmget() is
 *        (size + HUGE_PAGESIZE) masked down to a HUGE_PAGESIZE multiple.
 *
 * Returns the address the segment was attached at.  On any failure a message
 * is printed to stdout and the process exits with status 1, so this function
 * never returns NULL.
 */
static void *huge_malloc(BLASLONG size)
{

#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif

  int   segment;
  void *mapped;

  segment = shmget(IPC_PRIVATE,
                   (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
                   SHM_HUGETLB | IPC_CREAT |0600);
  if (segment < 0) {
    printf( "Memory allocation failed(shmget).\n");
    exit(1);
  }

  mapped = shmat(segment, NULL, SHM_RND);
  if ((BLASLONG)mapped == -1) {
    printf( "Memory allocation failed(shmat).\n");
    exit(1);
  }

  /* IPC_RMID is issued right after attaching; the mapping itself is kept. */
  shmctl(segment, IPC_RMID, 0);

  return mapped;
}
|
||||||
|
|
||||||
|
#define malloc huge_malloc
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
int main(int argc, char *argv[])
|
||||||
|
{
|
||||||
|
|
||||||
|
FLOAT *a, *x;
|
||||||
|
char *p;
|
||||||
|
|
||||||
|
char uplo ='U';
|
||||||
|
char trans='N';
|
||||||
|
char diag ='U';
|
||||||
|
|
||||||
|
int loops = 1;
|
||||||
|
int l;
|
||||||
|
blasint inc_x=1;
|
||||||
|
|
||||||
|
if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p;
|
||||||
|
if ((p = getenv("OPENBLAS_TRANS"))) trans=*p;
|
||||||
|
if ((p = getenv("OPENBLAS_DIAG"))) diag=*p;
|
||||||
|
if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
|
||||||
|
if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
|
||||||
|
|
||||||
|
blasint n, i, j;
|
||||||
|
|
||||||
|
int from = 1;
|
||||||
|
int to = 200;
|
||||||
|
int step = 1;
|
||||||
|
|
||||||
|
struct timespec start = { 0, 0 }, stop = { 0, 0 };
|
||||||
|
double time1, timeg;
|
||||||
|
|
||||||
|
argc--;argv++;
|
||||||
|
|
||||||
|
if (argc > 0) { from = atol(*argv); argc--; argv++;}
|
||||||
|
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
|
||||||
|
if (argc > 0) { step = atol(*argv); argc--; argv++;}
|
||||||
|
|
||||||
|
fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = %c Trans = %c Diag = %c Loops=%d Inc_x=%d\n", from,
|
||||||
|
to, step, uplo, trans, diag, loops, inc_x);
|
||||||
|
|
||||||
|
if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL) {
|
||||||
|
fprintf(stderr,"Out of Memory!!\n");exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
|
||||||
|
fprintf(stderr,"Out of Memory!!\n");exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
#ifdef linux
|
||||||
|
srandom(getpid());
|
||||||
|
#endif
|
||||||
|
|
||||||
|
fprintf(stderr, " SIZE Flops\n");
|
||||||
|
|
||||||
|
for(n = from; n <= to; n += step) {
|
||||||
|
timeg=0;
|
||||||
|
|
||||||
|
fprintf(stderr, " %6d : ", (int)n);
|
||||||
|
for(j = 0; j < n; j++) {
|
||||||
|
for(i = 0; i < n * COMPSIZE; i++) {
|
||||||
|
a[(long)i + (long)j * (long)n * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for (i = 0; i < n * COMPSIZE * abs(inc_x); i++) {
|
||||||
|
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (l = 0; l < loops; l++) {
|
||||||
|
clock_gettime(CLOCK_REALTIME, &start);
|
||||||
|
TPMV (&uplo, &trans, &diag, &n, a, x, &inc_x);
|
||||||
|
clock_gettime(CLOCK_REALTIME, &stop);
|
||||||
|
|
||||||
|
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_nsec - start.tv_nsec)) / 1.e9;
|
||||||
|
timeg += time1;
|
||||||
|
}
|
||||||
|
|
||||||
|
timeg /= loops;
|
||||||
|
fprintf(stderr, " %10.2f MFlops %12.9f sec\n",
|
||||||
|
COMPSIZE * COMPSIZE * 1. * (double)n * (double)n / timeg / 1.e6, timeg);
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
|
@ -0,0 +1,172 @@
|
||||||
|
/***************************************************************************
|
||||||
|
Copyright (c) 2014, The OpenBLAS Project
|
||||||
|
All rights reserved.
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
1. Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in
|
||||||
|
the documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
3. Neither the name of the OpenBLAS project nor the names of
|
||||||
|
its contributors may be used to endorse or promote products
|
||||||
|
derived from this software without specific prior written permission.
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||||
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#ifdef __CYGWIN32__
|
||||||
|
#include <sys/time.h>
|
||||||
|
#endif
|
||||||
|
#include "common.h"
|
||||||
|
|
||||||
|
/* Select the routine benchmarked by this file.  TPSV expands to BLASFUNC() of
   stpsv / dtpsv in a non-COMPLEX build and ctpsv / ztpsv in a COMPLEX build,
   the second name of each pair being used when DOUBLE is defined. */
#undef TPSV

#if !defined(COMPLEX)
#  if defined(DOUBLE)
#    define TPSV BLASFUNC(dtpsv)
#  else
#    define TPSV BLASFUNC(stpsv)
#  endif
#else
#  if defined(DOUBLE)
#    define TPSV BLASFUNC(ztpsv)
#  else
#    define TPSV BLASFUNC(ctpsv)
#  endif
#endif
|
||||||
|
|
||||||
|
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
|
||||||
|
|
||||||
|
/*
 * Allocate a buffer from a System V shared-memory segment requested with
 * SHM_HUGETLB.
 *
 * size : number of bytes wanted; the request handed to shmget() is
 *        (size + HUGE_PAGESIZE) masked down to a HUGE_PAGESIZE multiple.
 *
 * Returns the address the segment was attached at.  On any failure a message
 * is printed to stdout and the process exits with status 1, so this function
 * never returns NULL.
 */
static void *huge_malloc(BLASLONG size)
{

#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif

  int   segment;
  void *mapped;

  segment = shmget(IPC_PRIVATE,
                   (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
                   SHM_HUGETLB | IPC_CREAT |0600);
  if (segment < 0) {
    printf( "Memory allocation failed(shmget).\n");
    exit(1);
  }

  mapped = shmat(segment, NULL, SHM_RND);
  if ((BLASLONG)mapped == -1) {
    printf( "Memory allocation failed(shmat).\n");
    exit(1);
  }

  /* IPC_RMID is issued right after attaching; the mapping itself is kept. */
  shmctl(segment, IPC_RMID, 0);

  return mapped;
}
|
||||||
|
|
||||||
|
#define malloc huge_malloc
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
int main(int argc, char *argv[])
|
||||||
|
{
|
||||||
|
|
||||||
|
FLOAT *a, *x;
|
||||||
|
char *p;
|
||||||
|
|
||||||
|
char uplo ='U';
|
||||||
|
char trans='N';
|
||||||
|
char diag ='U';
|
||||||
|
|
||||||
|
int loops = 1;
|
||||||
|
int l;
|
||||||
|
blasint inc_x=1;
|
||||||
|
|
||||||
|
if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p;
|
||||||
|
if ((p = getenv("OPENBLAS_TRANS"))) trans=*p;
|
||||||
|
if ((p = getenv("OPENBLAS_DIAG"))) diag=*p;
|
||||||
|
if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p);
|
||||||
|
if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
|
||||||
|
|
||||||
|
blasint n, i, j;
|
||||||
|
|
||||||
|
int from = 1;
|
||||||
|
int to = 200;
|
||||||
|
int step = 1;
|
||||||
|
|
||||||
|
struct timespec start = { 0, 0 }, stop = { 0, 0 };
|
||||||
|
double time1, timeg;
|
||||||
|
|
||||||
|
argc--;argv++;
|
||||||
|
|
||||||
|
if (argc > 0) { from = atol(*argv); argc--; argv++;}
|
||||||
|
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;}
|
||||||
|
if (argc > 0) { step = atol(*argv); argc--; argv++;}
|
||||||
|
|
||||||
|
fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = %c Trans = %c Diag = %c Loops=%d Inc_x=%d\n", from,
|
||||||
|
to, step, uplo, trans, diag, loops, inc_x);
|
||||||
|
|
||||||
|
if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL) {
|
||||||
|
fprintf(stderr,"Out of Memory!!\n");exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){
|
||||||
|
fprintf(stderr,"Out of Memory!!\n");exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
#ifdef linux
|
||||||
|
srandom(getpid());
|
||||||
|
#endif
|
||||||
|
|
||||||
|
fprintf(stderr, " SIZE Flops\n");
|
||||||
|
|
||||||
|
for(n = from; n <= to; n += step) {
|
||||||
|
timeg=0;
|
||||||
|
|
||||||
|
fprintf(stderr, " %6d : ", (int)n);
|
||||||
|
for(j = 0; j < n; j++) {
|
||||||
|
for(i = 0; i < n * COMPSIZE; i++) {
|
||||||
|
a[(long)i + (long)j * (long)n * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for (i = 0; i < n * COMPSIZE * abs(inc_x); i++) {
|
||||||
|
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (l = 0; l < loops; l++) {
|
||||||
|
clock_gettime(CLOCK_REALTIME, &start);
|
||||||
|
TPSV (&uplo, &trans, &diag, &n, a, x, &inc_x);
|
||||||
|
clock_gettime(CLOCK_REALTIME, &stop);
|
||||||
|
|
||||||
|
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_nsec - start.tv_nsec)) / 1.e9;
|
||||||
|
timeg += time1;
|
||||||
|
}
|
||||||
|
|
||||||
|
timeg /= loops;
|
||||||
|
fprintf(stderr, " %10.2f MFlops %12.9f sec\n",
|
||||||
|
COMPSIZE * COMPSIZE * 1. * (double)n * (double)n / timeg / 1.e6, timeg);
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__")));
|
|
@ -40,8 +40,11 @@ ztestl3o = c_zblas3.o c_z3chke.o auxiliary.o c_xerbla.o constant.o
|
||||||
ztestl3o_3m = c_zblas3_3m.o c_z3chke_3m.o auxiliary.o c_xerbla.o constant.o
|
ztestl3o_3m = c_zblas3_3m.o c_z3chke_3m.o auxiliary.o c_xerbla.o constant.o
|
||||||
|
|
||||||
|
|
||||||
|
ifeq ($(NOFORTRAN),1)
|
||||||
|
all ::
|
||||||
|
else
|
||||||
all :: all1 all2 all3
|
all :: all1 all2 all3
|
||||||
|
endif
|
||||||
|
|
||||||
all1: xscblat1 xdcblat1 xccblat1 xzcblat1
|
all1: xscblat1 xdcblat1 xccblat1 xzcblat1
|
||||||
ifndef CROSS
|
ifndef CROSS
|
||||||
|
|
|
@ -77,10 +77,10 @@ ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||||
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
|
||||||
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
STRSMKERNEL_LN = strsm_kernel_8x4_haswell_LN.c
|
||||||
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
STRSMKERNEL_LT = strsm_kernel_8x4_haswell_LT.c
|
||||||
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
STRSMKERNEL_RN = strsm_kernel_8x4_haswell_RN.c
|
||||||
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
STRSMKERNEL_RT = strsm_kernel_8x4_haswell_RT.c
|
||||||
|
|
||||||
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||||
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||||
|
|
|
@ -6,6 +6,10 @@ SGEMMINCOPY = ../generic/gemm_ncopy_16.c
|
||||||
SGEMMITCOPY = sgemm_tcopy_16_skylakex.c
|
SGEMMITCOPY = sgemm_tcopy_16_skylakex.c
|
||||||
SGEMMONCOPY = sgemm_ncopy_4_skylakex.c
|
SGEMMONCOPY = sgemm_ncopy_4_skylakex.c
|
||||||
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
|
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
|
||||||
|
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||||
|
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||||
|
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||||
|
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||||
|
|
||||||
DGEMMKERNEL = dgemm_kernel_16x2_skylakex.c
|
DGEMMKERNEL = dgemm_kernel_16x2_skylakex.c
|
||||||
DTRMMKERNEL = dgemm_kernel_16x2_skylakex.c
|
DTRMMKERNEL = dgemm_kernel_16x2_skylakex.c
|
||||||
|
|
|
@ -74,10 +74,10 @@ ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||||
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||||
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
|
||||||
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
STRSMKERNEL_LN = strsm_kernel_8x4_haswell_LN.c
|
||||||
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
STRSMKERNEL_LT = strsm_kernel_8x4_haswell_LT.c
|
||||||
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
STRSMKERNEL_RN = strsm_kernel_8x4_haswell_RN.c
|
||||||
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
STRSMKERNEL_RT = strsm_kernel_8x4_haswell_RT.c
|
||||||
|
|
||||||
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||||
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||||
|
|
|
@ -0,0 +1,240 @@
|
||||||
|
#include "common.h"
|
||||||
|
#include <stdint.h>
|
||||||
|
#include "strsm_kernel_8x4_haswell_L_common.h"
|
||||||
|
|
||||||
|
/*
 * SOLVE_LN_m<M>n<N>: inline-asm fragments executed after the GEMM update of
 * an M x N tile in the LN kernel.  Every variant has the same four steps:
 *   1. move the C pointer (%2) back by M floats and copy it into %3;
 *   2. add the C tile to the accumulators and regroup them
 *      (GEMM_SUM_REORDER_*);
 *   3. run the triangular solve using negative byte offsets from %0 and
 *      store the results with SAVE_b_*;
 *   4. reset %3 to %2 and write the tile back to C (save_c_*).
 * NOTE(review): SOLVE_m1n*, SOLVE_loup_*, SUBTRACT_* and SAVE_b_* are defined
 * outside this chunk; step 3 is described from their names and arguments
 * only -- confirm against strsm_kernel_8x4_haswell_L_common.h.
 */

/* ---- 1-row tiles ---- */
#define SOLVE_LN_m1n4 \
  "subq $4,%2; movq %2,%3;" \
  GEMM_SUM_REORDER_1x4(4) \
  SOLVE_m1n4(-4,4) SAVE_b_m1n4(-16,4) \
  "movq %2,%3;" save_c_m1n4(4)

#define SOLVE_LN_m1n8 \
  "subq $4,%2; movq %2,%3;" \
  GEMM_SUM_REORDER_1x4(4) GEMM_SUM_REORDER_1x4(5) \
  SOLVE_m1n8(-4,4,5) SAVE_b_m1n8(-16,4,5) \
  "movq %2,%3;" save_c_m1n4(4) save_c_m1n4(5)

#define SOLVE_LN_m1n12 \
  "subq $4,%2; movq %2,%3;" \
  GEMM_SUM_REORDER_1x4(4) GEMM_SUM_REORDER_1x4(5) GEMM_SUM_REORDER_1x4(6) \
  SOLVE_m1n12(-4,4,5,6) SAVE_b_m1n12(-16,4,5,6) \
  "movq %2,%3;" save_c_m1n4(4) save_c_m1n4(5) save_c_m1n4(6)

/* ---- 2-row tiles ---- */
#define SOLVE_LN_m2n4 \
  "subq $8,%2; movq %2,%3;" \
  GEMM_SUM_REORDER_2x4(4,5,4) \
  SOLVE_loup_m2n4(-8,4) \
  SOLVE_up_m2n4(-16,4) SAVE_b_m2n4(-32,4) \
  "movq %2,%3;" save_c_m2n4(4)

#define SOLVE_LN_m2n8 \
  "subq $8,%2; movq %2,%3;" \
  GEMM_SUM_REORDER_2x4(4,5,4) GEMM_SUM_REORDER_2x4(6,7,5) \
  SOLVE_loup_m2n8(-8,4,5) \
  SOLVE_up_m2n8(-16,4,5) SAVE_b_m2n8(-32,4,5) \
  "movq %2,%3;" save_c_m2n4(4) save_c_m2n4(5)

#define SOLVE_LN_m2n12 \
  "subq $8,%2; movq %2,%3;" \
  GEMM_SUM_REORDER_2x4(4,5,4) GEMM_SUM_REORDER_2x4(6,7,5) GEMM_SUM_REORDER_2x4(8,9,6) \
  SOLVE_loup_m2n12(-8,4,5,6) \
  SOLVE_up_m2n12(-16,4,5,6) SAVE_b_m2n12(-32,4,5,6) \
  "movq %2,%3;" save_c_m2n4(4) save_c_m2n4(5) save_c_m2n4(6)

/* ---- 4-row tiles: two stages, second register group first ---- */
#define SOLVE_LN_m4n4 \
  "subq $16,%2; movq %2,%3;" \
  GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) \
  \
  SOLVE_loup_m2n4(-8,5) SUBTRACT_m2n4(-16,4) \
  SOLVE_up_m2n4(-24,5) SUBTRACT_m2n4(-32,4) SAVE_b_m2n4(-32,5) \
  \
  SOLVE_loup_m2n4(-48,4) \
  SOLVE_up_m2n4(-64,4) SAVE_b_m2n4(-64,4) \
  \
  "movq %2,%3;" save_c_m4n4(4,5)

#define SOLVE_LN_m4n8 \
  "subq $16,%2; movq %2,%3;" \
  GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) GEMM_SUM_REORDER_4x4(8,9,10,11,6,7) \
  \
  SOLVE_loup_m2n8(-8,5,7) SUBTRACT_m2n8(-16,4,6) \
  SOLVE_up_m2n8(-24,5,7) SUBTRACT_m2n8(-32,4,6) SAVE_b_m2n8(-32,5,7) \
  \
  SOLVE_loup_m2n8(-48,4,6) \
  SOLVE_up_m2n8(-64,4,6) SAVE_b_m2n8(-64,4,6) \
  \
  "movq %2,%3;" save_c_m4n4(4,5) save_c_m4n4(6,7)

#define SOLVE_LN_m4n12 \
  "subq $16,%2; movq %2,%3;" \
  GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) GEMM_SUM_REORDER_4x4(8,9,10,11,6,7) \
  GEMM_SUM_REORDER_4x4(12,13,14,15,8,9) \
  \
  SOLVE_loup_m2n12(-8,5,7,9) SUBTRACT_m2n12(-16,4,6,8) \
  SOLVE_up_m2n12(-24,5,7,9) SUBTRACT_m2n12(-32,4,6,8) SAVE_b_m2n12(-32,5,7,9) \
  \
  SOLVE_loup_m2n12(-48,4,6,8) \
  SOLVE_up_m2n12(-64,4,6,8) SAVE_b_m2n12(-64,4,6,8) \
  \
  "movq %2,%3;" save_c_m4n4(4,5) save_c_m4n4(6,7) save_c_m4n4(8,9)

/* ---- 8-row tiles: four stages, highest register group first ---- */
#define SOLVE_LN_m8n4 \
  "subq $32,%2; movq %2,%3;" \
  GEMM_SUM_REORDER_8x4(4,5,6,7,-32) \
  \
  SOLVE_loup_m2n4(-8,7) SUBTRACT_m2n4(-16,6) SUBTRACT_m2n4(-24,5) SUBTRACT_m2n4(-32,4) \
  SOLVE_up_m2n4(-40,7) SUBTRACT_m2n4(-48,6) SUBTRACT_m2n4(-56,5) SUBTRACT_m2n4(-64,4) SAVE_b_m2n4(-32,7) \
  \
  SOLVE_loup_m2n4(-80,6) SUBTRACT_m2n4(-88,5) SUBTRACT_m2n4(-96,4) \
  SOLVE_up_m2n4(-112,6) SUBTRACT_m2n4(-120,5) SUBTRACT_m2n4(-128,4) SAVE_b_m2n4(-64,6) \
  \
  SOLVE_loup_m2n4(-152,5) SUBTRACT_m2n4(-160,4) \
  SOLVE_up_m2n4(-184,5) SUBTRACT_m2n4(-192,4) SAVE_b_m2n4(-96,5) \
  \
  SOLVE_loup_m2n4(-224,4) \
  SOLVE_up_m2n4(-256,4) SAVE_b_m2n4(-128,4) \
  \
  "movq %2,%3;" save_c_m8n4(4,5,6,7)

#define SOLVE_LN_m8n8 \
  "subq $32,%2; movq %2,%3;" \
  GEMM_SUM_REORDER_8x4(4,5,6,7,-32) GEMM_SUM_REORDER_8x4(8,9,10,11,-32) \
  \
  SOLVE_loup_m2n8(-8,7,11) SUBTRACT_m2n8(-16,6,10) SUBTRACT_m2n8(-24,5,9) SUBTRACT_m2n8(-32,4,8) \
  SOLVE_up_m2n8(-40,7,11) SUBTRACT_m2n8(-48,6,10) SUBTRACT_m2n8(-56,5,9) SUBTRACT_m2n8(-64,4,8) SAVE_b_m2n8(-32,7,11) \
  \
  SOLVE_loup_m2n8(-80,6,10) SUBTRACT_m2n8(-88,5,9) SUBTRACT_m2n8(-96,4,8) \
  SOLVE_up_m2n8(-112,6,10) SUBTRACT_m2n8(-120,5,9) SUBTRACT_m2n8(-128,4,8) SAVE_b_m2n8(-64,6,10) \
  \
  SOLVE_loup_m2n8(-152,5,9) SUBTRACT_m2n8(-160,4,8) \
  SOLVE_up_m2n8(-184,5,9) SUBTRACT_m2n8(-192,4,8) SAVE_b_m2n8(-96,5,9) \
  \
  SOLVE_loup_m2n8(-224,4,8) \
  SOLVE_up_m2n8(-256,4,8) SAVE_b_m2n8(-128,4,8) \
  \
  "movq %2,%3;" save_c_m8n4(4,5,6,7) save_c_m8n4(8,9,10,11)

#define SOLVE_LN_m8n12 \
  "subq $32,%2; movq %2,%3;" \
  GEMM_SUM_REORDER_8x4(4,5,6,7,-32) GEMM_SUM_REORDER_8x4(8,9,10,11,-32) \
  GEMM_SUM_REORDER_8x4(12,13,14,15,-32) \
  \
  SOLVE_loup_m2n12(-8,7,11,15) SUBTRACT_m2n12(-16,6,10,14) SUBTRACT_m2n12(-24,5,9,13) SUBTRACT_m2n12(-32,4,8,12) \
  SOLVE_up_m2n12(-40,7,11,15) SUBTRACT_m2n12(-48,6,10,14) SUBTRACT_m2n12(-56,5,9,13) SUBTRACT_m2n12(-64,4,8,12) SAVE_b_m2n12(-32,7,11,15) \
  \
  SOLVE_loup_m2n12(-80,6,10,14) SUBTRACT_m2n12(-88,5,9,13) SUBTRACT_m2n12(-96,4,8,12) \
  SOLVE_up_m2n12(-112,6,10,14) SUBTRACT_m2n12(-120,5,9,13) SUBTRACT_m2n12(-128,4,8,12) SAVE_b_m2n12(-64,6,10,14) \
  \
  SOLVE_loup_m2n12(-152,5,9,13) SUBTRACT_m2n12(-160,4,8,12) \
  SOLVE_up_m2n12(-184,5,9,13) SUBTRACT_m2n12(-192,4,8,12) SAVE_b_m2n12(-96,5,9,13) \
  \
  SOLVE_loup_m2n12(-224,4,8,12) \
  SOLVE_up_m2n12(-256,4,8,12) SAVE_b_m2n12(-128,4,8,12) \
  \
  "movq %2,%3;" save_c_m8n4(4,5,6,7) save_c_m8n4(8,9,10,11) save_c_m8n4(12,13,14,15)
|
||||||
|
|
||||||
|
/* r13 = k-kk, r14 = b_tail, r15 = a_tail */

/*
 * GEMM_LN_SIMPLE(mdim,ndim): GEMM update for one mdim x ndim tile, walking
 * the packed A and B panels backwards.
 *   - %0 takes the current r15, then r15 is moved back by r12*mdim bytes
 *     (r12 is negated around the leaq and restored afterwards);
 *   - the loop counter %5 is loaded from r13, and r13 grows by mdim for the
 *     next tile;
 *   - %1 restarts from r14 and the accumulators are cleared (INIT_*);
 *   - each iteration pre-decrements %1 by 16 bytes and %0 by mdim*4 bytes,
 *     then runs one k-step (GEMM_KERNEL_k1*); a zero count skips the loop.
 * Local labels are "2<mdim><ndim>1" (loop head) and "2<mdim><ndim>2" (exit).
 */
#define GEMM_LN_SIMPLE(mdim,ndim) \
  "movq %%r15,%0; negq %%r12; leaq (%%r15,%%r12,"#mdim"),%%r15; negq %%r12;" \
  "movq %%r13,%5; addq $"#mdim",%%r13; movq %%r14,%1;" INIT_m##mdim##n##ndim \
  "testq %5,%5; jz 2"#mdim""#ndim"2f;" \
  "2"#mdim""#ndim"1:\n\t" \
  "subq $16,%1; subq $"#mdim"*4,%0;" GEMM_KERNEL_k1m##mdim##n##ndim "decq %5; jnz 2"#mdim""#ndim"1b;" \
  "2"#mdim""#ndim"2:\n\t"

#define GEMM_LN_m8n4 GEMM_LN_SIMPLE(8,4)
#define GEMM_LN_m8n8 GEMM_LN_SIMPLE(8,8)

/*
 * 8x12 tile: same setup as GEMM_LN_SIMPLE(8,12), but the k loop is unrolled
 * eight times (label 28121) with a prefetch of A on every other step, and is
 * followed by a one-step remainder loop (label 28123).
 */
#define GEMM_LN_m8n12 \
  "movq %%r15,%0; negq %%r12; leaq (%%r15,%%r12,8),%%r15; negq %%r12; movq %%r13,%5; addq $8,%%r13; movq %%r14,%1;" INIT_m8n12 \
  "cmpq $8,%5; jb 28122f;" \
  "28121:\n\t" \
  "prefetcht0 -384(%0); subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12 \
  "subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12 \
  "prefetcht0 -384(%0); subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12 \
  "subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12 \
  "prefetcht0 -384(%0); subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12 \
  "subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12 \
  "prefetcht0 -384(%0); subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12 \
  "subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12 \
  "subq $8,%5; cmpq $8,%5; jnb 28121b;" \
  "28122:\n\t" \
  "testq %5,%5; jz 28124f;" \
  "28123:\n\t" \
  "subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12 "decq %5; jnz 28123b;" \
  "28124:\n\t"

/* Remaining tile shapes all use the simple loop. */
#define GEMM_LN_m4n4  GEMM_LN_SIMPLE(4,4)
#define GEMM_LN_m4n8  GEMM_LN_SIMPLE(4,8)
#define GEMM_LN_m4n12 GEMM_LN_SIMPLE(4,12)
#define GEMM_LN_m2n4  GEMM_LN_SIMPLE(2,4)
#define GEMM_LN_m2n8  GEMM_LN_SIMPLE(2,8)
#define GEMM_LN_m2n12 GEMM_LN_SIMPLE(2,12)
#define GEMM_LN_m1n4  GEMM_LN_SIMPLE(1,4)
#define GEMM_LN_m1n8  GEMM_LN_SIMPLE(1,8)
#define GEMM_LN_m1n12 GEMM_LN_SIMPLE(1,12)
|
||||||
|
|
||||||
|
/*
 * COMPUTE(ndim): process one panel of ndim columns (ndim = 4, 8 or 12) for
 * every row of the LN problem in a single asm statement.
 * It expects these locals in the expanding function: a_ptr, b_ptr, c_ptr,
 * c_tmp, ldc_bytes, k_cnt, K, kmkkinp, one, zero, M and ldc.
 * Rows are consumed from the end of the panel: a 1-row tile if M is odd,
 * then a 2-row and a 4-row tile when those bits of M are set, then 8-row
 * tiles until the row counter (r11) reaches zero.
 */
#define COMPUTE(ndim) {\
  c_ptr += M; /* start one past the last row; each SOLVE_LN_* steps back */ \
  __asm__ __volatile__(\
    /* r15 = a_ptr, r13 = %7, r12 = K*4, r14 = b_ptr + 4*r12 bytes, r11 = M */ \
    "movq %0,%%r15; movq %7,%%r13; movq %6,%%r12; salq $2,%%r12; leaq (%1,%%r12,4),%%r14; movq %10,%%r11;"\
    "testq $1,%%r11; jz "#ndim"772f;"\
    #ndim"771:\n\t"\
    GEMM_LN_m1n##ndim SOLVE_LN_m1n##ndim "subq $1,%%r11;"\
    #ndim"772:\n\t"\
    "testq $2,%%r11; jz "#ndim"773f;"\
    GEMM_LN_m2n##ndim SOLVE_LN_m2n##ndim "subq $2,%%r11;"\
    #ndim"773:\n\t"\
    "testq $4,%%r11; jz "#ndim"774f;"\
    GEMM_LN_m4n##ndim SOLVE_LN_m4n##ndim "subq $4,%%r11;"\
    #ndim"774:\n\t"\
    "testq %%r11,%%r11; jz "#ndim"776f;"\
    #ndim"775:\n\t"\
    GEMM_LN_m8n##ndim SOLVE_LN_m8n##ndim "subq $8,%%r11; jnz "#ndim"775b;"\
    #ndim"776:\n\t"\
    /* hand the final A and B positions back to C and clear upper ymm state */ \
    "movq %%r15,%0; movq %%r14,%1; vzeroupper;"\
    :"+r"(a_ptr),"+r"(b_ptr),"+r"(c_ptr),"+r"(c_tmp),"+r"(ldc_bytes),"+r"(k_cnt):"m"(K),"m"(kmkkinp),"m"(one[0]),"m"(zero[0]),"m"(M)\
    :"r11","r12","r13","r14","r15","cc","memory",\
    "xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15");\
  /* re-arm the pointers for the next column panel */ \
  a_ptr += M * K; b_ptr += (ndim-4) * K; c_ptr += ldc * ndim;\
}
|
||||||
|
/*
 * Scalar triangular solve for one m x n tile of the LN kernel, used by the
 * edge path.
 *
 *   a   : m x m block, read as a[row*m + col]; the diagonal entries already
 *         hold reciprocals, so the solve multiplies instead of dividing
 *   b   : m x n output block, written as b[row*n + col]
 *   c   : tile of C, column-major with leading dimension ldc; updated in place
 *
 * Rows are solved from the last to the first; each solved value is then
 * eliminated from the rows above it in the same column.
 */
static void solve_LN(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
  int row, col, above;

  for (row = m - 1; row >= 0; row--) {
    FLOAT *a_row = a + row * m;
    FLOAT *b_row = b + row * n;
    FLOAT diag_inv = a_row[row];

    for (col = 0; col < n; col++) {
      FLOAT *c_col = c + col * ldc;
      FLOAT solved = c_col[row] * diag_inv;

      b_row[col] = solved;
      c_col[row] = solved;

      for (above = 0; above < row; above++) c_col[above] -= solved * a_row[above];
    }
  }
}
|
||||||
|
/*
 * Edge path of the LN kernel for a narrow column chunk (the caller passes
 * n = 2 or n = 1).  Walks the m rows from the end of the panel towards the
 * start: first the 1-, 2- and 4-row remainders selected by the low bits of
 * m, then full 8-row tiles.  Every tile is first updated with
 * GEMM_KERNEL_N over the k-kk trailing steps (when there are any) and then
 * solved with solve_LN.
 */
static void COMPUTE_EDGE_1_nchunk(BLASLONG m, BLASLONG n, FLOAT *sa, FLOAT *sb, FLOAT *C, BLASLONG ldc, BLASLONG k, BLASLONG offset) {
  BLASLONG rows_left = m;
  BLASLONG kk = m + offset;
  BLASLONG width;
  FLOAT *a_ptr = sa + m * k;
  FLOAT *c_ptr = C + m;

  /* Remainder tiles: widths 1, 2 and 4, in that order. */
  for (width = 1; width < 8; width <<= 1) {
    if (!(rows_left & width)) continue;
    a_ptr -= k * width;
    c_ptr -= width;
    if (k - kk > 0) GEMM_KERNEL_N(width, n, k - kk, -1.0, a_ptr + kk * width, sb + kk * n, c_ptr, ldc);
    solve_LN(width, n, a_ptr + (kk - width) * width, sb + (kk - width) * n, c_ptr, ldc);
    kk -= width;
    rows_left -= width;
  }

  /* Whatever is left is a multiple of eight rows. */
  while (rows_left > 7) {
    a_ptr -= k * 8;
    c_ptr -= 8;
    if (k - kk > 0) GEMM_KERNEL_N(8, n, k - kk, -1.0, a_ptr + kk * 8, sb + kk * n, c_ptr, ldc);
    solve_LN(8, n, a_ptr + (kk - 8) * 8, sb + (kk - 8) * n, c_ptr, ldc);
    kk -= 8;
    rows_left -= 8;
  }
}
|
||||||
|
/*
 * LN kernel entry point.  Column panels of 12, 8 and 4 go through the
 * inline-asm COMPUTE() path; the remaining 2- and 1-column chunks use the
 * C edge routine.  dummy1 is not used.  Always returns 0.
 *
 * The local names a_ptr, b_ptr, c_ptr, c_tmp, ldc_bytes, k_cnt, K, M,
 * kmkkinp, one and zero are referenced by COMPUTE() and must keep these
 * exact spellings.
 */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *sa, FLOAT *sb, FLOAT *C, BLASLONG ldc, BLASLONG offset){
  float *a_ptr = sa + m * k;   /* one past the end of the packed A panel */
  float *b_ptr = sb;
  float *c_ptr = C;
  float *c_tmp = C;
  float one[8] = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0};
  float zero[8] = {0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0};
  uint64_t ldc_bytes = (uint64_t)ldc * sizeof(float);
  uint64_t K = (uint64_t)k;
  uint64_t M = (uint64_t)m;
  uint64_t kmkkinp = (uint64_t)(k-m-offset);   /* initial k - kk, with kk = m + offset */
  uint64_t k_cnt = 0;
  BLASLONG n_count = n;

  while (n_count > 11) { COMPUTE(12) n_count -= 12; }
  while (n_count > 7)  { COMPUTE(8)  n_count -= 8; }
  while (n_count > 3)  { COMPUTE(4)  n_count -= 4; }

  while (n_count > 1) {
    COMPUTE_EDGE_1_nchunk(m,2,sa,b_ptr,c_ptr,ldc,k,offset);
    b_ptr += 2*k;
    c_ptr += ldc*2;
    n_count -= 2;
  }
  if (n_count > 0) COMPUTE_EDGE_1_nchunk(m,1,sa,b_ptr,c_ptr,ldc,k,offset);

  return 0;
}
|
||||||
|
|
|
@ -0,0 +1,228 @@
|
||||||
|
#include "common.h"
|
||||||
|
#include <stdint.h>
|
||||||
|
#include "strsm_kernel_8x4_haswell_L_common.h"
|
||||||
|
|
||||||
|
/*
 * SOLVE_LT_m<M>n<N>: inline-asm fragments executed after the GEMM update of
 * an M x N tile in the LT kernel.  Every variant has the same four steps:
 *   1. copy the C pointer (%2) into %3;
 *   2. add the C tile to the accumulators and regroup them
 *      (GEMM_SUM_REORDER_*);
 *   3. run the triangular solve using non-negative byte offsets from %0 and
 *      store the results with SAVE_b_*;
 *   4. reset %3 to %2, advance %2 by M floats and write the tile back to C
 *      (save_c_*).
 * NOTE(review): SOLVE_m1n*, SOLVE_lo_*, SUBTRACT_* and SAVE_b_* are defined
 * outside this chunk; step 3 is described from their names and arguments
 * only -- confirm against strsm_kernel_8x4_haswell_L_common.h.
 */

/* ---- 1-row tiles ---- */
#define SOLVE_LT_m1n4 \
  "movq %2,%3;" \
  GEMM_SUM_REORDER_1x4(4) \
  SOLVE_m1n4(0,4) SAVE_b_m1n4(0,4) \
  "movq %2,%3; addq $4,%2;" save_c_m1n4(4)

#define SOLVE_LT_m1n8 \
  "movq %2,%3;" \
  GEMM_SUM_REORDER_1x4(4) GEMM_SUM_REORDER_1x4(5) \
  SOLVE_m1n8(0,4,5) SAVE_b_m1n8(0,4,5) \
  "movq %2,%3; addq $4,%2;" save_c_m1n4(4) save_c_m1n4(5)

#define SOLVE_LT_m1n12 \
  "movq %2,%3;" \
  GEMM_SUM_REORDER_1x4(4) GEMM_SUM_REORDER_1x4(5) GEMM_SUM_REORDER_1x4(6) \
  SOLVE_m1n12(0,4,5,6) SAVE_b_m1n12(0,4,5,6) \
  "movq %2,%3; addq $4,%2;" save_c_m1n4(4) save_c_m1n4(5) save_c_m1n4(6)

/* ---- 2-row tiles ---- */
#define SOLVE_LT_m2n4 \
  "movq %2,%3;" \
  GEMM_SUM_REORDER_2x4(4,5,4) \
  SOLVE_uplo_m2n4(0,4) \
  SOLVE_lo_m2n4(8,4) SAVE_b_m2n4(0,4) \
  "movq %2,%3; addq $8,%2;" save_c_m2n4(4)

#define SOLVE_LT_m2n8 \
  "movq %2,%3;" \
  GEMM_SUM_REORDER_2x4(4,5,4) GEMM_SUM_REORDER_2x4(6,7,5) \
  SOLVE_uplo_m2n8(0,4,5) \
  SOLVE_lo_m2n8(8,4,5) SAVE_b_m2n8(0,4,5) \
  "movq %2,%3; addq $8,%2;" save_c_m2n4(4) save_c_m2n4(5)

#define SOLVE_LT_m2n12 \
  "movq %2,%3;" \
  GEMM_SUM_REORDER_2x4(4,5,4) GEMM_SUM_REORDER_2x4(6,7,5) GEMM_SUM_REORDER_2x4(8,9,6) \
  SOLVE_uplo_m2n12(0,4,5,6) \
  SOLVE_lo_m2n12(8,4,5,6) SAVE_b_m2n12(0,4,5,6) \
  "movq %2,%3; addq $8,%2;" save_c_m2n4(4) save_c_m2n4(5) save_c_m2n4(6)

/* ---- 4-row tiles: two stages, first register group first ---- */
#define SOLVE_LT_m4n4 \
  "movq %2,%3;" \
  GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) \
  \
  SOLVE_uplo_m2n4(0,4) SUBTRACT_m2n4(8,5) \
  SOLVE_lo_m2n4(16,4) SUBTRACT_m2n4(24,5) SAVE_b_m2n4(0,4) \
  \
  SOLVE_uplo_m2n4(40,5) \
  SOLVE_lo_m2n4(56,5) SAVE_b_m2n4(32,5) \
  \
  "movq %2,%3; addq $16,%2;" save_c_m4n4(4,5)

#define SOLVE_LT_m4n8 \
  "movq %2,%3;" \
  GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) GEMM_SUM_REORDER_4x4(8,9,10,11,6,7) \
  \
  SOLVE_uplo_m2n8(0,4,6) SUBTRACT_m2n8(8,5,7) \
  SOLVE_lo_m2n8(16,4,6) SUBTRACT_m2n8(24,5,7) SAVE_b_m2n8(0,4,6) \
  \
  SOLVE_uplo_m2n8(40,5,7) \
  SOLVE_lo_m2n8(56,5,7) SAVE_b_m2n8(32,5,7) \
  \
  "movq %2,%3; addq $16,%2;" save_c_m4n4(4,5) save_c_m4n4(6,7)

#define SOLVE_LT_m4n12 \
  "movq %2,%3;" \
  GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) GEMM_SUM_REORDER_4x4(8,9,10,11,6,7) \
  GEMM_SUM_REORDER_4x4(12,13,14,15,8,9) \
  \
  SOLVE_uplo_m2n12(0,4,6,8) SUBTRACT_m2n12(8,5,7,9) \
  SOLVE_lo_m2n12(16,4,6,8) SUBTRACT_m2n12(24,5,7,9) SAVE_b_m2n12(0,4,6,8) \
  \
  SOLVE_uplo_m2n12(40,5,7,9) \
  SOLVE_lo_m2n12(56,5,7,9) SAVE_b_m2n12(32,5,7,9) \
  \
  "movq %2,%3; addq $16,%2;" save_c_m4n4(4,5) save_c_m4n4(6,7) save_c_m4n4(8,9)

/* ---- 8-row tiles: four stages, lowest register group first ---- */
#define SOLVE_LT_m8n4 \
  "movq %2,%3;" \
  GEMM_SUM_REORDER_8x4(4,5,6,7,63) \
  \
  SOLVE_uplo_m2n4(0,4) SUBTRACT_m2n4(8,5) SUBTRACT_m2n4(16,6) SUBTRACT_m2n4(24,7) \
  SOLVE_lo_m2n4(32,4) SUBTRACT_m2n4(40,5) SUBTRACT_m2n4(48,6) SUBTRACT_m2n4(56,7) SAVE_b_m2n4(0,4) \
  \
  SOLVE_uplo_m2n4(72,5) SUBTRACT_m2n4(80,6) SUBTRACT_m2n4(88,7) \
  SOLVE_lo_m2n4(104,5) SUBTRACT_m2n4(112,6) SUBTRACT_m2n4(120,7) SAVE_b_m2n4(32,5) \
  \
  SOLVE_uplo_m2n4(144,6) SUBTRACT_m2n4(152,7) \
  SOLVE_lo_m2n4(176,6) SUBTRACT_m2n4(184,7) SAVE_b_m2n4(64,6) \
  \
  SOLVE_uplo_m2n4(216,7) \
  SOLVE_lo_m2n4(248,7) SAVE_b_m2n4(96,7) \
  \
  "movq %2,%3; addq $32,%2;" save_c_m8n4(4,5,6,7)

#define SOLVE_LT_m8n8 \
  "movq %2,%3;" \
  GEMM_SUM_REORDER_8x4(4,5,6,7,63) GEMM_SUM_REORDER_8x4(8,9,10,11,63) \
  \
  SOLVE_uplo_m2n8(0,4,8) SUBTRACT_m2n8(8,5,9) SUBTRACT_m2n8(16,6,10) SUBTRACT_m2n8(24,7,11) \
  SOLVE_lo_m2n8(32,4,8) SUBTRACT_m2n8(40,5,9) SUBTRACT_m2n8(48,6,10) SUBTRACT_m2n8(56,7,11) SAVE_b_m2n8(0,4,8) \
  \
  SOLVE_uplo_m2n8(72,5,9) SUBTRACT_m2n8(80,6,10) SUBTRACT_m2n8(88,7,11) \
  SOLVE_lo_m2n8(104,5,9) SUBTRACT_m2n8(112,6,10) SUBTRACT_m2n8(120,7,11) SAVE_b_m2n8(32,5,9) \
  \
  SOLVE_uplo_m2n8(144,6,10) SUBTRACT_m2n8(152,7,11) \
  SOLVE_lo_m2n8(176,6,10) SUBTRACT_m2n8(184,7,11) SAVE_b_m2n8(64,6,10) \
  \
  SOLVE_uplo_m2n8(216,7,11) \
  SOLVE_lo_m2n8(248,7,11) SAVE_b_m2n8(96,7,11) \
  \
  "movq %2,%3; addq $32,%2;" save_c_m8n4(4,5,6,7) save_c_m8n4(8,9,10,11)

#define SOLVE_LT_m8n12 \
  "movq %2,%3;" \
  GEMM_SUM_REORDER_8x4(4,5,6,7,63) GEMM_SUM_REORDER_8x4(8,9,10,11,63) \
  GEMM_SUM_REORDER_8x4(12,13,14,15,63) \
  \
  SOLVE_uplo_m2n12(0,4,8,12) SUBTRACT_m2n12(8,5,9,13) SUBTRACT_m2n12(16,6,10,14) SUBTRACT_m2n12(24,7,11,15) \
  SOLVE_lo_m2n12(32,4,8,12) SUBTRACT_m2n12(40,5,9,13) SUBTRACT_m2n12(48,6,10,14) SUBTRACT_m2n12(56,7,11,15) SAVE_b_m2n12(0,4,8,12) \
  \
  SOLVE_uplo_m2n12(72,5,9,13) SUBTRACT_m2n12(80,6,10,14) SUBTRACT_m2n12(88,7,11,15) \
  SOLVE_lo_m2n12(104,5,9,13) SUBTRACT_m2n12(112,6,10,14) SUBTRACT_m2n12(120,7,11,15) SAVE_b_m2n12(32,5,9,13) \
  \
  SOLVE_uplo_m2n12(144,6,10,14) SUBTRACT_m2n12(152,7,11,15) \
  SOLVE_lo_m2n12(176,6,10,14) SUBTRACT_m2n12(184,7,11,15) SAVE_b_m2n12(64,6,10,14) \
  \
  SOLVE_uplo_m2n12(216,7,11,15) \
  SOLVE_lo_m2n12(248,7,11,15) SAVE_b_m2n12(96,7,11,15) \
  \
  "movq %2,%3; addq $32,%2;" save_c_m8n4(4,5,6,7) save_c_m8n4(8,9,10,11) save_c_m8n4(12,13,14,15)
|
||||||
|
|
||||||
|
/*
 * GEMM_LT_SIMPLE(mdim,ndim): GEMM update for one mdim x ndim tile, walking
 * the packed A and B panels forwards.
 *   - %0 takes the current r15, then r15 is advanced by r12*mdim bytes;
 *   - the loop counter %5 is loaded from r13, and r13 grows by mdim for the
 *     next tile;
 *   - %1 restarts from r14 and the accumulators are cleared (INIT_*);
 *   - each iteration runs one k-step (GEMM_KERNEL_k1*), then advances %1 by
 *     16 bytes and %0 by mdim*4 bytes; a zero count skips the loop.
 * Local labels are "1<mdim><ndim>1" (loop head) and "1<mdim><ndim>2" (exit).
 */
#define GEMM_LT_SIMPLE(mdim,ndim) \
  "movq %%r15,%0; leaq (%%r15,%%r12,"#mdim"),%%r15; movq %%r13,%5; addq $"#mdim",%%r13; movq %%r14,%1;" INIT_m##mdim##n##ndim \
  "testq %5,%5; jz 1"#mdim""#ndim"2f;" \
  "1"#mdim""#ndim"1:\n\t" \
  GEMM_KERNEL_k1m##mdim##n##ndim "addq $16,%1; addq $"#mdim"*4,%0; decq %5; jnz 1"#mdim""#ndim"1b;" \
  "1"#mdim""#ndim"2:\n\t"

#define GEMM_LT_m8n4 GEMM_LT_SIMPLE(8,4)
#define GEMM_LT_m8n8 GEMM_LT_SIMPLE(8,8)

/*
 * 8x12 tile: same setup as GEMM_LT_SIMPLE(8,12), but the k loop is unrolled
 * eight times (label 18121) with a prefetch of A on every other step, and is
 * followed by a one-step remainder loop (label 18123).
 */
#define GEMM_LT_m8n12 \
  "movq %%r15,%0; leaq (%%r15,%%r12,8),%%r15; movq %%r13,%5; addq $8,%%r13; movq %%r14,%1;" INIT_m8n12 \
  "cmpq $8,%5; jb 18122f;" \
  "18121:\n\t" \
  GEMM_KERNEL_k1m8n12 "prefetcht0 384(%0); addq $32,%0; addq $16,%1;" \
  GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1;" \
  GEMM_KERNEL_k1m8n12 "prefetcht0 384(%0); addq $32,%0; addq $16,%1;" \
  GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1;" \
  GEMM_KERNEL_k1m8n12 "prefetcht0 384(%0); addq $32,%0; addq $16,%1;" \
  GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1;" \
  GEMM_KERNEL_k1m8n12 "prefetcht0 384(%0); addq $32,%0; addq $16,%1;" \
  GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1;" \
  "subq $8,%5; cmpq $8,%5; jnb 18121b;" \
  "18122:\n\t" \
  "testq %5,%5; jz 18124f;" \
  "18123:\n\t" \
  GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1; decq %5; jnz 18123b;" \
  "18124:\n\t"

/* Remaining tile shapes all use the simple loop. */
#define GEMM_LT_m4n4  GEMM_LT_SIMPLE(4,4)
#define GEMM_LT_m4n8  GEMM_LT_SIMPLE(4,8)
#define GEMM_LT_m4n12 GEMM_LT_SIMPLE(4,12)
#define GEMM_LT_m2n4  GEMM_LT_SIMPLE(2,4)
#define GEMM_LT_m2n8  GEMM_LT_SIMPLE(2,8)
#define GEMM_LT_m2n12 GEMM_LT_SIMPLE(2,12)
#define GEMM_LT_m1n4  GEMM_LT_SIMPLE(1,4)
#define GEMM_LT_m1n8  GEMM_LT_SIMPLE(1,8)
#define GEMM_LT_m1n12 GEMM_LT_SIMPLE(1,12)
|
||||||
|
|
||||||
|
/*
 * COMPUTE(ndim): process one panel of ndim columns (ndim = 4, 8 or 12) for
 * every row of the LT problem in a single asm statement.
 * It expects these locals in the expanding function: a_ptr, b_ptr, c_ptr,
 * c_tmp, ldc_bytes, k_cnt, K, OFF, one, zero, M and ldc.
 * Rows are consumed from the start of the panel: 8-row tiles while at least
 * eight rows remain in r11, then a 4-, 2- and 1-row tile when the matching
 * bit of the remainder is set.
 */
#define COMPUTE(ndim) {\
  __asm__ __volatile__(\
    /* r15 = a_ptr, r14 = b_ptr, r13 = %7, r12 = K*4, r11 = M */ \
    "movq %0,%%r15; movq %1,%%r14; movq %7,%%r13; movq %6,%%r12; salq $2,%%r12; movq %10,%%r11;"\
    "cmpq $8,%%r11; jb "#ndim"772f;"\
    #ndim"771:\n\t"\
    GEMM_LT_m8n##ndim SOLVE_LT_m8n##ndim "subq $8,%%r11; cmpq $8,%%r11; jnb "#ndim"771b;"\
    #ndim"772:\n\t"\
    "testq $4,%%r11; jz "#ndim"773f;"\
    GEMM_LT_m4n##ndim SOLVE_LT_m4n##ndim "subq $4,%%r11;"\
    #ndim"773:\n\t"\
    "testq $2,%%r11; jz "#ndim"774f;"\
    GEMM_LT_m2n##ndim SOLVE_LT_m2n##ndim "subq $2,%%r11;"\
    #ndim"774:\n\t"\
    "testq $1,%%r11; jz "#ndim"775f;"\
    GEMM_LT_m1n##ndim SOLVE_LT_m1n##ndim "subq $1,%%r11;"\
    #ndim"775:\n\t"\
    /* hand the final A and B positions back to C and clear upper ymm state */ \
    "movq %%r15,%0; movq %%r14,%1; vzeroupper;"\
    :"+r"(a_ptr),"+r"(b_ptr),"+r"(c_ptr),"+r"(c_tmp),"+r"(ldc_bytes),"+r"(k_cnt):"m"(K),"m"(OFF),"m"(one[0]),"m"(zero[0]),"m"(M)\
    :"r11","r12","r13","r14","r15","cc","memory",\
    "xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15");\
  /* re-arm the pointers for the next column panel */ \
  a_ptr -= M * K; b_ptr += ndim * K; c_ptr += ldc * ndim - M;\
}
|
||||||
|
/*
 * Scalar triangular solve for one m x n tile of the LT kernel, used by the
 * edge path.
 *
 *   a   : m x m block, read as a[row*m + col]; the diagonal entry is used as
 *         a multiplier
 *         (NOTE(review): presumably a pre-inverted diagonal, as in the LN
 *         variant -- confirm against the packing routine)
 *   b   : m x n output block, written as b[row*n + col]
 *   c   : tile of C, column-major with leading dimension ldc; updated in place
 *
 * Rows are solved from the first to the last; each solved value is then
 * eliminated from the rows below it in the same column.
 */
static void solve_LT(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
  int row, col, below;

  for (row = 0; row < m; row++) {
    FLOAT *a_row = a + row * m;
    FLOAT *b_row = b + row * n;
    FLOAT diag = a_row[row];

    for (col = 0; col < n; col++) {
      FLOAT *c_col = c + col * ldc;
      FLOAT solved = c_col[row] * diag;

      c_col[row] = solved;
      b_row[col] = solved;

      for (below = row + 1; below < m; below++) c_col[below] -= solved * a_row[below];
    }
  }
}
|
||||||
|
/*
 * Edge path of the LT kernel for a narrow column chunk (the caller passes
 * n = 2 or n = 1).  Walks the m rows from the start of the panel: as many
 * 8-row tiles as fit, then at most one tile each of 4, 2 and 1 rows.
 * Every tile is first updated with GEMM_KERNEL_N over the kk leading steps
 * (when kk > 0) and then solved with solve_LT.
 */
static void COMPUTE_EDGE_1_nchunk(BLASLONG m, BLASLONG n, FLOAT *sa, FLOAT *sb, FLOAT *C, BLASLONG ldc, BLASLONG k, BLASLONG offset) {
  BLASLONG rows_left = m;
  BLASLONG kk = offset;
  BLASLONG width;
  FLOAT *a_ptr = sa;
  FLOAT *c_ptr = C;

  /* Tile widths 8, 4, 2, 1; only the 8-wide tile can repeat. */
  for (width = 8; width > 0; width >>= 1) {
    while (rows_left >= width) {
      if (kk > 0) GEMM_KERNEL_N(width, n, kk, -1.0, a_ptr, sb, c_ptr, ldc);
      solve_LT(width, n, a_ptr + kk * width, sb + kk * n, c_ptr, ldc);
      kk += width;
      a_ptr += k * width;
      c_ptr += width;
      rows_left -= width;
    }
  }
}
|
||||||
|
/*
 * LT kernel entry point.  Column panels of 12, 8 and 4 go through the
 * inline-asm COMPUTE() path; the remaining 2- and 1-column chunks use the
 * C edge routine.  dummy1 is not used.  Always returns 0.
 *
 * The local names a_ptr, b_ptr, c_ptr, c_tmp, ldc_bytes, k_cnt, K, M, OFF,
 * one and zero are referenced by COMPUTE() and must keep these exact
 * spellings.
 */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *sa, FLOAT *sb, FLOAT *C, BLASLONG ldc, BLASLONG offset){
  float *a_ptr = sa;
  float *b_ptr = sb;
  float *c_ptr = C;
  float *c_tmp = C;
  float one[8] = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0};
  float zero[8] = {0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0};
  uint64_t ldc_bytes = (uint64_t)ldc * sizeof(float);
  uint64_t K = (uint64_t)k;
  uint64_t M = (uint64_t)m;
  uint64_t OFF = (uint64_t)offset;
  uint64_t k_cnt = 0;
  BLASLONG n_count = n;

  while (n_count > 11) { COMPUTE(12) n_count -= 12; }
  while (n_count > 7)  { COMPUTE(8)  n_count -= 8; }
  while (n_count > 3)  { COMPUTE(4)  n_count -= 4; }

  while (n_count > 1) {
    COMPUTE_EDGE_1_nchunk(m,2,a_ptr,b_ptr,c_ptr,ldc,k,offset);
    b_ptr += 2*k;
    c_ptr += ldc*2;
    n_count -= 2;
  }
  if (n_count > 0) COMPUTE_EDGE_1_nchunk(m,1,a_ptr,b_ptr,c_ptr,ldc,k,offset);

  return 0;
}
|
||||||
|
|
|
@ -0,0 +1,187 @@
|
||||||
|
/* r11 = m_counter, r12 = size_of_k_elements, r13 = kk (LT) or k-kk (LN), r14 = b_head (LT) or b_tail (LN), r15 = a_head (LT) or a_tail (LN) */
|
||||||
|
/* register i/o: %0 = a_ptr, %1 = b_ptr, %2 = c_ptr, %3 = c_tmp, %4 = ldc, %5 = k_counter */
|
||||||
|
/* memory input: %6 = K, %7 = offset (LT) or k-m-offset (LN), %8 = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0}, %9 = {0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0}, %10 = M */
|
||||||
|
|
||||||
|
/*
 * Accumulator clearing.  The lower-case helpers zero the listed vector
 * registers with vpxor; INIT_m<M>n<N> chains them so that every accumulator
 * used by the matching GEMM_KERNEL_k1m<M>n<N> starts at zero.
 */

/* 8-row tiles: four ymm accumulators per group of 4 columns (ymm4..ymm15). */
#define init_m8n4(c1,c2,c3,c4)\
  "vpxor %%ymm"#c1",%%ymm"#c1",%%ymm"#c1"; vpxor %%ymm"#c2",%%ymm"#c2",%%ymm"#c2"; vpxor %%ymm"#c3",%%ymm"#c3",%%ymm"#c3"; vpxor %%ymm"#c4",%%ymm"#c4",%%ymm"#c4";"
#define INIT_m8n4  init_m8n4(4,5,6,7)
#define INIT_m8n8  INIT_m8n4 init_m8n4(8,9,10,11)
#define INIT_m8n12 INIT_m8n8 init_m8n4(12,13,14,15)

/* 4-row tiles: four xmm accumulators per group of 4 columns (xmm4..xmm15). */
#define init_m4n4(c1,c2,c3,c4)\
  "vpxor %%xmm"#c1",%%xmm"#c1",%%xmm"#c1"; vpxor %%xmm"#c2",%%xmm"#c2",%%xmm"#c2"; vpxor %%xmm"#c3",%%xmm"#c3",%%xmm"#c3"; vpxor %%xmm"#c4",%%xmm"#c4",%%xmm"#c4";"
#define INIT_m4n4  init_m4n4(4,5,6,7)
#define INIT_m4n8  INIT_m4n4 init_m4n4(8,9,10,11)
#define INIT_m4n12 INIT_m4n8 init_m4n4(12,13,14,15)

/* 2-row tiles: two xmm accumulators per group of 4 columns (xmm4..xmm9). */
#define init_m2n4(c1,c2)\
  "vpxor %%xmm"#c1",%%xmm"#c1",%%xmm"#c1"; vpxor %%xmm"#c2",%%xmm"#c2",%%xmm"#c2";"
#define INIT_m2n4  init_m2n4(4,5)
#define INIT_m2n8  INIT_m2n4 init_m2n4(6,7)
#define INIT_m2n12 INIT_m2n8 init_m2n4(8,9)

/* 1-row tiles: one xmm accumulator per group of 4 columns (xmm4..xmm6). */
#define init_m1n4(c1) "vpxor %%xmm"#c1",%%xmm"#c1",%%xmm"#c1";"
#define INIT_m1n4  init_m1n4(4)
#define INIT_m1n8  INIT_m1n4 init_m1n4(5)
#define INIT_m1n12 INIT_m1n8 init_m1n4(6)
|
||||||
|
|
||||||
|
/*
 * GEMM_KERNEL_k1m<M>n<N>: one k-step of the rank-1 update for an M x N tile.
 * A is read at %0 and B at %1; further groups of four B columns are read at
 * %1 + r12*4 and %1 + r12*8.  All updates use vfnmadd231ps, so the product
 * is subtracted from the accumulator.  Neither %0 nor %1 is advanced here;
 * the calling loop does that.
 * Scratch registers: ymm/xmm1..3.
 */

/* 8 rows: even/odd-duplicated A (vmovsldup/vmovshdup) times broadcast B pairs. */
#define GEMM_KERNEL_k1m8n4 \
  "vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2;"\
  "vbroadcastsd (%1),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm4; vfnmadd231ps %%ymm3,%%ymm2,%%ymm5;"\
  "vbroadcastsd 8(%1),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm6; vfnmadd231ps %%ymm3,%%ymm2,%%ymm7;"
#define GEMM_KERNEL_k1m8n8 GEMM_KERNEL_k1m8n4\
  "vbroadcastsd (%1,%%r12,4),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm8; vfnmadd231ps %%ymm3,%%ymm2,%%ymm9;"\
  "vbroadcastsd 8(%1,%%r12,4),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm10; vfnmadd231ps %%ymm3,%%ymm2,%%ymm11;"
#define GEMM_KERNEL_k1m8n12 GEMM_KERNEL_k1m8n8\
  "vbroadcastsd (%1,%%r12,8),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm12; vfnmadd231ps %%ymm3,%%ymm2,%%ymm13;"\
  "vbroadcastsd 8(%1,%%r12,8),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm14; vfnmadd231ps %%ymm3,%%ymm2,%%ymm15;"

/* 4 rows: same scheme on xmm registers, with vmovddup for the B pairs. */
#define GEMM_KERNEL_k1m4n4 \
  "vmovsldup (%0),%%xmm1; vmovshdup (%0),%%xmm2;"\
  "vmovddup (%1),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm4; vfnmadd231ps %%xmm3,%%xmm2,%%xmm5;"\
  "vmovddup 8(%1),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm6; vfnmadd231ps %%xmm3,%%xmm2,%%xmm7;"
#define GEMM_KERNEL_k1m4n8 GEMM_KERNEL_k1m4n4\
  "vmovddup (%1,%%r12,4),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm8; vfnmadd231ps %%xmm3,%%xmm2,%%xmm9;"\
  "vmovddup 8(%1,%%r12,4),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm10; vfnmadd231ps %%xmm3,%%xmm2,%%xmm11;"
#define GEMM_KERNEL_k1m4n12 GEMM_KERNEL_k1m4n8\
  "vmovddup (%1,%%r12,8),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm12; vfnmadd231ps %%xmm3,%%xmm2,%%xmm13;"\
  "vmovddup 8(%1,%%r12,8),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm14; vfnmadd231ps %%xmm3,%%xmm2,%%xmm15;"

/* 2 rows: each A element is broadcast and multiplied by four B values. */
#define GEMM_KERNEL_k1m2n4 \
  "vbroadcastss (%0),%%xmm1; vbroadcastss 4(%0),%%xmm2;"\
  "vmovups (%1),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm4; vfnmadd231ps %%xmm3,%%xmm2,%%xmm5;"
#define GEMM_KERNEL_k1m2n8 GEMM_KERNEL_k1m2n4\
  "vmovups (%1,%%r12,4),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm6; vfnmadd231ps %%xmm3,%%xmm2,%%xmm7;"
#define GEMM_KERNEL_k1m2n12 GEMM_KERNEL_k1m2n8\
  "vmovups (%1,%%r12,8),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm8; vfnmadd231ps %%xmm3,%%xmm2,%%xmm9;"

/* 1 row: one broadcast A element, B used directly as a memory operand. */
#define GEMM_KERNEL_k1m1n4 "vbroadcastss (%0),%%xmm1; vfnmadd231ps (%1),%%xmm1,%%xmm4;"
#define GEMM_KERNEL_k1m1n8 GEMM_KERNEL_k1m1n4 "vfnmadd231ps (%1,%%r12,4),%%xmm1,%%xmm5;"
#define GEMM_KERNEL_k1m1n12 GEMM_KERNEL_k1m1n8 "vfnmadd231ps (%1,%%r12,8),%%xmm1,%%xmm6;"
|
||||||
|
|
||||||
|
/*
 * GEMM_SUM_REORDER_<M>x4: add four columns of C to the accumulators of an
 * M x 4 tile and regroup the sums for the solve macros.
 * C is read through %3 with column stride %4; %3 is advanced by two columns
 * twice, so it ends four columns further on.  ymm/xmm0..3 are scratch.
 */

/*
 * 8 rows.  c1..c4 are the accumulators on entry and receive the regrouped
 * sums on exit.  prefpos is the byte offset used for the prefetcht1 hints on
 * each C column.
 */
#define GEMM_SUM_REORDER_8x4(c1,c2,c3,c4,prefpos)\
  "vunpcklps %%ymm"#c2",%%ymm"#c1",%%ymm0; vunpckhps %%ymm"#c2",%%ymm"#c1",%%ymm1;"\
  "vunpcklps %%ymm"#c4",%%ymm"#c3",%%ymm2; vunpckhps %%ymm"#c4",%%ymm"#c3",%%ymm3;"\
  "vmovups (%3),%%ymm"#c1"; vmovups (%3,%4,1),%%ymm"#c2"; prefetcht1 "#prefpos"(%3); prefetcht1 "#prefpos"(%3,%4,1);"\
  "vunpcklpd %%ymm"#c2",%%ymm"#c1",%%ymm"#c3"; vunpckhpd %%ymm"#c2",%%ymm"#c1",%%ymm"#c4";"\
  "vaddps %%ymm0,%%ymm"#c3",%%ymm0; vaddps %%ymm1,%%ymm"#c4",%%ymm1;"\
  "leaq (%3,%4,2),%3;"\
  "vmovups (%3),%%ymm"#c1"; vmovups (%3,%4,1),%%ymm"#c2"; prefetcht1 "#prefpos"(%3); prefetcht1 "#prefpos"(%3,%4,1);"\
  "vunpcklpd %%ymm"#c2",%%ymm"#c1",%%ymm"#c3"; vunpckhpd %%ymm"#c2",%%ymm"#c1",%%ymm"#c4";"\
  "vaddps %%ymm2,%%ymm"#c3",%%ymm2; vaddps %%ymm3,%%ymm"#c4",%%ymm3;"\
  "leaq (%3,%4,2),%3;"\
  "vperm2f128 $2,%%ymm0,%%ymm2,%%ymm"#c1"; vperm2f128 $2,%%ymm1,%%ymm3,%%ymm"#c2";"\
  "vperm2f128 $19,%%ymm0,%%ymm2,%%ymm"#c3"; vperm2f128 $19,%%ymm1,%%ymm3,%%ymm"#c4";"

/*
 * 4 rows.  c1..c4 are the xmm accumulators on entry; the sums are packed
 * into the two ymm registers co1 and co2.
 */
#define GEMM_SUM_REORDER_4x4(c1,c2,c3,c4,co1,co2)\
  "vunpcklps %%xmm"#c2",%%xmm"#c1",%%xmm0; vunpckhps %%xmm"#c2",%%xmm"#c1",%%xmm1;"\
  "vunpcklps %%xmm"#c4",%%xmm"#c3",%%xmm2; vunpckhps %%xmm"#c4",%%xmm"#c3",%%xmm3;"\
  "vmovups (%3),%%xmm"#c1"; vmovups (%3,%4,1),%%xmm"#c2";"\
  "vunpcklpd %%xmm"#c2",%%xmm"#c1",%%xmm"#c3"; vunpckhpd %%xmm"#c2",%%xmm"#c1",%%xmm"#c4";"\
  "vaddps %%xmm0,%%xmm"#c3",%%xmm0; vaddps %%xmm1,%%xmm"#c4",%%xmm1;"\
  "leaq (%3,%4,2),%3;"\
  "vmovups (%3),%%xmm"#c1"; vmovups (%3,%4,1),%%xmm"#c2";"\
  "vunpcklpd %%xmm"#c2",%%xmm"#c1",%%xmm"#c3"; vunpckhpd %%xmm"#c2",%%xmm"#c1",%%xmm"#c4";"\
  "vaddps %%xmm2,%%xmm"#c3",%%xmm2; vaddps %%xmm3,%%xmm"#c4",%%xmm3;"\
  "leaq (%3,%4,2),%3;"\
  "vperm2f128 $2,%%ymm0,%%ymm2,%%ymm"#co1"; vperm2f128 $2,%%ymm1,%%ymm3,%%ymm"#co2";"

/*
 * 2 rows.  c1 and c2 are the xmm accumulators on entry; the sums are packed
 * into the single ymm register co1.
 */
#define GEMM_SUM_REORDER_2x4(c1,c2,co1)\
  "vunpcklps %%xmm"#c2",%%xmm"#c1",%%xmm0; vunpckhps %%xmm"#c2",%%xmm"#c1",%%xmm1;"\
  "vmovsd (%3),%%xmm2; vmovhpd (%3,%4,1),%%xmm2,%%xmm2; vaddps %%xmm0,%%xmm2,%%xmm0; leaq (%3,%4,2),%3;"\
  "vmovsd (%3),%%xmm2; vmovhpd (%3,%4,1),%%xmm2,%%xmm2; vaddps %%xmm1,%%xmm2,%%xmm1; leaq (%3,%4,2),%3;"\
  "vperm2f128 $2,%%ymm0,%%ymm1,%%ymm"#co1";"

/*
 * 1 row.  Gathers one float from each of four C columns into xmm1 and adds
 * it to the accumulator c1 in place.
 */
#define GEMM_SUM_REORDER_1x4(c1)\
  "vmovss (%3),%%xmm1; vinsertps $16,(%3,%4,1),%%xmm1,%%xmm1; leaq (%3,%4,2),%3;"\
  "vinsertps $32,(%3),%%xmm1,%%xmm1; vinsertps $48,(%3,%4,1),%%xmm1,%%xmm1; leaq (%3,%4,2),%3;"\
  "vaddps %%xmm"#c1",%%xmm1,%%xmm"#c1";"
|
||||||
|
|
||||||
|
#define save_c_m8n4(c1,c2,c3,c4)\
|
||||||
|
"vunpcklpd %%ymm"#c2",%%ymm"#c1",%%ymm0; vunpckhpd %%ymm"#c2",%%ymm"#c1",%%ymm1;"\
|
||||||
|
"vunpcklpd %%ymm"#c4",%%ymm"#c3",%%ymm2; vunpckhpd %%ymm"#c4",%%ymm"#c3",%%ymm3;"\
|
||||||
|
"vperm2f128 $2,%%ymm0,%%ymm2,%%ymm"#c1"; vperm2f128 $2,%%ymm1,%%ymm3,%%ymm"#c2";"\
|
||||||
|
"vmovups %%ymm"#c1",(%3); vmovups %%ymm"#c2",(%3,%4,1); leaq (%3,%4,2),%3;"\
|
||||||
|
"vperm2f128 $19,%%ymm0,%%ymm2,%%ymm"#c3"; vperm2f128 $19,%%ymm1,%%ymm3,%%ymm"#c4";"\
|
||||||
|
"vmovups %%ymm"#c3",(%3); vmovups %%ymm"#c4",(%3,%4,1); leaq (%3,%4,2),%3;"
|
||||||
|
|
||||||
|
#define save_c_m4n4(c1,c2)\
|
||||||
|
"vunpcklpd %%ymm"#c2",%%ymm"#c1",%%ymm0; vunpckhpd %%ymm"#c2",%%ymm"#c1",%%ymm1;"\
|
||||||
|
"vmovups %%xmm0,(%3); vmovups %%xmm1,(%3,%4,1); leaq (%3,%4,2),%3;"\
|
||||||
|
"vextractf128 $1,%%ymm0,(%3); vextractf128 $1,%%ymm1,(%3,%4,1); leaq (%3,%4,2),%3;"
|
||||||
|
|
||||||
|
/* save_c_m2n4(c1): writes ymm<c1> as four 8-byte pieces to (%3), (%3,%4,1) and, after
 * %3 += 2*%4, to the following two addresses (the upper 128 bits go through xmm1).
 * Clobbers xmm1; %3 ends up advanced by 4*%4. */
#define save_c_m2n4(c1)\
|
||||||
|
"vextractf128 $1,%%ymm"#c1",%%xmm1; vmovsd %%xmm"#c1",(%3); vmovhpd %%xmm"#c1",(%3,%4,1); leaq (%3,%4,2),%3;"\
|
||||||
|
"vmovsd %%xmm1,(%3); vmovhpd %%xmm1,(%3,%4,1); leaq (%3,%4,2),%3;"
|
||||||
|
|
||||||
|
/* save_c_m1n4(c1): scatters the four floats of xmm<c1> to (%3), (%3,%4,1) and, after
 * %3 += 2*%4, to the next two addresses; %3 ends up advanced by 4*%4. */
#define save_c_m1n4(c1)\
|
||||||
|
"vmovss %%xmm"#c1",(%3); vextractps $1,%%xmm"#c1",(%3,%4,1); leaq (%3,%4,2),%3;"\
|
||||||
|
"vextractps $2,%%xmm"#c1",(%3); vextractps $3,%%xmm"#c1",(%3,%4,1); leaq (%3,%4,2),%3;"
|
||||||
|
|
||||||
|
/* SOLVE_up_m2n4(a_off,c1): broadcasts the 8 bytes at a_off(%0) into ymm0, builds ymm2 with
 * the even float lanes from ymm0 and the odd float lanes from memory operand %8, scales
 * ymm<c1> by ymm2, then copies the even lanes of ymm<c1> (duplicated) into ymm1.
 * Clobbers ymm0-ymm2.
 * NOTE(review): %8 is presumably a vector of 1.0f so odd lanes stay unchanged - confirm at the call site. */
#define SOLVE_up_m2n4(a_off,c1)\
|
||||||
|
"vbroadcastsd "#a_off"(%0),%%ymm0; vblendps $170,%8,%%ymm0,%%ymm2;"\
|
||||||
|
"vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1";"\
|
||||||
|
"vmovsldup %%ymm"#c1",%%ymm1;"
|
||||||
|
|
||||||
|
/* SOLVE_up_m2n8(a_off,c1,c2): two-accumulator form of the "up" step. ymm2 gets even lanes
 * from the broadcast of a_off(%0) and odd lanes from %8; ymm<c1> and ymm<c2> are scaled by it;
 * the even lanes of the results are duplicated into ymm1 and ymm2 (ymm2 is reused, so the
 * scale vector is gone afterwards). Clobbers ymm0-ymm2. */
#define SOLVE_up_m2n8(a_off,c1,c2)\
|
||||||
|
"vbroadcastsd "#a_off"(%0),%%ymm0; vblendps $170,%8,%%ymm0,%%ymm2;"\
|
||||||
|
"vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1"; vmulps %%ymm2,%%ymm"#c2",%%ymm"#c2";"\
|
||||||
|
"vmovsldup %%ymm"#c1",%%ymm1; vmovsldup %%ymm"#c2",%%ymm2;"
|
||||||
|
|
||||||
|
/* SOLVE_up_m2n12(a_off,c1,c2,c3): three-accumulator form of the "up" step. Scales
 * ymm<c1..c3> by a vector whose even lanes come from the broadcast of a_off(%0) and whose
 * odd lanes come from %8, then duplicates the even lanes of the three results into
 * ymm1, ymm2 and ymm3. Clobbers ymm0-ymm3. */
#define SOLVE_up_m2n12(a_off,c1,c2,c3)\
|
||||||
|
"vbroadcastsd "#a_off"(%0),%%ymm0; vblendps $170,%8,%%ymm0,%%ymm2;"\
|
||||||
|
"vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1"; vmulps %%ymm2,%%ymm"#c2",%%ymm"#c2"; vmulps %%ymm2,%%ymm"#c3",%%ymm"#c3";"\
|
||||||
|
"vmovsldup %%ymm"#c1",%%ymm1; vmovsldup %%ymm"#c2",%%ymm2; vmovsldup %%ymm"#c3",%%ymm3;"
|
||||||
|
|
||||||
|
/* SOLVE_uplo_m2n4(a_off,c1): runs SOLVE_up_m2n4, then replaces the even lanes of ymm0 with
 * memory operand %9 (odd lanes keep the broadcast value) and performs
 * ymm<c1> -= ymm0 * ymm1.
 * NOTE(review): %9 is presumably a zero vector so only odd lanes are updated - confirm at the call site. */
#define SOLVE_uplo_m2n4(a_off,c1) SOLVE_up_m2n4(a_off,c1)\
|
||||||
|
"vblendps $85,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1";"
|
||||||
|
|
||||||
|
/* SOLVE_uplo_m2n8(a_off,c1,c2): runs SOLVE_up_m2n8, then blends %9 into the even lanes of
 * ymm0 and performs ymm<c1> -= ymm0*ymm1 and ymm<c2> -= ymm0*ymm2. */
#define SOLVE_uplo_m2n8(a_off,c1,c2) SOLVE_up_m2n8(a_off,c1,c2)\
|
||||||
|
"vblendps $85,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1"; vfnmadd231ps %%ymm0,%%ymm2,%%ymm"#c2";"
|
||||||
|
|
||||||
|
/* SOLVE_uplo_m2n12(a_off,c1,c2,c3): runs SOLVE_up_m2n12, then blends %9 into the even lanes
 * of ymm0 and performs ymm<c1> -= ymm0*ymm1, ymm<c2> -= ymm0*ymm2, ymm<c3> -= ymm0*ymm3. */
#define SOLVE_uplo_m2n12(a_off,c1,c2,c3) SOLVE_up_m2n12(a_off,c1,c2,c3)\
|
||||||
|
"vblendps $85,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1"; vfnmadd231ps %%ymm0,%%ymm2,%%ymm"#c2"; vfnmadd231ps %%ymm0,%%ymm3,%%ymm"#c3";"
|
||||||
|
|
||||||
|
/* SOLVE_lo_m2n4(a_off,c1): mirror image of the "up" step. ymm2 takes its even float lanes
 * from memory operand %8 and its odd lanes from the broadcast of a_off(%0); ymm<c1> is scaled
 * by ymm2 and its odd lanes are then duplicated into ymm1. Clobbers ymm0-ymm2. */
#define SOLVE_lo_m2n4(a_off,c1)\
|
||||||
|
"vbroadcastsd "#a_off"(%0),%%ymm0; vblendps $85,%8,%%ymm0,%%ymm2;"\
|
||||||
|
"vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1";"\
|
||||||
|
"vmovshdup %%ymm"#c1",%%ymm1;"
|
||||||
|
|
||||||
|
/* SOLVE_lo_m2n8(a_off,c1,c2): two-accumulator "lo" step. Scales ymm<c1>/ymm<c2> by a vector
 * with even lanes from %8 and odd lanes from the broadcast of a_off(%0), then duplicates the
 * odd lanes of the results into ymm1 and ymm2. Clobbers ymm0-ymm2. */
#define SOLVE_lo_m2n8(a_off,c1,c2)\
|
||||||
|
"vbroadcastsd "#a_off"(%0),%%ymm0; vblendps $85,%8,%%ymm0,%%ymm2;"\
|
||||||
|
"vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1"; vmulps %%ymm2,%%ymm"#c2",%%ymm"#c2";"\
|
||||||
|
"vmovshdup %%ymm"#c1",%%ymm1; vmovshdup %%ymm"#c2",%%ymm2;"
|
||||||
|
|
||||||
|
/* SOLVE_lo_m2n12(a_off,c1,c2,c3): three-accumulator "lo" step. Scales ymm<c1..c3> by a vector
 * with even lanes from %8 and odd lanes from the broadcast of a_off(%0), then duplicates the
 * odd lanes of the results into ymm1, ymm2 and ymm3. Clobbers ymm0-ymm3. */
#define SOLVE_lo_m2n12(a_off,c1,c2,c3)\
|
||||||
|
"vbroadcastsd "#a_off"(%0),%%ymm0; vblendps $85,%8,%%ymm0,%%ymm2;"\
|
||||||
|
"vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1"; vmulps %%ymm2,%%ymm"#c2",%%ymm"#c2"; vmulps %%ymm2,%%ymm"#c3",%%ymm"#c3";"\
|
||||||
|
"vmovshdup %%ymm"#c1",%%ymm1; vmovshdup %%ymm"#c2",%%ymm2; vmovshdup %%ymm"#c3",%%ymm3;"
|
||||||
|
|
||||||
|
/* SOLVE_loup_m2n4(a_off,c1): runs SOLVE_lo_m2n4, then replaces the odd lanes of ymm0 with
 * memory operand %9 (even lanes keep the broadcast value) and performs ymm<c1> -= ymm0*ymm1. */
#define SOLVE_loup_m2n4(a_off,c1) SOLVE_lo_m2n4(a_off,c1)\
|
||||||
|
"vblendps $170,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1";"
|
||||||
|
|
||||||
|
/* SOLVE_loup_m2n8(a_off,c1,c2): runs SOLVE_lo_m2n8, then blends %9 into the odd lanes of
 * ymm0 and performs ymm<c1> -= ymm0*ymm1 and ymm<c2> -= ymm0*ymm2. */
#define SOLVE_loup_m2n8(a_off,c1,c2) SOLVE_lo_m2n8(a_off,c1,c2)\
|
||||||
|
"vblendps $170,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1"; vfnmadd231ps %%ymm0,%%ymm2,%%ymm"#c2";"
|
||||||
|
|
||||||
|
/* SOLVE_loup_m2n12(a_off,c1,c2,c3): runs SOLVE_lo_m2n12, then blends %9 into the odd lanes
 * of ymm0 and performs ymm<c1> -= ymm0*ymm1, ymm<c2> -= ymm0*ymm2, ymm<c3> -= ymm0*ymm3. */
#define SOLVE_loup_m2n12(a_off,c1,c2,c3) SOLVE_lo_m2n12(a_off,c1,c2,c3)\
|
||||||
|
"vblendps $170,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1"; vfnmadd231ps %%ymm0,%%ymm2,%%ymm"#c2"; vfnmadd231ps %%ymm0,%%ymm3,%%ymm"#c3";"
|
||||||
|
|
||||||
|
/* SOLVE_m1n4/8/12: broadcast the single float at a_off(%0) into xmm0 and multiply one, two
 * or three xmm accumulators by it. Each wider variant extends the narrower one, so xmm0 is
 * loaded only once. Clobbers xmm0. */
#define SOLVE_m1n4(a_off,c1) "vbroadcastss "#a_off"(%0),%%xmm0; vmulps %%xmm0,%%xmm"#c1",%%xmm"#c1";"
|
||||||
|
#define SOLVE_m1n8(a_off,c1,c2) SOLVE_m1n4(a_off,c1) "vmulps %%xmm0,%%xmm"#c2",%%xmm"#c2";"
|
||||||
|
#define SOLVE_m1n12(a_off,c1,c2,c3) SOLVE_m1n8(a_off,c1,c2) "vmulps %%xmm0,%%xmm"#c3",%%xmm"#c3";"
|
||||||
|
|
||||||
|
/* SUBTRACT_m2n4/8/12: broadcast the 8 bytes at a_off(%0) into ymm0 and update
 * ymm<c1> -= ymm0*ymm1 (plus ymm<c2> -= ymm0*ymm2 and ymm<c3> -= ymm0*ymm3 in the wider
 * variants). ymm1..ymm3 must already hold the multipliers; the macro only clobbers ymm0. */
#define SUBTRACT_m2n4(a_off,c1) "vbroadcastsd "#a_off"(%0),%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1";"
|
||||||
|
#define SUBTRACT_m2n8(a_off,c1,c2) SUBTRACT_m2n4(a_off,c1) "vfnmadd231ps %%ymm0,%%ymm2,%%ymm"#c2";"
|
||||||
|
#define SUBTRACT_m2n12(a_off,c1,c2,c3) SUBTRACT_m2n8(a_off,c1,c2) "vfnmadd231ps %%ymm0,%%ymm3,%%ymm"#c3";"
|
||||||
|
|
||||||
|
/* save_b_m2n4(c1,tmp,b_off,...): permutes ymm<c1> into ymm<tmp> (floats reordered 0,2,1,3
 * inside each 128-bit half, then 64-bit elements reordered 0,2,1,3 across the register) and
 * stores the 32-byte result at b_off(<address>), where <address> is the stringified variadic
 * argument list, e.g. %1 or %1,%%r12,4. ymm<c1> itself is left untouched. */
#define save_b_m2n4(c1,tmp,b_off,...)\
|
||||||
|
"vpermilps $216,%%ymm"#c1",%%ymm"#tmp"; vpermpd $216,%%ymm"#tmp",%%ymm"#tmp"; vmovups %%ymm"#tmp","#b_off"("#__VA_ARGS__");"
|
||||||
|
|
||||||
|
/* SAVE_b_m2n4/8/12: store one, two or three permuted accumulators through save_b_m2n4.
 * The first goes to b_off(%1) using ymm1 as scratch, the second to b_off(%1,%%r12,4) using
 * ymm2, the third to b_off(%1,%%r12,8) using ymm3. */
#define SAVE_b_m2n4(b_off,c1) save_b_m2n4(c1,1,b_off,%1)
|
||||||
|
#define SAVE_b_m2n8(b_off,c1,c2) SAVE_b_m2n4(b_off,c1) save_b_m2n4(c2,2,b_off,%1,%%r12,4)
|
||||||
|
#define SAVE_b_m2n12(b_off,c1,c2,c3) SAVE_b_m2n8(b_off,c1,c2) save_b_m2n4(c3,3,b_off,%1,%%r12,8)
|
||||||
|
|
||||||
|
/* SAVE_b_m1n4/8/12: store one, two or three xmm accumulators (16 bytes each, unaligned)
 * at b_off(%1), b_off(%1,%%r12,4) and b_off(%1,%%r12,8) respectively. */
#define SAVE_b_m1n4(b_off,c1) "vmovups %%xmm"#c1","#b_off"(%1);"
|
||||||
|
#define SAVE_b_m1n8(b_off,c1,c2) SAVE_b_m1n4(b_off,c1) "vmovups %%xmm"#c2","#b_off"(%1,%%r12,4);"
|
||||||
|
#define SAVE_b_m1n12(b_off,c1,c2,c3) SAVE_b_m1n8(b_off,c1,c2) "vmovups %%xmm"#c3","#b_off"(%1,%%r12,8);"
|
||||||
|
|
|
@ -0,0 +1,279 @@
|
||||||
|
#include "common.h"
|
||||||
|
#include <stdint.h>
|
||||||
|
#include "strsm_kernel_8x4_haswell_R_common.h"
|
||||||
|
|
||||||
|
/* SOLVE_RN_m8n4: solve stage for an 8-row x 4-column tile held in ymm4-ymm7.
 * Sets %3 = %2, runs GEMM_SUM_REORDER_8x4, resets %3 = %2 and advances %2 by 32 bytes
 * (8 floats). It then handles ymm4/ymm5 first (SOLVE_leri / SOLVE_ri, each followed by a
 * SUBTRACT_m8n2 update of ymm6/ymm7), saves them, and finishes with ymm6/ymm7.
 * The numeric first arguments are byte offsets relative to %1.
 * Helper macros come from strsm_kernel_8x4_haswell_R_common.h. */
#define SOLVE_RN_m8n4 \
|
||||||
|
"movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) "movq %2,%3; addq $32,%2;"\
|
||||||
|
SOLVE_leri_m8n2(0,4,5,%1) SUBTRACT_m8n2(8,6,7,%1)\
|
||||||
|
SOLVE_ri_m8n2(16,4,5,%1) SUBTRACT_m8n2(24,6,7,%1)\
|
||||||
|
SAVE_SOLUTION_m8n2(4,5,0)\
|
||||||
|
SOLVE_leri_m8n2(40,6,7,%1)\
|
||||||
|
SOLVE_ri_m8n2(56,6,7,%1)\
|
||||||
|
SAVE_SOLUTION_m8n2(6,7,64)
|
||||||
|
|
||||||
|
/* SOLVE_RN_m8n8: solve stage for an 8-row x 8-column tile held in ymm4-ymm11.
 * Same scheme as the 4-column variant, processed in register pairs ymm4/5, ymm6/7,
 * ymm8/9, ymm10/11: each pair is solved, every later pair is updated with SUBTRACT_m8n2,
 * and the pair is saved before moving on. Updates for ymm8-ymm11 and the solves of those
 * pairs read their coefficients through the (%1,%%r12,4) addressing form.
 * %2 is advanced by 32 bytes; %3 is left equal to the old %2 before the solves start. */
#define SOLVE_RN_m8n8 \
|
||||||
|
"movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) GEMM_SUM_REORDER_8x4(8,9,10,11,63) "movq %2,%3; addq $32,%2;"\
|
||||||
|
SOLVE_leri_m8n2(0,4,5,%1) SUBTRACT_m8n2(8,6,7,%1) SUBTRACT_m8n2(0,8,9,%1,%%r12,4) SUBTRACT_m8n2(8,10,11,%1,%%r12,4)\
|
||||||
|
SOLVE_ri_m8n2(16,4,5,%1) SUBTRACT_m8n2(24,6,7,%1) SUBTRACT_m8n2(16,8,9,%1,%%r12,4) SUBTRACT_m8n2(24,10,11,%1,%%r12,4)\
|
||||||
|
SAVE_SOLUTION_m8n2(4,5,0)\
|
||||||
|
SOLVE_leri_m8n2(40,6,7,%1) SUBTRACT_m8n2(32,8,9,%1,%%r12,4) SUBTRACT_m8n2(40,10,11,%1,%%r12,4)\
|
||||||
|
SOLVE_ri_m8n2(56,6,7,%1) SUBTRACT_m8n2(48,8,9,%1,%%r12,4) SUBTRACT_m8n2(56,10,11,%1,%%r12,4)\
|
||||||
|
SAVE_SOLUTION_m8n2(6,7,64)\
|
||||||
|
SOLVE_leri_m8n2(64,8,9,%1,%%r12,4) SUBTRACT_m8n2(72,10,11,%1,%%r12,4)\
|
||||||
|
SOLVE_ri_m8n2(80,8,9,%1,%%r12,4) SUBTRACT_m8n2(88,10,11,%1,%%r12,4)\
|
||||||
|
SAVE_SOLUTION_m8n2(8,9,128)\
|
||||||
|
SOLVE_leri_m8n2(104,10,11,%1,%%r12,4)\
|
||||||
|
SOLVE_ri_m8n2(120,10,11,%1,%%r12,4)\
|
||||||
|
SAVE_SOLUTION_m8n2(10,11,192)
|
||||||
|
|
||||||
|
/* SOLVE_RN_m8n12: solve stage for an 8-row x 12-column tile held in ymm4-ymm15.
 * Six register pairs (ymm4/5 ... ymm14/15) are processed in ascending order: solve the
 * pair (SOLVE_leri then SOLVE_ri), subtract its contribution from every later pair with
 * SUBTRACT_m8n2, then SAVE_SOLUTION_m8n2 it. Coefficients for ymm4-7 are addressed via %1,
 * for ymm8-11 via (%1,%%r12,4) and for ymm12-15 via (%1,%%r12,8).
 * %2 is advanced by 32 bytes; %3 is reset to the old %2 before the solves start. */
#define SOLVE_RN_m8n12 \
|
||||||
|
"movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) GEMM_SUM_REORDER_8x4(8,9,10,11,63) GEMM_SUM_REORDER_8x4(12,13,14,15,63) "movq %2,%3; addq $32,%2;"\
|
||||||
|
SOLVE_leri_m8n2(0,4,5,%1) SUBTRACT_m8n2(8,6,7,%1) SUBTRACT_m8n2(0,8,9,%1,%%r12,4) SUBTRACT_m8n2(8,10,11,%1,%%r12,4) SUBTRACT_m8n2(0,12,13,%1,%%r12,8) SUBTRACT_m8n2(8,14,15,%1,%%r12,8)\
|
||||||
|
SOLVE_ri_m8n2(16,4,5,%1) SUBTRACT_m8n2(24,6,7,%1) SUBTRACT_m8n2(16,8,9,%1,%%r12,4) SUBTRACT_m8n2(24,10,11,%1,%%r12,4) SUBTRACT_m8n2(16,12,13,%1,%%r12,8) SUBTRACT_m8n2(24,14,15,%1,%%r12,8)\
|
||||||
|
SAVE_SOLUTION_m8n2(4,5,0)\
|
||||||
|
SOLVE_leri_m8n2(40,6,7,%1) SUBTRACT_m8n2(32,8,9,%1,%%r12,4) SUBTRACT_m8n2(40,10,11,%1,%%r12,4) SUBTRACT_m8n2(32,12,13,%1,%%r12,8) SUBTRACT_m8n2(40,14,15,%1,%%r12,8)\
|
||||||
|
SOLVE_ri_m8n2(56,6,7,%1) SUBTRACT_m8n2(48,8,9,%1,%%r12,4) SUBTRACT_m8n2(56,10,11,%1,%%r12,4) SUBTRACT_m8n2(48,12,13,%1,%%r12,8) SUBTRACT_m8n2(56,14,15,%1,%%r12,8)\
|
||||||
|
SAVE_SOLUTION_m8n2(6,7,64)\
|
||||||
|
SOLVE_leri_m8n2(64,8,9,%1,%%r12,4) SUBTRACT_m8n2(72,10,11,%1,%%r12,4) SUBTRACT_m8n2(64,12,13,%1,%%r12,8) SUBTRACT_m8n2(72,14,15,%1,%%r12,8)\
|
||||||
|
SOLVE_ri_m8n2(80,8,9,%1,%%r12,4) SUBTRACT_m8n2(88,10,11,%1,%%r12,4) SUBTRACT_m8n2(80,12,13,%1,%%r12,8) SUBTRACT_m8n2(88,14,15,%1,%%r12,8)\
|
||||||
|
SAVE_SOLUTION_m8n2(8,9,128)\
|
||||||
|
SOLVE_leri_m8n2(104,10,11,%1,%%r12,4) SUBTRACT_m8n2(96,12,13,%1,%%r12,8) SUBTRACT_m8n2(104,14,15,%1,%%r12,8)\
|
||||||
|
SOLVE_ri_m8n2(120,10,11,%1,%%r12,4) SUBTRACT_m8n2(112,12,13,%1,%%r12,8) SUBTRACT_m8n2(120,14,15,%1,%%r12,8)\
|
||||||
|
SAVE_SOLUTION_m8n2(10,11,192)\
|
||||||
|
SOLVE_leri_m8n2(128,12,13,%1,%%r12,8) SUBTRACT_m8n2(136,14,15,%1,%%r12,8)\
|
||||||
|
SOLVE_ri_m8n2(144,12,13,%1,%%r12,8) SUBTRACT_m8n2(152,14,15,%1,%%r12,8)\
|
||||||
|
SAVE_SOLUTION_m8n2(12,13,256)\
|
||||||
|
SOLVE_leri_m8n2(168,14,15,%1,%%r12,8)\
|
||||||
|
SOLVE_ri_m8n2(184,14,15,%1,%%r12,8)\
|
||||||
|
SAVE_SOLUTION_m8n2(14,15,320)
|
||||||
|
|
||||||
|
/* SOLVE_RN_m4n4: solve stage for a 4-row x 4-column tile. GEMM_SUM_REORDER_4x4 consumes
 * registers 4-7 and leaves its two results in registers 4 and 5 (its last two arguments);
 * %3 is reset to %2 and %2 is advanced by 16 bytes (4 floats). Register 4 is solved and
 * saved first (updating register 5 via SUBTRACT_m4n2), then register 5. */
#define SOLVE_RN_m4n4 \
|
||||||
|
"movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) "movq %2,%3; addq $16,%2;"\
|
||||||
|
SOLVE_leri_m4n2(0,4,%1) SUBTRACT_m4n2(8,5,%1)\
|
||||||
|
SOLVE_ri_m4n2(16,4,%1) SUBTRACT_m4n2(24,5,%1)\
|
||||||
|
SAVE_SOLUTION_m4n2(4,0)\
|
||||||
|
SOLVE_leri_m4n2(40,5,%1)\
|
||||||
|
SOLVE_ri_m4n2(56,5,%1)\
|
||||||
|
SAVE_SOLUTION_m4n2(5,32)
|
||||||
|
|
||||||
|
/* SOLVE_RN_m4n8: solve stage for a 4-row x 8-column tile. Two GEMM_SUM_REORDER_4x4 calls
 * compact accumulators 4-11 into registers 4-7; %3 is reset to %2 and %2 advanced by
 * 16 bytes. Registers 4,5,6,7 are then solved in ascending order, each one updating all
 * later registers through SUBTRACT_m4n2 before being saved. Registers 6/7 take their
 * coefficients through the (%1,%%r12,4) addressing form. */
#define SOLVE_RN_m4n8 \
|
||||||
|
"movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) GEMM_SUM_REORDER_4x4(8,9,10,11,6,7) "movq %2,%3; addq $16,%2;"\
|
||||||
|
SOLVE_leri_m4n2(0,4,%1) SUBTRACT_m4n2(8,5,%1) SUBTRACT_m4n2(0,6,%1,%%r12,4) SUBTRACT_m4n2(8,7,%1,%%r12,4)\
|
||||||
|
SOLVE_ri_m4n2(16,4,%1) SUBTRACT_m4n2(24,5,%1) SUBTRACT_m4n2(16,6,%1,%%r12,4) SUBTRACT_m4n2(24,7,%1,%%r12,4)\
|
||||||
|
SAVE_SOLUTION_m4n2(4,0)\
|
||||||
|
SOLVE_leri_m4n2(40,5,%1) SUBTRACT_m4n2(32,6,%1,%%r12,4) SUBTRACT_m4n2(40,7,%1,%%r12,4)\
|
||||||
|
SOLVE_ri_m4n2(56,5,%1) SUBTRACT_m4n2(48,6,%1,%%r12,4) SUBTRACT_m4n2(56,7,%1,%%r12,4)\
|
||||||
|
SAVE_SOLUTION_m4n2(5,32)\
|
||||||
|
SOLVE_leri_m4n2(64,6,%1,%%r12,4) SUBTRACT_m4n2(72,7,%1,%%r12,4)\
|
||||||
|
SOLVE_ri_m4n2(80,6,%1,%%r12,4) SUBTRACT_m4n2(88,7,%1,%%r12,4)\
|
||||||
|
SAVE_SOLUTION_m4n2(6,64)\
|
||||||
|
SOLVE_leri_m4n2(104,7,%1,%%r12,4)\
|
||||||
|
SOLVE_ri_m4n2(120,7,%1,%%r12,4)\
|
||||||
|
SAVE_SOLUTION_m4n2(7,96)
|
||||||
|
|
||||||
|
/* SOLVE_RN_m4n12: solve stage for a 4-row x 12-column tile. Three GEMM_SUM_REORDER_4x4
 * calls compact accumulators 4-15 into registers 4-9; %3 is reset to %2 and %2 advanced by
 * 16 bytes. Registers 4..9 are solved in ascending order; after each SOLVE_leri/SOLVE_ri
 * step the later registers are updated with SUBTRACT_m4n2, and the solved register is saved.
 * Registers 6/7 use (%1,%%r12,4) and registers 8/9 use (%1,%%r12,8) for their coefficients. */
#define SOLVE_RN_m4n12 \
|
||||||
|
"movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) GEMM_SUM_REORDER_4x4(8,9,10,11,6,7) GEMM_SUM_REORDER_4x4(12,13,14,15,8,9) "movq %2,%3; addq $16,%2;"\
|
||||||
|
SOLVE_leri_m4n2(0,4,%1) SUBTRACT_m4n2(8,5,%1) SUBTRACT_m4n2(0,6,%1,%%r12,4) SUBTRACT_m4n2(8,7,%1,%%r12,4) SUBTRACT_m4n2(0,8,%1,%%r12,8) SUBTRACT_m4n2(8,9,%1,%%r12,8)\
|
||||||
|
SOLVE_ri_m4n2(16,4,%1) SUBTRACT_m4n2(24,5,%1) SUBTRACT_m4n2(16,6,%1,%%r12,4) SUBTRACT_m4n2(24,7,%1,%%r12,4) SUBTRACT_m4n2(16,8,%1,%%r12,8) SUBTRACT_m4n2(24,9,%1,%%r12,8)\
|
||||||
|
SAVE_SOLUTION_m4n2(4,0)\
|
||||||
|
SOLVE_leri_m4n2(40,5,%1) SUBTRACT_m4n2(32,6,%1,%%r12,4) SUBTRACT_m4n2(40,7,%1,%%r12,4) SUBTRACT_m4n2(32,8,%1,%%r12,8) SUBTRACT_m4n2(40,9,%1,%%r12,8)\
|
||||||
|
SOLVE_ri_m4n2(56,5,%1) SUBTRACT_m4n2(48,6,%1,%%r12,4) SUBTRACT_m4n2(56,7,%1,%%r12,4) SUBTRACT_m4n2(48,8,%1,%%r12,8) SUBTRACT_m4n2(56,9,%1,%%r12,8)\
|
||||||
|
SAVE_SOLUTION_m4n2(5,32)\
|
||||||
|
SOLVE_leri_m4n2(64,6,%1,%%r12,4) SUBTRACT_m4n2(72,7,%1,%%r12,4) SUBTRACT_m4n2(64,8,%1,%%r12,8) SUBTRACT_m4n2(72,9,%1,%%r12,8)\
|
||||||
|
SOLVE_ri_m4n2(80,6,%1,%%r12,4) SUBTRACT_m4n2(88,7,%1,%%r12,4) SUBTRACT_m4n2(80,8,%1,%%r12,8) SUBTRACT_m4n2(88,9,%1,%%r12,8)\
|
||||||
|
SAVE_SOLUTION_m4n2(6,64)\
|
||||||
|
SOLVE_leri_m4n2(104,7,%1,%%r12,4) SUBTRACT_m4n2(96,8,%1,%%r12,8) SUBTRACT_m4n2(104,9,%1,%%r12,8)\
|
||||||
|
SOLVE_ri_m4n2(120,7,%1,%%r12,4) SUBTRACT_m4n2(112,8,%1,%%r12,8) SUBTRACT_m4n2(120,9,%1,%%r12,8)\
|
||||||
|
SAVE_SOLUTION_m4n2(7,96)\
|
||||||
|
SOLVE_leri_m4n2(128,8,%1,%%r12,8) SUBTRACT_m4n2(136,9,%1,%%r12,8)\
|
||||||
|
SOLVE_ri_m4n2(144,8,%1,%%r12,8) SUBTRACT_m4n2(152,9,%1,%%r12,8)\
|
||||||
|
SAVE_SOLUTION_m4n2(8,128)\
|
||||||
|
SOLVE_leri_m4n2(168,9,%1,%%r12,8)\
|
||||||
|
SOLVE_ri_m4n2(184,9,%1,%%r12,8)\
|
||||||
|
SAVE_SOLUTION_m4n2(9,160)
|
||||||
|
|
||||||
|
/* SOLVE_RN_m2n4: solve stage for a 2-row x 4-column tile kept in registers 4 and 5.
 * After GEMM_SUM_REORDER_2x4, %3 is reset to %2 and %2 advanced by 8 bytes (2 floats).
 * The four columns are then solved left to right (SOLVE_col1..col4_ltor_m2n4, coefficient
 * offsets 0/16/32/48 bytes from %1) and the result is saved with SAVE_SOLUTION_m2n4. */
#define SOLVE_RN_m2n4 \
|
||||||
|
"movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) "movq %2,%3; addq $8,%2;"\
|
||||||
|
SOLVE_col1_ltor_m2n4(0,4,5,%1)\
|
||||||
|
SOLVE_col2_ltor_m2n4(16,4,5,%1)\
|
||||||
|
SOLVE_col3_ltor_m2n4(32,4,5,%1)\
|
||||||
|
SOLVE_col4_ltor_m2n4(48,4,5,%1)\
|
||||||
|
SAVE_SOLUTION_m2n4(4,5,0)
|
||||||
|
|
||||||
|
/* SOLVE_RN_m2n8: solve stage for a 2-row x 8-column tile (registers 4/5 and 6/7).
 * Each column step on registers 4/5 is followed by a SUBTRACT_m2n4 update of registers 6/7
 * using coefficients at (%1,%%r12,4); registers 4/5 are saved, then registers 6/7 are
 * solved column by column and saved. %2 is advanced by 8 bytes. */
#define SOLVE_RN_m2n8 \
|
||||||
|
"movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) GEMM_SUM_REORDER_2x4(6,7) "movq %2,%3; addq $8,%2;"\
|
||||||
|
SOLVE_col1_ltor_m2n4(0,4,5,%1) SUBTRACT_m2n4(0,6,7,%1,%%r12,4)\
|
||||||
|
SOLVE_col2_ltor_m2n4(16,4,5,%1) SUBTRACT_m2n4(16,6,7,%1,%%r12,4)\
|
||||||
|
SOLVE_col3_ltor_m2n4(32,4,5,%1) SUBTRACT_m2n4(32,6,7,%1,%%r12,4)\
|
||||||
|
SOLVE_col4_ltor_m2n4(48,4,5,%1) SUBTRACT_m2n4(48,6,7,%1,%%r12,4)\
|
||||||
|
SAVE_SOLUTION_m2n4(4,5,0)\
|
||||||
|
SOLVE_col1_ltor_m2n4(64,6,7,%1,%%r12,4)\
|
||||||
|
SOLVE_col2_ltor_m2n4(80,6,7,%1,%%r12,4)\
|
||||||
|
SOLVE_col3_ltor_m2n4(96,6,7,%1,%%r12,4)\
|
||||||
|
SOLVE_col4_ltor_m2n4(112,6,7,%1,%%r12,4)\
|
||||||
|
SAVE_SOLUTION_m2n4(6,7,32)
|
||||||
|
|
||||||
|
/* SOLVE_RN_m2n12: solve stage for a 2-row x 12-column tile (registers 4/5, 6/7, 8/9).
 * The three 4-column groups are solved in ascending order; every column step also updates
 * the groups not yet solved through SUBTRACT_m2n4 ((%1,%%r12,4) for registers 6/7,
 * (%1,%%r12,8) for registers 8/9). Each group is saved right after its fourth column.
 * %2 is advanced by 8 bytes. */
#define SOLVE_RN_m2n12 \
|
||||||
|
"movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) GEMM_SUM_REORDER_2x4(6,7) GEMM_SUM_REORDER_2x4(8,9) "movq %2,%3; addq $8,%2;"\
|
||||||
|
SOLVE_col1_ltor_m2n4(0,4,5,%1) SUBTRACT_m2n4(0,6,7,%1,%%r12,4) SUBTRACT_m2n4(0,8,9,%1,%%r12,8)\
|
||||||
|
SOLVE_col2_ltor_m2n4(16,4,5,%1) SUBTRACT_m2n4(16,6,7,%1,%%r12,4) SUBTRACT_m2n4(16,8,9,%1,%%r12,8)\
|
||||||
|
SOLVE_col3_ltor_m2n4(32,4,5,%1) SUBTRACT_m2n4(32,6,7,%1,%%r12,4) SUBTRACT_m2n4(32,8,9,%1,%%r12,8)\
|
||||||
|
SOLVE_col4_ltor_m2n4(48,4,5,%1) SUBTRACT_m2n4(48,6,7,%1,%%r12,4) SUBTRACT_m2n4(48,8,9,%1,%%r12,8)\
|
||||||
|
SAVE_SOLUTION_m2n4(4,5,0)\
|
||||||
|
SOLVE_col1_ltor_m2n4(64,6,7,%1,%%r12,4) SUBTRACT_m2n4(64,8,9,%1,%%r12,8)\
|
||||||
|
SOLVE_col2_ltor_m2n4(80,6,7,%1,%%r12,4) SUBTRACT_m2n4(80,8,9,%1,%%r12,8)\
|
||||||
|
SOLVE_col3_ltor_m2n4(96,6,7,%1,%%r12,4) SUBTRACT_m2n4(96,8,9,%1,%%r12,8)\
|
||||||
|
SOLVE_col4_ltor_m2n4(112,6,7,%1,%%r12,4) SUBTRACT_m2n4(112,8,9,%1,%%r12,8)\
|
||||||
|
SAVE_SOLUTION_m2n4(6,7,32)\
|
||||||
|
SOLVE_col1_ltor_m2n4(128,8,9,%1,%%r12,8)\
|
||||||
|
SOLVE_col2_ltor_m2n4(144,8,9,%1,%%r12,8)\
|
||||||
|
SOLVE_col3_ltor_m2n4(160,8,9,%1,%%r12,8)\
|
||||||
|
SOLVE_col4_ltor_m2n4(176,8,9,%1,%%r12,8)\
|
||||||
|
SAVE_SOLUTION_m2n4(8,9,64)
|
||||||
|
|
||||||
|
/* SOLVE_RN_m1n4: solve stage for a single row x 4 columns kept in register 4.
 * After GEMM_SUM_REORDER_1x4, %3 is reset to %2 and %2 advanced by 4 bytes (1 float).
 * Columns are solved left to right with coefficient offsets 0/16/32/48 bytes from %1,
 * then the row is saved with SAVE_SOLUTION_m1n4. */
#define SOLVE_RN_m1n4 \
|
||||||
|
"movq %2,%3;" GEMM_SUM_REORDER_1x4(4) "movq %2,%3; addq $4,%2;"\
|
||||||
|
SOLVE_col1_ltor_m1n4(0,4,%1)\
|
||||||
|
SOLVE_col2_ltor_m1n4(16,4,%1)\
|
||||||
|
SOLVE_col3_ltor_m1n4(32,4,%1)\
|
||||||
|
SOLVE_col4_ltor_m1n4(48,4,%1)\
|
||||||
|
SAVE_SOLUTION_m1n4(4,0)
|
||||||
|
|
||||||
|
/* SOLVE_RN_m1n8: solve stage for a single row x 8 columns (registers 4 and 5).
 * Each column step on register 4 is followed by a SUBTRACT_m1n4 update of register 5 with
 * coefficients at (%1,%%r12,4); register 4 is saved, then register 5 is solved and saved.
 * %2 is advanced by 4 bytes. */
#define SOLVE_RN_m1n8 \
|
||||||
|
"movq %2,%3;" GEMM_SUM_REORDER_1x4(4) GEMM_SUM_REORDER_1x4(5) "movq %2,%3; addq $4,%2;"\
|
||||||
|
SOLVE_col1_ltor_m1n4(0,4,%1) SUBTRACT_m1n4(0,5,%1,%%r12,4)\
|
||||||
|
SOLVE_col2_ltor_m1n4(16,4,%1) SUBTRACT_m1n4(16,5,%1,%%r12,4)\
|
||||||
|
SOLVE_col3_ltor_m1n4(32,4,%1) SUBTRACT_m1n4(32,5,%1,%%r12,4)\
|
||||||
|
SOLVE_col4_ltor_m1n4(48,4,%1) SUBTRACT_m1n4(48,5,%1,%%r12,4)\
|
||||||
|
SAVE_SOLUTION_m1n4(4,0)\
|
||||||
|
SOLVE_col1_ltor_m1n4(64,5,%1,%%r12,4)\
|
||||||
|
SOLVE_col2_ltor_m1n4(80,5,%1,%%r12,4)\
|
||||||
|
SOLVE_col3_ltor_m1n4(96,5,%1,%%r12,4)\
|
||||||
|
SOLVE_col4_ltor_m1n4(112,5,%1,%%r12,4)\
|
||||||
|
SAVE_SOLUTION_m1n4(5,16)
|
||||||
|
|
||||||
|
/* SOLVE_RN_m1n12: solve stage for a single row x 12 columns (registers 4, 5 and 6).
 * Groups are solved in ascending order; every column step also updates the groups not yet
 * solved via SUBTRACT_m1n4 ((%1,%%r12,4) for register 5, (%1,%%r12,8) for register 6).
 * Each register is saved right after its fourth column. %2 is advanced by 4 bytes. */
#define SOLVE_RN_m1n12 \
|
||||||
|
"movq %2,%3;" GEMM_SUM_REORDER_1x4(4) GEMM_SUM_REORDER_1x4(5) GEMM_SUM_REORDER_1x4(6) "movq %2,%3; addq $4,%2;"\
|
||||||
|
SOLVE_col1_ltor_m1n4(0,4,%1) SUBTRACT_m1n4(0,5,%1,%%r12,4) SUBTRACT_m1n4(0,6,%1,%%r12,8)\
|
||||||
|
SOLVE_col2_ltor_m1n4(16,4,%1) SUBTRACT_m1n4(16,5,%1,%%r12,4) SUBTRACT_m1n4(16,6,%1,%%r12,8)\
|
||||||
|
SOLVE_col3_ltor_m1n4(32,4,%1) SUBTRACT_m1n4(32,5,%1,%%r12,4) SUBTRACT_m1n4(32,6,%1,%%r12,8)\
|
||||||
|
SOLVE_col4_ltor_m1n4(48,4,%1) SUBTRACT_m1n4(48,5,%1,%%r12,4) SUBTRACT_m1n4(48,6,%1,%%r12,8)\
|
||||||
|
SAVE_SOLUTION_m1n4(4,0)\
|
||||||
|
SOLVE_col1_ltor_m1n4(64,5,%1,%%r12,4) SUBTRACT_m1n4(64,6,%1,%%r12,8)\
|
||||||
|
SOLVE_col2_ltor_m1n4(80,5,%1,%%r12,4) SUBTRACT_m1n4(80,6,%1,%%r12,8)\
|
||||||
|
SOLVE_col3_ltor_m1n4(96,5,%1,%%r12,4) SUBTRACT_m1n4(96,6,%1,%%r12,8)\
|
||||||
|
SOLVE_col4_ltor_m1n4(112,5,%1,%%r12,4) SUBTRACT_m1n4(112,6,%1,%%r12,8)\
|
||||||
|
SAVE_SOLUTION_m1n4(5,16)\
|
||||||
|
SOLVE_col1_ltor_m1n4(128,6,%1,%%r12,8)\
|
||||||
|
SOLVE_col2_ltor_m1n4(144,6,%1,%%r12,8)\
|
||||||
|
SOLVE_col3_ltor_m1n4(160,6,%1,%%r12,8)\
|
||||||
|
SOLVE_col4_ltor_m1n4(176,6,%1,%%r12,8)\
|
||||||
|
SAVE_SOLUTION_m1n4(6,32)
|
||||||
|
|
||||||
|
/* GEMM_RN_SIMPLE(mdim,ndim): generic accumulation loop for an mdim x ndim tile.
 * Loads %0 from r15 and bumps r15 by r12*mdim, loads the trip count %5 from r13 and %1 from
 * r14, then expands INIT_m<mdim>n<ndim>. If %5 is zero the loop is skipped; otherwise each
 * iteration expands GEMM_KERNEL_k1m<mdim>n<ndim>, advances %1 by 16 bytes and %0 by
 * mdim*4 bytes, and decrements %5. Local labels are built as "1<mdim><ndim>1" (loop head)
 * and "1<mdim><ndim>2" (exit). */
#define GEMM_RN_SIMPLE(mdim,ndim) \
|
||||||
|
"movq %%r15,%0; leaq (%%r15,%%r12,"#mdim"),%%r15; movq %%r13,%5; movq %%r14,%1;" INIT_m##mdim##n##ndim\
|
||||||
|
"testq %5,%5; jz 1"#mdim""#ndim"2f;"\
|
||||||
|
"1"#mdim""#ndim"1:\n\t"\
|
||||||
|
GEMM_KERNEL_k1m##mdim##n##ndim "addq $16,%1; addq $"#mdim"*4,%0; decq %5; jnz 1"#mdim""#ndim"1b;"\
|
||||||
|
"1"#mdim""#ndim"2:\n\t"
|
||||||
|
/* 8-row tiles with 4 and 8 columns use the generic one-step-per-iteration loop. */
#define GEMM_RN_m8n4 GEMM_RN_SIMPLE(8,4)
|
||||||
|
#define GEMM_RN_m8n8 GEMM_RN_SIMPLE(8,8)
|
||||||
|
/* GEMM_RN_m8n12: accumulation loop for the 8 x 12 tile, unrolled by 8.
 * Setup matches the generic loop (%0 <- r15, r15 += r12*8, %5 <- r13, %1 <- r14, INIT_m8n12).
 * While %5 >= 8 (unsigned compare) the body runs eight GEMM_KERNEL_k1m8n12 steps, each
 * advancing %0 by 32 bytes and %1 by 16 bytes, with a prefetcht0 of 384(%0) on every other
 * step; %5 is reduced by 8 per pass. Remaining steps (labels 18123/18124) run one at a
 * time until %5 reaches zero. */
#define GEMM_RN_m8n12 \
|
||||||
|
"movq %%r15,%0; leaq (%%r15,%%r12,8),%%r15; movq %%r13,%5; movq %%r14,%1;" INIT_m8n12\
|
||||||
|
"cmpq $8,%5; jb 18122f;"\
|
||||||
|
"18121:\n\t"\
|
||||||
|
GEMM_KERNEL_k1m8n12 "prefetcht0 384(%0); addq $32,%0; addq $16,%1;"\
|
||||||
|
GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1;"\
|
||||||
|
GEMM_KERNEL_k1m8n12 "prefetcht0 384(%0); addq $32,%0; addq $16,%1;"\
|
||||||
|
GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1;"\
|
||||||
|
GEMM_KERNEL_k1m8n12 "prefetcht0 384(%0); addq $32,%0; addq $16,%1;"\
|
||||||
|
GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1;"\
|
||||||
|
GEMM_KERNEL_k1m8n12 "prefetcht0 384(%0); addq $32,%0; addq $16,%1;"\
|
||||||
|
GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1;"\
|
||||||
|
"subq $8,%5; cmpq $8,%5; jnb 18121b;"\
|
||||||
|
"18122:\n\t"\
|
||||||
|
"testq %5,%5; jz 18124f;"\
|
||||||
|
"18123:\n\t"\
|
||||||
|
GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1; decq %5; jnz 18123b;"\
|
||||||
|
"18124:\n\t"
|
||||||
|
/* All tiles with 4, 2 or 1 rows use the generic one-step-per-iteration loop. */
#define GEMM_RN_m4n4 GEMM_RN_SIMPLE(4,4)
|
||||||
|
#define GEMM_RN_m4n8 GEMM_RN_SIMPLE(4,8)
|
||||||
|
#define GEMM_RN_m4n12 GEMM_RN_SIMPLE(4,12)
|
||||||
|
#define GEMM_RN_m2n4 GEMM_RN_SIMPLE(2,4)
|
||||||
|
#define GEMM_RN_m2n8 GEMM_RN_SIMPLE(2,8)
|
||||||
|
#define GEMM_RN_m2n12 GEMM_RN_SIMPLE(2,12)
|
||||||
|
#define GEMM_RN_m1n4 GEMM_RN_SIMPLE(1,4)
|
||||||
|
#define GEMM_RN_m1n8 GEMM_RN_SIMPLE(1,8)
|
||||||
|
#define GEMM_RN_m1n12 GEMM_RN_SIMPLE(1,12)
|
||||||
|
|
||||||
|
/* COMPUTE(ndim): processes one panel of ndim columns (4, 8 or 12) for all M rows in a
 * single inline-asm statement, then steps the C-side pointers to the next panel.
 *
 * Operand map: %0 a_ptr, %1 b_ptr, %2 c_ptr, %3 c_tmp, %4 ldc_bytes, %5 k_cnt (all "+r"),
 * %6 K, %7 OFF, %8 one[0], %9 zero[0], %10 M (all "m").
 * Scratch registers set up first: r15 = a_ptr, r14 = b_ptr, r13 = OFF, r12 = K << 2,
 * r11 = M (remaining rows).
 *
 * Row loop: while r11 >= 8 expand GEMM_RN_m8n<ndim> + SOLVE_RN_m8n<ndim>; afterwards bits
 * 4, 2 and 1 of r11 select at most one m4, one m2 and one m1 tile. Labels are
 * "<ndim>771".."<ndim>775". On exit %0/%1 are reloaded from r15/r14 and vzeroupper is issued.
 *
 * C epilogue: a_ptr is rewound by M*K, b_ptr advanced by ndim*K, c_ptr moved by
 * ldc*ndim - M, OFF increased by ndim.
 *
 * The macro refers to the caller's locals a_ptr, b_ptr, c_ptr, c_tmp, ldc_bytes, k_cnt, K,
 * OFF, one, zero, M and ldc by name.
 * NOTE(review): the "m" operands name only element 0 of one[]/zero[]; if the expanded
 * helpers read full vectors through %8/%9 this relies on the "memory" clobber - confirm. */
#define COMPUTE(ndim) {\
|
||||||
|
__asm__ __volatile__(\
|
||||||
|
"movq %0,%%r15; movq %1,%%r14; movq %7,%%r13; movq %6,%%r12; salq $2,%%r12; movq %10,%%r11;"\
|
||||||
|
"cmpq $8,%%r11; jb "#ndim"772f;"\
|
||||||
|
#ndim"771:\n\t"\
|
||||||
|
GEMM_RN_m8n##ndim SOLVE_RN_m8n##ndim "subq $8,%%r11; cmpq $8,%%r11; jnb "#ndim"771b;"\
|
||||||
|
#ndim"772:\n\t"\
|
||||||
|
"testq $4,%%r11; jz "#ndim"773f;"\
|
||||||
|
GEMM_RN_m4n##ndim SOLVE_RN_m4n##ndim "subq $4,%%r11;"\
|
||||||
|
#ndim"773:\n\t"\
|
||||||
|
"testq $2,%%r11; jz "#ndim"774f;"\
|
||||||
|
GEMM_RN_m2n##ndim SOLVE_RN_m2n##ndim "subq $2,%%r11;"\
|
||||||
|
#ndim"774:\n\t"\
|
||||||
|
"testq $1,%%r11; jz "#ndim"775f;"\
|
||||||
|
GEMM_RN_m1n##ndim SOLVE_RN_m1n##ndim "subq $1,%%r11;"\
|
||||||
|
#ndim"775:\n\t"\
|
||||||
|
"movq %%r15,%0; movq %%r14,%1; vzeroupper;"\
|
||||||
|
:"+r"(a_ptr),"+r"(b_ptr),"+r"(c_ptr),"+r"(c_tmp),"+r"(ldc_bytes),"+r"(k_cnt):"m"(K),"m"(OFF),"m"(one[0]),"m"(zero[0]),"m"(M)\
|
||||||
|
:"r11","r12","r13","r14","r15","cc","memory",\
|
||||||
|
"xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15");\
|
||||||
|
a_ptr -= M * K; b_ptr += ndim * K; c_ptr += ldc * ndim - M; OFF += ndim;\
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
 * Scalar solve used for the narrow (1- or 2-column) edge panels.
 *
 *   m, n : rows and columns of the tile of C being solved
 *   a    : output buffer; receives the solved tile, column after column,
 *          m values per column (a[col*m + row])
 *   b    : n x n coefficient block, row stride n; b[col*n + col] is used as a
 *          multiplier, so it is presumably the pre-inverted diagonal produced
 *          by the packing routine - confirm against the packer
 *   c    : tile of C, column stride ldc; overwritten with the solution
 *   ldc  : distance in elements between consecutive columns of c
 *
 * Columns are processed left to right: a column is scaled by its diagonal
 * entry, and the result is immediately eliminated from all columns to its right.
 *
 * Loop counters are BLASLONG so they have the same width as the bounds and
 * strides they are compared and multiplied with.
 */
static void solve_RN(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
  BLASLONG col, row, later;

  for (col = 0; col < n; col++) {
    const FLOAT diag = b[col * n + col];

    for (row = 0; row < m; row++) {
      const FLOAT solved = c[col * ldc + row] * diag;

      /* Publish the solved value both in C and in the packed output buffer. */
      c[col * ldc + row] = solved;
      a[col * m + row] = solved;

      /* Remove this value's contribution from every column still to be solved. */
      for (later = col + 1; later < n; later++) {
        c[later * ldc + row] -= solved * b[col * n + later];
      }
    }
  }
}
|
||||||
|
/*
 * Handles one narrow column panel (n columns) for all m rows using the generic
 * GEMM kernel followed by the scalar solve_RN.
 *
 * Rows are consumed in tiles of 8, then 4, then 2, then 1. For each tile:
 *   - if offset > 0, GEMM_KERNEL_N applies the update with alpha = -1.0 over
 *     `offset` steps of the packed A tile and packed B panel;
 *   - solve_RN then solves the tile, reading coefficients from sb + offset*n
 *     and writing the packed result to the A tile at element offset*rows;
 *   - the A cursor advances by k*rows elements and the C cursor by `rows`.
 */
static void COMPUTE_EDGE_1_nchunk(BLASLONG m, BLASLONG n, FLOAT *sa, FLOAT *sb, FLOAT *C, BLASLONG ldc, BLASLONG k, BLASLONG offset) {
  static const BLASLONG tile_heights[4] = {8, 4, 2, 1};
  BLASLONG rows_left = m;
  FLOAT *a_tile = sa;
  FLOAT *c_tile = C;
  int t;

  for (t = 0; t < 4; t++) {
    const BLASLONG rows = tile_heights[t];

    /* After the previous (twice as tall) tile size is exhausted, fewer than
       2*rows rows remain, so the smaller sizes run at most once each. */
    while (rows_left >= rows) {
      if (offset > 0) {
        GEMM_KERNEL_N(rows, n, offset, -1.0, a_tile, sb, c_tile, ldc);
      }
      solve_RN(rows, n, a_tile + offset * rows, sb + offset * n, c_tile, ldc);

      a_tile += k * rows;
      c_tile += rows;
      rows_left -= rows;
    }
  }
}
|
||||||
|
/*
 * Kernel entry point.
 *
 *   m, n   : dimensions of the C block
 *   k      : packed depth (row count of the packed B panel / column count of packed A)
 *   dummy1 : unused
 *   sa, sb : packed A and B buffers
 *   C, ldc : output block and its column stride in elements
 *   offset : negated on entry and used as the running count of already-solved columns
 *
 * Columns are consumed in panels of 12, 8 and 4 through the COMPUTE macro
 * (inline asm); the remaining 2- and 1-column panels fall back to
 * COMPUTE_EDGE_1_nchunk. Always returns 0.
 *
 * The local names below are referenced directly by COMPUTE and must not be renamed.
 */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *sa, FLOAT *sb, FLOAT *C, BLASLONG ldc, BLASLONG offset){
  float *a_ptr = sa;
  float *b_ptr = sb;
  float *c_ptr = C;
  float *c_tmp = C;
  float one[8] = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0};
  float zero[8] = {0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0};
  uint64_t ldc_bytes = (uint64_t)ldc * sizeof(float);
  uint64_t K = (uint64_t)k;
  uint64_t M = (uint64_t)m;
  uint64_t OFF = (uint64_t)-offset;
  uint64_t k_cnt = 0;
  BLASLONG n_count = n;

  /* Wide panels: handled entirely by the asm kernels. */
  while (n_count >= 12) {
    COMPUTE(12)
    n_count -= 12;
  }
  while (n_count >= 8) {
    COMPUTE(8)
    n_count -= 8;
  }
  while (n_count >= 4) {
    COMPUTE(4)
    n_count -= 4;
  }

  /* Narrow leftovers: generic GEMM kernel plus scalar solve. */
  while (n_count >= 2) {
    COMPUTE_EDGE_1_nchunk(m, 2, a_ptr, b_ptr, c_ptr, ldc, k, (BLASLONG)OFF);
    b_ptr += 2*k;
    c_ptr += ldc*2;
    OFF += 2;
    n_count -= 2;
  }
  if (n_count >= 1) {
    COMPUTE_EDGE_1_nchunk(m, 1, a_ptr, b_ptr, c_ptr, ldc, k, (BLASLONG)OFF);
  }

  return 0;
}
|
|
@ -0,0 +1,281 @@
|
||||||
|
#include "common.h"
|
||||||
|
#include <stdint.h>
|
||||||
|
#include "strsm_kernel_8x4_haswell_R_common.h"
|
||||||
|
|
||||||
|
/* SOLVE_RT_m8n4: backward-order solve stage for an 8-row x 4-column tile in ymm4-ymm7.
 * After GEMM_SUM_REORDER_8x4, %3 is moved back by 2*%4 (negq / leaq / negq) and %2 is
 * advanced by 32 bytes. The last register pair (ymm6/7) is solved first with
 * SOLVE_rile / SOLVE_le, updating ymm4/5 through SUBTRACT_m8n2, and saved; %3 is then moved
 * back by 4*%4 and ymm4/5 are solved and saved. All coefficient offsets are negative,
 * i.e. taken below the address in %1.
 * Helper macros come from strsm_kernel_8x4_haswell_R_common.h. */
#define SOLVE_RT_m8n4 \
|
||||||
|
"movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $32,%2;"\
|
||||||
|
SOLVE_rile_m8n2(-8,6,7,%1) SUBTRACT_m8n2(-16,4,5,%1)\
|
||||||
|
SOLVE_le_m8n2(-24,6,7,%1) SUBTRACT_m8n2(-32,4,5,%1)\
|
||||||
|
SAVE_SOLUTION_m8n2(6,7,-64) "negq %4; leaq (%3,%4,4),%3; negq %4;"\
|
||||||
|
SOLVE_rile_m8n2(-48,4,5,%1)\
|
||||||
|
SOLVE_le_m8n2(-64,4,5,%1)\
|
||||||
|
SAVE_SOLUTION_m8n2(4,5,-128)
|
||||||
|
|
||||||
|
/* SOLVE_RT_m8n8: backward-order solve stage for an 8-row x 8-column tile in ymm4-ymm11.
 * Register pairs are solved from last to first (ymm10/11, ymm8/9, ymm6/7, ymm4/5); every
 * solve step updates all earlier pairs through SUBTRACT_m8n2. After each save except the
 * last, %3 is moved back by 4*%4. Pairs ymm8-11 address their coefficients through
 * (%1,%%r12,4). %2 is advanced by 32 bytes. */
#define SOLVE_RT_m8n8 \
|
||||||
|
"movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) GEMM_SUM_REORDER_8x4(8,9,10,11,63) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $32,%2;"\
|
||||||
|
SOLVE_rile_m8n2(-8,10,11,%1,%%r12,4) SUBTRACT_m8n2(-16,8,9,%1,%%r12,4) SUBTRACT_m8n2(-8,6,7,%1) SUBTRACT_m8n2(-16,4,5,%1)\
|
||||||
|
SOLVE_le_m8n2(-24,10,11,%1,%%r12,4) SUBTRACT_m8n2(-32,8,9,%1,%%r12,4) SUBTRACT_m8n2(-24,6,7,%1) SUBTRACT_m8n2(-32,4,5,%1)\
|
||||||
|
SAVE_SOLUTION_m8n2(10,11,-64) "negq %4; leaq (%3,%4,4),%3; negq %4;"\
|
||||||
|
SOLVE_rile_m8n2(-48,8,9,%1,%%r12,4) SUBTRACT_m8n2(-40,6,7,%1) SUBTRACT_m8n2(-48,4,5,%1)\
|
||||||
|
SOLVE_le_m8n2(-64,8,9,%1,%%r12,4) SUBTRACT_m8n2(-56,6,7,%1) SUBTRACT_m8n2(-64,4,5,%1)\
|
||||||
|
SAVE_SOLUTION_m8n2(8,9,-128) "negq %4; leaq (%3,%4,4),%3; negq %4;"\
|
||||||
|
SOLVE_rile_m8n2(-72,6,7,%1) SUBTRACT_m8n2(-80,4,5,%1)\
|
||||||
|
SOLVE_le_m8n2(-88,6,7,%1) SUBTRACT_m8n2(-96,4,5,%1)\
|
||||||
|
SAVE_SOLUTION_m8n2(6,7,-192) "negq %4; leaq (%3,%4,4),%3; negq %4;"\
|
||||||
|
SOLVE_rile_m8n2(-112,4,5,%1)\
|
||||||
|
SOLVE_le_m8n2(-128,4,5,%1)\
|
||||||
|
SAVE_SOLUTION_m8n2(4,5,-256)
|
||||||
|
|
||||||
|
/* SOLVE_RT_m8n12: backward-order solve stage for an 8-row x 12-column tile in ymm4-ymm15.
 * The six register pairs are solved from ymm14/15 down to ymm4/5; each SOLVE_rile /
 * SOLVE_le step is followed by SUBTRACT_m8n2 updates of every earlier pair. After each
 * save except the last, %3 is moved back by 4*%4. Coefficients are addressed via
 * (%1,%%r12,8) for ymm12-15, (%1,%%r12,4) for ymm8-11 and plain %1 for ymm4-7.
 * %2 is advanced by 32 bytes. */
#define SOLVE_RT_m8n12 \
|
||||||
|
"movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) GEMM_SUM_REORDER_8x4(8,9,10,11,63) GEMM_SUM_REORDER_8x4(12,13,14,15,63) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $32,%2;"\
|
||||||
|
SOLVE_rile_m8n2(-8,14,15,%1,%%r12,8) SUBTRACT_m8n2(-16,12,13,%1,%%r12,8) SUBTRACT_m8n2(-8,10,11,%1,%%r12,4) SUBTRACT_m8n2(-16,8,9,%1,%%r12,4) SUBTRACT_m8n2(-8,6,7,%1) SUBTRACT_m8n2(-16,4,5,%1)\
|
||||||
|
SOLVE_le_m8n2(-24,14,15,%1,%%r12,8) SUBTRACT_m8n2(-32,12,13,%1,%%r12,8) SUBTRACT_m8n2(-24,10,11,%1,%%r12,4) SUBTRACT_m8n2(-32,8,9,%1,%%r12,4) SUBTRACT_m8n2(-24,6,7,%1) SUBTRACT_m8n2(-32,4,5,%1)\
|
||||||
|
SAVE_SOLUTION_m8n2(14,15,-64) "negq %4; leaq (%3,%4,4),%3; negq %4;"\
|
||||||
|
SOLVE_rile_m8n2(-48,12,13,%1,%%r12,8) SUBTRACT_m8n2(-40,10,11,%1,%%r12,4) SUBTRACT_m8n2(-48,8,9,%1,%%r12,4) SUBTRACT_m8n2(-40,6,7,%1) SUBTRACT_m8n2(-48,4,5,%1)\
|
||||||
|
SOLVE_le_m8n2(-64,12,13,%1,%%r12,8) SUBTRACT_m8n2(-56,10,11,%1,%%r12,4) SUBTRACT_m8n2(-64,8,9,%1,%%r12,4) SUBTRACT_m8n2(-56,6,7,%1) SUBTRACT_m8n2(-64,4,5,%1)\
|
||||||
|
SAVE_SOLUTION_m8n2(12,13,-128) "negq %4; leaq (%3,%4,4),%3; negq %4;"\
|
||||||
|
SOLVE_rile_m8n2(-72,10,11,%1,%%r12,4) SUBTRACT_m8n2(-80,8,9,%1,%%r12,4) SUBTRACT_m8n2(-72,6,7,%1) SUBTRACT_m8n2(-80,4,5,%1)\
|
||||||
|
SOLVE_le_m8n2(-88,10,11,%1,%%r12,4) SUBTRACT_m8n2(-96,8,9,%1,%%r12,4) SUBTRACT_m8n2(-88,6,7,%1) SUBTRACT_m8n2(-96,4,5,%1)\
|
||||||
|
SAVE_SOLUTION_m8n2(10,11,-192) "negq %4; leaq (%3,%4,4),%3; negq %4;"\
|
||||||
|
SOLVE_rile_m8n2(-112,8,9,%1,%%r12,4) SUBTRACT_m8n2(-104,6,7,%1) SUBTRACT_m8n2(-112,4,5,%1)\
|
||||||
|
SOLVE_le_m8n2(-128,8,9,%1,%%r12,4) SUBTRACT_m8n2(-120,6,7,%1) SUBTRACT_m8n2(-128,4,5,%1)\
|
||||||
|
SAVE_SOLUTION_m8n2(8,9,-256) "negq %4; leaq (%3,%4,4),%3; negq %4;"\
|
||||||
|
SOLVE_rile_m8n2(-136,6,7,%1) SUBTRACT_m8n2(-144,4,5,%1)\
|
||||||
|
SOLVE_le_m8n2(-152,6,7,%1) SUBTRACT_m8n2(-160,4,5,%1)\
|
||||||
|
SAVE_SOLUTION_m8n2(6,7,-320) "negq %4; leaq (%3,%4,4),%3; negq %4;"\
|
||||||
|
SOLVE_rile_m8n2(-176,4,5,%1)\
|
||||||
|
SOLVE_le_m8n2(-192,4,5,%1)\
|
||||||
|
SAVE_SOLUTION_m8n2(4,5,-384)
|
||||||
|
|
||||||
|
/* SOLVE_RT_m4n4: backward-order solve stage for a 4-row x 4-column tile.
 * GEMM_SUM_REORDER_4x4 leaves its results in registers 4 and 5; %3 is then moved back by
 * 2*%4 and %2 advanced by 16 bytes. Register 5 is solved first (updating register 4 via
 * SUBTRACT_m4n2) and saved, %3 is moved back by 4*%4, then register 4 is solved and saved. */
#define SOLVE_RT_m4n4 \
|
||||||
|
"movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $16,%2;"\
|
||||||
|
SOLVE_rile_m4n2(-8,5,%1) SUBTRACT_m4n2(-16,4,%1)\
|
||||||
|
SOLVE_le_m4n2(-24,5,%1) SUBTRACT_m4n2(-32,4,%1)\
|
||||||
|
SAVE_SOLUTION_m4n2(5,-32) "negq %4; leaq (%3,%4,4),%3; negq %4;"\
|
||||||
|
SOLVE_rile_m4n2(-48,4,%1)\
|
||||||
|
SOLVE_le_m4n2(-64,4,%1)\
|
||||||
|
SAVE_SOLUTION_m4n2(4,-64)
|
||||||
|
|
||||||
|
/* SOLVE_RT_m4n8: backward-order solve stage for a 4-row x 8-column tile compacted into
 * registers 4-7. Registers are solved from 7 down to 4; each solve step updates all
 * lower-numbered registers via SUBTRACT_m4n2, and %3 is moved back by 4*%4 after every save
 * except the last. Registers 6/7 address coefficients through (%1,%%r12,4).
 * %2 is advanced by 16 bytes. */
#define SOLVE_RT_m4n8 \
|
||||||
|
"movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) GEMM_SUM_REORDER_4x4(8,9,10,11,6,7) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $16,%2;"\
|
||||||
|
SOLVE_rile_m4n2(-8,7,%1,%%r12,4) SUBTRACT_m4n2(-16,6,%1,%%r12,4) SUBTRACT_m4n2(-8,5,%1) SUBTRACT_m4n2(-16,4,%1)\
|
||||||
|
SOLVE_le_m4n2(-24,7,%1,%%r12,4) SUBTRACT_m4n2(-32,6,%1,%%r12,4) SUBTRACT_m4n2(-24,5,%1) SUBTRACT_m4n2(-32,4,%1)\
|
||||||
|
SAVE_SOLUTION_m4n2(7,-32) "negq %4; leaq (%3,%4,4),%3; negq %4;"\
|
||||||
|
SOLVE_rile_m4n2(-48,6,%1,%%r12,4) SUBTRACT_m4n2(-40,5,%1) SUBTRACT_m4n2(-48,4,%1)\
|
||||||
|
SOLVE_le_m4n2(-64,6,%1,%%r12,4) SUBTRACT_m4n2(-56,5,%1) SUBTRACT_m4n2(-64,4,%1)\
|
||||||
|
SAVE_SOLUTION_m4n2(6,-64) "negq %4; leaq (%3,%4,4),%3; negq %4;"\
|
||||||
|
SOLVE_rile_m4n2(-72,5,%1) SUBTRACT_m4n2(-80,4,%1)\
|
||||||
|
SOLVE_le_m4n2(-88,5,%1) SUBTRACT_m4n2(-96,4,%1)\
|
||||||
|
SAVE_SOLUTION_m4n2(5,-96) "negq %4; leaq (%3,%4,4),%3; negq %4;"\
|
||||||
|
SOLVE_rile_m4n2(-112,4,%1)\
|
||||||
|
SOLVE_le_m4n2(-128,4,%1)\
|
||||||
|
SAVE_SOLUTION_m4n2(4,-128)
|
||||||
|
|
||||||
|
/* SOLVE_RT_m4n12: backward-order solve stage for a 4-row x 12-column tile compacted into
 * registers 4-9. Registers are solved from 9 down to 4; each SOLVE_rile / SOLVE_le step is
 * followed by SUBTRACT_m4n2 updates of all lower-numbered registers, and %3 is moved back
 * by 4*%4 after every save except the last. Registers 8/9 use (%1,%%r12,8) and registers
 * 6/7 use (%1,%%r12,4) for their coefficients. %2 is advanced by 16 bytes. */
#define SOLVE_RT_m4n12 \
|
||||||
|
"movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) GEMM_SUM_REORDER_4x4(8,9,10,11,6,7) GEMM_SUM_REORDER_4x4(12,13,14,15,8,9) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $16,%2;"\
|
||||||
|
SOLVE_rile_m4n2(-8,9,%1,%%r12,8) SUBTRACT_m4n2(-16,8,%1,%%r12,8) SUBTRACT_m4n2(-8,7,%1,%%r12,4) SUBTRACT_m4n2(-16,6,%1,%%r12,4) SUBTRACT_m4n2(-8,5,%1) SUBTRACT_m4n2(-16,4,%1)\
|
||||||
|
SOLVE_le_m4n2(-24,9,%1,%%r12,8) SUBTRACT_m4n2(-32,8,%1,%%r12,8) SUBTRACT_m4n2(-24,7,%1,%%r12,4) SUBTRACT_m4n2(-32,6,%1,%%r12,4) SUBTRACT_m4n2(-24,5,%1) SUBTRACT_m4n2(-32,4,%1)\
|
||||||
|
SAVE_SOLUTION_m4n2(9,-32) "negq %4; leaq (%3,%4,4),%3; negq %4;"\
|
||||||
|
SOLVE_rile_m4n2(-48,8,%1,%%r12,8) SUBTRACT_m4n2(-40,7,%1,%%r12,4) SUBTRACT_m4n2(-48,6,%1,%%r12,4) SUBTRACT_m4n2(-40,5,%1) SUBTRACT_m4n2(-48,4,%1)\
|
||||||
|
SOLVE_le_m4n2(-64,8,%1,%%r12,8) SUBTRACT_m4n2(-56,7,%1,%%r12,4) SUBTRACT_m4n2(-64,6,%1,%%r12,4) SUBTRACT_m4n2(-56,5,%1) SUBTRACT_m4n2(-64,4,%1)\
|
||||||
|
SAVE_SOLUTION_m4n2(8,-64) "negq %4; leaq (%3,%4,4),%3; negq %4;"\
|
||||||
|
SOLVE_rile_m4n2(-72,7,%1,%%r12,4) SUBTRACT_m4n2(-80,6,%1,%%r12,4) SUBTRACT_m4n2(-72,5,%1) SUBTRACT_m4n2(-80,4,%1)\
|
||||||
|
SOLVE_le_m4n2(-88,7,%1,%%r12,4) SUBTRACT_m4n2(-96,6,%1,%%r12,4) SUBTRACT_m4n2(-88,5,%1) SUBTRACT_m4n2(-96,4,%1)\
|
||||||
|
SAVE_SOLUTION_m4n2(7,-96) "negq %4; leaq (%3,%4,4),%3; negq %4;"\
|
||||||
|
SOLVE_rile_m4n2(-112,6,%1,%%r12,4) SUBTRACT_m4n2(-104,5,%1) SUBTRACT_m4n2(-112,4,%1)\
|
||||||
|
SOLVE_le_m4n2(-128,6,%1,%%r12,4) SUBTRACT_m4n2(-120,5,%1) SUBTRACT_m4n2(-128,4,%1)\
|
||||||
|
SAVE_SOLUTION_m4n2(6,-128) "negq %4; leaq (%3,%4,4),%3; negq %4;"\
|
||||||
|
SOLVE_rile_m4n2(-136,5,%1) SUBTRACT_m4n2(-144,4,%1)\
|
||||||
|
SOLVE_le_m4n2(-152,5,%1) SUBTRACT_m4n2(-160,4,%1)\
|
||||||
|
SAVE_SOLUTION_m4n2(5,-160) "negq %4; leaq (%3,%4,4),%3; negq %4;"\
|
||||||
|
SOLVE_rile_m4n2(-176,4,%1)\
|
||||||
|
SOLVE_le_m4n2(-192,4,%1)\
|
||||||
|
SAVE_SOLUTION_m4n2(4,-192)
|
||||||
|
|
||||||
|
/* SOLVE_RT_m2n4: backward-order solve stage for a 2-row x 4-column tile in registers 4/5.
 * After GEMM_SUM_REORDER_2x4, %3 is moved back by 4*%4 and %2 advanced by 8 bytes.
 * Columns are solved right to left (col4 .. col1, coefficient offsets -16 .. -64 bytes
 * from %1) and the result saved with SAVE_SOLUTION_m2n4. */
#define SOLVE_RT_m2n4 \
|
||||||
|
"movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $8,%2;"\
|
||||||
|
SOLVE_col4_rtol_m2n4(-16,4,5,%1)\
|
||||||
|
SOLVE_col3_rtol_m2n4(-32,4,5,%1)\
|
||||||
|
SOLVE_col2_rtol_m2n4(-48,4,5,%1)\
|
||||||
|
SOLVE_col1_rtol_m2n4(-64,4,5,%1)\
|
||||||
|
SAVE_SOLUTION_m2n4(4,5,-32)
|
||||||
|
|
||||||
|
/* SOLVE_RT_m2n8: backward-order solve stage for a 2-row x 8-column tile (registers 4/5 and
 * 6/7). Registers 6/7 are solved first, column 4 down to column 1, each step updating
 * registers 4/5 through SUBTRACT_m2n4; they are saved, %3 is moved back by 8*%4, and
 * registers 4/5 are then solved right to left and saved. %2 is advanced by 8 bytes. */
#define SOLVE_RT_m2n8 \
|
||||||
|
"movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) GEMM_SUM_REORDER_2x4(6,7) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $8,%2;"\
|
||||||
|
SOLVE_col4_rtol_m2n4(-16,6,7,%1,%%r12,4) SUBTRACT_m2n4(-16,4,5,%1)\
|
||||||
|
SOLVE_col3_rtol_m2n4(-32,6,7,%1,%%r12,4) SUBTRACT_m2n4(-32,4,5,%1)\
|
||||||
|
SOLVE_col2_rtol_m2n4(-48,6,7,%1,%%r12,4) SUBTRACT_m2n4(-48,4,5,%1)\
|
||||||
|
SOLVE_col1_rtol_m2n4(-64,6,7,%1,%%r12,4) SUBTRACT_m2n4(-64,4,5,%1)\
|
||||||
|
SAVE_SOLUTION_m2n4(6,7,-32) "negq %4; leaq (%3,%4,8),%3; negq %4;"\
|
||||||
|
SOLVE_col4_rtol_m2n4(-80,4,5,%1)\
|
||||||
|
SOLVE_col3_rtol_m2n4(-96,4,5,%1)\
|
||||||
|
SOLVE_col2_rtol_m2n4(-112,4,5,%1)\
|
||||||
|
SOLVE_col1_rtol_m2n4(-128,4,5,%1)\
|
||||||
|
SAVE_SOLUTION_m2n4(4,5,-64)
|
||||||
|
|
||||||
|
/* SOLVE_RT_m2n12: backward-order solve stage for a 2-row x 12-column tile (registers 4/5,
 * 6/7, 8/9). The three 4-column groups are solved from last to first; every column step
 * also updates the groups still to be solved through SUBTRACT_m2n4. After each save except
 * the last, %3 is moved back by 8*%4. Registers 8/9 use (%1,%%r12,8) and registers 6/7 use
 * (%1,%%r12,4) for their coefficients. %2 is advanced by 8 bytes. */
#define SOLVE_RT_m2n12 \
|
||||||
|
"movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) GEMM_SUM_REORDER_2x4(6,7) GEMM_SUM_REORDER_2x4(8,9) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $8,%2;"\
|
||||||
|
SOLVE_col4_rtol_m2n4(-16,8,9,%1,%%r12,8) SUBTRACT_m2n4(-16,6,7,%1,%%r12,4) SUBTRACT_m2n4(-16,4,5,%1)\
|
||||||
|
SOLVE_col3_rtol_m2n4(-32,8,9,%1,%%r12,8) SUBTRACT_m2n4(-32,6,7,%1,%%r12,4) SUBTRACT_m2n4(-32,4,5,%1)\
|
||||||
|
SOLVE_col2_rtol_m2n4(-48,8,9,%1,%%r12,8) SUBTRACT_m2n4(-48,6,7,%1,%%r12,4) SUBTRACT_m2n4(-48,4,5,%1)\
|
||||||
|
SOLVE_col1_rtol_m2n4(-64,8,9,%1,%%r12,8) SUBTRACT_m2n4(-64,6,7,%1,%%r12,4) SUBTRACT_m2n4(-64,4,5,%1)\
|
||||||
|
SAVE_SOLUTION_m2n4(8,9,-32) "negq %4; leaq (%3,%4,8),%3; negq %4;"\
|
||||||
|
SOLVE_col4_rtol_m2n4(-80,6,7,%1,%%r12,4) SUBTRACT_m2n4(-80,4,5,%1)\
|
||||||
|
SOLVE_col3_rtol_m2n4(-96,6,7,%1,%%r12,4) SUBTRACT_m2n4(-96,4,5,%1)\
|
||||||
|
SOLVE_col2_rtol_m2n4(-112,6,7,%1,%%r12,4) SUBTRACT_m2n4(-112,4,5,%1)\
|
||||||
|
SOLVE_col1_rtol_m2n4(-128,6,7,%1,%%r12,4) SUBTRACT_m2n4(-128,4,5,%1)\
|
||||||
|
SAVE_SOLUTION_m2n4(6,7,-64) "negq %4; leaq (%3,%4,8),%3; negq %4;"\
|
||||||
|
SOLVE_col4_rtol_m2n4(-144,4,5,%1)\
|
||||||
|
SOLVE_col3_rtol_m2n4(-160,4,5,%1)\
|
||||||
|
SOLVE_col2_rtol_m2n4(-176,4,5,%1)\
|
||||||
|
SOLVE_col1_rtol_m2n4(-192,4,5,%1)\
|
||||||
|
SAVE_SOLUTION_m2n4(4,5,-96)
|
||||||
|
|
||||||
|
/*
 * SOLVE_RT_m1n4: solve stage for a 1-row x 4-column tile held in xmm4.
 * Columns are solved from the 4th down to the 1st; the result is written
 * to the packed A buffer (%0) and to C (via %3) by SAVE_SOLUTION_m1n4.
 */
#define SOLVE_RT_m1n4 \
  /* c_tmp = c_ptr, then add the C tile to the accumulator */\
  "movq %2,%3;"\
  GEMM_SUM_REORDER_1x4(4)\
  /* c_tmp -= 4*ldc; c_ptr += 4 bytes */\
  "negq %4; leaq (%3,%4,4),%3; negq %4; addq $4,%2;"\
  SOLVE_col4_rtol_m1n4(-16,4,%1)\
  SOLVE_col3_rtol_m1n4(-32,4,%1)\
  SOLVE_col2_rtol_m1n4(-48,4,%1)\
  SOLVE_col1_rtol_m1n4(-64,4,%1)\
  SAVE_SOLUTION_m1n4(4,-16)
|
||||||
|
|
||||||
|
/*
 * SOLVE_RT_m1n8: solve stage for a 1-row x 8-column tile.
 * xmm4 carries the first 4-column group and xmm5 the second.  The second
 * group is solved first; each of its solved columns is eliminated from
 * xmm4 with SUBTRACT_m1n4.
 */
#define SOLVE_RT_m1n8 \
  /* c_tmp = c_ptr, then add the C tile to the accumulators */\
  "movq %2,%3;"\
  GEMM_SUM_REORDER_1x4(4)\
  GEMM_SUM_REORDER_1x4(5)\
  /* c_tmp -= 4*ldc; c_ptr += 4 bytes */\
  "negq %4; leaq (%3,%4,4),%3; negq %4; addq $4,%2;"\
  /* second group (xmm5) */\
  SOLVE_col4_rtol_m1n4(-16,5,%1,%%r12,4)\
    SUBTRACT_m1n4(-16,4,%1)\
  SOLVE_col3_rtol_m1n4(-32,5,%1,%%r12,4)\
    SUBTRACT_m1n4(-32,4,%1)\
  SOLVE_col2_rtol_m1n4(-48,5,%1,%%r12,4)\
    SUBTRACT_m1n4(-48,4,%1)\
  SOLVE_col1_rtol_m1n4(-64,5,%1,%%r12,4)\
    SUBTRACT_m1n4(-64,4,%1)\
  SAVE_SOLUTION_m1n4(5,-16)\
  /* c_tmp -= 8*ldc: back to the start of the first group */\
  "negq %4; leaq (%3,%4,8),%3; negq %4;"\
  /* first group (xmm4) */\
  SOLVE_col4_rtol_m1n4(-80,4,%1)\
  SOLVE_col3_rtol_m1n4(-96,4,%1)\
  SOLVE_col2_rtol_m1n4(-112,4,%1)\
  SOLVE_col1_rtol_m1n4(-128,4,%1)\
  SAVE_SOLUTION_m1n4(4,-32)
|
||||||
|
|
||||||
|
/*
 * SOLVE_RT_m1n12: solve stage for a 1-row x 12-column tile.
 * xmm4, xmm5 and xmm6 each carry one 4-column group.  Groups are solved
 * from the last one to the first; each solved column is eliminated from
 * the pending groups with SUBTRACT_m1n4.
 */
#define SOLVE_RT_m1n12 \
  /* c_tmp = c_ptr, then add the C tile to the accumulators */\
  "movq %2,%3;"\
  GEMM_SUM_REORDER_1x4(4)\
  GEMM_SUM_REORDER_1x4(5)\
  GEMM_SUM_REORDER_1x4(6)\
  /* c_tmp -= 4*ldc; c_ptr += 4 bytes */\
  "negq %4; leaq (%3,%4,4),%3; negq %4; addq $4,%2;"\
  /* last group (xmm6) */\
  SOLVE_col4_rtol_m1n4(-16,6,%1,%%r12,8)\
    SUBTRACT_m1n4(-16,5,%1,%%r12,4)\
    SUBTRACT_m1n4(-16,4,%1)\
  SOLVE_col3_rtol_m1n4(-32,6,%1,%%r12,8)\
    SUBTRACT_m1n4(-32,5,%1,%%r12,4)\
    SUBTRACT_m1n4(-32,4,%1)\
  SOLVE_col2_rtol_m1n4(-48,6,%1,%%r12,8)\
    SUBTRACT_m1n4(-48,5,%1,%%r12,4)\
    SUBTRACT_m1n4(-48,4,%1)\
  SOLVE_col1_rtol_m1n4(-64,6,%1,%%r12,8)\
    SUBTRACT_m1n4(-64,5,%1,%%r12,4)\
    SUBTRACT_m1n4(-64,4,%1)\
  SAVE_SOLUTION_m1n4(6,-16)\
  /* c_tmp -= 8*ldc: back to the start of the previous group */\
  "negq %4; leaq (%3,%4,8),%3; negq %4;"\
  /* middle group (xmm5) */\
  SOLVE_col4_rtol_m1n4(-80,5,%1,%%r12,4)\
    SUBTRACT_m1n4(-80,4,%1)\
  SOLVE_col3_rtol_m1n4(-96,5,%1,%%r12,4)\
    SUBTRACT_m1n4(-96,4,%1)\
  SOLVE_col2_rtol_m1n4(-112,5,%1,%%r12,4)\
    SUBTRACT_m1n4(-112,4,%1)\
  SOLVE_col1_rtol_m1n4(-128,5,%1,%%r12,4)\
    SUBTRACT_m1n4(-128,4,%1)\
  SAVE_SOLUTION_m1n4(5,-32)\
  "negq %4; leaq (%3,%4,8),%3; negq %4;"\
  /* first group (xmm4) */\
  SOLVE_col4_rtol_m1n4(-144,4,%1)\
  SOLVE_col3_rtol_m1n4(-160,4,%1)\
  SOLVE_col2_rtol_m1n4(-176,4,%1)\
  SOLVE_col1_rtol_m1n4(-192,4,%1)\
  SAVE_SOLUTION_m1n4(4,-48)
|
||||||
|
|
||||||
|
/*
 * Register roles for the RT update loops:
 *   r14 = b_tail, r15 = a_tail, r13 = k-kk
 *
 * GEMM_RT_SIMPLE(mdim,ndim): non-unrolled update loop for an mdim x ndim
 * tile.  It first moves a_tail forward by mdim*r12 bytes, then walks a_ptr
 * (%0) and b_ptr (%1) backwards, one k step per iteration, for r13
 * iterations.  Local labels are "1<mdim><ndim>1" (loop head) and
 * "1<mdim><ndim>2" (exit).
 */
#define GEMM_RT_SIMPLE(mdim,ndim) \
  /* a_tail += mdim*r12; a_ptr = a_tail; k_counter = r13; b_ptr = b_tail */\
  "leaq (%%r15,%%r12,"#mdim"),%%r15;"\
  " movq %%r15,%0; movq %%r13,%5; movq %%r14,%1;"\
  INIT_m##mdim##n##ndim\
  /* skip the loop entirely when k_counter is zero */\
  "testq %5,%5; jz 1"#mdim""#ndim"2f;"\
  "1"#mdim""#ndim"1:\n\t"\
  /* step b_ptr back 16 bytes and a_ptr back mdim floats, then accumulate */\
  "subq $16,%1; subq $"#mdim"*4,%0;"\
  GEMM_KERNEL_k1m##mdim##n##ndim\
  "decq %5; jnz 1"#mdim""#ndim"1b;"\
  "1"#mdim""#ndim"2:\n\t"

#define GEMM_RT_m8n4  GEMM_RT_SIMPLE(8,4)
#define GEMM_RT_m8n8  GEMM_RT_SIMPLE(8,8)
|
||||||
|
/*
 * GEMM_RT_m8n12: update loop for the 8 x 12 tile, unrolled eight k steps
 * per iteration with a prefetch of the A panel on every other step.  A
 * one-step-per-iteration loop (label 18123) handles the leftover k steps.
 */
#define GEMM_RT_m8n12 \
  /* a_tail += 8*r12; a_ptr = a_tail; k_counter = r13; b_ptr = b_tail */\
  "leaq (%%r15,%%r12,8),%%r15;"\
  " movq %%r15,%0; movq %%r13,%5; movq %%r14,%1;"\
  INIT_m8n12\
  "cmpq $8,%5; jb 18122f;"\
  "18121:\n\t"\
  "prefetcht0 -384(%0);" " subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\
                         "subq $32,%0; subq $16,%1;"  GEMM_KERNEL_k1m8n12\
  "prefetcht0 -384(%0);" " subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\
                         "subq $32,%0; subq $16,%1;"  GEMM_KERNEL_k1m8n12\
  "prefetcht0 -384(%0);" " subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\
                         "subq $32,%0; subq $16,%1;"  GEMM_KERNEL_k1m8n12\
  "prefetcht0 -384(%0);" " subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\
                         "subq $32,%0; subq $16,%1;"  GEMM_KERNEL_k1m8n12\
  "subq $8,%5; cmpq $8,%5; jnb 18121b;"\
  "18122:\n\t"\
  /* remainder: fewer than 8 k steps left */\
  "testq %5,%5; jz 18124f;"\
  "18123:\n\t"\
  "subq $32,%0; subq $16,%1;"\
  GEMM_KERNEL_k1m8n12\
  "decq %5; jnz 18123b;"\
  "18124:\n\t"

/* All remaining tile shapes use the non-unrolled loop. */
#define GEMM_RT_m4n4  GEMM_RT_SIMPLE(4,4)
#define GEMM_RT_m4n8  GEMM_RT_SIMPLE(4,8)
#define GEMM_RT_m4n12 GEMM_RT_SIMPLE(4,12)
#define GEMM_RT_m2n4  GEMM_RT_SIMPLE(2,4)
#define GEMM_RT_m2n8  GEMM_RT_SIMPLE(2,8)
#define GEMM_RT_m2n12 GEMM_RT_SIMPLE(2,12)
#define GEMM_RT_m1n4  GEMM_RT_SIMPLE(1,4)
#define GEMM_RT_m1n8  GEMM_RT_SIMPLE(1,8)
#define GEMM_RT_m1n12 GEMM_RT_SIMPLE(1,12)
|
||||||
|
|
||||||
|
/*
 * COMPUTE(ndim): process one panel of ndim (4, 8 or 12) columns over all M
 * rows.  Expands to a braced block and uses these locals of the enclosing
 * function by name: a_ptr, b_ptr, c_ptr, c_tmp, ldc_bytes, k_cnt, K, OFF,
 * one, zero, M and ldc.
 *
 * Inside the asm: r15 = a_tail, r14 = b_tail, r13 = K - OFF,
 * r12 = 4*K (bytes), r11 = rows still to process.  Rows are consumed in
 * tiles of 8, then at most one tile each of 4, 2 and 1.  Labels are
 * "<ndim>771" .. "<ndim>775".
 */
#define COMPUTE(ndim) {\
  b_ptr -= (ndim-4)*K;\
  c_ptr -= ndim * ldc;\
  __asm__ __volatile__(\
    "movq %0,%%r15;"\
    " movq %6,%%r13; subq %7,%%r13;"\
    " movq %6,%%r12; salq $2,%%r12;"\
    " movq %1,%%r14;"\
    " movq %10,%%r11;"\
    /* 8-row tiles while at least 8 rows remain */\
    "cmpq $8,%%r11; jb "#ndim"772f;"\
    #ndim"771:\n\t"\
    GEMM_RT_m8n##ndim\
    SOLVE_RT_m8n##ndim\
    "subq $8,%%r11; cmpq $8,%%r11; jnb "#ndim"771b;"\
    /* optional 4-row tile */\
    #ndim"772:\n\t"\
    "testq $4,%%r11; jz "#ndim"773f;"\
    GEMM_RT_m4n##ndim\
    SOLVE_RT_m4n##ndim\
    "subq $4,%%r11;"\
    /* optional 2-row tile */\
    #ndim"773:\n\t"\
    "testq $2,%%r11; jz "#ndim"774f;"\
    GEMM_RT_m2n##ndim\
    SOLVE_RT_m2n##ndim\
    "subq $2,%%r11;"\
    /* optional 1-row tile */\
    #ndim"774:\n\t"\
    "testq $1,%%r11; jz "#ndim"775f;"\
    GEMM_RT_m1n##ndim\
    SOLVE_RT_m1n##ndim\
    "subq $1,%%r11;"\
    #ndim"775:\n\t"\
    /* hand a_tail / b_tail back to C and leave AVX state clean */\
    "movq %%r15,%0; movq %%r14,%1; vzeroupper;"\
    :"+r"(a_ptr),"+r"(b_ptr),"+r"(c_ptr),"+r"(c_tmp),"+r"(ldc_bytes),"+r"(k_cnt)\
    :"m"(K),"m"(OFF),"m"(one[0]),"m"(zero[0]),"m"(M)\
    :"r11","r12","r13","r14","r15","cc","memory",\
    "xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7",\
    "xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15");\
  a_ptr -= M * K;\
  b_ptr -= 4 * K;\
  c_ptr -= M;\
  OFF -= ndim;\
}
|
||||||
|
|
||||||
|
/*
 * Scalar solve for an m x n tile, walking the columns from n-1 down to 0.
 *
 * For each column `col`, every entry of that column of `c` is multiplied by
 * b[col*n+col] (presumably the diagonal element stored pre-inverted by the
 * packing routine -- confirm against the caller), written back to `c` and
 * to the packed buffer `a`, and then eliminated from all columns before it.
 *
 *   m, n : tile height and width
 *   a    : packed output, column `col` stored at a[col*m .. col*m+m-1]
 *   b    : n x n coefficient block, row `col` stored at b[col*n ..]
 *   c    : tile of C, column stride ldc
 */
static void solve_RT(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc){
  int col, row, prev;

  for (col = n - 1; col >= 0; col--) {
    FLOAT *b_row = b + col * n;
    FLOAT *c_col = c + col * ldc;
    FLOAT *a_col = a + col * m;
    FLOAT diag = b_row[col];

    for (row = 0; row < m; row++) {
      FLOAT solved = c_col[row] * diag;

      c_col[row] = solved;
      a_col[row] = solved;

      /* remove the solved entry's contribution from the earlier columns */
      for (prev = 0; prev < col; prev++) {
        c[prev * ldc + row] -= solved * b_row[prev];
      }
    }
  }
}
|
||||||
|
/*
 * Scalar-path handler for a narrow column chunk of width n (the caller in
 * this file passes n = 1 or n = 2).  Rows are consumed in tiles of 8, 4, 2
 * and finally 1.  For every tile: if k - offset > 0, GEMM_KERNEL_N is
 * called with alpha = -1.0 on the trailing k - offset steps, then solve_RT
 * solves the tile in place.
 */
static void COMPUTE_EDGE_1_nchunk(BLASLONG m, BLASLONG n, FLOAT *sa, FLOAT *sb, FLOAT *C, BLASLONG ldc, BLASLONG k, BLASLONG offset) {
  const BLASLONG kk = offset;
  const BLASLONG k_rest = k - kk;
  BLASLONG rows_left = m;
  FLOAT *a_tile = sa;
  FLOAT *c_tile = C;

  while (rows_left > 7) {
    if (k_rest > 0) GEMM_KERNEL_N(8, n, k_rest, -1.0, a_tile + kk * 8, sb + kk * n, c_tile, ldc);
    solve_RT(8, n, a_tile + (kk - n) * 8, sb + (kk - n) * n, c_tile, ldc);
    a_tile += k * 8;
    c_tile += 8;
    rows_left -= 8;
  }

  while (rows_left > 3) {
    if (k_rest > 0) GEMM_KERNEL_N(4, n, k_rest, -1.0, a_tile + kk * 4, sb + kk * n, c_tile, ldc);
    solve_RT(4, n, a_tile + (kk - n) * 4, sb + (kk - n) * n, c_tile, ldc);
    a_tile += k * 4;
    c_tile += 4;
    rows_left -= 4;
  }

  while (rows_left > 1) {
    if (k_rest > 0) GEMM_KERNEL_N(2, n, k_rest, -1.0, a_tile + kk * 2, sb + kk * n, c_tile, ldc);
    solve_RT(2, n, a_tile + (kk - n) * 2, sb + (kk - n) * n, c_tile, ldc);
    a_tile += k * 2;
    c_tile += 2;
    rows_left -= 2;
  }

  /* at most one row can be left at this point */
  if (rows_left > 0) {
    if (k_rest > 0) GEMM_KERNEL_N(1, n, k_rest, -1.0, a_tile + kk * 1, sb + kk * n, c_tile, ldc);
    solve_RT(1, n, a_tile + (kk - n) * 1, sb + (kk - n) * n, c_tile, ldc);
  }
}
|
||||||
|
/*
 * Kernel entry point.  Columns are processed from the end of the matrix
 * towards the start: b_ptr and c_ptr begin one-past-the-end (sb + n*k and
 * C + n*ldc) and are moved backwards before each chunk.
 *
 * A leftover single column (n & 1) and a leftover pair (n & 2) go through
 * the scalar path COMPUTE_EDGE_1_nchunk; the remaining columns are handled
 * by the asm panels COMPUTE(12), COMPUTE(8) and COMPUTE(4).
 *
 * dummy1 is not used.  Always returns 0.
 *
 * NOTE: a_ptr, b_ptr, c_ptr, c_tmp, one, zero, ldc_bytes, K, M, OFF and
 * k_cnt are referenced by name inside COMPUTE(); do not rename them.
 */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *sa, FLOAT *sb, FLOAT *C, BLASLONG ldc, BLASLONG offset){
  float *a_ptr = sa;
  float *b_ptr = sb + n * k;
  float *c_ptr = C + n * ldc;
  float *c_tmp = C;
  float one[8] = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0};
  float zero[8] = {0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0};
  uint64_t ldc_bytes = (uint64_t)ldc * sizeof(float);
  uint64_t K = (uint64_t)k;
  uint64_t M = (uint64_t)m;
  uint64_t OFF = (uint64_t)(n-offset);
  uint64_t k_cnt = 0;
  BLASLONG cols_left = n;

  if (n & 1) {
    b_ptr -= k;
    c_ptr -= ldc;
    COMPUTE_EDGE_1_nchunk(m, 1, a_ptr, b_ptr, c_ptr, ldc, k, OFF);
    OFF--;
    cols_left--;
  }

  if (n & 2) {
    b_ptr -= k * 2;
    c_ptr -= ldc * 2;
    COMPUTE_EDGE_1_nchunk(m, 2, a_ptr, b_ptr, c_ptr, ldc, k, OFF);
    OFF -= 2;
    cols_left -= 2;
  }

  while (cols_left > 11) { COMPUTE(12) cols_left -= 12; }
  while (cols_left > 7)  { COMPUTE(8)  cols_left -= 8; }
  while (cols_left > 3)  { COMPUTE(4)  cols_left -= 4; }

  return 0;
}
|
|
@ -0,0 +1,226 @@
|
||||||
|
/* r11 = m_counter, r12 = size_of_k_elements, r13 = kk, r14 = b_head, r15 = a_head */
|
||||||
|
/* register i/o: %0 = a_ptr, %1 = b_ptr, %2 = c_ptr, %3 = c_tmp, %4 = ldc, %5 = k_counter */
|
||||||
|
/* memory input: %6 = K, %7 = offset, %8 = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0}, %9 = {0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0}, %10 = M */
|
||||||
|
|
||||||
|
/*
 * Accumulator clearing.  INIT_m<rows>n<cols> zeroes exactly the vector
 * registers that the matching GEMM_KERNEL_k1m<rows>n<cols> accumulates
 * into; wider variants are built by chaining the narrower ones.
 */

/* 8 rows: four ymm registers per 4-column group */
#define init_m8n4(c1,c2,c3,c4)\
  "vpxor %%ymm"#c1",%%ymm"#c1",%%ymm"#c1";"\
  " vpxor %%ymm"#c2",%%ymm"#c2",%%ymm"#c2";"\
  "vpxor %%ymm"#c3",%%ymm"#c3",%%ymm"#c3";"\
  " vpxor %%ymm"#c4",%%ymm"#c4",%%ymm"#c4";"
#define INIT_m8n4  init_m8n4(4,5,6,7)
#define INIT_m8n8  INIT_m8n4 init_m8n4(8,9,10,11)
#define INIT_m8n12 INIT_m8n8 init_m8n4(12,13,14,15)

/* 4 rows: four xmm registers per 4-column group */
#define init_m4n4(c1,c2,c3,c4)\
  "vpxor %%xmm"#c1",%%xmm"#c1",%%xmm"#c1";"\
  " vpxor %%xmm"#c2",%%xmm"#c2",%%xmm"#c2";"\
  "vpxor %%xmm"#c3",%%xmm"#c3",%%xmm"#c3";"\
  " vpxor %%xmm"#c4",%%xmm"#c4",%%xmm"#c4";"
#define INIT_m4n4  init_m4n4(4,5,6,7)
#define INIT_m4n8  INIT_m4n4 init_m4n4(8,9,10,11)
#define INIT_m4n12 INIT_m4n8 init_m4n4(12,13,14,15)

/* 2 rows: two xmm registers per 4-column group */
#define init_m2n4(c1,c2)\
  "vpxor %%xmm"#c1",%%xmm"#c1",%%xmm"#c1";"\
  " vpxor %%xmm"#c2",%%xmm"#c2",%%xmm"#c2";"
#define INIT_m2n4  init_m2n4(4,5)
#define INIT_m2n8  INIT_m2n4 init_m2n4(6,7)
#define INIT_m2n12 INIT_m2n8 init_m2n4(8,9)

/* 1 row: one xmm register per 4-column group */
#define init_m1n4(c1) "vpxor %%xmm"#c1",%%xmm"#c1",%%xmm"#c1";"
#define INIT_m1n4  init_m1n4(4)
#define INIT_m1n8  INIT_m1n4 init_m1n4(5)
#define INIT_m1n12 INIT_m1n8 init_m1n4(6)
|
||||||
|
|
||||||
|
/*
 * One k step of the update for 8 rows.
 * ymm1/ymm2 receive the even-/odd-indexed floats of the 8-float A row at
 * (%0), each duplicated in place; ymm3 receives an 8-byte pair of B values
 * broadcast across the register.  vfnmadd231ps subtracts the products from
 * the accumulators (ymm4..ymm15).  The n8/n12 variants read the additional
 * B panels at %1 + 4*r12 and %1 + 8*r12.
 */
#define GEMM_KERNEL_k1m8n4 \
  "vmovsldup (%0),%%ymm1;"\
  " vmovshdup (%0),%%ymm2;"\
  "vbroadcastsd (%1),%%ymm3;"\
  " vfnmadd231ps %%ymm3,%%ymm1,%%ymm4; vfnmadd231ps %%ymm3,%%ymm2,%%ymm5;"\
  "vbroadcastsd 8(%1),%%ymm3;"\
  " vfnmadd231ps %%ymm3,%%ymm1,%%ymm6; vfnmadd231ps %%ymm3,%%ymm2,%%ymm7;"

#define GEMM_KERNEL_k1m8n8 GEMM_KERNEL_k1m8n4\
  "vbroadcastsd (%1,%%r12,4),%%ymm3;"\
  " vfnmadd231ps %%ymm3,%%ymm1,%%ymm8; vfnmadd231ps %%ymm3,%%ymm2,%%ymm9;"\
  "vbroadcastsd 8(%1,%%r12,4),%%ymm3;"\
  " vfnmadd231ps %%ymm3,%%ymm1,%%ymm10; vfnmadd231ps %%ymm3,%%ymm2,%%ymm11;"

#define GEMM_KERNEL_k1m8n12 GEMM_KERNEL_k1m8n8\
  "vbroadcastsd (%1,%%r12,8),%%ymm3;"\
  " vfnmadd231ps %%ymm3,%%ymm1,%%ymm12; vfnmadd231ps %%ymm3,%%ymm2,%%ymm13;"\
  "vbroadcastsd 8(%1,%%r12,8),%%ymm3;"\
  " vfnmadd231ps %%ymm3,%%ymm1,%%ymm14; vfnmadd231ps %%ymm3,%%ymm2,%%ymm15;"
|
||||||
|
|
||||||
|
/*
 * One k step of the update for 4 rows (xmm registers).
 * xmm1/xmm2 receive the even-/odd-indexed floats of the 4-float A row at
 * (%0), duplicated in place; xmm3 receives an 8-byte pair of B values
 * duplicated across the register.  Products are subtracted from
 * xmm4..xmm15.
 */
#define GEMM_KERNEL_k1m4n4 \
  "vmovsldup (%0),%%xmm1;"\
  " vmovshdup (%0),%%xmm2;"\
  "vmovddup (%1),%%xmm3;"\
  " vfnmadd231ps %%xmm3,%%xmm1,%%xmm4; vfnmadd231ps %%xmm3,%%xmm2,%%xmm5;"\
  "vmovddup 8(%1),%%xmm3;"\
  " vfnmadd231ps %%xmm3,%%xmm1,%%xmm6; vfnmadd231ps %%xmm3,%%xmm2,%%xmm7;"

#define GEMM_KERNEL_k1m4n8 GEMM_KERNEL_k1m4n4\
  "vmovddup (%1,%%r12,4),%%xmm3;"\
  " vfnmadd231ps %%xmm3,%%xmm1,%%xmm8; vfnmadd231ps %%xmm3,%%xmm2,%%xmm9;"\
  "vmovddup 8(%1,%%r12,4),%%xmm3;"\
  " vfnmadd231ps %%xmm3,%%xmm1,%%xmm10; vfnmadd231ps %%xmm3,%%xmm2,%%xmm11;"

#define GEMM_KERNEL_k1m4n12 GEMM_KERNEL_k1m4n8\
  "vmovddup (%1,%%r12,8),%%xmm3;"\
  " vfnmadd231ps %%xmm3,%%xmm1,%%xmm12; vfnmadd231ps %%xmm3,%%xmm2,%%xmm13;"\
  "vmovddup 8(%1,%%r12,8),%%xmm3;"\
  " vfnmadd231ps %%xmm3,%%xmm1,%%xmm14; vfnmadd231ps %%xmm3,%%xmm2,%%xmm15;"
|
||||||
|
|
||||||
|
/*
 * One k step of the update for 2 rows.
 * The two A floats at (%0) and 4(%0) are broadcast into xmm1 and xmm2; the
 * four B floats of each column group are loaded as a vector into xmm3.
 * Accumulators: xmm4/xmm5, xmm6/xmm7, xmm8/xmm9 (one pair per group).
 */
#define GEMM_KERNEL_k1m2n4 \
  "vbroadcastss (%0),%%xmm1;"\
  " vbroadcastss 4(%0),%%xmm2;"\
  "vmovups (%1),%%xmm3;"\
  " vfnmadd231ps %%xmm3,%%xmm1,%%xmm4; vfnmadd231ps %%xmm3,%%xmm2,%%xmm5;"

#define GEMM_KERNEL_k1m2n8 GEMM_KERNEL_k1m2n4\
  "vmovups (%1,%%r12,4),%%xmm3;"\
  " vfnmadd231ps %%xmm3,%%xmm1,%%xmm6; vfnmadd231ps %%xmm3,%%xmm2,%%xmm7;"

#define GEMM_KERNEL_k1m2n12 GEMM_KERNEL_k1m2n8\
  "vmovups (%1,%%r12,8),%%xmm3;"\
  " vfnmadd231ps %%xmm3,%%xmm1,%%xmm8; vfnmadd231ps %%xmm3,%%xmm2,%%xmm9;"

/*
 * One k step of the update for 1 row.
 * The A float at (%0) is broadcast into xmm1 and multiplied directly with
 * the B vectors in memory.  Accumulators: xmm4, xmm5, xmm6.
 */
#define GEMM_KERNEL_k1m1n4 \
  "vbroadcastss (%0),%%xmm1;"\
  " vfnmadd231ps (%1),%%xmm1,%%xmm4;"

#define GEMM_KERNEL_k1m1n8 GEMM_KERNEL_k1m1n4\
  "vfnmadd231ps (%1,%%r12,4),%%xmm1,%%xmm5;"

#define GEMM_KERNEL_k1m1n12 GEMM_KERNEL_k1m1n8\
  "vfnmadd231ps (%1,%%r12,8),%%xmm1,%%xmm6;"
|
||||||
|
|
||||||
|
/*
 * GEMM_SUM_REORDER_8x4(c1,c2,c3,c4,prefpos): load four columns of C
 * (8 floats each) through c_tmp (%3), interleave them pairwise into the
 * accumulator layout and add them to ymm<c1>..ymm<c4>.  c_tmp advances by
 * 4*ldc in total.  Each loaded column is also prefetched at byte offset
 * prefpos.  Uses ymm0..ymm3 as scratch.
 */
#define GEMM_SUM_REORDER_8x4(c1,c2,c3,c4,prefpos)\
  /* columns 0 and 1 -> c1, c2 */\
  "vmovups (%3),%%ymm0; vmovups (%3,%4,1),%%ymm1;"\
  " prefetcht1 "#prefpos"(%3); prefetcht1 "#prefpos"(%3,%4,1);"\
  " leaq (%3,%4,2),%3;"\
  "vunpcklps %%ymm1,%%ymm0,%%ymm2; vunpckhps %%ymm1,%%ymm0,%%ymm3;"\
  " vunpcklpd %%ymm3,%%ymm2,%%ymm0; vunpckhpd %%ymm3,%%ymm2,%%ymm1;"\
  "vaddps %%ymm0,%%ymm"#c1",%%ymm"#c1";"\
  " vaddps %%ymm1,%%ymm"#c2",%%ymm"#c2";"\
  /* columns 2 and 3 -> c3, c4 */\
  "vmovups (%3),%%ymm0; vmovups (%3,%4,1),%%ymm1;"\
  " prefetcht1 "#prefpos"(%3); prefetcht1 "#prefpos"(%3,%4,1);"\
  " leaq (%3,%4,2),%3;"\
  "vunpcklps %%ymm1,%%ymm0,%%ymm2; vunpckhps %%ymm1,%%ymm0,%%ymm3;"\
  " vunpcklpd %%ymm3,%%ymm2,%%ymm0; vunpckhpd %%ymm3,%%ymm2,%%ymm1;"\
  "vaddps %%ymm0,%%ymm"#c3",%%ymm"#c3";"\
  " vaddps %%ymm1,%%ymm"#c4",%%ymm"#c4";"
|
||||||
|
|
||||||
|
/*
 * GEMM_SUM_REORDER_4x4(c1,c2,c3,c4,co1,co2): load four columns of C
 * (4 floats each) through c_tmp (%3), combine them with the xmm
 * accumulators c1..c4, then pack the results pairwise into the two ymm
 * registers co1 and co2 with vperm2f128.  c_tmp advances by 4*ldc in
 * total.  Uses xmm0..xmm3 as scratch.
 */
#define GEMM_SUM_REORDER_4x4(c1,c2,c3,c4,co1,co2)\
  /* columns 0 and 1 with accumulators c1, c2 */\
  "vmovups (%3),%%xmm0; vmovups (%3,%4,1),%%xmm1;"\
  " leaq (%3,%4,2),%3;"\
  "vunpcklps %%xmm1,%%xmm0,%%xmm2;"\
  " vunpckhps %%xmm1,%%xmm0,%%xmm3;"\
  "vunpcklpd %%xmm"#c2",%%xmm"#c1",%%xmm0;"\
  " vunpckhpd %%xmm"#c2",%%xmm"#c1",%%xmm1;"\
  "vaddps %%xmm0,%%xmm2,%%xmm"#c1";"\
  " vaddps %%xmm1,%%xmm3,%%xmm"#c2";"\
  /* columns 2 and 3 with accumulators c3, c4 */\
  "vmovups (%3),%%xmm0; vmovups (%3,%4,1),%%xmm1;"\
  " leaq (%3,%4,2),%3;"\
  "vunpcklps %%xmm1,%%xmm0,%%xmm2;"\
  " vunpckhps %%xmm1,%%xmm0,%%xmm3;"\
  "vunpcklpd %%xmm"#c4",%%xmm"#c3",%%xmm0;"\
  " vunpckhpd %%xmm"#c4",%%xmm"#c3",%%xmm1;"\
  "vaddps %%xmm0,%%xmm2,%%xmm"#c3";"\
  " vaddps %%xmm1,%%xmm3,%%xmm"#c4";"\
  /* pack the four xmm results into two ymm registers */\
  "vperm2f128 $2,%%ymm"#c1",%%ymm"#c2",%%ymm"#co1";"\
  " vperm2f128 $2,%%ymm"#c3",%%ymm"#c4",%%ymm"#co2";"
|
||||||
|
|
||||||
|
/*
 * GEMM_SUM_REORDER_2x4(c1,c2): load a 2-row x 4-column piece of C through
 * c_tmp (%3), regroup it so that one register holds row 0 of all four
 * columns and the other row 1, and add those to xmm<c1> and xmm<c2>.
 * c_tmp advances by 4*ldc in total.  Uses xmm0..xmm3 as scratch.
 *
 * The final line deliberately carries no trailing backslash, so the
 * definition ends here regardless of what follows it in the file.
 */
#define GEMM_SUM_REORDER_2x4(c1,c2)\
  /* xmm0 = columns 0,1 ; xmm1 = columns 2,3 (each permuted to r0,r0,r1,r1) */\
  "vmovsd (%3),%%xmm0; vmovhpd (%3,%4,1),%%xmm0,%%xmm0; leaq (%3,%4,2),%3; vpermilps $216,%%xmm0,%%xmm0;"\
  "vmovsd (%3),%%xmm1; vmovhpd (%3,%4,1),%%xmm1,%%xmm1; leaq (%3,%4,2),%3; vpermilps $216,%%xmm1,%%xmm1;"\
  /* low halves -> row 0, high halves -> row 1 */\
  "vunpcklpd %%xmm1,%%xmm0,%%xmm2; vaddps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\
  "vunpckhpd %%xmm1,%%xmm0,%%xmm3; vaddps %%xmm3,%%xmm"#c2",%%xmm"#c2";"
|
||||||
|
|
||||||
|
/*
 * GEMM_SUM_REORDER_1x4(c1): gather one float from each of four consecutive
 * columns of C (through c_tmp, %3) into xmm1 and add it to xmm<c1>.
 * c_tmp advances by 4*ldc in total.
 */
#define GEMM_SUM_REORDER_1x4(c1)\
  "vmovss (%3),%%xmm1; vinsertps $16,(%3,%4,1),%%xmm1,%%xmm1;"\
  " leaq (%3,%4,2),%3;"\
  "vinsertps $32,(%3),%%xmm1,%%xmm1; vinsertps $48,(%3,%4,1),%%xmm1,%%xmm1;"\
  " leaq (%3,%4,2),%3;"\
  "vaddps %%xmm"#c1",%%xmm1,%%xmm"#c1";"
|
||||||
|
|
||||||
|
/*
 * Two-column solve steps acting on the even ("left") lanes.
 *
 * SOLVE_le_*: broadcast an 8-byte B pair from b_off(<address>) into ymm0,
 * build a multiplier whose odd lanes are taken from %8 (the ones vector)
 * and scale the accumulator(s) with it, so only the even lanes change.
 * The even lanes of the result are then duplicated into ymm1 (and ymm2 for
 * the 8-row form) for later SUBTRACT_* steps.
 *
 * SOLVE_leri_*: same, followed by eliminating the solved even lanes from
 * the odd lanes of the same accumulator(s): ymm0 gets its even lanes
 * replaced from %9 (the zero vector) before the vfnmadd231ps.
 *
 * The variadic tail is the address expression placed inside "( )".
 */
#define SOLVE_le_m4n2(b_off,c1,...)\
  "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0;"\
  " vblendps $170,%8,%%ymm0,%%ymm2;"\
  "vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1";"\
  "vmovsldup %%ymm"#c1",%%ymm1;"

#define SOLVE_le_m8n2(b_off,c1,c2,...)\
  "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0;"\
  " vblendps $170,%8,%%ymm0,%%ymm2;"\
  "vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1";"\
  " vmulps %%ymm2,%%ymm"#c2",%%ymm"#c2";"\
  "vmovsldup %%ymm"#c1",%%ymm1;"\
  " vmovsldup %%ymm"#c2",%%ymm2;"

#define SOLVE_leri_m4n2(b_off,c1,...)\
  SOLVE_le_m4n2(b_off,c1,__VA_ARGS__)\
  "vblendps $85,%9,%%ymm0,%%ymm0;"\
  " vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1";"

#define SOLVE_leri_m8n2(b_off,c1,c2,...)\
  SOLVE_le_m8n2(b_off,c1,c2,__VA_ARGS__)\
  "vblendps $85,%9,%%ymm0,%%ymm0;"\
  " vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1";"\
  " vfnmadd231ps %%ymm0,%%ymm2,%%ymm"#c2";"
|
||||||
|
|
||||||
|
/*
 * Two-column solve steps acting on the odd ("right") lanes; mirror image
 * of SOLVE_le_* / SOLVE_leri_*.
 *
 * SOLVE_ri_*: the multiplier takes its even lanes from %8 (the ones
 * vector), so only the odd lanes of the accumulator(s) are scaled; the odd
 * lanes of the result are duplicated into ymm1 (and ymm2).
 *
 * SOLVE_rile_*: same, followed by eliminating the solved odd lanes from
 * the even lanes: ymm0 gets its odd lanes replaced from %9 (the zero
 * vector) before the vfnmadd231ps.
 */
#define SOLVE_ri_m4n2(b_off,c1,...)\
  "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0;"\
  " vblendps $85,%8,%%ymm0,%%ymm2;"\
  "vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1";"\
  "vmovshdup %%ymm"#c1",%%ymm1;"

#define SOLVE_ri_m8n2(b_off,c1,c2,...)\
  "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0;"\
  " vblendps $85,%8,%%ymm0,%%ymm2;"\
  "vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1";"\
  " vmulps %%ymm2,%%ymm"#c2",%%ymm"#c2";"\
  "vmovshdup %%ymm"#c1",%%ymm1;"\
  " vmovshdup %%ymm"#c2",%%ymm2;"

#define SOLVE_rile_m4n2(b_off,c1,...)\
  SOLVE_ri_m4n2(b_off,c1,__VA_ARGS__)\
  "vblendps $170,%9,%%ymm0,%%ymm0;"\
  " vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1";"

#define SOLVE_rile_m8n2(b_off,c1,c2,...)\
  SOLVE_ri_m8n2(b_off,c1,c2,__VA_ARGS__)\
  "vblendps $170,%9,%%ymm0,%%ymm0;"\
  " vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1";"\
  " vfnmadd231ps %%ymm0,%%ymm2,%%ymm"#c2";"
|
||||||
|
|
||||||
|
/*
 * Four-column solve, column 1 (lane 0).
 *
 * SOLVE_col1_rtol_*: load four B floats from b_off(<address>) into xmm0,
 * replace lanes 1..3 with values from %8 (the ones vector) and scale the
 * accumulator(s), so only lane 0 changes; lane 0 of the result is then
 * broadcast into xmm1 (and xmm2 for the 2-row form).  Going right-to-left
 * there is no column left to update afterwards.
 *
 * SOLVE_col1_ltor_*: same, then eliminate the solved lane from lanes 1..3
 * (lane 0 of xmm0 is replaced from %9, the zero vector, first).
 */
#define SOLVE_col1_rtol_m1n4(b_off,c1,...)\
  "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0;"\
  " vblendps $14,%8,%%xmm0,%%xmm2;"\
  "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\
  "vpermilps $0,%%xmm"#c1",%%xmm1;"

#define SOLVE_col1_rtol_m2n4(b_off,c1,c2,...)\
  "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0;"\
  " vblendps $14,%8,%%xmm0,%%xmm2;"\
  "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\
  " vmulps %%xmm2,%%xmm"#c2",%%xmm"#c2";"\
  "vpermilps $0,%%xmm"#c1",%%xmm1;"\
  " vpermilps $0,%%xmm"#c2",%%xmm2;"

#define SOLVE_col1_ltor_m1n4(b_off,c1,...)\
  SOLVE_col1_rtol_m1n4(b_off,c1,__VA_ARGS__)\
  "vblendps $1,%9,%%xmm0,%%xmm0;"\
  " vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";"

#define SOLVE_col1_ltor_m2n4(b_off,c1,c2,...)\
  SOLVE_col1_rtol_m2n4(b_off,c1,c2,__VA_ARGS__)\
  "vblendps $1,%9,%%xmm0,%%xmm0;"\
  " vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";"\
  " vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";"
|
||||||
|
|
||||||
|
/*
 * Four-column solve, column 2 (lane 1).
 *
 * SOLVE_col2_mul_*: load four B floats into xmm0, replace lanes 0, 2 and 3
 * with values from %8 (the ones vector) and scale the accumulator(s), so
 * only lane 1 changes; lane 1 of the result is broadcast into xmm1 (and
 * xmm2 for the 2-row form).
 *
 * SOLVE_col2_rtol_*: mul step, then update lane 0 only (lanes 1..3 of xmm0
 * are replaced from %9, the zero vector).
 * SOLVE_col2_ltor_*: mul step, then update lanes 2 and 3 only (lanes 0 and
 * 1 of xmm0 are replaced from %9).
 */
#define SOLVE_col2_mul_m1n4(b_off,c1,...)\
  "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0;"\
  " vblendps $13,%8,%%xmm0,%%xmm2;"\
  "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\
  "vpermilps $85,%%xmm"#c1",%%xmm1;"

#define SOLVE_col2_mul_m2n4(b_off,c1,c2,...)\
  "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0;"\
  " vblendps $13,%8,%%xmm0,%%xmm2;"\
  "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\
  " vmulps %%xmm2,%%xmm"#c2",%%xmm"#c2";"\
  "vpermilps $85,%%xmm"#c1",%%xmm1;"\
  " vpermilps $85,%%xmm"#c2",%%xmm2;"

#define SOLVE_col2_rtol_m1n4(b_off,c1,...)\
  SOLVE_col2_mul_m1n4(b_off,c1,__VA_ARGS__)\
  "vblendps $14,%9,%%xmm0,%%xmm0;"\
  " vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";"

#define SOLVE_col2_rtol_m2n4(b_off,c1,c2,...)\
  SOLVE_col2_mul_m2n4(b_off,c1,c2,__VA_ARGS__)\
  "vblendps $14,%9,%%xmm0,%%xmm0;"\
  " vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";"\
  " vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";"

#define SOLVE_col2_ltor_m1n4(b_off,c1,...)\
  SOLVE_col2_mul_m1n4(b_off,c1,__VA_ARGS__)\
  "vblendps $3,%9,%%xmm0,%%xmm0;"\
  " vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";"

#define SOLVE_col2_ltor_m2n4(b_off,c1,c2,...)\
  SOLVE_col2_mul_m2n4(b_off,c1,c2,__VA_ARGS__)\
  "vblendps $3,%9,%%xmm0,%%xmm0;"\
  " vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";"\
  " vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";"
|
||||||
|
|
||||||
|
/*
 * Four-column solve, column 3 (lane 2).
 *
 * SOLVE_col3_mul_*: load four B floats into xmm0, replace lanes 0, 1 and 3
 * with values from %8 (the ones vector) and scale the accumulator(s), so
 * only lane 2 changes; lane 2 of the result is broadcast into xmm1 (and
 * xmm2 for the 2-row form).
 *
 * SOLVE_col3_rtol_*: mul step, then update lanes 0 and 1 only (lanes 2 and
 * 3 of xmm0 are replaced from %9, the zero vector).
 * SOLVE_col3_ltor_*: mul step, then update lane 3 only (lanes 0..2 of xmm0
 * are replaced from %9).
 */
#define SOLVE_col3_mul_m1n4(b_off,c1,...)\
  "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0;"\
  " vblendps $11,%8,%%xmm0,%%xmm2;"\
  "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\
  "vpermilps $170,%%xmm"#c1",%%xmm1;"

#define SOLVE_col3_mul_m2n4(b_off,c1,c2,...)\
  "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0;"\
  " vblendps $11,%8,%%xmm0,%%xmm2;"\
  "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\
  " vmulps %%xmm2,%%xmm"#c2",%%xmm"#c2";"\
  "vpermilps $170,%%xmm"#c1",%%xmm1;"\
  " vpermilps $170,%%xmm"#c2",%%xmm2;"

#define SOLVE_col3_rtol_m1n4(b_off,c1,...)\
  SOLVE_col3_mul_m1n4(b_off,c1,__VA_ARGS__)\
  "vblendps $12,%9,%%xmm0,%%xmm0;"\
  " vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";"

#define SOLVE_col3_rtol_m2n4(b_off,c1,c2,...)\
  SOLVE_col3_mul_m2n4(b_off,c1,c2,__VA_ARGS__)\
  "vblendps $12,%9,%%xmm0,%%xmm0;"\
  " vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";"\
  " vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";"

#define SOLVE_col3_ltor_m1n4(b_off,c1,...)\
  SOLVE_col3_mul_m1n4(b_off,c1,__VA_ARGS__)\
  "vblendps $7,%9,%%xmm0,%%xmm0;"\
  " vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";"

#define SOLVE_col3_ltor_m2n4(b_off,c1,c2,...)\
  SOLVE_col3_mul_m2n4(b_off,c1,c2,__VA_ARGS__)\
  "vblendps $7,%9,%%xmm0,%%xmm0;"\
  " vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";"\
  " vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";"
|
||||||
|
|
||||||
|
/*
 * Four-column solve, column 4 (lane 3).
 *
 * SOLVE_col4_ltor_*: load four B floats into xmm0, replace lanes 0..2 with
 * values from %8 (the ones vector) and scale the accumulator(s), so only
 * lane 3 changes; lane 3 of the result is broadcast into xmm1 (and xmm2
 * for the 2-row form).  Going left-to-right there is no column left to
 * update afterwards.
 *
 * SOLVE_col4_rtol_*: same, then eliminate the solved lane from lanes 0..2
 * (lane 3 of xmm0 is replaced from %9, the zero vector, first).
 */
#define SOLVE_col4_ltor_m1n4(b_off,c1,...)\
  "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0;"\
  " vblendps $7,%8,%%xmm0,%%xmm2;"\
  "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\
  "vpermilps $255,%%xmm"#c1",%%xmm1;"

#define SOLVE_col4_ltor_m2n4(b_off,c1,c2,...)\
  "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0;"\
  " vblendps $7,%8,%%xmm0,%%xmm2;"\
  "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\
  " vmulps %%xmm2,%%xmm"#c2",%%xmm"#c2";"\
  "vpermilps $255,%%xmm"#c1",%%xmm1;"\
  " vpermilps $255,%%xmm"#c2",%%xmm2;"

#define SOLVE_col4_rtol_m1n4(b_off,c1,...)\
  SOLVE_col4_ltor_m1n4(b_off,c1,__VA_ARGS__)\
  "vblendps $8,%9,%%xmm0,%%xmm0;"\
  " vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";"

#define SOLVE_col4_rtol_m2n4(b_off,c1,c2,...)\
  SOLVE_col4_ltor_m2n4(b_off,c1,c2,__VA_ARGS__)\
  "vblendps $8,%9,%%xmm0,%%xmm0;"\
  " vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";"\
  " vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";"
|
||||||
|
|
||||||
|
/*
 * Elimination steps for accumulators that belong to a different column
 * group than the column just solved.  They rely on the preceding SOLVE_*
 * macro having left the solved values broadcast in ymm1/xmm1 (and
 * ymm2/xmm2 for the two-register forms): B values are loaded from
 * b_off(<address>) into ymm0/xmm0 and the product is subtracted from the
 * given accumulator(s).
 */
#define SUBTRACT_m4n2(b_off,c1,...)\
  "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0;"\
  " vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1";"

#define SUBTRACT_m8n2(b_off,c1,c2,...)\
  SUBTRACT_m4n2(b_off,c1,__VA_ARGS__)\
  "vfnmadd231ps %%ymm0,%%ymm2,%%ymm"#c2";"

#define SUBTRACT_m1n4(b_off,c1,...)\
  "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0;"\
  " vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";"

#define SUBTRACT_m2n4(b_off,c1,c2,...)\
  SUBTRACT_m1n4(b_off,c1,__VA_ARGS__)\
  "vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";"
|
||||||
|
|
||||||
|
/*
 * SAVE_SOLUTION_*: write solved accumulators back.  Each macro stores the
 * values both into the packed A buffer at a_off(%0) and into C through
 * c_tmp (%3), advancing c_tmp by 2*ldc for every pair of columns written.
 */

/* 8 rows x 2 columns held in ymm<c1>/ymm<c2>; de-interleaved before storing */
#define SAVE_SOLUTION_m8n2(c1,c2,a_off)\
  "vunpcklps %%ymm"#c2",%%ymm"#c1",%%ymm0;"\
  " vunpckhps %%ymm"#c2",%%ymm"#c1",%%ymm1;"\
  "vunpcklpd %%ymm1,%%ymm0,%%ymm"#c1";"\
  " vunpckhpd %%ymm1,%%ymm0,%%ymm"#c2";"\
  "vmovups %%ymm"#c1","#a_off"(%0);"\
  " vmovups %%ymm"#c2","#a_off"+32(%0);"\
  "vmovups %%ymm"#c1",(%3); vmovups %%ymm"#c2",(%3,%4,1);"\
  " leaq (%3,%4,2),%3;"

/* 4 rows x 2 columns held in ymm<c1> */
#define SAVE_SOLUTION_m4n2(c1,a_off)\
  "vpermilps $216,%%ymm"#c1",%%ymm"#c1";"\
  " vpermpd $216,%%ymm"#c1",%%ymm"#c1";"\
  "vmovups %%ymm"#c1","#a_off"(%0);"\
  " vmovups %%xmm"#c1",(%3); vextractf128 $1,%%ymm"#c1",(%3,%4,1);"\
  " leaq (%3,%4,2),%3;"

/* 2 rows x 4 columns held in xmm<c1>/xmm<c2>; xmm0 is scratch */
#define SAVE_SOLUTION_m2n4(c1,c2,a_off)\
  "vunpcklps %%xmm"#c2",%%xmm"#c1",%%xmm0;"\
  " vmovups %%xmm0,"#a_off"(%0);"\
  " vmovsd %%xmm0,(%3); vmovhpd %%xmm0,(%3,%4,1);"\
  " leaq (%3,%4,2),%3;"\
  "vunpckhps %%xmm"#c2",%%xmm"#c1",%%xmm0;"\
  " vmovups %%xmm0,"#a_off"+16(%0);"\
  " vmovsd %%xmm0,(%3); vmovhpd %%xmm0,(%3,%4,1);"\
  " leaq (%3,%4,2),%3;"

/* 1 row x 4 columns held in xmm<c1> */
#define SAVE_SOLUTION_m1n4(c1,a_off)\
  "vmovups %%xmm"#c1","#a_off"(%0);"\
  " vmovss %%xmm"#c1",(%3); vextractps $1,%%xmm"#c1",(%3,%4,1);"\
  " leaq (%3,%4,2),%3;"\
  "vextractps $2,%%xmm"#c1",(%3); vextractps $3,%%xmm"#c1",(%3,%4,1);"\
  " leaq (%3,%4,2),%3;"
|
|
@ -12,27 +12,6 @@
|
||||||
|
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
|
|
||||||
#ifdef __cplusplus
|
|
||||||
extern "C" {
|
|
||||||
#endif
|
|
||||||
|
|
||||||
/*----------------------------------------------------------------------------*/
|
|
||||||
#ifndef lapack_int
|
|
||||||
#define lapack_int int
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#ifndef lapack_logical
|
|
||||||
#define lapack_logical lapack_int
|
|
||||||
#endif
|
|
||||||
|
|
||||||
/* f2c, hence clapack and MacOS Accelerate, returns double instead of float
|
|
||||||
* for sdot, slange, clange, etc. */
|
|
||||||
#if defined(LAPACK_F2C)
|
|
||||||
typedef double lapack_float_return;
|
|
||||||
#else
|
|
||||||
typedef float lapack_float_return;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
/* Complex types are structures equivalent to the
|
/* Complex types are structures equivalent to the
|
||||||
* Fortran complex types COMPLEX(4) and COMPLEX(8).
|
* Fortran complex types COMPLEX(4) and COMPLEX(8).
|
||||||
*
|
*
|
||||||
|
@ -88,6 +67,29 @@ extern "C" {
|
||||||
|
|
||||||
#endif /* LAPACK_COMPLEX_CUSTOM */
|
#endif /* LAPACK_COMPLEX_CUSTOM */
|
||||||
|
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
extern "C" {
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/*----------------------------------------------------------------------------*/
|
||||||
|
#ifndef lapack_int
|
||||||
|
#define lapack_int int
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifndef lapack_logical
|
||||||
|
#define lapack_logical lapack_int
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* f2c, hence clapack and MacOS Accelerate, returns double instead of float
|
||||||
|
* for sdot, slange, clange, etc. */
|
||||||
|
#if defined(LAPACK_F2C)
|
||||||
|
typedef double lapack_float_return;
|
||||||
|
#else
|
||||||
|
typedef float lapack_float_return;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
/* Callback logical functions of one, two, or three arguments are used
|
/* Callback logical functions of one, two, or three arguments are used
|
||||||
* to select eigenvalues to sort to the top left of the Schur form.
|
* to select eigenvalues to sort to the top left of the Schur form.
|
||||||
* The value is selected if function returns TRUE (non-zero). */
|
* The value is selected if function returns TRUE (non-zero). */
|
||||||
|
|
|
@ -282,7 +282,8 @@
|
||||||
$ NPLUSONE
|
$ NPLUSONE
|
||||||
* ..
|
* ..
|
||||||
* .. External Subroutines ..
|
* .. External Subroutines ..
|
||||||
EXTERNAL SCOPY, SLAORHR_COL_GETRFNP, SSCAL, STRSM, XERBLA
|
EXTERNAL SCOPY, SLAORHR_COL_GETRFNP, SSCAL, STRSM,
|
||||||
|
$XERBLA
|
||||||
* ..
|
* ..
|
||||||
* .. Intrinsic Functions ..
|
* .. Intrinsic Functions ..
|
||||||
INTRINSIC MAX, MIN
|
INTRINSIC MAX, MIN
|
||||||
|
@ -436,4 +437,4 @@
|
||||||
*
|
*
|
||||||
* End of SORHR_COL
|
* End of SORHR_COL
|
||||||
*
|
*
|
||||||
END
|
END
|
||||||
|
|
|
@ -1,7 +1,12 @@
|
||||||
TOPDIR = ..
|
TOPDIR = ..
|
||||||
include ../Makefile.system
|
include ../Makefile.system
|
||||||
|
|
||||||
|
|
||||||
|
ifeq ($(NOFORTRAN),1)
|
||||||
|
all ::
|
||||||
|
else
|
||||||
all :: level1 level2 level3
|
all :: level1 level2 level3
|
||||||
|
endif
|
||||||
|
|
||||||
level1 : sblat1 dblat1 cblat1 zblat1
|
level1 : sblat1 dblat1 cblat1 zblat1
|
||||||
ifndef CROSS
|
ifndef CROSS
|
||||||
|
|
Loading…
Reference in New Issue