Merge pull request #832 from wernsaar/develop
updated cgemm- and ctrmm-kernel for POWER8
This commit is contained in:
commit
e1cdd15b30
|
@ -33,6 +33,9 @@ LIBMKL = -L$(MKL) -lmkl_intel_lp64 -lmkl_gnu_thread -lmkl_core -lgomp -lpthread
|
||||||
# Apple vecLib
|
# Apple vecLib
|
||||||
LIBVECLIB = -framework Accelerate
|
LIBVECLIB = -framework Accelerate
|
||||||
|
|
||||||
|
ESSL=/opt/ibm/lib
|
||||||
|
LIBESSL = -lessl $(ESSL)/libxlomp_ser.so.1 $(ESSL)/libxlf90_r.so.1 $(ESSL)/libxlfmath.so.1 $(ESSL)/libxlsmp.so.1 /opt/ibm/xlC/13.1.2/lib/libxl.a
|
||||||
|
|
||||||
ifeq ($(OSNAME), WINNT)
|
ifeq ($(OSNAME), WINNT)
|
||||||
|
|
||||||
goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \
|
goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \
|
||||||
|
@ -255,7 +258,8 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \
|
||||||
|
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
essl :: sgemm.essl strmm.essl dgemm.essl dtrmm.essl \
|
||||||
|
cgemm.essl ctrmm.essl zgemm.essl ztrmm.essl
|
||||||
|
|
||||||
veclib :: slinpack.veclib dlinpack.veclib clinpack.veclib zlinpack.veclib \
|
veclib :: slinpack.veclib dlinpack.veclib clinpack.veclib zlinpack.veclib \
|
||||||
scholesky.veclib dcholesky.veclib ccholesky.veclib zcholesky.veclib \
|
scholesky.veclib dcholesky.veclib ccholesky.veclib zcholesky.veclib \
|
||||||
|
@ -443,6 +447,9 @@ sgemm.mkl : sgemm.$(SUFFIX)
|
||||||
sgemm.veclib : sgemm.$(SUFFIX)
|
sgemm.veclib : sgemm.$(SUFFIX)
|
||||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||||
|
|
||||||
|
sgemm.essl : sgemm.$(SUFFIX)
|
||||||
|
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||||
|
|
||||||
##################################### Dgemm ####################################################
|
##################################### Dgemm ####################################################
|
||||||
dgemm.goto : dgemm.$(SUFFIX) ../$(LIBNAME)
|
dgemm.goto : dgemm.$(SUFFIX) ../$(LIBNAME)
|
||||||
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
|
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
|
||||||
|
@ -459,6 +466,9 @@ dgemm.mkl : dgemm.$(SUFFIX)
|
||||||
dgemm.veclib : dgemm.$(SUFFIX)
|
dgemm.veclib : dgemm.$(SUFFIX)
|
||||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||||
|
|
||||||
|
dgemm.essl : dgemm.$(SUFFIX)
|
||||||
|
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||||
|
|
||||||
##################################### Cgemm ####################################################
|
##################################### Cgemm ####################################################
|
||||||
|
|
||||||
cgemm.goto : cgemm.$(SUFFIX) ../$(LIBNAME)
|
cgemm.goto : cgemm.$(SUFFIX) ../$(LIBNAME)
|
||||||
|
@ -476,6 +486,9 @@ cgemm.mkl : cgemm.$(SUFFIX)
|
||||||
cgemm.veclib : cgemm.$(SUFFIX)
|
cgemm.veclib : cgemm.$(SUFFIX)
|
||||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||||
|
|
||||||
|
cgemm.essl : cgemm.$(SUFFIX)
|
||||||
|
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||||
|
|
||||||
##################################### Zgemm ####################################################
|
##################################### Zgemm ####################################################
|
||||||
|
|
||||||
zgemm.goto : zgemm.$(SUFFIX) ../$(LIBNAME)
|
zgemm.goto : zgemm.$(SUFFIX) ../$(LIBNAME)
|
||||||
|
@ -493,6 +506,9 @@ zgemm.mkl : zgemm.$(SUFFIX)
|
||||||
zgemm.veclib : zgemm.$(SUFFIX)
|
zgemm.veclib : zgemm.$(SUFFIX)
|
||||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||||
|
|
||||||
|
zgemm.essl : zgemm.$(SUFFIX)
|
||||||
|
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||||
|
|
||||||
##################################### Ssymm ####################################################
|
##################################### Ssymm ####################################################
|
||||||
ssymm.goto : ssymm.$(SUFFIX) ../$(LIBNAME)
|
ssymm.goto : ssymm.$(SUFFIX) ../$(LIBNAME)
|
||||||
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
|
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
|
||||||
|
@ -575,6 +591,9 @@ strmm.mkl : strmm.$(SUFFIX)
|
||||||
strmm.veclib : strmm.$(SUFFIX)
|
strmm.veclib : strmm.$(SUFFIX)
|
||||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||||
|
|
||||||
|
strmm.essl : strmm.$(SUFFIX)
|
||||||
|
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||||
|
|
||||||
##################################### Dtrmm ####################################################
|
##################################### Dtrmm ####################################################
|
||||||
dtrmm.goto : dtrmm.$(SUFFIX) ../$(LIBNAME)
|
dtrmm.goto : dtrmm.$(SUFFIX) ../$(LIBNAME)
|
||||||
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
|
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
|
||||||
|
@ -591,6 +610,9 @@ dtrmm.mkl : dtrmm.$(SUFFIX)
|
||||||
dtrmm.veclib : dtrmm.$(SUFFIX)
|
dtrmm.veclib : dtrmm.$(SUFFIX)
|
||||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||||
|
|
||||||
|
dtrmm.essl : dtrmm.$(SUFFIX)
|
||||||
|
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||||
|
|
||||||
##################################### Ctrmm ####################################################
|
##################################### Ctrmm ####################################################
|
||||||
|
|
||||||
ctrmm.goto : ctrmm.$(SUFFIX) ../$(LIBNAME)
|
ctrmm.goto : ctrmm.$(SUFFIX) ../$(LIBNAME)
|
||||||
|
@ -608,6 +630,9 @@ ctrmm.mkl : ctrmm.$(SUFFIX)
|
||||||
ctrmm.veclib : ctrmm.$(SUFFIX)
|
ctrmm.veclib : ctrmm.$(SUFFIX)
|
||||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||||
|
|
||||||
|
ctrmm.essl : ctrmm.$(SUFFIX)
|
||||||
|
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||||
|
|
||||||
##################################### Ztrmm ####################################################
|
##################################### Ztrmm ####################################################
|
||||||
|
|
||||||
ztrmm.goto : ztrmm.$(SUFFIX) ../$(LIBNAME)
|
ztrmm.goto : ztrmm.$(SUFFIX) ../$(LIBNAME)
|
||||||
|
@ -625,6 +650,9 @@ ztrmm.mkl : ztrmm.$(SUFFIX)
|
||||||
ztrmm.veclib : ztrmm.$(SUFFIX)
|
ztrmm.veclib : ztrmm.$(SUFFIX)
|
||||||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||||
|
|
||||||
|
ztrmm.essl : ztrmm.$(SUFFIX)
|
||||||
|
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||||
|
|
||||||
##################################### Strsm ####################################################
|
##################################### Strsm ####################################################
|
||||||
strsm.goto : strsm.$(SUFFIX) ../$(LIBNAME)
|
strsm.goto : strsm.$(SUFFIX) ../$(LIBNAME)
|
||||||
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
|
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
|
||||||
|
@ -2179,7 +2207,7 @@ smallscaling: smallscaling.c ../$(LIBNAME)
|
||||||
$(CC) $(CFLAGS) -o $(@F) $^ $(EXTRALIB) -fopenmp -lm
|
$(CC) $(CFLAGS) -o $(@F) $^ $(EXTRALIB) -fopenmp -lm
|
||||||
|
|
||||||
clean ::
|
clean ::
|
||||||
@rm -f *.goto *.mkl *.acml *.atlas *.veclib
|
@rm -f *.goto *.mkl *.acml *.atlas *.veclib *.essl
|
||||||
|
|
||||||
include $(TOPDIR)/Makefile.tail
|
include $(TOPDIR)/Makefile.tail
|
||||||
|
|
||||||
|
|
|
@ -26,11 +26,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
*****************************************************************************/
|
*****************************************************************************/
|
||||||
|
|
||||||
/**************************************************************************************
|
/**************************************************************************************
|
||||||
* 2016/03/18 Werner Saar (wernsaar@googlemail.com)
|
* 2016/04/03 Werner Saar (wernsaar@googlemail.com)
|
||||||
* BLASTEST : OK
|
* BLASTEST : OK
|
||||||
* CTEST : OK
|
* CTEST : OK
|
||||||
* TEST : OK
|
* TEST : OK
|
||||||
* LAPACK-TEST : OK
|
* LAPACK-TEST : OK
|
||||||
**************************************************************************************/
|
**************************************************************************************/
|
||||||
|
|
||||||
/*********************************************************************/
|
/*********************************************************************/
|
||||||
|
@ -130,10 +130,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define o0 0
|
#define o0 0
|
||||||
#define alpha_r vs30
|
|
||||||
#define alpha_i vs31
|
|
||||||
|
|
||||||
#define TBUFFER r14
|
#define alpha_dr vs28
|
||||||
|
#define alpha_di vs29
|
||||||
|
#define alpha_sr vs30
|
||||||
|
#define alpha_si vs31
|
||||||
|
|
||||||
|
|
||||||
|
#define NOTUSED r14
|
||||||
#define L r15
|
#define L r15
|
||||||
#define o12 r16
|
#define o12 r16
|
||||||
#define o4 r17
|
#define o4 r17
|
||||||
|
@ -271,21 +275,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#include "cgemm_macros_8x4_power8.S"
|
#include "cgemm_macros_8x4_power8.S"
|
||||||
|
|
||||||
cmpwi cr0, M, 0
|
cmpwi cr0, M, 0
|
||||||
ble .L999_H1
|
ble L999_H1
|
||||||
cmpwi cr0, N, 0
|
cmpwi cr0, N, 0
|
||||||
ble .L999_H1
|
ble L999_H1
|
||||||
cmpwi cr0, K, 0
|
cmpwi cr0, K, 0
|
||||||
ble .L999_H1
|
ble L999_H1
|
||||||
|
|
||||||
slwi LDC, LDC, ZBASE_SHIFT
|
slwi LDC, LDC, ZBASE_SHIFT
|
||||||
li PRE, 256
|
li PRE, 384
|
||||||
li o4 , 4
|
li o4 , 4
|
||||||
li o8 , 8
|
li o8 , 8
|
||||||
li o12 , 12
|
li o12 , 12
|
||||||
li o16 , 16
|
li o16 , 16
|
||||||
li o32 , 32
|
li o32 , 32
|
||||||
li o48 , 48
|
li o48 , 48
|
||||||
addi TBUFFER, SP, 360
|
|
||||||
|
|
||||||
|
|
||||||
#ifdef __64BIT__
|
#ifdef __64BIT__
|
||||||
|
@ -294,14 +297,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
addi T1 , SP, 224
|
addi T1 , SP, 224
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
lxsspx alpha_r, 0, T1
|
stxsspx vs1, 0, T1
|
||||||
lxsspx alpha_i, o8, T1
|
lxsspx alpha_dr, 0, T1
|
||||||
|
stxsspx vs2, o8 , T1
|
||||||
|
lxsspx alpha_di, o8, T1
|
||||||
|
addi T1, SP, 360
|
||||||
|
li T2, 0
|
||||||
|
|
||||||
|
stw T2, 0(T1)
|
||||||
|
stw T2, 4(T1)
|
||||||
|
stw T2, 8(T1)
|
||||||
|
stxsspx alpha_dr, o12, T1
|
||||||
|
lxvw4x alpha_sr, o0 , T1
|
||||||
|
addi T1, T1, 16
|
||||||
|
|
||||||
|
stw T2, 0(T1)
|
||||||
|
stw T2, 4(T1)
|
||||||
|
stw T2, 8(T1)
|
||||||
|
stxsspx alpha_di, o12, T1
|
||||||
|
lxvw4x alpha_si, o0 , T1
|
||||||
|
|
||||||
.align 5
|
.align 5
|
||||||
|
|
||||||
#include "cgemm_logic_8x4_power8.S"
|
#include "cgemm_logic_8x4_power8.S"
|
||||||
|
|
||||||
.L999:
|
L999:
|
||||||
addi r3, 0, 0
|
addi r3, 0, 0
|
||||||
|
|
||||||
lfd f14, 0(SP)
|
lfd f14, 0(SP)
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -26,11 +26,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
*****************************************************************************/
|
*****************************************************************************/
|
||||||
|
|
||||||
/**************************************************************************************
|
/**************************************************************************************
|
||||||
* 2016/03/18 Werner Saar (wernsaar@googlemail.com)
|
* 2016/04/03 Werner Saar (wernsaar@googlemail.com)
|
||||||
* BLASTEST : OK
|
* BLASTEST : OK
|
||||||
* CTEST : OK
|
* CTEST : OK
|
||||||
* TEST : OK
|
* TEST : OK
|
||||||
* LAPACK-TEST : OK
|
* LAPACK-TEST : OK
|
||||||
**************************************************************************************/
|
**************************************************************************************/
|
||||||
|
|
||||||
/*********************************************************************/
|
/*********************************************************************/
|
||||||
|
@ -129,18 +129,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define o0 0
|
#define o0 0
|
||||||
#define alpha_r vs30
|
|
||||||
#define alpha_i vs31
|
|
||||||
#define alpha_vr vs28
|
|
||||||
#define alpha_vi vs29
|
|
||||||
|
|
||||||
|
#define alpha_dr vs28
|
||||||
|
#define alpha_di vs29
|
||||||
|
#define alpha_sr vs30
|
||||||
|
#define alpha_si vs31
|
||||||
|
|
||||||
#define o12 r12
|
#define o12 r12
|
||||||
#define KKK r13
|
#define KKK r13
|
||||||
#define K1 r14
|
#define K1 r14
|
||||||
#define L r15
|
#define L r15
|
||||||
#define o16 r16
|
#define o16 r16
|
||||||
#define TBUFFER r17
|
#define NOTUSED r17
|
||||||
#define T2 r19
|
#define T2 r19
|
||||||
#define KK r20
|
#define KK r20
|
||||||
#define o8 r21
|
#define o8 r21
|
||||||
|
@ -278,21 +278,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#include "cgemm_macros_8x4_power8.S"
|
#include "cgemm_macros_8x4_power8.S"
|
||||||
|
|
||||||
cmpwi cr0, M, 0
|
cmpwi cr0, M, 0
|
||||||
ble .L999_H1
|
ble L999_H1
|
||||||
cmpwi cr0, N, 0
|
cmpwi cr0, N, 0
|
||||||
ble .L999_H1
|
ble L999_H1
|
||||||
cmpwi cr0, K, 0
|
cmpwi cr0, K, 0
|
||||||
ble .L999_H1
|
ble L999_H1
|
||||||
|
|
||||||
slwi LDC, LDC, ZBASE_SHIFT
|
slwi LDC, LDC, ZBASE_SHIFT
|
||||||
li PRE, 256
|
li PRE, 384
|
||||||
li o4 , 4
|
li o4 , 4
|
||||||
li o8 , 8
|
li o8 , 8
|
||||||
li o12 , 12
|
li o12 , 12
|
||||||
li o16 , 16
|
li o16 , 16
|
||||||
li o32 , 32
|
li o32 , 32
|
||||||
li o48 , 48
|
li o48 , 48
|
||||||
addi TBUFFER, SP, 360
|
|
||||||
|
|
||||||
|
|
||||||
#ifdef __64BIT__
|
#ifdef __64BIT__
|
||||||
|
@ -301,14 +300,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
addi T1, SP, 224
|
addi T1, SP, 224
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
lxsspx alpha_r, 0, T1
|
lxsspx alpha_dr, 0, T1
|
||||||
lxsspx alpha_i, o8, T1
|
lxsspx alpha_di, o8, T1
|
||||||
|
addi T1, SP, 360
|
||||||
|
li T2, 0
|
||||||
|
|
||||||
|
stw T2, 0(T1)
|
||||||
|
stw T2, 4(T1)
|
||||||
|
stw T2, 8(T1)
|
||||||
|
stxsspx alpha_dr, o12, T1
|
||||||
|
lxvw4x alpha_sr, o0 , T1
|
||||||
|
addi T1, T1, 16
|
||||||
|
|
||||||
|
stw T2, 0(T1)
|
||||||
|
stw T2, 4(T1)
|
||||||
|
stw T2, 8(T1)
|
||||||
|
stxsspx alpha_di, o12, T1
|
||||||
|
lxvw4x alpha_si, o0 , T1
|
||||||
|
|
||||||
.align 5
|
.align 5
|
||||||
|
|
||||||
#include "ctrmm_logic_8x4_power8.S"
|
#include "ctrmm_logic_8x4_power8.S"
|
||||||
|
|
||||||
.L999:
|
L999:
|
||||||
addi r3, 0, 0
|
addi r3, 0, 0
|
||||||
|
|
||||||
lfd f14, 0(SP)
|
lfd f14, 0(SP)
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue