Added dgemm, dtrmm, zgemm and ztrmm kernels for POWER8

Werner Saar, 2016-03-01 07:33:56 +01:00
commit b752858d6c (parent 3e8d6ea74f)
28 changed files with 14013 additions and 166 deletions


@ -236,7 +236,7 @@ static inline int blas_quickdivide(blasint x, blasint y){
#define HAVE_PREFETCH
#endif
#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL)
#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8)
#define DCBT_ARG 0
#else
#define DCBT_ARG 8
@ -258,6 +258,13 @@ static inline int blas_quickdivide(blasint x, blasint y){
#define L1_PREFETCH dcbtst
#endif
#if defined(POWER8)
#define L1_DUALFETCH
#define L1_PREFETCHSIZE (16 + 128 * 100)
#define L1_PREFETCH dcbtst
#endif
#
#ifndef L1_PREFETCH
#define L1_PREFETCH dcbt
#endif
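
Illustrative only (not part of the commit): L1_PREFETCH selects the touch instruction (dcbtst, prefetch for store, on POWER8) and L1_PREFETCHSIZE is the byte distance to touch ahead of the pointer being streamed. A C analogue of the same idea, with GCC's __builtin_prefetch standing in for dcbt/dcbtst and the constant reused purely for illustration:

#define PREFETCH_AHEAD (16 + 128 * 100)   /* mirrors the POWER8 L1_PREFETCHSIZE above */

static void dcopy_with_prefetch(const double *src, double *dst, long n)
{
    for (long i = 0; i < n; i++) {
        /* touch the source (read) and destination (write) lines well ahead */
        __builtin_prefetch((const char *)(src + i) + PREFETCH_AHEAD, 0);
        __builtin_prefetch((char *)(dst + i) + PREFETCH_AHEAD, 1);
        dst[i] = src[i];
    }
}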


@ -66,7 +66,7 @@ char *cpuname[] = {
"POWER6",
"CELL",
"PPCG4",
"POWER8",
"POWER8"
};
char *lowercpuname[] = {
@ -78,7 +78,7 @@ char *lowercpuname[] = {
"power6",
"cell",
"ppcg4",
"power8",
"power8"
};
char *corename[] = {
@ -90,7 +90,7 @@ char *corename[] = {
"POWER6",
"CELL",
"PPCG4",
"POWER8",
"POWER8"
};
int detect(void){


@ -552,7 +552,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CORENAME "POWER5"
#endif
#if defined(FORCE_POWER6) || defined(FORCE_POWER7) || defined(FORCE_POWER8)
#if defined(FORCE_POWER6) || defined(FORCE_POWER7)
#define FORCE
#define ARCHITECTURE "POWER"
#define SUBARCHITECTURE "POWER6"
@ -565,7 +565,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CORENAME "POWER6"
#endif
#if defined(FORCE_POWER8)
#if defined(FORCE_POWER8)
#define FORCE
#define ARCHITECTURE "POWER"
#define SUBARCHITECTURE "POWER8"
@ -578,6 +578,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CORENAME "POWER8"
#endif
#ifdef FORCE_PPCG4
#define FORCE
#define ARCHITECTURE "POWER"


@ -36,6 +36,11 @@ ifeq ($(CORE), HASWELL)
USE_TRMM = 1
endif
ifeq ($(CORE), POWER8)
USE_TRMM = 1
endif
SKERNELOBJS += \


@ -1,57 +1,3 @@
SGEMM_BETA = gemm_beta.S
DGEMM_BETA = gemm_beta.S
CGEMM_BETA = zgemm_beta.S
ZGEMM_BETA = zgemm_beta.S
ifndef SSYMV_U_KERNEL
SSYMV_U_KERNEL = symv_U.S
endif
ifndef SSYMV_L_KERNEL
SSYMV_L_KERNEL = symv_L.S
endif
ifndef DSYMV_U_KERNEL
DSYMV_U_KERNEL = symv_U.S
endif
ifndef DSYMV_L_KERNEL
DSYMV_L_KERNEL = symv_L.S
endif
ifndef CSYMV_U_KERNEL
CSYMV_U_KERNEL = zsymv_U.S
endif
ifndef CSYMV_L_KERNEL
CSYMV_L_KERNEL = zsymv_L.S
endif
ifndef ZSYMV_U_KERNEL
ZSYMV_U_KERNEL = zsymv_U.S
endif
ifndef ZSYMV_L_KERNEL
ZSYMV_L_KERNEL = zsymv_L.S
endif
ifndef CHEMV_U_KERNEL
CHEMV_U_KERNEL = zsymv_U.S
endif
ifndef CHEMV_L_KERNEL
CHEMV_L_KERNEL = zsymv_L.S
endif
ifndef ZHEMV_U_KERNEL
ZHEMV_U_KERNEL = zsymv_U.S
endif
ifndef ZHEMV_L_KERNEL
ZHEMV_L_KERNEL = zsymv_L.S
endif
ifndef STRSMKERNEL_LN
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
endif


@ -1,56 +1,173 @@
SGEMMKERNEL = gemm_kernel_power6.S
SGEMMINCOPY =
SGEMMITCOPY =
SGEMMONCOPY = gemm_ncopy_4.S
SGEMMOTCOPY = gemm_tcopy_4.S
SGEMMINCOPYOBJ =
SGEMMITCOPYOBJ =
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
DGEMMKERNEL = gemm_kernel_power6.S
DGEMMINCOPY =
DGEMMITCOPY =
SGEMM_BETA = ../generic/gemm_beta.c
DGEMM_BETA = ../generic/gemm_beta.c
CGEMM_BETA = ../generic/zgemm_beta.c
ZGEMM_BETA = ../generic/zgemm_beta.c
STRMMKERNEL = ../generic/trmmkernel_2x2.c
DTRMMKERNEL = dtrmm_kernel_16x4_power8.S
CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
ZTRMMKERNEL = ztrmm_kernel_8x2_power8.S
SGEMMKERNEL = ../generic/gemmkernel_2x2.c
SGEMMONCOPY = ../generic/gemm_ncopy_2.c
SGEMMOTCOPY = ../generic/gemm_tcopy_2.c
SGEMMONCOPYOBJ = sgemm_oncopy.o
SGEMMOTCOPYOBJ = sgemm_otcopy.o
DGEMMKERNEL = dgemm_kernel_16x4_power8.S
DGEMMINCOPY = ../generic/gemm_ncopy_16.c
DGEMMITCOPY = ../generic/gemm_tcopy_16.c
DGEMMONCOPY = gemm_ncopy_4.S
DGEMMOTCOPY = gemm_tcopy_4.S
DGEMMINCOPYOBJ =
DGEMMITCOPYOBJ =
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
CGEMMKERNEL = zgemm_kernel_power6.S
CGEMMINCOPY = ../generic/zgemm_ncopy_2.c
CGEMMITCOPY = ../generic/zgemm_tcopy_2.c
CGEMMONCOPY = ../generic/zgemm_ncopy_4.c
CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
ZGEMMKERNEL = zgemm_kernel_power6.S
ZGEMMINCOPY = ../generic/zgemm_ncopy_2.c
ZGEMMITCOPY = ../generic/zgemm_tcopy_2.c
ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
DGEMMINCOPYOBJ = dgemm_incopy.o
DGEMMITCOPYOBJ = dgemm_itcopy.o
DGEMMONCOPYOBJ = dgemm_oncopy.o
DGEMMOTCOPYOBJ = dgemm_otcopy.o
STRSMKERNEL_LN = trsm_kernel_power6_LN.S
STRSMKERNEL_LT = trsm_kernel_power6_LT.S
STRSMKERNEL_RN = trsm_kernel_power6_LT.S
STRSMKERNEL_RT = trsm_kernel_power6_RT.S
CGEMMKERNEL = ../generic/zgemmkernel_2x2.c
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
CGEMMONCOPYOBJ = cgemm_oncopy.o
CGEMMOTCOPYOBJ = cgemm_otcopy.o
DTRSMKERNEL_LN = trsm_kernel_power6_LN.S
DTRSMKERNEL_LT = trsm_kernel_power6_LT.S
DTRSMKERNEL_RN = trsm_kernel_power6_LT.S
DTRSMKERNEL_RT = trsm_kernel_power6_RT.S
ZGEMMKERNEL = zgemm_kernel_8x2_power8.S
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
ZGEMMINCOPY = ../generic/zgemm_ncopy_8.c
ZGEMMITCOPY = ../generic/zgemm_tcopy_8.c
ZGEMMONCOPYOBJ = zgemm_oncopy.o
ZGEMMOTCOPYOBJ = zgemm_otcopy.o
ZGEMMINCOPYOBJ = zgemm_incopy.o
ZGEMMITCOPYOBJ = zgemm_itcopy.o
CTRSMKERNEL_LN = ztrsm_kernel_power6_LN.S
CTRSMKERNEL_LT = ztrsm_kernel_power6_LT.S
CTRSMKERNEL_RN = ztrsm_kernel_power6_LT.S
CTRSMKERNEL_RT = ztrsm_kernel_power6_RT.S
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
ZTRSMKERNEL_LN = ztrsm_kernel_power6_LN.S
ZTRSMKERNEL_LT = ztrsm_kernel_power6_LT.S
ZTRSMKERNEL_RN = ztrsm_kernel_power6_LT.S
ZTRSMKERNEL_RT = ztrsm_kernel_power6_RT.S
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
#Todo: CGEMM3MKERNEL should be 4x4 blocksizes.
CGEMM3MKERNEL = zgemm3m_kernel_8x4_sse3.S
ZGEMM3MKERNEL = zgemm3m_kernel_4x4_sse3.S
#Pure C for other kernels
SAMAXKERNEL = ../arm/amax.c
DAMAXKERNEL = ../arm/amax.c
CAMAXKERNEL = ../arm/zamax.c
ZAMAXKERNEL = ../arm/zamax.c
SAMINKERNEL = ../arm/amin.c
DAMINKERNEL = ../arm/amin.c
CAMINKERNEL = ../arm/zamin.c
ZAMINKERNEL = ../arm/zamin.c
SMAXKERNEL = ../arm/max.c
DMAXKERNEL = ../arm/max.c
SMINKERNEL = ../arm/min.c
DMINKERNEL = ../arm/min.c
ISAMAXKERNEL = ../arm/iamax.c
IDAMAXKERNEL = ../arm/iamax.c
ICAMAXKERNEL = ../arm/izamax.c
IZAMAXKERNEL = ../arm/izamax.c
ISAMINKERNEL = ../arm/iamin.c
IDAMINKERNEL = ../arm/iamin.c
ICAMINKERNEL = ../arm/izamin.c
IZAMINKERNEL = ../arm/izamin.c
ISMAXKERNEL = ../arm/imax.c
IDMAXKERNEL = ../arm/imax.c
ISMINKERNEL = ../arm/imin.c
IDMINKERNEL = ../arm/imin.c
SASUMKERNEL = ../arm/asum.c
DASUMKERNEL = ../arm/asum.c
CASUMKERNEL = ../arm/zasum.c
ZASUMKERNEL = ../arm/zasum.c
SAXPYKERNEL = ../arm/axpy.c
DAXPYKERNEL = ../arm/axpy.c
CAXPYKERNEL = ../arm/zaxpy.c
ZAXPYKERNEL = ../arm/zaxpy.c
SCOPYKERNEL = ../arm/copy.c
DCOPYKERNEL = ../arm/copy.c
CCOPYKERNEL = ../arm/zcopy.c
ZCOPYKERNEL = ../arm/zcopy.c
SDOTKERNEL = ../arm/dot.c
DDOTKERNEL = ../arm/dot.c
CDOTKERNEL = ../arm/zdot.c
ZDOTKERNEL = ../arm/zdot.c
SNRM2KERNEL = ../arm/nrm2.c
DNRM2KERNEL = ../arm/nrm2.c
CNRM2KERNEL = ../arm/znrm2.c
ZNRM2KERNEL = ../arm/znrm2.c
SROTKERNEL = ../arm/rot.c
DROTKERNEL = ../arm/rot.c
CROTKERNEL = ../arm/zrot.c
ZROTKERNEL = ../arm/zrot.c
SSCALKERNEL = ../arm/scal.c
DSCALKERNEL = ../arm/scal.c
CSCALKERNEL = ../arm/zscal.c
ZSCALKERNEL = ../arm/zscal.c
SSWAPKERNEL = ../arm/swap.c
DSWAPKERNEL = ../arm/swap.c
CSWAPKERNEL = ../arm/zswap.c
ZSWAPKERNEL = ../arm/zswap.c
SGEMVNKERNEL = ../arm/gemv_n.c
DGEMVNKERNEL = ../arm/gemv_n.c
CGEMVNKERNEL = ../arm/zgemv_n.c
ZGEMVNKERNEL = ../arm/zgemv_n.c
SGEMVTKERNEL = ../arm/gemv_t.c
DGEMVTKERNEL = ../arm/gemv_t.c
CGEMVTKERNEL = ../arm/zgemv_t.c
ZGEMVTKERNEL = ../arm/zgemv_t.c
SSYMV_U_KERNEL = ../generic/symv_k.c
SSYMV_L_KERNEL = ../generic/symv_k.c
DSYMV_U_KERNEL = ../generic/symv_k.c
DSYMV_L_KERNEL = ../generic/symv_k.c
QSYMV_U_KERNEL = ../generic/symv_k.c
QSYMV_L_KERNEL = ../generic/symv_k.c
CSYMV_U_KERNEL = ../generic/zsymv_k.c
CSYMV_L_KERNEL = ../generic/zsymv_k.c
ZSYMV_U_KERNEL = ../generic/zsymv_k.c
ZSYMV_L_KERNEL = ../generic/zsymv_k.c
XSYMV_U_KERNEL = ../generic/zsymv_k.c
XSYMV_L_KERNEL = ../generic/zsymv_k.c
ZHEMV_U_KERNEL = ../generic/zhemv_k.c
ZHEMV_L_KERNEL = ../generic/zhemv_k.c
LSAME_KERNEL = ../generic/lsame.c
SCABS_KERNEL = ../generic/cabs.c
DCABS_KERNEL = ../generic/cabs.c
QCABS_KERNEL = ../generic/cabs.c
#Dump kernel
CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c
ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c

kernel/power/def_vsx.h (new file, 64 lines)

@ -0,0 +1,64 @@
#define vs0 0
#define vs1 1
#define vs2 2
#define vs3 3
#define vs4 4
#define vs5 5
#define vs6 6
#define vs7 7
#define vs8 8
#define vs9 9
#define vs10 10
#define vs11 11
#define vs12 12
#define vs13 13
#define vs14 14
#define vs15 15
#define vs16 16
#define vs17 17
#define vs18 18
#define vs19 19
#define vs20 20
#define vs21 21
#define vs22 22
#define vs23 23
#define vs24 24
#define vs25 25
#define vs26 26
#define vs27 27
#define vs28 28
#define vs29 29
#define vs30 30
#define vs31 31
#define vs32 32
#define vs33 33
#define vs34 34
#define vs35 35
#define vs36 36
#define vs37 37
#define vs38 38
#define vs39 39
#define vs40 40
#define vs41 41
#define vs42 42
#define vs43 43
#define vs44 44
#define vs45 45
#define vs46 46
#define vs47 47
#define vs48 48
#define vs49 49
#define vs50 50
#define vs51 51
#define vs52 52
#define vs53 53
#define vs54 54
#define vs55 55
#define vs56 56
#define vs57 57
#define vs58 58
#define vs59 59
#define vs60 60
#define vs61 61
#define vs62 62
#define vs63 63
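
Illustrative only (not part of the commit): these defines give the 64 VSX registers symbolic names because the hand-written macro files address them by raw number. A rough compiler-level analogue in C (GCC inline asm, built with -mvsx; the function name is hypothetical):

#include <altivec.h>

/* multiply two vectors of doubles in whichever VSX registers (vs0..vs63)
   the compiler picks; the %x modifier prints the VSX register number,
   which is what the vs* defines spell out by hand in the kernels */
vector double vsx_mul(vector double a, vector double b)
{
    vector double r;
    __asm__("xvmuldp %x0, %x1, %x2" : "=wa"(r) : "wa"(a), "wa"(b));
    return r;
}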


@ -0,0 +1,313 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#define ASSEMBLER
#include "common.h"
#include "def_vsx.h"
#ifndef __64BIT__
#define LOAD lwz
#else
#define LOAD ld
#endif
#ifdef __64BIT__
#define STACKSIZE 320
#define ALPHA_SP 296(SP)
#define FZERO 304(SP)
#else
#define STACKSIZE 240
#define ALPHA_SP 224(SP)
#define FZERO 232(SP)
#endif
#define M r3
#define N r4
#define K r5
#ifdef linux
#ifndef __64BIT__
#define A r6
#define B r7
#define C r8
#define LDC r9
#define OFFSET r10
#else
#define A r7
#define B r8
#define C r9
#define LDC r10
#define OFFSET r6
#endif
#endif
#if defined(_AIX) || defined(__APPLE__)
#if !defined(__64BIT__) && defined(DOUBLE)
#define A r8
#define B r9
#define C r10
#define LDC r7
#define OFFSET r6
#else
#define A r7
#define B r8
#define C r9
#define LDC r10
#define OFFSET r6
#endif
#endif
#define alpha_r vs18
#define o0 0
#define o8 r15
#define o24 r16
#define ALPHA r17
#define L r18
#define T1 r19
#define KK r20
#define BB r21
#define I r22
#define J r23
#define AO r24
#define BO r25
#define CO r26
#define o16 r27
#define o32 r28
#define o48 r29
#define PRE r30
#define T2 r31
#include "dgemm_macros_16x4_power8.S"
#ifndef NEEDPARAM
PROLOGUE
PROFCODE
addi SP, SP, -STACKSIZE
li r0, 0
stfd f14, 0(SP)
stfd f15, 8(SP)
stfd f16, 16(SP)
stfd f17, 24(SP)
stfd f18, 32(SP)
stfd f19, 40(SP)
stfd f20, 48(SP)
stfd f21, 56(SP)
stfd f22, 64(SP)
stfd f23, 72(SP)
stfd f24, 80(SP)
stfd f25, 88(SP)
stfd f26, 96(SP)
stfd f27, 104(SP)
stfd f28, 112(SP)
stfd f29, 120(SP)
stfd f30, 128(SP)
stfd f31, 136(SP)
#ifdef __64BIT__
std r31, 144(SP)
std r30, 152(SP)
std r29, 160(SP)
std r28, 168(SP)
std r27, 176(SP)
std r26, 184(SP)
std r25, 192(SP)
std r24, 200(SP)
std r23, 208(SP)
std r22, 216(SP)
std r21, 224(SP)
std r20, 232(SP)
std r19, 240(SP)
std r18, 248(SP)
std r17, 256(SP)
std r16, 264(SP)
std r15, 272(SP)
#else
stw r31, 144(SP)
stw r30, 148(SP)
stw r29, 152(SP)
stw r28, 156(SP)
stw r27, 160(SP)
stw r26, 164(SP)
stw r25, 168(SP)
stw r24, 172(SP)
stw r23, 176(SP)
stw r22, 180(SP)
stw r21, 184(SP)
stw r20, 188(SP)
stw r19, 192(SP)
stw r18, 196(SP)
stw r17, 200(SP)
stw r16, 204(SP)
stw r15, 208(SP)
#endif
stfd f1, ALPHA_SP
stw r0, FZERO
#if defined(_AIX) || defined(__APPLE__)
#if !defined(__64BIT__) && defined(DOUBLE)
lwz LDC, FRAMESLOT(0) + STACKSIZE(SP)
#endif
#endif
slwi LDC, LDC, BASE_SHIFT
#if defined(TRMMKERNEL)
#if defined(linux) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
#endif
#if defined(_AIX) || defined(__APPLE__)
#ifdef __64BIT__
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
#else
#ifdef DOUBLE
lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
#else
lwz OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
#endif
#endif
#endif
#endif
cmpwi cr0, M, 0
ble L999_H1
cmpwi cr0, N, 0
ble L999_H1
cmpwi cr0, K, 0
ble L999_H1
#ifdef __64BIT__
addi ALPHA, SP, 296
#else
addi ALPHA, SP, 224
#endif
li PRE, 256
li o8 , 8
li o16, 16
li o24, 24
li o32, 32
li o48, 48
lxvdsx alpha_r, 0, ALPHA
#include "dgemm_logic_16x4_power8.S"
L999:
addi r3, 0, 0
lfd f14, 0(SP)
lfd f15, 8(SP)
lfd f16, 16(SP)
lfd f17, 24(SP)
lfd f18, 32(SP)
lfd f19, 40(SP)
lfd f20, 48(SP)
lfd f21, 56(SP)
lfd f22, 64(SP)
lfd f23, 72(SP)
lfd f24, 80(SP)
lfd f25, 88(SP)
lfd f26, 96(SP)
lfd f27, 104(SP)
lfd f28, 112(SP)
lfd f29, 120(SP)
lfd f30, 128(SP)
lfd f31, 136(SP)
#ifdef __64BIT__
ld r31, 144(SP)
ld r30, 152(SP)
ld r29, 160(SP)
ld r28, 168(SP)
ld r27, 176(SP)
ld r26, 184(SP)
ld r25, 192(SP)
ld r24, 200(SP)
ld r23, 208(SP)
ld r22, 216(SP)
ld r21, 224(SP)
ld r20, 232(SP)
ld r19, 240(SP)
ld r18, 248(SP)
ld r17, 256(SP)
ld r16, 264(SP)
ld r15, 272(SP)
#else
lwz r31, 144(SP)
lwz r30, 148(SP)
lwz r29, 152(SP)
lwz r28, 156(SP)
lwz r27, 160(SP)
lwz r26, 164(SP)
lwz r25, 168(SP)
lwz r24, 172(SP)
lwz r23, 176(SP)
lwz r22, 180(SP)
lwz r21, 184(SP)
lwz r20, 188(SP)
lwz r19, 192(SP)
lwz r18, 196(SP)
lwz r17, 200(SP)
lwz r16, 204(SP)
lwz r15, 208(SP)
#endif
addi SP, SP, STACKSIZE
blr
EPILOGUE
#endif
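
Illustrative only (not part of the commit): the prologue above saves the non-volatile registers, broadcasts alpha with lxvdsx, and then includes the macro and logic files (suppressed below) that do the actual work. As orientation, the contraction a 16x4 double-precision micro-kernel performs can be written in plain C as follows; the packed-panel layout is an assumed convention, not read out of the assembly:

/* one 16x4 tile of C accumulates A(16 x k) * B(k x 2*2), then is scaled by alpha */
static void dgemm_16x4_ref(long k, double alpha,
                           const double *a,   /* packed 16 x k panel of A */
                           const double *b,   /* packed k x 4 panel of B  */
                           double *c, long ldc)
{
    double acc[16][4] = {{0.0}};
    for (long p = 0; p < k; p++)
        for (int j = 0; j < 4; j++)
            for (int i = 0; i < 16; i++)
                acc[i][j] += a[p * 16 + i] * b[p * 4 + j];
    for (int j = 0; j < 4; j++)
        for (int i = 0; i < 16; i++)
            c[j * ldc + i] += alpha * acc[i][j];
}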

File diff suppressed because it is too large.

File diff suppressed because it is too large.


@ -0,0 +1,327 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#define ASSEMBLER
#include "common.h"
#include "def_vsx.h"
#ifndef __64BIT__
#define LOAD lwz
#else
#define LOAD ld
#endif
#ifdef __64BIT__
#define STACKSIZE 320
#define ALPHA_SP 296(SP)
#define FZERO 304(SP)
#else
#define STACKSIZE 240
#define ALPHA_SP 224(SP)
#define FZERO 232(SP)
#endif
#define M r3
#define N r4
#define K r5
#ifdef linux
#ifndef __64BIT__
#define A r6
#define B r7
#define C r8
#define LDC r9
#define OFFSET r10
#else
#define A r7
#define B r8
#define C r9
#define LDC r10
#define OFFSET r6
#endif
#endif
#if defined(_AIX) || defined(__APPLE__)
#if !defined(__64BIT__) && defined(DOUBLE)
#define A r8
#define B r9
#define C r10
#define LDC r7
#define OFFSET r6
#else
#define A r7
#define B r8
#define C r9
#define LDC r10
#define OFFSET r6
#endif
#endif
#define alpha_r vs18
#define o0 0
#define K1 r13
#define KKK r14
#define o8 r15
#define o24 r16
#define ALPHA r17
#define L r18
#define T1 r19
#define KK r20
#define BB r21
#define I r22
#define J r23
#define AO r24
#define BO r25
#define CO r26
#define o16 r27
#define o32 r28
#define o48 r29
#define PRE r30
#define T2 r31
#include "dgemm_macros_16x4_power8.S"
#ifndef NEEDPARAM
PROLOGUE
PROFCODE
addi SP, SP, -STACKSIZE
li r0, 0
stfd f14, 0(SP)
stfd f15, 8(SP)
stfd f16, 16(SP)
stfd f17, 24(SP)
stfd f18, 32(SP)
stfd f19, 40(SP)
stfd f20, 48(SP)
stfd f21, 56(SP)
stfd f22, 64(SP)
stfd f23, 72(SP)
stfd f24, 80(SP)
stfd f25, 88(SP)
stfd f26, 96(SP)
stfd f27, 104(SP)
stfd f28, 112(SP)
stfd f29, 120(SP)
stfd f30, 128(SP)
stfd f31, 136(SP)
#ifdef __64BIT__
std r31, 144(SP)
std r30, 152(SP)
std r29, 160(SP)
std r28, 168(SP)
std r27, 176(SP)
std r26, 184(SP)
std r25, 192(SP)
std r24, 200(SP)
std r23, 208(SP)
std r22, 216(SP)
std r21, 224(SP)
std r20, 232(SP)
std r19, 240(SP)
std r18, 248(SP)
std r17, 256(SP)
std r16, 264(SP)
std r15, 272(SP)
std r14, 280(SP)
std r13, 288(SP)
#else
stw r31, 144(SP)
stw r30, 148(SP)
stw r29, 152(SP)
stw r28, 156(SP)
stw r27, 160(SP)
stw r26, 164(SP)
stw r25, 168(SP)
stw r24, 172(SP)
stw r23, 176(SP)
stw r22, 180(SP)
stw r21, 184(SP)
stw r20, 188(SP)
stw r19, 192(SP)
stw r18, 196(SP)
stw r17, 200(SP)
stw r16, 204(SP)
stw r15, 208(SP)
stw r14, 212(SP)
stw r13, 216(SP)
#endif
stfd f1, ALPHA_SP
stw r0, FZERO
#if defined(_AIX) || defined(__APPLE__)
#if !defined(__64BIT__) && defined(DOUBLE)
lwz LDC, FRAMESLOT(0) + STACKSIZE(SP)
#endif
#endif
slwi LDC, LDC, BASE_SHIFT
#if defined(TRMMKERNEL)
#if defined(linux) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
#endif
#if defined(_AIX) || defined(__APPLE__)
#ifdef __64BIT__
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
#else
#ifdef DOUBLE
lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
#else
lwz OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
#endif
#endif
#endif
#endif
mr KK, OFFSET
#if defined(TRMMKERNEL) && !defined(LEFT)
neg KK, KK
#endif
cmpwi cr0, M, 0
ble L999_H1
cmpwi cr0, N, 0
ble L999_H1
cmpwi cr0, K, 0
ble L999_H1
#ifdef __64BIT__
addi ALPHA, SP, 296
#else
addi ALPHA, SP, 224
#endif
li PRE, 256
li o8 , 8
li o16, 16
li o24, 24
li o32, 32
li o48, 48
lxvdsx alpha_r, 0, ALPHA
#include "dtrmm_logic_16x4_power8.S"
L999:
addi r3, 0, 0
lfd f14, 0(SP)
lfd f15, 8(SP)
lfd f16, 16(SP)
lfd f17, 24(SP)
lfd f18, 32(SP)
lfd f19, 40(SP)
lfd f20, 48(SP)
lfd f21, 56(SP)
lfd f22, 64(SP)
lfd f23, 72(SP)
lfd f24, 80(SP)
lfd f25, 88(SP)
lfd f26, 96(SP)
lfd f27, 104(SP)
lfd f28, 112(SP)
lfd f29, 120(SP)
lfd f30, 128(SP)
lfd f31, 136(SP)
#ifdef __64BIT__
ld r31, 144(SP)
ld r30, 152(SP)
ld r29, 160(SP)
ld r28, 168(SP)
ld r27, 176(SP)
ld r26, 184(SP)
ld r25, 192(SP)
ld r24, 200(SP)
ld r23, 208(SP)
ld r22, 216(SP)
ld r21, 224(SP)
ld r20, 232(SP)
ld r19, 240(SP)
ld r18, 248(SP)
ld r17, 256(SP)
ld r16, 264(SP)
ld r15, 272(SP)
ld r14, 280(SP)
ld r13, 288(SP)
#else
lwz r31, 144(SP)
lwz r30, 148(SP)
lwz r29, 152(SP)
lwz r28, 156(SP)
lwz r27, 160(SP)
lwz r26, 164(SP)
lwz r25, 168(SP)
lwz r24, 172(SP)
lwz r23, 176(SP)
lwz r22, 180(SP)
lwz r21, 184(SP)
lwz r20, 188(SP)
lwz r19, 192(SP)
lwz r18, 196(SP)
lwz r17, 200(SP)
lwz r16, 204(SP)
lwz r15, 208(SP)
lwz r14, 212(SP)
lwz r13, 216(SP)
#endif
addi SP, SP, STACKSIZE
blr
EPILOGUE
#endif

File diff suppressed because it is too large.


@ -104,12 +104,12 @@
#define PREFETCHWSIZE 72
#endif
#ifdef POWER8
#ifdef PPCG4
#define PREFETCHSIZE 16
#define PREFETCHWSIZE 72
#endif
#ifdef PPCG4
#ifdef POWER8
#define PREFETCHSIZE 16
#define PREFETCHWSIZE 72
#endif
@ -198,7 +198,7 @@ LL(12):
STFD c12, 14 * SIZE(B)
STFD c16, 15 * SIZE(B)
#ifdef POWER6
#if defined(POWER6) || defined(POWER8)
dcbtst PREA, AO1
dcbtst PREA, AO2
dcbtst PREA, AO3


@ -108,12 +108,12 @@
#define PREFETCHWSIZE 48
#endif
#ifdef POWER8
#ifdef PPCG4
#define PREFETCHSIZE 16
#define PREFETCHWSIZE 48
#endif
#ifdef PPCG4
#ifdef POWER8
#define PREFETCHSIZE 16
#define PREFETCHWSIZE 48
#endif
@ -229,7 +229,7 @@ LL(12):
STFD c15, 14 * SIZE(B1)
STFD c16, 15 * SIZE(B1)
#ifdef POWER6
#if defined(POWER6) || defined(POWER8)
dcbtst PREA, AO1
dcbtst PREA, AO2
dcbtst PREA, AO3


@ -174,11 +174,6 @@
#define PREFETCHSIZE_C 40
#endif
#ifdef POWER8
#define PREFETCHSIZE_A 96
#define PREFETCHSIZE_C 40
#endif
#ifndef NEEDPARAM
#ifndef __64BIT__


@ -139,11 +139,6 @@
#define PREFETCHSIZE_C 8
#endif
#ifdef POWER8
#define PREFETCHSIZE_A 96
#define PREFETCHSIZE_C 8
#endif
#define y01 f0
#define y02 f1
#define y03 f2


@ -168,11 +168,7 @@
#define PREFETCHSIZE_A 40
#endif
#ifdef POWER8
#define PREFETCHSIZE_A 40
#endif
#if defined(POWER4) || defined(POWER5) || defined(POWER6) || defined(PPC970) || defined(POWER8)
#if defined(POWER4) || defined(POWER5) || defined(POWER6) || defined(PPC970)
#define NOP1
#define NOP2
#else


@ -167,11 +167,7 @@
#define PREFETCHSIZE_A 40
#endif
#ifdef POWER8
#define PREFETCHSIZE_A 40
#endif
#if defined(POWER4) || defined(POWER5) || defined(POWER6) || defined(PPC970) || defined(POWER8)
#if defined(POWER4) || defined(POWER5) || defined(POWER6) || defined(PPC970)
#define NOP1
#define NOP2
#else


@ -0,0 +1,332 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#define ASSEMBLER
#include "common.h"
#include "def_vsx.h"
#ifndef __64BIT__
#define LOAD lwz
#else
#define LOAD ld
#endif
#ifdef __64BIT__
#define STACKSIZE 320
#define ALPHA_R_SP 296(SP)
#define ALPHA_I_SP 304(SP)
#define FZERO 312(SP)
#else
#define STACKSIZE 256
#define ALPHA_R_SP 224(SP)
#define ALPHA_I_SP 232(SP)
#define FZERO 240(SP)
#endif
#define M r3
#define N r4
#define K r5
#ifdef linux
#ifndef __64BIT__
#define A r6
#define B r7
#define C r8
#define LDC r9
#define OFFSET r10
#else
#define A r8
#define B r9
#define C r10
#define LDC r6
#define OFFSET r7
#endif
#endif
#if defined(_AIX) || defined(__APPLE__)
#if !defined(__64BIT__) && defined(DOUBLE)
#define A r10
#define B r6
#define C r7
#define LDC r8
#define OFFSET r9
#else
#define A r8
#define B r9
#define C r10
#define LDC r6
#define OFFSET r7
#endif
#endif
#define o0 0
#define alpha_r vs30
#define alpha_i vs31
#define L r15
#define ALPHA r16
#define o24 r17
#define T2 r19
#define KK r20
#define o8 r21
#define I r22
#define J r23
#define AO r24
#define BO r25
#define CO r26
#define o16 r27
#define o32 r28
#define o48 r29
#define PRE r30
#define T1 r31
#ifndef NEEDPARAM
PROLOGUE
PROFCODE
addi SP, SP, -STACKSIZE
li r0, 0
stfd f14, 0(SP)
stfd f15, 8(SP)
stfd f16, 16(SP)
stfd f17, 24(SP)
stfd f18, 32(SP)
stfd f19, 40(SP)
stfd f20, 48(SP)
stfd f21, 56(SP)
stfd f22, 64(SP)
stfd f23, 72(SP)
stfd f24, 80(SP)
stfd f25, 88(SP)
stfd f26, 96(SP)
stfd f27, 104(SP)
stfd f28, 112(SP)
stfd f29, 120(SP)
stfd f30, 128(SP)
stfd f31, 136(SP)
#ifdef __64BIT__
std r31, 144(SP)
std r30, 152(SP)
std r29, 160(SP)
std r28, 168(SP)
std r27, 176(SP)
std r26, 184(SP)
std r25, 192(SP)
std r24, 200(SP)
std r23, 208(SP)
std r22, 216(SP)
std r21, 224(SP)
std r20, 232(SP)
std r19, 240(SP)
std r18, 248(SP)
std r17, 256(SP)
std r16, 264(SP)
std r15, 272(SP)
#else
stw r31, 144(SP)
stw r30, 148(SP)
stw r29, 152(SP)
stw r28, 156(SP)
stw r27, 160(SP)
stw r26, 164(SP)
stw r25, 168(SP)
stw r24, 172(SP)
stw r23, 176(SP)
stw r22, 180(SP)
stw r21, 184(SP)
stw r20, 188(SP)
stw r19, 192(SP)
stw r18, 196(SP)
stw r17, 200(SP)
stw r16, 204(SP)
stw r15, 208(SP)
#endif
stfd f1, ALPHA_R_SP
stfd f2, ALPHA_I_SP
stw r0, FZERO
#ifdef linux
#ifdef __64BIT__
ld LDC, FRAMESLOT(0) + STACKSIZE(SP)
#endif
#endif
#if defined(_AIX) || defined(__APPLE__)
#ifdef __64BIT__
ld LDC, FRAMESLOT(0) + STACKSIZE(SP)
#else
#ifdef DOUBLE
lwz B, FRAMESLOT(0) + STACKSIZE(SP)
lwz C, FRAMESLOT(1) + STACKSIZE(SP)
lwz LDC, FRAMESLOT(2) + STACKSIZE(SP)
#else
lwz LDC, FRAMESLOT(0) + STACKSIZE(SP)
#endif
#endif
#endif
#ifdef TRMMKERNEL
#if defined(linux) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
#endif
#if defined(_AIX) || defined(__APPLE__)
#ifdef __64BIT__
ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
#else
#ifdef DOUBLE
lwz OFFSET, FRAMESLOT(3) + STACKSIZE(SP)
#else
lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
#endif
#endif
#endif
#if defined(TRMMKERNEL) && !defined(LEFT)
neg KK, OFFSET
#endif
#endif
#include "zgemm_macros_8x2_power8.S"
cmpwi cr0, M, 0
ble L999
cmpwi cr0, N, 0
ble L999
cmpwi cr0, K, 0
ble L999
slwi LDC, LDC, ZBASE_SHIFT
li PRE, 256
li o8 , 8
li o16 , 16
li o24 , 24
li o32 , 32
li o48 , 48
#ifdef __64BIT__
addi ALPHA, SP, 296
#else
addi ALPHA, SP, 224
#endif
lxvdsx alpha_r, 0, ALPHA
lxvdsx alpha_i, o8, ALPHA
.align 5
#include "zgemm_logic_8x2_power8.S"
L999:
addi r3, 0, 0
lfd f14, 0(SP)
lfd f15, 8(SP)
lfd f16, 16(SP)
lfd f17, 24(SP)
lfd f18, 32(SP)
lfd f19, 40(SP)
lfd f20, 48(SP)
lfd f21, 56(SP)
lfd f22, 64(SP)
lfd f23, 72(SP)
lfd f24, 80(SP)
lfd f25, 88(SP)
lfd f26, 96(SP)
lfd f27, 104(SP)
lfd f28, 112(SP)
lfd f29, 120(SP)
lfd f30, 128(SP)
lfd f31, 136(SP)
#ifdef __64BIT__
ld r31, 144(SP)
ld r30, 152(SP)
ld r29, 160(SP)
ld r28, 168(SP)
ld r27, 176(SP)
ld r26, 184(SP)
ld r25, 192(SP)
ld r24, 200(SP)
ld r23, 208(SP)
ld r22, 216(SP)
ld r21, 224(SP)
ld r20, 232(SP)
ld r19, 240(SP)
ld r18, 248(SP)
ld r17, 256(SP)
ld r16, 264(SP)
ld r15, 272(SP)
#else
lwz r31, 144(SP)
lwz r30, 148(SP)
lwz r29, 152(SP)
lwz r28, 156(SP)
lwz r27, 160(SP)
lwz r26, 164(SP)
lwz r25, 168(SP)
lwz r24, 172(SP)
lwz r23, 176(SP)
lwz r22, 180(SP)
lwz r21, 184(SP)
lwz r20, 188(SP)
lwz r19, 192(SP)
lwz r18, 196(SP)
lwz r17, 200(SP)
lwz r16, 204(SP)
lwz r15, 208(SP)
#endif
addi SP, SP, STACKSIZE
blr
EPILOGUE
#endif
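
Illustrative only (not part of the commit): this driver mirrors the dgemm one, but alpha is complex (alpha_r/alpha_i) and the tile is 8x2. A plain C reference of the contraction, ignoring the conjugation variants the real macros handle and assuming the same packed-panel convention as above:

#include <complex.h>

static void zgemm_8x2_ref(long k, double complex alpha,
                          const double complex *a,  /* packed 8 x k panel of A */
                          const double complex *b,  /* packed k x 2 panel of B */
                          double complex *c, long ldc)
{
    double complex acc[8][2] = {{0.0}};
    for (long p = 0; p < k; p++)
        for (int j = 0; j < 2; j++)
            for (int i = 0; i < 8; i++)
                acc[i][j] += a[p * 8 + i] * b[p * 2 + j];
    for (int j = 0; j < 2; j++)
        for (int i = 0; i < 8; i++)
            c[j * ldc + i] += alpha * acc[i][j];
}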


@ -0,0 +1,901 @@
srawi. J, N, 1
ble ZGEMM_L2_END
ZGEMM_L2_BEGIN:
mr CO, C
mr AO, A
slwi T1, LDC , 1
add C, C, T1
srawi. I, M, 3
ble ZGEMM_L2x8_END
ZGEMM_L2x8_BEGIN:
mr BO, B
srawi. L, K, 3
ble ZGEMM_L2x8_SUB0
cmpwi cr0, L, 1
ble ZGEMM_L2x8_SUB4
ZGEMM_L2x8_LOOP_START:
dcbt AO, PRE
LOAD2x8_1
dcbt AO, PRE
KERNEL2x8_I1
dcbt AO, PRE
KERNEL2x8_2
dcbt AO, PRE
KERNEL2x8_1
dcbt AO, PRE
KERNEL2x8_2
dcbt AO, PRE
KERNEL2x8_1
dcbt AO, PRE
KERNEL2x8_2
dcbt AO, PRE
KERNEL2x8_1
dcbt AO, PRE
KERNEL2x8_2
addic. L, L, -2
ble ZGEMM_L2x8_LOOP_END
.align 5
ZGEMM_L2x8_LOOP:
dcbt AO, PRE
KERNEL2x8_1
dcbt AO, PRE
KERNEL2x8_2
dcbt AO, PRE
KERNEL2x8_1
dcbt AO, PRE
KERNEL2x8_2
dcbt AO, PRE
KERNEL2x8_1
dcbt AO, PRE
KERNEL2x8_2
dcbt AO, PRE
KERNEL2x8_1
dcbt AO, PRE
KERNEL2x8_2
addic. L, L, -1
bgt ZGEMM_L2x8_LOOP
ZGEMM_L2x8_LOOP_END:
dcbt AO, PRE
KERNEL2x8_1
dcbt AO, PRE
KERNEL2x8_2
dcbt AO, PRE
KERNEL2x8_1
dcbt AO, PRE
KERNEL2x8_2
dcbt AO, PRE
KERNEL2x8_1
dcbt AO, PRE
KERNEL2x8_2
dcbt AO, PRE
KERNEL2x8_1
KERNEL2x8_E2
b ZGEMM_L2x8_SUB1
ZGEMM_L2x8_SUB4:
dcbt AO, PRE
KERNEL2x8_SUBI1
dcbt AO, PRE
KERNEL2x8_SUB1
dcbt AO, PRE
KERNEL2x8_SUB1
dcbt AO, PRE
KERNEL2x8_SUB1
KERNEL2x8_SUB1
KERNEL2x8_SUB1
KERNEL2x8_SUB1
KERNEL2x8_SUB1
b ZGEMM_L2x8_SUB1
ZGEMM_L2x8_SUB0:
andi. L, K, 7
KERNEL2x8_SUBI1
addic. L, L, -1
ble ZGEMM_L2x8_SAVE
b ZGEMM_L2x8_SUB2
ZGEMM_L2x8_SUB1:
andi. L, K, 7
ble ZGEMM_L2x8_SAVE
ZGEMM_L2x8_SUB2:
KERNEL2x8_SUB1
addic. L, L, -1
bgt ZGEMM_L2x8_SUB2
ZGEMM_L2x8_SAVE:
SAVE2x8
addic. I, I, -1
bgt ZGEMM_L2x8_BEGIN
ZGEMM_L2x8_END:
ZGEMM_L2x4_BEGIN:
andi. T2, M, 7
ble ZGEMM_L2x1_END
andi. T1, M, 4
ble ZGEMM_L2x4_END
mr BO, B
srawi. L, K, 3
ble ZGEMM_L2x4_SUB0
cmpwi cr0, L, 1
ble ZGEMM_L2x4_SUB4
ZGEMM_L2x4_LOOP_START:
LOAD2x4_1
KERNEL2x4_I1
KERNEL2x4_2
KERNEL2x4_1
KERNEL2x4_2
KERNEL2x4_1
KERNEL2x4_2
KERNEL2x4_1
KERNEL2x4_2
addic. L, L, -2
ble ZGEMM_L2x4_LOOP_END
.align 5
ZGEMM_L2x4_LOOP:
KERNEL2x4_1
KERNEL2x4_2
KERNEL2x4_1
KERNEL2x4_2
KERNEL2x4_1
KERNEL2x4_2
KERNEL2x4_1
KERNEL2x4_2
addic. L, L, -1
bgt ZGEMM_L2x4_LOOP
ZGEMM_L2x4_LOOP_END:
KERNEL2x4_1
KERNEL2x4_2
KERNEL2x4_1
KERNEL2x4_2
KERNEL2x4_1
KERNEL2x4_2
KERNEL2x4_1
KERNEL2x4_E2
b ZGEMM_L2x4_SUB1
ZGEMM_L2x4_SUB4:
KERNEL2x4_SUBI1
KERNEL2x4_SUB1
KERNEL2x4_SUB1
KERNEL2x4_SUB1
KERNEL2x4_SUB1
KERNEL2x4_SUB1
KERNEL2x4_SUB1
KERNEL2x4_SUB1
b ZGEMM_L2x4_SUB1
ZGEMM_L2x4_SUB0:
andi. L, K, 7
KERNEL2x4_SUBI1
addic. L, L, -1
ble ZGEMM_L2x4_SAVE
b ZGEMM_L2x4_SUB2
ZGEMM_L2x4_SUB1:
andi. L, K, 7
ble ZGEMM_L2x4_SAVE
ZGEMM_L2x4_SUB2:
KERNEL2x4_SUB1
addic. L, L, -1
bgt ZGEMM_L2x4_SUB2
ZGEMM_L2x4_SAVE:
SAVE2x4
ZGEMM_L2x4_END:
ZGEMM_L2x2_BEGIN:
andi. T1, M, 2
ble ZGEMM_L2x2_END
mr BO, B
srawi. L, K, 3
ble ZGEMM_L2x2_SUB0
cmpwi cr0, L, 1
ble ZGEMM_L2x2_SUB4
ZGEMM_L2x2_LOOP_START:
LOAD2x2_1
KERNEL2x2_I1
KERNEL2x2_2
KERNEL2x2_1
KERNEL2x2_2
KERNEL2x2_1
KERNEL2x2_2
KERNEL2x2_1
KERNEL2x2_2
addic. L, L, -2
ble ZGEMM_L2x2_LOOP_END
.align 5
ZGEMM_L2x2_LOOP:
KERNEL2x2_1
KERNEL2x2_2
KERNEL2x2_1
KERNEL2x2_2
KERNEL2x2_1
KERNEL2x2_2
KERNEL2x2_1
KERNEL2x2_2
addic. L, L, -1
bgt ZGEMM_L2x2_LOOP
ZGEMM_L2x2_LOOP_END:
KERNEL2x2_1
KERNEL2x2_2
KERNEL2x2_1
KERNEL2x2_2
KERNEL2x2_1
KERNEL2x2_2
KERNEL2x2_1
KERNEL2x2_E2
b ZGEMM_L2x2_SUB1
ZGEMM_L2x2_SUB4:
KERNEL2x2_SUBI1
KERNEL2x2_SUB1
KERNEL2x2_SUB1
KERNEL2x2_SUB1
KERNEL2x2_SUB1
KERNEL2x2_SUB1
KERNEL2x2_SUB1
KERNEL2x2_SUB1
b ZGEMM_L2x2_SUB1
ZGEMM_L2x2_SUB0:
andi. L, K, 7
KERNEL2x2_SUBI1
addic. L, L, -1
ble ZGEMM_L2x2_SAVE
b ZGEMM_L2x2_SUB2
ZGEMM_L2x2_SUB1:
andi. L, K, 7
ble ZGEMM_L2x2_SAVE
ZGEMM_L2x2_SUB2:
KERNEL2x2_SUB1
addic. L, L, -1
bgt ZGEMM_L2x2_SUB2
ZGEMM_L2x2_SAVE:
SAVE2x2
ZGEMM_L2x2_END:
ZGEMM_L2x1_BEGIN:
andi. T1, M, 1
ble ZGEMM_L2x1_END
mr BO, B
srawi. L, K, 3
ble ZGEMM_L2x1_SUB0
cmpwi cr0, L, 1
ble ZGEMM_L2x1_SUB4
ZGEMM_L2x1_LOOP_START:
LOAD2x1_1
KERNEL2x1_I1
KERNEL2x1_2
KERNEL2x1_1
KERNEL2x1_2
KERNEL2x1_1
KERNEL2x1_2
KERNEL2x1_1
KERNEL2x1_2
addic. L, L, -2
ble ZGEMM_L2x1_LOOP_END
.align 5
ZGEMM_L2x1_LOOP:
KERNEL2x1_1
KERNEL2x1_2
KERNEL2x1_1
KERNEL2x1_2
KERNEL2x1_1
KERNEL2x1_2
KERNEL2x1_1
KERNEL2x1_2
addic. L, L, -1
bgt ZGEMM_L2x1_LOOP
ZGEMM_L2x1_LOOP_END:
KERNEL2x1_1
KERNEL2x1_2
KERNEL2x1_1
KERNEL2x1_2
KERNEL2x1_1
KERNEL2x1_2
KERNEL2x1_1
KERNEL2x1_E2
b ZGEMM_L2x1_SUB1
ZGEMM_L2x1_SUB4:
KERNEL2x1_SUBI1
KERNEL2x1_SUB1
KERNEL2x1_SUB1
KERNEL2x1_SUB1
KERNEL2x1_SUB1
KERNEL2x1_SUB1
KERNEL2x1_SUB1
KERNEL2x1_SUB1
b ZGEMM_L2x1_SUB1
ZGEMM_L2x1_SUB0:
andi. L, K, 7
KERNEL2x1_SUBI1
addic. L, L, -1
ble ZGEMM_L2x1_SAVE
b ZGEMM_L2x1_SUB2
ZGEMM_L2x1_SUB1:
andi. L, K, 7
ble ZGEMM_L2x1_SAVE
ZGEMM_L2x1_SUB2:
KERNEL2x1_SUB1
addic. L, L, -1
bgt ZGEMM_L2x1_SUB2
ZGEMM_L2x1_SAVE:
SAVE2x1
ZGEMM_L2x1_END:
slwi T1, K, 5
add B, B, T1
addic. J, J, -1
bgt ZGEMM_L2_BEGIN
andi. T2, N, 1
ble L999
ZGEMM_L2_END:
b ZGEMM_L1_BEGIN
L999_H1:
b L999
ZGEMM_L1_BEGIN:
andi. T1, N, 1
ble ZGEMM_L1_END
mr CO, C
mr AO, A
srawi. I, M, 3
ble ZGEMM_L1x8_END
ZGEMM_L1x8_BEGIN:
mr BO, B
srawi. L, K, 3
ble ZGEMM_L1x8_SUB0
cmpwi cr0, L, 1
ble ZGEMM_L1x8_SUB4
ZGEMM_L1x8_LOOP_START:
dcbt AO, PRE
LOAD1x8_1
dcbt AO, PRE
KERNEL1x8_I1
dcbt AO, PRE
KERNEL1x8_2
dcbt AO, PRE
KERNEL1x8_1
dcbt AO, PRE
KERNEL1x8_2
dcbt AO, PRE
KERNEL1x8_1
dcbt AO, PRE
KERNEL1x8_2
dcbt AO, PRE
KERNEL1x8_1
dcbt AO, PRE
KERNEL1x8_2
addic. L, L, -2
ble ZGEMM_L1x8_LOOP_END
.align 5
ZGEMM_L1x8_LOOP:
dcbt AO, PRE
KERNEL1x8_1
dcbt AO, PRE
KERNEL1x8_2
dcbt AO, PRE
KERNEL1x8_1
dcbt AO, PRE
KERNEL1x8_2
dcbt AO, PRE
KERNEL1x8_1
dcbt AO, PRE
KERNEL1x8_2
dcbt AO, PRE
KERNEL1x8_1
dcbt AO, PRE
KERNEL1x8_2
addic. L, L, -1
bgt ZGEMM_L1x8_LOOP
ZGEMM_L1x8_LOOP_END:
dcbt AO, PRE
KERNEL1x8_1
dcbt AO, PRE
KERNEL1x8_2
dcbt AO, PRE
KERNEL1x8_1
dcbt AO, PRE
KERNEL1x8_2
dcbt AO, PRE
KERNEL1x8_1
dcbt AO, PRE
KERNEL1x8_2
dcbt AO, PRE
KERNEL1x8_1
KERNEL1x8_E2
b ZGEMM_L1x8_SUB1
ZGEMM_L1x8_SUB4:
dcbt AO, PRE
KERNEL1x8_SUBI1
dcbt AO, PRE
KERNEL1x8_SUB1
dcbt AO, PRE
KERNEL1x8_SUB1
dcbt AO, PRE
KERNEL1x8_SUB1
KERNEL1x8_SUB1
KERNEL1x8_SUB1
KERNEL1x8_SUB1
KERNEL1x8_SUB1
b ZGEMM_L1x8_SUB1
ZGEMM_L1x8_SUB0:
andi. L, K, 7
KERNEL1x8_SUBI1
addic. L, L, -1
ble ZGEMM_L1x8_SAVE
b ZGEMM_L1x8_SUB2
ZGEMM_L1x8_SUB1:
andi. L, K, 7
ble ZGEMM_L1x8_SAVE
ZGEMM_L1x8_SUB2:
KERNEL1x8_SUB1
addic. L, L, -1
bgt ZGEMM_L1x8_SUB2
ZGEMM_L1x8_SAVE:
SAVE1x8
addic. I, I, -1
bgt ZGEMM_L1x8_BEGIN
ZGEMM_L1x8_END:
ZGEMM_L1x4_BEGIN:
andi. T2, M, 7
ble ZGEMM_L1x1_END
andi. T1, M, 4
ble ZGEMM_L1x4_END
mr BO, B
srawi. L, K, 3
ble ZGEMM_L1x4_SUB0
cmpwi cr0, L, 1
ble ZGEMM_L1x4_SUB4
ZGEMM_L1x4_LOOP_START:
LOAD1x4_1
KERNEL1x4_I1
KERNEL1x4_2
KERNEL1x4_1
KERNEL1x4_2
KERNEL1x4_1
KERNEL1x4_2
KERNEL1x4_1
KERNEL1x4_2
addic. L, L, -2
ble ZGEMM_L1x4_LOOP_END
.align 5
ZGEMM_L1x4_LOOP:
KERNEL1x4_1
KERNEL1x4_2
KERNEL1x4_1
KERNEL1x4_2
KERNEL1x4_1
KERNEL1x4_2
KERNEL1x4_1
KERNEL1x4_2
addic. L, L, -1
bgt ZGEMM_L1x4_LOOP
ZGEMM_L1x4_LOOP_END:
KERNEL1x4_1
KERNEL1x4_2
KERNEL1x4_1
KERNEL1x4_2
KERNEL1x4_1
KERNEL1x4_2
KERNEL1x4_1
KERNEL1x4_E2
b ZGEMM_L1x4_SUB1
ZGEMM_L1x4_SUB4:
KERNEL1x4_SUBI1
KERNEL1x4_SUB1
KERNEL1x4_SUB1
KERNEL1x4_SUB1
KERNEL1x4_SUB1
KERNEL1x4_SUB1
KERNEL1x4_SUB1
KERNEL1x4_SUB1
b ZGEMM_L1x4_SUB1
ZGEMM_L1x4_SUB0:
andi. L, K, 7
KERNEL1x4_SUBI1
addic. L, L, -1
ble ZGEMM_L1x4_SAVE
b ZGEMM_L1x4_SUB2
ZGEMM_L1x4_SUB1:
andi. L, K, 7
ble ZGEMM_L1x4_SAVE
ZGEMM_L1x4_SUB2:
KERNEL1x4_SUB1
addic. L, L, -1
bgt ZGEMM_L1x4_SUB2
ZGEMM_L1x4_SAVE:
SAVE1x4
ZGEMM_L1x4_END:
ZGEMM_L1x2_BEGIN:
andi. T1, M, 2
ble ZGEMM_L1x2_END
mr BO, B
srawi. L, K, 3
ble ZGEMM_L1x2_SUB0
cmpwi cr0, L, 1
ble ZGEMM_L1x2_SUB4
ZGEMM_L1x2_LOOP_START:
LOAD1x2_1
KERNEL1x2_I1
KERNEL1x2_2
KERNEL1x2_1
KERNEL1x2_2
KERNEL1x2_1
KERNEL1x2_2
KERNEL1x2_1
KERNEL1x2_2
addic. L, L, -2
ble ZGEMM_L1x2_LOOP_END
.align 5
ZGEMM_L1x2_LOOP:
KERNEL1x2_1
KERNEL1x2_2
KERNEL1x2_1
KERNEL1x2_2
KERNEL1x2_1
KERNEL1x2_2
KERNEL1x2_1
KERNEL1x2_2
addic. L, L, -1
bgt ZGEMM_L1x2_LOOP
ZGEMM_L1x2_LOOP_END:
KERNEL1x2_1
KERNEL1x2_2
KERNEL1x2_1
KERNEL1x2_2
KERNEL1x2_1
KERNEL1x2_2
KERNEL1x2_1
KERNEL1x2_E2
b ZGEMM_L1x2_SUB1
ZGEMM_L1x2_SUB4:
KERNEL1x2_SUBI1
KERNEL1x2_SUB1
KERNEL1x2_SUB1
KERNEL1x2_SUB1
KERNEL1x2_SUB1
KERNEL1x2_SUB1
KERNEL1x2_SUB1
KERNEL1x2_SUB1
b ZGEMM_L1x2_SUB1
ZGEMM_L1x2_SUB0:
andi. L, K, 7
KERNEL1x2_SUBI1
addic. L, L, -1
ble ZGEMM_L1x2_SAVE
b ZGEMM_L1x2_SUB2
ZGEMM_L1x2_SUB1:
andi. L, K, 7
ble ZGEMM_L1x2_SAVE
ZGEMM_L1x2_SUB2:
KERNEL1x2_SUB1
addic. L, L, -1
bgt ZGEMM_L1x2_SUB2
ZGEMM_L1x2_SAVE:
SAVE1x2
ZGEMM_L1x2_END:
ZGEMM_L1x1_BEGIN:
andi. T1, M, 1
ble ZGEMM_L1x1_END
mr BO, B
srawi. L, K, 3
ble ZGEMM_L1x1_SUB0
cmpwi cr0, L, 1
ble ZGEMM_L1x1_SUB4
ZGEMM_L1x1_LOOP_START:
LOAD1x1_1
KERNEL1x1_I1
KERNEL1x1_2
KERNEL1x1_1
KERNEL1x1_2
KERNEL1x1_1
KERNEL1x1_2
KERNEL1x1_1
KERNEL1x1_2
addic. L, L, -2
ble ZGEMM_L1x1_LOOP_END
.align 5
ZGEMM_L1x1_LOOP:
KERNEL1x1_1
KERNEL1x1_2
KERNEL1x1_1
KERNEL1x1_2
KERNEL1x1_1
KERNEL1x1_2
KERNEL1x1_1
KERNEL1x1_2
addic. L, L, -1
bgt ZGEMM_L1x1_LOOP
ZGEMM_L1x1_LOOP_END:
KERNEL1x1_1
KERNEL1x1_2
KERNEL1x1_1
KERNEL1x1_2
KERNEL1x1_1
KERNEL1x1_2
KERNEL1x1_1
KERNEL1x1_E2
b ZGEMM_L1x1_SUB1
ZGEMM_L1x1_SUB4:
KERNEL1x1_SUBI1
KERNEL1x1_SUB1
KERNEL1x1_SUB1
KERNEL1x1_SUB1
KERNEL1x1_SUB1
KERNEL1x1_SUB1
KERNEL1x1_SUB1
KERNEL1x1_SUB1
b ZGEMM_L1x1_SUB1
ZGEMM_L1x1_SUB0:
andi. L, K, 7
KERNEL1x1_SUBI1
addic. L, L, -1
ble ZGEMM_L1x1_SAVE
b ZGEMM_L1x1_SUB2
ZGEMM_L1x1_SUB1:
andi. L, K, 7
ble ZGEMM_L1x1_SAVE
ZGEMM_L1x1_SUB2:
KERNEL1x1_SUB1
addic. L, L, -1
bgt ZGEMM_L1x1_SUB2
ZGEMM_L1x1_SAVE:
SAVE1x1
ZGEMM_L1x1_END:
ZGEMM_L1_END:
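
Illustrative only (not part of the commit): every tile size above follows the same K schedule, K >> 3 software-pipelined groups of eight kernel invocations plus a K & 7 remainder loop. A scalar C sketch of that schedule, with the vector kernel abstracted to a callback:

static void tile_k_loop(long k, void (*kernel_step)(void *state), void *state)
{
    long groups = k >> 3;                  /* srawi. L, K, 3 */
    long tail   = k & 7;                   /* andi.  L, K, 7 */
    for (long l = 0; l < groups; l++)
        for (int u = 0; u < 8; u++)        /* KERNEL..._1 / _2 pairs, unrolled by 8 */
            kernel_step(state);
    for (long l = 0; l < tail; l++)        /* the ..._SUB2 remainder loop */
        kernel_step(state);
}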

File diff suppressed because it is too large.


@ -170,11 +170,6 @@
#define PREFETCHSIZE_C 24
#endif
#ifdef POWER8
#define PREFETCHSIZE_A 24
#define PREFETCHSIZE_C 24
#endif
#ifndef XCONJ
#define FMADDR FMADD
#define FMSUBR FNMSUB


@ -144,11 +144,6 @@
#define PREFETCHSIZE_C 8
#endif
#ifdef POWER8
#define PREFETCHSIZE_A 24
#define PREFETCHSIZE_C 8
#endif
#if !(defined(CONJ) && defined(XCONJ))
#define FMADDR FMADD
#define FMSUBR FNMSUB


@ -169,11 +169,7 @@
#define PREFETCHSIZE_A 112
#endif
#ifdef POWER8
#define PREFETCHSIZE_A 112
#endif
#if defined(POWER4) || defined(POWER5) || defined(POWER6) || defined(PPC970) || defined(POWER8)
#if defined(POWER4) || defined(POWER5) || defined(POWER6) || defined(PPC970)
#define NOP1
#define NOP2
#else


@ -166,11 +166,7 @@
#define PREFETCHSIZE_A 112
#endif
#ifdef POWER8
#define PREFETCHSIZE_A 112
#endif
#if defined(POWER4) || defined(POWER5) || defined(POWER6) || defined(PPC970) || defined(POWER8)
#if defined(POWER4) || defined(POWER5) || defined(POWER6) || defined(PPC970)
#define NOP1
#define NOP2
#else


@ -0,0 +1,342 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#define ASSEMBLER
#include "common.h"
#include "def_vsx.h"
#ifndef __64BIT__
#define LOAD lwz
#else
#define LOAD ld
#endif
#ifdef __64BIT__
#define STACKSIZE 320
#define ALPHA_R_SP 296(SP)
#define ALPHA_I_SP 304(SP)
#define FZERO 312(SP)
#else
#define STACKSIZE 256
#define ALPHA_R_SP 224(SP)
#define ALPHA_I_SP 232(SP)
#define FZERO 240(SP)
#endif
#define M r3
#define N r4
#define K r5
#ifdef linux
#ifndef __64BIT__
#define A r6
#define B r7
#define C r8
#define LDC r9
#define OFFSET r10
#else
#define A r8
#define B r9
#define C r10
#define LDC r6
#define OFFSET r7
#endif
#endif
#if defined(_AIX) || defined(__APPLE__)
#if !defined(__64BIT__) && defined(DOUBLE)
#define A r10
#define B r6
#define C r7
#define LDC r8
#define OFFSET r9
#else
#define A r8
#define B r9
#define C r10
#define LDC r6
#define OFFSET r7
#endif
#endif
#define o0 0
#define alpha_r vs30
#define alpha_i vs31
#define KKK r13
#define K1 r14
#define L r15
#define ALPHA r16
#define o24 r17
#define T2 r19
#define KK r20
#define o8 r21
#define I r22
#define J r23
#define AO r24
#define BO r25
#define CO r26
#define o16 r27
#define o32 r28
#define o48 r29
#define PRE r30
#define T1 r31
#ifndef NEEDPARAM
PROLOGUE
PROFCODE
addi SP, SP, -STACKSIZE
li r0, 0
stfd f14, 0(SP)
stfd f15, 8(SP)
stfd f16, 16(SP)
stfd f17, 24(SP)
stfd f18, 32(SP)
stfd f19, 40(SP)
stfd f20, 48(SP)
stfd f21, 56(SP)
stfd f22, 64(SP)
stfd f23, 72(SP)
stfd f24, 80(SP)
stfd f25, 88(SP)
stfd f26, 96(SP)
stfd f27, 104(SP)
stfd f28, 112(SP)
stfd f29, 120(SP)
stfd f30, 128(SP)
stfd f31, 136(SP)
#ifdef __64BIT__
std r31, 144(SP)
std r30, 152(SP)
std r29, 160(SP)
std r28, 168(SP)
std r27, 176(SP)
std r26, 184(SP)
std r25, 192(SP)
std r24, 200(SP)
std r23, 208(SP)
std r22, 216(SP)
std r21, 224(SP)
std r20, 232(SP)
std r19, 240(SP)
std r18, 248(SP)
std r17, 256(SP)
std r16, 264(SP)
std r15, 272(SP)
std r14, 280(SP)
std r13, 288(SP)
#else
stw r31, 144(SP)
stw r30, 148(SP)
stw r29, 152(SP)
stw r28, 156(SP)
stw r27, 160(SP)
stw r26, 164(SP)
stw r25, 168(SP)
stw r24, 172(SP)
stw r23, 176(SP)
stw r22, 180(SP)
stw r21, 184(SP)
stw r20, 188(SP)
stw r19, 192(SP)
stw r18, 196(SP)
stw r17, 200(SP)
stw r16, 204(SP)
stw r15, 208(SP)
stw r14, 212(SP)
stw r13, 216(SP)
#endif
stfd f1, ALPHA_R_SP
stfd f2, ALPHA_I_SP
stw r0, FZERO
#ifdef linux
#ifdef __64BIT__
ld LDC, FRAMESLOT(0) + STACKSIZE(SP)
#endif
#endif
#if defined(_AIX) || defined(__APPLE__)
#ifdef __64BIT__
ld LDC, FRAMESLOT(0) + STACKSIZE(SP)
#else
#ifdef DOUBLE
lwz B, FRAMESLOT(0) + STACKSIZE(SP)
lwz C, FRAMESLOT(1) + STACKSIZE(SP)
lwz LDC, FRAMESLOT(2) + STACKSIZE(SP)
#else
lwz LDC, FRAMESLOT(0) + STACKSIZE(SP)
#endif
#endif
#endif
#ifdef TRMMKERNEL
#if defined(linux) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
#endif
#if defined(_AIX) || defined(__APPLE__)
#ifdef __64BIT__
ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
#else
#ifdef DOUBLE
lwz OFFSET, FRAMESLOT(3) + STACKSIZE(SP)
#else
lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
#endif
#endif
#endif
#if defined(TRMMKERNEL) && !defined(LEFT)
neg KK, OFFSET
#endif
#endif
#include "zgemm_macros_8x2_power8.S"
cmpwi cr0, M, 0
ble L999
cmpwi cr0, N, 0
ble L999
cmpwi cr0, K, 0
ble L999
slwi LDC, LDC, ZBASE_SHIFT
li PRE, 256
li o8 , 8
li o16 , 16
li o24 , 24
li o32 , 32
li o48 , 48
#ifdef __64BIT__
addi ALPHA, SP, 296
#else
addi ALPHA, SP, 224
#endif
lxsdx alpha_r, 0, ALPHA
lxsdx alpha_i, o8, ALPHA
.align 4
#include "ztrmm_logic_8x2_power8.S"
L999:
addi r3, 0, 0
lfd f14, 0(SP)
lfd f15, 8(SP)
lfd f16, 16(SP)
lfd f17, 24(SP)
lfd f18, 32(SP)
lfd f19, 40(SP)
lfd f20, 48(SP)
lfd f21, 56(SP)
lfd f22, 64(SP)
lfd f23, 72(SP)
lfd f24, 80(SP)
lfd f25, 88(SP)
lfd f26, 96(SP)
lfd f27, 104(SP)
lfd f28, 112(SP)
lfd f29, 120(SP)
lfd f30, 128(SP)
lfd f31, 136(SP)
#ifdef __64BIT__
ld r31, 144(SP)
ld r30, 152(SP)
ld r29, 160(SP)
ld r28, 168(SP)
ld r27, 176(SP)
ld r26, 184(SP)
ld r25, 192(SP)
ld r24, 200(SP)
ld r23, 208(SP)
ld r22, 216(SP)
ld r21, 224(SP)
ld r20, 232(SP)
ld r19, 240(SP)
ld r18, 248(SP)
ld r17, 256(SP)
ld r16, 264(SP)
ld r15, 272(SP)
ld r14, 280(SP)
ld r13, 288(SP)
#else
lwz r31, 144(SP)
lwz r30, 148(SP)
lwz r29, 152(SP)
lwz r28, 156(SP)
lwz r27, 160(SP)
lwz r26, 164(SP)
lwz r25, 168(SP)
lwz r24, 172(SP)
lwz r23, 176(SP)
lwz r22, 180(SP)
lwz r21, 184(SP)
lwz r20, 188(SP)
lwz r19, 192(SP)
lwz r18, 196(SP)
lwz r17, 200(SP)
lwz r16, 204(SP)
lwz r15, 208(SP)
lwz r14, 212(SP)
lwz r13, 216(SP)
#endif
addi SP, SP, STACKSIZE
blr
EPILOGUE
#endif

File diff suppressed because it is too large.

param.h

@ -1962,35 +1962,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(POWER8)
#define SNUMOPT 4
#define DNUMOPT 4
#define DNUMOPT 8
#define GEMM_DEFAULT_OFFSET_A 384
#define GEMM_DEFAULT_OFFSET_B 1024
#define GEMM_DEFAULT_ALIGN 0x03fffUL
#define SGEMM_DEFAULT_UNROLL_M 4
#define SGEMM_DEFAULT_UNROLL_N 4
#define DGEMM_DEFAULT_UNROLL_M 4
#define SGEMM_DEFAULT_UNROLL_M 2
#define SGEMM_DEFAULT_UNROLL_N 2
#define DGEMM_DEFAULT_UNROLL_M 16
#define DGEMM_DEFAULT_UNROLL_N 4
#define CGEMM_DEFAULT_UNROLL_M 2
#define CGEMM_DEFAULT_UNROLL_N 4
#define ZGEMM_DEFAULT_UNROLL_M 2
#define ZGEMM_DEFAULT_UNROLL_N 4
#define CGEMM_DEFAULT_UNROLL_N 2
#define ZGEMM_DEFAULT_UNROLL_M 8
#define ZGEMM_DEFAULT_UNROLL_N 2
#define SGEMM_DEFAULT_P 992
#define DGEMM_DEFAULT_P 480
#define CGEMM_DEFAULT_P 488
#define ZGEMM_DEFAULT_P 248
#define ZGEMM_DEFAULT_P 240
#define SGEMM_DEFAULT_Q 504
#define DGEMM_DEFAULT_Q 504
#define DGEMM_DEFAULT_Q 720
#define CGEMM_DEFAULT_Q 400
#define ZGEMM_DEFAULT_Q 400
#define ZGEMM_DEFAULT_Q 360
#define DGEMM_DEFAULT_R 14400
#define ZGEMM_DEFAULT_R 7200
#define SYMV_P 8
#endif
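
Illustrative only (not part of the commit): the new defaults line up with the kernels registered in KERNEL.POWER8, DGEMM_DEFAULT_UNROLL_M/N = 16/4 for dgemm_kernel_16x4_power8.S and ZGEMM 8/2 for zgemm_kernel_8x2_power8.S, while the P/Q/R values set the outer cache-blocking sizes. A small sanity check in C; that P stays a multiple of UNROLL_M is an assumption about the usual OpenBLAS convention, not something stated in the diff:

#include <assert.h>

#define DGEMM_DEFAULT_UNROLL_M 16
#define DGEMM_DEFAULT_P       480
#define ZGEMM_DEFAULT_UNROLL_M  8
#define ZGEMM_DEFAULT_P       240

int main(void)
{
    /* 480 = 30 * 16 and 240 = 30 * 8: the M-direction block size divides
       evenly into micro-kernel tiles, so the blocked loops have no ragged edge */
    assert(DGEMM_DEFAULT_P % DGEMM_DEFAULT_UNROLL_M == 0);
    assert(ZGEMM_DEFAULT_P % ZGEMM_DEFAULT_UNROLL_M == 0);
    return 0;
}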
#if defined(SPARC) && defined(V7)
#define SNUMOPT 4