Merge remote-tracking branch 'origin/power8' into develop

Refs #774
This commit is contained in:
Zhang Xianyi 2016-03-05 06:03:19 -05:00
commit 8c43d7fa5f
24 changed files with 14420 additions and 59 deletions

View File

@ -236,7 +236,7 @@ static inline int blas_quickdivide(blasint x, blasint y){
#define HAVE_PREFETCH
#endif
#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL)
#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8)
#define DCBT_ARG 0
#else
#define DCBT_ARG 8
@ -258,6 +258,13 @@ static inline int blas_quickdivide(blasint x, blasint y){
#define L1_PREFETCH dcbtst
#endif
/* POWER8 builds: define L1_DUALFETCH, set L1_PREFETCHSIZE to 16 + 128 * 100
   and select dcbtst as the instruction behind L1_PREFETCH. */
#if defined(POWER8)
#define L1_DUALFETCH
#define L1_PREFETCHSIZE (16 + 128 * 100)
#define L1_PREFETCH dcbtst
#endif
#
#ifndef L1_PREFETCH
#define L1_PREFETCH dcbt
#endif
@ -790,6 +797,8 @@ Lmcount$lazy_ptr:
#define BUFFER_SIZE ( 2 << 20)
#elif defined(PPC440FP2)
#define BUFFER_SIZE ( 16 << 20)
#elif defined(POWER8)
#define BUFFER_SIZE ( 64 << 20)
#else
#define BUFFER_SIZE ( 16 << 20)
#endif

View File

@ -55,6 +55,7 @@
#define CPUTYPE_POWER6 5
#define CPUTYPE_CELL 6
#define CPUTYPE_PPCG4 7
#define CPUTYPE_POWER8 8
char *cpuname[] = {
"UNKNOWN",
@ -65,6 +66,7 @@ char *cpuname[] = {
"POWER6",
"CELL",
"PPCG4",
"POWER8"
};
char *lowercpuname[] = {
@ -76,6 +78,7 @@ char *lowercpuname[] = {
"power6",
"cell",
"ppcg4",
"power8"
};
char *corename[] = {
@ -87,6 +90,7 @@ char *corename[] = {
"POWER6",
"CELL",
"PPCG4",
"POWER8"
};
int detect(void){
@ -115,7 +119,7 @@ int detect(void){
if (!strncasecmp(p, "POWER5", 6)) return CPUTYPE_POWER5;
if (!strncasecmp(p, "POWER6", 6)) return CPUTYPE_POWER6;
if (!strncasecmp(p, "POWER7", 6)) return CPUTYPE_POWER6;
if (!strncasecmp(p, "POWER8", 6)) return CPUTYPE_POWER6;
if (!strncasecmp(p, "POWER8", 6)) return CPUTYPE_POWER8;
if (!strncasecmp(p, "Cell", 4)) return CPUTYPE_CELL;
if (!strncasecmp(p, "7447", 4)) return CPUTYPE_PPCG4;

View File

@ -552,7 +552,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CORENAME "POWER5"
#endif
#if defined(FORCE_POWER6) || defined(FORCE_POWER7) || defined(FORCE_POWER8)
#if defined(FORCE_POWER6) || defined(FORCE_POWER7)
#define FORCE
#define ARCHITECTURE "POWER"
#define SUBARCHITECTURE "POWER6"
@ -565,6 +565,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CORENAME "POWER6"
#endif
/* Forced-target description for POWER8 (selected with FORCE_POWER8).
   Defines FORCE and reports ARCHITECTURE "POWER", SUBARCHITECTURE and
   CORENAME "POWER8", SUBDIRNAME "power" and LIBNAME "power8".
   ARCHCONFIG carries -DPOWER8 together with the L1/L2 size, line size,
   associativity and DTB values listed below.
   NOTE(review): the numeric cache/DTB values are taken as given here;
   verify them against the POWER8 hardware documentation. */
#if defined(FORCE_POWER8)
#define FORCE
#define ARCHITECTURE "POWER"
#define SUBARCHITECTURE "POWER8"
#define SUBDIRNAME "power"
#define ARCHCONFIG "-DPOWER8 " \
"-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=128 " \
"-DL2_SIZE=4194304 -DL2_LINESIZE=128 " \
"-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 "
#define LIBNAME "power8"
#define CORENAME "POWER8"
#endif
#ifdef FORCE_PPCG4
#define FORCE
#define ARCHITECTURE "POWER"

View File

@ -36,6 +36,11 @@ ifeq ($(CORE), HASWELL)
USE_TRMM = 1
endif
ifeq ($(CORE), POWER8)
USE_TRMM = 1
endif
SKERNELOBJS += \

View File

@ -1,57 +1,3 @@
SGEMM_BETA = gemm_beta.S
DGEMM_BETA = gemm_beta.S
CGEMM_BETA = zgemm_beta.S
ZGEMM_BETA = zgemm_beta.S
ifndef SSYMV_U_KERNEL
SSYMV_U_KERNEL = symv_U.S
endif
ifndef SSYMV_L_KERNEL
SSYMV_L_KERNEL = symv_L.S
endif
ifndef DSYMV_U_KERNEL
DSYMV_U_KERNEL = symv_U.S
endif
ifndef DSYMV_L_KERNEL
DSYMV_L_KERNEL = symv_L.S
endif
ifndef CSYMV_U_KERNEL
CSYMV_U_KERNEL = zsymv_U.S
endif
ifndef CSYMV_L_KERNEL
CSYMV_L_KERNEL = zsymv_L.S
endif
ifndef ZSYMV_U_KERNEL
ZSYMV_U_KERNEL = zsymv_U.S
endif
ifndef ZSYMV_L_KERNEL
ZSYMV_L_KERNEL = zsymv_L.S
endif
ifndef CHEMV_U_KERNEL
CHEMV_U_KERNEL = zsymv_U.S
endif
ifndef CHEMV_L_KERNEL
CHEMV_L_KERNEL = zsymv_L.S
endif
ifndef ZHEMV_U_KERNEL
ZHEMV_U_KERNEL = zsymv_U.S
endif
ifndef ZHEMV_L_KERNEL
ZHEMV_L_KERNEL = zsymv_L.S
endif
ifndef STRSMKERNEL_LN
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
endif
@ -84,3 +30,19 @@ ifndef CTRSMKERNEL_RT
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
endif
ifndef SGEMM_BETA
SGEMM_BETA = gemm_beta.S
endif
ifndef DGEMM_BETA
DGEMM_BETA = gemm_beta.S
endif
ifndef CGEMM_BETA
CGEMM_BETA = zgemm_beta.S
endif
ifndef ZGEMM_BETA
ZGEMM_BETA = zgemm_beta.S
endif

175
kernel/power/KERNEL.POWER8 Normal file
View File

@ -0,0 +1,175 @@
# Kernel selection for the POWER8 target.
# Each variable names the source file implementing one kernel. Entries under
# ../generic/ are C sources; the other entries are assembly (.S) files,
# presumably located next to this file.
#
# The four *GEMM_BETA overrides below are disabled (commented out).
#SGEMM_BETA = ../generic/gemm_beta.c
#DGEMM_BETA = ../generic/gemm_beta.c
#CGEMM_BETA = ../generic/zgemm_beta.c
#ZGEMM_BETA = ../generic/zgemm_beta.c
# TRMM kernels: double and double-complex use the POWER8-specific files,
# single precision reuses the POWER6 GEMM kernel, single-complex uses the
# generic 2x2 C kernel.
STRMMKERNEL = gemm_kernel_power6.S
DTRMMKERNEL = dtrmm_kernel_16x4_power8.S
CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
ZTRMMKERNEL = ztrmm_kernel_8x2_power8.S
# SGEMM: POWER6 kernel with the generic 4-wide copy routines.
SGEMMKERNEL = gemm_kernel_power6.S
SGEMMONCOPY = ../generic/gemm_ncopy_4.c
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
SGEMMONCOPYOBJ = sgemm_oncopy.o
SGEMMOTCOPYOBJ = sgemm_otcopy.o
# DGEMM: POWER8 16x4 kernel; the "I" copies are the generic 16-wide routines,
# the "O" copies are the 4-wide assembly routines.
DGEMMKERNEL = dgemm_kernel_16x4_power8.S
DGEMMINCOPY = ../generic/gemm_ncopy_16.c
DGEMMITCOPY = ../generic/gemm_tcopy_16.c
DGEMMONCOPY = gemm_ncopy_4.S
DGEMMOTCOPY = gemm_tcopy_4.S
DGEMMINCOPYOBJ = dgemm_incopy.o
DGEMMITCOPYOBJ = dgemm_itcopy.o
DGEMMONCOPYOBJ = dgemm_oncopy.o
DGEMMOTCOPYOBJ = dgemm_otcopy.o
# CGEMM: generic 2x2 C kernel with the generic 2-wide copy routines.
CGEMMKERNEL = ../generic/zgemmkernel_2x2.c
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
CGEMMONCOPYOBJ = cgemm_oncopy.o
CGEMMOTCOPYOBJ = cgemm_otcopy.o
# ZGEMM: POWER8 8x2 kernel; "O" copies are 2-wide, "I" copies are 8-wide.
ZGEMMKERNEL = zgemm_kernel_8x2_power8.S
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
ZGEMMINCOPY = ../generic/zgemm_ncopy_8.c
ZGEMMITCOPY = ../generic/zgemm_tcopy_8.c
ZGEMMONCOPYOBJ = zgemm_oncopy.o
ZGEMMOTCOPYOBJ = zgemm_otcopy.o
ZGEMMINCOPYOBJ = zgemm_incopy.o
ZGEMMITCOPYOBJ = zgemm_itcopy.o
# TRSM kernels: all four precisions and all four variants (LN/LT/RN/RT)
# use the generic C implementations.
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
#Todo: CGEMM3MKERNEL should be 4x4 blocksizes.
#CGEMM3MKERNEL = zgemm3m_kernel_8x4_sse3.S
#ZGEMM3MKERNEL = zgemm3m_kernel_4x4_sse3.S
#Pure C for other kernels
#SAMAXKERNEL = ../arm/amax.c
#DAMAXKERNEL = ../arm/amax.c
#CAMAXKERNEL = ../arm/zamax.c
#ZAMAXKERNEL = ../arm/zamax.c
#
#SAMINKERNEL = ../arm/amin.c
#DAMINKERNEL = ../arm/amin.c
#CAMINKERNEL = ../arm/zamin.c
#ZAMINKERNEL = ../arm/zamin.c
#
#SMAXKERNEL = ../arm/max.c
#DMAXKERNEL = ../arm/max.c
#
#SMINKERNEL = ../arm/min.c
#DMINKERNEL = ../arm/min.c
#
#ISAMAXKERNEL = ../arm/iamax.c
#IDAMAXKERNEL = ../arm/iamax.c
#ICAMAXKERNEL = ../arm/izamax.c
#IZAMAXKERNEL = ../arm/izamax.c
#
#ISAMINKERNEL = ../arm/iamin.c
#IDAMINKERNEL = ../arm/iamin.c
#ICAMINKERNEL = ../arm/izamin.c
#IZAMINKERNEL = ../arm/izamin.c
#
#ISMAXKERNEL = ../arm/imax.c
#IDMAXKERNEL = ../arm/imax.c
#
#ISMINKERNEL = ../arm/imin.c
#IDMINKERNEL = ../arm/imin.c
#
#SASUMKERNEL = ../arm/asum.c
#DASUMKERNEL = ../arm/asum.c
#CASUMKERNEL = ../arm/zasum.c
#ZASUMKERNEL = ../arm/zasum.c
#
#SAXPYKERNEL = ../arm/axpy.c
#DAXPYKERNEL = ../arm/axpy.c
#CAXPYKERNEL = ../arm/zaxpy.c
#ZAXPYKERNEL = ../arm/zaxpy.c
#
#SCOPYKERNEL = ../arm/copy.c
#DCOPYKERNEL = ../arm/copy.c
#CCOPYKERNEL = ../arm/zcopy.c
#ZCOPYKERNEL = ../arm/zcopy.c
#
#SDOTKERNEL = ../arm/dot.c
#DDOTKERNEL = ../arm/dot.c
#CDOTKERNEL = ../arm/zdot.c
#ZDOTKERNEL = ../arm/zdot.c
#
#SNRM2KERNEL = ../arm/nrm2.c
#DNRM2KERNEL = ../arm/nrm2.c
#CNRM2KERNEL = ../arm/znrm2.c
#ZNRM2KERNEL = ../arm/znrm2.c
#
#SROTKERNEL = ../arm/rot.c
#DROTKERNEL = ../arm/rot.c
#CROTKERNEL = ../arm/zrot.c
#ZROTKERNEL = ../arm/zrot.c
#
#SSCALKERNEL = ../arm/scal.c
#DSCALKERNEL = ../arm/scal.c
#CSCALKERNEL = ../arm/zscal.c
#ZSCALKERNEL = ../arm/zscal.c
#
#SSWAPKERNEL = ../arm/swap.c
#DSWAPKERNEL = ../arm/swap.c
#CSWAPKERNEL = ../arm/zswap.c
#ZSWAPKERNEL = ../arm/zswap.c
#
#SGEMVNKERNEL = ../arm/gemv_n.c
#DGEMVNKERNEL = ../arm/gemv_n.c
#CGEMVNKERNEL = ../arm/zgemv_n.c
#ZGEMVNKERNEL = ../arm/zgemv_n.c
#
#SGEMVTKERNEL = ../arm/gemv_t.c
#DGEMVTKERNEL = ../arm/gemv_t.c
#CGEMVTKERNEL = ../arm/zgemv_t.c
#ZGEMVTKERNEL = ../arm/zgemv_t.c
#SSYMV_U_KERNEL = ../generic/symv_k.c
#SSYMV_L_KERNEL = ../generic/symv_k.c
#DSYMV_U_KERNEL = ../generic/symv_k.c
#DSYMV_L_KERNEL = ../generic/symv_k.c
#QSYMV_U_KERNEL = ../generic/symv_k.c
#QSYMV_L_KERNEL = ../generic/symv_k.c
#CSYMV_U_KERNEL = ../generic/zsymv_k.c
#CSYMV_L_KERNEL = ../generic/zsymv_k.c
#ZSYMV_U_KERNEL = ../generic/zsymv_k.c
#ZSYMV_L_KERNEL = ../generic/zsymv_k.c
#XSYMV_U_KERNEL = ../generic/zsymv_k.c
#XSYMV_L_KERNEL = ../generic/zsymv_k.c
#ZHEMV_U_KERNEL = ../generic/zhemv_k.c
#ZHEMV_L_KERNEL = ../generic/zhemv_k.c
# Helper kernels taken from the generic C sources.
LSAME_KERNEL = ../generic/lsame.c
SCABS_KERNEL = ../generic/cabs.c
DCABS_KERNEL = ../generic/cabs.c
QCABS_KERNEL = ../generic/cabs.c
#Dump kernel
# Both GEMM3M kernels point at the same generic "dump" source
# (presumably a placeholder implementation; verify against that file).
CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c
ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c

64
kernel/power/def_vsx.h Normal file
View File

@ -0,0 +1,64 @@
/*
 * Symbolic names for VSX register operands: vsN expands to the plain
 * number N, for N = 0..63. Kernel sources include this header and then
 * alias these names further (for example "#define alpha_r vs18").
 *
 * The include guard makes repeated inclusion a no-op.
 */
#ifndef DEF_VSX_H
#define DEF_VSX_H
#define vs0 0
#define vs1 1
#define vs2 2
#define vs3 3
#define vs4 4
#define vs5 5
#define vs6 6
#define vs7 7
#define vs8 8
#define vs9 9
#define vs10 10
#define vs11 11
#define vs12 12
#define vs13 13
#define vs14 14
#define vs15 15
#define vs16 16
#define vs17 17
#define vs18 18
#define vs19 19
#define vs20 20
#define vs21 21
#define vs22 22
#define vs23 23
#define vs24 24
#define vs25 25
#define vs26 26
#define vs27 27
#define vs28 28
#define vs29 29
#define vs30 30
#define vs31 31
#define vs32 32
#define vs33 33
#define vs34 34
#define vs35 35
#define vs36 36
#define vs37 37
#define vs38 38
#define vs39 39
#define vs40 40
#define vs41 41
#define vs42 42
#define vs43 43
#define vs44 44
#define vs45 45
#define vs46 46
#define vs47 47
#define vs48 48
#define vs49 49
#define vs50 50
#define vs51 51
#define vs52 52
#define vs53 53
#define vs54 54
#define vs55 55
#define vs56 56
#define vs57 57
#define vs58 58
#define vs59 59
#define vs60 60
#define vs61 61
#define vs62 62
#define vs63 63
#endif /* DEF_VSX_H */

View File

@ -0,0 +1,348 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2016/03/05 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
/* Preprocessor setup for the POWER8 16x4 DGEMM kernel: stack layout,
   argument registers and working-register aliases. */
/* ASSEMBLER is defined ahead of the common.h include (presumably it selects
   the assembler-facing definitions there - confirm in common.h). */
#define ASSEMBLER
#include "common.h"
#include "def_vsx.h"
/* Word-size dependent load mnemonic. */
#ifndef __64BIT__
#define LOAD lwz
#else
#define LOAD ld
#endif
/* Stack frame size and the two scratch slots. The prologue stores f14-f31
   at offsets 0..136 and the saved GPRs from offset 144 upward; ALPHA_SP and
   FZERO sit above that save area. */
#ifdef __64BIT__
#define STACKSIZE 320
#define ALPHA_SP 296(SP)
#define FZERO 304(SP)
#else
#define STACKSIZE 240
#define ALPHA_SP 224(SP)
#define FZERO 232(SP)
#endif
/* Register names for the kernel arguments. M, N and K are fixed; the
   registers for A, B, C, LDC and OFFSET depend on OS and word size. */
#define M r3
#define N r4
#define K r5
#ifdef linux
#ifndef __64BIT__
#define A r6
#define B r7
#define C r8
#define LDC r9
#define OFFSET r10
#else
/* 64-bit Linux: the prologue loads OFFSET from the caller frame when
   TRMMKERNEL is defined. */
#define A r7
#define B r8
#define C r9
#define LDC r10
#define OFFSET r6
#endif
#endif
#if defined(_AIX) || defined(__APPLE__)
#if !defined(__64BIT__) && defined(DOUBLE)
/* 32-bit AIX/Apple with DOUBLE: the prologue loads LDC from the caller
   frame (and OFFSET as well when TRMMKERNEL is defined). */
#define A r8
#define B r9
#define C r10
#define LDC r7
#define OFFSET r6
#else
#define A r7
#define B r8
#define C r9
#define LDC r10
#define OFFSET r6
#endif
#endif
/* VSX register that receives alpha via lxvdsx in the kernel body. */
#define alpha_r vs18
/* Constant offsets: o0 is the literal 0; o8..o48 are registers that the
   kernel body loads with 8, 16, 24, 32 and 48. */
#define o0 0
#define o8 r15
#define o24 r16
/* ALPHA is set to the address of the alpha stack slot in the kernel body. */
#define ALPHA r17
/* Remaining aliases (L, T1, KK, BB, I, J, AO, BO, CO, T2) are not referenced
   in this file; presumably they are used by the included macro and logic
   files. All aliases map to r15-r31, which the prologue saves. */
#define L r18
#define T1 r19
#define KK r20
#define BB r21
#define I r22
#define J r23
#define AO r24
#define BO r25
#define CO r26
#define o16 r27
#define o32 r28
#define o48 r29
/* PRE is loaded with 256 in the kernel body. */
#define PRE r30
#define T2 r31
#include "dgemm_macros_16x4_power8.S"
/* Kernel entry point (omitted when NEEDPARAM is defined).
   Flow: allocate the frame, save f14-f31 and r15-r31, spill alpha to the
   stack, shift LDC by BASE_SHIFT, leave early when M, N or K is <= 0,
   set up constant registers, run the included compute logic, then restore
   everything at .L999 and return 0 in r3. */
#ifndef NEEDPARAM
PROLOGUE
PROFCODE
/* Allocate the local frame; r0 = 0 is stored to FZERO further down. */
addi SP, SP, -STACKSIZE
li r0, 0
/* Save floating-point registers f14-f31. */
stfd f14, 0(SP)
stfd f15, 8(SP)
stfd f16, 16(SP)
stfd f17, 24(SP)
stfd f18, 32(SP)
stfd f19, 40(SP)
stfd f20, 48(SP)
stfd f21, 56(SP)
stfd f22, 64(SP)
stfd f23, 72(SP)
stfd f24, 80(SP)
stfd f25, 88(SP)
stfd f26, 96(SP)
stfd f27, 104(SP)
stfd f28, 112(SP)
stfd f29, 120(SP)
stfd f30, 128(SP)
stfd f31, 136(SP)
/* Save general-purpose registers r15-r31 (8-byte or 4-byte slots).
   NOTE(review): no vector registers are saved here. If the included
   macro/logic files write vs52-vs63 (aliases of v20-v31, which the Power
   ELF ABIs treat as nonvolatile), those would need preserving as well -
   TODO confirm against dgemm_macros_16x4_power8.S. */
#ifdef __64BIT__
std r31, 144(SP)
std r30, 152(SP)
std r29, 160(SP)
std r28, 168(SP)
std r27, 176(SP)
std r26, 184(SP)
std r25, 192(SP)
std r24, 200(SP)
std r23, 208(SP)
std r22, 216(SP)
std r21, 224(SP)
std r20, 232(SP)
std r19, 240(SP)
std r18, 248(SP)
std r17, 256(SP)
std r16, 264(SP)
std r15, 272(SP)
#else
stw r31, 144(SP)
stw r30, 148(SP)
stw r29, 152(SP)
stw r28, 156(SP)
stw r27, 160(SP)
stw r26, 164(SP)
stw r25, 168(SP)
stw r24, 172(SP)
stw r23, 176(SP)
stw r22, 180(SP)
stw r21, 184(SP)
stw r20, 188(SP)
stw r19, 192(SP)
stw r18, 196(SP)
stw r17, 200(SP)
stw r16, 204(SP)
stw r15, 208(SP)
#endif
/* Spill f1 to the alpha slot (reloaded below with lxvdsx) and write a
   32-bit zero to FZERO. */
stfd f1, ALPHA_SP
stw r0, FZERO
/* 32-bit AIX/Apple with DOUBLE: LDC is fetched from the caller frame. */
#if defined(_AIX) || defined(__APPLE__)
#if !defined(__64BIT__) && defined(DOUBLE)
lwz LDC, FRAMESLOT(0) + STACKSIZE(SP)
#endif
#endif
/* Shift LDC left by BASE_SHIFT (presumably converts elements to bytes). */
slwi LDC, LDC, BASE_SHIFT
/* TRMM builds only: fetch OFFSET from the caller frame where it is not
   already in a register. */
#if defined(TRMMKERNEL)
#if defined(linux) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
#endif
#if defined(_AIX) || defined(__APPLE__)
#ifdef __64BIT__
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
#else
#ifdef DOUBLE
lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
#else
lwz OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
#endif
#endif
#endif
#endif
/* Nothing to do when any dimension is <= 0. .L999_H1 is not defined in
   this file; presumably it lives in the included logic file. */
cmpwi cr0, M, 0
ble .L999_H1
cmpwi cr0, N, 0
ble .L999_H1
cmpwi cr0, K, 0
ble .L999_H1
/* ALPHA = address of the alpha stack slot (same offsets as ALPHA_SP). */
#ifdef __64BIT__
addi ALPHA, SP, 296
#else
addi ALPHA, SP, 224
#endif
/* Constant registers: PRE = 256, o8..o48 = byte offsets 8..48. */
li PRE, 256
li o8 , 8
li o16, 16
li o24, 24
li o32, 32
li o48, 48
/* Load alpha from the stack slot and splat it into both doublewords of
   alpha_r. */
lxvdsx alpha_r, 0, ALPHA
#include "dgemm_logic_16x4_power8.S"
/* Common exit: r3 = 0, then restore the saved registers in the same layout
   used by the prologue. */
.L999:
addi r3, 0, 0
lfd f14, 0(SP)
lfd f15, 8(SP)
lfd f16, 16(SP)
lfd f17, 24(SP)
lfd f18, 32(SP)
lfd f19, 40(SP)
lfd f20, 48(SP)
lfd f21, 56(SP)
lfd f22, 64(SP)
lfd f23, 72(SP)
lfd f24, 80(SP)
lfd f25, 88(SP)
lfd f26, 96(SP)
lfd f27, 104(SP)
lfd f28, 112(SP)
lfd f29, 120(SP)
lfd f30, 128(SP)
lfd f31, 136(SP)
#ifdef __64BIT__
ld r31, 144(SP)
ld r30, 152(SP)
ld r29, 160(SP)
ld r28, 168(SP)
ld r27, 176(SP)
ld r26, 184(SP)
ld r25, 192(SP)
ld r24, 200(SP)
ld r23, 208(SP)
ld r22, 216(SP)
ld r21, 224(SP)
ld r20, 232(SP)
ld r19, 240(SP)
ld r18, 248(SP)
ld r17, 256(SP)
ld r16, 264(SP)
ld r15, 272(SP)
#else
lwz r31, 144(SP)
lwz r30, 148(SP)
lwz r29, 152(SP)
lwz r28, 156(SP)
lwz r27, 160(SP)
lwz r26, 164(SP)
lwz r25, 168(SP)
lwz r24, 172(SP)
lwz r23, 176(SP)
lwz r22, 180(SP)
lwz r21, 184(SP)
lwz r20, 188(SP)
lwz r19, 192(SP)
lwz r18, 196(SP)
lwz r17, 200(SP)
lwz r16, 204(SP)
lwz r15, 208(SP)
#endif
/* Release the frame and return. */
addi SP, SP, STACKSIZE
blr
EPILOGUE
#endif

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,362 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2016/03/05 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
/* Preprocessor setup for the POWER8 16x4 DTRMM kernel: stack layout,
   argument registers and working-register aliases. */
/* ASSEMBLER is defined ahead of the common.h include (presumably it selects
   the assembler-facing definitions there - confirm in common.h). */
#define ASSEMBLER
#include "common.h"
#include "def_vsx.h"
/* Word-size dependent load mnemonic. */
#ifndef __64BIT__
#define LOAD lwz
#else
#define LOAD ld
#endif
/* Stack frame size and the two scratch slots. The prologue stores f14-f31
   at offsets 0..136 and the saved GPRs (r31 down to r13) from offset 144
   upward; ALPHA_SP and FZERO sit above that save area. */
#ifdef __64BIT__
#define STACKSIZE 320
#define ALPHA_SP 296(SP)
#define FZERO 304(SP)
#else
#define STACKSIZE 240
#define ALPHA_SP 224(SP)
#define FZERO 232(SP)
#endif
/* Register names for the kernel arguments. M, N and K are fixed; the
   registers for A, B, C, LDC and OFFSET depend on OS and word size. */
#define M r3
#define N r4
#define K r5
#ifdef linux
#ifndef __64BIT__
#define A r6
#define B r7
#define C r8
#define LDC r9
#define OFFSET r10
#else
/* 64-bit Linux: the prologue loads OFFSET from the caller frame when
   TRMMKERNEL is defined. */
#define A r7
#define B r8
#define C r9
#define LDC r10
#define OFFSET r6
#endif
#endif
#if defined(_AIX) || defined(__APPLE__)
#if !defined(__64BIT__) && defined(DOUBLE)
/* 32-bit AIX/Apple with DOUBLE: the prologue loads LDC from the caller
   frame (and OFFSET as well when TRMMKERNEL is defined). */
#define A r8
#define B r9
#define C r10
#define LDC r7
#define OFFSET r6
#else
#define A r7
#define B r8
#define C r9
#define LDC r10
#define OFFSET r6
#endif
#endif
/* VSX register that receives alpha via lxvdsx in the kernel body. */
#define alpha_r vs18
/* Constant offsets: o0 is the literal 0; o8..o48 are registers that the
   kernel body loads with 8, 16, 24, 32 and 48. */
#define o0 0
/* NOTE(review): r13 is reserved by the 64-bit Power ELF ABIs as the thread
   pointer (and is the small-data pointer in the 32-bit SysV ABI). The
   prologue saves and restores it, but using it as a scratch register while
   the kernel runs may still be unsafe (e.g. TLS access from a signal
   handler) - TODO confirm this is acceptable or pick another register. */
#define K1 r13
#define KKK r14
#define o8 r15
#define o24 r16
/* ALPHA is set to the address of the alpha stack slot in the kernel body. */
#define ALPHA r17
/* L, T1, BB, I, J, AO, BO, CO, T2 (and K1, KKK above) are not referenced in
   this file; presumably they are used by the included macro and logic
   files. KK is initialised from OFFSET in the kernel body. */
#define L r18
#define T1 r19
#define KK r20
#define BB r21
#define I r22
#define J r23
#define AO r24
#define BO r25
#define CO r26
#define o16 r27
#define o32 r28
#define o48 r29
/* PRE is loaded with 256 in the kernel body. */
#define PRE r30
#define T2 r31
#include "dgemm_macros_16x4_power8.S"
/* Kernel entry point (omitted when NEEDPARAM is defined).
   Flow: allocate the frame, save f14-f31 and r13-r31, spill alpha to the
   stack, shift LDC by BASE_SHIFT, fetch OFFSET and derive KK from it, leave
   early when M, N or K is <= 0, set up constant registers, run the included
   compute logic, then restore everything at .L999 and return 0 in r3. */
#ifndef NEEDPARAM
PROLOGUE
PROFCODE
/* Allocate the local frame; r0 = 0 is stored to FZERO further down. */
addi SP, SP, -STACKSIZE
li r0, 0
/* Save floating-point registers f14-f31. */
stfd f14, 0(SP)
stfd f15, 8(SP)
stfd f16, 16(SP)
stfd f17, 24(SP)
stfd f18, 32(SP)
stfd f19, 40(SP)
stfd f20, 48(SP)
stfd f21, 56(SP)
stfd f22, 64(SP)
stfd f23, 72(SP)
stfd f24, 80(SP)
stfd f25, 88(SP)
stfd f26, 96(SP)
stfd f27, 104(SP)
stfd f28, 112(SP)
stfd f29, 120(SP)
stfd f30, 128(SP)
stfd f31, 136(SP)
/* Save general-purpose registers r13-r31 (8-byte or 4-byte slots).
   NOTE(review): no vector registers are saved here. If the included
   macro/logic files write vs52-vs63 (aliases of v20-v31, which the Power
   ELF ABIs treat as nonvolatile), those would need preserving as well -
   TODO confirm against dgemm_macros_16x4_power8.S. */
#ifdef __64BIT__
std r31, 144(SP)
std r30, 152(SP)
std r29, 160(SP)
std r28, 168(SP)
std r27, 176(SP)
std r26, 184(SP)
std r25, 192(SP)
std r24, 200(SP)
std r23, 208(SP)
std r22, 216(SP)
std r21, 224(SP)
std r20, 232(SP)
std r19, 240(SP)
std r18, 248(SP)
std r17, 256(SP)
std r16, 264(SP)
std r15, 272(SP)
std r14, 280(SP)
std r13, 288(SP)
#else
stw r31, 144(SP)
stw r30, 148(SP)
stw r29, 152(SP)
stw r28, 156(SP)
stw r27, 160(SP)
stw r26, 164(SP)
stw r25, 168(SP)
stw r24, 172(SP)
stw r23, 176(SP)
stw r22, 180(SP)
stw r21, 184(SP)
stw r20, 188(SP)
stw r19, 192(SP)
stw r18, 196(SP)
stw r17, 200(SP)
stw r16, 204(SP)
stw r15, 208(SP)
stw r14, 212(SP)
stw r13, 216(SP)
#endif
/* Spill f1 to the alpha slot (reloaded below with lxvdsx) and write a
   32-bit zero to FZERO. */
stfd f1, ALPHA_SP
stw r0, FZERO
/* 32-bit AIX/Apple with DOUBLE: LDC is fetched from the caller frame. */
#if defined(_AIX) || defined(__APPLE__)
#if !defined(__64BIT__) && defined(DOUBLE)
lwz LDC, FRAMESLOT(0) + STACKSIZE(SP)
#endif
#endif
/* Shift LDC left by BASE_SHIFT (presumably converts elements to bytes). */
slwi LDC, LDC, BASE_SHIFT
/* TRMM builds: fetch OFFSET from the caller frame where it is not already
   in a register. */
#if defined(TRMMKERNEL)
#if defined(linux) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
#endif
#if defined(_AIX) || defined(__APPLE__)
#ifdef __64BIT__
ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
#else
#ifdef DOUBLE
lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
#else
lwz OFFSET, FRAMESLOT(0) + STACKSIZE(SP)
#endif
#endif
#endif
#endif
/* KK starts as OFFSET, negated when TRMMKERNEL is defined without LEFT. */
mr KK, OFFSET
#if defined(TRMMKERNEL) && !defined(LEFT)
neg KK, KK
#endif
/* Nothing to do when any dimension is <= 0. .L999_H1 is not defined in
   this file; presumably it lives in the included logic file. */
cmpwi cr0, M, 0
ble .L999_H1
cmpwi cr0, N, 0
ble .L999_H1
cmpwi cr0, K, 0
ble .L999_H1
/* ALPHA = address of the alpha stack slot (same offsets as ALPHA_SP). */
#ifdef __64BIT__
addi ALPHA, SP, 296
#else
addi ALPHA, SP, 224
#endif
/* Constant registers: PRE = 256, o8..o48 = byte offsets 8..48. */
li PRE, 256
li o8 , 8
li o16, 16
li o24, 24
li o32, 32
li o48, 48
/* Load alpha from the stack slot and splat it into both doublewords of
   alpha_r. */
lxvdsx alpha_r, 0, ALPHA
#include "dtrmm_logic_16x4_power8.S"
/* Common exit: r3 = 0, then restore the saved registers in the same layout
   used by the prologue. */
.L999:
addi r3, 0, 0
lfd f14, 0(SP)
lfd f15, 8(SP)
lfd f16, 16(SP)
lfd f17, 24(SP)
lfd f18, 32(SP)
lfd f19, 40(SP)
lfd f20, 48(SP)
lfd f21, 56(SP)
lfd f22, 64(SP)
lfd f23, 72(SP)
lfd f24, 80(SP)
lfd f25, 88(SP)
lfd f26, 96(SP)
lfd f27, 104(SP)
lfd f28, 112(SP)
lfd f29, 120(SP)
lfd f30, 128(SP)
lfd f31, 136(SP)
#ifdef __64BIT__
ld r31, 144(SP)
ld r30, 152(SP)
ld r29, 160(SP)
ld r28, 168(SP)
ld r27, 176(SP)
ld r26, 184(SP)
ld r25, 192(SP)
ld r24, 200(SP)
ld r23, 208(SP)
ld r22, 216(SP)
ld r21, 224(SP)
ld r20, 232(SP)
ld r19, 240(SP)
ld r18, 248(SP)
ld r17, 256(SP)
ld r16, 264(SP)
ld r15, 272(SP)
ld r14, 280(SP)
ld r13, 288(SP)
#else
lwz r31, 144(SP)
lwz r30, 148(SP)
lwz r29, 152(SP)
lwz r28, 156(SP)
lwz r27, 160(SP)
lwz r26, 164(SP)
lwz r25, 168(SP)
lwz r24, 172(SP)
lwz r23, 176(SP)
lwz r22, 180(SP)
lwz r21, 184(SP)
lwz r20, 188(SP)
lwz r19, 192(SP)
lwz r18, 196(SP)
lwz r17, 200(SP)
lwz r16, 204(SP)
lwz r15, 208(SP)
lwz r14, 212(SP)
lwz r13, 216(SP)
#endif
/* Release the frame and return. */
addi SP, SP, STACKSIZE
blr
EPILOGUE
#endif

File diff suppressed because it is too large Load Diff

View File

@ -107,6 +107,11 @@
#ifdef PPCG4
#define PREFETCHSIZE 16
#define PREFETCHWSIZE 72
#endif
#ifdef POWER8
#define PREFETCHSIZE 16
#define PREFETCHWSIZE 72
#endif
PROLOGUE
@ -193,7 +198,7 @@ LL(12):
STFD c12, 14 * SIZE(B)
STFD c16, 15 * SIZE(B)
#ifdef POWER6
#if defined(POWER6) || defined(POWER8)
dcbtst PREA, AO1
dcbtst PREA, AO2
dcbtst PREA, AO3

View File

@ -111,6 +111,11 @@
#ifdef PPCG4
#define PREFETCHSIZE 16
#define PREFETCHWSIZE 48
#endif
#ifdef POWER8
#define PREFETCHSIZE 16
#define PREFETCHWSIZE 48
#endif
PROLOGUE
@ -224,7 +229,7 @@ LL(12):
STFD c15, 14 * SIZE(B1)
STFD c16, 15 * SIZE(B1)
#ifdef POWER6
#if defined(POWER6) || defined(POWER8)
dcbtst PREA, AO1
dcbtst PREA, AO2
dcbtst PREA, AO3

View File

@ -174,6 +174,12 @@
#define PREFETCHSIZE_C 40
#endif
#ifdef POWER8
#define PREFETCHSIZE_A 96
#define PREFETCHSIZE_C 40
#endif
#ifndef NEEDPARAM
#ifndef __64BIT__

View File

@ -139,6 +139,11 @@
#define PREFETCHSIZE_C 8
#endif
#ifdef POWER8
#define PREFETCHSIZE_A 96
#define PREFETCHSIZE_C 8
#endif
#define y01 f0
#define y02 f1
#define y03 f2

View File

@ -0,0 +1,367 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2016/03/05 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
/* Preprocessor setup for the POWER8 8x2 ZGEMM kernel: stack layout,
   argument registers and working-register aliases. */
/* ASSEMBLER is defined ahead of the common.h include (presumably it selects
   the assembler-facing definitions there - confirm in common.h). */
#define ASSEMBLER
#include "common.h"
#include "def_vsx.h"
/* Word-size dependent load mnemonic. */
#ifndef __64BIT__
#define LOAD lwz
#else
#define LOAD ld
#endif
/* Stack frame size and scratch slots. The prologue stores f14-f31 at
   offsets 0..136 and the saved GPRs from offset 144 upward; the alpha
   (real and imaginary) and FZERO slots sit above that save area. The
   prologue writes f1 to ALPHA_R_SP and f2 to ALPHA_I_SP. */
#ifdef __64BIT__
#define STACKSIZE 320
#define ALPHA_R_SP 296(SP)
#define ALPHA_I_SP 304(SP)
#define FZERO 312(SP)
#else
#define STACKSIZE 256
#define ALPHA_R_SP 224(SP)
#define ALPHA_I_SP 232(SP)
#define FZERO 240(SP)
#endif
/* Register names for the kernel arguments. M, N and K are fixed; the
   registers for A, B, C, LDC and OFFSET depend on OS and word size. */
#define M r3
#define N r4
#define K r5
#ifdef linux
#ifndef __64BIT__
#define A r6
#define B r7
#define C r8
#define LDC r9
#define OFFSET r10
#else
/* 64-bit Linux: the prologue loads LDC from the caller frame, and OFFSET
   as well when TRMMKERNEL is defined. */
#define A r8
#define B r9
#define C r10
#define LDC r6
#define OFFSET r7
#endif
#endif
#if defined(_AIX) || defined(__APPLE__)
#if !defined(__64BIT__) && defined(DOUBLE)
/* 32-bit AIX/Apple with DOUBLE: the prologue loads B, C and LDC from the
   caller frame (and OFFSET when TRMMKERNEL is defined). */
#define A r10
#define B r6
#define C r7
#define LDC r8
#define OFFSET r9
#else
#define A r8
#define B r9
#define C r10
#define LDC r6
#define OFFSET r7
#endif
#endif
/* o0 is the literal 0. */
#define o0 0
/* VSX registers named for the real and imaginary parts of alpha. */
#define alpha_r vs30
#define alpha_i vs31
/* Working-register aliases, all within r15-r31, which the prologue saves.
   Their individual roles are set by code outside this listing
   (presumably the included macro/logic files). */
#define L r15
#define ALPHA r16
#define o24 r17
#define T2 r19
#define KK r20
#define o8 r21
#define I r22
#define J r23
#define AO r24
#define BO r25
#define CO r26
#define o16 r27
#define o32 r28
#define o48 r29
#define PRE r30
#define T1 r31
/* Kernel entry point.  Saves the callee-saved registers, sets up constants
   and alpha, includes the macro and logic files that form the body, then
   restores everything and returns 0 in r3.  The whole body is omitted when
   NEEDPARAM is defined. */
#ifndef NEEDPARAM
PROLOGUE
PROFCODE
/* Allocate the stack frame; r0 = 0 is stored to FZERO further down. */
addi SP, SP, -STACKSIZE
li r0, 0
/* Save floating-point registers f14-f31 at offsets 0..136. */
stfd f14, 0(SP)
stfd f15, 8(SP)
stfd f16, 16(SP)
stfd f17, 24(SP)
stfd f18, 32(SP)
stfd f19, 40(SP)
stfd f20, 48(SP)
stfd f21, 56(SP)
stfd f22, 64(SP)
stfd f23, 72(SP)
stfd f24, 80(SP)
stfd f25, 88(SP)
stfd f26, 96(SP)
stfd f27, 104(SP)
stfd f28, 112(SP)
stfd f29, 120(SP)
stfd f30, 128(SP)
stfd f31, 136(SP)
/* Save general-purpose registers r15-r31 (8-byte slots on 64-bit, 4-byte
   slots on 32-bit), starting at offset 144. */
#ifdef __64BIT__
std r31, 144(SP)
std r30, 152(SP)
std r29, 160(SP)
std r28, 168(SP)
std r27, 176(SP)
std r26, 184(SP)
std r25, 192(SP)
std r24, 200(SP)
std r23, 208(SP)
std r22, 216(SP)
std r21, 224(SP)
std r20, 232(SP)
std r19, 240(SP)
std r18, 248(SP)
std r17, 256(SP)
std r16, 264(SP)
std r15, 272(SP)
#else
stw r31, 144(SP)
stw r30, 148(SP)
stw r29, 152(SP)
stw r28, 156(SP)
stw r27, 160(SP)
stw r26, 164(SP)
stw r25, 168(SP)
stw r24, 172(SP)
stw r23, 176(SP)
stw r22, 180(SP)
stw r21, 184(SP)
stw r20, 188(SP)
stw r19, 192(SP)
stw r18, 196(SP)
stw r17, 200(SP)
stw r16, 204(SP)
stw r15, 208(SP)
#endif
/* Spill f1/f2 into the alpha slots so they can be reloaded into VSX
   registers below; store the 32-bit zero from r0 at FZERO. */
stfd f1, ALPHA_R_SP
stfd f2, ALPHA_I_SP
stw r0, FZERO
/* Fetch the arguments that did not arrive in registers from the caller's
   frame.  FRAMESLOT is defined outside this file (presumably in common.h). */
#ifdef linux
#ifdef __64BIT__
ld LDC, FRAMESLOT(0) + STACKSIZE(SP)
#endif
#endif
#if defined(_AIX) || defined(__APPLE__)
#ifdef __64BIT__
ld LDC, FRAMESLOT(0) + STACKSIZE(SP)
#else
#ifdef DOUBLE
lwz B, FRAMESLOT(0) + STACKSIZE(SP)
lwz C, FRAMESLOT(1) + STACKSIZE(SP)
lwz LDC, FRAMESLOT(2) + STACKSIZE(SP)
#else
lwz LDC, FRAMESLOT(0) + STACKSIZE(SP)
#endif
#endif
#endif
/* TRMM builds additionally load OFFSET from the caller's frame and, when
   LEFT is not defined, set KK = -OFFSET. */
#ifdef TRMMKERNEL
#if defined(linux) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
#endif
#if defined(_AIX) || defined(__APPLE__)
#ifdef __64BIT__
ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
#else
#ifdef DOUBLE
lwz OFFSET, FRAMESLOT(3) + STACKSIZE(SP)
#else
lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
#endif
#endif
#endif
#if defined(TRMMKERNEL) && !defined(LEFT)
neg KK, OFFSET
#endif
#endif
/* Macro definitions used by the logic file included below. */
#include "zgemm_macros_8x2_power8.S"
/* Nothing to do if any of M, N, K is <= 0 (32-bit signed compares). */
cmpwi cr0, M, 0
ble .L999
cmpwi cr0, N, 0
ble .L999
cmpwi cr0, K, 0
ble .L999
/* Scale LDC by 2^ZBASE_SHIFT (presumably elements -> bytes), then load the
   prefetch distance and the constant index registers. */
slwi LDC, LDC, ZBASE_SHIFT
li PRE, 256
li o8 , 8
li o16 , 16
li o24 , 24
li o32 , 32
li o48 , 48
/* ALPHA = address of the alpha_r slot (same offsets as ALPHA_R_SP). */
#ifdef __64BIT__
addi ALPHA, SP, 296
#else
addi ALPHA, SP, 224
#endif
/* Load each alpha part and splat it into both doublewords of its VSX
   register; alpha_i sits 8 bytes after alpha_r. */
lxvdsx alpha_r, 0, ALPHA
lxvdsx alpha_i, o8, ALPHA
.align 5
/* Loop structure over N, M and K. */
#include "zgemm_logic_8x2_power8.S"
/* Common exit: set r3 = 0, restore saved registers, release the frame. */
.L999:
addi r3, 0, 0
lfd f14, 0(SP)
lfd f15, 8(SP)
lfd f16, 16(SP)
lfd f17, 24(SP)
lfd f18, 32(SP)
lfd f19, 40(SP)
lfd f20, 48(SP)
lfd f21, 56(SP)
lfd f22, 64(SP)
lfd f23, 72(SP)
lfd f24, 80(SP)
lfd f25, 88(SP)
lfd f26, 96(SP)
lfd f27, 104(SP)
lfd f28, 112(SP)
lfd f29, 120(SP)
lfd f30, 128(SP)
lfd f31, 136(SP)
#ifdef __64BIT__
ld r31, 144(SP)
ld r30, 152(SP)
ld r29, 160(SP)
ld r28, 168(SP)
ld r27, 176(SP)
ld r26, 184(SP)
ld r25, 192(SP)
ld r24, 200(SP)
ld r23, 208(SP)
ld r22, 216(SP)
ld r21, 224(SP)
ld r20, 232(SP)
ld r19, 240(SP)
ld r18, 248(SP)
ld r17, 256(SP)
ld r16, 264(SP)
ld r15, 272(SP)
#else
lwz r31, 144(SP)
lwz r30, 148(SP)
lwz r29, 152(SP)
lwz r28, 156(SP)
lwz r27, 160(SP)
lwz r26, 164(SP)
lwz r25, 168(SP)
lwz r24, 172(SP)
lwz r23, 176(SP)
lwz r22, 180(SP)
lwz r21, 184(SP)
lwz r20, 188(SP)
lwz r19, 192(SP)
lwz r18, 196(SP)
lwz r17, 200(SP)
lwz r16, 204(SP)
lwz r15, 208(SP)
#endif
addi SP, SP, STACKSIZE
blr
EPILOGUE
#endif

View File

@ -0,0 +1,901 @@
/* Outer loop over pairs of columns: J = N >> 1; skip the section if J <= 0. */
srawi. J, N, 1
ble .LZGEMM_L2_END
.LZGEMM_L2_BEGIN:
/* CO = current C position, AO = start of A; then advance C by 2 * LDC for
   the next pass (LDC was already scaled by the including kernel). */
mr CO, C
mr AO, A
slwi T1, LDC , 1
add C, C, T1
/* I = M >> 3: number of full 8-row tiles; skip the 8-wide path if none. */
srawi. I, M, 3
ble .LZGEMM_L2x8_END
/* 2x8 tile, executed I times.  K is consumed in groups of 8:
     L = K >> 3;  L <= 0 -> SUB0 (K < 8),  L == 1 -> SUB4,  L >= 2 -> the
     LOOP_START / LOOP / LOOP_END sequence (8 kernel steps each).
   The LOAD/KERNEL/SAVE macros are defined outside this file; the _I1/_1/_2/_E2
   naming presumably denotes start / alternating / end stages of a pipeline. */
.LZGEMM_L2x8_BEGIN:
mr BO, B
srawi. L, K, 3
ble .LZGEMM_L2x8_SUB0
cmpwi cr0, L, 1
ble .LZGEMM_L2x8_SUB4
/* First group of 8 steps; dcbt touches the cache block at AO + PRE. */
.LZGEMM_L2x8_LOOP_START:
dcbt AO, PRE
LOAD2x8_1
dcbt AO, PRE
KERNEL2x8_I1
dcbt AO, PRE
KERNEL2x8_2
dcbt AO, PRE
KERNEL2x8_1
dcbt AO, PRE
KERNEL2x8_2
dcbt AO, PRE
KERNEL2x8_1
dcbt AO, PRE
KERNEL2x8_2
dcbt AO, PRE
KERNEL2x8_1
dcbt AO, PRE
KERNEL2x8_2
/* Account for the first and the last group; the loop runs L - 2 times. */
addic. L, L, -2
ble .LZGEMM_L2x8_LOOP_END
.align 5
/* Steady state: 8 kernel steps per iteration. */
.LZGEMM_L2x8_LOOP:
dcbt AO, PRE
KERNEL2x8_1
dcbt AO, PRE
KERNEL2x8_2
dcbt AO, PRE
KERNEL2x8_1
dcbt AO, PRE
KERNEL2x8_2
dcbt AO, PRE
KERNEL2x8_1
dcbt AO, PRE
KERNEL2x8_2
dcbt AO, PRE
KERNEL2x8_1
dcbt AO, PRE
KERNEL2x8_2
addic. L, L, -1
bgt .LZGEMM_L2x8_LOOP
/* Last group of 8 steps, finished by the _E2 variant. */
.LZGEMM_L2x8_LOOP_END:
dcbt AO, PRE
KERNEL2x8_1
dcbt AO, PRE
KERNEL2x8_2
dcbt AO, PRE
KERNEL2x8_1
dcbt AO, PRE
KERNEL2x8_2
dcbt AO, PRE
KERNEL2x8_1
dcbt AO, PRE
KERNEL2x8_2
dcbt AO, PRE
KERNEL2x8_1
KERNEL2x8_E2
b .LZGEMM_L2x8_SUB1
/* Exactly one group of 8 (L == 1): eight SUB steps, the first being SUBI1. */
.LZGEMM_L2x8_SUB4:
dcbt AO, PRE
KERNEL2x8_SUBI1
dcbt AO, PRE
KERNEL2x8_SUB1
dcbt AO, PRE
KERNEL2x8_SUB1
dcbt AO, PRE
KERNEL2x8_SUB1
KERNEL2x8_SUB1
KERNEL2x8_SUB1
KERNEL2x8_SUB1
KERNEL2x8_SUB1
b .LZGEMM_L2x8_SUB1
/* K < 8: the first step uses SUBI1, the remaining (K & 7) - 1 steps go
   through the SUB2 loop. */
.LZGEMM_L2x8_SUB0:
andi. L, K, 7
KERNEL2x8_SUBI1
addic. L, L, -1
ble .LZGEMM_L2x8_SAVE
b .LZGEMM_L2x8_SUB2
/* Remainder after the groups of 8: L = K & 7.  The andi. result is never
   negative, so ble is taken only when it is zero. */
.LZGEMM_L2x8_SUB1:
andi. L, K, 7
ble .LZGEMM_L2x8_SAVE
.LZGEMM_L2x8_SUB2:
KERNEL2x8_SUB1
addic. L, L, -1
bgt .LZGEMM_L2x8_SUB2
/* Store the tile and continue with the next 8-row tile. */
.LZGEMM_L2x8_SAVE:
SAVE2x8
addic. I, I, -1
bgt .LZGEMM_L2x8_BEGIN
.LZGEMM_L2x8_END:
/* 2x4 tile for the M remainder.  If M & 7 == 0 all remainder tiles are
   skipped; this tile runs only when bit 2 of M is set.  The K handling
   (groups of 8, SUB4 for exactly one group, SUB0 for K < 8, SUB1/SUB2 for
   K & 7) follows the same scheme as the other tiles in this file; the
   macros are defined elsewhere. */
.LZGEMM_L2x4_BEGIN:
andi. T2, M, 7
ble .LZGEMM_L2x1_END
andi. T1, M, 4
ble .LZGEMM_L2x4_END
mr BO, B
srawi. L, K, 3
ble .LZGEMM_L2x4_SUB0
cmpwi cr0, L, 1
ble .LZGEMM_L2x4_SUB4
/* First group of 8 steps. */
.LZGEMM_L2x4_LOOP_START:
LOAD2x4_1
KERNEL2x4_I1
KERNEL2x4_2
KERNEL2x4_1
KERNEL2x4_2
KERNEL2x4_1
KERNEL2x4_2
KERNEL2x4_1
KERNEL2x4_2
addic. L, L, -2
ble .LZGEMM_L2x4_LOOP_END
.align 5
/* Steady state: 8 steps per iteration. */
.LZGEMM_L2x4_LOOP:
KERNEL2x4_1
KERNEL2x4_2
KERNEL2x4_1
KERNEL2x4_2
KERNEL2x4_1
KERNEL2x4_2
KERNEL2x4_1
KERNEL2x4_2
addic. L, L, -1
bgt .LZGEMM_L2x4_LOOP
/* Last group of 8 steps, finished by _E2. */
.LZGEMM_L2x4_LOOP_END:
KERNEL2x4_1
KERNEL2x4_2
KERNEL2x4_1
KERNEL2x4_2
KERNEL2x4_1
KERNEL2x4_2
KERNEL2x4_1
KERNEL2x4_E2
b .LZGEMM_L2x4_SUB1
/* Exactly one group of 8. */
.LZGEMM_L2x4_SUB4:
KERNEL2x4_SUBI1
KERNEL2x4_SUB1
KERNEL2x4_SUB1
KERNEL2x4_SUB1
KERNEL2x4_SUB1
KERNEL2x4_SUB1
KERNEL2x4_SUB1
KERNEL2x4_SUB1
b .LZGEMM_L2x4_SUB1
/* K < 8: first step via SUBI1, the rest via the SUB2 loop. */
.LZGEMM_L2x4_SUB0:
andi. L, K, 7
KERNEL2x4_SUBI1
addic. L, L, -1
ble .LZGEMM_L2x4_SAVE
b .LZGEMM_L2x4_SUB2
/* Remaining K & 7 steps. */
.LZGEMM_L2x4_SUB1:
andi. L, K, 7
ble .LZGEMM_L2x4_SAVE
.LZGEMM_L2x4_SUB2:
KERNEL2x4_SUB1
addic. L, L, -1
bgt .LZGEMM_L2x4_SUB2
.LZGEMM_L2x4_SAVE:
SAVE2x4
.LZGEMM_L2x4_END:
/* 2x2 tile for the M remainder; runs only when bit 1 of M is set.  K is
   processed in groups of 8 with SUB4 (exactly one group), SUB0 (K < 8) and
   SUB1/SUB2 (K & 7) paths.  The macros are defined elsewhere. */
.LZGEMM_L2x2_BEGIN:
andi. T1, M, 2
ble .LZGEMM_L2x2_END
mr BO, B
srawi. L, K, 3
ble .LZGEMM_L2x2_SUB0
cmpwi cr0, L, 1
ble .LZGEMM_L2x2_SUB4
/* First group of 8 steps. */
.LZGEMM_L2x2_LOOP_START:
LOAD2x2_1
KERNEL2x2_I1
KERNEL2x2_2
KERNEL2x2_1
KERNEL2x2_2
KERNEL2x2_1
KERNEL2x2_2
KERNEL2x2_1
KERNEL2x2_2
addic. L, L, -2
ble .LZGEMM_L2x2_LOOP_END
.align 5
/* Steady state: 8 steps per iteration. */
.LZGEMM_L2x2_LOOP:
KERNEL2x2_1
KERNEL2x2_2
KERNEL2x2_1
KERNEL2x2_2
KERNEL2x2_1
KERNEL2x2_2
KERNEL2x2_1
KERNEL2x2_2
addic. L, L, -1
bgt .LZGEMM_L2x2_LOOP
/* Last group of 8 steps, finished by _E2. */
.LZGEMM_L2x2_LOOP_END:
KERNEL2x2_1
KERNEL2x2_2
KERNEL2x2_1
KERNEL2x2_2
KERNEL2x2_1
KERNEL2x2_2
KERNEL2x2_1
KERNEL2x2_E2
b .LZGEMM_L2x2_SUB1
/* Exactly one group of 8. */
.LZGEMM_L2x2_SUB4:
KERNEL2x2_SUBI1
KERNEL2x2_SUB1
KERNEL2x2_SUB1
KERNEL2x2_SUB1
KERNEL2x2_SUB1
KERNEL2x2_SUB1
KERNEL2x2_SUB1
KERNEL2x2_SUB1
b .LZGEMM_L2x2_SUB1
/* K < 8: first step via SUBI1, the rest via the SUB2 loop. */
.LZGEMM_L2x2_SUB0:
andi. L, K, 7
KERNEL2x2_SUBI1
addic. L, L, -1
ble .LZGEMM_L2x2_SAVE
b .LZGEMM_L2x2_SUB2
/* Remaining K & 7 steps. */
.LZGEMM_L2x2_SUB1:
andi. L, K, 7
ble .LZGEMM_L2x2_SAVE
.LZGEMM_L2x2_SUB2:
KERNEL2x2_SUB1
addic. L, L, -1
bgt .LZGEMM_L2x2_SUB2
.LZGEMM_L2x2_SAVE:
SAVE2x2
.LZGEMM_L2x2_END:
/* 2x1 tile for the M remainder; runs only when bit 0 of M is set.  K is
   processed in groups of 8 with SUB4 (exactly one group), SUB0 (K < 8) and
   SUB1/SUB2 (K & 7) paths.  The macros are defined elsewhere. */
.LZGEMM_L2x1_BEGIN:
andi. T1, M, 1
ble .LZGEMM_L2x1_END
mr BO, B
srawi. L, K, 3
ble .LZGEMM_L2x1_SUB0
cmpwi cr0, L, 1
ble .LZGEMM_L2x1_SUB4
/* First group of 8 steps. */
.LZGEMM_L2x1_LOOP_START:
LOAD2x1_1
KERNEL2x1_I1
KERNEL2x1_2
KERNEL2x1_1
KERNEL2x1_2
KERNEL2x1_1
KERNEL2x1_2
KERNEL2x1_1
KERNEL2x1_2
addic. L, L, -2
ble .LZGEMM_L2x1_LOOP_END
.align 5
/* Steady state: 8 steps per iteration. */
.LZGEMM_L2x1_LOOP:
KERNEL2x1_1
KERNEL2x1_2
KERNEL2x1_1
KERNEL2x1_2
KERNEL2x1_1
KERNEL2x1_2
KERNEL2x1_1
KERNEL2x1_2
addic. L, L, -1
bgt .LZGEMM_L2x1_LOOP
/* Last group of 8 steps, finished by _E2. */
.LZGEMM_L2x1_LOOP_END:
KERNEL2x1_1
KERNEL2x1_2
KERNEL2x1_1
KERNEL2x1_2
KERNEL2x1_1
KERNEL2x1_2
KERNEL2x1_1
KERNEL2x1_E2
b .LZGEMM_L2x1_SUB1
/* Exactly one group of 8. */
.LZGEMM_L2x1_SUB4:
KERNEL2x1_SUBI1
KERNEL2x1_SUB1
KERNEL2x1_SUB1
KERNEL2x1_SUB1
KERNEL2x1_SUB1
KERNEL2x1_SUB1
KERNEL2x1_SUB1
KERNEL2x1_SUB1
b .LZGEMM_L2x1_SUB1
/* K < 8: first step via SUBI1, the rest via the SUB2 loop. */
.LZGEMM_L2x1_SUB0:
andi. L, K, 7
KERNEL2x1_SUBI1
addic. L, L, -1
ble .LZGEMM_L2x1_SAVE
b .LZGEMM_L2x1_SUB2
/* Remaining K & 7 steps. */
.LZGEMM_L2x1_SUB1:
andi. L, K, 7
ble .LZGEMM_L2x1_SAVE
.LZGEMM_L2x1_SUB2:
KERNEL2x1_SUB1
addic. L, L, -1
bgt .LZGEMM_L2x1_SUB2
.LZGEMM_L2x1_SAVE:
SAVE2x1
.LZGEMM_L2x1_END:
/* End of one column-pair pass: advance B by K * 32 bytes (presumably two
   columns of 16-byte complex doubles) and loop while J > 0. */
slwi T1, K, 5
add B, B, T1
addic. J, J, -1
bgt .LZGEMM_L2_BEGIN
/* All column pairs done: leave through .L999 when N is even. */
andi. T2, N, 1
ble .L999
.LZGEMM_L2_END:
b .LZGEMM_L1_BEGIN
/* Trampoline to .L999; no branch to it is visible in this file (presumably
   kept for conditional branches that cannot reach .L999 directly). */
.L999_H1:
b .L999
/* Single remaining column: executed only when N is odd. */
.LZGEMM_L1_BEGIN:
andi. T1, N, 1
ble .LZGEMM_L1_END
/* CO = current C position, AO = start of A; I = M >> 3 full 8-row tiles. */
mr CO, C
mr AO, A
srawi. I, M, 3
ble .LZGEMM_L1x8_END
/* 1x8 tile, executed I times.  K is consumed in groups of 8:
     L = K >> 3;  L <= 0 -> SUB0 (K < 8),  L == 1 -> SUB4,  L >= 2 -> the
     LOOP_START / LOOP / LOOP_END sequence (8 kernel steps each).
   The LOAD/KERNEL/SAVE macros are defined outside this file. */
.LZGEMM_L1x8_BEGIN:
mr BO, B
srawi. L, K, 3
ble .LZGEMM_L1x8_SUB0
cmpwi cr0, L, 1
ble .LZGEMM_L1x8_SUB4
/* First group of 8 steps; dcbt touches the cache block at AO + PRE. */
.LZGEMM_L1x8_LOOP_START:
dcbt AO, PRE
LOAD1x8_1
dcbt AO, PRE
KERNEL1x8_I1
dcbt AO, PRE
KERNEL1x8_2
dcbt AO, PRE
KERNEL1x8_1
dcbt AO, PRE
KERNEL1x8_2
dcbt AO, PRE
KERNEL1x8_1
dcbt AO, PRE
KERNEL1x8_2
dcbt AO, PRE
KERNEL1x8_1
dcbt AO, PRE
KERNEL1x8_2
/* Account for the first and the last group; the loop runs L - 2 times. */
addic. L, L, -2
ble .LZGEMM_L1x8_LOOP_END
.align 5
/* Steady state: 8 kernel steps per iteration. */
.LZGEMM_L1x8_LOOP:
dcbt AO, PRE
KERNEL1x8_1
dcbt AO, PRE
KERNEL1x8_2
dcbt AO, PRE
KERNEL1x8_1
dcbt AO, PRE
KERNEL1x8_2
dcbt AO, PRE
KERNEL1x8_1
dcbt AO, PRE
KERNEL1x8_2
dcbt AO, PRE
KERNEL1x8_1
dcbt AO, PRE
KERNEL1x8_2
addic. L, L, -1
bgt .LZGEMM_L1x8_LOOP
/* Last group of 8 steps, finished by the _E2 variant. */
.LZGEMM_L1x8_LOOP_END:
dcbt AO, PRE
KERNEL1x8_1
dcbt AO, PRE
KERNEL1x8_2
dcbt AO, PRE
KERNEL1x8_1
dcbt AO, PRE
KERNEL1x8_2
dcbt AO, PRE
KERNEL1x8_1
dcbt AO, PRE
KERNEL1x8_2
dcbt AO, PRE
KERNEL1x8_1
KERNEL1x8_E2
b .LZGEMM_L1x8_SUB1
/* Exactly one group of 8 (L == 1): eight SUB steps, the first being SUBI1. */
.LZGEMM_L1x8_SUB4:
dcbt AO, PRE
KERNEL1x8_SUBI1
dcbt AO, PRE
KERNEL1x8_SUB1
dcbt AO, PRE
KERNEL1x8_SUB1
dcbt AO, PRE
KERNEL1x8_SUB1
KERNEL1x8_SUB1
KERNEL1x8_SUB1
KERNEL1x8_SUB1
KERNEL1x8_SUB1
b .LZGEMM_L1x8_SUB1
/* K < 8: the first step uses SUBI1, the remaining (K & 7) - 1 steps go
   through the SUB2 loop. */
.LZGEMM_L1x8_SUB0:
andi. L, K, 7
KERNEL1x8_SUBI1
addic. L, L, -1
ble .LZGEMM_L1x8_SAVE
b .LZGEMM_L1x8_SUB2
/* Remainder after the groups of 8: L = K & 7. */
.LZGEMM_L1x8_SUB1:
andi. L, K, 7
ble .LZGEMM_L1x8_SAVE
.LZGEMM_L1x8_SUB2:
KERNEL1x8_SUB1
addic. L, L, -1
bgt .LZGEMM_L1x8_SUB2
/* Store the tile and continue with the next 8-row tile. */
.LZGEMM_L1x8_SAVE:
SAVE1x8
addic. I, I, -1
bgt .LZGEMM_L1x8_BEGIN
.LZGEMM_L1x8_END:
/* 1x4 tile for the M remainder.  If M & 7 == 0 all remainder tiles are
   skipped; this tile runs only when bit 2 of M is set.  K is processed in
   groups of 8 with SUB4 (exactly one group), SUB0 (K < 8) and SUB1/SUB2
   (K & 7) paths.  The macros are defined elsewhere. */
.LZGEMM_L1x4_BEGIN:
andi. T2, M, 7
ble .LZGEMM_L1x1_END
andi. T1, M, 4
ble .LZGEMM_L1x4_END
mr BO, B
srawi. L, K, 3
ble .LZGEMM_L1x4_SUB0
cmpwi cr0, L, 1
ble .LZGEMM_L1x4_SUB4
/* First group of 8 steps. */
.LZGEMM_L1x4_LOOP_START:
LOAD1x4_1
KERNEL1x4_I1
KERNEL1x4_2
KERNEL1x4_1
KERNEL1x4_2
KERNEL1x4_1
KERNEL1x4_2
KERNEL1x4_1
KERNEL1x4_2
addic. L, L, -2
ble .LZGEMM_L1x4_LOOP_END
.align 5
/* Steady state: 8 steps per iteration. */
.LZGEMM_L1x4_LOOP:
KERNEL1x4_1
KERNEL1x4_2
KERNEL1x4_1
KERNEL1x4_2
KERNEL1x4_1
KERNEL1x4_2
KERNEL1x4_1
KERNEL1x4_2
addic. L, L, -1
bgt .LZGEMM_L1x4_LOOP
/* Last group of 8 steps, finished by _E2. */
.LZGEMM_L1x4_LOOP_END:
KERNEL1x4_1
KERNEL1x4_2
KERNEL1x4_1
KERNEL1x4_2
KERNEL1x4_1
KERNEL1x4_2
KERNEL1x4_1
KERNEL1x4_E2
b .LZGEMM_L1x4_SUB1
/* Exactly one group of 8. */
.LZGEMM_L1x4_SUB4:
KERNEL1x4_SUBI1
KERNEL1x4_SUB1
KERNEL1x4_SUB1
KERNEL1x4_SUB1
KERNEL1x4_SUB1
KERNEL1x4_SUB1
KERNEL1x4_SUB1
KERNEL1x4_SUB1
b .LZGEMM_L1x4_SUB1
/* K < 8: first step via SUBI1, the rest via the SUB2 loop. */
.LZGEMM_L1x4_SUB0:
andi. L, K, 7
KERNEL1x4_SUBI1
addic. L, L, -1
ble .LZGEMM_L1x4_SAVE
b .LZGEMM_L1x4_SUB2
/* Remaining K & 7 steps. */
.LZGEMM_L1x4_SUB1:
andi. L, K, 7
ble .LZGEMM_L1x4_SAVE
.LZGEMM_L1x4_SUB2:
KERNEL1x4_SUB1
addic. L, L, -1
bgt .LZGEMM_L1x4_SUB2
.LZGEMM_L1x4_SAVE:
SAVE1x4
.LZGEMM_L1x4_END:
/* 1x2 tile for the M remainder; runs only when bit 1 of M is set.  K is
   processed in groups of 8 with SUB4 (exactly one group), SUB0 (K < 8) and
   SUB1/SUB2 (K & 7) paths.  The macros are defined elsewhere. */
.LZGEMM_L1x2_BEGIN:
andi. T1, M, 2
ble .LZGEMM_L1x2_END
mr BO, B
srawi. L, K, 3
ble .LZGEMM_L1x2_SUB0
cmpwi cr0, L, 1
ble .LZGEMM_L1x2_SUB4
/* First group of 8 steps. */
.LZGEMM_L1x2_LOOP_START:
LOAD1x2_1
KERNEL1x2_I1
KERNEL1x2_2
KERNEL1x2_1
KERNEL1x2_2
KERNEL1x2_1
KERNEL1x2_2
KERNEL1x2_1
KERNEL1x2_2
addic. L, L, -2
ble .LZGEMM_L1x2_LOOP_END
.align 5
/* Steady state: 8 steps per iteration. */
.LZGEMM_L1x2_LOOP:
KERNEL1x2_1
KERNEL1x2_2
KERNEL1x2_1
KERNEL1x2_2
KERNEL1x2_1
KERNEL1x2_2
KERNEL1x2_1
KERNEL1x2_2
addic. L, L, -1
bgt .LZGEMM_L1x2_LOOP
/* Last group of 8 steps, finished by _E2. */
.LZGEMM_L1x2_LOOP_END:
KERNEL1x2_1
KERNEL1x2_2
KERNEL1x2_1
KERNEL1x2_2
KERNEL1x2_1
KERNEL1x2_2
KERNEL1x2_1
KERNEL1x2_E2
b .LZGEMM_L1x2_SUB1
/* Exactly one group of 8. */
.LZGEMM_L1x2_SUB4:
KERNEL1x2_SUBI1
KERNEL1x2_SUB1
KERNEL1x2_SUB1
KERNEL1x2_SUB1
KERNEL1x2_SUB1
KERNEL1x2_SUB1
KERNEL1x2_SUB1
KERNEL1x2_SUB1
b .LZGEMM_L1x2_SUB1
/* K < 8: first step via SUBI1, the rest via the SUB2 loop. */
.LZGEMM_L1x2_SUB0:
andi. L, K, 7
KERNEL1x2_SUBI1
addic. L, L, -1
ble .LZGEMM_L1x2_SAVE
b .LZGEMM_L1x2_SUB2
/* Remaining K & 7 steps. */
.LZGEMM_L1x2_SUB1:
andi. L, K, 7
ble .LZGEMM_L1x2_SAVE
.LZGEMM_L1x2_SUB2:
KERNEL1x2_SUB1
addic. L, L, -1
bgt .LZGEMM_L1x2_SUB2
.LZGEMM_L1x2_SAVE:
SAVE1x2
.LZGEMM_L1x2_END:
/* 1x1 tile for the M remainder; runs only when bit 0 of M is set.  K is
   processed in groups of 8 with SUB4 (exactly one group), SUB0 (K < 8) and
   SUB1/SUB2 (K & 7) paths.  The macros are defined elsewhere. */
.LZGEMM_L1x1_BEGIN:
andi. T1, M, 1
ble .LZGEMM_L1x1_END
mr BO, B
srawi. L, K, 3
ble .LZGEMM_L1x1_SUB0
cmpwi cr0, L, 1
ble .LZGEMM_L1x1_SUB4
/* First group of 8 steps. */
.LZGEMM_L1x1_LOOP_START:
LOAD1x1_1
KERNEL1x1_I1
KERNEL1x1_2
KERNEL1x1_1
KERNEL1x1_2
KERNEL1x1_1
KERNEL1x1_2
KERNEL1x1_1
KERNEL1x1_2
addic. L, L, -2
ble .LZGEMM_L1x1_LOOP_END
.align 5
/* Steady state: 8 steps per iteration. */
.LZGEMM_L1x1_LOOP:
KERNEL1x1_1
KERNEL1x1_2
KERNEL1x1_1
KERNEL1x1_2
KERNEL1x1_1
KERNEL1x1_2
KERNEL1x1_1
KERNEL1x1_2
addic. L, L, -1
bgt .LZGEMM_L1x1_LOOP
/* Last group of 8 steps, finished by _E2. */
.LZGEMM_L1x1_LOOP_END:
KERNEL1x1_1
KERNEL1x1_2
KERNEL1x1_1
KERNEL1x1_2
KERNEL1x1_1
KERNEL1x1_2
KERNEL1x1_1
KERNEL1x1_E2
b .LZGEMM_L1x1_SUB1
/* Exactly one group of 8. */
.LZGEMM_L1x1_SUB4:
KERNEL1x1_SUBI1
KERNEL1x1_SUB1
KERNEL1x1_SUB1
KERNEL1x1_SUB1
KERNEL1x1_SUB1
KERNEL1x1_SUB1
KERNEL1x1_SUB1
KERNEL1x1_SUB1
b .LZGEMM_L1x1_SUB1
/* K < 8: first step via SUBI1, the rest via the SUB2 loop. */
.LZGEMM_L1x1_SUB0:
andi. L, K, 7
KERNEL1x1_SUBI1
addic. L, L, -1
ble .LZGEMM_L1x1_SAVE
b .LZGEMM_L1x1_SUB2
/* Remaining K & 7 steps. */
.LZGEMM_L1x1_SUB1:
andi. L, K, 7
ble .LZGEMM_L1x1_SAVE
.LZGEMM_L1x1_SUB2:
KERNEL1x1_SUB1
addic. L, L, -1
bgt .LZGEMM_L1x1_SUB2
.LZGEMM_L1x1_SAVE:
SAVE1x1
/* End of the single-column section; control falls through to whatever
   follows the #include of this file. */
.LZGEMM_L1x1_END:
.LZGEMM_L1_END:

File diff suppressed because it is too large Load Diff

View File

@ -170,6 +170,11 @@
#define PREFETCHSIZE_C 24
#endif
/* POWER8 values for PREFETCHSIZE_A / PREFETCHSIZE_C (presumably prefetch
   distances used by this kernel). */
#ifdef POWER8
#define PREFETCHSIZE_A 24
#define PREFETCHSIZE_C 24
#endif
#ifndef XCONJ
#define FMADDR FMADD
#define FMSUBR FNMSUB

View File

@ -144,6 +144,12 @@
#define PREFETCHSIZE_C 8
#endif
/* POWER8 values for PREFETCHSIZE_A / PREFETCHSIZE_C (presumably prefetch
   distances used by this kernel). */
#ifdef POWER8
#define PREFETCHSIZE_A 24
#define PREFETCHSIZE_C 8
#endif
#if !(defined(CONJ) && defined(XCONJ))
#define FMADDR FMADD
#define FMSUBR FNMSUB

View File

@ -0,0 +1,377 @@
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2016/03/05 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#define ASSEMBLER
#include "common.h"
#include "def_vsx.h"
/* Width-dependent load mnemonic: lwz on 32-bit builds, ld on 64-bit builds. */
#ifndef __64BIT__
#define LOAD lwz
#else
#define LOAD ld
#endif
/* Stack frame layout.  STACKSIZE is the number of bytes the prologue
   subtracts from SP; the remaining macros are SP-relative slots for the
   real and imaginary parts of alpha and for a zero word.  The bottom of
   the frame (offset 0 upwards) holds the saved f14-f31 and the saved
   general-purpose registers r13-r31. */
#ifdef __64BIT__
#define STACKSIZE 320
#define ALPHA_R_SP 296(SP)
#define ALPHA_I_SP 304(SP)
#define FZERO 312(SP)
#else
#define STACKSIZE 256
#define ALPHA_R_SP 224(SP)
#define ALPHA_I_SP 232(SP)
#define FZERO 240(SP)
#endif
/* Symbolic names for the registers used by this kernel. */
/* Problem dimensions (presumably the first three integer arguments). */
#define M r3
#define N r4
#define K r5
/* Registers holding A, B, C, LDC and OFFSET differ per OS and word size. */
#ifdef linux
#ifndef __64BIT__
#define A r6
#define B r7
#define C r8
#define LDC r9
#define OFFSET r10
#else
/* 64-bit: LDC (and OFFSET for TRMM) are fetched from the caller's frame
   into r6/r7 by the kernel prologue. */
#define A r8
#define B r9
#define C r10
#define LDC r6
#define OFFSET r7
#endif
#endif
#if defined(_AIX) || defined(__APPLE__)
#if !defined(__64BIT__) && defined(DOUBLE)
/* 32-bit double precision: B, C and LDC are loaded from the stack by the
   prologue. */
#define A r10
#define B r6
#define C r7
#define LDC r8
#define OFFSET r9
#else
#define A r8
#define B r9
#define C r10
#define LDC r6
#define OFFSET r7
#endif
#endif
/* o0 is a literal zero offset; o8..o48 below are index registers that the
   kernel body loads with the constants 8, 16, 24, 32 and 48. */
#define o0 0
/* VSX registers that receive alpha (real / imaginary part). */
#define alpha_r vs30
#define alpha_i vs31
/* KKK and K1 are extra registers for the TRMM logic file (their use is not
   visible here).
   NOTE(review): r13 is the thread pointer in the 64-bit PowerPC ELF ABI; it
   is saved and restored by this kernel but holds another value while the
   kernel runs - confirm this is safe on the targeted platforms. */
#define KKK r13
#define K1 r14
/* L, I, J: loop counters.  AO/BO/CO: working pointers derived from A, B, C.
   T1/T2: temporaries.  ALPHA: address of the alpha slot on the stack.
   PRE: prefetch distance.  KK: negated OFFSET when LEFT is not defined. */
#define L r15
#define ALPHA r16
#define o24 r17
#define T2 r19
#define KK r20
#define o8 r21
#define I r22
#define J r23
#define AO r24
#define BO r25
#define CO r26
#define o16 r27
#define o32 r28
#define o48 r29
#define PRE r30
#define T1 r31
/* Kernel entry point.  Saves the callee-saved registers, sets up constants
   and alpha, includes the macro and logic files that form the body, then
   restores everything and returns 0 in r3.  The whole body is omitted when
   NEEDPARAM is defined. */
#ifndef NEEDPARAM
PROLOGUE
PROFCODE
/* Allocate the stack frame; r0 = 0 is stored to FZERO further down. */
addi SP, SP, -STACKSIZE
li r0, 0
/* Save floating-point registers f14-f31 at offsets 0..136. */
stfd f14, 0(SP)
stfd f15, 8(SP)
stfd f16, 16(SP)
stfd f17, 24(SP)
stfd f18, 32(SP)
stfd f19, 40(SP)
stfd f20, 48(SP)
stfd f21, 56(SP)
stfd f22, 64(SP)
stfd f23, 72(SP)
stfd f24, 80(SP)
stfd f25, 88(SP)
stfd f26, 96(SP)
stfd f27, 104(SP)
stfd f28, 112(SP)
stfd f29, 120(SP)
stfd f30, 128(SP)
stfd f31, 136(SP)
/* Save general-purpose registers r13-r31 (8-byte slots on 64-bit, 4-byte
   slots on 32-bit), starting at offset 144. */
#ifdef __64BIT__
std r31, 144(SP)
std r30, 152(SP)
std r29, 160(SP)
std r28, 168(SP)
std r27, 176(SP)
std r26, 184(SP)
std r25, 192(SP)
std r24, 200(SP)
std r23, 208(SP)
std r22, 216(SP)
std r21, 224(SP)
std r20, 232(SP)
std r19, 240(SP)
std r18, 248(SP)
std r17, 256(SP)
std r16, 264(SP)
std r15, 272(SP)
std r14, 280(SP)
std r13, 288(SP)
#else
stw r31, 144(SP)
stw r30, 148(SP)
stw r29, 152(SP)
stw r28, 156(SP)
stw r27, 160(SP)
stw r26, 164(SP)
stw r25, 168(SP)
stw r24, 172(SP)
stw r23, 176(SP)
stw r22, 180(SP)
stw r21, 184(SP)
stw r20, 188(SP)
stw r19, 192(SP)
stw r18, 196(SP)
stw r17, 200(SP)
stw r16, 204(SP)
stw r15, 208(SP)
stw r14, 212(SP)
stw r13, 216(SP)
#endif
/* Spill f1/f2 into the alpha slots so they can be reloaded into VSX
   registers below; store the 32-bit zero from r0 at FZERO. */
stfd f1, ALPHA_R_SP
stfd f2, ALPHA_I_SP
stw r0, FZERO
/* Fetch the arguments that did not arrive in registers from the caller's
   frame.  FRAMESLOT is defined outside this file (presumably in common.h). */
#ifdef linux
#ifdef __64BIT__
ld LDC, FRAMESLOT(0) + STACKSIZE(SP)
#endif
#endif
#if defined(_AIX) || defined(__APPLE__)
#ifdef __64BIT__
ld LDC, FRAMESLOT(0) + STACKSIZE(SP)
#else
#ifdef DOUBLE
lwz B, FRAMESLOT(0) + STACKSIZE(SP)
lwz C, FRAMESLOT(1) + STACKSIZE(SP)
lwz LDC, FRAMESLOT(2) + STACKSIZE(SP)
#else
lwz LDC, FRAMESLOT(0) + STACKSIZE(SP)
#endif
#endif
#endif
/* With TRMMKERNEL defined, load OFFSET from the caller's frame and, when
   LEFT is not defined, set KK = -OFFSET. */
#ifdef TRMMKERNEL
#if defined(linux) && defined(__64BIT__)
ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
#endif
#if defined(_AIX) || defined(__APPLE__)
#ifdef __64BIT__
ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
#else
#ifdef DOUBLE
lwz OFFSET, FRAMESLOT(3) + STACKSIZE(SP)
#else
lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
#endif
#endif
#endif
#if defined(TRMMKERNEL) && !defined(LEFT)
neg KK, OFFSET
#endif
#endif
/* Macro definitions used by the logic file included below. */
#include "zgemm_macros_8x2_power8.S"
/* Nothing to do if any of M, N, K is <= 0 (32-bit signed compares). */
cmpwi cr0, M, 0
ble .L999
cmpwi cr0, N, 0
ble .L999
cmpwi cr0, K, 0
ble .L999
/* Scale LDC by 2^ZBASE_SHIFT (presumably elements -> bytes), then load the
   prefetch distance and the constant index registers. */
slwi LDC, LDC, ZBASE_SHIFT
li PRE, 256
li o8 , 8
li o16 , 16
li o24 , 24
li o32 , 32
li o48 , 48
/* ALPHA = address of the alpha_r slot (same offsets as ALPHA_R_SP). */
#ifdef __64BIT__
addi ALPHA, SP, 296
#else
addi ALPHA, SP, 224
#endif
/* Scalar doubleword loads of alpha (no splat); alpha_i sits 8 bytes after
   alpha_r. */
lxsdx alpha_r, 0, ALPHA
lxsdx alpha_i, o8, ALPHA
.align 4
/* Loop structure over N, M and K for the TRMM variant. */
#include "ztrmm_logic_8x2_power8.S"
/* Common exit: set r3 = 0, restore saved registers, release the frame. */
.L999:
addi r3, 0, 0
lfd f14, 0(SP)
lfd f15, 8(SP)
lfd f16, 16(SP)
lfd f17, 24(SP)
lfd f18, 32(SP)
lfd f19, 40(SP)
lfd f20, 48(SP)
lfd f21, 56(SP)
lfd f22, 64(SP)
lfd f23, 72(SP)
lfd f24, 80(SP)
lfd f25, 88(SP)
lfd f26, 96(SP)
lfd f27, 104(SP)
lfd f28, 112(SP)
lfd f29, 120(SP)
lfd f30, 128(SP)
lfd f31, 136(SP)
#ifdef __64BIT__
ld r31, 144(SP)
ld r30, 152(SP)
ld r29, 160(SP)
ld r28, 168(SP)
ld r27, 176(SP)
ld r26, 184(SP)
ld r25, 192(SP)
ld r24, 200(SP)
ld r23, 208(SP)
ld r22, 216(SP)
ld r21, 224(SP)
ld r20, 232(SP)
ld r19, 240(SP)
ld r18, 248(SP)
ld r17, 256(SP)
ld r16, 264(SP)
ld r15, 272(SP)
ld r14, 280(SP)
ld r13, 288(SP)
#else
lwz r31, 144(SP)
lwz r30, 148(SP)
lwz r29, 152(SP)
lwz r28, 156(SP)
lwz r27, 160(SP)
lwz r26, 164(SP)
lwz r25, 168(SP)
lwz r24, 172(SP)
lwz r23, 176(SP)
lwz r22, 180(SP)
lwz r21, 184(SP)
lwz r20, 188(SP)
lwz r19, 192(SP)
lwz r18, 196(SP)
lwz r17, 200(SP)
lwz r16, 204(SP)
lwz r15, 208(SP)
lwz r14, 212(SP)
lwz r13, 216(SP)
#endif
addi SP, SP, STACKSIZE
blr
EPILOGUE
#endif

File diff suppressed because it is too large Load Diff

37
param.h
View File

@ -1959,6 +1959,43 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
/* Tuning parameters selected when building for POWER8. */
#if defined(POWER8)
#define SNUMOPT 4
#define DNUMOPT 8
/* Buffer offsets and alignment mask for the GEMM work areas. */
#define GEMM_DEFAULT_OFFSET_A 384
#define GEMM_DEFAULT_OFFSET_B 1024
#define GEMM_DEFAULT_ALIGN 0x03fffUL
/* Tile sizes of the compute kernels (M x N); the ZGEMM values match the
   8x2 POWER8 zgemm/ztrmm kernels. */
#define SGEMM_DEFAULT_UNROLL_M 4
#define SGEMM_DEFAULT_UNROLL_N 4
#define DGEMM_DEFAULT_UNROLL_M 16
#define DGEMM_DEFAULT_UNROLL_N 4
#define CGEMM_DEFAULT_UNROLL_M 2
#define CGEMM_DEFAULT_UNROLL_N 2
#define ZGEMM_DEFAULT_UNROLL_M 8
#define ZGEMM_DEFAULT_UNROLL_N 2
/* P / Q / R values (presumably the cache-blocking sizes used by the GEMM
   driver - confirm against the driver code). */
#define SGEMM_DEFAULT_P 992
#define DGEMM_DEFAULT_P 480
#define CGEMM_DEFAULT_P 488
#define ZGEMM_DEFAULT_P 240
#define SGEMM_DEFAULT_Q 504
#define DGEMM_DEFAULT_Q 720
#define CGEMM_DEFAULT_Q 400
#define ZGEMM_DEFAULT_Q 360
/* NOTE(review): no CGEMM_DEFAULT_R is defined here although the S, D and Z
   variants are - confirm that the fallback for CGEMM is intended. */
#define SGEMM_DEFAULT_R 28800
#define DGEMM_DEFAULT_R 14400
#define ZGEMM_DEFAULT_R 7200
#define SYMV_P 8
#endif
#if defined(SPARC) && defined(V7)
#define SNUMOPT 4