Merge pull request #2172 from quickwritereader/develop

POWER9 cgemm/ctrmm kernels; new sgemm 8x16 kernel
commit 6b6c9b1441
Martin Kroeker, 2019-07-01 21:06:02 +02:00 (committed by GitHub)
8 changed files with 6422 additions and 257 deletions


@@ -5,7 +5,7 @@
STRMMKERNEL = sgemm_kernel_power9.S
DTRMMKERNEL = dgemm_kernel_power9.S
CTRMMKERNEL = ctrmm_kernel_8x4_power8.S
CTRMMKERNEL = cgemm_kernel_power9.S
ZTRMMKERNEL = zgemm_kernel_power9.S
SGEMMKERNEL = sgemm_kernel_power9.S
@@ -28,9 +28,9 @@ DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
CGEMMKERNEL = cgemm_kernel_8x4_power8.S
CGEMMKERNEL = cgemm_kernel_power9.S
CGEMMINCOPY = ../generic/zgemm_ncopy_8.c
CGEMMITCOPY = cgemm_tcopy_8_power8.S
CGEMMITCOPY = ../generic/zgemm_tcopy_8.c
CGEMMONCOPY = ../generic/zgemm_ncopy_4.c
CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
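The Makefile hunk above routes CTRMM and CGEMM on POWER9 to the new cgemm_kernel_power9.S and swaps the POWER8 assembly transposed copy (cgemm_tcopy_8_power8.S) for the generic C version ../generic/zgemm_tcopy_8.c. As a reminder of what these copy files do, here is a minimal C sketch of a panel-packing routine; it is purely illustrative, not the actual OpenBLAS source, and the 8-row strip layout and the cfloat type are assumptions.

/* Purely illustrative panel-packing sketch -- NOT the actual
 * ../generic/zgemm_tcopy_8.c.  It only shows the idea behind the
 * *copy routines: gather strips of a column-major complex matrix
 * into a contiguous buffer the GEMM kernel can stream linearly.  */
#include <stddef.h>

typedef struct { float r, i; } cfloat;   /* assumed complex type */

static void pack_strips_of_8(size_t m, size_t n,
                             const cfloat *a, size_t lda, cfloat *buffer)
{
    size_t i = 0;
    for (; i + 8 <= m; i += 8)           /* full 8-row strips        */
        for (size_t j = 0; j < n; j++)   /* walk across the columns  */
            for (size_t k = 0; k < 8; k++)
                *buffer++ = a[(i + k) + j * lda];
    for (; i < m; i++)                   /* remainder rows (m % 8)   */
        for (size_t j = 0; j < n; j++)
            *buffer++ = a[i + j * lda];
}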


@@ -0,0 +1,293 @@
/***************************************************************************
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* Abdelrauf(quickwritereader@gmail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/
#define ASSEMBLER
#include "common.h"
#include "def_vsx.h"
#define LOAD ld
#define STACKSIZE (512 )
#define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */
#define M r3
#define N r4
#define K r5
#define A r8
#define B r9
#define C r10
#define LDC r6
#define OFFSET r7
#define alpha_r vs19
#define alpha_i vs20
#define save_permute_1 vs21
#define permute_mask vs22
#define o0 0
#define T1 r11
#define T2 r12
#define T3 r14
#define T4 r15
#define T5 r16
#define T6 r17
#define L r18
#define T7 r19
#define T8 r20
#define TEMP_REG r21
#define I r22
#define J r23
#define AO r24
#define BO r25
#define CO r26
#define T9 r27
#define T10 r28
#define PRE r29
#define T12 r30
#define T13 r31
#include "cgemm_macros_power9.S"
.equ perm_const1, 0x0405060700010203
.equ perm_const2, 0x0c0d0e0f08090a0b
.equ save_permute_12, 0x0c0d0e0f1c1d1e1f
.equ save_permute_11, 0x0405060714151617
#ifndef NEEDPARAM
PROLOGUE
PROFCODE
addi SP, SP, -STACKSIZE
mflr r0
stfd f14, 0(SP)
stfd f15, 8(SP)
stfd f16, 16(SP)
stfd f17, 24(SP)
stfd f18, 32(SP)
stfd f19, 40(SP)
stfd f20, 48(SP)
stfd f21, 56(SP)
stfd f22, 64(SP)
stfd f23, 72(SP)
stfd f24, 80(SP)
stfd f25, 88(SP)
stfd f26, 96(SP)
stfd f27, 104(SP)
stfd f28, 112(SP)
stfd f29, 120(SP)
stfd f30, 128(SP)
stfd f31, 136(SP)
std r31, 144(SP)
std r30, 152(SP)
std r29, 160(SP)
std r28, 168(SP)
std r27, 176(SP)
std r26, 184(SP)
std r25, 192(SP)
std r24, 200(SP)
std r23, 208(SP)
std r22, 216(SP)
std r21, 224(SP)
std r20, 232(SP)
std r19, 240(SP)
std r18, 248(SP)
std r17, 256(SP)
std r16, 264(SP)
std r15, 272(SP)
std r14, 280(SP)
stxv vs52, 288(SP)
stxv vs53, 304(SP)
stxv vs54, 320(SP)
stxv vs55, 336(SP)
stxv vs56, 352(SP)
stxv vs57, 368(SP)
stxv vs58, 384(SP)
stxv vs59, 400(SP)
stxv vs60, 416(SP)
stxv vs61, 432(SP)
stxv vs62, 448(SP)
stxv vs63, 464(SP)
std r0, FLINK_SAVE(SP)
ld LDC, FRAMESLOT(0) + STACKSIZE(SP)
#ifdef TRMMKERNEL
ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
#endif
slwi LDC, LDC, ZBASE_SHIFT
/*alpha is stored in f1. convert to single and splat*/
xscvdpspn alpha_r,vs1
xscvdpspn alpha_i,vs2
xxspltw alpha_r,alpha_r,0
xxspltw alpha_i,alpha_i,0
/*load reverse permute mask for big endian
uint128 = 0x0c0d0e0f08090a0b0405060700010203
*/
lis T2, perm_const2@highest
lis T1, perm_const1@highest
lis T3, save_permute_12@highest
lis T4, save_permute_11@highest
ori T2, T2, perm_const2@higher
ori T1, T1, perm_const1@higher
ori T3, T3, save_permute_12@higher
ori T4, T4, save_permute_11@higher
rldicr T2, T2, 32, 31
rldicr T1, T1, 32, 31
rldicr T3, T3, 32, 31
rldicr T4, T4, 32, 31
oris T2, T2, perm_const2@h
oris T1, T1, perm_const1@h
oris T3, T3, save_permute_12@h
oris T4, T4, save_permute_11@h
ori T2, T2, perm_const2@l
ori T1, T1, perm_const1@l
ori T3, T3, save_permute_12@l
ori T4, T4, save_permute_11@l
li r0,0
li PRE,512
#if defined(CC) || defined(CR) || defined(RC) || defined(RR)
/*negate for this case as we will use addition -1*(a+b) */
xvnegsp alpha_r,alpha_r
xvnegsp alpha_i,alpha_i
#endif
mtvsrdd permute_mask,T2,T1
mtvsrdd save_permute_1,T3,T4
/*mask is reverse permute so we have to make it inner permute */
xxpermdi permute_mask, permute_mask, permute_mask,2
#include "cgemm_logic_power9.S"
.L999:
lfd f14, 0(SP)
lfd f15, 8(SP)
lfd f16, 16(SP)
lfd f17, 24(SP)
lfd f18, 32(SP)
lfd f19, 40(SP)
lfd f20, 48(SP)
lfd f21, 56(SP)
lfd f22, 64(SP)
lfd f23, 72(SP)
lfd f24, 80(SP)
lfd f25, 88(SP)
lfd f26, 96(SP)
lfd f27, 104(SP)
lfd f28, 112(SP)
lfd f29, 120(SP)
lfd f30, 128(SP)
lfd f31, 136(SP)
ld r31, 144(SP)
ld r30, 152(SP)
ld r29, 160(SP)
ld r28, 168(SP)
ld r27, 176(SP)
ld r26, 184(SP)
ld r25, 192(SP)
ld r24, 200(SP)
ld r23, 208(SP)
ld r22, 216(SP)
ld r21, 224(SP)
ld r20, 232(SP)
ld r19, 240(SP)
ld r18, 248(SP)
ld r17, 256(SP)
ld r16, 264(SP)
ld r15, 272(SP)
ld r14, 280(SP)
ld r0, FLINK_SAVE(SP)
lxv vs52, 288(SP)
lxv vs53, 304(SP)
lxv vs54, 320(SP)
lxv vs55, 336(SP)
lxv vs56, 352(SP)
lxv vs57, 368(SP)
lxv vs58, 384(SP)
lxv vs59, 400(SP)
mtlr r0
lxv vs60, 416(SP)
lxv vs61, 432(SP)
lxv vs62, 448(SP)
lxv vs63, 464(SP)
addi SP, SP, STACKSIZE
blr
EPILOGUE
#endif
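The prologue above materializes each 64-bit permute/save constant with a lis/ori/rldicr/oris/ori sequence because PowerPC instructions carry only 16-bit immediates; mtvsrdd then pairs two such doublewords into one 128-bit VSX register. A small C model of that five-instruction sequence, for reference only:

#include <stdint.h>
#include <stdio.h>

/* C model of the lis/ori/rldicr/oris/ori sequence: a 64-bit constant
 * is assembled from four 16-bit pieces (@highest, @higher, @h, @l).  */
static uint64_t build_const64(uint64_t c)
{
    uint64_t highest = (c >> 48) & 0xffff;   /* lis   Tn, c@highest    */
    uint64_t higher  = (c >> 32) & 0xffff;   /* ori   Tn, Tn, c@higher */
    uint64_t high    = (c >> 16) & 0xffff;   /* oris  Tn, Tn, c@h      */
    uint64_t low     =  c        & 0xffff;   /* ori   Tn, Tn, c@l      */

    uint64_t r = (highest << 16) | higher;   /* lower 32 bits so far   */
    r <<= 32;                                /* rldicr Tn, Tn, 32, 31  */
    return r | (high << 16) | low;
}

int main(void)
{
    /* reproduces perm_const1 from the kernel above */
    printf("%#018llx\n",
           (unsigned long long)build_const64(0x0405060700010203ULL));
    return 0;
}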

File diff suppressed because it is too large.

File diff suppressed because it is too large.


@@ -3,89 +3,89 @@ b L8
MY_ALIGN
LSGEMM_L8x16_LMAIN_SUB:
LOAD8x16_0
mtctr L
LOAD8x16_2
MY_ALIGN
LSGEMM_L8x16_LOOP:
KERNEL8x16_I1_L4_2 64,32, 0,0
KERNEL8x16_I1_L4_2 64,32, 1,0
KERNEL8x16_I1_L4_2 64,32, 2,0
KERNEL8x16_I1_L4_2 64,32, 3,0
KERNEL8x16_I1_L4_2 64,32, 4,0
KERNEL8x16_I1_L4_2 64,32, 5,0
KERNEL8x16_I1_L4_2 64,32, 6,0
KERNEL8x16_I1_L4_2 64,32, 7,0
KERNEL8x16_I1_L4_2 64,32, 8,0
KERNEL8x16_I1_L4_2 64,32, 9,0
KERNEL8x16_I1_L4_2 64,32, 10,0
KERNEL8x16_I1_L4_2 64,32, 11,0
KERNEL8x16_I1_L4_2 64,32, 12,0
KERNEL8x16_I1_L4_2 64,32, 13,0
KERNEL8x16_I1_L4_2 64,32, 14,0
KERNEL8x16_I1_L4_2 64,32, 15,0
KERNEL8x16_I1_L4_2 64,32, 16,0
KERNEL8x16_I1_L4_2 64,32, 17,0
KERNEL8x16_I1_L4_2 64,32, 18,0
KERNEL8x16_I1_L4_2 64,32, 19,0
KERNEL8x16_I1_L4_2 64,32, 20,0
KERNEL8x16_I1_L4_2 64,32, 21,0
KERNEL8x16_I1_L4_2 64,32, 22,0
KERNEL8x16_I1_L4_2 64,32, 23,0
KERNEL8x16_I1_L4_2 64,32, 24,0
KERNEL8x16_I1_L4_2 64,32, 25,0
KERNEL8x16_I1_L4_2 64,32, 26,0
KERNEL8x16_I1_L4_2 64,32, 27,0
KERNEL8x16_I1_L4_2 64,32, 28,0
KERNEL8x16_I1_L4_2 64,32, 29,0
KERNEL8x16_I1_L4_2 64,32, 30,0
KERNEL8x16_I1_L4_2 64,32, 31,1
KERNEL8x16_L2 128,64,0,0
LSGEMM_L8x16_K128:
KERNEL8x16_L2 128,64,1,0
KERNEL8x16_I1_L4_2 128,64, 1,0
KERNEL8x16_I1_L4_2 128,64, 2,0
KERNEL8x16_I1_L4_2 128,64, 3,0
KERNEL8x16_I1_L4_2 128,64, 4,0
KERNEL8x16_I1_L4_2 128,64, 5,0
KERNEL8x16_I1_L4_2 128,64, 6,0
KERNEL8x16_I1_L4_2 128,64, 7,0
KERNEL8x16_I1_L4_2 128,64, 8,0
KERNEL8x16_I1_L4_2 128,64, 9,0
KERNEL8x16_I1_L4_2 128,64, 10,0
KERNEL8x16_I1_L4_2 128,64, 11,0
KERNEL8x16_I1_L4_2 128,64, 12,0
KERNEL8x16_I1_L4_2 128,64, 13,0
KERNEL8x16_I1_L4_2 128,64, 14,0
KERNEL8x16_I1_L4_2 128,64, 15,0
KERNEL8x16_I1_L4_2 128,64, 16,0
KERNEL8x16_I1_L4_2 128,64, 17,0
KERNEL8x16_I1_L4_2 128,64, 18,0
KERNEL8x16_I1_L4_2 128,64, 19,0
KERNEL8x16_I1_L4_2 128,64, 20,0
KERNEL8x16_I1_L4_2 128,64, 21,0
KERNEL8x16_I1_L4_2 128,64, 22,0
KERNEL8x16_I1_L4_2 128,64, 23,0
KERNEL8x16_I1_L4_2 128,64, 24,0
KERNEL8x16_I1_L4_2 128,64, 25,0
KERNEL8x16_I1_L4_2 128,64, 26,0
KERNEL8x16_I1_L4_2 128,64, 27,0
KERNEL8x16_I1_L4_2 128,64, 28,0
KERNEL8x16_I1_L4_2 128,64, 29,0
KERNEL8x16_I1_L4_2 128,64, 30,0
KERNEL8x16_I1_L4_2 128,64, 31,1
bdnz LSGEMM_L8x16_LOOP
MY_ALIGN
LSGEMM_L8x16_LOOP_END:
END8x16 0, AO, BO, 64, 32
END8x16_2
blr
MY_ALIGN
LSGEMM_L8x16_L64_SUB:
LOAD8x16_0
KERNEL8x16_I1_L4_2 64,32, 0,0
KERNEL8x16_I1_L4_2 64,32, 1,0
KERNEL8x16_I1_L4_2 64,32, 2,0
KERNEL8x16_I1_L4_2 64,32, 3,0
KERNEL8x16_I1_L4_2 64,32, 4,0
KERNEL8x16_I1_L4_2 64,32, 5,0
KERNEL8x16_I1_L4_2 64,32, 6,0
KERNEL8x16_I1_L4_2 64,32, 7,0
KERNEL8x16_I1_L4_2 64,32, 8,0
KERNEL8x16_I1_L4_2 64,32, 9,0
KERNEL8x16_I1_L4_2 64,32, 10,0
KERNEL8x16_I1_L4_2 64,32, 11,0
KERNEL8x16_I1_L4_2 64,32, 12,0
KERNEL8x16_I1_L4_2 64,32, 13,0
KERNEL8x16_I1_L4_2 64,32, 14,0
KERNEL8x16_I1_L4_3 64,32, 15,1
LOAD8x16_2
KERNEL8x16_I1_L4_2 128,64, 0,0
KERNEL8x16_I1_L4_2 128,64, 1,0
KERNEL8x16_I1_L4_2 128,64, 2,0
KERNEL8x16_I1_L4_2 128,64,3,0
KERNEL8x16_I1_L4_2 128,64,4,0
KERNEL8x16_I1_L4_2 128,64,5,0
KERNEL8x16_I1_L4_2 128,64,6,0
KERNEL8x16_I1_L4_2 128,64,7,0
KERNEL8x16_I1_L4_2 128,64,8,0
KERNEL8x16_I1_L4_2 128,64,9,0
KERNEL8x16_I1_L4_2 128,64,10,0
KERNEL8x16_I1_L4_2 128,64,11,0
KERNEL8x16_I1_L4_2 128,64,12,0
KERNEL8x16_I1_L4_2 128,64,13,0
KERNEL8x16_I1_L4_2 128,64,14,0
KERNEL8x16_I1_L4_3 128,64,15,1
blr
LSGEMM_L8x16_L32_SUB:
LOAD8x16_0
KERNEL8x16_I1_L4_2 64,32, 0,0
KERNEL8x16_I1_L4_2 64,32, 1,0
KERNEL8x16_I1_L4_2 64,32, 2,0
KERNEL8x16_I1_L4_2 64,32, 3,0
KERNEL8x16_I1_L4_2 64,32, 4,0
KERNEL8x16_I1_L4_2 64,32, 5,0
KERNEL8x16_I1_L4_2 64,32, 6,0
KERNEL8x16_I1_L4_3 64,32, 7,1
LOAD8x16_2
KERNEL8x16_I1_L4_2 128,64,0,0
KERNEL8x16_I1_L4_2 128,64,1,0
KERNEL8x16_I1_L4_2 128,64,2,0
KERNEL8x16_I1_L4_2 128,64,3,0
KERNEL8x16_I1_L4_2 128,64,4,0
KERNEL8x16_I1_L4_2 128,64,5,0
KERNEL8x16_I1_L4_2 128,64,6,0
KERNEL8x16_I1_L4_3 128,64,7,1
blr
LSGEMM_L8x16_L16_SUB:
LOAD8x16_0
KERNEL8x16_I1_L4_2 64,32, 0,0
KERNEL8x16_I1_L4_2 64,32, 1,0
KERNEL8x16_I1_L4_2 64,32, 2,0
KERNEL8x16_I1_L4_3 64,32, 3,1
LOAD8x16_2
KERNEL8x16_I1_L4_2 128,64,0,0
KERNEL8x16_I1_L4_2 128,64,1,0
KERNEL8x16_I1_L4_2 128,64,2,0
KERNEL8x16_I1_L4_3 128,64,3,1
blr
L8:
@@ -127,15 +127,16 @@ LSGEMM_L8x16_BEGIN:
#if defined(TRMMKERNEL)
REFRESH_TEMP_BK T11,K,TEMP_REG,16,8
mr T12, T11
addi T12,T12, -1
srawi. L, T12, 7 /**(T11-1) % 128x */
addi T12,T12, -2
srawi. L, T12, 7 /**(T11-2) % 128x */
#else
mr T12, K
addi T12,T12, -1
srawi. L, T12, 7 /**(K-1) % 128x */
addi T12,T12, -2
srawi. L, T12, 7 /**(K-2) % 128x */
#endif
ZERO8x16
ZERO8x16
mtctr L
ble LSGEMM_L8x16_SUB0
bl LSGEMM_L8x16_LMAIN_SUB
andi. L, T12, 127
@@ -148,15 +149,33 @@ LSGEMM_L8x16_SUB0:
cmpwi T11,128
#else
andi. L, K, 255
cmpwi K,129
#endif
li T10,1
bne CMP8x16_128K
addi BO,BO,-32
addi AO,AO,-64
LOAD8x16 64,32
END8x16_WITHOUT_ADD
LOAD8x16_2O AO,BO, 128, 64
mtctr T10
bl LSGEMM_L8x16_K128
b LSGEMM_L8x16_SAVE
CMP8x16_128K:
/*----------------------------------------*/
#if defined(TRMMKERNEL)
cmpwi T11,128
#else
cmpwi K,128
#endif
bne LSGEMM_L8x16_SUB2
MY_ALIGN
LSGEMM_L8x16_SUB2_128:
bl LSGEMM_L8x16_L64_SUB
bl LSGEMM_L8x16_L64_SUB
b LSGEMM_L8x16_SAVE
#endif
bne LSGEMM_L8x16_SUB2
MY_ALIGN
mtctr T10
addi BO,BO,-64
addi AO,AO,-128
LOAD8x16_2O AO,BO, 128,64
bl LSGEMM_L8x16_K128
b LSGEMM_L8x16_SAVE
MY_ALIGN
LSGEMM_L8x16_SUB2:
andi. T10,L,64
@@ -176,21 +195,21 @@ LSGEMM_L8x16_SUB2_16:
LSGEMM_L8x16_SUB2_8:
andi. T10,L, 8
ble LSGEMM_L8x16_SUB2_4
LOAD8x16_0
KERNEL8x16_I1_L4_2 64,32, 0,0
KERNEL8x16_I1_L4_3 64,32, 1,1
LOAD8x16_2
KERNEL8x16_I1_L4_2 128,64, 0,0
KERNEL8x16_I1_L4_3 128,64, 1,1
MY_ALIGN
LSGEMM_L8x16_SUB2_4:
andi. T10,L, 4
ble LSGEMM_L8x16_SUB2_2
LOAD8x16_0
KERNEL8x16_I1_L4_3 64,32, 0,1
LOAD8x16_2
KERNEL8x16_I1_L4_3 128,64, 0,1
MY_ALIGN
LSGEMM_L8x16_SUB2_2:
andi. T10,L, 2
ble LSGEMM_L8x16_SUB2_1
LOAD8x16_0
KERNEL8x16_I1_L2_3 64,32, 0,1
LOAD8x16_2
KERNEL8x16_E2 128,64, 0,1
MY_ALIGN
LSGEMM_L8x16_SUB2_1:
andi. T10,L, 1
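In the sgemm logic above, the main 8x16 loop now preloads two K-steps up front (LOAD8x16_2 / LOAD8x16_2O) and drains them after the loop (END8x16_2 / KERNEL8x16_E2), which is why the trip count is now derived from K-2 instead of K-1. A scalar C sketch of that two-stage software pipeline, with illustrative names and scalar math standing in for the vector FMAs (assumes an even k >= 2):

/* Conceptual two-stage software pipeline: two steps are loaded ahead,
 * the steady-state loop multiplies data fetched previously while
 * loading the next pair, and the tail finishes without further loads. */
static float dot_pipelined(const float *a, const float *b, int k)
{
    float acc = 0.0f;

    /* prologue: preload two steps (LOAD8x16_2 in the assembly) */
    float a0 = a[0], b0 = b[0];
    float a1 = a[1], b1 = b[1];

    for (int i = 2; i < k; i += 2) {      /* (k - 2) elements remain   */
        acc += a0 * b0;                   /* use data loaded earlier...*/
        a0 = a[i];     b0 = b[i];         /* ...while fetching ahead   */
        acc += a1 * b1;
        a1 = a[i + 1]; b1 = b[i + 1];
    }

    /* epilogue: drain the pipeline (END8x16_2 / KERNEL8x16_E2) */
    acc += a0 * b0;
    acc += a1 * b1;
    return acc;
}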


@@ -38,13 +38,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
* Macros for N=8 and M=16
**********************************************************************************************/
.macro LOAD8x16_1
LOAD8x16 1
.endm
.macro LOAD8x16_0
LOAD8x16 0
.endm
.macro KERNEL8x16_L1_L4 Index,IsLast
KERNEL8x16_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0
@@ -61,10 +55,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNEL8x16_I1_L4_3 OffsetA,OffsetB, Index,IsLast
KERNEL8x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1
.endm
.macro KERNEL8x16_I1_L2_3 OffsetA,OffsetB, Index,IsLast
KERNEL8x16_L1_L2_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1
.endm
.macro KERNEL8x16_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast
KERNEL8x16_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0
.endm
@@ -108,61 +99,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xxlxor vs63, vs63, vs63
.endm
.macro LOAD8x16 Zero
.macro LOAD8x16 OffsetA,OffsetB
lxv vs24, 0(BO)
lxv vs28, 16(BO)
lxv vs24, (\OffsetB+0)(BO)
lxv vs28, (\OffsetB+16)(BO)
xxperm vs26, vs24, permute_mask
xxperm vs30, vs28, permute_mask
lxv vs0, 0(AO)
lxv vs1, 16(AO)
lxv vs0, (\OffsetA+0)(AO)
lxv vs1, (\OffsetA+16)(AO)
xxpermdi vs25, vs24, vs24,2
xxpermdi vs29, vs28, vs28,2
lxv vs2, 32(AO)
lxv vs3, 48(AO)
lxv vs2, (\OffsetA+32)(AO)
lxv vs3, (\OffsetA+48)(AO)
xxpermdi vs27, vs26, vs26,2
xxpermdi vs31, vs30, vs30,2
.if \Zero==1
xxlxor vs32, vs32, vs32
xxlxor vs33, vs33, vs33
xxlxor vs34, vs34, vs34
xxlxor vs35, vs35, vs35
xxlxor vs36, vs36, vs36
xxlxor vs37, vs37, vs37
xxlxor vs38, vs38, vs38
xxlxor vs39, vs39, vs39
xxlxor vs40, vs40, vs40
xxlxor vs41, vs41, vs41
xxlxor vs42, vs42, vs42
xxlxor vs43, vs43, vs43
xxlxor vs44, vs44, vs44
xxlxor vs45, vs45, vs45
xxlxor vs46, vs46, vs46
xxlxor vs47, vs47, vs47
xxlxor vs48, vs48, vs48
xxlxor vs49, vs49, vs49
xxlxor vs50, vs50, vs50
xxlxor vs51, vs51, vs51
xxlxor vs52, vs52, vs52
xxlxor vs53, vs53, vs53
xxlxor vs54, vs54, vs54
xxlxor vs55, vs55, vs55
xxlxor vs56, vs56, vs56
xxlxor vs57, vs57, vs57
xxlxor vs58, vs58, vs58
xxlxor vs59, vs59, vs59
xxlxor vs60, vs60, vs60
xxlxor vs61, vs61, vs61
xxlxor vs62, vs62, vs62
xxlxor vs63, vs63, vs63
.endif
.endm
.macro END8x16_NORMAL
END8x16 0, AO, BO, 64,32
.endm
.macro END8x16_WITHOUT_ADD
END8x16 0, AO,BO,0,0
.endm
.macro END8x16 First, AREG, BREG, OffsetA, OffsetB
.if \OffsetB != 0
@@ -258,145 +219,202 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNEL8x16_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
KERNEL8x16_L1_L2_I \AREG,\BREG, \OffsetA,\OffsetB, (\Index*2),0 ,0
KERNEL8x16_L1_L2_I \AREG,\BREG,\OffsetA,\OffsetB, (\Index*2+1),\IsLast ,\Complete
KERNEL8x16_2 \AREG,\BREG, \OffsetA,\OffsetB, (\Index*2),0 ,0
KERNEL8x16_2 \AREG,\BREG,\OffsetA,\OffsetB, (\Index*2+1),\IsLast ,\Complete
.endm
.macro KERNEL8x16 First
LOAD8x16 0
LOAD8x16 0,0
END8x16 \First, AO, BO, 64,32
.endm
.macro KERNEL8x16_L1_L2_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
lxv vs8, DISP16(\Index,\OffsetB)(\BREG)
lxv vs12, DISP16(\Index,16+\OffsetB)(\BREG)
.macro LOAD8x16_2
LOAD8x16_2O AO,BO, 0,0
.endm
.macro LOAD8x16_2O AREG,BREG, OffsetA,OffsetB
lxv vs8, (\OffsetB)(\BREG)
lxv vs12, (16+\OffsetB)(\BREG)
lxv vs24, (32+\OffsetB)(\BREG)
lxv vs28, (32+16+\OffsetB)(\BREG)
lxv vs4, (0+\OffsetA)(\AREG)
lxv vs5, (16+\OffsetA)(\AREG)
xxperm vs10, vs8, permute_mask
xxperm vs14, vs12, permute_mask
lxv vs6, (32+\OffsetA)(\AREG)
lxv vs7, (48+\OffsetA)(\AREG)
xxpermdi vs9, vs8, vs8,2
xxpermdi vs13, vs12, vs12,2
lxv vs0, (64+\OffsetA)(\AREG)
lxv vs1, (64+16+\OffsetA)(\AREG)
xxpermdi vs11, vs10, vs10,2
xxpermdi vs15, vs14, vs14,2
lxv vs2, (64+32+\OffsetA)(\AREG)
lxv vs3, (64+48+\OffsetA)(\AREG)
xxperm vs26, vs24, permute_mask
xxperm vs30, vs28, permute_mask
xxpermdi vs25, vs24, vs24,2
xxpermdi vs29, vs28, vs28,2
xxpermdi vs27, vs26, vs26,2
xxpermdi vs31, vs30, vs30,2
.endm
.macro END8x16_2
/*for load2 offset will be 128 and 64*/
KERNEL8x16_2 AO,BO, 128,64,0 ,1,1
.endm
.macro KERNEL8x16_E2 OffsetA,OffsetB, Index,IsLast
KERNEL8x16_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1
.endm
.macro KERNEL8x16_L2 OffsetA,OffsetB, Index,IsLast
KERNEL8x16_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0
.endm
.macro KERNEL8x16_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
xvmaddasp vs32, vs4,vs8
xvmaddasp vs33, vs5,vs8
xvmaddasp vs48, vs4,vs12
xvmaddasp vs49, vs5,vs12
xvmaddasp vs40, vs4,vs10
xvmaddasp vs41, vs5,vs10
xvmaddasp vs56, vs4,vs14
xvmaddasp vs57, vs5,vs14
xvmaddasp vs36, vs4,vs9
xvmaddasp vs37, vs5,vs9
xvmaddasp vs52, vs4,vs13
xvmaddasp vs53, vs5,vs13
xvmaddasp vs44, vs4,vs11
xvmaddasp vs45, vs5,vs11
xvmaddasp vs60, vs4,vs15
xvmaddasp vs61, vs5,vs15
.if \Complete==0
lxv vs4, DISP32(\Index,0+\OffsetA)(\AREG)
lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG)
.endif
xvmaddasp vs34, vs6,vs8
xvmaddasp vs35, vs7,vs8
xvmaddasp vs50, vs6,vs12
xvmaddasp vs51, vs7,vs12
.if \Complete==0
lxv vs8, DISP16(\Index,\OffsetB)(\BREG)
lxv vs12, DISP16(\Index,16+\OffsetB)(\BREG)
.endif
xvmaddasp vs42, vs6,vs10
xvmaddasp vs43, vs7,vs10
xvmaddasp vs58, vs6,vs14
xvmaddasp vs59, vs7,vs14
.if \Complete==0
xxperm vs10, vs8, permute_mask
xxperm vs14, vs12, permute_mask
.endif
xvmaddasp vs38, vs6,vs9
xvmaddasp vs39, vs7,vs9
xvmaddasp vs54, vs6,vs13
xvmaddasp vs55, vs7,vs13
.if \Complete==0
xxpermdi vs9, vs8, vs8,2
xxpermdi vs13, vs12, vs12,2
.endif
xvmaddasp vs46, vs6,vs11
xvmaddasp vs47, vs7,vs11
xvmaddasp vs62, vs6,vs15
xvmaddasp vs63, vs7,vs15
.if \Complete==0
xxpermdi vs11, vs10, vs10,2
xxpermdi vs15, vs14, vs14,2
.endif
.if \Complete==0
lxv vs6, DISP32(\Index,32+\OffsetA)(\AREG)
lxv vs7, DISP32(\Index,48+\OffsetA)(\AREG)
.endif
xvmaddasp vs32, vs0,vs24
xvmaddasp vs36, vs0,vs25
lxv vs4, DISP32(\Index,0+\OffsetA)(\AREG)
lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG)
xxperm vs10, vs8, permute_mask
xxperm vs14, vs12, permute_mask
xvmaddasp vs40, vs0,vs26
xvmaddasp vs44, vs0,vs27
lxv vs6, DISP32(\Index,32+\OffsetA)(\AREG)
lxv vs7, DISP32(\Index,48+\OffsetA)(\AREG)
xvmaddasp vs33, vs1,vs24
xvmaddasp vs48, vs0,vs28
xvmaddasp vs52, vs0,vs29
xxpermdi vs9, vs8, vs8,2
xxpermdi vs13, vs12, vs12,2
xvmaddasp vs49, vs1,vs28
xvmaddasp vs40, vs0,vs26
xvmaddasp vs41, vs1,vs26
xvmaddasp vs56, vs0,vs30
xvmaddasp vs57, vs1,vs30
xvmaddasp vs36, vs0,vs25
xvmaddasp vs37, vs1,vs25
xvmaddasp vs52, vs0,vs29
xvmaddasp vs53, vs1,vs29
xvmaddasp vs44, vs0,vs27
xvmaddasp vs45, vs1,vs27
xvmaddasp vs60, vs0,vs31
xxpermdi vs11, vs10, vs10,2
xxpermdi vs15, vs14, vs14,2
xvmaddasp vs33, vs1,vs24
xvmaddasp vs37, vs1,vs25
xvmaddasp vs41, vs1,vs26
xvmaddasp vs45, vs1,vs27
xvmaddasp vs49, vs1,vs28
xvmaddasp vs53, vs1,vs29
xvmaddasp vs57, vs1,vs30
xvmaddasp vs61, vs1,vs31
xvmaddasp vs61, vs1,vs31
.if \Complete==0
lxv vs0, DISP32(\Index,64+\OffsetA)(\AREG)
lxv vs1, DISP32(\Index,64+16+\OffsetA)(\AREG)
lxv vs0, DISP32(\Index,64+\OffsetA)(\AREG)
lxv vs1, DISP32(\Index,64+16+\OffsetA)(\AREG)
.endif
xvmaddasp vs34, vs2,vs24
xvmaddasp vs38, vs2,vs25
xvmaddasp vs42, vs2,vs26
xvmaddasp vs46, vs2,vs27
xvmaddasp vs50, vs2,vs28
xvmaddasp vs54, vs2,vs29
xvmaddasp vs58, vs2,vs30
xvmaddasp vs62, vs2,vs31
xvmaddasp vs35, vs3,vs24
xvmaddasp vs39, vs3,vs25
xvmaddasp vs43, vs3,vs26
xvmaddasp vs47, vs3,vs27
xvmaddasp vs51, vs3,vs28
xvmaddasp vs55, vs3,vs29
xvmaddasp vs59, vs3,vs30
xvmaddasp vs63, vs3,vs31
.if \Complete==0
lxv vs2, DISP32(\Index,64+32+\OffsetA)(\AREG)
lxv vs3, DISP32(\Index,64+48+\OffsetA)(\AREG)
.endif
xvmaddasp vs32, vs4,vs8
xvmaddasp vs36, vs4,vs9
xvmaddasp vs34, vs2,vs24
xvmaddasp vs35, vs3,vs24
xvmaddasp vs50, vs2,vs28
xvmaddasp vs51, vs3,vs28
.if \Complete==0
lxv vs24, DISP16(\Index,32+\OffsetB)(\BREG)
lxv vs28, DISP16(\Index,32+16+\OffsetB)(\BREG)
lxv vs24, DISP16(\Index,32+\OffsetB)(\BREG)
lxv vs28, DISP16(\Index,32+16+\OffsetB)(\BREG)
.endif
xvmaddasp vs42, vs2,vs26
xvmaddasp vs43, vs3,vs26
xvmaddasp vs58, vs2,vs30
xvmaddasp vs59, vs3,vs30
.if \Complete==0
xxperm vs26, vs24, permute_mask
xxperm vs30, vs28, permute_mask
.endif
xvmaddasp vs38, vs2,vs25
xvmaddasp vs39, vs3,vs25
xvmaddasp vs54, vs2,vs29
xvmaddasp vs55, vs3,vs29
.if \Complete==0
xxpermdi vs25, vs24, vs24,2
xxpermdi vs29, vs28, vs28,2
.endif
xvmaddasp vs46, vs2,vs27
xvmaddasp vs47, vs3,vs27
xvmaddasp vs62, vs2,vs31
xvmaddasp vs63, vs3,vs31
.if \Complete==0
xxpermdi vs27, vs26, vs26,2
xxpermdi vs31, vs30, vs30,2
.endif
.if \Complete==0
lxv vs2, DISP32(\Index,64+32+\OffsetA)(\AREG)
lxv vs3, DISP32(\Index,64+48+\OffsetA)(\AREG)
.endif
.if \IsLast==1
.if \Complete==1
addi \AREG, \AREG, DISP32(\Index,64+\OffsetA)
addi \BREG, \BREG, DISP16(\Index,32+\OffsetB)
addi \BREG, \BREG, DISP16(\Index,\OffsetB)
addi \AREG, \AREG, DISP32(\Index,\OffsetA)
.else
addi \AREG, \AREG, DISP32(\Index,128)
addi \BREG, \BREG, DISP16(\Index,64)
addi \AREG, \AREG, DISP32(\Index,128)
.endif
.endif
xvmaddasp vs40, vs4,vs10
xvmaddasp vs44, vs4,vs11
.if \Complete==0
xxperm vs26, vs24, permute_mask
xxperm vs30, vs28, permute_mask
.endif
xvmaddasp vs48, vs4,vs12
xvmaddasp vs52, vs4,vs13
.if \Complete==0
xxpermdi vs25, vs24, vs24,2
xxpermdi vs29, vs28, vs28,2
.endif
xvmaddasp vs56, vs4,vs14
xvmaddasp vs60, vs4,vs15
.if \Complete==0
xxpermdi vs27, vs26, vs26,2
xxpermdi vs31, vs30, vs30,2
.endif
xvmaddasp vs33, vs5,vs8
xvmaddasp vs37, vs5,vs9
xvmaddasp vs41, vs5,vs10
xvmaddasp vs45, vs5,vs11
xvmaddasp vs49, vs5,vs12
xvmaddasp vs53, vs5,vs13
xvmaddasp vs57, vs5,vs14
xvmaddasp vs61, vs5,vs15
xvmaddasp vs34, vs6,vs8
xvmaddasp vs38, vs6,vs9
xvmaddasp vs42, vs6,vs10
xvmaddasp vs46, vs6,vs11
xvmaddasp vs50, vs6,vs12
xvmaddasp vs54, vs6,vs13
xvmaddasp vs58, vs6,vs14
xvmaddasp vs62, vs6,vs15
xvmaddasp vs35, vs7,vs8
xvmaddasp vs39, vs7,vs9
xvmaddasp vs43, vs7,vs10
xvmaddasp vs47, vs7,vs11
xvmaddasp vs51, vs7,vs12
xvmaddasp vs55, vs7,vs13
xvmaddasp vs59, vs7,vs14
xvmaddasp vs63, vs7,vs15
.endm
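The rewritten KERNEL8x16_2 macro above keeps all 32 accumulators vs32-vs63 live (32 vector registers of 4 floats, i.e. the full 16x8 output tile) and interleaves the xvmaddasp updates with the loads and permutes for the next step. Each xvmaddasp is a 4-wide slice of the 16x8 rank-1 update sketched below in scalar C; this is illustrative only, and the real macro also handles the packed-panel offsets and B permutes.

/* one K-step of the 8x16 (M=16, N=8) microkernel, scalar model */
static void sgemm_16x8_step(float c[16][8], const float a[16],
                            const float b[8])
{
    for (int i = 0; i < 16; i++)          /* rows of the packed A panel */
        for (int j = 0; j < 8; j++)       /* columns of the B panel     */
            c[i][j] += a[i] * b[j];       /* the xvmaddasp FMA          */
}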


@@ -1353,7 +1353,7 @@ ZGEMM_L1:
ZGEMM_L1_BEGIN:
/*----------------------------------------*/
mr CO, C
slwi T1, LDC , 1
add T2,C,LDC
mr AO, A
add C, C, T1


@@ -2250,12 +2250,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define SGEMM_DEFAULT_P 832
#define DGEMM_DEFAULT_P 128
#define CGEMM_DEFAULT_P 640
#define CGEMM_DEFAULT_P 512
#define ZGEMM_DEFAULT_P 256
#define SGEMM_DEFAULT_Q 1025
#define SGEMM_DEFAULT_Q 1026
#define DGEMM_DEFAULT_Q 384
#define CGEMM_DEFAULT_Q 640
#define CGEMM_DEFAULT_Q 1026
#define ZGEMM_DEFAULT_Q 1026
#define SYMV_P 8
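The param.h hunk lowers CGEMM_DEFAULT_P from 640 to 512 and raises CGEMM_DEFAULT_Q from 640 to 1026 (and SGEMM_DEFAULT_Q from 1025 to 1026) to match the new kernels. In the OpenBLAS level-3 driver these values control cache blocking: roughly, P is the M-direction block and Q the K-direction block packed per sweep. A plain-C sketch of that blocking, illustrative only; the real driver also packs panels, blocks N, and dispatches to the assembly microkernel.

enum { GEMM_P = 512, GEMM_Q = 1026 };    /* the new cgemm defaults */

/* C += A * B with P x Q blocking; column-major operands */
static void gemm_blocked(int m, int n, int k,
                         const float *A, int lda,
                         const float *B, int ldb,
                         float *C, int ldc)
{
    for (int ks = 0; ks < k; ks += GEMM_Q) {
        int kb = (k - ks < GEMM_Q) ? k - ks : GEMM_Q;
        for (int ms = 0; ms < m; ms += GEMM_P) {
            int mb = (m - ms < GEMM_P) ? m - ms : GEMM_P;
            /* this mb x kb block of A stays hot in cache while the
             * inner loops (the microkernel in the real code) sweep
             * the whole N range                                     */
            for (int j = 0; j < n; j++)
                for (int p = 0; p < kb; p++)
                    for (int i = 0; i < mb; i++)
                        C[(ms + i) + j * ldc] +=
                            A[(ms + i) + (ks + p) * lda] *
                            B[(ks + p) + j * ldb];
        }
    }
}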