Merge pull request #2172 from quickwritereader/develop
power9 cgemm/ctrmm. new sgemm 8x16
This commit is contained in:
commit
6b6c9b1441
|
@ -5,7 +5,7 @@
|
||||||
|
|
||||||
STRMMKERNEL = sgemm_kernel_power9.S
|
STRMMKERNEL = sgemm_kernel_power9.S
|
||||||
DTRMMKERNEL = dgemm_kernel_power9.S
|
DTRMMKERNEL = dgemm_kernel_power9.S
|
||||||
CTRMMKERNEL = ctrmm_kernel_8x4_power8.S
|
CTRMMKERNEL = cgemm_kernel_power9.S
|
||||||
ZTRMMKERNEL = zgemm_kernel_power9.S
|
ZTRMMKERNEL = zgemm_kernel_power9.S
|
||||||
|
|
||||||
SGEMMKERNEL = sgemm_kernel_power9.S
|
SGEMMKERNEL = sgemm_kernel_power9.S
|
||||||
|
@ -28,9 +28,9 @@ DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
|
||||||
CGEMMKERNEL = cgemm_kernel_8x4_power8.S
|
CGEMMKERNEL = cgemm_kernel_power9.S
|
||||||
CGEMMINCOPY = ../generic/zgemm_ncopy_8.c
|
CGEMMINCOPY = ../generic/zgemm_ncopy_8.c
|
||||||
CGEMMITCOPY = cgemm_tcopy_8_power8.S
|
CGEMMITCOPY = ../generic/zgemm_tcopy_8.c
|
||||||
CGEMMONCOPY = ../generic/zgemm_ncopy_4.c
|
CGEMMONCOPY = ../generic/zgemm_ncopy_4.c
|
||||||
CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c
|
CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c
|
||||||
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
|
|
@ -0,0 +1,293 @@
|
||||||
|
/***************************************************************************
|
||||||
|
Copyright (c) 2013-2019, The OpenBLAS Project
|
||||||
|
All rights reserved.
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
1. Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in
|
||||||
|
the documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
3. Neither the name of the OpenBLAS project nor the names of
|
||||||
|
its contributors may be used to endorse or promote products
|
||||||
|
derived from this software without specific prior written permission.
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||||
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
*****************************************************************************/
|
||||||
|
|
||||||
|
/**************************************************************************************
|
||||||
|
* Abdelrauf(quickwritereader@gmail.com)
|
||||||
|
* BLASTEST : OK
|
||||||
|
* CTEST : OK
|
||||||
|
* TEST : OK
|
||||||
|
* LAPACK-TEST : OK
|
||||||
|
**************************************************************************************/
|
||||||
|
#define ASSEMBLER
|
||||||
|
#include "common.h"
|
||||||
|
#include "def_vsx.h"
|
||||||
|
|
||||||
|
|
||||||
|
#define LOAD ld
|
||||||
|
#define STACKSIZE (512 )
|
||||||
|
#define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */
|
||||||
|
#define M r3
|
||||||
|
#define N r4
|
||||||
|
#define K r5
|
||||||
|
|
||||||
|
|
||||||
|
#define A r8
|
||||||
|
#define B r9
|
||||||
|
#define C r10
|
||||||
|
#define LDC r6
|
||||||
|
#define OFFSET r7
|
||||||
|
|
||||||
|
|
||||||
|
#define alpha_r vs19
|
||||||
|
#define alpha_i vs20
|
||||||
|
#define save_permute_1 vs21
|
||||||
|
#define permute_mask vs22
|
||||||
|
#define o0 0
|
||||||
|
|
||||||
|
|
||||||
|
#define T1 r11
|
||||||
|
#define T2 r12
|
||||||
|
#define T3 r14
|
||||||
|
#define T4 r15
|
||||||
|
#define T5 r16
|
||||||
|
#define T6 r17
|
||||||
|
#define L r18
|
||||||
|
#define T7 r19
|
||||||
|
#define T8 r20
|
||||||
|
#define TEMP_REG r21
|
||||||
|
#define I r22
|
||||||
|
#define J r23
|
||||||
|
#define AO r24
|
||||||
|
#define BO r25
|
||||||
|
#define CO r26
|
||||||
|
#define T9 r27
|
||||||
|
#define T10 r28
|
||||||
|
#define PRE r29
|
||||||
|
|
||||||
|
#define T12 r30
|
||||||
|
#define T13 r31
|
||||||
|
|
||||||
|
#include "cgemm_macros_power9.S"
|
||||||
|
|
||||||
|
/*
 * 64-bit halves of two 16-byte permute control vectors. The kernel entry
 * code assembles perm_const2/perm_const1 into permute_mask and
 * save_permute_12/save_permute_11 into save_permute_1 (via mtvsrdd).
 *
 * perm_const1 / perm_const2: byte selectors 04..07,00..03 and 0c..0f,08..0b,
 *   i.e. within each 8-byte half the two 4-byte words are exchanged.
 * save_permute_11 / save_permute_12: byte selectors 04..07,14..17 and
 *   0c..0f,1c..1f; the 0x1x indices presumably pick bytes from the second
 *   permute source -- TODO confirm against the SAVE macros in
 *   cgemm_macros_power9.S.
 */
.equ perm_const1, 0x0405060700010203
|
||||||
|
.equ perm_const2, 0x0c0d0e0f08090a0b
|
||||||
|
.equ save_permute_12, 0x0c0d0e0f1c1d1e1f
|
||||||
|
.equ save_permute_11, 0x0405060714151617
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
#ifndef NEEDPARAM
|
||||||
|
|
||||||
|
/*
 * CGEMM / CTRMM kernel entry point (POWER9, VSX).
 *
 * Register roles come from the #defines at the top of this file:
 *   In:   M = r3, N = r4, K = r5, A = r8, B = r9, C = r10
 *         alpha_r in vs1 (f1), alpha_i in vs2 (f2), passed as doubles
 *         LDC    = first stack argument  (FRAMESLOT(0)), loaded into r6
 *         OFFSET = second stack argument (FRAMESLOT(1)), loaded into r7,
 *                  TRMMKERNEL builds only
 *
 * Frame layout (STACKSIZE bytes below the incoming SP):
 *     0 .. 143   f14-f31
 *   144 .. 287   r31-r14
 *   288 .. 479   vs52-vs63
 *   LR is stored at FLINK_SAVE = STACKSIZE+16, i.e. 16 bytes above the
 *   incoming SP.
 *
 * The computation itself lives in cgemm_logic_power9.S, which is included
 * between the set-up code and the .L999 exit label.
 */
	PROLOGUE
	PROFCODE

	addi	SP, SP, -STACKSIZE
	mflr	r0

	/* save FP registers f14-f31 */
	stfd	f14,    0(SP)
	stfd	f15,    8(SP)
	stfd	f16,   16(SP)
	stfd	f17,   24(SP)

	stfd	f18,   32(SP)
	stfd	f19,   40(SP)
	stfd	f20,   48(SP)
	stfd	f21,   56(SP)

	stfd	f22,   64(SP)
	stfd	f23,   72(SP)
	stfd	f24,   80(SP)
	stfd	f25,   88(SP)

	stfd	f26,   96(SP)
	stfd	f27,  104(SP)
	stfd	f28,  112(SP)
	stfd	f29,  120(SP)

	stfd	f30,  128(SP)
	stfd	f31,  136(SP)

	/* save general-purpose registers r14-r31 */
	std	r31,  144(SP)
	std	r30,  152(SP)
	std	r29,  160(SP)
	std	r28,  168(SP)
	std	r27,  176(SP)
	std	r26,  184(SP)
	std	r25,  192(SP)
	std	r24,  200(SP)
	std	r23,  208(SP)
	std	r22,  216(SP)
	std	r21,  224(SP)
	std	r20,  232(SP)
	std	r19,  240(SP)
	std	r18,  248(SP)
	std	r17,  256(SP)
	std	r16,  264(SP)
	std	r15,  272(SP)
	std	r14,  280(SP)

	/* save vector registers vs52-vs63 */
	stxv	vs52, 288(SP)
	stxv	vs53, 304(SP)
	stxv	vs54, 320(SP)
	stxv	vs55, 336(SP)
	stxv	vs56, 352(SP)
	stxv	vs57, 368(SP)
	stxv	vs58, 384(SP)
	stxv	vs59, 400(SP)
	stxv	vs60, 416(SP)
	stxv	vs61, 432(SP)
	stxv	vs62, 448(SP)
	stxv	vs63, 464(SP)
	std	r0,   FLINK_SAVE(SP)

	/* stack-passed arguments live above our own STACKSIZE-byte frame */
	ld	LDC, FRAMESLOT(0) + STACKSIZE(SP)

#ifdef TRMMKERNEL
	ld	OFFSET, FRAMESLOT(1) + STACKSIZE(SP)
#endif
	/*
	 * Scale LDC by 1 << ZBASE_SHIFT (ZBASE_SHIFT comes from common.h;
	 * presumably log2 of the element size in bytes).
	 * LDC was loaded as a 64-bit value, so use the 64-bit shift: slwi is
	 * a 32-bit rotate-and-mask that clears the upper word and would
	 * truncate the stride once LDC << ZBASE_SHIFT reaches 2^32.
	 */
	sldi	LDC, LDC, ZBASE_SHIFT

	/*
	 * alpha_r arrives in vs1 (f1) and alpha_i in vs2 (f2) as doubles:
	 * convert each to single precision and splat word 0 across the
	 * whole vector.
	 */
	xscvdpspn	alpha_r, vs1
	xscvdpspn	alpha_i, vs2
	xxspltw		alpha_r, alpha_r, 0
	xxspltw		alpha_i, alpha_i, 0

	/*
	 * Load the reverse permute mask (for big endian, per the original
	 * note):
	 *   uint128 = 0x0c0d0e0f08090a0b0405060700010203
	 * together with the two halves of save_permute_1.
	 * Each 64-bit constant is built 16 bits at a time:
	 *   lis @highest ; ori @higher ; shift left 32 ; oris @h ; ori @l
	 */
	lis	T2, perm_const2@highest
	lis	T1, perm_const1@highest
	lis	T3, save_permute_12@highest
	lis	T4, save_permute_11@highest

	ori	T2, T2, perm_const2@higher
	ori	T1, T1, perm_const1@higher
	ori	T3, T3, save_permute_12@higher
	ori	T4, T4, save_permute_11@higher

	/* move the upper 32 bits into place, clearing the low word */
	rldicr	T2, T2, 32, 31
	rldicr	T1, T1, 32, 31
	rldicr	T3, T3, 32, 31
	rldicr	T4, T4, 32, 31

	oris	T2, T2, perm_const2@h
	oris	T1, T1, perm_const1@h
	oris	T3, T3, save_permute_12@h
	oris	T4, T4, save_permute_11@h

	ori	T2, T2, perm_const2@l
	ori	T1, T1, perm_const1@l
	ori	T3, T3, save_permute_12@l
	ori	T4, T4, save_permute_11@l

	/* LR is already saved, so r0 is free: leave it holding zero */
	li	r0, 0
	/* PRE = 512; presumably a prefetch distance used by the included
	   logic/macros -- TODO confirm */
	li	PRE, 512

#if defined(CC) || defined(CR) || defined(RC) || defined(RR)
	/* negate alpha for these variants: the result is formed by
	   addition and needs the sign flipped, i.e. -1*(a+b) */
	xvnegsp	alpha_r, alpha_r
	xvnegsp	alpha_i, alpha_i
#endif

	/* doubleword 0 <- first GPR operand, doubleword 1 <- second */
	mtvsrdd	permute_mask, T2, T1
	mtvsrdd	save_permute_1, T3, T4

	/* the mask is a reverse permute, so swap its two doublewords to
	   turn it into the inner permute */
	xxpermdi	permute_mask, permute_mask, permute_mask, 2

#include "cgemm_logic_power9.S"

/* Common exit: restore everything saved above and return. */
.L999:
	lfd	f14,    0(SP)
	lfd	f15,    8(SP)
	lfd	f16,   16(SP)
	lfd	f17,   24(SP)

	lfd	f18,   32(SP)
	lfd	f19,   40(SP)
	lfd	f20,   48(SP)
	lfd	f21,   56(SP)

	lfd	f22,   64(SP)
	lfd	f23,   72(SP)
	lfd	f24,   80(SP)
	lfd	f25,   88(SP)

	lfd	f26,   96(SP)
	lfd	f27,  104(SP)
	lfd	f28,  112(SP)
	lfd	f29,  120(SP)

	lfd	f30,  128(SP)
	lfd	f31,  136(SP)

	ld	r31,  144(SP)
	ld	r30,  152(SP)
	ld	r29,  160(SP)
	ld	r28,  168(SP)
	ld	r27,  176(SP)
	ld	r26,  184(SP)
	ld	r25,  192(SP)
	ld	r24,  200(SP)
	ld	r23,  208(SP)
	ld	r22,  216(SP)
	ld	r21,  224(SP)
	ld	r20,  232(SP)
	ld	r19,  240(SP)
	ld	r18,  248(SP)
	ld	r17,  256(SP)
	ld	r16,  264(SP)
	ld	r15,  272(SP)
	ld	r14,  280(SP)

	ld	r0,   FLINK_SAVE(SP)

	lxv	vs52, 288(SP)
	lxv	vs53, 304(SP)
	lxv	vs54, 320(SP)
	lxv	vs55, 336(SP)
	lxv	vs56, 352(SP)
	lxv	vs57, 368(SP)
	lxv	vs58, 384(SP)
	lxv	vs59, 400(SP)
	mtlr	r0			/* return address back into LR */
	lxv	vs60, 416(SP)
	lxv	vs61, 432(SP)
	lxv	vs62, 448(SP)
	lxv	vs63, 464(SP)

	addi	SP, SP, STACKSIZE
	blr

	EPILOGUE
|
||||||
|
#endif
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -3,89 +3,89 @@ b L8
|
||||||
|
|
||||||
MY_ALIGN
|
MY_ALIGN
|
||||||
LSGEMM_L8x16_LMAIN_SUB:
|
LSGEMM_L8x16_LMAIN_SUB:
|
||||||
LOAD8x16_0
|
LOAD8x16_2
|
||||||
mtctr L
|
|
||||||
MY_ALIGN
|
MY_ALIGN
|
||||||
|
|
||||||
LSGEMM_L8x16_LOOP:
|
LSGEMM_L8x16_LOOP:
|
||||||
|
KERNEL8x16_L2 128,64,0,0
|
||||||
KERNEL8x16_I1_L4_2 64,32, 0,0
|
LSGEMM_L8x16_K128:
|
||||||
KERNEL8x16_I1_L4_2 64,32, 1,0
|
KERNEL8x16_L2 128,64,1,0
|
||||||
KERNEL8x16_I1_L4_2 64,32, 2,0
|
KERNEL8x16_I1_L4_2 128,64, 1,0
|
||||||
KERNEL8x16_I1_L4_2 64,32, 3,0
|
KERNEL8x16_I1_L4_2 128,64, 2,0
|
||||||
KERNEL8x16_I1_L4_2 64,32, 4,0
|
KERNEL8x16_I1_L4_2 128,64, 3,0
|
||||||
KERNEL8x16_I1_L4_2 64,32, 5,0
|
KERNEL8x16_I1_L4_2 128,64, 4,0
|
||||||
KERNEL8x16_I1_L4_2 64,32, 6,0
|
KERNEL8x16_I1_L4_2 128,64, 5,0
|
||||||
KERNEL8x16_I1_L4_2 64,32, 7,0
|
KERNEL8x16_I1_L4_2 128,64, 6,0
|
||||||
KERNEL8x16_I1_L4_2 64,32, 8,0
|
KERNEL8x16_I1_L4_2 128,64, 7,0
|
||||||
KERNEL8x16_I1_L4_2 64,32, 9,0
|
KERNEL8x16_I1_L4_2 128,64, 8,0
|
||||||
KERNEL8x16_I1_L4_2 64,32, 10,0
|
KERNEL8x16_I1_L4_2 128,64, 9,0
|
||||||
KERNEL8x16_I1_L4_2 64,32, 11,0
|
KERNEL8x16_I1_L4_2 128,64, 10,0
|
||||||
KERNEL8x16_I1_L4_2 64,32, 12,0
|
KERNEL8x16_I1_L4_2 128,64, 11,0
|
||||||
KERNEL8x16_I1_L4_2 64,32, 13,0
|
KERNEL8x16_I1_L4_2 128,64, 12,0
|
||||||
KERNEL8x16_I1_L4_2 64,32, 14,0
|
KERNEL8x16_I1_L4_2 128,64, 13,0
|
||||||
KERNEL8x16_I1_L4_2 64,32, 15,0
|
KERNEL8x16_I1_L4_2 128,64, 14,0
|
||||||
KERNEL8x16_I1_L4_2 64,32, 16,0
|
KERNEL8x16_I1_L4_2 128,64, 15,0
|
||||||
KERNEL8x16_I1_L4_2 64,32, 17,0
|
KERNEL8x16_I1_L4_2 128,64, 16,0
|
||||||
KERNEL8x16_I1_L4_2 64,32, 18,0
|
KERNEL8x16_I1_L4_2 128,64, 17,0
|
||||||
KERNEL8x16_I1_L4_2 64,32, 19,0
|
KERNEL8x16_I1_L4_2 128,64, 18,0
|
||||||
KERNEL8x16_I1_L4_2 64,32, 20,0
|
KERNEL8x16_I1_L4_2 128,64, 19,0
|
||||||
KERNEL8x16_I1_L4_2 64,32, 21,0
|
KERNEL8x16_I1_L4_2 128,64, 20,0
|
||||||
KERNEL8x16_I1_L4_2 64,32, 22,0
|
KERNEL8x16_I1_L4_2 128,64, 21,0
|
||||||
KERNEL8x16_I1_L4_2 64,32, 23,0
|
KERNEL8x16_I1_L4_2 128,64, 22,0
|
||||||
KERNEL8x16_I1_L4_2 64,32, 24,0
|
KERNEL8x16_I1_L4_2 128,64, 23,0
|
||||||
KERNEL8x16_I1_L4_2 64,32, 25,0
|
KERNEL8x16_I1_L4_2 128,64, 24,0
|
||||||
KERNEL8x16_I1_L4_2 64,32, 26,0
|
KERNEL8x16_I1_L4_2 128,64, 25,0
|
||||||
KERNEL8x16_I1_L4_2 64,32, 27,0
|
KERNEL8x16_I1_L4_2 128,64, 26,0
|
||||||
KERNEL8x16_I1_L4_2 64,32, 28,0
|
KERNEL8x16_I1_L4_2 128,64, 27,0
|
||||||
KERNEL8x16_I1_L4_2 64,32, 29,0
|
KERNEL8x16_I1_L4_2 128,64, 28,0
|
||||||
KERNEL8x16_I1_L4_2 64,32, 30,0
|
KERNEL8x16_I1_L4_2 128,64, 29,0
|
||||||
KERNEL8x16_I1_L4_2 64,32, 31,1
|
KERNEL8x16_I1_L4_2 128,64, 30,0
|
||||||
|
KERNEL8x16_I1_L4_2 128,64, 31,1
|
||||||
bdnz LSGEMM_L8x16_LOOP
|
bdnz LSGEMM_L8x16_LOOP
|
||||||
|
|
||||||
MY_ALIGN
|
MY_ALIGN
|
||||||
LSGEMM_L8x16_LOOP_END:
|
LSGEMM_L8x16_LOOP_END:
|
||||||
END8x16 0, AO, BO, 64, 32
|
END8x16_2
|
||||||
blr
|
blr
|
||||||
|
|
||||||
MY_ALIGN
|
MY_ALIGN
|
||||||
LSGEMM_L8x16_L64_SUB:
|
LSGEMM_L8x16_L64_SUB:
|
||||||
LOAD8x16_0
|
LOAD8x16_2
|
||||||
KERNEL8x16_I1_L4_2 64,32, 0,0
|
KERNEL8x16_I1_L4_2 128,64, 0,0
|
||||||
KERNEL8x16_I1_L4_2 64,32, 1,0
|
KERNEL8x16_I1_L4_2 128,64, 1,0
|
||||||
KERNEL8x16_I1_L4_2 64,32, 2,0
|
KERNEL8x16_I1_L4_2 128,64, 2,0
|
||||||
KERNEL8x16_I1_L4_2 64,32, 3,0
|
KERNEL8x16_I1_L4_2 128,64,3,0
|
||||||
KERNEL8x16_I1_L4_2 64,32, 4,0
|
KERNEL8x16_I1_L4_2 128,64,4,0
|
||||||
KERNEL8x16_I1_L4_2 64,32, 5,0
|
KERNEL8x16_I1_L4_2 128,64,5,0
|
||||||
KERNEL8x16_I1_L4_2 64,32, 6,0
|
KERNEL8x16_I1_L4_2 128,64,6,0
|
||||||
KERNEL8x16_I1_L4_2 64,32, 7,0
|
KERNEL8x16_I1_L4_2 128,64,7,0
|
||||||
KERNEL8x16_I1_L4_2 64,32, 8,0
|
KERNEL8x16_I1_L4_2 128,64,8,0
|
||||||
KERNEL8x16_I1_L4_2 64,32, 9,0
|
KERNEL8x16_I1_L4_2 128,64,9,0
|
||||||
KERNEL8x16_I1_L4_2 64,32, 10,0
|
KERNEL8x16_I1_L4_2 128,64,10,0
|
||||||
KERNEL8x16_I1_L4_2 64,32, 11,0
|
KERNEL8x16_I1_L4_2 128,64,11,0
|
||||||
KERNEL8x16_I1_L4_2 64,32, 12,0
|
KERNEL8x16_I1_L4_2 128,64,12,0
|
||||||
KERNEL8x16_I1_L4_2 64,32, 13,0
|
KERNEL8x16_I1_L4_2 128,64,13,0
|
||||||
KERNEL8x16_I1_L4_2 64,32, 14,0
|
KERNEL8x16_I1_L4_2 128,64,14,0
|
||||||
KERNEL8x16_I1_L4_3 64,32, 15,1
|
KERNEL8x16_I1_L4_3 128,64,15,1
|
||||||
blr
|
blr
|
||||||
LSGEMM_L8x16_L32_SUB:
|
LSGEMM_L8x16_L32_SUB:
|
||||||
LOAD8x16_0
|
LOAD8x16_2
|
||||||
KERNEL8x16_I1_L4_2 64,32, 0,0
|
KERNEL8x16_I1_L4_2 128,64,0,0
|
||||||
KERNEL8x16_I1_L4_2 64,32, 1,0
|
KERNEL8x16_I1_L4_2 128,64,1,0
|
||||||
KERNEL8x16_I1_L4_2 64,32, 2,0
|
KERNEL8x16_I1_L4_2 128,64,2,0
|
||||||
KERNEL8x16_I1_L4_2 64,32, 3,0
|
KERNEL8x16_I1_L4_2 128,64,3,0
|
||||||
KERNEL8x16_I1_L4_2 64,32, 4,0
|
KERNEL8x16_I1_L4_2 128,64,4,0
|
||||||
KERNEL8x16_I1_L4_2 64,32, 5,0
|
KERNEL8x16_I1_L4_2 128,64,5,0
|
||||||
KERNEL8x16_I1_L4_2 64,32, 6,0
|
KERNEL8x16_I1_L4_2 128,64,6,0
|
||||||
KERNEL8x16_I1_L4_3 64,32, 7,1
|
KERNEL8x16_I1_L4_3 128,64,7,1
|
||||||
blr
|
blr
|
||||||
|
|
||||||
LSGEMM_L8x16_L16_SUB:
|
LSGEMM_L8x16_L16_SUB:
|
||||||
LOAD8x16_0
|
LOAD8x16_2
|
||||||
KERNEL8x16_I1_L4_2 64,32, 0,0
|
KERNEL8x16_I1_L4_2 128,64,0,0
|
||||||
KERNEL8x16_I1_L4_2 64,32, 1,0
|
KERNEL8x16_I1_L4_2 128,64,1,0
|
||||||
KERNEL8x16_I1_L4_2 64,32, 2,0
|
KERNEL8x16_I1_L4_2 128,64,2,0
|
||||||
KERNEL8x16_I1_L4_3 64,32, 3,1
|
KERNEL8x16_I1_L4_3 128,64,3,1
|
||||||
blr
|
blr
|
||||||
|
|
||||||
L8:
|
L8:
|
||||||
|
@ -127,15 +127,16 @@ LSGEMM_L8x16_BEGIN:
|
||||||
#if defined(TRMMKERNEL)
|
#if defined(TRMMKERNEL)
|
||||||
REFRESH_TEMP_BK T11,K,TEMP_REG,16,8
|
REFRESH_TEMP_BK T11,K,TEMP_REG,16,8
|
||||||
mr T12, T11
|
mr T12, T11
|
||||||
addi T12,T12, -1
|
addi T12,T12, -2
|
||||||
srawi. L, T12, 7 /**(T11-1) % 128x */
|
srawi. L, T12, 7 /**(T11-2) % 128x */
|
||||||
#else
|
#else
|
||||||
mr T12, K
|
mr T12, K
|
||||||
addi T12,T12, -1
|
addi T12,T12, -2
|
||||||
srawi. L, T12, 7 /**(K-1) % 128x */
|
srawi. L, T12, 7 /**(K-2) % 128x */
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
ZERO8x16
|
ZERO8x16
|
||||||
|
mtctr L
|
||||||
ble LSGEMM_L8x16_SUB0
|
ble LSGEMM_L8x16_SUB0
|
||||||
bl LSGEMM_L8x16_LMAIN_SUB
|
bl LSGEMM_L8x16_LMAIN_SUB
|
||||||
andi. L, T12, 127
|
andi. L, T12, 127
|
||||||
|
@ -148,15 +149,33 @@ LSGEMM_L8x16_SUB0:
|
||||||
cmpwi T11,128
|
cmpwi T11,128
|
||||||
#else
|
#else
|
||||||
andi. L, K, 255
|
andi. L, K, 255
|
||||||
|
cmpwi K,129
|
||||||
|
#endif
|
||||||
|
li T10,1
|
||||||
|
bne CMP8x16_128K
|
||||||
|
addi BO,BO,-32
|
||||||
|
addi AO,AO,-64
|
||||||
|
LOAD8x16 64,32
|
||||||
|
END8x16_WITHOUT_ADD
|
||||||
|
LOAD8x16_2O AO,BO, 128, 64
|
||||||
|
mtctr T10
|
||||||
|
bl LSGEMM_L8x16_K128
|
||||||
|
b LSGEMM_L8x16_SAVE
|
||||||
|
CMP8x16_128K:
|
||||||
|
/*----------------------------------------*/
|
||||||
|
#if defined(TRMMKERNEL)
|
||||||
|
cmpwi T11,128
|
||||||
|
#else
|
||||||
cmpwi K,128
|
cmpwi K,128
|
||||||
#endif
|
#endif
|
||||||
|
bne LSGEMM_L8x16_SUB2
|
||||||
bne LSGEMM_L8x16_SUB2
|
MY_ALIGN
|
||||||
MY_ALIGN
|
mtctr T10
|
||||||
LSGEMM_L8x16_SUB2_128:
|
addi BO,BO,-64
|
||||||
bl LSGEMM_L8x16_L64_SUB
|
addi AO,AO,-128
|
||||||
bl LSGEMM_L8x16_L64_SUB
|
LOAD8x16_2O AO,BO, 128,64
|
||||||
b LSGEMM_L8x16_SAVE
|
bl LSGEMM_L8x16_K128
|
||||||
|
b LSGEMM_L8x16_SAVE
|
||||||
MY_ALIGN
|
MY_ALIGN
|
||||||
LSGEMM_L8x16_SUB2:
|
LSGEMM_L8x16_SUB2:
|
||||||
andi. T10,L,64
|
andi. T10,L,64
|
||||||
|
@ -176,21 +195,21 @@ LSGEMM_L8x16_SUB2_16:
|
||||||
LSGEMM_L8x16_SUB2_8:
|
LSGEMM_L8x16_SUB2_8:
|
||||||
andi. T10,L, 8
|
andi. T10,L, 8
|
||||||
ble LSGEMM_L8x16_SUB2_4
|
ble LSGEMM_L8x16_SUB2_4
|
||||||
LOAD8x16_0
|
LOAD8x16_2
|
||||||
KERNEL8x16_I1_L4_2 64,32, 0,0
|
KERNEL8x16_I1_L4_2 128,64, 0,0
|
||||||
KERNEL8x16_I1_L4_3 64,32, 1,1
|
KERNEL8x16_I1_L4_3 128,64, 1,1
|
||||||
MY_ALIGN
|
MY_ALIGN
|
||||||
LSGEMM_L8x16_SUB2_4:
|
LSGEMM_L8x16_SUB2_4:
|
||||||
andi. T10,L, 4
|
andi. T10,L, 4
|
||||||
ble LSGEMM_L8x16_SUB2_2
|
ble LSGEMM_L8x16_SUB2_2
|
||||||
LOAD8x16_0
|
LOAD8x16_2
|
||||||
KERNEL8x16_I1_L4_3 64,32, 0,1
|
KERNEL8x16_I1_L4_3 128,64, 0,1
|
||||||
MY_ALIGN
|
MY_ALIGN
|
||||||
LSGEMM_L8x16_SUB2_2:
|
LSGEMM_L8x16_SUB2_2:
|
||||||
andi. T10,L, 2
|
andi. T10,L, 2
|
||||||
ble LSGEMM_L8x16_SUB2_1
|
ble LSGEMM_L8x16_SUB2_1
|
||||||
LOAD8x16_0
|
LOAD8x16_2
|
||||||
KERNEL8x16_I1_L2_3 64,32, 0,1
|
KERNEL8x16_E2 128,64, 0,1
|
||||||
MY_ALIGN
|
MY_ALIGN
|
||||||
LSGEMM_L8x16_SUB2_1:
|
LSGEMM_L8x16_SUB2_1:
|
||||||
andi. T10,L, 1
|
andi. T10,L, 1
|
||||||
|
|
|
@ -38,13 +38,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
* Macros for N=8 and M=16
|
* Macros for N=8 and M=16
|
||||||
**********************************************************************************************/
|
**********************************************************************************************/
|
||||||
|
|
||||||
.macro LOAD8x16_1
|
|
||||||
LOAD8x16 1
|
|
||||||
.endm
|
|
||||||
|
|
||||||
.macro LOAD8x16_0
|
|
||||||
LOAD8x16 0
|
|
||||||
.endm
|
|
||||||
|
|
||||||
.macro KERNEL8x16_L1_L4 Index,IsLast
|
.macro KERNEL8x16_L1_L4 Index,IsLast
|
||||||
KERNEL8x16_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0
|
KERNEL8x16_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0
|
||||||
|
@ -61,10 +55,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
.macro KERNEL8x16_I1_L4_3 OffsetA,OffsetB, Index,IsLast
|
.macro KERNEL8x16_I1_L4_3 OffsetA,OffsetB, Index,IsLast
|
||||||
KERNEL8x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1
|
KERNEL8x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1
|
||||||
.endm
|
.endm
|
||||||
.macro KERNEL8x16_I1_L2_3 OffsetA,OffsetB, Index,IsLast
|
|
||||||
KERNEL8x16_L1_L2_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1
|
|
||||||
.endm
|
|
||||||
|
|
||||||
.macro KERNEL8x16_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast
|
.macro KERNEL8x16_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast
|
||||||
KERNEL8x16_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0
|
KERNEL8x16_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0
|
||||||
.endm
|
.endm
|
||||||
|
@ -108,61 +99,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
xxlxor vs63, vs63, vs63
|
xxlxor vs63, vs63, vs63
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
.macro LOAD8x16 Zero
|
.macro LOAD8x16 OffsetA,OffsetB
|
||||||
|
|
||||||
lxv vs24, 0(BO)
|
lxv vs24, (\OffsetB+0)(BO)
|
||||||
lxv vs28, 16(BO)
|
lxv vs28, (\OffsetB+16)(BO)
|
||||||
xxperm vs26, vs24, permute_mask
|
xxperm vs26, vs24, permute_mask
|
||||||
xxperm vs30, vs28, permute_mask
|
xxperm vs30, vs28, permute_mask
|
||||||
lxv vs0, 0(AO)
|
lxv vs0, (\OffsetA+0)(AO)
|
||||||
lxv vs1, 16(AO)
|
lxv vs1, (\OffsetA+16)(AO)
|
||||||
xxpermdi vs25, vs24, vs24,2
|
xxpermdi vs25, vs24, vs24,2
|
||||||
xxpermdi vs29, vs28, vs28,2
|
xxpermdi vs29, vs28, vs28,2
|
||||||
lxv vs2, 32(AO)
|
lxv vs2, (\OffsetA+32)(AO)
|
||||||
lxv vs3, 48(AO)
|
lxv vs3, (\OffsetA+48)(AO)
|
||||||
xxpermdi vs27, vs26, vs26,2
|
xxpermdi vs27, vs26, vs26,2
|
||||||
xxpermdi vs31, vs30, vs30,2
|
xxpermdi vs31, vs30, vs30,2
|
||||||
|
|
||||||
.if \Zero==1
|
|
||||||
xxlxor vs32, vs32, vs32
|
|
||||||
xxlxor vs33, vs33, vs33
|
|
||||||
xxlxor vs34, vs34, vs34
|
|
||||||
xxlxor vs35, vs35, vs35
|
|
||||||
xxlxor vs36, vs36, vs36
|
|
||||||
xxlxor vs37, vs37, vs37
|
|
||||||
xxlxor vs38, vs38, vs38
|
|
||||||
xxlxor vs39, vs39, vs39
|
|
||||||
xxlxor vs40, vs40, vs40
|
|
||||||
xxlxor vs41, vs41, vs41
|
|
||||||
xxlxor vs42, vs42, vs42
|
|
||||||
xxlxor vs43, vs43, vs43
|
|
||||||
xxlxor vs44, vs44, vs44
|
|
||||||
xxlxor vs45, vs45, vs45
|
|
||||||
xxlxor vs46, vs46, vs46
|
|
||||||
xxlxor vs47, vs47, vs47
|
|
||||||
xxlxor vs48, vs48, vs48
|
|
||||||
xxlxor vs49, vs49, vs49
|
|
||||||
xxlxor vs50, vs50, vs50
|
|
||||||
xxlxor vs51, vs51, vs51
|
|
||||||
xxlxor vs52, vs52, vs52
|
|
||||||
xxlxor vs53, vs53, vs53
|
|
||||||
xxlxor vs54, vs54, vs54
|
|
||||||
xxlxor vs55, vs55, vs55
|
|
||||||
xxlxor vs56, vs56, vs56
|
|
||||||
xxlxor vs57, vs57, vs57
|
|
||||||
xxlxor vs58, vs58, vs58
|
|
||||||
xxlxor vs59, vs59, vs59
|
|
||||||
xxlxor vs60, vs60, vs60
|
|
||||||
xxlxor vs61, vs61, vs61
|
|
||||||
xxlxor vs62, vs62, vs62
|
|
||||||
xxlxor vs63, vs63, vs63
|
|
||||||
.endif
|
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
.macro END8x16_NORMAL
|
.macro END8x16_NORMAL
|
||||||
END8x16 0, AO, BO, 64,32
|
END8x16 0, AO, BO, 64,32
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
|
/*
 * END8x16_WITHOUT_ADD: expands END8x16 on AO/BO with First=0 and both
 * offsets zero. END8x16 tests its offsets with ".if \OffsetB != 0", so this
 * presumably performs the final accumulate without advancing AO/BO --
 * TODO confirm against the END8x16 body.
 */
.macro END8x16_WITHOUT_ADD
|
||||||
|
END8x16 0, AO,BO,0,0
|
||||||
|
.endm
|
||||||
|
|
||||||
.macro END8x16 First, AREG, BREG, OffsetA, OffsetB
|
.macro END8x16 First, AREG, BREG, OffsetA, OffsetB
|
||||||
|
|
||||||
.if \OffsetB != 0
|
.if \OffsetB != 0
|
||||||
|
@ -258,145 +219,202 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
.macro KERNEL8x16_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
|
.macro KERNEL8x16_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
|
||||||
|
|
||||||
KERNEL8x16_L1_L2_I \AREG,\BREG, \OffsetA,\OffsetB, (\Index*2),0 ,0
|
KERNEL8x16_2 \AREG,\BREG, \OffsetA,\OffsetB, (\Index*2),0 ,0
|
||||||
KERNEL8x16_L1_L2_I \AREG,\BREG,\OffsetA,\OffsetB, (\Index*2+1),\IsLast ,\Complete
|
KERNEL8x16_2 \AREG,\BREG,\OffsetA,\OffsetB, (\Index*2+1),\IsLast ,\Complete
|
||||||
|
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
.macro KERNEL8x16 First
|
.macro KERNEL8x16 First
|
||||||
|
|
||||||
LOAD8x16 0
|
LOAD8x16 0,0
|
||||||
END8x16 \First, AO, BO, 64,32
|
END8x16 \First, AO, BO, 64,32
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
.macro KERNEL8x16_L1_L2_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
|
.macro LOAD8x16_2
|
||||||
lxv vs8, DISP16(\Index,\OffsetB)(\BREG)
|
LOAD8x16_2O AO,BO, 0,0
|
||||||
lxv vs12, DISP16(\Index,16+\OffsetB)(\BREG)
|
.endm
|
||||||
|
|
||||||
|
/*
 * LOAD8x16_2O AREG,BREG, OffsetA,OffsetB
 *
 * Preloads the operand registers consumed by KERNEL8x16_2:
 *   from BREG: 64 bytes  -> vs8, vs12 (at OffsetB+0/+16) and
 *                           vs24, vs28 (at OffsetB+32/+48)
 *   from AREG: 128 bytes -> vs4-vs7 (at OffsetA+0..+48) and
 *                           vs0-vs3 (at OffsetA+64..+112)
 * For every loaded B vector X it also derives three rearranged copies:
 *   X+2 = xxperm(X, permute_mask), X+1 = doubleword-swap of X,
 *   X+3 = doubleword-swap of X+2
 * (vs8 -> vs9..vs11, vs12 -> vs13..vs15, vs24 -> vs25..vs27,
 *  vs28 -> vs29..vs31).
 * Neither AREG nor BREG is modified. Loads and permutes are interleaved in
 * the original order; that ordering looks deliberate (scheduling), so it is
 * kept untouched.
 */
.macro LOAD8x16_2O AREG,BREG, OffsetA,OffsetB
|
||||||
|
/* B operands: first pair (vs8, vs12) and second pair (vs24, vs28) */
lxv vs8, (\OffsetB)(\BREG)
|
||||||
|
lxv vs12, (16+\OffsetB)(\BREG)
|
||||||
|
lxv vs24, (32+\OffsetB)(\BREG)
|
||||||
|
lxv vs28, (32+16+\OffsetB)(\BREG)
|
||||||
|
/* A operands paired with vs8..vs15 in KERNEL8x16_2 */
lxv vs4, (0+\OffsetA)(\AREG)
|
||||||
|
lxv vs5, (16+\OffsetA)(\AREG)
|
||||||
|
xxperm vs10, vs8, permute_mask
|
||||||
|
xxperm vs14, vs12, permute_mask
|
||||||
|
lxv vs6, (32+\OffsetA)(\AREG)
|
||||||
|
lxv vs7, (48+\OffsetA)(\AREG)
|
||||||
|
xxpermdi vs9, vs8, vs8,2
|
||||||
|
xxpermdi vs13, vs12, vs12,2
|
||||||
|
/* A operands paired with vs24..vs31 in KERNEL8x16_2 */
lxv vs0, (64+\OffsetA)(\AREG)
|
||||||
|
lxv vs1, (64+16+\OffsetA)(\AREG)
|
||||||
|
xxpermdi vs11, vs10, vs10,2
|
||||||
|
xxpermdi vs15, vs14, vs14,2
|
||||||
|
lxv vs2, (64+32+\OffsetA)(\AREG)
|
||||||
|
lxv vs3, (64+48+\OffsetA)(\AREG)
|
||||||
|
|
||||||
|
/* rearranged copies of the second B pair */
xxperm vs26, vs24, permute_mask
|
||||||
|
xxperm vs30, vs28, permute_mask
|
||||||
|
xxpermdi vs25, vs24, vs24,2
|
||||||
|
xxpermdi vs29, vs28, vs28,2
|
||||||
|
xxpermdi vs27, vs26, vs26,2
|
||||||
|
xxpermdi vs31, vs30, vs30,2
|
||||||
|
.endm
|
||||||
|
|
||||||
|
/*
 * END8x16_2: final step after a LOAD8x16_2-style preload. Expands
 * KERNEL8x16_2 on AO/BO with Index=0, IsLast=1, Complete=1, so no further
 * operands are loaded and AO/BO are advanced by the given offsets.
 */
.macro END8x16_2
|
||||||
|
/* The two-step preload reads 128 bytes of A and 64 bytes of B, so those are
   the offsets passed here (pointer advance assumes DISP32/DISP16 with
   Index=0 yield the plain offset -- TODO confirm). */
|
||||||
|
KERNEL8x16_2 AO,BO, 128,64,0 ,1,1
|
||||||
|
.endm
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/*
 * KERNEL8x16_E2 OffsetA,OffsetB, Index,IsLast
 * Ending variant: expands KERNEL8x16_2 on AO/BO with Complete=1, which
 * skips every ".if \Complete==0" reload/permute section of that macro
 * (only the multiply-adds, and the pointer update when IsLast=1, remain).
 */
.macro KERNEL8x16_E2 OffsetA,OffsetB, Index,IsLast
|
||||||
|
KERNEL8x16_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1
|
||||||
|
.endm
|
||||||
|
|
||||||
|
|
||||||
|
/*
 * KERNEL8x16_L2 OffsetA,OffsetB, Index,IsLast
 * Loop variant: expands KERNEL8x16_2 on AO/BO with Complete=0, so the
 * multiply-adds are interleaved with the loads and permutes that refill the
 * operand registers for the following step.
 */
.macro KERNEL8x16_L2 OffsetA,OffsetB, Index,IsLast
|
||||||
|
KERNEL8x16_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0
|
||||||
|
.endm
|
||||||
|
|
||||||
|
|
||||||
|
.macro KERNEL8x16_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
|
||||||
|
xvmaddasp vs32, vs4,vs8
|
||||||
|
xvmaddasp vs33, vs5,vs8
|
||||||
|
xvmaddasp vs48, vs4,vs12
|
||||||
|
xvmaddasp vs49, vs5,vs12
|
||||||
|
|
||||||
|
xvmaddasp vs40, vs4,vs10
|
||||||
|
xvmaddasp vs41, vs5,vs10
|
||||||
|
xvmaddasp vs56, vs4,vs14
|
||||||
|
xvmaddasp vs57, vs5,vs14
|
||||||
|
|
||||||
|
xvmaddasp vs36, vs4,vs9
|
||||||
|
xvmaddasp vs37, vs5,vs9
|
||||||
|
xvmaddasp vs52, vs4,vs13
|
||||||
|
xvmaddasp vs53, vs5,vs13
|
||||||
|
|
||||||
|
xvmaddasp vs44, vs4,vs11
|
||||||
|
xvmaddasp vs45, vs5,vs11
|
||||||
|
xvmaddasp vs60, vs4,vs15
|
||||||
|
xvmaddasp vs61, vs5,vs15
|
||||||
|
|
||||||
|
.if \Complete==0
|
||||||
|
lxv vs4, DISP32(\Index,0+\OffsetA)(\AREG)
|
||||||
|
lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG)
|
||||||
|
.endif
|
||||||
|
|
||||||
|
xvmaddasp vs34, vs6,vs8
|
||||||
|
xvmaddasp vs35, vs7,vs8
|
||||||
|
xvmaddasp vs50, vs6,vs12
|
||||||
|
xvmaddasp vs51, vs7,vs12
|
||||||
|
.if \Complete==0
|
||||||
|
lxv vs8, DISP16(\Index,\OffsetB)(\BREG)
|
||||||
|
lxv vs12, DISP16(\Index,16+\OffsetB)(\BREG)
|
||||||
|
.endif
|
||||||
|
xvmaddasp vs42, vs6,vs10
|
||||||
|
xvmaddasp vs43, vs7,vs10
|
||||||
|
xvmaddasp vs58, vs6,vs14
|
||||||
|
xvmaddasp vs59, vs7,vs14
|
||||||
|
.if \Complete==0
|
||||||
|
xxperm vs10, vs8, permute_mask
|
||||||
|
xxperm vs14, vs12, permute_mask
|
||||||
|
.endif
|
||||||
|
xvmaddasp vs38, vs6,vs9
|
||||||
|
xvmaddasp vs39, vs7,vs9
|
||||||
|
xvmaddasp vs54, vs6,vs13
|
||||||
|
xvmaddasp vs55, vs7,vs13
|
||||||
|
.if \Complete==0
|
||||||
|
xxpermdi vs9, vs8, vs8,2
|
||||||
|
xxpermdi vs13, vs12, vs12,2
|
||||||
|
.endif
|
||||||
|
xvmaddasp vs46, vs6,vs11
|
||||||
|
xvmaddasp vs47, vs7,vs11
|
||||||
|
xvmaddasp vs62, vs6,vs15
|
||||||
|
xvmaddasp vs63, vs7,vs15
|
||||||
|
.if \Complete==0
|
||||||
|
xxpermdi vs11, vs10, vs10,2
|
||||||
|
xxpermdi vs15, vs14, vs14,2
|
||||||
|
.endif
|
||||||
|
|
||||||
|
.if \Complete==0
|
||||||
|
lxv vs6, DISP32(\Index,32+\OffsetA)(\AREG)
|
||||||
|
lxv vs7, DISP32(\Index,48+\OffsetA)(\AREG)
|
||||||
|
.endif
|
||||||
|
|
||||||
xvmaddasp vs32, vs0,vs24
|
xvmaddasp vs32, vs0,vs24
|
||||||
xvmaddasp vs36, vs0,vs25
|
xvmaddasp vs33, vs1,vs24
|
||||||
lxv vs4, DISP32(\Index,0+\OffsetA)(\AREG)
|
|
||||||
lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG)
|
|
||||||
xxperm vs10, vs8, permute_mask
|
|
||||||
xxperm vs14, vs12, permute_mask
|
|
||||||
xvmaddasp vs40, vs0,vs26
|
|
||||||
xvmaddasp vs44, vs0,vs27
|
|
||||||
lxv vs6, DISP32(\Index,32+\OffsetA)(\AREG)
|
|
||||||
lxv vs7, DISP32(\Index,48+\OffsetA)(\AREG)
|
|
||||||
xvmaddasp vs48, vs0,vs28
|
xvmaddasp vs48, vs0,vs28
|
||||||
xvmaddasp vs52, vs0,vs29
|
xvmaddasp vs49, vs1,vs28
|
||||||
|
xvmaddasp vs40, vs0,vs26
|
||||||
xxpermdi vs9, vs8, vs8,2
|
xvmaddasp vs41, vs1,vs26
|
||||||
xxpermdi vs13, vs12, vs12,2
|
|
||||||
|
|
||||||
xvmaddasp vs56, vs0,vs30
|
xvmaddasp vs56, vs0,vs30
|
||||||
|
xvmaddasp vs57, vs1,vs30
|
||||||
|
xvmaddasp vs36, vs0,vs25
|
||||||
|
xvmaddasp vs37, vs1,vs25
|
||||||
|
xvmaddasp vs52, vs0,vs29
|
||||||
|
xvmaddasp vs53, vs1,vs29
|
||||||
|
xvmaddasp vs44, vs0,vs27
|
||||||
|
xvmaddasp vs45, vs1,vs27
|
||||||
xvmaddasp vs60, vs0,vs31
|
xvmaddasp vs60, vs0,vs31
|
||||||
|
xvmaddasp vs61, vs1,vs31
|
||||||
xxpermdi vs11, vs10, vs10,2
|
|
||||||
xxpermdi vs15, vs14, vs14,2
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
xvmaddasp vs33, vs1,vs24
|
|
||||||
xvmaddasp vs37, vs1,vs25
|
|
||||||
|
|
||||||
xvmaddasp vs41, vs1,vs26
|
|
||||||
xvmaddasp vs45, vs1,vs27
|
|
||||||
xvmaddasp vs49, vs1,vs28
|
|
||||||
xvmaddasp vs53, vs1,vs29
|
|
||||||
xvmaddasp vs57, vs1,vs30
|
|
||||||
xvmaddasp vs61, vs1,vs31
|
|
||||||
.if \Complete==0
|
.if \Complete==0
|
||||||
lxv vs0, DISP32(\Index,64+\OffsetA)(\AREG)
|
lxv vs0, DISP32(\Index,64+\OffsetA)(\AREG)
|
||||||
lxv vs1, DISP32(\Index,64+16+\OffsetA)(\AREG)
|
lxv vs1, DISP32(\Index,64+16+\OffsetA)(\AREG)
|
||||||
.endif
|
.endif
|
||||||
xvmaddasp vs34, vs2,vs24
|
|
||||||
xvmaddasp vs38, vs2,vs25
|
|
||||||
xvmaddasp vs42, vs2,vs26
|
|
||||||
xvmaddasp vs46, vs2,vs27
|
|
||||||
xvmaddasp vs50, vs2,vs28
|
|
||||||
xvmaddasp vs54, vs2,vs29
|
|
||||||
xvmaddasp vs58, vs2,vs30
|
|
||||||
xvmaddasp vs62, vs2,vs31
|
|
||||||
|
|
||||||
xvmaddasp vs35, vs3,vs24
|
xvmaddasp vs34, vs2,vs24
|
||||||
xvmaddasp vs39, vs3,vs25
|
xvmaddasp vs35, vs3,vs24
|
||||||
xvmaddasp vs43, vs3,vs26
|
xvmaddasp vs50, vs2,vs28
|
||||||
xvmaddasp vs47, vs3,vs27
|
xvmaddasp vs51, vs3,vs28
|
||||||
xvmaddasp vs51, vs3,vs28
|
|
||||||
xvmaddasp vs55, vs3,vs29
|
|
||||||
xvmaddasp vs59, vs3,vs30
|
|
||||||
xvmaddasp vs63, vs3,vs31
|
|
||||||
.if \Complete==0
|
|
||||||
lxv vs2, DISP32(\Index,64+32+\OffsetA)(\AREG)
|
|
||||||
lxv vs3, DISP32(\Index,64+48+\OffsetA)(\AREG)
|
|
||||||
.endif
|
|
||||||
xvmaddasp vs32, vs4,vs8
|
|
||||||
xvmaddasp vs36, vs4,vs9
|
|
||||||
.if \Complete==0
|
.if \Complete==0
|
||||||
lxv vs24, DISP16(\Index,32+\OffsetB)(\BREG)
|
lxv vs24, DISP16(\Index,32+\OffsetB)(\BREG)
|
||||||
lxv vs28, DISP16(\Index,32+16+\OffsetB)(\BREG)
|
lxv vs28, DISP16(\Index,32+16+\OffsetB)(\BREG)
|
||||||
|
.endif
|
||||||
|
xvmaddasp vs42, vs2,vs26
|
||||||
|
xvmaddasp vs43, vs3,vs26
|
||||||
|
xvmaddasp vs58, vs2,vs30
|
||||||
|
xvmaddasp vs59, vs3,vs30
|
||||||
|
.if \Complete==0
|
||||||
|
xxperm vs26, vs24, permute_mask
|
||||||
|
xxperm vs30, vs28, permute_mask
|
||||||
|
.endif
|
||||||
|
xvmaddasp vs38, vs2,vs25
|
||||||
|
xvmaddasp vs39, vs3,vs25
|
||||||
|
xvmaddasp vs54, vs2,vs29
|
||||||
|
xvmaddasp vs55, vs3,vs29
|
||||||
|
.if \Complete==0
|
||||||
|
xxpermdi vs25, vs24, vs24,2
|
||||||
|
xxpermdi vs29, vs28, vs28,2
|
||||||
|
.endif
|
||||||
|
xvmaddasp vs46, vs2,vs27
|
||||||
|
xvmaddasp vs47, vs3,vs27
|
||||||
|
xvmaddasp vs62, vs2,vs31
|
||||||
|
xvmaddasp vs63, vs3,vs31
|
||||||
|
.if \Complete==0
|
||||||
|
xxpermdi vs27, vs26, vs26,2
|
||||||
|
xxpermdi vs31, vs30, vs30,2
|
||||||
.endif
|
.endif
|
||||||
|
.if \Complete==0
|
||||||
|
lxv vs2, DISP32(\Index,64+32+\OffsetA)(\AREG)
|
||||||
|
lxv vs3, DISP32(\Index,64+48+\OffsetA)(\AREG)
|
||||||
|
.endif
|
||||||
|
|
||||||
|
|
||||||
.if \IsLast==1
|
.if \IsLast==1
|
||||||
.if \Complete==1
|
.if \Complete==1
|
||||||
addi \AREG, \AREG, DISP32(\Index,64+\OffsetA)
|
addi \BREG, \BREG, DISP16(\Index,\OffsetB)
|
||||||
addi \BREG, \BREG, DISP16(\Index,32+\OffsetB)
|
addi \AREG, \AREG, DISP32(\Index,\OffsetA)
|
||||||
|
|
||||||
.else
|
.else
|
||||||
addi \AREG, \AREG, DISP32(\Index,128)
|
|
||||||
addi \BREG, \BREG, DISP16(\Index,64)
|
addi \BREG, \BREG, DISP16(\Index,64)
|
||||||
|
addi \AREG, \AREG, DISP32(\Index,128)
|
||||||
|
|
||||||
.endif
|
.endif
|
||||||
.endif
|
.endif
|
||||||
xvmaddasp vs40, vs4,vs10
|
|
||||||
xvmaddasp vs44, vs4,vs11
|
|
||||||
.if \Complete==0
|
|
||||||
xxperm vs26, vs24, permute_mask
|
|
||||||
xxperm vs30, vs28, permute_mask
|
|
||||||
.endif
|
|
||||||
xvmaddasp vs48, vs4,vs12
|
|
||||||
xvmaddasp vs52, vs4,vs13
|
|
||||||
.if \Complete==0
|
|
||||||
xxpermdi vs25, vs24, vs24,2
|
|
||||||
xxpermdi vs29, vs28, vs28,2
|
|
||||||
.endif
|
|
||||||
|
|
||||||
xvmaddasp vs56, vs4,vs14
|
|
||||||
xvmaddasp vs60, vs4,vs15
|
|
||||||
|
|
||||||
.if \Complete==0
|
|
||||||
xxpermdi vs27, vs26, vs26,2
|
|
||||||
xxpermdi vs31, vs30, vs30,2
|
|
||||||
|
|
||||||
.endif
|
|
||||||
|
|
||||||
xvmaddasp vs33, vs5,vs8
|
|
||||||
xvmaddasp vs37, vs5,vs9
|
|
||||||
xvmaddasp vs41, vs5,vs10
|
|
||||||
xvmaddasp vs45, vs5,vs11
|
|
||||||
xvmaddasp vs49, vs5,vs12
|
|
||||||
xvmaddasp vs53, vs5,vs13
|
|
||||||
xvmaddasp vs57, vs5,vs14
|
|
||||||
xvmaddasp vs61, vs5,vs15
|
|
||||||
|
|
||||||
xvmaddasp vs34, vs6,vs8
|
|
||||||
xvmaddasp vs38, vs6,vs9
|
|
||||||
xvmaddasp vs42, vs6,vs10
|
|
||||||
xvmaddasp vs46, vs6,vs11
|
|
||||||
xvmaddasp vs50, vs6,vs12
|
|
||||||
xvmaddasp vs54, vs6,vs13
|
|
||||||
xvmaddasp vs58, vs6,vs14
|
|
||||||
xvmaddasp vs62, vs6,vs15
|
|
||||||
|
|
||||||
xvmaddasp vs35, vs7,vs8
|
|
||||||
xvmaddasp vs39, vs7,vs9
|
|
||||||
xvmaddasp vs43, vs7,vs10
|
|
||||||
xvmaddasp vs47, vs7,vs11
|
|
||||||
xvmaddasp vs51, vs7,vs12
|
|
||||||
xvmaddasp vs55, vs7,vs13
|
|
||||||
xvmaddasp vs59, vs7,vs14
|
|
||||||
xvmaddasp vs63, vs7,vs15
|
|
||||||
|
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -1353,7 +1353,7 @@ ZGEMM_L1:
|
||||||
ZGEMM_L1_BEGIN:
|
ZGEMM_L1_BEGIN:
|
||||||
/*----------------------------------------*/
|
/*----------------------------------------*/
|
||||||
mr CO, C
|
mr CO, C
|
||||||
slwi T1, LDC , 1
|
|
||||||
add T2,C,LDC
|
add T2,C,LDC
|
||||||
mr AO, A
|
mr AO, A
|
||||||
add C, C, T1
|
add C, C, T1
|
||||||
|
|
6
param.h
6
param.h
|
@ -2250,12 +2250,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#define SGEMM_DEFAULT_P 832
|
#define SGEMM_DEFAULT_P 832
|
||||||
#define DGEMM_DEFAULT_P 128
|
#define DGEMM_DEFAULT_P 128
|
||||||
#define CGEMM_DEFAULT_P 640
|
#define CGEMM_DEFAULT_P 512
|
||||||
#define ZGEMM_DEFAULT_P 256
|
#define ZGEMM_DEFAULT_P 256
|
||||||
|
|
||||||
#define SGEMM_DEFAULT_Q 1025
|
#define SGEMM_DEFAULT_Q 1026
|
||||||
#define DGEMM_DEFAULT_Q 384
|
#define DGEMM_DEFAULT_Q 384
|
||||||
#define CGEMM_DEFAULT_Q 640
|
#define CGEMM_DEFAULT_Q 1026
|
||||||
#define ZGEMM_DEFAULT_Q 1026
|
#define ZGEMM_DEFAULT_Q 1026
|
||||||
|
|
||||||
#define SYMV_P 8
|
#define SYMV_P 8
|
||||||
|
|
Loading…
Reference in New Issue