dgemm_ncopy_4: reduce register save/restore to GPRs r14-r31

This commit is contained in:
the mslm 2018-02-18 04:16:38 +04:00 committed by Ubuntu
parent c5425daa6b
commit 2c0a008281
2 changed files with 162 additions and 251 deletions

View File

@@ -109,81 +109,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "dgemm_ncopy_macros_4_power8.S" #include "dgemm_ncopy_macros_4_power8.S"
#define STACKSIZE 384 #define STACKSIZE 144
#define STACKSIZE 576
PROLOGUE PROLOGUE
PROFCODE PROFCODE
addi SP, SP, -STACKSIZE addi SP, SP, -STACKSIZE
//addi SP, SP, -208
li r0, 0 li r0, 0
stfd f14, 0(SP) std r14, 0(SP)
stfd f15, 8(SP) std r15, 8(SP)
stfd f16, 16(SP) std r16, 16(SP)
stfd f17, 24(SP) std r17, 24(SP)
stfd f18, 32(SP) std r18, 32(SP)
stfd f19, 40(SP) std r19, 40(SP)
stfd f20, 48(SP) std r20, 48(SP)
stfd f21, 56(SP) std r21, 56(SP)
stfd f22, 64(SP) std r22, 64(SP)
stfd f23, 72(SP) std r23, 72(SP)
stfd f24, 80(SP) std r24, 80(SP)
stfd f25, 88(SP) std r25, 88(SP)
stfd f26, 96(SP) std r26, 96(SP)
stfd f27, 104(SP) std r27, 104(SP)
stfd f28, 112(SP) std r28, 112(SP)
stfd f29, 120(SP) std r29, 120(SP)
stfd f30, 128(SP) std r30, 128(SP)
stfd f31, 136(SP) std r31, 136(SP)
std r31, 144(SP)
std r30, 152(SP)
std r29, 160(SP)
std r28, 168(SP)
std r27, 176(SP)
std r26, 184(SP)
std r25, 192(SP)
std r24, 200(SP)
std r23, 208(SP)
std r22, 216(SP)
std r21, 224(SP)
std r20, 232(SP)
std r19, 240(SP)
std r18, 248(SP)
std r17, 256(SP)
std r16, 264(SP)
std r15, 272(SP)
std r14, 280(SP)
addi r11,SP,288
stvx v20, r11,r0
addi r11,r11,16
stvx v21, r11,r0
addi r11,r11,16
stvx v22, r11,r0
addi r11,r11,16
stvx v23, r11,r0
addi r11,r11,16
stvx v24, r11,r0
addi r11,r11,16
stvx v25, r11,r0
addi r11,r11,16
stvx v26, r11,r0
addi r11,r11,16
stvx v27, r11,r0
addi r11,r11,16
stvx v28, r11,r0
addi r11,r11,16
stvx v29, r11,r0
addi r11,r11,16
stvx v30, r11,r0
addi r11,r11,16
stvx v31, r11,r0
li r11,0
cmpwi cr0, M, 0 cmpwi cr0, M, 0
ble- L999 ble- L999
cmpwi cr0, N, 0 cmpwi cr0, N, 0
@@ -191,10 +146,8 @@ li r11,0
slwi LDA, LDA, BASE_SHIFT slwi LDA, LDA, BASE_SHIFT
//li PREA, 384 li PREA, 384
//li PREB, 384 li PREB, 384
li PREA, 576
li PREB, 576
li o8, 8 li o8, 8
@@ -210,70 +163,24 @@ li r11,0
L999: L999:
li r3, 0 ld r14, 0(SP)
ld r15, 8(SP)
lfd f14, 0(SP) ld r16, 16(SP)
lfd f15, 8(SP) ld r17, 24(SP)
lfd f16, 16(SP) ld r18, 32(SP)
lfd f17, 24(SP) ld r19, 40(SP)
lfd f18, 32(SP) ld r20, 48(SP)
lfd f19, 40(SP) ld r21, 56(SP)
lfd f20, 48(SP) ld r22, 64(SP)
lfd f21, 56(SP) ld r23, 72(SP)
lfd f22, 64(SP) ld r24, 80(SP)
lfd f23, 72(SP) ld r25, 88(SP)
lfd f24, 80(SP) ld r26, 96(SP)
lfd f25, 88(SP) ld r27, 104(SP)
lfd f26, 96(SP) ld r28, 112(SP)
lfd f27, 104(SP) ld r29, 120(SP)
lfd f28, 112(SP) ld r30, 128(SP)
lfd f29, 120(SP) ld r31, 136(SP)
lfd f30, 128(SP)
lfd f31, 136(SP)
ld r31, 144(SP)
ld r30, 152(SP)
ld r29, 160(SP)
ld r28, 168(SP)
ld r27, 176(SP)
ld r26, 184(SP)
ld r25, 192(SP)
ld r24, 200(SP)
ld r23, 208(SP)
ld r22, 216(SP)
ld r21, 224(SP)
ld r20, 232(SP)
ld r19, 240(SP)
ld r18, 248(SP)
ld r17, 256(SP)
ld r16, 264(SP)
ld r15, 272(SP)
ld r14, 280(SP)
addi r11,SP,288
lvx v20, r11,r3
addi r11,r11,16
lvx v21, r11,r3
addi r11,r11,16
lvx v22, r11,r3
addi r11,r11,16
lvx v23, r11,r3
addi r11,r11,16
lvx v24, r11,r3
addi r11,r11,16
lvx v25, r11,r3
addi r11,r11,16
lvx v26, r11,r3
addi r11,r11,16
lvx v27, r11,r3
addi r11,r11,16
lvx v28, r11,r3
addi r11,r11,16
lvx v29, r11,r3
addi r11,r11,16
lvx v30, r11,r3
addi r11,r11,16
lvx v31, r11,r3
li r11,0
addi SP, SP, STACKSIZE addi SP, SP, STACKSIZE
//addi SP, SP, 208 //addi SP, SP, 208

View File

@@ -41,94 +41,98 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro COPY_4x16 .macro COPY_4x16
lxvd2x vs0, o0, A0 lxvd2x vs0, o0, A0
lxvd2x vs8, o0, A1 lxvd2x vs1, o0, A1
lxvd2x vs24, o0, A3 lxvd2x vs2, o0, A2
lxvd2x vs16, o0, A2 lxvd2x vs3, o0, A3
lxvd2x vs1, o16, A0 lxvd2x vs4, o16, A0
lxvd2x vs9, o16, A1 lxvd2x vs5, o16, A1
lxvd2x vs17, o16, A2 lxvd2x vs6, o16, A2
lxvd2x vs25, o16, A3 lxvd2x vs7, o16, A3
lxvd2x vs2, o32, A0 xxpermdi vs32, vs0, vs1, 0
lxvd2x vs10, o32, A1 xxpermdi vs33, vs2, vs3, 0
lxvd2x vs18, o32, A2 xxpermdi vs34, vs0, vs1, 3
lxvd2x vs26, o32, A3 xxpermdi vs35, vs2, vs3, 3
lxvd2x vs3, o48, A0 xxpermdi vs36, vs4, vs5, 0
lxvd2x vs11, o48, A1 xxpermdi vs37, vs6, vs7, 0
lxvd2x vs19, o48, A2 xxpermdi vs38, vs4, vs5, 3
lxvd2x vs27, o48, A3 xxpermdi vs39, vs6, vs7, 3
lxvd2x vs4, o64, A0 lxvd2x vs0, o32, A0
lxvd2x vs12, o64, A1 lxvd2x vs1, o32, A1
lxvd2x vs20, o64, A2 lxvd2x vs2, o32, A2
lxvd2x vs28, o64, A3 lxvd2x vs3, o32, A3
lxvd2x vs5, o80, A0 lxvd2x vs4, o48, A0
lxvd2x vs13, o80, A1 lxvd2x vs5, o48, A1
lxvd2x vs21, o80, A2 lxvd2x vs6, o48, A2
lxvd2x vs29, o80, A3 lxvd2x vs7, o48, A3
lxvd2x vs6, o96, A0
lxvd2x vs14, o96, A1
lxvd2x vs22, o96, A2
lxvd2x vs30, o96, A3
lxvd2x vs7, o112, A0
lxvd2x vs15, o112, A1
lxvd2x vs23, o112, A2
lxvd2x vs31, o112, A3
xxpermdi vs32, vs0, vs8, 0 xxpermdi vs40, vs0, vs1, 0
xxpermdi vs33, vs16, vs24, 0 xxpermdi vs41, vs2, vs3, 0
xxpermdi vs34, vs0, vs8, 3 xxpermdi vs42, vs0, vs1, 3
xxpermdi vs35, vs16, vs24, 3 xxpermdi vs43, vs2, vs3, 3
xxpermdi vs36, vs1, vs9, 0 xxpermdi vs44, vs4, vs5, 0
xxpermdi vs37, vs17, vs25, 0 xxpermdi vs45, vs6, vs7, 0
xxpermdi vs38, vs1, vs9, 3 xxpermdi vs46, vs4, vs5, 3
xxpermdi vs39, vs17, vs25, 3 xxpermdi vs47, vs6, vs7, 3
xxpermdi vs40, vs2, vs10, 0 lxvd2x vs0, o64, A0
xxpermdi vs41, vs18, vs26, 0 lxvd2x vs1, o64, A1
xxpermdi vs42, vs2, vs10, 3 lxvd2x vs2, o64, A2
xxpermdi vs43, vs18, vs26, 3 lxvd2x vs3, o64, A3
xxpermdi vs44, vs3, vs11, 0 lxvd2x vs4, o80, A0
xxpermdi vs45, vs19, vs27, 0 lxvd2x vs5, o80, A1
xxpermdi vs46, vs3, vs11, 3 lxvd2x vs6, o80, A2
xxpermdi vs47, vs19, vs27, 3 lxvd2x vs7, o80, A3
xxpermdi vs48, vs4, vs12, 0
xxpermdi vs49, vs20, vs28, 0
xxpermdi vs50, vs4, vs12, 3
xxpermdi vs51, vs20, vs28, 3
xxpermdi vs52, vs5, vs13, 0 xxpermdi vs48, vs0, vs1, 0
xxpermdi vs53, vs21, vs29, 0 xxpermdi vs49, vs2, vs3, 0
xxpermdi vs54, vs5, vs13, 3 xxpermdi vs50, vs0, vs1, 3
xxpermdi vs55, vs21, vs29, 3 xxpermdi vs51, vs2, vs3, 3
xxpermdi vs8, vs4, vs5, 0
xxpermdi vs9, vs6, vs7, 0
xxpermdi vs10, vs4, vs5, 3
xxpermdi vs11, vs6, vs7, 3
lxvd2x vs0, o96, A0
lxvd2x vs1, o96, A1
lxvd2x vs2, o96, A2
lxvd2x vs3, o96, A3
lxvd2x vs6, o112, A0
lxvd2x vs7, o112, A1
lxvd2x vs12, o112, A2
lxvd2x vs13, o112, A3
xxpermdi vs4, vs0, vs1, 0
xxpermdi vs5, vs2, vs3, 0
xxpermdi vs0, vs0, vs1, 3
xxpermdi vs2, vs2, vs3, 3
addi A0, A0, 128 addi A0, A0, 128
addi A1, A1, 128 addi A1, A1, 128
xxpermdi vs56, vs6, vs14, 0 xxpermdi vs1, vs6, vs7, 0
xxpermdi vs57, vs22, vs30, 0 xxpermdi vs3, vs12, vs13, 0
xxpermdi vs58, vs6, vs14, 3 xxpermdi vs6, vs6, vs7, 3
xxpermdi vs59, vs22, vs30, 3 xxpermdi vs12, vs12, vs13, 3
dcbt BO, PREB
addi A3, A3, 128 addi A3, A3, 128
addi A2, A2, 128 addi A2, A2, 128
xxpermdi vs60, vs7, vs15, 0
xxpermdi vs61, vs23, vs31, 0
xxpermdi vs62, vs7, vs15, 3
xxpermdi vs63, vs23, vs31, 3
dcbt BO, PREB
stxvd2x vs32, o0, BO stxvd2x vs32, o0, BO
stxvd2x vs33, o16, BO stxvd2x vs33, o16, BO
stxvd2x vs34, o32, BO stxvd2x vs34, o32, BO
@@ -157,22 +161,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvd2x vs49, o16, BO stxvd2x vs49, o16, BO
stxvd2x vs50, o32, BO stxvd2x vs50, o32, BO
stxvd2x vs51, o48, BO stxvd2x vs51, o48, BO
stxvd2x vs52, o64, BO stxvd2x vs8, o64, BO
stxvd2x vs53, o80, BO stxvd2x vs9, o80, BO
stxvd2x vs54, o96, BO stxvd2x vs10, o96, BO
stxvd2x vs55, o112, BO stxvd2x vs11, o112, BO
addi BO, BO, 128 addi BO, BO, 128
dcbt BO, PREB dcbt BO, PREB
stxvd2x vs56, o0, BO stxvd2x vs4, o0, BO
stxvd2x vs57, o16, BO stxvd2x vs5, o16, BO
stxvd2x vs58, o32, BO stxvd2x vs0, o32, BO
stxvd2x vs59, o48, BO stxvd2x vs2, o48, BO
stxvd2x vs60, o64, BO stxvd2x vs1, o64, BO
stxvd2x vs61, o80, BO stxvd2x vs3, o80, BO
stxvd2x vs62, o96, BO stxvd2x vs6, o96, BO
stxvd2x vs63, o112, BO stxvd2x vs12, o112, BO
addi BO, BO, 128 addi BO, BO, 128
@@ -199,39 +203,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi A1, A1, 64 addi A1, A1, 64
lxvd2x vs16, o0, A2 lxvd2x vs4, o0, A2
lxvd2x vs17, o16, A2 lxvd2x vs5, o16, A2
lxvd2x vs18, o32, A2 lxvd2x vs6, o32, A2
lxvd2x vs19, o48, A2 lxvd2x vs7, o48, A2
addi A2, A2, 64 addi A2, A2, 64
lxvd2x vs24, o0, A3 lxvd2x vs12, o0, A3
lxvd2x vs25, o16, A3 lxvd2x vs13, o16, A3
lxvd2x vs26, o32, A3 lxvd2x vs50, o32, A3
lxvd2x vs27, o48, A3 lxvd2x vs51, o48, A3
addi A3, A3, 64 addi A3, A3, 64
xxpermdi vs32, vs0, vs8, 0 xxpermdi vs32, vs0, vs8, 0
xxpermdi vs33, vs16, vs24, 0 xxpermdi vs33, vs4, vs12, 0
xxpermdi vs34, vs0, vs8, 3 xxpermdi vs34, vs0, vs8, 3
xxpermdi vs35, vs16, vs24, 3 xxpermdi vs35, vs4, vs12, 3
xxpermdi vs36, vs1, vs9, 0 xxpermdi vs36, vs1, vs9, 0
xxpermdi vs37, vs17, vs25, 0 xxpermdi vs37, vs5, vs13, 0
xxpermdi vs38, vs1, vs9, 3 xxpermdi vs38, vs1, vs9, 3
xxpermdi vs39, vs17, vs25, 3 xxpermdi vs39, vs5, vs13, 3
xxpermdi vs40, vs2, vs10, 0 xxpermdi vs40, vs2, vs10, 0
xxpermdi vs41, vs18, vs26, 0 xxpermdi vs41, vs6, vs50, 0
xxpermdi vs42, vs2, vs10, 3 xxpermdi vs42, vs2, vs10, 3
xxpermdi vs43, vs18, vs26, 3 xxpermdi vs43, vs6, vs50, 3
xxpermdi vs44, vs3, vs11, 0 xxpermdi vs44, vs3, vs11, 0
xxpermdi vs45, vs19, vs27, 0 xxpermdi vs45, vs7, vs51, 0
xxpermdi vs46, vs3, vs11, 3 xxpermdi vs46, vs3, vs11, 3
xxpermdi vs47, vs19, vs27, 3 xxpermdi vs47, vs7, vs51, 3
stxvd2x vs32, o0, BO stxvd2x vs32, o0, BO
@@ -274,25 +278,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi A1, A1, 32 addi A1, A1, 32
lxvd2x vs16, o0, A2 lxvd2x vs10, o0, A2
lxvd2x vs17, o16, A2 lxvd2x vs11, o16, A2
addi A2, A2, 32 addi A2, A2, 32
lxvd2x vs24, o0, A3 lxvd2x vs12, o0, A3
lxvd2x vs25, o16, A3 lxvd2x vs13, o16, A3
addi A3, A3, 32 addi A3, A3, 32
xxpermdi vs32, vs0, vs8, 0 xxpermdi vs32, vs0, vs8, 0
xxpermdi vs33, vs16, vs24, 0 xxpermdi vs33, vs10, vs12, 0
xxpermdi vs34, vs0, vs8, 3 xxpermdi vs34, vs0, vs8, 3
xxpermdi vs35, vs16, vs24, 3 xxpermdi vs35, vs10, vs12, 3
xxpermdi vs36, vs1, vs9, 0 xxpermdi vs36, vs1, vs9, 0
xxpermdi vs37, vs17, vs25, 0 xxpermdi vs37, vs11, vs13, 0
xxpermdi vs38, vs1, vs9, 3 xxpermdi vs38, vs1, vs9, 3
xxpermdi vs39, vs17, vs25, 3 xxpermdi vs39, vs11, vs13, 3
stxvd2x vs32, o0, BO stxvd2x vs32, o0, BO
@@ -323,18 +327,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi A1, A1, 16 addi A1, A1, 16
lxvd2x vs16, o0, A2 lxvd2x vs9, o0, A2
addi A2, A2, 16 addi A2, A2, 16
lxvd2x vs24, o0, A3 lxvd2x vs10, o0, A3
addi A3, A3, 16 addi A3, A3, 16
xxpermdi vs32, vs0, vs8, 0 xxpermdi vs32, vs0, vs8, 0
xxpermdi vs33, vs16, vs24, 0 xxpermdi vs33, vs9, vs10, 0
xxpermdi vs34, vs0, vs8, 3 xxpermdi vs34, vs0, vs8, 3
xxpermdi vs35, vs16, vs24, 3 xxpermdi vs35, vs9, vs10, 3
stxvd2x vs32, o0, BO stxvd2x vs32, o0, BO
@@ -361,16 +365,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi A1, A1, 8 addi A1, A1, 8
lxsdx vs16, o0, A2 lxsdx vs9, o0, A2
addi A2, A2, 8 addi A2, A2, 8
lxsdx vs24, o0, A3 lxsdx vs10, o0, A3
addi A3, A3, 8 addi A3, A3, 8
xxpermdi vs32, vs0, vs8, 0 xxpermdi vs32, vs0, vs8, 0
xxpermdi vs33, vs16, vs24, 0 xxpermdi vs33, vs9, vs10, 0
stxvd2x vs32, o0, BO stxvd2x vs32, o0, BO
@@ -404,8 +408,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
lxvd2x vs11, o48, A1 lxvd2x vs11, o48, A1
lxvd2x vs12, o64, A1 lxvd2x vs12, o64, A1
lxvd2x vs13, o80, A1 lxvd2x vs13, o80, A1
lxvd2x vs14, o96, A1 lxvd2x vs48, o96, A1
lxvd2x vs15, o112, A1 lxvd2x vs49, o112, A1
addi A1, A1, 128 addi A1, A1, 128
@@ -427,11 +431,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xxpermdi vs42, vs5, vs13, 0 xxpermdi vs42, vs5, vs13, 0
xxpermdi vs43, vs5, vs13, 3 xxpermdi vs43, vs5, vs13, 3
xxpermdi vs44, vs6, vs14, 0 xxpermdi vs44, vs6, vs48, 0
xxpermdi vs45, vs6, vs14, 3 xxpermdi vs45, vs6, vs48, 3
xxpermdi vs46, vs7, vs15, 0 xxpermdi vs46, vs7, vs49, 0
xxpermdi vs47, vs7, vs15, 3 xxpermdi vs47, vs7, vs49, 3
stxvd2x vs32, o0, BO stxvd2x vs32, o0, BO