dgemm_ncopy_4_ save/restore

This commit is contained in:
the mslm 2018-02-18 04:16:38 +04:00 committed by Ubuntu
parent c5425daa6b
commit 2c0a008281
2 changed files with 162 additions and 251 deletions

View File

@ -109,81 +109,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "dgemm_ncopy_macros_4_power8.S"
#define STACKSIZE 384
#define STACKSIZE 576
#define STACKSIZE 144
PROLOGUE
PROFCODE
addi SP, SP, -STACKSIZE
//addi SP, SP, -208
li r0, 0
stfd f14, 0(SP)
stfd f15, 8(SP)
stfd f16, 16(SP)
stfd f17, 24(SP)
stfd f18, 32(SP)
stfd f19, 40(SP)
stfd f20, 48(SP)
stfd f21, 56(SP)
stfd f22, 64(SP)
stfd f23, 72(SP)
stfd f24, 80(SP)
stfd f25, 88(SP)
stfd f26, 96(SP)
stfd f27, 104(SP)
stfd f28, 112(SP)
stfd f29, 120(SP)
stfd f30, 128(SP)
stfd f31, 136(SP)
std r14, 0(SP)
std r15, 8(SP)
std r16, 16(SP)
std r17, 24(SP)
std r18, 32(SP)
std r19, 40(SP)
std r20, 48(SP)
std r21, 56(SP)
std r22, 64(SP)
std r23, 72(SP)
std r24, 80(SP)
std r25, 88(SP)
std r26, 96(SP)
std r27, 104(SP)
std r28, 112(SP)
std r29, 120(SP)
std r30, 128(SP)
std r31, 136(SP)
std r31, 144(SP)
std r30, 152(SP)
std r29, 160(SP)
std r28, 168(SP)
std r27, 176(SP)
std r26, 184(SP)
std r25, 192(SP)
std r24, 200(SP)
std r23, 208(SP)
std r22, 216(SP)
std r21, 224(SP)
std r20, 232(SP)
std r19, 240(SP)
std r18, 248(SP)
std r17, 256(SP)
std r16, 264(SP)
std r15, 272(SP)
std r14, 280(SP)
addi r11,SP,288
stvx v20, r11,r0
addi r11,r11,16
stvx v21, r11,r0
addi r11,r11,16
stvx v22, r11,r0
addi r11,r11,16
stvx v23, r11,r0
addi r11,r11,16
stvx v24, r11,r0
addi r11,r11,16
stvx v25, r11,r0
addi r11,r11,16
stvx v26, r11,r0
addi r11,r11,16
stvx v27, r11,r0
addi r11,r11,16
stvx v28, r11,r0
addi r11,r11,16
stvx v29, r11,r0
addi r11,r11,16
stvx v30, r11,r0
addi r11,r11,16
stvx v31, r11,r0
li r11,0
cmpwi cr0, M, 0
ble- L999
cmpwi cr0, N, 0
@ -191,10 +146,8 @@ li r11,0
slwi LDA, LDA, BASE_SHIFT
//li PREA, 384
//li PREB, 384
li PREA, 576
li PREB, 576
li PREA, 384
li PREB, 384
li o8, 8
@ -210,70 +163,24 @@ li r11,0
L999:
li r3, 0
lfd f14, 0(SP)
lfd f15, 8(SP)
lfd f16, 16(SP)
lfd f17, 24(SP)
lfd f18, 32(SP)
lfd f19, 40(SP)
lfd f20, 48(SP)
lfd f21, 56(SP)
lfd f22, 64(SP)
lfd f23, 72(SP)
lfd f24, 80(SP)
lfd f25, 88(SP)
lfd f26, 96(SP)
lfd f27, 104(SP)
lfd f28, 112(SP)
lfd f29, 120(SP)
lfd f30, 128(SP)
lfd f31, 136(SP)
ld r31, 144(SP)
ld r30, 152(SP)
ld r29, 160(SP)
ld r28, 168(SP)
ld r27, 176(SP)
ld r26, 184(SP)
ld r25, 192(SP)
ld r24, 200(SP)
ld r23, 208(SP)
ld r22, 216(SP)
ld r21, 224(SP)
ld r20, 232(SP)
ld r19, 240(SP)
ld r18, 248(SP)
ld r17, 256(SP)
ld r16, 264(SP)
ld r15, 272(SP)
ld r14, 280(SP)
addi r11,SP,288
lvx v20, r11,r3
addi r11,r11,16
lvx v21, r11,r3
addi r11,r11,16
lvx v22, r11,r3
addi r11,r11,16
lvx v23, r11,r3
addi r11,r11,16
lvx v24, r11,r3
addi r11,r11,16
lvx v25, r11,r3
addi r11,r11,16
lvx v26, r11,r3
addi r11,r11,16
lvx v27, r11,r3
addi r11,r11,16
lvx v28, r11,r3
addi r11,r11,16
lvx v29, r11,r3
addi r11,r11,16
lvx v30, r11,r3
addi r11,r11,16
lvx v31, r11,r3
li r11,0
ld r14, 0(SP)
ld r15, 8(SP)
ld r16, 16(SP)
ld r17, 24(SP)
ld r18, 32(SP)
ld r19, 40(SP)
ld r20, 48(SP)
ld r21, 56(SP)
ld r22, 64(SP)
ld r23, 72(SP)
ld r24, 80(SP)
ld r25, 88(SP)
ld r26, 96(SP)
ld r27, 104(SP)
ld r28, 112(SP)
ld r29, 120(SP)
ld r30, 128(SP)
ld r31, 136(SP)
addi SP, SP, STACKSIZE
//addi SP, SP, 208

View File

@ -41,94 +41,98 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro COPY_4x16
lxvd2x vs0, o0, A0
lxvd2x vs8, o0, A1
lxvd2x vs24, o0, A3
lxvd2x vs16, o0, A2
lxvd2x vs1, o0, A1
lxvd2x vs2, o0, A2
lxvd2x vs3, o0, A3
lxvd2x vs1, o16, A0
lxvd2x vs9, o16, A1
lxvd2x vs17, o16, A2
lxvd2x vs25, o16, A3
lxvd2x vs4, o16, A0
lxvd2x vs5, o16, A1
lxvd2x vs6, o16, A2
lxvd2x vs7, o16, A3
lxvd2x vs2, o32, A0
lxvd2x vs10, o32, A1
lxvd2x vs18, o32, A2
lxvd2x vs26, o32, A3
xxpermdi vs32, vs0, vs1, 0
xxpermdi vs33, vs2, vs3, 0
xxpermdi vs34, vs0, vs1, 3
xxpermdi vs35, vs2, vs3, 3
lxvd2x vs3, o48, A0
lxvd2x vs11, o48, A1
lxvd2x vs19, o48, A2
lxvd2x vs27, o48, A3
xxpermdi vs36, vs4, vs5, 0
xxpermdi vs37, vs6, vs7, 0
xxpermdi vs38, vs4, vs5, 3
xxpermdi vs39, vs6, vs7, 3
lxvd2x vs4, o64, A0
lxvd2x vs12, o64, A1
lxvd2x vs20, o64, A2
lxvd2x vs28, o64, A3
lxvd2x vs0, o32, A0
lxvd2x vs1, o32, A1
lxvd2x vs2, o32, A2
lxvd2x vs3, o32, A3
lxvd2x vs5, o80, A0
lxvd2x vs13, o80, A1
lxvd2x vs21, o80, A2
lxvd2x vs29, o80, A3
lxvd2x vs6, o96, A0
lxvd2x vs14, o96, A1
lxvd2x vs22, o96, A2
lxvd2x vs30, o96, A3
lxvd2x vs7, o112, A0
lxvd2x vs15, o112, A1
lxvd2x vs23, o112, A2
lxvd2x vs31, o112, A3
lxvd2x vs4, o48, A0
lxvd2x vs5, o48, A1
lxvd2x vs6, o48, A2
lxvd2x vs7, o48, A3
xxpermdi vs32, vs0, vs8, 0
xxpermdi vs33, vs16, vs24, 0
xxpermdi vs34, vs0, vs8, 3
xxpermdi vs35, vs16, vs24, 3
xxpermdi vs40, vs0, vs1, 0
xxpermdi vs41, vs2, vs3, 0
xxpermdi vs42, vs0, vs1, 3
xxpermdi vs43, vs2, vs3, 3
xxpermdi vs36, vs1, vs9, 0
xxpermdi vs37, vs17, vs25, 0
xxpermdi vs38, vs1, vs9, 3
xxpermdi vs39, vs17, vs25, 3
xxpermdi vs44, vs4, vs5, 0
xxpermdi vs45, vs6, vs7, 0
xxpermdi vs46, vs4, vs5, 3
xxpermdi vs47, vs6, vs7, 3
xxpermdi vs40, vs2, vs10, 0
xxpermdi vs41, vs18, vs26, 0
xxpermdi vs42, vs2, vs10, 3
xxpermdi vs43, vs18, vs26, 3
lxvd2x vs0, o64, A0
lxvd2x vs1, o64, A1
lxvd2x vs2, o64, A2
lxvd2x vs3, o64, A3
xxpermdi vs44, vs3, vs11, 0
xxpermdi vs45, vs19, vs27, 0
xxpermdi vs46, vs3, vs11, 3
xxpermdi vs47, vs19, vs27, 3
lxvd2x vs4, o80, A0
lxvd2x vs5, o80, A1
lxvd2x vs6, o80, A2
lxvd2x vs7, o80, A3
xxpermdi vs48, vs4, vs12, 0
xxpermdi vs49, vs20, vs28, 0
xxpermdi vs50, vs4, vs12, 3
xxpermdi vs51, vs20, vs28, 3
xxpermdi vs52, vs5, vs13, 0
xxpermdi vs53, vs21, vs29, 0
xxpermdi vs54, vs5, vs13, 3
xxpermdi vs55, vs21, vs29, 3
xxpermdi vs48, vs0, vs1, 0
xxpermdi vs49, vs2, vs3, 0
xxpermdi vs50, vs0, vs1, 3
xxpermdi vs51, vs2, vs3, 3
xxpermdi vs8, vs4, vs5, 0
xxpermdi vs9, vs6, vs7, 0
xxpermdi vs10, vs4, vs5, 3
xxpermdi vs11, vs6, vs7, 3
lxvd2x vs0, o96, A0
lxvd2x vs1, o96, A1
lxvd2x vs2, o96, A2
lxvd2x vs3, o96, A3
lxvd2x vs6, o112, A0
lxvd2x vs7, o112, A1
lxvd2x vs12, o112, A2
lxvd2x vs13, o112, A3
xxpermdi vs4, vs0, vs1, 0
xxpermdi vs5, vs2, vs3, 0
xxpermdi vs0, vs0, vs1, 3
xxpermdi vs2, vs2, vs3, 3
addi A0, A0, 128
addi A1, A1, 128
xxpermdi vs56, vs6, vs14, 0
xxpermdi vs57, vs22, vs30, 0
xxpermdi vs58, vs6, vs14, 3
xxpermdi vs59, vs22, vs30, 3
xxpermdi vs1, vs6, vs7, 0
xxpermdi vs3, vs12, vs13, 0
xxpermdi vs6, vs6, vs7, 3
xxpermdi vs12, vs12, vs13, 3
dcbt BO, PREB
addi A3, A3, 128
addi A2, A2, 128
xxpermdi vs60, vs7, vs15, 0
xxpermdi vs61, vs23, vs31, 0
xxpermdi vs62, vs7, vs15, 3
xxpermdi vs63, vs23, vs31, 3
dcbt BO, PREB
stxvd2x vs32, o0, BO
stxvd2x vs33, o16, BO
stxvd2x vs34, o32, BO
@ -157,22 +161,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvd2x vs49, o16, BO
stxvd2x vs50, o32, BO
stxvd2x vs51, o48, BO
stxvd2x vs52, o64, BO
stxvd2x vs53, o80, BO
stxvd2x vs54, o96, BO
stxvd2x vs55, o112, BO
stxvd2x vs8, o64, BO
stxvd2x vs9, o80, BO
stxvd2x vs10, o96, BO
stxvd2x vs11, o112, BO
addi BO, BO, 128
dcbt BO, PREB
stxvd2x vs56, o0, BO
stxvd2x vs57, o16, BO
stxvd2x vs58, o32, BO
stxvd2x vs59, o48, BO
stxvd2x vs60, o64, BO
stxvd2x vs61, o80, BO
stxvd2x vs62, o96, BO
stxvd2x vs63, o112, BO
stxvd2x vs4, o0, BO
stxvd2x vs5, o16, BO
stxvd2x vs0, o32, BO
stxvd2x vs2, o48, BO
stxvd2x vs1, o64, BO
stxvd2x vs3, o80, BO
stxvd2x vs6, o96, BO
stxvd2x vs12, o112, BO
addi BO, BO, 128
@ -199,39 +203,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi A1, A1, 64
lxvd2x vs16, o0, A2
lxvd2x vs17, o16, A2
lxvd2x vs18, o32, A2
lxvd2x vs19, o48, A2
lxvd2x vs4, o0, A2
lxvd2x vs5, o16, A2
lxvd2x vs6, o32, A2
lxvd2x vs7, o48, A2
addi A2, A2, 64
lxvd2x vs24, o0, A3
lxvd2x vs25, o16, A3
lxvd2x vs26, o32, A3
lxvd2x vs27, o48, A3
lxvd2x vs12, o0, A3
lxvd2x vs13, o16, A3
lxvd2x vs50, o32, A3
lxvd2x vs51, o48, A3
addi A3, A3, 64
xxpermdi vs32, vs0, vs8, 0
xxpermdi vs33, vs16, vs24, 0
xxpermdi vs33, vs4, vs12, 0
xxpermdi vs34, vs0, vs8, 3
xxpermdi vs35, vs16, vs24, 3
xxpermdi vs35, vs4, vs12, 3
xxpermdi vs36, vs1, vs9, 0
xxpermdi vs37, vs17, vs25, 0
xxpermdi vs37, vs5, vs13, 0
xxpermdi vs38, vs1, vs9, 3
xxpermdi vs39, vs17, vs25, 3
xxpermdi vs39, vs5, vs13, 3
xxpermdi vs40, vs2, vs10, 0
xxpermdi vs41, vs18, vs26, 0
xxpermdi vs41, vs6, vs50, 0
xxpermdi vs42, vs2, vs10, 3
xxpermdi vs43, vs18, vs26, 3
xxpermdi vs43, vs6, vs50, 3
xxpermdi vs44, vs3, vs11, 0
xxpermdi vs45, vs19, vs27, 0
xxpermdi vs45, vs7, vs51, 0
xxpermdi vs46, vs3, vs11, 3
xxpermdi vs47, vs19, vs27, 3
xxpermdi vs47, vs7, vs51, 3
stxvd2x vs32, o0, BO
@ -274,25 +278,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi A1, A1, 32
lxvd2x vs16, o0, A2
lxvd2x vs17, o16, A2
lxvd2x vs10, o0, A2
lxvd2x vs11, o16, A2
addi A2, A2, 32
lxvd2x vs24, o0, A3
lxvd2x vs25, o16, A3
lxvd2x vs12, o0, A3
lxvd2x vs13, o16, A3
addi A3, A3, 32
xxpermdi vs32, vs0, vs8, 0
xxpermdi vs33, vs16, vs24, 0
xxpermdi vs33, vs10, vs12, 0
xxpermdi vs34, vs0, vs8, 3
xxpermdi vs35, vs16, vs24, 3
xxpermdi vs35, vs10, vs12, 3
xxpermdi vs36, vs1, vs9, 0
xxpermdi vs37, vs17, vs25, 0
xxpermdi vs37, vs11, vs13, 0
xxpermdi vs38, vs1, vs9, 3
xxpermdi vs39, vs17, vs25, 3
xxpermdi vs39, vs11, vs13, 3
stxvd2x vs32, o0, BO
@ -323,18 +327,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi A1, A1, 16
lxvd2x vs16, o0, A2
lxvd2x vs9, o0, A2
addi A2, A2, 16
lxvd2x vs24, o0, A3
lxvd2x vs10, o0, A3
addi A3, A3, 16
xxpermdi vs32, vs0, vs8, 0
xxpermdi vs33, vs16, vs24, 0
xxpermdi vs33, vs9, vs10, 0
xxpermdi vs34, vs0, vs8, 3
xxpermdi vs35, vs16, vs24, 3
xxpermdi vs35, vs9, vs10, 3
stxvd2x vs32, o0, BO
@ -361,16 +365,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi A1, A1, 8
lxsdx vs16, o0, A2
lxsdx vs9, o0, A2
addi A2, A2, 8
lxsdx vs24, o0, A3
lxsdx vs10, o0, A3
addi A3, A3, 8
xxpermdi vs32, vs0, vs8, 0
xxpermdi vs33, vs16, vs24, 0
xxpermdi vs33, vs9, vs10, 0
stxvd2x vs32, o0, BO
@ -404,8 +408,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
lxvd2x vs11, o48, A1
lxvd2x vs12, o64, A1
lxvd2x vs13, o80, A1
lxvd2x vs14, o96, A1
lxvd2x vs15, o112, A1
lxvd2x vs48, o96, A1
lxvd2x vs49, o112, A1
addi A1, A1, 128
@ -427,11 +431,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xxpermdi vs42, vs5, vs13, 0
xxpermdi vs43, vs5, vs13, 3
xxpermdi vs44, vs6, vs14, 0
xxpermdi vs45, vs6, vs14, 3
xxpermdi vs44, vs6, vs48, 0
xxpermdi vs45, vs6, vs48, 3
xxpermdi vs46, vs7, vs15, 0
xxpermdi vs47, vs7, vs15, 3
xxpermdi vs46, vs7, vs49, 0
xxpermdi vs47, vs7, vs49, 3
stxvd2x vs32, o0, BO