From 2c0a008281c99369499aebbf8961e66205061fb7 Mon Sep 17 00:00:00 2001 From: the mslm Date: Sun, 18 Feb 2018 04:16:38 +0400 Subject: [PATCH] dgemm_ncopy_4_ save/restore --- kernel/power/dgemm_ncopy_4_power8.S | 175 ++++----------- kernel/power/dgemm_ncopy_macros_4_power8.S | 238 +++++++++++---------- 2 files changed, 162 insertions(+), 251 deletions(-) diff --git a/kernel/power/dgemm_ncopy_4_power8.S b/kernel/power/dgemm_ncopy_4_power8.S index e0936574d..fa2ed349e 100644 --- a/kernel/power/dgemm_ncopy_4_power8.S +++ b/kernel/power/dgemm_ncopy_4_power8.S @@ -109,81 +109,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "dgemm_ncopy_macros_4_power8.S" -#define STACKSIZE 384 -#define STACKSIZE 576 +#define STACKSIZE 144 + PROLOGUE PROFCODE addi SP, SP, -STACKSIZE -//addi SP, SP, -208 + li r0, 0 - stfd f14, 0(SP) - stfd f15, 8(SP) - stfd f16, 16(SP) - stfd f17, 24(SP) - stfd f18, 32(SP) - stfd f19, 40(SP) - stfd f20, 48(SP) - stfd f21, 56(SP) - stfd f22, 64(SP) - stfd f23, 72(SP) - stfd f24, 80(SP) - stfd f25, 88(SP) - stfd f26, 96(SP) - stfd f27, 104(SP) - stfd f28, 112(SP) - stfd f29, 120(SP) - stfd f30, 128(SP) - stfd f31, 136(SP) + std r14, 0(SP) + std r15, 8(SP) + std r16, 16(SP) + std r17, 24(SP) + std r18, 32(SP) + std r19, 40(SP) + std r20, 48(SP) + std r21, 56(SP) + std r22, 64(SP) + std r23, 72(SP) + std r24, 80(SP) + std r25, 88(SP) + std r26, 96(SP) + std r27, 104(SP) + std r28, 112(SP) + std r29, 120(SP) + std r30, 128(SP) + std r31, 136(SP) - std r31, 144(SP) - std r30, 152(SP) - std r29, 160(SP) - std r28, 168(SP) - std r27, 176(SP) - std r26, 184(SP) - std r25, 192(SP) - std r24, 200(SP) - std r23, 208(SP) - std r22, 216(SP) - std r21, 224(SP) - std r20, 232(SP) - std r19, 240(SP) - std r18, 248(SP) - std r17, 256(SP) - std r16, 264(SP) - std r15, 272(SP) - std r14, 280(SP) - -addi r11,SP,288 - stvx v20, r11,r0 -addi r11,r11,16 - stvx v21, r11,r0 -addi r11,r11,16 - stvx v22, r11,r0 -addi r11,r11,16 - stvx v23, r11,r0 -addi r11,r11,16 - stvx v24, r11,r0 -addi r11,r11,16 - stvx v25, r11,r0 -addi r11,r11,16 - stvx v26, r11,r0 -addi r11,r11,16 - stvx v27, r11,r0 -addi r11,r11,16 - stvx v28, r11,r0 -addi r11,r11,16 - stvx v29, r11,r0 -addi r11,r11,16 - stvx v30, r11,r0 -addi r11,r11,16 - stvx v31, r11,r0 -li r11,0 - cmpwi cr0, M, 0 ble- L999 cmpwi cr0, N, 0 @@ -191,10 +146,8 @@ li r11,0 slwi LDA, LDA, BASE_SHIFT - //li PREA, 384 - //li PREB, 384 - li PREA, 576 - li PREB, 576 + li PREA, 384 + li PREB, 384 li o8, 8 @@ -210,70 +163,24 @@ li r11,0 L999: - li r3, 0 - - lfd f14, 0(SP) - lfd f15, 8(SP) - lfd f16, 16(SP) - lfd f17, 24(SP) - lfd f18, 32(SP) - lfd f19, 40(SP) - lfd f20, 48(SP) - lfd f21, 56(SP) - lfd f22, 64(SP) - lfd f23, 72(SP) - lfd f24, 80(SP) - lfd f25, 88(SP) - lfd f26, 96(SP) - lfd f27, 104(SP) - lfd f28, 112(SP) - lfd f29, 120(SP) - lfd f30, 128(SP) - lfd f31, 136(SP) - - ld r31, 144(SP) - ld r30, 152(SP) - ld r29, 160(SP) - ld r28, 168(SP) - ld r27, 176(SP) - ld r26, 184(SP) - ld r25, 192(SP) - ld r24, 200(SP) - ld r23, 208(SP) - ld r22, 216(SP) - ld r21, 224(SP) - ld r20, 232(SP) - ld r19, 240(SP) - ld r18, 248(SP) - ld r17, 256(SP) - ld r16, 264(SP) - ld r15, 272(SP) - ld r14, 280(SP) -addi r11,SP,288 - lvx v20, r11,r3 -addi r11,r11,16 - lvx v21, r11,r3 -addi r11,r11,16 - lvx v22, r11,r3 -addi r11,r11,16 - lvx v23, r11,r3 -addi r11,r11,16 - lvx v24, r11,r3 -addi r11,r11,16 - lvx v25, r11,r3 -addi r11,r11,16 - lvx v26, r11,r3 -addi r11,r11,16 - lvx v27, r11,r3 -addi r11,r11,16 - lvx v28, r11,r3 -addi r11,r11,16 - lvx v29, r11,r3 -addi r11,r11,16 - lvx v30, r11,r3 -addi r11,r11,16 - lvx v31, r11,r3 -li r11,0 + ld r14, 0(SP) + ld r15, 8(SP) + ld r16, 16(SP) + ld r17, 24(SP) + ld r18, 32(SP) + ld r19, 40(SP) + ld r20, 48(SP) + ld r21, 56(SP) + ld r22, 64(SP) + ld r23, 72(SP) + ld r24, 80(SP) + ld r25, 88(SP) + ld r26, 96(SP) + ld r27, 104(SP) + ld r28, 112(SP) + ld r29, 120(SP) + ld r30, 128(SP) + ld r31, 136(SP) addi SP, SP, STACKSIZE //addi SP, SP, 208 diff --git a/kernel/power/dgemm_ncopy_macros_4_power8.S b/kernel/power/dgemm_ncopy_macros_4_power8.S index fafb09877..8d6744b91 100644 --- a/kernel/power/dgemm_ncopy_macros_4_power8.S +++ b/kernel/power/dgemm_ncopy_macros_4_power8.S @@ -41,94 +41,98 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro COPY_4x16 lxvd2x vs0, o0, A0 - lxvd2x vs8, o0, A1 - lxvd2x vs24, o0, A3 - lxvd2x vs16, o0, A2 + lxvd2x vs1, o0, A1 + lxvd2x vs2, o0, A2 + lxvd2x vs3, o0, A3 - lxvd2x vs1, o16, A0 - lxvd2x vs9, o16, A1 - lxvd2x vs17, o16, A2 - lxvd2x vs25, o16, A3 + lxvd2x vs4, o16, A0 + lxvd2x vs5, o16, A1 + lxvd2x vs6, o16, A2 + lxvd2x vs7, o16, A3 - lxvd2x vs2, o32, A0 - lxvd2x vs10, o32, A1 - lxvd2x vs18, o32, A2 - lxvd2x vs26, o32, A3 + xxpermdi vs32, vs0, vs1, 0 + xxpermdi vs33, vs2, vs3, 0 + xxpermdi vs34, vs0, vs1, 3 + xxpermdi vs35, vs2, vs3, 3 - lxvd2x vs3, o48, A0 - lxvd2x vs11, o48, A1 - lxvd2x vs19, o48, A2 - lxvd2x vs27, o48, A3 + xxpermdi vs36, vs4, vs5, 0 + xxpermdi vs37, vs6, vs7, 0 + xxpermdi vs38, vs4, vs5, 3 + xxpermdi vs39, vs6, vs7, 3 - lxvd2x vs4, o64, A0 - lxvd2x vs12, o64, A1 - lxvd2x vs20, o64, A2 - lxvd2x vs28, o64, A3 + lxvd2x vs0, o32, A0 + lxvd2x vs1, o32, A1 + lxvd2x vs2, o32, A2 + lxvd2x vs3, o32, A3 - lxvd2x vs5, o80, A0 - lxvd2x vs13, o80, A1 - lxvd2x vs21, o80, A2 - lxvd2x vs29, o80, A3 - - lxvd2x vs6, o96, A0 - lxvd2x vs14, o96, A1 - lxvd2x vs22, o96, A2 - lxvd2x vs30, o96, A3 - - lxvd2x vs7, o112, A0 - lxvd2x vs15, o112, A1 - lxvd2x vs23, o112, A2 - lxvd2x vs31, o112, A3 + lxvd2x vs4, o48, A0 + lxvd2x vs5, o48, A1 + lxvd2x vs6, o48, A2 + lxvd2x vs7, o48, A3 - xxpermdi vs32, vs0, vs8, 0 - xxpermdi vs33, vs16, vs24, 0 - xxpermdi vs34, vs0, vs8, 3 - xxpermdi vs35, vs16, vs24, 3 + xxpermdi vs40, vs0, vs1, 0 + xxpermdi vs41, vs2, vs3, 0 + xxpermdi vs42, vs0, vs1, 3 + xxpermdi vs43, vs2, vs3, 3 - xxpermdi vs36, vs1, vs9, 0 - xxpermdi vs37, vs17, vs25, 0 - xxpermdi vs38, vs1, vs9, 3 - xxpermdi vs39, vs17, vs25, 3 + xxpermdi vs44, vs4, vs5, 0 + xxpermdi vs45, vs6, vs7, 0 + xxpermdi vs46, vs4, vs5, 3 + xxpermdi vs47, vs6, vs7, 3 - xxpermdi vs40, vs2, vs10, 0 - xxpermdi vs41, vs18, vs26, 0 - xxpermdi vs42, vs2, vs10, 3 - xxpermdi vs43, vs18, vs26, 3 + lxvd2x vs0, o64, A0 + lxvd2x vs1, o64, A1 + lxvd2x vs2, o64, A2 + lxvd2x vs3, o64, A3 - xxpermdi vs44, vs3, vs11, 0 - xxpermdi vs45, vs19, vs27, 0 - xxpermdi vs46, vs3, vs11, 3 - xxpermdi vs47, vs19, vs27, 3 + lxvd2x vs4, o80, A0 + lxvd2x vs5, o80, A1 + lxvd2x vs6, o80, A2 + lxvd2x vs7, o80, A3 - xxpermdi vs48, vs4, vs12, 0 - xxpermdi vs49, vs20, vs28, 0 - xxpermdi vs50, vs4, vs12, 3 - xxpermdi vs51, vs20, vs28, 3 - xxpermdi vs52, vs5, vs13, 0 - xxpermdi vs53, vs21, vs29, 0 - xxpermdi vs54, vs5, vs13, 3 - xxpermdi vs55, vs21, vs29, 3 + xxpermdi vs48, vs0, vs1, 0 + xxpermdi vs49, vs2, vs3, 0 + xxpermdi vs50, vs0, vs1, 3 + xxpermdi vs51, vs2, vs3, 3 + + xxpermdi vs8, vs4, vs5, 0 + xxpermdi vs9, vs6, vs7, 0 + xxpermdi vs10, vs4, vs5, 3 + xxpermdi vs11, vs6, vs7, 3 + + lxvd2x vs0, o96, A0 + lxvd2x vs1, o96, A1 + lxvd2x vs2, o96, A2 + lxvd2x vs3, o96, A3 + + + lxvd2x vs6, o112, A0 + lxvd2x vs7, o112, A1 + lxvd2x vs12, o112, A2 + lxvd2x vs13, o112, A3 + + + xxpermdi vs4, vs0, vs1, 0 + xxpermdi vs5, vs2, vs3, 0 + xxpermdi vs0, vs0, vs1, 3 + xxpermdi vs2, vs2, vs3, 3 + addi A0, A0, 128 addi A1, A1, 128 - xxpermdi vs56, vs6, vs14, 0 - xxpermdi vs57, vs22, vs30, 0 - xxpermdi vs58, vs6, vs14, 3 - xxpermdi vs59, vs22, vs30, 3 + xxpermdi vs1, vs6, vs7, 0 + xxpermdi vs3, vs12, vs13, 0 + xxpermdi vs6, vs6, vs7, 3 + xxpermdi vs12, vs12, vs13, 3 + + dcbt BO, PREB addi A3, A3, 128 addi A2, A2, 128 - xxpermdi vs60, vs7, vs15, 0 - xxpermdi vs61, vs23, vs31, 0 - xxpermdi vs62, vs7, vs15, 3 - xxpermdi vs63, vs23, vs31, 3 - - dcbt BO, PREB - stxvd2x vs32, o0, BO stxvd2x vs33, o16, BO stxvd2x vs34, o32, BO @@ -157,22 +161,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvd2x vs49, o16, BO stxvd2x vs50, o32, BO stxvd2x vs51, o48, BO - stxvd2x vs52, o64, BO - stxvd2x vs53, o80, BO - stxvd2x vs54, o96, BO - stxvd2x vs55, o112, BO + stxvd2x vs8, o64, BO + stxvd2x vs9, o80, BO + stxvd2x vs10, o96, BO + stxvd2x vs11, o112, BO addi BO, BO, 128 dcbt BO, PREB - stxvd2x vs56, o0, BO - stxvd2x vs57, o16, BO - stxvd2x vs58, o32, BO - stxvd2x vs59, o48, BO - stxvd2x vs60, o64, BO - stxvd2x vs61, o80, BO - stxvd2x vs62, o96, BO - stxvd2x vs63, o112, BO + stxvd2x vs4, o0, BO + stxvd2x vs5, o16, BO + stxvd2x vs0, o32, BO + stxvd2x vs2, o48, BO + stxvd2x vs1, o64, BO + stxvd2x vs3, o80, BO + stxvd2x vs6, o96, BO + stxvd2x vs12, o112, BO addi BO, BO, 128 @@ -199,39 +203,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi A1, A1, 64 - lxvd2x vs16, o0, A2 - lxvd2x vs17, o16, A2 - lxvd2x vs18, o32, A2 - lxvd2x vs19, o48, A2 + lxvd2x vs4, o0, A2 + lxvd2x vs5, o16, A2 + lxvd2x vs6, o32, A2 + lxvd2x vs7, o48, A2 addi A2, A2, 64 - lxvd2x vs24, o0, A3 - lxvd2x vs25, o16, A3 - lxvd2x vs26, o32, A3 - lxvd2x vs27, o48, A3 + lxvd2x vs12, o0, A3 + lxvd2x vs13, o16, A3 + lxvd2x vs50, o32, A3 + lxvd2x vs51, o48, A3 addi A3, A3, 64 xxpermdi vs32, vs0, vs8, 0 - xxpermdi vs33, vs16, vs24, 0 + xxpermdi vs33, vs4, vs12, 0 xxpermdi vs34, vs0, vs8, 3 - xxpermdi vs35, vs16, vs24, 3 + xxpermdi vs35, vs4, vs12, 3 xxpermdi vs36, vs1, vs9, 0 - xxpermdi vs37, vs17, vs25, 0 + xxpermdi vs37, vs5, vs13, 0 xxpermdi vs38, vs1, vs9, 3 - xxpermdi vs39, vs17, vs25, 3 + xxpermdi vs39, vs5, vs13, 3 xxpermdi vs40, vs2, vs10, 0 - xxpermdi vs41, vs18, vs26, 0 + xxpermdi vs41, vs6, vs50, 0 xxpermdi vs42, vs2, vs10, 3 - xxpermdi vs43, vs18, vs26, 3 + xxpermdi vs43, vs6, vs50, 3 xxpermdi vs44, vs3, vs11, 0 - xxpermdi vs45, vs19, vs27, 0 + xxpermdi vs45, vs7, vs51, 0 xxpermdi vs46, vs3, vs11, 3 - xxpermdi vs47, vs19, vs27, 3 + xxpermdi vs47, vs7, vs51, 3 stxvd2x vs32, o0, BO @@ -274,25 +278,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi A1, A1, 32 - lxvd2x vs16, o0, A2 - lxvd2x vs17, o16, A2 + lxvd2x vs10, o0, A2 + lxvd2x vs11, o16, A2 addi A2, A2, 32 - lxvd2x vs24, o0, A3 - lxvd2x vs25, o16, A3 + lxvd2x vs12, o0, A3 + lxvd2x vs13, o16, A3 addi A3, A3, 32 xxpermdi vs32, vs0, vs8, 0 - xxpermdi vs33, vs16, vs24, 0 + xxpermdi vs33, vs10, vs12, 0 xxpermdi vs34, vs0, vs8, 3 - xxpermdi vs35, vs16, vs24, 3 + xxpermdi vs35, vs10, vs12, 3 xxpermdi vs36, vs1, vs9, 0 - xxpermdi vs37, vs17, vs25, 0 + xxpermdi vs37, vs11, vs13, 0 xxpermdi vs38, vs1, vs9, 3 - xxpermdi vs39, vs17, vs25, 3 + xxpermdi vs39, vs11, vs13, 3 stxvd2x vs32, o0, BO @@ -323,18 +327,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi A1, A1, 16 - lxvd2x vs16, o0, A2 + lxvd2x vs9, o0, A2 addi A2, A2, 16 - lxvd2x vs24, o0, A3 + lxvd2x vs10, o0, A3 addi A3, A3, 16 xxpermdi vs32, vs0, vs8, 0 - xxpermdi vs33, vs16, vs24, 0 + xxpermdi vs33, vs9, vs10, 0 xxpermdi vs34, vs0, vs8, 3 - xxpermdi vs35, vs16, vs24, 3 + xxpermdi vs35, vs9, vs10, 3 stxvd2x vs32, o0, BO @@ -361,16 +365,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi A1, A1, 8 - lxsdx vs16, o0, A2 + lxsdx vs9, o0, A2 addi A2, A2, 8 - lxsdx vs24, o0, A3 + lxsdx vs10, o0, A3 addi A3, A3, 8 xxpermdi vs32, vs0, vs8, 0 - xxpermdi vs33, vs16, vs24, 0 + xxpermdi vs33, vs9, vs10, 0 stxvd2x vs32, o0, BO @@ -404,8 +408,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvd2x vs11, o48, A1 lxvd2x vs12, o64, A1 lxvd2x vs13, o80, A1 - lxvd2x vs14, o96, A1 - lxvd2x vs15, o112, A1 + lxvd2x vs48, o96, A1 + lxvd2x vs49, o112, A1 addi A1, A1, 128 @@ -427,11 +431,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxpermdi vs42, vs5, vs13, 0 xxpermdi vs43, vs5, vs13, 3 - xxpermdi vs44, vs6, vs14, 0 - xxpermdi vs45, vs6, vs14, 3 + xxpermdi vs44, vs6, vs48, 0 + xxpermdi vs45, vs6, vs48, 3 - xxpermdi vs46, vs7, vs15, 0 - xxpermdi vs47, vs7, vs15, 3 + xxpermdi vs46, vs7, vs49, 0 + xxpermdi vs47, vs7, vs49, 3 stxvd2x vs32, o0, BO