diff --git a/kernel/power/cgemm_tcopy_8_power8.S b/kernel/power/cgemm_tcopy_8_power8.S index 66a50584c..b4df3bc1b 100644 --- a/kernel/power/cgemm_tcopy_8_power8.S +++ b/kernel/power/cgemm_tcopy_8_power8.S @@ -110,7 +110,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "cgemm_tcopy_macros_8_power8.S" -#define STACKSIZE 576 +#define STACKSIZE 144 PROLOGUE @@ -119,49 +119,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi SP, SP, -STACKSIZE li r0, 0 - std r31, 144(SP) - std r30, 152(SP) - std r29, 160(SP) - std r28, 168(SP) - std r27, 176(SP) - std r26, 184(SP) - std r25, 192(SP) - std r24, 200(SP) - std r23, 208(SP) - std r22, 216(SP) - std r21, 224(SP) - std r20, 232(SP) - std r19, 240(SP) - std r18, 248(SP) - std r17, 256(SP) - std r16, 264(SP) - std r15, 272(SP) - std r14, 280(SP) - addi r11, SP, 288 - stvx v20, r11, r0 - addi r11, r11, 16 - stvx v21, r11, r0 - addi r11, r11, 16 - stvx v22, r11, r0 - addi r11, r11, 16 - stvx v23, r11, r0 - addi r11, r11, 16 - stvx v24, r11, r0 - addi r11, r11, 16 - stvx v25, r11, r0 - addi r11, r11, 16 - stvx v26, r11, r0 - addi r11, r11, 16 - stvx v27, r11, r0 - addi r11, r11, 16 - stvx v28, r11, r0 - addi r11, r11, 16 - stvx v29, r11, r0 - addi r11, r11, 16 - stvx v30, r11, r0 - addi r11, r11, 16 - stvx v31, r11, r0 - li r11, 0 + std r14, 0(SP) + std r15, 8(SP) + std r16, 16(SP) + std r17, 24(SP) + std r18, 32(SP) + std r19, 40(SP) + std r20, 48(SP) + std r21, 56(SP) + std r22, 64(SP) + std r23, 72(SP) + std r24, 80(SP) + std r25, 88(SP) + std r26, 96(SP) + std r27, 104(SP) + std r28, 112(SP) + std r29, 120(SP) + std r30, 128(SP) + std r31, 136(SP) cmpwi cr0, M, 0 ble- L999 @@ -203,51 +178,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. L999: - li r3, 0 - - ld r31, 144(SP) - ld r30, 152(SP) - ld r29, 160(SP) - ld r28, 168(SP) - ld r27, 176(SP) - ld r26, 184(SP) - ld r25, 192(SP) - ld r24, 200(SP) - ld r23, 208(SP) - ld r22, 216(SP) - ld r21, 224(SP) - ld r20, 232(SP) - ld r19, 240(SP) - ld r18, 248(SP) - ld r17, 256(SP) - ld r16, 264(SP) - ld r15, 272(SP) - ld r14, 280(SP) - addi r11, SP, 288 - lvx v20, r11, r3 - addi r11, r11, 16 - lvx v21, r11, r3 - addi r11, r11, 16 - lvx v22, r11, r3 - addi r11, r11, 16 - lvx v23, r11, r3 - addi r11, r11, 16 - lvx v24, r11, r3 - addi r11, r11, 16 - lvx v25, r11, r3 - addi r11, r11, 16 - lvx v26, r11, r3 - addi r11, r11, 16 - lvx v27, r11, r3 - addi r11, r11, 16 - lvx v28, r11, r3 - addi r11, r11, 16 - lvx v29, r11, r3 - addi r11, r11, 16 - lvx v30, r11, r3 - addi r11, r11, 16 - lvx v31, r11, r3 - li r11, 0 + ld r14, 0(SP) + ld r15, 8(SP) + ld r16, 16(SP) + ld r17, 24(SP) + ld r18, 32(SP) + ld r19, 40(SP) + ld r20, 48(SP) + ld r21, 56(SP) + ld r22, 64(SP) + ld r23, 72(SP) + ld r24, 80(SP) + ld r25, 88(SP) + ld r26, 96(SP) + ld r27, 104(SP) + ld r28, 112(SP) + ld r29, 120(SP) + ld r30, 128(SP) + ld r31, 136(SP) addi SP, SP, STACKSIZE blr diff --git a/kernel/power/dgemm_tcopy_16_power8.S b/kernel/power/dgemm_tcopy_16_power8.S index 6da816220..fc52e0202 100644 --- a/kernel/power/dgemm_tcopy_16_power8.S +++ b/kernel/power/dgemm_tcopy_16_power8.S @@ -109,61 +109,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "dgemm_tcopy_macros_16_power8.S" -#define STACKSIZE 384 -#define STACKSIZE 576 +#define STACKSIZE 144 + PROLOGUE PROFCODE addi SP, SP, -STACKSIZE -//addi SP, SP, -208 li r0, 0 - std r31, 144(SP) - std r30, 152(SP) - std r29, 160(SP) - std r28, 168(SP) - std r27, 176(SP) - std r26, 184(SP) - std r25, 192(SP) - std r24, 200(SP) - std r23, 208(SP) - std r22, 216(SP) - std r21, 224(SP) - std r20, 232(SP) - std r19, 240(SP) - std r18, 248(SP) - std r17, 256(SP) - std r16, 264(SP) - std r15, 272(SP) - std r14, 280(SP) -addi r11,SP,288 - stvx v20, r11,r0 -addi r11,r11,16 - stvx v21, r11,r0 -addi r11,r11,16 - stvx v22, r11,r0 -addi r11,r11,16 - stvx v23, r11,r0 -addi r11,r11,16 - stvx v24, r11,r0 -addi r11,r11,16 - stvx v25, r11,r0 -addi r11,r11,16 - stvx v26, r11,r0 -addi r11,r11,16 - stvx v27, r11,r0 -addi r11,r11,16 - stvx v28, r11,r0 -addi r11,r11,16 - stvx v29, r11,r0 -addi r11,r11,16 - stvx v30, r11,r0 -addi r11,r11,16 - stvx v31, r11,r0 -li r11,0 + std r14,0(SP) + std r15,8(SP) + std r16,16(SP) + std r17,24(SP) + std r18,32(SP) + std r19,40(SP) + std r20,48(SP) + std r21,56(SP) + std r22,64(SP) + std r23,72(SP) + std r24,80(SP) + std r25,88(SP) + std r26,96(SP) + std r27,104(SP) + std r28,112(SP) + std r29,120(SP) + std r30,128(SP) + std r31,136(SP) cmpwi cr0, M, 0 ble- L999 @@ -198,8 +172,7 @@ li r11,0 add B2, B2, B add B1, B1, B - //li PREA, 384 - li PREA, 576 + li PREA, 384 addi PREB, M16, 128 li o8, 8 @@ -213,52 +186,27 @@ L999: li r3, 0 - ld r31, 144(SP) - ld r30, 152(SP) - ld r29, 160(SP) - ld r28, 168(SP) - ld r27, 176(SP) - ld r26, 184(SP) - ld r25, 192(SP) - ld r24, 200(SP) - ld r23, 208(SP) - ld r22, 216(SP) - ld r21, 224(SP) - ld r20, 232(SP) - ld r19, 240(SP) - ld r18, 248(SP) - ld r17, 256(SP) - ld r16, 264(SP) - ld r15, 272(SP) - ld r14, 280(SP) -addi r11,SP,288 - lvx v20, r11,r3 -addi r11,r11,16 - lvx v21, r11,r3 -addi r11,r11,16 - lvx v22, r11,r3 -addi r11,r11,16 - lvx v23, r11,r3 -addi r11,r11,16 - lvx v24, r11,r3 -addi r11,r11,16 - lvx v25, r11,r3 -addi r11,r11,16 - lvx v26, r11,r3 -addi r11,r11,16 - lvx v27, r11,r3 -addi r11,r11,16 - lvx v28, r11,r3 -addi r11,r11,16 - lvx v29, r11,r3 -addi r11,r11,16 - lvx v30, r11,r3 -addi r11,r11,16 - lvx v31, r11,r3 -li r11,0 + ld r14,0(SP) + ld r15,8(SP) + ld r16,16(SP) + ld r17,24(SP) + ld r18,32(SP) + ld r19,40(SP) + ld r20,48(SP) + ld r21,56(SP) + ld r22,64(SP) + ld r23,72(SP) + ld r24,80(SP) + ld r25,88(SP) + ld r26,96(SP) + ld r27,104(SP) + ld r28,112(SP) + ld r29,120(SP) + ld r30,128(SP) + ld r31,136(SP) addi SP, SP, STACKSIZE -//addi SP, SP, 208 + blr EPILOGUE diff --git a/kernel/power/dgemm_tcopy_macros_16_power8.S b/kernel/power/dgemm_tcopy_macros_16_power8.S index 333e23105..68e53bcf2 100644 --- a/kernel/power/dgemm_tcopy_macros_16_power8.S +++ b/kernel/power/dgemm_tcopy_macros_16_power8.S @@ -58,10 +58,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvd2x vs51, o48, A2 addi A2, A2, 64 - lxvd2x vs56, o0, A3 - lxvd2x vs57, o16, A3 - lxvd2x vs58, o32, A3 - lxvd2x vs59, o48, A3 + lxvd2x vs4, o0, A3 + lxvd2x vs5, o16, A3 + lxvd2x vs6, o32, A3 + lxvd2x vs7, o48, A3 addi A3, A3, 64 lxvd2x vs36, o0, A0 @@ -76,16 +76,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvd2x vs47, o48, A1 addi A1, A1, 64 - lxvd2x vs52, o0, A2 - lxvd2x vs53, o16, A2 - lxvd2x vs54, o32, A2 - lxvd2x vs55, o48, A2 + lxvd2x vs12, o0, A2 + lxvd2x vs13, o16, A2 + lxvd2x vs2, o32, A2 + lxvd2x vs3, o48, A2 addi A2, A2, 64 - lxvd2x vs60, o0, A3 - lxvd2x vs61, o16, A3 - lxvd2x vs62, o32, A3 - lxvd2x vs63, o48, A3 + lxvd2x vs8, o0, A3 + lxvd2x vs9, o16, A3 + lxvd2x vs10, o32, A3 + lxvd2x vs11, o48, A3 addi A3, A3, 64 mr T1, BO @@ -122,23 +122,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvd2x vs51, o48, T1 addi T1, T1, 64 - stxvd2x vs52, o0, T1 - stxvd2x vs53, o16, T1 - stxvd2x vs54, o32, T1 - stxvd2x vs55, o48, T1 + stxvd2x vs12, o0, T1 + stxvd2x vs13, o16, T1 + stxvd2x vs2, o32, T1 + stxvd2x vs3, o48, T1 addi T1, T1, 64 - stxvd2x vs56, o0, T1 - stxvd2x vs57, o16, T1 - stxvd2x vs58, o32, T1 - stxvd2x vs59, o48, T1 + stxvd2x vs4, o0, T1 + stxvd2x vs5, o16, T1 + stxvd2x vs6, o32, T1 + stxvd2x vs7, o48, T1 addi T1, T1, 64 - stxvd2x vs60, o0, T1 - stxvd2x vs61, o16, T1 - stxvd2x vs62, o32, T1 - stxvd2x vs63, o48, T1 + stxvd2x vs8, o0, T1 + stxvd2x vs9, o16, T1 + stxvd2x vs10, o32, T1 + stxvd2x vs11, o48, T1 .endm diff --git a/kernel/power/sgemm_tcopy_16_power8.S b/kernel/power/sgemm_tcopy_16_power8.S index 8f6b4d8c4..b9f6d63fb 100644 --- a/kernel/power/sgemm_tcopy_16_power8.S +++ b/kernel/power/sgemm_tcopy_16_power8.S @@ -110,7 +110,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "sgemm_tcopy_macros_16_power8.S" -#define STACKSIZE 576 +#define STACKSIZE 144 PROLOGUE PROFCODE @@ -118,49 +118,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi SP, SP, -STACKSIZE li r0, 0 - std r31, 144(SP) - std r30, 152(SP) - std r29, 160(SP) - std r28, 168(SP) - std r27, 176(SP) - std r26, 184(SP) - std r25, 192(SP) - std r24, 200(SP) - std r23, 208(SP) - std r22, 216(SP) - std r21, 224(SP) - std r20, 232(SP) - std r19, 240(SP) - std r18, 248(SP) - std r17, 256(SP) - std r16, 264(SP) - std r15, 272(SP) - std r14, 280(SP) - addi r11 ,SP, 288 - stvx v20, r11, r0 - addi r11, r11, 16 - stvx v21, r11, r0 - addi r11, r11, 16 - stvx v22, r11, r0 - addi r11, r11, 16 - stvx v23, r11, r0 - addi r11, r11, 16 - stvx v24, r11, r0 - addi r11, r11, 16 - stvx v25, r11, r0 - addi r11, r11, 16 - stvx v26, r11, r0 - addi r11, r11, 16 - stvx v27, r11, r0 - addi r11, r11, 16 - stvx v28, r11, r0 - addi r11, r11, 16 - stvx v29, r11, r0 - addi r11, r11, 16 - stvx v30, r11, r0 - addi r11, r11, 16 - stvx v31, r11, r0 - li r11, 0 + std r14, 0(SP) + std r15, 8(SP) + std r16, 16(SP) + std r17, 24(SP) + std r18, 32(SP) + std r19, 40(SP) + std r20, 48(SP) + std r21, 56(SP) + std r22, 64(SP) + std r23, 72(SP) + std r24, 80(SP) + std r25, 88(SP) + std r26, 96(SP) + std r27, 104(SP) + std r28, 112(SP) + std r29, 120(SP) + std r30, 128(SP) + std r31, 136(SP) cmpwi cr0, M, 0 ble- L999 @@ -207,51 +182,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. L999: - li r3, 0 - - ld r31, 144(SP) - ld r30, 152(SP) - ld r29, 160(SP) - ld r28, 168(SP) - ld r27, 176(SP) - ld r26, 184(SP) - ld r25, 192(SP) - ld r24, 200(SP) - ld r23, 208(SP) - ld r22, 216(SP) - ld r21, 224(SP) - ld r20, 232(SP) - ld r19, 240(SP) - ld r18, 248(SP) - ld r17, 256(SP) - ld r16, 264(SP) - ld r15, 272(SP) - ld r14, 280(SP) - addi r11, SP, 288 - lvx v20, r11, r3 - addi r11, r11, 16 - lvx v21, r11, r3 - addi r11, r11, 16 - lvx v22, r11, r3 - addi r11, r11, 16 - lvx v23, r11, r3 - addi r11, r11, 16 - lvx v24, r11, r3 - addi r11, r11, 16 - lvx v25, r11, r3 - addi r11, r11, 16 - lvx v26, r11, r3 - addi r11, r11, 16 - lvx v27, r11, r3 - addi r11, r11, 16 - lvx v28, r11, r3 - addi r11, r11, 16 - lvx v29, r11, r3 - addi r11, r11, 16 - lvx v30, r11, r3 - addi r11, r11, 16 - lvx v31, r11, r3 - li r11, 0 + ld r14, 0(SP) + ld r15, 8(SP) + ld r16, 16(SP) + ld r17, 24(SP) + ld r18, 32(SP) + ld r19, 40(SP) + ld r20, 48(SP) + ld r21, 56(SP) + ld r22, 64(SP) + ld r23, 72(SP) + ld r24, 80(SP) + ld r25, 88(SP) + ld r26, 96(SP) + ld r27, 104(SP) + ld r28, 112(SP) + ld r29, 120(SP) + ld r30, 128(SP) + ld r31, 136(SP) addi SP, SP, STACKSIZE blr diff --git a/kernel/power/sgemm_tcopy_8_power8.S b/kernel/power/sgemm_tcopy_8_power8.S index 98185432a..2c18fb4b4 100644 --- a/kernel/power/sgemm_tcopy_8_power8.S +++ b/kernel/power/sgemm_tcopy_8_power8.S @@ -110,57 +110,32 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "sgemm_tcopy_macros_8_power8.S" -#define STACKSIZE 576 +#define STACKSIZE 144 PROLOGUE PROFCODE addi SP, SP, -STACKSIZE li r0, 0 + std r14, 0(SP) + std r15, 8(SP) + std r16, 16(SP) + std r17, 24(SP) + std r18, 32(SP) + std r19, 40(SP) + std r20, 48(SP) + std r21, 56(SP) + std r22, 64(SP) + std r23, 72(SP) + std r24, 80(SP) + std r25, 88(SP) + std r26, 96(SP) + std r27, 104(SP) + std r28, 112(SP) + std r29, 120(SP) + std r30, 128(SP) + std r31, 136(SP) - std r31, 144(SP) - std r30, 152(SP) - std r29, 160(SP) - std r28, 168(SP) - std r27, 176(SP) - std r26, 184(SP) - std r25, 192(SP) - std r24, 200(SP) - std r23, 208(SP) - std r22, 216(SP) - std r21, 224(SP) - std r20, 232(SP) - std r19, 240(SP) - std r18, 248(SP) - std r17, 256(SP) - std r16, 264(SP) - std r15, 272(SP) - std r14, 280(SP) - addi r11, SP, 288 - stvx v20, r11, r0 - addi r11, r11, 16 - stvx v21, r11, r0 - addi r11, r11, 16 - stvx v22, r11, r0 - addi r11, r11, 16 - stvx v23, r11, r0 - addi r11, r11, 16 - stvx v24, r11, r0 - addi r11, r11, 16 - stvx v25, r11, r0 - addi r11, r11, 16 - stvx v26, r11, r0 - addi r11, r11, 16 - stvx v27, r11, r0 - addi r11, r11, 16 - stvx v28, r11, r0 - addi r11, r11, 16 - stvx v29, r11, r0 - addi r11, r11, 16 - stvx v30, r11, r0 - addi r11, r11, 16 - stvx v31, r11, r0 - li r11, 0 cmpwi cr0, M, 0 ble- L999 @@ -202,51 +177,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. L999: - li r3, 0 - - ld r31, 144(SP) - ld r30, 152(SP) - ld r29, 160(SP) - ld r28, 168(SP) - ld r27, 176(SP) - ld r26, 184(SP) - ld r25, 192(SP) - ld r24, 200(SP) - ld r23, 208(SP) - ld r22, 216(SP) - ld r21, 224(SP) - ld r20, 232(SP) - ld r19, 240(SP) - ld r18, 248(SP) - ld r17, 256(SP) - ld r16, 264(SP) - ld r15, 272(SP) - ld r14, 280(SP) - addi r11,SP,288 - lvx v20, r11, r3 - addi r11, r11, 16 - lvx v21, r11, r3 - addi r11, r11, 16 - lvx v22, r11, r3 - addi r11, r11, 16 - lvx v23, r11, r3 - addi r11, r11, 16 - lvx v24, r11, r3 - addi r11, r11, 16 - lvx v25, r11, r3 - addi r11, r11, 16 - lvx v26, r11, r3 - addi r11, r11, 16 - lvx v27, r11, r3 - addi r11, r11, 16 - lvx v28, r11, r3 - addi r11, r11, 16 - lvx v29, r11, r3 - addi r11, r11, 16 - lvx v30, r11, r3 - addi r11, r11, 16 - lvx v31, r11, r3 - li r11, 0 + ld r14, 0(SP) + ld r15, 8(SP) + ld r16, 16(SP) + ld r17, 24(SP) + ld r18, 32(SP) + ld r19, 40(SP) + ld r20, 48(SP) + ld r21, 56(SP) + ld r22, 64(SP) + ld r23, 72(SP) + ld r24, 80(SP) + ld r25, 88(SP) + ld r26, 96(SP) + ld r27, 104(SP) + ld r28, 112(SP) + ld r29, 120(SP) + ld r30, 128(SP) + ld r31, 136(SP) addi SP, SP, STACKSIZE blr diff --git a/kernel/power/zgemm_tcopy_8_power8.S b/kernel/power/zgemm_tcopy_8_power8.S index 2841a9921..164c6443e 100644 --- a/kernel/power/zgemm_tcopy_8_power8.S +++ b/kernel/power/zgemm_tcopy_8_power8.S @@ -109,8 +109,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "zgemm_tcopy_macros_8_power8.S" -#define STACKSIZE 384 -#define STACKSIZE 576 +#define STACKSIZE 144 + PROLOGUE @@ -119,49 +119,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi SP, SP, -STACKSIZE li r0, 0 - std r31, 144(SP) - std r30, 152(SP) - std r29, 160(SP) - std r28, 168(SP) - std r27, 176(SP) - std r26, 184(SP) - std r25, 192(SP) - std r24, 200(SP) - std r23, 208(SP) - std r22, 216(SP) - std r21, 224(SP) - std r20, 232(SP) - std r19, 240(SP) - std r18, 248(SP) - std r17, 256(SP) - std r16, 264(SP) - std r15, 272(SP) - std r14, 280(SP) - addi r11, SP ,288 - stvx v20, r11, r0 - addi r11, r11, 16 - stvx v21, r11, r0 - addi r11, r11, 16 - stvx v22, r11, r0 - addi r11, r11, 16 - stvx v23, r11, r0 - addi r11, r11, 16 - stvx v24, r11, r0 - addi r11, r11, 16 - stvx v25, r11, r0 - addi r11, r11, 16 - stvx v26, r11, r0 - addi r11, r11, 16 - stvx v27, r11, r0 - addi r11, r11, 16 - stvx v28, r11, r0 - addi r11, r11, 16 - stvx v29, r11, r0 - addi r11, r11, 16 - stvx v30, r11, r0 - addi r11, r11 ,16 - stvx v31, r11, r0 - li r11,0 + std r14, 0(SP) + std r15, 8(SP) + std r16, 16(SP) + std r17, 24(SP) + std r18, 32(SP) + std r19, 40(SP) + std r20, 48(SP) + std r21, 56(SP) + std r22, 64(SP) + std r23, 72(SP) + std r24, 80(SP) + std r25, 88(SP) + std r26, 96(SP) + std r27, 104(SP) + std r28, 112(SP) + std r29, 120(SP) + std r30, 128(SP) + std r31, 136(SP) + cmpwi cr0, M, 0 ble- L999 @@ -204,49 +180,24 @@ L999: li r3, 0 - ld r31, 144(SP) - ld r30, 152(SP) - ld r29, 160(SP) - ld r28, 168(SP) - ld r27, 176(SP) - ld r26, 184(SP) - ld r25, 192(SP) - ld r24, 200(SP) - ld r23, 208(SP) - ld r22, 216(SP) - ld r21, 224(SP) - ld r20, 232(SP) - ld r19, 240(SP) - ld r18, 248(SP) - ld r17, 256(SP) - ld r16, 264(SP) - ld r15, 272(SP) - ld r14, 280(SP) - addi r11, SP, 288 - lvx v20, r11,r3 - addi r11, r11, 16 - lvx v21, r11, r3 - addi r11, r11, 16 - lvx v22, r11, r3 - addi r11, r11, 16 - lvx v23, r11, r3 - addi r11, r11, 16 - lvx v24, r11, r3 - addi r11, r11, 16 - lvx v25, r11, r3 - addi r11, r11, 16 - lvx v26, r11, r3 - addi r11, r11, 16 - lvx v27, r11, r3 - addi r11, r11, 16 - lvx v28, r11, r3 - addi r11, r11, 16 - lvx v29, r11, r3 - addi r11, r11, 16 - lvx v30, r11, r3 - addi r11, r11, 16 - lvx v31, r11, r3 - li r11,0 + ld r14, 0(SP) + ld r15, 8(SP) + ld r16, 16(SP) + ld r17, 24(SP) + ld r18, 32(SP) + ld r19, 40(SP) + ld r20, 48(SP) + ld r21, 56(SP) + ld r22, 64(SP) + ld r23, 72(SP) + ld r24, 80(SP) + ld r25, 88(SP) + ld r26, 96(SP) + ld r27, 104(SP) + ld r28, 112(SP) + ld r29, 120(SP) + ld r30, 128(SP) + ld r31, 136(SP) addi SP, SP, STACKSIZE blr diff --git a/kernel/power/zgemm_tcopy_macros_8_power8.S b/kernel/power/zgemm_tcopy_macros_8_power8.S index e8c2f0baa..3f5a5ed03 100644 --- a/kernel/power/zgemm_tcopy_macros_8_power8.S +++ b/kernel/power/zgemm_tcopy_macros_8_power8.S @@ -72,23 +72,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvd2x vs51, o48, A2 addi A2, A2, 64 - lxvd2x vs52, o0, A2 - lxvd2x vs53, o16, A2 - lxvd2x vs54, o32, A2 - lxvd2x vs55, o48, A2 + lxvd2x vs2, o0, A2 + lxvd2x vs3, o16, A2 + lxvd2x vs4, o32, A2 + lxvd2x vs5, o48, A2 addi A2, A2, 64 - lxvd2x vs56, o0, A3 - lxvd2x vs57, o16, A3 - lxvd2x vs58, o32, A3 - lxvd2x vs59, o48, A3 + lxvd2x vs6, o0, A3 + lxvd2x vs7, o16, A3 + lxvd2x vs8, o32, A3 + lxvd2x vs9, o48, A3 addi A3, A3, 64 - lxvd2x vs60, o0, A3 - lxvd2x vs61, o16, A3 - lxvd2x vs62, o32, A3 - lxvd2x vs63, o48, A3 + lxvd2x vs10, o0, A3 + lxvd2x vs11, o16, A3 + lxvd2x vs12, o32, A3 + lxvd2x vs13, o48, A3 addi A3, A3, 64 @@ -126,23 +126,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvd2x vs51, o48, T1 addi T1, T1, 64 - stxvd2x vs52, o0, T1 - stxvd2x vs53, o16, T1 - stxvd2x vs54, o32, T1 - stxvd2x vs55, o48, T1 + stxvd2x vs2, o0, T1 + stxvd2x vs3, o16, T1 + stxvd2x vs4, o32, T1 + stxvd2x vs5, o48, T1 addi T1, T1, 64 - stxvd2x vs56, o0, T1 - stxvd2x vs57, o16, T1 - stxvd2x vs58, o32, T1 - stxvd2x vs59, o48, T1 + stxvd2x vs6, o0, T1 + stxvd2x vs7, o16, T1 + stxvd2x vs8, o32, T1 + stxvd2x vs9, o48, T1 addi T1, T1, 64 - stxvd2x vs60, o0, T1 - stxvd2x vs61, o16, T1 - stxvd2x vs62, o32, T1 - stxvd2x vs63, o48, T1 + stxvd2x vs10, o0, T1 + stxvd2x vs11, o16, T1 + stxvd2x vs12, o32, T1 + stxvd2x vs13, o48, T1 .endm diff --git a/kernel/zarch/dscal.c b/kernel/zarch/dscal.c index f57034aef..e29f51012 100644 --- a/kernel/zarch/dscal.c +++ b/kernel/zarch/dscal.c @@ -27,8 +27,54 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" +#ifdef Z13_A +static void dscal_kernel_32( BLASLONG n, FLOAT da , FLOAT *x ) +{ - + + __asm__ ("pfd 2, 0(%[x_ptr]) \n\t" + "lgdr %%r0,%[alpha] \n\t" + "vlvgp %%v0,%%r0,%%r0 \n\t" + "srlg %[n],%[n],4 \n\t" + "vlr %%v1,%%v0 \n\t" + "vlm %%v16,%%v23, 0(%[x_ptr]) \n\t" + "la %[x_ptr], 128(%[x_ptr]) \n\t" + "aghik %[n], %[n], -1 \n\t" + "jle 2f \n\t" + ".align 16 \n\t" + "1: \n\t" + "vfmdb %%v24, %%v16, %%v0 \n\t" + "vfmdb %%v25, %%v17, %%v0 \n\t" + "vfmdb %%v26, %%v18, %%v0 \n\t" + "vfmdb %%v27, %%v19, %%v1 \n\t" + "vlm %%v16,%%v19, 0(%[x_ptr]) \n\t" + "vfmdb %%v28, %%v20, %%v0 \n\t" + "vfmdb %%v29, %%v21, %%v1 \n\t" + "vfmdb %%v30, %%v22, %%v0 \n\t" + "vfmdb %%v31, %%v23, %%v1 \n\t" + "vlm %%v20,%%v23, 64(%[x_ptr]) \n\t" + "lay %[x_ptr], -128(%[x_ptr]) \n\t" + "vstm %%v24,%%v31, 0(%[x_ptr]) \n\t" + "la %[x_ptr],256(%[x_ptr]) \n\t" + "brctg %[n],1b \n\t" + "2: \n\t" + "vfmdb %%v24, %%v16, %%v0 \n\t" + "vfmdb %%v25, %%v17, %%v1 \n\t" + "vfmdb %%v26, %%v18, %%v0 \n\t" + "vfmdb %%v27, %%v19, %%v1 \n\t" + "lay %[x_ptr] , -128(%[x_ptr]) \n\t" + "vfmdb %%v28, %%v20, %%v0 \n\t" + "vfmdb %%v29, %%v21, %%v1 \n\t" + "vfmdb %%v30, %%v22, %%v0 \n\t" + "vfmdb %%v31, %%v23, %%v1 \n\t" + "vstm %%v24,%%v31, 0(%[x_ptr]) \n\t" + : [mem] "+m" (*(double (*)[n])x) ,[x_ptr] "+&a"(x),[n] "+&r"(n) + : [alpha] "f"(da) + :"cc" , "r0","v0","v1","v16","v17","v18","v19","v20","v21", + "v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); + } +#else static void dscal_kernel_32( BLASLONG n, FLOAT da , FLOAT *x ) { @@ -71,7 +117,7 @@ static void dscal_kernel_32( BLASLONG n, FLOAT da , FLOAT *x ) ); } - +#endif static void dscal_kernel_32_zero( BLASLONG n, FLOAT *x ) { @@ -214,6 +260,4 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS } return 0; -} - - +} \ No newline at end of file