power8 ?gemm_tcopy save/restore

This commit is contained in:
the mslm 2018-02-16 09:56:08 +04:00 committed by Ubuntu
parent 60596a1abc
commit c5425daa6b
8 changed files with 287 additions and 500 deletions

View File

@ -110,7 +110,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "cgemm_tcopy_macros_8_power8.S"
#define STACKSIZE 576
#define STACKSIZE 144
PROLOGUE
@ -119,49 +119,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi SP, SP, -STACKSIZE
li r0, 0
std r31, 144(SP)
std r30, 152(SP)
std r29, 160(SP)
std r28, 168(SP)
std r27, 176(SP)
std r26, 184(SP)
std r25, 192(SP)
std r24, 200(SP)
std r23, 208(SP)
std r22, 216(SP)
std r21, 224(SP)
std r20, 232(SP)
std r19, 240(SP)
std r18, 248(SP)
std r17, 256(SP)
std r16, 264(SP)
std r15, 272(SP)
std r14, 280(SP)
addi r11, SP, 288
stvx v20, r11, r0
addi r11, r11, 16
stvx v21, r11, r0
addi r11, r11, 16
stvx v22, r11, r0
addi r11, r11, 16
stvx v23, r11, r0
addi r11, r11, 16
stvx v24, r11, r0
addi r11, r11, 16
stvx v25, r11, r0
addi r11, r11, 16
stvx v26, r11, r0
addi r11, r11, 16
stvx v27, r11, r0
addi r11, r11, 16
stvx v28, r11, r0
addi r11, r11, 16
stvx v29, r11, r0
addi r11, r11, 16
stvx v30, r11, r0
addi r11, r11, 16
stvx v31, r11, r0
li r11, 0
std r14, 0(SP)
std r15, 8(SP)
std r16, 16(SP)
std r17, 24(SP)
std r18, 32(SP)
std r19, 40(SP)
std r20, 48(SP)
std r21, 56(SP)
std r22, 64(SP)
std r23, 72(SP)
std r24, 80(SP)
std r25, 88(SP)
std r26, 96(SP)
std r27, 104(SP)
std r28, 112(SP)
std r29, 120(SP)
std r30, 128(SP)
std r31, 136(SP)
cmpwi cr0, M, 0
ble- L999
@ -203,51 +178,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
L999:
li r3, 0
ld r31, 144(SP)
ld r30, 152(SP)
ld r29, 160(SP)
ld r28, 168(SP)
ld r27, 176(SP)
ld r26, 184(SP)
ld r25, 192(SP)
ld r24, 200(SP)
ld r23, 208(SP)
ld r22, 216(SP)
ld r21, 224(SP)
ld r20, 232(SP)
ld r19, 240(SP)
ld r18, 248(SP)
ld r17, 256(SP)
ld r16, 264(SP)
ld r15, 272(SP)
ld r14, 280(SP)
addi r11, SP, 288
lvx v20, r11, r3
addi r11, r11, 16
lvx v21, r11, r3
addi r11, r11, 16
lvx v22, r11, r3
addi r11, r11, 16
lvx v23, r11, r3
addi r11, r11, 16
lvx v24, r11, r3
addi r11, r11, 16
lvx v25, r11, r3
addi r11, r11, 16
lvx v26, r11, r3
addi r11, r11, 16
lvx v27, r11, r3
addi r11, r11, 16
lvx v28, r11, r3
addi r11, r11, 16
lvx v29, r11, r3
addi r11, r11, 16
lvx v30, r11, r3
addi r11, r11, 16
lvx v31, r11, r3
li r11, 0
ld r14, 0(SP)
ld r15, 8(SP)
ld r16, 16(SP)
ld r17, 24(SP)
ld r18, 32(SP)
ld r19, 40(SP)
ld r20, 48(SP)
ld r21, 56(SP)
ld r22, 64(SP)
ld r23, 72(SP)
ld r24, 80(SP)
ld r25, 88(SP)
ld r26, 96(SP)
ld r27, 104(SP)
ld r28, 112(SP)
ld r29, 120(SP)
ld r30, 128(SP)
ld r31, 136(SP)
addi SP, SP, STACKSIZE
blr

View File

@ -109,61 +109,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "dgemm_tcopy_macros_16_power8.S"
#define STACKSIZE 384
#define STACKSIZE 576
#define STACKSIZE 144
PROLOGUE
PROFCODE
addi SP, SP, -STACKSIZE
//addi SP, SP, -208
li r0, 0
std r31, 144(SP)
std r30, 152(SP)
std r29, 160(SP)
std r28, 168(SP)
std r27, 176(SP)
std r26, 184(SP)
std r25, 192(SP)
std r24, 200(SP)
std r23, 208(SP)
std r22, 216(SP)
std r21, 224(SP)
std r20, 232(SP)
std r19, 240(SP)
std r18, 248(SP)
std r17, 256(SP)
std r16, 264(SP)
std r15, 272(SP)
std r14, 280(SP)
addi r11,SP,288
stvx v20, r11,r0
addi r11,r11,16
stvx v21, r11,r0
addi r11,r11,16
stvx v22, r11,r0
addi r11,r11,16
stvx v23, r11,r0
addi r11,r11,16
stvx v24, r11,r0
addi r11,r11,16
stvx v25, r11,r0
addi r11,r11,16
stvx v26, r11,r0
addi r11,r11,16
stvx v27, r11,r0
addi r11,r11,16
stvx v28, r11,r0
addi r11,r11,16
stvx v29, r11,r0
addi r11,r11,16
stvx v30, r11,r0
addi r11,r11,16
stvx v31, r11,r0
li r11,0
std r14,0(SP)
std r15,8(SP)
std r16,16(SP)
std r17,24(SP)
std r18,32(SP)
std r19,40(SP)
std r20,48(SP)
std r21,56(SP)
std r22,64(SP)
std r23,72(SP)
std r24,80(SP)
std r25,88(SP)
std r26,96(SP)
std r27,104(SP)
std r28,112(SP)
std r29,120(SP)
std r30,128(SP)
std r31,136(SP)
cmpwi cr0, M, 0
ble- L999
@ -198,8 +172,7 @@ li r11,0
add B2, B2, B
add B1, B1, B
//li PREA, 384
li PREA, 576
li PREA, 384
addi PREB, M16, 128
li o8, 8
@ -213,52 +186,27 @@ L999:
li r3, 0
ld r31, 144(SP)
ld r30, 152(SP)
ld r29, 160(SP)
ld r28, 168(SP)
ld r27, 176(SP)
ld r26, 184(SP)
ld r25, 192(SP)
ld r24, 200(SP)
ld r23, 208(SP)
ld r22, 216(SP)
ld r21, 224(SP)
ld r20, 232(SP)
ld r19, 240(SP)
ld r18, 248(SP)
ld r17, 256(SP)
ld r16, 264(SP)
ld r15, 272(SP)
ld r14, 280(SP)
addi r11,SP,288
lvx v20, r11,r3
addi r11,r11,16
lvx v21, r11,r3
addi r11,r11,16
lvx v22, r11,r3
addi r11,r11,16
lvx v23, r11,r3
addi r11,r11,16
lvx v24, r11,r3
addi r11,r11,16
lvx v25, r11,r3
addi r11,r11,16
lvx v26, r11,r3
addi r11,r11,16
lvx v27, r11,r3
addi r11,r11,16
lvx v28, r11,r3
addi r11,r11,16
lvx v29, r11,r3
addi r11,r11,16
lvx v30, r11,r3
addi r11,r11,16
lvx v31, r11,r3
li r11,0
ld r14,0(SP)
ld r15,8(SP)
ld r16,16(SP)
ld r17,24(SP)
ld r18,32(SP)
ld r19,40(SP)
ld r20,48(SP)
ld r21,56(SP)
ld r22,64(SP)
ld r23,72(SP)
ld r24,80(SP)
ld r25,88(SP)
ld r26,96(SP)
ld r27,104(SP)
ld r28,112(SP)
ld r29,120(SP)
ld r30,128(SP)
ld r31,136(SP)
addi SP, SP, STACKSIZE
//addi SP, SP, 208
blr
EPILOGUE

View File

@ -58,10 +58,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
lxvd2x vs51, o48, A2
addi A2, A2, 64
lxvd2x vs56, o0, A3
lxvd2x vs57, o16, A3
lxvd2x vs58, o32, A3
lxvd2x vs59, o48, A3
lxvd2x vs4, o0, A3
lxvd2x vs5, o16, A3
lxvd2x vs6, o32, A3
lxvd2x vs7, o48, A3
addi A3, A3, 64
lxvd2x vs36, o0, A0
@ -76,16 +76,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
lxvd2x vs47, o48, A1
addi A1, A1, 64
lxvd2x vs52, o0, A2
lxvd2x vs53, o16, A2
lxvd2x vs54, o32, A2
lxvd2x vs55, o48, A2
lxvd2x vs12, o0, A2
lxvd2x vs13, o16, A2
lxvd2x vs2, o32, A2
lxvd2x vs3, o48, A2
addi A2, A2, 64
lxvd2x vs60, o0, A3
lxvd2x vs61, o16, A3
lxvd2x vs62, o32, A3
lxvd2x vs63, o48, A3
lxvd2x vs8, o0, A3
lxvd2x vs9, o16, A3
lxvd2x vs10, o32, A3
lxvd2x vs11, o48, A3
addi A3, A3, 64
mr T1, BO
@ -122,23 +122,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvd2x vs51, o48, T1
addi T1, T1, 64
stxvd2x vs52, o0, T1
stxvd2x vs53, o16, T1
stxvd2x vs54, o32, T1
stxvd2x vs55, o48, T1
stxvd2x vs12, o0, T1
stxvd2x vs13, o16, T1
stxvd2x vs2, o32, T1
stxvd2x vs3, o48, T1
addi T1, T1, 64
stxvd2x vs56, o0, T1
stxvd2x vs57, o16, T1
stxvd2x vs58, o32, T1
stxvd2x vs59, o48, T1
stxvd2x vs4, o0, T1
stxvd2x vs5, o16, T1
stxvd2x vs6, o32, T1
stxvd2x vs7, o48, T1
addi T1, T1, 64
stxvd2x vs60, o0, T1
stxvd2x vs61, o16, T1
stxvd2x vs62, o32, T1
stxvd2x vs63, o48, T1
stxvd2x vs8, o0, T1
stxvd2x vs9, o16, T1
stxvd2x vs10, o32, T1
stxvd2x vs11, o48, T1
.endm

View File

@ -110,7 +110,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "sgemm_tcopy_macros_16_power8.S"
#define STACKSIZE 576
#define STACKSIZE 144
PROLOGUE
PROFCODE
@ -118,49 +118,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi SP, SP, -STACKSIZE
li r0, 0
std r31, 144(SP)
std r30, 152(SP)
std r29, 160(SP)
std r28, 168(SP)
std r27, 176(SP)
std r26, 184(SP)
std r25, 192(SP)
std r24, 200(SP)
std r23, 208(SP)
std r22, 216(SP)
std r21, 224(SP)
std r20, 232(SP)
std r19, 240(SP)
std r18, 248(SP)
std r17, 256(SP)
std r16, 264(SP)
std r15, 272(SP)
std r14, 280(SP)
addi r11 ,SP, 288
stvx v20, r11, r0
addi r11, r11, 16
stvx v21, r11, r0
addi r11, r11, 16
stvx v22, r11, r0
addi r11, r11, 16
stvx v23, r11, r0
addi r11, r11, 16
stvx v24, r11, r0
addi r11, r11, 16
stvx v25, r11, r0
addi r11, r11, 16
stvx v26, r11, r0
addi r11, r11, 16
stvx v27, r11, r0
addi r11, r11, 16
stvx v28, r11, r0
addi r11, r11, 16
stvx v29, r11, r0
addi r11, r11, 16
stvx v30, r11, r0
addi r11, r11, 16
stvx v31, r11, r0
li r11, 0
std r14, 0(SP)
std r15, 8(SP)
std r16, 16(SP)
std r17, 24(SP)
std r18, 32(SP)
std r19, 40(SP)
std r20, 48(SP)
std r21, 56(SP)
std r22, 64(SP)
std r23, 72(SP)
std r24, 80(SP)
std r25, 88(SP)
std r26, 96(SP)
std r27, 104(SP)
std r28, 112(SP)
std r29, 120(SP)
std r30, 128(SP)
std r31, 136(SP)
cmpwi cr0, M, 0
ble- L999
@ -207,51 +182,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
L999:
li r3, 0
ld r31, 144(SP)
ld r30, 152(SP)
ld r29, 160(SP)
ld r28, 168(SP)
ld r27, 176(SP)
ld r26, 184(SP)
ld r25, 192(SP)
ld r24, 200(SP)
ld r23, 208(SP)
ld r22, 216(SP)
ld r21, 224(SP)
ld r20, 232(SP)
ld r19, 240(SP)
ld r18, 248(SP)
ld r17, 256(SP)
ld r16, 264(SP)
ld r15, 272(SP)
ld r14, 280(SP)
addi r11, SP, 288
lvx v20, r11, r3
addi r11, r11, 16
lvx v21, r11, r3
addi r11, r11, 16
lvx v22, r11, r3
addi r11, r11, 16
lvx v23, r11, r3
addi r11, r11, 16
lvx v24, r11, r3
addi r11, r11, 16
lvx v25, r11, r3
addi r11, r11, 16
lvx v26, r11, r3
addi r11, r11, 16
lvx v27, r11, r3
addi r11, r11, 16
lvx v28, r11, r3
addi r11, r11, 16
lvx v29, r11, r3
addi r11, r11, 16
lvx v30, r11, r3
addi r11, r11, 16
lvx v31, r11, r3
li r11, 0
ld r14, 0(SP)
ld r15, 8(SP)
ld r16, 16(SP)
ld r17, 24(SP)
ld r18, 32(SP)
ld r19, 40(SP)
ld r20, 48(SP)
ld r21, 56(SP)
ld r22, 64(SP)
ld r23, 72(SP)
ld r24, 80(SP)
ld r25, 88(SP)
ld r26, 96(SP)
ld r27, 104(SP)
ld r28, 112(SP)
ld r29, 120(SP)
ld r30, 128(SP)
ld r31, 136(SP)
addi SP, SP, STACKSIZE
blr

View File

@ -110,57 +110,32 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "sgemm_tcopy_macros_8_power8.S"
#define STACKSIZE 576
#define STACKSIZE 144
PROLOGUE
PROFCODE
addi SP, SP, -STACKSIZE
li r0, 0
std r14, 0(SP)
std r15, 8(SP)
std r16, 16(SP)
std r17, 24(SP)
std r18, 32(SP)
std r19, 40(SP)
std r20, 48(SP)
std r21, 56(SP)
std r22, 64(SP)
std r23, 72(SP)
std r24, 80(SP)
std r25, 88(SP)
std r26, 96(SP)
std r27, 104(SP)
std r28, 112(SP)
std r29, 120(SP)
std r30, 128(SP)
std r31, 136(SP)
std r31, 144(SP)
std r30, 152(SP)
std r29, 160(SP)
std r28, 168(SP)
std r27, 176(SP)
std r26, 184(SP)
std r25, 192(SP)
std r24, 200(SP)
std r23, 208(SP)
std r22, 216(SP)
std r21, 224(SP)
std r20, 232(SP)
std r19, 240(SP)
std r18, 248(SP)
std r17, 256(SP)
std r16, 264(SP)
std r15, 272(SP)
std r14, 280(SP)
addi r11, SP, 288
stvx v20, r11, r0
addi r11, r11, 16
stvx v21, r11, r0
addi r11, r11, 16
stvx v22, r11, r0
addi r11, r11, 16
stvx v23, r11, r0
addi r11, r11, 16
stvx v24, r11, r0
addi r11, r11, 16
stvx v25, r11, r0
addi r11, r11, 16
stvx v26, r11, r0
addi r11, r11, 16
stvx v27, r11, r0
addi r11, r11, 16
stvx v28, r11, r0
addi r11, r11, 16
stvx v29, r11, r0
addi r11, r11, 16
stvx v30, r11, r0
addi r11, r11, 16
stvx v31, r11, r0
li r11, 0
cmpwi cr0, M, 0
ble- L999
@ -202,51 +177,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
L999:
li r3, 0
ld r31, 144(SP)
ld r30, 152(SP)
ld r29, 160(SP)
ld r28, 168(SP)
ld r27, 176(SP)
ld r26, 184(SP)
ld r25, 192(SP)
ld r24, 200(SP)
ld r23, 208(SP)
ld r22, 216(SP)
ld r21, 224(SP)
ld r20, 232(SP)
ld r19, 240(SP)
ld r18, 248(SP)
ld r17, 256(SP)
ld r16, 264(SP)
ld r15, 272(SP)
ld r14, 280(SP)
addi r11,SP,288
lvx v20, r11, r3
addi r11, r11, 16
lvx v21, r11, r3
addi r11, r11, 16
lvx v22, r11, r3
addi r11, r11, 16
lvx v23, r11, r3
addi r11, r11, 16
lvx v24, r11, r3
addi r11, r11, 16
lvx v25, r11, r3
addi r11, r11, 16
lvx v26, r11, r3
addi r11, r11, 16
lvx v27, r11, r3
addi r11, r11, 16
lvx v28, r11, r3
addi r11, r11, 16
lvx v29, r11, r3
addi r11, r11, 16
lvx v30, r11, r3
addi r11, r11, 16
lvx v31, r11, r3
li r11, 0
ld r14, 0(SP)
ld r15, 8(SP)
ld r16, 16(SP)
ld r17, 24(SP)
ld r18, 32(SP)
ld r19, 40(SP)
ld r20, 48(SP)
ld r21, 56(SP)
ld r22, 64(SP)
ld r23, 72(SP)
ld r24, 80(SP)
ld r25, 88(SP)
ld r26, 96(SP)
ld r27, 104(SP)
ld r28, 112(SP)
ld r29, 120(SP)
ld r30, 128(SP)
ld r31, 136(SP)
addi SP, SP, STACKSIZE
blr

View File

@ -109,8 +109,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "zgemm_tcopy_macros_8_power8.S"
#define STACKSIZE 384
#define STACKSIZE 576
#define STACKSIZE 144
PROLOGUE
@ -119,49 +119,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi SP, SP, -STACKSIZE
li r0, 0
std r31, 144(SP)
std r30, 152(SP)
std r29, 160(SP)
std r28, 168(SP)
std r27, 176(SP)
std r26, 184(SP)
std r25, 192(SP)
std r24, 200(SP)
std r23, 208(SP)
std r22, 216(SP)
std r21, 224(SP)
std r20, 232(SP)
std r19, 240(SP)
std r18, 248(SP)
std r17, 256(SP)
std r16, 264(SP)
std r15, 272(SP)
std r14, 280(SP)
addi r11, SP ,288
stvx v20, r11, r0
addi r11, r11, 16
stvx v21, r11, r0
addi r11, r11, 16
stvx v22, r11, r0
addi r11, r11, 16
stvx v23, r11, r0
addi r11, r11, 16
stvx v24, r11, r0
addi r11, r11, 16
stvx v25, r11, r0
addi r11, r11, 16
stvx v26, r11, r0
addi r11, r11, 16
stvx v27, r11, r0
addi r11, r11, 16
stvx v28, r11, r0
addi r11, r11, 16
stvx v29, r11, r0
addi r11, r11, 16
stvx v30, r11, r0
addi r11, r11 ,16
stvx v31, r11, r0
li r11,0
std r14, 0(SP)
std r15, 8(SP)
std r16, 16(SP)
std r17, 24(SP)
std r18, 32(SP)
std r19, 40(SP)
std r20, 48(SP)
std r21, 56(SP)
std r22, 64(SP)
std r23, 72(SP)
std r24, 80(SP)
std r25, 88(SP)
std r26, 96(SP)
std r27, 104(SP)
std r28, 112(SP)
std r29, 120(SP)
std r30, 128(SP)
std r31, 136(SP)
cmpwi cr0, M, 0
ble- L999
@ -204,49 +180,24 @@ L999:
li r3, 0
ld r31, 144(SP)
ld r30, 152(SP)
ld r29, 160(SP)
ld r28, 168(SP)
ld r27, 176(SP)
ld r26, 184(SP)
ld r25, 192(SP)
ld r24, 200(SP)
ld r23, 208(SP)
ld r22, 216(SP)
ld r21, 224(SP)
ld r20, 232(SP)
ld r19, 240(SP)
ld r18, 248(SP)
ld r17, 256(SP)
ld r16, 264(SP)
ld r15, 272(SP)
ld r14, 280(SP)
addi r11, SP, 288
lvx v20, r11,r3
addi r11, r11, 16
lvx v21, r11, r3
addi r11, r11, 16
lvx v22, r11, r3
addi r11, r11, 16
lvx v23, r11, r3
addi r11, r11, 16
lvx v24, r11, r3
addi r11, r11, 16
lvx v25, r11, r3
addi r11, r11, 16
lvx v26, r11, r3
addi r11, r11, 16
lvx v27, r11, r3
addi r11, r11, 16
lvx v28, r11, r3
addi r11, r11, 16
lvx v29, r11, r3
addi r11, r11, 16
lvx v30, r11, r3
addi r11, r11, 16
lvx v31, r11, r3
li r11,0
ld r14, 0(SP)
ld r15, 8(SP)
ld r16, 16(SP)
ld r17, 24(SP)
ld r18, 32(SP)
ld r19, 40(SP)
ld r20, 48(SP)
ld r21, 56(SP)
ld r22, 64(SP)
ld r23, 72(SP)
ld r24, 80(SP)
ld r25, 88(SP)
ld r26, 96(SP)
ld r27, 104(SP)
ld r28, 112(SP)
ld r29, 120(SP)
ld r30, 128(SP)
ld r31, 136(SP)
addi SP, SP, STACKSIZE
blr

View File

@ -72,23 +72,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
lxvd2x vs51, o48, A2
addi A2, A2, 64
lxvd2x vs52, o0, A2
lxvd2x vs53, o16, A2
lxvd2x vs54, o32, A2
lxvd2x vs55, o48, A2
lxvd2x vs2, o0, A2
lxvd2x vs3, o16, A2
lxvd2x vs4, o32, A2
lxvd2x vs5, o48, A2
addi A2, A2, 64
lxvd2x vs56, o0, A3
lxvd2x vs57, o16, A3
lxvd2x vs58, o32, A3
lxvd2x vs59, o48, A3
lxvd2x vs6, o0, A3
lxvd2x vs7, o16, A3
lxvd2x vs8, o32, A3
lxvd2x vs9, o48, A3
addi A3, A3, 64
lxvd2x vs60, o0, A3
lxvd2x vs61, o16, A3
lxvd2x vs62, o32, A3
lxvd2x vs63, o48, A3
lxvd2x vs10, o0, A3
lxvd2x vs11, o16, A3
lxvd2x vs12, o32, A3
lxvd2x vs13, o48, A3
addi A3, A3, 64
@ -126,23 +126,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvd2x vs51, o48, T1
addi T1, T1, 64
stxvd2x vs52, o0, T1
stxvd2x vs53, o16, T1
stxvd2x vs54, o32, T1
stxvd2x vs55, o48, T1
stxvd2x vs2, o0, T1
stxvd2x vs3, o16, T1
stxvd2x vs4, o32, T1
stxvd2x vs5, o48, T1
addi T1, T1, 64
stxvd2x vs56, o0, T1
stxvd2x vs57, o16, T1
stxvd2x vs58, o32, T1
stxvd2x vs59, o48, T1
stxvd2x vs6, o0, T1
stxvd2x vs7, o16, T1
stxvd2x vs8, o32, T1
stxvd2x vs9, o48, T1
addi T1, T1, 64
stxvd2x vs60, o0, T1
stxvd2x vs61, o16, T1
stxvd2x vs62, o32, T1
stxvd2x vs63, o48, T1
stxvd2x vs10, o0, T1
stxvd2x vs11, o16, T1
stxvd2x vs12, o32, T1
stxvd2x vs13, o48, T1
.endm

View File

@ -27,8 +27,54 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h"
#ifdef Z13_A
/*
 * Scale a vector in place: every double loaded from x is multiplied by da
 * and stored back to the address it was loaded from.
 *
 * n   element count; the asm shifts it right by 4, so it is consumed as a
 *     count of 16-double (128-byte) blocks.
 * da  scale factor, passed in a floating-point register.
 * x   pointer to the data; read and written through the "+m" operand.
 *
 * The loop is software-pipelined: the first block is loaded before the loop,
 * each loop iteration multiplies the current block while loading the next,
 * and the code after label 2 multiplies and stores the final block.
 *
 * NOTE(review): the initial load and the final store are unconditional, so
 * one full 16-double block is always processed even when n >> 4 is 0;
 * presumably the caller only passes n >= 16 (likely a multiple of 32, going
 * by the name) -- confirm against the caller.
 */
static void dscal_kernel_32( BLASLONG n, FLOAT da , FLOAT *x )
{
/* Prefetch the start of x (code 2: presumably "prefetch for store" -- confirm). */
__asm__ ("pfd 2, 0(%[x_ptr]) \n\t"
/* Move the bits of da from its FP register into r0, then build v0 = {da, da}. */
"lgdr %%r0,%[alpha] \n\t"
"vlvgp %%v0,%%r0,%%r0 \n\t"
/* n = n / 16: number of 16-double blocks. */
"srlg %[n],%[n],4 \n\t"
/* v1 is a copy of v0; the multiplies below use v0 and v1 interchangeably. */
"vlr %%v1,%%v0 \n\t"
/* Preload the first block (v16..v23) and advance x_ptr past it. */
"vlm %%v16,%%v23, 0(%[x_ptr]) \n\t"
"la %[x_ptr], 128(%[x_ptr]) \n\t"
/* One block is handled by the tail, so loop (blocks - 1) times; skip the
   loop entirely when that count is <= 0. */
"aghik %[n], %[n], -1 \n\t"
"jle 2f \n\t"
".align 16 \n\t"
"1: \n\t"
/* Multiply the first half of the current block into v24..v27 ... */
"vfmdb %%v24, %%v16, %%v0 \n\t"
"vfmdb %%v25, %%v17, %%v0 \n\t"
"vfmdb %%v26, %%v18, %%v0 \n\t"
"vfmdb %%v27, %%v19, %%v1 \n\t"
/* ... then reload v16..v19 from the next block (x_ptr already points at it). */
"vlm %%v16,%%v19, 0(%[x_ptr]) \n\t"
/* Same for the second half: multiply into v28..v31, then reload v20..v23. */
"vfmdb %%v28, %%v20, %%v0 \n\t"
"vfmdb %%v29, %%v21, %%v1 \n\t"
"vfmdb %%v30, %%v22, %%v0 \n\t"
"vfmdb %%v31, %%v23, %%v1 \n\t"
"vlm %%v20,%%v23, 64(%[x_ptr]) \n\t"
/* Step back to the block just multiplied, store its results, then move
   x_ptr forward to the block after the one just loaded. */
"lay %[x_ptr], -128(%[x_ptr]) \n\t"
"vstm %%v24,%%v31, 0(%[x_ptr]) \n\t"
"la %[x_ptr],256(%[x_ptr]) \n\t"
/* Decrement the block counter and loop while it is non-zero. */
"brctg %[n],1b \n\t"
"2: \n\t"
/* Tail: multiply and store the last loaded block; nothing further is loaded. */
"vfmdb %%v24, %%v16, %%v0 \n\t"
"vfmdb %%v25, %%v17, %%v1 \n\t"
"vfmdb %%v26, %%v18, %%v0 \n\t"
"vfmdb %%v27, %%v19, %%v1 \n\t"
/* x_ptr is one block ahead of the data held in v16..v23; step back to it. */
"lay %[x_ptr] , -128(%[x_ptr]) \n\t"
"vfmdb %%v28, %%v20, %%v0 \n\t"
"vfmdb %%v29, %%v21, %%v1 \n\t"
"vfmdb %%v30, %%v22, %%v0 \n\t"
"vfmdb %%v31, %%v23, %%v1 \n\t"
"vstm %%v24,%%v31, 0(%[x_ptr]) \n\t"
/* Outputs: the n doubles at x are declared read-write memory; x and n are
   modified by the asm, hence the early-clobber read-write operands. */
: [mem] "+m" (*(double (*)[n])x) ,[x_ptr] "+&a"(x),[n] "+&r"(n)
/* Input: da in a floating-point register. */
: [alpha] "f"(da)
/* Clobbers: condition code, r0, and every vector register used above. */
:"cc" , "r0","v0","v1","v16","v17","v18","v19","v20","v21",
"v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
);
}
#else
static void dscal_kernel_32( BLASLONG n, FLOAT da , FLOAT *x )
{
@ -71,7 +117,7 @@ static void dscal_kernel_32( BLASLONG n, FLOAT da , FLOAT *x )
);
}
#endif
static void dscal_kernel_32_zero( BLASLONG n, FLOAT *x )
{
@ -214,6 +260,4 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
}
return 0;
}
}