power8 ?gemm_tcopy save/restore
This commit is contained in:
parent
60596a1abc
commit
c5425daa6b
|
@ -110,7 +110,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#include "cgemm_tcopy_macros_8_power8.S"
|
#include "cgemm_tcopy_macros_8_power8.S"
|
||||||
|
|
||||||
#define STACKSIZE 576
|
#define STACKSIZE 144
|
||||||
|
|
||||||
|
|
||||||
PROLOGUE
|
PROLOGUE
|
||||||
|
@ -119,49 +119,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
addi SP, SP, -STACKSIZE
|
addi SP, SP, -STACKSIZE
|
||||||
li r0, 0
|
li r0, 0
|
||||||
|
|
||||||
std r31, 144(SP)
|
std r14, 0(SP)
|
||||||
std r30, 152(SP)
|
std r15, 8(SP)
|
||||||
std r29, 160(SP)
|
std r16, 16(SP)
|
||||||
std r28, 168(SP)
|
std r17, 24(SP)
|
||||||
std r27, 176(SP)
|
std r18, 32(SP)
|
||||||
std r26, 184(SP)
|
std r19, 40(SP)
|
||||||
std r25, 192(SP)
|
std r20, 48(SP)
|
||||||
std r24, 200(SP)
|
std r21, 56(SP)
|
||||||
std r23, 208(SP)
|
std r22, 64(SP)
|
||||||
std r22, 216(SP)
|
std r23, 72(SP)
|
||||||
std r21, 224(SP)
|
std r24, 80(SP)
|
||||||
std r20, 232(SP)
|
std r25, 88(SP)
|
||||||
std r19, 240(SP)
|
std r26, 96(SP)
|
||||||
std r18, 248(SP)
|
std r27, 104(SP)
|
||||||
std r17, 256(SP)
|
std r28, 112(SP)
|
||||||
std r16, 264(SP)
|
std r29, 120(SP)
|
||||||
std r15, 272(SP)
|
std r30, 128(SP)
|
||||||
std r14, 280(SP)
|
std r31, 136(SP)
|
||||||
addi r11, SP, 288
|
|
||||||
stvx v20, r11, r0
|
|
||||||
addi r11, r11, 16
|
|
||||||
stvx v21, r11, r0
|
|
||||||
addi r11, r11, 16
|
|
||||||
stvx v22, r11, r0
|
|
||||||
addi r11, r11, 16
|
|
||||||
stvx v23, r11, r0
|
|
||||||
addi r11, r11, 16
|
|
||||||
stvx v24, r11, r0
|
|
||||||
addi r11, r11, 16
|
|
||||||
stvx v25, r11, r0
|
|
||||||
addi r11, r11, 16
|
|
||||||
stvx v26, r11, r0
|
|
||||||
addi r11, r11, 16
|
|
||||||
stvx v27, r11, r0
|
|
||||||
addi r11, r11, 16
|
|
||||||
stvx v28, r11, r0
|
|
||||||
addi r11, r11, 16
|
|
||||||
stvx v29, r11, r0
|
|
||||||
addi r11, r11, 16
|
|
||||||
stvx v30, r11, r0
|
|
||||||
addi r11, r11, 16
|
|
||||||
stvx v31, r11, r0
|
|
||||||
li r11, 0
|
|
||||||
|
|
||||||
cmpwi cr0, M, 0
|
cmpwi cr0, M, 0
|
||||||
ble- L999
|
ble- L999
|
||||||
|
@ -203,51 +178,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
L999:
|
L999:
|
||||||
|
|
||||||
li r3, 0
|
ld r14, 0(SP)
|
||||||
|
ld r15, 8(SP)
|
||||||
ld r31, 144(SP)
|
ld r16, 16(SP)
|
||||||
ld r30, 152(SP)
|
ld r17, 24(SP)
|
||||||
ld r29, 160(SP)
|
ld r18, 32(SP)
|
||||||
ld r28, 168(SP)
|
ld r19, 40(SP)
|
||||||
ld r27, 176(SP)
|
ld r20, 48(SP)
|
||||||
ld r26, 184(SP)
|
ld r21, 56(SP)
|
||||||
ld r25, 192(SP)
|
ld r22, 64(SP)
|
||||||
ld r24, 200(SP)
|
ld r23, 72(SP)
|
||||||
ld r23, 208(SP)
|
ld r24, 80(SP)
|
||||||
ld r22, 216(SP)
|
ld r25, 88(SP)
|
||||||
ld r21, 224(SP)
|
ld r26, 96(SP)
|
||||||
ld r20, 232(SP)
|
ld r27, 104(SP)
|
||||||
ld r19, 240(SP)
|
ld r28, 112(SP)
|
||||||
ld r18, 248(SP)
|
ld r29, 120(SP)
|
||||||
ld r17, 256(SP)
|
ld r30, 128(SP)
|
||||||
ld r16, 264(SP)
|
ld r31, 136(SP)
|
||||||
ld r15, 272(SP)
|
|
||||||
ld r14, 280(SP)
|
|
||||||
addi r11, SP, 288
|
|
||||||
lvx v20, r11, r3
|
|
||||||
addi r11, r11, 16
|
|
||||||
lvx v21, r11, r3
|
|
||||||
addi r11, r11, 16
|
|
||||||
lvx v22, r11, r3
|
|
||||||
addi r11, r11, 16
|
|
||||||
lvx v23, r11, r3
|
|
||||||
addi r11, r11, 16
|
|
||||||
lvx v24, r11, r3
|
|
||||||
addi r11, r11, 16
|
|
||||||
lvx v25, r11, r3
|
|
||||||
addi r11, r11, 16
|
|
||||||
lvx v26, r11, r3
|
|
||||||
addi r11, r11, 16
|
|
||||||
lvx v27, r11, r3
|
|
||||||
addi r11, r11, 16
|
|
||||||
lvx v28, r11, r3
|
|
||||||
addi r11, r11, 16
|
|
||||||
lvx v29, r11, r3
|
|
||||||
addi r11, r11, 16
|
|
||||||
lvx v30, r11, r3
|
|
||||||
addi r11, r11, 16
|
|
||||||
lvx v31, r11, r3
|
|
||||||
li r11, 0
|
|
||||||
|
|
||||||
addi SP, SP, STACKSIZE
|
addi SP, SP, STACKSIZE
|
||||||
blr
|
blr
|
||||||
|
|
|
@ -109,61 +109,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#include "dgemm_tcopy_macros_16_power8.S"
|
#include "dgemm_tcopy_macros_16_power8.S"
|
||||||
|
|
||||||
#define STACKSIZE 384
|
#define STACKSIZE 144
|
||||||
#define STACKSIZE 576
|
|
||||||
|
|
||||||
|
|
||||||
PROLOGUE
|
PROLOGUE
|
||||||
PROFCODE
|
PROFCODE
|
||||||
|
|
||||||
addi SP, SP, -STACKSIZE
|
addi SP, SP, -STACKSIZE
|
||||||
//addi SP, SP, -208
|
|
||||||
|
|
||||||
li r0, 0
|
li r0, 0
|
||||||
|
|
||||||
std r31, 144(SP)
|
std r14,0(SP)
|
||||||
std r30, 152(SP)
|
std r15,8(SP)
|
||||||
std r29, 160(SP)
|
std r16,16(SP)
|
||||||
std r28, 168(SP)
|
std r17,24(SP)
|
||||||
std r27, 176(SP)
|
std r18,32(SP)
|
||||||
std r26, 184(SP)
|
std r19,40(SP)
|
||||||
std r25, 192(SP)
|
std r20,48(SP)
|
||||||
std r24, 200(SP)
|
std r21,56(SP)
|
||||||
std r23, 208(SP)
|
std r22,64(SP)
|
||||||
std r22, 216(SP)
|
std r23,72(SP)
|
||||||
std r21, 224(SP)
|
std r24,80(SP)
|
||||||
std r20, 232(SP)
|
std r25,88(SP)
|
||||||
std r19, 240(SP)
|
std r26,96(SP)
|
||||||
std r18, 248(SP)
|
std r27,104(SP)
|
||||||
std r17, 256(SP)
|
std r28,112(SP)
|
||||||
std r16, 264(SP)
|
std r29,120(SP)
|
||||||
std r15, 272(SP)
|
std r30,128(SP)
|
||||||
std r14, 280(SP)
|
std r31,136(SP)
|
||||||
addi r11,SP,288
|
|
||||||
stvx v20, r11,r0
|
|
||||||
addi r11,r11,16
|
|
||||||
stvx v21, r11,r0
|
|
||||||
addi r11,r11,16
|
|
||||||
stvx v22, r11,r0
|
|
||||||
addi r11,r11,16
|
|
||||||
stvx v23, r11,r0
|
|
||||||
addi r11,r11,16
|
|
||||||
stvx v24, r11,r0
|
|
||||||
addi r11,r11,16
|
|
||||||
stvx v25, r11,r0
|
|
||||||
addi r11,r11,16
|
|
||||||
stvx v26, r11,r0
|
|
||||||
addi r11,r11,16
|
|
||||||
stvx v27, r11,r0
|
|
||||||
addi r11,r11,16
|
|
||||||
stvx v28, r11,r0
|
|
||||||
addi r11,r11,16
|
|
||||||
stvx v29, r11,r0
|
|
||||||
addi r11,r11,16
|
|
||||||
stvx v30, r11,r0
|
|
||||||
addi r11,r11,16
|
|
||||||
stvx v31, r11,r0
|
|
||||||
li r11,0
|
|
||||||
|
|
||||||
cmpwi cr0, M, 0
|
cmpwi cr0, M, 0
|
||||||
ble- L999
|
ble- L999
|
||||||
|
@ -198,8 +172,7 @@ li r11,0
|
||||||
add B2, B2, B
|
add B2, B2, B
|
||||||
add B1, B1, B
|
add B1, B1, B
|
||||||
|
|
||||||
//li PREA, 384
|
li PREA, 384
|
||||||
li PREA, 576
|
|
||||||
addi PREB, M16, 128
|
addi PREB, M16, 128
|
||||||
|
|
||||||
li o8, 8
|
li o8, 8
|
||||||
|
@ -213,52 +186,27 @@ L999:
|
||||||
|
|
||||||
li r3, 0
|
li r3, 0
|
||||||
|
|
||||||
ld r31, 144(SP)
|
ld r14,0(SP)
|
||||||
ld r30, 152(SP)
|
ld r15,8(SP)
|
||||||
ld r29, 160(SP)
|
ld r16,16(SP)
|
||||||
ld r28, 168(SP)
|
ld r17,24(SP)
|
||||||
ld r27, 176(SP)
|
ld r18,32(SP)
|
||||||
ld r26, 184(SP)
|
ld r19,40(SP)
|
||||||
ld r25, 192(SP)
|
ld r20,48(SP)
|
||||||
ld r24, 200(SP)
|
ld r21,56(SP)
|
||||||
ld r23, 208(SP)
|
ld r22,64(SP)
|
||||||
ld r22, 216(SP)
|
ld r23,72(SP)
|
||||||
ld r21, 224(SP)
|
ld r24,80(SP)
|
||||||
ld r20, 232(SP)
|
ld r25,88(SP)
|
||||||
ld r19, 240(SP)
|
ld r26,96(SP)
|
||||||
ld r18, 248(SP)
|
ld r27,104(SP)
|
||||||
ld r17, 256(SP)
|
ld r28,112(SP)
|
||||||
ld r16, 264(SP)
|
ld r29,120(SP)
|
||||||
ld r15, 272(SP)
|
ld r30,128(SP)
|
||||||
ld r14, 280(SP)
|
ld r31,136(SP)
|
||||||
addi r11,SP,288
|
|
||||||
lvx v20, r11,r3
|
|
||||||
addi r11,r11,16
|
|
||||||
lvx v21, r11,r3
|
|
||||||
addi r11,r11,16
|
|
||||||
lvx v22, r11,r3
|
|
||||||
addi r11,r11,16
|
|
||||||
lvx v23, r11,r3
|
|
||||||
addi r11,r11,16
|
|
||||||
lvx v24, r11,r3
|
|
||||||
addi r11,r11,16
|
|
||||||
lvx v25, r11,r3
|
|
||||||
addi r11,r11,16
|
|
||||||
lvx v26, r11,r3
|
|
||||||
addi r11,r11,16
|
|
||||||
lvx v27, r11,r3
|
|
||||||
addi r11,r11,16
|
|
||||||
lvx v28, r11,r3
|
|
||||||
addi r11,r11,16
|
|
||||||
lvx v29, r11,r3
|
|
||||||
addi r11,r11,16
|
|
||||||
lvx v30, r11,r3
|
|
||||||
addi r11,r11,16
|
|
||||||
lvx v31, r11,r3
|
|
||||||
li r11,0
|
|
||||||
|
|
||||||
addi SP, SP, STACKSIZE
|
addi SP, SP, STACKSIZE
|
||||||
//addi SP, SP, 208
|
|
||||||
blr
|
blr
|
||||||
EPILOGUE
|
EPILOGUE
|
||||||
|
|
||||||
|
|
|
@ -58,10 +58,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
lxvd2x vs51, o48, A2
|
lxvd2x vs51, o48, A2
|
||||||
addi A2, A2, 64
|
addi A2, A2, 64
|
||||||
|
|
||||||
lxvd2x vs56, o0, A3
|
lxvd2x vs4, o0, A3
|
||||||
lxvd2x vs57, o16, A3
|
lxvd2x vs5, o16, A3
|
||||||
lxvd2x vs58, o32, A3
|
lxvd2x vs6, o32, A3
|
||||||
lxvd2x vs59, o48, A3
|
lxvd2x vs7, o48, A3
|
||||||
addi A3, A3, 64
|
addi A3, A3, 64
|
||||||
|
|
||||||
lxvd2x vs36, o0, A0
|
lxvd2x vs36, o0, A0
|
||||||
|
@ -76,16 +76,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
lxvd2x vs47, o48, A1
|
lxvd2x vs47, o48, A1
|
||||||
addi A1, A1, 64
|
addi A1, A1, 64
|
||||||
|
|
||||||
lxvd2x vs52, o0, A2
|
lxvd2x vs12, o0, A2
|
||||||
lxvd2x vs53, o16, A2
|
lxvd2x vs13, o16, A2
|
||||||
lxvd2x vs54, o32, A2
|
lxvd2x vs2, o32, A2
|
||||||
lxvd2x vs55, o48, A2
|
lxvd2x vs3, o48, A2
|
||||||
addi A2, A2, 64
|
addi A2, A2, 64
|
||||||
|
|
||||||
lxvd2x vs60, o0, A3
|
lxvd2x vs8, o0, A3
|
||||||
lxvd2x vs61, o16, A3
|
lxvd2x vs9, o16, A3
|
||||||
lxvd2x vs62, o32, A3
|
lxvd2x vs10, o32, A3
|
||||||
lxvd2x vs63, o48, A3
|
lxvd2x vs11, o48, A3
|
||||||
addi A3, A3, 64
|
addi A3, A3, 64
|
||||||
|
|
||||||
mr T1, BO
|
mr T1, BO
|
||||||
|
@ -122,23 +122,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
stxvd2x vs51, o48, T1
|
stxvd2x vs51, o48, T1
|
||||||
addi T1, T1, 64
|
addi T1, T1, 64
|
||||||
|
|
||||||
stxvd2x vs52, o0, T1
|
stxvd2x vs12, o0, T1
|
||||||
stxvd2x vs53, o16, T1
|
stxvd2x vs13, o16, T1
|
||||||
stxvd2x vs54, o32, T1
|
stxvd2x vs2, o32, T1
|
||||||
stxvd2x vs55, o48, T1
|
stxvd2x vs3, o48, T1
|
||||||
|
|
||||||
addi T1, T1, 64
|
addi T1, T1, 64
|
||||||
|
|
||||||
stxvd2x vs56, o0, T1
|
stxvd2x vs4, o0, T1
|
||||||
stxvd2x vs57, o16, T1
|
stxvd2x vs5, o16, T1
|
||||||
stxvd2x vs58, o32, T1
|
stxvd2x vs6, o32, T1
|
||||||
stxvd2x vs59, o48, T1
|
stxvd2x vs7, o48, T1
|
||||||
addi T1, T1, 64
|
addi T1, T1, 64
|
||||||
|
|
||||||
stxvd2x vs60, o0, T1
|
stxvd2x vs8, o0, T1
|
||||||
stxvd2x vs61, o16, T1
|
stxvd2x vs9, o16, T1
|
||||||
stxvd2x vs62, o32, T1
|
stxvd2x vs10, o32, T1
|
||||||
stxvd2x vs63, o48, T1
|
stxvd2x vs11, o48, T1
|
||||||
|
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
|
|
|
@ -110,7 +110,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#include "sgemm_tcopy_macros_16_power8.S"
|
#include "sgemm_tcopy_macros_16_power8.S"
|
||||||
|
|
||||||
#define STACKSIZE 576
|
#define STACKSIZE 144
|
||||||
|
|
||||||
PROLOGUE
|
PROLOGUE
|
||||||
PROFCODE
|
PROFCODE
|
||||||
|
@ -118,49 +118,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
addi SP, SP, -STACKSIZE
|
addi SP, SP, -STACKSIZE
|
||||||
li r0, 0
|
li r0, 0
|
||||||
|
|
||||||
std r31, 144(SP)
|
std r14, 0(SP)
|
||||||
std r30, 152(SP)
|
std r15, 8(SP)
|
||||||
std r29, 160(SP)
|
std r16, 16(SP)
|
||||||
std r28, 168(SP)
|
std r17, 24(SP)
|
||||||
std r27, 176(SP)
|
std r18, 32(SP)
|
||||||
std r26, 184(SP)
|
std r19, 40(SP)
|
||||||
std r25, 192(SP)
|
std r20, 48(SP)
|
||||||
std r24, 200(SP)
|
std r21, 56(SP)
|
||||||
std r23, 208(SP)
|
std r22, 64(SP)
|
||||||
std r22, 216(SP)
|
std r23, 72(SP)
|
||||||
std r21, 224(SP)
|
std r24, 80(SP)
|
||||||
std r20, 232(SP)
|
std r25, 88(SP)
|
||||||
std r19, 240(SP)
|
std r26, 96(SP)
|
||||||
std r18, 248(SP)
|
std r27, 104(SP)
|
||||||
std r17, 256(SP)
|
std r28, 112(SP)
|
||||||
std r16, 264(SP)
|
std r29, 120(SP)
|
||||||
std r15, 272(SP)
|
std r30, 128(SP)
|
||||||
std r14, 280(SP)
|
std r31, 136(SP)
|
||||||
addi r11 ,SP, 288
|
|
||||||
stvx v20, r11, r0
|
|
||||||
addi r11, r11, 16
|
|
||||||
stvx v21, r11, r0
|
|
||||||
addi r11, r11, 16
|
|
||||||
stvx v22, r11, r0
|
|
||||||
addi r11, r11, 16
|
|
||||||
stvx v23, r11, r0
|
|
||||||
addi r11, r11, 16
|
|
||||||
stvx v24, r11, r0
|
|
||||||
addi r11, r11, 16
|
|
||||||
stvx v25, r11, r0
|
|
||||||
addi r11, r11, 16
|
|
||||||
stvx v26, r11, r0
|
|
||||||
addi r11, r11, 16
|
|
||||||
stvx v27, r11, r0
|
|
||||||
addi r11, r11, 16
|
|
||||||
stvx v28, r11, r0
|
|
||||||
addi r11, r11, 16
|
|
||||||
stvx v29, r11, r0
|
|
||||||
addi r11, r11, 16
|
|
||||||
stvx v30, r11, r0
|
|
||||||
addi r11, r11, 16
|
|
||||||
stvx v31, r11, r0
|
|
||||||
li r11, 0
|
|
||||||
|
|
||||||
cmpwi cr0, M, 0
|
cmpwi cr0, M, 0
|
||||||
ble- L999
|
ble- L999
|
||||||
|
@ -207,51 +182,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
L999:
|
L999:
|
||||||
|
|
||||||
li r3, 0
|
ld r14, 0(SP)
|
||||||
|
ld r15, 8(SP)
|
||||||
ld r31, 144(SP)
|
ld r16, 16(SP)
|
||||||
ld r30, 152(SP)
|
ld r17, 24(SP)
|
||||||
ld r29, 160(SP)
|
ld r18, 32(SP)
|
||||||
ld r28, 168(SP)
|
ld r19, 40(SP)
|
||||||
ld r27, 176(SP)
|
ld r20, 48(SP)
|
||||||
ld r26, 184(SP)
|
ld r21, 56(SP)
|
||||||
ld r25, 192(SP)
|
ld r22, 64(SP)
|
||||||
ld r24, 200(SP)
|
ld r23, 72(SP)
|
||||||
ld r23, 208(SP)
|
ld r24, 80(SP)
|
||||||
ld r22, 216(SP)
|
ld r25, 88(SP)
|
||||||
ld r21, 224(SP)
|
ld r26, 96(SP)
|
||||||
ld r20, 232(SP)
|
ld r27, 104(SP)
|
||||||
ld r19, 240(SP)
|
ld r28, 112(SP)
|
||||||
ld r18, 248(SP)
|
ld r29, 120(SP)
|
||||||
ld r17, 256(SP)
|
ld r30, 128(SP)
|
||||||
ld r16, 264(SP)
|
ld r31, 136(SP)
|
||||||
ld r15, 272(SP)
|
|
||||||
ld r14, 280(SP)
|
|
||||||
addi r11, SP, 288
|
|
||||||
lvx v20, r11, r3
|
|
||||||
addi r11, r11, 16
|
|
||||||
lvx v21, r11, r3
|
|
||||||
addi r11, r11, 16
|
|
||||||
lvx v22, r11, r3
|
|
||||||
addi r11, r11, 16
|
|
||||||
lvx v23, r11, r3
|
|
||||||
addi r11, r11, 16
|
|
||||||
lvx v24, r11, r3
|
|
||||||
addi r11, r11, 16
|
|
||||||
lvx v25, r11, r3
|
|
||||||
addi r11, r11, 16
|
|
||||||
lvx v26, r11, r3
|
|
||||||
addi r11, r11, 16
|
|
||||||
lvx v27, r11, r3
|
|
||||||
addi r11, r11, 16
|
|
||||||
lvx v28, r11, r3
|
|
||||||
addi r11, r11, 16
|
|
||||||
lvx v29, r11, r3
|
|
||||||
addi r11, r11, 16
|
|
||||||
lvx v30, r11, r3
|
|
||||||
addi r11, r11, 16
|
|
||||||
lvx v31, r11, r3
|
|
||||||
li r11, 0
|
|
||||||
|
|
||||||
addi SP, SP, STACKSIZE
|
addi SP, SP, STACKSIZE
|
||||||
blr
|
blr
|
||||||
|
|
|
@ -110,57 +110,32 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#include "sgemm_tcopy_macros_8_power8.S"
|
#include "sgemm_tcopy_macros_8_power8.S"
|
||||||
|
|
||||||
#define STACKSIZE 576
|
#define STACKSIZE 144
|
||||||
|
|
||||||
PROLOGUE
|
PROLOGUE
|
||||||
PROFCODE
|
PROFCODE
|
||||||
|
|
||||||
addi SP, SP, -STACKSIZE
|
addi SP, SP, -STACKSIZE
|
||||||
li r0, 0
|
li r0, 0
|
||||||
|
std r14, 0(SP)
|
||||||
|
std r15, 8(SP)
|
||||||
|
std r16, 16(SP)
|
||||||
|
std r17, 24(SP)
|
||||||
|
std r18, 32(SP)
|
||||||
|
std r19, 40(SP)
|
||||||
|
std r20, 48(SP)
|
||||||
|
std r21, 56(SP)
|
||||||
|
std r22, 64(SP)
|
||||||
|
std r23, 72(SP)
|
||||||
|
std r24, 80(SP)
|
||||||
|
std r25, 88(SP)
|
||||||
|
std r26, 96(SP)
|
||||||
|
std r27, 104(SP)
|
||||||
|
std r28, 112(SP)
|
||||||
|
std r29, 120(SP)
|
||||||
|
std r30, 128(SP)
|
||||||
|
std r31, 136(SP)
|
||||||
|
|
||||||
std r31, 144(SP)
|
|
||||||
std r30, 152(SP)
|
|
||||||
std r29, 160(SP)
|
|
||||||
std r28, 168(SP)
|
|
||||||
std r27, 176(SP)
|
|
||||||
std r26, 184(SP)
|
|
||||||
std r25, 192(SP)
|
|
||||||
std r24, 200(SP)
|
|
||||||
std r23, 208(SP)
|
|
||||||
std r22, 216(SP)
|
|
||||||
std r21, 224(SP)
|
|
||||||
std r20, 232(SP)
|
|
||||||
std r19, 240(SP)
|
|
||||||
std r18, 248(SP)
|
|
||||||
std r17, 256(SP)
|
|
||||||
std r16, 264(SP)
|
|
||||||
std r15, 272(SP)
|
|
||||||
std r14, 280(SP)
|
|
||||||
addi r11, SP, 288
|
|
||||||
stvx v20, r11, r0
|
|
||||||
addi r11, r11, 16
|
|
||||||
stvx v21, r11, r0
|
|
||||||
addi r11, r11, 16
|
|
||||||
stvx v22, r11, r0
|
|
||||||
addi r11, r11, 16
|
|
||||||
stvx v23, r11, r0
|
|
||||||
addi r11, r11, 16
|
|
||||||
stvx v24, r11, r0
|
|
||||||
addi r11, r11, 16
|
|
||||||
stvx v25, r11, r0
|
|
||||||
addi r11, r11, 16
|
|
||||||
stvx v26, r11, r0
|
|
||||||
addi r11, r11, 16
|
|
||||||
stvx v27, r11, r0
|
|
||||||
addi r11, r11, 16
|
|
||||||
stvx v28, r11, r0
|
|
||||||
addi r11, r11, 16
|
|
||||||
stvx v29, r11, r0
|
|
||||||
addi r11, r11, 16
|
|
||||||
stvx v30, r11, r0
|
|
||||||
addi r11, r11, 16
|
|
||||||
stvx v31, r11, r0
|
|
||||||
li r11, 0
|
|
||||||
|
|
||||||
cmpwi cr0, M, 0
|
cmpwi cr0, M, 0
|
||||||
ble- L999
|
ble- L999
|
||||||
|
@ -202,51 +177,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
L999:
|
L999:
|
||||||
|
|
||||||
li r3, 0
|
ld r14, 0(SP)
|
||||||
|
ld r15, 8(SP)
|
||||||
ld r31, 144(SP)
|
ld r16, 16(SP)
|
||||||
ld r30, 152(SP)
|
ld r17, 24(SP)
|
||||||
ld r29, 160(SP)
|
ld r18, 32(SP)
|
||||||
ld r28, 168(SP)
|
ld r19, 40(SP)
|
||||||
ld r27, 176(SP)
|
ld r20, 48(SP)
|
||||||
ld r26, 184(SP)
|
ld r21, 56(SP)
|
||||||
ld r25, 192(SP)
|
ld r22, 64(SP)
|
||||||
ld r24, 200(SP)
|
ld r23, 72(SP)
|
||||||
ld r23, 208(SP)
|
ld r24, 80(SP)
|
||||||
ld r22, 216(SP)
|
ld r25, 88(SP)
|
||||||
ld r21, 224(SP)
|
ld r26, 96(SP)
|
||||||
ld r20, 232(SP)
|
ld r27, 104(SP)
|
||||||
ld r19, 240(SP)
|
ld r28, 112(SP)
|
||||||
ld r18, 248(SP)
|
ld r29, 120(SP)
|
||||||
ld r17, 256(SP)
|
ld r30, 128(SP)
|
||||||
ld r16, 264(SP)
|
ld r31, 136(SP)
|
||||||
ld r15, 272(SP)
|
|
||||||
ld r14, 280(SP)
|
|
||||||
addi r11,SP,288
|
|
||||||
lvx v20, r11, r3
|
|
||||||
addi r11, r11, 16
|
|
||||||
lvx v21, r11, r3
|
|
||||||
addi r11, r11, 16
|
|
||||||
lvx v22, r11, r3
|
|
||||||
addi r11, r11, 16
|
|
||||||
lvx v23, r11, r3
|
|
||||||
addi r11, r11, 16
|
|
||||||
lvx v24, r11, r3
|
|
||||||
addi r11, r11, 16
|
|
||||||
lvx v25, r11, r3
|
|
||||||
addi r11, r11, 16
|
|
||||||
lvx v26, r11, r3
|
|
||||||
addi r11, r11, 16
|
|
||||||
lvx v27, r11, r3
|
|
||||||
addi r11, r11, 16
|
|
||||||
lvx v28, r11, r3
|
|
||||||
addi r11, r11, 16
|
|
||||||
lvx v29, r11, r3
|
|
||||||
addi r11, r11, 16
|
|
||||||
lvx v30, r11, r3
|
|
||||||
addi r11, r11, 16
|
|
||||||
lvx v31, r11, r3
|
|
||||||
li r11, 0
|
|
||||||
|
|
||||||
addi SP, SP, STACKSIZE
|
addi SP, SP, STACKSIZE
|
||||||
blr
|
blr
|
||||||
|
|
|
@ -109,8 +109,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#include "zgemm_tcopy_macros_8_power8.S"
|
#include "zgemm_tcopy_macros_8_power8.S"
|
||||||
|
|
||||||
#define STACKSIZE 384
|
#define STACKSIZE 144
|
||||||
#define STACKSIZE 576
|
|
||||||
|
|
||||||
|
|
||||||
PROLOGUE
|
PROLOGUE
|
||||||
|
@ -119,49 +119,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
addi SP, SP, -STACKSIZE
|
addi SP, SP, -STACKSIZE
|
||||||
li r0, 0
|
li r0, 0
|
||||||
|
|
||||||
std r31, 144(SP)
|
std r14, 0(SP)
|
||||||
std r30, 152(SP)
|
std r15, 8(SP)
|
||||||
std r29, 160(SP)
|
std r16, 16(SP)
|
||||||
std r28, 168(SP)
|
std r17, 24(SP)
|
||||||
std r27, 176(SP)
|
std r18, 32(SP)
|
||||||
std r26, 184(SP)
|
std r19, 40(SP)
|
||||||
std r25, 192(SP)
|
std r20, 48(SP)
|
||||||
std r24, 200(SP)
|
std r21, 56(SP)
|
||||||
std r23, 208(SP)
|
std r22, 64(SP)
|
||||||
std r22, 216(SP)
|
std r23, 72(SP)
|
||||||
std r21, 224(SP)
|
std r24, 80(SP)
|
||||||
std r20, 232(SP)
|
std r25, 88(SP)
|
||||||
std r19, 240(SP)
|
std r26, 96(SP)
|
||||||
std r18, 248(SP)
|
std r27, 104(SP)
|
||||||
std r17, 256(SP)
|
std r28, 112(SP)
|
||||||
std r16, 264(SP)
|
std r29, 120(SP)
|
||||||
std r15, 272(SP)
|
std r30, 128(SP)
|
||||||
std r14, 280(SP)
|
std r31, 136(SP)
|
||||||
addi r11, SP ,288
|
|
||||||
stvx v20, r11, r0
|
|
||||||
addi r11, r11, 16
|
|
||||||
stvx v21, r11, r0
|
|
||||||
addi r11, r11, 16
|
|
||||||
stvx v22, r11, r0
|
|
||||||
addi r11, r11, 16
|
|
||||||
stvx v23, r11, r0
|
|
||||||
addi r11, r11, 16
|
|
||||||
stvx v24, r11, r0
|
|
||||||
addi r11, r11, 16
|
|
||||||
stvx v25, r11, r0
|
|
||||||
addi r11, r11, 16
|
|
||||||
stvx v26, r11, r0
|
|
||||||
addi r11, r11, 16
|
|
||||||
stvx v27, r11, r0
|
|
||||||
addi r11, r11, 16
|
|
||||||
stvx v28, r11, r0
|
|
||||||
addi r11, r11, 16
|
|
||||||
stvx v29, r11, r0
|
|
||||||
addi r11, r11, 16
|
|
||||||
stvx v30, r11, r0
|
|
||||||
addi r11, r11 ,16
|
|
||||||
stvx v31, r11, r0
|
|
||||||
li r11,0
|
|
||||||
|
|
||||||
cmpwi cr0, M, 0
|
cmpwi cr0, M, 0
|
||||||
ble- L999
|
ble- L999
|
||||||
|
@ -204,49 +180,24 @@ L999:
|
||||||
|
|
||||||
li r3, 0
|
li r3, 0
|
||||||
|
|
||||||
ld r31, 144(SP)
|
ld r14, 0(SP)
|
||||||
ld r30, 152(SP)
|
ld r15, 8(SP)
|
||||||
ld r29, 160(SP)
|
ld r16, 16(SP)
|
||||||
ld r28, 168(SP)
|
ld r17, 24(SP)
|
||||||
ld r27, 176(SP)
|
ld r18, 32(SP)
|
||||||
ld r26, 184(SP)
|
ld r19, 40(SP)
|
||||||
ld r25, 192(SP)
|
ld r20, 48(SP)
|
||||||
ld r24, 200(SP)
|
ld r21, 56(SP)
|
||||||
ld r23, 208(SP)
|
ld r22, 64(SP)
|
||||||
ld r22, 216(SP)
|
ld r23, 72(SP)
|
||||||
ld r21, 224(SP)
|
ld r24, 80(SP)
|
||||||
ld r20, 232(SP)
|
ld r25, 88(SP)
|
||||||
ld r19, 240(SP)
|
ld r26, 96(SP)
|
||||||
ld r18, 248(SP)
|
ld r27, 104(SP)
|
||||||
ld r17, 256(SP)
|
ld r28, 112(SP)
|
||||||
ld r16, 264(SP)
|
ld r29, 120(SP)
|
||||||
ld r15, 272(SP)
|
ld r30, 128(SP)
|
||||||
ld r14, 280(SP)
|
ld r31, 136(SP)
|
||||||
addi r11, SP, 288
|
|
||||||
lvx v20, r11,r3
|
|
||||||
addi r11, r11, 16
|
|
||||||
lvx v21, r11, r3
|
|
||||||
addi r11, r11, 16
|
|
||||||
lvx v22, r11, r3
|
|
||||||
addi r11, r11, 16
|
|
||||||
lvx v23, r11, r3
|
|
||||||
addi r11, r11, 16
|
|
||||||
lvx v24, r11, r3
|
|
||||||
addi r11, r11, 16
|
|
||||||
lvx v25, r11, r3
|
|
||||||
addi r11, r11, 16
|
|
||||||
lvx v26, r11, r3
|
|
||||||
addi r11, r11, 16
|
|
||||||
lvx v27, r11, r3
|
|
||||||
addi r11, r11, 16
|
|
||||||
lvx v28, r11, r3
|
|
||||||
addi r11, r11, 16
|
|
||||||
lvx v29, r11, r3
|
|
||||||
addi r11, r11, 16
|
|
||||||
lvx v30, r11, r3
|
|
||||||
addi r11, r11, 16
|
|
||||||
lvx v31, r11, r3
|
|
||||||
li r11,0
|
|
||||||
|
|
||||||
addi SP, SP, STACKSIZE
|
addi SP, SP, STACKSIZE
|
||||||
blr
|
blr
|
||||||
|
|
|
@ -72,23 +72,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
lxvd2x vs51, o48, A2
|
lxvd2x vs51, o48, A2
|
||||||
addi A2, A2, 64
|
addi A2, A2, 64
|
||||||
|
|
||||||
lxvd2x vs52, o0, A2
|
lxvd2x vs2, o0, A2
|
||||||
lxvd2x vs53, o16, A2
|
lxvd2x vs3, o16, A2
|
||||||
lxvd2x vs54, o32, A2
|
lxvd2x vs4, o32, A2
|
||||||
lxvd2x vs55, o48, A2
|
lxvd2x vs5, o48, A2
|
||||||
addi A2, A2, 64
|
addi A2, A2, 64
|
||||||
|
|
||||||
|
|
||||||
lxvd2x vs56, o0, A3
|
lxvd2x vs6, o0, A3
|
||||||
lxvd2x vs57, o16, A3
|
lxvd2x vs7, o16, A3
|
||||||
lxvd2x vs58, o32, A3
|
lxvd2x vs8, o32, A3
|
||||||
lxvd2x vs59, o48, A3
|
lxvd2x vs9, o48, A3
|
||||||
addi A3, A3, 64
|
addi A3, A3, 64
|
||||||
|
|
||||||
lxvd2x vs60, o0, A3
|
lxvd2x vs10, o0, A3
|
||||||
lxvd2x vs61, o16, A3
|
lxvd2x vs11, o16, A3
|
||||||
lxvd2x vs62, o32, A3
|
lxvd2x vs12, o32, A3
|
||||||
lxvd2x vs63, o48, A3
|
lxvd2x vs13, o48, A3
|
||||||
addi A3, A3, 64
|
addi A3, A3, 64
|
||||||
|
|
||||||
|
|
||||||
|
@ -126,23 +126,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
stxvd2x vs51, o48, T1
|
stxvd2x vs51, o48, T1
|
||||||
addi T1, T1, 64
|
addi T1, T1, 64
|
||||||
|
|
||||||
stxvd2x vs52, o0, T1
|
stxvd2x vs2, o0, T1
|
||||||
stxvd2x vs53, o16, T1
|
stxvd2x vs3, o16, T1
|
||||||
stxvd2x vs54, o32, T1
|
stxvd2x vs4, o32, T1
|
||||||
stxvd2x vs55, o48, T1
|
stxvd2x vs5, o48, T1
|
||||||
|
|
||||||
addi T1, T1, 64
|
addi T1, T1, 64
|
||||||
|
|
||||||
stxvd2x vs56, o0, T1
|
stxvd2x vs6, o0, T1
|
||||||
stxvd2x vs57, o16, T1
|
stxvd2x vs7, o16, T1
|
||||||
stxvd2x vs58, o32, T1
|
stxvd2x vs8, o32, T1
|
||||||
stxvd2x vs59, o48, T1
|
stxvd2x vs9, o48, T1
|
||||||
addi T1, T1, 64
|
addi T1, T1, 64
|
||||||
|
|
||||||
stxvd2x vs60, o0, T1
|
stxvd2x vs10, o0, T1
|
||||||
stxvd2x vs61, o16, T1
|
stxvd2x vs11, o16, T1
|
||||||
stxvd2x vs62, o32, T1
|
stxvd2x vs12, o32, T1
|
||||||
stxvd2x vs63, o48, T1
|
stxvd2x vs13, o48, T1
|
||||||
|
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
|
|
|
@ -27,8 +27,54 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
|
#ifdef Z13_A
|
||||||
|
static void dscal_kernel_32( BLASLONG n, FLOAT da , FLOAT *x )
|
||||||
|
{
|
||||||
|
|
||||||
|
|
||||||
|
__asm__ ("pfd 2, 0(%[x_ptr]) \n\t"
|
||||||
|
"lgdr %%r0,%[alpha] \n\t"
|
||||||
|
"vlvgp %%v0,%%r0,%%r0 \n\t"
|
||||||
|
"srlg %[n],%[n],4 \n\t"
|
||||||
|
"vlr %%v1,%%v0 \n\t"
|
||||||
|
"vlm %%v16,%%v23, 0(%[x_ptr]) \n\t"
|
||||||
|
"la %[x_ptr], 128(%[x_ptr]) \n\t"
|
||||||
|
"aghik %[n], %[n], -1 \n\t"
|
||||||
|
"jle 2f \n\t"
|
||||||
|
".align 16 \n\t"
|
||||||
|
"1: \n\t"
|
||||||
|
"vfmdb %%v24, %%v16, %%v0 \n\t"
|
||||||
|
"vfmdb %%v25, %%v17, %%v0 \n\t"
|
||||||
|
"vfmdb %%v26, %%v18, %%v0 \n\t"
|
||||||
|
"vfmdb %%v27, %%v19, %%v1 \n\t"
|
||||||
|
"vlm %%v16,%%v19, 0(%[x_ptr]) \n\t"
|
||||||
|
"vfmdb %%v28, %%v20, %%v0 \n\t"
|
||||||
|
"vfmdb %%v29, %%v21, %%v1 \n\t"
|
||||||
|
"vfmdb %%v30, %%v22, %%v0 \n\t"
|
||||||
|
"vfmdb %%v31, %%v23, %%v1 \n\t"
|
||||||
|
"vlm %%v20,%%v23, 64(%[x_ptr]) \n\t"
|
||||||
|
"lay %[x_ptr], -128(%[x_ptr]) \n\t"
|
||||||
|
"vstm %%v24,%%v31, 0(%[x_ptr]) \n\t"
|
||||||
|
"la %[x_ptr],256(%[x_ptr]) \n\t"
|
||||||
|
"brctg %[n],1b \n\t"
|
||||||
|
"2: \n\t"
|
||||||
|
"vfmdb %%v24, %%v16, %%v0 \n\t"
|
||||||
|
"vfmdb %%v25, %%v17, %%v1 \n\t"
|
||||||
|
"vfmdb %%v26, %%v18, %%v0 \n\t"
|
||||||
|
"vfmdb %%v27, %%v19, %%v1 \n\t"
|
||||||
|
"lay %[x_ptr] , -128(%[x_ptr]) \n\t"
|
||||||
|
"vfmdb %%v28, %%v20, %%v0 \n\t"
|
||||||
|
"vfmdb %%v29, %%v21, %%v1 \n\t"
|
||||||
|
"vfmdb %%v30, %%v22, %%v0 \n\t"
|
||||||
|
"vfmdb %%v31, %%v23, %%v1 \n\t"
|
||||||
|
"vstm %%v24,%%v31, 0(%[x_ptr]) \n\t"
|
||||||
|
: [mem] "+m" (*(double (*)[n])x) ,[x_ptr] "+&a"(x),[n] "+&r"(n)
|
||||||
|
: [alpha] "f"(da)
|
||||||
|
:"cc" , "r0","v0","v1","v16","v17","v18","v19","v20","v21",
|
||||||
|
"v22","v23","v24","v25","v26","v27","v28","v29","v30","v31"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
#else
|
||||||
static void dscal_kernel_32( BLASLONG n, FLOAT da , FLOAT *x )
|
static void dscal_kernel_32( BLASLONG n, FLOAT da , FLOAT *x )
|
||||||
{
|
{
|
||||||
|
|
||||||
|
@ -71,7 +117,7 @@ static void dscal_kernel_32( BLASLONG n, FLOAT da , FLOAT *x )
|
||||||
);
|
);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
#endif
|
||||||
static void dscal_kernel_32_zero( BLASLONG n, FLOAT *x )
|
static void dscal_kernel_32_zero( BLASLONG n, FLOAT *x )
|
||||||
{
|
{
|
||||||
|
|
||||||
|
@ -214,6 +260,4 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
|
||||||
}
|
}
|
||||||
return 0;
|
return 0;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue