551 lines
7.4 KiB
ArmAsm
551 lines
7.4 KiB
ArmAsm
#define ASSEMBLER
|
|
#include "common.h"
|
|
|
|
/*
|
|
.file "caxpy.c"
|
|
.abiversion 2
|
|
.section ".text"
|
|
.align 2
|
|
.p2align 4,,15
|
|
.globl caxpy_k
|
|
.type caxpy_k, @function
|
|
*/
|
|
|
|
/*
 * Entry point: caxpy_k, or caxpyc_k when CONJ is defined.
 * Register roles, as used by the code in this file:
 *   r3       element count on entry; return value (always 0) on exit
 *   f1, f2   the two scalar multipliers (presumably the real and
 *            imaginary parts of alpha -- confirm against the prototype)
 *   r8, r9   source pointer and source stride (in 8-byte elements)
 *   r10      destination pointer (the only base that is stored through)
 *   96(r1)   destination stride (in 8-byte elements), loaded from above
 *            the stack pointer
 */
PROLOGUE
|
|
|
|
#ifdef CONJ
|
|
caxpyc_k:
|
|
#else
|
|
caxpy_k:
|
|
#endif
|
|
.LCF0:
|
|
/* Global entry: derive the TOC pointer (r2) from the entry address in r12. */
0: addis 2,12,.TOC.-.LCF0@ha
|
|
addi 2,2,.TOC.-.LCF0@l
|
|
#ifdef CONJ
|
|
.localentry caxpyc_k,.-caxpyc_k
|
|
#else
|
|
.localentry caxpy_k,.-caxpy_k
|
|
#endif
|
|
/* r7 = count; a count <= 0 goes straight to the "return 0" exit (.L33). */
mr. 7,3
|
|
ble 0,.L33
|
|
/* Source stride == 1: try the contiguous fast path (.L37). */
cmpdi 7,9,1
|
|
beq 7,.L37
|
|
/*
 * General strided path: one 8-byte element per iteration, CTR = count (r7).
 */
.L3:
|
|
mtctr 7
|
|
/* r7 is reused for the destination stride once CTR holds the count. */
ld 7,96(1)
|
|
/* Convert both strides from elements to bytes (multiply by 8). */
sldi 9,9,3
|
|
sldi 7,7,3
|
|
.p2align 4,,15
|
|
/*
 * Per element, with s = source at r8 and d = destination at r10:
 *   without CONJ: d[0] += f1*s[0] - f2*s[1];   d[1] += f1*s[1] + f2*s[0]
 *   with CONJ:    d[0] += f1*s[0] + f2*s[1];   d[1] -= f1*s[1] - f2*s[0]
 */
.L14:
|
|
lfs 10,4(8)
|
|
lfs 11,0(8)
|
|
lfs 12,0(10)
|
|
lfs 0,4(10)
|
|
fmuls 10,2,10
|
|
#ifdef CONJ
|
|
fmadds 11,11,1,10
|
|
#else
|
|
fmsubs 11,11,1,10
|
|
#endif
|
|
fadds 12,12,11
|
|
stfs 12,0(10)
|
|
/* The source element is reloaded after the store to d[0]. */
lfs 11,0(8)
|
|
lfs 12,4(8)
|
|
add 8,8,9
|
|
fmuls 11,2,11
|
|
#ifdef CONJ
|
|
fmsubs 12,12,1,11
|
|
fsubs 0,0,12
|
|
#else
|
|
fmadds 12,12,1,11
|
|
fadds 0,0,12
|
|
#endif
|
|
stfs 0,4(10)
|
|
add 10,10,7
|
|
bdnz .L14
|
|
/* Common exit used by every path: return 0. */
.L33:
|
|
li 3,0
|
|
blr
|
|
/*
 * Source stride is 1: the contiguous paths also require the destination
 * stride at 96(r1) to be 1, otherwise go back to the strided loop (.L3).
 */
.p2align 4,,15
|
|
.L37:
|
|
ld 6,96(1)
|
|
cmpdi 7,6,1
|
|
bne 7,.L3
|
|
/* r4 = count with its low 4 bits cleared (multiple of 16); sets CR0. */
rldicr. 4,7,0,59
|
|
/* r11 = 0: index, in 4-byte floats, of the next element to process. */
li 11,0
|
|
/* Non-zero r4: run the 16-elements-per-iteration loop first (.L38). */
bne 0,.L38
|
|
/*
 * Remainder dispatch. On entry r4 = elements already processed and
 * r11 = 2*r4 (index in floats). Compiler-generated branch-free tests choose
 * between the vector remainder loop / unrolled tail (.L9) and the plain
 * scalar loop (.L7).
 */
.L4:
|
|
addi 6,11,8
|
|
/* r0 = r7 - r4 = elements remaining. */
subf 0,4,7
|
|
/* r6 = byte offset of the next element + 32; r9 = byte offset itself. */
sldi 6,6,2
|
|
addi 9,6,-32
|
|
add 5,10,6
|
|
add 6,8,6
|
|
/* r3 / r9 = address of the next source / destination element. */
add 3,8,9
|
|
add 9,10,9
|
|
/*
 * Overlap test through the carry bit (unsigned address compares):
 *   r6 = 1 when source      >= destination + 32
 *   r5 = 1 when destination >= source + 32
 * If neither holds the two 32-byte windows overlap: use the scalar loop.
 */
subfc 5,5,3
|
|
subfe 5,5,5
|
|
subfc 6,6,9
|
|
subfe 12,12,12
|
|
addi 6,5,1
|
|
addi 5,12,1
|
|
or 6,6,5
|
|
rlwinm 6,6,0,0xff
|
|
cmpwi 7,6,0
|
|
beq 7,.L7
|
|
/*
 * Second gate, also via carries: appears to require r4 < r7 together with
 * r0 > 4 (more than four elements remaining); otherwise use the scalar
 * loop. NOTE(review): the r4-versus-r7 part looks like it relies on both
 * values being non-negative -- verify.
 */
sradi 6,4,63
|
|
srdi 5,7,63
|
|
subfc 12,7,4
|
|
adde 6,5,6
|
|
subfic 12,0,4
|
|
subfe 12,12,12
|
|
xori 6,6,0x1
|
|
neg 12,12
|
|
and 6,6,12
|
|
rlwinm 6,6,0,0xff
|
|
cmpwi 7,6,0
|
|
beq 7,.L7
|
|
/* r6 = (r4 < r7) ? r0 : 1; the .L39 stub performs the r6 = r0 move. */
cmpd 7,4,7
|
|
li 6,1
|
|
blt 7,.L39
|
|
/*
 * Decide whether the vector remainder loop (.L11) is worth entering, and
 * if so prepare its constants. Both early-outs go to the unrolled scalar
 * tail at .L10.
 */
.L9:
|
|
/* First early-out: (r7 - 1 - r4) <= 3, compared as unsigned. */
addi 0,7,-1
|
|
subf 0,4,0
|
|
subfic 0,0,3
|
|
subfe 12,12,12
|
|
addi 0,12,1
|
|
rlwinm 0,0,0,0xff
|
|
cmpwi 7,0,0
|
|
bne 7,.L10
|
|
/*
 * Second early-out: carry-based r4-versus-r7 test; r5 still holds
 * r7 >> 63 from the dispatch code at .L4.
 */
sradi 0,4,63
|
|
subfc 12,7,4
|
|
adde 5,5,0
|
|
rlwinm 5,5,0,0xff
|
|
cmpwi 7,5,0
|
|
bne 7,.L10
|
|
/* Single-precision copies of f1 / f2 in vs0 / vs12 (splatted below). */
xscvdpspn 0,1
|
|
xscvdpspn 12,2
|
|
/* GPR r0 = r6 - 1; the loop count below is r0 / 4. */
addi 0,6,-1
|
|
/* r31 is non-volatile: spill it just below the stack pointer. */
std 31,-8(1)
|
|
addis 12,2,.LC2@toc@ha
|
|
addis 6,2,.LC3@toc@ha
|
|
li 5,16
|
|
/* r31 = r0 >> 2, sets CR0; a zero count is patched to 1 at .L40. */
srdi. 31,0,2
|
|
addi 6,6,.LC3@toc@l
|
|
addi 12,12,.LC2@toc@l
|
|
mtctr 31
|
|
/* v9 (vs41) = permute mask at .LC3, v10 (vs42) = permute mask at .LC2. */
lxv 41,0(6)
|
|
lxv 42,0(12)
|
|
/* r6 / r5 = byte offsets 0 and 16 used to index from r3 and r9. */
li 6,0
|
|
xxspltw 0,0,0
|
|
xxspltw 12,12,0
|
|
beq 0,.L40
|
|
/*
 * Vector remainder loop. Each iteration loads 32 bytes from the source
 * (r3) and 32 bytes from the destination (r9) at byte offsets r6 and r5,
 * and stores 32 bytes back to the destination. The masks in v10 / v9 split
 * the loaded words into two groups (presumably real and imaginary parts --
 * TODO confirm lane order); the groups are combined with the splatted
 * f1 (vs0) and f2 (vs12), added to / subtracted from the destination
 * groups, and re-interleaved with vmrglw / vmrghw before the stores.
 */
.p2align 4,,15
|
|
.L11:
|
|
#ifdef CONJ
|
|
lxvx 33,3,5
|
|
lxvx 44,3,6
|
|
lxvx 43,9,6
|
|
lxvx 32,9,5
|
|
vperm 13,1,12,10
|
|
vperm 12,1,12,9
|
|
vperm 8,0,11,10
|
|
vperm 0,0,11,9
|
|
xvmulsp 33,12,44
|
|
xvmulsp 11,12,45
|
|
xvmaddasp 33,0,45
|
|
xvmsubmsp 44,0,11
|
|
xvaddsp 33,33,40
|
|
xvsubsp 32,32,44
|
|
#else
|
|
lxvx 33,3,6
|
|
lxvx 32,3,5
|
|
lxvx 43,9,6
|
|
lxvx 44,9,5
|
|
vperm 13,0,1,10
|
|
vperm 0,0,1,9
|
|
vperm 8,12,11,10
|
|
vperm 12,12,11,9
|
|
xvmulsp 33,12,32
|
|
xvmulsp 11,12,45
|
|
xvmsubasp 33,0,45
|
|
xvmaddmsp 32,0,11
|
|
xvaddsp 33,33,40
|
|
xvaddsp 32,32,44
|
|
#endif
|
|
/* Re-interleave the two result groups and store them to the destination. */
vmrglw 13,0,1
|
|
vmrghw 0,0,1
|
|
stxvx 45,9,6
|
|
stxvx 32,9,5
|
|
addi 6,6,32
|
|
addi 5,5,32
|
|
bdnz .L11
|
|
/* r0 = r0 with its low 2 bits cleared (multiple of 4). */
rldicr 0,0,0,61
|
|
/* Reload the r31 value spilled before the loop. */
ld 31,-8(1)
|
|
/* Advance the element index r4 by r0 and the float index r11 by 2*r0. */
sldi 9,0,1
|
|
add 4,4,0
|
|
add 11,11,9
|
|
/*
 * Unrolled scalar tail: up to four elements, one per stanza, addressed by
 * index from the source base r8 and destination base r10. r4 is the element
 * index and r11 the index in floats. Per element (s = source, d = dest):
 *   without CONJ: d[0] += f1*s[0] - f2*s[1];   d[1] += f1*s[1] + f2*s[0]
 *   with CONJ:    d[0] += f1*s[0] + f2*s[1];   d[1] -= f1*s[1] - f2*s[0]
 * After each of the first three stanzas the code exits to .L33 when r7 is
 * <= the index of the next element.
 */
.L10:
|
|
/* Stanza 1: element r4, byte offsets r5 = 4*r11 and r3 = r5 + 4. */
sldi 5,11,2
|
|
addi 6,4,1
|
|
addi 9,11,2
|
|
addi 3,5,4
|
|
lfsx 12,8,5
|
|
cmpd 7,7,6
|
|
lfsx 0,10,5
|
|
lfsx 11,8,3
|
|
fmuls 11,2,11
|
|
#ifdef CONJ
|
|
fmadds 12,12,1,11
|
|
#else
|
|
fmsubs 12,12,1,11
|
|
#endif
|
|
fadds 0,0,12
|
|
stfsx 0,10,5
|
|
lfsx 11,8,5
|
|
lfsx 12,8,3
|
|
lfsx 0,10,3
|
|
fmuls 11,2,11
|
|
#ifdef CONJ
|
|
fmsubs 12,12,1,11
|
|
fsubs 0,0,12
|
|
#else
|
|
fmadds 12,12,1,11
|
|
fadds 0,0,12
|
|
#endif
|
|
stfsx 0,10,3
|
|
/* Done if r7 <= r4 + 1. */
ble 7,.L33
|
|
/* Stanza 2: element r4 + 1, byte offsets r9 = 4*(r11+2) and r3 = r9 + 4. */
sldi 9,9,2
|
|
addi 5,4,2
|
|
addi 6,11,4
|
|
addi 3,9,4
|
|
lfsx 12,8,9
|
|
cmpd 7,7,5
|
|
lfsx 0,10,9
|
|
lfsx 11,8,3
|
|
fmuls 11,2,11
|
|
#ifdef CONJ
|
|
fmadds 12,1,12,11
|
|
#else
|
|
fmsubs 12,1,12,11
|
|
#endif
|
|
fadds 0,0,12
|
|
stfsx 0,10,9
|
|
lfsx 11,8,9
|
|
lfsx 12,8,3
|
|
lfsx 0,10,3
|
|
fmuls 11,2,11
|
|
#ifdef CONJ
|
|
fmsubs 12,1,12,11
|
|
fsubs 0,0,12
|
|
#else
|
|
fmadds 12,1,12,11
|
|
fadds 0,0,12
|
|
#endif
|
|
stfsx 0,10,3
|
|
/* Done if r7 <= r4 + 2. */
ble 7,.L33
|
|
/* Stanza 3: element r4 + 2, byte offsets r6 = 4*(r11+4) and r5 = r6 + 4. */
sldi 6,6,2
|
|
addi 4,4,3
|
|
addi 9,11,6
|
|
addi 5,6,4
|
|
lfsx 12,8,6
|
|
cmpd 7,7,4
|
|
lfsx 0,10,6
|
|
lfsx 11,8,5
|
|
fmuls 11,2,11
|
|
#ifdef CONJ
|
|
fmadds 12,1,12,11
|
|
#else
|
|
fmsubs 12,1,12,11
|
|
#endif
|
|
fadds 0,0,12
|
|
stfsx 0,10,6
|
|
lfsx 11,8,6
|
|
lfsx 12,8,5
|
|
lfsx 0,10,5
|
|
fmuls 11,2,11
|
|
#ifdef CONJ
|
|
fmsubs 12,1,12,11
|
|
fsubs 0,0,12
|
|
#else
|
|
fmadds 12,1,12,11
|
|
fadds 0,0,12
|
|
#endif
|
|
stfsx 0,10,5
|
|
/* Done if r7 <= r4 (r4 was advanced by 3 above). */
ble 7,.L33
|
|
/*
 * Stanza 4 (last): byte offsets r9 = 4*(r11+6) and r7 = r9 + 4. r7, f1 and
 * f2 are overwritten here because nothing after this stanza reads them.
 */
sldi 9,9,2
|
|
addi 7,9,4
|
|
lfsx 12,8,9
|
|
lfsx 0,10,9
|
|
lfsx 11,8,7
|
|
fmuls 11,2,11
|
|
#ifdef CONJ
|
|
fmadds 12,1,12,11
|
|
#else
|
|
fmsubs 12,1,12,11
|
|
#endif
|
|
fadds 0,0,12
|
|
stfsx 0,10,9
|
|
lfsx 11,8,9
|
|
lfsx 12,8,7
|
|
lfsx 0,10,7
|
|
fmuls 2,2,11
|
|
#ifdef CONJ
|
|
fmsubs 1,1,12,2
|
|
fsubs 1,0,1
|
|
#else
|
|
fmadds 1,1,12,2
|
|
fadds 1,0,1
|
|
#endif
|
|
stfsx 1,10,7
|
|
b .L33
|
|
/* Out-of-line stub for the dispatch at .L4: r6 = r0, then resume at .L9. */
.L39:
|
|
mr 6,0
|
|
b .L9
|
|
/*
 * Set-up for the 16-elements-per-iteration loop (.L6):
 *   - convert the scalars f1 / f2 to single-precision vector constants;
 *   - load the 16-byte mask at .LANCHOR0 (swap_mask_arr) into v1 (vs33);
 *   - r3 = r4 >> 1 is the loop bound (the loop counter r5 steps by 8).
 * Without CONJ: vs0 = splat(f1); v0 (vs32) = vmrgew of converted f2 and -f2.
 * With CONJ:    vs12 = splat(f2); v0 (vs32) = vmrgew of converted -f1 and f1.
 * NOTE(review): the resulting lane order of the +/- pattern depends on
 * vmrgew lane numbering -- confirm before relying on it.
 */
.L38:
|
|
#ifdef CONJ
|
|
fneg 0,1
|
|
xxpermdi 45,1,1,0
|
|
xscvdpspn 12,2
|
|
addis 9,2,.LANCHOR0@toc@ha
|
|
sradi. 3,4,1
|
|
xxpermdi 44,0,0,0
|
|
addi 9,9,.LANCHOR0@toc@l
|
|
xvcvdpsp 45,45
|
|
lxv 33,0(9)
|
|
xvcvdpsp 32,44
|
|
xxspltw 12,12,0
|
|
#else
|
|
fneg 12,2
|
|
xxpermdi 32,2,2,0
|
|
xscvdpspn 0,1
|
|
addis 9,2,.LANCHOR0@toc@ha
|
|
sradi. 3,4,1
|
|
xxpermdi 45,12,12,0
|
|
addi 9,9,.LANCHOR0@toc@l
|
|
xvcvdpsp 32,32
|
|
lxv 33,0(9)
|
|
xvcvdpsp 45,45
|
|
xxspltw 0,0,0
|
|
#endif
|
|
vmrgew 0,0,13
|
|
/* CR0 still holds the result of "sradi.": skip the loop when r3 == 0. */
beq 0,.L5
|
|
/* r6 / r9 = running source / destination pointers; r5 = loop counter. */
mr 6,8
|
|
mr 9,10
|
|
li 5,0
|
|
/*
 * Main contiguous loop: 128 bytes (16 elements) of source (r6) and
 * destination (r9) per iteration. The eight source vectors are loaded into
 * vs38..vs45; vpermr with the mask in v1 produces their permuted copies
 * (vs49..vs51, vs34..vs37, and vs45 in place). Each stored vector is
 *     dest + A * src + B * permuted(src)
 * where A / B are vs0 / vs32 without CONJ and vs32 / vs12 with CONJ, as
 * prepared at .L38. Both pointers are advanced near the top of the body,
 * so the remaining accesses use negative offsets.
 */
.p2align 4,,15
|
|
.L6:
|
|
lxv 38,16(6)
|
|
lxv 11,16(9)
|
|
addi 5,5,8
|
|
addi 6,6,128
|
|
addi 9,9,128
|
|
lxv 39,-96(6)
|
|
lxv 40,-80(6)
|
|
lxv 41,-64(6)
|
|
lxv 42,-48(6)
|
|
/* Loop condition is evaluated early; CR7 is consumed by the final bgt. */
cmpd 7,3,5
|
|
lxv 43,-32(6)
|
|
lxv 45,-128(6)
|
|
lxv 44,-16(6)
|
|
#ifdef CONJ
|
|
lxv 0,-128(9)
|
|
vpermr 17,6,6,1
|
|
xvmaddmsp 38,32,11
|
|
lxv 11,-96(9)
|
|
vpermr 18,7,7,1
|
|
vpermr 19,8,8,1
|
|
vpermr 2,9,9,1
|
|
vpermr 3,10,10,1
|
|
vpermr 4,11,11,1
|
|
xvmaddasp 0,32,45
|
|
vpermr 5,12,12,1
|
|
xvmaddmsp 39,32,11
|
|
lxv 11,-80(9)
|
|
vpermr 13,13,13,1
|
|
xvmaddasp 38,12,49
|
|
xvmaddmsp 40,32,11
|
|
lxv 11,-64(9)
|
|
xvmaddmsp 45,12,0
|
|
xvmaddasp 39,12,50
|
|
stxv 38,-112(9)
|
|
xvmaddmsp 41,32,11
|
|
lxv 11,-48(9)
|
|
xvmaddasp 40,12,51
|
|
stxv 45,-128(9)
|
|
stxv 39,-96(9)
|
|
xvmaddmsp 42,32,11
|
|
lxv 11,-32(9)
|
|
xvmaddasp 41,12,34
|
|
stxv 40,-80(9)
|
|
xvmaddmsp 43,32,11
|
|
lxv 11,-16(9)
|
|
xvmaddasp 42,12,35
|
|
stxv 41,-64(9)
|
|
xvmaddmsp 44,32,11
|
|
xvmaddasp 43,12,36
|
|
stxv 42,-48(9)
|
|
xvmaddasp 44,12,37
|
|
#else
|
|
lxv 12,-128(9)
|
|
vpermr 17,6,6,1
|
|
xvmaddmsp 38,0,11
|
|
lxv 11,-96(9)
|
|
vpermr 18,7,7,1
|
|
vpermr 19,8,8,1
|
|
vpermr 2,9,9,1
|
|
vpermr 3,10,10,1
|
|
vpermr 4,11,11,1
|
|
xvmaddasp 12,0,45
|
|
vpermr 5,12,12,1
|
|
xvmaddmsp 39,0,11
|
|
lxv 11,-80(9)
|
|
vpermr 13,13,13,1
|
|
xvmaddasp 38,32,49
|
|
xvmaddmsp 40,0,11
|
|
lxv 11,-64(9)
|
|
xvmaddmsp 45,32,12
|
|
xvmaddasp 39,32,50
|
|
stxv 38,-112(9)
|
|
xvmaddmsp 41,0,11
|
|
lxv 11,-48(9)
|
|
xvmaddasp 40,32,51
|
|
stxv 45,-128(9)
|
|
stxv 39,-96(9)
|
|
xvmaddmsp 42,0,11
|
|
lxv 11,-32(9)
|
|
xvmaddasp 41,32,34
|
|
stxv 40,-80(9)
|
|
xvmaddmsp 43,0,11
|
|
lxv 11,-16(9)
|
|
xvmaddasp 42,32,35
|
|
stxv 41,-64(9)
|
|
xvmaddmsp 44,0,11
|
|
xvmaddasp 43,32,36
|
|
stxv 42,-48(9)
|
|
xvmaddasp 44,32,37
|
|
#endif
|
|
/* The last two result vectors are stored on the common path. */
stxv 43,-32(9)
|
|
stxv 44,-16(9)
|
|
/* Repeat while r3 > r5 (compare performed near the top of the body). */
bgt 7,.L6
|
|
/*
 * After the main loop: finished if r7 <= r4 (nothing left); otherwise set
 * r11 = 2*r4 (index in floats) and handle the remainder at .L4.
 */
.L5:
|
|
cmpd 7,7,4
|
|
ble 7,.L33
|
|
sldi 11,4,1
|
|
b .L4
|
|
/*
 * Set-up for the scalar contiguous loop (.L13). r3 / r9 already point at
 * the next source / destination element (computed at .L4); r10 and r8 are
 * reused as scratch from here on. CTR = r7 - r4, replaced by 1 (at .L26)
 * when r4 + 1 > r7 or when r7 == 0x8000000000000000.
 */
.L7:
|
|
addi 10,4,1
|
|
subf 8,4,7
|
|
cmpd 7,10,7
|
|
mtctr 8
|
|
bgt 7,.L26
|
|
/* r10 = -1 with all but the top bit cleared = 0x8000000000000000. */
li 10,-1
|
|
rldicr 10,10,0,0
|
|
cmpd 7,7,10
|
|
beq 7,.L26
|
|
/*
 * Scalar contiguous loop: one 8-byte element per iteration, CTR-controlled.
 * With s = source (r3) and d = destination (r9):
 *   without CONJ: d[0] += f1*s[0] - f2*s[1];   d[1] += f1*s[1] + f2*s[0]
 *   with CONJ:    d[0] += f1*s[0] + f2*s[1];   d[1] -= f1*s[1] - f2*s[0]
 * Both pointers are advanced right after the loads, hence the negative
 * offsets in the second half of the body.
 */
.p2align 4,,15
|
|
.L13:
|
|
lfs 10,4(3)
|
|
lfs 11,0(3)
|
|
lfs 12,0(9)
|
|
lfs 0,4(9)
|
|
addi 3,3,8
|
|
addi 9,9,8
|
|
fmuls 10,2,10
|
|
#ifdef CONJ
|
|
fmadds 11,1,11,10
|
|
#else
|
|
fmsubs 11,1,11,10
|
|
#endif
|
|
fadds 12,12,11
|
|
stfs 12,-8(9)
|
|
/* The source element is reloaded after the store to d[0]. */
lfs 11,-8(3)
|
|
lfs 12,-4(3)
|
|
fmuls 11,2,11
|
|
#ifdef CONJ
|
|
fmsubs 12,1,12,11
|
|
fsubs 0,0,12
|
|
#else
|
|
fmadds 12,1,12,11
|
|
fadds 0,0,12
|
|
#endif
|
|
stfs 0,-4(9)
|
|
bdnz .L13
|
|
b .L33
|
|
/* Stub: force the .L11 loop count to 1 (reached when r0 >> 2 was 0). */
.L40:
|
|
li 31,1
|
|
mtctr 31
|
|
b .L11
|
|
/* Stub: force the .L13 loop count to 1 (see the checks at .L7). */
.L26:
|
|
li 10,1
|
|
mtctr 10
|
|
b .L13
|
|
/*
 * Compiler-emitted bytes following the code (presumably the traceback
 * table -- TODO confirm), then the symbol size for whichever entry name
 * was selected by CONJ.
 */
.long 0
|
|
.byte 0,0,0,0,0,1,0,0
|
|
#ifdef CONJ
|
|
.size caxpyc_k,.-caxpyc_k
|
|
#else
|
|
.size caxpy_k,.-caxpy_k
|
|
#endif
|
|
/*
 * swap_mask_arr: 16-byte permute control vector, reached through the
 * .LANCHOR0 alias. The .L38 set-up loads it into v1 and the main loop uses
 * it as the vpermr selector. The byte indices exchange the two 4-byte
 * words inside each 8-byte pair (4-7,0-3 and 12-15,8-11).
 */
.section .rodata
|
|
.align 4
|
|
.set .LANCHOR0,. + 0
|
|
.type swap_mask_arr, @object
|
|
.size swap_mask_arr, 16
|
|
swap_mask_arr:
|
|
.byte 4
|
|
.byte 5
|
|
.byte 6
|
|
.byte 7
|
|
.byte 0
|
|
.byte 1
|
|
.byte 2
|
|
.byte 3
|
|
.byte 12
|
|
.byte 13
|
|
.byte 14
|
|
.byte 15
|
|
.byte 8
|
|
.byte 9
|
|
.byte 10
|
|
.byte 11
|
|
/*
 * .LC2 / .LC3: 16-byte vperm control vectors, loaded into v10 / v9 before
 * the vector remainder loop (.L11). Over the 32-byte concatenated vperm
 * input, .LC2 lists bytes 4-7 of every 8-byte group and .LC3 lists bytes
 * 0-3 of every 8-byte group, both in descending order.
 * NOTE(review): which mask yields the real and which the imaginary parts
 * depends on lane / endianness conventions -- confirm.
 */
.section .rodata.cst16,"aM",@progbits,16
|
|
.align 4
|
|
.LC2:
|
|
.byte 31
|
|
.byte 30
|
|
.byte 29
|
|
.byte 28
|
|
.byte 23
|
|
.byte 22
|
|
.byte 21
|
|
.byte 20
|
|
.byte 15
|
|
.byte 14
|
|
.byte 13
|
|
.byte 12
|
|
.byte 7
|
|
.byte 6
|
|
.byte 5
|
|
.byte 4
|
|
.LC3:
|
|
.byte 27
|
|
.byte 26
|
|
.byte 25
|
|
.byte 24
|
|
.byte 19
|
|
.byte 18
|
|
.byte 17
|
|
.byte 16
|
|
.byte 11
|
|
.byte 10
|
|
.byte 9
|
|
.byte 8
|
|
.byte 3
|
|
.byte 2
|
|
.byte 1
|
|
.byte 0
|
|
.ident "GCC: (SUSE Linux) 7.3.1 20180323 [gcc-7-branch revision 258812]"
|
|
.gnu_attribute 4, 1
|
|
.section .note.GNU-stack,"",@progbits
|