598 lines
8.2 KiB
ArmAsm
598 lines
8.2 KiB
ArmAsm
#define ASSEMBLER
|
|
#include "common.h"
|
|
/*
|
|
.file "caxpy.c"
|
|
.abiversion 2
|
|
.section ".text"
|
|
.align 2
|
|
.p2align 4,,15
|
|
.globl caxpy_k
|
|
.type caxpy_k, @function
|
|
*/
|
|
|
|
/*
 * Entry of the single-precision complex AXPY kernel. The symbol is caxpyc_k
 * when CONJ is defined and caxpy_k otherwise.
 *
 * Per complex element (two consecutive 4-byte floats: re, im) the code in
 * this file performs
 *   without CONJ:  y_re += f1*x_re - f2*x_im ;  y_im += f1*x_im + f2*x_re
 *   with CONJ:     y_re += f1*x_re + f2*x_im ;  y_im += f2*x_re - f1*x_im
 *
 * Register use as seen in the code:
 *   r3        element count (copied to r7); the function returns 0 in r3
 *   f1, f2    the two scalar multipliers
 *   r8, r9    pointer read from, and its per-element step (scaled by 8)
 *   r10       pointer read and written; its step is loaded from 96(r1)
 * NOTE(review): presumably n, alpha_re, alpha_im, x, inc_x, y, inc_y of the
 * C prototype; confirm against the caller. PROLOGUE comes from common.h and
 * is not visible here.
 */
PROLOGUE
|
|
|
|
#if _CALL_ELF ==2
|
|
#ifdef CONJ
|
|
caxpyc_k:
|
|
#else
|
|
caxpy_k:
|
|
#endif
|
|
#endif
|
|
.LCF0:
|
|
/* Global entry: derive r2 from r12 plus the distance from .LCF0 to .TOC. */
0: addis 2,12,.TOC.-.LCF0@ha
|
|
addi 2,2,.TOC.-.LCF0@l
|
|
#if _CALL_ELF ==2
|
|
#ifdef CONJ
|
|
/* Local entry point starts here, after the two r2 setup instructions. */
.localentry caxpyc_k,.-caxpyc_k
|
|
#else
|
|
/* Local entry point starts here, after the two r2 setup instructions. */
.localentry caxpy_k,.-caxpy_k
|
|
#endif
|
|
#endif
|
|
/* r7 = r3 (count); the record form sets cr0 for the test below. */
mr. 7,3
|
|
/* Count zero or negative: nothing to do, return 0 via .L33. */
ble 0,.L33
|
|
/* r9 == 1 selects the path at .L41; anything else falls into .L3. */
cmpdi 7,9,1
|
|
beq 7,.L41
|
|
/*
 * .L3: general scalar loop, one complex element per iteration, r7 iterations.
 * Reached when r9 != 1 or when the doubleword at 96(r1) != 1.
 * r8 advances by 8*r9 bytes and r10 by 8*(value at 96(r1)) bytes per element.
 */
.L3:
|
|
/* CTR = element count. */
mtctr 7
|
|
/* r7 is reused for the step of the r10 pointer, loaded from 96(r1). */
ld 7,96(1)
|
|
/* Scale both steps by 8: one element is two 4-byte floats. */
sldi 9,9,3
|
|
sldi 7,7,3
|
|
.p2align 4,,15
|
|
.L14:
|
|
/* f10 = x_im, f11 = x_re, f12 = y_re, f0 = y_im */
lfs 10,4(8)
|
|
lfs 11,0(8)
|
|
lfs 12,0(10)
|
|
lfs 0,4(10)
|
|
/* f10 = f2 * x_im */
fmuls 10,2,10
|
|
#ifdef CONJ
|
|
/* f11 = x_re*f1 + f2*x_im */
fmadds 11,11,1,10
|
|
#else
|
|
/* f11 = x_re*f1 - f2*x_im */
fmsubs 11,11,1,10
|
|
#endif
|
|
/* y_re += f11, stored before x is read again */
fadds 12,12,11
|
|
stfs 12,0(10)
|
|
/* x_re and x_im are loaded again from memory after the store above. */
lfs 11,0(8)
|
|
lfs 12,4(8)
|
|
/* advance the r8 pointer by its byte step */
add 8,8,9
|
|
/* f11 = f2 * x_re */
fmuls 11,2,11
|
|
#ifdef CONJ
|
|
/* y_im -= (x_im*f1 - f2*x_re) */
fmsubs 12,12,1,11
|
|
fsubs 0,0,12
|
|
#else
|
|
/* y_im += (x_im*f1 + f2*x_re) */
fmadds 12,12,1,11
|
|
fadds 0,0,12
|
|
#endif
|
|
stfs 0,4(10)
|
|
/* advance the r10 pointer by its byte step */
add 10,10,7
|
|
bdnz .L14
|
|
/* .L33: common exit, return value 0 in r3. */
.L33:
|
|
li 3,0
|
|
blr
|
|
.p2align 4,,15
|
|
/*
 * .L41: reached when r9 == 1. If the doubleword at 96(r1) is not 1 as well,
 * go back to the general loop at .L3. Otherwise split the count in r7 into a
 * multiple-of-16 part (r4) and a remainder handled from .L4 onwards.
 */
.L41:
|
|
ld 6,96(1)
|
|
cmpdi 7,6,1
|
|
bne 7,.L3
|
|
/* r4 = r7 with its low 4 bits cleared; cr0 records whether r4 is zero. */
rldicr. 4,7,0,59
|
|
/* Save r31 at a negative offset from r1; every path below reloads it. */
std 31,-8(1)
|
|
/* r11 = 0: float index at which the remainder code starts when .L42 is skipped. */
li 11,0
|
|
/* r4 != 0: run the 16-element loop set up at .L42; else fall into .L4. */
bne 0,.L42
|
|
/*
 * .L4: set-up for the remainder pass.
 * On entry r4 = number of elements already handled and r11 = matching float
 * index (both 0 when .L42 was skipped; .L5 sets r11 = 2*r4). r31 is scratch
 * here; it was saved at -8(r1) in .L41.
 * Produces r0 = r7 - r4, r3 = r8 + 4*r11, r9 = r10 + 4*r11 and then runs two
 * checks. When either check fails the scalar loop at .L7 is used.
 * The compare sequences below are carry-chain idioms emitted by the compiler.
 */
.L4:
|
|
addi 6,11,8
|
|
/* r0 = r7 - r4 (elements still to do) */
subf 0,4,7
|
|
/* r6 = 4*(r11+8), r9 = 4*r11: byte offsets 32 apart */
sldi 6,6,2
|
|
addi 9,6,-32
|
|
/* r5 = r10 + 4*r11 + 32 */
add 5,10,6
|
|
/* r3 = r8 + 4*r11 */
add 3,8,9
|
|
/* r6 = r8 + 4*r11 + 32 */
add 6,8,6
|
|
/* Check 1: carry from r3 - r5, then r5 = 0 if r3 is at or above r5 (unsigned), else -1 */
subfc 5,5,3
|
|
/* r9 = r10 + 4*r11 */
add 9,10,9
|
|
subfe 5,5,5
|
|
/* carry from r9 - r6, then r31 = 0 if r9 is at or above r6 (unsigned), else -1 */
subfc 6,6,9
|
|
subfe 31,31,31
|
|
/* turn both results into 0/1 flags and OR them */
addi 6,5,1
|
|
addi 5,31,1
|
|
or 6,6,5
|
|
rlwinm 6,6,0,0xff
|
|
cmpwi 7,6,0
|
|
/* Neither holds: the 32-byte windows at r3 and r9 overlap, use .L7. */
beq 7,.L7
|
|
/* Check 2: r6 = sign mask of r4, r5 = sign bit of r7 (r5 is reused in .L9) */
sradi 6,4,63
|
|
srdi 5,7,63
|
|
/* carry from r4 - r7 feeds the adde: r6 = 1 when r4 is at least r7 (signed) */
subfc 31,7,4
|
|
adde 6,5,6
|
|
/* carry from 3 - r0, then r31 = 0 when r0 is at most 3 (unsigned), else -1 */
subfic 31,0,3
|
|
subfe 31,31,31
|
|
/* r6 = 1 when r4 is below r7 */
xori 6,6,0x1
|
|
/* r31 = 1 when r0 is above 3 */
neg 31,31
|
|
and 6,6,31
|
|
rlwinm 6,6,0,0xff
|
|
cmpwi 7,6,0
|
|
/* Fewer than 4 elements left (or r4 not below r7): use .L7. */
beq 7,.L7
|
|
/* r6 = 1, replaced by r0 at .L43 when r4 is below r7 (signed). */
cmpd 7,4,7
|
|
li 6,1
|
|
blt 7,.L43
|
|
/*
 * .L9: decide between the 4-element vector remainder loop (.L11) and the
 * unrolled scalar remainder (.L10), and load the constants the vector loop
 * needs. On entry r6 = element count chosen in .L4/.L43, r5 = sign bit of r7
 * (left by srdi 5,7,63 in .L4).
 */
.L9:
|
|
/* r0 = r7 - 1 - r4 */
addi 0,7,-1
|
|
subf 0,4,0
|
|
/* r0 = 1 when (r7 - 1 - r4) is at most 3 (unsigned), else 0 */
subfic 0,0,3
|
|
subfe 31,31,31
|
|
addi 0,31,1
|
|
rlwinm 0,0,0,0xff
|
|
cmpwi 7,0,0
|
|
/* At most 4 elements left: go straight to the scalar remainder. */
bne 7,.L10
|
|
/* Same signed r4-versus-r7 carry compare as in .L4; nonzero goes to .L10. */
sradi 0,4,63
|
|
subfc 31,7,4
|
|
adde 5,5,0
|
|
rlwinm 5,5,0,0xff
|
|
cmpwi 7,5,0
|
|
bne 7,.L10
|
|
/* r0 = r6 - 1; the vector loop runs r0/4 times (see srdi. below). */
addi 0,6,-1
|
|
addis 31,2,.LC3@toc@ha
|
|
/* r30 becomes the iteration count; save the caller value first. */
std 30,-16(1)
|
|
/* vs12 / vs11: f1 / f2 converted to single precision (splatted below). */
xscvdpspn 12,1
|
|
xscvdpspn 11,2
|
|
/* r30 = r0 / 4; cr0 records whether it is zero (tested by the beq below). */
srdi. 30,0,2
|
|
addis 6,2,.LC2@toc@ha
|
|
addi 6,6,.LC2@toc@l
|
|
mtctr 30
|
|
addi 31,31,.LC3@toc@l
|
|
/* vs42 (v10) = table .LC2 */
lxvd2x 42,0,6
|
|
/* r6 = 0 and r5 = 16: byte offsets of the two vectors of one iteration */
li 5,16
|
|
li 6,0
|
|
/* vs41 (v9) = table .LC3 */
lxvd2x 41,0,31
|
|
/* replicate word 0 across vs12 (f1) and vs11 (f2) */
xxspltw 12,12,0
|
|
xxspltw 11,11,0
|
|
/* swap the two doublewords of each table after lxvd2x */
xxpermdi 42,42,42,2
|
|
xxpermdi 41,41,41,2
|
|
/* r30 == 0: .L44 forces CTR to 1 before entering the loop. */
beq 0,.L44
|
|
.p2align 4,,15
|
|
/*
 * .L11: vector remainder loop, 32 bytes (four complex elements) per
 * iteration. r3 and r9 are the base addresses computed in .L4 (r3 is only
 * read, r9 is read and written); r6 and r5 are the byte offsets of the two
 * 16-byte halves. vs12 = splat(f1), vs11 = splat(f2), v10/v9 = permute
 * controls loaded from .LC2/.LC3.
 * The arithmetic below treats the .LC2 gather as the real parts and the
 * .LC3 gather as the imaginary parts of the four elements.
 * NOTE(review): lane order inferred from the arithmetic; confirm.
 */
.L11:
|
|
#ifdef CONJ
|
|
/* load two vectors from r3 (x side) and two from r9 (y side) */
lxvd2x 44,3,6
|
|
lxvd2x 45,3,5
|
|
lxvd2x 33,9,6
|
|
lxvd2x 0,9,5
|
|
/* doubleword swap after each lxvd2x */
xxpermdi 44,44,44,2
|
|
xxpermdi 45,45,45,2
|
|
xxpermdi 32,33,33,2
|
|
xxpermdi 33,0,0,2
|
|
/* v11 / v13 = x gathered with v10 / v9; v12 / v1 = y gathered likewise */
vperm 11,13,12,10
|
|
vperm 13,13,12,9
|
|
vperm 12,1,0,10
|
|
vperm 1,1,0,9
|
|
/* vs0 = f2*v11, v0 = f2*v13 */
xvmulsp 0,11,43
|
|
xvmulsp 32,11,45
|
|
/* v13 = f1*v13 - vs0 ; v0 = f1*v11 + v0 */
xvmsubmsp 45,12,0
|
|
xvmaddasp 32,12,43
|
|
/* v12 = v0 + v12 ; v0 = v1 - v13 */
xvaddsp 44,32,44
|
|
xvsubsp 32,33,45
|
|
/* interleave the two result vectors back into element order */
vmrglw 1,0,12
|
|
vmrghw 0,0,12
|
|
#else
|
|
/* load two vectors from r3 (x side) and two from r9 (y side) */
lxvd2x 45,3,6
|
|
lxvd2x 33,3,5
|
|
lxvd2x 43,9,6
|
|
lxvd2x 0,9,5
|
|
/* doubleword swap after each lxvd2x */
xxpermdi 45,45,45,2
|
|
xxpermdi 33,33,33,2
|
|
xxpermdi 32,43,43,2
|
|
xxpermdi 43,0,0,2
|
|
/* v12 / v1 = x gathered with v10 / v9; v13 / v11 = y gathered likewise */
vperm 12,1,13,10
|
|
vperm 1,1,13,9
|
|
vperm 13,11,0,10
|
|
vperm 11,11,0,9
|
|
/* vs0 = f2*v12, v0 = f2*v1 */
xvmulsp 0,11,44
|
|
xvmulsp 32,11,33
|
|
/* v1 = f1*v1 + vs0 ; v0 = f1*v12 - v0 */
xvmaddmsp 33,12,0
|
|
xvmsubasp 32,12,44
|
|
/* v13 = v0 + v13 ; v0 = v1 + v11 */
xvaddsp 45,32,45
|
|
xvaddsp 32,33,43
|
|
/* interleave the two result vectors back into element order */
vmrglw 1,0,13
|
|
vmrghw 0,0,13
|
|
#endif
|
|
/* doubleword swap before stxvd2x, then store both halves through r9 */
xxpermdi 0,33,33,2
|
|
xxpermdi 32,32,32,2
|
|
stxvd2x 0,9,6
|
|
addi 6,6,32
|
|
stxvd2x 32,9,5
|
|
addi 5,5,32
|
|
bdnz .L11
|
|
/* r0 (set to r6 - 1 in .L9) rounded down to a multiple of 4 */
rldicr 0,0,0,61
|
|
/* restore r30 saved in .L9 */
ld 30,-16(1)
|
|
/* advance element index r4 by r0 and float index r11 by 2*r0 */
sldi 9,0,1
|
|
add 4,4,0
|
|
add 11,11,9
|
|
/*
 * .L10: scalar remainder, unrolled for up to four elements.
 * r4 = index of the next element, r11 = its float index (2*r4), r7 = total
 * count. Addresses are r8 + byte offset (read only) and r10 + byte offset
 * (read and written). After each of the first three elements the code leaves
 * through .L39 (which reloads r31) when r7 is not above the next index.
 * Each element uses the same formula as the loop at .L14, including the
 * second load of x after the first store.
 */
.L10:
|
|
/* element 1: r6 = byte offset of re, r5 = byte offset of im */
sldi 6,11,2
|
|
addi 9,4,1
|
|
addi 5,6,4
|
|
/* cr7 = r7 compared with r4+1, consumed by the ble after this element */
cmpd 7,7,9
|
|
lfsx 12,8,6
|
|
lfsx 0,10,6
|
|
/* r9 = float index of element 2 */
addi 9,11,2
|
|
lfsx 11,8,5
|
|
fmuls 11,2,11
|
|
#ifdef CONJ
|
|
fmadds 12,12,1,11
|
|
#else
|
|
fmsubs 12,12,1,11
|
|
#endif
|
|
fadds 0,0,12
|
|
stfsx 0,10,6
|
|
lfsx 11,8,6
|
|
lfsx 12,8,5
|
|
lfsx 0,10,5
|
|
fmuls 11,2,11
|
|
#ifdef CONJ
|
|
fmsubs 12,12,1,11
|
|
fsubs 0,0,12
|
|
#else
|
|
fmadds 12,12,1,11
|
|
fadds 0,0,12
|
|
#endif
|
|
stfsx 0,10,5
|
|
/* done when r7 is at most r4+1 */
ble 7,.L39
|
|
/* element 2: r9 = byte offset of re, r5 = byte offset of im */
sldi 9,9,2
|
|
addi 6,4,2
|
|
addi 5,9,4
|
|
/* cr7 = r7 compared with r4+2 */
cmpd 7,7,6
|
|
lfsx 12,8,9
|
|
lfsx 0,10,9
|
|
/* r6 = float index of element 3 */
addi 6,11,4
|
|
lfsx 11,8,5
|
|
fmuls 11,2,11
|
|
#ifdef CONJ
|
|
fmadds 12,1,12,11
|
|
#else
|
|
fmsubs 12,1,12,11
|
|
#endif
|
|
fadds 0,0,12
|
|
stfsx 0,10,9
|
|
lfsx 11,8,9
|
|
lfsx 12,8,5
|
|
lfsx 0,10,5
|
|
fmuls 11,2,11
|
|
#ifdef CONJ
|
|
fmsubs 12,1,12,11
|
|
fsubs 0,0,12
|
|
#else
|
|
fmadds 12,1,12,11
|
|
fadds 0,0,12
|
|
#endif
|
|
stfsx 0,10,5
|
|
/* done when r7 is at most r4+2 */
ble 7,.L39
|
|
/* element 3: r6 = byte offset of re, r5 = byte offset of im */
sldi 6,6,2
|
|
addi 4,4,3
|
|
addi 5,6,4
|
|
/* cr7 = r7 compared with the original r4 plus 3 */
cmpd 7,7,4
|
|
lfsx 12,8,6
|
|
lfsx 0,10,6
|
|
/* r9 = float index of element 4 */
addi 9,11,6
|
|
lfsx 11,8,5
|
|
fmuls 11,2,11
|
|
#ifdef CONJ
|
|
fmadds 12,1,12,11
|
|
#else
|
|
fmsubs 12,1,12,11
|
|
#endif
|
|
fadds 0,0,12
|
|
stfsx 0,10,6
|
|
lfsx 11,8,6
|
|
lfsx 12,8,5
|
|
lfsx 0,10,5
|
|
fmuls 11,2,11
|
|
#ifdef CONJ
|
|
fmsubs 12,1,12,11
|
|
fsubs 0,0,12
|
|
#else
|
|
fmadds 12,1,12,11
|
|
fadds 0,0,12
|
|
#endif
|
|
stfsx 0,10,5
|
|
/* done when r7 is at most the original r4 plus 3 */
ble 7,.L39
|
|
/* element 4 (last): r9 = byte offset of re, r7 = byte offset of im */
sldi 9,9,2
|
|
/* r31 is reloaded here because this path exits through .L33, not .L39 */
ld 31,-8(1)
|
|
addi 7,9,4
|
|
lfsx 12,8,9
|
|
lfsx 0,10,9
|
|
lfsx 11,8,7
|
|
fmuls 11,2,11
|
|
#ifdef CONJ
|
|
fmadds 12,1,12,11
|
|
#else
|
|
fmsubs 12,1,12,11
|
|
#endif
|
|
fadds 0,0,12
|
|
stfsx 0,10,9
|
|
lfsx 11,8,9
|
|
lfsx 12,8,7
|
|
lfsx 0,10,7
|
|
/* f2 and f1 are overwritten as temporaries: no further element follows */
fmuls 2,2,11
|
|
#ifdef CONJ
|
|
fmsubs 1,1,12,2
|
|
fsubs 1,0,1
|
|
#else
|
|
fmadds 1,1,12,2
|
|
fadds 1,0,1
|
|
#endif
|
|
stfsx 1,10,7
|
|
b .L33
|
|
/* .L43: taken from .L4 when r4 is below r7; r6 = r0 (= r7 - r4), back to .L9. */
.L43:
|
|
mr 6,0
|
|
b .L9
|
|
/*
 * .L7: scalar remainder loop used when the checks in .L4 fail.
 * r3 and r9 are the running addresses set up in .L4 (r3 read only, r9 read
 * and written); both advance by 8 bytes per element. r10 is reused as
 * scratch from here on.
 */
.L7:
|
|
addi 10,4,1
|
|
cmpd 7,10,7
|
|
/* CTR = r7 - r4 */
subf 10,4,7
|
|
mtctr 10
|
|
/* r4+1 above r7: .L26 forces CTR to 1 */
bgt 7,.L26
|
|
/* r10 = only the most significant bit set; r7 equal to it also goes to .L26 */
li 10,-1
|
|
rldicr 10,10,0,0
|
|
cmpd 7,7,10
|
|
beq 7,.L26
|
|
.p2align 4,,15
|
|
.L13:
|
|
/* f10 = x_im, f11 = x_re */
lfs 10,4(3)
|
|
lfs 11,0(3)
|
|
/* pointers are bumped early; the accesses below use negative offsets */
addi 9,9,8
|
|
addi 3,3,8
|
|
/* f12 = y_re, f0 = y_im */
lfs 12,-8(9)
|
|
lfs 0,-4(9)
|
|
fmuls 10,2,10
|
|
#ifdef CONJ
|
|
/* f11 = f1*x_re + f2*x_im */
fmadds 11,1,11,10
|
|
#else
|
|
/* f11 = f1*x_re - f2*x_im */
fmsubs 11,1,11,10
|
|
#endif
|
|
fadds 12,12,11
|
|
stfs 12,-8(9)
|
|
/* x is loaded again after the store above */
lfs 11,-8(3)
|
|
lfs 12,-4(3)
|
|
fmuls 11,2,11
|
|
#ifdef CONJ
|
|
/* y_im -= (f1*x_im - f2*x_re) */
fmsubs 12,1,12,11
|
|
fsubs 0,0,12
|
|
#else
|
|
/* y_im += (f1*x_im + f2*x_re) */
fmadds 12,1,12,11
|
|
fadds 0,0,12
|
|
#endif
|
|
stfs 0,-4(9)
|
|
bdnz .L13
|
|
/* .L39: reload r31 saved in .L41, then return through .L33. */
.L39:
|
|
ld 31,-8(1)
|
|
b .L33
|
|
/*
 * .L42: set-up for the 16-element main loop (.L6); r4 is a nonzero multiple
 * of 16 here.
 * Builds two single-precision multiplier vectors:
 *   vs5        one scalar replicated in every word (f1 without CONJ, f2 with)
 *   vs39 (v7)  the other scalar merged with its negation by vmrgew
 * and loads the 16-byte table at .LANCHOR0 (swap_mask_arr) into vs12.
 * NOTE(review): exact word order of the signed vector not verified here.
 */
.L42:
|
|
#ifdef CONJ
|
|
/* f0 = -f1 */
fneg 0,1
|
|
/* vs32 (v0) = f1 in both doublewords */
xxpermdi 32,1,1,0
|
|
addis 9,2,.LANCHOR0@toc@ha
|
|
/* save r28, then r28 = r4 / 2 = loop bound; cr0 is tested by the beq below */
std 28,-32(1)
|
|
sradi. 28,4,1
|
|
addi 9,9,.LANCHOR0@toc@l
|
|
/* vs5 = f2 as single precision (splatted below) */
xscvdpspn 5,2
|
|
xvcvdpsp 32,32
|
|
lxvd2x 12,0,9
|
|
/* vs39 (v7) = -f1 in both doublewords, converted to single precision */
xxpermdi 39,0,0,0
|
|
xxspltw 5,5,0
|
|
xvcvdpsp 39,39
|
|
#else
|
|
/* f0 = -f2 */
fneg 0,2
|
|
/* vs39 (v7) = f2 in both doublewords */
xxpermdi 39,2,2,0
|
|
addis 9,2,.LANCHOR0@toc@ha
|
|
/* save r28, then r28 = r4 / 2 = loop bound; cr0 is tested by the beq below */
std 28,-32(1)
|
|
sradi. 28,4,1
|
|
addi 9,9,.LANCHOR0@toc@l
|
|
/* vs5 = f1 as single precision (splatted below) */
xscvdpspn 5,1
|
|
xvcvdpsp 39,39
|
|
lxvd2x 12,0,9
|
|
/* vs32 (v0) = -f2 in both doublewords, converted to single precision */
xxpermdi 32,0,0,0
|
|
xxspltw 5,5,0
|
|
xvcvdpsp 32,32
|
|
#endif
|
|
/* doubleword swap of the table after lxvd2x */
xxpermdi 12,12,12,2
|
|
/* v7 = even words of v7 merged with even words of v0 */
vmrgew 7,7,0
|
|
/* r28 == 0: skip the loop */
beq 0,.L5
|
|
/* vs38 (v6) = bitwise NOT of the table; used as the vperm control in .L6 */
xxlnor 38,12,12
|
|
/* save r29 and r30; restored right after the loop */
std 29,-24(1)
|
|
std 30,-16(1)
|
|
/* r6 / r9 = running copies of the r8 / r10 pointers */
mr 6,8
|
|
mr 9,10
|
|
/* r29 = loop counter, stepped by 8 and compared with r28 */
li 29,0
|
|
/* byte offsets 16..112 of the other seven vectors of a 128-byte chunk */
li 30,16
|
|
li 31,32
|
|
li 12,48
|
|
li 0,64
|
|
li 11,80
|
|
li 3,96
|
|
li 5,112
|
|
.p2align 4,,15
|
|
/*
 * .L6: main loop, 128 bytes (16 complex elements) per iteration.
 * x vectors are loaded through r6 into vs40-vs45, vs33, vs32 (v8-v13, v1, v0);
 * y vectors are loaded through r9 into vs6-vs12 and vs0, updated and stored
 * back through r9. Each x vector is used twice: once as loaded and once after
 * vperm with control v6, with multipliers vs5 and vs39 prepared in .L42.
 * The loop continues while r28 is above r29 (r29 steps by 8).
 */
.L6:
|
|
lxvd2x 6,0,9
|
|
lxvd2x 40,0,6
|
|
addi 29,29,8
|
|
lxvd2x 41,6,30
|
|
lxvd2x 42,6,31
|
|
/* cr7 = r28 compared with r29, consumed by the bgt at the bottom */
cmpd 7,28,29
|
|
lxvd2x 43,6,12
|
|
lxvd2x 44,6,0
|
|
lxvd2x 45,6,11
|
|
lxvd2x 33,6,3
|
|
lxvd2x 32,6,5
|
|
lxvd2x 7,9,30
|
|
/* all eight x vectors are loaded: advance r6 to the next chunk */
addi 6,6,128
|
|
lxvd2x 8,9,31
|
|
lxvd2x 9,9,12
|
|
/* doubleword swap of every loaded vector (interleaved with the last loads) */
xxpermdi 40,40,40,2
|
|
xxpermdi 6,6,6,2
|
|
lxvd2x 10,9,0
|
|
lxvd2x 11,9,11
|
|
xxpermdi 41,41,41,2
|
|
xxpermdi 42,42,42,2
|
|
lxvd2x 12,9,3
|
|
lxvd2x 0,9,5
|
|
xxpermdi 43,43,43,2
|
|
xxpermdi 44,44,44,2
|
|
xxpermdi 45,45,45,2
|
|
xxpermdi 33,33,33,2
|
|
xxpermdi 32,32,32,2
|
|
xxpermdi 7,7,7,2
|
|
xxpermdi 8,8,8,2
|
|
xxpermdi 9,9,9,2
|
|
xxpermdi 10,10,10,2
|
|
xxpermdi 11,11,11,2
|
|
xxpermdi 12,12,12,2
|
|
xxpermdi 0,0,0,2
|
|
#ifndef CONJ
|
|
/* without CONJ: y += vs5 * x on the vectors as loaded ... */
xvmaddasp 6,5,40
|
|
xvmaddasp 7,5,41
|
|
xvmaddasp 8,5,42
|
|
xvmaddasp 9,5,43
|
|
xvmaddasp 10,5,44
|
|
xvmaddasp 11,5,45
|
|
xvmaddasp 12,5,33
|
|
xvmaddasp 0,5,32
|
|
/* ... then permute every x vector with control v6 */
vperm 8,8,8,6
|
|
vperm 9,9,9,6
|
|
vperm 10,10,10,6
|
|
vperm 11,11,11,6
|
|
vperm 12,12,12,6
|
|
vperm 13,13,13,6
|
|
vperm 1,1,1,6
|
|
vperm 0,0,0,6
|
|
#endif
|
|
/* y += vs39 * x (x permuted without CONJ, as loaded with CONJ) */
xvmaddasp 6,39,40
|
|
xvmaddasp 7,39,41
|
|
xvmaddasp 8,39,42
|
|
xvmaddasp 9,39,43
|
|
xvmaddasp 10,39,44
|
|
xvmaddasp 11,39,45
|
|
xvmaddasp 12,39,33
|
|
xvmaddasp 0,39,32
|
|
#ifdef CONJ
|
|
/* with CONJ: permute every x vector with control v6 ... */
vperm 8,8,8,6
|
|
vperm 9,9,9,6
|
|
vperm 10,10,10,6
|
|
vperm 11,11,11,6
|
|
vperm 12,12,12,6
|
|
vperm 13,13,13,6
|
|
vperm 1,1,1,6
|
|
vperm 0,0,0,6
|
|
/* ... then y += vs5 * permuted x */
xvmaddasp 6,5,40
|
|
xvmaddasp 7,5,41
|
|
xvmaddasp 8,5,42
|
|
xvmaddasp 9,5,43
|
|
xvmaddasp 10,5,44
|
|
xvmaddasp 11,5,45
|
|
xvmaddasp 12,5,33
|
|
xvmaddasp 0,5,32
|
|
#endif
|
|
/* doubleword swap of each result, interleaved with the stores through r9 */
xxpermdi 6,6,6,2
|
|
xxpermdi 7,7,7,2
|
|
xxpermdi 8,8,8,2
|
|
xxpermdi 9,9,9,2
|
|
stxvd2x 6,0,9
|
|
xxpermdi 10,10,10,2
|
|
stxvd2x 7,9,30
|
|
xxpermdi 11,11,11,2
|
|
stxvd2x 8,9,31
|
|
xxpermdi 12,12,12,2
|
|
stxvd2x 9,9,12
|
|
xxpermdi 0,0,0,2
|
|
stxvd2x 10,9,0
|
|
stxvd2x 11,9,11
|
|
stxvd2x 12,9,3
|
|
stxvd2x 0,9,5
|
|
/* advance r9 to the next chunk */
addi 9,9,128
|
|
bgt 7,.L6
|
|
/* restore r29 and r30 saved in .L42 */
ld 29,-24(1)
|
|
ld 30,-16(1)
|
|
/*
 * .L5: after the main loop. If r7 is above r4 there is a remainder: set the
 * float index r11 = 2*r4, restore r28 and continue at .L4.
 */
.L5:
|
|
cmpd 7,7,4
|
|
ble 7,.L36
|
|
sldi 11,4,1
|
|
ld 28,-32(1)
|
|
b .L4
|
|
/* .L36: no remainder; restore r28 and r31, then return through .L33. */
.L36:
|
|
ld 28,-32(1)
|
|
ld 31,-8(1)
|
|
b .L33
|
|
/* .L44: reached from .L9 when the computed trip count is 0; run .L11 once. */
.L44:
|
|
li 31,1
|
|
mtctr 31
|
|
b .L11
|
|
/* .L26: reached from the two guards in .L7; run .L13 with CTR = 1. */
.L26:
|
|
li 10,1
|
|
mtctr 10
|
|
b .L13
|
|
/*
 * Data words emitted after the last instruction of the function.
 * NOTE(review): presumably the traceback table GCC appends on this target;
 * confirm before changing.
 */
.long 0
|
|
.byte 0,0,0,0,0,4,0,0
|
|
#if _CALL_ELF ==2
|
|
#ifdef CONJ
|
|
/* symbol size = distance from the entry label to this point */
.size caxpyc_k,.-caxpyc_k
|
|
#else
|
|
/* symbol size = distance from the entry label to this point */
.size caxpy_k,.-caxpy_k
|
|
#endif
|
|
#endif
|
|
/*
 * swap_mask_arr: 16-byte table, also reachable as .LANCHOR0.
 * The byte values 4-7,0-3,12-15,8-11 exchange the two 4-byte words inside
 * each 8-byte half. The code at .L42 loads it and uses its bitwise
 * complement as the vperm control of the main loop.
 */
.section .rodata
|
|
.align 4
|
|
.set .LANCHOR0,. + 0
|
|
.type swap_mask_arr, @object
|
|
.size swap_mask_arr, 16
|
|
swap_mask_arr:
|
|
.byte 4
|
|
.byte 5
|
|
.byte 6
|
|
.byte 7
|
|
.byte 0
|
|
.byte 1
|
|
.byte 2
|
|
.byte 3
|
|
.byte 12
|
|
.byte 13
|
|
.byte 14
|
|
.byte 15
|
|
.byte 8
|
|
.byte 9
|
|
.byte 10
|
|
.byte 11
|
|
/*
 * Two 16-byte vperm control tables used by the vector remainder loop (.L11).
 * Each selects four 4-byte words out of a 32-byte register pair:
 *   .LC2 picks bytes 28-31, 20-23, 12-15, 4-7   (every second word)
 *   .LC3 picks bytes 24-27, 16-19, 8-11, 0-3    (the other words)
 * with the bytes of each word listed in descending order.
 */
.section .rodata.cst16,"aM",@progbits,16
|
|
.align 4
|
|
.LC2:
|
|
.byte 31
|
|
.byte 30
|
|
.byte 29
|
|
.byte 28
|
|
.byte 23
|
|
.byte 22
|
|
.byte 21
|
|
.byte 20
|
|
.byte 15
|
|
.byte 14
|
|
.byte 13
|
|
.byte 12
|
|
.byte 7
|
|
.byte 6
|
|
.byte 5
|
|
.byte 4
|
|
.LC3:
|
|
.byte 27
|
|
.byte 26
|
|
.byte 25
|
|
.byte 24
|
|
.byte 19
|
|
.byte 18
|
|
.byte 17
|
|
.byte 16
|
|
.byte 11
|
|
.byte 10
|
|
.byte 9
|
|
.byte 8
|
|
.byte 3
|
|
.byte 2
|
|
.byte 1
|
|
.byte 0
|
|
.ident "GCC: (SUSE Linux) 7.3.1 20180323 [gcc-7-branch revision 258812]"
|
|
.gnu_attribute 4, 1
|
|
.section .note.GNU-stack,"",@progbits
|