From 9e7e5c1185537c774099392b3d9a2948ff6b17e9 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 19 Sep 2019 22:18:43 +0200 Subject: [PATCH] Add gcc7-generated assembler version of caxpy for power8 to work around wrong code generated by gcc 8.3 --- kernel/power/caxpy_power8.S | 460 ++++++++++++++++++++++++++++++++++++ 1 file changed, 460 insertions(+) create mode 100644 kernel/power/caxpy_power8.S diff --git a/kernel/power/caxpy_power8.S b/kernel/power/caxpy_power8.S new file mode 100644 index 000000000..02d694391 --- /dev/null +++ b/kernel/power/caxpy_power8.S @@ -0,0 +1,460 @@ + .file "caxpy.c" + .abiversion 2 + .section ".text" + .align 2 + .p2align 4,,15 + .globl caxpy_k + .type caxpy_k, @function +caxpy_k: +.LCF0: +0: addis 2,12,.TOC.-.LCF0@ha + addi 2,2,.TOC.-.LCF0@l + .localentry caxpy_k,.-caxpy_k + mr. 7,3 + ble 0,.L33 + cmpdi 7,9,1 + beq 7,.L41 +.L3: + mtctr 7 + ld 7,96(1) + sldi 9,9,3 + sldi 7,7,3 + .p2align 4,,15 +.L14: + lfs 10,4(8) + lfs 11,0(8) + lfs 12,0(10) + lfs 0,4(10) + fmuls 10,2,10 + fmsubs 11,11,1,10 + fadds 12,12,11 + stfs 12,0(10) + lfs 11,0(8) + lfs 12,4(8) + add 8,8,9 + fmuls 11,2,11 + fmadds 12,12,1,11 + fadds 0,0,12 + stfs 0,4(10) + add 10,10,7 + bdnz .L14 +.L33: + li 3,0 + blr + .p2align 4,,15 +.L41: + ld 6,96(1) + cmpdi 7,6,1 + bne 7,.L3 + rldicr. 4,7,0,59 + std 31,-8(1) + li 11,0 + bne 0,.L42 +.L4: + addi 6,11,8 + subf 0,4,7 + sldi 6,6,2 + addi 9,6,-32 + add 5,10,6 + add 3,8,9 + add 6,8,6 + subfc 5,5,3 + add 9,10,9 + subfe 5,5,5 + subfc 6,6,9 + subfe 31,31,31 + addi 6,5,1 + addi 5,31,1 + or 6,6,5 + rlwinm 6,6,0,0xff + cmpwi 7,6,0 + beq 7,.L7 + sradi 6,4,63 + srdi 5,7,63 + subfc 31,7,4 + adde 6,5,6 + subfic 31,0,3 + subfe 31,31,31 + xori 6,6,0x1 + neg 31,31 + and 6,6,31 + rlwinm 6,6,0,0xff + cmpwi 7,6,0 + beq 7,.L7 + cmpd 7,4,7 + li 6,1 + blt 7,.L43 +.L9: + addi 0,7,-1 + subf 0,4,0 + subfic 0,0,3 + subfe 31,31,31 + addi 0,31,1 + rlwinm 0,0,0,0xff + cmpwi 7,0,0 + bne 7,.L10 + sradi 0,4,63 + subfc 31,7,4 + adde 5,5,0 + rlwinm 5,5,0,0xff + cmpwi 7,5,0 + bne 7,.L10 + addi 0,6,-1 + addis 31,2,.LC3@toc@ha + std 30,-16(1) + xscvdpspn 12,1 + xscvdpspn 11,2 + srdi. 30,0,2 + addis 6,2,.LC2@toc@ha + addi 6,6,.LC2@toc@l + mtctr 30 + addi 31,31,.LC3@toc@l + lxvd2x 42,0,6 + li 5,16 + li 6,0 + lxvd2x 41,0,31 + xxspltw 12,12,0 + xxspltw 11,11,0 + xxpermdi 42,42,42,2 + xxpermdi 41,41,41,2 + beq 0,.L44 + .p2align 4,,15 +.L11: + lxvd2x 45,3,6 + lxvd2x 33,3,5 + lxvd2x 43,9,6 + lxvd2x 0,9,5 + xxpermdi 45,45,45,2 + xxpermdi 33,33,33,2 + xxpermdi 32,43,43,2 + xxpermdi 43,0,0,2 + vperm 12,1,13,10 + vperm 1,1,13,9 + vperm 13,11,0,10 + vperm 11,11,0,9 + xvmulsp 0,11,44 + xvmulsp 32,11,33 + xvmaddmsp 33,12,0 + xvmsubasp 32,12,44 + xvaddsp 45,32,45 + xvaddsp 32,33,43 + vmrglw 1,0,13 + vmrghw 0,0,13 + xxpermdi 0,33,33,2 + xxpermdi 32,32,32,2 + stxvd2x 0,9,6 + addi 6,6,32 + stxvd2x 32,9,5 + addi 5,5,32 + bdnz .L11 + rldicr 0,0,0,61 + ld 30,-16(1) + sldi 9,0,1 + add 4,4,0 + add 11,11,9 +.L10: + sldi 6,11,2 + addi 9,4,1 + addi 5,6,4 + cmpd 7,7,9 + lfsx 12,8,6 + lfsx 0,10,6 + addi 9,11,2 + lfsx 11,8,5 + fmuls 11,2,11 + fmsubs 12,12,1,11 + fadds 0,0,12 + stfsx 0,10,6 + lfsx 11,8,6 + lfsx 12,8,5 + lfsx 0,10,5 + fmuls 11,2,11 + fmadds 12,12,1,11 + fadds 0,0,12 + stfsx 0,10,5 + ble 7,.L39 + sldi 9,9,2 + addi 6,4,2 + addi 5,9,4 + cmpd 7,7,6 + lfsx 12,8,9 + lfsx 0,10,9 + addi 6,11,4 + lfsx 11,8,5 + fmuls 11,2,11 + fmsubs 12,1,12,11 + fadds 0,0,12 + stfsx 0,10,9 + lfsx 11,8,9 + lfsx 12,8,5 + lfsx 0,10,5 + fmuls 11,2,11 + fmadds 12,1,12,11 + fadds 0,0,12 + stfsx 0,10,5 + ble 7,.L39 + sldi 6,6,2 + addi 4,4,3 + addi 5,6,4 + cmpd 7,7,4 + lfsx 12,8,6 + lfsx 0,10,6 + addi 9,11,6 + lfsx 11,8,5 + fmuls 11,2,11 + fmsubs 12,1,12,11 + fadds 0,0,12 + stfsx 0,10,6 + lfsx 11,8,6 + lfsx 12,8,5 + lfsx 0,10,5 + fmuls 11,2,11 + fmadds 12,1,12,11 + fadds 0,0,12 + stfsx 0,10,5 + ble 7,.L39 + sldi 9,9,2 + ld 31,-8(1) + addi 7,9,4 + lfsx 12,8,9 + lfsx 0,10,9 + lfsx 11,8,7 + fmuls 11,2,11 + fmsubs 12,1,12,11 + fadds 0,0,12 + stfsx 0,10,9 + lfsx 11,8,9 + lfsx 12,8,7 + lfsx 0,10,7 + fmuls 2,2,11 + fmadds 1,1,12,2 + fadds 1,0,1 + stfsx 1,10,7 + b .L33 +.L43: + mr 6,0 + b .L9 +.L7: + addi 10,4,1 + cmpd 7,10,7 + subf 10,4,7 + mtctr 10 + bgt 7,.L26 + li 10,-1 + rldicr 10,10,0,0 + cmpd 7,7,10 + beq 7,.L26 + .p2align 4,,15 +.L13: + lfs 10,4(3) + lfs 11,0(3) + addi 9,9,8 + addi 3,3,8 + lfs 12,-8(9) + lfs 0,-4(9) + fmuls 10,2,10 + fmsubs 11,1,11,10 + fadds 12,12,11 + stfs 12,-8(9) + lfs 11,-8(3) + lfs 12,-4(3) + fmuls 11,2,11 + fmadds 12,1,12,11 + fadds 0,0,12 + stfs 0,-4(9) + bdnz .L13 +.L39: + ld 31,-8(1) + b .L33 +.L42: + fneg 0,2 + xxpermdi 39,2,2,0 + addis 9,2,.LANCHOR0@toc@ha + std 28,-32(1) + sradi. 28,4,1 + addi 9,9,.LANCHOR0@toc@l + xscvdpspn 5,1 + xvcvdpsp 39,39 + lxvd2x 12,0,9 + xxpermdi 32,0,0,0 + xxspltw 5,5,0 + xvcvdpsp 32,32 + xxpermdi 12,12,12,2 + vmrgew 7,7,0 + beq 0,.L5 + xxlnor 38,12,12 + std 29,-24(1) + std 30,-16(1) + mr 6,8 + mr 9,10 + li 29,0 + li 30,16 + li 31,32 + li 12,48 + li 0,64 + li 11,80 + li 3,96 + li 5,112 + .p2align 4,,15 +.L6: + lxvd2x 6,0,9 + lxvd2x 40,0,6 + addi 29,29,8 + lxvd2x 41,6,30 + lxvd2x 42,6,31 + cmpd 7,28,29 + lxvd2x 43,6,12 + lxvd2x 44,6,0 + lxvd2x 45,6,11 + lxvd2x 33,6,3 + lxvd2x 32,6,5 + lxvd2x 7,9,30 + addi 6,6,128 + lxvd2x 8,9,31 + lxvd2x 9,9,12 + xxpermdi 40,40,40,2 + xxpermdi 6,6,6,2 + lxvd2x 10,9,0 + lxvd2x 11,9,11 + xxpermdi 41,41,41,2 + xxpermdi 42,42,42,2 + lxvd2x 12,9,3 + lxvd2x 0,9,5 + xxpermdi 43,43,43,2 + xxpermdi 44,44,44,2 + xxpermdi 45,45,45,2 + xxpermdi 33,33,33,2 + xxpermdi 32,32,32,2 + xxpermdi 7,7,7,2 + xxpermdi 8,8,8,2 + xxpermdi 9,9,9,2 + xxpermdi 10,10,10,2 + xxpermdi 11,11,11,2 + xxpermdi 12,12,12,2 + xxpermdi 0,0,0,2 + xvmaddasp 6,5,40 + xvmaddasp 7,5,41 + xvmaddasp 8,5,42 + xvmaddasp 9,5,43 + xvmaddasp 10,5,44 + xvmaddasp 11,5,45 + xvmaddasp 12,5,33 + xvmaddasp 0,5,32 + vperm 8,8,8,6 + vperm 9,9,9,6 + vperm 10,10,10,6 + vperm 11,11,11,6 + vperm 12,12,12,6 + vperm 13,13,13,6 + vperm 1,1,1,6 + vperm 0,0,0,6 + xvmaddasp 6,39,40 + xvmaddasp 7,39,41 + xvmaddasp 8,39,42 + xvmaddasp 9,39,43 + xvmaddasp 10,39,44 + xvmaddasp 11,39,45 + xvmaddasp 12,39,33 + xvmaddasp 0,39,32 + xxpermdi 6,6,6,2 + xxpermdi 7,7,7,2 + xxpermdi 8,8,8,2 + xxpermdi 9,9,9,2 + stxvd2x 6,0,9 + xxpermdi 10,10,10,2 + stxvd2x 7,9,30 + xxpermdi 11,11,11,2 + stxvd2x 8,9,31 + xxpermdi 12,12,12,2 + stxvd2x 9,9,12 + xxpermdi 0,0,0,2 + stxvd2x 10,9,0 + stxvd2x 11,9,11 + stxvd2x 12,9,3 + stxvd2x 0,9,5 + addi 9,9,128 + bgt 7,.L6 + ld 29,-24(1) + ld 30,-16(1) +.L5: + cmpd 7,7,4 + ble 7,.L36 + sldi 11,4,1 + ld 28,-32(1) + b .L4 +.L36: + ld 28,-32(1) + ld 31,-8(1) + b .L33 +.L44: + li 31,1 + mtctr 31 + b .L11 +.L26: + li 10,1 + mtctr 10 + b .L13 + .long 0 + .byte 0,0,0,0,0,4,0,0 + .size caxpy_k,.-caxpy_k + .section .rodata + .align 4 + .set .LANCHOR0,. + 0 + .type swap_mask_arr, @object + .size swap_mask_arr, 16 +swap_mask_arr: + .byte 4 + .byte 5 + .byte 6 + .byte 7 + .byte 0 + .byte 1 + .byte 2 + .byte 3 + .byte 12 + .byte 13 + .byte 14 + .byte 15 + .byte 8 + .byte 9 + .byte 10 + .byte 11 + .section .rodata.cst16,"aM",@progbits,16 + .align 4 +.LC2: + .byte 31 + .byte 30 + .byte 29 + .byte 28 + .byte 23 + .byte 22 + .byte 21 + .byte 20 + .byte 15 + .byte 14 + .byte 13 + .byte 12 + .byte 7 + .byte 6 + .byte 5 + .byte 4 +.LC3: + .byte 27 + .byte 26 + .byte 25 + .byte 24 + .byte 19 + .byte 18 + .byte 17 + .byte 16 + .byte 11 + .byte 10 + .byte 9 + .byte 8 + .byte 3 + .byte 2 + .byte 1 + .byte 0 + .ident "GCC: (SUSE Linux) 7.3.1 20180323 [gcc-7-branch revision 258812]" + .gnu_attribute 4, 1 + .section .note.GNU-stack,"",@progbits