Merge pull request #7 from xianyi/develop

update
This commit is contained in:
Martin Kroeker 2019-09-27 00:42:32 +02:00 committed by GitHub
commit c0d570a357
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
19 changed files with 4762 additions and 33 deletions

View File

@ -17,7 +17,7 @@ matrix:
- COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32"
script:
- set -e
- travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
- make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
- make -C test $COMMON_FLAGS $BTYPE
- make -C ctest $COMMON_FLAGS $BTYPE
- make -C utest $COMMON_FLAGS $BTYPE
@ -25,14 +25,14 @@ matrix:
- TARGET_BOX=LINUX64
- BTYPE="BINARY=64"
# - <<: *test-ubuntu
# os: linux-ppc64le
# before_script:
# - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=POWER8 NUM_THREADS=32"
# env:
# # for matrix annotation only
# - TARGET_BOX=PPC64LE_LINUX
# - BTYPE="BINARY=64 USE_OPENMP=1"
- <<: *test-ubuntu
os: linux-ppc64le
before_script:
- COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=POWER8 NUM_THREADS=32"
env:
# for matrix annotation only
- TARGET_BOX=PPC64LE_LINUX
- BTYPE="BINARY=64 USE_OPENMP=1"
- <<: *test-ubuntu
env:

View File

@ -322,12 +322,13 @@ CCOMMON_OPT += -DMS_ABI
endif
ifeq ($(C_COMPILER), GCC)
#Test for supporting MS_ABI
#Version tests for supporting specific features (MS_ABI, POWER9 intrinsics)
GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4)
GCCVERSIONGT4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 4)
GCCVERSIONGT5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 5)
GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7)
ifeq ($(GCCVERSIONGT4), 1)
# GCC Majar version > 4
# GCC Major version > 4
# It is compatible with MSVC ABI.
CCOMMON_OPT += -DMS_ABI
endif
@ -554,8 +555,17 @@ endif
# POWER: list of kernels a DYNAMIC_ARCH build is allowed to include.
ifeq ($(ARCH), power)
DYNAMIC_CORE = POWER6
DYNAMIC_CORE += POWER8
# Any compiler other than GCC gets the POWER9 kernels unconditionally.
ifneq ($(C_COMPILER), GCC)
DYNAMIC_CORE += POWER9
endif
# With GCC, POWER9 is only added when GCCVERSIONGT5 is 1 (major version > 5).
ifeq ($(C_COMPILER), GCC)
ifeq ($(GCCVERSIONGT5), 1)
DYNAMIC_CORE += POWER9
else
# The function name must be followed by whitespace, not a comma: with
# "$(info, ...)" make expands an undefined variable and prints nothing.
$(info OpenBLAS: Your gcc version is too old to build the POWER9 kernels.)
endif
endif
endif
# If DYNAMIC_CORE is not set, DYNAMIC_ARCH cannot do anything, so force it to empty
ifndef DYNAMIC_CORE

View File

@ -206,6 +206,33 @@ void get_subdirname(void)
printf("arm64");
}
/*
 * Print "#define NUM_CORES <n>" on stdout, where <n> is the number of lines in
 * /proc/cpuinfo whose first nine characters are "processor".
 *
 * The body is compiled only when the `linux` macro is defined; otherwise the
 * function prints nothing. If /proc/cpuinfo cannot be opened, the function
 * also prints nothing instead of handing a NULL stream to fgets()/fclose().
 */
void get_cpucount(void)
{
#ifdef linux
	FILE *infile;
	char buffer[2048];
	int n = 0;

	infile = fopen("/proc/cpuinfo", "r");
	if (infile == NULL)
		return;

	/* Count the lines that begin with the literal "processor". */
	while (fgets(buffer, sizeof(buffer), infile))
	{
		if (!strncmp("processor", buffer, 9))
			n++;
	}

	fclose(infile);

	printf("#define NUM_CORES %d\n",n);
#endif
}
void get_cpuconfig(void)
{
@ -309,6 +336,7 @@ void get_cpuconfig(void)
printf("#define DTB_SIZE 4096 \n");
break;
}
get_cpucount();
}
@ -351,5 +379,3 @@ void get_features(void)
#endif
return;
}

View File

@ -3,7 +3,9 @@
extern gotoblas_t gotoblas_POWER6;
extern gotoblas_t gotoblas_POWER8;
#if (!defined C_GCC) || (GCC_VERSION >= 60000)
extern gotoblas_t gotoblas_POWER9;
#endif
extern void openblas_warning(int verbose, const char *msg);
@ -19,7 +21,9 @@ static char *corename[] = {
/*
 * Return the corename[] entry that corresponds to the kernel table currently
 * pointed to by `gotoblas`: index 1 for POWER6, 2 for POWER8, 3 for POWER9
 * (only when the POWER9 table is compiled in), and index 0 for anything else.
 */
char *gotoblas_corename(void) {
  int core_index = 0;

  if (gotoblas == &gotoblas_POWER6) {
    core_index = 1;
  } else if (gotoblas == &gotoblas_POWER8) {
    core_index = 2;
  }
#if (!defined C_GCC) || (GCC_VERSION >= 60000)
  /* The POWER9 table only exists for non-GCC compilers or GCC_VERSION >= 60000. */
  else if (gotoblas == &gotoblas_POWER9) {
    core_index = 3;
  }
#endif

  return corename[core_index];
}
@ -29,8 +33,10 @@ static gotoblas_t *get_coretype(void) {
return &gotoblas_POWER6;
if (__builtin_cpu_is("power8"))
return &gotoblas_POWER8;
#if (!defined C_GCC) || (GCC_VERSION >= 60000)
if (__builtin_cpu_is("power9"))
return &gotoblas_POWER9;
#endif
return NULL;
}
@ -53,7 +59,9 @@ static gotoblas_t *force_coretype(char * coretype) {
{
case 1: return (&gotoblas_POWER6);
case 2: return (&gotoblas_POWER8);
#if (!defined C_GCC) || (GCC_VERSION >= 60000)
case 3: return (&gotoblas_POWER9);
#endif
default: return NULL;
}
snprintf(message, 128, "Core not found: %s\n", coretype);

View File

@ -24,9 +24,11 @@ ifeq ($(TARGET), LOONGSON3B)
USE_TRMM = 1
endif
ifeq ($(CORE), GENERIC)
ifneq ($(DYNAMIC_ARCH), 1)
ifeq ($(TARGET), GENERIC)
USE_TRMM = 1
endif
endif
ifeq ($(CORE), HASWELL)
USE_TRMM = 1

View File

@ -89,14 +89,14 @@ ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
#SMINKERNEL = ../arm/min.c
#DMINKERNEL = ../arm/min.c
#
ISAMAXKERNEL = isamax.c
ISAMAXKERNEL = isamax_power8.S
IDAMAXKERNEL = idamax.c
ICAMAXKERNEL = icamax.c
ICAMAXKERNEL = icamax_power8.S
IZAMAXKERNEL = izamax.c
#
ISAMINKERNEL = isamin.c
ISAMINKERNEL = isamin_power8.S
IDAMINKERNEL = idamin.c
ICAMINKERNEL = icamin.c
ICAMINKERNEL = icamin_power8.S
IZAMINKERNEL = izamin.c
#
#ISMAXKERNEL = ../arm/imax.c
@ -112,7 +112,7 @@ ZASUMKERNEL = zasum.c
#
SAXPYKERNEL = saxpy.c
DAXPYKERNEL = daxpy.c
CAXPYKERNEL = caxpy.c
CAXPYKERNEL = caxpy_power8.S
ZAXPYKERNEL = zaxpy.c
#
SCOPYKERNEL = scopy.c

View File

@ -89,14 +89,14 @@ ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
#SMINKERNEL = ../arm/min.c
#DMINKERNEL = ../arm/min.c
#
ISAMAXKERNEL = isamax.c
ISAMAXKERNEL = isamax_power9.S
IDAMAXKERNEL = idamax.c
ICAMAXKERNEL = icamax.c
ICAMAXKERNEL = icamax_power9.S
IZAMAXKERNEL = izamax.c
#
ISAMINKERNEL = isamin.c
ISAMINKERNEL = isamin_power9.S
IDAMINKERNEL = idamin.c
ICAMINKERNEL = icamin.c
ICAMINKERNEL = icamin_power9.S
IZAMINKERNEL = izamin.c
#
#ISMAXKERNEL = ../arm/imax.c
@ -112,7 +112,7 @@ ZASUMKERNEL = zasum.c
#
SAXPYKERNEL = saxpy.c
DAXPYKERNEL = daxpy.c
CAXPYKERNEL = caxpy.c
CAXPYKERNEL = caxpy_power9.S
ZAXPYKERNEL = zaxpy.c
#
SCOPYKERNEL = scopy.c
@ -123,7 +123,7 @@ ZCOPYKERNEL = zcopy.c
SDOTKERNEL = sdot.c
DDOTKERNEL = ddot.c
DSDOTKERNEL = sdot.c
CDOTKERNEL = cdot.c
CDOTKERNEL = cdot_power9.S
ZDOTKERNEL = zdot.c
#
SNRM2KERNEL = ../arm/nrm2.c

574
kernel/power/caxpy_power8.S Normal file
View File

@ -0,0 +1,574 @@
#define ASSEMBLER
#include "common.h"
/*
.file "caxpy.c"
.abiversion 2
.section ".text"
.align 2
.p2align 4,,15
.globl caxpy_k
.type caxpy_k, @function
*/
PROLOGUE
caxpy_k:
.LCF0:
0: addis 2,12,.TOC.-.LCF0@ha
addi 2,2,.TOC.-.LCF0@l
.localentry caxpy_k,.-caxpy_k
mr. 7,3
ble 0,.L33
cmpdi 7,9,1
beq 7,.L41
.L3:
mtctr 7
ld 7,96(1)
sldi 9,9,3
sldi 7,7,3
.p2align 4,,15
/* Scalar loop: one pair of floats per CTR iteration.
 * Loads the pair at 0/4 from r8 and the pair at 0/4 from r10, updates the r10
 * pair with f1 and f2, then advances r8 by r9 and r10 by r7.
 * NOTE(review): presumably r8 = x, r10 = y, f1 = alpha_r, f2 = alpha_i and
 * r9/r7 are the byte strides - confirm against the caxpy_k prototype.
 *
 * Operand order reminder: fmadds T,A,C,B gives A*C + B, fmsubs gives A*C - B. */
.L14:
lfs 10,4(8)
lfs 11,0(8)
lfs 12,0(10)
lfs 0,4(10)
fmuls 10,2,10
/* First word: f11 = f11*f1 -/+ f2*f10. The non-CONJ form must subtract, as in
 * every other first-word update in this file; the two arms were swapped here. */
#ifdef CONJ
fmadds 11,11,1,10
#else
fmsubs 11,11,1,10
#endif
fadds 12,12,11
stfs 12,0(10)
lfs 11,0(8)
lfs 12,4(8)
add 8,8,9
fmuls 11,2,11
/* Second word: CONJ subtracts (f12*f1 - f2*f11), non-CONJ adds (f12*f1 + f2*f11). */
#ifdef CONJ
fmsubs 12,12,1,11
fsubs 0,0,12
#else
fmadds 12,12,1,11
fadds 0,0,12
#endif
stfs 0,4(10)
add 10,10,7
bdnz .L14
.L33:
li 3,0
blr
.p2align 4,,15
.L41:
ld 6,96(1)
cmpdi 7,6,1
bne 7,.L3
rldicr. 4,7,0,59
std 31,-8(1)
li 11,0
bne 0,.L42
.L4:
addi 6,11,8
subf 0,4,7
sldi 6,6,2
addi 9,6,-32
add 5,10,6
add 3,8,9
add 6,8,6
subfc 5,5,3
add 9,10,9
subfe 5,5,5
subfc 6,6,9
subfe 31,31,31
addi 6,5,1
addi 5,31,1
or 6,6,5
rlwinm 6,6,0,0xff
cmpwi 7,6,0
beq 7,.L7
sradi 6,4,63
srdi 5,7,63
subfc 31,7,4
adde 6,5,6
subfic 31,0,3
subfe 31,31,31
xori 6,6,0x1
neg 31,31
and 6,6,31
rlwinm 6,6,0,0xff
cmpwi 7,6,0
beq 7,.L7
cmpd 7,4,7
li 6,1
blt 7,.L43
.L9:
addi 0,7,-1
subf 0,4,0
subfic 0,0,3
subfe 31,31,31
addi 0,31,1
rlwinm 0,0,0,0xff
cmpwi 7,0,0
bne 7,.L10
sradi 0,4,63
subfc 31,7,4
adde 5,5,0
rlwinm 5,5,0,0xff
cmpwi 7,5,0
bne 7,.L10
addi 0,6,-1
addis 31,2,.LC3@toc@ha
std 30,-16(1)
xscvdpspn 12,1
xscvdpspn 11,2
srdi. 30,0,2
addis 6,2,.LC2@toc@ha
addi 6,6,.LC2@toc@l
mtctr 30
addi 31,31,.LC3@toc@l
lxvd2x 42,0,6
li 5,16
li 6,0
lxvd2x 41,0,31
xxspltw 12,12,0
xxspltw 11,11,0
xxpermdi 42,42,42,2
xxpermdi 41,41,41,2
beq 0,.L44
.p2align 4,,15
.L11:
#ifdef CONJ
lxvd2x 44,3,6
lxvd2x 45,3,5
lxvd2x 33,9,6
lxvd2x 0,9,5
xxpermdi 44,44,44,2
xxpermdi 45,45,45,2
xxpermdi 32,33,33,2
xxpermdi 33,0,0,2
vperm 11,13,12,10
vperm 13,13,12,9
vperm 12,1,0,10
vperm 1,1,0,9
xvmulsp 0,11,43
xvmulsp 32,11,45
xvmsubmsp 45,12,0
xvmaddasp 32,12,43
xvaddsp 44,32,44
xvsubsp 32,33,45
vmrglw 1,0,12
vmrghw 0,0,12
#else
lxvd2x 45,3,6
lxvd2x 33,3,5
lxvd2x 43,9,6
lxvd2x 0,9,5
xxpermdi 45,45,45,2
xxpermdi 33,33,33,2
xxpermdi 32,43,43,2
xxpermdi 43,0,0,2
vperm 12,1,13,10
vperm 1,1,13,9
vperm 13,11,0,10
vperm 11,11,0,9
xvmulsp 0,11,44
xvmulsp 32,11,33
xvmaddmsp 33,12,0
xvmsubasp 32,12,44
xvaddsp 45,32,45
xvaddsp 32,33,43
vmrglw 1,0,13
vmrghw 0,0,13
#endif
xxpermdi 0,33,33,2
xxpermdi 32,32,32,2
stxvd2x 0,9,6
addi 6,6,32
stxvd2x 32,9,5
addi 5,5,32
bdnz .L11
rldicr 0,0,0,61
ld 30,-16(1)
sldi 9,0,1
add 4,4,0
add 11,11,9
.L10:
sldi 6,11,2
addi 9,4,1
addi 5,6,4
cmpd 7,7,9
lfsx 12,8,6
lfsx 0,10,6
addi 9,11,2
lfsx 11,8,5
fmuls 11,2,11
#ifdef CONJ
fmadds 12,12,1,11
#else
fmsubs 12,12,1,11
#endif
fadds 0,0,12
stfsx 0,10,6
lfsx 11,8,6
lfsx 12,8,5
lfsx 0,10,5
fmuls 11,2,11
#ifdef CONJ
fmsubs 12,12,1,11
fsubs 0,0,12
#else
fmadds 12,12,1,11
fadds 0,0,12
#endif
stfsx 0,10,5
ble 7,.L39
/* Second leftover element of the unit-stride tail.
 * r9 (scaled by 4 here) and r5 = r9 + 4 index the two words of the element
 * relative to r8 and r10; cr7 records whether a further element remains.
 * NOTE(review): presumably r8 = x, r10 = y, f1 = alpha_r, f2 = alpha_i - confirm. */
sldi 9,9,2
addi 6,4,2
addi 5,9,4
cmpd 7,7,6
lfsx 12,8,9
lfsx 0,10,9
addi 6,11,4
lfsx 11,8,5
fmuls 11,2,11
/* First word: CONJ adds f2*f11, non-CONJ subtracts it. */
#ifdef CONJ
fmadds 12,1,12,11
#else
fmsubs 12,1,12,11
#endif
fadds 0,0,12
stfsx 0,10,9
lfsx 11,8,9
lfsx 12,8,5
lfsx 0,10,5
fmuls 11,2,11
/* Second word: the non-CONJ arm was missing, so non-CONJ builds ran the CONJ
 * arithmetic here. Now matches the neighbouring leftover-element blocks. */
#ifdef CONJ
fmsubs 12,1,12,11
fsubs 0,0,12
#else
fmadds 12,1,12,11
fadds 0,0,12
#endif
stfsx 0,10,5
ble 7,.L39
sldi 6,6,2
addi 4,4,3
addi 5,6,4
cmpd 7,7,4
lfsx 12,8,6
lfsx 0,10,6
addi 9,11,6
lfsx 11,8,5
fmuls 11,2,11
#ifdef CONJ
fmadds 12,1,12,11
#else
fmsubs 12,1,12,11
#endif
fadds 0,0,12
stfsx 0,10,6
lfsx 11,8,6
lfsx 12,8,5
lfsx 0,10,5
fmuls 11,2,11
#ifdef CONJ
fmsubs 12,1,12,11
fsubs 0,0,12
#else
fmadds 12,1,12,11
fadds 0,0,12
#endif
stfsx 0,10,5
ble 7,.L39
sldi 9,9,2
ld 31,-8(1)
addi 7,9,4
lfsx 12,8,9
lfsx 0,10,9
lfsx 11,8,7
fmuls 11,2,11
#ifdef CONJ
fmadds 12,1,12,11
#else
fmsubs 12,1,12,11
#endif
fadds 0,0,12
stfsx 0,10,9
lfsx 11,8,9
lfsx 12,8,7
lfsx 0,10,7
fmuls 2,2,11
#ifdef CONJ
fmsubs 1,1,12,2
fsubs 1,0,1
#else
fmadds 1,1,12,2
fadds 1,0,1
#endif
stfsx 1,10,7
b .L33
.L43:
mr 6,0
b .L9
.L7:
addi 10,4,1
cmpd 7,10,7
subf 10,4,7
mtctr 10
bgt 7,.L26
li 10,-1
rldicr 10,10,0,0
cmpd 7,7,10
beq 7,.L26
.p2align 4,,15
.L13:
lfs 10,4(3)
lfs 11,0(3)
addi 9,9,8
addi 3,3,8
lfs 12,-8(9)
lfs 0,-4(9)
fmuls 10,2,10
#ifdef CONJ
fmadds 11,1,11,10
#else
fmsubs 11,1,11,10
#endif
fadds 12,12,11
stfs 12,-8(9)
lfs 11,-8(3)
lfs 12,-4(3)
fmuls 11,2,11
#ifdef CONJ
fmsubs 12,1,12,11
fsubs 0,0,12
#else
fmadds 12,1,12,11
fadds 0,0,12
#endif
stfs 0,-4(9)
bdnz .L13
.L39:
ld 31,-8(1)
b .L33
.L42:
#ifdef CONJ
fneg 0,1
xxpermdi 32,1,1,0
addis 9,2,.LANCHOR0@toc@ha
std 28,-32(1)
sradi. 28,4,1
addi 9,9,.LANCHOR0@toc@l
xscvdpspn 5,2
xvcvdpsp 32,32
lxvd2x 12,0,9
xxpermdi 39,0,0,0
xxspltw 5,5,0
xvcvdpsp 39,39
#else
fneg 0,2
xxpermdi 39,2,2,0
addis 9,2,.LANCHOR0@toc@ha
std 28,-32(1)
sradi. 28,4,1
addi 9,9,.LANCHOR0@toc@l
xscvdpspn 5,1
xvcvdpsp 39,39
lxvd2x 12,0,9
xxpermdi 32,0,0,0
xxspltw 5,5,0
xvcvdpsp 32,32
#endif
xxpermdi 12,12,12,2
vmrgew 7,7,0
beq 0,.L5
xxlnor 38,12,12
std 29,-24(1)
std 30,-16(1)
mr 6,8
mr 9,10
li 29,0
li 30,16
li 31,32
li 12,48
li 0,64
li 11,80
li 3,96
li 5,112
.p2align 4,,15
.L6:
lxvd2x 6,0,9
lxvd2x 40,0,6
addi 29,29,8
lxvd2x 41,6,30
lxvd2x 42,6,31
cmpd 7,28,29
lxvd2x 43,6,12
lxvd2x 44,6,0
lxvd2x 45,6,11
lxvd2x 33,6,3
lxvd2x 32,6,5
lxvd2x 7,9,30
addi 6,6,128
lxvd2x 8,9,31
lxvd2x 9,9,12
xxpermdi 40,40,40,2
xxpermdi 6,6,6,2
lxvd2x 10,9,0
lxvd2x 11,9,11
xxpermdi 41,41,41,2
xxpermdi 42,42,42,2
lxvd2x 12,9,3
lxvd2x 0,9,5
xxpermdi 43,43,43,2
xxpermdi 44,44,44,2
xxpermdi 45,45,45,2
xxpermdi 33,33,33,2
xxpermdi 32,32,32,2
xxpermdi 7,7,7,2
xxpermdi 8,8,8,2
xxpermdi 9,9,9,2
xxpermdi 10,10,10,2
xxpermdi 11,11,11,2
xxpermdi 12,12,12,2
xxpermdi 0,0,0,2
#ifndef CONJ
xvmaddasp 6,5,40
xvmaddasp 7,5,41
xvmaddasp 8,5,42
xvmaddasp 9,5,43
xvmaddasp 10,5,44
xvmaddasp 11,5,45
xvmaddasp 12,5,33
xvmaddasp 0,5,32
vperm 8,8,8,6
vperm 9,9,9,6
vperm 10,10,10,6
vperm 11,11,11,6
vperm 12,12,12,6
vperm 13,13,13,6
vperm 1,1,1,6
vperm 0,0,0,6
#endif
xvmaddasp 6,39,40
xvmaddasp 7,39,41
xvmaddasp 8,39,42
xvmaddasp 9,39,43
xvmaddasp 10,39,44
xvmaddasp 11,39,45
xvmaddasp 12,39,33
xvmaddasp 0,39,32
#ifdef CONJ
vperm 8,8,8,6
vperm 9,9,9,6
vperm 10,10,10,6
vperm 11,11,11,6
vperm 12,12,12,6
vperm 13,13,13,6
vperm 1,1,1,6
vperm 0,0,0,6
xvmaddasp 6,5,40
xvmaddasp 7,5,41
xvmaddasp 8,5,42
xvmaddasp 9,5,43
xvmaddasp 10,5,44
xvmaddasp 11,5,45
xvmaddasp 12,5,33
xvmaddasp 0,5,32
#endif
xxpermdi 6,6,6,2
xxpermdi 7,7,7,2
xxpermdi 8,8,8,2
xxpermdi 9,9,9,2
stxvd2x 6,0,9
xxpermdi 10,10,10,2
stxvd2x 7,9,30
xxpermdi 11,11,11,2
stxvd2x 8,9,31
xxpermdi 12,12,12,2
stxvd2x 9,9,12
xxpermdi 0,0,0,2
stxvd2x 10,9,0
stxvd2x 11,9,11
stxvd2x 12,9,3
stxvd2x 0,9,5
addi 9,9,128
bgt 7,.L6
ld 29,-24(1)
ld 30,-16(1)
.L5:
cmpd 7,7,4
ble 7,.L36
sldi 11,4,1
ld 28,-32(1)
b .L4
.L36:
ld 28,-32(1)
ld 31,-8(1)
b .L33
.L44:
li 31,1
mtctr 31
b .L11
.L26:
li 10,1
mtctr 10
b .L13
.long 0
.byte 0,0,0,0,0,4,0,0
.size caxpy_k,.-caxpy_k
.section .rodata
.align 4
.set .LANCHOR0,. + 0
.type swap_mask_arr, @object
.size swap_mask_arr, 16
swap_mask_arr:
.byte 4
.byte 5
.byte 6
.byte 7
.byte 0
.byte 1
.byte 2
.byte 3
.byte 12
.byte 13
.byte 14
.byte 15
.byte 8
.byte 9
.byte 10
.byte 11
.section .rodata.cst16,"aM",@progbits,16
.align 4
.LC2:
.byte 31
.byte 30
.byte 29
.byte 28
.byte 23
.byte 22
.byte 21
.byte 20
.byte 15
.byte 14
.byte 13
.byte 12
.byte 7
.byte 6
.byte 5
.byte 4
.LC3:
.byte 27
.byte 26
.byte 25
.byte 24
.byte 19
.byte 18
.byte 17
.byte 16
.byte 11
.byte 10
.byte 9
.byte 8
.byte 3
.byte 2
.byte 1
.byte 0
.ident "GCC: (SUSE Linux) 7.3.1 20180323 [gcc-7-branch revision 258812]"
.gnu_attribute 4, 1
.section .note.GNU-stack,"",@progbits

538
kernel/power/caxpy_power9.S Normal file
View File

@ -0,0 +1,538 @@
#define ASSEMBLER
#include "common.h"
/*
.file "caxpy.c"
.abiversion 2
.section ".text"
.align 2
.p2align 4,,15
.globl caxpy_k
.type caxpy_k, @function
*/
PROLOGUE
caxpy_k:
.LCF0:
0: addis 2,12,.TOC.-.LCF0@ha
addi 2,2,.TOC.-.LCF0@l
.localentry caxpy_k,.-caxpy_k
mr. 7,3
ble 0,.L33
cmpdi 7,9,1
beq 7,.L37
.L3:
mtctr 7
ld 7,96(1)
sldi 9,9,3
sldi 7,7,3
.p2align 4,,15
/* Scalar loop: one pair of floats per CTR iteration.
 * Loads the pair at 0/4 from r8 and the pair at 0/4 from r10, updates the r10
 * pair with f1 and f2, then advances r8 by r9 and r10 by r7.
 * NOTE(review): presumably r8 = x, r10 = y, f1 = alpha_r, f2 = alpha_i and
 * r9/r7 are the byte strides - confirm against the caxpy_k prototype.
 * Operand order reminder: fmadds T,A,C,B gives A*C + B, fmsubs gives A*C - B. */
.L14:
lfs 10,4(8)
lfs 11,0(8)
lfs 12,0(10)
lfs 0,4(10)
fmuls 10,2,10
/* First word: f11 = f11*f1 + f2*f10 (CONJ) or f11*f1 - f2*f10 (non-CONJ). */
#ifdef CONJ
fmadds 11,11,1,10
#else
fmsubs 11,11,1,10
#endif
fadds 12,12,11
stfs 12,0(10)
/* Reload both source words; the first word of the destination pair has already been stored. */
lfs 11,0(8)
lfs 12,4(8)
add 8,8,9
fmuls 11,2,11
/* Second word: CONJ subtracts (f12*f1 - f2*f11), non-CONJ adds (f12*f1 + f2*f11). */
#ifdef CONJ
fmsubs 12,12,1,11
fsubs 0,0,12
#else
fmadds 12,12,1,11
fadds 0,0,12
#endif
stfs 0,4(10)
add 10,10,7
bdnz .L14
.L33:
li 3,0
blr
.p2align 4,,15
.L37:
ld 6,96(1)
cmpdi 7,6,1
bne 7,.L3
rldicr. 4,7,0,59
li 11,0
bne 0,.L38
.L4:
addi 6,11,8
subf 0,4,7
sldi 6,6,2
addi 9,6,-32
add 5,10,6
add 6,8,6
add 3,8,9
add 9,10,9
subfc 5,5,3
subfe 5,5,5
subfc 6,6,9
subfe 12,12,12
addi 6,5,1
addi 5,12,1
or 6,6,5
rlwinm 6,6,0,0xff
cmpwi 7,6,0
beq 7,.L7
sradi 6,4,63
srdi 5,7,63
subfc 12,7,4
adde 6,5,6
subfic 12,0,4
subfe 12,12,12
xori 6,6,0x1
neg 12,12
and 6,6,12
rlwinm 6,6,0,0xff
cmpwi 7,6,0
beq 7,.L7
cmpd 7,4,7
li 6,1
blt 7,.L39
.L9:
addi 0,7,-1
subf 0,4,0
subfic 0,0,3
subfe 12,12,12
addi 0,12,1
rlwinm 0,0,0,0xff
cmpwi 7,0,0
bne 7,.L10
sradi 0,4,63
subfc 12,7,4
adde 5,5,0
rlwinm 5,5,0,0xff
cmpwi 7,5,0
bne 7,.L10
xscvdpspn 0,1
xscvdpspn 12,2
addi 0,6,-1
std 31,-8(1)
addis 12,2,.LC2@toc@ha
addis 6,2,.LC3@toc@ha
li 5,16
srdi. 31,0,2
addi 6,6,.LC3@toc@l
addi 12,12,.LC2@toc@l
mtctr 31
lxv 41,0(6)
lxv 42,0(12)
li 6,0
xxspltw 0,0,0
xxspltw 12,12,0
beq 0,.L40
.p2align 4,,15
.L11:
#ifdef CONJ
lxvx 33,3,5
lxvx 44,3,6
lxvx 43,9,6
lxvx 32,9,5
vperm 13,1,12,10
vperm 12,1,12,9
vperm 8,0,11,10
vperm 0,0,11,9
xvmulsp 33,12,44
xvmulsp 11,12,45
xvmaddasp 33,0,45
xvmsubmsp 44,0,11
xvaddsp 33,33,40
xvsubsp 32,32,44
#else
lxvx 33,3,6
lxvx 32,3,5
lxvx 43,9,6
lxvx 44,9,5
vperm 13,0,1,10
vperm 0,0,1,9
vperm 8,12,11,10
vperm 12,12,11,9
xvmulsp 33,12,32
xvmulsp 11,12,45
xvmsubasp 33,0,45
xvmaddmsp 32,0,11
xvaddsp 33,33,40
xvaddsp 32,32,44
#endif
vmrglw 13,0,1
vmrghw 0,0,1
stxvx 45,9,6
stxvx 32,9,5
addi 6,6,32
addi 5,5,32
bdnz .L11
rldicr 0,0,0,61
ld 31,-8(1)
sldi 9,0,1
add 4,4,0
add 11,11,9
.L10:
sldi 5,11,2
addi 6,4,1
addi 9,11,2
addi 3,5,4
lfsx 12,8,5
cmpd 7,7,6
lfsx 0,10,5
lfsx 11,8,3
fmuls 11,2,11
#ifdef CONJ
fmadds 12,12,1,11
#else
fmsubs 12,12,1,11
#endif
fadds 0,0,12
stfsx 0,10,5
lfsx 11,8,5
lfsx 12,8,3
lfsx 0,10,3
fmuls 11,2,11
#ifdef CONJ
fmsubs 12,12,1,11
fsubs 0,0,12
#else
fmadds 12,12,1,11
fadds 0,0,12
#endif
stfsx 0,10,3
ble 7,.L33
sldi 9,9,2
addi 5,4,2
addi 6,11,4
addi 3,9,4
lfsx 12,8,9
cmpd 7,7,5
lfsx 0,10,9
lfsx 11,8,3
fmuls 11,2,11
#ifdef CONJ
fmadds 12,1,12,11
#else
fmsubs 12,1,12,11
#endif
fadds 0,0,12
stfsx 0,10,9
lfsx 11,8,9
lfsx 12,8,3
lfsx 0,10,3
fmuls 11,2,11
#ifdef CONJ
fmsubs 12,1,12,11
fsubs 0,0,12
#else
fmadds 12,1,12,11
fadds 0,0,12
#endif
stfsx 0,10,3
ble 7,.L33
sldi 6,6,2
addi 4,4,3
addi 9,11,6
addi 5,6,4
lfsx 12,8,6
cmpd 7,7,4
lfsx 0,10,6
lfsx 11,8,5
fmuls 11,2,11
#ifdef CONJ
fmadds 12,1,12,11
#else
fmsubs 12,1,12,11
#endif
fadds 0,0,12
stfsx 0,10,6
lfsx 11,8,6
lfsx 12,8,5
lfsx 0,10,5
fmuls 11,2,11
#ifdef CONJ
fmsubs 12,1,12,11
fsubs 0,0,12
#else
fmadds 12,1,12,11
fadds 0,0,12
#endif
stfsx 0,10,5
ble 7,.L33
sldi 9,9,2
addi 7,9,4
lfsx 12,8,9
lfsx 0,10,9
lfsx 11,8,7
fmuls 11,2,11
#ifdef CONJ
fmadds 12,1,12,11
#else
fmsubs 12,1,12,11
#endif
fadds 0,0,12
stfsx 0,10,9
lfsx 11,8,9
lfsx 12,8,7
lfsx 0,10,7
fmuls 2,2,11
#ifdef CONJ
fmsubs 1,1,12,2
fsubs 1,0,1
#else
fmadds 1,1,12,2
fadds 1,0,1
#endif
stfsx 1,10,7
b .L33
.L39:
mr 6,0
b .L9
.L38:
#ifdef CONJ
fneg 0,1
xxpermdi 45,1,1,0
xscvdpspn 12,2
addis 9,2,.LANCHOR0@toc@ha
sradi. 3,4,1
xxpermdi 44,0,0,0
addi 9,9,.LANCHOR0@toc@l
xvcvdpsp 45,45
lxv 33,0(9)
xvcvdpsp 32,44
xxspltw 12,12,0
#else
fneg 12,2
xxpermdi 32,2,2,0
xscvdpspn 0,1
addis 9,2,.LANCHOR0@toc@ha
sradi. 3,4,1
xxpermdi 45,12,12,0
addi 9,9,.LANCHOR0@toc@l
xvcvdpsp 32,32
lxv 33,0(9)
xvcvdpsp 45,45
xxspltw 0,0,0
#endif
vmrgew 0,0,13
beq 0,.L5
mr 6,8
mr 9,10
li 5,0
.p2align 4,,15
.L6:
lxv 38,16(6)
lxv 11,16(9)
addi 5,5,8
addi 6,6,128
addi 9,9,128
lxv 39,-96(6)
lxv 40,-80(6)
lxv 41,-64(6)
lxv 42,-48(6)
cmpd 7,3,5
lxv 43,-32(6)
lxv 45,-128(6)
lxv 44,-16(6)
#ifdef CONJ
lxv 0,-128(9)
vpermr 17,6,6,1
xvmaddmsp 38,32,11
lxv 11,-96(9)
vpermr 18,7,7,1
vpermr 19,8,8,1
vpermr 2,9,9,1
vpermr 3,10,10,1
vpermr 4,11,11,1
xvmaddasp 0,32,45
vpermr 5,12,12,1
xvmaddmsp 39,32,11
lxv 11,-80(9)
vpermr 13,13,13,1
xvmaddasp 38,12,49
xvmaddmsp 40,32,11
lxv 11,-64(9)
xvmaddmsp 45,12,0
xvmaddasp 39,12,50
stxv 38,-112(9)
xvmaddmsp 41,32,11
lxv 11,-48(9)
xvmaddasp 40,12,51
stxv 45,-128(9)
stxv 39,-96(9)
xvmaddmsp 42,32,11
lxv 11,-32(9)
xvmaddasp 41,12,34
stxv 40,-80(9)
xvmaddmsp 43,32,11
lxv 11,-16(9)
xvmaddasp 42,12,35
stxv 41,-64(9)
xvmaddmsp 44,32,11
xvmaddasp 43,12,36
stxv 42,-48(9)
xvmaddasp 44,12,37
#else
lxv 12,-128(9)
vpermr 17,6,6,1
xvmaddmsp 38,0,11
lxv 11,-96(9)
vpermr 18,7,7,1
vpermr 19,8,8,1
vpermr 2,9,9,1
vpermr 3,10,10,1
vpermr 4,11,11,1
xvmaddasp 12,0,45
vpermr 5,12,12,1
xvmaddmsp 39,0,11
lxv 11,-80(9)
vpermr 13,13,13,1
xvmaddasp 38,32,49
xvmaddmsp 40,0,11
lxv 11,-64(9)
xvmaddmsp 45,32,12
xvmaddasp 39,32,50
stxv 38,-112(9)
xvmaddmsp 41,0,11
lxv 11,-48(9)
xvmaddasp 40,32,51
stxv 45,-128(9)
stxv 39,-96(9)
xvmaddmsp 42,0,11
lxv 11,-32(9)
xvmaddasp 41,32,34
stxv 40,-80(9)
xvmaddmsp 43,0,11
lxv 11,-16(9)
xvmaddasp 42,32,35
stxv 41,-64(9)
xvmaddmsp 44,0,11
xvmaddasp 43,32,36
stxv 42,-48(9)
xvmaddasp 44,32,37
#endif
stxv 43,-32(9)
stxv 44,-16(9)
bgt 7,.L6
.L5:
cmpd 7,7,4
ble 7,.L33
sldi 11,4,1
b .L4
.L7:
addi 10,4,1
subf 8,4,7
cmpd 7,10,7
mtctr 8
bgt 7,.L26
li 10,-1
rldicr 10,10,0,0
cmpd 7,7,10
beq 7,.L26
.p2align 4,,15
.L13:
lfs 10,4(3)
lfs 11,0(3)
lfs 12,0(9)
lfs 0,4(9)
addi 3,3,8
addi 9,9,8
fmuls 10,2,10
#ifdef CONJ
fmadds 11,1,11,10
#else
fmsubs 11,1,11,10
#endif
fadds 12,12,11
stfs 12,-8(9)
lfs 11,-8(3)
lfs 12,-4(3)
fmuls 11,2,11
#ifdef CONJ
fmsubs 12,1,12,11
fsubs 0,0,12
#else
fmadds 12,1,12,11
fadds 0,0,12
#endif
stfs 0,-4(9)
bdnz .L13
b .L33
.L40:
li 31,1
mtctr 31
b .L11
.L26:
li 10,1
mtctr 10
b .L13
.long 0
.byte 0,0,0,0,0,1,0,0
.size caxpy_k,.-caxpy_k
.section .rodata
.align 4
.set .LANCHOR0,. + 0
.type swap_mask_arr, @object
.size swap_mask_arr, 16
swap_mask_arr:
.byte 4
.byte 5
.byte 6
.byte 7
.byte 0
.byte 1
.byte 2
.byte 3
.byte 12
.byte 13
.byte 14
.byte 15
.byte 8
.byte 9
.byte 10
.byte 11
.section .rodata.cst16,"aM",@progbits,16
.align 4
.LC2:
.byte 31
.byte 30
.byte 29
.byte 28
.byte 23
.byte 22
.byte 21
.byte 20
.byte 15
.byte 14
.byte 13
.byte 12
.byte 7
.byte 6
.byte 5
.byte 4
.LC3:
.byte 27
.byte 26
.byte 25
.byte 24
.byte 19
.byte 18
.byte 17
.byte 16
.byte 11
.byte 10
.byte 9
.byte 8
.byte 3
.byte 2
.byte 1
.byte 0
.ident "GCC: (SUSE Linux) 7.3.1 20180323 [gcc-7-branch revision 258812]"
.gnu_attribute 4, 1
.section .note.GNU-stack,"",@progbits

242
kernel/power/cdot_power9.S Normal file
View File

@ -0,0 +1,242 @@
.file "cdot.c"
.abiversion 2
.section ".text"
.align 2
.p2align 4,,15
.globl cdot_k
.type cdot_k, @function
cdot_k:
.LCF0:
0: addis 2,12,.TOC.-.LCF0@ha
addi 2,2,.TOC.-.LCF0@l
.localentry cdot_k,.-cdot_k
mr. 9,3
ble 0,.L10
cmpdi 7,5,1
beq 7,.L18
.L3:
mtctr 9
xxlxor 2,2,2
sldi 5,5,3
sldi 7,7,3
#ifdef CONJ
fmr 12,2
#endif
fmr 8,2
#ifndef CONJ
fmr 9,2
#endif
fmr 1,2
.p2align 4,,15
/* Scalar strided loop: per CTR iteration, loads the float pairs at 0/4 from r4
 * and from r6, advances r4 by r5 and r6 by r7, and accumulates the four
 * cross products into separate running sums that are combined after the loop.
 * NOTE(review): presumably r4 = x and r6 = y - confirm against the cdot_k prototype. */
.L9:
#ifdef CONJ
/* Sums: f1 += [r4+0]*[r6+0], f12 += [r4+0]*[r6+4],
 *       f8 += [r4+4]*[r6+4], f2  += [r6+0]*[r4+4]. */
lfs 9,0(4)
lfs 11,0(6)
lfs 10,4(6)
lfs 0,4(4)
add 6,6,7
add 4,4,5
fmadds 1,9,11,1
fmadds 12,9,10,12
fmadds 8,0,10,8
fmadds 2,11,0,2
#else
/* Sums: f1 += [r4+0]*[r6+0], f8 += [r4+0]*[r6+4],
 *       f9 += [r4+4]*[r6+4], f2 += [r6+0]*[r4+4]. */
lfs 10,0(4)
lfs 12,0(6)
lfs 11,4(6)
lfs 0,4(4)
add 6,6,7
add 4,4,5
fmadds 1,10,12,1
fmadds 8,10,11,8
fmadds 9,0,11,9
fmadds 2,12,0,2
#endif
bdnz .L9
.L7:
#ifdef CONJ
fsubs 2,12,2
fadds 1,1,8
#else
fadds 2,2,8
fsubs 1,1,9
#endif
blr
.p2align 4,,15
.L18:
cmpdi 7,7,1
bne 7,.L3
rldicr. 10,9,0,60
bne 0,.L19
xxlxor 2,2,2
li 8,0
#ifdef CONJ
fmr 12,2
#endif
fmr 8,2
#ifndef CONJ
fmr 9,2
#endif
fmr 1,2
.L4:
addi 7,10,1
sldi 8,8,2
subf 10,10,9
cmpd 7,7,9
mtctr 10
add 4,4,8
add 6,6,8
bgt 7,.L16
li 10,-1
rldicr 10,10,0,0
cmpd 7,9,10
beq 7,.L16
.p2align 4,,15
.L8:
#ifdef CONJ
lfs 9,0(4)
lfs 11,0(6)
lfs 10,4(6)
lfs 0,4(4)
addi 6,6,8
addi 4,4,8
fmadds 1,9,11,1
fmadds 12,9,10,12
fmadds 8,0,10,8
fmadds 2,11,0,2
#else
lfs 10,0(4)
lfs 12,0(6)
lfs 11,4(6)
lfs 0,4(4)
addi 6,6,8
addi 4,4,8
fmadds 1,10,12,1
fmadds 8,10,11,8
fmadds 9,0,11,9
fmadds 2,12,0,2
#endif
bdnz .L8
b .L7
.p2align 4,,15
.L10:
xxlxor 1,1,1
fmr 2,1
blr
.L19:
addis 8,2,.LANCHOR0@toc@ha
sradi. 3,10,1
xxspltib 42,0
addi 8,8,.LANCHOR0@toc@l
lxv 32,0(8)
beq 0,.L12
xxlor 6,42,42
xxlor 4,42,42
xxlor 0,42,42
xxlor 7,42,42
xxlor 5,42,42
xxlor 3,42,42
xxlor 12,42,42
mr 7,4
mr 8,6
li 5,0
.p2align 4,,15
.L6:
lxv 43,0(8)
lxv 44,16(8)
addi 5,5,4
addi 8,8,64
addi 7,7,64
lxv 45,-32(8)
lxv 33,-16(8)
lxv 8,-64(7)
lxv 9,-48(7)
cmpd 7,3,5
lxv 10,-32(7)
lxv 11,-16(7)
vpermr 6,11,11,0
vpermr 7,12,12,0
vpermr 8,13,13,0
vpermr 9,1,1,0
xvmaddasp 12,43,8
xvmaddasp 3,44,9
xvmaddasp 0,8,38
xvmaddasp 4,9,39
xvmaddasp 6,10,40
xvmaddasp 5,45,10
xvmaddasp 42,11,41
xvmaddasp 7,33,11
bgt 7,.L6
xvaddsp 12,12,3
xvaddsp 0,0,4
xvaddsp 12,12,5
xvaddsp 0,0,6
xvaddsp 12,12,7
xvaddsp 42,0,42
.L5:
#ifdef CONJ
xxpermdi 8,12,12,2
xxpermdi 0,42,42,2
cmpd 7,9,10
sldi 8,10,1
xvaddsp 8,8,12
xvaddsp 0,0,42
xxsldwi 1,8,8,3
xxsldwi 12,0,0,3
xxsldwi 8,8,8,2
xxsldwi 0,0,0,2
xscvspdp 1,1
xscvspdp 12,12
xscvspdp 8,8
#else
xxpermdi 9,12,12,2
xxpermdi 0,42,42,2
cmpd 7,9,10
sldi 8,10,1
xvaddsp 9,9,12
xvaddsp 0,0,42
xxsldwi 1,9,9,3
xxsldwi 2,0,0,3
xxsldwi 9,9,9,2
xxsldwi 0,0,0,2
xscvspdp 8,2
xscvspdp 1,1
xscvspdp 9,9
#endif
xscvspdp 2,0
bgt 7,.L4
b .L7
.L12:
xxlor 12,42,42
b .L5
.L16:
li 9,1
mtctr 9
b .L8
.long 0
.byte 0,0,0,0,0,0,0,0
.size cdot_k,.-cdot_k
.section .rodata
.align 4
.set .LANCHOR0,. + 0
.type swap_mask_arr, @object
.size swap_mask_arr, 16
swap_mask_arr:
.byte 4
.byte 5
.byte 6
.byte 7
.byte 0
.byte 1
.byte 2
.byte 3
.byte 12
.byte 13
.byte 14
.byte 15
.byte 8
.byte 9
.byte 10
.byte 11
.ident "GCC: (SUSE Linux) 7.3.1 20180323 [gcc-7-branch revision 258812]"
.section .note.GNU-stack,"",@progbits

View File

@ -0,0 +1,458 @@
/* .file "icamax.c"
.abiversion 2
.section ".text"
.align 2
.p2align 4,,15
.globl icamax_k
.type icamax_k, @function
*/
#define ASSEMBLER
#include "common.h"
PROLOGUE
icamax_k:
.LCF0:
0: addis 2,12,.TOC.-.LCF0@ha
addi 2,2,.TOC.-.LCF0@l
.localentry icamax_k,.-icamax_k
mr. 9,3
ble 0,.L25
cmpdi 7,5,0
li 3,0
blelr 7
cmpdi 7,5,1
beq 7,.L54
lfs 11,0(4)
lfs 0,4(4)
cmpdi 7,9,1
fabs 11,11
fabs 0,0
fadds 11,11,0
beq 7,.L29
addi 9,9,-1
sldi 5,5,3
mtctr 9
add 4,4,5
li 3,0
li 9,1
.p2align 4,,15
.L24:
lfs 0,4(4)
lfs 12,0(4)
add 4,4,5
fabs 0,0
fabs 12,12
fadds 0,0,12
fcmpu 7,0,11
bng 7,.L23
fmr 11,0
mr 3,9
.L23:
addi 9,9,1
bdnz .L24
.L52:
addi 3,3,1
blr
.p2align 4,,15
.L25:
li 3,0
blr
.p2align 4,,15
.L54:
rldicr. 8,9,0,58
bne 0,.L55
addi 7,8,1
li 10,0
xxlxor 11,11,11
cmpd 7,7,9
sldi 10,10,2
add 4,4,10
subf 10,8,9
mtctr 10
li 3,0
bgt 7,.L43
li 10,-1
rldicr 10,10,0,0
cmpd 7,9,10
beq 7,.L43
.p2align 4,,15
.L44:
lfs 0,4(4)
lfs 12,0(4)
addi 4,4,8
fabs 0,0
fabs 12,12
fadds 0,0,12
fcmpu 7,0,11
bng 7,.L46
fmr 11,0
mr 3,8
.L46:
addi 8,8,1
bdnz .L44
b .L52
.p2align 4,,15
.L55:
li 0,-144
std 31,-8(1)
addis 5,2,.LC2@toc@ha
vspltisw 18,0
vspltisw 19,0
addis 6,2,.LC3@toc@ha
addi 5,5,.LC2@toc@l
stvx 24,1,0
li 0,-128
addi 6,6,.LC3@toc@l
xxlor 49,50,50
addis 7,2,.LC4@toc@ha
lxvd2x 44,0,5
addis 10,2,.LC5@toc@ha
stvx 25,1,0
li 0,-112
addi 7,7,.LC4@toc@l
lxvd2x 45,0,6
addis 5,2,.LC6@toc@ha
addis 6,2,.LC7@toc@ha
stvx 26,1,0
li 0,-96
addi 10,10,.LC5@toc@l
addi 6,6,.LC7@toc@l
addi 5,5,.LC6@toc@l
stvx 27,1,0
li 0,-80
lxvd2x 46,0,10
xxpermdi 44,44,44,2
mr 10,4
lxvd2x 48,0,6
lxvd2x 47,0,5
xxpermdi 45,45,45,2
li 6,0
stvx 28,1,0
li 0,-64
xxlnand 44,44,44
xxlnand 45,45,45
stvx 29,1,0
li 0,-48
vspltisw 29,8
vadduwm 29,29,29
xxpermdi 46,46,46,2
stvx 30,1,0
li 0,-32
xxpermdi 47,47,47,2
xxpermdi 48,48,48,2
stvx 31,1,0
lxvd2x 63,0,7
addis 7,2,.LC8@toc@ha
addi 7,7,.LC8@toc@l
lxvd2x 62,0,7
xxpermdi 63,63,63,2
.p2align 4,,15
.L5:
addi 3,10,16
addi 5,10,32
lxvd2x 34,0,10
addi 7,10,64
addi 31,10,48
addi 12,10,80
addi 11,10,96
lxvd2x 36,0,3
lxvd2x 37,0,5
addi 3,10,112
addi 5,10,128
lxvd2x 38,0,7
lxvd2x 7,0,31
addi 7,10,160
addi 31,10,144
lxvd2x 33,0,12
lxvd2x 39,0,11
addi 12,10,176
addi 11,10,192
lxvd2x 8,0,3
lxvd2x 40,0,5
xxpermdi 34,34,34,2
addi 3,10,208
addi 5,10,224
lxvd2x 41,0,7
lxvd2x 9,0,31
addi 7,10,240
lxvd2x 10,0,12
lxvd2x 42,0,11
xxpermdi 37,37,37,2
xxpermdi 36,36,36,2
addi 6,6,32
lxvd2x 32,0,3
lxvd2x 43,0,5
xxpermdi 7,7,7,2
xxpermdi 38,38,38,2
cmpd 7,8,6
addi 10,10,256
lxvd2x 11,0,7
xxpermdi 39,39,39,2
xxpermdi 33,33,33,2
xxpermdi 40,40,40,2
xxpermdi 8,8,8,2
xxpermdi 41,41,41,2
xxpermdi 9,9,9,2
xxpermdi 10,10,10,2
xxpermdi 42,42,42,2
xxpermdi 43,43,43,2
xxpermdi 32,32,32,2
xxpermdi 11,11,11,2
xvabssp 57,37
xvabssp 58,39
xvabssp 35,40
xvabssp 59,41
xvabssp 34,34
xvabssp 33,33
xvabssp 32,32
xvabssp 60,43
xvabssp 36,36
xvabssp 37,7
xvabssp 38,38
xvabssp 39,8
xvabssp 40,9
xvabssp 41,10
xvabssp 42,42
xvabssp 43,11
vperm 24,4,2,12
vperm 4,4,2,13
vperm 2,5,25,12
vperm 5,5,25,13
vperm 25,1,6,12
vperm 6,1,6,13
vperm 1,7,26,12
vperm 7,7,26,13
vperm 26,8,3,12
vperm 8,8,3,13
vperm 3,9,27,12
vperm 9,9,27,13
vperm 27,0,10,12
vperm 10,0,10,13
vperm 0,11,28,12
vperm 11,11,28,13
xvaddsp 12,33,39
xvaddsp 38,57,38
xvaddsp 0,32,43
xvaddsp 42,59,42
xvaddsp 36,56,36
xvaddsp 37,34,37
xvaddsp 40,58,40
xvaddsp 41,35,41
xvcmpgtsp 32,12,38
xvcmpgtsp 33,0,42
xvcmpgtsp 43,37,36
xvcmpgtsp 39,41,40
xxsel 12,38,12,32
xxsel 38,47,48,32
xxsel 0,42,0,33
xxsel 42,47,48,33
xxsel 37,36,37,43
xxsel 43,63,46,43
xxsel 41,40,41,39
xxsel 39,63,46,39
xvcmpgtsp 32,12,37
xvcmpgtsp 33,0,41
xxsel 12,37,12,32
xxsel 43,43,38,32
xxsel 0,41,0,33
xxsel 33,39,42,33
xvcmpgtsp 32,0,12
vadduwm 1,1,29
xxsel 0,12,0,32
xxsel 32,43,33,32
xvcmpgtsp 33,0,51
vadduwm 0,17,0
vadduwm 17,17,30
xxsel 50,50,32,33
xxsel 51,51,0,33
bgt 7,.L5
xxsldwi 11,51,51,3
xxsldwi 12,51,51,2
vspltw 0,18,3
xxsldwi 0,51,51,1
xscvspdp 11,11
xscvspdp 12,12
mfvsrwz 6,32
vspltw 0,18,2
xscvspdp 0,0
mfvsrwz 7,50
mfvsrwz 5,32
vspltw 0,18,0
xscvspdp 51,51
mfvsrwz 10,32
fcmpu 7,11,12
rldicl 3,6,0,32
fmr 10,0
rldicl 11,7,0,32
rldicl 31,5,0,32
rldicl 0,10,0,32
beq 7,.L56
bnl 7,.L8
fmr 11,12
mr 3,31
.L8:
xscmpudp 7,0,51
bne 7,.L11
cmplw 7,7,10
ble 7,.L12
mr 7,10
.L12:
rldicl 11,7,0,32
.L13:
fcmpu 7,11,10
beq 7,.L57
blt 7,.L58
.L17:
cmpd 7,9,8
ble 7,.L19
addi 7,8,1
sldi 10,8,1
cmpd 7,7,9
sldi 10,10,2
add 4,4,10
subf 10,8,9
mtctr 10
bgt 7,.L37
li 10,-1
rldicr 10,10,0,0
cmpd 7,9,10
beq 7,.L37
.p2align 4,,15
.L21:
lfs 0,4(4)
lfs 12,0(4)
addi 4,4,8
fabs 0,0
fabs 12,12
fadds 0,0,12
fcmpu 7,0,11
bng 7,.L20
fmr 11,0
mr 3,8
.L20:
addi 8,8,1
bdnz .L21
.L19:
li 0,-144
ld 31,-8(1)
addi 3,3,1
lvx 24,1,0
li 0,-128
lvx 25,1,0
li 0,-112
lvx 26,1,0
li 0,-96
lvx 27,1,0
li 0,-80
lvx 28,1,0
li 0,-64
lvx 29,1,0
li 0,-48
lvx 30,1,0
li 0,-32
lvx 31,1,0
blr
.p2align 4,,15
.L56:
cmplw 7,6,5
ble 7,.L7
mr 6,5
.L7:
rldicl 3,6,0,32
b .L8
.p2align 4,,15
.L29:
li 3,1
blr
.p2align 4,,15
.L11:
bnl 7,.L13
xscpsgndp 10,51,51
mr 11,0
b .L13
.p2align 4,,15
.L57:
cmpd 7,3,11
ble 7,.L17
mr 3,11
b .L17
.p2align 4,,15
.L58:
fmr 11,10
mr 3,11
b .L17
.L43:
li 9,1
mtctr 9
b .L44
.L37:
li 9,1
mtctr 9
b .L21
.long 0
.byte 0,0,0,0,0,1,0,0
.size icamax_k,.-icamax_k
.section .rodata.cst16,"aM",@progbits,16
.align 4
.LC2:
.byte 0
.byte 1
.byte 2
.byte 3
.byte 8
.byte 9
.byte 10
.byte 11
.byte 16
.byte 17
.byte 18
.byte 19
.byte 24
.byte 25
.byte 26
.byte 27
.LC3:
.byte 4
.byte 5
.byte 6
.byte 7
.byte 12
.byte 13
.byte 14
.byte 15
.byte 20
.byte 21
.byte 22
.byte 23
.byte 28
.byte 29
.byte 30
.byte 31
.LC4:
.long 0
.long 1
.long 2
.long 3
.LC5:
.long 4
.long 5
.long 6
.long 7
.LC6:
.long 8
.long 9
.long 10
.long 11
.LC7:
.long 12
.long 13
.long 14
.long 15
.LC8:
.long 32
.long 32
.long 32
.long 32
.ident "GCC: (SUSE Linux) 7.3.1 20180323 [gcc-7-branch revision 258812]"
.section .note.GNU-stack,"",@progbits

View File

@ -0,0 +1,387 @@
.file "icamax.c"
.abiversion 2
.section ".text"
.align 2
.p2align 4,,15
.globl icamax_k
.type icamax_k, @function
icamax_k:
.LCF0:
0: addis 2,12,.TOC.-.LCF0@ha
addi 2,2,.TOC.-.LCF0@l
.localentry icamax_k,.-icamax_k
mr. 9,3
ble 0,.L25
cmpdi 7,5,0
li 3,0
blelr 7
cmpdi 7,5,1
beq 7,.L53
lfs 11,0(4)
lfs 0,4(4)
cmpdi 7,9,1
fabs 11,11
fabs 0,0
fadds 11,11,0
beq 7,.L29
addi 9,9,-1
sldi 5,5,3
li 3,0
mtctr 9
add 4,4,5
li 9,1
.p2align 4,,15
.L24:
lfs 0,4(4)
lfs 12,0(4)
add 4,4,5
fabs 0,0
fabs 12,12
fadds 0,0,12
fcmpu 7,0,11
bng 7,.L23
fmr 11,0
mr 3,9
.L23:
addi 9,9,1
bdnz .L24
.L51:
addi 3,3,1
blr
.p2align 4,,15
.L25:
li 3,0
blr
.p2align 4,,15
.L53:
rldicr. 8,9,0,58
bne 0,.L54
addi 7,8,1
li 10,0
subf 6,8,9
li 3,0
xxlxor 11,11,11
cmpd 7,7,9
sldi 10,10,2
mtctr 6
add 4,4,10
bgt 7,.L43
li 10,-1
rldicr 10,10,0,0
cmpd 7,9,10
beq 7,.L43
.p2align 4,,15
.L44:
lfs 0,4(4)
lfs 12,0(4)
addi 4,4,8
fabs 0,0
fabs 12,12
fadds 0,0,12
fcmpu 7,0,11
bng 7,.L46
fmr 11,0
mr 3,8
.L46:
addi 8,8,1
bdnz .L44
b .L51
.p2align 4,,15
.L54:
addis 11,2,.LC2@toc@ha
addis 3,2,.LC3@toc@ha
addis 5,2,.LC6@toc@ha
addis 6,2,.LC7@toc@ha
xxspltib 47,0
addis 7,2,.LC4@toc@ha
addis 10,2,.LC5@toc@ha
stxv 58,-96(1)
stxv 59,-80(1)
addi 11,11,.LC2@toc@l
addi 3,3,.LC3@toc@l
addi 5,5,.LC6@toc@l
addi 6,6,.LC7@toc@l
stxv 62,-32(1)
stxv 63,-16(1)
xxspltib 58,16
addi 7,7,.LC4@toc@l
addi 10,10,.LC5@toc@l
xxspltib 59,32
lxv 44,0(11)
lxv 45,0(3)
xxspltib 48,0
lxv 62,0(5)
xxlor 46,47,47
lxv 63,0(6)
stxv 60,-64(1)
stxv 61,-48(1)
lxv 60,0(7)
lxv 61,0(10)
li 7,0
mr 10,4
vextsb2w 26,26
vextsb2w 27,27
stxv 56,-128(1)
stxv 57,-112(1)
.p2align 4,,15
.L5:
lxv 0,0(10)
addi 7,7,32
addi 10,10,256
cmpd 7,8,7
xvabssp 34,0
lxv 0,-240(10)
xvabssp 42,0
lxv 0,-224(10)
xvabssp 49,0
lxv 0,-208(10)
vpermr 25,10,2,12
vpermr 2,10,2,13
xvabssp 35,0
lxv 0,-192(10)
xvaddsp 34,57,34
xvabssp 36,0
lxv 0,-176(10)
vpermr 10,3,17,12
vpermr 3,3,17,13
xvabssp 33,0
lxv 0,-160(10)
xvaddsp 10,42,35
xvabssp 50,0
lxv 0,-144(10)
vpermr 17,1,4,12
vpermr 4,1,4,13
xvabssp 37,0
lxv 0,-128(10)
xvaddsp 36,49,36
xvabssp 38,0
lxv 0,-112(10)
vpermr 1,5,18,12
vpermr 5,5,18,13
xvabssp 43,0
lxv 0,-96(10)
xvaddsp 12,33,37
xvabssp 51,0
lxv 0,-80(10)
vpermr 18,11,6,12
vpermr 6,11,6,13
xvabssp 39,0
lxv 0,-64(10)
xvaddsp 38,50,38
xvabssp 40,0
lxv 0,-48(10)
vpermr 11,7,19,12
vpermr 7,7,19,13
xvabssp 32,0
lxv 0,-32(10)
xvaddsp 11,43,39
xvcmpgtsp 39,10,34
xvcmpgtsp 43,12,36
xvabssp 56,0
lxv 0,-16(10)
vpermr 19,0,8,12
vpermr 8,0,8,13
xxsel 10,34,10,39
xxsel 12,36,12,43
xxsel 39,60,61,39
xxsel 43,62,63,43
xvabssp 41,0
xvaddsp 40,51,40
vpermr 0,9,24,12
vpermr 9,9,24,13
xvaddsp 0,32,41
xvcmpgtsp 41,11,38
xvcmpgtsp 32,12,10
xvcmpgtsp 42,0,40
xxsel 11,38,11,41
xxsel 12,10,12,32
xxsel 43,39,43,32
xxsel 41,60,61,41
xxsel 0,40,0,42
xxsel 42,62,63,42
xvcmpgtsp 33,0,11
xxsel 0,11,0,33
xxsel 33,41,42,33
xvcmpgtsp 32,0,12
vadduwm 1,1,26
xxsel 0,12,0,32
xxsel 32,43,33,32
xvcmpgtsp 33,0,48
vadduwm 0,14,0
vadduwm 14,14,27
xxsel 47,47,32,33
xxsel 48,48,0,33
bgt 7,.L5
xxsldwi 11,48,48,3
xxsldwi 12,48,48,2
li 10,0
li 3,12
xxsldwi 0,48,48,1
xscvspdp 48,48
vextuwrx 6,10,15
li 10,4
xscvspdp 11,11
xscvspdp 12,12
xscvspdp 0,0
vextuwrx 5,10,15
li 10,8
vextuwrx 7,10,15
vextuwrx 10,3,15
rldicl 12,5,0,32
rldicl 3,6,0,32
rldicl 11,7,0,32
rldicl 0,10,0,32
fcmpu 7,11,12
fmr 10,0
beq 7,.L55
bnl 7,.L8
mr 3,12
fmr 11,12
.L8:
xscmpudp 7,0,48
bne 7,.L11
cmplw 7,7,10
ble 7,.L12
mr 7,10
.L12:
rldicl 11,7,0,32
.L13:
fcmpu 7,11,10
beq 7,.L56
bnl 7,.L17
mr 3,11
fmr 11,10
.L17:
cmpd 7,9,8
ble 7,.L19
addi 7,8,1
sldi 10,8,1
subf 6,8,9
cmpd 7,7,9
sldi 10,10,2
mtctr 6
add 4,4,10
bgt 7,.L37
li 10,-1
rldicr 10,10,0,0
cmpd 7,9,10
beq 7,.L37
.p2align 4,,15
.L21:
lfs 0,4(4)
lfs 12,0(4)
addi 4,4,8
fabs 0,0
fabs 12,12
fadds 0,0,12
fcmpu 7,0,11
bng 7,.L20
fmr 11,0
mr 3,8
.L20:
addi 8,8,1
bdnz .L21
.L19:
lxv 56,-128(1)
lxv 57,-112(1)
addi 3,3,1
lxv 58,-96(1)
lxv 59,-80(1)
lxv 60,-64(1)
lxv 61,-48(1)
lxv 62,-32(1)
lxv 63,-16(1)
blr
.p2align 4,,15
.L55:
cmplw 7,6,5
ble 7,.L7
mr 6,5
.L7:
rldicl 3,6,0,32
b .L8
.p2align 4,,15
.L29:
li 3,1
blr
.p2align 4,,15
.L11:
bnl 7,.L13
mr 11,0
xscpsgndp 10,48,48
b .L13
.p2align 4,,15
.L56:
cmpd 7,3,11
ble 7,.L17
mr 3,11
b .L17
.L37:
li 9,1
mtctr 9
b .L21
.L43:
li 9,1
mtctr 9
b .L44
.long 0
.byte 0,0,0,0,0,0,0,0
.size icamax_k,.-icamax_k
.section .rodata.cst16,"aM",@progbits,16
.align 4
.LC2:
.byte 0
.byte 1
.byte 2
.byte 3
.byte 8
.byte 9
.byte 10
.byte 11
.byte 16
.byte 17
.byte 18
.byte 19
.byte 24
.byte 25
.byte 26
.byte 27
.LC3:
.byte 4
.byte 5
.byte 6
.byte 7
.byte 12
.byte 13
.byte 14
.byte 15
.byte 20
.byte 21
.byte 22
.byte 23
.byte 28
.byte 29
.byte 30
.byte 31
.LC4:
.long 0
.long 1
.long 2
.long 3
.LC5:
.long 4
.long 5
.long 6
.long 7
.LC6:
.long 8
.long 9
.long 10
.long 11
.LC7:
.long 12
.long 13
.long 14
.long 15
.ident "GCC: (SUSE Linux) 7.3.1 20180323 [gcc-7-branch revision 258812]"
.section .note.GNU-stack,"",@progbits

View File

@ -0,0 +1,454 @@
/* .file "icamin.c"
.abiversion 2
.section ".text"
.align 2
.p2align 4,,15
.globl icamin_k
.type icamin_k, @function
*/
/*
 * icamin_k: returns in r3 the 1-based index of the element of a
 * single-precision complex vector with the smallest |re| + |im|, or 0 when
 * the count (r3) or the stride (r5) is not positive.
 * Inputs as read by the code: r3 = element count, r4 = pointer to the data
 * (pairs of 4-byte floats), r5 = stride counted in complex elements.
 * NOTE(review): the commented-out directives above and the .ident at the end
 * of the file suggest this is GCC output for icamin.c wrapped for the C
 * preprocessor; it uses no lxv/stxv style instructions, so it is presumably
 * the pre-POWER9 build - confirm against the build files.
 */
#define ASSEMBLER
#include "common.h"
PROLOGUE
icamin_k:
.LCF0:
/* Global entry point: derive the TOC pointer r2 from r12. */
0: addis 2,12,.TOC.-.LCF0@ha
addi 2,2,.TOC.-.LCF0@l
.localentry icamin_k,.-icamin_k
/* r9 = count; return 0 when count <= 0. */
mr. 9,3
ble 0,.L25
/* Return 0 when the stride in r5 is <= 0. */
cmpdi 7,5,0
li 3,0
blelr 7
/* f11 = |x[0].re| + |x[0].im| seeds the running minimum. */
lfs 11,0(4)
lfs 0,4(4)
cmpdi 7,5,1
fabs 11,11
fabs 0,0
fadds 11,11,0
/* Stride 1 takes the contiguous path at .L54. */
beq 7,.L54
/* Exactly one element: .L29 returns 1. */
cmpdi 7,9,1
beq 7,.L29
/*
 * Strided scalar loop setup: ctr = count - 1, r5 = stride * 8 bytes,
 * r4 -> element 1, r3 = best index so far (0), r9 = current index (1).
 */
addi 9,9,-1
sldi 5,5,3
mtctr 9
add 4,4,5
li 3,0
li 9,1
.p2align 4,,15
.L24:
/* f0 = |re| + |im| of the current element; r4 steps to the next one. */
lfs 0,4(4)
lfs 12,0(4)
add 4,4,5
fabs 0,0
fabs 12,12
fadds 0,0,12
/* Take the element only when it is strictly smaller than f11. */
fcmpu 7,0,11
bnl 7,.L23
fmr 11,0
mr 3,9
.L23:
addi 9,9,1
bdnz .L24
.L52:
/* Convert the 0-based best index to 1-based and return. */
addi 3,3,1
blr
.p2align 4,,15
.L25:
li 3,0
blr
.p2align 4,,15
.L54:
/*
 * Contiguous path: r8 = count with the low 5 bits cleared (multiple of 32).
 * A non-zero r8 goes to the vector code at .L55; otherwise the scalar loop
 * below walks all elements starting at index 0.
 */
rldicr. 8,9,0,58
bne 0,.L55
addi 7,8,1
li 10,0
cmpd 7,7,9
sldi 10,10,2
add 4,4,10
/* ctr = count - r8. */
subf 10,8,9
mtctr 10
li 3,0
/* Trip-count guards: both branches to .L43 force ctr = 1. */
bgt 7,.L43
li 10,-1
rldicr 10,10,0,0
cmpd 7,9,10
beq 7,.L43
.p2align 4,,15
.L44:
lfs 0,0(4)
lfs 12,4(4)
addi 4,4,8
fabs 0,0
fabs 12,12
fadds 0,0,12
/* Update only when the running minimum f11 is strictly greater. */
fcmpu 7,11,0
bng 7,.L46
fmr 11,0
mr 3,8
.L46:
addi 8,8,1
bdnz .L44
b .L52
.p2align 4,,15
.L55:
/*
 * Vector path setup (instructions are interleaved by the scheduler):
 *  - r31 is saved at -8(r1) and v25..v31 at -128..-32(r1), below the stack
 *    pointer; r1 itself is not adjusted.  They are restored at .L19.
 *  - f11 (the running minimum) is converted to single format and splatted
 *    into all four words of vs5.
 *  - v19 (vs51) = 0 holds the per-lane best index; v18 (vs50) = 0 is the
 *    index offset that the loop adds to candidates and then advances.
 *  - .LC2/.LC3 are loaded into vs44/vs45 (v12/v13), doubleword-swapped and
 *    bitwise complemented; the loop uses them as vperm controls.
 *  - .LC4..LC7 are loaded into vs62, vs47, vs48, vs49 (doubleword-swapped)
 *    and .LC8 into vs46 (v14).
 *  - v31 = 8 + 8 = 16 in every word.
 *  - r10 = r4 is the load cursor, r6 = 0 counts processed elements.
 */
li 0,-128
std 31,-8(1)
addis 5,2,.LC2@toc@ha
xscvdpspn 11,11
vspltisw 19,0
addis 6,2,.LC3@toc@ha
addi 5,5,.LC2@toc@l
stvx 25,1,0
li 0,-112
addi 6,6,.LC3@toc@l
xxlor 50,51,51
addis 7,2,.LC4@toc@ha
lxvd2x 44,0,5
addis 10,2,.LC5@toc@ha
stvx 26,1,0
li 0,-96
addi 7,7,.LC4@toc@l
lxvd2x 45,0,6
addis 5,2,.LC6@toc@ha
addis 6,2,.LC7@toc@ha
stvx 27,1,0
li 0,-80
addi 10,10,.LC5@toc@l
xxspltw 5,11,0
addi 6,6,.LC7@toc@l
addi 5,5,.LC6@toc@l
stvx 28,1,0
li 0,-64
lxvd2x 47,0,10
xxpermdi 44,44,44,2
mr 10,4
lxvd2x 49,0,6
lxvd2x 48,0,5
xxpermdi 45,45,45,2
li 6,0
stvx 29,1,0
li 0,-48
xxlnand 44,44,44
xxlnand 45,45,45
stvx 30,1,0
lxvd2x 62,0,7
addis 7,2,.LC8@toc@ha
li 0,-32
addi 7,7,.LC8@toc@l
xxpermdi 47,47,47,2
stvx 31,1,0
vspltisw 31,8
xxpermdi 48,48,48,2
lxvd2x 46,0,7
vadduwm 31,31,31
xxpermdi 49,49,49,2
xxpermdi 62,62,62,2
.p2align 4,,15
.L5:
/*
 * Main vector loop.  Each iteration reads 256 bytes (sixteen 16-byte
 * vectors, i.e. 32 complex elements) starting at r10, then advances r10 by
 * 256 and r6 by 32.  The loop repeats while r8 > r6 (cmpd 7,8,6 below,
 * bgt at the end).
 */
/* Loads, each followed later by a doubleword swap (xxpermdi ...,2). */
addi 3,10,16
addi 5,10,32
lxvd2x 34,0,10
addi 7,10,64
addi 31,10,48
addi 12,10,80
addi 11,10,96
lxvd2x 36,0,3
lxvd2x 37,0,5
addi 3,10,112
addi 5,10,128
lxvd2x 38,0,7
lxvd2x 6,0,31
addi 7,10,160
addi 31,10,144
lxvd2x 33,0,12
lxvd2x 39,0,11
addi 12,10,176
addi 11,10,192
lxvd2x 7,0,3
lxvd2x 40,0,5
xxpermdi 34,34,34,2
addi 3,10,208
addi 5,10,224
lxvd2x 41,0,7
lxvd2x 8,0,31
addi 7,10,240
lxvd2x 9,0,12
lxvd2x 42,0,11
xxpermdi 37,37,37,2
xxpermdi 36,36,36,2
addi 6,6,32
lxvd2x 32,0,3
lxvd2x 43,0,5
xxpermdi 6,6,6,2
xxpermdi 38,38,38,2
cmpd 7,8,6
addi 10,10,256
lxvd2x 10,0,7
xxpermdi 39,39,39,2
xxpermdi 33,33,33,2
xxpermdi 40,40,40,2
xxpermdi 7,7,7,2
xxpermdi 41,41,41,2
xxpermdi 8,8,8,2
xxpermdi 9,9,9,2
xxpermdi 42,42,42,2
xxpermdi 43,43,43,2
xxpermdi 32,32,32,2
xxpermdi 10,10,10,2
/* Absolute value of every loaded float. */
xvabssp 58,37
xvabssp 59,39
xvabssp 35,40
xvabssp 60,41
xvabssp 34,34
xvabssp 33,33
xvabssp 32,32
xvabssp 61,43
xvabssp 36,36
xvabssp 37,6
xvabssp 38,38
xvabssp 39,7
xvabssp 40,8
xvabssp 41,9
xvabssp 42,42
xvabssp 43,10
/*
 * Regroup pairs of vectors with the v12/v13 permute controls (derived from
 * .LC2/.LC3) - presumably separating real and imaginary parts so that the
 * adds below form |re| + |im| per element; TODO confirm lane mapping.
 */
vperm 25,4,2,12
vperm 4,4,2,13
vperm 2,5,26,12
vperm 5,5,26,13
vperm 26,1,6,12
vperm 6,1,6,13
vperm 1,7,27,12
vperm 7,7,27,13
vperm 27,8,3,12
vperm 8,8,3,13
vperm 3,9,28,12
vperm 9,9,28,13
vperm 28,0,10,12
vperm 10,0,10,13
vperm 0,11,29,12
vperm 11,11,29,13
xvaddsp 12,33,39
xvaddsp 38,58,38
xvaddsp 0,32,43
xvaddsp 42,60,42
xvaddsp 36,57,36
xvaddsp 37,34,37
xvaddsp 40,59,40
xvaddsp 41,35,41
/*
 * Compare/select tree: each xvcmpgtsp builds a lane mask, the first xxsel
 * with that mask keeps the smaller value and the second xxsel picks the
 * matching index constant (vs62/vs47/vs48/vs49) or previously selected
 * index vector.
 */
xvcmpgtsp 32,38,12
xvcmpgtsp 33,42,0
xvcmpgtsp 43,36,37
xvcmpgtsp 39,40,41
xxsel 12,38,12,32
xxsel 38,48,49,32
xxsel 0,42,0,33
xxsel 42,48,49,33
xxsel 37,36,37,43
xxsel 43,62,47,43
xxsel 41,40,41,39
xxsel 39,62,47,39
xvcmpgtsp 32,37,12
xvcmpgtsp 33,41,0
xxsel 12,37,12,32
xxsel 43,43,38,32
xxsel 0,41,0,33
xxsel 33,39,42,33
xvcmpgtsp 32,12,0
/* v31 holds 16 per word: offset for the index vector in v1. */
vadduwm 1,1,31
xxsel 0,12,0,32
xxsel 32,43,33,32
/*
 * Merge with the running result: mask = running minimum (vs5) > candidate
 * (vs0).  The candidate indices get the current offset v18 added, v18 is
 * advanced by vs46 (.LC8), and vs51 (indices) / vs5 (values) are updated
 * under the mask.
 */
xvcmpgtsp 33,5,0
vadduwm 0,0,18
vadduwm 18,18,14
xxsel 51,51,32,33
xxsel 5,5,0,33
bgt 7,.L5
/*
 * Horizontal reduction after the vector loop.  The four words of the running
 * minimum (vs5) are rotated out and converted to scalars in f11, f12, f0/f10
 * and f5; the four words of the index vector v19 (vs51) are moved to
 * r6/r3, r5/r31, r10/r0 and r7/r11 (zero-extended by rldicl).
 */
xxsldwi 11,5,5,3
xxsldwi 12,5,5,2
vspltw 0,19,3
xxsldwi 0,5,5,1
xscvspdp 11,11
xscvspdp 12,12
mfvsrwz 6,32
vspltw 0,19,2
xscvspdp 0,0
mfvsrwz 7,51
mfvsrwz 5,32
vspltw 0,19,0
xscvspdp 5,5
mfvsrwz 10,32
/* First pair: f11 vs f12.  Equal values go to .L56 (smaller index wins). */
fcmpu 7,11,12
rldicl 3,6,0,32
fmr 10,0
rldicl 11,7,0,32
rldicl 31,5,0,32
rldicl 0,10,0,32
beq 7,.L56
bng 7,.L8
fmr 11,12
mr 3,31
.L8:
/* Second pair: f0 vs f5; result value in f10, index in r11. */
fcmpu 7,0,5
bne 7,.L11
cmplw 7,7,10
ble 7,.L12
mr 7,10
.L12:
rldicl 11,7,0,32
.L13:
/* Final: f11/r3 vs f10/r11; ties handled at .L57, smaller f10 at .L58. */
fcmpu 7,11,10
beq 7,.L57
bgt 7,.L58
.L17:
/*
 * Scalar tail: when count (r9) > r8 (the multiple of 32 consumed by the
 * vector loop), continue at element r8, i.e. byte offset r8 * 8 from r4.
 */
cmpd 7,9,8
ble 7,.L19
addi 7,8,1
sldi 10,8,1
cmpd 7,7,9
sldi 10,10,2
add 4,4,10
subf 10,8,9
mtctr 10
/* Trip-count guards: both branches to .L37 force ctr = 1. */
bgt 7,.L37
li 10,-1
rldicr 10,10,0,0
cmpd 7,9,10
beq 7,.L37
.p2align 4,,15
.L21:
lfs 0,0(4)
lfs 12,4(4)
addi 4,4,8
fabs 0,0
fabs 12,12
fadds 0,0,12
/* Update only when the running minimum f11 is strictly greater. */
fcmpu 7,11,0
bng 7,.L20
fmr 11,0
mr 3,8
.L20:
addi 8,8,1
bdnz .L21
.L19:
/* Epilogue: restore r31 and v25..v31, return the 1-based index. */
li 0,-128
ld 31,-8(1)
addi 3,3,1
lvx 25,1,0
li 0,-112
lvx 26,1,0
li 0,-96
lvx 27,1,0
li 0,-80
lvx 28,1,0
li 0,-64
lvx 29,1,0
li 0,-48
lvx 30,1,0
li 0,-32
lvx 31,1,0
blr
.p2align 4,,15
.L56:
/* Equal values in the first pair: keep the smaller of the indices r6, r5. */
cmplw 7,6,5
ble 7,.L7
mr 6,5
.L7:
rldicl 3,6,0,32
b .L8
.p2align 4,,15
.L29:
/* Single-element input (reached from the entry code): answer is 1. */
li 3,1
blr
.p2align 4,,15
.L11:
/* Second pair differs: take f5 and index r0 only when f0 > f5. */
bng 7,.L13
fmr 10,5
mr 11,0
b .L13
.p2align 4,,15
.L57:
/* Equal values in the final compare: keep the smaller index. */
cmpd 7,3,11
ble 7,.L17
mr 3,11
b .L17
.p2align 4,,15
.L58:
fmr 11,10
mr 3,11
b .L17
.L43:
/* Out-of-line trip-count fixups: set ctr = 1 and rejoin the loop. */
li 9,1
mtctr 9
b .L44
.L37:
li 9,1
mtctr 9
b .L21
.long 0
.byte 0,0,0,0,0,1,0,0
.size icamin_k,.-icamin_k
/*
 * 16-byte constants used by the vector path of icamin_k.
 * .LC2/.LC3 are byte-index lists (the even and the odd 4-byte words of a
 * 32-byte pair) that the setup code loads into vs44/vs45, swaps and
 * complements before using them as vperm controls.
 */
.section .rodata.cst16,"aM",@progbits,16
.align 4
.LC2:
.byte 0
.byte 1
.byte 2
.byte 3
.byte 8
.byte 9
.byte 10
.byte 11
.byte 16
.byte 17
.byte 18
.byte 19
.byte 24
.byte 25
.byte 26
.byte 27
.LC3:
.byte 4
.byte 5
.byte 6
.byte 7
.byte 12
.byte 13
.byte 14
.byte 15
.byte 20
.byte 21
.byte 22
.byte 23
.byte 28
.byte 29
.byte 30
.byte 31
/* Index constants 0..15, four per vector; loaded into vs62, vs47, vs48, vs49. */
.LC4:
.long 0
.long 1
.long 2
.long 3
.LC5:
.long 4
.long 5
.long 6
.long 7
.LC6:
.long 8
.long 9
.long 10
.long 11
.LC7:
.long 12
.long 13
.long 14
.long 15
/* 32 in every word; loaded into vs46 and added to the index offset per iteration. */
.LC8:
.long 32
.long 32
.long 32
.long 32
.ident "GCC: (SUSE Linux) 7.3.1 20180323 [gcc-7-branch revision 258812]"
.section .note.GNU-stack,"",@progbits

View File

@ -0,0 +1,385 @@
/*
 * icamin_k: returns in r3 the 1-based index of the element of a
 * single-precision complex vector with the smallest |re| + |im|, or 0 when
 * the count (r3) or the stride (r5) is not positive.
 * Inputs as read by the code: r3 = element count, r4 = pointer to the data
 * (pairs of 4-byte floats), r5 = stride counted in complex elements.
 * This is compiler output (see .file below and .ident at the end); the
 * vector path uses lxv/stxv/vpermr/vextuwrx.
 */
.file "icamin.c"
.abiversion 2
.section ".text"
.align 2
.p2align 4,,15
.globl icamin_k
.type icamin_k, @function
icamin_k:
.LCF0:
/* Global entry point: derive the TOC pointer r2 from r12. */
0: addis 2,12,.TOC.-.LCF0@ha
addi 2,2,.TOC.-.LCF0@l
.localentry icamin_k,.-icamin_k
/* r9 = count; return 0 when count <= 0. */
mr. 9,3
ble 0,.L25
/* Return 0 when the stride in r5 is <= 0. */
cmpdi 7,5,0
li 3,0
blelr 7
/* f11 = |x[0].re| + |x[0].im| seeds the running minimum. */
lfs 11,0(4)
lfs 0,4(4)
cmpdi 7,5,1
fabs 11,11
fabs 0,0
fadds 11,11,0
/* Stride 1 takes the contiguous path at .L53. */
beq 7,.L53
/* Exactly one element: .L29 returns 1. */
cmpdi 7,9,1
beq 7,.L29
/*
 * Strided scalar loop setup: ctr = count - 1, r5 = stride * 8 bytes,
 * r4 -> element 1, r3 = best index so far (0), r9 = current index (1).
 */
addi 9,9,-1
sldi 5,5,3
li 3,0
mtctr 9
add 4,4,5
li 9,1
.p2align 4,,15
.L24:
/* f0 = |re| + |im| of the current element; r4 steps to the next one. */
lfs 0,4(4)
lfs 12,0(4)
add 4,4,5
fabs 0,0
fabs 12,12
fadds 0,0,12
/* Take the element only when it is strictly smaller than f11. */
fcmpu 7,0,11
bnl 7,.L23
fmr 11,0
mr 3,9
.L23:
addi 9,9,1
bdnz .L24
.L51:
/* Convert the 0-based best index to 1-based and return. */
addi 3,3,1
blr
.p2align 4,,15
.L25:
li 3,0
blr
.p2align 4,,15
.L53:
/*
 * Contiguous path: r8 = count with the low 5 bits cleared (multiple of 32).
 * A non-zero r8 goes to the vector code at .L54; otherwise the scalar loop
 * below walks all elements starting at index 0.
 */
rldicr. 8,9,0,58
bne 0,.L54
addi 7,8,1
li 10,0
/* ctr = count - r8. */
subf 6,8,9
li 3,0
cmpd 7,7,9
sldi 10,10,2
mtctr 6
add 4,4,10
/* Trip-count guards: both branches to .L43 force ctr = 1. */
bgt 7,.L43
li 10,-1
rldicr 10,10,0,0
cmpd 7,9,10
beq 7,.L43
.p2align 4,,15
.L44:
lfs 0,0(4)
lfs 12,4(4)
addi 4,4,8
fabs 0,0
fabs 12,12
fadds 0,0,12
/* Update only when the running minimum f11 is strictly greater. */
fcmpu 7,11,0
bng 7,.L46
fmr 11,0
mr 3,8
.L46:
addi 8,8,1
bdnz .L44
b .L51
.p2align 4,,15
.L54:
/*
 * Vector path setup (instructions are interleaved by the scheduler):
 *  - vs57..vs63 (v25..v31) are stored at -112..-16(r1), below the stack
 *    pointer; r1 itself is not adjusted.  They are reloaded at .L19.
 *  - f11 (the running minimum) is converted to single format and splatted
 *    into all four words of vs9.
 *  - vs48 (v16) = 0 holds the per-lane best index; vs47 (v15) = 0 is the
 *    index offset that the loop adds to candidates and then advances.
 *  - .LC2/.LC3 are loaded into vs44/vs45 (v12/v13), the vpermr controls.
 *  - .LC4, .LC5, .LC6, .LC7 are loaded into vs61, vs62, vs63, vs46.
 *  - v27 = 16 and v28 = 32 in every word (xxspltib then vextsb2w).
 *  - r10 = r4 is the load cursor, r7 = 0 counts processed elements.
 */
xscvdpspn 9,11
addis 11,2,.LC2@toc@ha
addis 3,2,.LC3@toc@ha
addis 5,2,.LC6@toc@ha
addis 6,2,.LC7@toc@ha
addis 7,2,.LC4@toc@ha
addis 10,2,.LC5@toc@ha
xxspltib 48,0
addi 11,11,.LC2@toc@l
addi 3,3,.LC3@toc@l
addi 5,5,.LC6@toc@l
stxv 59,-80(1)
addi 6,6,.LC7@toc@l
stxv 60,-64(1)
stxv 63,-16(1)
addi 7,7,.LC4@toc@l
xxspltib 59,16
lxv 44,0(11)
xxspltib 60,32
lxv 45,0(3)
lxv 63,0(5)
xxlor 47,48,48
lxv 46,0(6)
addi 10,10,.LC5@toc@l
stxv 61,-48(1)
stxv 62,-32(1)
xxspltw 9,9,0
lxv 61,0(7)
lxv 62,0(10)
li 7,0
mr 10,4
vextsb2w 27,27
vextsb2w 28,28
stxv 57,-112(1)
stxv 58,-96(1)
.p2align 4,,15
.L5:
/*
 * Main vector loop.  Each iteration reads 256 bytes (sixteen 16-byte
 * vectors, i.e. 32 complex elements); r10 is advanced by 256 up front, so
 * the later loads use negative displacements.  r7 counts elements in steps
 * of 32 and the loop repeats while r8 > r7.
 * Per vector: xvabssp takes absolute values, vpermr with the v12/v13
 * controls regroups pairs of vectors (presumably separating real and
 * imaginary parts - TODO confirm lane mapping), xvaddsp forms the sums.
 * xvcmpgtsp/xxsel pairs then keep the smaller value together with the
 * matching index constant (vs61/vs62/vs63/vs46).
 */
lxv 0,0(10)
addi 7,7,32
addi 10,10,256
cmpd 7,8,7
xvabssp 34,0
lxv 0,-240(10)
xvabssp 42,0
lxv 0,-224(10)
xvabssp 49,0
lxv 0,-208(10)
vpermr 26,10,2,12
vpermr 2,10,2,13
xvabssp 35,0
lxv 0,-192(10)
xvaddsp 34,58,34
xvabssp 36,0
lxv 0,-176(10)
vpermr 10,3,17,12
vpermr 3,3,17,13
xvabssp 33,0
lxv 0,-160(10)
xvaddsp 10,42,35
xvabssp 50,0
lxv 0,-144(10)
vpermr 17,1,4,12
vpermr 4,1,4,13
xvabssp 37,0
lxv 0,-128(10)
xvaddsp 36,49,36
xvabssp 38,0
lxv 0,-112(10)
vpermr 1,5,18,12
vpermr 5,5,18,13
xvabssp 43,0
lxv 0,-96(10)
xvaddsp 12,33,37
xvabssp 51,0
lxv 0,-80(10)
vpermr 18,11,6,12
vpermr 6,11,6,13
xvabssp 39,0
lxv 0,-64(10)
xvaddsp 38,50,38
xvabssp 40,0
lxv 0,-48(10)
vpermr 11,7,19,12
vpermr 7,7,19,13
xvabssp 32,0
lxv 0,-32(10)
xvaddsp 11,43,39
xvcmpgtsp 39,34,10
xvcmpgtsp 43,36,12
xvabssp 57,0
lxv 0,-16(10)
vpermr 19,0,8,12
vpermr 8,0,8,13
xxsel 10,34,10,39
xxsel 12,36,12,43
xxsel 39,61,62,39
xxsel 43,63,46,43
xvabssp 41,0
xvaddsp 40,51,40
vpermr 0,9,25,12
vpermr 9,9,25,13
xvaddsp 0,32,41
xvcmpgtsp 41,38,11
xvcmpgtsp 32,10,12
xvcmpgtsp 42,40,0
xxsel 11,38,11,41
xxsel 12,10,12,32
xxsel 43,39,43,32
xxsel 41,61,62,41
xxsel 0,40,0,42
xxsel 42,63,46,42
xvcmpgtsp 33,11,0
xxsel 0,11,0,33
xxsel 33,41,42,33
xvcmpgtsp 32,12,0
/* v27 holds 16 per word: offset for the index vector in v1. */
vadduwm 1,1,27
xxsel 0,12,0,32
xxsel 32,43,33,32
/*
 * Merge with the running result: mask = running minimum (vs9) > candidate
 * (vs0).  The candidate indices get the current offset v15 added, v15 is
 * advanced by v28 (32 per word), and vs48 (indices) / vs9 (values) are
 * updated under the mask.
 */
xvcmpgtsp 33,9,0
vadduwm 0,0,15
vadduwm 15,15,28
xxsel 48,48,32,33
xxsel 9,9,0,33
bgt 7,.L5
/*
 * Horizontal reduction after the vector loop.  The four words of the running
 * minimum (vs9) are rotated out and converted to scalars in f11, f12, f0/f10
 * and f9; the four words of the index vector v16 are extracted with
 * vextuwrx at byte offsets 0, 4, 8 and 12 and zero-extended by rldicl.
 */
xxsldwi 11,9,9,3
xxsldwi 12,9,9,2
li 10,0
li 3,12
xxsldwi 0,9,9,1
xscvspdp 9,9
vextuwrx 6,10,16
li 10,4
xscvspdp 11,11
xscvspdp 12,12
xscvspdp 0,0
vextuwrx 5,10,16
li 10,8
vextuwrx 7,10,16
vextuwrx 10,3,16
rldicl 12,5,0,32
rldicl 3,6,0,32
rldicl 11,7,0,32
rldicl 0,10,0,32
/* First pair: f11 vs f12.  Equal values go to .L55 (smaller index wins). */
fcmpu 7,11,12
fmr 10,0
beq 7,.L55
bng 7,.L8
mr 3,12
fmr 11,12
.L8:
/* Second pair: f0 vs f9; result value in f10, index in r11. */
fcmpu 7,0,9
bne 7,.L11
cmplw 7,7,10
ble 7,.L12
mr 7,10
.L12:
rldicl 11,7,0,32
.L13:
/* Final: f11/r3 vs f10/r11; ties handled at .L56. */
fcmpu 7,11,10
beq 7,.L56
bng 7,.L17
mr 3,11
fmr 11,10
.L17:
/*
 * Scalar tail: when count (r9) > r8 (the multiple of 32 consumed by the
 * vector loop), continue at element r8, i.e. byte offset r8 * 8 from r4.
 */
cmpd 7,9,8
ble 7,.L19
addi 7,8,1
sldi 10,8,1
subf 6,8,9
cmpd 7,7,9
sldi 10,10,2
mtctr 6
add 4,4,10
/* Trip-count guards: both branches to .L37 force ctr = 1. */
bgt 7,.L37
li 10,-1
rldicr 10,10,0,0
cmpd 7,9,10
beq 7,.L37
.p2align 4,,15
.L21:
lfs 0,0(4)
lfs 12,4(4)
addi 4,4,8
fabs 0,0
fabs 12,12
fadds 0,0,12
/* Update only when the running minimum f11 is strictly greater. */
fcmpu 7,11,0
bng 7,.L20
fmr 11,0
mr 3,8
.L20:
addi 8,8,1
bdnz .L21
.L19:
/* Epilogue: reload vs57..vs63 and return the 1-based index. */
lxv 57,-112(1)
lxv 58,-96(1)
addi 3,3,1
lxv 59,-80(1)
lxv 60,-64(1)
lxv 61,-48(1)
lxv 62,-32(1)
lxv 63,-16(1)
blr
.p2align 4,,15
.L55:
/* Equal values in the first pair: keep the smaller of the indices r6, r5. */
cmplw 7,6,5
ble 7,.L7
mr 6,5
.L7:
rldicl 3,6,0,32
b .L8
.p2align 4,,15
.L29:
/* Single-element input (reached from the entry code): answer is 1. */
li 3,1
blr
.p2align 4,,15
.L11:
/* Second pair differs: take f9 and index r0 only when f0 > f9. */
bng 7,.L13
mr 11,0
fmr 10,9
b .L13
.p2align 4,,15
.L56:
/* Equal values in the final compare: keep the smaller index. */
cmpd 7,3,11
ble 7,.L17
mr 3,11
b .L17
.L37:
/* Out-of-line trip-count fixups: set ctr = 1 and rejoin the loop. */
li 9,1
mtctr 9
b .L21
.L43:
li 9,1
mtctr 9
b .L44
.long 0
.byte 0,0,0,0,0,0,0,0
.size icamin_k,.-icamin_k
/*
 * 16-byte constants used by the vector path of icamin_k.
 * .LC2/.LC3 are byte-index lists (the even and the odd 4-byte words of a
 * 32-byte pair) loaded into vs44/vs45 and used as vpermr controls.
 */
.section .rodata.cst16,"aM",@progbits,16
.align 4
.LC2:
.byte 0
.byte 1
.byte 2
.byte 3
.byte 8
.byte 9
.byte 10
.byte 11
.byte 16
.byte 17
.byte 18
.byte 19
.byte 24
.byte 25
.byte 26
.byte 27
.LC3:
.byte 4
.byte 5
.byte 6
.byte 7
.byte 12
.byte 13
.byte 14
.byte 15
.byte 20
.byte 21
.byte 22
.byte 23
.byte 28
.byte 29
.byte 30
.byte 31
/* Index constants 0..15, four per vector; loaded into vs61, vs62, vs63, vs46. */
.LC4:
.long 0
.long 1
.long 2
.long 3
.LC5:
.long 4
.long 5
.long 6
.long 7
.LC6:
.long 8
.long 9
.long 10
.long 11
.LC7:
.long 12
.long 13
.long 14
.long 15
.ident "GCC: (SUSE Linux) 7.3.1 20180323 [gcc-7-branch revision 258812]"
.section .note.GNU-stack,"",@progbits

View File

@ -0,0 +1,434 @@
/* .file "isamax.c"
.abiversion 2
.section ".text"
.align 2
.p2align 4,,15
.globl isamax_k
.type isamax_k, @function
*/
/*
 * isamax_k: returns in r3 the 1-based index of the element of a
 * single-precision vector with the largest absolute value, or 0 when the
 * count (r3) or the stride (r5) is not positive.
 * Inputs as read by the code: r3 = element count, r4 = pointer to 4-byte
 * floats, r5 = stride counted in elements.
 * The running maximum starts at 0.0 with best index 0, and updates use a
 * strict "greater than" compare.
 * NOTE(review): the commented-out directives above and the .ident at the end
 * of the file suggest this is GCC output for isamax.c wrapped for the C
 * preprocessor; presumably the pre-POWER9 build - confirm.
 */
#define ASSEMBLER
#include "common.h"
PROLOGUE
isamax_k:
.LCF0:
/* Global entry point: derive the TOC pointer r2 from r12. */
0: addis 2,12,.TOC.-.LCF0@ha
addi 2,2,.TOC.-.LCF0@l
.localentry isamax_k,.-isamax_k
/* r11 = count; return 0 when count <= 0 or stride <= 0. */
mr. 11,3
ble 0,.L36
cmpdi 7,5,0
li 3,0
blelr 7
/* Stride 1 takes the contiguous path at .L69. */
cmpdi 7,5,1
beq 7,.L69
/* r7 = count with the low 2 bits cleared; zero skips the unrolled loop. */
rldicr. 7,11,0,61
beq 0,.L40
/*
 * Strided loop unrolled by four.  Byte offsets: r6 = stride * 4,
 * r5 = stride * 8, r3 = stride * 12, r0 = stride * 16.
 * f0 = 0.0 is the running maximum, r9 the cursor, r8 the best index,
 * r10 the index of the first element of the current group.
 */
sldi 3,5,1
xxlxor 0,0,0
sldi 6,5,2
add 3,3,5
sldi 0,5,4
sldi 3,3,2
sldi 5,5,3
mr 9,4
li 8,0
li 10,0
.p2align 4,,15
.L31:
lfs 12,0(9)
fabs 12,12
fcmpu 7,12,0
bng 7,.L23
fmr 0,12
mr 8,10
.L23:
lfsx 12,9,6
fabs 12,12
fcmpu 7,12,0
bng 7,.L25
fmr 0,12
addi 8,10,1
.L25:
lfsx 12,9,5
fabs 12,12
fcmpu 7,12,0
bng 7,.L27
fmr 0,12
addi 8,10,2
.L27:
lfsx 12,9,3
add 9,9,0
fabs 12,12
fcmpu 7,12,0
bng 7,.L29
fmr 0,12
addi 8,10,3
.L29:
addi 10,10,4
cmpd 7,7,10
bgt 7,.L31
/*
 * r7 = number of groups executed ((r7 - 1) / 4 + 1); r9 = elements consumed
 * (groups * 4); r7 = r6 * groups, scaled by 4 again at .L22 to give the
 * byte offset of the first remaining element.
 */
addi 7,7,-1
srdi 7,7,2
addi 7,7,1
sldi 9,7,2
mulld 7,6,7
cmpd 7,11,9
ble 7,.L67
.L22:
/* Strided scalar tail over the remaining count - r9 elements. */
addi 10,9,1
sldi 7,7,2
cmpd 7,10,11
subf 10,9,11
mtctr 10
add 4,4,7
/* Trip-count guards: both branches to .L54 force ctr = 1. */
bgt 7,.L54
li 3,-1
rldicr 3,3,0,0
cmpd 7,11,3
beq 7,.L54
.p2align 4,,15
.L35:
lfs 12,0(4)
add 4,4,6
fabs 12,12
fcmpu 7,12,0
bng 7,.L33
fmr 0,12
mr 8,9
.L33:
addi 9,9,1
bdnz .L35
.L67:
/* Convert the 0-based best index to 1-based and return. */
addi 3,8,1
blr
.p2align 4,,15
.L36:
li 3,0
blr
.p2align 4,,15
.L69:
/*
 * Contiguous path: r10 = count with the low 6 bits cleared (multiple of
 * 64).  A non-zero r10 goes to the vector code at .L70; otherwise the
 * scalar loop below walks all elements with f12 = 0.0 as running maximum.
 */
rldicr. 10,11,0,57
bne 0,.L70
addi 7,10,1
sldi 9,10,2
xxlxor 12,12,12
cmpd 7,7,11
add 4,4,9
subf 9,10,11
li 8,0
mtctr 9
/* Trip-count guards: both branches to .L60 force ctr = 1. */
bgt 7,.L60
li 3,-1
rldicr 3,3,0,0
cmpd 7,11,3
beq 7,.L60
.p2align 4,,15
.L61:
lfs 0,0(4)
addi 4,4,4
fabs 0,0
fcmpu 7,0,12
bng 7,.L63
fmr 12,0
mr 8,10
.L63:
addi 10,10,1
bdnz .L61
b .L67
.p2align 4,,15
.L70:
/*
 * Vector path setup (instructions are interleaved by the scheduler):
 *  - r31 is saved at -8(r1) and v29, v30, v31 at -64, -48, -32(r1), below
 *    the stack pointer; r1 itself is not adjusted.  Restored at .L16.
 *  - v18 (vs50) = 0 is the per-lane best index, v12 (vs44) = 0 the per-lane
 *    running maximum, v3 (vs35) = 0 the index offset advanced by the loop.
 *  - .LC2, .LC3, .LC4, .LC5 are loaded into vs51, vs34, vs36, vs37 and
 *    .LC6, .LC7 into vs47 (v15), vs48 (v16).
 *  - v17 = 8 + 8 = 16 in every word.
 *  - r9 = r4 is the load cursor, r7 = 0 counts processed elements.
 */
li 0,-64
std 31,-8(1)
addis 3,2,.LC2@toc@ha
vspltisw 18,0
vspltisw 12,0
addis 5,2,.LC3@toc@ha
addis 6,2,.LC6@toc@ha
stvx 29,1,0
li 0,-48
addis 8,2,.LC7@toc@ha
xxlor 35,50,50
addi 3,3,.LC2@toc@l
addi 5,5,.LC3@toc@l
stvx 30,1,0
addi 6,6,.LC6@toc@l
li 0,-32
addi 8,8,.LC7@toc@l
lxvd2x 51,0,3
lxvd2x 34,0,5
addis 7,2,.LC4@toc@ha
stvx 31,1,0
lxvd2x 47,0,6
addis 9,2,.LC5@toc@ha
addi 7,7,.LC4@toc@l
lxvd2x 48,0,8
addi 9,9,.LC5@toc@l
vspltisw 17,8
vadduwm 17,17,17
lxvd2x 36,0,7
li 7,0
lxvd2x 37,0,9
mr 9,4
.p2align 4,,15
.L5:
/*
 * Main vector loop.  Each iteration reads 256 bytes (sixteen 16-byte
 * vectors, i.e. 64 floats) starting at r9, then advances r9 by 256 and r7
 * by 64.  The loop repeats while r10 > r7 (cmpd 7,10,7 below, bgt at the
 * end).
 */
addi 5,9,16
addi 6,9,32
lxvd2x 41,0,9
/* v31 = index offset (v3) + v15 (.LC6, 32 per word). */
vadduwm 31,3,15
addi 8,9,64
addi 31,9,48
addi 12,9,80
addi 3,9,96
lxvd2x 5,0,5
lxvd2x 43,0,6
addi 5,9,112
addi 6,9,128
lxvd2x 1,0,8
lxvd2x 9,0,31
addi 8,9,160
addi 31,9,144
lxvd2x 6,0,12
lxvd2x 13,0,3
addi 12,9,176
addi 3,9,192
lxvd2x 11,0,5
lxvd2x 2,0,6
xvabssp 41,41
addi 5,9,208
addi 6,9,224
lxvd2x 3,0,8
lxvd2x 7,0,31
addi 8,9,240
lxvd2x 10,0,12
lxvd2x 4,0,3
xvabssp 43,43
xvabssp 5,5
addi 7,7,64
lxvd2x 8,0,5
lxvd2x 0,0,6
xvabssp 9,9
xvabssp 1,1
cmpd 7,10,7
addi 9,9,256
lxvd2x 12,0,8
xvabssp 6,6
xvabssp 13,13
xvabssp 11,11
xvabssp 2,2
xvabssp 7,7
xvabssp 3,3
xvabssp 10,10
xvabssp 4,4
xvabssp 8,8
xvabssp 0,0
xvabssp 12,12
/*
 * Compare/select tree: each xvcmpgtsp builds a lane mask, one xxsel with
 * that mask keeps the larger value and another picks the matching index
 * constant (vs51/vs34 or vs36/vs37) or previously selected index vector.
 */
xvcmpgtsp 32,5,41
xvcmpgtsp 61,9,43
xvcmpgtsp 45,6,1
xvcmpgtsp 62,11,13
xvcmpgtsp 38,7,2
xvcmpgtsp 46,10,3
xvcmpgtsp 40,8,4
xvcmpgtsp 39,12,0
xxsel 5,41,5,32
xxsel 32,51,34,32
xxsel 9,43,9,61
xxsel 6,1,6,45
xxsel 11,13,11,62
xxsel 43,51,34,45
xxsel 7,2,7,38
xvcmpgtsp 41,9,5
xxsel 10,3,10,46
xvcmpgtsp 45,11,6
xxsel 8,4,8,40
xxsel 62,36,37,62
xxsel 0,0,12,39
xvcmpgtsp 42,10,7
xxsel 61,36,37,61
xxsel 40,51,34,40
xvcmpgtsp 33,0,8
xxsel 39,36,37,39
xxsel 38,51,34,38
xxsel 46,36,37,46
xxsel 9,5,9,41
xxsel 41,32,61,41
xxsel 12,6,11,45
xxsel 45,43,62,45
xxsel 11,7,10,42
xvcmpgtsp 32,12,9
/* v17 holds 16 per word: offset applied to the index vectors v13 and v1. */
vadduwm 13,13,17
xxsel 42,38,46,42
xxsel 0,8,0,33
xxsel 33,40,39,33
xvcmpgtsp 43,0,11
vadduwm 1,1,17
xxsel 12,9,12,32
xxsel 32,41,45,32
/* Add the running index offset v3 to v0, then advance v3 by v16 (.LC7). */
vadduwm 0,3,0
vadduwm 3,3,16
xxsel 0,11,0,43
xxsel 33,42,33,43
xvcmpgtsp 45,0,12
vadduwm 1,31,1
xxsel 0,12,0,45
xxsel 32,32,33,45
/*
 * Merge with the running result: mask = candidate (vs0) > running maximum
 * (vs44); vs50 (indices) and vs44 (values) are updated under the mask.
 */
xvcmpgtsp 33,0,44
xxsel 50,50,32,33
xxsel 44,44,0,33
bgt 7,.L5
/*
 * Horizontal reduction after the vector loop.  The four words of the running
 * maximum (vs44) are rotated out and converted to scalars in f12, f10,
 * f0/f11 and vs44; the four words of the index vector v18 (vs50) are moved
 * to r3/r8, r6/r31, r7/r0 and r9/r5 (zero-extended by rldicl).
 */
xxsldwi 12,44,44,1
xscvspdp 10,44
vspltw 0,18,0
xxsldwi 0,44,44,3
xscvspdp 12,12
mfvsrwz 3,50
mfvsrwz 6,32
vspltw 0,18,3
xscvspdp 0,0
xxsldwi 44,44,44,2
mfvsrwz 7,32
vspltw 0,18,2
xscvspdp 44,44
mfvsrwz 9,32
/* First pair: f12 vs f10.  Equal values go to .L71 (smaller index wins). */
fcmpu 7,12,10
rldicl 8,3,0,32
rldicl 31,6,0,32
fmr 11,0
rldicl 0,7,0,32
rldicl 5,9,0,32
beq 7,.L71
bnl 7,.L8
fmr 12,10
mr 8,31
.L8:
/* Second pair: f0 vs vs44; result value in f11, index in r5. */
xscmpudp 7,0,44
bne 7,.L11
cmplw 7,7,9
ble 7,.L12
mr 7,9
.L12:
rldicl 5,7,0,32
.L13:
/* Final: f12/r8 vs f11/r5; ties handled at .L72. */
fcmpu 7,12,11
beq 7,.L72
bnl 7,.L17
fmr 12,11
mr 8,5
.L17:
/*
 * Scalar tail: when count (r11) > r10 (the multiple of 64 consumed by the
 * vector loop), continue at element r10, i.e. byte offset r10 * 4 from r4.
 */
cmpd 7,11,10
ble 7,.L16
addi 7,10,1
sldi 9,10,2
cmpd 7,7,11
add 4,4,9
subf 9,10,11
mtctr 9
/* Trip-count guards: both branches to .L53 force ctr = 1. */
bgt 7,.L53
li 3,-1
rldicr 3,3,0,0
cmpd 7,11,3
beq 7,.L53
.p2align 4,,15
.L21:
lfs 0,0(4)
addi 4,4,4
fabs 0,0
/* Update only when the new value is strictly greater than f12. */
fcmpu 7,0,12
bng 7,.L19
fmr 12,0
mr 8,10
.L19:
addi 10,10,1
bdnz .L21
.L16:
/* Epilogue: restore r31 and v29..v31, return the 1-based index. */
li 0,-64
ld 31,-8(1)
addi 3,8,1
lvx 29,1,0
li 0,-48
lvx 30,1,0
li 0,-32
lvx 31,1,0
blr
.p2align 4,,15
.L71:
/* Equal values in the first pair: keep the smaller of the indices r3, r6. */
cmplw 7,3,6
ble 7,.L7
mr 3,6
.L7:
rldicl 8,3,0,32
b .L8
.p2align 4,,15
.L40:
/*
 * Strided path with fewer than four elements (reached from the entry code):
 * initialise f0 = 0.0, r6 = stride * 4, best index and position to 0, then
 * use the scalar loop at .L22.
 */
xxlxor 0,0,0
sldi 6,5,2
li 8,0
li 9,0
b .L22
.p2align 4,,15
.L11:
/* Second pair differs: f0 < vs44 takes vs44 (.L39), else index r0 with f0. */
blt 7,.L39
mr 5,0
b .L13
.p2align 4,,15
.L72:
/* Equal values in the final compare: keep the smaller index. */
cmpd 7,8,5
ble 7,.L17
mr 8,5
b .L17
.p2align 4,,15
.L39:
xscpsgndp 11,44,44
b .L13
.L53:
/* Out-of-line trip-count fixups: set ctr = 1 and rejoin the loop. */
li 9,1
mtctr 9
b .L21
.L54:
li 10,1
mtctr 10
b .L35
.L60:
li 9,1
mtctr 9
b .L61
.long 0
.byte 0,0,0,0,0,1,0,0
.size isamax_k,.-isamax_k
/*
 * 16-byte constants used by the vector path of isamax_k.
 * .LC2..LC5 hold the indices 0..15, four per vector; the setup code loads
 * them into vs51, vs34, vs36 and vs37.
 */
.section .rodata.cst16,"aM",@progbits,16
.align 4
.LC2:
.long 0
.long 1
.long 2
.long 3
.LC3:
.long 4
.long 5
.long 6
.long 7
.LC4:
.long 8
.long 9
.long 10
.long 11
.LC5:
.long 12
.long 13
.long 14
.long 15
/* 32 per word, loaded into vs47 (v15). */
.LC6:
.long 32
.long 32
.long 32
.long 32
/* 64 per word, loaded into vs48 (v16); added to the index offset per iteration. */
.LC7:
.long 64
.long 64
.long 64
.long 64
.ident "GCC: (SUSE Linux) 7.3.1 20180323 [gcc-7-branch revision 258812]"
.section .note.GNU-stack,"",@progbits

View File

@ -0,0 +1,397 @@
/*
 * isamax_k: returns in r3 the 1-based index of the element of a
 * single-precision vector with the largest absolute value, or 0 when the
 * count (r3) or the stride (r5) is not positive.
 * Inputs as read by the code: r3 = element count, r4 = pointer to 4-byte
 * floats, r5 = stride counted in elements.
 * The running maximum starts at 0.0 with best index 0, and updates use a
 * strict "greater than" compare.
 * This is compiler output (see .file below and .ident at the end); the
 * vector path uses lxv/stxv/vextuwrx.
 */
.file "isamax.c"
.abiversion 2
.section ".text"
.align 2
.p2align 4,,15
.globl isamax_k
.type isamax_k, @function
isamax_k:
.LCF0:
/* Global entry point: derive the TOC pointer r2 from r12. */
0: addis 2,12,.TOC.-.LCF0@ha
addi 2,2,.TOC.-.LCF0@l
.localentry isamax_k,.-isamax_k
/* r11 = count; return 0 when count <= 0 or stride <= 0. */
mr. 11,3
ble 0,.L36
cmpdi 7,5,0
li 3,0
blelr 7
/* Stride 1 takes the contiguous path at .L69. */
cmpdi 7,5,1
beq 7,.L69
/* r7 = count with the low 2 bits cleared; zero skips the unrolled loop. */
rldicr. 7,11,0,61
beq 0,.L40
/*
 * Strided loop unrolled by four.  Byte offsets: r6 = stride * 4,
 * r3 = stride * 8, r5 = stride * 12, r0 = stride * 16.
 * f0 = 0.0 is the running maximum, r9 the cursor, r8 the best index,
 * r10 the index of the first element of the current group.
 */
sldi 10,5,1
sldi 6,5,2
sldi 0,5,4
sldi 3,5,3
mr 9,4
xxlxor 0,0,0
li 8,0
add 5,10,5
li 10,0
sldi 5,5,2
.p2align 4,,15
.L31:
lfs 12,0(9)
fabs 12,12
fcmpu 7,12,0
bng 7,.L23
fmr 0,12
mr 8,10
.L23:
lfsx 12,9,6
fabs 12,12
fcmpu 7,12,0
bng 7,.L25
fmr 0,12
addi 8,10,1
.L25:
lfsx 12,9,3
fabs 12,12
fcmpu 7,12,0
bng 7,.L27
fmr 0,12
addi 8,10,2
.L27:
lfsx 12,9,5
add 9,9,0
fabs 12,12
fcmpu 7,12,0
bng 7,.L29
fmr 0,12
addi 8,10,3
.L29:
addi 10,10,4
cmpd 7,7,10
bgt 7,.L31
/*
 * r7 = number of groups executed ((r7 - 1) / 4 + 1); r9 = elements consumed
 * (groups * 4); r7 = r6 * groups, scaled by 4 again at .L22 to give the
 * byte offset of the first remaining element.
 */
addi 7,7,-1
srdi 7,7,2
addi 7,7,1
sldi 9,7,2
mulld 7,6,7
cmpd 7,11,9
ble 7,.L67
.L22:
/* Strided scalar tail over the remaining count - r9 elements. */
addi 10,9,1
sldi 7,7,2
subf 5,9,11
cmpd 7,10,11
mtctr 5
add 4,4,7
/* Trip-count guards: both branches to .L54 force ctr = 1. */
bgt 7,.L54
li 3,-1
rldicr 3,3,0,0
cmpd 7,11,3
beq 7,.L54
.p2align 4,,15
.L35:
lfs 12,0(4)
add 4,4,6
fabs 12,12
fcmpu 7,12,0
bng 7,.L33
fmr 0,12
mr 8,9
.L33:
addi 9,9,1
bdnz .L35
.L67:
/* Convert the 0-based best index to 1-based and return. */
addi 3,8,1
blr
.p2align 4,,15
.L36:
li 3,0
blr
.p2align 4,,15
.L69:
/*
 * Contiguous path: r10 = count with the low 6 bits cleared (multiple of
 * 64).  A non-zero r10 goes to the vector code at .L70; otherwise the
 * scalar loop below walks all elements with f12 = 0.0 as running maximum.
 */
rldicr. 10,11,0,57
bne 0,.L70
addi 7,10,1
sldi 9,10,2
subf 6,10,11
li 8,0
xxlxor 12,12,12
cmpd 7,7,11
mtctr 6
add 4,4,9
/* Trip-count guards: both branches to .L60 force ctr = 1. */
bgt 7,.L60
li 3,-1
rldicr 3,3,0,0
cmpd 7,11,3
beq 7,.L60
.p2align 4,,15
.L61:
lfs 0,0(4)
addi 4,4,4
fabs 0,0
fcmpu 7,0,12
bng 7,.L63
fmr 12,0
mr 8,10
.L63:
addi 10,10,1
bdnz .L61
b .L67
.p2align 4,,15
.L70:
/*
 * Vector path setup (instructions are interleaved by the scheduler):
 *  - vs59..vs63 (v27..v31) are stored at -80..-16(r1), below the stack
 *    pointer; r1 itself is not adjusted.  They are reloaded at .L16.
 *  - vs46 (v14) = 0 is the per-lane best index, vs34 (v2) = 0 the per-lane
 *    running maximum, vs51 (v19) = 0 the index offset advanced by the loop.
 *  - v29 = 32, v31 = 16 and v30 = 64 in every word (xxspltib, vextsb2w).
 *  - .LC2, .LC3, .LC4, .LC5 are loaded into vs47, vs48, vs49, vs50.
 *  - r9 = r4 is the load cursor, r8 = 0 counts processed elements.
 */
addis 6,2,.LC2@toc@ha
addis 7,2,.LC3@toc@ha
addis 8,2,.LC4@toc@ha
addis 9,2,.LC5@toc@ha
xxspltib 46,0
stxv 61,-48(1)
stxv 62,-32(1)
addi 6,6,.LC2@toc@l
addi 7,7,.LC3@toc@l
stxv 63,-16(1)
xxspltib 61,32
xxspltib 63,16
xxspltib 62,64
addi 8,8,.LC4@toc@l
addi 9,9,.LC5@toc@l
lxv 47,0(6)
xxspltib 34,0
lxv 48,0(7)
xxlor 51,46,46
lxv 49,0(8)
lxv 50,0(9)
li 8,0
mr 9,4
vextsb2w 29,29
vextsb2w 31,31
vextsb2w 30,30
stxv 59,-80(1)
stxv 60,-64(1)
.p2align 4,,15
.L5:
/*
 * Main vector loop.  Each iteration reads 256 bytes (sixteen 16-byte
 * vectors, i.e. 64 floats); r9 is advanced by 256 early, so later loads use
 * negative displacements.  r8 counts elements in steps of 64 and the loop
 * repeats while r10 > r8.
 * xvabssp takes absolute values; each xvcmpgtsp builds a lane mask that one
 * xxsel uses to keep the larger value and another xxsel uses to pick the
 * matching index constant (vs47/vs48 or vs49/vs50) or previously selected
 * index vector.
 */
lxv 0,0(9)
/* v27 = index offset (v19) + v29 (32 per word). */
vadduwm 27,19,29
lxv 12,240(9)
addi 8,8,64
addi 9,9,256
cmpd 7,10,8
xvabssp 44,0
lxv 0,-240(9)
xvabssp 12,12
xvabssp 5,0
lxv 0,-224(9)
xvabssp 32,0
lxv 0,-208(9)
xvcmpgtsp 35,5,44
xvabssp 9,0
lxv 0,-192(9)
xxsel 5,44,5,35
xxsel 35,47,48,35
xvabssp 1,0
lxv 0,-176(9)
xvcmpgtsp 60,9,32
xvabssp 6,0
lxv 0,-160(9)
xxsel 9,32,9,60
xxsel 60,49,50,60
xvabssp 13,0
lxv 0,-144(9)
xvcmpgtsp 42,9,5
xvcmpgtsp 37,6,1
xvabssp 11,0
lxv 0,-128(9)
xxsel 9,5,9,42
xxsel 42,35,60,42
xxsel 6,1,6,37
xxsel 37,47,48,37
xvabssp 2,0
lxv 0,-112(9)
xvcmpgtsp 36,11,13
xvabssp 7,0
lxv 0,-96(9)
xxsel 11,13,11,36
xxsel 36,49,50,36
xvabssp 3,0
lxv 0,-80(9)
xvcmpgtsp 45,11,6
xvcmpgtsp 39,7,2
xvabssp 10,0
lxv 0,-64(9)
xxsel 7,2,7,39
xxsel 39,47,48,39
xvabssp 4,0
lxv 0,-48(9)
xvcmpgtsp 38,10,3
xvabssp 8,0
lxv 0,-32(9)
xxsel 10,3,10,38
xxsel 38,49,50,38
xvabssp 0,0
xvcmpgtsp 43,10,7
xvcmpgtsp 41,8,4
xvcmpgtsp 40,12,0
xxsel 8,4,8,41
xxsel 41,47,48,41
xxsel 0,0,12,40
xxsel 12,6,11,45
xxsel 11,7,10,43
xxsel 45,37,36,45
xvcmpgtsp 33,0,8
xvcmpgtsp 32,12,9
/* v31 holds 16 per word: offset applied to the index vectors v13 and v1. */
vadduwm 13,13,31
xxsel 40,49,50,40
xxsel 43,39,38,43
xxsel 0,8,0,33
xxsel 12,9,12,32
xxsel 33,41,40,33
xxsel 32,42,45,32
xvcmpgtsp 44,0,11
vadduwm 1,1,31
/* Add the running index offset v19 to v0, then advance v19 by v30 (64). */
vadduwm 0,19,0
vadduwm 19,19,30
xxsel 0,11,0,44
xxsel 33,43,33,44
xvcmpgtsp 45,0,12
vadduwm 1,27,1
xxsel 0,12,0,45
xxsel 32,32,33,45
/*
 * Merge with the running result: mask = candidate (vs0) > running maximum
 * (vs34); vs46 (indices) and vs34 (values) are updated under the mask.
 */
xvcmpgtsp 33,0,34
xxsel 46,46,32,33
xxsel 34,34,0,33
bgt 7,.L5
/*
 * Horizontal reduction after the vector loop.  The four words of the running
 * maximum (vs34) are rotated out and converted to scalars in f12, f11,
 * f0/f10 and vs34; the four words of the index vector v14 are extracted with
 * vextuwrx at byte offsets 0, 4, 8 and 12 and zero-extended by rldicl.
 */
xxsldwi 12,34,34,3
xxsldwi 11,34,34,2
li 9,0
li 8,12
xxsldwi 0,34,34,1
xscvspdp 34,34
vextuwrx 3,9,14
li 9,4
xscvspdp 12,12
xscvspdp 11,11
xscvspdp 0,0
vextuwrx 6,9,14
li 9,8
vextuwrx 7,9,14
vextuwrx 9,8,14
rldicl 12,6,0,32
rldicl 8,3,0,32
rldicl 0,7,0,32
rldicl 5,9,0,32
/* First pair: f12 vs f11.  Equal values go to .L71 (smaller index wins). */
fcmpu 7,12,11
fmr 10,0
beq 7,.L71
bnl 7,.L8
mr 8,12
fmr 12,11
.L8:
/* Second pair: f0 vs vs34; result value in f10, index in r5. */
xscmpudp 7,0,34
bne 7,.L11
cmplw 7,7,9
ble 7,.L12
mr 7,9
.L12:
rldicl 5,7,0,32
.L13:
/* Final: f12/r8 vs f10/r5; ties handled at .L72. */
fcmpu 7,12,10
beq 7,.L72
bnl 7,.L17
mr 8,5
fmr 12,10
.L17:
/*
 * Scalar tail: when count (r11) > r10 (the multiple of 64 consumed by the
 * vector loop), continue at element r10, i.e. byte offset r10 * 4 from r4.
 */
cmpd 7,11,10
ble 7,.L16
addi 7,10,1
sldi 9,10,2
subf 6,10,11
cmpd 7,7,11
mtctr 6
add 4,4,9
/* Trip-count guards: both branches to .L53 force ctr = 1. */
bgt 7,.L53
li 3,-1
rldicr 3,3,0,0
cmpd 7,11,3
beq 7,.L53
.p2align 4,,15
.L21:
lfs 0,0(4)
addi 4,4,4
fabs 0,0
/* Update only when the new value is strictly greater than f12. */
fcmpu 7,0,12
bng 7,.L19
fmr 12,0
mr 8,10
.L19:
addi 10,10,1
bdnz .L21
.L16:
/* Epilogue: reload vs59..vs63 and return the 1-based index. */
lxv 59,-80(1)
lxv 60,-64(1)
addi 3,8,1
lxv 61,-48(1)
lxv 62,-32(1)
lxv 63,-16(1)
blr
.p2align 4,,15
.L71:
/* Equal values in the first pair: keep the smaller of the indices r3, r6. */
cmplw 7,3,6
ble 7,.L7
mr 3,6
.L7:
rldicl 8,3,0,32
b .L8
.p2align 4,,15
.L40:
/*
 * Strided path with fewer than four elements (reached from the entry code):
 * initialise r6 = stride * 4, best index and position to 0, f0 = 0.0, then
 * use the scalar loop at .L22.
 */
sldi 6,5,2
li 8,0
li 9,0
xxlxor 0,0,0
b .L22
.p2align 4,,15
.L11:
/* Second pair differs: f0 < vs34 takes vs34 (.L39), else index r0 with f0. */
blt 7,.L39
mr 5,0
b .L13
.p2align 4,,15
.L72:
/* Equal values in the final compare: keep the smaller index. */
cmpd 7,8,5
ble 7,.L17
mr 8,5
b .L17
.p2align 4,,15
.L39:
xscpsgndp 10,34,34
b .L13
.L53:
/* Out-of-line trip-count fixups: set ctr = 1 and rejoin the loop. */
li 9,1
mtctr 9
b .L21
.L54:
li 10,1
mtctr 10
b .L35
.L60:
li 9,1
mtctr 9
b .L61
.long 0
.byte 0,0,0,0,0,0,0,0
.size isamax_k,.-isamax_k
/*
 * 16-byte constants used by the vector path of isamax_k: the indices 0..15,
 * four per vector, loaded by the setup code into vs47, vs48, vs49 and vs50.
 */
.section .rodata.cst16,"aM",@progbits,16
.align 4
.LC2:
.long 0
.long 1
.long 2
.long 3
.LC3:
.long 4
.long 5
.long 6
.long 7
.LC4:
.long 8
.long 9
.long 10
.long 11
.LC5:
.long 12
.long 13
.long 14
.long 15
.ident "GCC: (SUSE Linux) 7.3.1 20180323 [gcc-7-branch revision 258812]"
.section .note.GNU-stack,"",@progbits

View File

@ -0,0 +1,417 @@
/* .file "isamin.c"
.abiversion 2
.section ".text"
.align 2
.p2align 4,,15
.globl isamin_k
.type isamin_k, @function
*/
/*
 * isamin_k: returns in r3 the 1-based index of the element of a
 * single-precision vector with the smallest absolute value, or 0 when the
 * count (r3) or the stride (r5) is not positive.
 * Inputs as read by the code: r3 = element count, r4 = pointer to 4-byte
 * floats, r5 = stride counted in elements.
 * The running minimum is seeded with |x[0]| and best index 0.
 * NOTE(review): the commented-out directives above and the .ident at the end
 * of the file suggest this is GCC output for isamin.c wrapped for the C
 * preprocessor; presumably the pre-POWER9 build - confirm.
 */
#define ASSEMBLER
#include "common.h"
PROLOGUE
isamin_k:
.LCF0:
/* Global entry point: derive the TOC pointer r2 from r12. */
0: addis 2,12,.TOC.-.LCF0@ha
addi 2,2,.TOC.-.LCF0@l
.localentry isamin_k,.-isamin_k
/* r11 = count; return 0 when count <= 0 or stride <= 0. */
mr. 11,3
ble 0,.L36
cmpdi 7,5,0
li 3,0
blelr 7
/*
 * f0 = |x[0]|.  v30 and v31 are stored at -48 and -32(r1) here, before the
 * path split, and restored on every return below.
 */
lfs 0,0(4)
li 0,-48
cmpdi 7,5,1
stvx 30,1,0
li 0,-32
stvx 31,1,0
fabs 0,0
/* Stride 1 takes the contiguous path at .L62. */
beq 7,.L62
/* r6 = count with the low 2 bits cleared; zero skips the unrolled loop. */
rldicr. 6,11,0,61
beq 0,.L40
/*
 * Strided loop unrolled by four; r31 is saved at -8(r1) first.
 * Byte offsets: r12 = stride * 4, r5 = stride * 8, r0 = stride * 12,
 * r3 = stride * 16, r31 = -(stride * 4).  r7 points at element 1,
 * r9 = index of the first element of the group, r10 = best index.
 */
sldi 0,5,1
sldi 12,5,2
std 31,-8(1)
add 0,0,5
neg 31,5
sldi 3,5,4
sldi 0,0,2
add 7,4,12
sldi 31,31,2
sldi 5,5,3
li 9,0
li 10,0
b .L24
.p2align 4,,15
.L41:
/* The first element of the next group was smaller: it becomes the best. */
mr 10,9
.L25:
fmr 0,12
add 7,7,3
.L24:
/* Element r9 + 1 (at r7). */
lfs 12,0(7)
fabs 12,12
fcmpu 7,12,0
bnl 7,.L26
fmr 0,12
addi 10,9,1
.L26:
/* r8 = address of element r9; element r9 + 2 is at r8 + r5. */
add 8,31,7
lfsx 12,8,5
fabs 12,12
fcmpu 7,12,0
bnl 7,.L28
fmr 0,12
addi 10,9,2
.L28:
/* Element r9 + 3 (at r8 + r0). */
lfsx 12,8,0
fabs 12,12
fcmpu 7,12,0
bnl 7,.L30
fmr 0,12
addi 10,9,3
.L30:
addi 9,9,4
cmpd 7,6,9
ble 7,.L63
/* First element of the next group (at r8 + r3), tested before looping. */
lfsx 12,8,3
fabs 12,12
fcmpu 7,12,0
blt 7,.L41
fmr 12,0
b .L25
.p2align 4,,15
.L36:
li 3,0
blr
.p2align 4,,15
.L63:
/*
 * r6 = number of groups executed ((r6 - 1) / 4 + 1); r31 is restored;
 * r9 = elements consumed (groups * 4); r6 = r12 * groups, scaled by 4 again
 * at .L23 to give the byte offset of the first remaining element.
 */
addi 6,6,-1
ld 31,-8(1)
srdi 6,6,2
addi 6,6,1
sldi 9,6,2
mulld 6,12,6
cmpd 7,11,9
ble 7,.L33
.L23:
/* Strided scalar tail over the remaining count - r9 elements. */
addi 8,9,1
sldi 6,6,2
cmpd 7,8,11
subf 8,9,11
mtctr 8
add 4,4,6
/* Trip-count guards: both branches to .L52 force ctr = 1. */
bgt 7,.L52
li 3,-1
rldicr 3,3,0,0
cmpd 7,11,3
beq 7,.L52
.p2align 4,,15
.L35:
lfs 12,0(4)
add 4,4,12
fabs 12,12
fcmpu 7,12,0
bnl 7,.L34
fmr 0,12
mr 10,9
.L34:
addi 9,9,1
bdnz .L35
.L33:
/* Epilogue: restore v30/v31 and return the 1-based index. */
li 0,-48
addi 3,10,1
lvx 30,1,0
li 0,-32
lvx 31,1,0
blr
.p2align 4,,15
.L62:
/*
 * Contiguous path: r8 = count with the low 6 bits cleared (multiple of
 * 64).  A non-zero r8 goes to the vector code at .L64, which later returns
 * to .L4 for the remainder.
 */
rldicr. 8,11,0,57
li 10,0
bne 0,.L64
.L4:
/* Scalar loop from element r8 (byte offset r8 * 4) to the end. */
addi 7,8,1
sldi 9,8,2
cmpd 7,7,11
add 4,4,9
subf 9,8,11
mtctr 9
/* Trip-count guards: both branches to .L51 force ctr = 1. */
bgt 7,.L51
li 3,-1
rldicr 3,3,0,0
cmpd 7,11,3
beq 7,.L51
.p2align 4,,15
.L22:
lfs 12,0(4)
addi 4,4,4
fabs 12,12
/* Update only when the running minimum f0 is strictly greater. */
fcmpu 7,0,12
bng 7,.L21
fmr 0,12
mr 10,8
.L21:
addi 8,8,1
bdnz .L22
li 0,-48
addi 3,10,1
lvx 30,1,0
li 0,-32
lvx 31,1,0
blr
.p2align 4,,15
.L64:
/*
 * Vector path setup (instructions are interleaved by the scheduler):
 *  - vs4 = absolute values of the first four floats at r4; it is the
 *    per-lane running minimum and is also copied to vs40 as the first
 *    operand of the loop.
 *  - r31 is saved at -8(r1); it is reloaded at .L19.
 *  - v2 = 0 is the index offset advanced by the loop.
 *  - .LC2 is loaded into vs51 (the per-lane best index, so it starts as the
 *    contents of .LC2) and copied to vs38; .LC3, .LC4, .LC5 go to vs35,
 *    vs36, vs37 and .LC6, .LC7 to vs48 (v16), vs49 (v17).
 *  - v18 = 8 + 8 = 16 in every word.
 *  - r9 = r4 is the load cursor, r7 = 0 counts processed elements.
 */
lxvd2x 4,0,4
addis 10,2,.LC2@toc@ha
addis 5,2,.LC3@toc@ha
std 31,-8(1)
vspltisw 2,0
addi 10,10,.LC2@toc@l
addis 7,2,.LC4@toc@ha
addis 9,2,.LC5@toc@ha
addis 6,2,.LC6@toc@ha
lxvd2x 51,0,10
addis 10,2,.LC7@toc@ha
addi 7,7,.LC4@toc@l
addi 9,9,.LC5@toc@l
addi 5,5,.LC3@toc@l
xvabssp 4,4
addi 6,6,.LC6@toc@l
addi 10,10,.LC7@toc@l
lxvd2x 36,0,7
vspltisw 18,8
lxvd2x 37,0,9
lxvd2x 35,0,5
mr 9,4
li 7,0
lxvd2x 48,0,6
lxvd2x 49,0,10
vadduwm 18,18,18
xxlor 38,51,51
xxlor 40,4,4
b .L6
.p2align 4,,15
.L65:
/* Loop re-entry: load the first vector of the next 256-byte block into vs40. */
lxvd2x 5,0,9
xvabssp 40,5
.L6:
/*
 * Main vector loop.  Each iteration covers 256 bytes (64 floats) starting
 * at r9: the first vector is already in vs40, the other fifteen are loaded
 * here.  r9 advances by 256 and r7 by 64; the loop repeats (via .L65) while
 * r8 > r7.
 */
addi 5,9,16
addi 6,9,32
/* v14 = index offset (v2) + v16 (.LC6, 32 per word). */
vadduwm 14,2,16
addi 10,9,64
addi 12,9,48
addi 31,9,80
addi 3,9,96
lxvd2x 5,0,5
lxvd2x 42,0,6
addi 5,9,112
addi 6,9,128
lxvd2x 44,0,10
lxvd2x 9,0,12
addi 10,9,160
addi 12,9,144
lxvd2x 6,0,31
lxvd2x 1,0,3
addi 31,9,176
addi 3,9,192
lxvd2x 11,0,5
lxvd2x 13,0,6
addi 5,9,208
addi 6,9,224
lxvd2x 2,0,10
lxvd2x 7,0,12
addi 10,9,240
lxvd2x 10,0,31
lxvd2x 3,0,3
xvabssp 42,42
xvabssp 5,5
addi 7,7,64
lxvd2x 8,0,5
lxvd2x 0,0,6
xvabssp 44,44
xvabssp 9,9
cmpd 7,8,7
addi 9,9,256
lxvd2x 12,0,10
xvabssp 6,6
xvabssp 1,1
xvabssp 11,11
xvabssp 13,13
xvabssp 7,7
xvabssp 2,2
xvabssp 10,10
xvabssp 3,3
xvabssp 8,8
xvabssp 0,0
xvabssp 12,12
/*
 * Compare/select tree: each xvcmpgtsp builds a lane mask, one xxsel with
 * that mask keeps the smaller value and another picks the matching index
 * constant (vs38/vs35 or vs36/vs37) or previously selected index vector.
 */
xvcmpgtsp 32,40,5
xvcmpgtsp 62,42,9
xvcmpgtsp 45,44,6
xvcmpgtsp 63,1,11
xvcmpgtsp 39,13,7
xvcmpgtsp 47,2,10
xvcmpgtsp 41,3,8
xvcmpgtsp 33,0,12
xxsel 5,40,5,32
xxsel 32,38,35,32
xxsel 9,42,9,62
xxsel 6,44,6,45
xxsel 11,1,11,63
xxsel 44,38,35,45
xxsel 7,13,7,39
xvcmpgtsp 42,5,9
xxsel 10,2,10,47
xvcmpgtsp 45,6,11
xxsel 8,3,8,41
xxsel 63,36,37,63
xxsel 0,0,12,33
xvcmpgtsp 43,7,10
xxsel 40,36,37,33
xxsel 62,36,37,62
xvcmpgtsp 33,8,0
xxsel 41,38,35,41
xxsel 39,38,35,39
xxsel 47,36,37,47
xxsel 9,5,9,42
xxsel 42,32,62,42
xxsel 12,6,11,45
xxsel 45,44,63,45
xxsel 11,7,10,43
xvcmpgtsp 32,9,12
/* v18 holds 16 per word: offset applied to the index vectors v13 and v1. */
vadduwm 13,13,18
xxsel 43,39,47,43
xxsel 0,8,0,33
xxsel 33,41,40,33
xvcmpgtsp 44,11,0
vadduwm 1,1,18
xxsel 12,9,12,32
xxsel 32,42,45,32
/* Add the running index offset v2 to v0, then advance v2 by v17 (.LC7). */
vadduwm 0,2,0
vadduwm 2,2,17
xxsel 0,11,0,44
xxsel 33,43,33,44
xvcmpgtsp 45,12,0
vadduwm 1,14,1
xxsel 0,12,0,45
xxsel 32,32,33,45
/*
 * Merge with the running result: mask = running minimum (vs4) > candidate
 * (vs0); vs51 (indices) and vs4 (values) are updated under the mask.
 */
xvcmpgtsp 33,4,0
xxsel 51,51,32,33
xxsel 4,4,0,33
bgt 7,.L65
/*
 * Horizontal reduction after the vector loop.  The four words of the running
 * minimum (vs4) are rotated out and converted to scalars in f0, f10, f12/f11
 * and f4; the four words of the index vector v19 (vs51) are moved to r3/r10,
 * r6/r31, r7/r5 and r9/r0 (zero-extended by rldicl).
 */
xxsldwi 0,4,4,1
xscvspdp 10,4
vspltw 0,19,0
xxsldwi 12,4,4,3
xscvspdp 0,0
mfvsrwz 3,51
mfvsrwz 6,32
vspltw 0,19,3
xscvspdp 12,12
xxsldwi 4,4,4,2
mfvsrwz 7,32
vspltw 0,19,2
xscvspdp 4,4
mfvsrwz 9,32
/* First pair: f0 vs f10.  Equal values go to .L66 (smaller index wins). */
fcmpu 7,0,10
rldicl 10,3,0,32
rldicl 31,6,0,32
fmr 11,12
rldicl 5,7,0,32
rldicl 0,9,0,32
beq 7,.L66
bng 7,.L9
fmr 0,10
mr 10,31
.L9:
/* Second pair: f12 vs f4; result value in f11, index in r5. */
fcmpu 7,12,4
bne 7,.L12
cmplw 7,7,9
ble 7,.L13
mr 7,9
.L13:
rldicl 5,7,0,32
.L14:
/* Final: f0/r10 vs f11/r5; ties handled at .L67. */
fcmpu 7,0,11
beq 7,.L67
bng 7,.L19
fmr 0,11
mr 10,5
.L19:
/*
 * Restore r31, then either continue with the scalar loop at .L4 for the
 * elements past r8 (count > r8) or go to the epilogue at .L33.
 */
cmpd 7,11,8
ld 31,-8(1)
bgt 7,.L4
b .L33
.p2align 4,,15
.L66:
/* Equal values in the first pair: keep the smaller of the indices r3, r6. */
cmplw 7,3,6
ble 7,.L8
mr 3,6
.L8:
rldicl 10,3,0,32
b .L9
.p2align 4,,15
.L40:
/*
 * Strided path with fewer than four elements (reached from the entry code):
 * r12 = stride * 4, best index and position 0, then the loop at .L23.
 */
sldi 12,5,2
li 10,0
li 9,0
b .L23
.p2align 4,,15
.L12:
/* Second pair differs: take f4 and index r0 only when f12 > f4. */
bng 7,.L14
fmr 11,4
mr 5,0
b .L14
.p2align 4,,15
.L67:
/* Equal values in the final compare: keep the smaller index. */
cmpd 7,10,5
ble 7,.L19
mr 10,5
b .L19
.L51:
/* Out-of-line trip-count fixups: set ctr = 1 and rejoin the loop. */
li 9,1
mtctr 9
b .L22
.L52:
li 8,1
mtctr 8
b .L35
.long 0
.byte 0,0,0,0,0,1,0,0
.size isamin_k,.-isamin_k
/*
 * 16-byte constants used by the vector path of isamin_k.
 * .LC2..LC5 hold the indices 0..15, four per vector; the setup code loads
 * them into vs51 (also copied to vs38), vs35, vs36 and vs37.
 */
.section .rodata.cst16,"aM",@progbits,16
.align 4
.LC2:
.long 0
.long 1
.long 2
.long 3
.LC3:
.long 4
.long 5
.long 6
.long 7
.LC4:
.long 8
.long 9
.long 10
.long 11
.LC5:
.long 12
.long 13
.long 14
.long 15
/* 32 per word, loaded into vs48 (v16). */
.LC6:
.long 32
.long 32
.long 32
.long 32
/* 64 per word, loaded into vs49 (v17); added to the index offset per iteration. */
.LC7:
.long 64
.long 64
.long 64
.long 64
.ident "GCC: (SUSE Linux) 7.3.1 20180323 [gcc-7-branch revision 258812]"
.section .note.GNU-stack,"",@progbits

View File

@ -0,0 +1,382 @@
/*
 * isamin_k: returns in r3 the 1-based index of the element of a
 * single-precision vector with the smallest absolute value, or 0 when the
 * count (r3) or the stride (r5) is not positive.
 * Inputs as read by the code: r3 = element count, r4 = pointer to 4-byte
 * floats, r5 = stride counted in elements.
 * The running minimum is seeded with |x[0]| and best index 0.
 * This is compiler output (see .file below and .ident at the end); the
 * vector path uses lxv/stxv/vextuwrx.
 */
.file "isamin.c"
.abiversion 2
.section ".text"
.align 2
.p2align 4,,15
.globl isamin_k
.type isamin_k, @function
isamin_k:
.LCF0:
/* Global entry point: derive the TOC pointer r2 from r12. */
0: addis 2,12,.TOC.-.LCF0@ha
addi 2,2,.TOC.-.LCF0@l
.localentry isamin_k,.-isamin_k
/* r11 = count; return 0 when count <= 0 or stride <= 0. */
mr. 11,3
ble 0,.L36
cmpdi 7,5,0
li 3,0
blelr 7
/*
 * f0 = |x[0]|.  vs61..vs63 are stored at -64, -48, -32(r1) here, before the
 * path split, and reloaded on every return below.
 */
lfs 0,0(4)
cmpdi 7,5,1
stxv 61,-64(1)
stxv 62,-48(1)
stxv 63,-32(1)
fabs 0,0
/* Stride 1 takes the contiguous path at .L62. */
beq 7,.L62
/* r6 = count with the low 2 bits cleared; zero skips the unrolled loop. */
rldicr. 6,11,0,61
beq 0,.L40
/*
 * Strided loop unrolled by four; r31 is saved at -8(r1) first.
 * Byte offsets: r0 = stride * 4, r31 = stride * 8, r5 = stride * 12,
 * r3 = stride * 16, r12 = -(stride * 4).  r7 points at element 1,
 * r9 = index of the first element of the group, r10 = best index.
 */
sldi 8,5,1
sldi 0,5,2
neg 12,5
std 31,-8(1)
sldi 3,5,4
sldi 31,5,3
li 9,0
li 10,0
add 5,8,5
add 7,4,0
sldi 12,12,2
sldi 5,5,2
b .L24
.p2align 4,,15
.L41:
/* The first element of the next group was smaller: it becomes the best. */
mr 10,9
.L25:
add 7,7,3
fmr 0,12
.L24:
/* Element r9 + 1 (at r7). */
lfs 12,0(7)
fabs 12,12
fcmpu 7,12,0
bnl 7,.L26
fmr 0,12
addi 10,9,1
.L26:
/* r8 = address of element r9; element r9 + 2 is at r8 + r31. */
add 8,7,12
lfsx 12,8,31
fabs 12,12
fcmpu 7,12,0
bnl 7,.L28
fmr 0,12
addi 10,9,2
.L28:
/* Element r9 + 3 (at r8 + r5). */
lfsx 12,8,5
fabs 12,12
fcmpu 7,12,0
bnl 7,.L30
fmr 0,12
addi 10,9,3
.L30:
addi 9,9,4
cmpd 7,6,9
ble 7,.L63
/* First element of the next group (at r8 + r3), tested before looping. */
lfsx 12,8,3
fabs 12,12
fcmpu 7,12,0
blt 7,.L41
fmr 12,0
b .L25
.p2align 4,,15
.L36:
li 3,0
blr
.p2align 4,,15
.L63:
/*
 * r6 = number of groups executed ((r6 - 1) / 4 + 1); r31 is restored;
 * r9 = elements consumed (groups * 4); r6 = r0 * groups, scaled by 4 again
 * at .L23 to give the byte offset of the first remaining element.
 */
addi 6,6,-1
ld 31,-8(1)
srdi 6,6,2
addi 6,6,1
sldi 9,6,2
mulld 6,0,6
cmpd 7,11,9
ble 7,.L33
.L23:
/* Strided scalar tail over the remaining count - r9 elements. */
addi 8,9,1
sldi 6,6,2
subf 7,9,11
cmpd 7,8,11
mtctr 7
add 4,4,6
/* Trip-count guards: both branches to .L52 force ctr = 1. */
bgt 7,.L52
li 3,-1
rldicr 3,3,0,0
cmpd 7,11,3
beq 7,.L52
.p2align 4,,15
.L35:
lfs 12,0(4)
add 4,4,0
fabs 12,12
fcmpu 7,12,0
bnl 7,.L34
fmr 0,12
mr 10,9
.L34:
addi 9,9,1
bdnz .L35
.L33:
/* Epilogue: reload vs61..vs63 and return the 1-based index. */
lxv 61,-64(1)
lxv 62,-48(1)
addi 3,10,1
lxv 63,-32(1)
blr
.p2align 4,,15
.L62:
/*
 * Contiguous path: r8 = count with the low 6 bits cleared (multiple of
 * 64).  A non-zero r8 goes to the vector code at .L64, which later returns
 * to .L4 for the remainder.
 */
rldicr. 8,11,0,57
li 10,0
bne 0,.L64
.L4:
/* Scalar loop from element r8 (byte offset r8 * 4) to the end. */
addi 7,8,1
sldi 9,8,2
subf 6,8,11
cmpd 7,7,11
mtctr 6
add 4,4,9
/* Trip-count guards: both branches to .L51 force ctr = 1. */
bgt 7,.L51
li 3,-1
rldicr 3,3,0,0
cmpd 7,11,3
beq 7,.L51
.p2align 4,,15
.L22:
lfs 12,0(4)
addi 4,4,4
fabs 12,12
/* Update only when the running minimum f0 is strictly greater. */
fcmpu 7,0,12
bng 7,.L21
fmr 0,12
mr 10,8
.L21:
addi 8,8,1
bdnz .L22
lxv 61,-64(1)
lxv 62,-48(1)
addi 3,10,1
lxv 63,-32(1)
blr
.p2align 4,,15
.L64:
/*
 * Vector path setup (instructions are interleaved by the scheduler):
 *  - vs4 = absolute values of the first four floats at r4; it is the
 *    per-lane running minimum and is also copied to vs40 as the first
 *    operand of the loop.
 *  - r31 is saved at -8(r1); it is reloaded at .L19.
 *  - v15 = 16, v31 = 32 and v14 = 64 in every word (xxspltib, vextsb2w).
 *  - vs50 (v18) = 0 is the index offset advanced by the loop.
 *  - .LC2 is loaded into vs48 (the per-lane best index, so it starts as the
 *    contents of .LC2) and copied to vs35; .LC3, .LC4, .LC5 go to vs49,
 *    vs51, vs34.
 *  - r9 = r4 is the load cursor, r10 = 0 counts processed elements.
 */
lxv 0,0(4)
xxspltib 47,16
addis 6,2,.LC2@toc@ha
addis 7,2,.LC3@toc@ha
addis 10,2,.LC4@toc@ha
addis 9,2,.LC5@toc@ha
xxspltib 63,32
xxspltib 46,64
addi 6,6,.LC2@toc@l
addi 10,10,.LC4@toc@l
addi 7,7,.LC3@toc@l
std 31,-8(1)
addi 9,9,.LC5@toc@l
xxspltib 50,0
vextsb2w 15,15
lxv 48,0(6)
lxv 51,0(10)
vextsb2w 31,31
vextsb2w 14,14
xvabssp 4,0
lxv 34,0(9)
lxv 49,0(7)
mr 9,4
li 10,0
xxlor 35,48,48
xxlor 40,4,4
b .L6
.p2align 4,,15
.L65:
/* Loop re-entry: load the first vector of the next 256-byte block into vs40. */
lxv 0,0(9)
xvabssp 40,0
.L6:
/*
 * Main vector loop.  Each iteration covers 256 bytes (64 floats) starting
 * at r9: the first vector is already in vs40, the other fifteen are loaded
 * here (r9 is advanced by 256 early, so later loads use negative
 * displacements).  r10 counts elements in steps of 64 and the loop repeats
 * (via .L65) while r8 > r10.
 * xvabssp takes absolute values; each xvcmpgtsp builds a lane mask that one
 * xxsel uses to keep the smaller value and another xxsel uses to pick the
 * matching index constant (vs35/vs49 or vs51/vs34) or previously selected
 * index vector.
 */
lxv 0,16(9)
/* v29 = index offset (v18) + v31 (32 per word). */
vadduwm 29,18,31
lxv 12,240(9)
addi 10,10,64
addi 9,9,256
cmpd 7,8,10
xvabssp 5,0
lxv 0,-224(9)
xvabssp 12,12
xvabssp 32,0
lxv 0,-208(9)
xvcmpgtsp 42,40,5
xvabssp 9,0
lxv 0,-192(9)
xxsel 5,40,5,42
xvabssp 44,0
lxv 0,-176(9)
xvcmpgtsp 62,32,9
xvabssp 6,0
lxv 0,-160(9)
xxsel 9,32,9,62
xxsel 32,35,49,42
xvabssp 1,0
lxv 0,-144(9)
xxsel 62,51,34,62
xvcmpgtsp 42,5,9
xvcmpgtsp 37,44,6
xvabssp 11,0
lxv 0,-128(9)
xxsel 9,5,9,42
xxsel 42,32,62,42
xxsel 6,44,6,37
xxsel 37,35,49,37
xvabssp 13,0
lxv 0,-112(9)
xvcmpgtsp 36,1,11
xvabssp 7,0
lxv 0,-96(9)
xxsel 11,1,11,36
xxsel 36,51,34,36
xvabssp 2,0
lxv 0,-80(9)
xvcmpgtsp 45,6,11
xvcmpgtsp 39,13,7
xvabssp 10,0
lxv 0,-64(9)
xxsel 7,13,7,39
xxsel 39,35,49,39
xvabssp 3,0
lxv 0,-48(9)
xvcmpgtsp 38,2,10
xvabssp 8,0
lxv 0,-32(9)
xxsel 10,2,10,38
xxsel 38,51,34,38
xvabssp 0,0
xvcmpgtsp 43,7,10
xvcmpgtsp 41,3,8
xvcmpgtsp 33,0,12
xxsel 8,3,8,41
xxsel 41,35,49,41
xxsel 0,0,12,33
xxsel 40,51,34,33
xxsel 12,6,11,45
xxsel 11,7,10,43
xvcmpgtsp 33,8,0
xxsel 45,37,36,45
xvcmpgtsp 32,9,12
xxsel 43,39,38,43
/* v15 holds 16 per word: offset applied to the index vectors v13 and v1. */
vadduwm 13,13,15
xxsel 0,8,0,33
xxsel 33,41,40,33
xxsel 12,9,12,32
xxsel 32,42,45,32
xvcmpgtsp 44,11,0
vadduwm 1,1,15
/* Add the running index offset v18 to v0, then advance v18 by v14 (64). */
vadduwm 0,18,0
vadduwm 18,18,14
xxsel 0,11,0,44
xxsel 33,43,33,44
xvcmpgtsp 45,12,0
vadduwm 1,29,1
xxsel 0,12,0,45
xxsel 32,32,33,45
/*
 * Merge with the running result: mask = running minimum (vs4) > candidate
 * (vs0); vs48 (indices) and vs4 (values) are updated under the mask.
 */
xvcmpgtsp 33,4,0
xxsel 48,48,32,33
xxsel 4,4,0,33
bgt 7,.L65
/*
 * Horizontal reduction after the vector loop.  The four words of the running
 * minimum (vs4) are rotated out and converted to scalars in f0, f11, f12/f10
 * and f4; the four words of the index vector v16 are extracted with
 * vextuwrx at byte offsets 0, 4, 8 and 12 and zero-extended by rldicl.
 */
xxsldwi 0,4,4,3
xxsldwi 11,4,4,2
li 9,0
li 10,12
xxsldwi 12,4,4,1
xscvspdp 4,4
vextuwrx 3,9,16
li 9,4
xscvspdp 0,0
xscvspdp 11,11
xscvspdp 12,12
vextuwrx 6,9,16
li 9,8
vextuwrx 7,9,16
vextuwrx 9,10,16
rldicl 31,6,0,32
rldicl 10,3,0,32
rldicl 5,7,0,32
rldicl 0,9,0,32
/* First pair: f0 vs f11.  Equal values go to .L66 (smaller index wins). */
fcmpu 7,0,11
fmr 10,12
beq 7,.L66
bng 7,.L9
mr 10,31
fmr 0,11
.L9:
/* Second pair: f12 vs f4; result value in f10, index in r5. */
fcmpu 7,12,4
bne 7,.L12
cmplw 7,7,9
ble 7,.L13
mr 7,9
.L13:
rldicl 5,7,0,32
.L14:
/* Final: f0/r10 vs f10/r5; ties handled at .L67. */
fcmpu 7,0,10
beq 7,.L67
bng 7,.L19
mr 10,5
fmr 0,10
.L19:
/*
 * Restore r31, then either continue with the scalar loop at .L4 for the
 * elements past r8 (count > r8) or go to the epilogue at .L33.
 */
cmpd 7,11,8
ld 31,-8(1)
bgt 7,.L4
b .L33
.p2align 4,,15
.L66:
/* Equal values in the first pair: keep the smaller of the indices r3, r6. */
cmplw 7,3,6
ble 7,.L8
mr 3,6
.L8:
rldicl 10,3,0,32
b .L9
.p2align 4,,15
.L40:
/*
 * Strided path with fewer than four elements (reached from the entry code):
 * r0 = stride * 4, best index and position 0, then the loop at .L23.
 */
sldi 0,5,2
li 10,0
li 9,0
b .L23
.p2align 4,,15
.L12:
/* Second pair differs: take f4 and index r0 only when f12 > f4. */
bng 7,.L14
mr 5,0
fmr 10,4
b .L14
.p2align 4,,15
.L67:
/* Equal values in the final compare: keep the smaller index. */
cmpd 7,10,5
ble 7,.L19
mr 10,5
b .L19
.L51:
/* Out-of-line trip-count fixups: set ctr = 1 and rejoin the loop. */
li 9,1
mtctr 9
b .L22
.L52:
li 8,1
mtctr 8
b .L35
.long 0
.byte 0,0,0,0,0,1,0,0
.size isamin_k,.-isamin_k
/*
 * 16-byte constants used by the vector path of isamin_k: the indices 0..15,
 * four per vector, loaded by the setup code into vs48 (also copied to vs35),
 * vs49, vs51 and vs34.
 */
.section .rodata.cst16,"aM",@progbits,16
.align 4
.LC2:
.long 0
.long 1
.long 2
.long 3
.LC3:
.long 4
.long 5
.long 6
.long 7
.LC4:
.long 8
.long 9
.long 10
.long 11
.LC5:
.long 12
.long 13
.long 14
.long 15
.ident "GCC: (SUSE Linux) 7.3.1 20180323 [gcc-7-branch revision 258812]"
.section .note.GNU-stack,"",@progbits

31
param.h
View File

@ -2636,15 +2636,30 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ZGEMM_DEFAULT_UNROLL_M 4
#define ZGEMM_DEFAULT_UNROLL_N 4
#define SGEMM_DEFAULT_P 512
#define DGEMM_DEFAULT_P 256
#define CGEMM_DEFAULT_P 256
#define ZGEMM_DEFAULT_P 128
/*FIXME: this should be using the cache size, but there is currently no easy way to
query that on ARM. So if getarch counted more than 8 cores we simply assume the host
is a big desktop or server with abundant cache rather than a phone or embedded device */
#if NUM_CORES > 8
#define SGEMM_DEFAULT_P 512
#define DGEMM_DEFAULT_P 256
#define CGEMM_DEFAULT_P 256
#define ZGEMM_DEFAULT_P 128
#define SGEMM_DEFAULT_Q 1024
#define DGEMM_DEFAULT_Q 512
#define CGEMM_DEFAULT_Q 512
#define ZGEMM_DEFAULT_Q 512
#define SGEMM_DEFAULT_Q 1024
#define DGEMM_DEFAULT_Q 512
#define CGEMM_DEFAULT_Q 512
#define ZGEMM_DEFAULT_Q 512
#else
#define SGEMM_DEFAULT_P 128
#define DGEMM_DEFAULT_P 160
#define CGEMM_DEFAULT_P 128
#define ZGEMM_DEFAULT_P 128
#define SGEMM_DEFAULT_Q 352
#define DGEMM_DEFAULT_Q 128
#define CGEMM_DEFAULT_Q 224
#define ZGEMM_DEFAULT_Q 112
#endif
#define SGEMM_DEFAULT_R 4096
#define DGEMM_DEFAULT_R 4096