diff --git a/.travis.yml b/.travis.yml index 27ecba6c8..2b1b99b26 100644 --- a/.travis.yml +++ b/.travis.yml @@ -17,7 +17,7 @@ matrix: - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32" script: - set -e - - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE + - make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE - make -C test $COMMON_FLAGS $BTYPE - make -C ctest $COMMON_FLAGS $BTYPE - make -C utest $COMMON_FLAGS $BTYPE @@ -25,14 +25,14 @@ matrix: - TARGET_BOX=LINUX64 - BTYPE="BINARY=64" - # - <<: *test-ubuntu - # os: linux-ppc64le - # before_script: - # - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=POWER8 NUM_THREADS=32" - # env: - # # for matrix annotation only - # - TARGET_BOX=PPC64LE_LINUX - # - BTYPE="BINARY=64 USE_OPENMP=1" + - <<: *test-ubuntu + os: linux-ppc64le + before_script: + - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=POWER8 NUM_THREADS=32" + env: + # for matrix annotation only + - TARGET_BOX=PPC64LE_LINUX + - BTYPE="BINARY=64 USE_OPENMP=1" - <<: *test-ubuntu env: diff --git a/Makefile.system b/Makefile.system index 2cf1322a9..8843d0ad3 100644 --- a/Makefile.system +++ b/Makefile.system @@ -322,12 +322,13 @@ CCOMMON_OPT += -DMS_ABI endif ifeq ($(C_COMPILER), GCC) -#Test for supporting MS_ABI +#Version tests for supporting specific features (MS_ABI, POWER9 intrinsics) GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4) GCCVERSIONGT4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 4) +GCCVERSIONGT5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 5) GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7) ifeq ($(GCCVERSIONGT4), 1) -# GCC Majar version > 4 +# GCC Major version > 4 # It is compatible with MSVC ABI. 
CCOMMON_OPT += -DMS_ABI endif @@ -554,8 +555,17 @@ endif ifeq ($(ARCH), power) DYNAMIC_CORE = POWER6 DYNAMIC_CORE += POWER8 +ifneq ($(C_COMPILER), GCC) DYNAMIC_CORE += POWER9 endif +ifeq ($(C_COMPILER), GCC) +ifeq ($(GCCVERSIONGT5), 1) +DYNAMIC_CORE += POWER9 +else +$(info, OpenBLAS: Your gcc version is too old to build the POWER9 kernels.) +endif +endif +endif # If DYNAMIC_CORE is not set, DYNAMIC_ARCH cannot do anything, so force it to empty ifndef DYNAMIC_CORE diff --git a/cpuid_arm64.c b/cpuid_arm64.c index e8aa29813..9e019fe3e 100644 --- a/cpuid_arm64.c +++ b/cpuid_arm64.c @@ -206,6 +206,33 @@ void get_subdirname(void) printf("arm64"); } +void get_cpucount(void) +{ +int n=0; + +#ifdef linux + FILE *infile; + char buffer[2048], *p,*t; + p = (char *) NULL ; + + infile = fopen("/proc/cpuinfo", "r"); + + while (fgets(buffer, sizeof(buffer), infile)) + { + + if (!strncmp("processor", buffer, 9)) + n++; + } + + fclose(infile); + + printf("#define NUM_CORES %d\n",n); +#endif + +} + + + void get_cpuconfig(void) { @@ -309,6 +336,7 @@ void get_cpuconfig(void) printf("#define DTB_SIZE 4096 \n"); break; } + get_cpucount(); } @@ -351,5 +379,3 @@ void get_features(void) #endif return; } - - diff --git a/driver/others/dynamic_power.c b/driver/others/dynamic_power.c index 0c4a87a5e..1dec5f4b3 100644 --- a/driver/others/dynamic_power.c +++ b/driver/others/dynamic_power.c @@ -3,7 +3,9 @@ extern gotoblas_t gotoblas_POWER6; extern gotoblas_t gotoblas_POWER8; +#if (!defined C_GCC) || (GCC_VERSION >= 60000) extern gotoblas_t gotoblas_POWER9; +#endif extern void openblas_warning(int verbose, const char *msg); @@ -19,7 +21,9 @@ static char *corename[] = { char *gotoblas_corename(void) { if (gotoblas == &gotoblas_POWER6) return corename[1]; if (gotoblas == &gotoblas_POWER8) return corename[2]; +#if (!defined C_GCC) || (GCC_VERSION >= 60000) if (gotoblas == &gotoblas_POWER9) return corename[3]; +#endif return corename[0]; } @@ -29,8 +33,10 @@ static gotoblas_t *get_coretype(void) { 
return &gotoblas_POWER6; if (__builtin_cpu_is("power8")) return &gotoblas_POWER8; +#if (!defined C_GCC) || (GCC_VERSION >= 60000) if (__builtin_cpu_is("power9")) return &gotoblas_POWER9; +#endif return NULL; } @@ -53,7 +59,9 @@ static gotoblas_t *force_coretype(char * coretype) { { case 1: return (&gotoblas_POWER6); case 2: return (&gotoblas_POWER8); +#if (!defined C_GCC) || (GCC_VERSION >= 60000) case 3: return (&gotoblas_POWER9); +#endif default: return NULL; } snprintf(message, 128, "Core not found: %s\n", coretype); diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index f83def47b..7998c135a 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -24,9 +24,11 @@ ifeq ($(TARGET), LOONGSON3B) USE_TRMM = 1 endif -ifeq ($(CORE), GENERIC) +ifneq ($(DYNAMIC_ARCH), 1) +ifeq ($(TARGET), GENERIC) USE_TRMM = 1 endif +endif ifeq ($(CORE), HASWELL) USE_TRMM = 1 diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index 43f004fbb..c08f3fb00 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -89,14 +89,14 @@ ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c #SMINKERNEL = ../arm/min.c #DMINKERNEL = ../arm/min.c # -ISAMAXKERNEL = isamax.c +ISAMAXKERNEL = isamax_power8.S IDAMAXKERNEL = idamax.c -ICAMAXKERNEL = icamax.c +ICAMAXKERNEL = icamax_power8.S IZAMAXKERNEL = izamax.c # -ISAMINKERNEL = isamin.c +ISAMINKERNEL = isamin_power8.S IDAMINKERNEL = idamin.c -ICAMINKERNEL = icamin.c +ICAMINKERNEL = icamin_power8.S IZAMINKERNEL = izamin.c # #ISMAXKERNEL = ../arm/imax.c @@ -112,7 +112,7 @@ ZASUMKERNEL = zasum.c # SAXPYKERNEL = saxpy.c DAXPYKERNEL = daxpy.c -CAXPYKERNEL = caxpy.c +CAXPYKERNEL = caxpy_power8.S ZAXPYKERNEL = zaxpy.c # SCOPYKERNEL = scopy.c diff --git a/kernel/power/KERNEL.POWER9 b/kernel/power/KERNEL.POWER9 index a570a903a..2ed843fff 100644 --- a/kernel/power/KERNEL.POWER9 +++ b/kernel/power/KERNEL.POWER9 @@ -89,14 +89,14 @@ ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c #SMINKERNEL = ../arm/min.c #DMINKERNEL = ../arm/min.c # 
-ISAMAXKERNEL = isamax.c
+ISAMAXKERNEL = isamax_power9.S
 IDAMAXKERNEL = idamax.c
-ICAMAXKERNEL = icamax.c
+ICAMAXKERNEL = icamax_power9.S
 IZAMAXKERNEL = izamax.c
 #
-ISAMINKERNEL = isamin.c
+ISAMINKERNEL = isamin_power9.S
 IDAMINKERNEL = idamin.c
-ICAMINKERNEL = icamin.c
+ICAMINKERNEL = icamin_power9.S
 IZAMINKERNEL = izamin.c
 #
 #ISMAXKERNEL = ../arm/imax.c
@@ -112,7 +112,7 @@ ZASUMKERNEL = zasum.c
 #
 SAXPYKERNEL = saxpy.c
 DAXPYKERNEL = daxpy.c
-CAXPYKERNEL = caxpy.c
+CAXPYKERNEL = caxpy_power9.S
 ZAXPYKERNEL = zaxpy.c
 #
 SCOPYKERNEL = scopy.c
@@ -123,7 +123,7 @@ ZCOPYKERNEL = zcopy.c
 SDOTKERNEL = sdot.c
 DDOTKERNEL = ddot.c
 DSDOTKERNEL = sdot.c
-CDOTKERNEL = cdot.c
+CDOTKERNEL = cdot_power9.S
 ZDOTKERNEL = zdot.c
 #
 SNRM2KERNEL = ../arm/nrm2.c
diff --git a/kernel/power/caxpy_power8.S b/kernel/power/caxpy_power8.S
new file mode 100644
index 000000000..09a423571
--- /dev/null
+++ b/kernel/power/caxpy_power8.S
@@ -0,0 +1,587 @@
+#define ASSEMBLER
+#include "common.h"
+/*
+ * Single-precision complex AXPY kernel (caxpy_k) for POWER8.  The body
+ * appears to be compiler output for caxpy.c (see the commented-out
+ * directives below) with CONJ switches added; building with CONJ defined
+ * negates the imaginary part of x, i.e. y += alpha * conj(x).
+ */
+/*
+	.file	"caxpy.c"
+	.abiversion 2
+	.section	".text"
+	.align 2
+	.p2align 4,,15
+	.globl caxpy_k
+	.type	caxpy_k, @function
+*/
+
+	PROLOGUE
+
+caxpy_k:
+.LCF0:
+0:	addis 2,12,.TOC.-.LCF0@ha
+	addi 2,2,.TOC.-.LCF0@l
+	.localentry	caxpy_k,.-caxpy_k
+	mr. 7,3
+	ble 0,.L33
+	cmpdi 7,9,1
+	beq 7,.L41
+.L3:
+	mtctr 7
+	ld 7,96(1)
+	sldi 9,9,3
+	sldi 7,7,3
+	.p2align 4,,15
+.L14:
+	lfs 10,4(8)
+	lfs 11,0(8)
+	lfs 12,0(10)
+	lfs 0,4(10)
+	fmuls 10,2,10
+	/* real part: xr*ar + xi*ai under CONJ, xr*ar - xi*ai otherwise */
+#ifdef CONJ
+	fmadds 11,11,1,10
+#else
+	fmsubs 11,11,1,10
+#endif
+	fadds 12,12,11
+	stfs 12,0(10)
+	lfs 11,0(8)
+	lfs 12,4(8)
+	add 8,8,9
+	fmuls 11,2,11
+#ifdef CONJ
+	fmsubs 12,12,1,11
+	fsubs 0,0,12
+#else
+	fmadds 12,12,1,11
+	fadds 0,0,12
+#endif
+	stfs 0,4(10)
+	add 10,10,7
+	bdnz .L14
+.L33:
+	li 3,0
+	blr
+	.p2align 4,,15
+.L41:
+	ld 6,96(1)
+	cmpdi 7,6,1
+	bne 7,.L3
+	rldicr. 4,7,0,59
+	std 31,-8(1)
+	li 11,0
+	bne 0,.L42
+.L4:
+	addi 6,11,8
+	subf 0,4,7
+	sldi 6,6,2
+	addi 9,6,-32
+	add 5,10,6
+	add 3,8,9
+	add 6,8,6
+	subfc 5,5,3
+	add 9,10,9
+	subfe 5,5,5
+	subfc 6,6,9
+	subfe 31,31,31
+	addi 6,5,1
+	addi 5,31,1
+	or 6,6,5
+	rlwinm 6,6,0,0xff
+	cmpwi 7,6,0
+	beq 7,.L7
+	sradi 6,4,63
+	srdi 5,7,63
+	subfc 31,7,4
+	adde 6,5,6
+	subfic 31,0,3
+	subfe 31,31,31
+	xori 6,6,0x1
+	neg 31,31
+	and 6,6,31
+	rlwinm 6,6,0,0xff
+	cmpwi 7,6,0
+	beq 7,.L7
+	cmpd 7,4,7
+	li 6,1
+	blt 7,.L43
+.L9:
+	addi 0,7,-1
+	subf 0,4,0
+	subfic 0,0,3
+	subfe 31,31,31
+	addi 0,31,1
+	rlwinm 0,0,0,0xff
+	cmpwi 7,0,0
+	bne 7,.L10
+	sradi 0,4,63
+	subfc 31,7,4
+	adde 5,5,0
+	rlwinm 5,5,0,0xff
+	cmpwi 7,5,0
+	bne 7,.L10
+	addi 0,6,-1
+	addis 31,2,.LC3@toc@ha
+	std 30,-16(1)
+	xscvdpspn 12,1
+	xscvdpspn 11,2
+	srdi. 30,0,2
+	addis 6,2,.LC2@toc@ha
+	addi 6,6,.LC2@toc@l
+	mtctr 30
+	addi 31,31,.LC3@toc@l
+	lxvd2x 42,0,6
+	li 5,16
+	li 6,0
+	lxvd2x 41,0,31
+	xxspltw 12,12,0
+	xxspltw 11,11,0
+	xxpermdi 42,42,42,2
+	xxpermdi 41,41,41,2
+	beq 0,.L44
+	.p2align 4,,15
+.L11:
+#ifdef CONJ
+	lxvd2x 44,3,6
+	lxvd2x 45,3,5
+	lxvd2x 33,9,6
+	lxvd2x 0,9,5
+	xxpermdi 44,44,44,2
+	xxpermdi 45,45,45,2
+	xxpermdi 32,33,33,2
+	xxpermdi 33,0,0,2
+	vperm 11,13,12,10
+	vperm 13,13,12,9
+	vperm 12,1,0,10
+	vperm 1,1,0,9
+	xvmulsp 0,11,43
+	xvmulsp 32,11,45
+	xvmsubmsp 45,12,0
+	xvmaddasp 32,12,43
+	xvaddsp 44,32,44
+	xvsubsp 32,33,45
+	vmrglw 1,0,12
+	vmrghw 0,0,12
+#else
+	lxvd2x 45,3,6
+	lxvd2x 33,3,5
+	lxvd2x 43,9,6
+	lxvd2x 0,9,5
+	xxpermdi 45,45,45,2
+	xxpermdi 33,33,33,2
+	xxpermdi 32,43,43,2
+	xxpermdi 43,0,0,2
+	vperm 12,1,13,10
+	vperm 1,1,13,9
+	vperm 13,11,0,10
+	vperm 11,11,0,9
+	xvmulsp 0,11,44
+	xvmulsp 32,11,33
+	xvmaddmsp 33,12,0
+	xvmsubasp 32,12,44
+	xvaddsp 45,32,45
+	xvaddsp 32,33,43
+	vmrglw 1,0,13
+	vmrghw 0,0,13
+#endif
+	xxpermdi 0,33,33,2
+	xxpermdi 32,32,32,2
+	stxvd2x 0,9,6
+	addi 6,6,32
+	stxvd2x 32,9,5
+	addi 5,5,32
+	bdnz .L11
+	rldicr 0,0,0,61
+	ld 30,-16(1)
+	sldi 9,0,1
+	add 4,4,0
+	add 11,11,9
+.L10:
+	sldi 6,11,2
+	addi 9,4,1
+	addi 5,6,4
+	cmpd 7,7,9
+	lfsx 12,8,6
+	lfsx 0,10,6
+	addi 9,11,2
+	lfsx 11,8,5
+	fmuls 11,2,11
+#ifdef CONJ
+	fmadds 12,12,1,11
+#else
+	fmsubs 12,12,1,11
+#endif
+	fadds 0,0,12
+	stfsx 0,10,6
+	lfsx 11,8,6
+	lfsx 12,8,5
+	lfsx 0,10,5
+	fmuls 11,2,11
+#ifdef CONJ
+	fmsubs 12,12,1,11
+	fsubs 0,0,12
+#else
+	fmadds 12,12,1,11
+	fadds 0,0,12
+#endif
+	stfsx 0,10,5
+	ble 7,.L39
+	sldi 9,9,2
+	addi 6,4,2
+	addi 5,9,4
+	cmpd 7,7,6
+	lfsx 12,8,9
+	lfsx 0,10,9
+	addi 6,11,4
+	lfsx 11,8,5
+	fmuls 11,2,11
+#ifdef CONJ
+	fmadds 12,1,12,11
+#else
+	fmsubs 12,1,12,11
+#endif
+	fadds 0,0,12
+	stfsx 0,10,9
+	lfsx 11,8,9
+	lfsx 12,8,5
+	lfsx 0,10,5
+	fmuls 11,2,11
+	/* imag part: CONJ subtracts (ar*xi - ai*xr), plain build adds (ar*xi + ai*xr) */
+#ifdef CONJ
+	fmsubs 12,1,12,11
+	fsubs 0,0,12
+#else
+	fmadds 12,1,12,11
+	fadds 0,0,12
+#endif
+	stfsx 0,10,5
+	ble 7,.L39
+	sldi 6,6,2
+	addi 4,4,3
+	addi 5,6,4
+	cmpd 7,7,4
+	lfsx 12,8,6
+	lfsx 0,10,6
+	addi 9,11,6
+	lfsx 11,8,5
+	fmuls 11,2,11
+#ifdef CONJ
+	fmadds 12,1,12,11
+#else
+	fmsubs 12,1,12,11
+#endif
+	fadds 0,0,12
+	stfsx 0,10,6
+	lfsx 11,8,6
+	lfsx 12,8,5
+	lfsx 0,10,5
+	fmuls 11,2,11
+#ifdef CONJ
+	fmsubs 12,1,12,11
+	fsubs 0,0,12
+#else
+	fmadds 12,1,12,11
+	fadds 0,0,12
+#endif
+	stfsx 0,10,5
+	ble 7,.L39
+	sldi 9,9,2
+	ld 31,-8(1)
+	addi 7,9,4
+	lfsx 12,8,9
+	lfsx 0,10,9
+	lfsx 11,8,7
+	fmuls 11,2,11
+#ifdef CONJ
+	fmadds 12,1,12,11
+#else
+	fmsubs 12,1,12,11
+#endif
+	fadds 0,0,12
+	stfsx 0,10,9
+	lfsx 11,8,9
+	lfsx 12,8,7
+	lfsx 0,10,7
+	fmuls 2,2,11
+#ifdef CONJ
+	fmsubs 1,1,12,2
+	fsubs 1,0,1
+#else
+	fmadds 1,1,12,2
+	fadds 1,0,1
+#endif
+	stfsx 1,10,7
+	b .L33
+.L43:
+	mr 6,0
+	b .L9
+.L7:
+	addi 10,4,1
+	cmpd 7,10,7
+	subf 10,4,7
+	mtctr 10
+	bgt 7,.L26
+	li 10,-1
+	rldicr 10,10,0,0
+	cmpd 7,7,10
+	beq 7,.L26
+	.p2align 4,,15
+.L13:
+	lfs 10,4(3)
+	lfs 11,0(3)
+	addi 9,9,8
+	addi 3,3,8
+	lfs 12,-8(9)
+	lfs 0,-4(9)
+	fmuls 10,2,10
+#ifdef CONJ
+	fmadds 11,1,11,10
+#else
+	fmsubs 11,1,11,10
+#endif
+	fadds 12,12,11
+	stfs 12,-8(9)
+	lfs 11,-8(3)
+	lfs 12,-4(3)
+	fmuls 11,2,11
+#ifdef CONJ + fmsubs 12,1,12,11 + fsubs 0,0,12 +#else + fmadds 12,1,12,11 + fadds 0,0,12 +#endif + stfs 0,-4(9) + bdnz .L13 +.L39: + ld 31,-8(1) + b .L33 +.L42: +#ifdef CONJ + fneg 0,1 + xxpermdi 32,1,1,0 + addis 9,2,.LANCHOR0@toc@ha + std 28,-32(1) + sradi. 28,4,1 + addi 9,9,.LANCHOR0@toc@l + xscvdpspn 5,2 + xvcvdpsp 32,32 + lxvd2x 12,0,9 + xxpermdi 39,0,0,0 + xxspltw 5,5,0 + xvcvdpsp 39,39 +#else + fneg 0,2 + xxpermdi 39,2,2,0 + addis 9,2,.LANCHOR0@toc@ha + std 28,-32(1) + sradi. 28,4,1 + addi 9,9,.LANCHOR0@toc@l + xscvdpspn 5,1 + xvcvdpsp 39,39 + lxvd2x 12,0,9 + xxpermdi 32,0,0,0 + xxspltw 5,5,0 + xvcvdpsp 32,32 +#endif + xxpermdi 12,12,12,2 + vmrgew 7,7,0 + beq 0,.L5 + xxlnor 38,12,12 + std 29,-24(1) + std 30,-16(1) + mr 6,8 + mr 9,10 + li 29,0 + li 30,16 + li 31,32 + li 12,48 + li 0,64 + li 11,80 + li 3,96 + li 5,112 + .p2align 4,,15 +.L6: + lxvd2x 6,0,9 + lxvd2x 40,0,6 + addi 29,29,8 + lxvd2x 41,6,30 + lxvd2x 42,6,31 + cmpd 7,28,29 + lxvd2x 43,6,12 + lxvd2x 44,6,0 + lxvd2x 45,6,11 + lxvd2x 33,6,3 + lxvd2x 32,6,5 + lxvd2x 7,9,30 + addi 6,6,128 + lxvd2x 8,9,31 + lxvd2x 9,9,12 + xxpermdi 40,40,40,2 + xxpermdi 6,6,6,2 + lxvd2x 10,9,0 + lxvd2x 11,9,11 + xxpermdi 41,41,41,2 + xxpermdi 42,42,42,2 + lxvd2x 12,9,3 + lxvd2x 0,9,5 + xxpermdi 43,43,43,2 + xxpermdi 44,44,44,2 + xxpermdi 45,45,45,2 + xxpermdi 33,33,33,2 + xxpermdi 32,32,32,2 + xxpermdi 7,7,7,2 + xxpermdi 8,8,8,2 + xxpermdi 9,9,9,2 + xxpermdi 10,10,10,2 + xxpermdi 11,11,11,2 + xxpermdi 12,12,12,2 + xxpermdi 0,0,0,2 +#ifndef CONJ + xvmaddasp 6,5,40 + xvmaddasp 7,5,41 + xvmaddasp 8,5,42 + xvmaddasp 9,5,43 + xvmaddasp 10,5,44 + xvmaddasp 11,5,45 + xvmaddasp 12,5,33 + xvmaddasp 0,5,32 + vperm 8,8,8,6 + vperm 9,9,9,6 + vperm 10,10,10,6 + vperm 11,11,11,6 + vperm 12,12,12,6 + vperm 13,13,13,6 + vperm 1,1,1,6 + vperm 0,0,0,6 +#endif + xvmaddasp 6,39,40 + xvmaddasp 7,39,41 + xvmaddasp 8,39,42 + xvmaddasp 9,39,43 + xvmaddasp 10,39,44 + xvmaddasp 11,39,45 + xvmaddasp 12,39,33 + xvmaddasp 0,39,32 +#ifdef CONJ + vperm 
8,8,8,6 + vperm 9,9,9,6 + vperm 10,10,10,6 + vperm 11,11,11,6 + vperm 12,12,12,6 + vperm 13,13,13,6 + vperm 1,1,1,6 + vperm 0,0,0,6 + xvmaddasp 6,5,40 + xvmaddasp 7,5,41 + xvmaddasp 8,5,42 + xvmaddasp 9,5,43 + xvmaddasp 10,5,44 + xvmaddasp 11,5,45 + xvmaddasp 12,5,33 + xvmaddasp 0,5,32 +#endif + xxpermdi 6,6,6,2 + xxpermdi 7,7,7,2 + xxpermdi 8,8,8,2 + xxpermdi 9,9,9,2 + stxvd2x 6,0,9 + xxpermdi 10,10,10,2 + stxvd2x 7,9,30 + xxpermdi 11,11,11,2 + stxvd2x 8,9,31 + xxpermdi 12,12,12,2 + stxvd2x 9,9,12 + xxpermdi 0,0,0,2 + stxvd2x 10,9,0 + stxvd2x 11,9,11 + stxvd2x 12,9,3 + stxvd2x 0,9,5 + addi 9,9,128 + bgt 7,.L6 + ld 29,-24(1) + ld 30,-16(1) +.L5: + cmpd 7,7,4 + ble 7,.L36 + sldi 11,4,1 + ld 28,-32(1) + b .L4 +.L36: + ld 28,-32(1) + ld 31,-8(1) + b .L33 +.L44: + li 31,1 + mtctr 31 + b .L11 +.L26: + li 10,1 + mtctr 10 + b .L13 + .long 0 + .byte 0,0,0,0,0,4,0,0 + .size caxpy_k,.-caxpy_k + .section .rodata + .align 4 + .set .LANCHOR0,. + 0 + .type swap_mask_arr, @object + .size swap_mask_arr, 16 +swap_mask_arr: + .byte 4 + .byte 5 + .byte 6 + .byte 7 + .byte 0 + .byte 1 + .byte 2 + .byte 3 + .byte 12 + .byte 13 + .byte 14 + .byte 15 + .byte 8 + .byte 9 + .byte 10 + .byte 11 + .section .rodata.cst16,"aM",@progbits,16 + .align 4 +.LC2: + .byte 31 + .byte 30 + .byte 29 + .byte 28 + .byte 23 + .byte 22 + .byte 21 + .byte 20 + .byte 15 + .byte 14 + .byte 13 + .byte 12 + .byte 7 + .byte 6 + .byte 5 + .byte 4 +.LC3: + .byte 27 + .byte 26 + .byte 25 + .byte 24 + .byte 19 + .byte 18 + .byte 17 + .byte 16 + .byte 11 + .byte 10 + .byte 9 + .byte 8 + .byte 3 + .byte 2 + .byte 1 + .byte 0 + .ident "GCC: (SUSE Linux) 7.3.1 20180323 [gcc-7-branch revision 258812]" + .gnu_attribute 4, 1 + .section .note.GNU-stack,"",@progbits diff --git a/kernel/power/caxpy_power9.S b/kernel/power/caxpy_power9.S new file mode 100644 index 000000000..48e6e5ba3 --- /dev/null +++ b/kernel/power/caxpy_power9.S @@ -0,0 +1,538 @@ +#define ASSEMBLER +#include "common.h" + +/* + .file "caxpy.c" + .abiversion 2 
+ .section ".text" + .align 2 + .p2align 4,,15 + .globl caxpy_k + .type caxpy_k, @function +*/ + + PROLOGUE + +caxpy_k: +.LCF0: +0: addis 2,12,.TOC.-.LCF0@ha + addi 2,2,.TOC.-.LCF0@l + .localentry caxpy_k,.-caxpy_k + mr. 7,3 + ble 0,.L33 + cmpdi 7,9,1 + beq 7,.L37 +.L3: + mtctr 7 + ld 7,96(1) + sldi 9,9,3 + sldi 7,7,3 + .p2align 4,,15 +.L14: + lfs 10,4(8) + lfs 11,0(8) + lfs 12,0(10) + lfs 0,4(10) + fmuls 10,2,10 +#ifdef CONJ + fmadds 11,11,1,10 +#else + fmsubs 11,11,1,10 +#endif + fadds 12,12,11 + stfs 12,0(10) + lfs 11,0(8) + lfs 12,4(8) + add 8,8,9 + fmuls 11,2,11 +#ifdef CONJ + fmsubs 12,12,1,11 + fsubs 0,0,12 +#else + fmadds 12,12,1,11 + fadds 0,0,12 +#endif + stfs 0,4(10) + add 10,10,7 + bdnz .L14 +.L33: + li 3,0 + blr + .p2align 4,,15 +.L37: + ld 6,96(1) + cmpdi 7,6,1 + bne 7,.L3 + rldicr. 4,7,0,59 + li 11,0 + bne 0,.L38 +.L4: + addi 6,11,8 + subf 0,4,7 + sldi 6,6,2 + addi 9,6,-32 + add 5,10,6 + add 6,8,6 + add 3,8,9 + add 9,10,9 + subfc 5,5,3 + subfe 5,5,5 + subfc 6,6,9 + subfe 12,12,12 + addi 6,5,1 + addi 5,12,1 + or 6,6,5 + rlwinm 6,6,0,0xff + cmpwi 7,6,0 + beq 7,.L7 + sradi 6,4,63 + srdi 5,7,63 + subfc 12,7,4 + adde 6,5,6 + subfic 12,0,4 + subfe 12,12,12 + xori 6,6,0x1 + neg 12,12 + and 6,6,12 + rlwinm 6,6,0,0xff + cmpwi 7,6,0 + beq 7,.L7 + cmpd 7,4,7 + li 6,1 + blt 7,.L39 +.L9: + addi 0,7,-1 + subf 0,4,0 + subfic 0,0,3 + subfe 12,12,12 + addi 0,12,1 + rlwinm 0,0,0,0xff + cmpwi 7,0,0 + bne 7,.L10 + sradi 0,4,63 + subfc 12,7,4 + adde 5,5,0 + rlwinm 5,5,0,0xff + cmpwi 7,5,0 + bne 7,.L10 + xscvdpspn 0,1 + xscvdpspn 12,2 + addi 0,6,-1 + std 31,-8(1) + addis 12,2,.LC2@toc@ha + addis 6,2,.LC3@toc@ha + li 5,16 + srdi. 
31,0,2 + addi 6,6,.LC3@toc@l + addi 12,12,.LC2@toc@l + mtctr 31 + lxv 41,0(6) + lxv 42,0(12) + li 6,0 + xxspltw 0,0,0 + xxspltw 12,12,0 + beq 0,.L40 + .p2align 4,,15 +.L11: +#ifdef CONJ + lxvx 33,3,5 + lxvx 44,3,6 + lxvx 43,9,6 + lxvx 32,9,5 + vperm 13,1,12,10 + vperm 12,1,12,9 + vperm 8,0,11,10 + vperm 0,0,11,9 + xvmulsp 33,12,44 + xvmulsp 11,12,45 + xvmaddasp 33,0,45 + xvmsubmsp 44,0,11 + xvaddsp 33,33,40 + xvsubsp 32,32,44 +#else + lxvx 33,3,6 + lxvx 32,3,5 + lxvx 43,9,6 + lxvx 44,9,5 + vperm 13,0,1,10 + vperm 0,0,1,9 + vperm 8,12,11,10 + vperm 12,12,11,9 + xvmulsp 33,12,32 + xvmulsp 11,12,45 + xvmsubasp 33,0,45 + xvmaddmsp 32,0,11 + xvaddsp 33,33,40 + xvaddsp 32,32,44 +#endif + vmrglw 13,0,1 + vmrghw 0,0,1 + stxvx 45,9,6 + stxvx 32,9,5 + addi 6,6,32 + addi 5,5,32 + bdnz .L11 + rldicr 0,0,0,61 + ld 31,-8(1) + sldi 9,0,1 + add 4,4,0 + add 11,11,9 +.L10: + sldi 5,11,2 + addi 6,4,1 + addi 9,11,2 + addi 3,5,4 + lfsx 12,8,5 + cmpd 7,7,6 + lfsx 0,10,5 + lfsx 11,8,3 + fmuls 11,2,11 +#ifdef CONJ + fmadds 12,12,1,11 +#else + fmsubs 12,12,1,11 +#endif + fadds 0,0,12 + stfsx 0,10,5 + lfsx 11,8,5 + lfsx 12,8,3 + lfsx 0,10,3 + fmuls 11,2,11 +#ifdef CONJ + fmsubs 12,12,1,11 + fsubs 0,0,12 +#else + fmadds 12,12,1,11 + fadds 0,0,12 +#endif + stfsx 0,10,3 + ble 7,.L33 + sldi 9,9,2 + addi 5,4,2 + addi 6,11,4 + addi 3,9,4 + lfsx 12,8,9 + cmpd 7,7,5 + lfsx 0,10,9 + lfsx 11,8,3 + fmuls 11,2,11 +#ifdef CONJ + fmadds 12,1,12,11 +#else + fmsubs 12,1,12,11 +#endif + fadds 0,0,12 + stfsx 0,10,9 + lfsx 11,8,9 + lfsx 12,8,3 + lfsx 0,10,3 + fmuls 11,2,11 +#ifdef CONJ + fmsubs 12,1,12,11 + fsubs 0,0,12 +#else + fmadds 12,1,12,11 + fadds 0,0,12 +#endif + stfsx 0,10,3 + ble 7,.L33 + sldi 6,6,2 + addi 4,4,3 + addi 9,11,6 + addi 5,6,4 + lfsx 12,8,6 + cmpd 7,7,4 + lfsx 0,10,6 + lfsx 11,8,5 + fmuls 11,2,11 +#ifdef CONJ + fmadds 12,1,12,11 +#else + fmsubs 12,1,12,11 +#endif + fadds 0,0,12 + stfsx 0,10,6 + lfsx 11,8,6 + lfsx 12,8,5 + lfsx 0,10,5 + fmuls 11,2,11 +#ifdef CONJ + fmsubs 12,1,12,11 + 
fsubs 0,0,12 +#else + fmadds 12,1,12,11 + fadds 0,0,12 +#endif + stfsx 0,10,5 + ble 7,.L33 + sldi 9,9,2 + addi 7,9,4 + lfsx 12,8,9 + lfsx 0,10,9 + lfsx 11,8,7 + fmuls 11,2,11 +#ifdef CONJ + fmadds 12,1,12,11 +#else + fmsubs 12,1,12,11 +#endif + fadds 0,0,12 + stfsx 0,10,9 + lfsx 11,8,9 + lfsx 12,8,7 + lfsx 0,10,7 + fmuls 2,2,11 +#ifdef CONJ + fmsubs 1,1,12,2 + fsubs 1,0,1 +#else + fmadds 1,1,12,2 + fadds 1,0,1 +#endif + stfsx 1,10,7 + b .L33 +.L39: + mr 6,0 + b .L9 +.L38: +#ifdef CONJ + fneg 0,1 + xxpermdi 45,1,1,0 + xscvdpspn 12,2 + addis 9,2,.LANCHOR0@toc@ha + sradi. 3,4,1 + xxpermdi 44,0,0,0 + addi 9,9,.LANCHOR0@toc@l + xvcvdpsp 45,45 + lxv 33,0(9) + xvcvdpsp 32,44 + xxspltw 12,12,0 +#else + fneg 12,2 + xxpermdi 32,2,2,0 + xscvdpspn 0,1 + addis 9,2,.LANCHOR0@toc@ha + sradi. 3,4,1 + xxpermdi 45,12,12,0 + addi 9,9,.LANCHOR0@toc@l + xvcvdpsp 32,32 + lxv 33,0(9) + xvcvdpsp 45,45 + xxspltw 0,0,0 +#endif + vmrgew 0,0,13 + beq 0,.L5 + mr 6,8 + mr 9,10 + li 5,0 + .p2align 4,,15 +.L6: + lxv 38,16(6) + lxv 11,16(9) + addi 5,5,8 + addi 6,6,128 + addi 9,9,128 + lxv 39,-96(6) + lxv 40,-80(6) + lxv 41,-64(6) + lxv 42,-48(6) + cmpd 7,3,5 + lxv 43,-32(6) + lxv 45,-128(6) + lxv 44,-16(6) +#ifdef CONJ + lxv 0,-128(9) + vpermr 17,6,6,1 + xvmaddmsp 38,32,11 + lxv 11,-96(9) + vpermr 18,7,7,1 + vpermr 19,8,8,1 + vpermr 2,9,9,1 + vpermr 3,10,10,1 + vpermr 4,11,11,1 + xvmaddasp 0,32,45 + vpermr 5,12,12,1 + xvmaddmsp 39,32,11 + lxv 11,-80(9) + vpermr 13,13,13,1 + xvmaddasp 38,12,49 + xvmaddmsp 40,32,11 + lxv 11,-64(9) + xvmaddmsp 45,12,0 + xvmaddasp 39,12,50 + stxv 38,-112(9) + xvmaddmsp 41,32,11 + lxv 11,-48(9) + xvmaddasp 40,12,51 + stxv 45,-128(9) + stxv 39,-96(9) + xvmaddmsp 42,32,11 + lxv 11,-32(9) + xvmaddasp 41,12,34 + stxv 40,-80(9) + xvmaddmsp 43,32,11 + lxv 11,-16(9) + xvmaddasp 42,12,35 + stxv 41,-64(9) + xvmaddmsp 44,32,11 + xvmaddasp 43,12,36 + stxv 42,-48(9) + xvmaddasp 44,12,37 +#else + lxv 12,-128(9) + vpermr 17,6,6,1 + xvmaddmsp 38,0,11 + lxv 11,-96(9) + vpermr 18,7,7,1 
+ vpermr 19,8,8,1 + vpermr 2,9,9,1 + vpermr 3,10,10,1 + vpermr 4,11,11,1 + xvmaddasp 12,0,45 + vpermr 5,12,12,1 + xvmaddmsp 39,0,11 + lxv 11,-80(9) + vpermr 13,13,13,1 + xvmaddasp 38,32,49 + xvmaddmsp 40,0,11 + lxv 11,-64(9) + xvmaddmsp 45,32,12 + xvmaddasp 39,32,50 + stxv 38,-112(9) + xvmaddmsp 41,0,11 + lxv 11,-48(9) + xvmaddasp 40,32,51 + stxv 45,-128(9) + stxv 39,-96(9) + xvmaddmsp 42,0,11 + lxv 11,-32(9) + xvmaddasp 41,32,34 + stxv 40,-80(9) + xvmaddmsp 43,0,11 + lxv 11,-16(9) + xvmaddasp 42,32,35 + stxv 41,-64(9) + xvmaddmsp 44,0,11 + xvmaddasp 43,32,36 + stxv 42,-48(9) + xvmaddasp 44,32,37 +#endif + stxv 43,-32(9) + stxv 44,-16(9) + bgt 7,.L6 +.L5: + cmpd 7,7,4 + ble 7,.L33 + sldi 11,4,1 + b .L4 +.L7: + addi 10,4,1 + subf 8,4,7 + cmpd 7,10,7 + mtctr 8 + bgt 7,.L26 + li 10,-1 + rldicr 10,10,0,0 + cmpd 7,7,10 + beq 7,.L26 + .p2align 4,,15 +.L13: + lfs 10,4(3) + lfs 11,0(3) + lfs 12,0(9) + lfs 0,4(9) + addi 3,3,8 + addi 9,9,8 + fmuls 10,2,10 +#ifdef CONJ + fmadds 11,1,11,10 +#else + fmsubs 11,1,11,10 +#endif + fadds 12,12,11 + stfs 12,-8(9) + lfs 11,-8(3) + lfs 12,-4(3) + fmuls 11,2,11 +#ifdef CONJ + fmsubs 12,1,12,11 + fsubs 0,0,12 +#else + fmadds 12,1,12,11 + fadds 0,0,12 +#endif + stfs 0,-4(9) + bdnz .L13 + b .L33 +.L40: + li 31,1 + mtctr 31 + b .L11 +.L26: + li 10,1 + mtctr 10 + b .L13 + .long 0 + .byte 0,0,0,0,0,1,0,0 + .size caxpy_k,.-caxpy_k + .section .rodata + .align 4 + .set .LANCHOR0,. 
+ 0
+	.type	swap_mask_arr, @object
+	.size	swap_mask_arr, 16
+swap_mask_arr:
+	.byte 4
+	.byte 5
+	.byte 6
+	.byte 7
+	.byte 0
+	.byte 1
+	.byte 2
+	.byte 3
+	.byte 12
+	.byte 13
+	.byte 14
+	.byte 15
+	.byte 8
+	.byte 9
+	.byte 10
+	.byte 11
+	.section	.rodata.cst16,"aM",@progbits,16
+	.align 4
+.LC2:
+	.byte 31
+	.byte 30
+	.byte 29
+	.byte 28
+	.byte 23
+	.byte 22
+	.byte 21
+	.byte 20
+	.byte 15
+	.byte 14
+	.byte 13
+	.byte 12
+	.byte 7
+	.byte 6
+	.byte 5
+	.byte 4
+.LC3:
+	.byte 27
+	.byte 26
+	.byte 25
+	.byte 24
+	.byte 19
+	.byte 18
+	.byte 17
+	.byte 16
+	.byte 11
+	.byte 10
+	.byte 9
+	.byte 8
+	.byte 3
+	.byte 2
+	.byte 1
+	.byte 0
+	.ident	"GCC: (SUSE Linux) 7.3.1 20180323 [gcc-7-branch revision 258812]"
+	.gnu_attribute 4, 1
+	.section	.note.GNU-stack,"",@progbits
diff --git a/kernel/power/cdot_power9.S b/kernel/power/cdot_power9.S
new file mode 100644
index 000000000..01d194c0c
--- /dev/null
+++ b/kernel/power/cdot_power9.S
@@ -0,0 +1,251 @@
+#define ASSEMBLER
+#include "common.h"
+
+/* Compiler-emitted directives, disabled in favour of PROLOGUE exactly as
+   in caxpy_power9.S and icamax_power8.S:
+	.file	"cdot.c"
+	.abiversion 2
+	.section	".text"
+	.align 2
+	.p2align 4,,15
+	.globl cdot_k
+	.type	cdot_k, @function
+*/
+
+	PROLOGUE
+
+cdot_k:
+.LCF0:
+0:	addis 2,12,.TOC.-.LCF0@ha
+	addi 2,2,.TOC.-.LCF0@l
+	.localentry	cdot_k,.-cdot_k
+	mr. 9,3
+	ble 0,.L10
+	cmpdi 7,5,1
+	beq 7,.L18
+.L3:
+	mtctr 9
+	xxlxor 2,2,2
+	sldi 5,5,3
+	sldi 7,7,3
+#ifdef CONJ
+	fmr 12,2
+#endif
+	fmr 8,2
+#ifndef CONJ
+	fmr 9,2
+#endif
+	fmr 1,2
+	.p2align 4,,15
+.L9:
+#ifdef CONJ
+	lfs 9,0(4)
+	lfs 11,0(6)
+	lfs 10,4(6)
+	lfs 0,4(4)
+	add 6,6,7
+	add 4,4,5
+	fmadds 1,9,11,1
+	fmadds 12,9,10,12
+	fmadds 8,0,10,8
+	fmadds 2,11,0,2
+#else
+	lfs 10,0(4)
+	lfs 12,0(6)
+	lfs 11,4(6)
+	lfs 0,4(4)
+	add 6,6,7
+	add 4,4,5
+	fmadds 1,10,12,1
+	fmadds 8,10,11,8
+	fmadds 9,0,11,9
+	fmadds 2,12,0,2
+#endif
+	bdnz .L9
+.L7:
+#ifdef CONJ
+	fsubs 2,12,2
+	fadds 1,1,8
+#else
+	fadds 2,2,8
+	fsubs 1,1,9
+#endif
+	blr
+	.p2align 4,,15
+.L18:
+	cmpdi 7,7,1
+	bne 7,.L3
+	rldicr.
10,9,0,60 + bne 0,.L19 + xxlxor 2,2,2 + li 8,0 +#ifdef CONJ + fmr 12,2 +#endif + fmr 8,2 +#ifndef CONJ + fmr 9,2 +#endif + fmr 1,2 +.L4: + addi 7,10,1 + sldi 8,8,2 + subf 10,10,9 + cmpd 7,7,9 + mtctr 10 + add 4,4,8 + add 6,6,8 + bgt 7,.L16 + li 10,-1 + rldicr 10,10,0,0 + cmpd 7,9,10 + beq 7,.L16 + .p2align 4,,15 +.L8: +#ifdef CONJ + lfs 9,0(4) + lfs 11,0(6) + lfs 10,4(6) + lfs 0,4(4) + addi 6,6,8 + addi 4,4,8 + fmadds 1,9,11,1 + fmadds 12,9,10,12 + fmadds 8,0,10,8 + fmadds 2,11,0,2 +#else + lfs 10,0(4) + lfs 12,0(6) + lfs 11,4(6) + lfs 0,4(4) + addi 6,6,8 + addi 4,4,8 + fmadds 1,10,12,1 + fmadds 8,10,11,8 + fmadds 9,0,11,9 + fmadds 2,12,0,2 +#endif + bdnz .L8 + b .L7 + .p2align 4,,15 +.L10: + xxlxor 1,1,1 + fmr 2,1 + blr +.L19: + addis 8,2,.LANCHOR0@toc@ha + sradi. 3,10,1 + xxspltib 42,0 + addi 8,8,.LANCHOR0@toc@l + lxv 32,0(8) + beq 0,.L12 + xxlor 6,42,42 + xxlor 4,42,42 + xxlor 0,42,42 + xxlor 7,42,42 + xxlor 5,42,42 + xxlor 3,42,42 + xxlor 12,42,42 + mr 7,4 + mr 8,6 + li 5,0 + .p2align 4,,15 +.L6: + lxv 43,0(8) + lxv 44,16(8) + addi 5,5,4 + addi 8,8,64 + addi 7,7,64 + lxv 45,-32(8) + lxv 33,-16(8) + lxv 8,-64(7) + lxv 9,-48(7) + cmpd 7,3,5 + lxv 10,-32(7) + lxv 11,-16(7) + vpermr 6,11,11,0 + vpermr 7,12,12,0 + vpermr 8,13,13,0 + vpermr 9,1,1,0 + xvmaddasp 12,43,8 + xvmaddasp 3,44,9 + xvmaddasp 0,8,38 + xvmaddasp 4,9,39 + xvmaddasp 6,10,40 + xvmaddasp 5,45,10 + xvmaddasp 42,11,41 + xvmaddasp 7,33,11 + bgt 7,.L6 + xvaddsp 12,12,3 + xvaddsp 0,0,4 + xvaddsp 12,12,5 + xvaddsp 0,0,6 + xvaddsp 12,12,7 + xvaddsp 42,0,42 +.L5: +#ifdef CONJ + xxpermdi 8,12,12,2 + xxpermdi 0,42,42,2 + cmpd 7,9,10 + sldi 8,10,1 + xvaddsp 8,8,12 + xvaddsp 0,0,42 + xxsldwi 1,8,8,3 + xxsldwi 12,0,0,3 + xxsldwi 8,8,8,2 + xxsldwi 0,0,0,2 + xscvspdp 1,1 + xscvspdp 12,12 + xscvspdp 8,8 +#else + xxpermdi 9,12,12,2 + xxpermdi 0,42,42,2 + cmpd 7,9,10 + sldi 8,10,1 + xvaddsp 9,9,12 + xvaddsp 0,0,42 + xxsldwi 1,9,9,3 + xxsldwi 2,0,0,3 + xxsldwi 9,9,9,2 + xxsldwi 0,0,0,2 + xscvspdp 8,2 + xscvspdp 1,1 + 
xscvspdp 9,9 +#endif + xscvspdp 2,0 + bgt 7,.L4 + b .L7 +.L12: + xxlor 12,42,42 + b .L5 +.L16: + li 9,1 + mtctr 9 + b .L8 + .long 0 + .byte 0,0,0,0,0,0,0,0 + .size cdot_k,.-cdot_k + .section .rodata + .align 4 + .set .LANCHOR0,. + 0 + .type swap_mask_arr, @object + .size swap_mask_arr, 16 +swap_mask_arr: + .byte 4 + .byte 5 + .byte 6 + .byte 7 + .byte 0 + .byte 1 + .byte 2 + .byte 3 + .byte 12 + .byte 13 + .byte 14 + .byte 15 + .byte 8 + .byte 9 + .byte 10 + .byte 11 + .ident "GCC: (SUSE Linux) 7.3.1 20180323 [gcc-7-branch revision 258812]" + .section .note.GNU-stack,"",@progbits diff --git a/kernel/power/icamax_power8.S b/kernel/power/icamax_power8.S new file mode 100644 index 000000000..4872aff40 --- /dev/null +++ b/kernel/power/icamax_power8.S @@ -0,0 +1,458 @@ +/* .file "icamax.c" + .abiversion 2 + .section ".text" + .align 2 + .p2align 4,,15 + .globl icamax_k + .type icamax_k, @function +*/ +#define ASSEMBLER +#include "common.h" + + PROLOGUE + +icamax_k: +.LCF0: +0: addis 2,12,.TOC.-.LCF0@ha + addi 2,2,.TOC.-.LCF0@l + .localentry icamax_k,.-icamax_k + mr. 9,3 + ble 0,.L25 + cmpdi 7,5,0 + li 3,0 + blelr 7 + cmpdi 7,5,1 + beq 7,.L54 + lfs 11,0(4) + lfs 0,4(4) + cmpdi 7,9,1 + fabs 11,11 + fabs 0,0 + fadds 11,11,0 + beq 7,.L29 + addi 9,9,-1 + sldi 5,5,3 + mtctr 9 + add 4,4,5 + li 3,0 + li 9,1 + .p2align 4,,15 +.L24: + lfs 0,4(4) + lfs 12,0(4) + add 4,4,5 + fabs 0,0 + fabs 12,12 + fadds 0,0,12 + fcmpu 7,0,11 + bng 7,.L23 + fmr 11,0 + mr 3,9 +.L23: + addi 9,9,1 + bdnz .L24 +.L52: + addi 3,3,1 + blr + .p2align 4,,15 +.L25: + li 3,0 + blr + .p2align 4,,15 +.L54: + rldicr. 
8,9,0,58 + bne 0,.L55 + addi 7,8,1 + li 10,0 + xxlxor 11,11,11 + cmpd 7,7,9 + sldi 10,10,2 + add 4,4,10 + subf 10,8,9 + mtctr 10 + li 3,0 + bgt 7,.L43 + li 10,-1 + rldicr 10,10,0,0 + cmpd 7,9,10 + beq 7,.L43 + .p2align 4,,15 +.L44: + lfs 0,4(4) + lfs 12,0(4) + addi 4,4,8 + fabs 0,0 + fabs 12,12 + fadds 0,0,12 + fcmpu 7,0,11 + bng 7,.L46 + fmr 11,0 + mr 3,8 +.L46: + addi 8,8,1 + bdnz .L44 + b .L52 + .p2align 4,,15 +.L55: + li 0,-144 + std 31,-8(1) + addis 5,2,.LC2@toc@ha + vspltisw 18,0 + vspltisw 19,0 + addis 6,2,.LC3@toc@ha + addi 5,5,.LC2@toc@l + stvx 24,1,0 + li 0,-128 + addi 6,6,.LC3@toc@l + xxlor 49,50,50 + addis 7,2,.LC4@toc@ha + lxvd2x 44,0,5 + addis 10,2,.LC5@toc@ha + stvx 25,1,0 + li 0,-112 + addi 7,7,.LC4@toc@l + lxvd2x 45,0,6 + addis 5,2,.LC6@toc@ha + addis 6,2,.LC7@toc@ha + stvx 26,1,0 + li 0,-96 + addi 10,10,.LC5@toc@l + addi 6,6,.LC7@toc@l + addi 5,5,.LC6@toc@l + stvx 27,1,0 + li 0,-80 + lxvd2x 46,0,10 + xxpermdi 44,44,44,2 + mr 10,4 + lxvd2x 48,0,6 + lxvd2x 47,0,5 + xxpermdi 45,45,45,2 + li 6,0 + stvx 28,1,0 + li 0,-64 + xxlnand 44,44,44 + xxlnand 45,45,45 + stvx 29,1,0 + li 0,-48 + vspltisw 29,8 + vadduwm 29,29,29 + xxpermdi 46,46,46,2 + stvx 30,1,0 + li 0,-32 + xxpermdi 47,47,47,2 + xxpermdi 48,48,48,2 + stvx 31,1,0 + lxvd2x 63,0,7 + addis 7,2,.LC8@toc@ha + addi 7,7,.LC8@toc@l + lxvd2x 62,0,7 + xxpermdi 63,63,63,2 + .p2align 4,,15 +.L5: + addi 3,10,16 + addi 5,10,32 + lxvd2x 34,0,10 + addi 7,10,64 + addi 31,10,48 + addi 12,10,80 + addi 11,10,96 + lxvd2x 36,0,3 + lxvd2x 37,0,5 + addi 3,10,112 + addi 5,10,128 + lxvd2x 38,0,7 + lxvd2x 7,0,31 + addi 7,10,160 + addi 31,10,144 + lxvd2x 33,0,12 + lxvd2x 39,0,11 + addi 12,10,176 + addi 11,10,192 + lxvd2x 8,0,3 + lxvd2x 40,0,5 + xxpermdi 34,34,34,2 + addi 3,10,208 + addi 5,10,224 + lxvd2x 41,0,7 + lxvd2x 9,0,31 + addi 7,10,240 + lxvd2x 10,0,12 + lxvd2x 42,0,11 + xxpermdi 37,37,37,2 + xxpermdi 36,36,36,2 + addi 6,6,32 + lxvd2x 32,0,3 + lxvd2x 43,0,5 + xxpermdi 7,7,7,2 + xxpermdi 38,38,38,2 + cmpd 7,8,6 + 
addi 10,10,256 + lxvd2x 11,0,7 + xxpermdi 39,39,39,2 + xxpermdi 33,33,33,2 + xxpermdi 40,40,40,2 + xxpermdi 8,8,8,2 + xxpermdi 41,41,41,2 + xxpermdi 9,9,9,2 + xxpermdi 10,10,10,2 + xxpermdi 42,42,42,2 + xxpermdi 43,43,43,2 + xxpermdi 32,32,32,2 + xxpermdi 11,11,11,2 + xvabssp 57,37 + xvabssp 58,39 + xvabssp 35,40 + xvabssp 59,41 + xvabssp 34,34 + xvabssp 33,33 + xvabssp 32,32 + xvabssp 60,43 + xvabssp 36,36 + xvabssp 37,7 + xvabssp 38,38 + xvabssp 39,8 + xvabssp 40,9 + xvabssp 41,10 + xvabssp 42,42 + xvabssp 43,11 + vperm 24,4,2,12 + vperm 4,4,2,13 + vperm 2,5,25,12 + vperm 5,5,25,13 + vperm 25,1,6,12 + vperm 6,1,6,13 + vperm 1,7,26,12 + vperm 7,7,26,13 + vperm 26,8,3,12 + vperm 8,8,3,13 + vperm 3,9,27,12 + vperm 9,9,27,13 + vperm 27,0,10,12 + vperm 10,0,10,13 + vperm 0,11,28,12 + vperm 11,11,28,13 + xvaddsp 12,33,39 + xvaddsp 38,57,38 + xvaddsp 0,32,43 + xvaddsp 42,59,42 + xvaddsp 36,56,36 + xvaddsp 37,34,37 + xvaddsp 40,58,40 + xvaddsp 41,35,41 + xvcmpgtsp 32,12,38 + xvcmpgtsp 33,0,42 + xvcmpgtsp 43,37,36 + xvcmpgtsp 39,41,40 + xxsel 12,38,12,32 + xxsel 38,47,48,32 + xxsel 0,42,0,33 + xxsel 42,47,48,33 + xxsel 37,36,37,43 + xxsel 43,63,46,43 + xxsel 41,40,41,39 + xxsel 39,63,46,39 + xvcmpgtsp 32,12,37 + xvcmpgtsp 33,0,41 + xxsel 12,37,12,32 + xxsel 43,43,38,32 + xxsel 0,41,0,33 + xxsel 33,39,42,33 + xvcmpgtsp 32,0,12 + vadduwm 1,1,29 + xxsel 0,12,0,32 + xxsel 32,43,33,32 + xvcmpgtsp 33,0,51 + vadduwm 0,17,0 + vadduwm 17,17,30 + xxsel 50,50,32,33 + xxsel 51,51,0,33 + bgt 7,.L5 + xxsldwi 11,51,51,3 + xxsldwi 12,51,51,2 + vspltw 0,18,3 + xxsldwi 0,51,51,1 + xscvspdp 11,11 + xscvspdp 12,12 + mfvsrwz 6,32 + vspltw 0,18,2 + xscvspdp 0,0 + mfvsrwz 7,50 + mfvsrwz 5,32 + vspltw 0,18,0 + xscvspdp 51,51 + mfvsrwz 10,32 + fcmpu 7,11,12 + rldicl 3,6,0,32 + fmr 10,0 + rldicl 11,7,0,32 + rldicl 31,5,0,32 + rldicl 0,10,0,32 + beq 7,.L56 + bnl 7,.L8 + fmr 11,12 + mr 3,31 +.L8: + xscmpudp 7,0,51 + bne 7,.L11 + cmplw 7,7,10 + ble 7,.L12 + mr 7,10 +.L12: + rldicl 11,7,0,32 +.L13: + 
fcmpu 7,11,10 + beq 7,.L57 + blt 7,.L58 +.L17: + cmpd 7,9,8 + ble 7,.L19 + addi 7,8,1 + sldi 10,8,1 + cmpd 7,7,9 + sldi 10,10,2 + add 4,4,10 + subf 10,8,9 + mtctr 10 + bgt 7,.L37 + li 10,-1 + rldicr 10,10,0,0 + cmpd 7,9,10 + beq 7,.L37 + .p2align 4,,15 +.L21: + lfs 0,4(4) + lfs 12,0(4) + addi 4,4,8 + fabs 0,0 + fabs 12,12 + fadds 0,0,12 + fcmpu 7,0,11 + bng 7,.L20 + fmr 11,0 + mr 3,8 +.L20: + addi 8,8,1 + bdnz .L21 +.L19: + li 0,-144 + ld 31,-8(1) + addi 3,3,1 + lvx 24,1,0 + li 0,-128 + lvx 25,1,0 + li 0,-112 + lvx 26,1,0 + li 0,-96 + lvx 27,1,0 + li 0,-80 + lvx 28,1,0 + li 0,-64 + lvx 29,1,0 + li 0,-48 + lvx 30,1,0 + li 0,-32 + lvx 31,1,0 + blr + .p2align 4,,15 +.L56: + cmplw 7,6,5 + ble 7,.L7 + mr 6,5 +.L7: + rldicl 3,6,0,32 + b .L8 + .p2align 4,,15 +.L29: + li 3,1 + blr + .p2align 4,,15 +.L11: + bnl 7,.L13 + xscpsgndp 10,51,51 + mr 11,0 + b .L13 + .p2align 4,,15 +.L57: + cmpd 7,3,11 + ble 7,.L17 + mr 3,11 + b .L17 + .p2align 4,,15 +.L58: + fmr 11,10 + mr 3,11 + b .L17 +.L43: + li 9,1 + mtctr 9 + b .L44 +.L37: + li 9,1 + mtctr 9 + b .L21 + .long 0 + .byte 0,0,0,0,0,1,0,0 + .size icamax_k,.-icamax_k + .section .rodata.cst16,"aM",@progbits,16 + .align 4 +.LC2: + .byte 0 + .byte 1 + .byte 2 + .byte 3 + .byte 8 + .byte 9 + .byte 10 + .byte 11 + .byte 16 + .byte 17 + .byte 18 + .byte 19 + .byte 24 + .byte 25 + .byte 26 + .byte 27 +.LC3: + .byte 4 + .byte 5 + .byte 6 + .byte 7 + .byte 12 + .byte 13 + .byte 14 + .byte 15 + .byte 20 + .byte 21 + .byte 22 + .byte 23 + .byte 28 + .byte 29 + .byte 30 + .byte 31 +.LC4: + .long 0 + .long 1 + .long 2 + .long 3 +.LC5: + .long 4 + .long 5 + .long 6 + .long 7 +.LC6: + .long 8 + .long 9 + .long 10 + .long 11 +.LC7: + .long 12 + .long 13 + .long 14 + .long 15 +.LC8: + .long 32 + .long 32 + .long 32 + .long 32 + .ident "GCC: (SUSE Linux) 7.3.1 20180323 [gcc-7-branch revision 258812]" + .section .note.GNU-stack,"",@progbits diff --git a/kernel/power/icamax_power9.S b/kernel/power/icamax_power9.S new file mode 100644 index 
000000000..2968b3f8b --- /dev/null +++ b/kernel/power/icamax_power9.S @@ -0,0 +1,387 @@ + .file "icamax.c" + .abiversion 2 + .section ".text" + .align 2 + .p2align 4,,15 + .globl icamax_k + .type icamax_k, @function +icamax_k: +.LCF0: +0: addis 2,12,.TOC.-.LCF0@ha + addi 2,2,.TOC.-.LCF0@l + .localentry icamax_k,.-icamax_k + mr. 9,3 + ble 0,.L25 + cmpdi 7,5,0 + li 3,0 + blelr 7 + cmpdi 7,5,1 + beq 7,.L53 + lfs 11,0(4) + lfs 0,4(4) + cmpdi 7,9,1 + fabs 11,11 + fabs 0,0 + fadds 11,11,0 + beq 7,.L29 + addi 9,9,-1 + sldi 5,5,3 + li 3,0 + mtctr 9 + add 4,4,5 + li 9,1 + .p2align 4,,15 +.L24: + lfs 0,4(4) + lfs 12,0(4) + add 4,4,5 + fabs 0,0 + fabs 12,12 + fadds 0,0,12 + fcmpu 7,0,11 + bng 7,.L23 + fmr 11,0 + mr 3,9 +.L23: + addi 9,9,1 + bdnz .L24 +.L51: + addi 3,3,1 + blr + .p2align 4,,15 +.L25: + li 3,0 + blr + .p2align 4,,15 +.L53: + rldicr. 8,9,0,58 + bne 0,.L54 + addi 7,8,1 + li 10,0 + subf 6,8,9 + li 3,0 + xxlxor 11,11,11 + cmpd 7,7,9 + sldi 10,10,2 + mtctr 6 + add 4,4,10 + bgt 7,.L43 + li 10,-1 + rldicr 10,10,0,0 + cmpd 7,9,10 + beq 7,.L43 + .p2align 4,,15 +.L44: + lfs 0,4(4) + lfs 12,0(4) + addi 4,4,8 + fabs 0,0 + fabs 12,12 + fadds 0,0,12 + fcmpu 7,0,11 + bng 7,.L46 + fmr 11,0 + mr 3,8 +.L46: + addi 8,8,1 + bdnz .L44 + b .L51 + .p2align 4,,15 +.L54: + addis 11,2,.LC2@toc@ha + addis 3,2,.LC3@toc@ha + addis 5,2,.LC6@toc@ha + addis 6,2,.LC7@toc@ha + xxspltib 47,0 + addis 7,2,.LC4@toc@ha + addis 10,2,.LC5@toc@ha + stxv 58,-96(1) + stxv 59,-80(1) + addi 11,11,.LC2@toc@l + addi 3,3,.LC3@toc@l + addi 5,5,.LC6@toc@l + addi 6,6,.LC7@toc@l + stxv 62,-32(1) + stxv 63,-16(1) + xxspltib 58,16 + addi 7,7,.LC4@toc@l + addi 10,10,.LC5@toc@l + xxspltib 59,32 + lxv 44,0(11) + lxv 45,0(3) + xxspltib 48,0 + lxv 62,0(5) + xxlor 46,47,47 + lxv 63,0(6) + stxv 60,-64(1) + stxv 61,-48(1) + lxv 60,0(7) + lxv 61,0(10) + li 7,0 + mr 10,4 + vextsb2w 26,26 + vextsb2w 27,27 + stxv 56,-128(1) + stxv 57,-112(1) + .p2align 4,,15 +.L5: + lxv 0,0(10) + addi 7,7,32 + addi 10,10,256 + cmpd 7,8,7 + 
xvabssp 34,0 + lxv 0,-240(10) + xvabssp 42,0 + lxv 0,-224(10) + xvabssp 49,0 + lxv 0,-208(10) + vpermr 25,10,2,12 + vpermr 2,10,2,13 + xvabssp 35,0 + lxv 0,-192(10) + xvaddsp 34,57,34 + xvabssp 36,0 + lxv 0,-176(10) + vpermr 10,3,17,12 + vpermr 3,3,17,13 + xvabssp 33,0 + lxv 0,-160(10) + xvaddsp 10,42,35 + xvabssp 50,0 + lxv 0,-144(10) + vpermr 17,1,4,12 + vpermr 4,1,4,13 + xvabssp 37,0 + lxv 0,-128(10) + xvaddsp 36,49,36 + xvabssp 38,0 + lxv 0,-112(10) + vpermr 1,5,18,12 + vpermr 5,5,18,13 + xvabssp 43,0 + lxv 0,-96(10) + xvaddsp 12,33,37 + xvabssp 51,0 + lxv 0,-80(10) + vpermr 18,11,6,12 + vpermr 6,11,6,13 + xvabssp 39,0 + lxv 0,-64(10) + xvaddsp 38,50,38 + xvabssp 40,0 + lxv 0,-48(10) + vpermr 11,7,19,12 + vpermr 7,7,19,13 + xvabssp 32,0 + lxv 0,-32(10) + xvaddsp 11,43,39 + xvcmpgtsp 39,10,34 + xvcmpgtsp 43,12,36 + xvabssp 56,0 + lxv 0,-16(10) + vpermr 19,0,8,12 + vpermr 8,0,8,13 + xxsel 10,34,10,39 + xxsel 12,36,12,43 + xxsel 39,60,61,39 + xxsel 43,62,63,43 + xvabssp 41,0 + xvaddsp 40,51,40 + vpermr 0,9,24,12 + vpermr 9,9,24,13 + xvaddsp 0,32,41 + xvcmpgtsp 41,11,38 + xvcmpgtsp 32,12,10 + xvcmpgtsp 42,0,40 + xxsel 11,38,11,41 + xxsel 12,10,12,32 + xxsel 43,39,43,32 + xxsel 41,60,61,41 + xxsel 0,40,0,42 + xxsel 42,62,63,42 + xvcmpgtsp 33,0,11 + xxsel 0,11,0,33 + xxsel 33,41,42,33 + xvcmpgtsp 32,0,12 + vadduwm 1,1,26 + xxsel 0,12,0,32 + xxsel 32,43,33,32 + xvcmpgtsp 33,0,48 + vadduwm 0,14,0 + vadduwm 14,14,27 + xxsel 47,47,32,33 + xxsel 48,48,0,33 + bgt 7,.L5 + xxsldwi 11,48,48,3 + xxsldwi 12,48,48,2 + li 10,0 + li 3,12 + xxsldwi 0,48,48,1 + xscvspdp 48,48 + vextuwrx 6,10,15 + li 10,4 + xscvspdp 11,11 + xscvspdp 12,12 + xscvspdp 0,0 + vextuwrx 5,10,15 + li 10,8 + vextuwrx 7,10,15 + vextuwrx 10,3,15 + rldicl 12,5,0,32 + rldicl 3,6,0,32 + rldicl 11,7,0,32 + rldicl 0,10,0,32 + fcmpu 7,11,12 + fmr 10,0 + beq 7,.L55 + bnl 7,.L8 + mr 3,12 + fmr 11,12 +.L8: + xscmpudp 7,0,48 + bne 7,.L11 + cmplw 7,7,10 + ble 7,.L12 + mr 7,10 +.L12: + rldicl 11,7,0,32 +.L13: + fcmpu 
7,11,10 + beq 7,.L56 + bnl 7,.L17 + mr 3,11 + fmr 11,10 +.L17: + cmpd 7,9,8 + ble 7,.L19 + addi 7,8,1 + sldi 10,8,1 + subf 6,8,9 + cmpd 7,7,9 + sldi 10,10,2 + mtctr 6 + add 4,4,10 + bgt 7,.L37 + li 10,-1 + rldicr 10,10,0,0 + cmpd 7,9,10 + beq 7,.L37 + .p2align 4,,15 +.L21: + lfs 0,4(4) + lfs 12,0(4) + addi 4,4,8 + fabs 0,0 + fabs 12,12 + fadds 0,0,12 + fcmpu 7,0,11 + bng 7,.L20 + fmr 11,0 + mr 3,8 +.L20: + addi 8,8,1 + bdnz .L21 +.L19: + lxv 56,-128(1) + lxv 57,-112(1) + addi 3,3,1 + lxv 58,-96(1) + lxv 59,-80(1) + lxv 60,-64(1) + lxv 61,-48(1) + lxv 62,-32(1) + lxv 63,-16(1) + blr + .p2align 4,,15 +.L55: + cmplw 7,6,5 + ble 7,.L7 + mr 6,5 +.L7: + rldicl 3,6,0,32 + b .L8 + .p2align 4,,15 +.L29: + li 3,1 + blr + .p2align 4,,15 +.L11: + bnl 7,.L13 + mr 11,0 + xscpsgndp 10,48,48 + b .L13 + .p2align 4,,15 +.L56: + cmpd 7,3,11 + ble 7,.L17 + mr 3,11 + b .L17 +.L37: + li 9,1 + mtctr 9 + b .L21 +.L43: + li 9,1 + mtctr 9 + b .L44 + .long 0 + .byte 0,0,0,0,0,0,0,0 + .size icamax_k,.-icamax_k + .section .rodata.cst16,"aM",@progbits,16 + .align 4 +.LC2: + .byte 0 + .byte 1 + .byte 2 + .byte 3 + .byte 8 + .byte 9 + .byte 10 + .byte 11 + .byte 16 + .byte 17 + .byte 18 + .byte 19 + .byte 24 + .byte 25 + .byte 26 + .byte 27 +.LC3: + .byte 4 + .byte 5 + .byte 6 + .byte 7 + .byte 12 + .byte 13 + .byte 14 + .byte 15 + .byte 20 + .byte 21 + .byte 22 + .byte 23 + .byte 28 + .byte 29 + .byte 30 + .byte 31 +.LC4: + .long 0 + .long 1 + .long 2 + .long 3 +.LC5: + .long 4 + .long 5 + .long 6 + .long 7 +.LC6: + .long 8 + .long 9 + .long 10 + .long 11 +.LC7: + .long 12 + .long 13 + .long 14 + .long 15 + .ident "GCC: (SUSE Linux) 7.3.1 20180323 [gcc-7-branch revision 258812]" + .section .note.GNU-stack,"",@progbits diff --git a/kernel/power/icamin_power8.S b/kernel/power/icamin_power8.S new file mode 100644 index 000000000..e3d66798e --- /dev/null +++ b/kernel/power/icamin_power8.S @@ -0,0 +1,454 @@ +/* .file "icamin.c" + .abiversion 2 + .section ".text" + .align 2 + .p2align 4,,15 + .globl 
icamin_k + .type icamin_k, @function +*/ +#define ASSEMBLER +#include "common.h" + + PROLOGUE + +icamin_k: +.LCF0: +0: addis 2,12,.TOC.-.LCF0@ha + addi 2,2,.TOC.-.LCF0@l + .localentry icamin_k,.-icamin_k + mr. 9,3 + ble 0,.L25 + cmpdi 7,5,0 + li 3,0 + blelr 7 + lfs 11,0(4) + lfs 0,4(4) + cmpdi 7,5,1 + fabs 11,11 + fabs 0,0 + fadds 11,11,0 + beq 7,.L54 + cmpdi 7,9,1 + beq 7,.L29 + addi 9,9,-1 + sldi 5,5,3 + mtctr 9 + add 4,4,5 + li 3,0 + li 9,1 + .p2align 4,,15 +.L24: + lfs 0,4(4) + lfs 12,0(4) + add 4,4,5 + fabs 0,0 + fabs 12,12 + fadds 0,0,12 + fcmpu 7,0,11 + bnl 7,.L23 + fmr 11,0 + mr 3,9 +.L23: + addi 9,9,1 + bdnz .L24 +.L52: + addi 3,3,1 + blr + .p2align 4,,15 +.L25: + li 3,0 + blr + .p2align 4,,15 +.L54: + rldicr. 8,9,0,58 + bne 0,.L55 + addi 7,8,1 + li 10,0 + cmpd 7,7,9 + sldi 10,10,2 + add 4,4,10 + subf 10,8,9 + mtctr 10 + li 3,0 + bgt 7,.L43 + li 10,-1 + rldicr 10,10,0,0 + cmpd 7,9,10 + beq 7,.L43 + .p2align 4,,15 +.L44: + lfs 0,0(4) + lfs 12,4(4) + addi 4,4,8 + fabs 0,0 + fabs 12,12 + fadds 0,0,12 + fcmpu 7,11,0 + bng 7,.L46 + fmr 11,0 + mr 3,8 +.L46: + addi 8,8,1 + bdnz .L44 + b .L52 + .p2align 4,,15 +.L55: + li 0,-128 + std 31,-8(1) + addis 5,2,.LC2@toc@ha + xscvdpspn 11,11 + vspltisw 19,0 + addis 6,2,.LC3@toc@ha + addi 5,5,.LC2@toc@l + stvx 25,1,0 + li 0,-112 + addi 6,6,.LC3@toc@l + xxlor 50,51,51 + addis 7,2,.LC4@toc@ha + lxvd2x 44,0,5 + addis 10,2,.LC5@toc@ha + stvx 26,1,0 + li 0,-96 + addi 7,7,.LC4@toc@l + lxvd2x 45,0,6 + addis 5,2,.LC6@toc@ha + addis 6,2,.LC7@toc@ha + stvx 27,1,0 + li 0,-80 + addi 10,10,.LC5@toc@l + xxspltw 5,11,0 + addi 6,6,.LC7@toc@l + addi 5,5,.LC6@toc@l + stvx 28,1,0 + li 0,-64 + lxvd2x 47,0,10 + xxpermdi 44,44,44,2 + mr 10,4 + lxvd2x 49,0,6 + lxvd2x 48,0,5 + xxpermdi 45,45,45,2 + li 6,0 + stvx 29,1,0 + li 0,-48 + xxlnand 44,44,44 + xxlnand 45,45,45 + stvx 30,1,0 + lxvd2x 62,0,7 + addis 7,2,.LC8@toc@ha + li 0,-32 + addi 7,7,.LC8@toc@l + xxpermdi 47,47,47,2 + stvx 31,1,0 + vspltisw 31,8 + xxpermdi 48,48,48,2 + lxvd2x 46,0,7 + 
vadduwm 31,31,31 + xxpermdi 49,49,49,2 + xxpermdi 62,62,62,2 + .p2align 4,,15 +.L5: + addi 3,10,16 + addi 5,10,32 + lxvd2x 34,0,10 + addi 7,10,64 + addi 31,10,48 + addi 12,10,80 + addi 11,10,96 + lxvd2x 36,0,3 + lxvd2x 37,0,5 + addi 3,10,112 + addi 5,10,128 + lxvd2x 38,0,7 + lxvd2x 6,0,31 + addi 7,10,160 + addi 31,10,144 + lxvd2x 33,0,12 + lxvd2x 39,0,11 + addi 12,10,176 + addi 11,10,192 + lxvd2x 7,0,3 + lxvd2x 40,0,5 + xxpermdi 34,34,34,2 + addi 3,10,208 + addi 5,10,224 + lxvd2x 41,0,7 + lxvd2x 8,0,31 + addi 7,10,240 + lxvd2x 9,0,12 + lxvd2x 42,0,11 + xxpermdi 37,37,37,2 + xxpermdi 36,36,36,2 + addi 6,6,32 + lxvd2x 32,0,3 + lxvd2x 43,0,5 + xxpermdi 6,6,6,2 + xxpermdi 38,38,38,2 + cmpd 7,8,6 + addi 10,10,256 + lxvd2x 10,0,7 + xxpermdi 39,39,39,2 + xxpermdi 33,33,33,2 + xxpermdi 40,40,40,2 + xxpermdi 7,7,7,2 + xxpermdi 41,41,41,2 + xxpermdi 8,8,8,2 + xxpermdi 9,9,9,2 + xxpermdi 42,42,42,2 + xxpermdi 43,43,43,2 + xxpermdi 32,32,32,2 + xxpermdi 10,10,10,2 + xvabssp 58,37 + xvabssp 59,39 + xvabssp 35,40 + xvabssp 60,41 + xvabssp 34,34 + xvabssp 33,33 + xvabssp 32,32 + xvabssp 61,43 + xvabssp 36,36 + xvabssp 37,6 + xvabssp 38,38 + xvabssp 39,7 + xvabssp 40,8 + xvabssp 41,9 + xvabssp 42,42 + xvabssp 43,10 + vperm 25,4,2,12 + vperm 4,4,2,13 + vperm 2,5,26,12 + vperm 5,5,26,13 + vperm 26,1,6,12 + vperm 6,1,6,13 + vperm 1,7,27,12 + vperm 7,7,27,13 + vperm 27,8,3,12 + vperm 8,8,3,13 + vperm 3,9,28,12 + vperm 9,9,28,13 + vperm 28,0,10,12 + vperm 10,0,10,13 + vperm 0,11,29,12 + vperm 11,11,29,13 + xvaddsp 12,33,39 + xvaddsp 38,58,38 + xvaddsp 0,32,43 + xvaddsp 42,60,42 + xvaddsp 36,57,36 + xvaddsp 37,34,37 + xvaddsp 40,59,40 + xvaddsp 41,35,41 + xvcmpgtsp 32,38,12 + xvcmpgtsp 33,42,0 + xvcmpgtsp 43,36,37 + xvcmpgtsp 39,40,41 + xxsel 12,38,12,32 + xxsel 38,48,49,32 + xxsel 0,42,0,33 + xxsel 42,48,49,33 + xxsel 37,36,37,43 + xxsel 43,62,47,43 + xxsel 41,40,41,39 + xxsel 39,62,47,39 + xvcmpgtsp 32,37,12 + xvcmpgtsp 33,41,0 + xxsel 12,37,12,32 + xxsel 43,43,38,32 + xxsel 0,41,0,33 
+ xxsel 33,39,42,33 + xvcmpgtsp 32,12,0 + vadduwm 1,1,31 + xxsel 0,12,0,32 + xxsel 32,43,33,32 + xvcmpgtsp 33,5,0 + vadduwm 0,0,18 + vadduwm 18,18,14 + xxsel 51,51,32,33 + xxsel 5,5,0,33 + bgt 7,.L5 + xxsldwi 11,5,5,3 + xxsldwi 12,5,5,2 + vspltw 0,19,3 + xxsldwi 0,5,5,1 + xscvspdp 11,11 + xscvspdp 12,12 + mfvsrwz 6,32 + vspltw 0,19,2 + xscvspdp 0,0 + mfvsrwz 7,51 + mfvsrwz 5,32 + vspltw 0,19,0 + xscvspdp 5,5 + mfvsrwz 10,32 + fcmpu 7,11,12 + rldicl 3,6,0,32 + fmr 10,0 + rldicl 11,7,0,32 + rldicl 31,5,0,32 + rldicl 0,10,0,32 + beq 7,.L56 + bng 7,.L8 + fmr 11,12 + mr 3,31 +.L8: + fcmpu 7,0,5 + bne 7,.L11 + cmplw 7,7,10 + ble 7,.L12 + mr 7,10 +.L12: + rldicl 11,7,0,32 +.L13: + fcmpu 7,11,10 + beq 7,.L57 + bgt 7,.L58 +.L17: + cmpd 7,9,8 + ble 7,.L19 + addi 7,8,1 + sldi 10,8,1 + cmpd 7,7,9 + sldi 10,10,2 + add 4,4,10 + subf 10,8,9 + mtctr 10 + bgt 7,.L37 + li 10,-1 + rldicr 10,10,0,0 + cmpd 7,9,10 + beq 7,.L37 + .p2align 4,,15 +.L21: + lfs 0,0(4) + lfs 12,4(4) + addi 4,4,8 + fabs 0,0 + fabs 12,12 + fadds 0,0,12 + fcmpu 7,11,0 + bng 7,.L20 + fmr 11,0 + mr 3,8 +.L20: + addi 8,8,1 + bdnz .L21 +.L19: + li 0,-128 + ld 31,-8(1) + addi 3,3,1 + lvx 25,1,0 + li 0,-112 + lvx 26,1,0 + li 0,-96 + lvx 27,1,0 + li 0,-80 + lvx 28,1,0 + li 0,-64 + lvx 29,1,0 + li 0,-48 + lvx 30,1,0 + li 0,-32 + lvx 31,1,0 + blr + .p2align 4,,15 +.L56: + cmplw 7,6,5 + ble 7,.L7 + mr 6,5 +.L7: + rldicl 3,6,0,32 + b .L8 + .p2align 4,,15 +.L29: + li 3,1 + blr + .p2align 4,,15 +.L11: + bng 7,.L13 + fmr 10,5 + mr 11,0 + b .L13 + .p2align 4,,15 +.L57: + cmpd 7,3,11 + ble 7,.L17 + mr 3,11 + b .L17 + .p2align 4,,15 +.L58: + fmr 11,10 + mr 3,11 + b .L17 +.L43: + li 9,1 + mtctr 9 + b .L44 +.L37: + li 9,1 + mtctr 9 + b .L21 + .long 0 + .byte 0,0,0,0,0,1,0,0 + .size icamin_k,.-icamin_k + .section .rodata.cst16,"aM",@progbits,16 + .align 4 +.LC2: + .byte 0 + .byte 1 + .byte 2 + .byte 3 + .byte 8 + .byte 9 + .byte 10 + .byte 11 + .byte 16 + .byte 17 + .byte 18 + .byte 19 + .byte 24 + .byte 25 + .byte 26 + .byte 27 
+.LC3: + .byte 4 + .byte 5 + .byte 6 + .byte 7 + .byte 12 + .byte 13 + .byte 14 + .byte 15 + .byte 20 + .byte 21 + .byte 22 + .byte 23 + .byte 28 + .byte 29 + .byte 30 + .byte 31 +.LC4: + .long 0 + .long 1 + .long 2 + .long 3 +.LC5: + .long 4 + .long 5 + .long 6 + .long 7 +.LC6: + .long 8 + .long 9 + .long 10 + .long 11 +.LC7: + .long 12 + .long 13 + .long 14 + .long 15 +.LC8: + .long 32 + .long 32 + .long 32 + .long 32 + .ident "GCC: (SUSE Linux) 7.3.1 20180323 [gcc-7-branch revision 258812]" + .section .note.GNU-stack,"",@progbits diff --git a/kernel/power/icamin_power9.S b/kernel/power/icamin_power9.S new file mode 100644 index 000000000..8eaa79f33 --- /dev/null +++ b/kernel/power/icamin_power9.S @@ -0,0 +1,385 @@ + .file "icamin.c" + .abiversion 2 + .section ".text" + .align 2 + .p2align 4,,15 + .globl icamin_k + .type icamin_k, @function +icamin_k: +.LCF0: +0: addis 2,12,.TOC.-.LCF0@ha + addi 2,2,.TOC.-.LCF0@l + .localentry icamin_k,.-icamin_k + mr. 9,3 + ble 0,.L25 + cmpdi 7,5,0 + li 3,0 + blelr 7 + lfs 11,0(4) + lfs 0,4(4) + cmpdi 7,5,1 + fabs 11,11 + fabs 0,0 + fadds 11,11,0 + beq 7,.L53 + cmpdi 7,9,1 + beq 7,.L29 + addi 9,9,-1 + sldi 5,5,3 + li 3,0 + mtctr 9 + add 4,4,5 + li 9,1 + .p2align 4,,15 +.L24: + lfs 0,4(4) + lfs 12,0(4) + add 4,4,5 + fabs 0,0 + fabs 12,12 + fadds 0,0,12 + fcmpu 7,0,11 + bnl 7,.L23 + fmr 11,0 + mr 3,9 +.L23: + addi 9,9,1 + bdnz .L24 +.L51: + addi 3,3,1 + blr + .p2align 4,,15 +.L25: + li 3,0 + blr + .p2align 4,,15 +.L53: + rldicr. 
8,9,0,58 + bne 0,.L54 + addi 7,8,1 + li 10,0 + subf 6,8,9 + li 3,0 + cmpd 7,7,9 + sldi 10,10,2 + mtctr 6 + add 4,4,10 + bgt 7,.L43 + li 10,-1 + rldicr 10,10,0,0 + cmpd 7,9,10 + beq 7,.L43 + .p2align 4,,15 +.L44: + lfs 0,0(4) + lfs 12,4(4) + addi 4,4,8 + fabs 0,0 + fabs 12,12 + fadds 0,0,12 + fcmpu 7,11,0 + bng 7,.L46 + fmr 11,0 + mr 3,8 +.L46: + addi 8,8,1 + bdnz .L44 + b .L51 + .p2align 4,,15 +.L54: + xscvdpspn 9,11 + addis 11,2,.LC2@toc@ha + addis 3,2,.LC3@toc@ha + addis 5,2,.LC6@toc@ha + addis 6,2,.LC7@toc@ha + addis 7,2,.LC4@toc@ha + addis 10,2,.LC5@toc@ha + xxspltib 48,0 + addi 11,11,.LC2@toc@l + addi 3,3,.LC3@toc@l + addi 5,5,.LC6@toc@l + stxv 59,-80(1) + addi 6,6,.LC7@toc@l + stxv 60,-64(1) + stxv 63,-16(1) + addi 7,7,.LC4@toc@l + xxspltib 59,16 + lxv 44,0(11) + xxspltib 60,32 + lxv 45,0(3) + lxv 63,0(5) + xxlor 47,48,48 + lxv 46,0(6) + addi 10,10,.LC5@toc@l + stxv 61,-48(1) + stxv 62,-32(1) + xxspltw 9,9,0 + lxv 61,0(7) + lxv 62,0(10) + li 7,0 + mr 10,4 + vextsb2w 27,27 + vextsb2w 28,28 + stxv 57,-112(1) + stxv 58,-96(1) + .p2align 4,,15 +.L5: + lxv 0,0(10) + addi 7,7,32 + addi 10,10,256 + cmpd 7,8,7 + xvabssp 34,0 + lxv 0,-240(10) + xvabssp 42,0 + lxv 0,-224(10) + xvabssp 49,0 + lxv 0,-208(10) + vpermr 26,10,2,12 + vpermr 2,10,2,13 + xvabssp 35,0 + lxv 0,-192(10) + xvaddsp 34,58,34 + xvabssp 36,0 + lxv 0,-176(10) + vpermr 10,3,17,12 + vpermr 3,3,17,13 + xvabssp 33,0 + lxv 0,-160(10) + xvaddsp 10,42,35 + xvabssp 50,0 + lxv 0,-144(10) + vpermr 17,1,4,12 + vpermr 4,1,4,13 + xvabssp 37,0 + lxv 0,-128(10) + xvaddsp 36,49,36 + xvabssp 38,0 + lxv 0,-112(10) + vpermr 1,5,18,12 + vpermr 5,5,18,13 + xvabssp 43,0 + lxv 0,-96(10) + xvaddsp 12,33,37 + xvabssp 51,0 + lxv 0,-80(10) + vpermr 18,11,6,12 + vpermr 6,11,6,13 + xvabssp 39,0 + lxv 0,-64(10) + xvaddsp 38,50,38 + xvabssp 40,0 + lxv 0,-48(10) + vpermr 11,7,19,12 + vpermr 7,7,19,13 + xvabssp 32,0 + lxv 0,-32(10) + xvaddsp 11,43,39 + xvcmpgtsp 39,34,10 + xvcmpgtsp 43,36,12 + xvabssp 57,0 + lxv 0,-16(10) + vpermr 
19,0,8,12 + vpermr 8,0,8,13 + xxsel 10,34,10,39 + xxsel 12,36,12,43 + xxsel 39,61,62,39 + xxsel 43,63,46,43 + xvabssp 41,0 + xvaddsp 40,51,40 + vpermr 0,9,25,12 + vpermr 9,9,25,13 + xvaddsp 0,32,41 + xvcmpgtsp 41,38,11 + xvcmpgtsp 32,10,12 + xvcmpgtsp 42,40,0 + xxsel 11,38,11,41 + xxsel 12,10,12,32 + xxsel 43,39,43,32 + xxsel 41,61,62,41 + xxsel 0,40,0,42 + xxsel 42,63,46,42 + xvcmpgtsp 33,11,0 + xxsel 0,11,0,33 + xxsel 33,41,42,33 + xvcmpgtsp 32,12,0 + vadduwm 1,1,27 + xxsel 0,12,0,32 + xxsel 32,43,33,32 + xvcmpgtsp 33,9,0 + vadduwm 0,0,15 + vadduwm 15,15,28 + xxsel 48,48,32,33 + xxsel 9,9,0,33 + bgt 7,.L5 + xxsldwi 11,9,9,3 + xxsldwi 12,9,9,2 + li 10,0 + li 3,12 + xxsldwi 0,9,9,1 + xscvspdp 9,9 + vextuwrx 6,10,16 + li 10,4 + xscvspdp 11,11 + xscvspdp 12,12 + xscvspdp 0,0 + vextuwrx 5,10,16 + li 10,8 + vextuwrx 7,10,16 + vextuwrx 10,3,16 + rldicl 12,5,0,32 + rldicl 3,6,0,32 + rldicl 11,7,0,32 + rldicl 0,10,0,32 + fcmpu 7,11,12 + fmr 10,0 + beq 7,.L55 + bng 7,.L8 + mr 3,12 + fmr 11,12 +.L8: + fcmpu 7,0,9 + bne 7,.L11 + cmplw 7,7,10 + ble 7,.L12 + mr 7,10 +.L12: + rldicl 11,7,0,32 +.L13: + fcmpu 7,11,10 + beq 7,.L56 + bng 7,.L17 + mr 3,11 + fmr 11,10 +.L17: + cmpd 7,9,8 + ble 7,.L19 + addi 7,8,1 + sldi 10,8,1 + subf 6,8,9 + cmpd 7,7,9 + sldi 10,10,2 + mtctr 6 + add 4,4,10 + bgt 7,.L37 + li 10,-1 + rldicr 10,10,0,0 + cmpd 7,9,10 + beq 7,.L37 + .p2align 4,,15 +.L21: + lfs 0,0(4) + lfs 12,4(4) + addi 4,4,8 + fabs 0,0 + fabs 12,12 + fadds 0,0,12 + fcmpu 7,11,0 + bng 7,.L20 + fmr 11,0 + mr 3,8 +.L20: + addi 8,8,1 + bdnz .L21 +.L19: + lxv 57,-112(1) + lxv 58,-96(1) + addi 3,3,1 + lxv 59,-80(1) + lxv 60,-64(1) + lxv 61,-48(1) + lxv 62,-32(1) + lxv 63,-16(1) + blr + .p2align 4,,15 +.L55: + cmplw 7,6,5 + ble 7,.L7 + mr 6,5 +.L7: + rldicl 3,6,0,32 + b .L8 + .p2align 4,,15 +.L29: + li 3,1 + blr + .p2align 4,,15 +.L11: + bng 7,.L13 + mr 11,0 + fmr 10,9 + b .L13 + .p2align 4,,15 +.L56: + cmpd 7,3,11 + ble 7,.L17 + mr 3,11 + b .L17 +.L37: + li 9,1 + mtctr 9 + b .L21 +.L43: + li 
9,1 + mtctr 9 + b .L44 + .long 0 + .byte 0,0,0,0,0,0,0,0 + .size icamin_k,.-icamin_k + .section .rodata.cst16,"aM",@progbits,16 + .align 4 +.LC2: + .byte 0 + .byte 1 + .byte 2 + .byte 3 + .byte 8 + .byte 9 + .byte 10 + .byte 11 + .byte 16 + .byte 17 + .byte 18 + .byte 19 + .byte 24 + .byte 25 + .byte 26 + .byte 27 +.LC3: + .byte 4 + .byte 5 + .byte 6 + .byte 7 + .byte 12 + .byte 13 + .byte 14 + .byte 15 + .byte 20 + .byte 21 + .byte 22 + .byte 23 + .byte 28 + .byte 29 + .byte 30 + .byte 31 +.LC4: + .long 0 + .long 1 + .long 2 + .long 3 +.LC5: + .long 4 + .long 5 + .long 6 + .long 7 +.LC6: + .long 8 + .long 9 + .long 10 + .long 11 +.LC7: + .long 12 + .long 13 + .long 14 + .long 15 + .ident "GCC: (SUSE Linux) 7.3.1 20180323 [gcc-7-branch revision 258812]" + .section .note.GNU-stack,"",@progbits diff --git a/kernel/power/isamax_power8.S b/kernel/power/isamax_power8.S new file mode 100644 index 000000000..c8fcaecc3 --- /dev/null +++ b/kernel/power/isamax_power8.S @@ -0,0 +1,434 @@ +/* .file "isamax.c" + .abiversion 2 + .section ".text" + .align 2 + .p2align 4,,15 + .globl isamax_k + .type isamax_k, @function +*/ + +#define ASSEMBLER +#include "common.h" + + PROLOGUE + +isamax_k: +.LCF0: +0: addis 2,12,.TOC.-.LCF0@ha + addi 2,2,.TOC.-.LCF0@l + .localentry isamax_k,.-isamax_k + mr. 11,3 + ble 0,.L36 + cmpdi 7,5,0 + li 3,0 + blelr 7 + cmpdi 7,5,1 + beq 7,.L69 + rldicr. 
7,11,0,61 + beq 0,.L40 + sldi 3,5,1 + xxlxor 0,0,0 + sldi 6,5,2 + add 3,3,5 + sldi 0,5,4 + sldi 3,3,2 + sldi 5,5,3 + mr 9,4 + li 8,0 + li 10,0 + .p2align 4,,15 +.L31: + lfs 12,0(9) + fabs 12,12 + fcmpu 7,12,0 + bng 7,.L23 + fmr 0,12 + mr 8,10 +.L23: + lfsx 12,9,6 + fabs 12,12 + fcmpu 7,12,0 + bng 7,.L25 + fmr 0,12 + addi 8,10,1 +.L25: + lfsx 12,9,5 + fabs 12,12 + fcmpu 7,12,0 + bng 7,.L27 + fmr 0,12 + addi 8,10,2 +.L27: + lfsx 12,9,3 + add 9,9,0 + fabs 12,12 + fcmpu 7,12,0 + bng 7,.L29 + fmr 0,12 + addi 8,10,3 +.L29: + addi 10,10,4 + cmpd 7,7,10 + bgt 7,.L31 + addi 7,7,-1 + srdi 7,7,2 + addi 7,7,1 + sldi 9,7,2 + mulld 7,6,7 + cmpd 7,11,9 + ble 7,.L67 +.L22: + addi 10,9,1 + sldi 7,7,2 + cmpd 7,10,11 + subf 10,9,11 + mtctr 10 + add 4,4,7 + bgt 7,.L54 + li 3,-1 + rldicr 3,3,0,0 + cmpd 7,11,3 + beq 7,.L54 + .p2align 4,,15 +.L35: + lfs 12,0(4) + add 4,4,6 + fabs 12,12 + fcmpu 7,12,0 + bng 7,.L33 + fmr 0,12 + mr 8,9 +.L33: + addi 9,9,1 + bdnz .L35 +.L67: + addi 3,8,1 + blr + .p2align 4,,15 +.L36: + li 3,0 + blr + .p2align 4,,15 +.L69: + rldicr. 
10,11,0,57 + bne 0,.L70 + addi 7,10,1 + sldi 9,10,2 + xxlxor 12,12,12 + cmpd 7,7,11 + add 4,4,9 + subf 9,10,11 + li 8,0 + mtctr 9 + bgt 7,.L60 + li 3,-1 + rldicr 3,3,0,0 + cmpd 7,11,3 + beq 7,.L60 + .p2align 4,,15 +.L61: + lfs 0,0(4) + addi 4,4,4 + fabs 0,0 + fcmpu 7,0,12 + bng 7,.L63 + fmr 12,0 + mr 8,10 +.L63: + addi 10,10,1 + bdnz .L61 + b .L67 + .p2align 4,,15 +.L70: + li 0,-64 + std 31,-8(1) + addis 3,2,.LC2@toc@ha + vspltisw 18,0 + vspltisw 12,0 + addis 5,2,.LC3@toc@ha + addis 6,2,.LC6@toc@ha + stvx 29,1,0 + li 0,-48 + addis 8,2,.LC7@toc@ha + xxlor 35,50,50 + addi 3,3,.LC2@toc@l + addi 5,5,.LC3@toc@l + stvx 30,1,0 + addi 6,6,.LC6@toc@l + li 0,-32 + addi 8,8,.LC7@toc@l + lxvd2x 51,0,3 + lxvd2x 34,0,5 + addis 7,2,.LC4@toc@ha + stvx 31,1,0 + lxvd2x 47,0,6 + addis 9,2,.LC5@toc@ha + addi 7,7,.LC4@toc@l + lxvd2x 48,0,8 + addi 9,9,.LC5@toc@l + vspltisw 17,8 + vadduwm 17,17,17 + lxvd2x 36,0,7 + li 7,0 + lxvd2x 37,0,9 + mr 9,4 + .p2align 4,,15 +.L5: + addi 5,9,16 + addi 6,9,32 + lxvd2x 41,0,9 + vadduwm 31,3,15 + addi 8,9,64 + addi 31,9,48 + addi 12,9,80 + addi 3,9,96 + lxvd2x 5,0,5 + lxvd2x 43,0,6 + addi 5,9,112 + addi 6,9,128 + lxvd2x 1,0,8 + lxvd2x 9,0,31 + addi 8,9,160 + addi 31,9,144 + lxvd2x 6,0,12 + lxvd2x 13,0,3 + addi 12,9,176 + addi 3,9,192 + lxvd2x 11,0,5 + lxvd2x 2,0,6 + xvabssp 41,41 + addi 5,9,208 + addi 6,9,224 + lxvd2x 3,0,8 + lxvd2x 7,0,31 + addi 8,9,240 + lxvd2x 10,0,12 + lxvd2x 4,0,3 + xvabssp 43,43 + xvabssp 5,5 + addi 7,7,64 + lxvd2x 8,0,5 + lxvd2x 0,0,6 + xvabssp 9,9 + xvabssp 1,1 + cmpd 7,10,7 + addi 9,9,256 + lxvd2x 12,0,8 + xvabssp 6,6 + xvabssp 13,13 + xvabssp 11,11 + xvabssp 2,2 + xvabssp 7,7 + xvabssp 3,3 + xvabssp 10,10 + xvabssp 4,4 + xvabssp 8,8 + xvabssp 0,0 + xvabssp 12,12 + xvcmpgtsp 32,5,41 + xvcmpgtsp 61,9,43 + xvcmpgtsp 45,6,1 + xvcmpgtsp 62,11,13 + xvcmpgtsp 38,7,2 + xvcmpgtsp 46,10,3 + xvcmpgtsp 40,8,4 + xvcmpgtsp 39,12,0 + xxsel 5,41,5,32 + xxsel 32,51,34,32 + xxsel 9,43,9,61 + xxsel 6,1,6,45 + xxsel 11,13,11,62 + xxsel 
43,51,34,45 + xxsel 7,2,7,38 + xvcmpgtsp 41,9,5 + xxsel 10,3,10,46 + xvcmpgtsp 45,11,6 + xxsel 8,4,8,40 + xxsel 62,36,37,62 + xxsel 0,0,12,39 + xvcmpgtsp 42,10,7 + xxsel 61,36,37,61 + xxsel 40,51,34,40 + xvcmpgtsp 33,0,8 + xxsel 39,36,37,39 + xxsel 38,51,34,38 + xxsel 46,36,37,46 + xxsel 9,5,9,41 + xxsel 41,32,61,41 + xxsel 12,6,11,45 + xxsel 45,43,62,45 + xxsel 11,7,10,42 + xvcmpgtsp 32,12,9 + vadduwm 13,13,17 + xxsel 42,38,46,42 + xxsel 0,8,0,33 + xxsel 33,40,39,33 + xvcmpgtsp 43,0,11 + vadduwm 1,1,17 + xxsel 12,9,12,32 + xxsel 32,41,45,32 + vadduwm 0,3,0 + vadduwm 3,3,16 + xxsel 0,11,0,43 + xxsel 33,42,33,43 + xvcmpgtsp 45,0,12 + vadduwm 1,31,1 + xxsel 0,12,0,45 + xxsel 32,32,33,45 + xvcmpgtsp 33,0,44 + xxsel 50,50,32,33 + xxsel 44,44,0,33 + bgt 7,.L5 + xxsldwi 12,44,44,1 + xscvspdp 10,44 + vspltw 0,18,0 + xxsldwi 0,44,44,3 + xscvspdp 12,12 + mfvsrwz 3,50 + mfvsrwz 6,32 + vspltw 0,18,3 + xscvspdp 0,0 + xxsldwi 44,44,44,2 + mfvsrwz 7,32 + vspltw 0,18,2 + xscvspdp 44,44 + mfvsrwz 9,32 + fcmpu 7,12,10 + rldicl 8,3,0,32 + rldicl 31,6,0,32 + fmr 11,0 + rldicl 0,7,0,32 + rldicl 5,9,0,32 + beq 7,.L71 + bnl 7,.L8 + fmr 12,10 + mr 8,31 +.L8: + xscmpudp 7,0,44 + bne 7,.L11 + cmplw 7,7,9 + ble 7,.L12 + mr 7,9 +.L12: + rldicl 5,7,0,32 +.L13: + fcmpu 7,12,11 + beq 7,.L72 + bnl 7,.L17 + fmr 12,11 + mr 8,5 +.L17: + cmpd 7,11,10 + ble 7,.L16 + addi 7,10,1 + sldi 9,10,2 + cmpd 7,7,11 + add 4,4,9 + subf 9,10,11 + mtctr 9 + bgt 7,.L53 + li 3,-1 + rldicr 3,3,0,0 + cmpd 7,11,3 + beq 7,.L53 + .p2align 4,,15 +.L21: + lfs 0,0(4) + addi 4,4,4 + fabs 0,0 + fcmpu 7,0,12 + bng 7,.L19 + fmr 12,0 + mr 8,10 +.L19: + addi 10,10,1 + bdnz .L21 +.L16: + li 0,-64 + ld 31,-8(1) + addi 3,8,1 + lvx 29,1,0 + li 0,-48 + lvx 30,1,0 + li 0,-32 + lvx 31,1,0 + blr + .p2align 4,,15 +.L71: + cmplw 7,3,6 + ble 7,.L7 + mr 3,6 +.L7: + rldicl 8,3,0,32 + b .L8 + .p2align 4,,15 +.L40: + xxlxor 0,0,0 + sldi 6,5,2 + li 8,0 + li 9,0 + b .L22 + .p2align 4,,15 +.L11: + blt 7,.L39 + mr 5,0 + b .L13 + .p2align 4,,15 
+.L72: + cmpd 7,8,5 + ble 7,.L17 + mr 8,5 + b .L17 + .p2align 4,,15 +.L39: + xscpsgndp 11,44,44 + b .L13 +.L53: + li 9,1 + mtctr 9 + b .L21 +.L54: + li 10,1 + mtctr 10 + b .L35 +.L60: + li 9,1 + mtctr 9 + b .L61 + .long 0 + .byte 0,0,0,0,0,1,0,0 + .size isamax_k,.-isamax_k + .section .rodata.cst16,"aM",@progbits,16 + .align 4 +.LC2: + .long 0 + .long 1 + .long 2 + .long 3 +.LC3: + .long 4 + .long 5 + .long 6 + .long 7 +.LC4: + .long 8 + .long 9 + .long 10 + .long 11 +.LC5: + .long 12 + .long 13 + .long 14 + .long 15 +.LC6: + .long 32 + .long 32 + .long 32 + .long 32 +.LC7: + .long 64 + .long 64 + .long 64 + .long 64 + .ident "GCC: (SUSE Linux) 7.3.1 20180323 [gcc-7-branch revision 258812]" + .section .note.GNU-stack,"",@progbits diff --git a/kernel/power/isamax_power9.S b/kernel/power/isamax_power9.S new file mode 100644 index 000000000..9df1e773c --- /dev/null +++ b/kernel/power/isamax_power9.S @@ -0,0 +1,397 @@ + .file "isamax.c" + .abiversion 2 + .section ".text" + .align 2 + .p2align 4,,15 + .globl isamax_k + .type isamax_k, @function +isamax_k: +.LCF0: +0: addis 2,12,.TOC.-.LCF0@ha + addi 2,2,.TOC.-.LCF0@l + .localentry isamax_k,.-isamax_k + mr. 11,3 + ble 0,.L36 + cmpdi 7,5,0 + li 3,0 + blelr 7 + cmpdi 7,5,1 + beq 7,.L69 + rldicr. 
7,11,0,61 + beq 0,.L40 + sldi 10,5,1 + sldi 6,5,2 + sldi 0,5,4 + sldi 3,5,3 + mr 9,4 + xxlxor 0,0,0 + li 8,0 + add 5,10,5 + li 10,0 + sldi 5,5,2 + .p2align 4,,15 +.L31: + lfs 12,0(9) + fabs 12,12 + fcmpu 7,12,0 + bng 7,.L23 + fmr 0,12 + mr 8,10 +.L23: + lfsx 12,9,6 + fabs 12,12 + fcmpu 7,12,0 + bng 7,.L25 + fmr 0,12 + addi 8,10,1 +.L25: + lfsx 12,9,3 + fabs 12,12 + fcmpu 7,12,0 + bng 7,.L27 + fmr 0,12 + addi 8,10,2 +.L27: + lfsx 12,9,5 + add 9,9,0 + fabs 12,12 + fcmpu 7,12,0 + bng 7,.L29 + fmr 0,12 + addi 8,10,3 +.L29: + addi 10,10,4 + cmpd 7,7,10 + bgt 7,.L31 + addi 7,7,-1 + srdi 7,7,2 + addi 7,7,1 + sldi 9,7,2 + mulld 7,6,7 + cmpd 7,11,9 + ble 7,.L67 +.L22: + addi 10,9,1 + sldi 7,7,2 + subf 5,9,11 + cmpd 7,10,11 + mtctr 5 + add 4,4,7 + bgt 7,.L54 + li 3,-1 + rldicr 3,3,0,0 + cmpd 7,11,3 + beq 7,.L54 + .p2align 4,,15 +.L35: + lfs 12,0(4) + add 4,4,6 + fabs 12,12 + fcmpu 7,12,0 + bng 7,.L33 + fmr 0,12 + mr 8,9 +.L33: + addi 9,9,1 + bdnz .L35 +.L67: + addi 3,8,1 + blr + .p2align 4,,15 +.L36: + li 3,0 + blr + .p2align 4,,15 +.L69: + rldicr. 
10,11,0,57 + bne 0,.L70 + addi 7,10,1 + sldi 9,10,2 + subf 6,10,11 + li 8,0 + xxlxor 12,12,12 + cmpd 7,7,11 + mtctr 6 + add 4,4,9 + bgt 7,.L60 + li 3,-1 + rldicr 3,3,0,0 + cmpd 7,11,3 + beq 7,.L60 + .p2align 4,,15 +.L61: + lfs 0,0(4) + addi 4,4,4 + fabs 0,0 + fcmpu 7,0,12 + bng 7,.L63 + fmr 12,0 + mr 8,10 +.L63: + addi 10,10,1 + bdnz .L61 + b .L67 + .p2align 4,,15 +.L70: + addis 6,2,.LC2@toc@ha + addis 7,2,.LC3@toc@ha + addis 8,2,.LC4@toc@ha + addis 9,2,.LC5@toc@ha + xxspltib 46,0 + stxv 61,-48(1) + stxv 62,-32(1) + addi 6,6,.LC2@toc@l + addi 7,7,.LC3@toc@l + stxv 63,-16(1) + xxspltib 61,32 + xxspltib 63,16 + xxspltib 62,64 + addi 8,8,.LC4@toc@l + addi 9,9,.LC5@toc@l + lxv 47,0(6) + xxspltib 34,0 + lxv 48,0(7) + xxlor 51,46,46 + lxv 49,0(8) + lxv 50,0(9) + li 8,0 + mr 9,4 + vextsb2w 29,29 + vextsb2w 31,31 + vextsb2w 30,30 + stxv 59,-80(1) + stxv 60,-64(1) + .p2align 4,,15 +.L5: + lxv 0,0(9) + vadduwm 27,19,29 + lxv 12,240(9) + addi 8,8,64 + addi 9,9,256 + cmpd 7,10,8 + xvabssp 44,0 + lxv 0,-240(9) + xvabssp 12,12 + xvabssp 5,0 + lxv 0,-224(9) + xvabssp 32,0 + lxv 0,-208(9) + xvcmpgtsp 35,5,44 + xvabssp 9,0 + lxv 0,-192(9) + xxsel 5,44,5,35 + xxsel 35,47,48,35 + xvabssp 1,0 + lxv 0,-176(9) + xvcmpgtsp 60,9,32 + xvabssp 6,0 + lxv 0,-160(9) + xxsel 9,32,9,60 + xxsel 60,49,50,60 + xvabssp 13,0 + lxv 0,-144(9) + xvcmpgtsp 42,9,5 + xvcmpgtsp 37,6,1 + xvabssp 11,0 + lxv 0,-128(9) + xxsel 9,5,9,42 + xxsel 42,35,60,42 + xxsel 6,1,6,37 + xxsel 37,47,48,37 + xvabssp 2,0 + lxv 0,-112(9) + xvcmpgtsp 36,11,13 + xvabssp 7,0 + lxv 0,-96(9) + xxsel 11,13,11,36 + xxsel 36,49,50,36 + xvabssp 3,0 + lxv 0,-80(9) + xvcmpgtsp 45,11,6 + xvcmpgtsp 39,7,2 + xvabssp 10,0 + lxv 0,-64(9) + xxsel 7,2,7,39 + xxsel 39,47,48,39 + xvabssp 4,0 + lxv 0,-48(9) + xvcmpgtsp 38,10,3 + xvabssp 8,0 + lxv 0,-32(9) + xxsel 10,3,10,38 + xxsel 38,49,50,38 + xvabssp 0,0 + xvcmpgtsp 43,10,7 + xvcmpgtsp 41,8,4 + xvcmpgtsp 40,12,0 + xxsel 8,4,8,41 + xxsel 41,47,48,41 + xxsel 0,0,12,40 + xxsel 12,6,11,45 + xxsel 
11,7,10,43 + xxsel 45,37,36,45 + xvcmpgtsp 33,0,8 + xvcmpgtsp 32,12,9 + vadduwm 13,13,31 + xxsel 40,49,50,40 + xxsel 43,39,38,43 + xxsel 0,8,0,33 + xxsel 12,9,12,32 + xxsel 33,41,40,33 + xxsel 32,42,45,32 + xvcmpgtsp 44,0,11 + vadduwm 1,1,31 + vadduwm 0,19,0 + vadduwm 19,19,30 + xxsel 0,11,0,44 + xxsel 33,43,33,44 + xvcmpgtsp 45,0,12 + vadduwm 1,27,1 + xxsel 0,12,0,45 + xxsel 32,32,33,45 + xvcmpgtsp 33,0,34 + xxsel 46,46,32,33 + xxsel 34,34,0,33 + bgt 7,.L5 + xxsldwi 12,34,34,3 + xxsldwi 11,34,34,2 + li 9,0 + li 8,12 + xxsldwi 0,34,34,1 + xscvspdp 34,34 + vextuwrx 3,9,14 + li 9,4 + xscvspdp 12,12 + xscvspdp 11,11 + xscvspdp 0,0 + vextuwrx 6,9,14 + li 9,8 + vextuwrx 7,9,14 + vextuwrx 9,8,14 + rldicl 12,6,0,32 + rldicl 8,3,0,32 + rldicl 0,7,0,32 + rldicl 5,9,0,32 + fcmpu 7,12,11 + fmr 10,0 + beq 7,.L71 + bnl 7,.L8 + mr 8,12 + fmr 12,11 +.L8: + xscmpudp 7,0,34 + bne 7,.L11 + cmplw 7,7,9 + ble 7,.L12 + mr 7,9 +.L12: + rldicl 5,7,0,32 +.L13: + fcmpu 7,12,10 + beq 7,.L72 + bnl 7,.L17 + mr 8,5 + fmr 12,10 +.L17: + cmpd 7,11,10 + ble 7,.L16 + addi 7,10,1 + sldi 9,10,2 + subf 6,10,11 + cmpd 7,7,11 + mtctr 6 + add 4,4,9 + bgt 7,.L53 + li 3,-1 + rldicr 3,3,0,0 + cmpd 7,11,3 + beq 7,.L53 + .p2align 4,,15 +.L21: + lfs 0,0(4) + addi 4,4,4 + fabs 0,0 + fcmpu 7,0,12 + bng 7,.L19 + fmr 12,0 + mr 8,10 +.L19: + addi 10,10,1 + bdnz .L21 +.L16: + lxv 59,-80(1) + lxv 60,-64(1) + addi 3,8,1 + lxv 61,-48(1) + lxv 62,-32(1) + lxv 63,-16(1) + blr + .p2align 4,,15 +.L71: + cmplw 7,3,6 + ble 7,.L7 + mr 3,6 +.L7: + rldicl 8,3,0,32 + b .L8 + .p2align 4,,15 +.L40: + sldi 6,5,2 + li 8,0 + li 9,0 + xxlxor 0,0,0 + b .L22 + .p2align 4,,15 +.L11: + blt 7,.L39 + mr 5,0 + b .L13 + .p2align 4,,15 +.L72: + cmpd 7,8,5 + ble 7,.L17 + mr 8,5 + b .L17 + .p2align 4,,15 +.L39: + xscpsgndp 10,34,34 + b .L13 +.L53: + li 9,1 + mtctr 9 + b .L21 +.L54: + li 10,1 + mtctr 10 + b .L35 +.L60: + li 9,1 + mtctr 9 + b .L61 + .long 0 + .byte 0,0,0,0,0,0,0,0 + .size isamax_k,.-isamax_k + .section 
.rodata.cst16,"aM",@progbits,16 + .align 4 +.LC2: + .long 0 + .long 1 + .long 2 + .long 3 +.LC3: + .long 4 + .long 5 + .long 6 + .long 7 +.LC4: + .long 8 + .long 9 + .long 10 + .long 11 +.LC5: + .long 12 + .long 13 + .long 14 + .long 15 + .ident "GCC: (SUSE Linux) 7.3.1 20180323 [gcc-7-branch revision 258812]" + .section .note.GNU-stack,"",@progbits diff --git a/kernel/power/isamin_power8.S b/kernel/power/isamin_power8.S new file mode 100644 index 000000000..3873e879b --- /dev/null +++ b/kernel/power/isamin_power8.S @@ -0,0 +1,417 @@ +/* .file "isamin.c" + .abiversion 2 + .section ".text" + .align 2 + .p2align 4,,15 + .globl isamin_k + .type isamin_k, @function +*/ +#define ASSEMBLER +#include "common.h" + + PROLOGUE + +isamin_k: +.LCF0: +0: addis 2,12,.TOC.-.LCF0@ha + addi 2,2,.TOC.-.LCF0@l + .localentry isamin_k,.-isamin_k + mr. 11,3 + ble 0,.L36 + cmpdi 7,5,0 + li 3,0 + blelr 7 + lfs 0,0(4) + li 0,-48 + cmpdi 7,5,1 + stvx 30,1,0 + li 0,-32 + stvx 31,1,0 + fabs 0,0 + beq 7,.L62 + rldicr. 
6,11,0,61 + beq 0,.L40 + sldi 0,5,1 + sldi 12,5,2 + std 31,-8(1) + add 0,0,5 + neg 31,5 + sldi 3,5,4 + sldi 0,0,2 + add 7,4,12 + sldi 31,31,2 + sldi 5,5,3 + li 9,0 + li 10,0 + b .L24 + .p2align 4,,15 +.L41: + mr 10,9 +.L25: + fmr 0,12 + add 7,7,3 +.L24: + lfs 12,0(7) + fabs 12,12 + fcmpu 7,12,0 + bnl 7,.L26 + fmr 0,12 + addi 10,9,1 +.L26: + add 8,31,7 + lfsx 12,8,5 + fabs 12,12 + fcmpu 7,12,0 + bnl 7,.L28 + fmr 0,12 + addi 10,9,2 +.L28: + lfsx 12,8,0 + fabs 12,12 + fcmpu 7,12,0 + bnl 7,.L30 + fmr 0,12 + addi 10,9,3 +.L30: + addi 9,9,4 + cmpd 7,6,9 + ble 7,.L63 + lfsx 12,8,3 + fabs 12,12 + fcmpu 7,12,0 + blt 7,.L41 + fmr 12,0 + b .L25 + .p2align 4,,15 +.L36: + li 3,0 + blr + .p2align 4,,15 +.L63: + addi 6,6,-1 + ld 31,-8(1) + srdi 6,6,2 + addi 6,6,1 + sldi 9,6,2 + mulld 6,12,6 + cmpd 7,11,9 + ble 7,.L33 +.L23: + addi 8,9,1 + sldi 6,6,2 + cmpd 7,8,11 + subf 8,9,11 + mtctr 8 + add 4,4,6 + bgt 7,.L52 + li 3,-1 + rldicr 3,3,0,0 + cmpd 7,11,3 + beq 7,.L52 + .p2align 4,,15 +.L35: + lfs 12,0(4) + add 4,4,12 + fabs 12,12 + fcmpu 7,12,0 + bnl 7,.L34 + fmr 0,12 + mr 10,9 +.L34: + addi 9,9,1 + bdnz .L35 +.L33: + li 0,-48 + addi 3,10,1 + lvx 30,1,0 + li 0,-32 + lvx 31,1,0 + blr + .p2align 4,,15 +.L62: + rldicr. 
8,11,0,57 + li 10,0 + bne 0,.L64 +.L4: + addi 7,8,1 + sldi 9,8,2 + cmpd 7,7,11 + add 4,4,9 + subf 9,8,11 + mtctr 9 + bgt 7,.L51 + li 3,-1 + rldicr 3,3,0,0 + cmpd 7,11,3 + beq 7,.L51 + .p2align 4,,15 +.L22: + lfs 12,0(4) + addi 4,4,4 + fabs 12,12 + fcmpu 7,0,12 + bng 7,.L21 + fmr 0,12 + mr 10,8 +.L21: + addi 8,8,1 + bdnz .L22 + li 0,-48 + addi 3,10,1 + lvx 30,1,0 + li 0,-32 + lvx 31,1,0 + blr + .p2align 4,,15 +.L64: + lxvd2x 4,0,4 + addis 10,2,.LC2@toc@ha + addis 5,2,.LC3@toc@ha + std 31,-8(1) + vspltisw 2,0 + addi 10,10,.LC2@toc@l + addis 7,2,.LC4@toc@ha + addis 9,2,.LC5@toc@ha + addis 6,2,.LC6@toc@ha + lxvd2x 51,0,10 + addis 10,2,.LC7@toc@ha + addi 7,7,.LC4@toc@l + addi 9,9,.LC5@toc@l + addi 5,5,.LC3@toc@l + xvabssp 4,4 + addi 6,6,.LC6@toc@l + addi 10,10,.LC7@toc@l + lxvd2x 36,0,7 + vspltisw 18,8 + lxvd2x 37,0,9 + lxvd2x 35,0,5 + mr 9,4 + li 7,0 + lxvd2x 48,0,6 + lxvd2x 49,0,10 + vadduwm 18,18,18 + xxlor 38,51,51 + xxlor 40,4,4 + b .L6 + .p2align 4,,15 +.L65: + lxvd2x 5,0,9 + xvabssp 40,5 +.L6: + addi 5,9,16 + addi 6,9,32 + vadduwm 14,2,16 + addi 10,9,64 + addi 12,9,48 + addi 31,9,80 + addi 3,9,96 + lxvd2x 5,0,5 + lxvd2x 42,0,6 + addi 5,9,112 + addi 6,9,128 + lxvd2x 44,0,10 + lxvd2x 9,0,12 + addi 10,9,160 + addi 12,9,144 + lxvd2x 6,0,31 + lxvd2x 1,0,3 + addi 31,9,176 + addi 3,9,192 + lxvd2x 11,0,5 + lxvd2x 13,0,6 + addi 5,9,208 + addi 6,9,224 + lxvd2x 2,0,10 + lxvd2x 7,0,12 + addi 10,9,240 + lxvd2x 10,0,31 + lxvd2x 3,0,3 + xvabssp 42,42 + xvabssp 5,5 + addi 7,7,64 + lxvd2x 8,0,5 + lxvd2x 0,0,6 + xvabssp 44,44 + xvabssp 9,9 + cmpd 7,8,7 + addi 9,9,256 + lxvd2x 12,0,10 + xvabssp 6,6 + xvabssp 1,1 + xvabssp 11,11 + xvabssp 13,13 + xvabssp 7,7 + xvabssp 2,2 + xvabssp 10,10 + xvabssp 3,3 + xvabssp 8,8 + xvabssp 0,0 + xvabssp 12,12 + xvcmpgtsp 32,40,5 + xvcmpgtsp 62,42,9 + xvcmpgtsp 45,44,6 + xvcmpgtsp 63,1,11 + xvcmpgtsp 39,13,7 + xvcmpgtsp 47,2,10 + xvcmpgtsp 41,3,8 + xvcmpgtsp 33,0,12 + xxsel 5,40,5,32 + xxsel 32,38,35,32 + xxsel 9,42,9,62 + xxsel 6,44,6,45 + xxsel 
11,1,11,63 + xxsel 44,38,35,45 + xxsel 7,13,7,39 + xvcmpgtsp 42,5,9 + xxsel 10,2,10,47 + xvcmpgtsp 45,6,11 + xxsel 8,3,8,41 + xxsel 63,36,37,63 + xxsel 0,0,12,33 + xvcmpgtsp 43,7,10 + xxsel 40,36,37,33 + xxsel 62,36,37,62 + xvcmpgtsp 33,8,0 + xxsel 41,38,35,41 + xxsel 39,38,35,39 + xxsel 47,36,37,47 + xxsel 9,5,9,42 + xxsel 42,32,62,42 + xxsel 12,6,11,45 + xxsel 45,44,63,45 + xxsel 11,7,10,43 + xvcmpgtsp 32,9,12 + vadduwm 13,13,18 + xxsel 43,39,47,43 + xxsel 0,8,0,33 + xxsel 33,41,40,33 + xvcmpgtsp 44,11,0 + vadduwm 1,1,18 + xxsel 12,9,12,32 + xxsel 32,42,45,32 + vadduwm 0,2,0 + vadduwm 2,2,17 + xxsel 0,11,0,44 + xxsel 33,43,33,44 + xvcmpgtsp 45,12,0 + vadduwm 1,14,1 + xxsel 0,12,0,45 + xxsel 32,32,33,45 + xvcmpgtsp 33,4,0 + xxsel 51,51,32,33 + xxsel 4,4,0,33 + bgt 7,.L65 + xxsldwi 0,4,4,1 + xscvspdp 10,4 + vspltw 0,19,0 + xxsldwi 12,4,4,3 + xscvspdp 0,0 + mfvsrwz 3,51 + mfvsrwz 6,32 + vspltw 0,19,3 + xscvspdp 12,12 + xxsldwi 4,4,4,2 + mfvsrwz 7,32 + vspltw 0,19,2 + xscvspdp 4,4 + mfvsrwz 9,32 + fcmpu 7,0,10 + rldicl 10,3,0,32 + rldicl 31,6,0,32 + fmr 11,12 + rldicl 5,7,0,32 + rldicl 0,9,0,32 + beq 7,.L66 + bng 7,.L9 + fmr 0,10 + mr 10,31 +.L9: + fcmpu 7,12,4 + bne 7,.L12 + cmplw 7,7,9 + ble 7,.L13 + mr 7,9 +.L13: + rldicl 5,7,0,32 +.L14: + fcmpu 7,0,11 + beq 7,.L67 + bng 7,.L19 + fmr 0,11 + mr 10,5 +.L19: + cmpd 7,11,8 + ld 31,-8(1) + bgt 7,.L4 + b .L33 + .p2align 4,,15 +.L66: + cmplw 7,3,6 + ble 7,.L8 + mr 3,6 +.L8: + rldicl 10,3,0,32 + b .L9 + .p2align 4,,15 +.L40: + sldi 12,5,2 + li 10,0 + li 9,0 + b .L23 + .p2align 4,,15 +.L12: + bng 7,.L14 + fmr 11,4 + mr 5,0 + b .L14 + .p2align 4,,15 +.L67: + cmpd 7,10,5 + ble 7,.L19 + mr 10,5 + b .L19 +.L51: + li 9,1 + mtctr 9 + b .L22 +.L52: + li 8,1 + mtctr 8 + b .L35 + .long 0 + .byte 0,0,0,0,0,1,0,0 + .size isamin_k,.-isamin_k + .section .rodata.cst16,"aM",@progbits,16 + .align 4 +.LC2: + .long 0 + .long 1 + .long 2 + .long 3 +.LC3: + .long 4 + .long 5 + .long 6 + .long 7 +.LC4: + .long 8 + .long 9 + .long 10 + .long 11 
+.LC5: + .long 12 + .long 13 + .long 14 + .long 15 +.LC6: + .long 32 + .long 32 + .long 32 + .long 32 +.LC7: + .long 64 + .long 64 + .long 64 + .long 64 + .ident "GCC: (SUSE Linux) 7.3.1 20180323 [gcc-7-branch revision 258812]" + .section .note.GNU-stack,"",@progbits diff --git a/kernel/power/isamin_power9.S b/kernel/power/isamin_power9.S new file mode 100644 index 000000000..0475edf46 --- /dev/null +++ b/kernel/power/isamin_power9.S @@ -0,0 +1,382 @@ + .file "isamin.c" + .abiversion 2 + .section ".text" + .align 2 + .p2align 4,,15 + .globl isamin_k + .type isamin_k, @function +isamin_k: +.LCF0: +0: addis 2,12,.TOC.-.LCF0@ha + addi 2,2,.TOC.-.LCF0@l + .localentry isamin_k,.-isamin_k + mr. 11,3 + ble 0,.L36 + cmpdi 7,5,0 + li 3,0 + blelr 7 + lfs 0,0(4) + cmpdi 7,5,1 + stxv 61,-64(1) + stxv 62,-48(1) + stxv 63,-32(1) + fabs 0,0 + beq 7,.L62 + rldicr. 6,11,0,61 + beq 0,.L40 + sldi 8,5,1 + sldi 0,5,2 + neg 12,5 + std 31,-8(1) + sldi 3,5,4 + sldi 31,5,3 + li 9,0 + li 10,0 + add 5,8,5 + add 7,4,0 + sldi 12,12,2 + sldi 5,5,2 + b .L24 + .p2align 4,,15 +.L41: + mr 10,9 +.L25: + add 7,7,3 + fmr 0,12 +.L24: + lfs 12,0(7) + fabs 12,12 + fcmpu 7,12,0 + bnl 7,.L26 + fmr 0,12 + addi 10,9,1 +.L26: + add 8,7,12 + lfsx 12,8,31 + fabs 12,12 + fcmpu 7,12,0 + bnl 7,.L28 + fmr 0,12 + addi 10,9,2 +.L28: + lfsx 12,8,5 + fabs 12,12 + fcmpu 7,12,0 + bnl 7,.L30 + fmr 0,12 + addi 10,9,3 +.L30: + addi 9,9,4 + cmpd 7,6,9 + ble 7,.L63 + lfsx 12,8,3 + fabs 12,12 + fcmpu 7,12,0 + blt 7,.L41 + fmr 12,0 + b .L25 + .p2align 4,,15 +.L36: + li 3,0 + blr + .p2align 4,,15 +.L63: + addi 6,6,-1 + ld 31,-8(1) + srdi 6,6,2 + addi 6,6,1 + sldi 9,6,2 + mulld 6,0,6 + cmpd 7,11,9 + ble 7,.L33 +.L23: + addi 8,9,1 + sldi 6,6,2 + subf 7,9,11 + cmpd 7,8,11 + mtctr 7 + add 4,4,6 + bgt 7,.L52 + li 3,-1 + rldicr 3,3,0,0 + cmpd 7,11,3 + beq 7,.L52 + .p2align 4,,15 +.L35: + lfs 12,0(4) + add 4,4,0 + fabs 12,12 + fcmpu 7,12,0 + bnl 7,.L34 + fmr 0,12 + mr 10,9 +.L34: + addi 9,9,1 + bdnz .L35 +.L33: + lxv 61,-64(1) + lxv 
62,-48(1) + addi 3,10,1 + lxv 63,-32(1) + blr + .p2align 4,,15 +.L62: + rldicr. 8,11,0,57 + li 10,0 + bne 0,.L64 +.L4: + addi 7,8,1 + sldi 9,8,2 + subf 6,8,11 + cmpd 7,7,11 + mtctr 6 + add 4,4,9 + bgt 7,.L51 + li 3,-1 + rldicr 3,3,0,0 + cmpd 7,11,3 + beq 7,.L51 + .p2align 4,,15 +.L22: + lfs 12,0(4) + addi 4,4,4 + fabs 12,12 + fcmpu 7,0,12 + bng 7,.L21 + fmr 0,12 + mr 10,8 +.L21: + addi 8,8,1 + bdnz .L22 + lxv 61,-64(1) + lxv 62,-48(1) + addi 3,10,1 + lxv 63,-32(1) + blr + .p2align 4,,15 +.L64: + lxv 0,0(4) + xxspltib 47,16 + addis 6,2,.LC2@toc@ha + addis 7,2,.LC3@toc@ha + addis 10,2,.LC4@toc@ha + addis 9,2,.LC5@toc@ha + xxspltib 63,32 + xxspltib 46,64 + addi 6,6,.LC2@toc@l + addi 10,10,.LC4@toc@l + addi 7,7,.LC3@toc@l + std 31,-8(1) + addi 9,9,.LC5@toc@l + xxspltib 50,0 + vextsb2w 15,15 + lxv 48,0(6) + lxv 51,0(10) + vextsb2w 31,31 + vextsb2w 14,14 + xvabssp 4,0 + lxv 34,0(9) + lxv 49,0(7) + mr 9,4 + li 10,0 + xxlor 35,48,48 + xxlor 40,4,4 + b .L6 + .p2align 4,,15 +.L65: + lxv 0,0(9) + xvabssp 40,0 +.L6: + lxv 0,16(9) + vadduwm 29,18,31 + lxv 12,240(9) + addi 10,10,64 + addi 9,9,256 + cmpd 7,8,10 + xvabssp 5,0 + lxv 0,-224(9) + xvabssp 12,12 + xvabssp 32,0 + lxv 0,-208(9) + xvcmpgtsp 42,40,5 + xvabssp 9,0 + lxv 0,-192(9) + xxsel 5,40,5,42 + xvabssp 44,0 + lxv 0,-176(9) + xvcmpgtsp 62,32,9 + xvabssp 6,0 + lxv 0,-160(9) + xxsel 9,32,9,62 + xxsel 32,35,49,42 + xvabssp 1,0 + lxv 0,-144(9) + xxsel 62,51,34,62 + xvcmpgtsp 42,5,9 + xvcmpgtsp 37,44,6 + xvabssp 11,0 + lxv 0,-128(9) + xxsel 9,5,9,42 + xxsel 42,32,62,42 + xxsel 6,44,6,37 + xxsel 37,35,49,37 + xvabssp 13,0 + lxv 0,-112(9) + xvcmpgtsp 36,1,11 + xvabssp 7,0 + lxv 0,-96(9) + xxsel 11,1,11,36 + xxsel 36,51,34,36 + xvabssp 2,0 + lxv 0,-80(9) + xvcmpgtsp 45,6,11 + xvcmpgtsp 39,13,7 + xvabssp 10,0 + lxv 0,-64(9) + xxsel 7,13,7,39 + xxsel 39,35,49,39 + xvabssp 3,0 + lxv 0,-48(9) + xvcmpgtsp 38,2,10 + xvabssp 8,0 + lxv 0,-32(9) + xxsel 10,2,10,38 + xxsel 38,51,34,38 + xvabssp 0,0 + xvcmpgtsp 43,7,10 + xvcmpgtsp 41,3,8 
+ xvcmpgtsp 33,0,12 + xxsel 8,3,8,41 + xxsel 41,35,49,41 + xxsel 0,0,12,33 + xxsel 40,51,34,33 + xxsel 12,6,11,45 + xxsel 11,7,10,43 + xvcmpgtsp 33,8,0 + xxsel 45,37,36,45 + xvcmpgtsp 32,9,12 + xxsel 43,39,38,43 + vadduwm 13,13,15 + xxsel 0,8,0,33 + xxsel 33,41,40,33 + xxsel 12,9,12,32 + xxsel 32,42,45,32 + xvcmpgtsp 44,11,0 + vadduwm 1,1,15 + vadduwm 0,18,0 + vadduwm 18,18,14 + xxsel 0,11,0,44 + xxsel 33,43,33,44 + xvcmpgtsp 45,12,0 + vadduwm 1,29,1 + xxsel 0,12,0,45 + xxsel 32,32,33,45 + xvcmpgtsp 33,4,0 + xxsel 48,48,32,33 + xxsel 4,4,0,33 + bgt 7,.L65 + xxsldwi 0,4,4,3 + xxsldwi 11,4,4,2 + li 9,0 + li 10,12 + xxsldwi 12,4,4,1 + xscvspdp 4,4 + vextuwrx 3,9,16 + li 9,4 + xscvspdp 0,0 + xscvspdp 11,11 + xscvspdp 12,12 + vextuwrx 6,9,16 + li 9,8 + vextuwrx 7,9,16 + vextuwrx 9,10,16 + rldicl 31,6,0,32 + rldicl 10,3,0,32 + rldicl 5,7,0,32 + rldicl 0,9,0,32 + fcmpu 7,0,11 + fmr 10,12 + beq 7,.L66 + bng 7,.L9 + mr 10,31 + fmr 0,11 +.L9: + fcmpu 7,12,4 + bne 7,.L12 + cmplw 7,7,9 + ble 7,.L13 + mr 7,9 +.L13: + rldicl 5,7,0,32 +.L14: + fcmpu 7,0,10 + beq 7,.L67 + bng 7,.L19 + mr 10,5 + fmr 0,10 +.L19: + cmpd 7,11,8 + ld 31,-8(1) + bgt 7,.L4 + b .L33 + .p2align 4,,15 +.L66: + cmplw 7,3,6 + ble 7,.L8 + mr 3,6 +.L8: + rldicl 10,3,0,32 + b .L9 + .p2align 4,,15 +.L40: + sldi 0,5,2 + li 10,0 + li 9,0 + b .L23 + .p2align 4,,15 +.L12: + bng 7,.L14 + mr 5,0 + fmr 10,4 + b .L14 + .p2align 4,,15 +.L67: + cmpd 7,10,5 + ble 7,.L19 + mr 10,5 + b .L19 +.L51: + li 9,1 + mtctr 9 + b .L22 +.L52: + li 8,1 + mtctr 8 + b .L35 + .long 0 + .byte 0,0,0,0,0,1,0,0 + .size isamin_k,.-isamin_k + .section .rodata.cst16,"aM",@progbits,16 + .align 4 +.LC2: + .long 0 + .long 1 + .long 2 + .long 3 +.LC3: + .long 4 + .long 5 + .long 6 + .long 7 +.LC4: + .long 8 + .long 9 + .long 10 + .long 11 +.LC5: + .long 12 + .long 13 + .long 14 + .long 15 + .ident "GCC: (SUSE Linux) 7.3.1 20180323 [gcc-7-branch revision 258812]" + .section .note.GNU-stack,"",@progbits diff --git a/param.h b/param.h index 
5fbdbcdcd..0ff59f400 100644 --- a/param.h +++ b/param.h @@ -2636,15 +2636,30 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ZGEMM_DEFAULT_UNROLL_M 4 #define ZGEMM_DEFAULT_UNROLL_N 4 -#define SGEMM_DEFAULT_P 512 -#define DGEMM_DEFAULT_P 256 -#define CGEMM_DEFAULT_P 256 -#define ZGEMM_DEFAULT_P 128 +/* FIXME: this should use the cache size, but there is currently no easy way to +query that on ARM. So if getarch counted more than 8 cores, we simply assume the host +is a big desktop or server with abundant cache rather than a phone or embedded device. An undefined NUM_CORES evaluates as 0 in #if and selects the smaller defaults. */ +#if NUM_CORES > 8 + #define SGEMM_DEFAULT_P 512 + #define DGEMM_DEFAULT_P 256 + #define CGEMM_DEFAULT_P 256 + #define ZGEMM_DEFAULT_P 128 -#define SGEMM_DEFAULT_Q 1024 -#define DGEMM_DEFAULT_Q 512 -#define CGEMM_DEFAULT_Q 512 -#define ZGEMM_DEFAULT_Q 512 + #define SGEMM_DEFAULT_Q 1024 + #define DGEMM_DEFAULT_Q 512 + #define CGEMM_DEFAULT_Q 512 + #define ZGEMM_DEFAULT_Q 512 +#else + #define SGEMM_DEFAULT_P 128 + #define DGEMM_DEFAULT_P 160 + #define CGEMM_DEFAULT_P 128 + #define ZGEMM_DEFAULT_P 128 + + #define SGEMM_DEFAULT_Q 352 + #define DGEMM_DEFAULT_Q 128 + #define CGEMM_DEFAULT_Q 224 + #define ZGEMM_DEFAULT_Q 112 +#endif #define SGEMM_DEFAULT_R 4096 #define DGEMM_DEFAULT_R 4096