diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index 3859a9c19..fd759913d 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -171,3 +171,11 @@ In chronological order:
   * [2019-02-01] added missing Blas Level-1,2 (single precision) simd codes
   * [2019-03-14] power9 dgemm/dtrmm kernel
   * [2019-04-29] power9 sgemm/strmm kernel
+
+* Jiachen Wang
+  * [2019-07-29] optimize AVX2 DGEMM
+  * [2019-10-20] AVX512 DGEMM kernel (4x8)
+  * [2019-11-06] optimize AVX512 SGEMM
+  * [2019-11-12] AVX512 CGEMM & ZGEMM kernels
+  * [2019-12-23] optimize AVX2 CGEMM and ZGEMM
+  * [2019-12-27] AVX2 CGEMM3M kernel
diff --git a/Makefile.arm64 b/Makefile.arm64
index 4d10ff684..c17ea7938 100644
--- a/Makefile.arm64
+++ b/Makefile.arm64
@@ -39,7 +39,10 @@ CCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99
 FCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99
 endif
 
+ifeq ($(GCCVERSIONGTEQ9), 1)
 ifeq ($(CORE), TSV110)
 CCOMMON_OPT += -march=armv8.2-a -mtune=tsv110
 FCOMMON_OPT += -march=armv8.2-a -mtune=tsv110
 endif
+endif
+
diff --git a/Makefile.system b/Makefile.system
index 4cb4dc954..ab2ffca52 100644
--- a/Makefile.system
+++ b/Makefile.system
@@ -326,6 +326,7 @@ ifeq ($(C_COMPILER), GCC)
 GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4)
 GCCVERSIONGT4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 4)
 GCCVERSIONGT5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 5)
+GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9)
 GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7)
 ifeq ($(GCCVERSIONGT4), 1)
 # GCC Major version > 4
@@ -547,9 +548,14 @@ endif
 
 ifeq ($(ARCH), arm64)
 DYNAMIC_CORE = ARMV8
+DYNAMIC_CORE += CORTEXA53
 DYNAMIC_CORE += CORTEXA57
+DYNAMIC_CORE += CORTEXA72
+DYNAMIC_CORE += CORTEXA73
+DYNAMIC_CORE += FALKOR
 DYNAMIC_CORE += THUNDERX
 DYNAMIC_CORE += THUNDERX2T99
+DYNAMIC_CORE += TSV110
 endif
 
 ifeq ($(ARCH), power)
diff --git a/cmake/arch.cmake b/cmake/arch.cmake
index f3ae84fe0..8280d6274 100644
--- a/cmake/arch.cmake
+++ b/cmake/arch.cmake
@@ -45,7 +45,11 @@ endif ()
 
 if (DYNAMIC_ARCH)
   if (ARM64)
-    set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99)
+    set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110)
+  endif ()
+
+  if (POWER)
+    set(DYNAMIC_CORE POWER6 POWER8 POWER9)
   endif ()
 
   if (X86)
diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake
index 086df1943..c6d109356 100644
--- a/cmake/prebuild.cmake
+++ b/cmake/prebuild.cmake
@@ -309,6 +309,83 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS
     set(ZGEMM_UNROLL_M 4)
     set(ZGEMM_UNROLL_N 4)
     set(SYMV_P 16)
+  elseif ("${TCORE}" STREQUAL "TSV110")
+    file(APPEND ${TARGET_CONF_TEMP}
+      "#define ARMV8\n"
+      "#define L1_CODE_SIZE\t65536\n"
+      "#define L1_CODE_LINESIZE\t64\n"
+      "#define L1_CODE_ASSOCIATIVE\t4\n"
+      "#define L1_DATA_SIZE\t65536\n"
+      "#define L1_DATA_LINESIZE\t64\n"
+      "#define L1_DATA_ASSOCIATIVE\t4\n"
+      "#define L2_SIZE\t524288\n"
+      "#define L2_LINESIZE\t64\n"
+      "#define L2_ASSOCIATIVE\t8\n"
+      "#define DTB_DEFAULT_ENTRIES\t64\n"
+      "#define DTB_SIZE\t4096\n")
+    set(SGEMM_UNROLL_M 16)
+    set(SGEMM_UNROLL_N 4)
+    set(DGEMM_UNROLL_M 8)
+    set(DGEMM_UNROLL_N 4)
+    set(CGEMM_UNROLL_M 8)
+    set(CGEMM_UNROLL_N 4)
+    set(ZGEMM_UNROLL_M 4)
+    set(ZGEMM_UNROLL_N 4)
+    set(SYMV_P 16)
+  elseif ("${TCORE}" STREQUAL "POWER6")
+    file(APPEND ${TARGET_CONF_TEMP}
+      "#define L1_DATA_SIZE 32768\n"
+      "#define L1_DATA_LINESIZE 128\n"
+      "#define L2_SIZE 524288\n"
+      "#define L2_LINESIZE 128 \n"
+ "#define DTB_DEFAULT_ENTRIES 128\n" + "#define DTB_SIZE 4096\n" + "#define L2_ASSOCIATIVE 8\n") + set(SGEMM_UNROLL_M 4) + set(SGEMM_UNROLL_N 4) + set(DGEMM_UNROLL_M 4) + set(DGEMM_UNROLL_N 4) + set(CGEMM_UNROLL_M 2) + set(CGEMM_UNROLL_N 4) + set(ZGEMM_UNROLL_M 2) + set(ZGEMM_UNROLL_N 4) + set(SYMV_P 8) + elseif ("${TCORE}" STREQUAL "POWER8") + file(APPEND ${TARGET_CONF_TEMP} + "#define L1_DATA_SIZE 32768\n" + "#define L1_DATA_LINESIZE 128\n" + "#define L2_SIZE 524288\n" + "#define L2_LINESIZE 128 \n" + "#define DTB_DEFAULT_ENTRIES 128\n" + "#define DTB_SIZE 4096\n" + "#define L2_ASSOCIATIVE 8\n") + set(SGEMM_UNROLL_M 16) + set(SGEMM_UNROLL_N 8) + set(DGEMM_UNROLL_M 16) + set(DGEMM_UNROLL_N 4) + set(CGEMM_UNROLL_M 8) + set(CGEMM_UNROLL_N 4) + set(ZGEMM_UNROLL_M 8) + set(ZGEMM_UNROLL_N 2) + set(SYMV_P 8) + elseif ("${TCORE}" STREQUAL "POWER9") + file(APPEND ${TARGET_CONF_TEMP} + "#define L1_DATA_SIZE 32768\n" + "#define L1_DATA_LINESIZE 128\n" + "#define L2_SIZE 524288\n" + "#define L2_LINESIZE 128 \n" + "#define DTB_DEFAULT_ENTRIES 128\n" + "#define DTB_SIZE 4096\n" + "#define L2_ASSOCIATIVE 8\n") + set(SGEMM_UNROLL_M 16) + set(SGEMM_UNROLL_N 8) + set(DGEMM_UNROLL_M 16) + set(DGEMM_UNROLL_N 4) + set(CGEMM_UNROLL_M 8) + set(CGEMM_UNROLL_N 4) + set(ZGEMM_UNROLL_M 8) + set(ZGEMM_UNROLL_N 2) + set(SYMV_P 8) endif() # Or should this actually be NUM_CORES? diff --git a/common_power.h b/common_power.h index bcfc209a9..e7caf9adf 100644 --- a/common_power.h +++ b/common_power.h @@ -39,6 +39,35 @@ #ifndef COMMON_POWER #define COMMON_POWER +#define str(x) #x + +#ifdef OS_AIX +#define XXSPLTD(T,A,z) xxpermdi T, A, A, 0b##z##z +#define XXMRGHD(T,A,B) xxpermdi T, A, B, 0b00 +#define XXMRGLD(T,A,B) xxpermdi T, A, B, 0b11 +#define XXSWAPD(T,A) xxpermdi T, A, A, 0b10 +#define XVMOVDP(T,A) xvcpsgndp T, A, A + +#define XXSPLTD_S(T,A,z) "xxpermdi " str(T) ", " str(A) ", " str(A) ", 0b" str(z ## z) " \n\t" +#define XXMRGHD_S(T,A,B) "xxpermdi " str(T) ", " str(A) ", " str(B) ", 0b00 \n\t" +#define XXMRGLD_S(T,A,B) "xxpermdi " str(T) ", " str(A) ", " str(B) ", 0b11 \n\t" +#define XXSWAPD_S(T,A) "xxpermdi " str(T) ", " str(A) ", " str(A) ", 0b10 \n\t" + +#else +#define XXSPLTD(T,A,z) xxspltd T, A, z +#define XXMRGHD(T,A,B) xxmrghd T, A, B +#define XXMRGLD(T,A,B) xxmrgld T, A, B +#define XXSWAPD(T,A) xxswapd T, A +#define XVMOVDP(T,A) xvmovdp T, A + +#define XXSPLTD_S(T,A,z) "xxspltd " str(T) ", " str(A) ", " str(z)" \n\t" +#define XXMRGHD_S(T,A,B) "xxmrghd " str(T) ", " str(A) ", " str(B)" \n\t" +#define XXMRGLD_S(T,A,B) "xxmrgld " str(T) ", " str(A) ", " str(B)" \n\t" +#define XXSWAPD_S(T,A) "xxswapd " str(T) ", " str(A) " \n\t" + +#endif + + #if defined(POWER8) || defined(POWER9) #define MB __asm__ __volatile__ ("eieio":::"memory") #define WMB __asm__ __volatile__ ("eieio":::"memory") diff --git a/driver/level3/gemm3m_level3.c b/driver/level3/gemm3m_level3.c index bbde7e5d1..d037e72cd 100644 --- a/driver/level3/gemm3m_level3.c +++ b/driver/level3/gemm3m_level3.c @@ -338,7 +338,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; - if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N; + if (min_jj > GEMM3M_UNROLL_N*3) min_jj = GEMM3M_UNROLL_N*3; START_RPCC(); @@ -398,7 +398,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; - if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N; + if (min_jj > GEMM3M_UNROLL_N*3) min_jj = 
GEMM3M_UNROLL_N*3; START_RPCC(); @@ -463,7 +463,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; - if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N; + if (min_jj > GEMM3M_UNROLL_N*3) min_jj = GEMM3M_UNROLL_N*3; START_RPCC(); diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c index e27725baf..5ecc4428b 100644 --- a/driver/others/blas_server_win32.c +++ b/driver/others/blas_server_win32.c @@ -462,7 +462,7 @@ int BLASFUNC(blas_thread_shutdown)(void){ for(i = 0; i < blas_num_threads - 1; i++){ // Could also just use WaitForMultipleObjects - DWORD wait_thread_value = WaitForSingleObject(blas_threads[i], 5000); + DWORD wait_thread_value = WaitForSingleObject(blas_threads[i], 50); #ifndef OS_WINDOWSSTORE // TerminateThread is only available with WINAPI_DESKTOP and WINAPI_SYSTEM not WINAPI_APP in UWP diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index a4ff0e086..2e87e186a 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -586,6 +586,8 @@ static gotoblas_t *get_coretype(void){ } return NULL; case 7: + if (model == 10) // Goldmont Plus + return &gotoblas_NEHALEM; if (model == 14) { // Ice Lake if (support_avx512()) diff --git a/driver/others/dynamic_arm64.c b/driver/others/dynamic_arm64.c index 9db9ba17d..72f5fcca2 100644 --- a/driver/others/dynamic_arm64.c +++ b/driver/others/dynamic_arm64.c @@ -43,13 +43,18 @@ #endif extern gotoblas_t gotoblas_ARMV8; +extern gotoblas_t gotoblas_CORTEXA53; extern gotoblas_t gotoblas_CORTEXA57; +extern gotoblas_t gotoblas_CORTEXA72; +extern gotoblas_t gotoblas_CORTEXA73; +extern gotoblas_t gotoblas_FALKOR; extern gotoblas_t gotoblas_THUNDERX; extern gotoblas_t gotoblas_THUNDERX2T99; +extern gotoblas_t gotoblas_TSV110; extern void openblas_warning(int verbose, const char * msg); -#define NUM_CORETYPES 4 +#define NUM_CORETYPES 9 /* * In case asm/hwcap.h is outdated on the build system, make sure @@ -65,17 +70,27 @@ extern void openblas_warning(int verbose, const char * msg); static char *corename[] = { "armv8", + "cortexa53", "cortexa57", + "cortexa72", + "cortexa73", + "falkor", "thunderx", "thunderx2t99", + "tsv110", "unknown" }; char *gotoblas_corename(void) { if (gotoblas == &gotoblas_ARMV8) return corename[ 0]; - if (gotoblas == &gotoblas_CORTEXA57) return corename[ 1]; - if (gotoblas == &gotoblas_THUNDERX) return corename[ 2]; - if (gotoblas == &gotoblas_THUNDERX2T99) return corename[ 3]; + if (gotoblas == &gotoblas_CORTEXA53) return corename[ 1]; + if (gotoblas == &gotoblas_CORTEXA57) return corename[ 2]; + if (gotoblas == &gotoblas_CORTEXA72) return corename[ 3]; + if (gotoblas == &gotoblas_CORTEXA73) return corename[ 4]; + if (gotoblas == &gotoblas_FALKOR) return corename[ 5]; + if (gotoblas == &gotoblas_THUNDERX) return corename[ 6]; + if (gotoblas == &gotoblas_THUNDERX2T99) return corename[ 7]; + if (gotoblas == &gotoblas_TSV110) return corename[ 8]; return corename[NUM_CORETYPES]; } @@ -96,9 +111,14 @@ static gotoblas_t *force_coretype(char *coretype) { switch (found) { case 0: return (&gotoblas_ARMV8); - case 1: return (&gotoblas_CORTEXA57); - case 2: return (&gotoblas_THUNDERX); - case 3: return (&gotoblas_THUNDERX2T99); + case 1: return (&gotoblas_CORTEXA53); + case 2: return (&gotoblas_CORTEXA57); + case 3: return (&gotoblas_CORTEXA72); + case 4: return (&gotoblas_CORTEXA73); + case 5: return (&gotoblas_FALKOR); + case 6: return (&gotoblas_THUNDERX); + case 7: return 
(&gotoblas_THUNDERX2T99); + case 8: return (&gotoblas_TSV110); } snprintf(message, 128, "Core not found: %s\n", coretype); openblas_warning(1, message); @@ -136,10 +156,14 @@ static gotoblas_t *get_coretype(void) { case 0x41: // ARM switch (part) { - case 0xd07: // Cortex A57 - case 0xd08: // Cortex A72 case 0xd03: // Cortex A53 + return &gotoblas_CORTEXA53; + case 0xd07: // Cortex A57 return &gotoblas_CORTEXA57; + case 0xd08: // Cortex A72 + return &gotoblas_CORTEXA72; + case 0xd09: // Cortex A73 + return &gotoblas_CORTEXA73; } break; case 0x42: // Broadcom @@ -158,6 +182,20 @@ static gotoblas_t *get_coretype(void) { return &gotoblas_THUNDERX2T99; } break; + case 0x48: // HiSilicon + switch (part) + { + case 0xd01: // tsv110 + return &gotoblas_TSV110; + } + break; + case 0x51: // Qualcomm + switch (part) + { + case 0xc00: // Falkor + return &gotoblas_FALKOR; + } + break; } return NULL; } diff --git a/dynamic.c b/dynamic.c deleted file mode 100644 index aa2b87621..000000000 --- a/dynamic.c +++ /dev/null @@ -1,897 +0,0 @@ -/*********************************************************************/ -/* Copyright 2009, 2010 The University of Texas at Austin. */ -/* All rights reserved. */ -/* */ -/* Redistribution and use in source and binary forms, with or */ -/* without modification, are permitted provided that the following */ -/* conditions are met: */ -/* */ -/* 1. Redistributions of source code must retain the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer. */ -/* */ -/* 2. Redistributions in binary form must reproduce the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer in the documentation and/or other materials */ -/* provided with the distribution. */ -/* */ -/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -/* POSSIBILITY OF SUCH DAMAGE. */ -/* */ -/* The views and conclusions contained in the software and */ -/* documentation are those of the authors and should not be */ -/* interpreted as representing official policies, either expressed */ -/* or implied, of The University of Texas at Austin. 
*/ -/*********************************************************************/ - -#include "common.h" - -#ifdef _MSC_VER -#define strncasecmp _strnicmp -#define strcasecmp _stricmp -#endif - -#ifdef ARCH_X86 -#define EXTERN extern -#else -#define EXTERN -#endif - -#ifdef DYNAMIC_LIST -extern gotoblas_t gotoblas_PRESCOTT; - -#ifdef DYN_ATHLON -extern gotoblas_t gotoblas_ATHLON; -#else -#define gotoblas_ATHLON gotoblas_PRESCOTT -#endif -#ifdef DYN_KATMAI -extern gotoblas_t gotoblas_KATMAI; -#else -#define gotoblas_KATMAI gotoblas_PRESCOTT -#endif -#ifdef DYN_BANIAS -extern gotoblas_t gotoblas_BANIAS; -#else -#define gotoblas_BANIAS gotoblas_PRESCOTT -#endif -#ifdef DYN_COPPERMINE -extern gotoblas_t gotoblas_COPPERMINE; -#else -#define gotoblas_COPPERMINE gotoblas_PRESCOTT -#endif -#ifdef DYN_NORTHWOOD -extern gotoblas_t gotoblas_NORTHWOOD; -#else -#define gotoblas_NORTHWOOD gotoblas_PRESCOTT -#endif -#ifdef DYN_CORE2 -extern gotoblas_t gotoblas_CORE2; -#else -#define gotoblas_CORE2 gotoblas_PRESCOTT -#endif -#ifdef DYN_NEHALEM -extern gotoblas_t gotoblas_NEHALEM; -#else -#define gotoblas_NEHALEM gotoblas_PRESCOTT -#endif -#ifdef DYN_BARCELONA -extern gotoblas_t gotoblas_BARCELONA; -#elif defined(DYN_NEHALEM) -#define gotoblas_BARCELONA gotoblas_NEHALEM -#else -#define gotoblas_BARCELONA gotoblas_PRESCOTT -#endif -#ifdef DYN_ATOM -extern gotoblas_t gotoblas_ATOM; -elif defined(DYN_NEHALEM) -#define gotoblas_ATOM gotoblas_NEHALEM -#else -#define gotoblas_ATOM gotoblas_PRESCOTT -#endif -#ifdef DYN_NANO -extern gotoblas_t gotoblas_NANO; -#else -#define gotoblas_NANO gotoblas_PRESCOTT -#endif -#ifdef DYN_PENRYN -extern gotoblas_t gotoblas_PENRYN; -#else -#define gotoblas_PENRYN gotoblas_PRESCOTT -#endif -#ifdef DYN_DUNNINGTON -extern gotoblas_t gotoblas_DUNNINGTON; -#else -#define gotoblas_DUNNINGTON gotoblas_PRESCOTT -#endif -#ifdef DYN_OPTERON -extern gotoblas_t gotoblas_OPTERON; -#else -#define gotoblas_OPTERON gotoblas_PRESCOTT -#endif -#ifdef DYN_OPTERON_SSE3 -extern gotoblas_t gotoblas_OPTERON_SSE3; -#else -#define gotoblas_OPTERON_SSE3 gotoblas_PRESCOTT -#endif -#ifdef DYN_BOBCAT -extern gotoblas_t gotoblas_BOBCAT; -#elif defined(DYN_NEHALEM) -#define gotoblas_BOBCAT gotoblas_NEHALEM -#else -#define gotoblas_BOBCAT gotoblas_PRESCOTT -#endif -#ifdef DYN_SANDYBRIDGE -extern gotoblas_t gotoblas_SANDYBRIDGE; -#elif defined(DYN_NEHALEM) -#define gotoblas_SANDYBRIDGE gotoblas_NEHALEM -#else -#define gotoblas_SANDYBRIDGE gotoblas_PRESCOTT -#endif -#ifdef DYN_BULLDOZER -extern gotoblas_t gotoblas_BULLDOZER; -#elif defined(DYN_SANDYBRIDGE) -#define gotoblas_BULLDOZER gotoblas_SANDYBRIDGE -#elif defined(DYN_NEHALEM) -#define gotoblas_BULLDOZER gotoblas_NEHALEM -#else -#define gotoblas_BULLDOZER gotoblas_PRESCOTT -#endif -#ifdef DYN_PILEDRIVER -extern gotoblas_t gotoblas_PILEDRIVER; -#elif defined(DYN_SANDYBRIDGE) -#define gotoblas_PILEDRIVER gotoblas_SANDYBRIDGE -#elif defined(DYN_NEHALEM) -#define gotoblas_PILEDRIVER gotoblas_NEHALEM -#else -#define gotoblas_PILEDRIVER gotoblas_PRESCOTT -#endif -#ifdef DYN_STEAMROLLER -extern gotoblas_t gotoblas_STEAMROLLER; -#elif defined(DYN_SANDYBRIDGE) -#define gotoblas_STEAMROLLER gotoblas_SANDYBRIDGE -#elif defined(DYN_NEHALEM) -#define gotoblas_STEAMROLLER gotoblas_NEHALEM -#else -#define gotoblas_STEAMROLLER gotoblas_PRESCOTT -#endif -#ifdef DYN_EXCAVATOR -extern gotoblas_t gotoblas_EXCAVATOR; -#elif defined(DYN_SANDYBRIDGE) -#define gotoblas_EXCAVATOR gotoblas_SANDYBRIDGE -#elif defined(DYN_NEHALEM) -#define gotoblas_EXCAVATOR gotoblas_NEHALEM -#else -#define 
gotoblas_EXCAVATOR gotoblas_PRESCOTT -#endif -#ifdef DYN_HASWELL -extern gotoblas_t gotoblas_HASWELL; -#elif defined(DYN_SANDYBRIDGE) -#define gotoblas_HASWELL gotoblas_SANDYBRIDGE -#elif defined(DYN_NEHALEM) -#define gotoblas_HASWELL gotoblas_NEHALEM -#else -#define gotoblas_HASWELL gotoblas_PRESCOTT -#endif -#ifdef DYN_ZEN -extern gotoblas_t gotoblas_ZEN; -#elif defined(DYN_HASWELL) -#define gotoblas_ZEN gotoblas_HASWELL -#elif defined(DYN_SANDYBRIDGE) -#define gotoblas_ZEN gotoblas_SANDYBRIDGE -#elif defined(DYN_NEHALEM) -#define gotoblas_ZEN gotoblas_NEHALEM -#else -#define gotoblas_ZEN gotoblas_PRESCOTT -#endif -#ifdef DYN_SKYLAKEX -extern gotoblas_t gotoblas_SKYLAKEX; -#elif defined(DYN_HASWELL) -#define gotoblas_SKYLAKEX gotoblas_HASWELL -#elif defined(DYN_SANDYBRIDGE) -#define gotoblas_SKYLAKEX gotoblas_SANDYBRIDGE -#elif defined(DYN_NEHALEM) -#define gotoblas_SKYLAKEX gotoblas_NEHALEM -#else -#define gotoblas_SKYLAKEX gotoblas_PRESCOTT -#endif - - -#else // not DYNAMIC_LIST -EXTERN gotoblas_t gotoblas_KATMAI; -EXTERN gotoblas_t gotoblas_COPPERMINE; -EXTERN gotoblas_t gotoblas_NORTHWOOD; -EXTERN gotoblas_t gotoblas_BANIAS; -EXTERN gotoblas_t gotoblas_ATHLON; - -extern gotoblas_t gotoblas_PRESCOTT; -extern gotoblas_t gotoblas_CORE2; -extern gotoblas_t gotoblas_NEHALEM; -extern gotoblas_t gotoblas_BARCELONA; -#ifdef DYNAMIC_OLDER -extern gotoblas_t gotoblas_ATOM; -extern gotoblas_t gotoblas_NANO; -extern gotoblas_t gotoblas_PENRYN; -extern gotoblas_t gotoblas_DUNNINGTON; -extern gotoblas_t gotoblas_OPTERON; -extern gotoblas_t gotoblas_OPTERON_SSE3; -extern gotoblas_t gotoblas_BOBCAT; -#else -#define gotoblas_ATOM gotoblas_NEHALEM -#define gotoblas_NANO gotoblas_NEHALEM -#define gotoblas_PENRYN gotoblas_CORE2 -#define gotoblas_DUNNINGTON gotoblas_CORE2 -#define gotoblas_OPTERON gotoblas_CORE2 -#define gotoblas_OPTERON_SSE3 gotoblas_CORE2 -#define gotoblas_BOBCAT gotoblas_CORE2 -#endif - -#ifndef NO_AVX -extern gotoblas_t gotoblas_SANDYBRIDGE; -extern gotoblas_t gotoblas_BULLDOZER; -extern gotoblas_t gotoblas_PILEDRIVER; -extern gotoblas_t gotoblas_STEAMROLLER; -extern gotoblas_t gotoblas_EXCAVATOR; -#ifdef NO_AVX2 -#define gotoblas_HASWELL gotoblas_SANDYBRIDGE -#define gotoblas_SKYLAKEX gotoblas_SANDYBRIDGE -#define gotoblas_ZEN gotoblas_SANDYBRIDGE -#else -extern gotoblas_t gotoblas_HASWELL; -extern gotoblas_t gotoblas_ZEN; -#ifndef NO_AVX512 -extern gotoblas_t gotoblas_SKYLAKEX; -#else -#define gotoblas_SKYLAKEX gotoblas_HASWELL -#endif -#endif -#else -//Use NEHALEM kernels for sandy bridge -#define gotoblas_SANDYBRIDGE gotoblas_NEHALEM -#define gotoblas_HASWELL gotoblas_NEHALEM -#define gotoblas_SKYLAKEX gotoblas_NEHALEM -#define gotoblas_BULLDOZER gotoblas_BARCELONA -#define gotoblas_PILEDRIVER gotoblas_BARCELONA -#define gotoblas_STEAMROLLER gotoblas_BARCELONA -#define gotoblas_EXCAVATOR gotoblas_BARCELONA -#define gotoblas_ZEN gotoblas_BARCELONA -#endif - -#endif // DYNAMIC_LIST - -#define VENDOR_INTEL 1 -#define VENDOR_AMD 2 -#define VENDOR_CENTAUR 3 -#define VENDOR_HYGON 4 -#define VENDOR_UNKNOWN 99 - -#define BITMASK(a, b, c) ((((a) >> (b)) & (c))) - -#ifndef NO_AVX -static inline void xgetbv(int op, int * eax, int * edx){ - //Use binary code for xgetbv - __asm__ __volatile__ - (".byte 0x0f, 0x01, 0xd0": "=a" (*eax), "=d" (*edx) : "c" (op) : "cc"); -} -#endif - -int support_avx(){ -#ifndef NO_AVX - int eax, ebx, ecx, edx; - int ret=0; - - cpuid(1, &eax, &ebx, &ecx, &edx); - if ((ecx & (1 << 28)) != 0 && (ecx & (1 << 27)) != 0 && (ecx & (1 << 26)) != 0){ - xgetbv(0, &eax, 
&edx); - if((eax & 6) == 6){ - ret=1; //OS support AVX - } - } - return ret; -#else - return 0; -#endif -} - -int support_avx2(){ -#ifndef NO_AVX2 - int eax, ebx, ecx=0, edx; - int ret=0; - - if (!support_avx()) - return 0; - cpuid(7, &eax, &ebx, &ecx, &edx); - if((ebx & (1<<7)) != 0) - ret=1; //OS supports AVX2 - return ret; -#else - return 0; -#endif -} - -int support_avx512(){ -#if !defined(NO_AVX) && !defined(NO_AVX512) - int eax, ebx, ecx, edx; - int ret=0; - - if (!support_avx()) - return 0; - cpuid(7, &eax, &ebx, &ecx, &edx); - if((ebx & (1<<7)) != 1){ - ret=0; //OS does not even support AVX2 - } - if((ebx & (1<<31)) != 0){ - xgetbv(0, &eax, &edx); - if((eax & 0xe0) == 0xe0) - ret=1; //OS supports AVX512VL - } - return ret; -#else - return 0; -#endif -} - -extern void openblas_warning(int verbose, const char * msg); -#define FALLBACK_VERBOSE 1 -#define NEHALEM_FALLBACK "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n" -#define SANDYBRIDGE_FALLBACK "OpenBLAS : Your OS does not support AVX2 instructions. OpenBLAS is using Sandybridge kernels as a fallback, which may give poorer performance.\n" -#define HASWELL_FALLBACK "OpenBLAS : Your OS does not support AVX512VL instructions. OpenBLAS is using Haswell kernels as a fallback, which may give poorer performance.\n" -#define BARCELONA_FALLBACK "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Barcelona kernels as a fallback, which may give poorer performance.\n" - -static int get_vendor(void){ - int eax, ebx, ecx, edx; - - union - { - char vchar[16]; - int vint[4]; - } vendor; - - cpuid(0, &eax, &ebx, &ecx, &edx); - - *(&vendor.vint[0]) = ebx; - *(&vendor.vint[1]) = edx; - *(&vendor.vint[2]) = ecx; - - vendor.vchar[12] = '\0'; - - if (!strcmp(vendor.vchar, "GenuineIntel")) return VENDOR_INTEL; - if (!strcmp(vendor.vchar, "AuthenticAMD")) return VENDOR_AMD; - if (!strcmp(vendor.vchar, "CentaurHauls")) return VENDOR_CENTAUR; - if (!strcmp(vendor.vchar, "HygonGenuine")) return VENDOR_HYGON; - - if ((eax == 0) || ((eax & 0x500) != 0)) return VENDOR_INTEL; - - return VENDOR_UNKNOWN; -} - -static gotoblas_t *get_coretype(void){ - - int eax, ebx, ecx, edx; - int family, exfamily, model, vendor, exmodel; - - cpuid(1, &eax, &ebx, &ecx, &edx); - - family = BITMASK(eax, 8, 0x0f); - exfamily = BITMASK(eax, 20, 0xff); - model = BITMASK(eax, 4, 0x0f); - exmodel = BITMASK(eax, 16, 0x0f); - - vendor = get_vendor(); - - if (vendor == VENDOR_INTEL){ - switch (family) { - case 0x6: - switch (exmodel) { - case 0: - if (model <= 0x7) return &gotoblas_KATMAI; - if ((model == 0x8) || (model == 0xa) || (model == 0xb)) return &gotoblas_COPPERMINE; - if ((model == 0x9) || (model == 0xd)) return &gotoblas_BANIAS; - if (model == 14) return &gotoblas_BANIAS; - if (model == 15) return &gotoblas_CORE2; - return NULL; - - case 1: - if (model == 6) return &gotoblas_CORE2; - if (model == 7) return &gotoblas_PENRYN; - if (model == 13) return &gotoblas_DUNNINGTON; - if ((model == 10) || (model == 11) || (model == 14) || (model == 15)) return &gotoblas_NEHALEM; - if (model == 12) return &gotoblas_ATOM; - return NULL; - - case 2: - //Intel Core (Clarkdale) / Core (Arrandale) - // Pentium (Clarkdale) / Pentium Mobile (Arrandale) - // Xeon (Clarkdale), 32nm - if (model == 5) return &gotoblas_NEHALEM; - - //Intel Xeon Processor 5600 (Westmere-EP) - //Xeon Processor E7 (Westmere-EX) - //Xeon E7540 - if (model == 12 || model == 14 || model == 15) return 
&gotoblas_NEHALEM; - - //Intel Core i5-2000 /i7-2000 (Sandy Bridge) - //Intel Core i7-3000 / Xeon E5 - if (model == 10 || model == 13) { - if(support_avx()) - return &gotoblas_SANDYBRIDGE; - else{ - openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); - return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. - } - } - return NULL; - case 3: - //Intel Sandy Bridge 22nm (Ivy Bridge?) - if (model == 10 || model == 14) { - if(support_avx()) - return &gotoblas_SANDYBRIDGE; - else{ - openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); - return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. - } - } - //Intel Haswell - if (model == 12 || model == 15) { - if(support_avx2()) - return &gotoblas_HASWELL; - if(support_avx()) { - openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); - return &gotoblas_SANDYBRIDGE; - } else { - openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); - return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. - } - } - //Intel Broadwell - if (model == 13) { - if(support_avx2()) - return &gotoblas_HASWELL; - if(support_avx()) { - openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); - return &gotoblas_SANDYBRIDGE; - } else { - openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); - return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. - } - } - if (model == 7) return &gotoblas_ATOM; //Bay Trail - return NULL; - case 4: - //Intel Haswell - if (model == 5 || model == 6) { - if(support_avx2()) - return &gotoblas_HASWELL; - if(support_avx()) { - openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); - return &gotoblas_SANDYBRIDGE; - } else { - openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); - return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. - } - } - //Intel Broadwell - if (model == 7 || model == 15) { - if(support_avx2()) - return &gotoblas_HASWELL; - if(support_avx()) { - openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); - return &gotoblas_SANDYBRIDGE; - } else { - openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); - return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. - } - } - //Intel Skylake - if (model == 14) { - if(support_avx2()) - return &gotoblas_HASWELL; - if(support_avx()) { - openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); - return &gotoblas_SANDYBRIDGE; - } else { - openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); - return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. - } - } - //Intel Braswell / Avoton - if (model == 12 || model == 13) { - return &gotoblas_NEHALEM; - } - return NULL; - case 5: - //Intel Broadwell - if (model == 6) { - if(support_avx2()) - return &gotoblas_HASWELL; - if(support_avx()) { - openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); - return &gotoblas_SANDYBRIDGE; - } else { - openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); - return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. 
- } - } - if (model == 5) { - // Intel Skylake X - if (support_avx512()) - return &gotoblas_SKYLAKEX; - if(support_avx2()){ - openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK); - return &gotoblas_HASWELL; - } - if(support_avx()) { - openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); - return &gotoblas_SANDYBRIDGE; - } else { - openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); - return &gotoblas_NEHALEM; - } - } - //Intel Skylake - if (model == 14) { - if(support_avx2()) - return &gotoblas_HASWELL; - if(support_avx()) { - openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); - return &gotoblas_SANDYBRIDGE; - } else { - openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); - return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. - } - } - //Intel Phi Knights Landing - if (model == 7) { - if(support_avx2()){ - openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK); - return &gotoblas_HASWELL; - } - if(support_avx()) { - openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); - return &gotoblas_SANDYBRIDGE; - } else { - openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); - return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. - } - } - //Apollo Lake or Denverton - if (model == 12 || model == 15) { - return &gotoblas_NEHALEM; - } - return NULL; - case 6: - if (model == 6) { - // Cannon Lake - if(support_avx2()) - return &gotoblas_HASWELL; - if(support_avx()) { - openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); - return &gotoblas_SANDYBRIDGE; - } else { - openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); - return &gotoblas_NEHALEM; - } - } - return NULL; - case 7: - if (model == 10) // Goldmont plus - return &gotoblas_NEHALEM; - if (model == 14) { - // Ice Lake - if (support_avx512()) - return &gotoblas_SKYLAKEX; - if(support_avx2()){ - openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK); - return &gotoblas_HASWELL; - } - if(support_avx()) { - openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); - return &gotoblas_SANDYBRIDGE; - } else { - openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); - return &gotoblas_NEHALEM; - } - } - return NULL; - case 9: - case 8: - if (model == 14 ) { // Kaby Lake, Coffee Lake - if(support_avx2()) - return &gotoblas_HASWELL; - if(support_avx()) { - openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); - return &gotoblas_SANDYBRIDGE; - } else { - openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); - return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. - } - } - return NULL; - } - case 0xf: - if (model <= 0x2) return &gotoblas_NORTHWOOD; - return &gotoblas_PRESCOTT; - } - } - - if (vendor == VENDOR_AMD || vendor == VENDOR_HYGON){ - if (family <= 0xe) { - // Verify that CPU has 3dnow and 3dnowext before claiming it is Athlon - cpuid(0x80000000, &eax, &ebx, &ecx, &edx); - if ( (eax & 0xffff) >= 0x01) { - cpuid(0x80000001, &eax, &ebx, &ecx, &edx); - if ((edx & (1 << 30)) == 0 || (edx & (1 << 31)) == 0) - return NULL; - } - else - return NULL; - - return &gotoblas_ATHLON; - } - if (family == 0xf){ - if ((exfamily == 0) || (exfamily == 2)) { - if (ecx & (1 << 0)) return &gotoblas_OPTERON_SSE3; - else return &gotoblas_OPTERON; - } else if (exfamily == 5) { - return &gotoblas_BOBCAT; - } else if (exfamily == 6) { - if(model == 1){ - //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series - if(support_avx()) - return &gotoblas_BULLDOZER; - else{ - openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK); - return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. 
- } - }else if(model == 2 || model == 3){ - //AMD Bulldozer Opteron 6300 / Opteron 4300 / Opteron 3300 - if(support_avx()) - return &gotoblas_PILEDRIVER; - else{ - openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK); - return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. - } - }else if(model == 5){ - if(support_avx()) - return &gotoblas_EXCAVATOR; - else{ - openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK); - return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. - } - }else if(model == 0 || model == 8){ - if (exmodel == 1) { - //AMD Trinity - if(support_avx()) - return &gotoblas_PILEDRIVER; - else{ - openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK); - return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. - } - }else if (exmodel == 3) { - //AMD STEAMROLLER - if(support_avx()) - return &gotoblas_STEAMROLLER; - else{ - openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK); - return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. - } - }else if (exmodel == 6) { - if(support_avx()) - return &gotoblas_EXCAVATOR; - else{ - openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK); - return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. - } - - } - } - } else if (exfamily == 8) { - if (model == 1 || model == 8) { - if(support_avx()) - return &gotoblas_ZEN; - else{ - openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK); - return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. - } - } - } else if (exfamily == 9) { - if(support_avx()) - return &gotoblas_ZEN; - else{ - openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK); - return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. - } - }else { - return &gotoblas_BARCELONA; - } - } - } - - if (vendor == VENDOR_CENTAUR) { - switch (family) { - case 0x6: - return &gotoblas_NANO; - } - } - - return NULL; -} - -static char *corename[] = { - "Unknown", - "Katmai", - "Coppermine", - "Northwood", - "Prescott", - "Banias", - "Atom", - "Core2", - "Penryn", - "Dunnington", - "Nehalem", - "Athlon", - "Opteron", - "Opteron_SSE3", - "Barcelona", - "Nano", - "Sandybridge", - "Bobcat", - "Bulldozer", - "Piledriver", - "Haswell", - "Steamroller", - "Excavator", - "Zen", - "SkylakeX" -}; - -char *gotoblas_corename(void) { - - if (gotoblas == &gotoblas_KATMAI) return corename[ 1]; - if (gotoblas == &gotoblas_COPPERMINE) return corename[ 2]; - if (gotoblas == &gotoblas_NORTHWOOD) return corename[ 3]; - if (gotoblas == &gotoblas_PRESCOTT) return corename[ 4]; - if (gotoblas == &gotoblas_BANIAS) return corename[ 5]; - if (gotoblas == &gotoblas_ATOM) return corename[ 6]; - if (gotoblas == &gotoblas_CORE2) return corename[ 7]; - if (gotoblas == &gotoblas_PENRYN) return corename[ 8]; - if (gotoblas == &gotoblas_DUNNINGTON) return corename[ 9]; - if (gotoblas == &gotoblas_NEHALEM) return corename[10]; - if (gotoblas == &gotoblas_ATHLON) return corename[11]; - if (gotoblas == &gotoblas_OPTERON_SSE3) return corename[12]; - if (gotoblas == &gotoblas_OPTERON) return corename[13]; - if (gotoblas == &gotoblas_BARCELONA) return corename[14]; - if (gotoblas == &gotoblas_NANO) return corename[15]; - if (gotoblas == &gotoblas_SANDYBRIDGE) return corename[16]; - if (gotoblas == &gotoblas_BOBCAT) return corename[17]; - if (gotoblas == &gotoblas_BULLDOZER) return corename[18]; - if (gotoblas == &gotoblas_PILEDRIVER) return corename[19]; - if (gotoblas == &gotoblas_HASWELL) return corename[20]; - if (gotoblas == &gotoblas_STEAMROLLER) return corename[21]; - if 
(gotoblas == &gotoblas_EXCAVATOR) return corename[22]; - if (gotoblas == &gotoblas_ZEN) return corename[23]; - if (gotoblas == &gotoblas_SKYLAKEX) return corename[24]; - return corename[0]; -} - - -static gotoblas_t *force_coretype(char *coretype){ - - int i ; - int found = -1; - char message[128]; - //char mname[20]; - - for ( i=1 ; i <= 24; i++) - { - if (!strncasecmp(coretype,corename[i],20)) - { - found = i; - break; - } - } - if (found < 0) - { - //strncpy(mname,coretype,20); - snprintf(message, 128, "Core not found: %s\n",coretype); - openblas_warning(1, message); - return(NULL); - } - - switch (found) - { - case 24: return (&gotoblas_SKYLAKEX); - case 23: return (&gotoblas_ZEN); - case 22: return (&gotoblas_EXCAVATOR); - case 21: return (&gotoblas_STEAMROLLER); - case 20: return (&gotoblas_HASWELL); - case 19: return (&gotoblas_PILEDRIVER); - case 18: return (&gotoblas_BULLDOZER); - case 17: return (&gotoblas_BOBCAT); - case 16: return (&gotoblas_SANDYBRIDGE); - case 15: return (&gotoblas_NANO); - case 14: return (&gotoblas_BARCELONA); - case 13: return (&gotoblas_OPTERON); - case 12: return (&gotoblas_OPTERON_SSE3); - case 11: return (&gotoblas_ATHLON); - case 10: return (&gotoblas_NEHALEM); - case 9: return (&gotoblas_DUNNINGTON); - case 8: return (&gotoblas_PENRYN); - case 7: return (&gotoblas_CORE2); - case 6: return (&gotoblas_ATOM); - case 5: return (&gotoblas_BANIAS); - case 4: return (&gotoblas_PRESCOTT); - case 3: return (&gotoblas_NORTHWOOD); - case 2: return (&gotoblas_COPPERMINE); - case 1: return (&gotoblas_KATMAI); - } - return(NULL); - -} - - - - -void gotoblas_dynamic_init(void) { - - char coremsg[128]; - char coren[22]; - char *p; - - - if (gotoblas) return; - - p = getenv("OPENBLAS_CORETYPE"); - if ( p ) - { - gotoblas = force_coretype(p); - } - else - { - gotoblas = get_coretype(); - } - -#ifdef ARCH_X86 - if (gotoblas == NULL) gotoblas = &gotoblas_KATMAI; -#else - if (gotoblas == NULL) gotoblas = &gotoblas_PRESCOTT; - /* sanity check, if 64bit pointer we can't have a 32 bit cpu */ - if (sizeof(void*) == 8) { - if (gotoblas == &gotoblas_KATMAI || - gotoblas == &gotoblas_COPPERMINE || - gotoblas == &gotoblas_NORTHWOOD || - gotoblas == &gotoblas_BANIAS || - gotoblas == &gotoblas_ATHLON) - gotoblas = &gotoblas_PRESCOTT; - } -#endif - - if (gotoblas && gotoblas -> init) { - strncpy(coren,gotoblas_corename(),20); - sprintf(coremsg, "Core: %s\n",coren); - openblas_warning(2, coremsg); - gotoblas -> init(); - } else { - openblas_warning(0, "OpenBLAS : Architecture Initialization failed. 
No initialization function found.\n"); - exit(1); - } - -} - -void gotoblas_dynamic_quit(void) { - - gotoblas = NULL; - -} diff --git a/f_check b/f_check index 993ad9a35..79b24e2dc 100644 --- a/f_check +++ b/f_check @@ -71,7 +71,7 @@ if ($compiler eq "") { if ($data =~ /GNU/) { - $data =~ /(\d)\.(\d).(\d)/; + $data =~ /(\d+)\.(\d+).(\d+)/; $major = $1; $minor = $2; diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 7998c135a..6d96abb2e 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -1,4 +1,5 @@ USE_GEMM3M = 0 +OS := $(shell uname) ifeq ($(ARCH), x86) USE_GEMM3M = 1 @@ -59,8 +60,6 @@ USE_TRMM = 1 endif - - SKERNELOBJS += \ sgemm_kernel$(TSUFFIX).$(SUFFIX) \ $(SGEMMINCOPYOBJ) $(SGEMMITCOPYOBJ) \ @@ -438,7 +437,15 @@ $(KDIR)$(SGEMMONCOPYOBJ) : $(KERNELDIR)/$(SGEMMONCOPY) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ $(KDIR)$(SGEMMOTCOPYOBJ) : $(KERNELDIR)/$(SGEMMOTCOPY) +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -E -UDOUBLE -UCOMPLEX $< -o sgemmotcopy.s + m4 sgemmotcopy.s > sgemmotcopy_nomacros.s + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemmotcopy_nomacros.s -o $@ + rm sgemmotcopy.s sgemmotcopy_nomacros.s +else $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ +endif + ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) @@ -446,12 +453,26 @@ $(KDIR)$(SGEMMINCOPYOBJ) : $(KERNELDIR)/$(SGEMMINCOPY) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ $(KDIR)$(SGEMMITCOPYOBJ) : $(KERNELDIR)/$(SGEMMITCOPY) - $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ - +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -E -UDOUBLE -UCOMPLEX $< -o sgemmitcopy.s + m4 sgemmitcopy.s > sgemmitcopy_nomacros.s + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemmitcopy_nomacros.s -o $@ + rm sgemmitcopy.s sgemmitcopy_nomacros.s +else + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ +endif + endif $(KDIR)$(DGEMMONCOPYOBJ) : $(KERNELDIR)/$(DGEMMONCOPY) +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o dgemm_ncopy.s + m4 dgemm_ncopy.s > dgemm_ncopy_nomacros.s + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_ncopy_nomacros.s -o $@ + rm dgemm_ncopy.s dgemm_ncopy_nomacros.s +else $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ +endif $(KDIR)$(DGEMMOTCOPYOBJ) : $(KERNELDIR)/$(DGEMMOTCOPY) $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ @@ -462,7 +483,14 @@ $(KDIR)$(DGEMMINCOPYOBJ) : $(KERNELDIR)/$(DGEMMINCOPY) $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ $(KDIR)$(DGEMMITCOPYOBJ) : $(KERNELDIR)/$(DGEMMITCOPY) +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o dgemm_itcopy.s + m4 dgemm_itcopy.s > dgemm_itcopy_nomacros.s + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_itcopy_nomacros.s -o $@ + rm dgemm_itcopy.s dgemm_itcopy_nomacros.s +else $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ +endif endif @@ -498,7 +526,14 @@ $(KDIR)$(CGEMMINCOPYOBJ) : $(KERNELDIR)/$(CGEMMINCOPY) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ $(KDIR)$(CGEMMITCOPYOBJ) : $(KERNELDIR)/$(CGEMMITCOPY) +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -UDOUBLE -UCOMPLEX -E $< -o cgemm_itcopy.s + m4 cgemm_itcopy.s > cgemm_itcopy_nomacros.s + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX cgemm_itcopy_nomacros.s -o $@ + rm cgemm_itcopy.s cgemm_itcopy_nomacros.s +else $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ +endif endif @@ -514,7 +549,14 @@ $(KDIR)$(ZGEMMINCOPYOBJ) : $(KERNELDIR)/$(ZGEMMINCOPY) $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ $(KDIR)$(ZGEMMITCOPYOBJ) : $(KERNELDIR)/$(ZGEMMITCOPY) +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o zgemm_itcopy.s + m4 zgemm_itcopy.s > zgemm_itcopy_nomacros.s + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX 
zgemm_itcopy_nomacros.s -o $@ + rm zgemm_itcopy.s zgemm_itcopy_nomacros.s +else $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ +endif endif @@ -539,37 +581,107 @@ endif endif $(KDIR)sgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(SGEMMDEPEND) +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -E -UDOUBLE -UCOMPLEX $< -o sgemm_kernel$(TSUFFIX).s + m4 sgemm_kernel$(TSUFFIX).s > sgemm_kernel$(TSUFFIX)_nomacros.s + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemm_kernel$(TSUFFIX)_nomacros.s -o $@ + rm sgemm_kernel$(TSUFFIX).s sgemm_kernel$(TSUFFIX)_nomacros.s +else $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ +endif $(KDIR)dgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) $(DGEMMDEPEND) +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o dgemm_kernel$(TSUFFIX).s + m4 dgemm_kernel$(TSUFFIX).s > dgemm_kernel$(TSUFFIX)_nomacros.s + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_kernel$(TSUFFIX)_nomacros.s -o $@ + rm dgemm_kernel$(TSUFFIX).s dgemm_kernel$(TSUFFIX)_nomacros.s +else $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ +endif $(KDIR)qgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) $(QGEMMDEPEND) $(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ $(KDIR)cgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DNN $< -o cgemm_kernel_n.s + m4 cgemm_kernel_n.s > cgemm_kernel_n_nomacros.s + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN cgemm_kernel_n_nomacros.s -o $@ + rm cgemm_kernel_n.s cgemm_kernel_n_nomacros.s +else $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN $< -o $@ +endif $(KDIR)cgemm_kernel_l$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DCN $< -o cgemm_kernel_l.s + m4 cgemm_kernel_l.s > cgemm_kernel_l_nomacros.s + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCN cgemm_kernel_l_nomacros.s -o $@ + rm cgemm_kernel_l.s cgemm_kernel_l_nomacros.s +else $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $@ +endif $(KDIR)cgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DNC $< -o cgemm_kernel_r.s + m4 cgemm_kernel_r.s > cgemm_kernel_r_nomacros.s + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC cgemm_kernel_r_nomacros.s -o $@ + rm cgemm_kernel_r.s cgemm_kernel_r_nomacros.s +else $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC $< -o $@ +endif $(KDIR)cgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DCC $< -o cgemm_kernel_b.s + m4 cgemm_kernel_b.s > cgemm_kernel_b_nomacros.s + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCC cgemm_kernel_b_nomacros.s -o $@ + rm cgemm_kernel_b.s cgemm_kernel_b_nomacros.s +else $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $@ +endif $(KDIR)zgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DNN $< -o zgemm_kernel_n.s + m4 zgemm_kernel_n.s > zgemm_kernel_n_nomacros.s + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN zgemm_kernel_n_nomacros.s -o $@ + rm zgemm_kernel_n.s zgemm_kernel_n_nomacros.s +else $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN $< -o $@ +endif $(KDIR)zgemm_kernel_l$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DCN $< -o zgemm_kernel_l.s + m4 zgemm_kernel_l.s > zgemm_kernel_l_nomacros.s + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN 
zgemm_kernel_l_nomacros.s -o $@ + rm zgemm_kernel_l.s zgemm_kernel_l_nomacros.s +else $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $@ +endif $(KDIR)zgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DNC $< -o zgemm_kernel_r.s + m4 zgemm_kernel_r.s > zgemm_kernel_r_nomacros.s + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC zgemm_kernel_r_nomacros.s -o $@ + rm zgemm_kernel_r.s zgemm_kernel_r_nomacros.s +else $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC $< -o $@ +endif $(KDIR)zgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DCC $< -o zgemm_kernel_b.s + m4 zgemm_kernel_b.s > zgemm_kernel_b_nomacros.s + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC zgemm_kernel_b_nomacros.s -o $@ + rm zgemm_kernel_b.s zgemm_kernel_b_nomacros.s +else $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $@ +endif $(KDIR)xgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND) $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DNN $< -o $@ @@ -586,28 +698,84 @@ $(KDIR)xgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMD ifdef USE_TRMM $(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o strmmkernel_ln.s + m4 strmmkernel_ln.s > strmmkernel_ln_nomacros.s + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA strmmkernel_ln_nomacros.s -o $@ + rm strmmkernel_ln.s strmmkernel_ln_nomacros.s +else $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ +endif $(KDIR)strmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o strmmkernel_lt.s + m4 strmmkernel_lt.s > strmmkernel_lt_nomacros.s + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA strmmkernel_lt_nomacros.s -o $@ + rm strmmkernel_lt.s strmmkernel_lt_nomacros.s +else $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ +endif $(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o strmmkernel_rn.s + m4 strmmkernel_rn.s > strmmkernel_rn_nomacros.s + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA strmmkernel_rn_nomacros.s -o $@ + rm strmmkernel_rn.s strmmkernel_rn_nomacros.s +else $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ +endif $(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o strmm_kernel_rt.s + m4 strmm_kernel_rt.s > strmm_kernel_rt_nomacros.s + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@ + rm strmm_kernel_rt.s strmm_kernel_rt_nomacros.s +else $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ +endif $(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o dtrmm_kernel_ln.s + m4 dtrmm_kernel_ln.s > dtrmm_kernel_ln_nomacros.s + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA dtrmm_kernel_ln_nomacros.s -o $@ + rm dtrmm_kernel_ln.s dtrmm_kernel_ln_nomacros.s +else $(CC) $(CFLAGS) -c -DTRMMKERNEL 
-DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ +endif $(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o dtrmm_kernel_lt.s + m4 dtrmm_kernel_lt.s > dtrmm_kernel_lt_nomacros.s + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA dtrmm_kernel_lt_nomacros.s -o $@ + rm dtrmm_kernel_lt.s dtrmm_kernel_lt_nomacros.s +else $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ +endif $(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o dtrmm_kernel_rn.s + m4 dtrmm_kernel_rn.s > dtrmm_kernel_rn_nomacros.s + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA dtrmm_kernel_rn_nomacros.s -o $@ + rm dtrmm_kernel_rn.s dtrmm_kernel_rn_nomacros.s +else $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ +endif $(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o dtrmm_kernel_rt.s + m4 dtrmm_kernel_rt.s > dtrmm_kernel_rt_nomacros.s + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA dtrmm_kernel_rt_nomacros.s -o $@ + rm dtrmm_kernel_rt.s dtrmm_kernel_rt_nomacros.s +else $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ +endif $(KDIR)qtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ @@ -622,52 +790,165 @@ $(KDIR)qtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ $(KDIR)ctrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o ctrmm_kernel_ln.s + m4 ctrmm_kernel_ln.s > ctrmm_kernel_ln_nomacros.s + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN ctrmm_kernel_ln_nomacros.s -o $@ + rm ctrmm_kernel_ln.s ctrmm_kernel_ln_nomacros.s +else $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ +endif $(KDIR)ctrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o ctrmm_kernel_lt.s + m4 ctrmm_kernel_lt.s > ctrmm_kernel_lt_nomacros.s + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN ctrmm_kernel_lt_nomacros.s -o $@ + rm ctrmm_kernel_lt.s ctrmm_kernel_lt_nomacros.s +else $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ +endif $(KDIR)ctrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o ctrmm_kernel_lr.s + m4 ctrmm_kernel_lr.s > ctrmm_kernel_lr_nomacros.s + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN ctrmm_kernel_lr_nomacros.s -o $@ + rm ctrmm_kernel_lr.s ctrmm_kernel_lr_nomacros.s +else $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ +endif $(KDIR)ctrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o ctrmm_kernel_lc.s + m4 
ctrmm_kernel_lc.s > ctrmm_kernel_lc_nomacros.s + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN ctrmm_kernel_lc_nomacros.s -o $@ + rm ctrmm_kernel_lc_nomacros.s ctrmm_kernel_lc.s +else $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ +endif $(KDIR)ctrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o ctrmm_kernel_rn.s + m4 ctrmm_kernel_rn.s > ctrmm_kernel_rn_nomacros.s + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN ctrmm_kernel_rn_nomacros.s -o $@ + rm ctrmm_kernel_rn.s ctrmm_kernel_rn_nomacros.s +else $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ +endif $(KDIR)ctrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o ctrmm_kernel_rt.s + m4 ctrmm_kernel_rt.s > ctrmm_kernel_rt_nomacros.s + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN ctrmm_kernel_rt_nomacros.s -o $@ + rm ctrmm_kernel_rt.s ctrmm_kernel_rt_nomacros.s +else $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ +endif $(KDIR)ctrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o ctrmm_kernel_rr.s + m4 ctrmm_kernel_rr.s > ctrmm_kernel_rr_nomacros.s + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC ctrmm_kernel_rr_nomacros.s -o $@ + rm ctrmm_kernel_rr.s ctrmm_kernel_rr_nomacros.s +else $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ +endif $(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o ctrmm_kernel_RC.s + m4 ctrmm_kernel_RC.s > ctrmm_kernel_RC_nomacros.s + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC ctrmm_kernel_RC_nomacros.s -o $@ + rm ctrmm_kernel_RC.s ctrmm_kernel_RC_nomacros.s +else $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ +endif $(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o ztrmm_kernel_ln.s + m4 ztrmm_kernel_ln.s > ztrmm_kernel_ln_nomacros.s + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN ztrmm_kernel_ln_nomacros.s -o $@ + rm ztrmm_kernel_ln.s ztrmm_kernel_ln_nomacros.s +else $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ +endif $(KDIR)ztrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o ztrmm_kernel_lt.s + m4 ztrmm_kernel_lt.s > ztrmm_kernel_lt_nomacros.s + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN ztrmm_kernel_lt_nomacros.s -o $@ + rm ztrmm_kernel_lt.s ztrmm_kernel_lt_nomacros.s +else $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ +endif $(KDIR)ztrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT 
-UTRANSA -DCONJ -DCN $< -o ztrmm_kernel_lr.s + m4 ztrmm_kernel_lr.s > ztrmm_kernel_lr_nomacros.s + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN ztrmm_kernel_lr_nomacros.s -o $@ + rm ztrmm_kernel_lr.s ztrmm_kernel_lr_nomacros.s +else $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ +endif $(KDIR)ztrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o ztrmm_kernel_lc.s + m4 ztrmm_kernel_lc.s >ztrmm_kernel_lc_nomacros.s + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN ztrmm_kernel_lc_nomacros.s -o $@ + rm ztrmm_kernel_lc.s ztrmm_kernel_lc_nomacros.s +else $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ +endif $(KDIR)ztrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o ztrmm_kernel_rn.s + m4 ztrmm_kernel_rn.s > ztrmm_kernel_rn_nomacros.s + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN ztrmm_kernel_rn_nomacros.s -o $@ + rm ztrmm_kernel_rn.s ztrmm_kernel_rn_nomacros.s +else $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ +endif $(KDIR)ztrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o ztrmm_kernel_rt.s + m4 ztrmm_kernel_rt.s > ztrmm_kernel_rt_nomacros.s + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN ztrmm_kernel_rt_nomacros.s -o $@ + rm ztrmm_kernel_rt.s ztrmm_kernel_rt_nomacros.s +else $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ +endif $(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o ztrmm_kernel_rr.s + m4 ztrmm_kernel_rr.s > ztrmm_kernel_rr_nomacros.s + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC ztrmm_kernel_rr_nomacros.s -o $@ + rm ztrmm_kernel_rr.s ztrmm_kernel_rr_nomacros.s +else $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ +endif $(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o ztrmm_kernel_rc.s + m4 ztrmm_kernel_rc.s > ztrmm_kernel_rc_nomacros.s + $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC ztrmm_kernel_rc_nomacros.s -o $@ + rm ztrmm_kernel_rc.s ztrmm_kernel_rc_nomacros.s +else $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ +endif + else $(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ @@ -679,7 +960,14 @@ $(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ $(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o strmm_kernel_rt.s + m4 strmm_kernel_rt.s > strmm_kernel_rt_nomacros.s + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT 
-DTRANSA strmm_kernel_rt_nomacros.s -o $@ + rm strmm_kernel_rt.s strmm_kernel_rt_nomacros.s +else $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ +endif $(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ @@ -806,7 +1094,14 @@ $(KDIR)dtrsm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_LN) $(DT $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -DUPPER -DLN -UCONJ $< -o $@ $(KDIR)dtrsm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_LT) $(DTRSMDEPEND) +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -E -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ $< -o dtrsm_kernel_lt.s + m4 dtrsm_kernel_lt.s > dtrsm_kernel_lt_nomacros.s + $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ dtrsm_kernel_lt_nomacros.s -o $@ + rm dtrsm_kernel_lt.s dtrsm_kernel_lt_nomacros.s +else $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ $< -o $@ +endif $(KDIR)dtrsm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_RN) $(DTRSMDEPEND) $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -DUPPER -DRN -UCONJ $< -o $@ @@ -1942,7 +2237,7 @@ $(SGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(SGEMMITCOPY) endif -$(DGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(DGEMMONCOPY) +$(D cgemm_kernel_r_nomacros.s + $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DNC cgemm_kernel_r_nomacros.s -o $@ + rm cgemm_kernel_r.s cgemm_kernel_r_nomacros.s +else + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC $< -o $@ +endif $(KDIR)cgemm_kernel_b$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $@ @@ -2085,7 +2387,14 @@ $(KDIR)strmm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ $(KDIR)strmm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) - $(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o strmm_kernel_rt.s + m4 strmm_kernel_rt.s > strmm_kernel_rt_nomacros.s + $(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@ + rm strmm_kernel_rt.s strmm_kernel_rt_nomacros.s +else + $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ +endif $(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) $(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ diff --git a/kernel/arm64/KERNEL.ARMV8 b/kernel/arm64/KERNEL.ARMV8 index efc1ec8bc..b90dd228b 100644 --- a/kernel/arm64/KERNEL.ARMV8 +++ b/kernel/arm64/KERNEL.ARMV8 @@ -102,6 +102,8 @@ CDOTKERNEL = zdot.S ZDOTKERNEL = zdot.S DSDOTKERNEL = dot.S +DGEMM_BETA = dgemm_beta.S + SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) diff --git a/kernel/arm64/dgemm_beta.S b/kernel/arm64/dgemm_beta.S new file mode 100644 index 000000000..636954695 --- /dev/null +++ b/kernel/arm64/dgemm_beta.S @@ -0,0 +1,178 @@ +/*************************************************************************** +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1.
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M x0 +#define N x1 +#define BETA d0 +#define LDC x6 +#define C00 x7 + +#define A01 x8 +#define A02 x9 +#define A03 x10 +#define A04 x11 + +#define beta0 d11 +#define betaV0 v11.d[0] +#define I x16 + +#define size 128 + +/************************************************************************************** +* Macro definitions +**************************************************************************************/ + +.macro SAVE_REGS + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] +.endm + +.macro RESTORE_REGS + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) +.endm + +/************************************************************************************** +* End of macro definitions +**************************************************************************************/ + + PROLOGUE + + .align 5 + + ldr LDC, [sp] + SAVE_REGS + +.Lgemm_beta_BEGIN: + + fmov beta0, BETA + cmp N, #0 + ble .Lgemm_beta_L999 + +.Lgemm_beta_01: + + lsl LDC, LDC, #3 + + .align 5 +.Lgemm_beta_02: + + mov A01, C00 + add C00, C00, LDC + asr I, M, #4 + cmp I, #0 + ble .Lgemm_beta_04 + add A02, A01, #32 + add A03, A02, #32 + add A04, A03, #32 + + .align 5 +.Lgemm_beta_03: + + ldp q0, q1, [A01] + ldp q2, q3, [A02] + ldp q4, q5, [A03] + ldp q6, q7, [A04] + + fmul v0.2d, v0.2d, betaV0 + fmul v1.2d, v1.2d, betaV0 + + fmul v2.2d, v2.2d, betaV0 + fmul v3.2d, v3.2d, betaV0 + + fmul v4.2d, v4.2d, betaV0 + fmul v5.2d, v5.2d, betaV0 + + fmul v6.2d, v6.2d, betaV0 + fmul v7.2d, v7.2d, betaV0 + +
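// the stores below write the scaled values back to C and advance the four interleaved pointers by 128 bytes ("size") each, so one pass of .Lgemm_beta_03 scales 16 consecutive doubles of the current column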
st1 {v0.2d, v1.2d}, [A01] + add A01, A01, size + st1 {v2.2d, v3.2d}, [A02] + add A02, A02, size + st1 {v4.2d, v5.2d}, [A03] + add A03, A03, size + st1 {v6.2d, v7.2d}, [A04] + add A04, A04, size + + subs I , I , #1 + bne .Lgemm_beta_03 + + .align 5 +.Lgemm_beta_04: + + and I, M , #15 // M%16 + cmp I, #0 + ble .Lgemm_beta_06 + + .align 5 +.Lgemm_beta_05: + + ldr d12, [A01] + fmul d12, d12, beta0 + str d12, [A01] + add A01, A01, #8 + + subs I , I , #1 + bne .Lgemm_beta_05 + + .align 5 +.Lgemm_beta_06: + + subs N , N, #1 // N-- + bne .Lgemm_beta_02 + + .align 5 +.Lgemm_beta_L999: + + mov x0, #0 + RESTORE_REGS + ret + + EPILOGUE diff --git a/kernel/power/casum_microk_power8.c b/kernel/power/casum_microk_power8.c index 7d12c9885..91d53ffc3 100644 --- a/kernel/power/casum_microk_power8.c +++ b/kernel/power/casum_microk_power8.c @@ -68,10 +68,10 @@ static float casum_kernel_16 (long n, float *x) "addi %2, %2, 128 \n\t" "addic. %1, %1, -16 \n\t" - "ble 2f \n\t" + "ble two%= \n\t" - ".p2align 5 \n" - "1: \n\t" + ".align 5 \n" + "one%=: \n\t" "xvabssp 48, 40 \n\t" "xvabssp 49, 41 \n\t" @@ -108,9 +108,9 @@ static float casum_kernel_16 (long n, float *x) "xvaddsp 38, 38, %x5 \n\t" "xvaddsp 39, 39, %x6 \n\t" - "bgt 1b \n" + "bgt one%= \n" - "2: \n\t" + "two%=: \n\t" "xvabssp 48, 40 \n\t" "xvabssp 49, 41 \n\t" diff --git a/kernel/power/ccopy_microk_power8.c b/kernel/power/ccopy_microk_power8.c index 613c4d286..6a7886e6f 100644 --- a/kernel/power/ccopy_microk_power8.c +++ b/kernel/power/ccopy_microk_power8.c @@ -62,10 +62,10 @@ static void ccopy_kernel_32 (long n, float *x, float *y) "addi %2, %2, 128 \n\t" "addic. %1, %1, -32 \n\t" - "ble 2f \n\t" + "ble two%= \n\t" - ".p2align 5 \n" - "1: \n\t" + ".align 5 \n" + "one%=: \n\t" "stxvd2x 32, 0, %3 \n\t" "stxvd2x 33, %5, %3 \n\t" @@ -108,9 +108,9 @@ static void ccopy_kernel_32 (long n, float *x, float *y) "addi %2, %2, 128 \n\t" "addic. %1, %1, -32 \n\t" - "bgt 1b \n" + "bgt one%= \n" - "2: \n\t" + "two%=: \n\t" "stxvd2x 32, 0, %3 \n\t" "stxvd2x 33, %5, %3 \n\t" diff --git a/kernel/power/cgemm_macros_8x4_power8.S b/kernel/power/cgemm_macros_8x4_power8.S index 9a18cb189..46108bbb4 100644 --- a/kernel/power/cgemm_macros_8x4_power8.S +++ b/kernel/power/cgemm_macros_8x4_power8.S @@ -83,7 +83,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * Macros for N=4 and M=8 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD4x8_1', ` +#else .macro LOAD4x8_1 +#endif lxvw4x vs0, o0, AO // load a0, a1 lxvw4x vs1, o16, AO // load a2, a3 @@ -107,9 +111,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 64 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x8_I1', ` +#else .macro KERNEL4x8_I1 +#endif lxvw4x vs4, o0, AO // load a0, a1 @@ -172,9 +184,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs63, vs3, vs15 // a3_r*b3_i, a3_i*b3_i, a1_r*b3_i, a1_i*b3_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x8_1', ` +#else .macro KERNEL4x8_1 +#endif lxvw4x vs4, o0, AO // load a0, a1 @@ -237,9 +257,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmaddasp vs63, vs3, vs15 // a3_r*b3_i, a3_i*b3_i, a1_r*b3_i, a1_i*b3_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x8_2', ` +#else .macro KERNEL4x8_2 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -302,9 +330,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs63, vs7, vs23 // a7_r*b3_i, a7_i*b3_i, a1_r*b3_i, a1_i*b3_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x8_E2', ` +#else .macro KERNEL4x8_E2 +#endif xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r @@ -344,9 +380,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs63, vs7, vs23 // a7_r*b3_i, a7_i*b3_i, a1_r*b3_i, a1_i*b3_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x8_SUBI1', ` +#else .macro KERNEL4x8_SUBI1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -409,9 +453,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs63, vs3, vs15 // a3_r*b3_i, a3_i*b3_i, a1_r*b3_i, a1_i*b3_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x8_SUB1', ` +#else .macro KERNEL4x8_SUB1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -474,9 +526,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs63, vs3, vs15 // a3_r*b3_i, a3_i*b3_i, a1_r*b3_i, a1_i*b3_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE4x8', ` +#else .macro SAVE4x8 +#endif mr T1, CO xxlxor vs24, vs24, vs24 @@ -1546,14 +1606,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 64 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=4 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD4x4_1', ` +#else .macro LOAD4x4_1 +#endif lxvw4x vs0, o0, AO // load a0, a1 lxvw4x vs1, o16, AO // load a2, a3 @@ -1575,9 +1643,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 64 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x4_I1', ` +#else .macro KERNEL4x4_I1 +#endif lxvw4x vs4, o0, AO // load a0, a1 @@ -1622,9 +1698,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs47, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x4_1', ` +#else .macro KERNEL4x4_1 +#endif lxvw4x vs4, o0, AO // load a0, a1 @@ -1669,9 +1753,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs47, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x4_2', ` +#else .macro KERNEL4x4_2 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -1716,9 +1808,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs47, vs5, vs23 // a5_r*b3_i, a5_i*b3_i, a1_r*b3_i, a1_i*b3_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x4_E2', ` +#else .macro KERNEL4x4_E2 +#endif xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r @@ -1742,9 +1842,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmaddasp vs47, vs5, vs23 // a5_r*b3_i, a5_i*b3_i, a1_r*b3_i, a1_i*b3_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x4_SUBI1', ` +#else .macro KERNEL4x4_SUBI1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -1789,9 +1897,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs47, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x4_SUB1', ` +#else .macro KERNEL4x4_SUB1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -1836,9 +1952,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs47, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE4x4', ` +#else .macro SAVE4x4 +#endif mr T1, CO xxlxor vs24, vs24, vs24 @@ -2388,14 +2512,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 32 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=2 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD4x2_1', ` +#else .macro LOAD4x2_1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -2416,9 +2548,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 64 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x2_I1', ` +#else .macro KERNEL4x2_I1 +#endif lxvw4x vs4, o0, AO // load a0, a1 @@ -2454,9 +2594,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs39, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x2_1', ` +#else .macro KERNEL4x2_1 +#endif lxvw4x vs4, o0, AO // load a0, a1 @@ -2492,9 +2640,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x2_2', ` +#else .macro KERNEL4x2_2 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -2530,9 +2686,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs4, vs23 // a4_r*b3_i, a4_i*b3_i, a1_r*b3_i, a1_i*b3_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x2_E2', ` +#else .macro KERNEL4x2_E2 +#endif xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r @@ -2548,9 +2712,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs4, vs23 // a4_r*b3_i, a4_i*b3_i, a1_r*b3_i, a1_i*b3_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x2_SUBI1', ` +#else .macro KERNEL4x2_SUBI1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -2586,9 +2758,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs39, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x2_SUB1', ` +#else .macro KERNEL4x2_SUB1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -2624,9 +2804,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmaddasp vs39, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE4x2', ` +#else .macro SAVE4x2 +#endif mr T1, CO xxlxor vs24, vs24, vs24 @@ -2916,14 +3104,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 16 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=1 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD4x1_1', ` +#else .macro LOAD4x1_1 +#endif lxsspx vs0, o0, AO // load a0_r lxsspx vs1, o4, AO // load a0_i @@ -2945,9 +3141,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 64 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x1_I1', ` +#else .macro KERNEL4x1_I1 +#endif lxsspx vs4, o0, AO // load a0_r @@ -2992,9 +3196,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs47, vs1, vs14 // a0_i*b3_r +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x1_1', ` +#else .macro KERNEL4x1_1 +#endif lxsspx vs4, o0, AO // load a0_r @@ -3039,9 +3251,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs47, vs1, vs14 // a0_i*b3_r +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x1_2', ` +#else .macro KERNEL4x1_2 +#endif lxsspx vs0, o0, AO // load a0_r @@ -3086,9 +3306,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs47, vs5, vs22 // a4_i*b3_r +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x1_E2', ` +#else .macro KERNEL4x1_E2 +#endif xsmaddadp vs32, vs4, vs16 // a4_r*b0_r @@ -3112,9 +3340,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs47, vs5, vs22 // a4_i*b3_r +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x1_SUBI1', ` +#else .macro KERNEL4x1_SUBI1 +#endif lxsspx vs0, o0, AO // load a0_r @@ -3159,9 +3395,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs47, vs1, vs14 // a0_i*b3_r +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x1_SUB1', ` +#else .macro KERNEL4x1_SUB1 +#endif lxsspx vs0, o0, AO // load a0_r @@ -3206,9 +3450,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs47, vs1, vs14 // a0_i*b3_r +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE4x1', ` +#else .macro SAVE4x1 +#endif mr T1, CO xxlxor vs24, vs24, vs24 @@ -3382,14 +3634,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 8 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=8 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD2x8_1', ` +#else .macro LOAD2x8_1 +#endif lxvw4x vs0, o0, AO // load a0, a1 lxvw4x vs1, o16, AO // load a2, a3 @@ -3406,9 +3666,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi BO, BO, 64 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_I1', ` +#else .macro KERNEL2x8_I1 +#endif lxvw4x vs4, o0, AO // load a0, a1 @@ -3446,9 +3714,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs47, vs3, vs11 // a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_1', ` +#else .macro KERNEL2x8_1 +#endif lxvw4x vs4, o0, AO // load a0, a1 @@ -3486,9 +3762,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs47, vs3, vs11 // a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_2', ` +#else .macro KERNEL2x8_2 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -3526,9 +3810,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs47, vs7, vs19 // a7_r*b1_i, a7_i*b1_i, a1_r*b1_i, a1_i*b1_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_E2', ` +#else .macro KERNEL2x8_E2 +#endif xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r @@ -3550,9 +3842,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs47, vs7, vs19 // a7_r*b1_i, a7_i*b1_i, a1_r*b1_i, a1_i*b1_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_SUBI1', ` +#else .macro KERNEL2x8_SUBI1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -3590,9 +3890,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs47, vs3, vs11 // a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_SUB1', ` +#else .macro KERNEL2x8_SUB1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -3630,9 +3938,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs47, vs3, vs11 // a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE2x8', ` +#else .macro SAVE2x8 +#endif mr T1, CO xxlxor vs24, vs24, vs24 @@ -4170,14 +4486,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 64 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=4 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD2x4_1', ` +#else .macro LOAD2x4_1 +#endif lxvw4x vs0, o0, AO // load a0, a1 lxvw4x vs1, o16, AO // load a2, a3 @@ -4192,9 +4516,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 64 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_I1', ` +#else .macro KERNEL2x4_I1 +#endif lxvw4x vs4, o0, AO // load a0, a1 @@ -4222,9 +4554,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_1', ` +#else .macro KERNEL2x4_1 +#endif lxvw4x vs4, o0, AO // load a0, a1 @@ -4252,9 +4592,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmaddasp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_2', ` +#else .macro KERNEL2x4_2 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -4282,9 +4630,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs5, vs19 // a5_r*b1_i, a5_i*b1_i, a1_r*b1_i, a1_i*b1_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_E2', ` +#else .macro KERNEL2x4_E2 +#endif xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r @@ -4298,9 +4654,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs5, vs19 // a5_r*b1_i, a5_i*b1_i, a1_r*b1_i, a1_i*b1_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_SUBI1', ` +#else .macro KERNEL2x4_SUBI1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -4328,9 +4692,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_SUB1', ` +#else .macro KERNEL2x4_SUB1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -4358,9 +4730,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE2x4', ` +#else .macro SAVE2x4 +#endif mr T1, CO xxlxor vs24, vs24, vs24 @@ -4638,14 +5018,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 32 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=2 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD2x2_1', ` +#else .macro LOAD2x2_1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -4659,9 +5047,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 64 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_I1', ` +#else .macro KERNEL2x2_I1 +#endif lxvw4x vs4, o0, AO // load a0, a1 @@ -4684,9 +5080,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_1', ` +#else .macro KERNEL2x2_1 +#endif lxvw4x vs4, o0, AO // load a0, a1 @@ -4709,9 +5113,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_2', ` +#else .macro KERNEL2x2_2 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -4734,9 +5146,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_E2', ` +#else .macro KERNEL2x2_E2 +#endif xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r @@ -4746,9 +5166,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmaddasp vs35, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_SUBI1', ` +#else .macro KERNEL2x2_SUBI1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -4771,9 +5199,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_SUB1', ` +#else .macro KERNEL2x2_SUB1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -4796,9 +5232,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE2x2', ` +#else .macro SAVE2x2 +#endif mr T1, CO xxlxor vs24, vs24, vs24 @@ -4946,14 +5390,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 16 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=1 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD2x1_1', ` +#else .macro LOAD2x1_1 +#endif lxsspx vs0, o0, AO // load a0_r lxsspx vs1, o4, AO // load a0_i @@ -4968,9 +5420,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 64 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_I1', ` +#else .macro KERNEL2x1_I1 +#endif lxsspx vs4, o0, AO // load a0_r @@ -4998,9 +5458,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs39, vs1, vs10 // a0_i*b1_r +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_1', ` +#else .macro KERNEL2x1_1 +#endif lxsspx vs4, o0, AO // load a0_r @@ -5028,9 +5496,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs39, vs1, vs10 // a0_i*b1_r +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_2', ` +#else .macro KERNEL2x1_2 +#endif lxsspx vs0, o0, AO // load a0_r @@ -5058,9 +5534,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs39, vs5, vs18 // a4_i*b1_r +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_E2', ` +#else .macro KERNEL2x1_E2 +#endif xsmaddadp vs32, vs4, vs16 // a4_r*b0_r @@ -5074,9 +5558,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs39, vs5, vs18 // a4_i*b1_r +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_SUBI1', ` +#else .macro KERNEL2x1_SUBI1 +#endif lxsspx vs0, o0, AO // load a0_r @@ -5104,9 +5596,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs39, vs1, vs10 // a0_i*b1_r +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_SUB1', ` +#else .macro KERNEL2x1_SUB1 +#endif lxsspx vs0, o0, AO // load a0_r @@ -5134,9 +5634,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs39, vs1, vs10 // a0_i*b1_r +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE2x1', ` +#else .macro SAVE2x1 +#endif mr T1, CO xxlxor vs24, vs24, vs24 @@ -5226,14 +5734,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi CO, CO, 8 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=8 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD1x8_1', ` +#else .macro LOAD1x8_1 +#endif lxvw4x vs0, o0, AO // load a0, a1 lxvw4x vs1, o16, AO // load a2, a3 @@ -5247,9 +5763,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_I1', ` +#else .macro KERNEL1x8_I1 +#endif lxvw4x vs4, o0, AO // load a0, a1 @@ -5275,9 +5799,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_1', ` +#else .macro KERNEL1x8_1 +#endif lxvw4x vs4, o0, AO // load a0, a1 @@ -5303,9 +5835,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_2', ` +#else .macro KERNEL1x8_2 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -5331,9 +5871,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs7, vs17 // a7_r*b0_i, a7_i*b0_i, a1_r*b0_i, a1_i*b0_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_E2', ` +#else .macro KERNEL1x8_E2 +#endif xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r @@ -5346,9 +5894,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs7, vs17 // a7_r*b0_i, a7_i*b0_i, a1_r*b0_i, a1_i*b0_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_SUBI1', ` +#else .macro KERNEL1x8_SUBI1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -5374,9 +5930,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_SUB1', ` +#else .macro KERNEL1x8_SUB1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -5402,9 +5966,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE1x8', ` +#else .macro SAVE1x8 +#endif mr T1, CO xxlxor vs24, vs24, vs24 @@ -5676,14 +6248,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 64 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=4 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD1x4_1', ` +#else .macro LOAD1x4_1 +#endif lxvw4x vs0, o0, AO // load a0, a1 lxvw4x vs1, o16, AO // load a2, a3 @@ -5695,9 +6275,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_I1', ` +#else .macro KERNEL1x4_I1 +#endif lxvw4x vs4, o0, AO // load a0, a1 @@ -5717,9 +6305,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_1', ` +#else .macro KERNEL1x4_1 +#endif lxvw4x vs4, o0, AO // load a0, a1 @@ -5739,9 +6335,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_2', ` +#else .macro KERNEL1x4_2 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -5761,9 +6365,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_E2', ` +#else .macro KERNEL1x4_E2 +#endif xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r @@ -5772,9 +6384,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_SUBI1', ` +#else .macro KERNEL1x4_SUBI1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -5794,9 +6414,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_SUB1', ` +#else .macro KERNEL1x4_SUB1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -5816,9 +6444,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE1x4', ` +#else .macro SAVE1x4 +#endif mr T1, CO xxlxor vs24, vs24, vs24 @@ -5960,14 +6596,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 32 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=2 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD1x2_1', ` +#else .macro LOAD1x2_1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -5978,9 +6622,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_I1', ` +#else .macro KERNEL1x2_I1 +#endif lxvw4x vs4, o0, AO // load a0, a1 @@ -5997,9 +6649,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_1', ` +#else .macro KERNEL1x2_1 +#endif lxvw4x vs4, o0, AO // load a0, a1 @@ -6016,9 +6676,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_2', ` +#else .macro KERNEL1x2_2 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -6035,18 +6703,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_E2', ` +#else .macro KERNEL1x2_E2 +#endif xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_SUBI1', ` +#else .macro KERNEL1x2_SUBI1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -6063,9 +6747,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_SUB1', ` +#else .macro KERNEL1x2_SUB1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -6082,9 +6774,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE1x2', ` +#else .macro SAVE1x2 +#endif mr T1, CO xxlxor vs24, vs24, vs24 @@ -6161,14 +6861,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 16 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=1 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD1x1_1', ` +#else .macro LOAD1x1_1 +#endif lxsspx vs0, o0, AO // load a0_r lxsspx vs1, o4, AO // load a0_i @@ -6180,9 +6888,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_I1', ` +#else .macro KERNEL1x1_I1 +#endif lxsspx vs4, o0, AO // load a0_r @@ -6202,9 +6918,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs35, vs1, vs8 // a0_i*b0_r +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_1', ` +#else .macro KERNEL1x1_1 +#endif lxsspx vs4, o0, AO // load a0_r @@ -6224,9 +6948,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs35, vs1, vs8 // a0_i*b0_r +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_2', ` +#else .macro KERNEL1x1_2 +#endif lxsspx vs0, o0, AO // load a0_r @@ -6246,9 +6978,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs35, vs5, vs16 // a4_i*b0_r +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_E2', ` +#else .macro KERNEL1x1_E2 +#endif xsmaddadp vs32, vs4, vs16 // a4_r*b0_r @@ -6257,9 +6997,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs35, vs5, vs16 // a4_i*b0_r +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_SUBI1', ` +#else .macro KERNEL1x1_SUBI1 +#endif lxsspx vs0, o0, AO // load a0_r @@ -6279,9 +7027,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs35, vs1, vs8 // a0_i*b0_r +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_SUB1', ` +#else .macro KERNEL1x1_SUB1 +#endif lxsspx vs0, o0, AO // load a0_r @@ -6301,9 +7057,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xsmaddadp vs35, vs1, vs8 // a0_i*b0_r +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE1x1', ` +#else .macro SAVE1x1 +#endif mr T1, CO xxlxor vs24, vs24, vs24 @@ -6351,5 +7115,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 8 +#if defined(_AIX) +') +#else .endm +#endif diff --git a/kernel/power/cgemm_tcopy_macros_8_power8.S b/kernel/power/cgemm_tcopy_macros_8_power8.S index 03fda2766..64bf8dd99 100644 --- a/kernel/power/cgemm_tcopy_macros_8_power8.S +++ b/kernel/power/cgemm_tcopy_macros_8_power8.S @@ -38,7 +38,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * Macros for N=4 and M=8 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_4x8', ` +#else .macro COPY_4x8 +#endif lxvw4x vs32, o0, A0 lxvw4x vs33, o16, A0 @@ -93,13 +97,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvw4x vs46, o32, T1 stxvw4x vs47, o48, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=4 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_4x4', ` +#else .macro COPY_4x4 +#endif lxvw4x vs32, o0, A0 lxvw4x vs33, o16, A0 @@ -133,13 +145,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvw4x vs38, o32, T1 stxvw4x vs39, o48, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=2 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_4x2', ` +#else .macro COPY_4x2 +#endif lxvw4x vs32, o0, A0 addi A0, A0, 16 @@ -163,13 +183,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvw4x vs35, o48, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=1 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_4x1', ` +#else .macro COPY_4x1 +#endif lxsspx vs32, o0, A0 lxsspx vs33, o4, A0 @@ -207,13 +235,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxsspx vs38, o0, T1 stxsspx vs39, o4, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=8 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_2x8', ` +#else .macro COPY_2x8 +#endif lxvw4x vs32, o0, A0 lxvw4x vs33, o16, A0 @@ -241,13 +277,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvw4x vs38, o32, T1 stxvw4x vs39, o48, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=4 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_2x4', ` +#else .macro COPY_2x4 +#endif lxvw4x vs32, o0, A0 lxvw4x vs33, o16, A0 @@ -265,13 +309,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
stxvw4x vs34, o32, T1 stxvw4x vs35, o48, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=2 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_2x2', ` +#else .macro COPY_2x2 +#endif lxvw4x vs32, o0, A0 addi A0, A0, 16 @@ -285,13 +337,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvw4x vs33, o16, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=1 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_2x1', ` +#else .macro COPY_2x1 +#endif lxsspx vs32, o0, A0 lxsspx vs33, o4, A0 @@ -311,13 +371,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxsspx vs34, o0, T1 stxsspx vs35, o4, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=8 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_1x8', ` +#else .macro COPY_1x8 +#endif lxvw4x vs32, o0, A0 lxvw4x vs33, o16, A0 @@ -332,13 +400,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvw4x vs34, o32, T1 stxvw4x vs35, o48, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=4 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_1x4', ` +#else .macro COPY_1x4 +#endif lxvw4x vs32, o0, A0 lxvw4x vs33, o16, A0 @@ -349,13 +425,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvw4x vs32, o0, T1 stxvw4x vs33, o16, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=2 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_1x2', ` +#else .macro COPY_1x2 +#endif lxvw4x vs32, o0, A0 addi A0, A0, 16 @@ -364,13 +448,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvw4x vs32, o0, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=1 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_1x1', ` +#else .macro COPY_1x1 +#endif lxsspx vs32, o0, A0 lxsspx vs33, o4, A0 @@ -381,5 +473,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxsspx vs32, o0, T1 stxsspx vs33, o4, T1 +#if defined(_AIX) +') +#else .endm +#endif diff --git a/kernel/power/crot.c b/kernel/power/crot.c index 959a9eda0..2a5835546 100644 --- a/kernel/power/crot.c +++ b/kernel/power/crot.c @@ -56,9 +56,9 @@ static void crot_kernel_8 (long n, float *x, float *y, float c, float s) "addi %[x_ptr], %[x_ptr], 64 \n\t" "addi %[y_ptr], %[y_ptr], 64 \n\t" "addic. 
%[temp_n], %[temp_n], -8 \n\t" - "ble 2f \n\t" - ".p2align 5 \n\t" - "1: \n\t" + "ble two%= \n\t" + ".align 5 \n\t" + "one%=: \n\t" "xvmulsp 40, 32, 36 \n\t" // c * x "xvmulsp 41, 33, 36 \n\t" "xvmulsp 42, 34, 36 \n\t" @@ -104,8 +104,8 @@ static void crot_kernel_8 (long n, float *x, float *y, float c, float s) "addi %[x_ptr], %[x_ptr], 128 \n\t" "addi %[y_ptr], %[y_ptr], 128 \n\t" "addic. %[temp_n], %[temp_n], -8 \n\t" - "bgt 1b \n\t" - "2: \n\t" + "bgt one%= \n\t" + "two%=: \n\t" "xvmulsp 40, 32, 36 \n\t" // c * x "xvmulsp 41, 33, 36 \n\t" "xvmulsp 42, 34, 36 \n\t" diff --git a/kernel/power/cswap_microk_power8.c b/kernel/power/cswap_microk_power8.c index 8d7d0c0b9..829800230 100644 --- a/kernel/power/cswap_microk_power8.c +++ b/kernel/power/cswap_microk_power8.c @@ -39,8 +39,8 @@ static void cswap_kernel_32 (long n, float *x, float *y) { __asm__ ( - ".p2align 5 \n" - "1: \n\t" + ".align 5 \n" + "one%=: \n\t" "lxvd2x 32, 0, %4 \n\t" "lxvd2x 33, %5, %4 \n\t" @@ -131,7 +131,7 @@ static void cswap_kernel_32 (long n, float *x, float *y) "addi %4, %4, 128 \n\t" "addic. %2, %2, -32 \n\t" - "bgt 1b \n" + "bgt one%= \n" "#n=%2 x=%0=%3 y=%1=%4 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11" : diff --git a/kernel/power/ctrmm_macros_8x4_power8.S b/kernel/power/ctrmm_macros_8x4_power8.S index 48a21252c..922cab57a 100644 --- a/kernel/power/ctrmm_macros_8x4_power8.S +++ b/kernel/power/ctrmm_macros_8x4_power8.S @@ -83,7 +83,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * Macros for N=4 and M=8 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD4x8_1', ` +#else .macro LOAD4x8_1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -113,9 +117,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x8_I1', ` +#else .macro KERNEL4x8_I1 +#endif lxvw4x vs4, o0, AO // load a0, a1 @@ -184,9 +196,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs63, vs3, vs15 // a3_r*b3_i, a3_i*b3_i, a1_r*b3_i, a1_i*b3_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x8_1', ` +#else .macro KERNEL4x8_1 +#endif lxvw4x vs4, o0, AO // load a0, a1 @@ -255,9 +275,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs63, vs3, vs15 // a3_r*b3_i, a3_i*b3_i, a1_r*b3_i, a1_i*b3_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x8_2', ` +#else .macro KERNEL4x8_2 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -326,9 +354,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs63, vs7, vs23 // a7_r*b3_i, a7_i*b3_i, a1_r*b3_i, a1_i*b3_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x8_E2', ` +#else .macro KERNEL4x8_E2 +#endif xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r @@ -368,9 +404,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs63, vs7, vs23 // a7_r*b3_i, a7_i*b3_i, a1_r*b3_i, a1_i*b3_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x8_SUBI1', ` +#else .macro KERNEL4x8_SUBI1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -439,9 +483,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmulsp vs63, vs3, vs15 // a3_r*b3_i, a3_i*b3_i, a1_r*b3_i, a1_i*b3_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x8_SUB1', ` +#else .macro KERNEL4x8_SUB1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -510,9 +562,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs63, vs3, vs15 // a3_r*b3_i, a3_i*b3_i, a1_r*b3_i, a1_i*b3_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE4x8', ` +#else .macro SAVE4x8 +#endif mr T1, CO @@ -1597,14 +1657,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 64 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=4 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD4x4_1', ` +#else .macro LOAD4x4_1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -1630,9 +1698,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x4_I1', ` +#else .macro KERNEL4x4_I1 +#endif lxvw4x vs4, o0, AO // load a0, a1 @@ -1681,9 +1757,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs47, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x4_1', ` +#else .macro KERNEL4x4_1 +#endif lxvw4x vs4, o0, AO // load a0, a1 @@ -1732,9 +1816,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs47, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x4_2', ` +#else .macro KERNEL4x4_2 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -1783,9 +1875,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs47, vs5, vs23 // a5_r*b3_i, a5_i*b3_i, a1_r*b3_i, a1_i*b3_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x4_E2', ` +#else .macro KERNEL4x4_E2 +#endif xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r @@ -1809,9 +1909,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs47, vs5, vs23 // a5_r*b3_i, a5_i*b3_i, a1_r*b3_i, a1_i*b3_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x4_SUBI1', ` +#else .macro KERNEL4x4_SUBI1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -1860,9 +1968,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs47, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x4_SUB1', ` +#else .macro KERNEL4x4_SUB1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -1911,9 +2027,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs47, vs1, vs15 // a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE4x4', ` +#else .macro SAVE4x4 +#endif mr T1, CO @@ -2470,14 +2594,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi CO, CO, 32 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=2 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD4x2_1', ` +#else .macro LOAD4x2_1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -2501,9 +2633,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x2_I1', ` +#else .macro KERNEL4x2_I1 +#endif lxvw4x vs4, o0, AO // load a0, a1 @@ -2542,9 +2682,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs39, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x2_1', ` +#else .macro KERNEL4x2_1 +#endif lxvw4x vs4, o0, AO // load a0, a1 @@ -2583,9 +2731,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x2_2', ` +#else .macro KERNEL4x2_2 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -2624,9 +2780,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs4, vs23 // a4_r*b3_i, a4_i*b3_i, a1_r*b3_i, a1_i*b3_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x2_E2', ` +#else .macro KERNEL4x2_E2 +#endif xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r @@ -2642,9 +2806,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs4, vs23 // a4_r*b3_i, a4_i*b3_i, a1_r*b3_i, a1_i*b3_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x2_SUBI1', ` +#else .macro KERNEL4x2_SUBI1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -2683,9 +2855,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs39, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x2_SUB1', ` +#else .macro KERNEL4x2_SUB1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -2724,9 +2904,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs0, vs15 // a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE4x2', ` +#else .macro SAVE4x2 +#endif mr T1, CO @@ -3019,14 +3207,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 16 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=1 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD4x1_1', ` +#else .macro LOAD4x1_1 +#endif lxsspx vs0, o0, AO // load a0_r lxsspx vs1, o4, AO // load a0_i @@ -3055,9 +3251,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x1_I1', ` +#else .macro KERNEL4x1_I1 +#endif lxsspx vs4, o0, AO // load a0_r @@ -3109,9 +3313,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xsmuldp vs47, vs1, vs14 // a0_i*b3_r +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x1_1', ` +#else .macro KERNEL4x1_1 +#endif lxsspx vs4, o0, AO // load a0_r @@ -3163,9 +3375,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs47, vs1, vs14 // a0_i*b3_r +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x1_2', ` +#else .macro KERNEL4x1_2 +#endif lxsspx vs0, o0, AO // load a0_r @@ -3217,9 +3437,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs47, vs5, vs22 // a4_i*b3_r +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x1_E2', ` +#else .macro KERNEL4x1_E2 +#endif xsmaddadp vs32, vs4, vs16 // a4_r*b0_r @@ -3243,9 +3471,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs47, vs5, vs22 // a4_i*b3_r +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x1_SUBI1', ` +#else .macro KERNEL4x1_SUBI1 +#endif lxsspx vs0, o0, AO // load a0_r @@ -3297,9 +3533,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs47, vs1, vs14 // a0_i*b3_r +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x1_SUB1', ` +#else .macro KERNEL4x1_SUB1 +#endif lxsspx vs0, o0, AO // load a0_r @@ -3351,9 +3595,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs47, vs1, vs14 // a0_i*b3_r +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE4x1', ` +#else .macro SAVE4x1 +#endif mr T1, CO @@ -3526,14 +3778,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 8 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=8 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD2x8_1', ` +#else .macro LOAD2x8_1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -3556,9 +3816,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_I1', ` +#else .macro KERNEL2x8_I1 +#endif lxvw4x vs4, o0, AO // load a0, a1 @@ -3602,9 +3870,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs47, vs3, vs11 // a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_1', ` +#else .macro KERNEL2x8_1 +#endif lxvw4x vs4, o0, AO // load a0, a1 @@ -3648,9 +3924,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs47, vs3, vs11 // a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_2', ` +#else .macro KERNEL2x8_2 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -3694,9 +3978,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs47, vs7, vs19 // a7_r*b1_i, a7_i*b1_i, a1_r*b1_i, a1_i*b1_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_E2', ` +#else .macro KERNEL2x8_E2 +#endif xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r @@ -3718,9 +4010,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmaddasp vs47, vs7, vs19 // a7_r*b1_i, a7_i*b1_i, a1_r*b1_i, a1_i*b1_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_SUBI1', ` +#else .macro KERNEL2x8_SUBI1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -3764,9 +4064,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs47, vs3, vs11 // a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_SUB1', ` +#else .macro KERNEL2x8_SUB1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -3810,9 +4118,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs47, vs3, vs11 // a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE2x8', ` +#else .macro SAVE2x8 +#endif mr T1, CO @@ -4357,14 +4673,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 64 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=4 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD2x4_1', ` +#else .macro LOAD2x4_1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -4383,9 +4707,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_I1', ` +#else .macro KERNEL2x4_I1 +#endif lxvw4x vs4, o0, AO // load a0, a1 @@ -4417,9 +4749,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_1', ` +#else .macro KERNEL2x4_1 +#endif lxvw4x vs4, o0, AO // load a0, a1 @@ -4451,9 +4791,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_2', ` +#else .macro KERNEL2x4_2 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -4485,9 +4833,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs5, vs19 // a5_r*b1_i, a5_i*b1_i, a1_r*b1_i, a1_i*b1_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_E2', ` +#else .macro KERNEL2x4_E2 +#endif xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r @@ -4501,9 +4857,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs5, vs19 // a5_r*b1_i, a5_i*b1_i, a1_r*b1_i, a1_i*b1_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_SUBI1', ` +#else .macro KERNEL2x4_SUBI1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -4535,9 +4899,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_SUB1', ` +#else .macro KERNEL2x4_SUB1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -4569,9 +4941,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmaddasp vs39, vs1, vs11 // a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE2x4', ` +#else .macro SAVE2x4 +#endif mr T1, CO @@ -4852,14 +5232,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 32 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=2 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD2x2_1', ` +#else .macro LOAD2x2_1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -4876,9 +5264,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_I1', ` +#else .macro KERNEL2x2_I1 +#endif lxvw4x vs4, o0, AO // load a0, a1 @@ -4904,9 +5300,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_1', ` +#else .macro KERNEL2x2_1 +#endif lxvw4x vs4, o0, AO // load a0, a1 @@ -4932,9 +5336,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_2', ` +#else .macro KERNEL2x2_2 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -4960,9 +5372,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_E2', ` +#else .macro KERNEL2x2_E2 +#endif xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r @@ -4972,9 +5392,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs4, vs19 // a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_SUBI1', ` +#else .macro KERNEL2x2_SUBI1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -5000,9 +5428,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_SUB1', ` +#else .macro KERNEL2x2_SUB1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -5028,9 +5464,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs0, vs11 // a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE2x2', ` +#else .macro SAVE2x2 +#endif mr T1, CO @@ -5179,14 +5623,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 16 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=1 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD2x1_1', ` +#else .macro LOAD2x1_1 +#endif lxsspx vs0, o0, AO // load a0_r lxsspx vs1, o4, AO // load a0_i @@ -5205,9 +5657,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi BO, BO, 16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_I1', ` +#else .macro KERNEL2x1_I1 +#endif lxsspx vs4, o0, AO // load a0_r @@ -5239,9 +5699,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs39, vs1, vs10 // a0_i*b1_r +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_1', ` +#else .macro KERNEL2x1_1 +#endif lxsspx vs4, o0, AO // load a0_r @@ -5273,9 +5741,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs39, vs1, vs10 // a0_i*b1_r +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_2', ` +#else .macro KERNEL2x1_2 +#endif lxsspx vs0, o0, AO // load a0_r @@ -5307,9 +5783,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs39, vs5, vs18 // a4_i*b1_r +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_E2', ` +#else .macro KERNEL2x1_E2 +#endif xsmaddadp vs32, vs4, vs16 // a4_r*b0_r @@ -5323,9 +5807,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs39, vs5, vs18 // a4_i*b1_r +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_SUBI1', ` +#else .macro KERNEL2x1_SUBI1 +#endif lxsspx vs0, o0, AO // load a0_r @@ -5357,9 +5849,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs39, vs1, vs10 // a0_i*b1_r +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_SUB1', ` +#else .macro KERNEL2x1_SUB1 +#endif lxsspx vs0, o0, AO // load a0_r @@ -5391,9 +5891,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs39, vs1, vs10 // a0_i*b1_r +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE2x1', ` +#else .macro SAVE2x1 +#endif mr T1, CO @@ -5482,14 +5990,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 8 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=8 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD1x8_1', ` +#else .macro LOAD1x8_1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -5514,9 +6030,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_I1', ` +#else .macro KERNEL1x8_I1 +#endif lxvw4x vs4, o0, AO // load a0, a1 @@ -5553,9 +6077,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_1', ` +#else .macro KERNEL1x8_1 +#endif lxvw4x vs4, o0, AO // load a0, a1 @@ -5592,9 +6124,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_2', ` +#else .macro KERNEL1x8_2 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -5631,9 +6171,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmaddasp vs39, vs7, vs17 // a7_r*b0_i, a7_i*b0_i, a1_r*b0_i, a1_i*b0_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_E2', ` +#else .macro KERNEL1x8_E2 +#endif xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r @@ -5646,9 +6194,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs7, vs17 // a7_r*b0_i, a7_i*b0_i, a1_r*b0_i, a1_i*b0_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_SUBI1', ` +#else .macro KERNEL1x8_SUBI1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -5685,9 +6241,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_SUB1', ` +#else .macro KERNEL1x8_SUB1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -5724,9 +6288,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs3, vs9 // a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE1x8', ` +#else .macro SAVE1x8 +#endif mr T1, CO @@ -6001,14 +6573,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 64 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=4 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD1x4_1', ` +#else .macro LOAD1x4_1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -6029,9 +6609,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_I1', ` +#else .macro KERNEL1x4_I1 +#endif lxvw4x vs4, o0, AO // load a0, a1 @@ -6060,9 +6648,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_1', ` +#else .macro KERNEL1x4_1 +#endif lxvw4x vs4, o0, AO // load a0, a1 @@ -6091,9 +6687,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_2', ` +#else .macro KERNEL1x4_2 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -6122,9 +6726,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_E2', ` +#else .macro KERNEL1x4_E2 +#endif xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r @@ -6133,9 +6745,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs5, vs17 // a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_SUBI1', ` +#else .macro KERNEL1x4_SUBI1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -6164,9 +6784,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmulsp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_SUB1', ` +#else .macro KERNEL1x4_SUB1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -6195,9 +6823,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs1, vs9 // a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE1x4', ` +#else .macro SAVE1x4 +#endif mr T1, CO @@ -6340,14 +6976,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 32 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=2 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD1x2_1', ` +#else .macro LOAD1x2_1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -6366,9 +7010,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_I1', ` +#else .macro KERNEL1x2_I1 +#endif lxvw4x vs4, o0, AO // load a0, a1 @@ -6393,9 +7045,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_1', ` +#else .macro KERNEL1x2_1 +#endif lxvw4x vs4, o0, AO // load a0, a1 @@ -6420,9 +7080,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_2', ` +#else .macro KERNEL1x2_2 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -6447,18 +7115,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_E2', ` +#else .macro KERNEL1x2_E2 +#endif xvmaddasp vs32, vs4, vs16 // a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r xvmaddasp vs33, vs4, vs17 // a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_SUBI1', ` +#else .macro KERNEL1x2_SUBI1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -6483,9 +7167,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_SUB1', ` +#else .macro KERNEL1x2_SUB1 +#endif lxvw4x vs0, o0, AO // load a0, a1 @@ -6510,9 +7202,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs33, vs0, vs9 // a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE1x2', ` +#else .macro SAVE1x2 +#endif mr T1, CO @@ -6589,14 +7289,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi CO, CO, 16 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=1 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD1x1_1', ` +#else .macro LOAD1x1_1 +#endif lxsspx vs0, o0, AO // load a0_r lxsspx vs1, o4, AO // load a0_i @@ -6610,9 +7318,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_I1', ` +#else .macro KERNEL1x1_I1 +#endif lxsspx vs4, o0, AO // load a0_r @@ -6634,9 +7350,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs35, vs1, vs8 // a0_i*b0_r +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_1', ` +#else .macro KERNEL1x1_1 +#endif lxsspx vs4, o0, AO // load a0_r @@ -6658,9 +7382,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs35, vs1, vs8 // a0_i*b0_r +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_2', ` +#else .macro KERNEL1x1_2 +#endif lxsspx vs0, o0, AO // load a0_r @@ -6682,9 +7414,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs35, vs5, vs16 // a4_i*b0_r +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_E2', ` +#else .macro KERNEL1x1_E2 +#endif xsmaddadp vs32, vs4, vs16 // a4_r*b0_r @@ -6693,9 +7433,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs35, vs5, vs16 // a4_i*b0_r +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_SUBI1', ` +#else .macro KERNEL1x1_SUBI1 +#endif lxsspx vs0, o0, AO // load a0_r @@ -6717,9 +7465,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs35, vs1, vs8 // a0_i*b0_r +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_SUB1', ` +#else .macro KERNEL1x1_SUB1 +#endif lxsspx vs0, o0, AO // load a0_r @@ -6741,9 +7497,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs35, vs1, vs8 // a0_i*b0_r +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE1x1', ` +#else .macro SAVE1x1 +#endif mr T1, CO @@ -6790,5 +7554,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 8 +#if defined(_AIX) +') +#else .endm +#endif diff --git a/kernel/power/dasum_microk_power8.c b/kernel/power/dasum_microk_power8.c index 880d7d271..4652fc57c 100644 --- a/kernel/power/dasum_microk_power8.c +++ b/kernel/power/dasum_microk_power8.c @@ -68,10 +68,10 @@ static double dasum_kernel_16 (long n, double *x) "addi %2, %2, 128 \n\t" "addic. 
%1, %1, -16 \n\t" - "ble 2f \n\t" + "ble two%= \n\t" - ".p2align 5 \n" - "1: \n\t" + ".align 5 \n" + "one%=: \n\t" "xvabsdp 48, 40 \n\t" "xvabsdp 49, 41 \n\t" @@ -108,9 +108,9 @@ static double dasum_kernel_16 (long n, double *x) "xvadddp 38, 38, %x5 \n\t" "xvadddp 39, 39, %x6 \n\t" - "bgt 1b \n" + "bgt one%= \n" - "2: \n\t" + "two%=: \n\t" "xvabsdp 48, 40 \n\t" "xvabsdp 49, 41 \n\t" @@ -140,7 +140,7 @@ static double dasum_kernel_16 (long n, double *x) "xvadddp 32, 32, 36 \n\t" - "xxswapd 33, 32 \n\t" + XXSWAPD_S(33,32) "xsadddp %x0, 32, 33 \n" "#n=%1 x=%3=%2 sum=%0 o16=%8 o32=%9 o48=%10 o64=%11 o80=%12 o96=%13 o112=%14\n" diff --git a/kernel/power/daxpy_microk_power8.c b/kernel/power/daxpy_microk_power8.c index fb714a3f9..a92026e83 100644 --- a/kernel/power/daxpy_microk_power8.c +++ b/kernel/power/daxpy_microk_power8.c @@ -58,7 +58,7 @@ static void daxpy_kernel_8 (long n, double *x, double *y, double alpha) __asm__ ( - "xxspltd %x4, %x22, 0 \n\t" + XXSPLTD_S(%x4,%x22,0) "dcbt 0, %2 \n\t" "dcbt 0, %3 \n\t" @@ -90,10 +90,10 @@ static void daxpy_kernel_8 (long n, double *x, double *y, double alpha) "addi %3, %3, -64 \n\t" "addic. %1, %1, -16 \n\t" - "ble 2f \n\t" + "ble two%= \n\t" ".align 5 \n" - "1: \n\t" + "one%=: \n\t" "xvmaddadp %x13, %x5, %x4 \n\t" "xvmaddadp %x14, %x6, %x4 \n\t" @@ -152,9 +152,9 @@ static void daxpy_kernel_8 (long n, double *x, double *y, double alpha) "addi %3, %3, -64 \n\t" "addic. %1, %1, -16 \n\t" - "bgt 1b \n" + "bgt one%= \n" - "2: \n\t" + "two%=: \n\t" "xvmaddadp %x13, %x5, %x4 \n\t" "xvmaddadp %x14, %x6, %x4 \n\t" diff --git a/kernel/power/dcopy_microk_power8.c b/kernel/power/dcopy_microk_power8.c index 261dc04de..b51a21d08 100644 --- a/kernel/power/dcopy_microk_power8.c +++ b/kernel/power/dcopy_microk_power8.c @@ -62,10 +62,10 @@ static void dcopy_kernel_32 (long n, double *x, double *y) "addi %2, %2, 128 \n\t" "addic. %1, %1, -32 \n\t" - "ble 2f \n\t" + "ble two%= \n\t" - ".p2align 5 \n" - "1: \n\t" + ".align 5 \n" + "one%=: \n\t" "stxvd2x 32, 0, %3 \n\t" "stxvd2x 33, %5, %3 \n\t" @@ -108,9 +108,9 @@ static void dcopy_kernel_32 (long n, double *x, double *y) "addi %2, %2, 128 \n\t" "addic. %1, %1, -32 \n\t" - "bgt 1b \n" + "bgt one%= \n" - "2: \n\t" + "two%=: \n\t" "stxvd2x 32, 0, %3 \n\t" "stxvd2x 33, %5, %3 \n\t" diff --git a/kernel/power/ddot_microk_power8.c b/kernel/power/ddot_microk_power8.c index 4e6bc29c9..d2518ef7e 100644 --- a/kernel/power/ddot_microk_power8.c +++ b/kernel/power/ddot_microk_power8.c @@ -78,10 +78,10 @@ static double ddot_kernel_8 (long n, double *x, double *y) "addi %3, %3, 128 \n\t" "addic. %1, %1, -16 \n\t" - "ble 2f \n\t" + "ble two%= \n\t" - ".p2align 5 \n" - "1: \n\t" + ".align 5 \n" + "one%=: \n\t" "xvmaddadp 32, 40, 48 \n\t" "lxvd2x 40, 0, %2 \n\t" @@ -112,9 +112,9 @@ static double ddot_kernel_8 (long n, double *x, double *y) "addi %3, %3, 128 \n\t" "addic. %1, %1, -16 \n\t" - "bgt 1b \n" + "bgt one%= \n" - "2: \n\t" + "two%=: \n\t" "xvmaddadp 32, 40, 48 \n\t" "xvmaddadp 33, 41, 49 \n\t" @@ -135,7 +135,7 @@ static double ddot_kernel_8 (long n, double *x, double *y) "xvadddp 32, 32, 36 \n\t" - "xxswapd 33, 32 \n\t" + XXSWAPD_S(33,32) "xsadddp %x0, 32, 33 \n" diff --git a/kernel/power/dgemm_macros_16x4_power8.S b/kernel/power/dgemm_macros_16x4_power8.S index 5be517f7c..782425fbd 100644 --- a/kernel/power/dgemm_macros_16x4_power8.S +++ b/kernel/power/dgemm_macros_16x4_power8.S @@ -37,7 +37,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* Macros for N=4, M=16 * *********************************************************************/ +#if defined(_AIX) +define(`LOAD4x16_1', ` +#else .macro LOAD4x16_1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -58,10 +62,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 128 addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x16_I1', ` +#else .macro KERNEL4x16_I1 +#endif xvmuldp vs32, vs0, vs24 xvmuldp vs33, vs1, vs24 @@ -125,11 +137,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 128 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x16_1', ` +#else .macro KERNEL4x16_1 +#endif xvmaddadp vs32, vs0, vs24 xvmaddadp vs33, vs1, vs24 @@ -194,9 +214,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 128 addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x16_2', ` +#else .macro KERNEL4x16_2 +#endif xvmaddadp vs32, vs8, vs28 xvmaddadp vs33, vs9, vs28 @@ -260,9 +288,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 128 addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x16_L1', ` +#else .macro KERNEL4x16_L1 +#endif xvmaddadp vs32, vs0, vs24 xvmaddadp vs33, vs1, vs24 @@ -326,9 +362,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 128 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x16_L2', ` +#else .macro KERNEL4x16_L2 +#endif xvmaddadp vs32, vs8, vs28 xvmaddadp vs33, vs9, vs28 @@ -392,10 +436,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs63, vs15, vs31 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x16_E2', ` +#else .macro KERNEL4x16_E2 +#endif xvmaddadp vs32, vs8, vs28 @@ -434,9 +486,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs62, vs14, vs31 xvmaddadp vs63, vs15, vs31 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x16_SUBI1', ` +#else .macro KERNEL4x16_SUBI1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -495,9 +555,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs62, vs6, vs27 xvmuldp vs63, vs7, vs27 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x16_SUB1', ` +#else .macro KERNEL4x16_SUB1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -555,9 +623,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs62, vs6, vs27 xvmaddadp vs63, vs7, vs27 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE4x16', ` +#else .macro SAVE4x16 +#endif add T2, CO, LDC @@ -680,13 +756,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvd2x vs39, o112, T4 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************* * Macros for N=4, M=8 * *********************************************************************/ +#if defined(_AIX) +define(`LOAD4x8_1', ` +#else .macro LOAD4x8_1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -703,9 +787,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi AO, AO, 64 addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x8_I1', ` +#else .macro KERNEL4x8_I1 +#endif xvmuldp vs32, vs0, vs24 xvmuldp vs33, vs1, vs24 @@ -744,9 +836,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x8_1', ` +#else .macro KERNEL4x8_1 +#endif xvmaddadp vs32, vs0, vs24 xvmaddadp vs33, vs1, vs24 @@ -784,9 +884,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x8_2', ` +#else .macro KERNEL4x8_2 +#endif xvmaddadp vs32, vs8, vs28 xvmaddadp vs33, vs9, vs28 @@ -824,9 +932,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x8_E2', ` +#else .macro KERNEL4x8_E2 +#endif xvmaddadp vs32, vs8, vs28 @@ -849,9 +965,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs58, vs10, vs31 xvmaddadp vs59, vs11, vs31 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x8_SUBI1', ` +#else .macro KERNEL4x8_SUBI1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -887,9 +1011,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs58, vs2, vs27 xvmuldp vs59, vs3, vs27 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x8_SUB1', ` +#else .macro KERNEL4x8_SUB1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -925,9 +1057,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs58, vs2, vs27 xvmaddadp vs59, vs3, vs27 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE4x8', ` +#else .macro SAVE4x8 +#endif mr T1, CO @@ -1035,13 +1175,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 64 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************* * Macros for N=4, M=4 * *********************************************************************/ +#if defined(_AIX) +define(`LOAD4x4_1', ` +#else .macro LOAD4x4_1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -1054,9 +1202,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 32 addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x4_I1', ` +#else .macro KERNEL4x4_I1 +#endif lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO @@ -1082,9 +1238,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs56, vs0, vs27 xvmuldp vs57, vs1, vs27 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x4_1', ` +#else .macro KERNEL4x4_1 +#endif lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO @@ -1110,9 +1274,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs56, vs0, vs27 xvmaddadp vs57, vs1, vs27 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x4_2', ` +#else .macro KERNEL4x4_2 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -1138,9 +1310,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmaddadp vs56, vs8, vs31 xvmaddadp vs57, vs9, vs31 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x4_E2', ` +#else .macro KERNEL4x4_E2 +#endif xvmaddadp vs32, vs8, vs28 @@ -1155,9 +1335,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs56, vs8, vs31 xvmaddadp vs57, vs9, vs31 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x4_SUBI1', ` +#else .macro KERNEL4x4_SUBI1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -1183,9 +1371,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs56, vs0, vs27 xvmuldp vs57, vs1, vs27 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x4_SUB1', ` +#else .macro KERNEL4x4_SUB1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -1211,9 +1407,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs56, vs0, vs27 xvmaddadp vs57, vs1, vs27 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE4x4', ` +#else .macro SAVE4x4 +#endif mr T1, CO @@ -1289,13 +1493,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 32 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************* * Macros for N=4, M=2 * *********************************************************************/ +#if defined(_AIX) +define(`LOAD4x2_1', ` +#else .macro LOAD4x2_1 +#endif lxvd2x vs0, 0, AO @@ -1307,9 +1519,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 16 addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x2_I1', ` +#else .macro KERNEL4x2_I1 +#endif lxvd2x vs8, 0, AO @@ -1330,9 +1550,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs56, vs0, vs27 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x2_1', ` +#else .macro KERNEL4x2_1 +#endif lxvd2x vs8, 0, AO @@ -1353,9 +1581,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs56, vs0, vs27 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x2_2', ` +#else .macro KERNEL4x2_2 +#endif lxvd2x vs0, 0, AO @@ -1376,9 +1612,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs56, vs8, vs31 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x2_E2', ` +#else .macro KERNEL4x2_E2 +#endif xvmaddadp vs32, vs8, vs28 @@ -1389,9 +1633,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs56, vs8, vs31 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x2_SUBI1', ` +#else .macro KERNEL4x2_SUBI1 +#endif lxvd2x vs0, 0, AO @@ -1412,9 +1664,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs56, vs0, vs27 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x2_SUB1', ` +#else .macro KERNEL4x2_SUB1 +#endif lxvd2x vs0, 0, AO @@ -1435,9 +1695,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs56, vs0, vs27 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE4x2', ` +#else .macro SAVE4x2 +#endif mr T1, CO @@ -1497,13 +1765,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi CO, CO, 16 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************* * Macros for N=4, M=1 * *********************************************************************/ +#if defined(_AIX) +define(`LOAD4x1_1', ` +#else .macro LOAD4x1_1 +#endif lxsdx vs0, 0, AO @@ -1515,9 +1791,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 8 addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x1_I1', ` +#else .macro KERNEL4x1_I1 +#endif lxsdx vs8, 0, AO @@ -1538,9 +1822,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs56, vs0, vs27 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x1_1', ` +#else .macro KERNEL4x1_1 +#endif lxsdx vs8, 0, AO @@ -1561,9 +1853,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs56, vs0, vs27 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x1_2', ` +#else .macro KERNEL4x1_2 +#endif lxsdx vs0, 0, AO @@ -1584,9 +1884,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs56, vs8, vs31 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x1_E2', ` +#else .macro KERNEL4x1_E2 +#endif xsmaddadp vs32, vs8, vs28 @@ -1597,9 +1905,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs56, vs8, vs31 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x1_SUBI1', ` +#else .macro KERNEL4x1_SUBI1 +#endif lxsdx vs0, 0, AO @@ -1620,9 +1936,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs56, vs0, vs27 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x1_SUB1', ` +#else .macro KERNEL4x1_SUB1 +#endif lxsdx vs0, 0, AO @@ -1643,9 +1967,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs56, vs0, vs27 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE4x1', ` +#else .macro SAVE4x1 +#endif mr T1, CO @@ -1705,13 +2037,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 8 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************* * Macros for N=2, M=16 * *********************************************************************/ +#if defined(_AIX) +define(`LOAD2x16_1', ` +#else .macro LOAD2x16_1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -1731,9 +2071,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x16_I1', ` +#else .macro KERNEL2x16_I1 +#endif lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO @@ -1772,9 +2120,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs46, vs6, vs25 xvmuldp vs47, vs7, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x16_1', ` +#else .macro KERNEL2x16_1 +#endif lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO @@ -1813,9 +2169,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmaddadp vs46, vs6, vs25 xvmaddadp vs47, vs7, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x16_2', ` +#else .macro KERNEL2x16_2 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -1854,9 +2218,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs46, vs14, vs29 xvmaddadp vs47, vs15, vs29 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x16_E2', ` +#else .macro KERNEL2x16_E2 +#endif xvmaddadp vs32, vs8, vs28 @@ -1877,9 +2249,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs46, vs14, vs29 xvmaddadp vs47, vs15, vs29 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x16_SUBI1', ` +#else .macro KERNEL2x16_SUBI1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -1918,9 +2298,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs46, vs6, vs25 xvmuldp vs47, vs7, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x16_SUB1', ` +#else .macro KERNEL2x16_SUB1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -1959,9 +2347,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs46, vs6, vs25 xvmaddadp vs47, vs7, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE2x16', ` +#else .macro SAVE2x16 +#endif mr T1, CO addi T2, T1, 64 @@ -2055,13 +2451,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 128 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************* * Macros for N=4, M=8 * *********************************************************************/ +#if defined(_AIX) +define(`LOAD2x8_1', ` +#else .macro LOAD2x8_1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -2074,9 +2478,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 addi BO, BO, 16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_I1', ` +#else .macro KERNEL2x8_I1 +#endif lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO @@ -2100,9 +2512,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs42, vs2, vs25 xvmuldp vs43, vs3, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_1', ` +#else .macro KERNEL2x8_1 +#endif lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO @@ -2126,9 +2546,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs42, vs2, vs25 xvmaddadp vs43, vs3, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_2', ` +#else .macro KERNEL2x8_2 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -2152,9 +2580,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs42, vs10, vs29 xvmaddadp vs43, vs11, vs29 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_E2', ` +#else .macro KERNEL2x8_E2 +#endif xvmaddadp vs32, vs8, vs28 @@ -2167,9 +2603,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs42, vs10, vs29 xvmaddadp vs43, vs11, vs29 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_SUBI1', ` +#else .macro KERNEL2x8_SUBI1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -2193,9 +2637,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmuldp vs42, vs2, vs25 xvmuldp vs43, vs3, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_SUB1', ` +#else .macro KERNEL2x8_SUB1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -2219,9 +2671,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs42, vs2, vs25 xvmaddadp vs43, vs3, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE2x8', ` +#else .macro SAVE2x8 +#endif mr T1, CO @@ -2277,13 +2737,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 64 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************* * Macros for N=2, M=4 * *********************************************************************/ +#if defined(_AIX) +define(`LOAD2x4_1', ` +#else .macro LOAD2x4_1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -2294,9 +2762,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 32 addi BO, BO, 16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_I1', ` +#else .macro KERNEL2x4_I1 +#endif lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO @@ -2314,9 +2790,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs40, vs0, vs25 xvmuldp vs41, vs1, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_1', ` +#else .macro KERNEL2x4_1 +#endif lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO @@ -2334,9 +2818,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs40, vs0, vs25 xvmaddadp vs41, vs1, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_2', ` +#else .macro KERNEL2x4_2 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -2354,9 +2846,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs40, vs8, vs29 xvmaddadp vs41, vs9, vs29 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_E2', ` +#else .macro KERNEL2x4_E2 +#endif xvmaddadp vs32, vs8, vs28 @@ -2365,9 +2865,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs40, vs8, vs29 xvmaddadp vs41, vs9, vs29 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_SUBI1', ` +#else .macro KERNEL2x4_SUBI1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -2385,9 +2893,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs40, vs0, vs25 xvmuldp vs41, vs1, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_SUB1', ` +#else .macro KERNEL2x4_SUB1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -2405,9 +2921,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs40, vs0, vs25 xvmaddadp vs41, vs1, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE2x4', ` +#else .macro SAVE2x4 +#endif mr T1, CO @@ -2447,13 +2971,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 32 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************* * Macros for N=2, M=2 * *********************************************************************/ +#if defined(_AIX) +define(`LOAD2x2_1', ` +#else .macro LOAD2x2_1 +#endif lxvd2x vs0, 0, AO @@ -2463,9 +2995,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi AO, AO, 16 addi BO, BO, 16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_I1', ` +#else .macro KERNEL2x2_I1 +#endif lxvd2x vs8, 0, AO @@ -2480,9 +3020,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs40, vs0, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_1', ` +#else .macro KERNEL2x2_1 +#endif lxvd2x vs8, 0, AO @@ -2497,9 +3045,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs40, vs0, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_2', ` +#else .macro KERNEL2x2_2 +#endif lxvd2x vs0, 0, AO @@ -2514,18 +3070,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs40, vs8, vs29 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_E2', ` +#else .macro KERNEL2x2_E2 +#endif xvmaddadp vs32, vs8, vs28 xvmaddadp vs40, vs8, vs29 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_SUBI1', ` +#else .macro KERNEL2x2_SUBI1 +#endif lxvd2x vs0, 0, AO @@ -2540,9 +3112,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs40, vs0, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_SUB1', ` +#else .macro KERNEL2x2_SUB1 +#endif lxvd2x vs0, 0, AO @@ -2557,9 +3137,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs40, vs0, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE2x2', ` +#else .macro SAVE2x2 +#endif mr T1, CO @@ -2591,13 +3179,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 16 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************* * Macros for N=2, M=1 * *********************************************************************/ +#if defined(_AIX) +define(`LOAD2x1_1', ` +#else .macro LOAD2x1_1 +#endif lxsdx vs0, 0, AO @@ -2607,9 +3203,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 8 addi BO, BO, 16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_I1', ` +#else .macro KERNEL2x1_I1 +#endif lxsdx vs8, 0, AO @@ -2624,9 +3228,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs40, vs0, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_1', ` +#else .macro KERNEL2x1_1 +#endif lxsdx vs8, 0, AO @@ -2641,9 +3253,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs40, vs0, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_2', ` +#else .macro KERNEL2x1_2 +#endif lxsdx vs0, 0, AO @@ -2658,18 +3278,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs40, vs8, vs29 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_E2', ` +#else .macro KERNEL2x1_E2 +#endif xsmaddadp vs32, vs8, vs28 xsmaddadp vs40, vs8, vs29 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_SUBI1', ` +#else .macro KERNEL2x1_SUBI1 +#endif lxsdx vs0, 0, AO @@ -2684,9 +3320,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xsmuldp vs40, vs0, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_SUB1', ` +#else .macro KERNEL2x1_SUB1 +#endif lxsdx vs0, 0, AO @@ -2701,9 +3345,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs40, vs0, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE2x1', ` +#else .macro SAVE2x1 +#endif mr T1, CO @@ -2735,13 +3387,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 8 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************* * Macros for N=1, M=16 * *********************************************************************/ +#if defined(_AIX) +define(`LOAD1x16_1', ` +#else .macro LOAD1x16_1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -2760,9 +3420,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x16_I1', ` +#else .macro KERNEL1x16_I1 +#endif lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO @@ -2791,9 +3459,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs38, vs6, vs24 xvmuldp vs39, vs7, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x16_1', ` +#else .macro KERNEL1x16_1 +#endif lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO @@ -2822,9 +3498,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs38, vs6, vs24 xvmaddadp vs39, vs7, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x16_2', ` +#else .macro KERNEL1x16_2 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -2853,9 +3537,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs38, vs14, vs28 xvmaddadp vs39, vs15, vs28 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x16_E2', ` +#else .macro KERNEL1x16_E2 +#endif xvmaddadp vs32, vs8, vs28 @@ -2867,9 +3559,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs38, vs14, vs28 xvmaddadp vs39, vs15, vs28 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x16_SUBI1', ` +#else .macro KERNEL1x16_SUBI1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -2898,9 +3598,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs38, vs6, vs24 xvmuldp vs39, vs7, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x16_SUB1', ` +#else .macro KERNEL1x16_SUB1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -2929,9 +3637,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs38, vs6, vs24 xvmaddadp vs39, vs7, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE1x16', ` +#else .macro SAVE1x16 +#endif mr T1, CO addi T2, T1, 64 @@ -2980,13 +3696,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 128 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************* * Macros for N=4, M=8 * *********************************************************************/ +#if defined(_AIX) +define(`LOAD1x8_1', ` +#else .macro LOAD1x8_1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -2998,9 +3722,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi AO, AO, 64 addi BO, BO, 8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_I1', ` +#else .macro KERNEL1x8_I1 +#endif lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO @@ -3018,9 +3750,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs34, vs2, vs24 xvmuldp vs35, vs3, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_1', ` +#else .macro KERNEL1x8_1 +#endif lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO @@ -3038,9 +3778,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs34, vs2, vs24 xvmaddadp vs35, vs3, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_2', ` +#else .macro KERNEL1x8_2 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -3058,9 +3806,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs34, vs10, vs28 xvmaddadp vs35, vs11, vs28 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_E2', ` +#else .macro KERNEL1x8_E2 +#endif xvmaddadp vs32, vs8, vs28 @@ -3068,9 +3824,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs34, vs10, vs28 xvmaddadp vs35, vs11, vs28 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_SUBI1', ` +#else .macro KERNEL1x8_SUBI1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -3088,9 +3852,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs34, vs2, vs24 xvmuldp vs35, vs3, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_SUB1', ` +#else .macro KERNEL1x8_SUB1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -3108,9 +3880,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs34, vs2, vs24 xvmaddadp vs35, vs3, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE1x8', ` +#else .macro SAVE1x8 +#endif mr T1, CO @@ -3140,13 +3920,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 64 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************* * Macros for N=1, M=4 * *********************************************************************/ +#if defined(_AIX) +define(`LOAD1x4_1', ` +#else .macro LOAD1x4_1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -3156,9 +3944,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 32 addi BO, BO, 8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_I1', ` +#else .macro KERNEL1x4_I1 +#endif lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO @@ -3172,9 +3968,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs32, vs0, vs24 xvmuldp vs33, vs1, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_1', ` +#else .macro KERNEL1x4_1 +#endif lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO @@ -3188,9 +3992,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs32, vs0, vs24 xvmaddadp vs33, vs1, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_2', ` +#else .macro KERNEL1x4_2 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -3204,17 +4016,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmaddadp vs32, vs8, vs28 xvmaddadp vs33, vs9, vs28 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_E2', ` +#else .macro KERNEL1x4_E2 +#endif xvmaddadp vs32, vs8, vs28 xvmaddadp vs33, vs9, vs28 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_SUBI1', ` +#else .macro KERNEL1x4_SUBI1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -3228,9 +4056,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs32, vs0, vs24 xvmuldp vs33, vs1, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_SUB1', ` +#else .macro KERNEL1x4_SUB1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -3244,9 +4080,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs32, vs0, vs24 xvmaddadp vs33, vs1, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE1x4', ` +#else .macro SAVE1x4 +#endif mr T1, CO @@ -3268,13 +4112,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 32 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************* * Macros for N=1, M=2 * *********************************************************************/ +#if defined(_AIX) +define(`LOAD1x2_1', ` +#else .macro LOAD1x2_1 +#endif lxvd2x vs0, 0, AO @@ -3283,9 +4135,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 16 addi BO, BO, 8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_I1', ` +#else .macro KERNEL1x2_I1 +#endif lxvd2x vs8, 0, AO @@ -3297,9 +4157,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs32, vs0, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_1', ` +#else .macro KERNEL1x2_1 +#endif lxvd2x vs8, 0, AO @@ -3311,9 +4179,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs32, vs0, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_2', ` +#else .macro KERNEL1x2_2 +#endif lxvd2x vs0, 0, AO @@ -3325,16 +4201,32 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs32, vs8, vs28 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_E2', ` +#else .macro KERNEL1x2_E2 +#endif xvmaddadp vs32, vs8, vs28 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_SUBI1', ` +#else .macro KERNEL1x2_SUBI1 +#endif lxvd2x vs0, 0, AO @@ -3346,9 +4238,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs32, vs0, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_SUB1', ` +#else .macro KERNEL1x2_SUB1 +#endif lxvd2x vs0, 0, AO @@ -3360,9 +4260,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs32, vs0, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE1x2', ` +#else .macro SAVE1x2 +#endif mr T1, CO @@ -3380,13 +4288,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi CO, CO, 16 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************* * Macros for N=1, M=1 * *********************************************************************/ +#if defined(_AIX) +define(`LOAD1x1_1', ` +#else .macro LOAD1x1_1 +#endif lxsdx vs0, 0, AO @@ -3395,9 +4311,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 8 addi BO, BO, 8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_I1', ` +#else .macro KERNEL1x1_I1 +#endif lxsdx vs8, 0, AO @@ -3409,9 +4333,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs32, vs0, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_1', ` +#else .macro KERNEL1x1_1 +#endif lxsdx vs8, 0, AO @@ -3423,9 +4355,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs32, vs0, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_2', ` +#else .macro KERNEL1x1_2 +#endif lxsdx vs0, 0, AO @@ -3437,16 +4377,32 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs32, vs8, vs28 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_E2', ` +#else .macro KERNEL1x1_E2 +#endif xsmaddadp vs32, vs8, vs28 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_SUBI1', ` +#else .macro KERNEL1x1_SUBI1 +#endif lxsdx vs0, 0, AO @@ -3458,9 +4414,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs32, vs0, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_SUB1', ` +#else .macro KERNEL1x1_SUB1 +#endif lxsdx vs0, 0, AO @@ -3472,9 +4436,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs32, vs0, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE1x1', ` +#else .macro SAVE1x1 +#endif mr T1, CO @@ -3492,5 +4464,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 8 +#if defined(_AIX) +') +#else .endm +#endif diff --git a/kernel/power/dgemm_ncopy_macros_4_power8.S b/kernel/power/dgemm_ncopy_macros_4_power8.S index 8d6744b91..33d02c77d 100644 --- a/kernel/power/dgemm_ncopy_macros_4_power8.S +++ b/kernel/power/dgemm_ncopy_macros_4_power8.S @@ -38,7 +38,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * Macros for N=4 and M=16 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_4x16', ` +#else .macro COPY_4x16 +#endif lxvd2x vs0, o0, A0 lxvd2x vs1, o0, A1 @@ -180,14 +184,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 128 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=8 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_4x8', ` +#else .macro COPY_4x8 +#endif lxvd2x vs0, o0, A0 lxvd2x vs1, o16, A0 @@ -259,14 +271,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi BO, BO, 128 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=4 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_4x4', ` +#else .macro COPY_4x4 +#endif lxvd2x vs0, o0, A0 lxvd2x vs1, o16, A0 @@ -310,14 +330,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 128 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=2 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_4x2', ` +#else .macro COPY_4x2 +#endif lxvd2x vs0, o0, A0 addi A0, A0, 16 @@ -348,14 +376,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 64 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=1 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_4x1', ` +#else .macro COPY_4x1 +#endif lxsdx vs0, o0, A0 addi A0, A0, 8 @@ -382,14 +418,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=16 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_2x16', ` +#else .macro COPY_2x16 +#endif lxvd2x vs0, o0, A0 lxvd2x vs1, o16, A0 @@ -459,14 +503,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 128 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=8 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_2x8', ` +#else .macro COPY_2x8 +#endif lxvd2x vs0, o0, A0 lxvd2x vs1, o16, A0 @@ -506,14 +558,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 128 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=4 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_2x4', ` +#else .macro COPY_2x4 +#endif lxvd2x vs0, o0, A0 lxvd2x vs1, o16, A0 @@ -539,14 +599,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 64 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=2 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_2x2', ` +#else .macro COPY_2x2 +#endif lxvd2x vs0, o0, A0 addi A0, A0, 16 @@ -565,14 +633,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=1 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_2x1', ` +#else .macro COPY_2x1 +#endif lxsdx vs0, o0, A0 addi A0, A0, 8 @@ -589,14 +665,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 16 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=16 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_1x16', ` +#else .macro COPY_1x16 +#endif lxvd2x vs0, o0, A0 lxvd2x vs1, o16, A0 @@ -622,14 +706,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 64 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=8 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_1x8', ` +#else .macro COPY_1x8 +#endif lxvd2x vs0, o0, A0 lxvd2x vs1, o16, A0 @@ -645,14 +737,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 64 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=4 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_1x4', ` +#else .macro COPY_1x4 +#endif lxvd2x vs0, o0, A0 lxvd2x vs1, o16, A0 @@ -664,14 +764,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=2 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_1x2', ` +#else .macro COPY_1x2 +#endif lxvd2x vs0, o0, A0 addi A0, A0, 16 @@ -681,14 +789,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 16 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=1 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_1x1', ` +#else .macro COPY_1x1 +#endif lxsdx vs0, o0, A0 addi A0, A0, 8 @@ -698,5 +814,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 8 +#if defined(_AIX) +') +#else .endm +#endif diff --git a/kernel/power/dgemm_tcopy_macros_16_power8.S b/kernel/power/dgemm_tcopy_macros_16_power8.S index 68e53bcf2..6c5b8ed62 100644 --- a/kernel/power/dgemm_tcopy_macros_16_power8.S +++ b/kernel/power/dgemm_tcopy_macros_16_power8.S @@ -38,7 +38,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* Macros for N=4 and M=16 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_4x16', ` +#else .macro COPY_4x16 +#endif lxvd2x vs32, o0, A0 lxvd2x vs33, o16, A0 @@ -140,14 +144,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvd2x vs10, o32, T1 stxvd2x vs11, o48, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=8 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_4x8', ` +#else .macro COPY_4x8 +#endif lxvd2x vs32, o0, A0 lxvd2x vs33, o16, A0 @@ -205,14 +217,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvd2x vs46, o32, T1 stxvd2x vs47, o48, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=4 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_4x4', ` +#else .macro COPY_4x4 +#endif lxvd2x vs32, o0, A0 lxvd2x vs33, o16, A0 @@ -250,14 +270,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvd2x vs38, o32, T1 stxvd2x vs39, o48, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=2 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_4x2', ` +#else .macro COPY_4x2 +#endif lxvd2x vs32, o0, A0 addi A0, A0, 16 @@ -285,14 +313,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvd2x vs35, o48, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=1 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_4x1', ` +#else .macro COPY_4x1 +#endif lxsdx vs32, o0, A0 addi A0, A0, 8 @@ -322,14 +358,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxsdx vs35, o8, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=16 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_2x16', ` +#else .macro COPY_2x16 +#endif lxvd2x vs32, o0, A0 lxvd2x vs33, o16, A0 @@ -383,14 +427,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvd2x vs46, o32, T1 stxvd2x vs47, o48, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=8 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_2x8', ` +#else .macro COPY_2x8 +#endif lxvd2x vs32, o0, A0 lxvd2x vs33, o16, A0 @@ -420,14 +472,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
stxvd2x vs38, o32, T1 stxvd2x vs39, o48, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=4 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_2x4', ` +#else .macro COPY_2x4 +#endif lxvd2x vs32, o0, A0 lxvd2x vs33, o16, A0 @@ -447,14 +507,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvd2x vs34, o32, T1 stxvd2x vs35, o48, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=2 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_2x2', ` +#else .macro COPY_2x2 +#endif lxvd2x vs32, o0, A0 addi A0, A0, 16 @@ -470,14 +538,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvd2x vs33, o16, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=1 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_2x1', ` +#else .macro COPY_2x1 +#endif lxsdx vs32, o0, A0 addi A0, A0, 8 @@ -493,14 +569,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxsdx vs33, o8, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=16 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_1x16', ` +#else .macro COPY_1x16 +#endif lxvd2x vs32, o0, A0 lxvd2x vs33, o16, A0 @@ -528,14 +612,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvd2x vs38, o32, T1 stxvd2x vs39, o48, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=8 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_1x8', ` +#else .macro COPY_1x8 +#endif lxvd2x vs32, o0, A0 lxvd2x vs33, o16, A0 @@ -551,14 +643,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvd2x vs34, o32, T1 stxvd2x vs35, o48, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=4 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_1x4', ` +#else .macro COPY_1x4 +#endif lxvd2x vs32, o0, A0 lxvd2x vs33, o16, A0 @@ -570,14 +670,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvd2x vs32, o0, T1 stxvd2x vs33, o16, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=2 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_1x2', ` +#else .macro COPY_1x2 +#endif lxvd2x vs32, o0, A0 addi A0, A0, 16 @@ -587,14 +695,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
stxvd2x vs32, o0, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=1 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_1x1', ` +#else .macro COPY_1x1 +#endif lxsdx vs32, o0, A0 addi A0, A0, 8 @@ -604,5 +720,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxsdx vs32, o0, T1 +#if defined(_AIX) +') +#else .endm +#endif diff --git a/kernel/power/dgemv_n_microk_power8.c b/kernel/power/dgemv_n_microk_power8.c index ae4fe9009..c2eb3968c 100644 --- a/kernel/power/dgemv_n_microk_power8.c +++ b/kernel/power/dgemv_n_microk_power8.c @@ -46,7 +46,7 @@ static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y ( "lxvd2x 34, 0, %10 \n\t" // x0, x1 "lxvd2x 35, %11, %10 \n\t" // x2, x3 - "xxspltd 32, %x9, 0 \n\t" // alpha, alpha + XXSPLTD_S(32,%x9,0) // alpha, alpha "sldi %6, %13, 3 \n\t" // lda * sizeof (double) @@ -56,10 +56,10 @@ static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y "add %4, %3, %6 \n\t" // a0 = ap, a1 = a0 + lda "add %6, %6, %6 \n\t" // 2 * lda - "xxspltd 32, 34, 0 \n\t" // x0 * alpha, x0 * alpha - "xxspltd 33, 34, 1 \n\t" // x1 * alpha, x1 * alpha - "xxspltd 34, 35, 0 \n\t" // x2 * alpha, x2 * alpha - "xxspltd 35, 35, 1 \n\t" // x3 * alpha, x3 * alpha + XXSPLTD_S(32,34,0) // x0 * alpha, x0 * alpha + XXSPLTD_S(33,34,1) // x1 * alpha, x1 * alpha + XXSPLTD_S(34,35,0) // x2 * alpha, x2 * alpha + XXSPLTD_S(35,35,1) // x3 * alpha, x3 * alpha "add %5, %3, %6 \n\t" // a2 = a0 + 2 * lda "add %6, %4, %6 \n\t" // a3 = a1 + 2 * lda @@ -89,10 +89,10 @@ static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y "addi %6, %6, 32 \n\t" "addic. %1, %1, -4 \n\t" - "ble 2f \n\t" + "ble two%= \n\t" - ".p2align 5 \n" - "1: \n\t" + ".align 5 \n" + "one%=: \n\t" "lxvd2x 36, 0, %2 \n\t" // y0, y1 "lxvd2x 37, %11, %2 \n\t" // y2, y3 @@ -131,7 +131,7 @@ static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y "addi %2, %2, 32 \n\t" "addic. %1, %1, -4 \n\t" - "ble 2f \n\t" + "ble two%= \n\t" "lxvd2x 36, 0, %2 \n\t" // y0, y1 @@ -171,7 +171,7 @@ static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y "addi %2, %2, 32 \n\t" "addic. %1, %1, -4 \n\t" - "ble 2f \n\t" + "ble two%= \n\t" "lxvd2x 36, 0, %2 \n\t" // y0, y1 @@ -211,7 +211,7 @@ static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y "addi %2, %2, 32 \n\t" "addic. %1, %1, -4 \n\t" - "ble 2f \n\t" + "ble two%= \n\t" "lxvd2x 36, 0, %2 \n\t" // y0, y1 @@ -251,9 +251,9 @@ static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y "addi %2, %2, 32 \n\t" "addic. 
%1, %1, -4 \n\t" - "bgt 1b \n" + "bgt one%= \n" - "2: \n\t" + "two%=: \n\t" "lxvd2x 36, 0, %2 \n\t" // y0, y1 "lxvd2x 37, %11, %2 \n\t" // y2, y3 diff --git a/kernel/power/dgemv_t.c b/kernel/power/dgemv_t.c index d05d7b7d3..5d43f673f 100644 --- a/kernel/power/dgemv_t.c +++ b/kernel/power/dgemv_t.c @@ -93,11 +93,11 @@ static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, do "li %[off],32 \n\t" - "ble- 2f \n\t" + "ble- two%= \n\t" //-------------------------------------------------- - ".p2align 5 \n\t" - "1: \n\t" + ".align 5 \n\t" + "one%=: \n\t" "xvmaddadp 34,36,32 \n\t" "xvmaddadp 35,38,32 \n\t" "addi %[off2], %[off2],32 \n\t" @@ -137,7 +137,7 @@ static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, do "lxvd2x 49, %[a6], %[off2] \n\t" "lxvd2x 51, %[a7], %[off2] \n\t" "lxvd2x 33, %[x], %[off2] \n\t" - "ble- 2f \n\t" + "ble- two%= \n\t" "xvmaddadp 34,36,32 \n\t" "xvmaddadp 35,38,32 \n\t" "addi %[off2], %[off2],32 \n\t" @@ -177,7 +177,7 @@ static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, do "lxvd2x 49, %[a6], %[off2] \n\t" "lxvd2x 51, %[a7], %[off2] \n\t" "lxvd2x 33, %[x], %[off2] \n\t" - "ble- 2f \n\t" + "ble- two%= \n\t" "xvmaddadp 34,36,32 \n\t" "xvmaddadp 35,38,32 \n\t" #if defined(PREFETCH) @@ -229,7 +229,7 @@ static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, do "lxvd2x 33, %[x], %[off2] \n\t" "addic. %[n],%[n],-4 \n\t" - "ble- 2f \n\t" + "ble- two%= \n\t" "addi %[off2], %[off2],32 \n\t" #if defined(PREFETCH) @@ -288,9 +288,9 @@ static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, do #if defined(PREFETCH) "dcbt %[temp],%[x] \n\t" #endif - "bgt+ 1b \n\t" - ".p2align 5 \n\t" - "2: \n\t" + "bgt+ one%= \n\t" + ".align 5 \n\t" + "two%=: \n\t" //-------------------------------------------- "xvmaddadp 34,36,32 \n\t" @@ -301,7 +301,7 @@ static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, do "xvmaddadp 7,46,32 \n\t" "xvmaddadp 8,48,32 \n\t" "xvmaddadp 9,50,32 \n\t" - "xxspltd 36, %x[alpha], 0 \n\t" + XXSPLTD_S(36,%x[alpha],0) "xvmaddadp 34,37,33 \n\t" "xvmaddadp 35,39,33 \n\t" "xvmaddadp 4,41,33 \n\t" @@ -322,21 +322,21 @@ static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, do - "xxmrgld 42,34,35 \n\t" - "xxmrghd 43,34,35 \n\t" + XXMRGLD_S(42,34,35) + XXMRGHD_S(43,34,35) - "xxmrgld 44,4,5 \n\t" - "xxmrghd 45,4,5 \n\t" + XXMRGLD_S(44,4,5) + XXMRGHD_S(45,4,5) "xvadddp 42,42,43 \n\t" - "xxmrgld 46,6,7 \n\t" - "xxmrghd 47,6,7 \n\t" + XXMRGLD_S(46,6,7) + XXMRGHD_S(47,6,7) "xvadddp 44,44,45 \n\t" - "xxmrgld 48,8,9 \n\t" - "xxmrghd 49,8,9 \n\t" + XXMRGLD_S(48,8,9) + XXMRGHD_S(49,8,9) "xvadddp 46,46,47 \n\t" diff --git a/kernel/power/drot_microk_power8.c b/kernel/power/drot_microk_power8.c index 016b7764d..259c08187 100644 --- a/kernel/power/drot_microk_power8.c +++ b/kernel/power/drot_microk_power8.c @@ -51,8 +51,8 @@ static void drot_kernel_16 (long n, double *x, double *y, double c, double s) __asm__ ( - "xxspltd 36, %x13, 0 \n\t" // load c to both dwords - "xxspltd 37, %x14, 0 \n\t" // load s to both dwords + XXSPLTD_S(36,%x13,0) // load c to both dwords + XXSPLTD_S(37,%x14,0) // load s to both dwords "lxvd2x 32, 0, %3 \n\t" // load x "lxvd2x 33, %15, %3 \n\t" @@ -68,10 +68,10 @@ static void drot_kernel_16 (long n, double *x, double *y, double c, double s) "addi %4, %4, 64 \n\t" "addic. 
%2, %2, -8 \n\t" - "ble 2f \n\t" + "ble two%= \n\t" - ".p2align 5 \n" - "1: \n\t" + ".align 5 \n" + "one%=: \n\t" "xvmuldp 40, 32, 36 \n\t" // c * x "xvmuldp 41, 33, 36 \n\t" @@ -135,9 +135,9 @@ static void drot_kernel_16 (long n, double *x, double *y, double c, double s) "addi %4, %4, 128 \n\t" "addic. %2, %2, -8 \n\t" - "bgt 1b \n" + "bgt one%= \n" - "2: \n\t" + "two%=: \n\t" "xvmuldp 40, 32, 36 \n\t" // c * x "xvmuldp 41, 33, 36 \n\t" diff --git a/kernel/power/dscal_microk_power8.c b/kernel/power/dscal_microk_power8.c index 04898eb3d..e9bacd05a 100644 --- a/kernel/power/dscal_microk_power8.c +++ b/kernel/power/dscal_microk_power8.c @@ -41,7 +41,7 @@ static void dscal_kernel_8 (long n, double *x, double alpha) ( "dcbt 0, %2 \n\t" - "xxspltd %x3, %x3, 0 \n\t" + XXSPLTD_S(%x3,%x3,0) "lxvd2x 32, 0, %2 \n\t" "lxvd2x 33, %4, %2 \n\t" @@ -55,10 +55,10 @@ static void dscal_kernel_8 (long n, double *x, double alpha) "addi %2, %2, 128 \n\t" "addic. %1, %1, -16 \n\t" - "ble 2f \n\t" + "ble two%= \n\t" - ".p2align 5 \n" - "1: \n\t" + ".align 5 \n" + "one%=: \n\t" "xvmuldp 40, 32, %x3 \n\t" "xvmuldp 41, 33, %x3 \n\t" @@ -91,9 +91,9 @@ static void dscal_kernel_8 (long n, double *x, double alpha) "addi %2, %2, 256 \n\t" "addic. %1, %1, -16 \n\t" - "bgt 1b \n" + "bgt one%= \n" - "2: \n\t" + "two%=: \n\t" "xvmuldp 40, 32, %x3 \n\t" "xvmuldp 41, 33, %x3 \n\t" @@ -146,8 +146,8 @@ static void dscal_kernel_8_zero (long n, double *x) ( "xxlxor %x3, %x3, %x3 \n\t" - ".p2align 5 \n" - "1: \n\t" + ".align 5 \n" + "one%=: \n\t" "stxvd2x %x3, 0, %2 \n\t" "stxvd2x %x3, %4, %2 \n\t" @@ -161,7 +161,7 @@ static void dscal_kernel_8_zero (long n, double *x) "addi %2, %2, 128 \n\t" "addic. %1, %1, -16 \n\t" - "bgt 1b \n" + "bgt one%= \n" "#n=%1 x=%0=%2 t0=%x3 o16=%4 o32=%5 o48=%6 o64=%7 o80=%8 o96=%9 o112=%10" : diff --git a/kernel/power/dswap_microk_power8.c b/kernel/power/dswap_microk_power8.c index 31eff3449..ecfd5c9f9 100644 --- a/kernel/power/dswap_microk_power8.c +++ b/kernel/power/dswap_microk_power8.c @@ -39,8 +39,8 @@ static void dswap_kernel_32 (long n, double *x, double *y) { __asm__ ( - ".p2align 5 \n" - "1: \n\t" + ".align 5 \n" + "one%=: \n\t" "lxvd2x 32, 0, %4 \n\t" "lxvd2x 33, %5, %4 \n\t" @@ -131,7 +131,7 @@ static void dswap_kernel_32 (long n, double *x, double *y) "addi %4, %4, 128 \n\t" "addic. %2, %2, -32 \n\t" - "bgt 1b \n" + "bgt one%= \n" "#n=%2 x=%0=%3 y=%1=%4 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11" : diff --git a/kernel/power/dtrmm_macros_16x4_power8.S b/kernel/power/dtrmm_macros_16x4_power8.S index 079144a90..efb034594 100644 --- a/kernel/power/dtrmm_macros_16x4_power8.S +++ b/kernel/power/dtrmm_macros_16x4_power8.S @@ -37,7 +37,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * Macros for N=4, M=16 * *********************************************************************/ +#if defined(_AIX) +define(`LOAD4x16_1', ` +#else .macro LOAD4x16_1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -60,9 +64,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x16_I1', ` +#else .macro KERNEL4x16_I1 +#endif xvmuldp vs32, vs0, vs24 xvmuldp vs33, vs1, vs24 @@ -127,9 +139,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
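The C microkernels above (dgemv_n, dgemv_t, drot, dscal, dswap) replace GNU-style numeric local labels ("1:", "2:", "bgt 1b", "ble 2f") with symbolic labels suffixed by `%=`, which GCC expands to a number unique to each asm instance, and swap ".p2align 5" for ".align 5" (on POWER targets ".align" takes the alignment as a power of two, so the two spellings request the same 32-byte alignment). The apparent motivation is to keep the templates acceptable to assemblers, such as the AIX one, that reject the numeric local-label syntax. Below is a minimal sketch of the pattern only; `dscal_like` is a hypothetical one-double-per-iteration stand-in, not a kernel from this patch.

```c
/* Sketch of the local-label rewrite used in the microkernels above.
 * The loop body is a stand-in; what matters is the label handling:
 *   - "one%=:" / "bgt one%="  instead of  "1:" / "bgt 1b"
 *   - ".align 5"              instead of  ".p2align 5"
 * GCC replaces %= with a number unique to this asm instance, so the label
 * stays unique even if the function is inlined several times, and no
 * GNU-specific numeric local labels are needed.
 */
static void dscal_like(long n, double *x, double alpha)
{
    double t;                        /* scratch FP register chosen by GCC */

    if (n <= 0) return;

    __asm__ volatile
    (
        ".align 5                  \n"   /* on POWER, .align takes log2 of the alignment */
        "one%=:                    \n\t"
        "lfd     %2, 0(%0)         \n\t" /* t = *x           */
        "fmul    %2, %2, %3        \n\t" /* t *= alpha       */
        "stfd    %2, 0(%0)         \n\t" /* *x = t           */
        "addi    %0, %0, 8         \n\t" /* x++              */
        "addic.  %1, %1, -1        \n\t" /* --n, set cr0     */
        "bgt     one%=             \n"   /* loop while n > 0 */
        : "+b" (x), "+r" (n), "=&d" (t)
        : "d" (alpha)
        : "cr0", "memory"
    );
}
```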
addi AO, AO, 64 addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x16_1', ` +#else .macro KERNEL4x16_1 +#endif xvmaddadp vs32, vs0, vs24 xvmaddadp vs33, vs1, vs24 @@ -195,9 +215,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x16_2', ` +#else .macro KERNEL4x16_2 +#endif xvmaddadp vs32, vs8, vs28 xvmaddadp vs33, vs9, vs28 @@ -262,9 +290,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x16_E2', ` +#else .macro KERNEL4x16_E2 +#endif xvmaddadp vs32, vs8, vs28 @@ -303,9 +339,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs62, vs14, vs31 xvmaddadp vs63, vs15, vs31 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x16_SUBI1', ` +#else .macro KERNEL4x16_SUBI1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -364,9 +408,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs62, vs6, vs27 xvmuldp vs63, vs7, vs27 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x16_SUB1', ` +#else .macro KERNEL4x16_SUB1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -425,9 +477,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs62, vs6, vs27 xvmaddadp vs63, vs7, vs27 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE4x16', ` +#else .macro SAVE4x16 +#endif mr T1, CO addi T2, T1, 64 @@ -615,13 +675,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 128 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************* * Macros for N=4, M=8 * *********************************************************************/ +#if defined(_AIX) +define(`LOAD4x8_1', ` +#else .macro LOAD4x8_1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -638,9 +706,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x8_I1', ` +#else .macro KERNEL4x8_I1 +#endif xvmuldp vs32, vs0, vs24 xvmuldp vs33, vs1, vs24 @@ -679,9 +755,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x8_1', ` +#else .macro KERNEL4x8_1 +#endif xvmaddadp vs32, vs0, vs24 xvmaddadp vs33, vs1, vs24 @@ -719,9 +803,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x8_2', ` +#else .macro KERNEL4x8_2 +#endif xvmaddadp vs32, vs8, vs28 xvmaddadp vs33, vs9, vs28 @@ -759,9 +851,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x8_E2', ` +#else .macro KERNEL4x8_E2 +#endif xvmaddadp vs32, vs8, vs28 @@ -784,9 +884,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmaddadp vs58, vs10, vs31 xvmaddadp vs59, vs11, vs31 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x8_SUBI1', ` +#else .macro KERNEL4x8_SUBI1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -822,9 +930,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs58, vs2, vs27 xvmuldp vs59, vs3, vs27 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x8_SUB1', ` +#else .macro KERNEL4x8_SUB1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -860,9 +976,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs58, vs2, vs27 xvmaddadp vs59, vs3, vs27 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE4x8', ` +#else .macro SAVE4x8 +#endif mr T1, CO @@ -970,13 +1094,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 64 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************* * Macros for N=4, M=4 * *********************************************************************/ +#if defined(_AIX) +define(`LOAD4x4_1', ` +#else .macro LOAD4x4_1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -989,9 +1121,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 32 addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x4_I1', ` +#else .macro KERNEL4x4_I1 +#endif lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO @@ -1017,9 +1157,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs56, vs0, vs27 xvmuldp vs57, vs1, vs27 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x4_1', ` +#else .macro KERNEL4x4_1 +#endif lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO @@ -1045,9 +1193,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs56, vs0, vs27 xvmaddadp vs57, vs1, vs27 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x4_2', ` +#else .macro KERNEL4x4_2 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -1073,9 +1229,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs56, vs8, vs31 xvmaddadp vs57, vs9, vs31 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x4_E2', ` +#else .macro KERNEL4x4_E2 +#endif xvmaddadp vs32, vs8, vs28 @@ -1090,9 +1254,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs56, vs8, vs31 xvmaddadp vs57, vs9, vs31 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x4_SUBI1', ` +#else .macro KERNEL4x4_SUBI1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -1118,9 +1290,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs56, vs0, vs27 xvmuldp vs57, vs1, vs27 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x4_SUB1', ` +#else .macro KERNEL4x4_SUB1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -1146,9 +1326,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs56, vs0, vs27 xvmaddadp vs57, vs1, vs27 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE4x4', ` +#else .macro SAVE4x4 +#endif mr T1, CO @@ -1224,13 +1412,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi CO, CO, 32 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************* * Macros for N=4, M=2 * *********************************************************************/ +#if defined(_AIX) +define(`LOAD4x2_1', ` +#else .macro LOAD4x2_1 +#endif lxvd2x vs0, 0, AO @@ -1242,9 +1438,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 16 addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x2_I1', ` +#else .macro KERNEL4x2_I1 +#endif lxvd2x vs8, 0, AO @@ -1265,9 +1469,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs56, vs0, vs27 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x2_1', ` +#else .macro KERNEL4x2_1 +#endif lxvd2x vs8, 0, AO @@ -1288,9 +1500,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs56, vs0, vs27 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x2_2', ` +#else .macro KERNEL4x2_2 +#endif lxvd2x vs0, 0, AO @@ -1311,9 +1531,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs56, vs8, vs31 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x2_E2', ` +#else .macro KERNEL4x2_E2 +#endif xvmaddadp vs32, vs8, vs28 @@ -1324,9 +1552,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs56, vs8, vs31 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x2_SUBI1', ` +#else .macro KERNEL4x2_SUBI1 +#endif lxvd2x vs0, 0, AO @@ -1347,9 +1583,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs56, vs0, vs27 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x2_SUB1', ` +#else .macro KERNEL4x2_SUB1 +#endif lxvd2x vs0, 0, AO @@ -1370,9 +1614,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs56, vs0, vs27 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE4x2', ` +#else .macro SAVE4x2 +#endif mr T1, CO @@ -1432,13 +1684,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 16 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************* * Macros for N=4, M=1 * *********************************************************************/ +#if defined(_AIX) +define(`LOAD4x1_1', ` +#else .macro LOAD4x1_1 +#endif lxsdx vs0, 0, AO @@ -1450,9 +1710,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 8 addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x1_I1', ` +#else .macro KERNEL4x1_I1 +#endif lxsdx vs8, 0, AO @@ -1473,9 +1741,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs56, vs0, vs27 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x1_1', ` +#else .macro KERNEL4x1_1 +#endif lxsdx vs8, 0, AO @@ -1496,9 +1772,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs56, vs0, vs27 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x1_2', ` +#else .macro KERNEL4x1_2 +#endif lxsdx vs0, 0, AO @@ -1519,9 +1803,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xsmaddadp vs56, vs8, vs31 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x1_E2', ` +#else .macro KERNEL4x1_E2 +#endif xsmaddadp vs32, vs8, vs28 @@ -1532,9 +1824,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs56, vs8, vs31 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x1_SUBI1', ` +#else .macro KERNEL4x1_SUBI1 +#endif lxsdx vs0, 0, AO @@ -1555,9 +1855,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs56, vs0, vs27 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x1_SUB1', ` +#else .macro KERNEL4x1_SUB1 +#endif lxsdx vs0, 0, AO @@ -1578,9 +1886,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs56, vs0, vs27 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE4x1', ` +#else .macro SAVE4x1 +#endif mr T1, CO @@ -1640,13 +1956,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 8 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************* * Macros for N=2, M=16 * *********************************************************************/ +#if defined(_AIX) +define(`LOAD2x16_1', ` +#else .macro LOAD2x16_1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -1666,9 +1990,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x16_I1', ` +#else .macro KERNEL2x16_I1 +#endif lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO @@ -1707,9 +2039,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs46, vs6, vs25 xvmuldp vs47, vs7, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x16_1', ` +#else .macro KERNEL2x16_1 +#endif lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO @@ -1748,9 +2088,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs46, vs6, vs25 xvmaddadp vs47, vs7, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x16_2', ` +#else .macro KERNEL2x16_2 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -1789,9 +2137,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs46, vs14, vs29 xvmaddadp vs47, vs15, vs29 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x16_E2', ` +#else .macro KERNEL2x16_E2 +#endif xvmaddadp vs32, vs8, vs28 @@ -1812,9 +2168,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs46, vs14, vs29 xvmaddadp vs47, vs15, vs29 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x16_SUBI1', ` +#else .macro KERNEL2x16_SUBI1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -1853,9 +2217,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs46, vs6, vs25 xvmuldp vs47, vs7, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x16_SUB1', ` +#else .macro KERNEL2x16_SUB1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -1894,9 +2266,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmaddadp vs46, vs6, vs25 xvmaddadp vs47, vs7, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE2x16', ` +#else .macro SAVE2x16 +#endif mr T1, CO addi T2, T1, 64 @@ -1990,13 +2370,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 128 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************* * Macros for N=4, M=8 * *********************************************************************/ +#if defined(_AIX) +define(`LOAD2x8_1', ` +#else .macro LOAD2x8_1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -2009,9 +2397,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 addi BO, BO, 16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_I1', ` +#else .macro KERNEL2x8_I1 +#endif lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO @@ -2035,9 +2431,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs42, vs2, vs25 xvmuldp vs43, vs3, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_1', ` +#else .macro KERNEL2x8_1 +#endif lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO @@ -2061,9 +2465,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs42, vs2, vs25 xvmaddadp vs43, vs3, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_2', ` +#else .macro KERNEL2x8_2 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -2087,9 +2499,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs42, vs10, vs29 xvmaddadp vs43, vs11, vs29 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_E2', ` +#else .macro KERNEL2x8_E2 +#endif xvmaddadp vs32, vs8, vs28 @@ -2102,9 +2522,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs42, vs10, vs29 xvmaddadp vs43, vs11, vs29 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_SUBI1', ` +#else .macro KERNEL2x8_SUBI1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -2128,9 +2556,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs42, vs2, vs25 xvmuldp vs43, vs3, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_SUB1', ` +#else .macro KERNEL2x8_SUB1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -2154,9 +2590,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs42, vs2, vs25 xvmaddadp vs43, vs3, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE2x8', ` +#else .macro SAVE2x8 +#endif mr T1, CO @@ -2212,13 +2656,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 64 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************* * Macros for N=2, M=4 * *********************************************************************/ +#if defined(_AIX) +define(`LOAD2x4_1', ` +#else .macro LOAD2x4_1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -2229,9 +2681,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 32 addi BO, BO, 16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_I1', ` +#else .macro KERNEL2x4_I1 +#endif lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO @@ -2249,9 +2709,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmuldp vs40, vs0, vs25 xvmuldp vs41, vs1, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_1', ` +#else .macro KERNEL2x4_1 +#endif lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO @@ -2269,9 +2737,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs40, vs0, vs25 xvmaddadp vs41, vs1, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_2', ` +#else .macro KERNEL2x4_2 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -2289,9 +2765,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs40, vs8, vs29 xvmaddadp vs41, vs9, vs29 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_E2', ` +#else .macro KERNEL2x4_E2 +#endif xvmaddadp vs32, vs8, vs28 @@ -2300,9 +2784,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs40, vs8, vs29 xvmaddadp vs41, vs9, vs29 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_SUBI1', ` +#else .macro KERNEL2x4_SUBI1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -2320,9 +2812,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs40, vs0, vs25 xvmuldp vs41, vs1, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_SUB1', ` +#else .macro KERNEL2x4_SUB1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -2340,9 +2840,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs40, vs0, vs25 xvmaddadp vs41, vs1, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE2x4', ` +#else .macro SAVE2x4 +#endif mr T1, CO @@ -2382,13 +2890,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 32 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************* * Macros for N=2, M=2 * *********************************************************************/ +#if defined(_AIX) +define(`LOAD2x2_1', ` +#else .macro LOAD2x2_1 +#endif lxvd2x vs0, 0, AO @@ -2398,9 +2914,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 16 addi BO, BO, 16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_I1', ` +#else .macro KERNEL2x2_I1 +#endif lxvd2x vs8, 0, AO @@ -2415,9 +2939,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs40, vs0, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_1', ` +#else .macro KERNEL2x2_1 +#endif lxvd2x vs8, 0, AO @@ -2432,9 +2964,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs40, vs0, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_2', ` +#else .macro KERNEL2x2_2 +#endif lxvd2x vs0, 0, AO @@ -2449,18 +2989,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs40, vs8, vs29 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_E2', ` +#else .macro KERNEL2x2_E2 +#endif xvmaddadp vs32, vs8, vs28 xvmaddadp vs40, vs8, vs29 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_SUBI1', ` +#else .macro KERNEL2x2_SUBI1 +#endif lxvd2x vs0, 0, AO @@ -2475,9 +3031,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmuldp vs40, vs0, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_SUB1', ` +#else .macro KERNEL2x2_SUB1 +#endif lxvd2x vs0, 0, AO @@ -2492,9 +3056,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs40, vs0, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE2x2', ` +#else .macro SAVE2x2 +#endif mr T1, CO @@ -2526,13 +3098,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 16 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************* * Macros for N=2, M=1 * *********************************************************************/ +#if defined(_AIX) +define(`LOAD2x1_1', ` +#else .macro LOAD2x1_1 +#endif lxsdx vs0, 0, AO @@ -2542,9 +3122,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 8 addi BO, BO, 16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_I1', ` +#else .macro KERNEL2x1_I1 +#endif lxsdx vs8, 0, AO @@ -2559,9 +3147,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs40, vs0, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_1', ` +#else .macro KERNEL2x1_1 +#endif lxsdx vs8, 0, AO @@ -2576,9 +3172,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs40, vs0, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_2', ` +#else .macro KERNEL2x1_2 +#endif lxsdx vs0, 0, AO @@ -2593,18 +3197,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs40, vs8, vs29 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_E2', ` +#else .macro KERNEL2x1_E2 +#endif xsmaddadp vs32, vs8, vs28 xsmaddadp vs40, vs8, vs29 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_SUBI1', ` +#else .macro KERNEL2x1_SUBI1 +#endif lxsdx vs0, 0, AO @@ -2619,9 +3239,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs40, vs0, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_SUB1', ` +#else .macro KERNEL2x1_SUB1 +#endif lxsdx vs0, 0, AO @@ -2636,9 +3264,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs40, vs0, vs25 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE2x1', ` +#else .macro SAVE2x1 +#endif mr T1, CO @@ -2670,13 +3306,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 8 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************* * Macros for N=1, M=16 * *********************************************************************/ +#if defined(_AIX) +define(`LOAD1x16_1', ` +#else .macro LOAD1x16_1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -2695,9 +3339,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x16_I1', ` +#else .macro KERNEL1x16_I1 +#endif lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO @@ -2726,9 +3378,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmuldp vs38, vs6, vs24 xvmuldp vs39, vs7, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x16_1', ` +#else .macro KERNEL1x16_1 +#endif lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO @@ -2757,9 +3417,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs38, vs6, vs24 xvmaddadp vs39, vs7, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x16_2', ` +#else .macro KERNEL1x16_2 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -2788,9 +3456,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs38, vs14, vs28 xvmaddadp vs39, vs15, vs28 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x16_E2', ` +#else .macro KERNEL1x16_E2 +#endif xvmaddadp vs32, vs8, vs28 @@ -2802,9 +3478,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs38, vs14, vs28 xvmaddadp vs39, vs15, vs28 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x16_SUBI1', ` +#else .macro KERNEL1x16_SUBI1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -2833,9 +3517,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs38, vs6, vs24 xvmuldp vs39, vs7, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x16_SUB1', ` +#else .macro KERNEL1x16_SUB1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -2864,9 +3556,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs38, vs6, vs24 xvmaddadp vs39, vs7, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE1x16', ` +#else .macro SAVE1x16 +#endif mr T1, CO addi T2, T1, 64 @@ -2915,13 +3615,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 128 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************* * Macros for N=4, M=8 * *********************************************************************/ +#if defined(_AIX) +define(`LOAD1x8_1', ` +#else .macro LOAD1x8_1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -2933,9 +3641,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 addi BO, BO, 8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_I1', ` +#else .macro KERNEL1x8_I1 +#endif lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO @@ -2953,9 +3669,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs34, vs2, vs24 xvmuldp vs35, vs3, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_1', ` +#else .macro KERNEL1x8_1 +#endif lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO @@ -2973,9 +3697,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs34, vs2, vs24 xvmaddadp vs35, vs3, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_2', ` +#else .macro KERNEL1x8_2 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -2993,9 +3725,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs34, vs10, vs28 xvmaddadp vs35, vs11, vs28 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_E2', ` +#else .macro KERNEL1x8_E2 +#endif xvmaddadp vs32, vs8, vs28 @@ -3003,9 +3743,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmaddadp vs34, vs10, vs28 xvmaddadp vs35, vs11, vs28 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_SUBI1', ` +#else .macro KERNEL1x8_SUBI1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -3023,9 +3771,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs34, vs2, vs24 xvmuldp vs35, vs3, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_SUB1', ` +#else .macro KERNEL1x8_SUB1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -3043,9 +3799,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs34, vs2, vs24 xvmaddadp vs35, vs3, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE1x8', ` +#else .macro SAVE1x8 +#endif mr T1, CO @@ -3075,13 +3839,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 64 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************* * Macros for N=1, M=4 * *********************************************************************/ +#if defined(_AIX) +define(`LOAD1x4_1', ` +#else .macro LOAD1x4_1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -3091,9 +3863,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 32 addi BO, BO, 8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_I1', ` +#else .macro KERNEL1x4_I1 +#endif lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO @@ -3107,9 +3887,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs32, vs0, vs24 xvmuldp vs33, vs1, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_1', ` +#else .macro KERNEL1x4_1 +#endif lxvd2x vs8, 0, AO lxvd2x vs9, o16, AO @@ -3123,9 +3911,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs32, vs0, vs24 xvmaddadp vs33, vs1, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_2', ` +#else .macro KERNEL1x4_2 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -3139,17 +3935,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs32, vs8, vs28 xvmaddadp vs33, vs9, vs28 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_E2', ` +#else .macro KERNEL1x4_E2 +#endif xvmaddadp vs32, vs8, vs28 xvmaddadp vs33, vs9, vs28 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_SUBI1', ` +#else .macro KERNEL1x4_SUBI1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -3163,9 +3975,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs32, vs0, vs24 xvmuldp vs33, vs1, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_SUB1', ` +#else .macro KERNEL1x4_SUB1 +#endif lxvd2x vs0, 0, AO lxvd2x vs1, o16, AO @@ -3179,9 +3999,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs32, vs0, vs24 xvmaddadp vs33, vs1, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE1x4', ` +#else .macro SAVE1x4 +#endif mr T1, CO @@ -3203,13 +4031,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi CO, CO, 32 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************* * Macros for N=1, M=2 * *********************************************************************/ +#if defined(_AIX) +define(`LOAD1x2_1', ` +#else .macro LOAD1x2_1 +#endif lxvd2x vs0, 0, AO @@ -3218,9 +4054,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 16 addi BO, BO, 8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_I1', ` +#else .macro KERNEL1x2_I1 +#endif lxvd2x vs8, 0, AO @@ -3232,9 +4076,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs32, vs0, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_1', ` +#else .macro KERNEL1x2_1 +#endif lxvd2x vs8, 0, AO @@ -3246,9 +4098,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs32, vs0, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_2', ` +#else .macro KERNEL1x2_2 +#endif lxvd2x vs0, 0, AO @@ -3260,16 +4120,32 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs32, vs8, vs28 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_E2', ` +#else .macro KERNEL1x2_E2 +#endif xvmaddadp vs32, vs8, vs28 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_SUBI1', ` +#else .macro KERNEL1x2_SUBI1 +#endif lxvd2x vs0, 0, AO @@ -3281,9 +4157,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs32, vs0, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_SUB1', ` +#else .macro KERNEL1x2_SUB1 +#endif lxvd2x vs0, 0, AO @@ -3295,9 +4179,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs32, vs0, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE1x2', ` +#else .macro SAVE1x2 +#endif mr T1, CO @@ -3315,13 +4207,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 16 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************* * Macros for N=1, M=1 * *********************************************************************/ +#if defined(_AIX) +define(`LOAD1x1_1', ` +#else .macro LOAD1x1_1 +#endif lxsdx vs0, 0, AO @@ -3330,9 +4230,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 8 addi BO, BO, 8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_I1', ` +#else .macro KERNEL1x1_I1 +#endif lxsdx vs8, 0, AO @@ -3344,9 +4252,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs32, vs0, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_1', ` +#else .macro KERNEL1x1_1 +#endif lxsdx vs8, 0, AO @@ -3358,9 +4274,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs32, vs0, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_2', ` +#else .macro KERNEL1x1_2 +#endif lxsdx vs0, 0, AO @@ -3372,16 +4296,32 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xsmaddadp vs32, vs8, vs28 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_E2', ` +#else .macro KERNEL1x1_E2 +#endif xsmaddadp vs32, vs8, vs28 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_SUBI1', ` +#else .macro KERNEL1x1_SUBI1 +#endif lxsdx vs0, 0, AO @@ -3393,9 +4333,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs32, vs0, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_SUB1', ` +#else .macro KERNEL1x1_SUB1 +#endif lxsdx vs0, 0, AO @@ -3407,9 +4355,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs32, vs0, vs24 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE1x1', ` +#else .macro SAVE1x1 +#endif mr T1, CO @@ -3427,5 +4383,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 8 +#if defined(_AIX) +') +#else .endm +#endif diff --git a/kernel/power/dtrsm_macros_LT_16x4_power8.S b/kernel/power/dtrsm_macros_LT_16x4_power8.S index dc47daa3a..5a5c4037c 100644 --- a/kernel/power/dtrsm_macros_LT_16x4_power8.S +++ b/kernel/power/dtrsm_macros_LT_16x4_power8.S @@ -1,46 +1,58 @@ +#if defined(_AIX) +define(`INIT_16x4', ` +#else .macro INIT_16x4 +#endif xxlxor vs0, vs0, vs0 - xvmovdp vs32, vs0 - xvmovdp vs33, vs0 - xvmovdp vs34, vs0 - xvmovdp vs35, vs0 - xvmovdp vs36, vs0 - xvmovdp vs37, vs0 - xvmovdp vs38, vs0 - xvmovdp vs39, vs0 - xvmovdp vs40, vs0 - xvmovdp vs41, vs0 - xvmovdp vs42, vs0 - xvmovdp vs43, vs0 - xvmovdp vs44, vs0 - xvmovdp vs45, vs0 - xvmovdp vs46, vs0 - xvmovdp vs47, vs0 - xvmovdp vs48, vs0 - xvmovdp vs49, vs0 - xvmovdp vs50, vs0 - xvmovdp vs51, vs0 - xvmovdp vs52, vs0 - xvmovdp vs53, vs0 - xvmovdp vs54, vs0 - xvmovdp vs55, vs0 - xvmovdp vs56, vs0 - xvmovdp vs57, vs0 - xvmovdp vs58, vs0 - xvmovdp vs59, vs0 - xvmovdp vs60, vs0 - xvmovdp vs61, vs0 - xvmovdp vs62, vs0 - xvmovdp vs63, vs0 + XVMOVDP(vs32,vs0) + XVMOVDP(vs33,vs0) + XVMOVDP(vs34,vs0) + XVMOVDP(vs35,vs0) + XVMOVDP(vs36,vs0) + XVMOVDP(vs37,vs0) + XVMOVDP(vs38,vs0) + XVMOVDP(vs39,vs0) + XVMOVDP(vs40,vs0) + XVMOVDP(vs41,vs0) + XVMOVDP(vs42,vs0) + XVMOVDP(vs43,vs0) + XVMOVDP(vs44,vs0) + XVMOVDP(vs45,vs0) + XVMOVDP(vs46,vs0) + XVMOVDP(vs47,vs0) + XVMOVDP(vs48,vs0) + XVMOVDP(vs49,vs0) + XVMOVDP(vs50,vs0) + XVMOVDP(vs51,vs0) + XVMOVDP(vs52,vs0) + XVMOVDP(vs53,vs0) + XVMOVDP(vs54,vs0) + XVMOVDP(vs55,vs0) + XVMOVDP(vs56,vs0) + XVMOVDP(vs57,vs0) + XVMOVDP(vs58,vs0) + XVMOVDP(vs59,vs0) + XVMOVDP(vs60,vs0) + XVMOVDP(vs61,vs0) + XVMOVDP(vs62,vs0) + XVMOVDP(vs63,vs0) +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL_16x4', ` +#else .macro KERNEL_16x4 +#endif lxvd2x vs0, o0, AO @@ -98,35 +110,51 @@ xvmaddadp vs63, vs7, vs19 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`INIT_8x4', ` +#else .macro INIT_8x4 +#endif xxlxor vs0, vs0, vs0 - xvmovdp vs32, vs0 - xvmovdp vs33, vs0 - xvmovdp vs34, vs0 - xvmovdp vs35, vs0 - xvmovdp vs36, vs0 - xvmovdp vs37, vs0 - xvmovdp vs38, vs0 - xvmovdp vs39, vs0 - xvmovdp vs40, vs0 - xvmovdp vs41, vs0 - xvmovdp vs42, vs0 - xvmovdp vs43, vs0 - xvmovdp vs44, vs0 - xvmovdp vs45, vs0 - xvmovdp vs46, vs0 - xvmovdp vs47, vs0 + XVMOVDP(vs32,vs0) + XVMOVDP(vs33,vs0) + XVMOVDP(vs34,vs0) + XVMOVDP(vs35,vs0) + XVMOVDP(vs36,vs0) + XVMOVDP(vs37,vs0) + XVMOVDP(vs38,vs0) + XVMOVDP(vs39,vs0) + XVMOVDP(vs40,vs0) + XVMOVDP(vs41,vs0) + XVMOVDP(vs42,vs0) + XVMOVDP(vs43,vs0) + XVMOVDP(vs44,vs0) + XVMOVDP(vs45,vs0) + 
XVMOVDP(vs46,vs0) + XVMOVDP(vs47,vs0) +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL_8x4', ` +#else .macro KERNEL_8x4 +#endif lxvd2x vs0, o0, AO @@ -161,27 +189,43 @@ xvmaddadp vs47, vs3, vs19 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`INIT_4x4', ` +#else .macro INIT_4x4 +#endif xxlxor vs0, vs0, vs0 - xvmovdp vs32, vs0 - xvmovdp vs33, vs0 - xvmovdp vs34, vs0 - xvmovdp vs35, vs0 - xvmovdp vs36, vs0 - xvmovdp vs37, vs0 - xvmovdp vs38, vs0 - xvmovdp vs39, vs0 + XVMOVDP(vs32,vs0) + XVMOVDP(vs33,vs0) + XVMOVDP(vs34,vs0) + XVMOVDP(vs35,vs0) + XVMOVDP(vs36,vs0) + XVMOVDP(vs37,vs0) + XVMOVDP(vs38,vs0) + XVMOVDP(vs39,vs0) +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL_4x4', ` +#else .macro KERNEL_4x4 +#endif lxvd2x vs0, o0, AO @@ -206,23 +250,39 @@ xvmaddadp vs39, vs1, vs19 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`INIT_2x4', ` +#else .macro INIT_2x4 +#endif xxlxor vs0, vs0, vs0 - xvmovdp vs32, vs0 - xvmovdp vs33, vs0 - xvmovdp vs34, vs0 - xvmovdp vs35, vs0 + XVMOVDP(vs32,vs0) + XVMOVDP(vs33,vs0) + XVMOVDP(vs34,vs0) + XVMOVDP(vs35,vs0) +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL_2x4', ` +#else .macro KERNEL_2x4 +#endif lxvd2x vs0, o0, AO @@ -242,23 +302,39 @@ xvmaddadp vs35, vs0, vs19 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`INIT_1x4', ` +#else .macro INIT_1x4 +#endif xxlxor vs0, vs0, vs0 - xvmovdp vs32, vs0 - xvmovdp vs33, vs0 - xvmovdp vs34, vs0 - xvmovdp vs35, vs0 + XVMOVDP(vs32,vs0) + XVMOVDP(vs33,vs0) + XVMOVDP(vs34,vs0) + XVMOVDP(vs35,vs0) +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL_1x4', ` +#else .macro KERNEL_1x4 +#endif lxvdsx vs0, o0, AO @@ -278,14 +354,22 @@ xvmaddadp vs35, vs0, vs19 +#if defined(_AIX) +') +#else .endm +#endif /*########################################################################################## SOLVE_LT 16x4 ##########################################################################################*/ +#if defined(_AIX) +define(`SOLVE_LT_16x4', ` +#else .macro SOLVE_LT_16x4 +#endif //############### LOAD B ####################### @@ -1149,46 +1233,46 @@ stxsdx vs32, o0, T1 - xxswapd vs32, vs32 + XXSWAPD(vs32,vs32) stxsdx vs34, o8, T1 - xxswapd vs34, vs34 + XXSWAPD(vs34,vs34) stxsdx vs36, o16, T1 - xxswapd vs36, vs36 + XXSWAPD(vs36,vs36) stxsdx vs38, o24, T1 - xxswapd vs38, vs38 + XXSWAPD(vs38,vs38) addi T1, T1, 32 stxsdx vs40, o0, T1 - xxswapd vs40, vs40 + XXSWAPD(vs40,vs40) stxsdx vs42, o8, T1 - xxswapd vs42, vs42 + XXSWAPD(vs42,vs42) stxsdx vs44, o16, T1 - xxswapd vs44, vs44 + XXSWAPD(vs44,vs44) stxsdx vs46, o24, T1 - xxswapd vs46, vs46 + XXSWAPD(vs46,vs46) addi T1, T1, 32 stxsdx vs48, o0, T1 - xxswapd vs48, vs48 + XXSWAPD(vs48,vs48) stxsdx vs50, o8, T1 - xxswapd vs50, vs50 + XXSWAPD(vs50,vs50) stxsdx vs52, o16, T1 - xxswapd vs52, vs52 + XXSWAPD(vs52,vs52) stxsdx vs54, o24, T1 - xxswapd vs54, vs54 + XXSWAPD(vs54,vs54) addi T1, T1, 32 stxsdx vs56, o0, T1 - xxswapd vs56, vs56 + XXSWAPD(vs56,vs56) stxsdx vs58, o8, T1 - xxswapd vs58, vs58 + XXSWAPD(vs58,vs58) stxsdx vs60, o16, T1 - xxswapd vs60, vs60 + XXSWAPD(vs60,vs60) stxsdx vs62, o24, T1 - xxswapd vs62, vs62 + XXSWAPD(vs62,vs62) stxsdx vs32, o0, T2 stxsdx vs34, o8, T2 @@ -1225,46 +1309,46 @@ stxsdx vs33, o0, T1 - xxswapd vs33, vs33 + XXSWAPD(vs33,vs33) stxsdx vs35, o8, T1 - xxswapd vs35, vs35 + XXSWAPD(vs35,vs35) stxsdx vs37, o16, T1 - xxswapd vs37, vs37 + XXSWAPD(vs37,vs37) stxsdx vs39, o24, 
T1 - xxswapd vs39, vs39 + XXSWAPD(vs39,vs39) addi T1, T1, 32 stxsdx vs41, o0, T1 - xxswapd vs41, vs41 + XXSWAPD(vs41,vs41) stxsdx vs43, o8, T1 - xxswapd vs43, vs43 + XXSWAPD(vs43,vs43) stxsdx vs45, o16, T1 - xxswapd vs45, vs45 + XXSWAPD(vs45,vs45) stxsdx vs47, o24, T1 - xxswapd vs47, vs47 + XXSWAPD(vs47,vs47) addi T1, T1, 32 stxsdx vs49, o0, T1 - xxswapd vs49, vs49 + XXSWAPD(vs49,vs49) stxsdx vs51, o8, T1 - xxswapd vs51, vs51 + XXSWAPD(vs51,vs51) stxsdx vs53, o16, T1 - xxswapd vs53, vs53 + XXSWAPD(vs53,vs53) stxsdx vs55, o24, T1 - xxswapd vs55, vs55 + XXSWAPD(vs55,vs55) addi T1, T1, 32 stxsdx vs57, o0, T1 - xxswapd vs57, vs57 + XXSWAPD(vs57,vs57) stxsdx vs59, o8, T1 - xxswapd vs59, vs59 + XXSWAPD(vs59,vs59) stxsdx vs61, o16, T1 - xxswapd vs61, vs61 + XXSWAPD(vs61,vs61) stxsdx vs63, o24, T1 - xxswapd vs63, vs63 + XXSWAPD(vs63,vs63) stxsdx vs33, o0, T2 stxsdx vs35, o8, T2 @@ -1292,14 +1376,22 @@ stxsdx vs61, o16, T2 stxsdx vs63, o24, T2 +#if defined(_AIX) +') +#else .endm +#endif /*########################################################################################## SOLVE_LT 8x4 ##########################################################################################*/ +#if defined(_AIX) +define(`SOLVE_LT_8x4', ` +#else .macro SOLVE_LT_8x4 +#endif xxpermdi vs0, vs32, vs33, 0 xxpermdi vs1, vs34, vs35, 0 @@ -1603,24 +1695,24 @@ stxsdx vs32, o0, T1 - xxswapd vs32, vs32 + XXSWAPD(vs32,vs32) stxsdx vs34, o8, T1 - xxswapd vs34, vs34 + XXSWAPD(vs34,vs34) stxsdx vs36, o16, T1 - xxswapd vs36, vs36 + XXSWAPD(vs36,vs36) stxsdx vs38, o24, T1 - xxswapd vs38, vs38 + XXSWAPD(vs38,vs38) addi T1, T1, 32 stxsdx vs40, o0, T1 - xxswapd vs40, vs40 + XXSWAPD(vs40,vs40) stxsdx vs42, o8, T1 - xxswapd vs42, vs42 + XXSWAPD(vs42,vs42) stxsdx vs44, o16, T1 - xxswapd vs44, vs44 + XXSWAPD(vs44,vs44) stxsdx vs46, o24, T1 - xxswapd vs46, vs46 + XXSWAPD(vs46,vs46) stxsdx vs32, o0, T2 stxsdx vs34, o8, T2 @@ -1643,24 +1735,24 @@ stxsdx vs33, o0, T1 - xxswapd vs33, vs33 + XXSWAPD(vs33,vs33) stxsdx vs35, o8, T1 - xxswapd vs35, vs35 + XXSWAPD(vs35,vs35) stxsdx vs37, o16, T1 - xxswapd vs37, vs37 + XXSWAPD(vs37,vs37) stxsdx vs39, o24, T1 - xxswapd vs39, vs39 + XXSWAPD(vs39,vs39) addi T1, T1, 32 stxsdx vs41, o0, T1 - xxswapd vs41, vs41 + XXSWAPD(vs41,vs41) stxsdx vs43, o8, T1 - xxswapd vs43, vs43 + XXSWAPD(vs43,vs43) stxsdx vs45, o16, T1 - xxswapd vs45, vs45 + XXSWAPD(vs45,vs45) stxsdx vs47, o24, T1 - xxswapd vs47, vs47 + XXSWAPD(vs47,vs47) stxsdx vs33, o0, T2 stxsdx vs35, o8, T2 @@ -1674,14 +1766,22 @@ stxsdx vs45, o16, T2 stxsdx vs47, o24, T2 +#if defined(_AIX) +') +#else .endm +#endif /*########################################################################################## SOLVE_LT 4x4 ##########################################################################################*/ +#if defined(_AIX) +define(`SOLVE_LT_4x4', ` +#else .macro SOLVE_LT_4x4 +#endif xxpermdi vs0, vs32, vs33, 0 xxpermdi vs1, vs34, vs35, 0 @@ -1813,13 +1913,13 @@ stxsdx vs32, o0, T1 - xxswapd vs32, vs32 + XXSWAPD(vs32,vs32) stxsdx vs34, o8, T1 - xxswapd vs34, vs34 + XXSWAPD(vs34,vs34) stxsdx vs36, o16, T1 - xxswapd vs36, vs36 + XXSWAPD(vs36,vs36) stxsdx vs38, o24, T1 - xxswapd vs38, vs38 + XXSWAPD(vs38,vs38) stxsdx vs32, o0, T2 stxsdx vs34, o8, T2 @@ -1835,27 +1935,35 @@ stxsdx vs33, o0, T1 - xxswapd vs33, vs33 + XXSWAPD(vs33,vs33) stxsdx vs35, o8, T1 - xxswapd vs35, vs35 + XXSWAPD(vs35,vs35) stxsdx vs37, o16, T1 - xxswapd vs37, vs37 + XXSWAPD(vs37,vs37) stxsdx vs39, o24, T1 - xxswapd vs39, vs39 + XXSWAPD(vs39,vs39) stxsdx vs33, o0, T2 stxsdx vs35, 
o8, T2 stxsdx vs37, o16, T2 stxsdx vs39, o24, T2 +#if defined(_AIX) +') +#else .endm +#endif /*########################################################################################## SOLVE_LT 2x4 ##########################################################################################*/ +#if defined(_AIX) +define(`SOLVE_LT_2x4', ` +#else .macro SOLVE_LT_2x4 +#endif xxpermdi vs0, vs32, vs33, 0 xxpermdi vs1, vs34, vs35, 0 @@ -1925,9 +2033,9 @@ stxsdx vs32, o0, T1 - xxswapd vs32, vs32 + XXSWAPD(vs32,vs32) stxsdx vs34, o8, T1 - xxswapd vs34, vs34 + XXSWAPD(vs34,vs34) stxsdx vs32, o0, T2 stxsdx vs34, o8, T2 @@ -1941,21 +2049,29 @@ stxsdx vs33, o0, T1 - xxswapd vs33, vs33 + XXSWAPD(vs33,vs33) stxsdx vs35, o8, T1 - xxswapd vs35, vs35 + XXSWAPD(vs35,vs35) stxsdx vs33, o0, T2 stxsdx vs35, o8, T2 +#if defined(_AIX) +') +#else .endm +#endif /*########################################################################################## SOLVE_LT 1x4 ##########################################################################################*/ +#if defined(_AIX) +define(`SOLVE_LT_1x4', ` +#else .macro SOLVE_LT_1x4 +#endif xxpermdi vs0, vs32, vs33, 0 xxpermdi vs1, vs34, vs35, 0 @@ -2001,7 +2117,7 @@ stxsdx vs32, o0, T1 - xxswapd vs32, vs32 + XXSWAPD(vs32,vs32) stxsdx vs32, o0, T2 @@ -2014,39 +2130,55 @@ stxsdx vs33, o0, T1 - xxswapd vs33, vs33 + XXSWAPD(vs33,vs33) stxsdx vs33, o0, T2 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`INIT_16x2', ` +#else .macro INIT_16x2 +#endif xxlxor vs0, vs0, vs0 - xvmovdp vs32, vs0 - xvmovdp vs33, vs0 - xvmovdp vs34, vs0 - xvmovdp vs35, vs0 - xvmovdp vs36, vs0 - xvmovdp vs37, vs0 - xvmovdp vs38, vs0 - xvmovdp vs39, vs0 - xvmovdp vs40, vs0 - xvmovdp vs41, vs0 - xvmovdp vs42, vs0 - xvmovdp vs43, vs0 - xvmovdp vs44, vs0 - xvmovdp vs45, vs0 - xvmovdp vs46, vs0 - xvmovdp vs47, vs0 + XVMOVDP(vs32,vs0) + XVMOVDP(vs33,vs0) + XVMOVDP(vs34,vs0) + XVMOVDP(vs35,vs0) + XVMOVDP(vs36,vs0) + XVMOVDP(vs37,vs0) + XVMOVDP(vs38,vs0) + XVMOVDP(vs39,vs0) + XVMOVDP(vs40,vs0) + XVMOVDP(vs41,vs0) + XVMOVDP(vs42,vs0) + XVMOVDP(vs43,vs0) + XVMOVDP(vs44,vs0) + XVMOVDP(vs45,vs0) + XVMOVDP(vs46,vs0) + XVMOVDP(vs47,vs0) +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL_16x2', ` +#else .macro KERNEL_16x2 +#endif lxvd2x vs0, o0, AO @@ -2086,27 +2218,43 @@ xvmaddadp vs47, vs7, vs17 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`INIT_8x2', ` +#else .macro INIT_8x2 +#endif xxlxor vs0, vs0, vs0 - xvmovdp vs32, vs0 - xvmovdp vs33, vs0 - xvmovdp vs34, vs0 - xvmovdp vs35, vs0 - xvmovdp vs36, vs0 - xvmovdp vs37, vs0 - xvmovdp vs38, vs0 - xvmovdp vs39, vs0 + XVMOVDP(vs32,vs0) + XVMOVDP(vs33,vs0) + XVMOVDP(vs34,vs0) + XVMOVDP(vs35,vs0) + XVMOVDP(vs36,vs0) + XVMOVDP(vs37,vs0) + XVMOVDP(vs38,vs0) + XVMOVDP(vs39,vs0) +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL_8x2', ` +#else .macro KERNEL_8x2 +#endif lxvd2x vs0, o0, AO @@ -2131,23 +2279,39 @@ xvmaddadp vs39, vs3, vs17 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`INIT_4x2', ` +#else .macro INIT_4x2 +#endif xxlxor vs0, vs0, vs0 - xvmovdp vs32, vs0 - xvmovdp vs33, vs0 - xvmovdp vs34, vs0 - xvmovdp vs35, vs0 + XVMOVDP(vs32,vs0) + XVMOVDP(vs33,vs0) + XVMOVDP(vs34,vs0) + XVMOVDP(vs35,vs0) +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL_4x2', ` +#else .macro KERNEL_4x2 +#endif lxvd2x vs0, o0, AO @@ -2166,21 +2330,37 @@ xvmaddadp vs35, vs1, vs17 +#if defined(_AIX) +') +#else .endm +#endif +#if 
defined(_AIX) +define(`INIT_2x2', ` +#else .macro INIT_2x2 +#endif xxlxor vs0, vs0, vs0 - xvmovdp vs32, vs0 - xvmovdp vs33, vs0 + XVMOVDP(vs32,vs0) + XVMOVDP(vs33,vs0) +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL_2x2', ` +#else .macro KERNEL_2x2 +#endif lxvd2x vs0, o0, AO @@ -2196,21 +2376,37 @@ xvmaddadp vs33, vs0, vs17 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`INIT_1x2', ` +#else .macro INIT_1x2 +#endif xxlxor vs0, vs0, vs0 - xvmovdp vs32, vs0 - xvmovdp vs33, vs0 + XVMOVDP(vs32,vs0) + XVMOVDP(vs33,vs0) +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL_1x2', ` +#else .macro KERNEL_1x2 +#endif lxvdsx vs0, o0, AO @@ -2226,14 +2422,22 @@ xvmaddadp vs33, vs0, vs17 +#if defined(_AIX) +') +#else .endm +#endif /*########################################################################################## SOLVE_LT 16x2 ##########################################################################################*/ +#if defined(_AIX) +define(`SOLVE_LT_16x2', ` +#else .macro SOLVE_LT_16x2 +#endif xxpermdi vs0, vs32, vs33, 0 xxpermdi vs1, vs32, vs33, 3 @@ -2821,46 +3025,46 @@ stxsdx vs32, o0, T1 - xxswapd vs32, vs32 + XXSWAPD(vs32,vs32) stxsdx vs33, o8, T1 - xxswapd vs33, vs33 + XXSWAPD(vs33,vs33) stxsdx vs34, o16, T1 - xxswapd vs34, vs34 + XXSWAPD(vs34,vs34) stxsdx vs35, o24, T1 - xxswapd vs35, vs35 + XXSWAPD(vs35,vs35) addi T1, T1, 32 stxsdx vs36, o0, T1 - xxswapd vs36, vs36 + XXSWAPD(vs36,vs36) stxsdx vs37, o8, T1 - xxswapd vs37, vs37 + XXSWAPD(vs37,vs37) stxsdx vs38, o16, T1 - xxswapd vs38, vs38 + XXSWAPD(vs38,vs38) stxsdx vs39, o24, T1 - xxswapd vs39, vs39 + XXSWAPD(vs39,vs39) addi T1, T1, 32 stxsdx vs40, o0, T1 - xxswapd vs40, vs40 + XXSWAPD(vs40,vs40) stxsdx vs41, o8, T1 - xxswapd vs41, vs41 + XXSWAPD(vs41,vs41) stxsdx vs42, o16, T1 - xxswapd vs42, vs42 + XXSWAPD(vs42,vs42) stxsdx vs43, o24, T1 - xxswapd vs43, vs43 + XXSWAPD(vs43,vs43) addi T1, T1, 32 stxsdx vs44, o0, T1 - xxswapd vs44, vs44 + XXSWAPD(vs44,vs44) stxsdx vs45, o8, T1 - xxswapd vs45, vs45 + XXSWAPD(vs45,vs45) stxsdx vs46, o16, T1 - xxswapd vs46, vs46 + XXSWAPD(vs46,vs46) stxsdx vs47, o24, T1 - xxswapd vs47, vs47 + XXSWAPD(vs47,vs47) stxsdx vs32, o0, T2 stxsdx vs33, o8, T2 @@ -2888,14 +3092,22 @@ stxsdx vs46, o16, T2 stxsdx vs47, o24, T2 +#if defined(_AIX) +') +#else .endm +#endif /*########################################################################################## SOLVE_LT 8x2 ##########################################################################################*/ +#if defined(_AIX) +define(`SOLVE_LT_8x2', ` +#else .macro SOLVE_LT_8x2 +#endif xxpermdi vs0, vs32, vs33, 0 xxpermdi vs1, vs32, vs33, 3 @@ -3111,24 +3323,24 @@ stxsdx vs32, o0, T1 - xxswapd vs32, vs32 + XXSWAPD(vs32,vs32) stxsdx vs33, o8, T1 - xxswapd vs33, vs33 + XXSWAPD(vs33,vs33) stxsdx vs34, o16, T1 - xxswapd vs34, vs34 + XXSWAPD(vs34,vs34) stxsdx vs35, o24, T1 - xxswapd vs35, vs35 + XXSWAPD(vs35,vs35) addi T1, T1, 32 stxsdx vs36, o0, T1 - xxswapd vs36, vs36 + XXSWAPD(vs36,vs36) stxsdx vs37, o8, T1 - xxswapd vs37, vs37 + XXSWAPD(vs37,vs37) stxsdx vs38, o16, T1 - xxswapd vs38, vs38 + XXSWAPD(vs38,vs38) stxsdx vs39, o24, T1 - xxswapd vs39, vs39 + XXSWAPD(vs39,vs39) stxsdx vs32, o0, T2 stxsdx vs33, o8, T2 @@ -3142,14 +3354,22 @@ stxsdx vs38, o16, T2 stxsdx vs39, o24, T2 +#if defined(_AIX) +') +#else .endm +#endif /*########################################################################################## SOLVE_LT 4x2 
##########################################################################################*/ +#if defined(_AIX) +define(`SOLVE_LT_4x2', ` +#else .macro SOLVE_LT_4x2 +#endif xxpermdi vs0, vs32, vs33, 0 xxpermdi vs1, vs32, vs33, 3 @@ -3245,27 +3465,35 @@ stxsdx vs32, o0, T1 - xxswapd vs32, vs32 + XXSWAPD(vs32,vs32) stxsdx vs33, o8, T1 - xxswapd vs33, vs33 + XXSWAPD(vs33,vs33) stxsdx vs34, o16, T1 - xxswapd vs34, vs34 + XXSWAPD(vs34,vs34) stxsdx vs35, o24, T1 - xxswapd vs35, vs35 + XXSWAPD(vs35,vs35) stxsdx vs32, o0, T2 stxsdx vs33, o8, T2 stxsdx vs34, o16, T2 stxsdx vs35, o24, T2 +#if defined(_AIX) +') +#else .endm +#endif /*########################################################################################## SOLVE_LT 2x2 ##########################################################################################*/ +#if defined(_AIX) +define(`SOLVE_LT_2x2', ` +#else .macro SOLVE_LT_2x2 +#endif xxpermdi vs0, vs32, vs33, 0 xxpermdi vs1, vs32, vs33, 3 @@ -3322,21 +3550,29 @@ stxsdx vs32, o0, T1 - xxswapd vs32, vs32 + XXSWAPD(vs32,vs32) stxsdx vs33, o8, T1 - xxswapd vs33, vs33 + XXSWAPD(vs33,vs33) stxsdx vs32, o0, T2 stxsdx vs33, o8, T2 +#if defined(_AIX) +') +#else .endm +#endif /*########################################################################################## SOLVE_LT 1x2 ##########################################################################################*/ +#if defined(_AIX) +define(`SOLVE_LT_1x2', ` +#else .macro SOLVE_LT_1x2 +#endif xxpermdi vs0, vs32, vs33, 0 @@ -3376,39 +3612,55 @@ stxsdx vs32, o0, T1 - xxswapd vs32, vs32 + XXSWAPD(vs32,vs32) stxsdx vs32, o0, T2 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`INIT_16x1', ` +#else .macro INIT_16x1 +#endif xxlxor vs0, vs0, vs0 - xvmovdp vs32, vs0 - xvmovdp vs33, vs0 - xvmovdp vs34, vs0 - xvmovdp vs35, vs0 - xvmovdp vs36, vs0 - xvmovdp vs37, vs0 - xvmovdp vs38, vs0 - xvmovdp vs39, vs0 - xvmovdp vs40, vs0 - xvmovdp vs41, vs0 - xvmovdp vs42, vs0 - xvmovdp vs43, vs0 - xvmovdp vs44, vs0 - xvmovdp vs45, vs0 - xvmovdp vs46, vs0 - xvmovdp vs47, vs0 + XVMOVDP(vs32,vs0) + XVMOVDP(vs33,vs0) + XVMOVDP(vs34,vs0) + XVMOVDP(vs35,vs0) + XVMOVDP(vs36,vs0) + XVMOVDP(vs37,vs0) + XVMOVDP(vs38,vs0) + XVMOVDP(vs39,vs0) + XVMOVDP(vs40,vs0) + XVMOVDP(vs41,vs0) + XVMOVDP(vs42,vs0) + XVMOVDP(vs43,vs0) + XVMOVDP(vs44,vs0) + XVMOVDP(vs45,vs0) + XVMOVDP(vs46,vs0) + XVMOVDP(vs47,vs0) +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL_16x1', ` +#else .macro KERNEL_16x1 +#endif lxvdsx vs0, o0, AO @@ -3461,27 +3713,43 @@ xvmaddadp vs47, vs15, vs16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`INIT_8x1', ` +#else .macro INIT_8x1 +#endif xxlxor vs0, vs0, vs0 - xvmovdp vs32, vs0 - xvmovdp vs33, vs0 - xvmovdp vs34, vs0 - xvmovdp vs35, vs0 - xvmovdp vs36, vs0 - xvmovdp vs37, vs0 - xvmovdp vs38, vs0 - xvmovdp vs39, vs0 + XVMOVDP(vs32,vs0) + XVMOVDP(vs33,vs0) + XVMOVDP(vs34,vs0) + XVMOVDP(vs35,vs0) + XVMOVDP(vs36,vs0) + XVMOVDP(vs37,vs0) + XVMOVDP(vs38,vs0) + XVMOVDP(vs39,vs0) +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL_8x1', ` +#else .macro KERNEL_8x1 +#endif lxvdsx vs0, o0, AO @@ -3512,23 +3780,39 @@ xvmaddadp vs39, vs7, vs16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`INIT_4x1', ` +#else .macro INIT_4x1 +#endif xxlxor vs0, vs0, vs0 - xvmovdp vs32, vs0 - xvmovdp vs33, vs0 - xvmovdp vs34, vs0 - xvmovdp vs35, vs0 + XVMOVDP(vs32,vs0) + XVMOVDP(vs33,vs0) + XVMOVDP(vs34,vs0) + XVMOVDP(vs35,vs0) +#if defined(_AIX) +') +#else 
.endm +#endif +#if defined(_AIX) +define(`KERNEL_4x1', ` +#else .macro KERNEL_4x1 +#endif lxvdsx vs0, o0, AO @@ -3548,21 +3832,37 @@ xvmaddadp vs35, vs3, vs16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`INIT_2x1', ` +#else .macro INIT_2x1 +#endif xxlxor vs0, vs0, vs0 - xvmovdp vs32, vs0 - xvmovdp vs33, vs0 + XVMOVDP(vs32,vs0) + XVMOVDP(vs33,vs0) +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL_2x1', ` +#else .macro KERNEL_2x1 +#endif lxvdsx vs0, o0, AO @@ -3578,20 +3878,36 @@ xvmaddadp vs33, vs1, vs16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`INIT_1x1', ` +#else .macro INIT_1x1 +#endif xxlxor vs0, vs0, vs0 - xvmovdp vs32, vs0 + XVMOVDP(vs32,vs0) +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL_1x1', ` +#else .macro KERNEL_1x1 +#endif lxvdsx vs0, o0, AO @@ -3605,31 +3921,39 @@ xvmaddadp vs32, vs0, vs16 +#if defined(_AIX) +') +#else .endm +#endif /*########################################################################################## SOLVE_LT 16x1 ##########################################################################################*/ +#if defined(_AIX) +define(`SOLVE_LT_16x1', ` +#else .macro SOLVE_LT_16x1 +#endif - xxswapd vs0, vs32 - xxswapd vs1, vs33 - xxswapd vs2, vs34 - xxswapd vs3, vs35 - xxswapd vs4, vs36 - xxswapd vs5, vs37 - xxswapd vs6, vs38 - xxswapd vs7, vs39 - xxswapd vs8, vs40 - xxswapd vs9, vs41 - xxswapd vs10, vs42 - xxswapd vs11, vs43 - xxswapd vs12, vs44 - xxswapd vs13, vs45 - xxswapd vs14, vs46 - xxswapd vs15, vs47 + XXSWAPD(vs0,vs32) + XXSWAPD(vs1,vs33) + XXSWAPD(vs2,vs34) + XXSWAPD(vs3,vs35) + XXSWAPD(vs4,vs36) + XXSWAPD(vs5,vs37) + XXSWAPD(vs6,vs38) + XXSWAPD(vs7,vs39) + XXSWAPD(vs8,vs40) + XXSWAPD(vs9,vs41) + XXSWAPD(vs10,vs42) + XXSWAPD(vs11,vs43) + XXSWAPD(vs12,vs44) + XXSWAPD(vs13,vs45) + XXSWAPD(vs14,vs46) + XXSWAPD(vs15,vs47) //############### LOAD B ####################### @@ -4215,23 +4539,31 @@ stxsdx vs46, o16, T1 stxsdx vs47, o24, T1 +#if defined(_AIX) +') +#else .endm +#endif /*########################################################################################## SOLVE_LT 8x1 ##########################################################################################*/ +#if defined(_AIX) +define(`SOLVE_LT_8x1', ` +#else .macro SOLVE_LT_8x1 +#endif - xxswapd vs0, vs32 - xxswapd vs1, vs33 - xxswapd vs2, vs34 - xxswapd vs3, vs35 - xxswapd vs4, vs36 - xxswapd vs5, vs37 - xxswapd vs6, vs38 - xxswapd vs7, vs39 + XXSWAPD(vs0,vs32) + XXSWAPD(vs1,vs33) + XXSWAPD(vs2,vs34) + XXSWAPD(vs3,vs35) + XXSWAPD(vs4,vs36) + XXSWAPD(vs5,vs37) + XXSWAPD(vs6,vs38) + XXSWAPD(vs7,vs39) //############### LOAD B ####################### @@ -4443,19 +4775,27 @@ stxsdx vs38, o16, T1 stxsdx vs39, o24, T1 +#if defined(_AIX) +') +#else .endm +#endif /*########################################################################################## SOLVE_LT 4x1 ##########################################################################################*/ +#if defined(_AIX) +define(`SOLVE_LT_4x1', ` +#else .macro SOLVE_LT_4x1 +#endif - xxswapd vs0, vs32 - xxswapd vs1, vs33 - xxswapd vs2, vs34 - xxswapd vs3, vs35 + XXSWAPD(vs0,vs32) + XXSWAPD(vs1,vs33) + XXSWAPD(vs2,vs34) + XXSWAPD(vs3,vs35) //############### LOAD B ####################### @@ -4546,17 +4886,25 @@ stxsdx vs34, o16, T1 stxsdx vs35, o24, T1 +#if defined(_AIX) +') +#else .endm +#endif /*########################################################################################## SOLVE_LT 2x1 
##########################################################################################*/ +#if defined(_AIX) +define(`SOLVE_LT_2x1', ` +#else .macro SOLVE_LT_2x1 +#endif - xxswapd vs0, vs32 - xxswapd vs1, vs33 + XXSWAPD(vs0,vs32) + XXSWAPD(vs1,vs33) //############### LOAD B ####################### @@ -4609,16 +4957,24 @@ stxsdx vs32, o0, T1 stxsdx vs33, o8, T1 +#if defined(_AIX) +') +#else .endm +#endif /*########################################################################################## SOLVE_LT 1x1 ##########################################################################################*/ +#if defined(_AIX) +define(`SOLVE_LT_1x1', ` +#else .macro SOLVE_LT_1x1 +#endif - xxswapd vs0, vs32 + XXSWAPD(vs0,vs32) //############### LOAD B ####################### @@ -4655,5 +5011,9 @@ stxsdx vs32, o0, T1 +#if defined(_AIX) +') +#else .endm +#endif diff --git a/kernel/power/idamax.c b/kernel/power/idamax.c index 95aa592c7..195a8c68e 100644 --- a/kernel/power/idamax.c +++ b/kernel/power/idamax.c @@ -58,8 +58,8 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) { "lxvd2x 47, %[i48],%[ptr_tmp] \n\t" "lxvd2x 48, %[i64],%[ptr_tmp] \n\t" "lxvd2x 49, %[i80],%[ptr_tmp] \n\t" - "lxvd2x 50, %[i96],%[ptr_tmp] \n\t" - "lxvd2x 51,%[i112],%[ptr_tmp] \n\t" + "lxvd2x 6, %[i96],%[ptr_tmp] \n\t" + "lxvd2x 7,%[i112],%[ptr_tmp] \n\t" "xxlor 40,%x[start],%x[start] \n\t" //{ 1,0} vs40 | v8 "vaddudm 9,8,%[adder] \n\t" //{3,2} vs41 @@ -69,7 +69,7 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) { "vaddudm 11,10,%[adder] \n\t" //{7,6} vs43 "xxlxor 39,39,39 \n\t" // vs39 vec_max_value "vaddudm 4,11, %[adder] \n\t" // {9,8} -{8;8} vs36 | v4 - "xxspltd 36,36,0 \n\t" + XXSPLTD_S(36,36,0) "xvabsdp 44, 44 \n\t" "xvabsdp 45, 45 \n\t" @@ -77,21 +77,21 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) { "xvabsdp 47, 47 \n\t" "xvabsdp 48, 48 \n\t" "xvabsdp 49, 49 \n\t" - "xvabsdp 50, 50 \n\t" - "xvabsdp 51, 51 \n\t" + "xvabsdp 6, 6 \n\t" + "xvabsdp 7, 7 \n\t" //jump first half forward - "b 2f \n\t" + "b two%= \n\t" //=================================================================== - ".p2align 5 \n\t" + ".align 5 \n\t" - "1: \n\t" + "one%=: \n\t" "xvcmpgtdp 2,45,44 \n\t " "xvcmpgtdp 3,47,46 \n\t " "xvcmpgtdp 4,49,48 \n\t " - "xvcmpgtdp 5,51,50 \n\t" + "xvcmpgtdp 5,7,6 \n\t" "xxsel 32,40,41,2 \n\t" "xxsel 0,44,45,2 \n\t" @@ -100,7 +100,7 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) { "xxsel 34,40,41,4 \n\t" "xxsel 45,48,49,4 \n\t" "xxsel 35,42,43,5 \n\t" - "xxsel 47,50,51,5 \n\t" + "xxsel 47,6,7,5 \n\t" "xvcmpgtdp 2, 1,0 \n\t" "xvcmpgtdp 3,47, 45 \n\t" @@ -134,8 +134,8 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) { "vaddudm 1,1,5 \n\t" // get real index for first bigger - "lxvd2x 50, %[i96],%[ptr_tmp] \n\t" - "lxvd2x 51,%[i112],%[ptr_tmp] \n\t" + "lxvd2x 6, %[i96],%[ptr_tmp] \n\t" + "lxvd2x 7,%[i112],%[ptr_tmp] \n\t" //compare with previous to get vec_max_index(v6 | vs38 ) and vec_max_value (vs39) "xvcmpgtdp 2, 3,39 \n\t" @@ -155,16 +155,16 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) { "xvabsdp 48, 48 \n\t" "xvabsdp 49, 49 \n\t" - "xvabsdp 50, 50 \n\t" - "xvabsdp 51, 51 \n\t" + "xvabsdp 6, 6 \n\t" + "xvabsdp 7, 7 \n\t" //<-----------jump here from first load - "2: \n\t" + "two%=: \n\t" "xvcmpgtdp 2,45,44 \n\t " "xvcmpgtdp 3,47,46 \n\t " "xvcmpgtdp 4,49,48 \n\t " - "xvcmpgtdp 5,51,50 \n\t" + "xvcmpgtdp 5,7,6 \n\t" "xxsel 32,40,41,2 \n\t" "xxsel 0,44,45,2 \n\t" @@ -173,7 +173,7 @@ 
static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) { "xxsel 34,40,41,4 \n\t" "xxsel 45,48,49,4 \n\t" "xxsel 35,42,43,5 \n\t" - "xxsel 47,50,51,5 \n\t" + "xxsel 47,6,7,5 \n\t" "xvcmpgtdp 2, 1,0 \n\t" "xvcmpgtdp 3,47, 45 \n\t" @@ -203,8 +203,8 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) { "vaddudm 1,1,5 \n\t" // get real index for first bigger - "lxvd2x 50, %[i96],%[ptr_tmp] \n\t" - "lxvd2x 51,%[i112],%[ptr_tmp] \n\t" + "lxvd2x 6, %[i96],%[ptr_tmp] \n\t" + "lxvd2x 7,%[i112],%[ptr_tmp] \n\t" @@ -226,21 +226,21 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) { "xvabsdp 48, 48 \n\t" "xvabsdp 49, 49 \n\t" - "xvabsdp 50, 50 \n\t" - "xvabsdp 51, 51 \n\t" + "xvabsdp 6, 6 \n\t" + "xvabsdp 7, 7 \n\t" //decrement n "addic. %[n], %[n], -32 \n\t" //Loop back if >0 - "bgt+ 1b \n\t" + "bgt+ one%= \n\t" //============================================================================== "xvcmpgtdp 2,45,44 \n\t " "xvcmpgtdp 3,47,46 \n\t " "xvcmpgtdp 4,49,48 \n\t " - "xvcmpgtdp 5,51,50 \n\t" + "xvcmpgtdp 5,7,6 \n\t" "xxsel 32,40,41,2 \n\t" "xxsel 0,44,45,2 \n\t" @@ -249,7 +249,7 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) { "xxsel 34,40,41,4 \n\t" "xxsel 45,48,49,4 \n\t" "xxsel 35,42,43,5 \n\t" - "xxsel 47,50,51,5 \n\t" + "xxsel 47,6,7,5 \n\t" "xvcmpgtdp 2, 1,0 \n\t" "xvcmpgtdp 3,47, 45 \n\t" @@ -276,28 +276,28 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) { ///////extract max value and max index from vector - "xxspltd 32,38,1 \n\t" - "xxspltd 40,39,1 \n\t" + XXSPLTD_S(32,38,1) + XXSPLTD_S(40,39,1) "xvcmpeqdp. 2, 40,39 \n\t" //cr6 0 bit set if all true, cr6=4*6+bit_ind=24,0011at CR(BI)==1, at=10 hint that it occurs rarely //0b001110=14 - "bc 14,24, 3f \n\t" + "bc 14,24, three%= \n\t" "xvcmpgtdp 4, 40,39 \n\t" "xxsel 0,39,40,4 \n\t" "xxsel 1,38,32,4 \n\t" "stxsdx 0,0,%[ptr_maxf] \n\t" - "b 4f \n\t" + "b four%= \n\t" - "3: \n\t" + "three%=: \n\t" //if elements value are equal then choose minimum index - "xxspltd 0,40,0 \n\t" + XXSPLTD_S(0,40,0) "vminud 0,0,6 \n\t" //vs32 vs38 "xxlor 1,32,32 \n\t" "stxsdx 0,0,%[ptr_maxf] \n\t" - "4: \n\t" + "four%=: \n\t" "mfvsrd %[index],1 \n\t" : [maxf] "=m"(*maxf),[ptr_tmp] "+&b"(x),[index] "=r"(index), [n] "+&r"(n) @@ -306,7 +306,7 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) { [i64] "b"(64), [i80] "b"(80), [i96] "b"(96), [i112] "b"(112), [start] "v"(start), [adder] "v"(temp_add_index) : "cc", "vs0", "vs1","vs2","vs3", "vs4","vs5","vs32", "vs33", "vs34", "vs35", "vs36", - "vs37", "vs38", "vs39", "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49", "vs50", "vs51" + "vs37", "vs38", "vs39", "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49", "vs6", "vs7" ); diff --git a/kernel/power/idamin.c b/kernel/power/idamin.c index 323f9987e..8a5538821 100644 --- a/kernel/power/idamin.c +++ b/kernel/power/idamin.c @@ -58,8 +58,8 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { "lxvd2x 47, %[i48],%[ptr_tmp] \n\t" "lxvd2x 48, %[i64],%[ptr_tmp] \n\t" "lxvd2x 49, %[i80],%[ptr_tmp] \n\t" - "lxvd2x 50, %[i96],%[ptr_tmp] \n\t" - "lxvd2x 51,%[i112],%[ptr_tmp] \n\t" + "lxvd2x 6, %[i96],%[ptr_tmp] \n\t" + "lxvd2x 7,%[i112],%[ptr_tmp] \n\t" "xxlor 40,%x[start],%x[start] \n\t" //{ 1,0} vs40 | v8 "vaddudm 9,8, %[adder] \n\t" //{3,2} vs41 @@ -69,7 +69,7 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { "vaddudm 11,10,%[adder] \n\t" //{7,6} vs43 "lxvdsx 
39,0,%[ptr_minf] \n\t" // vs39 vec_min_value "vaddudm 4,11, %[adder] \n\t" // {9,8} -{8;8} vs36 | v4 - "xxspltd 36,36,0 \n\t" + XXSPLTD_S(36,36,0) "xvabsdp 39, 39 \n\t" "xvabsdp 44, 44 \n\t" @@ -78,21 +78,21 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { "xvabsdp 47, 47 \n\t" "xvabsdp 48, 48 \n\t" "xvabsdp 49, 49 \n\t" - "xvabsdp 50, 50 \n\t" - "xvabsdp 51, 51 \n\t" + "xvabsdp 6, 6 \n\t" + "xvabsdp 7, 7 \n\t" //jump first half forward - "b 2f \n\t" + "b two%= \n\t" //=================================================================== - ".p2align 5 \n\t" + ".align 5 \n\t" - "1: \n\t" + "one%=: \n\t" "xvcmpgtdp 2,44,45 \n\t " "xvcmpgtdp 3,46,47 \n\t " "xvcmpgtdp 4,48,49 \n\t " - "xvcmpgtdp 5,50,51 \n\t" + "xvcmpgtdp 5,6,7 \n\t" "xxsel 32,40,41,2 \n\t" "xxsel 0,44,45,2 \n\t" @@ -101,7 +101,7 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { "xxsel 34,40,41,4 \n\t" "xxsel 45,48,49,4 \n\t" "xxsel 35,42,43,5 \n\t" - "xxsel 47,50,51,5 \n\t" + "xxsel 47,6,7,5 \n\t" "xvcmpgtdp 2,0, 1 \n\t" "xvcmpgtdp 3, 45,47 \n\t" @@ -135,8 +135,8 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { "vaddudm 1,1,5 \n\t" // get real index for first smaller - "lxvd2x 50, %[i96],%[ptr_tmp] \n\t" - "lxvd2x 51,%[i112],%[ptr_tmp] \n\t" + "lxvd2x 6, %[i96],%[ptr_tmp] \n\t" + "lxvd2x 7,%[i112],%[ptr_tmp] \n\t" //compare with previous to get vec_min_index(v6 | vs38 ) and vec_min_value (vs39) "xvcmpgtdp 2,39, 3 \n\t" @@ -156,16 +156,16 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { "xvabsdp 48, 48 \n\t" "xvabsdp 49, 49 \n\t" - "xvabsdp 50, 50 \n\t" - "xvabsdp 51, 51 \n\t" + "xvabsdp 6, 6 \n\t" + "xvabsdp 7, 7 \n\t" //<-----------jump here from first load - "2: \n\t" + "two%=: \n\t" "xvcmpgtdp 2,44,45 \n\t " "xvcmpgtdp 3,46,47 \n\t " "xvcmpgtdp 4,48,49 \n\t " - "xvcmpgtdp 5,50,51 \n\t" + "xvcmpgtdp 5,6,7 \n\t" "xxsel 32,40,41,2 \n\t" "xxsel 0,44,45,2 \n\t" @@ -174,7 +174,7 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { "xxsel 34,40,41,4 \n\t" "xxsel 45,48,49,4 \n\t" "xxsel 35,42,43,5 \n\t" - "xxsel 47,50,51,5 \n\t" + "xxsel 47,6,7,5 \n\t" "xvcmpgtdp 2,0, 1 \n\t" "xvcmpgtdp 3, 45,47 \n\t" @@ -204,8 +204,8 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { "vaddudm 1,1,5 \n\t" // get real index for first smaller - "lxvd2x 50, %[i96],%[ptr_tmp] \n\t" - "lxvd2x 51,%[i112],%[ptr_tmp] \n\t" + "lxvd2x 6, %[i96],%[ptr_tmp] \n\t" + "lxvd2x 7,%[i112],%[ptr_tmp] \n\t" @@ -227,21 +227,21 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { "xvabsdp 48, 48 \n\t" "xvabsdp 49, 49 \n\t" - "xvabsdp 50, 50 \n\t" - "xvabsdp 51, 51 \n\t" + "xvabsdp 6, 6 \n\t" + "xvabsdp 7, 7 \n\t" //decrement n "addic. 
%[n], %[n], -32 \n\t" //Loop back if >0 - "bgt+ 1b \n\t" + "bgt+ one%= \n\t" //============================================================================== "xvcmpgtdp 2,44,45 \n\t " "xvcmpgtdp 3,46,47 \n\t " "xvcmpgtdp 4,48,49 \n\t " - "xvcmpgtdp 5,50,51 \n\t" + "xvcmpgtdp 5,6,7 \n\t" "xxsel 32,40,41,2 \n\t" "xxsel 0,44,45,2 \n\t" @@ -250,7 +250,7 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { "xxsel 34,40,41,4 \n\t" "xxsel 45,48,49,4 \n\t" "xxsel 35,42,43,5 \n\t" - "xxsel 47,50,51,5 \n\t" + "xxsel 47,6,7,5 \n\t" "xvcmpgtdp 2,0, 1 \n\t" "xvcmpgtdp 3, 45,47 \n\t" @@ -277,28 +277,28 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { ///////extract min value and min index from vector - "xxspltd 32,38,1 \n\t" - "xxspltd 40,39,1 \n\t" + XXSPLTD_S(32,38,1) + XXSPLTD_S(40,39,1) "xvcmpeqdp. 2, 40,39 \n\t" //cr6 0 bit set if all true, cr6=4*6+bit_ind=24,0011at CR(BI)==1, at=10 hint that it occurs rarely //0b001110=14 - "bc 14,24, 3f \n\t" + "bc 14,24, three%= \n\t" "xvcmpgtdp 4,39, 40 \n\t" "xxsel 0,39,40,4 \n\t" "xxsel 1,38,32,4 \n\t" "stxsdx 0,0,%[ptr_minf] \n\t" - "b 4f \n\t" + "b four%= \n\t" - "3: \n\t" + "three%=: \n\t" //if elements value are equal then choose minimum index - "xxspltd 0,40,0 \n\t" + XXSPLTD_S(0,40,0) "vminud 0,0,6 \n\t" //vs32 vs38 "xxlor 1,32,32 \n\t" "stxsdx 0,0,%[ptr_minf] \n\t" - "4: \n\t" + "four%=: \n\t" "mfvsrd %[index],1 \n\t" : [minf] "=m"(*minf),[ptr_tmp] "+&b"(x),[index] "=r"(index), [n] "+&r"(n) @@ -307,7 +307,7 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { [i64] "b"(64), [i80] "b"(80), [i96] "b"(96), [i112] "b"(112), [start] "v"(start), [adder] "v"(temp_add_index) : "cc", "vs0", "vs1","vs2","vs3", "vs4","vs5","vs32", "vs33", "vs34", "vs35", "vs36", - "vs37", "vs38", "vs39", "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49", "vs50", "vs51" + "vs37", "vs38", "vs39", "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49", "vs6", "vs7" ); return index; diff --git a/kernel/power/izamax.c b/kernel/power/izamax.c index 3c132f81a..7149da28b 100644 --- a/kernel/power/izamax.c +++ b/kernel/power/izamax.c @@ -56,8 +56,8 @@ static BLASLONG ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) { "lxvd2x 47, %[i48],%[ptr_tmp] \n\t" "lxvd2x 48, %[i64],%[ptr_tmp] \n\t" "lxvd2x 49, %[i80],%[ptr_tmp] \n\t" - "lxvd2x 50, %[i96],%[ptr_tmp] \n\t" - "lxvd2x 51,%[i112],%[ptr_tmp] \n\t" + "lxvd2x 6, %[i96],%[ptr_tmp] \n\t" + "lxvd2x 7,%[i112],%[ptr_tmp] \n\t" "xxlor 40,%x[start],%x[start] \n\t" //{ 1,0} vs40 | v8 "vaddudm 9,8,%[adder] \n\t" //{3,2} vs41 @@ -67,7 +67,7 @@ static BLASLONG ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) { "vaddudm 11,10,%[adder] \n\t" //{7,6} vs43 "xxlxor 39,39,39 \n\t" // vs39 vec_max_value is zero "vaddudm 4,11, %[adder] \n\t" // {9,8} -{8;8} vs36 | v4 - "xxspltd 36,36,0 \n\t" + XXSPLTD_S(36,36,0) @@ -77,24 +77,24 @@ static BLASLONG ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) { "xvabsdp 47, 47 \n\t" "xvabsdp 48, 48 \n\t" "xvabsdp 49, 49 \n\t" - "xvabsdp 50, 50 \n\t" - "xvabsdp 51, 51 \n\t" + "xvabsdp 6, 6 \n\t" + "xvabsdp 7, 7 \n\t" //jump first half forward - "b 2f \n\t" + "b two%= \n\t" - ".p2align 5 \n\t" - "1: \n\t" + ".align 5 \n\t" + "one%=: \n\t" - "xxmrghd 0,44,45 \n\t" - "xxmrgld 1,44,45 \n\t" - "xxmrghd 2,46,47 \n\t" - "xxmrgld 3,46,47 \n\t" - "xxmrghd 4,48,49 \n\t" - "xxmrgld 5,48,49 \n\t" - "xxmrghd 44,50,51 \n\t" - "xxmrgld 45,50,51 \n\t" + XXMRGHD_S(0,44,45) + XXMRGLD_S(1,44,45) + XXMRGHD_S(2,46,47) + 
XXMRGLD_S(3,46,47) + XXMRGHD_S(4,48,49) + XXMRGLD_S(5,48,49) + XXMRGHD_S(44,6,7) + XXMRGLD_S(45,6,7) "xvadddp 46, 0,1 \n\t" "xvadddp 47, 2,3 \n\t" @@ -103,15 +103,15 @@ static BLASLONG ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) { - "xvcmpgtdp 50,47,46 \n\t " - "xvcmpgtdp 51,49,48 \n\t " + "xvcmpgtdp 6,47,46 \n\t " + "xvcmpgtdp 7,49,48 \n\t " "addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t" - "xxsel 32,40,41,50 \n\t" - "xxsel 0,46,47,50 \n\t" - "xxsel 33,42,43,51 \n\t" - "xxsel 1,48,49,51 \n\t" + "xxsel 32,40,41,6 \n\t" + "xxsel 0,46,47,6 \n\t" + "xxsel 33,42,43,7 \n\t" + "xxsel 1,48,49,7 \n\t" "lxvd2x 44, 0,%[ptr_tmp] \n\t" "lxvd2x 45, %[i16],%[ptr_tmp] \n\t" @@ -133,8 +133,8 @@ static BLASLONG ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) { "lxvd2x 48, %[i64],%[ptr_tmp] \n\t" "lxvd2x 49, %[i80],%[ptr_tmp] \n\t" - "lxvd2x 50, %[i96],%[ptr_tmp] \n\t" - "lxvd2x 51,%[i112],%[ptr_tmp] \n\t" + "lxvd2x 6, %[i96],%[ptr_tmp] \n\t" + "lxvd2x 7,%[i112],%[ptr_tmp] \n\t" //select with previous "xxsel 38,38,32,4 \n\t" "xxsel 39,39,3,4 \n\t" @@ -148,35 +148,35 @@ static BLASLONG ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) { "xvabsdp 47, 47 \n\t" "xvabsdp 48, 48 \n\t" "xvabsdp 49, 49 \n\t" - "xvabsdp 50, 50 \n\t" - "xvabsdp 51, 51 \n\t" + "xvabsdp 6, 6 \n\t" + "xvabsdp 7, 7 \n\t" //>>/////////////////////////////// half start - "2: \n\t" - "xxmrghd 0,44,45 \n\t" - "xxmrgld 1,44,45 \n\t" - "xxmrghd 2,46,47 \n\t" - "xxmrgld 3,46,47 \n\t" - "xxmrghd 4,48,49 \n\t" - "xxmrgld 5,48,49 \n\t" - "xxmrghd 44,50,51 \n\t" - "xxmrgld 45,50,51 \n\t" + "two%=: \n\t" + XXMRGHD_S(0,44,45) + XXMRGLD_S(1,44,45) + XXMRGHD_S(2,46,47) + XXMRGLD_S(3,46,47) + XXMRGHD_S(4,48,49) + XXMRGLD_S(5,48,49) + XXMRGHD_S(44,6,7) + XXMRGLD_S(45,6,7) "xvadddp 46, 0,1 \n\t" "xvadddp 47, 2,3 \n\t" "xvadddp 48, 4,5 \n\t" "xvadddp 49, 44,45 \n\t" - "xvcmpgtdp 50,47,46 \n\t " - "xvcmpgtdp 51,49,48 \n\t " + "xvcmpgtdp 6,47,46 \n\t " + "xvcmpgtdp 7,49,48 \n\t " "addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t" - "xxsel 32,40,41,50 \n\t" - "xxsel 0,46,47,50 \n\t" - "xxsel 33,42,43,51 \n\t" - "xxsel 1,48,49,51 \n\t" + "xxsel 32,40,41,6 \n\t" + "xxsel 0,46,47,6 \n\t" + "xxsel 33,42,43,7 \n\t" + "xxsel 1,48,49,7 \n\t" "lxvd2x 44, 0,%[ptr_tmp] \n\t" "lxvd2x 45, %[i16],%[ptr_tmp] \n\t" @@ -198,8 +198,8 @@ static BLASLONG ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) { "lxvd2x 48, %[i64],%[ptr_tmp] \n\t" "lxvd2x 49, %[i80],%[ptr_tmp] \n\t" - "lxvd2x 50, %[i96],%[ptr_tmp] \n\t" - "lxvd2x 51,%[i112],%[ptr_tmp] \n\t" + "lxvd2x 6, %[i96],%[ptr_tmp] \n\t" + "lxvd2x 7,%[i112],%[ptr_tmp] \n\t" //select with previous "xxsel 38,38,32,4 \n\t" "xxsel 39,39,3,4 \n\t" @@ -211,24 +211,24 @@ static BLASLONG ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) { "xvabsdp 47, 47 \n\t" "xvabsdp 48, 48 \n\t" "xvabsdp 49, 49 \n\t" - "xvabsdp 50, 50 \n\t" - "xvabsdp 51, 51 \n\t" + "xvabsdp 6, 6 \n\t" + "xvabsdp 7, 7 \n\t" //decrement n "addic. 
%[n], %[n], -16 \n\t" //Loop back if >0 - "bgt+ 1b \n\t" + "bgt+ one%= \n\t" - "xxmrghd 0,44,45 \n\t" - "xxmrgld 1,44,45 \n\t" - "xxmrghd 2,46,47 \n\t" - "xxmrgld 3,46,47 \n\t" - "xxmrghd 4,48,49 \n\t" - "xxmrgld 5,48,49 \n\t" - "xxmrghd 44,50,51 \n\t" - "xxmrgld 45,50,51 \n\t" + XXMRGHD_S(0,44,45) + XXMRGLD_S(1,44,45) + XXMRGHD_S(2,46,47) + XXMRGLD_S(3,46,47) + XXMRGHD_S(4,48,49) + XXMRGLD_S(5,48,49) + XXMRGHD_S(44,6,7) + XXMRGLD_S(45,6,7) "xvadddp 46, 0,1 \n\t" "xvadddp 47, 2,3 \n\t" @@ -237,13 +237,13 @@ static BLASLONG ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) { - "xvcmpgtdp 50,47,46 \n\t " - "xvcmpgtdp 51,49,48 \n\t " + "xvcmpgtdp 6,47,46 \n\t " + "xvcmpgtdp 7,49,48 \n\t " - "xxsel 32,40,41,50 \n\t" - "xxsel 0,46,47,50 \n\t" - "xxsel 33,42,43,51 \n\t" - "xxsel 1,48,49,51 \n\t" + "xxsel 32,40,41,6 \n\t" + "xxsel 0,46,47,6 \n\t" + "xxsel 33,42,43,7 \n\t" + "xxsel 1,48,49,7 \n\t" "xvcmpgtdp 2,1,0 \n\t " "xxsel 32,32,33,2 \n\t" @@ -262,28 +262,28 @@ static BLASLONG ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) { ///////extract max value and max index from vector - "xxspltd 32,38,1 \n\t" - "xxspltd 40,39,1 \n\t" + XXSPLTD_S(32,38,1) + XXSPLTD_S(40,39,1) "xvcmpeqdp. 2, 40,39 \n\t" //cr6 0 bit set if all true, cr6=4*6+bit_ind=24,0011at CR(BI)==1, at=10 hint that it occurs rarely //0b001110=14 - "bc 14,24, 3f \n\t" + "bc 14,24, three%= \n\t" "xvcmpgtdp 4, 40,39 \n\t" "xxsel 0,39,40,4 \n\t" "xxsel 1,38,32,4 \n\t" "stxsdx 0,0,%[ptr_maxf] \n\t" - "b 4f \n\t" + "b four%= \n\t" - "3: \n\t" + "three%=: \n\t" //if elements value are equal then choose minimum index - "xxspltd 0,40,0 \n\t" + XXSPLTD_S(0,40,0) "vminud 0,0,6 \n\t" //vs32 vs38 "xxlor 1,32,32 \n\t" "stxsdx 0,0,%[ptr_maxf] \n\t" - "4: \n\t" + "four%=: \n\t" "mfvsrd %[index],1 \n\t" : [maxf] "=m"(*maxf),[ptr_tmp] "+&b"(x),[index] "=r"(index), [n] "+&r"(n) @@ -292,7 +292,7 @@ static BLASLONG ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) { [i64] "b"(64), [i80] "b"(80), [i96] "b"(96), [i112] "b"(112), [start] "v"(start), [adder] "v"(temp_add_index) : "cc", "vs0", "vs1","vs2","vs3", "vs4","vs5","vs32", "vs33", "vs34", "vs35", "vs36", - "vs37", "vs38", "vs39", "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49", "vs50", "vs51" + "vs37", "vs38", "vs39", "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49", "vs6", "vs7" ); return index; diff --git a/kernel/power/izamin.c b/kernel/power/izamin.c index 06a5537d8..692315b89 100644 --- a/kernel/power/izamin.c +++ b/kernel/power/izamin.c @@ -54,8 +54,8 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { "lxvd2x 47, %[i48],%[ptr_tmp] \n\t" "lxvd2x 48, %[i64],%[ptr_tmp] \n\t" "lxvd2x 49, %[i80],%[ptr_tmp] \n\t" - "lxvd2x 50, %[i96],%[ptr_tmp] \n\t" - "lxvd2x 51,%[i112],%[ptr_tmp] \n\t" + "lxvd2x 6, %[i96],%[ptr_tmp] \n\t" + "lxvd2x 7,%[i112],%[ptr_tmp] \n\t" "xxlor 40,%x[start],%x[start] \n\t" //{ 1,0} vs40 | v8 "vaddudm 9,8,%[adder] \n\t" //{3,2} vs41 @@ -65,7 +65,7 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { "vaddudm 11,10,%[adder] \n\t" //{7,6} vs43 "lxvdsx 39,0,%[ptr_minf] \n\t" // vs39 vec_min_value "vaddudm 4,11, %[adder] \n\t" // {9,8} -{8;8} vs36 | v4 - "xxspltd 36,36,0 \n\t" + XXSPLTD_S(36,36,0) @@ -75,24 +75,24 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { "xvabsdp 47, 47 \n\t" "xvabsdp 48, 48 \n\t" "xvabsdp 49, 49 \n\t" - "xvabsdp 50, 50 \n\t" - "xvabsdp 51, 51 \n\t" + "xvabsdp 6, 6 \n\t" + "xvabsdp 7, 7 \n\t" //jump first 
half forward - "b 2f \n\t" + "b two%= \n\t" - ".p2align 5 \n\t" - "1: \n\t" + ".align 5 \n\t" + "one%=: \n\t" - "xxmrghd 0,44,45 \n\t" - "xxmrgld 1,44,45 \n\t" - "xxmrghd 2,46,47 \n\t" - "xxmrgld 3,46,47 \n\t" - "xxmrghd 4,48,49 \n\t" - "xxmrgld 5,48,49 \n\t" - "xxmrghd 44,50,51 \n\t" - "xxmrgld 45,50,51 \n\t" + XXMRGHD_S(0,44,45) + XXMRGLD_S(1,44,45) + XXMRGHD_S(2,46,47) + XXMRGLD_S(3,46,47) + XXMRGHD_S(4,48,49) + XXMRGLD_S(5,48,49) + XXMRGHD_S(44,6,7) + XXMRGLD_S(45,6,7) "xvadddp 46, 0,1 \n\t" "xvadddp 47, 2,3 \n\t" @@ -101,15 +101,15 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { - "xvcmpgtdp 50,46,47 \n\t " - "xvcmpgtdp 51,48,49 \n\t " + "xvcmpgtdp 6,46,47 \n\t " + "xvcmpgtdp 7,48,49 \n\t " "addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t" - "xxsel 32,40,41,50 \n\t" - "xxsel 0,46,47,50 \n\t" - "xxsel 33,42,43,51 \n\t" - "xxsel 1,48,49,51 \n\t" + "xxsel 32,40,41,6 \n\t" + "xxsel 0,46,47,6 \n\t" + "xxsel 33,42,43,7 \n\t" + "xxsel 1,48,49,7 \n\t" "lxvd2x 44, 0,%[ptr_tmp] \n\t" "lxvd2x 45, %[i16],%[ptr_tmp] \n\t" @@ -131,8 +131,8 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { "lxvd2x 48, %[i64],%[ptr_tmp] \n\t" "lxvd2x 49, %[i80],%[ptr_tmp] \n\t" - "lxvd2x 50, %[i96],%[ptr_tmp] \n\t" - "lxvd2x 51,%[i112],%[ptr_tmp] \n\t" + "lxvd2x 6, %[i96],%[ptr_tmp] \n\t" + "lxvd2x 7,%[i112],%[ptr_tmp] \n\t" //select with previous "xxsel 38,38,32,4 \n\t" "xxsel 39,39,3,4 \n\t" @@ -146,35 +146,35 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { "xvabsdp 47, 47 \n\t" "xvabsdp 48, 48 \n\t" "xvabsdp 49, 49 \n\t" - "xvabsdp 50, 50 \n\t" - "xvabsdp 51, 51 \n\t" + "xvabsdp 6, 6 \n\t" + "xvabsdp 7, 7 \n\t" //>>/////////////////////////////// half start - "2: \n\t" - "xxmrghd 0,44,45 \n\t" - "xxmrgld 1,44,45 \n\t" - "xxmrghd 2,46,47 \n\t" - "xxmrgld 3,46,47 \n\t" - "xxmrghd 4,48,49 \n\t" - "xxmrgld 5,48,49 \n\t" - "xxmrghd 44,50,51 \n\t" - "xxmrgld 45,50,51 \n\t" + "two%=: \n\t" + XXMRGHD_S(0,44,45) + XXMRGLD_S(1,44,45) + XXMRGHD_S(2,46,47) + XXMRGLD_S(3,46,47) + XXMRGHD_S(4,48,49) + XXMRGLD_S(5,48,49) + XXMRGHD_S(44,6,7) + XXMRGLD_S(45,6,7) "xvadddp 46, 0,1 \n\t" "xvadddp 47, 2,3 \n\t" "xvadddp 48, 4,5 \n\t" "xvadddp 49, 44,45 \n\t" - "xvcmpgtdp 50,46,47 \n\t " - "xvcmpgtdp 51,48,49 \n\t " + "xvcmpgtdp 6,46,47 \n\t " + "xvcmpgtdp 7,48,49 \n\t " "addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t" - "xxsel 32,40,41,50 \n\t" - "xxsel 0,46,47,50 \n\t" - "xxsel 33,42,43,51 \n\t" - "xxsel 1,48,49,51 \n\t" + "xxsel 32,40,41,6 \n\t" + "xxsel 0,46,47,6 \n\t" + "xxsel 33,42,43,7 \n\t" + "xxsel 1,48,49,7 \n\t" "lxvd2x 44, 0,%[ptr_tmp] \n\t" "lxvd2x 45, %[i16],%[ptr_tmp] \n\t" @@ -196,8 +196,8 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { "lxvd2x 48, %[i64],%[ptr_tmp] \n\t" "lxvd2x 49, %[i80],%[ptr_tmp] \n\t" - "lxvd2x 50, %[i96],%[ptr_tmp] \n\t" - "lxvd2x 51,%[i112],%[ptr_tmp] \n\t" + "lxvd2x 6, %[i96],%[ptr_tmp] \n\t" + "lxvd2x 7,%[i112],%[ptr_tmp] \n\t" //select with previous "xxsel 38,38,32,4 \n\t" "xxsel 39,39,3,4 \n\t" @@ -209,24 +209,24 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { "xvabsdp 47, 47 \n\t" "xvabsdp 48, 48 \n\t" "xvabsdp 49, 49 \n\t" - "xvabsdp 50, 50 \n\t" - "xvabsdp 51, 51 \n\t" + "xvabsdp 6, 6 \n\t" + "xvabsdp 7, 7 \n\t" //decrement n "addic. 
%[n], %[n], -16 \n\t" //Loop back if >0 - "bgt+ 1b \n\t" + "bgt+ one%= \n\t" - "xxmrghd 0,44,45 \n\t" - "xxmrgld 1,44,45 \n\t" - "xxmrghd 2,46,47 \n\t" - "xxmrgld 3,46,47 \n\t" - "xxmrghd 4,48,49 \n\t" - "xxmrgld 5,48,49 \n\t" - "xxmrghd 44,50,51 \n\t" - "xxmrgld 45,50,51 \n\t" + XXMRGHD_S(0,44,45) + XXMRGLD_S(1,44,45) + XXMRGHD_S(2,46,47) + XXMRGLD_S(3,46,47) + XXMRGHD_S(4,48,49) + XXMRGLD_S(5,48,49) + XXMRGHD_S(44,6,7) + XXMRGLD_S(45,6,7) "xvadddp 46, 0,1 \n\t" "xvadddp 47, 2,3 \n\t" @@ -235,13 +235,13 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { - "xvcmpgtdp 50,46,47 \n\t " - "xvcmpgtdp 51,48,49 \n\t " + "xvcmpgtdp 6,46,47 \n\t " + "xvcmpgtdp 7,48,49 \n\t " - "xxsel 32,40,41,50 \n\t" - "xxsel 0,46,47,50 \n\t" - "xxsel 33,42,43,51 \n\t" - "xxsel 1,48,49,51 \n\t" + "xxsel 32,40,41,6 \n\t" + "xxsel 0,46,47,6 \n\t" + "xxsel 33,42,43,7 \n\t" + "xxsel 1,48,49,7 \n\t" "xvcmpgtdp 2,0,1 \n\t " "xxsel 32,32,33,2 \n\t" @@ -260,28 +260,28 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { ///////extract min value and min index from vector - "xxspltd 32,38,1 \n\t" - "xxspltd 40,39,1 \n\t" + XXSPLTD_S(32,38,1) + XXSPLTD_S(40,39,1) "xvcmpeqdp. 2, 40,39 \n\t" //cr6 0 bit set if all true, cr6=4*6+bit_ind=24,0011at CR(BI)==1, at=10 hint that it occurs rarely //0b001110=14 - "bc 14,24, 3f \n\t" + "bc 14,24, three%= \n\t" "xvcmpgtdp 4,39, 40 \n\t" "xxsel 0,39,40,4 \n\t" "xxsel 1,38,32,4 \n\t" "stxsdx 0,0,%[ptr_minf] \n\t" - "b 4f \n\t" + "b four%= \n\t" - "3: \n\t" + "three%=: \n\t" //if elements value are equal then choose minimum index - "xxspltd 0,40,0 \n\t" + XXSPLTD_S(0,40,0) "vminud 0,0,6 \n\t" //vs32 vs38 "xxlor 1,32,32 \n\t" "stxsdx 0,0,%[ptr_minf] \n\t" - "4: \n\t" + "four%=: \n\t" "mfvsrd %[index],1 \n\t" : [minf] "=m"(*minf),[ptr_tmp] "+&b"(x),[index] "=r"(index), [n] "+&r"(n) @@ -290,7 +290,7 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { [i64] "b"(64), [i80] "b"(80), [i96] "b"(96), [i112] "b"(112), [start] "v"(start), [adder] "v"(temp_add_index) : "cc", "vs0", "vs1","vs2","vs3", "vs4","vs5","vs32", "vs33", "vs34", "vs35", "vs36", - "vs37", "vs38", "vs39", "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49", "vs50", "vs51" + "vs37", "vs38", "vs39", "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49", "vs6", "vs7" ); return index; diff --git a/kernel/power/lock.c b/kernel/power/lock.c index 51348d63c..1c1b006b0 100644 --- a/kernel/power/lock.c +++ b/kernel/power/lock.c @@ -46,10 +46,10 @@ static void __inline blas_lock(volatile BLASULONG *address){ " .machine \"any\" ;" "0: lwarx %0,0, %1 ;" " cmpwi 0,%0,0;" - " bne 1f;" + " bne one%=;" " stwcx. %2,0, %1 ;" " bne- 0b;" - "1: " + "one%=: " : "=&r"(ret) : "r"(address), "r" (val) : "cr0", "memory"); diff --git a/kernel/power/sasum_microk_power8.c b/kernel/power/sasum_microk_power8.c index 4bb515de8..aa465c38e 100644 --- a/kernel/power/sasum_microk_power8.c +++ b/kernel/power/sasum_microk_power8.c @@ -68,10 +68,10 @@ static float sasum_kernel_32 (long n, float *x) "addi %2, %2, 128 \n\t" "addic. 
%1, %1, -32 \n\t" - "ble 2f \n\t" + "ble two%= \n\t" - ".p2align 5 \n" - "1: \n\t" + ".align 5 \n" + "one%=: \n\t" "xvabssp 48, 40 \n\t" "xvabssp 49, 41 \n\t" @@ -108,9 +108,9 @@ static float sasum_kernel_32 (long n, float *x) "xvaddsp 38, 38, %x5 \n\t" "xvaddsp 39, 39, %x6 \n\t" - "bgt 1b \n" + "bgt one%= \n" - "2: \n\t" + "two%=: \n\t" "xvabssp 48, 40 \n\t" "xvabssp 49, 41 \n\t" diff --git a/kernel/power/scopy_microk_power8.c b/kernel/power/scopy_microk_power8.c index 7a54d5e1e..da39789b1 100644 --- a/kernel/power/scopy_microk_power8.c +++ b/kernel/power/scopy_microk_power8.c @@ -51,10 +51,10 @@ static void scopy_kernel_32 (long n, float *x, float *y) "addi %2, %2, 128 \n\t" "addic. %1, %1, -32 \n\t" - "ble 2f \n\t" + "ble two%= \n\t" - ".p2align 5 \n" - "1: \n\t" + ".align 5 \n" + "one%=: \n\t" "stxvd2x 40, 0, %3 \n\t" "stxvd2x 41, %5, %3 \n\t" @@ -77,9 +77,9 @@ static void scopy_kernel_32 (long n, float *x, float *y) "addi %2, %2, 128 \n\t" "addic. %1, %1, -32 \n\t" - "bgt 1b \n" + "bgt one%= \n" - "2: \n\t" + "two%=: \n\t" "stxvd2x 40, 0, %3 \n\t" "stxvd2x 41, %5, %3 \n\t" diff --git a/kernel/power/sdot_microk_power8.c b/kernel/power/sdot_microk_power8.c index bfe100c8b..a8db6a8d6 100644 --- a/kernel/power/sdot_microk_power8.c +++ b/kernel/power/sdot_microk_power8.c @@ -78,10 +78,10 @@ static float sdot_kernel_16 (long n, float *x, float *y) "addi %3, %3, 128 \n\t" "addic. %1, %1, -32 \n\t" - "ble 2f \n\t" + "ble two%= \n\t" - ".p2align 5 \n" - "1: \n\t" + ".align 5 \n" + "one%=: \n\t" "xvmaddasp 32, 40, 48 \n\t" "lxvd2x 40, 0, %2 \n\t" @@ -112,9 +112,9 @@ static float sdot_kernel_16 (long n, float *x, float *y) "addi %3, %3, 128 \n\t" "addic. %1, %1, -32 \n\t" - "bgt 1b \n" + "bgt one%= \n" - "2: \n\t" + "two%=: \n\t" "xvmaddasp 32, 40, 48 \n\t" "xvmaddasp 33, 41, 49 \n\t" diff --git a/kernel/power/sgemm_macros_16x8_power8.S b/kernel/power/sgemm_macros_16x8_power8.S index 98414857f..9bcfca827 100644 --- a/kernel/power/sgemm_macros_16x8_power8.S +++ b/kernel/power/sgemm_macros_16x8_power8.S @@ -38,7 +38,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * Macros for N=8 and M=16 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD8x16_1', ` +#else .macro LOAD8x16_1 +#endif lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO @@ -63,9 +67,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 128 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x16_I1', ` +#else .macro KERNEL8x16_I1 +#endif lxvw4x vs4, o0, AO @@ -133,9 +145,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs63, vs3, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x16_1', ` +#else .macro KERNEL8x16_1 +#endif lxvw4x vs4, o0, AO @@ -203,9 +223,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs63, vs3, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x16_2', ` +#else .macro KERNEL8x16_2 +#endif lxvw4x vs0, o0, AO @@ -273,9 +301,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs63, vs7, vs23 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x16_E2', ` +#else .macro KERNEL8x16_E2 +#endif xvmaddasp vs32, vs4, vs16 @@ -319,9 +355,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmaddasp vs63, vs7, vs23 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x16_SUBI1', ` +#else .macro KERNEL8x16_SUBI1 +#endif lxvw4x vs0, o0, AO @@ -389,9 +433,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs63, vs3, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x16_SUB1', ` +#else .macro KERNEL8x16_SUB1 +#endif lxvw4x vs0, o0, AO @@ -459,9 +511,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs63, vs3, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE8x16', ` +#else .macro SAVE8x16 +#endif mr T1, CO @@ -698,14 +758,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 64 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=8 and M=8 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD8x8_1', ` +#else .macro LOAD8x8_1 +#endif lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO @@ -728,9 +796,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 128 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x8_I1', ` +#else .macro KERNEL8x8_I1 +#endif lxvw4x vs4, o0, AO @@ -780,9 +856,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs47, vs1, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x8_1', ` +#else .macro KERNEL8x8_1 +#endif lxvw4x vs4, o0, AO @@ -832,9 +916,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs47, vs1, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x8_2', ` +#else .macro KERNEL8x8_2 +#endif lxvw4x vs0, o0, AO @@ -884,9 +976,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs47, vs5, vs23 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x8_E2', ` +#else .macro KERNEL8x8_E2 +#endif xvmaddasp vs32, vs4, vs16 @@ -914,9 +1014,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs47, vs5, vs23 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x8_SUBI1', ` +#else .macro KERNEL8x8_SUBI1 +#endif lxvw4x vs0, o0, AO @@ -966,9 +1074,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs47, vs1, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x8_SUB1', ` +#else .macro KERNEL8x8_SUB1 +#endif lxvw4x vs0, o0, AO @@ -1018,9 +1134,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs47, vs1, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE8x8', ` +#else .macro SAVE8x8 +#endif mr T1, CO @@ -1193,14 +1317,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi CO, CO, 32 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=8 and M=4 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD8x4_1', ` +#else .macro LOAD8x4_1 +#endif lxvw4x vs0, o0, AO @@ -1222,9 +1354,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 128 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x4_I1', ` +#else .macro KERNEL8x4_I1 +#endif lxvw4x vs4, o0, AO @@ -1265,9 +1405,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs39, vs0, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x4_1', ` +#else .macro KERNEL8x4_1 +#endif lxvw4x vs4, o0, AO @@ -1308,9 +1456,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs0, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x4_2', ` +#else .macro KERNEL8x4_2 +#endif lxvw4x vs0, o0, AO @@ -1351,9 +1507,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs4, vs23 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x4_E2', ` +#else .macro KERNEL8x4_E2 +#endif xvmaddasp vs32, vs4, vs16 @@ -1373,9 +1537,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs4, vs23 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x4_SUBI1', ` +#else .macro KERNEL8x4_SUBI1 +#endif lxvw4x vs0, o0, AO @@ -1416,9 +1588,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs39, vs0, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x4_SUB1', ` +#else .macro KERNEL8x4_SUB1 +#endif lxvw4x vs0, o0, AO @@ -1459,9 +1639,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs0, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE8x4', ` +#else .macro SAVE8x4 +#endif mr T1, CO @@ -1602,14 +1790,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 16 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=8 and M=2 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD8x2_1', ` +#else .macro LOAD8x2_1 +#endif lxsspx vs0, o0, AO lxsspx vs1, o4, AO @@ -1633,9 +1829,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 128 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x2_I1', ` +#else .macro KERNEL8x2_I1 +#endif lxsspx vs4, o0, AO @@ -1686,9 +1890,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs47, vs1, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x2_1', ` +#else .macro KERNEL8x2_1 +#endif lxsspx vs4, o0, AO @@ -1739,9 +1951,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xsmaddadp vs47, vs1, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x2_2', ` +#else .macro KERNEL8x2_2 +#endif lxsspx vs0, o0, AO @@ -1792,9 +2012,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs47, vs5, vs23 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x2_E2', ` +#else .macro KERNEL8x2_E2 +#endif xsmaddadp vs32, vs4, vs16 @@ -1822,9 +2050,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs47, vs5, vs23 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x2_SUBI1', ` +#else .macro KERNEL8x2_SUBI1 +#endif lxsspx vs0, o0, AO @@ -1875,9 +2111,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs47, vs1, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x2_SUB1', ` +#else .macro KERNEL8x2_SUB1 +#endif lxsspx vs0, o0, AO @@ -1928,9 +2172,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs47, vs1, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE8x2', ` +#else .macro SAVE8x2 +#endif mr T1, CO @@ -2103,14 +2355,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 8 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=8 and M=1 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD8x1_1', ` +#else .macro LOAD8x1_1 +#endif lxsspx vs0, o0, AO @@ -2133,9 +2393,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 128 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x1_I1', ` +#else .macro KERNEL8x1_I1 +#endif lxsspx vs4, o0, AO @@ -2177,9 +2445,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs39, vs0, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x1_1', ` +#else .macro KERNEL8x1_1 +#endif lxsspx vs4, o0, AO @@ -2221,9 +2497,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs39, vs0, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x1_2', ` +#else .macro KERNEL8x1_2 +#endif lxsspx vs0, o0, AO @@ -2265,9 +2549,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs39, vs4, vs23 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x1_E2', ` +#else .macro KERNEL8x1_E2 +#endif xsmaddadp vs32, vs4, vs16 @@ -2287,9 +2579,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs39, vs4, vs23 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x1_SUBI1', ` +#else .macro KERNEL8x1_SUBI1 +#endif lxsspx vs0, o0, AO @@ -2331,9 +2631,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs39, vs0, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x1_SUB1', ` +#else .macro KERNEL8x1_SUB1 +#endif lxsspx vs0, o0, AO @@ -2375,9 +2683,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xsmaddadp vs39, vs0, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE8x1', ` +#else .macro SAVE8x1 +#endif mr T1, CO @@ -2518,14 +2834,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 4 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=16 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD4x16_1', ` +#else .macro LOAD4x16_1 +#endif lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO @@ -2543,9 +2867,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 64 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x16_I1', ` +#else .macro KERNEL4x16_I1 +#endif lxvw4x vs4, o0, AO @@ -2586,9 +2918,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs47, vs3, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x16_1', ` +#else .macro KERNEL4x16_1 +#endif lxvw4x vs4, o0, AO @@ -2629,9 +2969,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs47, vs3, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x16_2', ` +#else .macro KERNEL4x16_2 +#endif lxvw4x vs0, o0, AO @@ -2672,9 +3020,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs47, vs7, vs19 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x16_E2', ` +#else .macro KERNEL4x16_E2 +#endif xvmaddasp vs32, vs4, vs16 @@ -2698,9 +3054,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs47, vs7, vs19 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x16_SUBI1', ` +#else .macro KERNEL4x16_SUBI1 +#endif lxvw4x vs0, o0, AO @@ -2741,9 +3105,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs47, vs3, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x16_SUB1', ` +#else .macro KERNEL4x16_SUB1 +#endif lxvw4x vs0, o0, AO @@ -2784,9 +3156,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs47, vs3, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE4x16', ` +#else .macro SAVE4x16 +#endif mr T1, CO @@ -2907,14 +3287,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 64 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=8 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD4x8_1', ` +#else .macro LOAD4x8_1 +#endif lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO @@ -2930,9 +3318,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 64 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x8_I1', ` +#else .macro KERNEL4x8_I1 +#endif lxvw4x vs4, o0, AO @@ -2963,9 +3359,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmulsp vs39, vs1, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x8_1', ` +#else .macro KERNEL4x8_1 +#endif lxvw4x vs4, o0, AO @@ -2996,9 +3400,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs1, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x8_2', ` +#else .macro KERNEL4x8_2 +#endif lxvw4x vs0, o0, AO @@ -3029,9 +3441,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs5, vs19 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x8_E2', ` +#else .macro KERNEL4x8_E2 +#endif xvmaddasp vs32, vs4, vs16 @@ -3047,9 +3467,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs5, vs19 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x8_SUBI1', ` +#else .macro KERNEL4x8_SUBI1 +#endif lxvw4x vs0, o0, AO @@ -3080,9 +3508,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs39, vs1, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x8_SUB1', ` +#else .macro KERNEL4x8_SUB1 +#endif lxvw4x vs0, o0, AO @@ -3113,9 +3549,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs1, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE4x8', ` +#else .macro SAVE4x8 +#endif mr T1, CO @@ -3204,14 +3648,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 32 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=4 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD4x4_1', ` +#else .macro LOAD4x4_1 +#endif lxvw4x vs0, o0, AO @@ -3226,9 +3678,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 64 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x4_I1', ` +#else .macro KERNEL4x4_I1 +#endif lxvw4x vs4, o0, AO @@ -3254,9 +3714,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs35, vs0, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x4_1', ` +#else .macro KERNEL4x4_1 +#endif lxvw4x vs4, o0, AO @@ -3282,9 +3750,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs0, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x4_2', ` +#else .macro KERNEL4x4_2 +#endif lxvw4x vs0, o0, AO @@ -3310,9 +3786,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs4, vs19 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x4_E2', ` +#else .macro KERNEL4x4_E2 +#endif xvmaddasp vs32, vs4, vs16 @@ -3324,9 +3808,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs4, vs19 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x4_SUBI1', ` +#else .macro KERNEL4x4_SUBI1 +#endif lxvw4x vs0, o0, AO @@ -3352,9 +3844,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmulsp vs35, vs0, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x4_SUB1', ` +#else .macro KERNEL4x4_SUB1 +#endif lxvw4x vs0, o0, AO @@ -3380,9 +3880,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs0, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE4x4', ` +#else .macro SAVE4x4 +#endif mr T1, CO @@ -3455,14 +3963,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 16 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=2 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD4x2_1', ` +#else .macro LOAD4x2_1 +#endif lxsspx vs0, o0, AO lxsspx vs1, o4, AO @@ -3479,9 +3995,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 64 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x2_I1', ` +#else .macro KERNEL4x2_I1 +#endif lxsspx vs4, o0, AO @@ -3513,9 +4037,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs39, vs1, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x2_1', ` +#else .macro KERNEL4x2_1 +#endif lxsspx vs4, o0, AO @@ -3547,9 +4079,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs39, vs1, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x2_2', ` +#else .macro KERNEL4x2_2 +#endif lxsspx vs0, o0, AO @@ -3581,9 +4121,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs39, vs5, vs19 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x2_E2', ` +#else .macro KERNEL4x2_E2 +#endif xsmaddadp vs32, vs4, vs16 @@ -3599,9 +4147,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs39, vs5, vs19 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x2_SUBI1', ` +#else .macro KERNEL4x2_SUBI1 +#endif lxsspx vs0, o0, AO @@ -3633,9 +4189,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs39, vs1, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x2_SUB1', ` +#else .macro KERNEL4x2_SUB1 +#endif lxsspx vs0, o0, AO @@ -3667,9 +4231,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs39, vs1, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE4x2', ` +#else .macro SAVE4x2 +#endif mr T1, CO @@ -3758,14 +4330,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 8 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=1 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD4x1_1', ` +#else .macro LOAD4x1_1 +#endif lxsspx vs0, o0, AO @@ -3781,9 +4361,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 64 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x1_I1', ` +#else .macro KERNEL4x1_I1 +#endif lxsspx vs4, o0, AO @@ -3810,9 +4398,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xsmuldp vs35, vs0, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x1_1', ` +#else .macro KERNEL4x1_1 +#endif lxsspx vs4, o0, AO @@ -3839,9 +4435,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs35, vs0, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x1_2', ` +#else .macro KERNEL4x1_2 +#endif lxsspx vs0, o0, AO @@ -3868,9 +4472,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs35, vs4, vs19 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x1_E2', ` +#else .macro KERNEL4x1_E2 +#endif xsmaddadp vs32, vs4, vs16 @@ -3882,9 +4494,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs35, vs4, vs19 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x1_SUBI1', ` +#else .macro KERNEL4x1_SUBI1 +#endif lxsspx vs0, o0, AO @@ -3911,9 +4531,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs35, vs0, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x1_SUB1', ` +#else .macro KERNEL4x1_SUB1 +#endif lxsspx vs0, o0, AO @@ -3940,9 +4568,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs35, vs0, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE4x1', ` +#else .macro SAVE4x1 +#endif mr T1, CO @@ -4015,14 +4651,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 4 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=16 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD2x16_1', ` +#else .macro LOAD2x16_1 +#endif lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO @@ -4038,9 +4682,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x16_I1', ` +#else .macro KERNEL2x16_I1 +#endif lxvw4x vs4, o0, AO @@ -4069,9 +4721,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs39, vs3, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x16_1', ` +#else .macro KERNEL2x16_1 +#endif lxvw4x vs4, o0, AO @@ -4100,9 +4760,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs3, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x16_2', ` +#else .macro KERNEL2x16_2 +#endif lxvw4x vs0, o0, AO @@ -4131,9 +4799,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs7, vs17 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x16_E2', ` +#else .macro KERNEL2x16_E2 +#endif xvmaddasp vs32, vs4, vs16 @@ -4147,9 +4823,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs7, vs17 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x16_SUBI1', ` +#else .macro KERNEL2x16_SUBI1 +#endif lxvw4x vs0, o0, AO @@ -4178,9 +4862,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmulsp vs39, vs3, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x16_SUB1', ` +#else .macro KERNEL2x16_SUB1 +#endif lxvw4x vs0, o0, AO @@ -4209,9 +4901,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs3, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE2x16', ` +#else .macro SAVE2x16 +#endif mr T1, CO @@ -4274,14 +4974,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 64 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=8 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD2x8_1', ` +#else .macro LOAD2x8_1 +#endif lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO @@ -4295,9 +5003,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_I1', ` +#else .macro KERNEL2x8_I1 +#endif lxvw4x vs4, o0, AO @@ -4320,9 +5036,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs35, vs1, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_1', ` +#else .macro KERNEL2x8_1 +#endif lxvw4x vs4, o0, AO @@ -4345,9 +5069,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs1, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_2', ` +#else .macro KERNEL2x8_2 +#endif lxvw4x vs0, o0, AO @@ -4370,9 +5102,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs5, vs17 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_E2', ` +#else .macro KERNEL2x8_E2 +#endif xvmaddasp vs32, vs4, vs16 @@ -4382,9 +5122,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs5, vs17 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_SUBI1', ` +#else .macro KERNEL2x8_SUBI1 +#endif lxvw4x vs0, o0, AO @@ -4407,9 +5155,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs35, vs1, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_SUB1', ` +#else .macro KERNEL2x8_SUB1 +#endif lxvw4x vs0, o0, AO @@ -4432,9 +5188,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs1, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE2x8', ` +#else .macro SAVE2x8 +#endif mr T1, CO @@ -4481,14 +5245,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 32 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=4 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD2x4_1', ` +#else .macro LOAD2x4_1 +#endif lxvw4x vs0, o0, AO @@ -4501,9 +5273,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_I1', ` +#else .macro KERNEL2x4_I1 +#endif lxvw4x vs4, o0, AO @@ -4523,9 +5303,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmulsp vs33, vs0, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_1', ` +#else .macro KERNEL2x4_1 +#endif lxvw4x vs4, o0, AO @@ -4545,9 +5333,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs33, vs0, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_2', ` +#else .macro KERNEL2x4_2 +#endif lxvw4x vs0, o0, AO @@ -4567,9 +5363,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs33, vs4, vs17 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_E2', ` +#else .macro KERNEL2x4_E2 +#endif xvmaddasp vs32, vs4, vs16 @@ -4577,9 +5381,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs33, vs4, vs17 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_SUBI1', ` +#else .macro KERNEL2x4_SUBI1 +#endif lxvw4x vs0, o0, AO @@ -4599,9 +5411,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs33, vs0, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_SUB1', ` +#else .macro KERNEL2x4_SUB1 +#endif lxvw4x vs0, o0, AO @@ -4621,9 +5441,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs33, vs0, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE2x4', ` +#else .macro SAVE2x4 +#endif mr T1, CO @@ -4662,14 +5490,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 16 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=2 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD2x2_1', ` +#else .macro LOAD2x2_1 +#endif lxsspx vs0, o0, AO lxsspx vs1, o4, AO @@ -4684,9 +5520,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_I1', ` +#else .macro KERNEL2x2_I1 +#endif lxsspx vs4, o0, AO @@ -4710,9 +5554,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs35, vs1, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_1', ` +#else .macro KERNEL2x2_1 +#endif lxsspx vs4, o0, AO @@ -4736,9 +5588,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs35, vs1, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_2', ` +#else .macro KERNEL2x2_2 +#endif lxsspx vs0, o0, AO @@ -4762,9 +5622,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs35, vs5, vs17 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_E2', ` +#else .macro KERNEL2x2_E2 +#endif xsmaddadp vs32, vs4, vs16 @@ -4774,9 +5642,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs35, vs5, vs17 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_SUBI1', ` +#else .macro KERNEL2x2_SUBI1 +#endif lxsspx vs0, o0, AO @@ -4800,9 +5676,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xsmuldp vs35, vs1, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_SUB1', ` +#else .macro KERNEL2x2_SUB1 +#endif lxsspx vs0, o0, AO @@ -4826,9 +5710,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs35, vs1, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE2x2', ` +#else .macro SAVE2x2 +#endif mr T1, CO @@ -4875,14 +5767,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 8 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=1 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD2x1_1', ` +#else .macro LOAD2x1_1 +#endif lxsspx vs0, o0, AO @@ -4896,9 +5796,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_I1', ` +#else .macro KERNEL2x1_I1 +#endif lxsspx vs4, o0, AO @@ -4919,9 +5827,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs33, vs0, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_1', ` +#else .macro KERNEL2x1_1 +#endif lxsspx vs4, o0, AO @@ -4942,9 +5858,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs33, vs0, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_2', ` +#else .macro KERNEL2x1_2 +#endif lxsspx vs0, o0, AO @@ -4965,9 +5889,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs33, vs4, vs17 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_E2', ` +#else .macro KERNEL2x1_E2 +#endif xsmaddadp vs32, vs4, vs16 @@ -4975,9 +5907,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs33, vs4, vs17 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_SUBI1', ` +#else .macro KERNEL2x1_SUBI1 +#endif lxsspx vs0, o0, AO @@ -4998,9 +5938,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs33, vs0, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_SUB1', ` +#else .macro KERNEL2x1_SUB1 +#endif lxsspx vs0, o0, AO @@ -5021,9 +5969,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs33, vs0, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE2x1', ` +#else .macro SAVE2x1 +#endif mr T1, CO @@ -5062,14 +6018,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 4 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=16 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD1x16_1', ` +#else .macro LOAD1x16_1 +#endif lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO @@ -5084,9 +6048,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x16_I1', ` +#else .macro KERNEL1x16_I1 +#endif lxvw4x vs4, o0, AO @@ -5109,9 +6081,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmulsp vs35, vs3, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x16_1', ` +#else .macro KERNEL1x16_1 +#endif lxvw4x vs4, o0, AO @@ -5134,9 +6114,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs3, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x16_2', ` +#else .macro KERNEL1x16_2 +#endif lxvw4x vs0, o0, AO @@ -5159,9 +6147,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs7, vs16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x16_E2', ` +#else .macro KERNEL1x16_E2 +#endif xvmaddasp vs32, vs4, vs16 @@ -5170,9 +6166,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs7, vs16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x16_SUBI1', ` +#else .macro KERNEL1x16_SUBI1 +#endif lxvw4x vs0, o0, AO @@ -5195,9 +6199,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs35, vs3, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x16_SUB1', ` +#else .macro KERNEL1x16_SUB1 +#endif lxvw4x vs0, o0, AO @@ -5220,9 +6232,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs3, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE1x16', ` +#else .macro SAVE1x16 +#endif mr T1, CO @@ -5256,14 +6276,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 64 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=8 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD1x8_1', ` +#else .macro LOAD1x8_1 +#endif lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO @@ -5276,9 +6304,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_I1', ` +#else .macro KERNEL1x8_I1 +#endif lxvw4x vs4, o0, AO @@ -5297,9 +6333,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs33, vs1, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_1', ` +#else .macro KERNEL1x8_1 +#endif lxvw4x vs4, o0, AO @@ -5318,9 +6362,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs33, vs1, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_2', ` +#else .macro KERNEL1x8_2 +#endif lxvw4x vs0, o0, AO @@ -5339,18 +6391,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs33, vs5, vs16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_E2', ` +#else .macro KERNEL1x8_E2 +#endif xvmaddasp vs32, vs4, vs16 xvmaddasp vs33, vs5, vs16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_SUBI1', ` +#else .macro KERNEL1x8_SUBI1 +#endif lxvw4x vs0, o0, AO @@ -5369,9 +6437,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmulsp vs33, vs1, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_SUB1', ` +#else .macro KERNEL1x8_SUB1 +#endif lxvw4x vs0, o0, AO @@ -5390,9 +6466,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs33, vs1, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE1x8', ` +#else .macro SAVE1x8 +#endif mr T1, CO @@ -5418,14 +6502,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 32 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=4 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD1x4_1', ` +#else .macro LOAD1x4_1 +#endif lxvw4x vs0, o0, AO @@ -5437,9 +6529,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_I1', ` +#else .macro KERNEL1x4_I1 +#endif lxvw4x vs4, o0, AO @@ -5456,9 +6556,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs32, vs0, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_1', ` +#else .macro KERNEL1x4_1 +#endif lxvw4x vs4, o0, AO @@ -5475,9 +6583,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs32, vs0, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_2', ` +#else .macro KERNEL1x4_2 +#endif lxvw4x vs0, o0, AO @@ -5494,17 +6610,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs32, vs4, vs16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_E2', ` +#else .macro KERNEL1x4_E2 +#endif xvmaddasp vs32, vs4, vs16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_SUBI1', ` +#else .macro KERNEL1x4_SUBI1 +#endif lxvw4x vs0, o0, AO @@ -5521,9 +6653,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs32, vs0, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_SUB1', ` +#else .macro KERNEL1x4_SUB1 +#endif lxvw4x vs0, o0, AO @@ -5540,9 +6680,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs32, vs0, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE1x4', ` +#else .macro SAVE1x4 +#endif mr T1, CO @@ -5564,14 +6712,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 16 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=2 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD1x2_1', ` +#else .macro LOAD1x2_1 +#endif lxsspx vs0, o0, AO lxsspx vs1, o4, AO @@ -5585,9 +6741,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_I1', ` +#else .macro KERNEL1x2_I1 +#endif lxsspx vs4, o0, AO @@ -5607,9 +6771,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xsmuldp vs33, vs1, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_1', ` +#else .macro KERNEL1x2_1 +#endif lxsspx vs4, o0, AO @@ -5629,9 +6801,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs33, vs1, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_2', ` +#else .macro KERNEL1x2_2 +#endif lxsspx vs0, o0, AO @@ -5651,18 +6831,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs33, vs5, vs16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_E2', ` +#else .macro KERNEL1x2_E2 +#endif xsmaddadp vs32, vs4, vs16 xsmaddadp vs33, vs5, vs16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_SUBI1', ` +#else .macro KERNEL1x2_SUBI1 +#endif lxsspx vs0, o0, AO @@ -5682,9 +6878,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs33, vs1, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_SUB1', ` +#else .macro KERNEL1x2_SUB1 +#endif lxsspx vs0, o0, AO @@ -5704,9 +6908,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs33, vs1, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE1x2', ` +#else .macro SAVE1x2 +#endif mr T1, CO @@ -5732,14 +6944,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 8 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=1 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD1x1_1', ` +#else .macro LOAD1x1_1 +#endif lxsspx vs0, o0, AO @@ -5752,9 +6972,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_I1', ` +#else .macro KERNEL1x1_I1 +#endif lxsspx vs4, o0, AO @@ -5772,9 +7000,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs32, vs0, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_1', ` +#else .macro KERNEL1x1_1 +#endif lxsspx vs4, o0, AO @@ -5792,9 +7028,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs32, vs0, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_2', ` +#else .macro KERNEL1x1_2 +#endif lxsspx vs0, o0, AO @@ -5812,17 +7056,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs32, vs4, vs16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_E2', ` +#else .macro KERNEL1x1_E2 +#endif xsmaddadp vs32, vs4, vs16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_SUBI1', ` +#else .macro KERNEL1x1_SUBI1 +#endif lxsspx vs0, o0, AO @@ -5840,9 +7100,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs32, vs0, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_SUB1', ` +#else .macro KERNEL1x1_SUB1 +#endif lxsspx vs0, o0, AO @@ -5860,9 +7128,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xsmaddadp vs32, vs0, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE1x1', ` +#else .macro SAVE1x1 +#endif mr T1, CO @@ -5884,13 +7160,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 4 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`COPYB_4x8', ` +#else .macro COPYB_4x8 +#endif lxvw4x vs5, o0, BO @@ -5993,10 +7277,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvw4x vs54, o48, BBO addi BBO, BBO, 64 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`COPYB_1x8', ` +#else .macro COPYB_1x8 +#endif lxvw4x vs5, o0, BO @@ -6026,5 +7318,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvw4x vs14, o48, BBO addi BBO, BBO, 64 +#if defined(_AIX) +') +#else .endm +#endif diff --git a/kernel/power/sgemm_tcopy_macros_16_power8.S b/kernel/power/sgemm_tcopy_macros_16_power8.S index 53f9c8b82..ed592a604 100644 --- a/kernel/power/sgemm_tcopy_macros_16_power8.S +++ b/kernel/power/sgemm_tcopy_macros_16_power8.S @@ -38,7 +38,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * Macros for N=4 and M=16 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_4x16', ` +#else .macro COPY_4x16 +#endif lxvw4x vs32, o0, A0 lxvw4x vs33, o16, A0 @@ -88,13 +92,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvw4x vs46, o32, T1 stxvw4x vs47, o48, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=8 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_4x8', ` +#else .macro COPY_4x8 +#endif lxvw4x vs32, o0, A0 lxvw4x vs33, o16, A0 @@ -124,13 +136,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvw4x vs38, o32, T1 stxvw4x vs39, o48, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=4 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_4x4', ` +#else .macro COPY_4x4 +#endif lxvw4x vs32, o0, A0 @@ -150,13 +170,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvw4x vs35, o48, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=2 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_4x2', ` +#else .macro COPY_4x2 +#endif lxsspx vs32, o0, A0 lxsspx vs33, o4, A0 @@ -190,13 +218,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxsspx vs38, o0, T1 stxsspx vs39, o4, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=1 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_4x1', ` +#else .macro COPY_4x1 +#endif lxsspx vs32, o0, A0 @@ -218,13 +254,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
stxsspx vs35, o4, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=16 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_2x16', ` +#else .macro COPY_2x16 +#endif lxvw4x vs32, o0, A0 lxvw4x vs33, o16, A0 @@ -250,13 +294,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvw4x vs38, o32, T1 stxvw4x vs39, o48, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=8 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_2x8', ` +#else .macro COPY_2x8 +#endif lxvw4x vs32, o0, A0 lxvw4x vs33, o16, A0 @@ -272,13 +324,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvw4x vs34, o32, T1 stxvw4x vs35, o48, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=4 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_2x4', ` +#else .macro COPY_2x4 +#endif lxvw4x vs32, o0, A0 @@ -290,13 +350,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvw4x vs33, o16, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=2 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_2x2', ` +#else .macro COPY_2x2 +#endif lxsspx vs32, o0, A0 lxsspx vs33, o4, A0 @@ -314,13 +382,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxsspx vs34, o0, T1 stxsspx vs35, o4, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=1 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_2x1', ` +#else .macro COPY_2x1 +#endif lxsspx vs32, o0, A0 @@ -332,13 +408,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxsspx vs33, o4, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=16 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_1x16', ` +#else .macro COPY_1x16 +#endif lxvw4x vs32, o0, A0 lxvw4x vs33, o16, A0 @@ -352,13 +436,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvw4x vs34, o32, T1 stxvw4x vs35, o48, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=8 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_1x8', ` +#else .macro COPY_1x8 +#endif lxvw4x vs32, o0, A0 lxvw4x vs33, o16, A0 @@ -368,13 +460,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
stxvw4x vs32, o0, T1 stxvw4x vs33, o16, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=4 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_1x4', ` +#else .macro COPY_1x4 +#endif lxvw4x vs32, o0, A0 @@ -382,13 +482,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvw4x vs32, o0, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=2 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_1x2', ` +#else .macro COPY_1x2 +#endif lxsspx vs32, o0, A0 lxsspx vs33, o4, A0 @@ -398,13 +506,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxsspx vs32, o0, T1 stxsspx vs33, o4, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=1 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_1x1', ` +#else .macro COPY_1x1 +#endif lxsspx vs32, o0, A0 @@ -412,5 +528,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxsspx vs32, o0, T1 +#if defined(_AIX) +') +#else .endm +#endif diff --git a/kernel/power/sgemm_tcopy_macros_8_power8.S b/kernel/power/sgemm_tcopy_macros_8_power8.S index 1b71d5bb3..f80f095dc 100644 --- a/kernel/power/sgemm_tcopy_macros_8_power8.S +++ b/kernel/power/sgemm_tcopy_macros_8_power8.S @@ -38,7 +38,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * Macros for N=4 and M=8 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_4x8', ` +#else .macro COPY_4x8 +#endif lxvw4x vs32, o0, A0 lxvw4x vs33, o16, A0 @@ -68,13 +72,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvw4x vs38, o32, T1 stxvw4x vs39, o48, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=4 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_4x4', ` +#else .macro COPY_4x4 +#endif lxvw4x vs32, o0, A0 @@ -94,13 +106,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvw4x vs35, o48, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=2 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_4x2', ` +#else .macro COPY_4x2 +#endif lxsspx vs32, o0, A0 lxsspx vs33, o4, A0 @@ -134,13 +154,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
stxsspx vs38, o0, T1 stxsspx vs39, o4, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=1 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_4x1', ` +#else .macro COPY_4x1 +#endif lxsspx vs32, o0, A0 @@ -162,13 +190,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxsspx vs35, o4, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=8 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_2x8', ` +#else .macro COPY_2x8 +#endif lxvw4x vs32, o0, A0 lxvw4x vs33, o16, A0 @@ -184,13 +220,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvw4x vs34, o32, T1 stxvw4x vs35, o48, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=4 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_2x4', ` +#else .macro COPY_2x4 +#endif lxvw4x vs32, o0, A0 @@ -202,13 +246,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvw4x vs33, o16, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=2 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_2x2', ` +#else .macro COPY_2x2 +#endif lxsspx vs32, o0, A0 lxsspx vs33, o4, A0 @@ -226,13 +278,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxsspx vs34, o0, T1 stxsspx vs35, o4, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=1 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_2x1', ` +#else .macro COPY_2x1 +#endif lxsspx vs32, o0, A0 @@ -244,13 +304,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxsspx vs33, o4, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=8 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_1x8', ` +#else .macro COPY_1x8 +#endif lxvw4x vs32, o0, A0 lxvw4x vs33, o16, A0 @@ -260,13 +328,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvw4x vs32, o0, T1 stxvw4x vs33, o16, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=4 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_1x4', ` +#else .macro COPY_1x4 +#endif lxvw4x vs32, o0, A0 @@ -274,13 +350,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
stxvw4x vs32, o0, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=2 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_1x2', ` +#else .macro COPY_1x2 +#endif lxsspx vs32, o0, A0 lxsspx vs33, o4, A0 @@ -290,13 +374,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxsspx vs32, o0, T1 stxsspx vs33, o4, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=1 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_1x1', ` +#else .macro COPY_1x1 +#endif lxsspx vs32, o0, A0 @@ -304,5 +396,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxsspx vs32, o0, T1 +#if defined(_AIX) +') +#else .endm +#endif diff --git a/kernel/power/srot_microk_power8.c b/kernel/power/srot_microk_power8.c index 6eecb60a1..329a8cd06 100644 --- a/kernel/power/srot_microk_power8.c +++ b/kernel/power/srot_microk_power8.c @@ -71,10 +71,10 @@ static void srot_kernel_16 (long n, float *x, float *y, float c, float s) "addi %4, %4, 64 \n\t" "addic. %2, %2, -16 \n\t" - "ble 2f \n\t" + "ble two%= \n\t" - ".p2align 5 \n" - "1: \n\t" + ".align 5 \n" + "one%=: \n\t" "xvmulsp 40, 32, 36 \n\t" // c * x "xvmulsp 41, 33, 36 \n\t" @@ -138,9 +138,9 @@ static void srot_kernel_16 (long n, float *x, float *y, float c, float s) "addi %4, %4, 128 \n\t" "addic. %2, %2, -16 \n\t" - "bgt 1b \n" + "bgt one%= \n" - "2: \n\t" + "two%=: \n\t" "xvmulsp 40, 32, 36 \n\t" // c * x "xvmulsp 41, 33, 36 \n\t" diff --git a/kernel/power/sscal_microk_power8.c b/kernel/power/sscal_microk_power8.c index 058ff3399..88fba3166 100644 --- a/kernel/power/sscal_microk_power8.c +++ b/kernel/power/sscal_microk_power8.c @@ -56,10 +56,10 @@ static void sscal_kernel_16 (long n, float *x, float alpha) "addi %2, %2, 128 \n\t" "addic. %1, %1, -32 \n\t" - "ble 2f \n\t" + "ble two%= \n\t" - ".p2align 5 \n" - "1: \n\t" + ".align 5 \n" + "one%=: \n\t" "xvmulsp 40, 32, %x3 \n\t" "xvmulsp 41, 33, %x3 \n\t" @@ -92,9 +92,9 @@ static void sscal_kernel_16 (long n, float *x, float alpha) "addi %2, %2, 256 \n\t" "addic. %1, %1, -32 \n\t" - "bgt 1b \n" + "bgt one%= \n" - "2: \n\t" + "two%=: \n\t" "xvmulsp 40, 32, %x3 \n\t" "xvmulsp 41, 33, %x3 \n\t" @@ -147,8 +147,8 @@ static void sscal_kernel_16_zero (long n, float *x) ( "xxlxor %x3, %x3, %x3 \n\t" - ".p2align 5 \n" - "1: \n\t" + ".align 5 \n" + "one%=: \n\t" "stxvd2x %x3, 0, %2 \n\t" "stxvd2x %x3, %4, %2 \n\t" @@ -162,7 +162,7 @@ static void sscal_kernel_16_zero (long n, float *x) "addi %2, %2, 128 \n\t" "addic. %1, %1, -32 \n\t" - "bgt 1b \n" + "bgt one%= \n" "#n=%1 x=%0=%2 t0=%x3 o16=%4 o32=%5 o48=%6 o64=%7 o80=%8 o96=%9 o112=%10" : diff --git a/kernel/power/sswap_microk_power8.c b/kernel/power/sswap_microk_power8.c index cfefdd6ef..a407018a8 100644 --- a/kernel/power/sswap_microk_power8.c +++ b/kernel/power/sswap_microk_power8.c @@ -39,8 +39,8 @@ static void sswap_kernel_32 (long n, float *x, float *y) { __asm__ ( - ".p2align 5 \n" - "1: \n\t" + ".align 5 \n" + "one%=: \n\t" "lxvd2x 32, 0, %4 \n\t" "lxvd2x 33, %5, %4 \n\t" @@ -83,7 +83,7 @@ static void sswap_kernel_32 (long n, float *x, float *y) "addi %4, %4, 128 \n\t" "addic. 
%2, %2, -32 \n\t" - "bgt 1b \n" + "bgt one%= \n" "#n=%2 x=%0=%3 y=%1=%4 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11" : diff --git a/kernel/power/strmm_macros_16x8_power8.S b/kernel/power/strmm_macros_16x8_power8.S index 27bc1e89c..6c016d6fa 100644 --- a/kernel/power/strmm_macros_16x8_power8.S +++ b/kernel/power/strmm_macros_16x8_power8.S @@ -38,7 +38,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * Macros for N=8 and M=16 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD8x16_1', ` +#else .macro LOAD8x16_1 +#endif lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO @@ -63,9 +67,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x16_I1', ` +#else .macro KERNEL8x16_I1 +#endif lxvw4x vs4, o0, AO @@ -133,9 +145,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs63, vs3, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x16_1', ` +#else .macro KERNEL8x16_1 +#endif lxvw4x vs4, o0, AO @@ -203,9 +223,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs63, vs3, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x16_2', ` +#else .macro KERNEL8x16_2 +#endif lxvw4x vs0, o0, AO @@ -273,9 +301,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs63, vs7, vs23 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x16_E2', ` +#else .macro KERNEL8x16_E2 +#endif xvmaddasp vs32, vs4, vs16 @@ -319,9 +355,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs63, vs7, vs23 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x16_SUBI1', ` +#else .macro KERNEL8x16_SUBI1 +#endif lxvw4x vs0, o0, AO @@ -389,9 +433,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs63, vs3, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x16_SUB1', ` +#else .macro KERNEL8x16_SUB1 +#endif lxvw4x vs0, o0, AO @@ -459,9 +511,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs63, vs3, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE8x16', ` +#else .macro SAVE8x16 +#endif mr T1, CO @@ -698,14 +758,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 64 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=8 and M=8 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD8x8_1', ` +#else .macro LOAD8x8_1 +#endif lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO @@ -728,9 +796,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x8_I1', ` +#else .macro KERNEL8x8_I1 +#endif lxvw4x vs4, o0, AO @@ -780,9 +856,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmulsp vs47, vs1, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x8_1', ` +#else .macro KERNEL8x8_1 +#endif lxvw4x vs4, o0, AO @@ -832,9 +916,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs47, vs1, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x8_2', ` +#else .macro KERNEL8x8_2 +#endif lxvw4x vs0, o0, AO @@ -884,9 +976,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs47, vs5, vs23 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x8_E2', ` +#else .macro KERNEL8x8_E2 +#endif xvmaddasp vs32, vs4, vs16 @@ -914,9 +1014,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs47, vs5, vs23 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x8_SUBI1', ` +#else .macro KERNEL8x8_SUBI1 +#endif lxvw4x vs0, o0, AO @@ -966,9 +1074,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs47, vs1, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x8_SUB1', ` +#else .macro KERNEL8x8_SUB1 +#endif lxvw4x vs0, o0, AO @@ -1018,9 +1134,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs47, vs1, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE8x8', ` +#else .macro SAVE8x8 +#endif mr T1, CO @@ -1193,14 +1317,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 32 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=8 and M=4 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD8x4_1', ` +#else .macro LOAD8x4_1 +#endif lxvw4x vs0, o0, AO @@ -1222,9 +1354,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x4_I1', ` +#else .macro KERNEL8x4_I1 +#endif lxvw4x vs4, o0, AO @@ -1265,9 +1405,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs39, vs0, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x4_1', ` +#else .macro KERNEL8x4_1 +#endif lxvw4x vs4, o0, AO @@ -1308,9 +1456,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs0, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x4_2', ` +#else .macro KERNEL8x4_2 +#endif lxvw4x vs0, o0, AO @@ -1351,9 +1507,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs4, vs23 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x4_E2', ` +#else .macro KERNEL8x4_E2 +#endif xvmaddasp vs32, vs4, vs16 @@ -1373,9 +1537,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs4, vs23 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x4_SUBI1', ` +#else .macro KERNEL8x4_SUBI1 +#endif lxvw4x vs0, o0, AO @@ -1416,9 +1588,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmulsp vs39, vs0, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x4_SUB1', ` +#else .macro KERNEL8x4_SUB1 +#endif lxvw4x vs0, o0, AO @@ -1459,9 +1639,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs0, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE8x4', ` +#else .macro SAVE8x4 +#endif mr T1, CO @@ -1602,14 +1790,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 16 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=8 and M=2 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD8x2_1', ` +#else .macro LOAD8x2_1 +#endif lxsspx vs0, o0, AO lxsspx vs1, o4, AO @@ -1632,9 +1828,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x2_I1', ` +#else .macro KERNEL8x2_I1 +#endif lxsspx vs4, o0, AO @@ -1684,9 +1888,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs47, vs1, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x2_1', ` +#else .macro KERNEL8x2_1 +#endif lxsspx vs4, o0, AO @@ -1736,9 +1948,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs47, vs1, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x2_2', ` +#else .macro KERNEL8x2_2 +#endif lxsspx vs0, o0, AO @@ -1788,9 +2008,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs47, vs5, vs23 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x2_E2', ` +#else .macro KERNEL8x2_E2 +#endif xsmaddadp vs32, vs4, vs16 @@ -1818,9 +2046,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs47, vs5, vs23 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x2_SUBI1', ` +#else .macro KERNEL8x2_SUBI1 +#endif lxsspx vs0, o0, AO @@ -1870,9 +2106,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs47, vs1, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x2_SUB1', ` +#else .macro KERNEL8x2_SUB1 +#endif lxsspx vs0, o0, AO @@ -1922,9 +2166,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs47, vs1, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE8x2', ` +#else .macro SAVE8x2 +#endif mr T1, CO @@ -2097,14 +2349,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 8 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=8 and M=1 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD8x1_1', ` +#else .macro LOAD8x1_1 +#endif lxsspx vs0, o0, AO @@ -2126,9 +2386,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x1_I1', ` +#else .macro KERNEL8x1_I1 +#endif lxsspx vs4, o0, AO @@ -2169,9 +2437,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xsmuldp vs39, vs0, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x1_1', ` +#else .macro KERNEL8x1_1 +#endif lxsspx vs4, o0, AO @@ -2212,9 +2488,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs39, vs0, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x1_2', ` +#else .macro KERNEL8x1_2 +#endif lxsspx vs0, o0, AO @@ -2255,9 +2539,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs39, vs4, vs23 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x1_E2', ` +#else .macro KERNEL8x1_E2 +#endif xsmaddadp vs32, vs4, vs16 @@ -2277,9 +2569,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs39, vs4, vs23 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x1_SUBI1', ` +#else .macro KERNEL8x1_SUBI1 +#endif lxsspx vs0, o0, AO @@ -2320,9 +2620,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs39, vs0, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL8x1_SUB1', ` +#else .macro KERNEL8x1_SUB1 +#endif lxsspx vs0, o0, AO @@ -2363,9 +2671,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs39, vs0, vs15 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE8x1', ` +#else .macro SAVE8x1 +#endif mr T1, CO @@ -2506,14 +2822,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 4 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=16 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD4x16_1', ` +#else .macro LOAD4x16_1 +#endif lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO @@ -2531,9 +2855,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x16_I1', ` +#else .macro KERNEL4x16_I1 +#endif lxvw4x vs4, o0, AO @@ -2574,9 +2906,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs47, vs3, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x16_1', ` +#else .macro KERNEL4x16_1 +#endif lxvw4x vs4, o0, AO @@ -2617,9 +2957,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs47, vs3, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x16_2', ` +#else .macro KERNEL4x16_2 +#endif lxvw4x vs0, o0, AO @@ -2660,9 +3008,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs47, vs7, vs19 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x16_E2', ` +#else .macro KERNEL4x16_E2 +#endif xvmaddasp vs32, vs4, vs16 @@ -2686,9 +3042,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs47, vs7, vs19 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x16_SUBI1', ` +#else .macro KERNEL4x16_SUBI1 +#endif lxvw4x vs0, o0, AO @@ -2729,9 +3093,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmulsp vs47, vs3, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x16_SUB1', ` +#else .macro KERNEL4x16_SUB1 +#endif lxvw4x vs0, o0, AO @@ -2772,9 +3144,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs47, vs3, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE4x16', ` +#else .macro SAVE4x16 +#endif mr T1, CO @@ -2895,14 +3275,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 64 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=8 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD4x8_1', ` +#else .macro LOAD4x8_1 +#endif lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO @@ -2918,9 +3306,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x8_I1', ` +#else .macro KERNEL4x8_I1 +#endif lxvw4x vs4, o0, AO @@ -2951,9 +3347,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs39, vs1, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x8_1', ` +#else .macro KERNEL4x8_1 +#endif lxvw4x vs4, o0, AO @@ -2984,9 +3388,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs1, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x8_2', ` +#else .macro KERNEL4x8_2 +#endif lxvw4x vs0, o0, AO @@ -3017,9 +3429,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs5, vs19 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x8_E2', ` +#else .macro KERNEL4x8_E2 +#endif xvmaddasp vs32, vs4, vs16 @@ -3035,9 +3455,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs5, vs19 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x8_SUBI1', ` +#else .macro KERNEL4x8_SUBI1 +#endif lxvw4x vs0, o0, AO @@ -3068,9 +3496,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs39, vs1, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x8_SUB1', ` +#else .macro KERNEL4x8_SUB1 +#endif lxvw4x vs0, o0, AO @@ -3101,9 +3537,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs1, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE4x8', ` +#else .macro SAVE4x8 +#endif mr T1, CO @@ -3192,14 +3636,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 32 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=4 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD4x4_1', ` +#else .macro LOAD4x4_1 +#endif lxvw4x vs0, o0, AO @@ -3214,9 +3666,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi BO, BO, 16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x4_I1', ` +#else .macro KERNEL4x4_I1 +#endif lxvw4x vs4, o0, AO @@ -3242,9 +3702,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs35, vs0, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x4_1', ` +#else .macro KERNEL4x4_1 +#endif lxvw4x vs4, o0, AO @@ -3270,9 +3738,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs0, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x4_2', ` +#else .macro KERNEL4x4_2 +#endif lxvw4x vs0, o0, AO @@ -3298,9 +3774,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs4, vs19 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x4_E2', ` +#else .macro KERNEL4x4_E2 +#endif xvmaddasp vs32, vs4, vs16 @@ -3312,9 +3796,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs4, vs19 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x4_SUBI1', ` +#else .macro KERNEL4x4_SUBI1 +#endif lxvw4x vs0, o0, AO @@ -3340,9 +3832,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs35, vs0, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x4_SUB1', ` +#else .macro KERNEL4x4_SUB1 +#endif lxvw4x vs0, o0, AO @@ -3368,9 +3868,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs0, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE4x4', ` +#else .macro SAVE4x4 +#endif mr T1, CO @@ -3443,14 +3951,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 16 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=2 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD4x2_1', ` +#else .macro LOAD4x2_1 +#endif lxsspx vs0, o0, AO lxsspx vs1, o4, AO @@ -3466,9 +3982,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x2_I1', ` +#else .macro KERNEL4x2_I1 +#endif lxsspx vs4, o0, AO @@ -3499,9 +4023,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs39, vs1, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x2_1', ` +#else .macro KERNEL4x2_1 +#endif lxsspx vs4, o0, AO @@ -3532,9 +4064,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs39, vs1, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x2_2', ` +#else .macro KERNEL4x2_2 +#endif lxsspx vs0, o0, AO @@ -3565,9 +4105,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs39, vs5, vs19 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x2_E2', ` +#else .macro KERNEL4x2_E2 +#endif xsmaddadp vs32, vs4, vs16 @@ -3583,9 +4131,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xsmaddadp vs39, vs5, vs19 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x2_SUBI1', ` +#else .macro KERNEL4x2_SUBI1 +#endif lxsspx vs0, o0, AO @@ -3616,9 +4172,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs39, vs1, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x2_SUB1', ` +#else .macro KERNEL4x2_SUB1 +#endif lxsspx vs0, o0, AO @@ -3649,9 +4213,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs39, vs1, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE4x2', ` +#else .macro SAVE4x2 +#endif mr T1, CO @@ -3740,14 +4312,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 8 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=1 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD4x1_1', ` +#else .macro LOAD4x1_1 +#endif lxsspx vs0, o0, AO @@ -3762,9 +4342,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x1_I1', ` +#else .macro KERNEL4x1_I1 +#endif lxsspx vs4, o0, AO @@ -3790,9 +4378,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs35, vs0, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x1_1', ` +#else .macro KERNEL4x1_1 +#endif lxsspx vs4, o0, AO @@ -3818,9 +4414,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs35, vs0, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x1_2', ` +#else .macro KERNEL4x1_2 +#endif lxsspx vs0, o0, AO @@ -3846,9 +4450,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs35, vs4, vs19 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x1_E2', ` +#else .macro KERNEL4x1_E2 +#endif xsmaddadp vs32, vs4, vs16 @@ -3860,9 +4472,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs35, vs4, vs19 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x1_SUBI1', ` +#else .macro KERNEL4x1_SUBI1 +#endif lxsspx vs0, o0, AO @@ -3888,9 +4508,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs35, vs0, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL4x1_SUB1', ` +#else .macro KERNEL4x1_SUB1 +#endif lxsspx vs0, o0, AO @@ -3916,9 +4544,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs35, vs0, vs11 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE4x1', ` +#else .macro SAVE4x1 +#endif mr T1, CO @@ -3991,14 +4627,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi CO, CO, 4 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=16 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD2x16_1', ` +#else .macro LOAD2x16_1 +#endif lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO @@ -4014,9 +4658,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x16_I1', ` +#else .macro KERNEL2x16_I1 +#endif lxvw4x vs4, o0, AO @@ -4045,9 +4697,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs39, vs3, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x16_1', ` +#else .macro KERNEL2x16_1 +#endif lxvw4x vs4, o0, AO @@ -4076,9 +4736,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs3, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x16_2', ` +#else .macro KERNEL2x16_2 +#endif lxvw4x vs0, o0, AO @@ -4107,9 +4775,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs7, vs17 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x16_E2', ` +#else .macro KERNEL2x16_E2 +#endif xvmaddasp vs32, vs4, vs16 @@ -4123,9 +4799,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs7, vs17 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x16_SUBI1', ` +#else .macro KERNEL2x16_SUBI1 +#endif lxvw4x vs0, o0, AO @@ -4154,9 +4838,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs39, vs3, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x16_SUB1', ` +#else .macro KERNEL2x16_SUB1 +#endif lxvw4x vs0, o0, AO @@ -4185,9 +4877,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs39, vs3, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE2x16', ` +#else .macro SAVE2x16 +#endif mr T1, CO @@ -4250,14 +4950,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 64 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=8 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD2x8_1', ` +#else .macro LOAD2x8_1 +#endif lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO @@ -4271,9 +4979,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_I1', ` +#else .macro KERNEL2x8_I1 +#endif lxvw4x vs4, o0, AO @@ -4296,9 +5012,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs35, vs1, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_1', ` +#else .macro KERNEL2x8_1 +#endif lxvw4x vs4, o0, AO @@ -4321,9 +5045,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmaddasp vs35, vs1, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_2', ` +#else .macro KERNEL2x8_2 +#endif lxvw4x vs0, o0, AO @@ -4346,9 +5078,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs5, vs17 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_E2', ` +#else .macro KERNEL2x8_E2 +#endif xvmaddasp vs32, vs4, vs16 @@ -4358,9 +5098,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs5, vs17 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_SUBI1', ` +#else .macro KERNEL2x8_SUBI1 +#endif lxvw4x vs0, o0, AO @@ -4383,9 +5131,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs35, vs1, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_SUB1', ` +#else .macro KERNEL2x8_SUB1 +#endif lxvw4x vs0, o0, AO @@ -4408,9 +5164,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs1, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE2x8', ` +#else .macro SAVE2x8 +#endif mr T1, CO @@ -4457,14 +5221,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 32 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=4 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD2x4_1', ` +#else .macro LOAD2x4_1 +#endif lxvw4x vs0, o0, AO @@ -4477,9 +5249,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_I1', ` +#else .macro KERNEL2x4_I1 +#endif lxvw4x vs4, o0, AO @@ -4499,9 +5279,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs33, vs0, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_1', ` +#else .macro KERNEL2x4_1 +#endif lxvw4x vs4, o0, AO @@ -4521,9 +5309,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs33, vs0, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_2', ` +#else .macro KERNEL2x4_2 +#endif lxvw4x vs0, o0, AO @@ -4543,9 +5339,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs33, vs4, vs17 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_E2', ` +#else .macro KERNEL2x4_E2 +#endif xvmaddasp vs32, vs4, vs16 @@ -4553,9 +5357,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs33, vs4, vs17 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_SUBI1', ` +#else .macro KERNEL2x4_SUBI1 +#endif lxvw4x vs0, o0, AO @@ -4575,9 +5387,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs33, vs0, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_SUB1', ` +#else .macro KERNEL2x4_SUB1 +#endif lxvw4x vs0, o0, AO @@ -4597,9 +5417,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmaddasp vs33, vs0, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE2x4', ` +#else .macro SAVE2x4 +#endif mr T1, CO @@ -4638,14 +5466,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 16 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=2 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD2x2_1', ` +#else .macro LOAD2x2_1 +#endif lxsspx vs0, o0, AO lxsspx vs1, o4, AO @@ -4659,9 +5495,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_I1', ` +#else .macro KERNEL2x2_I1 +#endif lxsspx vs4, o0, AO @@ -4684,9 +5528,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs35, vs1, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_1', ` +#else .macro KERNEL2x2_1 +#endif lxsspx vs4, o0, AO @@ -4709,9 +5561,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs35, vs1, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_2', ` +#else .macro KERNEL2x2_2 +#endif lxsspx vs0, o0, AO @@ -4734,9 +5594,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs35, vs5, vs17 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_E2', ` +#else .macro KERNEL2x2_E2 +#endif xsmaddadp vs32, vs4, vs16 @@ -4746,9 +5614,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs35, vs5, vs17 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_SUBI1', ` +#else .macro KERNEL2x2_SUBI1 +#endif lxsspx vs0, o0, AO @@ -4771,9 +5647,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs35, vs1, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_SUB1', ` +#else .macro KERNEL2x2_SUB1 +#endif lxsspx vs0, o0, AO @@ -4796,9 +5680,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs35, vs1, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE2x2', ` +#else .macro SAVE2x2 +#endif mr T1, CO @@ -4845,14 +5737,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 8 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=1 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD2x1_1', ` +#else .macro LOAD2x1_1 +#endif lxsspx vs0, o0, AO @@ -4865,9 +5765,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_I1', ` +#else .macro KERNEL2x1_I1 +#endif lxsspx vs4, o0, AO @@ -4887,9 +5795,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs33, vs0, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_1', ` +#else .macro KERNEL2x1_1 +#endif lxsspx vs4, o0, AO @@ -4909,9 +5825,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xsmaddadp vs33, vs0, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_2', ` +#else .macro KERNEL2x1_2 +#endif lxsspx vs0, o0, AO @@ -4931,9 +5855,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs33, vs4, vs17 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_E2', ` +#else .macro KERNEL2x1_E2 +#endif xsmaddadp vs32, vs4, vs16 @@ -4941,9 +5873,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs33, vs4, vs17 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_SUBI1', ` +#else .macro KERNEL2x1_SUBI1 +#endif lxsspx vs0, o0, AO @@ -4963,9 +5903,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs33, vs0, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_SUB1', ` +#else .macro KERNEL2x1_SUB1 +#endif lxsspx vs0, o0, AO @@ -4985,9 +5933,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs33, vs0, vs9 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE2x1', ` +#else .macro SAVE2x1 +#endif mr T1, CO @@ -5026,14 +5982,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 4 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=16 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD1x16_1', ` +#else .macro LOAD1x16_1 +#endif lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO @@ -5048,9 +6012,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 4 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x16_I1', ` +#else .macro KERNEL1x16_I1 +#endif lxvw4x vs4, o0, AO @@ -5073,9 +6045,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs35, vs3, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x16_1', ` +#else .macro KERNEL1x16_1 +#endif lxvw4x vs4, o0, AO @@ -5098,9 +6078,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs3, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x16_2', ` +#else .macro KERNEL1x16_2 +#endif lxvw4x vs0, o0, AO @@ -5123,9 +6111,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs7, vs16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x16_E2', ` +#else .macro KERNEL1x16_E2 +#endif xvmaddasp vs32, vs4, vs16 @@ -5134,9 +6130,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs35, vs7, vs16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x16_SUBI1', ` +#else .macro KERNEL1x16_SUBI1 +#endif lxvw4x vs0, o0, AO @@ -5159,9 +6163,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs35, vs3, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x16_SUB1', ` +#else .macro KERNEL1x16_SUB1 +#endif lxvw4x vs0, o0, AO @@ -5184,9 +6196,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmaddasp vs35, vs3, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE1x16', ` +#else .macro SAVE1x16 +#endif mr T1, CO @@ -5220,14 +6240,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 64 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=8 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD1x8_1', ` +#else .macro LOAD1x8_1 +#endif lxvw4x vs0, o0, AO lxvw4x vs1, o16, AO @@ -5240,9 +6268,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 4 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_I1', ` +#else .macro KERNEL1x8_I1 +#endif lxvw4x vs4, o0, AO @@ -5261,9 +6297,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs33, vs1, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_1', ` +#else .macro KERNEL1x8_1 +#endif lxvw4x vs4, o0, AO @@ -5282,9 +6326,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs33, vs1, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_2', ` +#else .macro KERNEL1x8_2 +#endif lxvw4x vs0, o0, AO @@ -5303,18 +6355,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs33, vs5, vs16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_E2', ` +#else .macro KERNEL1x8_E2 +#endif xvmaddasp vs32, vs4, vs16 xvmaddasp vs33, vs5, vs16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_SUBI1', ` +#else .macro KERNEL1x8_SUBI1 +#endif lxvw4x vs0, o0, AO @@ -5333,9 +6401,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs33, vs1, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_SUB1', ` +#else .macro KERNEL1x8_SUB1 +#endif lxvw4x vs0, o0, AO @@ -5354,9 +6430,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs33, vs1, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE1x8', ` +#else .macro SAVE1x8 +#endif mr T1, CO @@ -5382,14 +6466,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 32 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=4 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD1x4_1', ` +#else .macro LOAD1x4_1 +#endif lxvw4x vs0, o0, AO @@ -5401,9 +6493,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 4 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_I1', ` +#else .macro KERNEL1x4_I1 +#endif lxvw4x vs4, o0, AO @@ -5420,9 +6520,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs32, vs0, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_1', ` +#else .macro KERNEL1x4_1 +#endif lxvw4x vs4, o0, AO @@ -5439,9 +6547,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmaddasp vs32, vs0, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_2', ` +#else .macro KERNEL1x4_2 +#endif lxvw4x vs0, o0, AO @@ -5458,17 +6574,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs32, vs4, vs16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_E2', ` +#else .macro KERNEL1x4_E2 +#endif xvmaddasp vs32, vs4, vs16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_SUBI1', ` +#else .macro KERNEL1x4_SUBI1 +#endif lxvw4x vs0, o0, AO @@ -5485,9 +6617,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmulsp vs32, vs0, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_SUB1', ` +#else .macro KERNEL1x4_SUB1 +#endif lxvw4x vs0, o0, AO @@ -5504,9 +6644,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddasp vs32, vs0, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE1x4', ` +#else .macro SAVE1x4 +#endif mr T1, CO @@ -5528,14 +6676,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 16 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=2 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD1x2_1', ` +#else .macro LOAD1x2_1 +#endif lxsspx vs0, o0, AO lxsspx vs1, o4, AO @@ -5548,9 +6704,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 4 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_I1', ` +#else .macro KERNEL1x2_I1 +#endif lxsspx vs4, o0, AO @@ -5569,9 +6733,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs33, vs1, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_1', ` +#else .macro KERNEL1x2_1 +#endif lxsspx vs4, o0, AO @@ -5590,9 +6762,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs33, vs1, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_2', ` +#else .macro KERNEL1x2_2 +#endif lxsspx vs0, o0, AO @@ -5611,18 +6791,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs33, vs5, vs16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_E2', ` +#else .macro KERNEL1x2_E2 +#endif xsmaddadp vs32, vs4, vs16 xsmaddadp vs33, vs5, vs16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_SUBI1', ` +#else .macro KERNEL1x2_SUBI1 +#endif lxsspx vs0, o0, AO @@ -5641,9 +6837,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs33, vs1, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_SUB1', ` +#else .macro KERNEL1x2_SUB1 +#endif lxsspx vs0, o0, AO @@ -5662,9 +6866,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs33, vs1, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE1x2', ` +#else .macro SAVE1x2 +#endif mr T1, CO @@ -5690,14 +6902,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
addi CO, CO, 8 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=1 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD1x1_1', ` +#else .macro LOAD1x1_1 +#endif lxsspx vs0, o0, AO @@ -5709,9 +6929,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi BO, BO, 4 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_I1', ` +#else .macro KERNEL1x1_I1 +#endif lxsspx vs4, o0, AO @@ -5728,9 +6956,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs32, vs0, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_1', ` +#else .macro KERNEL1x1_1 +#endif lxsspx vs4, o0, AO @@ -5747,9 +6983,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs32, vs0, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_2', ` +#else .macro KERNEL1x1_2 +#endif lxsspx vs0, o0, AO @@ -5766,17 +7010,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs32, vs4, vs16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_E2', ` +#else .macro KERNEL1x1_E2 +#endif xsmaddadp vs32, vs4, vs16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_SUBI1', ` +#else .macro KERNEL1x1_SUBI1 +#endif lxsspx vs0, o0, AO @@ -5793,9 +7053,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmuldp vs32, vs0, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_SUB1', ` +#else .macro KERNEL1x1_SUB1 +#endif lxsspx vs0, o0, AO @@ -5812,9 +7080,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xsmaddadp vs32, vs0, vs8 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE1x1', ` +#else .macro SAVE1x1 +#endif mr T1, CO @@ -5836,5 +7112,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi CO, CO, 4 +#if defined(_AIX) +') +#else .endm +#endif diff --git a/kernel/power/zasum_microk_power8.c b/kernel/power/zasum_microk_power8.c index 82366902d..3f0af4232 100644 --- a/kernel/power/zasum_microk_power8.c +++ b/kernel/power/zasum_microk_power8.c @@ -68,10 +68,10 @@ static double zasum_kernel_8 (long n, double *x) "addi %2, %2, 128 \n\t" "addic. 
%1, %1, -8 \n\t" - "ble 2f \n\t" + "ble two%= \n\t" - ".p2align 5 \n" - "1: \n\t" + ".align 5 \n" + "one%=: \n\t" "xvabsdp 48, 40 \n\t" "xvabsdp 49, 41 \n\t" @@ -108,9 +108,9 @@ static double zasum_kernel_8 (long n, double *x) "xvadddp 38, 38, %x5 \n\t" "xvadddp 39, 39, %x6 \n\t" - "bgt 1b \n" + "bgt one%= \n" - "2: \n\t" + "two%=: \n\t" "xvabsdp 48, 40 \n\t" "xvabsdp 49, 41 \n\t" @@ -140,7 +140,7 @@ static double zasum_kernel_8 (long n, double *x) "xvadddp 32, 32, 36 \n\t" - "xxswapd 33, 32 \n\t" + XXSWAPD_S(33,32) "xsadddp %x0, 32, 33 \n" "#n=%1 x=%3=%2 sum=%0 o16=%8 o32=%9 o48=%10 o64=%11 o80=%12 o96=%13 o112=%14\n" diff --git a/kernel/power/zaxpy_microk_power8.c b/kernel/power/zaxpy_microk_power8.c index 124614f62..959050e5f 100644 --- a/kernel/power/zaxpy_microk_power8.c +++ b/kernel/power/zaxpy_microk_power8.c @@ -61,8 +61,8 @@ static void zaxpy_kernel_4 (long n, double *x, double *y, __asm__ ( - "xxspltd 32, %x19, 0 \n\t" // alpha_r - "xxspltd 33, %x20, 0 \n\t" // alpha_i + XXSPLTD_S(32,%x19,0) // alpha_r + XXSPLTD_S(33,%x20,0) // alpha_i "lxvd2x 36, 0, %21 \n\t" // mvec @@ -87,10 +87,10 @@ static void zaxpy_kernel_4 (long n, double *x, double *y, "lxvd2x 50, %23, %3 \n\t" // y2 "lxvd2x 51, %24, %3 \n\t" // y3 - "xxswapd %x8, 40 \n\t" // exchange real and imag part - "xxswapd %x9, 41 \n\t" // exchange real and imag part - "xxswapd %x10, 42 \n\t" // exchange real and imag part - "xxswapd %x11, 43 \n\t" // exchange real and imag part + XXSWAPD_S(%x8,40) // exchange real and imag part + XXSWAPD_S(%x9,41) // exchange real and imag part + XXSWAPD_S(%x10,42) // exchange real and imag part + XXSWAPD_S(%x11,43) // exchange real and imag part "addi %2, %2, 64 \n\t" "addi %3, %3, 64 \n\t" @@ -105,19 +105,19 @@ static void zaxpy_kernel_4 (long n, double *x, double *y, "lxvd2x %x6, %23, %3 \n\t" // y6 "lxvd2x %x7, %24, %3 \n\t" // y7 - "xxswapd %x12, 44 \n\t" // exchange real and imag part - "xxswapd %x13, 45 \n\t" // exchange real and imag part - "xxswapd %x14, 46 \n\t" // exchange real and imag part - "xxswapd %x15, 47 \n\t" // exchange real and imag part + XXSWAPD_S(%x12,44) // exchange real and imag part + XXSWAPD_S(%x13,45) // exchange real and imag part + XXSWAPD_S(%x14,46) // exchange real and imag part + XXSWAPD_S(%x15,47) // exchange real and imag part "addi %2, %2, 64 \n\t" "addi %3, %3, 64 \n\t" "addic. 
%1, %1, -8 \n\t" - "ble 2f \n\t" + "ble two%= \n\t" - ".p2align 5 \n" - "1: \n\t" + ".align 5 \n" + "one%=: \n\t" "xvmaddadp 48, 40, 32 \n\t" // alpha_r * x0_r , alpha_r * x0_i "xvmaddadp 49, 41, 32 \n\t" @@ -163,31 +163,31 @@ static void zaxpy_kernel_4 (long n, double *x, double *y, "addi %16, %16, 64 \n\t" - "xxswapd %x8, 40 \n\t" // exchange real and imag part - "xxswapd %x9, 41 \n\t" // exchange real and imag part + XXSWAPD_S(%x8,40) // exchange real and imag part + XXSWAPD_S(%x9,41) // exchange real and imag part "lxvd2x 48, 0, %3 \n\t" // y0 "lxvd2x 49, %22, %3 \n\t" // y1 - "xxswapd %x10, 42 \n\t" // exchange real and imag part - "xxswapd %x11, 43 \n\t" // exchange real and imag part + XXSWAPD_S(%x10,42) // exchange real and imag part + XXSWAPD_S(%x11,43) // exchange real and imag part "lxvd2x 50, %23, %3 \n\t" // y2 "lxvd2x 51, %24, %3 \n\t" // y3 - "xxswapd %x12, 44 \n\t" // exchange real and imag part + XXSWAPD_S(%x12,44) // exchange real and imag part "addi %3, %3, 64 \n\t" - "xxswapd %x13, 45 \n\t" // exchange real and imag part + XXSWAPD_S(%x13,45) // exchange real and imag part "lxvd2x %x4, 0, %3 \n\t" // y4 "lxvd2x %x5, %22, %3 \n\t" // y5 - "xxswapd %x14, 46 \n\t" // exchange real and imag part - "xxswapd %x15, 47 \n\t" // exchange real and imag part + XXSWAPD_S(%x14,46) // exchange real and imag part + XXSWAPD_S(%x15,47) // exchange real and imag part "lxvd2x %x6, %23, %3 \n\t" // y6 "lxvd2x %x7, %24, %3 \n\t" // y7 "addi %3, %3, 64 \n\t" "addic. %1, %1, -8 \n\t" - "bgt 1b \n" + "bgt one%= \n" - "2: \n\t" + "two%=: \n\t" "xvmaddadp 48, 40, 32 \n\t" // alpha_r * x0_r , alpha_r * x0_i "xvmaddadp 49, 41, 32 \n\t" diff --git a/kernel/power/zcopy_microk_power8.c b/kernel/power/zcopy_microk_power8.c index 5ca34b633..e29547047 100644 --- a/kernel/power/zcopy_microk_power8.c +++ b/kernel/power/zcopy_microk_power8.c @@ -62,10 +62,10 @@ static void zcopy_kernel_16 (long n, FLOAT *x, FLOAT *y) "addi %2, %2, 128 \n\t" "addic. %1, %1, -16 \n\t" - "ble 2f \n\t" + "ble two%= \n\t" - ".p2align 5 \n" - "1: \n\t" + ".align 5 \n" + "one%=: \n\t" "stxvd2x 32, 0, %3 \n\t" "stxvd2x 33, %5, %3 \n\t" @@ -108,9 +108,9 @@ static void zcopy_kernel_16 (long n, FLOAT *x, FLOAT *y) "addi %2, %2, 128 \n\t" "addic. %1, %1, -16 \n\t" - "bgt 1b \n" + "bgt one%= \n" - "2: \n\t" + "two%=: \n\t" "stxvd2x 32, 0, %3 \n\t" "stxvd2x 33, %5, %3 \n\t" diff --git a/kernel/power/zdot_microk_power8.c b/kernel/power/zdot_microk_power8.c index 71078b66c..dcde82433 100644 --- a/kernel/power/zdot_microk_power8.c +++ b/kernel/power/zdot_microk_power8.c @@ -60,10 +60,10 @@ static void zdot_kernel_8 (long n, double *x, double *y, double *dot) "lxvd2x 43, %9, %2 \n\t" // x3_r, x3_i "lxvd2x 51, %9, %3 \n\t" // y3_r, y3_i - "xxswapd 0, 48 \n\t" // y0_i, y0_r - "xxswapd 1, 49 \n\t" // y1_i, y1_r - "xxswapd 2, 50 \n\t" // y2_i, y2_r - "xxswapd 3, 51 \n\t" // y3_i, y3_r + XXSWAPD_S(0,48) // y0_i, y0_r + XXSWAPD_S(1,49) // y1_i, y1_r + XXSWAPD_S(2,50) // y2_i, y2_r + XXSWAPD_S(3,51) // y3_i, y3_r "addi %2, %2, 64 \n\t" "addi %3, %3, 64 \n\t" @@ -77,19 +77,19 @@ static void zdot_kernel_8 (long n, double *x, double *y, double *dot) "lxvd2x 47, %9, %2 \n\t" // x3_r, x3_i "lxvd2x 7, %9, %3 \n\t" // y3_r, y3_i - "xxswapd 8, 4 \n\t" // y0_i, y0_r - "xxswapd 9, 5 \n\t" // y1_i, y1_r - "xxswapd 10, 6 \n\t" // y2_i, y2_r - "xxswapd 11, 7 \n\t" // y3_i, y3_r + XXSWAPD_S(8,4) // y0_i, y0_r + XXSWAPD_S(9,5) // y1_i, y1_r + XXSWAPD_S(10,6) // y2_i, y2_r + XXSWAPD_S(11,7) // y3_i, y3_r "addi %2, %2, 64 \n\t" "addi %3, %3, 64 \n\t" "addic. 
%1, %1, -8 \n\t" - "ble 2f \n\t" + "ble two%= \n\t" - ".p2align 5 \n" - "1: \n\t" + ".align 5 \n" + "one%=: \n\t" "xvmaddadp 32, 40, 48 \n\t" // x0_r * y0_r , x0_i * y0_i "lxvd2x 48, 0, %3 \n\t" // y0_r, y0_i @@ -111,14 +111,14 @@ static void zdot_kernel_8 (long n, double *x, double *y, double *dot) "xvmaddadp 39, 43, 3 \n\t" // x3_r * y3_i , x3_i * y3_r "lxvd2x 43, %9, %2 \n\t" // x3_r, x3_i - "xxswapd 0,48 \n\t" // y0_i, y0_r - "xxswapd 1,49 \n\t" // y1_i, y1_r + XXSWAPD_S(0,48) // y0_i, y0_r + XXSWAPD_S(1,49) // y1_i, y1_r "addi %2, %2, 64 \n\t" "addi %3, %3, 64 \n\t" - "xxswapd 2,50 \n\t" // y2_i, y2_r - "xxswapd 3,51 \n\t" // y3_i, y3_r + XXSWAPD_S(2,50) // y2_i, y2_r + XXSWAPD_S(3,51) // y3_i, y3_r "xvmaddadp 32, 44, 4 \n\t" // x0_r * y0_r , x0_i * y0_i "lxvd2x 4, 0, %3 \n\t" // y0_r, y0_i @@ -138,19 +138,19 @@ static void zdot_kernel_8 (long n, double *x, double *y, double *dot) "xvmaddadp 39, 47, 11 \n\t" // x3_r * y3_i , x3_i * y3_r "lxvd2x 47, %9, %2 \n\t" // x3_r, x3_i - "xxswapd 8,4 \n\t" // y0_i, y0_r - "xxswapd 9,5 \n\t" // y1_i, y1_r + XXSWAPD_S(8,4) // y0_i, y0_r + XXSWAPD_S(9,5) // y1_i, y1_r "addi %2, %2, 64 \n\t" "addi %3, %3, 64 \n\t" - "xxswapd 10,6 \n\t" // y2_i, y2_r - "xxswapd 11,7 \n\t" // y3_i, y3_r + XXSWAPD_S(10,6) // y2_i, y2_r + XXSWAPD_S(11,7) // y3_i, y3_r "addic. %1, %1, -8 \n\t" - "bgt 1b \n" + "bgt one%= \n" - "2: \n\t" + "two%=: \n\t" "xvmaddadp 32, 40, 48 \n\t" // x0_r * y0_r , x0_i * y0_i "xvmaddadp 34, 41, 49 \n\t" // x1_r * y1_r , x1_i * y1_i diff --git a/kernel/power/zgemm_macros_8x2_power8.S b/kernel/power/zgemm_macros_8x2_power8.S index c43a115b2..24a36470c 100644 --- a/kernel/power/zgemm_macros_8x2_power8.S +++ b/kernel/power/zgemm_macros_8x2_power8.S @@ -67,7 +67,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * Macros for N=2 and M=8 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD2x8_1', ` +#else .macro LOAD2x8_1 +#endif lxvd2x vs16, o0, BO // load real part from B lxvd2x vs17, o16, BO // load imag part from B @@ -91,9 +95,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_I1', ` +#else .macro KERNEL2x8_I1 +#endif lxvd2x vs8, o0, AO // load real,imag from A lxvd2x vs9, o16, AO // load real,imag from A @@ -151,9 +163,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs63, vs7, vs19 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_1', ` +#else .macro KERNEL2x8_1 +#endif lxvd2x vs8, o0, AO // load real,imag from A lxvd2x vs9, o16, AO // load real,imag from A @@ -211,9 +231,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs63, vs7, vs19 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_2', ` +#else .macro KERNEL2x8_2 +#endif lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -271,9 +299,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs63, vs15, vs23 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_E2', ` +#else .macro KERNEL2x8_E2 +#endif xvmaddadp vs32, vs8, vs20 // real*real, imag*real @@ -311,9 +347,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmaddadp vs63, vs15, vs23 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_SUBI1', ` +#else .macro KERNEL2x8_SUBI1 +#endif lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -371,9 +415,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs63, vs7, vs19 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_SUB1', ` +#else .macro KERNEL2x8_SUB1 +#endif lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -431,9 +483,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs63, vs7, vs19 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE2x8', ` +#else .macro SAVE2x8 +#endif mr T1, CO @@ -455,13 +515,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs33,vs33) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs32 // realA*realB XSFADD_R2 vs0, vs0, vs33 // imagA*imagB - xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs32,vs32) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs33,vs33) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs32 // realA*imagB XSFADD_I2 vs1, vs1, vs33 // imagA*realB @@ -479,13 +539,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs35,vs35) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs34 // realA*realB XSFADD_R2 vs0, vs0, vs35 // imagA*imagB - xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs34,vs34) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs35,vs35) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs34 // realA*imagB XSFADD_I2 vs1, vs1, vs35 // imagA*realB @@ -503,13 +563,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs37,vs37) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs36 // realA*realB XSFADD_R2 vs0, vs0, vs37 // imagA*imagB - xxswapd vs36, vs36 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs37, vs37 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs36,vs36) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs37,vs37) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs36 // realA*imagB XSFADD_I2 vs1, vs1, vs37 // imagA*realB @@ -527,13 +587,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs39,vs39) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs38 // realA*realB XSFADD_R2 vs0, vs0, vs39 // imagA*imagB - xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs38,vs38) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs39,vs39) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs38 // realA*imagB XSFADD_I2 vs1, vs1, vs39 // imagA*realB @@ -551,13 +611,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs41, vs41 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs41,vs41) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs40 // realA*realB XSFADD_R2 vs0, vs0, vs41 // imagA*imagB - xxswapd vs40, vs40 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs41, vs41 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs40,vs40) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs41,vs41) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs40 // realA*imagB XSFADD_I2 vs1, vs1, vs41 // imagA*realB @@ -575,13 +635,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs43, vs43 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs43,vs43) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs42 // realA*realB XSFADD_R2 vs0, vs0, vs43 // imagA*imagB - xxswapd vs42, vs42 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs43, vs43 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs42,vs42) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs43,vs43) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs42 // realA*imagB XSFADD_I2 vs1, vs1, vs43 // imagA*realB @@ -599,13 +659,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs45, vs45 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs45,vs45) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs44 // realA*realB XSFADD_R2 vs0, vs0, vs45 // imagA*imagB - xxswapd vs44, vs44 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs45, vs45 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs44,vs44) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs45,vs45) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs44 // realA*imagB XSFADD_I2 vs1, vs1, vs45 // imagA*realB @@ -623,13 +683,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs47, vs47 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs47,vs47) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs46 // realA*realB XSFADD_R2 vs0, vs0, vs47 // imagA*imagB - xxswapd vs46, vs46 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs47, vs47 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs46,vs46) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs47,vs47) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs46 // realA*imagB XSFADD_I2 vs1, vs1, vs47 // imagA*realB @@ -685,13 +745,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs49, vs49 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs49,vs49) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs48 // realA*realB XSFADD_R2 vs0, vs0, vs49 // imagA*imagB - xxswapd vs48, vs48 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs49, vs49 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs48,vs48) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs49,vs49) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs48 // realA*imagB XSFADD_I2 vs1, vs1, vs49 // imagA*realB @@ -709,13 +769,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs51, vs51 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs51,vs51) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs50 // realA*realB XSFADD_R2 vs0, vs0, vs51 // imagA*imagB - xxswapd vs50, vs50 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs51, vs51 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs50,vs50) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs51,vs51) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs50 // realA*imagB XSFADD_I2 vs1, vs1, vs51 // imagA*realB @@ -733,13 +793,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs53, vs53 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs53,vs53) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs52 // realA*realB XSFADD_R2 vs0, vs0, vs53 // imagA*imagB - xxswapd vs52, vs52 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs53, vs53 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs52,vs52) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs53,vs53) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs52 // realA*imagB XSFADD_I2 vs1, vs1, vs53 // imagA*realB @@ -757,13 +817,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs55, vs55 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs55,vs55) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs54 // realA*realB XSFADD_R2 vs0, vs0, vs55 // imagA*imagB - xxswapd vs54, vs54 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs55, vs55 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs54,vs54) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs55,vs55) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs54 // realA*imagB XSFADD_I2 vs1, vs1, vs55 // imagA*realB @@ -781,13 +841,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs57, vs57 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs57,vs57) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs56 // realA*realB XSFADD_R2 vs0, vs0, vs57 // imagA*imagB - xxswapd vs56, vs56 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs57, vs57 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs56,vs56) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs57,vs57) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs56 // realA*imagB XSFADD_I2 vs1, vs1, vs57 // imagA*realB @@ -805,13 +865,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs59, vs59 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs59,vs59) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs58 // realA*realB XSFADD_R2 vs0, vs0, vs59 // imagA*imagB - xxswapd vs58, vs58 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs59, vs59 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs58,vs58) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs59,vs59) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs58 // realA*imagB XSFADD_I2 vs1, vs1, vs59 // imagA*realB @@ -829,13 +889,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs61, vs61 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs61,vs61) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs60 // realA*realB XSFADD_R2 vs0, vs0, vs61 // imagA*imagB - xxswapd vs60, vs60 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs61, vs61 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs60,vs60) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs61,vs61) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs60 // realA*imagB XSFADD_I2 vs1, vs1, vs61 // imagA*realB @@ -853,13 +913,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs63, vs63 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs63,vs63) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs62 // realA*realB XSFADD_R2 vs0, vs0, vs63 // imagA*imagB - xxswapd vs62, vs62 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs63, vs63 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs62,vs62) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs63,vs63) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs62 // realA*imagB XSFADD_I2 vs1, vs1, vs63 // imagA*realB @@ -900,14 +960,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add T2, T2, LDC addi CO, CO, 128 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=4 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD2x4_1', ` +#else .macro LOAD2x4_1 +#endif lxvd2x vs16, o0, BO // load real part from B lxvd2x vs17, o16, BO // load imag part from B @@ -924,9 +992,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_I1', ` +#else .macro KERNEL2x4_I1 +#endif lxvd2x vs8, o0, AO // load real,imag from A lxvd2x vs9, o16, AO // load real,imag from A @@ -961,9 +1037,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs47, vs3, vs19 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_1', ` +#else .macro KERNEL2x4_1 +#endif lxvd2x vs8, o0, AO // load real,imag from A lxvd2x vs9, o16, AO // load real,imag from A @@ -998,9 +1082,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs47, vs3, vs19 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_2', ` +#else .macro KERNEL2x4_2 +#endif lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -1035,9 +1127,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs47, vs11, vs23 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_E2', ` +#else .macro KERNEL2x4_E2 +#endif xvmaddadp vs32, vs8, vs20 // real*real, imag*real @@ -1059,9 +1159,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs47, vs11, vs23 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_SUBI1', ` +#else .macro KERNEL2x4_SUBI1 +#endif lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -1096,9 +1204,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs47, vs3, vs19 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_SUB1', ` +#else .macro KERNEL2x4_SUB1 +#endif lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -1133,9 +1249,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmaddadp vs47, vs3, vs19 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE2x4', ` +#else .macro SAVE2x4 +#endif mr T1, CO @@ -1152,13 +1276,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs33,vs33) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs32 // realA*realB XSFADD_R2 vs0, vs0, vs33 // imagA*imagB - xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs32,vs32) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs33,vs33) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs32 // realA*imagB XSFADD_I2 vs1, vs1, vs33 // imagA*realB @@ -1176,13 +1300,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs35,vs35) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs34 // realA*realB XSFADD_R2 vs0, vs0, vs35 // imagA*imagB - xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs34,vs34) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs35,vs35) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs34 // realA*imagB XSFADD_I2 vs1, vs1, vs35 // imagA*realB @@ -1200,13 +1324,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs37,vs37) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs36 // realA*realB XSFADD_R2 vs0, vs0, vs37 // imagA*imagB - xxswapd vs36, vs36 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs37, vs37 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs36,vs36) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs37,vs37) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs36 // realA*imagB XSFADD_I2 vs1, vs1, vs37 // imagA*realB @@ -1224,13 +1348,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs39,vs39) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs38 // realA*realB XSFADD_R2 vs0, vs0, vs39 // imagA*imagB - xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs38,vs38) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs39,vs39) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs38 // realA*imagB XSFADD_I2 vs1, vs1, vs39 // imagA*realB @@ -1273,13 +1397,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs41, vs41 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs41,vs41) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs40 // realA*realB XSFADD_R2 vs0, vs0, vs41 // imagA*imagB - xxswapd vs40, vs40 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs41, vs41 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs40,vs40) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs41,vs41) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs40 // realA*imagB XSFADD_I2 vs1, vs1, vs41 // imagA*realB @@ -1297,13 +1421,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs43, vs43 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs43,vs43) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs42 // realA*realB XSFADD_R2 vs0, vs0, vs43 // imagA*imagB - xxswapd vs42, vs42 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs43, vs43 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs42,vs42) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs43,vs43) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs42 // realA*imagB XSFADD_I2 vs1, vs1, vs43 // imagA*realB @@ -1321,13 +1445,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs45, vs45 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs45,vs45) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs44 // realA*realB XSFADD_R2 vs0, vs0, vs45 // imagA*imagB - xxswapd vs44, vs44 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs45, vs45 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs44,vs44) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs45,vs45) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs44 // realA*imagB XSFADD_I2 vs1, vs1, vs45 // imagA*realB @@ -1345,13 +1469,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs47, vs47 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs47,vs47) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs46 // realA*realB XSFADD_R2 vs0, vs0, vs47 // imagA*imagB - xxswapd vs46, vs46 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs47, vs47 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs46,vs46) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs47,vs47) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs46 // realA*imagB XSFADD_I2 vs1, vs1, vs47 // imagA*realB @@ -1383,14 +1507,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
add T1, T1, LDC addi CO, CO, 64 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=2 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD2x2_1', ` +#else .macro LOAD2x2_1 +#endif lxvd2x vs16, o0, BO // load real part from B lxvd2x vs17, o16, BO // load imag part from B @@ -1405,9 +1537,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_I1', ` +#else .macro KERNEL2x2_I1 +#endif lxvd2x vs8, o0, AO // load real,imag from A lxvd2x vs9, o16, AO // load real,imag from A @@ -1432,9 +1572,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs39, vs1, vs19 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_1', ` +#else .macro KERNEL2x2_1 +#endif lxvd2x vs8, o0, AO // load real,imag from A lxvd2x vs9, o16, AO // load real,imag from A @@ -1459,9 +1607,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs39, vs1, vs19 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_2', ` +#else .macro KERNEL2x2_2 +#endif lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -1486,9 +1642,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs39, vs9, vs23 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_E2', ` +#else .macro KERNEL2x2_E2 +#endif xvmaddadp vs32, vs8, vs20 // real*real, imag*real @@ -1502,9 +1666,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs39, vs9, vs23 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_SUBI1', ` +#else .macro KERNEL2x2_SUBI1 +#endif lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -1529,9 +1701,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs39, vs1, vs19 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_SUB1', ` +#else .macro KERNEL2x2_SUB1 +#endif lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -1556,9 +1736,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs39, vs1, vs19 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE2x2', ` +#else .macro SAVE2x2 +#endif mr T1, CO @@ -1573,13 +1761,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs33,vs33) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs32 // realA*realB XSFADD_R2 vs0, vs0, vs33 // imagA*imagB - xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs32,vs32) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs33,vs33) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs32 // realA*imagB XSFADD_I2 vs1, vs1, vs33 // imagA*realB @@ -1597,13 +1785,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs35,vs35) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs34 // realA*realB XSFADD_R2 vs0, vs0, vs35 // imagA*imagB - xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs34,vs34) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs35,vs35) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs34 // realA*imagB XSFADD_I2 vs1, vs1, vs35 // imagA*realB @@ -1640,13 +1828,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs37,vs37) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs36 // realA*realB XSFADD_R2 vs0, vs0, vs37 // imagA*imagB - xxswapd vs36, vs36 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs37, vs37 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs36,vs36) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs37,vs37) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs36 // realA*imagB XSFADD_I2 vs1, vs1, vs37 // imagA*realB @@ -1664,13 +1852,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs39,vs39) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs38 // realA*realB XSFADD_R2 vs0, vs0, vs39 // imagA*imagB - xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs38,vs38) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs39,vs39) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs38 // realA*imagB XSFADD_I2 vs1, vs1, vs39 // imagA*realB @@ -1698,14 +1886,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
add T1, T1, LDC addi CO, CO, 32 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=1 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD2x1_1', ` +#else .macro LOAD2x1_1 +#endif lxvd2x vs16, o0, BO // load real part from B lxvd2x vs17, o16, BO // load imag part from B @@ -1719,9 +1915,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_I1', ` +#else .macro KERNEL2x1_I1 +#endif lxvd2x vs8, o0, AO // load real,imag from A @@ -1741,9 +1945,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs35, vs0, vs19 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_1', ` +#else .macro KERNEL2x1_1 +#endif lxvd2x vs8, o0, AO // load real,imag from A @@ -1763,9 +1975,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs35, vs0, vs19 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_2', ` +#else .macro KERNEL2x1_2 +#endif lxvd2x vs0, o0, AO // load real,imag from A @@ -1785,9 +2005,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs35, vs8, vs23 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_E2', ` +#else .macro KERNEL2x1_E2 +#endif xvmaddadp vs32, vs8, vs20 // real*real, imag*real @@ -1797,9 +2025,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs35, vs8, vs23 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_SUBI1', ` +#else .macro KERNEL2x1_SUBI1 +#endif lxvd2x vs0, o0, AO // load real,imag from A @@ -1819,9 +2055,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs35, vs0, vs19 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_SUB1', ` +#else .macro KERNEL2x1_SUB1 +#endif lxvd2x vs0, o0, AO // load real,imag from A @@ -1841,9 +2085,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs35, vs0, vs19 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE2x1', ` +#else .macro SAVE2x1 +#endif mr T1, CO @@ -1857,13 +2109,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs33,vs33) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs32 // realA*realB XSFADD_R2 vs0, vs0, vs33 // imagA*imagB - xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs32,vs32) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs33,vs33) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs32 // realA*imagB XSFADD_I2 vs1, vs1, vs33 // imagA*realB @@ -1897,13 +2149,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs35,vs35) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs34 // realA*realB XSFADD_R2 vs0, vs0, vs35 // imagA*imagB - xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs34,vs34) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs35,vs35) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs34 // realA*imagB XSFADD_I2 vs1, vs1, vs35 // imagA*realB @@ -1929,14 +2181,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add T1, T1, LDC addi CO, CO, 16 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=8 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD1x8_1', ` +#else .macro LOAD1x8_1 +#endif lxvd2x vs16, o0, BO // load real part from B lxvd2x vs17, o16, BO // load imag part from B @@ -1958,9 +2218,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_I1', ` +#else .macro KERNEL1x8_I1 +#endif lxvd2x vs8, o0, AO // load real,imag from A lxvd2x vs9, o16, AO // load real,imag from A @@ -1999,9 +2267,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs47, vs7, vs17 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_1', ` +#else .macro KERNEL1x8_1 +#endif lxvd2x vs8, o0, AO // load real,imag from A lxvd2x vs9, o16, AO // load real,imag from A @@ -2040,9 +2316,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_2', ` +#else .macro KERNEL1x8_2 +#endif lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -2081,9 +2365,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_E2', ` +#else .macro KERNEL1x8_E2 +#endif xvmaddadp vs32, vs8, vs20 // real*real, imag*real @@ -2104,9 +2396,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_SUBI1', ` +#else .macro KERNEL1x8_SUBI1 +#endif lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -2145,9 +2445,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs47, vs7, vs17 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_SUB1', ` +#else .macro KERNEL1x8_SUB1 +#endif lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -2186,9 +2494,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE1x8', ` +#else .macro SAVE1x8 +#endif mr T1, CO @@ -2210,13 +2526,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs33,vs33) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs32 // realA*realB XSFADD_R2 vs0, vs0, vs33 // imagA*imagB - xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs32,vs32) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs33,vs33) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs32 // realA*imagB XSFADD_I2 vs1, vs1, vs33 // imagA*realB @@ -2234,13 +2550,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs35,vs35) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs34 // realA*realB XSFADD_R2 vs0, vs0, vs35 // imagA*imagB - xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs34,vs34) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs35,vs35) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs34 // realA*imagB XSFADD_I2 vs1, vs1, vs35 // imagA*realB @@ -2258,13 +2574,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs37,vs37) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs36 // realA*realB XSFADD_R2 vs0, vs0, vs37 // imagA*imagB - xxswapd vs36, vs36 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs37, vs37 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs36,vs36) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs37,vs37) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs36 // realA*imagB XSFADD_I2 vs1, vs1, vs37 // imagA*realB @@ -2282,13 +2598,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs39,vs39) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs38 // realA*realB XSFADD_R2 vs0, vs0, vs39 // imagA*imagB - xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs38,vs38) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs39,vs39) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs38 // realA*imagB XSFADD_I2 vs1, vs1, vs39 // imagA*realB @@ -2306,13 +2622,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs41, vs41 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs41,vs41) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs40 // realA*realB XSFADD_R2 vs0, vs0, vs41 // imagA*imagB - xxswapd vs40, vs40 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs41, vs41 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs40,vs40) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs41,vs41) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs40 // realA*imagB XSFADD_I2 vs1, vs1, vs41 // imagA*realB @@ -2330,13 +2646,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs43, vs43 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs43,vs43) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs42 // realA*realB XSFADD_R2 vs0, vs0, vs43 // imagA*imagB - xxswapd vs42, vs42 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs43, vs43 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs42,vs42) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs43,vs43) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs42 // realA*imagB XSFADD_I2 vs1, vs1, vs43 // imagA*realB @@ -2354,13 +2670,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs45, vs45 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs45,vs45) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs44 // realA*realB XSFADD_R2 vs0, vs0, vs45 // imagA*imagB - xxswapd vs44, vs44 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs45, vs45 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs44,vs44) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs45,vs45) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs44 // realA*imagB XSFADD_I2 vs1, vs1, vs45 // imagA*realB @@ -2378,13 +2694,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs47, vs47 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs47,vs47) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs46 // realA*realB XSFADD_R2 vs0, vs0, vs47 // imagA*imagB - xxswapd vs46, vs46 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs47, vs47 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs46,vs46) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs47,vs47) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs46 // realA*imagB XSFADD_I2 vs1, vs1, vs47 // imagA*realB @@ -2425,14 +2741,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
add T2, T2, LDC addi CO, CO, 128 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=4 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD1x4_1', ` +#else .macro LOAD1x4_1 +#endif lxvd2x vs16, o0, BO // load real part from B lxvd2x vs17, o16, BO // load imag part from B @@ -2447,9 +2771,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_I1', ` +#else .macro KERNEL1x4_I1 +#endif lxvd2x vs8, o0, AO // load real,imag from A lxvd2x vs9, o16, AO // load real,imag from A @@ -2473,9 +2805,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs39, vs3, vs17 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_1', ` +#else .macro KERNEL1x4_1 +#endif lxvd2x vs8, o0, AO // load real,imag from A lxvd2x vs9, o16, AO // load real,imag from A @@ -2499,9 +2839,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_2', ` +#else .macro KERNEL1x4_2 +#endif lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -2525,9 +2873,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_E2', ` +#else .macro KERNEL1x4_E2 +#endif xvmaddadp vs32, vs8, vs20 // real*real, imag*real @@ -2540,9 +2896,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_SUBI1', ` +#else .macro KERNEL1x4_SUBI1 +#endif lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -2566,9 +2930,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs39, vs3, vs17 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_SUB1', ` +#else .macro KERNEL1x4_SUB1 +#endif lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -2592,9 +2964,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE1x4', ` +#else .macro SAVE1x4 +#endif mr T1, CO @@ -2611,13 +2991,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs33,vs33) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs32 // realA*realB XSFADD_R2 vs0, vs0, vs33 // imagA*imagB - xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs32,vs32) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs33,vs33) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs32 // realA*imagB XSFADD_I2 vs1, vs1, vs33 // imagA*realB @@ -2635,13 +3015,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs35,vs35) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs34 // realA*realB XSFADD_R2 vs0, vs0, vs35 // imagA*imagB - xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs34,vs34) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs35,vs35) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs34 // realA*imagB XSFADD_I2 vs1, vs1, vs35 // imagA*realB @@ -2659,13 +3039,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs37,vs37) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs36 // realA*realB XSFADD_R2 vs0, vs0, vs37 // imagA*imagB - xxswapd vs36, vs36 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs37, vs37 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs36,vs36) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs37,vs37) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs36 // realA*imagB XSFADD_I2 vs1, vs1, vs37 // imagA*realB @@ -2683,13 +3063,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs39,vs39) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs38 // realA*realB XSFADD_R2 vs0, vs0, vs39 // imagA*imagB - xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs38,vs38) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs39,vs39) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs38 // realA*imagB XSFADD_I2 vs1, vs1, vs39 // imagA*realB @@ -2721,14 +3101,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
add T1, T1, LDC addi CO, CO, 64 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=2 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD1x2_1', ` +#else .macro LOAD1x2_1 +#endif lxvd2x vs16, o0, BO // load real part from B lxvd2x vs17, o16, BO // load imag part from B @@ -2741,9 +3129,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_I1', ` +#else .macro KERNEL1x2_I1 +#endif lxvd2x vs8, o0, AO // load real,imag from A lxvd2x vs9, o16, AO // load real,imag from A @@ -2761,9 +3157,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs35, vs1, vs17 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_1', ` +#else .macro KERNEL1x2_1 +#endif lxvd2x vs8, o0, AO // load real,imag from A lxvd2x vs9, o16, AO // load real,imag from A @@ -2781,9 +3185,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_2', ` +#else .macro KERNEL1x2_2 +#endif lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -2801,9 +3213,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_E2', ` +#else .macro KERNEL1x2_E2 +#endif xvmaddadp vs32, vs8, vs20 // real*real, imag*real @@ -2812,9 +3232,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_SUBI1', ` +#else .macro KERNEL1x2_SUBI1 +#endif lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -2832,9 +3260,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs35, vs1, vs17 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_SUB1', ` +#else .macro KERNEL1x2_SUB1 +#endif lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -2852,9 +3288,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE1x2', ` +#else .macro SAVE1x2 +#endif mr T1, CO @@ -2869,13 +3313,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs33,vs33) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs32 // realA*realB XSFADD_R2 vs0, vs0, vs33 // imagA*imagB - xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs32,vs32) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs33,vs33) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs32 // realA*imagB XSFADD_I2 vs1, vs1, vs33 // imagA*realB @@ -2893,13 +3337,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs35,vs35) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs34 // realA*realB XSFADD_R2 vs0, vs0, vs35 // imagA*imagB - xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs34,vs34) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs35,vs35) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs34 // realA*imagB XSFADD_I2 vs1, vs1, vs35 // imagA*realB @@ -2927,14 +3371,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add T1, T1, LDC addi CO, CO, 32 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=1 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD1x1_1', ` +#else .macro LOAD1x1_1 +#endif lxvd2x vs16, o0, BO // load real part from B lxvd2x vs17, o16, BO // load imag part from B @@ -2946,9 +3398,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_I1', ` +#else .macro KERNEL1x1_I1 +#endif lxvd2x vs8, o0, AO // load real,imag from A @@ -2963,9 +3423,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs33, vs0, vs17 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_1', ` +#else .macro KERNEL1x1_1 +#endif lxvd2x vs8, o0, AO // load real,imag from A @@ -2980,9 +3448,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_2', ` +#else .macro KERNEL1x1_2 +#endif lxvd2x vs0, o0, AO // load real,imag from A @@ -2997,18 +3473,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_E2', ` +#else .macro KERNEL1x1_E2 +#endif xvmaddadp vs32, vs8, vs20 // real*real, imag*real xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_SUBI1', ` +#else .macro KERNEL1x1_SUBI1 +#endif lxvd2x vs0, o0, AO // load real,imag from A @@ -3023,9 +3515,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmuldp vs33, vs0, vs17 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_SUB1', ` +#else .macro KERNEL1x1_SUB1 +#endif lxvd2x vs0, o0, AO // load real,imag from A @@ -3040,9 +3540,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE1x1', ` +#else .macro SAVE1x1 +#endif mr T1, CO @@ -3056,13 +3564,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs33,vs33) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs32 // realA*realB XSFADD_R2 vs0, vs0, vs33 // imagA*imagB - xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs32,vs32) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs33,vs33) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs32 // realA*imagB XSFADD_I2 vs1, vs1, vs33 // imagA*realB @@ -3088,11 +3596,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add T1, T1, LDC addi CO, CO, 16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`ZCOPYB_1x1', ` +#else .macro ZCOPYB_1x1 +#endif lxvdsx vs4, o0, BO // b0_r lxvdsx vs5, o8, BO // b0_i @@ -3101,10 +3617,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvd2x vs5, o16, BBO addi BBO, BBO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`ZCOPYB_8x1', ` +#else .macro ZCOPYB_8x1 +#endif lxvd2x vs32, o0, BO lxvd2x vs33, o16, BO @@ -3118,23 +3642,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. lxvd2x vs39, o48, BO addi BO, BO, 64 - xxspltd vs40, vs32, 0 - xxspltd vs41, vs32, 1 - xxspltd vs42, vs33, 0 - xxspltd vs43, vs33, 1 - xxspltd vs44, vs34, 0 - xxspltd vs45, vs34, 1 - xxspltd vs46, vs35, 0 - xxspltd vs47, vs35, 1 + XXSPLTD(vs40,vs32,0) + XXSPLTD(vs41,vs32,1) + XXSPLTD(vs42,vs33,0) + XXSPLTD(vs43,vs33,1) + XXSPLTD(vs44,vs34,0) + XXSPLTD(vs45,vs34,1) + XXSPLTD(vs46,vs35,0) + XXSPLTD(vs47,vs35,1) - xxspltd vs48, vs36, 0 - xxspltd vs49, vs36, 1 - xxspltd vs50, vs37, 0 - xxspltd vs51, vs37, 1 - xxspltd vs52, vs38, 0 - xxspltd vs53, vs38, 1 - xxspltd vs54, vs39, 0 - xxspltd vs55, vs39, 1 + XXSPLTD(vs48,vs36,0) + XXSPLTD(vs49,vs36,1) + XXSPLTD(vs50,vs37,0) + XXSPLTD(vs51,vs37,1) + XXSPLTD(vs52,vs38,0) + XXSPLTD(vs53,vs38,1) + XXSPLTD(vs54,vs39,0) + XXSPLTD(vs55,vs39,1) stxvd2x vs40, o0, BBO stxvd2x vs41, o16, BBO @@ -3160,6 +3684,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvd2x vs55, o48, BBO addi BBO, BBO, 64 +#if defined(_AIX) +') +#else .endm +#endif diff --git a/kernel/power/zgemm_tcopy_macros_8_power8.S b/kernel/power/zgemm_tcopy_macros_8_power8.S index 3f5a5ed03..654332375 100644 --- a/kernel/power/zgemm_tcopy_macros_8_power8.S +++ b/kernel/power/zgemm_tcopy_macros_8_power8.S @@ -38,7 +38,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* Macros for N=4 and M=8 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_4x8', ` +#else .macro COPY_4x8 +#endif lxvd2x vs32, o0, A0 lxvd2x vs33, o16, A0 @@ -144,14 +148,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvd2x vs12, o32, T1 stxvd2x vs13, o48, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=4 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_4x4', ` +#else .macro COPY_4x4 +#endif lxvd2x vs32, o0, A0 lxvd2x vs33, o16, A0 @@ -209,14 +221,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvd2x vs46, o32, T1 stxvd2x vs47, o48, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=2 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_4x2', ` +#else .macro COPY_4x2 +#endif lxvd2x vs32, o0, A0 lxvd2x vs33, o16, A0 @@ -254,14 +274,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvd2x vs38, o32, T1 stxvd2x vs39, o48, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=4 and M=1 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_4x1', ` +#else .macro COPY_4x1 +#endif lxvd2x vs32, o0, A0 addi A0, A0, 16 @@ -289,14 +317,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvd2x vs35, o48, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=8 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_2x8', ` +#else .macro COPY_2x8 +#endif lxvd2x vs32, o0, A0 lxvd2x vs33, o16, A0 @@ -350,14 +386,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvd2x vs46, o32, T1 stxvd2x vs47, o48, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=4 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_2x4', ` +#else .macro COPY_2x4 +#endif lxvd2x vs32, o0, A0 lxvd2x vs33, o16, A0 @@ -387,14 +431,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvd2x vs38, o32, T1 stxvd2x vs39, o48, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=2 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_2x2', ` +#else .macro COPY_2x2 +#endif lxvd2x vs32, o0, A0 lxvd2x vs33, o16, A0 @@ -414,14 +466,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
stxvd2x vs34, o32, T1 stxvd2x vs35, o48, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=1 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_2x1', ` +#else .macro COPY_2x1 +#endif lxvd2x vs32, o0, A0 addi A0, A0, 16 @@ -437,14 +497,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvd2x vs33, o16, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=8 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_1x8', ` +#else .macro COPY_1x8 +#endif lxvd2x vs32, o0, A0 lxvd2x vs33, o16, A0 @@ -472,14 +540,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvd2x vs38, o32, T1 stxvd2x vs39, o48, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=4 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_1x4', ` +#else .macro COPY_1x4 +#endif lxvd2x vs32, o0, A0 lxvd2x vs33, o16, A0 @@ -495,14 +571,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvd2x vs34, o32, T1 stxvd2x vs35, o48, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=2 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_1x2', ` +#else .macro COPY_1x2 +#endif lxvd2x vs32, o0, A0 lxvd2x vs33, o16, A0 @@ -514,14 +598,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvd2x vs32, o0, T1 stxvd2x vs33, o16, T1 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=1 **********************************************************************************************/ +#if defined(_AIX) +define(`COPY_1x1', ` +#else .macro COPY_1x1 +#endif lxvd2x vs32, o0, A0 addi A0, A0, 16 @@ -531,5 +623,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stxvd2x vs32, o0, T1 +#if defined(_AIX) +') +#else .endm +#endif diff --git a/kernel/power/zrot.c b/kernel/power/zrot.c index d45468fd5..c6d666178 100644 --- a/kernel/power/zrot.c +++ b/kernel/power/zrot.c @@ -40,8 +40,8 @@ static void zrot_kernel_4(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT cosA, FLOAT si __asm__ ( - "xxspltd 36, %x[cos], 0 \n\t" // load c to both dwords - "xxspltd 37, %x[sin], 0 \n\t" // load s to both dwords + XXSPLTD_S(36,%x[cos],0) // load c to both dwords + XXSPLTD_S(37,%x[sin],0) // load s to both dwords "lxvd2x 32, 0, %[x_ptr] \n\t" // load x "lxvd2x 33, %[i16], %[x_ptr] \n\t" @@ -57,10 +57,10 @@ static void zrot_kernel_4(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT cosA, FLOAT si "addi %[y_ptr], %[y_ptr], 64 \n\t" "addic. 
%[temp_n], %[temp_n], -4 \n\t" - "ble 2f \n\t" + "ble two%= \n\t" - ".p2align 5 \n" - "1: \n\t" + ".align 5 \n" + "one%=: \n\t" "xvmuldp 40, 32, 36 \n\t" // c * x "xvmuldp 41, 33, 36 \n\t" @@ -124,9 +124,9 @@ static void zrot_kernel_4(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT cosA, FLOAT si "addi %[y_ptr], %[y_ptr], 128 \n\t" "addic. %[temp_n], %[temp_n], -4 \n\t" - "bgt+ 1b \n" + "bgt+ one%= \n" - "2: \n\t" + "two%=: \n\t" "xvmuldp 40, 32, 36 \n\t" // c * x "xvmuldp 41, 33, 36 \n\t" diff --git a/kernel/power/zscal_microk_power8.c b/kernel/power/zscal_microk_power8.c index aba9029a0..567331775 100644 --- a/kernel/power/zscal_microk_power8.c +++ b/kernel/power/zscal_microk_power8.c @@ -58,8 +58,8 @@ static void zscal_kernel_8 (long n, double *x, double alpha_r, double alpha_i) "dcbt 0, %2 \n\t" "xsnegdp 33, %x16 \n\t" // -alpha_i - "xxspltd 32, %x15, 0 \n\t" // alpha_r , alpha_r - "xxmrghd 33, 33, %x16 \n\t" // -alpha_i , alpha_i + XXSPLTD_S(32,%x15,0) // alpha_r , alpha_r + XXMRGHD_S(33,33,%x16) // -alpha_i , alpha_i "lxvd2x 40, 0, %2 \n\t" // x0_r, x0_i "lxvd2x 41, %17, %2 \n\t" @@ -73,10 +73,10 @@ static void zscal_kernel_8 (long n, double *x, double alpha_r, double alpha_i) "addi %2, %2, 128 \n\t" "addic. %1, %1, -8 \n\t" - "ble 2f \n\t" + "ble two%= \n\t" - ".p2align 5 \n" - "1: \n\t" + ".align 5 \n" + "one%=: \n\t" "xvmuldp 48, 40, 32 \n\t" // x0_r * alpha_r, x0_i * alpha_r "xvmuldp 49, 41, 32 \n\t" @@ -87,14 +87,14 @@ static void zscal_kernel_8 (long n, double *x, double alpha_r, double alpha_i) "xvmuldp %x5, 46, 32 \n\t" "xvmuldp %x6, 47, 32 \n\t" - "xxswapd %x7, 40 \n\t" - "xxswapd %x8, 41 \n\t" - "xxswapd %x9, 42 \n\t" - "xxswapd %x10, 43 \n\t" - "xxswapd %x11, 44 \n\t" - "xxswapd %x12, 45 \n\t" - "xxswapd %x13, 46 \n\t" - "xxswapd %x14, 47 \n\t" + XXSWAPD_S(%x7,40) + XXSWAPD_S(%x8,41) + XXSWAPD_S(%x9,42) + XXSWAPD_S(%x10,43) + XXSWAPD_S(%x11,44) + XXSWAPD_S(%x12,45) + XXSWAPD_S(%x13,46) + XXSWAPD_S(%x14,47) "xvmuldp %x7, %x7, 33 \n\t" // x0_i * -alpha_i, x0_r * alpha_i "xvmuldp %x8, %x8, 33 \n\t" @@ -147,9 +147,9 @@ static void zscal_kernel_8 (long n, double *x, double alpha_r, double alpha_i) "addi %2, %2, 256 \n\t" "addic. %1, %1, -8 \n\t" - "bgt 1b \n" + "bgt one%= \n" - "2: \n\t" + "two%=: \n\t" "xvmuldp 48, 40, 32 \n\t" // x0_r * alpha_r, x0_i * alpha_r "xvmuldp 49, 41, 32 \n\t" @@ -160,14 +160,14 @@ static void zscal_kernel_8 (long n, double *x, double alpha_r, double alpha_i) "xvmuldp %x5, 46, 32 \n\t" "xvmuldp %x6, 47, 32 \n\t" - "xxswapd %x7, 40 \n\t" - "xxswapd %x8, 41 \n\t" - "xxswapd %x9, 42 \n\t" - "xxswapd %x10, 43 \n\t" - "xxswapd %x11, 44 \n\t" - "xxswapd %x12, 45 \n\t" - "xxswapd %x13, 46 \n\t" - "xxswapd %x14, 47 \n\t" + XXSWAPD_S(%x7,40) + XXSWAPD_S(%x8,41) + XXSWAPD_S(%x9,42) + XXSWAPD_S(%x10,43) + XXSWAPD_S(%x11,44) + XXSWAPD_S(%x12,45) + XXSWAPD_S(%x13,46) + XXSWAPD_S(%x14,47) "addi %2, %2, -128 \n\t" diff --git a/kernel/power/zswap_microk_power8.c b/kernel/power/zswap_microk_power8.c index 54391ba5d..1e9fbe2cf 100644 --- a/kernel/power/zswap_microk_power8.c +++ b/kernel/power/zswap_microk_power8.c @@ -40,8 +40,8 @@ zswap_kernel_16 (long n, double *x, double *y) { __asm__ ( - ".p2align 5 \n" - "1: \n\t" + ".align 5 \n" + "one%=: \n\t" "lxvd2x 32, 0, %4 \n\t" "lxvd2x 33, %5, %4 \n\t" "lxvd2x 34, %6, %4 \n\t" @@ -130,7 +130,7 @@ zswap_kernel_16 (long n, double *x, double *y) "addi %4, %4, 128 \n\t" "addic. 
%2, %2, -16 \n\t" - "bgt 1b \n" + "bgt one%= \n" "#n=%2 x=%0=%3 y=%1=%4 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11" : diff --git a/kernel/power/ztrmm_macros_8x2_power8.S b/kernel/power/ztrmm_macros_8x2_power8.S index 701ec65c8..b3fbcd220 100644 --- a/kernel/power/ztrmm_macros_8x2_power8.S +++ b/kernel/power/ztrmm_macros_8x2_power8.S @@ -68,7 +68,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * Macros for N=2 and M=8 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD2x8_1', ` +#else .macro LOAD2x8_1 +#endif lxvdsx vs16, o0, BO // load real part from B lxvdsx vs17, o8, BO // load imag part from B @@ -92,9 +96,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_I1', ` +#else .macro KERNEL2x8_I1 +#endif lxvd2x vs8, o0, AO // load real,imag from A lxvd2x vs9, o16, AO // load real,imag from A @@ -152,9 +164,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs63, vs7, vs19 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_1', ` +#else .macro KERNEL2x8_1 +#endif xvmaddadp vs32, vs0, vs16 // real*real, imag*real @@ -221,9 +241,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_2', ` +#else .macro KERNEL2x8_2 +#endif xvmaddadp vs32, vs8, vs20 // real*real, imag*real @@ -289,9 +317,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 addi BO, BO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_E2', ` +#else .macro KERNEL2x8_E2 +#endif xvmaddadp vs32, vs8, vs20 // real*real, imag*real @@ -329,9 +365,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs63, vs15, vs23 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_SUBI1', ` +#else .macro KERNEL2x8_SUBI1 +#endif lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -389,9 +433,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs63, vs7, vs19 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x8_SUB1', ` +#else .macro KERNEL2x8_SUB1 +#endif lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -449,9 +501,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs63, vs7, vs19 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE2x8', ` +#else .macro SAVE2x8 +#endif mr T1, CO @@ -473,13 +533,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
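In the C micro-kernels just patched (zrot, zscal, zswap) the assembler limitation shows up inside __asm__ blocks instead: numeric local labels (1:, 1b, 2f) and .p2align are GNU-as features, so the loops switch to .align (which on POWER takes the same power-of-two argument) and to label names carrying the %= template escape, which the compiler expands to a number unique to each asm statement. Below is a minimal, self-contained illustration of that idiom; dscale is a hypothetical helper, not one of the kernels above, and it of course builds only for POWER targets.

#include <stdio.h>

static void dscale(long n, double *x, double alpha)   /* assumes n >= 1 */
{
    double t;
    __asm__ __volatile__ (
        ".align  5                \n"
        "one%=:                   \n\t"
        "lfd     %2, 0(%0)        \n\t"   /* t = *x                  */
        "fmul    %2, %2, %3       \n\t"   /* t *= alpha              */
        "stfd    %2, 0(%0)        \n\t"   /* *x = t                  */
        "addi    %0, %0, 8        \n\t"   /* ++x                     */
        "addic.  %1, %1, -1       \n\t"   /* --n, sets cr0           */
        "bgt     one%=            \n"     /* loop on the unique name */
        : "+b" (x), "+r" (n), "=&f" (t)
        : "f" (alpha)
        : "cr0", "memory");
}

int main(void)
{
    double v[4] = { 1.0, 2.0, 3.0, 4.0 };
    dscale(4, v, 0.5);
    printf("%g %g %g %g\n", v[0], v[1], v[2], v[3]);
    return 0;
}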
xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs33,vs33) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs32 // realA*realB XSFADD_R2 vs0, vs0, vs33 // imagA*imagB - xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs32,vs32) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs33,vs33) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs32 // realA*imagB XSFADD_I2 vs1, vs1, vs33 // imagA*realB @@ -497,13 +557,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs35,vs35) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs34 // realA*realB XSFADD_R2 vs0, vs0, vs35 // imagA*imagB - xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs34,vs34) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs35,vs35) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs34 // realA*imagB XSFADD_I2 vs1, vs1, vs35 // imagA*realB @@ -521,13 +581,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs37,vs37) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs36 // realA*realB XSFADD_R2 vs0, vs0, vs37 // imagA*imagB - xxswapd vs36, vs36 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs37, vs37 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs36,vs36) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs37,vs37) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs36 // realA*imagB XSFADD_I2 vs1, vs1, vs37 // imagA*realB @@ -545,13 +605,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs39,vs39) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs38 // realA*realB XSFADD_R2 vs0, vs0, vs39 // imagA*imagB - xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs38,vs38) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs39,vs39) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs38 // realA*imagB XSFADD_I2 vs1, vs1, vs39 // imagA*realB @@ -569,13 +629,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs41, vs41 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs41,vs41) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs40 // realA*realB XSFADD_R2 vs0, vs0, vs41 // imagA*imagB - xxswapd vs40, vs40 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs41, vs41 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs40,vs40) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs41,vs41) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs40 // realA*imagB XSFADD_I2 vs1, vs1, vs41 // imagA*realB @@ -593,13 +653,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs43, vs43 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs43,vs43) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs42 // realA*realB XSFADD_R2 vs0, vs0, vs43 // imagA*imagB - xxswapd vs42, vs42 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs43, vs43 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs42,vs42) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs43,vs43) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs42 // realA*imagB XSFADD_I2 vs1, vs1, vs43 // imagA*realB @@ -617,13 +677,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs45, vs45 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs45,vs45) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs44 // realA*realB XSFADD_R2 vs0, vs0, vs45 // imagA*imagB - xxswapd vs44, vs44 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs45, vs45 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs44,vs44) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs45,vs45) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs44 // realA*imagB XSFADD_I2 vs1, vs1, vs45 // imagA*realB @@ -641,13 +701,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs47, vs47 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs47,vs47) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs46 // realA*realB XSFADD_R2 vs0, vs0, vs47 // imagA*imagB - xxswapd vs46, vs46 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs47, vs47 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs46,vs46) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs47,vs47) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs46 // realA*imagB XSFADD_I2 vs1, vs1, vs47 // imagA*realB @@ -703,13 +763,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs49, vs49 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs49,vs49) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs48 // realA*realB XSFADD_R2 vs0, vs0, vs49 // imagA*imagB - xxswapd vs48, vs48 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs49, vs49 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs48,vs48) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs49,vs49) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs48 // realA*imagB XSFADD_I2 vs1, vs1, vs49 // imagA*realB @@ -727,13 +787,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs51, vs51 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs51,vs51) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs50 // realA*realB XSFADD_R2 vs0, vs0, vs51 // imagA*imagB - xxswapd vs50, vs50 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs51, vs51 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs50,vs50) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs51,vs51) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs50 // realA*imagB XSFADD_I2 vs1, vs1, vs51 // imagA*realB @@ -751,13 +811,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs53, vs53 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs53,vs53) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs52 // realA*realB XSFADD_R2 vs0, vs0, vs53 // imagA*imagB - xxswapd vs52, vs52 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs53, vs53 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs52,vs52) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs53,vs53) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs52 // realA*imagB XSFADD_I2 vs1, vs1, vs53 // imagA*realB @@ -775,13 +835,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs55, vs55 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs55,vs55) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs54 // realA*realB XSFADD_R2 vs0, vs0, vs55 // imagA*imagB - xxswapd vs54, vs54 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs55, vs55 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs54,vs54) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs55,vs55) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs54 // realA*imagB XSFADD_I2 vs1, vs1, vs55 // imagA*realB @@ -799,13 +859,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs57, vs57 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs57,vs57) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs56 // realA*realB XSFADD_R2 vs0, vs0, vs57 // imagA*imagB - xxswapd vs56, vs56 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs57, vs57 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs56,vs56) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs57,vs57) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs56 // realA*imagB XSFADD_I2 vs1, vs1, vs57 // imagA*realB @@ -823,13 +883,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs59, vs59 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs59,vs59) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs58 // realA*realB XSFADD_R2 vs0, vs0, vs59 // imagA*imagB - xxswapd vs58, vs58 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs59, vs59 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs58,vs58) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs59,vs59) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs58 // realA*imagB XSFADD_I2 vs1, vs1, vs59 // imagA*realB @@ -847,13 +907,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs61, vs61 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs61,vs61) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs60 // realA*realB XSFADD_R2 vs0, vs0, vs61 // imagA*imagB - xxswapd vs60, vs60 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs61, vs61 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs60,vs60) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs61,vs61) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs60 // realA*imagB XSFADD_I2 vs1, vs1, vs61 // imagA*realB @@ -871,13 +931,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs63, vs63 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs63,vs63) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs62 // realA*realB XSFADD_R2 vs0, vs0, vs63 // imagA*imagB - xxswapd vs62, vs62 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs63, vs63 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs62,vs62) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs63,vs63) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs62 // realA*imagB XSFADD_I2 vs1, vs1, vs63 // imagA*realB @@ -918,14 +978,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
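For readers tracing the SAVE blocks through these hunks: following the existing comments, each accumulator pair holds the two halves of one complex product, e.g. vs32 = (aR*bR, aI*bR) and vs33 = (aR*bI, aI*bI), and the swap/scalar-add sequence reduces each pair to the usual complex result,

	cR = aR*bR -/+ aI*bI
	cI = aR*bI +/- aI*bR

with the XSFADD_R2 / XSFADD_I2 helpers supplying the sign required by the conjugation variant being built. The XXSWAPD rewrite changes only how the doubleword swap is spelled, not this arithmetic.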
add T2, T2, LDC addi CO, CO, 128 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=4 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD2x4_1', ` +#else .macro LOAD2x4_1 +#endif lxvdsx vs16, o0, BO // load real part from B lxvdsx vs17, o8, BO // load imag part from B @@ -942,9 +1010,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_I1', ` +#else .macro KERNEL2x4_I1 +#endif lxvd2x vs8, o0, AO // load real,imag from A lxvd2x vs9, o16, AO // load real,imag from A @@ -979,9 +1055,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs47, vs3, vs19 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_1', ` +#else .macro KERNEL2x4_1 +#endif lxvd2x vs8, o0, AO // load real,imag from A lxvd2x vs9, o16, AO // load real,imag from A @@ -1016,9 +1100,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs47, vs3, vs19 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_2', ` +#else .macro KERNEL2x4_2 +#endif lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -1053,9 +1145,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs47, vs11, vs23 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_E2', ` +#else .macro KERNEL2x4_E2 +#endif xvmaddadp vs32, vs8, vs20 // real*real, imag*real @@ -1077,9 +1177,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs47, vs11, vs23 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_SUBI1', ` +#else .macro KERNEL2x4_SUBI1 +#endif lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -1114,9 +1222,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs47, vs3, vs19 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x4_SUB1', ` +#else .macro KERNEL2x4_SUB1 +#endif lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -1151,9 +1267,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs47, vs3, vs19 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE2x4', ` +#else .macro SAVE2x4 +#endif mr T1, CO @@ -1170,13 +1294,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs33,vs33) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs32 // realA*realB XSFADD_R2 vs0, vs0, vs33 // imagA*imagB - xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs32,vs32) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs33,vs33) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs32 // realA*imagB XSFADD_I2 vs1, vs1, vs33 // imagA*realB @@ -1194,13 +1318,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs35,vs35) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs34 // realA*realB XSFADD_R2 vs0, vs0, vs35 // imagA*imagB - xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs34,vs34) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs35,vs35) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs34 // realA*imagB XSFADD_I2 vs1, vs1, vs35 // imagA*realB @@ -1218,13 +1342,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs37,vs37) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs36 // realA*realB XSFADD_R2 vs0, vs0, vs37 // imagA*imagB - xxswapd vs36, vs36 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs37, vs37 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs36,vs36) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs37,vs37) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs36 // realA*imagB XSFADD_I2 vs1, vs1, vs37 // imagA*realB @@ -1242,13 +1366,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs39,vs39) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs38 // realA*realB XSFADD_R2 vs0, vs0, vs39 // imagA*imagB - xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs38,vs38) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs39,vs39) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs38 // realA*imagB XSFADD_I2 vs1, vs1, vs39 // imagA*realB @@ -1291,13 +1415,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs41, vs41 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs41,vs41) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs40 // realA*realB XSFADD_R2 vs0, vs0, vs41 // imagA*imagB - xxswapd vs40, vs40 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs41, vs41 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs40,vs40) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs41,vs41) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs40 // realA*imagB XSFADD_I2 vs1, vs1, vs41 // imagA*realB @@ -1315,13 +1439,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs43, vs43 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs43,vs43) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs42 // realA*realB XSFADD_R2 vs0, vs0, vs43 // imagA*imagB - xxswapd vs42, vs42 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs43, vs43 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs42,vs42) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs43,vs43) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs42 // realA*imagB XSFADD_I2 vs1, vs1, vs43 // imagA*realB @@ -1339,13 +1463,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs45, vs45 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs45,vs45) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs44 // realA*realB XSFADD_R2 vs0, vs0, vs45 // imagA*imagB - xxswapd vs44, vs44 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs45, vs45 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs44,vs44) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs45,vs45) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs44 // realA*imagB XSFADD_I2 vs1, vs1, vs45 // imagA*realB @@ -1363,13 +1487,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs47, vs47 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs47,vs47) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs46 // realA*realB XSFADD_R2 vs0, vs0, vs47 // imagA*imagB - xxswapd vs46, vs46 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs47, vs47 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs46,vs46) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs47,vs47) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs46 // realA*imagB XSFADD_I2 vs1, vs1, vs47 // imagA*realB @@ -1401,14 +1525,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
add T1, T1, LDC addi CO, CO, 64 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=2 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD2x2_1', ` +#else .macro LOAD2x2_1 +#endif lxvdsx vs16, o0, BO // load real part from B lxvdsx vs17, o8, BO // load imag part from B @@ -1423,9 +1555,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_I1', ` +#else .macro KERNEL2x2_I1 +#endif lxvd2x vs8, o0, AO // load real,imag from A lxvd2x vs9, o16, AO // load real,imag from A @@ -1450,9 +1590,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs39, vs1, vs19 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_1', ` +#else .macro KERNEL2x2_1 +#endif lxvd2x vs8, o0, AO // load real,imag from A lxvd2x vs9, o16, AO // load real,imag from A @@ -1477,9 +1625,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs39, vs1, vs19 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_2', ` +#else .macro KERNEL2x2_2 +#endif lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -1504,9 +1660,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs39, vs9, vs23 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_E2', ` +#else .macro KERNEL2x2_E2 +#endif xvmaddadp vs32, vs8, vs20 // real*real, imag*real @@ -1520,9 +1684,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs39, vs9, vs23 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_SUBI1', ` +#else .macro KERNEL2x2_SUBI1 +#endif lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -1547,9 +1719,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs39, vs1, vs19 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x2_SUB1', ` +#else .macro KERNEL2x2_SUB1 +#endif lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -1574,9 +1754,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs39, vs1, vs19 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE2x2', ` +#else .macro SAVE2x2 +#endif mr T1, CO @@ -1591,13 +1779,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs33,vs33) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs32 // realA*realB XSFADD_R2 vs0, vs0, vs33 // imagA*imagB - xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs32,vs32) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs33,vs33) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs32 // realA*imagB XSFADD_I2 vs1, vs1, vs33 // imagA*realB @@ -1615,13 +1803,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs35,vs35) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs34 // realA*realB XSFADD_R2 vs0, vs0, vs35 // imagA*imagB - xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs34,vs34) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs35,vs35) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs34 // realA*imagB XSFADD_I2 vs1, vs1, vs35 // imagA*realB @@ -1658,13 +1846,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs37,vs37) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs36 // realA*realB XSFADD_R2 vs0, vs0, vs37 // imagA*imagB - xxswapd vs36, vs36 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs37, vs37 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs36,vs36) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs37,vs37) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs36 // realA*imagB XSFADD_I2 vs1, vs1, vs37 // imagA*realB @@ -1682,13 +1870,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs39,vs39) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs38 // realA*realB XSFADD_R2 vs0, vs0, vs39 // imagA*imagB - xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs38,vs38) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs39,vs39) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs38 // realA*imagB XSFADD_I2 vs1, vs1, vs39 // imagA*realB @@ -1716,14 +1904,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
add T1, T1, LDC addi CO, CO, 32 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=2 and M=1 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD2x1_1', ` +#else .macro LOAD2x1_1 +#endif lxvdsx vs16, o0, BO // load real part from B lxvdsx vs17, o8, BO // load imag part from B @@ -1737,9 +1933,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_I1', ` +#else .macro KERNEL2x1_I1 +#endif lxvd2x vs8, o0, AO // load real,imag from A @@ -1759,9 +1963,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs35, vs0, vs19 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_1', ` +#else .macro KERNEL2x1_1 +#endif lxvd2x vs8, o0, AO // load real,imag from A @@ -1781,9 +1993,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs35, vs0, vs19 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_2', ` +#else .macro KERNEL2x1_2 +#endif lxvd2x vs0, o0, AO // load real,imag from A @@ -1803,9 +2023,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs35, vs8, vs23 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_E2', ` +#else .macro KERNEL2x1_E2 +#endif xvmaddadp vs32, vs8, vs20 // real*real, imag*real @@ -1815,9 +2043,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs35, vs8, vs23 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_SUBI1', ` +#else .macro KERNEL2x1_SUBI1 +#endif lxvd2x vs0, o0, AO // load real,imag from A @@ -1837,9 +2073,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs35, vs0, vs19 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL2x1_SUB1', ` +#else .macro KERNEL2x1_SUB1 +#endif lxvd2x vs0, o0, AO // load real,imag from A @@ -1859,9 +2103,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs35, vs0, vs19 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE2x1', ` +#else .macro SAVE2x1 +#endif mr T1, CO @@ -1875,13 +2127,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs33,vs33) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs32 // realA*realB XSFADD_R2 vs0, vs0, vs33 // imagA*imagB - xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs32,vs32) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs33,vs33) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs32 // realA*imagB XSFADD_I2 vs1, vs1, vs33 // imagA*realB @@ -1915,13 +2167,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs35,vs35) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs34 // realA*realB XSFADD_R2 vs0, vs0, vs35 // imagA*imagB - xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs34,vs34) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs35,vs35) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs34 // realA*imagB XSFADD_I2 vs1, vs1, vs35 // imagA*realB @@ -1947,14 +2199,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add T1, T1, LDC addi CO, CO, 16 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=8 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD1x8_1', ` +#else .macro LOAD1x8_1 +#endif lxvdsx vs16, o0, BO // load real part from B lxvdsx vs17, o8, BO // load imag part from B @@ -1976,9 +2236,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_I1', ` +#else .macro KERNEL1x8_I1 +#endif lxvd2x vs8, o0, AO // load real,imag from A lxvd2x vs9, o16, AO // load real,imag from A @@ -2017,9 +2285,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs47, vs7, vs17 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_1', ` +#else .macro KERNEL1x8_1 +#endif lxvd2x vs8, o0, AO // load real,imag from A lxvd2x vs9, o16, AO // load real,imag from A @@ -2058,9 +2334,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_2', ` +#else .macro KERNEL1x8_2 +#endif lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -2099,9 +2383,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_E2', ` +#else .macro KERNEL1x8_E2 +#endif xvmaddadp vs32, vs8, vs20 // real*real, imag*real @@ -2122,9 +2414,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_SUBI1', ` +#else .macro KERNEL1x8_SUBI1 +#endif lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -2163,9 +2463,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs47, vs7, vs17 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x8_SUB1', ` +#else .macro KERNEL1x8_SUB1 +#endif lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -2204,9 +2512,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE1x8', ` +#else .macro SAVE1x8 +#endif mr T1, CO @@ -2228,13 +2544,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs33,vs33) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs32 // realA*realB XSFADD_R2 vs0, vs0, vs33 // imagA*imagB - xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs32,vs32) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs33,vs33) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs32 // realA*imagB XSFADD_I2 vs1, vs1, vs33 // imagA*realB @@ -2252,13 +2568,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs35,vs35) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs34 // realA*realB XSFADD_R2 vs0, vs0, vs35 // imagA*imagB - xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs34,vs34) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs35,vs35) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs34 // realA*imagB XSFADD_I2 vs1, vs1, vs35 // imagA*realB @@ -2276,13 +2592,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs37,vs37) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs36 // realA*realB XSFADD_R2 vs0, vs0, vs37 // imagA*imagB - xxswapd vs36, vs36 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs37, vs37 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs36,vs36) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs37,vs37) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs36 // realA*imagB XSFADD_I2 vs1, vs1, vs37 // imagA*realB @@ -2300,13 +2616,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs39,vs39) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs38 // realA*realB XSFADD_R2 vs0, vs0, vs39 // imagA*imagB - xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs38,vs38) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs39,vs39) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs38 // realA*imagB XSFADD_I2 vs1, vs1, vs39 // imagA*realB @@ -2324,13 +2640,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs41, vs41 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs41,vs41) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs40 // realA*realB XSFADD_R2 vs0, vs0, vs41 // imagA*imagB - xxswapd vs40, vs40 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs41, vs41 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs40,vs40) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs41,vs41) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs40 // realA*imagB XSFADD_I2 vs1, vs1, vs41 // imagA*realB @@ -2348,13 +2664,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs43, vs43 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs43,vs43) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs42 // realA*realB XSFADD_R2 vs0, vs0, vs43 // imagA*imagB - xxswapd vs42, vs42 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs43, vs43 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs42,vs42) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs43,vs43) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs42 // realA*imagB XSFADD_I2 vs1, vs1, vs43 // imagA*realB @@ -2372,13 +2688,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs45, vs45 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs45,vs45) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs44 // realA*realB XSFADD_R2 vs0, vs0, vs45 // imagA*imagB - xxswapd vs44, vs44 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs45, vs45 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs44,vs44) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs45,vs45) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs44 // realA*imagB XSFADD_I2 vs1, vs1, vs45 // imagA*realB @@ -2396,13 +2712,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs47, vs47 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs47,vs47) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs46 // realA*realB XSFADD_R2 vs0, vs0, vs47 // imagA*imagB - xxswapd vs46, vs46 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs47, vs47 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs46,vs46) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs47,vs47) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs46 // realA*imagB XSFADD_I2 vs1, vs1, vs47 // imagA*realB @@ -2443,14 +2759,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
add T2, T2, LDC addi CO, CO, 128 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=4 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD1x4_1', ` +#else .macro LOAD1x4_1 +#endif lxvdsx vs16, o0, BO // load real part from B lxvdsx vs17, o8, BO // load imag part from B @@ -2465,9 +2789,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 64 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_I1', ` +#else .macro KERNEL1x4_I1 +#endif lxvd2x vs8, o0, AO // load real,imag from A lxvd2x vs9, o16, AO // load real,imag from A @@ -2491,9 +2823,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs39, vs3, vs17 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_1', ` +#else .macro KERNEL1x4_1 +#endif lxvd2x vs8, o0, AO // load real,imag from A lxvd2x vs9, o16, AO // load real,imag from A @@ -2517,9 +2857,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_2', ` +#else .macro KERNEL1x4_2 +#endif lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -2543,9 +2891,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_E2', ` +#else .macro KERNEL1x4_E2 +#endif xvmaddadp vs32, vs8, vs20 // real*real, imag*real @@ -2558,9 +2914,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_SUBI1', ` +#else .macro KERNEL1x4_SUBI1 +#endif lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -2584,9 +2948,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs39, vs3, vs17 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x4_SUB1', ` +#else .macro KERNEL1x4_SUB1 +#endif lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -2610,9 +2982,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE1x4', ` +#else .macro SAVE1x4 +#endif mr T1, CO @@ -2629,13 +3009,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs33,vs33) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs32 // realA*realB XSFADD_R2 vs0, vs0, vs33 // imagA*imagB - xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs32,vs32) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs33,vs33) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs32 // realA*imagB XSFADD_I2 vs1, vs1, vs33 // imagA*realB @@ -2653,13 +3033,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs35,vs35) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs34 // realA*realB XSFADD_R2 vs0, vs0, vs35 // imagA*imagB - xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs34,vs34) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs35,vs35) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs34 // realA*imagB XSFADD_I2 vs1, vs1, vs35 // imagA*realB @@ -2677,13 +3057,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs37, vs37 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs37,vs37) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs36 // realA*realB XSFADD_R2 vs0, vs0, vs37 // imagA*imagB - xxswapd vs36, vs36 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs37, vs37 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs36,vs36) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs37,vs37) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs36 // realA*imagB XSFADD_I2 vs1, vs1, vs37 // imagA*realB @@ -2701,13 +3081,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs39, vs39 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs39,vs39) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs38 // realA*realB XSFADD_R2 vs0, vs0, vs39 // imagA*imagB - xxswapd vs38, vs38 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs39, vs39 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs38,vs38) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs39,vs39) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs38 // realA*imagB XSFADD_I2 vs1, vs1, vs39 // imagA*realB @@ -2739,14 +3119,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
add T1, T1, LDC addi CO, CO, 64 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=2 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD1x2_1', ` +#else .macro LOAD1x2_1 +#endif lxvdsx vs16, o0, BO // load real part from B lxvdsx vs17, o8, BO // load imag part from B @@ -2759,9 +3147,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 32 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_I1', ` +#else .macro KERNEL1x2_I1 +#endif lxvd2x vs8, o0, AO // load real,imag from A lxvd2x vs9, o16, AO // load real,imag from A @@ -2779,9 +3175,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs35, vs1, vs17 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_1', ` +#else .macro KERNEL1x2_1 +#endif lxvd2x vs8, o0, AO // load real,imag from A lxvd2x vs9, o16, AO // load real,imag from A @@ -2799,9 +3203,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_2', ` +#else .macro KERNEL1x2_2 +#endif lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -2819,9 +3231,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_E2', ` +#else .macro KERNEL1x2_E2 +#endif xvmaddadp vs32, vs8, vs20 // real*real, imag*real @@ -2830,9 +3250,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_SUBI1', ` +#else .macro KERNEL1x2_SUBI1 +#endif lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -2850,9 +3278,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs35, vs1, vs17 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x2_SUB1', ` +#else .macro KERNEL1x2_SUB1 +#endif lxvd2x vs0, o0, AO // load real,imag from A lxvd2x vs1, o16, AO // load real,imag from A @@ -2870,9 +3306,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE1x2', ` +#else .macro SAVE1x2 +#endif mr T1, CO @@ -2887,13 +3331,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs33,vs33) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs32 // realA*realB XSFADD_R2 vs0, vs0, vs33 // imagA*imagB - xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs32,vs32) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs33,vs33) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs32 // realA*imagB XSFADD_I2 vs1, vs1, vs33 // imagA*realB @@ -2911,13 +3355,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs35, vs35 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs35,vs35) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs34 // realA*realB XSFADD_R2 vs0, vs0, vs35 // imagA*imagB - xxswapd vs34, vs34 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs35, vs35 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs34,vs34) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs35,vs35) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs34 // realA*imagB XSFADD_I2 vs1, vs1, vs35 // imagA*realB @@ -2945,14 +3389,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add T1, T1, LDC addi CO, CO, 32 +#if defined(_AIX) +') +#else .endm +#endif /********************************************************************************************** * Macros for N=1 and M=1 **********************************************************************************************/ +#if defined(_AIX) +define(`LOAD1x1_1', ` +#else .macro LOAD1x1_1 +#endif lxvdsx vs16, o0, BO // load real part from B lxvdsx vs17, o8, BO // load imag part from B @@ -2964,9 +3416,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. addi AO, AO, 16 +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_I1', ` +#else .macro KERNEL1x1_I1 +#endif lxvd2x vs8, o0, AO // load real,imag from A @@ -2981,9 +3441,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmuldp vs33, vs0, vs17 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_1', ` +#else .macro KERNEL1x1_1 +#endif lxvd2x vs8, o0, AO // load real,imag from A @@ -2998,9 +3466,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_2', ` +#else .macro KERNEL1x1_2 +#endif lxvd2x vs0, o0, AO // load real,imag from A @@ -3015,18 +3491,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_E2', ` +#else .macro KERNEL1x1_E2 +#endif xvmaddadp vs32, vs8, vs20 // real*real, imag*real xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_SUBI1', ` +#else .macro KERNEL1x1_SUBI1 +#endif lxvd2x vs0, o0, AO // load real,imag from A @@ -3041,9 +3533,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
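The `#if defined(_AIX)` blocks that bracket every macro in these hunks follow one pattern: the AIX system assembler apparently does not understand the GNU `.macro`/`.endm` directives, so for the AIX build the source is evidently run through m4 and each macro body is quoted inside an m4 define() instead, while the non-AIX path keeps the original gas macros. Call sites stay unchanged because both gas and m4 expand the bare macro name. The generic shape, with a placeholder macro name (the mechanism described here is inferred from the diff, not stated in it), is:

#if defined(_AIX)
define(`EXAMPLE_MACRO', `
#else
.macro EXAMPLE_MACRO
#endif

	/* macro body: loads, xvmuldp/xvmaddadp updates, pointer bumps ... */

#if defined(_AIX)
')
#else
.endm
#endif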
xvmuldp vs33, vs0, vs17 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`KERNEL1x1_SUB1', ` +#else .macro KERNEL1x1_SUB1 +#endif lxvd2x vs0, o0, AO // load real,imag from A @@ -3058,9 +3558,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag +#if defined(_AIX) +') +#else .endm +#endif +#if defined(_AIX) +define(`SAVE1x1', ` +#else .macro SAVE1x1 +#endif mr T1, CO @@ -3074,13 +3582,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. xxlxor vs0, vs0, vs0 xxlxor vs1, vs1, vs1 - xxswapd vs33, vs33 // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB + XXSWAPD(vs33,vs33) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB XSFADD_R1 vs0, vs0, vs32 // realA*realB XSFADD_R2 vs0, vs0, vs33 // imagA*imagB - xxswapd vs32, vs32 // realA*realB, imagA*realB -> imagA*realB, realA*realB - xxswapd vs33, vs33 // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB + XXSWAPD(vs32,vs32) // realA*realB, imagA*realB -> imagA*realB, realA*realB + XXSWAPD(vs33,vs33) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB XSFADD_I1 vs1, vs1, vs32 // realA*imagB XSFADD_I2 vs1, vs1, vs33 // imagA*realB @@ -3106,5 +3614,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add T1, T1, LDC addi CO, CO, 16 +#if defined(_AIX) +') +#else .endm +#endif diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL index f98728a41..bdebd22b9 100644 --- a/kernel/x86_64/KERNEL.HASWELL +++ b/kernel/x86_64/KERNEL.HASWELL @@ -56,7 +56,7 @@ DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) CTRMMKERNEL = cgemm_kernel_8x2_haswell.S -CGEMMKERNEL = cgemm_kernel_8x2_haswell.S +CGEMMKERNEL = cgemm_kernel_8x2_haswell.c CGEMMINCOPY = ../generic/zgemm_ncopy_8.c CGEMMITCOPY = ../generic/zgemm_tcopy_8.c CGEMMONCOPY = ../generic/zgemm_ncopy_2.c @@ -67,7 +67,7 @@ CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) ZTRMMKERNEL = zgemm_kernel_4x2_haswell.S -ZGEMMKERNEL = zgemm_kernel_4x2_haswell.S +ZGEMMKERNEL = zgemm_kernel_4x2_haswell.c ZGEMMINCOPY = ../generic/zgemm_ncopy_4.c ZGEMMITCOPY = ../generic/zgemm_tcopy_4.c ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c @@ -97,6 +97,6 @@ ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c -CGEMM3MKERNEL = zgemm3m_kernel_4x8_nehalem.S +CGEMM3MKERNEL = cgemm3m_kernel_8x4_haswell.c ZGEMM3MKERNEL = zgemm3m_kernel_2x8_nehalem.S diff --git a/kernel/x86_64/KERNEL.ZEN b/kernel/x86_64/KERNEL.ZEN index be4503d47..025db515e 100644 --- a/kernel/x86_64/KERNEL.ZEN +++ b/kernel/x86_64/KERNEL.ZEN @@ -53,7 +53,7 @@ DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) CTRMMKERNEL = cgemm_kernel_8x2_haswell.S -CGEMMKERNEL = cgemm_kernel_8x2_haswell.S +CGEMMKERNEL = cgemm_kernel_8x2_haswell.c CGEMMINCOPY = ../generic/zgemm_ncopy_8.c CGEMMITCOPY = ../generic/zgemm_tcopy_8.c CGEMMONCOPY = ../generic/zgemm_ncopy_2.c @@ -64,7 +64,7 @@ CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) ZTRMMKERNEL = zgemm_kernel_4x2_haswell.S -ZGEMMKERNEL = zgemm_kernel_4x2_haswell.S +ZGEMMKERNEL = zgemm_kernel_4x2_haswell.c ZGEMMINCOPY = ../generic/zgemm_ncopy_4.c ZGEMMITCOPY = ../generic/zgemm_tcopy_4.c ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c @@ -94,6 +94,6 @@ ZTRSMKERNEL_LT = 
../generic/trsm_kernel_LT.c ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c -CGEMM3MKERNEL = zgemm3m_kernel_4x8_nehalem.S +CGEMM3MKERNEL = cgemm3m_kernel_8x4_haswell.c ZGEMM3MKERNEL = zgemm3m_kernel_2x8_nehalem.S diff --git a/kernel/x86_64/cgemm3m_kernel_8x4_haswell.c b/kernel/x86_64/cgemm3m_kernel_8x4_haswell.c new file mode 100644 index 000000000..01fbf3064 --- /dev/null +++ b/kernel/x86_64/cgemm3m_kernel_8x4_haswell.c @@ -0,0 +1,279 @@ +/* %0 = "+r"(a_pointer), %1 = "+r"(b_pointer), %2 = "+r"(c_pointer), %3 = "+r"(ldc_in_bytes), %4 for k_count, %5 for c_store */ +/* r12 = k << 4(const), r13 = k(const), r14 = b_head_pos(const), r15 = tmp */ + +#include "common.h" +#include + +//recommended settings: GEMM_P = 320, GEMM_Q = 320. + +/* m = 8 *//* ymm0 for alpha, ymm1-ymm3 for temporary use, ymm4-ymm15 for accumulators */ +#define KERNEL_k1m8n1 \ + "vmovups (%0),%%ymm1; addq $32,%0;"\ + "vbroadcastss (%1),%%ymm2; vfmadd231ps %%ymm1,%%ymm2,%%ymm4;"\ + "addq $4,%1;" +#define KERNEL_h_k1m8n2 \ + "vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2; addq $32,%0;"\ + "vbroadcastsd (%1),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm4; vfmadd231ps %%ymm2,%%ymm3,%%ymm5;" +#define KERNEL_k1m8n2 KERNEL_h_k1m8n2 "addq $8,%1;" +#define KERNEL_h_k1m8n4 \ + KERNEL_h_k1m8n2 "vbroadcastsd 8(%1),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm6; vfmadd231ps %%ymm2,%%ymm3,%%ymm7;" +#define KERNEL_k1m8n4 KERNEL_h_k1m8n4 "addq $16,%1;" +#define unit_kernel_k1m8n4(c1,c2,c3,c4,...) \ + "vbroadcastsd ("#__VA_ARGS__"),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,"#c1"; vfmadd231ps %%ymm2,%%ymm3,"#c2";"\ + "vbroadcastsd 8("#__VA_ARGS__"),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,"#c3"; vfmadd231ps %%ymm2,%%ymm3,"#c4";" +#define KERNEL_h_k1m8n8 KERNEL_h_k1m8n4 unit_kernel_k1m8n4(%%ymm8,%%ymm9,%%ymm10,%%ymm11,%1,%%r12,1) +#define KERNEL_k1m8n8 KERNEL_h_k1m8n8 "addq $16,%1;" +#define KERNEL_h_k1m8n12 KERNEL_h_k1m8n8 unit_kernel_k1m8n4(%%ymm12,%%ymm13,%%ymm14,%%ymm15,%1,%%r12,2) +#define KERNEL_k1m8n12 KERNEL_h_k1m8n12 "addq $16,%1;" +#define INIT_m8n1 "vpxor %%ymm4,%%ymm4,%%ymm4;" +#define INIT_m8n2 INIT_m8n1 "vpxor %%ymm5,%%ymm5,%%ymm5;" +#define INIT_m8n4 INIT_m8n2 "vpxor %%ymm6,%%ymm6,%%ymm6;vpxor %%ymm7,%%ymm7,%%ymm7;" +#define unit_init_m8n4(c1,c2,c3,c4) \ + "vpxor "#c1","#c1","#c1";vpxor "#c2","#c2","#c2";vpxor "#c3","#c3","#c3";vpxor "#c4","#c4","#c4";" +#define INIT_m8n8 INIT_m8n4 unit_init_m8n4(%%ymm8,%%ymm9,%%ymm10,%%ymm11) +#define INIT_m8n12 INIT_m8n8 unit_init_m8n4(%%ymm12,%%ymm13,%%ymm14,%%ymm15) +#define SAVE_m8n1 \ + "vunpcklps %%ymm4,%%ymm4,%%ymm2; vunpckhps %%ymm4,%%ymm4,%%ymm3;"\ + "vperm2f128 $2,%%ymm2,%%ymm3,%%ymm1; vperm2f128 $19,%%ymm2,%%ymm3,%%ymm2;"\ + "vfmadd213ps (%2),%%ymm0,%%ymm1; vfmadd213ps 32(%2),%%ymm0,%%ymm2; vmovups %%ymm1,(%2); vmovups %%ymm2,32(%2);" +#define unit_save_m8n2(c1,c2) \ + "vunpcklpd "#c2","#c1",%%ymm2; vunpckhpd "#c2","#c1",%%ymm3;"\ + "vperm2f128 $2,%%ymm2,%%ymm3,"#c1"; vperm2f128 $19,%%ymm2,%%ymm3,"#c2";"\ + "vmovsldup "#c1",%%ymm2; vmovsldup "#c2",%%ymm3;"\ + "vfmadd213ps (%5),%%ymm0,%%ymm2; vfmadd213ps 32(%5),%%ymm0,%%ymm3; vmovups %%ymm2,(%5); vmovups %%ymm3,32(%5);"\ + "vmovshdup "#c1",%%ymm2; vmovshdup "#c2",%%ymm3;"\ + "vfmadd213ps (%5,%3,1),%%ymm0,%%ymm2; vfmadd213ps 32(%5,%3,1),%%ymm0,%%ymm3; vmovups %%ymm2,(%5,%3,1); vmovups %%ymm3,32(%5,%3,1);"\ + "leaq (%5,%3,2),%5;" +#define SAVE_m8n2 "movq %2,%5;" unit_save_m8n2(%%ymm4,%%ymm5) +#define SAVE_m8n4 SAVE_m8n2 unit_save_m8n2(%%ymm6,%%ymm7) +#define SAVE_m8n8 SAVE_m8n4 unit_save_m8n2(%%ymm8,%%ymm9) 
unit_save_m8n2(%%ymm10,%%ymm11) +#define SAVE_m8n12 SAVE_m8n8 unit_save_m8n2(%%ymm12,%%ymm13) unit_save_m8n2(%%ymm14,%%ymm15) +#define COMPUTE_m8(ndim) \ + INIT_m8n##ndim\ + "movq %%r13,%4; movq %%r14,%1; movq %2,%5; xorq %%r15,%%r15;"\ + "cmpq $24,%4; jb "#ndim"882f;"\ + #ndim"881:\n\t"\ + "cmpq $126,%%r15; movq $126,%%r15; cmoveq %3,%%r15;"\ + "prefetcht0 64(%1); prefetcht0 64(%1,%%r12,1); prefetcht0 64(%1,%%r12,2);"\ + "prefetcht0 512(%0);" KERNEL_k1m8n##ndim KERNEL_k1m8n##ndim\ + "prefetcht0 512(%0);" KERNEL_k1m8n##ndim KERNEL_k1m8n##ndim\ + "prefetcht1 (%5); leaq -63(%5,%%r15,1),%5;"\ + "prefetcht0 64(%1); prefetcht0 64(%1,%%r12,1); prefetcht0 64(%1,%%r12,2);"\ + "prefetcht0 512(%0);" KERNEL_k1m8n##ndim KERNEL_k1m8n##ndim\ + "prefetcht0 512(%0);" KERNEL_k1m8n##ndim KERNEL_k1m8n##ndim\ + "prefetcht1 (%8); addq $16,%8;"\ + "subq $8,%4; cmpq $24,%4; jnb "#ndim"881b;"\ + "movq %2,%5;"\ + #ndim"882:\n\t"\ + "testq %4,%4; jz "#ndim"883f;"\ + "prefetcht0 (%5); prefetcht0 63(%5); addq %3,%5;"\ + KERNEL_k1m8n##ndim\ + "decq %4; jmp "#ndim"882b;"\ + #ndim"883:\n\t"\ + "prefetcht0 (%%r14); prefetcht0 64(%%r14);"\ + SAVE_m8n##ndim "addq $64,%2;" + +/* m = 4 *//* xmm0 for alpha, xmm1-xmm3 for temporary use, xmm4-xmm15 for accumulators */ +#define KERNEL_k1m4n1 \ + "vmovups (%0),%%xmm1; addq $16,%0;"\ + "vbroadcastss (%1),%%xmm2; vfmadd231ps %%xmm1,%%xmm2,%%xmm4;"\ + "addq $4,%1;" +#define KERNEL_h_k1m4n2 \ + "vmovsldup (%0),%%xmm1; vmovshdup (%0),%%xmm2; addq $16,%0;"\ + "vmovddup (%1),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm4; vfmadd231ps %%xmm2,%%xmm3,%%xmm5;" +#define KERNEL_k1m4n2 KERNEL_h_k1m4n2 "addq $8,%1;" +#define KERNEL_h_k1m4n4 \ + KERNEL_h_k1m4n2 "vmovddup 8(%1),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm6; vfmadd231ps %%xmm2,%%xmm3,%%xmm7;" +#define KERNEL_k1m4n4 KERNEL_h_k1m4n4 "addq $16,%1;" +#define unit_kernel_k1m4n4(c1,c2,c3,c4,...) 
\ + "vmovddup ("#__VA_ARGS__"),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,"#c1"; vfmadd231ps %%xmm2,%%xmm3,"#c2";"\ + "vmovddup 8("#__VA_ARGS__"),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,"#c3"; vfmadd231ps %%xmm2,%%xmm3,"#c4";" +#define KERNEL_h_k1m4n8 KERNEL_h_k1m4n4 unit_kernel_k1m4n4(%%xmm8,%%xmm9,%%xmm10,%%xmm11,%1,%%r12,1) +#define KERNEL_k1m4n8 KERNEL_h_k1m4n8 "addq $16,%1;" +#define KERNEL_h_k1m4n12 KERNEL_h_k1m4n8 unit_kernel_k1m4n4(%%xmm12,%%xmm13,%%xmm14,%%xmm15,%1,%%r12,2) +#define KERNEL_k1m4n12 KERNEL_h_k1m4n12 "addq $16,%1;" +#define INIT_m4n1 "vpxor %%xmm4,%%xmm4,%%xmm4;" +#define INIT_m4n2 INIT_m4n1 "vpxor %%xmm5,%%xmm5,%%xmm5;" +#define INIT_m4n4 INIT_m4n2 "vpxor %%xmm6,%%xmm6,%%xmm6;vpxor %%xmm7,%%xmm7,%%xmm7;" +#define unit_init_m4n4(c1,c2,c3,c4) \ + "vpxor "#c1","#c1","#c1";vpxor "#c2","#c2","#c2";vpxor "#c3","#c3","#c3";vpxor "#c4","#c4","#c4";" +#define INIT_m4n8 INIT_m4n4 unit_init_m4n4(%%xmm8,%%xmm9,%%xmm10,%%xmm11) +#define INIT_m4n12 INIT_m4n8 unit_init_m4n4(%%xmm12,%%xmm13,%%xmm14,%%xmm15) +#define SAVE_m4n1 \ + "vunpcklps %%xmm4,%%xmm4,%%xmm2; vunpckhps %%xmm4,%%xmm4,%%xmm3;"\ + "vfmadd213ps (%2),%%xmm0,%%xmm2; vfmadd213ps 16(%2),%%xmm0,%%xmm3; vmovups %%xmm2,(%2); vmovups %%xmm3,16(%2);" +#define unit_save_m4n2(c1,c2) \ + "vunpcklpd "#c2","#c1",%%xmm2; vunpckhpd "#c2","#c1","#c2"; vmovapd %%xmm2,"#c1";"\ + "vmovsldup "#c1",%%xmm2; vmovsldup "#c2",%%xmm3;"\ + "vfmadd213ps (%5),%%xmm0,%%xmm2; vfmadd213ps 16(%5),%%xmm0,%%xmm3; vmovups %%xmm2,(%5); vmovups %%xmm3,16(%5);"\ + "vmovshdup "#c1",%%xmm2; vmovshdup "#c2",%%xmm3;"\ + "vfmadd213ps (%5,%3,1),%%xmm0,%%xmm2; vfmadd213ps 16(%5,%3,1),%%xmm0,%%xmm3; vmovups %%xmm2,(%5,%3,1); vmovups %%xmm3,16(%5,%3,1);"\ + "leaq (%5,%3,2),%5;" +#define SAVE_m4n2 "movq %2,%5;" unit_save_m4n2(%%xmm4,%%xmm5) +#define SAVE_m4n4 SAVE_m4n2 unit_save_m4n2(%%xmm6,%%xmm7) +#define SAVE_m4n8 SAVE_m4n4 unit_save_m4n2(%%xmm8,%%xmm9) unit_save_m4n2(%%xmm10,%%xmm11) +#define SAVE_m4n12 SAVE_m4n8 unit_save_m4n2(%%xmm12,%%xmm13) unit_save_m4n2(%%xmm14,%%xmm15) +#define COMPUTE_m4(ndim) \ + INIT_m4n##ndim\ + "movq %%r13,%4; movq %%r14,%1;"\ + #ndim"442:\n\t"\ + "testq %4,%4; jz "#ndim"443f;"\ + KERNEL_k1m4n##ndim\ + "decq %4; jmp "#ndim"442b;"\ + #ndim"443:\n\t"\ + SAVE_m4n##ndim "addq $32,%2;" + +/* m = 2 *//* xmm0 for alpha, xmm1-xmm3 and xmm10 for temporary use, xmm4-xmm9 for accumulators */ +#define INIT_m2n1 "vpxor %%xmm4,%%xmm4,%%xmm4;" +#define KERNEL_k1m2n1 \ + "vmovsd (%0),%%xmm1; addq $8,%0;"\ + "vbroadcastss (%1),%%xmm2; vfmadd231ps %%xmm1,%%xmm2,%%xmm4;"\ + "addq $4,%1;" +#define SAVE_m2n1 \ + "vunpcklps %%xmm4,%%xmm4,%%xmm1; vfmadd213ps (%2),%%xmm0,%%xmm1; vmovups %%xmm1,(%2);" +#define INIT_m2n2 INIT_m2n1 "vpxor %%xmm5,%%xmm5,%%xmm5;" +#define KERNEL_k1m2n2 \ + "vmovsd (%0),%%xmm1; addq $8,%0;"\ + "vbroadcastss (%1),%%xmm2; vfmadd231ps %%xmm1,%%xmm2,%%xmm4;"\ + "vbroadcastss 4(%1),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm5;"\ + "addq $8,%1;" +#define SAVE_m2n2 SAVE_m2n1 \ + "vunpcklps %%xmm5,%%xmm5,%%xmm1; vfmadd213ps (%2,%3,1),%%xmm0,%%xmm1; vmovups %%xmm1,(%2,%3,1);" +#define INIT_m2n4 INIT_m2n2 +#define INIT_m2n8 INIT_m2n4 "vpxor %%xmm6,%%xmm6,%%xmm6; vpxor %%xmm7,%%xmm7,%%xmm7;" +#define INIT_m2n12 INIT_m2n8 "vpxor %%xmm8,%%xmm8,%%xmm8; vpxor %%xmm9,%%xmm9,%%xmm9;" +#define KERNEL_k1m2n4 \ + "vmovups (%1),%%xmm3; addq $16,%1;"\ + "vbroadcastss (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4;"\ + "vbroadcastss 4(%0),%%xmm2; vfmadd231ps %%xmm3,%%xmm2,%%xmm5;"\ + "addq $8,%0;" +#define KERNEL_k1m2n8 \ + "vmovups (%1),%%xmm3; vmovups 
(%1,%%r12,1),%%xmm2; addq $16,%1;"\ + "vbroadcastss (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4; vfmadd231ps %%xmm2,%%xmm1,%%xmm6;"\ + "vbroadcastss 4(%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm5; vfmadd231ps %%xmm2,%%xmm1,%%xmm7;"\ + "addq $8,%0;" +#define KERNEL_k1m2n12 \ + "vmovups (%1),%%xmm3; vmovups (%1,%%r12,1),%%xmm2; vmovups (%1,%%r12,2),%%xmm1; addq $16,%1;"\ + "vbroadcastss (%0),%%xmm10; vfmadd231ps %%xmm3,%%xmm10,%%xmm4; vfmadd231ps %%xmm2,%%xmm10,%%xmm6; vfmadd231ps %%xmm1,%%xmm10,%%xmm8;"\ + "vbroadcastss 4(%0),%%xmm10; vfmadd231ps %%xmm3,%%xmm10,%%xmm5; vfmadd231ps %%xmm2,%%xmm10,%%xmm7; vfmadd231ps %%xmm1,%%xmm10,%%xmm9;"\ + "addq $8,%0;" +#define unit_save_m2n4(c1,c2) \ + "vunpcklpd "#c2","#c1",%%xmm1; vunpckhpd "#c2","#c1",%%xmm2;"\ + "vmovsldup %%xmm1,%%xmm3; vfmadd213ps (%5),%%xmm0,%%xmm3; vmovups %%xmm3,(%5);"\ + "vmovshdup %%xmm1,%%xmm3; vfmadd213ps (%5,%3,1),%%xmm0,%%xmm3; vmovups %%xmm3,(%5,%3,1);"\ + "leaq (%5,%3,2),%5;"\ + "vmovsldup %%xmm2,%%xmm3; vfmadd213ps (%5),%%xmm0,%%xmm3; vmovups %%xmm3,(%5);"\ + "vmovshdup %%xmm2,%%xmm3; vfmadd213ps (%5,%3,1),%%xmm0,%%xmm3; vmovups %%xmm3,(%5,%3,1);"\ + "leaq (%5,%3,2),%5;" +#define SAVE_m2n4 "movq %2,%5;" unit_save_m2n4(%%xmm4,%%xmm5) +#define SAVE_m2n8 SAVE_m2n4 unit_save_m2n4(%%xmm6,%%xmm7) +#define SAVE_m2n12 SAVE_m2n8 unit_save_m2n4(%%xmm8,%%xmm9) +#define COMPUTE_m2(ndim) \ + INIT_m2n##ndim\ + "movq %%r13,%4; movq %%r14,%1;"\ + #ndim"222:\n\t"\ + "testq %4,%4; jz "#ndim"223f;"\ + KERNEL_k1m2n##ndim\ + "decq %4; jmp "#ndim"222b;"\ + #ndim"223:\n\t"\ + SAVE_m2n##ndim "addq $16,%2;" + +/* m = 1 *//* xmm0 for alpha, xmm1-xmm3 and xmm10 for temporary use, xmm4-xmm6 for accumulators */ +#define INIT_m1n1 "vpxor %%xmm4,%%xmm4,%%xmm4;" +#define KERNEL_k1m1n1 \ + "vmovss (%1),%%xmm3; addq $4,%1;"\ + "vmovss (%0),%%xmm1; vfmadd231ss %%xmm3,%%xmm1,%%xmm4;"\ + "addq $4,%0;" +#define SAVE_m1n1 \ + "vunpcklps %%xmm4,%%xmm4,%%xmm4; vmovsd (%2),%%xmm1; vfmadd213ps %%xmm1,%%xmm0,%%xmm4; vmovsd %%xmm4,(%2);" +#define INIT_m1n2 INIT_m1n1 +#define KERNEL_k1m1n2 \ + "vmovsd (%1),%%xmm3; addq $8,%1;"\ + "vbroadcastss (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4;"\ + "addq $4,%0;" +#define SAVE_m1n2 \ + "vunpcklps %%xmm4,%%xmm4,%%xmm4; vmovsd (%2),%%xmm3; vmovhpd (%2,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm4;"\ + "vmovsd %%xmm4,(%2); vmovhpd %%xmm4,(%2,%3,1);" +#define INIT_m1n4 INIT_m1n2 +#define INIT_m1n8 INIT_m1n4 "vpxor %%xmm5,%%xmm5,%%xmm5;" +#define INIT_m1n12 INIT_m1n8 "vpxor %%xmm6,%%xmm6,%%xmm6;" +#define KERNEL_k1m1n4 \ + "vmovups (%1),%%xmm3; addq $16,%1;"\ + "vbroadcastss (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4;"\ + "addq $4,%0;" +#define KERNEL_k1m1n8 \ + "vmovups (%1),%%xmm3; vmovups (%1,%%r12,1),%%xmm2; addq $16,%1;"\ + "vbroadcastss (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4; vfmadd231ps %%xmm2,%%xmm1,%%xmm5;"\ + "addq $4,%0;" +#define KERNEL_k1m1n12 \ + "vmovups (%1),%%xmm3; vmovups (%1,%%r12,1),%%xmm2; vmovups (%1,%%r12,2),%%xmm1; addq $16,%1;"\ + "vbroadcastss (%0),%%xmm10; vfmadd231ps %%xmm3,%%xmm10,%%xmm4; vfmadd231ps %%xmm2,%%xmm10,%%xmm5; vfmadd231ps %%xmm1,%%xmm10,%%xmm6;"\ + "addq $4,%0;" +#define unit_save_m1n4(c1) \ + "vunpcklps "#c1","#c1",%%xmm1; vunpckhps "#c1","#c1",%%xmm2;"\ + "vmovsd (%5),%%xmm3; vmovhpd (%5,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm1;"\ + "vmovsd %%xmm1,(%5); vmovhpd %%xmm1,(%5,%3,1); leaq (%5,%3,2),%5;"\ + "vmovsd (%5),%%xmm3; vmovhpd (%5,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm2;"\ + "vmovsd %%xmm2,(%5); vmovhpd %%xmm2,(%5,%3,1); leaq 
(%5,%3,2),%5;" +#define SAVE_m1n4 "movq %2,%5;" unit_save_m1n4(%%xmm4) +#define SAVE_m1n8 SAVE_m1n4 unit_save_m1n4(%%xmm5) +#define SAVE_m1n12 SAVE_m1n8 unit_save_m1n4(%%xmm6) +#define COMPUTE_m1(ndim) \ + INIT_m1n##ndim\ + "movq %%r13,%4; movq %%r14,%1;"\ + #ndim"112:\n\t"\ + "testq %4,%4; jz "#ndim"113f;"\ + KERNEL_k1m1n##ndim\ + "decq %4; jmp "#ndim"112b;"\ + #ndim"113:\n\t"\ + SAVE_m1n##ndim "addq $8,%2;" + +/* %0 = "+r"(a_pointer), %1 = "+r"(b_pointer), %2 = "+r"(c_pointer), %3 = "+r"(ldc_in_bytes), %4 = "+r"(K), %5 = "+r"(ctemp) */ +/* %6 = "+r"(&alpha), %7 = "+r"(M), %8 = "+r"(next_b) */ +/* r11 = m(const), r12 = k << 4(const), r13 = k(const), r14 = b_head_pos(const),r15 = tmp */ + +#define COMPUTE(ndim) {\ + next_b = b_pointer + ndim * K;\ + __asm__ __volatile__(\ + "vbroadcastsd (%6),%%ymm0;"\ + "movq %4,%%r13; movq %4,%%r12; salq $4,%%r12; movq %1,%%r14; movq %7,%%r11;"\ + "cmpq $8,%7;jb 33101"#ndim"f;"\ + "33109"#ndim":\n\t"\ + COMPUTE_m8(ndim)\ + "subq $8,%7;cmpq $8,%7;jnb 33109"#ndim"b;"\ + "33101"#ndim":\n\t"\ + "cmpq $4,%7;jb 33103"#ndim"f;"\ + COMPUTE_m4(ndim)\ + "subq $4,%7;"\ + "33103"#ndim":\n\t"\ + "cmpq $2,%7;jb 33104"#ndim"f;"\ + COMPUTE_m2(ndim)\ + "subq $2,%7;"\ + "33104"#ndim":\n\t"\ + "testq %7,%7;jz 33105"#ndim"f;"\ + COMPUTE_m1(ndim)\ + "33105"#ndim":\n\t"\ + "movq %%r13,%4; movq %%r14,%1; movq %%r11,%7;"\ + :"+r"(a_pointer),"+r"(b_pointer),"+r"(c_pointer),"+r"(ldc_in_bytes),"+r"(K),"+r"(ctemp),"+r"(const_val),"+r"(M),"+r"(next_b)\ + ::"r11","r12","r13","r14","r15",\ + "xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15","cc","memory");\ + a_pointer -= M * K; b_pointer += ndim * K; c_pointer += 2*(LDC * ndim - M);\ +} + +int __attribute__ ((noinline)) +CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alphar, float alphai, float * __restrict__ A, float * __restrict__ B, float * __restrict__ C, BLASLONG LDC) +{ + if(m==0||n==0||k==0) return 0; + int64_t ldc_in_bytes = (int64_t)LDC * sizeof(float) * 2; + float constval[2]; constval[0] = alphar; constval[1] = alphai; + float *const_val=constval; + int64_t M = (int64_t)m, K = (int64_t)k; + BLASLONG n_count = n; + float *a_pointer = A,*b_pointer = B,*c_pointer = C,*ctemp = C,*next_b = B; + for(;n_count>11;n_count-=12) COMPUTE(12) + for(;n_count>7;n_count-=8) COMPUTE(8) + for(;n_count>3;n_count-=4) COMPUTE(4) + for(;n_count>1;n_count-=2) COMPUTE(2) + if(n_count>0) COMPUTE(1) + return 0; +} diff --git a/kernel/x86_64/cgemm_kernel_8x2_haswell.c b/kernel/x86_64/cgemm_kernel_8x2_haswell.c new file mode 100644 index 000000000..eab8c9ea5 --- /dev/null +++ b/kernel/x86_64/cgemm_kernel_8x2_haswell.c @@ -0,0 +1,292 @@ +#include "common.h" +#include + +/* recommended settings: GEMM_P = 256, GEMM_Q = 256 */ + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + #define A_CONJ 0 + #define B_CONJ 0 +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + #define A_CONJ 1 + #define B_CONJ 0 +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + #define A_CONJ 0 + #define B_CONJ 1 +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + #define A_CONJ 1 + #define B_CONJ 1 +#endif + +/* %0 = a_ptr, %1 = b_ptr, %2 = c_ptr, %3 = c_tmp, %4 = ldc(bytes), %5 = k_counter, %6 = &alpha, %7 = m_counter, %8 = b_pref */ +/* r11 = m, r12 = k << 4, r13 = k, r14 = b_head, r15 = temp */ + +/* m=8, ymm 0-3 temp, ymm 4-15 acc */ +#if A_CONJ == B_CONJ + #define acc_m4n1_exp(ar,ai,b2,cl,cr) "vfmadd231ps 
%%ymm"#ar",%%ymm"#b2",%%ymm"#cl"; vfmadd231ps %%ymm"#ai",%%ymm"#b2",%%ymm"#cr";" + #define acc_m8n1_con(ua,la,b1,uc,lc) "vfmaddsub231ps %%ymm"#ua",%%ymm"#b1",%%ymm"#uc"; vfmaddsub231ps %%ymm"#la",%%ymm"#b1",%%ymm"#lc";" +#else + #define acc_m4n1_exp(ar,ai,b2,cl,cr) "vfmadd231ps %%ymm"#ar",%%ymm"#b2",%%ymm"#cl"; vfnmadd231ps %%ymm"#ai",%%ymm"#b2",%%ymm"#cr";" + #define acc_m8n1_con(ua,la,b1,uc,lc) "vfmsubadd231ps %%ymm"#ua",%%ymm"#b1",%%ymm"#uc"; vfmsubadd231ps %%ymm"#la",%%ymm"#b1",%%ymm"#lc";" +#endif +/* expanded accumulators for m8n1 and m8n2 */ +#define KERNEL_k1m8n1 \ + "vbroadcastsd (%1),%%ymm0; addq $8,%1;"\ + "vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2;" acc_m4n1_exp(1,2,0,4,5)\ + "vmovsldup 32(%0),%%ymm1; vmovshdup 32(%0),%%ymm2;" acc_m4n1_exp(1,2,0,6,7)\ + "addq $64,%0;" +#define KERNEL_k1m8n2 \ + "vbroadcastsd (%1),%%ymm0; vbroadcastsd 8(%1),%%ymm1; addq $16,%1;"\ + "vmovsldup (%0),%%ymm2; vmovshdup (%0),%%ymm3;" acc_m4n1_exp(2,3,0,4,5) acc_m4n1_exp(2,3,1,8,9)\ + "vmovsldup 32(%0),%%ymm2; vmovshdup 32(%0),%%ymm3;" acc_m4n1_exp(2,3,0,6,7) acc_m4n1_exp(2,3,1,10,11)\ + "addq $64,%0;" +/* contracted accumulators for m8n4 and m8n6 */ +#define acc_m8n2_con(ua,la,luc,llc,ruc,rlc,lboff,rboff,...) \ + "vbroadcastss "#lboff"("#__VA_ARGS__"),%%ymm2;" acc_m8n1_con(ua,la,2,luc,llc)\ + "vbroadcastss "#rboff"("#__VA_ARGS__"),%%ymm3;" acc_m8n1_con(ua,la,3,ruc,rlc) +#define KERNEL_1_k1m8n4 \ + "vmovups (%0),%%ymm0; vmovups 32(%0),%%ymm1; prefetcht0 512(%0); addq $64,%0;"\ + acc_m8n2_con(0,1,4,5,6,7,0,8,%1) acc_m8n2_con(0,1,8,9,10,11,0,8,%1,%%r12,1) +#define KERNEL_2_k1m8n4 \ + "vpermilps $177,%%ymm0,%%ymm0; vpermilps $177,%%ymm1,%%ymm1;"\ + acc_m8n2_con(0,1,4,5,6,7,4,12,%1) acc_m8n2_con(0,1,8,9,10,11,4,12,%1,%%r12,1) +#define KERNEL_1_k1m8n6 KERNEL_1_k1m8n4 acc_m8n2_con(0,1,12,13,14,15,0,8,%1,%%r12,2) +#define KERNEL_2_k1m8n6 KERNEL_2_k1m8n4 acc_m8n2_con(0,1,12,13,14,15,4,12,%1,%%r12,2) +#define KERNEL_k1m8n4 KERNEL_1_k1m8n4 KERNEL_2_k1m8n4 "addq $16,%1;" +#define KERNEL_k1m8n6 KERNEL_1_k1m8n6 KERNEL_2_k1m8n6 "addq $16,%1;" +#define zero_4ymm(no1,no2,no3,no4) \ + "vpxor %%ymm"#no1",%%ymm"#no1",%%ymm"#no1"; vpxor %%ymm"#no2",%%ymm"#no2",%%ymm"#no2";"\ + "vpxor %%ymm"#no3",%%ymm"#no3",%%ymm"#no3"; vpxor %%ymm"#no4",%%ymm"#no4",%%ymm"#no4";" +/* initialization and storage macros */ +#define INIT_m8n1 zero_4ymm(4,5,6,7) +#define INIT_m8n2 zero_4ymm(4,5,6,7) zero_4ymm(8,9,10,11) +#define INIT_m8n4 zero_4ymm(4,5,6,7) zero_4ymm(8,9,10,11) +#define INIT_m8n6 INIT_m8n4 zero_4ymm(12,13,14,15) +#if A_CONJ == B_CONJ + #define cont_expacc(cl,cr,dst) "vpermilps $177,%%ymm"#cr",%%ymm"#cr"; vaddsubps %%ymm"#cl",%%ymm"#cr",%%ymm"#dst";" +#else + #define cont_expacc(cl,cr,dst) "vpermilps $177,%%ymm"#cr",%%ymm"#cr"; vaddsubps %%ymm"#cr",%%ymm"#cl",%%ymm"#dst";" +#endif +#if A_CONJ == 0 + #define save_1ymm(c,tmp,off,alpr,alpi,...) \ + "vpermilps $177,%%ymm"#c",%%ymm"#tmp"; vfmsubadd213ps "#off"("#__VA_ARGS__"),%%ymm"#alpr",%%ymm"#c";"\ + "vfmsubadd231ps %%ymm"#tmp",%%ymm"#alpi",%%ymm"#c"; vmovups %%ymm"#c","#off"("#__VA_ARGS__");" +#else + #define save_1ymm(c,tmp,off,alpr,alpi,...) 
\ + "vpermilps $177,%%ymm"#c",%%ymm"#tmp"; vfmaddsub213ps "#off"("#__VA_ARGS__"),%%ymm"#alpi",%%ymm"#tmp";"\ + "vfmaddsub231ps %%ymm"#c",%%ymm"#alpr",%%ymm"#tmp"; vmovups %%ymm"#tmp","#off"("#__VA_ARGS__");" +#endif +#define save_init_m8 "movq %2,%3; addq $64,%2; vbroadcastss (%6),%%ymm0; vbroadcastss 4(%6),%%ymm1;" +#define SAVE_m8n1 save_init_m8 cont_expacc(4,5,4) cont_expacc(6,7,6) save_1ymm(4,2,0,0,1,%3) save_1ymm(6,3,32,0,1,%3) +#define SAVE_m8n2 SAVE_m8n1\ + cont_expacc(8,9,8) cont_expacc(10,11,10) save_1ymm(8,2,0,0,1,%3,%4,1) save_1ymm(10,3,32,0,1,%3,%4,1) +#define SAVE_m8n4 save_init_m8\ + save_1ymm(4,2,0,0,1,%3) save_1ymm(5,3,32,0,1,%3) save_1ymm(6,2,0,0,1,%3,%4,1) save_1ymm(7,3,32,0,1,%3,%4,1) "leaq (%3,%4,2),%3;"\ + save_1ymm(8,2,0,0,1,%3) save_1ymm(9,3,32,0,1,%3) save_1ymm(10,2,0,0,1,%3,%4,1) save_1ymm(11,3,32,0,1,%3,%4,1) +#define SAVE_m8n6 SAVE_m8n4 "leaq (%3,%4,2),%3;"\ + save_1ymm(12,2,0,0,1,%3) save_1ymm(13,3,32,0,1,%3) save_1ymm(14,2,0,0,1,%3,%4,1) save_1ymm(15,3,32,0,1,%3,%4,1) +#define COMPUTE_m8(ndim) \ + "movq %%r14,%1;" INIT_m8n##ndim "movq %2,%3; movq %%r13,%5;"\ + "testq %5,%5; jz "#ndim"8883f; cmpq $10,%5; jb "#ndim"8882f;"\ + "movq $10,%5; movq $84,%%r15;"\ + #ndim"8881:\n\t"\ + "prefetcht1 (%3); subq $63,%3; addq %%r15,%3;"\ + "prefetcht0 64(%1); prefetcht0 64(%1,%%r12,1); prefetcht0 64(%1,%%r12,2);"\ + KERNEL_k1m8n##ndim KERNEL_k1m8n##ndim\ + "testq $12,%5; movq $84,%%r15; cmovz %4,%%r15; prefetcht1 (%8); addq $16,%8;"\ + KERNEL_k1m8n##ndim KERNEL_k1m8n##ndim\ + "addq $4,%5; cmpq %5,%%r13; jnb "#ndim"8881b;"\ + "movq %2,%3; negq %5; leaq 10(%%r13,%5,1),%5; prefetcht0 (%6); prefetcht0 7(%6);"\ + #ndim"8882:\n\t"\ + "prefetcht0 (%3); prefetcht0 63(%3); addq %4,%3;"\ + KERNEL_k1m8n##ndim "decq %5; jnz "#ndim"8882b;"\ + #ndim"8883:\n\t"\ + "prefetcht0 (%%r14); prefetcht0 64(%%r14);" SAVE_m8n##ndim + +/* m=4, ymm 0-3 temp, ymm 4-15 acc, expanded accumulators */ +#define KERNEL_k1m4n1 \ + "vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2; addq $32,%0;"\ + "vbroadcastsd (%1),%%ymm0;" acc_m4n1_exp(1,2,0,4,5) "addq $8,%1;" +#define acc_m4n2_exp(c1l,c1r,c2l,c2r,...) 
\ + "vbroadcastsd ("#__VA_ARGS__"),%%ymm2;" acc_m4n1_exp(0,1,2,c1l,c1r)\ + "vbroadcastsd 8("#__VA_ARGS__"),%%ymm3;" acc_m4n1_exp(0,1,3,c2l,c2r) +#define KERNEL_h_k1m4n2 \ + "vmovsldup (%0),%%ymm0; vmovshdup (%0),%%ymm1; addq $32,%0;" acc_m4n2_exp(4,5,6,7,%1) +#define KERNEL_h_k1m4n4 KERNEL_h_k1m4n2 acc_m4n2_exp(8,9,10,11,%1,%%r12,1) +#define KERNEL_h_k1m4n6 KERNEL_h_k1m4n4 acc_m4n2_exp(12,13,14,15,%1,%%r12,2) +#define KERNEL_k1m4n2 KERNEL_h_k1m4n2 "addq $16,%1;" +#define KERNEL_k1m4n4 KERNEL_h_k1m4n4 "addq $16,%1;" +#define KERNEL_k1m4n6 KERNEL_h_k1m4n6 "addq $16,%1;" +#define INIT_m4n1 "vpxor %%ymm4,%%ymm4,%%ymm4; vpxor %%ymm5,%%ymm5,%%ymm5;" +#define INIT_m4n2 zero_4ymm(4,5,6,7) +#define INIT_m4n4 INIT_m4n2 zero_4ymm(8,9,10,11) +#define INIT_m4n6 INIT_m4n4 zero_4ymm(12,13,14,15) +#define save_init_m4 "movq %2,%3; addq $32,%2; vbroadcastss (%6),%%ymm0; vbroadcastss 4(%6),%%ymm1;" +#define SAVE_m4n1 save_init_m4 cont_expacc(4,5,4) save_1ymm(4,2,0,0,1,%3) +#define SAVE_m4n2 SAVE_m4n1 cont_expacc(6,7,6) save_1ymm(6,3,0,0,1,%3,%4,1) +#define SAVE_m4n4 SAVE_m4n2 "leaq (%3,%4,2),%3;"\ + cont_expacc(8,9,8) cont_expacc(10,11,10) save_1ymm(8,2,0,0,1,%3) save_1ymm(10,3,0,0,1,%3,%4,1) +#define SAVE_m4n6 SAVE_m4n4 "leaq (%3,%4,2),%3;"\ + cont_expacc(12,13,12) cont_expacc(14,15,14) save_1ymm(12,2,0,0,1,%3) save_1ymm(14,3,0,0,1,%3,%4,1) +#define COMPUTE_m4(ndim) \ + "movq %%r14,%1;" INIT_m4n##ndim "movq %%r13,%5;"\ + "testq %5,%5; jz "#ndim"4442f;"\ + #ndim"4441:\n\t"\ + KERNEL_k1m4n##ndim\ + "decq %5; jnz "#ndim"4441b;"\ + #ndim"4442:\n\t"\ + SAVE_m4n##ndim + +/* m=2, xmm 0-3 temp, xmm 4-15 acc, expanded accumulators */ +#if A_CONJ == B_CONJ + #define acc_m2n1_exp(ar,ai,b2,cl,cr) "vfmadd231ps %%xmm"#ar",%%xmm"#b2",%%xmm"#cl"; vfmadd231ps %%xmm"#ai",%%xmm"#b2",%%xmm"#cr";" +#else + #define acc_m2n1_exp(ar,ai,b2,cl,cr) "vfmadd231ps %%xmm"#ar",%%xmm"#b2",%%xmm"#cl"; vfnmadd231ps %%xmm"#ai",%%xmm"#b2",%%xmm"#cr";" +#endif +#define KERNEL_h_k1m2n1 \ + "vmovsldup (%0),%%xmm0; vmovshdup (%0),%%xmm1; addq $16,%0;"\ + "vmovddup (%1),%%xmm2;" acc_m2n1_exp(0,1,2,4,5) +#define KERNEL_h_k1m2n2 KERNEL_h_k1m2n1\ + "vmovddup 8(%1),%%xmm3;" acc_m2n1_exp(0,1,3,6,7) +#define acc_m2n2_exp(c1,c2,c3,c4,...)\ + "vmovddup ("#__VA_ARGS__"),%%xmm2;" acc_m2n1_exp(0,1,2,c1,c2)\ + "vmovddup 8("#__VA_ARGS__"),%%xmm3;" acc_m2n1_exp(0,1,3,c3,c4) +#define KERNEL_h_k1m2n4 KERNEL_h_k1m2n2 acc_m2n2_exp(8,9,10,11,%1,%%r12,1) +#define KERNEL_h_k1m2n6 KERNEL_h_k1m2n4 acc_m2n2_exp(12,13,14,15,%1,%%r12,2) +#define KERNEL_k1m2n1 KERNEL_h_k1m2n1 "addq $8,%1;" +#define KERNEL_k1m2n2 KERNEL_h_k1m2n2 "addq $16,%1;" +#define KERNEL_k1m2n4 KERNEL_h_k1m2n4 "addq $16,%1;" +#define KERNEL_k1m2n6 KERNEL_h_k1m2n6 "addq $16,%1;" +#define zero_2xmm(no1,no2) "vpxor %%xmm"#no1",%%xmm"#no1",%%xmm"#no1"; vpxor %%xmm"#no2",%%xmm"#no2",%%xmm"#no2";" +#define INIT_m2n1 zero_2xmm(4,5) +#define INIT_m2n2 INIT_m2n1 zero_2xmm(6,7) +#define INIT_m2n4 INIT_m2n2 zero_2xmm(8,9) zero_2xmm(10,11) +#define INIT_m2n6 INIT_m2n4 zero_2xmm(12,13) zero_2xmm(14,15) +#if A_CONJ == B_CONJ + #define cont_expxmmacc(cl,cr,dst) "vpermilps $177,%%xmm"#cr",%%xmm"#cr"; vaddsubps %%xmm"#cl",%%xmm"#cr",%%xmm"#dst";" +#else + #define cont_expxmmacc(cl,cr,dst) "vpermilps $177,%%xmm"#cr",%%xmm"#cr"; vaddsubps %%xmm"#cr",%%xmm"#cl",%%xmm"#dst";" +#endif +#if A_CONJ == 0 + #define save_1xmm(c,tmp,alpr,alpi) \ + "vpermilps $177,%%xmm"#c",%%xmm"#tmp"; vfmsubadd213ps (%3),%%xmm"#alpr",%%xmm"#c";"\ + "vfmsubadd231ps %%xmm"#tmp",%%xmm"#alpi",%%xmm"#c"; vmovups %%xmm"#c",(%3); addq %4,%3;" +#else + 
#define save_1xmm(c,tmp,alpr,alpi) \ + "vpermilps $177,%%xmm"#c",%%xmm"#tmp"; vfmaddsub213ps (%3),%%xmm"#alpi",%%xmm"#tmp";"\ + "vfmaddsub231ps %%xmm"#c",%%xmm"#alpr",%%xmm"#tmp"; vmovups %%xmm"#tmp",(%3); addq %4,%3;" +#endif +#define save_init_m2 "movq %2,%3; addq $16,%2; vbroadcastss (%6),%%xmm0; vbroadcastss 4(%6),%%xmm1;" +#define SAVE_m2n1 save_init_m2 cont_expxmmacc(4,5,4) save_1xmm(4,2,0,1) +#define SAVE_m2n2 SAVE_m2n1 cont_expacc(6,7,6) save_1xmm(6,3,0,1) +#define SAVE_m2n4 SAVE_m2n2 cont_expacc(8,9,8) save_1xmm(8,2,0,1) cont_expacc(10,11,10) save_1xmm(10,3,0,1) +#define SAVE_m2n6 SAVE_m2n4 cont_expacc(12,13,12) save_1xmm(12,2,0,1) cont_expacc(14,15,14) save_1xmm(14,3,0,1) +#define COMPUTE_m2(ndim) \ + "movq %%r14,%1;" INIT_m2n##ndim "movq %%r13,%5;"\ + "testq %5,%5; jz "#ndim"2222f;"\ + #ndim"2221:\n\t"\ + KERNEL_k1m2n##ndim\ + "decq %5; jnz "#ndim"2221b;"\ + #ndim"2222:\n\t"\ + SAVE_m2n##ndim + +/* m=1, xmm 0-3 temp, xmm 4-9 acc, expanded accumulators */ +#if A_CONJ == B_CONJ + #define acc_m1n1_exp(ar,ai,b2,cl,cr) "vfmadd231ps %%xmm"#ar",%%xmm"#b2",%%xmm"#cl"; vfmadd231ps %%xmm"#ai",%%xmm"#b2",%%xmm"#cr";" + #define acc_m1n2_exp(arb,aib,b4,cl,cr) "vfmadd231ps %%xmm"#arb",%%xmm"#b4",%%xmm"#cl"; vfmadd231ps %%xmm"#aib",%%xmm"#b4",%%xmm"#cr";" +#else + #define acc_m1n1_exp(ar,ai,b2,cl,cr) "vfmadd231ps %%xmm"#ar",%%xmm"#b2",%%xmm"#cl"; vfnmadd231ps %%xmm"#ai",%%xmm"#b2",%%xmm"#cr";" + #define acc_m1n2_exp(arb,aib,b4,cl,cr) "vfmadd231ps %%xmm"#arb",%%xmm"#b4",%%xmm"#cl"; vfnmadd231ps %%xmm"#aib",%%xmm"#b4",%%xmm"#cr";" +#endif +#define KERNEL_k1m1n1 \ + "vbroadcastss (%0),%%xmm0; vbroadcastss 4(%0),%%xmm1; addq $8,%0;"\ + "vmovsd (%1),%%xmm2; addq $8,%1;" acc_m1n1_exp(0,1,2,4,5) +#define KERNEL_h_k1m1n2 \ + "vbroadcastss (%0),%%xmm0; vbroadcastss 4(%0),%%xmm1; addq $8,%0;"\ + "vmovups (%1),%%xmm2;" acc_m1n2_exp(0,1,2,4,5) +#define KERNEL_h_k1m1n4 KERNEL_h_k1m1n2 "vmovups (%1,%%r12,1),%%xmm2;" acc_m1n2_exp(0,1,2,6,7) +#define KERNEL_h_k1m1n6 KERNEL_h_k1m1n4 "vmovups (%1,%%r12,2),%%xmm2;" acc_m1n2_exp(0,1,2,8,9) +#define KERNEL_k1m1n2 KERNEL_h_k1m1n2 "addq $16,%1;" +#define KERNEL_k1m1n4 KERNEL_h_k1m1n4 "addq $16,%1;" +#define KERNEL_k1m1n6 KERNEL_h_k1m1n6 "addq $16,%1;" +#define INIT_m1n1 zero_2xmm(4,5) +#define INIT_m1n2 zero_2xmm(4,5) +#define INIT_m1n4 INIT_m1n2 zero_2xmm(6,7) +#define INIT_m1n6 INIT_m1n4 zero_2xmm(8,9) +#if A_CONJ == 0 + #define save_m1n1(c,tmp1,tmp2,alpr,alpi) \ + "vpermilps $177,%%xmm"#c",%%xmm"#tmp1"; vmovsd (%3),%%xmm"#tmp2"; vfmsubadd213ps %%xmm"#tmp2",%%xmm"#alpr",%%xmm"#c";"\ + "vfmsubadd231ps %%xmm"#tmp1",%%xmm"#alpi",%%xmm"#c"; vmovsd %%xmm"#c",(%3);" + #define save_m1n2(c,tmp1,tmp2,alpr,alpi) \ + "vpermilps $177,%%xmm"#c",%%xmm"#tmp1"; vmovsd (%3),%%xmm"#tmp2"; vmovhpd (%3,%4,1),%%xmm"#tmp2",%%xmm"#tmp2";"\ + "vfmsubadd213ps %%xmm"#tmp2",%%xmm"#alpr",%%xmm"#c"; vfmsubadd231ps %%xmm"#tmp1",%%xmm"#alpi",%%xmm"#c";"\ + "vmovsd %%xmm"#c",(%3); vmovhpd %%xmm"#c",(%3,%4,1); leaq (%3,%4,2),%3;" +#else + #define save_m1n1(c,tmp1,tmp2,alpr,alpi) \ + "vpermilps $177,%%xmm"#c",%%xmm"#tmp1"; vmovsd (%3),%%xmm"#tmp2"; vfmaddsub213ps %%xmm"#tmp2",%%xmm"#alpi",%%xmm"#tmp1";"\ + "vfmaddsub231ps %%xmm"#c",%%xmm"#alpr",%%xmm"#tmp1"; vmovsd %%xmm"#tmp1",(%3);" + #define save_m1n2(c,tmp1,tmp2,alpr,alpi) \ + "vpermilps $177,%%xmm"#c",%%xmm"#tmp1"; vmovsd (%3),%%xmm"#tmp2"; vmovhpd (%3,%4,1),%%xmm"#tmp2",%%xmm"#tmp2";"\ + "vfmaddsub213ps %%xmm"#tmp2",%%xmm"#alpi",%%xmm"#tmp1"; vfmaddsub231ps %%xmm"#c",%%xmm"#alpr",%%xmm"#tmp1";"\ + "vmovsd %%xmm"#tmp1",(%3); vmovhpd 
%%xmm"#tmp1",(%3,%4,1); leaq (%3,%4,2),%3;" +#endif +#define save_init_m1 "movq %2,%3; addq $8,%2; vbroadcastss (%6),%%xmm0; vbroadcastss 4(%6),%%xmm1;" +#define SAVE_m1n1 save_init_m1 cont_expxmmacc(4,5,4) save_m1n1(4,2,3,0,1) +#define SAVE_m1n2 save_init_m1 cont_expxmmacc(4,5,4) save_m1n2(4,2,3,0,1) +#define SAVE_m1n4 SAVE_m1n2 cont_expxmmacc(6,7,6) save_m1n2(6,2,3,0,1) +#define SAVE_m1n6 SAVE_m1n4 cont_expxmmacc(8,9,8) save_m1n2(8,2,3,0,1) +#define COMPUTE_m1(ndim) \ + "movq %%r14,%1;" INIT_m1n##ndim "movq %%r13,%5;"\ + "testq %5,%5; jz "#ndim"1112f;"\ + #ndim"1111:\n\t"\ + KERNEL_k1m1n##ndim\ + "decq %5; jnz "#ndim"1111b;"\ + #ndim"1112:\n\t"\ + SAVE_m1n##ndim + +#define COMPUTE(ndim) {\ + b_pref = b_ptr + ndim * K *2;\ + __asm__ __volatile__ (\ + "movq %1,%%r14; movq %5,%%r13; movq %5,%%r12; salq $4,%%r12; movq %7,%%r11;"\ + "cmpq $8,%7; jb "#ndim"9992f;"\ + #ndim"9991:\n\t"\ + COMPUTE_m8(ndim)\ + "subq $8,%7; cmpq $8,%7; jnb "#ndim"9991b;"\ + #ndim"9992:\n\t"\ + "cmpq $4,%7; jb "#ndim"9993f;"\ + COMPUTE_m4(ndim) "subq $4,%7;"\ + #ndim"9993:\n\t"\ + "cmpq $2,%7; jb "#ndim"9994f;"\ + COMPUTE_m2(ndim) "subq $2,%7;"\ + #ndim"9994:\n\t"\ + "testq %7,%7; jz "#ndim"9995f;"\ + COMPUTE_m1(ndim)\ + #ndim"9995:\n\t"\ + "movq %%r14,%1; movq %%r13,%5; movq %%r11,%7; vzeroupper;"\ + :"+r"(a_ptr),"+r"(b_ptr),"+r"(c_ptr),"+r"(c_tmp),"+r"(ldc_in_bytes),"+r"(K),"+r"(alp),"+r"(M),"+r"(b_pref)\ + ::"cc","memory","r11","r12","r13","r14","r15","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5",\ + "xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15");\ + a_ptr -= M * K *2; b_ptr += ndim * K *2; c_ptr += (ndim * LDC - M) * 2;\ +} + +int __attribute__ ((noinline)) +CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alphar, float alphai, float * __restrict__ A, float * __restrict__ B, float * __restrict__ C, BLASLONG LDC) +{ + if(m==0||n==0||k==0||(alphar==0.0 && alphai==0.0)) return 0; + int64_t ldc_in_bytes = (int64_t)LDC * sizeof(float) * 2; +#if A_CONJ == B_CONJ + float const_val[2] = {-alphar, -alphai}; +#else + float const_val[2] = {alphar, alphai}; +#endif + int64_t M = (int64_t)m, K = (int64_t)k; + BLASLONG n_count = n; + float *a_ptr = A,*b_ptr = B,*c_ptr = C,*c_tmp = C,*alp = const_val,*b_pref = B; + for(;n_count>5;n_count-=6) COMPUTE(6) + for(;n_count>3;n_count-=4) COMPUTE(4) + for(;n_count>1;n_count-=2) COMPUTE(2) + if(n_count>0) COMPUTE(1) + return 0; +} diff --git a/kernel/x86_64/zgemm_kernel_4x2_haswell.c b/kernel/x86_64/zgemm_kernel_4x2_haswell.c new file mode 100644 index 000000000..3279b8b8c --- /dev/null +++ b/kernel/x86_64/zgemm_kernel_4x2_haswell.c @@ -0,0 +1,240 @@ +#include "common.h" +#include + +/* recommended settings: GEMM_P = 192, GEMM_Q = 192 */ + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) + #define A_CONJ 0 + #define B_CONJ 0 +#endif +#if defined(RN) || defined(RT) || defined(CN) || defined(CT) + #define A_CONJ 1 + #define B_CONJ 0 +#endif +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) + #define A_CONJ 0 + #define B_CONJ 1 +#endif +#if defined(RR) || defined(RC) || defined(CR) || defined(CC) + #define A_CONJ 1 + #define B_CONJ 1 +#endif + +/* %0 = a_ptr, %1 = b_ptr, %2 = c_ptr, %3 = c_tmp, %4 = ldc(bytes), %5 = k_counter, %6 = &alpha, %7 = m_counter, %8 = b_pref */ +/* r11 = m, r12 = k << 5, r13 = k, r14 = b_head, r15 = temp */ + +/* m=4, ymm 0-3 temp, ymm 4-15 acc */ +#if A_CONJ == B_CONJ + #define acc_m2n1_exp(ar,ai,b2,cl,cr) "vfmadd231pd %%ymm"#ar",%%ymm"#b2",%%ymm"#cl"; vfmadd231pd %%ymm"#ai",%%ymm"#b2",%%ymm"#cr";" + #define 
acc_m4n1_con(ua,la,b1,uc,lc) "vfmaddsub231pd %%ymm"#ua",%%ymm"#b1",%%ymm"#uc"; vfmaddsub231pd %%ymm"#la",%%ymm"#b1",%%ymm"#lc";" +#else + #define acc_m2n1_exp(ar,ai,b2,cl,cr) "vfmadd231pd %%ymm"#ar",%%ymm"#b2",%%ymm"#cl"; vfnmadd231pd %%ymm"#ai",%%ymm"#b2",%%ymm"#cr";" + #define acc_m4n1_con(ua,la,b1,uc,lc) "vfmsubadd231pd %%ymm"#ua",%%ymm"#b1",%%ymm"#uc"; vfmsubadd231pd %%ymm"#la",%%ymm"#b1",%%ymm"#lc";" +#endif +/* expanded accumulators for m4n1 and m4n2 */ +#define KERNEL_k1m4n1 \ + "vbroadcastf128 (%1),%%ymm0; addq $16,%1;"\ + "vmovddup (%0),%%ymm1; vmovddup 8(%0),%%ymm2;" acc_m2n1_exp(1,2,0,4,5)\ + "vmovddup 32(%0),%%ymm1; vmovddup 40(%0),%%ymm2;" acc_m2n1_exp(1,2,0,6,7)\ + "addq $64,%0;" +#define KERNEL_k1m4n2 \ + "vbroadcastf128 (%1),%%ymm0; vbroadcastf128 16(%1),%%ymm1; addq $32,%1;"\ + "vmovddup (%0),%%ymm2; vmovddup 8(%0),%%ymm3;" acc_m2n1_exp(2,3,0,4,5) acc_m2n1_exp(2,3,1,8,9)\ + "vmovddup 32(%0),%%ymm2; vmovddup 40(%0),%%ymm3;" acc_m2n1_exp(2,3,0,6,7) acc_m2n1_exp(2,3,1,10,11)\ + "addq $64,%0;" +/* contracted accumulators for m4n4 and m4n6 */ +#define acc_m4n2_con(ua,la,luc,llc,ruc,rlc,lboff,rboff,...) \ + "vbroadcastsd "#lboff"("#__VA_ARGS__"),%%ymm2;" acc_m4n1_con(ua,la,2,luc,llc)\ + "vbroadcastsd "#rboff"("#__VA_ARGS__"),%%ymm3;" acc_m4n1_con(ua,la,3,ruc,rlc) +#define KERNEL_1_k1m4n4 \ + "vmovupd (%0),%%ymm0; vmovupd 32(%0),%%ymm1; prefetcht0 512(%0); addq $64,%0;"\ + acc_m4n2_con(0,1,4,5,6,7,0,16,%1) acc_m4n2_con(0,1,8,9,10,11,0,16,%1,%%r12,1) +#define KERNEL_2_k1m4n4 \ + "vpermilpd $5,%%ymm0,%%ymm0; vpermilpd $5,%%ymm1,%%ymm1;"\ + acc_m4n2_con(0,1,4,5,6,7,8,24,%1) acc_m4n2_con(0,1,8,9,10,11,8,24,%1,%%r12,1) +#define KERNEL_1_k1m4n6 KERNEL_1_k1m4n4 acc_m4n2_con(0,1,12,13,14,15,0,16,%1,%%r12,2) +#define KERNEL_2_k1m4n6 KERNEL_2_k1m4n4 acc_m4n2_con(0,1,12,13,14,15,8,24,%1,%%r12,2) +#define KERNEL_k1m4n4 KERNEL_1_k1m4n4 KERNEL_2_k1m4n4 "addq $32,%1;" +#define KERNEL_k1m4n6 KERNEL_1_k1m4n6 KERNEL_2_k1m4n6 "addq $32,%1;" +#define zero_4ymm(no1,no2,no3,no4) \ + "vpxor %%ymm"#no1",%%ymm"#no1",%%ymm"#no1"; vpxor %%ymm"#no2",%%ymm"#no2",%%ymm"#no2";"\ + "vpxor %%ymm"#no3",%%ymm"#no3",%%ymm"#no3"; vpxor %%ymm"#no4",%%ymm"#no4",%%ymm"#no4";" +/* initialization and storage macros */ +#define INIT_m4n1 zero_4ymm(4,5,6,7) +#define INIT_m4n2 zero_4ymm(4,5,6,7) zero_4ymm(8,9,10,11) +#define INIT_m4n4 zero_4ymm(4,5,6,7) zero_4ymm(8,9,10,11) +#define INIT_m4n6 INIT_m4n4 zero_4ymm(12,13,14,15) +#if A_CONJ == B_CONJ + #define cont_expacc(cl,cr,dst) "vpermilpd $5,%%ymm"#cr",%%ymm"#cr"; vaddsubpd %%ymm"#cl",%%ymm"#cr",%%ymm"#dst";" +#else + #define cont_expacc(cl,cr,dst) "vpermilpd $5,%%ymm"#cr",%%ymm"#cr"; vaddsubpd %%ymm"#cr",%%ymm"#cl",%%ymm"#dst";" +#endif +#if A_CONJ == 0 + #define save_1ymm(c,tmp,off,alpr,alpi,...) \ + "vpermilpd $5,%%ymm"#c",%%ymm"#tmp"; vfmsubadd213pd "#off"("#__VA_ARGS__"),%%ymm"#alpr",%%ymm"#c";"\ + "vfmsubadd231pd %%ymm"#tmp",%%ymm"#alpi",%%ymm"#c"; vmovupd %%ymm"#c","#off"("#__VA_ARGS__");" +#else + #define save_1ymm(c,tmp,off,alpr,alpi,...) 
\ + "vpermilpd $5,%%ymm"#c",%%ymm"#tmp"; vfmaddsub213pd "#off"("#__VA_ARGS__"),%%ymm"#alpi",%%ymm"#tmp";"\ + "vfmaddsub231pd %%ymm"#c",%%ymm"#alpr",%%ymm"#tmp"; vmovupd %%ymm"#tmp","#off"("#__VA_ARGS__");" +#endif +#define save_init_m4 "movq %2,%3; addq $64,%2; vbroadcastsd (%6),%%ymm0; vbroadcastsd 8(%6),%%ymm1;" +#define SAVE_m4n1 save_init_m4 cont_expacc(4,5,4) cont_expacc(6,7,6) save_1ymm(4,2,0,0,1,%3) save_1ymm(6,3,32,0,1,%3) +#define SAVE_m4n2 SAVE_m4n1\ + cont_expacc(8,9,8) cont_expacc(10,11,10) save_1ymm(8,2,0,0,1,%3,%4,1) save_1ymm(10,3,32,0,1,%3,%4,1) +#define SAVE_m4n4 save_init_m4\ + save_1ymm(4,2,0,0,1,%3) save_1ymm(5,3,32,0,1,%3) save_1ymm(6,2,0,0,1,%3,%4,1) save_1ymm(7,3,32,0,1,%3,%4,1) "leaq (%3,%4,2),%3;"\ + save_1ymm(8,2,0,0,1,%3) save_1ymm(9,3,32,0,1,%3) save_1ymm(10,2,0,0,1,%3,%4,1) save_1ymm(11,3,32,0,1,%3,%4,1) +#define SAVE_m4n6 SAVE_m4n4 "leaq (%3,%4,2),%3;"\ + save_1ymm(12,2,0,0,1,%3) save_1ymm(13,3,32,0,1,%3) save_1ymm(14,2,0,0,1,%3,%4,1) save_1ymm(15,3,32,0,1,%3,%4,1) +#define COMPUTE_m4(ndim) \ + "movq %%r14,%1;" INIT_m4n##ndim "movq %2,%3; movq %%r13,%5;"\ + "testq %5,%5; jz "#ndim"4443f; cmpq $10,%5; jb "#ndim"4442f;"\ + "movq $10,%5; movq $84,%%r15;"\ + #ndim"4441:\n\t"\ + "prefetcht1 (%3); subq $63,%3; addq %%r15,%3;"\ + "prefetcht0 96(%1); prefetcht0 96(%1,%%r12,1); prefetcht0 96(%1,%%r12,2);" KERNEL_k1m4n##ndim KERNEL_k1m4n##ndim\ + "testq $12,%5; movq $84,%%r15; cmovz %4,%%r15; prefetcht1 (%8); addq $16,%8;"\ + "prefetcht0 96(%1); prefetcht0 96(%1,%%r12,1); prefetcht0 96(%1,%%r12,2);" KERNEL_k1m4n##ndim KERNEL_k1m4n##ndim\ + "addq $4,%5; cmpq %5,%%r13; jnb "#ndim"4441b;"\ + "movq %2,%3; negq %5; leaq 10(%%r13,%5,1),%5; prefetcht0 (%6); prefetcht0 15(%6);"\ + #ndim"4442:\n\t"\ + "prefetcht0 (%3); prefetcht0 63(%3); addq %4,%3;"\ + KERNEL_k1m4n##ndim "decq %5; jnz "#ndim"4442b;"\ + #ndim"4443:\n\t"\ + "prefetcht0 (%%r14); prefetcht0 64(%%r14);" SAVE_m4n##ndim + +/* m=2, ymm 0-3 temp, ymm 4-15 acc, expanded accumulators */ +#define KERNEL_k1m2n1 \ + "vmovddup (%0),%%ymm1; vmovddup 8(%0),%%ymm2; addq $32,%0;"\ + "vbroadcastf128 (%1),%%ymm0;" acc_m2n1_exp(1,2,0,4,5) "addq $16,%1;" +#define acc_m2n2_exp(c1l,c1r,c2l,c2r,...) 
\ + "vbroadcastf128 ("#__VA_ARGS__"),%%ymm2;" acc_m2n1_exp(0,1,2,c1l,c1r)\ + "vbroadcastf128 16("#__VA_ARGS__"),%%ymm3;" acc_m2n1_exp(0,1,3,c2l,c2r) +#define KERNEL_h_k1m2n2 \ + "vmovddup (%0),%%ymm0; vmovddup 8(%0),%%ymm1; addq $32,%0;" acc_m2n2_exp(4,5,6,7,%1) +#define KERNEL_h_k1m2n4 KERNEL_h_k1m2n2 acc_m2n2_exp(8,9,10,11,%1,%%r12,1) +#define KERNEL_h_k1m2n6 KERNEL_h_k1m2n4 acc_m2n2_exp(12,13,14,15,%1,%%r12,2) +#define KERNEL_k1m2n2 KERNEL_h_k1m2n2 "addq $32,%1;" +#define KERNEL_k1m2n4 KERNEL_h_k1m2n4 "addq $32,%1;" +#define KERNEL_k1m2n6 KERNEL_h_k1m2n6 "addq $32,%1;" +#define INIT_m2n1 "vpxor %%ymm4,%%ymm4,%%ymm4; vpxor %%ymm5,%%ymm5,%%ymm5;" +#define INIT_m2n2 zero_4ymm(4,5,6,7) +#define INIT_m2n4 INIT_m2n2 zero_4ymm(8,9,10,11) +#define INIT_m2n6 INIT_m2n4 zero_4ymm(12,13,14,15) +#define save_init_m2 "movq %2,%3; addq $32,%2; vbroadcastsd (%6),%%ymm0; vbroadcastsd 8(%6),%%ymm1;" +#define SAVE_m2n1 save_init_m2 cont_expacc(4,5,4) save_1ymm(4,2,0,0,1,%3) +#define SAVE_m2n2 SAVE_m2n1 cont_expacc(6,7,6) save_1ymm(6,3,0,0,1,%3,%4,1) +#define SAVE_m2n4 SAVE_m2n2 "leaq (%3,%4,2),%3;"\ + cont_expacc(8,9,8) cont_expacc(10,11,10) save_1ymm(8,2,0,0,1,%3) save_1ymm(10,3,0,0,1,%3,%4,1) +#define SAVE_m2n6 SAVE_m2n4 "leaq (%3,%4,2),%3;"\ + cont_expacc(12,13,12) cont_expacc(14,15,14) save_1ymm(12,2,0,0,1,%3) save_1ymm(14,3,0,0,1,%3,%4,1) +#define COMPUTE_m2(ndim) \ + "movq %%r14,%1;" INIT_m2n##ndim "movq %%r13,%5;"\ + "testq %5,%5; jz "#ndim"2222f;"\ + #ndim"2221:\n\t"\ + KERNEL_k1m2n##ndim\ + "decq %5; jnz "#ndim"2221b;"\ + #ndim"2222:\n\t"\ + SAVE_m2n##ndim + +/* m=1, vmm 0-3 temp, vmm 4-15 acc, expanded accumulators */ +#if A_CONJ == B_CONJ + #define acc_m1n1_exp(ar,ai,b2,cl,cr) "vfmadd231pd %%xmm"#ar",%%xmm"#b2",%%xmm"#cl"; vfmadd231pd %%xmm"#ai",%%xmm"#b2",%%xmm"#cr";" + #define acc_m1n2_exp(arb,aib,b4,cl,cr) "vfmadd231pd %%ymm"#arb",%%ymm"#b4",%%ymm"#cl"; vfmadd231pd %%ymm"#aib",%%ymm"#b4",%%ymm"#cr";" +#else + #define acc_m1n1_exp(ar,ai,b2,cl,cr) "vfmadd231pd %%xmm"#ar",%%xmm"#b2",%%xmm"#cl"; vfnmadd231pd %%xmm"#ai",%%xmm"#b2",%%xmm"#cr";" + #define acc_m1n2_exp(arb,aib,b4,cl,cr) "vfmadd231pd %%ymm"#arb",%%ymm"#b4",%%ymm"#cl"; vfnmadd231pd %%ymm"#aib",%%ymm"#b4",%%ymm"#cr";" +#endif +#define KERNEL_k1m1n1 \ + "vmovddup (%0),%%xmm0; vmovddup 8(%0),%%xmm1; addq $16,%0;"\ + "vmovupd (%1),%%xmm2; addq $16,%1;" acc_m1n1_exp(0,1,2,4,5) +#define KERNEL_h_k1m1n2 \ + "vbroadcastsd (%0),%%ymm0; vbroadcastsd 8(%0),%%ymm1; addq $16,%0;"\ + "vmovupd (%1),%%ymm2;" acc_m1n2_exp(0,1,2,4,5) +#define KERNEL_h_k1m1n4 KERNEL_h_k1m1n2 "vmovupd (%1,%%r12,1),%%ymm2;" acc_m1n2_exp(0,1,2,6,7) +#define KERNEL_h_k1m1n6 KERNEL_h_k1m1n4 "vmovupd (%1,%%r12,2),%%ymm2;" acc_m1n2_exp(0,1,2,8,9) +#define KERNEL_k1m1n2 KERNEL_h_k1m1n2 "addq $32,%1;" +#define KERNEL_k1m1n4 KERNEL_h_k1m1n4 "addq $32,%1;" +#define KERNEL_k1m1n6 KERNEL_h_k1m1n6 "addq $32,%1;" +#define INIT_m1n1 "vpxor %%xmm4,%%xmm4,%%xmm4; vpxor %%xmm5,%%xmm5,%%xmm5;" +#define INIT_m1n2 "vpxor %%ymm4,%%ymm4,%%ymm4; vpxor %%ymm5,%%ymm5,%%ymm5;" +#define INIT_m1n4 INIT_m1n2 "vpxor %%ymm6,%%ymm6,%%ymm6; vpxor %%ymm7,%%ymm7,%%ymm7;" +#define INIT_m1n6 INIT_m1n4 "vpxor %%ymm8,%%ymm8,%%ymm8; vpxor %%ymm9,%%ymm9,%%ymm9;" +#if A_CONJ == B_CONJ + #define cont_expxmmacc(cl,cr,dst) "vpermilpd $5,%%xmm"#cr",%%xmm"#cr"; vaddsubpd %%xmm"#cl",%%xmm"#cr",%%xmm"#dst";" +#else + #define cont_expxmmacc(cl,cr,dst) "vpermilpd $5,%%xmm"#cr",%%xmm"#cr"; vaddsubpd %%xmm"#cr",%%xmm"#cl",%%xmm"#dst";" +#endif +#if A_CONJ == 0 + #define save_m1n1(c,tmp,alpr,alpi) \ + "vpermilpd 
$5,%%xmm"#c",%%xmm"#tmp"; vfmsubadd213pd (%3),%%xmm"#alpr",%%xmm"#c";"\ + "vfmsubadd231pd %%xmm"#tmp",%%xmm"#alpi",%%xmm"#c"; vmovupd %%xmm"#c",(%3);" + #define save_m1n2(c,tmp1,tmp2,alpr,alpi) \ + "vpermilpd $5,%%ymm"#c",%%ymm"#tmp1"; vmovupd (%3),%%xmm"#tmp2"; vinsertf128 $1,(%3,%4,1),%%ymm"#tmp2",%%ymm"#tmp2";"\ + "vfmsubadd213pd %%ymm"#tmp2",%%ymm"#alpr",%%ymm"#c"; vfmsubadd231pd %%ymm"#tmp1",%%ymm"#alpi",%%ymm"#c";"\ + "vmovupd %%xmm"#c",(%3); vextractf128 $1,%%ymm"#c",(%3,%4,1); leaq (%3,%4,2),%3;" +#else + #define save_m1n1(c,tmp,alpr,alpi) \ + "vpermilpd $5,%%xmm"#c",%%xmm"#tmp"; vfmaddsub213pd (%3),%%xmm"#alpi",%%xmm"#tmp";"\ + "vfmaddsub231pd %%xmm"#c",%%xmm"#alpr",%%xmm"#tmp"; vmovupd %%xmm"#tmp",(%3);" + #define save_m1n2(c,tmp1,tmp2,alpr,alpi) \ + "vpermilpd $5,%%ymm"#c",%%ymm"#tmp1"; vmovupd (%3),%%xmm"#tmp2"; vinsertf128 $1,(%3,%4,1),%%ymm"#tmp2",%%ymm"#tmp2";"\ + "vfmaddsub213pd %%ymm"#tmp2",%%ymm"#alpi",%%ymm"#tmp1"; vfmaddsub231pd %%ymm"#c",%%ymm"#alpr",%%ymm"#tmp1";"\ + "vmovupd %%xmm"#tmp1",(%3); vextractf128 $1,%%ymm"#tmp1",(%3,%4,1); leaq (%3,%4,2),%3;" +#endif +#define save_init_m1 "movq %2,%3; addq $16,%2; vbroadcastsd (%6),%%ymm0; vbroadcastsd 8(%6),%%ymm1;" +#define SAVE_m1n1 save_init_m1 cont_expxmmacc(4,5,4) save_m1n1(4,2,0,1) +#define SAVE_m1n2 save_init_m1 cont_expacc(4,5,4) save_m1n2(4,2,3,0,1) +#define SAVE_m1n4 SAVE_m1n2 cont_expacc(6,7,6) save_m1n2(6,2,3,0,1) +#define SAVE_m1n6 SAVE_m1n4 cont_expacc(8,9,8) save_m1n2(8,2,3,0,1) +#define COMPUTE_m1(ndim) \ + "movq %%r14,%1;" INIT_m1n##ndim "movq %%r13,%5;"\ + "testq %5,%5; jz "#ndim"1112f;"\ + #ndim"1111:\n\t"\ + KERNEL_k1m1n##ndim\ + "decq %5; jnz "#ndim"1111b;"\ + #ndim"1112:\n\t"\ + SAVE_m1n##ndim + +#define COMPUTE(ndim) {\ + b_pref = b_ptr + ndim * K *2;\ + __asm__ __volatile__ (\ + "movq %1,%%r14; movq %5,%%r13; movq %5,%%r12; salq $5,%%r12; movq %7,%%r11;"\ + "cmpq $4,%7; jb "#ndim"9992f;"\ + #ndim"9991:\n\t"\ + COMPUTE_m4(ndim)\ + "subq $4,%7; cmpq $4,%7; jnb "#ndim"9991b;"\ + #ndim"9992:\n\t"\ + "cmpq $2,%7; jb "#ndim"9993f;"\ + COMPUTE_m2(ndim) "subq $2,%7;"\ + #ndim"9993:\n\t"\ + "testq %7,%7; jz "#ndim"9994f;"\ + COMPUTE_m1(ndim)\ + #ndim"9994:\n\t"\ + "movq %%r14,%1; movq %%r13,%5; movq %%r11,%7; vzeroupper;"\ + :"+r"(a_ptr),"+r"(b_ptr),"+r"(c_ptr),"+r"(c_tmp),"+r"(ldc_in_bytes),"+r"(K),"+r"(alp),"+r"(M),"+r"(b_pref)\ + ::"cc","memory","r11","r12","r13","r14","r15","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5",\ + "xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15");\ + a_ptr -= M * K *2; b_ptr += ndim * K *2; c_ptr += (ndim * LDC - M) * 2;\ +} + +int __attribute__ ((noinline)) +CNAME(BLASLONG m, BLASLONG n, BLASLONG k, double alphar, double alphai, double * __restrict__ A, double * __restrict__ B, double * __restrict__ C, BLASLONG LDC) +{ + if(m==0||n==0||k==0||(alphar==0.0 && alphai==0.0)) return 0; + int64_t ldc_in_bytes = (int64_t)LDC * sizeof(double) * 2; +#if A_CONJ == B_CONJ + double const_val[2] = {-alphar, -alphai}; +#else + double const_val[2] = {alphar, alphai}; +#endif + int64_t M = (int64_t)m, K = (int64_t)k; + BLASLONG n_count = n; + double *a_ptr = A,*b_ptr = B,*c_ptr = C,*c_tmp = C,*alp = const_val,*b_pref = B; + for(;n_count>5;n_count-=6) COMPUTE(6) + for(;n_count>3;n_count-=4) COMPUTE(4) + for(;n_count>1;n_count-=2) COMPUTE(2) + if(n_count>0) COMPUTE(1) + return 0; +} diff --git a/param.h b/param.h index d39fc4a1d..4084c781d 100644 --- a/param.h +++ b/param.h @@ -668,8 +668,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define SGEMM_DEFAULT_P 768 #define DGEMM_DEFAULT_P 512 -#define CGEMM_DEFAULT_P 384 -#define ZGEMM_DEFAULT_P 256 +#define CGEMM_DEFAULT_P 256 +#define ZGEMM_DEFAULT_P 192 #ifdef WINDOWS_ABI #define SGEMM_DEFAULT_Q 320 @@ -678,8 +678,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SGEMM_DEFAULT_Q 384 #define DGEMM_DEFAULT_Q 256 #endif -#define CGEMM_DEFAULT_Q 192 -#define ZGEMM_DEFAULT_Q 128 +#define CGEMM_DEFAULT_Q 256 +#define ZGEMM_DEFAULT_Q 192 #define SGEMM_DEFAULT_R sgemm_r #define DGEMM_DEFAULT_R 13824 @@ -693,15 +693,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define XGEMM_DEFAULT_R xgemm_r #define XGEMM_DEFAULT_Q 128 -#define CGEMM3M_DEFAULT_UNROLL_N 8 -#define CGEMM3M_DEFAULT_UNROLL_M 4 +#define CGEMM3M_DEFAULT_UNROLL_N 4 +#define CGEMM3M_DEFAULT_UNROLL_M 8 #define ZGEMM3M_DEFAULT_UNROLL_N 8 #define ZGEMM3M_DEFAULT_UNROLL_M 2 -#define CGEMM3M_DEFAULT_P 448 +#define CGEMM3M_DEFAULT_P 320 #define ZGEMM3M_DEFAULT_P 224 #define XGEMM3M_DEFAULT_P 112 -#define CGEMM3M_DEFAULT_Q 224 +#define CGEMM3M_DEFAULT_Q 320 #define ZGEMM3M_DEFAULT_Q 224 #define XGEMM3M_DEFAULT_Q 224 #define CGEMM3M_DEFAULT_R 12288 @@ -1571,8 +1571,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SGEMM_DEFAULT_P 768 #define DGEMM_DEFAULT_P 512 -#define CGEMM_DEFAULT_P 384 -#define ZGEMM_DEFAULT_P 256 +#define CGEMM_DEFAULT_P 256 +#define ZGEMM_DEFAULT_P 192 #ifdef WINDOWS_ABI #define SGEMM_DEFAULT_Q 320 @@ -1581,8 +1581,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SGEMM_DEFAULT_Q 384 #define DGEMM_DEFAULT_Q 256 #endif -#define CGEMM_DEFAULT_Q 192 -#define ZGEMM_DEFAULT_Q 128 +#define CGEMM_DEFAULT_Q 256 +#define ZGEMM_DEFAULT_Q 192 #define SGEMM_DEFAULT_R sgemm_r #define DGEMM_DEFAULT_R 13824 @@ -1596,15 +1596,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define XGEMM_DEFAULT_R xgemm_r #define XGEMM_DEFAULT_Q 128 -#define CGEMM3M_DEFAULT_UNROLL_N 8 -#define CGEMM3M_DEFAULT_UNROLL_M 4 +#define CGEMM3M_DEFAULT_UNROLL_N 4 +#define CGEMM3M_DEFAULT_UNROLL_M 8 #define ZGEMM3M_DEFAULT_UNROLL_N 8 #define ZGEMM3M_DEFAULT_UNROLL_M 2 -#define CGEMM3M_DEFAULT_P 448 +#define CGEMM3M_DEFAULT_P 320 #define ZGEMM3M_DEFAULT_P 224 #define XGEMM3M_DEFAULT_P 112 -#define CGEMM3M_DEFAULT_Q 224 +#define CGEMM3M_DEFAULT_Q 320 #define ZGEMM3M_DEFAULT_Q 224 #define XGEMM3M_DEFAULT_Q 224 #define CGEMM3M_DEFAULT_R 12288
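The param.h changes line up with the comments at the top of the new kernels: CGEMM moves to P=256/Q=256 and ZGEMM to P=192/Q=192 (the "recommended settings" noted in cgemm_kernel_8x2_haswell.c and zgemm_kernel_4x2_haswell.c), CGEMM3M moves to P=320/Q=320, and the CGEMM3M unroll factors are swapped from 4x8 to 8x4 to match the new kernel's micro-tile. Roughly, the P/Q defaults are the cache-blocking sizes of the packed panels and UNROLL_M/UNROLL_N are the register-tile dimensions of the micro-kernel. The sketch below is only a schematic of how such parameters tile the loops, under the assumption of the usual blocked-GEMM structure; the real OpenBLAS level-3 driver additionally packs A and B into aligned buffers and splits the blocks across threads.

/* Loop-tiling schematic for the blocking parameters (illustration only). */
#define GEMM_P   256  /* rows of the packed A block        (CGEMM_DEFAULT_P) */
#define GEMM_Q   256  /* shared depth of the packed blocks (CGEMM_DEFAULT_Q) */
#define UNROLL_M   8  /* register-tile rows of the micro-kernel              */
#define UNROLL_N   2  /* register-tile columns of the micro-kernel           */

static void gemm_blocking_schematic(int m, int n, int k)
{
    for (int ks = 0; ks < k; ks += GEMM_Q) {                /* depth blocking */
        int kb = (k - ks < GEMM_Q) ? (k - ks) : GEMM_Q;     /* pack B(ks..)   */
        for (int ms = 0; ms < m; ms += GEMM_P) {            /* row blocking   */
            int mb = (m - ms < GEMM_P) ? (m - ms) : GEMM_P; /* pack A: mb x kb */
            for (int j = 0; j < n; j += UNROLL_N)           /* kernel columns */
                for (int i = 0; i < mb; i += UNROLL_M)      /* kernel rows    */
                    (void)kb; /* micro-kernel: UNROLL_M x UNROLL_N tile, kb-deep FMAs */
        }
    }
}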