Merge pull request #21 from xianyi/develop

rebase
Martin Kroeker 2019-12-29 18:08:55 +01:00 committed by GitHub
commit 0257f26488
67 changed files with 10555 additions and 1889 deletions


@@ -171,3 +171,11 @@ In chronological order:
* [2019-02-01] added missing Blas Level-1,2 (single precision) simd codes
* [2019-03-14] power9 dgemm/dtrmm kernel
* [2019-04-29] power9 sgemm/strmm kernel
* Jiachen Wang <https://github.com/wjc404>
* [2019-07-29] optimize AVX2 DGEMM
* [2019-10-20] AVX512 DGEMM kernel (4x8)
* [2019-11-06] optimize AVX512 SGEMM
* [2019-11-12] AVX512 CGEMM & ZGEMM kernels
* [2019-12-23] optimize AVX2 CGEMM and ZGEMM
* [2019-12-27] AVX2 CGEMM3M kernel


@@ -39,7 +39,10 @@ CCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99
FCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99
endif
ifeq ($(GCCVERSIONGTEQ9), 1)
ifeq ($(CORE), TSV110)
CCOMMON_OPT += -march=armv8.2-a -mtune=tsv110
FCOMMON_OPT += -march=armv8.2-a -mtune=tsv110
endif
endif


@@ -326,6 +326,7 @@ ifeq ($(C_COMPILER), GCC)
GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4)
GCCVERSIONGT4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 4)
GCCVERSIONGT5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 5)
GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9)
GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7)
ifeq ($(GCCVERSIONGT4), 1)
# GCC Major version > 4
@@ -547,9 +548,14 @@ endif
ifeq ($(ARCH), arm64)
DYNAMIC_CORE = ARMV8
DYNAMIC_CORE += CORTEXA53
DYNAMIC_CORE += CORTEXA57
DYNAMIC_CORE += CORTEXA72
DYNAMIC_CORE += CORTEXA73
DYNAMIC_CORE += FALKOR
DYNAMIC_CORE += THUNDERX
DYNAMIC_CORE += THUNDERX2T99
DYNAMIC_CORE += TSV110
endif
ifeq ($(ARCH), power)


@@ -45,7 +45,11 @@ endif ()
if (DYNAMIC_ARCH)
if (ARM64)
set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99)
set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110)
endif ()
if (POWER)
set(DYNAMIC_CORE POWER6 POWER8 POWER9)
endif ()
if (X86)


@@ -309,6 +309,83 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS
set(ZGEMM_UNROLL_M 4)
set(ZGEMM_UNROLL_N 4)
set(SYMV_P 16)
elseif ("${TCORE}" STREQUAL "TSV110")
file(APPEND ${TARGET_CONF_TEMP}
"#define ARMV8\n"
"#define L1_CODE_SIZE\t65536\n"
"#define L1_CODE_LINESIZE\t64\n"
"#define L1_CODE_ASSOCIATIVE\t4\n"
"#define L1_DATA_SIZE\t65536\n"
"#define L1_DATA_LINESIZE\t64\n"
"#define L1_DATA_ASSOCIATIVE\t4\n"
"#define L2_SIZE\t524288\n"
"#define L2_LINESIZE\t64\n"
"#define L2_ASSOCIATIVE\t8\n"
"#define DTB_DEFAULT_ENTRIES\t64\n"
"#define DTB_SIZE\t4096\n")
set(SGEMM_UNROLL_M 16)
set(SGEMM_UNROLL_N 4)
set(DGEMM_UNROLL_M 8)
set(DGEMM_UNROLL_N 4)
set(CGEMM_UNROLL_M 8)
set(CGEMM_UNROLL_N 4)
set(ZGEMM_UNROLL_M 4)
set(ZGEMM_UNROLL_N 4)
set(SYMV_P 16)
elseif ("${TCORE}" STREQUAL "POWER6")
file(APPEND ${TARGET_CONF_TEMP}
"#define L1_DATA_SIZE 32768\n"
"#define L1_DATA_LINESIZE 128\n"
"#define L2_SIZE 524288\n"
"#define L2_LINESIZE 128 \n"
"#define DTB_DEFAULT_ENTRIES 128\n"
"#define DTB_SIZE 4096\n"
"#define L2_ASSOCIATIVE 8\n")
set(SGEMM_UNROLL_M 4)
set(SGEMM_UNROLL_N 4)
set(DGEMM_UNROLL_M 4)
set(DGEMM_UNROLL_N 4)
set(CGEMM_UNROLL_M 2)
set(CGEMM_UNROLL_N 4)
set(ZGEMM_UNROLL_M 2)
set(ZGEMM_UNROLL_N 4)
set(SYMV_P 8)
elseif ("${TCORE}" STREQUAL "POWER8")
file(APPEND ${TARGET_CONF_TEMP}
"#define L1_DATA_SIZE 32768\n"
"#define L1_DATA_LINESIZE 128\n"
"#define L2_SIZE 524288\n"
"#define L2_LINESIZE 128 \n"
"#define DTB_DEFAULT_ENTRIES 128\n"
"#define DTB_SIZE 4096\n"
"#define L2_ASSOCIATIVE 8\n")
set(SGEMM_UNROLL_M 16)
set(SGEMM_UNROLL_N 8)
set(DGEMM_UNROLL_M 16)
set(DGEMM_UNROLL_N 4)
set(CGEMM_UNROLL_M 8)
set(CGEMM_UNROLL_N 4)
set(ZGEMM_UNROLL_M 8)
set(ZGEMM_UNROLL_N 2)
set(SYMV_P 8)
elseif ("${TCORE}" STREQUAL "POWER9")
file(APPEND ${TARGET_CONF_TEMP}
"#define L1_DATA_SIZE 32768\n"
"#define L1_DATA_LINESIZE 128\n"
"#define L2_SIZE 524288\n"
"#define L2_LINESIZE 128 \n"
"#define DTB_DEFAULT_ENTRIES 128\n"
"#define DTB_SIZE 4096\n"
"#define L2_ASSOCIATIVE 8\n")
set(SGEMM_UNROLL_M 16)
set(SGEMM_UNROLL_N 8)
set(DGEMM_UNROLL_M 16)
set(DGEMM_UNROLL_N 4)
set(CGEMM_UNROLL_M 8)
set(CGEMM_UNROLL_N 4)
set(ZGEMM_UNROLL_M 8)
set(ZGEMM_UNROLL_N 2)
set(SYMV_P 8)
endif()
# Or should this actually be NUM_CORES?


@@ -39,6 +39,35 @@
#ifndef COMMON_POWER
#define COMMON_POWER
#define str(x) #x
#ifdef OS_AIX
#define XXSPLTD(T,A,z) xxpermdi T, A, A, 0b##z##z
#define XXMRGHD(T,A,B) xxpermdi T, A, B, 0b00
#define XXMRGLD(T,A,B) xxpermdi T, A, B, 0b11
#define XXSWAPD(T,A) xxpermdi T, A, A, 0b10
#define XVMOVDP(T,A) xvcpsgndp T, A, A
#define XXSPLTD_S(T,A,z) "xxpermdi " str(T) ", " str(A) ", " str(A) ", 0b" str(z ## z) " \n\t"
#define XXMRGHD_S(T,A,B) "xxpermdi " str(T) ", " str(A) ", " str(B) ", 0b00 \n\t"
#define XXMRGLD_S(T,A,B) "xxpermdi " str(T) ", " str(A) ", " str(B) ", 0b11 \n\t"
#define XXSWAPD_S(T,A) "xxpermdi " str(T) ", " str(A) ", " str(A) ", 0b10 \n\t"
#else
#define XXSPLTD(T,A,z) xxspltd T, A, z
#define XXMRGHD(T,A,B) xxmrghd T, A, B
#define XXMRGLD(T,A,B) xxmrgld T, A, B
#define XXSWAPD(T,A) xxswapd T, A
#define XVMOVDP(T,A) xvmovdp T, A
#define XXSPLTD_S(T,A,z) "xxspltd " str(T) ", " str(A) ", " str(z)" \n\t"
#define XXMRGHD_S(T,A,B) "xxmrghd " str(T) ", " str(A) ", " str(B)" \n\t"
#define XXMRGLD_S(T,A,B) "xxmrgld " str(T) ", " str(A) ", " str(B)" \n\t"
#define XXSWAPD_S(T,A) "xxswapd " str(T) ", " str(A) " \n\t"
#endif
#if defined(POWER8) || defined(POWER9)
#define MB __asm__ __volatile__ ("eieio":::"memory")
#define WMB __asm__ __volatile__ ("eieio":::"memory")
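The AIX branch of the new block rewrites the extended VSX mnemonics (xxspltd, xxmrghd, xxmrgld, xxswapd, xvmovdp) in terms of the base xxpermdi/xvcpsgndp forms, and the *_S variants build the same instruction text as C string literals for use inside inline assembly. A minimal standalone sketch of how such a stringized macro expands (illustrative only, with made-up register operands; it is not part of this patch):

/* sketch: how a stringized macro assembles an inline-asm template */
#include <stdio.h>

#define str(x) #x
#define XXSWAPD_S(T,A) "xxpermdi " str(T) ", " str(A) ", " str(A) ", 0b10 \n\t"

int main(void) {
    /* adjacent string literals concatenate into one template string,
       the text that would be pasted into an __asm__ ("...") block */
    fputs(XXSWAPD_S(vs32, vs33), stdout);   /* prints: xxpermdi vs32, vs33, vs33, 0b10 */
    return 0;
}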


@@ -338,7 +338,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
for(jjs = js; jjs < js + min_j; jjs += min_jj){
min_jj = min_j + js - jjs;
if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N;
if (min_jj > GEMM3M_UNROLL_N*3) min_jj = GEMM3M_UNROLL_N*3;
START_RPCC();
@@ -398,7 +398,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
for(jjs = js; jjs < js + min_j; jjs += min_jj){
min_jj = min_j + js - jjs;
if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N;
if (min_jj > GEMM3M_UNROLL_N*3) min_jj = GEMM3M_UNROLL_N*3;
START_RPCC();
@@ -463,7 +463,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
for(jjs = js; jjs < js + min_j; jjs += min_jj){
min_jj = min_j + js - jjs;
if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N;
if (min_jj > GEMM3M_UNROLL_N*3) min_jj = GEMM3M_UNROLL_N*3;
START_RPCC();
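All three hunks raise the cap on min_jj, the number of columns handled per inner iteration, from GEMM3M_UNROLL_N to GEMM3M_UNROLL_N*3, so each pass packs and multiplies a wider panel. A standalone sketch of that blocking pattern with made-up sizes (not the actual kernel code):

/* sketch: panel-width clamping as used in the loops above */
#include <stdio.h>

#define GEMM3M_UNROLL_N 8   /* illustrative value only */

int main(void) {
    long js = 0, min_j = 50;   /* made-up panel range */
    long jjs, min_jj;

    for (jjs = js; jjs < js + min_j; jjs += min_jj) {
        min_jj = min_j + js - jjs;              /* columns remaining */
        if (min_jj > GEMM3M_UNROLL_N*3)
            min_jj = GEMM3M_UNROLL_N*3;         /* new, wider cap */
        printf("panel starts at column %ld, width %ld\n", jjs, min_jj);
    }
    return 0;
}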


@@ -462,7 +462,7 @@ int BLASFUNC(blas_thread_shutdown)(void){
for(i = 0; i < blas_num_threads - 1; i++){
// Could also just use WaitForMultipleObjects
DWORD wait_thread_value = WaitForSingleObject(blas_threads[i], 5000);
DWORD wait_thread_value = WaitForSingleObject(blas_threads[i], 50);
#ifndef OS_WINDOWSSTORE
// TerminateThread is only available with WINAPI_DESKTOP and WINAPI_SYSTEM not WINAPI_APP in UWP


@@ -586,6 +586,8 @@ static gotoblas_t *get_coretype(void){
}
return NULL;
case 7:
if (model == 10) // Goldmont Plus
return &gotoblas_NEHALEM;
if (model == 14) {
// Ice Lake // Ice Lake
if (support_avx512()) if (support_avx512())


@@ -43,13 +43,18 @@
#endif
extern gotoblas_t gotoblas_ARMV8;
extern gotoblas_t gotoblas_CORTEXA53;
extern gotoblas_t gotoblas_CORTEXA57;
extern gotoblas_t gotoblas_CORTEXA72;
extern gotoblas_t gotoblas_CORTEXA73;
extern gotoblas_t gotoblas_FALKOR;
extern gotoblas_t gotoblas_THUNDERX;
extern gotoblas_t gotoblas_THUNDERX2T99;
extern gotoblas_t gotoblas_TSV110;
extern void openblas_warning(int verbose, const char * msg);
#define NUM_CORETYPES 4
#define NUM_CORETYPES 9
* In case asm/hwcap.h is outdated on the build system, make sure * In case asm/hwcap.h is outdated on the build system, make sure
@@ -65,17 +70,27 @@ extern void openblas_warning(int verbose, const char * msg);
static char *corename[] = {
"armv8",
"cortexa53",
"cortexa57",
"cortexa72",
"cortexa73",
"falkor",
"thunderx",
"thunderx2t99",
"tsv110",
"unknown"
};
char *gotoblas_corename(void) {
if (gotoblas == &gotoblas_ARMV8) return corename[ 0];
if (gotoblas == &gotoblas_CORTEXA57) return corename[ 1];
if (gotoblas == &gotoblas_THUNDERX) return corename[ 2];
if (gotoblas == &gotoblas_THUNDERX2T99) return corename[ 3];
if (gotoblas == &gotoblas_CORTEXA53) return corename[ 1];
if (gotoblas == &gotoblas_CORTEXA57) return corename[ 2];
if (gotoblas == &gotoblas_CORTEXA72) return corename[ 3];
if (gotoblas == &gotoblas_CORTEXA73) return corename[ 4];
if (gotoblas == &gotoblas_FALKOR) return corename[ 5];
if (gotoblas == &gotoblas_THUNDERX) return corename[ 6];
if (gotoblas == &gotoblas_THUNDERX2T99) return corename[ 7];
if (gotoblas == &gotoblas_TSV110) return corename[ 8];
return corename[NUM_CORETYPES];
}
@@ -96,9 +111,14 @@ static gotoblas_t *force_coretype(char *coretype) {
switch (found)
{
case 0: return (&gotoblas_ARMV8);
case 1: return (&gotoblas_CORTEXA57);
case 2: return (&gotoblas_THUNDERX);
case 3: return (&gotoblas_THUNDERX2T99);
case 1: return (&gotoblas_CORTEXA53);
case 2: return (&gotoblas_CORTEXA57);
case 3: return (&gotoblas_CORTEXA72);
case 4: return (&gotoblas_CORTEXA73);
case 5: return (&gotoblas_FALKOR);
case 6: return (&gotoblas_THUNDERX);
case 7: return (&gotoblas_THUNDERX2T99);
case 8: return (&gotoblas_TSV110);
}
snprintf(message, 128, "Core not found: %s\n", coretype);
openblas_warning(1, message);
@@ -136,10 +156,14 @@ static gotoblas_t *get_coretype(void) {
case 0x41: // ARM
switch (part)
{
case 0xd07: // Cortex A57
case 0xd08: // Cortex A72
case 0xd03: // Cortex A53
return &gotoblas_CORTEXA53;
case 0xd07: // Cortex A57
return &gotoblas_CORTEXA57;
case 0xd08: // Cortex A72
return &gotoblas_CORTEXA72;
case 0xd09: // Cortex A73
return &gotoblas_CORTEXA73;
}
break;
case 0x42: // Broadcom
@@ -158,6 +182,20 @@ static gotoblas_t *get_coretype(void) {
return &gotoblas_THUNDERX2T99;
}
break;
case 0x48: // HiSilicon
switch (part)
{
case 0xd01: // tsv110
return &gotoblas_TSV110;
}
break;
case 0x51: // Qualcomm
switch (part)
{
case 0xc00: // Falkor
return &gotoblas_FALKOR;
}
break;
}
return NULL;
}
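The new cases extend the MIDR-based dispatch: implementer 0x48 (HiSilicon) with part 0xd01 selects the TSV110 kernels, and implementer 0x51 (Qualcomm) with part 0xc00 selects the Falkor kernels. As a rough illustration (not OpenBLAS code, using a made-up value rather than a real MIDR_EL1 read), the implementer and part numbers are plain bit fields of MIDR:

/* sketch: decode the ARMv8 MIDR fields that the dispatch above switches on */
#include <stdio.h>

int main(void) {
    unsigned long midr = 0x481fd010UL;            /* hypothetical TSV110-like value */
    unsigned implementer = (midr >> 24) & 0xff;   /* 0x41 ARM, 0x48 HiSilicon, 0x51 Qualcomm, ... */
    unsigned part        = (midr >> 4) & 0xfff;   /* 0xd01 tsv110, 0xd03 Cortex-A53, 0xc00 Falkor, ... */

    printf("implementer 0x%02x, part 0x%03x\n", implementer, part);
    return 0;
}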

dynamic.c

@@ -1,897 +0,0 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#include "common.h"
#ifdef _MSC_VER
#define strncasecmp _strnicmp
#define strcasecmp _stricmp
#endif
#ifdef ARCH_X86
#define EXTERN extern
#else
#define EXTERN
#endif
#ifdef DYNAMIC_LIST
extern gotoblas_t gotoblas_PRESCOTT;
#ifdef DYN_ATHLON
extern gotoblas_t gotoblas_ATHLON;
#else
#define gotoblas_ATHLON gotoblas_PRESCOTT
#endif
#ifdef DYN_KATMAI
extern gotoblas_t gotoblas_KATMAI;
#else
#define gotoblas_KATMAI gotoblas_PRESCOTT
#endif
#ifdef DYN_BANIAS
extern gotoblas_t gotoblas_BANIAS;
#else
#define gotoblas_BANIAS gotoblas_PRESCOTT
#endif
#ifdef DYN_COPPERMINE
extern gotoblas_t gotoblas_COPPERMINE;
#else
#define gotoblas_COPPERMINE gotoblas_PRESCOTT
#endif
#ifdef DYN_NORTHWOOD
extern gotoblas_t gotoblas_NORTHWOOD;
#else
#define gotoblas_NORTHWOOD gotoblas_PRESCOTT
#endif
#ifdef DYN_CORE2
extern gotoblas_t gotoblas_CORE2;
#else
#define gotoblas_CORE2 gotoblas_PRESCOTT
#endif
#ifdef DYN_NEHALEM
extern gotoblas_t gotoblas_NEHALEM;
#else
#define gotoblas_NEHALEM gotoblas_PRESCOTT
#endif
#ifdef DYN_BARCELONA
extern gotoblas_t gotoblas_BARCELONA;
#elif defined(DYN_NEHALEM)
#define gotoblas_BARCELONA gotoblas_NEHALEM
#else
#define gotoblas_BARCELONA gotoblas_PRESCOTT
#endif
#ifdef DYN_ATOM
extern gotoblas_t gotoblas_ATOM;
#elif defined(DYN_NEHALEM)
#define gotoblas_ATOM gotoblas_NEHALEM
#else
#define gotoblas_ATOM gotoblas_PRESCOTT
#endif
#ifdef DYN_NANO
extern gotoblas_t gotoblas_NANO;
#else
#define gotoblas_NANO gotoblas_PRESCOTT
#endif
#ifdef DYN_PENRYN
extern gotoblas_t gotoblas_PENRYN;
#else
#define gotoblas_PENRYN gotoblas_PRESCOTT
#endif
#ifdef DYN_DUNNINGTON
extern gotoblas_t gotoblas_DUNNINGTON;
#else
#define gotoblas_DUNNINGTON gotoblas_PRESCOTT
#endif
#ifdef DYN_OPTERON
extern gotoblas_t gotoblas_OPTERON;
#else
#define gotoblas_OPTERON gotoblas_PRESCOTT
#endif
#ifdef DYN_OPTERON_SSE3
extern gotoblas_t gotoblas_OPTERON_SSE3;
#else
#define gotoblas_OPTERON_SSE3 gotoblas_PRESCOTT
#endif
#ifdef DYN_BOBCAT
extern gotoblas_t gotoblas_BOBCAT;
#elif defined(DYN_NEHALEM)
#define gotoblas_BOBCAT gotoblas_NEHALEM
#else
#define gotoblas_BOBCAT gotoblas_PRESCOTT
#endif
#ifdef DYN_SANDYBRIDGE
extern gotoblas_t gotoblas_SANDYBRIDGE;
#elif defined(DYN_NEHALEM)
#define gotoblas_SANDYBRIDGE gotoblas_NEHALEM
#else
#define gotoblas_SANDYBRIDGE gotoblas_PRESCOTT
#endif
#ifdef DYN_BULLDOZER
extern gotoblas_t gotoblas_BULLDOZER;
#elif defined(DYN_SANDYBRIDGE)
#define gotoblas_BULLDOZER gotoblas_SANDYBRIDGE
#elif defined(DYN_NEHALEM)
#define gotoblas_BULLDOZER gotoblas_NEHALEM
#else
#define gotoblas_BULLDOZER gotoblas_PRESCOTT
#endif
#ifdef DYN_PILEDRIVER
extern gotoblas_t gotoblas_PILEDRIVER;
#elif defined(DYN_SANDYBRIDGE)
#define gotoblas_PILEDRIVER gotoblas_SANDYBRIDGE
#elif defined(DYN_NEHALEM)
#define gotoblas_PILEDRIVER gotoblas_NEHALEM
#else
#define gotoblas_PILEDRIVER gotoblas_PRESCOTT
#endif
#ifdef DYN_STEAMROLLER
extern gotoblas_t gotoblas_STEAMROLLER;
#elif defined(DYN_SANDYBRIDGE)
#define gotoblas_STEAMROLLER gotoblas_SANDYBRIDGE
#elif defined(DYN_NEHALEM)
#define gotoblas_STEAMROLLER gotoblas_NEHALEM
#else
#define gotoblas_STEAMROLLER gotoblas_PRESCOTT
#endif
#ifdef DYN_EXCAVATOR
extern gotoblas_t gotoblas_EXCAVATOR;
#elif defined(DYN_SANDYBRIDGE)
#define gotoblas_EXCAVATOR gotoblas_SANDYBRIDGE
#elif defined(DYN_NEHALEM)
#define gotoblas_EXCAVATOR gotoblas_NEHALEM
#else
#define gotoblas_EXCAVATOR gotoblas_PRESCOTT
#endif
#ifdef DYN_HASWELL
extern gotoblas_t gotoblas_HASWELL;
#elif defined(DYN_SANDYBRIDGE)
#define gotoblas_HASWELL gotoblas_SANDYBRIDGE
#elif defined(DYN_NEHALEM)
#define gotoblas_HASWELL gotoblas_NEHALEM
#else
#define gotoblas_HASWELL gotoblas_PRESCOTT
#endif
#ifdef DYN_ZEN
extern gotoblas_t gotoblas_ZEN;
#elif defined(DYN_HASWELL)
#define gotoblas_ZEN gotoblas_HASWELL
#elif defined(DYN_SANDYBRIDGE)
#define gotoblas_ZEN gotoblas_SANDYBRIDGE
#elif defined(DYN_NEHALEM)
#define gotoblas_ZEN gotoblas_NEHALEM
#else
#define gotoblas_ZEN gotoblas_PRESCOTT
#endif
#ifdef DYN_SKYLAKEX
extern gotoblas_t gotoblas_SKYLAKEX;
#elif defined(DYN_HASWELL)
#define gotoblas_SKYLAKEX gotoblas_HASWELL
#elif defined(DYN_SANDYBRIDGE)
#define gotoblas_SKYLAKEX gotoblas_SANDYBRIDGE
#elif defined(DYN_NEHALEM)
#define gotoblas_SKYLAKEX gotoblas_NEHALEM
#else
#define gotoblas_SKYLAKEX gotoblas_PRESCOTT
#endif
#else // not DYNAMIC_LIST
EXTERN gotoblas_t gotoblas_KATMAI;
EXTERN gotoblas_t gotoblas_COPPERMINE;
EXTERN gotoblas_t gotoblas_NORTHWOOD;
EXTERN gotoblas_t gotoblas_BANIAS;
EXTERN gotoblas_t gotoblas_ATHLON;
extern gotoblas_t gotoblas_PRESCOTT;
extern gotoblas_t gotoblas_CORE2;
extern gotoblas_t gotoblas_NEHALEM;
extern gotoblas_t gotoblas_BARCELONA;
#ifdef DYNAMIC_OLDER
extern gotoblas_t gotoblas_ATOM;
extern gotoblas_t gotoblas_NANO;
extern gotoblas_t gotoblas_PENRYN;
extern gotoblas_t gotoblas_DUNNINGTON;
extern gotoblas_t gotoblas_OPTERON;
extern gotoblas_t gotoblas_OPTERON_SSE3;
extern gotoblas_t gotoblas_BOBCAT;
#else
#define gotoblas_ATOM gotoblas_NEHALEM
#define gotoblas_NANO gotoblas_NEHALEM
#define gotoblas_PENRYN gotoblas_CORE2
#define gotoblas_DUNNINGTON gotoblas_CORE2
#define gotoblas_OPTERON gotoblas_CORE2
#define gotoblas_OPTERON_SSE3 gotoblas_CORE2
#define gotoblas_BOBCAT gotoblas_CORE2
#endif
#ifndef NO_AVX
extern gotoblas_t gotoblas_SANDYBRIDGE;
extern gotoblas_t gotoblas_BULLDOZER;
extern gotoblas_t gotoblas_PILEDRIVER;
extern gotoblas_t gotoblas_STEAMROLLER;
extern gotoblas_t gotoblas_EXCAVATOR;
#ifdef NO_AVX2
#define gotoblas_HASWELL gotoblas_SANDYBRIDGE
#define gotoblas_SKYLAKEX gotoblas_SANDYBRIDGE
#define gotoblas_ZEN gotoblas_SANDYBRIDGE
#else
extern gotoblas_t gotoblas_HASWELL;
extern gotoblas_t gotoblas_ZEN;
#ifndef NO_AVX512
extern gotoblas_t gotoblas_SKYLAKEX;
#else
#define gotoblas_SKYLAKEX gotoblas_HASWELL
#endif
#endif
#else
//Use NEHALEM kernels for sandy bridge
#define gotoblas_SANDYBRIDGE gotoblas_NEHALEM
#define gotoblas_HASWELL gotoblas_NEHALEM
#define gotoblas_SKYLAKEX gotoblas_NEHALEM
#define gotoblas_BULLDOZER gotoblas_BARCELONA
#define gotoblas_PILEDRIVER gotoblas_BARCELONA
#define gotoblas_STEAMROLLER gotoblas_BARCELONA
#define gotoblas_EXCAVATOR gotoblas_BARCELONA
#define gotoblas_ZEN gotoblas_BARCELONA
#endif
#endif // DYNAMIC_LIST
#define VENDOR_INTEL 1
#define VENDOR_AMD 2
#define VENDOR_CENTAUR 3
#define VENDOR_HYGON 4
#define VENDOR_UNKNOWN 99
#define BITMASK(a, b, c) ((((a) >> (b)) & (c)))
#ifndef NO_AVX
static inline void xgetbv(int op, int * eax, int * edx){
//Use binary code for xgetbv
__asm__ __volatile__
(".byte 0x0f, 0x01, 0xd0": "=a" (*eax), "=d" (*edx) : "c" (op) : "cc");
}
#endif
int support_avx(){
#ifndef NO_AVX
int eax, ebx, ecx, edx;
int ret=0;
cpuid(1, &eax, &ebx, &ecx, &edx);
if ((ecx & (1 << 28)) != 0 && (ecx & (1 << 27)) != 0 && (ecx & (1 << 26)) != 0){
xgetbv(0, &eax, &edx);
if((eax & 6) == 6){
ret=1; //OS support AVX
}
}
return ret;
#else
return 0;
#endif
}
int support_avx2(){
#ifndef NO_AVX2
int eax, ebx, ecx=0, edx;
int ret=0;
if (!support_avx())
return 0;
cpuid(7, &eax, &ebx, &ecx, &edx);
if((ebx & (1<<7)) != 0)
ret=1; //OS supports AVX2
return ret;
#else
return 0;
#endif
}
int support_avx512(){
#if !defined(NO_AVX) && !defined(NO_AVX512)
int eax, ebx, ecx, edx;
int ret=0;
if (!support_avx())
return 0;
cpuid(7, &eax, &ebx, &ecx, &edx);
if((ebx & (1<<7)) != 1){
ret=0; //OS does not even support AVX2
}
if((ebx & (1<<31)) != 0){
xgetbv(0, &eax, &edx);
if((eax & 0xe0) == 0xe0)
ret=1; //OS supports AVX512VL
}
return ret;
#else
return 0;
#endif
}
extern void openblas_warning(int verbose, const char * msg);
#define FALLBACK_VERBOSE 1
#define NEHALEM_FALLBACK "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n"
#define SANDYBRIDGE_FALLBACK "OpenBLAS : Your OS does not support AVX2 instructions. OpenBLAS is using Sandybridge kernels as a fallback, which may give poorer performance.\n"
#define HASWELL_FALLBACK "OpenBLAS : Your OS does not support AVX512VL instructions. OpenBLAS is using Haswell kernels as a fallback, which may give poorer performance.\n"
#define BARCELONA_FALLBACK "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Barcelona kernels as a fallback, which may give poorer performance.\n"
static int get_vendor(void){
int eax, ebx, ecx, edx;
union
{
char vchar[16];
int vint[4];
} vendor;
cpuid(0, &eax, &ebx, &ecx, &edx);
*(&vendor.vint[0]) = ebx;
*(&vendor.vint[1]) = edx;
*(&vendor.vint[2]) = ecx;
vendor.vchar[12] = '\0';
if (!strcmp(vendor.vchar, "GenuineIntel")) return VENDOR_INTEL;
if (!strcmp(vendor.vchar, "AuthenticAMD")) return VENDOR_AMD;
if (!strcmp(vendor.vchar, "CentaurHauls")) return VENDOR_CENTAUR;
if (!strcmp(vendor.vchar, "HygonGenuine")) return VENDOR_HYGON;
if ((eax == 0) || ((eax & 0x500) != 0)) return VENDOR_INTEL;
return VENDOR_UNKNOWN;
}
static gotoblas_t *get_coretype(void){
int eax, ebx, ecx, edx;
int family, exfamily, model, vendor, exmodel;
cpuid(1, &eax, &ebx, &ecx, &edx);
family = BITMASK(eax, 8, 0x0f);
exfamily = BITMASK(eax, 20, 0xff);
model = BITMASK(eax, 4, 0x0f);
exmodel = BITMASK(eax, 16, 0x0f);
vendor = get_vendor();
if (vendor == VENDOR_INTEL){
switch (family) {
case 0x6:
switch (exmodel) {
case 0:
if (model <= 0x7) return &gotoblas_KATMAI;
if ((model == 0x8) || (model == 0xa) || (model == 0xb)) return &gotoblas_COPPERMINE;
if ((model == 0x9) || (model == 0xd)) return &gotoblas_BANIAS;
if (model == 14) return &gotoblas_BANIAS;
if (model == 15) return &gotoblas_CORE2;
return NULL;
case 1:
if (model == 6) return &gotoblas_CORE2;
if (model == 7) return &gotoblas_PENRYN;
if (model == 13) return &gotoblas_DUNNINGTON;
if ((model == 10) || (model == 11) || (model == 14) || (model == 15)) return &gotoblas_NEHALEM;
if (model == 12) return &gotoblas_ATOM;
return NULL;
case 2:
//Intel Core (Clarkdale) / Core (Arrandale)
// Pentium (Clarkdale) / Pentium Mobile (Arrandale)
// Xeon (Clarkdale), 32nm
if (model == 5) return &gotoblas_NEHALEM;
//Intel Xeon Processor 5600 (Westmere-EP)
//Xeon Processor E7 (Westmere-EX)
//Xeon E7540
if (model == 12 || model == 14 || model == 15) return &gotoblas_NEHALEM;
//Intel Core i5-2000 /i7-2000 (Sandy Bridge)
//Intel Core i7-3000 / Xeon E5
if (model == 10 || model == 13) {
if(support_avx())
return &gotoblas_SANDYBRIDGE;
else{
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
}
}
return NULL;
case 3:
//Intel Sandy Bridge 22nm (Ivy Bridge?)
if (model == 10 || model == 14) {
if(support_avx())
return &gotoblas_SANDYBRIDGE;
else{
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
}
}
//Intel Haswell
if (model == 12 || model == 15) {
if(support_avx2())
return &gotoblas_HASWELL;
if(support_avx()) {
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
return &gotoblas_SANDYBRIDGE;
} else {
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
}
}
//Intel Broadwell
if (model == 13) {
if(support_avx2())
return &gotoblas_HASWELL;
if(support_avx()) {
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
return &gotoblas_SANDYBRIDGE;
} else {
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
}
}
if (model == 7) return &gotoblas_ATOM; //Bay Trail
return NULL;
case 4:
//Intel Haswell
if (model == 5 || model == 6) {
if(support_avx2())
return &gotoblas_HASWELL;
if(support_avx()) {
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
return &gotoblas_SANDYBRIDGE;
} else {
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
}
}
//Intel Broadwell
if (model == 7 || model == 15) {
if(support_avx2())
return &gotoblas_HASWELL;
if(support_avx()) {
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
return &gotoblas_SANDYBRIDGE;
} else {
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
}
}
//Intel Skylake
if (model == 14) {
if(support_avx2())
return &gotoblas_HASWELL;
if(support_avx()) {
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
return &gotoblas_SANDYBRIDGE;
} else {
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
}
}
//Intel Braswell / Avoton
if (model == 12 || model == 13) {
return &gotoblas_NEHALEM;
}
return NULL;
case 5:
//Intel Broadwell
if (model == 6) {
if(support_avx2())
return &gotoblas_HASWELL;
if(support_avx()) {
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
return &gotoblas_SANDYBRIDGE;
} else {
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
}
}
if (model == 5) {
// Intel Skylake X
if (support_avx512())
return &gotoblas_SKYLAKEX;
if(support_avx2()){
openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK);
return &gotoblas_HASWELL;
}
if(support_avx()) {
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
return &gotoblas_SANDYBRIDGE;
} else {
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
return &gotoblas_NEHALEM;
}
}
//Intel Skylake
if (model == 14) {
if(support_avx2())
return &gotoblas_HASWELL;
if(support_avx()) {
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
return &gotoblas_SANDYBRIDGE;
} else {
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
}
}
//Intel Phi Knights Landing
if (model == 7) {
if(support_avx2()){
openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK);
return &gotoblas_HASWELL;
}
if(support_avx()) {
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
return &gotoblas_SANDYBRIDGE;
} else {
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
}
}
//Apollo Lake or Denverton
if (model == 12 || model == 15) {
return &gotoblas_NEHALEM;
}
return NULL;
case 6:
if (model == 6) {
// Cannon Lake
if(support_avx2())
return &gotoblas_HASWELL;
if(support_avx()) {
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
return &gotoblas_SANDYBRIDGE;
} else {
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
return &gotoblas_NEHALEM;
}
}
return NULL;
case 7:
if (model == 10) // Goldmont plus
return &gotoblas_NEHALEM;
if (model == 14) {
// Ice Lake
if (support_avx512())
return &gotoblas_SKYLAKEX;
if(support_avx2()){
openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK);
return &gotoblas_HASWELL;
}
if(support_avx()) {
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
return &gotoblas_SANDYBRIDGE;
} else {
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
return &gotoblas_NEHALEM;
}
}
return NULL;
case 9:
case 8:
if (model == 14 ) { // Kaby Lake, Coffee Lake
if(support_avx2())
return &gotoblas_HASWELL;
if(support_avx()) {
openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
return &gotoblas_SANDYBRIDGE;
} else {
openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
}
}
return NULL;
}
case 0xf:
if (model <= 0x2) return &gotoblas_NORTHWOOD;
return &gotoblas_PRESCOTT;
}
}
if (vendor == VENDOR_AMD || vendor == VENDOR_HYGON){
if (family <= 0xe) {
// Verify that CPU has 3dnow and 3dnowext before claiming it is Athlon
cpuid(0x80000000, &eax, &ebx, &ecx, &edx);
if ( (eax & 0xffff) >= 0x01) {
cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
if ((edx & (1 << 30)) == 0 || (edx & (1 << 31)) == 0)
return NULL;
}
else
return NULL;
return &gotoblas_ATHLON;
}
if (family == 0xf){
if ((exfamily == 0) || (exfamily == 2)) {
if (ecx & (1 << 0)) return &gotoblas_OPTERON_SSE3;
else return &gotoblas_OPTERON;
} else if (exfamily == 5) {
return &gotoblas_BOBCAT;
} else if (exfamily == 6) {
if(model == 1){
//AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series
if(support_avx())
return &gotoblas_BULLDOZER;
else{
openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
}
}else if(model == 2 || model == 3){
//AMD Bulldozer Opteron 6300 / Opteron 4300 / Opteron 3300
if(support_avx())
return &gotoblas_PILEDRIVER;
else{
openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
}
}else if(model == 5){
if(support_avx())
return &gotoblas_EXCAVATOR;
else{
openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
}
}else if(model == 0 || model == 8){
if (exmodel == 1) {
//AMD Trinity
if(support_avx())
return &gotoblas_PILEDRIVER;
else{
openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
}
}else if (exmodel == 3) {
//AMD STEAMROLLER
if(support_avx())
return &gotoblas_STEAMROLLER;
else{
openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
}
}else if (exmodel == 6) {
if(support_avx())
return &gotoblas_EXCAVATOR;
else{
openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
}
}
}
} else if (exfamily == 8) {
if (model == 1 || model == 8) {
if(support_avx())
return &gotoblas_ZEN;
else{
openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
}
}
} else if (exfamily == 9) {
if(support_avx())
return &gotoblas_ZEN;
else{
openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
}
}else {
return &gotoblas_BARCELONA;
}
}
}
if (vendor == VENDOR_CENTAUR) {
switch (family) {
case 0x6:
return &gotoblas_NANO;
}
}
return NULL;
}
static char *corename[] = {
"Unknown",
"Katmai",
"Coppermine",
"Northwood",
"Prescott",
"Banias",
"Atom",
"Core2",
"Penryn",
"Dunnington",
"Nehalem",
"Athlon",
"Opteron",
"Opteron_SSE3",
"Barcelona",
"Nano",
"Sandybridge",
"Bobcat",
"Bulldozer",
"Piledriver",
"Haswell",
"Steamroller",
"Excavator",
"Zen",
"SkylakeX"
};
char *gotoblas_corename(void) {
if (gotoblas == &gotoblas_KATMAI) return corename[ 1];
if (gotoblas == &gotoblas_COPPERMINE) return corename[ 2];
if (gotoblas == &gotoblas_NORTHWOOD) return corename[ 3];
if (gotoblas == &gotoblas_PRESCOTT) return corename[ 4];
if (gotoblas == &gotoblas_BANIAS) return corename[ 5];
if (gotoblas == &gotoblas_ATOM) return corename[ 6];
if (gotoblas == &gotoblas_CORE2) return corename[ 7];
if (gotoblas == &gotoblas_PENRYN) return corename[ 8];
if (gotoblas == &gotoblas_DUNNINGTON) return corename[ 9];
if (gotoblas == &gotoblas_NEHALEM) return corename[10];
if (gotoblas == &gotoblas_ATHLON) return corename[11];
if (gotoblas == &gotoblas_OPTERON_SSE3) return corename[12];
if (gotoblas == &gotoblas_OPTERON) return corename[13];
if (gotoblas == &gotoblas_BARCELONA) return corename[14];
if (gotoblas == &gotoblas_NANO) return corename[15];
if (gotoblas == &gotoblas_SANDYBRIDGE) return corename[16];
if (gotoblas == &gotoblas_BOBCAT) return corename[17];
if (gotoblas == &gotoblas_BULLDOZER) return corename[18];
if (gotoblas == &gotoblas_PILEDRIVER) return corename[19];
if (gotoblas == &gotoblas_HASWELL) return corename[20];
if (gotoblas == &gotoblas_STEAMROLLER) return corename[21];
if (gotoblas == &gotoblas_EXCAVATOR) return corename[22];
if (gotoblas == &gotoblas_ZEN) return corename[23];
if (gotoblas == &gotoblas_SKYLAKEX) return corename[24];
return corename[0];
}
static gotoblas_t *force_coretype(char *coretype){
int i ;
int found = -1;
char message[128];
//char mname[20];
for ( i=1 ; i <= 24; i++)
{
if (!strncasecmp(coretype,corename[i],20))
{
found = i;
break;
}
}
if (found < 0)
{
//strncpy(mname,coretype,20);
snprintf(message, 128, "Core not found: %s\n",coretype);
openblas_warning(1, message);
return(NULL);
}
switch (found)
{
case 24: return (&gotoblas_SKYLAKEX);
case 23: return (&gotoblas_ZEN);
case 22: return (&gotoblas_EXCAVATOR);
case 21: return (&gotoblas_STEAMROLLER);
case 20: return (&gotoblas_HASWELL);
case 19: return (&gotoblas_PILEDRIVER);
case 18: return (&gotoblas_BULLDOZER);
case 17: return (&gotoblas_BOBCAT);
case 16: return (&gotoblas_SANDYBRIDGE);
case 15: return (&gotoblas_NANO);
case 14: return (&gotoblas_BARCELONA);
case 13: return (&gotoblas_OPTERON);
case 12: return (&gotoblas_OPTERON_SSE3);
case 11: return (&gotoblas_ATHLON);
case 10: return (&gotoblas_NEHALEM);
case 9: return (&gotoblas_DUNNINGTON);
case 8: return (&gotoblas_PENRYN);
case 7: return (&gotoblas_CORE2);
case 6: return (&gotoblas_ATOM);
case 5: return (&gotoblas_BANIAS);
case 4: return (&gotoblas_PRESCOTT);
case 3: return (&gotoblas_NORTHWOOD);
case 2: return (&gotoblas_COPPERMINE);
case 1: return (&gotoblas_KATMAI);
}
return(NULL);
}
void gotoblas_dynamic_init(void) {
char coremsg[128];
char coren[22];
char *p;
if (gotoblas) return;
p = getenv("OPENBLAS_CORETYPE");
if ( p )
{
gotoblas = force_coretype(p);
}
else
{
gotoblas = get_coretype();
}
#ifdef ARCH_X86
if (gotoblas == NULL) gotoblas = &gotoblas_KATMAI;
#else
if (gotoblas == NULL) gotoblas = &gotoblas_PRESCOTT;
/* sanity check, if 64bit pointer we can't have a 32 bit cpu */
if (sizeof(void*) == 8) {
if (gotoblas == &gotoblas_KATMAI ||
gotoblas == &gotoblas_COPPERMINE ||
gotoblas == &gotoblas_NORTHWOOD ||
gotoblas == &gotoblas_BANIAS ||
gotoblas == &gotoblas_ATHLON)
gotoblas = &gotoblas_PRESCOTT;
}
#endif
if (gotoblas && gotoblas -> init) {
strncpy(coren,gotoblas_corename(),20);
sprintf(coremsg, "Core: %s\n",coren);
openblas_warning(2, coremsg);
gotoblas -> init();
} else {
openblas_warning(0, "OpenBLAS : Architecture Initialization failed. No initialization function found.\n");
exit(1);
}
}
void gotoblas_dynamic_quit(void) {
gotoblas = NULL;
}


@@ -71,7 +71,7 @@ if ($compiler eq "") {
if ($data =~ /GNU/) {
$data =~ /(\d)\.(\d).(\d)/;
$data =~ /(\d+)\.(\d+).(\d+)/;
$major = $1;
$minor = $2;
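The widened pattern matters once a version component has more than one digit (e.g. GCC 10.2.0): the old single-digit groups would capture only the first character of the major version. A small C sketch of the same parsing idea, assuming a dotted version string:

/* sketch: parse whole numeric version components, as the fixed regex does */
#include <stdio.h>

int main(void) {
    const char *version = "10.2.0";   /* example string only */
    int major = 0, minor = 0, patch = 0;

    if (sscanf(version, "%d.%d.%d", &major, &minor, &patch) == 3)
        printf("major=%d minor=%d patch=%d\n", major, minor, patch);
    return 0;
}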


@@ -1,4 +1,5 @@
USE_GEMM3M = 0
OS := $(shell uname)
ifeq ($(ARCH), x86)
USE_GEMM3M = 1
@@ -59,8 +60,6 @@ USE_TRMM = 1
endif
SKERNELOBJS += \
sgemm_kernel$(TSUFFIX).$(SUFFIX) \
$(SGEMMINCOPYOBJ) $(SGEMMITCOPYOBJ) \
@@ -438,7 +437,15 @@ $(KDIR)$(SGEMMONCOPYOBJ) : $(KERNELDIR)/$(SGEMMONCOPY)
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
$(KDIR)$(SGEMMOTCOPYOBJ) : $(KERNELDIR)/$(SGEMMOTCOPY)
ifeq ($(OS), AIX)
$(CC) $(CFLAGS) -E -UDOUBLE -UCOMPLEX $< -o sgemmotcopy.s
m4 sgemmotcopy.s > sgemmotcopy_nomacros.s
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemmotcopy_nomacros.s -o $@
rm sgemmotcopy.s sgemmotcopy_nomacros.s
else
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
endif
ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
@@ -446,12 +453,26 @@ $(KDIR)$(SGEMMINCOPYOBJ) : $(KERNELDIR)/$(SGEMMINCOPY)
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
$(KDIR)$(SGEMMITCOPYOBJ) : $(KERNELDIR)/$(SGEMMITCOPY)
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
ifeq ($(OS), AIX)
$(CC) $(CFLAGS) -E -UDOUBLE -UCOMPLEX $< -o sgemmitcopy.s
m4 sgemmitcopy.s > sgemmitcopy_nomacros.s
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemmitcopy_nomacros.s -o $@
rm sgemmitcopy.s sgemmitcopy_nomacros.s
else
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
endif
endif
$(KDIR)$(DGEMMONCOPYOBJ) : $(KERNELDIR)/$(DGEMMONCOPY)
ifeq ($(OS), AIX)
$(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o dgemm_ncopy.s
m4 dgemm_ncopy.s > dgemm_ncopy_nomacros.s
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_ncopy_nomacros.s -o $@
rm dgemm_ncopy.s dgemm_ncopy_nomacros.s
else
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
endif
$(KDIR)$(DGEMMOTCOPYOBJ) : $(KERNELDIR)/$(DGEMMOTCOPY)
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
@@ -462,7 +483,14 @@ $(KDIR)$(DGEMMINCOPYOBJ) : $(KERNELDIR)/$(DGEMMINCOPY)
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
$(KDIR)$(DGEMMITCOPYOBJ) : $(KERNELDIR)/$(DGEMMITCOPY)
ifeq ($(OS), AIX)
$(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o dgemm_itcopy.s
m4 dgemm_itcopy.s > dgemm_itcopy_nomacros.s
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_itcopy_nomacros.s -o $@
rm dgemm_itcopy.s dgemm_itcopy_nomacros.s
else
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
endif
endif
@@ -498,7 +526,14 @@ $(KDIR)$(CGEMMINCOPYOBJ) : $(KERNELDIR)/$(CGEMMINCOPY)
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
$(KDIR)$(CGEMMITCOPYOBJ) : $(KERNELDIR)/$(CGEMMITCOPY)
ifeq ($(OS), AIX)
$(CC) $(CFLAGS) -UDOUBLE -UCOMPLEX -E $< -o cgemm_itcopy.s
m4 cgemm_itcopy.s > cgemm_itcopy_nomacros.s
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX cgemm_itcopy_nomacros.s -o $@
rm cgemm_itcopy.s cgemm_itcopy_nomacros.s
else
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
endif
endif
@@ -514,7 +549,14 @@ $(KDIR)$(ZGEMMINCOPYOBJ) : $(KERNELDIR)/$(ZGEMMINCOPY)
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
$(KDIR)$(ZGEMMITCOPYOBJ) : $(KERNELDIR)/$(ZGEMMITCOPY)
ifeq ($(OS), AIX)
$(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o zgemm_itcopy.s
m4 zgemm_itcopy.s > zgemm_itcopy_nomacros.s
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX zgemm_itcopy_nomacros.s -o $@
rm zgemm_itcopy.s zgemm_itcopy_nomacros.s
else
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
endif
endif
@@ -539,37 +581,107 @@ endif
endif
$(KDIR)sgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(SGEMMDEPEND)
ifeq ($(OS), AIX)
$(CC) $(CFLAGS) -E -UDOUBLE -UCOMPLEX $< -o sgemm_kernel$(TSUFFIX).s
m4 sgemm_kernel$(TSUFFIX).s > sgemm_kernel$(TSUFFIX)_nomacros.s
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemm_kernel$(TSUFFIX)_nomacros.s -o $@
rm sgemm_kernel$(TSUFFIX).s sgemm_kernel$(TSUFFIX)_nomacros.s
else
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
endif
$(KDIR)dgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) $(DGEMMDEPEND)
ifeq ($(OS), AIX)
$(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o dgemm_kernel$(TSUFFIX).s
m4 dgemm_kernel$(TSUFFIX).s > dgemm_kernel$(TSUFFIX)_nomacros.s
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_kernel$(TSUFFIX)_nomacros.s -o $@
rm dgemm_kernel$(TSUFFIX).s dgemm_kernel$(TSUFFIX)_nomacros.s
else
$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
endif
$(KDIR)qgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) $(QGEMMDEPEND)
$(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@
$(KDIR)cgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
ifeq ($(OS), AIX)
$(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DNN $< -o cgemm_kernel_n.s
m4 cgemm_kernel_n.s > cgemm_kernel_n_nomacros.s
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN cgemm_kernel_n_nomacros.s -o $@
rm cgemm_kernel_n.s cgemm_kernel_n_nomacros.s
else
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN $< -o $@
endif
$(KDIR)cgemm_kernel_l$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
ifeq ($(OS), AIX)
$(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DCN $< -o cgemm_kernel_l.s
m4 cgemm_kernel_l.s > cgemm_kernel_l_nomacros.s
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCN cgemm_kernel_l_nomacros.s -o $@
rm cgemm_kernel_l.s cgemm_kernel_l_nomacros.s
else
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $@
endif
$(KDIR)cgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
ifeq ($(OS), AIX)
$(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DNC $< -o cgemm_kernel_r.s
m4 cgemm_kernel_r.s > cgemm_kernel_r_nomacros.s
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC cgemm_kernel_r_nomacros.s -o $@
rm cgemm_kernel_r.s cgemm_kernel_r_nomacros.s
else
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC $< -o $@
endif
$(KDIR)cgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
ifeq ($(OS), AIX)
$(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DCC $< -o cgemm_kernel_b.s
m4 cgemm_kernel_b.s > cgemm_kernel_b_nomacros.s
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCC cgemm_kernel_b_nomacros.s -o $@
rm cgemm_kernel_b.s cgemm_kernel_b_nomacros.s
else
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $@
endif
$(KDIR)zgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND)
ifeq ($(OS), AIX)
$(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DNN $< -o zgemm_kernel_n.s
m4 zgemm_kernel_n.s > zgemm_kernel_n_nomacros.s
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN zgemm_kernel_n_nomacros.s -o $@
rm zgemm_kernel_n.s zgemm_kernel_n_nomacros.s
else
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN $< -o $@
endif
$(KDIR)zgemm_kernel_l$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND)
ifeq ($(OS), AIX)
$(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DCN $< -o zgemm_kernel_l.s
m4 zgemm_kernel_l.s > zgemm_kernel_l_nomacros.s
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN zgemm_kernel_l_nomacros.s -o $@
rm zgemm_kernel_l.s zgemm_kernel_l_nomacros.s
else
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $@
endif
$(KDIR)zgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND)
ifeq ($(OS), AIX)
$(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DNC $< -o zgemm_kernel_r.s
m4 zgemm_kernel_r.s > zgemm_kernel_r_nomacros.s
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC zgemm_kernel_r_nomacros.s -o $@
rm zgemm_kernel_r.s zgemm_kernel_r_nomacros.s
else
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC $< -o $@
endif
$(KDIR)zgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND)
ifeq ($(OS), AIX)
$(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DCC $< -o zgemm_kernel_b.s
m4 zgemm_kernel_b.s > zgemm_kernel_b_nomacros.s
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC zgemm_kernel_b_nomacros.s -o $@
rm zgemm_kernel_b.s zgemm_kernel_b_nomacros.s
else
$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $@
endif
$(KDIR)xgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND)
$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DNN $< -o $@
@@ -586,28 +698,84 @@ $(KDIR)xgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMD
ifdef USE_TRMM
$(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
ifeq ($(OS), AIX)
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o strmmkernel_ln.s
m4 strmmkernel_ln.s > strmmkernel_ln_nomacros.s
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA strmmkernel_ln_nomacros.s -o $@
rm strmmkernel_ln.s strmmkernel_ln_nomacros.s
else
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
endif
$(KDIR)strmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
ifeq ($(OS), AIX)
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o strmmkernel_lt.s
m4 strmmkernel_lt.s > strmmkernel_lt_nomacros.s
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA strmmkernel_lt_nomacros.s -o $@
rm strmmkernel_lt.s strmmkernel_lt_nomacros.s
else
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@
endif
$(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
ifeq ($(OS), AIX)
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o strmmkernel_rn.s
m4 strmmkernel_rn.s > strmmkernel_rn_nomacros.s
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA strmmkernel_rn_nomacros.s -o $@
rm strmmkernel_rn.s strmmkernel_rn_nomacros.s
else
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
endif
$(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
ifeq ($(OS), AIX)
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o strmm_kernel_rt.s
m4 strmm_kernel_rt.s > strmm_kernel_rt_nomacros.s
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@
rm strmm_kernel_rt.s strmm_kernel_rt_nomacros.s
else
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
endif
$(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
ifeq ($(OS), AIX)
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o dtrmm_kernel_ln.s
m4 dtrmm_kernel_ln.s > dtrmm_kernel_ln_nomacros.s
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA dtrmm_kernel_ln_nomacros.s -o $@
rm dtrmm_kernel_ln.s dtrmm_kernel_ln_nomacros.s
else
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
endif
$(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
ifeq ($(OS), AIX)
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o dtrmm_kernel_lt.s
m4 dtrmm_kernel_lt.s > dtrmm_kernel_lt_nomacros.s
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA dtrmm_kernel_lt_nomacros.s -o $@
rm dtrmm_kernel_lt.s dtrmm_kernel_lt_nomacros.s
else
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@
endif
$(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
ifeq ($(OS), AIX)
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o dtrmm_kernel_rn.s
m4 dtrmm_kernel_rn.s > dtrmm_kernel_rn_nomacros.s
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA dtrmm_kernel_rn_nomacros.s -o $@
rm dtrmm_kernel_rn.s dtrmm_kernel_rn_nomacros.s
else
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
endif
$(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
ifeq ($(OS), AIX)
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o dtrmm_kernel_rt.s
m4 dtrmm_kernel_rt.s > dtrmm_kernel_rt_nomacros.s
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA dtrmm_kernel_rt_nomacros.s -o $@
rm dtrmm_kernel_rt.s dtrmm_kernel_rt_nomacros.s
else
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
endif
$(KDIR)qtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
@@ -622,52 +790,165 @@ $(KDIR)qtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
$(KDIR)ctrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
ifeq ($(OS), AIX)
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o ctrmm_kernel_ln.s
m4 ctrmm_kernel_ln.s > ctrmm_kernel_ln_nomacros.s
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN ctrmm_kernel_ln_nomacros.s -o $@
rm ctrmm_kernel_ln.s ctrmm_kernel_ln_nomacros.s
else
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@
endif
$(KDIR)ctrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
ifeq ($(OS), AIX)
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o ctrmm_kernel_lt.s
m4 ctrmm_kernel_lt.s > ctrmm_kernel_lt_nomacros.s
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN ctrmm_kernel_lt_nomacros.s -o $@
rm ctrmm_kernel_lt.s ctrmm_kernel_lt_nomacros.s
else
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@
endif
$(KDIR)ctrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
ifeq ($(OS), AIX)
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o ctrmm_kernel_lr.s
m4 ctrmm_kernel_lr.s > ctrmm_kernel_lr_nomacros.s
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN ctrmm_kernel_lr_nomacros.s -o $@
rm ctrmm_kernel_lr.s ctrmm_kernel_lr_nomacros.s
else
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@
endif
$(KDIR)ctrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
ifeq ($(OS), AIX)
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o ctrmm_kernel_lc.s
m4 ctrmm_kernel_lc.s > ctrmm_kernel_lc_nomacros.s
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN ctrmm_kernel_lc_nomacros.s -o $@
rm ctrmm_kernel_lc_nomacros.s ctrmm_kernel_lc.s
else
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@
endif
$(KDIR)ctrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
ifeq ($(OS), AIX)
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o ctrmm_kernel_rn.s
m4 ctrmm_kernel_rn.s > ctrmm_kernel_rn_nomacros.s
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN ctrmm_kernel_rn_nomacros.s -o $@
rm ctrmm_kernel_rn.s ctrmm_kernel_rn_nomacros.s
else
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@
endif
$(KDIR)ctrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
ifeq ($(OS), AIX)
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o ctrmm_kernel_rt.s
m4 ctrmm_kernel_rt.s > ctrmm_kernel_rt_nomacros.s
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN ctrmm_kernel_rt_nomacros.s -o $@
rm ctrmm_kernel_rt.s ctrmm_kernel_rt_nomacros.s
else
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@
endif
$(KDIR)ctrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
ifeq ($(OS), AIX)
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o ctrmm_kernel_rr.s
m4 ctrmm_kernel_rr.s > ctrmm_kernel_rr_nomacros.s
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC ctrmm_kernel_rr_nomacros.s -o $@
rm ctrmm_kernel_rr.s ctrmm_kernel_rr_nomacros.s
else
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@
endif
$(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) $(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
ifeq ($(OS), AIX)
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o ctrmm_kernel_RC.s
m4 ctrmm_kernel_RC.s > ctrmm_kernel_RC_nomacros.s
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC ctrmm_kernel_RC_nomacros.s -o $@
rm ctrmm_kernel_RC.s ctrmm_kernel_RC_nomacros.s
else
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
endif
$(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) $(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
ifeq ($(OS), AIX)
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o ztrmm_kernel_ln.s
m4 ztrmm_kernel_ln.s > ztrmm_kernel_ln_nomacros.s
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN ztrmm_kernel_ln_nomacros.s -o $@
rm ztrmm_kernel_ln.s ztrmm_kernel_ln_nomacros.s
else
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@
endif
$(KDIR)ztrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) $(KDIR)ztrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
ifeq ($(OS), AIX)
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o ztrmm_kernel_lt.s
m4 ztrmm_kernel_lt.s > ztrmm_kernel_lt_nomacros.s
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN ztrmm_kernel_lt_nomacros.s -o $@
rm ztrmm_kernel_lt.s ztrmm_kernel_lt_nomacros.s
else
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@
endif
$(KDIR)ztrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) $(KDIR)ztrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
ifeq ($(OS), AIX)
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o ztrmm_kernel_lr.s
m4 ztrmm_kernel_lr.s > ztrmm_kernel_lr_nomacros.s
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN ztrmm_kernel_lr_nomacros.s -o $@
rm ztrmm_kernel_lr.s ztrmm_kernel_lr_nomacros.s
else
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@
endif
$(KDIR)ztrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) $(KDIR)ztrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
ifeq ($(OS), AIX)
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o ztrmm_kernel_lc.s
m4 ztrmm_kernel_lc.s > ztrmm_kernel_lc_nomacros.s
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN ztrmm_kernel_lc_nomacros.s -o $@
rm ztrmm_kernel_lc.s ztrmm_kernel_lc_nomacros.s
else
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@
endif
$(KDIR)ztrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) $(KDIR)ztrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
ifeq ($(OS), AIX)
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o ztrmm_kernel_rn.s
m4 ztrmm_kernel_rn.s > ztrmm_kernel_rn_nomacros.s
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN ztrmm_kernel_rn_nomacros.s -o $@
rm ztrmm_kernel_rn.s ztrmm_kernel_rn_nomacros.s
else
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@
endif
$(KDIR)ztrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) $(KDIR)ztrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
ifeq ($(OS), AIX)
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o ztrmm_kernel_rt.s
m4 ztrmm_kernel_rt.s > ztrmm_kernel_rt_nomacros.s
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN ztrmm_kernel_rt_nomacros.s -o $@
rm ztrmm_kernel_rt.s ztrmm_kernel_rt_nomacros.s
else
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@
endif
$(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) $(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
ifeq ($(OS), AIX)
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o ztrmm_kernel_rr.s
m4 ztrmm_kernel_rr.s > ztrmm_kernel_rr_nomacros.s
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC ztrmm_kernel_rr_nomacros.s -o $@
rm ztrmm_kernel_rr.s ztrmm_kernel_rr_nomacros.s
else
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@
endif
$(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) $(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
ifeq ($(OS), AIX)
$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o ztrmm_kernel_rc.s
m4 ztrmm_kernel_rc.s > ztrmm_kernel_rc_nomacros.s
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC ztrmm_kernel_rc_nomacros.s -o $@
rm ztrmm_kernel_rc.s ztrmm_kernel_rc_nomacros.s
else
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
endif
else else
$(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
@ -679,7 +960,14 @@ $(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
$(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
ifeq ($(OS), AIX)
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o strmm_kernel_rt.s
m4 strmm_kernel_rt.s > strmm_kernel_rt_nomacros.s
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@
rm strmm_kernel_rt.s strmm_kernel_rt_nomacros.s
else
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
endif
$(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) $(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
@ -806,7 +1094,14 @@ $(KDIR)dtrsm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_LN) $(DT
$(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -DUPPER -DLN -UCONJ $< -o $@ $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -DUPPER -DLN -UCONJ $< -o $@
$(KDIR)dtrsm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_LT) $(DTRSMDEPEND) $(KDIR)dtrsm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_LT) $(DTRSMDEPEND)
ifeq ($(OS), AIX)
$(CC) $(CFLAGS) -E -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ $< -o dtrsm_kernel_lt.s
m4 dtrsm_kernel_lt.s > dtrsm_kernel_lt_nomacros.s
$(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ dtrsm_kernel_lt_nomacros.s -o $@
rm dtrsm_kernel_lt.s dtrsm_kernel_lt_nomacros.s
else
$(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ $< -o $@ $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ $< -o $@
endif
$(KDIR)dtrsm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_RN) $(DTRSMDEPEND) $(KDIR)dtrsm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_RN) $(DTRSMDEPEND)
$(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -DUPPER -DRN -UCONJ $< -o $@ $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -DUPPER -DRN -UCONJ $< -o $@
@ -1942,7 +2237,7 @@ $(SGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(SGEMMITCOPY)
endif endif
$(DGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(DGEMMONCOPY) $(DGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(DGEMMONCOPY)
$(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ $(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
$(DGEMMOTCOPYOBJ_P) : $(KERNELDIR)/$(DGEMMOTCOPY) $(DGEMMOTCOPYOBJ_P) : $(KERNELDIR)/$(DGEMMOTCOPY)
@ -2046,7 +2341,14 @@ $(KDIR)cgemm_kernel_l$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMM
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $@ $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $@
$(KDIR)cgemm_kernel_r$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) $(KDIR)cgemm_kernel_r$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DNC $< -o $@ ifeq ($(OS), AIX)
$(CC) $(PFLAGS) -E -UDOUBLE -DCOMPLEX -DNC $< -o cgemm_kernel_r.s
m4 cgemm_kernel_r.s > cgemm_kernel_r_nomacros.s
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DNC cgemm_kernel_r_nomacros.s -o $@
rm cgemm_kernel_r.s cgemm_kernel_r_nomacros.s
else
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DNC $< -o $@
endif
$(KDIR)cgemm_kernel_b$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) $(KDIR)cgemm_kernel_b$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $@ $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $@
@ -2085,7 +2387,14 @@ $(KDIR)strmm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
$(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ $(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
$(KDIR)strmm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(KDIR)strmm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
$(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ ifeq ($(OS), AIX)
$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o strmm_kernel_rt.s
m4 strmm_kernel_rt.s > strmm_kernel_rt_nomacros.s
$(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@
rm strmm_kernel_rt.s strmm_kernel_rt_nomacros.s
else
$(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
endif
$(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) $(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL)
$(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ $(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@

View File

@ -102,6 +102,8 @@ CDOTKERNEL = zdot.S
ZDOTKERNEL = zdot.S ZDOTKERNEL = zdot.S
DSDOTKERNEL = dot.S DSDOTKERNEL = dot.S
DGEMM_BETA = dgemm_beta.S
SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))

178
kernel/arm64/dgemm_beta.S Normal file
View File

@ -0,0 +1,178 @@
/***************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define ASSEMBLER
#include "common.h"
#define M x0
#define N x1
#define BETA d0
#define LDC x6
#define C00 x7
#define A01 x8
#define A02 x9
#define A03 x10
#define A04 x11
#define beta0 d11
#define betaV0 v11.d[0]
#define I x16
#define size 128 // bytes each pointer advances per 16-double main-loop iteration
/**************************************************************************************
* Macro definitions
**************************************************************************************/
.macro SAVE_REGS
add sp, sp, #-(11 * 16)
stp d8, d9, [sp, #(0 * 16)]
stp d10, d11, [sp, #(1 * 16)]
stp d12, d13, [sp, #(2 * 16)]
stp d14, d15, [sp, #(3 * 16)]
stp d16, d17, [sp, #(4 * 16)]
stp x18, x19, [sp, #(5 * 16)]
stp x20, x21, [sp, #(6 * 16)]
stp x22, x23, [sp, #(7 * 16)]
stp x24, x25, [sp, #(8 * 16)]
stp x26, x27, [sp, #(9 * 16)]
str x28, [sp, #(10 * 16)]
.endm
.macro RESTORE_REGS
ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)]
ldp d12, d13, [sp, #(2 * 16)]
ldp d14, d15, [sp, #(3 * 16)]
ldp d16, d17, [sp, #(4 * 16)]
ldp x18, x19, [sp, #(5 * 16)]
ldp x20, x21, [sp, #(6 * 16)]
ldp x22, x23, [sp, #(7 * 16)]
ldp x24, x25, [sp, #(8 * 16)]
ldp x26, x27, [sp, #(9 * 16)]
ldr x28, [sp, #(10 * 16)]
add sp, sp, #(11*16)
.endm
/**************************************************************************************
* End of macro definitions
**************************************************************************************/
PROLOGUE
.align 5
ldr LDC, [sp]
SAVE_REGS
.Lgemm_beta_BEGIN:
fmov beta0, BETA
cmp N, #0
ble .Lgemm_beta_L999
.Lgemm_beta_01:
lsl LDC, LDC, #3 // ldc in bytes (ldc * sizeof(double))
.align 5
.Lgemm_beta_02:
mov A01, C00
add C00, C00, LDC
asr I, M, #4 // I = M / 16
cmp I, #0
ble .Lgemm_beta_04
add A02, A01, #32
add A03, A02, #32
add A04, A03, #32
.align 5
.Lgemm_beta_03:
ldp q0, q1, [A01]
ldp q2, q3, [A02]
ldp q4, q5, [A03]
ldp q6, q7, [A04]
fmul v0.2d, v0.2d, betaV0
fmul v1.2d, v1.2d, betaV0
fmul v2.2d, v2.2d, betaV0
fmul v3.2d, v3.2d, betaV0
fmul v4.2d, v4.2d, betaV0
fmul v5.2d, v5.2d, betaV0
fmul v6.2d, v6.2d, betaV0
fmul v7.2d, v7.2d, betaV0
st1 {v0.2d, v1.2d}, [A01]
add A01, A01, size
st1 {v2.2d, v3.2d}, [A02]
add A02, A02, size
st1 {v4.2d, v5.2d}, [A03]
add A03, A03, size
st1 {v6.2d, v7.2d}, [A04]
add A04, A04, size
subs I , I , #1
bne .Lgemm_beta_03
.align 5
.Lgemm_beta_04:
and I, M , #15 // M%16
cmp I, #0
ble .Lgemm_beta_06
.align 5
.Lgemm_beta_05:
ldr d12, [A01]
fmul d12, d12, beta0
str d12, [A01]
add A01, A01, #8
subs I , I , #1
bne .Lgemm_beta_05
.align 5
.Lgemm_beta_06:
subs N , N, #1 // N--
bne .Lgemm_beta_02
.align 5
.Lgemm_beta_L999:
mov x0, #0
RESTORE_REGS
ret
EPILOGUE
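The new kernel/arm64/dgemm_beta.S above implements the beta-scaling pass of DGEMM, C := beta * C, over an M x N column-major panel with leading dimension LDC: each column is processed in 16-double blocks (four pointers spaced 32 bytes apart, 128 bytes per iteration) followed by a scalar tail of M % 16 elements. A minimal C sketch of the same operation follows; the argument list is simplified here, since the real BLAS gemm_beta entry point also carries several unused dummy parameters.

/* Reference semantics of the dgemm_beta kernel: scale the m x n
 * column-major matrix c (leading dimension ldc, in elements) by beta. */
static void dgemm_beta_ref(long m, long n, double beta, double *c, long ldc)
{
    for (long j = 0; j < n; j++) {            /* per-column loop: .Lgemm_beta_02 */
        double *cj = c + j * ldc;
        long i = 0;
        for (; i + 16 <= m; i += 16)          /* 16-double blocks: .Lgemm_beta_03 */
            for (long k = 0; k < 16; k++)
                cj[i + k] *= beta;
        for (; i < m; i++)                    /* scalar tail: .Lgemm_beta_05 */
            cj[i] *= beta;
    }
}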

View File

@ -68,10 +68,10 @@ static float casum_kernel_16 (long n, float *x)
"addi %2, %2, 128 \n\t" "addi %2, %2, 128 \n\t"
"addic. %1, %1, -16 \n\t" "addic. %1, %1, -16 \n\t"
"ble 2f \n\t" "ble two%= \n\t"
".p2align 5 \n" ".align 5 \n"
"1: \n\t" "one%=: \n\t"
"xvabssp 48, 40 \n\t" "xvabssp 48, 40 \n\t"
"xvabssp 49, 41 \n\t" "xvabssp 49, 41 \n\t"
@ -108,9 +108,9 @@ static float casum_kernel_16 (long n, float *x)
"xvaddsp 38, 38, %x5 \n\t" "xvaddsp 38, 38, %x5 \n\t"
"xvaddsp 39, 39, %x6 \n\t" "xvaddsp 39, 39, %x6 \n\t"
"bgt 1b \n" "bgt one%= \n"
"2: \n\t" "two%=: \n\t"
"xvabssp 48, 40 \n\t" "xvabssp 48, 40 \n\t"
"xvabssp 49, 41 \n\t" "xvabssp 49, 41 \n\t"

View File

@ -62,10 +62,10 @@ static void ccopy_kernel_32 (long n, float *x, float *y)
"addi %2, %2, 128 \n\t" "addi %2, %2, 128 \n\t"
"addic. %1, %1, -32 \n\t" "addic. %1, %1, -32 \n\t"
"ble 2f \n\t" "ble two%= \n\t"
".p2align 5 \n" ".align 5 \n"
"1: \n\t" "one%=: \n\t"
"stxvd2x 32, 0, %3 \n\t" "stxvd2x 32, 0, %3 \n\t"
"stxvd2x 33, %5, %3 \n\t" "stxvd2x 33, %5, %3 \n\t"
@ -108,9 +108,9 @@ static void ccopy_kernel_32 (long n, float *x, float *y)
"addi %2, %2, 128 \n\t" "addi %2, %2, 128 \n\t"
"addic. %1, %1, -32 \n\t" "addic. %1, %1, -32 \n\t"
"bgt 1b \n" "bgt one%= \n"
"2: \n\t" "two%=: \n\t"
"stxvd2x 32, 0, %3 \n\t" "stxvd2x 32, 0, %3 \n\t"
"stxvd2x 33, %5, %3 \n\t" "stxvd2x 33, %5, %3 \n\t"

File diff suppressed because it is too large

View File

@ -38,7 +38,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
* Macros for N=4 and M=8 * Macros for N=4 and M=8
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_4x8', `
#else
.macro COPY_4x8 .macro COPY_4x8
#endif
lxvw4x vs32, o0, A0 lxvw4x vs32, o0, A0
lxvw4x vs33, o16, A0 lxvw4x vs33, o16, A0
@ -93,13 +97,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvw4x vs46, o32, T1 stxvw4x vs46, o32, T1
stxvw4x vs47, o48, T1 stxvw4x vs47, o48, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=4 and M=4 * Macros for N=4 and M=4
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_4x4', `
#else
.macro COPY_4x4 .macro COPY_4x4
#endif
lxvw4x vs32, o0, A0 lxvw4x vs32, o0, A0
lxvw4x vs33, o16, A0 lxvw4x vs33, o16, A0
@ -133,13 +145,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvw4x vs38, o32, T1 stxvw4x vs38, o32, T1
stxvw4x vs39, o48, T1 stxvw4x vs39, o48, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=4 and M=2 * Macros for N=4 and M=2
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_4x2', `
#else
.macro COPY_4x2 .macro COPY_4x2
#endif
lxvw4x vs32, o0, A0 lxvw4x vs32, o0, A0
addi A0, A0, 16 addi A0, A0, 16
@ -163,13 +183,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvw4x vs35, o48, T1 stxvw4x vs35, o48, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=4 and M=1 * Macros for N=4 and M=1
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_4x1', `
#else
.macro COPY_4x1 .macro COPY_4x1
#endif
lxsspx vs32, o0, A0 lxsspx vs32, o0, A0
lxsspx vs33, o4, A0 lxsspx vs33, o4, A0
@ -207,13 +235,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxsspx vs38, o0, T1 stxsspx vs38, o0, T1
stxsspx vs39, o4, T1 stxsspx vs39, o4, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=2 and M=8 * Macros for N=2 and M=8
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_2x8', `
#else
.macro COPY_2x8 .macro COPY_2x8
#endif
lxvw4x vs32, o0, A0 lxvw4x vs32, o0, A0
lxvw4x vs33, o16, A0 lxvw4x vs33, o16, A0
@ -241,13 +277,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvw4x vs38, o32, T1 stxvw4x vs38, o32, T1
stxvw4x vs39, o48, T1 stxvw4x vs39, o48, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=2 and M=4 * Macros for N=2 and M=4
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_2x4', `
#else
.macro COPY_2x4 .macro COPY_2x4
#endif
lxvw4x vs32, o0, A0 lxvw4x vs32, o0, A0
lxvw4x vs33, o16, A0 lxvw4x vs33, o16, A0
@ -265,13 +309,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvw4x vs34, o32, T1 stxvw4x vs34, o32, T1
stxvw4x vs35, o48, T1 stxvw4x vs35, o48, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=2 and M=2 * Macros for N=2 and M=2
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_2x2', `
#else
.macro COPY_2x2 .macro COPY_2x2
#endif
lxvw4x vs32, o0, A0 lxvw4x vs32, o0, A0
addi A0, A0, 16 addi A0, A0, 16
@ -285,13 +337,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvw4x vs33, o16, T1 stxvw4x vs33, o16, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=2 and M=1 * Macros for N=2 and M=1
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_2x1', `
#else
.macro COPY_2x1 .macro COPY_2x1
#endif
lxsspx vs32, o0, A0 lxsspx vs32, o0, A0
lxsspx vs33, o4, A0 lxsspx vs33, o4, A0
@ -311,13 +371,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxsspx vs34, o0, T1 stxsspx vs34, o0, T1
stxsspx vs35, o4, T1 stxsspx vs35, o4, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=1 and M=8 * Macros for N=1 and M=8
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_1x8', `
#else
.macro COPY_1x8 .macro COPY_1x8
#endif
lxvw4x vs32, o0, A0 lxvw4x vs32, o0, A0
lxvw4x vs33, o16, A0 lxvw4x vs33, o16, A0
@ -332,13 +400,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvw4x vs34, o32, T1 stxvw4x vs34, o32, T1
stxvw4x vs35, o48, T1 stxvw4x vs35, o48, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=1 and M=4 * Macros for N=1 and M=4
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_1x4', `
#else
.macro COPY_1x4 .macro COPY_1x4
#endif
lxvw4x vs32, o0, A0 lxvw4x vs32, o0, A0
lxvw4x vs33, o16, A0 lxvw4x vs33, o16, A0
@ -349,13 +425,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvw4x vs32, o0, T1 stxvw4x vs32, o0, T1
stxvw4x vs33, o16, T1 stxvw4x vs33, o16, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=1 and M=2 * Macros for N=1 and M=2
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_1x2', `
#else
.macro COPY_1x2 .macro COPY_1x2
#endif
lxvw4x vs32, o0, A0 lxvw4x vs32, o0, A0
addi A0, A0, 16 addi A0, A0, 16
@ -364,13 +448,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvw4x vs32, o0, T1 stxvw4x vs32, o0, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=1 and M=1 * Macros for N=1 and M=1
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_1x1', `
#else
.macro COPY_1x1 .macro COPY_1x1
#endif
lxsspx vs32, o0, A0 lxsspx vs32, o0, A0
lxsspx vs33, o4, A0 lxsspx vs33, o4, A0
@ -381,5 +473,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxsspx vs32, o0, T1 stxsspx vs32, o0, T1
stxsspx vs33, o4, T1 stxsspx vs33, o4, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif

View File

@ -56,9 +56,9 @@ static void crot_kernel_8 (long n, float *x, float *y, float c, float s)
"addi %[x_ptr], %[x_ptr], 64 \n\t" "addi %[x_ptr], %[x_ptr], 64 \n\t"
"addi %[y_ptr], %[y_ptr], 64 \n\t" "addi %[y_ptr], %[y_ptr], 64 \n\t"
"addic. %[temp_n], %[temp_n], -8 \n\t" "addic. %[temp_n], %[temp_n], -8 \n\t"
"ble 2f \n\t" "ble two%= \n\t"
".p2align 5 \n\t" ".align 5 \n\t"
"1: \n\t" "one%=: \n\t"
"xvmulsp 40, 32, 36 \n\t" // c * x "xvmulsp 40, 32, 36 \n\t" // c * x
"xvmulsp 41, 33, 36 \n\t" "xvmulsp 41, 33, 36 \n\t"
"xvmulsp 42, 34, 36 \n\t" "xvmulsp 42, 34, 36 \n\t"
@ -104,8 +104,8 @@ static void crot_kernel_8 (long n, float *x, float *y, float c, float s)
"addi %[x_ptr], %[x_ptr], 128 \n\t" "addi %[x_ptr], %[x_ptr], 128 \n\t"
"addi %[y_ptr], %[y_ptr], 128 \n\t" "addi %[y_ptr], %[y_ptr], 128 \n\t"
"addic. %[temp_n], %[temp_n], -8 \n\t" "addic. %[temp_n], %[temp_n], -8 \n\t"
"bgt 1b \n\t" "bgt one%= \n\t"
"2: \n\t" "two%=: \n\t"
"xvmulsp 40, 32, 36 \n\t" // c * x "xvmulsp 40, 32, 36 \n\t" // c * x
"xvmulsp 41, 33, 36 \n\t" "xvmulsp 41, 33, 36 \n\t"
"xvmulsp 42, 34, 36 \n\t" "xvmulsp 42, 34, 36 \n\t"

View File

@ -39,8 +39,8 @@ static void cswap_kernel_32 (long n, float *x, float *y)
{ {
__asm__ __asm__
( (
".p2align 5 \n" ".align 5 \n"
"1: \n\t" "one%=: \n\t"
"lxvd2x 32, 0, %4 \n\t" "lxvd2x 32, 0, %4 \n\t"
"lxvd2x 33, %5, %4 \n\t" "lxvd2x 33, %5, %4 \n\t"
@ -131,7 +131,7 @@ static void cswap_kernel_32 (long n, float *x, float *y)
"addi %4, %4, 128 \n\t" "addi %4, %4, 128 \n\t"
"addic. %2, %2, -32 \n\t" "addic. %2, %2, -32 \n\t"
"bgt 1b \n" "bgt one%= \n"
"#n=%2 x=%0=%3 y=%1=%4 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11" "#n=%2 x=%0=%3 y=%1=%4 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11"
: :

File diff suppressed because it is too large

View File

@ -68,10 +68,10 @@ static double dasum_kernel_16 (long n, double *x)
"addi %2, %2, 128 \n\t" "addi %2, %2, 128 \n\t"
"addic. %1, %1, -16 \n\t" "addic. %1, %1, -16 \n\t"
"ble 2f \n\t" "ble two%= \n\t"
".p2align 5 \n" ".align 5 \n"
"1: \n\t" "one%=: \n\t"
"xvabsdp 48, 40 \n\t" "xvabsdp 48, 40 \n\t"
"xvabsdp 49, 41 \n\t" "xvabsdp 49, 41 \n\t"
@ -108,9 +108,9 @@ static double dasum_kernel_16 (long n, double *x)
"xvadddp 38, 38, %x5 \n\t" "xvadddp 38, 38, %x5 \n\t"
"xvadddp 39, 39, %x6 \n\t" "xvadddp 39, 39, %x6 \n\t"
"bgt 1b \n" "bgt one%= \n"
"2: \n\t" "two%=: \n\t"
"xvabsdp 48, 40 \n\t" "xvabsdp 48, 40 \n\t"
"xvabsdp 49, 41 \n\t" "xvabsdp 49, 41 \n\t"
@ -140,7 +140,7 @@ static double dasum_kernel_16 (long n, double *x)
"xvadddp 32, 32, 36 \n\t" "xvadddp 32, 32, 36 \n\t"
"xxswapd 33, 32 \n\t" XXSWAPD_S(33,32)
"xsadddp %x0, 32, 33 \n" "xsadddp %x0, 32, 33 \n"
"#n=%1 x=%3=%2 sum=%0 o16=%8 o32=%9 o48=%10 o64=%11 o80=%12 o96=%13 o112=%14\n" "#n=%1 x=%3=%2 sum=%0 o16=%8 o32=%9 o48=%10 o64=%11 o80=%12 o96=%13 o112=%14\n"

View File

@ -58,7 +58,7 @@ static void daxpy_kernel_8 (long n, double *x, double *y, double alpha)
__asm__ __asm__
( (
"xxspltd %x4, %x22, 0 \n\t" XXSPLTD_S(%x4,%x22,0)
"dcbt 0, %2 \n\t" "dcbt 0, %2 \n\t"
"dcbt 0, %3 \n\t" "dcbt 0, %3 \n\t"
@ -90,10 +90,10 @@ static void daxpy_kernel_8 (long n, double *x, double *y, double alpha)
"addi %3, %3, -64 \n\t" "addi %3, %3, -64 \n\t"
"addic. %1, %1, -16 \n\t" "addic. %1, %1, -16 \n\t"
"ble 2f \n\t" "ble two%= \n\t"
".align 5 \n" ".align 5 \n"
"1: \n\t" "one%=: \n\t"
"xvmaddadp %x13, %x5, %x4 \n\t" "xvmaddadp %x13, %x5, %x4 \n\t"
"xvmaddadp %x14, %x6, %x4 \n\t" "xvmaddadp %x14, %x6, %x4 \n\t"
@ -152,9 +152,9 @@ static void daxpy_kernel_8 (long n, double *x, double *y, double alpha)
"addi %3, %3, -64 \n\t" "addi %3, %3, -64 \n\t"
"addic. %1, %1, -16 \n\t" "addic. %1, %1, -16 \n\t"
"bgt 1b \n" "bgt one%= \n"
"2: \n\t" "two%=: \n\t"
"xvmaddadp %x13, %x5, %x4 \n\t" "xvmaddadp %x13, %x5, %x4 \n\t"
"xvmaddadp %x14, %x6, %x4 \n\t" "xvmaddadp %x14, %x6, %x4 \n\t"

View File

@ -62,10 +62,10 @@ static void dcopy_kernel_32 (long n, double *x, double *y)
"addi %2, %2, 128 \n\t" "addi %2, %2, 128 \n\t"
"addic. %1, %1, -32 \n\t" "addic. %1, %1, -32 \n\t"
"ble 2f \n\t" "ble two%= \n\t"
".p2align 5 \n" ".align 5 \n"
"1: \n\t" "one%=: \n\t"
"stxvd2x 32, 0, %3 \n\t" "stxvd2x 32, 0, %3 \n\t"
"stxvd2x 33, %5, %3 \n\t" "stxvd2x 33, %5, %3 \n\t"
@ -108,9 +108,9 @@ static void dcopy_kernel_32 (long n, double *x, double *y)
"addi %2, %2, 128 \n\t" "addi %2, %2, 128 \n\t"
"addic. %1, %1, -32 \n\t" "addic. %1, %1, -32 \n\t"
"bgt 1b \n" "bgt one%= \n"
"2: \n\t" "two%=: \n\t"
"stxvd2x 32, 0, %3 \n\t" "stxvd2x 32, 0, %3 \n\t"
"stxvd2x 33, %5, %3 \n\t" "stxvd2x 33, %5, %3 \n\t"

View File

@ -78,10 +78,10 @@ static double ddot_kernel_8 (long n, double *x, double *y)
"addi %3, %3, 128 \n\t" "addi %3, %3, 128 \n\t"
"addic. %1, %1, -16 \n\t" "addic. %1, %1, -16 \n\t"
"ble 2f \n\t" "ble two%= \n\t"
".p2align 5 \n" ".align 5 \n"
"1: \n\t" "one%=: \n\t"
"xvmaddadp 32, 40, 48 \n\t" "xvmaddadp 32, 40, 48 \n\t"
"lxvd2x 40, 0, %2 \n\t" "lxvd2x 40, 0, %2 \n\t"
@ -112,9 +112,9 @@ static double ddot_kernel_8 (long n, double *x, double *y)
"addi %3, %3, 128 \n\t" "addi %3, %3, 128 \n\t"
"addic. %1, %1, -16 \n\t" "addic. %1, %1, -16 \n\t"
"bgt 1b \n" "bgt one%= \n"
"2: \n\t" "two%=: \n\t"
"xvmaddadp 32, 40, 48 \n\t" "xvmaddadp 32, 40, 48 \n\t"
"xvmaddadp 33, 41, 49 \n\t" "xvmaddadp 33, 41, 49 \n\t"
@ -135,7 +135,7 @@ static double ddot_kernel_8 (long n, double *x, double *y)
"xvadddp 32, 32, 36 \n\t" "xvadddp 32, 32, 36 \n\t"
"xxswapd 33, 32 \n\t" XXSWAPD_S(33,32)
"xsadddp %x0, 32, 33 \n" "xsadddp %x0, 32, 33 \n"

File diff suppressed because it is too large

View File

@ -38,7 +38,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
* Macros for N=4 and M=16 * Macros for N=4 and M=16
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_4x16', `
#else
.macro COPY_4x16 .macro COPY_4x16
#endif
lxvd2x vs0, o0, A0 lxvd2x vs0, o0, A0
lxvd2x vs1, o0, A1 lxvd2x vs1, o0, A1
@ -180,14 +184,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi BO, BO, 128 addi BO, BO, 128
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=4 and M=8 * Macros for N=4 and M=8
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_4x8', `
#else
.macro COPY_4x8 .macro COPY_4x8
#endif
lxvd2x vs0, o0, A0 lxvd2x vs0, o0, A0
lxvd2x vs1, o16, A0 lxvd2x vs1, o16, A0
@ -259,14 +271,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi BO, BO, 128 addi BO, BO, 128
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=4 and M=4 * Macros for N=4 and M=4
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_4x4', `
#else
.macro COPY_4x4 .macro COPY_4x4
#endif
lxvd2x vs0, o0, A0 lxvd2x vs0, o0, A0
lxvd2x vs1, o16, A0 lxvd2x vs1, o16, A0
@ -310,14 +330,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi BO, BO, 128 addi BO, BO, 128
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=4 and M=2 * Macros for N=4 and M=2
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_4x2', `
#else
.macro COPY_4x2 .macro COPY_4x2
#endif
lxvd2x vs0, o0, A0 lxvd2x vs0, o0, A0
addi A0, A0, 16 addi A0, A0, 16
@ -348,14 +376,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi BO, BO, 64 addi BO, BO, 64
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=4 and M=1 * Macros for N=4 and M=1
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_4x1', `
#else
.macro COPY_4x1 .macro COPY_4x1
#endif
lxsdx vs0, o0, A0 lxsdx vs0, o0, A0
addi A0, A0, 8 addi A0, A0, 8
@ -382,14 +418,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi BO, BO, 32 addi BO, BO, 32
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=2 and M=16 * Macros for N=2 and M=16
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_2x16', `
#else
.macro COPY_2x16 .macro COPY_2x16
#endif
lxvd2x vs0, o0, A0 lxvd2x vs0, o0, A0
lxvd2x vs1, o16, A0 lxvd2x vs1, o16, A0
@ -459,14 +503,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi BO, BO, 128 addi BO, BO, 128
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=2 and M=8 * Macros for N=2 and M=8
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_2x8', `
#else
.macro COPY_2x8 .macro COPY_2x8
#endif
lxvd2x vs0, o0, A0 lxvd2x vs0, o0, A0
lxvd2x vs1, o16, A0 lxvd2x vs1, o16, A0
@ -506,14 +558,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi BO, BO, 128 addi BO, BO, 128
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=2 and M=4 * Macros for N=2 and M=4
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_2x4', `
#else
.macro COPY_2x4 .macro COPY_2x4
#endif
lxvd2x vs0, o0, A0 lxvd2x vs0, o0, A0
lxvd2x vs1, o16, A0 lxvd2x vs1, o16, A0
@ -539,14 +599,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi BO, BO, 64 addi BO, BO, 64
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=2 and M=2 * Macros for N=2 and M=2
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_2x2', `
#else
.macro COPY_2x2 .macro COPY_2x2
#endif
lxvd2x vs0, o0, A0 lxvd2x vs0, o0, A0
addi A0, A0, 16 addi A0, A0, 16
@ -565,14 +633,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi BO, BO, 32 addi BO, BO, 32
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=2 and M=1 * Macros for N=2 and M=1
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_2x1', `
#else
.macro COPY_2x1 .macro COPY_2x1
#endif
lxsdx vs0, o0, A0 lxsdx vs0, o0, A0
addi A0, A0, 8 addi A0, A0, 8
@ -589,14 +665,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi BO, BO, 16 addi BO, BO, 16
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=1 and M=16 * Macros for N=1 and M=16
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_1x16', `
#else
.macro COPY_1x16 .macro COPY_1x16
#endif
lxvd2x vs0, o0, A0 lxvd2x vs0, o0, A0
lxvd2x vs1, o16, A0 lxvd2x vs1, o16, A0
@ -622,14 +706,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi BO, BO, 64 addi BO, BO, 64
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=1 and M=8 * Macros for N=1 and M=8
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_1x8', `
#else
.macro COPY_1x8 .macro COPY_1x8
#endif
lxvd2x vs0, o0, A0 lxvd2x vs0, o0, A0
lxvd2x vs1, o16, A0 lxvd2x vs1, o16, A0
@ -645,14 +737,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi BO, BO, 64 addi BO, BO, 64
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=1 and M=4 * Macros for N=1 and M=4
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_1x4', `
#else
.macro COPY_1x4 .macro COPY_1x4
#endif
lxvd2x vs0, o0, A0 lxvd2x vs0, o0, A0
lxvd2x vs1, o16, A0 lxvd2x vs1, o16, A0
@ -664,14 +764,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi BO, BO, 32 addi BO, BO, 32
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=1 and M=2 * Macros for N=1 and M=2
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_1x2', `
#else
.macro COPY_1x2 .macro COPY_1x2
#endif
lxvd2x vs0, o0, A0 lxvd2x vs0, o0, A0
addi A0, A0, 16 addi A0, A0, 16
@ -681,14 +789,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi BO, BO, 16 addi BO, BO, 16
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=1 and M=1 * Macros for N=1 and M=1
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_1x1', `
#else
.macro COPY_1x1 .macro COPY_1x1
#endif
lxsdx vs0, o0, A0 lxsdx vs0, o0, A0
addi A0, A0, 8 addi A0, A0, 8
@ -698,5 +814,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
addi BO, BO, 8 addi BO, BO, 8
#if defined(_AIX)
')
#else
.endm .endm
#endif

View File

@ -38,7 +38,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
* Macros for N=4 and M=16 * Macros for N=4 and M=16
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_4x16', `
#else
.macro COPY_4x16 .macro COPY_4x16
#endif
lxvd2x vs32, o0, A0 lxvd2x vs32, o0, A0
lxvd2x vs33, o16, A0 lxvd2x vs33, o16, A0
@ -140,14 +144,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvd2x vs10, o32, T1 stxvd2x vs10, o32, T1
stxvd2x vs11, o48, T1 stxvd2x vs11, o48, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=4 and M=8 * Macros for N=4 and M=8
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_4x8', `
#else
.macro COPY_4x8 .macro COPY_4x8
#endif
lxvd2x vs32, o0, A0 lxvd2x vs32, o0, A0
lxvd2x vs33, o16, A0 lxvd2x vs33, o16, A0
@ -205,14 +217,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvd2x vs46, o32, T1 stxvd2x vs46, o32, T1
stxvd2x vs47, o48, T1 stxvd2x vs47, o48, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=4 and M=4 * Macros for N=4 and M=4
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_4x4', `
#else
.macro COPY_4x4 .macro COPY_4x4
#endif
lxvd2x vs32, o0, A0 lxvd2x vs32, o0, A0
lxvd2x vs33, o16, A0 lxvd2x vs33, o16, A0
@ -250,14 +270,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvd2x vs38, o32, T1 stxvd2x vs38, o32, T1
stxvd2x vs39, o48, T1 stxvd2x vs39, o48, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=4 and M=2 * Macros for N=4 and M=2
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_4x2', `
#else
.macro COPY_4x2 .macro COPY_4x2
#endif
lxvd2x vs32, o0, A0 lxvd2x vs32, o0, A0
addi A0, A0, 16 addi A0, A0, 16
@ -285,14 +313,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvd2x vs35, o48, T1 stxvd2x vs35, o48, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=4 and M=1 * Macros for N=4 and M=1
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_4x1', `
#else
.macro COPY_4x1 .macro COPY_4x1
#endif
lxsdx vs32, o0, A0 lxsdx vs32, o0, A0
addi A0, A0, 8 addi A0, A0, 8
@ -322,14 +358,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxsdx vs35, o8, T1 stxsdx vs35, o8, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=2 and M=16 * Macros for N=2 and M=16
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_2x16', `
#else
.macro COPY_2x16 .macro COPY_2x16
#endif
lxvd2x vs32, o0, A0 lxvd2x vs32, o0, A0
lxvd2x vs33, o16, A0 lxvd2x vs33, o16, A0
@ -383,14 +427,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvd2x vs46, o32, T1 stxvd2x vs46, o32, T1
stxvd2x vs47, o48, T1 stxvd2x vs47, o48, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=2 and M=8 * Macros for N=2 and M=8
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_2x8', `
#else
.macro COPY_2x8 .macro COPY_2x8
#endif
lxvd2x vs32, o0, A0 lxvd2x vs32, o0, A0
lxvd2x vs33, o16, A0 lxvd2x vs33, o16, A0
@ -420,14 +472,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvd2x vs38, o32, T1 stxvd2x vs38, o32, T1
stxvd2x vs39, o48, T1 stxvd2x vs39, o48, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=2 and M=4 * Macros for N=2 and M=4
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_2x4', `
#else
.macro COPY_2x4 .macro COPY_2x4
#endif
lxvd2x vs32, o0, A0 lxvd2x vs32, o0, A0
lxvd2x vs33, o16, A0 lxvd2x vs33, o16, A0
@ -447,14 +507,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvd2x vs34, o32, T1 stxvd2x vs34, o32, T1
stxvd2x vs35, o48, T1 stxvd2x vs35, o48, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=2 and M=2 * Macros for N=2 and M=2
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_2x2', `
#else
.macro COPY_2x2 .macro COPY_2x2
#endif
lxvd2x vs32, o0, A0 lxvd2x vs32, o0, A0
addi A0, A0, 16 addi A0, A0, 16
@ -470,14 +538,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvd2x vs33, o16, T1 stxvd2x vs33, o16, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=2 and M=1 * Macros for N=2 and M=1
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_2x1', `
#else
.macro COPY_2x1 .macro COPY_2x1
#endif
lxsdx vs32, o0, A0 lxsdx vs32, o0, A0
addi A0, A0, 8 addi A0, A0, 8
@ -493,14 +569,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxsdx vs33, o8, T1 stxsdx vs33, o8, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=1 and M=16 * Macros for N=1 and M=16
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_1x16', `
#else
.macro COPY_1x16 .macro COPY_1x16
#endif
lxvd2x vs32, o0, A0 lxvd2x vs32, o0, A0
lxvd2x vs33, o16, A0 lxvd2x vs33, o16, A0
@ -528,14 +612,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvd2x vs38, o32, T1 stxvd2x vs38, o32, T1
stxvd2x vs39, o48, T1 stxvd2x vs39, o48, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=1 and M=8 * Macros for N=1 and M=8
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_1x8', `
#else
.macro COPY_1x8 .macro COPY_1x8
#endif
lxvd2x vs32, o0, A0 lxvd2x vs32, o0, A0
lxvd2x vs33, o16, A0 lxvd2x vs33, o16, A0
@ -551,14 +643,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvd2x vs34, o32, T1 stxvd2x vs34, o32, T1
stxvd2x vs35, o48, T1 stxvd2x vs35, o48, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=1 and M=4 * Macros for N=1 and M=4
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_1x4', `
#else
.macro COPY_1x4 .macro COPY_1x4
#endif
lxvd2x vs32, o0, A0 lxvd2x vs32, o0, A0
lxvd2x vs33, o16, A0 lxvd2x vs33, o16, A0
@ -570,14 +670,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvd2x vs32, o0, T1 stxvd2x vs32, o0, T1
stxvd2x vs33, o16, T1 stxvd2x vs33, o16, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=1 and M=2 * Macros for N=1 and M=2
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_1x2', `
#else
.macro COPY_1x2 .macro COPY_1x2
#endif
lxvd2x vs32, o0, A0 lxvd2x vs32, o0, A0
addi A0, A0, 16 addi A0, A0, 16
@ -587,14 +695,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvd2x vs32, o0, T1 stxvd2x vs32, o0, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=1 and M=1 * Macros for N=1 and M=1
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_1x1', `
#else
.macro COPY_1x1 .macro COPY_1x1
#endif
lxsdx vs32, o0, A0 lxsdx vs32, o0, A0
addi A0, A0, 8 addi A0, A0, 8
@ -604,5 +720,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxsdx vs32, o0, T1 stxsdx vs32, o0, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif

View File

@ -46,7 +46,7 @@ static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y
( (
"lxvd2x 34, 0, %10 \n\t" // x0, x1 "lxvd2x 34, 0, %10 \n\t" // x0, x1
"lxvd2x 35, %11, %10 \n\t" // x2, x3 "lxvd2x 35, %11, %10 \n\t" // x2, x3
"xxspltd 32, %x9, 0 \n\t" // alpha, alpha XXSPLTD_S(32,%x9,0) // alpha, alpha
"sldi %6, %13, 3 \n\t" // lda * sizeof (double) "sldi %6, %13, 3 \n\t" // lda * sizeof (double)
@ -56,10 +56,10 @@ static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y
"add %4, %3, %6 \n\t" // a0 = ap, a1 = a0 + lda "add %4, %3, %6 \n\t" // a0 = ap, a1 = a0 + lda
"add %6, %6, %6 \n\t" // 2 * lda "add %6, %6, %6 \n\t" // 2 * lda
"xxspltd 32, 34, 0 \n\t" // x0 * alpha, x0 * alpha XXSPLTD_S(32,34,0) // x0 * alpha, x0 * alpha
"xxspltd 33, 34, 1 \n\t" // x1 * alpha, x1 * alpha XXSPLTD_S(33,34,1) // x1 * alpha, x1 * alpha
"xxspltd 34, 35, 0 \n\t" // x2 * alpha, x2 * alpha XXSPLTD_S(34,35,0) // x2 * alpha, x2 * alpha
"xxspltd 35, 35, 1 \n\t" // x3 * alpha, x3 * alpha XXSPLTD_S(35,35,1) // x3 * alpha, x3 * alpha
"add %5, %3, %6 \n\t" // a2 = a0 + 2 * lda "add %5, %3, %6 \n\t" // a2 = a0 + 2 * lda
"add %6, %4, %6 \n\t" // a3 = a1 + 2 * lda "add %6, %4, %6 \n\t" // a3 = a1 + 2 * lda
@ -89,10 +89,10 @@ static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y
"addi %6, %6, 32 \n\t" "addi %6, %6, 32 \n\t"
"addic. %1, %1, -4 \n\t" "addic. %1, %1, -4 \n\t"
"ble 2f \n\t" "ble two%= \n\t"
".p2align 5 \n" ".align 5 \n"
"1: \n\t" "one%=: \n\t"
"lxvd2x 36, 0, %2 \n\t" // y0, y1 "lxvd2x 36, 0, %2 \n\t" // y0, y1
"lxvd2x 37, %11, %2 \n\t" // y2, y3 "lxvd2x 37, %11, %2 \n\t" // y2, y3
@ -131,7 +131,7 @@ static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y
"addi %2, %2, 32 \n\t" "addi %2, %2, 32 \n\t"
"addic. %1, %1, -4 \n\t" "addic. %1, %1, -4 \n\t"
"ble 2f \n\t" "ble two%= \n\t"
"lxvd2x 36, 0, %2 \n\t" // y0, y1 "lxvd2x 36, 0, %2 \n\t" // y0, y1
@ -171,7 +171,7 @@ static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y
"addi %2, %2, 32 \n\t" "addi %2, %2, 32 \n\t"
"addic. %1, %1, -4 \n\t" "addic. %1, %1, -4 \n\t"
"ble 2f \n\t" "ble two%= \n\t"
"lxvd2x 36, 0, %2 \n\t" // y0, y1 "lxvd2x 36, 0, %2 \n\t" // y0, y1
@ -211,7 +211,7 @@ static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y
"addi %2, %2, 32 \n\t" "addi %2, %2, 32 \n\t"
"addic. %1, %1, -4 \n\t" "addic. %1, %1, -4 \n\t"
"ble 2f \n\t" "ble two%= \n\t"
"lxvd2x 36, 0, %2 \n\t" // y0, y1 "lxvd2x 36, 0, %2 \n\t" // y0, y1
@ -251,9 +251,9 @@ static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y
"addi %2, %2, 32 \n\t" "addi %2, %2, 32 \n\t"
"addic. %1, %1, -4 \n\t" "addic. %1, %1, -4 \n\t"
"bgt 1b \n" "bgt one%= \n"
"2: \n\t" "two%=: \n\t"
"lxvd2x 36, 0, %2 \n\t" // y0, y1 "lxvd2x 36, 0, %2 \n\t" // y0, y1
"lxvd2x 37, %11, %2 \n\t" // y2, y3 "lxvd2x 37, %11, %2 \n\t" // y2, y3

@ -93,11 +93,11 @@ static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, do
"li %[off],32 \n\t" "li %[off],32 \n\t"
"ble- 2f \n\t" "ble- two%= \n\t"
//-------------------------------------------------- //--------------------------------------------------
".p2align 5 \n\t" ".align 5 \n\t"
"1: \n\t" "one%=: \n\t"
"xvmaddadp 34,36,32 \n\t" "xvmaddadp 34,36,32 \n\t"
"xvmaddadp 35,38,32 \n\t" "xvmaddadp 35,38,32 \n\t"
"addi %[off2], %[off2],32 \n\t" "addi %[off2], %[off2],32 \n\t"
@ -137,7 +137,7 @@ static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, do
"lxvd2x 49, %[a6], %[off2] \n\t" "lxvd2x 49, %[a6], %[off2] \n\t"
"lxvd2x 51, %[a7], %[off2] \n\t" "lxvd2x 51, %[a7], %[off2] \n\t"
"lxvd2x 33, %[x], %[off2] \n\t" "lxvd2x 33, %[x], %[off2] \n\t"
"ble- 2f \n\t" "ble- two%= \n\t"
"xvmaddadp 34,36,32 \n\t" "xvmaddadp 34,36,32 \n\t"
"xvmaddadp 35,38,32 \n\t" "xvmaddadp 35,38,32 \n\t"
"addi %[off2], %[off2],32 \n\t" "addi %[off2], %[off2],32 \n\t"
@ -177,7 +177,7 @@ static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, do
"lxvd2x 49, %[a6], %[off2] \n\t" "lxvd2x 49, %[a6], %[off2] \n\t"
"lxvd2x 51, %[a7], %[off2] \n\t" "lxvd2x 51, %[a7], %[off2] \n\t"
"lxvd2x 33, %[x], %[off2] \n\t" "lxvd2x 33, %[x], %[off2] \n\t"
"ble- 2f \n\t" "ble- two%= \n\t"
"xvmaddadp 34,36,32 \n\t" "xvmaddadp 34,36,32 \n\t"
"xvmaddadp 35,38,32 \n\t" "xvmaddadp 35,38,32 \n\t"
#if defined(PREFETCH) #if defined(PREFETCH)
@ -229,7 +229,7 @@ static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, do
"lxvd2x 33, %[x], %[off2] \n\t" "lxvd2x 33, %[x], %[off2] \n\t"
"addic. %[n],%[n],-4 \n\t" "addic. %[n],%[n],-4 \n\t"
"ble- 2f \n\t" "ble- two%= \n\t"
"addi %[off2], %[off2],32 \n\t" "addi %[off2], %[off2],32 \n\t"
#if defined(PREFETCH) #if defined(PREFETCH)
@ -288,9 +288,9 @@ static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, do
#if defined(PREFETCH) #if defined(PREFETCH)
"dcbt %[temp],%[x] \n\t" "dcbt %[temp],%[x] \n\t"
#endif #endif
"bgt+ 1b \n\t" "bgt+ one%= \n\t"
".p2align 5 \n\t" ".align 5 \n\t"
"2: \n\t" "two%=: \n\t"
//-------------------------------------------- //--------------------------------------------
"xvmaddadp 34,36,32 \n\t" "xvmaddadp 34,36,32 \n\t"
@ -301,7 +301,7 @@ static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, do
"xvmaddadp 7,46,32 \n\t" "xvmaddadp 7,46,32 \n\t"
"xvmaddadp 8,48,32 \n\t" "xvmaddadp 8,48,32 \n\t"
"xvmaddadp 9,50,32 \n\t" "xvmaddadp 9,50,32 \n\t"
"xxspltd 36, %x[alpha], 0 \n\t" XXSPLTD_S(36,%x[alpha],0)
"xvmaddadp 34,37,33 \n\t" "xvmaddadp 34,37,33 \n\t"
"xvmaddadp 35,39,33 \n\t" "xvmaddadp 35,39,33 \n\t"
"xvmaddadp 4,41,33 \n\t" "xvmaddadp 4,41,33 \n\t"
@ -322,21 +322,21 @@ static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, do
"xxmrgld 42,34,35 \n\t" XXMRGLD_S(42,34,35)
"xxmrghd 43,34,35 \n\t" XXMRGHD_S(43,34,35)
"xxmrgld 44,4,5 \n\t" XXMRGLD_S(44,4,5)
"xxmrghd 45,4,5 \n\t" XXMRGHD_S(45,4,5)
"xvadddp 42,42,43 \n\t" "xvadddp 42,42,43 \n\t"
"xxmrgld 46,6,7 \n\t" XXMRGLD_S(46,6,7)
"xxmrghd 47,6,7 \n\t" XXMRGHD_S(47,6,7)
"xvadddp 44,44,45 \n\t" "xvadddp 44,44,45 \n\t"
"xxmrgld 48,8,9 \n\t" XXMRGLD_S(48,8,9)
"xxmrghd 49,8,9 \n\t" XXMRGHD_S(49,8,9)
"xvadddp 46,46,47 \n\t" "xvadddp 46,46,47 \n\t"

@ -51,8 +51,8 @@ static void drot_kernel_16 (long n, double *x, double *y, double c, double s)
__asm__ __asm__
( (
"xxspltd 36, %x13, 0 \n\t" // load c to both dwords XXSPLTD_S(36,%x13,0) // load c to both dwords
"xxspltd 37, %x14, 0 \n\t" // load s to both dwords XXSPLTD_S(37,%x14,0) // load s to both dwords
"lxvd2x 32, 0, %3 \n\t" // load x "lxvd2x 32, 0, %3 \n\t" // load x
"lxvd2x 33, %15, %3 \n\t" "lxvd2x 33, %15, %3 \n\t"
@ -68,10 +68,10 @@ static void drot_kernel_16 (long n, double *x, double *y, double c, double s)
"addi %4, %4, 64 \n\t" "addi %4, %4, 64 \n\t"
"addic. %2, %2, -8 \n\t" "addic. %2, %2, -8 \n\t"
"ble 2f \n\t" "ble two%= \n\t"
".p2align 5 \n" ".align 5 \n"
"1: \n\t" "one%=: \n\t"
"xvmuldp 40, 32, 36 \n\t" // c * x "xvmuldp 40, 32, 36 \n\t" // c * x
"xvmuldp 41, 33, 36 \n\t" "xvmuldp 41, 33, 36 \n\t"
@ -135,9 +135,9 @@ static void drot_kernel_16 (long n, double *x, double *y, double c, double s)
"addi %4, %4, 128 \n\t" "addi %4, %4, 128 \n\t"
"addic. %2, %2, -8 \n\t" "addic. %2, %2, -8 \n\t"
"bgt 1b \n" "bgt one%= \n"
"2: \n\t" "two%=: \n\t"
"xvmuldp 40, 32, 36 \n\t" // c * x "xvmuldp 40, 32, 36 \n\t" // c * x
"xvmuldp 41, 33, 36 \n\t" "xvmuldp 41, 33, 36 \n\t"

@ -41,7 +41,7 @@ static void dscal_kernel_8 (long n, double *x, double alpha)
( (
"dcbt 0, %2 \n\t" "dcbt 0, %2 \n\t"
"xxspltd %x3, %x3, 0 \n\t" XXSPLTD_S(%x3,%x3,0)
"lxvd2x 32, 0, %2 \n\t" "lxvd2x 32, 0, %2 \n\t"
"lxvd2x 33, %4, %2 \n\t" "lxvd2x 33, %4, %2 \n\t"
@ -55,10 +55,10 @@ static void dscal_kernel_8 (long n, double *x, double alpha)
"addi %2, %2, 128 \n\t" "addi %2, %2, 128 \n\t"
"addic. %1, %1, -16 \n\t" "addic. %1, %1, -16 \n\t"
"ble 2f \n\t" "ble two%= \n\t"
".p2align 5 \n" ".align 5 \n"
"1: \n\t" "one%=: \n\t"
"xvmuldp 40, 32, %x3 \n\t" "xvmuldp 40, 32, %x3 \n\t"
"xvmuldp 41, 33, %x3 \n\t" "xvmuldp 41, 33, %x3 \n\t"
@ -91,9 +91,9 @@ static void dscal_kernel_8 (long n, double *x, double alpha)
"addi %2, %2, 256 \n\t" "addi %2, %2, 256 \n\t"
"addic. %1, %1, -16 \n\t" "addic. %1, %1, -16 \n\t"
"bgt 1b \n" "bgt one%= \n"
"2: \n\t" "two%=: \n\t"
"xvmuldp 40, 32, %x3 \n\t" "xvmuldp 40, 32, %x3 \n\t"
"xvmuldp 41, 33, %x3 \n\t" "xvmuldp 41, 33, %x3 \n\t"
@ -146,8 +146,8 @@ static void dscal_kernel_8_zero (long n, double *x)
( (
"xxlxor %x3, %x3, %x3 \n\t" "xxlxor %x3, %x3, %x3 \n\t"
".p2align 5 \n" ".align 5 \n"
"1: \n\t" "one%=: \n\t"
"stxvd2x %x3, 0, %2 \n\t" "stxvd2x %x3, 0, %2 \n\t"
"stxvd2x %x3, %4, %2 \n\t" "stxvd2x %x3, %4, %2 \n\t"
@ -161,7 +161,7 @@ static void dscal_kernel_8_zero (long n, double *x)
"addi %2, %2, 128 \n\t" "addi %2, %2, 128 \n\t"
"addic. %1, %1, -16 \n\t" "addic. %1, %1, -16 \n\t"
"bgt 1b \n" "bgt one%= \n"
"#n=%1 x=%0=%2 t0=%x3 o16=%4 o32=%5 o48=%6 o64=%7 o80=%8 o96=%9 o112=%10" "#n=%1 x=%0=%2 t0=%x3 o16=%4 o32=%5 o48=%6 o64=%7 o80=%8 o96=%9 o112=%10"
: :

@ -39,8 +39,8 @@ static void dswap_kernel_32 (long n, double *x, double *y)
{ {
__asm__ __asm__
( (
".p2align 5 \n" ".align 5 \n"
"1: \n\t" "one%=: \n\t"
"lxvd2x 32, 0, %4 \n\t" "lxvd2x 32, 0, %4 \n\t"
"lxvd2x 33, %5, %4 \n\t" "lxvd2x 33, %5, %4 \n\t"
@ -131,7 +131,7 @@ static void dswap_kernel_32 (long n, double *x, double *y)
"addi %4, %4, 128 \n\t" "addi %4, %4, 128 \n\t"
"addic. %2, %2, -32 \n\t" "addic. %2, %2, -32 \n\t"
"bgt 1b \n" "bgt one%= \n"
"#n=%2 x=%0=%3 y=%1=%4 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11" "#n=%2 x=%0=%3 y=%1=%4 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11"
: :

File diff suppressed because it is too large

File diff suppressed because it is too large
@ -58,8 +58,8 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) {
"lxvd2x 47, %[i48],%[ptr_tmp] \n\t" "lxvd2x 47, %[i48],%[ptr_tmp] \n\t"
"lxvd2x 48, %[i64],%[ptr_tmp] \n\t" "lxvd2x 48, %[i64],%[ptr_tmp] \n\t"
"lxvd2x 49, %[i80],%[ptr_tmp] \n\t" "lxvd2x 49, %[i80],%[ptr_tmp] \n\t"
"lxvd2x 50, %[i96],%[ptr_tmp] \n\t" "lxvd2x 6, %[i96],%[ptr_tmp] \n\t"
"lxvd2x 51,%[i112],%[ptr_tmp] \n\t" "lxvd2x 7,%[i112],%[ptr_tmp] \n\t"
"xxlor 40,%x[start],%x[start] \n\t" //{ 1,0} vs40 | v8 "xxlor 40,%x[start],%x[start] \n\t" //{ 1,0} vs40 | v8
"vaddudm 9,8,%[adder] \n\t" //{3,2} vs41 "vaddudm 9,8,%[adder] \n\t" //{3,2} vs41
@ -69,7 +69,7 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) {
"vaddudm 11,10,%[adder] \n\t" //{7,6} vs43 "vaddudm 11,10,%[adder] \n\t" //{7,6} vs43
"xxlxor 39,39,39 \n\t" // vs39 vec_max_value "xxlxor 39,39,39 \n\t" // vs39 vec_max_value
"vaddudm 4,11, %[adder] \n\t" // {9,8} -{8;8} vs36 | v4 "vaddudm 4,11, %[adder] \n\t" // {9,8} -{8;8} vs36 | v4
"xxspltd 36,36,0 \n\t" XXSPLTD_S(36,36,0)
"xvabsdp 44, 44 \n\t" "xvabsdp 44, 44 \n\t"
"xvabsdp 45, 45 \n\t" "xvabsdp 45, 45 \n\t"
@ -77,21 +77,21 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) {
"xvabsdp 47, 47 \n\t" "xvabsdp 47, 47 \n\t"
"xvabsdp 48, 48 \n\t" "xvabsdp 48, 48 \n\t"
"xvabsdp 49, 49 \n\t" "xvabsdp 49, 49 \n\t"
"xvabsdp 50, 50 \n\t" "xvabsdp 6, 6 \n\t"
"xvabsdp 51, 51 \n\t" "xvabsdp 7, 7 \n\t"
//jump first half forward //jump first half forward
"b 2f \n\t" "b two%= \n\t"
//=================================================================== //===================================================================
".p2align 5 \n\t" ".align 5 \n\t"
"1: \n\t" "one%=: \n\t"
"xvcmpgtdp 2,45,44 \n\t " "xvcmpgtdp 2,45,44 \n\t "
"xvcmpgtdp 3,47,46 \n\t " "xvcmpgtdp 3,47,46 \n\t "
"xvcmpgtdp 4,49,48 \n\t " "xvcmpgtdp 4,49,48 \n\t "
"xvcmpgtdp 5,51,50 \n\t" "xvcmpgtdp 5,7,6 \n\t"
"xxsel 32,40,41,2 \n\t" "xxsel 32,40,41,2 \n\t"
"xxsel 0,44,45,2 \n\t" "xxsel 0,44,45,2 \n\t"
@ -100,7 +100,7 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) {
"xxsel 34,40,41,4 \n\t" "xxsel 34,40,41,4 \n\t"
"xxsel 45,48,49,4 \n\t" "xxsel 45,48,49,4 \n\t"
"xxsel 35,42,43,5 \n\t" "xxsel 35,42,43,5 \n\t"
"xxsel 47,50,51,5 \n\t" "xxsel 47,6,7,5 \n\t"
"xvcmpgtdp 2, 1,0 \n\t" "xvcmpgtdp 2, 1,0 \n\t"
"xvcmpgtdp 3,47, 45 \n\t" "xvcmpgtdp 3,47, 45 \n\t"
@ -134,8 +134,8 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) {
"vaddudm 1,1,5 \n\t" // get real index for first bigger "vaddudm 1,1,5 \n\t" // get real index for first bigger
"lxvd2x 50, %[i96],%[ptr_tmp] \n\t" "lxvd2x 6, %[i96],%[ptr_tmp] \n\t"
"lxvd2x 51,%[i112],%[ptr_tmp] \n\t" "lxvd2x 7,%[i112],%[ptr_tmp] \n\t"
//compare with previous to get vec_max_index(v6 | vs38 ) and vec_max_value (vs39) //compare with previous to get vec_max_index(v6 | vs38 ) and vec_max_value (vs39)
"xvcmpgtdp 2, 3,39 \n\t" "xvcmpgtdp 2, 3,39 \n\t"
@ -155,16 +155,16 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) {
"xvabsdp 48, 48 \n\t" "xvabsdp 48, 48 \n\t"
"xvabsdp 49, 49 \n\t" "xvabsdp 49, 49 \n\t"
"xvabsdp 50, 50 \n\t" "xvabsdp 6, 6 \n\t"
"xvabsdp 51, 51 \n\t" "xvabsdp 7, 7 \n\t"
//<-----------jump here from first load //<-----------jump here from first load
"2: \n\t" "two%=: \n\t"
"xvcmpgtdp 2,45,44 \n\t " "xvcmpgtdp 2,45,44 \n\t "
"xvcmpgtdp 3,47,46 \n\t " "xvcmpgtdp 3,47,46 \n\t "
"xvcmpgtdp 4,49,48 \n\t " "xvcmpgtdp 4,49,48 \n\t "
"xvcmpgtdp 5,51,50 \n\t" "xvcmpgtdp 5,7,6 \n\t"
"xxsel 32,40,41,2 \n\t" "xxsel 32,40,41,2 \n\t"
"xxsel 0,44,45,2 \n\t" "xxsel 0,44,45,2 \n\t"
@ -173,7 +173,7 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) {
"xxsel 34,40,41,4 \n\t" "xxsel 34,40,41,4 \n\t"
"xxsel 45,48,49,4 \n\t" "xxsel 45,48,49,4 \n\t"
"xxsel 35,42,43,5 \n\t" "xxsel 35,42,43,5 \n\t"
"xxsel 47,50,51,5 \n\t" "xxsel 47,6,7,5 \n\t"
"xvcmpgtdp 2, 1,0 \n\t" "xvcmpgtdp 2, 1,0 \n\t"
"xvcmpgtdp 3,47, 45 \n\t" "xvcmpgtdp 3,47, 45 \n\t"
@ -203,8 +203,8 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) {
"vaddudm 1,1,5 \n\t" // get real index for first bigger "vaddudm 1,1,5 \n\t" // get real index for first bigger
"lxvd2x 50, %[i96],%[ptr_tmp] \n\t" "lxvd2x 6, %[i96],%[ptr_tmp] \n\t"
"lxvd2x 51,%[i112],%[ptr_tmp] \n\t" "lxvd2x 7,%[i112],%[ptr_tmp] \n\t"
@ -226,21 +226,21 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) {
"xvabsdp 48, 48 \n\t" "xvabsdp 48, 48 \n\t"
"xvabsdp 49, 49 \n\t" "xvabsdp 49, 49 \n\t"
"xvabsdp 50, 50 \n\t" "xvabsdp 6, 6 \n\t"
"xvabsdp 51, 51 \n\t" "xvabsdp 7, 7 \n\t"
//decrement n //decrement n
"addic. %[n], %[n], -32 \n\t" "addic. %[n], %[n], -32 \n\t"
//Loop back if >0 //Loop back if >0
"bgt+ 1b \n\t" "bgt+ one%= \n\t"
//============================================================================== //==============================================================================
"xvcmpgtdp 2,45,44 \n\t " "xvcmpgtdp 2,45,44 \n\t "
"xvcmpgtdp 3,47,46 \n\t " "xvcmpgtdp 3,47,46 \n\t "
"xvcmpgtdp 4,49,48 \n\t " "xvcmpgtdp 4,49,48 \n\t "
"xvcmpgtdp 5,51,50 \n\t" "xvcmpgtdp 5,7,6 \n\t"
"xxsel 32,40,41,2 \n\t" "xxsel 32,40,41,2 \n\t"
"xxsel 0,44,45,2 \n\t" "xxsel 0,44,45,2 \n\t"
@ -249,7 +249,7 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) {
"xxsel 34,40,41,4 \n\t" "xxsel 34,40,41,4 \n\t"
"xxsel 45,48,49,4 \n\t" "xxsel 45,48,49,4 \n\t"
"xxsel 35,42,43,5 \n\t" "xxsel 35,42,43,5 \n\t"
"xxsel 47,50,51,5 \n\t" "xxsel 47,6,7,5 \n\t"
"xvcmpgtdp 2, 1,0 \n\t" "xvcmpgtdp 2, 1,0 \n\t"
"xvcmpgtdp 3,47, 45 \n\t" "xvcmpgtdp 3,47, 45 \n\t"
@ -276,28 +276,28 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) {
///////extract max value and max index from vector ///////extract max value and max index from vector
"xxspltd 32,38,1 \n\t" XXSPLTD_S(32,38,1)
"xxspltd 40,39,1 \n\t" XXSPLTD_S(40,39,1)
"xvcmpeqdp. 2, 40,39 \n\t" "xvcmpeqdp. 2, 40,39 \n\t"
//cr6 0 bit set if all true, cr6=4*6+bit_ind=24,0011at CR(BI)==1, at=10 hint that it occurs rarely //cr6 0 bit set if all true, cr6=4*6+bit_ind=24,0011at CR(BI)==1, at=10 hint that it occurs rarely
//0b001110=14 //0b001110=14
"bc 14,24, 3f \n\t" "bc 14,24, three%= \n\t"
"xvcmpgtdp 4, 40,39 \n\t" "xvcmpgtdp 4, 40,39 \n\t"
"xxsel 0,39,40,4 \n\t" "xxsel 0,39,40,4 \n\t"
"xxsel 1,38,32,4 \n\t" "xxsel 1,38,32,4 \n\t"
"stxsdx 0,0,%[ptr_maxf] \n\t" "stxsdx 0,0,%[ptr_maxf] \n\t"
"b 4f \n\t" "b four%= \n\t"
"3: \n\t" "three%=: \n\t"
//if elements value are equal then choose minimum index //if elements value are equal then choose minimum index
"xxspltd 0,40,0 \n\t" XXSPLTD_S(0,40,0)
"vminud 0,0,6 \n\t" //vs32 vs38 "vminud 0,0,6 \n\t" //vs32 vs38
"xxlor 1,32,32 \n\t" "xxlor 1,32,32 \n\t"
"stxsdx 0,0,%[ptr_maxf] \n\t" "stxsdx 0,0,%[ptr_maxf] \n\t"
"4: \n\t" "four%=: \n\t"
"mfvsrd %[index],1 \n\t" "mfvsrd %[index],1 \n\t"
: [maxf] "=m"(*maxf),[ptr_tmp] "+&b"(x),[index] "=r"(index), [n] "+&r"(n) : [maxf] "=m"(*maxf),[ptr_tmp] "+&b"(x),[index] "=r"(index), [n] "+&r"(n)
@ -306,7 +306,7 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) {
[i64] "b"(64), [i80] "b"(80), [i96] "b"(96), [i112] "b"(112), [i64] "b"(64), [i80] "b"(80), [i96] "b"(96), [i112] "b"(112),
[start] "v"(start), [adder] "v"(temp_add_index) [start] "v"(start), [adder] "v"(temp_add_index)
: "cc", "vs0", "vs1","vs2","vs3", "vs4","vs5","vs32", "vs33", "vs34", "vs35", "vs36", : "cc", "vs0", "vs1","vs2","vs3", "vs4","vs5","vs32", "vs33", "vs34", "vs35", "vs36",
"vs37", "vs38", "vs39", "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49", "vs50", "vs51" "vs37", "vs38", "vs39", "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49", "vs6", "vs7"
); );

@ -58,8 +58,8 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) {
"lxvd2x 47, %[i48],%[ptr_tmp] \n\t" "lxvd2x 47, %[i48],%[ptr_tmp] \n\t"
"lxvd2x 48, %[i64],%[ptr_tmp] \n\t" "lxvd2x 48, %[i64],%[ptr_tmp] \n\t"
"lxvd2x 49, %[i80],%[ptr_tmp] \n\t" "lxvd2x 49, %[i80],%[ptr_tmp] \n\t"
"lxvd2x 50, %[i96],%[ptr_tmp] \n\t" "lxvd2x 6, %[i96],%[ptr_tmp] \n\t"
"lxvd2x 51,%[i112],%[ptr_tmp] \n\t" "lxvd2x 7,%[i112],%[ptr_tmp] \n\t"
"xxlor 40,%x[start],%x[start] \n\t" //{ 1,0} vs40 | v8 "xxlor 40,%x[start],%x[start] \n\t" //{ 1,0} vs40 | v8
"vaddudm 9,8, %[adder] \n\t" //{3,2} vs41 "vaddudm 9,8, %[adder] \n\t" //{3,2} vs41
@ -69,7 +69,7 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) {
"vaddudm 11,10,%[adder] \n\t" //{7,6} vs43 "vaddudm 11,10,%[adder] \n\t" //{7,6} vs43
"lxvdsx 39,0,%[ptr_minf] \n\t" // vs39 vec_min_value "lxvdsx 39,0,%[ptr_minf] \n\t" // vs39 vec_min_value
"vaddudm 4,11, %[adder] \n\t" // {9,8} -{8;8} vs36 | v4 "vaddudm 4,11, %[adder] \n\t" // {9,8} -{8;8} vs36 | v4
"xxspltd 36,36,0 \n\t" XXSPLTD_S(36,36,0)
"xvabsdp 39, 39 \n\t" "xvabsdp 39, 39 \n\t"
"xvabsdp 44, 44 \n\t" "xvabsdp 44, 44 \n\t"
@ -78,21 +78,21 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) {
"xvabsdp 47, 47 \n\t" "xvabsdp 47, 47 \n\t"
"xvabsdp 48, 48 \n\t" "xvabsdp 48, 48 \n\t"
"xvabsdp 49, 49 \n\t" "xvabsdp 49, 49 \n\t"
"xvabsdp 50, 50 \n\t" "xvabsdp 6, 6 \n\t"
"xvabsdp 51, 51 \n\t" "xvabsdp 7, 7 \n\t"
//jump first half forward //jump first half forward
"b 2f \n\t" "b two%= \n\t"
//=================================================================== //===================================================================
".p2align 5 \n\t" ".align 5 \n\t"
"1: \n\t" "one%=: \n\t"
"xvcmpgtdp 2,44,45 \n\t " "xvcmpgtdp 2,44,45 \n\t "
"xvcmpgtdp 3,46,47 \n\t " "xvcmpgtdp 3,46,47 \n\t "
"xvcmpgtdp 4,48,49 \n\t " "xvcmpgtdp 4,48,49 \n\t "
"xvcmpgtdp 5,50,51 \n\t" "xvcmpgtdp 5,6,7 \n\t"
"xxsel 32,40,41,2 \n\t" "xxsel 32,40,41,2 \n\t"
"xxsel 0,44,45,2 \n\t" "xxsel 0,44,45,2 \n\t"
@ -101,7 +101,7 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) {
"xxsel 34,40,41,4 \n\t" "xxsel 34,40,41,4 \n\t"
"xxsel 45,48,49,4 \n\t" "xxsel 45,48,49,4 \n\t"
"xxsel 35,42,43,5 \n\t" "xxsel 35,42,43,5 \n\t"
"xxsel 47,50,51,5 \n\t" "xxsel 47,6,7,5 \n\t"
"xvcmpgtdp 2,0, 1 \n\t" "xvcmpgtdp 2,0, 1 \n\t"
"xvcmpgtdp 3, 45,47 \n\t" "xvcmpgtdp 3, 45,47 \n\t"
@ -135,8 +135,8 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) {
"vaddudm 1,1,5 \n\t" // get real index for first smaller "vaddudm 1,1,5 \n\t" // get real index for first smaller
"lxvd2x 50, %[i96],%[ptr_tmp] \n\t" "lxvd2x 6, %[i96],%[ptr_tmp] \n\t"
"lxvd2x 51,%[i112],%[ptr_tmp] \n\t" "lxvd2x 7,%[i112],%[ptr_tmp] \n\t"
//compare with previous to get vec_min_index(v6 | vs38 ) and vec_min_value (vs39) //compare with previous to get vec_min_index(v6 | vs38 ) and vec_min_value (vs39)
"xvcmpgtdp 2,39, 3 \n\t" "xvcmpgtdp 2,39, 3 \n\t"
@ -156,16 +156,16 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) {
"xvabsdp 48, 48 \n\t" "xvabsdp 48, 48 \n\t"
"xvabsdp 49, 49 \n\t" "xvabsdp 49, 49 \n\t"
"xvabsdp 50, 50 \n\t" "xvabsdp 6, 6 \n\t"
"xvabsdp 51, 51 \n\t" "xvabsdp 7, 7 \n\t"
//<-----------jump here from first load //<-----------jump here from first load
"2: \n\t" "two%=: \n\t"
"xvcmpgtdp 2,44,45 \n\t " "xvcmpgtdp 2,44,45 \n\t "
"xvcmpgtdp 3,46,47 \n\t " "xvcmpgtdp 3,46,47 \n\t "
"xvcmpgtdp 4,48,49 \n\t " "xvcmpgtdp 4,48,49 \n\t "
"xvcmpgtdp 5,50,51 \n\t" "xvcmpgtdp 5,6,7 \n\t"
"xxsel 32,40,41,2 \n\t" "xxsel 32,40,41,2 \n\t"
"xxsel 0,44,45,2 \n\t" "xxsel 0,44,45,2 \n\t"
@ -174,7 +174,7 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) {
"xxsel 34,40,41,4 \n\t" "xxsel 34,40,41,4 \n\t"
"xxsel 45,48,49,4 \n\t" "xxsel 45,48,49,4 \n\t"
"xxsel 35,42,43,5 \n\t" "xxsel 35,42,43,5 \n\t"
"xxsel 47,50,51,5 \n\t" "xxsel 47,6,7,5 \n\t"
"xvcmpgtdp 2,0, 1 \n\t" "xvcmpgtdp 2,0, 1 \n\t"
"xvcmpgtdp 3, 45,47 \n\t" "xvcmpgtdp 3, 45,47 \n\t"
@ -204,8 +204,8 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) {
"vaddudm 1,1,5 \n\t" // get real index for first smaller "vaddudm 1,1,5 \n\t" // get real index for first smaller
"lxvd2x 50, %[i96],%[ptr_tmp] \n\t" "lxvd2x 6, %[i96],%[ptr_tmp] \n\t"
"lxvd2x 51,%[i112],%[ptr_tmp] \n\t" "lxvd2x 7,%[i112],%[ptr_tmp] \n\t"
@ -227,21 +227,21 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) {
"xvabsdp 48, 48 \n\t" "xvabsdp 48, 48 \n\t"
"xvabsdp 49, 49 \n\t" "xvabsdp 49, 49 \n\t"
"xvabsdp 50, 50 \n\t" "xvabsdp 6, 6 \n\t"
"xvabsdp 51, 51 \n\t" "xvabsdp 7, 7 \n\t"
//decrement n //decrement n
"addic. %[n], %[n], -32 \n\t" "addic. %[n], %[n], -32 \n\t"
//Loop back if >0 //Loop back if >0
"bgt+ 1b \n\t" "bgt+ one%= \n\t"
//============================================================================== //==============================================================================
"xvcmpgtdp 2,44,45 \n\t " "xvcmpgtdp 2,44,45 \n\t "
"xvcmpgtdp 3,46,47 \n\t " "xvcmpgtdp 3,46,47 \n\t "
"xvcmpgtdp 4,48,49 \n\t " "xvcmpgtdp 4,48,49 \n\t "
"xvcmpgtdp 5,50,51 \n\t" "xvcmpgtdp 5,6,7 \n\t"
"xxsel 32,40,41,2 \n\t" "xxsel 32,40,41,2 \n\t"
"xxsel 0,44,45,2 \n\t" "xxsel 0,44,45,2 \n\t"
@ -250,7 +250,7 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) {
"xxsel 34,40,41,4 \n\t" "xxsel 34,40,41,4 \n\t"
"xxsel 45,48,49,4 \n\t" "xxsel 45,48,49,4 \n\t"
"xxsel 35,42,43,5 \n\t" "xxsel 35,42,43,5 \n\t"
"xxsel 47,50,51,5 \n\t" "xxsel 47,6,7,5 \n\t"
"xvcmpgtdp 2,0, 1 \n\t" "xvcmpgtdp 2,0, 1 \n\t"
"xvcmpgtdp 3, 45,47 \n\t" "xvcmpgtdp 3, 45,47 \n\t"
@ -277,28 +277,28 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) {
///////extract min value and min index from vector ///////extract min value and min index from vector
"xxspltd 32,38,1 \n\t" XXSPLTD_S(32,38,1)
"xxspltd 40,39,1 \n\t" XXSPLTD_S(40,39,1)
"xvcmpeqdp. 2, 40,39 \n\t" "xvcmpeqdp. 2, 40,39 \n\t"
//cr6 0 bit set if all true, cr6=4*6+bit_ind=24,0011at CR(BI)==1, at=10 hint that it occurs rarely //cr6 0 bit set if all true, cr6=4*6+bit_ind=24,0011at CR(BI)==1, at=10 hint that it occurs rarely
//0b001110=14 //0b001110=14
"bc 14,24, 3f \n\t" "bc 14,24, three%= \n\t"
"xvcmpgtdp 4,39, 40 \n\t" "xvcmpgtdp 4,39, 40 \n\t"
"xxsel 0,39,40,4 \n\t" "xxsel 0,39,40,4 \n\t"
"xxsel 1,38,32,4 \n\t" "xxsel 1,38,32,4 \n\t"
"stxsdx 0,0,%[ptr_minf] \n\t" "stxsdx 0,0,%[ptr_minf] \n\t"
"b 4f \n\t" "b four%= \n\t"
"3: \n\t" "three%=: \n\t"
//if elements value are equal then choose minimum index //if elements value are equal then choose minimum index
"xxspltd 0,40,0 \n\t" XXSPLTD_S(0,40,0)
"vminud 0,0,6 \n\t" //vs32 vs38 "vminud 0,0,6 \n\t" //vs32 vs38
"xxlor 1,32,32 \n\t" "xxlor 1,32,32 \n\t"
"stxsdx 0,0,%[ptr_minf] \n\t" "stxsdx 0,0,%[ptr_minf] \n\t"
"4: \n\t" "four%=: \n\t"
"mfvsrd %[index],1 \n\t" "mfvsrd %[index],1 \n\t"
: [minf] "=m"(*minf),[ptr_tmp] "+&b"(x),[index] "=r"(index), [n] "+&r"(n) : [minf] "=m"(*minf),[ptr_tmp] "+&b"(x),[index] "=r"(index), [n] "+&r"(n)
@ -307,7 +307,7 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) {
[i64] "b"(64), [i80] "b"(80), [i96] "b"(96), [i112] "b"(112), [i64] "b"(64), [i80] "b"(80), [i96] "b"(96), [i112] "b"(112),
[start] "v"(start), [adder] "v"(temp_add_index) [start] "v"(start), [adder] "v"(temp_add_index)
: "cc", "vs0", "vs1","vs2","vs3", "vs4","vs5","vs32", "vs33", "vs34", "vs35", "vs36", : "cc", "vs0", "vs1","vs2","vs3", "vs4","vs5","vs32", "vs33", "vs34", "vs35", "vs36",
"vs37", "vs38", "vs39", "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49", "vs50", "vs51" "vs37", "vs38", "vs39", "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49", "vs6", "vs7"
); );
return index; return index;

@ -56,8 +56,8 @@ static BLASLONG ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) {
"lxvd2x 47, %[i48],%[ptr_tmp] \n\t" "lxvd2x 47, %[i48],%[ptr_tmp] \n\t"
"lxvd2x 48, %[i64],%[ptr_tmp] \n\t" "lxvd2x 48, %[i64],%[ptr_tmp] \n\t"
"lxvd2x 49, %[i80],%[ptr_tmp] \n\t" "lxvd2x 49, %[i80],%[ptr_tmp] \n\t"
"lxvd2x 50, %[i96],%[ptr_tmp] \n\t" "lxvd2x 6, %[i96],%[ptr_tmp] \n\t"
"lxvd2x 51,%[i112],%[ptr_tmp] \n\t" "lxvd2x 7,%[i112],%[ptr_tmp] \n\t"
"xxlor 40,%x[start],%x[start] \n\t" //{ 1,0} vs40 | v8 "xxlor 40,%x[start],%x[start] \n\t" //{ 1,0} vs40 | v8
"vaddudm 9,8,%[adder] \n\t" //{3,2} vs41 "vaddudm 9,8,%[adder] \n\t" //{3,2} vs41
@ -67,7 +67,7 @@ static BLASLONG ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) {
"vaddudm 11,10,%[adder] \n\t" //{7,6} vs43 "vaddudm 11,10,%[adder] \n\t" //{7,6} vs43
"xxlxor 39,39,39 \n\t" // vs39 vec_max_value is zero "xxlxor 39,39,39 \n\t" // vs39 vec_max_value is zero
"vaddudm 4,11, %[adder] \n\t" // {9,8} -{8;8} vs36 | v4 "vaddudm 4,11, %[adder] \n\t" // {9,8} -{8;8} vs36 | v4
"xxspltd 36,36,0 \n\t" XXSPLTD_S(36,36,0)
@ -77,24 +77,24 @@ static BLASLONG ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) {
"xvabsdp 47, 47 \n\t" "xvabsdp 47, 47 \n\t"
"xvabsdp 48, 48 \n\t" "xvabsdp 48, 48 \n\t"
"xvabsdp 49, 49 \n\t" "xvabsdp 49, 49 \n\t"
"xvabsdp 50, 50 \n\t" "xvabsdp 6, 6 \n\t"
"xvabsdp 51, 51 \n\t" "xvabsdp 7, 7 \n\t"
//jump first half forward //jump first half forward
"b 2f \n\t" "b two%= \n\t"
".p2align 5 \n\t" ".align 5 \n\t"
"1: \n\t" "one%=: \n\t"
"xxmrghd 0,44,45 \n\t" XXMRGHD_S(0,44,45)
"xxmrgld 1,44,45 \n\t" XXMRGLD_S(1,44,45)
"xxmrghd 2,46,47 \n\t" XXMRGHD_S(2,46,47)
"xxmrgld 3,46,47 \n\t" XXMRGLD_S(3,46,47)
"xxmrghd 4,48,49 \n\t" XXMRGHD_S(4,48,49)
"xxmrgld 5,48,49 \n\t" XXMRGLD_S(5,48,49)
"xxmrghd 44,50,51 \n\t" XXMRGHD_S(44,6,7)
"xxmrgld 45,50,51 \n\t" XXMRGLD_S(45,6,7)
"xvadddp 46, 0,1 \n\t" "xvadddp 46, 0,1 \n\t"
"xvadddp 47, 2,3 \n\t" "xvadddp 47, 2,3 \n\t"
@ -103,15 +103,15 @@ static BLASLONG ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) {
"xvcmpgtdp 50,47,46 \n\t " "xvcmpgtdp 6,47,46 \n\t "
"xvcmpgtdp 51,49,48 \n\t " "xvcmpgtdp 7,49,48 \n\t "
"addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t" "addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t"
"xxsel 32,40,41,50 \n\t" "xxsel 32,40,41,6 \n\t"
"xxsel 0,46,47,50 \n\t" "xxsel 0,46,47,6 \n\t"
"xxsel 33,42,43,51 \n\t" "xxsel 33,42,43,7 \n\t"
"xxsel 1,48,49,51 \n\t" "xxsel 1,48,49,7 \n\t"
"lxvd2x 44, 0,%[ptr_tmp] \n\t" "lxvd2x 44, 0,%[ptr_tmp] \n\t"
"lxvd2x 45, %[i16],%[ptr_tmp] \n\t" "lxvd2x 45, %[i16],%[ptr_tmp] \n\t"
@ -133,8 +133,8 @@ static BLASLONG ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) {
"lxvd2x 48, %[i64],%[ptr_tmp] \n\t" "lxvd2x 48, %[i64],%[ptr_tmp] \n\t"
"lxvd2x 49, %[i80],%[ptr_tmp] \n\t" "lxvd2x 49, %[i80],%[ptr_tmp] \n\t"
"lxvd2x 50, %[i96],%[ptr_tmp] \n\t" "lxvd2x 6, %[i96],%[ptr_tmp] \n\t"
"lxvd2x 51,%[i112],%[ptr_tmp] \n\t" "lxvd2x 7,%[i112],%[ptr_tmp] \n\t"
//select with previous //select with previous
"xxsel 38,38,32,4 \n\t" "xxsel 38,38,32,4 \n\t"
"xxsel 39,39,3,4 \n\t" "xxsel 39,39,3,4 \n\t"
@ -148,35 +148,35 @@ static BLASLONG ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) {
"xvabsdp 47, 47 \n\t" "xvabsdp 47, 47 \n\t"
"xvabsdp 48, 48 \n\t" "xvabsdp 48, 48 \n\t"
"xvabsdp 49, 49 \n\t" "xvabsdp 49, 49 \n\t"
"xvabsdp 50, 50 \n\t" "xvabsdp 6, 6 \n\t"
"xvabsdp 51, 51 \n\t" "xvabsdp 7, 7 \n\t"
//>>/////////////////////////////// half start //>>/////////////////////////////// half start
"2: \n\t" "two%=: \n\t"
"xxmrghd 0,44,45 \n\t" XXMRGHD_S(0,44,45)
"xxmrgld 1,44,45 \n\t" XXMRGLD_S(1,44,45)
"xxmrghd 2,46,47 \n\t" XXMRGHD_S(2,46,47)
"xxmrgld 3,46,47 \n\t" XXMRGLD_S(3,46,47)
"xxmrghd 4,48,49 \n\t" XXMRGHD_S(4,48,49)
"xxmrgld 5,48,49 \n\t" XXMRGLD_S(5,48,49)
"xxmrghd 44,50,51 \n\t" XXMRGHD_S(44,6,7)
"xxmrgld 45,50,51 \n\t" XXMRGLD_S(45,6,7)
"xvadddp 46, 0,1 \n\t" "xvadddp 46, 0,1 \n\t"
"xvadddp 47, 2,3 \n\t" "xvadddp 47, 2,3 \n\t"
"xvadddp 48, 4,5 \n\t" "xvadddp 48, 4,5 \n\t"
"xvadddp 49, 44,45 \n\t" "xvadddp 49, 44,45 \n\t"
"xvcmpgtdp 50,47,46 \n\t " "xvcmpgtdp 6,47,46 \n\t "
"xvcmpgtdp 51,49,48 \n\t " "xvcmpgtdp 7,49,48 \n\t "
"addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t" "addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t"
"xxsel 32,40,41,50 \n\t" "xxsel 32,40,41,6 \n\t"
"xxsel 0,46,47,50 \n\t" "xxsel 0,46,47,6 \n\t"
"xxsel 33,42,43,51 \n\t" "xxsel 33,42,43,7 \n\t"
"xxsel 1,48,49,51 \n\t" "xxsel 1,48,49,7 \n\t"
"lxvd2x 44, 0,%[ptr_tmp] \n\t" "lxvd2x 44, 0,%[ptr_tmp] \n\t"
"lxvd2x 45, %[i16],%[ptr_tmp] \n\t" "lxvd2x 45, %[i16],%[ptr_tmp] \n\t"
@ -198,8 +198,8 @@ static BLASLONG ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) {
"lxvd2x 48, %[i64],%[ptr_tmp] \n\t" "lxvd2x 48, %[i64],%[ptr_tmp] \n\t"
"lxvd2x 49, %[i80],%[ptr_tmp] \n\t" "lxvd2x 49, %[i80],%[ptr_tmp] \n\t"
"lxvd2x 50, %[i96],%[ptr_tmp] \n\t" "lxvd2x 6, %[i96],%[ptr_tmp] \n\t"
"lxvd2x 51,%[i112],%[ptr_tmp] \n\t" "lxvd2x 7,%[i112],%[ptr_tmp] \n\t"
//select with previous //select with previous
"xxsel 38,38,32,4 \n\t" "xxsel 38,38,32,4 \n\t"
"xxsel 39,39,3,4 \n\t" "xxsel 39,39,3,4 \n\t"
@ -211,24 +211,24 @@ static BLASLONG ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) {
"xvabsdp 47, 47 \n\t" "xvabsdp 47, 47 \n\t"
"xvabsdp 48, 48 \n\t" "xvabsdp 48, 48 \n\t"
"xvabsdp 49, 49 \n\t" "xvabsdp 49, 49 \n\t"
"xvabsdp 50, 50 \n\t" "xvabsdp 6, 6 \n\t"
"xvabsdp 51, 51 \n\t" "xvabsdp 7, 7 \n\t"
//decrement n //decrement n
"addic. %[n], %[n], -16 \n\t" "addic. %[n], %[n], -16 \n\t"
//Loop back if >0 //Loop back if >0
"bgt+ 1b \n\t" "bgt+ one%= \n\t"
"xxmrghd 0,44,45 \n\t" XXMRGHD_S(0,44,45)
"xxmrgld 1,44,45 \n\t" XXMRGLD_S(1,44,45)
"xxmrghd 2,46,47 \n\t" XXMRGHD_S(2,46,47)
"xxmrgld 3,46,47 \n\t" XXMRGLD_S(3,46,47)
"xxmrghd 4,48,49 \n\t" XXMRGHD_S(4,48,49)
"xxmrgld 5,48,49 \n\t" XXMRGLD_S(5,48,49)
"xxmrghd 44,50,51 \n\t" XXMRGHD_S(44,6,7)
"xxmrgld 45,50,51 \n\t" XXMRGLD_S(45,6,7)
"xvadddp 46, 0,1 \n\t" "xvadddp 46, 0,1 \n\t"
"xvadddp 47, 2,3 \n\t" "xvadddp 47, 2,3 \n\t"
@ -237,13 +237,13 @@ static BLASLONG ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) {
"xvcmpgtdp 50,47,46 \n\t " "xvcmpgtdp 6,47,46 \n\t "
"xvcmpgtdp 51,49,48 \n\t " "xvcmpgtdp 7,49,48 \n\t "
"xxsel 32,40,41,50 \n\t" "xxsel 32,40,41,6 \n\t"
"xxsel 0,46,47,50 \n\t" "xxsel 0,46,47,6 \n\t"
"xxsel 33,42,43,51 \n\t" "xxsel 33,42,43,7 \n\t"
"xxsel 1,48,49,51 \n\t" "xxsel 1,48,49,7 \n\t"
"xvcmpgtdp 2,1,0 \n\t " "xvcmpgtdp 2,1,0 \n\t "
"xxsel 32,32,33,2 \n\t" "xxsel 32,32,33,2 \n\t"
@ -262,28 +262,28 @@ static BLASLONG ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) {
///////extract max value and max index from vector ///////extract max value and max index from vector
"xxspltd 32,38,1 \n\t" XXSPLTD_S(32,38,1)
"xxspltd 40,39,1 \n\t" XXSPLTD_S(40,39,1)
"xvcmpeqdp. 2, 40,39 \n\t" "xvcmpeqdp. 2, 40,39 \n\t"
//cr6 0 bit set if all true, cr6=4*6+bit_ind=24,0011at CR(BI)==1, at=10 hint that it occurs rarely //cr6 0 bit set if all true, cr6=4*6+bit_ind=24,0011at CR(BI)==1, at=10 hint that it occurs rarely
//0b001110=14 //0b001110=14
"bc 14,24, 3f \n\t" "bc 14,24, three%= \n\t"
"xvcmpgtdp 4, 40,39 \n\t" "xvcmpgtdp 4, 40,39 \n\t"
"xxsel 0,39,40,4 \n\t" "xxsel 0,39,40,4 \n\t"
"xxsel 1,38,32,4 \n\t" "xxsel 1,38,32,4 \n\t"
"stxsdx 0,0,%[ptr_maxf] \n\t" "stxsdx 0,0,%[ptr_maxf] \n\t"
"b 4f \n\t" "b four%= \n\t"
"3: \n\t" "three%=: \n\t"
//if elements value are equal then choose minimum index //if elements value are equal then choose minimum index
"xxspltd 0,40,0 \n\t" XXSPLTD_S(0,40,0)
"vminud 0,0,6 \n\t" //vs32 vs38 "vminud 0,0,6 \n\t" //vs32 vs38
"xxlor 1,32,32 \n\t" "xxlor 1,32,32 \n\t"
"stxsdx 0,0,%[ptr_maxf] \n\t" "stxsdx 0,0,%[ptr_maxf] \n\t"
"4: \n\t" "four%=: \n\t"
"mfvsrd %[index],1 \n\t" "mfvsrd %[index],1 \n\t"
: [maxf] "=m"(*maxf),[ptr_tmp] "+&b"(x),[index] "=r"(index), [n] "+&r"(n) : [maxf] "=m"(*maxf),[ptr_tmp] "+&b"(x),[index] "=r"(index), [n] "+&r"(n)
@ -292,7 +292,7 @@ static BLASLONG ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) {
[i64] "b"(64), [i80] "b"(80), [i96] "b"(96), [i112] "b"(112), [i64] "b"(64), [i80] "b"(80), [i96] "b"(96), [i112] "b"(112),
[start] "v"(start), [adder] "v"(temp_add_index) [start] "v"(start), [adder] "v"(temp_add_index)
: "cc", "vs0", "vs1","vs2","vs3", "vs4","vs5","vs32", "vs33", "vs34", "vs35", "vs36", : "cc", "vs0", "vs1","vs2","vs3", "vs4","vs5","vs32", "vs33", "vs34", "vs35", "vs36",
"vs37", "vs38", "vs39", "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49", "vs50", "vs51" "vs37", "vs38", "vs39", "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49", "vs6", "vs7"
); );
return index; return index;

@ -54,8 +54,8 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) {
"lxvd2x 47, %[i48],%[ptr_tmp] \n\t" "lxvd2x 47, %[i48],%[ptr_tmp] \n\t"
"lxvd2x 48, %[i64],%[ptr_tmp] \n\t" "lxvd2x 48, %[i64],%[ptr_tmp] \n\t"
"lxvd2x 49, %[i80],%[ptr_tmp] \n\t" "lxvd2x 49, %[i80],%[ptr_tmp] \n\t"
"lxvd2x 50, %[i96],%[ptr_tmp] \n\t" "lxvd2x 6, %[i96],%[ptr_tmp] \n\t"
"lxvd2x 51,%[i112],%[ptr_tmp] \n\t" "lxvd2x 7,%[i112],%[ptr_tmp] \n\t"
"xxlor 40,%x[start],%x[start] \n\t" //{ 1,0} vs40 | v8 "xxlor 40,%x[start],%x[start] \n\t" //{ 1,0} vs40 | v8
"vaddudm 9,8,%[adder] \n\t" //{3,2} vs41 "vaddudm 9,8,%[adder] \n\t" //{3,2} vs41
@ -65,7 +65,7 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) {
"vaddudm 11,10,%[adder] \n\t" //{7,6} vs43 "vaddudm 11,10,%[adder] \n\t" //{7,6} vs43
"lxvdsx 39,0,%[ptr_minf] \n\t" // vs39 vec_min_value "lxvdsx 39,0,%[ptr_minf] \n\t" // vs39 vec_min_value
"vaddudm 4,11, %[adder] \n\t" // {9,8} -{8;8} vs36 | v4 "vaddudm 4,11, %[adder] \n\t" // {9,8} -{8;8} vs36 | v4
"xxspltd 36,36,0 \n\t" XXSPLTD_S(36,36,0)
@ -75,24 +75,24 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) {
"xvabsdp 47, 47 \n\t" "xvabsdp 47, 47 \n\t"
"xvabsdp 48, 48 \n\t" "xvabsdp 48, 48 \n\t"
"xvabsdp 49, 49 \n\t" "xvabsdp 49, 49 \n\t"
"xvabsdp 50, 50 \n\t" "xvabsdp 6, 6 \n\t"
"xvabsdp 51, 51 \n\t" "xvabsdp 7, 7 \n\t"
//jump first half forward //jump first half forward
"b 2f \n\t" "b two%= \n\t"
".p2align 5 \n\t" ".align 5 \n\t"
"1: \n\t" "one%=: \n\t"
"xxmrghd 0,44,45 \n\t" XXMRGHD_S(0,44,45)
"xxmrgld 1,44,45 \n\t" XXMRGLD_S(1,44,45)
"xxmrghd 2,46,47 \n\t" XXMRGHD_S(2,46,47)
"xxmrgld 3,46,47 \n\t" XXMRGLD_S(3,46,47)
"xxmrghd 4,48,49 \n\t" XXMRGHD_S(4,48,49)
"xxmrgld 5,48,49 \n\t" XXMRGLD_S(5,48,49)
"xxmrghd 44,50,51 \n\t" XXMRGHD_S(44,6,7)
"xxmrgld 45,50,51 \n\t" XXMRGLD_S(45,6,7)
"xvadddp 46, 0,1 \n\t" "xvadddp 46, 0,1 \n\t"
"xvadddp 47, 2,3 \n\t" "xvadddp 47, 2,3 \n\t"
@ -101,15 +101,15 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) {
"xvcmpgtdp 50,46,47 \n\t " "xvcmpgtdp 6,46,47 \n\t "
"xvcmpgtdp 51,48,49 \n\t " "xvcmpgtdp 7,48,49 \n\t "
"addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t" "addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t"
"xxsel 32,40,41,50 \n\t" "xxsel 32,40,41,6 \n\t"
"xxsel 0,46,47,50 \n\t" "xxsel 0,46,47,6 \n\t"
"xxsel 33,42,43,51 \n\t" "xxsel 33,42,43,7 \n\t"
"xxsel 1,48,49,51 \n\t" "xxsel 1,48,49,7 \n\t"
"lxvd2x 44, 0,%[ptr_tmp] \n\t" "lxvd2x 44, 0,%[ptr_tmp] \n\t"
"lxvd2x 45, %[i16],%[ptr_tmp] \n\t" "lxvd2x 45, %[i16],%[ptr_tmp] \n\t"
@ -131,8 +131,8 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) {
"lxvd2x 48, %[i64],%[ptr_tmp] \n\t" "lxvd2x 48, %[i64],%[ptr_tmp] \n\t"
"lxvd2x 49, %[i80],%[ptr_tmp] \n\t" "lxvd2x 49, %[i80],%[ptr_tmp] \n\t"
"lxvd2x 50, %[i96],%[ptr_tmp] \n\t" "lxvd2x 6, %[i96],%[ptr_tmp] \n\t"
"lxvd2x 51,%[i112],%[ptr_tmp] \n\t" "lxvd2x 7,%[i112],%[ptr_tmp] \n\t"
//select with previous //select with previous
"xxsel 38,38,32,4 \n\t" "xxsel 38,38,32,4 \n\t"
"xxsel 39,39,3,4 \n\t" "xxsel 39,39,3,4 \n\t"
@ -146,35 +146,35 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) {
"xvabsdp 47, 47 \n\t" "xvabsdp 47, 47 \n\t"
"xvabsdp 48, 48 \n\t" "xvabsdp 48, 48 \n\t"
"xvabsdp 49, 49 \n\t" "xvabsdp 49, 49 \n\t"
"xvabsdp 50, 50 \n\t" "xvabsdp 6, 6 \n\t"
"xvabsdp 51, 51 \n\t" "xvabsdp 7, 7 \n\t"
//>>/////////////////////////////// half start //>>/////////////////////////////// half start
"2: \n\t" "two%=: \n\t"
"xxmrghd 0,44,45 \n\t" XXMRGHD_S(0,44,45)
"xxmrgld 1,44,45 \n\t" XXMRGLD_S(1,44,45)
"xxmrghd 2,46,47 \n\t" XXMRGHD_S(2,46,47)
"xxmrgld 3,46,47 \n\t" XXMRGLD_S(3,46,47)
"xxmrghd 4,48,49 \n\t" XXMRGHD_S(4,48,49)
"xxmrgld 5,48,49 \n\t" XXMRGLD_S(5,48,49)
"xxmrghd 44,50,51 \n\t" XXMRGHD_S(44,6,7)
"xxmrgld 45,50,51 \n\t" XXMRGLD_S(45,6,7)
"xvadddp 46, 0,1 \n\t" "xvadddp 46, 0,1 \n\t"
"xvadddp 47, 2,3 \n\t" "xvadddp 47, 2,3 \n\t"
"xvadddp 48, 4,5 \n\t" "xvadddp 48, 4,5 \n\t"
"xvadddp 49, 44,45 \n\t" "xvadddp 49, 44,45 \n\t"
"xvcmpgtdp 50,46,47 \n\t " "xvcmpgtdp 6,46,47 \n\t "
"xvcmpgtdp 51,48,49 \n\t " "xvcmpgtdp 7,48,49 \n\t "
"addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t" "addi %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t"
"xxsel 32,40,41,50 \n\t" "xxsel 32,40,41,6 \n\t"
"xxsel 0,46,47,50 \n\t" "xxsel 0,46,47,6 \n\t"
"xxsel 33,42,43,51 \n\t" "xxsel 33,42,43,7 \n\t"
"xxsel 1,48,49,51 \n\t" "xxsel 1,48,49,7 \n\t"
"lxvd2x 44, 0,%[ptr_tmp] \n\t" "lxvd2x 44, 0,%[ptr_tmp] \n\t"
"lxvd2x 45, %[i16],%[ptr_tmp] \n\t" "lxvd2x 45, %[i16],%[ptr_tmp] \n\t"
@ -196,8 +196,8 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) {
"lxvd2x 48, %[i64],%[ptr_tmp] \n\t" "lxvd2x 48, %[i64],%[ptr_tmp] \n\t"
"lxvd2x 49, %[i80],%[ptr_tmp] \n\t" "lxvd2x 49, %[i80],%[ptr_tmp] \n\t"
"lxvd2x 50, %[i96],%[ptr_tmp] \n\t" "lxvd2x 6, %[i96],%[ptr_tmp] \n\t"
"lxvd2x 51,%[i112],%[ptr_tmp] \n\t" "lxvd2x 7,%[i112],%[ptr_tmp] \n\t"
//select with previous //select with previous
"xxsel 38,38,32,4 \n\t" "xxsel 38,38,32,4 \n\t"
"xxsel 39,39,3,4 \n\t" "xxsel 39,39,3,4 \n\t"
@ -209,24 +209,24 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) {
"xvabsdp 47, 47 \n\t" "xvabsdp 47, 47 \n\t"
"xvabsdp 48, 48 \n\t" "xvabsdp 48, 48 \n\t"
"xvabsdp 49, 49 \n\t" "xvabsdp 49, 49 \n\t"
"xvabsdp 50, 50 \n\t" "xvabsdp 6, 6 \n\t"
"xvabsdp 51, 51 \n\t" "xvabsdp 7, 7 \n\t"
//decrement n //decrement n
"addic. %[n], %[n], -16 \n\t" "addic. %[n], %[n], -16 \n\t"
//Loop back if >0 //Loop back if >0
"bgt+ 1b \n\t" "bgt+ one%= \n\t"
"xxmrghd 0,44,45 \n\t" XXMRGHD_S(0,44,45)
"xxmrgld 1,44,45 \n\t" XXMRGLD_S(1,44,45)
"xxmrghd 2,46,47 \n\t" XXMRGHD_S(2,46,47)
"xxmrgld 3,46,47 \n\t" XXMRGLD_S(3,46,47)
"xxmrghd 4,48,49 \n\t" XXMRGHD_S(4,48,49)
"xxmrgld 5,48,49 \n\t" XXMRGLD_S(5,48,49)
"xxmrghd 44,50,51 \n\t" XXMRGHD_S(44,6,7)
"xxmrgld 45,50,51 \n\t" XXMRGLD_S(45,6,7)
"xvadddp 46, 0,1 \n\t" "xvadddp 46, 0,1 \n\t"
"xvadddp 47, 2,3 \n\t" "xvadddp 47, 2,3 \n\t"
@ -235,13 +235,13 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) {
"xvcmpgtdp 50,46,47 \n\t " "xvcmpgtdp 6,46,47 \n\t "
"xvcmpgtdp 51,48,49 \n\t " "xvcmpgtdp 7,48,49 \n\t "
"xxsel 32,40,41,50 \n\t" "xxsel 32,40,41,6 \n\t"
"xxsel 0,46,47,50 \n\t" "xxsel 0,46,47,6 \n\t"
"xxsel 33,42,43,51 \n\t" "xxsel 33,42,43,7 \n\t"
"xxsel 1,48,49,51 \n\t" "xxsel 1,48,49,7 \n\t"
"xvcmpgtdp 2,0,1 \n\t " "xvcmpgtdp 2,0,1 \n\t "
"xxsel 32,32,33,2 \n\t" "xxsel 32,32,33,2 \n\t"
@ -260,28 +260,28 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) {
///////extract min value and min index from vector ///////extract min value and min index from vector
"xxspltd 32,38,1 \n\t" XXSPLTD_S(32,38,1)
"xxspltd 40,39,1 \n\t" XXSPLTD_S(40,39,1)
"xvcmpeqdp. 2, 40,39 \n\t" "xvcmpeqdp. 2, 40,39 \n\t"
//cr6 0 bit set if all true, cr6=4*6+bit_ind=24,0011at CR(BI)==1, at=10 hint that it occurs rarely //cr6 0 bit set if all true, cr6=4*6+bit_ind=24,0011at CR(BI)==1, at=10 hint that it occurs rarely
//0b001110=14 //0b001110=14
"bc 14,24, 3f \n\t" "bc 14,24, three%= \n\t"
"xvcmpgtdp 4,39, 40 \n\t" "xvcmpgtdp 4,39, 40 \n\t"
"xxsel 0,39,40,4 \n\t" "xxsel 0,39,40,4 \n\t"
"xxsel 1,38,32,4 \n\t" "xxsel 1,38,32,4 \n\t"
"stxsdx 0,0,%[ptr_minf] \n\t" "stxsdx 0,0,%[ptr_minf] \n\t"
"b 4f \n\t" "b four%= \n\t"
"3: \n\t" "three%=: \n\t"
//if elements value are equal then choose minimum index //if elements value are equal then choose minimum index
"xxspltd 0,40,0 \n\t" XXSPLTD_S(0,40,0)
"vminud 0,0,6 \n\t" //vs32 vs38 "vminud 0,0,6 \n\t" //vs32 vs38
"xxlor 1,32,32 \n\t" "xxlor 1,32,32 \n\t"
"stxsdx 0,0,%[ptr_minf] \n\t" "stxsdx 0,0,%[ptr_minf] \n\t"
"4: \n\t" "four%=: \n\t"
"mfvsrd %[index],1 \n\t" "mfvsrd %[index],1 \n\t"
: [minf] "=m"(*minf),[ptr_tmp] "+&b"(x),[index] "=r"(index), [n] "+&r"(n) : [minf] "=m"(*minf),[ptr_tmp] "+&b"(x),[index] "=r"(index), [n] "+&r"(n)
@ -290,7 +290,7 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) {
[i64] "b"(64), [i80] "b"(80), [i96] "b"(96), [i112] "b"(112), [i64] "b"(64), [i80] "b"(80), [i96] "b"(96), [i112] "b"(112),
[start] "v"(start), [adder] "v"(temp_add_index) [start] "v"(start), [adder] "v"(temp_add_index)
: "cc", "vs0", "vs1","vs2","vs3", "vs4","vs5","vs32", "vs33", "vs34", "vs35", "vs36", : "cc", "vs0", "vs1","vs2","vs3", "vs4","vs5","vs32", "vs33", "vs34", "vs35", "vs36",
"vs37", "vs38", "vs39", "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49", "vs50", "vs51" "vs37", "vs38", "vs39", "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49", "vs6", "vs7"
); );
return index; return index;

@ -46,10 +46,10 @@ static void __inline blas_lock(volatile BLASULONG *address){
" .machine \"any\" ;" " .machine \"any\" ;"
"0: lwarx %0,0, %1 ;" "0: lwarx %0,0, %1 ;"
" cmpwi 0,%0,0;" " cmpwi 0,%0,0;"
" bne 1f;" " bne one%=;"
" stwcx. %2,0, %1 ;" " stwcx. %2,0, %1 ;"
" bne- 0b;" " bne- 0b;"
"1: " "one%=: "
: "=&r"(ret) : "=&r"(ret)
: "r"(address), "r" (val) : "r"(address), "r" (val)
: "cr0", "memory"); : "cr0", "memory");

@ -68,10 +68,10 @@ static float sasum_kernel_32 (long n, float *x)
"addi %2, %2, 128 \n\t" "addi %2, %2, 128 \n\t"
"addic. %1, %1, -32 \n\t" "addic. %1, %1, -32 \n\t"
"ble 2f \n\t" "ble two%= \n\t"
".p2align 5 \n" ".align 5 \n"
"1: \n\t" "one%=: \n\t"
"xvabssp 48, 40 \n\t" "xvabssp 48, 40 \n\t"
"xvabssp 49, 41 \n\t" "xvabssp 49, 41 \n\t"
@ -108,9 +108,9 @@ static float sasum_kernel_32 (long n, float *x)
"xvaddsp 38, 38, %x5 \n\t" "xvaddsp 38, 38, %x5 \n\t"
"xvaddsp 39, 39, %x6 \n\t" "xvaddsp 39, 39, %x6 \n\t"
"bgt 1b \n" "bgt one%= \n"
"2: \n\t" "two%=: \n\t"
"xvabssp 48, 40 \n\t" "xvabssp 48, 40 \n\t"
"xvabssp 49, 41 \n\t" "xvabssp 49, 41 \n\t"

@ -51,10 +51,10 @@ static void scopy_kernel_32 (long n, float *x, float *y)
"addi %2, %2, 128 \n\t" "addi %2, %2, 128 \n\t"
"addic. %1, %1, -32 \n\t" "addic. %1, %1, -32 \n\t"
"ble 2f \n\t" "ble two%= \n\t"
".p2align 5 \n" ".align 5 \n"
"1: \n\t" "one%=: \n\t"
"stxvd2x 40, 0, %3 \n\t" "stxvd2x 40, 0, %3 \n\t"
"stxvd2x 41, %5, %3 \n\t" "stxvd2x 41, %5, %3 \n\t"
@ -77,9 +77,9 @@ static void scopy_kernel_32 (long n, float *x, float *y)
"addi %2, %2, 128 \n\t" "addi %2, %2, 128 \n\t"
"addic. %1, %1, -32 \n\t" "addic. %1, %1, -32 \n\t"
"bgt 1b \n" "bgt one%= \n"
"2: \n\t" "two%=: \n\t"
"stxvd2x 40, 0, %3 \n\t" "stxvd2x 40, 0, %3 \n\t"
"stxvd2x 41, %5, %3 \n\t" "stxvd2x 41, %5, %3 \n\t"

@ -78,10 +78,10 @@ static float sdot_kernel_16 (long n, float *x, float *y)
"addi %3, %3, 128 \n\t" "addi %3, %3, 128 \n\t"
"addic. %1, %1, -32 \n\t" "addic. %1, %1, -32 \n\t"
"ble 2f \n\t" "ble two%= \n\t"
".p2align 5 \n" ".align 5 \n"
"1: \n\t" "one%=: \n\t"
"xvmaddasp 32, 40, 48 \n\t" "xvmaddasp 32, 40, 48 \n\t"
"lxvd2x 40, 0, %2 \n\t" "lxvd2x 40, 0, %2 \n\t"
@ -112,9 +112,9 @@ static float sdot_kernel_16 (long n, float *x, float *y)
"addi %3, %3, 128 \n\t" "addi %3, %3, 128 \n\t"
"addic. %1, %1, -32 \n\t" "addic. %1, %1, -32 \n\t"
"bgt 1b \n" "bgt one%= \n"
"2: \n\t" "two%=: \n\t"
"xvmaddasp 32, 40, 48 \n\t" "xvmaddasp 32, 40, 48 \n\t"
"xvmaddasp 33, 41, 49 \n\t" "xvmaddasp 33, 41, 49 \n\t"

File diff suppressed because it is too large
@ -38,7 +38,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
* Macros for N=4 and M=16 * Macros for N=4 and M=16
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_4x16', `
#else
.macro COPY_4x16 .macro COPY_4x16
#endif
lxvw4x vs32, o0, A0 lxvw4x vs32, o0, A0
lxvw4x vs33, o16, A0 lxvw4x vs33, o16, A0
@ -88,13 +92,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvw4x vs46, o32, T1 stxvw4x vs46, o32, T1
stxvw4x vs47, o48, T1 stxvw4x vs47, o48, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=4 and M=8 * Macros for N=4 and M=8
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_4x8', `
#else
.macro COPY_4x8 .macro COPY_4x8
#endif
lxvw4x vs32, o0, A0 lxvw4x vs32, o0, A0
lxvw4x vs33, o16, A0 lxvw4x vs33, o16, A0
@ -124,13 +136,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvw4x vs38, o32, T1 stxvw4x vs38, o32, T1
stxvw4x vs39, o48, T1 stxvw4x vs39, o48, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=4 and M=4 * Macros for N=4 and M=4
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_4x4', `
#else
.macro COPY_4x4 .macro COPY_4x4
#endif
lxvw4x vs32, o0, A0 lxvw4x vs32, o0, A0
@ -150,13 +170,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvw4x vs35, o48, T1 stxvw4x vs35, o48, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=4 and M=2 * Macros for N=4 and M=2
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_4x2', `
#else
.macro COPY_4x2 .macro COPY_4x2
#endif
lxsspx vs32, o0, A0 lxsspx vs32, o0, A0
lxsspx vs33, o4, A0 lxsspx vs33, o4, A0
@ -190,13 +218,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxsspx vs38, o0, T1 stxsspx vs38, o0, T1
stxsspx vs39, o4, T1 stxsspx vs39, o4, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=4 and M=1 * Macros for N=4 and M=1
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_4x1', `
#else
.macro COPY_4x1 .macro COPY_4x1
#endif
lxsspx vs32, o0, A0 lxsspx vs32, o0, A0
@ -218,13 +254,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxsspx vs35, o4, T1 stxsspx vs35, o4, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=2 and M=16 * Macros for N=2 and M=16
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_2x16', `
#else
.macro COPY_2x16 .macro COPY_2x16
#endif
lxvw4x vs32, o0, A0 lxvw4x vs32, o0, A0
lxvw4x vs33, o16, A0 lxvw4x vs33, o16, A0
@ -250,13 +294,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvw4x vs38, o32, T1 stxvw4x vs38, o32, T1
stxvw4x vs39, o48, T1 stxvw4x vs39, o48, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=2 and M=8 * Macros for N=2 and M=8
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_2x8', `
#else
.macro COPY_2x8 .macro COPY_2x8
#endif
lxvw4x vs32, o0, A0 lxvw4x vs32, o0, A0
lxvw4x vs33, o16, A0 lxvw4x vs33, o16, A0
@ -272,13 +324,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvw4x vs34, o32, T1 stxvw4x vs34, o32, T1
stxvw4x vs35, o48, T1 stxvw4x vs35, o48, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=2 and M=4 * Macros for N=2 and M=4
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_2x4', `
#else
.macro COPY_2x4 .macro COPY_2x4
#endif
lxvw4x vs32, o0, A0 lxvw4x vs32, o0, A0
@ -290,13 +350,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvw4x vs33, o16, T1 stxvw4x vs33, o16, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=2 and M=2 * Macros for N=2 and M=2
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_2x2', `
#else
.macro COPY_2x2 .macro COPY_2x2
#endif
lxsspx vs32, o0, A0 lxsspx vs32, o0, A0
lxsspx vs33, o4, A0 lxsspx vs33, o4, A0
@ -314,13 +382,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxsspx vs34, o0, T1 stxsspx vs34, o0, T1
stxsspx vs35, o4, T1 stxsspx vs35, o4, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=2 and M=1 * Macros for N=2 and M=1
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_2x1', `
#else
.macro COPY_2x1 .macro COPY_2x1
#endif
lxsspx vs32, o0, A0 lxsspx vs32, o0, A0
@ -332,13 +408,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxsspx vs33, o4, T1 stxsspx vs33, o4, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=1 and M=16 * Macros for N=1 and M=16
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_1x16', `
#else
.macro COPY_1x16 .macro COPY_1x16
#endif
lxvw4x vs32, o0, A0 lxvw4x vs32, o0, A0
lxvw4x vs33, o16, A0 lxvw4x vs33, o16, A0
@ -352,13 +436,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvw4x vs34, o32, T1 stxvw4x vs34, o32, T1
stxvw4x vs35, o48, T1 stxvw4x vs35, o48, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=1 and M=8 * Macros for N=1 and M=8
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_1x8', `
#else
.macro COPY_1x8 .macro COPY_1x8
#endif
lxvw4x vs32, o0, A0 lxvw4x vs32, o0, A0
lxvw4x vs33, o16, A0 lxvw4x vs33, o16, A0
@ -368,13 +460,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvw4x vs32, o0, T1 stxvw4x vs32, o0, T1
stxvw4x vs33, o16, T1 stxvw4x vs33, o16, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=1 and M=4 * Macros for N=1 and M=4
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_1x4', `
#else
.macro COPY_1x4 .macro COPY_1x4
#endif
lxvw4x vs32, o0, A0 lxvw4x vs32, o0, A0
@ -382,13 +482,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvw4x vs32, o0, T1 stxvw4x vs32, o0, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=1 and M=2 * Macros for N=1 and M=2
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_1x2', `
#else
.macro COPY_1x2 .macro COPY_1x2
#endif
lxsspx vs32, o0, A0 lxsspx vs32, o0, A0
lxsspx vs33, o4, A0 lxsspx vs33, o4, A0
@ -398,13 +506,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxsspx vs32, o0, T1 stxsspx vs32, o0, T1
stxsspx vs33, o4, T1 stxsspx vs33, o4, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=1 and M=1 * Macros for N=1 and M=1
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_1x1', `
#else
.macro COPY_1x1 .macro COPY_1x1
#endif
lxsspx vs32, o0, A0 lxsspx vs32, o0, A0
@ -412,5 +528,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxsspx vs32, o0, T1 stxsspx vs32, o0, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif

@ -38,7 +38,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
* Macros for N=4 and M=8 * Macros for N=4 and M=8
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_4x8', `
#else
.macro COPY_4x8 .macro COPY_4x8
#endif
lxvw4x vs32, o0, A0 lxvw4x vs32, o0, A0
lxvw4x vs33, o16, A0 lxvw4x vs33, o16, A0
@ -68,13 +72,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvw4x vs38, o32, T1 stxvw4x vs38, o32, T1
stxvw4x vs39, o48, T1 stxvw4x vs39, o48, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=4 and M=4 * Macros for N=4 and M=4
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_4x4', `
#else
.macro COPY_4x4 .macro COPY_4x4
#endif
lxvw4x vs32, o0, A0 lxvw4x vs32, o0, A0
@ -94,13 +106,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvw4x vs35, o48, T1 stxvw4x vs35, o48, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=4 and M=2 * Macros for N=4 and M=2
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_4x2', `
#else
.macro COPY_4x2 .macro COPY_4x2
#endif
lxsspx vs32, o0, A0 lxsspx vs32, o0, A0
lxsspx vs33, o4, A0 lxsspx vs33, o4, A0
@ -134,13 +154,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxsspx vs38, o0, T1 stxsspx vs38, o0, T1
stxsspx vs39, o4, T1 stxsspx vs39, o4, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=4 and M=1 * Macros for N=4 and M=1
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_4x1', `
#else
.macro COPY_4x1 .macro COPY_4x1
#endif
lxsspx vs32, o0, A0 lxsspx vs32, o0, A0
@ -162,13 +190,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxsspx vs35, o4, T1 stxsspx vs35, o4, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=2 and M=8 * Macros for N=2 and M=8
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_2x8', `
#else
.macro COPY_2x8 .macro COPY_2x8
#endif
lxvw4x vs32, o0, A0 lxvw4x vs32, o0, A0
lxvw4x vs33, o16, A0 lxvw4x vs33, o16, A0
@ -184,13 +220,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvw4x vs34, o32, T1 stxvw4x vs34, o32, T1
stxvw4x vs35, o48, T1 stxvw4x vs35, o48, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=2 and M=4 * Macros for N=2 and M=4
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_2x4', `
#else
.macro COPY_2x4 .macro COPY_2x4
#endif
lxvw4x vs32, o0, A0 lxvw4x vs32, o0, A0
@ -202,13 +246,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvw4x vs33, o16, T1 stxvw4x vs33, o16, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=2 and M=2 * Macros for N=2 and M=2
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_2x2', `
#else
.macro COPY_2x2 .macro COPY_2x2
#endif
lxsspx vs32, o0, A0 lxsspx vs32, o0, A0
lxsspx vs33, o4, A0 lxsspx vs33, o4, A0
@ -226,13 +278,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxsspx vs34, o0, T1 stxsspx vs34, o0, T1
stxsspx vs35, o4, T1 stxsspx vs35, o4, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=2 and M=1 * Macros for N=2 and M=1
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_2x1', `
#else
.macro COPY_2x1 .macro COPY_2x1
#endif
lxsspx vs32, o0, A0 lxsspx vs32, o0, A0
@ -244,13 +304,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxsspx vs33, o4, T1 stxsspx vs33, o4, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=1 and M=8 * Macros for N=1 and M=8
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_1x8', `
#else
.macro COPY_1x8 .macro COPY_1x8
#endif
lxvw4x vs32, o0, A0 lxvw4x vs32, o0, A0
lxvw4x vs33, o16, A0 lxvw4x vs33, o16, A0
@ -260,13 +328,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvw4x vs32, o0, T1 stxvw4x vs32, o0, T1
stxvw4x vs33, o16, T1 stxvw4x vs33, o16, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=1 and M=4 * Macros for N=1 and M=4
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_1x4', `
#else
.macro COPY_1x4 .macro COPY_1x4
#endif
lxvw4x vs32, o0, A0 lxvw4x vs32, o0, A0
@ -274,13 +350,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvw4x vs32, o0, T1 stxvw4x vs32, o0, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=1 and M=2 * Macros for N=1 and M=2
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_1x2', `
#else
.macro COPY_1x2 .macro COPY_1x2
#endif
lxsspx vs32, o0, A0 lxsspx vs32, o0, A0
lxsspx vs33, o4, A0 lxsspx vs33, o4, A0
@ -290,13 +374,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxsspx vs32, o0, T1 stxsspx vs32, o0, T1
stxsspx vs33, o4, T1 stxsspx vs33, o4, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=1 and M=1 * Macros for N=1 and M=1
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_1x1', `
#else
.macro COPY_1x1 .macro COPY_1x1
#endif
lxsspx vs32, o0, A0 lxsspx vs32, o0, A0
@ -304,5 +396,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxsspx vs32, o0, T1 stxsspx vs32, o0, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
View File
@ -71,10 +71,10 @@ static void srot_kernel_16 (long n, float *x, float *y, float c, float s)
"addi %4, %4, 64 \n\t" "addi %4, %4, 64 \n\t"
"addic. %2, %2, -16 \n\t" "addic. %2, %2, -16 \n\t"
"ble 2f \n\t" "ble two%= \n\t"
".p2align 5 \n" ".align 5 \n"
"1: \n\t" "one%=: \n\t"
"xvmulsp 40, 32, 36 \n\t" // c * x "xvmulsp 40, 32, 36 \n\t" // c * x
"xvmulsp 41, 33, 36 \n\t" "xvmulsp 41, 33, 36 \n\t"
@ -138,9 +138,9 @@ static void srot_kernel_16 (long n, float *x, float *y, float c, float s)
"addi %4, %4, 128 \n\t" "addi %4, %4, 128 \n\t"
"addic. %2, %2, -16 \n\t" "addic. %2, %2, -16 \n\t"
"bgt 1b \n" "bgt one%= \n"
"2: \n\t" "two%=: \n\t"
"xvmulsp 40, 32, 36 \n\t" // c * x "xvmulsp 40, 32, 36 \n\t" // c * x
"xvmulsp 41, 33, 36 \n\t" "xvmulsp 41, 33, 36 \n\t"
View File
@ -56,10 +56,10 @@ static void sscal_kernel_16 (long n, float *x, float alpha)
"addi %2, %2, 128 \n\t" "addi %2, %2, 128 \n\t"
"addic. %1, %1, -32 \n\t" "addic. %1, %1, -32 \n\t"
"ble 2f \n\t" "ble two%= \n\t"
".p2align 5 \n" ".align 5 \n"
"1: \n\t" "one%=: \n\t"
"xvmulsp 40, 32, %x3 \n\t" "xvmulsp 40, 32, %x3 \n\t"
"xvmulsp 41, 33, %x3 \n\t" "xvmulsp 41, 33, %x3 \n\t"
@ -92,9 +92,9 @@ static void sscal_kernel_16 (long n, float *x, float alpha)
"addi %2, %2, 256 \n\t" "addi %2, %2, 256 \n\t"
"addic. %1, %1, -32 \n\t" "addic. %1, %1, -32 \n\t"
"bgt 1b \n" "bgt one%= \n"
"2: \n\t" "two%=: \n\t"
"xvmulsp 40, 32, %x3 \n\t" "xvmulsp 40, 32, %x3 \n\t"
"xvmulsp 41, 33, %x3 \n\t" "xvmulsp 41, 33, %x3 \n\t"
@ -147,8 +147,8 @@ static void sscal_kernel_16_zero (long n, float *x)
( (
"xxlxor %x3, %x3, %x3 \n\t" "xxlxor %x3, %x3, %x3 \n\t"
".p2align 5 \n" ".align 5 \n"
"1: \n\t" "one%=: \n\t"
"stxvd2x %x3, 0, %2 \n\t" "stxvd2x %x3, 0, %2 \n\t"
"stxvd2x %x3, %4, %2 \n\t" "stxvd2x %x3, %4, %2 \n\t"
@ -162,7 +162,7 @@ static void sscal_kernel_16_zero (long n, float *x)
"addi %2, %2, 128 \n\t" "addi %2, %2, 128 \n\t"
"addic. %1, %1, -32 \n\t" "addic. %1, %1, -32 \n\t"
"bgt 1b \n" "bgt one%= \n"
"#n=%1 x=%0=%2 t0=%x3 o16=%4 o32=%5 o48=%6 o64=%7 o80=%8 o96=%9 o112=%10" "#n=%1 x=%0=%2 t0=%x3 o16=%4 o32=%5 o48=%6 o64=%7 o80=%8 o96=%9 o112=%10"
: :
View File
@ -39,8 +39,8 @@ static void sswap_kernel_32 (long n, float *x, float *y)
{ {
__asm__ __asm__
( (
".p2align 5 \n" ".align 5 \n"
"1: \n\t" "one%=: \n\t"
"lxvd2x 32, 0, %4 \n\t" "lxvd2x 32, 0, %4 \n\t"
"lxvd2x 33, %5, %4 \n\t" "lxvd2x 33, %5, %4 \n\t"
@ -83,7 +83,7 @@ static void sswap_kernel_32 (long n, float *x, float *y)
"addi %4, %4, 128 \n\t" "addi %4, %4, 128 \n\t"
"addic. %2, %2, -32 \n\t" "addic. %2, %2, -32 \n\t"
"bgt 1b \n" "bgt one%= \n"
"#n=%2 x=%0=%3 y=%1=%4 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11" "#n=%2 x=%0=%3 y=%1=%4 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11"
: :
File diff suppressed because it is too large

View File
@ -68,10 +68,10 @@ static double zasum_kernel_8 (long n, double *x)
"addi %2, %2, 128 \n\t" "addi %2, %2, 128 \n\t"
"addic. %1, %1, -8 \n\t" "addic. %1, %1, -8 \n\t"
"ble 2f \n\t" "ble two%= \n\t"
".p2align 5 \n" ".align 5 \n"
"1: \n\t" "one%=: \n\t"
"xvabsdp 48, 40 \n\t" "xvabsdp 48, 40 \n\t"
"xvabsdp 49, 41 \n\t" "xvabsdp 49, 41 \n\t"
@ -108,9 +108,9 @@ static double zasum_kernel_8 (long n, double *x)
"xvadddp 38, 38, %x5 \n\t" "xvadddp 38, 38, %x5 \n\t"
"xvadddp 39, 39, %x6 \n\t" "xvadddp 39, 39, %x6 \n\t"
"bgt 1b \n" "bgt one%= \n"
"2: \n\t" "two%=: \n\t"
"xvabsdp 48, 40 \n\t" "xvabsdp 48, 40 \n\t"
"xvabsdp 49, 41 \n\t" "xvabsdp 49, 41 \n\t"
@ -140,7 +140,7 @@ static double zasum_kernel_8 (long n, double *x)
"xvadddp 32, 32, 36 \n\t" "xvadddp 32, 32, 36 \n\t"
"xxswapd 33, 32 \n\t" XXSWAPD_S(33,32)
"xsadddp %x0, 32, 33 \n" "xsadddp %x0, 32, 33 \n"
"#n=%1 x=%3=%2 sum=%0 o16=%8 o32=%9 o48=%10 o64=%11 o80=%12 o96=%13 o112=%14\n" "#n=%1 x=%3=%2 sum=%0 o16=%8 o32=%9 o48=%10 o64=%11 o80=%12 o96=%13 o112=%14\n"
View File
@ -61,8 +61,8 @@ static void zaxpy_kernel_4 (long n, double *x, double *y,
__asm__ __asm__
( (
"xxspltd 32, %x19, 0 \n\t" // alpha_r XXSPLTD_S(32,%x19,0) // alpha_r
"xxspltd 33, %x20, 0 \n\t" // alpha_i XXSPLTD_S(33,%x20,0) // alpha_i
"lxvd2x 36, 0, %21 \n\t" // mvec "lxvd2x 36, 0, %21 \n\t" // mvec
@ -87,10 +87,10 @@ static void zaxpy_kernel_4 (long n, double *x, double *y,
"lxvd2x 50, %23, %3 \n\t" // y2 "lxvd2x 50, %23, %3 \n\t" // y2
"lxvd2x 51, %24, %3 \n\t" // y3 "lxvd2x 51, %24, %3 \n\t" // y3
"xxswapd %x8, 40 \n\t" // exchange real and imag part XXSWAPD_S(%x8,40) // exchange real and imag part
"xxswapd %x9, 41 \n\t" // exchange real and imag part XXSWAPD_S(%x9,41) // exchange real and imag part
"xxswapd %x10, 42 \n\t" // exchange real and imag part XXSWAPD_S(%x10,42) // exchange real and imag part
"xxswapd %x11, 43 \n\t" // exchange real and imag part XXSWAPD_S(%x11,43) // exchange real and imag part
"addi %2, %2, 64 \n\t" "addi %2, %2, 64 \n\t"
"addi %3, %3, 64 \n\t" "addi %3, %3, 64 \n\t"
@ -105,19 +105,19 @@ static void zaxpy_kernel_4 (long n, double *x, double *y,
"lxvd2x %x6, %23, %3 \n\t" // y6 "lxvd2x %x6, %23, %3 \n\t" // y6
"lxvd2x %x7, %24, %3 \n\t" // y7 "lxvd2x %x7, %24, %3 \n\t" // y7
"xxswapd %x12, 44 \n\t" // exchange real and imag part XXSWAPD_S(%x12,44) // exchange real and imag part
"xxswapd %x13, 45 \n\t" // exchange real and imag part XXSWAPD_S(%x13,45) // exchange real and imag part
"xxswapd %x14, 46 \n\t" // exchange real and imag part XXSWAPD_S(%x14,46) // exchange real and imag part
"xxswapd %x15, 47 \n\t" // exchange real and imag part XXSWAPD_S(%x15,47) // exchange real and imag part
"addi %2, %2, 64 \n\t" "addi %2, %2, 64 \n\t"
"addi %3, %3, 64 \n\t" "addi %3, %3, 64 \n\t"
"addic. %1, %1, -8 \n\t" "addic. %1, %1, -8 \n\t"
"ble 2f \n\t" "ble two%= \n\t"
".p2align 5 \n" ".align 5 \n"
"1: \n\t" "one%=: \n\t"
"xvmaddadp 48, 40, 32 \n\t" // alpha_r * x0_r , alpha_r * x0_i "xvmaddadp 48, 40, 32 \n\t" // alpha_r * x0_r , alpha_r * x0_i
"xvmaddadp 49, 41, 32 \n\t" "xvmaddadp 49, 41, 32 \n\t"
@ -163,31 +163,31 @@ static void zaxpy_kernel_4 (long n, double *x, double *y,
"addi %16, %16, 64 \n\t" "addi %16, %16, 64 \n\t"
"xxswapd %x8, 40 \n\t" // exchange real and imag part XXSWAPD_S(%x8,40) // exchange real and imag part
"xxswapd %x9, 41 \n\t" // exchange real and imag part XXSWAPD_S(%x9,41) // exchange real and imag part
"lxvd2x 48, 0, %3 \n\t" // y0 "lxvd2x 48, 0, %3 \n\t" // y0
"lxvd2x 49, %22, %3 \n\t" // y1 "lxvd2x 49, %22, %3 \n\t" // y1
"xxswapd %x10, 42 \n\t" // exchange real and imag part XXSWAPD_S(%x10,42) // exchange real and imag part
"xxswapd %x11, 43 \n\t" // exchange real and imag part XXSWAPD_S(%x11,43) // exchange real and imag part
"lxvd2x 50, %23, %3 \n\t" // y2 "lxvd2x 50, %23, %3 \n\t" // y2
"lxvd2x 51, %24, %3 \n\t" // y3 "lxvd2x 51, %24, %3 \n\t" // y3
"xxswapd %x12, 44 \n\t" // exchange real and imag part XXSWAPD_S(%x12,44) // exchange real and imag part
"addi %3, %3, 64 \n\t" "addi %3, %3, 64 \n\t"
"xxswapd %x13, 45 \n\t" // exchange real and imag part XXSWAPD_S(%x13,45) // exchange real and imag part
"lxvd2x %x4, 0, %3 \n\t" // y4 "lxvd2x %x4, 0, %3 \n\t" // y4
"lxvd2x %x5, %22, %3 \n\t" // y5 "lxvd2x %x5, %22, %3 \n\t" // y5
"xxswapd %x14, 46 \n\t" // exchange real and imag part XXSWAPD_S(%x14,46) // exchange real and imag part
"xxswapd %x15, 47 \n\t" // exchange real and imag part XXSWAPD_S(%x15,47) // exchange real and imag part
"lxvd2x %x6, %23, %3 \n\t" // y6 "lxvd2x %x6, %23, %3 \n\t" // y6
"lxvd2x %x7, %24, %3 \n\t" // y7 "lxvd2x %x7, %24, %3 \n\t" // y7
"addi %3, %3, 64 \n\t" "addi %3, %3, 64 \n\t"
"addic. %1, %1, -8 \n\t" "addic. %1, %1, -8 \n\t"
"bgt 1b \n" "bgt one%= \n"
"2: \n\t" "two%=: \n\t"
"xvmaddadp 48, 40, 32 \n\t" // alpha_r * x0_r , alpha_r * x0_i "xvmaddadp 48, 40, 32 \n\t" // alpha_r * x0_r , alpha_r * x0_i
"xvmaddadp 49, 41, 32 \n\t" "xvmaddadp 49, 41, 32 \n\t"
View File
@ -62,10 +62,10 @@ static void zcopy_kernel_16 (long n, FLOAT *x, FLOAT *y)
"addi %2, %2, 128 \n\t" "addi %2, %2, 128 \n\t"
"addic. %1, %1, -16 \n\t" "addic. %1, %1, -16 \n\t"
"ble 2f \n\t" "ble two%= \n\t"
".p2align 5 \n" ".align 5 \n"
"1: \n\t" "one%=: \n\t"
"stxvd2x 32, 0, %3 \n\t" "stxvd2x 32, 0, %3 \n\t"
"stxvd2x 33, %5, %3 \n\t" "stxvd2x 33, %5, %3 \n\t"
@ -108,9 +108,9 @@ static void zcopy_kernel_16 (long n, FLOAT *x, FLOAT *y)
"addi %2, %2, 128 \n\t" "addi %2, %2, 128 \n\t"
"addic. %1, %1, -16 \n\t" "addic. %1, %1, -16 \n\t"
"bgt 1b \n" "bgt one%= \n"
"2: \n\t" "two%=: \n\t"
"stxvd2x 32, 0, %3 \n\t" "stxvd2x 32, 0, %3 \n\t"
"stxvd2x 33, %5, %3 \n\t" "stxvd2x 33, %5, %3 \n\t"
View File
@ -60,10 +60,10 @@ static void zdot_kernel_8 (long n, double *x, double *y, double *dot)
"lxvd2x 43, %9, %2 \n\t" // x3_r, x3_i "lxvd2x 43, %9, %2 \n\t" // x3_r, x3_i
"lxvd2x 51, %9, %3 \n\t" // y3_r, y3_i "lxvd2x 51, %9, %3 \n\t" // y3_r, y3_i
"xxswapd 0, 48 \n\t" // y0_i, y0_r XXSWAPD_S(0,48) // y0_i, y0_r
"xxswapd 1, 49 \n\t" // y1_i, y1_r XXSWAPD_S(1,49) // y1_i, y1_r
"xxswapd 2, 50 \n\t" // y2_i, y2_r XXSWAPD_S(2,50) // y2_i, y2_r
"xxswapd 3, 51 \n\t" // y3_i, y3_r XXSWAPD_S(3,51) // y3_i, y3_r
"addi %2, %2, 64 \n\t" "addi %2, %2, 64 \n\t"
"addi %3, %3, 64 \n\t" "addi %3, %3, 64 \n\t"
@ -77,19 +77,19 @@ static void zdot_kernel_8 (long n, double *x, double *y, double *dot)
"lxvd2x 47, %9, %2 \n\t" // x3_r, x3_i "lxvd2x 47, %9, %2 \n\t" // x3_r, x3_i
"lxvd2x 7, %9, %3 \n\t" // y3_r, y3_i "lxvd2x 7, %9, %3 \n\t" // y3_r, y3_i
"xxswapd 8, 4 \n\t" // y0_i, y0_r XXSWAPD_S(8,4) // y0_i, y0_r
"xxswapd 9, 5 \n\t" // y1_i, y1_r XXSWAPD_S(9,5) // y1_i, y1_r
"xxswapd 10, 6 \n\t" // y2_i, y2_r XXSWAPD_S(10,6) // y2_i, y2_r
"xxswapd 11, 7 \n\t" // y3_i, y3_r XXSWAPD_S(11,7) // y3_i, y3_r
"addi %2, %2, 64 \n\t" "addi %2, %2, 64 \n\t"
"addi %3, %3, 64 \n\t" "addi %3, %3, 64 \n\t"
"addic. %1, %1, -8 \n\t" "addic. %1, %1, -8 \n\t"
"ble 2f \n\t" "ble two%= \n\t"
".p2align 5 \n" ".align 5 \n"
"1: \n\t" "one%=: \n\t"
"xvmaddadp 32, 40, 48 \n\t" // x0_r * y0_r , x0_i * y0_i "xvmaddadp 32, 40, 48 \n\t" // x0_r * y0_r , x0_i * y0_i
"lxvd2x 48, 0, %3 \n\t" // y0_r, y0_i "lxvd2x 48, 0, %3 \n\t" // y0_r, y0_i
@ -111,14 +111,14 @@ static void zdot_kernel_8 (long n, double *x, double *y, double *dot)
"xvmaddadp 39, 43, 3 \n\t" // x3_r * y3_i , x3_i * y3_r "xvmaddadp 39, 43, 3 \n\t" // x3_r * y3_i , x3_i * y3_r
"lxvd2x 43, %9, %2 \n\t" // x3_r, x3_i "lxvd2x 43, %9, %2 \n\t" // x3_r, x3_i
"xxswapd 0,48 \n\t" // y0_i, y0_r XXSWAPD_S(0,48) // y0_i, y0_r
"xxswapd 1,49 \n\t" // y1_i, y1_r XXSWAPD_S(1,49) // y1_i, y1_r
"addi %2, %2, 64 \n\t" "addi %2, %2, 64 \n\t"
"addi %3, %3, 64 \n\t" "addi %3, %3, 64 \n\t"
"xxswapd 2,50 \n\t" // y2_i, y2_r XXSWAPD_S(2,50) // y2_i, y2_r
"xxswapd 3,51 \n\t" // y3_i, y3_r XXSWAPD_S(3,51) // y3_i, y3_r
"xvmaddadp 32, 44, 4 \n\t" // x0_r * y0_r , x0_i * y0_i "xvmaddadp 32, 44, 4 \n\t" // x0_r * y0_r , x0_i * y0_i
"lxvd2x 4, 0, %3 \n\t" // y0_r, y0_i "lxvd2x 4, 0, %3 \n\t" // y0_r, y0_i
@ -138,19 +138,19 @@ static void zdot_kernel_8 (long n, double *x, double *y, double *dot)
"xvmaddadp 39, 47, 11 \n\t" // x3_r * y3_i , x3_i * y3_r "xvmaddadp 39, 47, 11 \n\t" // x3_r * y3_i , x3_i * y3_r
"lxvd2x 47, %9, %2 \n\t" // x3_r, x3_i "lxvd2x 47, %9, %2 \n\t" // x3_r, x3_i
"xxswapd 8,4 \n\t" // y0_i, y0_r XXSWAPD_S(8,4) // y0_i, y0_r
"xxswapd 9,5 \n\t" // y1_i, y1_r XXSWAPD_S(9,5) // y1_i, y1_r
"addi %2, %2, 64 \n\t" "addi %2, %2, 64 \n\t"
"addi %3, %3, 64 \n\t" "addi %3, %3, 64 \n\t"
"xxswapd 10,6 \n\t" // y2_i, y2_r XXSWAPD_S(10,6) // y2_i, y2_r
"xxswapd 11,7 \n\t" // y3_i, y3_r XXSWAPD_S(11,7) // y3_i, y3_r
"addic. %1, %1, -8 \n\t" "addic. %1, %1, -8 \n\t"
"bgt 1b \n" "bgt one%= \n"
"2: \n\t" "two%=: \n\t"
"xvmaddadp 32, 40, 48 \n\t" // x0_r * y0_r , x0_i * y0_i "xvmaddadp 32, 40, 48 \n\t" // x0_r * y0_r , x0_i * y0_i
"xvmaddadp 34, 41, 49 \n\t" // x1_r * y1_r , x1_i * y1_i "xvmaddadp 34, 41, 49 \n\t" // x1_r * y1_r , x1_i * y1_i
File diff suppressed because it is too large

View File
@ -38,7 +38,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
* Macros for N=4 and M=8 * Macros for N=4 and M=8
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_4x8', `
#else
.macro COPY_4x8 .macro COPY_4x8
#endif
lxvd2x vs32, o0, A0 lxvd2x vs32, o0, A0
lxvd2x vs33, o16, A0 lxvd2x vs33, o16, A0
@ -144,14 +148,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvd2x vs12, o32, T1 stxvd2x vs12, o32, T1
stxvd2x vs13, o48, T1 stxvd2x vs13, o48, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=4 and M=4 * Macros for N=4 and M=4
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_4x4', `
#else
.macro COPY_4x4 .macro COPY_4x4
#endif
lxvd2x vs32, o0, A0 lxvd2x vs32, o0, A0
lxvd2x vs33, o16, A0 lxvd2x vs33, o16, A0
@ -209,14 +221,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvd2x vs46, o32, T1 stxvd2x vs46, o32, T1
stxvd2x vs47, o48, T1 stxvd2x vs47, o48, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=4 and M=2 * Macros for N=4 and M=2
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_4x2', `
#else
.macro COPY_4x2 .macro COPY_4x2
#endif
lxvd2x vs32, o0, A0 lxvd2x vs32, o0, A0
lxvd2x vs33, o16, A0 lxvd2x vs33, o16, A0
@ -254,14 +274,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvd2x vs38, o32, T1 stxvd2x vs38, o32, T1
stxvd2x vs39, o48, T1 stxvd2x vs39, o48, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=4 and M=1 * Macros for N=4 and M=1
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_4x1', `
#else
.macro COPY_4x1 .macro COPY_4x1
#endif
lxvd2x vs32, o0, A0 lxvd2x vs32, o0, A0
addi A0, A0, 16 addi A0, A0, 16
@ -289,14 +317,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvd2x vs35, o48, T1 stxvd2x vs35, o48, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=2 and M=8 * Macros for N=2 and M=8
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_2x8', `
#else
.macro COPY_2x8 .macro COPY_2x8
#endif
lxvd2x vs32, o0, A0 lxvd2x vs32, o0, A0
lxvd2x vs33, o16, A0 lxvd2x vs33, o16, A0
@ -350,14 +386,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvd2x vs46, o32, T1 stxvd2x vs46, o32, T1
stxvd2x vs47, o48, T1 stxvd2x vs47, o48, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=2 and M=4 * Macros for N=2 and M=4
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_2x4', `
#else
.macro COPY_2x4 .macro COPY_2x4
#endif
lxvd2x vs32, o0, A0 lxvd2x vs32, o0, A0
lxvd2x vs33, o16, A0 lxvd2x vs33, o16, A0
@ -387,14 +431,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvd2x vs38, o32, T1 stxvd2x vs38, o32, T1
stxvd2x vs39, o48, T1 stxvd2x vs39, o48, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=2 and M=2 * Macros for N=2 and M=2
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_2x2', `
#else
.macro COPY_2x2 .macro COPY_2x2
#endif
lxvd2x vs32, o0, A0 lxvd2x vs32, o0, A0
lxvd2x vs33, o16, A0 lxvd2x vs33, o16, A0
@ -414,14 +466,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvd2x vs34, o32, T1 stxvd2x vs34, o32, T1
stxvd2x vs35, o48, T1 stxvd2x vs35, o48, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=2 and M=1 * Macros for N=2 and M=1
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_2x1', `
#else
.macro COPY_2x1 .macro COPY_2x1
#endif
lxvd2x vs32, o0, A0 lxvd2x vs32, o0, A0
addi A0, A0, 16 addi A0, A0, 16
@ -437,14 +497,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvd2x vs33, o16, T1 stxvd2x vs33, o16, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=1 and M=8 * Macros for N=1 and M=8
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_1x8', `
#else
.macro COPY_1x8 .macro COPY_1x8
#endif
lxvd2x vs32, o0, A0 lxvd2x vs32, o0, A0
lxvd2x vs33, o16, A0 lxvd2x vs33, o16, A0
@ -472,14 +540,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvd2x vs38, o32, T1 stxvd2x vs38, o32, T1
stxvd2x vs39, o48, T1 stxvd2x vs39, o48, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=1 and M=4 * Macros for N=1 and M=4
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_1x4', `
#else
.macro COPY_1x4 .macro COPY_1x4
#endif
lxvd2x vs32, o0, A0 lxvd2x vs32, o0, A0
lxvd2x vs33, o16, A0 lxvd2x vs33, o16, A0
@ -495,14 +571,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvd2x vs34, o32, T1 stxvd2x vs34, o32, T1
stxvd2x vs35, o48, T1 stxvd2x vs35, o48, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=1 and M=2 * Macros for N=1 and M=2
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_1x2', `
#else
.macro COPY_1x2 .macro COPY_1x2
#endif
lxvd2x vs32, o0, A0 lxvd2x vs32, o0, A0
lxvd2x vs33, o16, A0 lxvd2x vs33, o16, A0
@ -514,14 +598,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvd2x vs32, o0, T1 stxvd2x vs32, o0, T1
stxvd2x vs33, o16, T1 stxvd2x vs33, o16, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
/********************************************************************************************** /**********************************************************************************************
* Macros for N=1 and M=1 * Macros for N=1 and M=1
**********************************************************************************************/ **********************************************************************************************/
#if defined(_AIX)
define(`COPY_1x1', `
#else
.macro COPY_1x1 .macro COPY_1x1
#endif
lxvd2x vs32, o0, A0 lxvd2x vs32, o0, A0
addi A0, A0, 16 addi A0, A0, 16
@ -531,5 +623,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stxvd2x vs32, o0, T1 stxvd2x vs32, o0, T1
#if defined(_AIX)
')
#else
.endm .endm
#endif
View File
@ -40,8 +40,8 @@ static void zrot_kernel_4(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT cosA, FLOAT si
__asm__ __asm__
( (
"xxspltd 36, %x[cos], 0 \n\t" // load c to both dwords XXSPLTD_S(36,%x[cos],0) // load c to both dwords
"xxspltd 37, %x[sin], 0 \n\t" // load s to both dwords XXSPLTD_S(37,%x[sin],0) // load s to both dwords
"lxvd2x 32, 0, %[x_ptr] \n\t" // load x "lxvd2x 32, 0, %[x_ptr] \n\t" // load x
"lxvd2x 33, %[i16], %[x_ptr] \n\t" "lxvd2x 33, %[i16], %[x_ptr] \n\t"
@ -57,10 +57,10 @@ static void zrot_kernel_4(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT cosA, FLOAT si
"addi %[y_ptr], %[y_ptr], 64 \n\t" "addi %[y_ptr], %[y_ptr], 64 \n\t"
"addic. %[temp_n], %[temp_n], -4 \n\t" "addic. %[temp_n], %[temp_n], -4 \n\t"
"ble 2f \n\t" "ble two%= \n\t"
".p2align 5 \n" ".align 5 \n"
"1: \n\t" "one%=: \n\t"
"xvmuldp 40, 32, 36 \n\t" // c * x "xvmuldp 40, 32, 36 \n\t" // c * x
"xvmuldp 41, 33, 36 \n\t" "xvmuldp 41, 33, 36 \n\t"
@ -124,9 +124,9 @@ static void zrot_kernel_4(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT cosA, FLOAT si
"addi %[y_ptr], %[y_ptr], 128 \n\t" "addi %[y_ptr], %[y_ptr], 128 \n\t"
"addic. %[temp_n], %[temp_n], -4 \n\t" "addic. %[temp_n], %[temp_n], -4 \n\t"
"bgt+ 1b \n" "bgt+ one%= \n"
"2: \n\t" "two%=: \n\t"
"xvmuldp 40, 32, 36 \n\t" // c * x "xvmuldp 40, 32, 36 \n\t" // c * x
"xvmuldp 41, 33, 36 \n\t" "xvmuldp 41, 33, 36 \n\t"
View File
@ -58,8 +58,8 @@ static void zscal_kernel_8 (long n, double *x, double alpha_r, double alpha_i)
"dcbt 0, %2 \n\t" "dcbt 0, %2 \n\t"
"xsnegdp 33, %x16 \n\t" // -alpha_i "xsnegdp 33, %x16 \n\t" // -alpha_i
"xxspltd 32, %x15, 0 \n\t" // alpha_r , alpha_r XXSPLTD_S(32,%x15,0) // alpha_r , alpha_r
"xxmrghd 33, 33, %x16 \n\t" // -alpha_i , alpha_i XXMRGHD_S(33,33,%x16) // -alpha_i , alpha_i
"lxvd2x 40, 0, %2 \n\t" // x0_r, x0_i "lxvd2x 40, 0, %2 \n\t" // x0_r, x0_i
"lxvd2x 41, %17, %2 \n\t" "lxvd2x 41, %17, %2 \n\t"
@ -73,10 +73,10 @@ static void zscal_kernel_8 (long n, double *x, double alpha_r, double alpha_i)
"addi %2, %2, 128 \n\t" "addi %2, %2, 128 \n\t"
"addic. %1, %1, -8 \n\t" "addic. %1, %1, -8 \n\t"
"ble 2f \n\t" "ble two%= \n\t"
".p2align 5 \n" ".align 5 \n"
"1: \n\t" "one%=: \n\t"
"xvmuldp 48, 40, 32 \n\t" // x0_r * alpha_r, x0_i * alpha_r "xvmuldp 48, 40, 32 \n\t" // x0_r * alpha_r, x0_i * alpha_r
"xvmuldp 49, 41, 32 \n\t" "xvmuldp 49, 41, 32 \n\t"
@ -87,14 +87,14 @@ static void zscal_kernel_8 (long n, double *x, double alpha_r, double alpha_i)
"xvmuldp %x5, 46, 32 \n\t" "xvmuldp %x5, 46, 32 \n\t"
"xvmuldp %x6, 47, 32 \n\t" "xvmuldp %x6, 47, 32 \n\t"
"xxswapd %x7, 40 \n\t" XXSWAPD_S(%x7,40)
"xxswapd %x8, 41 \n\t" XXSWAPD_S(%x8,41)
"xxswapd %x9, 42 \n\t" XXSWAPD_S(%x9,42)
"xxswapd %x10, 43 \n\t" XXSWAPD_S(%x10,43)
"xxswapd %x11, 44 \n\t" XXSWAPD_S(%x11,44)
"xxswapd %x12, 45 \n\t" XXSWAPD_S(%x12,45)
"xxswapd %x13, 46 \n\t" XXSWAPD_S(%x13,46)
"xxswapd %x14, 47 \n\t" XXSWAPD_S(%x14,47)
"xvmuldp %x7, %x7, 33 \n\t" // x0_i * -alpha_i, x0_r * alpha_i "xvmuldp %x7, %x7, 33 \n\t" // x0_i * -alpha_i, x0_r * alpha_i
"xvmuldp %x8, %x8, 33 \n\t" "xvmuldp %x8, %x8, 33 \n\t"
@ -147,9 +147,9 @@ static void zscal_kernel_8 (long n, double *x, double alpha_r, double alpha_i)
"addi %2, %2, 256 \n\t" "addi %2, %2, 256 \n\t"
"addic. %1, %1, -8 \n\t" "addic. %1, %1, -8 \n\t"
"bgt 1b \n" "bgt one%= \n"
"2: \n\t" "two%=: \n\t"
"xvmuldp 48, 40, 32 \n\t" // x0_r * alpha_r, x0_i * alpha_r "xvmuldp 48, 40, 32 \n\t" // x0_r * alpha_r, x0_i * alpha_r
"xvmuldp 49, 41, 32 \n\t" "xvmuldp 49, 41, 32 \n\t"
@ -160,14 +160,14 @@ static void zscal_kernel_8 (long n, double *x, double alpha_r, double alpha_i)
"xvmuldp %x5, 46, 32 \n\t" "xvmuldp %x5, 46, 32 \n\t"
"xvmuldp %x6, 47, 32 \n\t" "xvmuldp %x6, 47, 32 \n\t"
"xxswapd %x7, 40 \n\t" XXSWAPD_S(%x7,40)
"xxswapd %x8, 41 \n\t" XXSWAPD_S(%x8,41)
"xxswapd %x9, 42 \n\t" XXSWAPD_S(%x9,42)
"xxswapd %x10, 43 \n\t" XXSWAPD_S(%x10,43)
"xxswapd %x11, 44 \n\t" XXSWAPD_S(%x11,44)
"xxswapd %x12, 45 \n\t" XXSWAPD_S(%x12,45)
"xxswapd %x13, 46 \n\t" XXSWAPD_S(%x13,46)
"xxswapd %x14, 47 \n\t" XXSWAPD_S(%x14,47)
"addi %2, %2, -128 \n\t" "addi %2, %2, -128 \n\t"
View File
@ -40,8 +40,8 @@ zswap_kernel_16 (long n, double *x, double *y)
{ {
__asm__ __asm__
( (
".p2align 5 \n" ".align 5 \n"
"1: \n\t" "one%=: \n\t"
"lxvd2x 32, 0, %4 \n\t" "lxvd2x 32, 0, %4 \n\t"
"lxvd2x 33, %5, %4 \n\t" "lxvd2x 33, %5, %4 \n\t"
"lxvd2x 34, %6, %4 \n\t" "lxvd2x 34, %6, %4 \n\t"
@ -130,7 +130,7 @@ zswap_kernel_16 (long n, double *x, double *y)
"addi %4, %4, 128 \n\t" "addi %4, %4, 128 \n\t"
"addic. %2, %2, -16 \n\t" "addic. %2, %2, -16 \n\t"
"bgt 1b \n" "bgt one%= \n"
"#n=%2 x=%0=%3 y=%1=%4 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11" "#n=%2 x=%0=%3 y=%1=%4 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11"
: :
File diff suppressed because it is too large

View File
@ -56,7 +56,7 @@ DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
CTRMMKERNEL = cgemm_kernel_8x2_haswell.S CTRMMKERNEL = cgemm_kernel_8x2_haswell.S
CGEMMKERNEL = cgemm_kernel_8x2_haswell.S CGEMMKERNEL = cgemm_kernel_8x2_haswell.c
CGEMMINCOPY = ../generic/zgemm_ncopy_8.c CGEMMINCOPY = ../generic/zgemm_ncopy_8.c
CGEMMITCOPY = ../generic/zgemm_tcopy_8.c CGEMMITCOPY = ../generic/zgemm_tcopy_8.c
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
@ -67,7 +67,7 @@ CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
ZTRMMKERNEL = zgemm_kernel_4x2_haswell.S ZTRMMKERNEL = zgemm_kernel_4x2_haswell.S
ZGEMMKERNEL = zgemm_kernel_4x2_haswell.S ZGEMMKERNEL = zgemm_kernel_4x2_haswell.c
ZGEMMINCOPY = ../generic/zgemm_ncopy_4.c ZGEMMINCOPY = ../generic/zgemm_ncopy_4.c
ZGEMMITCOPY = ../generic/zgemm_tcopy_4.c ZGEMMITCOPY = ../generic/zgemm_tcopy_4.c
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
@ -97,6 +97,6 @@ ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
CGEMM3MKERNEL = zgemm3m_kernel_4x8_nehalem.S CGEMM3MKERNEL = cgemm3m_kernel_8x4_haswell.c
ZGEMM3MKERNEL = zgemm3m_kernel_2x8_nehalem.S ZGEMM3MKERNEL = zgemm3m_kernel_2x8_nehalem.S
View File
@ -53,7 +53,7 @@ DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
CTRMMKERNEL = cgemm_kernel_8x2_haswell.S CTRMMKERNEL = cgemm_kernel_8x2_haswell.S
CGEMMKERNEL = cgemm_kernel_8x2_haswell.S CGEMMKERNEL = cgemm_kernel_8x2_haswell.c
CGEMMINCOPY = ../generic/zgemm_ncopy_8.c CGEMMINCOPY = ../generic/zgemm_ncopy_8.c
CGEMMITCOPY = ../generic/zgemm_tcopy_8.c CGEMMITCOPY = ../generic/zgemm_tcopy_8.c
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
@ -64,7 +64,7 @@ CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
ZTRMMKERNEL = zgemm_kernel_4x2_haswell.S ZTRMMKERNEL = zgemm_kernel_4x2_haswell.S
ZGEMMKERNEL = zgemm_kernel_4x2_haswell.S ZGEMMKERNEL = zgemm_kernel_4x2_haswell.c
ZGEMMINCOPY = ../generic/zgemm_ncopy_4.c ZGEMMINCOPY = ../generic/zgemm_ncopy_4.c
ZGEMMITCOPY = ../generic/zgemm_tcopy_4.c ZGEMMITCOPY = ../generic/zgemm_tcopy_4.c
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
@ -94,6 +94,6 @@ ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
CGEMM3MKERNEL = zgemm3m_kernel_4x8_nehalem.S CGEMM3MKERNEL = cgemm3m_kernel_8x4_haswell.c
ZGEMM3MKERNEL = zgemm3m_kernel_2x8_nehalem.S ZGEMM3MKERNEL = zgemm3m_kernel_2x8_nehalem.S
View File
@ -0,0 +1,279 @@
/* %0 = "+r"(a_pointer), %1 = "+r"(b_pointer), %2 = "+r"(c_pointer), %3 = "+r"(ldc_in_bytes), %4 for k_count, %5 for c_store */
/* r12 = k << 4(const), r13 = k(const), r14 = b_head_pos(const), r15 = tmp */
#include "common.h"
#include <stdint.h>
//recommended settings: GEMM_P = 320, GEMM_Q = 320.
/* m = 8 *//* ymm0 for alpha, ymm1-ymm3 for temporary use, ymm4-ymm15 for accumulators */
#define KERNEL_k1m8n1 \
"vmovups (%0),%%ymm1; addq $32,%0;"\
"vbroadcastss (%1),%%ymm2; vfmadd231ps %%ymm1,%%ymm2,%%ymm4;"\
"addq $4,%1;"
#define KERNEL_h_k1m8n2 \
"vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2; addq $32,%0;"\
"vbroadcastsd (%1),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm4; vfmadd231ps %%ymm2,%%ymm3,%%ymm5;"
#define KERNEL_k1m8n2 KERNEL_h_k1m8n2 "addq $8,%1;"
#define KERNEL_h_k1m8n4 \
KERNEL_h_k1m8n2 "vbroadcastsd 8(%1),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm6; vfmadd231ps %%ymm2,%%ymm3,%%ymm7;"
#define KERNEL_k1m8n4 KERNEL_h_k1m8n4 "addq $16,%1;"
#define unit_kernel_k1m8n4(c1,c2,c3,c4,...) \
"vbroadcastsd ("#__VA_ARGS__"),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,"#c1"; vfmadd231ps %%ymm2,%%ymm3,"#c2";"\
"vbroadcastsd 8("#__VA_ARGS__"),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,"#c3"; vfmadd231ps %%ymm2,%%ymm3,"#c4";"
#define KERNEL_h_k1m8n8 KERNEL_h_k1m8n4 unit_kernel_k1m8n4(%%ymm8,%%ymm9,%%ymm10,%%ymm11,%1,%%r12,1)
#define KERNEL_k1m8n8 KERNEL_h_k1m8n8 "addq $16,%1;"
#define KERNEL_h_k1m8n12 KERNEL_h_k1m8n8 unit_kernel_k1m8n4(%%ymm12,%%ymm13,%%ymm14,%%ymm15,%1,%%r12,2)
#define KERNEL_k1m8n12 KERNEL_h_k1m8n12 "addq $16,%1;"
#define INIT_m8n1 "vpxor %%ymm4,%%ymm4,%%ymm4;"
#define INIT_m8n2 INIT_m8n1 "vpxor %%ymm5,%%ymm5,%%ymm5;"
#define INIT_m8n4 INIT_m8n2 "vpxor %%ymm6,%%ymm6,%%ymm6;vpxor %%ymm7,%%ymm7,%%ymm7;"
#define unit_init_m8n4(c1,c2,c3,c4) \
"vpxor "#c1","#c1","#c1";vpxor "#c2","#c2","#c2";vpxor "#c3","#c3","#c3";vpxor "#c4","#c4","#c4";"
#define INIT_m8n8 INIT_m8n4 unit_init_m8n4(%%ymm8,%%ymm9,%%ymm10,%%ymm11)
#define INIT_m8n12 INIT_m8n8 unit_init_m8n4(%%ymm12,%%ymm13,%%ymm14,%%ymm15)
#define SAVE_m8n1 \
"vunpcklps %%ymm4,%%ymm4,%%ymm2; vunpckhps %%ymm4,%%ymm4,%%ymm3;"\
"vperm2f128 $2,%%ymm2,%%ymm3,%%ymm1; vperm2f128 $19,%%ymm2,%%ymm3,%%ymm2;"\
"vfmadd213ps (%2),%%ymm0,%%ymm1; vfmadd213ps 32(%2),%%ymm0,%%ymm2; vmovups %%ymm1,(%2); vmovups %%ymm2,32(%2);"
#define unit_save_m8n2(c1,c2) \
"vunpcklpd "#c2","#c1",%%ymm2; vunpckhpd "#c2","#c1",%%ymm3;"\
"vperm2f128 $2,%%ymm2,%%ymm3,"#c1"; vperm2f128 $19,%%ymm2,%%ymm3,"#c2";"\
"vmovsldup "#c1",%%ymm2; vmovsldup "#c2",%%ymm3;"\
"vfmadd213ps (%5),%%ymm0,%%ymm2; vfmadd213ps 32(%5),%%ymm0,%%ymm3; vmovups %%ymm2,(%5); vmovups %%ymm3,32(%5);"\
"vmovshdup "#c1",%%ymm2; vmovshdup "#c2",%%ymm3;"\
"vfmadd213ps (%5,%3,1),%%ymm0,%%ymm2; vfmadd213ps 32(%5,%3,1),%%ymm0,%%ymm3; vmovups %%ymm2,(%5,%3,1); vmovups %%ymm3,32(%5,%3,1);"\
"leaq (%5,%3,2),%5;"
#define SAVE_m8n2 "movq %2,%5;" unit_save_m8n2(%%ymm4,%%ymm5)
#define SAVE_m8n4 SAVE_m8n2 unit_save_m8n2(%%ymm6,%%ymm7)
#define SAVE_m8n8 SAVE_m8n4 unit_save_m8n2(%%ymm8,%%ymm9) unit_save_m8n2(%%ymm10,%%ymm11)
#define SAVE_m8n12 SAVE_m8n8 unit_save_m8n2(%%ymm12,%%ymm13) unit_save_m8n2(%%ymm14,%%ymm15)
#define COMPUTE_m8(ndim) \
INIT_m8n##ndim\
"movq %%r13,%4; movq %%r14,%1; movq %2,%5; xorq %%r15,%%r15;"\
"cmpq $24,%4; jb "#ndim"882f;"\
#ndim"881:\n\t"\
"cmpq $126,%%r15; movq $126,%%r15; cmoveq %3,%%r15;"\
"prefetcht0 64(%1); prefetcht0 64(%1,%%r12,1); prefetcht0 64(%1,%%r12,2);"\
"prefetcht0 512(%0);" KERNEL_k1m8n##ndim KERNEL_k1m8n##ndim\
"prefetcht0 512(%0);" KERNEL_k1m8n##ndim KERNEL_k1m8n##ndim\
"prefetcht1 (%5); leaq -63(%5,%%r15,1),%5;"\
"prefetcht0 64(%1); prefetcht0 64(%1,%%r12,1); prefetcht0 64(%1,%%r12,2);"\
"prefetcht0 512(%0);" KERNEL_k1m8n##ndim KERNEL_k1m8n##ndim\
"prefetcht0 512(%0);" KERNEL_k1m8n##ndim KERNEL_k1m8n##ndim\
"prefetcht1 (%8); addq $16,%8;"\
"subq $8,%4; cmpq $24,%4; jnb "#ndim"881b;"\
"movq %2,%5;"\
#ndim"882:\n\t"\
"testq %4,%4; jz "#ndim"883f;"\
"prefetcht0 (%5); prefetcht0 63(%5); addq %3,%5;"\
KERNEL_k1m8n##ndim\
"decq %4; jmp "#ndim"882b;"\
#ndim"883:\n\t"\
"prefetcht0 (%%r14); prefetcht0 64(%%r14);"\
SAVE_m8n##ndim "addq $64,%2;"
/* m = 4 *//* xmm0 for alpha, xmm1-xmm3 for temporary use, xmm4-xmm15 for accumulators */
#define KERNEL_k1m4n1 \
"vmovups (%0),%%xmm1; addq $16,%0;"\
"vbroadcastss (%1),%%xmm2; vfmadd231ps %%xmm1,%%xmm2,%%xmm4;"\
"addq $4,%1;"
#define KERNEL_h_k1m4n2 \
"vmovsldup (%0),%%xmm1; vmovshdup (%0),%%xmm2; addq $16,%0;"\
"vmovddup (%1),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm4; vfmadd231ps %%xmm2,%%xmm3,%%xmm5;"
#define KERNEL_k1m4n2 KERNEL_h_k1m4n2 "addq $8,%1;"
#define KERNEL_h_k1m4n4 \
KERNEL_h_k1m4n2 "vmovddup 8(%1),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm6; vfmadd231ps %%xmm2,%%xmm3,%%xmm7;"
#define KERNEL_k1m4n4 KERNEL_h_k1m4n4 "addq $16,%1;"
#define unit_kernel_k1m4n4(c1,c2,c3,c4,...) \
"vmovddup ("#__VA_ARGS__"),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,"#c1"; vfmadd231ps %%xmm2,%%xmm3,"#c2";"\
"vmovddup 8("#__VA_ARGS__"),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,"#c3"; vfmadd231ps %%xmm2,%%xmm3,"#c4";"
#define KERNEL_h_k1m4n8 KERNEL_h_k1m4n4 unit_kernel_k1m4n4(%%xmm8,%%xmm9,%%xmm10,%%xmm11,%1,%%r12,1)
#define KERNEL_k1m4n8 KERNEL_h_k1m4n8 "addq $16,%1;"
#define KERNEL_h_k1m4n12 KERNEL_h_k1m4n8 unit_kernel_k1m4n4(%%xmm12,%%xmm13,%%xmm14,%%xmm15,%1,%%r12,2)
#define KERNEL_k1m4n12 KERNEL_h_k1m4n12 "addq $16,%1;"
#define INIT_m4n1 "vpxor %%xmm4,%%xmm4,%%xmm4;"
#define INIT_m4n2 INIT_m4n1 "vpxor %%xmm5,%%xmm5,%%xmm5;"
#define INIT_m4n4 INIT_m4n2 "vpxor %%xmm6,%%xmm6,%%xmm6;vpxor %%xmm7,%%xmm7,%%xmm7;"
#define unit_init_m4n4(c1,c2,c3,c4) \
"vpxor "#c1","#c1","#c1";vpxor "#c2","#c2","#c2";vpxor "#c3","#c3","#c3";vpxor "#c4","#c4","#c4";"
#define INIT_m4n8 INIT_m4n4 unit_init_m4n4(%%xmm8,%%xmm9,%%xmm10,%%xmm11)
#define INIT_m4n12 INIT_m4n8 unit_init_m4n4(%%xmm12,%%xmm13,%%xmm14,%%xmm15)
#define SAVE_m4n1 \
"vunpcklps %%xmm4,%%xmm4,%%xmm2; vunpckhps %%xmm4,%%xmm4,%%xmm3;"\
"vfmadd213ps (%2),%%xmm0,%%xmm2; vfmadd213ps 16(%2),%%xmm0,%%xmm3; vmovups %%xmm2,(%2); vmovups %%xmm3,16(%2);"
#define unit_save_m4n2(c1,c2) \
"vunpcklpd "#c2","#c1",%%xmm2; vunpckhpd "#c2","#c1","#c2"; vmovapd %%xmm2,"#c1";"\
"vmovsldup "#c1",%%xmm2; vmovsldup "#c2",%%xmm3;"\
"vfmadd213ps (%5),%%xmm0,%%xmm2; vfmadd213ps 16(%5),%%xmm0,%%xmm3; vmovups %%xmm2,(%5); vmovups %%xmm3,16(%5);"\
"vmovshdup "#c1",%%xmm2; vmovshdup "#c2",%%xmm3;"\
"vfmadd213ps (%5,%3,1),%%xmm0,%%xmm2; vfmadd213ps 16(%5,%3,1),%%xmm0,%%xmm3; vmovups %%xmm2,(%5,%3,1); vmovups %%xmm3,16(%5,%3,1);"\
"leaq (%5,%3,2),%5;"
#define SAVE_m4n2 "movq %2,%5;" unit_save_m4n2(%%xmm4,%%xmm5)
#define SAVE_m4n4 SAVE_m4n2 unit_save_m4n2(%%xmm6,%%xmm7)
#define SAVE_m4n8 SAVE_m4n4 unit_save_m4n2(%%xmm8,%%xmm9) unit_save_m4n2(%%xmm10,%%xmm11)
#define SAVE_m4n12 SAVE_m4n8 unit_save_m4n2(%%xmm12,%%xmm13) unit_save_m4n2(%%xmm14,%%xmm15)
#define COMPUTE_m4(ndim) \
INIT_m4n##ndim\
"movq %%r13,%4; movq %%r14,%1;"\
#ndim"442:\n\t"\
"testq %4,%4; jz "#ndim"443f;"\
KERNEL_k1m4n##ndim\
"decq %4; jmp "#ndim"442b;"\
#ndim"443:\n\t"\
SAVE_m4n##ndim "addq $32,%2;"
/* m = 2 *//* xmm0 for alpha, xmm1-xmm3 and xmm10 for temporary use, xmm4-xmm9 for accumulators */
#define INIT_m2n1 "vpxor %%xmm4,%%xmm4,%%xmm4;"
#define KERNEL_k1m2n1 \
"vmovsd (%0),%%xmm1; addq $8,%0;"\
"vbroadcastss (%1),%%xmm2; vfmadd231ps %%xmm1,%%xmm2,%%xmm4;"\
"addq $4,%1;"
#define SAVE_m2n1 \
"vunpcklps %%xmm4,%%xmm4,%%xmm1; vfmadd213ps (%2),%%xmm0,%%xmm1; vmovups %%xmm1,(%2);"
#define INIT_m2n2 INIT_m2n1 "vpxor %%xmm5,%%xmm5,%%xmm5;"
#define KERNEL_k1m2n2 \
"vmovsd (%0),%%xmm1; addq $8,%0;"\
"vbroadcastss (%1),%%xmm2; vfmadd231ps %%xmm1,%%xmm2,%%xmm4;"\
"vbroadcastss 4(%1),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm5;"\
"addq $8,%1;"
#define SAVE_m2n2 SAVE_m2n1 \
"vunpcklps %%xmm5,%%xmm5,%%xmm1; vfmadd213ps (%2,%3,1),%%xmm0,%%xmm1; vmovups %%xmm1,(%2,%3,1);"
#define INIT_m2n4 INIT_m2n2
#define INIT_m2n8 INIT_m2n4 "vpxor %%xmm6,%%xmm6,%%xmm6; vpxor %%xmm7,%%xmm7,%%xmm7;"
#define INIT_m2n12 INIT_m2n8 "vpxor %%xmm8,%%xmm8,%%xmm8; vpxor %%xmm9,%%xmm9,%%xmm9;"
#define KERNEL_k1m2n4 \
"vmovups (%1),%%xmm3; addq $16,%1;"\
"vbroadcastss (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4;"\
"vbroadcastss 4(%0),%%xmm2; vfmadd231ps %%xmm3,%%xmm2,%%xmm5;"\
"addq $8,%0;"
#define KERNEL_k1m2n8 \
"vmovups (%1),%%xmm3; vmovups (%1,%%r12,1),%%xmm2; addq $16,%1;"\
"vbroadcastss (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4; vfmadd231ps %%xmm2,%%xmm1,%%xmm6;"\
"vbroadcastss 4(%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm5; vfmadd231ps %%xmm2,%%xmm1,%%xmm7;"\
"addq $8,%0;"
#define KERNEL_k1m2n12 \
"vmovups (%1),%%xmm3; vmovups (%1,%%r12,1),%%xmm2; vmovups (%1,%%r12,2),%%xmm1; addq $16,%1;"\
"vbroadcastss (%0),%%xmm10; vfmadd231ps %%xmm3,%%xmm10,%%xmm4; vfmadd231ps %%xmm2,%%xmm10,%%xmm6; vfmadd231ps %%xmm1,%%xmm10,%%xmm8;"\
"vbroadcastss 4(%0),%%xmm10; vfmadd231ps %%xmm3,%%xmm10,%%xmm5; vfmadd231ps %%xmm2,%%xmm10,%%xmm7; vfmadd231ps %%xmm1,%%xmm10,%%xmm9;"\
"addq $8,%0;"
#define unit_save_m2n4(c1,c2) \
"vunpcklpd "#c2","#c1",%%xmm1; vunpckhpd "#c2","#c1",%%xmm2;"\
"vmovsldup %%xmm1,%%xmm3; vfmadd213ps (%5),%%xmm0,%%xmm3; vmovups %%xmm3,(%5);"\
"vmovshdup %%xmm1,%%xmm3; vfmadd213ps (%5,%3,1),%%xmm0,%%xmm3; vmovups %%xmm3,(%5,%3,1);"\
"leaq (%5,%3,2),%5;"\
"vmovsldup %%xmm2,%%xmm3; vfmadd213ps (%5),%%xmm0,%%xmm3; vmovups %%xmm3,(%5);"\
"vmovshdup %%xmm2,%%xmm3; vfmadd213ps (%5,%3,1),%%xmm0,%%xmm3; vmovups %%xmm3,(%5,%3,1);"\
"leaq (%5,%3,2),%5;"
#define SAVE_m2n4 "movq %2,%5;" unit_save_m2n4(%%xmm4,%%xmm5)
#define SAVE_m2n8 SAVE_m2n4 unit_save_m2n4(%%xmm6,%%xmm7)
#define SAVE_m2n12 SAVE_m2n8 unit_save_m2n4(%%xmm8,%%xmm9)
#define COMPUTE_m2(ndim) \
INIT_m2n##ndim\
"movq %%r13,%4; movq %%r14,%1;"\
#ndim"222:\n\t"\
"testq %4,%4; jz "#ndim"223f;"\
KERNEL_k1m2n##ndim\
"decq %4; jmp "#ndim"222b;"\
#ndim"223:\n\t"\
SAVE_m2n##ndim "addq $16,%2;"
/* m = 1 *//* xmm0 for alpha, xmm1-xmm3 and xmm10 for temporary use, xmm4-xmm6 for accumulators */
#define INIT_m1n1 "vpxor %%xmm4,%%xmm4,%%xmm4;"
#define KERNEL_k1m1n1 \
"vmovss (%1),%%xmm3; addq $4,%1;"\
"vmovss (%0),%%xmm1; vfmadd231ss %%xmm3,%%xmm1,%%xmm4;"\
"addq $4,%0;"
#define SAVE_m1n1 \
"vunpcklps %%xmm4,%%xmm4,%%xmm4; vmovsd (%2),%%xmm1; vfmadd213ps %%xmm1,%%xmm0,%%xmm4; vmovsd %%xmm4,(%2);"
#define INIT_m1n2 INIT_m1n1
#define KERNEL_k1m1n2 \
"vmovsd (%1),%%xmm3; addq $8,%1;"\
"vbroadcastss (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4;"\
"addq $4,%0;"
#define SAVE_m1n2 \
"vunpcklps %%xmm4,%%xmm4,%%xmm4; vmovsd (%2),%%xmm3; vmovhpd (%2,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm4;"\
"vmovsd %%xmm4,(%2); vmovhpd %%xmm4,(%2,%3,1);"
#define INIT_m1n4 INIT_m1n2
#define INIT_m1n8 INIT_m1n4 "vpxor %%xmm5,%%xmm5,%%xmm5;"
#define INIT_m1n12 INIT_m1n8 "vpxor %%xmm6,%%xmm6,%%xmm6;"
#define KERNEL_k1m1n4 \
"vmovups (%1),%%xmm3; addq $16,%1;"\
"vbroadcastss (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4;"\
"addq $4,%0;"
#define KERNEL_k1m1n8 \
"vmovups (%1),%%xmm3; vmovups (%1,%%r12,1),%%xmm2; addq $16,%1;"\
"vbroadcastss (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4; vfmadd231ps %%xmm2,%%xmm1,%%xmm5;"\
"addq $4,%0;"
#define KERNEL_k1m1n12 \
"vmovups (%1),%%xmm3; vmovups (%1,%%r12,1),%%xmm2; vmovups (%1,%%r12,2),%%xmm1; addq $16,%1;"\
"vbroadcastss (%0),%%xmm10; vfmadd231ps %%xmm3,%%xmm10,%%xmm4; vfmadd231ps %%xmm2,%%xmm10,%%xmm5; vfmadd231ps %%xmm1,%%xmm10,%%xmm6;"\
"addq $4,%0;"
#define unit_save_m1n4(c1) \
"vunpcklps "#c1","#c1",%%xmm1; vunpckhps "#c1","#c1",%%xmm2;"\
"vmovsd (%5),%%xmm3; vmovhpd (%5,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm1;"\
"vmovsd %%xmm1,(%5); vmovhpd %%xmm1,(%5,%3,1); leaq (%5,%3,2),%5;"\
"vmovsd (%5),%%xmm3; vmovhpd (%5,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm2;"\
"vmovsd %%xmm2,(%5); vmovhpd %%xmm2,(%5,%3,1); leaq (%5,%3,2),%5;"
#define SAVE_m1n4 "movq %2,%5;" unit_save_m1n4(%%xmm4)
#define SAVE_m1n8 SAVE_m1n4 unit_save_m1n4(%%xmm5)
#define SAVE_m1n12 SAVE_m1n8 unit_save_m1n4(%%xmm6)
#define COMPUTE_m1(ndim) \
INIT_m1n##ndim\
"movq %%r13,%4; movq %%r14,%1;"\
#ndim"112:\n\t"\
"testq %4,%4; jz "#ndim"113f;"\
KERNEL_k1m1n##ndim\
"decq %4; jmp "#ndim"112b;"\
#ndim"113:\n\t"\
SAVE_m1n##ndim "addq $8,%2;"
/* %0 = "+r"(a_pointer), %1 = "+r"(b_pointer), %2 = "+r"(c_pointer), %3 = "+r"(ldc_in_bytes), %4 = "+r"(K), %5 = "+r"(ctemp) */
/* %6 = "+r"(&alpha), %7 = "+r"(M), %8 = "+r"(next_b) */
/* r11 = m(const), r12 = k << 4(const), r13 = k(const), r14 = b_head_pos(const),r15 = tmp */
#define COMPUTE(ndim) {\
next_b = b_pointer + ndim * K;\
__asm__ __volatile__(\
"vbroadcastsd (%6),%%ymm0;"\
"movq %4,%%r13; movq %4,%%r12; salq $4,%%r12; movq %1,%%r14; movq %7,%%r11;"\
"cmpq $8,%7;jb 33101"#ndim"f;"\
"33109"#ndim":\n\t"\
COMPUTE_m8(ndim)\
"subq $8,%7;cmpq $8,%7;jnb 33109"#ndim"b;"\
"33101"#ndim":\n\t"\
"cmpq $4,%7;jb 33103"#ndim"f;"\
COMPUTE_m4(ndim)\
"subq $4,%7;"\
"33103"#ndim":\n\t"\
"cmpq $2,%7;jb 33104"#ndim"f;"\
COMPUTE_m2(ndim)\
"subq $2,%7;"\
"33104"#ndim":\n\t"\
"testq %7,%7;jz 33105"#ndim"f;"\
COMPUTE_m1(ndim)\
"33105"#ndim":\n\t"\
"movq %%r13,%4; movq %%r14,%1; movq %%r11,%7;"\
:"+r"(a_pointer),"+r"(b_pointer),"+r"(c_pointer),"+r"(ldc_in_bytes),"+r"(K),"+r"(ctemp),"+r"(const_val),"+r"(M),"+r"(next_b)\
::"r11","r12","r13","r14","r15",\
"xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15","cc","memory");\
a_pointer -= M * K; b_pointer += ndim * K; c_pointer += 2*(LDC * ndim - M);\
}
int __attribute__ ((noinline))
CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alphar, float alphai, float * __restrict__ A, float * __restrict__ B, float * __restrict__ C, BLASLONG LDC)
{
if(m==0||n==0||k==0) return 0;
int64_t ldc_in_bytes = (int64_t)LDC * sizeof(float) * 2;
float constval[2]; constval[0] = alphar; constval[1] = alphai;
float *const_val=constval;
int64_t M = (int64_t)m, K = (int64_t)k;
BLASLONG n_count = n;
float *a_pointer = A,*b_pointer = B,*c_pointer = C,*ctemp = C,*next_b = B;
for(;n_count>11;n_count-=12) COMPUTE(12)
for(;n_count>7;n_count-=8) COMPUTE(8)
for(;n_count>3;n_count-=4) COMPUTE(4)
for(;n_count>1;n_count-=2) COMPUTE(2)
if(n_count>0) COMPUTE(1)
return 0;
}
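The file above is the new AVX2 CGEMM3M micro-kernel selected by the `CGEMM3MKERNEL = cgemm3m_kernel_8x4_haswell.c` lines in the Makefiles earlier in this diff. In the 3M scheme the complex product is rebuilt from three real matrix products, so this kernel multiplies real-valued packed panels of A and B and merges each real result s into the interleaved complex C as C_re += alphar*s, C_im += alphai*s (that is what the vunpck/vfmadd213ps sequences in the SAVE_* macros do, with ymm0 holding the broadcast (alphar, alphai) pair). A scalar reference of that per-element update, using plain column-major operands for clarity rather than the pre-packed panels the kernel actually reads, and leaving the 3M recombination to the calling level-3 driver:

#include <stddef.h>

/* Scalar sketch of the update the kernel performs per C element:
 *   s     = sum_l A[i + l*m] * B[l + j*k]      (a real dot product)
 *   C_re += alphar * s;   C_im += alphai * s;
 * A and B are real panels; C is interleaved complex, column-major,
 * with leading dimension ldc counted in complex elements. */
static void cgemm3m_ref(size_t m, size_t n, size_t k,
                        float alphar, float alphai,
                        const float *A, const float *B,
                        float *C, size_t ldc)
{
    for (size_t j = 0; j < n; j++)
        for (size_t i = 0; i < m; i++) {
            float s = 0.0f;
            for (size_t l = 0; l < k; l++)
                s += A[i + l * m] * B[l + j * k];
            C[2 * (i + j * ldc) + 0] += alphar * s;
            C[2 * (i + j * ldc) + 1] += alphai * s;
        }
}

The driver loop at the bottom of the file covers the same ground in blocks, peeling the column count through the 12/8/4/2/1 COMPUTE variants.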
View File
@ -0,0 +1,292 @@
#include "common.h"
#include <stdint.h>
/* recommended settings: GEMM_P = 256, GEMM_Q = 256 */
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define A_CONJ 0
#define B_CONJ 0
#endif
#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
#define A_CONJ 1
#define B_CONJ 0
#endif
#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
#define A_CONJ 0
#define B_CONJ 1
#endif
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
#define A_CONJ 1
#define B_CONJ 1
#endif
/* %0 = a_ptr, %1 = b_ptr, %2 = c_ptr, %3 = c_tmp, %4 = ldc(bytes), %5 = k_counter, %6 = &alpha, %7 = m_counter, %8 = b_pref */
/* r11 = m, r12 = k << 4, r13 = k, r14 = b_head, r15 = temp */
/* m=8, ymm 0-3 temp, ymm 4-15 acc */
#if A_CONJ == B_CONJ
#define acc_m4n1_exp(ar,ai,b2,cl,cr) "vfmadd231ps %%ymm"#ar",%%ymm"#b2",%%ymm"#cl"; vfmadd231ps %%ymm"#ai",%%ymm"#b2",%%ymm"#cr";"
#define acc_m8n1_con(ua,la,b1,uc,lc) "vfmaddsub231ps %%ymm"#ua",%%ymm"#b1",%%ymm"#uc"; vfmaddsub231ps %%ymm"#la",%%ymm"#b1",%%ymm"#lc";"
#else
#define acc_m4n1_exp(ar,ai,b2,cl,cr) "vfmadd231ps %%ymm"#ar",%%ymm"#b2",%%ymm"#cl"; vfnmadd231ps %%ymm"#ai",%%ymm"#b2",%%ymm"#cr";"
#define acc_m8n1_con(ua,la,b1,uc,lc) "vfmsubadd231ps %%ymm"#ua",%%ymm"#b1",%%ymm"#uc"; vfmsubadd231ps %%ymm"#la",%%ymm"#b1",%%ymm"#lc";"
#endif
/* expanded accumulators for m8n1 and m8n2 */
#define KERNEL_k1m8n1 \
"vbroadcastsd (%1),%%ymm0; addq $8,%1;"\
"vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2;" acc_m4n1_exp(1,2,0,4,5)\
"vmovsldup 32(%0),%%ymm1; vmovshdup 32(%0),%%ymm2;" acc_m4n1_exp(1,2,0,6,7)\
"addq $64,%0;"
#define KERNEL_k1m8n2 \
"vbroadcastsd (%1),%%ymm0; vbroadcastsd 8(%1),%%ymm1; addq $16,%1;"\
"vmovsldup (%0),%%ymm2; vmovshdup (%0),%%ymm3;" acc_m4n1_exp(2,3,0,4,5) acc_m4n1_exp(2,3,1,8,9)\
"vmovsldup 32(%0),%%ymm2; vmovshdup 32(%0),%%ymm3;" acc_m4n1_exp(2,3,0,6,7) acc_m4n1_exp(2,3,1,10,11)\
"addq $64,%0;"
/* contracted accumulators for m8n4 and m8n6 */
#define acc_m8n2_con(ua,la,luc,llc,ruc,rlc,lboff,rboff,...) \
"vbroadcastss "#lboff"("#__VA_ARGS__"),%%ymm2;" acc_m8n1_con(ua,la,2,luc,llc)\
"vbroadcastss "#rboff"("#__VA_ARGS__"),%%ymm3;" acc_m8n1_con(ua,la,3,ruc,rlc)
#define KERNEL_1_k1m8n4 \
"vmovups (%0),%%ymm0; vmovups 32(%0),%%ymm1; prefetcht0 512(%0); addq $64,%0;"\
acc_m8n2_con(0,1,4,5,6,7,0,8,%1) acc_m8n2_con(0,1,8,9,10,11,0,8,%1,%%r12,1)
#define KERNEL_2_k1m8n4 \
"vpermilps $177,%%ymm0,%%ymm0; vpermilps $177,%%ymm1,%%ymm1;"\
acc_m8n2_con(0,1,4,5,6,7,4,12,%1) acc_m8n2_con(0,1,8,9,10,11,4,12,%1,%%r12,1)
#define KERNEL_1_k1m8n6 KERNEL_1_k1m8n4 acc_m8n2_con(0,1,12,13,14,15,0,8,%1,%%r12,2)
#define KERNEL_2_k1m8n6 KERNEL_2_k1m8n4 acc_m8n2_con(0,1,12,13,14,15,4,12,%1,%%r12,2)
#define KERNEL_k1m8n4 KERNEL_1_k1m8n4 KERNEL_2_k1m8n4 "addq $16,%1;"
#define KERNEL_k1m8n6 KERNEL_1_k1m8n6 KERNEL_2_k1m8n6 "addq $16,%1;"
#define zero_4ymm(no1,no2,no3,no4) \
"vpxor %%ymm"#no1",%%ymm"#no1",%%ymm"#no1"; vpxor %%ymm"#no2",%%ymm"#no2",%%ymm"#no2";"\
"vpxor %%ymm"#no3",%%ymm"#no3",%%ymm"#no3"; vpxor %%ymm"#no4",%%ymm"#no4",%%ymm"#no4";"
/* initialization and storage macros */
#define INIT_m8n1 zero_4ymm(4,5,6,7)
#define INIT_m8n2 zero_4ymm(4,5,6,7) zero_4ymm(8,9,10,11)
#define INIT_m8n4 zero_4ymm(4,5,6,7) zero_4ymm(8,9,10,11)
#define INIT_m8n6 INIT_m8n4 zero_4ymm(12,13,14,15)
#if A_CONJ == B_CONJ
#define cont_expacc(cl,cr,dst) "vpermilps $177,%%ymm"#cr",%%ymm"#cr"; vaddsubps %%ymm"#cl",%%ymm"#cr",%%ymm"#dst";"
#else
#define cont_expacc(cl,cr,dst) "vpermilps $177,%%ymm"#cr",%%ymm"#cr"; vaddsubps %%ymm"#cr",%%ymm"#cl",%%ymm"#dst";"
#endif
#if A_CONJ == 0
#define save_1ymm(c,tmp,off,alpr,alpi,...) \
"vpermilps $177,%%ymm"#c",%%ymm"#tmp"; vfmsubadd213ps "#off"("#__VA_ARGS__"),%%ymm"#alpr",%%ymm"#c";"\
"vfmsubadd231ps %%ymm"#tmp",%%ymm"#alpi",%%ymm"#c"; vmovups %%ymm"#c","#off"("#__VA_ARGS__");"
#else
#define save_1ymm(c,tmp,off,alpr,alpi,...) \
"vpermilps $177,%%ymm"#c",%%ymm"#tmp"; vfmaddsub213ps "#off"("#__VA_ARGS__"),%%ymm"#alpi",%%ymm"#tmp";"\
"vfmaddsub231ps %%ymm"#c",%%ymm"#alpr",%%ymm"#tmp"; vmovups %%ymm"#tmp","#off"("#__VA_ARGS__");"
#endif
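A note on the accumulator scheme defined above (an assumed reading of the macros, kept deliberately loose because the exact sign bookkeeping is split between the fmadd/fnmadd and fmaddsub/fmsubadd choices, the two save_1ymm flavors, and the alpha constant the C driver prepares outside this excerpt): each complex accumulator is kept "expanded" as two registers, one fed by a_re*(b_re,b_im) and one by a_im*(b_re,b_im), so the inner loop is pure FMA work with no shuffles. Only at store time does cont_expacc swap the lanes of the second register and combine the pair, loosely the vpermilps plus vaddsubps step, up to signs the kernel handles elsewhere, and save_1ymm then scales by alpha and adds to C. A scalar sketch of the expand-then-contract idea for the plain, non-conjugated case:

#include <stdio.h>

/* Expanded accumulators: two independent FMA streams per complex sum,
 * contracted once after the loop (non-conjugated case only). */
static void cdot_expanded(int k, const float *a, const float *b, float out[2])
{
    float cl[2] = {0.0f, 0.0f};   /* a_re * (b_re, b_im) stream */
    float cr[2] = {0.0f, 0.0f};   /* a_im * (b_re, b_im) stream */
    for (int l = 0; l < k; l++) {
        float ar = a[2 * l], ai = a[2 * l + 1];
        float br = b[2 * l], bi = b[2 * l + 1];
        cl[0] += ar * br;  cl[1] += ar * bi;
        cr[0] += ai * br;  cr[1] += ai * bi;
    }
    /* contraction: swap the cr pair, subtract on the real lane,
     * add on the imaginary lane */
    out[0] = cl[0] - cr[1];   /* Re: sum(ar*br - ai*bi) */
    out[1] = cl[1] + cr[0];   /* Im: sum(ar*bi + ai*br) */
}

int main(void)
{
    const float a[4] = {1.0f, 2.0f, 3.0f, -1.0f};  /* 1+2i, 3-1i   */
    const float b[4] = {2.0f, 1.0f, 0.5f, 4.0f};   /* 2+1i, 0.5+4i */
    float out[2];
    cdot_expanded(2, a, b, out);
    printf("%g %+gi\n", out[0], out[1]);           /* 5.5 +16.5i */
    return 0;
}

For the conjugated variants the a_im stream is accumulated with the opposite sign (the vfnmadd231ps branch of acc_m4n1_exp), and the remaining sign differences are absorbed by the save_1ymm flavor and the alpha handling in the driver, which is why the sketch covers only the NN-style case.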
#define save_init_m8 "movq %2,%3; addq $64,%2; vbroadcastss (%6),%%ymm0; vbroadcastss 4(%6),%%ymm1;"
#define SAVE_m8n1 save_init_m8 cont_expacc(4,5,4) cont_expacc(6,7,6) save_1ymm(4,2,0,0,1,%3) save_1ymm(6,3,32,0,1,%3)
#define SAVE_m8n2 SAVE_m8n1\
cont_expacc(8,9,8) cont_expacc(10,11,10) save_1ymm(8,2,0,0,1,%3,%4,1) save_1ymm(10,3,32,0,1,%3,%4,1)
#define SAVE_m8n4 save_init_m8\
save_1ymm(4,2,0,0,1,%3) save_1ymm(5,3,32,0,1,%3) save_1ymm(6,2,0,0,1,%3,%4,1) save_1ymm(7,3,32,0,1,%3,%4,1) "leaq (%3,%4,2),%3;"\
save_1ymm(8,2,0,0,1,%3) save_1ymm(9,3,32,0,1,%3) save_1ymm(10,2,0,0,1,%3,%4,1) save_1ymm(11,3,32,0,1,%3,%4,1)
#define SAVE_m8n6 SAVE_m8n4 "leaq (%3,%4,2),%3;"\
save_1ymm(12,2,0,0,1,%3) save_1ymm(13,3,32,0,1,%3) save_1ymm(14,2,0,0,1,%3,%4,1) save_1ymm(15,3,32,0,1,%3,%4,1)
#define COMPUTE_m8(ndim) \
"movq %%r14,%1;" INIT_m8n##ndim "movq %2,%3; movq %%r13,%5;"\
"testq %5,%5; jz "#ndim"8883f; cmpq $10,%5; jb "#ndim"8882f;"\
"movq $10,%5; movq $84,%%r15;"\
#ndim"8881:\n\t"\
"prefetcht1 (%3); subq $63,%3; addq %%r15,%3;"\
"prefetcht0 64(%1); prefetcht0 64(%1,%%r12,1); prefetcht0 64(%1,%%r12,2);"\
KERNEL_k1m8n##ndim KERNEL_k1m8n##ndim\
"testq $12,%5; movq $84,%%r15; cmovz %4,%%r15; prefetcht1 (%8); addq $16,%8;"\
KERNEL_k1m8n##ndim KERNEL_k1m8n##ndim\
"addq $4,%5; cmpq %5,%%r13; jnb "#ndim"8881b;"\
"movq %2,%3; negq %5; leaq 10(%%r13,%5,1),%5; prefetcht0 (%6); prefetcht0 7(%6);"\
#ndim"8882:\n\t"\
"prefetcht0 (%3); prefetcht0 63(%3); addq %4,%3;"\
KERNEL_k1m8n##ndim "decq %5; jnz "#ndim"8882b;"\
#ndim"8883:\n\t"\
"prefetcht0 (%%r14); prefetcht0 64(%%r14);" SAVE_m8n##ndim
/* m=4, ymm 0-3 temp, ymm 4-15 acc, expanded accumulators */
#define KERNEL_k1m4n1 \
"vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2; addq $32,%0;"\
"vbroadcastsd (%1),%%ymm0;" acc_m4n1_exp(1,2,0,4,5) "addq $8,%1;"
#define acc_m4n2_exp(c1l,c1r,c2l,c2r,...) \
"vbroadcastsd ("#__VA_ARGS__"),%%ymm2;" acc_m4n1_exp(0,1,2,c1l,c1r)\
"vbroadcastsd 8("#__VA_ARGS__"),%%ymm3;" acc_m4n1_exp(0,1,3,c2l,c2r)
#define KERNEL_h_k1m4n2 \
"vmovsldup (%0),%%ymm0; vmovshdup (%0),%%ymm1; addq $32,%0;" acc_m4n2_exp(4,5,6,7,%1)
#define KERNEL_h_k1m4n4 KERNEL_h_k1m4n2 acc_m4n2_exp(8,9,10,11,%1,%%r12,1)
#define KERNEL_h_k1m4n6 KERNEL_h_k1m4n4 acc_m4n2_exp(12,13,14,15,%1,%%r12,2)
#define KERNEL_k1m4n2 KERNEL_h_k1m4n2 "addq $16,%1;"
#define KERNEL_k1m4n4 KERNEL_h_k1m4n4 "addq $16,%1;"
#define KERNEL_k1m4n6 KERNEL_h_k1m4n6 "addq $16,%1;"
#define INIT_m4n1 "vpxor %%ymm4,%%ymm4,%%ymm4; vpxor %%ymm5,%%ymm5,%%ymm5;"
#define INIT_m4n2 zero_4ymm(4,5,6,7)
#define INIT_m4n4 INIT_m4n2 zero_4ymm(8,9,10,11)
#define INIT_m4n6 INIT_m4n4 zero_4ymm(12,13,14,15)
#define save_init_m4 "movq %2,%3; addq $32,%2; vbroadcastss (%6),%%ymm0; vbroadcastss 4(%6),%%ymm1;"
#define SAVE_m4n1 save_init_m4 cont_expacc(4,5,4) save_1ymm(4,2,0,0,1,%3)
#define SAVE_m4n2 SAVE_m4n1 cont_expacc(6,7,6) save_1ymm(6,3,0,0,1,%3,%4,1)
#define SAVE_m4n4 SAVE_m4n2 "leaq (%3,%4,2),%3;"\
cont_expacc(8,9,8) cont_expacc(10,11,10) save_1ymm(8,2,0,0,1,%3) save_1ymm(10,3,0,0,1,%3,%4,1)
#define SAVE_m4n6 SAVE_m4n4 "leaq (%3,%4,2),%3;"\
cont_expacc(12,13,12) cont_expacc(14,15,14) save_1ymm(12,2,0,0,1,%3) save_1ymm(14,3,0,0,1,%3,%4,1)
#define COMPUTE_m4(ndim) \
"movq %%r14,%1;" INIT_m4n##ndim "movq %%r13,%5;"\
"testq %5,%5; jz "#ndim"4442f;"\
#ndim"4441:\n\t"\
KERNEL_k1m4n##ndim\
"decq %5; jnz "#ndim"4441b;"\
#ndim"4442:\n\t"\
SAVE_m4n##ndim
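/* COMPUTE_m4 above and the m=2 / m=1 drivers below use a plain k loop
   without the prefetch staging of COMPUTE_m8 */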
/* m=2, xmm 0-3 temp, xmm 4-15 acc, expanded accumulators */
#if A_CONJ == B_CONJ
#define acc_m2n1_exp(ar,ai,b2,cl,cr) "vfmadd231ps %%xmm"#ar",%%xmm"#b2",%%xmm"#cl"; vfmadd231ps %%xmm"#ai",%%xmm"#b2",%%xmm"#cr";"
#else
#define acc_m2n1_exp(ar,ai,b2,cl,cr) "vfmadd231ps %%xmm"#ar",%%xmm"#b2",%%xmm"#cl"; vfnmadd231ps %%xmm"#ai",%%xmm"#b2",%%xmm"#cr";"
#endif
#define KERNEL_h_k1m2n1 \
"vmovsldup (%0),%%xmm0; vmovshdup (%0),%%xmm1; addq $16,%0;"\
"vmovddup (%1),%%xmm2;" acc_m2n1_exp(0,1,2,4,5)
#define KERNEL_h_k1m2n2 KERNEL_h_k1m2n1\
"vmovddup 8(%1),%%xmm3;" acc_m2n1_exp(0,1,3,6,7)
#define acc_m2n2_exp(c1,c2,c3,c4,...)\
"vmovddup ("#__VA_ARGS__"),%%xmm2;" acc_m2n1_exp(0,1,2,c1,c2)\
"vmovddup 8("#__VA_ARGS__"),%%xmm3;" acc_m2n1_exp(0,1,3,c3,c4)
#define KERNEL_h_k1m2n4 KERNEL_h_k1m2n2 acc_m2n2_exp(8,9,10,11,%1,%%r12,1)
#define KERNEL_h_k1m2n6 KERNEL_h_k1m2n4 acc_m2n2_exp(12,13,14,15,%1,%%r12,2)
#define KERNEL_k1m2n1 KERNEL_h_k1m2n1 "addq $8,%1;"
#define KERNEL_k1m2n2 KERNEL_h_k1m2n2 "addq $16,%1;"
#define KERNEL_k1m2n4 KERNEL_h_k1m2n4 "addq $16,%1;"
#define KERNEL_k1m2n6 KERNEL_h_k1m2n6 "addq $16,%1;"
#define zero_2xmm(no1,no2) "vpxor %%xmm"#no1",%%xmm"#no1",%%xmm"#no1"; vpxor %%xmm"#no2",%%xmm"#no2",%%xmm"#no2";"
#define INIT_m2n1 zero_2xmm(4,5)
#define INIT_m2n2 INIT_m2n1 zero_2xmm(6,7)
#define INIT_m2n4 INIT_m2n2 zero_2xmm(8,9) zero_2xmm(10,11)
#define INIT_m2n6 INIT_m2n4 zero_2xmm(12,13) zero_2xmm(14,15)
#if A_CONJ == B_CONJ
#define cont_expxmmacc(cl,cr,dst) "vpermilps $177,%%xmm"#cr",%%xmm"#cr"; vaddsubps %%xmm"#cl",%%xmm"#cr",%%xmm"#dst";"
#else
#define cont_expxmmacc(cl,cr,dst) "vpermilps $177,%%xmm"#cr",%%xmm"#cr"; vaddsubps %%xmm"#cr",%%xmm"#cl",%%xmm"#dst";"
#endif
#if A_CONJ == 0
#define save_1xmm(c,tmp,alpr,alpi) \
"vpermilps $177,%%xmm"#c",%%xmm"#tmp"; vfmsubadd213ps (%3),%%xmm"#alpr",%%xmm"#c";"\
"vfmsubadd231ps %%xmm"#tmp",%%xmm"#alpi",%%xmm"#c"; vmovups %%xmm"#c",(%3); addq %4,%3;"
#else
#define save_1xmm(c,tmp,alpr,alpi) \
"vpermilps $177,%%xmm"#c",%%xmm"#tmp"; vfmaddsub213ps (%3),%%xmm"#alpi",%%xmm"#tmp";"\
"vfmaddsub231ps %%xmm"#c",%%xmm"#alpr",%%xmm"#tmp"; vmovups %%xmm"#tmp",(%3); addq %4,%3;"
#endif
#define save_init_m2 "movq %2,%3; addq $16,%2; vbroadcastss (%6),%%xmm0; vbroadcastss 4(%6),%%xmm1;"
#define SAVE_m2n1 save_init_m2 cont_expxmmacc(4,5,4) save_1xmm(4,2,0,1)
#define SAVE_m2n2 SAVE_m2n1 cont_expacc(6,7,6) save_1xmm(6,3,0,1)
#define SAVE_m2n4 SAVE_m2n2 cont_expacc(8,9,8) save_1xmm(8,2,0,1) cont_expacc(10,11,10) save_1xmm(10,3,0,1)
#define SAVE_m2n6 SAVE_m2n4 cont_expacc(12,13,12) save_1xmm(12,2,0,1) cont_expacc(14,15,14) save_1xmm(14,3,0,1)
#define COMPUTE_m2(ndim) \
"movq %%r14,%1;" INIT_m2n##ndim "movq %%r13,%5;"\
"testq %5,%5; jz "#ndim"2222f;"\
#ndim"2221:\n\t"\
KERNEL_k1m2n##ndim\
"decq %5; jnz "#ndim"2221b;"\
#ndim"2222:\n\t"\
SAVE_m2n##ndim
/* m=1, xmm 0-3 temp, xmm 4-9 acc, expanded accumulators */
#if A_CONJ == B_CONJ
#define acc_m1n1_exp(ar,ai,b2,cl,cr) "vfmadd231ps %%xmm"#ar",%%xmm"#b2",%%xmm"#cl"; vfmadd231ps %%xmm"#ai",%%xmm"#b2",%%xmm"#cr";"
#define acc_m1n2_exp(arb,aib,b4,cl,cr) "vfmadd231ps %%xmm"#arb",%%xmm"#b4",%%xmm"#cl"; vfmadd231ps %%xmm"#aib",%%xmm"#b4",%%xmm"#cr";"
#else
#define acc_m1n1_exp(ar,ai,b2,cl,cr) "vfmadd231ps %%xmm"#ar",%%xmm"#b2",%%xmm"#cl"; vfnmadd231ps %%xmm"#ai",%%xmm"#b2",%%xmm"#cr";"
#define acc_m1n2_exp(arb,aib,b4,cl,cr) "vfmadd231ps %%xmm"#arb",%%xmm"#b4",%%xmm"#cl"; vfnmadd231ps %%xmm"#aib",%%xmm"#b4",%%xmm"#cr";"
#endif
#define KERNEL_k1m1n1 \
"vbroadcastss (%0),%%xmm0; vbroadcastss 4(%0),%%xmm1; addq $8,%0;"\
"vmovsd (%1),%%xmm2; addq $8,%1;" acc_m1n1_exp(0,1,2,4,5)
#define KERNEL_h_k1m1n2 \
"vbroadcastss (%0),%%xmm0; vbroadcastss 4(%0),%%xmm1; addq $8,%0;"\
"vmovups (%1),%%xmm2;" acc_m1n2_exp(0,1,2,4,5)
#define KERNEL_h_k1m1n4 KERNEL_h_k1m1n2 "vmovups (%1,%%r12,1),%%xmm2;" acc_m1n2_exp(0,1,2,6,7)
#define KERNEL_h_k1m1n6 KERNEL_h_k1m1n4 "vmovups (%1,%%r12,2),%%xmm2;" acc_m1n2_exp(0,1,2,8,9)
#define KERNEL_k1m1n2 KERNEL_h_k1m1n2 "addq $16,%1;"
#define KERNEL_k1m1n4 KERNEL_h_k1m1n4 "addq $16,%1;"
#define KERNEL_k1m1n6 KERNEL_h_k1m1n6 "addq $16,%1;"
#define INIT_m1n1 zero_2xmm(4,5)
#define INIT_m1n2 zero_2xmm(4,5)
#define INIT_m1n4 INIT_m1n2 zero_2xmm(6,7)
#define INIT_m1n6 INIT_m1n4 zero_2xmm(8,9)
#if A_CONJ == 0
#define save_m1n1(c,tmp1,tmp2,alpr,alpi) \
"vpermilps $177,%%xmm"#c",%%xmm"#tmp1"; vmovsd (%3),%%xmm"#tmp2"; vfmsubadd213ps %%xmm"#tmp2",%%xmm"#alpr",%%xmm"#c";"\
"vfmsubadd231ps %%xmm"#tmp1",%%xmm"#alpi",%%xmm"#c"; vmovsd %%xmm"#c",(%3);"
#define save_m1n2(c,tmp1,tmp2,alpr,alpi) \
"vpermilps $177,%%xmm"#c",%%xmm"#tmp1"; vmovsd (%3),%%xmm"#tmp2"; vmovhpd (%3,%4,1),%%xmm"#tmp2",%%xmm"#tmp2";"\
"vfmsubadd213ps %%xmm"#tmp2",%%xmm"#alpr",%%xmm"#c"; vfmsubadd231ps %%xmm"#tmp1",%%xmm"#alpi",%%xmm"#c";"\
"vmovsd %%xmm"#c",(%3); vmovhpd %%xmm"#c",(%3,%4,1); leaq (%3,%4,2),%3;"
#else
#define save_m1n1(c,tmp1,tmp2,alpr,alpi) \
"vpermilps $177,%%xmm"#c",%%xmm"#tmp1"; vmovsd (%3),%%xmm"#tmp2"; vfmaddsub213ps %%xmm"#tmp2",%%xmm"#alpi",%%xmm"#tmp1";"\
"vfmaddsub231ps %%xmm"#c",%%xmm"#alpr",%%xmm"#tmp1"; vmovsd %%xmm"#tmp1",(%3);"
#define save_m1n2(c,tmp1,tmp2,alpr,alpi) \
"vpermilps $177,%%xmm"#c",%%xmm"#tmp1"; vmovsd (%3),%%xmm"#tmp2"; vmovhpd (%3,%4,1),%%xmm"#tmp2",%%xmm"#tmp2";"\
"vfmaddsub213ps %%xmm"#tmp2",%%xmm"#alpi",%%xmm"#tmp1"; vfmaddsub231ps %%xmm"#c",%%xmm"#alpr",%%xmm"#tmp1";"\
"vmovsd %%xmm"#tmp1",(%3); vmovhpd %%xmm"#tmp1",(%3,%4,1); leaq (%3,%4,2),%3;"
#endif
#define save_init_m1 "movq %2,%3; addq $8,%2; vbroadcastss (%6),%%xmm0; vbroadcastss 4(%6),%%xmm1;"
#define SAVE_m1n1 save_init_m1 cont_expxmmacc(4,5,4) save_m1n1(4,2,3,0,1)
#define SAVE_m1n2 save_init_m1 cont_expxmmacc(4,5,4) save_m1n2(4,2,3,0,1)
#define SAVE_m1n4 SAVE_m1n2 cont_expxmmacc(6,7,6) save_m1n2(6,2,3,0,1)
#define SAVE_m1n6 SAVE_m1n4 cont_expxmmacc(8,9,8) save_m1n2(8,2,3,0,1)
#define COMPUTE_m1(ndim) \
"movq %%r14,%1;" INIT_m1n##ndim "movq %%r13,%5;"\
"testq %5,%5; jz "#ndim"1112f;"\
#ndim"1111:\n\t"\
KERNEL_k1m1n##ndim\
"decq %5; jnz "#ndim"1111b;"\
#ndim"1112:\n\t"\
SAVE_m1n##ndim
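/* driver macro: %0 = a_ptr, %1 = b_ptr, %2 = c_ptr, %3 = c_tmp,
   %4 = ldc (bytes), %5 = k_counter, %6 = &alpha, %7 = m_counter, %8 = b_pref;
   r11 = m, r13 = k, r14 = b_head, r15 = temp, and r12 = k << 4 (the byte
   stride between consecutive 2-column slices of the packed B block) */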
#define COMPUTE(ndim) {\
b_pref = b_ptr + ndim * K *2;\
__asm__ __volatile__ (\
"movq %1,%%r14; movq %5,%%r13; movq %5,%%r12; salq $4,%%r12; movq %7,%%r11;"\
"cmpq $8,%7; jb "#ndim"9992f;"\
#ndim"9991:\n\t"\
COMPUTE_m8(ndim)\
"subq $8,%7; cmpq $8,%7; jnb "#ndim"9991b;"\
#ndim"9992:\n\t"\
"cmpq $4,%7; jb "#ndim"9993f;"\
COMPUTE_m4(ndim) "subq $4,%7;"\
#ndim"9993:\n\t"\
"cmpq $2,%7; jb "#ndim"9994f;"\
COMPUTE_m2(ndim) "subq $2,%7;"\
#ndim"9994:\n\t"\
"testq %7,%7; jz "#ndim"9995f;"\
COMPUTE_m1(ndim)\
#ndim"9995:\n\t"\
"movq %%r14,%1; movq %%r13,%5; movq %%r11,%7; vzeroupper;"\
:"+r"(a_ptr),"+r"(b_ptr),"+r"(c_ptr),"+r"(c_tmp),"+r"(ldc_in_bytes),"+r"(K),"+r"(alp),"+r"(M),"+r"(b_pref)\
::"cc","memory","r11","r12","r13","r14","r15","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5",\
"xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15");\
a_ptr -= M * K *2; b_ptr += ndim * K *2; c_ptr += (ndim * LDC - M) * 2;\
}
int __attribute__ ((noinline))
CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alphar, float alphai, float * __restrict__ A, float * __restrict__ B, float * __restrict__ C, BLASLONG LDC)
{
if(m==0||n==0||k==0||(alphar==0.0 && alphai==0.0)) return 0;
int64_t ldc_in_bytes = (int64_t)LDC * sizeof(float) * 2;
#if A_CONJ == B_CONJ
float const_val[2] = {-alphar, -alphai};
#else
float const_val[2] = {alphar, alphai};
#endif
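/* the accumulate/contract path shared by the A_CONJ == B_CONJ cases yields
   the negated product, so alpha is stored negated here to compensate */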
int64_t M = (int64_t)m, K = (int64_t)k;
BLASLONG n_count = n;
float *a_ptr = A,*b_ptr = B,*c_ptr = C,*c_tmp = C,*alp = const_val,*b_pref = B;
for(;n_count>5;n_count-=6) COMPUTE(6)
for(;n_count>3;n_count-=4) COMPUTE(4)
for(;n_count>1;n_count-=2) COMPUTE(2)
if(n_count>0) COMPUTE(1)
return 0;
}

View File

@@ -0,0 +1,240 @@
#include "common.h"
#include <stdint.h>
/* recommended settings: GEMM_P = 192, GEMM_Q = 192 */
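/* GEMM_P / GEMM_Q correspond to ZGEMM_DEFAULT_P / ZGEMM_DEFAULT_Q, set to
   192 / 192 in the param.h hunks at the end of this commit */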
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define A_CONJ 0
#define B_CONJ 0
#endif
#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
#define A_CONJ 1
#define B_CONJ 0
#endif
#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
#define A_CONJ 0
#define B_CONJ 1
#endif
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
#define A_CONJ 1
#define B_CONJ 1
#endif
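/* the two-letter variant suffixes encode conjugation: the first letter
   refers to A, the second to B; N/T mean not conjugated, R/C mean conjugated */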
/* %0 = a_ptr, %1 = b_ptr, %2 = c_ptr, %3 = c_tmp, %4 = ldc(bytes), %5 = k_counter, %6 = &alpha, %7 = m_counter, %8 = b_pref */
/* r11 = m, r12 = k << 5, r13 = k, r14 = b_head, r15 = temp */
/* m=4, ymm 0-3 temp, ymm 4-15 acc */
#if A_CONJ == B_CONJ
#define acc_m2n1_exp(ar,ai,b2,cl,cr) "vfmadd231pd %%ymm"#ar",%%ymm"#b2",%%ymm"#cl"; vfmadd231pd %%ymm"#ai",%%ymm"#b2",%%ymm"#cr";"
#define acc_m4n1_con(ua,la,b1,uc,lc) "vfmaddsub231pd %%ymm"#ua",%%ymm"#b1",%%ymm"#uc"; vfmaddsub231pd %%ymm"#la",%%ymm"#b1",%%ymm"#lc";"
#else
#define acc_m2n1_exp(ar,ai,b2,cl,cr) "vfmadd231pd %%ymm"#ar",%%ymm"#b2",%%ymm"#cl"; vfnmadd231pd %%ymm"#ai",%%ymm"#b2",%%ymm"#cr";"
#define acc_m4n1_con(ua,la,b1,uc,lc) "vfmsubadd231pd %%ymm"#ua",%%ymm"#b1",%%ymm"#uc"; vfmsubadd231pd %%ymm"#la",%%ymm"#b1",%%ymm"#lc";"
#endif
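/* double precision: one ymm holds two complex values, so the expanded
   accumulators (acc_m2n1_exp) cover two C elements and the contracted ones
   (acc_m4n1_con) cover four via an upper/lower register pair */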
/* expanded accumulators for m4n1 and m4n2 */
#define KERNEL_k1m4n1 \
"vbroadcastf128 (%1),%%ymm0; addq $16,%1;"\
"vmovddup (%0),%%ymm1; vmovddup 8(%0),%%ymm2;" acc_m2n1_exp(1,2,0,4,5)\
"vmovddup 32(%0),%%ymm1; vmovddup 40(%0),%%ymm2;" acc_m2n1_exp(1,2,0,6,7)\
"addq $64,%0;"
#define KERNEL_k1m4n2 \
"vbroadcastf128 (%1),%%ymm0; vbroadcastf128 16(%1),%%ymm1; addq $32,%1;"\
"vmovddup (%0),%%ymm2; vmovddup 8(%0),%%ymm3;" acc_m2n1_exp(2,3,0,4,5) acc_m2n1_exp(2,3,1,8,9)\
"vmovddup 32(%0),%%ymm2; vmovddup 40(%0),%%ymm3;" acc_m2n1_exp(2,3,0,6,7) acc_m2n1_exp(2,3,1,10,11)\
"addq $64,%0;"
/* contracted accumulators for m4n4 and m4n6 */
#define acc_m4n2_con(ua,la,luc,llc,ruc,rlc,lboff,rboff,...) \
"vbroadcastsd "#lboff"("#__VA_ARGS__"),%%ymm2;" acc_m4n1_con(ua,la,2,luc,llc)\
"vbroadcastsd "#rboff"("#__VA_ARGS__"),%%ymm3;" acc_m4n1_con(ua,la,3,ruc,rlc)
#define KERNEL_1_k1m4n4 \
"vmovupd (%0),%%ymm0; vmovupd 32(%0),%%ymm1; prefetcht0 512(%0); addq $64,%0;"\
acc_m4n2_con(0,1,4,5,6,7,0,16,%1) acc_m4n2_con(0,1,8,9,10,11,0,16,%1,%%r12,1)
#define KERNEL_2_k1m4n4 \
"vpermilpd $5,%%ymm0,%%ymm0; vpermilpd $5,%%ymm1,%%ymm1;"\
acc_m4n2_con(0,1,4,5,6,7,8,24,%1) acc_m4n2_con(0,1,8,9,10,11,8,24,%1,%%r12,1)
#define KERNEL_1_k1m4n6 KERNEL_1_k1m4n4 acc_m4n2_con(0,1,12,13,14,15,0,16,%1,%%r12,2)
#define KERNEL_2_k1m4n6 KERNEL_2_k1m4n4 acc_m4n2_con(0,1,12,13,14,15,8,24,%1,%%r12,2)
#define KERNEL_k1m4n4 KERNEL_1_k1m4n4 KERNEL_2_k1m4n4 "addq $32,%1;"
#define KERNEL_k1m4n6 KERNEL_1_k1m4n6 KERNEL_2_k1m4n6 "addq $32,%1;"
#define zero_4ymm(no1,no2,no3,no4) \
"vpxor %%ymm"#no1",%%ymm"#no1",%%ymm"#no1"; vpxor %%ymm"#no2",%%ymm"#no2",%%ymm"#no2";"\
"vpxor %%ymm"#no3",%%ymm"#no3",%%ymm"#no3"; vpxor %%ymm"#no4",%%ymm"#no4",%%ymm"#no4";"
/* initialization and storage macros */
#define INIT_m4n1 zero_4ymm(4,5,6,7)
#define INIT_m4n2 zero_4ymm(4,5,6,7) zero_4ymm(8,9,10,11)
#define INIT_m4n4 zero_4ymm(4,5,6,7) zero_4ymm(8,9,10,11)
#define INIT_m4n6 INIT_m4n4 zero_4ymm(12,13,14,15)
#if A_CONJ == B_CONJ
#define cont_expacc(cl,cr,dst) "vpermilpd $5,%%ymm"#cr",%%ymm"#cr"; vaddsubpd %%ymm"#cl",%%ymm"#cr",%%ymm"#dst";"
#else
#define cont_expacc(cl,cr,dst) "vpermilpd $5,%%ymm"#cr",%%ymm"#cr"; vaddsubpd %%ymm"#cr",%%ymm"#cl",%%ymm"#dst";"
#endif
#if A_CONJ == 0
#define save_1ymm(c,tmp,off,alpr,alpi,...) \
"vpermilpd $5,%%ymm"#c",%%ymm"#tmp"; vfmsubadd213pd "#off"("#__VA_ARGS__"),%%ymm"#alpr",%%ymm"#c";"\
"vfmsubadd231pd %%ymm"#tmp",%%ymm"#alpi",%%ymm"#c"; vmovupd %%ymm"#c","#off"("#__VA_ARGS__");"
#else
#define save_1ymm(c,tmp,off,alpr,alpi,...) \
"vpermilpd $5,%%ymm"#c",%%ymm"#tmp"; vfmaddsub213pd "#off"("#__VA_ARGS__"),%%ymm"#alpi",%%ymm"#tmp";"\
"vfmaddsub231pd %%ymm"#c",%%ymm"#alpr",%%ymm"#tmp"; vmovupd %%ymm"#tmp","#off"("#__VA_ARGS__");"
#endif
#define save_init_m4 "movq %2,%3; addq $64,%2; vbroadcastsd (%6),%%ymm0; vbroadcastsd 8(%6),%%ymm1;"
#define SAVE_m4n1 save_init_m4 cont_expacc(4,5,4) cont_expacc(6,7,6) save_1ymm(4,2,0,0,1,%3) save_1ymm(6,3,32,0,1,%3)
#define SAVE_m4n2 SAVE_m4n1\
cont_expacc(8,9,8) cont_expacc(10,11,10) save_1ymm(8,2,0,0,1,%3,%4,1) save_1ymm(10,3,32,0,1,%3,%4,1)
#define SAVE_m4n4 save_init_m4\
save_1ymm(4,2,0,0,1,%3) save_1ymm(5,3,32,0,1,%3) save_1ymm(6,2,0,0,1,%3,%4,1) save_1ymm(7,3,32,0,1,%3,%4,1) "leaq (%3,%4,2),%3;"\
save_1ymm(8,2,0,0,1,%3) save_1ymm(9,3,32,0,1,%3) save_1ymm(10,2,0,0,1,%3,%4,1) save_1ymm(11,3,32,0,1,%3,%4,1)
#define SAVE_m4n6 SAVE_m4n4 "leaq (%3,%4,2),%3;"\
save_1ymm(12,2,0,0,1,%3) save_1ymm(13,3,32,0,1,%3) save_1ymm(14,2,0,0,1,%3,%4,1) save_1ymm(15,3,32,0,1,%3,%4,1)
#define COMPUTE_m4(ndim) \
"movq %%r14,%1;" INIT_m4n##ndim "movq %2,%3; movq %%r13,%5;"\
"testq %5,%5; jz "#ndim"4443f; cmpq $10,%5; jb "#ndim"4442f;"\
"movq $10,%5; movq $84,%%r15;"\
#ndim"4441:\n\t"\
"prefetcht1 (%3); subq $63,%3; addq %%r15,%3;"\
"prefetcht0 96(%1); prefetcht0 96(%1,%%r12,1); prefetcht0 96(%1,%%r12,2);" KERNEL_k1m4n##ndim KERNEL_k1m4n##ndim\
"testq $12,%5; movq $84,%%r15; cmovz %4,%%r15; prefetcht1 (%8); addq $16,%8;"\
"prefetcht0 96(%1); prefetcht0 96(%1,%%r12,1); prefetcht0 96(%1,%%r12,2);" KERNEL_k1m4n##ndim KERNEL_k1m4n##ndim\
"addq $4,%5; cmpq %5,%%r13; jnb "#ndim"4441b;"\
"movq %2,%3; negq %5; leaq 10(%%r13,%5,1),%5; prefetcht0 (%6); prefetcht0 15(%6);"\
#ndim"4442:\n\t"\
"prefetcht0 (%3); prefetcht0 63(%3); addq %4,%3;"\
KERNEL_k1m4n##ndim "decq %5; jnz "#ndim"4442b;"\
#ndim"4443:\n\t"\
"prefetcht0 (%%r14); prefetcht0 64(%%r14);" SAVE_m4n##ndim
/* m=2, ymm 0-3 temp, ymm 4-15 acc, expanded accumulators */
#define KERNEL_k1m2n1 \
"vmovddup (%0),%%ymm1; vmovddup 8(%0),%%ymm2; addq $32,%0;"\
"vbroadcastf128 (%1),%%ymm0;" acc_m2n1_exp(1,2,0,4,5) "addq $16,%1;"
#define acc_m2n2_exp(c1l,c1r,c2l,c2r,...) \
"vbroadcastf128 ("#__VA_ARGS__"),%%ymm2;" acc_m2n1_exp(0,1,2,c1l,c1r)\
"vbroadcastf128 16("#__VA_ARGS__"),%%ymm3;" acc_m2n1_exp(0,1,3,c2l,c2r)
#define KERNEL_h_k1m2n2 \
"vmovddup (%0),%%ymm0; vmovddup 8(%0),%%ymm1; addq $32,%0;" acc_m2n2_exp(4,5,6,7,%1)
#define KERNEL_h_k1m2n4 KERNEL_h_k1m2n2 acc_m2n2_exp(8,9,10,11,%1,%%r12,1)
#define KERNEL_h_k1m2n6 KERNEL_h_k1m2n4 acc_m2n2_exp(12,13,14,15,%1,%%r12,2)
#define KERNEL_k1m2n2 KERNEL_h_k1m2n2 "addq $32,%1;"
#define KERNEL_k1m2n4 KERNEL_h_k1m2n4 "addq $32,%1;"
#define KERNEL_k1m2n6 KERNEL_h_k1m2n6 "addq $32,%1;"
#define INIT_m2n1 "vpxor %%ymm4,%%ymm4,%%ymm4; vpxor %%ymm5,%%ymm5,%%ymm5;"
#define INIT_m2n2 zero_4ymm(4,5,6,7)
#define INIT_m2n4 INIT_m2n2 zero_4ymm(8,9,10,11)
#define INIT_m2n6 INIT_m2n4 zero_4ymm(12,13,14,15)
#define save_init_m2 "movq %2,%3; addq $32,%2; vbroadcastsd (%6),%%ymm0; vbroadcastsd 8(%6),%%ymm1;"
#define SAVE_m2n1 save_init_m2 cont_expacc(4,5,4) save_1ymm(4,2,0,0,1,%3)
#define SAVE_m2n2 SAVE_m2n1 cont_expacc(6,7,6) save_1ymm(6,3,0,0,1,%3,%4,1)
#define SAVE_m2n4 SAVE_m2n2 "leaq (%3,%4,2),%3;"\
cont_expacc(8,9,8) cont_expacc(10,11,10) save_1ymm(8,2,0,0,1,%3) save_1ymm(10,3,0,0,1,%3,%4,1)
#define SAVE_m2n6 SAVE_m2n4 "leaq (%3,%4,2),%3;"\
cont_expacc(12,13,12) cont_expacc(14,15,14) save_1ymm(12,2,0,0,1,%3) save_1ymm(14,3,0,0,1,%3,%4,1)
#define COMPUTE_m2(ndim) \
"movq %%r14,%1;" INIT_m2n##ndim "movq %%r13,%5;"\
"testq %5,%5; jz "#ndim"2222f;"\
#ndim"2221:\n\t"\
KERNEL_k1m2n##ndim\
"decq %5; jnz "#ndim"2221b;"\
#ndim"2222:\n\t"\
SAVE_m2n##ndim
/* m=1, vmm 0-3 temp, vmm 4-15 acc, expanded accumulators */
#if A_CONJ == B_CONJ
#define acc_m1n1_exp(ar,ai,b2,cl,cr) "vfmadd231pd %%xmm"#ar",%%xmm"#b2",%%xmm"#cl"; vfmadd231pd %%xmm"#ai",%%xmm"#b2",%%xmm"#cr";"
#define acc_m1n2_exp(arb,aib,b4,cl,cr) "vfmadd231pd %%ymm"#arb",%%ymm"#b4",%%ymm"#cl"; vfmadd231pd %%ymm"#aib",%%ymm"#b4",%%ymm"#cr";"
#else
#define acc_m1n1_exp(ar,ai,b2,cl,cr) "vfmadd231pd %%xmm"#ar",%%xmm"#b2",%%xmm"#cl"; vfnmadd231pd %%xmm"#ai",%%xmm"#b2",%%xmm"#cr";"
#define acc_m1n2_exp(arb,aib,b4,cl,cr) "vfmadd231pd %%ymm"#arb",%%ymm"#b4",%%ymm"#cl"; vfnmadd231pd %%ymm"#aib",%%ymm"#b4",%%ymm"#cr";"
#endif
#define KERNEL_k1m1n1 \
"vmovddup (%0),%%xmm0; vmovddup 8(%0),%%xmm1; addq $16,%0;"\
"vmovupd (%1),%%xmm2; addq $16,%1;" acc_m1n1_exp(0,1,2,4,5)
#define KERNEL_h_k1m1n2 \
"vbroadcastsd (%0),%%ymm0; vbroadcastsd 8(%0),%%ymm1; addq $16,%0;"\
"vmovupd (%1),%%ymm2;" acc_m1n2_exp(0,1,2,4,5)
#define KERNEL_h_k1m1n4 KERNEL_h_k1m1n2 "vmovupd (%1,%%r12,1),%%ymm2;" acc_m1n2_exp(0,1,2,6,7)
#define KERNEL_h_k1m1n6 KERNEL_h_k1m1n4 "vmovupd (%1,%%r12,2),%%ymm2;" acc_m1n2_exp(0,1,2,8,9)
#define KERNEL_k1m1n2 KERNEL_h_k1m1n2 "addq $32,%1;"
#define KERNEL_k1m1n4 KERNEL_h_k1m1n4 "addq $32,%1;"
#define KERNEL_k1m1n6 KERNEL_h_k1m1n6 "addq $32,%1;"
#define INIT_m1n1 "vpxor %%xmm4,%%xmm4,%%xmm4; vpxor %%xmm5,%%xmm5,%%xmm5;"
#define INIT_m1n2 "vpxor %%ymm4,%%ymm4,%%ymm4; vpxor %%ymm5,%%ymm5,%%ymm5;"
#define INIT_m1n4 INIT_m1n2 "vpxor %%ymm6,%%ymm6,%%ymm6; vpxor %%ymm7,%%ymm7,%%ymm7;"
#define INIT_m1n6 INIT_m1n4 "vpxor %%ymm8,%%ymm8,%%ymm8; vpxor %%ymm9,%%ymm9,%%ymm9;"
#if A_CONJ == B_CONJ
#define cont_expxmmacc(cl,cr,dst) "vpermilpd $5,%%xmm"#cr",%%xmm"#cr"; vaddsubpd %%xmm"#cl",%%xmm"#cr",%%xmm"#dst";"
#else
#define cont_expxmmacc(cl,cr,dst) "vpermilpd $5,%%xmm"#cr",%%xmm"#cr"; vaddsubpd %%xmm"#cr",%%xmm"#cl",%%xmm"#dst";"
#endif
#if A_CONJ == 0
#define save_m1n1(c,tmp,alpr,alpi) \
"vpermilpd $5,%%xmm"#c",%%xmm"#tmp"; vfmsubadd213pd (%3),%%xmm"#alpr",%%xmm"#c";"\
"vfmsubadd231pd %%xmm"#tmp",%%xmm"#alpi",%%xmm"#c"; vmovupd %%xmm"#c",(%3);"
#define save_m1n2(c,tmp1,tmp2,alpr,alpi) \
"vpermilpd $5,%%ymm"#c",%%ymm"#tmp1"; vmovupd (%3),%%xmm"#tmp2"; vinsertf128 $1,(%3,%4,1),%%ymm"#tmp2",%%ymm"#tmp2";"\
"vfmsubadd213pd %%ymm"#tmp2",%%ymm"#alpr",%%ymm"#c"; vfmsubadd231pd %%ymm"#tmp1",%%ymm"#alpi",%%ymm"#c";"\
"vmovupd %%xmm"#c",(%3); vextractf128 $1,%%ymm"#c",(%3,%4,1); leaq (%3,%4,2),%3;"
#else
#define save_m1n1(c,tmp,alpr,alpi) \
"vpermilpd $5,%%xmm"#c",%%xmm"#tmp"; vfmaddsub213pd (%3),%%xmm"#alpi",%%xmm"#tmp";"\
"vfmaddsub231pd %%xmm"#c",%%xmm"#alpr",%%xmm"#tmp"; vmovupd %%xmm"#tmp",(%3);"
#define save_m1n2(c,tmp1,tmp2,alpr,alpi) \
"vpermilpd $5,%%ymm"#c",%%ymm"#tmp1"; vmovupd (%3),%%xmm"#tmp2"; vinsertf128 $1,(%3,%4,1),%%ymm"#tmp2",%%ymm"#tmp2";"\
"vfmaddsub213pd %%ymm"#tmp2",%%ymm"#alpi",%%ymm"#tmp1"; vfmaddsub231pd %%ymm"#c",%%ymm"#alpr",%%ymm"#tmp1";"\
"vmovupd %%xmm"#tmp1",(%3); vextractf128 $1,%%ymm"#tmp1",(%3,%4,1); leaq (%3,%4,2),%3;"
#endif
#define save_init_m1 "movq %2,%3; addq $16,%2; vbroadcastsd (%6),%%ymm0; vbroadcastsd 8(%6),%%ymm1;"
#define SAVE_m1n1 save_init_m1 cont_expxmmacc(4,5,4) save_m1n1(4,2,0,1)
#define SAVE_m1n2 save_init_m1 cont_expacc(4,5,4) save_m1n2(4,2,3,0,1)
#define SAVE_m1n4 SAVE_m1n2 cont_expacc(6,7,6) save_m1n2(6,2,3,0,1)
#define SAVE_m1n6 SAVE_m1n4 cont_expacc(8,9,8) save_m1n2(8,2,3,0,1)
#define COMPUTE_m1(ndim) \
"movq %%r14,%1;" INIT_m1n##ndim "movq %%r13,%5;"\
"testq %5,%5; jz "#ndim"1112f;"\
#ndim"1111:\n\t"\
KERNEL_k1m1n##ndim\
"decq %5; jnz "#ndim"1111b;"\
#ndim"1112:\n\t"\
SAVE_m1n##ndim
#define COMPUTE(ndim) {\
b_pref = b_ptr + ndim * K *2;\
__asm__ __volatile__ (\
"movq %1,%%r14; movq %5,%%r13; movq %5,%%r12; salq $5,%%r12; movq %7,%%r11;"\
"cmpq $4,%7; jb "#ndim"9992f;"\
#ndim"9991:\n\t"\
COMPUTE_m4(ndim)\
"subq $4,%7; cmpq $4,%7; jnb "#ndim"9991b;"\
#ndim"9992:\n\t"\
"cmpq $2,%7; jb "#ndim"9993f;"\
COMPUTE_m2(ndim) "subq $2,%7;"\
#ndim"9993:\n\t"\
"testq %7,%7; jz "#ndim"9994f;"\
COMPUTE_m1(ndim)\
#ndim"9994:\n\t"\
"movq %%r14,%1; movq %%r13,%5; movq %%r11,%7; vzeroupper;"\
:"+r"(a_ptr),"+r"(b_ptr),"+r"(c_ptr),"+r"(c_tmp),"+r"(ldc_in_bytes),"+r"(K),"+r"(alp),"+r"(M),"+r"(b_pref)\
::"cc","memory","r11","r12","r13","r14","r15","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5",\
"xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15");\
a_ptr -= M * K *2; b_ptr += ndim * K *2; c_ptr += (ndim * LDC - M) * 2;\
}
int __attribute__ ((noinline))
CNAME(BLASLONG m, BLASLONG n, BLASLONG k, double alphar, double alphai, double * __restrict__ A, double * __restrict__ B, double * __restrict__ C, BLASLONG LDC)
{
if(m==0||n==0||k==0||(alphar==0.0 && alphai==0.0)) return 0;
int64_t ldc_in_bytes = (int64_t)LDC * sizeof(double) * 2;
#if A_CONJ == B_CONJ
double const_val[2] = {-alphar, -alphai};
#else
double const_val[2] = {alphar, alphai};
#endif
int64_t M = (int64_t)m, K = (int64_t)k;
BLASLONG n_count = n;
double *a_ptr = A,*b_ptr = B,*c_ptr = C,*c_tmp = C,*alp = const_val,*b_pref = B;
for(;n_count>5;n_count-=6) COMPUTE(6)
for(;n_count>3;n_count-=4) COMPUTE(4)
for(;n_count>1;n_count-=2) COMPUTE(2)
if(n_count>0) COMPUTE(1)
return 0;
}

param.h
View File

@@ -668,8 +668,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define SGEMM_DEFAULT_P 768
 #define DGEMM_DEFAULT_P 512
-#define CGEMM_DEFAULT_P 384
-#define ZGEMM_DEFAULT_P 256
+#define CGEMM_DEFAULT_P 256
+#define ZGEMM_DEFAULT_P 192
 #ifdef WINDOWS_ABI
 #define SGEMM_DEFAULT_Q 320
@@ -678,8 +678,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define SGEMM_DEFAULT_Q 384
 #define DGEMM_DEFAULT_Q 256
 #endif
-#define CGEMM_DEFAULT_Q 192
-#define ZGEMM_DEFAULT_Q 128
+#define CGEMM_DEFAULT_Q 256
+#define ZGEMM_DEFAULT_Q 192
 #define SGEMM_DEFAULT_R sgemm_r
 #define DGEMM_DEFAULT_R 13824
@@ -693,15 +693,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define XGEMM_DEFAULT_R xgemm_r
 #define XGEMM_DEFAULT_Q 128
-#define CGEMM3M_DEFAULT_UNROLL_N 8
-#define CGEMM3M_DEFAULT_UNROLL_M 4
+#define CGEMM3M_DEFAULT_UNROLL_N 4
+#define CGEMM3M_DEFAULT_UNROLL_M 8
 #define ZGEMM3M_DEFAULT_UNROLL_N 8
 #define ZGEMM3M_DEFAULT_UNROLL_M 2
-#define CGEMM3M_DEFAULT_P 448
+#define CGEMM3M_DEFAULT_P 320
 #define ZGEMM3M_DEFAULT_P 224
 #define XGEMM3M_DEFAULT_P 112
-#define CGEMM3M_DEFAULT_Q 224
+#define CGEMM3M_DEFAULT_Q 320
 #define ZGEMM3M_DEFAULT_Q 224
 #define XGEMM3M_DEFAULT_Q 224
 #define CGEMM3M_DEFAULT_R 12288
@@ -1571,8 +1571,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define SGEMM_DEFAULT_P 768
 #define DGEMM_DEFAULT_P 512
-#define CGEMM_DEFAULT_P 384
-#define ZGEMM_DEFAULT_P 256
+#define CGEMM_DEFAULT_P 256
+#define ZGEMM_DEFAULT_P 192
 #ifdef WINDOWS_ABI
 #define SGEMM_DEFAULT_Q 320
@@ -1581,8 +1581,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define SGEMM_DEFAULT_Q 384
 #define DGEMM_DEFAULT_Q 256
 #endif
-#define CGEMM_DEFAULT_Q 192
-#define ZGEMM_DEFAULT_Q 128
+#define CGEMM_DEFAULT_Q 256
+#define ZGEMM_DEFAULT_Q 192
 #define SGEMM_DEFAULT_R sgemm_r
 #define DGEMM_DEFAULT_R 13824
@@ -1596,15 +1596,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define XGEMM_DEFAULT_R xgemm_r
 #define XGEMM_DEFAULT_Q 128
-#define CGEMM3M_DEFAULT_UNROLL_N 8
-#define CGEMM3M_DEFAULT_UNROLL_M 4
+#define CGEMM3M_DEFAULT_UNROLL_N 4
+#define CGEMM3M_DEFAULT_UNROLL_M 8
 #define ZGEMM3M_DEFAULT_UNROLL_N 8
 #define ZGEMM3M_DEFAULT_UNROLL_M 2
-#define CGEMM3M_DEFAULT_P 448
+#define CGEMM3M_DEFAULT_P 320
 #define ZGEMM3M_DEFAULT_P 224
 #define XGEMM3M_DEFAULT_P 112
-#define CGEMM3M_DEFAULT_Q 224
+#define CGEMM3M_DEFAULT_Q 320
 #define ZGEMM3M_DEFAULT_Q 224
 #define XGEMM3M_DEFAULT_Q 224
 #define CGEMM3M_DEFAULT_R 12288