Merge branch 'develop' of https://github.com/ashwinyes/OpenBLAS into ashwinyes-develop
This commit is contained in:
commit
faf0811483
|
@ -4,4 +4,8 @@ CCOMMON_OPT += -march=armv8-a
|
|||
FCOMMON_OPT += -march=armv8-a
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), CORTEXA57)
|
||||
CCOMMON_OPT += -march=armv8-a+crc+crypto+fp+simd -mtune=cortex-a57
|
||||
FCOMMON_OPT += -march=armv8-a+crc+crypto+fp+simd -mtune=cortex-a57
|
||||
endif
|
||||
|
||||
|
|
|
@ -74,3 +74,5 @@ ARMV5
|
|||
|
||||
7.ARM 64-bit CPU:
|
||||
ARMV8
|
||||
CORTEXA57
|
||||
|
||||
|
|
|
@ -172,7 +172,7 @@ int main(int argc, char *argv[]){
|
|||
srandom(getpid());
|
||||
#endif
|
||||
|
||||
for(j = 0; j < m; j++){
|
||||
for(j = 0; j < to; j++){
|
||||
for(i = 0; i < to * COMPSIZE; i++){
|
||||
a[i + j * to * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
b[i + j * to * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
|
|
1
common.h
1
common.h
|
@ -86,6 +86,7 @@ extern "C" {
|
|||
#if !defined(_MSC_VER)
|
||||
#include <unistd.h>
|
||||
#endif
|
||||
#include <time.h>
|
||||
|
||||
#ifdef OS_LINUX
|
||||
#include <malloc.h>
|
||||
|
|
|
@ -89,8 +89,10 @@ static inline int blas_quickdivide(blasint x, blasint y){
|
|||
#if defined(ASSEMBLER) && !defined(NEEDPARAM)
|
||||
|
||||
#define PROLOGUE \
|
||||
.text ;\
|
||||
.align 4 ;\
|
||||
.global REALNAME ;\
|
||||
.func REALNAME ;\
|
||||
.type REALNAME, %function ;\
|
||||
REALNAME:
|
||||
|
||||
#define EPILOGUE
|
||||
|
@ -107,7 +109,11 @@ REALNAME:
|
|||
#endif
|
||||
#define HUGE_PAGESIZE ( 4 << 20)
|
||||
|
||||
#if defined(CORTEXA57)
|
||||
#define BUFFER_SIZE (128 << 20)
|
||||
#else
|
||||
#define BUFFER_SIZE (16 << 20)
|
||||
#endif
|
||||
|
||||
|
||||
#define BASE_ADDRESS (START_ADDRESS - BUFFER_SIZE * MAX_CPU_NUMBER)
|
||||
|
|
|
@ -29,12 +29,19 @@
|
|||
|
||||
#define CPU_UNKNOWN 0
|
||||
#define CPU_ARMV8 1
|
||||
#define CPU_CORTEXA57 2
|
||||
|
||||
static char *cpuname[] = {
|
||||
"UNKOWN",
|
||||
"ARMV8"
|
||||
"UNKNOWN",
|
||||
"ARMV8" ,
|
||||
"CORTEXA57"
|
||||
};
|
||||
|
||||
static char *cpuname_lower[] = {
|
||||
"unknown",
|
||||
"armv8" ,
|
||||
"cortexa57"
|
||||
};
|
||||
|
||||
int get_feature(char *search)
|
||||
{
|
||||
|
@ -53,13 +60,13 @@ int get_feature(char *search)
|
|||
{
|
||||
p = strchr(buffer, ':') + 2;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fclose(infile);
|
||||
fclose(infile);
|
||||
|
||||
|
||||
if( p == NULL ) return;
|
||||
if( p == NULL ) return 0;
|
||||
|
||||
t = strtok(p," ");
|
||||
while( t = strtok(NULL," "))
|
||||
|
@ -82,11 +89,30 @@ int detect(void)
|
|||
p = (char *) NULL ;
|
||||
|
||||
infile = fopen("/proc/cpuinfo", "r");
|
||||
|
||||
while (fgets(buffer, sizeof(buffer), infile))
|
||||
{
|
||||
|
||||
if ((!strncmp("model name", buffer, 10)) || (!strncmp("Processor", buffer, 9)))
|
||||
if (!strncmp("CPU part", buffer, 8))
|
||||
{
|
||||
p = strchr(buffer, ':') + 2;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
fclose(infile);
|
||||
if(p != NULL) {
|
||||
if (strstr(p, "0xd07")) {
|
||||
return CPU_CORTEXA57;
|
||||
}
|
||||
}
|
||||
|
||||
p = (char *) NULL ;
|
||||
infile = fopen("/proc/cpuinfo", "r");
|
||||
while (fgets(buffer, sizeof(buffer), infile))
|
||||
{
|
||||
|
||||
if ((!strncmp("model name", buffer, 10)) || (!strncmp("Processor", buffer, 9)) ||
|
||||
(!strncmp("CPU architecture", buffer, 16)))
|
||||
{
|
||||
p = strchr(buffer, ':') + 2;
|
||||
break;
|
||||
|
@ -100,7 +126,7 @@ int detect(void)
|
|||
|
||||
if (strstr(p, "AArch64"))
|
||||
{
|
||||
return CPU_ARMV8;
|
||||
return CPU_ARMV8;
|
||||
|
||||
}
|
||||
|
||||
|
@ -118,23 +144,13 @@ char *get_corename(void)
|
|||
|
||||
void get_architecture(void)
|
||||
{
|
||||
printf("ARM");
|
||||
printf("ARM64");
|
||||
}
|
||||
|
||||
void get_subarchitecture(void)
|
||||
{
|
||||
int d = detect();
|
||||
switch (d)
|
||||
{
|
||||
|
||||
case CPU_ARMV8:
|
||||
printf("ARMV8");
|
||||
break;
|
||||
|
||||
default:
|
||||
printf("UNKNOWN");
|
||||
break;
|
||||
}
|
||||
printf("%s", cpuname[d]);
|
||||
}
|
||||
|
||||
void get_subdirname(void)
|
||||
|
@ -160,26 +176,32 @@ void get_cpuconfig(void)
|
|||
printf("#define L2_ASSOCIATIVE 4\n");
|
||||
break;
|
||||
|
||||
|
||||
case CPU_CORTEXA57:
|
||||
printf("#define CORTEXA57\n");
|
||||
printf("#define HAVE_VFP\n");
|
||||
printf("#define HAVE_VFPV3\n");
|
||||
printf("#define HAVE_NEON\n");
|
||||
printf("#define HAVE_VFPV4\n");
|
||||
printf("#define L1_CODE_SIZE 49152\n");
|
||||
printf("#define L1_CODE_LINESIZE 64\n");
|
||||
printf("#define L1_CODE_ASSOCIATIVE 3\n");
|
||||
printf("#define L1_DATA_SIZE 32768\n");
|
||||
printf("#define L1_DATA_LINESIZE 64\n");
|
||||
printf("#define L1_DATA_ASSOCIATIVE 2\n");
|
||||
printf("#define L2_SIZE 2097152\n");
|
||||
printf("#define L2_LINESIZE 64\n");
|
||||
printf("#define L2_ASSOCIATIVE 16\n");
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void get_libname(void)
|
||||
{
|
||||
|
||||
int d = detect();
|
||||
switch (d)
|
||||
{
|
||||
|
||||
case CPU_ARMV8:
|
||||
printf("armv8\n");
|
||||
break;
|
||||
|
||||
}
|
||||
printf("%s", cpuname_lower[d]);
|
||||
}
|
||||
|
||||
|
||||
void get_features(void)
|
||||
{
|
||||
|
||||
|
|
|
@ -55,7 +55,7 @@
|
|||
static int spmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){
|
||||
|
||||
FLOAT *a, *x, *y;
|
||||
BLASLONG incx, incy;
|
||||
BLASLONG incx;
|
||||
BLASLONG m_from, m_to, i;
|
||||
#ifndef COMPLEX
|
||||
FLOAT result;
|
||||
|
@ -68,7 +68,6 @@ static int spmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F
|
|||
y = (FLOAT *)args -> c;
|
||||
|
||||
incx = args -> ldb;
|
||||
incy = args -> ldc;
|
||||
|
||||
m_from = 0;
|
||||
m_to = args -> m;
|
||||
|
|
|
@ -43,7 +43,7 @@
|
|||
static int syr_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){
|
||||
|
||||
FLOAT *a, *x, *y;
|
||||
BLASLONG lda, incx, incy;
|
||||
BLASLONG incx, incy;
|
||||
BLASLONG i, m_from, m_to;
|
||||
FLOAT alpha_r;
|
||||
#ifdef COMPLEX
|
||||
|
@ -56,7 +56,6 @@ static int syr_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FL
|
|||
|
||||
incx = args -> lda;
|
||||
incy = args -> ldb;
|
||||
lda = args -> ldc;
|
||||
|
||||
alpha_r = *((FLOAT *)args -> alpha + 0);
|
||||
#ifdef COMPLEX
|
||||
|
|
|
@ -46,7 +46,7 @@ static int syr_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FL
|
|||
BLASLONG incx;
|
||||
BLASLONG i, m_from, m_to;
|
||||
FLOAT alpha_r;
|
||||
#if defined(COMPLEX) && !defined(HER) && !defined(HERREV)
|
||||
#if defined(COMPLEX) && !defined(HEMV) && !defined(HEMVREV)
|
||||
FLOAT alpha_i;
|
||||
#endif
|
||||
|
||||
|
@ -56,7 +56,7 @@ static int syr_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FL
|
|||
incx = args -> lda;
|
||||
|
||||
alpha_r = *((FLOAT *)args -> alpha + 0);
|
||||
#if defined(COMPLEX) && !defined(HER) && !defined(HERREV)
|
||||
#if defined(COMPLEX) && !defined(HEMV) && !defined(HEMVREV)
|
||||
alpha_i = *((FLOAT *)args -> alpha + 1);
|
||||
#endif
|
||||
|
||||
|
|
|
@ -55,7 +55,7 @@
|
|||
static int symv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){
|
||||
|
||||
FLOAT *a, *x, *y;
|
||||
BLASLONG lda, incx, incy;
|
||||
BLASLONG lda, incx;
|
||||
BLASLONG m_from, m_to;
|
||||
|
||||
a = (FLOAT *)args -> a;
|
||||
|
@ -64,7 +64,6 @@ static int symv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F
|
|||
|
||||
lda = args -> lda;
|
||||
incx = args -> ldb;
|
||||
incy = args -> ldc;
|
||||
|
||||
m_from = 0;
|
||||
m_to = args -> m;
|
||||
|
|
|
@ -45,13 +45,11 @@ const static FLOAT dp1 = 1.;
|
|||
int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){
|
||||
|
||||
BLASLONG i;
|
||||
FLOAT *gemvbuffer = (FLOAT *)buffer;
|
||||
FLOAT *B = b;
|
||||
BLASLONG length;
|
||||
|
||||
if (incb != 1) {
|
||||
B = buffer;
|
||||
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + n * sizeof(FLOAT) + 4095) & ~4095);
|
||||
COPY_K(n, b, incb, buffer, 1);
|
||||
}
|
||||
|
||||
|
|
|
@ -45,13 +45,11 @@ const static FLOAT dp1 = 1.;
|
|||
int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){
|
||||
|
||||
BLASLONG i;
|
||||
FLOAT *gemvbuffer = (FLOAT *)buffer;
|
||||
FLOAT *B = b;
|
||||
BLASLONG length;
|
||||
|
||||
if (incb != 1) {
|
||||
B = buffer;
|
||||
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + n * sizeof(FLOAT) + 4095) & ~4095);
|
||||
COPY_K(n, b, incb, buffer, 1);
|
||||
}
|
||||
|
||||
|
|
|
@ -45,13 +45,11 @@ const static FLOAT dp1 = 1.;
|
|||
int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){
|
||||
|
||||
BLASLONG i;
|
||||
FLOAT *gemvbuffer = (FLOAT *)buffer;
|
||||
FLOAT *B = b;
|
||||
BLASLONG length;
|
||||
|
||||
if (incb != 1) {
|
||||
B = buffer;
|
||||
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + n * sizeof(FLOAT) + 4095) & ~4095);
|
||||
COPY_K(n, b, incb, buffer, 1);
|
||||
}
|
||||
|
||||
|
|
|
@ -45,13 +45,11 @@ const static FLOAT dp1 = 1.;
|
|||
int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){
|
||||
|
||||
BLASLONG i;
|
||||
FLOAT *gemvbuffer = (FLOAT *)buffer;
|
||||
FLOAT *B = b;
|
||||
BLASLONG length;
|
||||
|
||||
if (incb != 1) {
|
||||
B = buffer;
|
||||
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + n * sizeof(FLOAT) + 4095) & ~4095);
|
||||
COPY_K(n, b, incb, buffer, 1);
|
||||
}
|
||||
|
||||
|
|
|
@ -43,12 +43,10 @@
|
|||
int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){
|
||||
|
||||
BLASLONG i;
|
||||
FLOAT *gemvbuffer = (FLOAT *)buffer;
|
||||
FLOAT *B = b;
|
||||
|
||||
if (incb != 1) {
|
||||
B = buffer;
|
||||
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) + 4095) & ~4095);
|
||||
COPY_K(m, b, incb, buffer, 1);
|
||||
}
|
||||
|
||||
|
|
|
@ -43,12 +43,10 @@
|
|||
int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){
|
||||
|
||||
BLASLONG i;
|
||||
FLOAT *gemvbuffer = (FLOAT *)buffer;
|
||||
FLOAT *B = b;
|
||||
|
||||
if (incb != 1) {
|
||||
B = buffer;
|
||||
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) + 4095) & ~4095);
|
||||
COPY_K(m, b, incb, buffer, 1);
|
||||
}
|
||||
|
||||
|
|
|
@ -45,7 +45,6 @@ const static FLOAT dp1 = 1.;
|
|||
int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){
|
||||
|
||||
BLASLONG i;
|
||||
FLOAT *gemvbuffer = (FLOAT *)buffer;
|
||||
FLOAT *B = b;
|
||||
BLASLONG length;
|
||||
#if (TRANSA == 2) || (TRANSA == 4)
|
||||
|
@ -57,7 +56,6 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG inc
|
|||
|
||||
if (incb != 1) {
|
||||
B = buffer;
|
||||
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + n * sizeof(FLOAT) * COMPSIZE+ 4095) & ~4095);
|
||||
COPY_K(n, b, incb, buffer, 1);
|
||||
}
|
||||
|
||||
|
|
|
@ -45,7 +45,6 @@ const static FLOAT dp1 = 1.;
|
|||
int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){
|
||||
|
||||
BLASLONG i;
|
||||
FLOAT *gemvbuffer = (FLOAT *)buffer;
|
||||
FLOAT *B = b;
|
||||
BLASLONG length;
|
||||
#if (TRANSA == 2) || (TRANSA == 4)
|
||||
|
@ -57,7 +56,6 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG inc
|
|||
|
||||
if (incb != 1) {
|
||||
B = buffer;
|
||||
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + n * sizeof(FLOAT) * COMPSIZE + 4095) & ~4095);
|
||||
COPY_K(n, b, incb, buffer, 1);
|
||||
}
|
||||
|
||||
|
|
|
@ -45,7 +45,6 @@ const static FLOAT dp1 = 1.;
|
|||
int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){
|
||||
|
||||
BLASLONG i;
|
||||
FLOAT *gemvbuffer = (FLOAT *)buffer;
|
||||
FLOAT *B = b;
|
||||
BLASLONG length;
|
||||
#if (TRANSA == 2) || (TRANSA == 4)
|
||||
|
@ -57,7 +56,6 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG inc
|
|||
|
||||
if (incb != 1) {
|
||||
B = buffer;
|
||||
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + n * sizeof(FLOAT) * COMPSIZE + 4095) & ~4095);
|
||||
COPY_K(n, b, incb, buffer, 1);
|
||||
}
|
||||
|
||||
|
|
|
@ -45,7 +45,6 @@ const static FLOAT dp1 = 1.;
|
|||
int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){
|
||||
|
||||
BLASLONG i;
|
||||
FLOAT *gemvbuffer = (FLOAT *)buffer;
|
||||
FLOAT *B = b;
|
||||
BLASLONG length;
|
||||
#if (TRANSA == 2) || (TRANSA == 4)
|
||||
|
@ -57,7 +56,6 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG inc
|
|||
|
||||
if (incb != 1) {
|
||||
B = buffer;
|
||||
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + n * sizeof(FLOAT) * COMPSIZE+ 4095) & ~4095);
|
||||
COPY_K(n, b, incb, buffer, 1);
|
||||
}
|
||||
|
||||
|
|
|
@ -49,12 +49,10 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){
|
|||
#ifndef UNIT
|
||||
FLOAT atemp1, atemp2, btemp1, btemp2;
|
||||
#endif
|
||||
FLOAT *gemvbuffer = (FLOAT *)buffer;
|
||||
FLOAT *B = b;
|
||||
|
||||
if (incb != 1) {
|
||||
B = buffer;
|
||||
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 4095) & ~4095);
|
||||
COPY_K(m, b, incb, buffer, 1);
|
||||
}
|
||||
|
||||
|
|
|
@ -49,12 +49,10 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){
|
|||
#ifndef UNIT
|
||||
FLOAT atemp1, atemp2, btemp1, btemp2;
|
||||
#endif
|
||||
FLOAT *gemvbuffer = (FLOAT *)buffer;
|
||||
FLOAT *B = b;
|
||||
|
||||
if (incb != 1) {
|
||||
B = buffer;
|
||||
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 4095) & ~4095);
|
||||
COPY_K(m, b, incb, buffer, 1);
|
||||
}
|
||||
|
||||
|
|
|
@ -51,12 +51,10 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){
|
|||
#ifndef UNIT
|
||||
FLOAT ar, ai, br, bi, ratio, den;
|
||||
#endif
|
||||
FLOAT *gemvbuffer = (FLOAT *)buffer;
|
||||
FLOAT *B = b;
|
||||
|
||||
if (incb != 1) {
|
||||
B = buffer;
|
||||
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 4095) & ~4095);
|
||||
COPY_K(m, b, incb, buffer, 1);
|
||||
}
|
||||
|
||||
|
|
|
@ -49,12 +49,10 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){
|
|||
#ifndef UNIT
|
||||
FLOAT ar, ai, br, bi, ratio, den;
|
||||
#endif
|
||||
FLOAT *gemvbuffer = (FLOAT *)buffer;
|
||||
FLOAT *B = b;
|
||||
|
||||
if (incb != 1) {
|
||||
B = buffer;
|
||||
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 4095) & ~4095);
|
||||
COPY_K(m, b, incb, buffer, 1);
|
||||
}
|
||||
|
||||
|
|
|
@ -65,7 +65,7 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (
|
|||
blas_queue_t queue[MAX_CPU_NUMBER];
|
||||
|
||||
BLASLONG range_M[MAX_CPU_NUMBER + 1], range_N[MAX_CPU_NUMBER + 1];
|
||||
BLASLONG procs, total_procs, num_cpu_m, num_cpu_n;
|
||||
BLASLONG procs, num_cpu_m, num_cpu_n;
|
||||
|
||||
BLASLONG width, i, j;
|
||||
BLASLONG divM, divN;
|
||||
|
|
|
@ -230,7 +230,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
|||
BLASLONG is, min_i, div_n;
|
||||
|
||||
BLASLONG i, current;
|
||||
BLASLONG l1stride, l2size;
|
||||
BLASLONG l1stride;
|
||||
|
||||
#ifdef TIMING
|
||||
BLASULONG rpcc_counter;
|
||||
|
@ -298,8 +298,6 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
|
|||
#endif
|
||||
) return 0;
|
||||
|
||||
l2size = GEMM_P * GEMM_Q;
|
||||
|
||||
#if 0
|
||||
fprintf(stderr, "Thread[%ld] m_from : %ld m_to : %ld n_from : %ld n_to : %ld N_from : %ld N_to : %ld\n",
|
||||
mypos, m_from, m_to, n_from, n_to, N_from, N_to);
|
||||
|
@ -706,7 +704,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
|
|||
n = n_to - n_from;
|
||||
}
|
||||
|
||||
if ((args -> m < nthreads * SWITCH_RATIO) || (args -> n < nthreads * SWITCH_RATIO)) {
|
||||
if ((m < nthreads * SWITCH_RATIO) || (n < nthreads * SWITCH_RATIO)) {
|
||||
GEMM_LOCAL(args, range_m, range_n, sa, sb, 0);
|
||||
return 0;
|
||||
}
|
||||
|
|
|
@ -914,7 +914,6 @@ static volatile struct {
|
|||
} memory[NUM_BUFFERS];
|
||||
|
||||
static int memory_initialized = 0;
|
||||
static void gotoblas_memory_init(void);
|
||||
|
||||
/* Memory allocation routine */
|
||||
/* procpos ... indicates where it comes from */
|
||||
|
|
18
getarch.c
18
getarch.c
|
@ -819,10 +819,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
|
||||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 "
|
||||
#define LIBNAME "armv8"
|
||||
#define CORENAME "XGENE1"
|
||||
#else
|
||||
#define CORENAME "ARMV8"
|
||||
#endif
|
||||
|
||||
#ifdef FORCE_CORTEXA57
|
||||
#define FORCE
|
||||
#define ARCHITECTURE "ARM64"
|
||||
#define SUBARCHITECTURE "ARMV8"
|
||||
#define SUBDIRNAME "arm64"
|
||||
#define ARCHCONFIG "-DCORTEXA57 " \
|
||||
"-DL1_CODE_SIZE=49152 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=3 " \
|
||||
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=2 " \
|
||||
"-DL2_SIZE=2097152 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \
|
||||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
|
||||
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON"
|
||||
#define LIBNAME "cortexa57"
|
||||
#define CORENAME "CORTEXA57"
|
||||
#else
|
||||
#endif
|
||||
|
||||
#ifndef FORCE
|
||||
|
||||
|
|
|
@ -91,6 +91,27 @@
|
|||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef SMP
|
||||
#ifndef COMPLEX
|
||||
#ifdef XDOUBLE
|
||||
#define MODE (BLAS_XDOUBLE | BLAS_REAL)
|
||||
#elif defined(DOUBLE)
|
||||
#define MODE (BLAS_DOUBLE | BLAS_REAL)
|
||||
#else
|
||||
#define MODE (BLAS_SINGLE | BLAS_REAL)
|
||||
#endif
|
||||
#else
|
||||
#ifdef XDOUBLE
|
||||
#define MODE (BLAS_XDOUBLE | BLAS_COMPLEX)
|
||||
#elif defined(DOUBLE)
|
||||
#define MODE (BLAS_DOUBLE | BLAS_COMPLEX)
|
||||
#else
|
||||
#define MODE (BLAS_SINGLE | BLAS_COMPLEX)
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
static int (*symm[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = {
|
||||
#ifndef GEMM3M
|
||||
#ifndef HEMM
|
||||
|
@ -135,26 +156,6 @@ void NAME(char *SIDE, char *UPLO,
|
|||
FLOAT *buffer;
|
||||
FLOAT *sa, *sb;
|
||||
|
||||
#ifdef SMP
|
||||
#ifndef COMPLEX
|
||||
#ifdef XDOUBLE
|
||||
int mode = BLAS_XDOUBLE | BLAS_REAL;
|
||||
#elif defined(DOUBLE)
|
||||
int mode = BLAS_DOUBLE | BLAS_REAL;
|
||||
#else
|
||||
int mode = BLAS_SINGLE | BLAS_REAL;
|
||||
#endif
|
||||
#else
|
||||
#ifdef XDOUBLE
|
||||
int mode = BLAS_XDOUBLE | BLAS_COMPLEX;
|
||||
#elif defined(DOUBLE)
|
||||
int mode = BLAS_DOUBLE | BLAS_COMPLEX;
|
||||
#else
|
||||
int mode = BLAS_SINGLE | BLAS_COMPLEX;
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(SMP) && !defined(NO_AFFINITY)
|
||||
int nodes;
|
||||
#endif
|
||||
|
@ -246,26 +247,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo,
|
|||
FLOAT *buffer;
|
||||
FLOAT *sa, *sb;
|
||||
|
||||
#ifdef SMP
|
||||
#ifndef COMPLEX
|
||||
#ifdef XDOUBLE
|
||||
int mode = BLAS_XDOUBLE | BLAS_REAL;
|
||||
#elif defined(DOUBLE)
|
||||
int mode = BLAS_DOUBLE | BLAS_REAL;
|
||||
#else
|
||||
int mode = BLAS_SINGLE | BLAS_REAL;
|
||||
#endif
|
||||
#else
|
||||
#ifdef XDOUBLE
|
||||
int mode = BLAS_XDOUBLE | BLAS_COMPLEX;
|
||||
#elif defined(DOUBLE)
|
||||
int mode = BLAS_DOUBLE | BLAS_COMPLEX;
|
||||
#else
|
||||
int mode = BLAS_SINGLE | BLAS_COMPLEX;
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(SMP) && !defined(NO_AFFINITY)
|
||||
int nodes;
|
||||
#endif
|
||||
|
@ -407,7 +388,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo,
|
|||
|
||||
args.nthreads /= nodes;
|
||||
|
||||
gemm_thread_mn(mode, &args, NULL, NULL,
|
||||
gemm_thread_mn(MODE, &args, NULL, NULL,
|
||||
symm[4 | (side << 1) | uplo ], sa, sb, nodes);
|
||||
|
||||
} else {
|
||||
|
@ -419,7 +400,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo,
|
|||
|
||||
#else
|
||||
|
||||
GEMM_THREAD(mode, &args, NULL, NULL, symm[(side << 1) | uplo ], sa, sb, args.nthreads);
|
||||
GEMM_THREAD(MODE, &args, NULL, NULL, symm[(side << 1) | uplo ], sa, sb, args.nthreads);
|
||||
|
||||
#endif
|
||||
|
||||
|
|
|
@ -116,7 +116,7 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA,
|
|||
void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *a, blasint lda) {
|
||||
|
||||
FLOAT *buffer;
|
||||
int trans, uplo;
|
||||
int uplo;
|
||||
blasint info;
|
||||
#ifdef SMP
|
||||
int nthreads;
|
||||
|
@ -124,7 +124,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha,
|
|||
|
||||
PRINT_DEBUG_CNAME;
|
||||
|
||||
trans = -1;
|
||||
uplo = -1;
|
||||
info = 0;
|
||||
|
||||
|
|
|
@ -118,7 +118,7 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA,
|
|||
void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *y, blasint incy, FLOAT *a, blasint lda) {
|
||||
|
||||
FLOAT *buffer;
|
||||
int trans, uplo;
|
||||
int uplo;
|
||||
blasint info;
|
||||
#ifdef SMP
|
||||
int nthreads;
|
||||
|
@ -126,7 +126,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha,
|
|||
|
||||
PRINT_DEBUG_CNAME;
|
||||
|
||||
trans = -1;
|
||||
uplo = -1;
|
||||
info = 0;
|
||||
|
||||
|
|
|
@ -117,7 +117,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT *ALPHA
|
|||
FLOAT beta_i = BETA[1];
|
||||
|
||||
FLOAT *buffer;
|
||||
int trans, uplo;
|
||||
int uplo;
|
||||
blasint info;
|
||||
#ifdef SMP
|
||||
int nthreads;
|
||||
|
@ -135,7 +135,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT *ALPHA
|
|||
|
||||
PRINT_DEBUG_CNAME;
|
||||
|
||||
trans = -1;
|
||||
uplo = -1;
|
||||
info = 0;
|
||||
|
||||
|
|
|
@ -116,7 +116,7 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA,
|
|||
void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *a, blasint lda) {
|
||||
|
||||
FLOAT *buffer;
|
||||
int trans, uplo;
|
||||
int uplo;
|
||||
blasint info;
|
||||
#ifdef SMP
|
||||
int nthreads;
|
||||
|
@ -124,7 +124,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha,
|
|||
|
||||
PRINT_DEBUG_CNAME;
|
||||
|
||||
trans = -1;
|
||||
uplo = -1;
|
||||
info = 0;
|
||||
|
||||
|
|
|
@ -121,7 +121,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT *ALPHA
|
|||
FLOAT alpha_r = ALPHA[0];
|
||||
FLOAT alpha_i = ALPHA[1];
|
||||
FLOAT *buffer;
|
||||
int trans, uplo;
|
||||
int uplo;
|
||||
blasint info;
|
||||
#ifdef SMP
|
||||
int nthreads;
|
||||
|
@ -129,7 +129,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT *ALPHA
|
|||
|
||||
PRINT_DEBUG_CNAME;
|
||||
|
||||
trans = -1;
|
||||
uplo = -1;
|
||||
info = 0;
|
||||
|
||||
|
|
|
@ -637,49 +637,49 @@ $(KDIR)xasum_k$(TSUFFIX).$(SUFFIX) $(KDIR)xasum_k$(TPSUFFIX).$(PSUFFIX) : $(KE
|
|||
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $@
|
||||
|
||||
$(KDIR)saxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)saxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SAXPYKERNEL)
|
||||
$(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -UDOUBLE $< -o $@
|
||||
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@
|
||||
|
||||
$(KDIR)daxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)daxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DAXPYKERNEL)
|
||||
$(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -DDOUBLE $< -o $@
|
||||
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE $< -o $@
|
||||
|
||||
$(KDIR)qaxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)qaxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QAXPYKERNEL)
|
||||
$(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -DXDOUBLE $< -o $@
|
||||
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE $< -o $@
|
||||
|
||||
$(KDIR)caxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)caxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CAXPYKERNEL)
|
||||
$(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -UCONJ -UDOUBLE $< -o $@
|
||||
|
||||
$(KDIR)zaxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)zaxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZAXPYKERNEL)
|
||||
$(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -UCONJ -DDOUBLE $< -o $@
|
||||
$(CC) -c $(CFLAGS) -DCOMPLEX -UCONJ -DDOUBLE $< -o $@
|
||||
|
||||
$(KDIR)xaxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)xaxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XAXPYKERNEL)
|
||||
$(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -UCONJ -DXDOUBLE $< -o $@
|
||||
$(CC) -c $(CFLAGS) -DCOMPLEX -UCONJ -DXDOUBLE $< -o $@
|
||||
|
||||
$(KDIR)caxpyc_k$(TSUFFIX).$(SUFFIX) $(KDIR)caxpyc_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CAXPYKERNEL)
|
||||
$(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -DCONJ -UDOUBLE $< -o $@
|
||||
$(CC) -c $(CFLAGS) -DCOMPLEX -DCONJ -UDOUBLE $< -o $@
|
||||
|
||||
$(KDIR)zaxpyc_k$(TSUFFIX).$(SUFFIX) $(KDIR)zaxpyc_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZAXPYKERNEL)
|
||||
$(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -DCONJ -DDOUBLE $< -o $@
|
||||
$(CC) -c $(CFLAGS) -DCOMPLEX -DCONJ -DDOUBLE $< -o $@
|
||||
|
||||
$(KDIR)xaxpyc_k$(TSUFFIX).$(SUFFIX) $(KDIR)xaxpyc_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XAXPYKERNEL)
|
||||
$(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -DCONJ -DXDOUBLE $< -o $@
|
||||
$(CC) -c $(CFLAGS) -DCOMPLEX -DCONJ -DXDOUBLE $< -o $@
|
||||
|
||||
$(KDIR)scopy_k$(TSUFFIX).$(SUFFIX) $(KDIR)scopy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SCOPYKERNEL)
|
||||
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UCOMPLEX -DC_INTERFACE $< -o $@
|
||||
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DC_INTERFACE $< -o $@
|
||||
|
||||
$(KDIR)dcopy_k$(TSUFFIX).$(SUFFIX) $(KDIR)dcopy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DCOPYKERNEL)
|
||||
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UCOMPLEX -DC_INTERFACE $< -o $@
|
||||
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DC_INTERFACE $< -o $@
|
||||
|
||||
$(KDIR)qcopy_k$(TSUFFIX).$(SUFFIX) $(KDIR)qcopy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QCOPYKERNEL)
|
||||
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UCOMPLEX -DC_INTERFACE $< -o $@
|
||||
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DC_INTERFACE $< -o $@
|
||||
|
||||
$(KDIR)ccopy_k$(TSUFFIX).$(SUFFIX) $(KDIR)ccopy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CCOPYKERNEL)
|
||||
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DCOMPLEX -DC_INTERFACE $< -o $@
|
||||
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DC_INTERFACE $< -o $@
|
||||
|
||||
$(KDIR)zcopy_k$(TSUFFIX).$(SUFFIX) $(KDIR)zcopy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZCOPYKERNEL)
|
||||
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DCOMPLEX -DC_INTERFACE $< -o $@
|
||||
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DC_INTERFACE $< -o $@
|
||||
|
||||
$(KDIR)xcopy_k$(TSUFFIX).$(SUFFIX) $(KDIR)xcopy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XCOPYKERNEL)
|
||||
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DCOMPLEX -DC_INTERFACE $< -o $@
|
||||
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DC_INTERFACE $< -o $@
|
||||
|
||||
$(KDIR)ddot_k$(TSUFFIX).$(SUFFIX) $(KDIR)ddot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DDOTKERNEL)
|
||||
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE $< -o $@
|
||||
|
@ -799,15 +799,15 @@ $(KDIR)xswap_k$(TSUFFIX).$(SUFFIX) $(KDIR)xswap_k$(TPSUFFIX).$(PSUFFIX) : $(KE
|
|||
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $@
|
||||
|
||||
$(KDIR)saxpby_k$(TSUFFIX).$(SUFFIX) $(KDIR)saxpby_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SAXPBYKERNEL)
|
||||
$(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -UDOUBLE $< -o $@
|
||||
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@
|
||||
|
||||
$(KDIR)daxpby_k$(TSUFFIX).$(SUFFIX) $(KDIR)daxpby_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DAXPBYKERNEL)
|
||||
$(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -DDOUBLE $< -o $@
|
||||
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE $< -o $@
|
||||
|
||||
$(KDIR)caxpby_k$(TSUFFIX).$(SUFFIX) $(KDIR)caxpby_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CAXPBYKERNEL)
|
||||
$(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -UCONJ -UDOUBLE $< -o $@
|
||||
$(CC) -c $(CFLAGS) -DCOMPLEX -UCONJ -UDOUBLE $< -o $@
|
||||
|
||||
$(KDIR)zaxpby_k$(TSUFFIX).$(SUFFIX) $(KDIR)zaxpby_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZAXPBYKERNEL)
|
||||
$(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -UCONJ -DDOUBLE $< -o $@
|
||||
$(CC) -c $(CFLAGS) -DCOMPLEX -UCONJ -DDOUBLE $< -o $@
|
||||
|
||||
|
||||
|
|
|
@ -54,13 +54,15 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
|||
BLASLONG ix=0;
|
||||
FLOAT maxf=0.0;
|
||||
|
||||
if (n < 0 || inc_x < 1 ) return(maxf);
|
||||
if (n <= 0 || inc_x <= 0) return(maxf);
|
||||
|
||||
maxf=ABS(x[0]);
|
||||
ix += inc_x;
|
||||
i++;
|
||||
|
||||
while(i < n)
|
||||
{
|
||||
if( ABS(x[ix]) > ABS(maxf) )
|
||||
if( ABS(x[ix]) > maxf )
|
||||
{
|
||||
maxf = ABS(x[ix]);
|
||||
}
|
||||
|
|
|
@ -54,13 +54,15 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
|||
BLASLONG ix=0;
|
||||
FLOAT minf=0.0;
|
||||
|
||||
if (n < 0 || inc_x < 1 ) return(minf);
|
||||
if (n <= 0 || inc_x <= 0) return(minf);
|
||||
|
||||
minf=ABS(x[0]);
|
||||
ix += inc_x;
|
||||
i++;
|
||||
|
||||
while(i < n)
|
||||
{
|
||||
if( ABS(x[ix]) < ABS(minf) )
|
||||
if( ABS(x[ix]) < minf )
|
||||
{
|
||||
minf = ABS(x[ix]);
|
||||
}
|
||||
|
|
|
@ -53,7 +53,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
|||
{
|
||||
BLASLONG i=0;
|
||||
FLOAT sumf = 0.0;
|
||||
if (n < 0 || inc_x < 1 ) return(sumf);
|
||||
if (n <= 0 || inc_x <= 0) return(sumf);
|
||||
|
||||
n *= inc_x;
|
||||
while(i < n)
|
||||
|
|
|
@ -55,13 +55,15 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
|||
FLOAT maxf=0.0;
|
||||
BLASLONG max=0;
|
||||
|
||||
if (n < 0 || inc_x < 1 ) return(max);
|
||||
if (n <= 0 || inc_x <= 0) return(max);
|
||||
|
||||
maxf=ABS(x[0]);
|
||||
ix += inc_x;
|
||||
i++;
|
||||
|
||||
while(i < n)
|
||||
{
|
||||
if( ABS(x[ix]) > ABS(maxf) )
|
||||
if( ABS(x[ix]) > maxf )
|
||||
{
|
||||
max = i;
|
||||
maxf = ABS(x[ix]);
|
||||
|
|
|
@ -55,9 +55,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
|||
FLOAT minf=0.0;
|
||||
BLASLONG min=0;
|
||||
|
||||
if (n < 0 || inc_x < 1 ) return(min);
|
||||
if (n <= 0 || inc_x <= 0) return(min);
|
||||
|
||||
minf=ABS(x[0]);
|
||||
ix += inc_x;
|
||||
i++;
|
||||
|
||||
while(i < n)
|
||||
{
|
||||
|
|
|
@ -47,9 +47,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
|||
FLOAT maxf=0.0;
|
||||
BLASLONG max=0;
|
||||
|
||||
if (n < 0 || inc_x < 1 ) return(max);
|
||||
if (n <= 0 || inc_x <= 0) return(max);
|
||||
|
||||
maxf=x[0];
|
||||
ix += inc_x;
|
||||
i++;
|
||||
|
||||
while(i < n)
|
||||
{
|
||||
|
|
|
@ -45,9 +45,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
|||
FLOAT minf=0.0;
|
||||
BLASLONG min=0;
|
||||
|
||||
if (n < 0 || inc_x < 1 ) return(min);
|
||||
if (n <= 0 || inc_x <= 0) return(min);
|
||||
|
||||
minf=x[0];
|
||||
ix += inc_x;
|
||||
i++;
|
||||
|
||||
while(i < n)
|
||||
{
|
||||
|
|
|
@ -53,24 +53,24 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
|||
{
|
||||
BLASLONG i=0;
|
||||
BLASLONG ix=0;
|
||||
FLOAT maxf[2];
|
||||
FLOAT maxf;
|
||||
BLASLONG max=0;
|
||||
BLASLONG inc_x2;
|
||||
|
||||
if (n < 0 || inc_x < 1 ) return(max);
|
||||
if (n <= 0 || inc_x <= 0) return(max);
|
||||
|
||||
inc_x2 = 2 * inc_x;
|
||||
|
||||
maxf[0] = ABS(x[ix]);
|
||||
maxf[1] = ABS(x[ix+1]);
|
||||
maxf = CABS1(x,0);
|
||||
ix += inc_x2;
|
||||
i++;
|
||||
|
||||
while(i < n)
|
||||
{
|
||||
if( CABS1(x,ix) > CABS1(maxf,0) )
|
||||
if( CABS1(x,ix) > maxf )
|
||||
{
|
||||
max = i;
|
||||
maxf[0] = ABS(x[ix]);
|
||||
maxf[1] = ABS(x[ix+1]);
|
||||
maxf = CABS1(x,ix);
|
||||
}
|
||||
ix += inc_x2;
|
||||
i++;
|
||||
|
|
|
@ -53,24 +53,24 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
|||
{
|
||||
BLASLONG i=0;
|
||||
BLASLONG ix=0;
|
||||
FLOAT minf[2];
|
||||
FLOAT minf;
|
||||
BLASLONG min=0;
|
||||
BLASLONG inc_x2;
|
||||
|
||||
if (n < 0 || inc_x < 1 ) return(min);
|
||||
if (n <= 0 || inc_x <= 0) return(min);
|
||||
|
||||
inc_x2 = 2 * inc_x;
|
||||
|
||||
minf[0] = ABS(x[ix]);
|
||||
minf[1] = ABS(x[ix+1]);
|
||||
minf = CABS1(x,0);
|
||||
ix += inc_x2;
|
||||
i++;
|
||||
|
||||
while(i < n)
|
||||
{
|
||||
if( CABS1(x,ix) < CABS1(minf,0) )
|
||||
if( CABS1(x,ix) < minf )
|
||||
{
|
||||
min = i;
|
||||
minf[0] = ABS(x[ix]);
|
||||
minf[1] = ABS(x[ix+1]);
|
||||
minf = CABS1(x,ix);
|
||||
}
|
||||
ix += inc_x2;
|
||||
i++;
|
||||
|
|
|
@ -44,9 +44,11 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
|||
BLASLONG ix=0;
|
||||
FLOAT maxf=0.0;
|
||||
|
||||
if (n < 0 || inc_x < 1 ) return(maxf);
|
||||
if (n <= 0 || inc_x <= 0) return(maxf);
|
||||
|
||||
maxf=x[0];
|
||||
ix += inc_x;
|
||||
i++;
|
||||
|
||||
while(i < n)
|
||||
{
|
||||
|
|
|
@ -44,9 +44,11 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
|||
BLASLONG ix=0;
|
||||
FLOAT minf=0.0;
|
||||
|
||||
if (n < 0 || inc_x < 1 ) return(minf);
|
||||
if (n <= 0 || inc_x <= 0) return(minf);
|
||||
|
||||
minf=x[0];
|
||||
ix += inc_x;
|
||||
i++;
|
||||
|
||||
while(i < n)
|
||||
{
|
||||
|
|
|
@ -57,7 +57,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
|||
FLOAT absxi = 0.0;
|
||||
|
||||
|
||||
if (n < 0 || inc_x < 1 ) return(0.0);
|
||||
if (n <= 0 || inc_x <= 0) return(0.0);
|
||||
if ( n == 1 ) return( ABS(x[0]) );
|
||||
|
||||
n *= inc_x;
|
||||
|
|
|
@ -53,29 +53,27 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
|||
{
|
||||
BLASLONG i=0;
|
||||
BLASLONG ix=0;
|
||||
FLOAT maxf[2];
|
||||
BLASLONG max=0;
|
||||
FLOAT maxf;
|
||||
BLASLONG inc_x2;
|
||||
|
||||
if (n < 0 || inc_x < 1 ) return(0.0);
|
||||
if (n <= 0 || inc_x <= 0) return(0.0);
|
||||
|
||||
inc_x2 = 2 * inc_x;
|
||||
|
||||
maxf[0] = ABS(x[ix]);
|
||||
maxf[1] = ABS(x[ix+1]);
|
||||
maxf = CABS1(x,0);
|
||||
ix += inc_x2;
|
||||
i++;
|
||||
|
||||
while(i < n)
|
||||
{
|
||||
if( CABS1(x,ix) > CABS1(maxf,0) )
|
||||
if( CABS1(x,ix) > maxf )
|
||||
{
|
||||
max = i;
|
||||
maxf[0] = ABS(x[ix]);
|
||||
maxf[1] = ABS(x[ix+1]);
|
||||
maxf = CABS1(x,ix);
|
||||
}
|
||||
ix += inc_x2;
|
||||
i++;
|
||||
}
|
||||
return(CABS1(maxf,0));
|
||||
return(maxf);
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -53,29 +53,27 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
|||
{
|
||||
BLASLONG i=0;
|
||||
BLASLONG ix=0;
|
||||
FLOAT minf[2];
|
||||
BLASLONG min=0;
|
||||
FLOAT minf;
|
||||
BLASLONG inc_x2;
|
||||
|
||||
if (n < 0 || inc_x < 1 ) return(0.0);
|
||||
if (n <= 0 || inc_x <= 0) return(0.0);
|
||||
|
||||
inc_x2 = 2 * inc_x;
|
||||
|
||||
minf[0] = ABS(x[ix]);
|
||||
minf[1] = ABS(x[ix+1]);
|
||||
minf = CABS1(x,0);
|
||||
ix += inc_x2;
|
||||
i++;
|
||||
|
||||
while(i < n)
|
||||
{
|
||||
if( CABS1(x,ix) < CABS1(minf,0) )
|
||||
if( CABS1(x,ix) < minf )
|
||||
{
|
||||
min = i;
|
||||
minf[0] = ABS(x[ix]);
|
||||
minf[1] = ABS(x[ix+1]);
|
||||
minf = CABS1(x,ix);
|
||||
}
|
||||
ix += inc_x2;
|
||||
i++;
|
||||
}
|
||||
return(CABS1(minf,0));
|
||||
return(minf);
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -55,7 +55,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
|||
BLASLONG i=0;
|
||||
FLOAT sumf = 0.0;
|
||||
BLASLONG inc_x2;
|
||||
if (n < 0 || inc_x < 1 ) return(sumf);
|
||||
|
||||
if (n <= 0 || inc_x <= 0) return(sumf);
|
||||
|
||||
inc_x2 = 2 * inc_x;
|
||||
|
||||
|
|
|
@ -37,11 +37,9 @@ int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FL
|
|||
BLASLONG i=0;
|
||||
BLASLONG ix,iy;
|
||||
FLOAT temp;
|
||||
BLASLONG inc_x2, inc_y2;
|
||||
|
||||
BLASLONG inc_x2;
|
||||
BLASLONG inc_y2;
|
||||
|
||||
if ( n < 0 ) return(0);
|
||||
if ( n <= 0 ) return(0);
|
||||
|
||||
ix = 0;
|
||||
iy = 0;
|
||||
|
|
|
@ -57,7 +57,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
|||
BLASLONG inc_x2;
|
||||
FLOAT temp;
|
||||
|
||||
if (n < 0 || inc_x < 1 ) return(0.0);
|
||||
if (n <= 0 || inc_x <= 0) return(0.0);
|
||||
|
||||
inc_x2 = 2 * inc_x;
|
||||
|
||||
|
|
|
@ -0,0 +1,91 @@
|
|||
# Kernel selection for the CORTEXA57 target.
# The ARMV8 kernel list is included first; every assignment below replaces
# whatever that file set for the same variable.
# Naming pattern used throughout: <S|D|C|Z><op>KERNEL, one entry per
# precision (single, double, complex single, complex double).
include $(KERNELDIR)/KERNEL.ARMV8

SAMAXKERNEL = amax.S
DAMAXKERNEL = amax.S
CAMAXKERNEL = zamax.S
ZAMAXKERNEL = zamax.S

ISAMAXKERNEL = isamax.S
IDAMAXKERNEL = idamax.S
ICAMAXKERNEL = izamax.S
IZAMAXKERNEL = izamax.S

SASUMKERNEL = asum.S
DASUMKERNEL = asum.S
CASUMKERNEL = casum.S
ZASUMKERNEL = zasum.S

SAXPYKERNEL = axpy.S
DAXPYKERNEL = axpy.S
CAXPYKERNEL = zaxpy.S
ZAXPYKERNEL = zaxpy.S

SCOPYKERNEL = copy.S
DCOPYKERNEL = copy.S
CCOPYKERNEL = copy.S
ZCOPYKERNEL = copy.S

# Was "DOTKERNEL": the missing S prefix broke the S/D/C/Z pattern, so the
# single-precision dot product presumably never picked up dot.S.
SDOTKERNEL = dot.S
DDOTKERNEL = dot.S
CDOTKERNEL = zdot.S
ZDOTKERNEL = zdot.S

SNRM2KERNEL = snrm2.S
DNRM2KERNEL = dnrm2.S
CNRM2KERNEL = znrm2.S
ZNRM2KERNEL = znrm2.S

SROTKERNEL = rot.S
DROTKERNEL = rot.S
CROTKERNEL = zrot.S
ZROTKERNEL = zrot.S

# Was "SCALKERNEL": same missing-prefix typo as the dot entry above.
SSCALKERNEL = scal.S
DSCALKERNEL = scal.S
CSCALKERNEL = zscal.S
ZSCALKERNEL = zscal.S

SSWAPKERNEL = swap.S
DSWAPKERNEL = swap.S
CSWAPKERNEL = swap.S
ZSWAPKERNEL = swap.S

SGEMVNKERNEL = gemv_n.S
DGEMVNKERNEL = gemv_n.S
CGEMVNKERNEL = zgemv_n.S
ZGEMVNKERNEL = zgemv_n.S

SGEMVTKERNEL = gemv_t.S
DGEMVTKERNEL = gemv_t.S
CGEMVTKERNEL = zgemv_t.S
ZGEMVTKERNEL = zgemv_t.S

STRMMKERNEL = strmm_kernel_4x4.S
DTRMMKERNEL = dtrmm_kernel_4x4.S
CTRMMKERNEL = ctrmm_kernel_4x4.S
ZTRMMKERNEL = ztrmm_kernel_4x4.S

# GEMM: assembly compute kernel plus the generic C packing (copy) routines
# and the object names they are built into.
SGEMMKERNEL = sgemm_kernel_4x4.S
SGEMMONCOPY = ../generic/gemm_ncopy_4.c
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
SGEMMONCOPYOBJ = sgemm_oncopy.o
SGEMMOTCOPYOBJ = sgemm_otcopy.o

DGEMMKERNEL = dgemm_kernel_4x4.S
DGEMMONCOPY = ../generic/gemm_ncopy_4.c
DGEMMOTCOPY = ../generic/gemm_tcopy_4.c
DGEMMONCOPYOBJ = dgemm_oncopy.o
DGEMMOTCOPYOBJ = dgemm_otcopy.o

CGEMMKERNEL = cgemm_kernel_4x4.S
CGEMMONCOPY = ../generic/zgemm_ncopy_4.c
CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c
CGEMMONCOPYOBJ = cgemm_oncopy.o
CGEMMOTCOPYOBJ = cgemm_otcopy.o

ZGEMMKERNEL = zgemm_kernel_4x4.S
ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c
ZGEMMONCOPYOBJ = zgemm_oncopy.o
ZGEMMOTCOPYOBJ = zgemm_otcopy.o
|
||||
|
|
@ -0,0 +1,249 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2015, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
||||
#define N x0 /* vector length */
|
||||
#define X x1 /* X vector address */
|
||||
#define INC_X x2 /* X stride */
|
||||
#define I x5 /* loop variable */
|
||||
|
||||
/*******************************************************************************
|
||||
* Macro definitions
|
||||
*******************************************************************************/
|
||||
|
||||
#if defined(USE_MIN)
|
||||
#define COND le
|
||||
#else
|
||||
#define COND ge
|
||||
#endif
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
#define REG0 wzr
|
||||
#define MAXF s0
|
||||
#define TMPF s1
|
||||
#define TMPVF {v1.s}[0]
|
||||
#define SZ 4
|
||||
#else
|
||||
#define REG0 xzr
|
||||
#define MAXF d0
|
||||
#define TMPF d1
|
||||
#define TMPVF {v1.d}[0]
|
||||
#define SZ 8
|
||||
#endif
|
||||
|
||||
/******************************************************************************/
|
||||
|
||||
/* INIT_F1: seed the running result MAXF with the first element of X
 * (its absolute value when USE_ABS is defined) and advance X by one
 * element (SZ bytes). Used on the unit-stride path when N < 4. */
.macro INIT_F1
ldr MAXF, [X], #SZ
#if defined(USE_ABS)
fabs MAXF, MAXF
#endif
.endm
|
||||
|
||||
/* KERNEL_F1: fold one unit-stride element into MAXF.
 * Loads the next element into TMPF (post-incrementing X by SZ bytes),
 * optionally takes its absolute value, then keeps MAXF if
 * "MAXF COND TMPF" holds and otherwise replaces it with TMPF.
 * COND is `ge` by default and `le` when USE_MIN is defined. */
.macro KERNEL_F1
ldr TMPF, [X], #SZ
#if defined(USE_ABS)
fabs TMPF, TMPF
#endif
fcmp MAXF, TMPF
fcsel MAXF, MAXF, TMPF, COND
.endm
|
||||
|
||||
/* INIT_F4: seed MAXF from the first four unit-stride elements of X.
 * Single precision: one 16-byte load, then a 4-lane reduction.
 * Double precision: one 32-byte load into v0/v1, a lane-wise min/max,
 * then a pairwise reduction. X is advanced past the four elements. */
.macro INIT_F4
#if !defined(DOUBLE)
ld1 {v0.4s}, [X], #16
#if defined(USE_ABS)
fabs v0.4s, v0.4s
#endif
#if defined(USE_MIN)
fminv MAXF, v0.4s
#else
fmaxv MAXF, v0.4s
#endif
#else // DOUBLE
// ld2 de-interleaves the four doubles across v0/v1; element order does
// not matter for a min/max reduction.
ld2 {v0.2d,v1.2d}, [X], #32
#if defined(USE_ABS)
fabs v0.2d, v0.2d
fabs v1.2d, v1.2d
#endif
#if defined(USE_MIN)
fmin v0.2d, v0.2d, v1.2d
fminp MAXF, v0.2d
#else
fmax v0.2d, v0.2d, v1.2d
fmaxp MAXF, v0.2d
#endif
#endif
.endm
|
||||
|
||||
/* KERNEL_F4: fold the next four unit-stride elements into MAXF.
 * Reduces the four elements to a single value in TMPF (same scheme as
 * INIT_F4, but using v1/v2 so v0/MAXF is preserved), then merges TMPF
 * into MAXF with the fcmp/fcsel pair governed by COND. */
.macro KERNEL_F4
#if !defined(DOUBLE)
ld1 {v1.4s}, [X], #16
#if defined(USE_ABS)
fabs v1.4s, v1.4s
#endif
#if defined(USE_MIN)
fminv TMPF, v1.4s
#else
fmaxv TMPF, v1.4s
#endif
#else // DOUBLE
ld2 {v1.2d,v2.2d}, [X], #32
#if defined(USE_ABS)
fabs v1.2d, v1.2d
fabs v2.2d, v2.2d
#endif
#if defined(USE_MIN)
fmin v1.2d, v1.2d, v2.2d
fminp TMPF, v1.2d
#else
fmax v1.2d, v1.2d, v2.2d
fmaxp TMPF, v1.2d
#endif
#endif
// Keep MAXF when "MAXF COND TMPF", otherwise take TMPF.
fcmp MAXF, TMPF
fcsel MAXF, MAXF, TMPF, COND
.endm
|
||||
|
||||
/* INIT_S: set up the strided path.
 * Converts INC_X from an element count to a byte stride (x4 for single,
 * x8 for double), loads the first element into lane 0 of v0 (MAXF) while
 * post-incrementing X by that stride, and applies fabs under USE_ABS. */
.macro INIT_S
#if !defined(DOUBLE)
lsl INC_X, INC_X, #2
ld1 {v0.s}[0], [X], INC_X
#else
lsl INC_X, INC_X, #3
ld1 {v0.d}[0], [X], INC_X
#endif
#if defined(USE_ABS)
fabs MAXF, MAXF
#endif
.endm
|
||||
|
||||
/* KERNEL_S1: fold one strided element into MAXF.
 * Loads the element into lane 0 of v1 (TMPF) and advances X by INC_X,
 * which INIT_S has already scaled to bytes; then merges it into MAXF
 * exactly as KERNEL_F1 does. */
.macro KERNEL_S1
ld1 TMPVF, [X], INC_X
#if defined(USE_ABS)
fabs TMPF, TMPF
#endif
fcmp MAXF, TMPF
fcsel MAXF, MAXF, TMPF, COND
.endm
|
||||
|
||||
/*******************************************************************************
|
||||
* End of macro definitions
|
||||
*******************************************************************************/
|
||||
|
||||
/* Entry point: reduce N elements of X (stride INC_X) to a single value
 * returned in MAXF (s0/d0).
 *   - default: maximum; with USE_MIN: minimum (see COND)
 *   - with USE_ABS: computed over absolute values
 * Returns 0.0 when N <= 0 or INC_X <= 0.
 * INC_X == 1 takes the vectorised "F" path; any other stride takes the
 * scalar "S" path. */
PROLOGUE

// Degenerate arguments -> return zero.
cmp N, xzr
ble amax_kernel_zero
cmp INC_X, xzr
ble amax_kernel_zero

cmp INC_X, #1
bne amax_kernel_S_BEGIN

amax_kernel_F_BEGIN:

// I = number of whole groups of four elements.
asr I, N, #2
cmp I, xzr
beq amax_kernel_F1_INIT

// The first group seeds MAXF; the remaining I-1 groups go through the loop.
INIT_F4
subs I, I, #1
beq amax_kernel_F1

amax_kernel_F4:

KERNEL_F4

subs I, I, #1
bne amax_kernel_F4

amax_kernel_F1:

// Tail: the 0..3 elements left over after the groups of four.
ands I, N, #3
ble amax_kernel_L999

amax_kernel_F10:

KERNEL_F1

subs I, I, #1
bne amax_kernel_F10

ret

amax_kernel_F1_INIT:

// N < 4: seed from a single element, then let the tail loop handle the
// remaining N-1 elements (N is decremented so that N & 3 == N-1).
INIT_F1
subs N, N, #1
b amax_kernel_F1

amax_kernel_S_BEGIN:

// Strided path: INIT_S consumes the first element.
INIT_S

subs N, N, #1
ble amax_kernel_L999

asr I, N, #2
cmp I, xzr
ble amax_kernel_S1

amax_kernel_S4:

// Four elements per iteration (unrolled scalar steps).
KERNEL_S1
KERNEL_S1
KERNEL_S1
KERNEL_S1

subs I, I, #1
bne amax_kernel_S4

amax_kernel_S1:

ands I, N, #3
ble amax_kernel_L999

amax_kernel_S10:

KERNEL_S1

subs I, I, #1
bne amax_kernel_S10

amax_kernel_L999:

ret

amax_kernel_zero:

fmov MAXF, REG0
ret

EPILOGUE
|
|
@ -0,0 +1,194 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2015, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
||||
#define N x0 /* vector length */
|
||||
#define X x1 /* X vector address */
|
||||
#define INC_X x2 /* X stride */
|
||||
#define I x5 /* loop variable */
|
||||
|
||||
/*******************************************************************************
|
||||
* Macro definitions
|
||||
*******************************************************************************/
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
#define REG0 wzr
|
||||
#define SUMF s0
|
||||
#define TMPF s1
|
||||
#define TMPVF {v1.s}[0]
|
||||
#define SZ 4
|
||||
#else
|
||||
#define REG0 xzr
|
||||
#define SUMF d0
|
||||
#define TMPF d1
|
||||
#define TMPVF {v1.d}[0]
|
||||
#define SZ 8
|
||||
#endif
|
||||
|
||||
/******************************************************************************/
|
||||
|
||||
/* KERNEL_F1: add |x| of one unit-stride element to the scalar sum SUMF.
 * X is post-incremented by SZ bytes. */
.macro KERNEL_F1
ldr TMPF, [X], #SZ
fabs TMPF, TMPF
fadd SUMF, SUMF, TMPF
.endm
|
||||
|
||||
/* KERNEL_F8: accumulate |x| of eight unit-stride elements into the
 * per-lane partial sums held in v0 (4 lanes single, 2 lanes double).
 * The lanes are collapsed to a scalar later by KERNEL_F8_FINALIZE.
 * Also issues a prefetch 1024 bytes ahead of X. */
.macro KERNEL_F8
#if !defined(DOUBLE)
ld1 {v1.4s, v2.4s}, [X], #32 // v1 = X[0..3], v2 = X[4..7]; X += 32 bytes
fabs v1.4s, v1.4s // ABS() each value
fabs v2.4s, v2.4s // ABS() each value
fadd v1.4s, v1.4s, v2.4s // v1[i] = |X[i]| + |X[i+4]|
fadd v0.4s, v0.4s, v1.4s // add into the four running partial sums
PRFM PLDL1KEEP, [X, #1024]
#else // DOUBLE
// Eight doubles = 64 bytes, spread over v2..v5.
ld1 {v2.2d, v3.2d, v4.2d, v5.2d}, [X]
add X, X, #64
fabs v2.2d, v2.2d
fabs v3.2d, v3.2d
fabs v4.2d, v4.2d
fabs v5.2d, v5.2d

PRFM PLDL1KEEP, [X, #1024]

// Pairwise tree: (v2+v3) and (v4+v5), then both into the accumulator v0.
fadd v2.2d, v2.2d, v3.2d
fadd v4.2d, v4.2d, v5.2d
fadd v0.2d, v0.2d, v2.2d
fadd v0.2d, v0.2d, v4.2d
#endif
.endm
|
||||
|
||||
/* KERNEL_F8_FINALIZE: collapse the vector partial sums in v0 into the
 * scalar SUMF.
 * Single: fold the upper two lanes onto the lower two, then pairwise add.
 * Double: a single pairwise add of the two lanes. */
.macro KERNEL_F8_FINALIZE
#if !defined(DOUBLE)
ext v1.16b, v0.16b, v0.16b, #8 // v1 low half = v0 high half
fadd v0.2s, v0.2s, v1.2s
faddp SUMF, v0.2s
#else
faddp SUMF, v0.2d
#endif
.endm
|
||||
|
||||
/* INIT_S: convert INC_X from an element count to a byte stride
 * (x4 for single precision, x8 for double). */
.macro INIT_S
#if !defined(DOUBLE)
lsl INC_X, INC_X, #2
#else
lsl INC_X, INC_X, #3
#endif
.endm
|
||||
|
||||
/* KERNEL_S1: add |x| of one strided element to SUMF.
 * The element is loaded into lane 0 of v1 (TMPF); X advances by INC_X,
 * which INIT_S has already scaled to bytes. */
.macro KERNEL_S1
ld1 TMPVF, [X], INC_X
fabs TMPF, TMPF
fadd SUMF, SUMF, TMPF
.endm
|
||||
|
||||
/*******************************************************************************
|
||||
* End of macro definitions
|
||||
*******************************************************************************/
|
||||
|
||||
/* Entry point: return in SUMF (s0/d0) the sum of absolute values of the
 * N elements of X taken with stride INC_X.
 * Returns 0.0 when N <= 0 or INC_X <= 0 (SUMF is zeroed before the checks).
 * INC_X == 1 uses the vectorised path (8 elements per iteration); any
 * other stride uses the scalar path (4 elements per iteration). */
PROLOGUE

// Zero the accumulator; a scalar write also clears the upper lanes of v0,
// which KERNEL_F8 uses as vector partial sums.
fmov SUMF, REG0
#if !defined(DOUBLE)
fmov s1, SUMF
#else
fmov d1, SUMF
#endif

cmp N, xzr
ble asum_kernel_L999
cmp INC_X, xzr
ble asum_kernel_L999

cmp INC_X, #1
bne asum_kernel_S_BEGIN

asum_kernel_F_BEGIN:

// I = number of whole groups of eight elements.
asr I, N, #3
cmp I, xzr
beq asum_kernel_F1

asum_kernel_F8:

KERNEL_F8

subs I, I, #1
bne asum_kernel_F8

// Reduce the vector partial sums to the scalar SUMF before the tail loop.
KERNEL_F8_FINALIZE

asum_kernel_F1:

// Tail: the 0..7 elements left over.
ands I, N, #7
ble asum_kernel_L999

asum_kernel_F10:

KERNEL_F1

subs I, I, #1
bne asum_kernel_F10

asum_kernel_L999:
ret

asum_kernel_S_BEGIN:

INIT_S

asr I, N, #2
cmp I, xzr
ble asum_kernel_S1

asum_kernel_S4:

KERNEL_S1
KERNEL_S1
KERNEL_S1
KERNEL_S1

subs I, I, #1
bne asum_kernel_S4

asum_kernel_S1:

// Tail: the 0..3 elements left over.
ands I, N, #3
ble asum_kernel_L999

asum_kernel_S10:

KERNEL_S1

subs I, I, #1
bne asum_kernel_S10

ret

EPILOGUE
|
|
@ -0,0 +1,209 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2015, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
||||
#define N x0 /* vector length */
|
||||
#define X x3 /* X vector address */
|
||||
#define INC_X x4 /* X stride */
|
||||
#define Y x5 /* Y vector address */
|
||||
#define INC_Y x6 /* Y stride */
|
||||
#define I x1 /* loop variable */
|
||||
|
||||
/*******************************************************************************
|
||||
* Macro definitions
|
||||
*******************************************************************************/
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
#define DA s0 /* scale input value */
|
||||
#define TMPX s1
|
||||
#define TMPVX {v1.s}[0]
|
||||
#define TMPY s2
|
||||
#define TMPVY {v2.s}[0]
|
||||
#define SZ 4
|
||||
#else
|
||||
#define DA d0 /* scale input value */
|
||||
#define TMPX d1
|
||||
#define TMPVX {v1.d}[0]
|
||||
#define TMPY d2
|
||||
#define TMPVY {v2.d}[0]
|
||||
#define SZ 8
|
||||
#endif
|
||||
|
||||
/******************************************************************************/
|
||||
|
||||
/* KERNEL_F1: one unit-stride axpy step, y = x * DA + y.
 * X and Y are each advanced by one element (SZ bytes). */
.macro KERNEL_F1

ldr TMPX, [X], #SZ
ldr TMPY, [Y]
fmadd TMPY, TMPX, DA, TMPY
str TMPY, [Y], #SZ

.endm
|
||||
|
||||
/* KERNEL_F4: four unit-stride axpy steps, y[i] += x[i] * DA.
 * DA is read from lane 0 of v0. X and Y advance by four elements
 * (16 bytes single, 32 bytes double).
 * NOTE(review): not referenced by the dispatch code in this file, which
 * uses KERNEL_F8 and KERNEL_F1 only. */
.macro KERNEL_F4

#if !defined(DOUBLE)
ld1 {v1.4s}, [X], #16
ld1 {v2.4s}, [Y]
fmla v2.4s, v1.4s, v0.s[0]
st1 {v2.4s}, [Y], #16
#else // DOUBLE
ld1 {v1.2d, v2.2d}, [X], #32
ld1 {v3.2d, v4.2d}, [Y]
fmla v3.2d, v1.2d, v0.d[0]
fmla v4.2d, v2.2d, v0.d[0]
st1 {v3.2d, v4.2d}, [Y], #32
#endif

.endm
|
||||
|
||||
/* KERNEL_F8: eight unit-stride axpy steps, y[i] += x[i] * DA.
 * DA is read from lane 0 of v0. X and Y advance by eight elements
 * (32 bytes single, 64 bytes double). Ends by prefetching 512 bytes
 * ahead on both X and Y. */
.macro KERNEL_F8
#if !defined(DOUBLE)
ld1 {v1.4s, v2.4s}, [X], #32
ld1 {v3.4s, v4.4s}, [Y]

fmla v3.4s, v1.4s, v0.s[0]
fmla v4.4s, v2.4s, v0.s[0]

st1 {v3.4s, v4.4s}, [Y], #32
#else // DOUBLE
ld1 {v1.2d, v2.2d, v3.2d, v4.2d}, [X], #64
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [Y]

fmla v16.2d, v1.2d, v0.d[0]
fmla v17.2d, v2.2d, v0.d[0]
fmla v18.2d, v3.2d, v0.d[0]
fmla v19.2d, v4.2d, v0.d[0]

st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [Y], #64
#endif
PRFM PLDL1KEEP, [X, #512]
PRFM PLDL1KEEP, [Y, #512]
.endm
|
||||
|
||||
/* INIT_S: convert INC_X and INC_Y from element counts to byte strides
 * (x4 for single precision, x8 for double). */
.macro INIT_S

#if !defined(DOUBLE)
lsl INC_X, INC_X, #2
lsl INC_Y, INC_Y, #2
#else
lsl INC_X, INC_X, #3
lsl INC_Y, INC_Y, #3
#endif

.endm
|
||||
|
||||
/* KERNEL_S1: one strided axpy step, y = x * DA + y.
 * x is loaded into lane 0 of v1 and y is stored from lane 0 of v2;
 * X and Y advance by INC_X / INC_Y, which INIT_S has scaled to bytes. */
.macro KERNEL_S1

ld1 TMPVX, [X], INC_X
ldr TMPY, [Y]
fmadd TMPY, TMPX, DA, TMPY
st1 TMPVY, [Y], INC_Y

.endm
|
||||
|
||||
/*******************************************************************************
|
||||
* End of macro definitions
|
||||
*******************************************************************************/
|
||||
|
||||
/* Entry point: y[i] += DA * x[i] for i in [0, N).
 * Register usage (from the defines at the top of this file):
 *   N = x0, DA = s0/d0, X = x3, INC_X = x4, Y = x5, INC_Y = x6;
 *   x1 is reused as the loop counter I.
 * Nothing is done when N <= 0 or DA == 0.0. Always returns 0 in w0.
 * The vectorised path is taken only when both strides are 1; otherwise
 * the scalar strided path runs. */
PROLOGUE

cmp N, xzr
ble axpy_kernel_L999

// Scaling by zero leaves Y unchanged, so skip all work.
fcmp DA, #0.0
beq axpy_kernel_L999

cmp INC_X, #1
bne axpy_kernel_S_BEGIN
cmp INC_Y, #1
bne axpy_kernel_S_BEGIN

axpy_kernel_F_BEGIN:

// I = number of whole groups of eight elements.
asr I, N, #3
cmp I, xzr
beq axpy_kernel_F1

axpy_kernel_F8:

KERNEL_F8

subs I, I, #1
bne axpy_kernel_F8

axpy_kernel_F1:

// Tail: the 0..7 elements left over.
ands I, N, #7
ble axpy_kernel_L999

axpy_kernel_F10:

KERNEL_F1

subs I, I, #1
bne axpy_kernel_F10

mov w0, wzr
ret

axpy_kernel_S_BEGIN:

INIT_S

asr I, N, #2
cmp I, xzr
ble axpy_kernel_S1

axpy_kernel_S4:

KERNEL_S1
KERNEL_S1
KERNEL_S1
KERNEL_S1

subs I, I, #1
bne axpy_kernel_S4

axpy_kernel_S1:

// Tail: the 0..3 elements left over.
ands I, N, #3
ble axpy_kernel_L999

axpy_kernel_S10:

KERNEL_S1

subs I, I, #1
bne axpy_kernel_S10

axpy_kernel_L999:

mov w0, wzr
ret

// Close the function the same way the sibling kernels in this change do.
EPILOGUE
|
|
@ -0,0 +1,170 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2015, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
||||
#define N x0 /* vector length */
|
||||
#define X x1 /* X vector address */
|
||||
#define INC_X x2 /* X stride */
|
||||
#define I x5 /* loop variable */
|
||||
|
||||
/*******************************************************************************
|
||||
* Macro definitions
|
||||
*******************************************************************************/
|
||||
|
||||
#define REG0 wzr
|
||||
#define SUMF s0
|
||||
#define TMPF s1
|
||||
#define TMPVF {v1.s}[0]
|
||||
#define SZ 4
|
||||
|
||||
/******************************************************************************/
|
||||
|
||||
/* KERNEL_F1: add |re| + |im| of one unit-stride complex element to SUMF.
 * Loads the two floats into v1, takes absolute values, rotates the pair
 * into v2 so that s2 holds the second component, and sums both into SUMF. */
.macro KERNEL_F1
ld1 {v1.2s}, [X], #8
fabs v1.2s, v1.2s
ext v2.8b, v1.8b, v1.8b, #4 // s2 = second float of the pair
fadd TMPF, TMPF, s2
fadd SUMF, SUMF, TMPF
.endm
|
||||
|
||||
/* KERNEL_F8: accumulate the absolute values of 16 consecutive floats
 * (64 bytes, i.e. eight complex elements) into the four partial sums in
 * v0. The lanes are collapsed later by KERNEL_F8_FINALIZE.
 * Also issues a prefetch 1024 bytes ahead of X. */
.macro KERNEL_F8
ld1 {v1.4s, v2.4s, v3.4s, v4.4s}, [X]
add X, X, #64
fabs v1.4s, v1.4s
fabs v2.4s, v2.4s
fabs v3.4s, v3.4s
fabs v4.4s, v4.4s

PRFM PLDL1KEEP, [X, #1024]

// Pairwise tree: (v1+v2) and (v3+v4), then both into the accumulator v0.
fadd v1.4s, v1.4s, v2.4s
fadd v3.4s, v3.4s, v4.4s
fadd v0.4s, v0.4s, v1.4s
fadd v0.4s, v0.4s, v3.4s
.endm
|
||||
|
||||
/* KERNEL_F8_FINALIZE: collapse the four partial sums in v0 into the
 * scalar SUMF: fold the upper two lanes onto the lower two, then do a
 * pairwise add. */
.macro KERNEL_F8_FINALIZE
ext v1.16b, v0.16b, v0.16b, #8 // v1 low half = v0 high half
fadd v0.2s, v0.2s, v1.2s
faddp SUMF, v0.2s
.endm
|
||||
|
||||
/* INIT_S: convert INC_X from an element count to a byte stride.
 * The shift by 3 corresponds to 8 bytes per element (two 4-byte floats). */
.macro INIT_S
lsl INC_X, INC_X, #3
.endm
|
||||
|
||||
/* KERNEL_S1: add |re| + |im| of one strided complex element to SUMF.
 * Same arithmetic as KERNEL_F1, but X advances by INC_X, which INIT_S
 * has already scaled to bytes. */
.macro KERNEL_S1
ld1 {v1.2s}, [X], INC_X
fabs v1.2s, v1.2s
ext v2.8b, v1.8b, v1.8b, #4 // s2 = second float of the pair
fadd TMPF, TMPF, s2
fadd SUMF, SUMF, TMPF

.endm
|
||||
|
||||
/*******************************************************************************
|
||||
* End of macro definitions
|
||||
*******************************************************************************/
|
||||
|
||||
/* Entry point (single-precision complex): return in SUMF (s0) the sum of
 * |re| + |im| over the N elements of X taken with stride INC_X.
 * Returns 0.0 when N <= 0 or INC_X <= 0 (SUMF is zeroed before the checks).
 * INC_X == 1 uses the vectorised path (8 elements per iteration); any
 * other stride uses the scalar path (4 elements per iteration). */
PROLOGUE

// Zero the accumulator; the scalar write also clears the upper lanes of
// v0, which KERNEL_F8 uses as vector partial sums.
fmov SUMF, REG0
fmov s1, SUMF

cmp N, xzr
ble asum_kernel_L999
cmp INC_X, xzr
ble asum_kernel_L999

cmp INC_X, #1
bne asum_kernel_S_BEGIN

asum_kernel_F_BEGIN:

// I = number of whole groups of eight complex elements.
asr I, N, #3
cmp I, xzr
beq asum_kernel_F1

asum_kernel_F8:

KERNEL_F8

subs I, I, #1
bne asum_kernel_F8

// Reduce the vector partial sums to the scalar SUMF before the tail loop.
KERNEL_F8_FINALIZE

asum_kernel_F1:

// Tail: the 0..7 elements left over.
ands I, N, #7
ble asum_kernel_L999

asum_kernel_F10:

KERNEL_F1

subs I, I, #1
bne asum_kernel_F10

asum_kernel_L999:
ret

asum_kernel_S_BEGIN:

INIT_S

asr I, N, #2
cmp I, xzr
ble asum_kernel_S1

asum_kernel_S4:

KERNEL_S1
KERNEL_S1
KERNEL_S1
KERNEL_S1

subs I, I, #1
bne asum_kernel_S4

asum_kernel_S1:

// Tail: the 0..3 elements left over.
ands I, N, #3
ble asum_kernel_L999

asum_kernel_S10:

KERNEL_S1

subs I, I, #1
bne asum_kernel_S10

ret

EPILOGUE
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,232 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2015, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
||||
#define N x0 /* vector length */
|
||||
#define X x1 /* X vector address */
|
||||
#define INC_X x2 /* X stride */
|
||||
#define Y x3 /* Y vector address */
|
||||
#define INC_Y x4 /* Y stride */
|
||||
#define I x5 /* loop variable */
|
||||
|
||||
/*******************************************************************************
|
||||
* Macro definitions
|
||||
*******************************************************************************/
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
#define TMPF s0
|
||||
#define TMPVF {v0.s}[0]
|
||||
#define SZ 4
|
||||
#else
|
||||
#define TMPF d0
|
||||
#define TMPVF {v0.d}[0]
|
||||
#define SZ 8
|
||||
#endif
|
||||
|
||||
/******************************************************************************/
|
||||
|
||||
/* KERNEL_F1: copy one unit-stride element from X to Y, advancing both.
 * Real: SZ bytes through TMPF.
 * Complex: 8 bytes (single) or 16 bytes (double) through v0. */
.macro KERNEL_F1

#if !defined(COMPLEX)
ldr TMPF, [X], #SZ
str TMPF, [Y], #SZ
#else
#if !defined(DOUBLE)
ld1 {v0.2s}, [X], #8
st1 {v0.2s}, [Y], #8
#else
ld1 {v0.2d}, [X], #16
st1 {v0.2d}, [Y], #16
#endif
#endif

.endm
|
||||
|
||||
/*
 * Contiguous path: copy four elements from X to Y, post-incrementing both.
 * Only the byte count differs between the type variants:
 *   real single                  -> 16 bytes (one q register)
 *   real double / complex single -> 32 bytes (two q registers)
 *   complex double               -> 64 bytes (four q registers)
 */
.macro KERNEL_F4
#if !defined(COMPLEX) && !defined(DOUBLE)
	ld1	{v0.4s}, [X], #16
	st1	{v0.4s}, [Y], #16
#elif defined(COMPLEX) && defined(DOUBLE)
	ld1	{v0.4s}, [X], #16
	ld1	{v1.4s}, [X], #16
	ld1	{v2.4s}, [X], #16
	ld1	{v3.4s}, [X], #16
	st1	{v0.4s}, [Y], #16
	st1	{v1.4s}, [Y], #16
	st1	{v2.4s}, [Y], #16
	st1	{v3.4s}, [Y], #16
#else
	ld1	{v0.4s}, [X], #16
	ld1	{v1.4s}, [X], #16
	st1	{v0.4s}, [Y], #16
	st1	{v1.4s}, [Y], #16
#endif
.endm
|
||||
|
||||
/*
 * Strided path set-up: convert both strides from elements to bytes by
 * shifting left by log2(element size): 4, 8 or 16 bytes per element.
 */
.macro INIT_S
#if !defined(COMPLEX) && !defined(DOUBLE)
	lsl	INC_X, INC_X, #2
	lsl	INC_Y, INC_Y, #2
#elif defined(COMPLEX) && defined(DOUBLE)
	lsl	INC_X, INC_X, #4
	lsl	INC_Y, INC_Y, #4
#else
	lsl	INC_X, INC_X, #3
	lsl	INC_Y, INC_Y, #3
#endif
.endm
|
||||
|
||||
/*
 * Strided path: copy one element from X to Y, then advance each pointer by
 * its byte stride (INC_X / INC_Y must already have been scaled by INIT_S).
 * Real elements travel through w10/x10, complex ones through v0.
 */
.macro KERNEL_S1
#if defined(COMPLEX)
#if defined(DOUBLE)
	ld1	{v0.2d}, [X]
	st1	{v0.2d}, [Y]
#else
	ld1	{v0.2s}, [X]
	st1	{v0.2s}, [Y]
#endif
#else
#if defined(DOUBLE)
	ldr	x10, [X]
	str	x10, [Y]
#else
	ldr	w10, [X]
	str	w10, [Y]
#endif
#endif
	add	X, X, INC_X
	add	Y, Y, INC_Y
.endm
|
||||
|
||||
/*******************************************************************************
|
||||
* End of macro definitions
|
||||
*******************************************************************************/
|
||||
|
||||
PROLOGUE

	/* Nothing to do for N <= 0. */
	cmp	N, xzr
	ble	copy_kernel_L999

	/* The contiguous path requires unit stride on both vectors. */
	cmp	INC_X, #1
	bne	copy_kernel_S_BEGIN
	cmp	INC_Y, #1
	bne	copy_kernel_S_BEGIN

copy_kernel_F_BEGIN:

	asr	I, N, #2		/* I = N / 4 blocks of four elements */
	cmp	I, xzr
	beq	copy_kernel_F1

copy_kernel_F4:

	KERNEL_F4
	subs	I, I, #1
	bne	copy_kernel_F4

copy_kernel_F1:

	ands	I, N, #3		/* I = N % 4 leftover elements */
	ble	copy_kernel_L999

copy_kernel_F10:

	KERNEL_F1
	subs	I, I, #1
	bne	copy_kernel_F10

	mov	w0, wzr			/* return 0 */
	ret

copy_kernel_S_BEGIN:

	INIT_S				/* strides: elements -> bytes */

	asr	I, N, #2		/* I = N / 4, body unrolled four times */
	cmp	I, xzr
	ble	copy_kernel_S1

copy_kernel_S4:

	KERNEL_S1
	KERNEL_S1
	KERNEL_S1
	KERNEL_S1
	subs	I, I, #1
	bne	copy_kernel_S4

copy_kernel_S1:

	ands	I, N, #3		/* I = N % 4 leftover elements */
	ble	copy_kernel_L999

copy_kernel_S10:

	KERNEL_S1
	subs	I, I, #1
	bne	copy_kernel_S10

copy_kernel_L999:

	mov	w0, wzr			/* return 0 */
	ret

	EPILOGUE
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,169 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2015, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
||||
/* Argument registers of the nrm2 kernel. */
#define	N	x0	/* number of elements in X */
#define	X	x1	/* X vector pointer */
#define	INC_X	x2	/* X stride, counted in elements */
#define	I	x5	/* loop counter */

/*******************************************************************************
* Macro definitions
*******************************************************************************/

#define	SSQ	d0		/* running sum of squares (also the result register) */
#define	TMPF	d6		/* scalar temporary */
#define	TMPVF	{v6.d}[0]	/* same temporary, in lane form for ld1 */
#define	SZ	8		/* element size in bytes */
|
||||
|
||||
/******************************************************************************/
|
||||
|
||||
/* Contiguous path: SSQ += x * x for one element, post-incrementing X. */
.macro KERNEL_F1
	ldr	TMPF, [X], #SZ
	fmul	TMPF, TMPF, TMPF	/* square the element */
	fadd	SSQ, SSQ, TMPF		/* add it to the running sum */
.endm
|
||||
|
||||
/*
 * Contiguous path: accumulate the squares of eight elements.
 * Two independent vector accumulators are used (v0 and v5); they are
 * combined afterwards by nrm2_kernel_F8_FINALIZE.
 */
.macro KERNEL_F8
	/* elements 0..3 */
	ld1	{v1.2d, v2.2d}, [X], #32
	fmla	v5.2d, v2.2d, v2.2d
	fmla	v0.2d, v1.2d, v1.2d
	/* elements 4..7 */
	ld1	{v3.2d, v4.2d}, [X], #32
	fmla	v5.2d, v4.2d, v4.2d
	fmla	v0.2d, v3.2d, v3.2d
	/* prefetch ahead of the already-advanced X pointer */
	prfm	PLDL1KEEP, [X, #1024]
.endm
|
||||
|
||||
/* Fold the two vector accumulators of KERNEL_F8 into the scalar SSQ. */
.macro nrm2_kernel_F8_FINALIZE
	fadd	v0.2d, v0.2d, v5.2d	/* merge the accumulators lane-wise */
	faddp	SSQ, v0.2d		/* add the two lanes together */
.endm
|
||||
|
||||
/*
 * Strided path set-up: scale the stride to bytes, then consume the first
 * element so that SSQ starts out as x[0] * x[0].
 */
.macro INIT_S
	lsl	INC_X, INC_X, #3	/* elements -> bytes (8-byte doubles) */
	ld1	TMPVF, [X], INC_X
	fmul	SSQ, TMPF, TMPF
.endm
|
||||
|
||||
/* Strided path: SSQ += x * x for one element, advancing X by INC_X bytes. */
.macro KERNEL_S1
	ld1	TMPVF, [X], INC_X
	fmul	TMPF, TMPF, TMPF	/* square the element */
	fadd	SSQ, SSQ, TMPF		/* add it to the running sum */
.endm
|
||||
|
||||
/*******************************************************************************
|
||||
* End of macro definitions
|
||||
*******************************************************************************/
|
||||
|
||||
PROLOGUE

	/* Clear both accumulators used by the vector loop (v0 and v5). */
	fmov	SSQ, xzr
	fmov	d5, SSQ

	/* N <= 0 or INC_X <= 0: return the zero already held in SSQ. */
	cmp	N, xzr
	ble	nrm2_kernel_zero
	cmp	INC_X, xzr
	ble	nrm2_kernel_zero

	cmp	INC_X, #1
	bne	nrm2_kernel_S_BEGIN

nrm2_kernel_F_BEGIN:

	asr	I, N, #3		/* I = N / 8 vector iterations */
	cmp	I, xzr
	beq	nrm2_kernel_F1		/* fewer than eight elements */

nrm2_kernel_F8:

	KERNEL_F8
	subs	I, I, #1
	bne	nrm2_kernel_F8

	nrm2_kernel_F8_FINALIZE

nrm2_kernel_F1:

	ands	I, N, #7		/* I = N % 8 leftover elements */
	ble	nrm2_kernel_L999

nrm2_kernel_F10:

	KERNEL_F1
	subs	I, I, #1
	bne	nrm2_kernel_F10

	b	nrm2_kernel_L999

nrm2_kernel_S_BEGIN:

	INIT_S				/* scales INC_X and consumes x[0] */

	subs	N, N, #1		/* one element is already accounted for */
	ble	nrm2_kernel_L999

	asr	I, N, #2		/* I = remaining / 4, unrolled four times */
	cmp	I, xzr
	ble	nrm2_kernel_S1

nrm2_kernel_S4:

	KERNEL_S1
	KERNEL_S1
	KERNEL_S1
	KERNEL_S1
	subs	I, I, #1
	bne	nrm2_kernel_S4

nrm2_kernel_S1:

	ands	I, N, #3		/* I = remaining % 4 */
	ble	nrm2_kernel_L999

nrm2_kernel_S10:

	KERNEL_S1
	subs	I, I, #1
	bne	nrm2_kernel_S10

nrm2_kernel_L999:
	fsqrt	SSQ, SSQ		/* result = sqrt(sum of squares) */
	ret

nrm2_kernel_zero:
	ret

	EPILOGUE
|
|
@ -0,0 +1,227 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2015, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
||||
/* Argument registers of the dot kernel. */
#define	N	x0	/* number of elements */
#define	X	x1	/* X vector pointer */
#define	INC_X	x2	/* X stride, counted in elements */
#define	Y	x3	/* Y vector pointer */
#define	INC_Y	x4	/* Y stride, counted in elements */
#define	I	x5	/* loop counter */

/*******************************************************************************
* Macro definitions
*******************************************************************************/

/*
 * REG0 is the zero register whose width matches DOTF, the accumulator and
 * result register.  With DSDOT the inputs are single precision while the
 * accumulator is double precision.
 */
#if defined(DOUBLE)
#define REG0	xzr
#define DOTF	d0
#define DOTI	d1
#define TMPX	d2
#define LD1VX	{v2.d}[0]
#define TMPY	d3
#define LD1VY	{v3.d}[0]
#define TMPVY	v3.d[0]
#define SZ	8
#else
#if defined(DSDOT)
#define REG0	xzr
#define DOTF	d0
#else
#define REG0	wzr
#define DOTF	s0
#endif
#define DOTI	s1
#define TMPX	s2
#define LD1VX	{v2.s}[0]
#define TMPY	s3
#define LD1VY	{v3.s}[0]
#define TMPVY	v3.s[0]
#define SZ	4
#endif
|
||||
|
||||
/******************************************************************************/
|
||||
|
||||
/*
 * Contiguous path: DOTF += x * y for one element, post-incrementing X and Y.
 *
 * DSDOT: the inputs are single precision but the product must be formed in
 * double precision.  Both operands are therefore widened BEFORE the multiply;
 * multiplying in single precision first would round the product to 24 bits
 * and discard the extra precision DSDOT exists to provide.
 */
.macro KERNEL_F1
	ldr	TMPX, [X], #SZ
	ldr	TMPY, [Y], #SZ
#if !defined(DSDOT)
	fmadd	DOTF, TMPX, TMPY, DOTF
#else /* DSDOT */
	fcvt	d3, TMPY		/* widen y to double */
	fcvt	d2, TMPX		/* widen x to double */
	fmul	d2, d2, d3		/* product in double precision */
	fadd	DOTF, DOTF, d2
#endif
.endm
|
||||
|
||||
/*
 * Contiguous path: accumulate four products into the vector accumulator v0,
 * post-incrementing X and Y, then prefetch ahead on both streams.
 *
 * DSDOT: the four single-precision pairs are widened to double BEFORE being
 * multiplied (upper halves via fcvtl2, lower halves via fcvtl), so that the
 * products are formed in double precision.  The upper halves are converted
 * first because fcvtl overwrites its source register.  v4/v5 are used as
 * extra scratch registers in this variant.
 */
.macro KERNEL_F4
#if !defined(DOUBLE)
	ld1	{v2.4s}, [X], #16
	ld1	{v3.4s}, [Y], #16
#if !defined(DSDOT)
	fmla	v0.4s, v2.4s, v3.4s
#else /* DSDOT */
	fcvtl2	v4.2d, v2.4s		/* x[2], x[3] -> double */
	fcvtl2	v5.2d, v3.4s		/* y[2], y[3] -> double */
	fcvtl	v2.2d, v2.2s		/* x[0], x[1] -> double */
	fcvtl	v3.2d, v3.2s		/* y[0], y[1] -> double */
	fmul	v2.2d, v2.2d, v3.2d
	fmul	v4.2d, v4.2d, v5.2d
	fadd	v0.2d, v0.2d, v2.2d
	fadd	v0.2d, v0.2d, v4.2d
#endif
#else /* DOUBLE */
	ld1	{v2.2d, v3.2d}, [X], #32
	ld1	{v4.2d, v5.2d}, [Y], #32
	fmul	v2.2d, v2.2d, v4.2d
	fmul	v3.2d, v3.2d, v5.2d
	fadd	v0.2d, v0.2d, v2.2d
	fadd	v0.2d, v0.2d, v3.2d
#endif
	PRFM	PLDL1KEEP, [X, #1024]
	PRFM	PLDL1KEEP, [Y, #1024]
.endm
|
||||
|
||||
/*
 * Reduce the vector accumulator v0 built by KERNEL_F4 to the scalar DOTF.
 * Plain single precision holds four float lanes; DOUBLE and DSDOT both hold
 * two double lanes.
 */
.macro KERNEL_F4_FINALIZE
#if !defined(DOUBLE) && !defined(DSDOT)
	ext	v1.16b, v0.16b, v0.16b, #8	/* bring the upper two lanes down */
	fadd	v0.2s, v0.2s, v1.2s
	faddp	DOTF, v0.2s
#else
	faddp	DOTF, v0.2d
#endif
.endm
|
||||
|
||||
/* Strided path set-up: convert both strides from elements to bytes. */
.macro INIT_S
#if defined(DOUBLE)
	lsl	INC_X, INC_X, #3	/* 8-byte elements */
	lsl	INC_Y, INC_Y, #3
#else
	lsl	INC_X, INC_X, #2	/* 4-byte elements */
	lsl	INC_Y, INC_Y, #2
#endif
.endm
|
||||
|
||||
/*
 * Strided path: DOTF += x * y for one element, advancing X and Y by their
 * byte strides (INC_X / INC_Y must already have been scaled by INIT_S).
 *
 * DSDOT: both single-precision operands are widened to double BEFORE the
 * multiply so that the product is formed in double precision; multiplying in
 * single precision first would round away the extra bits.
 */
.macro KERNEL_S1
	ld1	LD1VX, [X], INC_X
	ld1	LD1VY, [Y], INC_Y
#if !defined(DSDOT)
	fmadd	DOTF, TMPX, TMPY, DOTF
#else /* DSDOT */
	fcvt	d3, TMPY		/* widen y to double */
	fcvt	d2, TMPX		/* widen x to double */
	fmul	d2, d2, d3		/* product in double precision */
	fadd	DOTF, DOTF, d2
#endif
.endm
|
||||
|
||||
/*******************************************************************************
|
||||
* End of macro definitions
|
||||
*******************************************************************************/
|
||||
|
||||
PROLOGUE

	/* Start with a zero accumulator; it is also the result for N <= 0. */
	fmov	DOTF, REG0
#if defined(DOUBLE)
	fmov	d6, DOTF
#endif

	cmp	N, xzr
	ble	dot_kernel_L999

	/* The contiguous path requires unit stride on both vectors. */
	cmp	INC_X, #1
	bne	dot_kernel_S_BEGIN
	cmp	INC_Y, #1
	bne	dot_kernel_S_BEGIN

dot_kernel_F_BEGIN:

	asr	I, N, #2		/* I = N / 4 vector iterations */
	cmp	I, xzr
	beq	dot_kernel_F1

dot_kernel_F4:

	KERNEL_F4
	subs	I, I, #1
	bne	dot_kernel_F4

	KERNEL_F4_FINALIZE		/* vector accumulator -> scalar DOTF */

dot_kernel_F1:

	ands	I, N, #3		/* I = N % 4 leftover elements */
	ble	dot_kernel_L999

dot_kernel_F10:

	KERNEL_F1
	subs	I, I, #1
	bne	dot_kernel_F10

	ret

dot_kernel_S_BEGIN:

	INIT_S				/* strides: elements -> bytes */

	asr	I, N, #2		/* I = N / 4, body unrolled four times */
	cmp	I, xzr
	ble	dot_kernel_S1

dot_kernel_S4:

	KERNEL_S1
	KERNEL_S1
	KERNEL_S1
	KERNEL_S1
	subs	I, I, #1
	bne	dot_kernel_S4

dot_kernel_S1:

	ands	I, N, #3		/* I = N % 4 leftover elements */
	ble	dot_kernel_L999

dot_kernel_S10:

	KERNEL_S1
	subs	I, I, #1
	bne	dot_kernel_S10

dot_kernel_L999:

	ret

	EPILOGUE
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,320 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2015, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
||||
/* Argument and working registers of the gemv_n kernel. */
#define	M	x0	/* inner trip count: elements of Y updated per column */
#define	N	x1	/* outer trip count: columns of A / elements of X */
#define	A	x3	/* current column of A */
#define	LDA	x4	/* distance between columns of A, in elements */
#define	X	x5	/* X vector pointer */
#define	INC_X	x6	/* X stride, in elements */
#define	Y	x7	/* Y vector pointer */
#define	INC_Y	x2	/* Y stride, in elements; x2 is reloaded from [sp] at entry */
#define	A_PTR	x9	/* walks down the current column of A */
#define	Y_IPTR	x10	/* Y read pointer */
#define	J	x11	/* outer loop counter */
#define	I	x12	/* inner loop counter */
#define	Y_OPTR	x13	/* Y write pointer (vector loops) */

/*******************************************************************************
* Macro definitions
*******************************************************************************/

/*
 * Precision-dependent scalar registers, their ld1/st1 lane forms, the element
 * size in bytes (SZ) and its log2 (SHZ).
 */
#if defined(DOUBLE)
#define ALPHA	d0
#define TEMP	d1
#define TEMPV	{v1.d}[0]
#define TMP1	d2
#define TMPV1	{v2.d}[0]
#define TMP2	d3
#define TMPV2	{v3.d}[0]
#define SZ	8
#define SHZ	3
#else
#define ALPHA	s0
#define TEMP	s1
#define TEMPV	{v1.s}[0]
#define TMP1	s2
#define TMPV1	{v2.s}[0]
#define TMP2	s3
#define TMPV2	{v3.s}[0]
#define SZ	4
#define SHZ	2
#endif
|
||||
|
||||
/******************************************************************************/
|
||||
|
||||
/*
 * Reserve an 11 x 16 byte stack frame and spill d8-d17 and x18-x28 into it.
 * Must be paired with RESTORE_REGS, which uses the same slot layout.
 */
.macro SAVE_REGS
	sub	sp, sp, #(11 * 16)
	/* floating-point registers: slots 0..4 */
	stp	d8, d9, [sp, #(0 * 16)]
	stp	d10, d11, [sp, #(1 * 16)]
	stp	d12, d13, [sp, #(2 * 16)]
	stp	d14, d15, [sp, #(3 * 16)]
	stp	d16, d17, [sp, #(4 * 16)]
	/* general-purpose registers: slots 5..10 */
	stp	x18, x19, [sp, #(5 * 16)]
	stp	x20, x21, [sp, #(6 * 16)]
	stp	x22, x23, [sp, #(7 * 16)]
	stp	x24, x25, [sp, #(8 * 16)]
	stp	x26, x27, [sp, #(9 * 16)]
	str	x28, [sp, #(10 * 16)]
.endm
|
||||
|
||||
/*
 * Reload d8-d17 and x18-x28 from the frame written by SAVE_REGS and release
 * the 11 x 16 byte frame.
 */
.macro RESTORE_REGS
	/* floating-point registers: slots 0..4 */
	ldp	d8, d9, [sp, #(0 * 16)]
	ldp	d10, d11, [sp, #(1 * 16)]
	ldp	d12, d13, [sp, #(2 * 16)]
	ldp	d14, d15, [sp, #(3 * 16)]
	ldp	d16, d17, [sp, #(4 * 16)]
	/* general-purpose registers: slots 5..10 */
	ldp	x18, x19, [sp, #(5 * 16)]
	ldp	x20, x21, [sp, #(6 * 16)]
	ldp	x22, x23, [sp, #(7 * 16)]
	ldp	x24, x25, [sp, #(8 * 16)]
	ldp	x26, x27, [sp, #(9 * 16)]
	ldr	x28, [sp, #(10 * 16)]
	add	sp, sp, #(11*16)
.endm
|
||||
|
||||
/*
 * Unit-stride path: y[0..15] += v1 * a[0..15].
 * Every lane of v1 is expected to hold the same scalar multiplier.  A is read
 * through A_PTR, Y is read through Y_IPTR and written through Y_OPTR; all
 * three pointers are post-incremented past the 16 elements.
 */
.macro KERNEL_F16
#if !defined(DOUBLE)
	/* elements 0..7 */
	ld1	{v2.4s, v3.4s}, [A_PTR], #32
	ld1	{v4.4s, v5.4s}, [Y_IPTR], #32
	fmla	v4.4s, v1.4s, v2.4s
	fmla	v5.4s, v1.4s, v3.4s
	st1	{v4.4s, v5.4s}, [Y_OPTR], #32

	/* elements 8..15 */
	ld1	{v6.4s, v7.4s}, [A_PTR], #32
	ld1	{v8.4s, v9.4s}, [Y_IPTR], #32
	fmla	v8.4s, v1.4s, v6.4s
	fmla	v9.4s, v1.4s, v7.4s
	st1	{v8.4s, v9.4s}, [Y_OPTR], #32
#else /* DOUBLE */
	/* elements 0..3 */
	ld1	{v2.2d, v3.2d}, [A_PTR], #32
	ld1	{v4.2d, v5.2d}, [Y_IPTR], #32
	fmla	v4.2d, v1.2d, v2.2d
	fmla	v5.2d, v1.2d, v3.2d
	st1	{v4.2d, v5.2d}, [Y_OPTR], #32

	/* elements 4..7 */
	ld1	{v6.2d, v7.2d}, [A_PTR], #32
	ld1	{v8.2d, v9.2d}, [Y_IPTR], #32
	fmla	v8.2d, v1.2d, v6.2d
	fmla	v9.2d, v1.2d, v7.2d
	st1	{v8.2d, v9.2d}, [Y_OPTR], #32

	/* elements 8..11 */
	ld1	{v10.2d, v11.2d}, [A_PTR], #32
	ld1	{v12.2d, v13.2d}, [Y_IPTR], #32
	fmla	v12.2d, v1.2d, v10.2d
	fmla	v13.2d, v1.2d, v11.2d
	st1	{v12.2d, v13.2d}, [Y_OPTR], #32

	/* elements 12..15 */
	ld1	{v14.2d, v15.2d}, [A_PTR], #32
	ld1	{v16.2d, v17.2d}, [Y_IPTR], #32
	fmla	v16.2d, v1.2d, v14.2d
	fmla	v17.2d, v1.2d, v15.2d
	st1	{v16.2d, v17.2d}, [Y_OPTR], #32
#endif
.endm
|
||||
|
||||
/*
 * Unit-stride path: y[0..3] += v1 * a[0..3], with the same pointer roles as
 * KERNEL_F16 (A_PTR / Y_IPTR read, Y_OPTR write, all post-incremented).
 * Single precision needs one q register, double precision two.
 */
.macro KERNEL_F4
#if defined(DOUBLE)
	/* elements 0..1 */
	ld1	{v2.2d}, [A_PTR], #16
	ld1	{v3.2d}, [Y_IPTR], #16
	fmla	v3.2d, v1.2d, v2.2d
	st1	{v3.2d}, [Y_OPTR], #16

	/* elements 2..3 */
	ld1	{v4.2d}, [A_PTR], #16
	ld1	{v5.2d}, [Y_IPTR], #16
	fmla	v5.2d, v1.2d, v4.2d
	st1	{v5.2d}, [Y_OPTR], #16
#else
	ld1	{v2.4s}, [A_PTR], #16
	ld1	{v3.4s}, [Y_IPTR], #16
	fmla	v3.4s, v1.4s, v2.4s
	st1	{v3.4s}, [Y_OPTR], #16
#endif
.endm
|
||||
|
||||
/*
 * Unit-stride tail: y += TEMP * a for a single element.
 * Y is both read and written through Y_IPTR here; A_PTR and Y_IPTR each
 * advance by one element.
 */
.macro KERNEL_F1
	ld1	TMPV1, [A_PTR], #SZ		/* a */
	ld1	TMPV2, [Y_IPTR]			/* y */
	fmadd	TMP2, TEMP, TMP1, TMP2		/* y + TEMP * a */
	st1	TMPV2, [Y_IPTR], #SZ
.endm
|
||||
|
||||
/* Strided path set-up: convert the Y stride from elements to bytes. */
.macro INIT_S
	lsl	INC_Y, INC_Y, #SHZ
.endm
|
||||
|
||||
/*
 * Strided path: y += TEMP * a for a single element.
 * A_PTR advances by one element, Y_IPTR by INC_Y bytes (scaled by INIT_S).
 */
.macro KERNEL_S1
	ld1	TMPV1, [A_PTR], #SZ		/* a */
	ld1	TMPV2, [Y_IPTR]			/* y */
	fmadd	TMP2, TEMP, TMP1, TMP2		/* y + TEMP * a */
	st1	TMPV2, [Y_IPTR], INC_Y
.endm
|
||||
|
||||
/*******************************************************************************
|
||||
* End of macro definitions
|
||||
*******************************************************************************/
|
||||
|
||||
PROLOGUE

	/* INC_Y is read from the stack, before SAVE_REGS moves sp. */
	ldr	INC_Y, [sp]

	SAVE_REGS

	/* Nothing to do when either dimension is <= 0. */
	cmp	N, xzr
	ble	gemv_n_kernel_L999
	cmp	M, xzr
	ble	gemv_n_kernel_L999

	lsl	LDA, LDA, #SHZ			/* column distance: elements -> bytes */
	lsl	INC_X, INC_X, #SHZ		/* X stride: elements -> bytes */
	mov	J, N				/* one outer iteration per column */

	/* The vectorised path requires unit stride on Y. */
	cmp	INC_Y, #1
	bne	gemv_n_kernel_S_BEGIN

gemv_n_kernel_F_LOOP:

	/* TEMP = alpha * x[j], then broadcast to every lane of v1. */
	ld1	TEMPV, [X], INC_X
	fmul	TEMP, ALPHA, TEMP
#if defined(DOUBLE)
	ins	v1.d[1], v1.d[0]
#else
	ins	v1.s[1], v1.s[0]
	ins	v1.s[2], v1.s[0]
	ins	v1.s[3], v1.s[0]
#endif
	/* Restart at the top of the column and at the start of Y. */
	mov	A_PTR, A
	mov	Y_IPTR, Y
	mov	Y_OPTR, Y

gemv_n_kernel_F32:

	asr	I, M, #5			/* I = M / 32 */
	cmp	I, xzr
	beq	gemv_n_kernel_F4

gemv_n_kernel_F320:

	KERNEL_F16				/* 2 x 16 = 32 elements per pass */
	KERNEL_F16
	subs	I, I, #1
	bne	gemv_n_kernel_F320

gemv_n_kernel_F4:
	ands	I, M, #31
	asr	I, I, #2			/* I = (M % 32) / 4 */
	cmp	I, xzr
	beq	gemv_n_kernel_F1

gemv_n_kernel_F40:

	KERNEL_F4
	subs	I, I, #1
	bne	gemv_n_kernel_F40

gemv_n_kernel_F1:
	ands	I, M, #3			/* I = M % 4 */
	ble	gemv_n_kernel_F_END

gemv_n_kernel_F10:

	KERNEL_F1
	subs	I, I, #1
	bne	gemv_n_kernel_F10

gemv_n_kernel_F_END:

	add	A, A, LDA			/* next column of A */
	subs	J, J, #1
	bne	gemv_n_kernel_F_LOOP

	b	gemv_n_kernel_L999

gemv_n_kernel_S_BEGIN:

	INIT_S					/* Y stride: elements -> bytes */

gemv_n_kernel_S_LOOP:

	/* TEMP = alpha * x[j]; the strided path works on scalars only. */
	ld1	TEMPV, [X], INC_X
	fmul	TEMP, ALPHA, TEMP
	mov	A_PTR, A
	mov	Y_IPTR, Y

	asr	I, M, #2			/* I = M / 4, body unrolled four times */
	cmp	I, xzr
	ble	gemv_n_kernel_S1

gemv_n_kernel_S4:

	KERNEL_S1
	KERNEL_S1
	KERNEL_S1
	KERNEL_S1
	subs	I, I, #1
	bne	gemv_n_kernel_S4

gemv_n_kernel_S1:

	ands	I, M, #3			/* I = M % 4 */
	ble	gemv_n_kernel_S_END

gemv_n_kernel_S10:

	KERNEL_S1
	subs	I, I, #1
	bne	gemv_n_kernel_S10

gemv_n_kernel_S_END:

	add	A, A, LDA			/* next column of A */
	subs	J, J, #1
	bne	gemv_n_kernel_S_LOOP

gemv_n_kernel_L999:

	mov	w0, wzr				/* return 0 */

	RESTORE_REGS

	ret

	EPILOGUE
|
|
@ -0,0 +1,347 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2015, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
||||
/*
 * Argument and working registers of the gemv_t kernel.
 * In this kernel the inner loops run over M and read X, while the outer loop
 * runs over N and stores one element of Y per iteration.
 */
#define	M	x0	/* inner trip count: elements of X / of one column of A */
#define	N	x1	/* outer trip count: columns of A / elements of Y */
#define	A	x3	/* current column of A */
#define	LDA	x4	/* distance between columns of A, in elements */
#define	X	x5	/* X vector pointer */
#define	INC_X	x6	/* X stride, in elements */
#define	Y	x7	/* Y vector pointer */
#define	INC_Y	x2	/* Y stride, in elements; x2 is reloaded from [sp] at entry */
#define	A_PTR	x9	/* walks down the current column of A */
#define	X_PTR	x10	/* walks along X */
#define	J	x11	/* outer loop counter */
#define	I	x12	/* inner loop counter */

/*******************************************************************************
* Macro definitions
*******************************************************************************/

/*
 * Precision-dependent scalar registers, their ld1/st1 lane forms, the element
 * size in bytes (SZ) and its log2 (SHZ).  REG0 is the zero register used to
 * clear the accumulators.  TEMP1/TMP1 and TEMP2/TMP2 name the same registers.
 */
#if defined(DOUBLE)
#define REG0	xzr
#define ALPHA	d0
#define TEMP	d1
#define TEMP1	d2
#define TEMP2	d3
#define TEMP3	d4
#define TEMPV	{v1.d}[0]
#define TMP1	d2
#define TMPV1	{v2.d}[0]
#define TMP2	d3
#define TMPV2	{v3.d}[0]
#define SZ	8
#define SHZ	3
#else
#define REG0	wzr
#define ALPHA	s0
#define TEMP	s1
#define TEMP1	s2
#define TEMP2	s3
#define TEMP3	s4
#define TEMPV	{v1.s}[0]
#define TMP1	s2
#define TMPV1	{v2.s}[0]
#define TMP2	s3
#define TMPV2	{v3.s}[0]
#define SZ	4
#define SHZ	2
#endif
|
||||
|
||||
/******************************************************************************/
|
||||
|
||||
/*
 * Reserve an 11 x 16 byte stack frame and spill d8-d17 and x18-x28 into it.
 * Must be paired with RESTORE_REGS, which uses the same slot layout.
 */
.macro SAVE_REGS
	sub	sp, sp, #(11 * 16)
	/* floating-point registers: slots 0..4 */
	stp	d8, d9, [sp, #(0 * 16)]
	stp	d10, d11, [sp, #(1 * 16)]
	stp	d12, d13, [sp, #(2 * 16)]
	stp	d14, d15, [sp, #(3 * 16)]
	stp	d16, d17, [sp, #(4 * 16)]
	/* general-purpose registers: slots 5..10 */
	stp	x18, x19, [sp, #(5 * 16)]
	stp	x20, x21, [sp, #(6 * 16)]
	stp	x22, x23, [sp, #(7 * 16)]
	stp	x24, x25, [sp, #(8 * 16)]
	stp	x26, x27, [sp, #(9 * 16)]
	str	x28, [sp, #(10 * 16)]
.endm
|
||||
|
||||
/*
 * Reload d8-d17 and x18-x28 from the frame written by SAVE_REGS and release
 * the 11 x 16 byte frame.
 */
.macro RESTORE_REGS
	/* floating-point registers: slots 0..4 */
	ldp	d8, d9, [sp, #(0 * 16)]
	ldp	d10, d11, [sp, #(1 * 16)]
	ldp	d12, d13, [sp, #(2 * 16)]
	ldp	d14, d15, [sp, #(3 * 16)]
	ldp	d16, d17, [sp, #(4 * 16)]
	/* general-purpose registers: slots 5..10 */
	ldp	x18, x19, [sp, #(5 * 16)]
	ldp	x20, x21, [sp, #(6 * 16)]
	ldp	x22, x23, [sp, #(7 * 16)]
	ldp	x24, x25, [sp, #(8 * 16)]
	ldp	x26, x27, [sp, #(9 * 16)]
	ldr	x28, [sp, #(10 * 16)]
	add	sp, sp, #(11*16)
.endm
|
||||
|
||||
/*
 * Unit-stride path: accumulate 32 products a[i] * x[i] into the four vector
 * accumulators v1..v4.  A_PTR and X_PTR are post-incremented past the 32
 * elements.  The accumulators are merged later by KERNEL_F32_FINALIZE.
 */
.macro KERNEL_F32
#if !defined(DOUBLE)
	/* elements 0..15 */
	ld1	{v5.4s, v6.4s, v7.4s, v8.4s}, [A_PTR], #64
	ld1	{v9.4s, v10.4s, v11.4s, v12.4s}, [X_PTR], #64
	fmla	v1.4s, v5.4s, v9.4s
	fmla	v2.4s, v6.4s, v10.4s
	fmla	v3.4s, v7.4s, v11.4s
	fmla	v4.4s, v8.4s, v12.4s

	/* elements 16..31 */
	ld1	{v13.4s, v14.4s, v15.4s, v16.4s}, [A_PTR], #64
	ld1	{v17.4s, v18.4s, v19.4s, v20.4s}, [X_PTR], #64
	fmla	v1.4s, v13.4s, v17.4s
	fmla	v2.4s, v14.4s, v18.4s
	fmla	v3.4s, v15.4s, v19.4s
	fmla	v4.4s, v16.4s, v20.4s
#else /* DOUBLE */
	/* elements 0..7 */
	ld1	{v5.2d, v6.2d, v7.2d, v8.2d}, [A_PTR], #64
	ld1	{v9.2d, v10.2d, v11.2d, v12.2d}, [X_PTR], #64
	fmla	v1.2d, v5.2d, v9.2d
	fmla	v2.2d, v6.2d, v10.2d
	fmla	v3.2d, v7.2d, v11.2d
	fmla	v4.2d, v8.2d, v12.2d

	/* elements 8..15 */
	ld1	{v13.2d, v14.2d, v15.2d, v16.2d}, [A_PTR], #64
	ld1	{v17.2d, v18.2d, v19.2d, v20.2d}, [X_PTR], #64
	fmla	v1.2d, v13.2d, v17.2d
	fmla	v2.2d, v14.2d, v18.2d
	fmla	v3.2d, v15.2d, v19.2d
	fmla	v4.2d, v16.2d, v20.2d

	/* elements 16..23 */
	ld1	{v5.2d, v6.2d, v7.2d, v8.2d}, [A_PTR], #64
	ld1	{v9.2d, v10.2d, v11.2d, v12.2d}, [X_PTR], #64
	fmla	v1.2d, v5.2d, v9.2d
	fmla	v2.2d, v6.2d, v10.2d
	fmla	v3.2d, v7.2d, v11.2d
	fmla	v4.2d, v8.2d, v12.2d

	/* elements 24..31 */
	ld1	{v13.2d, v14.2d, v15.2d, v16.2d}, [A_PTR], #64
	ld1	{v17.2d, v18.2d, v19.2d, v20.2d}, [X_PTR], #64
	fmla	v1.2d, v13.2d, v17.2d
	fmla	v2.2d, v14.2d, v18.2d
	fmla	v3.2d, v15.2d, v19.2d
	fmla	v4.2d, v16.2d, v20.2d
#endif
.endm
|
||||
|
||||
/* Merge the four vector accumulators of KERNEL_F32 lane-wise into v1. */
.macro KERNEL_F32_FINALIZE
#if defined(DOUBLE)
	fadd	v1.2d, v1.2d, v2.2d
	fadd	v1.2d, v1.2d, v3.2d
	fadd	v1.2d, v1.2d, v4.2d
#else
	fadd	v1.4s, v1.4s, v2.4s
	fadd	v1.4s, v1.4s, v3.4s
	fadd	v1.4s, v1.4s, v4.4s
#endif
.endm
|
||||
|
||||
/*
 * Unit-stride path: accumulate four products a[i] * x[i] into the vector
 * accumulator v1, post-incrementing A_PTR and X_PTR.  Single precision needs
 * one q register per operand, double precision two.
 */
.macro KERNEL_F4
#if defined(DOUBLE)
	/* elements 0..1 */
	ld1	{v2.2d}, [A_PTR], #16
	ld1	{v3.2d}, [X_PTR], #16
	fmla	v1.2d, v2.2d, v3.2d

	/* elements 2..3 */
	ld1	{v4.2d}, [A_PTR], #16
	ld1	{v5.2d}, [X_PTR], #16
	fmla	v1.2d, v4.2d, v5.2d
#else
	ld1	{v2.4s}, [A_PTR], #16
	ld1	{v3.4s}, [X_PTR], #16
	fmla	v1.4s, v2.4s, v3.4s
#endif
.endm
|
||||
|
||||
/* Reduce the vector accumulator v1 to the scalar TEMP (sum of all lanes). */
.macro KERNEL_F4_FINALIZE
#if defined(DOUBLE)
	faddp	TEMP, v1.2d
#else
	ext	v2.16b, v1.16b, v1.16b, #8	/* bring the upper two lanes down */
	fadd	v1.2s, v1.2s, v2.2s
	faddp	TEMP, v1.2s
#endif
.endm
|
||||
|
||||
/*
 * KERNEL_F1: one scalar step of the dot product for contiguous X.
 * Loads one element through A_PTR and one through X_PTR (both post-incremented
 * by SZ bytes) and computes TEMP = TMP1 * TMP2 + TEMP.
 * NOTE(review): TMPV1/TMPV2, TMP1/TMP2 and SZ are defined above this chunk;
 * presumably TMPVn is the lane-0 view of the register holding TMPn - confirm.
 */
.macro KERNEL_F1
|
||||
ld1 TMPV1, [A_PTR], #SZ
|
||||
ld1 TMPV2, [X_PTR], #SZ
|
||||
fmadd TEMP, TMP1, TMP2, TEMP
|
||||
.endm
|
||||
|
||||
/*
 * INIT_S: set-up for the strided-X path. Shifts INC_X left by SHZ so that it
 * can be used as a register post-increment in KERNEL_S1.
 * NOTE(review): SHZ is defined above this chunk; presumably log2 of the
 * element size, turning an element stride into a byte stride - confirm.
 */
.macro INIT_S
|
||||
lsl INC_X, INC_X, #SHZ
|
||||
.endm
|
||||
|
||||
/*
 * KERNEL_S1: one scalar step of the dot product for strided X.
 * Same as KERNEL_F1 except that X_PTR is post-incremented by the register
 * INC_X (already scaled by INIT_S) instead of the immediate SZ.
 * Computes TEMP = TMP1 * TMP2 + TEMP.
 */
.macro KERNEL_S1
|
||||
ld1 TMPV1, [A_PTR], #SZ
|
||||
ld1 TMPV2, [X_PTR], INC_X
|
||||
fmadd TEMP, TMP1, TMP2, TEMP
|
||||
.endm
|
||||
|
||||
/*******************************************************************************
|
||||
* End of macro definitions
|
||||
*******************************************************************************/
|
||||
|
||||
/*
 * Entry point of the gemv_t kernel (labels gemv_t_kernel_*).
 *
 * The outer loop runs N times (counter J). Each iteration accumulates the
 * products of M element pairs read through A_PTR and X_PTR into TEMP, then
 * updates one element of Y as  y = ALPHA * TEMP + y,  and advances A by LDA
 * and Y by INC_Y.
 *
 * INC_Y is read from the stack. LDA and INC_Y are shifted left by SHZ before
 * use; INC_X is shifted by INIT_S on the strided path only.
 * Nothing is computed when N <= 0 or M <= 0. The function returns 0 in w0.
 *
 * NOTE(review): SAVE_REGS/RESTORE_REGS, REG0, TEMP/TEMP1..3, TMP1/TMPV1,
 * ALPHA and the register aliases are defined above this chunk.
 */
PROLOGUE
|
||||
|
||||
ldr INC_Y, [sp]
|
||||
|
||||
SAVE_REGS
|
||||
|
||||
cmp N, xzr
|
||||
ble gemv_t_kernel_L999
|
||||
cmp M, xzr
|
||||
ble gemv_t_kernel_L999
|
||||
|
||||
lsl LDA, LDA, #SHZ
|
||||
lsl INC_Y, INC_Y, #SHZ
|
||||
mov J, N
|
||||
|
||||
/* INC_X == 1 takes the vectorised path below, anything else the strided one */
cmp INC_X, #1
|
||||
bne gemv_t_kernel_S_BEGIN
|
||||
|
||||
gemv_t_kernel_F_LOOP:
|
||||
|
||||
/* clear the accumulators for this outer iteration */
fmov TEMP, REG0
|
||||
fmov TEMP1, REG0
|
||||
fmov TEMP2, REG0
|
||||
fmov TEMP3, REG0
|
||||
|
||||
mov A_PTR, A
|
||||
mov X_PTR, X
|
||||
|
||||
gemv_t_kernel_F32:
|
||||
|
||||
/* I = M / 32: number of KERNEL_F32 iterations */
asr I, M, #5
|
||||
cmp I, xzr
|
||||
beq gemv_t_kernel_F4
|
||||
|
||||
gemv_t_kernel_F320:
|
||||
|
||||
KERNEL_F32
|
||||
|
||||
subs I, I, #1
|
||||
bne gemv_t_kernel_F320
|
||||
|
||||
KERNEL_F32_FINALIZE
|
||||
|
||||
gemv_t_kernel_F4:
|
||||
/* I = (M % 32) / 4: number of KERNEL_F4 iterations */
ands I, M, #31
|
||||
asr I, I, #2
|
||||
cmp I, xzr
|
||||
beq gemv_t_kernel_F1
|
||||
|
||||
gemv_t_kernel_F40:
|
||||
|
||||
KERNEL_F4
|
||||
|
||||
subs I, I, #1
|
||||
bne gemv_t_kernel_F40
|
||||
|
||||
gemv_t_kernel_F1:
|
||||
|
||||
/* always executed: reduces the vector accumulator to the scalar TEMP */
KERNEL_F4_FINALIZE
|
||||
|
||||
/* I = M % 4: remaining elements, one per KERNEL_F1 */
ands I, M, #3
|
||||
ble gemv_t_kernel_F_END
|
||||
|
||||
gemv_t_kernel_F10:
|
||||
|
||||
KERNEL_F1
|
||||
|
||||
subs I, I, #1
|
||||
bne gemv_t_kernel_F10
|
||||
|
||||
gemv_t_kernel_F_END:
|
||||
|
||||
/* the flags set by subs are consumed by the bne below; the fmadd and st1
   in between do not modify them */
ld1 TMPV1, [Y]
|
||||
add A, A, LDA
|
||||
subs J, J, #1
|
||||
fmadd TMP1, ALPHA, TEMP, TMP1
|
||||
st1 TMPV1, [Y], INC_Y
|
||||
bne gemv_t_kernel_F_LOOP
|
||||
|
||||
b gemv_t_kernel_L999
|
||||
|
||||
gemv_t_kernel_S_BEGIN:
|
||||
|
||||
INIT_S
|
||||
|
||||
gemv_t_kernel_S_LOOP:
|
||||
|
||||
fmov TEMP, REG0
|
||||
mov A_PTR, A
|
||||
mov X_PTR, X
|
||||
|
||||
/* I = M / 4: the loop body is KERNEL_S1 unrolled four times */
asr I, M, #2
|
||||
cmp I, xzr
|
||||
ble gemv_t_kernel_S1
|
||||
|
||||
gemv_t_kernel_S4:
|
||||
|
||||
KERNEL_S1
|
||||
KERNEL_S1
|
||||
KERNEL_S1
|
||||
KERNEL_S1
|
||||
|
||||
subs I, I, #1
|
||||
bne gemv_t_kernel_S4
|
||||
|
||||
gemv_t_kernel_S1:
|
||||
|
||||
ands I, M, #3
|
||||
ble gemv_t_kernel_S_END
|
||||
|
||||
gemv_t_kernel_S10:
|
||||
|
||||
KERNEL_S1
|
||||
|
||||
subs I, I, #1
|
||||
bne gemv_t_kernel_S10
|
||||
|
||||
gemv_t_kernel_S_END:
|
||||
|
||||
ld1 TMPV1, [Y]
|
||||
add A, A, LDA
|
||||
subs J, J, #1
|
||||
fmadd TMP1, ALPHA, TEMP, TMP1
|
||||
st1 TMPV1, [Y], INC_Y
|
||||
bne gemv_t_kernel_S_LOOP
|
||||
|
||||
gemv_t_kernel_L999:
|
||||
|
||||
RESTORE_REGS
|
||||
|
||||
mov w0, wzr
|
||||
ret
|
||||
|
||||
EPILOGUE
|
|
@ -0,0 +1,124 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2015, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
||||
#define N x0 /* vector length */
|
||||
#define X x1 /* X vector address */
|
||||
#define INC_X x2 /* X stride */
|
||||
#define INDEX x3 /* index of max/min value */
|
||||
#define Z x4 /* vector index */
|
||||
#define I x5 /* loop variable */
|
||||
|
||||
/*******************************************************************************
|
||||
* Macro definitions
|
||||
*******************************************************************************/
|
||||
|
||||
#if defined(USE_MIN)
|
||||
#define COND le
|
||||
#else
|
||||
#define COND ge
|
||||
#endif
|
||||
|
||||
#define MAXF d0
|
||||
#define TMPF d1
|
||||
#define TMPVF {v1.d}[0]
|
||||
#define SZ 8
|
||||
|
||||
/******************************************************************************/
|
||||
|
||||
/*
 * INIT_S: start the scan with the first element.
 * Multiplies INC_X by 8 (lsl #3, matching SZ = 8), loads the first element
 * into lane 0 of v0 (d0 = MAXF) while advancing X by INC_X, sets the running
 * position Z and the result INDEX to 1, and replaces MAXF by its absolute
 * value.
 */
.macro INIT_S
|
||||
lsl INC_X, INC_X, #3
|
||||
ld1 {v0.d}[0], [X], INC_X
|
||||
mov Z, #1
|
||||
mov INDEX, Z
|
||||
fabs MAXF, MAXF
|
||||
.endm
|
||||
|
||||
/*
 * KERNEL_S1: examine the next element.
 * Loads it into TMPF (advancing X by INC_X), increments Z, takes the absolute
 * value and compares MAXF with it. When COND holds (ge by default, le when
 * USE_MIN is defined) MAXF and INDEX are kept; otherwise MAXF = TMPF and
 * INDEX = Z. On equality COND holds, so the earlier index is kept.
 */
.macro KERNEL_S1
|
||||
ld1 TMPVF, [X], INC_X
|
||||
add Z, Z, #1
|
||||
fabs TMPF, TMPF
|
||||
fcmp MAXF, TMPF
|
||||
fcsel MAXF, MAXF, TMPF, COND
|
||||
csel INDEX, INDEX, Z, COND
|
||||
.endm
|
||||
|
||||
/*******************************************************************************
|
||||
* End of macro definitions
|
||||
*******************************************************************************/
|
||||
|
||||
/*
 * Entry point (labels iamax_kernel_*), double precision elements.
 * Scans N elements of X with stride INC_X and returns in x0 the 1-based
 * position selected by INIT_S/KERNEL_S1 (largest absolute value, or smallest
 * when USE_MIN is defined). Returns 0 when N <= 0 or INC_X <= 0.
 */
PROLOGUE
|
||||
|
||||
cmp N, xzr
|
||||
ble iamax_kernel_zero
|
||||
cmp INC_X, xzr
|
||||
ble iamax_kernel_zero
|
||||
|
||||
INIT_S
|
||||
|
||||
/* the first element was consumed by INIT_S; N now counts the rest */
subs N, N, #1
|
||||
ble iamax_kernel_L999
|
||||
|
||||
/* I = N / 4: the loop body is KERNEL_S1 unrolled four times */
asr I, N, #2
|
||||
cmp I, xzr
|
||||
ble iamax_kernel_S1
|
||||
|
||||
iamax_kernel_S4:
|
||||
|
||||
KERNEL_S1
|
||||
KERNEL_S1
|
||||
KERNEL_S1
|
||||
KERNEL_S1
|
||||
|
||||
subs I, I, #1
|
||||
bne iamax_kernel_S4
|
||||
|
||||
iamax_kernel_S1:
|
||||
|
||||
/* I = N % 4: remaining elements */
ands I, N, #3
|
||||
ble iamax_kernel_L999
|
||||
|
||||
iamax_kernel_S10:
|
||||
|
||||
KERNEL_S1
|
||||
|
||||
subs I, I, #1
|
||||
bne iamax_kernel_S10
|
||||
|
||||
iamax_kernel_L999:
|
||||
|
||||
mov x0, INDEX
|
||||
ret
|
||||
|
||||
iamax_kernel_zero:
|
||||
|
||||
mov x0, xzr
|
||||
ret
|
||||
|
||||
EPILOGUE
|
|
@ -0,0 +1,213 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2015, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
||||
#define N x0 /* vector length */
|
||||
#define X x1 /* X vector address */
|
||||
#define INC_X x2 /* X stride */
|
||||
#define INDEX x3 /* index of max/min value */
|
||||
#define Z x4 /* vector index */
|
||||
#define I x5 /* loop variable */
|
||||
#define X_COPY x6 /* copy of X address */
|
||||
#define MAXF_Z x7
|
||||
|
||||
/*******************************************************************************
|
||||
* Macro definitions
|
||||
*******************************************************************************/
|
||||
|
||||
#define MAXF s5
|
||||
#define TMPF s6
|
||||
#define TMPVF {v6.s}[0]
|
||||
#define SZ 4
|
||||
|
||||
/******************************************************************************/
|
||||
|
||||
/*
 * INIT_F1: start a scalar scan of contiguous data with the first element.
 * Loads it into MAXF (X post-incremented by SZ), sets the running position Z
 * and the result INDEX to 1, and replaces MAXF by its absolute value.
 */
.macro INIT_F1
|
||||
ldr MAXF, [X], #SZ
|
||||
mov Z, #1
|
||||
mov INDEX, Z
|
||||
fabs MAXF, MAXF
|
||||
.endm
|
||||
|
||||
/*
 * KERNEL_F1: examine the next contiguous element.
 * Z is incremented BEFORE it is recorded, so on entry Z must be the 1-based
 * position of the element processed last.
 * If |element| <= MAXF the current MAXF and INDEX are kept (ties keep the
 * earlier index); otherwise MAXF = |element| and INDEX = Z.
 */
.macro KERNEL_F1
|
||||
ldr TMPF, [X], #SZ
|
||||
add Z, Z, #1
|
||||
fabs TMPF, TMPF
|
||||
fcmp TMPF, MAXF
|
||||
fcsel MAXF, MAXF, TMPF, le
|
||||
csel INDEX, INDEX, Z, le
|
||||
.endm
|
||||
|
||||
/*
 * INIT_F4: start the 4-wide scan with the first four elements.
 * MAXF becomes the largest absolute value among them (fmaxv across v0).
 * MAXF_Z = 1 records the 1-based position at which that block starts, and
 * Z = 5 is the position of the first element following the block.
 */
.macro INIT_F4
|
||||
ld1 {v0.4s}, [X], #16
|
||||
fabs v0.4s, v0.4s
|
||||
fmaxv MAXF, v0.4s
|
||||
mov Z, #5
|
||||
mov MAXF_Z, #1
|
||||
.endm
|
||||
|
||||
/*
 * KERNEL_F4: examine the next block of four contiguous elements.
 * TMPF is the largest absolute value in the block. If TMPF <= MAXF nothing
 * changes; otherwise MAXF = TMPF and MAXF_Z = Z, i.e. the position at which
 * this block starts. Z is advanced by 4 afterwards, so it is used here before
 * being updated. Also issues a prefetch 512 bytes ahead of X.
 */
.macro KERNEL_F4
|
||||
ld1 {v0.4s}, [X], #16
|
||||
fabs v0.4s, v0.4s
|
||||
fmaxv TMPF, v0.4s
|
||||
PRFM PLDL1KEEP, [X, #512]
|
||||
fcmp TMPF, MAXF
|
||||
fcsel MAXF, MAXF, TMPF, le
|
||||
csel MAXF_Z, MAXF_Z, Z, le
|
||||
add Z, Z, #4
|
||||
.endm
|
||||
|
||||
|
||||
/*
 * KERNEL_F4_FINALIZE: turn the block-level result of the 4-wide scan into an
 * element index.
 *
 * On entry MAXF is the largest absolute value seen so far and MAXF_Z is the
 * 1-based position at which the block containing it starts. The macro
 * re-reads that block through X_COPY (which must still point at the first
 * element of X) and sets INDEX to the position of the first element whose
 * absolute value equals MAXF; if none of the first three matches, the fourth
 * is assumed.
 *
 * Clobbers MAXF_Z, X_COPY, TMPF and the condition flags.
 * The macro defines a global label and can therefore be expanded only once.
 */
.macro KERNEL_F4_FINALIZE
	/*
	 * INIT_F4/KERNEL_F4 leave Z at the position of the first element that
	 * has NOT been scanned yet, whereas the scalar tail (KERNEL_F1)
	 * increments Z before recording it. Step Z back to the last scanned
	 * element so that the tail reports correct positions.
	 */
	sub	Z, Z, #1

	mov	INDEX, MAXF_Z
	/* byte offset of the winning block: (MAXF_Z - 1) * 4 */
	sub	MAXF_Z, MAXF_Z, #1
	lsl	MAXF_Z, MAXF_Z, #2
	add	X_COPY, X_COPY, MAXF_Z

	/* element 0 of the block */
	ldr	TMPF, [X_COPY], #SZ
	fabs	TMPF, TMPF
	fcmp	TMPF, MAXF
	beq	KERNEL_F4_FINALIZE_DONE
	add	INDEX, INDEX, #1

	/* element 1 of the block */
	ldr	TMPF, [X_COPY], #SZ
	fabs	TMPF, TMPF
	fcmp	TMPF, MAXF
	beq	KERNEL_F4_FINALIZE_DONE
	add	INDEX, INDEX, #1

	/* element 2 of the block; otherwise fall through to element 3 */
	ldr	TMPF, [X_COPY], #SZ
	fabs	TMPF, TMPF
	fcmp	TMPF, MAXF
	beq	KERNEL_F4_FINALIZE_DONE
	add	INDEX, INDEX, #1
KERNEL_F4_FINALIZE_DONE:
.endm
|
||||
|
||||
|
||||
/*
 * INIT_S: start the strided scan with the first element.
 * Multiplies INC_X by 4 (lsl #2, matching SZ = 4), loads the first element
 * into lane 0 of v6 (TMPF) while advancing X by INC_X, sets Z and INDEX to 1
 * and stores the absolute value of the element in MAXF.
 */
.macro INIT_S
|
||||
lsl INC_X, INC_X, #2
|
||||
ld1 TMPVF, [X], INC_X
|
||||
mov Z, #1
|
||||
mov INDEX, Z
|
||||
fabs MAXF, TMPF
|
||||
.endm
|
||||
|
||||
/*
 * KERNEL_S1: examine the next strided element.
 * Loads it into TMPF (advancing X by INC_X), increments Z and takes the
 * absolute value. If it is <= MAXF the current MAXF and INDEX are kept (ties
 * keep the earlier index); otherwise MAXF = TMPF and INDEX = Z.
 */
.macro KERNEL_S1
|
||||
ld1 TMPVF, [X], INC_X
|
||||
add Z, Z, #1
|
||||
fabs TMPF, TMPF
|
||||
fcmp TMPF, MAXF
|
||||
fcsel MAXF, MAXF, TMPF, le
|
||||
csel INDEX, INDEX, Z, le
|
||||
.endm
|
||||
|
||||
/*******************************************************************************
|
||||
* End of macro definitions
|
||||
*******************************************************************************/
|
||||
|
||||
/*
 * Entry point (labels iamax_kernel_*), single precision elements.
 * Returns in x0 the 1-based position of the element of X with the largest
 * absolute value as selected by the macros above, or 0 when N <= 0 or
 * INC_X <= 0.
 *
 * INC_X == 1: blocks of four elements are scanned with INIT_F4/KERNEL_F4,
 *             KERNEL_F4_FINALIZE converts the winning block into INDEX, and
 *             the remaining N % 4 elements are handled by KERNEL_F1. When
 *             N < 4 the scan starts with INIT_F1 instead.
 * otherwise:  element-by-element scan with INIT_S/KERNEL_S1.
 */
PROLOGUE
|
||||
|
||||
cmp N, xzr
|
||||
ble iamax_kernel_zero
|
||||
cmp INC_X, xzr
|
||||
ble iamax_kernel_zero
|
||||
|
||||
PRFM PLDL1KEEP, [X]
|
||||
/* keep the start of X; KERNEL_F4_FINALIZE re-reads the winning block from it */
mov X_COPY, X
|
||||
|
||||
cmp INC_X, #1
|
||||
bne iamax_kernel_S_BEGIN
|
||||
|
||||
iamax_kernel_F_BEGIN:
|
||||
/* I = N / 4: number of 4-element blocks */
asr I, N, #2
|
||||
cmp I, xzr
|
||||
beq iamax_kernel_F1_INIT
|
||||
|
||||
/* the first block is consumed by INIT_F4 */
INIT_F4
|
||||
subs I, I, #1
|
||||
beq iamax_kernel_F4_FINALIZE
|
||||
|
||||
iamax_kernel_F4:
|
||||
KERNEL_F4
|
||||
subs I, I, #1
|
||||
bne iamax_kernel_F4
|
||||
|
||||
iamax_kernel_F4_FINALIZE:
|
||||
KERNEL_F4_FINALIZE
|
||||
|
||||
iamax_kernel_F1:
|
||||
/* I = N % 4: elements left for the scalar tail */
ands I, N, #3
|
||||
ble iamax_kernel_L999
|
||||
|
||||
iamax_kernel_F10:
|
||||
KERNEL_F1
|
||||
subs I, I, #1
|
||||
bne iamax_kernel_F10
|
||||
b iamax_kernel_L999
|
||||
|
||||
iamax_kernel_F1_INIT:
|
||||
/* fewer than four elements: INIT_F1 consumes the first one, so N is reduced
   by one before the tail loop computes its trip count from it */
INIT_F1
|
||||
subs N, N, #1
|
||||
b iamax_kernel_F1
|
||||
|
||||
iamax_kernel_S_BEGIN:
|
||||
INIT_S
|
||||
|
||||
/* the first element was consumed by INIT_S; N now counts the rest */
subs N, N, #1
|
||||
ble iamax_kernel_L999
|
||||
|
||||
asr I, N, #2
|
||||
cmp I, xzr
|
||||
ble iamax_kernel_S1
|
||||
|
||||
iamax_kernel_S4:
|
||||
KERNEL_S1
|
||||
KERNEL_S1
|
||||
KERNEL_S1
|
||||
KERNEL_S1
|
||||
|
||||
subs I, I, #1
|
||||
bne iamax_kernel_S4
|
||||
|
||||
iamax_kernel_S1:
|
||||
ands I, N, #3
|
||||
ble iamax_kernel_L999
|
||||
|
||||
iamax_kernel_S10:
|
||||
KERNEL_S1
|
||||
subs I, I, #1
|
||||
bne iamax_kernel_S10
|
||||
|
||||
iamax_kernel_L999:
|
||||
mov x0, INDEX
|
||||
ret
|
||||
|
||||
iamax_kernel_zero:
|
||||
mov x0, xzr
|
||||
ret
|
||||
|
||||
EPILOGUE
|
|
@ -0,0 +1,151 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2015, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
||||
#define N x0 /* vector length */
|
||||
#define X x1 /* X vector address */
|
||||
#define INC_X x2 /* X stride */
|
||||
#define INDEX x3 /* index of max/min value */
|
||||
#define Z x4 /* vector index */
|
||||
#define I x5 /* loop variable */
|
||||
|
||||
/*******************************************************************************
|
||||
* Macro definitions
|
||||
*******************************************************************************/
|
||||
|
||||
#if defined(USE_MIN)
|
||||
#define COND le
|
||||
#else
|
||||
#define COND ge
|
||||
#endif
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
#define MAXF s0
|
||||
#define TMPF s1
|
||||
#define TMPVF {v1.s}[0]
|
||||
#define SZ 4
|
||||
#else
|
||||
#define MAXF d0
|
||||
#define TMPF d1
|
||||
#define TMPVF {v1.d}[0]
|
||||
#define SZ 8
|
||||
#endif
|
||||
|
||||
/******************************************************************************/
|
||||
|
||||
/*
 * INIT_S: start the scan with the first entry.
 * Each entry consists of two consecutive values (presumably the real and
 * imaginary part of one complex element - confirm); INC_X is multiplied by
 * 8 (single) or 16 (double) accordingly. MAXF is set to the sum of the
 * absolute values of the two loaded values, and Z and INDEX are set to 1.
 *   single: ext moves lane 1 of v0 into lane 0 of v1 so that a scalar fadd
 *           can add the two lanes.
 *   double: faddp adds the two lanes directly.
 */
.macro INIT_S
|
||||
#if !defined(DOUBLE)
|
||||
lsl INC_X, INC_X, #3
|
||||
ld1 {v0.2s}, [X], INC_X
|
||||
mov Z, #1
|
||||
mov INDEX, Z
|
||||
fabs v0.2s, v0.2s
|
||||
ext v1.8b, v0.8b, v0.8b, #4
|
||||
fadd MAXF, s0, s1
|
||||
#else
|
||||
lsl INC_X, INC_X, #4
|
||||
ld1 {v0.2d}, [X], INC_X
|
||||
mov Z, #1
|
||||
mov INDEX, Z
|
||||
fabs v0.2d, v0.2d
|
||||
faddp MAXF, v0.2d
|
||||
#endif
|
||||
.endm
|
||||
|
||||
/*
 * KERNEL_S1: examine the next two-value entry.
 * TMPF becomes the sum of the absolute values of the two loaded values
 * (v2 is used as scratch in the single precision variant) and Z is
 * incremented. MAXF is then compared with TMPF: when COND holds (ge by
 * default, le when USE_MIN is defined) MAXF and INDEX are kept; otherwise
 * MAXF = TMPF and INDEX = Z. On equality the earlier index is kept.
 */
.macro KERNEL_S1
|
||||
#if !defined(DOUBLE)
|
||||
ld1 {v1.2s}, [X], INC_X
|
||||
add Z, Z, #1
|
||||
fabs v1.2s, v1.2s
|
||||
ext v2.8b, v1.8b, v1.8b, #4
|
||||
fadd TMPF, s1, s2
|
||||
#else
|
||||
ld1 {v1.2d}, [X], INC_X
|
||||
add Z, Z, #1
|
||||
fabs v1.2d, v1.2d
|
||||
faddp TMPF, v1.2d
|
||||
#endif
|
||||
fcmp MAXF, TMPF
|
||||
fcsel MAXF, MAXF, TMPF, COND
|
||||
csel INDEX, INDEX, Z, COND
|
||||
.endm
|
||||
|
||||
/*******************************************************************************
|
||||
* End of macro definitions
|
||||
*******************************************************************************/
|
||||
|
||||
/*
 * Entry point (labels iamax_kernel_*), two-value entries.
 * Scans N entries of X with stride INC_X and returns in x0 the 1-based
 * position selected by INIT_S/KERNEL_S1 (largest sum of absolute values, or
 * smallest when USE_MIN is defined). Returns 0 when N <= 0 or INC_X <= 0.
 */
PROLOGUE
|
||||
|
||||
cmp N, xzr
|
||||
ble iamax_kernel_zero
|
||||
cmp INC_X, xzr
|
||||
ble iamax_kernel_zero
|
||||
|
||||
INIT_S
|
||||
|
||||
/* the first entry was consumed by INIT_S; N now counts the rest */
subs N, N, #1
|
||||
ble iamax_kernel_L999
|
||||
|
||||
/* I = N / 4: the loop body is KERNEL_S1 unrolled four times */
asr I, N, #2
|
||||
cmp I, xzr
|
||||
ble iamax_kernel_S1
|
||||
|
||||
iamax_kernel_S4:
|
||||
|
||||
KERNEL_S1
|
||||
KERNEL_S1
|
||||
KERNEL_S1
|
||||
KERNEL_S1
|
||||
|
||||
subs I, I, #1
|
||||
bne iamax_kernel_S4
|
||||
|
||||
iamax_kernel_S1:
|
||||
|
||||
/* I = N % 4: remaining entries */
ands I, N, #3
|
||||
ble iamax_kernel_L999
|
||||
|
||||
iamax_kernel_S10:
|
||||
|
||||
KERNEL_S1
|
||||
|
||||
subs I, I, #1
|
||||
bne iamax_kernel_S10
|
||||
|
||||
iamax_kernel_L999:
|
||||
|
||||
mov x0, INDEX
|
||||
ret
|
||||
|
||||
iamax_kernel_zero:
|
||||
|
||||
mov x0, xzr
|
||||
ret
|
||||
|
||||
EPILOGUE
|
|
@ -0,0 +1,243 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2015, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
||||
#define N x0 /* vector length */
|
||||
#define X x1 /* X vector address */
|
||||
#define INC_X x2 /* X stride */
|
||||
#define Y x3 /* Y vector address */
|
||||
#define INC_Y x4 /* Y stride */
|
||||
#define I x5 /* loop variable */
|
||||
|
||||
/*******************************************************************************
|
||||
* Macro definitions
|
||||
*******************************************************************************/
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
#define C s0 /* rotation coefficient C: x_new = C*x + S*y */
|
||||
#define S s1 /* rotation coefficient S: y_new = C*y - S*x */
|
||||
#else
|
||||
#define C d0 /* rotation coefficient C: x_new = C*x + S*y */
|
||||
#define S d1 /* rotation coefficient S: y_new = C*y - S*x */
|
||||
#endif
|
||||
|
||||
/******************************************************************************/
|
||||
|
||||
/*
 * INIT: copy C from lane 0 of v0 into lane 1, so that the two low lanes of
 * v0 both hold C (2 x float, or 2 x double when DOUBLE is defined).
 */
.macro INIT
|
||||
#if !defined(DOUBLE)
|
||||
ins v0.s[1], v0.s[0] // [C, C]
|
||||
#else
|
||||
ins v0.d[1], v0.d[0] // [C, C]
|
||||
#endif
|
||||
.endm
|
||||
|
||||
/*
 * INIT_F1: prepare v1 for the one-pair-at-a-time kernels (KERNEL_F1 and
 * KERNEL_S1). Lane 0 of v1 keeps S and lane 1 receives -S. v2 is used as
 * scratch. The bracket comments list lanes from high to low.
 */
.macro INIT_F1
|
||||
#if !defined(DOUBLE)
|
||||
fneg s2, S
|
||||
ins v1.s[1], v2.s[0] // [-S, S]
|
||||
#else
|
||||
fneg d2, S
|
||||
ins v1.d[1], v2.d[0] // [-S, S]
|
||||
#endif
|
||||
.endm
|
||||
|
||||
/*
 * KERNEL_F1: rotate one (x, y) pair of contiguous vectors in place.
 * x and y are packed into lanes 0 and 1 of v2, v3 holds them swapped, and
 *   v4 = v2 * [C, C] + v3 * [-S, S]
 * yields C*x + S*y in lane 0 (stored to X) and C*y - S*x in lane 1 (stored
 * to Y). X and Y are post-incremented by one element.
 * Requires v0 as set up by INIT and v1 as set up by INIT_F1.
 * The bracket comments list lanes from high to low.
 */
.macro KERNEL_F1
|
||||
#if !defined(DOUBLE)
|
||||
ld1 {v2.s}[0], [X]
|
||||
ld1 {v2.s}[1], [Y] // [Y, X]
|
||||
ext v3.8b, v2.8b, v2.8b, #4 // [X, Y]
|
||||
fmul v4.2s, v2.2s, v0.2s // [C*Y, C*X]
|
||||
fmla v4.2s, v3.2s, v1.2s // [C*Y - S*X, C*X + S*Y]
|
||||
st1 {v4.s}[0], [X], #4
|
||||
st1 {v4.s}[1], [Y], #4
|
||||
#else
|
||||
ld1 {v2.d}[0], [X]
|
||||
ld1 {v2.d}[1], [Y] // [Y, X]
|
||||
ext v3.16b, v2.16b, v2.16b, #8 // [X, Y]
|
||||
fmul v4.2d, v2.2d, v0.2d // [C*Y, C*X]
|
||||
fmla v4.2d, v3.2d, v1.2d // [C*Y - S*X, C*X + S*Y]
|
||||
st1 {v4.d}[0], [X], #8
|
||||
st1 {v4.d}[1], [Y], #8
|
||||
#endif
|
||||
.endm
|
||||
|
||||
/*
 * KERNEL_INIT_F4: broadcast the coefficients for KERNEL_F4.
 *   single: widens v0 from [C, C] (set by INIT) to four lanes of C and fills
 *           all four lanes of v1 with S.
 *   double: fills both lanes of v1 with S; v0 already holds [C, C] from INIT.
 */
.macro KERNEL_INIT_F4
|
||||
#if !defined(DOUBLE)
|
||||
ins v0.d[1], v0.d[0] // [C, C, C, C]
|
||||
ins v1.s[1], v1.s[0]
|
||||
ins v1.d[1], v1.d[0] // [S, S, S, S]
|
||||
#else
|
||||
ins v1.d[1], v1.d[0] // [S, S]
|
||||
#endif
|
||||
.endm
|
||||
|
||||
/*
 * KERNEL_F4: rotate four contiguous (x, y) pairs in place.
 *   x_new = C*x + S*y   (stored to X)
 *   y_new = C*y - S*x   (stored to Y, computed from the ORIGINAL x values
 *                        that are still held in registers)
 * X and Y are post-incremented by four elements. The double precision
 * variant additionally prefetches 512 bytes ahead of X and Y.
 * Requires v0 and v1 as set up by INIT and KERNEL_INIT_F4.
 */
.macro KERNEL_F4
|
||||
#if !defined(DOUBLE)
|
||||
ld1 {v2.4s}, [X]
|
||||
fmul v4.4s, v0.4s, v2.4s // C*X3, C*X2, C*X1, C*X0
|
||||
ld1 {v3.4s}, [Y]
|
||||
fmla v4.4s, v1.4s, v3.4s // C*X3+S*Y3, ..., C*X0+S*Y0
|
||||
st1 {v4.4s}, [X], #16
|
||||
fmul v5.4s, v0.4s, v3.4s // C*Y3, C*Y2, C*Y1, C*Y0
|
||||
fmls v5.4s, v1.4s, v2.4s // C*Y3-S*X3, ..., C*Y0-S*X0
|
||||
st1 {v5.4s}, [Y], #16
|
||||
#else // DOUBLE
|
||||
ld1 {v2.2d, v3.2d}, [X]
|
||||
fmul v6.2d, v0.2d, v2.2d // C*X1, C*X0
|
||||
fmul v7.2d, v0.2d, v3.2d // C*X3, C*X2
|
||||
ld1 {v4.2d, v5.2d}, [Y]
|
||||
fmla v6.2d, v1.2d, v4.2d // C*X1+S*Y1, C*X0+S*Y0
|
||||
fmla v7.2d, v1.2d, v5.2d // C*X3+S*Y3, C*X2+S*Y2
|
||||
st1 {v6.2d, v7.2d}, [X], #32
|
||||
fmul v16.2d, v0.2d, v4.2d // C*Y1, C*Y0
|
||||
fmul v17.2d, v0.2d, v5.2d // C*Y3, C*Y2
|
||||
fmls v16.2d, v1.2d, v2.2d // C*Y1-S*X1, C*Y0-S*X0
|
||||
fmls v17.2d, v1.2d, v3.2d // C*Y3-S*X3, C*Y2-S*X2
|
||||
st1 {v16.2d, v17.2d}, [Y], #32
|
||||
PRFM PLDL1KEEP, [X, #512]
|
||||
PRFM PLDL1KEEP, [Y, #512]
|
||||
#endif
|
||||
.endm
|
||||
|
||||
/*
 * INIT_S: set-up for the strided path. Multiplies INC_X and INC_Y by the
 * element size (4 for single, 8 for double) so that KERNEL_S1 can use them
 * as register post-increments.
 */
.macro INIT_S
|
||||
#if !defined(DOUBLE)
|
||||
lsl INC_X, INC_X, #2
|
||||
lsl INC_Y, INC_Y, #2
|
||||
#else
|
||||
lsl INC_X, INC_X, #3
|
||||
lsl INC_Y, INC_Y, #3
|
||||
#endif
|
||||
.endm
|
||||
|
||||
/*
 * KERNEL_S1: rotate one (x, y) pair of strided vectors in place.
 * Same computation as KERNEL_F1, but X and Y are post-incremented by the
 * registers INC_X and INC_Y (already scaled by INIT_S).
 * Requires v0 as set up by INIT and v1 as set up by INIT_F1.
 * The bracket comments list lanes from high to low.
 */
.macro KERNEL_S1
|
||||
#if !defined(DOUBLE)
|
||||
ld1 {v2.s}[0], [X]
|
||||
ld1 {v2.s}[1], [Y] // [Y, X]
|
||||
ext v3.8b, v2.8b, v2.8b, #4 // [X, Y]
|
||||
fmul v4.2s, v2.2s, v0.2s // [C*Y, C*X]
|
||||
fmla v4.2s, v3.2s, v1.2s // [C*Y - S*X, C*X + S*Y]
|
||||
st1 {v4.s}[0], [X], INC_X
|
||||
st1 {v4.s}[1], [Y], INC_Y
|
||||
#else
|
||||
ld1 {v2.d}[0], [X]
|
||||
ld1 {v2.d}[1], [Y] // [Y, X]
|
||||
ext v3.16b, v2.16b, v2.16b, #8 // [X, Y]
|
||||
fmul v4.2d, v2.2d, v0.2d // [C*Y, C*X]
|
||||
fmla v4.2d, v3.2d, v1.2d // [C*Y - S*X, C*X + S*Y]
|
||||
st1 {v4.d}[0], [X], INC_X
|
||||
st1 {v4.d}[1], [Y], INC_Y
|
||||
#endif
|
||||
|
||||
.endm
|
||||
|
||||
/*******************************************************************************
|
||||
* End of macro definitions
|
||||
*******************************************************************************/
|
||||
|
||||
/*
 * Entry point of the rot kernel (labels rot_kernel_*).
 *
 * Applies, in place and for N element pairs,
 *     x_new = C*x + S*y
 *     y_new = C*y - S*x
 * Nothing is done when N <= 0. The function returns 0 in w0.
 *
 * INC_X == 1 and INC_Y == 1: four pairs per iteration with KERNEL_F4, then
 *                            the remaining N % 4 pairs with KERNEL_F1.
 * otherwise:                 one pair at a time with KERNEL_S1 (unrolled
 *                            four times in the main loop).
 */
	PROLOGUE

	cmp	N, xzr
	ble	rot_kernel_L999

	INIT

	cmp	INC_X, #1
	bne	rot_kernel_S_BEGIN
	cmp	INC_Y, #1
	bne	rot_kernel_S_BEGIN

rot_kernel_F_BEGIN:

	/* I = N / 4: number of KERNEL_F4 iterations */
	asr	I, N, #2
	cmp	I, xzr
	beq	rot_kernel_F1

	KERNEL_INIT_F4

rot_kernel_F4:

	KERNEL_F4

	subs	I, I, #1
	bne	rot_kernel_F4

rot_kernel_F1:

	/* I = N % 4: pairs left for the scalar tail */
	ands	I, N, #3
	ble	rot_kernel_L999

	/* rewrites lane 1 of v1 to -S, as required by KERNEL_F1 */
	INIT_F1

rot_kernel_F10:

	KERNEL_F1

	subs	I, I, #1
	bne	rot_kernel_F10

	mov	w0, wzr
	ret

rot_kernel_S_BEGIN:

	INIT_S
	INIT_F1

	/* I = N / 4: the loop body is KERNEL_S1 unrolled four times */
	asr	I, N, #2
	cmp	I, xzr
	ble	rot_kernel_S1

rot_kernel_S4:

	KERNEL_S1
	KERNEL_S1
	KERNEL_S1
	KERNEL_S1

	subs	I, I, #1
	bne	rot_kernel_S4

rot_kernel_S1:

	ands	I, N, #3
	ble	rot_kernel_L999

rot_kernel_S10:

	KERNEL_S1

	subs	I, I, #1
	bne	rot_kernel_S10

rot_kernel_L999:

	mov	w0, wzr
	ret

	/* closes the PROLOGUE above, as every sibling kernel does */
	EPILOGUE
|
|
@ -0,0 +1,253 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2015, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
||||
#define N x0 /* vector length */
|
||||
#define X x3 /* X vector address */
|
||||
#define X_COPY x5 /* second pointer into X, used for the stores in KERNEL_S4 */
|
||||
#define INC_X x4 /* X stride */
|
||||
#define I x1 /* loop variable */
|
||||
|
||||
/*******************************************************************************
|
||||
* Macro definitions
|
||||
*******************************************************************************/
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
#define DA s0 /* scale input value */
|
||||
#define DAV {v0.s}[0]
|
||||
#define TMPF s1
|
||||
#define TMPVF {v1.s}[0]
|
||||
#define SZ 4
|
||||
#else
|
||||
#define DA d0 /* scale input value */
|
||||
#define DAV {v0.d}[0]
|
||||
#define TMPF d1
|
||||
#define TMPVF {v1.d}[0]
|
||||
#define SZ 8
|
||||
#endif
|
||||
|
||||
/******************************************************************************/
|
||||
|
||||
/*
 * KERNEL_F1: scale one contiguous element in place: x = x * DA.
 * X is post-incremented by SZ bytes on the store.
 */
.macro KERNEL_F1
|
||||
|
||||
ldr TMPF, [X]
|
||||
fmul TMPF, TMPF, DA
|
||||
str TMPF, [X], #SZ
|
||||
|
||||
.endm
|
||||
|
||||
/*
 * KERNEL_INIT_F8: broadcast the scale factor from lane 0 of v0 to every lane
 * of v0 (four lanes for single, two for double) for use by KERNEL_F8.
 */
.macro KERNEL_INIT_F8
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
ins v0.s[1], v0.s[0]
|
||||
ins v0.s[2], v0.s[0]
|
||||
ins v0.s[3], v0.s[0]
|
||||
#else
|
||||
ins v0.d[1], v0.d[0]
|
||||
#endif
|
||||
|
||||
.endm
|
||||
|
||||
/*
 * KERNEL_F8: scale eight contiguous elements in place by the factor
 * broadcast in v0 (see KERNEL_INIT_F8).
 *   single: two 4-lane registers, X advanced by 32 bytes
 *   double: four 2-lane registers, X advanced by 64 bytes
 * Afterwards a prefetch is issued 1024 bytes ahead of the updated X.
 */
.macro KERNEL_F8
|
||||
#if !defined(DOUBLE)
|
||||
ld1 {v1.4s, v2.4s}, [X]
|
||||
fmul v1.4s, v1.4s, v0.4s
|
||||
fmul v2.4s, v2.4s, v0.4s
|
||||
st1 {v1.4s, v2.4s}, [X], #32
|
||||
#else // DOUBLE
|
||||
ld1 {v1.2d, v2.2d, v3.2d, v4.2d}, [X]
|
||||
fmul v1.2d, v1.2d, v0.2d
|
||||
fmul v2.2d, v2.2d, v0.2d
|
||||
fmul v3.2d, v3.2d, v0.2d
|
||||
fmul v4.2d, v4.2d, v0.2d
|
||||
st1 {v1.2d, v2.2d, v3.2d, v4.2d}, [X], #64
|
||||
#endif
|
||||
PRFM PLDL1KEEP, [X, #1024]
|
||||
.endm
|
||||
|
||||
/*
 * INIT_S: multiply INC_X by the element size (4 for single, 8 for double) so
 * that it can be used as a byte offset by the strided kernels.
 */
.macro INIT_S
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
lsl INC_X, INC_X, #2
|
||||
#else
|
||||
lsl INC_X, INC_X, #3
|
||||
#endif
|
||||
|
||||
.endm
|
||||
|
||||
/*
 * KERNEL_S1: scale one strided element in place: x = x * DA.
 * The result is stored from lane 0 of v1 (TMPVF) and X is post-incremented
 * by the register INC_X (already scaled by INIT_S).
 */
.macro KERNEL_S1
|
||||
ldr TMPF, [X]
|
||||
fmul TMPF, TMPF, DA
|
||||
st1 TMPVF, [X], INC_X
|
||||
.endm
|
||||
|
||||
/*
 * KERNEL_S4: scale four strided elements in place.
 * Each element is loaded through X and stored through X_COPY; both pointers
 * are advanced by INC_X after every element, so X_COPY must equal X on
 * entry. Four different temporaries (s1..s4 / d1..d4) are used, one per
 * element; the scale factor is read from s0 / d0.
 */
.macro KERNEL_S4
|
||||
#if !defined(DOUBLE)
|
||||
ldr s1, [X]
|
||||
add X, X, INC_X
|
||||
fmul s1, s1, s0
|
||||
str s1, [X_COPY]
|
||||
add X_COPY, X_COPY, INC_X
|
||||
|
||||
ldr s2, [X]
|
||||
add X, X, INC_X
|
||||
fmul s2, s2, s0
|
||||
str s2, [X_COPY]
|
||||
add X_COPY, X_COPY, INC_X
|
||||
|
||||
ldr s3, [X]
|
||||
add X, X, INC_X
|
||||
fmul s3, s3, s0
|
||||
str s3, [X_COPY]
|
||||
add X_COPY, X_COPY, INC_X
|
||||
|
||||
ldr s4, [X]
|
||||
add X, X, INC_X
|
||||
fmul s4, s4, s0
|
||||
str s4, [X_COPY]
|
||||
add X_COPY, X_COPY, INC_X
|
||||
#else
|
||||
ldr d1, [X]
|
||||
add X, X, INC_X
|
||||
fmul d1, d1, d0
|
||||
str d1, [X_COPY]
|
||||
add X_COPY, X_COPY, INC_X
|
||||
|
||||
ldr d2, [X]
|
||||
add X, X, INC_X
|
||||
fmul d2, d2, d0
|
||||
str d2, [X_COPY]
|
||||
add X_COPY, X_COPY, INC_X
|
||||
|
||||
ldr d3, [X]
|
||||
add X, X, INC_X
|
||||
fmul d3, d3, d0
|
||||
str d3, [X_COPY]
|
||||
add X_COPY, X_COPY, INC_X
|
||||
|
||||
ldr d4, [X]
|
||||
add X, X, INC_X
|
||||
fmul d4, d4, d0
|
||||
str d4, [X_COPY]
|
||||
add X_COPY, X_COPY, INC_X
|
||||
#endif
|
||||
.endm
|
||||
|
||||
/*******************************************************************************
|
||||
* End of macro definitions
|
||||
*******************************************************************************/
|
||||
|
||||
/*
 * Entry point of the scal kernel (labels scal_kernel_*).
 *
 * Scales N elements of X in place by DA. Nothing is done when N <= 0.
 * The function returns 0 in w0 on every path.
 *
 * DA == 0.0:  every element is overwritten with DA itself (no multiply).
 * INC_X == 1: eight elements per iteration with KERNEL_F8, then the
 *             remaining N % 8 elements with KERNEL_F1.
 * otherwise:  four elements per iteration with KERNEL_S4, then the remaining
 *             N % 4 elements with KERNEL_S1.
 */
PROLOGUE
|
||||
|
||||
cmp N, xzr
|
||||
ble scal_kernel_L999
|
||||
|
||||
fcmp DA, #0.0
|
||||
beq scal_kernel_zero
|
||||
|
||||
cmp INC_X, #1
|
||||
bne scal_kernel_S_BEGIN
|
||||
|
||||
scal_kernel_F_BEGIN:
|
||||
|
||||
/* I = N / 8: number of KERNEL_F8 iterations */
asr I, N, #3
|
||||
cmp I, xzr
|
||||
beq scal_kernel_F1
|
||||
|
||||
KERNEL_INIT_F8
|
||||
|
||||
scal_kernel_F8:
|
||||
|
||||
KERNEL_F8
|
||||
|
||||
subs I, I, #1
|
||||
bne scal_kernel_F8
|
||||
|
||||
scal_kernel_F1:
|
||||
|
||||
/* I = N % 8: elements left for the scalar tail */
ands I, N, #7
|
||||
ble scal_kernel_L999
|
||||
|
||||
scal_kernel_F10:
|
||||
|
||||
KERNEL_F1
|
||||
|
||||
subs I, I, #1
|
||||
bne scal_kernel_F10
|
||||
|
||||
mov w0, wzr
|
||||
ret
|
||||
|
||||
scal_kernel_S_BEGIN:
|
||||
|
||||
INIT_S
|
||||
/* KERNEL_S4 loads through X and stores through X_COPY */
mov X_COPY, X
|
||||
|
||||
asr I, N, #2
|
||||
cmp I, xzr
|
||||
ble scal_kernel_S1
|
||||
|
||||
scal_kernel_S4:
|
||||
|
||||
KERNEL_S4
|
||||
|
||||
subs I, I, #1
|
||||
bne scal_kernel_S4
|
||||
|
||||
scal_kernel_S1:
|
||||
|
||||
ands I, N, #3
|
||||
ble scal_kernel_L999
|
||||
|
||||
scal_kernel_S10:
|
||||
|
||||
KERNEL_S1
|
||||
|
||||
subs I, I, #1
|
||||
bne scal_kernel_S10
|
||||
|
||||
scal_kernel_L999:
|
||||
|
||||
mov w0, wzr
|
||||
ret
|
||||
|
||||
scal_kernel_zero:
|
||||
|
||||
INIT_S
|
||||
|
||||
scal_kernel_Z1:
|
||||
|
||||
/* store DA (which compared equal to 0.0) to each element, N times */
st1 DAV, [X], INC_X
|
||||
subs N, N, #1
|
||||
bne scal_kernel_Z1
|
||||
|
||||
mov w0, wzr
|
||||
ret
|
||||
|
||||
EPILOGUE
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,178 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2015, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
||||
#define N x0 /* vector length */
|
||||
#define X x1 /* X vector address */
|
||||
#define INC_X x2 /* X stride */
|
||||
#define I x5 /* loop variable */
|
||||
|
||||
/*******************************************************************************
|
||||
* Macro definitions
|
||||
*******************************************************************************/
|
||||
|
||||
#define TMPF s6
|
||||
#define SSQ s0
|
||||
#define TMPVF {v6.s}[0]
|
||||
#define SZ 4
|
||||
|
||||
/******************************************************************************/
|
||||
|
||||
.macro INIT_F1
|
||||
ldr TMPF, [X], #SZ
|
||||
fmul SSQ, TMPF, TMPF
|
||||
.endm
|
||||
|
||||
.macro KERNEL_F1
|
||||
ldr TMPF, [X], #SZ
|
||||
fmul TMPF, TMPF, TMPF
|
||||
fadd SSQ, SSQ, TMPF
|
||||
.endm
|
||||
|
||||
/*
 * INIT_F4: seed the SSQ accumulator from the first four contiguous floats.
 * Loads four floats from X (post-incrementing X by 16 bytes), squares them
 * and reduces the four squares to a single scalar written to SSQ.
 * Clobbers v1 and v2.
 */
.macro INIT_F4
|
||||
ld1 {v1.4s}, [X], #16 // v1 = four consecutive floats, X += 16
|
||||
fmul v1.4s, v1.4s, v1.4s // square each lane
|
||||
ext v2.16b, v1.16b, v1.16b, #8 // v2 = v1 with its two 64-bit halves swapped
|
||||
fadd v2.2s, v1.2s, v2.2s // low two lanes: lane0+lane2, lane1+lane3
|
||||
faddp SSQ, v2.2s // SSQ = sum of all four squares
|
||||
.endm
|
||||
|
||||
/*
 * KERNEL_F4: add the squares of the next four contiguous floats to SSQ.
 * Same reduction as INIT_F4, but the partial sum goes to TMPF and is then
 * accumulated into SSQ. X is post-incremented by 16 bytes.
 * Clobbers v1, v2 and TMPF.
 */
.macro KERNEL_F4
|
||||
ld1 {v1.4s}, [X], #16 // v1 = four consecutive floats, X += 16
|
||||
fmul v1.4s, v1.4s, v1.4s // square each lane
|
||||
ext v2.16b, v1.16b, v1.16b, #8 // v2 = v1 with its two 64-bit halves swapped
|
||||
fadd v2.2s, v1.2s, v2.2s // low two lanes: lane0+lane2, lane1+lane3
|
||||
faddp TMPF, v2.2s // TMPF = sum of the four squares
|
||||
fadd SSQ, SSQ, TMPF // accumulate into the running sum
|
||||
.endm
|
||||
|
||||
/*
 * INIT_S: set up the strided path. Converts INC_X from an element count to
 * a byte stride (x4, matching SZ = 4), loads the first element and seeds
 * SSQ with its square. X is left pointing at the second element.
 */
.macro INIT_S
|
||||
lsl INC_X, INC_X, #2 // INC_X *= 4: stride in bytes
|
||||
ld1 TMPVF, [X], INC_X // lane 0 of v6 = *X, X += INC_X
|
||||
fmul SSQ, TMPF, TMPF // SSQ = x[0]^2
|
||||
.endm
|
||||
|
||||
.macro KERNEL_S1
|
||||
ld1 TMPVF, [X], INC_X
|
||||
fmul TMPF, TMPF, TMPF
|
||||
fadd SSQ, SSQ, TMPF
|
||||
.endm
|
||||
|
||||
/*******************************************************************************
|
||||
* End of macro definitions
|
||||
*******************************************************************************/
|
||||
|
||||
/*
 * Entry point: Euclidean norm of a single-precision vector.
 * Inputs (see the register #defines at the top of the file):
 *   N (x0) vector length, X (x1) vector address, INC_X (x2) stride in
 *   elements.
 * Result: sqrt(sum of x[i]^2) in SSQ (s0); 0.0 when N <= 0 or INC_X <= 0.
 * The first element (or first group of four) initialises SSQ, so the
 * remaining-element counts below are all relative to what is left after
 * that initial step.
 * NOTE(review): squares are accumulated directly with no scaling, so very
 * large or very small inputs may overflow/underflow - confirm acceptable.
 */
PROLOGUE
|
||||
|
||||
cmp N, xzr
|
||||
ble nrm2_kernel_zero // N <= 0: return 0.0
|
||||
cmp INC_X, xzr
|
||||
ble nrm2_kernel_zero // INC_X <= 0: return 0.0
|
||||
cmp INC_X, #1
|
||||
bne nrm2_kernel_S_BEGIN // non-unit stride: take the strided path
|
||||
|
||||
nrm2_kernel_F_BEGIN:
|
||||
|
||||
asr I, N, #2 // I = N / 4 (number of 4-element groups)
|
||||
cmp I, xzr
|
||||
beq nrm2_kernel_F1_INIT // fewer than 4 elements: seed SSQ from one element
|
||||
|
||||
INIT_F4 // first group initialises SSQ
|
||||
subs I, I, #1
|
||||
beq nrm2_kernel_F1 // only one group: skip the vector loop
|
||||
|
||||
nrm2_kernel_F4:
|
||||
|
||||
KERNEL_F4
|
||||
|
||||
subs I, I, #1
|
||||
bne nrm2_kernel_F4
|
||||
|
||||
nrm2_kernel_F1:
|
||||
|
||||
ands I, N, #3 // I = elements left over after the groups of 4
|
||||
ble nrm2_kernel_L999 // none left
|
||||
|
||||
nrm2_kernel_F10:
|
||||
|
||||
KERNEL_F1
|
||||
|
||||
subs I, I, #1
|
||||
bne nrm2_kernel_F10
|
||||
|
||||
b nrm2_kernel_L999
|
||||
|
||||
nrm2_kernel_F1_INIT:
|
||||
INIT_F1 // SSQ = x[0]^2
|
||||
subs N, N, #1 // one element consumed; N < 4 here, so N & 3 at F1 is the true remainder
|
||||
b nrm2_kernel_F1
|
||||
|
||||
nrm2_kernel_S_BEGIN:
|
||||
|
||||
INIT_S // INC_X -> bytes, SSQ = x[0]^2
|
||||
|
||||
subs N, N, #1 // first element already consumed by INIT_S
|
||||
ble nrm2_kernel_L999
|
||||
|
||||
asr I, N, #2 // groups of 4 among the remaining elements
|
||||
cmp I, xzr
|
||||
ble nrm2_kernel_S1
|
||||
|
||||
nrm2_kernel_S4:
|
||||
|
||||
KERNEL_S1
|
||||
KERNEL_S1
|
||||
KERNEL_S1
|
||||
KERNEL_S1
|
||||
|
||||
subs I, I, #1
|
||||
bne nrm2_kernel_S4
|
||||
|
||||
nrm2_kernel_S1:
|
||||
|
||||
ands I, N, #3 // leftover elements after the unrolled loop
|
||||
ble nrm2_kernel_L999
|
||||
|
||||
nrm2_kernel_S10:
|
||||
|
||||
KERNEL_S1
|
||||
|
||||
subs I, I, #1
|
||||
bne nrm2_kernel_S10
|
||||
|
||||
nrm2_kernel_L999:
|
||||
fsqrt SSQ, SSQ // result = sqrt(sum of squares)
|
||||
ret
|
||||
|
||||
nrm2_kernel_zero:
|
||||
fmov SSQ, wzr // result = 0.0
|
||||
|
||||
ret
|
||||
|
||||
EPILOGUE
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,266 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2015, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
||||
#define N x0 /* vector length */
|
||||
#define X x3 /* X vector address */
|
||||
#define INC_X x4 /* X stride */
|
||||
#define Y x5 /* Y vector address */
|
||||
#define INC_Y x6 /* Y stride */
|
||||
#define I x1 /* loop variable */
|
||||
|
||||
/*******************************************************************************
|
||||
* Macro definitions
|
||||
*******************************************************************************/
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
#define TMP0 s0
|
||||
#define TMPV0 {v0.s}[0]
|
||||
#define TMP1 s1
|
||||
#define TMPV1 {v1.s}[0]
|
||||
#define SZ 4
|
||||
#else
|
||||
#define TMP0 d0
|
||||
#define TMPV0 {v0.d}[0]
|
||||
#define TMP1 d1
|
||||
#define TMPV1 {v1.d}[0]
|
||||
#define SZ 8
|
||||
#endif
|
||||
|
||||
/******************************************************************************/
|
||||
|
||||
/*
 * KERNEL_F1: swap one element between X and Y on the unit-stride path and
 * advance both pointers by one element.
 *   real:           one scalar of SZ bytes via TMP0/TMP1
 *   complex single: 8 bytes via v0/v1
 *   complex double: 16 bytes via v0/v1
 * Both loads are issued before either store, so the exchange is correct
 * even though the stores post-increment the pointers.
 */
.macro KERNEL_F1
|
||||
|
||||
#if !defined(COMPLEX)
|
||||
ldr TMP0, [X]
|
||||
ldr TMP1, [Y]
|
||||
str TMP0, [Y], #SZ // Y[i] = old X[i], Y += SZ
|
||||
str TMP1, [X], #SZ // X[i] = old Y[i], X += SZ
|
||||
#else
|
||||
#if !defined(DOUBLE)
|
||||
ld1 {v0.2s}, [X]
|
||||
ld1 {v1.2s}, [Y]
|
||||
st1 {v0.2s}, [Y], #8 // Y[i] = old X[i] (both halves), Y += 8
|
||||
st1 {v1.2s}, [X], #8 // X[i] = old Y[i] (both halves), X += 8
|
||||
#else
|
||||
ld1 {v0.2d}, [X]
|
||||
ld1 {v1.2d}, [Y]
|
||||
st1 {v0.2d}, [Y], #16 // Y[i] = old X[i] (both halves), Y += 16
|
||||
st1 {v1.2d}, [X], #16 // X[i] = old Y[i] (both halves), X += 16
|
||||
#endif
|
||||
#endif
|
||||
|
||||
.endm
|
||||
|
||||
.macro KERNEL_F8
|
||||
|
||||
#if !defined(COMPLEX)
|
||||
#if !defined(DOUBLE)
|
||||
ld1 {v0.4s, v1.4s}, [X]
|
||||
ld1 {v2.4s, v3.4s}, [Y]
|
||||
st1 {v0.4s, v1.4s}, [Y], #32
|
||||
st1 {v2.4s, v3.4s}, [X], #32
|
||||
#else // DOUBLE
|
||||
ld1 {v0.4s, v1.4s}, [X]
|
||||
ld1 {v2.4s, v3.4s}, [Y]
|
||||
st1 {v0.4s, v1.4s}, [Y], #32
|
||||
st1 {v2.4s, v3.4s}, [X], #32
|
||||
ld1 {v0.4s, v1.4s}, [X]
|
||||
ld1 {v2.4s, v3.4s}, [Y]
|
||||
st1 {v0.4s, v1.4s}, [Y], #32
|
||||
st1 {v2.4s, v3.4s}, [X], #32
|
||||
#endif
|
||||
#else // COMPLEX
|
||||
#if !defined(DOUBLE)
|
||||
ld1 {v0.4s, v1.4s}, [X]
|
||||
ld1 {v2.4s, v3.4s}, [Y]
|
||||
st1 {v0.4s, v1.4s}, [Y], #32
|
||||
st1 {v2.4s, v3.4s}, [X], #32
|
||||
ld1 {v0.4s, v1.4s}, [X]
|
||||
ld1 {v2.4s, v3.4s}, [Y]
|
||||
st1 {v0.4s, v1.4s}, [Y], #32
|
||||
st1 {v2.4s, v3.4s}, [X], #32
|
||||
#else // DOUBLE
|
||||
ld1 {v0.4s, v1.4s}, [X]
|
||||
ld1 {v2.4s, v3.4s}, [Y]
|
||||
st1 {v0.4s, v1.4s}, [Y], #32
|
||||
st1 {v2.4s, v3.4s}, [X], #32
|
||||
ld1 {v0.4s, v1.4s}, [X]
|
||||
ld1 {v2.4s, v3.4s}, [Y]
|
||||
st1 {v0.4s, v1.4s}, [Y], #32
|
||||
st1 {v2.4s, v3.4s}, [X], #32
|
||||
ld1 {v0.4s, v1.4s}, [X]
|
||||
ld1 {v2.4s, v3.4s}, [Y]
|
||||
st1 {v0.4s, v1.4s}, [Y], #32
|
||||
st1 {v2.4s, v3.4s}, [X], #32
|
||||
ld1 {v0.4s, v1.4s}, [X]
|
||||
ld1 {v2.4s, v3.4s}, [Y]
|
||||
st1 {v0.4s, v1.4s}, [Y], #32
|
||||
st1 {v2.4s, v3.4s}, [X], #32
|
||||
#endif
|
||||
#endif
|
||||
|
||||
.endm
|
||||
|
||||
.macro INIT_S
|
||||
|
||||
#if !defined(COMPLEX)
|
||||
#if !defined(DOUBLE)
|
||||
lsl INC_X, INC_X, #2
|
||||
lsl INC_Y, INC_Y, #2
|
||||
#else
|
||||
lsl INC_X, INC_X, #3
|
||||
lsl INC_Y, INC_Y, #3
|
||||
#endif
|
||||
#else
|
||||
#if !defined(DOUBLE)
|
||||
lsl INC_X, INC_X, #3
|
||||
lsl INC_Y, INC_Y, #3
|
||||
#else
|
||||
lsl INC_X, INC_X, #4
|
||||
lsl INC_Y, INC_Y, #4
|
||||
#endif
|
||||
#endif
|
||||
|
||||
.endm
|
||||
|
||||
.macro KERNEL_S1
|
||||
|
||||
#if !defined(COMPLEX)
|
||||
#if !defined(DOUBLE)
|
||||
ldr w10, [X]
|
||||
ldr w11, [Y]
|
||||
str w10, [Y]
|
||||
str w11, [X]
|
||||
#else
|
||||
ldr x10, [X]
|
||||
ldr x11, [Y]
|
||||
str x10, [Y]
|
||||
str x11, [X]
|
||||
#endif
|
||||
#else
|
||||
#if !defined(DOUBLE)
|
||||
ldr x10, [X]
|
||||
ldr x11, [Y]
|
||||
str x10, [Y]
|
||||
str x11, [X]
|
||||
#else
|
||||
ldr x10, [X]
|
||||
ldr x11, [Y]
|
||||
str x10, [Y]
|
||||
str x11, [X]
|
||||
|
||||
ldr x12, [X, #8]
|
||||
ldr x13, [Y, #8]
|
||||
str x12, [Y, #8]
|
||||
str x13, [X, #8]
|
||||
#endif
|
||||
#endif
|
||||
add Y, Y, INC_Y
|
||||
add X, X, INC_X
|
||||
.endm
|
||||
|
||||
/*******************************************************************************
|
||||
* End of macro definitions
|
||||
*******************************************************************************/
|
||||
|
||||
PROLOGUE
|
||||
|
||||
cmp N, xzr
|
||||
ble swap_kernel_L999
|
||||
|
||||
cmp INC_X, #1
|
||||
bne swap_kernel_S_BEGIN
|
||||
cmp INC_Y, #1
|
||||
bne swap_kernel_S_BEGIN
|
||||
|
||||
swap_kernel_F_BEGIN:
|
||||
|
||||
asr I, N, #3
|
||||
cmp I, xzr
|
||||
beq swap_kernel_F1
|
||||
|
||||
swap_kernel_F8:
|
||||
|
||||
KERNEL_F8
|
||||
|
||||
subs I, I, #1
|
||||
bne swap_kernel_F8
|
||||
|
||||
swap_kernel_F1:
|
||||
|
||||
ands I, N, #7
|
||||
ble swap_kernel_L999
|
||||
|
||||
swap_kernel_F10:
|
||||
|
||||
KERNEL_F1
|
||||
|
||||
subs I, I, #1
|
||||
bne swap_kernel_F10
|
||||
|
||||
b swap_kernel_L999
|
||||
|
||||
|
||||
swap_kernel_S_BEGIN:
|
||||
|
||||
INIT_S
|
||||
|
||||
asr I, N, #2
|
||||
cmp I, xzr
|
||||
ble swap_kernel_S1
|
||||
|
||||
swap_kernel_S4:
|
||||
|
||||
KERNEL_S1
|
||||
KERNEL_S1
|
||||
KERNEL_S1
|
||||
KERNEL_S1
|
||||
|
||||
subs I, I, #1
|
||||
bne swap_kernel_S4
|
||||
|
||||
swap_kernel_S1:
|
||||
|
||||
ands I, N, #3
|
||||
ble swap_kernel_L999
|
||||
|
||||
swap_kernel_S10:
|
||||
|
||||
KERNEL_S1
|
||||
|
||||
subs I, I, #1
|
||||
bne swap_kernel_S10
|
||||
|
||||
swap_kernel_L999:
|
||||
|
||||
mov w0, wzr
|
||||
ret
|
||||
|
||||
EPILOGUE
|
|
@ -0,0 +1,273 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2015, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
||||
#define N x0 /* vector length */
|
||||
#define X x1 /* X vector address */
|
||||
#define INC_X x2 /* X stride */
|
||||
#define I x5 /* loop variable */
|
||||
|
||||
/*******************************************************************************
|
||||
* Macro definitions
|
||||
*******************************************************************************/
|
||||
|
||||
#if defined(USE_MIN)
|
||||
#define COND le
|
||||
#else
|
||||
#define COND ge
|
||||
#endif
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
#define REG0 wzr
|
||||
#define MAXF s0
|
||||
#define TMPF s1
|
||||
#define TMPVF {v1.s}[0]
|
||||
#define SZ 4
|
||||
#else
|
||||
#define REG0 xzr
|
||||
#define MAXF d0
|
||||
#define TMPF d1
|
||||
#define TMPVF {v1.d}[0]
|
||||
#define SZ 8
|
||||
#endif
|
||||
|
||||
/******************************************************************************/
|
||||
|
||||
.macro INIT_F1
|
||||
#if !defined(DOUBLE)
|
||||
ld1 {v0.2s}, [X], #8
|
||||
fabs v0.2s, v0.2s
|
||||
ext v1.8b, v0.8b, v0.8b, #4
|
||||
fadd MAXF, s0, s1
|
||||
#else
|
||||
ld1 {v0.2d}, [X], #16
|
||||
fabs v0.2d, v0.2d
|
||||
faddp MAXF, v0.2d
|
||||
#endif
|
||||
.endm
|
||||
|
||||
/*
 * KERNEL_F1: fold one contiguous complex element into MAXF.
 * Loads the two components of the element at X (post-incrementing X by
 * one complex element), computes the sum of their absolute values in TMPF,
 * then keeps MAXF or TMPF according to COND (ge by default, le when
 * USE_MIN is defined). Clobbers v1 and, in single precision, v2.
 */
.macro KERNEL_F1
|
||||
#if !defined(DOUBLE)
|
||||
ld1 {v1.2s}, [X], #8 // v1 = both floats of the element, X += 8
|
||||
fabs v1.2s, v1.2s
|
||||
ext v2.8b, v1.8b, v1.8b, #4 // v2 = v1 with its two lanes swapped, so s2 = second component
|
||||
fadd TMPF, s1, s2 // TMPF = sum of the two absolute values
|
||||
#else
|
||||
ld1 {v1.2d}, [X], #16 // v1 = both doubles of the element, X += 16
|
||||
fabs v1.2d, v1.2d
|
||||
faddp TMPF, v1.2d // TMPF = sum of the two absolute values
|
||||
#endif
|
||||
fcmp MAXF, TMPF
|
||||
fcsel MAXF, MAXF, TMPF, COND // keep MAXF if COND holds, else take TMPF
|
||||
.endm
|
||||
|
||||
.macro INIT_F4
|
||||
#if !defined(DOUBLE)
|
||||
ld2 {v0.4s,v1.4s}, [X], #32
|
||||
fabs v0.4s, v0.4s // [X6, X4, X2, X0]
|
||||
fabs v1.4s, v1.4s // [X7, X5, X3, X1]
|
||||
fadd v0.4s, v0.4s, v1.4s // [X7+X6, X5+X4, X3+X2, X1+X0]
|
||||
#if defined(USE_MIN)
|
||||
fminv MAXF, v0.4s
|
||||
#else
|
||||
fmaxv MAXF, v0.4s
|
||||
#endif
|
||||
#else // DOUBLE
|
||||
ld4 {v0.2d,v1.2d,v2.2d,v3.2d}, [X], #64
|
||||
fabs v0.2d, v0.2d
|
||||
fabs v1.2d, v1.2d
|
||||
fabs v2.2d, v2.2d
|
||||
fabs v3.2d, v3.2d
|
||||
fadd v0.2d, v0.2d, v1.2d
|
||||
fadd v2.2d, v2.2d, v3.2d
|
||||
#if defined(USE_MIN)
|
||||
fmin v0.2d, v0.2d, v2.2d
|
||||
fminp MAXF, v0.2d
|
||||
#else
|
||||
fmax v0.2d, v0.2d, v2.2d
|
||||
fmaxp MAXF, v0.2d
|
||||
#endif
|
||||
#endif
|
||||
.endm
|
||||
|
||||
/*
 * KERNEL_F4: fold the next four contiguous complex elements into MAXF.
 * Computes |re| + |im| for each of the four elements, reduces those four
 * values to their max (min when USE_MIN is defined) in TMPF, then keeps
 * MAXF or TMPF according to COND. X is post-incremented past the four
 * elements. Clobbers v1-v2 (single) or v1-v4 (double).
 *
 * The per-block reduction must land in TMPF in both precisions: if the
 * DOUBLE branch wrote it to MAXF, the running result would be overwritten
 * before the final compare/select and earlier blocks would be forgotten.
 */
.macro KERNEL_F4
|
||||
#if !defined(DOUBLE)
|
||||
ld2 {v1.4s,v2.4s}, [X], #32
|
||||
fabs v1.4s, v1.4s // [X6, X4, X2, X0]
|
||||
fabs v2.4s, v2.4s // [X7, X5, X3, X1]
|
||||
fadd v1.4s, v1.4s, v2.4s // [X7+X6, X5+X4, X3+X2, X1+X0]
|
||||
#if defined(USE_MIN)
|
||||
fminv TMPF, v1.4s
|
||||
#else
|
||||
fmaxv TMPF, v1.4s
|
||||
#endif
|
||||
#else // DOUBLE
|
||||
ld4 {v1.2d,v2.2d,v3.2d,v4.2d}, [X], #64 // de-interleave: v1/v2 = re/im of elements 0,2; v3/v4 = re/im of elements 1,3
|
||||
fabs v1.2d, v1.2d
|
||||
fabs v2.2d, v2.2d
|
||||
fabs v3.2d, v3.2d
|
||||
fabs v4.2d, v4.2d
|
||||
fadd v1.2d, v1.2d, v2.2d // |re|+|im| of elements 0 and 2
|
||||
fadd v3.2d, v3.2d, v4.2d // |re|+|im| of elements 1 and 3
|
||||
#if defined(USE_MIN)
|
||||
fmin v1.2d, v1.2d, v3.2d
|
||||
fminp TMPF, v1.2d // block minimum goes to TMPF, not MAXF
|
||||
#else
|
||||
fmax v1.2d, v1.2d, v3.2d
|
||||
fmaxp TMPF, v1.2d // block maximum goes to TMPF, not MAXF
|
||||
#endif
|
||||
#endif
|
||||
fcmp MAXF, TMPF
|
||||
fcsel MAXF, MAXF, TMPF, COND // keep MAXF if COND holds, else take TMPF
|
||||
.endm
|
||||
|
||||
.macro INIT_S
|
||||
#if !defined(DOUBLE)
|
||||
lsl INC_X, INC_X, #3
|
||||
ld1 {v0.2s}, [X], INC_X
|
||||
fabs v0.2s, v0.2s
|
||||
ext v1.8b, v0.8b, v0.8b, #4
|
||||
fadd MAXF, s0, s1
|
||||
#else
|
||||
lsl INC_X, INC_X, #4
|
||||
ld1 {v0.2d}, [X], INC_X
|
||||
fabs v0.2d, v0.2d
|
||||
faddp MAXF, v0.2d
|
||||
#endif
|
||||
.endm
|
||||
|
||||
.macro KERNEL_S1
|
||||
#if !defined(DOUBLE)
|
||||
ld1 {v1.2s}, [X], INC_X
|
||||
fabs v1.2s, v1.2s
|
||||
ext v2.8b, v1.8b, v1.8b, #4
|
||||
fadd TMPF, s1, s2
|
||||
#else
|
||||
ld1 {v1.2d}, [X], INC_X
|
||||
fabs v1.2d, v1.2d
|
||||
faddp TMPF, v1.2d
|
||||
#endif
|
||||
fcmp MAXF, TMPF
|
||||
fcsel MAXF, MAXF, TMPF, COND
|
||||
.endm
|
||||
|
||||
/*******************************************************************************
|
||||
* End of macro definitions
|
||||
*******************************************************************************/
|
||||
|
||||
PROLOGUE
|
||||
|
||||
cmp N, xzr
|
||||
ble amax_kernel_zero
|
||||
cmp INC_X, xzr
|
||||
ble amax_kernel_zero
|
||||
|
||||
cmp INC_X, #1
|
||||
bne amax_kernel_S_BEGIN
|
||||
|
||||
amax_kernel_F_BEGIN:
|
||||
|
||||
asr I, N, #2
|
||||
cmp I, xzr
|
||||
beq amax_kernel_F1_INIT
|
||||
|
||||
INIT_F4
|
||||
subs I, I, #1
|
||||
beq amax_kernel_F1
|
||||
|
||||
amax_kernel_F4:
|
||||
|
||||
KERNEL_F4
|
||||
|
||||
subs I, I, #1
|
||||
bne amax_kernel_F4
|
||||
|
||||
amax_kernel_F1:
|
||||
|
||||
ands I, N, #3
|
||||
ble amax_kernel_L999
|
||||
|
||||
amax_kernel_F10:
|
||||
|
||||
KERNEL_F1
|
||||
|
||||
subs I, I, #1
|
||||
bne amax_kernel_F10
|
||||
|
||||
ret
|
||||
|
||||
amax_kernel_F1_INIT:
|
||||
|
||||
INIT_F1
|
||||
subs N, N, #1
|
||||
b amax_kernel_F1
|
||||
|
||||
amax_kernel_S_BEGIN:
|
||||
|
||||
INIT_S
|
||||
|
||||
subs N, N, #1
|
||||
ble amax_kernel_L999
|
||||
|
||||
asr I, N, #2
|
||||
cmp I, xzr
|
||||
ble amax_kernel_S1
|
||||
|
||||
amax_kernel_S4:
|
||||
|
||||
KERNEL_S1
|
||||
KERNEL_S1
|
||||
KERNEL_S1
|
||||
KERNEL_S1
|
||||
|
||||
subs I, I, #1
|
||||
bne amax_kernel_S4
|
||||
|
||||
amax_kernel_S1:
|
||||
|
||||
ands I, N, #3
|
||||
ble amax_kernel_L999
|
||||
|
||||
amax_kernel_S10:
|
||||
|
||||
KERNEL_S1
|
||||
|
||||
subs I, I, #1
|
||||
bne amax_kernel_S10
|
||||
|
||||
amax_kernel_L999:
|
||||
|
||||
ret
|
||||
|
||||
amax_kernel_zero:
|
||||
|
||||
fmov MAXF, REG0
|
||||
ret
|
||||
|
||||
EPILOGUE
|
|
@ -0,0 +1,164 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2015, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
||||
#define N x0 /* vector length */
|
||||
#define X x1 /* X vector address */
|
||||
#define INC_X x2 /* X stride */
|
||||
#define I x5 /* loop variable */
|
||||
|
||||
/*******************************************************************************
|
||||
* Macro definitions
|
||||
*******************************************************************************/
|
||||
|
||||
#define REG0 xzr
|
||||
#define SUMF d0
|
||||
#define TMPF d1
|
||||
#define TMPVF {v1.d}[0]
|
||||
#define SZ 8
|
||||
|
||||
/******************************************************************************/
|
||||
|
||||
.macro KERNEL_F1
|
||||
ld1 {v1.2d}, [X], #16
|
||||
fabs v1.2d, v1.2d
|
||||
faddp TMPF, v1.2d
|
||||
fadd SUMF, SUMF, TMPF
|
||||
.endm
|
||||
|
||||
/*
 * KERNEL_F4: accumulate |re| + |im| of the next four contiguous complex
 * doubles (64 bytes) into the two lanes of v0. X is post-incremented by 64.
 * The two lanes of v0 are partial sums only; KERNEL_F4_FINALIZE must be run
 * after the loop to fold them into SUMF. Clobbers v1-v4.
 */
.macro KERNEL_F4
|
||||
ld1 {v1.2d, v2.2d, v3.2d, v4.2d}, [X], #64 // eight doubles = four complex elements
|
||||
fabs v1.2d, v1.2d
|
||||
fabs v2.2d, v2.2d
|
||||
fabs v3.2d, v3.2d
|
||||
fabs v4.2d, v4.2d
|
||||
|
||||
fadd v1.2d, v1.2d, v2.2d
|
||||
fadd v3.2d, v3.2d, v4.2d
|
||||
|
||||
fadd v0.2d, v0.2d, v1.2d // lane-wise accumulation into v0
|
||||
fadd v0.2d, v0.2d, v3.2d
|
||||
|
||||
PRFM PLDL1KEEP, [X, #1024] // prefetch 1024 bytes ahead of the read pointer
|
||||
.endm
|
||||
|
||||
/*
 * KERNEL_F4_FINALIZE: collapse the two partial sums held in the lanes of
 * v0 (built up by KERNEL_F4) into the scalar accumulator SUMF.
 */
.macro KERNEL_F4_FINALIZE
|
||||
faddp SUMF, v0.2d // SUMF = v0 lane 0 + v0 lane 1
|
||||
.endm
|
||||
|
||||
.macro INIT_S
|
||||
lsl INC_X, INC_X, #4
|
||||
.endm
|
||||
|
||||
.macro KERNEL_S1
|
||||
ld1 {v1.2d}, [X], INC_X
|
||||
fabs v1.2d, v1.2d
|
||||
faddp TMPF, v1.2d
|
||||
fadd SUMF, SUMF, TMPF
|
||||
.endm
|
||||
|
||||
/*******************************************************************************
|
||||
* End of macro definitions
|
||||
*******************************************************************************/
|
||||
|
||||
/*
 * Entry point: sum of |re| + |im| over a double-precision complex vector.
 * Inputs (see the register #defines at the top of the file):
 *   N (x0) vector length, X (x1) vector address, INC_X (x2) stride in
 *   complex elements.
 * Result: the sum in SUMF (d0); 0.0 when N <= 0 or INC_X <= 0.
 */
PROLOGUE
|
||||
|
||||
fmov SUMF, REG0 // SUMF = 0.0; the scalar write also clears the upper lane of v0, which KERNEL_F4 accumulates into
|
||||
|
||||
cmp N, xzr
|
||||
ble asum_kernel_L999 // N <= 0: return 0.0
|
||||
cmp INC_X, xzr
|
||||
ble asum_kernel_L999 // INC_X <= 0: return 0.0
|
||||
|
||||
cmp INC_X, #1
|
||||
bne asum_kernel_S_BEGIN // non-unit stride: take the strided path
|
||||
|
||||
asum_kernel_F_BEGIN:
|
||||
|
||||
asr I, N, #2 // I = N / 4 (number of 4-element groups)
|
||||
cmp I, xzr
|
||||
beq asum_kernel_F1
|
||||
|
||||
asum_kernel_F4:
|
||||
|
||||
KERNEL_F4
|
||||
|
||||
subs I, I, #1
|
||||
bne asum_kernel_F4
|
||||
|
||||
KERNEL_F4_FINALIZE // fold the two lanes of v0 into SUMF before the scalar tail
|
||||
|
||||
asum_kernel_F1:
|
||||
|
||||
ands I, N, #3 // I = N mod 4 leftover elements
|
||||
ble asum_kernel_L999
|
||||
|
||||
asum_kernel_F10:
|
||||
|
||||
KERNEL_F1
|
||||
|
||||
subs I, I, #1
|
||||
bne asum_kernel_F10
|
||||
|
||||
asum_kernel_L999:
|
||||
ret
|
||||
|
||||
asum_kernel_S_BEGIN:
|
||||
|
||||
INIT_S // INC_X -> byte stride
|
||||
|
||||
asr I, N, #2 // I = N / 4 for the 4x unrolled loop
|
||||
cmp I, xzr
|
||||
ble asum_kernel_S1
|
||||
|
||||
asum_kernel_S4:
|
||||
|
||||
KERNEL_S1
|
||||
KERNEL_S1
|
||||
KERNEL_S1
|
||||
KERNEL_S1
|
||||
|
||||
subs I, I, #1
|
||||
bne asum_kernel_S4
|
||||
|
||||
asum_kernel_S1:
|
||||
|
||||
ands I, N, #3 // leftover elements after the unrolled loop
|
||||
ble asum_kernel_L999
|
||||
|
||||
asum_kernel_S10:
|
||||
|
||||
KERNEL_S1
|
||||
|
||||
subs I, I, #1
|
||||
bne asum_kernel_S10
|
||||
|
||||
ret
|
||||
|
||||
EPILOGUE
|
|
@ -0,0 +1,301 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2015, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
||||
#define N x0 /* vector length */
|
||||
#define X x3 /* X vector address */
|
||||
#define INC_X x4 /* X stride */
|
||||
#define Y x5 /* Y vector address */
|
||||
#define INC_Y x6 /* Y stride */
|
||||
#define I x1 /* loop variable */
|
||||
#define Y_COPY x7 /* second Y pointer: read cursor for the DOUBLE KERNEL_F4 (Y is its write cursor) */
|
||||
|
||||
/*******************************************************************************
|
||||
* Macro definitions
|
||||
*******************************************************************************/
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
#define DA_R s0 /* scale input value */
|
||||
#define DA_I s1 /* scale input value */
|
||||
#define TMPX v2.2s
|
||||
#define TMPY v3.2s
|
||||
#define SZ 4
|
||||
#else
|
||||
#define DA_R d0 /* scale input value */
|
||||
#define DA_I d1 /* scale input value */
|
||||
#define TMPX v2.2d
|
||||
#define TMPY v3.2d
|
||||
#define SZ 8
|
||||
#endif
|
||||
|
||||
/******************************************************************************/
|
||||
|
||||
/*
 * INIT: expand the scalar alpha (DA_R, DA_I) into the two multiplier
 * vectors v0 and v1 that the KERNEL_* macros feed to fmla. Lane lists in
 * the comments are written high lane first.
 *   !CONJ: v0 = DA_R, DA_R     v1 = DA_I, -DA_I
 *    CONJ: v0 = -DA_R, DA_R    v1 = DA_I,  DA_I
 * Only the low two lanes are set here; the single-precision 4-wide path
 * widens them in KERNEL_INIT_F4. Clobbers v2 (holds the negated value).
 */
.macro INIT
|
||||
|
||||
#if !defined(CONJ)
|
||||
#if !defined(DOUBLE)
|
||||
ins v0.s[1], v0.s[0] // v0 = DA_R, DA_R
|
||||
fneg s2, DA_I // s2 = -DA_I
|
||||
ins v1.s[1], v2.s[0] // v1 = -DA_I, DA_I
|
||||
ext v1.8b, v1.8b, v1.8b, #4 // v1 = DA_I, -DA_I
|
||||
#else
|
||||
ins v0.d[1], v0.d[0] // v0 = DA_R, DA_R
|
||||
fneg d2, DA_I // d2 = -DA_I
|
||||
ins v1.d[1], v2.d[0] // v1 = -DA_I, DA_I
|
||||
ext v1.16b, v1.16b, v1.16b, #8 // v1 = DA_I, -DA_I
|
||||
#endif
|
||||
#else
|
||||
#if !defined(DOUBLE)
|
||||
fneg s2, DA_R // s2 = -DA_R
|
||||
ins v0.s[1], v2.s[0] // v0 = -DA_R, DA_R
|
||||
ins v1.s[1], v1.s[0] // v1 = DA_I, DA_I
|
||||
#else
|
||||
fneg d2, DA_R // d2 = -DA_R
|
||||
ins v0.d[1], v2.d[0] // v0 = -DA_R, DA_R
|
||||
ins v1.d[1], v1.d[0] // v1 = DA_I, DA_I
|
||||
#endif
|
||||
#endif
|
||||
|
||||
.endm
|
||||
|
||||
.macro KERNEL_F1
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
ld1 {v2.2s}, [X], #8 // V2 = X[ix+1], X[ix]; X += 2
|
||||
ld1 {v3.2s}, [Y] // V3 = Y[iy+1], Y[iy]
|
||||
ext v4.8b, v2.8b, v2.8b, #4 // V4 = X[ix], X[ix+1]
|
||||
fmla v3.2s, v0.2s, v2.2s // Y[iy] += DA_R * X[ix]
|
||||
// Y[iy+1] += +-DA_R * X[ix+1]
|
||||
fmla v3.2s, v1.2s, v4.2s // Y[iy] += +-DA_I * X[ix+1]
|
||||
// Y[iy+1] += DA_I * X[ix]
|
||||
st1 {v3.2s}, [Y], #8
|
||||
#else
|
||||
ld1 {v2.2d}, [X], #16 // V2 = X[ix+1], X[ix]; X += 2
|
||||
ld1 {v3.2d}, [Y] // V3 = Y[iy+1], Y[iy]
|
||||
ext v4.16b, v2.16b, v2.16b, #8 // V4 = X[ix], X[ix+1]
|
||||
fmla v3.2d, v0.2d, v2.2d // Y[iy] += DA_R * X[ix]
|
||||
// Y[iy+1] += +-DA_R * X[ix+1]
|
||||
fmla v3.2d, v1.2d, v4.2d // Y[iy] += +-DA_I * X[ix+1]
|
||||
// Y[iy+1] += DA_I * X[ix]
|
||||
st1 {v3.2d}, [Y], #16
|
||||
#endif
|
||||
|
||||
.endm
|
||||
|
||||
.macro KERNEL_INIT_F4
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
// Replicate the lower 2 floats into the upper 2 slots
|
||||
ins v0.d[1], v0.d[0] // v0 = DA_R, DA_R, DA_R, DA_R
|
||||
ins v1.d[1], v1.d[0] // v1 = DA_I, DA_I, DA_I, DA_I
|
||||
#endif
|
||||
|
||||
.endm
|
||||
|
||||
/*
 * KERNEL_F4: Y += alpha * X for four contiguous complex elements on the
 * unit-stride path. v0/v1 hold the alpha multipliers prepared by INIT
 * (and widened by KERNEL_INIT_F4 in single precision).
 * Each result vector is built with two fmla steps: one against X as loaded
 * and one against X with the real/imaginary halves of every element
 * swapped.
 * Single precision reads and writes through Y (loads do not advance it,
 * the two stores do). Double precision reads through Y_COPY and writes
 * through Y, each advancing by 64 bytes per invocation.
 * In the single-precision comments X[k]/Y[k] index floats; in the
 * double-precision comments they index complex elements. Both are relative
 * to the pointer values on entry.
 */
.macro KERNEL_F4
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
ld1 {v2.4s,v3.4s}, [X], #32 // V2 = X[3], X[2], X[1], X[0]
|
||||
// V3 = X[7], X[6], X[5], X[4]
|
||||
ext v6.8b, v2.8b, v2.8b, #4 // V6 = - , - , X[0], X[1]
|
||||
ins v6.s[2], v2.s[3] // V6 = - , X[3], X[0], X[1]
|
||||
ins v6.s[3], v2.s[2] // V6 = X[2], X[3], X[0], X[1]
|
||||
|
||||
ld1 {v4.4s,v5.4s}, [Y] // V4 = Y[3], Y[2], Y[1], Y[0]
|
||||
// V5 = Y[7], Y[6], Y[5], Y[4]
|
||||
|
||||
ext v7.8b, v3.8b, v3.8b, #4 // V7 = - , - , X[4], X[5]
|
||||
ins v7.s[2], v3.s[3] // V7 = - , X[7], X[4], X[5]
|
||||
ins v7.s[3], v3.s[2] // V7 = X[6], X[7], X[4], X[5]
|
||||
|
||||
fmla v4.4s, v0.4s, v2.4s // Y[iy] += DA_R * X[ix]
|
||||
// Y[iy+1] += +-DA_R * X[ix+1]
|
||||
fmla v4.4s, v1.4s, v6.4s // Y[iy] += +-DA_I * X[ix+1]
|
||||
// Y[iy+1] += DA_I * X[ix]
|
||||
st1 {v4.4s}, [Y], #16
|
||||
|
||||
fmla v5.4s, v0.4s, v3.4s // Y[iy] += DA_R * X[ix], Y[iy+1] += +-DA_R * X[ix+1]
|
||||
fmla v5.4s, v1.4s, v7.4s // Y[iy] += +-DA_I * X[ix+1], Y[iy+1] += DA_I * X[ix]
|
||||
// (same two-step update as for V4 above,
|
||||
// applied to the second pair of elements)
|
||||
st1 {v5.4s}, [Y], #16
|
||||
#else // DOUBLE
|
||||
ld1 {v2.2d,v3.2d}, [X], #32 // v2 = X[0], v3 = X[1] (one complex element each)
|
||||
ext v20.16b, v2.16b, v2.16b, #8 // v20 = X[0] with re/im swapped
|
||||
ext v21.16b, v3.16b, v3.16b, #8 // v21 = X[1] with re/im swapped
|
||||
|
||||
ld1 {v4.2d,v5.2d}, [X], #32 // v4 = X[2], v5 = X[3]
|
||||
ext v22.16b, v4.16b, v4.16b, #8 // v22 = X[2] with re/im swapped
|
||||
ext v23.16b, v5.16b, v5.16b, #8 // v23 = X[3] with re/im swapped
|
||||
|
||||
ld1 {v16.2d,v17.2d}, [Y_COPY], #32 // v16 = Y[0], v17 = Y[1]
|
||||
|
||||
fmla v16.2d, v0.2d, v2.2d
|
||||
fmla v17.2d, v0.2d, v3.2d
|
||||
|
||||
ld1 {v18.2d,v19.2d}, [Y_COPY], #32 // v18 = Y[2], v19 = Y[3]
|
||||
|
||||
fmla v16.2d, v1.2d, v20.2d
|
||||
fmla v17.2d, v1.2d, v21.2d
|
||||
st1 {v16.2d,v17.2d}, [Y], #32
|
||||
|
||||
fmla v18.2d, v0.2d, v4.2d
|
||||
fmla v19.2d, v0.2d, v5.2d
|
||||
fmla v18.2d, v1.2d, v22.2d
|
||||
fmla v19.2d, v1.2d, v23.2d
|
||||
st1 {v18.2d,v19.2d}, [Y], #32
|
||||
#endif
|
||||
PRFM PLDL1KEEP, [X, #512] // prefetch 512 bytes ahead of X
|
||||
PRFM PLDL1KEEP, [Y, #512] // prefetch 512 bytes ahead of Y
|
||||
.endm
|
||||
|
||||
.macro INIT_S
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
lsl INC_X, INC_X, #3
|
||||
lsl INC_Y, INC_Y, #3
|
||||
#else
|
||||
lsl INC_X, INC_X, #4
|
||||
lsl INC_Y, INC_Y, #4
|
||||
#endif
|
||||
|
||||
.endm
|
||||
|
||||
/*
 * KERNEL_S1: update one complex element of Y on the strided path.
 * Loads one (real, imag) pair from X and from Y, accumulates
 * v0 * X and v1 * swapped(X) into the Y pair, and stores it back.
 * X and Y are advanced by INC_X / INC_Y, which INIT_S has already scaled
 * to bytes. v0 and v1 must hold the DA_R / +-DA_I multipliers on entry;
 * they are set up outside this macro (presumably by INIT - confirm).
 */
.macro KERNEL_S1
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
ld1 {v2.2s}, [X], INC_X // V2 = X[ix+1], X[ix]; X += INC_X (bytes)
|
||||
ld1 {v3.2s}, [Y] // V3 = Y[iy+1], Y[iy]
|
||||
ext v4.8b, v2.8b, v2.8b, #4 // V4 = X[ix], X[ix+1]
|
||||
fmla v3.2s, v0.2s, v2.2s // Y[iy] += DA_R * X[ix]
|
||||
// Y[iy+1] += +-DA_R * X[ix+1]
|
||||
fmla v3.2s, v1.2s, v4.2s // Y[iy] += +-DA_I * X[ix+1]
|
||||
// Y[iy+1] += DA_I * X[ix]
|
||||
st1 {v3.2s}, [Y], INC_Y // store the pair; Y += INC_Y (bytes)
|
||||
#else
|
||||
ld1 {v2.2d}, [X], INC_X // V2 = X[ix+1], X[ix]; X += INC_X (bytes)
|
||||
ld1 {v3.2d}, [Y] // V3 = Y[iy+1], Y[iy]
|
||||
ext v4.16b, v2.16b, v2.16b, #8 // V4 = X[ix], X[ix+1]
|
||||
fmla v3.2d, v0.2d, v2.2d // Y[iy] += DA_R * X[ix]
|
||||
// Y[iy+1] += +-DA_R * X[ix+1]
|
||||
fmla v3.2d, v1.2d, v4.2d // Y[iy] += +-DA_I * X[ix+1]
|
||||
// Y[iy+1] += DA_I * X[ix]
|
||||
st1 {v3.2d}, [Y], INC_Y // store the pair; Y += INC_Y (bytes)
|
||||
#endif
|
||||
|
||||
.endm
|
||||
|
||||
/*******************************************************************************
|
||||
* End of macro definitions
|
||||
*******************************************************************************/
|
||||
|
||||
/*
 * Driver for the complex axpy kernel.
 *   - returns immediately (w0 = 0) when N <= 0 or when both DA_R and DA_I
 *     compare equal to 0.0;
 *   - when INC_X == 1 and INC_Y == 1, runs KERNEL_F4 for N / 4 iterations
 *     and KERNEL_F1 for the remaining N % 4 elements;
 *   - otherwise scales the strides with INIT_S and runs KERNEL_S1 once per
 *     element (unrolled by four, then a remainder loop).
 * Always returns 0 in w0.
 */
PROLOGUE
|
||||
|
||||
cmp N, xzr
|
||||
ble zaxpy_kernel_L999 // nothing to do for N <= 0
|
||||
|
||||
mov Y_COPY, Y // second pointer to Y, used as the load pointer by KERNEL_F4
|
||||
|
||||
fcmp DA_R, #0.0
|
||||
bne .L1
|
||||
fcmp DA_I, #0.0
|
||||
beq zaxpy_kernel_L999 // scalar is zero: Y is left unchanged
|
||||
|
||||
.L1:
|
||||
INIT
|
||||
|
||||
cmp INC_X, #1
|
||||
bne zaxpy_kernel_S_BEGIN
|
||||
cmp INC_Y, #1
|
||||
bne zaxpy_kernel_S_BEGIN
|
||||
|
||||
zaxpy_kernel_F_BEGIN: // unit-stride path
|
||||
|
||||
asr I, N, #2 // I = N / 4
|
||||
cmp I, xzr
|
||||
beq zaxpy_kernel_F1
|
||||
|
||||
KERNEL_INIT_F4
|
||||
|
||||
zaxpy_kernel_F4:
|
||||
|
||||
KERNEL_F4
|
||||
|
||||
subs I, I, #1
|
||||
bne zaxpy_kernel_F4
|
||||
|
||||
zaxpy_kernel_F1:
|
||||
|
||||
ands I, N, #3 // I = N % 4 leftover elements
|
||||
ble zaxpy_kernel_L999
|
||||
|
||||
zaxpy_kernel_F10:
|
||||
|
||||
KERNEL_F1
|
||||
|
||||
subs I, I, #1
|
||||
bne zaxpy_kernel_F10
|
||||
|
||||
mov w0, wzr // return 0
|
||||
ret
|
||||
|
||||
zaxpy_kernel_S_BEGIN: // strided path
|
||||
|
||||
INIT_S
|
||||
|
||||
asr I, N, #2 // I = N / 4
|
||||
cmp I, xzr
|
||||
ble zaxpy_kernel_S1
|
||||
|
||||
zaxpy_kernel_S4: // four elements per iteration
|
||||
|
||||
KERNEL_S1
|
||||
KERNEL_S1
|
||||
KERNEL_S1
|
||||
KERNEL_S1
|
||||
|
||||
subs I, I, #1
|
||||
bne zaxpy_kernel_S4
|
||||
|
||||
zaxpy_kernel_S1:
|
||||
|
||||
ands I, N, #3 // I = N % 4 leftover elements
|
||||
ble zaxpy_kernel_L999
|
||||
|
||||
zaxpy_kernel_S10:
|
||||
|
||||
KERNEL_S1
|
||||
|
||||
subs I, I, #1
|
||||
bne zaxpy_kernel_S10
|
||||
|
||||
zaxpy_kernel_L999:
|
||||
|
||||
mov w0, wzr // return 0
|
||||
ret
|
|
@ -0,0 +1,302 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2015, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
||||
#define N x0 /* vector length */
|
||||
#define X x1 /* X vector address */
|
||||
#define INC_X x2 /* X stride */
|
||||
#define Y x3 /* Y vector address */
|
||||
#define INC_Y x4 /* Y stride */
|
||||
#define I x5 /* loop variable */
|
||||
|
||||
/*******************************************************************************
|
||||
* Macro definitions
|
||||
*******************************************************************************/
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
#if !defined(DSDOT)
|
||||
#define REG0 wzr
|
||||
#define DOTF s0
|
||||
#else // DSDOT
|
||||
#define REG0 xzr
|
||||
#define DOTF d0
|
||||
#endif
|
||||
#define DOTI s1
|
||||
#define TMPX s2
|
||||
#define LD1VX {v2.s}[0]
|
||||
#define TMPY s3
|
||||
#define LD1VY {v3.s}[0]
|
||||
#define TMPVY v3.s[0]
|
||||
#define SZ 4
|
||||
#else
|
||||
#define REG0 xzr
|
||||
#define DOTF d0
|
||||
#define DOTI d1
|
||||
#define TMPX d2
|
||||
#define LD1VX {v2.d}[0]
|
||||
#define TMPY d3
|
||||
#define LD1VY {v3.d}[0]
|
||||
#define TMPVY v3.d[0]
|
||||
#define SZ 8
|
||||
#endif
|
||||
|
||||
/******************************************************************************/
|
||||
|
||||
/*
 * KERNEL_F1: accumulate one complex product on the unit-stride path.
 * Loads one (real, imag) pair from X and from Y, advancing both pointers
 * by one pair, and adds the product into the scalar accumulators
 * DOTF (dot[0], real part) and DOTI (dot[1], imaginary part).
 * The CONJ build flips the sign of the two terms that involve X[ix+1].
 */
.macro KERNEL_F1
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
ld1 {v2.2s}, [X], #8 // V2 = X[ix+1], X[ix]; X += 2
|
||||
ld1 {v3.2s}, [Y], #8 // V3 = Y[iy+1], Y[iy]; Y += 2
|
||||
ins v4.s[0], v2.s[1] // V4 = X[ix+1]
|
||||
#if !defined(CONJ)
|
||||
fmla DOTF, s2, v3.s[0] // dot[0] += X[ix] * Y[iy]
|
||||
fmls DOTF, s4, v3.s[1] // dot[0] -= X[ix+1] * Y[iy+1]
|
||||
fmla DOTI, s4, v3.s[0] // dot[1] += X[ix+1] * Y[iy]
|
||||
fmla DOTI, s2, v3.s[1] // dot[1] += X[ix] * Y[iy+1]
|
||||
#else
|
||||
fmla DOTF, s2, v3.s[0] // dot[0] += X[ix] * Y[iy]
|
||||
fmla DOTF, s4, v3.s[1] // dot[0] += X[ix+1] * Y[iy+1]
|
||||
fmls DOTI, s4, v3.s[0] // dot[1] -= X[ix+1] * Y[iy]
|
||||
fmla DOTI, s2, v3.s[1] // dot[1] += X[ix] * Y[iy+1]
|
||||
#endif
|
||||
#else // DOUBLE
|
||||
ld1 {v2.2d}, [X], #16 // V2 = X[ix+1], X[ix]; X += 2
|
||||
ld1 {v3.2d}, [Y], #16 // V3 = Y[iy+1], Y[iy]; Y += 2
|
||||
ins v4.d[0], v2.d[1] // V4 = X[ix+1]
|
||||
#if !defined(CONJ)
|
||||
fmla DOTF, d2, v3.d[0] // dot[0] += X[ix] * Y[iy]
|
||||
fmls DOTF, d4, v3.d[1] // dot[0] -= X[ix+1] * Y[iy+1]
|
||||
fmla DOTI, d4, v3.d[0] // dot[1] += X[ix+1] * Y[iy]
|
||||
fmla DOTI, d2, v3.d[1] // dot[1] += X[ix] * Y[iy+1]
|
||||
#else
|
||||
fmla DOTF, d2, v3.d[0] // dot[0] += X[ix] * Y[iy]
|
||||
fmla DOTF, d4, v3.d[1] // dot[0] += X[ix+1] * Y[iy+1]
|
||||
fmls DOTI, d4, v3.d[0] // dot[1] -= X[ix+1] * Y[iy]
|
||||
fmla DOTI, d2, v3.d[1] // dot[1] += X[ix] * Y[iy+1]
|
||||
#endif
|
||||
#endif
|
||||
|
||||
.endm
|
||||
|
||||
|
||||
/*
 * KERNEL_F4: accumulate four complex products on the unit-stride path.
 * ld2 de-interleaves the (real, imag) pairs: the first register of each
 * pair receives the real parts and the second the imaginary parts.
 *   non-DOUBLE: one ld2 per vector covers four elements; partial sums are
 *               kept per lane in v0 (dot[0]) and v1 (dot[1]).
 *   DOUBLE:     two ld2 per vector, two elements each; partial sums are
 *               kept in v0 and v20 (dot[0]) and in v1 and v21 (dot[1]).
 * The per-lane sums are collapsed to scalars by KERNEL_F4_FINALIZE.
 * Also prefetches X and Y 1024 bytes ahead.
 */
.macro KERNEL_F4
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
ld2 {v2.4s, v3.4s}, [X], #32 // V2 = 4 x X[ix] (real), V3 = 4 x X[ix+1] (imag); X += 32 bytes
|
||||
ld2 {v4.4s, v5.4s}, [Y], #32 // V4 = 4 x Y[iy] (real), V5 = 4 x Y[iy+1] (imag); Y += 32 bytes
|
||||
|
||||
fmla v0.4s, v2.4s, v4.4s // dot[0] += X[ix] * Y[iy]
|
||||
fmla v1.4s, v2.4s, v5.4s // dot[1] += X[ix] * Y[iy+1]
|
||||
PRFM PLDL1KEEP, [X, #1024]
|
||||
PRFM PLDL1KEEP, [Y, #1024]
|
||||
#if !defined(CONJ)
|
||||
fmls v0.4s, v3.4s, v5.4s // dot[0] -= X[ix+1] * Y[iy+1]
|
||||
fmla v1.4s, v3.4s, v4.4s // dot[1] += X[ix+1] * Y[iy]
|
||||
#else
|
||||
fmla v0.4s, v3.4s, v5.4s // dot[0] += X[ix+1] * Y[iy+1]
|
||||
fmls v1.4s, v3.4s, v4.4s // dot[1] -= X[ix+1] * Y[iy]
|
||||
#endif
|
||||
#else // DOUBLE
|
||||
ld2 {v2.2d, v3.2d}, [X], #32 // V2 = 2 x X[ix] (real), V3 = 2 x X[ix+1] (imag); X += 32 bytes
|
||||
ld2 {v16.2d, v17.2d}, [Y], #32 // V16 = 2 x Y[iy] (real), V17 = 2 x Y[iy+1] (imag); Y += 32 bytes
|
||||
|
||||
fmla v0.2d, v2.2d, v16.2d // dot[0] += X[ix] * Y[iy]
|
||||
fmla v1.2d, v2.2d, v17.2d // dot[1] += X[ix] * Y[iy+1]
|
||||
ld2 {v4.2d, v5.2d}, [X], #32 // next two elements of X, split the same way
|
||||
ld2 {v18.2d, v19.2d}, [Y], #32 // next two elements of Y, split the same way
|
||||
fmla v0.2d, v4.2d, v18.2d // dot[0] += X[ix] * Y[iy]
|
||||
fmla v1.2d, v4.2d, v19.2d // dot[1] += X[ix] * Y[iy+1]
|
||||
PRFM PLDL1KEEP, [X, #1024]
|
||||
PRFM PLDL1KEEP, [Y, #1024]
|
||||
#if !defined(CONJ)
|
||||
fmls v0.2d, v3.2d, v17.2d // dot[0] -= X[ix+1] * Y[iy+1]
|
||||
fmls v20.2d, v5.2d, v19.2d // dot[0] -= X[ix+1] * Y[iy+1] (second accumulator)
|
||||
fmla v1.2d, v3.2d, v16.2d // dot[1] += X[ix+1] * Y[iy]
|
||||
fmla v21.2d, v5.2d, v18.2d // dot[1] += X[ix+1] * Y[iy] (second accumulator)
|
||||
#else
|
||||
fmla v0.2d, v3.2d, v17.2d // dot[0] += X[ix+1] * Y[iy+1]
|
||||
fmla v20.2d, v5.2d, v19.2d // dot[0] += X[ix+1] * Y[iy+1] (second accumulator)
|
||||
fmls v1.2d, v3.2d, v16.2d // dot[1] -= X[ix+1] * Y[iy]
|
||||
fmls v21.2d, v5.2d, v18.2d // dot[1] -= X[ix+1] * Y[iy] (second accumulator)
|
||||
#endif
|
||||
#endif
|
||||
|
||||
.endm
|
||||
|
||||
/*
 * KERNEL_F4_FINALIZE: collapse the per-lane partial sums built by
 * KERNEL_F4 into the scalar accumulators DOTF and DOTI.
 *   non-DOUBLE: adds the upper and lower halves of v0 (resp. v1), then
 *               pair-adds the two remaining lanes.
 *   DOUBLE:     adds the second accumulator v20 (resp. v21) into v0
 *               (resp. v1), then pair-adds the two lanes.
 */
.macro KERNEL_F4_FINALIZE
|
||||
#if !defined(DOUBLE)
|
||||
ext v2.16b, v0.16b, v0.16b, #8 // bring the upper half of v0 down
|
||||
fadd v0.2s, v0.2s, v2.2s // lanes 0,1 += lanes 2,3
|
||||
faddp DOTF, v0.2s // DOTF = lane 0 + lane 1
|
||||
ext v3.16b, v1.16b, v1.16b, #8 // bring the upper half of v1 down
|
||||
fadd v1.2s, v1.2s, v3.2s // lanes 0,1 += lanes 2,3
|
||||
faddp DOTI, v1.2s // DOTI = lane 0 + lane 1
|
||||
#else
|
||||
fadd v0.2d, v0.2d, v20.2d // merge the two dot[0] accumulators
|
||||
faddp DOTF, v0.2d // DOTF = lane 0 + lane 1
|
||||
fadd v1.2d, v1.2d, v21.2d // merge the two dot[1] accumulators
|
||||
faddp DOTI, v1.2d // DOTI = lane 0 + lane 1
|
||||
#endif
|
||||
.endm
|
||||
|
||||
/*
 * INIT_S: set up the strided (non-unit increment) path.
 * Multiplies INC_X and INC_Y by 8 (non-DOUBLE build) or 16 (DOUBLE build)
 * so that they can be used directly as the byte post-increment of the
 * 8-byte (.2s) / 16-byte (.2d) loads issued by KERNEL_S1.
 */
.macro INIT_S
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
lsl INC_X, INC_X, #3 // INC_X *= 8
|
||||
lsl INC_Y, INC_Y, #3 // INC_Y *= 8
|
||||
#else
|
||||
lsl INC_X, INC_X, #4 // INC_X *= 16
|
||||
lsl INC_Y, INC_Y, #4 // INC_Y *= 16
|
||||
#endif
|
||||
|
||||
.endm
|
||||
|
||||
/*
 * KERNEL_S1: accumulate one complex product on the strided path.
 * Same arithmetic as KERNEL_F1, but X and Y are advanced by INC_X / INC_Y
 * (already scaled to bytes by INIT_S) instead of by one pair.
 * Only lane 0 of v4 is consumed (as s4 / d4), so the ext below serves to
 * move X[ix+1] into lane 0.
 */
.macro KERNEL_S1
|
||||
#if !defined(DOUBLE)
|
||||
ld1 {v2.2s}, [X], INC_X // V2 = X[ix+1], X[ix]; X += INC_X (bytes)
|
||||
ld1 {v3.2s}, [Y], INC_Y // V3 = Y[iy+1], Y[iy]; Y += INC_Y (bytes)
|
||||
ext v4.8b, v2.8b, v2.8b, #4 // V4 = X[ix], X[ix+1]
|
||||
#if !defined(CONJ)
|
||||
fmla DOTF, s2, v3.s[0] // dot[0] += X[ix] * Y[iy]
|
||||
fmls DOTF, s4, v3.s[1] // dot[0] -= X[ix+1] * Y[iy+1]
|
||||
fmla DOTI, s4, v3.s[0] // dot[1] += X[ix+1] * Y[iy]
|
||||
fmla DOTI, s2, v3.s[1] // dot[1] += X[ix] * Y[iy+1]
|
||||
#else
|
||||
fmla DOTF, s2, v3.s[0] // dot[0] += X[ix] * Y[iy]
|
||||
fmla DOTF, s4, v3.s[1] // dot[0] += X[ix+1] * Y[iy+1]
|
||||
fmls DOTI, s4, v3.s[0] // dot[1] -= X[ix+1] * Y[iy]
|
||||
fmla DOTI, s2, v3.s[1] // dot[1] += X[ix] * Y[iy+1]
|
||||
#endif
|
||||
#else // DOUBLE
|
||||
ld1 {v2.2d}, [X], INC_X // V2 = X[ix+1], X[ix]; X += INC_X (bytes)
|
||||
ld1 {v3.2d}, [Y], INC_Y // V3 = Y[iy+1], Y[iy]; Y += INC_Y (bytes)
|
||||
ext v4.16b, v2.16b, v2.16b, #8 // V4 = X[ix], X[ix+1]
|
||||
#if !defined(CONJ)
|
||||
fmla DOTF, d2, v3.d[0] // dot[0] += X[ix] * Y[iy]
|
||||
fmls DOTF, d4, v3.d[1] // dot[0] -= X[ix+1] * Y[iy+1]
|
||||
fmla DOTI, d4, v3.d[0] // dot[1] += X[ix+1] * Y[iy]
|
||||
fmla DOTI, d2, v3.d[1] // dot[1] += X[ix] * Y[iy+1]
|
||||
#else
|
||||
fmla DOTF, d2, v3.d[0] // dot[0] += X[ix] * Y[iy]
|
||||
fmla DOTF, d4, v3.d[1] // dot[0] += X[ix+1] * Y[iy+1]
|
||||
fmls DOTI, d4, v3.d[0] // dot[1] -= X[ix+1] * Y[iy]
|
||||
fmla DOTI, d2, v3.d[1] // dot[1] += X[ix] * Y[iy+1]
|
||||
#endif
|
||||
#endif
|
||||
|
||||
.endm
|
||||
|
||||
/*******************************************************************************
|
||||
* End of macro definitions
|
||||
*******************************************************************************/
|
||||
|
||||
/*
 * Driver for the complex dot kernel.
 * Zeroes the accumulators DOTF / DOTI (and v20 / v21, which the DOUBLE
 * KERNEL_F4 uses as second accumulators), then:
 *   - returns at once when N <= 0, leaving both accumulators at zero;
 *   - when INC_X == 1 and INC_Y == 1, runs KERNEL_F4 for N / 4 iterations,
 *     reduces with KERNEL_F4_FINALIZE, and runs KERNEL_F1 for the
 *     remaining N % 4 elements;
 *   - otherwise scales the strides with INIT_S and runs KERNEL_S1 once per
 *     element (unrolled by four, then a remainder loop).
 * The result is left in DOTF (dot[0]) and DOTI (dot[1]).
 */
PROLOGUE
|
||||
|
||||
fmov DOTF, REG0 // dot[0] = 0
|
||||
fmov DOTI, DOTF // dot[1] = 0
|
||||
#if !defined(DOUBLE)
|
||||
fmov s20, DOTF
|
||||
fmov s21, DOTI
|
||||
#else
|
||||
fmov d20, DOTF // clear second dot[0] accumulator
|
||||
fmov d21, DOTI // clear second dot[1] accumulator
|
||||
#endif
|
||||
|
||||
cmp N, xzr
|
||||
ble dot_kernel_L999 // nothing to do for N <= 0
|
||||
|
||||
cmp INC_X, #1
|
||||
bne dot_kernel_S_BEGIN
|
||||
cmp INC_Y, #1
|
||||
bne dot_kernel_S_BEGIN
|
||||
|
||||
dot_kernel_F_BEGIN: // unit-stride path
|
||||
|
||||
asr I, N, #2 // I = N / 4
|
||||
cmp I, xzr
|
||||
beq dot_kernel_F1
|
||||
|
||||
dot_kernel_F4:
|
||||
|
||||
KERNEL_F4
|
||||
|
||||
subs I, I, #1
|
||||
bne dot_kernel_F4
|
||||
|
||||
KERNEL_F4_FINALIZE
|
||||
|
||||
dot_kernel_F1:
|
||||
|
||||
ands I, N, #3 // I = N % 4 leftover elements
|
||||
ble dot_kernel_L999
|
||||
|
||||
dot_kernel_F10:
|
||||
|
||||
KERNEL_F1
|
||||
|
||||
subs I, I, #1
|
||||
bne dot_kernel_F10
|
||||
|
||||
ret
|
||||
|
||||
dot_kernel_S_BEGIN: // strided path
|
||||
|
||||
INIT_S
|
||||
|
||||
asr I, N, #2 // I = N / 4
|
||||
cmp I, xzr
|
||||
ble dot_kernel_S1
|
||||
|
||||
dot_kernel_S4: // four elements per iteration
|
||||
|
||||
KERNEL_S1
|
||||
KERNEL_S1
|
||||
KERNEL_S1
|
||||
KERNEL_S1
|
||||
|
||||
subs I, I, #1
|
||||
bne dot_kernel_S4
|
||||
|
||||
dot_kernel_S1:
|
||||
|
||||
ands I, N, #3 // I = N % 4 leftover elements
|
||||
ble dot_kernel_L999
|
||||
|
||||
dot_kernel_S10:
|
||||
|
||||
KERNEL_S1
|
||||
|
||||
subs I, I, #1
|
||||
bne dot_kernel_S10
|
||||
|
||||
dot_kernel_L999:
|
||||
|
||||
ret
|
||||
|
||||
EPILOGUE
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,514 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2015, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
||||
#define M x0 /* Y vector length */
|
||||
#define N x1 /* X vector length */
|
||||
#define A x3 /* A vector address */
|
||||
#define LDA x4 /* A stride */
|
||||
#define X x5 /* X vector address */
|
||||
#define INC_X x6 /* X stride */
|
||||
#define Y x7 /* Y vector address */
|
||||
#define INC_Y x2 /* Y stride */
|
||||
#define A_PTR x9 /* loop A vector address */
|
||||
#define Y_IPTR x10 /* loop Y vector address */
|
||||
#define J x11 /* loop variable */
|
||||
#define I x12 /* loop variable */
|
||||
#define Y_OPTR x13 /* loop Y vector address */
|
||||
#define X_PTR x14 /* loop X vector address */
|
||||
|
||||
/*******************************************************************************
|
||||
* Macro definitions
|
||||
*******************************************************************************/
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
#define ALPHA_R s0
|
||||
#define ALPHA_I s1
|
||||
#define ALPHA_R_COPY s7
|
||||
#define ALPHA_I_COPY s8
|
||||
#define SHZ 3
|
||||
#else
|
||||
#define ALPHA_R d0
|
||||
#define ALPHA_I d1
|
||||
#define ALPHA_R_COPY d7
|
||||
#define ALPHA_I_COPY d8
|
||||
#define SHZ 4
|
||||
#endif
|
||||
|
||||
/******************************************************************************/
|
||||
|
||||
/*
 * SAVE_REGS: reserve 11 * 16 = 176 bytes of stack and spill d8-d17 and
 * x18-x28 into it. RESTORE_REGS is the exact inverse and must be executed
 * before returning.
 */
.macro SAVE_REGS
|
||||
add sp, sp, #-(11 * 16) // sp -= 176
|
||||
stp d8, d9, [sp, #(0 * 16)]
|
||||
stp d10, d11, [sp, #(1 * 16)]
|
||||
stp d12, d13, [sp, #(2 * 16)]
|
||||
stp d14, d15, [sp, #(3 * 16)]
|
||||
stp d16, d17, [sp, #(4 * 16)]
|
||||
stp x18, x19, [sp, #(5 * 16)]
|
||||
stp x20, x21, [sp, #(6 * 16)]
|
||||
stp x22, x23, [sp, #(7 * 16)]
|
||||
stp x24, x25, [sp, #(8 * 16)]
|
||||
stp x26, x27, [sp, #(9 * 16)]
|
||||
str x28, [sp, #(10 * 16)]
|
||||
.endm
|
||||
|
||||
/*
 * RESTORE_REGS: reload d8-d17 and x18-x28 from the slots written by
 * SAVE_REGS and release the 176-byte stack area.
 */
.macro RESTORE_REGS
|
||||
ldp d8, d9, [sp, #(0 * 16)]
|
||||
ldp d10, d11, [sp, #(1 * 16)]
|
||||
ldp d12, d13, [sp, #(2 * 16)]
|
||||
ldp d14, d15, [sp, #(3 * 16)]
|
||||
ldp d16, d17, [sp, #(4 * 16)]
|
||||
ldp x18, x19, [sp, #(5 * 16)]
|
||||
ldp x20, x21, [sp, #(6 * 16)]
|
||||
ldp x22, x23, [sp, #(7 * 16)]
|
||||
ldp x24, x25, [sp, #(8 * 16)]
|
||||
ldp x26, x27, [sp, #(9 * 16)]
|
||||
ldr x28, [sp, #(10 * 16)]
|
||||
add sp, sp, #(11*16) // sp += 176
|
||||
.endm
|
||||
|
||||
|
||||
/*
 * INIT: one-time setup of the ALPHA-derived vectors.
 *   v7 = R(ALPHA) replicated in every lane, v8 = I(ALPHA) replicated in
 *        every lane (read by the F4 part of INIT_LOOP);
 *   v0 = R(ALPHA) in both low lanes, v1 = +I(ALPHA) / -I(ALPHA) pair whose
 *        lane order depends on XCONJ (read by the F1/S1 part of INIT_LOOP).
 * Lane lists in the inline comments are written highest lane first.
 * s2 / d2 is used as scratch.
 */
.macro INIT
|
||||
/********** INIT FOR F4 LOOP **********/
|
||||
fmov ALPHA_R_COPY, ALPHA_R
|
||||
fmov ALPHA_I_COPY, ALPHA_I
|
||||
#if !defined(DOUBLE)
|
||||
ins v7.s[1], v7.s[0] // R(ALPHA), R(ALPHA)
|
||||
ins v8.s[1], v8.s[0] // I(ALPHA), I(ALPHA)
|
||||
ins v7.d[1], v7.d[0] // copy the low pair into the upper half
|
||||
ins v8.d[1], v8.d[0] // copy the low pair into the upper half
|
||||
#else
|
||||
ins v7.d[1], v7.d[0] // R(ALPHA), R(ALPHA)
|
||||
ins v8.d[1], v8.d[0] // I(ALPHA), I(ALPHA)
|
||||
#endif
|
||||
|
||||
/******* INIT FOR F1 AND S1 LOOP ******/
|
||||
#if !defined(DOUBLE)
|
||||
ins v0.s[1], v0.s[0] // R(ALPHA), R(ALPHA)
|
||||
fneg s2, ALPHA_I // s2 = -I(ALPHA)
|
||||
ins v1.s[1], v2.s[0] // -I(ALPHA), I(ALPHA)
|
||||
#if !defined(XCONJ)
|
||||
ext v1.8b, v1.8b, v1.8b, #4 // I(ALPHA), -I(ALPHA)
|
||||
#endif
|
||||
#else
|
||||
ins v0.d[1], v0.d[0] // R(ALPHA), R(ALPHA)
|
||||
fneg d2, ALPHA_I // d2 = -I(ALPHA)
|
||||
ins v1.d[1], v2.d[0] // -I(ALPHA), I(ALPHA)
|
||||
#if !defined(XCONJ)
|
||||
ext v1.16b, v1.16b, v1.16b, #8 // I(ALPHA), -I(ALPHA)
|
||||
#endif
|
||||
#endif
|
||||
.endm
|
||||
|
||||
/*
 * INIT_LOOP: per-column setup, executed once for each element of X.
 * Reads the complex value at X_PTR and combines it with the ALPHA vectors
 * prepared by INIT.
 *   F4 part:    v9 / v10 hold R(X) / I(X) replicated in every lane;
 *               v11 and v12 receive the two products consumed by KERNEL_F4.
 *   F1/S1 part: v2 = v0 * X + v1 * swapped(X), i.e. TEMP; v2 and v3 are
 *               then rearranged (sign and lane order depend on CONJ and
 *               XCONJ) into the two multiplier vectors consumed by
 *               KERNEL_F1 and KERNEL_S1.
 * Lane lists in the inline comments are written highest lane first.
 * v4 is used as scratch.
 */
.macro INIT_LOOP
|
||||
/********** INIT_LOOP FOR F4 LOOP **********/
|
||||
#if !defined(DOUBLE)
|
||||
ld1 {v9.2s}, [X_PTR] // [I(X), R(X)]
|
||||
ins v10.s[0], v9.s[1]
|
||||
ins v9.s[1], v9.s[0] // [R(X), R(X)]
|
||||
ins v10.s[1], v10.s[0] // [I(X), I(X)]
|
||||
ins v9.d[1], v9.d[0]
|
||||
ins v10.d[1], v10.d[0]
|
||||
#if !defined(CONJ)
|
||||
#if !defined(XCONJ)
|
||||
fmul v11.4s, v9.4s, v7.4s // [+ R(X) * R(ALPHA)]
|
||||
fmls v11.4s, v10.4s, v8.4s // [- I(X) * I(ALPHA)]
|
||||
fmul v12.4s, v9.4s, v8.4s // [+ R(X) * I(ALPHA)]
|
||||
fmla v12.4s, v10.4s, v7.4s // [+ I(X) * R(ALPHA)]
|
||||
#else
|
||||
fmul v11.4s, v9.4s, v7.4s // [+ R(X) * R(ALPHA)]
|
||||
fmla v11.4s, v10.4s, v8.4s // [+ I(X) * I(ALPHA)]
|
||||
fmul v12.4s, v9.4s, v8.4s // [+ R(X) * I(ALPHA)]
|
||||
fmls v12.4s, v10.4s, v7.4s // [- I(X) * R(ALPHA)]
|
||||
#endif
|
||||
#else // CONJ
|
||||
#if !defined(XCONJ)
|
||||
fmul v11.4s, v9.4s, v7.4s // [+ R(X) * R(ALPHA)]
|
||||
fmls v11.4s, v10.4s, v8.4s // [- I(X) * I(ALPHA)] NOTE(review): comment previously said +; confirm intended sign
|
||||
fmul v12.4s, v10.4s, v7.4s // [+ I(X) * R(ALPHA)]
|
||||
fmls v12.4s, v9.4s, v8.4s // [- R(X) * I(ALPHA)]
|
||||
#else
|
||||
fmul v11.4s, v9.4s, v7.4s // [+ R(X) * R(ALPHA)]
|
||||
fmls v11.4s, v10.4s, v8.4s // [- I(X) * I(ALPHA)]
|
||||
fmul v12.4s, v9.4s, v8.4s // [R(X) * I(ALPHA)]
|
||||
fneg v12.4s, v12.4s // [- R(X) * I(ALPHA)]
|
||||
fmla v12.4s, v10.4s, v7.4s // [+ I(X) * R(ALPHA)] NOTE(review): comment previously said -; confirm intended sign
|
||||
#endif
|
||||
#endif // CONJ
|
||||
|
||||
/****** INIT_LOOP FOR F1 AND S1 LOOP ******/
|
||||
ld1 {v2.2s}, [X_PTR] // [I(X), R(X)]
|
||||
ext v3.8b, v2.8b, v2.8b, #4 // [R(X), I(X)]
|
||||
fmul v2.2s, v0.2s, v2.2s
|
||||
fmla v2.2s, v1.2s, v3.2s // [I(TEMP), R(TEMP)]
|
||||
ins v3.s[0], v2.s[1]
|
||||
#if !defined(CONJ)
|
||||
#if !defined(XCONJ)
|
||||
fneg s4, s3
|
||||
ins v3.s[1], v4.s[0]
|
||||
ext v3.8b, v3.8b, v3.8b, #4 // [I(TEMP), -I(TEMP)]
|
||||
ins v2.s[1], v2.s[0] // [R(TEMP), R(TEMP)]
|
||||
#else
|
||||
fneg s4, s3
|
||||
ins v3.s[1], v4.s[0] // [-I(TEMP), I(TEMP)]
|
||||
ins v2.s[1], v2.s[0] // [R(TEMP), R(TEMP)]
|
||||
#endif
|
||||
#else // CONJ
|
||||
#if !defined(XCONJ)
|
||||
ins v3.s[1], v3.s[0] // [I(TEMP), I(TEMP)]
|
||||
fneg s4, s2
|
||||
ins v2.s[1], v4.s[0] // [-R(TEMP), R(TEMP)]
|
||||
#else
|
||||
fneg s3, s3
|
||||
ins v3.s[1], v3.s[0] // [-I(TEMP), -I(TEMP)]
|
||||
fneg s4, s2
|
||||
ins v2.s[1], v4.s[0] // [-R(TEMP), R(TEMP)]
|
||||
#endif
|
||||
#endif // CONJ
|
||||
|
||||
#else // DOUBLE
|
||||
|
||||
/********** INIT_LOOP FOR F4 LOOP **********/
|
||||
ld1 {v9.2d}, [X_PTR] // [I(X), R(X)]
|
||||
ins v10.d[0], v9.d[1]
|
||||
ins v9.d[1], v9.d[0] // [R(X), R(X)]
|
||||
ins v10.d[1], v10.d[0] // [I(X), I(X)]
|
||||
#if !defined(CONJ)
|
||||
#if !defined(XCONJ)
|
||||
fmul v11.2d, v9.2d, v7.2d // [+ R(X) * R(ALPHA)]
|
||||
fmls v11.2d, v10.2d, v8.2d // [- I(X) * I(ALPHA)]
|
||||
fmul v12.2d, v9.2d, v8.2d // [+ R(X) * I(ALPHA)]
|
||||
fmla v12.2d, v10.2d, v7.2d // [+ I(X) * R(ALPHA)]
|
||||
#else
|
||||
fmul v11.2d, v9.2d, v7.2d // [+ R(X) * R(ALPHA)]
|
||||
fmla v11.2d, v10.2d, v8.2d // [+ I(X) * I(ALPHA)]
|
||||
fmul v12.2d, v9.2d, v8.2d // [+ R(X) * I(ALPHA)]
|
||||
fmls v12.2d, v10.2d, v7.2d // [- I(X) * R(ALPHA)]
|
||||
#endif
|
||||
#else // CONJ
|
||||
#if !defined(XCONJ)
|
||||
fmul v11.2d, v9.2d, v7.2d // [+ R(X) * R(ALPHA)]
|
||||
fmls v11.2d, v10.2d, v8.2d // [- I(X) * I(ALPHA)] NOTE(review): comment previously said +; confirm intended sign
|
||||
fmul v12.2d, v10.2d, v7.2d // [+ I(X) * R(ALPHA)]
|
||||
fmls v12.2d, v9.2d, v8.2d // [- R(X) * I(ALPHA)]
|
||||
#else
|
||||
fmul v11.2d, v9.2d, v7.2d // [+ R(X) * R(ALPHA)]
|
||||
fmls v11.2d, v10.2d, v8.2d // [- I(X) * I(ALPHA)]
|
||||
fmul v12.2d, v9.2d, v8.2d // [R(X) * I(ALPHA)]
|
||||
fneg v12.2d, v12.2d // [- R(X) * I(ALPHA)]
|
||||
fmla v12.2d, v10.2d, v7.2d // [+ I(X) * R(ALPHA)] NOTE(review): comment previously said -; confirm intended sign
|
||||
#endif
|
||||
#endif // CONJ
|
||||
|
||||
/****** INIT_LOOP FOR F1 AND S1 LOOP ******/
|
||||
ld1 {v2.2d}, [X_PTR] // [I(X), R(X)]
|
||||
ext v3.16b, v2.16b, v2.16b, #8 // [R(X), I(X)]
|
||||
fmul v2.2d, v0.2d, v2.2d
|
||||
fmla v2.2d, v1.2d, v3.2d // [I(TEMP), R(TEMP)]
|
||||
ins v3.d[0], v2.d[1] // I(TEMP)
|
||||
#if !defined(CONJ)
|
||||
#if !defined(XCONJ)
|
||||
fneg d4, d3 // -I(TEMP)
|
||||
ins v3.d[1], v4.d[0]
|
||||
ext v3.16b, v3.16b, v3.16b, #8 // [I(TEMP), -I(TEMP)]
|
||||
ins v2.d[1], v2.d[0] // [R(TEMP), R(TEMP)]
|
||||
#else
|
||||
fneg d4, d3 // -I(TEMP)
|
||||
ins v3.d[1], v4.d[0] // [-I(TEMP), I(TEMP)]
|
||||
ins v2.d[1], v2.d[0] // [R(TEMP), R(TEMP)]
|
||||
#endif
|
||||
#else // CONJ
|
||||
#if !defined(XCONJ)
|
||||
ins v3.d[1], v3.d[0] // [I(TEMP), I(TEMP)]
|
||||
fneg d4, d2 // -R(TEMP)
|
||||
ins v2.d[1], v4.d[0] // [-R(TEMP), R(TEMP)]
|
||||
#else
|
||||
fneg d3, d3 // -I(TEMP)
|
||||
ins v3.d[1], v3.d[0] // [-I(TEMP), -I(TEMP)]
|
||||
fneg d4, d2 // -R(TEMP)
|
||||
ins v2.d[1], v4.d[0] // [-R(TEMP), R(TEMP)]
|
||||
#endif
|
||||
#endif // CONJ
|
||||
|
||||
#endif // DOUBLE
|
||||
.endm
|
||||
|
||||
/*
 * KERNEL_F4: update four consecutive complex elements of Y from four
 * consecutive elements of the current column of A.
 * ld2 de-interleaves the (real, imag) pairs, so v13/v17 hold A_R,
 * v14/v18 hold A_I, v15/v19 hold the real parts of Y and v16/v20 the
 * imaginary parts. v11 and v12 are the products prepared by the F4 part
 * of INIT_LOOP. Results are written back through Y_OPTR with st2.
 *   non-DOUBLE: one pass covers four elements.
 *   DOUBLE:     two passes of two elements each.
 * NOTE(review): the driver loop in this file labelled zgemv_n_kernel_F4
 * unrolls KERNEL_F1 four times and does not invoke this macro.
 */
.macro KERNEL_F4
|
||||
#if !defined(DOUBLE)
|
||||
|
||||
ld2 {v13.4s, v14.4s}, [A_PTR], #32
|
||||
ld2 {v15.4s, v16.4s}, [Y_IPTR], #32
|
||||
#if !defined(CONJ)
|
||||
#if !defined(XCONJ)
|
||||
fmla v15.4s, v11.4s, v13.4s // [+ R(ALPHA * X) * A_R]
|
||||
fmls v15.4s, v12.4s, v14.4s // [- I(ALPHA * X) * A_I]
|
||||
fmla v16.4s, v11.4s, v14.4s // [+ R(ALPHA * X) * A_I]
|
||||
fmla v16.4s, v12.4s, v13.4s // [+ I(ALPHA * X) * A_R]
|
||||
#else
|
||||
fmla v15.4s, v11.4s, v13.4s // [+ R(ALPHA * X) * A_R]
|
||||
fmla v15.4s, v12.4s, v14.4s // [+ I(ALPHA * X) * A_I]
|
||||
fmla v16.4s, v11.4s, v14.4s // [+ R(ALPHA * X) * A_I]
|
||||
fmls v16.4s, v12.4s, v13.4s // [- I(ALPHA * X) * A_R]
|
||||
#endif
|
||||
#else // CONJ
|
||||
#if !defined(XCONJ)
|
||||
fmla v15.4s, v11.4s, v13.4s // [+ R(ALPHA * X) * A_R]
|
||||
fmla v15.4s, v12.4s, v14.4s // [+ I(ALPHA * X) * A_I]
|
||||
fmls v16.4s, v11.4s, v14.4s // [- R(ALPHA * X) * A_I]
|
||||
fmla v16.4s, v12.4s, v13.4s // [+ I(ALPHA * X) * A_R]
|
||||
#else
|
||||
fmla v15.4s, v11.4s, v13.4s // [+ R(ALPHA * X) * A_R]
|
||||
fmls v15.4s, v12.4s, v14.4s // [- I(ALPHA * X) * A_I]
|
||||
fmls v16.4s, v11.4s, v14.4s // [- R(ALPHA * X) * A_I]
|
||||
fmls v16.4s, v12.4s, v13.4s // [- I(ALPHA * X) * A_R]
|
||||
#endif
|
||||
#endif // CONJ
|
||||
st2 {v15.4s, v16.4s}, [Y_OPTR], #32
|
||||
|
||||
#else // DOUBLE
|
||||
|
||||
ld2 {v13.2d, v14.2d}, [A_PTR], #32
|
||||
ld2 {v15.2d, v16.2d}, [Y_IPTR], #32
|
||||
#if !defined(CONJ)
|
||||
#if !defined(XCONJ)
|
||||
fmla v15.2d, v11.2d, v13.2d // [+ R(ALPHA * X) * A_R]
|
||||
fmls v15.2d, v12.2d, v14.2d // [- I(ALPHA * X) * A_I]
|
||||
fmla v16.2d, v11.2d, v14.2d // [+ R(ALPHA * X) * A_I]
|
||||
fmla v16.2d, v12.2d, v13.2d // [+ I(ALPHA * X) * A_R]
|
||||
#else
|
||||
fmla v15.2d, v11.2d, v13.2d // [+ R(ALPHA * X) * A_R]
|
||||
fmla v15.2d, v12.2d, v14.2d // [+ I(ALPHA * X) * A_I]
|
||||
fmla v16.2d, v11.2d, v14.2d // [+ R(ALPHA * X) * A_I]
|
||||
fmls v16.2d, v12.2d, v13.2d // [- I(ALPHA * X) * A_R]
|
||||
#endif
|
||||
#else // CONJ
|
||||
#if !defined(XCONJ)
|
||||
fmla v15.2d, v11.2d, v13.2d // [+ R(ALPHA * X) * A_R]
|
||||
fmla v15.2d, v12.2d, v14.2d // [+ I(ALPHA * X) * A_I]
|
||||
fmls v16.2d, v11.2d, v14.2d // [- R(ALPHA * X) * A_I]
|
||||
fmla v16.2d, v12.2d, v13.2d // [+ I(ALPHA * X) * A_R]
|
||||
#else
|
||||
fmla v15.2d, v11.2d, v13.2d // [+ R(ALPHA * X) * A_R]
|
||||
fmls v15.2d, v12.2d, v14.2d // [- I(ALPHA * X) * A_I]
|
||||
fmls v16.2d, v11.2d, v14.2d // [- R(ALPHA * X) * A_I]
|
||||
fmls v16.2d, v12.2d, v13.2d // [- I(ALPHA * X) * A_R]
|
||||
#endif
|
||||
#endif // CONJ
|
||||
st2 {v15.2d, v16.2d}, [Y_OPTR], #32
|
||||
|
||||
ld2 {v17.2d, v18.2d}, [A_PTR], #32
|
||||
ld2 {v19.2d, v20.2d}, [Y_IPTR], #32
|
||||
#if !defined(CONJ)
|
||||
#if !defined(XCONJ)
|
||||
fmla v19.2d, v11.2d, v17.2d // [+ R(ALPHA * X) * A_R]
|
||||
fmls v19.2d, v12.2d, v18.2d // [- I(ALPHA * X) * A_I]
|
||||
fmla v20.2d, v11.2d, v18.2d // [+ R(ALPHA * X) * A_I]
|
||||
fmla v20.2d, v12.2d, v17.2d // [+ I(ALPHA * X) * A_R]
|
||||
#else
|
||||
fmla v19.2d, v11.2d, v17.2d // [+ R(ALPHA * X) * A_R]
|
||||
fmla v19.2d, v12.2d, v18.2d // [+ I(ALPHA * X) * A_I]
|
||||
fmla v20.2d, v11.2d, v18.2d // [+ R(ALPHA * X) * A_I]
|
||||
fmls v20.2d, v12.2d, v17.2d // [- I(ALPHA * X) * A_R]
|
||||
#endif
|
||||
#else // CONJ
|
||||
#if !defined(XCONJ)
|
||||
fmla v19.2d, v11.2d, v17.2d // [+ R(ALPHA * X) * A_R]
|
||||
fmla v19.2d, v12.2d, v18.2d // [+ I(ALPHA * X) * A_I]
|
||||
fmls v20.2d, v11.2d, v18.2d // [- R(ALPHA * X) * A_I]
|
||||
fmla v20.2d, v12.2d, v17.2d // [+ I(ALPHA * X) * A_R]
|
||||
#else
|
||||
fmla v19.2d, v11.2d, v17.2d // [+ R(ALPHA * X) * A_R]
|
||||
fmls v19.2d, v12.2d, v18.2d // [- I(ALPHA * X) * A_I]
|
||||
fmls v20.2d, v11.2d, v18.2d // [- R(ALPHA * X) * A_I]
|
||||
fmls v20.2d, v12.2d, v17.2d // [- I(ALPHA * X) * A_R]
|
||||
#endif
|
||||
#endif // CONJ
|
||||
st2 {v19.2d, v20.2d}, [Y_OPTR], #32
|
||||
|
||||
#endif
|
||||
|
||||
.endm
|
||||
|
||||
/*
 * KERNEL_F1: update one complex element of Y on the unit-stride path.
 * Loads one (real, imag) pair from A_PTR and from Y_IPTR, adds
 * v2 * A and v3 * swapped(A) to the Y pair, and stores it through Y_OPTR.
 * All three pointers advance by one pair (8 or 16 bytes).
 * v2 and v3 are the multiplier vectors prepared by INIT_LOOP.
 */
.macro KERNEL_F1
|
||||
#if !defined(DOUBLE)
|
||||
ld1 {v4.2s}, [A_PTR], #8 // v4 = [A_I, A_R]
|
||||
ld1 {v5.2s}, [Y_IPTR], #8 // v5 = [Y_I, Y_R]
|
||||
ext v6.8b, v4.8b, v4.8b, #4 // v6 = [A_R, A_I]
|
||||
fmla v5.2s, v2.2s, v4.2s
|
||||
fmla v5.2s, v3.2s, v6.2s
|
||||
st1 {v5.2s}, [Y_OPTR], #8
|
||||
#else // DOUBLE
|
||||
ld1 {v4.2d}, [A_PTR], #16 // v4 = [A_I, A_R]
|
||||
ld1 {v5.2d}, [Y_IPTR], #16 // v5 = [Y_I, Y_R]
|
||||
ext v6.16b, v4.16b, v4.16b, #8 // v6 = [A_R, A_I]
|
||||
fmla v5.2d, v2.2d, v4.2d
|
||||
fmla v5.2d, v3.2d, v6.2d
|
||||
st1 {v5.2d}, [Y_OPTR], #16
|
||||
#endif
|
||||
.endm
|
||||
|
||||
/*
 * INIT_S: set up the strided-Y path. Shifts INC_Y left by SHZ (3 in the
 * non-DOUBLE build, 4 in the DOUBLE build) so it can be used as the byte
 * post-increment of the Y loads and stores in KERNEL_S1.
 */
.macro INIT_S
|
||||
lsl INC_Y, INC_Y, #SHZ
|
||||
.endm
|
||||
|
||||
/*
 * KERNEL_S1: update one complex element of Y on the strided-Y path.
 * Same arithmetic as KERNEL_F1; A_PTR still advances by one pair, while
 * Y_IPTR and Y_OPTR advance by INC_Y (already scaled to bytes by INIT_S).
 * v2 and v3 are the multiplier vectors prepared by INIT_LOOP.
 */
.macro KERNEL_S1
|
||||
#if !defined(DOUBLE)
|
||||
ld1 {v4.2s}, [A_PTR], #8 // v4 = [A_I, A_R]
|
||||
ld1 {v5.2s}, [Y_IPTR], INC_Y // v5 = [Y_I, Y_R]
|
||||
ext v6.8b, v4.8b, v4.8b, #4 // v6 = [A_R, A_I]
|
||||
fmla v5.2s, v2.2s, v4.2s
|
||||
fmla v5.2s, v3.2s, v6.2s
|
||||
st1 {v5.2s}, [Y_OPTR], INC_Y
|
||||
#else // DOUBLE
|
||||
ld1 {v4.2d}, [A_PTR], #16 // v4 = [A_I, A_R]
|
||||
ld1 {v5.2d}, [Y_IPTR], INC_Y // v5 = [Y_I, Y_R]
|
||||
ext v6.16b, v4.16b, v4.16b, #8 // v6 = [A_R, A_I]
|
||||
fmla v5.2d, v2.2d, v4.2d
|
||||
fmla v5.2d, v3.2d, v6.2d
|
||||
st1 {v5.2d}, [Y_OPTR], INC_Y
|
||||
#endif
|
||||
.endm
|
||||
|
||||
/*******************************************************************************
|
||||
* End of macro definitions
|
||||
*******************************************************************************/
|
||||
|
||||
/*
 * Driver for the complex gemv (N variant) kernel.
 * INC_Y is read from the stack at [sp] on entry. After saving registers:
 *   - skips straight to the exit when N <= 0 or M <= 0;
 *   - scales LDA and INC_X to bytes (shift by SHZ) and runs INIT;
 *   - outer loop, J = N iterations: each iteration takes the next element
 *     of X (X += INC_X), runs INIT_LOOP on it, then sweeps M elements of
 *     the current column of A into Y, and finally advances A by LDA;
 *   - INC_Y == 1 uses KERNEL_F1, otherwise INIT_S scales INC_Y and
 *     KERNEL_S1 is used. Both inner loops are unrolled by four with a
 *     remainder loop for M % 4.
 * Restores registers and returns 0 in w0.
 */
PROLOGUE
|
||||
|
||||
ldr INC_Y, [sp] // must be read before SAVE_REGS moves sp
|
||||
|
||||
SAVE_REGS
|
||||
|
||||
cmp N, xzr
|
||||
ble zgemv_n_kernel_L999
|
||||
cmp M, xzr
|
||||
ble zgemv_n_kernel_L999
|
||||
|
||||
lsl LDA, LDA, #SHZ // column stride in bytes
|
||||
lsl INC_X, INC_X, #SHZ // X stride in bytes
|
||||
mov J, N // outer-loop counter
|
||||
|
||||
INIT
|
||||
|
||||
cmp INC_Y, #1
|
||||
bne zgemv_n_kernel_S_BEGIN
|
||||
|
||||
zgemv_n_kernel_F_LOOP: // unit-stride Y, one column per iteration
|
||||
mov A_PTR, A
|
||||
mov Y_IPTR, Y // Y read pointer restarts for every column
|
||||
mov Y_OPTR, Y // Y write pointer restarts for every column
|
||||
mov X_PTR, X
|
||||
add X, X, INC_X
|
||||
INIT_LOOP
|
||||
|
||||
asr I, M, #2 // I = M / 4
|
||||
cmp I, xzr
|
||||
beq zgemv_n_kernel_F1
|
||||
|
||||
zgemv_n_kernel_F4: // four elements per iteration
|
||||
|
||||
KERNEL_F1
|
||||
KERNEL_F1
|
||||
KERNEL_F1
|
||||
KERNEL_F1
|
||||
|
||||
subs I, I, #1
|
||||
bne zgemv_n_kernel_F4
|
||||
|
||||
zgemv_n_kernel_F1:
|
||||
|
||||
ands I, M, #3 // I = M % 4 leftover elements
|
||||
ble zgemv_n_kernel_F_END
|
||||
|
||||
zgemv_n_kernel_F10:
|
||||
|
||||
KERNEL_F1
|
||||
|
||||
subs I, I, #1
|
||||
bne zgemv_n_kernel_F10
|
||||
|
||||
zgemv_n_kernel_F_END:
|
||||
|
||||
add A, A, LDA // next column
|
||||
subs J, J, #1
|
||||
bne zgemv_n_kernel_F_LOOP
|
||||
|
||||
b zgemv_n_kernel_L999
|
||||
|
||||
zgemv_n_kernel_S_BEGIN: // strided Y
|
||||
|
||||
INIT_S
|
||||
|
||||
zgemv_n_kernel_S_LOOP: // one column per iteration
|
||||
mov A_PTR, A
|
||||
mov Y_IPTR, Y // Y read pointer restarts for every column
|
||||
mov Y_OPTR, Y // Y write pointer restarts for every column
|
||||
mov X_PTR, X
|
||||
add X, X, INC_X
|
||||
INIT_LOOP
|
||||
|
||||
asr I, M, #2 // I = M / 4
|
||||
cmp I, xzr
|
||||
ble zgemv_n_kernel_S1
|
||||
|
||||
zgemv_n_kernel_S4: // four elements per iteration
|
||||
|
||||
KERNEL_S1
|
||||
KERNEL_S1
|
||||
KERNEL_S1
|
||||
KERNEL_S1
|
||||
|
||||
subs I, I, #1
|
||||
bne zgemv_n_kernel_S4
|
||||
|
||||
zgemv_n_kernel_S1:
|
||||
|
||||
ands I, M, #3 // I = M % 4 leftover elements
|
||||
ble zgemv_n_kernel_S_END
|
||||
|
||||
zgemv_n_kernel_S10:
|
||||
|
||||
KERNEL_S1
|
||||
|
||||
subs I, I, #1
|
||||
bne zgemv_n_kernel_S10
|
||||
|
||||
zgemv_n_kernel_S_END:
|
||||
|
||||
add A, A, LDA // next column
|
||||
subs J, J, #1
|
||||
bne zgemv_n_kernel_S_LOOP
|
||||
|
||||
zgemv_n_kernel_L999:
|
||||
RESTORE_REGS
|
||||
|
||||
mov w0, wzr // return 0
|
||||
ret
|
||||
|
||||
EPILOGUE
|
|
@ -0,0 +1,448 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2015, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
||||
#define M x0 /* Y vector length */
|
||||
#define N x1 /* X vector length */
|
||||
#define A x3 /* A vector address */
|
||||
#define LDA x4 /* A stride */
|
||||
#define X x5 /* X vector address */
|
||||
#define INC_X x6 /* X stride */
|
||||
#define Y x7 /* Y vector address */
|
||||
#define INC_Y x2 /* Y stride */
|
||||
#define A_PTR x9 /* loop A vector address */
|
||||
#define X_PTR x10 /* loop X vector address */
|
||||
#define J x11 /* loop variable */
|
||||
#define I x12 /* loop variable */
|
||||
|
||||
/*******************************************************************************
|
||||
* Macro definitions
|
||||
*******************************************************************************/
|
||||
|
||||
/* Precision-dependent names.
 * ALPHA_R / ALPHA_I are read by INIT before anything writes them, so they
 * are inputs to the kernel. The *_COPY names are not referenced by the
 * zgemv_t code that follows. SHZ is the shift that converts a count of
 * complex elements to bytes (1 << 3 = two floats, 1 << 4 = two doubles). */
#if !defined(DOUBLE)
|
||||
#define ALPHA_R s0
|
||||
#define ALPHA_I s1
|
||||
#define ALPHA_R_COPY s7
|
||||
#define ALPHA_I_COPY s8
|
||||
#define SHZ 3
|
||||
#else
|
||||
#define ALPHA_R d0
|
||||
#define ALPHA_I d1
|
||||
#define ALPHA_R_COPY d7
|
||||
#define ALPHA_I_COPY d8
|
||||
#define SHZ 4
|
||||
#endif
|
||||
|
||||
/******************************************************************************/
|
||||
|
||||
|
||||
/* Reserve 11 * 16 bytes of stack and store d8-d17 and x18-x28 into it.
 * RESTORE_REGS reloads from the same slot layout and releases the space. */
.macro SAVE_REGS
|
||||
add sp, sp, #-(11 * 16) // grow the stack by eleven 16-byte slots
|
||||
stp d8, d9, [sp, #(0 * 16)]
|
||||
stp d10, d11, [sp, #(1 * 16)]
|
||||
stp d12, d13, [sp, #(2 * 16)]
|
||||
stp d14, d15, [sp, #(3 * 16)]
|
||||
stp d16, d17, [sp, #(4 * 16)]
|
||||
stp x18, x19, [sp, #(5 * 16)]
|
||||
stp x20, x21, [sp, #(6 * 16)]
|
||||
stp x22, x23, [sp, #(7 * 16)]
|
||||
stp x24, x25, [sp, #(8 * 16)]
|
||||
stp x26, x27, [sp, #(9 * 16)]
|
||||
str x28, [sp, #(10 * 16)] // last slot holds a single register
|
||||
.endm
|
||||
|
||||
/* Reload d8-d17 and x18-x28 from the slots written by SAVE_REGS, then
 * release the 11 * 16 bytes of stack. sp must be unchanged since SAVE_REGS. */
.macro RESTORE_REGS
|
||||
ldp d8, d9, [sp, #(0 * 16)]
|
||||
ldp d10, d11, [sp, #(1 * 16)]
|
||||
ldp d12, d13, [sp, #(2 * 16)]
|
||||
ldp d14, d15, [sp, #(3 * 16)]
|
||||
ldp d16, d17, [sp, #(4 * 16)]
|
||||
ldp x18, x19, [sp, #(5 * 16)]
|
||||
ldp x20, x21, [sp, #(6 * 16)]
|
||||
ldp x22, x23, [sp, #(7 * 16)]
|
||||
ldp x24, x25, [sp, #(8 * 16)]
|
||||
ldp x26, x27, [sp, #(9 * 16)]
|
||||
ldr x28, [sp, #(10 * 16)]
|
||||
add sp, sp, #(11*16) // pop the save area
|
||||
.endm
|
||||
|
||||
/* Build the two alpha vectors used when a finished dot product is folded
 * into Y (see the *_F_END / *_S_END code): Y += v0 * TEMP + v1 * swap(TEMP).
 * Lane lists in the comments below are written high lane first.
 * Uses v2 as scratch. */
.macro INIT
|
||||
#if !defined(XCONJ)
|
||||
#if !defined(DOUBLE)
|
||||
ins v0.s[1], v0.s[0] // v0 = ALPHA_R, ALPHA_R
|
||||
fneg s2, ALPHA_I // s2 = -ALPHA_I
|
||||
ins v1.s[1], v2.s[0] // v1 = -ALPHA_I, ALPHA_I
|
||||
ext v1.8b, v1.8b, v1.8b, #4 // v1 = ALPHA_I, -ALPHA_I
|
||||
#else
|
||||
ins v0.d[1], v0.d[0] // v0 = ALPHA_R, ALPHA_R
|
||||
fneg d2, ALPHA_I // d2 = -ALPHA_I
|
||||
ins v1.d[1], v2.d[0] // v1 = -ALPHA_I, ALPHA_I
|
||||
ext v1.16b, v1.16b, v1.16b, #8 // v1 = ALPHA_I, -ALPHA_I
|
||||
#endif
|
||||
#else // XCONJ
|
||||
#if !defined(DOUBLE)
|
||||
fneg s2, ALPHA_R // s2 = -ALPHA_R
|
||||
ins v0.s[1], v2.s[0] // v0 = -ALPHA_R, ALPHA_R
|
||||
ins v1.s[1], v1.s[0] // v1 = ALPHA_I, ALPHA_I
|
||||
#else
|
||||
fneg d2, ALPHA_R // d2 = -ALPHA_R
|
||||
ins v0.d[1], v2.d[0] // v0 = -ALPHA_R, ALPHA_R
|
||||
ins v1.d[1], v1.d[0] // v1 = ALPHA_I, ALPHA_I
|
||||
#endif
|
||||
#endif
|
||||
.endm
|
||||
|
||||
/* Clear the per-column accumulators before each dot product:
 * v9/v10 (and v15/v16 in double precision) are the KERNEL_F4 partial sums,
 * v2 is the [real, imag] result that KERNEL_F1/KERNEL_S1 accumulate into.
 * The vector forms of these registers are used afterwards, so this relies on
 * a scalar d-register write clearing the whole vector register. */
.macro INIT_LOOP
|
||||
fmov d9, xzr // TEMP_R = [0, 0]
|
||||
fmov d10, xzr // TEMP_I = [0, 0]
|
||||
#if !defined(DOUBLE)
|
||||
#else
|
||||
fmov d15, xzr // TEMP_R = [0, 0]
|
||||
fmov d16, xzr // TEMP_I = [0, 0]
|
||||
#endif
|
||||
|
||||
fmov d2, xzr // TEMP = [0, 0]
|
||||
.endm
|
||||
|
||||
/* Unit-stride step over four complex elements of X and of the current A
 * column. ld2 splits real and imaginary parts into separate registers; the
 * products are accumulated into v9 (real sums) and v10 (imaginary sums).
 * Double precision handles the four elements as two pairs and keeps a second
 * accumulator pair v15/v16 for the second pair. CONJ and XCONJ only select
 * the sign (fmla vs fmls) of individual product terms.
 * Post-increments X_PTR and A_PTR by 32 bytes per load. */
.macro KERNEL_F4
|
||||
#if !defined(DOUBLE)
|
||||
|
||||
ld2 {v11.4s, v12.4s}, [X_PTR], #32
|
||||
ld2 {v13.4s, v14.4s}, [A_PTR], #32
|
||||
|
||||
#if !defined(CONJ)
|
||||
#if !defined(XCONJ)
|
||||
fmla v9.4s, v11.4s, v13.4s // [+ R(X) * A_R]
|
||||
fmls v9.4s, v12.4s, v14.4s // [- I(X) * A_I]
|
||||
fmla v10.4s, v11.4s, v14.4s // [+ R(X) * A_I]
|
||||
fmla v10.4s, v12.4s, v13.4s // [+ I(X) * A_R]
|
||||
#else
|
||||
fmla v9.4s, v11.4s, v13.4s // [+ R(X) * A_R]
|
||||
fmla v9.4s, v12.4s, v14.4s // [+ I(X) * A_I]
|
||||
fmla v10.4s, v11.4s, v14.4s // [+ R(X) * A_I]
|
||||
fmls v10.4s, v12.4s, v13.4s // [- I(X) * A_R]
|
||||
#endif
|
||||
#else // CONJ
|
||||
#if !defined(XCONJ)
|
||||
fmla v9.4s, v11.4s, v13.4s // [+ R(X) * A_R]
|
||||
fmla v9.4s, v12.4s, v14.4s // [+ I(X) * A_I]
|
||||
fmls v10.4s, v11.4s, v14.4s // [- R(X) * A_I]
|
||||
fmla v10.4s, v12.4s, v13.4s // [+ I(X) * A_R]
|
||||
#else
|
||||
fmla v9.4s, v11.4s, v13.4s // [+ R(X) * A_R]
|
||||
fmls v9.4s, v12.4s, v14.4s // [- I(X) * A_I]
|
||||
fmls v10.4s, v11.4s, v14.4s // [- R(X) * A_I]
|
||||
fmls v10.4s, v12.4s, v13.4s // [- I(X) * A_R]
|
||||
#endif
|
||||
#endif // CONJ
|
||||
|
||||
#else // DOUBLE
|
||||
ld2 {v11.2d, v12.2d}, [X_PTR], #32
|
||||
ld2 {v13.2d, v14.2d}, [A_PTR], #32
|
||||
prfm PLDL1STRM, [X_PTR, #512]
|
||||
#if !defined(CONJ)
|
||||
#if !defined(XCONJ)
|
||||
fmla v9.2d, v11.2d, v13.2d // [+ R(X) * A_R]
|
||||
fmls v9.2d, v12.2d, v14.2d // [- I(X) * A_I]
|
||||
fmla v10.2d, v11.2d, v14.2d // [+ R(X) * A_I]
|
||||
fmla v10.2d, v12.2d, v13.2d // [+ I(X) * A_R]
|
||||
#else
|
||||
fmla v9.2d, v11.2d, v13.2d // [+ R(X) * A_R]
|
||||
fmla v9.2d, v12.2d, v14.2d // [+ I(X) * A_I]
|
||||
fmla v10.2d, v11.2d, v14.2d // [+ R(X) * A_I]
|
||||
fmls v10.2d, v12.2d, v13.2d // [- I(X) * A_R]
|
||||
#endif
|
||||
#else // CONJ
|
||||
#if !defined(XCONJ)
|
||||
fmla v9.2d, v11.2d, v13.2d // [+ R(X) * A_R]
|
||||
fmla v9.2d, v12.2d, v14.2d // [+ I(X) * A_I]
|
||||
fmls v10.2d, v11.2d, v14.2d // [- R(X) * A_I]
|
||||
fmla v10.2d, v12.2d, v13.2d // [+ I(X) * A_R]
|
||||
#else
|
||||
fmla v9.2d, v11.2d, v13.2d // [+ R(X) * A_R]
|
||||
fmls v9.2d, v12.2d, v14.2d // [- I(X) * A_I]
|
||||
fmls v10.2d, v11.2d, v14.2d // [- R(X) * A_I]
|
||||
fmls v10.2d, v12.2d, v13.2d // [- I(X) * A_R]
|
||||
#endif
|
||||
#endif // CONJ
|
||||
ld2 {v17.2d, v18.2d}, [X_PTR], #32
|
||||
ld2 {v19.2d, v20.2d}, [A_PTR], #32
|
||||
prfm PLDL1STRM, [A_PTR, #512]
|
||||
#if !defined(CONJ)
|
||||
#if !defined(XCONJ)
|
||||
fmla v15.2d, v17.2d, v19.2d // [+ R(X) * A_R]
|
||||
fmls v15.2d, v18.2d, v20.2d // [- I(X) * A_I]
|
||||
fmla v16.2d, v17.2d, v20.2d // [+ R(X) * A_I]
|
||||
fmla v16.2d, v18.2d, v19.2d // [+ I(X) * A_R]
|
||||
#else
|
||||
fmla v15.2d, v17.2d, v19.2d // [+ R(X) * A_R]
|
||||
fmla v15.2d, v18.2d, v20.2d // [+ I(X) * A_I]
|
||||
fmla v16.2d, v17.2d, v20.2d // [+ R(X) * A_I]
|
||||
fmls v16.2d, v18.2d, v19.2d // [- I(X) * A_R]
|
||||
#endif
|
||||
#else // CONJ
|
||||
#if !defined(XCONJ)
|
||||
fmla v15.2d, v17.2d, v19.2d // [+ R(X) * A_R]
|
||||
fmla v15.2d, v18.2d, v20.2d // [+ I(X) * A_I]
|
||||
fmls v16.2d, v17.2d, v20.2d // [- R(X) * A_I]
|
||||
fmla v16.2d, v18.2d, v19.2d // [+ I(X) * A_R]
|
||||
#else
|
||||
fmla v15.2d, v17.2d, v19.2d // [+ R(X) * A_R]
|
||||
fmls v15.2d, v18.2d, v20.2d // [- I(X) * A_I]
|
||||
fmls v16.2d, v17.2d, v20.2d // [- R(X) * A_I]
|
||||
fmls v16.2d, v18.2d, v19.2d // [- I(X) * A_R]
|
||||
#endif
|
||||
#endif // CONJ
|
||||
#endif //DOUBLE
|
||||
.endm
|
||||
|
||||
/* Collapse the KERNEL_F4 partial sums into a single complex value in v2
 * (lane 0 = real sum, lane 1 = imaginary sum), which is where KERNEL_F1 and
 * the Y update expect it. Uses v21 as scratch in single precision. */
.macro KERNEL_F4_FINALIZE
|
||||
#if !defined(DOUBLE)
|
||||
ext v21.16b, v9.16b, v9.16b, #8 // v21 low half = upper two real sums
|
||||
fadd v9.2s, v9.2s, v21.2s // fold four lanes to two
|
||||
faddp s9, v9.2s // s9 = total real sum
|
||||
|
||||
ext v21.16b, v10.16b, v10.16b, #8 // same reduction for the imaginary sums
|
||||
fadd v10.2s, v10.2s, v21.2s
|
||||
faddp s10, v10.2s // s10 = total imaginary sum
|
||||
|
||||
ins v2.s[0], v9.s[0] // v2 lane 0 = real
|
||||
ins v2.s[1], v10.s[0] // v2 lane 1 = imaginary
|
||||
#else
|
||||
fadd v9.2d, v9.2d, v15.2d // merge the two real accumulators
|
||||
fadd v10.2d, v10.2d, v16.2d // merge the two imaginary accumulators
|
||||
|
||||
faddp d9, v9.2d // d9 = total real sum
|
||||
faddp d10, v10.2d // d10 = total imaginary sum
|
||||
|
||||
ins v2.d[0], v9.d[0] // v2 lane 0 = real
|
||||
ins v2.d[1], v10.d[0] // v2 lane 1 = imaginary
|
||||
#endif
|
||||
.endm
|
||||
|
||||
|
||||
/* Unit-stride step over one complex element: v2 += A * X as a complex
 * product, with v2 = [imag, real] (high lane first, as in the comments
 * below). A0/A1 are the real/imaginary parts of the A element, X0/X1 those
 * of the X element. The conditional ext picks which of the two cross terms
 * is negated, which is how CONJ/XCONJ are applied.
 * Clobbers v4-v7 and v16; advances A_PTR and X_PTR by one element. */
.macro KERNEL_F1
|
||||
#if !defined(DOUBLE)
|
||||
ld1r {v4.2s}, [A_PTR], #4 // [A0, A0]
|
||||
ld1 {v5.s}[0], [A_PTR], #4 // A1
|
||||
ld1 {v6.2s}, [X_PTR], #8 // [X1, X0]
|
||||
fneg s16, s5
|
||||
ins v5.s[1], v16.s[0] // [-A1, A1]
|
||||
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
||||
ext v5.8b, v5.8b, v5.8b, #4 // [A1, -A1]
|
||||
#endif
|
||||
ext v7.8b, v6.8b, v6.8b, #4 // [X0, X1]
|
||||
fmla v2.2s, v4.2s, v6.2s // += [A0*X1, A0*X0]
|
||||
fmla v2.2s, v5.2s, v7.2s // += signed [A1*X0, A1*X1]
|
||||
#else // DOUBLE
|
||||
ld1r {v4.2d}, [A_PTR], #8 // [A0, A0]
|
||||
ld1 {v5.d}[0], [A_PTR], #8 // A1
|
||||
ld1 {v6.2d}, [X_PTR], #16 // [X1, X0]
|
||||
fneg d16, d5
|
||||
ins v5.d[1], v16.d[0] // [-A1, A1]
|
||||
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
||||
ext v5.16b, v5.16b, v5.16b, #8 // [A1, -A1]
|
||||
#endif
|
||||
ext v7.16b, v6.16b, v6.16b, #8 // [X0, X1]
|
||||
fmla v2.2d, v4.2d, v6.2d // += [A0*X1, A0*X0]
|
||||
fmla v2.2d, v5.2d, v7.2d // += signed [A1*X0, A1*X1]
|
||||
#endif
|
||||
.endm
|
||||
|
||||
/* Strided-path setup: convert INC_X from complex elements to bytes so it can
 * be used directly as a post-index register in KERNEL_S1. */
.macro INIT_S
|
||||
lsl INC_X, INC_X, #SHZ
|
||||
.endm
|
||||
|
||||
/* Strided counterpart of KERNEL_F1: identical arithmetic (v2 += A * X for
 * one complex element), but X_PTR is advanced by the byte stride INC_X
 * prepared by INIT_S instead of by a fixed element size. A is still read
 * contiguously. Clobbers v4-v7 and v16. */
.macro KERNEL_S1
|
||||
#if !defined(DOUBLE)
|
||||
ld1r {v4.2s}, [A_PTR], #4 // [A0, A0]
|
||||
ld1 {v5.s}[0], [A_PTR], #4 // A1
|
||||
ld1 {v6.2s}, [X_PTR], INC_X // [X1, X0]
|
||||
fneg s16, s5
|
||||
ins v5.s[1], v16.s[0] // [-A1, A1]
|
||||
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
||||
ext v5.8b, v5.8b, v5.8b, #4 // [A1, -A1]
|
||||
#endif
|
||||
ext v7.8b, v6.8b, v6.8b, #4 // [X0, X1]
|
||||
fmla v2.2s, v4.2s, v6.2s // += [A0*X1, A0*X0]
|
||||
fmla v2.2s, v5.2s, v7.2s // += signed [A1*X0, A1*X1]
|
||||
#else // DOUBLE
|
||||
ld1r {v4.2d}, [A_PTR], #8 // [A0, A0]
|
||||
ld1 {v5.d}[0], [A_PTR], #8 // A1
|
||||
ld1 {v6.2d}, [X_PTR], INC_X // [X1, X0]
|
||||
fneg d16, d5
|
||||
ins v5.d[1], v16.d[0] // [-A1, A1]
|
||||
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
|
||||
ext v5.16b, v5.16b, v5.16b, #8 // [A1, -A1]
|
||||
#endif
|
||||
ext v7.16b, v6.16b, v6.16b, #8 // [X0, X1]
|
||||
fmla v2.2d, v4.2d, v6.2d // += [A0*X1, A0*X0]
|
||||
fmla v2.2d, v5.2d, v7.2d // += signed [A1*X0, A1*X1]
|
||||
#endif
|
||||
.endm
|
||||
|
||||
/*******************************************************************************
|
||||
* End of macro definitions
|
||||
*******************************************************************************/
|
||||
|
||||
/* Kernel body. For each of the N outer iterations it forms the complex dot
 * product of one A column (M elements) with X, then updates one Y element
 * with alpha times that sum (sign handling as set up by INIT).
 * Two variants of the loop exist: the F path for INC_X == 1 and the S path
 * for any other X stride. Returns 0 in w0. */
PROLOGUE
|
||||
|
||||
ldr INC_Y, [sp] // INC_Y is read from the stack before sp is moved
|
||||
SAVE_REGS
|
||||
|
||||
cmp N, xzr
|
||||
ble zgemv_t_kernel_L999 // nothing to do when N <= 0
|
||||
cmp M, xzr
|
||||
ble zgemv_t_kernel_L999 // nothing to do when M <= 0
|
||||
|
||||
lsl LDA, LDA, #SHZ // column stride in bytes
|
||||
lsl INC_Y, INC_Y, #SHZ // Y stride in bytes
|
||||
mov J, N
|
||||
|
||||
INIT
|
||||
|
||||
cmp INC_X, #1
|
||||
bne zgemv_t_kernel_S_BEGIN // non-unit X stride: take the strided path
|
||||
|
||||
zgemv_t_kernel_F_LOOP: // one iteration per Y element, unit-stride X
|
||||
|
||||
mov A_PTR, A
|
||||
mov X_PTR, X
|
||||
|
||||
INIT_LOOP
|
||||
|
||||
asr I, M, #2 // number of 4-element groups
|
||||
cmp I, xzr
|
||||
beq zgemv_t_kernel_F1
|
||||
|
||||
zgemv_t_kernel_F4:
|
||||
|
||||
KERNEL_F4
|
||||
|
||||
subs I, I, #1
|
||||
bne zgemv_t_kernel_F4
|
||||
|
||||
KERNEL_F4_FINALIZE // reduce the vector sums into v2
|
||||
|
||||
zgemv_t_kernel_F1:
|
||||
|
||||
ands I, M, #3 // remaining 0-3 elements
|
||||
ble zgemv_t_kernel_F_END
|
||||
|
||||
zgemv_t_kernel_F10:
|
||||
|
||||
KERNEL_F1
|
||||
|
||||
subs I, I, #1
|
||||
bne zgemv_t_kernel_F10
|
||||
|
||||
zgemv_t_kernel_F_END: // Y += alpha * (dot product held in v2)
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
ld1 {v4.2s}, [Y]
|
||||
ext v3.8b, v2.8b, v2.8b, #4 // [TEMP_R, TEMP_I]
|
||||
fmla v4.2s, v0.2s, v2.2s
|
||||
fmla v4.2s, v1.2s, v3.2s
|
||||
st1 {v4.2s}, [Y], INC_Y
|
||||
#else // DOUBLE
|
||||
ld1 {v4.2d}, [Y]
|
||||
ext v3.16b, v2.16b, v2.16b, #8 // [TEMP_R, TEMP_I]
|
||||
fmla v4.2d, v0.2d, v2.2d
|
||||
fmla v4.2d, v1.2d, v3.2d
|
||||
st1 {v4.2d}, [Y], INC_Y
|
||||
#endif
|
||||
|
||||
add A, A, LDA // next column of A
|
||||
subs J, J, #1
|
||||
bne zgemv_t_kernel_F_LOOP
|
||||
|
||||
b zgemv_t_kernel_L999
|
||||
|
||||
zgemv_t_kernel_S_BEGIN: // strided-X variant of the same loop nest
|
||||
|
||||
INIT_S
|
||||
|
||||
zgemv_t_kernel_S_LOOP:
|
||||
|
||||
mov A_PTR, A
|
||||
mov X_PTR, X
|
||||
INIT_LOOP
|
||||
|
||||
asr I, M, #2 // 4x unrolled scalar steps
|
||||
cmp I, xzr
|
||||
ble zgemv_t_kernel_S1
|
||||
|
||||
zgemv_t_kernel_S4:
|
||||
|
||||
KERNEL_S1
|
||||
KERNEL_S1
|
||||
KERNEL_S1
|
||||
KERNEL_S1
|
||||
|
||||
subs I, I, #1
|
||||
bne zgemv_t_kernel_S4
|
||||
|
||||
zgemv_t_kernel_S1:
|
||||
|
||||
ands I, M, #3 // remaining 0-3 elements
|
||||
ble zgemv_t_kernel_S_END
|
||||
|
||||
zgemv_t_kernel_S10:
|
||||
|
||||
KERNEL_S1
|
||||
|
||||
subs I, I, #1
|
||||
bne zgemv_t_kernel_S10
|
||||
|
||||
zgemv_t_kernel_S_END: // same Y update as zgemv_t_kernel_F_END
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
ld1 {v4.2s}, [Y]
|
||||
ext v3.8b, v2.8b, v2.8b, #4 // [TEMP_R, TEMP_I]
|
||||
fmla v4.2s, v0.2s, v2.2s
|
||||
fmla v4.2s, v1.2s, v3.2s
|
||||
st1 {v4.2s}, [Y], INC_Y
|
||||
#else // DOUBLE
|
||||
ld1 {v4.2d}, [Y]
|
||||
ext v3.16b, v2.16b, v2.16b, #8 // [TEMP_R, TEMP_I]
|
||||
fmla v4.2d, v0.2d, v2.2d
|
||||
fmla v4.2d, v1.2d, v3.2d
|
||||
st1 {v4.2d}, [Y], INC_Y
|
||||
#endif
|
||||
|
||||
add A, A, LDA // next column of A
|
||||
subs J, J, #1
|
||||
bne zgemv_t_kernel_S_LOOP
|
||||
|
||||
zgemv_t_kernel_L999: // common exit
|
||||
RESTORE_REGS
|
||||
mov w0, wzr // return 0
|
||||
ret
|
||||
|
||||
EPILOGUE
|
|
@ -0,0 +1,228 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2015, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
||||
#define N x0 /* vector length in complex elements */
|
||||
#define X x1 /* X vector address */
|
||||
#define INC_X x2 /* X stride; scaled to bytes by INIT_S on the strided path */
|
||||
#define I x5 /* loop counter */
|
||||
|
||||
/*******************************************************************************
|
||||
* Macro definitions
|
||||
*******************************************************************************/
|
||||
|
||||
/* Precision-dependent names. SSQ is the running sum of squares and also the
 * register the final fsqrt result is left in; TMPF is a scalar scratch.
 * TMPVF and SZ are not referenced anywhere in this kernel. */
#if !defined(DOUBLE)
|
||||
#define TMPF s6
|
||||
#define SSQ s0
|
||||
#define TMPVF {v6.s}[0]
|
||||
#define SZ 4
|
||||
#else
|
||||
#define TMPF d6
|
||||
#define SSQ d0
|
||||
#define TMPVF {v6.d}[0]
|
||||
#define SZ 8
|
||||
#endif
|
||||
|
||||
/******************************************************************************/
|
||||
|
||||
/* Unit-stride step over one complex element: SSQ += re*re + im*im.
 * Clobbers v1 and TMPF; advances X by one element. */
.macro KERNEL_F1
|
||||
#if !defined(DOUBLE)
|
||||
ld1 {v1.2s}, [X], #8 // [im, re]
|
||||
fmul v1.2s, v1.2s, v1.2s // [im*im, re*re]
|
||||
faddp TMPF, v1.2s // TMPF = re*re + im*im
|
||||
fadd SSQ, SSQ, TMPF
|
||||
#else
|
||||
ld1 {v1.2d}, [X], #16 // [im, re]
|
||||
fmul v1.2d, v1.2d, v1.2d // [im*im, re*re]
|
||||
faddp TMPF, v1.2d // TMPF = re*re + im*im
|
||||
fadd SSQ, SSQ, TMPF
|
||||
#endif
|
||||
.endm
|
||||
|
||||
/* Unit-stride step over eight complex elements (16 scalars). Squares are
 * accumulated lane-wise into two independent accumulators, v0 and v5, which
 * nrm2_kernel_F8_FINALIZE later reduces into SSQ. Real and imaginary parts
 * need no separation here because every scalar is simply squared and summed.
 * Only the single-precision path issues a prefetch. */
.macro KERNEL_F8
|
||||
#if !defined(DOUBLE)
|
||||
ld1 {v1.4s, v2.4s}, [X], #32 // scalars 0-7
|
||||
fmla v0.4s, v1.4s, v1.4s
|
||||
fmla v5.4s, v2.4s, v2.4s
|
||||
ld1 {v3.4s,v4.4s}, [X], #32 // scalars 8-15
|
||||
fmla v0.4s, v3.4s, v3.4s
|
||||
fmla v5.4s, v4.4s, v4.4s
|
||||
PRFM PLDL1KEEP, [X, #1024]
|
||||
#else // DOUBLE
|
||||
ld1 {v1.2d, v2.2d}, [X], #32 // scalars 0-3
|
||||
fmla v0.2d, v1.2d, v1.2d
|
||||
fmla v5.2d, v2.2d, v2.2d
|
||||
ld1 {v3.2d, v4.2d}, [X], #32 // scalars 4-7
|
||||
fmla v0.2d, v3.2d, v3.2d
|
||||
fmla v5.2d, v4.2d, v4.2d
|
||||
|
||||
ld1 {v16.2d, v17.2d}, [X], #32 // scalars 8-11
|
||||
fmla v0.2d, v16.2d, v16.2d
|
||||
fmla v5.2d, v17.2d, v17.2d
|
||||
ld1 {v18.2d, v19.2d}, [X], #32 // scalars 12-15
|
||||
fmla v0.2d, v18.2d, v18.2d
|
||||
fmla v5.2d, v19.2d, v19.2d
|
||||
#endif
|
||||
.endm
|
||||
|
||||
/* Reduce the two KERNEL_F8 vector accumulators (v0, v5) to the scalar SSQ.
 * Uses v1 as scratch in single precision. */
.macro nrm2_kernel_F8_FINALIZE
|
||||
#if !defined(DOUBLE)
|
||||
fadd v0.4s, v0.4s, v5.4s // merge the two accumulators
|
||||
ext v1.16b, v0.16b, v0.16b, #8 // v1 low half = upper two lanes of v0
|
||||
fadd v0.2s, v0.2s, v1.2s // fold four lanes to two
|
||||
faddp SSQ, v0.2s // fold two lanes to the scalar
|
||||
#else
|
||||
fadd v0.2d, v0.2d, v5.2d // merge the two accumulators
|
||||
faddp SSQ, v0.2d // fold two lanes to the scalar
|
||||
#endif
|
||||
.endm
|
||||
|
||||
/* Strided-path setup: scale INC_X from complex elements to bytes, then
 * consume the first element and initialise SSQ with its squared magnitude.
 * The caller must therefore reduce N by one afterwards. Clobbers v1. */
.macro INIT_S
|
||||
#if !defined(DOUBLE)
|
||||
lsl INC_X, INC_X, #3 // 8 bytes per complex float
|
||||
ld1 {v1.2s}, [X], INC_X
|
||||
fmul v1.2s, v1.2s, v1.2s
|
||||
faddp SSQ, v1.2s // SSQ = re*re + im*im of element 0
|
||||
#else
|
||||
lsl INC_X, INC_X, #4 // 16 bytes per complex double
|
||||
ld1 {v1.2d}, [X], INC_X
|
||||
fmul v1.2d, v1.2d, v1.2d
|
||||
faddp SSQ, v1.2d // SSQ = re*re + im*im of element 0
|
||||
#endif
|
||||
.endm
|
||||
|
||||
/* Strided step over one complex element: SSQ += re*re + im*im, advancing X
 * by the byte stride prepared in INIT_S. Clobbers v1 and TMPF. */
.macro KERNEL_S1
|
||||
#if !defined(DOUBLE)
|
||||
ld1 {v1.2s}, [X], INC_X
|
||||
fmul v1.2s, v1.2s, v1.2s
|
||||
faddp TMPF, v1.2s // TMPF = re*re + im*im
|
||||
fadd SSQ, SSQ, TMPF
|
||||
#else
|
||||
ld1 {v1.2d}, [X], INC_X
|
||||
fmul v1.2d, v1.2d, v1.2d
|
||||
faddp TMPF, v1.2d // TMPF = re*re + im*im
|
||||
fadd SSQ, SSQ, TMPF
|
||||
#endif
|
||||
.endm
|
||||
|
||||
/*******************************************************************************
|
||||
* End of macro definitions
|
||||
*******************************************************************************/
|
||||
|
||||
/* Kernel body: returns sqrt(sum of re*re + im*im over N complex elements)
 * in SSQ. Returns 0 (the cleared SSQ) when N <= 0 or INC_X <= 0.
 * NOTE(review): the squares are summed directly and no rescaling is done in
 * this code, so very large or very small inputs may overflow or underflow
 * the intermediate sum - confirm this is acceptable for callers. */
PROLOGUE
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
fmov SSQ, wzr // clear accumulator v0
|
||||
fmov s5, SSQ // clear second accumulator v5
|
||||
#else
|
||||
fmov SSQ, xzr // clear accumulator v0
|
||||
fmov d5, SSQ // clear second accumulator v5
|
||||
#endif
|
||||
|
||||
cmp N, xzr
|
||||
ble nrm2_kernel_zero
|
||||
cmp INC_X, xzr
|
||||
ble nrm2_kernel_zero
|
||||
cmp INC_X, #1
|
||||
bne nrm2_kernel_S_BEGIN // non-unit stride: take the strided path
|
||||
|
||||
nrm2_kernel_F_BEGIN: // unit-stride path
|
||||
|
||||
asr I, N, #3 // number of 8-element groups
|
||||
cmp I, xzr
|
||||
beq nrm2_kernel_F1_INIT
|
||||
|
||||
nrm2_kernel_F8:
|
||||
|
||||
KERNEL_F8
|
||||
|
||||
subs I, I, #1
|
||||
bne nrm2_kernel_F8
|
||||
|
||||
nrm2_kernel_F8_FINALIZE // reduce v0/v5 into the scalar SSQ
|
||||
|
||||
nrm2_kernel_F1:
|
||||
|
||||
ands I, N, #7 // remaining 0-7 elements
|
||||
ble nrm2_kernel_L999
|
||||
|
||||
nrm2_kernel_F10:
|
||||
|
||||
KERNEL_F1
|
||||
|
||||
subs I, I, #1
|
||||
bne nrm2_kernel_F10
|
||||
|
||||
b nrm2_kernel_L999
|
||||
|
||||
nrm2_kernel_F1_INIT: // no 8-element groups: go straight to the tail loop
|
||||
|
||||
b nrm2_kernel_F1
|
||||
|
||||
nrm2_kernel_S_BEGIN: // strided path
|
||||
|
||||
INIT_S // also consumes element 0
|
||||
|
||||
subs N, N, #1 // account for the element INIT_S consumed
|
||||
ble nrm2_kernel_L999
|
||||
|
||||
asr I, N, #2 // 4x unrolled scalar steps
|
||||
cmp I, xzr
|
||||
ble nrm2_kernel_S1
|
||||
|
||||
nrm2_kernel_S4:
|
||||
|
||||
KERNEL_S1
|
||||
KERNEL_S1
|
||||
KERNEL_S1
|
||||
KERNEL_S1
|
||||
|
||||
subs I, I, #1
|
||||
bne nrm2_kernel_S4
|
||||
|
||||
nrm2_kernel_S1:
|
||||
|
||||
ands I, N, #3 // remaining 0-3 elements
|
||||
ble nrm2_kernel_L999
|
||||
|
||||
nrm2_kernel_S10:
|
||||
|
||||
KERNEL_S1
|
||||
|
||||
subs I, I, #1
|
||||
bne nrm2_kernel_S10
|
||||
|
||||
nrm2_kernel_L999: // SSQ holds the sum of squares
|
||||
fsqrt SSQ, SSQ
|
||||
ret
|
||||
|
||||
nrm2_kernel_zero: // SSQ is still the zero written at entry
|
||||
ret
|
||||
|
||||
EPILOGUE
|
|
@ -0,0 +1,256 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2015, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
||||
#define N x0 /* vector length in complex elements */
|
||||
#define X x1 /* X vector address (updated in place) */
|
||||
#define INC_X x2 /* X stride; scaled to bytes by INIT_S on the strided path */
|
||||
#define Y x3 /* Y vector address (updated in place) */
|
||||
#define INC_Y x4 /* Y stride; scaled to bytes by INIT_S on the strided path */
|
||||
#define I x5 /* loop counter */
|
||||
|
||||
/*******************************************************************************
|
||||
* Macro definitions
|
||||
*******************************************************************************/
|
||||
|
||||
/* The two real coefficients applied by the kernels below:
 * X' = C*X + S*Y and Y' = C*Y - S*X. */
#if !defined(DOUBLE)
|
||||
#define C s0 /* coefficient C (input) */
|
||||
#define S s1 /* coefficient S (input) */
|
||||
#else
|
||||
#define C d0 /* coefficient C (input) */
|
||||
#define S d1 /* coefficient S (input) */
|
||||
#endif
|
||||
|
||||
/******************************************************************************/
|
||||
|
||||
/* Duplicate each coefficient into lane 1 so that one vector operation covers
 * both the real and imaginary part of a complex element. */
.macro INIT
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
ins v0.s[1], v0.s[0] // [C, C]
|
||||
ins v1.s[1], v1.s[0] // [S, S]
|
||||
#else
|
||||
ins v0.d[1], v0.d[0] // [C, C]
|
||||
ins v1.d[1], v1.d[0] // [S, S]
|
||||
#endif
|
||||
|
||||
.endm
|
||||
|
||||
/* Unit-stride update of one complex element of X and Y in place.
 * X0/X1 and Y0/Y1 in the comments are the two scalars (real, imaginary) of
 * that element. Both inputs are loaded before either result is stored.
 * Clobbers v2-v5. */
.macro KERNEL_F1
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
ld1 {v2.2s}, [X]
|
||||
ld1 {v3.2s}, [Y]
|
||||
fmul v4.2s, v0.2s, v2.2s // [C*X1, C*X0]
|
||||
fmla v4.2s, v1.2s, v3.2s // [C*X1 + S*Y1, C*X0 + S*Y0]
|
||||
fmul v5.2s, v0.2s, v3.2s // [C*Y1, C*Y0]
|
||||
fmls v5.2s, v1.2s, v2.2s // [C*Y1 - S*X1, C*Y0 - S*X0]
|
||||
st1 {v4.2s}, [X], #8
|
||||
st1 {v5.2s}, [Y], #8
|
||||
#else
|
||||
ld1 {v2.2d}, [X]
|
||||
ld1 {v3.2d}, [Y]
|
||||
fmul v4.2d, v0.2d, v2.2d // [C*X1, C*X0]
|
||||
fmla v4.2d, v1.2d, v3.2d // [C*X1 + S*Y1, C*X0 + S*Y0]
|
||||
fmul v5.2d, v0.2d, v3.2d // [C*Y1, C*Y0]
|
||||
fmls v5.2d, v1.2d, v2.2d // [C*Y1 - S*X1, C*Y0 - S*X0]
|
||||
st1 {v4.2d}, [X], #16
|
||||
st1 {v5.2d}, [Y], #16
|
||||
#endif
|
||||
|
||||
.endm
|
||||
|
||||
/* Single precision only: widen the two-lane coefficient vectors built by
 * INIT to four lanes for the .4s arithmetic in KERNEL_F4. Double precision
 * already fills both .2d lanes in INIT, so nothing is emitted there. */
.macro KERNEL_INIT_F4
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
ins v0.d[1], v0.d[0] // [C, C, C, C]
|
||||
ins v1.d[1], v1.d[0] // [S, S, S, S]
|
||||
#endif
|
||||
|
||||
.endm
|
||||
|
||||
/* Unit-stride update of four complex elements (eight scalars) of X and Y in
 * place. Single precision does it with one pair of two-register loads;
 * double precision repeats the same eight-instruction pattern twice, four
 * scalars at a time. Xn/Yn in the comments index scalars, not complex
 * elements. Clobbers v2-v7, v16, v17. */
.macro KERNEL_F4
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
ld1 {v2.4s, v3.4s}, [X]
|
||||
ld1 {v4.4s, v5.4s}, [Y]
|
||||
fmul v6.4s, v0.4s, v2.4s // C*X3, C*X2, C*X1, C*X0
|
||||
fmul v7.4s, v0.4s, v3.4s // C*X7, C*X6, C*X5, C*X4
|
||||
fmla v6.4s, v1.4s, v4.4s // C*X3+S*Y3, ..., C*X0+S*Y0
|
||||
fmla v7.4s, v1.4s, v5.4s // C*X7+S*Y7, ..., C*X4+S*Y4
|
||||
fmul v16.4s, v0.4s, v4.4s // C*Y3, C*Y2, C*Y1, C*Y0
|
||||
fmul v17.4s, v0.4s, v5.4s // C*Y7, C*Y6, C*Y5, C*Y4
|
||||
fmls v16.4s, v1.4s, v2.4s // C*Y3-S*X3, ..., C*Y0-S*X0
|
||||
fmls v17.4s, v1.4s, v3.4s // C*Y7-S*X7, ..., C*Y4-S*X4
|
||||
st1 {v6.4s,v7.4s}, [X], #32
|
||||
st1 {v16.4s,v17.4s}, [Y], #32
|
||||
#else // DOUBLE
|
||||
ld1 {v2.2d, v3.2d}, [X] // first four scalars
|
||||
ld1 {v4.2d, v5.2d}, [Y]
|
||||
fmul v6.2d, v0.2d, v2.2d // C*X1, C*X0
|
||||
fmul v7.2d, v0.2d, v3.2d // C*X3, C*X2
|
||||
fmla v6.2d, v1.2d, v4.2d // C*X1+S*Y1, C*X0+S*Y0
|
||||
fmla v7.2d, v1.2d, v5.2d // C*X3+S*Y3, C*X2+S*Y2
|
||||
fmul v16.2d, v0.2d, v4.2d // C*Y1, C*Y0
|
||||
fmul v17.2d, v0.2d, v5.2d // C*Y3, C*Y2
|
||||
fmls v16.2d, v1.2d, v2.2d // C*Y1-S*X1, C*Y0-S*X0
|
||||
fmls v17.2d, v1.2d, v3.2d // C*Y3-S*X3, C*Y2-S*X2
|
||||
st1 {v6.2d,v7.2d}, [X], #32
|
||||
st1 {v16.2d,v17.2d}, [Y], #32
|
||||
ld1 {v2.2d, v3.2d}, [X] // next four scalars
|
||||
ld1 {v4.2d, v5.2d}, [Y]
|
||||
fmul v6.2d, v0.2d, v2.2d // C*X5, C*X4
|
||||
fmul v7.2d, v0.2d, v3.2d // C*X7, C*X6
|
||||
fmla v6.2d, v1.2d, v4.2d // C*X5+S*Y5, C*X4+S*Y4
|
||||
fmla v7.2d, v1.2d, v5.2d // C*X7+S*Y7, C*X6+S*Y6
|
||||
fmul v16.2d, v0.2d, v4.2d // C*Y5, C*Y4
|
||||
fmul v17.2d, v0.2d, v5.2d // C*Y7, C*Y6
|
||||
fmls v16.2d, v1.2d, v2.2d // C*Y5-S*X5, C*Y4-S*X4
|
||||
fmls v17.2d, v1.2d, v3.2d // C*Y7-S*X7, C*Y6-S*X6
|
||||
st1 {v6.2d,v7.2d}, [X], #32
|
||||
st1 {v16.2d,v17.2d}, [Y], #32
|
||||
#endif
|
||||
|
||||
.endm
|
||||
|
||||
/* Strided-path setup: convert both strides from complex elements to bytes
 * (8 bytes per element in single precision, 16 in double). */
.macro INIT_S
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
lsl INC_X, INC_X, #3
|
||||
lsl INC_Y, INC_Y, #3
|
||||
#else
|
||||
lsl INC_X, INC_X, #4
|
||||
lsl INC_Y, INC_Y, #4
|
||||
#endif
|
||||
|
||||
.endm
|
||||
|
||||
/* Strided counterpart of KERNEL_F1: same in-place update of one complex
 * element of X and Y, but the pointers advance by the byte strides prepared
 * in INIT_S. Clobbers v2-v5. */
.macro KERNEL_S1
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
ld1 {v2.2s}, [X]
|
||||
ld1 {v3.2s}, [Y]
|
||||
fmul v4.2s, v0.2s, v2.2s // [C*X1, C*X0]
|
||||
fmla v4.2s, v1.2s, v3.2s // [C*X1 + S*Y1, C*X0 + S*Y0]
|
||||
fmul v5.2s, v0.2s, v3.2s // [C*Y1, C*Y0]
|
||||
fmls v5.2s, v1.2s, v2.2s // [C*Y1 - S*X1, C*Y0 - S*X0]
|
||||
st1 {v4.2s}, [X], INC_X
|
||||
st1 {v5.2s}, [Y], INC_Y
|
||||
#else
|
||||
ld1 {v2.2d}, [X]
|
||||
ld1 {v3.2d}, [Y]
|
||||
fmul v4.2d, v0.2d, v2.2d // [C*X1, C*X0]
|
||||
fmla v4.2d, v1.2d, v3.2d // [C*X1 + S*Y1, C*X0 + S*Y0]
|
||||
fmul v5.2d, v0.2d, v3.2d // [C*Y1, C*Y0]
|
||||
fmls v5.2d, v1.2d, v2.2d // [C*Y1 - S*X1, C*Y0 - S*X0]
|
||||
st1 {v4.2d}, [X], INC_X
|
||||
st1 {v5.2d}, [Y], INC_Y
|
||||
#endif
|
||||
|
||||
.endm
|
||||
|
||||
/*******************************************************************************
|
||||
* End of macro definitions
|
||||
*******************************************************************************/
|
||||
|
||||
/* Kernel body: applies X' = C*X + S*Y, Y' = C*Y - S*X to N complex elements
 * in place. The vectorised F path is used only when both strides are 1;
 * otherwise the S path steps one element at a time. Returns 0 in w0. */
PROLOGUE
|
||||
|
||||
cmp N, xzr
|
||||
ble rot_kernel_L999 // nothing to do when N <= 0
|
||||
|
||||
INIT
|
||||
|
||||
cmp INC_X, #1
|
||||
bne rot_kernel_S_BEGIN
|
||||
cmp INC_Y, #1
|
||||
bne rot_kernel_S_BEGIN
|
||||
|
||||
rot_kernel_F_BEGIN: // both strides are 1
|
||||
|
||||
asr I, N, #2 // number of 4-element groups
|
||||
cmp I, xzr
|
||||
beq rot_kernel_F1
|
||||
|
||||
KERNEL_INIT_F4
|
||||
|
||||
rot_kernel_F4:
|
||||
|
||||
KERNEL_F4
|
||||
|
||||
subs I, I, #1
|
||||
bne rot_kernel_F4
|
||||
|
||||
rot_kernel_F1:
|
||||
|
||||
ands I, N, #3 // remaining 0-3 elements
|
||||
ble rot_kernel_L999
|
||||
|
||||
rot_kernel_F10:
|
||||
|
||||
KERNEL_F1
|
||||
|
||||
subs I, I, #1
|
||||
bne rot_kernel_F10
|
||||
|
||||
mov w0, wzr // return 0
|
||||
ret
|
||||
|
||||
rot_kernel_S_BEGIN: // at least one stride differs from 1
|
||||
|
||||
INIT_S
|
||||
|
||||
asr I, N, #2 // 4x unrolled scalar steps
|
||||
cmp I, xzr
|
||||
ble rot_kernel_S1
|
||||
|
||||
rot_kernel_S4:
|
||||
|
||||
KERNEL_S1
|
||||
KERNEL_S1
|
||||
KERNEL_S1
|
||||
KERNEL_S1
|
||||
|
||||
subs I, I, #1
|
||||
bne rot_kernel_S4
|
||||
|
||||
rot_kernel_S1:
|
||||
|
||||
ands I, N, #3 // remaining 0-3 elements
|
||||
ble rot_kernel_L999
|
||||
|
||||
rot_kernel_S10:
|
||||
|
||||
KERNEL_S1
|
||||
|
||||
subs I, I, #1
|
||||
bne rot_kernel_S10
|
||||
|
||||
rot_kernel_L999: // common exit
|
||||
|
||||
mov w0, wzr // return 0
|
||||
ret
|
|
@ -0,0 +1,274 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2015, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
|
||||
#define ASSEMBLER
|
||||
#include "common.h"
|
||||
|
||||
#define N x0 /* vector length in complex elements */
|
||||
#define X x3 /* X vector address (scaled in place) */
|
||||
#define INC_X x4 /* X stride; scaled to bytes by INIT_S */
|
||||
#define I x5 /* loop counter */
|
||||
|
||||
/*******************************************************************************
|
||||
* Macro definitions
|
||||
*******************************************************************************/
|
||||
|
||||
/* Real and imaginary parts of the complex scale factor, by precision. */
#if !defined(DOUBLE)
|
||||
#define DA_R s0 /* real scale input value */
|
||||
#define DA_I s1 /* imaginary scale input value */
|
||||
#else
|
||||
#define DA_R d0 /* real scale input value */
|
||||
#define DA_I d1 /* imaginary scale input value */
|
||||
#endif
|
||||
|
||||
/******************************************************************************/
|
||||
|
||||
/* Build the two factor vectors used by the scaling kernels:
 * v0 = [DA_R, DA_R] and v1 = [DA_I, -DA_I] (high lane first), so that
 * X * v0 + swap(X) * v1 is the complex product. Uses v2 as scratch. */
.macro INIT
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
ins v0.s[1], v0.s[0] // v0 = DA_R, DA_R
|
||||
fneg s2, DA_I
|
||||
ins v1.s[1], v2.s[0] // v1 = -DA_I, DA_I
|
||||
ext v1.8b, v1.8b, v1.8b, #4 // v1 = DA_I, -DA_I
|
||||
#else
|
||||
ins v0.d[1], v0.d[0] // v0 = DA_R, DA_R
|
||||
fneg d2, DA_I
|
||||
ins v1.d[1], v2.d[0] // v1 = -DA_I, DA_I
|
||||
ext v1.16b, v1.16b, v1.16b, #8 // v1 = DA_I, -DA_I
|
||||
#endif
|
||||
|
||||
.endm
|
||||
|
||||
/* Unit-stride scaling of one complex element in place.
 * X0/X1 are its real/imaginary scalars; lane lists are high lane first.
 * Clobbers v2 and v3; advances X by one element. */
.macro KERNEL_F1
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
ld1 {v2.2s}, [X] // X1, X0
|
||||
ext v3.8b, v2.8b, v2.8b, #4 // X0, X1
|
||||
fmul v2.2s, v2.2s, v0.2s // DA_R*X1, DA_R*X0
|
||||
fmla v2.2s, v3.2s, v1.2s // DA_R*X1+DA_I*X0, DA_R*X0-DA_I*X1
|
||||
st1 {v2.2s}, [X], #8
|
||||
#else
|
||||
ld1 {v2.2d}, [X] // X1, X0
|
||||
ext v3.16b, v2.16b, v2.16b, #8 // X0, X1
|
||||
fmul v2.2d, v2.2d, v0.2d // DA_R*X1, DA_R*X0
|
||||
fmla v2.2d, v3.2d, v1.2d // DA_R*X1+DA_I*X0, DA_R*X0-DA_I*X1
|
||||
st1 {v2.2d}, [X], #16
|
||||
#endif
|
||||
|
||||
.endm
|
||||
|
||||
/* Single precision only: widen the two-lane factor vectors built by INIT to
 * four lanes for the .4s arithmetic in KERNEL_F4. The sign pattern of v1 is
 * copied along with its values. */
.macro KERNEL_INIT_F4
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
// Replicate the lower 2 floats into the upper 2 slots
|
||||
ins v0.d[1], v0.d[0] // v0 = DA_R, DA_R, DA_R, DA_R
|
||||
ins v1.d[1], v1.d[0] // v1 = DA_I, -DA_I, DA_I, -DA_I
|
||||
#endif
|
||||
|
||||
.endm
|
||||
|
||||
/* Unit-stride scaling of four complex elements in place.
 * Each result is X * v0 + swap(X) * v1, where swap exchanges the real and
 * imaginary scalar of every element. In single precision X[n] indexes
 * scalars; in double precision CXn indexes complex elements.
 * Clobbers v2-v7 (single) or v2-v5 and v20-v23 (double). */
.macro KERNEL_F4
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
ld1 {v2.4s,v3.4s}, [X] // V2 = X[3], X[2], X[1], X[0]
|
||||
// V3 = X[7], X[6], X[5], X[4]
|
||||
|
||||
ext v6.8b, v2.8b, v2.8b, #4 // V6 = - , - , X[0], X[1]
|
||||
ins v6.s[2], v2.s[3] // V6 = - , X[3], X[0], X[1]
|
||||
ins v6.s[3], v2.s[2] // V6 = X[2], X[3], X[0], X[1]
|
||||
fmul v2.4s, v0.4s, v2.4s // X'[ix] = DA_R * X[ix]
|
||||
// X'[ix+1] = DA_R * X[ix+1]
|
||||
fmla v2.4s, v1.4s, v6.4s // X'[ix] += -DA_I * X[ix+1]
|
||||
// X'[ix+1] += DA_I * X[ix]
|
||||
|
||||
ext v7.8b, v3.8b, v3.8b, #4 // V7 = - , - , X[4], X[5]
|
||||
ins v7.s[2], v3.s[3] // V7 = - , X[7], X[4], X[5]
|
||||
ins v7.s[3], v3.s[2] // V7 = X[6], X[7], X[4], X[5]
|
||||
fmul v3.4s, v0.4s, v3.4s // X'[ix] = DA_R * X[ix]
|
||||
// X'[ix+1] = DA_R * X[ix+1]
|
||||
fmla v3.4s, v1.4s, v7.4s // X'[ix] += -DA_I * X[ix+1]
|
||||
// X'[ix+1] += DA_I * X[ix]
|
||||
|
||||
st1 {v2.4s,v3.4s}, [X], #32
|
||||
#else // DOUBLE
|
||||
ld1 {v2.2d,v3.2d,v4.2d,v5.2d}, [X] // CX0, CX1, CX2, CX3
|
||||
ext v20.16b, v2.16b, v2.16b, #8 // X[ix], X[ix+1]
|
||||
ext v21.16b, v3.16b, v3.16b, #8 // X[ix], X[ix+1]
|
||||
ext v22.16b, v4.16b, v4.16b, #8 // X[ix], X[ix+1]
|
||||
ext v23.16b, v5.16b, v5.16b, #8 // X[ix], X[ix+1]
|
||||
|
||||
fmul v2.2d, v0.2d, v2.2d // DA_R * CX0
|
||||
fmla v2.2d, v1.2d, v20.2d // + [DA_I, -DA_I] * swapped CX0
|
||||
|
||||
fmul v3.2d, v0.2d, v3.2d // DA_R * CX1
|
||||
fmla v3.2d, v1.2d, v21.2d // + [DA_I, -DA_I] * swapped CX1
|
||||
st1 {v2.2d,v3.2d}, [X], #32
|
||||
|
||||
fmul v4.2d, v0.2d, v4.2d // DA_R * CX2
|
||||
fmla v4.2d, v1.2d, v22.2d // + [DA_I, -DA_I] * swapped CX2
|
||||
|
||||
fmul v5.2d, v0.2d, v5.2d // DA_R * CX3
|
||||
fmla v5.2d, v1.2d, v23.2d // + [DA_I, -DA_I] * swapped CX3
|
||||
st1 {v4.2d,v5.2d}, [X], #32
|
||||
#endif
|
||||
PRFM PLDL1KEEP, [X, #1024] // issued for both precisions
|
||||
.endm
|
||||
|
||||
// INIT_S: convert INC_X into a byte stride by multiplying it by the size
// of one complex element (shift by 3 = 8 bytes single, shift by 4 = 16 bytes
// double). Must run once before KERNEL_S1, which adds INC_X to X directly.
.macro INIT_S
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
lsl INC_X, INC_X, #3 // INC_X *= 8
|
||||
#else
|
||||
lsl INC_X, INC_X, #4 // INC_X *= 16
|
||||
#endif
|
||||
|
||||
.endm
|
||||
|
||||
// KERNEL_S1: strided counterpart of KERNEL_F1. Scales the single complex
// element at [X] in place with the same instruction sequence, then
// post-increments X by INC_X, which INIT_S has already turned into a byte
// count. Overwrites v2 and v3.
.macro KERNEL_S1
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
ld1 {v2.2s}, [X] // X1, X0
|
||||
ext v3.8b, v2.8b, v2.8b, #4 // X0, X1
|
||||
fmul v2.2s, v2.2s, v0.2s // DA_R*X1, DA_R*X0
|
||||
fmla v2.2s, v3.2s, v1.2s // DA_R*X1+DA_I*X0, DA_R*X0-DA_I*X1
|
||||
st1 {v2.2s}, [X], INC_X // write back, X += INC_X bytes
|
||||
#else
|
||||
ld1 {v2.2d}, [X] // X1, X0
|
||||
ext v3.16b, v2.16b, v2.16b, #8 // X0, X1
|
||||
fmul v2.2d, v2.2d, v0.2d // DA_R*X1, DA_R*X0
|
||||
fmla v2.2d, v3.2d, v1.2d // DA_R*X1+DA_I*X0, DA_R*X0-DA_I*X1
|
||||
st1 {v2.2d}, [X], INC_X // write back, X += INC_X bytes
|
||||
#endif
|
||||
|
||||
.endm
|
||||
|
||||
/*******************************************************************************
|
||||
* End of macro definitions
|
||||
*******************************************************************************/
|
||||
|
||||
// Routine body: scale the N complex elements of X in place by
// DA_R + i*DA_I and return 0 in w0. N, X, INC_X, I, DA_R and DA_I are
// register aliases defined outside this section.
// Paths:
//   N <= 0                   -> return immediately
//   DA_R == 0 and DA_I == 0  -> zscal_kernel_zero: store the scalars over X
//   INC_X == 1               -> contiguous: KERNEL_F4 per 4 elements, then
//                               KERNEL_F1 for the N % 4 leftovers
//   otherwise                -> strided: 4 x KERNEL_S1 per iteration, then
//                               KERNEL_S1 for the leftovers
PROLOGUE
|
||||
|
||||
cmp N, xzr
|
||||
ble zscal_kernel_L999 // N <= 0: nothing to do
|
||||
|
||||
fcmp DA_R, #0.0
|
||||
bne zscal_kernel_1 // DA_R != 0: general path
|
||||
|
||||
fcmp DA_I, #0.0
|
||||
beq zscal_kernel_zero // DA_R == 0 and DA_I == 0
|
||||
|
||||
// TODO: special case DA_R == 0 && DA_I != 0
|
||||
|
||||
zscal_kernel_1:
|
||||
|
||||
// TODO: special case DA_R != 0 && DA_I == 0
|
||||
|
||||
INIT
|
||||
|
||||
cmp INC_X, #1 // INC_X is still in elements here
|
||||
bne zscal_kernel_S_BEGIN // non-unit stride: strided path
|
||||
|
||||
zscal_kernel_F_BEGIN:
|
||||
|
||||
asr I, N, #2 // I = N / 4 full groups
|
||||
cmp I, xzr
|
||||
beq zscal_kernel_F1 // fewer than 4 elements
|
||||
|
||||
KERNEL_INIT_F4
|
||||
|
||||
zscal_kernel_F4:
|
||||
|
||||
KERNEL_F4
|
||||
|
||||
subs I, I, #1
|
||||
bne zscal_kernel_F4
|
||||
|
||||
zscal_kernel_F1:
|
||||
|
||||
ands I, N, #3 // I = N % 4 leftover elements
|
||||
ble zscal_kernel_L999 // none left
|
||||
|
||||
zscal_kernel_F10:
|
||||
|
||||
KERNEL_F1
|
||||
|
||||
subs I, I, #1
|
||||
bne zscal_kernel_F10
|
||||
|
||||
mov w0, wzr // return 0
|
||||
ret
|
||||
|
||||
zscal_kernel_S_BEGIN:
|
||||
|
||||
INIT_S // INC_X becomes a byte stride
|
||||
|
||||
asr I, N, #2 // I = N / 4 full groups
|
||||
cmp I, xzr
|
||||
ble zscal_kernel_S1
|
||||
|
||||
zscal_kernel_S4:
|
||||
|
||||
KERNEL_S1
|
||||
KERNEL_S1
|
||||
KERNEL_S1
|
||||
KERNEL_S1
|
||||
|
||||
subs I, I, #1
|
||||
bne zscal_kernel_S4
|
||||
|
||||
zscal_kernel_S1:
|
||||
|
||||
ands I, N, #3 // I = N % 4 leftover elements
|
||||
ble zscal_kernel_L999 // none left
|
||||
|
||||
zscal_kernel_S10:
|
||||
|
||||
KERNEL_S1
|
||||
|
||||
subs I, I, #1
|
||||
bne zscal_kernel_S10
|
||||
|
||||
zscal_kernel_L999:
|
||||
|
||||
mov w0, wzr // return 0
|
||||
ret
|
||||
|
||||
zscal_kernel_zero:
|
||||
|
||||
INIT_S // INC_X becomes a byte stride
|
||||
|
||||
zscal_kernel_Z1:
|
||||
|
||||
stp DA_R, DA_I, [X] // both compared equal to zero; X is overwritten without being read
|
||||
add X, X, INC_X
|
||||
subs N, N, #1 // N > 0 was checked on entry
|
||||
bne zscal_kernel_Z1
|
||||
|
||||
mov w0, wzr // return 0
|
||||
ret
|
||||
|
||||
EPILOGUE
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,883 @@
|
|||
#include "common.h"
|
||||
|
||||
/*
 * Complex helper macros for the TRMM kernel below.
 *
 * `res`, `op1`, `op2` and `alpha` are identifier prefixes, not expressions:
 * each macro pastes `_r` / `_i` onto them to reach the real and imaginary
 * scalars (e.g. res00 -> res00_r, res00_i).
 *
 * Every macro is wrapped in do { ... } while (0) so that it expands to
 * exactly one statement and stays correct inside an unbraced if/else or
 * loop body. Invoke them with a trailing semicolon.
 */

/*
 * Store res * alpha into C[0] (real part) and C[1] (imaginary part).
 * C is overwritten, not accumulated into.
 */
#define MADD_ALPHA_N_STORE(C, res, alpha) \
	do { \
		(C)[0] = res ## _r * alpha ## _r - res ## _i * alpha ## _i; \
		(C)[1] = res ## _r * alpha ## _i + res ## _i * alpha ## _r; \
	} while (0)

#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
/* res += op1 * op2 (neither operand conjugated) */
#define MADD(res, op1, op2) \
	do { \
		res ## _r += op1 ## _r * op2 ## _r; \
		res ## _r -= op1 ## _i * op2 ## _i; \
		res ## _i += op1 ## _r * op2 ## _i; \
		res ## _i += op1 ## _i * op2 ## _r; \
	} while (0)
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
/* res += op1 * conj(op2) */
#define MADD(res, op1, op2) \
	do { \
		res ## _r += op1 ## _r * op2 ## _r; \
		res ## _r += op1 ## _i * op2 ## _i; \
		res ## _i -= op1 ## _r * op2 ## _i; \
		res ## _i += op1 ## _i * op2 ## _r; \
	} while (0)
#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
/* res += conj(op1) * op2 */
#define MADD(res, op1, op2) \
	do { \
		res ## _r += op1 ## _r * op2 ## _r; \
		res ## _r += op1 ## _i * op2 ## _i; \
		res ## _i += op1 ## _r * op2 ## _i; \
		res ## _i -= op1 ## _i * op2 ## _r; \
	} while (0)
#elif defined(RR) || defined(RC) || defined(CR) || defined(CC)
/* res += conj(op1) * conj(op2) */
#define MADD(res, op1, op2) \
	do { \
		res ## _r += op1 ## _r * op2 ## _r; \
		res ## _r -= op1 ## _i * op2 ## _i; \
		res ## _i -= op1 ## _r * op2 ## _i; \
		res ## _i -= op1 ## _i * op2 ## _r; \
	} while (0)
#endif
|
||||
|
||||
/*
 * Compute and store one mr x nr tile of the complex product.
 *
 * mr, nr   tile height / width in complex elements; both must be <= 4
 *          (size of the accumulator array). Callers pass literals.
 * bk       number of k-steps in the packed panels.
 * alpha_*  complex scale applied when the tile is stored.
 * pa       in/out: running read pointer into the packed A panel. Each
 *          k-step consumes mr complex values (2*mr FLOATs).
 * bb       start of the packed B panel for the current column block. Each
 *          k-step consumes nr complex values (2*nr FLOATs).
 * pc       in/out: address of the tile's first row in the first column of
 *          the column block; advanced by mr complex elements on return.
 * ldc      column stride of C in complex elements.
 * poff     in/out: running offset that selects which k-steps take part.
 *          NOTE(review): presumably the tile position relative to the
 *          triangular diagonal - confirm against the TRMM driver.
 *
 * The tile is overwritten with alpha * accumulated sum; C is never read.
 */
static inline void trmm_cplx_tile(BLASLONG mr, BLASLONG nr, BLASLONG bk,
                                  FLOAT alpha_r, FLOAT alpha_i,
                                  FLOAT **pa, FLOAT *bb, FLOAT **pc,
                                  BLASLONG ldc, BLASLONG *poff)
{
	FLOAT acc[4][4][2];		/* [column][row][0 = real, 1 = imag] */
	FLOAT *ptrba = *pa;
	FLOAT *ptrbb;
	FLOAT *dst;
	BLASLONG off = *poff;
	BLASLONG row, col, k, temp;

	/* Position both panel pointers on the first k-step that is used. */
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	ptrbb = bb;
#else
	ptrba += off * mr * 2;		/* number of values in A */
	ptrbb = bb + off * nr * 2;	/* number of values in B */
#endif

	for (col = 0; col < nr; col++) {
		for (row = 0; row < mr; row++) {
			acc[col][row][0] = 0;
			acc[col][row][1] = 0;
		}
	}

	/* Number of k-steps accumulated for this tile. */
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	temp = bk - off;
#elif defined(LEFT)
	temp = off + mr;		/* number of values in A */
#else
	temp = off + nr;		/* number of values in B */
#endif

	for (k = 0; k < temp; k++) {
		for (row = 0; row < mr; row++) {
			FLOAT av_r = ptrba[2 * row + 0];
			FLOAT av_i = ptrba[2 * row + 1];

			for (col = 0; col < nr; col++) {
				FLOAT bv_r = ptrbb[2 * col + 0];
				FLOAT bv_i = ptrbb[2 * col + 1];
				FLOAT sum_r = acc[col][row][0];
				FLOAT sum_i = acc[col][row][1];

				/* MADD pastes _r/_i onto sum, av and bv. */
				MADD(sum, av, bv);

				acc[col][row][0] = sum_r;
				acc[col][row][1] = sum_i;
			}
		}
		ptrba += 2 * mr;
		ptrbb += 2 * nr;
	}

	/* Store column by column, rows in ascending order. */
	for (col = 0; col < nr; col++) {
		dst = *pc + col * 2 * ldc;
		for (row = 0; row < mr; row++) {
			FLOAT sum_r = acc[col][row][0];
			FLOAT sum_i = acc[col][row][1];

			/* Reads alpha_r / alpha_i through token pasting. */
			MADD_ALPHA_N_STORE(dst, sum, alpha);
			dst += 2;
		}
	}
	*pc += 2 * mr;

	/*
	 * Skip the k-steps of A that this tile did not consume, so that the
	 * next tile starts at its own data. B needs no such step because
	 * ptrbb is recomputed from bb for every tile.
	 */
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	temp = bk - off;
#ifdef LEFT
	temp -= mr;			/* number of values in A */
#else
	temp -= nr;			/* number of values in B */
#endif
	ptrba += temp * mr * 2;
#endif

#ifdef LEFT
	off += mr;			/* number of values in A */
#endif

	*pa = ptrba;
	*poff = off;
}

/*
 * Process one block of nr columns of C: all full 4-row tiles, then an
 * optional 2-row tile (bm & 2) and an optional 1-row tile (bm & 1).
 * Afterwards *pbb and *pC are advanced past the nr columns just handled,
 * and *poff is updated for the next column block.
 */
static inline void trmm_cplx_panel(BLASLONG nr, BLASLONG bm, BLASLONG bk,
                                   FLOAT alpha_r, FLOAT alpha_i,
                                   FLOAT *ba, FLOAT **pbb, FLOAT **pC,
                                   BLASLONG ldc, BLASLONG offset,
                                   BLASLONG *poff)
{
	FLOAT *ptrba = ba;	/* A is re-read from the start for every column block */
	FLOAT *ctile = *pC;
	BLASLONG i;

	(void)offset;		/* only read in the TRMMKERNEL && LEFT build */

#if defined(TRMMKERNEL) && defined(LEFT)
	*poff = offset;
#endif

	for (i = 0; i < bm / 4; i += 1)
		trmm_cplx_tile(4, nr, bk, alpha_r, alpha_i,
		               &ptrba, *pbb, &ctile, ldc, poff);

	if (bm & 2)
		trmm_cplx_tile(2, nr, bk, alpha_r, alpha_i,
		               &ptrba, *pbb, &ctile, ldc, poff);

	if (bm & 1)
		trmm_cplx_tile(1, nr, bk, alpha_r, alpha_i,
		               &ptrba, *pbb, &ctile, ldc, poff);

#if defined(TRMMKERNEL) && !defined(LEFT)
	*poff += nr;
#endif

	*pbb += bk * nr * 2;	/* nr complex values per k-step */
	*pC += ldc * nr * 2;	/* nr columns of ldc complex elements */
}

/*
 * Complex TRMM micro-kernel on packed panels, tiled 4/2/1 rows by
 * 4/2/1 columns.
 *
 * bm, bn   rows / columns of the C block to produce.
 * bk       number of k-steps in the packed panels.
 * alpha_*  complex scale applied to every stored element.
 * ba, bb   packed A and B panels (interleaved real/imag FLOAT pairs).
 * C, ldc   output block and its column stride in complex elements.
 * offset   starting offset; negated on entry when LEFT is not defined and
 *          reloaded per column block when LEFT is defined.
 *
 * Column blocks are handled in the order: bn / 4 blocks of width 4, then
 * one block of width 2 if (bn & 2), then one of width 1 if (bn & 1).
 * Always returns 0.
 */
int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha_r, FLOAT alpha_i,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc
		, BLASLONG offset
		)
{
	BLASLONG j;
	BLASLONG off = 0;	/* defined start value; overwritten below in TRMMKERNEL builds */

#if defined(TRMMKERNEL) && !defined(LEFT)
	off = -offset;
#endif

	for (j = 0; j < bn / 4; j += 1)		/* Mx4 blocks */
		trmm_cplx_panel(4, bm, bk, alpha_r, alpha_i,
		                ba, &bb, &C, ldc, offset, &off);

	if (bn & 2)				/* Mx2 block */
		trmm_cplx_panel(2, bm, bk, alpha_r, alpha_i,
		                ba, &bb, &C, ldc, offset, &off);

	if (bn & 1)				/* Mx1 block */
		trmm_cplx_panel(1, bm, bk, alpha_r, alpha_i,
		                ba, &bb, &C, ldc, offset, &off);

	return 0;
}
|
40
param.h
40
param.h
|
@ -2214,6 +2214,46 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define ZGEMM_DEFAULT_R 4096
|
||||
|
||||
|
||||
#define SYMV_P 16
|
||||
#endif
|
||||
|
||||
|
||||
/* Tuning parameters used when the build defines CORTEXA57. */
#if defined(CORTEXA57)
|
||||
#define SNUMOPT 2
|
||||
#define DNUMOPT 2
|
||||
|
||||
#define GEMM_DEFAULT_OFFSET_A 0
|
||||
#define GEMM_DEFAULT_OFFSET_B 0
|
||||
/* 0x3fff = 16 KiB - 1; presumably an alignment mask - confirm at the use site */
#define GEMM_DEFAULT_ALIGN 0x03fffUL
|
||||
|
||||
/*
 * NOTE(review): the UNROLL_M / UNROLL_N pairs presumably have to match the
 * register tile of the GEMM/TRMM kernels selected for this core - confirm
 * against the kernel list before changing any of them.
 */
#define SGEMM_DEFAULT_UNROLL_M 4
|
||||
#define SGEMM_DEFAULT_UNROLL_N 4
|
||||
|
||||
#define DGEMM_DEFAULT_UNROLL_M 4
|
||||
#define DGEMM_DEFAULT_UNROLL_N 4
|
||||
|
||||
#define CGEMM_DEFAULT_UNROLL_M 4
|
||||
#define CGEMM_DEFAULT_UNROLL_N 4
|
||||
|
||||
#define ZGEMM_DEFAULT_UNROLL_M 4
|
||||
#define ZGEMM_DEFAULT_UNROLL_N 4
|
||||
|
||||
/*
 * NOTE(review): P / Q / R look like the per-precision blocking sizes used
 * by the level-3 driver - confirm their exact meaning there.
 */
#define SGEMM_DEFAULT_P 128
|
||||
#define DGEMM_DEFAULT_P 256
|
||||
#define CGEMM_DEFAULT_P 256
|
||||
#define ZGEMM_DEFAULT_P 128
|
||||
|
||||
#define SGEMM_DEFAULT_Q 240
|
||||
#define DGEMM_DEFAULT_Q 1024
|
||||
#define CGEMM_DEFAULT_Q 1024
|
||||
#define ZGEMM_DEFAULT_Q 512
|
||||
|
||||
#define SGEMM_DEFAULT_R 12288
|
||||
#define DGEMM_DEFAULT_R 4096
|
||||
#define CGEMM_DEFAULT_R 4096
|
||||
#define ZGEMM_DEFAULT_R 2048
|
||||
|
||||
|
||||
#define SYMV_P 16
|
||||
#endif
|
||||
|
||||
|
|
Loading…
Reference in New Issue