Fix #686. Merge branch 'ashwinyes-develop' into develop

Author: Zhang Xianyi
Date:   2015-11-11 04:30:26 +08:00
Commit: e31948ceb0
89 changed files with 19338 additions and 841 deletions

.gitignore (1 line changed)

@@ -68,3 +68,4 @@ test/zblat2
 test/zblat3
 build
 build.*
+*.swp

@@ -4,4 +4,8 @@ CCOMMON_OPT += -march=armv8-a
 FCOMMON_OPT += -march=armv8-a
 endif
+ifeq ($(CORE), CORTEXA57)
+CCOMMON_OPT += -march=armv8-a+crc+crypto+fp+simd -mtune=cortex-a57
+FCOMMON_OPT += -march=armv8-a+crc+crypto+fp+simd -mtune=cortex-a57
+endif

@@ -74,3 +74,5 @@ ARMV5
 7.ARM 64-bit CPU:
 ARMV8
+CORTEXA57

@@ -172,7 +172,7 @@ int main(int argc, char *argv[]){
   srandom(getpid());
 #endif
-  for(j = 0; j < m; j++){
+  for(j = 0; j < to; j++){
     for(i = 0; i < to * COMPSIZE; i++){
       a[i + j * to * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
       b[i + j * to * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;

@@ -86,6 +86,7 @@ extern "C" {
 #if !defined(_MSC_VER)
 #include <unistd.h>
 #endif
+#include <time.h>
 #ifdef OS_LINUX
 #include <malloc.h>

@@ -89,8 +89,10 @@ static inline int blas_quickdivide(blasint x, blasint y){
 #if defined(ASSEMBLER) && !defined(NEEDPARAM)
 #define PROLOGUE \
+	.text ;\
+	.align 4 ;\
 	.global REALNAME ;\
-	.func REALNAME ;\
+	.type REALNAME, %function ;\
 REALNAME:
 #define EPILOGUE
@@ -107,7 +109,11 @@ REALNAME:
 #endif
 #define HUGE_PAGESIZE ( 4 << 20)
+#if defined(CORTEXA57)
+#define BUFFER_SIZE (40 << 20)
+#else
 #define BUFFER_SIZE (16 << 20)
+#endif
 #define BASE_ADDRESS (START_ADDRESS - BUFFER_SIZE * MAX_CPU_NUMBER)

@@ -29,12 +29,19 @@
 #define CPU_UNKNOWN 0
 #define CPU_ARMV8 1
+#define CPU_CORTEXA57 2
 static char *cpuname[] = {
-  "UNKOWN",
-  "ARMV8"
+  "UNKNOWN",
+  "ARMV8" ,
+  "CORTEXA57"
 };
+static char *cpuname_lower[] = {
+  "unknown",
+  "armv8" ,
+  "cortexa57"
+};
 int get_feature(char *search)
 {
@@ -59,7 +66,7 @@ int get_feature(char *search)
 	fclose(infile);
-	if( p == NULL ) return;
+	if( p == NULL ) return 0;
 	t = strtok(p," ");
 	while( t = strtok(NULL," "))
@@ -82,11 +89,30 @@ int detect(void)
 	p = (char *) NULL ;
 	infile = fopen("/proc/cpuinfo", "r");
 	while (fgets(buffer, sizeof(buffer), infile))
 	{
-		if ((!strncmp("model name", buffer, 10)) || (!strncmp("Processor", buffer, 9)))
+		if (!strncmp("CPU part", buffer, 8))
+		{
+			p = strchr(buffer, ':') + 2;
+			break;
+		}
+	}
+	fclose(infile);
+	if(p != NULL) {
+		if (strstr(p, "0xd07")) {
+			return CPU_CORTEXA57;
+		}
+	}
+	p = (char *) NULL ;
+	infile = fopen("/proc/cpuinfo", "r");
+	while (fgets(buffer, sizeof(buffer), infile))
+	{
+		if ((!strncmp("model name", buffer, 10)) || (!strncmp("Processor", buffer, 9)) ||
+		    (!strncmp("CPU architecture", buffer, 16)))
 		{
 			p = strchr(buffer, ':') + 2;
 			break;
@@ -118,23 +144,13 @@ char *get_corename(void)
 void get_architecture(void)
 {
-	printf("ARM");
+	printf("ARM64");
 }
 void get_subarchitecture(void)
 {
 	int d = detect();
-	switch (d)
-	{
-		case CPU_ARMV8:
-			printf("ARMV8");
-			break;
-		default:
-			printf("UNKNOWN");
-			break;
-	}
+	printf("%s", cpuname[d]);
 }
 void get_subdirname(void)
@@ -160,25 +176,31 @@ void get_cpuconfig(void)
 			printf("#define L2_ASSOCIATIVE 4\n");
 			break;
+		case CPU_CORTEXA57:
+			printf("#define CORTEXA57\n");
+			printf("#define HAVE_VFP\n");
+			printf("#define HAVE_VFPV3\n");
+			printf("#define HAVE_NEON\n");
+			printf("#define HAVE_VFPV4\n");
+			printf("#define L1_CODE_SIZE 49152\n");
+			printf("#define L1_CODE_LINESIZE 64\n");
+			printf("#define L1_CODE_ASSOCIATIVE 3\n");
+			printf("#define L1_DATA_SIZE 32768\n");
+			printf("#define L1_DATA_LINESIZE 64\n");
+			printf("#define L1_DATA_ASSOCIATIVE 2\n");
+			printf("#define L2_SIZE 2097152\n");
+			printf("#define L2_LINESIZE 64\n");
+			printf("#define L2_ASSOCIATIVE 16\n");
+			break;
 	}
 }
 void get_libname(void)
 {
 	int d = detect();
-	switch (d)
-	{
-		case CPU_ARMV8:
-			printf("armv8\n");
-			break;
-	}
+	printf("%s", cpuname_lower[d]);
 }
 void get_features(void)
 {
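Note: the detection added above keys on the "CPU part" field that the Linux kernel exposes in /proc/cpuinfo (the MIDR part number); 0xd07 is Cortex-A57. A stand-alone C sketch of the same check, with a hypothetical helper name, for illustration only:

    #include <stdio.h>
    #include <string.h>

    /* Return 1 if /proc/cpuinfo reports a Cortex-A57 ("CPU part : 0xd07"). */
    static int is_cortex_a57(void)
    {
        char buffer[512], *p;
        FILE *infile = fopen("/proc/cpuinfo", "r");
        if (infile == NULL) return 0;
        while (fgets(buffer, sizeof(buffer), infile)) {
            if (!strncmp("CPU part", buffer, 8)) {
                p = strchr(buffer, ':');
                if (p != NULL && strstr(p, "0xd07")) {
                    fclose(infile);
                    return 1;
                }
            }
        }
        fclose(infile);
        return 0;
    }

    int main(void)
    {
        printf("Cortex-A57 detected: %s\n", is_cortex_a57() ? "yes" : "no");
        return 0;
    }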

@@ -55,7 +55,7 @@
 static int spmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){
   FLOAT *a, *x, *y;
-  BLASLONG incx, incy;
+  BLASLONG incx;
   BLASLONG m_from, m_to, i;
 #ifndef COMPLEX
   FLOAT result;
@@ -68,7 +68,6 @@ static int spmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F
   y = (FLOAT *)args -> c;
   incx = args -> ldb;
-  incy = args -> ldc;
   m_from = 0;
   m_to = args -> m;

@@ -43,7 +43,7 @@
 static int syr_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){
   FLOAT *a, *x, *y;
-  BLASLONG lda, incx, incy;
+  BLASLONG incx, incy;
   BLASLONG i, m_from, m_to;
   FLOAT alpha_r;
 #ifdef COMPLEX
@@ -56,7 +56,6 @@ static int syr_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FL
   incx = args -> lda;
   incy = args -> ldb;
-  lda = args -> ldc;
   alpha_r = *((FLOAT *)args -> alpha + 0);
 #ifdef COMPLEX

@@ -46,7 +46,7 @@ static int syr_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FL
   BLASLONG incx;
   BLASLONG i, m_from, m_to;
   FLOAT alpha_r;
-#if defined(COMPLEX) && !defined(HER) && !defined(HERREV)
+#if defined(COMPLEX) && !defined(HEMV) && !defined(HEMVREV)
   FLOAT alpha_i;
 #endif
@@ -56,7 +56,7 @@ static int syr_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FL
   incx = args -> lda;
   alpha_r = *((FLOAT *)args -> alpha + 0);
-#if defined(COMPLEX) && !defined(HER) && !defined(HERREV)
+#if defined(COMPLEX) && !defined(HEMV) && !defined(HEMVREV)
   alpha_i = *((FLOAT *)args -> alpha + 1);
 #endif

@@ -55,7 +55,7 @@
 static int symv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){
   FLOAT *a, *x, *y;
-  BLASLONG lda, incx, incy;
+  BLASLONG lda, incx;
   BLASLONG m_from, m_to;
   a = (FLOAT *)args -> a;
@@ -64,7 +64,6 @@ static int symv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F
   lda = args -> lda;
   incx = args -> ldb;
-  incy = args -> ldc;
   m_from = 0;
   m_to = args -> m;

@@ -45,13 +45,11 @@ const static FLOAT dp1 = 1.;
 int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){
   BLASLONG i;
-  FLOAT *gemvbuffer = (FLOAT *)buffer;
   FLOAT *B = b;
   BLASLONG length;
   if (incb != 1) {
     B = buffer;
-    gemvbuffer = (FLOAT *)(((BLASLONG)buffer + n * sizeof(FLOAT) + 4095) & ~4095);
     COPY_K(n, b, incb, buffer, 1);
   }

@@ -45,13 +45,11 @@ const static FLOAT dp1 = 1.;
 int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){
   BLASLONG i;
-  FLOAT *gemvbuffer = (FLOAT *)buffer;
   FLOAT *B = b;
   BLASLONG length;
   if (incb != 1) {
     B = buffer;
-    gemvbuffer = (FLOAT *)(((BLASLONG)buffer + n * sizeof(FLOAT) + 4095) & ~4095);
     COPY_K(n, b, incb, buffer, 1);
   }

@@ -45,13 +45,11 @@ const static FLOAT dp1 = 1.;
 int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){
   BLASLONG i;
-  FLOAT *gemvbuffer = (FLOAT *)buffer;
   FLOAT *B = b;
   BLASLONG length;
   if (incb != 1) {
     B = buffer;
-    gemvbuffer = (FLOAT *)(((BLASLONG)buffer + n * sizeof(FLOAT) + 4095) & ~4095);
     COPY_K(n, b, incb, buffer, 1);
   }

@@ -45,13 +45,11 @@ const static FLOAT dp1 = 1.;
 int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){
   BLASLONG i;
-  FLOAT *gemvbuffer = (FLOAT *)buffer;
   FLOAT *B = b;
   BLASLONG length;
   if (incb != 1) {
     B = buffer;
-    gemvbuffer = (FLOAT *)(((BLASLONG)buffer + n * sizeof(FLOAT) + 4095) & ~4095);
     COPY_K(n, b, incb, buffer, 1);
   }

@@ -43,12 +43,10 @@
 int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){
   BLASLONG i;
-  FLOAT *gemvbuffer = (FLOAT *)buffer;
   FLOAT *B = b;
   if (incb != 1) {
     B = buffer;
-    gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) + 4095) & ~4095);
     COPY_K(m, b, incb, buffer, 1);
   }

@@ -43,12 +43,10 @@
 int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){
   BLASLONG i;
-  FLOAT *gemvbuffer = (FLOAT *)buffer;
   FLOAT *B = b;
   if (incb != 1) {
     B = buffer;
-    gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) + 4095) & ~4095);
     COPY_K(m, b, incb, buffer, 1);
   }

@@ -45,7 +45,6 @@ const static FLOAT dp1 = 1.;
 int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){
   BLASLONG i;
-  FLOAT *gemvbuffer = (FLOAT *)buffer;
   FLOAT *B = b;
   BLASLONG length;
 #if (TRANSA == 2) || (TRANSA == 4)
@@ -57,7 +56,6 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG inc
   if (incb != 1) {
     B = buffer;
-    gemvbuffer = (FLOAT *)(((BLASLONG)buffer + n * sizeof(FLOAT) * COMPSIZE+ 4095) & ~4095);
     COPY_K(n, b, incb, buffer, 1);
   }

@@ -45,7 +45,6 @@ const static FLOAT dp1 = 1.;
 int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){
   BLASLONG i;
-  FLOAT *gemvbuffer = (FLOAT *)buffer;
   FLOAT *B = b;
   BLASLONG length;
 #if (TRANSA == 2) || (TRANSA == 4)
@@ -57,7 +56,6 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG inc
   if (incb != 1) {
     B = buffer;
-    gemvbuffer = (FLOAT *)(((BLASLONG)buffer + n * sizeof(FLOAT) * COMPSIZE + 4095) & ~4095);
     COPY_K(n, b, incb, buffer, 1);
   }

@@ -45,7 +45,6 @@ const static FLOAT dp1 = 1.;
 int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){
   BLASLONG i;
-  FLOAT *gemvbuffer = (FLOAT *)buffer;
   FLOAT *B = b;
   BLASLONG length;
 #if (TRANSA == 2) || (TRANSA == 4)
@@ -57,7 +56,6 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG inc
   if (incb != 1) {
     B = buffer;
-    gemvbuffer = (FLOAT *)(((BLASLONG)buffer + n * sizeof(FLOAT) * COMPSIZE + 4095) & ~4095);
     COPY_K(n, b, incb, buffer, 1);
   }

@@ -45,7 +45,6 @@ const static FLOAT dp1 = 1.;
 int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){
   BLASLONG i;
-  FLOAT *gemvbuffer = (FLOAT *)buffer;
   FLOAT *B = b;
   BLASLONG length;
 #if (TRANSA == 2) || (TRANSA == 4)
@@ -57,7 +56,6 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG inc
   if (incb != 1) {
     B = buffer;
-    gemvbuffer = (FLOAT *)(((BLASLONG)buffer + n * sizeof(FLOAT) * COMPSIZE+ 4095) & ~4095);
     COPY_K(n, b, incb, buffer, 1);
   }

@@ -49,12 +49,10 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){
 #ifndef UNIT
   FLOAT atemp1, atemp2, btemp1, btemp2;
 #endif
-  FLOAT *gemvbuffer = (FLOAT *)buffer;
   FLOAT *B = b;
   if (incb != 1) {
     B = buffer;
-    gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 4095) & ~4095);
     COPY_K(m, b, incb, buffer, 1);
   }

@@ -49,12 +49,10 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){
 #ifndef UNIT
   FLOAT atemp1, atemp2, btemp1, btemp2;
 #endif
-  FLOAT *gemvbuffer = (FLOAT *)buffer;
   FLOAT *B = b;
   if (incb != 1) {
     B = buffer;
-    gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 4095) & ~4095);
     COPY_K(m, b, incb, buffer, 1);
   }

@@ -51,12 +51,10 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){
 #ifndef UNIT
   FLOAT ar, ai, br, bi, ratio, den;
 #endif
-  FLOAT *gemvbuffer = (FLOAT *)buffer;
   FLOAT *B = b;
   if (incb != 1) {
     B = buffer;
-    gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 4095) & ~4095);
     COPY_K(m, b, incb, buffer, 1);
   }

@@ -49,12 +49,10 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){
 #ifndef UNIT
   FLOAT ar, ai, br, bi, ratio, den;
 #endif
-  FLOAT *gemvbuffer = (FLOAT *)buffer;
   FLOAT *B = b;
   if (incb != 1) {
     B = buffer;
-    gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 4095) & ~4095);
     COPY_K(m, b, incb, buffer, 1);
   }

@@ -65,7 +65,7 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (
   blas_queue_t queue[MAX_CPU_NUMBER];
   BLASLONG range_M[MAX_CPU_NUMBER + 1], range_N[MAX_CPU_NUMBER + 1];
-  BLASLONG procs, total_procs, num_cpu_m, num_cpu_n;
+  BLASLONG procs, num_cpu_m, num_cpu_n;
   BLASLONG width, i, j;
   BLASLONG divM, divN;

@@ -230,7 +230,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
   BLASLONG is, min_i, div_n;
   BLASLONG i, current;
-  BLASLONG l1stride, l2size;
+  BLASLONG l1stride;
 #ifdef TIMING
   BLASULONG rpcc_counter;
@@ -298,8 +298,6 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
 #endif
     ) return 0;
-  l2size = GEMM_P * GEMM_Q;
 #if 0
   fprintf(stderr, "Thread[%ld] m_from : %ld m_to : %ld n_from : %ld n_to : %ld N_from : %ld N_to : %ld\n",
 	  mypos, m_from, m_to, n_from, n_to, N_from, N_to);
@@ -706,7 +704,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
     n = n_to - n_from;
   }
-  if ((args -> m < nthreads * SWITCH_RATIO) || (args -> n < nthreads * SWITCH_RATIO)) {
+  if ((m < nthreads * SWITCH_RATIO) || (n < nthreads * SWITCH_RATIO)) {
     GEMM_LOCAL(args, range_m, range_n, sa, sb, 0);
     return 0;
   }
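The last hunk above changes the serial fallback so that it looks at the size of the sub-problem actually handed to this call (m and n after the range split) rather than the full matrix dimensions in args. A minimal sketch of the intended guard, with made-up names, only to illustrate the decision:

    /* Run single-threaded when either dimension of the assigned
       sub-problem is too small to be worth splitting. */
    static int should_split(long m, long n, int nthreads, long switch_ratio)
    {
        if (m < nthreads * switch_ratio || n < nthreads * switch_ratio)
            return 0;   /* take the local (GEMM_LOCAL-style) path */
        return 1;       /* divide the work across nthreads */
    }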

@@ -914,7 +914,6 @@ static volatile struct {
 } memory[NUM_BUFFERS];
 static int memory_initialized = 0;
-static void gotoblas_memory_init(void);
 /* Memory allocation routine */
 /* procpos ... indicates where it comes from */

@@ -819,10 +819,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
       "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
       "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 "
 #define LIBNAME   "armv8"
-#define CORENAME  "XGENE1"
+#define CORENAME  "ARMV8"
+#else
 #endif
+#ifdef FORCE_CORTEXA57
+#define FORCE
+#define ARCHITECTURE    "ARM64"
+#define SUBARCHITECTURE "ARMV8"
+#define SUBDIRNAME      "arm64"
+#define ARCHCONFIG   "-DCORTEXA57 " \
+       "-DL1_CODE_SIZE=49152 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=3 " \
+       "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=2 " \
+       "-DL2_SIZE=2097152 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \
+       "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
+       "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON"
+#define LIBNAME   "cortexa57"
+#define CORENAME  "CORTEXA57"
+#else
+#endif
 #ifndef FORCE

@@ -91,6 +91,27 @@
 #endif
 #endif
+#ifdef SMP
+#ifndef COMPLEX
+#ifdef XDOUBLE
+#define MODE (BLAS_XDOUBLE | BLAS_REAL)
+#elif defined(DOUBLE)
+#define MODE (BLAS_DOUBLE | BLAS_REAL)
+#else
+#define MODE (BLAS_SINGLE | BLAS_REAL)
+#endif
+#else
+#ifdef XDOUBLE
+#define MODE (BLAS_XDOUBLE | BLAS_COMPLEX)
+#elif defined(DOUBLE)
+#define MODE (BLAS_DOUBLE | BLAS_COMPLEX)
+#else
+#define MODE (BLAS_SINGLE | BLAS_COMPLEX)
+#endif
+#endif
+#endif
 static int (*symm[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = {
 #ifndef GEMM3M
 #ifndef HEMM
@@ -135,26 +156,6 @@ void NAME(char *SIDE, char *UPLO,
   FLOAT *buffer;
   FLOAT *sa, *sb;
-#ifdef SMP
-#ifndef COMPLEX
-#ifdef XDOUBLE
-  int mode = BLAS_XDOUBLE | BLAS_REAL;
-#elif defined(DOUBLE)
-  int mode = BLAS_DOUBLE | BLAS_REAL;
-#else
-  int mode = BLAS_SINGLE | BLAS_REAL;
-#endif
-#else
-#ifdef XDOUBLE
-  int mode = BLAS_XDOUBLE | BLAS_COMPLEX;
-#elif defined(DOUBLE)
-  int mode = BLAS_DOUBLE | BLAS_COMPLEX;
-#else
-  int mode = BLAS_SINGLE | BLAS_COMPLEX;
-#endif
-#endif
-#endif
 #if defined(SMP) && !defined(NO_AFFINITY)
   int nodes;
 #endif
@@ -246,26 +247,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo,
   FLOAT *buffer;
   FLOAT *sa, *sb;
-#ifdef SMP
-#ifndef COMPLEX
-#ifdef XDOUBLE
-  int mode = BLAS_XDOUBLE | BLAS_REAL;
-#elif defined(DOUBLE)
-  int mode = BLAS_DOUBLE | BLAS_REAL;
-#else
-  int mode = BLAS_SINGLE | BLAS_REAL;
-#endif
-#else
-#ifdef XDOUBLE
-  int mode = BLAS_XDOUBLE | BLAS_COMPLEX;
-#elif defined(DOUBLE)
-  int mode = BLAS_DOUBLE | BLAS_COMPLEX;
-#else
-  int mode = BLAS_SINGLE | BLAS_COMPLEX;
-#endif
-#endif
-#endif
 #if defined(SMP) && !defined(NO_AFFINITY)
   int nodes;
 #endif
@@ -407,7 +388,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo,
     args.nthreads /= nodes;
-    gemm_thread_mn(mode, &args, NULL, NULL,
+    gemm_thread_mn(MODE, &args, NULL, NULL,
                    symm[4 | (side << 1) | uplo ], sa, sb, nodes);
   } else {
@@ -419,7 +400,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo,
 #else
-  GEMM_THREAD(mode, &args, NULL, NULL, symm[(side << 1) | uplo ], sa, sb, args.nthreads);
+  GEMM_THREAD(MODE, &args, NULL, NULL, symm[(side << 1) | uplo ], sa, sb, args.nthreads);
 #endif

@@ -116,7 +116,7 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA,
 void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *a, blasint lda) {
   FLOAT *buffer;
-  int trans, uplo;
+  int uplo;
   blasint info;
 #ifdef SMP
   int nthreads;
@@ -124,7 +124,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha,
   PRINT_DEBUG_CNAME;
-  trans = -1;
   uplo = -1;
   info = 0;

@@ -118,7 +118,7 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA,
 void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *y, blasint incy, FLOAT *a, blasint lda) {
   FLOAT *buffer;
-  int trans, uplo;
+  int uplo;
   blasint info;
 #ifdef SMP
   int nthreads;
@@ -126,7 +126,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha,
   PRINT_DEBUG_CNAME;
-  trans = -1;
   uplo = -1;
   info = 0;

@@ -117,7 +117,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT *ALPHA
   FLOAT beta_i = BETA[1];
   FLOAT *buffer;
-  int trans, uplo;
+  int uplo;
   blasint info;
 #ifdef SMP
   int nthreads;
@@ -135,7 +135,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT *ALPHA
   PRINT_DEBUG_CNAME;
-  trans = -1;
   uplo = -1;
   info = 0;

@@ -116,7 +116,7 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA,
 void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *a, blasint lda) {
   FLOAT *buffer;
-  int trans, uplo;
+  int uplo;
   blasint info;
 #ifdef SMP
   int nthreads;
@@ -124,7 +124,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha,
   PRINT_DEBUG_CNAME;
-  trans = -1;
   uplo = -1;
   info = 0;

@@ -121,7 +121,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT *ALPHA
   FLOAT alpha_r = ALPHA[0];
   FLOAT alpha_i = ALPHA[1];
   FLOAT *buffer;
-  int trans, uplo;
+  int uplo;
   blasint info;
 #ifdef SMP
   int nthreads;
@@ -129,7 +129,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT *ALPHA
   PRINT_DEBUG_CNAME;
-  trans = -1;
   uplo = -1;
   info = 0;

@@ -637,49 +637,49 @@ $(KDIR)xasum_k$(TSUFFIX).$(SUFFIX) $(KDIR)xasum_k$(TPSUFFIX).$(PSUFFIX) : $(KE
 	$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $@
 $(KDIR)saxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)saxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SAXPYKERNEL)
-	$(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -UDOUBLE $< -o $@
+	$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@
 $(KDIR)daxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)daxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DAXPYKERNEL)
-	$(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -DDOUBLE $< -o $@
+	$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE $< -o $@
 $(KDIR)qaxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)qaxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QAXPYKERNEL)
-	$(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -DXDOUBLE $< -o $@
+	$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE $< -o $@
 $(KDIR)caxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)caxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CAXPYKERNEL)
 	$(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -UCONJ -UDOUBLE $< -o $@
 $(KDIR)zaxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)zaxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZAXPYKERNEL)
-	$(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -UCONJ -DDOUBLE $< -o $@
+	$(CC) -c $(CFLAGS) -DCOMPLEX -UCONJ -DDOUBLE $< -o $@
 $(KDIR)xaxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)xaxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XAXPYKERNEL)
-	$(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -UCONJ -DXDOUBLE $< -o $@
+	$(CC) -c $(CFLAGS) -DCOMPLEX -UCONJ -DXDOUBLE $< -o $@
 $(KDIR)caxpyc_k$(TSUFFIX).$(SUFFIX) $(KDIR)caxpyc_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CAXPYKERNEL)
-	$(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -DCONJ -UDOUBLE $< -o $@
+	$(CC) -c $(CFLAGS) -DCOMPLEX -DCONJ -UDOUBLE $< -o $@
 $(KDIR)zaxpyc_k$(TSUFFIX).$(SUFFIX) $(KDIR)zaxpyc_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZAXPYKERNEL)
-	$(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -DCONJ -DDOUBLE $< -o $@
+	$(CC) -c $(CFLAGS) -DCOMPLEX -DCONJ -DDOUBLE $< -o $@
 $(KDIR)xaxpyc_k$(TSUFFIX).$(SUFFIX) $(KDIR)xaxpyc_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XAXPYKERNEL)
-	$(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -DCONJ -DXDOUBLE $< -o $@
+	$(CC) -c $(CFLAGS) -DCOMPLEX -DCONJ -DXDOUBLE $< -o $@
 $(KDIR)scopy_k$(TSUFFIX).$(SUFFIX) $(KDIR)scopy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SCOPYKERNEL)
-	$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UCOMPLEX -DC_INTERFACE $< -o $@
+	$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DC_INTERFACE $< -o $@
 $(KDIR)dcopy_k$(TSUFFIX).$(SUFFIX) $(KDIR)dcopy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DCOPYKERNEL)
-	$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UCOMPLEX -DC_INTERFACE $< -o $@
+	$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DC_INTERFACE $< -o $@
 $(KDIR)qcopy_k$(TSUFFIX).$(SUFFIX) $(KDIR)qcopy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QCOPYKERNEL)
-	$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UCOMPLEX -DC_INTERFACE $< -o $@
+	$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DC_INTERFACE $< -o $@
 $(KDIR)ccopy_k$(TSUFFIX).$(SUFFIX) $(KDIR)ccopy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CCOPYKERNEL)
-	$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DCOMPLEX -DC_INTERFACE $< -o $@
+	$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DC_INTERFACE $< -o $@
 $(KDIR)zcopy_k$(TSUFFIX).$(SUFFIX) $(KDIR)zcopy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZCOPYKERNEL)
-	$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DCOMPLEX -DC_INTERFACE $< -o $@
+	$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DC_INTERFACE $< -o $@
 $(KDIR)xcopy_k$(TSUFFIX).$(SUFFIX) $(KDIR)xcopy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XCOPYKERNEL)
-	$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DCOMPLEX -DC_INTERFACE $< -o $@
+	$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DC_INTERFACE $< -o $@
 $(KDIR)ddot_k$(TSUFFIX).$(SUFFIX) $(KDIR)ddot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DDOTKERNEL)
 	$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE $< -o $@
@@ -799,15 +799,15 @@ $(KDIR)xswap_k$(TSUFFIX).$(SUFFIX) $(KDIR)xswap_k$(TPSUFFIX).$(PSUFFIX) : $(KE
 	$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $@
 $(KDIR)saxpby_k$(TSUFFIX).$(SUFFIX) $(KDIR)saxpby_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SAXPBYKERNEL)
-	$(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -UDOUBLE $< -o $@
+	$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@
 $(KDIR)daxpby_k$(TSUFFIX).$(SUFFIX) $(KDIR)daxpby_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DAXPBYKERNEL)
-	$(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -DDOUBLE $< -o $@
+	$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE $< -o $@
 $(KDIR)caxpby_k$(TSUFFIX).$(SUFFIX) $(KDIR)caxpby_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CAXPBYKERNEL)
-	$(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -UCONJ -UDOUBLE $< -o $@
+	$(CC) -c $(CFLAGS) -DCOMPLEX -UCONJ -UDOUBLE $< -o $@
 $(KDIR)zaxpby_k$(TSUFFIX).$(SUFFIX) $(KDIR)zaxpby_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZAXPBYKERNEL)
-	$(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -UCONJ -DDOUBLE $< -o $@
+	$(CC) -c $(CFLAGS) -DCOMPLEX -UCONJ -DDOUBLE $< -o $@

@@ -54,13 +54,15 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 	BLASLONG ix=0;
 	FLOAT maxf=0.0;
-	if (n < 0 || inc_x < 1 ) return(maxf);
+	if (n <= 0 || inc_x <= 0) return(maxf);
 	maxf=ABS(x[0]);
+	ix += inc_x;
+	i++;
 	while(i < n)
 	{
-		if( ABS(x[ix]) > ABS(maxf) )
+		if( ABS(x[ix]) > maxf )
 		{
 			maxf = ABS(x[ix]);
 		}
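The change above does two things: it rejects n <= 0 and non-positive increments up front, and it starts scanning at the second element so the first one is not compared against itself (maxf is already non-negative, so the old ABS(maxf) was redundant). A plain-C reference of the corrected loop, using double and fabs in place of the FLOAT/ABS macros:

    #include <math.h>
    #include <stdio.h>

    static double amax_ref(long n, const double *x, long inc_x)
    {
        double maxf = 0.0;
        long i = 1, ix;
        if (n <= 0 || inc_x <= 0) return maxf;
        maxf = fabs(x[0]);
        ix = inc_x;                      /* continue from the second element */
        while (i < n) {
            if (fabs(x[ix]) > maxf)      /* maxf is never negative */
                maxf = fabs(x[ix]);
            ix += inc_x;
            i++;
        }
        return maxf;
    }

    int main(void)
    {
        double x[] = { 1.0, -5.0, 3.0 };
        printf("%g\n", amax_ref(3, x, 1));   /* prints 5 */
        return 0;
    }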

@@ -54,13 +54,15 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 	BLASLONG ix=0;
 	FLOAT minf=0.0;
-	if (n < 0 || inc_x < 1 ) return(minf);
+	if (n <= 0 || inc_x <= 0) return(minf);
 	minf=ABS(x[0]);
+	ix += inc_x;
+	i++;
 	while(i < n)
 	{
-		if( ABS(x[ix]) < ABS(minf) )
+		if( ABS(x[ix]) < minf )
 		{
 			minf = ABS(x[ix]);
 		}

@@ -53,7 +53,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 {
 	BLASLONG i=0;
 	FLOAT sumf = 0.0;
-	if (n < 0 || inc_x < 1 ) return(sumf);
+	if (n <= 0 || inc_x <= 0) return(sumf);
 	n *= inc_x;
 	while(i < n)

@@ -55,13 +55,15 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 	FLOAT maxf=0.0;
 	BLASLONG max=0;
-	if (n < 0 || inc_x < 1 ) return(max);
+	if (n <= 0 || inc_x <= 0) return(max);
 	maxf=ABS(x[0]);
+	ix += inc_x;
+	i++;
 	while(i < n)
 	{
-		if( ABS(x[ix]) > ABS(maxf) )
+		if( ABS(x[ix]) > maxf )
 		{
 			max = i;
 			maxf = ABS(x[ix]);

@@ -55,9 +55,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 	FLOAT minf=0.0;
 	BLASLONG min=0;
-	if (n < 0 || inc_x < 1 ) return(min);
+	if (n <= 0 || inc_x <= 0) return(min);
 	minf=ABS(x[0]);
+	ix += inc_x;
+	i++;
 	while(i < n)
 	{

@@ -47,9 +47,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 	FLOAT maxf=0.0;
 	BLASLONG max=0;
-	if (n < 0 || inc_x < 1 ) return(max);
+	if (n <= 0 || inc_x <= 0) return(max);
 	maxf=x[0];
+	ix += inc_x;
+	i++;
 	while(i < n)
 	{

@@ -45,9 +45,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 	FLOAT minf=0.0;
 	BLASLONG min=0;
-	if (n < 0 || inc_x < 1 ) return(min);
+	if (n <= 0 || inc_x <= 0) return(min);
 	minf=x[0];
+	ix += inc_x;
+	i++;
 	while(i < n)
 	{

@@ -53,24 +53,24 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 {
 	BLASLONG i=0;
 	BLASLONG ix=0;
-	FLOAT maxf[2];
+	FLOAT maxf;
 	BLASLONG max=0;
 	BLASLONG inc_x2;
-	if (n < 0 || inc_x < 1 ) return(max);
+	if (n <= 0 || inc_x <= 0) return(max);
 	inc_x2 = 2 * inc_x;
-	maxf[0] = ABS(x[ix]);
-	maxf[1] = ABS(x[ix+1]);
+	maxf = CABS1(x,0);
+	ix += inc_x2;
+	i++;
 	while(i < n)
 	{
-		if( CABS1(x,ix) > CABS1(maxf,0) )
+		if( CABS1(x,ix) > maxf )
 		{
 			max = i;
-			maxf[0] = ABS(x[ix]);
-			maxf[1] = ABS(x[ix+1]);
+			maxf = CABS1(x,ix);
 		}
 		ix += inc_x2;
 		i++;
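Here the two-element maxf[] array collapses into a single scalar holding CABS1(x) = |Re x| + |Im x|, which is the quantity BLAS IZAMAX actually ranks by. A compact reference with interleaved re/im storage (0-based index in this sketch; the helper name is made up):

    #include <math.h>
    #include <stdio.h>

    static long izamax_ref(long n, const double *x, long inc_x)
    {
        long i = 1, ix, max = 0;
        double maxf, c;
        if (n <= 0 || inc_x <= 0) return max;
        maxf = fabs(x[0]) + fabs(x[1]);          /* CABS1 of element 0 */
        ix = 2 * inc_x;
        while (i < n) {
            c = fabs(x[ix]) + fabs(x[ix + 1]);
            if (c > maxf) { max = i; maxf = c; }
            ix += 2 * inc_x;
            i++;
        }
        return max;
    }

    int main(void)
    {
        double x[] = { 1.0, 1.0, -3.0, 4.0, 0.5, 0.5 };
        printf("%ld\n", izamax_ref(3, x, 1));    /* prints 1 */
        return 0;
    }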

@@ -53,24 +53,24 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 {
 	BLASLONG i=0;
 	BLASLONG ix=0;
-	FLOAT minf[2];
+	FLOAT minf;
 	BLASLONG min=0;
 	BLASLONG inc_x2;
-	if (n < 0 || inc_x < 1 ) return(min);
+	if (n <= 0 || inc_x <= 0) return(min);
 	inc_x2 = 2 * inc_x;
-	minf[0] = ABS(x[ix]);
-	minf[1] = ABS(x[ix+1]);
+	minf = CABS1(x,0);
+	ix += inc_x2;
+	i++;
 	while(i < n)
 	{
-		if( CABS1(x,ix) < CABS1(minf,0) )
+		if( CABS1(x,ix) < minf )
 		{
 			min = i;
-			minf[0] = ABS(x[ix]);
-			minf[1] = ABS(x[ix+1]);
+			minf = CABS1(x,ix);
 		}
 		ix += inc_x2;
 		i++;

@@ -44,9 +44,11 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 	BLASLONG ix=0;
 	FLOAT maxf=0.0;
-	if (n < 0 || inc_x < 1 ) return(maxf);
+	if (n <= 0 || inc_x <= 0) return(maxf);
 	maxf=x[0];
+	ix += inc_x;
+	i++;
 	while(i < n)
 	{

@@ -44,9 +44,11 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 	BLASLONG ix=0;
 	FLOAT minf=0.0;
-	if (n < 0 || inc_x < 1 ) return(minf);
+	if (n <= 0 || inc_x <= 0) return(minf);
 	minf=x[0];
+	ix += inc_x;
+	i++;
 	while(i < n)
 	{

@@ -57,7 +57,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 	FLOAT absxi = 0.0;
-	if (n < 0 || inc_x < 1 ) return(0.0);
+	if (n <= 0 || inc_x <= 0) return(0.0);
 	if ( n == 1 ) return( ABS(x[0]) );
 	n *= inc_x;

@@ -53,29 +53,27 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 {
 	BLASLONG i=0;
 	BLASLONG ix=0;
-	FLOAT maxf[2];
-	BLASLONG max=0;
+	FLOAT maxf;
 	BLASLONG inc_x2;
-	if (n < 0 || inc_x < 1 ) return(0.0);
+	if (n <= 0 || inc_x <= 0) return(0.0);
 	inc_x2 = 2 * inc_x;
-	maxf[0] = ABS(x[ix]);
-	maxf[1] = ABS(x[ix+1]);
+	maxf = CABS1(x,0);
+	ix += inc_x2;
+	i++;
 	while(i < n)
 	{
-		if( CABS1(x,ix) > CABS1(maxf,0) )
+		if( CABS1(x,ix) > maxf )
 		{
-			max = i;
-			maxf[0] = ABS(x[ix]);
-			maxf[1] = ABS(x[ix+1]);
+			maxf = CABS1(x,ix);
 		}
 		ix += inc_x2;
 		i++;
 	}
-	return(CABS1(maxf,0));
+	return(maxf);
 }

@@ -53,29 +53,27 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 {
 	BLASLONG i=0;
 	BLASLONG ix=0;
-	FLOAT minf[2];
-	BLASLONG min=0;
+	FLOAT minf;
 	BLASLONG inc_x2;
-	if (n < 0 || inc_x < 1 ) return(0.0);
+	if (n <= 0 || inc_x <= 0) return(0.0);
 	inc_x2 = 2 * inc_x;
-	minf[0] = ABS(x[ix]);
-	minf[1] = ABS(x[ix+1]);
+	minf = CABS1(x,0);
+	ix += inc_x2;
+	i++;
 	while(i < n)
 	{
-		if( CABS1(x,ix) < CABS1(minf,0) )
+		if( CABS1(x,ix) < minf )
 		{
-			min = i;
-			minf[0] = ABS(x[ix]);
-			minf[1] = ABS(x[ix+1]);
+			minf = CABS1(x,ix);
 		}
 		ix += inc_x2;
 		i++;
 	}
-	return(CABS1(minf,0));
+	return(minf);
 }

@@ -55,7 +55,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 	BLASLONG i=0;
 	FLOAT sumf = 0.0;
 	BLASLONG inc_x2;
-	if (n < 0 || inc_x < 1 ) return(sumf);
+	if (n <= 0 || inc_x <= 0) return(sumf);
 	inc_x2 = 2 * inc_x;

@@ -37,11 +37,9 @@ int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FL
 	BLASLONG i=0;
 	BLASLONG ix,iy;
 	FLOAT temp;
-	BLASLONG inc_x2;
-	BLASLONG inc_y2;
+	BLASLONG inc_x2, inc_y2;
-	if ( n < 0 ) return(0);
+	if ( n <= 0 ) return(0);
 	ix = 0;
 	iy = 0;

@@ -57,7 +57,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 	BLASLONG inc_x2;
 	FLOAT temp;
-	if (n < 0 || inc_x < 1 ) return(0.0);
+	if (n <= 0 || inc_x <= 0) return(0.0);
 	inc_x2 = 2 * inc_x;

@@ -0,0 +1,91 @@
include $(KERNELDIR)/KERNEL.ARMV8
SAMAXKERNEL = amax.S
DAMAXKERNEL = amax.S
CAMAXKERNEL = zamax.S
ZAMAXKERNEL = zamax.S
ISAMAXKERNEL = isamax.S
IDAMAXKERNEL = idamax.S
ICAMAXKERNEL = izamax.S
IZAMAXKERNEL = izamax.S
SASUMKERNEL = asum.S
DASUMKERNEL = asum.S
CASUMKERNEL = casum.S
ZASUMKERNEL = zasum.S
SAXPYKERNEL = axpy.S
DAXPYKERNEL = axpy.S
CAXPYKERNEL = zaxpy.S
ZAXPYKERNEL = zaxpy.S
SCOPYKERNEL = copy.S
DCOPYKERNEL = copy.S
CCOPYKERNEL = copy.S
ZCOPYKERNEL = copy.S
DOTKERNEL = dot.S
DDOTKERNEL = dot.S
CDOTKERNEL = zdot.S
ZDOTKERNEL = zdot.S
SNRM2KERNEL = snrm2.S
DNRM2KERNEL = dnrm2.S
CNRM2KERNEL = znrm2.S
ZNRM2KERNEL = znrm2.S
SROTKERNEL = rot.S
DROTKERNEL = rot.S
CROTKERNEL = zrot.S
ZROTKERNEL = zrot.S
SCALKERNEL = scal.S
DSCALKERNEL = scal.S
CSCALKERNEL = zscal.S
ZSCALKERNEL = zscal.S
SSWAPKERNEL = swap.S
DSWAPKERNEL = swap.S
CSWAPKERNEL = swap.S
ZSWAPKERNEL = swap.S
SGEMVNKERNEL = gemv_n.S
DGEMVNKERNEL = gemv_n.S
CGEMVNKERNEL = zgemv_n.S
ZGEMVNKERNEL = zgemv_n.S
SGEMVTKERNEL = gemv_t.S
DGEMVTKERNEL = gemv_t.S
CGEMVTKERNEL = zgemv_t.S
ZGEMVTKERNEL = zgemv_t.S
STRMMKERNEL = strmm_kernel_4x4.S
DTRMMKERNEL = dtrmm_kernel_4x4.S
CTRMMKERNEL = ctrmm_kernel_4x4.S
ZTRMMKERNEL = ztrmm_kernel_4x4.S
SGEMMKERNEL = sgemm_kernel_4x4.S
SGEMMONCOPY = ../generic/gemm_ncopy_4.c
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
SGEMMONCOPYOBJ = sgemm_oncopy.o
SGEMMOTCOPYOBJ = sgemm_otcopy.o
DGEMMKERNEL = dgemm_kernel_4x4.S
DGEMMONCOPY = ../generic/gemm_ncopy_4.c
DGEMMOTCOPY = ../generic/gemm_tcopy_4.c
DGEMMONCOPYOBJ = dgemm_oncopy.o
DGEMMOTCOPYOBJ = dgemm_otcopy.o
CGEMMKERNEL = cgemm_kernel_4x4.S
CGEMMONCOPY = ../generic/zgemm_ncopy_4.c
CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c
CGEMMONCOPYOBJ = cgemm_oncopy.o
CGEMMOTCOPYOBJ = cgemm_otcopy.o
ZGEMMKERNEL = zgemm_kernel_4x4.S
ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c
ZGEMMONCOPYOBJ = zgemm_oncopy.o
ZGEMMOTCOPYOBJ = zgemm_otcopy.o

kernel/arm64/amax.S (new file, 249 lines)

@@ -0,0 +1,249 @@
/*******************************************************************************
Copyright (c) 2015, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#define ASSEMBLER
#include "common.h"
#define N x0 /* vector length */
#define X x1 /* X vector address */
#define INC_X x2 /* X stride */
#define I x5 /* loop variable */
/*******************************************************************************
* Macro definitions
*******************************************************************************/
#if defined(USE_MIN)
#define COND le
#else
#define COND ge
#endif
#if !defined(DOUBLE)
#define REG0 wzr
#define MAXF s0
#define TMPF s1
#define TMPVF {v1.s}[0]
#define SZ 4
#else
#define REG0 xzr
#define MAXF d0
#define TMPF d1
#define TMPVF {v1.d}[0]
#define SZ 8
#endif
/******************************************************************************/
.macro INIT_F1
ldr MAXF, [X], #SZ
#if defined(USE_ABS)
fabs MAXF, MAXF
#endif
.endm
.macro KERNEL_F1
ldr TMPF, [X], #SZ
#if defined(USE_ABS)
fabs TMPF, TMPF
#endif
fcmp MAXF, TMPF
fcsel MAXF, MAXF, TMPF, COND
.endm
.macro INIT_F4
#if !defined(DOUBLE)
ld1 {v0.4s}, [X], #16
#if defined(USE_ABS)
fabs v0.4s, v0.4s
#endif
#if defined(USE_MIN)
fminv MAXF, v0.4s
#else
fmaxv MAXF, v0.4s
#endif
#else // DOUBLE
ld2 {v0.2d,v1.2d}, [X], #32
#if defined(USE_ABS)
fabs v0.2d, v0.2d
fabs v1.2d, v1.2d
#endif
#if defined(USE_MIN)
fmin v0.2d, v0.2d, v1.2d
fminp MAXF, v0.2d
#else
fmax v0.2d, v0.2d, v1.2d
fmaxp MAXF, v0.2d
#endif
#endif
.endm
.macro KERNEL_F4
#if !defined(DOUBLE)
ld1 {v1.4s}, [X], #16
#if defined(USE_ABS)
fabs v1.4s, v1.4s
#endif
#if defined(USE_MIN)
fminv TMPF, v1.4s
#else
fmaxv TMPF, v1.4s
#endif
#else // DOUBLE
ld2 {v1.2d,v2.2d}, [X], #32
#if defined(USE_ABS)
fabs v1.2d, v1.2d
fabs v2.2d, v2.2d
#endif
#if defined(USE_MIN)
fmin v1.2d, v1.2d, v2.2d
fminp TMPF, v1.2d
#else
fmax v1.2d, v1.2d, v2.2d
fmaxp TMPF, v1.2d
#endif
#endif
fcmp MAXF, TMPF
fcsel MAXF, MAXF, TMPF, COND
.endm
.macro INIT_S
#if !defined(DOUBLE)
lsl INC_X, INC_X, #2
ld1 {v0.s}[0], [X], INC_X
#else
lsl INC_X, INC_X, #3
ld1 {v0.d}[0], [X], INC_X
#endif
#if defined(USE_ABS)
fabs MAXF, MAXF
#endif
.endm
.macro KERNEL_S1
ld1 TMPVF, [X], INC_X
#if defined(USE_ABS)
fabs TMPF, TMPF
#endif
fcmp MAXF, TMPF
fcsel MAXF, MAXF, TMPF, COND
.endm
/*******************************************************************************
* End of macro definitions
*******************************************************************************/
PROLOGUE
cmp N, xzr
ble amax_kernel_zero
cmp INC_X, xzr
ble amax_kernel_zero
cmp INC_X, #1
bne amax_kernel_S_BEGIN
amax_kernel_F_BEGIN:
asr I, N, #2
cmp I, xzr
beq amax_kernel_F1_INIT
INIT_F4
subs I, I, #1
beq amax_kernel_F1
amax_kernel_F4:
KERNEL_F4
subs I, I, #1
bne amax_kernel_F4
amax_kernel_F1:
ands I, N, #3
ble amax_kernel_L999
amax_kernel_F10:
KERNEL_F1
subs I, I, #1
bne amax_kernel_F10
ret
amax_kernel_F1_INIT:
INIT_F1
subs N, N, #1
b amax_kernel_F1
amax_kernel_S_BEGIN:
INIT_S
subs N, N, #1
ble amax_kernel_L999
asr I, N, #2
cmp I, xzr
ble amax_kernel_S1
amax_kernel_S4:
KERNEL_S1
KERNEL_S1
KERNEL_S1
KERNEL_S1
subs I, I, #1
bne amax_kernel_S4
amax_kernel_S1:
ands I, N, #3
ble amax_kernel_L999
amax_kernel_S10:
KERNEL_S1
subs I, I, #1
bne amax_kernel_S10
amax_kernel_L999:
ret
amax_kernel_zero:
fmov MAXF, REG0
ret
EPILOGUE
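The file above is parameterised so that one source covers the MAX/MIN/AMAX/AMIN family: USE_ABS switches the fabs applied to loaded values and USE_MIN flips the comparison (COND le versus ge, fminv versus fmaxv). USE_ABS and USE_MIN are the file's own switches; VAL, BETTER and extreme_ref below are made up for this C sketch of the same single-source trick:

    #include <math.h>

    #if defined(USE_ABS)
    #define VAL(v) fabs(v)              /* AMAX / AMIN variants */
    #else
    #define VAL(v) (v)                  /* MAX / MIN variants */
    #endif

    #if defined(USE_MIN)
    #define BETTER(a, b) ((a) < (b))    /* keep the smaller value */
    #else
    #define BETTER(a, b) ((a) > (b))    /* keep the larger value */
    #endif

    double extreme_ref(long n, const double *x, long inc_x)
    {
        double best;
        long i, ix;
        if (n <= 0 || inc_x <= 0) return 0.0;
        best = VAL(x[0]);
        for (i = 1, ix = inc_x; i < n; i++, ix += inc_x)
            if (BETTER(VAL(x[ix]), best))
                best = VAL(x[ix]);
        return best;
    }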

kernel/arm64/asum.S (new file, 194 lines)

@@ -0,0 +1,194 @@
/*******************************************************************************
Copyright (c) 2015, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#define ASSEMBLER
#include "common.h"
#define N x0 /* vector length */
#define X x1 /* X vector address */
#define INC_X x2 /* X stride */
#define I x5 /* loop variable */
/*******************************************************************************
* Macro definitions
*******************************************************************************/
#if !defined(DOUBLE)
#define REG0 wzr
#define SUMF s0
#define TMPF s1
#define TMPVF {v1.s}[0]
#define SZ 4
#else
#define REG0 xzr
#define SUMF d0
#define TMPF d1
#define TMPVF {v1.d}[0]
#define SZ 8
#endif
/******************************************************************************/
.macro KERNEL_F1
ldr TMPF, [X], #SZ
fabs TMPF, TMPF
fadd SUMF, SUMF, TMPF
.endm
.macro KERNEL_F8
#if !defined(DOUBLE)
ld1 {v1.4s, v2.4s}, [X], #32 // Load [X3, X2, X1, X0]
fabs v1.4s, v1.4s // ABS() each value
fabs v2.4s, v2.4s // ABS() each value
fadd v1.4s, v1.4s, v2.4s // [X3+X1, X2+X0]
fadd v0.4s, v0.4s, v1.4s // [X3+X1, X2+X0]
PRFM PLDL1KEEP, [X, #1024]
#else // DOUBLE
ld1 {v2.2d, v3.2d, v4.2d, v5.2d}, [X]
add X, X, #64
fabs v2.2d, v2.2d
fabs v3.2d, v3.2d
fabs v4.2d, v4.2d
fabs v5.2d, v5.2d
PRFM PLDL1KEEP, [X, #1024]
fadd v2.2d, v2.2d, v3.2d
fadd v4.2d, v4.2d, v5.2d
fadd v0.2d, v0.2d, v2.2d
fadd v0.2d, v0.2d, v4.2d
#endif
.endm
.macro KERNEL_F8_FINALIZE
#if !defined(DOUBLE)
ext v1.16b, v0.16b, v0.16b, #8
fadd v0.2s, v0.2s, v1.2s
faddp SUMF, v0.2s
#else
faddp SUMF, v0.2d
#endif
.endm
.macro INIT_S
#if !defined(DOUBLE)
lsl INC_X, INC_X, #2
#else
lsl INC_X, INC_X, #3
#endif
.endm
.macro KERNEL_S1
ld1 TMPVF, [X], INC_X
fabs TMPF, TMPF
fadd SUMF, SUMF, TMPF
.endm
/*******************************************************************************
* End of macro definitions
*******************************************************************************/
PROLOGUE
fmov SUMF, REG0
#if !defined(DOUBLE)
fmov s1, SUMF
#else
fmov d1, SUMF
#endif
cmp N, xzr
ble asum_kernel_L999
cmp INC_X, xzr
ble asum_kernel_L999
cmp INC_X, #1
bne asum_kernel_S_BEGIN
asum_kernel_F_BEGIN:
asr I, N, #3
cmp I, xzr
beq asum_kernel_F1
asum_kernel_F8:
KERNEL_F8
subs I, I, #1
bne asum_kernel_F8
KERNEL_F8_FINALIZE
asum_kernel_F1:
ands I, N, #7
ble asum_kernel_L999
asum_kernel_F10:
KERNEL_F1
subs I, I, #1
bne asum_kernel_F10
asum_kernel_L999:
ret
asum_kernel_S_BEGIN:
INIT_S
asr I, N, #2
cmp I, xzr
ble asum_kernel_S1
asum_kernel_S4:
KERNEL_S1
KERNEL_S1
KERNEL_S1
KERNEL_S1
subs I, I, #1
bne asum_kernel_S4
asum_kernel_S1:
ands I, N, #3
ble asum_kernel_L999
asum_kernel_S10:
KERNEL_S1
subs I, I, #1
bne asum_kernel_S10
ret
EPILOGUE

kernel/arm64/axpy.S (new file, 209 lines)

@@ -0,0 +1,209 @@
/*******************************************************************************
Copyright (c) 2015, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#define ASSEMBLER
#include "common.h"
#define N x0 /* vector length */
#define X x3 /* X vector address */
#define INC_X x4 /* X stride */
#define Y x5 /* Y vector address */
#define INC_Y x6 /* Y stride */
#define I x1 /* loop variable */
/*******************************************************************************
* Macro definitions
*******************************************************************************/
#if !defined(DOUBLE)
#define DA s0 /* scale input value */
#define TMPX s1
#define TMPVX {v1.s}[0]
#define TMPY s2
#define TMPVY {v2.s}[0]
#define SZ 4
#else
#define DA d0 /* scale input value */
#define TMPX d1
#define TMPVX {v1.d}[0]
#define TMPY d2
#define TMPVY {v2.d}[0]
#define SZ 8
#endif
/******************************************************************************/
.macro KERNEL_F1
ldr TMPX, [X], #SZ
ldr TMPY, [Y]
fmadd TMPY, TMPX, DA, TMPY
str TMPY, [Y], #SZ
.endm
.macro KERNEL_F4
#if !defined(DOUBLE)
ld1 {v1.4s}, [X], #16
ld1 {v2.4s}, [Y]
fmla v2.4s, v1.4s, v0.s[0]
st1 {v2.4s}, [Y], #16
#else // DOUBLE
ld1 {v1.2d, v2.2d}, [X], #32
ld1 {v3.2d, v4.2d}, [Y]
fmla v3.2d, v1.2d, v0.d[0]
fmla v4.2d, v2.2d, v0.d[0]
st1 {v3.2d, v4.2d}, [Y], #32
#endif
.endm
.macro KERNEL_F8
#if !defined(DOUBLE)
ld1 {v1.4s, v2.4s}, [X], #32
ld1 {v3.4s, v4.4s}, [Y]
fmla v3.4s, v1.4s, v0.s[0]
fmla v4.4s, v2.4s, v0.s[0]
st1 {v3.4s, v4.4s}, [Y], #32
#else // DOUBLE
ld1 {v1.2d, v2.2d, v3.2d, v4.2d}, [X], #64
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [Y]
fmla v16.2d, v1.2d, v0.d[0]
fmla v17.2d, v2.2d, v0.d[0]
fmla v18.2d, v3.2d, v0.d[0]
fmla v19.2d, v4.2d, v0.d[0]
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [Y], #64
#endif
PRFM PLDL1KEEP, [X, #512]
PRFM PLDL1KEEP, [Y, #512]
.endm
.macro INIT_S
#if !defined(DOUBLE)
lsl INC_X, INC_X, #2
lsl INC_Y, INC_Y, #2
#else
lsl INC_X, INC_X, #3
lsl INC_Y, INC_Y, #3
#endif
.endm
.macro KERNEL_S1
ld1 TMPVX, [X], INC_X
ldr TMPY, [Y]
fmadd TMPY, TMPX, DA, TMPY
st1 TMPVY, [Y], INC_Y
.endm
/*******************************************************************************
* End of macro definitions
*******************************************************************************/
PROLOGUE
cmp N, xzr
ble axpy_kernel_L999
fcmp DA, #0.0
beq axpy_kernel_L999
cmp INC_X, #1
bne axpy_kernel_S_BEGIN
cmp INC_Y, #1
bne axpy_kernel_S_BEGIN
axpy_kernel_F_BEGIN:
asr I, N, #3
cmp I, xzr
beq axpy_kernel_F1
axpy_kernel_F8:
KERNEL_F8
subs I, I, #1
bne axpy_kernel_F8
axpy_kernel_F1:
ands I, N, #7
ble axpy_kernel_L999
axpy_kernel_F10:
KERNEL_F1
subs I, I, #1
bne axpy_kernel_F10
mov w0, wzr
ret
axpy_kernel_S_BEGIN:
INIT_S
asr I, N, #2
cmp I, xzr
ble axpy_kernel_S1
axpy_kernel_S4:
KERNEL_S1
KERNEL_S1
KERNEL_S1
KERNEL_S1
subs I, I, #1
bne axpy_kernel_S4
axpy_kernel_S1:
ands I, N, #3
ble axpy_kernel_L999
axpy_kernel_S10:
KERNEL_S1
subs I, I, #1
bne axpy_kernel_S10
axpy_kernel_L999:
mov w0, wzr
ret
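For reference, the operation implemented above is the usual AXPY update y := da*x + y, with an early return when n <= 0 or da == 0 (both checked in the prologue). A plain-C model of the assumed semantics, with an illustrative function name:

    /* y[i*inc_y] += da * x[i*inc_x] for i = 0..n-1 */
    void axpy_ref(long n, double da, const double *x, long inc_x,
                  double *y, long inc_y)
    {
        long i, ix = 0, iy = 0;
        if (n <= 0 || da == 0.0) return;
        for (i = 0; i < n; i++) {
            y[iy] += da * x[ix];
            ix += inc_x;
            iy += inc_y;
        }
    }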

kernel/arm64/casum.S (new file, 170 lines)

@@ -0,0 +1,170 @@
/*******************************************************************************
Copyright (c) 2015, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#define ASSEMBLER
#include "common.h"
#define N x0 /* vector length */
#define X x1 /* X vector address */
#define INC_X x2 /* X stride */
#define I x5 /* loop variable */
/*******************************************************************************
* Macro definitions
*******************************************************************************/
#define REG0 wzr
#define SUMF s0
#define TMPF s1
#define TMPVF {v1.s}[0]
#define SZ 4
/******************************************************************************/
.macro KERNEL_F1
ld1 {v1.2s}, [X], #8
fabs v1.2s, v1.2s
ext v2.8b, v1.8b, v1.8b, #4
fadd TMPF, TMPF, s2
fadd SUMF, SUMF, TMPF
.endm
.macro KERNEL_F8
ld1 {v1.4s, v2.4s, v3.4s, v4.4s}, [X]
add X, X, #64
fabs v1.4s, v1.4s
fabs v2.4s, v2.4s
fabs v3.4s, v3.4s
fabs v4.4s, v4.4s
PRFM PLDL1KEEP, [X, #1024]
fadd v1.4s, v1.4s, v2.4s
fadd v3.4s, v3.4s, v4.4s
fadd v0.4s, v0.4s, v1.4s
fadd v0.4s, v0.4s, v3.4s
.endm
.macro KERNEL_F8_FINALIZE
ext v1.16b, v0.16b, v0.16b, #8
fadd v0.2s, v0.2s, v1.2s
faddp SUMF, v0.2s
.endm
.macro INIT_S
lsl INC_X, INC_X, #3
.endm
.macro KERNEL_S1
ld1 {v1.2s}, [X], INC_X
fabs v1.2s, v1.2s
ext v2.8b, v1.8b, v1.8b, #4
fadd TMPF, TMPF, s2
fadd SUMF, SUMF, TMPF
.endm
/*******************************************************************************
* End of macro definitions
*******************************************************************************/
PROLOGUE
fmov SUMF, REG0
fmov s1, SUMF
cmp N, xzr
ble asum_kernel_L999
cmp INC_X, xzr
ble asum_kernel_L999
cmp INC_X, #1
bne asum_kernel_S_BEGIN
asum_kernel_F_BEGIN:
asr I, N, #3
cmp I, xzr
beq asum_kernel_F1
asum_kernel_F8:
KERNEL_F8
subs I, I, #1
bne asum_kernel_F8
KERNEL_F8_FINALIZE
asum_kernel_F1:
ands I, N, #7
ble asum_kernel_L999
asum_kernel_F10:
KERNEL_F1
subs I, I, #1
bne asum_kernel_F10
asum_kernel_L999:
ret
asum_kernel_S_BEGIN:
INIT_S
asr I, N, #2
cmp I, xzr
ble asum_kernel_S1
asum_kernel_S4:
KERNEL_S1
KERNEL_S1
KERNEL_S1
KERNEL_S1
subs I, I, #1
bne asum_kernel_S4
asum_kernel_S1:
ands I, N, #3
ble asum_kernel_L999
asum_kernel_S10:
KERNEL_S1
subs I, I, #1
bne asum_kernel_S10
ret
EPILOGUE
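For reference (not part of this commit), a C sketch of what this casum kernel accumulates, |Re| + |Im| per single-precision complex element, matching the ld1 {v1.2s} / fabs / ext / fadd sequence above; the _ref identifiers are illustrative only and positive increments are assumed:
/* Reference sketch: complex single-precision asum. */
#include <math.h>
#include <stddef.h>
static float casum_ref(size_t n, const float *x, ptrdiff_t inc_x)
{
    float sum = 0.0f;
    for (size_t i = 0; i < n; i++) {
        const float *p = x + 2 * i * inc_x;      /* complex element: {re, im} */
        sum += fabsf(p[0]) + fabsf(p[1]);
    }
    return sum;
}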

File diff suppressed because it is too large.

232
kernel/arm64/copy.S Normal file

@ -0,0 +1,232 @@
/*******************************************************************************
Copyright (c) 2015, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#define ASSEMBLER
#include "common.h"
#define N x0 /* vector length */
#define X x1 /* X vector address */
#define INC_X x2 /* X stride */
#define Y x3 /* Y vector address */
#define INC_Y x4 /* Y stride */
#define I x5 /* loop variable */
/*******************************************************************************
* Macro definitions
*******************************************************************************/
#if !defined(DOUBLE)
#define TMPF s0
#define TMPVF {v0.s}[0]
#define SZ 4
#else
#define TMPF d0
#define TMPVF {v0.d}[0]
#define SZ 8
#endif
/******************************************************************************/
.macro KERNEL_F1
#if !defined(COMPLEX)
ldr TMPF, [X], #SZ
str TMPF, [Y], #SZ
#else
#if !defined(DOUBLE)
ld1 {v0.2s}, [X], #8
st1 {v0.2s}, [Y], #8
#else
ld1 {v0.2d}, [X], #16
st1 {v0.2d}, [Y], #16
#endif
#endif
.endm
.macro KERNEL_F4
#if !defined(COMPLEX)
#if !defined(DOUBLE)
ld1 {v0.4s}, [X], #16
st1 {v0.4s}, [Y], #16
#else // DOUBLE
ld1 {v0.4s}, [X], #16
ld1 {v1.4s}, [X], #16
st1 {v0.4s}, [Y], #16
st1 {v1.4s}, [Y], #16
#endif
#else // COMPLEX
#if !defined(DOUBLE)
ld1 {v0.4s}, [X], #16
ld1 {v1.4s}, [X], #16
st1 {v0.4s}, [Y], #16
st1 {v1.4s}, [Y], #16
#else // DOUBLE
ld1 {v0.4s}, [X], #16
ld1 {v1.4s}, [X], #16
ld1 {v2.4s}, [X], #16
ld1 {v3.4s}, [X], #16
st1 {v0.4s}, [Y], #16
st1 {v1.4s}, [Y], #16
st1 {v2.4s}, [Y], #16
st1 {v3.4s}, [Y], #16
#endif
#endif
.endm
.macro INIT_S
#if !defined(COMPLEX)
#if !defined(DOUBLE)
lsl INC_X, INC_X, #2
lsl INC_Y, INC_Y, #2
#else
lsl INC_X, INC_X, #3
lsl INC_Y, INC_Y, #3
#endif
#else
#if !defined(DOUBLE)
lsl INC_X, INC_X, #3
lsl INC_Y, INC_Y, #3
#else
lsl INC_X, INC_X, #4
lsl INC_Y, INC_Y, #4
#endif
#endif
.endm
.macro KERNEL_S1
#if !defined(COMPLEX)
#if !defined(DOUBLE)
ldr w10, [X]
add X, X, INC_X
str w10, [Y]
add Y, Y, INC_Y
#else
ldr x10, [X]
add X, X, INC_X
str x10, [Y]
add Y, Y, INC_Y
#endif
#else
#if !defined(DOUBLE)
ld1 {v0.2s}, [X]
add X, X, INC_X
st1 {v0.2s}, [Y]
add Y, Y, INC_Y
#else
ld1 {v0.2d}, [X]
add X, X, INC_X
st1 {v0.2d}, [Y]
add Y, Y, INC_Y
#endif
#endif
.endm
/*******************************************************************************
* End of macro definitions
*******************************************************************************/
PROLOGUE
cmp N, xzr
ble copy_kernel_L999
cmp INC_X, #1
bne copy_kernel_S_BEGIN
cmp INC_Y, #1
bne copy_kernel_S_BEGIN
copy_kernel_F_BEGIN:
asr I, N, #2
cmp I, xzr
beq copy_kernel_F1
copy_kernel_F4:
KERNEL_F4
subs I, I, #1
bne copy_kernel_F4
copy_kernel_F1:
ands I, N, #3
ble copy_kernel_L999
copy_kernel_F10:
KERNEL_F1
subs I, I, #1
bne copy_kernel_F10
mov w0, wzr
ret
copy_kernel_S_BEGIN:
INIT_S
asr I, N, #2
cmp I, xzr
ble copy_kernel_S1
copy_kernel_S4:
KERNEL_S1
KERNEL_S1
KERNEL_S1
KERNEL_S1
subs I, I, #1
bne copy_kernel_S4
copy_kernel_S1:
ands I, N, #3
ble copy_kernel_L999
copy_kernel_S10:
KERNEL_S1
subs I, I, #1
bne copy_kernel_S10
copy_kernel_L999:
mov w0, wzr
ret
EPILOGUE

File diff suppressed because it is too large.

File diff suppressed because it is too large.

169
kernel/arm64/dnrm2.S Normal file

@ -0,0 +1,169 @@
/*******************************************************************************
Copyright (c) 2015, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#define ASSEMBLER
#include "common.h"
#define N x0 /* vector length */
#define X x1 /* X vector address */
#define INC_X x2 /* X stride */
#define I x5 /* loop variable */
/*******************************************************************************
* Macro definitions
*******************************************************************************/
#define TMPF d6
#define SSQ d0
#define TMPVF {v6.d}[0]
#define SZ 8
/******************************************************************************/
.macro KERNEL_F1
ldr TMPF, [X], #SZ
fmul TMPF, TMPF, TMPF
fadd SSQ, SSQ, TMPF
.endm
.macro KERNEL_F8
ld1 {v1.2d, v2.2d}, [X], #32
fmla v0.2d, v1.2d, v1.2d
fmla v5.2d, v2.2d, v2.2d
ld1 {v3.2d, v4.2d}, [X], #32
fmla v0.2d, v3.2d, v3.2d
fmla v5.2d, v4.2d, v4.2d
PRFM PLDL1KEEP, [X, #1024]
.endm
.macro nrm2_kernel_F8_FINALIZE
fadd v0.2d, v0.2d, v5.2d
faddp SSQ, v0.2d
.endm
.macro INIT_S
lsl INC_X, INC_X, #3
ld1 TMPVF, [X], INC_X
fmul SSQ, TMPF, TMPF
.endm
.macro KERNEL_S1
ld1 TMPVF, [X], INC_X
fmul TMPF, TMPF, TMPF
fadd SSQ, SSQ, TMPF
.endm
/*******************************************************************************
* End of macro definitions
*******************************************************************************/
PROLOGUE
fmov SSQ, xzr
fmov d5, SSQ
cmp N, xzr
ble nrm2_kernel_zero
cmp INC_X, xzr
ble nrm2_kernel_zero
cmp INC_X, #1
bne nrm2_kernel_S_BEGIN
nrm2_kernel_F_BEGIN:
asr I, N, #3
cmp I, xzr
beq nrm2_kernel_F1_INIT
nrm2_kernel_F8:
KERNEL_F8
subs I, I, #1
bne nrm2_kernel_F8
nrm2_kernel_F8_FINALIZE
nrm2_kernel_F1:
ands I, N, #7
ble nrm2_kernel_L999
nrm2_kernel_F10:
KERNEL_F1
subs I, I, #1
bne nrm2_kernel_F10
b nrm2_kernel_L999
nrm2_kernel_F1_INIT:
b nrm2_kernel_F1
nrm2_kernel_S_BEGIN:
INIT_S
subs N, N, #1
ble nrm2_kernel_L999
asr I, N, #2
cmp I, xzr
ble nrm2_kernel_S1
nrm2_kernel_S4:
KERNEL_S1
KERNEL_S1
KERNEL_S1
KERNEL_S1
subs I, I, #1
bne nrm2_kernel_S4
nrm2_kernel_S1:
ands I, N, #3
ble nrm2_kernel_L999
nrm2_kernel_S10:
KERNEL_S1
subs I, I, #1
bne nrm2_kernel_S10
nrm2_kernel_L999:
fsqrt SSQ, SSQ
ret
nrm2_kernel_zero:
ret
EPILOGUE
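For reference (not part of this commit), a C sketch of what this dnrm2 kernel computes: sqrt of the plain sum of squares, without the scaled, overflow-safe update used by the netlib reference; the _ref identifiers are illustrative only and a positive increment is assumed:
/* Reference sketch: unscaled Euclidean norm, mirroring the fmla/faddp reduction above. */
#include <math.h>
#include <stddef.h>
static double dnrm2_ref(size_t n, const double *x, ptrdiff_t inc_x)
{
    double ssq = 0.0;
    for (size_t i = 0; i < n; i++) {
        double v = x[i * inc_x];
        ssq += v * v;                            /* SSQ += x[i]^2 */
    }
    return sqrt(ssq);                            /* fsqrt SSQ, SSQ */
}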

227
kernel/arm64/dot.S Normal file

@ -0,0 +1,227 @@
/*******************************************************************************
Copyright (c) 2015, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#define ASSEMBLER
#include "common.h"
#define N x0 /* vector length */
#define X x1 /* X vector address */
#define INC_X x2 /* X stride */
#define Y x3 /* Y vector address */
#define INC_Y x4 /* Y stride */
#define I x5 /* loop variable */
/*******************************************************************************
* Macro definitions
*******************************************************************************/
#if !defined(DOUBLE)
#if !defined(DSDOT)
#define REG0 wzr
#define DOTF s0
#else // DSDOT
#define REG0 xzr
#define DOTF d0
#endif
#define DOTI s1
#define TMPX s2
#define LD1VX {v2.s}[0]
#define TMPY s3
#define LD1VY {v3.s}[0]
#define TMPVY v3.s[0]
#define SZ 4
#else
#define REG0 xzr
#define DOTF d0
#define DOTI d1
#define TMPX d2
#define LD1VX {v2.d}[0]
#define TMPY d3
#define LD1VY {v3.d}[0]
#define TMPVY v3.d[0]
#define SZ 8
#endif
/******************************************************************************/
.macro KERNEL_F1
ldr TMPX, [X], #SZ
ldr TMPY, [Y], #SZ
#if !defined(DSDOT)
fmadd DOTF, TMPX, TMPY, DOTF
#else // DSDOT
fmul TMPX, TMPX, TMPY
fcvt d2, TMPX
fadd DOTF, DOTF, d2
#endif
.endm
.macro KERNEL_F4
#if !defined(DOUBLE)
ld1 {v2.4s}, [X], #16
ld1 {v3.4s}, [Y], #16
#if !defined(DSDOT)
fmla v0.4s, v2.4s, v3.4s
#else
fmul v2.4s, v2.4s, v3.4s
ext v3.16b, v2.16b, v2.16b, #8
fcvtl v2.2d, v2.2s
fcvtl v3.2d, v3.2s
fadd v0.2d, v0.2d, v2.2d
fadd v0.2d, v0.2d, v3.2d
#endif
#else //DOUBLE
ld1 {v2.2d, v3.2d}, [X], #32
ld1 {v4.2d, v5.2d}, [Y], #32
fmul v2.2d, v2.2d, v4.2d
fmul v3.2d, v3.2d, v5.2d
fadd v0.2d, v0.2d, v2.2d
fadd v0.2d, v0.2d, v3.2d
#endif
PRFM PLDL1KEEP, [X, #1024]
PRFM PLDL1KEEP, [Y, #1024]
.endm
.macro KERNEL_F4_FINALIZE
#if !defined(DOUBLE)
#if !defined(DSDOT)
ext v1.16b, v0.16b, v0.16b, #8
fadd v0.2s, v0.2s, v1.2s
faddp DOTF, v0.2s
#else
faddp DOTF, v0.2d
#endif
#else //DOUBLE
faddp DOTF, v0.2d
#endif
.endm
.macro INIT_S
#if !defined(DOUBLE)
lsl INC_X, INC_X, #2
lsl INC_Y, INC_Y, #2
#else
lsl INC_X, INC_X, #3
lsl INC_Y, INC_Y, #3
#endif
.endm
.macro KERNEL_S1
ld1 LD1VX, [X], INC_X
ld1 LD1VY, [Y], INC_Y
#if !defined(DSDOT)
fmadd DOTF, TMPX, TMPY, DOTF
#else // DSDOT
fmul TMPX, TMPX, TMPY
fcvt d2, TMPX
fadd DOTF, DOTF, d2
#endif
.endm
/*******************************************************************************
* End of macro definitions
*******************************************************************************/
PROLOGUE
fmov DOTF, REG0
#if defined(DOUBLE)
fmov d6, DOTF
#endif
cmp N, xzr
ble dot_kernel_L999
cmp INC_X, #1
bne dot_kernel_S_BEGIN
cmp INC_Y, #1
bne dot_kernel_S_BEGIN
dot_kernel_F_BEGIN:
asr I, N, #2
cmp I, xzr
beq dot_kernel_F1
dot_kernel_F4:
KERNEL_F4
subs I, I, #1
bne dot_kernel_F4
KERNEL_F4_FINALIZE
dot_kernel_F1:
ands I, N, #3
ble dot_kernel_L999
dot_kernel_F10:
KERNEL_F1
subs I, I, #1
bne dot_kernel_F10
ret
dot_kernel_S_BEGIN:
INIT_S
asr I, N, #2
cmp I, xzr
ble dot_kernel_S1
dot_kernel_S4:
KERNEL_S1
KERNEL_S1
KERNEL_S1
KERNEL_S1
subs I, I, #1
bne dot_kernel_S4
dot_kernel_S1:
ands I, N, #3
ble dot_kernel_L999
dot_kernel_S10:
KERNEL_S1
subs I, I, #1
bne dot_kernel_S10
dot_kernel_L999:
ret
EPILOGUE
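For reference (not part of this commit), a C sketch of the dot product this kernel computes in its DSDOT configuration, where single-precision products are accumulated in double precision, which is what the fcvt/fadd path in the macros above implements; the _ref identifiers are illustrative only and positive increments are assumed:
/* Reference sketch: dsdot-style accumulation. */
#include <stddef.h>
static double dsdot_ref(size_t n, const float *x, ptrdiff_t inc_x,
                        const float *y, ptrdiff_t inc_y)
{
    double dot = 0.0;
    for (size_t i = 0; i < n; i++)
        dot += (double)x[i * inc_x] * (double)y[i * inc_y];
    return dot;
}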

File diff suppressed because it is too large.

320
kernel/arm64/gemv_n.S Normal file

@ -0,0 +1,320 @@
/*******************************************************************************
Copyright (c) 2015, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#define ASSEMBLER
#include "common.h"
#define M x0 /* Y vector length */
#define N x1 /* X vector length */
#define A x3 /* A vector address */
#define LDA x4 /* A stride */
#define X x5 /* X vector address */
#define INC_X x6 /* X stride */
#define Y x7 /* Y vector address */
#define INC_Y x2 /* Y stride */
#define A_PTR x9 /* loop A vector address */
#define Y_IPTR x10 /* loop Y vector address */
#define J x11 /* loop variable */
#define I x12 /* loop variable */
#define Y_OPTR x13 /* loop Y vector address */
/*******************************************************************************
* Macro definitions
*******************************************************************************/
#if !defined(DOUBLE)
#define ALPHA s0
#define TEMP s1
#define TEMPV {v1.s}[0]
#define TMP1 s2
#define TMPV1 {v2.s}[0]
#define TMP2 s3
#define TMPV2 {v3.s}[0]
#define SZ 4
#define SHZ 2
#else
#define ALPHA d0
#define TEMP d1
#define TEMPV {v1.d}[0]
#define TMP1 d2
#define TMPV1 {v2.d}[0]
#define TMP2 d3
#define TMPV2 {v3.d}[0]
#define SZ 8
#define SHZ 3
#endif
/******************************************************************************/
.macro SAVE_REGS
add sp, sp, #-(11 * 16)
stp d8, d9, [sp, #(0 * 16)]
stp d10, d11, [sp, #(1 * 16)]
stp d12, d13, [sp, #(2 * 16)]
stp d14, d15, [sp, #(3 * 16)]
stp d16, d17, [sp, #(4 * 16)]
stp x18, x19, [sp, #(5 * 16)]
stp x20, x21, [sp, #(6 * 16)]
stp x22, x23, [sp, #(7 * 16)]
stp x24, x25, [sp, #(8 * 16)]
stp x26, x27, [sp, #(9 * 16)]
str x28, [sp, #(10 * 16)]
.endm
.macro RESTORE_REGS
ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)]
ldp d12, d13, [sp, #(2 * 16)]
ldp d14, d15, [sp, #(3 * 16)]
ldp d16, d17, [sp, #(4 * 16)]
ldp x18, x19, [sp, #(5 * 16)]
ldp x20, x21, [sp, #(6 * 16)]
ldp x22, x23, [sp, #(7 * 16)]
ldp x24, x25, [sp, #(8 * 16)]
ldp x26, x27, [sp, #(9 * 16)]
ldr x28, [sp, #(10 * 16)]
add sp, sp, #(11*16)
.endm
.macro KERNEL_F16
#if !defined(DOUBLE)
ld1 {v2.4s, v3.4s}, [A_PTR], #32
ld1 {v4.4s, v5.4s}, [Y_IPTR], #32
fmla v4.4s, v1.4s, v2.4s
fmla v5.4s, v1.4s, v3.4s
st1 {v4.4s, v5.4s}, [Y_OPTR], #32
ld1 {v6.4s, v7.4s}, [A_PTR], #32
ld1 {v8.4s, v9.4s}, [Y_IPTR], #32
fmla v8.4s, v1.4s, v6.4s
fmla v9.4s, v1.4s, v7.4s
st1 {v8.4s, v9.4s}, [Y_OPTR], #32
#else //DOUBLE
ld1 {v2.2d, v3.2d}, [A_PTR], #32
ld1 {v4.2d, v5.2d}, [Y_IPTR], #32
fmla v4.2d, v1.2d, v2.2d
fmla v5.2d, v1.2d, v3.2d
st1 {v4.2d, v5.2d}, [Y_OPTR], #32
ld1 {v6.2d, v7.2d}, [A_PTR], #32
ld1 {v8.2d, v9.2d}, [Y_IPTR], #32
fmla v8.2d, v1.2d, v6.2d
fmla v9.2d, v1.2d, v7.2d
st1 {v8.2d, v9.2d}, [Y_OPTR], #32
ld1 {v10.2d, v11.2d}, [A_PTR], #32
ld1 {v12.2d, v13.2d}, [Y_IPTR], #32
fmla v12.2d, v1.2d, v10.2d
fmla v13.2d, v1.2d, v11.2d
st1 {v12.2d, v13.2d}, [Y_OPTR], #32
ld1 {v14.2d, v15.2d}, [A_PTR], #32
ld1 {v16.2d, v17.2d}, [Y_IPTR], #32
fmla v16.2d, v1.2d, v14.2d
fmla v17.2d, v1.2d, v15.2d
st1 {v16.2d, v17.2d}, [Y_OPTR], #32
#endif
.endm
.macro KERNEL_F4
#if !defined(DOUBLE)
ld1 {v2.4s}, [A_PTR], #16
ld1 {v3.4s}, [Y_IPTR], #16
fmla v3.4s, v1.4s, v2.4s
st1 {v3.4s}, [Y_OPTR], #16
#else
ld1 {v2.2d}, [A_PTR], #16
ld1 {v3.2d}, [Y_IPTR], #16
fmla v3.2d, v1.2d, v2.2d
st1 {v3.2d}, [Y_OPTR], #16
ld1 {v4.2d}, [A_PTR], #16
ld1 {v5.2d}, [Y_IPTR], #16
fmla v5.2d, v1.2d, v4.2d
st1 {v5.2d}, [Y_OPTR], #16
#endif
.endm
.macro KERNEL_F1
ld1 TMPV1, [A_PTR], #SZ
ld1 TMPV2, [Y_IPTR]
fmadd TMP2, TEMP, TMP1, TMP2
st1 TMPV2, [Y_IPTR], #SZ
.endm
.macro INIT_S
lsl INC_Y, INC_Y, #SHZ
.endm
.macro KERNEL_S1
ld1 TMPV1, [A_PTR], #SZ
ld1 TMPV2, [Y_IPTR]
fmadd TMP2, TEMP, TMP1, TMP2
st1 TMPV2, [Y_IPTR], INC_Y
.endm
/*******************************************************************************
* End of macro definitions
*******************************************************************************/
PROLOGUE
ldr INC_Y, [sp]
SAVE_REGS
cmp N, xzr
ble gemv_n_kernel_L999
cmp M, xzr
ble gemv_n_kernel_L999
lsl LDA, LDA, #SHZ
lsl INC_X, INC_X, #SHZ
mov J, N
cmp INC_Y, #1
bne gemv_n_kernel_S_BEGIN
gemv_n_kernel_F_LOOP:
ld1 TEMPV, [X], INC_X
fmul TEMP, ALPHA, TEMP
#if !defined(DOUBLE)
ins v1.s[1], v1.s[0]
ins v1.s[2], v1.s[0]
ins v1.s[3], v1.s[0]
#else
ins v1.d[1], v1.d[0]
#endif
mov A_PTR, A
mov Y_IPTR, Y
mov Y_OPTR, Y
gemv_n_kernel_F32:
asr I, M, #5
cmp I, xzr
beq gemv_n_kernel_F4
gemv_n_kernel_F320:
KERNEL_F16
KERNEL_F16
subs I, I, #1
bne gemv_n_kernel_F320
gemv_n_kernel_F4:
ands I, M, #31
asr I, I, #2
cmp I, xzr
beq gemv_n_kernel_F1
gemv_n_kernel_F40:
KERNEL_F4
subs I, I, #1
bne gemv_n_kernel_F40
gemv_n_kernel_F1:
ands I, M, #3
ble gemv_n_kernel_F_END
gemv_n_kernel_F10:
KERNEL_F1
subs I, I, #1
bne gemv_n_kernel_F10
gemv_n_kernel_F_END:
add A, A, LDA
subs J, J, #1
bne gemv_n_kernel_F_LOOP
b gemv_n_kernel_L999
gemv_n_kernel_S_BEGIN:
INIT_S
gemv_n_kernel_S_LOOP:
ld1 TEMPV, [X], INC_X
fmul TEMP, ALPHA, TEMP
mov A_PTR, A
mov Y_IPTR, Y
asr I, M, #2
cmp I, xzr
ble gemv_n_kernel_S1
gemv_n_kernel_S4:
KERNEL_S1
KERNEL_S1
KERNEL_S1
KERNEL_S1
subs I, I, #1
bne gemv_n_kernel_S4
gemv_n_kernel_S1:
ands I, M, #3
ble gemv_n_kernel_S_END
gemv_n_kernel_S10:
KERNEL_S1
subs I, I, #1
bne gemv_n_kernel_S10
gemv_n_kernel_S_END:
add A, A, LDA
subs J, J, #1
bne gemv_n_kernel_S_LOOP
gemv_n_kernel_L999:
mov w0, wzr
RESTORE_REGS
ret
EPILOGUE
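For reference (not part of this commit), a C sketch of the gemv_n loop structure above: A is walked one column at a time (column-major, leading dimension lda) and each column, scaled by alpha*x[j], is accumulated into y, as in the F_LOOP/S_LOOP; the _ref identifiers are illustrative only and positive increments are assumed:
/* Reference sketch: y += alpha * A * x, column by column. */
#include <stddef.h>
static void gemv_n_ref(size_t m, size_t n, double alpha,
                       const double *a, size_t lda,
                       const double *x, ptrdiff_t inc_x,
                       double *y, ptrdiff_t inc_y)
{
    for (size_t j = 0; j < n; j++) {
        double temp = alpha * x[j * inc_x];      /* TEMP = ALPHA * X[j]   */
        const double *col = a + j * lda;
        for (size_t i = 0; i < m; i++)
            y[i * inc_y] += temp * col[i];       /* y += TEMP * A(:,j)    */
    }
}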

347
kernel/arm64/gemv_t.S Normal file

@ -0,0 +1,347 @@
/*******************************************************************************
Copyright (c) 2015, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#define ASSEMBLER
#include "common.h"
#define M x0 /* X vector length (rows of A) */
#define N x1 /* Y vector length (columns of A) */
#define A x3 /* A vector address */
#define LDA x4 /* A stride */
#define X x5 /* X vector address */
#define INC_X x6 /* X stride */
#define Y x7 /* Y vector address */
#define INC_Y x2 /* Y stride */
#define A_PTR x9 /* loop A vector address */
#define X_PTR x10 /* loop X vector address */
#define J x11 /* loop variable */
#define I x12 /* loop variable */
/*******************************************************************************
* Macro definitions
*******************************************************************************/
#if !defined(DOUBLE)
#define REG0 wzr
#define ALPHA s0
#define TEMP s1
#define TEMP1 s2
#define TEMP2 s3
#define TEMP3 s4
#define TEMPV {v1.s}[0]
#define TMP1 s2
#define TMPV1 {v2.s}[0]
#define TMP2 s3
#define TMPV2 {v3.s}[0]
#define SZ 4
#define SHZ 2
#else
#define REG0 xzr
#define ALPHA d0
#define TEMP d1
#define TEMP1 d2
#define TEMP2 d3
#define TEMP3 d4
#define TEMPV {v1.d}[0]
#define TMP1 d2
#define TMPV1 {v2.d}[0]
#define TMP2 d3
#define TMPV2 {v3.d}[0]
#define SZ 8
#define SHZ 3
#endif
/******************************************************************************/
.macro SAVE_REGS
add sp, sp, #-(11 * 16)
stp d8, d9, [sp, #(0 * 16)]
stp d10, d11, [sp, #(1 * 16)]
stp d12, d13, [sp, #(2 * 16)]
stp d14, d15, [sp, #(3 * 16)]
stp d16, d17, [sp, #(4 * 16)]
stp x18, x19, [sp, #(5 * 16)]
stp x20, x21, [sp, #(6 * 16)]
stp x22, x23, [sp, #(7 * 16)]
stp x24, x25, [sp, #(8 * 16)]
stp x26, x27, [sp, #(9 * 16)]
str x28, [sp, #(10 * 16)]
.endm
.macro RESTORE_REGS
ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)]
ldp d12, d13, [sp, #(2 * 16)]
ldp d14, d15, [sp, #(3 * 16)]
ldp d16, d17, [sp, #(4 * 16)]
ldp x18, x19, [sp, #(5 * 16)]
ldp x20, x21, [sp, #(6 * 16)]
ldp x22, x23, [sp, #(7 * 16)]
ldp x24, x25, [sp, #(8 * 16)]
ldp x26, x27, [sp, #(9 * 16)]
ldr x28, [sp, #(10 * 16)]
add sp, sp, #(11*16)
.endm
.macro KERNEL_F32
#if !defined(DOUBLE)
ld1 {v5.4s, v6.4s, v7.4s, v8.4s}, [A_PTR], #64
ld1 {v9.4s, v10.4s, v11.4s, v12.4s}, [X_PTR], #64
fmla v1.4s, v5.4s, v9.4s
fmla v2.4s, v6.4s, v10.4s
fmla v3.4s, v7.4s, v11.4s
fmla v4.4s, v8.4s, v12.4s
ld1 {v13.4s, v14.4s, v15.4s, v16.4s}, [A_PTR], #64
ld1 {v17.4s, v18.4s, v19.4s, v20.4s}, [X_PTR], #64
fmla v1.4s, v13.4s, v17.4s
fmla v2.4s, v14.4s, v18.4s
fmla v3.4s, v15.4s, v19.4s
fmla v4.4s, v16.4s, v20.4s
#else
ld1 {v5.2d, v6.2d, v7.2d, v8.2d}, [A_PTR], #64
ld1 {v9.2d, v10.2d, v11.2d, v12.2d}, [X_PTR], #64
fmla v1.2d, v5.2d, v9.2d
fmla v2.2d, v6.2d, v10.2d
fmla v3.2d, v7.2d, v11.2d
fmla v4.2d, v8.2d, v12.2d
ld1 {v13.2d, v14.2d, v15.2d, v16.2d}, [A_PTR], #64
ld1 {v17.2d, v18.2d, v19.2d, v20.2d}, [X_PTR], #64
fmla v1.2d, v13.2d, v17.2d
fmla v2.2d, v14.2d, v18.2d
fmla v3.2d, v15.2d, v19.2d
fmla v4.2d, v16.2d, v20.2d
ld1 {v5.2d, v6.2d, v7.2d, v8.2d}, [A_PTR], #64
ld1 {v9.2d, v10.2d, v11.2d, v12.2d}, [X_PTR], #64
fmla v1.2d, v5.2d, v9.2d
fmla v2.2d, v6.2d, v10.2d
fmla v3.2d, v7.2d, v11.2d
fmla v4.2d, v8.2d, v12.2d
ld1 {v13.2d, v14.2d, v15.2d, v16.2d}, [A_PTR], #64
ld1 {v17.2d, v18.2d, v19.2d, v20.2d}, [X_PTR], #64
fmla v1.2d, v13.2d, v17.2d
fmla v2.2d, v14.2d, v18.2d
fmla v3.2d, v15.2d, v19.2d
fmla v4.2d, v16.2d, v20.2d
#endif
.endm
.macro KERNEL_F32_FINALIZE
#if !defined(DOUBLE)
fadd v1.4s, v1.4s, v2.4s
fadd v1.4s, v1.4s, v3.4s
fadd v1.4s, v1.4s, v4.4s
#else
fadd v1.2d, v1.2d, v2.2d
fadd v1.2d, v1.2d, v3.2d
fadd v1.2d, v1.2d, v4.2d
#endif
.endm
.macro KERNEL_F4
#if !defined(DOUBLE)
ld1 {v2.4s}, [A_PTR], #16
ld1 {v3.4s}, [X_PTR], #16
fmla v1.4s, v2.4s, v3.4s
#else
ld1 {v2.2d}, [A_PTR], #16
ld1 {v3.2d}, [X_PTR], #16
fmla v1.2d, v2.2d, v3.2d
ld1 {v4.2d}, [A_PTR], #16
ld1 {v5.2d}, [X_PTR], #16
fmla v1.2d, v4.2d, v5.2d
#endif
.endm
.macro KERNEL_F4_FINALIZE
#if !defined(DOUBLE)
ext v2.16b, v1.16b, v1.16b, #8
fadd v1.2s, v1.2s, v2.2s
faddp TEMP, v1.2s
#else
faddp TEMP, v1.2d
#endif
.endm
.macro KERNEL_F1
ld1 TMPV1, [A_PTR], #SZ
ld1 TMPV2, [X_PTR], #SZ
fmadd TEMP, TMP1, TMP2, TEMP
.endm
.macro INIT_S
lsl INC_X, INC_X, #SHZ
.endm
.macro KERNEL_S1
ld1 TMPV1, [A_PTR], #SZ
ld1 TMPV2, [X_PTR], INC_X
fmadd TEMP, TMP1, TMP2, TEMP
.endm
/*******************************************************************************
* End of macro definitions
*******************************************************************************/
PROLOGUE
ldr INC_Y, [sp]
SAVE_REGS
cmp N, xzr
ble gemv_t_kernel_L999
cmp M, xzr
ble gemv_t_kernel_L999
lsl LDA, LDA, #SHZ
lsl INC_Y, INC_Y, #SHZ
mov J, N
cmp INC_X, #1
bne gemv_t_kernel_S_BEGIN
gemv_t_kernel_F_LOOP:
fmov TEMP, REG0
fmov TEMP1, REG0
fmov TEMP2, REG0
fmov TEMP3, REG0
mov A_PTR, A
mov X_PTR, X
gemv_t_kernel_F32:
asr I, M, #5
cmp I, xzr
beq gemv_t_kernel_F4
gemv_t_kernel_F320:
KERNEL_F32
subs I, I, #1
bne gemv_t_kernel_F320
KERNEL_F32_FINALIZE
gemv_t_kernel_F4:
ands I, M, #31
asr I, I, #2
cmp I, xzr
beq gemv_t_kernel_F1
gemv_t_kernel_F40:
KERNEL_F4
subs I, I, #1
bne gemv_t_kernel_F40
gemv_t_kernel_F1:
KERNEL_F4_FINALIZE
ands I, M, #3
ble gemv_t_kernel_F_END
gemv_t_kernel_F10:
KERNEL_F1
subs I, I, #1
bne gemv_t_kernel_F10
gemv_t_kernel_F_END:
ld1 TMPV1, [Y]
add A, A, LDA
subs J, J, #1
fmadd TMP1, ALPHA, TEMP, TMP1
st1 TMPV1, [Y], INC_Y
bne gemv_t_kernel_F_LOOP
b gemv_t_kernel_L999
gemv_t_kernel_S_BEGIN:
INIT_S
gemv_t_kernel_S_LOOP:
fmov TEMP, REG0
mov A_PTR, A
mov X_PTR, X
asr I, M, #2
cmp I, xzr
ble gemv_t_kernel_S1
gemv_t_kernel_S4:
KERNEL_S1
KERNEL_S1
KERNEL_S1
KERNEL_S1
subs I, I, #1
bne gemv_t_kernel_S4
gemv_t_kernel_S1:
ands I, M, #3
ble gemv_t_kernel_S_END
gemv_t_kernel_S10:
KERNEL_S1
subs I, I, #1
bne gemv_t_kernel_S10
gemv_t_kernel_S_END:
ld1 TMPV1, [Y]
add A, A, LDA
subs J, J, #1
fmadd TMP1, ALPHA, TEMP, TMP1
st1 TMPV1, [Y], INC_Y
bne gemv_t_kernel_S_LOOP
gemv_t_kernel_L999:
RESTORE_REGS
mov w0, wzr
ret
EPILOGUE
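For reference (not part of this commit), a C sketch of the gemv_t loop structure above: one dot product of x with each column of A is formed and alpha times that sum is accumulated into y, which is the per-column TEMP/fmadd reduction; the _ref identifiers are illustrative only and positive increments are assumed:
/* Reference sketch: y += alpha * A^T * x, one column dot product per output element. */
#include <stddef.h>
static void gemv_t_ref(size_t m, size_t n, double alpha,
                       const double *a, size_t lda,
                       const double *x, ptrdiff_t inc_x,
                       double *y, ptrdiff_t inc_y)
{
    for (size_t j = 0; j < n; j++) {
        double temp = 0.0;
        const double *col = a + j * lda;
        for (size_t i = 0; i < m; i++)
            temp += col[i] * x[i * inc_x];       /* dot(A(:,j), x)       */
        y[j * inc_y] += alpha * temp;            /* Y[j] += ALPHA * TEMP */
    }
}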

124
kernel/arm64/idamax.S Normal file

@ -0,0 +1,124 @@
/*******************************************************************************
Copyright (c) 2015, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#define ASSEMBLER
#include "common.h"
#define N x0 /* vector length */
#define X x1 /* X vector address */
#define INC_X x2 /* X stride */
#define INDEX x3 /* index of max/min value */
#define Z x4 /* vector index */
#define I x5 /* loop variable */
/*******************************************************************************
* Macro definitions
*******************************************************************************/
#if defined(USE_MIN)
#define COND le
#else
#define COND ge
#endif
#define MAXF d0
#define TMPF d1
#define TMPVF {v1.d}[0]
#define SZ 8
/******************************************************************************/
.macro INIT_S
lsl INC_X, INC_X, #3
ld1 {v0.d}[0], [X], INC_X
mov Z, #1
mov INDEX, Z
fabs MAXF, MAXF
.endm
.macro KERNEL_S1
ld1 TMPVF, [X], INC_X
add Z, Z, #1
fabs TMPF, TMPF
fcmp MAXF, TMPF
fcsel MAXF, MAXF, TMPF, COND
csel INDEX, INDEX, Z, COND
.endm
/*******************************************************************************
* End of macro definitions
*******************************************************************************/
PROLOGUE
cmp N, xzr
ble iamax_kernel_zero
cmp INC_X, xzr
ble iamax_kernel_zero
INIT_S
subs N, N, #1
ble iamax_kernel_L999
asr I, N, #2
cmp I, xzr
ble iamax_kernel_S1
iamax_kernel_S4:
KERNEL_S1
KERNEL_S1
KERNEL_S1
KERNEL_S1
subs I, I, #1
bne iamax_kernel_S4
iamax_kernel_S1:
ands I, N, #3
ble iamax_kernel_L999
iamax_kernel_S10:
KERNEL_S1
subs I, I, #1
bne iamax_kernel_S10
iamax_kernel_L999:
mov x0, INDEX
ret
iamax_kernel_zero:
mov x0, xzr
ret
EPILOGUE
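For reference (not part of this commit), a C sketch of what this iamax kernel returns: the 1-based index of the first element with the largest |x[i]| (smallest when built with USE_MIN), keeping the earlier index on ties as the fcsel/csel pair above does; the _ref identifiers are illustrative only:
/* Reference sketch: 1-based index of max |x[i]|, first occurrence kept. */
#include <math.h>
#include <stddef.h>
static size_t idamax_ref(size_t n, const double *x, ptrdiff_t inc_x)
{
    if (n == 0 || inc_x <= 0) return 0;          /* iamax_kernel_zero path */
    size_t index = 1;
    double maxf = fabs(x[0]);
    for (size_t i = 1; i < n; i++) {
        double v = fabs(x[i * inc_x]);
        if (v > maxf) { maxf = v; index = i + 1; }
    }
    return index;
}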

213
kernel/arm64/isamax.S Normal file

@ -0,0 +1,213 @@
/*******************************************************************************
Copyright (c) 2015, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#define ASSEMBLER
#include "common.h"
#define N x0 /* vector length */
#define X x1 /* X vector address */
#define INC_X x2 /* X stride */
#define INDEX x3 /* index of max/min value */
#define Z x4 /* vector index */
#define I x5 /* loop variable */
#define X_COPY x6 /* copy of X address */
#define MAXF_Z x7
/*******************************************************************************
* Macro definitions
*******************************************************************************/
#define MAXF s5
#define TMPF s6
#define TMPVF {v6.s}[0]
#define SZ 4
/******************************************************************************/
.macro INIT_F1
ldr MAXF, [X], #SZ
mov Z, #1
mov INDEX, Z
fabs MAXF, MAXF
.endm
.macro KERNEL_F1
ldr TMPF, [X], #SZ
add Z, Z, #1
fabs TMPF, TMPF
fcmp TMPF, MAXF
fcsel MAXF, MAXF, TMPF, le
csel INDEX, INDEX, Z, le
.endm
.macro INIT_F4
ld1 {v0.4s}, [X], #16
fabs v0.4s, v0.4s
fmaxv MAXF, v0.4s
mov Z, #5
mov MAXF_Z, #1
.endm
.macro KERNEL_F4
ld1 {v0.4s}, [X], #16
fabs v0.4s, v0.4s
fmaxv TMPF, v0.4s
PRFM PLDL1KEEP, [X, #512]
fcmp TMPF, MAXF
fcsel MAXF, MAXF, TMPF, le
csel MAXF_Z, MAXF_Z, Z, le
add Z, Z, #4
.endm
.macro KERNEL_F4_FINALIZE
mov INDEX, MAXF_Z
sub MAXF_Z, MAXF_Z, #1
lsl MAXF_Z, MAXF_Z, #2
add X_COPY, X_COPY, MAXF_Z
ldr TMPF, [X_COPY], #SZ
fabs TMPF, TMPF
fcmp TMPF, MAXF
beq KERNEL_F4_FINALIZE_DONE
add INDEX, INDEX, #1
ldr TMPF, [X_COPY], #SZ
fabs TMPF, TMPF
fcmp TMPF, MAXF
beq KERNEL_F4_FINALIZE_DONE
add INDEX, INDEX, #1
ldr TMPF, [X_COPY], #SZ
fabs TMPF, TMPF
fcmp TMPF, MAXF
beq KERNEL_F4_FINALIZE_DONE
add INDEX, INDEX, #1
KERNEL_F4_FINALIZE_DONE:
.endm
.macro INIT_S
lsl INC_X, INC_X, #2
ld1 TMPVF, [X], INC_X
mov Z, #1
mov INDEX, Z
fabs MAXF, TMPF
.endm
.macro KERNEL_S1
ld1 TMPVF, [X], INC_X
add Z, Z, #1
fabs TMPF, TMPF
fcmp TMPF, MAXF
fcsel MAXF, MAXF, TMPF, le
csel INDEX, INDEX, Z, le
.endm
/*******************************************************************************
* End of macro definitions
*******************************************************************************/
PROLOGUE
cmp N, xzr
ble iamax_kernel_zero
cmp INC_X, xzr
ble iamax_kernel_zero
PRFM PLDL1KEEP, [X]
mov X_COPY, X
cmp INC_X, #1
bne iamax_kernel_S_BEGIN
iamax_kernel_F_BEGIN:
asr I, N, #2
cmp I, xzr
beq iamax_kernel_F1_INIT
INIT_F4
subs I, I, #1
beq iamax_kernel_F4_FINALIZE
iamax_kernel_F4:
KERNEL_F4
subs I, I, #1
bne iamax_kernel_F4
iamax_kernel_F4_FINALIZE:
KERNEL_F4_FINALIZE
iamax_kernel_F1:
ands I, N, #3
ble iamax_kernel_L999
iamax_kernel_F10:
KERNEL_F1
subs I, I, #1
bne iamax_kernel_F10
b iamax_kernel_L999
iamax_kernel_F1_INIT:
INIT_F1
subs N, N, #1
b iamax_kernel_F1
iamax_kernel_S_BEGIN:
INIT_S
subs N, N, #1
ble iamax_kernel_L999
asr I, N, #2
cmp I, xzr
ble iamax_kernel_S1
iamax_kernel_S4:
KERNEL_S1
KERNEL_S1
KERNEL_S1
KERNEL_S1
subs I, I, #1
bne iamax_kernel_S4
iamax_kernel_S1:
ands I, N, #3
ble iamax_kernel_L999
iamax_kernel_S10:
KERNEL_S1
subs I, I, #1
bne iamax_kernel_S10
iamax_kernel_L999:
mov x0, INDEX
ret
iamax_kernel_zero:
mov x0, xzr
ret
EPILOGUE

151
kernel/arm64/izamax.S Normal file

@ -0,0 +1,151 @@
/*******************************************************************************
Copyright (c) 2015, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#define ASSEMBLER
#include "common.h"
#define N x0 /* vector length */
#define X x1 /* X vector address */
#define INC_X x2 /* X stride */
#define INDEX x3 /* index of max/min value */
#define Z x4 /* vector index */
#define I x5 /* loop variable */
/*******************************************************************************
* Macro definitions
*******************************************************************************/
#if defined(USE_MIN)
#define COND le
#else
#define COND ge
#endif
#if !defined(DOUBLE)
#define MAXF s0
#define TMPF s1
#define TMPVF {v1.s}[0]
#define SZ 4
#else
#define MAXF d0
#define TMPF d1
#define TMPVF {v1.d}[0]
#define SZ 8
#endif
/******************************************************************************/
.macro INIT_S
#if !defined(DOUBLE)
lsl INC_X, INC_X, #3
ld1 {v0.2s}, [X], INC_X
mov Z, #1
mov INDEX, Z
fabs v0.2s, v0.2s
ext v1.8b, v0.8b, v0.8b, #4
fadd MAXF, s0, s1
#else
lsl INC_X, INC_X, #4
ld1 {v0.2d}, [X], INC_X
mov Z, #1
mov INDEX, Z
fabs v0.2d, v0.2d
faddp MAXF, v0.2d
#endif
.endm
.macro KERNEL_S1
#if !defined(DOUBLE)
ld1 {v1.2s}, [X], INC_X
add Z, Z, #1
fabs v1.2s, v1.2s
ext v2.8b, v1.8b, v1.8b, #4
fadd TMPF, s1, s2
#else
ld1 {v1.2d}, [X], INC_X
add Z, Z, #1
fabs v1.2d, v1.2d
faddp TMPF, v1.2d
#endif
fcmp MAXF, TMPF
fcsel MAXF, MAXF, TMPF, COND
csel INDEX, INDEX, Z, COND
.endm
/*******************************************************************************
* End of macro definitions
*******************************************************************************/
PROLOGUE
cmp N, xzr
ble iamax_kernel_zero
cmp INC_X, xzr
ble iamax_kernel_zero
INIT_S
subs N, N, #1
ble iamax_kernel_L999
asr I, N, #2
cmp I, xzr
ble iamax_kernel_S1
iamax_kernel_S4:
KERNEL_S1
KERNEL_S1
KERNEL_S1
KERNEL_S1
subs I, I, #1
bne iamax_kernel_S4
iamax_kernel_S1:
ands I, N, #3
ble iamax_kernel_L999
iamax_kernel_S10:
KERNEL_S1
subs I, I, #1
bne iamax_kernel_S10
iamax_kernel_L999:
mov x0, INDEX
ret
iamax_kernel_zero:
mov x0, xzr
ret
EPILOGUE
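For reference (not part of this commit), a C sketch of what this complex iamax kernel returns: elements are ranked by |Re| + |Im| rather than the true modulus, matching the fabs/ext/fadd (or faddp) reduction in the macros above, and the result is a 1-based index; the _ref identifiers are illustrative only:
/* Reference sketch: complex iamax over |Re| + |Im| (double-precision variant). */
#include <math.h>
#include <stddef.h>
static size_t izamax_ref(size_t n, const double *x, ptrdiff_t inc_x)
{
    if (n == 0 || inc_x <= 0) return 0;
    size_t index = 1;
    double maxf = fabs(x[0]) + fabs(x[1]);
    for (size_t i = 1; i < n; i++) {
        const double *p = x + 2 * i * inc_x;     /* complex element: {re, im} */
        double v = fabs(p[0]) + fabs(p[1]);
        if (v > maxf) { maxf = v; index = i + 1; }
    }
    return index;
}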

243
kernel/arm64/rot.S Normal file

@ -0,0 +1,243 @@
/*******************************************************************************
Copyright (c) 2015, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#define ASSEMBLER
#include "common.h"
#define N x0 /* vector length */
#define X x1 /* X vector address */
#define INC_X x2 /* X stride */
#define Y x3 /* Y vector address */
#define INC_Y x4 /* Y stride */
#define I x5 /* loop variable */
/*******************************************************************************
* Macro definitions
*******************************************************************************/
#if !defined(DOUBLE)
#define C s0 /* scale input value */
#define S s1 /* scale input value */
#else
#define C d0 /* scale input value */
#define S d1 /* scale input value */
#endif
/******************************************************************************/
.macro INIT
#if !defined(DOUBLE)
ins v0.s[1], v0.s[0] // [C, C]
#else
ins v0.d[1], v0.d[0] // [C, C]
#endif
.endm
.macro INIT_F1
#if !defined(DOUBLE)
fneg s2, S
ins v1.s[1], v2.s[0] // [-S, S]
#else
fneg d2, S
ins v1.d[1], v2.d[0] // [-S, S]
#endif
.endm
.macro KERNEL_F1
#if !defined(DOUBLE)
ld1 {v2.s}[0], [X]
ld1 {v2.s}[1], [Y] // [Y, X]
ext v3.8b, v2.8b, v2.8b, #4 // [X, Y]
fmul v4.2s, v2.2s, v0.2s // [C*Y, C*X]
fmla v4.2s, v3.2s, v1.2s // [C*Y - S*X, C*X + S*Y]
st1 {v4.s}[0], [X], #4
st1 {v4.s}[1], [Y], #4
#else
ld1 {v2.d}[0], [X]
ld1 {v2.d}[1], [Y] // [Y, X]
ext v3.16b, v2.16b, v2.16b, #8 // [X, Y]
fmul v4.2d, v2.2d, v0.2d // [C*Y, C*X]
fmla v4.2d, v3.2d, v1.2d // [C*Y - S*X, C*X + S*Y]
st1 {v4.d}[0], [X], #8
st1 {v4.d}[1], [Y], #8
#endif
.endm
.macro KERNEL_INIT_F4
#if !defined(DOUBLE)
ins v0.d[1], v0.d[0] // [C, C, C, C]
ins v1.s[1], v1.s[0]
ins v1.d[1], v1.d[0] // [S, S, S, S]
#else
ins v1.d[1], v1.d[0] // [S, S]
#endif
.endm
.macro KERNEL_F4
#if !defined(DOUBLE)
ld1 {v2.4s}, [X]
fmul v4.4s, v0.4s, v2.4s // C*X3, C*X2, C*X1, C*X0
ld1 {v3.4s}, [Y]
fmla v4.4s, v1.4s, v3.4s // C*X3+S*Y3, ..., C*X0+S*Y0
st1 {v4.4s}, [X], #16
fmul v5.4s, v0.4s, v3.4s // C*Y3, C*Y2, C*Y1, C*Y0
fmls v5.4s, v1.4s, v2.4s // C*Y3-S*X3, ..., C*Y0-S*X0
st1 {v5.4s}, [Y], #16
#else // DOUBLE
ld1 {v2.2d, v3.2d}, [X]
fmul v6.2d, v0.2d, v2.2d // C*X1, C*X0
fmul v7.2d, v0.2d, v3.2d // C*X3, C*X2
ld1 {v4.2d, v5.2d}, [Y]
fmla v6.2d, v1.2d, v4.2d // C*X1+S*Y1, C*X0+S*Y0
fmla v7.2d, v1.2d, v5.2d // C*X3+S*Y3, C*X2+S*Y2
st1 {v6.2d, v7.2d}, [X], #32
fmul v16.2d, v0.2d, v4.2d // C*Y1, C*Y0
fmul v17.2d, v0.2d, v5.2d // C*Y3, C*Y2
fmls v16.2d, v1.2d, v2.2d // C*Y1-S*X1, C*Y0-S*X0
fmls v17.2d, v1.2d, v3.2d // C*Y3-S*X3, C*Y2-S*X2
st1 {v16.2d, v17.2d}, [Y], #32
PRFM PLDL1KEEP, [X, #512]
PRFM PLDL1KEEP, [Y, #512]
#endif
.endm
.macro INIT_S
#if !defined(DOUBLE)
lsl INC_X, INC_X, #2
lsl INC_Y, INC_Y, #2
#else
lsl INC_X, INC_X, #3
lsl INC_Y, INC_Y, #3
#endif
.endm
.macro KERNEL_S1
#if !defined(DOUBLE)
ld1 {v2.s}[0], [X]
ld1 {v2.s}[1], [Y] // [Y, X]
ext v3.8b, v2.8b, v2.8b, #4 // [X, Y]
fmul v4.2s, v2.2s, v0.2s // [C*Y, C*X]
fmla v4.2s, v3.2s, v1.2s // [C*Y - S*X, C*X + S*Y]
st1 {v4.s}[0], [X], INC_X
st1 {v4.s}[1], [Y], INC_Y
#else
ld1 {v2.d}[0], [X]
ld1 {v2.d}[1], [Y] // [Y, X]
ext v3.16b, v2.16b, v2.16b, #8 // [X, Y]
fmul v4.2d, v2.2d, v0.2d // [C*Y, C*X]
fmla v4.2d, v3.2d, v1.2d // [C*Y - S*X, C*X + S*Y]
st1 {v4.d}[0], [X], INC_X
st1 {v4.d}[1], [Y], INC_Y
#endif
.endm
/*******************************************************************************
* End of macro definitions
*******************************************************************************/
PROLOGUE
cmp N, xzr
ble rot_kernel_L999
INIT
cmp INC_X, #1
bne rot_kernel_S_BEGIN
cmp INC_Y, #1
bne rot_kernel_S_BEGIN
rot_kernel_F_BEGIN:
asr I, N, #2
cmp I, xzr
beq rot_kernel_F1
KERNEL_INIT_F4
rot_kernel_F4:
KERNEL_F4
subs I, I, #1
bne rot_kernel_F4
rot_kernel_F1:
ands I, N, #3
ble rot_kernel_L999
INIT_F1
rot_kernel_F10:
KERNEL_F1
subs I, I, #1
bne rot_kernel_F10
mov w0, wzr
ret
rot_kernel_S_BEGIN:
INIT_S
INIT_F1
asr I, N, #2
cmp I, xzr
ble rot_kernel_S1
rot_kernel_S4:
KERNEL_S1
KERNEL_S1
KERNEL_S1
KERNEL_S1
subs I, I, #1
bne rot_kernel_S4
rot_kernel_S1:
ands I, N, #3
ble rot_kernel_L999
rot_kernel_S10:
KERNEL_S1
subs I, I, #1
bne rot_kernel_S10
rot_kernel_L999:
mov w0, wzr
ret
EPILOGUE
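For reference (not part of this commit), a C sketch of the plane rotation applied per element pair, x' = C*x + S*y and y' = C*y - S*x, matching the fmul/fmla/fmls comments in the macros above; the _ref identifiers are illustrative only and positive increments are assumed:
/* Reference sketch: Givens plane rotation applied in place. */
#include <stddef.h>
static void rot_ref(size_t n, double *x, ptrdiff_t inc_x,
                    double *y, ptrdiff_t inc_y, double c, double s)
{
    for (size_t i = 0; i < n; i++) {
        double xi = x[i * inc_x], yi = y[i * inc_y];
        x[i * inc_x] = c * xi + s * yi;          /* C*X + S*Y */
        y[i * inc_y] = c * yi - s * xi;          /* C*Y - S*X */
    }
}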

253
kernel/arm64/scal.S Normal file

@ -0,0 +1,253 @@
/*******************************************************************************
Copyright (c) 2015, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#define ASSEMBLER
#include "common.h"
#define N x0 /* vector length */
#define X x3 /* X vector address */
#define X_COPY x5 /* X vector address */
#define INC_X x4 /* X stride */
#define I x1 /* loop variable */
/*******************************************************************************
* Macro definitions
*******************************************************************************/
#if !defined(DOUBLE)
#define DA s0 /* scale input value */
#define DAV {v0.s}[0]
#define TMPF s1
#define TMPVF {v1.s}[0]
#define SZ 4
#else
#define DA d0 /* scale input value */
#define DAV {v0.d}[0]
#define TMPF d1
#define TMPVF {v1.d}[0]
#define SZ 8
#endif
/******************************************************************************/
.macro KERNEL_F1
ldr TMPF, [X]
fmul TMPF, TMPF, DA
str TMPF, [X], #SZ
.endm
.macro KERNEL_INIT_F8
#if !defined(DOUBLE)
ins v0.s[1], v0.s[0]
ins v0.s[2], v0.s[0]
ins v0.s[3], v0.s[0]
#else
ins v0.d[1], v0.d[0]
#endif
.endm
.macro KERNEL_F8
#if !defined(DOUBLE)
ld1 {v1.4s, v2.4s}, [X]
fmul v1.4s, v1.4s, v0.4s
fmul v2.4s, v2.4s, v0.4s
st1 {v1.4s, v2.4s}, [X], #32
#else // DOUBLE
ld1 {v1.2d, v2.2d, v3.2d, v4.2d}, [X]
fmul v1.2d, v1.2d, v0.2d
fmul v2.2d, v2.2d, v0.2d
fmul v3.2d, v3.2d, v0.2d
fmul v4.2d, v4.2d, v0.2d
st1 {v1.2d, v2.2d, v3.2d, v4.2d}, [X], #64
#endif
PRFM PLDL1KEEP, [X, #1024]
.endm
.macro INIT_S
#if !defined(DOUBLE)
lsl INC_X, INC_X, #2
#else
lsl INC_X, INC_X, #3
#endif
.endm
.macro KERNEL_S1
ldr TMPF, [X]
fmul TMPF, TMPF, DA
st1 TMPVF, [X], INC_X
.endm
.macro KERNEL_S4
#if !defined(DOUBLE)
ldr s1, [X]
add X, X, INC_X
fmul s1, s1, s0
str s1, [X_COPY]
add X_COPY, X_COPY, INC_X
ldr s2, [X]
add X, X, INC_X
fmul s2, s2, s0
str s2, [X_COPY]
add X_COPY, X_COPY, INC_X
ldr s3, [X]
add X, X, INC_X
fmul s3, s3, s0
str s3, [X_COPY]
add X_COPY, X_COPY, INC_X
ldr s4, [X]
add X, X, INC_X
fmul s4, s4, s0
str s4, [X_COPY]
add X_COPY, X_COPY, INC_X
#else
ldr d1, [X]
add X, X, INC_X
fmul d1, d1, d0
str d1, [X_COPY]
add X_COPY, X_COPY, INC_X
ldr d2, [X]
add X, X, INC_X
fmul d2, d2, d0
str d2, [X_COPY]
add X_COPY, X_COPY, INC_X
ldr d3, [X]
add X, X, INC_X
fmul d3, d3, d0
str d3, [X_COPY]
add X_COPY, X_COPY, INC_X
ldr d4, [X]
add X, X, INC_X
fmul d4, d4, d0
str d4, [X_COPY]
add X_COPY, X_COPY, INC_X
#endif
.endm
/*******************************************************************************
* End of macro definitions
*******************************************************************************/
PROLOGUE
cmp N, xzr
ble scal_kernel_L999
fcmp DA, #0.0
beq scal_kernel_zero
cmp INC_X, #1
bne scal_kernel_S_BEGIN
scal_kernel_F_BEGIN:
asr I, N, #3
cmp I, xzr
beq scal_kernel_F1
KERNEL_INIT_F8
scal_kernel_F8:
KERNEL_F8
subs I, I, #1
bne scal_kernel_F8
scal_kernel_F1:
ands I, N, #7
ble scal_kernel_L999
scal_kernel_F10:
KERNEL_F1
subs I, I, #1
bne scal_kernel_F10
mov w0, wzr
ret
scal_kernel_S_BEGIN:
INIT_S
mov X_COPY, X
asr I, N, #2
cmp I, xzr
ble scal_kernel_S1
scal_kernel_S4:
KERNEL_S4
subs I, I, #1
bne scal_kernel_S4
scal_kernel_S1:
ands I, N, #3
ble scal_kernel_L999
scal_kernel_S10:
KERNEL_S1
subs I, I, #1
bne scal_kernel_S10
scal_kernel_L999:
mov w0, wzr
ret
scal_kernel_zero:
INIT_S
scal_kernel_Z1:
st1 DAV, [X], INC_X
subs N, N, #1
bne scal_kernel_Z1
mov w0, wzr
ret
EPILOGUE
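For reference (not part of this commit), a C sketch of what this scal kernel does: x is scaled in place by DA, and the separate alpha == 0 branch simply stores zeros, as in scal_kernel_zero above; the _ref identifiers are illustrative only and a positive increment is assumed:
/* Reference sketch: in-place scaling with an explicit zero branch. */
#include <stddef.h>
static void scal_ref(size_t n, double alpha, double *x, ptrdiff_t inc_x)
{
    if (alpha == 0.0) {
        for (size_t i = 0; i < n; i++)
            x[i * inc_x] = 0.0;                  /* scal_kernel_zero path */
        return;
    }
    for (size_t i = 0; i < n; i++)
        x[i * inc_x] *= alpha;                   /* x := DA * x */
}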

File diff suppressed because it is too large.

178
kernel/arm64/snrm2.S Normal file

@ -0,0 +1,178 @@
/*******************************************************************************
Copyright (c) 2015, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#define ASSEMBLER
#include "common.h"
#define N x0 /* vector length */
#define X x1 /* X vector address */
#define INC_X x2 /* X stride */
#define I x5 /* loop variable */
/*******************************************************************************
* Macro definitions
*******************************************************************************/
#define TMPF s6
#define SSQ s0
#define TMPVF {v6.s}[0]
#define SZ 4
/******************************************************************************/
.macro INIT_F1
ldr TMPF, [X], #SZ
fmul SSQ, TMPF, TMPF
.endm
.macro KERNEL_F1
ldr TMPF, [X], #SZ
fmul TMPF, TMPF, TMPF
fadd SSQ, SSQ, TMPF
.endm
.macro INIT_F4
ld1 {v1.4s}, [X], #16
fmul v1.4s, v1.4s, v1.4s
ext v2.16b, v1.16b, v1.16b, #8
fadd v2.2s, v1.2s, v2.2s
faddp SSQ, v2.2s
.endm
.macro KERNEL_F4
ld1 {v1.4s}, [X], #16
fmul v1.4s, v1.4s, v1.4s
ext v2.16b, v1.16b, v1.16b, #8
fadd v2.2s, v1.2s, v2.2s
faddp TMPF, v2.2s
fadd SSQ, SSQ, TMPF
.endm
.macro INIT_S
lsl INC_X, INC_X, #2
ld1 TMPVF, [X], INC_X
fmul SSQ, TMPF, TMPF
.endm
.macro KERNEL_S1
ld1 TMPVF, [X], INC_X
fmul TMPF, TMPF, TMPF
fadd SSQ, SSQ, TMPF
.endm
/*******************************************************************************
* End of macro definitions
*******************************************************************************/
PROLOGUE
cmp N, xzr
ble nrm2_kernel_zero
cmp INC_X, xzr
ble nrm2_kernel_zero
cmp INC_X, #1
bne nrm2_kernel_S_BEGIN
nrm2_kernel_F_BEGIN:
asr I, N, #2
cmp I, xzr
beq nrm2_kernel_F1_INIT
INIT_F4
subs I, I, #1
beq nrm2_kernel_F1
nrm2_kernel_F4:
KERNEL_F4
subs I, I, #1
bne nrm2_kernel_F4
nrm2_kernel_F1:
ands I, N, #3
ble nrm2_kernel_L999
nrm2_kernel_F10:
KERNEL_F1
subs I, I, #1
bne nrm2_kernel_F10
b nrm2_kernel_L999
nrm2_kernel_F1_INIT:
INIT_F1
subs N, N, #1
b nrm2_kernel_F1
nrm2_kernel_S_BEGIN:
INIT_S
subs N, N, #1
ble nrm2_kernel_L999
asr I, N, #2
cmp I, xzr
ble nrm2_kernel_S1
nrm2_kernel_S4:
KERNEL_S1
KERNEL_S1
KERNEL_S1
KERNEL_S1
subs I, I, #1
bne nrm2_kernel_S4
nrm2_kernel_S1:
ands I, N, #3
ble nrm2_kernel_L999
nrm2_kernel_S10:
KERNEL_S1
subs I, I, #1
bne nrm2_kernel_S10
nrm2_kernel_L999:
fsqrt SSQ, SSQ
ret
nrm2_kernel_zero:
fmov SSQ, wzr
ret
EPILOGUE

File diff suppressed because it is too large.

266
kernel/arm64/swap.S Normal file

@ -0,0 +1,266 @@
/*******************************************************************************
Copyright (c) 2015, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#define ASSEMBLER
#include "common.h"
#define N x0 /* vector length */
#define X x3 /* X vector address */
#define INC_X x4 /* X stride */
#define Y x5 /* Y vector address */
#define INC_Y x6 /* Y stride */
#define I x1 /* loop variable */
/*******************************************************************************
* Macro definitions
*******************************************************************************/
#if !defined(DOUBLE)
#define TMP0 s0
#define TMPV0 {v0.s}[0]
#define TMP1 s1
#define TMPV1 {v1.s}[0]
#define SZ 4
#else
#define TMP0 d0
#define TMPV0 {v0.d}[0]
#define TMP1 d1
#define TMPV1 {v1.d}[0]
#define SZ 8
#endif
/******************************************************************************/
.macro KERNEL_F1
#if !defined(COMPLEX)
ldr TMP0, [X]
ldr TMP1, [Y]
str TMP0, [Y], #SZ
str TMP1, [X], #SZ
#else
#if !defined(DOUBLE)
ld1 {v0.2s}, [X]
ld1 {v1.2s}, [Y]
st1 {v0.2s}, [Y], #8
st1 {v1.2s}, [X], #8
#else
ld1 {v0.2d}, [X]
ld1 {v1.2d}, [Y]
st1 {v0.2d}, [Y], #16
st1 {v1.2d}, [X], #16
#endif
#endif
.endm
.macro KERNEL_F8
#if !defined(COMPLEX)
#if !defined(DOUBLE)
ld1 {v0.4s, v1.4s}, [X]
ld1 {v2.4s, v3.4s}, [Y]
st1 {v0.4s, v1.4s}, [Y], #32
st1 {v2.4s, v3.4s}, [X], #32
#else // DOUBLE
ld1 {v0.4s, v1.4s}, [X]
ld1 {v2.4s, v3.4s}, [Y]
st1 {v0.4s, v1.4s}, [Y], #32
st1 {v2.4s, v3.4s}, [X], #32
ld1 {v0.4s, v1.4s}, [X]
ld1 {v2.4s, v3.4s}, [Y]
st1 {v0.4s, v1.4s}, [Y], #32
st1 {v2.4s, v3.4s}, [X], #32
#endif
#else // COMPLEX
#if !defined(DOUBLE)
ld1 {v0.4s, v1.4s}, [X]
ld1 {v2.4s, v3.4s}, [Y]
st1 {v0.4s, v1.4s}, [Y], #32
st1 {v2.4s, v3.4s}, [X], #32
ld1 {v0.4s, v1.4s}, [X]
ld1 {v2.4s, v3.4s}, [Y]
st1 {v0.4s, v1.4s}, [Y], #32
st1 {v2.4s, v3.4s}, [X], #32
#else // DOUBLE
ld1 {v0.4s, v1.4s}, [X]
ld1 {v2.4s, v3.4s}, [Y]
st1 {v0.4s, v1.4s}, [Y], #32
st1 {v2.4s, v3.4s}, [X], #32
ld1 {v0.4s, v1.4s}, [X]
ld1 {v2.4s, v3.4s}, [Y]
st1 {v0.4s, v1.4s}, [Y], #32
st1 {v2.4s, v3.4s}, [X], #32
ld1 {v0.4s, v1.4s}, [X]
ld1 {v2.4s, v3.4s}, [Y]
st1 {v0.4s, v1.4s}, [Y], #32
st1 {v2.4s, v3.4s}, [X], #32
ld1 {v0.4s, v1.4s}, [X]
ld1 {v2.4s, v3.4s}, [Y]
st1 {v0.4s, v1.4s}, [Y], #32
st1 {v2.4s, v3.4s}, [X], #32
#endif
#endif
.endm
.macro INIT_S
#if !defined(COMPLEX)
#if !defined(DOUBLE)
lsl INC_X, INC_X, #2
lsl INC_Y, INC_Y, #2
#else
lsl INC_X, INC_X, #3
lsl INC_Y, INC_Y, #3
#endif
#else
#if !defined(DOUBLE)
lsl INC_X, INC_X, #3
lsl INC_Y, INC_Y, #3
#else
lsl INC_X, INC_X, #4
lsl INC_Y, INC_Y, #4
#endif
#endif
.endm
.macro KERNEL_S1
#if !defined(COMPLEX)
#if !defined(DOUBLE)
ldr w10, [X]
ldr w11, [Y]
str w10, [Y]
str w11, [X]
#else
ldr x10, [X]
ldr x11, [Y]
str x10, [Y]
str x11, [X]
#endif
#else
#if !defined(DOUBLE)
ldr x10, [X]
ldr x11, [Y]
str x10, [Y]
str x11, [X]
#else
ldr x10, [X]
ldr x11, [Y]
str x10, [Y]
str x11, [X]
ldr x12, [X, #8]
ldr x13, [Y, #8]
str x12, [Y, #8]
str x13, [X, #8]
#endif
#endif
add Y, Y, INC_Y
add X, X, INC_X
.endm
/*******************************************************************************
* End of macro definitions
*******************************************************************************/
PROLOGUE
cmp N, xzr
ble swap_kernel_L999
cmp INC_X, #1
bne swap_kernel_S_BEGIN
cmp INC_Y, #1
bne swap_kernel_S_BEGIN
swap_kernel_F_BEGIN:
asr I, N, #3
cmp I, xzr
beq swap_kernel_F1
swap_kernel_F8:
KERNEL_F8
subs I, I, #1
bne swap_kernel_F8
swap_kernel_F1:
ands I, N, #7
ble swap_kernel_L999
swap_kernel_F10:
KERNEL_F1
subs I, I, #1
bne swap_kernel_F10
b swap_kernel_L999
swap_kernel_S_BEGIN:
INIT_S
asr I, N, #2
cmp I, xzr
ble swap_kernel_S1
swap_kernel_S4:
KERNEL_S1
KERNEL_S1
KERNEL_S1
KERNEL_S1
subs I, I, #1
bne swap_kernel_S4
swap_kernel_S1:
ands I, N, #3
ble swap_kernel_L999
swap_kernel_S10:
KERNEL_S1
subs I, I, #1
bne swap_kernel_S10
swap_kernel_L999:
mov w0, wzr
ret
EPILOGUE
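
In scalar form the kernel above is a plain element swap; a minimal C sketch (illustrative names, real single-precision case shown; the COMPLEX builds swap each (re,im) pair as one unit):

/* Scalar equivalent of the swap kernel: exchange x[i] and y[i]. */
static void swap_ref(long n, float *x, long inc_x, float *y, long inc_y)
{
    if (n <= 0) return;
    for (long i = 0; i < n; i++) {
        float tmp = x[i * inc_x];
        x[i * inc_x] = y[i * inc_y];
        y[i * inc_y] = tmp;
    }
}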

273
kernel/arm64/zamax.S Normal file

@ -0,0 +1,273 @@
/*******************************************************************************
Copyright (c) 2015, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#define ASSEMBLER
#include "common.h"
#define N x0 /* vector length */
#define X x1 /* X vector address */
#define INC_X x2 /* X stride */
#define I x5 /* loop variable */
/*******************************************************************************
* Macro definitions
*******************************************************************************/
#if defined(USE_MIN)
#define COND le
#else
#define COND ge
#endif
#if !defined(DOUBLE)
#define REG0 wzr
#define MAXF s0
#define TMPF s1
#define TMPVF {v1.s}[0]
#define SZ 4
#else
#define REG0 xzr
#define MAXF d0
#define TMPF d1
#define TMPVF {v1.d}[0]
#define SZ 8
#endif
/******************************************************************************/
.macro INIT_F1
#if !defined(DOUBLE)
ld1 {v0.2s}, [X], #8
fabs v0.2s, v0.2s
ext v1.8b, v0.8b, v0.8b, #4
fadd MAXF, s0, s1
#else
ld1 {v0.2d}, [X], #16
fabs v0.2d, v0.2d
faddp MAXF, v0.2d
#endif
.endm
.macro KERNEL_F1
#if !defined(DOUBLE)
ld1 {v1.2s}, [X], #8
fabs v1.2s, v1.2s
ext v2.8b, v1.8b, v1.8b, #4
fadd TMPF, s1, s2
#else
ld1 {v1.2d}, [X], #16
fabs v1.2d, v1.2d
faddp TMPF, v1.2d
#endif
fcmp MAXF, TMPF
fcsel MAXF, MAXF, TMPF, COND
.endm
.macro INIT_F4
#if !defined(DOUBLE)
ld2 {v0.4s,v1.4s}, [X], #32
fabs v0.4s, v0.4s // [X6, X4, X2, X0]
fabs v1.4s, v1.4s // [X7, X5, X3, X1]
fadd v0.4s, v0.4s, v1.4s // [X7+X6, X5+X4, X3+X2, X1+X0]
#if defined(USE_MIN)
fminv MAXF, v0.4s
#else
fmaxv MAXF, v0.4s
#endif
#else // DOUBLE
ld4 {v0.2d,v1.2d,v2.2d,v3.2d}, [X], #64
fabs v0.2d, v0.2d
fabs v1.2d, v1.2d
fabs v2.2d, v2.2d
fabs v3.2d, v3.2d
fadd v0.2d, v0.2d, v1.2d
fadd v2.2d, v2.2d, v3.2d
#if defined(USE_MIN)
fmin v0.2d, v0.2d, v2.2d
fminp MAXF, v0.2d
#else
fmax v0.2d, v0.2d, v2.2d
fmaxp MAXF, v0.2d
#endif
#endif
.endm
.macro KERNEL_F4
#if !defined(DOUBLE)
ld2 {v1.4s,v2.4s}, [X], #32
fabs v1.4s, v1.4s // [X6, X4, X2, X0]
fabs v2.4s, v2.4s // [X7, X5, X3, X1]
fadd v1.4s, v1.4s, v2.4s // [X7+X6, X5+X4, X3+X2, X1+X0]
#if defined(USE_MIN)
fminv TMPF, v1.4s
#else
fmaxv TMPF, v1.4s
#endif
#else // DOUBLE
ld4 {v1.2d,v2.2d,v3.2d,v4.2d}, [X], #64
fabs v1.2d, v1.2d
fabs v2.2d, v2.2d
fabs v3.2d, v3.2d
fabs v4.2d, v4.2d
fadd v1.2d, v1.2d, v2.2d
fadd v3.2d, v3.2d, v4.2d
#if defined(USE_MIN)
fmin v1.2d, v1.2d, v3.2d
fminp MAXF, v1.2d
#else
fmax v1.2d, v1.2d, v3.2d
fmaxp MAXF, v1.2d
#endif
#endif
fcmp MAXF, TMPF
fcsel MAXF, MAXF, TMPF, COND
.endm
.macro INIT_S
#if !defined(DOUBLE)
lsl INC_X, INC_X, #3
ld1 {v0.2s}, [X], INC_X
fabs v0.2s, v0.2s
ext v1.8b, v0.8b, v0.8b, #4
fadd MAXF, s0, s1
#else
lsl INC_X, INC_X, #4
ld1 {v0.2d}, [X], INC_X
fabs v0.2d, v0.2d
faddp MAXF, v0.2d
#endif
.endm
.macro KERNEL_S1
#if !defined(DOUBLE)
ld1 {v1.2s}, [X], INC_X
fabs v1.2s, v1.2s
ext v2.8b, v1.8b, v1.8b, #4
fadd TMPF, s1, s2
#else
ld1 {v1.2d}, [X], INC_X
fabs v1.2d, v1.2d
faddp TMPF, v1.2d
#endif
fcmp MAXF, TMPF
fcsel MAXF, MAXF, TMPF, COND
.endm
/*******************************************************************************
* End of macro definitions
*******************************************************************************/
PROLOGUE
cmp N, xzr
ble amax_kernel_zero
cmp INC_X, xzr
ble amax_kernel_zero
cmp INC_X, #1
bne amax_kernel_S_BEGIN
amax_kernel_F_BEGIN:
asr I, N, #2
cmp I, xzr
beq amax_kernel_F1_INIT
INIT_F4
subs I, I, #1
beq amax_kernel_F1
amax_kernel_F4:
KERNEL_F4
subs I, I, #1
bne amax_kernel_F4
amax_kernel_F1:
ands I, N, #3
ble amax_kernel_L999
amax_kernel_F10:
KERNEL_F1
subs I, I, #1
bne amax_kernel_F10
ret
amax_kernel_F1_INIT:
INIT_F1
subs N, N, #1
b amax_kernel_F1
amax_kernel_S_BEGIN:
INIT_S
subs N, N, #1
ble amax_kernel_L999
asr I, N, #2
cmp I, xzr
ble amax_kernel_S1
amax_kernel_S4:
KERNEL_S1
KERNEL_S1
KERNEL_S1
KERNEL_S1
subs I, I, #1
bne amax_kernel_S4
amax_kernel_S1:
ands I, N, #3
ble amax_kernel_L999
amax_kernel_S10:
KERNEL_S1
subs I, I, #1
bne amax_kernel_S10
amax_kernel_L999:
ret
amax_kernel_zero:
fmov MAXF, REG0
ret
EPILOGUE
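
The reduction above uses the usual level-1 BLAS "absolute value" of a complex number, |Re| + |Im|. A scalar C sketch (illustrative only, double-precision path shown; USE_MIN flips the comparison):

#include <math.h>

/* Scalar equivalent of the complex amax kernel:
   largest (or, with USE_MIN, smallest) value of |Re(x_i)| + |Im(x_i)|. */
static double zamax_ref(long n, const double *x, long inc_x)
{
    if (n <= 0 || inc_x <= 0) return 0.0;     /* amax_kernel_zero path */
    double best = fabs(x[0]) + fabs(x[1]);
    for (long i = 1; i < n; i++) {
        double v = fabs(x[2 * i * inc_x]) + fabs(x[2 * i * inc_x + 1]);
        if (v > best)                         /* 'v < best' in the USE_MIN build */
            best = v;
    }
    return best;
}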

164
kernel/arm64/zasum.S Normal file

@ -0,0 +1,164 @@
/*******************************************************************************
Copyright (c) 2015, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#define ASSEMBLER
#include "common.h"
#define N x0 /* vector length */
#define X x1 /* X vector address */
#define INC_X x2 /* X stride */
#define I x5 /* loop variable */
/*******************************************************************************
* Macro definitions
*******************************************************************************/
#define REG0 xzr
#define SUMF d0
#define TMPF d1
#define TMPVF {v1.d}[0]
#define SZ 8
/******************************************************************************/
.macro KERNEL_F1
ld1 {v1.2d}, [X], #16
fabs v1.2d, v1.2d
faddp TMPF, v1.2d
fadd SUMF, SUMF, TMPF
.endm
.macro KERNEL_F4
ld1 {v1.2d, v2.2d, v3.2d, v4.2d}, [X], #64
fabs v1.2d, v1.2d
fabs v2.2d, v2.2d
fabs v3.2d, v3.2d
fabs v4.2d, v4.2d
fadd v1.2d, v1.2d, v2.2d
fadd v3.2d, v3.2d, v4.2d
fadd v0.2d, v0.2d, v1.2d
fadd v0.2d, v0.2d, v3.2d
PRFM PLDL1KEEP, [X, #1024]
.endm
.macro KERNEL_F4_FINALIZE
faddp SUMF, v0.2d
.endm
.macro INIT_S
lsl INC_X, INC_X, #4
.endm
.macro KERNEL_S1
ld1 {v1.2d}, [X], INC_X
fabs v1.2d, v1.2d
faddp TMPF, v1.2d
fadd SUMF, SUMF, TMPF
.endm
/*******************************************************************************
* End of macro definitions
*******************************************************************************/
PROLOGUE
fmov SUMF, REG0
cmp N, xzr
ble asum_kernel_L999
cmp INC_X, xzr
ble asum_kernel_L999
cmp INC_X, #1
bne asum_kernel_S_BEGIN
asum_kernel_F_BEGIN:
asr I, N, #2
cmp I, xzr
beq asum_kernel_F1
asum_kernel_F4:
KERNEL_F4
subs I, I, #1
bne asum_kernel_F4
KERNEL_F4_FINALIZE
asum_kernel_F1:
ands I, N, #3
ble asum_kernel_L999
asum_kernel_F10:
KERNEL_F1
subs I, I, #1
bne asum_kernel_F10
asum_kernel_L999:
ret
asum_kernel_S_BEGIN:
INIT_S
asr I, N, #2
cmp I, xzr
ble asum_kernel_S1
asum_kernel_S4:
KERNEL_S1
KERNEL_S1
KERNEL_S1
KERNEL_S1
subs I, I, #1
bne asum_kernel_S4
asum_kernel_S1:
ands I, N, #3
ble asum_kernel_L999
asum_kernel_S10:
KERNEL_S1
subs I, I, #1
bne asum_kernel_S10
ret
EPILOGUE
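
Scalar sketch of the same computation (illustrative only; this file is hard-wired to double precision, as the d-register defines above show):

#include <math.h>

/* Scalar equivalent of the zasum kernel: sum of |Re| + |Im| over the vector. */
static double zasum_ref(long n, const double *x, long inc_x)
{
    double sum = 0.0;
    if (n <= 0 || inc_x <= 0) return 0.0;
    for (long i = 0; i < n; i++)
        sum += fabs(x[2 * i * inc_x]) + fabs(x[2 * i * inc_x + 1]);
    return sum;
}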

301
kernel/arm64/zaxpy.S Normal file

@ -0,0 +1,301 @@
/*******************************************************************************
Copyright (c) 2015, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#define ASSEMBLER
#include "common.h"
#define N x0 /* vector length */
#define X x3 /* X vector address */
#define INC_X x4 /* X stride */
#define Y x5 /* Y vector address */
#define INC_Y x6 /* Y stride */
#define I x1 /* loop variable */
#define Y_COPY x7 /* loop variable */
/*******************************************************************************
* Macro definitions
*******************************************************************************/
#if !defined(DOUBLE)
#define DA_R s0 /* scale input value */
#define DA_I s1 /* scale input value */
#define TMPX v2.2s
#define TMPY v3.2s
#define SZ 4
#else
#define DA_R d0 /* scale input value */
#define DA_I d1 /* scale input value */
#define TMPX v2.2d
#define TMPY v3.2d
#define SZ 8
#endif
/******************************************************************************/
.macro INIT
#if !defined(CONJ)
#if !defined(DOUBLE)
ins v0.s[1], v0.s[0] // v0 = DA_R, DA_R
fneg s2, DA_I
ins v1.s[1], v2.s[0] // v1 = -DA_I, DA_I
ext v1.8b, v1.8b, v1.8b, #4 // v1 = DA_I, -DA_I
#else
ins v0.d[1], v0.d[0] // v0 = DA_R, DA_R
fneg d2, DA_I
ins v1.d[1], v2.d[0] // v1 = -DA_I, DA_I
ext v1.16b, v1.16b, v1.16b, #8 // v1 = DA_I, -DA_I
#endif
#else
#if !defined(DOUBLE)
fneg s2, DA_R
ins v0.s[1], v2.s[0] // v0 = -DA_R, DA_R
ins v1.s[1], v1.s[0] // v1 = DA_I, DA_I
#else
fneg d2, DA_R
ins v0.d[1], v2.d[0] // v0 = -DA_R, DA_R
ins v1.d[1], v1.d[0] // v1 = DA_I, DA_I
#endif
#endif
.endm
.macro KERNEL_F1
#if !defined(DOUBLE)
ld1 {v2.2s}, [X], #8 // V2 = X[ix+1], X[ix]; X += 2
ld1 {v3.2s}, [Y] // V3 = Y[iy+1], Y[iy]
ext v4.8b, v2.8b, v2.8b, #4 // V4 = X[ix], X[ix+1]
fmla v3.2s, v0.2s, v2.2s // Y[iy] += DA_R * X[ix]
// Y[iy+1] += +-DA_R * X[ix+1]
fmla v3.2s, v1.2s, v4.2s // Y[iy] += +-DA_I * X[ix+1]
// Y[iy+1] += DA_I * X[ix]
st1 {v3.2s}, [Y], #8
#else
ld1 {v2.2d}, [X], #16 // V2 = X[ix+1], X[ix]; X += 2
ld1 {v3.2d}, [Y] // V3 = Y[iy+1], Y[iy]
ext v4.16b, v2.16b, v2.16b, #8 // V4 = X[ix], X[ix+1]
fmla v3.2d, v0.2d, v2.2d // Y[iy] += DA_R * X[ix]
// Y[iy+1] += +-DA_R * X[ix+1]
fmla v3.2d, v1.2d, v4.2d // Y[iy] += +-DA_I * X[ix+1]
// Y[iy+1] += DA_I * X[ix]
st1 {v3.2d}, [Y], #16
#endif
.endm
.macro KERNEL_INIT_F4
#if !defined(DOUBLE)
// Replicate the lower 2 floats into the upper 2 slots
ins v0.d[1], v0.d[0] // v0 = DA_R, DA_R, DA_R, DA_R
ins v1.d[1], v1.d[0] // v1 = DA_I, DA_I, DA_I, DA_I
#endif
.endm
.macro KERNEL_F4
#if !defined(DOUBLE)
ld1 {v2.4s,v3.4s}, [X], #32 // V2 = X[3], X[2], X[1], X[0]
// V3 = X[7], X[6], X[5], X[4]
ext v6.8b, v2.8b, v2.8b, #4 // V6 = - , - , X[0], X[1]
ins v6.s[2], v2.s[3] // V6 = - , X[3], X[0], X[1]
ins v6.s[3], v2.s[2] // V6 = X[2], X[3], X[0], X[1]
ld1 {v4.4s,v5.4s}, [Y] // V4 = Y[3], Y[2], Y[1], Y[0]
// V5 = Y[7], Y[6], Y[5], Y[4]
ext v7.8b, v3.8b, v3.8b, #4 // V7 = - , - , X[4], X[5]
ins v7.s[2], v3.s[3] // V7 = - , X[7], X[4], X[5]
ins v7.s[3], v3.s[2] // V7 = X[6], X[7], X[4], X[5]
fmla v4.4s, v0.4s, v2.4s // Y[iy] += DA_R * X[ix]
// Y[iy+1] += +-DA_R * X[ix+1]
fmla v4.4s, v1.4s, v6.4s // Y[iy] += +-DA_I * X[ix+1]
// Y[iy+1] += DA_I * X[ix]
st1 {v4.4s}, [Y], #16
fmla v5.4s, v0.4s, v3.4s // Y[iy] += DA_R * X[ix]
fmla v5.4s, v1.4s, v7.4s // Y[iy] += +-DA_I * X[ix+1]
// Y[iy+1] += +-DA_R * X[ix+1]
// Y[iy+1] += DA_I * X[ix]
st1 {v5.4s}, [Y], #16
#else // DOUBLE
ld1 {v2.2d,v3.2d}, [X], #32 // CX0, CX1, CX2, CX3
ext v20.16b, v2.16b, v2.16b, #8 // X[ix], X[ix+1]
ext v21.16b, v3.16b, v3.16b, #8 // X[ix], X[ix+1]
ld1 {v4.2d,v5.2d}, [X], #32 // CX0, CX1, CX2, CX3
ext v22.16b, v4.16b, v4.16b, #8 // X[ix], X[ix+1]
ext v23.16b, v5.16b, v5.16b, #8 // X[ix], X[ix+1]
ld1 {v16.2d,v17.2d}, [Y_COPY], #32 // CY0, CY1, CY2, CY3
fmla v16.2d, v0.2d, v2.2d
fmla v17.2d, v0.2d, v3.2d
ld1 {v18.2d,v19.2d}, [Y_COPY], #32 // CY0, CY1, CY2, CY3
fmla v16.2d, v1.2d, v20.2d
fmla v17.2d, v1.2d, v21.2d
st1 {v16.2d,v17.2d}, [Y], #32
fmla v18.2d, v0.2d, v4.2d
fmla v19.2d, v0.2d, v5.2d
fmla v18.2d, v1.2d, v22.2d
fmla v19.2d, v1.2d, v23.2d
st1 {v18.2d,v19.2d}, [Y], #32
#endif
PRFM PLDL1KEEP, [X, #512]
PRFM PLDL1KEEP, [Y, #512]
.endm
.macro INIT_S
#if !defined(DOUBLE)
lsl INC_X, INC_X, #3
lsl INC_Y, INC_Y, #3
#else
lsl INC_X, INC_X, #4
lsl INC_Y, INC_Y, #4
#endif
.endm
.macro KERNEL_S1
#if !defined(DOUBLE)
ld1 {v2.2s}, [X], INC_X // V2 = X[ix+1], X[ix]; X += 2
ld1 {v3.2s}, [Y] // V3 = Y[iy+1], Y[iy]
ext v4.8b, v2.8b, v2.8b, #4 // V4 = X[ix], X[ix+1]
fmla v3.2s, v0.2s, v2.2s // Y[iy] += DA_R * X[ix]
// Y[iy+1] += +-DA_R * X[ix+1]
fmla v3.2s, v1.2s, v4.2s // Y[iy] += +-DA_I * X[ix+1]
// Y[iy+1] += DA_I * X[ix]
st1 {v3.2s}, [Y], INC_Y
#else
ld1 {v2.2d}, [X], INC_X // V2 = X[ix+1], X[ix]; X += 2
ld1 {v3.2d}, [Y] // V3 = Y[iy+1], Y[iy]
ext v4.16b, v2.16b, v2.16b, #8 // V4 = X[ix], X[ix+1]
fmla v3.2d, v0.2d, v2.2d // Y[iy] += DA_R * X[ix]
// Y[iy+1] += +-DA_R * X[ix+1]
fmla v3.2d, v1.2d, v4.2d // Y[iy] += +-DA_I * X[ix+1]
// Y[iy+1] += DA_I * X[ix]
st1 {v3.2d}, [Y], INC_Y
#endif
.endm
/*******************************************************************************
* End of macro definitions
*******************************************************************************/
PROLOGUE
cmp N, xzr
ble zaxpy_kernel_L999
mov Y_COPY, Y
fcmp DA_R, #0.0
bne .L1
fcmp DA_I, #0.0
beq zaxpy_kernel_L999
.L1:
INIT
cmp INC_X, #1
bne zaxpy_kernel_S_BEGIN
cmp INC_Y, #1
bne zaxpy_kernel_S_BEGIN
zaxpy_kernel_F_BEGIN:
asr I, N, #2
cmp I, xzr
beq zaxpy_kernel_F1
KERNEL_INIT_F4
zaxpy_kernel_F4:
KERNEL_F4
subs I, I, #1
bne zaxpy_kernel_F4
zaxpy_kernel_F1:
ands I, N, #3
ble zaxpy_kernel_L999
zaxpy_kernel_F10:
KERNEL_F1
subs I, I, #1
bne zaxpy_kernel_F10
mov w0, wzr
ret
zaxpy_kernel_S_BEGIN:
INIT_S
asr I, N, #2
cmp I, xzr
ble zaxpy_kernel_S1
zaxpy_kernel_S4:
KERNEL_S1
KERNEL_S1
KERNEL_S1
KERNEL_S1
subs I, I, #1
bne zaxpy_kernel_S4
zaxpy_kernel_S1:
ands I, N, #3
ble zaxpy_kernel_L999
zaxpy_kernel_S10:
KERNEL_S1
subs I, I, #1
bne zaxpy_kernel_S10
zaxpy_kernel_L999:
mov w0, wzr
ret
EPILOGUE
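
The vector shuffles above implement an ordinary complex axpy; in scalar C the kernel amounts to the sketch below (illustrative signature, double-complex case; the CONJ build conjugates x):

#include <complex.h>

/* Scalar equivalent of the zaxpy kernel: y[i] += alpha * x[i]
   (alpha * conj(x[i]) when CONJ is defined). */
static void zaxpy_ref(long n, double complex alpha,
                      const double complex *x, long inc_x,
                      double complex *y, long inc_y)
{
    if (n <= 0 || alpha == 0.0) return;       /* early-outs checked in the prologue */
    for (long i = 0; i < n; i++)
        y[i * inc_y] += alpha * x[i * inc_x]; /* conj(x[i * inc_x]) in the CONJ build */
}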

302
kernel/arm64/zdot.S Normal file

@ -0,0 +1,302 @@
/*******************************************************************************
Copyright (c) 2015, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#define ASSEMBLER
#include "common.h"
#define N x0 /* vector length */
#define X x1 /* X vector address */
#define INC_X x2 /* X stride */
#define Y x3 /* Y vector address */
#define INC_Y x4 /* Y stride */
#define I x5 /* loop variable */
/*******************************************************************************
* Macro definitions
*******************************************************************************/
#if !defined(DOUBLE)
#if !defined(DSDOT)
#define REG0 wzr
#define DOTF s0
#else // DSDOT
#define REG0 xzr
#define DOTF d0
#endif
#define DOTI s1
#define TMPX s2
#define LD1VX {v2.s}[0]
#define TMPY s3
#define LD1VY {v3.s}[0]
#define TMPVY v3.s[0]
#define SZ 4
#else
#define REG0 xzr
#define DOTF d0
#define DOTI d1
#define TMPX d2
#define LD1VX {v2.d}[0]
#define TMPY d3
#define LD1VY {v3.d}[0]
#define TMPVY v3.d[0]
#define SZ 8
#endif
/******************************************************************************/
.macro KERNEL_F1
#if !defined(DOUBLE)
ld1 {v2.2s}, [X], #8 // V2 = X[ix+1], X[ix]; X += 2
ld1 {v3.2s}, [Y], #8 // V3 = Y[iy+1], Y[iy]; Y += 2
ins v4.s[0], v2.s[1] // V4 = X[ix+1]
#if !defined(CONJ)
fmla DOTF, s2, v3.s[0] // dot[0] += X[ix] * Y[iy]
fmls DOTF, s4, v3.s[1] // dot[0] -= X[ix+1] * Y[iy+1]
fmla DOTI, s4, v3.s[0] // dot[1] += X[ix+1] * Y[iy]
fmla DOTI, s2, v3.s[1] // dot[1] += X[ix] * Y[iy+1]
#else
fmla DOTF, s2, v3.s[0] // dot[0] += X[ix] * Y[iy]
fmla DOTF, s4, v3.s[1] // dot[0] += X[ix+1] * Y[iy+1]
fmls DOTI, s4, v3.s[0] // dot[1] -= X[ix+1] * Y[iy]
fmla DOTI, s2, v3.s[1] // dot[1] += X[ix] * Y[iy+1]
#endif
#else // DOUBLE
ld1 {v2.2d}, [X], #16 // V2 = X[ix+1], X[ix]; X += 2
ld1 {v3.2d}, [Y], #16 // V3 = Y[iy+1], Y[iy]; Y += 2
ins v4.d[0], v2.d[1] // V4 = X[ix+1]
#if !defined(CONJ)
fmla DOTF, d2, v3.d[0] // dot[0] += X[ix] * Y[iy]
fmls DOTF, d4, v3.d[1] // dot[0] -= X[ix+1] * Y[iy+1]
fmla DOTI, d4, v3.d[0] // dot[1] += X[ix+1] * Y[iy]
fmla DOTI, d2, v3.d[1] // dot[1] += X[ix] * Y[iy+1]
#else
fmla DOTF, d2, v3.d[0] // dot[0] += X[ix] * Y[iy]
fmla DOTF, d4, v3.d[1] // dot[0] += X[ix+1] * Y[iy+1]
fmls DOTI, d4, v3.d[0] // dot[1] -= X[ix+1] * Y[iy]
fmla DOTI, d2, v3.d[1] // dot[1] += X[ix] * Y[iy+1]
#endif
#endif
.endm
.macro KERNEL_F4
#if !defined(DOUBLE)
ld2 {v2.4s, v3.4s}, [X], #32 // V2 = Re(X), V3 = Im(X) for 4 elements
ld2 {v4.4s, v5.4s}, [Y], #32 // V4 = Re(Y), V5 = Im(Y) for 4 elements
fmla v0.4s, v2.4s, v4.4s // dot[0] += X[ix] * Y[iy]
fmla v1.4s, v2.4s, v5.4s // dot[1] += X[ix] * Y[iy+1]
PRFM PLDL1KEEP, [X, #1024]
PRFM PLDL1KEEP, [Y, #1024]
#if !defined(CONJ)
fmls v0.4s, v3.4s, v5.4s // dot[0] -= X[ix+1] * Y[iy+1]
fmla v1.4s, v3.4s, v4.4s // dot[1] += X[ix+1] * Y[iy]
#else
fmla v0.4s, v3.4s, v5.4s // dot[0] += X[ix+1] * Y[iy+1]
fmls v1.4s, v3.4s, v4.4s // dot[1] -= X[ix+1] * Y[iy]
#endif
#else // DOUBLE
ld2 {v2.2d, v3.2d}, [X], #32 // V2 = Re(X), V3 = Im(X) for 2 elements
ld2 {v16.2d, v17.2d}, [Y], #32
fmla v0.2d, v2.2d, v16.2d // dot[0] += X[ix] * Y[iy]
fmla v1.2d, v2.2d, v17.2d // dot[1] += X[ix] * Y[iy+1]
ld2 {v4.2d, v5.2d}, [X], #32
ld2 {v18.2d, v19.2d}, [Y], #32
fmla v0.2d, v4.2d, v18.2d // dot[0] += X[ix] * Y[iy]
fmla v1.2d, v4.2d, v19.2d // dot[1] += X[ix] * Y[iy+1]
PRFM PLDL1KEEP, [X, #1024]
PRFM PLDL1KEEP, [Y, #1024]
#if !defined(CONJ)
fmls v0.2d, v3.2d, v17.2d // dot[0] -= X[ix+1] * Y[iy+1]
fmls v20.2d, v5.2d, v19.2d // dot[0] -= X[ix+1] * Y[iy+1]
fmla v1.2d, v3.2d, v16.2d // dot[1] += X[ix+1] * Y[iy]
fmla v21.2d, v5.2d, v18.2d // dot[1] += X[ix+1] * Y[iy]
#else
fmla v0.2d, v3.2d, v17.2d // dot[0] += X[ix+1] * Y[iy+1]
fmla v20.2d, v5.2d, v19.2d // dot[0] += X[ix+1] * Y[iy+1]
fmls v1.2d, v3.2d, v16.2d // dot[1] -= X[ix+1] * Y[iy]
fmls v21.2d, v5.2d, v18.2d // dot[1] -= X[ix+1] * Y[iy]
#endif
#endif
.endm
.macro KERNEL_F4_FINALIZE
#if !defined(DOUBLE)
ext v2.16b, v0.16b, v0.16b, #8
fadd v0.2s, v0.2s, v2.2s
faddp DOTF, v0.2s
ext v3.16b, v1.16b, v1.16b, #8
fadd v1.2s, v1.2s, v3.2s
faddp DOTI, v1.2s
#else
fadd v0.2d, v0.2d, v20.2d
faddp DOTF, v0.2d
fadd v1.2d, v1.2d, v21.2d
faddp DOTI, v1.2d
#endif
.endm
.macro INIT_S
#if !defined(DOUBLE)
lsl INC_X, INC_X, #3
lsl INC_Y, INC_Y, #3
#else
lsl INC_X, INC_X, #4
lsl INC_Y, INC_Y, #4
#endif
.endm
.macro KERNEL_S1
#if !defined(DOUBLE)
ld1 {v2.2s}, [X], INC_X // V2 = X[ix+1], X[ix]; X += 2
ld1 {v3.2s}, [Y], INC_Y // V3 = Y[iy+1], Y[iy]; Y += 2
ext v4.8b, v2.8b, v2.8b, #4 // V4 = X[ix], X[ix+1]
#if !defined(CONJ)
fmla DOTF, s2, v3.s[0] // dot[0] += X[ix] * Y[iy]
fmls DOTF, s4, v3.s[1] // dot[0] -= X[ix+1] * Y[iy+1]
fmla DOTI, s4, v3.s[0] // dot[1] += X[ix+1] * Y[iy]
fmla DOTI, s2, v3.s[1] // dot[1] += X[ix] * Y[iy+1]
#else
fmla DOTF, s2, v3.s[0] // dot[0] += X[ix] * Y[iy]
fmla DOTF, s4, v3.s[1] // dot[0] += X[ix+1] * Y[iy+1]
fmls DOTI, s4, v3.s[0] // dot[1] -= X[ix+1] * Y[iy]
fmla DOTI, s2, v3.s[1] // dot[1] += X[ix] * Y[iy+1]
#endif
#else // DOUBLE
ld1 {v2.2d}, [X], INC_X // V2 = X[ix+1], X[ix]; X += 2
ld1 {v3.2d}, [Y], INC_Y // V3 = Y[iy+1], Y[iy]; Y += 2
ext v4.16b, v2.16b, v2.16b, #8 // V4 = X[ix], X[ix+1]
#if !defined(CONJ)
fmla DOTF, d2, v3.d[0] // dot[0] += X[ix] * Y[iy]
fmls DOTF, d4, v3.d[1] // dot[0] -= X[ix+1] * Y[iy+1]
fmla DOTI, d4, v3.d[0] // dot[1] += X[ix+1] * Y[iy]
fmla DOTI, d2, v3.d[1] // dot[1] += X[ix] * Y[iy+1]
#else
fmla DOTF, d2, v3.d[0] // dot[0] += X[ix] * Y[iy]
fmla DOTF, d4, v3.d[1] // dot[0] += X[ix+1] * Y[iy+1]
fmls DOTI, d4, v3.d[0] // dot[1] -= X[ix+1] * Y[iy]
fmla DOTI, d2, v3.d[1] // dot[1] += X[ix] * Y[iy+1]
#endif
#endif
.endm
/*******************************************************************************
* End of macro definitions
*******************************************************************************/
PROLOGUE
fmov DOTF, REG0
fmov DOTI, DOTF
#if !defined(DOUBLE)
fmov s20, DOTF
fmov s21, DOTI
#else
fmov d20, DOTF
fmov d21, DOTI
#endif
cmp N, xzr
ble dot_kernel_L999
cmp INC_X, #1
bne dot_kernel_S_BEGIN
cmp INC_Y, #1
bne dot_kernel_S_BEGIN
dot_kernel_F_BEGIN:
asr I, N, #2
cmp I, xzr
beq dot_kernel_F1
dot_kernel_F4:
KERNEL_F4
subs I, I, #1
bne dot_kernel_F4
KERNEL_F4_FINALIZE
dot_kernel_F1:
ands I, N, #3
ble dot_kernel_L999
dot_kernel_F10:
KERNEL_F1
subs I, I, #1
bne dot_kernel_F10
ret
dot_kernel_S_BEGIN:
INIT_S
asr I, N, #2
cmp I, xzr
ble dot_kernel_S1
dot_kernel_S4:
KERNEL_S1
KERNEL_S1
KERNEL_S1
KERNEL_S1
subs I, I, #1
bne dot_kernel_S4
dot_kernel_S1:
ands I, N, #3
ble dot_kernel_L999
dot_kernel_S10:
KERNEL_S1
subs I, I, #1
bne dot_kernel_S10
dot_kernel_L999:
ret
EPILOGUE
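
As a reference for the lane arithmetic above, the kernel computes the unconjugated or conjugated complex dot product; a scalar C sketch (illustrative, double-complex case, return convention simplified):

#include <complex.h>

/* Scalar equivalent of the zdot kernel:
   dotu = sum x[i] * y[i], or dotc = sum conj(x[i]) * y[i] when CONJ is defined. */
static double complex zdot_ref(long n, const double complex *x, long inc_x,
                               const double complex *y, long inc_y)
{
    double complex dot = 0.0;
    if (n <= 0) return dot;
    for (long i = 0; i < n; i++)
        dot += x[i * inc_x] * y[i * inc_y];   /* conj(x[...]) * y[...] for dotc */
    return dot;
}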

File diff suppressed because it is too large

514
kernel/arm64/zgemv_n.S Normal file

@ -0,0 +1,514 @@
/*******************************************************************************
Copyright (c) 2015, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#define ASSEMBLER
#include "common.h"
#define M x0 /* Y vector length */
#define N x1 /* X vector length */
#define A x3 /* A vector address */
#define LDA x4 /* A stride */
#define X x5 /* X vector address */
#define INC_X x6 /* X stride */
#define Y x7 /* Y vector address */
#define INC_Y x2 /* Y stride */
#define A_PTR x9 /* loop A vector address */
#define Y_IPTR x10 /* loop Y vector address */
#define J x11 /* loop variable */
#define I x12 /* loop variable */
#define Y_OPTR x13 /* loop Y vector address */
#define X_PTR x14 /* loop X vector address */
/*******************************************************************************
* Macro definitions
*******************************************************************************/
#if !defined(DOUBLE)
#define ALPHA_R s0
#define ALPHA_I s1
#define ALPHA_R_COPY s7
#define ALPHA_I_COPY s8
#define SHZ 3
#else
#define ALPHA_R d0
#define ALPHA_I d1
#define ALPHA_R_COPY d7
#define ALPHA_I_COPY d8
#define SHZ 4
#endif
/******************************************************************************/
.macro SAVE_REGS
add sp, sp, #-(11 * 16)
stp d8, d9, [sp, #(0 * 16)]
stp d10, d11, [sp, #(1 * 16)]
stp d12, d13, [sp, #(2 * 16)]
stp d14, d15, [sp, #(3 * 16)]
stp d16, d17, [sp, #(4 * 16)]
stp x18, x19, [sp, #(5 * 16)]
stp x20, x21, [sp, #(6 * 16)]
stp x22, x23, [sp, #(7 * 16)]
stp x24, x25, [sp, #(8 * 16)]
stp x26, x27, [sp, #(9 * 16)]
str x28, [sp, #(10 * 16)]
.endm
.macro RESTORE_REGS
ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)]
ldp d12, d13, [sp, #(2 * 16)]
ldp d14, d15, [sp, #(3 * 16)]
ldp d16, d17, [sp, #(4 * 16)]
ldp x18, x19, [sp, #(5 * 16)]
ldp x20, x21, [sp, #(6 * 16)]
ldp x22, x23, [sp, #(7 * 16)]
ldp x24, x25, [sp, #(8 * 16)]
ldp x26, x27, [sp, #(9 * 16)]
ldr x28, [sp, #(10 * 16)]
add sp, sp, #(11*16)
.endm
.macro INIT
/********** INIT FOR F4 LOOP **********/
fmov ALPHA_R_COPY, ALPHA_R
fmov ALPHA_I_COPY, ALPHA_I
#if !defined(DOUBLE)
ins v7.s[1], v7.s[0] // R(ALPHA), R(ALPHA)
ins v8.s[1], v8.s[0] // I(ALPHA), I(ALPHA)
ins v7.d[1], v7.d[0]
ins v8.d[1], v8.d[0]
#else
ins v7.d[1], v7.d[0] // R(ALPHA), R(ALPHA)
ins v8.d[1], v8.d[0] // I(ALPHA), I(ALPHA)
#endif
/******* INIT FOR F1 AND S1 LOOP ******/
#if !defined(DOUBLE)
ins v0.s[1], v0.s[0] // R(ALPHA), R(ALPHA)
fneg s2, ALPHA_I
ins v1.s[1], v2.s[0] // -I(ALPHA), I(ALPHA)
#if !defined(XCONJ)
ext v1.8b, v1.8b, v1.8b, #4 // I(ALPHA), -I(ALPHA)
#endif
#else
ins v0.d[1], v0.d[0] // R(ALPHA), R(ALPHA)
fneg d2, ALPHA_I
ins v1.d[1], v2.d[0] // -I(ALPHA), I(ALPHA)
#if !defined(XCONJ)
ext v1.16b, v1.16b, v1.16b, #8 // I(ALPHA), -I(ALPHA)
#endif
#endif
.endm
.macro INIT_LOOP
/********** INIT_LOOP FOR F4 LOOP **********/
#if !defined(DOUBLE)
ld1 {v9.2s}, [X_PTR] // [I(X), R(X)]
ins v10.s[0], v9.s[1]
ins v9.s[1], v9.s[0] // [R(X), R(X)]
ins v10.s[1], v10.s[0] // [I(X), I(X)]
ins v9.d[1], v9.d[0]
ins v10.d[1], v10.d[0]
#if !defined(CONJ)
#if !defined(XCONJ)
fmul v11.4s, v9.4s, v7.4s // [+ R(X) * R(ALPHA)]
fmls v11.4s, v10.4s, v8.4s // [- I(X) * I(ALPHA)]
fmul v12.4s, v9.4s, v8.4s // [+ R(X) * I(ALPHA)]
fmla v12.4s, v10.4s, v7.4s // [+ I(X) * R(ALPHA)]
#else
fmul v11.4s, v9.4s, v7.4s // [+ R(X) * R(ALPHA)]
fmla v11.4s, v10.4s, v8.4s // [+ I(X) * I(ALPHA)]
fmul v12.4s, v9.4s, v8.4s // [+ R(X) * I(ALPHA)]
fmls v12.4s, v10.4s, v7.4s // [- I(X) * R(ALPHA)]
#endif
#else // CONJ
#if !defined(XCONJ)
fmul v11.4s, v9.4s, v7.4s // [+ R(X) * R(ALPHA)]
fmls v11.4s, v10.4s, v8.4s // [+ I(X) * I(ALPHA)]
fmul v12.4s, v10.4s, v7.4s // [+ I(X) * R(ALPHA)]
fmls v12.4s, v9.4s, v8.4s // [- R(X) * I(ALPHA)]
#else
fmul v11.4s, v9.4s, v7.4s // [+ R(X) * R(ALPHA)]
fmls v11.4s, v10.4s, v8.4s // [- I(X) * I(ALPHA)]
fmul v12.4s, v9.4s, v8.4s // [R(X) * I(ALPHA)]
fneg v12.4s, v12.4s // [- R(X) * I(ALPHA)]
fmla v12.4s, v10.4s, v7.4s // [- I(X) * R(ALPHA)]
#endif
#endif // CONJ
/****** INIT_LOOP FOR F1 AND S1 LOOP ******/
ld1 {v2.2s}, [X_PTR] // [I(X), R(X)]
ext v3.8b, v2.8b, v2.8b, #4 // [R(X), I(X)]
fmul v2.2s, v0.2s, v2.2s
fmla v2.2s, v1.2s, v3.2s // [I(TEMP), R(TEMP)]
ins v3.s[0], v2.s[1]
#if !defined(CONJ)
#if !defined(XCONJ)
fneg s4, s3
ins v3.s[1], v4.s[0]
ext v3.8b, v3.8b, v3.8b, #4 // [I(TEMP), -I(TEMP)]
ins v2.s[1], v2.s[0] // [R(TEMP), R(TEMP)]
#else
fneg s4, s3
ins v3.s[1], v4.s[0] // [-I(TEMP), I(TEMP)]
ins v2.s[1], v2.s[0] // [R(TEMP), R(TEMP)]
#endif
#else // CONJ
#if !defined(XCONJ)
ins v3.s[1], v3.s[0] // [I(TEMP), I(TEMP)]
fneg s4, s2
ins v2.s[1], v4.s[0] // [-R(TEMP), R(TEMP)]
#else
fneg s3, s3
ins v3.s[1], v3.s[0] // [-I(TEMP), -I(TEMP)]
fneg s4, s2
ins v2.s[1], v4.s[0] // [-R(TEMP), R(TEMP)]
#endif
#endif // CONJ
#else // DOUBLE
/********** INIT_LOOP FOR F4 LOOP **********/
ld1 {v9.2d}, [X_PTR] // [I(X), R(X)]
ins v10.d[0], v9.d[1]
ins v9.d[1], v9.d[0] // [R(X), R(X)]
ins v10.d[1], v10.d[0] // [I(X), I(X)]
#if !defined(CONJ)
#if !defined(XCONJ)
fmul v11.2d, v9.2d, v7.2d // [+ R(X) * R(ALPHA)]
fmls v11.2d, v10.2d, v8.2d // [- I(X) * I(ALPHA)]
fmul v12.2d, v9.2d, v8.2d // [+ R(X) * I(ALPHA)]
fmla v12.2d, v10.2d, v7.2d // [+ I(X) * R(ALPHA)]
#else
fmul v11.2d, v9.2d, v7.2d // [+ R(X) * R(ALPHA)]
fmla v11.2d, v10.2d, v8.2d // [+ I(X) * I(ALPHA)]
fmul v12.2d, v9.2d, v8.2d // [+ R(X) * I(ALPHA)]
fmls v12.2d, v10.2d, v7.2d // [- I(X) * R(ALPHA)]
#endif
#else // CONJ
#if !defined(XCONJ)
fmul v11.2d, v9.2d, v7.2d // [+ R(X) * R(ALPHA)]
fmls v11.2d, v10.2d, v8.2d // [+ I(X) * I(ALPHA)]
fmul v12.2d, v10.2d, v7.2d // [+ I(X) * R(ALPHA)]
fmls v12.2d, v9.2d, v8.2d // [- R(X) * I(ALPHA)]
#else
fmul v11.2d, v9.2d, v7.2d // [+ R(X) * R(ALPHA)]
fmls v11.2d, v10.2d, v8.2d // [- I(X) * I(ALPHA)]
fmul v12.2d, v9.2d, v8.2d // [R(X) * I(ALPHA)]
fneg v12.2d, v12.2d // [- R(X) * I(ALPHA)]
fmla v12.2d, v10.2d, v7.2d // [- I(X) * R(ALPHA)]
#endif
#endif // CONJ
/****** INIT_LOOP FOR F1 AND S1 LOOP ******/
ld1 {v2.2d}, [X_PTR] // [I(X), R(X)]
ext v3.16b, v2.16b, v2.16b, #8 // [R(X), I(X)]
fmul v2.2d, v0.2d, v2.2d
fmla v2.2d, v1.2d, v3.2d // [I(TEMP), R(TEMP)]
ins v3.d[0], v2.d[1] // I(TEMP)
#if !defined(CONJ)
#if !defined(XCONJ)
fneg d4, d3 // -I(TEMP)
ins v3.d[1], v4.d[0]
ext v3.16b, v3.16b, v3.16b, #8 // [I(TEMP), -I(TEMP)]
ins v2.d[1], v2.d[0] // [R(TEMP), R(TEMP)]
#else
fneg d4, d3 // -I(TEMP)
ins v3.d[1], v4.d[0] // [-I(TEMP), I(TEMP)]
ins v2.d[1], v2.d[0] // [R(TEMP), R(TEMP)]
#endif
#else // CONJ
#if !defined(XCONJ)
ins v3.d[1], v3.d[0] // [I(TEMP), I(TEMP)]
fneg d4, d2 // -R(TEMP)
ins v2.d[1], v4.d[0] // [-R(TEMP), R(TEMP)]
#else
fneg d3, d3 // -I(TEMP)
ins v3.d[1], v3.d[0] // [-I(TEMP), -I(TEMP)]
fneg d4, d2 // -R(TEMP)
ins v2.d[1], v4.d[0] // [-R(TEMP), R(TEMP)]
#endif
#endif // CONJ
#endif // DOUBLE
.endm
.macro KERNEL_F4
#if !defined(DOUBLE)
ld2 {v13.4s, v14.4s}, [A_PTR], #32
ld2 {v15.4s, v16.4s}, [Y_IPTR], #32
#if !defined(CONJ)
#if !defined(XCONJ)
fmla v15.4s, v11.4s, v13.4s // [+ R(ALPHA * X) * A_R]
fmls v15.4s, v12.4s, v14.4s // [- I(ALPHA * X) * A_I]
fmla v16.4s, v11.4s, v14.4s // [+ R(ALPHA * X) * A_I]
fmla v16.4s, v12.4s, v13.4s // [+ I(ALPHA * X) * A_R]
#else
fmla v15.4s, v11.4s, v13.4s // [+ R(ALPHA * X) * A_R]
fmla v15.4s, v12.4s, v14.4s // [+ I(ALPHA * X) * A_I]
fmla v16.4s, v11.4s, v14.4s // [+ R(ALPHA * X) * A_I]
fmls v16.4s, v12.4s, v13.4s // [- I(ALPHA * X) * A_R]
#endif
#else // CONJ
#if !defined(XCONJ)
fmla v15.4s, v11.4s, v13.4s // [+ R(ALPHA * X) * A_R]
fmla v15.4s, v12.4s, v14.4s // [+ I(ALPHA * X) * A_I]
fmls v16.4s, v11.4s, v14.4s // [- R(ALPHA * X) * A_I]
fmla v16.4s, v12.4s, v13.4s // [+ I(ALPHA * X) * A_R]
#else
fmla v15.4s, v11.4s, v13.4s // [+ R(ALPHA * X) * A_R]
fmls v15.4s, v12.4s, v14.4s // [- I(ALPHA * X) * A_I]
fmls v16.4s, v11.4s, v14.4s // [- R(ALPHA * X) * A_I]
fmls v16.4s, v12.4s, v13.4s // [- I(ALPHA * X) * A_R]
#endif
#endif // CONJ
st2 {v15.4s, v16.4s}, [Y_OPTR], #32
#else // DOUBLE
ld2 {v13.2d, v14.2d}, [A_PTR], #32
ld2 {v15.2d, v16.2d}, [Y_IPTR], #32
#if !defined(CONJ)
#if !defined(XCONJ)
fmla v15.2d, v11.2d, v13.2d // [+ R(ALPHA * X) * A_R]
fmls v15.2d, v12.2d, v14.2d // [- I(ALPHA * X) * A_I]
fmla v16.2d, v11.2d, v14.2d // [+ R(ALPHA * X) * A_I]
fmla v16.2d, v12.2d, v13.2d // [+ I(ALPHA * X) * A_R]
#else
fmla v15.2d, v11.2d, v13.2d // [+ R(ALPHA * X) * A_R]
fmla v15.2d, v12.2d, v14.2d // [+ I(ALPHA * X) * A_I]
fmla v16.2d, v11.2d, v14.2d // [+ R(ALPHA * X) * A_I]
fmls v16.2d, v12.2d, v13.2d // [- I(ALPHA * X) * A_R]
#endif
#else // CONJ
#if !defined(XCONJ)
fmla v15.2d, v11.2d, v13.2d // [+ R(ALPHA * X) * A_R]
fmla v15.2d, v12.2d, v14.2d // [+ I(ALPHA * X) * A_I]
fmls v16.2d, v11.2d, v14.2d // [- R(ALPHA * X) * A_I]
fmla v16.2d, v12.2d, v13.2d // [+ I(ALPHA * X) * A_R]
#else
fmla v15.2d, v11.2d, v13.2d // [+ R(ALPHA * X) * A_R]
fmls v15.2d, v12.2d, v14.2d // [- I(ALPHA * X) * A_I]
fmls v16.2d, v11.2d, v14.2d // [- R(ALPHA * X) * A_I]
fmls v16.2d, v12.2d, v13.2d // [- I(ALPHA * X) * A_R]
#endif
#endif // CONJ
st2 {v15.2d, v16.2d}, [Y_OPTR], #32
ld2 {v17.2d, v18.2d}, [A_PTR], #32
ld2 {v19.2d, v20.2d}, [Y_IPTR], #32
#if !defined(CONJ)
#if !defined(XCONJ)
fmla v19.2d, v11.2d, v17.2d // [+ R(ALPHA * X) * A_R]
fmls v19.2d, v12.2d, v18.2d // [- I(ALPHA * X) * A_I]
fmla v20.2d, v11.2d, v18.2d // [+ R(ALPHA * X) * A_I]
fmla v20.2d, v12.2d, v17.2d // [+ I(ALPHA * X) * A_R]
#else
fmla v19.2d, v11.2d, v17.2d // [+ R(ALPHA * X) * A_R]
fmla v19.2d, v12.2d, v18.2d // [+ I(ALPHA * X) * A_I]
fmla v20.2d, v11.2d, v18.2d // [+ R(ALPHA * X) * A_I]
fmls v20.2d, v12.2d, v17.2d // [- I(ALPHA * X) * A_R]
#endif
#else // CONJ
#if !defined(XCONJ)
fmla v19.2d, v11.2d, v17.2d // [+ R(ALPHA * X) * A_R]
fmla v19.2d, v12.2d, v18.2d // [+ I(ALPHA * X) * A_I]
fmls v20.2d, v11.2d, v18.2d // [- R(ALPHA * X) * A_I]
fmla v20.2d, v12.2d, v17.2d // [+ I(ALPHA * X) * A_R]
#else
fmla v19.2d, v11.2d, v17.2d // [+ R(ALPHA * X) * A_R]
fmls v19.2d, v12.2d, v18.2d // [- I(ALPHA * X) * A_I]
fmls v20.2d, v11.2d, v18.2d // [- R(ALPHA * X) * A_I]
fmls v20.2d, v12.2d, v17.2d // [- I(ALPHA * X) * A_R]
#endif
#endif // CONJ
st2 {v19.2d, v20.2d}, [Y_OPTR], #32
#endif
.endm
.macro KERNEL_F1
#if !defined(DOUBLE)
ld1 {v4.2s}, [A_PTR], #8
ld1 {v5.2s}, [Y_IPTR], #8
ext v6.8b, v4.8b, v4.8b, #4
fmla v5.2s, v2.2s, v4.2s
fmla v5.2s, v3.2s, v6.2s
st1 {v5.2s}, [Y_OPTR], #8
#else // DOUBLE
ld1 {v4.2d}, [A_PTR], #16
ld1 {v5.2d}, [Y_IPTR], #16
ext v6.16b, v4.16b, v4.16b, #8
fmla v5.2d, v2.2d, v4.2d
fmla v5.2d, v3.2d, v6.2d
st1 {v5.2d}, [Y_OPTR], #16
#endif
.endm
.macro INIT_S
lsl INC_Y, INC_Y, #SHZ
.endm
.macro KERNEL_S1
#if !defined(DOUBLE)
ld1 {v4.2s}, [A_PTR], #8
ld1 {v5.2s}, [Y_IPTR], INC_Y
ext v6.8b, v4.8b, v4.8b, #4
fmla v5.2s, v2.2s, v4.2s
fmla v5.2s, v3.2s, v6.2s
st1 {v5.2s}, [Y_OPTR], INC_Y
#else // DOUBLE
ld1 {v4.2d}, [A_PTR], #16
ld1 {v5.2d}, [Y_IPTR], INC_Y
ext v6.16b, v4.16b, v4.16b, #8
fmla v5.2d, v2.2d, v4.2d
fmla v5.2d, v3.2d, v6.2d
st1 {v5.2d}, [Y_OPTR], INC_Y
#endif
.endm
/*******************************************************************************
* End of macro definitions
*******************************************************************************/
PROLOGUE
ldr INC_Y, [sp]
SAVE_REGS
cmp N, xzr
ble zgemv_n_kernel_L999
cmp M, xzr
ble zgemv_n_kernel_L999
lsl LDA, LDA, #SHZ
lsl INC_X, INC_X, #SHZ
mov J, N
INIT
cmp INC_Y, #1
bne zgemv_n_kernel_S_BEGIN
zgemv_n_kernel_F_LOOP:
mov A_PTR, A
mov Y_IPTR, Y
mov Y_OPTR, Y
mov X_PTR, X
add X, X, INC_X
INIT_LOOP
asr I, M, #2
cmp I, xzr
beq zgemv_n_kernel_F1
zgemv_n_kernel_F4:
KERNEL_F1
KERNEL_F1
KERNEL_F1
KERNEL_F1
subs I, I, #1
bne zgemv_n_kernel_F4
zgemv_n_kernel_F1:
ands I, M, #3
ble zgemv_n_kernel_F_END
zgemv_n_kernel_F10:
KERNEL_F1
subs I, I, #1
bne zgemv_n_kernel_F10
zgemv_n_kernel_F_END:
add A, A, LDA
subs J, J, #1
bne zgemv_n_kernel_F_LOOP
b zgemv_n_kernel_L999
zgemv_n_kernel_S_BEGIN:
INIT_S
zgemv_n_kernel_S_LOOP:
mov A_PTR, A
mov Y_IPTR, Y
mov Y_OPTR, Y
mov X_PTR, X
add X, X, INC_X
INIT_LOOP
asr I, M, #2
cmp I, xzr
ble zgemv_n_kernel_S1
zgemv_n_kernel_S4:
KERNEL_S1
KERNEL_S1
KERNEL_S1
KERNEL_S1
subs I, I, #1
bne zgemv_n_kernel_S4
zgemv_n_kernel_S1:
ands I, M, #3
ble zgemv_n_kernel_S_END
zgemv_n_kernel_S10:
KERNEL_S1
subs I, I, #1
bne zgemv_n_kernel_S10
zgemv_n_kernel_S_END:
add A, A, LDA
subs J, J, #1
bne zgemv_n_kernel_S_LOOP
zgemv_n_kernel_L999:
RESTORE_REGS
mov w0, wzr
ret
EPILOGUE
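
Stripped of the vector shuffles and the CONJ/XCONJ sign handling, the column loop above is the standard "gemv N" update; a scalar sketch (illustrative, double-complex, conjugation variants omitted). Note the kernel only accumulates into y; it does not apply any beta scaling itself.

#include <complex.h>

/* Scalar equivalent of the zgemv_n kernel: for each column j,
   temp = alpha * x[j], then y[0..m-1] += temp * A[:,j] (column-major, stride lda). */
static void zgemv_n_ref(long m, long n, double complex alpha,
                        const double complex *a, long lda,
                        const double complex *x, long inc_x,
                        double complex *y, long inc_y)
{
    if (m <= 0 || n <= 0) return;
    for (long j = 0; j < n; j++) {
        double complex temp = alpha * x[j * inc_x];   /* INIT_LOOP */
        for (long i = 0; i < m; i++)
            y[i * inc_y] += temp * a[i + j * lda];    /* KERNEL_F4 / KERNEL_F1 */
    }
}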

448
kernel/arm64/zgemv_t.S Normal file
View File

@ -0,0 +1,448 @@
/*******************************************************************************
Copyright (c) 2015, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#define ASSEMBLER
#include "common.h"
#define M x0 /* Y vector length */
#define N x1 /* X vector length */
#define A x3 /* A vector address */
#define LDA x4 /* A stride */
#define X x5 /* X vector address */
#define INC_X x6 /* X stride */
#define Y x7 /* Y vector address */
#define INC_Y x2 /* Y stride */
#define A_PTR x9 /* loop A vector address */
#define X_PTR x10 /* loop Y vector address */
#define J x11 /* loop variable */
#define I x12 /* loop variable */
/*******************************************************************************
* Macro definitions
*******************************************************************************/
#if !defined(DOUBLE)
#define ALPHA_R s0
#define ALPHA_I s1
#define ALPHA_R_COPY s7
#define ALPHA_I_COPY s8
#define SHZ 3
#else
#define ALPHA_R d0
#define ALPHA_I d1
#define ALPHA_R_COPY d7
#define ALPHA_I_COPY d8
#define SHZ 4
#endif
/******************************************************************************/
.macro SAVE_REGS
add sp, sp, #-(11 * 16)
stp d8, d9, [sp, #(0 * 16)]
stp d10, d11, [sp, #(1 * 16)]
stp d12, d13, [sp, #(2 * 16)]
stp d14, d15, [sp, #(3 * 16)]
stp d16, d17, [sp, #(4 * 16)]
stp x18, x19, [sp, #(5 * 16)]
stp x20, x21, [sp, #(6 * 16)]
stp x22, x23, [sp, #(7 * 16)]
stp x24, x25, [sp, #(8 * 16)]
stp x26, x27, [sp, #(9 * 16)]
str x28, [sp, #(10 * 16)]
.endm
.macro RESTORE_REGS
ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)]
ldp d12, d13, [sp, #(2 * 16)]
ldp d14, d15, [sp, #(3 * 16)]
ldp d16, d17, [sp, #(4 * 16)]
ldp x18, x19, [sp, #(5 * 16)]
ldp x20, x21, [sp, #(6 * 16)]
ldp x22, x23, [sp, #(7 * 16)]
ldp x24, x25, [sp, #(8 * 16)]
ldp x26, x27, [sp, #(9 * 16)]
ldr x28, [sp, #(10 * 16)]
add sp, sp, #(11*16)
.endm
.macro INIT
#if !defined(XCONJ)
#if !defined(DOUBLE)
ins v0.s[1], v0.s[0] // v0 = ALPHA_R, ALPHA_R
fneg s2, ALPHA_I
ins v1.s[1], v2.s[0]
ext v1.8b, v1.8b, v1.8b, #4 // v1 = ALPHA_I, -ALPHA_I
#else
ins v0.d[1], v0.d[0] // v0 = ALPHA_R, ALPHA_R
fneg d2, ALPHA_I
ins v1.d[1], v2.d[0]
ext v1.16b, v1.16b, v1.16b, #8 // v1 = ALPHA_I, -ALPHA_I
#endif
#else // XCONJ
#if !defined(DOUBLE)
fneg s2, ALPHA_R
ins v0.s[1], v2.s[0] // v0 = -ALPHA_R, ALPHA_R
ins v1.s[1], v1.s[0] // v1 = ALPHA_I, ALPHA_I
#else
fneg d2, ALPHA_R
ins v0.d[1], v2.d[0] // v0 = -ALPHA_R, ALPHA_R
ins v1.d[1], v1.d[0] // v1 = ALPHA_I, ALPHA_I
#endif
#endif
.endm
.macro INIT_LOOP
fmov d9, xzr // TEMP_R = [0, 0]
fmov d10, xzr // TEMP_I = [0, 0]
#if !defined(DOUBLE)
#else
fmov d15, xzr // TEMP_R = [0, 0]
fmov d16, xzr // TEMP_I = [0, 0]
#endif
fmov d2, xzr // TEMP = [0, 0]
.endm
.macro KERNEL_F4
#if !defined(DOUBLE)
ld2 {v11.4s, v12.4s}, [X_PTR], #32
ld2 {v13.4s, v14.4s}, [A_PTR], #32
#if !defined(CONJ)
#if !defined(XCONJ)
fmla v9.4s, v11.4s, v13.4s // [+ R(X) * A_R]
fmls v9.4s, v12.4s, v14.4s // [- I(X) * A_I]
fmla v10.4s, v11.4s, v14.4s // [+ R(X) * A_I]
fmla v10.4s, v12.4s, v13.4s // [+ I(X) * A_R]
#else
fmla v9.4s, v11.4s, v13.4s // [+ R(X) * A_R]
fmla v9.4s, v12.4s, v14.4s // [+ I(X) * A_I]
fmla v10.4s, v11.4s, v14.4s // [+ R(X) * A_I]
fmls v10.4s, v12.4s, v13.4s // [- I(X) * A_R]
#endif
#else // CONJ
#if !defined(XCONJ)
fmla v9.4s, v11.4s, v13.4s // [+ R(X) * A_R]
fmla v9.4s, v12.4s, v14.4s // [+ I(X) * A_I]
fmls v10.4s, v11.4s, v14.4s // [- R(X) * A_I]
fmla v10.4s, v12.4s, v13.4s // [+ I(X) * A_R]
#else
fmla v9.4s, v11.4s, v13.4s // [+ R(X) * A_R]
fmls v9.4s, v12.4s, v14.4s // [- I(X) * A_I]
fmls v10.4s, v11.4s, v14.4s // [- R(X) * A_I]
fmls v10.4s, v12.4s, v13.4s // [- I(X) * A_R]
#endif
#endif // CONJ
#else // DOUBLE
ld2 {v11.2d, v12.2d}, [X_PTR], #32
ld2 {v13.2d, v14.2d}, [A_PTR], #32
prfm PLDL1STRM, [X_PTR, #512]
#if !defined(CONJ)
#if !defined(XCONJ)
fmla v9.2d, v11.2d, v13.2d // [+ R(X) * A_R]
fmls v9.2d, v12.2d, v14.2d // [- I(X) * A_I]
fmla v10.2d, v11.2d, v14.2d // [+ R(X) * A_I]
fmla v10.2d, v12.2d, v13.2d // [+ I(X) * A_R]
#else
fmla v9.2d, v11.2d, v13.2d // [+ R(X) * A_R]
fmla v9.2d, v12.2d, v14.2d // [+ I(X) * A_I]
fmla v10.2d, v11.2d, v14.2d // [+ R(X) * A_I]
fmls v10.2d, v12.2d, v13.2d // [- I(X) * A_R]
#endif
#else // CONJ
#if !defined(XCONJ)
fmla v9.2d, v11.2d, v13.2d // [+ R(X) * A_R]
fmla v9.2d, v12.2d, v14.2d // [+ I(X) * A_I]
fmls v10.2d, v11.2d, v14.2d // [- R(X) * A_I]
fmla v10.2d, v12.2d, v13.2d // [+ I(X) * A_R]
#else
fmla v9.2d, v11.2d, v13.2d // [+ R(X) * A_R]
fmls v9.2d, v12.2d, v14.2d // [- I(X) * A_I]
fmls v10.2d, v11.2d, v14.2d // [- R(X) * A_I]
fmls v10.2d, v12.2d, v13.2d // [- I(X) * A_R]
#endif
#endif // CONJ
ld2 {v17.2d, v18.2d}, [X_PTR], #32
ld2 {v19.2d, v20.2d}, [A_PTR], #32
prfm PLDL1STRM, [A_PTR, #512]
#if !defined(CONJ)
#if !defined(XCONJ)
fmla v15.2d, v17.2d, v19.2d // [+ R(X) * A_R]
fmls v15.2d, v18.2d, v20.2d // [- I(X) * A_I]
fmla v16.2d, v17.2d, v20.2d // [+ R(X) * A_I]
fmla v16.2d, v18.2d, v19.2d // [+ I(X) * A_R]
#else
fmla v15.2d, v17.2d, v19.2d // [+ R(X) * A_R]
fmla v15.2d, v18.2d, v20.2d // [+ I(X) * A_I]
fmla v16.2d, v17.2d, v20.2d // [+ R(X) * A_I]
fmls v16.2d, v18.2d, v19.2d // [- I(X) * A_R]
#endif
#else // CONJ
#if !defined(XCONJ)
fmla v15.2d, v17.2d, v19.2d // [+ R(X) * A_R]
fmla v15.2d, v18.2d, v20.2d // [+ I(X) * A_I]
fmls v16.2d, v17.2d, v20.2d // [- R(X) * A_I]
fmla v16.2d, v18.2d, v19.2d // [+ I(X) * A_R]
#else
fmla v15.2d, v17.2d, v19.2d // [+ R(X) * A_R]
fmls v15.2d, v18.2d, v20.2d // [- I(X) * A_I]
fmls v16.2d, v17.2d, v20.2d // [- R(X) * A_I]
fmls v16.2d, v18.2d, v19.2d // [- I(X) * A_R]
#endif
#endif // CONJ
#endif //DOUBLE
.endm
.macro KERNEL_F4_FINALIZE
#if !defined(DOUBLE)
ext v21.16b, v9.16b, v9.16b, #8
fadd v9.2s, v9.2s, v21.2s
faddp s9, v9.2s
ext v21.16b, v10.16b, v10.16b, #8
fadd v10.2s, v10.2s, v21.2s
faddp s10, v10.2s
ins v2.s[0], v9.s[0]
ins v2.s[1], v10.s[0]
#else
fadd v9.2d, v9.2d, v15.2d
fadd v10.2d, v10.2d, v16.2d
faddp d9, v9.2d
faddp d10, v10.2d
ins v2.d[0], v9.d[0]
ins v2.d[1], v10.d[0]
#endif
.endm
.macro KERNEL_F1
#if !defined(DOUBLE)
ld1r {v4.2s}, [A_PTR], #4 // [A0, A0]
ld1 {v5.s}[0], [A_PTR], #4 // A1
ld1 {v6.2s}, [X_PTR], #8 // [X1, X0]
fneg s16, s5
ins v5.s[1], v16.s[0] // [-A1, A1]
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
ext v5.8b, v5.8b, v5.8b, #4 // [A1, -A1]
#endif
ext v7.8b, v6.8b, v6.8b, #4 // [X0, X1]
fmla v2.2s, v4.2s, v6.2s
fmla v2.2s, v5.2s, v7.2s
#else // DOUBLE
ld1r {v4.2d}, [A_PTR], #8 // [A0, A0]
ld1 {v5.d}[0], [A_PTR], #8 // A1
ld1 {v6.2d}, [X_PTR], #16 // [X1, X0]
fneg d16, d5
ins v5.d[1], v16.d[0] // [-A1, A1]
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
ext v5.16b, v5.16b, v5.16b, #8 // [A1, -A1]
#endif
ext v7.16b, v6.16b, v6.16b, #8 // [X0, X1]
fmla v2.2d, v4.2d, v6.2d
fmla v2.2d, v5.2d, v7.2d
#endif
.endm
.macro INIT_S
lsl INC_X, INC_X, #SHZ
.endm
.macro KERNEL_S1
#if !defined(DOUBLE)
ld1r {v4.2s}, [A_PTR], #4 // [A0, A0]
ld1 {v5.s}[0], [A_PTR], #4 // A1
ld1 {v6.2s}, [X_PTR], INC_X // [X1, X0]
fneg s16, s5
ins v5.s[1], v16.s[0] // [-A1, A1]
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
ext v5.8b, v5.8b, v5.8b, #4 // [A1, -A1]
#endif
ext v7.8b, v6.8b, v6.8b, #4 // [X0, X1]
fmla v2.2s, v4.2s, v6.2s
fmla v2.2s, v5.2s, v7.2s
#else // DOUBLE
ld1r {v4.2d}, [A_PTR], #8 // [A0, A0]
ld1 {v5.d}[0], [A_PTR], #8 // A1
ld1 {v6.2d}, [X_PTR], INC_X // [X1, X0]
fneg d16, d5
ins v5.d[1], v16.d[0] // [-A1, A1]
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
ext v5.16b, v5.16b, v5.16b, #8 // [A1, -A1]
#endif
ext v7.16b, v6.16b, v6.16b, #8 // [X0, X1]
fmla v2.2d, v4.2d, v6.2d
fmla v2.2d, v5.2d, v7.2d
#endif
.endm
/*******************************************************************************
* End of macro definitions
*******************************************************************************/
PROLOGUE
ldr INC_Y, [sp]
SAVE_REGS
cmp N, xzr
ble zgemv_t_kernel_L999
cmp M, xzr
ble zgemv_t_kernel_L999
lsl LDA, LDA, #SHZ
lsl INC_Y, INC_Y, #SHZ
mov J, N
INIT
cmp INC_X, #1
bne zgemv_t_kernel_S_BEGIN
zgemv_t_kernel_F_LOOP:
mov A_PTR, A
mov X_PTR, X
INIT_LOOP
asr I, M, #2
cmp I, xzr
beq zgemv_t_kernel_F1
zgemv_t_kernel_F4:
KERNEL_F4
subs I, I, #1
bne zgemv_t_kernel_F4
KERNEL_F4_FINALIZE
zgemv_t_kernel_F1:
ands I, M, #3
ble zgemv_t_kernel_F_END
zgemv_t_kernel_F10:
KERNEL_F1
subs I, I, #1
bne zgemv_t_kernel_F10
zgemv_t_kernel_F_END:
#if !defined(DOUBLE)
ld1 {v4.2s}, [Y]
ext v3.8b, v2.8b, v2.8b, #4 // [TEMP_R, TEMP_I]
fmla v4.2s, v0.2s, v2.2s
fmla v4.2s, v1.2s, v3.2s
st1 {v4.2s}, [Y], INC_Y
#else // DOUBLE
ld1 {v4.2d}, [Y]
ext v3.16b, v2.16b, v2.16b, #8 // [TEMP_R, TEMP_I]
fmla v4.2d, v0.2d, v2.2d
fmla v4.2d, v1.2d, v3.2d
st1 {v4.2d}, [Y], INC_Y
#endif
add A, A, LDA
subs J, J, #1
bne zgemv_t_kernel_F_LOOP
b zgemv_t_kernel_L999
zgemv_t_kernel_S_BEGIN:
INIT_S
zgemv_t_kernel_S_LOOP:
mov A_PTR, A
mov X_PTR, X
INIT_LOOP
asr I, M, #2
cmp I, xzr
ble zgemv_t_kernel_S1
zgemv_t_kernel_S4:
KERNEL_S1
KERNEL_S1
KERNEL_S1
KERNEL_S1
subs I, I, #1
bne zgemv_t_kernel_S4
zgemv_t_kernel_S1:
ands I, M, #3
ble zgemv_t_kernel_S_END
zgemv_t_kernel_S10:
KERNEL_S1
subs I, I, #1
bne zgemv_t_kernel_S10
zgemv_t_kernel_S_END:
#if !defined(DOUBLE)
ld1 {v4.2s}, [Y]
ext v3.8b, v2.8b, v2.8b, #4 // [TEMP_R, TEMP_I]
fmla v4.2s, v0.2s, v2.2s
fmla v4.2s, v1.2s, v3.2s
st1 {v4.2s}, [Y], INC_Y
#else // DOUBLE
ld1 {v4.2d}, [Y]
ext v3.16b, v2.16b, v2.16b, #8 // [TEMP_R, TEMP_I]
fmla v4.2d, v0.2d, v2.2d
fmla v4.2d, v1.2d, v3.2d
st1 {v4.2d}, [Y], INC_Y
#endif
add A, A, LDA
subs J, J, #1
bne zgemv_t_kernel_S_LOOP
zgemv_t_kernel_L999:
RESTORE_REGS
mov w0, wzr
ret
EPILOGUE
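
The transposed counterpart reduces each column of A against x and scales the partial result by alpha before adding it to one element of y; a scalar sketch (illustrative, double-complex, conjugation variants omitted):

#include <complex.h>

/* Scalar equivalent of the zgemv_t kernel (column-major A, stride lda). */
static void zgemv_t_ref(long m, long n, double complex alpha,
                        const double complex *a, long lda,
                        const double complex *x, long inc_x,
                        double complex *y, long inc_y)
{
    if (m <= 0 || n <= 0) return;
    for (long j = 0; j < n; j++) {
        double complex temp = 0.0;                    /* INIT_LOOP */
        for (long i = 0; i < m; i++)
            temp += a[i + j * lda] * x[i * inc_x];    /* KERNEL_F4 / KERNEL_F1 */
        y[j * inc_y] += alpha * temp;                 /* zgemv_t_kernel_F_END */
    }
}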

228
kernel/arm64/znrm2.S Normal file
View File

@ -0,0 +1,228 @@
/*******************************************************************************
Copyright (c) 2015, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#define ASSEMBLER
#include "common.h"
#define N x0 /* vector length */
#define X x1 /* X vector address */
#define INC_X x2 /* X stride */
#define I x5 /* loop variable */
/*******************************************************************************
* Macro definitions
*******************************************************************************/
#if !defined(DOUBLE)
#define TMPF s6
#define SSQ s0
#define TMPVF {v6.s}[0]
#define SZ 4
#else
#define TMPF d6
#define SSQ d0
#define TMPVF {v6.d}[0]
#define SZ 8
#endif
/******************************************************************************/
.macro KERNEL_F1
#if !defined(DOUBLE)
ld1 {v1.2s}, [X], #8
fmul v1.2s, v1.2s, v1.2s
faddp TMPF, v1.2s
fadd SSQ, SSQ, TMPF
#else
ld1 {v1.2d}, [X], #16
fmul v1.2d, v1.2d, v1.2d
faddp TMPF, v1.2d
fadd SSQ, SSQ, TMPF
#endif
.endm
.macro KERNEL_F8
#if !defined(DOUBLE)
ld1 {v1.4s, v2.4s}, [X], #32
fmla v0.4s, v1.4s, v1.4s
fmla v5.4s, v2.4s, v2.4s
ld1 {v3.4s,v4.4s}, [X], #32
fmla v0.4s, v3.4s, v3.4s
fmla v5.4s, v4.4s, v4.4s
PRFM PLDL1KEEP, [X, #1024]
#else // DOUBLE
ld1 {v1.2d, v2.2d}, [X], #32
fmla v0.2d, v1.2d, v1.2d
fmla v5.2d, v2.2d, v2.2d
ld1 {v3.2d, v4.2d}, [X], #32
fmla v0.2d, v3.2d, v3.2d
fmla v5.2d, v4.2d, v4.2d
ld1 {v16.2d, v17.2d}, [X], #32
fmla v0.2d, v16.2d, v16.2d
fmla v5.2d, v17.2d, v17.2d
ld1 {v18.2d, v19.2d}, [X], #32
fmla v0.2d, v18.2d, v18.2d
fmla v5.2d, v19.2d, v19.2d
#endif
.endm
.macro nrm2_kernel_F8_FINALIZE
#if !defined(DOUBLE)
fadd v0.4s, v0.4s, v5.4s
ext v1.16b, v0.16b, v0.16b, #8
fadd v0.2s, v0.2s, v1.2s
faddp SSQ, v0.2s
#else
fadd v0.2d, v0.2d, v5.2d
faddp SSQ, v0.2d
#endif
.endm
.macro INIT_S
#if !defined(DOUBLE)
lsl INC_X, INC_X, #3
ld1 {v1.2s}, [X], INC_X
fmul v1.2s, v1.2s, v1.2s
faddp SSQ, v1.2s
#else
lsl INC_X, INC_X, #4
ld1 {v1.2d}, [X], INC_X
fmul v1.2d, v1.2d, v1.2d
faddp SSQ, v1.2d
#endif
.endm
.macro KERNEL_S1
#if !defined(DOUBLE)
ld1 {v1.2s}, [X], INC_X
fmul v1.2s, v1.2s, v1.2s
faddp TMPF, v1.2s
fadd SSQ, SSQ, TMPF
#else
ld1 {v1.2d}, [X], INC_X
fmul v1.2d, v1.2d, v1.2d
faddp TMPF, v1.2d
fadd SSQ, SSQ, TMPF
#endif
.endm
/*******************************************************************************
* End of macro definitions
*******************************************************************************/
PROLOGUE
#if !defined(DOUBLE)
fmov SSQ, wzr
fmov s5, SSQ
#else
fmov SSQ, xzr
fmov d5, SSQ
#endif
cmp N, xzr
ble nrm2_kernel_zero
cmp INC_X, xzr
ble nrm2_kernel_zero
cmp INC_X, #1
bne nrm2_kernel_S_BEGIN
nrm2_kernel_F_BEGIN:
asr I, N, #3
cmp I, xzr
beq nrm2_kernel_F1_INIT
nrm2_kernel_F8:
KERNEL_F8
subs I, I, #1
bne nrm2_kernel_F8
nrm2_kernel_F8_FINALIZE
nrm2_kernel_F1:
ands I, N, #7
ble nrm2_kernel_L999
nrm2_kernel_F10:
KERNEL_F1
subs I, I, #1
bne nrm2_kernel_F10
b nrm2_kernel_L999
nrm2_kernel_F1_INIT:
b nrm2_kernel_F1
nrm2_kernel_S_BEGIN:
INIT_S
subs N, N, #1
ble nrm2_kernel_L999
asr I, N, #2
cmp I, xzr
ble nrm2_kernel_S1
nrm2_kernel_S4:
KERNEL_S1
KERNEL_S1
KERNEL_S1
KERNEL_S1
subs I, I, #1
bne nrm2_kernel_S4
nrm2_kernel_S1:
ands I, N, #3
ble nrm2_kernel_L999
nrm2_kernel_S10:
KERNEL_S1
subs I, I, #1
bne nrm2_kernel_S10
nrm2_kernel_L999:
fsqrt SSQ, SSQ
ret
nrm2_kernel_zero:
ret
EPILOGUE
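For reference, a hedged C equivalent of what this kernel computes: the Euclidean norm of an n-element complex vector, i.e. the square root of the sum of squares of all 2*n real components. Note that the assembly above accumulates raw squares and applies a single fsqrt at the end, so unlike the netlib znrm2 it does not rescale to protect the intermediate sum against overflow or underflow. The routine name below is illustrative:

#include <math.h>

double znrm2_ref(int n, const double *x, int inc_x)
{
    double ssq = 0.0;
    for (int i = 0; i < n; i++) {
        const double *p = x + 2 * i * inc_x;   /* interleaved (re, im) pairs */
        ssq += p[0] * p[0] + p[1] * p[1];
    }
    return sqrt(ssq);
}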

256
kernel/arm64/zrot.S Normal file

@ -0,0 +1,256 @@
/*******************************************************************************
Copyright (c) 2015, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#define ASSEMBLER
#include "common.h"
#define N x0 /* vector length */
#define X x1 /* X vector address */
#define INC_X x2 /* X stride */
#define Y x3 /* Y vector address */
#define INC_Y x4 /* Y stride */
#define I x5 /* loop variable */
/*******************************************************************************
* Macro definitions
*******************************************************************************/
#if !defined(DOUBLE)
#define C s0 /* scale input value */
#define S s1 /* scale input value */
#else
#define C d0 /* scale input value */
#define S d1 /* scale input value */
#endif
/******************************************************************************/
.macro INIT
#if !defined(DOUBLE)
ins v0.s[1], v0.s[0] // [C, C]
ins v1.s[1], v1.s[0] // [S, S]
#else
ins v0.d[1], v0.d[0] // [C, C]
ins v1.d[1], v1.d[0] // [S, S]
#endif
.endm
.macro KERNEL_F1
#if !defined(DOUBLE)
ld1 {v2.2s}, [X]
ld1 {v3.2s}, [Y]
fmul v4.2s, v0.2s, v2.2s // [C*X1, C*X0]
fmla v4.2s, v1.2s, v3.2s // [C*X1 + S*Y1, C*X0 + S*Y0]
fmul v5.2s, v0.2s, v3.2s // [C*Y1, C*Y0]
fmls v5.2s, v1.2s, v2.2s // [C*Y1 - S*X1, C*Y0 - S*X0]
st1 {v4.2s}, [X], #8
st1 {v5.2s}, [Y], #8
#else
ld1 {v2.2d}, [X]
ld1 {v3.2d}, [Y]
fmul v4.2d, v0.2d, v2.2d // [C*X1, C*X0]
fmla v4.2d, v1.2d, v3.2d // [C*X1 + S*Y1, C*X0 + S*Y0]
fmul v5.2d, v0.2d, v3.2d // [C*Y1, C*Y0]
fmls v5.2d, v1.2d, v2.2d // [C*Y1 - S*X1, C*Y0 - S*X0]
st1 {v4.2d}, [X], #16
st1 {v5.2d}, [Y], #16
#endif
.endm
.macro KERNEL_INIT_F4
#if !defined(DOUBLE)
ins v0.d[1], v0.d[0] // [C, C, C, C]
ins v1.d[1], v1.d[0] // [S, S, S, S]
#endif
.endm
.macro KERNEL_F4
#if !defined(DOUBLE)
ld1 {v2.4s, v3.4s}, [X]
ld1 {v4.4s, v5.4s}, [Y]
fmul v6.4s, v0.4s, v2.4s // C*X3, C*X2, C*X1, C*X0
fmul v7.4s, v0.4s, v3.4s // C*X7, C*X6, C*X5, C*X4
fmla v6.4s, v1.4s, v4.4s // C*X3+S*Y3, ..., C*X0+S*Y0
fmla v7.4s, v1.4s, v5.4s // C*X7+S*Y7, ..., C*X4+S*Y4
fmul v16.4s, v0.4s, v4.4s // C*Y3, C*Y2, C*Y1, C*Y0
fmul v17.4s, v0.4s, v5.4s // C*Y7, C*Y6, C*Y5, C*Y4
fmls v16.4s, v1.4s, v2.4s // C*Y3-S*X3, ..., C*Y0-S*X0
fmls v17.4s, v1.4s, v3.4s // C*Y7-S*X7, ..., C*Y4-S*X4
st1 {v6.4s,v7.4s}, [X], #32
st1 {v16.4s,v17.4s}, [Y], #32
#else // DOUBLE
ld1 {v2.2d, v3.2d}, [X]
ld1 {v4.2d, v5.2d}, [Y]
fmul v6.2d, v0.2d, v2.2d // C*X3, C*X2, C*X1, C*X0
fmul v7.2d, v0.2d, v3.2d // C*X7, C*X6, C*X5, C*X4
fmla v6.2d, v1.2d, v4.2d // C*X3+S*Y3, ..., C*X0+S*Y0
fmla v7.2d, v1.2d, v5.2d // C*X7+S*Y7, ..., C*X4+S*Y4
fmul v16.2d, v0.2d, v4.2d // C*Y3, C*Y2, C*Y1, C*Y0
fmul v17.2d, v0.2d, v5.2d // C*Y7, C*Y6, C*Y5, C*Y4
fmls v16.2d, v1.2d, v2.2d // C*Y3-S*X3, ..., C*Y0-S*X0
fmls v17.2d, v1.2d, v3.2d // C*Y7-S*X7, ..., C*Y4-S*X4
st1 {v6.2d,v7.2d}, [X], #32
st1 {v16.2d,v17.2d}, [Y], #32
ld1 {v2.2d, v3.2d}, [X]
ld1 {v4.2d, v5.2d}, [Y]
fmul v6.2d, v0.2d, v2.2d // C*X3, C*X2, C*X1, C*X0
fmul v7.2d, v0.2d, v3.2d // C*X7, C*X6, C*X5, C*X4
fmla v6.2d, v1.2d, v4.2d // C*X3+S*Y3, ..., C*X0+S*Y0
fmla v7.2d, v1.2d, v5.2d // C*X7+S*Y7, ..., C*X4+S*Y4
fmul v16.2d, v0.2d, v4.2d // C*Y3, C*Y2, C*Y1, C*Y0
fmul v17.2d, v0.2d, v5.2d // C*Y7, C*Y6, C*Y5, C*Y4
fmls v16.2d, v1.2d, v2.2d // C*Y3-S*X3, ..., C*Y0-S*X0
fmls v17.2d, v1.2d, v3.2d // C*Y7-S*X7, ..., C*Y4-S*X4
st1 {v6.2d,v7.2d}, [X], #32
st1 {v16.2d,v17.2d}, [Y], #32
#endif
.endm
.macro INIT_S
#if !defined(DOUBLE)
lsl INC_X, INC_X, #3
lsl INC_Y, INC_Y, #3
#else
lsl INC_X, INC_X, #4
lsl INC_Y, INC_Y, #4
#endif
.endm
.macro KERNEL_S1
#if !defined(DOUBLE)
ld1 {v2.2s}, [X]
ld1 {v3.2s}, [Y]
fmul v4.2s, v0.2s, v2.2s // [C*X1, C*X0]
fmla v4.2s, v1.2s, v3.2s // [C*X1 + S*Y1, C*X0 + S*Y0]
fmul v5.2s, v0.2s, v3.2s // [C*Y1, C*Y0]
fmls v5.2s, v1.2s, v2.2s // [C*Y1 - S*X1, C*Y0 - S*X0]
st1 {v4.2s}, [X], INC_X
st1 {v5.2s}, [Y], INC_Y
#else
ld1 {v2.2d}, [X]
ld1 {v3.2d}, [Y]
fmul v4.2d, v0.2d, v2.2d // [C*X1, C*X0]
fmla v4.2d, v1.2d, v3.2d // [C*X1 + S*Y1, C*X0 + S*Y0]
fmul v5.2d, v0.2d, v3.2d // [C*Y1, C*Y0]
fmls v5.2d, v1.2d, v2.2d // [C*Y1 - S*X1, C*Y0 - S*X0]
st1 {v4.2d}, [X], INC_X
st1 {v5.2d}, [Y], INC_Y
#endif
.endm
/*******************************************************************************
* End of macro definitions
*******************************************************************************/
PROLOGUE
cmp N, xzr
ble rot_kernel_L999
INIT
cmp INC_X, #1
bne rot_kernel_S_BEGIN
cmp INC_Y, #1
bne rot_kernel_S_BEGIN
rot_kernel_F_BEGIN:
asr I, N, #2
cmp I, xzr
beq rot_kernel_F1
KERNEL_INIT_F4
rot_kernel_F4:
KERNEL_F4
subs I, I, #1
bne rot_kernel_F4
rot_kernel_F1:
ands I, N, #3
ble rot_kernel_L999
rot_kernel_F10:
KERNEL_F1
subs I, I, #1
bne rot_kernel_F10
mov w0, wzr
ret
rot_kernel_S_BEGIN:
INIT_S
asr I, N, #2
cmp I, xzr
ble rot_kernel_S1
rot_kernel_S4:
KERNEL_S1
KERNEL_S1
KERNEL_S1
KERNEL_S1
subs I, I, #1
bne rot_kernel_S4
rot_kernel_S1:
ands I, N, #3
ble rot_kernel_L999
rot_kernel_S10:
KERNEL_S1
subs I, I, #1
bne rot_kernel_S10
rot_kernel_L999:
mov w0, wzr
ret
EPILOGUE
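For reference, the rotation applied above acts on the real and imaginary components identically (c and s are real), matching the fmul/fmla/fmls comments in the macros. A hedged scalar equivalent, with an illustrative routine name:

void zrot_ref(int n, double *x, int inc_x, double *y, int inc_y,
              double c, double s)
{
    for (int i = 0; i < n; i++) {
        double *px = x + 2 * i * inc_x;
        double *py = y + 2 * i * inc_y;
        for (int k = 0; k < 2; k++) {          /* k = 0: real part, k = 1: imaginary part */
            double xt = c * px[k] + s * py[k];
            double yt = c * py[k] - s * px[k];
            px[k] = xt;
            py[k] = yt;
        }
    }
}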

274
kernel/arm64/zscal.S Normal file

@ -0,0 +1,274 @@
/*******************************************************************************
Copyright (c) 2015, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#define ASSEMBLER
#include "common.h"
#define N x0 /* vector length */
#define X x3 /* X vector address */
#define INC_X x4 /* X stride */
#define I x5 /* loop variable */
/*******************************************************************************
* Macro definitions
*******************************************************************************/
#if !defined(DOUBLE)
#define DA_R s0 /* real scale input value */
#define DA_I s1 /* imaginary scale input value */
#else
#define DA_R d0 /* real scale input value */
#define DA_I d1 /* imaginary scale input value */
#endif
/******************************************************************************/
.macro INIT
#if !defined(DOUBLE)
ins v0.s[1], v0.s[0] // v0 = DA_R, DA_R
fneg s2, DA_I
ins v1.s[1], v2.s[0] // v1 = -DA_I, DA_I
ext v1.8b, v1.8b, v1.8b, #4 // v1 = DA_I, -DA_I
#else
ins v0.d[1], v0.d[0] // v0 = DA_R, DA_R
fneg d2, DA_I
ins v1.d[1], v2.d[0] // v1 = -DA_I, DA_I
ext v1.16b, v1.16b, v1.16b, #8 // v1 = DA_I, -DA_I
#endif
.endm
.macro KERNEL_F1
#if !defined(DOUBLE)
ld1 {v2.2s}, [X] // X1, X0
ext v3.8b, v2.8b, v2.8b, #4 // X0, X1
fmul v2.2s, v2.2s, v0.2s // DA_R*X1, DA_R*X0
fmla v2.2s, v3.2s, v1.2s // DA_R*X1+DA_I*X0, DA_R*X0-DA_I*X1
st1 {v2.2s}, [X], #8
#else
ld1 {v2.2d}, [X] // X1, X0
ext v3.16b, v2.16b, v2.16b, #8 // X0, X1
fmul v2.2d, v2.2d, v0.2d // DA_R*X1, DA_R*X0
fmla v2.2d, v3.2d, v1.2d // DA_R*X1+DA_I*X0, DA_R*X0-DA_I*X1
st1 {v2.2d}, [X], #16
#endif
.endm
.macro KERNEL_INIT_F4
#if !defined(DOUBLE)
// Replicate the lower 2 floats into the upper 2 slots
ins v0.d[1], v0.d[0] // v0 = DA_R, DA_R, DA_R, DA_R
ins v1.d[1], v1.d[0] // v1 = DA_I, DA_I, DA_I, DA_I
#endif
.endm
.macro KERNEL_F4
#if !defined(DOUBLE)
ld1 {v2.4s,v3.4s}, [X] // V2 = X[3], X[2], X[1], X[0]
// V3 = X[7], X[6], X[5], X[4]
ext v6.8b, v2.8b, v2.8b, #4 // V6 = - , - , X[0], X[1]
ins v6.s[2], v2.s[3] // V6 = - , X[3], X[0], X[1]
ins v6.s[3], v2.s[2] // V6 = X[2], X[3], X[0], X[1]
fmul v2.4s, v0.4s, v2.4s // X'[ix] += DA_R * X[ix]
// X'[ix+1] += DA_R * X[ix+1]
fmla v2.4s, v1.4s, v6.4s // X'[ix] += -DA_I * X[ix+1]
// X'[ix+1] += DA_I * X[ix]
ext v7.8b, v3.8b, v3.8b, #4 // V7 = - , - , X[4], X[5]
ins v7.s[2], v3.s[3] // V7 = - , X[7], X[4], X[5]
ins v7.s[3], v3.s[2] // V7 = X[6], X[7], X[4], X[5]
fmul v3.4s, v0.4s, v3.4s // X'[ix] += DA_R * X[ix]
// X'[ix+1] += DA_R * X[ix+1]
fmla v3.4s, v1.4s, v7.4s // X'[ix] += -DA_I * X[ix+1]
// X'[ix+1] += DA_I * X[ix]
st1 {v2.4s,v3.4s}, [X], #32
#else // DOUBLE
ld1 {v2.2d,v3.2d,v4.2d,v5.2d}, [X] // CX0, CX1, CX2, CX3
ext v20.16b, v2.16b, v2.16b, #8 // X[ix], X[ix+1]
ext v21.16b, v3.16b, v3.16b, #8 // X[ix], X[ix+1]
ext v22.16b, v4.16b, v4.16b, #8 // X[ix], X[ix+1]
ext v23.16b, v5.16b, v5.16b, #8 // X[ix], X[ix+1]
fmul v2.2d, v0.2d, v2.2d
fmla v2.2d, v1.2d, v20.2d
fmul v3.2d, v0.2d, v3.2d
fmla v3.2d, v1.2d, v21.2d
st1 {v2.2d,v3.2d}, [X], #32
fmul v4.2d, v0.2d, v4.2d
fmla v4.2d, v1.2d, v22.2d
fmul v5.2d, v0.2d, v5.2d
fmla v5.2d, v1.2d, v23.2d
st1 {v4.2d,v5.2d}, [X], #32
#endif
PRFM PLDL1KEEP, [X, #1024]
.endm
.macro INIT_S
#if !defined(DOUBLE)
lsl INC_X, INC_X, #3
#else
lsl INC_X, INC_X, #4
#endif
.endm
.macro KERNEL_S1
#if !defined(DOUBLE)
ld1 {v2.2s}, [X] // X1, X0
ext v3.8b, v2.8b, v2.8b, #4 // X0, X1
fmul v2.2s, v2.2s, v0.2s // DA_R*X1, DA_R*X0
fmla v2.2s, v3.2s, v1.2s // DA_R*X1+DA_I*X0, DA_R*X0-DA_I*X1
st1 {v2.2s}, [X], INC_X
#else
ld1 {v2.2d}, [X] // X1, X0
ext v3.16b, v2.16b, v2.16b, #8 // X0, X1
fmul v2.2d, v2.2d, v0.2d // DA_R*X1, DA_R*X0
fmla v2.2d, v3.2d, v1.2d // DA_R*X1+DA_I*X0, DA_R*X0-DA_I*X1
st1 {v2.2d}, [X], INC_X
#endif
.endm
/*******************************************************************************
* End of macro definitions
*******************************************************************************/
PROLOGUE
cmp N, xzr
ble zscal_kernel_L999
fcmp DA_R, #0.0
bne zscal_kernel_1
fcmp DA_I, #0.0
beq zscal_kernel_zero
// TODO: special case DA_R == 0 && DA_I != 0
zscal_kernel_1:
// TODO: special case DA_R != 0 && DA_I == 0
INIT
cmp INC_X, #1
bne zscal_kernel_S_BEGIN
zscal_kernel_F_BEGIN:
asr I, N, #2
cmp I, xzr
beq zscal_kernel_F1
KERNEL_INIT_F4
zscal_kernel_F4:
KERNEL_F4
subs I, I, #1
bne zscal_kernel_F4
zscal_kernel_F1:
ands I, N, #3
ble zscal_kernel_L999
zscal_kernel_F10:
KERNEL_F1
subs I, I, #1
bne zscal_kernel_F10
mov w0, wzr
ret
zscal_kernel_S_BEGIN:
INIT_S
asr I, N, #2
cmp I, xzr
ble zscal_kernel_S1
zscal_kernel_S4:
KERNEL_S1
KERNEL_S1
KERNEL_S1
KERNEL_S1
subs I, I, #1
bne zscal_kernel_S4
zscal_kernel_S1:
ands I, N, #3
ble zscal_kernel_L999
zscal_kernel_S10:
KERNEL_S1
subs I, I, #1
bne zscal_kernel_S10
zscal_kernel_L999:
mov w0, wzr
ret
zscal_kernel_zero:
INIT_S
zscal_kernel_Z1:
stp DA_R, DA_I, [X]
add X, X, INC_X
subs N, N, #1
bne zscal_kernel_Z1
mov w0, wzr
ret
EPILOGUE
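For reference, the INIT/KERNEL macros above realize the complex scaling x[i] <- (da_r + i*da_i) * x[i] with a broadcast multiply by [da_r, da_r] followed by a fused multiply-add of the swapped (im, re) pair against [da_i, -da_i]. A hedged scalar equivalent, including the separate all-zero path taken when both parts of alpha are zero (routine name illustrative):

void zscal_ref(int n, double da_r, double da_i, double *x, int inc_x)
{
    if (da_r == 0.0 && da_i == 0.0) {          /* mirrors zscal_kernel_zero */
        for (int i = 0; i < n; i++) {
            x[2 * i * inc_x + 0] = 0.0;
            x[2 * i * inc_x + 1] = 0.0;
        }
        return;
    }
    for (int i = 0; i < n; i++) {
        double re = x[2 * i * inc_x + 0];
        double im = x[2 * i * inc_x + 1];
        x[2 * i * inc_x + 0] = da_r * re - da_i * im;   /* real lane of the fmul + fmla pair */
        x[2 * i * inc_x + 1] = da_r * im + da_i * re;   /* imaginary lane                    */
    }
}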

File diff suppressed because it is too large

883
kernel/generic/ztrmmkernel_4x4.c Executable file

@ -0,0 +1,883 @@
#include "common.h"
#define MADD_ALPHA_N_STORE(C, res, alpha) \
C[0] = res ## _r * alpha ## _r - res ## _i * alpha ## _i; \
C[1] = res ## _r * alpha ## _i + res ## _i * alpha ## _r;
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define MADD(res, op1, op2) \
res ## _r += op1 ## _r * op2 ## _r; \
res ## _r -= op1 ## _i * op2 ## _i; \
res ## _i += op1 ## _r * op2 ## _i; \
res ## _i += op1 ## _i * op2 ## _r;
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
#define MADD(res, op1, op2) \
res ## _r += op1 ## _r * op2 ## _r; \
res ## _r += op1 ## _i * op2 ## _i; \
res ## _i -= op1 ## _r * op2 ## _i; \
res ## _i += op1 ## _i * op2 ## _r;
#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
#define MADD(res, op1, op2) \
res ## _r += op1 ## _r * op2 ## _r; \
res ## _r += op1 ## _i * op2 ## _i; \
res ## _i += op1 ## _r * op2 ## _i; \
res ## _i -= op1 ## _i * op2 ## _r;
#elif defined(RR) || defined(RC) || defined(CR) || defined(CC)
#define MADD(res, op1, op2) \
res ## _r += op1 ## _r * op2 ## _r; \
res ## _r -= op1 ## _i * op2 ## _i; \
res ## _i -= op1 ## _r * op2 ## _i; \
res ## _i -= op1 ## _i * op2 ## _r;
#endif
int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha_r, FLOAT alpha_i,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc
, BLASLONG offset
)
{
BLASLONG i,j,k;
FLOAT *C0,*C1,*C2,*C3,*ptrba,*ptrbb;
FLOAT res00_r, res01_r, res02_r, res03_r;
FLOAT res00_i, res01_i, res02_i, res03_i;
FLOAT res10_r, res11_r, res12_r, res13_r;
FLOAT res10_i, res11_i, res12_i, res13_i;
FLOAT res20_r, res21_r, res22_r, res23_r;
FLOAT res20_i, res21_i, res22_i, res23_i;
FLOAT res30_r, res31_r, res32_r, res33_r;
FLOAT res30_i, res31_i, res32_i, res33_i;
FLOAT a0_r, a1_r;
FLOAT a0_i, a1_i;
FLOAT b0_r, b1_r, b2_r, b3_r;
FLOAT b0_i, b1_i, b2_i, b3_i;
BLASLONG off, temp;
#if defined(TRMMKERNEL) && !defined(LEFT)
off = -offset;
#endif
for (j=0; j<bn/4; j+=1) // do blocks of the Mx4 loops
{
C0 = C;
C1 = C0+2*ldc;
C2 = C1+2*ldc;
C3 = C2+2*ldc;
#if defined(TRMMKERNEL) && defined(LEFT)
off = offset;
#endif
ptrba = ba;
for (i=0; i<bm/4; i+=1) // do blocks of 4x4
{
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
ptrbb = bb;
#else
ptrba += off*4*2; // number of values in A
ptrbb = bb + off*4*2; // number of values in B
#endif
res00_r = 0;
res00_i = 0;
res01_r = 0;
res01_i = 0;
res02_r = 0;
res02_i = 0;
res03_r = 0;
res03_i = 0;
res10_r = 0;
res10_i = 0;
res11_r = 0;
res11_i = 0;
res12_r = 0;
res12_i = 0;
res13_r = 0;
res13_i = 0;
res20_r = 0;
res20_i = 0;
res21_r = 0;
res21_i = 0;
res22_r = 0;
res22_i = 0;
res23_r = 0;
res23_i = 0;
res30_r = 0;
res30_i = 0;
res31_r = 0;
res31_i = 0;
res32_r = 0;
res32_i = 0;
res33_r = 0;
res33_i = 0;
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
temp = bk - off;
#elif defined(LEFT)
temp = off + 4;
#else
temp = off + 4;
#endif
for (k=0; k<temp; k++)
{
b0_r = ptrbb[2*0+0]; b0_i = ptrbb[2*0+1];
b1_r = ptrbb[2*1+0]; b1_i = ptrbb[2*1+1];
b2_r = ptrbb[2*2+0]; b2_i = ptrbb[2*2+1];
b3_r = ptrbb[2*3+0]; b3_i = ptrbb[2*3+1];
a0_r = ptrba[2*0+0]; a0_i = ptrba[2*0+1];
MADD(res00, a0, b0);
MADD(res10, a0, b1);
MADD(res20, a0, b2);
MADD(res30, a0, b3);
a1_r = ptrba[2*1+0]; a1_i = ptrba[2*1+1];
MADD(res01, a1, b0);
MADD(res11, a1, b1);
MADD(res21, a1, b2);
MADD(res31, a1, b3);
a0_r = ptrba[2*2+0]; a0_i = ptrba[2*2+1];
MADD(res02, a0, b0);
MADD(res12, a0, b1);
MADD(res22, a0, b2);
MADD(res32, a0, b3);
a1_r = ptrba[2*3+0]; a1_i = ptrba[2*3+1];
MADD(res03, a1, b0);
MADD(res13, a1, b1);
MADD(res23, a1, b2);
MADD(res33, a1, b3);
ptrba = ptrba+8;
ptrbb = ptrbb+8;
}
MADD_ALPHA_N_STORE(C0, res00, alpha);
C0 = C0 + 2;
MADD_ALPHA_N_STORE(C0, res01, alpha);
C0 = C0 + 2;
MADD_ALPHA_N_STORE(C0, res02, alpha);
C0 = C0 + 2;
MADD_ALPHA_N_STORE(C0, res03, alpha);
C0 = C0 + 2;
MADD_ALPHA_N_STORE(C1, res10, alpha);
C1 = C1 + 2;
MADD_ALPHA_N_STORE(C1, res11, alpha);
C1 = C1 + 2;
MADD_ALPHA_N_STORE(C1, res12, alpha);
C1 = C1 + 2;
MADD_ALPHA_N_STORE(C1, res13, alpha);
C1 = C1 + 2;
MADD_ALPHA_N_STORE(C2, res20, alpha);
C2 = C2 + 2;
MADD_ALPHA_N_STORE(C2, res21, alpha);
C2 = C2 + 2;
MADD_ALPHA_N_STORE(C2, res22, alpha);
C2 = C2 + 2;
MADD_ALPHA_N_STORE(C2, res23, alpha);
C2 = C2 + 2;
MADD_ALPHA_N_STORE(C3, res30, alpha);
C3 = C3 + 2;
MADD_ALPHA_N_STORE(C3, res31, alpha);
C3 = C3 + 2;
MADD_ALPHA_N_STORE(C3, res32, alpha);
C3 = C3 + 2;
MADD_ALPHA_N_STORE(C3, res33, alpha);
C3 = C3 + 2;
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
temp = bk-off;
#if defined(LEFT)
temp = temp - 4;
#else
temp = temp - 4;
#endif
ptrba += temp*4*2; // number of values in A
ptrbb += temp*4*2; // number of values in B
#endif
#ifdef LEFT
off += 4; // number of values in A
#endif
}
if ( bm & 2 ) // do any 2x4 loop
{
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
ptrbb = bb;
#else
ptrba += off*2*2;
ptrbb = bb + off*4*2;
#endif
res00_r = 0;
res00_i = 0;
res01_r = 0;
res01_i = 0;
res10_r = 0;
res10_i = 0;
res11_r = 0;
res11_i = 0;
res20_r = 0;
res20_i = 0;
res21_r = 0;
res21_i = 0;
res30_r = 0;
res30_i = 0;
res31_r = 0;
res31_i = 0;
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
temp = bk-off;
#elif defined(LEFT)
temp = off+2; // number of values in A
#else
temp = off+4; // number of values in B
#endif
for (k=0; k<temp; k++)
{
b0_r = ptrbb[2*0+0]; b0_i = ptrbb[2*0+1];
b1_r = ptrbb[2*1+0]; b1_i = ptrbb[2*1+1];
b2_r = ptrbb[2*2+0]; b2_i = ptrbb[2*2+1];
b3_r = ptrbb[2*3+0]; b3_i = ptrbb[2*3+1];
a0_r = ptrba[2*0+0]; a0_i = ptrba[2*0+1];
MADD(res00, a0, b0);
MADD(res10, a0, b1);
MADD(res20, a0, b2);
MADD(res30, a0, b3);
a1_r = ptrba[2*1+0]; a1_i = ptrba[2*1+1];
MADD(res01, a1, b0);
MADD(res11, a1, b1);
MADD(res21, a1, b2);
MADD(res31, a1, b3);
ptrba = ptrba+4;
ptrbb = ptrbb+8;
}
MADD_ALPHA_N_STORE(C0, res00, alpha);
C0 = C0 + 2;
MADD_ALPHA_N_STORE(C0, res01, alpha);
C0 = C0 + 2;
MADD_ALPHA_N_STORE(C1, res10, alpha);
C1 = C1 + 2;
MADD_ALPHA_N_STORE(C1, res11, alpha);
C1 = C1 + 2;
MADD_ALPHA_N_STORE(C2, res20, alpha);
C2 = C2 + 2;
MADD_ALPHA_N_STORE(C2, res21, alpha);
C2 = C2 + 2;
MADD_ALPHA_N_STORE(C3, res30, alpha);
C3 = C3 + 2;
MADD_ALPHA_N_STORE(C3, res31, alpha);
C3 = C3 + 2;
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
temp = bk - off;
#ifdef LEFT
temp -= 2; // number of values in A
#else
temp -= 4; // number of values in B
#endif
ptrba += temp*2*2;
ptrbb += temp*4*2;
#endif
#ifdef LEFT
off += 2; // number of values in A
#endif
}
if ( bm & 1 ) // do any 1x4 loop
{
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
ptrbb = bb;
#else
ptrba += off*1*2;
ptrbb = bb + off*4*2;
#endif
res00_r = 0;
res00_i = 0;
res10_r = 0;
res10_i = 0;
res20_r = 0;
res20_i = 0;
res30_r = 0;
res30_i = 0;
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
temp = bk-off;
#elif defined(LEFT)
temp = off+1; // number of values in A
#else
temp = off+4; // number of values in B
#endif
for (k=0; k<temp; k++)
{
b0_r = ptrbb[2*0+0]; b0_i = ptrbb[2*0+1];
b1_r = ptrbb[2*1+0]; b1_i = ptrbb[2*1+1];
b2_r = ptrbb[2*2+0]; b2_i = ptrbb[2*2+1];
b3_r = ptrbb[2*3+0]; b3_i = ptrbb[2*3+1];
a0_r = ptrba[2*0+0]; a0_i = ptrba[2*0+1];
MADD(res00, a0, b0);
MADD(res10, a0, b1);
MADD(res20, a0, b2);
MADD(res30, a0, b3);
ptrba = ptrba+2;
ptrbb = ptrbb+8;
}
MADD_ALPHA_N_STORE(C0, res00, alpha);
C0 = C0 + 2;
MADD_ALPHA_N_STORE(C1, res10, alpha);
C1 = C1 + 2;
MADD_ALPHA_N_STORE(C2, res20, alpha);
C2 = C2 + 2;
MADD_ALPHA_N_STORE(C3, res30, alpha);
C3 = C3 + 2;
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
temp = bk - off;
#ifdef LEFT
temp -= 1; // number of values in A
#else
temp -= 4; // number of values in B
#endif
ptrba += temp*1*2;
ptrbb += temp*4*2;
#endif
#ifdef LEFT
off += 1; // number of values in A
#endif
}
#if defined(TRMMKERNEL) && !defined(LEFT)
off += 4;
#endif
k = (bk<<3);
bb = bb+k;
i = (ldc<<3);
C = C+i;
}
for (j=0; j<(bn&2); j+=2) // do the Mx2 loops
{
C0 = C;
C1 = C0+ldc*2;
#if defined(TRMMKERNEL) && defined(LEFT)
off = offset;
#endif
ptrba = ba;
for (i=0; i<bm/4; i+=1) // do blocks of 4x2
{
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
ptrbb = bb;
#else
ptrba += off*4*2;
ptrbb = bb + off*2*2;
#endif
res00_r = 0;
res00_i = 0;
res01_r = 0;
res01_i = 0;
res02_r = 0;
res02_i = 0;
res03_r = 0;
res03_i = 0;
res10_r = 0;
res10_i = 0;
res11_r = 0;
res11_i = 0;
res12_r = 0;
res12_i = 0;
res13_r = 0;
res13_i = 0;
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
temp = bk-off;
#elif defined(LEFT)
temp = off+4; // number of values in A
#else
temp = off+2; // number of values in B
#endif
for (k=0; k<temp; k++)
{
b0_r = ptrbb[2*0+0]; b0_i = ptrbb[2*0+1];
b1_r = ptrbb[2*1+0]; b1_i = ptrbb[2*1+1];
a0_r = ptrba[2*0+0]; a0_i = ptrba[2*0+1];
MADD(res00, a0, b0);
MADD(res10, a0, b1);
a1_r = ptrba[2*1+0]; a1_i = ptrba[2*1+1];
MADD(res01, a1, b0);
MADD(res11, a1, b1);
a0_r = ptrba[2*2+0]; a0_i = ptrba[2*2+1];
MADD(res02, a0, b0);
MADD(res12, a0, b1);
a1_r = ptrba[2*3+0]; a1_i = ptrba[2*3+1];
MADD(res03, a1, b0);
MADD(res13, a1, b1);
ptrba = ptrba+8;
ptrbb = ptrbb+4;
}
MADD_ALPHA_N_STORE(C0, res00, alpha);
C0 = C0 + 2;
MADD_ALPHA_N_STORE(C0, res01, alpha);
C0 = C0 + 2;
MADD_ALPHA_N_STORE(C0, res02, alpha);
C0 = C0 + 2;
MADD_ALPHA_N_STORE(C0, res03, alpha);
C0 = C0 + 2;
MADD_ALPHA_N_STORE(C1, res10, alpha);
C1 = C1 + 2;
MADD_ALPHA_N_STORE(C1, res11, alpha);
C1 = C1 + 2;
MADD_ALPHA_N_STORE(C1, res12, alpha);
C1 = C1 + 2;
MADD_ALPHA_N_STORE(C1, res13, alpha);
C1 = C1 + 2;
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
temp = bk - off;
#ifdef LEFT
temp -= 4; // number of values in A
#else
temp -= 2; // number of values in B
#endif
ptrba += temp*4*2;
ptrbb += temp*2*2;
#endif
#ifdef LEFT
off += 4; // number of values in A
#endif
}
if ( bm & 2 ) // do any 2x2 loop
{
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
ptrbb = bb;
#else
ptrba += off*2*2;
ptrbb = bb + off*2*2;
#endif
res00_r = 0;
res00_i = 0;
res01_r = 0;
res01_i = 0;
res10_r = 0;
res10_i = 0;
res11_r = 0;
res11_i = 0;
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
temp = bk-off;
#elif defined(LEFT)
temp = off+2; // number of values in A
#else
temp = off+2; // number of values in B
#endif
for (k=0; k<temp; k++)
{
b0_r = ptrbb[2*0+0]; b0_i = ptrbb[2*0+1];
b1_r = ptrbb[2*1+0]; b1_i = ptrbb[2*1+1];
a0_r = ptrba[2*0+0]; a0_i = ptrba[2*0+1];
MADD(res00, a0, b0);
MADD(res10, a0, b1);
a1_r = ptrba[2*1+0]; a1_i = ptrba[2*1+1];
MADD(res01, a1, b0);
MADD(res11, a1, b1);
ptrba = ptrba+4;
ptrbb = ptrbb+4;
}
MADD_ALPHA_N_STORE(C0, res00, alpha);
C0 = C0 + 2;
MADD_ALPHA_N_STORE(C0, res01, alpha);
C0 = C0 + 2;
MADD_ALPHA_N_STORE(C1, res10, alpha);
C1 = C1 + 2;
MADD_ALPHA_N_STORE(C1, res11, alpha);
C1 = C1 + 2;
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
temp = bk - off;
#ifdef LEFT
temp -= 2; // number of values in A
#else
temp -= 2; // number of values in B
#endif
ptrba += temp*2*2;
ptrbb += temp*2*2;
#endif
#ifdef LEFT
off += 2; // number of values in A
#endif
}
if ( bm & 1 ) // do any 1x2 loop
{
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
ptrbb = bb;
#else
ptrba += off*1*2;
ptrbb = bb + off*2*2;
#endif
res00_r = 0;
res00_i = 0;
res10_r = 0;
res10_i = 0;
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
temp = bk-off;
#elif defined(LEFT)
temp = off+1; // number of values in A
#else
temp = off+2; // number of values in B
#endif
for (k=0; k<temp; k++)
{
b0_r = ptrbb[2*0+0]; b0_i = ptrbb[2*0+1];
b1_r = ptrbb[2*1+0]; b1_i = ptrbb[2*1+1];
a0_r = ptrba[2*0+0]; a0_i = ptrba[2*0+1];
MADD(res00, a0, b0);
MADD(res10, a0, b1);
ptrba = ptrba+2;
ptrbb = ptrbb+4;
}
MADD_ALPHA_N_STORE(C0, res00, alpha);
C0 = C0 + 2;
MADD_ALPHA_N_STORE(C1, res10, alpha);
C1 = C1 + 2;
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
temp = bk - off;
#ifdef LEFT
temp -= 1; // number of values in A
#else
temp -= 2; // number of values in B
#endif
ptrba += temp*1*2;
ptrbb += temp*2*2;
#endif
#ifdef LEFT
off += 1; // number of values in A
#endif
}
#if defined(TRMMKERNEL) && !defined(LEFT)
off += 2;
#endif
k = (bk<<2);
bb = bb+k;
i = (ldc<<2);
C = C+i;
}
for (j=0; j<(bn&1); j+=1) // do the Mx1 loops
{
C0 = C;
#if defined(TRMMKERNEL) && defined(LEFT)
off = offset;
#endif
ptrba = ba;
for (i=0; i<bm/4; i+=1) // do blocks of 4x1 loops
{
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
ptrbb = bb;
#else
ptrba += off*4*2;
ptrbb = bb + off*1*2;
#endif
res00_r = 0;
res00_i = 0;
res01_r = 0;
res01_i = 0;
res02_r = 0;
res02_i = 0;
res03_r = 0;
res03_i = 0;
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
temp = bk-off;
#elif defined(LEFT)
temp = off+4; // number of values in A
#else
temp = off+1; // number of values in B
#endif
for (k=0; k<temp; k++)
{
b0_r = ptrbb[2*0+0]; b0_i = ptrbb[2*0+1];
a0_r = ptrba[2*0+0]; a0_i = ptrba[2*0+1];
MADD(res00, a0, b0);
a1_r = ptrba[2*1+0]; a1_i = ptrba[2*1+1];
MADD(res01, a1, b0);
a0_r = ptrba[2*2+0]; a0_i = ptrba[2*2+1];
MADD(res02, a0, b0);
a1_r = ptrba[2*3+0]; a1_i = ptrba[2*3+1];
MADD(res03, a1, b0);
ptrba = ptrba+8;
ptrbb = ptrbb+2;
}
MADD_ALPHA_N_STORE(C0, res00, alpha);
C0 = C0 + 2;
MADD_ALPHA_N_STORE(C0, res01, alpha);
C0 = C0 + 2;
MADD_ALPHA_N_STORE(C0, res02, alpha);
C0 = C0 + 2;
MADD_ALPHA_N_STORE(C0, res03, alpha);
C0 = C0 + 2;
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
temp = bk - off;
#ifdef LEFT
temp -= 4; // number of values in A
#else
temp -= 1; // number of values in B
#endif
ptrba += temp*4*2;
ptrbb += temp*1*2;
#endif
#ifdef LEFT
off += 4; // number of values in A
#endif
}
if ( bm & 2 ) // do any 2x1 loop
{
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
ptrbb = bb;
#else
ptrba += off*2*2;
ptrbb = bb + off*1*2;
#endif
res00_r = 0;
res00_i = 0;
res01_r = 0;
res01_i = 0;
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
temp = bk-off;
#elif defined(LEFT)
temp = off+2; // number of values in A
#else
temp = off+1; // number of values in B
#endif
for (k=0; k<temp; k++)
{
b0_r = ptrbb[2*0+0]; b0_i = ptrbb[2*0+1];
a0_r = ptrba[2*0+0]; a0_i = ptrba[2*0+1];
MADD(res00, a0, b0);
a1_r = ptrba[2*1+0]; a1_i = ptrba[2*1+1];
MADD(res01, a1, b0);
ptrba = ptrba+4;
ptrbb = ptrbb+2;
}
MADD_ALPHA_N_STORE(C0, res00, alpha);
C0 = C0 + 2;
MADD_ALPHA_N_STORE(C0, res01, alpha);
C0 = C0 + 2;
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
temp = bk - off;
#ifdef LEFT
temp -= 2; // number of values in A
#else
temp -= 1; // number of values in B
#endif
ptrba += temp*2*2;
ptrbb += temp*1*2;
#endif
#ifdef LEFT
off += 2; // number of values in A
#endif
}
if ( bm & 1 ) // do any 1x1 loop
{
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
ptrbb = bb;
#else
ptrba += off*1*2;
ptrbb = bb + off*1*2;
#endif
res00_r = 0;
res00_i = 0;
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
temp = bk-off;
#elif defined(LEFT)
temp = off+1; // number of values in A
#else
temp = off+1; // number of values in B
#endif
for (k=0; k<temp; k++)
{
b0_r = ptrbb[2*0+0]; b0_i = ptrbb[2*0+1];
a0_r = ptrba[2*0+0]; a0_i = ptrba[2*0+1];
MADD(res00, a0, b0);
ptrba = ptrba+2;
ptrbb = ptrbb+2;
}
MADD_ALPHA_N_STORE(C0, res00, alpha);
C0 = C0 + 2;
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
temp = bk - off;
#ifdef LEFT
temp -= 1; // number of values in A
#else
temp -= 1; // number of values in B
#endif
ptrba += temp*1*2;
ptrbb += temp*1*2;
#endif
#ifdef LEFT
off += 1; // number of values in A
#endif
}
#if defined(TRMMKERNEL) && !defined(LEFT)
off += 1;
#endif
k = (bk<<1);
bb = bb+k;
i = (ldc<<1);
C = C+i;
}
return 0;
}
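The four MADD variants defined at the top of this kernel accumulate, respectively, op1*op2, op1*conj(op2), conj(op1)*op2, and conj(op1)*conj(op2) in split real/imaginary form, and MADD_ALPHA_N_STORE writes alpha*res into C as two consecutive real words. A hedged, self-contained illustration of the first (plain) variant; the helper names here are illustrative, not part of the kernel:

#include <stdio.h>

typedef struct { double r, i; } zdouble;

/* res += a * b, written exactly like the NN/NT/TN/TT branch of MADD */
static void cmadd(zdouble *res, zdouble a, zdouble b)
{
    res->r += a.r * b.r;
    res->r -= a.i * b.i;
    res->i += a.r * b.i;
    res->i += a.i * b.r;
}

int main(void)
{
    zdouble acc = {0.0, 0.0};
    zdouble a = {1.0, 2.0}, b = {3.0, -4.0};
    cmadd(&acc, a, b);                 /* (1 + 2i)(3 - 4i) = 11 + 2i */
    printf("%g %+gi\n", acc.r, acc.i);
    return 0;
}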

40
param.h

@ -2214,6 +2214,46 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ZGEMM_DEFAULT_R 4096
#define SYMV_P 16
#endif
#if defined(CORTEXA57)
#define SNUMOPT 2
#define DNUMOPT 2
#define GEMM_DEFAULT_OFFSET_A 0
#define GEMM_DEFAULT_OFFSET_B 0
#define GEMM_DEFAULT_ALIGN 0x03fffUL
#define SGEMM_DEFAULT_UNROLL_M 4
#define SGEMM_DEFAULT_UNROLL_N 4
#define DGEMM_DEFAULT_UNROLL_M 4
#define DGEMM_DEFAULT_UNROLL_N 4
#define CGEMM_DEFAULT_UNROLL_M 4
#define CGEMM_DEFAULT_UNROLL_N 4
#define ZGEMM_DEFAULT_UNROLL_M 4
#define ZGEMM_DEFAULT_UNROLL_N 4
#define SGEMM_DEFAULT_P 128
#define DGEMM_DEFAULT_P 256
#define CGEMM_DEFAULT_P 256
#define ZGEMM_DEFAULT_P 128
#define SGEMM_DEFAULT_Q 240
#define DGEMM_DEFAULT_Q 1024
#define CGEMM_DEFAULT_Q 1024
#define ZGEMM_DEFAULT_Q 512
#define SGEMM_DEFAULT_R 12288
#define DGEMM_DEFAULT_R 4096
#define CGEMM_DEFAULT_R 4096
#define ZGEMM_DEFAULT_R 2048
#define SYMV_P 16
#endif
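In OpenBLAS's level-3 blocking scheme, P blocks the M dimension, Q the K dimension, and R the N dimension, so the packed A panel is on the order of P*Q elements. A back-of-the-envelope sketch of the working sets implied by the CORTEXA57 values above (illustrative only; how these map onto the Cortex-A57's L1/L2 depends on the SoC's actual cache configuration):

#include <stdio.h>

int main(void)
{
    printf("SGEMM A panel ~ %d KiB\n", 128 * 240  * 4  / 1024);   /* P*Q*sizeof(float)     ~  120 KiB */
    printf("DGEMM A panel ~ %d KiB\n", 256 * 1024 * 8  / 1024);   /* P*Q*sizeof(double)    = 2048 KiB */
    printf("CGEMM A panel ~ %d KiB\n", 256 * 1024 * 8  / 1024);   /* P*Q*2*sizeof(float)   = 2048 KiB */
    printf("ZGEMM A panel ~ %d KiB\n", 128 * 512  * 16 / 1024);   /* P*Q*2*sizeof(double)  = 1024 KiB */
    return 0;
}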