diff --git a/.gitignore b/.gitignore index 2c298e3b4..f5eb6ae4e 100644 --- a/.gitignore +++ b/.gitignore @@ -68,3 +68,4 @@ test/zblat2 test/zblat3 build build.* +*.swp diff --git a/Makefile.arm64 b/Makefile.arm64 index a4f8bab6b..b5170163f 100644 --- a/Makefile.arm64 +++ b/Makefile.arm64 @@ -4,4 +4,8 @@ CCOMMON_OPT += -march=armv8-a FCOMMON_OPT += -march=armv8-a endif +ifeq ($(CORE), CORTEXA57) +CCOMMON_OPT += -march=armv8-a+crc+crypto+fp+simd -mtune=cortex-a57 +FCOMMON_OPT += -march=armv8-a+crc+crypto+fp+simd -mtune=cortex-a57 +endif diff --git a/TargetList.txt b/TargetList.txt index b2878ba32..dc1e08722 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -74,3 +74,5 @@ ARMV5 7.ARM 64-bit CPU: ARMV8 +CORTEXA57 + diff --git a/benchmark/gemm.c b/benchmark/gemm.c index 9348018dc..9d661e648 100644 --- a/benchmark/gemm.c +++ b/benchmark/gemm.c @@ -172,7 +172,7 @@ int main(int argc, char *argv[]){ srandom(getpid()); #endif - for(j = 0; j < m; j++){ + for(j = 0; j < to; j++){ for(i = 0; i < to * COMPSIZE; i++){ a[i + j * to * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; b[i + j * to * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; diff --git a/common.h b/common.h index 7b81c6fb6..399d01cf2 100644 --- a/common.h +++ b/common.h @@ -86,6 +86,7 @@ extern "C" { #if !defined(_MSC_VER) #include #endif +#include #ifdef OS_LINUX #include diff --git a/common_arm64.h b/common_arm64.h index 15987c677..f21e89346 100644 --- a/common_arm64.h +++ b/common_arm64.h @@ -89,8 +89,10 @@ static inline int blas_quickdivide(blasint x, blasint y){ #if defined(ASSEMBLER) && !defined(NEEDPARAM) #define PROLOGUE \ + .text ;\ + .align 4 ;\ .global REALNAME ;\ - .func REALNAME ;\ + .type REALNAME, %function ;\ REALNAME: #define EPILOGUE @@ -107,7 +109,11 @@ REALNAME: #endif #define HUGE_PAGESIZE ( 4 << 20) +#if defined(CORTEXA57) +#define BUFFER_SIZE (40 << 20) +#else #define BUFFER_SIZE (16 << 20) +#endif #define BASE_ADDRESS (START_ADDRESS - BUFFER_SIZE * MAX_CPU_NUMBER) diff --git a/cpuid_arm64.c b/cpuid_arm64.c index c7a27f891..a5a0b5e0a 100644 --- a/cpuid_arm64.c +++ b/cpuid_arm64.c @@ -29,12 +29,19 @@ #define CPU_UNKNOWN 0 #define CPU_ARMV8 1 +#define CPU_CORTEXA57 2 static char *cpuname[] = { - "UNKOWN", - "ARMV8" + "UNKNOWN", + "ARMV8" , + "CORTEXA57" }; +static char *cpuname_lower[] = { + "unknown", + "armv8" , + "cortexa57" +}; int get_feature(char *search) { @@ -53,13 +60,13 @@ int get_feature(char *search) { p = strchr(buffer, ':') + 2; break; - } - } + } + } - fclose(infile); + fclose(infile); - if( p == NULL ) return; + if( p == NULL ) return 0; t = strtok(p," "); while( t = strtok(NULL," ")) @@ -82,11 +89,30 @@ int detect(void) p = (char *) NULL ; infile = fopen("/proc/cpuinfo", "r"); - while (fgets(buffer, sizeof(buffer), infile)) { - if ((!strncmp("model name", buffer, 10)) || (!strncmp("Processor", buffer, 9))) + if (!strncmp("CPU part", buffer, 8)) + { + p = strchr(buffer, ':') + 2; + break; + } + } + + fclose(infile); + if(p != NULL) { + if (strstr(p, "0xd07")) { + return CPU_CORTEXA57; + } + } + + p = (char *) NULL ; + infile = fopen("/proc/cpuinfo", "r"); + while (fgets(buffer, sizeof(buffer), infile)) + { + + if ((!strncmp("model name", buffer, 10)) || (!strncmp("Processor", buffer, 9)) || + (!strncmp("CPU architecture", buffer, 16))) { p = strchr(buffer, ':') + 2; break; @@ -100,7 +126,7 @@ int detect(void) if (strstr(p, "AArch64")) { - return CPU_ARMV8; + return CPU_ARMV8; } @@ -118,23 +144,13 @@ char *get_corename(void) void get_architecture(void) { - printf("ARM"); + 
printf("ARM64"); } void get_subarchitecture(void) { int d = detect(); - switch (d) - { - - case CPU_ARMV8: - printf("ARMV8"); - break; - - default: - printf("UNKNOWN"); - break; - } + printf("%s", cpuname[d]); } void get_subdirname(void) @@ -160,26 +176,32 @@ void get_cpuconfig(void) printf("#define L2_ASSOCIATIVE 4\n"); break; - + case CPU_CORTEXA57: + printf("#define CORTEXA57\n"); + printf("#define HAVE_VFP\n"); + printf("#define HAVE_VFPV3\n"); + printf("#define HAVE_NEON\n"); + printf("#define HAVE_VFPV4\n"); + printf("#define L1_CODE_SIZE 49152\n"); + printf("#define L1_CODE_LINESIZE 64\n"); + printf("#define L1_CODE_ASSOCIATIVE 3\n"); + printf("#define L1_DATA_SIZE 32768\n"); + printf("#define L1_DATA_LINESIZE 64\n"); + printf("#define L1_DATA_ASSOCIATIVE 2\n"); + printf("#define L2_SIZE 2097152\n"); + printf("#define L2_LINESIZE 64\n"); + printf("#define L2_ASSOCIATIVE 16\n"); + break; } } void get_libname(void) { - int d = detect(); - switch (d) - { - - case CPU_ARMV8: - printf("armv8\n"); - break; - - } + printf("%s", cpuname_lower[d]); } - void get_features(void) { diff --git a/driver/level2/spmv_thread.c b/driver/level2/spmv_thread.c index 0f47344df..f8ae3cdcd 100644 --- a/driver/level2/spmv_thread.c +++ b/driver/level2/spmv_thread.c @@ -55,7 +55,7 @@ static int spmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){ FLOAT *a, *x, *y; - BLASLONG incx, incy; + BLASLONG incx; BLASLONG m_from, m_to, i; #ifndef COMPLEX FLOAT result; @@ -68,7 +68,6 @@ static int spmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F y = (FLOAT *)args -> c; incx = args -> ldb; - incy = args -> ldc; m_from = 0; m_to = args -> m; diff --git a/driver/level2/spr2_thread.c b/driver/level2/spr2_thread.c index 10edb1eb1..b72524a0d 100644 --- a/driver/level2/spr2_thread.c +++ b/driver/level2/spr2_thread.c @@ -43,7 +43,7 @@ static int syr_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){ FLOAT *a, *x, *y; - BLASLONG lda, incx, incy; + BLASLONG incx, incy; BLASLONG i, m_from, m_to; FLOAT alpha_r; #ifdef COMPLEX @@ -56,7 +56,6 @@ static int syr_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FL incx = args -> lda; incy = args -> ldb; - lda = args -> ldc; alpha_r = *((FLOAT *)args -> alpha + 0); #ifdef COMPLEX diff --git a/driver/level2/spr_thread.c b/driver/level2/spr_thread.c index 4a194cbd6..b1a066867 100644 --- a/driver/level2/spr_thread.c +++ b/driver/level2/spr_thread.c @@ -46,7 +46,7 @@ static int syr_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FL BLASLONG incx; BLASLONG i, m_from, m_to; FLOAT alpha_r; -#if defined(COMPLEX) && !defined(HER) && !defined(HERREV) +#if defined(COMPLEX) && !defined(HEMV) && !defined(HEMVREV) FLOAT alpha_i; #endif @@ -56,7 +56,7 @@ static int syr_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FL incx = args -> lda; alpha_r = *((FLOAT *)args -> alpha + 0); -#if defined(COMPLEX) && !defined(HER) && !defined(HERREV) +#if defined(COMPLEX) && !defined(HEMV) && !defined(HEMVREV) alpha_i = *((FLOAT *)args -> alpha + 1); #endif diff --git a/driver/level2/symv_thread.c b/driver/level2/symv_thread.c index 95d6c9bb5..6580178f1 100644 --- a/driver/level2/symv_thread.c +++ b/driver/level2/symv_thread.c @@ -55,7 +55,7 @@ static int symv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){ FLOAT *a, *x, *y; - BLASLONG lda, incx, incy; + BLASLONG lda, incx; 
BLASLONG m_from, m_to; a = (FLOAT *)args -> a; @@ -64,7 +64,6 @@ static int symv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F lda = args -> lda; incx = args -> ldb; - incy = args -> ldc; m_from = 0; m_to = args -> m; diff --git a/driver/level2/tbmv_L.c b/driver/level2/tbmv_L.c index b41b4141e..e40e79396 100644 --- a/driver/level2/tbmv_L.c +++ b/driver/level2/tbmv_L.c @@ -45,13 +45,11 @@ const static FLOAT dp1 = 1.; int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){ BLASLONG i; - FLOAT *gemvbuffer = (FLOAT *)buffer; FLOAT *B = b; BLASLONG length; if (incb != 1) { B = buffer; - gemvbuffer = (FLOAT *)(((BLASLONG)buffer + n * sizeof(FLOAT) + 4095) & ~4095); COPY_K(n, b, incb, buffer, 1); } diff --git a/driver/level2/tbmv_U.c b/driver/level2/tbmv_U.c index 50c10326b..529fd863f 100644 --- a/driver/level2/tbmv_U.c +++ b/driver/level2/tbmv_U.c @@ -45,13 +45,11 @@ const static FLOAT dp1 = 1.; int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){ BLASLONG i; - FLOAT *gemvbuffer = (FLOAT *)buffer; FLOAT *B = b; BLASLONG length; if (incb != 1) { B = buffer; - gemvbuffer = (FLOAT *)(((BLASLONG)buffer + n * sizeof(FLOAT) + 4095) & ~4095); COPY_K(n, b, incb, buffer, 1); } diff --git a/driver/level2/tbsv_L.c b/driver/level2/tbsv_L.c index 0d036440d..f62400b5e 100644 --- a/driver/level2/tbsv_L.c +++ b/driver/level2/tbsv_L.c @@ -45,13 +45,11 @@ const static FLOAT dp1 = 1.; int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){ BLASLONG i; - FLOAT *gemvbuffer = (FLOAT *)buffer; FLOAT *B = b; BLASLONG length; if (incb != 1) { B = buffer; - gemvbuffer = (FLOAT *)(((BLASLONG)buffer + n * sizeof(FLOAT) + 4095) & ~4095); COPY_K(n, b, incb, buffer, 1); } diff --git a/driver/level2/tbsv_U.c b/driver/level2/tbsv_U.c index 1dc1a99e7..1dc7f2006 100644 --- a/driver/level2/tbsv_U.c +++ b/driver/level2/tbsv_U.c @@ -45,13 +45,11 @@ const static FLOAT dp1 = 1.; int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){ BLASLONG i; - FLOAT *gemvbuffer = (FLOAT *)buffer; FLOAT *B = b; BLASLONG length; if (incb != 1) { B = buffer; - gemvbuffer = (FLOAT *)(((BLASLONG)buffer + n * sizeof(FLOAT) + 4095) & ~4095); COPY_K(n, b, incb, buffer, 1); } diff --git a/driver/level2/tpsv_L.c b/driver/level2/tpsv_L.c index 3fafa9054..7baf5b73e 100644 --- a/driver/level2/tpsv_L.c +++ b/driver/level2/tpsv_L.c @@ -43,12 +43,10 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){ BLASLONG i; - FLOAT *gemvbuffer = (FLOAT *)buffer; FLOAT *B = b; if (incb != 1) { B = buffer; - gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) + 4095) & ~4095); COPY_K(m, b, incb, buffer, 1); } diff --git a/driver/level2/tpsv_U.c b/driver/level2/tpsv_U.c index fb5ef02b2..763ea07b1 100644 --- a/driver/level2/tpsv_U.c +++ b/driver/level2/tpsv_U.c @@ -43,12 +43,10 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){ BLASLONG i; - FLOAT *gemvbuffer = (FLOAT *)buffer; FLOAT *B = b; if (incb != 1) { B = buffer; - gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) + 4095) & ~4095); COPY_K(m, b, incb, buffer, 1); } diff --git a/driver/level2/ztbmv_L.c b/driver/level2/ztbmv_L.c index 1ac1cdef1..e7bd35796 100644 --- a/driver/level2/ztbmv_L.c +++ b/driver/level2/ztbmv_L.c @@ -45,7 +45,6 @@ const static FLOAT dp1 = 1.; int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){ BLASLONG 
i; - FLOAT *gemvbuffer = (FLOAT *)buffer; FLOAT *B = b; BLASLONG length; #if (TRANSA == 2) || (TRANSA == 4) @@ -57,7 +56,6 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG inc if (incb != 1) { B = buffer; - gemvbuffer = (FLOAT *)(((BLASLONG)buffer + n * sizeof(FLOAT) * COMPSIZE+ 4095) & ~4095); COPY_K(n, b, incb, buffer, 1); } diff --git a/driver/level2/ztbmv_U.c b/driver/level2/ztbmv_U.c index 9aa203396..c2d810a04 100644 --- a/driver/level2/ztbmv_U.c +++ b/driver/level2/ztbmv_U.c @@ -45,7 +45,6 @@ const static FLOAT dp1 = 1.; int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){ BLASLONG i; - FLOAT *gemvbuffer = (FLOAT *)buffer; FLOAT *B = b; BLASLONG length; #if (TRANSA == 2) || (TRANSA == 4) @@ -57,7 +56,6 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG inc if (incb != 1) { B = buffer; - gemvbuffer = (FLOAT *)(((BLASLONG)buffer + n * sizeof(FLOAT) * COMPSIZE + 4095) & ~4095); COPY_K(n, b, incb, buffer, 1); } diff --git a/driver/level2/ztbsv_L.c b/driver/level2/ztbsv_L.c index 9aa701841..44329f5c7 100644 --- a/driver/level2/ztbsv_L.c +++ b/driver/level2/ztbsv_L.c @@ -45,7 +45,6 @@ const static FLOAT dp1 = 1.; int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){ BLASLONG i; - FLOAT *gemvbuffer = (FLOAT *)buffer; FLOAT *B = b; BLASLONG length; #if (TRANSA == 2) || (TRANSA == 4) @@ -57,7 +56,6 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG inc if (incb != 1) { B = buffer; - gemvbuffer = (FLOAT *)(((BLASLONG)buffer + n * sizeof(FLOAT) * COMPSIZE + 4095) & ~4095); COPY_K(n, b, incb, buffer, 1); } diff --git a/driver/level2/ztbsv_U.c b/driver/level2/ztbsv_U.c index 3722b1f71..530194aa3 100644 --- a/driver/level2/ztbsv_U.c +++ b/driver/level2/ztbsv_U.c @@ -45,7 +45,6 @@ const static FLOAT dp1 = 1.; int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){ BLASLONG i; - FLOAT *gemvbuffer = (FLOAT *)buffer; FLOAT *B = b; BLASLONG length; #if (TRANSA == 2) || (TRANSA == 4) @@ -57,7 +56,6 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG inc if (incb != 1) { B = buffer; - gemvbuffer = (FLOAT *)(((BLASLONG)buffer + n * sizeof(FLOAT) * COMPSIZE+ 4095) & ~4095); COPY_K(n, b, incb, buffer, 1); } diff --git a/driver/level2/ztpmv_L.c b/driver/level2/ztpmv_L.c index 47e6df56c..76a7b8ca1 100644 --- a/driver/level2/ztpmv_L.c +++ b/driver/level2/ztpmv_L.c @@ -49,12 +49,10 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){ #ifndef UNIT FLOAT atemp1, atemp2, btemp1, btemp2; #endif - FLOAT *gemvbuffer = (FLOAT *)buffer; FLOAT *B = b; if (incb != 1) { B = buffer; - gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 4095) & ~4095); COPY_K(m, b, incb, buffer, 1); } diff --git a/driver/level2/ztpmv_U.c b/driver/level2/ztpmv_U.c index da911fb4e..290b9ef40 100644 --- a/driver/level2/ztpmv_U.c +++ b/driver/level2/ztpmv_U.c @@ -49,12 +49,10 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){ #ifndef UNIT FLOAT atemp1, atemp2, btemp1, btemp2; #endif - FLOAT *gemvbuffer = (FLOAT *)buffer; FLOAT *B = b; if (incb != 1) { B = buffer; - gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 4095) & ~4095); COPY_K(m, b, incb, buffer, 1); } diff --git a/driver/level2/ztpsv_L.c b/driver/level2/ztpsv_L.c index a497e42a4..5ce07f43b 100644 --- a/driver/level2/ztpsv_L.c +++ b/driver/level2/ztpsv_L.c @@ 
-51,12 +51,10 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){ #ifndef UNIT FLOAT ar, ai, br, bi, ratio, den; #endif - FLOAT *gemvbuffer = (FLOAT *)buffer; FLOAT *B = b; if (incb != 1) { B = buffer; - gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 4095) & ~4095); COPY_K(m, b, incb, buffer, 1); } diff --git a/driver/level2/ztpsv_U.c b/driver/level2/ztpsv_U.c index 28b824e3a..fa9d99054 100644 --- a/driver/level2/ztpsv_U.c +++ b/driver/level2/ztpsv_U.c @@ -49,12 +49,10 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){ #ifndef UNIT FLOAT ar, ai, br, bi, ratio, den; #endif - FLOAT *gemvbuffer = (FLOAT *)buffer; FLOAT *B = b; if (incb != 1) { B = buffer; - gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 4095) & ~4095); COPY_K(m, b, incb, buffer, 1); } diff --git a/driver/level3/gemm_thread_mn.c b/driver/level3/gemm_thread_mn.c index 2966eac82..6b52df884 100644 --- a/driver/level3/gemm_thread_mn.c +++ b/driver/level3/gemm_thread_mn.c @@ -65,7 +65,7 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int ( blas_queue_t queue[MAX_CPU_NUMBER]; BLASLONG range_M[MAX_CPU_NUMBER + 1], range_N[MAX_CPU_NUMBER + 1]; - BLASLONG procs, total_procs, num_cpu_m, num_cpu_n; + BLASLONG procs, num_cpu_m, num_cpu_n; BLASLONG width, i, j; BLASLONG divM, divN; diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c index 6162a9f0d..096342a32 100644 --- a/driver/level3/level3_thread.c +++ b/driver/level3/level3_thread.c @@ -230,7 +230,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, BLASLONG is, min_i, div_n; BLASLONG i, current; - BLASLONG l1stride, l2size; + BLASLONG l1stride; #ifdef TIMING BLASULONG rpcc_counter; @@ -298,8 +298,6 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, #endif ) return 0; - l2size = GEMM_P * GEMM_Q; - #if 0 fprintf(stderr, "Thread[%ld] m_from : %ld m_to : %ld n_from : %ld n_to : %ld N_from : %ld N_to : %ld\n", mypos, m_from, m_to, n_from, n_to, N_from, N_to); @@ -706,7 +704,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO n = n_to - n_from; } - if ((args -> m < nthreads * SWITCH_RATIO) || (args -> n < nthreads * SWITCH_RATIO)) { + if ((m < nthreads * SWITCH_RATIO) || (n < nthreads * SWITCH_RATIO)) { GEMM_LOCAL(args, range_m, range_n, sa, sb, 0); return 0; } diff --git a/driver/others/memory.c b/driver/others/memory.c index ba3dc8a23..3d83a4037 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -914,7 +914,6 @@ static volatile struct { } memory[NUM_BUFFERS]; static int memory_initialized = 0; -static void gotoblas_memory_init(void); /* Memory allocation routine */ /* procpos ... indicates where it comes from */ diff --git a/getarch.c b/getarch.c index 0a49fd1b3..024ac4b3d 100644 --- a/getarch.c +++ b/getarch.c @@ -819,10 +819,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 " #define LIBNAME "armv8" -#define CORENAME "XGENE1" -#else +#define CORENAME "ARMV8" #endif +#ifdef FORCE_CORTEXA57 +#define FORCE +#define ARCHITECTURE "ARM64" +#define SUBARCHITECTURE "ARMV8" +#define SUBDIRNAME "arm64" +#define ARCHCONFIG "-DCORTEXA57 " \ + "-DL1_CODE_SIZE=49152 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=3 " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=2 " \ + "-DL2_SIZE=2097152 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON" +#define LIBNAME "cortexa57" +#define CORENAME "CORTEXA57" +#else +#endif #ifndef FORCE diff --git a/interface/symm.c b/interface/symm.c index 959a4ebbc..3210d371a 100644 --- a/interface/symm.c +++ b/interface/symm.c @@ -91,6 +91,27 @@ #endif #endif + +#ifdef SMP +#ifndef COMPLEX +#ifdef XDOUBLE +#define MODE (BLAS_XDOUBLE | BLAS_REAL) +#elif defined(DOUBLE) +#define MODE (BLAS_DOUBLE | BLAS_REAL) +#else +#define MODE (BLAS_SINGLE | BLAS_REAL) +#endif +#else +#ifdef XDOUBLE +#define MODE (BLAS_XDOUBLE | BLAS_COMPLEX) +#elif defined(DOUBLE) +#define MODE (BLAS_DOUBLE | BLAS_COMPLEX) +#else +#define MODE (BLAS_SINGLE | BLAS_COMPLEX) +#endif +#endif +#endif + static int (*symm[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = { #ifndef GEMM3M #ifndef HEMM @@ -135,26 +156,6 @@ void NAME(char *SIDE, char *UPLO, FLOAT *buffer; FLOAT *sa, *sb; -#ifdef SMP -#ifndef COMPLEX -#ifdef XDOUBLE - int mode = BLAS_XDOUBLE | BLAS_REAL; -#elif defined(DOUBLE) - int mode = BLAS_DOUBLE | BLAS_REAL; -#else - int mode = BLAS_SINGLE | BLAS_REAL; -#endif -#else -#ifdef XDOUBLE - int mode = BLAS_XDOUBLE | BLAS_COMPLEX; -#elif defined(DOUBLE) - int mode = BLAS_DOUBLE | BLAS_COMPLEX; -#else - int mode = BLAS_SINGLE | BLAS_COMPLEX; -#endif -#endif -#endif - #if defined(SMP) && !defined(NO_AFFINITY) int nodes; #endif @@ -246,26 +247,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, FLOAT *buffer; FLOAT *sa, *sb; -#ifdef SMP -#ifndef COMPLEX -#ifdef XDOUBLE - int mode = BLAS_XDOUBLE | BLAS_REAL; -#elif defined(DOUBLE) - int mode = BLAS_DOUBLE | BLAS_REAL; -#else - int mode = BLAS_SINGLE | BLAS_REAL; -#endif -#else -#ifdef XDOUBLE - int mode = BLAS_XDOUBLE | BLAS_COMPLEX; -#elif defined(DOUBLE) - int mode = BLAS_DOUBLE | BLAS_COMPLEX; -#else - int mode = BLAS_SINGLE | BLAS_COMPLEX; -#endif -#endif -#endif - #if defined(SMP) && !defined(NO_AFFINITY) int nodes; #endif @@ -407,7 +388,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, args.nthreads /= nodes; - gemm_thread_mn(mode, &args, NULL, NULL, + gemm_thread_mn(MODE, &args, NULL, NULL, symm[4 | (side << 1) | uplo ], sa, sb, nodes); } else { @@ -419,7 +400,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, #else - GEMM_THREAD(mode, &args, NULL, NULL, symm[(side << 1) | uplo ], sa, sb, args.nthreads); + GEMM_THREAD(MODE, &args, NULL, NULL, symm[(side << 1) | uplo ], sa, sb, args.nthreads); #endif diff --git a/interface/syr.c b/interface/syr.c index b29a81ec6..1374bcc69 100644 --- a/interface/syr.c +++ b/interface/syr.c @@ -116,7 +116,7 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *a, blasint lda) { FLOAT *buffer; - int trans, uplo; + int uplo; blasint info; #ifdef SMP int 
nthreads; @@ -124,7 +124,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha, PRINT_DEBUG_CNAME; - trans = -1; uplo = -1; info = 0; diff --git a/interface/syr2.c b/interface/syr2.c index 006567c82..08fd47e57 100644 --- a/interface/syr2.c +++ b/interface/syr2.c @@ -118,7 +118,7 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *y, blasint incy, FLOAT *a, blasint lda) { FLOAT *buffer; - int trans, uplo; + int uplo; blasint info; #ifdef SMP int nthreads; @@ -126,7 +126,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha, PRINT_DEBUG_CNAME; - trans = -1; uplo = -1; info = 0; diff --git a/interface/zhemv.c b/interface/zhemv.c index c60eedc57..35d29baea 100644 --- a/interface/zhemv.c +++ b/interface/zhemv.c @@ -117,7 +117,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT *ALPHA FLOAT beta_i = BETA[1]; FLOAT *buffer; - int trans, uplo; + int uplo; blasint info; #ifdef SMP int nthreads; @@ -135,7 +135,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT *ALPHA PRINT_DEBUG_CNAME; - trans = -1; uplo = -1; info = 0; diff --git a/interface/zher.c b/interface/zher.c index 9bedb0131..2e4f0cb33 100644 --- a/interface/zher.c +++ b/interface/zher.c @@ -116,7 +116,7 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *a, blasint lda) { FLOAT *buffer; - int trans, uplo; + int uplo; blasint info; #ifdef SMP int nthreads; @@ -124,7 +124,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha, PRINT_DEBUG_CNAME; - trans = -1; uplo = -1; info = 0; diff --git a/interface/zher2.c b/interface/zher2.c index b342457a0..2717c57b3 100644 --- a/interface/zher2.c +++ b/interface/zher2.c @@ -121,7 +121,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT *ALPHA FLOAT alpha_r = ALPHA[0]; FLOAT alpha_i = ALPHA[1]; FLOAT *buffer; - int trans, uplo; + int uplo; blasint info; #ifdef SMP int nthreads; @@ -129,7 +129,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT *ALPHA PRINT_DEBUG_CNAME; - trans = -1; uplo = -1; info = 0; diff --git a/kernel/Makefile.L1 b/kernel/Makefile.L1 index 7c7cb2770..a8f9cf097 100644 --- a/kernel/Makefile.L1 +++ b/kernel/Makefile.L1 @@ -637,49 +637,49 @@ $(KDIR)xasum_k$(TSUFFIX).$(SUFFIX) $(KDIR)xasum_k$(TPSUFFIX).$(PSUFFIX) : $(KE $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $@ $(KDIR)saxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)saxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SAXPYKERNEL) - $(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -UDOUBLE $< -o $@ + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@ $(KDIR)daxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)daxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DAXPYKERNEL) - $(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -DDOUBLE $< -o $@ + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE $< -o $@ $(KDIR)qaxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)qaxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QAXPYKERNEL) - $(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -DXDOUBLE $< -o $@ + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE $< -o $@ $(KDIR)caxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)caxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CAXPYKERNEL) $(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -UCONJ -UDOUBLE $< -o $@ $(KDIR)zaxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)zaxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZAXPYKERNEL) - $(CC) -c $(CFLAGS) 
-DCOMPLEX -DCOMPLEX -UCONJ -DDOUBLE $< -o $@ + $(CC) -c $(CFLAGS) -DCOMPLEX -UCONJ -DDOUBLE $< -o $@ $(KDIR)xaxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)xaxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XAXPYKERNEL) - $(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -UCONJ -DXDOUBLE $< -o $@ + $(CC) -c $(CFLAGS) -DCOMPLEX -UCONJ -DXDOUBLE $< -o $@ $(KDIR)caxpyc_k$(TSUFFIX).$(SUFFIX) $(KDIR)caxpyc_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CAXPYKERNEL) - $(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -DCONJ -UDOUBLE $< -o $@ + $(CC) -c $(CFLAGS) -DCOMPLEX -DCONJ -UDOUBLE $< -o $@ $(KDIR)zaxpyc_k$(TSUFFIX).$(SUFFIX) $(KDIR)zaxpyc_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZAXPYKERNEL) - $(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -DCONJ -DDOUBLE $< -o $@ + $(CC) -c $(CFLAGS) -DCOMPLEX -DCONJ -DDOUBLE $< -o $@ $(KDIR)xaxpyc_k$(TSUFFIX).$(SUFFIX) $(KDIR)xaxpyc_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XAXPYKERNEL) - $(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -DCONJ -DXDOUBLE $< -o $@ + $(CC) -c $(CFLAGS) -DCOMPLEX -DCONJ -DXDOUBLE $< -o $@ $(KDIR)scopy_k$(TSUFFIX).$(SUFFIX) $(KDIR)scopy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SCOPYKERNEL) - $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UCOMPLEX -DC_INTERFACE $< -o $@ + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DC_INTERFACE $< -o $@ $(KDIR)dcopy_k$(TSUFFIX).$(SUFFIX) $(KDIR)dcopy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DCOPYKERNEL) - $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UCOMPLEX -DC_INTERFACE $< -o $@ + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DC_INTERFACE $< -o $@ $(KDIR)qcopy_k$(TSUFFIX).$(SUFFIX) $(KDIR)qcopy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QCOPYKERNEL) - $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UCOMPLEX -DC_INTERFACE $< -o $@ + $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DC_INTERFACE $< -o $@ $(KDIR)ccopy_k$(TSUFFIX).$(SUFFIX) $(KDIR)ccopy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CCOPYKERNEL) - $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DCOMPLEX -DC_INTERFACE $< -o $@ + $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DC_INTERFACE $< -o $@ $(KDIR)zcopy_k$(TSUFFIX).$(SUFFIX) $(KDIR)zcopy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZCOPYKERNEL) - $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DCOMPLEX -DC_INTERFACE $< -o $@ + $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DC_INTERFACE $< -o $@ $(KDIR)xcopy_k$(TSUFFIX).$(SUFFIX) $(KDIR)xcopy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XCOPYKERNEL) - $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DCOMPLEX -DC_INTERFACE $< -o $@ + $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DC_INTERFACE $< -o $@ $(KDIR)ddot_k$(TSUFFIX).$(SUFFIX) $(KDIR)ddot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DDOTKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE $< -o $@ @@ -799,15 +799,15 @@ $(KDIR)xswap_k$(TSUFFIX).$(SUFFIX) $(KDIR)xswap_k$(TPSUFFIX).$(PSUFFIX) : $(KE $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $@ $(KDIR)saxpby_k$(TSUFFIX).$(SUFFIX) $(KDIR)saxpby_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SAXPBYKERNEL) - $(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -UDOUBLE $< -o $@ + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@ $(KDIR)daxpby_k$(TSUFFIX).$(SUFFIX) $(KDIR)daxpby_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DAXPBYKERNEL) - $(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -DDOUBLE $< -o $@ + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE $< -o $@ $(KDIR)caxpby_k$(TSUFFIX).$(SUFFIX) $(KDIR)caxpby_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CAXPBYKERNEL) - $(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -UCONJ -UDOUBLE $< -o $@ + $(CC) -c $(CFLAGS) -DCOMPLEX -UCONJ -UDOUBLE $< -o $@ $(KDIR)zaxpby_k$(TSUFFIX).$(SUFFIX) $(KDIR)zaxpby_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZAXPBYKERNEL) - $(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -UCONJ -DDOUBLE $< -o 
$@ + $(CC) -c $(CFLAGS) -DCOMPLEX -UCONJ -DDOUBLE $< -o $@ diff --git a/kernel/arm/amax.c b/kernel/arm/amax.c index ec6b11196..792e68bd9 100644 --- a/kernel/arm/amax.c +++ b/kernel/arm/amax.c @@ -54,13 +54,15 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) BLASLONG ix=0; FLOAT maxf=0.0; - if (n < 0 || inc_x < 1 ) return(maxf); + if (n <= 0 || inc_x <= 0) return(maxf); maxf=ABS(x[0]); + ix += inc_x; + i++; while(i < n) { - if( ABS(x[ix]) > ABS(maxf) ) + if( ABS(x[ix]) > maxf ) { maxf = ABS(x[ix]); } diff --git a/kernel/arm/amin.c b/kernel/arm/amin.c index fc89604d5..78495a8e3 100644 --- a/kernel/arm/amin.c +++ b/kernel/arm/amin.c @@ -54,13 +54,15 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) BLASLONG ix=0; FLOAT minf=0.0; - if (n < 0 || inc_x < 1 ) return(minf); + if (n <= 0 || inc_x <= 0) return(minf); minf=ABS(x[0]); + ix += inc_x; + i++; while(i < n) { - if( ABS(x[ix]) < ABS(minf) ) + if( ABS(x[ix]) < minf ) { minf = ABS(x[ix]); } diff --git a/kernel/arm/asum.c b/kernel/arm/asum.c index 5b6e6ebd2..b284ae3fc 100644 --- a/kernel/arm/asum.c +++ b/kernel/arm/asum.c @@ -53,7 +53,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i=0; FLOAT sumf = 0.0; - if (n < 0 || inc_x < 1 ) return(sumf); + if (n <= 0 || inc_x <= 0) return(sumf); n *= inc_x; while(i < n) diff --git a/kernel/arm/iamax.c b/kernel/arm/iamax.c index d211776e9..8c016ce4d 100644 --- a/kernel/arm/iamax.c +++ b/kernel/arm/iamax.c @@ -55,13 +55,15 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT maxf=0.0; BLASLONG max=0; - if (n < 0 || inc_x < 1 ) return(max); + if (n <= 0 || inc_x <= 0) return(max); maxf=ABS(x[0]); + ix += inc_x; + i++; while(i < n) { - if( ABS(x[ix]) > ABS(maxf) ) + if( ABS(x[ix]) > maxf ) { max = i; maxf = ABS(x[ix]); diff --git a/kernel/arm/iamin.c b/kernel/arm/iamin.c index 7efce19b1..155292bd5 100644 --- a/kernel/arm/iamin.c +++ b/kernel/arm/iamin.c @@ -55,9 +55,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT minf=0.0; BLASLONG min=0; - if (n < 0 || inc_x < 1 ) return(min); + if (n <= 0 || inc_x <= 0) return(min); minf=ABS(x[0]); + ix += inc_x; + i++; while(i < n) { diff --git a/kernel/arm/imax.c b/kernel/arm/imax.c index 28022f67b..5072dd16e 100644 --- a/kernel/arm/imax.c +++ b/kernel/arm/imax.c @@ -47,9 +47,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT maxf=0.0; BLASLONG max=0; - if (n < 0 || inc_x < 1 ) return(max); + if (n <= 0 || inc_x <= 0) return(max); maxf=x[0]; + ix += inc_x; + i++; while(i < n) { diff --git a/kernel/arm/imin.c b/kernel/arm/imin.c index fe8aa962a..598cba387 100644 --- a/kernel/arm/imin.c +++ b/kernel/arm/imin.c @@ -45,9 +45,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT minf=0.0; BLASLONG min=0; - if (n < 0 || inc_x < 1 ) return(min); + if (n <= 0 || inc_x <= 0) return(min); minf=x[0]; + ix += inc_x; + i++; while(i < n) { diff --git a/kernel/arm/izamax.c b/kernel/arm/izamax.c index 54bb35149..8fe33e95b 100644 --- a/kernel/arm/izamax.c +++ b/kernel/arm/izamax.c @@ -53,24 +53,24 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i=0; BLASLONG ix=0; - FLOAT maxf[2]; + FLOAT maxf; BLASLONG max=0; BLASLONG inc_x2; - if (n < 0 || inc_x < 1 ) return(max); + if (n <= 0 || inc_x <= 0) return(max); inc_x2 = 2 * inc_x; - maxf[0] = ABS(x[ix]); - maxf[1] = ABS(x[ix+1]); + maxf = CABS1(x,0); + ix += inc_x2; + i++; while(i < n) { - if( CABS1(x,ix) > CABS1(maxf,0) ) + if( CABS1(x,ix) > maxf ) { max = i; - maxf[0] = ABS(x[ix]); - maxf[1] = ABS(x[ix+1]); + maxf = CABS1(x,ix); } ix += inc_x2; 
i++; diff --git a/kernel/arm/izamin.c b/kernel/arm/izamin.c index 448b3cbfc..fb5a0d4cb 100644 --- a/kernel/arm/izamin.c +++ b/kernel/arm/izamin.c @@ -53,24 +53,24 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i=0; BLASLONG ix=0; - FLOAT minf[2]; + FLOAT minf; BLASLONG min=0; BLASLONG inc_x2; - if (n < 0 || inc_x < 1 ) return(min); + if (n <= 0 || inc_x <= 0) return(min); inc_x2 = 2 * inc_x; - minf[0] = ABS(x[ix]); - minf[1] = ABS(x[ix+1]); + minf = CABS1(x,0); + ix += inc_x2; + i++; while(i < n) { - if( CABS1(x,ix) < CABS1(minf,0) ) + if( CABS1(x,ix) < minf ) { min = i; - minf[0] = ABS(x[ix]); - minf[1] = ABS(x[ix+1]); + minf = CABS1(x,ix); } ix += inc_x2; i++; diff --git a/kernel/arm/max.c b/kernel/arm/max.c index 04529dbd6..2ad956bc0 100644 --- a/kernel/arm/max.c +++ b/kernel/arm/max.c @@ -44,9 +44,11 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) BLASLONG ix=0; FLOAT maxf=0.0; - if (n < 0 || inc_x < 1 ) return(maxf); + if (n <= 0 || inc_x <= 0) return(maxf); maxf=x[0]; + ix += inc_x; + i++; while(i < n) { diff --git a/kernel/arm/min.c b/kernel/arm/min.c index 63c704c79..2812fe397 100644 --- a/kernel/arm/min.c +++ b/kernel/arm/min.c @@ -44,9 +44,11 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) BLASLONG ix=0; FLOAT minf=0.0; - if (n < 0 || inc_x < 1 ) return(minf); + if (n <= 0 || inc_x <= 0) return(minf); minf=x[0]; + ix += inc_x; + i++; while(i < n) { diff --git a/kernel/arm/nrm2.c b/kernel/arm/nrm2.c index b4d810d53..fcff09337 100644 --- a/kernel/arm/nrm2.c +++ b/kernel/arm/nrm2.c @@ -57,7 +57,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) FLOAT absxi = 0.0; - if (n < 0 || inc_x < 1 ) return(0.0); + if (n <= 0 || inc_x <= 0) return(0.0); if ( n == 1 ) return( ABS(x[0]) ); n *= inc_x; diff --git a/kernel/arm/zamax.c b/kernel/arm/zamax.c index 162f829b8..a39bd7821 100644 --- a/kernel/arm/zamax.c +++ b/kernel/arm/zamax.c @@ -53,29 +53,27 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i=0; BLASLONG ix=0; - FLOAT maxf[2]; - BLASLONG max=0; + FLOAT maxf; BLASLONG inc_x2; - if (n < 0 || inc_x < 1 ) return(0.0); + if (n <= 0 || inc_x <= 0) return(0.0); inc_x2 = 2 * inc_x; - maxf[0] = ABS(x[ix]); - maxf[1] = ABS(x[ix+1]); + maxf = CABS1(x,0); + ix += inc_x2; + i++; while(i < n) { - if( CABS1(x,ix) > CABS1(maxf,0) ) + if( CABS1(x,ix) > maxf ) { - max = i; - maxf[0] = ABS(x[ix]); - maxf[1] = ABS(x[ix+1]); + maxf = CABS1(x,ix); } ix += inc_x2; i++; } - return(CABS1(maxf,0)); + return(maxf); } diff --git a/kernel/arm/zamin.c b/kernel/arm/zamin.c index 9e26a66d0..02eab3e75 100644 --- a/kernel/arm/zamin.c +++ b/kernel/arm/zamin.c @@ -53,29 +53,27 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i=0; BLASLONG ix=0; - FLOAT minf[2]; - BLASLONG min=0; + FLOAT minf; BLASLONG inc_x2; - if (n < 0 || inc_x < 1 ) return(0.0); + if (n <= 0 || inc_x <= 0) return(0.0); inc_x2 = 2 * inc_x; - minf[0] = ABS(x[ix]); - minf[1] = ABS(x[ix+1]); + minf = CABS1(x,0); + ix += inc_x2; + i++; while(i < n) { - if( CABS1(x,ix) < CABS1(minf,0) ) + if( CABS1(x,ix) < minf ) { - min = i; - minf[0] = ABS(x[ix]); - minf[1] = ABS(x[ix+1]); + minf = CABS1(x,ix); } ix += inc_x2; i++; } - return(CABS1(minf,0)); + return(minf); } diff --git a/kernel/arm/zasum.c b/kernel/arm/zasum.c index 0c5d69e35..61e85cae6 100644 --- a/kernel/arm/zasum.c +++ b/kernel/arm/zasum.c @@ -55,7 +55,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) BLASLONG i=0; FLOAT sumf = 0.0; BLASLONG inc_x2; - if (n < 0 || inc_x < 1 ) return(sumf); + + if (n <= 0 || inc_x <= 0) return(sumf); 
inc_x2 = 2 * inc_x; diff --git a/kernel/arm/zaxpby.c b/kernel/arm/zaxpby.c index d9948349d..445354416 100644 --- a/kernel/arm/zaxpby.c +++ b/kernel/arm/zaxpby.c @@ -37,11 +37,9 @@ int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FL BLASLONG i=0; BLASLONG ix,iy; FLOAT temp; + BLASLONG inc_x2, inc_y2; - BLASLONG inc_x2; - BLASLONG inc_y2; - - if ( n < 0 ) return(0); + if ( n <= 0 ) return(0); ix = 0; iy = 0; diff --git a/kernel/arm/znrm2.c b/kernel/arm/znrm2.c index c590095e7..fc1c8b54a 100644 --- a/kernel/arm/znrm2.c +++ b/kernel/arm/znrm2.c @@ -57,7 +57,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) BLASLONG inc_x2; FLOAT temp; - if (n < 0 || inc_x < 1 ) return(0.0); + if (n <= 0 || inc_x <= 0) return(0.0); inc_x2 = 2 * inc_x; diff --git a/kernel/arm64/KERNEL.CORTEXA57 b/kernel/arm64/KERNEL.CORTEXA57 new file mode 100644 index 000000000..4d18deefb --- /dev/null +++ b/kernel/arm64/KERNEL.CORTEXA57 @@ -0,0 +1,91 @@ +include $(KERNELDIR)/KERNEL.ARMV8 + +SAMAXKERNEL = amax.S +DAMAXKERNEL = amax.S +CAMAXKERNEL = zamax.S +ZAMAXKERNEL = zamax.S + +ISAMAXKERNEL = isamax.S +IDAMAXKERNEL = idamax.S +ICAMAXKERNEL = izamax.S +IZAMAXKERNEL = izamax.S + +SASUMKERNEL = asum.S +DASUMKERNEL = asum.S +CASUMKERNEL = casum.S +ZASUMKERNEL = zasum.S + +SAXPYKERNEL = axpy.S +DAXPYKERNEL = axpy.S +CAXPYKERNEL = zaxpy.S +ZAXPYKERNEL = zaxpy.S + +SCOPYKERNEL = copy.S +DCOPYKERNEL = copy.S +CCOPYKERNEL = copy.S +ZCOPYKERNEL = copy.S + +DOTKERNEL = dot.S +DDOTKERNEL = dot.S +CDOTKERNEL = zdot.S +ZDOTKERNEL = zdot.S + +SNRM2KERNEL = snrm2.S +DNRM2KERNEL = dnrm2.S +CNRM2KERNEL = znrm2.S +ZNRM2KERNEL = znrm2.S + +SROTKERNEL = rot.S +DROTKERNEL = rot.S +CROTKERNEL = zrot.S +ZROTKERNEL = zrot.S + +SCALKERNEL = scal.S +DSCALKERNEL = scal.S +CSCALKERNEL = zscal.S +ZSCALKERNEL = zscal.S + +SSWAPKERNEL = swap.S +DSWAPKERNEL = swap.S +CSWAPKERNEL = swap.S +ZSWAPKERNEL = swap.S + +SGEMVNKERNEL = gemv_n.S +DGEMVNKERNEL = gemv_n.S +CGEMVNKERNEL = zgemv_n.S +ZGEMVNKERNEL = zgemv_n.S + +SGEMVTKERNEL = gemv_t.S +DGEMVTKERNEL = gemv_t.S +CGEMVTKERNEL = zgemv_t.S +ZGEMVTKERNEL = zgemv_t.S + +STRMMKERNEL = strmm_kernel_4x4.S +DTRMMKERNEL = dtrmm_kernel_4x4.S +CTRMMKERNEL = ctrmm_kernel_4x4.S +ZTRMMKERNEL = ztrmm_kernel_4x4.S + +SGEMMKERNEL = sgemm_kernel_4x4.S +SGEMMONCOPY = ../generic/gemm_ncopy_4.c +SGEMMOTCOPY = ../generic/gemm_tcopy_4.c +SGEMMONCOPYOBJ = sgemm_oncopy.o +SGEMMOTCOPYOBJ = sgemm_otcopy.o + +DGEMMKERNEL = dgemm_kernel_4x4.S +DGEMMONCOPY = ../generic/gemm_ncopy_4.c +DGEMMOTCOPY = ../generic/gemm_tcopy_4.c +DGEMMONCOPYOBJ = dgemm_oncopy.o +DGEMMOTCOPYOBJ = dgemm_otcopy.o + +CGEMMKERNEL = cgemm_kernel_4x4.S +CGEMMONCOPY = ../generic/zgemm_ncopy_4.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c +CGEMMONCOPYOBJ = cgemm_oncopy.o +CGEMMOTCOPYOBJ = cgemm_otcopy.o + +ZGEMMKERNEL = zgemm_kernel_4x4.S +ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c +ZGEMMONCOPYOBJ = zgemm_oncopy.o +ZGEMMOTCOPYOBJ = zgemm_otcopy.o + diff --git a/kernel/arm64/amax.S b/kernel/arm64/amax.S new file mode 100644 index 000000000..c02321ae0 --- /dev/null +++ b/kernel/arm64/amax.S @@ -0,0 +1,249 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N x0 /* vector length */ +#define X x1 /* X vector address */ +#define INC_X x2 /* X stride */ +#define I x5 /* loop variable */ + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +#if defined(USE_MIN) +#define COND le +#else +#define COND ge +#endif + +#if !defined(DOUBLE) +#define REG0 wzr +#define MAXF s0 +#define TMPF s1 +#define TMPVF {v1.s}[0] +#define SZ 4 +#else +#define REG0 xzr +#define MAXF d0 +#define TMPF d1 +#define TMPVF {v1.d}[0] +#define SZ 8 +#endif + +/******************************************************************************/ + +.macro INIT_F1 + ldr MAXF, [X], #SZ +#if defined(USE_ABS) + fabs MAXF, MAXF +#endif +.endm + +.macro KERNEL_F1 + ldr TMPF, [X], #SZ +#if defined(USE_ABS) + fabs TMPF, TMPF +#endif + fcmp MAXF, TMPF + fcsel MAXF, MAXF, TMPF, COND +.endm + +.macro INIT_F4 +#if !defined(DOUBLE) + ld1 {v0.4s}, [X], #16 +#if defined(USE_ABS) + fabs v0.4s, v0.4s +#endif +#if defined(USE_MIN) + fminv MAXF, v0.4s +#else + fmaxv MAXF, v0.4s +#endif +#else // DOUBLE + ld2 {v0.2d,v1.2d}, [X], #32 +#if defined(USE_ABS) + fabs v0.2d, v0.2d + fabs v1.2d, v1.2d +#endif +#if defined(USE_MIN) + fmin v0.2d, v0.2d, v1.2d + fminp MAXF, v0.2d +#else + fmax v0.2d, v0.2d, v1.2d + fmaxp MAXF, v0.2d +#endif +#endif +.endm + +.macro KERNEL_F4 +#if !defined(DOUBLE) + ld1 {v1.4s}, [X], #16 +#if defined(USE_ABS) + fabs v1.4s, v1.4s +#endif +#if defined(USE_MIN) + fminv TMPF, v1.4s +#else + fmaxv TMPF, v1.4s +#endif +#else // DOUBLE + ld2 {v1.2d,v2.2d}, [X], #32 +#if defined(USE_ABS) + fabs v1.2d, v1.2d + fabs v2.2d, v2.2d +#endif +#if defined(USE_MIN) + fmin v1.2d, v1.2d, v2.2d + fminp TMPF, v1.2d +#else + fmax v1.2d, v1.2d, v2.2d + fmaxp TMPF, v1.2d +#endif +#endif + fcmp MAXF, TMPF + fcsel MAXF, MAXF, TMPF, COND +.endm + +.macro INIT_S +#if !defined(DOUBLE) + lsl INC_X, INC_X, #2 + ld1 {v0.s}[0], [X], INC_X +#else + lsl INC_X, INC_X, #3 + ld1 {v0.d}[0], [X], INC_X +#endif +#if defined(USE_ABS) + fabs MAXF, MAXF +#endif +.endm + +.macro KERNEL_S1 
+ ld1 TMPVF, [X], INC_X +#if defined(USE_ABS) + fabs TMPF, TMPF +#endif + fcmp MAXF, TMPF + fcsel MAXF, MAXF, TMPF, COND +.endm + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + cmp N, xzr + ble amax_kernel_zero + cmp INC_X, xzr + ble amax_kernel_zero + + cmp INC_X, #1 + bne amax_kernel_S_BEGIN + +amax_kernel_F_BEGIN: + + asr I, N, #2 + cmp I, xzr + beq amax_kernel_F1_INIT + + INIT_F4 + subs I, I, #1 + beq amax_kernel_F1 + +amax_kernel_F4: + + KERNEL_F4 + + subs I, I, #1 + bne amax_kernel_F4 + +amax_kernel_F1: + + ands I, N, #3 + ble amax_kernel_L999 + +amax_kernel_F10: + + KERNEL_F1 + + subs I, I, #1 + bne amax_kernel_F10 + + ret + +amax_kernel_F1_INIT: + + INIT_F1 + subs N, N, #1 + b amax_kernel_F1 + +amax_kernel_S_BEGIN: + + INIT_S + + subs N, N, #1 + ble amax_kernel_L999 + + asr I, N, #2 + cmp I, xzr + ble amax_kernel_S1 + +amax_kernel_S4: + + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + + subs I, I, #1 + bne amax_kernel_S4 + +amax_kernel_S1: + + ands I, N, #3 + ble amax_kernel_L999 + +amax_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne amax_kernel_S10 + +amax_kernel_L999: + + ret + +amax_kernel_zero: + + fmov MAXF, REG0 + ret + + EPILOGUE diff --git a/kernel/arm64/asum.S b/kernel/arm64/asum.S new file mode 100644 index 000000000..bee8927b1 --- /dev/null +++ b/kernel/arm64/asum.S @@ -0,0 +1,194 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N x0 /* vector length */ +#define X x1 /* X vector address */ +#define INC_X x2 /* X stride */ +#define I x5 /* loop variable */ + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +#if !defined(DOUBLE) +#define REG0 wzr +#define SUMF s0 +#define TMPF s1 +#define TMPVF {v1.s}[0] +#define SZ 4 +#else +#define REG0 xzr +#define SUMF d0 +#define TMPF d1 +#define TMPVF {v1.d}[0] +#define SZ 8 +#endif + +/******************************************************************************/ + +.macro KERNEL_F1 + ldr TMPF, [X], #SZ + fabs TMPF, TMPF + fadd SUMF, SUMF, TMPF +.endm + +.macro KERNEL_F8 +#if !defined(DOUBLE) + ld1 {v1.4s, v2.4s}, [X], #32 // Load [X3, X2, X1, X0] + fabs v1.4s, v1.4s // ABS() each value + fabs v2.4s, v2.4s // ABS() each value + fadd v1.4s, v1.4s, v2.4s // [X3+X1, X2+X0] + fadd v0.4s, v0.4s, v1.4s // [X3+X1, X2+X0] + PRFM PLDL1KEEP, [X, #1024] +#else // DOUBLE + ld1 {v2.2d, v3.2d, v4.2d, v5.2d}, [X] + add X, X, #64 + fabs v2.2d, v2.2d + fabs v3.2d, v3.2d + fabs v4.2d, v4.2d + fabs v5.2d, v5.2d + + PRFM PLDL1KEEP, [X, #1024] + + fadd v2.2d, v2.2d, v3.2d + fadd v4.2d, v4.2d, v5.2d + fadd v0.2d, v0.2d, v2.2d + fadd v0.2d, v0.2d, v4.2d +#endif +.endm + +.macro KERNEL_F8_FINALIZE +#if !defined(DOUBLE) + ext v1.16b, v0.16b, v0.16b, #8 + fadd v0.2s, v0.2s, v1.2s + faddp SUMF, v0.2s +#else + faddp SUMF, v0.2d +#endif +.endm + +.macro INIT_S +#if !defined(DOUBLE) + lsl INC_X, INC_X, #2 +#else + lsl INC_X, INC_X, #3 +#endif +.endm + +.macro KERNEL_S1 + ld1 TMPVF, [X], INC_X + fabs TMPF, TMPF + fadd SUMF, SUMF, TMPF +.endm + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + fmov SUMF, REG0 +#if !defined(DOUBLE) + fmov s1, SUMF +#else + fmov d1, SUMF +#endif + + cmp N, xzr + ble asum_kernel_L999 + cmp INC_X, xzr + ble asum_kernel_L999 + + cmp INC_X, #1 + bne asum_kernel_S_BEGIN + +asum_kernel_F_BEGIN: + + asr I, N, #3 + cmp I, xzr + beq asum_kernel_F1 + +asum_kernel_F8: + + KERNEL_F8 + + subs I, I, #1 + bne asum_kernel_F8 + + KERNEL_F8_FINALIZE + +asum_kernel_F1: + + ands I, N, #7 + ble asum_kernel_L999 + +asum_kernel_F10: + + KERNEL_F1 + + subs I, I, #1 + bne asum_kernel_F10 + +asum_kernel_L999: + ret + +asum_kernel_S_BEGIN: + + INIT_S + + asr I, N, #2 + cmp I, xzr + ble asum_kernel_S1 + +asum_kernel_S4: + + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + + subs I, I, #1 + bne asum_kernel_S4 + +asum_kernel_S1: + + ands I, N, #3 + ble asum_kernel_L999 + +asum_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne asum_kernel_S10 + + ret + + EPILOGUE diff --git a/kernel/arm64/axpy.S b/kernel/arm64/axpy.S new file mode 100644 index 000000000..554902c09 --- /dev/null +++ b/kernel/arm64/axpy.S @@ -0,0 +1,209 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N x0 /* vector length */ +#define X x3 /* X vector address */ +#define INC_X x4 /* X stride */ +#define Y x5 /* Y vector address */ +#define INC_Y x6 /* Y stride */ +#define I x1 /* loop variable */ + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +#if !defined(DOUBLE) +#define DA s0 /* scale input value */ +#define TMPX s1 +#define TMPVX {v1.s}[0] +#define TMPY s2 +#define TMPVY {v2.s}[0] +#define SZ 4 +#else +#define DA d0 /* scale input value */ +#define TMPX d1 +#define TMPVX {v1.d}[0] +#define TMPY d2 +#define TMPVY {v2.d}[0] +#define SZ 8 +#endif + +/******************************************************************************/ + +.macro KERNEL_F1 + + ldr TMPX, [X], #SZ + ldr TMPY, [Y] + fmadd TMPY, TMPX, DA, TMPY + str TMPY, [Y], #SZ + +.endm + +.macro KERNEL_F4 + +#if !defined(DOUBLE) + ld1 {v1.4s}, [X], #16 + ld1 {v2.4s}, [Y] + fmla v2.4s, v1.4s, v0.s[0] + st1 {v2.4s}, [Y], #16 +#else // DOUBLE + ld1 {v1.2d, v2.2d}, [X], #32 + ld1 {v3.2d, v4.2d}, [Y] + fmla v3.2d, v1.2d, v0.d[0] + fmla v4.2d, v2.2d, v0.d[0] + st1 {v3.2d, v4.2d}, [Y], #32 +#endif + +.endm + +.macro KERNEL_F8 +#if !defined(DOUBLE) + ld1 {v1.4s, v2.4s}, [X], #32 + ld1 {v3.4s, v4.4s}, [Y] + + fmla v3.4s, v1.4s, v0.s[0] + fmla v4.4s, v2.4s, v0.s[0] + + st1 {v3.4s, v4.4s}, [Y], #32 +#else // DOUBLE + ld1 {v1.2d, v2.2d, v3.2d, v4.2d}, [X], #64 + ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [Y] + + fmla v16.2d, v1.2d, v0.d[0] + fmla v17.2d, v2.2d, v0.d[0] + fmla v18.2d, v3.2d, v0.d[0] + fmla v19.2d, v4.2d, v0.d[0] + + st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [Y], #64 +#endif + PRFM PLDL1KEEP, [X, #512] + PRFM PLDL1KEEP, [Y, #512] +.endm + +.macro INIT_S + +#if !defined(DOUBLE) + lsl INC_X, INC_X, #2 + lsl INC_Y, INC_Y, #2 +#else + lsl INC_X, INC_X, #3 + lsl INC_Y, INC_Y, #3 +#endif + +.endm + +.macro KERNEL_S1 + + ld1 TMPVX, [X], INC_X + ldr TMPY, [Y] + fmadd TMPY, TMPX, DA, TMPY + st1 TMPVY, [Y], INC_Y + +.endm + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + 
+ cmp N, xzr + ble axpy_kernel_L999 + + fcmp DA, #0.0 + beq axpy_kernel_L999 + + cmp INC_X, #1 + bne axpy_kernel_S_BEGIN + cmp INC_Y, #1 + bne axpy_kernel_S_BEGIN + +axpy_kernel_F_BEGIN: + + asr I, N, #3 + cmp I, xzr + beq axpy_kernel_F1 + +axpy_kernel_F8: + + KERNEL_F8 + + subs I, I, #1 + bne axpy_kernel_F8 + +axpy_kernel_F1: + + ands I, N, #7 + ble axpy_kernel_L999 + +axpy_kernel_F10: + + KERNEL_F1 + + subs I, I, #1 + bne axpy_kernel_F10 + + mov w0, wzr + ret + +axpy_kernel_S_BEGIN: + + INIT_S + + asr I, N, #2 + cmp I, xzr + ble axpy_kernel_S1 + +axpy_kernel_S4: + + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + + subs I, I, #1 + bne axpy_kernel_S4 + +axpy_kernel_S1: + + ands I, N, #3 + ble axpy_kernel_L999 + +axpy_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne axpy_kernel_S10 + +axpy_kernel_L999: + + mov w0, wzr + ret diff --git a/kernel/arm64/casum.S b/kernel/arm64/casum.S new file mode 100644 index 000000000..8f09eecfa --- /dev/null +++ b/kernel/arm64/casum.S @@ -0,0 +1,170 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N x0 /* vector length */ +#define X x1 /* X vector address */ +#define INC_X x2 /* X stride */ +#define I x5 /* loop variable */ + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +#define REG0 wzr +#define SUMF s0 +#define TMPF s1 +#define TMPVF {v1.s}[0] +#define SZ 4 + +/******************************************************************************/ + +.macro KERNEL_F1 + ld1 {v1.2s}, [X], #8 + fabs v1.2s, v1.2s + ext v2.8b, v1.8b, v1.8b, #4 + fadd TMPF, TMPF, s2 + fadd SUMF, SUMF, TMPF +.endm + +.macro KERNEL_F8 + ld1 {v1.4s, v2.4s, v3.4s, v4.4s}, [X] + add X, X, #64 + fabs v1.4s, v1.4s + fabs v2.4s, v2.4s + fabs v3.4s, v3.4s + fabs v4.4s, v4.4s + + PRFM PLDL1KEEP, [X, #1024] + + fadd v1.4s, v1.4s, v2.4s + fadd v3.4s, v3.4s, v4.4s + fadd v0.4s, v0.4s, v1.4s + fadd v0.4s, v0.4s, v3.4s +.endm + +.macro KERNEL_F8_FINALIZE + ext v1.16b, v0.16b, v0.16b, #8 + fadd v0.2s, v0.2s, v1.2s + faddp SUMF, v0.2s +.endm + +.macro INIT_S + lsl INC_X, INC_X, #3 +.endm + +.macro KERNEL_S1 + ld1 {v1.2s}, [X], INC_X + fabs v1.2s, v1.2s + ext v2.8b, v1.8b, v1.8b, #4 + fadd TMPF, TMPF, s2 + fadd SUMF, SUMF, TMPF + +.endm + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + fmov SUMF, REG0 + fmov s1, SUMF + + cmp N, xzr + ble asum_kernel_L999 + cmp INC_X, xzr + ble asum_kernel_L999 + + cmp INC_X, #1 + bne asum_kernel_S_BEGIN + +asum_kernel_F_BEGIN: + + asr I, N, #3 + cmp I, xzr + beq asum_kernel_F1 + +asum_kernel_F8: + + KERNEL_F8 + + subs I, I, #1 + bne asum_kernel_F8 + + KERNEL_F8_FINALIZE + +asum_kernel_F1: + + ands I, N, #7 + ble asum_kernel_L999 + +asum_kernel_F10: + + KERNEL_F1 + + subs I, I, #1 + bne asum_kernel_F10 + +asum_kernel_L999: + ret + +asum_kernel_S_BEGIN: + + INIT_S + + asr I, N, #2 + cmp I, xzr + ble asum_kernel_S1 + +asum_kernel_S4: + + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + + subs I, I, #1 + bne asum_kernel_S4 + +asum_kernel_S1: + + ands I, N, #3 + ble asum_kernel_L999 + +asum_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne asum_kernel_S10 + + ret + + EPILOGUE diff --git a/kernel/arm64/cgemm_kernel_4x4.S b/kernel/arm64/cgemm_kernel_4x4.S new file mode 100644 index 000000000..cec238467 --- /dev/null +++ b/kernel/arm64/cgemm_kernel_4x4.S @@ -0,0 +1,1667 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +/* X0 X1 X2 s0 X3 x4 x5 x6 */ +/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc */ + +#define origM x0 +#define origN x1 +#define origK x2 +#define origPA x3 +#define origPB x4 +#define pC x5 +#define LDC x6 +#define temp x7 +#define counterL x8 +#define counterI x9 +#define counterJ x10 +#define pB x11 +#define pCRow0 x12 +#define pCRow1 x13 +#define pCRow2 x14 +#define pA x15 +#define ppC x16 +#define ppA x17 + +#define alpha0_R s10 +#define alphaV0_R v10.s[0] +#define alpha0_I s11 +#define alphaV0_I v11.s[0] + +#define alpha1_R s14 +#define alphaV1_R v14.s[0] +#define alpha1_I s15 +#define alphaV1_I v15.s[0] + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define OP_rr fmla +#define OP_ii fmls +#define OP_ri fmla +#define OP_ir fmla +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define OP_rr fmla +#define OP_ii fmla +#define OP_ri fmls +#define OP_ir fmla +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define OP_rr fmla +#define OP_ii fmla +#define OP_ri fmla +#define OP_ir fmls +#elif defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define OP_rr fmla +#define OP_ii fmls +#define OP_ri fmls +#define OP_ir fmls +#endif + +// 00 origM +// 01 origN +// 02 origK +// 03 origPA +// 04 origPB +// 05 pC +// 06 origLDC -> LDC +// 07 offset -> temp +// 08 counterL +// 09 counterI +// 10 counterJ +// 11 pB +// 12 pCRow0 +// 13 pCRow1 +// 14 pCRow2 +// 15 pA +// 16 ppC +// 17 ppA +// 18 must save +// 19 must save +// 20 must save +// 21 must save +// 22 must save +// 23 must save +// 24 must save +// 25 must save +// 26 must save +// 27 must save +// 28 must save +// 29 frame +// 30 link +// 31 sp + +//v00 ALPHA_R -> pA00_R, pA01_R, pA02_R, pA03_R +//v01 ALPHA_I -> pA00_I, pA01_I, pA02_I, pA03_I +//v02 ppA00_R, ppA01_R, ppA02_R, ppA03_R +//v03 ppA00_I, ppA01_I, ppA02_I, ppA03_I +//v04 pA10_R, pA11_R, pA12_R, pA13_R +//v05 pA10_I, pA11_I, pA12_I, pA13_I +//v06 ppA10_R, ppA11_R, ppA12_R, ppA13_R +//v07 ppA10_I, ppA11_I, ppA12_I, ppA13_I +//v08 must save pB00_R, pB01_R, pB02_R, pB03_R +//v09 must save pB00_I, pB01_I, pB02_I, pB03_I +//v10 must save ALPHA0_R +//v11 must save ALPHA0_I +//v12 must save pB10_R, pB11_R, pB12_R, pB13_R +//v13 must save pB10_I, pB11_I, pB12_I, pB13_I +//v14 must save ALPHA1_R +//v15 must save ALPHA1_I +//v16 must save pC00_R, pC01_R, pC02_R, pC03_R +//v17 must save pC00_I, pC01_I, pC02_I, pC03_I +//v18 ppC00_R, ppC01_R, ppC02_R, ppC03_R +//v19 ppC00_I, ppC01_I, ppC02_I, ppC03_I +//v20 pC10_R, pC11_R, pC12_R, pC13_R +//v21 pC10_I, pC11_I, pC12_I, pC13_I +//v22 ppC10_R, ppC11_R, 
ppC12_R, ppC13_R +//v23 ppC10_I, ppC11_I, ppC12_I, ppC13_I +//v24 pC20_R, pC21_R, pC22_R, pC23_R +//v25 pC20_I, pC21_I, pC22_I, pC23_I +//v26 ppC20_R, ppC21_R, ppC22_R, ppC23_R +//v27 ppC20_I, ppC21_I, ppC22_I, ppC23_I +//v28 pC30_R, pC31_R, pC32_R, pC33_R +//v29 pC30_I, pC31_I, pC32_I, pC33_I +//v30 ppC30_R, ppC31_R, ppC32_R, ppC33_R +//v31 ppC30_I, ppC31_I, ppC32_I, ppC33_I + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +.macro INIT8x4 + fmov s16, wzr + fmov s17, s16 + fmov s18, s17 + fmov s19, s16 + fmov s20, s17 + fmov s21, s16 + fmov s22, s17 + fmov s23, s16 + fmov s24, s17 + fmov s25, s16 + fmov s26, s17 + fmov s27, s16 + fmov s28, s17 + fmov s29, s16 + fmov s30, s17 + fmov s31, s16 +.endm + +.macro KERNEL8x4_I + ld2 {v8.4s, v9.4s}, [pB] + add pB, pB, #32 + ld2 {v0.4s, v1.4s}, [pA] + add pA, pA, #32 + ld2 {v2.4s, v3.4s}, [ppA] + add ppA, ppA, #32 + + fmul v16.4s, v0.4s, v8.4s[0] + OP_ii v16.4s, v1.4s, v9.4s[0] + fmul v17.4s, v0.4s, v9.4s[0] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + fneg v17.4s, v17.4s +#endif + OP_ir v17.4s, v1.4s, v8.4s[0] + + fmul v20.4s, v0.4s, v8.4s[1] + OP_ii v20.4s, v1.4s, v9.4s[1] + fmul v21.4s, v0.4s, v9.4s[1] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + fneg v21.4s, v21.4s +#endif + OP_ir v21.4s, v1.4s, v8.4s[1] + + fmul v24.4s, v0.4s, v8.4s[2] + OP_ii v24.4s, v1.4s, v9.4s[2] + fmul v25.4s, v0.4s, v9.4s[2] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + fneg v25.4s, v25.4s +#endif + OP_ir v25.4s, v1.4s, v8.4s[2] + + fmul v28.4s, v0.4s, v8.4s[3] + OP_ii v28.4s, v1.4s, v9.4s[3] + fmul v29.4s, v0.4s, v9.4s[3] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + fneg v29.4s, v29.4s +#endif + OP_ir v29.4s, v1.4s, v8.4s[3] + + fmul v18.4s, v2.4s, v8.4s[0] + OP_ii v18.4s, v3.4s, v9.4s[0] + fmul v19.4s, v2.4s, v9.4s[0] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + fneg v19.4s, v19.4s +#endif + OP_ir v19.4s, v3.4s, v8.4s[0] + + fmul v22.4s, v2.4s, v8.4s[1] + OP_ii v22.4s, v3.4s, v9.4s[1] + fmul v23.4s, v2.4s, v9.4s[1] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + fneg v23.4s, v23.4s +#endif + OP_ir v23.4s, v3.4s, v8.4s[1] + + fmul v26.4s, v2.4s, v8.4s[2] + OP_ii v26.4s, v3.4s, v9.4s[2] + fmul v27.4s, v2.4s, v9.4s[2] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + fneg v27.4s, v27.4s +#endif + OP_ir v27.4s, v3.4s, v8.4s[2] + + fmul v30.4s, v2.4s, v8.4s[3] + OP_ii v30.4s, v3.4s, v9.4s[3] + fmul v31.4s, v2.4s, v9.4s[3] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + fneg v31.4s, v31.4s +#endif + OP_ir v31.4s, v3.4s, v8.4s[3] + + ld2 {v12.4s, v13.4s}, [pB] + add pB, pB, #32 + ld2 {v4.4s, v5.4s} , [pA] + add pA, pA, #32 + ld2 {v6.4s, v7.4s} , [ppA] + add ppA, ppA, #32 +.endm + +.macro KERNEL8x4_M1 + OP_rr v16.4s, v0.4s, v8.4s[0] + OP_ii v16.4s, v1.4s, v9.4s[0] + OP_ri v17.4s, v0.4s, v9.4s[0] + 
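+	// the OP_ri above and OP_ir below together accumulate the imaginary part of
+	// C column 0: c_i += a_r*b_i + a_i*b_r, with signs chosen by the OP_* variant defines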
OP_ir v17.4s, v1.4s, v8.4s[0] + + ld2 {v12.4s, v13.4s}, [pB] // for next round + add pB, pB, #32 + + OP_rr v20.4s, v0.4s, v8.4s[1] + OP_ii v20.4s, v1.4s, v9.4s[1] + OP_ri v21.4s, v0.4s, v9.4s[1] + OP_ir v21.4s, v1.4s, v8.4s[1] + + prfm PLDL1KEEP, [pB, #512] + + OP_rr v24.4s, v0.4s, v8.4s[2] + OP_ii v24.4s, v1.4s, v9.4s[2] + OP_ri v25.4s, v0.4s, v9.4s[2] + OP_ir v25.4s, v1.4s, v8.4s[2] + + ld2 {v4.4s, v5.4s} , [pA] // for next round + add pA, pA, #32 + + OP_rr v28.4s, v0.4s, v8.4s[3] + OP_ii v28.4s, v1.4s, v9.4s[3] + OP_ri v29.4s, v0.4s, v9.4s[3] + OP_ir v29.4s, v1.4s, v8.4s[3] + + prfm PLDL1KEEP, [pA, #512] + + OP_rr v18.4s, v2.4s, v8.4s[0] + OP_ii v18.4s, v3.4s, v9.4s[0] + OP_ri v19.4s, v2.4s, v9.4s[0] + OP_ir v19.4s, v3.4s, v8.4s[0] + + ld2 {v6.4s, v7.4s} , [ppA] // for next round + add ppA, ppA, #32 + + OP_rr v22.4s, v2.4s, v8.4s[1] + OP_ii v22.4s, v3.4s, v9.4s[1] + OP_ri v23.4s, v2.4s, v9.4s[1] + OP_ir v23.4s, v3.4s, v8.4s[1] + + prfm PLDL1KEEP, [ppA, #512] + + OP_rr v26.4s, v2.4s, v8.4s[2] + OP_ii v26.4s, v3.4s, v9.4s[2] + OP_ri v27.4s, v2.4s, v9.4s[2] + OP_ir v27.4s, v3.4s, v8.4s[2] + + OP_rr v30.4s, v2.4s, v8.4s[3] + OP_ii v30.4s, v3.4s, v9.4s[3] + OP_ri v31.4s, v2.4s, v9.4s[3] + OP_ir v31.4s, v3.4s, v8.4s[3] +.endm + +.macro KERNEL8x4_M2 + OP_rr v16.4s, v4.4s, v12.4s[0] + OP_ii v16.4s, v5.4s, v13.4s[0] + OP_ri v17.4s, v4.4s, v13.4s[0] + OP_ir v17.4s, v5.4s, v12.4s[0] + + ld2 {v8.4s, v9.4s}, [pB] // for next round + add pB, pB, #32 + + OP_rr v20.4s, v4.4s, v12.4s[1] + OP_ii v20.4s, v5.4s, v13.4s[1] + OP_ri v21.4s, v4.4s, v13.4s[1] + OP_ir v21.4s, v5.4s, v12.4s[1] + + prfm PLDL1KEEP, [pA, #512] + + OP_rr v24.4s, v4.4s, v12.4s[2] + OP_ii v24.4s, v5.4s, v13.4s[2] + OP_ri v25.4s, v4.4s, v13.4s[2] + OP_ir v25.4s, v5.4s, v12.4s[2] + + ld2 {v0.4s, v1.4s}, [pA] // for next round + add pA, pA, #32 + + OP_rr v28.4s, v4.4s, v12.4s[3] + OP_ii v28.4s, v5.4s, v13.4s[3] + OP_ri v29.4s, v4.4s, v13.4s[3] + OP_ir v29.4s, v5.4s, v12.4s[3] + + prfm PLDL1KEEP, [ppA, #512] + + OP_rr v18.4s, v6.4s, v12.4s[0] + OP_ii v18.4s, v7.4s, v13.4s[0] + OP_ri v19.4s, v6.4s, v13.4s[0] + OP_ir v19.4s, v7.4s, v12.4s[0] + + ld2 {v2.4s, v3.4s}, [ppA] // for next round + add ppA, ppA, #32 + + OP_rr v22.4s, v6.4s, v12.4s[1] + OP_ii v22.4s, v7.4s, v13.4s[1] + OP_ri v23.4s, v6.4s, v13.4s[1] + OP_ir v23.4s, v7.4s, v12.4s[1] + + prfm PLDL1KEEP, [pB, #512] + + OP_rr v26.4s, v6.4s, v12.4s[2] + OP_ii v26.4s, v7.4s, v13.4s[2] + OP_ri v27.4s, v6.4s, v13.4s[2] + OP_ir v27.4s, v7.4s, v12.4s[2] + + OP_rr v30.4s, v6.4s, v12.4s[3] + OP_ii v30.4s, v7.4s, v13.4s[3] + OP_ri v31.4s, v6.4s, v13.4s[3] + OP_ir v31.4s, v7.4s, v12.4s[3] +.endm + +.macro KERNEL8x4_E + OP_rr v16.4s, v4.4s, v12.4s[0] + OP_ii v16.4s, v5.4s, v13.4s[0] + OP_ri v17.4s, v4.4s, v13.4s[0] + OP_ir v17.4s, v5.4s, v12.4s[0] + + OP_rr v20.4s, v4.4s, v12.4s[1] + OP_ii v20.4s, v5.4s, v13.4s[1] + OP_ri v21.4s, v4.4s, v13.4s[1] + OP_ir v21.4s, v5.4s, v12.4s[1] + + OP_rr v24.4s, v4.4s, v12.4s[2] + OP_ii v24.4s, v5.4s, v13.4s[2] + OP_ri v25.4s, v4.4s, v13.4s[2] + OP_ir v25.4s, v5.4s, v12.4s[2] + + OP_rr v28.4s, v4.4s, v12.4s[3] + OP_ii v28.4s, v5.4s, v13.4s[3] + OP_ri v29.4s, v4.4s, v13.4s[3] + OP_ir v29.4s, v5.4s, v12.4s[3] + + OP_rr v18.4s, v6.4s, v12.4s[0] + OP_ii v18.4s, v7.4s, v13.4s[0] + OP_ri v19.4s, v6.4s, v13.4s[0] + OP_ir v19.4s, v7.4s, v12.4s[0] + + OP_rr v22.4s, v6.4s, v12.4s[1] + OP_ii v22.4s, v7.4s, v13.4s[1] + OP_ri v23.4s, v6.4s, v13.4s[1] + OP_ir v23.4s, v7.4s, v12.4s[1] + + OP_rr v26.4s, v6.4s, v12.4s[2] + OP_ii v26.4s, v7.4s, v13.4s[2] + OP_ri v27.4s, v6.4s, 
v13.4s[2] + OP_ir v27.4s, v7.4s, v12.4s[2] + + OP_rr v30.4s, v6.4s, v12.4s[3] + OP_ii v30.4s, v7.4s, v13.4s[3] + OP_ri v31.4s, v6.4s, v13.4s[3] + OP_ir v31.4s, v7.4s, v12.4s[3] +.endm + +.macro KERNEL8x4_SUB + ld2 {v8.4s, v9.4s}, [pB] + add pB, pB, #32 + ld2 {v0.4s, v1.4s}, [pA] + add pA, pA, #32 + + OP_rr v16.4s, v0.4s, v8.4s[0] + OP_ii v16.4s, v1.4s, v9.4s[0] + OP_ri v17.4s, v0.4s, v9.4s[0] + OP_ir v17.4s, v1.4s, v8.4s[0] + + OP_rr v20.4s, v0.4s, v8.4s[1] + OP_ii v20.4s, v1.4s, v9.4s[1] + OP_ri v21.4s, v0.4s, v9.4s[1] + OP_ir v21.4s, v1.4s, v8.4s[1] + + ld2 {v2.4s, v3.4s}, [ppA] + add ppA, ppA, #32 + + OP_rr v24.4s, v0.4s, v8.4s[2] + OP_ii v24.4s, v1.4s, v9.4s[2] + OP_ri v25.4s, v0.4s, v9.4s[2] + OP_ir v25.4s, v1.4s, v8.4s[2] + + OP_rr v28.4s, v0.4s, v8.4s[3] + OP_ii v28.4s, v1.4s, v9.4s[3] + OP_ri v29.4s, v0.4s, v9.4s[3] + OP_ir v29.4s, v1.4s, v8.4s[3] + + OP_rr v18.4s, v2.4s, v8.4s[0] + OP_ii v18.4s, v3.4s, v9.4s[0] + OP_ri v19.4s, v2.4s, v9.4s[0] + OP_ir v19.4s, v3.4s, v8.4s[0] + + OP_rr v22.4s, v2.4s, v8.4s[1] + OP_ii v22.4s, v3.4s, v9.4s[1] + OP_ri v23.4s, v2.4s, v9.4s[1] + OP_ir v23.4s, v3.4s, v8.4s[1] + + OP_rr v26.4s, v2.4s, v8.4s[2] + OP_ii v26.4s, v3.4s, v9.4s[2] + OP_ri v27.4s, v2.4s, v9.4s[2] + OP_ir v27.4s, v3.4s, v8.4s[2] + + OP_rr v30.4s, v2.4s, v8.4s[3] + OP_ii v30.4s, v3.4s, v9.4s[3] + OP_ri v31.4s, v2.4s, v9.4s[3] + OP_ir v31.4s, v3.4s, v8.4s[3] +.endm + +.macro SAVE8x4 + mov pCRow1, pCRow0 + + add pCRow2, pCRow1, #32 + + ld2 {v0.4s, v1.4s}, [pCRow1] + fmla v0.4s, v16.4s, alphaV0_R + fmls v0.4s, v17.4s, alphaV0_I + fmla v1.4s, v16.4s, alphaV1_I + fmla v1.4s, v17.4s, alphaV1_R + st2 {v0.4s, v1.4s}, [pCRow1] + + add pCRow1, pCRow1, LDC + + ld2 {v2.4s, v3.4s}, [pCRow2] + fmla v2.4s, v18.4s, alphaV0_R + fmls v2.4s, v19.4s, alphaV0_I + fmla v3.4s, v18.4s, alphaV1_I + fmla v3.4s, v19.4s, alphaV1_R + st2 {v2.4s, v3.4s}, [pCRow2] + + add pCRow2, pCRow1, #32 + + ld2 {v4.4s, v5.4s}, [pCRow1] + fmla v4.4s, v20.4s, alphaV0_R + fmls v4.4s, v21.4s, alphaV0_I + fmla v5.4s, v20.4s, alphaV1_I + fmla v5.4s, v21.4s, alphaV1_R + st2 {v4.4s, v5.4s}, [pCRow1] + + add pCRow1, pCRow1, LDC + + ld2 {v6.4s, v7.4s}, [pCRow2] + fmla v6.4s, v22.4s, alphaV0_R + fmls v6.4s, v23.4s, alphaV0_I + fmla v7.4s, v22.4s, alphaV1_I + fmla v7.4s, v23.4s, alphaV1_R + st2 {v6.4s, v7.4s}, [pCRow2] + + add pCRow2, pCRow1, #32 + + ld2 {v0.4s, v1.4s}, [pCRow1] + fmla v0.4s, v24.4s, alphaV0_R + fmls v0.4s, v25.4s, alphaV0_I + fmla v1.4s, v24.4s, alphaV1_I + fmla v1.4s, v25.4s, alphaV1_R + st2 {v0.4s, v1.4s}, [pCRow1] + + add pCRow1, pCRow1, LDC + + ld2 {v2.4s, v3.4s}, [pCRow2] + fmla v2.4s, v26.4s, alphaV0_R + fmls v2.4s, v27.4s, alphaV0_I + fmla v3.4s, v26.4s, alphaV1_I + fmla v3.4s, v27.4s, alphaV1_R + st2 {v2.4s, v3.4s}, [pCRow2] + + add pCRow2, pCRow1, #32 + + ld2 {v4.4s, v5.4s}, [pCRow1] + fmla v4.4s, v28.4s, alphaV0_R + fmls v4.4s, v29.4s, alphaV0_I + fmla v5.4s, v28.4s, alphaV1_I + fmla v5.4s, v29.4s, alphaV1_R + st2 {v4.4s, v5.4s}, [pCRow1] + + add pCRow1, pCRow1, LDC + + ld2 {v6.4s, v7.4s}, [pCRow2] + fmla v6.4s, v30.4s, alphaV0_R + fmls v6.4s, v31.4s, alphaV0_I + fmla v7.4s, v30.4s, alphaV1_I + fmla v7.4s, v31.4s, alphaV1_R + st2 {v6.4s, v7.4s}, [pCRow2] + + add pCRow0, pCRow0, #64 +.endm + +/******************************************************************************/ + +.macro INIT4x4 + fmov s16, wzr + fmov s17, s16 + fmov s20, s17 + fmov s21, s16 + fmov s24, s17 + fmov s25, s16 + fmov s28, s17 + fmov s29, s16 +.endm + +.macro KERNEL4x4_SUB + ld2 {v8.4s, v9.4s}, [pB] + add pB, pB, #32 + ld2 {v0.4s, 
v1.4s}, [pA] + add pA, pA, #32 + + OP_rr v16.4s, v0.4s, v8.4s[0] + OP_ii v16.4s, v1.4s, v9.4s[0] + OP_ri v17.4s, v0.4s, v9.4s[0] + OP_ir v17.4s, v1.4s, v8.4s[0] + + OP_rr v20.4s, v0.4s, v8.4s[1] + OP_ii v20.4s, v1.4s, v9.4s[1] + OP_ri v21.4s, v0.4s, v9.4s[1] + OP_ir v21.4s, v1.4s, v8.4s[1] + + OP_rr v24.4s, v0.4s, v8.4s[2] + OP_ii v24.4s, v1.4s, v9.4s[2] + OP_ri v25.4s, v0.4s, v9.4s[2] + OP_ir v25.4s, v1.4s, v8.4s[2] + + OP_rr v28.4s, v0.4s, v8.4s[3] + OP_ii v28.4s, v1.4s, v9.4s[3] + OP_ri v29.4s, v0.4s, v9.4s[3] + OP_ir v29.4s, v1.4s, v8.4s[3] +.endm + +.macro SAVE4x4 + mov pCRow1, pCRow0 + + ld2 {v0.4s, v1.4s}, [pCRow1] + fmla v0.4s, v16.4s, alphaV0_R + fmls v0.4s, v17.4s, alphaV0_I + fmla v1.4s, v16.4s, alphaV1_I + fmla v1.4s, v17.4s, alphaV1_R + st2 {v0.4s, v1.4s}, [pCRow1] + + add pCRow1, pCRow1, LDC + + ld2 {v4.4s, v5.4s}, [pCRow1] + fmla v4.4s, v20.4s, alphaV0_R + fmls v4.4s, v21.4s, alphaV0_I + fmla v5.4s, v20.4s, alphaV1_I + fmla v5.4s, v21.4s, alphaV1_R + st2 {v4.4s, v5.4s}, [pCRow1] + + add pCRow1, pCRow1, LDC + + ld2 {v0.4s, v1.4s}, [pCRow1] + fmla v0.4s, v24.4s, alphaV0_R + fmls v0.4s, v25.4s, alphaV0_I + fmla v1.4s, v24.4s, alphaV1_I + fmla v1.4s, v25.4s, alphaV1_R + st2 {v0.4s, v1.4s}, [pCRow1] + + add pCRow1, pCRow1, LDC + + ld2 {v4.4s, v5.4s}, [pCRow1] + fmla v4.4s, v28.4s, alphaV0_R + fmls v4.4s, v29.4s, alphaV0_I + fmla v5.4s, v28.4s, alphaV1_I + fmla v5.4s, v29.4s, alphaV1_R + st2 {v4.4s, v5.4s}, [pCRow1] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + +.macro INIT2x4 + fmov s16, wzr + fmov s17, wzr + fmov s20, s16 + fmov s21, s17 + fmov s24, s16 + fmov s25, s17 + fmov s28, s16 + fmov s29, s17 +.endm + +.macro KERNEL2x4_SUB + ld2 {v8.4s, v9.4s}, [pB] + add pB, pB, #32 + ld2 {v0.2s, v1.2s}, [pA] + add pA, pA, #16 + + OP_rr v16.2s, v0.2s, v8.4s[0] + OP_ii v16.2s, v1.2s, v9.4s[0] + OP_ri v17.2s, v0.2s, v9.4s[0] + OP_ir v17.2s, v1.2s, v8.4s[0] + + OP_rr v20.2s, v0.2s, v8.4s[1] + OP_ii v20.2s, v1.2s, v9.4s[1] + OP_ri v21.2s, v0.2s, v9.4s[1] + OP_ir v21.2s, v1.2s, v8.4s[1] + + OP_rr v24.2s, v0.2s, v8.4s[2] + OP_ii v24.2s, v1.2s, v9.4s[2] + OP_ri v25.2s, v0.2s, v9.4s[2] + OP_ir v25.2s, v1.2s, v8.4s[2] + + OP_rr v28.2s, v0.2s, v8.4s[3] + OP_ii v28.2s, v1.2s, v9.4s[3] + OP_ri v29.2s, v0.2s, v9.4s[3] + OP_ir v29.2s, v1.2s, v8.4s[3] +.endm + +.macro SAVE2x4 + mov pCRow1, pCRow0 + + ld2 {v0.2s, v1.2s}, [pCRow1] + fmla v0.2s, v16.2s, alphaV0_R + fmls v0.2s, v17.2s, alphaV0_I + fmla v1.2s, v16.2s, alphaV1_I + fmla v1.2s, v17.2s, alphaV1_R + st2 {v0.2s, v1.2s}, [pCRow1] + + add pCRow1, pCRow1, LDC + + ld2 {v4.2s, v5.2s}, [pCRow1] + fmla v4.2s, v20.2s, alphaV0_R + fmls v4.2s, v21.2s, alphaV0_I + fmla v5.2s, v20.2s, alphaV1_I + fmla v5.2s, v21.2s, alphaV1_R + st2 {v4.2s, v5.2s}, [pCRow1] + + add pCRow1, pCRow1, LDC + + ld2 {v0.2s, v1.2s}, [pCRow1] + fmla v0.2s, v24.2s, alphaV0_R + fmls v0.2s, v25.2s, alphaV0_I + fmla v1.2s, v24.2s, alphaV1_I + fmla v1.2s, v25.2s, alphaV1_R + st2 {v0.2s, v1.2s}, [pCRow1] + + add pCRow1, pCRow1, LDC + + ld2 {v4.2s, v5.2s}, [pCRow1] + fmla v4.2s, v28.2s, alphaV0_R + fmls v4.2s, v29.2s, alphaV0_I + fmla v5.2s, v28.2s, alphaV1_I + fmla v5.2s, v29.2s, alphaV1_R + st2 {v4.2s, v5.2s}, [pCRow1] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT1x4 + fmov s16, wzr + fmov s17, wzr + fmov s20, s16 + fmov s21, s17 + fmov s24, s16 + fmov s25, s17 + fmov s28, s16 + fmov s29, s17 +.endm + +.macro KERNEL1x4_SUB + ld2 
{v8.4s, v9.4s}, [pB] + add pB, pB, #32 + ld2 {v0.s, v1.s}[0], [pA] + add pA, pA, #8 + + OP_rr s16, s0, v8.4s[0] + OP_ii s16, s1, v9.4s[0] + OP_ri s17, s0, v9.4s[0] + OP_ir s17, s1, v8.4s[0] + + OP_rr s20, s0, v8.4s[1] + OP_ii s20, s1, v9.4s[1] + OP_ri s21, s0, v9.4s[1] + OP_ir s21, s1, v8.4s[1] + + OP_rr s24, s0, v8.4s[2] + OP_ii s24, s1, v9.4s[2] + OP_ri s25, s0, v9.4s[2] + OP_ir s25, s1, v8.4s[2] + + OP_rr s28, s0, v8.4s[3] + OP_ii s28, s1, v9.4s[3] + OP_ri s29, s0, v9.4s[3] + OP_ir s29, s1, v8.4s[3] +.endm + +.macro SAVE1x4 + mov pCRow1, pCRow0 + + ld2 {v0.s, v1.s}[0], [pCRow1] + fmla s0, s16, alphaV0_R + fmls s0, s17, alphaV0_I + fmla s1, s16, alphaV1_I + fmla s1, s17, alphaV1_R + st2 {v0.s, v1.s}[0], [pCRow1] + + add pCRow1, pCRow1, LDC + + ld2 {v4.s, v5.s}[0], [pCRow1] + fmla s4, s20, alphaV0_R + fmls s4, s21, alphaV0_I + fmla s5, s20, alphaV1_I + fmla s5, s21, alphaV1_R + st2 {v4.s, v5.s}[0], [pCRow1] + + add pCRow1, pCRow1, LDC + + ld2 {v0.s, v1.s}[0], [pCRow1] + fmla s0, s24, alphaV0_R + fmls s0, s25, alphaV0_I + fmla s1, s24, alphaV1_I + fmla s1, s25, alphaV1_R + st2 {v0.s, v1.s}[0], [pCRow1] + + add pCRow1, pCRow1, LDC + + ld2 {v4.s, v5.s}[0], [pCRow1] + fmla s4, s28, alphaV0_R + fmls s4, s29, alphaV0_I + fmla s5, s28, alphaV1_I + fmla s5, s29, alphaV1_R + st2 {v4.s, v5.s}[0], [pCRow1] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ + +.macro INIT4x2 + fmov s16, wzr + fmov s17, wzr + fmov s20, s16 + fmov s21, s17 +.endm + +.macro KERNEL4x2_SUB + ld2 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld2 {v0.4s, v1.4s}, [pA] + add pA, pA, #32 + + OP_rr v16.4s, v0.4s, v8.2s[0] + OP_ii v16.4s, v1.4s, v9.2s[0] + OP_ri v17.4s, v0.4s, v9.2s[0] + OP_ir v17.4s, v1.4s, v8.2s[0] + + OP_rr v20.4s, v0.4s, v8.2s[1] + OP_ii v20.4s, v1.4s, v9.2s[1] + OP_ri v21.4s, v0.4s, v9.2s[1] + OP_ir v21.4s, v1.4s, v8.2s[1] +.endm + +.macro SAVE4x2 + mov pCRow1, pCRow0 + + ld2 {v0.4s, v1.4s}, [pCRow1] + fmla v0.4s, v16.4s, alphaV0_R + fmls v0.4s, v17.4s, alphaV0_I + fmla v1.4s, v16.4s, alphaV1_I + fmla v1.4s, v17.4s, alphaV1_R + st2 {v0.4s, v1.4s}, [pCRow1] + + add pCRow1, pCRow1, LDC + + ld2 {v4.4s, v5.4s}, [pCRow1] + fmla v4.4s, v20.4s, alphaV0_R + fmls v4.4s, v21.4s, alphaV0_I + fmla v5.4s, v20.4s, alphaV1_I + fmla v5.4s, v21.4s, alphaV1_R + st2 {v4.4s, v5.4s}, [pCRow1] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + +.macro INIT2x2 + fmov s16, wzr + fmov s17, wzr + fmov s20, s16 + fmov s21, s17 +.endm + +.macro KERNEL2x2_SUB + ld2 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld2 {v0.2s, v1.2s}, [pA] + add pA, pA, #16 + + OP_rr v16.2s, v0.2s, v8.2s[0] + OP_ii v16.2s, v1.2s, v9.2s[0] + OP_ri v17.2s, v0.2s, v9.2s[0] + OP_ir v17.2s, v1.2s, v8.2s[0] + + OP_rr v20.2s, v0.2s, v8.2s[1] + OP_ii v20.2s, v1.2s, v9.2s[1] + OP_ri v21.2s, v0.2s, v9.2s[1] + OP_ir v21.2s, v1.2s, v8.2s[1] +.endm + +.macro SAVE2x2 + mov pCRow1, pCRow0 + + ld2 {v0.2s, v1.2s}, [pCRow1] + fmla v0.2s, v16.2s, alphaV0_R + fmls v0.2s, v17.2s, alphaV0_I + fmla v1.2s, v16.2s, alphaV1_I + fmla v1.2s, v17.2s, alphaV1_R + st2 {v0.2s, v1.2s}, [pCRow1] + + add pCRow1, pCRow1, LDC + + ld2 {v4.2s, v5.2s}, [pCRow1] + fmla v4.2s, v20.2s, alphaV0_R + fmls v4.2s, v21.2s, alphaV0_I + fmla v5.2s, v20.2s, alphaV1_I + fmla v5.2s, v21.2s, alphaV1_R + st2 {v4.2s, v5.2s}, [pCRow1] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT1x2 + fmov s16, wzr + 
fmov s17, wzr + fmov s20, wzr + fmov s21, wzr +.endm + +.macro KERNEL1x2_SUB + ld2 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld2 {v0.s, v1.s}[0], [pA] + add pA, pA, #8 + + OP_rr s16, s0, v8.2s[0] + OP_ii s16, s1, v9.2s[0] + OP_ri s17, s0, v9.2s[0] + OP_ir s17, s1, v8.2s[0] + + OP_rr s20, s0, v8.2s[1] + OP_ii s20, s1, v9.2s[1] + OP_ri s21, s0, v9.2s[1] + OP_ir s21, s1, v8.2s[1] +.endm + +.macro SAVE1x2 + mov pCRow1, pCRow0 + + ld2 {v0.s, v1.s}[0], [pCRow1] + fmla s0, s16, alphaV0_R + fmls s0, s17, alphaV0_I + fmla s1, s16, alphaV1_I + fmla s1, s17, alphaV1_R + st2 {v0.s, v1.s}[0], [pCRow1] + + add pCRow1, pCRow1, LDC + + ld2 {v4.s, v5.s}[0], [pCRow1] + fmla s4, s20, alphaV0_R + fmls s4, s21, alphaV0_I + fmla s5, s20, alphaV1_I + fmla s5, s21, alphaV1_R + st2 {v4.s, v5.s}[0], [pCRow1] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ + +.macro INIT4x1 + fmov s16, wzr + fmov s17, s16 +.endm + +.macro KERNEL4x1_SUB + ld2 {v8.s, v9.s}[0], [pB] + add pB, pB, #8 + ld2 {v0.4s, v1.4s}, [pA] + add pA, pA, #32 + + OP_rr v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v9.s[0] + OP_ri v17.4s, v0.4s, v9.s[0] + OP_ir v17.4s, v1.4s, v8.s[0] +.endm + +.macro SAVE4x1 + mov pCRow1, pCRow0 + + ld2 {v0.4s, v1.4s}, [pCRow1] + fmla v0.4s, v16.4s, alphaV0_R + fmls v0.4s, v17.4s, alphaV0_I + fmla v1.4s, v16.4s, alphaV1_I + fmla v1.4s, v17.4s, alphaV1_R + st2 {v0.4s, v1.4s}, [pCRow1] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + +.macro INIT2x1 + fmov s16, wzr + fmov s17, wzr +.endm + +.macro KERNEL2x1_SUB + ld2 {v8.s, v9.s}[0], [pB] + add pB, pB, #8 + ld2 {v0.2s, v1.2s}, [pA] + add pA, pA, #16 + + OP_rr v16.2s, v0.2s, v8.s[0] + OP_ii v16.2s, v1.2s, v9.s[0] + OP_ri v17.2s, v0.2s, v9.s[0] + OP_ir v17.2s, v1.2s, v8.s[0] +.endm + +.macro SAVE2x1 + mov pCRow1, pCRow0 + + ld2 {v0.2s, v1.2s}, [pCRow1] + fmla v0.2s, v16.2s, alphaV0_R + fmls v0.2s, v17.2s, alphaV0_I + fmla v1.2s, v16.2s, alphaV1_I + fmla v1.2s, v17.2s, alphaV1_R + st2 {v0.2s, v1.2s}, [pCRow1] + + add pCRow0, pCRow0, #16 + +.endm + +/******************************************************************************/ + +.macro INIT1x1 + fmov s16, wzr + fmov s17, wzr +.endm + +.macro KERNEL1x1_SUB + ld2 {v8.s, v9.s}[0], [pB] + add pB, pB, #8 + ld2 {v0.s, v1.s}[0], [pA] + add pA, pA, #8 + + OP_rr s16, s0, v8.s[0] + OP_ii s16, s1, v9.s[0] + OP_ri s17, s0, v9.s[0] + OP_ir s17, s1, v8.s[0] +.endm + +.macro SAVE1x1 + mov pCRow1, pCRow0 + + ld2 {v0.s, v1.s}[0], [pCRow1] + fmla s0, s16, alphaV0_R + fmls s0, s17, alphaV0_I + fmla s1, s16, alphaV1_I + fmla s1, s17, alphaV1_R + st2 {v0.s, v1.s}[0], [pCRow1] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + .align 5 + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] + + fmov alpha0_R, s0 + fmov alpha0_I, s1 + fmov alpha1_R, s0 + fmov alpha1_I, s1 + + lsl LDC, LDC, #3 // ldc = ldc * 8 + + mov pB, origPB + + mov counterJ, origN + asr counterJ, counterJ, #2 // J = J / 4 + cmp 
counterJ, #0 + ble cgemm_kernel_L2_BEGIN + +/******************************************************************************/ + +cgemm_kernel_L4_BEGIN: + mov pCRow0, pC // pCRow0 = C + add pC, pC, LDC, lsl #2 + + lsl temp, origK, #5 // k * 4 * 8 + mov pA, origPA // pA = start of A array + add ppA, temp, pA + +cgemm_kernel_L4_M8_BEGIN: + + mov counterI, origM + asr counterI, counterI, #3 // counterI = counterI / 8 + cmp counterI, #0 + ble cgemm_kernel_L4_M4_BEGIN + +cgemm_kernel_L4_M8_20: + + mov pB, origPB + asr counterL , origK, #1 // L = K / 2 + cmp counterL , #2 // is there at least 4 to do? + blt cgemm_kernel_L4_M8_32 + + KERNEL8x4_I // do one in the K + KERNEL8x4_M2 // do another in the K + + subs counterL, counterL, #2 // subtract 2 + ble cgemm_kernel_L4_M8_22a + .align 5 + +cgemm_kernel_L4_M8_22: + + KERNEL8x4_M1 + KERNEL8x4_M2 + + subs counterL, counterL, #1 + bgt cgemm_kernel_L4_M8_22 + + +cgemm_kernel_L4_M8_22a: + + KERNEL8x4_M1 + KERNEL8x4_E + + b cgemm_kernel_L4_M8_44 + +cgemm_kernel_L4_M8_32: + + tst counterL, #1 + ble cgemm_kernel_L4_M8_40 + + KERNEL8x4_I + KERNEL8x4_E + + b cgemm_kernel_L4_M8_44 + + +cgemm_kernel_L4_M8_40: + + INIT8x4 + +cgemm_kernel_L4_M8_44: + + ands counterL , origK, #1 + ble cgemm_kernel_L4_M8_100 + +cgemm_kernel_L4_M8_46: + KERNEL8x4_SUB + +cgemm_kernel_L4_M8_100: + + SAVE8x4 + +cgemm_kernel_L4_M8_END: + lsl temp, origK, #5 // k * 4 * 8 + add pA, pA, temp + add ppA, ppA, temp + subs counterI, counterI, #1 + bne cgemm_kernel_L4_M8_20 + + +cgemm_kernel_L4_M4_BEGIN: + mov counterI, origM + tst counterI , #7 + ble cgemm_kernel_L4_END + + tst counterI, #4 + ble cgemm_kernel_L4_M2_BEGIN + +cgemm_kernel_L4_M4_20: + + INIT4x4 + + mov pB, origPB + asr counterL, origK, #3 // counterL = counterL / 8 + cmp counterL, #0 + ble cgemm_kernel_L4_M4_40 + +cgemm_kernel_L4_M4_22: + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + subs counterL, counterL, #1 + bgt cgemm_kernel_L4_M4_22 + + +cgemm_kernel_L4_M4_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble cgemm_kernel_L4_M4_100 + +cgemm_kernel_L4_M4_42: + + KERNEL4x4_SUB + + subs counterL, counterL, #1 + bgt cgemm_kernel_L4_M4_42 + +cgemm_kernel_L4_M4_100: + + SAVE4x4 + +cgemm_kernel_L4_M4_END: + + +cgemm_kernel_L4_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble cgemm_kernel_L4_END + + tst counterI, #2 // counterI = counterI / 2 + ble cgemm_kernel_L4_M1_BEGIN + +cgemm_kernel_L4_M2_20: + + INIT2x4 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble cgemm_kernel_L4_M2_40 + +cgemm_kernel_L4_M2_22: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + subs counterL, counterL, #1 + bgt cgemm_kernel_L4_M2_22 + + +cgemm_kernel_L4_M2_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble cgemm_kernel_L4_M2_100 + +cgemm_kernel_L4_M2_42: + + KERNEL2x4_SUB + + subs counterL, counterL, #1 + bgt cgemm_kernel_L4_M2_42 + +cgemm_kernel_L4_M2_100: + + SAVE2x4 + +cgemm_kernel_L4_M2_END: + + +cgemm_kernel_L4_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble cgemm_kernel_L4_END + +cgemm_kernel_L4_M1_20: + + INIT1x4 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble cgemm_kernel_L4_M1_40 + +cgemm_kernel_L4_M1_22: + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + 
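+	// loop body unrolled by 8 (counterL = K/8); the K%8 tail is handled at cgemm_kernel_L4_M1_40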
KERNEL1x4_SUB + + subs counterL, counterL, #1 + bgt cgemm_kernel_L4_M1_22 + + +cgemm_kernel_L4_M1_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble cgemm_kernel_L4_M1_100 + +cgemm_kernel_L4_M1_42: + + KERNEL1x4_SUB + + subs counterL, counterL, #1 + bgt cgemm_kernel_L4_M1_42 + +cgemm_kernel_L4_M1_100: + + SAVE1x4 + + +cgemm_kernel_L4_END: + + lsl temp, origK, #5 + add origPB, origPB, temp // B = B + K * 4 * 8 + + subs counterJ, counterJ , #1 // j-- + bgt cgemm_kernel_L4_BEGIN + + +/******************************************************************************/ + +cgemm_kernel_L2_BEGIN: // less than 2 left in N direction + + mov counterJ , origN + tst counterJ , #3 + ble cgemm_kernel_L999 // error, N was less than 4? + + tst counterJ , #2 + ble cgemm_kernel_L1_BEGIN + + mov pCRow0, pC // pCRow0 = pC + + add pC,pC,LDC, lsl #1 + + mov pA, origPA // pA = A + + + +cgemm_kernel_L2_M4_BEGIN: + + mov counterI, origM + asr counterI, counterI, #2 // counterI = counterI / 4 + cmp counterI,#0 + ble cgemm_kernel_L2_M2_BEGIN + +cgemm_kernel_L2_M4_20: + + INIT4x2 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble cgemm_kernel_L2_M4_40 + .align 5 + +cgemm_kernel_L2_M4_22: + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + subs counterL, counterL, #1 + bgt cgemm_kernel_L2_M4_22 + + +cgemm_kernel_L2_M4_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble cgemm_kernel_L2_M4_100 + +cgemm_kernel_L2_M4_42: + + KERNEL4x2_SUB + + subs counterL, counterL, #1 + bgt cgemm_kernel_L2_M4_42 + +cgemm_kernel_L2_M4_100: + + SAVE4x2 + +cgemm_kernel_L2_M4_END: + + subs counterI, counterI, #1 + bgt cgemm_kernel_L2_M4_20 + + +cgemm_kernel_L2_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble cgemm_kernel_L2_END + + tst counterI, #2 // counterI = counterI / 2 + ble cgemm_kernel_L2_M1_BEGIN + +cgemm_kernel_L2_M2_20: + + INIT2x2 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble cgemm_kernel_L2_M2_40 + +cgemm_kernel_L2_M2_22: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + subs counterL, counterL, #1 + bgt cgemm_kernel_L2_M2_22 + + +cgemm_kernel_L2_M2_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble cgemm_kernel_L2_M2_100 + +cgemm_kernel_L2_M2_42: + + KERNEL2x2_SUB + + subs counterL, counterL, #1 + bgt cgemm_kernel_L2_M2_42 + +cgemm_kernel_L2_M2_100: + + SAVE2x2 + +cgemm_kernel_L2_M2_END: + + +cgemm_kernel_L2_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble cgemm_kernel_L2_END + +cgemm_kernel_L2_M1_20: + + INIT1x2 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL, #0 + ble cgemm_kernel_L2_M1_40 + +cgemm_kernel_L2_M1_22: + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs counterL, counterL, #1 + bgt cgemm_kernel_L2_M1_22 + + +cgemm_kernel_L2_M1_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble cgemm_kernel_L2_M1_100 + +cgemm_kernel_L2_M1_42: + + KERNEL1x2_SUB + + subs counterL, counterL, #1 + bgt cgemm_kernel_L2_M1_42 + +cgemm_kernel_L2_M1_100: + + SAVE1x2 + + +cgemm_kernel_L2_END: + add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8 + +/******************************************************************************/ + +cgemm_kernel_L1_BEGIN: + + mov 
counterJ , origN + tst counterJ , #1 + ble cgemm_kernel_L999 // done + + + mov pCRow0, pC // pCRow0 = C + add pC , pC , LDC // Update pC to point to next + + mov pA, origPA // pA = A + + + +cgemm_kernel_L1_M4_BEGIN: + + mov counterI, origM + asr counterI, counterI, #2 // counterI = counterI / 4 + cmp counterI, #0 + ble cgemm_kernel_L1_M2_BEGIN + +cgemm_kernel_L1_M4_20: + + INIT4x1 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble cgemm_kernel_L1_M4_40 + .align 5 + +cgemm_kernel_L1_M4_22: + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + subs counterL, counterL, #1 + bgt cgemm_kernel_L1_M4_22 + + +cgemm_kernel_L1_M4_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble cgemm_kernel_L1_M4_100 + +cgemm_kernel_L1_M4_42: + + KERNEL4x1_SUB + + subs counterL, counterL, #1 + bgt cgemm_kernel_L1_M4_42 + +cgemm_kernel_L1_M4_100: + + SAVE4x1 + +cgemm_kernel_L1_M4_END: + + subs counterI, counterI, #1 + bgt cgemm_kernel_L1_M4_20 + + +cgemm_kernel_L1_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble cgemm_kernel_L1_END + + tst counterI, #2 // counterI = counterI / 2 + ble cgemm_kernel_L1_M1_BEGIN + +cgemm_kernel_L1_M2_20: + + INIT2x1 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble cgemm_kernel_L1_M2_40 + +cgemm_kernel_L1_M2_22: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + subs counterL, counterL, #1 + bgt cgemm_kernel_L1_M2_22 + + +cgemm_kernel_L1_M2_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble cgemm_kernel_L1_M2_100 + +cgemm_kernel_L1_M2_42: + + KERNEL2x1_SUB + + subs counterL, counterL, #1 + bgt cgemm_kernel_L1_M2_42 + +cgemm_kernel_L1_M2_100: + + SAVE2x1 + +cgemm_kernel_L1_M2_END: + + +cgemm_kernel_L1_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble cgemm_kernel_L1_END + +cgemm_kernel_L1_M1_20: + + INIT1x1 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble cgemm_kernel_L1_M1_40 + +cgemm_kernel_L1_M1_22: + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs counterL, counterL, #1 + bgt cgemm_kernel_L1_M1_22 + + +cgemm_kernel_L1_M1_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble cgemm_kernel_L1_M1_100 + +cgemm_kernel_L1_M1_42: + + KERNEL1x1_SUB + + subs counterL, counterL, #1 + bgt cgemm_kernel_L1_M1_42 + +cgemm_kernel_L1_M1_100: + + SAVE1x1 + + +cgemm_kernel_L1_END: + + +cgemm_kernel_L999: + mov x0, #0 // set return value + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) + ret + + EPILOGUE + diff --git a/kernel/arm64/copy.S b/kernel/arm64/copy.S new file mode 100644 index 000000000..17aa5a1e8 --- /dev/null +++ b/kernel/arm64/copy.S @@ -0,0 +1,232 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N x0 /* vector length */ +#define X x1 /* X vector address */ +#define INC_X x2 /* X stride */ +#define Y x3 /* Y vector address */ +#define INC_Y x4 /* Y stride */ +#define I x5 /* loop variable */ + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +#if !defined(DOUBLE) +#define TMPF s0 +#define TMPVF {v0.s}[0] +#define SZ 4 +#else +#define TMPF d0 +#define TMPVF {v0.d}[0] +#define SZ 8 +#endif + +/******************************************************************************/ + +.macro KERNEL_F1 + +#if !defined(COMPLEX) + ldr TMPF, [X], #SZ + str TMPF, [Y], #SZ +#else +#if !defined(DOUBLE) + ld1 {v0.2s}, [X], #8 + st1 {v0.2s}, [Y], #8 +#else + ld1 {v0.2d}, [X], #16 + st1 {v0.2d}, [Y], #16 +#endif +#endif + +.endm + +.macro KERNEL_F4 + +#if !defined(COMPLEX) +#if !defined(DOUBLE) + ld1 {v0.4s}, [X], #16 + st1 {v0.4s}, [Y], #16 +#else // DOUBLE + ld1 {v0.4s}, [X], #16 + ld1 {v1.4s}, [X], #16 + st1 {v0.4s}, [Y], #16 + st1 {v1.4s}, [Y], #16 +#endif +#else // COMPLEX +#if !defined(DOUBLE) + ld1 {v0.4s}, [X], #16 + ld1 {v1.4s}, [X], #16 + st1 {v0.4s}, [Y], #16 + st1 {v1.4s}, [Y], #16 +#else // DOUBLE + ld1 {v0.4s}, [X], #16 + ld1 {v1.4s}, [X], #16 + ld1 {v2.4s}, [X], #16 + ld1 {v3.4s}, [X], #16 + st1 {v0.4s}, [Y], #16 + st1 {v1.4s}, [Y], #16 + st1 {v2.4s}, [Y], #16 + st1 {v3.4s}, [Y], #16 +#endif +#endif + +.endm + +.macro INIT_S + +#if !defined(COMPLEX) +#if !defined(DOUBLE) + lsl INC_X, INC_X, #2 + lsl INC_Y, INC_Y, #2 +#else + lsl INC_X, INC_X, #3 + lsl INC_Y, INC_Y, #3 +#endif +#else +#if !defined(DOUBLE) + lsl INC_X, INC_X, #3 + lsl INC_Y, INC_Y, #3 +#else + lsl INC_X, INC_X, #4 + lsl INC_Y, INC_Y, #4 +#endif +#endif + +.endm + +.macro KERNEL_S1 + +#if !defined(COMPLEX) +#if !defined(DOUBLE) + ldr w10, [X] + add X, X, INC_X + str w10, [Y] + add Y, Y, INC_Y +#else + ldr x10, [X] + add X, X, INC_X + str 
x10, [Y] + add Y, Y, INC_Y +#endif +#else +#if !defined(DOUBLE) + ld1 {v0.2s}, [X] + add X, X, INC_X + st1 {v0.2s}, [Y] + add Y, Y, INC_Y +#else + ld1 {v0.2d}, [X] + add X, X, INC_X + st1 {v0.2d}, [Y] + add Y, Y, INC_Y +#endif +#endif + +.endm + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + cmp N, xzr + ble copy_kernel_L999 + + cmp INC_X, #1 + bne copy_kernel_S_BEGIN + cmp INC_Y, #1 + bne copy_kernel_S_BEGIN + +copy_kernel_F_BEGIN: + + asr I, N, #2 + cmp I, xzr + beq copy_kernel_F1 + +copy_kernel_F4: + + KERNEL_F4 + + subs I, I, #1 + bne copy_kernel_F4 + +copy_kernel_F1: + + ands I, N, #3 + ble copy_kernel_L999 + +copy_kernel_F10: + + KERNEL_F1 + + subs I, I, #1 + bne copy_kernel_F10 + + mov w0, wzr + ret + +copy_kernel_S_BEGIN: + + INIT_S + + asr I, N, #2 + cmp I, xzr + ble copy_kernel_S1 + +copy_kernel_S4: + + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + + subs I, I, #1 + bne copy_kernel_S4 + +copy_kernel_S1: + + ands I, N, #3 + ble copy_kernel_L999 + +copy_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne copy_kernel_S10 + +copy_kernel_L999: + + mov w0, wzr + ret + + EPILOGUE diff --git a/kernel/arm64/ctrmm_kernel_4x4.S b/kernel/arm64/ctrmm_kernel_4x4.S new file mode 100644 index 000000000..7b02111e9 --- /dev/null +++ b/kernel/arm64/ctrmm_kernel_4x4.S @@ -0,0 +1,1621 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +/* X0 X1 X2 s0 s1 X3 x4 x5 x6 x7*/ +/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0, FLOAT alpha1,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc, BLASLONG offset */ + +#define origM x0 +#define origN x1 +#define origK x2 +#define origPA x3 +#define origPB x4 +#define pC x5 +#define LDC x6 +#define offset x7 +#define counterL x8 +#define counterI x9 +#define counterJ x10 +#define pB x11 +#define pCRow0 x12 +#define pCRow1 x13 +#define pCRow2 x14 +#define pA x15 +#define temp x16 +#define tempOffset x17 +#define tempK x18 + +#define alpha0_R s10 +#define alphaV0_R v10.s[0] +#define alpha0_I s11 +#define alphaV0_I v11.s[0] + +#define alpha1_R s14 +#define alphaV1_R v14.s[0] +#define alpha1_I s15 +#define alphaV1_I v15.s[0] + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define OP_rr fmla +#define OP_ii fmls +#define OP_ri fmla +#define OP_ir fmla +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define OP_rr fmla +#define OP_ii fmla +#define OP_ri fmls +#define OP_ir fmla +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define OP_rr fmla +#define OP_ii fmla +#define OP_ri fmla +#define OP_ir fmls +#elif defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define OP_rr fmla +#define OP_ii fmls +#define OP_ri fmls +#define OP_ir fmls +#endif + +// 00 origM +// 01 origN +// 02 origK +// 03 origPA +// 04 origPB +// 05 pC +// 06 origLDC -> LDC +// 07 offset +// 08 counterL +// 09 counterI +// 10 counterJ +// 11 pB +// 12 pCRow0 +// 13 pCRow1 +// 14 pCRow2 +// 15 pA +// 16 temp +// 17 tempOffset +// 18 must save tempK +// 19 must save +// 20 must save +// 21 must save +// 22 must save +// 23 must save +// 24 must save +// 25 must save +// 26 must save +// 27 must save +// 28 must save +// 29 frame +// 30 link +// 31 sp + +//v00 ALPHA_R -> pA00_R, pA01_R, pA02_R, pA03_R +//v01 ALPHA_I -> pA00_I, pA01_I, pA02_I, pA03_I +//v02 +//v03 +//v04 pA10_R, pA11_R, pA12_R, pA13_R +//v05 pA10_I, pA11_I, pA12_I, pA13_I +//v06 +//v07 +//v08 must save pB00_R, pB01_R, pB02_R, pB03_R +//v09 must save pB00_I, pB01_I, pB02_I, pB03_I +//v10 must save ALPHA0_R +//v11 must save ALPHA0_I +//v12 must save pB10_R, pB11_R, pB12_R, pB13_R +//v13 must save pB10_I, pB11_I, pB12_I, pB13_I +//v14 must save ALPHA1_R +//v15 must save ALPHA1_I +//v16 must save pC00_R, pC01_R, pC02_R, pC03_R +//v17 must save pC00_I, pC01_I, pC02_I, pC03_I +//v18 +//v19 +//v20 pC10_R, pC11_R, pC12_R, pC13_R +//v21 pC10_I, pC11_I, pC12_I, pC13_I +//v22 +//v23 +//v24 pC20_R, pC21_R, pC22_R, pC23_R +//v25 pC20_I, pC21_I, pC22_I, pC23_I +//v26 +//v27 +//v28 pC30_R, pC31_R, pC32_R, pC33_R +//v29 pC30_I, pC31_I, pC32_I, pC33_I +//v30 +//v31 + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +.macro INIT4x4 + fmov s16, wzr + fmov s17, s16 + fmov s20, s17 + fmov s21, s16 + fmov s24, s17 + fmov s25, s16 + fmov s28, s17 + fmov s29, s16 +.endm + +.macro KERNEL4x4_I + ld2 {v8.4s, v9.4s}, [pB] + add pB, pB, #32 + ld2 {v0.4s, v1.4s}, [pA] + add pA, pA, #32 + + fmul v16.4s, v0.4s, v8.4s[0] + OP_ii v16.4s, v1.4s, v9.4s[0] + fmul v17.4s, v0.4s, v9.4s[0] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + fneg v17.4s, v17.4s +#endif + OP_ir v17.4s, v1.4s, v8.4s[0] + + 
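+	// v16/v17 now hold the first K-step of C column 0 (real/imag parts). Per element this is
+	// the complex product c_r = a_r*b_r - a_i*b_i, c_i = a_r*b_i + a_i*b_r (NN case); the
+	// OP_*/fneg selections above flip signs for the conjugated variants. Columns 1..3 are
+	// formed the same way below from B lanes [1]..[3].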
fmul v20.4s, v0.4s, v8.4s[1] + OP_ii v20.4s, v1.4s, v9.4s[1] + fmul v21.4s, v0.4s, v9.4s[1] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + fneg v21.4s, v21.4s +#endif + OP_ir v21.4s, v1.4s, v8.4s[1] + + fmul v24.4s, v0.4s, v8.4s[2] + OP_ii v24.4s, v1.4s, v9.4s[2] + fmul v25.4s, v0.4s, v9.4s[2] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + fneg v25.4s, v25.4s +#endif + OP_ir v25.4s, v1.4s, v8.4s[2] + + fmul v28.4s, v0.4s, v8.4s[3] + OP_ii v28.4s, v1.4s, v9.4s[3] + fmul v29.4s, v0.4s, v9.4s[3] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + fneg v29.4s, v29.4s +#endif + OP_ir v29.4s, v1.4s, v8.4s[3] + + ld2 {v12.4s, v13.4s}, [pB] + add pB, pB, #32 + ld2 {v4.4s, v5.4s}, [pA] + add pA, pA, #32 +.endm + +.macro KERNEL4x4_M1 + OP_rr v16.4s, v0.4s, v8.4s[0] + OP_ii v16.4s, v1.4s, v9.4s[0] + OP_ri v17.4s, v0.4s, v9.4s[0] + OP_ir v17.4s, v1.4s, v8.4s[0] + + ld2 {v12.4s, v13.4s}, [pB] // For next round + add pB, pB, #32 + + OP_rr v20.4s, v0.4s, v8.4s[1] + OP_ii v20.4s, v1.4s, v9.4s[1] + OP_ri v21.4s, v0.4s, v9.4s[1] + OP_ir v21.4s, v1.4s, v8.4s[1] + + ld2 {v4.4s, v5.4s}, [pA] // For next round + add pA, pA, #32 + + OP_rr v24.4s, v0.4s, v8.4s[2] + OP_ii v24.4s, v1.4s, v9.4s[2] + OP_ri v25.4s, v0.4s, v9.4s[2] + OP_ir v25.4s, v1.4s, v8.4s[2] + + prfm PLDL1KEEP, [pA, #512] + + OP_rr v28.4s, v0.4s, v8.4s[3] + OP_ii v28.4s, v1.4s, v9.4s[3] + OP_ri v29.4s, v0.4s, v9.4s[3] + OP_ir v29.4s, v1.4s, v8.4s[3] +.endm + +.macro KERNEL4x4_M2 + OP_rr v16.4s, v4.4s, v12.4s[0] + OP_ii v16.4s, v5.4s, v13.4s[0] + OP_ri v17.4s, v4.4s, v13.4s[0] + OP_ir v17.4s, v5.4s, v12.4s[0] + + ld2 {v8.4s, v9.4s}, [pB] // For next round + add pB, pB, #32 + + OP_rr v20.4s, v4.4s, v12.4s[1] + OP_ii v20.4s, v5.4s, v13.4s[1] + OP_ri v21.4s, v4.4s, v13.4s[1] + OP_ir v21.4s, v5.4s, v12.4s[1] + + ld2 {v0.4s, v1.4s}, [pA] // For next round + add pA, pA, #32 + + OP_rr v24.4s, v4.4s, v12.4s[2] + OP_ii v24.4s, v5.4s, v13.4s[2] + OP_ri v25.4s, v4.4s, v13.4s[2] + OP_ir v25.4s, v5.4s, v12.4s[2] + + prfm PLDL1KEEP, [pB, #512] + + OP_rr v28.4s, v4.4s, v12.4s[3] + OP_ii v28.4s, v5.4s, v13.4s[3] + OP_ri v29.4s, v4.4s, v13.4s[3] + OP_ir v29.4s, v5.4s, v12.4s[3] +.endm + +.macro KERNEL4x4_E + OP_rr v16.4s, v4.4s, v12.4s[0] + OP_ii v16.4s, v5.4s, v13.4s[0] + OP_ri v17.4s, v4.4s, v13.4s[0] + OP_ir v17.4s, v5.4s, v12.4s[0] + + OP_rr v20.4s, v4.4s, v12.4s[1] + OP_ii v20.4s, v5.4s, v13.4s[1] + OP_ri v21.4s, v4.4s, v13.4s[1] + OP_ir v21.4s, v5.4s, v12.4s[1] + + OP_rr v24.4s, v4.4s, v12.4s[2] + OP_ii v24.4s, v5.4s, v13.4s[2] + OP_ri v25.4s, v4.4s, v13.4s[2] + OP_ir v25.4s, v5.4s, v12.4s[2] + + OP_rr v28.4s, v4.4s, v12.4s[3] + OP_ii v28.4s, v5.4s, v13.4s[3] + OP_ri v29.4s, v4.4s, v13.4s[3] + OP_ir v29.4s, v5.4s, v12.4s[3] +.endm + +.macro KERNEL4x4_SUB + ld2 {v8.4s, v9.4s}, [pB] + add pB, pB, #32 + ld2 {v0.4s, v1.4s}, [pA] + add pA, pA, #32 + + OP_rr v16.4s, v0.4s, v8.4s[0] + OP_ii v16.4s, v1.4s, v9.4s[0] + OP_ri v17.4s, v0.4s, v9.4s[0] + OP_ir v17.4s, v1.4s, v8.4s[0] + + OP_rr v20.4s, v0.4s, v8.4s[1] + OP_ii v20.4s, v1.4s, v9.4s[1] + OP_ri v21.4s, v0.4s, v9.4s[1] + OP_ir v21.4s, v1.4s, v8.4s[1] + + OP_rr v24.4s, v0.4s, v8.4s[2] + OP_ii v24.4s, v1.4s, v9.4s[2] + OP_ri v25.4s, v0.4s, v9.4s[2] + OP_ir v25.4s, v1.4s, v8.4s[2] + + OP_rr v28.4s, v0.4s, v8.4s[3] + OP_ii v28.4s, v1.4s, v9.4s[3] + OP_ri v29.4s, v0.4s, v9.4s[3] + 
OP_ir v29.4s, v1.4s, v8.4s[3] +.endm + +.macro SAVE4x4 + mov pCRow1, pCRow0 + + fmul v0.4s, v16.4s, alphaV0_R + fmls v0.4s, v17.4s, alphaV0_I + fmul v1.4s, v16.4s, alphaV1_I + fmla v1.4s, v17.4s, alphaV1_R + st2 {v0.4s, v1.4s}, [pCRow1] + + add pCRow1, pCRow1, LDC + + fmul v4.4s, v20.4s, alphaV0_R + fmls v4.4s, v21.4s, alphaV0_I + fmul v5.4s, v20.4s, alphaV1_I + fmla v5.4s, v21.4s, alphaV1_R + st2 {v4.4s, v5.4s}, [pCRow1] + + add pCRow1, pCRow1, LDC + + fmul v0.4s, v24.4s, alphaV0_R + fmls v0.4s, v25.4s, alphaV0_I + fmul v1.4s, v24.4s, alphaV1_I + fmla v1.4s, v25.4s, alphaV1_R + st2 {v0.4s, v1.4s}, [pCRow1] + + add pCRow1, pCRow1, LDC + + fmul v4.4s, v28.4s, alphaV0_R + fmls v4.4s, v29.4s, alphaV0_I + fmul v5.4s, v28.4s, alphaV1_I + fmla v5.4s, v29.4s, alphaV1_R + st2 {v4.4s, v5.4s}, [pCRow1] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + +.macro INIT2x4 + fmov s16, wzr + fmov s17, wzr + fmov s20, s16 + fmov s21, s17 + fmov s24, s16 + fmov s25, s17 + fmov s28, s16 + fmov s29, s17 +.endm + +.macro KERNEL2x4_SUB + ld2 {v8.4s, v9.4s}, [pB] + add pB, pB, #32 + ld2 {v0.2s, v1.2s}, [pA] + add pA, pA, #16 + + OP_rr v16.2s, v0.2s, v8.4s[0] + OP_ii v16.2s, v1.2s, v9.4s[0] + OP_ri v17.2s, v0.2s, v9.4s[0] + OP_ir v17.2s, v1.2s, v8.4s[0] + + OP_rr v20.2s, v0.2s, v8.4s[1] + OP_ii v20.2s, v1.2s, v9.4s[1] + OP_ri v21.2s, v0.2s, v9.4s[1] + OP_ir v21.2s, v1.2s, v8.4s[1] + + OP_rr v24.2s, v0.2s, v8.4s[2] + OP_ii v24.2s, v1.2s, v9.4s[2] + OP_ri v25.2s, v0.2s, v9.4s[2] + OP_ir v25.2s, v1.2s, v8.4s[2] + + OP_rr v28.2s, v0.2s, v8.4s[3] + OP_ii v28.2s, v1.2s, v9.4s[3] + OP_ri v29.2s, v0.2s, v9.4s[3] + OP_ir v29.2s, v1.2s, v8.4s[3] +.endm + +.macro SAVE2x4 + mov pCRow1, pCRow0 + + fmul v0.2s, v16.2s, alphaV0_R + fmls v0.2s, v17.2s, alphaV0_I + fmul v1.2s, v16.2s, alphaV1_I + fmla v1.2s, v17.2s, alphaV1_R + st2 {v0.2s, v1.2s}, [pCRow1] + + add pCRow1, pCRow1, LDC + + fmul v4.2s, v20.2s, alphaV0_R + fmls v4.2s, v21.2s, alphaV0_I + fmul v5.2s, v20.2s, alphaV1_I + fmla v5.2s, v21.2s, alphaV1_R + st2 {v4.2s, v5.2s}, [pCRow1] + + add pCRow1, pCRow1, LDC + + fmul v0.2s, v24.2s, alphaV0_R + fmls v0.2s, v25.2s, alphaV0_I + fmul v1.2s, v24.2s, alphaV1_I + fmla v1.2s, v25.2s, alphaV1_R + st2 {v0.2s, v1.2s}, [pCRow1] + + add pCRow1, pCRow1, LDC + + fmul v4.2s, v28.2s, alphaV0_R + fmls v4.2s, v29.2s, alphaV0_I + fmul v5.2s, v28.2s, alphaV1_I + fmla v5.2s, v29.2s, alphaV1_R + st2 {v4.2s, v5.2s}, [pCRow1] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT1x4 + fmov s16, wzr + fmov s17, wzr + fmov s20, s16 + fmov s21, s17 + fmov s24, s16 + fmov s25, s17 + fmov s28, s16 + fmov s29, s17 +.endm + +.macro KERNEL1x4_SUB + ld2 {v8.4s, v9.4s}, [pB] + add pB, pB, #32 + ld2 {v0.s, v1.s}[0], [pA] + add pA, pA, #8 + + OP_rr s16, s0, v8.4s[0] + OP_ii s16, s1, v9.4s[0] + OP_ri s17, s0, v9.4s[0] + OP_ir s17, s1, v8.4s[0] + + OP_rr s20, s0, v8.4s[1] + OP_ii s20, s1, v9.4s[1] + OP_ri s21, s0, v9.4s[1] + OP_ir s21, s1, v8.4s[1] + + OP_rr s24, s0, v8.4s[2] + OP_ii s24, s1, v9.4s[2] + OP_ri s25, s0, v9.4s[2] + OP_ir s25, s1, v8.4s[2] + + OP_rr s28, s0, v8.4s[3] + OP_ii s28, s1, v9.4s[3] + OP_ri s29, s0, v9.4s[3] + OP_ir s29, s1, v8.4s[3] +.endm + +.macro SAVE1x4 + mov pCRow1, pCRow0 + + fmul s0, s16, alphaV0_R + fmls s0, s17, alphaV0_I + fmul s1, s16, alphaV1_I + fmla s1, s17, alphaV1_R + st2 {v0.s, v1.s}[0], [pCRow1] + + add pCRow1, pCRow1, LDC + + fmul s4, s20, alphaV0_R + fmls s4, s21, 
alphaV0_I + fmul s5, s20, alphaV1_I + fmla s5, s21, alphaV1_R + st2 {v4.s, v5.s}[0], [pCRow1] + + add pCRow1, pCRow1, LDC + + fmul s0, s24, alphaV0_R + fmls s0, s25, alphaV0_I + fmul s1, s24, alphaV1_I + fmla s1, s25, alphaV1_R + st2 {v0.s, v1.s}[0], [pCRow1] + + add pCRow1, pCRow1, LDC + + fmul s4, s28, alphaV0_R + fmls s4, s29, alphaV0_I + fmul s5, s28, alphaV1_I + fmla s5, s29, alphaV1_R + st2 {v4.s, v5.s}[0], [pCRow1] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ + +.macro INIT4x2 + fmov s16, wzr + fmov s17, wzr + fmov s20, s16 + fmov s21, s17 +.endm + +.macro KERNEL4x2_SUB + ld2 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld2 {v0.4s, v1.4s}, [pA] + add pA, pA, #32 + + OP_rr v16.4s, v0.4s, v8.2s[0] + OP_ii v16.4s, v1.4s, v9.2s[0] + OP_ri v17.4s, v0.4s, v9.2s[0] + OP_ir v17.4s, v1.4s, v8.2s[0] + + OP_rr v20.4s, v0.4s, v8.2s[1] + OP_ii v20.4s, v1.4s, v9.2s[1] + OP_ri v21.4s, v0.4s, v9.2s[1] + OP_ir v21.4s, v1.4s, v8.2s[1] +.endm + +.macro SAVE4x2 + mov pCRow1, pCRow0 + + fmul v0.4s, v16.4s, alphaV0_R + fmls v0.4s, v17.4s, alphaV0_I + fmul v1.4s, v16.4s, alphaV1_I + fmla v1.4s, v17.4s, alphaV1_R + st2 {v0.4s, v1.4s}, [pCRow1] + + add pCRow1, pCRow1, LDC + + fmul v4.4s, v20.4s, alphaV0_R + fmls v4.4s, v21.4s, alphaV0_I + fmul v5.4s, v20.4s, alphaV1_I + fmla v5.4s, v21.4s, alphaV1_R + st2 {v4.4s, v5.4s}, [pCRow1] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + +.macro INIT2x2 + fmov s16, wzr + fmov s17, wzr + fmov s20, s16 + fmov s21, s17 +.endm + +.macro KERNEL2x2_SUB + ld2 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld2 {v0.2s, v1.2s}, [pA] + add pA, pA, #16 + + OP_rr v16.2s, v0.2s, v8.2s[0] + OP_ii v16.2s, v1.2s, v9.2s[0] + OP_ri v17.2s, v0.2s, v9.2s[0] + OP_ir v17.2s, v1.2s, v8.2s[0] + + OP_rr v20.2s, v0.2s, v8.2s[1] + OP_ii v20.2s, v1.2s, v9.2s[1] + OP_ri v21.2s, v0.2s, v9.2s[1] + OP_ir v21.2s, v1.2s, v8.2s[1] +.endm + +.macro SAVE2x2 + mov pCRow1, pCRow0 + + fmul v0.2s, v16.2s, alphaV0_R + fmls v0.2s, v17.2s, alphaV0_I + fmul v1.2s, v16.2s, alphaV1_I + fmla v1.2s, v17.2s, alphaV1_R + st2 {v0.2s, v1.2s}, [pCRow1] + + add pCRow1, pCRow1, LDC + + fmul v4.2s, v20.2s, alphaV0_R + fmls v4.2s, v21.2s, alphaV0_I + fmul v5.2s, v20.2s, alphaV1_I + fmla v5.2s, v21.2s, alphaV1_R + st2 {v4.2s, v5.2s}, [pCRow1] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT1x2 + fmov s16, wzr + fmov s17, wzr + fmov s20, wzr + fmov s21, wzr +.endm + +.macro KERNEL1x2_SUB + ld2 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld2 {v0.s, v1.s}[0], [pA] + add pA, pA, #8 + + OP_rr s16, s0, v8.2s[0] + OP_ii s16, s1, v9.2s[0] + OP_ri s17, s0, v9.2s[0] + OP_ir s17, s1, v8.2s[0] + + OP_rr s20, s0, v8.2s[1] + OP_ii s20, s1, v9.2s[1] + OP_ri s21, s0, v9.2s[1] + OP_ir s21, s1, v8.2s[1] +.endm + +.macro SAVE1x2 + mov pCRow1, pCRow0 + + fmul s0, s16, alphaV0_R + fmls s0, s17, alphaV0_I + fmul s1, s16, alphaV1_I + fmla s1, s17, alphaV1_R + st2 {v0.s, v1.s}[0], [pCRow1] + + add pCRow1, pCRow1, LDC + + fmul s4, s20, alphaV0_R + fmls s4, s21, alphaV0_I + fmul s5, s20, alphaV1_I + fmla s5, s21, alphaV1_R + st2 {v4.s, v5.s}[0], [pCRow1] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ + +.macro INIT4x1 + fmov s16, wzr + fmov s17, s16 +.endm + +.macro KERNEL4x1_SUB + ld2 {v8.s, v9.s}[0], [pB] + add pB, pB, #8 + ld2 {v0.4s, v1.4s}, [pA] + add pA, pA, #32 
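+	// one complex B scalar (v8/v9 = b_r/b_i) against four complex A elements (v0/v1 = a_r/a_i lanes);
+	// the OP_* lines below accumulate the 4x1 column into v16 (real) / v17 (imag)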
+ + OP_rr v16.4s, v0.4s, v8.s[0] + OP_ii v16.4s, v1.4s, v9.s[0] + OP_ri v17.4s, v0.4s, v9.s[0] + OP_ir v17.4s, v1.4s, v8.s[0] +.endm + +.macro SAVE4x1 + mov pCRow1, pCRow0 + + fmul v0.4s, v16.4s, alphaV0_R + fmls v0.4s, v17.4s, alphaV0_I + fmul v1.4s, v16.4s, alphaV1_I + fmla v1.4s, v17.4s, alphaV1_R + st2 {v0.4s, v1.4s}, [pCRow1] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + +.macro INIT2x1 + fmov s16, wzr + fmov s17, wzr +.endm + +.macro KERNEL2x1_SUB + ld2 {v8.s, v9.s}[0], [pB] + add pB, pB, #8 + ld2 {v0.2s, v1.2s}, [pA] + add pA, pA, #16 + + OP_rr v16.2s, v0.2s, v8.s[0] + OP_ii v16.2s, v1.2s, v9.s[0] + OP_ri v17.2s, v0.2s, v9.s[0] + OP_ir v17.2s, v1.2s, v8.s[0] +.endm + +.macro SAVE2x1 + mov pCRow1, pCRow0 + + fmul v0.2s, v16.2s, alphaV0_R + fmls v0.2s, v17.2s, alphaV0_I + fmul v1.2s, v16.2s, alphaV1_I + fmla v1.2s, v17.2s, alphaV1_R + st2 {v0.2s, v1.2s}, [pCRow1] + + add pCRow0, pCRow0, #16 + +.endm + +/******************************************************************************/ + +.macro INIT1x1 + fmov s16, wzr + fmov s17, wzr +.endm + +.macro KERNEL1x1_SUB + ld2 {v8.s, v9.s}[0], [pB] + add pB, pB, #8 + ld2 {v0.s, v1.s}[0], [pA] + add pA, pA, #8 + + OP_rr s16, s0, v8.s[0] + OP_ii s16, s1, v9.s[0] + OP_ri s17, s0, v9.s[0] + OP_ir s17, s1, v8.s[0] +.endm + +.macro SAVE1x1 + mov pCRow1, pCRow0 + + fmul s0, s16, alphaV0_R + fmls s0, s17, alphaV0_I + fmul s1, s16, alphaV1_I + fmla s1, s17, alphaV1_R + st2 {v0.s, v1.s}[0], [pCRow1] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + .align 5 + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] + + fmov alpha0_R, s0 + fmov alpha0_I, s1 + fmov alpha1_R, s0 + fmov alpha1_I, s1 + + lsl LDC, LDC, #3 // ldc = ldc * 8 + +#if !defined(LEFT) + neg tempOffset, offset +#endif + + mov pB, origPB + + mov counterJ, origN + asr counterJ, counterJ, #2 // J = J / 4 + cmp counterJ, #0 + ble ctrmm_kernel_L2_BEGIN + +/******************************************************************************/ + +ctrmm_kernel_L4_BEGIN: + mov pCRow0, pC // pCRow0 = C + add pC, pC, LDC, lsl #2 + +#if defined(LEFT) + mov tempOffset, offset +#endif + mov pA, origPA // pA = start of A array + +ctrmm_kernel_L4_M4_BEGIN: + + mov counterI, origM + asr counterI, counterI, #2 // counterI = counterI / 4 + cmp counterI, #0 + ble ctrmm_kernel_L4_M2_BEGIN + +ctrmm_kernel_L4_M4_20: + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #5 + add pB, pB, temp + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #4 +#else + add tempK, tempOffset, #4 +#endif + + asr counterL , tempK, #1 // L = K / 2 + cmp counterL , #2 // is there at least 4 to do? 
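+	// software-pipelined main loop: KERNEL4x4_I + M2 prime the loads, the M1/M2 pair
+	// iterates, and M1 + E drain, so the fast path needs counterL >= 2 (i.e. K >= 4)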
+ blt ctrmm_kernel_L4_M4_32 + + KERNEL4x4_I // do one in the K + KERNEL4x4_M2 // do another in the K + + subs counterL, counterL, #2 + ble ctrmm_kernel_L4_M4_22a + .align 5 + +ctrmm_kernel_L4_M4_22: + + KERNEL4x4_M1 + KERNEL4x4_M2 + + subs counterL, counterL, #1 + bgt ctrmm_kernel_L4_M4_22 + + +ctrmm_kernel_L4_M4_22a: + + KERNEL4x4_M1 + KERNEL4x4_E + + b ctrmm_kernel_L4_M4_44 + +ctrmm_kernel_L4_M4_32: + + tst counterL, #1 + ble ctrmm_kernel_L4_M4_40 + + KERNEL4x4_I + KERNEL4x4_E + + b ctrmm_kernel_L4_M4_44 + + +ctrmm_kernel_L4_M4_40: + + INIT4x4 + +ctrmm_kernel_L4_M4_44: + + ands counterL , tempK, #1 + ble ctrmm_kernel_L4_M4_100 + +ctrmm_kernel_L4_M4_46: + KERNEL4x4_SUB + +ctrmm_kernel_L4_M4_100: + + SAVE4x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #4 +#else + sub tempK, tempK, #4 +#endif + lsl temp, tempK, #5 + add pA, pA, temp + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #4 +#endif + +ctrmm_kernel_L4_M4_END: + subs counterI, counterI, #1 + bne ctrmm_kernel_L4_M4_20 + +ctrmm_kernel_L4_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble ctrmm_kernel_L4_END + + tst counterI, #2 // counterI = counterI / 2 + ble ctrmm_kernel_L4_M1_BEGIN + +ctrmm_kernel_L4_M2_20: + + INIT2x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #4 + add pA, pA, temp + lsl temp, tempOffset, #5 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #2 +#else + add tempK, tempOffset, #4 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble ctrmm_kernel_L4_M2_40 + +ctrmm_kernel_L4_M2_22: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + subs counterL, counterL, #1 + bgt ctrmm_kernel_L4_M2_22 + + +ctrmm_kernel_L4_M2_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble ctrmm_kernel_L4_M2_100 + +ctrmm_kernel_L4_M2_42: + + KERNEL2x4_SUB + + subs counterL, counterL, #1 + bgt ctrmm_kernel_L4_M2_42 + +ctrmm_kernel_L4_M2_100: + + SAVE2x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #2 +#else + sub tempK, tempK, #4 +#endif + lsl temp, tempK, #4 + add pA, pA, temp + lsl temp, tempK, #5 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #2 +#endif + +ctrmm_kernel_L4_M2_END: + + +ctrmm_kernel_L4_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble ctrmm_kernel_L4_END + +ctrmm_kernel_L4_M1_20: + + INIT1x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #5 + add pB, pB, temp + lsl temp, tempOffset, #3 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #1 +#else + add tempK, tempOffset, #4 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble ctrmm_kernel_L4_M1_40 + +ctrmm_kernel_L4_M1_22: + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + 
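+// counterL counts groups of eight K iterations here; the K % 8 remainder is
+// handled one KERNEL1x4_SUB at a time in the ..._42 loop below.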
subs counterL, counterL, #1 + bgt ctrmm_kernel_L4_M1_22 + + +ctrmm_kernel_L4_M1_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble ctrmm_kernel_L4_M1_100 + +ctrmm_kernel_L4_M1_42: + + KERNEL1x4_SUB + + subs counterL, counterL, #1 + bgt ctrmm_kernel_L4_M1_42 + +ctrmm_kernel_L4_M1_100: + + SAVE1x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #1 +#else + sub tempK, tempK, #4 +#endif + lsl temp, tempK, #3 + add pA, pA, temp + lsl temp, tempK, #5 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #1 +#endif + +ctrmm_kernel_L4_END: + + lsl temp, origK, #5 + add origPB, origPB, temp // B = B + K * 4 * 8 + +#if !defined(LEFT) + add tempOffset, tempOffset, #4 +#endif + + subs counterJ, counterJ , #1 // j-- + bgt ctrmm_kernel_L4_BEGIN + + +/******************************************************************************/ + +ctrmm_kernel_L2_BEGIN: // less than 2 left in N direction + + mov counterJ , origN + tst counterJ , #3 + ble ctrmm_kernel_L999 // error, N was less than 4? + + tst counterJ , #2 + ble ctrmm_kernel_L1_BEGIN + + mov pCRow0, pC // pCRow0 = pC + + add pC,pC,LDC, lsl #1 + +#if defined(LEFT) + mov tempOffset, offset +#endif + + mov pA, origPA // pA = A + +ctrmm_kernel_L2_M4_BEGIN: + + mov counterI, origM + asr counterI, counterI, #2 // counterI = counterI / 4 + cmp counterI,#0 + ble ctrmm_kernel_L2_M2_BEGIN + +ctrmm_kernel_L2_M4_20: + + INIT4x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #4 + add pB, pB, temp + lsl temp, tempOffset, #5 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #4 +#else + add tempK, tempOffset, #2 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble ctrmm_kernel_L2_M4_40 + .align 5 + +ctrmm_kernel_L2_M4_22: + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + subs counterL, counterL, #1 + bgt ctrmm_kernel_L2_M4_22 + + +ctrmm_kernel_L2_M4_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble ctrmm_kernel_L2_M4_100 + +ctrmm_kernel_L2_M4_42: + + KERNEL4x2_SUB + + subs counterL, counterL, #1 + bgt ctrmm_kernel_L2_M4_42 + +ctrmm_kernel_L2_M4_100: + + SAVE4x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #4 +#else + sub tempK, tempK, #2 +#endif + lsl temp, tempK, #5 + add pA, pA, temp + lsl temp, tempK, #4 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #4 +#endif + +ctrmm_kernel_L2_M4_END: + + subs counterI, counterI, #1 + bgt ctrmm_kernel_L2_M4_20 + + +ctrmm_kernel_L2_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble ctrmm_kernel_L2_END + + tst counterI, #2 // counterI = counterI / 2 + ble ctrmm_kernel_L2_M1_BEGIN + +ctrmm_kernel_L2_M2_20: + + INIT2x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #4 + add pB, pB, temp + lsl temp, tempOffset, #4 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, 
tempOffset, #2 +#else + add tempK, tempOffset, #2 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble ctrmm_kernel_L2_M2_40 + +ctrmm_kernel_L2_M2_22: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + subs counterL, counterL, #1 + bgt ctrmm_kernel_L2_M2_22 + + +ctrmm_kernel_L2_M2_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble ctrmm_kernel_L2_M2_100 + +ctrmm_kernel_L2_M2_42: + + KERNEL2x2_SUB + + subs counterL, counterL, #1 + bgt ctrmm_kernel_L2_M2_42 + +ctrmm_kernel_L2_M2_100: + + SAVE2x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #2 +#else + sub tempK, tempK, #2 +#endif + lsl temp, tempK, #4 + add pA, pA, temp + lsl temp, tempK, #4 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #2 +#endif + +ctrmm_kernel_L2_M2_END: + + +ctrmm_kernel_L2_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble ctrmm_kernel_L2_END + +ctrmm_kernel_L2_M1_20: + + INIT1x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #4 + add pB, pB, temp + lsl temp, tempOffset, #3 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #1 +#else + add tempK, tempOffset, #2 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL, #0 + ble ctrmm_kernel_L2_M1_40 + +ctrmm_kernel_L2_M1_22: + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs counterL, counterL, #1 + bgt ctrmm_kernel_L2_M1_22 + + +ctrmm_kernel_L2_M1_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble ctrmm_kernel_L2_M1_100 + +ctrmm_kernel_L2_M1_42: + + KERNEL1x2_SUB + + subs counterL, counterL, #1 + bgt ctrmm_kernel_L2_M1_42 + +ctrmm_kernel_L2_M1_100: + + SAVE1x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #1 +#else + sub tempK, tempK, #2 +#endif + lsl temp, tempK, #3 + add pA, pA, temp + lsl temp, tempK, #4 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #1 +#endif + +ctrmm_kernel_L2_END: +#if !defined(LEFT) + add tempOffset, tempOffset, #2 +#endif + add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8 + +/******************************************************************************/ + +ctrmm_kernel_L1_BEGIN: + + mov counterJ , origN + tst counterJ , #1 + ble ctrmm_kernel_L999 // done + + + mov pCRow0, pC // pCRow0 = C + add pC , pC , LDC // Update pC to point to next + +#if defined(LEFT) + mov tempOffset, offset +#endif + + mov pA, origPA // pA = A + +ctrmm_kernel_L1_M4_BEGIN: + + mov counterI, origM + asr counterI, counterI, #2 // counterI = counterI / 4 + cmp counterI, #0 + ble ctrmm_kernel_L1_M2_BEGIN + +ctrmm_kernel_L1_M4_20: + + INIT4x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #3 + add pB, pB, temp + lsl temp, tempOffset, #5 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add 
tempK, tempOffset, #4 +#else + add tempK, tempOffset, #1 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble ctrmm_kernel_L1_M4_40 + .align 5 + +ctrmm_kernel_L1_M4_22: + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + subs counterL, counterL, #1 + bgt ctrmm_kernel_L1_M4_22 + + +ctrmm_kernel_L1_M4_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble ctrmm_kernel_L1_M4_100 + +ctrmm_kernel_L1_M4_42: + + KERNEL4x1_SUB + + subs counterL, counterL, #1 + bgt ctrmm_kernel_L1_M4_42 + +ctrmm_kernel_L1_M4_100: + + SAVE4x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #4 +#else + sub tempK, tempK, #1 +#endif + lsl temp, tempK, #5 + add pA, pA, temp + lsl temp, tempK, #3 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #4 +#endif + +ctrmm_kernel_L1_M4_END: + + subs counterI, counterI, #1 + bgt ctrmm_kernel_L1_M4_20 + + +ctrmm_kernel_L1_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble ctrmm_kernel_L1_END + + tst counterI, #2 // counterI = counterI / 2 + ble ctrmm_kernel_L1_M1_BEGIN + +ctrmm_kernel_L1_M2_20: + + INIT2x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #3 + add pB, pB, temp + lsl temp, tempOffset, #4 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #2 +#else + add tempK, tempOffset, #1 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble ctrmm_kernel_L1_M2_40 + +ctrmm_kernel_L1_M2_22: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + subs counterL, counterL, #1 + bgt ctrmm_kernel_L1_M2_22 + + +ctrmm_kernel_L1_M2_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble ctrmm_kernel_L1_M2_100 + +ctrmm_kernel_L1_M2_42: + + KERNEL2x1_SUB + + subs counterL, counterL, #1 + bgt ctrmm_kernel_L1_M2_42 + +ctrmm_kernel_L1_M2_100: + + SAVE2x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #2 +#else + sub tempK, tempK, #1 +#endif + lsl temp, tempK, #4 + add pA, pA, temp + lsl temp, tempK, #3 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #2 +#endif + +ctrmm_kernel_L1_M2_END: + + +ctrmm_kernel_L1_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble ctrmm_kernel_L1_END + +ctrmm_kernel_L1_M1_20: + + INIT1x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #3 + add pB, pB, temp + lsl temp, tempOffset, #3 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #1 +#else + add tempK, tempOffset, #1 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble ctrmm_kernel_L1_M1_40 + +ctrmm_kernel_L1_M1_22: + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs counterL, counterL, #1 + bgt 
ctrmm_kernel_L1_M1_22 + + +ctrmm_kernel_L1_M1_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble ctrmm_kernel_L1_M1_100 + +ctrmm_kernel_L1_M1_42: + + KERNEL1x1_SUB + + subs counterL, counterL, #1 + bgt ctrmm_kernel_L1_M1_42 + +ctrmm_kernel_L1_M1_100: + + SAVE1x1 + + +ctrmm_kernel_L1_END: + + +ctrmm_kernel_L999: + mov x0, #0 // set return value + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) + ret + + EPILOGUE + diff --git a/kernel/arm64/dgemm_kernel_4x4.S b/kernel/arm64/dgemm_kernel_4x4.S new file mode 100644 index 000000000..e88253af1 --- /dev/null +++ b/kernel/arm64/dgemm_kernel_4x4.S @@ -0,0 +1,1338 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +/* X0 X1 X2 s0 X3 x4 x5 x6 */ +/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc */ + +#define origM x0 +#define origN x1 +#define origK x2 +#define origPA x3 +#define origPB x4 +#define pC x5 +#define LDC x6 +#define temp x7 +#define counterL x8 +#define counterI x9 +#define counterJ x10 +#define pB x11 +#define pCRow0 x12 +#define pCRow1 x13 +#define pCRow2 x14 +#define pA x15 +#define ppC x16 +#define ppCRow0 x17 +#define ppCRow1 x18 +#define ppCRow2 x19 +#define ppA x20 + +#define alpha0 d10 +#define alphaV0 v10.d[0] +#define alpha1 d11 +#define alphaV1 v11.d[0] +#define alpha2 d14 +#define alphaV2 v14.d[0] +#define alpha3 d15 +#define alphaV3 v15.d[0] + +// 00 origM +// 01 origN +// 02 origK +// 03 origPA +// 04 origPB +// 05 pC +// 06 origLDC -> LDC +// 07 offset -> temp +// 08 counterL +// 09 counterI +// 10 counterJ +// 11 pB +// 12 pCRow0 +// 13 pCRow1 +// 14 pCRow2 +// 15 pA +// 16 ppC +// 17 ppCRow0 +// 18 must save ppCRow1 +// 19 must save ppCRow2 +// 20 must save ppA +// 21 must save +// 22 must save +// 23 must save +// 24 must save +// 25 must save +// 26 must save +// 27 must save +// 28 must save +// 29 frame +// 30 link +// 31 sp + +//v00 ALPHA -> pA00, pA01 +//v01 pA02, pA03 +//v02 ppA00, ppA01 +//v03 ppA02, ppA03 +//v04 pA10, pA11 +//v05 pA12, pA13 +//v06 ppA10, ppA11 +//v07 ppA12, ppA13 +//v08 must save pB00, pB01 +//v09 must save pB02, pB03 +//v10 must save ALPHA0 +//v11 must save ALPHA1 +//v12 must save pB10, pB11 +//v13 must save pB12, pB13 +//v14 must save ALPHA2 +//v15 must save ALPHA3 +//v16 must save C00, C01 +//v17 must save C02, C03 +//v18 ppC00, ppC01 +//v19 ppC02, ppC03 +//v20 C10, C11 +//v21 C12, C13 +//v22 ppC10, ppC11 +//v23 ppC12, ppC13 +//v24 C20, C21 +//v25 C22, C23 +//v26 ppC20, ppC21 +//v27 ppC22, ppC23 +//v28 C30, C31 +//v29 C32, C33 +//v30 ppC30, ppC31 +//v31 ppC32, ppC33 + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +.macro INIT8x4 + fmov d16, xzr + fmov d17, d16 + fmov d18, d17 + fmov d19, d16 + fmov d20, d17 + fmov d21, d16 + fmov d22, d17 + fmov d23, d16 + fmov d24, d17 + fmov d25, d16 + fmov d26, d17 + fmov d27, d16 + fmov d28, d17 + fmov d29, d16 + fmov d30, d17 + fmov d31, d16 +.endm + +.macro KERNEL8x4_I + ld1 {v8.2d, v9.2d}, [pB] + add pB, pB, #32 + ld1 {v0.2d, v1.2d}, [pA] + add pA, pA, #32 + + fmul v16.2d, v0.2d, v8.2d[0] + fmul v29.2d, v1.2d, v9.2d[1] + + ld1 {v2.2d, v3.2d}, [ppA] + add ppA, ppA, #32 + + fmul v20.2d, v0.2d, v8.2d[1] + fmul v25.2d, v1.2d, v9.2d[0] + + fmul v18.2d, v2.2d, v8.2d[0] + fmul v31.2d, v3.2d, v9.2d[1] + fmul v22.2d, v2.2d, v8.2d[1] + fmul v27.2d, v3.2d, v9.2d[0] + + ld1 {v12.2d, v13.2d}, [pB] // for next round + add pB, pB, #32 + + fmul v24.2d, v0.2d, v9.2d[0] + fmul v21.2d, v1.2d, v8.2d[1] + + ld1 {v4.2d, v5.2d} , [pA] // for next round + add pA, pA, #32 + + fmul v26.2d, v2.2d, v9.2d[0] + fmul v23.2d, v3.2d, v8.2d[1] + + ld1 {v6.2d, v7.2d} , [ppA] // for next round + add ppA, ppA, #32 + + fmul v28.2d, v0.2d, v9.2d[1] + fmul v17.2d, v1.2d, v8.2d[0] + fmul v30.2d, v2.2d, v9.2d[1] + fmul v19.2d, v3.2d, v8.2d[0] +.endm + +.macro KERNEL8x4_M2 + fmla v16.2d, v4.2d, v12.2d[0] + fmla v29.2d, v5.2d, v13.2d[1] + + ld1 {v8.2d, v9.2d}, [pB] + add pB, pB, #32 + + fmla v18.2d, v6.2d, v12.2d[0] + fmla v31.2d, 
v7.2d, v13.2d[1] + fmla v20.2d, v4.2d, v12.2d[1] + fmla v25.2d, v5.2d, v13.2d[0] + + prfm PLDL1KEEP, [pB, #512] + + fmla v22.2d, v6.2d, v12.2d[1] + fmla v27.2d, v7.2d, v13.2d[0] + fmla v24.2d, v4.2d, v13.2d[0] + fmla v21.2d, v5.2d, v12.2d[1] + + ld1 {v0.2d, v1.2d}, [pA] + add pA, pA, #32 + + fmla v26.2d, v6.2d, v13.2d[0] + fmla v23.2d, v7.2d, v12.2d[1] + fmla v28.2d, v4.2d, v13.2d[1] + fmla v17.2d, v5.2d, v12.2d[0] + + ld1 {v2.2d, v3.2d}, [ppA] + add ppA, ppA, #32 + + fmla v30.2d, v6.2d, v13.2d[1] + fmla v19.2d, v7.2d, v12.2d[0] +.endm + +.macro KERNEL8x4_M1 + fmla v16.2d, v0.2d, v8.2d[0] + fmla v29.2d, v1.2d, v9.2d[1] + + ld1 {v12.2d, v13.2d}, [pB] // for next round + add pB, pB, #32 + + fmla v18.2d, v2.2d, v8.2d[0] + fmla v31.2d, v3.2d, v9.2d[1] + fmla v20.2d, v0.2d, v8.2d[1] + fmla v25.2d, v1.2d, v9.2d[0] + + prfm PLDL1KEEP, [pA, #512] + + fmla v22.2d, v2.2d, v8.2d[1] + fmla v27.2d, v3.2d, v9.2d[0] + + prfm PLDL1KEEP, [ppA, #512] + + fmla v24.2d, v0.2d, v9.2d[0] + fmla v21.2d, v1.2d, v8.2d[1] + + ld1 {v4.2d, v5.2d} , [pA] // for next round + add pA, pA, #32 + + fmla v26.2d, v2.2d, v9.2d[0] + fmla v23.2d, v3.2d, v8.2d[1] + fmla v28.2d, v0.2d, v9.2d[1] + fmla v17.2d, v1.2d, v8.2d[0] + + ld1 {v6.2d, v7.2d} , [ppA] // for next round + add ppA, ppA, #32 + + fmla v30.2d, v2.2d, v9.2d[1] + fmla v19.2d, v3.2d, v8.2d[0] +.endm + +.macro KERNEL8x4_E + fmla v16.2d, v4.2d, v12.2d[0] + fmla v25.2d, v5.2d, v13.2d[0] + fmla v18.2d, v6.2d, v12.2d[0] + fmla v27.2d, v7.2d, v13.2d[0] + + fmla v20.2d, v4.2d, v12.2d[1] + fmla v29.2d, v5.2d, v13.2d[1] + fmla v22.2d, v6.2d, v12.2d[1] + fmla v31.2d, v7.2d, v13.2d[1] + + fmla v24.2d, v4.2d, v13.2d[0] + fmla v17.2d, v5.2d, v12.2d[0] + fmla v26.2d, v6.2d, v13.2d[0] + fmla v19.2d, v7.2d, v12.2d[0] + + fmla v28.2d, v4.2d, v13.2d[1] + fmla v21.2d, v5.2d, v12.2d[1] + fmla v30.2d, v6.2d, v13.2d[1] + fmla v23.2d, v7.2d, v12.2d[1] +.endm + +.macro KERNEL8x4_SUB + ld1 {v8.2d, v9.2d}, [pB] + add pB, pB, #32 + ld1 {v0.2d, v1.2d}, [pA] + add pA, pA, #32 + + fmla v16.2d, v0.2d, v8.2d[0] + fmla v29.2d, v1.2d, v9.2d[1] + fmla v20.2d, v0.2d, v8.2d[1] + fmla v25.2d, v1.2d, v9.2d[0] + + ld1 {v2.2d, v3.2d}, [ppA] + add ppA, ppA, #32 + + fmla v24.2d, v0.2d, v9.2d[0] + fmla v21.2d, v1.2d, v8.2d[1] + fmla v28.2d, v0.2d, v9.2d[1] + fmla v17.2d, v1.2d, v8.2d[0] + + fmla v18.2d, v2.2d, v8.2d[0] + fmla v31.2d, v3.2d, v9.2d[1] + fmla v22.2d, v2.2d, v8.2d[1] + fmla v27.2d, v3.2d, v9.2d[0] + + fmla v26.2d, v2.2d, v9.2d[0] + fmla v23.2d, v3.2d, v8.2d[1] + fmla v30.2d, v2.2d, v9.2d[1] + fmla v19.2d, v3.2d, v8.2d[0] +.endm + +.macro SAVE8x4 + add ppCRow0, pCRow0, #32 + + ld1 {v0.2d, v1.2d}, [pCRow0] + fmla v0.2d, v16.2d, alphaV0 + fmla v1.2d, v17.2d, alphaV1 + st1 {v0.2d, v1.2d}, [pCRow0] + + ld1 {v2.2d, v3.2d}, [ppCRow0] + fmla v2.2d, v18.2d, alphaV2 + fmla v3.2d, v19.2d, alphaV3 + st1 {v2.2d, v3.2d}, [ppCRow0] + + add pCRow1, pCRow0, LDC + add ppCRow1, ppCRow0, LDC + + ld1 {v4.2d, v5.2d}, [pCRow1] + fmla v4.2d, v20.2d, alphaV0 + fmla v5.2d, v21.2d, alphaV1 + st1 {v4.2d, v5.2d}, [pCRow1] + + ld1 {v6.2d, v7.2d}, [ppCRow1] + fmla v6.2d, v22.2d, alphaV2 + fmla v7.2d, v23.2d, alphaV3 + st1 {v6.2d, v7.2d}, [ppCRow1] + + add pCRow2, pCRow1, LDC + add ppCRow2, ppCRow1, LDC + + ld1 {v0.2d, v1.2d}, [pCRow2] + fmla v0.2d, v24.2d, alphaV0 + fmla v1.2d, v25.2d, alphaV1 + st1 {v0.2d, v1.2d}, [pCRow2] + + ld1 {v2.2d, v3.2d}, [ppCRow2] + fmla v2.2d, v26.2d, alphaV2 + fmla v3.2d, v27.2d, alphaV3 + st1 {v2.2d, v3.2d}, [ppCRow2] + + add pCRow1, pCRow2, LDC + add ppCRow1, ppCRow2, LDC + + ld1 {v4.2d, v5.2d}, 
[pCRow1] + fmla v4.2d, v28.2d, alphaV0 + fmla v5.2d, v29.2d, alphaV1 + st1 {v4.2d, v5.2d}, [pCRow1] + + ld1 {v6.2d, v7.2d}, [ppCRow1] + fmla v6.2d, v30.2d, alphaV2 + fmla v7.2d, v31.2d, alphaV3 + st1 {v6.2d, v7.2d}, [ppCRow1] + + add pCRow0, pCRow0, #64 +.endm + +/******************************************************************************/ + +.macro INIT4x4 + fmov d16, xzr + fmov d17, d16 + fmov d20, d17 + fmov d21, d16 + fmov d24, d17 + fmov d25, d16 + fmov d28, d17 + fmov d29, d16 +.endm + +.macro KERNEL4x4_SUB + ld1 {v8.2d, v9.2d}, [pB] + add pB, pB, #32 + ld1 {v0.2d, v1.2d}, [pA] + add pA, pA, #32 + + fmla v16.2d, v0.2d, v8.2d[0] + fmla v29.2d, v1.2d, v9.2d[1] + + fmla v20.2d, v0.2d, v8.2d[1] + fmla v25.2d, v1.2d, v9.2d[0] + + fmla v24.2d, v0.2d, v9.2d[0] + fmla v21.2d, v1.2d, v8.2d[1] + + fmla v28.2d, v0.2d, v9.2d[1] + fmla v17.2d, v1.2d, v8.2d[0] +.endm + +.macro SAVE4x4 + ld1 {v8.2d, v9.2d}, [pCRow0] + fmla v8.2d, v16.2d, alphaV0 + fmla v9.2d, v17.2d, alphaV1 + st1 {v8.2d, v9.2d}, [pCRow0] + + add pCRow1, pCRow0, LDC + + ld1 {v12.2d, v13.2d}, [pCRow1] + fmla v12.2d, v20.2d, alphaV2 + fmla v13.2d, v21.2d, alphaV3 + st1 {v12.2d, v13.2d}, [pCRow1] + + add pCRow2, pCRow1, LDC + + ld1 {v8.2d, v9.2d}, [pCRow2] + fmla v8.2d, v24.2d, alphaV0 + fmla v9.2d, v25.2d, alphaV1 + st1 {v8.2d, v9.2d}, [pCRow2] + + add pCRow1, pCRow2, LDC + + ld1 {v12.2d, v13.2d}, [pCRow1] + fmla v12.2d, v28.2d, alphaV2 + fmla v13.2d, v29.2d, alphaV3 + st1 {v12.2d, v13.2d}, [pCRow1] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + +.macro INIT2x4 + fmov d16, xzr + fmov d20, d16 + fmov d24, d20 + fmov d28, d16 +.endm + +.macro KERNEL2x4_SUB + ld1 {v8.2d, v9.2d}, [pB] + add pB, pB, #32 + ld1 {v0.2d}, [pA] + add pA, pA, #16 + + fmla v16.2d, v0.2d, v8.2d[0] + fmla v20.2d, v0.2d, v8.2d[1] + fmla v24.2d, v0.2d, v9.2d[0] + fmla v28.2d, v0.2d, v9.2d[1] +.endm + +.macro SAVE2x4 + ld1 {v8.2d}, [pCRow0] + fmla v8.2d, v16.2d, alphaV0 + st1 {v8.2d}, [pCRow0] + + add pCRow1, pCRow0, LDC + + ld1 {v12.2d}, [pCRow1] + fmla v12.2d, v20.2d, alphaV1 + st1 {v12.2d}, [pCRow1] + + add pCRow2, pCRow1, LDC + + ld1 {v8.2d}, [pCRow2] + fmla v8.2d, v24.2d, alphaV2 + st1 {v8.2d}, [pCRow2] + + add pCRow1, pCRow2, LDC + + ld1 {v12.2d}, [pCRow1] + fmla v12.2d, v28.2d, alphaV3 + st1 {v12.2d}, [pCRow1] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT1x4 + fmov d16, xzr + fmov d20, d16 +.endm + +.macro KERNEL1x4_SUB + ldr d0, [pA] + add pA, pA, #8 + + ld1 {v8.2d, v9.2d}, [pB] + add pB, pB, #32 + + fmla v16.2d, v8.2d, v0.d[0] + fmla v20.2d, v9.2d, v0.d[0] +.endm + +.macro SAVE1x4 + add pCRow1, pCRow0, LDC + + ld1 {v8.d}[0], [pCRow0] + ld1 {v8.d}[1], [pCRow1] + fmla v8.2d, v16.2d, alphaV0 + st1 {v8.d}[0], [pCRow0] + st1 {v8.d}[1], [pCRow1] + + add pCRow2, pCRow1, LDC + add pCRow1, pCRow2, LDC + + ld1 {v12.d}[0], [pCRow2] + ld1 {v12.d}[1], [pCRow1] + fmla v12.2d, v20.2d, alphaV1 + st1 {v12.d}[0], [pCRow2] + st1 {v12.d}[1], [pCRow1] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ + +.macro INIT4x2 + fmov d16, xzr + fmov d17, d16 + fmov d20, d17 + fmov d21, d16 +.endm + +.macro KERNEL4x2_SUB + ld1 {v8.2d}, [pB] + add pB, pB, #16 + ld1 {v0.2d, v1.2d}, [pA] + add pA, pA, #32 + + fmla v16.2d, v0.2d, v8.2d[0] + fmla v17.2d, v1.2d, v8.2d[0] + fmla v20.2d, v0.2d, v8.2d[1] + fmla v21.2d, v1.2d, v8.2d[1] +.endm + +.macro SAVE4x2 + ld1 
{v8.2d, v9.2d}, [pCRow0] + fmla v8.2d, v16.2d, alphaV0 + fmla v9.2d, v17.2d, alphaV1 + st1 {v8.2d, v9.2d}, [pCRow0] + + add pCRow1, pCRow0, LDC + + ld1 {v12.2d, v13.2d}, [pCRow1] + fmla v12.2d, v20.2d, alphaV2 + fmla v13.2d, v21.2d, alphaV3 + st1 {v12.2d, v13.2d}, [pCRow1] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + +.macro INIT2x2 + fmov d16, xzr + fmov d20, d16 +.endm + +.macro KERNEL2x2_SUB + ld1 {v8.2d}, [pB] + add pB, pB, #16 + + ld1 {v0.2d}, [pA] + add pA, pA, #16 + + fmla v16.2d, v0.2d, v8.2d[0] + fmla v20.2d, v0.2d, v8.2d[1] +.endm + +.macro SAVE2x2 + ld1 {v8.2d}, [pCRow0] + fmla v8.2d, v16.2d, alphaV0 + st1 {v8.2d}, [pCRow0] + + add pCRow1 , pCRow0, LDC + + ld1 {v12.2d}, [pCRow1] + fmla v12.2d, v20.2d, alphaV1 + st1 {v12.2d}, [pCRow1] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT1x2 + fmov d16, xzr +.endm + +.macro KERNEL1x2_SUB + ld1 {v8.2d} , [pB] + add pB , pB, #16 + + ldr d0 , [pA] + add pA, pA, #8 + + fmla v16.2d, v8.2d, v0.2d[0] +.endm + +.macro SAVE1x2 + add pCRow1 , pCRow0, LDC + + ld1 {v8.d}[0], [pCRow0] + ld1 {v8.d}[1], [pCRow1] + fmla v8.2d, v16.2d, alphaV0 + st1 {v8.d}[0], [pCRow0] + st1 {v8.d}[1], [pCRow1] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ + +.macro INIT4x1 + fmov d16, xzr + fmov d17, d16 +.endm + +.macro KERNEL4x1_SUB + ldr d8, [pB] + add pB , pB, #8 + + ld1 {v0.2d, v1.2d}, [pA] + add pA , pA, #32 + + fmla v16.2d, v0.2d, v8.2d[0] + fmla v17.2d, v1.2d, v8.2d[0] +.endm + +.macro SAVE4x1 + ld1 {v8.2d, v9.2d}, [pCRow0] + fmla v8.2d, v16.2d, alphaV0 + fmla v9.2d, v17.2d, alphaV1 + st1 {v8.2d, v9.2d}, [pCRow0] + + add pCRow0, pCRow0, #32 +.endm + + + + +/******************************************************************************/ + +.macro INIT2x1 + fmov d16, xzr +.endm + +.macro KERNEL2x1_SUB + ldr d8, [pB] + add pB , pB, #8 + + ld1 {v0.2d}, [pA] + add pA , pA, #16 + + fmla v16.2d, v0.2d, v8.2d[0] +.endm + +.macro SAVE2x1 + ld1 {v8.2d}, [pCRow0] + fmla v8.2d, v16.2d, alphaV0 + st1 {v8.2d}, [pCRow0] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT1x1 + fmov d16, xzr +.endm + +.macro KERNEL1x1_SUB + ldr d8, [pB] + add pB , pB, #8 + + ldr d0, [pA] + add pA , pA, #8 + + fmadd d16, d0, d8, d16 +.endm + +.macro SAVE1x1 + ldr d8, [pCRow0] + fmadd d8, d16, alpha0, d8 + str d8, [pCRow0] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + .align 5 + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] + + fmov alpha0, d0 + fmov alpha1, d0 + fmov alpha2, d0 + fmov alpha3, d0 + + lsl LDC, LDC, #3 // ldc = ldc * 8 + + mov pB, origPB + + mov counterJ, origN + asr counterJ, counterJ, #2 // J = J / 4 + cmp counterJ, #0 + ble dgemm_kernel_L2_BEGIN + +dgemm_kernel_L4_BEGIN: + mov pCRow0, pC // pCRow0 = C + add pC, pC, LDC, lsl #2 + + lsl temp, origK, #5 // k 
* 4 * 8 + mov pA, origPA // pA = start of A array + add ppA, temp, pA + +//------------------------------------------------------------------------------ + +dgemm_kernel_L4_M8_BEGIN: + + mov counterI, origM + asr counterI, counterI, #3 // counterI = counterI / 8 + cmp counterI, #0 + ble dgemm_kernel_L4_M4_BEGIN + +dgemm_kernel_L4_M8_20: + + mov pB, origPB + asr counterL , origK, #1 // L = K / 2 + cmp counterL , #2 // is there at least 4 to do? + blt dgemm_kernel_L4_M8_32 + + KERNEL8x4_I // do one in the K + KERNEL8x4_M2 // do another in the K + + subs counterL, counterL, #2 // subtract 2 + ble dgemm_kernel_L4_M8_22a + .align 5 + +dgemm_kernel_L4_M8_22: + + KERNEL8x4_M1 + KERNEL8x4_M2 + + subs counterL, counterL, #1 + bgt dgemm_kernel_L4_M8_22 + + +dgemm_kernel_L4_M8_22a: + + KERNEL8x4_M1 + KERNEL8x4_E + + b dgemm_kernel_L4_M8_44 + +dgemm_kernel_L4_M8_32: + + tst counterL, #1 + ble dgemm_kernel_L4_M8_40 + + KERNEL8x4_I + + KERNEL8x4_E + + b dgemm_kernel_L4_M8_44 + + +dgemm_kernel_L4_M8_40: + + INIT8x4 + +dgemm_kernel_L4_M8_44: + + ands counterL , origK, #1 + ble dgemm_kernel_L4_M8_100 + +dgemm_kernel_L4_M8_46: + + KERNEL8x4_SUB + +dgemm_kernel_L4_M8_100: + + SAVE8x4 + +dgemm_kernel_L4_M8_END: + lsl temp, origK, #5 // k * 4 * 8 + add pA, pA, temp + add ppA, ppA, temp + subs counterI, counterI, #1 + bne dgemm_kernel_L4_M8_20 + + +dgemm_kernel_L4_M4_BEGIN: + mov counterI, origM + tst counterI , #7 + ble dgemm_kernel_L4_END + + tst counterI, #4 + ble dgemm_kernel_L4_M2_BEGIN + +dgemm_kernel_L4_M4_20: + + INIT4x4 + + mov pB, origPB + asr counterL, origK, #3 // counterL = counterL / 8 + cmp counterL, #0 + ble dgemm_kernel_L4_M4_40 + +dgemm_kernel_L4_M4_22: + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L4_M4_22 + + +dgemm_kernel_L4_M4_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble dgemm_kernel_L4_M4_100 + +dgemm_kernel_L4_M4_42: + + KERNEL4x4_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L4_M4_42 + +dgemm_kernel_L4_M4_100: + + SAVE4x4 + +dgemm_kernel_L4_M4_END: + + +dgemm_kernel_L4_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble dgemm_kernel_L4_END + + tst counterI, #2 // counterI = counterI / 2 + ble dgemm_kernel_L4_M1_BEGIN + +dgemm_kernel_L4_M2_20: + + INIT2x4 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble dgemm_kernel_L4_M2_40 + +dgemm_kernel_L4_M2_22: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L4_M2_22 + + +dgemm_kernel_L4_M2_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble dgemm_kernel_L4_M2_100 + +dgemm_kernel_L4_M2_42: + + KERNEL2x4_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L4_M2_42 + +dgemm_kernel_L4_M2_100: + + SAVE2x4 + +dgemm_kernel_L4_M2_END: + + +dgemm_kernel_L4_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble dgemm_kernel_L4_END + +dgemm_kernel_L4_M1_20: + + INIT1x4 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble dgemm_kernel_L4_M1_40 + +dgemm_kernel_L4_M1_22: + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L4_M1_22 + + +dgemm_kernel_L4_M1_40: + + ands counterL , origK, #7 // counterL = 
counterL % 8 + ble dgemm_kernel_L4_M1_100 + +dgemm_kernel_L4_M1_42: + + KERNEL1x4_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L4_M1_42 + +dgemm_kernel_L4_M1_100: + + SAVE1x4 + + +dgemm_kernel_L4_END: + + lsl temp, origK, #5 + add origPB, origPB, temp // B = B + K * 4 * 8 + + subs counterJ, counterJ , #1 // j-- + bgt dgemm_kernel_L4_BEGIN + + +/******************************************************************************/ + +dgemm_kernel_L2_BEGIN: // less than 2 left in N direction + + mov counterJ , origN + tst counterJ , #3 + ble dgemm_kernel_L999 // error, N was less than 4? + + tst counterJ , #2 + ble dgemm_kernel_L1_BEGIN + + mov pCRow0, pC // pCRow0 = pC + + add pC,pC,LDC, lsl #1 + + mov pA, origPA // pA = A + + + +dgemm_kernel_L2_M4_BEGIN: + + mov counterI, origM + asr counterI, counterI, #2 // counterI = counterI / 4 + cmp counterI,#0 + ble dgemm_kernel_L2_M2_BEGIN + +dgemm_kernel_L2_M4_20: + + INIT4x2 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble dgemm_kernel_L2_M4_40 + .align 5 + +dgemm_kernel_L2_M4_22: + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L2_M4_22 + + +dgemm_kernel_L2_M4_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble dgemm_kernel_L2_M4_100 + +dgemm_kernel_L2_M4_42: + + KERNEL4x2_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L2_M4_42 + +dgemm_kernel_L2_M4_100: + + SAVE4x2 + +dgemm_kernel_L2_M4_END: + + subs counterI, counterI, #1 + bgt dgemm_kernel_L2_M4_20 + + +dgemm_kernel_L2_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble dgemm_kernel_L2_END + + tst counterI, #2 // counterI = counterI / 2 + ble dgemm_kernel_L2_M1_BEGIN + +dgemm_kernel_L2_M2_20: + + INIT2x2 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble dgemm_kernel_L2_M2_40 + +dgemm_kernel_L2_M2_22: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L2_M2_22 + + +dgemm_kernel_L2_M2_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble dgemm_kernel_L2_M2_100 + +dgemm_kernel_L2_M2_42: + + KERNEL2x2_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L2_M2_42 + +dgemm_kernel_L2_M2_100: + + SAVE2x2 + +dgemm_kernel_L2_M2_END: + + +dgemm_kernel_L2_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble dgemm_kernel_L2_END + +dgemm_kernel_L2_M1_20: + + INIT1x2 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL, #0 + ble dgemm_kernel_L2_M1_40 + +dgemm_kernel_L2_M1_22: + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L2_M1_22 + + +dgemm_kernel_L2_M1_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble dgemm_kernel_L2_M1_100 + +dgemm_kernel_L2_M1_42: + + KERNEL1x2_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L2_M1_42 + +dgemm_kernel_L2_M1_100: + + SAVE1x2 + + +dgemm_kernel_L2_END: + add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8 + +/******************************************************************************/ + +dgemm_kernel_L1_BEGIN: + + mov counterJ , origN + tst counterJ , #1 + ble dgemm_kernel_L999 // done + + + mov pCRow0, pC // pCRow0 = C + add pC , pC , LDC // update pC to point to next 
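+// Single remaining column of C (the N % 2 == 1 tail). The 4x1/2x1/1x1 micro
+// kernels below compute, in rough C terms (packed_A_row / packed_B_col are
+// hypothetical names for the panels prepared by the generic copy routines):
+//   for (i = 0; i < M; i++)
+//       C[i] += alpha * dot(packed_A_row(i), packed_B_col);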
+ + mov pA, origPA // pA = A + + + +dgemm_kernel_L1_M4_BEGIN: + + mov counterI, origM + asr counterI, counterI, #2 // counterI = counterI / 4 + cmp counterI, #0 + ble dgemm_kernel_L1_M2_BEGIN + +dgemm_kernel_L1_M4_20: + + INIT4x1 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble dgemm_kernel_L1_M4_40 + .align 5 + +dgemm_kernel_L1_M4_22: + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L1_M4_22 + + +dgemm_kernel_L1_M4_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble dgemm_kernel_L1_M4_100 + +dgemm_kernel_L1_M4_42: + + KERNEL4x1_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L1_M4_42 + +dgemm_kernel_L1_M4_100: + + SAVE4x1 + +dgemm_kernel_L1_M4_END: + + subs counterI, counterI, #1 + bgt dgemm_kernel_L1_M4_20 + + +dgemm_kernel_L1_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble dgemm_kernel_L1_END + + tst counterI, #2 // counterI = counterI / 2 + ble dgemm_kernel_L1_M1_BEGIN + +dgemm_kernel_L1_M2_20: + + INIT2x1 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble dgemm_kernel_L1_M2_40 + +dgemm_kernel_L1_M2_22: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L1_M2_22 + + +dgemm_kernel_L1_M2_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble dgemm_kernel_L1_M2_100 + +dgemm_kernel_L1_M2_42: + + KERNEL2x1_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L1_M2_42 + +dgemm_kernel_L1_M2_100: + + SAVE2x1 + +dgemm_kernel_L1_M2_END: + + +dgemm_kernel_L1_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble dgemm_kernel_L1_END + +dgemm_kernel_L1_M1_20: + + INIT1x1 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble dgemm_kernel_L1_M1_40 + +dgemm_kernel_L1_M1_22: + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L1_M1_22 + + +dgemm_kernel_L1_M1_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble dgemm_kernel_L1_M1_100 + +dgemm_kernel_L1_M1_42: + + KERNEL1x1_SUB + + subs counterL, counterL, #1 + bgt dgemm_kernel_L1_M1_42 + +dgemm_kernel_L1_M1_100: + + SAVE1x1 + + +dgemm_kernel_L1_END: + + +dgemm_kernel_L999: + mov x0, #0 // set return value + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) + ret + + EPILOGUE + diff --git a/kernel/arm64/dnrm2.S b/kernel/arm64/dnrm2.S new file mode 100644 index 000000000..3dec99efd --- /dev/null +++ b/kernel/arm64/dnrm2.S @@ -0,0 +1,169 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N x0 /* vector length */ +#define X x1 /* X vector address */ +#define INC_X x2 /* X stride */ +#define I x5 /* loop variable */ + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +#define TMPF d6 +#define SSQ d0 +#define TMPVF {v6.d}[0] +#define SZ 8 + +/******************************************************************************/ + +.macro KERNEL_F1 + ldr TMPF, [X], #SZ + fmul TMPF, TMPF, TMPF + fadd SSQ, SSQ, TMPF +.endm + +.macro KERNEL_F8 + ld1 {v1.2d, v2.2d}, [X], #32 + fmla v0.2d, v1.2d, v1.2d + fmla v5.2d, v2.2d, v2.2d + ld1 {v3.2d, v4.2d}, [X], #32 + fmla v0.2d, v3.2d, v3.2d + fmla v5.2d, v4.2d, v4.2d + PRFM PLDL1KEEP, [X, #1024] +.endm + +.macro nrm2_kernel_F8_FINALIZE + fadd v0.2d, v0.2d, v5.2d + faddp SSQ, v0.2d +.endm + +.macro INIT_S + lsl INC_X, INC_X, #3 + ld1 TMPVF, [X], INC_X + fmul SSQ, TMPF, TMPF +.endm + +.macro KERNEL_S1 + ld1 TMPVF, [X], INC_X + fmul TMPF, TMPF, TMPF + fadd SSQ, SSQ, TMPF +.endm + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + fmov SSQ, xzr + fmov d5, SSQ + + cmp N, xzr + ble nrm2_kernel_zero + cmp INC_X, xzr + ble nrm2_kernel_zero + cmp INC_X, #1 + bne nrm2_kernel_S_BEGIN + +nrm2_kernel_F_BEGIN: + + asr I, N, #3 + cmp I, xzr + beq nrm2_kernel_F1_INIT + +nrm2_kernel_F8: + + KERNEL_F8 + + subs I, I, #1 + bne nrm2_kernel_F8 + + nrm2_kernel_F8_FINALIZE + +nrm2_kernel_F1: + + ands I, N, #7 + ble nrm2_kernel_L999 + +nrm2_kernel_F10: + + KERNEL_F1 + + subs I, I, #1 + bne nrm2_kernel_F10 + + b nrm2_kernel_L999 + +nrm2_kernel_F1_INIT: + + b nrm2_kernel_F1 + +nrm2_kernel_S_BEGIN: + + INIT_S + + subs N, N, #1 + ble nrm2_kernel_L999 + + asr I, N, #2 + cmp I, xzr + ble nrm2_kernel_S1 + +nrm2_kernel_S4: + + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + + subs I, I, #1 + bne nrm2_kernel_S4 + +nrm2_kernel_S1: + + ands I, N, #3 + ble nrm2_kernel_L999 + 
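+// Remaining strided elements, one per iteration. The whole kernel uses the
+// plain (unscaled) formulation, roughly:
+//   ssq = 0.0;
+//   for (i = 0; i < n; i++) { t = x[i*inc_x]; ssq += t*t; }
+//   return sqrt(ssq);
+// i.e. no separate scaling pass against overflow/underflow is performed here.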
+nrm2_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne nrm2_kernel_S10 + +nrm2_kernel_L999: + fsqrt SSQ, SSQ + ret + +nrm2_kernel_zero: + ret + + EPILOGUE diff --git a/kernel/arm64/dot.S b/kernel/arm64/dot.S new file mode 100644 index 000000000..35d47790c --- /dev/null +++ b/kernel/arm64/dot.S @@ -0,0 +1,227 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N x0 /* vector length */ +#define X x1 /* X vector address */ +#define INC_X x2 /* X stride */ +#define Y x3 /* Y vector address */ +#define INC_Y x4 /* Y stride */ +#define I x5 /* loop variable */ + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +#if !defined(DOUBLE) +#if !defined(DSDOT) +#define REG0 wzr +#define DOTF s0 +#else // DSDOT +#define REG0 xzr +#define DOTF d0 +#endif +#define DOTI s1 +#define TMPX s2 +#define LD1VX {v2.s}[0] +#define TMPY s3 +#define LD1VY {v3.s}[0] +#define TMPVY v3.s[0] +#define SZ 4 +#else +#define REG0 xzr +#define DOTF d0 +#define DOTI d1 +#define TMPX d2 +#define LD1VX {v2.d}[0] +#define TMPY d3 +#define LD1VY {v3.d}[0] +#define TMPVY v3.d[0] +#define SZ 8 +#endif + +/******************************************************************************/ + +.macro KERNEL_F1 + ldr TMPX, [X], #SZ + ldr TMPY, [Y], #SZ +#if !defined(DSDOT) + fmadd DOTF, TMPX, TMPY, DOTF +#else // DSDOT + fmul TMPX, TMPX, TMPY + fcvt d2, TMPX + fadd DOTF, DOTF, d2 +#endif +.endm + +.macro KERNEL_F4 +#if !defined(DOUBLE) + ld1 {v2.4s}, [X], #16 + ld1 {v3.4s}, [Y], #16 +#if !defined(DSDOT) + fmla v0.4s, v2.4s, v3.4s +#else + fmul v2.4s, v2.4s, v3.4s + ext v3.16b, v2.16b, v2.16b, #8 + fcvtl v2.2d, v2.2s + fcvtl v3.2d, v3.2s + fadd v0.2d, v0.2d, v2.2d + fadd v0.2d, v0.2d, v3.2d +#endif +#else //DOUBLE + ld1 {v2.2d, v3.2d}, [X], #32 + ld1 {v4.2d, v5.2d}, [Y], #32 + fmul v2.2d, v2.2d, v4.2d + fmul v3.2d, v3.2d, v5.2d + fadd v0.2d, v0.2d, v2.2d + fadd v0.2d, v0.2d, v3.2d +#endif + PRFM PLDL1KEEP, [X, #1024] + PRFM PLDL1KEEP, [Y, #1024] +.endm + +.macro KERNEL_F4_FINALIZE +#if !defined(DOUBLE) +#if !defined(DSDOT) + ext v1.16b, v0.16b, v0.16b, #8 + fadd v0.2s, v0.2s, v1.2s + faddp DOTF, v0.2s +#else + faddp DOTF, v0.2d +#endif +#else //DOUBLE + faddp DOTF, v0.2d +#endif +.endm + +.macro INIT_S +#if !defined(DOUBLE) + lsl INC_X, INC_X, #2 + lsl INC_Y, INC_Y, #2 +#else + lsl INC_X, INC_X, #3 + lsl INC_Y, INC_Y, #3 +#endif +.endm + +.macro KERNEL_S1 + ld1 LD1VX, [X], INC_X + ld1 LD1VY, [Y], INC_Y +#if !defined(DSDOT) + fmadd DOTF, TMPX, TMPY, DOTF +#else // DSDOT + fmul TMPX, TMPX, TMPY + fcvt d2, TMPX + fadd DOTF, DOTF, d2 +#endif +.endm + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + fmov DOTF, REG0 +#if defined(DOUBLE) + fmov d6, DOTF +#endif + + cmp N, xzr + ble dot_kernel_L999 + + cmp INC_X, #1 + bne dot_kernel_S_BEGIN + cmp INC_Y, #1 + bne dot_kernel_S_BEGIN + +dot_kernel_F_BEGIN: + + asr I, N, #2 + cmp I, xzr + beq dot_kernel_F1 + +dot_kernel_F4: + + KERNEL_F4 + + subs I, I, #1 + bne dot_kernel_F4 + + KERNEL_F4_FINALIZE + +dot_kernel_F1: + + ands I, N, #3 + ble dot_kernel_L999 + +dot_kernel_F10: + + KERNEL_F1 + + subs I, I, #1 + bne dot_kernel_F10 + + ret + +dot_kernel_S_BEGIN: + + INIT_S + + asr I, N, #2 + cmp I, xzr + ble dot_kernel_S1 + +dot_kernel_S4: + + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + + subs I, I, #1 + bne dot_kernel_S4 + +dot_kernel_S1: + + ands I, N, #3 + ble dot_kernel_L999 + +dot_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne dot_kernel_S10 + +dot_kernel_L999: + + ret + + EPILOGUE diff --git a/kernel/arm64/dtrmm_kernel_4x4.S 
b/kernel/arm64/dtrmm_kernel_4x4.S new file mode 100644 index 000000000..0d1b12881 --- /dev/null +++ b/kernel/arm64/dtrmm_kernel_4x4.S @@ -0,0 +1,1398 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +/* X0 X1 X2 s0 X3 x4 x5 x6 x7*/ +/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc, BLASLONG offset) */ + +#define origM x0 +#define origN x1 +#define origK x2 +#define origPA x3 +#define origPB x4 +#define pC x5 +#define LDC x6 +#define offset x7 +#define counterL x8 +#define counterI x9 +#define counterJ x10 +#define pB x11 +#define pCRow0 x12 +#define pCRow1 x13 +#define pCRow2 x14 +#define pA x15 +#define temp x16 +#define tempOffset x17 +#define tempK x18 + +#define alpha0 d10 +#define alphaV0 v10.d[0] +#define alpha1 d11 +#define alphaV1 v11.d[0] +#define alpha2 d14 +#define alphaV2 v14.d[0] +#define alpha3 d15 +#define alphaV3 v15.d[0] + +// 00 origM +// 01 origN +// 02 origK +// 03 origPA +// 04 origPB +// 05 pC +// 06 origLDC -> LDC +// 07 offset +// 08 counterL +// 09 counterI +// 10 counterJ +// 11 pB +// 12 pCRow0 +// 13 pCRow1 +// 14 pCRow2 +// 15 pA +// 16 temp +// 17 tempOffset +// 18 must save tempK +// 19 must save +// 20 must save +// 21 must save +// 22 must save +// 23 must save +// 24 must save +// 25 must save +// 26 must save +// 27 must save +// 28 must save +// 29 frame +// 30 link +// 31 sp + +//v00 ALPHA -> pA00, pA01 +//v01 pA02, pA03 +//v02 +//v03 +//v04 pA10, pA11 +//v05 pA12, pA13 +//v06 +//v07 +//v08 must save pB00, pB01 +//v09 must save pB02, pB03 +//v10 must save ALPHA0 +//v11 must save ALPHA1 +//v12 must save pB10, pB11 +//v13 must save pB12, pB13 +//v14 must save ALPHA2 +//v15 must save ALPHA3 +//v16 must save C00, C01 +//v17 must save C02, C03 +//v18 +//v19 +//v20 C10, C11 +//v21 C12, C13 +//v22 +//v23 +//v24 C20, C21 +//v25 C22, C23 +//v26 +//v27 +//v28 C30, C31 
+//v29 C32, C33 +//v30 +//v31 + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +.macro INIT4x4 + fmov d16, xzr + fmov d17, d16 + fmov d20, d17 + fmov d21, d16 + fmov d24, d17 + fmov d25, d16 + fmov d28, d17 + fmov d29, d16 +.endm + +.macro KERNEL4x4_I + ld1 {v8.2d, v9.2d}, [pB] + add pB, pB, #32 + ld1 {v0.2d, v1.2d}, [pA] + add pA, pA, #32 + + fmul v16.2d, v0.2d, v8.2d[0] + fmul v29.2d, v1.2d, v9.2d[1] + + fmul v20.2d, v0.2d, v8.2d[1] + fmul v25.2d, v1.2d, v9.2d[0] + + fmul v24.2d, v0.2d, v9.2d[0] + fmul v21.2d, v1.2d, v8.2d[1] + + fmul v28.2d, v0.2d, v9.2d[1] + fmul v17.2d, v1.2d, v8.2d[0] + + ld1 {v12.2d, v13.2d}, [pB] + add pB, pB, #32 + ld1 {v4.2d, v5.2d}, [pA] + add pA, pA, #32 +.endm + +.macro KERNEL4x4_M1 + fmla v16.2d, v0.2d, v8.2d[0] + fmla v29.2d, v1.2d, v9.2d[1] + + ld1 {v12.2d, v13.2d}, [pB] // For next round + add pB, pB, #32 + + fmla v20.2d, v0.2d, v8.2d[1] + fmla v25.2d, v1.2d, v9.2d[0] + + ld1 {v4.2d, v5.2d}, [pA] // For next round + add pA, pA, #32 + + fmla v24.2d, v0.2d, v9.2d[0] + fmla v21.2d, v1.2d, v8.2d[1] + + prfm PLDL1KEEP, [pA, #512] + + fmla v28.2d, v0.2d, v9.2d[1] + fmla v17.2d, v1.2d, v8.2d[0] +.endm + +.macro KERNEL4x4_M2 + fmla v16.2d, v4.2d, v12.2d[0] + fmla v29.2d, v5.2d, v13.2d[1] + + ld1 {v8.2d, v9.2d}, [pB] // For next round + add pB, pB, #32 + + fmla v20.2d, v4.2d, v12.2d[1] + fmla v25.2d, v5.2d, v13.2d[0] + + ld1 {v0.2d, v1.2d}, [pA] // For next round + add pA, pA, #32 + + fmla v24.2d, v4.2d, v13.2d[0] + fmla v21.2d, v5.2d, v12.2d[1] + + prfm PLDL1KEEP, [pB, #512] + + fmla v28.2d, v4.2d, v13.2d[1] + fmla v17.2d, v5.2d, v12.2d[0] +.endm + +.macro KERNEL4x4_E + fmla v16.2d, v4.2d, v12.2d[0] + fmla v29.2d, v5.2d, v13.2d[1] + + fmla v20.2d, v4.2d, v12.2d[1] + fmla v25.2d, v5.2d, v13.2d[0] + + fmla v24.2d, v4.2d, v13.2d[0] + fmla v21.2d, v5.2d, v12.2d[1] + + fmla v28.2d, v4.2d, v13.2d[1] + fmla v17.2d, v5.2d, v12.2d[0] +.endm + +.macro KERNEL4x4_SUB + ld1 {v8.2d, v9.2d}, [pB] + add pB, pB, #32 + ld1 {v0.2d, v1.2d}, [pA] + add pA, pA, #32 + + fmla v16.2d, v0.2d, v8.2d[0] + fmla v29.2d, v1.2d, v9.2d[1] + + fmla v20.2d, v0.2d, v8.2d[1] + fmla v25.2d, v1.2d, v9.2d[0] + + fmla v24.2d, v0.2d, v9.2d[0] + fmla v21.2d, v1.2d, v8.2d[1] + + fmla v28.2d, v0.2d, v9.2d[1] + fmla v17.2d, v1.2d, v8.2d[0] +.endm + +.macro SAVE4x4 + fmul v8.2d, v16.2d, alphaV0 + fmul v9.2d, v17.2d, alphaV1 + st1 {v8.2d, v9.2d}, [pCRow0] + + add pCRow1, pCRow0, LDC + + fmul v12.2d, v20.2d, alphaV2 + fmul v13.2d, v21.2d, alphaV3 + st1 {v12.2d, v13.2d}, [pCRow1] + + add pCRow2, pCRow1, LDC + + fmul v8.2d, v24.2d, alphaV0 + fmul v9.2d, v25.2d, alphaV1 + st1 {v8.2d, v9.2d}, [pCRow2] + + add pCRow1, pCRow2, LDC + + fmul v12.2d, v28.2d, alphaV2 + fmul v13.2d, v29.2d, alphaV3 + st1 {v12.2d, v13.2d}, [pCRow1] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + +.macro INIT2x4 + fmov d16, xzr + fmov d20, d16 + fmov d24, d20 + fmov d28, d16 +.endm + +.macro KERNEL2x4_SUB + ld1 {v8.2d, v9.2d}, [pB] + add pB, pB, #32 + ld1 {v0.2d}, [pA] + add pA, pA, #16 + + fmla v16.2d, v0.2d, v8.2d[0] + fmla v20.2d, v0.2d, v8.2d[1] + fmla v24.2d, v0.2d, v9.2d[0] + fmla v28.2d, v0.2d, v9.2d[1] +.endm + +.macro SAVE2x4 + fmul v8.2d, v16.2d, alphaV0 + st1 {v8.2d}, [pCRow0] + + add pCRow1, pCRow0, LDC + + fmul v12.2d, v20.2d, alphaV1 + st1 {v12.2d}, [pCRow1] + + add pCRow2, pCRow1, LDC + + fmul v8.2d, v24.2d, alphaV2 + 
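+// Unlike the gemm SAVE macros, the trmm SAVE macros do not read C first:
+// the result is stored as C = alpha * acc (fmul + st1) rather than
+// C += alpha * acc (ld1 + fmla + st1), since TRMM overwrites its output.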
st1 {v8.2d}, [pCRow2] + + add pCRow1, pCRow2, LDC + + fmul v12.2d, v28.2d, alphaV3 + st1 {v12.2d}, [pCRow1] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT1x4 + fmov d16, xzr + fmov d20, d16 +.endm + +.macro KERNEL1x4_SUB + ldr d0, [pA] + add pA, pA, #8 + + ld1 {v8.2d, v9.2d}, [pB] + add pB, pB, #32 + + fmla v16.2d, v8.2d, v0.d[0] + fmla v20.2d, v9.2d, v0.d[0] +.endm + +.macro SAVE1x4 + add pCRow1, pCRow0, LDC + + fmul v8.2d, v16.2d, alphaV0 + st1 {v8.d}[0], [pCRow0] + st1 {v8.d}[1], [pCRow1] + + add pCRow2, pCRow1, LDC + add pCRow1, pCRow2, LDC + + fmul v12.2d, v20.2d, alphaV1 + st1 {v12.d}[0], [pCRow2] + st1 {v12.d}[1], [pCRow1] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ + +.macro INIT4x2 + fmov d16, xzr + fmov d17, d16 + fmov d20, d17 + fmov d21, d16 +.endm + +.macro KERNEL4x2_SUB + ld1 {v8.2d}, [pB] + add pB, pB, #16 + ld1 {v0.2d, v1.2d}, [pA] + add pA, pA, #32 + + fmla v16.2d, v0.2d, v8.2d[0] + fmla v17.2d, v1.2d, v8.2d[0] + fmla v20.2d, v0.2d, v8.2d[1] + fmla v21.2d, v1.2d, v8.2d[1] +.endm + +.macro SAVE4x2 + fmul v8.2d, v16.2d, alphaV0 + fmul v9.2d, v17.2d, alphaV1 + st1 {v8.2d, v9.2d}, [pCRow0] + + add pCRow1, pCRow0, LDC + + fmul v12.2d, v20.2d, alphaV2 + fmul v13.2d, v21.2d, alphaV3 + st1 {v12.2d, v13.2d}, [pCRow1] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + +.macro INIT2x2 + fmov d16, xzr + fmov d20, d16 +.endm + +.macro KERNEL2x2_SUB + ld1 {v8.2d}, [pB] + add pB, pB, #16 + + ld1 {v0.2d}, [pA] + add pA, pA, #16 + + fmla v16.2d, v0.2d, v8.2d[0] + fmla v20.2d, v0.2d, v8.2d[1] +.endm + +.macro SAVE2x2 + fmul v8.2d, v16.2d, alphaV0 + st1 {v8.2d}, [pCRow0] + + add pCRow1 , pCRow0, LDC + + fmul v12.2d, v20.2d, alphaV1 + st1 {v12.2d}, [pCRow1] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT1x2 + fmov d16, xzr +.endm + +.macro KERNEL1x2_SUB + ld1 {v8.2d} , [pB] + add pB , pB, #16 + + ldr d0 , [pA] + add pA, pA, #8 + + fmla v16.2d, v8.2d, v0.2d[0] +.endm + +.macro SAVE1x2 + add pCRow1 , pCRow0, LDC + + fmul v8.2d, v16.2d, alphaV0 + st1 {v8.d}[0], [pCRow0] + st1 {v8.d}[1], [pCRow1] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ + +.macro INIT4x1 + fmov d16, xzr + fmov d17, d16 +.endm + +.macro KERNEL4x1_SUB + ldr d8, [pB] + add pB , pB, #8 + + ld1 {v0.2d, v1.2d}, [pA] + add pA , pA, #32 + + fmla v16.2d, v0.2d, v8.2d[0] + fmla v17.2d, v1.2d, v8.2d[0] +.endm + +.macro SAVE4x1 + fmul v8.2d, v16.2d, alphaV0 + fmul v9.2d, v17.2d, alphaV1 + st1 {v8.2d, v9.2d}, [pCRow0] + + add pCRow0, pCRow0, #32 +.endm + + + + +/******************************************************************************/ + +.macro INIT2x1 + fmov d16, xzr +.endm + +.macro KERNEL2x1_SUB + ldr d8, [pB] + add pB , pB, #8 + + ld1 {v0.2d}, [pA] + add pA , pA, #16 + + fmla v16.2d, v0.2d, v8.2d[0] +.endm + +.macro SAVE2x1 + fmul v8.2d, v16.2d, alphaV0 + st1 {v8.2d}, [pCRow0] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT1x1 + fmov d16, xzr +.endm + +.macro KERNEL1x1_SUB + ldr d8, [pB] + add pB , pB, #8 + + ldr d0, [pA] + add pA , pA, #8 + + fmadd d16, d0, d8, d16 +.endm + +.macro SAVE1x1 + fmul d8, d16, alpha0 + str d8, [pCRow0] + + add pCRow0, pCRow0, #8 
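+	// As in the other SAVEmxn macros above, alpha*result is written with fmul/str and C
+	// is never read back: this TRMM kernel overwrites its output block rather than
+	// accumulating into it.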
+.endm + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + .align 5 + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] + + fmov alpha0, d0 + fmov alpha1, d0 + fmov alpha2, d0 + fmov alpha3, d0 + + lsl LDC, LDC, #3 // ldc = ldc * 8 + + +#if !defined(LEFT) + neg tempOffset, offset +#endif + + mov pB, origPB + + mov counterJ, origN + asr counterJ, counterJ, #2 // J = J / 4 + cmp counterJ, #0 + ble dtrmm_kernel_L2_BEGIN + +/******************************************************************************/ + +dtrmm_kernel_L4_BEGIN: + mov pCRow0, pC // pCRow0 = C + add pC, pC, LDC, lsl #2 + +#if defined(LEFT) + mov tempOffset, offset +#endif + + mov pA, origPA // pA = start of A array + +dtrmm_kernel_L4_M4_BEGIN: + + mov counterI, origM + asr counterI, counterI, #2 // counterI = counterI / 4 + cmp counterI, #0 + ble dtrmm_kernel_L4_M2_BEGIN + +dtrmm_kernel_L4_M4_20: + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #5 + add pB, pB, temp + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #4 +#else + add tempK, tempOffset, #4 +#endif + + asr counterL , tempK, #1 // L = K / 2 + cmp counterL , #2 // is there at least 4 to do? 
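+	// Software-pipelined 4x4 loop: KERNEL4x4_I starts the pipeline (first loads and fmul,
+	// plus the loads for the next round), KERNEL4x4_M1/_M2 alternate between the two
+	// register sets (v0,v1/v8,v9 and v4,v5/v12,v13), each accumulating the current round
+	// while fetching the next, and KERNEL4x4_E drains the pipeline without further loads.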
+ blt dtrmm_kernel_L4_M4_32 + + KERNEL4x4_I // do one in the K + KERNEL4x4_M2 // do another in the K + + subs counterL, counterL, #2 + ble dtrmm_kernel_L4_M4_22a + .align 5 + +dtrmm_kernel_L4_M4_22: + + KERNEL4x4_M1 + KERNEL4x4_M2 + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L4_M4_22 + + +dtrmm_kernel_L4_M4_22a: + + KERNEL4x4_M1 + KERNEL4x4_E + + b dtrmm_kernel_L4_M4_44 + +dtrmm_kernel_L4_M4_32: + + tst counterL, #1 + ble dtrmm_kernel_L4_M4_40 + + KERNEL4x4_I + + KERNEL4x4_E + + b dtrmm_kernel_L4_M4_44 + + +dtrmm_kernel_L4_M4_40: + + INIT4x4 + +dtrmm_kernel_L4_M4_44: + + ands counterL , tempK, #1 + ble dtrmm_kernel_L4_M4_100 + +dtrmm_kernel_L4_M4_46: + + KERNEL4x4_SUB + +dtrmm_kernel_L4_M4_100: + + SAVE4x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #4 +#else + sub tempK, tempK, #4 +#endif + lsl temp, tempK, #5 + add pA, pA, temp + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #4 +#endif + +dtrmm_kernel_L4_M4_END: + subs counterI, counterI, #1 + bne dtrmm_kernel_L4_M4_20 + +dtrmm_kernel_L4_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble dtrmm_kernel_L4_END + + tst counterI, #2 // counterI = counterI / 2 + ble dtrmm_kernel_L4_M1_BEGIN + +dtrmm_kernel_L4_M2_20: + + INIT2x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #4 + add pA, pA, temp + lsl temp, tempOffset, #5 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #2 +#else + add tempK, tempOffset, #4 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble dtrmm_kernel_L4_M2_40 + +dtrmm_kernel_L4_M2_22: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L4_M2_22 + + +dtrmm_kernel_L4_M2_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble dtrmm_kernel_L4_M2_100 + +dtrmm_kernel_L4_M2_42: + + KERNEL2x4_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L4_M2_42 + +dtrmm_kernel_L4_M2_100: + + SAVE2x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #2 +#else + sub tempK, tempK, #4 +#endif + lsl temp, tempK, #4 + add pA, pA, temp + lsl temp, tempK, #5 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #2 +#endif + +dtrmm_kernel_L4_M2_END: + + +dtrmm_kernel_L4_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble dtrmm_kernel_L4_END + +dtrmm_kernel_L4_M1_20: + + INIT1x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #5 + add pB, pB, temp + lsl temp, tempOffset, #3 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #1 +#else + add tempK, tempOffset, #4 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble dtrmm_kernel_L4_M1_40 + +dtrmm_kernel_L4_M1_22: + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + 
+ subs counterL, counterL, #1 + bgt dtrmm_kernel_L4_M1_22 + + +dtrmm_kernel_L4_M1_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble dtrmm_kernel_L4_M1_100 + +dtrmm_kernel_L4_M1_42: + + KERNEL1x4_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L4_M1_42 + +dtrmm_kernel_L4_M1_100: + + SAVE1x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #1 +#else + sub tempK, tempK, #4 +#endif + lsl temp, tempK, #3 + add pA, pA, temp + lsl temp, tempK, #5 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #1 +#endif + +dtrmm_kernel_L4_END: + + lsl temp, origK, #5 + add origPB, origPB, temp // B = B + K * 4 * 8 + +#if !defined(LEFT) + add tempOffset, tempOffset, #4 +#endif + + subs counterJ, counterJ , #1 // j-- + bgt dtrmm_kernel_L4_BEGIN + + +/******************************************************************************/ + +dtrmm_kernel_L2_BEGIN: // less than 2 left in N direction + + mov counterJ , origN + tst counterJ , #3 + ble dtrmm_kernel_L999 // error, N was less than 4? + + tst counterJ , #2 + ble dtrmm_kernel_L1_BEGIN + + mov pCRow0, pC // pCRow0 = pC + + add pC,pC,LDC, lsl #1 + +#if defined(LEFT) + mov tempOffset, offset +#endif + + mov pA, origPA // pA = A + + +dtrmm_kernel_L2_M4_BEGIN: + + mov counterI, origM + asr counterI, counterI, #2 // counterI = counterI / 4 + cmp counterI,#0 + ble dtrmm_kernel_L2_M2_BEGIN + +dtrmm_kernel_L2_M4_20: + + INIT4x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #4 + add pB, pB, temp + lsl temp, tempOffset, #5 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #4 +#else + add tempK, tempOffset, #2 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble dtrmm_kernel_L2_M4_40 + .align 5 + +dtrmm_kernel_L2_M4_22: + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L2_M4_22 + + +dtrmm_kernel_L2_M4_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble dtrmm_kernel_L2_M4_100 + +dtrmm_kernel_L2_M4_42: + + KERNEL4x2_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L2_M4_42 + +dtrmm_kernel_L2_M4_100: + + SAVE4x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #4 +#else + sub tempK, tempK, #2 +#endif + lsl temp, tempK, #5 + add pA, pA, temp + lsl temp, tempK, #4 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #4 +#endif + +dtrmm_kernel_L2_M4_END: + + subs counterI, counterI, #1 + bgt dtrmm_kernel_L2_M4_20 + + +dtrmm_kernel_L2_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble dtrmm_kernel_L2_END + + tst counterI, #2 // counterI = counterI / 2 + ble dtrmm_kernel_L2_M1_BEGIN + +dtrmm_kernel_L2_M2_20: + + INIT2x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #4 + add pB, pB, temp + lsl temp, tempOffset, #4 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, 
tempOffset, #2 +#else + add tempK, tempOffset, #2 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble dtrmm_kernel_L2_M2_40 + +dtrmm_kernel_L2_M2_22: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L2_M2_22 + + +dtrmm_kernel_L2_M2_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble dtrmm_kernel_L2_M2_100 + +dtrmm_kernel_L2_M2_42: + + KERNEL2x2_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L2_M2_42 + +dtrmm_kernel_L2_M2_100: + + SAVE2x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #2 +#else + sub tempK, tempK, #2 +#endif + lsl temp, tempK, #4 + add pA, pA, temp + lsl temp, tempK, #4 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #2 +#endif + +dtrmm_kernel_L2_M2_END: + + +dtrmm_kernel_L2_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble dtrmm_kernel_L2_END + +dtrmm_kernel_L2_M1_20: + + INIT1x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #4 + add pB, pB, temp + lsl temp, tempOffset, #3 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #1 +#else + add tempK, tempOffset, #2 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL, #0 + ble dtrmm_kernel_L2_M1_40 + +dtrmm_kernel_L2_M1_22: + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L2_M1_22 + + +dtrmm_kernel_L2_M1_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble dtrmm_kernel_L2_M1_100 + +dtrmm_kernel_L2_M1_42: + + KERNEL1x2_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L2_M1_42 + +dtrmm_kernel_L2_M1_100: + + SAVE1x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #1 +#else + sub tempK, tempK, #2 +#endif + lsl temp, tempK, #3 + add pA, pA, temp + lsl temp, tempK, #4 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #1 +#endif + +dtrmm_kernel_L2_END: +#if !defined(LEFT) + add tempOffset, tempOffset, #2 +#endif + add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8 + +/******************************************************************************/ + +dtrmm_kernel_L1_BEGIN: + + mov counterJ , origN + tst counterJ , #1 + ble dtrmm_kernel_L999 // done + + + mov pCRow0, pC // pCRow0 = C + add pC , pC , LDC // Update pC to point to next + +#if defined(LEFT) + mov tempOffset, offset +#endif + + mov pA, origPA // pA = A + +dtrmm_kernel_L1_M4_BEGIN: + + mov counterI, origM + asr counterI, counterI, #2 // counterI = counterI / 4 + cmp counterI, #0 + ble dtrmm_kernel_L1_M2_BEGIN + +dtrmm_kernel_L1_M4_20: + + INIT4x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #3 + add pB, pB, temp + lsl temp, tempOffset, #5 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add 
tempK, tempOffset, #4 +#else + add tempK, tempOffset, #1 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble dtrmm_kernel_L1_M4_40 + .align 5 + +dtrmm_kernel_L1_M4_22: + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L1_M4_22 + + +dtrmm_kernel_L1_M4_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble dtrmm_kernel_L1_M4_100 + +dtrmm_kernel_L1_M4_42: + + KERNEL4x1_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L1_M4_42 + +dtrmm_kernel_L1_M4_100: + + SAVE4x1 + + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #4 +#else + sub tempK, tempK, #1 +#endif + lsl temp, tempK, #5 + add pA, pA, temp + lsl temp, tempK, #3 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #4 +#endif + +dtrmm_kernel_L1_M4_END: + + subs counterI, counterI, #1 + bgt dtrmm_kernel_L1_M4_20 + + +dtrmm_kernel_L1_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble dtrmm_kernel_L1_END + + tst counterI, #2 // counterI = counterI / 2 + ble dtrmm_kernel_L1_M1_BEGIN + +dtrmm_kernel_L1_M2_20: + + INIT2x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #3 + add pB, pB, temp + lsl temp, tempOffset, #4 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #2 +#else + add tempK, tempOffset, #1 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble dtrmm_kernel_L1_M2_40 + +dtrmm_kernel_L1_M2_22: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L1_M2_22 + + +dtrmm_kernel_L1_M2_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble dtrmm_kernel_L1_M2_100 + +dtrmm_kernel_L1_M2_42: + + KERNEL2x1_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L1_M2_42 + +dtrmm_kernel_L1_M2_100: + + SAVE2x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #2 +#else + sub tempK, tempK, #1 +#endif + lsl temp, tempK, #4 + add pA, pA, temp + lsl temp, tempK, #3 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #2 +#endif + +dtrmm_kernel_L1_M2_END: + + +dtrmm_kernel_L1_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble dtrmm_kernel_L1_END + +dtrmm_kernel_L1_M1_20: + + INIT1x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #3 + add pB, pB, temp + lsl temp, tempOffset, #3 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #1 +#else + add tempK, tempOffset, #1 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble dtrmm_kernel_L1_M1_40 + +dtrmm_kernel_L1_M1_22: + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs counterL, counterL, #1 + bgt 
dtrmm_kernel_L1_M1_22 + + +dtrmm_kernel_L1_M1_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble dtrmm_kernel_L1_M1_100 + +dtrmm_kernel_L1_M1_42: + + KERNEL1x1_SUB + + subs counterL, counterL, #1 + bgt dtrmm_kernel_L1_M1_42 + +dtrmm_kernel_L1_M1_100: + + SAVE1x1 + + +dtrmm_kernel_L1_END: + + +dtrmm_kernel_L999: + mov x0, #0 // set return value + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) + ret + + EPILOGUE + diff --git a/kernel/arm64/gemv_n.S b/kernel/arm64/gemv_n.S new file mode 100644 index 000000000..6279c2250 --- /dev/null +++ b/kernel/arm64/gemv_n.S @@ -0,0 +1,320 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
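+
+Description: the kernel below computes y := y + alpha * A * x, one column of A at a
+time; a vectorized path is taken when INC_Y == 1 and a scalar strided path otherwise.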
+*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M x0 /* Y vector length */ +#define N x1 /* X vector length */ +#define A x3 /* A vector address */ +#define LDA x4 /* A stride */ +#define X x5 /* X vector address */ +#define INC_X x6 /* X stride */ +#define Y x7 /* Y vector address */ +#define INC_Y x2 /* Y stride */ +#define A_PTR x9 /* loop A vector address */ +#define Y_IPTR x10 /* loop Y vector address */ +#define J x11 /* loop variable */ +#define I x12 /* loop variable */ +#define Y_OPTR x13 /* loop Y vector address */ + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +#if !defined(DOUBLE) +#define ALPHA s0 +#define TEMP s1 +#define TEMPV {v1.s}[0] +#define TMP1 s2 +#define TMPV1 {v2.s}[0] +#define TMP2 s3 +#define TMPV2 {v3.s}[0] +#define SZ 4 +#define SHZ 2 +#else +#define ALPHA d0 +#define TEMP d1 +#define TEMPV {v1.d}[0] +#define TMP1 d2 +#define TMPV1 {v2.d}[0] +#define TMP2 d3 +#define TMPV2 {v3.d}[0] +#define SZ 8 +#define SHZ 3 +#endif + +/******************************************************************************/ + +.macro SAVE_REGS + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] +.endm + +.macro RESTORE_REGS + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) +.endm + +.macro KERNEL_F16 +#if !defined(DOUBLE) + ld1 {v2.4s, v3.4s}, [A_PTR], #32 + ld1 {v4.4s, v5.4s}, [Y_IPTR], #32 + fmla v4.4s, v1.4s, v2.4s + fmla v5.4s, v1.4s, v3.4s + st1 {v4.4s, v5.4s}, [Y_OPTR], #32 + + ld1 {v6.4s, v7.4s}, [A_PTR], #32 + ld1 {v8.4s, v9.4s}, [Y_IPTR], #32 + fmla v8.4s, v1.4s, v6.4s + fmla v9.4s, v1.4s, v7.4s + st1 {v8.4s, v9.4s}, [Y_OPTR], #32 +#else //DOUBLE + ld1 {v2.2d, v3.2d}, [A_PTR], #32 + ld1 {v4.2d, v5.2d}, [Y_IPTR], #32 + fmla v4.2d, v1.2d, v2.2d + fmla v5.2d, v1.2d, v3.2d + st1 {v4.2d, v5.2d}, [Y_OPTR], #32 + + ld1 {v6.2d, v7.2d}, [A_PTR], #32 + ld1 {v8.2d, v9.2d}, [Y_IPTR], #32 + fmla v8.2d, v1.2d, v6.2d + fmla v9.2d, v1.2d, v7.2d + st1 {v8.2d, v9.2d}, [Y_OPTR], #32 + + ld1 {v10.2d, v11.2d}, [A_PTR], #32 + ld1 {v12.2d, v13.2d}, [Y_IPTR], #32 + fmla v12.2d, v1.2d, v10.2d + fmla v13.2d, v1.2d, v11.2d + st1 {v12.2d, v13.2d}, [Y_OPTR], #32 + + ld1 {v14.2d, v15.2d}, [A_PTR], #32 + ld1 {v16.2d, v17.2d}, [Y_IPTR], #32 + fmla v16.2d, v1.2d, v14.2d + fmla v17.2d, v1.2d, v15.2d + st1 {v16.2d, v17.2d}, [Y_OPTR], #32 +#endif +.endm + +.macro KERNEL_F4 +#if !defined(DOUBLE) + ld1 {v2.4s}, [A_PTR], #16 + ld1 {v3.4s}, [Y_IPTR], #16 + fmla v3.4s, v1.4s, v2.4s + st1 {v3.4s}, [Y_OPTR], #16 +#else + ld1 {v2.2d}, [A_PTR], #16 + ld1 {v3.2d}, [Y_IPTR], #16 + fmla v3.2d, v1.2d, v2.2d + st1 {v3.2d}, [Y_OPTR], #16 + + ld1 {v4.2d}, [A_PTR], #16 + ld1 {v5.2d}, [Y_IPTR], #16 + fmla v5.2d, v1.2d, v4.2d + st1 {v5.2d}, [Y_OPTR], #16 +#endif +.endm + +.macro 
KERNEL_F1 + + ld1 TMPV1, [A_PTR], #SZ + ld1 TMPV2, [Y_IPTR] + fmadd TMP2, TEMP, TMP1, TMP2 + st1 TMPV2, [Y_IPTR], #SZ + +.endm + +.macro INIT_S + + lsl INC_Y, INC_Y, #SHZ + +.endm + +.macro KERNEL_S1 + + ld1 TMPV1, [A_PTR], #SZ + ld1 TMPV2, [Y_IPTR] + fmadd TMP2, TEMP, TMP1, TMP2 + st1 TMPV2, [Y_IPTR], INC_Y + +.endm + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + ldr INC_Y, [sp] + + SAVE_REGS + + cmp N, xzr + ble gemv_n_kernel_L999 + cmp M, xzr + ble gemv_n_kernel_L999 + + lsl LDA, LDA, #SHZ + lsl INC_X, INC_X, #SHZ + mov J, N + + cmp INC_Y, #1 + bne gemv_n_kernel_S_BEGIN + +gemv_n_kernel_F_LOOP: + + ld1 TEMPV, [X], INC_X + fmul TEMP, ALPHA, TEMP +#if !defined(DOUBLE) + ins v1.s[1], v1.s[0] + ins v1.s[2], v1.s[0] + ins v1.s[3], v1.s[0] +#else + ins v1.d[1], v1.d[0] +#endif + mov A_PTR, A + mov Y_IPTR, Y + mov Y_OPTR, Y + +gemv_n_kernel_F32: + + asr I, M, #5 + cmp I, xzr + beq gemv_n_kernel_F4 + +gemv_n_kernel_F320: + + KERNEL_F16 + KERNEL_F16 + + subs I, I, #1 + bne gemv_n_kernel_F320 + +gemv_n_kernel_F4: + ands I, M, #31 + asr I, I, #2 + cmp I, xzr + beq gemv_n_kernel_F1 + +gemv_n_kernel_F40: + + KERNEL_F4 + + subs I, I, #1 + bne gemv_n_kernel_F40 + +gemv_n_kernel_F1: + ands I, M, #3 + ble gemv_n_kernel_F_END + +gemv_n_kernel_F10: + + KERNEL_F1 + + subs I, I, #1 + bne gemv_n_kernel_F10 + +gemv_n_kernel_F_END: + + add A, A, LDA + subs J, J, #1 + bne gemv_n_kernel_F_LOOP + + b gemv_n_kernel_L999 + +gemv_n_kernel_S_BEGIN: + + INIT_S + +gemv_n_kernel_S_LOOP: + + ld1 TEMPV, [X], INC_X + fmul TEMP, ALPHA, TEMP + mov A_PTR, A + mov Y_IPTR, Y + + asr I, M, #2 + cmp I, xzr + ble gemv_n_kernel_S1 + +gemv_n_kernel_S4: + + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + + subs I, I, #1 + bne gemv_n_kernel_S4 + +gemv_n_kernel_S1: + + ands I, M, #3 + ble gemv_n_kernel_S_END + +gemv_n_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne gemv_n_kernel_S10 + +gemv_n_kernel_S_END: + + add A, A, LDA + subs J, J, #1 + bne gemv_n_kernel_S_LOOP + +gemv_n_kernel_L999: + + mov w0, wzr + + RESTORE_REGS + + ret + + EPILOGUE diff --git a/kernel/arm64/gemv_t.S b/kernel/arm64/gemv_t.S new file mode 100644 index 000000000..0145af621 --- /dev/null +++ b/kernel/arm64/gemv_t.S @@ -0,0 +1,347 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M x0 /* Y vector length */ +#define N x1 /* X vector length */ +#define A x3 /* A vector address */ +#define LDA x4 /* A stride */ +#define X x5 /* X vector address */ +#define INC_X x6 /* X stride */ +#define Y x7 /* Y vector address */ +#define INC_Y x2 /* Y stride */ +#define A_PTR x9 /* loop A vector address */ +#define X_PTR x10 /* loop X vector address */ +#define J x11 /* loop variable */ +#define I x12 /* loop variable */ + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +#if !defined(DOUBLE) +#define REG0 wzr +#define ALPHA s0 +#define TEMP s1 +#define TEMP1 s2 +#define TEMP2 s3 +#define TEMP3 s4 +#define TEMPV {v1.s}[0] +#define TMP1 s2 +#define TMPV1 {v2.s}[0] +#define TMP2 s3 +#define TMPV2 {v3.s}[0] +#define SZ 4 +#define SHZ 2 +#else +#define REG0 xzr +#define ALPHA d0 +#define TEMP d1 +#define TEMP1 d2 +#define TEMP2 d3 +#define TEMP3 d4 +#define TEMPV {v1.d}[0] +#define TMP1 d2 +#define TMPV1 {v2.d}[0] +#define TMP2 d3 +#define TMPV2 {v3.d}[0] +#define SZ 8 +#define SHZ 3 +#endif + +/******************************************************************************/ + +.macro SAVE_REGS + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] +.endm + +.macro RESTORE_REGS + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) +.endm + +.macro KERNEL_F32 +#if !defined(DOUBLE) + ld1 {v5.4s, v6.4s, v7.4s, v8.4s}, [A_PTR], #64 + ld1 {v9.4s, v10.4s, v11.4s, v12.4s}, [X_PTR], #64 + fmla v1.4s, v5.4s, v9.4s + fmla v2.4s, v6.4s, v10.4s + fmla v3.4s, v7.4s, v11.4s + fmla v4.4s, v8.4s, v12.4s + + ld1 {v13.4s, v14.4s, v15.4s, v16.4s}, [A_PTR], #64 + ld1 {v17.4s, v18.4s, v19.4s, v20.4s}, [X_PTR], #64 + fmla v1.4s, v13.4s, v17.4s + fmla v2.4s, v14.4s, v18.4s + fmla v3.4s, v15.4s, v19.4s + fmla v4.4s, v16.4s, v20.4s +#else + ld1 {v5.2d, v6.2d, v7.2d, v8.2d}, [A_PTR], #64 + ld1 {v9.2d, v10.2d, v11.2d, v12.2d}, [X_PTR], #64 + fmla v1.2d, v5.2d, v9.2d + fmla v2.2d, v6.2d, v10.2d + fmla v3.2d, v7.2d, v11.2d + fmla v4.2d, v8.2d, v12.2d + + ld1 {v13.2d, v14.2d, v15.2d, v16.2d}, [A_PTR], #64 + ld1 {v17.2d, v18.2d, v19.2d, v20.2d}, [X_PTR], #64 + fmla v1.2d, v13.2d, v17.2d + fmla v2.2d, v14.2d, v18.2d + 
fmla v3.2d, v15.2d, v19.2d + fmla v4.2d, v16.2d, v20.2d + + ld1 {v5.2d, v6.2d, v7.2d, v8.2d}, [A_PTR], #64 + ld1 {v9.2d, v10.2d, v11.2d, v12.2d}, [X_PTR], #64 + fmla v1.2d, v5.2d, v9.2d + fmla v2.2d, v6.2d, v10.2d + fmla v3.2d, v7.2d, v11.2d + fmla v4.2d, v8.2d, v12.2d + + ld1 {v13.2d, v14.2d, v15.2d, v16.2d}, [A_PTR], #64 + ld1 {v17.2d, v18.2d, v19.2d, v20.2d}, [X_PTR], #64 + fmla v1.2d, v13.2d, v17.2d + fmla v2.2d, v14.2d, v18.2d + fmla v3.2d, v15.2d, v19.2d + fmla v4.2d, v16.2d, v20.2d +#endif +.endm + +.macro KERNEL_F32_FINALIZE +#if !defined(DOUBLE) + fadd v1.4s, v1.4s, v2.4s + fadd v1.4s, v1.4s, v3.4s + fadd v1.4s, v1.4s, v4.4s +#else + fadd v1.2d, v1.2d, v2.2d + fadd v1.2d, v1.2d, v3.2d + fadd v1.2d, v1.2d, v4.2d +#endif +.endm + +.macro KERNEL_F4 +#if !defined(DOUBLE) + ld1 {v2.4s}, [A_PTR], #16 + ld1 {v3.4s}, [X_PTR], #16 + fmla v1.4s, v2.4s, v3.4s +#else + ld1 {v2.2d}, [A_PTR], #16 + ld1 {v3.2d}, [X_PTR], #16 + fmla v1.2d, v2.2d, v3.2d + + ld1 {v4.2d}, [A_PTR], #16 + ld1 {v5.2d}, [X_PTR], #16 + fmla v1.2d, v4.2d, v5.2d +#endif +.endm + +.macro KERNEL_F4_FINALIZE +#if !defined(DOUBLE) + ext v2.16b, v1.16b, v1.16b, #8 + fadd v1.2s, v1.2s, v2.2s + faddp TEMP, v1.2s +#else + faddp TEMP, v1.2d +#endif +.endm + +.macro KERNEL_F1 + ld1 TMPV1, [A_PTR], #SZ + ld1 TMPV2, [X_PTR], #SZ + fmadd TEMP, TMP1, TMP2, TEMP +.endm + +.macro INIT_S + lsl INC_X, INC_X, #SHZ +.endm + +.macro KERNEL_S1 + ld1 TMPV1, [A_PTR], #SZ + ld1 TMPV2, [X_PTR], INC_X + fmadd TEMP, TMP1, TMP2, TEMP +.endm + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + ldr INC_Y, [sp] + + SAVE_REGS + + cmp N, xzr + ble gemv_t_kernel_L999 + cmp M, xzr + ble gemv_t_kernel_L999 + + lsl LDA, LDA, #SHZ + lsl INC_Y, INC_Y, #SHZ + mov J, N + + cmp INC_X, #1 + bne gemv_t_kernel_S_BEGIN + +gemv_t_kernel_F_LOOP: + + fmov TEMP, REG0 + fmov TEMP1, REG0 + fmov TEMP2, REG0 + fmov TEMP3, REG0 + + mov A_PTR, A + mov X_PTR, X + +gemv_t_kernel_F32: + + asr I, M, #5 + cmp I, xzr + beq gemv_t_kernel_F4 + +gemv_t_kernel_F320: + + KERNEL_F32 + + subs I, I, #1 + bne gemv_t_kernel_F320 + + KERNEL_F32_FINALIZE + +gemv_t_kernel_F4: + ands I, M, #31 + asr I, I, #2 + cmp I, xzr + beq gemv_t_kernel_F1 + +gemv_t_kernel_F40: + + KERNEL_F4 + + subs I, I, #1 + bne gemv_t_kernel_F40 + +gemv_t_kernel_F1: + + KERNEL_F4_FINALIZE + + ands I, M, #3 + ble gemv_t_kernel_F_END + +gemv_t_kernel_F10: + + KERNEL_F1 + + subs I, I, #1 + bne gemv_t_kernel_F10 + +gemv_t_kernel_F_END: + + ld1 TMPV1, [Y] + add A, A, LDA + subs J, J, #1 + fmadd TMP1, ALPHA, TEMP, TMP1 + st1 TMPV1, [Y], INC_Y + bne gemv_t_kernel_F_LOOP + + b gemv_t_kernel_L999 + +gemv_t_kernel_S_BEGIN: + + INIT_S + +gemv_t_kernel_S_LOOP: + + fmov TEMP, REG0 + mov A_PTR, A + mov X_PTR, X + + asr I, M, #2 + cmp I, xzr + ble gemv_t_kernel_S1 + +gemv_t_kernel_S4: + + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + + subs I, I, #1 + bne gemv_t_kernel_S4 + +gemv_t_kernel_S1: + + ands I, M, #3 + ble gemv_t_kernel_S_END + +gemv_t_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne gemv_t_kernel_S10 + +gemv_t_kernel_S_END: + + ld1 TMPV1, [Y] + add A, A, LDA + subs J, J, #1 + fmadd TMP1, ALPHA, TEMP, TMP1 + st1 TMPV1, [Y], INC_Y + bne gemv_t_kernel_S_LOOP + +gemv_t_kernel_L999: + + RESTORE_REGS + + mov w0, wzr + ret + + EPILOGUE diff --git a/kernel/arm64/idamax.S b/kernel/arm64/idamax.S new file mode 100644 index 000000000..fd4265899 --- /dev/null +++ 
b/kernel/arm64/idamax.S @@ -0,0 +1,124 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N x0 /* vector length */ +#define X x1 /* X vector address */ +#define INC_X x2 /* X stride */ +#define INDEX x3 /* index of max/min value */ +#define Z x4 /* vector index */ +#define I x5 /* loop variable */ + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +#if defined(USE_MIN) +#define COND le +#else +#define COND ge +#endif + +#define MAXF d0 +#define TMPF d1 +#define TMPVF {v1.d}[0] +#define SZ 8 + +/******************************************************************************/ + +.macro INIT_S + lsl INC_X, INC_X, #3 + ld1 {v0.d}[0], [X], INC_X + mov Z, #1 + mov INDEX, Z + fabs MAXF, MAXF +.endm + +.macro KERNEL_S1 + ld1 TMPVF, [X], INC_X + add Z, Z, #1 + fabs TMPF, TMPF + fcmp MAXF, TMPF + fcsel MAXF, MAXF, TMPF, COND + csel INDEX, INDEX, Z, COND +.endm + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + cmp N, xzr + ble iamax_kernel_zero + cmp INC_X, xzr + ble iamax_kernel_zero + + INIT_S + + subs N, N, #1 + ble iamax_kernel_L999 + + asr I, N, #2 + cmp I, xzr + ble iamax_kernel_S1 + +iamax_kernel_S4: + + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + + subs I, I, #1 + bne iamax_kernel_S4 + +iamax_kernel_S1: + + ands I, N, #3 + ble iamax_kernel_L999 + +iamax_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne iamax_kernel_S10 + +iamax_kernel_L999: + + mov x0, INDEX + ret + +iamax_kernel_zero: + + mov x0, xzr + ret + + EPILOGUE diff --git a/kernel/arm64/isamax.S b/kernel/arm64/isamax.S new file mode 100644 index 000000000..309b1c1a4 --- /dev/null +++ 
b/kernel/arm64/isamax.S @@ -0,0 +1,213 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N x0 /* vector length */ +#define X x1 /* X vector address */ +#define INC_X x2 /* X stride */ +#define INDEX x3 /* index of max/min value */ +#define Z x4 /* vector index */ +#define I x5 /* loop variable */ +#define X_COPY x6 /* copy of X address */ +#define MAXF_Z x7 + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +#define MAXF s5 +#define TMPF s6 +#define TMPVF {v6.s}[0] +#define SZ 4 + +/******************************************************************************/ + +.macro INIT_F1 + ldr MAXF, [X], #SZ + mov Z, #1 + mov INDEX, Z + fabs MAXF, MAXF +.endm + +.macro KERNEL_F1 + ldr TMPF, [X], #SZ + add Z, Z, #1 + fabs TMPF, TMPF + fcmp TMPF, MAXF + fcsel MAXF, MAXF, TMPF, le + csel INDEX, INDEX, Z, le +.endm + +.macro INIT_F4 + ld1 {v0.4s}, [X], #16 + fabs v0.4s, v0.4s + fmaxv MAXF, v0.4s + mov Z, #5 + mov MAXF_Z, #1 +.endm + +.macro KERNEL_F4 + ld1 {v0.4s}, [X], #16 + fabs v0.4s, v0.4s + fmaxv TMPF, v0.4s + PRFM PLDL1KEEP, [X, #512] + fcmp TMPF, MAXF + fcsel MAXF, MAXF, TMPF, le + csel MAXF_Z, MAXF_Z, Z, le + add Z, Z, #4 +.endm + + +.macro KERNEL_F4_FINALIZE + mov INDEX, MAXF_Z + sub MAXF_Z, MAXF_Z, #1 + lsl MAXF_Z, MAXF_Z, #2 + add X_COPY, X_COPY, MAXF_Z + ldr TMPF, [X_COPY], #SZ + fabs TMPF, TMPF + fcmp TMPF, MAXF + beq KERNEL_F4_FINALIZE_DONE + add INDEX, INDEX, #1 + ldr TMPF, [X_COPY], #SZ + fabs TMPF, TMPF + fcmp TMPF, MAXF + beq KERNEL_F4_FINALIZE_DONE + add INDEX, INDEX, #1 + ldr TMPF, [X_COPY], #SZ + fabs TMPF, TMPF + fcmp TMPF, MAXF + beq KERNEL_F4_FINALIZE_DONE + add INDEX, INDEX, #1 +KERNEL_F4_FINALIZE_DONE: +.endm + + +.macro INIT_S + lsl INC_X, INC_X, #2 + ld1 TMPVF, [X], INC_X + mov Z, #1 + mov 
INDEX, Z + fabs MAXF, TMPF +.endm + +.macro KERNEL_S1 + ld1 TMPVF, [X], INC_X + add Z, Z, #1 + fabs TMPF, TMPF + fcmp TMPF, MAXF + fcsel MAXF, MAXF, TMPF, le + csel INDEX, INDEX, Z, le +.endm + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + cmp N, xzr + ble iamax_kernel_zero + cmp INC_X, xzr + ble iamax_kernel_zero + + PRFM PLDL1KEEP, [X] + mov X_COPY, X + + cmp INC_X, #1 + bne iamax_kernel_S_BEGIN + +iamax_kernel_F_BEGIN: + asr I, N, #2 + cmp I, xzr + beq iamax_kernel_F1_INIT + + INIT_F4 + subs I, I, #1 + beq iamax_kernel_F4_FINALIZE + +iamax_kernel_F4: + KERNEL_F4 + subs I, I, #1 + bne iamax_kernel_F4 + +iamax_kernel_F4_FINALIZE: + KERNEL_F4_FINALIZE + +iamax_kernel_F1: + ands I, N, #3 + ble iamax_kernel_L999 + +iamax_kernel_F10: + KERNEL_F1 + subs I, I, #1 + bne iamax_kernel_F10 + b iamax_kernel_L999 + +iamax_kernel_F1_INIT: + INIT_F1 + subs N, N, #1 + b iamax_kernel_F1 + +iamax_kernel_S_BEGIN: + INIT_S + + subs N, N, #1 + ble iamax_kernel_L999 + + asr I, N, #2 + cmp I, xzr + ble iamax_kernel_S1 + +iamax_kernel_S4: + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + + subs I, I, #1 + bne iamax_kernel_S4 + +iamax_kernel_S1: + ands I, N, #3 + ble iamax_kernel_L999 + +iamax_kernel_S10: + KERNEL_S1 + subs I, I, #1 + bne iamax_kernel_S10 + +iamax_kernel_L999: + mov x0, INDEX + ret + +iamax_kernel_zero: + mov x0, xzr + ret + + EPILOGUE diff --git a/kernel/arm64/izamax.S b/kernel/arm64/izamax.S new file mode 100644 index 000000000..ebdc671e0 --- /dev/null +++ b/kernel/arm64/izamax.S @@ -0,0 +1,151 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
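+
+Description: the kernel below returns the 1-based index of the element with the largest
+|Re(x_i)| + |Im(x_i)|, following the BLAS izamax convention (with USE_MIN defined it
+selects the smallest instead); it returns 0 when N <= 0 or INC_X <= 0.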
+*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N x0 /* vector length */ +#define X x1 /* X vector address */ +#define INC_X x2 /* X stride */ +#define INDEX x3 /* index of max/min value */ +#define Z x4 /* vector index */ +#define I x5 /* loop variable */ + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +#if defined(USE_MIN) +#define COND le +#else +#define COND ge +#endif + +#if !defined(DOUBLE) +#define MAXF s0 +#define TMPF s1 +#define TMPVF {v1.s}[0] +#define SZ 4 +#else +#define MAXF d0 +#define TMPF d1 +#define TMPVF {v1.d}[0] +#define SZ 8 +#endif + +/******************************************************************************/ + +.macro INIT_S +#if !defined(DOUBLE) + lsl INC_X, INC_X, #3 + ld1 {v0.2s}, [X], INC_X + mov Z, #1 + mov INDEX, Z + fabs v0.2s, v0.2s + ext v1.8b, v0.8b, v0.8b, #4 + fadd MAXF, s0, s1 +#else + lsl INC_X, INC_X, #4 + ld1 {v0.2d}, [X], INC_X + mov Z, #1 + mov INDEX, Z + fabs v0.2d, v0.2d + faddp MAXF, v0.2d +#endif +.endm + +.macro KERNEL_S1 +#if !defined(DOUBLE) + ld1 {v1.2s}, [X], INC_X + add Z, Z, #1 + fabs v1.2s, v1.2s + ext v2.8b, v1.8b, v1.8b, #4 + fadd TMPF, s1, s2 +#else + ld1 {v1.2d}, [X], INC_X + add Z, Z, #1 + fabs v1.2d, v1.2d + faddp TMPF, v1.2d +#endif + fcmp MAXF, TMPF + fcsel MAXF, MAXF, TMPF, COND + csel INDEX, INDEX, Z, COND +.endm + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + cmp N, xzr + ble iamax_kernel_zero + cmp INC_X, xzr + ble iamax_kernel_zero + + INIT_S + + subs N, N, #1 + ble iamax_kernel_L999 + + asr I, N, #2 + cmp I, xzr + ble iamax_kernel_S1 + +iamax_kernel_S4: + + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + + subs I, I, #1 + bne iamax_kernel_S4 + +iamax_kernel_S1: + + ands I, N, #3 + ble iamax_kernel_L999 + +iamax_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne iamax_kernel_S10 + +iamax_kernel_L999: + + mov x0, INDEX + ret + +iamax_kernel_zero: + + mov x0, xzr + ret + + EPILOGUE diff --git a/kernel/arm64/rot.S b/kernel/arm64/rot.S new file mode 100644 index 000000000..ea48b5cb3 --- /dev/null +++ b/kernel/arm64/rot.S @@ -0,0 +1,243 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N x0 /* vector length */ +#define X x1 /* X vector address */ +#define INC_X x2 /* X stride */ +#define Y x3 /* Y vector address */ +#define INC_Y x4 /* Y stride */ +#define I x5 /* loop variable */ + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +#if !defined(DOUBLE) +#define C s0 /* scale input value */ +#define S s1 /* scale input value */ +#else +#define C d0 /* scale input value */ +#define S d1 /* scale input value */ +#endif + +/******************************************************************************/ + +.macro INIT +#if !defined(DOUBLE) + ins v0.s[1], v0.s[0] // [C, C] +#else + ins v0.d[1], v0.d[0] // [C, C] +#endif +.endm + +.macro INIT_F1 +#if !defined(DOUBLE) + fneg s2, S + ins v1.s[1], v2.s[0] // [-S, S] +#else + fneg d2, S + ins v1.d[1], v2.d[0] // [-S, S] +#endif +.endm + +.macro KERNEL_F1 +#if !defined(DOUBLE) + ld1 {v2.s}[0], [X] + ld1 {v2.s}[1], [Y] // [Y, X] + ext v3.8b, v2.8b, v2.8b, #4 // [X, Y] + fmul v4.2s, v2.2s, v0.2s // [C*Y, C*X] + fmla v4.2s, v3.2s, v1.2s // [C*Y - S*X, C*X + S*Y] + st1 {v4.s}[0], [X], #4 + st1 {v4.s}[1], [Y], #4 +#else + ld1 {v2.d}[0], [X] + ld1 {v2.d}[1], [Y] // [Y, X] + ext v3.16b, v2.16b, v2.16b, #8 // [X, Y] + fmul v4.2d, v2.2d, v0.2d // [C*Y, C*X] + fmla v4.2d, v3.2d, v1.2d // [C*Y - S*X, C*X + S*Y] + st1 {v4.d}[0], [X], #8 + st1 {v4.d}[1], [Y], #8 +#endif +.endm + +.macro KERNEL_INIT_F4 +#if !defined(DOUBLE) + ins v0.d[1], v0.d[0] // [C, C, C, C] + ins v1.s[1], v1.s[0] + ins v1.d[1], v1.d[0] // [S, S, S, S] +#else + ins v1.d[1], v1.d[0] // [S, S] +#endif +.endm + +.macro KERNEL_F4 +#if !defined(DOUBLE) + ld1 {v2.4s}, [X] + fmul v4.4s, v0.4s, v2.4s // C*X3, C*X2, C*X1, C*X0 + ld1 {v3.4s}, [Y] + fmla v4.4s, v1.4s, v3.4s // C*X3+S*Y3, ..., C*X0+S*Y0 + st1 {v4.4s}, [X], #16 + fmul v5.4s, v0.4s, v3.4s // C*Y3, C*Y2, C*Y1, C*Y0 + fmls v5.4s, v1.4s, v2.4s // C*Y3-S*X3, ..., C*Y0-S*X0 + st1 {v5.4s}, [Y], #16 +#else // DOUBLE + ld1 {v2.2d, v3.2d}, [X] + fmul v6.2d, v0.2d, v2.2d // C*X1, C*X0 + fmul v7.2d, v0.2d, v3.2d // C*X3, C*X2 + ld1 {v4.2d, v5.2d}, [Y] + fmla v6.2d, v1.2d, v4.2d // C*X1+S*Y1, C*X0+S*Y0 + fmla v7.2d, v1.2d, v5.2d // C*X3+S*Y3, C*X2+S*Y2 + st1 {v6.2d, v7.2d}, [X], #32 + fmul v16.2d, v0.2d, v4.2d // C*Y1, C*Y0 + fmul v17.2d, v0.2d, v5.2d // C*Y3, C*Y2 + fmls v16.2d, v1.2d, v2.2d // C*Y1-S*X1, C*Y0-S*X0 + fmls v17.2d, v1.2d, v3.2d // C*Y3-S*X3, C*Y2-S*X2 + st1 {v16.2d, v17.2d}, [Y], #32 + PRFM PLDL1KEEP, [X, #512] + PRFM PLDL1KEEP, [Y, #512] +#endif +.endm + +.macro INIT_S +#if !defined(DOUBLE) + lsl INC_X, INC_X, #2 + lsl INC_Y, INC_Y, #2 +#else + lsl INC_X, INC_X, #3 + lsl INC_Y, INC_Y, #3 +#endif +.endm + +.macro KERNEL_S1 +#if !defined(DOUBLE) + ld1 {v2.s}[0], [X] + ld1 {v2.s}[1], [Y] // [Y, X] + ext v3.8b, v2.8b, v2.8b, #4 // [X, Y] + 
fmul v4.2s, v2.2s, v0.2s // [C*Y, C*X] + fmla v4.2s, v3.2s, v1.2s // [C*Y - S*X, C*X + S*Y] + st1 {v4.s}[0], [X], INC_X + st1 {v4.s}[1], [Y], INC_Y +#else + ld1 {v2.d}[0], [X] + ld1 {v2.d}[1], [Y] // [Y, X] + ext v3.16b, v2.16b, v2.16b, #8 // [X, Y] + fmul v4.2d, v2.2d, v0.2d // [C*Y, C*X] + fmla v4.2d, v3.2d, v1.2d // [C*Y - S*X, C*X + S*Y] + st1 {v4.d}[0], [X], INC_X + st1 {v4.d}[1], [Y], INC_Y +#endif + +.endm + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + cmp N, xzr + ble rot_kernel_L999 + + INIT + + cmp INC_X, #1 + bne rot_kernel_S_BEGIN + cmp INC_Y, #1 + bne rot_kernel_S_BEGIN + +rot_kernel_F_BEGIN: + + asr I, N, #2 + cmp I, xzr + beq rot_kernel_F1 + + KERNEL_INIT_F4 + +rot_kernel_F4: + + KERNEL_F4 + + subs I, I, #1 + bne rot_kernel_F4 + +rot_kernel_F1: + + ands I, N, #3 + ble rot_kernel_L999 + + INIT_F1 + +rot_kernel_F10: + + KERNEL_F1 + + subs I, I, #1 + bne rot_kernel_F10 + + mov w0, wzr + ret + +rot_kernel_S_BEGIN: + + INIT_S + INIT_F1 + + + asr I, N, #2 + cmp I, xzr + ble rot_kernel_S1 + +rot_kernel_S4: + + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + + subs I, I, #1 + bne rot_kernel_S4 + +rot_kernel_S1: + + ands I, N, #3 + ble rot_kernel_L999 + + +rot_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne rot_kernel_S10 + +rot_kernel_L999: + + mov w0, wzr + ret diff --git a/kernel/arm64/scal.S b/kernel/arm64/scal.S new file mode 100644 index 000000000..91d469d03 --- /dev/null +++ b/kernel/arm64/scal.S @@ -0,0 +1,253 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
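+
+Description: the kernel below computes x := alpha * x; a separate path simply stores
+zeros when alpha == 0, and the strided path uses a second copy of the pointer (X_COPY)
+for the stores.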
+*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N x0 /* vector length */ +#define X x3 /* X vector address */ +#define X_COPY x5 /* X vector address */ +#define INC_X x4 /* X stride */ +#define I x1 /* loop variable */ + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +#if !defined(DOUBLE) +#define DA s0 /* scale input value */ +#define DAV {v0.s}[0] +#define TMPF s1 +#define TMPVF {v1.s}[0] +#define SZ 4 +#else +#define DA d0 /* scale input value */ +#define DAV {v0.d}[0] +#define TMPF d1 +#define TMPVF {v1.d}[0] +#define SZ 8 +#endif + +/******************************************************************************/ + +.macro KERNEL_F1 + + ldr TMPF, [X] + fmul TMPF, TMPF, DA + str TMPF, [X], #SZ + +.endm + +.macro KERNEL_INIT_F8 + +#if !defined(DOUBLE) + ins v0.s[1], v0.s[0] + ins v0.s[2], v0.s[0] + ins v0.s[3], v0.s[0] +#else + ins v0.d[1], v0.d[0] +#endif + +.endm + +.macro KERNEL_F8 +#if !defined(DOUBLE) + ld1 {v1.4s, v2.4s}, [X] + fmul v1.4s, v1.4s, v0.4s + fmul v2.4s, v2.4s, v0.4s + st1 {v1.4s, v2.4s}, [X], #32 +#else // DOUBLE + ld1 {v1.2d, v2.2d, v3.2d, v4.2d}, [X] + fmul v1.2d, v1.2d, v0.2d + fmul v2.2d, v2.2d, v0.2d + fmul v3.2d, v3.2d, v0.2d + fmul v4.2d, v4.2d, v0.2d + st1 {v1.2d, v2.2d, v3.2d, v4.2d}, [X], #64 +#endif + PRFM PLDL1KEEP, [X, #1024] +.endm + +.macro INIT_S + +#if !defined(DOUBLE) + lsl INC_X, INC_X, #2 +#else + lsl INC_X, INC_X, #3 +#endif + +.endm + +.macro KERNEL_S1 + ldr TMPF, [X] + fmul TMPF, TMPF, DA + st1 TMPVF, [X], INC_X +.endm + +.macro KERNEL_S4 +#if !defined(DOUBLE) + ldr s1, [X] + add X, X, INC_X + fmul s1, s1, s0 + str s1, [X_COPY] + add X_COPY, X_COPY, INC_X + + ldr s2, [X] + add X, X, INC_X + fmul s2, s2, s0 + str s2, [X_COPY] + add X_COPY, X_COPY, INC_X + + ldr s3, [X] + add X, X, INC_X + fmul s3, s3, s0 + str s3, [X_COPY] + add X_COPY, X_COPY, INC_X + + ldr s4, [X] + add X, X, INC_X + fmul s4, s4, s0 + str s4, [X_COPY] + add X_COPY, X_COPY, INC_X +#else + ldr d1, [X] + add X, X, INC_X + fmul d1, d1, d0 + str d1, [X_COPY] + add X_COPY, X_COPY, INC_X + + ldr d2, [X] + add X, X, INC_X + fmul d2, d2, d0 + str d2, [X_COPY] + add X_COPY, X_COPY, INC_X + + ldr d3, [X] + add X, X, INC_X + fmul d3, d3, d0 + str d3, [X_COPY] + add X_COPY, X_COPY, INC_X + + ldr d4, [X] + add X, X, INC_X + fmul d4, d4, d0 + str d4, [X_COPY] + add X_COPY, X_COPY, INC_X +#endif +.endm + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + cmp N, xzr + ble scal_kernel_L999 + + fcmp DA, #0.0 + beq scal_kernel_zero + + cmp INC_X, #1 + bne scal_kernel_S_BEGIN + +scal_kernel_F_BEGIN: + + asr I, N, #3 + cmp I, xzr + beq scal_kernel_F1 + + KERNEL_INIT_F8 + +scal_kernel_F8: + + KERNEL_F8 + + subs I, I, #1 + bne scal_kernel_F8 + +scal_kernel_F1: + + ands I, N, #7 + ble scal_kernel_L999 + +scal_kernel_F10: + + KERNEL_F1 + + subs I, I, #1 + bne scal_kernel_F10 + + mov w0, wzr + ret + +scal_kernel_S_BEGIN: + + INIT_S + mov X_COPY, X + + asr I, N, #2 + cmp I, xzr + ble scal_kernel_S1 + +scal_kernel_S4: + + KERNEL_S4 + + subs I, I, #1 + bne scal_kernel_S4 + +scal_kernel_S1: + + ands I, N, #3 + ble scal_kernel_L999 + +scal_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne scal_kernel_S10 + +scal_kernel_L999: + + mov 
w0, wzr + ret + +scal_kernel_zero: + + INIT_S + +scal_kernel_Z1: + + st1 DAV, [X], INC_X + subs N, N, #1 + bne scal_kernel_Z1 + + mov w0, wzr + ret + + EPILOGUE diff --git a/kernel/arm64/sgemm_kernel_4x4.S b/kernel/arm64/sgemm_kernel_4x4.S index 78633297f..bfa80d589 100644 --- a/kernel/arm64/sgemm_kernel_4x4.S +++ b/kernel/arm64/sgemm_kernel_4x4.S @@ -1,5 +1,5 @@ -/*************************************************************************** -Copyright (c) 2013, The OpenBLAS Project +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -23,57 +23,43 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -/************************************************************************************** -* 2013/11/23 Saar -* BLASTEST : OK -* CTEST : OK -* TEST : OK -* -* -* 2013/11/02 Saar -* UNROLL_N 4 -* UNROLL_M 4 -* DGEMM_P 128 -* DGEMM_Q 240 -* DGEMM_R 12288 -* A_PRE 128 -* B_PRE 128 -* C_PRE 32 -* -* Performance on Odroid U2: -* -* 3072x3072 1 Core: 2.62 GFLOPS ATLAS: 2.69 GFLOPS -* 3072x3072 2 Cores: 5.23 GFLOPS ATLAS: 5.27 GFLOPS -* 3072x3072 3 Cores: 7.78 GFLOPS ATLAS: 7.87 GFLOPS -* 3072x3072 4 Cores: 10.10 GFLOPS ATLAS: 9.98 GFLOPS -**************************************************************************************/ +*******************************************************************************/ #define ASSEMBLER #include "common.h" -/* X0 X1 X2 s0 X3 x4 x5 x6*/ -/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc*/ +/* X0 X1 X2 s0 X3 x4 x5 x6 */ +/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc */ + +#define origM x0 +#define origN x1 +#define origK x2 +#define origPA x3 +#define origPB x4 +#define pC x5 +#define LDC x6 +#define temp x7 +#define counterL x8 +#define counterI x9 +#define counterJ x10 +#define pB x11 +#define pCRow0 x12 +#define pCRow1 x13 +#define pCRow2 x14 +#define pA_0 x15 +#define pA_1 x16 +#define pA_2 x17 +#define pA_3 x18 -#define origM x0 -#define origN x1 -#define origK x2 -#define origPA x3 -#define origPB x4 -#define pC x5 -#define LDC x6 -#define offset x7 -#define counterL x8 -#define counterI x9 -#define pB x10 -#define counterJ x11 -#define tempALPHA x12 -#define pCRow0 x13 -#define pCRow1 x14 -#define pCRow2 x15 -#define pA x16 +#define alpha0 s10 +#define alphaV0 v10.s[0] +#define alpha1 s11 +#define alphaV1 v11.s[0] +#define alpha2 s14 +#define alphaV2 v14.s[0] +#define alpha3 s15 +#define alphaV3 v15.s[0] // 00 origM // 01 origN @@ -82,18 +68,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
// 04 origPB // 05 pC // 06 origLDC -> LDC -// 07 offset +// 07 offset -> temp // 08 counterL // 09 counterI -// 10 pB -// 11 counterJ -// 12 tempALPHA -// 13 pCRow0 -// 14 pCRow1 -// 15 pCRow2 -// 16 pA -// 17 -// 18 must save +// 10 counterJ +// 11 pB +// 12 pCRow0 +// 13 pCRow1 +// 14 pCRow2 +// 15 pA_0 +// 16 pA_1 +// 17 pA_2 +// 18 must save pA_3 // 19 must save // 20 must save // 21 must save @@ -108,558 +94,719 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // 30 link // 31 sp -//v00 orig ALPHA -> a00 -//v01 a01 -//v02 a02 -//v03 a03 -//v04 a10 -//v05 a11 -//v06 a12 -//v07 a13 -//v08 must save b00 -//v09 must save b01 -//v10 must save b02 -//v11 must save b03 -//v12 must save b10 -//v13 must save b11 -//v14 must save b12 -//v15 must save b13 -//v16 must save C00 -//v17 must save C01 -//v18 C02 -//v19 C03 -//v20 C10 -//v21 C11 -//v22 C12 -//v23 C13 -//v24 C20 -//v25 C21 -//v26 C22 -//v27 C23 -//v28 C30 -//v29 C31 -//v30 C32 -//v31 C33 +/***************************** FOR 16x4 ***************************************/ +//v00 ALPHA -> pA00_0, pA01_0, pA02_0, pA03_0 +//v01 pA10_0, pA11_0, pA12_0, pA13_0 +//v02 pA00_1, pA01_1, pA02_1, pA03_1 +//v03 pA10_1, pA11_1, pA12_1, pA13_1 +//v04 pA00_2, pA01_2, pA02_2, pA03_2 +//v05 pA10_2, pA11_2, pA12_2, pA13_2 +//v06 pA00_3, pA01_3, pA02_3, pA03_3 +//v07 pA10_3, pA11_3, pA12_3, pA13_3 +//v08 must save pB00, pB01, pB02, pB03 +//v09 must save +//v10 must save ALPHA0 +//v11 must save ALPHA1 +//v12 must save pB10, pB11, pB12, pB13 +//v13 must save +//v14 must save ALPHA2 +//v15 must save ALPHA3 +//v16 must save C00_0, C01_0, C02_0, C03_0 +//v17 must save C10_0, C11_0, C12_0, C13_0 +//v18 C20_0, C21_0, C22_0, C23_0 +//v19 C30_0, C31_0, C32_0, C33_0 +//v20 C00_1, C01_1, C02_1, C03_1 +//v21 C10_1, C11_1, C12_1, C13_1 +//v22 C20_1, C21_1, C22_1, C23_1 +//v23 C30_1, C31_1, C32_1, C33_1 +//v24 C00_2, C01_2, C02_2, C03_2 +//v25 C10_2, C11_2, C12_2, C13_2 +//v26 C20_2, C21_2, C22_2, C23_2 +//v27 C30_2, C31_2, C32_2, C33_2 +//v28 C00_3, C01_3, C02_3, C03_3 +//v29 C10_3, C11_3, C12_3, C13_3 +//v30 C20_3, C21_3, C22_3, C23_3 +//v31 C30_3, C31_3, C32_3, C33_3 -// add sp,sp,#-(6*16) -// stp x18,x19,[sp,#(0*16)] -// stp x20,x21,[sp,#(1*16)] +/***************************** EXCEPT FOR 16x4 ********************************/ +//v00 ALPHA -> pA00, pA01 +//v01 pA02, pA03 +//v02 ppA00, ppA01 +//v03 ppA02, ppA03 +//v04 pA10, pA11 +//v05 pA12, pA13 +//v06 ppA10, ppA11 +//v07 ppA12, ppA13 +//v08 must save pB00, pB01 +//v09 must save pB02, pB03 +//v10 must save ALPHA0 +//v11 must save ALPHA1 +//v12 must save pB10, pB11 +//v13 must save pB12, pB13 +//v14 must save ALPHA2 +//v15 must save ALPHA3 +//v16 must save C00, C01 +//v17 must save C02, C03 +//v18 ppC00, ppC01 +//v19 ppC02, ppC03 +//v20 C10, C11 +//v21 C12, C13 +//v22 ppC10, ppC11 +//v23 ppC12, ppC13 +//v24 C20, C21 +//v25 C22, C23 +//v26 ppC20, ppC21 +//v27 ppC22, ppC23 +//v28 C30, C31 +//v29 C32, C33 +//v30 ppC30, ppC31 +//v31 ppC32, ppC33 - -/************************************************************************************** +/******************************************************************************* * Macro definitions -**************************************************************************************/ +*******************************************************************************/ + +.macro INIT16x4 + fmov s16, wzr + fmov s17, s16 + fmov s18, s17 + fmov s19, s16 + fmov s20, s17 + fmov s21, s16 + fmov s22, s17 + fmov s23, s16 + fmov s24, s17 + fmov s25, s16 + fmov s26, s17 + 
fmov s27, s16 + fmov s28, s17 + fmov s29, s16 + fmov s30, s17 + fmov s31, s16 +.endm + +.macro KERNEL16x4_I + ld1 {v8.4s}, [pB] + add pB, pB, #16 + + ld1 {v0.4s}, [pA_0] + add pA_0, pA_0, #16 + + fmul v16.4s, v0.4s, v8.4s[0] + fmul v20.4s, v0.4s, v8.4s[1] + + ld1 {v2.4s}, [pA_1] + add pA_1, pA_1, #16 + + fmul v24.4s, v0.4s, v8.4s[2] + fmul v28.4s, v0.4s, v8.4s[3] + + ld1 {v4.4s}, [pA_2] + add pA_2, pA_2, #16 + + fmul v17.4s, v2.4s, v8.4s[0] + fmul v21.4s, v2.4s, v8.4s[1] + + ld1 {v6.4s}, [pA_3] + add pA_3, pA_3, #16 + + fmul v25.4s, v2.4s, v8.4s[2] + fmul v29.4s, v2.4s, v8.4s[3] + + ld1 {v12.4s}, [pB] // for next round + add pB, pB, #16 + + fmul v18.4s, v4.4s, v8.4s[0] + fmul v19.4s, v6.4s, v8.4s[0] + + ld1 {v1.4s}, [pA_0] // for next round + add pA_0, pA_0, #16 + + fmul v22.4s, v4.4s, v8.4s[1] + fmul v23.4s, v6.4s, v8.4s[1] + + ld1 {v3.4s}, [pA_1] // for next round + add pA_1, pA_1, #16 + + fmul v26.4s, v4.4s, v8.4s[2] + fmul v27.4s, v6.4s, v8.4s[2] + + ld1 {v5.4s}, [pA_2] // for next round + add pA_2, pA_2, #16 + + fmul v30.4s, v4.4s, v8.4s[3] + fmul v31.4s, v6.4s, v8.4s[3] + + ld1 {v7.4s}, [pA_3] // for next round + add pA_3, pA_3, #16 +.endm + +.macro KERNEL16x4_M2 + fmla v16.4s, v1.4s, v12.4s[0] + fmla v17.4s, v3.4s, v12.4s[0] + + ld1 {v8.4s}, [pB] // for next round + add pB, pB, #16 + + fmla v18.4s, v5.4s, v12.4s[0] + fmla v19.4s, v7.4s, v12.4s[0] + + ld1 {v0.4s}, [pA_0] // for next round + add pA_0, pA_0, #16 + + fmla v20.4s, v1.4s, v12.4s[1] + fmla v21.4s, v3.4s, v12.4s[1] + + ld1 {v2.4s}, [pA_1] // for next round + add pA_1, pA_1, #16 + + fmla v22.4s, v5.4s, v12.4s[1] + fmla v23.4s, v7.4s, v12.4s[1] + + ld1 {v4.4s}, [pA_2] // for next round + add pA_2, pA_2, #16 + + fmla v24.4s, v1.4s, v12.4s[2] + fmla v25.4s, v3.4s, v12.4s[2] + + ld1 {v6.4s}, [pA_3] // for next round + add pA_3, pA_3, #16 + + fmla v26.4s, v5.4s, v12.4s[2] + fmla v27.4s, v7.4s, v12.4s[2] + + prfm PLDL1KEEP, [pA_2, #512] + + fmla v28.4s, v1.4s, v12.4s[3] + fmla v29.4s, v3.4s, v12.4s[3] + + prfm PLDL1KEEP, [pA_3, #512] + + fmla v30.4s, v5.4s, v12.4s[3] + fmla v31.4s, v7.4s, v12.4s[3] + + prfm PLDL1KEEP, [pB, #512] +.endm + +.macro KERNEL16x4_M1 + fmla v16.4s, v0.4s, v8.4s[0] + fmla v17.4s, v2.4s, v8.4s[0] + + ld1 {v12.4s}, [pB] // for next round + add pB, pB, #16 + + fmla v18.4s, v4.4s, v8.4s[0] + fmla v19.4s, v6.4s, v8.4s[0] + + ld1 {v1.4s}, [pA_0] // for next round + add pA_0, pA_0, #16 + + fmla v20.4s, v0.4s, v8.4s[1] + fmla v21.4s, v2.4s, v8.4s[1] + + ld1 {v3.4s}, [pA_1] // for next round + add pA_1, pA_1, #16 + + fmla v22.4s, v4.4s, v8.4s[1] + fmla v23.4s, v6.4s, v8.4s[1] + + ld1 {v5.4s}, [pA_2] // for next round + add pA_2, pA_2, #16 + + fmla v24.4s, v0.4s, v8.4s[2] + fmla v25.4s, v2.4s, v8.4s[2] + + ld1 {v7.4s}, [pA_3] // for next round + add pA_3, pA_3, #16 + + fmla v26.4s, v4.4s, v8.4s[2] + fmla v27.4s, v6.4s, v8.4s[2] + + prfm PLDL1KEEP, [pA_0, #512] + + fmla v28.4s, v0.4s, v8.4s[3] + fmla v29.4s, v2.4s, v8.4s[3] + + prfm PLDL1KEEP, [pA_1, #512] + + fmla v30.4s, v4.4s, v8.4s[3] + fmla v31.4s, v6.4s, v8.4s[3] +.endm + +.macro KERNEL16x4_E + fmla v16.4s, v1.4s, v12.4s[0] + fmla v17.4s, v3.4s, v12.4s[0] + fmla v18.4s, v5.4s, v12.4s[0] + fmla v19.4s, v7.4s, v12.4s[0] + fmla v20.4s, v1.4s, v12.4s[1] + fmla v21.4s, v3.4s, v12.4s[1] + fmla v22.4s, v5.4s, v12.4s[1] + fmla v23.4s, v7.4s, v12.4s[1] + fmla v24.4s, v1.4s, v12.4s[2] + fmla v25.4s, v3.4s, v12.4s[2] + fmla v26.4s, v5.4s, v12.4s[2] + fmla v27.4s, v7.4s, v12.4s[2] + fmla v28.4s, v1.4s, v12.4s[3] + fmla v29.4s, v3.4s, v12.4s[3] + fmla v30.4s, v5.4s, 
v12.4s[3] + fmla v31.4s, v7.4s, v12.4s[3] +.endm + +.macro KERNEL16x4_SUB + ld1 {v8.4s}, [pB] + add pB, pB, #16 + + ld1 {v0.4s}, [pA_0] + add pA_0, pA_0, #16 + + fmla v16.4s, v0.4s, v8.4s[0] + fmla v20.4s, v0.4s, v8.4s[1] + fmla v24.4s, v0.4s, v8.4s[2] + fmla v28.4s, v0.4s, v8.4s[3] + + ld1 {v2.4s}, [pA_1] + add pA_1, pA_1, #16 + + fmla v17.4s, v2.4s, v8.4s[0] + fmla v21.4s, v2.4s, v8.4s[1] + fmla v25.4s, v2.4s, v8.4s[2] + fmla v29.4s, v2.4s, v8.4s[3] + + ld1 {v4.4s}, [pA_2] + add pA_2, pA_2, #16 + + fmla v18.4s, v4.4s, v8.4s[0] + fmla v22.4s, v4.4s, v8.4s[1] + fmla v26.4s, v4.4s, v8.4s[2] + fmla v30.4s, v4.4s, v8.4s[3] + + ld1 {v6.4s}, [pA_3] + add pA_3, pA_3, #16 + + fmla v19.4s, v6.4s, v8.4s[0] + fmla v23.4s, v6.4s, v8.4s[1] + fmla v27.4s, v6.4s, v8.4s[2] + fmla v31.4s, v6.4s, v8.4s[3] +.endm + +.macro SAVE16x4 + mov pCRow1, pCRow0 + + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow1] + fmla v0.4s, v16.4s, alphaV0 + fmla v1.4s, v17.4s, alphaV1 + fmla v2.4s, v18.4s, alphaV2 + fmla v3.4s, v19.4s, alphaV3 + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow1] + + add pCRow1, pCRow1, LDC + + ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1] + fmla v4.4s, v20.4s, alphaV0 + fmla v5.4s, v21.4s, alphaV1 + fmla v6.4s, v22.4s, alphaV2 + fmla v7.4s, v23.4s, alphaV3 + st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1] + + add pCRow1, pCRow1, LDC + + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow1] + fmla v0.4s, v24.4s, alphaV0 + fmla v1.4s, v25.4s, alphaV1 + fmla v2.4s, v26.4s, alphaV2 + fmla v3.4s, v27.4s, alphaV3 + st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow1] + + add pCRow1, pCRow1, LDC + + ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1] + fmla v4.4s, v28.4s, alphaV0 + fmla v5.4s, v29.4s, alphaV1 + fmla v6.4s, v30.4s, alphaV2 + fmla v7.4s, v31.4s, alphaV3 + st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1] + + add pCRow0, pCRow0, #64 +.endm + +/******************************************************************************/ + +.macro INIT8x4 + fmov s16, wzr + fmov s17, s16 + fmov s18, s17 + fmov s19, s16 + fmov s20, s17 + fmov s21, s16 + fmov s22, s17 + fmov s23, s16 + fmov s24, s17 + fmov s25, s16 + fmov s26, s17 + fmov s27, s16 + fmov s28, s17 + fmov s29, s16 + fmov s30, s17 + fmov s31, s16 +.endm + +.macro KERNEL8x4_SUB + ld1 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld1 {v0.2s, v1.2s}, [pA_0] + add pA_0, pA_0, #16 + + fmla v16.2s, v0.2s, v8.2s[0] + fmla v29.2s, v1.2s, v9.2s[1] + fmla v20.2s, v0.2s, v8.2s[1] + fmla v25.2s, v1.2s, v9.2s[0] + + ld1 {v2.2s, v3.2s}, [pA_1] + add pA_1, pA_1, #16 + + fmla v24.2s, v0.2s, v9.2s[0] + fmla v21.2s, v1.2s, v8.2s[1] + fmla v28.2s, v0.2s, v9.2s[1] + fmla v17.2s, v1.2s, v8.2s[0] + + fmla v18.2s, v2.2s, v8.2s[0] + fmla v31.2s, v3.2s, v9.2s[1] + fmla v22.2s, v2.2s, v8.2s[1] + fmla v27.2s, v3.2s, v9.2s[0] + + fmla v26.2s, v2.2s, v9.2s[0] + fmla v23.2s, v3.2s, v8.2s[1] + fmla v30.2s, v2.2s, v9.2s[1] + fmla v19.2s, v3.2s, v8.2s[0] +.endm + +.macro SAVE8x4 + mov pCRow1, pCRow0 + + ld1 {v0.2s, v1.2s}, [pCRow1] + fmla v0.2s, v16.2s, alphaV0 + fmla v1.2s, v17.2s, alphaV1 + st1 {v0.2s, v1.2s}, [pCRow1] + + add pCRow2, pCRow1, LDC + add pCRow1, pCRow1, #16 + + ld1 {v2.2s, v3.2s}, [pCRow1] + fmla v2.2s, v18.2s, alphaV2 + fmla v3.2s, v19.2s, alphaV3 + st1 {v2.2s, v3.2s}, [pCRow1] + + ld1 {v4.2s, v5.2s}, [pCRow2] + fmla v4.2s, v20.2s, alphaV0 + fmla v5.2s, v21.2s, alphaV1 + st1 {v4.2s, v5.2s}, [pCRow2] + + add pCRow1, pCRow2, LDC + add pCRow2, pCRow2, #16 + + ld1 {v6.2s, v7.2s}, [pCRow2] + fmla v6.2s, v22.2s, alphaV2 + fmla v7.2s, v23.2s, alphaV3 + st1 {v6.2s, v7.2s}, [pCRow2] + + ld1 {v0.2s, v1.2s}, [pCRow1] + fmla v0.2s, 
v24.2s, alphaV0 + fmla v1.2s, v25.2s, alphaV1 + st1 {v0.2s, v1.2s}, [pCRow1] + + add pCRow2, pCRow1, LDC + add pCRow1, pCRow1, #16 + + ld1 {v2.2s, v3.2s}, [pCRow1] + fmla v2.2s, v26.2s, alphaV2 + fmla v3.2s, v27.2s, alphaV3 + st1 {v2.2s, v3.2s}, [pCRow1] + + ld1 {v4.2s, v5.2s}, [pCRow2] + fmla v4.2s, v28.2s, alphaV0 + fmla v5.2s, v29.2s, alphaV1 + st1 {v4.2s, v5.2s}, [pCRow2] + + add pCRow2, pCRow2, #16 + + ld1 {v6.2s, v7.2s}, [pCRow2] + fmla v6.2s, v30.2s, alphaV2 + fmla v7.2s, v31.2s, alphaV3 + st1 {v6.2s, v7.2s}, [pCRow2] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ .macro INIT4x4 - - fsub v16.4s , v16.4s , v16.4s - fsub v20.4s , v20.4s , v20.4s - fsub v24.4s , v24.4s , v24.4s - fsub v28.4s , v28.4s , v28.4s - + fmov s16, wzr + fmov s17, s16 + fmov s20, s17 + fmov s21, s16 + fmov s24, s17 + fmov s25, s16 + fmov s28, s17 + fmov s29, s16 .endm -.macro KERNEL4x4_I - - ld1 {v8.2s},[pB],#8 - ld1 {v10.2s},[pB],#8 - ld1 {v0.4s},[pA],#16 - - fmulx v16.4s, v0.4s, v8.4s[0] - fmulx v20.4s, v0.4s, v8.4s[1] - fmulx v24.4s, v0.4s, v10.4s[0] - fmulx v28.4s, v0.4s, v10.4s[1] - - ld1 {v12.2s},[pB],#8 // for next round - ld1 {v14.2s},[pB],#8 // for next round - ld1 {v4.4s},[pA],#16 // for next round - - -.endm - - -.macro KERNEL4x4_M2 - - fmla v16.4s, v4.4s, v12.s[0] - fmla v20.4s, v4.4s, v12.s[1] - fmla v24.4s, v4.4s, v14.s[0] - fmla v28.4s, v4.4s, v14.s[1] - - ld1 {v8.2s},[pB],#8 - ld1 {v10.2s},[pB],#8 - ld1 {v0.4s},[pA],#16 - -.endm - - -.macro KERNEL4x4_M1 - - fmla v16.4s, v0.4s, v8.s[0] - fmla v20.4s, v0.4s, v8.s[1] - fmla v24.4s, v0.4s, v10.s[0] - fmla v28.4s, v0.4s, v10.s[1] - - ld1 {v12.2s},[pB],#8 - ld1 {v14.2s},[pB],#8 - ld1 {v4.4s},[pA],#16 - -.endm - - - -.macro KERNEL4x4_E - - fmla v16.4s, v4.4s, v12.s[0] - fmla v20.4s, v4.4s, v12.s[1] - fmla v24.4s, v4.4s, v14.s[0] - fmla v28.4s, v4.4s, v14.s[1] - -.endm - - - - .macro KERNEL4x4_SUB + ld1 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld1 {v0.2s, v1.2s}, [pA_0] + add pA_0, pA_0, #16 - ld1 {v8.2s},[pB],#8 - ld1 {v10.2s},[pB],#8 - ld1 {v0.4s} , [pA],#16 + fmla v16.2s, v0.2s, v8.2s[0] + fmla v29.2s, v1.2s, v9.2s[1] - fmla v16.4s, v0.4s, v8.s[0] - fmla v20.4s, v0.4s, v8.s[1] - fmla v24.4s, v0.4s, v10.s[0] - fmla v28.4s, v0.4s, v10.s[1] + fmla v20.2s, v0.2s, v8.2s[1] + fmla v25.2s, v1.2s, v9.2s[0] + fmla v24.2s, v0.2s, v9.2s[0] + fmla v21.2s, v1.2s, v8.2s[1] + + fmla v28.2s, v0.2s, v9.2s[1] + fmla v17.2s, v1.2s, v8.2s[0] .endm - - - .macro SAVE4x4 + ld1 {v8.2s, v9.2s}, [pCRow0] + fmla v8.2s, v16.2s, alphaV0 + fmla v9.2s, v17.2s, alphaV1 + st1 {v8.2s, v9.2s}, [pCRow0] - add pCRow1, pCRow0, LDC // create a second row pointer from the first row pointer - mov v0.d[0], tempALPHA + add pCRow1, pCRow0, LDC - ld1 {v8.4s},[pCRow0] // load 4 values of C from first row - fmla v8.4s ,v16.4s,v0.s[0] - st1 {v8.4s},[pCRow0],#16 // store C from first row + ld1 {v12.2s, v13.2s}, [pCRow1] + fmla v12.2s, v20.2s, alphaV2 + fmla v13.2s, v21.2s, alphaV3 + st1 {v12.2s, v13.2s}, [pCRow1] - ld1 {v12.4s},[pCRow1] // load 4 values of C from second row - fmla v12.4s ,v20.4s,v0.s[0] - st1 {v12.4s},[pCRow1] // store C from second row + add pCRow2, pCRow1, LDC - add pCRow2, pCRow1, LDC // Row2 points to third row + ld1 {v8.2s, v9.2s}, [pCRow2] + fmla v8.2s, v24.2s, alphaV0 + fmla v9.2s, v25.2s, alphaV1 + st1 {v8.2s, v9.2s}, [pCRow2] - ld1 {v8.4s},[pCRow2] // load 4 values of C from third row - fmla v8.4s ,v24.4s,v0.s[0] - st1 {v8.4s} ,[pCRow2] // store C from third row + add pCRow1, pCRow2, LDC - add pCRow1, 
pCRow2 , LDC // row1 points to fourth row - - ld1 {v12.4s},[pCRow1] // load 4 values of C from fourth row - fmla v12.4s ,v28.4s,v0.s[0] - st1 {v12.4s},[pCRow1] // store fourth row + ld1 {v12.2s, v13.2s}, [pCRow1] + fmla v12.2s, v28.2s, alphaV2 + fmla v13.2s, v29.2s, alphaV3 + st1 {v12.2s, v13.2s}, [pCRow1] + add pCRow0, pCRow0, #16 .endm /******************************************************************************/ .macro INIT2x4 - - fsub s16 , s16 , s16 - fmov s17, s16 + fmov s16, wzr fmov s20, s16 - fmov s21, s16 - fmov s24, s16 - fmov s25, s16 + fmov s24, s20 fmov s28, s16 - fmov s29, s16 - .endm - - .macro KERNEL2x4_SUB + ld1 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld1 {v0.2s}, [pA_0] + add pA_0, pA_0, #8 - ldr s8 , [ pB ] - ldr s9 , [ pB, #4 ] - ldr s10, [ pB, #8 ] - ldr s11, [ pB, #12 ] - - ldr s0 , [ pA ] - ldr s1 , [ pA, #4 ] - - fmadd s16 , s0, s8, s16 - fmadd s17 , s1, s8, s17 - - fmadd s20 , s0, s9, s20 - fmadd s21 , s1, s9, s21 - - fmadd s24 , s0, s10, s24 - fmadd s25 , s1, s10, s25 - - fmadd s28 , s0, s11, s28 - fmadd s29 , s1, s11, s29 - add pA , pA, #8 - add pB , pB, #16 - + fmla v16.2s, v0.2s, v8.2s[0] + fmla v20.2s, v0.2s, v8.2s[1] + fmla v24.2s, v0.2s, v9.2s[0] + fmla v28.2s, v0.2s, v9.2s[1] .endm - #define F1ST( op1, op2, op3) fmadd op1, op2, op3, op1 - #define L1ST( op1, op2, op3) ldr op1, [op2, op3] - .macro SAVE2x4 + ld1 {v8.2s}, [pCRow0] + fmla v8.2s, v16.2s, alphaV0 + st1 {v8.2s}, [pCRow0] - add pCRow1 , pCRow0, LDC - add pCRow2 , pCRow1, LDC - mov v0.d[0], tempALPHA + add pCRow1, pCRow0, LDC - L1ST ( s8,pCRow0, #0) - L1ST ( s9,pCRow0, #4 ) + ld1 {v12.2s}, [pCRow1] + fmla v12.2s, v20.2s, alphaV1 + st1 {v12.2s}, [pCRow1] - F1ST ( s8 , s0 , s16) - F1ST ( s9 , s0 , s17) + add pCRow2, pCRow1, LDC - str s8 , [pCRow0, #0] - str s9 , [pCRow0, #4 ] + ld1 {v8.2s}, [pCRow2] + fmla v8.2s, v24.2s, alphaV2 + st1 {v8.2s}, [pCRow2] - ldr s12, [pCRow1, #0] - ldr s13, [pCRow1, #4 ] + add pCRow1, pCRow2, LDC - F1ST ( s12, s0 , s20) - F1ST ( s13, s0 , s21) - - str s12, [pCRow1, #0] - str s13, [pCRow1, #4 ] - - L1ST ( s8,pCRow2 , #0) - L1ST ( s9,pCRow2 , #4 ) - - F1ST ( s8 , s0 , s24) - F1ST ( s9 , s0 , s25) - - str s8 , [pCRow2 , #0] - str s9 , [pCRow2 , #4 ] - - add pCRow1, pCRow2 , LDC - - ldr s12, [pCRow1, #0] - ldr s13, [pCRow1, #4 ] - - F1ST ( s12, s0 , s28) - F1ST ( s13, s0 , s29) - - str s12, [pCRow1, #0] - str s13, [pCRow1, #4 ] + ld1 {v12.2s}, [pCRow1] + fmla v12.2s, v28.2s, alphaV3 + st1 {v12.2s}, [pCRow1] add pCRow0, pCRow0, #8 - .endm - /******************************************************************************/ .macro INIT1x4 - - fsub s16 , s16 , s16 + fmov s16, wzr fmov s20, s16 - fmov s24, s16 - fmov s28, s16 - .endm - - .macro KERNEL1x4_SUB + ldr s0, [pA_0] + add pA_0, pA_0, #4 - ldr s8 , [ pB ] - ldr s9 , [ pB, #4 ] - ldr s10, [ pB, #8 ] - ldr s11, [ pB, #12 ] - - ldr s0 , [ pA ] - - fmadd s16 , s0, s8, s16 - fmadd s20 , s0, s9, s20 - fmadd s24 , s0, s10, s24 - fmadd s28 , s0, s11, s28 - - add pA , pA, #4 - add pB , pB, #16 + ld1 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + fmla v16.2s, v8.2s, v0.s[0] + fmla v20.2s, v9.2s, v0.s[0] .endm .macro SAVE1x4 + add pCRow1, pCRow0, LDC - add pCRow1 , pCRow0, LDC - add pCRow2 , pCRow1, LDC + ld1 {v8.s}[0], [pCRow0] + ld1 {v8.s}[1], [pCRow1] + fmla v8.2s, v16.2s, alphaV0 + st1 {v8.s}[0], [pCRow0] + st1 {v8.s}[1], [pCRow1] - mov v0.d[0], tempALPHA + add pCRow2, pCRow1, LDC + add pCRow1, pCRow2, LDC - L1ST ( s8,pCRow0, #0) - F1ST ( s8 , s0 , s16) - str s8 , [pCRow0, #0] - - L1ST ( s12,pCRow1, #0) - F1ST ( s12, s0 , s20) - str 
s12, [pCRow1, #0] - - L1ST ( s8,pCRow2 , #0) - F1ST ( s8 , s0 , s24) - str s8 , [pCRow2 , #0] - - add pCRow1, pCRow2 , LDC - - L1ST ( s12,pCRow1, #0) - F1ST ( s12, s0 , s28) - str s12, [pCRow1, #0] + ld1 {v12.s}[0], [pCRow2] + ld1 {v12.s}[1], [pCRow1] + fmla v12.2s, v20.2s, alphaV1 + st1 {v12.s}[0], [pCRow2] + st1 {v12.s}[1], [pCRow1] add pCRow0, pCRow0, #4 - .endm -/******************************************************************************/ /******************************************************************************/ .macro INIT4x2 - - fsub s16 , s16 , s16 - fmov s17, s16 - fmov s18, s16 - fmov s19, s16 - fmov s20, s16 - fmov s21, s16 - fmov s22, s16 - fmov s23, s16 - + fmov s16, wzr + fmov s17, s16 + fmov s20, s17 + fmov s21, s16 .endm - - .macro KERNEL4x2_SUB + ld1 {v8.2s}, [pB] + add pB, pB, #8 + ld1 {v0.2s, v1.2s}, [pA_0] + add pA_0, pA_0, #16 - ldr s8 , [ pB ] - ldr s9 , [ pB, #4 ] - - ldr s0 , [ pA ] - ldr s1 , [ pA, #4 ] - ldr s2 , [ pA, #8 ] - ldr s3 , [ pA, #12 ] - - fmadd s16 , s0, s8, s16 - fmadd s17 , s1, s8, s17 - fmadd s18 , s2, s8, s18 - fmadd s19 , s3, s8, s19 - - fmadd s20 , s0, s9, s20 - fmadd s21 , s1, s9, s21 - fmadd s22 , s2, s9, s22 - fmadd s23 , s3, s9, s23 - - add pA , pA, #16 - add pB , pB, #8 - + fmla v16.2s, v0.2s, v8.2s[0] + fmla v17.2s, v1.2s, v8.2s[0] + fmla v20.2s, v0.2s, v8.2s[1] + fmla v21.2s, v1.2s, v8.2s[1] .endm .macro SAVE4x2 + ld1 {v8.2s, v9.2s}, [pCRow0] + fmla v8.2s, v16.2s, alphaV0 + fmla v9.2s, v17.2s, alphaV1 + st1 {v8.2s, v9.2s}, [pCRow0] - add pCRow1 , pCRow0, LDC + add pCRow1, pCRow0, LDC - mov v0.d[0], tempALPHA - - L1ST ( s8,pCRow0, #0) - L1ST ( s9,pCRow0, #4 ) - L1ST ( s10,pCRow0, #8 ) - L1ST ( s11,pCRow0, #12 ) - - F1ST ( s8 , s0 , s16) - F1ST ( s9 , s0 , s17) - F1ST ( s10, s0 , s18) - F1ST ( s11, s0 , s19) - - str s8 , [pCRow0] - str s9 , [pCRow0, #4 ] - str s10, [pCRow0, #8 ] - str s11, [pCRow0, #12 ] - - L1ST ( s12,pCRow1, #0) - L1ST ( s13,pCRow1, #4 ) - L1ST ( s14,pCRow1, #8 ) - L1ST ( s15,pCRow1, #12 ) - - F1ST ( s12, s0 , s20) - F1ST ( s13, s0 , s21) - F1ST ( s14, s0 , s22) - F1ST ( s15, s0 , s23) - - str s12, [pCRow1] - str s13, [pCRow1, #4 ] - str s14, [pCRow1, #8 ] - str s15, [pCRow1, #12 ] + ld1 {v12.2s, v13.2s}, [pCRow1] + fmla v12.2s, v20.2s, alphaV2 + fmla v13.2s, v21.2s, alphaV3 + st1 {v12.2s, v13.2s}, [pCRow1] add pCRow0, pCRow0, #16 - .endm - /******************************************************************************/ .macro INIT2x2 - - fsub s16 , s16 , s16 - fmov s17, s16 + fmov s16, wzr fmov s20, s16 - fmov s21, s16 - .endm - - .macro KERNEL2x2_SUB + ld1 {v8.2s}, [pB] + add pB, pB, #8 - ldr s8 , [ pB ] - ldr s9 , [ pB, #4 ] - - ldr s0 , [ pA ] - ldr s1 , [ pA, #4 ] - - fmadd s16 , s0, s8, s16 - fmadd s17 , s1, s8, s17 - - fmadd s20 , s0, s9, s20 - fmadd s21 , s1, s9, s21 - - add pA , pA, #8 - add pB , pB, #8 + ld1 {v0.2s}, [pA_0] + add pA_0, pA_0, #8 + fmla v16.2s, v0.2s, v8.2s[0] + fmla v20.2s, v0.2s, v8.2s[1] .endm .macro SAVE2x2 + ld1 {v8.2s}, [pCRow0] + fmla v8.2s, v16.2s, alphaV0 + st1 {v8.2s}, [pCRow0] add pCRow1 , pCRow0, LDC - mov v0.d[0], tempALPHA - - L1ST ( s8,pCRow0, #0 ) - L1ST ( s9,pCRow0, #4 ) - - F1ST ( s8 , s0 , s16) - F1ST ( s9 , s0 , s17) - - str s8 , [pCRow0] - str s9 , [pCRow0, #4 ] - - L1ST ( s12,pCRow1, #0 ) - L1ST ( s13,pCRow1, #4 ) - - F1ST ( s12, s0 , s20) - F1ST ( s13, s0 , s21) - - str s12, [pCRow1] - str s13, [pCRow1, #4 ] + ld1 {v12.2s}, [pCRow1] + fmla v12.2s, v20.2s, alphaV1 + st1 {v12.2s}, [pCRow1] add pCRow0, pCRow0, #8 - .endm 
/******************************************************************************/ .macro INIT1x2 - - fsub s16 , s16 , s16 - fmov s20, s16 - + fmov s16, wzr .endm - - .macro KERNEL1x2_SUB - - ldr s8 , [ pB ] - ldr s9 , [ pB, #4 ] - - ldr s0 , [ pA ] - fmadd s16 , s0, s8, s16 - fmadd s20 , s0, s9, s20 - - add pA , pA, #4 + ld1 {v8.2s} , [pB] add pB , pB, #8 + ldr s0 , [pA_0] + add pA_0, pA_0, #4 + + fmla v16.2s, v8.2s, v0.2s[0] .endm .macro SAVE1x2 - add pCRow1 , pCRow0, LDC - mov v0.d[0], tempALPHA - - L1ST ( s8,pCRow0, #0) - F1ST ( s8 , s0 , s16) - str s8 , [pCRow0] - - L1ST ( s12,pCRow1, #0) - F1ST ( s12, s0 , s20) - str s12, [pCRow1] + ld1 {v8.s}[0], [pCRow0] + ld1 {v8.s}[1], [pCRow1] + fmla v8.2s, v16.2s, alphaV0 + st1 {v8.s}[0], [pCRow0] + st1 {v8.s}[1], [pCRow1] add pCRow0, pCRow0, #4 - .endm -/******************************************************************************/ /******************************************************************************/ .macro INIT4x1 - - fsub s16 , s16 , s16 - fmov s17, s16 - fmov s18, s16 - fmov s19, s16 - + fmov s16, wzr + fmov s17, s16 .endm - - .macro KERNEL4x1_SUB - - ldr s8 , [ pB ] - - ldr s0 , [ pA ] - ldr s1 , [ pA, #4 ] - ldr s2 , [ pA, #8 ] - ldr s3 , [ pA, #12 ] - - fmadd s16 , s0, s8, s16 - fmadd s17 , s1, s8, s17 - fmadd s18 , s2, s8, s18 - fmadd s19 , s3, s8, s19 - - add pA , pA, #16 + ldr s8, [pB] add pB , pB, #4 + ld1 {v0.2s, v1.2s}, [pA_0] + add pA_0 , pA_0, #16 + + fmla v16.2s, v0.2s, v8.2s[0] + fmla v17.2s, v1.2s, v8.2s[0] .endm .macro SAVE4x1 - - - mov v0.d[0], tempALPHA - - L1ST ( s8,pCRow0, #0 ) - L1ST ( s9,pCRow0, #4 ) - L1ST ( s10,pCRow0, #8 ) - L1ST ( s11,pCRow0, #12 ) - - F1ST ( s8 , s0 , s16) - F1ST ( s9 , s0 , s17) - F1ST ( s10, s0 , s18) - F1ST ( s11, s0 , s19) - - str s8 , [pCRow0] - str s9 , [pCRow0, #4 ] - str s10, [pCRow0, #8 ] - str s11, [pCRow0, #12 ] + ld1 {v8.2s, v9.2s}, [pCRow0] + fmla v8.2s, v16.2s, alphaV0 + fmla v9.2s, v17.2s, alphaV1 + st1 {v8.2s, v9.2s}, [pCRow0] add pCRow0, pCRow0, #16 - .endm @@ -668,186 +815,271 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
/******************************************************************************/ .macro INIT2x1 - - fsub s16 , s16 , s16 - fmov s17, s16 - + fmov s16, wzr .endm - - .macro KERNEL2x1_SUB - - ldr s8 , [ pB ] - - ldr s0 , [ pA ] - ldr s1 , [ pA, #4 ] - - fmadd s16 , s0, s8, s16 - fmadd s17 , s1, s8, s17 - - add pA , pA, #8 + ldr s8, [pB] add pB , pB, #4 + ld1 {v0.2s}, [pA_0] + add pA_0 , pA_0, #8 + + fmla v16.2s, v0.2s, v8.2s[0] .endm .macro SAVE2x1 - - - mov v0.d[0], tempALPHA - - L1ST ( s8,pCRow0, #0 ) - L1ST ( s9,pCRow0, #4 ) - - F1ST ( s8 , s0 , s16) - F1ST ( s9 , s0 , s17) - - str s8 , [pCRow0] - str s9 , [pCRow0, #4 ] + ld1 {v8.2s}, [pCRow0] + fmla v8.2s, v16.2s, alphaV0 + st1 {v8.2s}, [pCRow0] add pCRow0, pCRow0, #8 - .endm /******************************************************************************/ .macro INIT1x1 - - fsub s16 , s16 , s16 - + fmov s16, wzr .endm - - .macro KERNEL1x1_SUB - - ldr s8 , [ pB ] - - ldr s0 , [ pA ] - - fmadd s16 , s0, s8, s16 - - add pA , pA, #4 + ldr s8, [pB] add pB , pB, #4 + ldr s0, [pA_0] + add pA_0 , pA_0, #4 + + fmadd s16, s0, s8, s16 .endm .macro SAVE1x1 - - - mov v0.d[0], tempALPHA - - L1ST ( s8,pCRow0, #0 ) - F1ST ( s8 , s0 , s16) - str s8 , [pCRow0] + ldr s8, [pCRow0] + fmadd s8, s16, alpha0, s8 + str s8, [pCRow0] add pCRow0, pCRow0, #4 - .endm - - - - -/************************************************************************************** +/******************************************************************************* * End of macro definitions -**************************************************************************************/ +*******************************************************************************/ PROLOGUE .align 5 - add sp,sp,#-(5*16) - stp d8,d9,[sp,#(0*16)] - stp d10,d11,[sp,#(1*16)] - stp d12,d13,[sp,#(2*16)] - stp d14,d15,[sp,#(3*16)] - stp d16,d17,[sp,#(4*16)] + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] - mov tempALPHA, v0.d[0] - lsl LDC, LDC, #2 // ldc = ldc * 4 + fmov alpha0, s0 + fmov alpha1, s0 + fmov alpha2, s0 + fmov alpha3, s0 + + lsl LDC, LDC, #2 // ldc = ldc * 4 mov pB, origPB mov counterJ, origN - asr counterJ, counterJ, #2 // J = J / 4 + asr counterJ, counterJ, #2 // J = J / 4 cmp counterJ, #0 ble sgemm_kernel_L2_BEGIN +/******************************************************************************/ + sgemm_kernel_L4_BEGIN: + mov pCRow0, pC // pCRow0 = C + add pC, pC, LDC, lsl #2 - mov pCRow0, pC // pCRow0 = C - add pC,pC,LDC, lsl #2 + lsl temp, origK, #4 // k * 4 * 4 + mov pA_0, origPA // pA_0 = start of A array + add pA_1, temp, pA_0 + add pA_2, temp, pA_1 + add pA_3, temp, pA_2 - mov pA, origPA // pA = start of A array - - - -sgemm_kernel_L4_M4_BEGIN: +sgemm_kernel_L4_M16_BEGIN: mov counterI, origM - asr counterI, counterI, #2 // counterI = counterI / 4 + asr counterI, counterI, #4 // counterI = counterI / 16 cmp counterI, #0 + ble sgemm_kernel_L4_M8_BEGIN + +sgemm_kernel_L4_M16_20: + + mov pB, origPB + asr counterL , origK, #1 // L = K / 2 + cmp counterL , #2 // is there at least 4 to do? 
+ blt sgemm_kernel_L4_M16_32 + + KERNEL16x4_I // do one in the K + KERNEL16x4_M2 // do another in the K + + subs counterL, counterL, #2 + ble sgemm_kernel_L4_M16_22a + .align 5 + +sgemm_kernel_L4_M16_22: + + KERNEL16x4_M1 + KERNEL16x4_M2 + + subs counterL, counterL, #1 + bgt sgemm_kernel_L4_M16_22 + + +sgemm_kernel_L4_M16_22a: + + KERNEL16x4_M1 + KERNEL16x4_E + + b sgemm_kernel_L4_M16_44 + +sgemm_kernel_L4_M16_32: + + tst counterL, #1 + ble sgemm_kernel_L4_M16_40 + + KERNEL16x4_I + + KERNEL16x4_E + + b sgemm_kernel_L4_M16_44 + + +sgemm_kernel_L4_M16_40: + + INIT16x4 + +sgemm_kernel_L4_M16_44: + + ands counterL , origK, #1 + ble sgemm_kernel_L4_M16_100 + +sgemm_kernel_L4_M16_46: + + KERNEL16x4_SUB + +sgemm_kernel_L4_M16_100: + + SAVE16x4 + +sgemm_kernel_L4_M16_END: + lsl temp, origK, #4 // k * 4 * 4 = Four rows of A + add pA_0, pA_0, temp + add pA_0, pA_0, temp + add pA_0, pA_0, temp + add pA_1, pA_0, temp + add pA_2, pA_1, temp + add pA_3, pA_2, temp + subs counterI, counterI, #1 + bne sgemm_kernel_L4_M16_20 + +sgemm_kernel_L4_M8_BEGIN: + mov counterI, origM + tst counterI , #15 + ble sgemm_kernel_L4_END + + tst counterI, #8 + ble sgemm_kernel_L4_M4_BEGIN + +sgemm_kernel_L4_M8_20: + + INIT8x4 + + mov pB, origPB + asr counterL, origK, #3 // counterL = counterL / 8 + cmp counterL, #0 + ble sgemm_kernel_L4_M8_40 + +sgemm_kernel_L4_M8_22: + + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + KERNEL8x4_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L4_M8_22 + + +sgemm_kernel_L4_M8_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble sgemm_kernel_L4_M8_100 + +sgemm_kernel_L4_M8_42: + + KERNEL8x4_SUB + + subs counterL, counterL, #1 + bgt sgemm_kernel_L4_M8_42 + +sgemm_kernel_L4_M8_100: + + SAVE8x4 + +sgemm_kernel_L4_M8_END: + lsl temp, origK, #4 // k * 4 * 4 + add pA_0, pA_0, temp + +sgemm_kernel_L4_M4_BEGIN: + mov counterI, origM + tst counterI , #7 + ble sgemm_kernel_L4_END + + tst counterI, #4 ble sgemm_kernel_L4_M2_BEGIN sgemm_kernel_L4_M4_20: + INIT4x4 + mov pB, origPB - asr counterL , origK, #1 // L = K / 2 - cmp counterL , #2 // is there at least 4 to do? 
- blt sgemm_kernel_L4_M4_32 - - - - KERNEL4x4_I //do one in the K - KERNEL4x4_M2 //do another in the K - - subs counterL, counterL, #2 // subtract 2, since one is always done at the tail - ble sgemm_kernel_L4_M4_22a - .align 5 + asr counterL, origK, #3 // counterL = counterL / 8 + cmp counterL, #0 + ble sgemm_kernel_L4_M4_40 sgemm_kernel_L4_M4_22: - KERNEL4x4_M1 - KERNEL4x4_M2 + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB + KERNEL4x4_SUB subs counterL, counterL, #1 bgt sgemm_kernel_L4_M4_22 -sgemm_kernel_L4_M4_22a: - - KERNEL4x4_M1 - KERNEL4x4_E - - b sgemm_kernel_L4_M4_44 - -sgemm_kernel_L4_M4_32: // less than 4 to do in the K direction - - tst counterL, #1 - ble sgemm_kernel_L4_M4_40 - - KERNEL4x4_I - - KERNEL4x4_E - - b sgemm_kernel_L4_M4_44 - sgemm_kernel_L4_M4_40: - INIT4x4 - - -sgemm_kernel_L4_M4_44: - - ands counterL , origK, #1 + ands counterL , origK, #7 // counterL = counterL % 8 ble sgemm_kernel_L4_M4_100 -sgemm_kernel_L4_M4_46: +sgemm_kernel_L4_M4_42: KERNEL4x4_SUB subs counterL, counterL, #1 - bne sgemm_kernel_L4_M4_46 + bgt sgemm_kernel_L4_M4_42 sgemm_kernel_L4_M4_100: @@ -855,9 +1087,6 @@ sgemm_kernel_L4_M4_100: sgemm_kernel_L4_M4_END: - subs counterI, counterI, #1 - bne sgemm_kernel_L4_M4_20 - sgemm_kernel_L4_M2_BEGIN: @@ -865,7 +1094,7 @@ sgemm_kernel_L4_M2_BEGIN: tst counterI , #3 ble sgemm_kernel_L4_END - tst counterI, #2 // counterI = counterI / 2 + tst counterI, #2 // counterI = counterI / 2 ble sgemm_kernel_L4_M1_BEGIN sgemm_kernel_L4_M2_20: @@ -873,7 +1102,7 @@ sgemm_kernel_L4_M2_20: INIT2x4 mov pB, origPB - asr counterL , origK, #3 // counterL = counterL / 8 + asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 ble sgemm_kernel_L4_M2_40 @@ -895,7 +1124,7 @@ sgemm_kernel_L4_M2_22: sgemm_kernel_L4_M2_40: - ands counterL , origK, #7 // counterL = counterL % 8 + ands counterL , origK, #7 // counterL = counterL % 8 ble sgemm_kernel_L4_M2_100 sgemm_kernel_L4_M2_42: @@ -914,7 +1143,7 @@ sgemm_kernel_L4_M2_END: sgemm_kernel_L4_M1_BEGIN: - tst counterI, #1 // counterI = counterI % 2 + tst counterI, #1 // counterI = counterI % 2 ble sgemm_kernel_L4_END sgemm_kernel_L4_M1_20: @@ -922,7 +1151,7 @@ sgemm_kernel_L4_M1_20: INIT1x4 mov pB, origPB - asr counterL , origK, #3 // counterL = counterL / 8 + asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 ble sgemm_kernel_L4_M1_40 @@ -943,7 +1172,7 @@ sgemm_kernel_L4_M1_22: sgemm_kernel_L4_M1_40: - ands counterL , origK, #7 // counterL = counterL % 8 + ands counterL , origK, #7 // counterL = counterL % 8 ble sgemm_kernel_L4_M1_100 sgemm_kernel_L4_M1_42: @@ -960,35 +1189,36 @@ sgemm_kernel_L4_M1_100: sgemm_kernel_L4_END: - add origPB, origPB, origK, lsl #4 // B = B + K * 4 * 4 + lsl temp, origK, #4 + add origPB, origPB, temp // B = B + K * 4 * 4 - subs counterJ, counterJ , #1 // j-- + subs counterJ, counterJ , #1 // j-- bgt sgemm_kernel_L4_BEGIN - -/*********************************************************************************************/ +/******************************************************************************/ sgemm_kernel_L2_BEGIN: // less than 2 left in N direction mov counterJ , origN tst counterJ , #3 - ble sgemm_kernel_L999 // error, N was less than 4? 
+ ble sgemm_kernel_L999 tst counterJ , #2 ble sgemm_kernel_L1_BEGIN - mov pCRow0, pC // pCRow0 = pC - add pC , pC, LDC, lsl #1 + mov pCRow0, pC // pCRow0 = pC - mov pA, origPA // pA = A + add pC,pC,LDC, lsl #1 + + mov pA_0, origPA // pA_0 = A sgemm_kernel_L2_M4_BEGIN: mov counterI, origM - asr counterI, counterI, #2 // counterI = counterI / 4 + asr counterI, counterI, #2 // counterI = counterI / 4 cmp counterI,#0 ble sgemm_kernel_L2_M2_BEGIN @@ -997,7 +1227,7 @@ sgemm_kernel_L2_M4_20: INIT4x2 mov pB, origPB - asr counterL , origK, #3 // counterL = counterL / 8 + asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 ble sgemm_kernel_L2_M4_40 .align 5 @@ -1019,7 +1249,7 @@ sgemm_kernel_L2_M4_22: sgemm_kernel_L2_M4_40: - ands counterL , origK, #7 // counterL = counterL % 8 + ands counterL , origK, #7 // counterL = counterL % 8 ble sgemm_kernel_L2_M4_100 sgemm_kernel_L2_M4_42: @@ -1045,7 +1275,7 @@ sgemm_kernel_L2_M2_BEGIN: tst counterI , #3 ble sgemm_kernel_L2_END - tst counterI, #2 // counterI = counterI / 2 + tst counterI, #2 // counterI = counterI / 2 ble sgemm_kernel_L2_M1_BEGIN sgemm_kernel_L2_M2_20: @@ -1053,7 +1283,7 @@ sgemm_kernel_L2_M2_20: INIT2x2 mov pB, origPB - asr counterL , origK, #3 // counterL = counterL / 8 + asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL,#0 ble sgemm_kernel_L2_M2_40 @@ -1075,7 +1305,7 @@ sgemm_kernel_L2_M2_22: sgemm_kernel_L2_M2_40: - ands counterL , origK, #7 // counterL = counterL % 8 + ands counterL , origK, #7 // counterL = counterL % 8 ble sgemm_kernel_L2_M2_100 sgemm_kernel_L2_M2_42: @@ -1094,7 +1324,7 @@ sgemm_kernel_L2_M2_END: sgemm_kernel_L2_M1_BEGIN: - tst counterI, #1 // counterI = counterI % 2 + tst counterI, #1 // counterI = counterI % 2 ble sgemm_kernel_L2_END sgemm_kernel_L2_M1_20: @@ -1102,7 +1332,7 @@ sgemm_kernel_L2_M1_20: INIT1x2 mov pB, origPB - asr counterL , origK, #3 // counterL = counterL / 8 + asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL, #0 ble sgemm_kernel_L2_M1_40 @@ -1123,7 +1353,7 @@ sgemm_kernel_L2_M1_22: sgemm_kernel_L2_M1_40: - ands counterL , origK, #7 // counterL = counterL % 8 + ands counterL , origK, #7 // counterL = counterL % 8 ble sgemm_kernel_L2_M1_100 sgemm_kernel_L2_M1_42: @@ -1139,9 +1369,9 @@ sgemm_kernel_L2_M1_100: sgemm_kernel_L2_END: - add origPB, origPB, origK, lsl #3 // B = B + K * 2 * 4 + add origPB, origPB, origK, lsl #3 // B = B + K * 2 * 4 -/*********************************************************************************************/ +/******************************************************************************/ sgemm_kernel_L1_BEGIN: @@ -1150,17 +1380,17 @@ sgemm_kernel_L1_BEGIN: ble sgemm_kernel_L999 // done - mov pCRow0, pC // pCRow0 = C - add pC , pCRow0 , LDC // C01 is the current line, update pC to point to next + mov pCRow0, pC // pCRow0 = C + add pC , pC , LDC // Update pC to point to next - mov pA, origPA // pA = A + mov pA_0, origPA // pA_0 = A sgemm_kernel_L1_M4_BEGIN: mov counterI, origM - asr counterI, counterI, #2 // counterI = counterI / 4 + asr counterI, counterI, #2 // counterI = counterI / 4 cmp counterI, #0 ble sgemm_kernel_L1_M2_BEGIN @@ -1169,7 +1399,7 @@ sgemm_kernel_L1_M4_20: INIT4x1 mov pB, origPB - asr counterL , origK, #3 // counterL = counterL / 8 + asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 ble sgemm_kernel_L1_M4_40 .align 5 @@ -1191,7 +1421,7 @@ sgemm_kernel_L1_M4_22: sgemm_kernel_L1_M4_40: - ands counterL , origK, #7 // counterL = counterL % 8 + ands counterL , origK, #7 // counterL = counterL 
% 8 ble sgemm_kernel_L1_M4_100 sgemm_kernel_L1_M4_42: @@ -1217,7 +1447,7 @@ sgemm_kernel_L1_M2_BEGIN: tst counterI , #3 ble sgemm_kernel_L1_END - tst counterI, #2 // counterI = counterI / 2 + tst counterI, #2 // counterI = counterI / 2 ble sgemm_kernel_L1_M1_BEGIN sgemm_kernel_L1_M2_20: @@ -1225,7 +1455,7 @@ sgemm_kernel_L1_M2_20: INIT2x1 mov pB, origPB - asr counterL , origK, #3 // counterL = counterL / 8 + asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 ble sgemm_kernel_L1_M2_40 @@ -1247,7 +1477,7 @@ sgemm_kernel_L1_M2_22: sgemm_kernel_L1_M2_40: - ands counterL , origK, #7 // counterL = counterL % 8 + ands counterL , origK, #7 // counterL = counterL % 8 ble sgemm_kernel_L1_M2_100 sgemm_kernel_L1_M2_42: @@ -1266,7 +1496,7 @@ sgemm_kernel_L1_M2_END: sgemm_kernel_L1_M1_BEGIN: - tst counterI, #1 // counterI = counterI % 2 + tst counterI, #1 // counterI = counterI % 2 ble sgemm_kernel_L1_END sgemm_kernel_L1_M1_20: @@ -1274,7 +1504,7 @@ sgemm_kernel_L1_M1_20: INIT1x1 mov pB, origPB - asr counterL , origK, #3 // counterL = counterL / 8 + asr counterL , origK, #3 // counterL = counterL / 8 cmp counterL , #0 ble sgemm_kernel_L1_M1_40 @@ -1295,7 +1525,7 @@ sgemm_kernel_L1_M1_22: sgemm_kernel_L1_M1_40: - ands counterL , origK, #7 // counterL = counterL % 8 + ands counterL , origK, #7 // counterL = counterL % 8 ble sgemm_kernel_L1_M1_100 sgemm_kernel_L1_M1_42: @@ -1314,13 +1544,19 @@ sgemm_kernel_L1_END: sgemm_kernel_L999: - mov x0, #0 // set return value - ldp d8,d9,[sp,#(0*16)] - ldp d10,d11,[sp,#(1*16)] - ldp d12,d13,[sp,#(2*16)] - ldp d14,d15,[sp,#(3*16)] - ldp d16,d17,[sp,#(4*16)] - add sp,sp,#(5*16) + mov x0, #0 // set return value + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) ret EPILOGUE diff --git a/kernel/arm64/snrm2.S b/kernel/arm64/snrm2.S new file mode 100644 index 000000000..02c23a15f --- /dev/null +++ b/kernel/arm64/snrm2.S @@ -0,0 +1,178 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N x0 /* vector length */ +#define X x1 /* X vector address */ +#define INC_X x2 /* X stride */ +#define I x5 /* loop variable */ + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +#define TMPF s6 +#define SSQ s0 +#define TMPVF {v6.s}[0] +#define SZ 4 + +/******************************************************************************/ + +.macro INIT_F1 + ldr TMPF, [X], #SZ + fmul SSQ, TMPF, TMPF +.endm + +.macro KERNEL_F1 + ldr TMPF, [X], #SZ + fmul TMPF, TMPF, TMPF + fadd SSQ, SSQ, TMPF +.endm + +.macro INIT_F4 + ld1 {v1.4s}, [X], #16 + fmul v1.4s, v1.4s, v1.4s + ext v2.16b, v1.16b, v1.16b, #8 + fadd v2.2s, v1.2s, v2.2s + faddp SSQ, v2.2s +.endm + +.macro KERNEL_F4 + ld1 {v1.4s}, [X], #16 + fmul v1.4s, v1.4s, v1.4s + ext v2.16b, v1.16b, v1.16b, #8 + fadd v2.2s, v1.2s, v2.2s + faddp TMPF, v2.2s + fadd SSQ, SSQ, TMPF +.endm + +.macro INIT_S + lsl INC_X, INC_X, #2 + ld1 TMPVF, [X], INC_X + fmul SSQ, TMPF, TMPF +.endm + +.macro KERNEL_S1 + ld1 TMPVF, [X], INC_X + fmul TMPF, TMPF, TMPF + fadd SSQ, SSQ, TMPF +.endm + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + cmp N, xzr + ble nrm2_kernel_zero + cmp INC_X, xzr + ble nrm2_kernel_zero + cmp INC_X, #1 + bne nrm2_kernel_S_BEGIN + +nrm2_kernel_F_BEGIN: + + asr I, N, #2 + cmp I, xzr + beq nrm2_kernel_F1_INIT + + INIT_F4 + subs I, I, #1 + beq nrm2_kernel_F1 + +nrm2_kernel_F4: + + KERNEL_F4 + + subs I, I, #1 + bne nrm2_kernel_F4 + +nrm2_kernel_F1: + + ands I, N, #3 + ble nrm2_kernel_L999 + +nrm2_kernel_F10: + + KERNEL_F1 + + subs I, I, #1 + bne nrm2_kernel_F10 + + b nrm2_kernel_L999 + +nrm2_kernel_F1_INIT: + INIT_F1 + subs N, N, #1 + b nrm2_kernel_F1 + +nrm2_kernel_S_BEGIN: + + INIT_S + + subs N, N, #1 + ble nrm2_kernel_L999 + + asr I, N, #2 + cmp I, xzr + ble nrm2_kernel_S1 + +nrm2_kernel_S4: + + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + + subs I, I, #1 + bne nrm2_kernel_S4 + +nrm2_kernel_S1: + + ands I, N, #3 + ble nrm2_kernel_L999 + +nrm2_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne nrm2_kernel_S10 + +nrm2_kernel_L999: + fsqrt SSQ, SSQ + ret + +nrm2_kernel_zero: + fmov SSQ, wzr + + ret + + EPILOGUE diff --git a/kernel/arm64/strmm_kernel_4x4.S b/kernel/arm64/strmm_kernel_4x4.S new file mode 100644 index 000000000..674e200d8 --- /dev/null +++ b/kernel/arm64/strmm_kernel_4x4.S @@ -0,0 +1,1405 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +/* X0 X1 X2 s0 X3 x4 x5 x6 x7 */ +/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc, BLASLONG offset) */ + +#define origM x0 +#define origN x1 +#define origK x2 +#define origPA x3 +#define origPB x4 +#define pC x5 +#define LDC x6 +#define offset x7 +#define counterL x8 +#define counterI x9 +#define counterJ x10 +#define pB x11 +#define pCRow0 x12 +#define pCRow1 x13 +#define pCRow2 x14 +#define pA x15 +#define temp x16 +#define tempOffset x17 +#define tempK x18 + +#define alpha0 s10 +#define alphaV0 v10.s[0] +#define alpha1 s11 +#define alphaV1 v11.s[0] +#define alpha2 s14 +#define alphaV2 v14.s[0] +#define alpha3 s15 +#define alphaV3 v15.s[0] + +// 00 origM +// 01 origN +// 02 origK +// 03 origPA +// 04 origPB +// 05 pC +// 06 origLDC -> LDC +// 07 offset +// 08 counterL +// 09 counterI +// 10 counterJ +// 11 pB +// 12 pCRow0 +// 13 pCRow1 +// 14 pCRow2 +// 15 pA +// 16 temp +// 17 tempOffset +// 18 must save tempK +// 19 must save +// 20 must save +// 21 must save +// 22 must save +// 23 must save +// 24 must save +// 25 must save +// 26 must save +// 27 must save +// 28 must save +// 29 frame +// 30 link +// 31 sp + +//v00 ALPHA -> pA00, pA01 +//v01 pA02, pA03 +//v02 +//v03 +//v04 pA10, pA11 +//v05 pA12, pA13 +//v06 +//v07 +//v08 must save pB00, pB01 +//v09 must save pB02, pB03 +//v10 must save ALPHA0 +//v11 must save ALPHA1 +//v12 must save pB10, pB11 +//v13 must save pB12, pB13 +//v14 must save ALPHA2 +//v15 must save ALPHA3 +//v16 must save C00, C01 +//v17 must save C02, C03 +//v18 +//v19 +//v20 C10, C11 +//v21 C12, C13 +//v22 +//v23 +//v24 C20, C21 +//v25 C22, C23 +//v26 +//v27 +//v28 C30, C31 +//v29 C32, C33 +//v30 +//v31 + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +.macro INIT4x4 + fmov s16, wzr + fmov s17, s16 + fmov s20, s17 + fmov s21, s16 + fmov s24, s17 + fmov s25, s16 + fmov s28, s17 + fmov s29, s16 +.endm + +.macro KERNEL4x4_I + ld1 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld1 {v0.2s, 
v1.2s}, [pA] + add pA, pA, #16 + + fmul v16.2s, v0.2s, v8.2s[0] + fmul v29.2s, v1.2s, v9.2s[1] + + fmul v20.2s, v0.2s, v8.2s[1] + fmul v25.2s, v1.2s, v9.2s[0] + + fmul v24.2s, v0.2s, v9.2s[0] + fmul v21.2s, v1.2s, v8.2s[1] + + fmul v28.2s, v0.2s, v9.2s[1] + fmul v17.2s, v1.2s, v8.2s[0] + + ld1 {v12.2s, v13.2s}, [pB] + add pB, pB, #16 + ld1 {v4.2s, v5.2s}, [pA] + add pA, pA, #16 +.endm + +.macro KERNEL4x4_M1 + fmla v16.2s, v0.2s, v8.2s[0] + fmla v29.2s, v1.2s, v9.2s[1] + + ld1 {v12.2s, v13.2s}, [pB] // For next round + add pB, pB, #16 + + fmla v20.2s, v0.2s, v8.2s[1] + fmla v25.2s, v1.2s, v9.2s[0] + + ld1 {v4.2s, v5.2s}, [pA] // For next round + add pA, pA, #16 + + fmla v24.2s, v0.2s, v9.2s[0] + fmla v21.2s, v1.2s, v8.2s[1] + + prfm PLDL1KEEP, [pB, #512] + + fmla v28.2s, v0.2s, v9.2s[1] + fmla v17.2s, v1.2s, v8.2s[0] +.endm + +.macro KERNEL4x4_M2 + fmla v16.2s, v4.2s, v12.2s[0] + fmla v29.2s, v5.2s, v13.2s[1] + + ld1 {v8.2s, v9.2s}, [pB] // For next round + add pB, pB, #16 + + fmla v20.2s, v4.2s, v12.2s[1] + fmla v25.2s, v5.2s, v13.2s[0] + + ld1 {v0.2s, v1.2s}, [pA] // For next round + add pA, pA, #16 + + fmla v24.2s, v4.2s, v13.2s[0] + fmla v21.2s, v5.2s, v12.2s[1] + + prfm PLDL1KEEP, [pA, #512] + + fmla v28.2s, v4.2s, v13.2s[1] + fmla v17.2s, v5.2s, v12.2s[0] +.endm + +.macro KERNEL4x4_E + fmla v16.2s, v4.2s, v12.2s[0] + fmla v29.2s, v5.2s, v13.2s[1] + + fmla v20.2s, v4.2s, v12.2s[1] + fmla v25.2s, v5.2s, v13.2s[0] + + fmla v24.2s, v4.2s, v13.2s[0] + fmla v21.2s, v5.2s, v12.2s[1] + + fmla v28.2s, v4.2s, v13.2s[1] + fmla v17.2s, v5.2s, v12.2s[0] +.endm + +.macro KERNEL4x4_SUB + ld1 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld1 {v0.2s, v1.2s}, [pA] + add pA, pA, #16 + + fmla v16.2s, v0.2s, v8.2s[0] + fmla v29.2s, v1.2s, v9.2s[1] + + fmla v20.2s, v0.2s, v8.2s[1] + fmla v25.2s, v1.2s, v9.2s[0] + + fmla v24.2s, v0.2s, v9.2s[0] + fmla v21.2s, v1.2s, v8.2s[1] + + fmla v28.2s, v0.2s, v9.2s[1] + fmla v17.2s, v1.2s, v8.2s[0] +.endm + +.macro SAVE4x4 + fmul v8.2s, v16.2s, alphaV0 + fmul v9.2s, v17.2s, alphaV1 + st1 {v8.2s, v9.2s}, [pCRow0] + + add pCRow1, pCRow0, LDC + fmul v12.2s, v20.2s, alphaV2 + fmul v13.2s, v21.2s, alphaV3 + st1 {v12.2s, v13.2s}, [pCRow1] + + add pCRow2, pCRow1, LDC + fmul v8.2s, v24.2s, alphaV0 + fmul v9.2s, v25.2s, alphaV1 + st1 {v8.2s, v9.2s}, [pCRow2] + + add pCRow1, pCRow2, LDC + fmul v12.2s, v28.2s, alphaV2 + fmul v13.2s, v29.2s, alphaV3 + st1 {v12.2s, v13.2s}, [pCRow1] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT2x4 + fmov s16, wzr + fmov s20, s16 + fmov s24, s20 + fmov s28, s16 +.endm + +.macro KERNEL2x4_SUB + ld1 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld1 {v0.2s}, [pA] + add pA, pA, #8 + + fmla v16.2s, v0.2s, v8.2s[0] + fmla v20.2s, v0.2s, v8.2s[1] + fmla v24.2s, v0.2s, v9.2s[0] + fmla v28.2s, v0.2s, v9.2s[1] +.endm + +.macro SAVE2x4 + fmul v8.2s, v16.2s, alphaV0 + st1 {v8.2s}, [pCRow0] + + add pCRow1, pCRow0, LDC + fmul v12.2s, v20.2s, alphaV1 + st1 {v12.2s}, [pCRow1] + + add pCRow2, pCRow1, LDC + fmul v8.2s, v24.2s, alphaV2 + st1 {v8.2s}, [pCRow2] + + add pCRow1, pCRow2, LDC + fmul v12.2s, v28.2s, alphaV3 + st1 {v12.2s}, [pCRow1] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ + +.macro INIT1x4 + fmov s16, wzr + fmov s20, s16 +.endm + +.macro KERNEL1x4_SUB + ldr s0, [pA] + add pA, pA, #4 + + ld1 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + + fmla v16.2s, v8.2s, v0.s[0] + fmla v20.2s, v9.2s, v0.s[0] +.endm + 
+.macro SAVE1x4 + add pCRow1, pCRow0, LDC + fmul v8.2s, v16.2s, alphaV0 + st1 {v8.s}[0], [pCRow0] + st1 {v8.s}[1], [pCRow1] + + add pCRow2, pCRow1, LDC + add pCRow1, pCRow2, LDC + fmul v12.2s, v20.2s, alphaV1 + st1 {v12.s}[0], [pCRow2] + st1 {v12.s}[1], [pCRow1] + + add pCRow0, pCRow0, #4 +.endm + +/******************************************************************************/ + +.macro INIT4x2 + fmov s16, wzr + fmov s17, s16 + fmov s20, s17 + fmov s21, s16 +.endm + +.macro KERNEL4x2_SUB + ld1 {v8.2s}, [pB] + add pB, pB, #8 + ld1 {v0.2s, v1.2s}, [pA] + add pA, pA, #16 + + fmla v16.2s, v0.2s, v8.2s[0] + fmla v17.2s, v1.2s, v8.2s[0] + fmla v20.2s, v0.2s, v8.2s[1] + fmla v21.2s, v1.2s, v8.2s[1] +.endm + +.macro SAVE4x2 + fmul v8.2s, v16.2s, alphaV0 + fmul v9.2s, v17.2s, alphaV1 + st1 {v8.2s, v9.2s}, [pCRow0] + + add pCRow1, pCRow0, LDC + fmul v12.2s, v20.2s, alphaV2 + fmul v13.2s, v21.2s, alphaV3 + st1 {v12.2s, v13.2s}, [pCRow1] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT2x2 + fmov s16, wzr + fmov s20, s16 +.endm + +.macro KERNEL2x2_SUB + ld1 {v8.2s}, [pB] + add pB, pB, #8 + + ld1 {v0.2s}, [pA] + add pA, pA, #8 + + fmla v16.2s, v0.2s, v8.2s[0] + fmla v20.2s, v0.2s, v8.2s[1] +.endm + +.macro SAVE2x2 + fmul v8.2s, v16.2s, alphaV0 + st1 {v8.2s}, [pCRow0] + + add pCRow1 , pCRow0, LDC + fmul v12.2s, v20.2s, alphaV1 + st1 {v12.2s}, [pCRow1] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ + +.macro INIT1x2 + fmov s16, wzr +.endm + +.macro KERNEL1x2_SUB + ld1 {v8.2s} , [pB] + add pB , pB, #8 + + ldr s0 , [pA] + add pA, pA, #4 + + fmla v16.2s, v8.2s, v0.2s[0] +.endm + +.macro SAVE1x2 + add pCRow1 , pCRow0, LDC + fmul v8.2s, v16.2s, alphaV0 + st1 {v8.s}[0], [pCRow0] + st1 {v8.s}[1], [pCRow1] + + add pCRow0, pCRow0, #4 +.endm + +/******************************************************************************/ + +.macro INIT4x1 + fmov s16, wzr + fmov s17, s16 +.endm + +.macro KERNEL4x1_SUB + ldr s8, [pB] + add pB , pB, #4 + + ld1 {v0.2s, v1.2s}, [pA] + add pA , pA, #16 + + fmla v16.2s, v0.2s, v8.2s[0] + fmla v17.2s, v1.2s, v8.2s[0] +.endm + +.macro SAVE4x1 + fmul v8.2s, v16.2s, alphaV0 + fmul v9.2s, v17.2s, alphaV1 + st1 {v8.2s, v9.2s}, [pCRow0] + + add pCRow0, pCRow0, #16 +.endm + + + + +/******************************************************************************/ + +.macro INIT2x1 + fmov s16, wzr +.endm + +.macro KERNEL2x1_SUB + ldr s8, [pB] + add pB , pB, #4 + + ld1 {v0.2s}, [pA] + add pA , pA, #8 + + fmla v16.2s, v0.2s, v8.2s[0] +.endm + +.macro SAVE2x1 + fmul v8.2s, v16.2s, alphaV0 + st1 {v8.2s}, [pCRow0] + + add pCRow0, pCRow0, #8 +.endm + +/******************************************************************************/ + +.macro INIT1x1 + fmov s16, wzr +.endm + +.macro KERNEL1x1_SUB + ldr s8, [pB] + add pB , pB, #4 + + ldr s0, [pA] + add pA , pA, #4 + + fmadd s16, s0, s8, s16 +.endm + +.macro SAVE1x1 + fmul s8, s16, alpha0 + str s8, [pCRow0] + + add pCRow0, pCRow0, #4 +.endm + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + +strmm_kernel_begin: + + .align 5 + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, 
x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] + + fmov alpha0, s0 + fmov alpha1, s0 + fmov alpha2, s0 + fmov alpha3, s0 + + lsl LDC, LDC, #2 // ldc = ldc * 4 + +#if !defined(LEFT) + neg tempOffset, offset +#endif + + mov pB, origPB + + mov counterJ, origN + asr counterJ, counterJ, #2 // J = J / 4 + cmp counterJ, #0 + ble strmm_kernel_L2_BEGIN + +/******************************************************************************/ + +strmm_kernel_L4_BEGIN: + mov pCRow0, pC // pCRow0 = C + add pC, pC, LDC, lsl #2 + +#if defined(LEFT) + mov tempOffset, offset +#endif + + mov pA, origPA // pA = start of A array + +strmm_kernel_L4_M4_BEGIN: + + mov counterI, origM + asr counterI, counterI, #2 // counterI = counterI / 4 + cmp counterI, #0 + ble strmm_kernel_L4_M2_BEGIN + +strmm_kernel_L4_M4_20: + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #4 + add pB, pB, temp + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #4 +#else + add tempK, tempOffset, #4 +#endif + + asr counterL , tempK, #1 // L = K / 2 + cmp counterL , #2 // is there at least 4 to do? + blt strmm_kernel_L4_M4_32 + + KERNEL4x4_I // do one in the K + KERNEL4x4_M2 // do another in the K + + subs counterL, counterL, #2 + ble strmm_kernel_L4_M4_22a + .align 5 + +strmm_kernel_L4_M4_22: + + KERNEL4x4_M1 + KERNEL4x4_M2 + + subs counterL, counterL, #1 + bgt strmm_kernel_L4_M4_22 + +strmm_kernel_L4_M4_22a: + + KERNEL4x4_M1 + KERNEL4x4_E + + b strmm_kernel_L4_M4_44 + +strmm_kernel_L4_M4_32: + + tst counterL, #1 + ble strmm_kernel_L4_M4_40 + + KERNEL4x4_I + KERNEL4x4_E + + b strmm_kernel_L4_M4_44 + +strmm_kernel_L4_M4_40: + + INIT4x4 + +strmm_kernel_L4_M4_44: + + ands counterL , tempK, #1 + ble strmm_kernel_L4_M4_100 + +strmm_kernel_L4_M4_46: + + KERNEL4x4_SUB + +strmm_kernel_L4_M4_100: + + SAVE4x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #4 +#else + sub tempK, tempK, #4 +#endif + lsl temp, tempK, #4 + add pA, pA, temp + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #4 +#endif + +strmm_kernel_L4_M4_END: + subs counterI, counterI, #1 + bne strmm_kernel_L4_M4_20 + +strmm_kernel_L4_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble strmm_kernel_L4_END + + tst counterI, #2 // counterI = counterI / 2 + ble strmm_kernel_L4_M1_BEGIN + +strmm_kernel_L4_M2_20: + + INIT2x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #3 + add pA, pA, temp + lsl temp, tempOffset, #4 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #2 +#else + add tempK, tempOffset, #4 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble strmm_kernel_L4_M2_40 + +strmm_kernel_L4_M2_22: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L4_M2_22 + + +strmm_kernel_L4_M2_40: + + ands counterL , tempK, #7 // counterL = 
counterL % 8 + ble strmm_kernel_L4_M2_100 + +strmm_kernel_L4_M2_42: + + KERNEL2x4_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L4_M2_42 + +strmm_kernel_L4_M2_100: + + SAVE2x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #2 +#else + sub tempK, tempK, #4 +#endif + lsl temp, tempK, #3 + add pA, pA, temp + lsl temp, tempK, #4 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #2 +#endif + + +strmm_kernel_L4_M2_END: + + +strmm_kernel_L4_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble strmm_kernel_L4_END + +strmm_kernel_L4_M1_20: + + INIT1x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #4 + add pB, pB, temp + lsl temp, tempOffset, #2 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #1 +#else + add tempK, tempOffset, #4 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble strmm_kernel_L4_M1_40 + +strmm_kernel_L4_M1_22: + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L4_M1_22 + + +strmm_kernel_L4_M1_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble strmm_kernel_L4_M1_100 + +strmm_kernel_L4_M1_42: + + KERNEL1x4_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L4_M1_42 + +strmm_kernel_L4_M1_100: + + SAVE1x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #1 +#else + sub tempK, tempK, #4 +#endif + lsl temp, tempK, #2 + add pA, pA, temp + lsl temp, tempK, #4 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #1 +#endif + + +strmm_kernel_L4_END: + add origPB, origPB, origK, lsl #4 // B = B + K * 4 * 4 + +#if !defined(LEFT) + add tempOffset, tempOffset, #4 +#endif + + subs counterJ, counterJ , #1 // j-- + bgt strmm_kernel_L4_BEGIN + + +/******************************************************************************/ + +strmm_kernel_L2_BEGIN: // less than 2 left in N direction + + mov counterJ , origN + tst counterJ , #3 + ble strmm_kernel_L999 + + tst counterJ , #2 + ble strmm_kernel_L1_BEGIN + + mov pCRow0, pC // pCRow0 = pC + + add pC,pC,LDC, lsl #1 + +#if defined(LEFT) + mov tempOffset, offset +#endif + + mov pA, origPA // pA = A + +strmm_kernel_L2_M4_BEGIN: + + mov counterI, origM + asr counterI, counterI, #2 // counterI = counterI / 4 + cmp counterI,#0 + ble strmm_kernel_L2_M2_BEGIN + +strmm_kernel_L2_M4_20: + + INIT4x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #3 + add pB, pB, temp + lsl temp, tempOffset, #4 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #4 +#else + add tempK, tempOffset, #2 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble strmm_kernel_L2_M4_40 + .align 5 + +strmm_kernel_L2_M4_22: + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + 
KERNEL4x2_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L2_M4_22 + + +strmm_kernel_L2_M4_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble strmm_kernel_L2_M4_100 + +strmm_kernel_L2_M4_42: + + KERNEL4x2_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L2_M4_42 + +strmm_kernel_L2_M4_100: + + SAVE4x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #4 +#else + sub tempK, tempK, #2 +#endif + lsl temp, tempK, #4 + add pA, pA, temp + lsl temp, tempK, #3 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #4 +#endif + +strmm_kernel_L2_M4_END: + + subs counterI, counterI, #1 + bgt strmm_kernel_L2_M4_20 + + +strmm_kernel_L2_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble strmm_kernel_L2_END + + tst counterI, #2 // counterI = counterI / 2 + ble strmm_kernel_L2_M1_BEGIN + +strmm_kernel_L2_M2_20: + + INIT2x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #3 + add pB, pB, temp + lsl temp, tempOffset, #3 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #2 +#else + add tempK, tempOffset, #2 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble strmm_kernel_L2_M2_40 + +strmm_kernel_L2_M2_22: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L2_M2_22 + + +strmm_kernel_L2_M2_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble strmm_kernel_L2_M2_100 + +strmm_kernel_L2_M2_42: + + KERNEL2x2_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L2_M2_42 + +strmm_kernel_L2_M2_100: + + SAVE2x2 +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #2 +#else + sub tempK, tempK, #2 +#endif + lsl temp, tempK, #3 + add pA, pA, temp + lsl temp, tempK, #3 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #2 +#endif + +strmm_kernel_L2_M2_END: + + +strmm_kernel_L2_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble strmm_kernel_L2_END + +strmm_kernel_L2_M1_20: + + INIT1x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #3 + add pB, pB, temp + lsl temp, tempOffset, #2 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #1 +#else + add tempK, tempOffset, #2 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL, #0 + ble strmm_kernel_L2_M1_40 + +strmm_kernel_L2_M1_22: + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L2_M1_22 + + +strmm_kernel_L2_M1_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble strmm_kernel_L2_M1_100 + +strmm_kernel_L2_M1_42: + + KERNEL1x2_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L2_M1_42 + +strmm_kernel_L2_M1_100: + + SAVE1x2 + +#if (defined(LEFT) && defined(TRANSA)) || 
(!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #1 +#else + sub tempK, tempK, #2 +#endif + lsl temp, tempK, #2 + add pA, pA, temp + lsl temp, tempK, #3 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #1 +#endif + +strmm_kernel_L2_END: +#if !defined(LEFT) + add tempOffset, tempOffset, #2 +#endif + add origPB, origPB, origK, lsl #3 // B = B + K * 2 * 4 + +/******************************************************************************/ + +strmm_kernel_L1_BEGIN: + + mov counterJ , origN + tst counterJ , #1 + ble strmm_kernel_L999 // done + + + mov pCRow0, pC // pCRow0 = C + add pC , pC , LDC // Update pC to point to next + +#if defined(LEFT) + mov tempOffset, offset +#endif + + mov pA, origPA // pA = A + +strmm_kernel_L1_M4_BEGIN: + + mov counterI, origM + asr counterI, counterI, #2 // counterI = counterI / 4 + cmp counterI, #0 + ble strmm_kernel_L1_M2_BEGIN + +strmm_kernel_L1_M4_20: + + INIT4x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #2 + add pB, pB, temp + lsl temp, tempOffset, #4 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #4 +#else + add tempK, tempOffset, #1 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble strmm_kernel_L1_M4_40 + .align 5 + +strmm_kernel_L1_M4_22: + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L1_M4_22 + + +strmm_kernel_L1_M4_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble strmm_kernel_L1_M4_100 + +strmm_kernel_L1_M4_42: + + KERNEL4x1_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L1_M4_42 + +strmm_kernel_L1_M4_100: + + SAVE4x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #4 +#else + sub tempK, tempK, #1 +#endif + lsl temp, tempK, #4 + add pA, pA, temp + lsl temp, tempK, #2 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #4 +#endif + +strmm_kernel_L1_M4_END: + + subs counterI, counterI, #1 + bgt strmm_kernel_L1_M4_20 + + +strmm_kernel_L1_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble strmm_kernel_L1_END + + tst counterI, #2 // counterI = counterI / 2 + ble strmm_kernel_L1_M1_BEGIN + +strmm_kernel_L1_M2_20: + + INIT2x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #2 + add pB, pB, temp + lsl temp, tempOffset, #3 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #2 +#else + add tempK, tempOffset, #1 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble strmm_kernel_L1_M2_40 + +strmm_kernel_L1_M2_22: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L1_M2_22 + + +strmm_kernel_L1_M2_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble strmm_kernel_L1_M2_100 + +strmm_kernel_L1_M2_42: + + 
KERNEL2x1_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L1_M2_42 + +strmm_kernel_L1_M2_100: + + SAVE2x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #2 +#else + sub tempK, tempK, #1 +#endif + lsl temp, tempK, #3 + add pA, pA, temp + lsl temp, tempK, #2 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #2 +#endif + + +strmm_kernel_L1_M2_END: + + +strmm_kernel_L1_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble strmm_kernel_L1_END + +strmm_kernel_L1_M1_20: + + INIT1x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #2 + add pB, pB, temp + lsl temp, tempOffset, #2 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #1 +#else + add tempK, tempOffset, #1 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble strmm_kernel_L1_M1_40 + +strmm_kernel_L1_M1_22: + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L1_M1_22 + + +strmm_kernel_L1_M1_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble strmm_kernel_L1_M1_100 + +strmm_kernel_L1_M1_42: + + KERNEL1x1_SUB + + subs counterL, counterL, #1 + bgt strmm_kernel_L1_M1_42 + +strmm_kernel_L1_M1_100: + + SAVE1x1 + +#if 0 +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #1 +#else + sub tempK, tempK, #1 +#endif + lsl temp, tempK, #2 + add pA, pA, temp + lsl temp, tempK, #2 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #1 +#endif +#endif + +strmm_kernel_L1_END: + +#if 0 +#if !defined(LEFT) + add tempOffset, tempOffset, #1 +#endif +#endif + +strmm_kernel_L999: + mov x0, #0 // set return value + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) + ret + + EPILOGUE + diff --git a/kernel/arm64/swap.S b/kernel/arm64/swap.S new file mode 100644 index 000000000..37ed83f2a --- /dev/null +++ b/kernel/arm64/swap.S @@ -0,0 +1,266 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N x0 /* vector length */ +#define X x3 /* X vector address */ +#define INC_X x4 /* X stride */ +#define Y x5 /* Y vector address */ +#define INC_Y x6 /* Y stride */ +#define I x1 /* loop variable */ + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +#if !defined(DOUBLE) +#define TMP0 s0 +#define TMPV0 {v0.s}[0] +#define TMP1 s1 +#define TMPV1 {v1.s}[0] +#define SZ 4 +#else +#define TMP0 d0 +#define TMPV0 {v0.d}[0] +#define TMP1 d1 +#define TMPV1 {v1.d}[0] +#define SZ 8 +#endif + +/******************************************************************************/ + +.macro KERNEL_F1 + +#if !defined(COMPLEX) + ldr TMP0, [X] + ldr TMP1, [Y] + str TMP0, [Y], #SZ + str TMP1, [X], #SZ +#else +#if !defined(DOUBLE) + ld1 {v0.2s}, [X] + ld1 {v1.2s}, [Y] + st1 {v0.2s}, [Y], #8 + st1 {v1.2s}, [X], #8 +#else + ld1 {v0.2d}, [X] + ld1 {v1.2d}, [Y] + st1 {v0.2d}, [Y], #16 + st1 {v1.2d}, [X], #16 +#endif +#endif + +.endm + +.macro KERNEL_F8 + +#if !defined(COMPLEX) +#if !defined(DOUBLE) + ld1 {v0.4s, v1.4s}, [X] + ld1 {v2.4s, v3.4s}, [Y] + st1 {v0.4s, v1.4s}, [Y], #32 + st1 {v2.4s, v3.4s}, [X], #32 +#else // DOUBLE + ld1 {v0.4s, v1.4s}, [X] + ld1 {v2.4s, v3.4s}, [Y] + st1 {v0.4s, v1.4s}, [Y], #32 + st1 {v2.4s, v3.4s}, [X], #32 + ld1 {v0.4s, v1.4s}, [X] + ld1 {v2.4s, v3.4s}, [Y] + st1 {v0.4s, v1.4s}, [Y], #32 + st1 {v2.4s, v3.4s}, [X], #32 +#endif +#else // COMPLEX +#if !defined(DOUBLE) + ld1 {v0.4s, v1.4s}, [X] + ld1 {v2.4s, v3.4s}, [Y] + st1 {v0.4s, v1.4s}, [Y], #32 + st1 {v2.4s, v3.4s}, [X], #32 + ld1 {v0.4s, v1.4s}, [X] + ld1 {v2.4s, v3.4s}, [Y] + st1 {v0.4s, v1.4s}, [Y], #32 + st1 {v2.4s, v3.4s}, [X], #32 +#else // DOUBLE + ld1 {v0.4s, v1.4s}, [X] + ld1 {v2.4s, v3.4s}, [Y] + st1 {v0.4s, v1.4s}, [Y], #32 + st1 {v2.4s, v3.4s}, [X], #32 + ld1 {v0.4s, v1.4s}, [X] + ld1 {v2.4s, v3.4s}, [Y] + st1 {v0.4s, v1.4s}, [Y], #32 + st1 {v2.4s, v3.4s}, [X], #32 + ld1 {v0.4s, v1.4s}, [X] + ld1 {v2.4s, v3.4s}, [Y] + st1 {v0.4s, v1.4s}, [Y], #32 + st1 {v2.4s, v3.4s}, [X], #32 + ld1 {v0.4s, v1.4s}, [X] + ld1 {v2.4s, v3.4s}, [Y] + st1 {v0.4s, v1.4s}, [Y], #32 + st1 {v2.4s, v3.4s}, [X], #32 +#endif +#endif + +.endm + +.macro INIT_S + +#if !defined(COMPLEX) +#if !defined(DOUBLE) + lsl INC_X, INC_X, #2 + lsl INC_Y, INC_Y, #2 +#else + lsl INC_X, INC_X, #3 + lsl INC_Y, INC_Y, #3 +#endif +#else +#if !defined(DOUBLE) + lsl INC_X, INC_X, #3 + lsl INC_Y, INC_Y, #3 +#else + lsl INC_X, INC_X, #4 + lsl INC_Y, INC_Y, #4 +#endif +#endif + +.endm + +.macro KERNEL_S1 + +#if !defined(COMPLEX) +#if !defined(DOUBLE) + ldr 
w10, [X] + ldr w11, [Y] + str w10, [Y] + str w11, [X] +#else + ldr x10, [X] + ldr x11, [Y] + str x10, [Y] + str x11, [X] +#endif +#else +#if !defined(DOUBLE) + ldr x10, [X] + ldr x11, [Y] + str x10, [Y] + str x11, [X] +#else + ldr x10, [X] + ldr x11, [Y] + str x10, [Y] + str x11, [X] + + ldr x12, [X, #8] + ldr x13, [Y, #8] + str x12, [Y, #8] + str x13, [X, #8] +#endif +#endif + add Y, Y, INC_Y + add X, X, INC_X +.endm + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + cmp N, xzr + ble swap_kernel_L999 + + cmp INC_X, #1 + bne swap_kernel_S_BEGIN + cmp INC_Y, #1 + bne swap_kernel_S_BEGIN + +swap_kernel_F_BEGIN: + + asr I, N, #3 + cmp I, xzr + beq swap_kernel_F1 + +swap_kernel_F8: + + KERNEL_F8 + + subs I, I, #1 + bne swap_kernel_F8 + +swap_kernel_F1: + + ands I, N, #7 + ble swap_kernel_L999 + +swap_kernel_F10: + + KERNEL_F1 + + subs I, I, #1 + bne swap_kernel_F10 + + b swap_kernel_L999 + + +swap_kernel_S_BEGIN: + + INIT_S + + asr I, N, #2 + cmp I, xzr + ble swap_kernel_S1 + +swap_kernel_S4: + + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + + subs I, I, #1 + bne swap_kernel_S4 + +swap_kernel_S1: + + ands I, N, #3 + ble swap_kernel_L999 + +swap_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne swap_kernel_S10 + +swap_kernel_L999: + + mov w0, wzr + ret + + EPILOGUE diff --git a/kernel/arm64/zamax.S b/kernel/arm64/zamax.S new file mode 100644 index 000000000..7db339f53 --- /dev/null +++ b/kernel/arm64/zamax.S @@ -0,0 +1,273 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N x0 /* vector length */ +#define X x1 /* X vector address */ +#define INC_X x2 /* X stride */ +#define I x5 /* loop variable */ + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +#if defined(USE_MIN) +#define COND le +#else +#define COND ge +#endif + +#if !defined(DOUBLE) +#define REG0 wzr +#define MAXF s0 +#define TMPF s1 +#define TMPVF {v1.s}[0] +#define SZ 4 +#else +#define REG0 xzr +#define MAXF d0 +#define TMPF d1 +#define TMPVF {v1.d}[0] +#define SZ 8 +#endif + +/******************************************************************************/ + +.macro INIT_F1 +#if !defined(DOUBLE) + ld1 {v0.2s}, [X], #8 + fabs v0.2s, v0.2s + ext v1.8b, v0.8b, v0.8b, #4 + fadd MAXF, s0, s1 +#else + ld1 {v0.2d}, [X], #16 + fabs v0.2d, v0.2d + faddp MAXF, v0.2d +#endif +.endm + +.macro KERNEL_F1 +#if !defined(DOUBLE) + ld1 {v1.2s}, [X], #8 + fabs v1.2s, v1.2s + ext v2.8b, v1.8b, v1.8b, #4 + fadd TMPF, s1, s2 +#else + ld1 {v1.2d}, [X], #16 + fabs v1.2d, v1.2d + faddp TMPF, v1.2d +#endif + fcmp MAXF, TMPF + fcsel MAXF, MAXF, TMPF, COND +.endm + +.macro INIT_F4 +#if !defined(DOUBLE) + ld2 {v0.4s,v1.4s}, [X], #32 + fabs v0.4s, v0.4s // [X6, X4, X2, X0] + fabs v1.4s, v1.4s // [X7, X5, X3, X1] + fadd v0.4s, v0.4s, v1.4s // [X7+X6, X5+X4, X3+X2, X1+X0] +#if defined(USE_MIN) + fminv MAXF, v0.4s +#else + fmaxv MAXF, v0.4s +#endif +#else // DOUBLE + ld4 {v0.2d,v1.2d,v2.2d,v3.2d}, [X], #64 + fabs v0.2d, v0.2d + fabs v1.2d, v1.2d + fabs v2.2d, v2.2d + fabs v3.2d, v3.2d + fadd v0.2d, v0.2d, v1.2d + fadd v2.2d, v2.2d, v3.2d +#if defined(USE_MIN) + fmin v0.2d, v0.2d, v2.2d + fminp MAXF, v0.2d +#else + fmax v0.2d, v0.2d, v2.2d + fmaxp MAXF, v0.2d +#endif +#endif +.endm + +.macro KERNEL_F4 +#if !defined(DOUBLE) + ld2 {v1.4s,v2.4s}, [X], #32 + fabs v1.4s, v1.4s // [X6, X4, X2, X0] + fabs v2.4s, v2.4s // [X7, X5, X3, X1] + fadd v1.4s, v1.4s, v2.4s // [X7+X6, X5+X4, X3+X2, X1+X0] +#if defined(USE_MIN) + fminv TMPF, v1.4s +#else + fmaxv TMPF, v1.4s +#endif +#else // DOUBLE + ld4 {v1.2d,v2.2d,v3.2d,v4.2d}, [X], #64 + fabs v1.2d, v1.2d + fabs v2.2d, v2.2d + fabs v3.2d, v3.2d + fabs v4.2d, v4.2d + fadd v1.2d, v1.2d, v2.2d + fadd v3.2d, v3.2d, v4.2d +#if defined(USE_MIN) + fmin v1.2d, v1.2d, v3.2d + fminp MAXF, v1.2d +#else + fmax v1.2d, v1.2d, v3.2d + fmaxp MAXF, v1.2d +#endif +#endif + fcmp MAXF, TMPF + fcsel MAXF, MAXF, TMPF, COND +.endm + +.macro INIT_S +#if !defined(DOUBLE) + lsl INC_X, INC_X, #3 + ld1 {v0.2s}, [X], INC_X + fabs v0.2s, v0.2s + ext v1.8b, v0.8b, v0.8b, #4 + fadd MAXF, s0, s1 +#else + lsl INC_X, INC_X, #4 + ld1 {v0.2d}, [X], INC_X + fabs v0.2d, v0.2d + faddp MAXF, v0.2d +#endif +.endm + +.macro KERNEL_S1 +#if !defined(DOUBLE) + ld1 {v1.2s}, [X], INC_X + fabs v1.2s, v1.2s + ext v2.8b, v1.8b, v1.8b, #4 + fadd TMPF, s1, s2 +#else + ld1 {v1.2d}, [X], INC_X + fabs v1.2d, v1.2d + faddp TMPF, v1.2d +#endif + fcmp MAXF, TMPF + fcsel MAXF, MAXF, TMPF, COND +.endm + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + cmp N, xzr + ble amax_kernel_zero + cmp INC_X, xzr + ble amax_kernel_zero + + cmp INC_X, #1 + bne amax_kernel_S_BEGIN + +amax_kernel_F_BEGIN: + + asr I, N, #2 + cmp I, xzr + beq 
amax_kernel_F1_INIT + + INIT_F4 + subs I, I, #1 + beq amax_kernel_F1 + +amax_kernel_F4: + + KERNEL_F4 + + subs I, I, #1 + bne amax_kernel_F4 + +amax_kernel_F1: + + ands I, N, #3 + ble amax_kernel_L999 + +amax_kernel_F10: + + KERNEL_F1 + + subs I, I, #1 + bne amax_kernel_F10 + + ret + +amax_kernel_F1_INIT: + + INIT_F1 + subs N, N, #1 + b amax_kernel_F1 + +amax_kernel_S_BEGIN: + + INIT_S + + subs N, N, #1 + ble amax_kernel_L999 + + asr I, N, #2 + cmp I, xzr + ble amax_kernel_S1 + +amax_kernel_S4: + + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + + subs I, I, #1 + bne amax_kernel_S4 + +amax_kernel_S1: + + ands I, N, #3 + ble amax_kernel_L999 + +amax_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne amax_kernel_S10 + +amax_kernel_L999: + + ret + +amax_kernel_zero: + + fmov MAXF, REG0 + ret + + EPILOGUE diff --git a/kernel/arm64/zasum.S b/kernel/arm64/zasum.S new file mode 100644 index 000000000..bf586d367 --- /dev/null +++ b/kernel/arm64/zasum.S @@ -0,0 +1,164 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N x0 /* vector length */ +#define X x1 /* X vector address */ +#define INC_X x2 /* X stride */ +#define I x5 /* loop variable */ + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +#define REG0 xzr +#define SUMF d0 +#define TMPF d1 +#define TMPVF {v1.d}[0] +#define SZ 8 + +/******************************************************************************/ + +.macro KERNEL_F1 + ld1 {v1.2d}, [X], #16 + fabs v1.2d, v1.2d + faddp TMPF, v1.2d + fadd SUMF, SUMF, TMPF +.endm + +.macro KERNEL_F4 + ld1 {v1.2d, v2.2d, v3.2d, v4.2d}, [X], #64 + fabs v1.2d, v1.2d + fabs v2.2d, v2.2d + fabs v3.2d, v3.2d + fabs v4.2d, v4.2d + + fadd v1.2d, v1.2d, v2.2d + fadd v3.2d, v3.2d, v4.2d + + fadd v0.2d, v0.2d, v1.2d + fadd v0.2d, v0.2d, v3.2d + + PRFM PLDL1KEEP, [X, #1024] +.endm + +.macro KERNEL_F4_FINALIZE + faddp SUMF, v0.2d +.endm + +.macro INIT_S + lsl INC_X, INC_X, #4 +.endm + +.macro KERNEL_S1 + ld1 {v1.2d}, [X], INC_X + fabs v1.2d, v1.2d + faddp TMPF, v1.2d + fadd SUMF, SUMF, TMPF +.endm + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + fmov SUMF, REG0 + + cmp N, xzr + ble asum_kernel_L999 + cmp INC_X, xzr + ble asum_kernel_L999 + + cmp INC_X, #1 + bne asum_kernel_S_BEGIN + +asum_kernel_F_BEGIN: + + asr I, N, #2 + cmp I, xzr + beq asum_kernel_F1 + +asum_kernel_F4: + + KERNEL_F4 + + subs I, I, #1 + bne asum_kernel_F4 + + KERNEL_F4_FINALIZE + +asum_kernel_F1: + + ands I, N, #3 + ble asum_kernel_L999 + +asum_kernel_F10: + + KERNEL_F1 + + subs I, I, #1 + bne asum_kernel_F10 + +asum_kernel_L999: + ret + +asum_kernel_S_BEGIN: + + INIT_S + + asr I, N, #2 + cmp I, xzr + ble asum_kernel_S1 + +asum_kernel_S4: + + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + + subs I, I, #1 + bne asum_kernel_S4 + +asum_kernel_S1: + + ands I, N, #3 + ble asum_kernel_L999 + +asum_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne asum_kernel_S10 + + ret + + EPILOGUE diff --git a/kernel/arm64/zaxpy.S b/kernel/arm64/zaxpy.S new file mode 100644 index 000000000..4cc952bbf --- /dev/null +++ b/kernel/arm64/zaxpy.S @@ -0,0 +1,301 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N x0 /* vector length */ +#define X x3 /* X vector address */ +#define INC_X x4 /* X stride */ +#define Y x5 /* Y vector address */ +#define INC_Y x6 /* Y stride */ +#define I x1 /* loop variable */ +#define Y_COPY x7 /* loop variable */ + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +#if !defined(DOUBLE) +#define DA_R s0 /* scale input value */ +#define DA_I s1 /* scale input value */ +#define TMPX v2.2s +#define TMPY v3.2s +#define SZ 4 +#else +#define DA_R d0 /* scale input value */ +#define DA_I d1 /* scale input value */ +#define TMPX v2.2d +#define TMPY v3.2d +#define SZ 8 +#endif + +/******************************************************************************/ + +.macro INIT + +#if !defined(CONJ) +#if !defined(DOUBLE) + ins v0.s[1], v0.s[0] // v0 = DA_R, DA_R + fneg s2, DA_I + ins v1.s[1], v2.s[0] // v1 = -DA_I, DA_I + ext v1.8b, v1.8b, v1.8b, #4 // v1 = DA_I, -DA_I +#else + ins v0.d[1], v0.d[0] // v0 = DA_R, DA_R + fneg d2, DA_I + ins v1.d[1], v2.d[0] // v1 = -DA_I, DA_I + ext v1.16b, v1.16b, v1.16b, #8 // v1 = DA_I, -DA_I +#endif +#else +#if !defined(DOUBLE) + fneg s2, DA_R + ins v0.s[1], v2.s[0] // v0 = -DA_R, DA_R + ins v1.s[1], v1.s[0] // v1 = DA_I, DA_I +#else + fneg d2, DA_R + ins v0.d[1], v2.d[0] // v0 = -DA_R, DA_R + ins v1.d[1], v1.d[0] // v1 = DA_I, DA_I +#endif +#endif + +.endm + +.macro KERNEL_F1 + +#if !defined(DOUBLE) + ld1 {v2.2s}, [X], #8 // V2 = X[ix+1], X[ix]; X += 2 + ld1 {v3.2s}, [Y] // V3 = Y[iy+1], Y[iy] + ext v4.8b, v2.8b, v2.8b, #4 // V4 = X[ix], X[ix+1] + fmla v3.2s, v0.2s, v2.2s // Y[iy] += DA_R * X[ix] + // Y[iy+1] += +-DA_R * X[ix+1] + fmla v3.2s, v1.2s, v4.2s // Y[iy] += +-DA_I * X[ix+1] + // Y[iy+1] += DA_I * X[ix] + st1 {v3.2s}, [Y], #8 +#else + ld1 {v2.2d}, [X], #16 // V2 = X[ix+1], X[ix]; X += 2 + ld1 {v3.2d}, [Y] // V3 = Y[iy+1], Y[iy] + ext v4.16b, v2.16b, v2.16b, #8 // V4 = X[ix], X[ix+1] + fmla v3.2d, v0.2d, v2.2d // Y[iy] += DA_R * X[ix] + // Y[iy+1] += +-DA_R * X[ix+1] + fmla v3.2d, v1.2d, v4.2d // Y[iy] += +-DA_I * X[ix+1] + // Y[iy+1] += DA_I * X[ix] + st1 {v3.2d}, [Y], #16 +#endif + +.endm + +.macro KERNEL_INIT_F4 + +#if !defined(DOUBLE) + // Replicate the lower 2 floats into the upper 2 slots + ins v0.d[1], v0.d[0] // v0 = DA_R, DA_R, DA_R, DA_R + ins v1.d[1], v1.d[0] // v1 = DA_I, DA_I, DA_I, DA_I +#endif + +.endm + +.macro KERNEL_F4 + +#if !defined(DOUBLE) + ld1 {v2.4s,v3.4s}, [X], #32 // V2 = X[3], X[2], X[1], X[0] + // V3 = X[7], X[6], X[5], X[4] + ext v6.8b, v2.8b, v2.8b, #4 // V6 = - , - , X[0], X[1] + ins v6.s[2], v2.s[3] // V6 = - , X[3], X[0], X[1] + ins v6.s[3], v2.s[2] // V6 = X[2], X[3], X[0], X[1] + + ld1 {v4.4s,v5.4s}, [Y] // V4 = Y[3], Y[2], Y[1], Y[0] + // V5 = Y[7], Y[6], Y[5], Y[4] + + ext v7.8b, v3.8b, v3.8b, #4 // V7 = - , - , X[4], 
X[5] + ins v7.s[2], v3.s[3] // V7 = - , X[7], X[4], X[5] + ins v7.s[3], v3.s[2] // V7 = X[6], X[7], X[4], X[5] + + fmla v4.4s, v0.4s, v2.4s // Y[iy] += DA_R * X[ix] + // Y[iy+1] += +-DA_R * X[ix+1] + fmla v4.4s, v1.4s, v6.4s // Y[iy] += +-DA_I * X[ix+1] + // Y[iy+1] += DA_I * X[ix] + st1 {v4.4s}, [Y], #16 + + fmla v5.4s, v0.4s, v3.4s // Y[iy] += DA_R * X[ix] + fmla v5.4s, v1.4s, v7.4s // Y[iy] += +-DA_I * X[ix+1] + // Y[iy+1] += +-DA_R * X[ix+1] + // Y[iy+1] += DA_I * X[ix] + st1 {v5.4s}, [Y], #16 +#else // DOUBLE + ld1 {v2.2d,v3.2d}, [X], #32 // CX0, CX1, CX2, CX3 + ext v20.16b, v2.16b, v2.16b, #8 // X[ix], X[ix+1] + ext v21.16b, v3.16b, v3.16b, #8 // X[ix], X[ix+1] + + ld1 {v4.2d,v5.2d}, [X], #32 // CX0, CX1, CX2, CX3 + ext v22.16b, v4.16b, v4.16b, #8 // X[ix], X[ix+1] + ext v23.16b, v5.16b, v5.16b, #8 // X[ix], X[ix+1] + + ld1 {v16.2d,v17.2d}, [Y_COPY], #32 // CY0, CY1, CY2, CY3 + + fmla v16.2d, v0.2d, v2.2d + fmla v17.2d, v0.2d, v3.2d + + ld1 {v18.2d,v19.2d}, [Y_COPY], #32 // CY0, CY1, CY2, CY3 + + fmla v16.2d, v1.2d, v20.2d + fmla v17.2d, v1.2d, v21.2d + st1 {v16.2d,v17.2d}, [Y], #32 + + fmla v18.2d, v0.2d, v4.2d + fmla v19.2d, v0.2d, v5.2d + fmla v18.2d, v1.2d, v22.2d + fmla v19.2d, v1.2d, v23.2d + st1 {v18.2d,v19.2d}, [Y], #32 +#endif + PRFM PLDL1KEEP, [X, #512] + PRFM PLDL1KEEP, [Y, #512] +.endm + +.macro INIT_S + +#if !defined(DOUBLE) + lsl INC_X, INC_X, #3 + lsl INC_Y, INC_Y, #3 +#else + lsl INC_X, INC_X, #4 + lsl INC_Y, INC_Y, #4 +#endif + +.endm + +.macro KERNEL_S1 + +#if !defined(DOUBLE) + ld1 {v2.2s}, [X], INC_X // V2 = X[ix+1], X[ix]; X += 2 + ld1 {v3.2s}, [Y] // V3 = Y[iy+1], Y[iy] + ext v4.8b, v2.8b, v2.8b, #4 // V4 = X[ix], X[ix+1] + fmla v3.2s, v0.2s, v2.2s // Y[iy] += DA_R * X[ix] + // Y[iy+1] += +-DA_R * X[ix+1] + fmla v3.2s, v1.2s, v4.2s // Y[iy] += +-DA_I * X[ix+1] + // Y[iy+1] += DA_I * X[ix] + st1 {v3.2s}, [Y], INC_Y +#else + ld1 {v2.2d}, [X], INC_X // V2 = X[ix+1], X[ix]; X += 2 + ld1 {v3.2d}, [Y] // V3 = Y[iy+1], Y[iy] + ext v4.16b, v2.16b, v2.16b, #8 // V4 = X[ix], X[ix+1] + fmla v3.2d, v0.2d, v2.2d // Y[iy] += DA_R * X[ix] + // Y[iy+1] += +-DA_R * X[ix+1] + fmla v3.2d, v1.2d, v4.2d // Y[iy] += +-DA_I * X[ix+1] + // Y[iy+1] += DA_I * X[ix] + st1 {v3.2d}, [Y], INC_Y +#endif + +.endm + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + cmp N, xzr + ble zaxpy_kernel_L999 + + mov Y_COPY, Y + + fcmp DA_R, #0.0 + bne .L1 + fcmp DA_I, #0.0 + beq zaxpy_kernel_L999 + +.L1: + INIT + + cmp INC_X, #1 + bne zaxpy_kernel_S_BEGIN + cmp INC_Y, #1 + bne zaxpy_kernel_S_BEGIN + +zaxpy_kernel_F_BEGIN: + + asr I, N, #2 + cmp I, xzr + beq zaxpy_kernel_F1 + + KERNEL_INIT_F4 + +zaxpy_kernel_F4: + + KERNEL_F4 + + subs I, I, #1 + bne zaxpy_kernel_F4 + +zaxpy_kernel_F1: + + ands I, N, #3 + ble zaxpy_kernel_L999 + +zaxpy_kernel_F10: + + KERNEL_F1 + + subs I, I, #1 + bne zaxpy_kernel_F10 + + mov w0, wzr + ret + +zaxpy_kernel_S_BEGIN: + + INIT_S + + asr I, N, #2 + cmp I, xzr + ble zaxpy_kernel_S1 + +zaxpy_kernel_S4: + + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + + subs I, I, #1 + bne zaxpy_kernel_S4 + +zaxpy_kernel_S1: + + ands I, N, #3 + ble zaxpy_kernel_L999 + +zaxpy_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne zaxpy_kernel_S10 + +zaxpy_kernel_L999: + + mov w0, wzr + ret diff --git a/kernel/arm64/zdot.S b/kernel/arm64/zdot.S new file mode 100644 index 000000000..3e8e3d7d9 --- /dev/null +++ b/kernel/arm64/zdot.S @@ -0,0 
+1,302 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N x0 /* vector length */ +#define X x1 /* X vector address */ +#define INC_X x2 /* X stride */ +#define Y x3 /* Y vector address */ +#define INC_Y x4 /* Y stride */ +#define I x5 /* loop variable */ + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +#if !defined(DOUBLE) +#if !defined(DSDOT) +#define REG0 wzr +#define DOTF s0 +#else // DSDOT +#define REG0 xzr +#define DOTF d0 +#endif +#define DOTI s1 +#define TMPX s2 +#define LD1VX {v2.s}[0] +#define TMPY s3 +#define LD1VY {v3.s}[0] +#define TMPVY v3.s[0] +#define SZ 4 +#else +#define REG0 xzr +#define DOTF d0 +#define DOTI d1 +#define TMPX d2 +#define LD1VX {v2.d}[0] +#define TMPY d3 +#define LD1VY {v3.d}[0] +#define TMPVY v3.d[0] +#define SZ 8 +#endif + +/******************************************************************************/ + +.macro KERNEL_F1 + +#if !defined(DOUBLE) + ld1 {v2.2s}, [X], #8 // V2 = X[ix+1], X[ix]; X += 2 + ld1 {v3.2s}, [Y], #8 // V3 = Y[iy+1], Y[iy]; Y += 2 + ins v4.s[0], v2.s[1] // V4 = X[ix+1] +#if !defined(CONJ) + fmla DOTF, s2, v3.s[0] // dot[0] += X[ix] * Y[iy] + fmls DOTF, s4, v3.s[1] // dot[0] -= X[ix+1] * Y[iy+1] + fmla DOTI, s4, v3.s[0] // dot[1] += X[ix+1] * Y[iy] + fmla DOTI, s2, v3.s[1] // dot[1] += X[ix] * Y[iy+1] +#else + fmla DOTF, s2, v3.s[0] // dot[0] += X[ix] * Y[iy] + fmla DOTF, s4, v3.s[1] // dot[0] += X[ix+1] * Y[iy+1] + fmls DOTI, s4, v3.s[0] // dot[1] -= X[ix+1] * Y[iy] + fmla DOTI, s2, v3.s[1] // dot[1] += X[ix] * Y[iy+1] +#endif +#else // DOUBLE + ld1 {v2.2d}, [X], #16 // V2 = X[ix+1], X[ix]; X += 2 + ld1 {v3.2d}, [Y], #16 // V3 = Y[iy+1], Y[iy]; Y += 2 + ins v4.d[0], v2.d[1] // V4 = X[ix+1] +#if !defined(CONJ) + fmla DOTF, d2, v3.d[0] // dot[0] += 
X[ix] * Y[iy] + fmls DOTF, d4, v3.d[1] // dot[0] -= X[ix+1] * Y[iy+1] + fmla DOTI, d4, v3.d[0] // dot[1] += X[ix+1] * Y[iy] + fmla DOTI, d2, v3.d[1] // dot[1] += X[ix] * Y[iy+1] +#else + fmla DOTF, d2, v3.d[0] // dot[0] += X[ix] * Y[iy] + fmla DOTF, d4, v3.d[1] // dot[0] += X[ix+1] * Y[iy+1] + fmls DOTI, d4, v3.d[0] // dot[1] -= X[ix+1] * Y[iy] + fmla DOTI, d2, v3.d[1] // dot[1] += X[ix] * Y[iy+1] +#endif +#endif + +.endm + + +.macro KERNEL_F4 + +#if !defined(DOUBLE) + ld2 {v2.4s, v3.4s}, [X], #32 // V2 = X[ix+1], X[ix]; X += 2 + ld2 {v4.4s, v5.4s}, [Y], #32 // V2 = X[ix+1], X[ix]; X += 2 + + fmla v0.4s, v2.4s, v4.4s // dot[0] += X[ix] * Y[iy] + fmla v1.4s, v2.4s, v5.4s // dot[1] += X[ix] * Y[iy+1] + PRFM PLDL1KEEP, [X, #1024] + PRFM PLDL1KEEP, [Y, #1024] +#if !defined(CONJ) + fmls v0.4s, v3.4s, v5.4s // dot[0] -= X[ix+1] * Y[iy+1] + fmla v1.4s, v3.4s, v4.4s // dot[1] += X[ix+1] * Y[iy] +#else + fmla v0.4s, v3.4s, v5.4s // dot[0] += X[ix+1] * Y[iy+1] + fmls v1.4s, v3.4s, v4.4s // dot[1] -= X[ix+1] * Y[iy] +#endif +#else // DOUBLE + ld2 {v2.2d, v3.2d}, [X], #32 // V2 = X[ix+1], X[ix]; X += 2 + ld2 {v16.2d, v17.2d}, [Y], #32 + + fmla v0.2d, v2.2d, v16.2d // dot[0] += X[ix] * Y[iy] + fmla v1.2d, v2.2d, v17.2d // dot[1] += X[ix] * Y[iy+1] + ld2 {v4.2d, v5.2d}, [X], #32 + ld2 {v18.2d, v19.2d}, [Y], #32 + fmla v0.2d, v4.2d, v18.2d // dot[1] += X[ix] * Y[iy+1] + fmla v1.2d, v4.2d, v19.2d // dot[1] += X[ix] * Y[iy+1] + PRFM PLDL1KEEP, [X, #1024] + PRFM PLDL1KEEP, [Y, #1024] +#if !defined(CONJ) + fmls v0.2d, v3.2d, v17.2d // dot[0] -= X[ix+1] * Y[iy+1] + fmls v20.2d, v5.2d, v19.2d // dot[0] -= X[ix+1] * Y[iy+1] + fmla v1.2d, v3.2d, v16.2d // dot[1] += X[ix+1] * Y[iy] + fmla v21.2d, v5.2d, v18.2d // dot[1] += X[ix+1] * Y[iy] +#else + fmla v0.2d, v3.2d, v17.2d // dot[0] += X[ix+1] * Y[iy+1] + fmla v20.2d, v5.2d, v19.2d // dot[0] += X[ix+1] * Y[iy+1] + fmls v1.2d, v3.2d, v16.2d // dot[1] -= X[ix+1] * Y[iy] + fmls v21.2d, v5.2d, v18.2d // dot[1] -= X[ix+1] * Y[iy] +#endif +#endif + +.endm + +.macro KERNEL_F4_FINALIZE +#if !defined(DOUBLE) + ext v2.16b, v0.16b, v0.16b, #8 + fadd v0.2s, v0.2s, v2.2s + faddp DOTF, v0.2s + ext v3.16b, v1.16b, v1.16b, #8 + fadd v1.2s, v1.2s, v3.2s + faddp DOTI, v1.2s +#else + fadd v0.2d, v0.2d, v20.2d + faddp DOTF, v0.2d + fadd v1.2d, v1.2d, v21.2d + faddp DOTI, v1.2d +#endif +.endm + +.macro INIT_S + +#if !defined(DOUBLE) + lsl INC_X, INC_X, #3 + lsl INC_Y, INC_Y, #3 +#else + lsl INC_X, INC_X, #4 + lsl INC_Y, INC_Y, #4 +#endif + +.endm + +.macro KERNEL_S1 +#if !defined(DOUBLE) + ld1 {v2.2s}, [X], INC_X // V2 = X[ix+1], X[ix]; X += 2 + ld1 {v3.2s}, [Y], INC_Y // V3 = Y[iy+1], Y[iy]; Y += 2 + ext v4.8b, v2.8b, v2.8b, #4 // V4 = X[ix], X[ix+1] +#if !defined(CONJ) + fmla DOTF, s2, v3.s[0] // dot[0] += X[ix] * Y[iy] + fmls DOTF, s4, v3.s[1] // dot[0] -= X[ix+1] * Y[iy+1] + fmla DOTI, s4, v3.s[0] // dot[1] += X[ix+1] * Y[iy] + fmla DOTI, s2, v3.s[1] // dot[1] += X[ix] * Y[iy+1] +#else + fmla DOTF, s2, v3.s[0] // dot[0] += X[ix] * Y[iy] + fmla DOTF, s4, v3.s[1] // dot[0] += X[ix+1] * Y[iy+1] + fmls DOTI, s4, v3.s[0] // dot[1] -= X[ix+1] * Y[iy] + fmla DOTI, s2, v3.s[1] // dot[1] += X[ix] * Y[iy+1] +#endif +#else // DOUBLE + ld1 {v2.2d}, [X], INC_X // V2 = X[ix+1], X[ix]; X += 2 + ld1 {v3.2d}, [Y], INC_Y // V3 = Y[iy+1], Y[iy]; Y += 2 + ext v4.16b, v2.16b, v2.16b, #8 // V4 = X[ix], X[ix+1] +#if !defined(CONJ) + fmla DOTF, d2, v3.d[0] // dot[0] += X[ix] * Y[iy] + fmls DOTF, d4, v3.d[1] // dot[0] -= X[ix+1] * Y[iy+1] + fmla DOTI, d4, v3.d[0] // dot[1] += X[ix+1] * Y[iy] + fmla DOTI, 
d2, v3.d[1] // dot[1] += X[ix] * Y[iy+1] +#else + fmla DOTF, d2, v3.d[0] // dot[0] += X[ix] * Y[iy] + fmla DOTF, d4, v3.d[1] // dot[0] += X[ix+1] * Y[iy+1] + fmls DOTI, d4, v3.d[0] // dot[1] -= X[ix+1] * Y[iy] + fmla DOTI, d2, v3.d[1] // dot[1] += X[ix] * Y[iy+1] +#endif +#endif + +.endm + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + fmov DOTF, REG0 + fmov DOTI, DOTF +#if !defined(DOUBLE) + fmov s20, DOTF + fmov s21, DOTI +#else + fmov d20, DOTF + fmov d21, DOTI +#endif + + cmp N, xzr + ble dot_kernel_L999 + + cmp INC_X, #1 + bne dot_kernel_S_BEGIN + cmp INC_Y, #1 + bne dot_kernel_S_BEGIN + +dot_kernel_F_BEGIN: + + asr I, N, #2 + cmp I, xzr + beq dot_kernel_F1 + +dot_kernel_F4: + + KERNEL_F4 + + subs I, I, #1 + bne dot_kernel_F4 + + KERNEL_F4_FINALIZE + +dot_kernel_F1: + + ands I, N, #3 + ble dot_kernel_L999 + +dot_kernel_F10: + + KERNEL_F1 + + subs I, I, #1 + bne dot_kernel_F10 + + ret + +dot_kernel_S_BEGIN: + + INIT_S + + asr I, N, #2 + cmp I, xzr + ble dot_kernel_S1 + +dot_kernel_S4: + + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + + subs I, I, #1 + bne dot_kernel_S4 + +dot_kernel_S1: + + ands I, N, #3 + ble dot_kernel_L999 + +dot_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne dot_kernel_S10 + +dot_kernel_L999: + + ret + + EPILOGUE diff --git a/kernel/arm64/zgemm_kernel_4x4.S b/kernel/arm64/zgemm_kernel_4x4.S new file mode 100644 index 000000000..56a8bbac6 --- /dev/null +++ b/kernel/arm64/zgemm_kernel_4x4.S @@ -0,0 +1,1617 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +/* X0 X1 X2 s0 X3 x4 x5 x6 */ +/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc */ + +#define origM x0 +#define origN x1 +#define origK x2 +#define origPA x3 +#define origPB x4 +#define pC x5 +#define LDC x6 +#define temp x7 +#define counterL x8 +#define counterI x9 +#define counterJ x10 +#define pB x11 +#define pCRow0 x12 +#define pCRow1 x13 +#define pCRow2 x14 +#define pA x15 +#define alpha_save_R x16 +#define alpha_save_I x17 + +#define alpha0_R d10 +#define alphaV0_R v10.d[0] +#define alpha0_I d11 +#define alphaV0_I v11.d[0] + +#define alpha1_R d14 +#define alphaV1_R v14.d[0] +#define alpha1_I d15 +#define alphaV1_I v15.d[0] + + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define OP_rr fmla +#define OP_ii fmls +#define OP_ri fmla +#define OP_ir fmla +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define OP_rr fmla +#define OP_ii fmla +#define OP_ri fmls +#define OP_ir fmla +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define OP_rr fmla +#define OP_ii fmla +#define OP_ri fmla +#define OP_ir fmls +#elif defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define OP_rr fmla +#define OP_ii fmls +#define OP_ri fmls +#define OP_ir fmls +#endif + +// 00 origM +// 01 origN +// 02 origK +// 03 origPA +// 04 origPB +// 05 pC +// 06 origLDC -> LDC +// 07 offset -> temp +// 08 counterL +// 09 counterI +// 10 counterJ +// 11 pB +// 12 pCRow0 +// 13 pCRow1 +// 14 pCRow2 +// 15 pA +// 16 alpha_save_R +// 17 alpha_save_I +// 18 must save +// 19 must save +// 20 must save +// 21 must save +// 22 must save +// 23 must save +// 24 must save +// 25 must save +// 26 must save +// 27 must save +// 28 must save +// 29 frame +// 30 link +// 31 sp + +//v00 ALPHA_R -> pA00_R, pA01_R +//v01 ALPHA_I -> pA00_I, pA01_I +//v02 pA02_R, pA03_R +//v03 pA02_I, pA03_I +//v04 pA10_R, pA11_R +//v05 pA10_I, pA11_I +//v06 pA12_R, pA13_R +//v07 pA12_I, pA13_I +//v08 must save pB00_R, pB01_R +//v09 must save pB00_I, pB01_I +//v10 must save pB02_R, pB03_R OR ALPHA0_R +//v11 must save pB02_I, pB03_I OR ALPHA0_I +//v12 must save pB10_R, pB11_R +//v13 must save pB10_I, pB11_I +//v14 must save pB12_R, pB13_R OR ALPHA1_R +//v15 must save pB12_I, pB13_I OR ALPHA1_R +//v16 must save pC00_R, pC01_R +//v17 must save pC00_I, pC01_I +//v18 pC02_R, pC03_R +//v19 pC02_I, pC03_I +//v20 pC10_R, pC11_R +//v21 pC10_I, pC11_I +//v22 pC12_R, pC13_R +//v23 pC12_I, pC13_I +//v24 pC20_R, pC21_R +//v25 pC20_I, pC21_I +//v26 pC22_R, pC23_R +//v27 pC22_I, pC23_I +//v28 pC30_R, pC31_R +//v29 pC30_I, pC31_I +//v30 pC32_R, pC33_R +//v31 pC32_I, pC33_I + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +.macro INIT4x4 + fmov d16, xzr + fmov d17, d16 + fmov d18, d17 + fmov d19, d16 + fmov d20, d17 + fmov d21, d16 + fmov d22, d17 + fmov d23, d16 + fmov d24, d17 + fmov d25, d16 + fmov d26, d17 + fmov d27, d16 + fmov d28, d17 + fmov d29, d16 + fmov d30, d17 + fmov d31, d16 +.endm + +.macro KERNEL4x4_I + ld2 {v8.2d, v9.2d}, [pB] + add pB, pB, #32 + ld2 {v10.2d, v11.2d}, [pB] + add pB, pB, #32 + ld2 {v0.2d, v1.2d}, [pA] + add pA, pA, #32 + ld2 {v2.2d, v3.2d}, [pA] + add pA, pA, #32 + + fmul v16.2d, v0.2d, v8.2d[0] + OP_ii v16.2d, v1.2d, v9.2d[0] + fmul v17.2d, v0.2d, v9.2d[0] +#if 
defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + fneg v17.2d, v17.2d +#endif + OP_ir v17.2d, v1.2d, v8.2d[0] + + fmul v18.2d, v2.2d, v8.2d[0] + OP_ii v18.2d, v3.2d, v9.2d[0] + fmul v19.2d, v2.2d, v9.2d[0] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + fneg v19.2d, v19.2d +#endif + OP_ir v19.2d, v3.2d, v8.2d[0] + + fmul v20.2d, v0.2d, v8.2d[1] + OP_ii v20.2d, v1.2d, v9.2d[1] + fmul v21.2d, v0.2d, v9.2d[1] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + fneg v21.2d, v21.2d +#endif + OP_ir v21.2d, v1.2d, v8.2d[1] + + fmul v22.2d, v2.2d, v8.2d[1] + OP_ii v22.2d, v3.2d, v9.2d[1] + fmul v23.2d, v2.2d, v9.2d[1] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + fneg v23.2d, v23.2d +#endif + OP_ir v23.2d, v3.2d, v8.2d[1] + + fmul v24.2d, v0.2d, v10.2d[0] + OP_ii v24.2d, v1.2d, v11.2d[0] + fmul v25.2d, v0.2d, v11.2d[0] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + fneg v25.2d, v25.2d +#endif + OP_ir v25.2d, v1.2d, v10.2d[0] + + fmul v26.2d, v2.2d, v10.2d[0] + OP_ii v26.2d, v3.2d, v11.2d[0] + fmul v27.2d, v2.2d, v11.2d[0] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + fneg v27.2d, v27.2d +#endif + OP_ir v27.2d, v3.2d, v10.2d[0] + + fmul v28.2d, v0.2d, v10.2d[1] + OP_ii v28.2d, v1.2d, v11.2d[1] + fmul v29.2d, v0.2d, v11.2d[1] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + fneg v29.2d, v29.2d +#endif + OP_ir v29.2d, v1.2d, v10.2d[1] + + fmul v30.2d, v2.2d, v10.2d[1] + OP_ii v30.2d, v3.2d, v11.2d[1] + fmul v31.2d, v2.2d, v11.2d[1] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + fneg v31.2d, v31.2d +#endif + OP_ir v31.2d, v3.2d, v10.2d[1] + + ld2 {v12.2d, v13.2d}, [pB] + add pB, pB, #32 + ld2 {v14.2d, v15.2d}, [pB] + add pB, pB, #32 + ld2 {v4.2d, v5.2d} , [pA] + add pA, pA, #32 + ld2 {v6.2d, v7.2d} , [pA] + add pA, pA, #32 +.endm + +.macro KERNEL4x4_M1 + OP_rr v16.2d, v0.2d, v8.2d[0] + OP_ii v16.2d, v1.2d, v9.2d[0] + OP_ri v17.2d, v0.2d, v9.2d[0] + OP_ir v17.2d, v1.2d, v8.2d[0] + + ld2 {v12.2d, v13.2d}, [pB] // For next round + add pB, pB, #32 + + OP_rr v18.2d, v2.2d, v8.2d[0] + OP_ii v18.2d, v3.2d, v9.2d[0] + OP_ri v19.2d, v2.2d, v9.2d[0] + OP_ir v19.2d, v3.2d, v8.2d[0] + + ld2 {v14.2d, v15.2d}, [pB] // For next round + add pB, pB, #32 + + OP_rr v20.2d, v0.2d, v8.2d[1] + OP_ii v20.2d, v1.2d, v9.2d[1] + OP_ri v21.2d, v0.2d, v9.2d[1] + OP_ir v21.2d, v1.2d, v8.2d[1] + + ld2 {v4.2d, v5.2d} , [pA] // For next round + add pA, pA, #32 + + OP_rr v22.2d, v2.2d, v8.2d[1] + OP_ii v22.2d, v3.2d, v9.2d[1] + OP_ri v23.2d, v2.2d, v9.2d[1] + OP_ir v23.2d, v3.2d, v8.2d[1] + + ld2 {v6.2d, v7.2d} , [pA] // For next round + add pA, pA, #32 + + OP_rr v24.2d, v0.2d, v10.2d[0] + OP_ii v24.2d, v1.2d, v11.2d[0] + OP_ri v25.2d, v0.2d, v11.2d[0] + OP_ir v25.2d, v1.2d, v10.2d[0] + + prfm PLDL1KEEP, [pA, #512] + + OP_rr v26.2d, v2.2d, v10.2d[0] + OP_ii v26.2d, v3.2d, v11.2d[0] + OP_ri v27.2d, v2.2d, v11.2d[0] + OP_ir v27.2d, v3.2d, v10.2d[0] + + prfm PLDL1KEEP, [pB, #512] + + OP_rr v28.2d, v0.2d, v10.2d[1] 
+ OP_ii v28.2d, v1.2d, v11.2d[1] + OP_ri v29.2d, v0.2d, v11.2d[1] + OP_ir v29.2d, v1.2d, v10.2d[1] + + OP_rr v30.2d, v2.2d, v10.2d[1] + OP_ii v30.2d, v3.2d, v11.2d[1] + OP_ri v31.2d, v2.2d, v11.2d[1] + OP_ir v31.2d, v3.2d, v10.2d[1] +.endm + +.macro KERNEL4x4_M2 + OP_rr v16.2d, v4.2d, v12.2d[0] + OP_ii v16.2d, v5.2d, v13.2d[0] + OP_ri v17.2d, v4.2d, v13.2d[0] + OP_ir v17.2d, v5.2d, v12.2d[0] + + ld2 {v8.2d, v9.2d}, [pB] // For next round + add pB, pB, #32 + + OP_rr v18.2d, v6.2d, v12.2d[0] + OP_ii v18.2d, v7.2d, v13.2d[0] + OP_ri v19.2d, v6.2d, v13.2d[0] + OP_ir v19.2d, v7.2d, v12.2d[0] + + ld2 {v10.2d, v11.2d}, [pB] // For next round + add pB, pB, #32 + + OP_rr v20.2d, v4.2d, v12.2d[1] + OP_ii v20.2d, v5.2d, v13.2d[1] + OP_ri v21.2d, v4.2d, v13.2d[1] + OP_ir v21.2d, v5.2d, v12.2d[1] + + ld2 {v0.2d, v1.2d}, [pA] // For next round + add pA, pA, #32 + + OP_rr v22.2d, v6.2d, v12.2d[1] + OP_ii v22.2d, v7.2d, v13.2d[1] + OP_ri v23.2d, v6.2d, v13.2d[1] + OP_ir v23.2d, v7.2d, v12.2d[1] + + ld2 {v2.2d, v3.2d}, [pA] // For next round + add pA, pA, #32 + + OP_rr v24.2d, v4.2d, v14.2d[0] + OP_ii v24.2d, v5.2d, v15.2d[0] + OP_ri v25.2d, v4.2d, v15.2d[0] + OP_ir v25.2d, v5.2d, v14.2d[0] + + prfm PLDL1KEEP, [pA, #512] + + OP_rr v26.2d, v6.2d, v14.2d[0] + OP_ii v26.2d, v7.2d, v15.2d[0] + OP_ri v27.2d, v6.2d, v15.2d[0] + OP_ir v27.2d, v7.2d, v14.2d[0] + + prfm PLDL1KEEP, [pB, #512] + + OP_rr v28.2d, v4.2d, v14.2d[1] + OP_ii v28.2d, v5.2d, v15.2d[1] + OP_ri v29.2d, v4.2d, v15.2d[1] + OP_ir v29.2d, v5.2d, v14.2d[1] + + OP_rr v30.2d, v6.2d, v14.2d[1] + OP_ii v30.2d, v7.2d, v15.2d[1] + OP_ri v31.2d, v6.2d, v15.2d[1] + OP_ir v31.2d, v7.2d, v14.2d[1] +.endm + +.macro KERNEL4x4_E + OP_rr v16.2d, v4.2d, v12.2d[0] + OP_ii v16.2d, v5.2d, v13.2d[0] + OP_ri v17.2d, v4.2d, v13.2d[0] + OP_ir v17.2d, v5.2d, v12.2d[0] + + OP_rr v18.2d, v6.2d, v12.2d[0] + OP_ii v18.2d, v7.2d, v13.2d[0] + OP_ri v19.2d, v6.2d, v13.2d[0] + OP_ir v19.2d, v7.2d, v12.2d[0] + + OP_rr v20.2d, v4.2d, v12.2d[1] + OP_ii v20.2d, v5.2d, v13.2d[1] + OP_ri v21.2d, v4.2d, v13.2d[1] + OP_ir v21.2d, v5.2d, v12.2d[1] + + OP_rr v22.2d, v6.2d, v12.2d[1] + OP_ii v22.2d, v7.2d, v13.2d[1] + OP_ri v23.2d, v6.2d, v13.2d[1] + OP_ir v23.2d, v7.2d, v12.2d[1] + + OP_rr v24.2d, v4.2d, v14.2d[0] + OP_ii v24.2d, v5.2d, v15.2d[0] + OP_ri v25.2d, v4.2d, v15.2d[0] + OP_ir v25.2d, v5.2d, v14.2d[0] + + OP_rr v26.2d, v6.2d, v14.2d[0] + OP_ii v26.2d, v7.2d, v15.2d[0] + OP_ri v27.2d, v6.2d, v15.2d[0] + OP_ir v27.2d, v7.2d, v14.2d[0] + + OP_rr v28.2d, v4.2d, v14.2d[1] + OP_ii v28.2d, v5.2d, v15.2d[1] + OP_ri v29.2d, v4.2d, v15.2d[1] + OP_ir v29.2d, v5.2d, v14.2d[1] + + OP_rr v30.2d, v6.2d, v14.2d[1] + OP_ii v30.2d, v7.2d, v15.2d[1] + OP_ri v31.2d, v6.2d, v15.2d[1] + OP_ir v31.2d, v7.2d, v14.2d[1] +.endm + +.macro KERNEL4x4_SUB + ld2 {v8.2d, v9.2d}, [pB] + add pB, pB, #32 + ld2 {v10.2d, v11.2d}, [pB] + add pB, pB, #32 + ld2 {v0.2d, v1.2d}, [pA] + add pA, pA, #32 + ld2 {v2.2d, v3.2d}, [pA] + add pA, pA, #32 + + OP_rr v16.2d, v0.2d, v8.2d[0] + OP_ii v16.2d, v1.2d, v9.2d[0] + OP_ri v17.2d, v0.2d, v9.2d[0] + OP_ir v17.2d, v1.2d, v8.2d[0] + + OP_rr v18.2d, v2.2d, v8.2d[0] + OP_ii v18.2d, v3.2d, v9.2d[0] + OP_ri v19.2d, v2.2d, v9.2d[0] + OP_ir v19.2d, v3.2d, v8.2d[0] + + OP_rr v20.2d, v0.2d, v8.2d[1] + OP_ii v20.2d, v1.2d, v9.2d[1] + OP_ri v21.2d, v0.2d, v9.2d[1] + OP_ir v21.2d, v1.2d, v8.2d[1] + + OP_rr v22.2d, v2.2d, v8.2d[1] + OP_ii v22.2d, v3.2d, v9.2d[1] + OP_ri v23.2d, v2.2d, v9.2d[1] + OP_ir v23.2d, v3.2d, v8.2d[1] + + OP_rr v24.2d, v0.2d, v10.2d[0] + OP_ii v24.2d, v1.2d, 
v11.2d[0] + OP_ri v25.2d, v0.2d, v11.2d[0] + OP_ir v25.2d, v1.2d, v10.2d[0] + + OP_rr v26.2d, v2.2d, v10.2d[0] + OP_ii v26.2d, v3.2d, v11.2d[0] + OP_ri v27.2d, v2.2d, v11.2d[0] + OP_ir v27.2d, v3.2d, v10.2d[0] + + OP_rr v28.2d, v0.2d, v10.2d[1] + OP_ii v28.2d, v1.2d, v11.2d[1] + OP_ri v29.2d, v0.2d, v11.2d[1] + OP_ir v29.2d, v1.2d, v10.2d[1] + + OP_rr v30.2d, v2.2d, v10.2d[1] + OP_ii v30.2d, v3.2d, v11.2d[1] + OP_ri v31.2d, v2.2d, v11.2d[1] + OP_ir v31.2d, v3.2d, v10.2d[1] +.endm + +.macro SAVE4x4 + fmov alpha0_R, alpha_save_R + fmov alpha0_I, alpha_save_I + fmov alpha1_R, alpha0_R + fmov alpha1_I, alpha0_I + + mov pCRow1, pCRow0 + + ld2 {v0.2d, v1.2d}, [pCRow1] + fmla v0.2d, v16.2d, alphaV0_R + fmls v0.2d, v17.2d, alphaV0_I + fmla v1.2d, v16.2d, alphaV1_I + fmla v1.2d, v17.2d, alphaV1_R + st2 {v0.2d, v1.2d}, [pCRow1] + add pCRow2, pCRow1, #32 + ld2 {v2.2d, v3.2d}, [pCRow2] + fmla v2.2d, v18.2d, alphaV0_R + fmls v2.2d, v19.2d, alphaV0_I + fmla v3.2d, v18.2d, alphaV1_I + fmla v3.2d, v19.2d, alphaV1_R + st2 {v2.2d, v3.2d}, [pCRow2] + + add pCRow1, pCRow1, LDC + ld2 {v4.2d, v5.2d}, [pCRow1] + fmla v4.2d, v20.2d, alphaV0_R + fmls v4.2d, v21.2d, alphaV0_I + fmla v5.2d, v20.2d, alphaV1_I + fmla v5.2d, v21.2d, alphaV1_R + st2 {v4.2d, v5.2d}, [pCRow1] + add pCRow2, pCRow1, #32 + ld2 {v6.2d, v7.2d}, [pCRow2] + fmla v6.2d, v22.2d, alphaV0_R + fmls v6.2d, v23.2d, alphaV0_I + fmla v7.2d, v22.2d, alphaV1_I + fmla v7.2d, v23.2d, alphaV1_R + st2 {v6.2d, v7.2d}, [pCRow2] + + add pCRow1, pCRow1, LDC + ld2 {v0.2d, v1.2d}, [pCRow1] + fmla v0.2d, v24.2d, alphaV0_R + fmls v0.2d, v25.2d, alphaV0_I + fmla v1.2d, v24.2d, alphaV1_I + fmla v1.2d, v25.2d, alphaV1_R + st2 {v0.2d, v1.2d}, [pCRow1] + add pCRow2, pCRow1, #32 + ld2 {v2.2d, v3.2d}, [pCRow2] + fmla v2.2d, v26.2d, alphaV0_R + fmls v2.2d, v27.2d, alphaV0_I + fmla v3.2d, v26.2d, alphaV1_I + fmla v3.2d, v27.2d, alphaV1_R + st2 {v2.2d, v3.2d}, [pCRow2] + + add pCRow1, pCRow1, LDC + + ld2 {v4.2d, v5.2d}, [pCRow1] + fmla v4.2d, v28.2d, alphaV0_R + fmls v4.2d, v29.2d, alphaV0_I + fmla v5.2d, v28.2d, alphaV1_I + fmla v5.2d, v29.2d, alphaV1_R + st2 {v4.2d, v5.2d}, [pCRow1] + add pCRow2, pCRow1, #32 + ld2 {v6.2d, v7.2d}, [pCRow2] + fmla v6.2d, v30.2d, alphaV0_R + fmls v6.2d, v31.2d, alphaV0_I + fmla v7.2d, v30.2d, alphaV1_I + fmla v7.2d, v31.2d, alphaV1_R + st2 {v6.2d, v7.2d}, [pCRow2] + + add pCRow0, pCRow0, #64 +.endm + +/******************************************************************************/ + +.macro INIT2x4 + fmov d16, xzr + fmov d17, xzr + fmov d20, d16 + fmov d21, d17 + fmov d24, d16 + fmov d25, d17 + fmov d28, d16 + fmov d29, d17 +.endm + +.macro KERNEL2x4_SUB + ld2 {v8.2d, v9.2d}, [pB] + add pB, pB, #32 + ld2 {v10.2d, v11.2d}, [pB] + add pB, pB, #32 + + ld2 {v0.2d, v1.2d}, [pA] + add pA, pA, #32 + + OP_rr v16.2d, v0.2d, v8.2d[0] + OP_ii v16.2d, v1.2d, v9.2d[0] + OP_ri v17.2d, v0.2d, v9.2d[0] + OP_ir v17.2d, v1.2d, v8.2d[0] + + OP_rr v20.2d, v0.2d, v8.2d[1] + OP_ii v20.2d, v1.2d, v9.2d[1] + OP_ri v21.2d, v0.2d, v9.2d[1] + OP_ir v21.2d, v1.2d, v8.2d[1] + + OP_rr v24.2d, v0.2d, v10.2d[0] + OP_ii v24.2d, v1.2d, v11.2d[0] + OP_ri v25.2d, v0.2d, v11.2d[0] + OP_ir v25.2d, v1.2d, v10.2d[0] + + OP_rr v28.2d, v0.2d, v10.2d[1] + OP_ii v28.2d, v1.2d, v11.2d[1] + OP_ri v29.2d, v0.2d, v11.2d[1] + OP_ir v29.2d, v1.2d, v10.2d[1] +.endm + +.macro SAVE2x4 + fmov alpha0_R, alpha_save_R + fmov alpha0_I, alpha_save_I + fmov alpha1_R, alpha0_R + fmov alpha1_I, alpha0_I + + mov pCRow1, pCRow0 + + ld2 {v0.2d, v1.2d}, [pCRow1] + fmla v0.2d, v16.2d, alphaV0_R + fmls v0.2d, 
v17.2d, alphaV0_I + fmla v1.2d, v16.2d, alphaV1_I + fmla v1.2d, v17.2d, alphaV1_R + st2 {v0.2d, v1.2d}, [pCRow1] + + add pCRow1, pCRow1, LDC + + ld2 {v4.2d, v5.2d}, [pCRow1] + fmla v4.2d, v20.2d, alphaV0_R + fmls v4.2d, v21.2d, alphaV0_I + fmla v5.2d, v20.2d, alphaV1_I + fmla v5.2d, v21.2d, alphaV1_R + st2 {v4.2d, v5.2d}, [pCRow1] + + add pCRow1, pCRow1, LDC + + ld2 {v0.2d, v1.2d}, [pCRow1] + fmla v0.2d, v24.2d, alphaV0_R + fmls v0.2d, v25.2d, alphaV0_I + fmla v1.2d, v24.2d, alphaV1_I + fmla v1.2d, v25.2d, alphaV1_R + st2 {v0.2d, v1.2d}, [pCRow1] + + add pCRow1, pCRow1, LDC + + ld2 {v4.2d, v5.2d}, [pCRow1] + fmla v4.2d, v28.2d, alphaV0_R + fmls v4.2d, v29.2d, alphaV0_I + fmla v5.2d, v28.2d, alphaV1_I + fmla v5.2d, v29.2d, alphaV1_R + st2 {v4.2d, v5.2d}, [pCRow1] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + +.macro INIT1x4 + fmov d16, xzr + fmov d17, xzr + fmov d20, d16 + fmov d21, d17 + fmov d24, d16 + fmov d25, d17 + fmov d28, d16 + fmov d29, d17 +.endm + +.macro KERNEL1x4_SUB + ld2 {v8.2d, v9.2d}, [pB] + add pB, pB, #32 + ld2 {v10.2d, v11.2d}, [pB] + add pB, pB, #32 + ld2 {v0.d, v1.d}[0], [pA] + add pA, pA, #16 + + OP_rr d16, d0, v8.2d[0] + OP_ii d16, d1, v9.2d[0] + OP_ri d17, d0, v9.2d[0] + OP_ir d17, d1, v8.2d[0] + + OP_rr d20, d0, v8.2d[1] + OP_ii d20, d1, v9.2d[1] + OP_ri d21, d0, v9.2d[1] + OP_ir d21, d1, v8.2d[1] + + OP_rr d24, d0, v10.2d[0] + OP_ii d24, d1, v11.2d[0] + OP_ri d25, d0, v11.2d[0] + OP_ir d25, d1, v10.2d[0] + + OP_rr d28, d0, v10.2d[1] + OP_ii d28, d1, v11.2d[1] + OP_ri d29, d0, v11.2d[1] + OP_ir d29, d1, v10.2d[1] +.endm + +.macro SAVE1x4 + fmov alpha0_R, alpha_save_R + fmov alpha0_I, alpha_save_I + fmov alpha1_R, alpha0_R + fmov alpha1_I, alpha0_I + + mov pCRow1, pCRow0 + + ld2 {v0.d, v1.d}[0], [pCRow1] + fmla d0, d16, alphaV0_R + fmls d0, d17, alphaV0_I + fmla d1, d16, alphaV1_I + fmla d1, d17, alphaV1_R + st2 {v0.d, v1.d}[0], [pCRow1] + + add pCRow1, pCRow1, LDC + + ld2 {v4.d, v5.d}[0], [pCRow1] + fmla d4, d20, alphaV0_R + fmls d4, d21, alphaV0_I + fmla d5, d20, alphaV1_I + fmla d5, d21, alphaV1_R + st2 {v4.d, v5.d}[0], [pCRow1] + + add pCRow1, pCRow1, LDC + + ld2 {v0.d, v1.d}[0], [pCRow1] + fmla d0, d24, alphaV0_R + fmls d0, d25, alphaV0_I + fmla d1, d24, alphaV1_I + fmla d1, d25, alphaV1_R + st2 {v0.d, v1.d}[0], [pCRow1] + + add pCRow1, pCRow1, LDC + + ld2 {v4.d, v5.d}[0], [pCRow1] + fmla d4, d28, alphaV0_R + fmls d4, d29, alphaV0_I + fmla d5, d28, alphaV1_I + fmla d5, d29, alphaV1_R + st2 {v4.d, v5.d}[0], [pCRow1] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT4x2 + fmov d16, xzr + fmov d17, xzr + fmov d18, d16 + fmov d19, d17 + fmov d20, d16 + fmov d21, d17 + fmov d22, d16 + fmov d23, d17 +.endm + +.macro KERNEL4x2_SUB + ld2 {v8.2d, v9.2d}, [pB] + add pB, pB, #32 + ld2 {v0.2d, v1.2d}, [pA] + add pA, pA, #32 + ld2 {v2.2d, v3.2d}, [pA] + add pA, pA, #32 + + OP_rr v16.2d, v0.2d, v8.2d[0] + OP_ii v16.2d, v1.2d, v9.2d[0] + OP_ri v17.2d, v0.2d, v9.2d[0] + OP_ir v17.2d, v1.2d, v8.2d[0] + + OP_rr v18.2d, v2.2d, v8.2d[0] + OP_ii v18.2d, v3.2d, v9.2d[0] + OP_ri v19.2d, v2.2d, v9.2d[0] + OP_ir v19.2d, v3.2d, v8.2d[0] + + OP_rr v20.2d, v0.2d, v8.2d[1] + OP_ii v20.2d, v1.2d, v9.2d[1] + OP_ri v21.2d, v0.2d, v9.2d[1] + OP_ir v21.2d, v1.2d, v8.2d[1] + + OP_rr v22.2d, v2.2d, v8.2d[1] + OP_ii v22.2d, v3.2d, v9.2d[1] + OP_ri v23.2d, v2.2d, v9.2d[1] + OP_ir v23.2d, v3.2d, v8.2d[1] +.endm + +.macro SAVE4x2 + fmov 
alpha0_R, alpha_save_R + fmov alpha0_I, alpha_save_I + fmov alpha1_R, alpha0_R + fmov alpha1_I, alpha0_I + + mov pCRow1, pCRow0 + + ld2 {v0.2d, v1.2d}, [pCRow1] + fmla v0.2d, v16.2d, alphaV0_R + fmls v0.2d, v17.2d, alphaV0_I + fmla v1.2d, v16.2d, alphaV1_I + fmla v1.2d, v17.2d, alphaV1_R + st2 {v0.2d, v1.2d}, [pCRow1] + add pCRow2, pCRow1, #32 + ld2 {v2.2d, v3.2d}, [pCRow2] + fmla v2.2d, v18.2d, alphaV0_R + fmls v2.2d, v19.2d, alphaV0_I + fmla v3.2d, v18.2d, alphaV1_I + fmla v3.2d, v19.2d, alphaV1_R + st2 {v2.2d, v3.2d}, [pCRow2] + + add pCRow1, pCRow1, LDC + + ld2 {v4.2d, v5.2d}, [pCRow1] + fmla v4.2d, v20.2d, alphaV0_R + fmls v4.2d, v21.2d, alphaV0_I + fmla v5.2d, v20.2d, alphaV1_I + fmla v5.2d, v21.2d, alphaV1_R + st2 {v4.2d, v5.2d}, [pCRow1] + add pCRow2, pCRow1, #32 + ld2 {v6.2d, v7.2d}, [pCRow2] + fmla v6.2d, v22.2d, alphaV0_R + fmls v6.2d, v23.2d, alphaV0_I + fmla v7.2d, v22.2d, alphaV1_I + fmla v7.2d, v23.2d, alphaV1_R + st2 {v6.2d, v7.2d}, [pCRow2] + + add pCRow0, pCRow0, #64 +.endm + +/******************************************************************************/ + +.macro INIT2x2 + fmov d16, xzr + fmov d17, xzr + fmov d20, d16 + fmov d21, d17 +.endm + +.macro KERNEL2x2_SUB + ld2 {v8.2d, v9.2d}, [pB] + add pB, pB, #32 + ld2 {v0.2d, v1.2d}, [pA] + add pA, pA, #32 + + OP_rr v16.2d, v0.2d, v8.2d[0] + OP_ii v16.2d, v1.2d, v9.2d[0] + OP_ri v17.2d, v0.2d, v9.2d[0] + OP_ir v17.2d, v1.2d, v8.2d[0] + + OP_rr v20.2d, v0.2d, v8.2d[1] + OP_ii v20.2d, v1.2d, v9.2d[1] + OP_ri v21.2d, v0.2d, v9.2d[1] + OP_ir v21.2d, v1.2d, v8.2d[1] +.endm + +.macro SAVE2x2 + fmov alpha0_R, alpha_save_R + fmov alpha0_I, alpha_save_I + fmov alpha1_R, alpha0_R + fmov alpha1_I, alpha0_I + + mov pCRow1, pCRow0 + + ld2 {v0.2d, v1.2d}, [pCRow1] + fmla v0.2d, v16.2d, alphaV0_R + fmls v0.2d, v17.2d, alphaV0_I + fmla v1.2d, v16.2d, alphaV1_I + fmla v1.2d, v17.2d, alphaV1_R + st2 {v0.2d, v1.2d}, [pCRow1] + + add pCRow1, pCRow1, LDC + + ld2 {v4.2d, v5.2d}, [pCRow1] + fmla v4.2d, v20.2d, alphaV0_R + fmls v4.2d, v21.2d, alphaV0_I + fmla v5.2d, v20.2d, alphaV1_I + fmla v5.2d, v21.2d, alphaV1_R + st2 {v4.2d, v5.2d}, [pCRow1] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + +.macro INIT1x2 + fmov d16, xzr + fmov d17, xzr + fmov d20, xzr + fmov d21, xzr +.endm + +.macro KERNEL1x2_SUB + ld2 {v8.2d, v9.2d}, [pB] + add pB, pB, #32 + ld2 {v0.d, v1.d}[0], [pA] + add pA, pA, #16 + + OP_rr d16, d0, v8.2d[0] + OP_ii d16, d1, v9.2d[0] + OP_ri d17, d0, v9.2d[0] + OP_ir d17, d1, v8.2d[0] + + OP_rr d20, d0, v8.2d[1] + OP_ii d20, d1, v9.2d[1] + OP_ri d21, d0, v9.2d[1] + OP_ir d21, d1, v8.2d[1] +.endm + +.macro SAVE1x2 + fmov alpha0_R, alpha_save_R + fmov alpha0_I, alpha_save_I + fmov alpha1_R, alpha0_R + fmov alpha1_I, alpha0_I + + mov pCRow1, pCRow0 + + ld2 {v0.d, v1.d}[0], [pCRow1] + fmla d0, d16, alphaV0_R + fmls d0, d17, alphaV0_I + fmla d1, d16, alphaV1_I + fmla d1, d17, alphaV1_R + st2 {v0.d, v1.d}[0], [pCRow1] + + add pCRow1, pCRow1, LDC + + ld2 {v4.d, v5.d}[0], [pCRow1] + fmla d4, d20, alphaV0_R + fmls d4, d21, alphaV0_I + fmla d5, d20, alphaV1_I + fmla d5, d21, alphaV1_R + st2 {v4.d, v5.d}[0], [pCRow1] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT4x1 + fmov d16, xzr + fmov d17, d16 + fmov d18, d16 + fmov d19, d17 +.endm + +.macro KERNEL4x1_SUB + ld2 {v8.d, v9.d}[0], [pB] + add pB, pB, #16 + ld2 {v0.2d, v1.2d}, [pA] + add pA, pA, #32 + ld2 {v2.2d, v3.2d}, [pA] + add pA, pA, 
#32 + + OP_rr v16.2d, v0.2d, v8.d[0] + OP_ii v16.2d, v1.2d, v9.d[0] + OP_ri v17.2d, v0.2d, v9.d[0] + OP_ir v17.2d, v1.2d, v8.d[0] + + OP_rr v18.2d, v2.2d, v8.d[0] + OP_ii v18.2d, v3.2d, v9.d[0] + OP_ri v19.2d, v2.2d, v9.d[0] + OP_ir v19.2d, v3.2d, v8.d[0] +.endm + +.macro SAVE4x1 + fmov alpha0_R, alpha_save_R + fmov alpha0_I, alpha_save_I + fmov alpha1_R, alpha0_R + fmov alpha1_I, alpha0_I + + mov pCRow1, pCRow0 + + ld2 {v0.2d, v1.2d}, [pCRow1] + fmla v0.2d, v16.2d, alphaV0_R + fmls v0.2d, v17.2d, alphaV0_I + fmla v1.2d, v16.2d, alphaV1_I + fmla v1.2d, v17.2d, alphaV1_R + st2 {v0.2d, v1.2d}, [pCRow1] + add pCRow2, pCRow1, #32 + ld2 {v2.2d, v3.2d}, [pCRow2] + fmla v2.2d, v18.2d, alphaV0_R + fmls v2.2d, v19.2d, alphaV0_I + fmla v3.2d, v18.2d, alphaV1_I + fmla v3.2d, v19.2d, alphaV1_R + st2 {v2.2d, v3.2d}, [pCRow2] + + add pCRow0, pCRow0, #64 +.endm + +/******************************************************************************/ + +.macro INIT2x1 + fmov d16, xzr + fmov d17, xzr +.endm + +.macro KERNEL2x1_SUB + ld2 {v8.d, v9.d}[0], [pB] + add pB, pB, #16 + ld2 {v0.2d, v1.2d}, [pA] + add pA, pA, #32 + + OP_rr v16.2d, v0.2d, v8.d[0] + OP_ii v16.2d, v1.2d, v9.d[0] + OP_ri v17.2d, v0.2d, v9.d[0] + OP_ir v17.2d, v1.2d, v8.d[0] +.endm + +.macro SAVE2x1 + fmov alpha0_R, alpha_save_R + fmov alpha0_I, alpha_save_I + fmov alpha1_R, alpha0_R + fmov alpha1_I, alpha0_I + + mov pCRow1, pCRow0 + + ld2 {v0.2d, v1.2d}, [pCRow1] + fmla v0.2d, v16.2d, alphaV0_R + fmls v0.2d, v17.2d, alphaV0_I + fmla v1.2d, v16.2d, alphaV1_I + fmla v1.2d, v17.2d, alphaV1_R + st2 {v0.2d, v1.2d}, [pCRow1] + + add pCRow0, pCRow0, #32 + +.endm + +/******************************************************************************/ + +.macro INIT1x1 + fmov d16, xzr + fmov d17, xzr +.endm + +.macro KERNEL1x1_SUB + ld2 {v8.d, v9.d}[0], [pB] + add pB, pB, #16 + ld2 {v0.d, v1.d}[0], [pA] + add pA, pA, #16 + + OP_rr d16, d0, v8.d[0] + OP_ii d16, d1, v9.d[0] + OP_ri d17, d0, v9.d[0] + OP_ir d17, d1, v8.d[0] +.endm + +.macro SAVE1x1 + fmov alpha0_R, alpha_save_R + fmov alpha0_I, alpha_save_I + fmov alpha1_R, alpha0_R + fmov alpha1_I, alpha0_I + + mov pCRow1, pCRow0 + + ld2 {v0.d, v1.d}[0], [pCRow1] + fmla d0, d16, alphaV0_R + fmls d0, d17, alphaV0_I + fmla d1, d16, alphaV1_I + fmla d1, d17, alphaV1_R + st2 {v0.d, v1.d}[0], [pCRow1] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + .align 5 + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] + + fmov alpha_save_R, d0 + fmov alpha_save_I, d1 + + lsl LDC, LDC, #4 // ldc = ldc * 2 * 8 + + mov pB, origPB + + mov counterJ, origN + asr counterJ, counterJ, #2 // J = J / 4 + cmp counterJ, #0 + ble zgemm_kernel_L2_BEGIN + +zgemm_kernel_L4_BEGIN: + mov pCRow0, pC // pCRow0 = C + add pC, pC, LDC, lsl #2 + mov pA, origPA // pA = start of A array + +zgemm_kernel_L4_M4_BEGIN: + + mov counterI, origM + asr counterI, counterI, #2 // counterI = counterI / 4 + cmp counterI, #0 + ble zgemm_kernel_L4_M2_BEGIN + +zgemm_kernel_L4_M4_20: + + mov pB, origPB + asr counterL , origK, #1 // L = K / 2 + cmp counterL 
, #2 // is there at least 4 to do? + blt zgemm_kernel_L4_M4_32 + + KERNEL4x4_I // do one in the K + KERNEL4x4_M2 // do another in the K + + subs counterL, counterL, #2 // subtract 2 + ble zgemm_kernel_L4_M4_22a + .align 5 + +zgemm_kernel_L4_M4_22: + + KERNEL4x4_M1 + KERNEL4x4_M2 + + subs counterL, counterL, #1 + bgt zgemm_kernel_L4_M4_22 + + +zgemm_kernel_L4_M4_22a: + + KERNEL4x4_M1 + KERNEL4x4_E + + b zgemm_kernel_L4_M4_44 + +zgemm_kernel_L4_M4_32: + + tst counterL, #1 + ble zgemm_kernel_L4_M4_40 + + KERNEL4x4_I + KERNEL4x4_E + + b zgemm_kernel_L4_M4_44 + + +zgemm_kernel_L4_M4_40: + + INIT4x4 + +zgemm_kernel_L4_M4_44: + + ands counterL , origK, #1 + ble zgemm_kernel_L4_M4_100 + +zgemm_kernel_L4_M4_46: + KERNEL4x4_SUB + +zgemm_kernel_L4_M4_100: + + SAVE4x4 + +zgemm_kernel_L4_M4_END: + subs counterI, counterI, #1 + bne zgemm_kernel_L4_M4_20 + +zgemm_kernel_L4_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble zgemm_kernel_L4_END + + tst counterI, #2 // counterI = counterI / 2 + ble zgemm_kernel_L4_M1_BEGIN + +zgemm_kernel_L4_M2_20: + + INIT2x4 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble zgemm_kernel_L4_M2_40 + +zgemm_kernel_L4_M2_22: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + subs counterL, counterL, #1 + bgt zgemm_kernel_L4_M2_22 + + +zgemm_kernel_L4_M2_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble zgemm_kernel_L4_M2_100 + +zgemm_kernel_L4_M2_42: + + KERNEL2x4_SUB + + subs counterL, counterL, #1 + bgt zgemm_kernel_L4_M2_42 + +zgemm_kernel_L4_M2_100: + + SAVE2x4 + +zgemm_kernel_L4_M2_END: + + +zgemm_kernel_L4_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble zgemm_kernel_L4_END + +zgemm_kernel_L4_M1_20: + + INIT1x4 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble zgemm_kernel_L4_M1_40 + +zgemm_kernel_L4_M1_22: + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + subs counterL, counterL, #1 + bgt zgemm_kernel_L4_M1_22 + + +zgemm_kernel_L4_M1_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble zgemm_kernel_L4_M1_100 + +zgemm_kernel_L4_M1_42: + + KERNEL1x4_SUB + + subs counterL, counterL, #1 + bgt zgemm_kernel_L4_M1_42 + +zgemm_kernel_L4_M1_100: + + SAVE1x4 + + +zgemm_kernel_L4_END: + + lsl temp, origK, #6 + add origPB, origPB, temp // B = B + K * 4 * 8 * 2 + + subs counterJ, counterJ , #1 // j-- + bgt zgemm_kernel_L4_BEGIN + + +/******************************************************************************/ + +zgemm_kernel_L2_BEGIN: // less than 2 left in N direction + + mov counterJ , origN + tst counterJ , #3 + ble zgemm_kernel_L999 + + tst counterJ , #2 + ble zgemm_kernel_L1_BEGIN + + mov pCRow0, pC // pCRow0 = pC + + add pC,pC,LDC, lsl #1 + + mov pA, origPA // pA = A + + + +zgemm_kernel_L2_M4_BEGIN: + + mov counterI, origM + asr counterI, counterI, #2 // counterI = counterI / 4 + cmp counterI,#0 + ble zgemm_kernel_L2_M2_BEGIN + +zgemm_kernel_L2_M4_20: + + INIT4x2 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble zgemm_kernel_L2_M4_40 + .align 5 + +zgemm_kernel_L2_M4_22: + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + subs counterL, counterL, #1 + bgt zgemm_kernel_L2_M4_22 + + +zgemm_kernel_L2_M4_40: + + ands counterL , origK, 
#7 // counterL = counterL % 8 + ble zgemm_kernel_L2_M4_100 + +zgemm_kernel_L2_M4_42: + + KERNEL4x2_SUB + + subs counterL, counterL, #1 + bgt zgemm_kernel_L2_M4_42 + +zgemm_kernel_L2_M4_100: + + SAVE4x2 + +zgemm_kernel_L2_M4_END: + + subs counterI, counterI, #1 + bgt zgemm_kernel_L2_M4_20 + + +zgemm_kernel_L2_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble zgemm_kernel_L2_END + + tst counterI, #2 // counterI = counterI / 2 + ble zgemm_kernel_L2_M1_BEGIN + +zgemm_kernel_L2_M2_20: + + INIT2x2 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble zgemm_kernel_L2_M2_40 + +zgemm_kernel_L2_M2_22: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + subs counterL, counterL, #1 + bgt zgemm_kernel_L2_M2_22 + + +zgemm_kernel_L2_M2_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble zgemm_kernel_L2_M2_100 + +zgemm_kernel_L2_M2_42: + + KERNEL2x2_SUB + + subs counterL, counterL, #1 + bgt zgemm_kernel_L2_M2_42 + +zgemm_kernel_L2_M2_100: + + SAVE2x2 + +zgemm_kernel_L2_M2_END: + + +zgemm_kernel_L2_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble zgemm_kernel_L2_END + +zgemm_kernel_L2_M1_20: + + INIT1x2 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL, #0 + ble zgemm_kernel_L2_M1_40 + +zgemm_kernel_L2_M1_22: + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs counterL, counterL, #1 + bgt zgemm_kernel_L2_M1_22 + + +zgemm_kernel_L2_M1_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble zgemm_kernel_L2_M1_100 + +zgemm_kernel_L2_M1_42: + + KERNEL1x2_SUB + + subs counterL, counterL, #1 + bgt zgemm_kernel_L2_M1_42 + +zgemm_kernel_L2_M1_100: + + SAVE1x2 + + +zgemm_kernel_L2_END: + lsl temp, origK, #5 + add origPB, origPB, temp // B = B + K * 2 * 8 * 2 + +/******************************************************************************/ + +zgemm_kernel_L1_BEGIN: + + mov counterJ , origN + tst counterJ , #1 + ble zgemm_kernel_L999 // done + + + mov pCRow0, pC // pCRow0 = C + add pC , pC , LDC // Update pC to point to next + + mov pA, origPA // pA = A + + + +zgemm_kernel_L1_M4_BEGIN: + + mov counterI, origM + asr counterI, counterI, #2 // counterI = counterI / 4 + cmp counterI, #0 + ble zgemm_kernel_L1_M2_BEGIN + +zgemm_kernel_L1_M4_20: + + INIT4x1 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble zgemm_kernel_L1_M4_40 + .align 5 + +zgemm_kernel_L1_M4_22: + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + subs counterL, counterL, #1 + bgt zgemm_kernel_L1_M4_22 + + +zgemm_kernel_L1_M4_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble zgemm_kernel_L1_M4_100 + +zgemm_kernel_L1_M4_42: + + KERNEL4x1_SUB + + subs counterL, counterL, #1 + bgt zgemm_kernel_L1_M4_42 + +zgemm_kernel_L1_M4_100: + + SAVE4x1 + +zgemm_kernel_L1_M4_END: + + subs counterI, counterI, #1 + bgt zgemm_kernel_L1_M4_20 + + +zgemm_kernel_L1_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble zgemm_kernel_L1_END + + tst counterI, #2 // counterI = counterI / 2 + ble zgemm_kernel_L1_M1_BEGIN + +zgemm_kernel_L1_M2_20: + + INIT2x1 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble zgemm_kernel_L1_M2_40 + +zgemm_kernel_L1_M2_22: + + KERNEL2x1_SUB + 
KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + subs counterL, counterL, #1 + bgt zgemm_kernel_L1_M2_22 + + +zgemm_kernel_L1_M2_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble zgemm_kernel_L1_M2_100 + +zgemm_kernel_L1_M2_42: + + KERNEL2x1_SUB + + subs counterL, counterL, #1 + bgt zgemm_kernel_L1_M2_42 + +zgemm_kernel_L1_M2_100: + + SAVE2x1 + +zgemm_kernel_L1_M2_END: + + +zgemm_kernel_L1_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble zgemm_kernel_L1_END + +zgemm_kernel_L1_M1_20: + + INIT1x1 + + mov pB, origPB + asr counterL , origK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble zgemm_kernel_L1_M1_40 + +zgemm_kernel_L1_M1_22: + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs counterL, counterL, #1 + bgt zgemm_kernel_L1_M1_22 + + +zgemm_kernel_L1_M1_40: + + ands counterL , origK, #7 // counterL = counterL % 8 + ble zgemm_kernel_L1_M1_100 + +zgemm_kernel_L1_M1_42: + + KERNEL1x1_SUB + + subs counterL, counterL, #1 + bgt zgemm_kernel_L1_M1_42 + +zgemm_kernel_L1_M1_100: + + SAVE1x1 + + +zgemm_kernel_L1_END: + + +zgemm_kernel_L999: + mov x0, #0 // set return value + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) + ret + + EPILOGUE + diff --git a/kernel/arm64/zgemv_n.S b/kernel/arm64/zgemv_n.S new file mode 100644 index 000000000..9c5ec490c --- /dev/null +++ b/kernel/arm64/zgemv_n.S @@ -0,0 +1,514 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M x0 /* Y vector length */ +#define N x1 /* X vector length */ +#define A x3 /* A vector address */ +#define LDA x4 /* A stride */ +#define X x5 /* X vector address */ +#define INC_X x6 /* X stride */ +#define Y x7 /* Y vector address */ +#define INC_Y x2 /* Y stride */ +#define A_PTR x9 /* loop A vector address */ +#define Y_IPTR x10 /* loop Y vector address */ +#define J x11 /* loop variable */ +#define I x12 /* loop variable */ +#define Y_OPTR x13 /* loop Y vector address */ +#define X_PTR x14 /* loop X vector address */ + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +#if !defined(DOUBLE) +#define ALPHA_R s0 +#define ALPHA_I s1 +#define ALPHA_R_COPY s7 +#define ALPHA_I_COPY s8 +#define SHZ 3 +#else +#define ALPHA_R d0 +#define ALPHA_I d1 +#define ALPHA_R_COPY d7 +#define ALPHA_I_COPY d8 +#define SHZ 4 +#endif + +/******************************************************************************/ + +.macro SAVE_REGS + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] +.endm + +.macro RESTORE_REGS + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) +.endm + + +.macro INIT + /********** INIT FOR F4 LOOP **********/ + fmov ALPHA_R_COPY, ALPHA_R + fmov ALPHA_I_COPY, ALPHA_I +#if !defined(DOUBLE) + ins v7.s[1], v7.s[0] // R(ALPHA), R(ALPHA) + ins v8.s[1], v8.s[0] // I(ALPHA), I(ALPHA) + ins v7.d[1], v7.d[0] + ins v8.d[1], v8.d[0] +#else + ins v7.d[1], v7.d[0] // R(ALPHA), R(ALPHA) + ins v8.d[1], v8.d[0] // I(ALPHA), I(ALPHA) +#endif + + /******* INIT FOR F1 AND S1 LOOP ******/ +#if !defined(DOUBLE) + ins v0.s[1], v0.s[0] // R(ALPHA), R(ALPHA) + fneg s2, ALPHA_I + ins v1.s[1], v2.s[0] // -I(ALPHA), I(ALPHA) +#if !defined(XCONJ) + ext v1.8b, v1.8b, v1.8b, #4 // I(ALPHA), -I(ALPHA) +#endif +#else + ins v0.d[1], v0.d[0] // R(ALPHA), R(ALPHA) + fneg d2, ALPHA_I + ins v1.d[1], v2.d[0] // -I(ALPHA), I(ALPHA) +#if !defined(XCONJ) + ext v1.16b, v1.16b, v1.16b, #8 // I(ALPHA), -I(ALPHA) +#endif +#endif +.endm + +.macro INIT_LOOP + /********** INIT_LOOP FOR F4 LOOP **********/ +#if !defined(DOUBLE) + ld1 {v9.2s}, [X_PTR] // [I(X), R(X)] + ins v10.s[0], v9.s[1] + ins v9.s[1], v9.s[0] // [R(X), R(X)] + ins v10.s[1], v10.s[0] // [I(X), I(X)] + ins v9.d[1], v9.d[0] + ins v10.d[1], v10.d[0] +#if !defined(CONJ) +#if !defined(XCONJ) + fmul v11.4s, v9.4s, v7.4s // [+ R(X) * R(ALPHA)] + fmls v11.4s, v10.4s, v8.4s // [- I(X) * I(ALPHA)] + fmul v12.4s, v9.4s, v8.4s // [+ R(X) * I(ALPHA)] + fmla v12.4s, v10.4s, v7.4s // [+ I(X) * R(ALPHA)] +#else + fmul v11.4s, v9.4s, v7.4s // [+ R(X) * R(ALPHA)] + fmla v11.4s, v10.4s, v8.4s // [+ I(X) * I(ALPHA)] + fmul v12.4s, v9.4s, v8.4s // [+ R(X) * I(ALPHA)] + fmls v12.4s, v10.4s, 
v7.4s // [- I(X) * R(ALPHA)] +#endif +#else // CONJ +#if !defined(XCONJ) + fmul v11.4s, v9.4s, v7.4s // [+ R(X) * R(ALPHA)] + fmls v11.4s, v10.4s, v8.4s // [+ I(X) * I(ALPHA)] + fmul v12.4s, v10.4s, v7.4s // [+ I(X) * R(ALPHA)] + fmls v12.4s, v9.4s, v8.4s // [- R(X) * I(ALPHA)] +#else + fmul v11.4s, v9.4s, v7.4s // [+ R(X) * R(ALPHA)] + fmls v11.4s, v10.4s, v8.4s // [- I(X) * I(ALPHA)] + fmul v12.4s, v9.4s, v8.4s // [R(X) * I(ALPHA)] + fneg v12.4s, v12.4s // [- R(X) * I(ALPHA)] + fmla v12.4s, v10.4s, v7.4s // [- I(X) * R(ALPHA)] +#endif +#endif // CONJ + + /****** INIT_LOOP FOR F1 AND S1 LOOP ******/ + ld1 {v2.2s}, [X_PTR] // [I(X), R(X)] + ext v3.8b, v2.8b, v2.8b, #4 // [R(X), I(X)] + fmul v2.2s, v0.2s, v2.2s + fmla v2.2s, v1.2s, v3.2s // [I(TEMP), R(TEMP)] + ins v3.s[0], v2.s[1] +#if !defined(CONJ) +#if !defined(XCONJ) + fneg s4, s3 + ins v3.s[1], v4.s[0] + ext v3.8b, v3.8b, v3.8b, #4 // [I(TEMP), -I(TEMP)] + ins v2.s[1], v2.s[0] // [R(TEMP), R(TEMP)] +#else + fneg s4, s3 + ins v3.s[1], v4.s[0] // [-I(TEMP), I(TEMP)] + ins v2.s[1], v2.s[0] // [R(TEMP), R(TEMP)] +#endif +#else // CONJ +#if !defined(XCONJ) + ins v3.s[1], v3.s[0] // [I(TEMP), I(TEMP)] + fneg s4, s2 + ins v2.s[1], v4.s[0] // [-R(TEMP), R(TEMP)] +#else + fneg s3, s3 + ins v3.s[1], v3.s[0] // [-I(TEMP), -I(TEMP)] + fneg s4, s2 + ins v2.s[1], v4.s[0] // [-R(TEMP), R(TEMP)] +#endif +#endif // CONJ + +#else // DOUBLE + + /********** INIT_LOOP FOR F4 LOOP **********/ + ld1 {v9.2d}, [X_PTR] // [I(X), R(X)] + ins v10.d[0], v9.d[1] + ins v9.d[1], v9.d[0] // [R(X), R(X)] + ins v10.d[1], v10.d[0] // [I(X), I(X)] +#if !defined(CONJ) +#if !defined(XCONJ) + fmul v11.2d, v9.2d, v7.2d // [+ R(X) * R(ALPHA)] + fmls v11.2d, v10.2d, v8.2d // [- I(X) * I(ALPHA)] + fmul v12.2d, v9.2d, v8.2d // [+ R(X) * I(ALPHA)] + fmla v12.2d, v10.2d, v7.2d // [+ I(X) * R(ALPHA)] +#else + fmul v11.2d, v9.2d, v7.2d // [+ R(X) * R(ALPHA)] + fmla v11.2d, v10.2d, v8.2d // [+ I(X) * I(ALPHA)] + fmul v12.2d, v9.2d, v8.2d // [+ R(X) * I(ALPHA)] + fmls v12.2d, v10.2d, v7.2d // [- I(X) * R(ALPHA)] +#endif +#else // CONJ +#if !defined(XCONJ) + fmul v11.2d, v9.2d, v7.2d // [+ R(X) * R(ALPHA)] + fmls v11.2d, v10.2d, v8.2d // [+ I(X) * I(ALPHA)] + fmul v12.2d, v10.2d, v7.2d // [+ I(X) * R(ALPHA)] + fmls v12.2d, v9.2d, v8.2d // [- R(X) * I(ALPHA)] +#else + fmul v11.2d, v9.2d, v7.2d // [+ R(X) * R(ALPHA)] + fmls v11.2d, v10.2d, v8.2d // [- I(X) * I(ALPHA)] + fmul v12.2d, v9.2d, v8.2d // [R(X) * I(ALPHA)] + fneg v12.2d, v12.2d // [- R(X) * I(ALPHA)] + fmla v12.2d, v10.2d, v7.2d // [- I(X) * R(ALPHA)] +#endif +#endif // CONJ + + /****** INIT_LOOP FOR F1 AND S1 LOOP ******/ + ld1 {v2.2d}, [X_PTR] // [I(X), R(X)] + ext v3.16b, v2.16b, v2.16b, #8 // [R(X), I(X)] + fmul v2.2d, v0.2d, v2.2d + fmla v2.2d, v1.2d, v3.2d // [I(TEMP), R(TEMP)] + ins v3.d[0], v2.d[1] // I(TEMP) +#if !defined(CONJ) +#if !defined(XCONJ) + fneg d4, d3 // -I(TEMP) + ins v3.d[1], v4.d[0] + ext v3.16b, v3.16b, v3.16b, #8 // [I(TEMP), -I(TEMP)] + ins v2.d[1], v2.d[0] // [R(TEMP), R(TEMP)] +#else + fneg d4, d3 // -I(TEMP) + ins v3.d[1], v4.d[0] // [-I(TEMP), I(TEMP)] + ins v2.d[1], v2.d[0] // [R(TEMP), R(TEMP)] +#endif +#else // CONJ +#if !defined(XCONJ) + ins v3.d[1], v3.d[0] // [I(TEMP), I(TEMP)] + fneg d4, d2 // -R(TEMP) + ins v2.d[1], v4.d[0] // [-R(TEMP), R(TEMP)] +#else + fneg d3, d3 // -I(TEMP) + ins v3.d[1], v3.d[0] // [-I(TEMP), -I(TEMP)] + fneg d4, d2 // -R(TEMP) + ins v2.d[1], v4.d[0] // [-R(TEMP), R(TEMP)] +#endif +#endif // CONJ + +#endif // DOUBLE +.endm + +.macro KERNEL_F4 +#if !defined(DOUBLE) + + 
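+	// Four complex elements per call: v13/v14 receive the de-interleaved
+	// real/imag parts of the current A column, v15/v16 the same for y.
+	// v11/v12 were prepared in INIT_LOOP as the broadcast real/imag parts
+	// of temp = alpha * x[j], so the FMA groups below compute y += temp * a,
+	// with the sign variants selected by CONJ/XCONJ.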
ld2 {v13.4s, v14.4s}, [A_PTR], #32 + ld2 {v15.4s, v16.4s}, [Y_IPTR], #32 +#if !defined(CONJ) +#if !defined(XCONJ) + fmla v15.4s, v11.4s, v13.4s // [+ R(ALPHA * X) * A_R] + fmls v15.4s, v12.4s, v14.4s // [- I(ALPHA * X) * A_I] + fmla v16.4s, v11.4s, v14.4s // [+ R(ALPHA * X) * A_I] + fmla v16.4s, v12.4s, v13.4s // [+ I(ALPHA * X) * A_R] +#else + fmla v15.4s, v11.4s, v13.4s // [+ R(ALPHA * X) * A_R] + fmla v15.4s, v12.4s, v14.4s // [+ I(ALPHA * X) * A_I] + fmla v16.4s, v11.4s, v14.4s // [+ R(ALPHA * X) * A_I] + fmls v16.4s, v12.4s, v13.4s // [- I(ALPHA * X) * A_R] +#endif +#else // CONJ +#if !defined(XCONJ) + fmla v15.4s, v11.4s, v13.4s // [+ R(ALPHA * X) * A_R] + fmla v15.4s, v12.4s, v14.4s // [+ I(ALPHA * X) * A_I] + fmls v16.4s, v11.4s, v14.4s // [- R(ALPHA * X) * A_I] + fmla v16.4s, v12.4s, v13.4s // [+ I(ALPHA * X) * A_R] +#else + fmla v15.4s, v11.4s, v13.4s // [+ R(ALPHA * X) * A_R] + fmls v15.4s, v12.4s, v14.4s // [- I(ALPHA * X) * A_I] + fmls v16.4s, v11.4s, v14.4s // [- R(ALPHA * X) * A_I] + fmls v16.4s, v12.4s, v13.4s // [- I(ALPHA * X) * A_R] +#endif +#endif // CONJ + st2 {v15.4s, v16.4s}, [Y_OPTR], #32 + +#else // DOUBLE + + ld2 {v13.2d, v14.2d}, [A_PTR], #32 + ld2 {v15.2d, v16.2d}, [Y_IPTR], #32 +#if !defined(CONJ) +#if !defined(XCONJ) + fmla v15.2d, v11.2d, v13.2d // [+ R(ALPHA * X) * A_R] + fmls v15.2d, v12.2d, v14.2d // [- I(ALPHA * X) * A_I] + fmla v16.2d, v11.2d, v14.2d // [+ R(ALPHA * X) * A_I] + fmla v16.2d, v12.2d, v13.2d // [+ I(ALPHA * X) * A_R] +#else + fmla v15.2d, v11.2d, v13.2d // [+ R(ALPHA * X) * A_R] + fmla v15.2d, v12.2d, v14.2d // [+ I(ALPHA * X) * A_I] + fmla v16.2d, v11.2d, v14.2d // [+ R(ALPHA * X) * A_I] + fmls v16.2d, v12.2d, v13.2d // [- I(ALPHA * X) * A_R] +#endif +#else // CONJ +#if !defined(XCONJ) + fmla v15.2d, v11.2d, v13.2d // [+ R(ALPHA * X) * A_R] + fmla v15.2d, v12.2d, v14.2d // [+ I(ALPHA * X) * A_I] + fmls v16.2d, v11.2d, v14.2d // [- R(ALPHA * X) * A_I] + fmla v16.2d, v12.2d, v13.2d // [+ I(ALPHA * X) * A_R] +#else + fmla v15.2d, v11.2d, v13.2d // [+ R(ALPHA * X) * A_R] + fmls v15.2d, v12.2d, v14.2d // [- I(ALPHA * X) * A_I] + fmls v16.2d, v11.2d, v14.2d // [- R(ALPHA * X) * A_I] + fmls v16.2d, v12.2d, v13.2d // [- I(ALPHA * X) * A_R] +#endif +#endif // CONJ + st2 {v15.2d, v16.2d}, [Y_OPTR], #32 + + ld2 {v17.2d, v18.2d}, [A_PTR], #32 + ld2 {v19.2d, v20.2d}, [Y_IPTR], #32 +#if !defined(CONJ) +#if !defined(XCONJ) + fmla v19.2d, v11.2d, v17.2d // [+ R(ALPHA * X) * A_R] + fmls v19.2d, v12.2d, v18.2d // [- I(ALPHA * X) * A_I] + fmla v20.2d, v11.2d, v18.2d // [+ R(ALPHA * X) * A_I] + fmla v20.2d, v12.2d, v17.2d // [+ I(ALPHA * X) * A_R] +#else + fmla v19.2d, v11.2d, v17.2d // [+ R(ALPHA * X) * A_R] + fmla v19.2d, v12.2d, v18.2d // [- I(ALPHA * X) * A_I] + fmla v20.2d, v11.2d, v18.2d // [+ R(ALPHA * X) * A_I] + fmls v20.2d, v12.2d, v17.2d // [+ I(ALPHA * X) * A_R] +#endif +#else // CONJ +#if !defined(XCONJ) + fmla v19.2d, v11.2d, v17.2d // [+ R(ALPHA * X) * A_R] + fmla v19.2d, v12.2d, v18.2d // [- I(ALPHA * X) * A_I] + fmls v20.2d, v11.2d, v18.2d // [+ R(ALPHA * X) * A_I] + fmla v20.2d, v12.2d, v17.2d // [+ I(ALPHA * X) * A_R] +#else + fmla v19.2d, v11.2d, v17.2d // [+ R(ALPHA * X) * A_R] + fmls v19.2d, v12.2d, v18.2d // [- I(ALPHA * X) * A_I] + fmls v20.2d, v11.2d, v18.2d // [+ R(ALPHA * X) * A_I] + fmls v20.2d, v12.2d, v17.2d // [+ I(ALPHA * X) * A_R] +#endif +#endif // CONJ + st2 {v19.2d, v20.2d}, [Y_OPTR], #32 + +#endif + +.endm + +.macro KERNEL_F1 +#if !defined(DOUBLE) + ld1 {v4.2s}, [A_PTR], #8 + ld1 {v5.2s}, [Y_IPTR], #8 + ext v6.8b, v4.8b, 
v4.8b, #4 + fmla v5.2s, v2.2s, v4.2s + fmla v5.2s, v3.2s, v6.2s + st1 {v5.2s}, [Y_OPTR], #8 +#else // DOUBLE + ld1 {v4.2d}, [A_PTR], #16 + ld1 {v5.2d}, [Y_IPTR], #16 + ext v6.16b, v4.16b, v4.16b, #8 + fmla v5.2d, v2.2d, v4.2d + fmla v5.2d, v3.2d, v6.2d + st1 {v5.2d}, [Y_OPTR], #16 +#endif +.endm + +.macro INIT_S + lsl INC_Y, INC_Y, #SHZ +.endm + +.macro KERNEL_S1 +#if !defined(DOUBLE) + ld1 {v4.2s}, [A_PTR], #8 + ld1 {v5.2s}, [Y_IPTR], INC_Y + ext v6.8b, v4.8b, v4.8b, #4 + fmla v5.2s, v2.2s, v4.2s + fmla v5.2s, v3.2s, v6.2s + st1 {v5.2s}, [Y_OPTR], INC_Y +#else // DOUBLE + ld1 {v4.2d}, [A_PTR], #16 + ld1 {v5.2d}, [Y_IPTR], INC_Y + ext v6.16b, v4.16b, v4.16b, #8 + fmla v5.2d, v2.2d, v4.2d + fmla v5.2d, v3.2d, v6.2d + st1 {v5.2d}, [Y_OPTR], INC_Y +#endif +.endm + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + ldr INC_Y, [sp] + + SAVE_REGS + + cmp N, xzr + ble zgemv_n_kernel_L999 + cmp M, xzr + ble zgemv_n_kernel_L999 + + lsl LDA, LDA, #SHZ + lsl INC_X, INC_X, #SHZ + mov J, N + + INIT + + cmp INC_Y, #1 + bne zgemv_n_kernel_S_BEGIN + +zgemv_n_kernel_F_LOOP: + mov A_PTR, A + mov Y_IPTR, Y + mov Y_OPTR, Y + mov X_PTR, X + add X, X, INC_X + INIT_LOOP + + asr I, M, #2 + cmp I, xzr + beq zgemv_n_kernel_F1 + +zgemv_n_kernel_F4: + + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + KERNEL_F1 + + subs I, I, #1 + bne zgemv_n_kernel_F4 + +zgemv_n_kernel_F1: + + ands I, M, #3 + ble zgemv_n_kernel_F_END + +zgemv_n_kernel_F10: + + KERNEL_F1 + + subs I, I, #1 + bne zgemv_n_kernel_F10 + +zgemv_n_kernel_F_END: + + add A, A, LDA + subs J, J, #1 + bne zgemv_n_kernel_F_LOOP + + b zgemv_n_kernel_L999 + +zgemv_n_kernel_S_BEGIN: + + INIT_S + +zgemv_n_kernel_S_LOOP: + mov A_PTR, A + mov Y_IPTR, Y + mov Y_OPTR, Y + mov X_PTR, X + add X, X, INC_X + INIT_LOOP + + asr I, M, #2 + cmp I, xzr + ble zgemv_n_kernel_S1 + +zgemv_n_kernel_S4: + + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + + subs I, I, #1 + bne zgemv_n_kernel_S4 + +zgemv_n_kernel_S1: + + ands I, M, #3 + ble zgemv_n_kernel_S_END + +zgemv_n_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne zgemv_n_kernel_S10 + +zgemv_n_kernel_S_END: + + add A, A, LDA + subs J, J, #1 + bne zgemv_n_kernel_S_LOOP + +zgemv_n_kernel_L999: + RESTORE_REGS + + mov w0, wzr + ret + + EPILOGUE diff --git a/kernel/arm64/zgemv_t.S b/kernel/arm64/zgemv_t.S new file mode 100644 index 000000000..1f0d698ed --- /dev/null +++ b/kernel/arm64/zgemv_t.S @@ -0,0 +1,448 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define M x0 /* Y vector length */ +#define N x1 /* X vector length */ +#define A x3 /* A vector address */ +#define LDA x4 /* A stride */ +#define X x5 /* X vector address */ +#define INC_X x6 /* X stride */ +#define Y x7 /* Y vector address */ +#define INC_Y x2 /* Y stride */ +#define A_PTR x9 /* loop A vector address */ +#define X_PTR x10 /* loop Y vector address */ +#define J x11 /* loop variable */ +#define I x12 /* loop variable */ + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +#if !defined(DOUBLE) +#define ALPHA_R s0 +#define ALPHA_I s1 +#define ALPHA_R_COPY s7 +#define ALPHA_I_COPY s8 +#define SHZ 3 +#else +#define ALPHA_R d0 +#define ALPHA_I d1 +#define ALPHA_R_COPY d7 +#define ALPHA_I_COPY d8 +#define SHZ 4 +#endif + +/******************************************************************************/ + + +.macro SAVE_REGS + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] +.endm + +.macro RESTORE_REGS + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) +.endm + +.macro INIT +#if !defined(XCONJ) +#if !defined(DOUBLE) + ins v0.s[1], v0.s[0] // v0 = ALPHA_R, ALPHA_R + fneg s2, ALPHA_I + ins v1.s[1], v2.s[0] + ext v1.8b, v1.8b, v1.8b, #4 // v1 = ALPHA_I, -ALPHA_I +#else + ins v0.d[1], v0.d[0] // v0 = ALPHA_R, ALPHA_R + fneg d2, ALPHA_I + ins v1.d[1], v2.d[0] + ext v1.16b, v1.16b, v1.16b, #8 // v1 = ALPHA_I, -ALPHA_I +#endif +#else // XCONJ +#if !defined(DOUBLE) + fneg s2, ALPHA_R + ins v0.s[1], v2.s[0] // v0 = -ALPHA_R, ALPHA_R + ins v1.s[1], v1.s[0] // v1 = ALPHA_I, ALPHA_I +#else + fneg d2, ALPHA_R + ins v0.d[1], v2.d[0] // v0 = -ALPHA_R, ALPHA_R + ins v1.d[1], v1.d[0] // v1 = ALPHA_I, ALPHA_I +#endif +#endif +.endm + +.macro INIT_LOOP + fmov d9, xzr // TEMP_R = [0, 0] + fmov d10, xzr // TEMP_I = [0, 0] +#if !defined(DOUBLE) +#else + fmov d15, xzr // TEMP_R = [0, 0] + fmov d16, xzr // TEMP_I = [0, 0] +#endif + + fmov d2, xzr // TEMP = [0, 0] 
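+	// Accumulators for the dot product of the current column:
+	// v9/v10 (plus v15/v16 in the DOUBLE path) collect the packed
+	// real/imag partial sums consumed by KERNEL_F4, while v2 is the
+	// [I, R] scalar accumulator used by KERNEL_F1 and KERNEL_S1.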
+.endm + +.macro KERNEL_F4 +#if !defined(DOUBLE) + + ld2 {v11.4s, v12.4s}, [X_PTR], #32 + ld2 {v13.4s, v14.4s}, [A_PTR], #32 + +#if !defined(CONJ) +#if !defined(XCONJ) + fmla v9.4s, v11.4s, v13.4s // [+ R(X) * A_R] + fmls v9.4s, v12.4s, v14.4s // [- I(X) * A_I] + fmla v10.4s, v11.4s, v14.4s // [+ R(X) * A_I] + fmla v10.4s, v12.4s, v13.4s // [+ I(X) * A_R] +#else + fmla v9.4s, v11.4s, v13.4s // [+ R(X) * A_R] + fmla v9.4s, v12.4s, v14.4s // [+ I(X) * A_I] + fmla v10.4s, v11.4s, v14.4s // [+ R(X) * A_I] + fmls v10.4s, v12.4s, v13.4s // [- I(X) * A_R] +#endif +#else // CONJ +#if !defined(XCONJ) + fmla v9.4s, v11.4s, v13.4s // [+ R(X) * A_R] + fmla v9.4s, v12.4s, v14.4s // [+ I(X) * A_I] + fmls v10.4s, v11.4s, v14.4s // [- R(X) * A_I] + fmla v10.4s, v12.4s, v13.4s // [+ I(X) * A_R] +#else + fmla v9.4s, v11.4s, v13.4s // [+ R(X) * A_R] + fmls v9.4s, v12.4s, v14.4s // [- I(X) * A_I] + fmls v10.4s, v11.4s, v14.4s // [- R(X) * A_I] + fmls v10.4s, v12.4s, v13.4s // [- I(X) * A_R] +#endif +#endif // CONJ + +#else // DOUBLE + ld2 {v11.2d, v12.2d}, [X_PTR], #32 + ld2 {v13.2d, v14.2d}, [A_PTR], #32 + prfm PLDL1STRM, [X_PTR, #512] +#if !defined(CONJ) +#if !defined(XCONJ) + fmla v9.2d, v11.2d, v13.2d // [+ R(X) * A_R] + fmls v9.2d, v12.2d, v14.2d // [- I(X) * A_I] + fmla v10.2d, v11.2d, v14.2d // [+ R(X) * A_I] + fmla v10.2d, v12.2d, v13.2d // [+ I(X) * A_R] +#else + fmla v9.2d, v11.2d, v13.2d // [+ R(X) * A_R] + fmla v9.2d, v12.2d, v14.2d // [+ I(X) * A_I] + fmla v10.2d, v11.2d, v14.2d // [+ R(X) * A_I] + fmls v10.2d, v12.2d, v13.2d // [- I(X) * A_R] +#endif +#else // CONJ +#if !defined(XCONJ) + fmla v9.2d, v11.2d, v13.2d // [+ R(X) * A_R] + fmla v9.2d, v12.2d, v14.2d // [+ I(X) * A_I] + fmls v10.2d, v11.2d, v14.2d // [- R(X) * A_I] + fmla v10.2d, v12.2d, v13.2d // [+ I(X) * A_R] +#else + fmla v9.2d, v11.2d, v13.2d // [+ R(X) * A_R] + fmls v9.2d, v12.2d, v14.2d // [- I(X) * A_I] + fmls v10.2d, v11.2d, v14.2d // [- R(X) * A_I] + fmls v10.2d, v12.2d, v13.2d // [- I(X) * A_R] +#endif +#endif // CONJ + ld2 {v17.2d, v18.2d}, [X_PTR], #32 + ld2 {v19.2d, v20.2d}, [A_PTR], #32 + prfm PLDL1STRM, [A_PTR, #512] +#if !defined(CONJ) +#if !defined(XCONJ) + fmla v15.2d, v17.2d, v19.2d // [+ R(X) * A_R] + fmls v15.2d, v18.2d, v20.2d // [- I(X) * A_I] + fmla v16.2d, v17.2d, v20.2d // [+ R(X) * A_I] + fmla v16.2d, v18.2d, v19.2d // [+ I(X) * A_R] +#else + fmla v15.2d, v17.2d, v19.2d // [+ R(X) * A_R] + fmla v15.2d, v18.2d, v20.2d // [- I(X) * A_I] + fmla v16.2d, v17.2d, v20.2d // [+ R(X) * A_I] + fmls v16.2d, v18.2d, v19.2d // [+ I(X) * A_R] +#endif +#else // CONJ +#if !defined(XCONJ) + fmla v15.2d, v17.2d, v19.2d // [+ R(X) * A_R] + fmla v15.2d, v18.2d, v20.2d // [- I(X) * A_I] + fmls v16.2d, v17.2d, v20.2d // [+ R(X) * A_I] + fmla v16.2d, v18.2d, v19.2d // [+ I(X) * A_R] +#else + fmla v15.2d, v17.2d, v19.2d // [+ R(X) * A_R] + fmls v15.2d, v18.2d, v20.2d // [- I(X) * A_I] + fmls v16.2d, v17.2d, v20.2d // [+ R(X) * A_I] + fmls v16.2d, v18.2d, v19.2d // [+ I(X) * A_R] +#endif +#endif // CONJ +#endif //DOUBLE +.endm + +.macro KERNEL_F4_FINALIZE +#if !defined(DOUBLE) + ext v21.16b, v9.16b, v9.16b, #8 + fadd v9.2s, v9.2s, v21.2s + faddp s9, v9.2s + + ext v21.16b, v10.16b, v10.16b, #8 + fadd v10.2s, v10.2s, v21.2s + faddp s10, v10.2s + + ins v2.s[0], v9.s[0] + ins v2.s[1], v10.s[0] +#else + fadd v9.2d, v9.2d, v15.2d + fadd v10.2d, v10.2d, v16.2d + + faddp d9, v9.2d + faddp d10, v10.2d + + ins v2.d[0], v9.d[0] + ins v2.d[1], v10.d[0] +#endif +.endm + + +.macro KERNEL_F1 +#if !defined(DOUBLE) + ld1r {v4.2s}, [A_PTR], #4 // 
[A0, A0] + ld1 {v5.s}[0], [A_PTR], #4 // A1 + ld1 {v6.2s}, [X_PTR], #8 // [X1, X0] + fneg s16, s5 + ins v5.s[1], v16.s[0] // [-A1, A1] +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + ext v5.8b, v5.8b, v5.8b, #4 // [A1, -A1] +#endif + ext v7.8b, v6.8b, v6.8b, #4 // [X0, X1] + fmla v2.2s, v4.2s, v6.2s + fmla v2.2s, v5.2s, v7.2s +#else // DOUBLE + ld1r {v4.2d}, [A_PTR], #8 // [A0, A0] + ld1 {v5.d}[0], [A_PTR], #8 // A1 + ld1 {v6.2d}, [X_PTR], #16 // [X1, X0] + fneg d16, d5 + ins v5.d[1], v16.d[0] // [-A1, A1] +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + ext v5.16b, v5.16b, v5.16b, #8 // [A1, -A1] +#endif + ext v7.16b, v6.16b, v6.16b, #8 // [X0, X1] + fmla v2.2d, v4.2d, v6.2d + fmla v2.2d, v5.2d, v7.2d +#endif +.endm + +.macro INIT_S + lsl INC_X, INC_X, #SHZ +.endm + +.macro KERNEL_S1 +#if !defined(DOUBLE) + ld1r {v4.2s}, [A_PTR], #4 // [A0, A0] + ld1 {v5.s}[0], [A_PTR], #4 // A1 + ld1 {v6.2s}, [X_PTR], INC_X // [X1, X0] + fneg s16, s5 + ins v5.s[1], v16.s[0] // [-A1, A1] +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + ext v5.8b, v5.8b, v5.8b, #4 // [A1, -A1] +#endif + ext v7.8b, v6.8b, v6.8b, #4 // [X0, X1] + fmla v2.2s, v4.2s, v6.2s + fmla v2.2s, v5.2s, v7.2s +#else // DOUBLE + ld1r {v4.2d}, [A_PTR], #8 // [A0, A0] + ld1 {v5.d}[0], [A_PTR], #8 // A1 + ld1 {v6.2d}, [X_PTR], INC_X // [X1, X0] + fneg d16, d5 + ins v5.d[1], v16.d[0] // [-A1, A1] +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + ext v5.16b, v5.16b, v5.16b, #8 // [A1, -A1] +#endif + ext v7.16b, v6.16b, v6.16b, #8 // [X0, X1] + fmla v2.2d, v4.2d, v6.2d + fmla v2.2d, v5.2d, v7.2d +#endif +.endm + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + ldr INC_Y, [sp] + SAVE_REGS + + cmp N, xzr + ble zgemv_t_kernel_L999 + cmp M, xzr + ble zgemv_t_kernel_L999 + + lsl LDA, LDA, #SHZ + lsl INC_Y, INC_Y, #SHZ + mov J, N + + INIT + + cmp INC_X, #1 + bne zgemv_t_kernel_S_BEGIN + +zgemv_t_kernel_F_LOOP: + + mov A_PTR, A + mov X_PTR, X + + INIT_LOOP + + asr I, M, #2 + cmp I, xzr + beq zgemv_t_kernel_F1 + +zgemv_t_kernel_F4: + + KERNEL_F4 + + subs I, I, #1 + bne zgemv_t_kernel_F4 + + KERNEL_F4_FINALIZE + +zgemv_t_kernel_F1: + + ands I, M, #3 + ble zgemv_t_kernel_F_END + +zgemv_t_kernel_F10: + + KERNEL_F1 + + subs I, I, #1 + bne zgemv_t_kernel_F10 + +zgemv_t_kernel_F_END: + +#if !defined(DOUBLE) + ld1 {v4.2s}, [Y] + ext v3.8b, v2.8b, v2.8b, #4 // [TEMP_R, TEMP_I] + fmla v4.2s, v0.2s, v2.2s + fmla v4.2s, v1.2s, v3.2s + st1 {v4.2s}, [Y], INC_Y +#else // DOUBLE + ld1 {v4.2d}, [Y] + ext v3.16b, v2.16b, v2.16b, #8 // [TEMP_R, TEMP_I] + fmla v4.2d, v0.2d, v2.2d + fmla v4.2d, v1.2d, v3.2d + st1 {v4.2d}, [Y], INC_Y +#endif + + add A, A, LDA + subs J, J, #1 + bne zgemv_t_kernel_F_LOOP + + b zgemv_t_kernel_L999 + +zgemv_t_kernel_S_BEGIN: + + INIT_S + +zgemv_t_kernel_S_LOOP: + + mov A_PTR, A + mov X_PTR, X + INIT_LOOP + + asr I, M, #2 + cmp I, xzr + ble zgemv_t_kernel_S1 + +zgemv_t_kernel_S4: + + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + + subs I, I, #1 + bne zgemv_t_kernel_S4 + +zgemv_t_kernel_S1: + + ands I, M, #3 + ble zgemv_t_kernel_S_END + +zgemv_t_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne zgemv_t_kernel_S10 + +zgemv_t_kernel_S_END: + +#if !defined(DOUBLE) + ld1 {v4.2s}, [Y] + ext v3.8b, v2.8b, v2.8b, #4 // [TEMP_R, TEMP_I] + fmla v4.2s, 
v0.2s, v2.2s + fmla v4.2s, v1.2s, v3.2s + st1 {v4.2s}, [Y], INC_Y +#else // DOUBLE + ld1 {v4.2d}, [Y] + ext v3.16b, v2.16b, v2.16b, #8 // [TEMP_R, TEMP_I] + fmla v4.2d, v0.2d, v2.2d + fmla v4.2d, v1.2d, v3.2d + st1 {v4.2d}, [Y], INC_Y +#endif + + add A, A, LDA + subs J, J, #1 + bne zgemv_t_kernel_S_LOOP + +zgemv_t_kernel_L999: + RESTORE_REGS + mov w0, wzr + ret + + EPILOGUE diff --git a/kernel/arm64/znrm2.S b/kernel/arm64/znrm2.S new file mode 100644 index 000000000..0c3d264e4 --- /dev/null +++ b/kernel/arm64/znrm2.S @@ -0,0 +1,228 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N x0 /* vector length */ +#define X x1 /* X vector address */ +#define INC_X x2 /* X stride */ +#define I x5 /* loop variable */ + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +#if !defined(DOUBLE) +#define TMPF s6 +#define SSQ s0 +#define TMPVF {v6.s}[0] +#define SZ 4 +#else +#define TMPF d6 +#define SSQ d0 +#define TMPVF {v6.d}[0] +#define SZ 8 +#endif + +/******************************************************************************/ + +.macro KERNEL_F1 +#if !defined(DOUBLE) + ld1 {v1.2s}, [X], #8 + fmul v1.2s, v1.2s, v1.2s + faddp TMPF, v1.2s + fadd SSQ, SSQ, TMPF +#else + ld1 {v1.2d}, [X], #16 + fmul v1.2d, v1.2d, v1.2d + faddp TMPF, v1.2d + fadd SSQ, SSQ, TMPF +#endif +.endm + +.macro KERNEL_F8 +#if !defined(DOUBLE) + ld1 {v1.4s, v2.4s}, [X], #32 + fmla v0.4s, v1.4s, v1.4s + fmla v5.4s, v2.4s, v2.4s + ld1 {v3.4s,v4.4s}, [X], #32 + fmla v0.4s, v3.4s, v3.4s + fmla v5.4s, v4.4s, v4.4s + PRFM PLDL1KEEP, [X, #1024] +#else // DOUBLE + ld1 {v1.2d, v2.2d}, [X], #32 + fmla v0.2d, v1.2d, v1.2d + fmla v5.2d, v2.2d, v2.2d + ld1 {v3.2d, v4.2d}, [X], #32 + fmla v0.2d, v3.2d, v3.2d + fmla v5.2d, v4.2d, v4.2d + + ld1 {v16.2d, v17.2d}, [X], #32 + fmla v0.2d, v16.2d, v16.2d + fmla v5.2d, v17.2d, v17.2d + ld1 {v18.2d, v19.2d}, [X], #32 + fmla v0.2d, v18.2d, v18.2d + fmla v5.2d, v19.2d, v19.2d +#endif +.endm + +.macro nrm2_kernel_F8_FINALIZE +#if !defined(DOUBLE) + fadd v0.4s, v0.4s, v5.4s + ext v1.16b, v0.16b, v0.16b, #8 + fadd v0.2s, v0.2s, v1.2s + faddp SSQ, v0.2s +#else + fadd v0.2d, v0.2d, v5.2d + faddp SSQ, v0.2d +#endif +.endm + +.macro INIT_S +#if !defined(DOUBLE) + lsl INC_X, INC_X, #3 + ld1 {v1.2s}, [X], INC_X + fmul v1.2s, v1.2s, v1.2s + faddp SSQ, v1.2s +#else + lsl INC_X, INC_X, #4 + ld1 {v1.2d}, [X], INC_X + fmul v1.2d, v1.2d, v1.2d + faddp SSQ, v1.2d +#endif +.endm + +.macro KERNEL_S1 +#if !defined(DOUBLE) + ld1 {v1.2s}, [X], INC_X + fmul v1.2s, v1.2s, v1.2s + faddp TMPF, v1.2s + fadd SSQ, SSQ, TMPF +#else + ld1 {v1.2d}, [X], INC_X + fmul v1.2d, v1.2d, v1.2d + faddp TMPF, v1.2d + fadd SSQ, SSQ, TMPF +#endif +.endm + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + +#if !defined(DOUBLE) + fmov SSQ, wzr + fmov s5, SSQ +#else + fmov SSQ, xzr + fmov d5, SSQ +#endif + + cmp N, xzr + ble nrm2_kernel_zero + cmp INC_X, xzr + ble nrm2_kernel_zero + cmp INC_X, #1 + bne nrm2_kernel_S_BEGIN + +nrm2_kernel_F_BEGIN: + + asr I, N, #3 + cmp I, xzr + beq nrm2_kernel_F1_INIT + +nrm2_kernel_F8: + + KERNEL_F8 + + subs I, I, #1 + bne nrm2_kernel_F8 + + nrm2_kernel_F8_FINALIZE + +nrm2_kernel_F1: + + ands I, N, #7 + ble nrm2_kernel_L999 + +nrm2_kernel_F10: + + KERNEL_F1 + + subs I, I, #1 + bne nrm2_kernel_F10 + + b nrm2_kernel_L999 + +nrm2_kernel_F1_INIT: + + b nrm2_kernel_F1 + +nrm2_kernel_S_BEGIN: + + INIT_S + + subs N, N, #1 + ble nrm2_kernel_L999 + + asr I, N, #2 + cmp I, xzr + ble nrm2_kernel_S1 + +nrm2_kernel_S4: + + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + + subs I, I, #1 + bne nrm2_kernel_S4 + +nrm2_kernel_S1: + + ands I, N, #3 + ble nrm2_kernel_L999 + +nrm2_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne nrm2_kernel_S10 + +nrm2_kernel_L999: + fsqrt SSQ, SSQ + ret + 
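+/* For reference, a minimal C sketch (names illustrative, double-precision
+ * case, INC_X == 1) of what this kernel computes: the sum of squares of
+ * the 2*n packed reals is accumulated directly, then square-rooted.
+ *
+ *   #include <math.h>
+ *   double znrm2_ref(long n, const double *x) {
+ *       double ssq = 0.0;
+ *       for (long i = 0; i < 2 * n; i++)   // n complex = 2*n reals
+ *           ssq += x[i] * x[i];
+ *       return sqrt(ssq);
+ *   }
+ */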
+nrm2_kernel_zero: + ret + + EPILOGUE diff --git a/kernel/arm64/zrot.S b/kernel/arm64/zrot.S new file mode 100644 index 000000000..90f138a19 --- /dev/null +++ b/kernel/arm64/zrot.S @@ -0,0 +1,256 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N x0 /* vector length */ +#define X x1 /* X vector address */ +#define INC_X x2 /* X stride */ +#define Y x3 /* Y vector address */ +#define INC_Y x4 /* Y stride */ +#define I x5 /* loop variable */ + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +#if !defined(DOUBLE) +#define C s0 /* scale input value */ +#define S s1 /* scale input value */ +#else +#define C d0 /* scale input value */ +#define S d1 /* scale input value */ +#endif + +/******************************************************************************/ + +.macro INIT + +#if !defined(DOUBLE) + ins v0.s[1], v0.s[0] // [C, C] + ins v1.s[1], v1.s[0] // [S, S] +#else + ins v0.d[1], v0.d[0] // [C, C] + ins v1.d[1], v1.d[0] // [S, S] +#endif + +.endm + +.macro KERNEL_F1 + +#if !defined(DOUBLE) + ld1 {v2.2s}, [X] + ld1 {v3.2s}, [Y] + fmul v4.2s, v0.2s, v2.2s // [C*X1, C*X0] + fmla v4.2s, v1.2s, v3.2s // [C*X1 + S*Y1, C*X0 + S*Y0] + fmul v5.2s, v0.2s, v3.2s // [C*Y1, C*Y0] + fmls v5.2s, v1.2s, v2.2s // [C*Y1 - S*X1, C*Y0 - S*X0] + st1 {v4.2s}, [X], #8 + st1 {v5.2s}, [Y], #8 +#else + ld1 {v2.2d}, [X] + ld1 {v3.2d}, [Y] + fmul v4.2d, v0.2d, v2.2d // [C*X1, C*X0] + fmla v4.2d, v1.2d, v3.2d // [C*X1 + S*Y1, C*X0 + S*Y0] + fmul v5.2d, v0.2d, v3.2d // [C*Y1, C*Y0] + fmls v5.2d, v1.2d, v2.2d // [C*Y1 - S*X1, C*Y0 - S*X0] + st1 {v4.2d}, [X], #16 + st1 {v5.2d}, [Y], #16 +#endif + +.endm + +.macro KERNEL_INIT_F4 + +#if !defined(DOUBLE) + ins v0.d[1], v0.d[0] // [C, C, C, C] + ins v1.d[1], v1.d[0] // [S, S, S, S] +#endif + +.endm + +.macro KERNEL_F4 + +#if !defined(DOUBLE) + ld1 {v2.4s, v3.4s}, [X] + ld1 {v4.4s, v5.4s}, [Y] + fmul v6.4s, v0.4s, v2.4s // C*X3, C*X2, C*X1, C*X0 + fmul v7.4s, v0.4s, v3.4s // C*X7, C*X6, C*X5, C*X4 + fmla v6.4s, v1.4s, v4.4s // C*X3+S*Y3, ..., C*X0+S*Y0 + fmla v7.4s, v1.4s, v5.4s // C*X7+S*Y7, ..., C*X4+S*Y4 + fmul v16.4s, v0.4s, v4.4s // C*Y3, C*Y2, C*Y1, C*Y0 + fmul v17.4s, v0.4s, v5.4s // C*Y7, C*Y6, C*Y5, C*Y4 + fmls v16.4s, v1.4s, v2.4s // C*Y3-S*X3, ..., C*Y0-S*X0 + fmls v17.4s, v1.4s, v3.4s // C*Y7-S*X7, ..., C*Y4-S*X4 + st1 {v6.4s,v7.4s}, [X], #32 + st1 {v16.4s,v17.4s}, [Y], #32 +#else // DOUBLE + ld1 {v2.2d, v3.2d}, [X] + ld1 {v4.2d, v5.2d}, [Y] + fmul v6.2d, v0.2d, v2.2d // C*X3, C*X2, C*X1, C*X0 + fmul v7.2d, v0.2d, v3.2d // C*X7, C*X6, C*X5, C*X4 + fmla v6.2d, v1.2d, v4.2d // C*X3+S*Y3, ..., C*X0+S*Y0 + fmla v7.2d, v1.2d, v5.2d // C*X7+S*Y7, ..., C*X4+S*Y4 + fmul v16.2d, v0.2d, v4.2d // C*Y3, C*Y2, C*Y1, C*Y0 + fmul v17.2d, v0.2d, v5.2d // C*Y7, C*Y6, C*Y5, C*Y4 + fmls v16.2d, v1.2d, v2.2d // C*Y3-S*X3, ..., C*Y0-S*X0 + fmls v17.2d, v1.2d, v3.2d // C*Y7-S*X7, ..., C*Y4-S*X4 + st1 {v6.2d,v7.2d}, [X], #32 + st1 {v16.2d,v17.2d}, [Y], #32 + ld1 {v2.2d, v3.2d}, [X] + ld1 {v4.2d, v5.2d}, [Y] + fmul v6.2d, v0.2d, v2.2d // C*X3, C*X2, C*X1, C*X0 + fmul v7.2d, v0.2d, v3.2d // C*X7, C*X6, C*X5, C*X4 + fmla v6.2d, v1.2d, v4.2d // C*X3+S*Y3, ..., C*X0+S*Y0 + fmla v7.2d, v1.2d, v5.2d // C*X7+S*Y7, ..., C*X4+S*Y4 + fmul v16.2d, v0.2d, v4.2d // C*Y3, C*Y2, C*Y1, C*Y0 + fmul v17.2d, v0.2d, v5.2d // C*Y7, C*Y6, C*Y5, C*Y4 + fmls v16.2d, v1.2d, v2.2d // C*Y3-S*X3, ..., C*Y0-S*X0 + fmls v17.2d, v1.2d, v3.2d // C*Y7-S*X7, ..., C*Y4-S*X4 + st1 {v6.2d,v7.2d}, [X], #32 + st1 {v16.2d,v17.2d}, [Y], #32 +#endif + +.endm + +.macro INIT_S + +#if !defined(DOUBLE) 
+ lsl INC_X, INC_X, #3 + lsl INC_Y, INC_Y, #3 +#else + lsl INC_X, INC_X, #4 + lsl INC_Y, INC_Y, #4 +#endif + +.endm + +.macro KERNEL_S1 + +#if !defined(DOUBLE) + ld1 {v2.2s}, [X] + ld1 {v3.2s}, [Y] + fmul v4.2s, v0.2s, v2.2s // [C*X1, C*X0] + fmla v4.2s, v1.2s, v3.2s // [C*X1 + S*Y1, C*X0 + S*Y0] + fmul v5.2s, v0.2s, v3.2s // [C*Y1, C*Y0] + fmls v5.2s, v1.2s, v2.2s // [C*Y1 - S*X1, C*Y0 - S*X0] + st1 {v4.2s}, [X], INC_X + st1 {v5.2s}, [Y], INC_Y +#else + ld1 {v2.2d}, [X] + ld1 {v3.2d}, [Y] + fmul v4.2d, v0.2d, v2.2d // [C*X1, C*X0] + fmla v4.2d, v1.2d, v3.2d // [C*X1 + S*Y1, C*X0 + S*Y0] + fmul v5.2d, v0.2d, v3.2d // [C*Y1, C*Y0] + fmls v5.2d, v1.2d, v2.2d // [C*Y1 - S*X1, C*Y0 - S*X0] + st1 {v4.2d}, [X], INC_X + st1 {v5.2d}, [Y], INC_Y +#endif + +.endm + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + cmp N, xzr + ble rot_kernel_L999 + + INIT + + cmp INC_X, #1 + bne rot_kernel_S_BEGIN + cmp INC_Y, #1 + bne rot_kernel_S_BEGIN + +rot_kernel_F_BEGIN: + + asr I, N, #2 + cmp I, xzr + beq rot_kernel_F1 + + KERNEL_INIT_F4 + +rot_kernel_F4: + + KERNEL_F4 + + subs I, I, #1 + bne rot_kernel_F4 + +rot_kernel_F1: + + ands I, N, #3 + ble rot_kernel_L999 + +rot_kernel_F10: + + KERNEL_F1 + + subs I, I, #1 + bne rot_kernel_F10 + + mov w0, wzr + ret + +rot_kernel_S_BEGIN: + + INIT_S + + asr I, N, #2 + cmp I, xzr + ble rot_kernel_S1 + +rot_kernel_S4: + + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + + subs I, I, #1 + bne rot_kernel_S4 + +rot_kernel_S1: + + ands I, N, #3 + ble rot_kernel_L999 + +rot_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne rot_kernel_S10 + +rot_kernel_L999: + + mov w0, wzr + ret diff --git a/kernel/arm64/zscal.S b/kernel/arm64/zscal.S new file mode 100644 index 000000000..db2c3506a --- /dev/null +++ b/kernel/arm64/zscal.S @@ -0,0 +1,274 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
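The zscal.S kernel that follows scales a complex vector in place by a complex scalar, X[i] <- (DA_R + i*DA_I) * X[i], again on interleaved real/imaginary pairs. Its INIT macro builds one vector register with DA_R in both lanes and another with DA_I and -DA_I in opposite lanes, so an fmul on the (re, im) pair plus an fmla on the swapped (im, re) pair yields the complex product; when both DA_R and DA_I are zero, the kernel branches to zscal_kernel_zero and simply stores zeros. A scalar C sketch of the intended arithmetic (zscal_sketch is an invented name, shown only for illustration, not the OpenBLAS reference code):

/* x holds n complex values as interleaved (re, im) pairs; inc_x is the
   stride in complex elements, as INC_X is in the kernel below. */
static void zscal_sketch(long n, double da_r, double da_i,
                         double *x, long inc_x)
{
    for (long i = 0; i < n; i++) {
        double *p = x + 2 * i * inc_x;
        double re = p[0], im = p[1];
        p[0] = da_r * re - da_i * im;   /* real part of (da_r + i*da_i)*x */
        p[1] = da_r * im + da_i * re;   /* imaginary part */
    }
}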
+*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define N x0 /* vector length */ +#define X x3 /* X vector address */ +#define INC_X x4 /* X stride */ +#define I x5 /* loop variable */ + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +#if !defined(DOUBLE) +#define DA_R s0 /* real scale input value */ +#define DA_I s1 /* imaginary scale input value */ +#else +#define DA_R d0 /* real scale input value */ +#define DA_I d1 /* imaginary scale input value */ +#endif + +/******************************************************************************/ + +.macro INIT + +#if !defined(DOUBLE) + ins v0.s[1], v0.s[0] // v0 = DA_R, DA_R + fneg s2, DA_I + ins v1.s[1], v2.s[0] // v1 = -DA_I, DA_I + ext v1.8b, v1.8b, v1.8b, #4 // v1 = DA_I, -DA_I +#else + ins v0.d[1], v0.d[0] // v0 = DA_R, DA_R + fneg d2, DA_I + ins v1.d[1], v2.d[0] // v1 = DA_I, DA_I + ext v1.16b, v1.16b, v1.16b, #8 // v1 = DA_I, -DA_I +#endif + +.endm + +.macro KERNEL_F1 + +#if !defined(DOUBLE) + ld1 {v2.2s}, [X] // X1, X0 + ext v3.8b, v2.8b, v2.8b, #4 // X0, X1 + fmul v2.2s, v2.2s, v0.2s // DA_R*X1, DA_R*X0 + fmla v2.2s, v3.2s, v1.2s // DA_R*X1+DA_I*X0, DA_R*X0-DA_I*X1 + st1 {v2.2s}, [X], #8 +#else + ld1 {v2.2d}, [X] // X1, X0 + ext v3.16b, v2.16b, v2.16b, #8 // X0, X1 + fmul v2.2d, v2.2d, v0.2d // DA_R*X1, DA_R*X0 + fmla v2.2d, v3.2d, v1.2d // DA_R*X1+DA_I*X0, DA_R*X0-DA_I*X1 + st1 {v2.2d}, [X], #16 +#endif + +.endm + +.macro KERNEL_INIT_F4 + +#if !defined(DOUBLE) + // Replicate the lower 2 floats into the upper 2 slots + ins v0.d[1], v0.d[0] // v0 = DA_R, DA_R, DA_R, DA_R + ins v1.d[1], v1.d[0] // v1 = DA_I, DA_I, DA_I, DA_I +#endif + +.endm + +.macro KERNEL_F4 + +#if !defined(DOUBLE) + ld1 {v2.4s,v3.4s}, [X] // V2 = X[3], X[2], X[1], X[0] + // V3 = X[7], X[6], X[5], X[4] + + ext v6.8b, v2.8b, v2.8b, #4 // V6 = - , - , X[0], X[1] + ins v6.s[2], v2.s[3] // V6 = - , X[3], X[0], X[1] + ins v6.s[3], v2.s[2] // V6 = X[2], X[3], X[0], X[1] + fmul v2.4s, v0.4s, v2.4s // X'[ix] += DA_R * X[ix] + // X'[ix+1] += DA_R * X[ix+1] + fmla v2.4s, v1.4s, v6.4s // X'[ix] += -DA_I * X[ix+1] + // X'[ix+1] += DA_I * X[ix] + + ext v7.8b, v3.8b, v3.8b, #4 // V7 = - , - , X[4], X[5] + ins v7.s[2], v3.s[3] // V7 = - , X[7], X[4], X[5] + ins v7.s[3], v3.s[2] // V7 = X[6], X[7], X[4], X[5] + fmul v3.4s, v0.4s, v3.4s // X'[ix] += DA_R * X[ix] + // X'[ix+1] += DA_R * X[ix+1] + fmla v3.4s, v1.4s, v7.4s // X'[ix] += -DA_I * X[ix+1] + // X'[ix+1] += DA_I * X[ix] + + st1 {v2.4s,v3.4s}, [X], #32 +#else // DOUBLE + ld1 {v2.2d,v3.2d,v4.2d,v5.2d}, [X] // CX0, CX1, CX2, CX3 + ext v20.16b, v2.16b, v2.16b, #8 // X[ix], X[ix+1] + ext v21.16b, v3.16b, v3.16b, #8 // X[ix], X[ix+1] + ext v22.16b, v4.16b, v4.16b, #8 // X[ix], X[ix+1] + ext v23.16b, v5.16b, v5.16b, #8 // X[ix], X[ix+1] + + fmul v2.2d, v0.2d, v2.2d + fmla v2.2d, v1.2d, v20.2d + + fmul v3.2d, v0.2d, v3.2d + fmla v3.2d, v1.2d, v21.2d + st1 {v2.2d,v3.2d}, [X], #32 + + fmul v4.2d, v0.2d, v4.2d + fmla v4.2d, v1.2d, v22.2d + + fmul v5.2d, v0.2d, v5.2d + fmla v5.2d, v1.2d, v23.2d + st1 {v4.2d,v5.2d}, [X], #32 +#endif + PRFM PLDL1KEEP, [X, #1024] +.endm + +.macro INIT_S + +#if !defined(DOUBLE) + lsl INC_X, INC_X, #3 +#else + lsl INC_X, INC_X, #4 +#endif + +.endm + +.macro KERNEL_S1 + +#if !defined(DOUBLE) + ld1 {v2.2s}, [X] // X1, X0 + ext v3.8b, v2.8b, v2.8b, #4 // X0, X1 + fmul v2.2s, v2.2s, v0.2s // 
DA_R*X1, DA_R*X0 + fmla v2.2s, v3.2s, v1.2s // DA_R*X1+DA_I*X0, DA_R*X0-DA_I*X1 + st1 {v2.2s}, [X], INC_X +#else + ld1 {v2.2d}, [X] // X1, X0 + ext v3.16b, v2.16b, v2.16b, #8 // X0, X1 + fmul v2.2d, v2.2d, v0.2d // DA_R*X1, DA_R*X0 + fmla v2.2d, v3.2d, v1.2d // DA_R*X1+DA_I*X0, DA_R*X0-DA_I*X1 + st1 {v2.2d}, [X], INC_X +#endif + +.endm + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + cmp N, xzr + ble zscal_kernel_L999 + + fcmp DA_R, #0.0 + bne zscal_kernel_1 + + fcmp DA_I, #0.0 + beq zscal_kernel_zero + + // TODO: special case DA_R == 0 && DA_I != 0 + +zscal_kernel_1: + + // TODO: special case DA_R != 0 && DA_I == 0 + + INIT + + cmp INC_X, #1 + bne zscal_kernel_S_BEGIN + +zscal_kernel_F_BEGIN: + + asr I, N, #2 + cmp I, xzr + beq zscal_kernel_F1 + + KERNEL_INIT_F4 + +zscal_kernel_F4: + + KERNEL_F4 + + subs I, I, #1 + bne zscal_kernel_F4 + +zscal_kernel_F1: + + ands I, N, #3 + ble zscal_kernel_L999 + +zscal_kernel_F10: + + KERNEL_F1 + + subs I, I, #1 + bne zscal_kernel_F10 + + mov w0, wzr + ret + +zscal_kernel_S_BEGIN: + + INIT_S + + asr I, N, #2 + cmp I, xzr + ble zscal_kernel_S1 + +zscal_kernel_S4: + + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + KERNEL_S1 + + subs I, I, #1 + bne zscal_kernel_S4 + +zscal_kernel_S1: + + ands I, N, #3 + ble zscal_kernel_L999 + +zscal_kernel_S10: + + KERNEL_S1 + + subs I, I, #1 + bne zscal_kernel_S10 + +zscal_kernel_L999: + + mov w0, wzr + ret + +zscal_kernel_zero: + + INIT_S + +zscal_kernel_Z1: + + stp DA_R, DA_I, [X] + add X, X, INC_X + subs N, N, #1 + bne zscal_kernel_Z1 + + mov w0, wzr + ret + + EPILOGUE diff --git a/kernel/arm64/ztrmm_kernel_4x4.S b/kernel/arm64/ztrmm_kernel_4x4.S new file mode 100644 index 000000000..4fbb7fc1c --- /dev/null +++ b/kernel/arm64/ztrmm_kernel_4x4.S @@ -0,0 +1,1893 @@ +/******************************************************************************* +Copyright (c) 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
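The ztrmm_kernel_4x4.S file below handles all of the complex conjugation variants through the OP_rr/OP_ii/OP_ri/OP_ir macros, which are resolved at compile time to the appropriate mix of fmla and fmls: the NN/NT/TN/TT build accumulates a*b, NR/NC/TR/TC accumulates a*conj(b), RN/RT/CN/CT accumulates conj(a)*b, and RR/RC/CR/CC accumulates conj(a)*conj(b). Every accumulator update therefore uses the same four instruction slots, and only the signs differ. As a plain-C illustration of how the NN expansion maps onto those slots (cmla_nn_sketch and its parameters are invented names, not part of the kernel):

/* acc += a * b for complex a = (ar, ai) and b = (br, bi); each statement
   mirrors one OP_* slot as defined for the NN/NT/TN/TT configuration. */
static void cmla_nn_sketch(double *acc_r, double *acc_i,
                           double ar, double ai, double br, double bi)
{
    *acc_r += ar * br;   /* OP_rr -> fmla into the real accumulator */
    *acc_r -= ai * bi;   /* OP_ii -> fmls into the real accumulator */
    *acc_i += ar * bi;   /* OP_ri -> fmla into the imaginary accumulator */
    *acc_i += ai * br;   /* OP_ir -> fmla into the imaginary accumulator */
}

Flipping the fmla/fmls choice on the OP_ii, OP_ri and OP_ir slots, as the other #if branches do, changes the signs of the ai*bi, ar*bi and ai*br terms, which is exactly what conjugating a, b, or both requires.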
+*******************************************************************************/ + +#define ASSEMBLER +#include "common.h" + +/* X0 X1 X2 s0 s1 X3 x4 x5 x6 x7 */ +/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT alpha1,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc, BLASLONG offset */ + +#define origM x0 +#define origN x1 +#define origK x2 +#define origPA x3 +#define origPB x4 +#define pC x5 +#define LDC x6 +#define offset x7 +#define counterL x8 +#define counterI x9 +#define counterJ x10 +#define pB x11 +#define pCRow0 x12 +#define pCRow1 x13 +#define pCRow2 x14 +#define pA x15 +#define alpha_save_R x16 +#define alpha_save_I x17 +#define temp x18 +#define tempOffset x19 +#define tempK x20 + +#define alpha0_R d10 +#define alphaV0_R v10.d[0] +#define alpha0_I d11 +#define alphaV0_I v11.d[0] + +#define alpha1_R d14 +#define alphaV1_R v14.d[0] +#define alpha1_I d15 +#define alphaV1_I v15.d[0] + + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define OP_rr fmla +#define OP_ii fmls +#define OP_ri fmla +#define OP_ir fmla +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define OP_rr fmla +#define OP_ii fmla +#define OP_ri fmls +#define OP_ir fmla +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define OP_rr fmla +#define OP_ii fmla +#define OP_ri fmla +#define OP_ir fmls +#elif defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define OP_rr fmla +#define OP_ii fmls +#define OP_ri fmls +#define OP_ir fmls +#endif + +// 00 origM +// 01 origN +// 02 origK +// 03 origPA +// 04 origPB +// 05 pC +// 06 origLDC -> LDC +// 07 offset +// 08 counterL +// 09 counterI +// 10 counterJ +// 11 pB +// 12 pCRow0 +// 13 pCRow1 +// 14 pCRow2 +// 15 pA +// 16 alpha_save_R +// 17 alpha_save_I +// 18 must save temp +// 19 must save tempOffset +// 20 must save tempK +// 21 must save +// 22 must save +// 23 must save +// 24 must save +// 25 must save +// 26 must save +// 27 must save +// 28 must save +// 29 frame +// 30 link +// 31 sp + +//v00 ALPHA_R -> pA00_R, pA01_R +//v01 ALPHA_I -> pA00_I, pA01_I +//v02 pA02_R, pA03_R +//v03 pA02_I, pA03_I +//v04 pA10_R, pA11_R +//v05 pA10_I, pA11_I +//v06 pA12_R, pA13_R +//v07 pA12_I, pA13_I +//v08 must save pB00_R, pB01_R +//v09 must save pB00_I, pB01_I +//v10 must save pB02_R, pB03_R OR ALPHA0_R +//v11 must save pB02_I, pB03_I OR ALPHA0_I +//v12 must save pB10_R, pB11_R +//v13 must save pB10_I, pB11_I +//v14 must save pB12_R, pB13_R OR ALPHA1_R +//v15 must save pB12_I, pB13_I OR ALPHA1_R +//v16 must save pC00_R, pC01_R +//v17 must save pC00_I, pC01_I +//v18 pC02_R, pC03_R +//v19 pC02_I, pC03_I +//v20 pC10_R, pC11_R +//v21 pC10_I, pC11_I +//v22 pC12_R, pC13_R +//v23 pC12_I, pC13_I +//v24 pC20_R, pC21_R +//v25 pC20_I, pC21_I +//v26 pC22_R, pC23_R +//v27 pC22_I, pC23_I +//v28 pC30_R, pC31_R +//v29 pC30_I, pC31_I +//v30 pC32_R, pC33_R +//v31 pC32_I, pC33_I + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +.macro INIT4x4 + fmov d16, xzr + fmov d17, d16 + fmov d18, d17 + fmov d19, d16 + fmov d20, d17 + fmov d21, d16 + fmov d22, d17 + fmov d23, d16 + fmov d24, d17 + fmov d25, d16 + fmov d26, d17 + fmov d27, d16 + fmov d28, d17 + fmov d29, d16 + fmov d30, d17 + fmov d31, d16 +.endm + +.macro KERNEL4x4_I + ld2 {v8.2d, v9.2d}, [pB] + add pB, pB, #32 + ld2 {v10.2d, v11.2d}, [pB] + add pB, pB, #32 + ld2 {v0.2d, v1.2d}, [pA] + add pA, pA, #32 + ld2 {v2.2d, v3.2d}, [pA] + add pA, 
pA, #32 + + fmul v16.2d, v0.2d, v8.2d[0] + OP_ii v16.2d, v1.2d, v9.2d[0] + fmul v17.2d, v0.2d, v9.2d[0] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + fneg v17.2d, v17.2d +#endif + OP_ir v17.2d, v1.2d, v8.2d[0] + + fmul v18.2d, v2.2d, v8.2d[0] + OP_ii v18.2d, v3.2d, v9.2d[0] + fmul v19.2d, v2.2d, v9.2d[0] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + fneg v19.2d, v19.2d +#endif + OP_ir v19.2d, v3.2d, v8.2d[0] + + fmul v20.2d, v0.2d, v8.2d[1] + OP_ii v20.2d, v1.2d, v9.2d[1] + fmul v21.2d, v0.2d, v9.2d[1] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + fneg v21.2d, v21.2d +#endif + OP_ir v21.2d, v1.2d, v8.2d[1] + + fmul v22.2d, v2.2d, v8.2d[1] + OP_ii v22.2d, v3.2d, v9.2d[1] + fmul v23.2d, v2.2d, v9.2d[1] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + fneg v23.2d, v23.2d +#endif + OP_ir v23.2d, v3.2d, v8.2d[1] + + fmul v24.2d, v0.2d, v10.2d[0] + OP_ii v24.2d, v1.2d, v11.2d[0] + fmul v25.2d, v0.2d, v11.2d[0] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + fneg v25.2d, v25.2d +#endif + OP_ir v25.2d, v1.2d, v10.2d[0] + + fmul v26.2d, v2.2d, v10.2d[0] + OP_ii v26.2d, v3.2d, v11.2d[0] + fmul v27.2d, v2.2d, v11.2d[0] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + fneg v27.2d, v27.2d +#endif + OP_ir v27.2d, v3.2d, v10.2d[0] + + fmul v28.2d, v0.2d, v10.2d[1] + OP_ii v28.2d, v1.2d, v11.2d[1] + fmul v29.2d, v0.2d, v11.2d[1] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + fneg v29.2d, v29.2d +#endif + OP_ir v29.2d, v1.2d, v10.2d[1] + + fmul v30.2d, v2.2d, v10.2d[1] + OP_ii v30.2d, v3.2d, v11.2d[1] + fmul v31.2d, v2.2d, v11.2d[1] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + fneg v31.2d, v31.2d +#endif + OP_ir v31.2d, v3.2d, v10.2d[1] + + ld2 {v12.2d, v13.2d}, [pB] + add pB, pB, #32 + ld2 {v14.2d, v15.2d}, [pB] + add pB, pB, #32 + ld2 {v4.2d, v5.2d} , [pA] + add pA, pA, #32 + ld2 {v6.2d, v7.2d} , [pA] + add pA, pA, #32 +.endm + +.macro KERNEL4x4_M1 + OP_rr v16.2d, v0.2d, v8.2d[0] + OP_ii v16.2d, v1.2d, v9.2d[0] + OP_ri v17.2d, v0.2d, v9.2d[0] + OP_ir v17.2d, v1.2d, v8.2d[0] + + ld2 {v12.2d, v13.2d}, [pB] // For next round + add pB, pB, #32 + + OP_rr v18.2d, v2.2d, v8.2d[0] + OP_ii v18.2d, v3.2d, v9.2d[0] + OP_ri v19.2d, v2.2d, v9.2d[0] + OP_ir v19.2d, v3.2d, v8.2d[0] + + ld2 {v14.2d, v15.2d}, [pB] // For next round + add pB, pB, #32 + + OP_rr v20.2d, v0.2d, v8.2d[1] + OP_ii v20.2d, v1.2d, v9.2d[1] + OP_ri v21.2d, v0.2d, v9.2d[1] + OP_ir v21.2d, v1.2d, v8.2d[1] + + ld2 {v4.2d, v5.2d} , [pA] // For next round + add pA, pA, #32 + + OP_rr v22.2d, v2.2d, v8.2d[1] + OP_ii v22.2d, v3.2d, v9.2d[1] + OP_ri v23.2d, v2.2d, v9.2d[1] + OP_ir v23.2d, v3.2d, v8.2d[1] + + ld2 {v6.2d, v7.2d} , [pA] // For next round + add pA, pA, #32 + + OP_rr v24.2d, v0.2d, v10.2d[0] + OP_ii v24.2d, v1.2d, v11.2d[0] + OP_ri v25.2d, v0.2d, v11.2d[0] + OP_ir v25.2d, v1.2d, v10.2d[0] + + prfm PLDL1KEEP, [pA, #512] + + OP_rr v26.2d, v2.2d, v10.2d[0] + OP_ii v26.2d, v3.2d, v11.2d[0] + OP_ri v27.2d, v2.2d, 
v11.2d[0] + OP_ir v27.2d, v3.2d, v10.2d[0] + + prfm PLDL1KEEP, [pB, #512] + + OP_rr v28.2d, v0.2d, v10.2d[1] + OP_ii v28.2d, v1.2d, v11.2d[1] + OP_ri v29.2d, v0.2d, v11.2d[1] + OP_ir v29.2d, v1.2d, v10.2d[1] + + OP_rr v30.2d, v2.2d, v10.2d[1] + OP_ii v30.2d, v3.2d, v11.2d[1] + OP_ri v31.2d, v2.2d, v11.2d[1] + OP_ir v31.2d, v3.2d, v10.2d[1] +.endm + +.macro KERNEL4x4_M2 + OP_rr v16.2d, v4.2d, v12.2d[0] + OP_ii v16.2d, v5.2d, v13.2d[0] + OP_ri v17.2d, v4.2d, v13.2d[0] + OP_ir v17.2d, v5.2d, v12.2d[0] + + ld2 {v8.2d, v9.2d}, [pB] // For next round + add pB, pB, #32 + + OP_rr v18.2d, v6.2d, v12.2d[0] + OP_ii v18.2d, v7.2d, v13.2d[0] + OP_ri v19.2d, v6.2d, v13.2d[0] + OP_ir v19.2d, v7.2d, v12.2d[0] + + ld2 {v10.2d, v11.2d}, [pB] // For next round + add pB, pB, #32 + + OP_rr v20.2d, v4.2d, v12.2d[1] + OP_ii v20.2d, v5.2d, v13.2d[1] + OP_ri v21.2d, v4.2d, v13.2d[1] + OP_ir v21.2d, v5.2d, v12.2d[1] + + ld2 {v0.2d, v1.2d}, [pA] // For next round + add pA, pA, #32 + + OP_rr v22.2d, v6.2d, v12.2d[1] + OP_ii v22.2d, v7.2d, v13.2d[1] + OP_ri v23.2d, v6.2d, v13.2d[1] + OP_ir v23.2d, v7.2d, v12.2d[1] + + ld2 {v2.2d, v3.2d}, [pA] // For next round + add pA, pA, #32 + + OP_rr v24.2d, v4.2d, v14.2d[0] + OP_ii v24.2d, v5.2d, v15.2d[0] + OP_ri v25.2d, v4.2d, v15.2d[0] + OP_ir v25.2d, v5.2d, v14.2d[0] + + prfm PLDL1KEEP, [pA, #512] + + OP_rr v26.2d, v6.2d, v14.2d[0] + OP_ii v26.2d, v7.2d, v15.2d[0] + OP_ri v27.2d, v6.2d, v15.2d[0] + OP_ir v27.2d, v7.2d, v14.2d[0] + + prfm PLDL1KEEP, [pB, #512] + + OP_rr v28.2d, v4.2d, v14.2d[1] + OP_ii v28.2d, v5.2d, v15.2d[1] + OP_ri v29.2d, v4.2d, v15.2d[1] + OP_ir v29.2d, v5.2d, v14.2d[1] + + OP_rr v30.2d, v6.2d, v14.2d[1] + OP_ii v30.2d, v7.2d, v15.2d[1] + OP_ri v31.2d, v6.2d, v15.2d[1] + OP_ir v31.2d, v7.2d, v14.2d[1] +.endm + +.macro KERNEL4x4_E + OP_rr v16.2d, v4.2d, v12.2d[0] + OP_ii v16.2d, v5.2d, v13.2d[0] + OP_ri v17.2d, v4.2d, v13.2d[0] + OP_ir v17.2d, v5.2d, v12.2d[0] + + OP_rr v18.2d, v6.2d, v12.2d[0] + OP_ii v18.2d, v7.2d, v13.2d[0] + OP_ri v19.2d, v6.2d, v13.2d[0] + OP_ir v19.2d, v7.2d, v12.2d[0] + + OP_rr v20.2d, v4.2d, v12.2d[1] + OP_ii v20.2d, v5.2d, v13.2d[1] + OP_ri v21.2d, v4.2d, v13.2d[1] + OP_ir v21.2d, v5.2d, v12.2d[1] + + OP_rr v22.2d, v6.2d, v12.2d[1] + OP_ii v22.2d, v7.2d, v13.2d[1] + OP_ri v23.2d, v6.2d, v13.2d[1] + OP_ir v23.2d, v7.2d, v12.2d[1] + + OP_rr v24.2d, v4.2d, v14.2d[0] + OP_ii v24.2d, v5.2d, v15.2d[0] + OP_ri v25.2d, v4.2d, v15.2d[0] + OP_ir v25.2d, v5.2d, v14.2d[0] + + OP_rr v26.2d, v6.2d, v14.2d[0] + OP_ii v26.2d, v7.2d, v15.2d[0] + OP_ri v27.2d, v6.2d, v15.2d[0] + OP_ir v27.2d, v7.2d, v14.2d[0] + + OP_rr v28.2d, v4.2d, v14.2d[1] + OP_ii v28.2d, v5.2d, v15.2d[1] + OP_ri v29.2d, v4.2d, v15.2d[1] + OP_ir v29.2d, v5.2d, v14.2d[1] + + OP_rr v30.2d, v6.2d, v14.2d[1] + OP_ii v30.2d, v7.2d, v15.2d[1] + OP_ri v31.2d, v6.2d, v15.2d[1] + OP_ir v31.2d, v7.2d, v14.2d[1] +.endm + +.macro KERNEL4x4_SUB + ld2 {v8.2d, v9.2d}, [pB] + add pB, pB, #32 + ld2 {v10.2d, v11.2d}, [pB] + add pB, pB, #32 + ld2 {v0.2d, v1.2d}, [pA] + add pA, pA, #32 + ld2 {v2.2d, v3.2d}, [pA] + add pA, pA, #32 + + OP_rr v16.2d, v0.2d, v8.2d[0] + OP_ii v16.2d, v1.2d, v9.2d[0] + OP_ri v17.2d, v0.2d, v9.2d[0] + OP_ir v17.2d, v1.2d, v8.2d[0] + + OP_rr v18.2d, v2.2d, v8.2d[0] + OP_ii v18.2d, v3.2d, v9.2d[0] + OP_ri v19.2d, v2.2d, v9.2d[0] + OP_ir v19.2d, v3.2d, v8.2d[0] + + OP_rr v20.2d, v0.2d, v8.2d[1] + OP_ii v20.2d, v1.2d, v9.2d[1] + OP_ri v21.2d, v0.2d, v9.2d[1] + OP_ir v21.2d, v1.2d, v8.2d[1] + + OP_rr v22.2d, v2.2d, v8.2d[1] + OP_ii v22.2d, v3.2d, v9.2d[1] + OP_ri v23.2d, 
v2.2d, v9.2d[1] + OP_ir v23.2d, v3.2d, v8.2d[1] + + OP_rr v24.2d, v0.2d, v10.2d[0] + OP_ii v24.2d, v1.2d, v11.2d[0] + OP_ri v25.2d, v0.2d, v11.2d[0] + OP_ir v25.2d, v1.2d, v10.2d[0] + + OP_rr v26.2d, v2.2d, v10.2d[0] + OP_ii v26.2d, v3.2d, v11.2d[0] + OP_ri v27.2d, v2.2d, v11.2d[0] + OP_ir v27.2d, v3.2d, v10.2d[0] + + OP_rr v28.2d, v0.2d, v10.2d[1] + OP_ii v28.2d, v1.2d, v11.2d[1] + OP_ri v29.2d, v0.2d, v11.2d[1] + OP_ir v29.2d, v1.2d, v10.2d[1] + + OP_rr v30.2d, v2.2d, v10.2d[1] + OP_ii v30.2d, v3.2d, v11.2d[1] + OP_ri v31.2d, v2.2d, v11.2d[1] + OP_ir v31.2d, v3.2d, v10.2d[1] +.endm + +.macro SAVE4x4 + fmov alpha0_R, alpha_save_R + fmov alpha0_I, alpha_save_I + fmov alpha1_R, alpha0_R + fmov alpha1_I, alpha0_I + + mov pCRow1, pCRow0 + + fmul v0.2d, v16.2d, alphaV0_R + fmls v0.2d, v17.2d, alphaV0_I + fmul v1.2d, v16.2d, alphaV1_I + fmla v1.2d, v17.2d, alphaV1_R + st2 {v0.2d, v1.2d}, [pCRow1] + add pCRow2, pCRow1, #32 + fmul v2.2d, v18.2d, alphaV0_R + fmls v2.2d, v19.2d, alphaV0_I + fmul v3.2d, v18.2d, alphaV1_I + fmla v3.2d, v19.2d, alphaV1_R + st2 {v2.2d, v3.2d}, [pCRow2] + + add pCRow1, pCRow1, LDC + fmul v4.2d, v20.2d, alphaV0_R + fmls v4.2d, v21.2d, alphaV0_I + fmul v5.2d, v20.2d, alphaV1_I + fmla v5.2d, v21.2d, alphaV1_R + st2 {v4.2d, v5.2d}, [pCRow1] + add pCRow2, pCRow1, #32 + fmul v6.2d, v22.2d, alphaV0_R + fmls v6.2d, v23.2d, alphaV0_I + fmul v7.2d, v22.2d, alphaV1_I + fmla v7.2d, v23.2d, alphaV1_R + st2 {v6.2d, v7.2d}, [pCRow2] + + add pCRow1, pCRow1, LDC + fmul v0.2d, v24.2d, alphaV0_R + fmls v0.2d, v25.2d, alphaV0_I + fmul v1.2d, v24.2d, alphaV1_I + fmla v1.2d, v25.2d, alphaV1_R + st2 {v0.2d, v1.2d}, [pCRow1] + add pCRow2, pCRow1, #32 + fmul v2.2d, v26.2d, alphaV0_R + fmls v2.2d, v27.2d, alphaV0_I + fmul v3.2d, v26.2d, alphaV1_I + fmla v3.2d, v27.2d, alphaV1_R + st2 {v2.2d, v3.2d}, [pCRow2] + + add pCRow1, pCRow1, LDC + + fmul v4.2d, v28.2d, alphaV0_R + fmls v4.2d, v29.2d, alphaV0_I + fmul v5.2d, v28.2d, alphaV1_I + fmla v5.2d, v29.2d, alphaV1_R + st2 {v4.2d, v5.2d}, [pCRow1] + add pCRow2, pCRow1, #32 + fmul v6.2d, v30.2d, alphaV0_R + fmls v6.2d, v31.2d, alphaV0_I + fmul v7.2d, v30.2d, alphaV1_I + fmla v7.2d, v31.2d, alphaV1_R + st2 {v6.2d, v7.2d}, [pCRow2] + + add pCRow0, pCRow0, #64 +.endm + +/******************************************************************************/ + +.macro INIT2x4 + fmov d16, xzr + fmov d17, xzr + fmov d20, d16 + fmov d21, d17 + fmov d24, d16 + fmov d25, d17 + fmov d28, d16 + fmov d29, d17 +.endm + +.macro KERNEL2x4_SUB + ld2 {v8.2d, v9.2d}, [pB] + add pB, pB, #32 + ld2 {v10.2d, v11.2d}, [pB] + add pB, pB, #32 + + ld2 {v0.2d, v1.2d}, [pA] + add pA, pA, #32 + + OP_rr v16.2d, v0.2d, v8.2d[0] + OP_ii v16.2d, v1.2d, v9.2d[0] + OP_ri v17.2d, v0.2d, v9.2d[0] + OP_ir v17.2d, v1.2d, v8.2d[0] + + OP_rr v20.2d, v0.2d, v8.2d[1] + OP_ii v20.2d, v1.2d, v9.2d[1] + OP_ri v21.2d, v0.2d, v9.2d[1] + OP_ir v21.2d, v1.2d, v8.2d[1] + + OP_rr v24.2d, v0.2d, v10.2d[0] + OP_ii v24.2d, v1.2d, v11.2d[0] + OP_ri v25.2d, v0.2d, v11.2d[0] + OP_ir v25.2d, v1.2d, v10.2d[0] + + OP_rr v28.2d, v0.2d, v10.2d[1] + OP_ii v28.2d, v1.2d, v11.2d[1] + OP_ri v29.2d, v0.2d, v11.2d[1] + OP_ir v29.2d, v1.2d, v10.2d[1] +.endm + +.macro SAVE2x4 + fmov alpha0_R, alpha_save_R + fmov alpha0_I, alpha_save_I + fmov alpha1_R, alpha0_R + fmov alpha1_I, alpha0_I + + mov pCRow1, pCRow0 + + fmul v0.2d, v16.2d, alphaV0_R + fmls v0.2d, v17.2d, alphaV0_I + fmul v1.2d, v16.2d, alphaV1_I + fmla v1.2d, v17.2d, alphaV1_R + st2 {v0.2d, v1.2d}, [pCRow1] + + add pCRow1, pCRow1, LDC + + fmul v4.2d, v20.2d, alphaV0_R 
+ fmls v4.2d, v21.2d, alphaV0_I + fmul v5.2d, v20.2d, alphaV1_I + fmla v5.2d, v21.2d, alphaV1_R + st2 {v4.2d, v5.2d}, [pCRow1] + + add pCRow1, pCRow1, LDC + + fmul v0.2d, v24.2d, alphaV0_R + fmls v0.2d, v25.2d, alphaV0_I + fmul v1.2d, v24.2d, alphaV1_I + fmla v1.2d, v25.2d, alphaV1_R + st2 {v0.2d, v1.2d}, [pCRow1] + + add pCRow1, pCRow1, LDC + + fmul v4.2d, v28.2d, alphaV0_R + fmls v4.2d, v29.2d, alphaV0_I + fmul v5.2d, v28.2d, alphaV1_I + fmla v5.2d, v29.2d, alphaV1_R + st2 {v4.2d, v5.2d}, [pCRow1] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + +.macro INIT1x4 + fmov d16, xzr + fmov d17, xzr + fmov d20, d16 + fmov d21, d17 + fmov d24, d16 + fmov d25, d17 + fmov d28, d16 + fmov d29, d17 +.endm + +.macro KERNEL1x4_SUB + ld2 {v8.2d, v9.2d}, [pB] + add pB, pB, #32 + ld2 {v10.2d, v11.2d}, [pB] + add pB, pB, #32 + ld2 {v0.d, v1.d}[0], [pA] + add pA, pA, #16 + + OP_rr d16, d0, v8.2d[0] + OP_ii d16, d1, v9.2d[0] + OP_ri d17, d0, v9.2d[0] + OP_ir d17, d1, v8.2d[0] + + OP_rr d20, d0, v8.2d[1] + OP_ii d20, d1, v9.2d[1] + OP_ri d21, d0, v9.2d[1] + OP_ir d21, d1, v8.2d[1] + + OP_rr d24, d0, v10.2d[0] + OP_ii d24, d1, v11.2d[0] + OP_ri d25, d0, v11.2d[0] + OP_ir d25, d1, v10.2d[0] + + OP_rr d28, d0, v10.2d[1] + OP_ii d28, d1, v11.2d[1] + OP_ri d29, d0, v11.2d[1] + OP_ir d29, d1, v10.2d[1] +.endm + +.macro SAVE1x4 + fmov alpha0_R, alpha_save_R + fmov alpha0_I, alpha_save_I + fmov alpha1_R, alpha0_R + fmov alpha1_I, alpha0_I + + mov pCRow1, pCRow0 + + fmul d0, d16, alphaV0_R + fmls d0, d17, alphaV0_I + fmul d1, d16, alphaV1_I + fmla d1, d17, alphaV1_R + st2 {v0.d, v1.d}[0], [pCRow1] + + add pCRow1, pCRow1, LDC + + fmul d4, d20, alphaV0_R + fmls d4, d21, alphaV0_I + fmul d5, d20, alphaV1_I + fmla d5, d21, alphaV1_R + st2 {v4.d, v5.d}[0], [pCRow1] + + add pCRow1, pCRow1, LDC + + fmul d0, d24, alphaV0_R + fmls d0, d25, alphaV0_I + fmul d1, d24, alphaV1_I + fmla d1, d25, alphaV1_R + st2 {v0.d, v1.d}[0], [pCRow1] + + add pCRow1, pCRow1, LDC + + fmul d4, d28, alphaV0_R + fmls d4, d29, alphaV0_I + fmul d5, d28, alphaV1_I + fmla d5, d29, alphaV1_R + st2 {v4.d, v5.d}[0], [pCRow1] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT4x2 + fmov d16, xzr + fmov d17, xzr + fmov d18, d16 + fmov d19, d17 + fmov d20, d16 + fmov d21, d17 + fmov d22, d16 + fmov d23, d17 +.endm + +.macro KERNEL4x2_SUB + ld2 {v8.2d, v9.2d}, [pB] + add pB, pB, #32 + ld2 {v0.2d, v1.2d}, [pA] + add pA, pA, #32 + ld2 {v2.2d, v3.2d}, [pA] + add pA, pA, #32 + + OP_rr v16.2d, v0.2d, v8.2d[0] + OP_ii v16.2d, v1.2d, v9.2d[0] + OP_ri v17.2d, v0.2d, v9.2d[0] + OP_ir v17.2d, v1.2d, v8.2d[0] + + OP_rr v18.2d, v2.2d, v8.2d[0] + OP_ii v18.2d, v3.2d, v9.2d[0] + OP_ri v19.2d, v2.2d, v9.2d[0] + OP_ir v19.2d, v3.2d, v8.2d[0] + + OP_rr v20.2d, v0.2d, v8.2d[1] + OP_ii v20.2d, v1.2d, v9.2d[1] + OP_ri v21.2d, v0.2d, v9.2d[1] + OP_ir v21.2d, v1.2d, v8.2d[1] + + OP_rr v22.2d, v2.2d, v8.2d[1] + OP_ii v22.2d, v3.2d, v9.2d[1] + OP_ri v23.2d, v2.2d, v9.2d[1] + OP_ir v23.2d, v3.2d, v8.2d[1] +.endm + +.macro SAVE4x2 + fmov alpha0_R, alpha_save_R + fmov alpha0_I, alpha_save_I + fmov alpha1_R, alpha0_R + fmov alpha1_I, alpha0_I + + mov pCRow1, pCRow0 + + fmul v0.2d, v16.2d, alphaV0_R + fmls v0.2d, v17.2d, alphaV0_I + fmul v1.2d, v16.2d, alphaV1_I + fmla v1.2d, v17.2d, alphaV1_R + st2 {v0.2d, v1.2d}, [pCRow1] + add pCRow2, pCRow1, #32 + fmul v2.2d, v18.2d, alphaV0_R + fmls v2.2d, v19.2d, alphaV0_I + fmul v3.2d, v18.2d, 
alphaV1_I + fmla v3.2d, v19.2d, alphaV1_R + st2 {v2.2d, v3.2d}, [pCRow2] + + add pCRow1, pCRow1, LDC + + fmul v4.2d, v20.2d, alphaV0_R + fmls v4.2d, v21.2d, alphaV0_I + fmul v5.2d, v20.2d, alphaV1_I + fmla v5.2d, v21.2d, alphaV1_R + st2 {v4.2d, v5.2d}, [pCRow1] + add pCRow2, pCRow1, #32 + fmul v6.2d, v22.2d, alphaV0_R + fmls v6.2d, v23.2d, alphaV0_I + fmul v7.2d, v22.2d, alphaV1_I + fmla v7.2d, v23.2d, alphaV1_R + st2 {v6.2d, v7.2d}, [pCRow2] + + add pCRow0, pCRow0, #64 +.endm + +/******************************************************************************/ + +.macro INIT2x2 + fmov d16, xzr + fmov d17, xzr + fmov d20, d16 + fmov d21, d17 +.endm + +.macro KERNEL2x2_SUB + ld2 {v8.2d, v9.2d}, [pB] + add pB, pB, #32 + ld2 {v0.2d, v1.2d}, [pA] + add pA, pA, #32 + + OP_rr v16.2d, v0.2d, v8.2d[0] + OP_ii v16.2d, v1.2d, v9.2d[0] + OP_ri v17.2d, v0.2d, v9.2d[0] + OP_ir v17.2d, v1.2d, v8.2d[0] + + OP_rr v20.2d, v0.2d, v8.2d[1] + OP_ii v20.2d, v1.2d, v9.2d[1] + OP_ri v21.2d, v0.2d, v9.2d[1] + OP_ir v21.2d, v1.2d, v8.2d[1] +.endm + +.macro SAVE2x2 + fmov alpha0_R, alpha_save_R + fmov alpha0_I, alpha_save_I + fmov alpha1_R, alpha0_R + fmov alpha1_I, alpha0_I + + mov pCRow1, pCRow0 + + fmul v0.2d, v16.2d, alphaV0_R + fmls v0.2d, v17.2d, alphaV0_I + fmul v1.2d, v16.2d, alphaV1_I + fmla v1.2d, v17.2d, alphaV1_R + st2 {v0.2d, v1.2d}, [pCRow1] + + add pCRow1, pCRow1, LDC + + fmul v4.2d, v20.2d, alphaV0_R + fmls v4.2d, v21.2d, alphaV0_I + fmul v5.2d, v20.2d, alphaV1_I + fmla v5.2d, v21.2d, alphaV1_R + st2 {v4.2d, v5.2d}, [pCRow1] + + add pCRow0, pCRow0, #32 +.endm + +/******************************************************************************/ + +.macro INIT1x2 + fmov d16, xzr + fmov d17, xzr + fmov d20, xzr + fmov d21, xzr +.endm + +.macro KERNEL1x2_SUB + ld2 {v8.2d, v9.2d}, [pB] + add pB, pB, #32 + ld2 {v0.d, v1.d}[0], [pA] + add pA, pA, #16 + + OP_rr d16, d0, v8.2d[0] + OP_ii d16, d1, v9.2d[0] + OP_ri d17, d0, v9.2d[0] + OP_ir d17, d1, v8.2d[0] + + OP_rr d20, d0, v8.2d[1] + OP_ii d20, d1, v9.2d[1] + OP_ri d21, d0, v9.2d[1] + OP_ir d21, d1, v8.2d[1] +.endm + +.macro SAVE1x2 + fmov alpha0_R, alpha_save_R + fmov alpha0_I, alpha_save_I + fmov alpha1_R, alpha0_R + fmov alpha1_I, alpha0_I + + mov pCRow1, pCRow0 + + fmul d0, d16, alphaV0_R + fmls d0, d17, alphaV0_I + fmul d1, d16, alphaV1_I + fmla d1, d17, alphaV1_R + st2 {v0.d, v1.d}[0], [pCRow1] + + add pCRow1, pCRow1, LDC + + fmul d4, d20, alphaV0_R + fmls d4, d21, alphaV0_I + fmul d5, d20, alphaV1_I + fmla d5, d21, alphaV1_R + st2 {v4.d, v5.d}[0], [pCRow1] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************/ + +.macro INIT4x1 + fmov d16, xzr + fmov d17, d16 + fmov d18, d16 + fmov d19, d17 +.endm + +.macro KERNEL4x1_SUB + ld2 {v8.d, v9.d}[0], [pB] + add pB, pB, #16 + ld2 {v0.2d, v1.2d}, [pA] + add pA, pA, #32 + ld2 {v2.2d, v3.2d}, [pA] + add pA, pA, #32 + + OP_rr v16.2d, v0.2d, v8.d[0] + OP_ii v16.2d, v1.2d, v9.d[0] + OP_ri v17.2d, v0.2d, v9.d[0] + OP_ir v17.2d, v1.2d, v8.d[0] + + OP_rr v18.2d, v2.2d, v8.d[0] + OP_ii v18.2d, v3.2d, v9.d[0] + OP_ri v19.2d, v2.2d, v9.d[0] + OP_ir v19.2d, v3.2d, v8.d[0] +.endm + +.macro SAVE4x1 + fmov alpha0_R, alpha_save_R + fmov alpha0_I, alpha_save_I + fmov alpha1_R, alpha0_R + fmov alpha1_I, alpha0_I + + mov pCRow1, pCRow0 + + fmul v0.2d, v16.2d, alphaV0_R + fmls v0.2d, v17.2d, alphaV0_I + fmul v1.2d, v16.2d, alphaV1_I + fmla v1.2d, v17.2d, alphaV1_R + st2 {v0.2d, v1.2d}, [pCRow1] + add pCRow2, pCRow1, #32 + fmul v2.2d, v18.2d, alphaV0_R + fmls v2.2d, 
v19.2d, alphaV0_I + fmul v3.2d, v18.2d, alphaV1_I + fmla v3.2d, v19.2d, alphaV1_R + st2 {v2.2d, v3.2d}, [pCRow2] + + add pCRow0, pCRow0, #64 +.endm + +/******************************************************************************/ + +.macro INIT2x1 + fmov d16, xzr + fmov d17, xzr +.endm + +.macro KERNEL2x1_SUB + ld2 {v8.d, v9.d}[0], [pB] + add pB, pB, #16 + ld2 {v0.2d, v1.2d}, [pA] + add pA, pA, #32 + + OP_rr v16.2d, v0.2d, v8.d[0] + OP_ii v16.2d, v1.2d, v9.d[0] + OP_ri v17.2d, v0.2d, v9.d[0] + OP_ir v17.2d, v1.2d, v8.d[0] +.endm + +.macro SAVE2x1 + fmov alpha0_R, alpha_save_R + fmov alpha0_I, alpha_save_I + fmov alpha1_R, alpha0_R + fmov alpha1_I, alpha0_I + + mov pCRow1, pCRow0 + + fmul v0.2d, v16.2d, alphaV0_R + fmls v0.2d, v17.2d, alphaV0_I + fmul v1.2d, v16.2d, alphaV1_I + fmla v1.2d, v17.2d, alphaV1_R + st2 {v0.2d, v1.2d}, [pCRow1] + + add pCRow0, pCRow0, #32 + +.endm + +/******************************************************************************/ + +.macro INIT1x1 + fmov d16, xzr + fmov d17, xzr +.endm + +.macro KERNEL1x1_SUB + ld2 {v8.d, v9.d}[0], [pB] + add pB, pB, #16 + ld2 {v0.d, v1.d}[0], [pA] + add pA, pA, #16 + + OP_rr d16, d0, v8.d[0] + OP_ii d16, d1, v9.d[0] + OP_ri d17, d0, v9.d[0] + OP_ir d17, d1, v8.d[0] +.endm + +.macro SAVE1x1 + fmov alpha0_R, alpha_save_R + fmov alpha0_I, alpha_save_I + fmov alpha1_R, alpha0_R + fmov alpha1_I, alpha0_I + + mov pCRow1, pCRow0 + + fmul d0, d16, alphaV0_R + fmls d0, d17, alphaV0_I + fmul d1, d16, alphaV1_I + fmla d1, d17, alphaV1_R + st2 {v0.d, v1.d}[0], [pCRow1] + + add pCRow0, pCRow0, #16 +.endm + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + .align 5 + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] + + fmov alpha_save_R, d0 + fmov alpha_save_I, d1 + + lsl LDC, LDC, #4 // ldc = ldc * 2 * 8 + +#if !defined(LEFT) + neg tempOffset, offset +#endif + + mov pB, origPB + + mov counterJ, origN + asr counterJ, counterJ, #2 // J = J / 4 + cmp counterJ, #0 + ble ztrmm_kernel_L2_BEGIN + +ztrmm_kernel_L4_BEGIN: + mov pCRow0, pC // pCRow0 = C + add pC, pC, LDC, lsl #2 + +#if defined(LEFT) + mov tempOffset, offset +#endif + mov pA, origPA // pA = start of A array + +ztrmm_kernel_L4_M4_BEGIN: + + mov counterI, origM + asr counterI, counterI, #2 // counterI = counterI / 4 + cmp counterI, #0 + ble ztrmm_kernel_L4_M2_BEGIN + +ztrmm_kernel_L4_M4_20: + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #6 + add pB, pB, temp + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #4 +#else + add tempK, tempOffset, #4 +#endif + + asr counterL , tempK, #1 // L = K / 2 + cmp counterL , #2 // is there at least 4 to do? 
+ blt ztrmm_kernel_L4_M4_32 + + KERNEL4x4_I // do one in the K + KERNEL4x4_M2 // do another in the K + + subs counterL, counterL, #2 + ble ztrmm_kernel_L4_M4_22a + .align 5 + +ztrmm_kernel_L4_M4_22: + + KERNEL4x4_M1 + KERNEL4x4_M2 + + subs counterL, counterL, #1 + bgt ztrmm_kernel_L4_M4_22 + + +ztrmm_kernel_L4_M4_22a: + + KERNEL4x4_M1 + KERNEL4x4_E + + b ztrmm_kernel_L4_M4_44 + +ztrmm_kernel_L4_M4_32: + + tst counterL, #1 + ble ztrmm_kernel_L4_M4_40 + + KERNEL4x4_I + KERNEL4x4_E + + b ztrmm_kernel_L4_M4_44 + + +ztrmm_kernel_L4_M4_40: + + INIT4x4 + +ztrmm_kernel_L4_M4_44: + + ands counterL , tempK, #1 + ble ztrmm_kernel_L4_M4_100 + +ztrmm_kernel_L4_M4_46: + KERNEL4x4_SUB + +ztrmm_kernel_L4_M4_100: + + SAVE4x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #4 +#else + sub tempK, tempK, #4 +#endif + lsl temp, tempK, #6 + add pA, pA, temp + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #4 +#endif + +ztrmm_kernel_L4_M4_END: + subs counterI, counterI, #1 + bne ztrmm_kernel_L4_M4_20 + +ztrmm_kernel_L4_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble ztrmm_kernel_L4_END + + tst counterI, #2 // counterI = counterI / 2 + ble ztrmm_kernel_L4_M1_BEGIN + +ztrmm_kernel_L4_M2_20: + + INIT2x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #5 + add pA, pA, temp + lsl temp, tempOffset, #6 + add pB, pB, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #2 +#else + add tempK, tempOffset, #4 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble ztrmm_kernel_L4_M2_40 + +ztrmm_kernel_L4_M2_22: + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + KERNEL2x4_SUB + + subs counterL, counterL, #1 + bgt ztrmm_kernel_L4_M2_22 + + +ztrmm_kernel_L4_M2_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble ztrmm_kernel_L4_M2_100 + +ztrmm_kernel_L4_M2_42: + + KERNEL2x4_SUB + + subs counterL, counterL, #1 + bgt ztrmm_kernel_L4_M2_42 + +ztrmm_kernel_L4_M2_100: + + SAVE2x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #2 +#else + sub tempK, tempK, #4 +#endif + lsl temp, tempK, #5 + add pA, pA, temp + lsl temp, tempK, #6 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #2 +#endif + +ztrmm_kernel_L4_M2_END: + + +ztrmm_kernel_L4_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble ztrmm_kernel_L4_END + +ztrmm_kernel_L4_M1_20: + + INIT1x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #6 + add pB, pB, temp + lsl temp, tempOffset, #4 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #1 +#else + add tempK, tempOffset, #4 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble ztrmm_kernel_L4_M1_40 + +ztrmm_kernel_L4_M1_22: + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + KERNEL1x4_SUB + + 
subs counterL, counterL, #1 + bgt ztrmm_kernel_L4_M1_22 + + +ztrmm_kernel_L4_M1_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble ztrmm_kernel_L4_M1_100 + +ztrmm_kernel_L4_M1_42: + + KERNEL1x4_SUB + + subs counterL, counterL, #1 + bgt ztrmm_kernel_L4_M1_42 + +ztrmm_kernel_L4_M1_100: + + SAVE1x4 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #1 +#else + sub tempK, tempK, #4 +#endif + lsl temp, tempK, #4 + add pA, pA, temp + lsl temp, tempK, #6 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #1 +#endif + + +ztrmm_kernel_L4_END: + + lsl temp, origK, #6 + add origPB, origPB, temp // B = B + K * 4 * 8 * 2 + +#if !defined(LEFT) + add tempOffset, tempOffset, #4 +#endif + + subs counterJ, counterJ , #1 // j-- + bgt ztrmm_kernel_L4_BEGIN + + +/******************************************************************************/ + +ztrmm_kernel_L2_BEGIN: // less than 2 left in N direction + + mov counterJ , origN + tst counterJ , #3 + ble ztrmm_kernel_L999 // error, N was less than 4? + + tst counterJ , #2 + ble ztrmm_kernel_L1_BEGIN + + mov pCRow0, pC // pCRow0 = pC + + add pC,pC,LDC, lsl #1 + +#if defined(LEFT) + mov tempOffset, offset +#endif + + mov pA, origPA // pA = A + +ztrmm_kernel_L2_M4_BEGIN: + + mov counterI, origM + asr counterI, counterI, #2 // counterI = counterI / 4 + cmp counterI,#0 + ble ztrmm_kernel_L2_M2_BEGIN + +ztrmm_kernel_L2_M4_20: + + INIT4x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #5 + add pB, pB, temp + lsl temp, tempOffset, #6 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #4 +#else + add tempK, tempOffset, #2 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble ztrmm_kernel_L2_M4_40 + .align 5 + +ztrmm_kernel_L2_M4_22: + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + KERNEL4x2_SUB + + subs counterL, counterL, #1 + bgt ztrmm_kernel_L2_M4_22 + + +ztrmm_kernel_L2_M4_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble ztrmm_kernel_L2_M4_100 + +ztrmm_kernel_L2_M4_42: + + KERNEL4x2_SUB + + subs counterL, counterL, #1 + bgt ztrmm_kernel_L2_M4_42 + +ztrmm_kernel_L2_M4_100: + + SAVE4x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #4 +#else + sub tempK, tempK, #2 +#endif + lsl temp, tempK, #6 + add pA, pA, temp + lsl temp, tempK, #5 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #4 +#endif + +ztrmm_kernel_L2_M4_END: + + subs counterI, counterI, #1 + bgt ztrmm_kernel_L2_M4_20 + + +ztrmm_kernel_L2_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble ztrmm_kernel_L2_END + + tst counterI, #2 // counterI = counterI / 2 + ble ztrmm_kernel_L2_M1_BEGIN + +ztrmm_kernel_L2_M2_20: + + INIT2x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #5 + add pB, pB, temp + lsl temp, tempOffset, #5 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, 
tempOffset, #2 +#else + add tempK, tempOffset, #2 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL,#0 + ble ztrmm_kernel_L2_M2_40 + +ztrmm_kernel_L2_M2_22: + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + KERNEL2x2_SUB + + subs counterL, counterL, #1 + bgt ztrmm_kernel_L2_M2_22 + + +ztrmm_kernel_L2_M2_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble ztrmm_kernel_L2_M2_100 + +ztrmm_kernel_L2_M2_42: + + KERNEL2x2_SUB + + subs counterL, counterL, #1 + bgt ztrmm_kernel_L2_M2_42 + +ztrmm_kernel_L2_M2_100: + + SAVE2x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #2 +#else + sub tempK, tempK, #2 +#endif + lsl temp, tempK, #5 + add pA, pA, temp + lsl temp, tempK, #5 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #2 +#endif + +ztrmm_kernel_L2_M2_END: + + +ztrmm_kernel_L2_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble ztrmm_kernel_L2_END + +ztrmm_kernel_L2_M1_20: + + INIT1x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #5 + add pB, pB, temp + lsl temp, tempOffset, #4 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #1 +#else + add tempK, tempOffset, #2 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL, #0 + ble ztrmm_kernel_L2_M1_40 + +ztrmm_kernel_L2_M1_22: + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + KERNEL1x2_SUB + + subs counterL, counterL, #1 + bgt ztrmm_kernel_L2_M1_22 + + +ztrmm_kernel_L2_M1_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble ztrmm_kernel_L2_M1_100 + +ztrmm_kernel_L2_M1_42: + + KERNEL1x2_SUB + + subs counterL, counterL, #1 + bgt ztrmm_kernel_L2_M1_42 + +ztrmm_kernel_L2_M1_100: + + SAVE1x2 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #1 +#else + sub tempK, tempK, #2 +#endif + lsl temp, tempK, #4 + add pA, pA, temp + lsl temp, tempK, #5 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #1 +#endif + + +ztrmm_kernel_L2_END: +#if !defined(LEFT) + add tempOffset, tempOffset, #2 +#endif + + lsl temp, origK, #5 + add origPB, origPB, temp // B = B + K * 2 * 8 * 2 + +/******************************************************************************/ + +ztrmm_kernel_L1_BEGIN: + + mov counterJ , origN + tst counterJ , #1 + ble ztrmm_kernel_L999 // done + + + mov pCRow0, pC // pCRow0 = C + add pC , pC , LDC // Update pC to point to next + +#if defined(LEFT) + mov tempOffset, offset +#endif + + mov pA, origPA // pA = A + + + +ztrmm_kernel_L1_M4_BEGIN: + + mov counterI, origM + asr counterI, counterI, #2 // counterI = counterI / 4 + cmp counterI, #0 + ble ztrmm_kernel_L1_M2_BEGIN + +ztrmm_kernel_L1_M4_20: + + INIT4x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #4 + add pB, pB, temp + lsl temp, tempOffset, #6 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset 
+#elif defined(LEFT) + add tempK, tempOffset, #4 +#else + add tempK, tempOffset, #1 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble ztrmm_kernel_L1_M4_40 + .align 5 + +ztrmm_kernel_L1_M4_22: + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + KERNEL4x1_SUB + + subs counterL, counterL, #1 + bgt ztrmm_kernel_L1_M4_22 + + +ztrmm_kernel_L1_M4_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble ztrmm_kernel_L1_M4_100 + +ztrmm_kernel_L1_M4_42: + + KERNEL4x1_SUB + + subs counterL, counterL, #1 + bgt ztrmm_kernel_L1_M4_42 + +ztrmm_kernel_L1_M4_100: + + SAVE4x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #4 +#else + sub tempK, tempK, #1 +#endif + lsl temp, tempK, #6 + add pA, pA, temp + lsl temp, tempK, #4 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #4 +#endif + +ztrmm_kernel_L1_M4_END: + + subs counterI, counterI, #1 + bgt ztrmm_kernel_L1_M4_20 + + +ztrmm_kernel_L1_M2_BEGIN: + + mov counterI, origM + tst counterI , #3 + ble ztrmm_kernel_L1_END + + tst counterI, #2 // counterI = counterI / 2 + ble ztrmm_kernel_L1_M1_BEGIN + +ztrmm_kernel_L1_M2_20: + + INIT2x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #4 + add pB, pB, temp + lsl temp, tempOffset, #5 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #2 +#else + add tempK, tempOffset, #1 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble ztrmm_kernel_L1_M2_40 + +ztrmm_kernel_L1_M2_22: + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + KERNEL2x1_SUB + + subs counterL, counterL, #1 + bgt ztrmm_kernel_L1_M2_22 + + +ztrmm_kernel_L1_M2_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble ztrmm_kernel_L1_M2_100 + +ztrmm_kernel_L1_M2_42: + + KERNEL2x1_SUB + + subs counterL, counterL, #1 + bgt ztrmm_kernel_L1_M2_42 + +ztrmm_kernel_L1_M2_100: + + SAVE2x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + sub tempK, origK, tempOffset +#if defined(LEFT) + sub tempK, tempK, #2 +#else + sub tempK, tempK, #1 +#endif + lsl temp, tempK, #5 + add pA, pA, temp + lsl temp, tempK, #4 + add pB, pB, temp +#endif +#if defined(LEFT) + add tempOffset, tempOffset, #2 +#endif + +ztrmm_kernel_L1_M2_END: + + +ztrmm_kernel_L1_M1_BEGIN: + + tst counterI, #1 // counterI = counterI % 2 + ble ztrmm_kernel_L1_END + +ztrmm_kernel_L1_M1_20: + + INIT1x1 + +#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) + mov pB, origPB +#else + mov pB, origPB + lsl temp, tempOffset, #4 + add pB, pB, temp + lsl temp, tempOffset, #4 + add pA, pA, temp +#endif + +#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + sub tempK, origK, tempOffset +#elif defined(LEFT) + add tempK, tempOffset, #1 +#else + add tempK, tempOffset, #1 +#endif + + asr counterL , tempK, #3 // counterL = counterL / 8 + cmp counterL , #0 + ble ztrmm_kernel_L1_M1_40 + +ztrmm_kernel_L1_M1_22: + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + KERNEL1x1_SUB + + subs counterL, 
counterL, #1 + bgt ztrmm_kernel_L1_M1_22 + + +ztrmm_kernel_L1_M1_40: + + ands counterL , tempK, #7 // counterL = counterL % 8 + ble ztrmm_kernel_L1_M1_100 + +ztrmm_kernel_L1_M1_42: + + KERNEL1x1_SUB + + subs counterL, counterL, #1 + bgt ztrmm_kernel_L1_M1_42 + +ztrmm_kernel_L1_M1_100: + + SAVE1x1 + + +ztrmm_kernel_L1_END: + + +ztrmm_kernel_L999: + mov x0, #0 // set return value + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) + ret + + EPILOGUE + diff --git a/kernel/generic/ztrmmkernel_4x4.c b/kernel/generic/ztrmmkernel_4x4.c new file mode 100755 index 000000000..9fc44a1e1 --- /dev/null +++ b/kernel/generic/ztrmmkernel_4x4.c @@ -0,0 +1,883 @@ +#include "common.h" + +#define MADD_ALPHA_N_STORE(C, res, alpha) \ + C[0] = res ## _r * alpha ## _r - res ## _i * alpha ## _i; \ + C[1] = res ## _r * alpha ## _i + res ## _i * alpha ## _r; + +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define MADD(res, op1, op2) \ + res ## _r += op1 ## _r * op2 ## _r; \ + res ## _r -= op1 ## _i * op2 ## _i; \ + res ## _i += op1 ## _r * op2 ## _i; \ + res ## _i += op1 ## _i * op2 ## _r; +#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) +#define MADD(res, op1, op2) \ + res ## _r += op1 ## _r * op2 ## _r; \ + res ## _r += op1 ## _i * op2 ## _i; \ + res ## _i -= op1 ## _r * op2 ## _i; \ + res ## _i += op1 ## _i * op2 ## _r; +#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) +#define MADD(res, op1, op2) \ + res ## _r += op1 ## _r * op2 ## _r; \ + res ## _r += op1 ## _i * op2 ## _i; \ + res ## _i += op1 ## _r * op2 ## _i; \ + res ## _i -= op1 ## _i * op2 ## _r; +#elif defined(RR) || defined(RC) || defined(CR) || defined(CC) +#define MADD(res, op1, op2) \ + res ## _r += op1 ## _r * op2 ## _r; \ + res ## _r -= op1 ## _i * op2 ## _i; \ + res ## _i -= op1 ## _r * op2 ## _i; \ + res ## _i -= op1 ## _i * op2 ## _r; +#endif + +int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha_r, FLOAT alpha_i,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc + , BLASLONG offset + ) +{ + + BLASLONG i,j,k; + FLOAT *C0,*C1,*C2,*C3,*ptrba,*ptrbb; + FLOAT res00_r, res01_r, res02_r, res03_r; + FLOAT res00_i, res01_i, res02_i, res03_i; + FLOAT res10_r, res11_r, res12_r, res13_r; + FLOAT res10_i, res11_i, res12_i, res13_i; + FLOAT res20_r, res21_r, res22_r, res23_r; + FLOAT res20_i, res21_i, res22_i, res23_i; + FLOAT res30_r, res31_r, res32_r, res33_r; + FLOAT res30_i, res31_i, res32_i, res33_i; + FLOAT a0_r, a1_r; + FLOAT a0_i, a1_i; + FLOAT b0_r, b1_r, b2_r, b3_r; + FLOAT b0_i, b1_i, b2_i, b3_i; + BLASLONG off, temp; + +#if defined(TRMMKERNEL) && !defined(LEFT) + off = -offset; +#endif + + for (j=0; j