From 925d4e1dc69a94de5733c03b022b7c2c7521935e Mon Sep 17 00:00:00 2001
From: Ashwin Sekhar T K
Date: Thu, 14 Jul 2016 13:46:01 +0530
Subject: [PATCH 1/5] Add IAMAX and NRM2 benchmarks

---
 benchmark/Makefile |  92 +++++++++++++++++++++-
 benchmark/iamax.c  | 190 +++++++++++++++++++++++++++++++++++++++++++++
 benchmark/nrm2.c   | 190 +++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 470 insertions(+), 2 deletions(-)
 create mode 100644 benchmark/iamax.c
 create mode 100644 benchmark/nrm2.c

diff --git a/benchmark/Makefile b/benchmark/Makefile
index e78750ec2..e801ce4eb 100644
--- a/benchmark/Makefile
+++ b/benchmark/Makefile
@@ -173,7 +173,9 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \
 	sgetri.goto dgetri.goto cgetri.goto zgetri.goto \
 	spotrf.goto dpotrf.goto cpotrf.goto zpotrf.goto \
 	ssymm.goto dsymm.goto csymm.goto zsymm.goto \
-	smallscaling
+	smallscaling \
+	isamax.goto idamax.goto icamax.goto izamax.goto \
+	snrm2.goto dnrm2.goto scnrm2.goto dznrm2.goto

 acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \
 	scholesky.acml dcholesky.acml ccholesky.acml zcholesky.acml \
@@ -226,7 +228,9 @@ atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \
 	sgesv.atlas dgesv.atlas cgesv.atlas zgesv.atlas \
 	sgetri.atlas dgetri.atlas cgetri.atlas zgetri.atlas \
 	spotrf.atlas dpotrf.atlas cpotrf.atlas zpotrf.atlas \
-	ssymm.atlas dsymm.atlas csymm.atlas zsymm.atlas
+	ssymm.atlas dsymm.atlas csymm.atlas zsymm.atlas \
+	isamax.atlas idamax.atlas icamax.atlas izamax.atlas \
+	snrm2.atlas dnrm2.atlas scnrm2.atlas dznrm2.atlas

 mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \
 	scholesky.mkl dcholesky.mkl ccholesky.mkl zcholesky.mkl \
@@ -1937,6 +1941,63 @@ zgemm3m.mkl : zgemm3m.$(SUFFIX)
 zgemm3m.veclib : zgemm3m.$(SUFFIX)
 	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)

+############################################## ISAMAX ##############################################
+isamax.goto : isamax.$(SUFFIX) ../$(LIBNAME)
+	$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
+
+isamax.atlas : isamax.$(SUFFIX)
+	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+############################################## IDAMAX ##############################################
+idamax.goto : idamax.$(SUFFIX) ../$(LIBNAME)
+	$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
+
+idamax.atlas : idamax.$(SUFFIX)
+	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+############################################## ICAMAX ##############################################
+icamax.goto : icamax.$(SUFFIX) ../$(LIBNAME)
+	$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
+
+icamax.atlas : icamax.$(SUFFIX)
+	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+############################################## IZAMAX ##############################################
+izamax.goto : izamax.$(SUFFIX) ../$(LIBNAME)
+	$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
+
+izamax.atlas : izamax.$(SUFFIX)
+	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+############################################## SNRM2 ##############################################
+snrm2.goto : snrm2.$(SUFFIX) ../$(LIBNAME)
+	$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
+
+snrm2.atlas : snrm2.$(SUFFIX)
+	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
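The goto targets above link each new driver against the locally built ../$(LIBNAME), while the atlas targets link against $(LIBATLAS). For orientation, the short stand-alone C program below does roughly what one sample of these benchmarks measures: it fills a vector, then times a single ISAMAX and SNRM2 call. It is only an illustration; the cblas.h header, the cblas_isamax/cblas_snrm2 entry points and the "cc -O2 example.c -lopenblas -lm" build line are assumptions made here, whereas the drivers added by this patch call the Fortran-style BLASFUNC symbols and repeat each measurement OPENBLAS_LOOPS times.

    /* Stand-alone sketch, not part of the patch: roughly what one sample of
     * the new iamax/nrm2 benchmarks measures.  Assumes a CBLAS-enabled
     * libopenblas; build e.g. with:  cc -O2 example.c -lopenblas -lm       */
    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/time.h>
    #include <cblas.h>

    int main(void)
    {
    	int n = 1000000, inc_x = 1;
    	float *x = (float *)malloc(sizeof(float) * (size_t)n);
    	struct timeval start, stop;
    	int i;

    	for (i = 0; i < n; i++)
    		x[i] = ((float)rand() / (float)RAND_MAX) - 0.5f;

    	gettimeofday(&start, NULL);
    	size_t imax = cblas_isamax(n, x, inc_x);  /* 0-based index of max |x[i]| */
    	float  nrm  = cblas_snrm2(n, x, inc_x);   /* Euclidean norm of x         */
    	gettimeofday(&stop, NULL);

    	double time1 = (double)(stop.tv_sec - start.tv_sec)
    	             + (double)(stop.tv_usec - start.tv_usec) * 1.e-6;
    	fprintf(stderr, "isamax = %zu  snrm2 = %f  %10.6f sec\n", imax, nrm, time1);

    	free(x);
    	return 0;
    }

The drivers themselves accept optional from, to and step arguments on the command line and print one result line per vector size.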
+
+############################################## DNRM2 ##############################################
+dnrm2.goto : dnrm2.$(SUFFIX) ../$(LIBNAME)
+	$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
+
+dnrm2.atlas : dnrm2.$(SUFFIX)
+	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+############################################## SCNRM2 ##############################################
+scnrm2.goto : scnrm2.$(SUFFIX) ../$(LIBNAME)
+	$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
+
+scnrm2.atlas : scnrm2.$(SUFFIX)
+	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+############################################## DZNRM2 ##############################################
+dznrm2.goto : dznrm2.$(SUFFIX) ../$(LIBNAME)
+	$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
+
+dznrm2.atlas : dznrm2.$(SUFFIX)
+	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
+
+
 ###################################################################################################

 slinpack.$(SUFFIX) : linpack.c
@@ -2243,6 +2304,33 @@ cgemm3m.$(SUFFIX) : gemm3m.c
 zgemm3m.$(SUFFIX) : gemm3m.c
 	$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
+
+isamax.$(SUFFIX) : iamax.c
+	$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
+
+idamax.$(SUFFIX) : iamax.c
+	$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^
+
+icamax.$(SUFFIX) : iamax.c
+	$(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^
+
+izamax.$(SUFFIX) : iamax.c
+	$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
+
+
+snrm2.$(SUFFIX) : nrm2.c
+	$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
+
+dnrm2.$(SUFFIX) : nrm2.c
+	$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^
+
+scnrm2.$(SUFFIX) : nrm2.c
+	$(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^
+
+dznrm2.$(SUFFIX) : nrm2.c
+	$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
+
+
 smallscaling: smallscaling.c ../$(LIBNAME)
 	$(CC) $(CFLAGS) -o $(@F) $^ $(EXTRALIB) -fopenmp -lm -lpthread
diff --git a/benchmark/iamax.c b/benchmark/iamax.c
new file mode 100644
index 000000000..c55f41579
--- /dev/null
+++ b/benchmark/iamax.c
@@ -0,0 +1,190 @@
+/***************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED.
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + + +#undef IAMAX + +#ifdef COMPLEX +#ifdef DOUBLE +#define IAMAX BLASFUNC(izamax) +#else +#define IAMAX BLASFUNC(icamax) +#endif +#else +#ifdef DOUBLE +#define IAMAX BLASFUNC(idamax) +#else +#define IAMAX BLASFUNC(isamax) +#endif +#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x; + blasint m, i; + blasint inc_x=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Time\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + + fprintf(stderr, " %6d : ", (int)m); + + + for (l=0; l +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + + +#undef NRM2 + +#ifdef COMPLEX +#ifdef DOUBLE +#define NRM2 BLASFUNC(dznrm2) +#else +#define NRM2 BLASFUNC(scnrm2) +#endif +#else +#ifdef DOUBLE +#define NRM2 BLASFUNC(dnrm2) +#else +#define NRM2 BLASFUNC(snrm2) +#endif +#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef 
DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x; + blasint m, i; + blasint inc_x=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Time\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + + fprintf(stderr, " %6d : ", (int)m); + + + for (l=0; l Date: Thu, 14 Jul 2016 13:48:13 +0530 Subject: [PATCH 2/5] Add time prints in benchmark output --- benchmark/asum.c | 4 ++-- benchmark/axpy.c | 4 ++-- benchmark/copy.c | 4 ++-- benchmark/dot.c | 4 ++-- benchmark/gemv.c | 4 ++-- benchmark/rot.c | 4 ++-- benchmark/scal.c | 4 ++-- benchmark/swap.c | 4 ++-- benchmark/trmm.c | 4 ++-- benchmark/zdot.c | 4 ++-- 10 files changed, 20 insertions(+), 20 deletions(-) diff --git a/benchmark/asum.c b/benchmark/asum.c index beb6402f4..78ccdf47b 100644 --- a/benchmark/asum.c +++ b/benchmark/asum.c @@ -183,9 +183,9 @@ int main(int argc, char *argv[]){ timeg /= loops; #ifdef COMPLEX - fprintf(stderr, " %10.2f MFlops\n", 4. * (double)m / timeg * 1.e-6); + fprintf(stderr, " %10.2f MFlops %10.6f sec\n", 4. * (double)m / timeg * 1.e-6, timeg); #else - fprintf(stderr, " %10.2f MFlops\n", 2. * (double)m / timeg * 1.e-6); + fprintf(stderr, " %10.2f MFlops %10.6f sec\n", 2. * (double)m / timeg * 1.e-6, timeg); #endif } diff --git a/benchmark/axpy.c b/benchmark/axpy.c index a7206b690..37c7aeb63 100644 --- a/benchmark/axpy.c +++ b/benchmark/axpy.c @@ -190,8 +190,8 @@ int main(int argc, char *argv[]){ timeg /= loops; fprintf(stderr, - " %10.2f MFlops\n", - COMPSIZE * COMPSIZE * 2. * (double)m / timeg * 1.e-6); + " %10.2f MFlops %10.6f sec\n", + COMPSIZE * COMPSIZE * 2. 
* (double)m / timeg * 1.e-6, timeg); } diff --git a/benchmark/copy.c b/benchmark/copy.c index 15c45201c..ea5b38d68 100644 --- a/benchmark/copy.c +++ b/benchmark/copy.c @@ -190,8 +190,8 @@ int main(int argc, char *argv[]){ timeg /= loops; fprintf(stderr, - " %10.2f MBytes\n", - COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6); + " %10.2f MBytes %10.6f sec\n", + COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg); } diff --git a/benchmark/dot.c b/benchmark/dot.c index 4c8d6cc38..50d05e532 100644 --- a/benchmark/dot.c +++ b/benchmark/dot.c @@ -184,8 +184,8 @@ int main(int argc, char *argv[]){ timeg /= loops; fprintf(stderr, - " %10.2f MFlops\n", - COMPSIZE * COMPSIZE * 2. * (double)m / timeg * 1.e-6); + " %10.2f MFlops %10.6f sec\n", + COMPSIZE * COMPSIZE * 2. * (double)m / timeg * 1.e-6, timeg); } diff --git a/benchmark/gemv.c b/benchmark/gemv.c index 42af2825a..c06e829d9 100644 --- a/benchmark/gemv.c +++ b/benchmark/gemv.c @@ -221,7 +221,7 @@ int main(int argc, char *argv[]){ timeg /= loops; - fprintf(stderr, " %10.2f MFlops\n", COMPSIZE * COMPSIZE * 2. * (double)m * (double)n / timeg * 1.e-6); + fprintf(stderr, " %10.2f MFlops %10.6f sec\n", COMPSIZE * COMPSIZE * 2. * (double)m * (double)n / timeg * 1.e-6, timeg); } } @@ -258,7 +258,7 @@ int main(int argc, char *argv[]){ timeg /= loops; - fprintf(stderr, " %10.2f MFlops\n", COMPSIZE * COMPSIZE * 2. * (double)m * (double)n / timeg * 1.e-6); + fprintf(stderr, " %10.2f MFlops %10.6f sec\n", COMPSIZE * COMPSIZE * 2. * (double)m * (double)n / timeg * 1.e-6, timeg); } } diff --git a/benchmark/rot.c b/benchmark/rot.c index 32322bebb..3ff783cc6 100644 --- a/benchmark/rot.c +++ b/benchmark/rot.c @@ -186,8 +186,8 @@ int main(int argc, char *argv[]){ timeg /= loops; fprintf(stderr, - " %10.2f MFlops\n", - COMPSIZE * COMPSIZE * 6. * (double)m / timeg * 1.e-6); + " %10.2f MFlops %10.6f sec\n", + COMPSIZE * COMPSIZE * 6. * (double)m / timeg * 1.e-6, timeg); } diff --git a/benchmark/scal.c b/benchmark/scal.c index 4c2da4d30..453c3234d 100644 --- a/benchmark/scal.c +++ b/benchmark/scal.c @@ -189,9 +189,9 @@ int main(int argc, char *argv[]){ timeg /= loops; #ifdef COMPLEX - fprintf(stderr, " %10.2f MFlops\n", 6. * (double)m / timeg * 1.e-6); + fprintf(stderr, " %10.2f MFlops %10.6f sec\n", 6. * (double)m / timeg * 1.e-6, timeg); #else - fprintf(stderr, " %10.2f MFlops\n", 1. * (double)m / timeg * 1.e-6); + fprintf(stderr, " %10.2f MFlops %10.6f sec\n", 1. * (double)m / timeg * 1.e-6, timeg); #endif } diff --git a/benchmark/swap.c b/benchmark/swap.c index 9f108ef50..368c59cd4 100644 --- a/benchmark/swap.c +++ b/benchmark/swap.c @@ -190,8 +190,8 @@ int main(int argc, char *argv[]){ timeg /= loops; fprintf(stderr, - " %10.2f MBytes\n", - COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6); + " %10.2f MBytes %10.6f sec\n", + COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg); } diff --git a/benchmark/trmm.c b/benchmark/trmm.c index f81e9d912..54c7972db 100644 --- a/benchmark/trmm.c +++ b/benchmark/trmm.c @@ -191,8 +191,8 @@ int main(int argc, char *argv[]){ gettimeofday( &start, (struct timezone *)0); fprintf(stderr, - " %10.2f MFlops\n", - COMPSIZE * COMPSIZE * 1. * (double)m * (double)m * (double)m / time1 * 1.e-6); + " %10.2f MFlops %10.6f sec\n", + COMPSIZE * COMPSIZE * 1. 
* (double)m * (double)m * (double)m / time1 * 1.e-6, time1); } diff --git a/benchmark/zdot.c b/benchmark/zdot.c index d5ec99726..ed9d4d2e8 100644 --- a/benchmark/zdot.c +++ b/benchmark/zdot.c @@ -184,8 +184,8 @@ int main(int argc, char *argv[]){ timeg /= loops; fprintf(stderr, - " %10.2f MFlops\n", - COMPSIZE * COMPSIZE * 2. * (double)m / timeg * 1.e-6); + " %10.2f MFlops %10.6f sec\n", + COMPSIZE * COMPSIZE * 2. * (double)m / timeg * 1.e-6, timeg); } From 78782485b6f859d72be854ba6c2a0ec52d137adb Mon Sep 17 00:00:00 2001 From: Ashwin Sekhar T K Date: Thu, 14 Jul 2016 13:49:15 +0530 Subject: [PATCH 3/5] Improvements to COPY and IAMAX kernels --- kernel/arm64/copy.S | 46 ++++----- kernel/arm64/iamax.S | 184 +++++++++++++++++++++++++++++++++++ kernel/arm64/izamax.S | 217 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 424 insertions(+), 23 deletions(-) diff --git a/kernel/arm64/copy.S b/kernel/arm64/copy.S index 17aa5a1e8..70eab96fb 100644 --- a/kernel/arm64/copy.S +++ b/kernel/arm64/copy.S @@ -58,43 +58,43 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. str TMPF, [Y], #SZ #else #if !defined(DOUBLE) - ld1 {v0.2s}, [X], #8 - st1 {v0.2s}, [Y], #8 + ldr d0, [X], #8 + str d0, [Y], #8 #else - ld1 {v0.2d}, [X], #16 - st1 {v0.2d}, [Y], #16 + ldr q0, [X], #16 + str q0, [Y], #16 #endif #endif .endm .macro KERNEL_F4 - #if !defined(COMPLEX) #if !defined(DOUBLE) - ld1 {v0.4s}, [X], #16 - st1 {v0.4s}, [Y], #16 + ldr q0, [X], #16 + str q0, [Y], #16 #else // DOUBLE - ld1 {v0.4s}, [X], #16 - ld1 {v1.4s}, [X], #16 - st1 {v0.4s}, [Y], #16 - st1 {v1.4s}, [Y], #16 + ldr q0, [X], #16 + str q0, [Y], #16 + ldr q1, [X], #16 + str q1, [Y], #16 + #endif #else // COMPLEX #if !defined(DOUBLE) - ld1 {v0.4s}, [X], #16 - ld1 {v1.4s}, [X], #16 - st1 {v0.4s}, [Y], #16 - st1 {v1.4s}, [Y], #16 + ldr q0, [X], #16 + str q0, [Y], #16 + ldr q1, [X], #16 + str q1, [Y], #16 #else // DOUBLE - ld1 {v0.4s}, [X], #16 - ld1 {v1.4s}, [X], #16 - ld1 {v2.4s}, [X], #16 - ld1 {v3.4s}, [X], #16 - st1 {v0.4s}, [Y], #16 - st1 {v1.4s}, [Y], #16 - st1 {v2.4s}, [Y], #16 - st1 {v3.4s}, [Y], #16 + ldr q0, [X], #16 + str q0, [Y], #16 + ldr q1, [X], #16 + str q1, [Y], #16 + ldr q2, [X], #16 + str q2, [Y], #16 + ldr q3, [X], #16 + str q3, [Y], #16 #endif #endif diff --git a/kernel/arm64/iamax.S b/kernel/arm64/iamax.S index 575c15e53..6c0d84f98 100644 --- a/kernel/arm64/iamax.S +++ b/kernel/arm64/iamax.S @@ -72,6 +72,148 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
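The hunk below adds a blocked inner kernel to iamax.S. KERNEL_F8 reduces eight |x[i]| values at a time with fabs/fmax/fmaxv and, via csel, only remembers the running maximum and the index at which the winning block of eight starts; KERNEL_F8_FINALIZE then reloads that single block and compares each lane against MAXF to recover the exact 1-based result. The plain C sketch below shows the same two-phase strategy; the function name and the scalar code are illustrative only, not OpenBLAS sources (build with cc example.c -lm, and it assumes n >= 1).

    /* Editor's illustration of the KERNEL_F8 / KERNEL_F8_FINALIZE strategy:
     * track only the block maximum and the block start index in the main
     * loop, then rescan the winning block once.  Returns a 1-based index
     * like ISAMAX. */
    #include <math.h>
    #include <stdio.h>

    static int isamax_blocked(int n, const float *x)
    {
    	float maxf = fabsf(x[0]);
    	int   block_start = 0;      /* start of the block holding the max */
    	int   i, j;

    	for (i = 0; i + 8 <= n; i += 8) {             /* KERNEL_F8 */
    		float m = fabsf(x[i]);
    		for (j = 1; j < 8; j++)
    			if (fabsf(x[i + j]) > m) m = fabsf(x[i + j]);
    		if (m > maxf) { maxf = m; block_start = i; }
    	}
    	for (; i < n; i++)                            /* KERNEL_S1 tail */
    		if (fabsf(x[i]) > maxf) { maxf = fabsf(x[i]); block_start = i; }

    	/* exact-equality rescan of one block, as the fcmp/csel sequence does */
    	for (j = 0; j < 8 && block_start + j < n; j++)   /* KERNEL_F8_FINALIZE */
    		if (fabsf(x[block_start + j]) == maxf)
    			return block_start + j + 1;
    	return block_start + 1;
    }

    int main(void)
    {
    	float x[11] = { 1, -2, 3, -9, 5, 0, 7, -1, 2, 8, -4 };
    	printf("%d\n", isamax_blocked(11, x));   /* prints 4 (|x[3]| = 9) */
    	return 0;
    }

Doing the per-lane index bookkeeping once per block rather than once per element is what keeps the vector loop free of per-element branches.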
fabs MAXF, MAXF .endm +.macro KERNEL_F8 +#if !defined(DOUBLE) + ldp q2, q3, [X], #32 + fabs v2.4s, v2.4s + fabs v3.4s, v3.4s + fmax v2.4s, v2.4s, v3.4s + fmaxv TMPF, v2.4s + fcmp MAXF, TMPF + fcsel MAXF, MAXF, TMPF, COND + csel INDEX, INDEX, Z, COND + add Z, Z, #8 +#else + ldp q2, q3, [X], #32 + ldp q4, q5, [X], #32 + fabs v2.2d, v2.2d + fabs v3.2d, v3.2d + fabs v4.2d, v4.2d + fabs v5.2d, v5.2d + + fmax v2.2d, v2.2d, v3.2d + fmax v4.2d, v4.2d, v5.2d + fmax v2.2d, v2.2d, v4.2d + fmaxp TMPF, v2.2d + + fcmp MAXF, TMPF + fcsel MAXF, MAXF, TMPF, COND + csel INDEX, INDEX, Z, COND + add Z, Z, #8 +#endif + PRFM PLDL1KEEP, [X, #1024] +.endm + +.macro KERNEL_F8_FINALIZE + sub x6, INDEX, #1 +#if !defined(DOUBLE) + lsl x6, x6, #2 + add x7, x7, x6 + ldp q2, q3, [x7] + fabs v2.4s, v2.4s + fabs v3.4s, v3.4s + + ins v4.s[0], v3.s[0] + ins v5.s[0], v3.s[1] + ins v6.s[0], v3.s[2] + ins v7.s[0], v3.s[3] + + add x6, INDEX, #7 + fcmp MAXF, s7 + csel INDEX, x6, INDEX, eq + + sub x6, x6, #1 + fcmp MAXF, s6 + csel INDEX, x6, INDEX, eq + + sub x6, x6, #1 + fcmp MAXF, s5 + csel INDEX, x6, INDEX, eq + + sub x6, x6, #1 + fcmp MAXF, s4 + csel INDEX, x6, INDEX, eq + + ins v4.s[0], v2.s[0] + ins v5.s[0], v2.s[1] + ins v6.s[0], v2.s[2] + ins v7.s[0], v2.s[3] + + sub x6, x6, #1 + fcmp MAXF, s7 + csel INDEX, x6, INDEX, eq + + sub x6, x6, #1 + fcmp MAXF, s6 + csel INDEX, x6, INDEX, eq + + sub x6, x6, #1 + fcmp MAXF, s5 + csel INDEX, x6, INDEX, eq + + sub x6, x6, #1 + fcmp MAXF, s4 + csel INDEX, x6, INDEX, eq +#else + add x6, x6, #4 + lsl x6, x6, #3 + add x7, x7, x6 + ldp q2, q3, [x7] + + fabs v2.2d, v2.2d + fabs v3.2d, v3.2d + + ins v4.d[0], v2.d[0] + ins v5.d[0], v2.d[1] + ins v6.d[0], v3.d[0] + ins v7.d[0], v3.d[1] + + add x6, INDEX, #7 + fcmp MAXF, d7 + csel INDEX, x6, INDEX, eq + + sub x6, x6, #1 + fcmp MAXF, d6 + csel INDEX, x6, INDEX, eq + + sub x6, x6, #1 + fcmp MAXF, d5 + csel INDEX, x6, INDEX, eq + + sub x6, x6, #1 + fcmp MAXF, d4 + csel INDEX, x6, INDEX, eq + + sub x7, x7, #32 + ldp q2, q3, [x7] + + fabs v2.2d, v2.2d + fabs v3.2d, v3.2d + + ins v4.d[0], v2.d[0] + ins v5.d[0], v2.d[1] + ins v6.d[0], v3.d[0] + ins v7.d[0], v3.d[1] + + sub x6, x6, #1 + fcmp MAXF, d7 + csel INDEX, x6, INDEX, eq + + sub x6, x6, #1 + fcmp MAXF, d6 + csel INDEX, x6, INDEX, eq + + sub x6, x6, #1 + fcmp MAXF, d5 + csel INDEX, x6, INDEX, eq + + sub x6, x6, #1 + fcmp MAXF, d4 + csel INDEX, x6, INDEX, eq +#endif +.endm + + .macro KERNEL_S1 ld1 TMPVF, [X], INC_X add Z, Z, #1 @@ -92,6 +234,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. cmp INC_X, xzr ble iamax_kernel_zero + cmp INC_X, #1 + bne iamax_kernel_S_BEGIN + mov x7, X + +iamax_kernel_F_BEGIN: + + INIT_S + + subs N, N, #1 + ble iamax_kernel_L999 + + asr I, N, #3 + cmp I, xzr + beq iamax_kernel_F1 + + add Z, Z, #1 +iamax_kernel_F8: + + KERNEL_F8 + + subs I, I, #1 + bne iamax_kernel_F8 + + KERNEL_F8_FINALIZE + + sub Z, Z, #1 +iamax_kernel_F1: + + ands I, N, #7 + ble iamax_kernel_L999 + +iamax_kernel_F10: + + KERNEL_S1 + + subs I, I, #1 + bne iamax_kernel_F10 + + b iamax_kernel_L999 + +iamax_kernel_S_BEGIN: + INIT_S subs N, N, #1 diff --git a/kernel/arm64/izamax.S b/kernel/arm64/izamax.S index ebdc671e0..9b252ec98 100644 --- a/kernel/arm64/izamax.S +++ b/kernel/arm64/izamax.S @@ -78,6 +78,179 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif .endm +.macro KERNEL_F8 +#if !defined(DOUBLE) + ldp q2, q3, [X], #32 + ldp q4, q5, [X], #32 + + fabs v2.4s, v2.4s + fabs v3.4s, v3.4s + fabs v4.4s, v4.4s + fabs v5.4s, v5.4s + + faddp v2.4s, v2.4s, v3.4s + faddp v3.4s, v4.4s, v5.4s + + fmax v2.4s, v2.4s, v3.4s + fmaxv TMPF, v2.4s + fcmp MAXF, TMPF + fcsel MAXF, MAXF, TMPF, COND + csel INDEX, INDEX, Z, COND + add Z, Z, #8 +#else + ldp q2, q3, [X], #32 + ldp q4, q5, [X], #32 + ldp q16, q17, [X], #32 + ldp q18, q19, [X], #32 + + fabs v2.2d, v2.2d + fabs v3.2d, v3.2d + fabs v4.2d, v4.2d + fabs v5.2d, v5.2d + fabs v16.2d, v16.2d + fabs v17.2d, v17.2d + fabs v18.2d, v18.2d + fabs v19.2d, v19.2d + + faddp v2.2d, v2.2d, v3.2d + faddp v3.2d, v4.2d, v5.2d + faddp v4.2d, v16.2d, v17.2d + faddp v5.2d, v18.2d, v19.2d + + fmax v2.2d, v2.2d, v3.2d + fmax v4.2d, v4.2d, v5.2d + fmax v2.2d, v2.2d, v4.2d + fmaxp TMPF, v2.2d + + fcmp MAXF, TMPF + fcsel MAXF, MAXF, TMPF, COND + csel INDEX, INDEX, Z, COND + add Z, Z, #8 +#endif + PRFM PLDL1KEEP, [X, #1024] +.endm + +.macro KERNEL_F8_FINALIZE + sub x6, INDEX, #1 +#if !defined(DOUBLE) + lsl x6, x6, #3 + add x7, x7, x6 + + ldp q2, q3, [x7] + ldp q4, q5, [x7, #32] + + fabs v2.4s, v2.4s + fabs v3.4s, v3.4s + fabs v4.4s, v4.4s + fabs v5.4s, v5.4s + + faddp v2.4s, v2.4s, v3.4s + faddp v3.4s, v4.4s, v5.4s + + ins v4.s[0], v3.s[3] + add x6, INDEX, #7 + fcmp MAXF, s4 + csel INDEX, x6, INDEX, eq + + ins v4.s[0], v3.s[2] + sub x6, x6, #1 + fcmp MAXF, s4 + csel INDEX, x6, INDEX, eq + + ins v4.s[0], v3.s[1] + sub x6, x6, #1 + fcmp MAXF, s4 + csel INDEX, x6, INDEX, eq + + ins v4.s[0], v3.s[0] + sub x6, x6, #1 + fcmp MAXF, s4 + csel INDEX, x6, INDEX, eq + + ins v4.s[0], v2.s[3] + sub x6, x6, #1 + fcmp MAXF, s4 + csel INDEX, x6, INDEX, eq + + ins v4.s[0], v2.s[2] + sub x6, x6, #1 + fcmp MAXF, s4 + csel INDEX, x6, INDEX, eq + + ins v4.s[0], v2.s[1] + sub x6, x6, #1 + fcmp MAXF, s4 + csel INDEX, x6, INDEX, eq + + ins v4.s[0], v2.s[0] + sub x6, x6, #1 + fcmp MAXF, s4 + csel INDEX, x6, INDEX, eq +#else + lsl x6, x6, #4 + add x7, x7, x6 + + ldp q2, q3, [x7] + ldp q4, q5, [x7, #32] + ldp q16, q17, [x7, #64] + ldp q18, q19, [x7, #96] + + fabs v2.2d, v2.2d + fabs v3.2d, v3.2d + fabs v4.2d, v4.2d + fabs v5.2d, v5.2d + fabs v16.2d, v16.2d + fabs v17.2d, v17.2d + fabs v18.2d, v18.2d + fabs v19.2d, v19.2d + + faddp v2.2d, v2.2d, v3.2d + faddp v3.2d, v4.2d, v5.2d + faddp v4.2d, v16.2d, v17.2d + faddp v5.2d, v18.2d, v19.2d + + ins v7.d[0], v5.d[1] + add x6, INDEX, #7 + fcmp MAXF, d7 + csel INDEX, x6, INDEX, eq + + ins v7.d[0], v5.d[0] + sub x6, x6, #1 + fcmp MAXF, d7 + csel INDEX, x6, INDEX, eq + + ins v7.d[0], v4.d[1] + sub x6, x6, #1 + fcmp MAXF, d7 + csel INDEX, x6, INDEX, eq + + ins v7.d[0], v4.d[0] + sub x6, x6, #1 + fcmp MAXF, d7 + csel INDEX, x6, INDEX, eq + + ins v7.d[0], v3.d[1] + sub x6, x6, #1 + fcmp MAXF, d7 + csel INDEX, x6, INDEX, eq + + ins v7.d[0], v3.d[0] + sub x6, x6, #1 + fcmp MAXF, d7 + csel INDEX, x6, INDEX, eq + + ins v7.d[0], v2.d[1] + sub x6, x6, #1 + fcmp MAXF, d7 + csel INDEX, x6, INDEX, eq + + ins v7.d[0], v2.d[0] + sub x6, x6, #1 + fcmp MAXF, d7 + csel INDEX, x6, INDEX, eq +#endif +.endm + .macro KERNEL_S1 #if !defined(DOUBLE) ld1 {v1.2s}, [X], INC_X @@ -107,6 +280,50 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
cmp INC_X, xzr ble iamax_kernel_zero + cmp INC_X, #1 + bne iamax_kernel_S_BEGIN + mov x7, X + + +iamax_kernel_F_BEGIN: + + INIT_S + + subs N, N, #1 + ble iamax_kernel_L999 + + asr I, N, #3 + cmp I, xzr + ble iamax_kernel_F1 + + add Z, Z, #1 + +iamax_kernel_F8: + + KERNEL_F8 + + subs I, I, #1 + bne iamax_kernel_F8 + + KERNEL_F8_FINALIZE + + sub Z, Z, #1 +iamax_kernel_F1: + + ands I, N, #7 + ble iamax_kernel_L999 + +iamax_kernel_F10: + + KERNEL_S1 + + subs I, I, #1 + bne iamax_kernel_F10 + + b iamax_kernel_L999 + +iamax_kernel_S_BEGIN: + INIT_S subs N, N, #1 From 8a40f1355e9711ce3d661c214f1644075c1e497b Mon Sep 17 00:00:00 2001 From: Ashwin Sekhar T K Date: Thu, 14 Jul 2016 13:50:38 +0530 Subject: [PATCH 4/5] Improvements to GEMV kernels --- kernel/arm64/gemv_n.S | 9 ++ kernel/arm64/gemv_t.S | 17 ++- kernel/arm64/zgemv_n.S | 275 ++++++++++++++++------------------------- kernel/arm64/zgemv_t.S | 9 +- 4 files changed, 141 insertions(+), 169 deletions(-) diff --git a/kernel/arm64/gemv_n.S b/kernel/arm64/gemv_n.S index 6279c2250..162f721c3 100644 --- a/kernel/arm64/gemv_n.S +++ b/kernel/arm64/gemv_n.S @@ -68,6 +68,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SHZ 3 #endif +#define A_PRE_SIZE 768 +#define Y_PRE_SIZE 768 + /******************************************************************************/ .macro SAVE_REGS @@ -105,36 +108,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld1 {v2.4s, v3.4s}, [A_PTR], #32 ld1 {v4.4s, v5.4s}, [Y_IPTR], #32 fmla v4.4s, v1.4s, v2.4s + prfm PLDL1KEEP, [A_PTR, #A_PRE_SIZE] fmla v5.4s, v1.4s, v3.4s st1 {v4.4s, v5.4s}, [Y_OPTR], #32 ld1 {v6.4s, v7.4s}, [A_PTR], #32 ld1 {v8.4s, v9.4s}, [Y_IPTR], #32 fmla v8.4s, v1.4s, v6.4s + prfm PLDL1KEEP, [Y_IPTR, #Y_PRE_SIZE] fmla v9.4s, v1.4s, v7.4s st1 {v8.4s, v9.4s}, [Y_OPTR], #32 #else //DOUBLE ld1 {v2.2d, v3.2d}, [A_PTR], #32 ld1 {v4.2d, v5.2d}, [Y_IPTR], #32 fmla v4.2d, v1.2d, v2.2d + prfm PLDL1KEEP, [A_PTR, #A_PRE_SIZE] fmla v5.2d, v1.2d, v3.2d st1 {v4.2d, v5.2d}, [Y_OPTR], #32 ld1 {v6.2d, v7.2d}, [A_PTR], #32 ld1 {v8.2d, v9.2d}, [Y_IPTR], #32 fmla v8.2d, v1.2d, v6.2d + prfm PLDL1KEEP, [Y_IPTR, #Y_PRE_SIZE] fmla v9.2d, v1.2d, v7.2d st1 {v8.2d, v9.2d}, [Y_OPTR], #32 ld1 {v10.2d, v11.2d}, [A_PTR], #32 ld1 {v12.2d, v13.2d}, [Y_IPTR], #32 fmla v12.2d, v1.2d, v10.2d + prfm PLDL1KEEP, [A_PTR, #A_PRE_SIZE] fmla v13.2d, v1.2d, v11.2d st1 {v12.2d, v13.2d}, [Y_OPTR], #32 ld1 {v14.2d, v15.2d}, [A_PTR], #32 ld1 {v16.2d, v17.2d}, [Y_IPTR], #32 fmla v16.2d, v1.2d, v14.2d + prfm PLDL1KEEP, [Y_IPTR, #Y_PRE_SIZE] fmla v17.2d, v1.2d, v15.2d st1 {v16.2d, v17.2d}, [Y_OPTR], #32 #endif diff --git a/kernel/arm64/gemv_t.S b/kernel/arm64/gemv_t.S index 0145af621..28325f784 100644 --- a/kernel/arm64/gemv_t.S +++ b/kernel/arm64/gemv_t.S @@ -41,6 +41,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define J x11 /* loop variable */ #define I x12 /* loop variable */ +#define X_PREFETCH_SIZE 768 +#define A_PREFETCH_SIZE 768 + /******************************************************************************* * Macro definitions *******************************************************************************/ @@ -112,42 +115,54 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld1 {v5.4s, v6.4s, v7.4s, v8.4s}, [A_PTR], #64 ld1 {v9.4s, v10.4s, v11.4s, v12.4s}, [X_PTR], #64 fmla v1.4s, v5.4s, v9.4s + prfm PLDL1KEEP, [A_PTR, #A_PREFETCH_SIZE] fmla v2.4s, v6.4s, v10.4s + prfm PLDL1KEEP, [X_PTR, #X_PREFETCH_SIZE] fmla v3.4s, v7.4s, v11.4s + ld1 {v13.4s, v14.4s, v15.4s, v16.4s}, [A_PTR], #64 fmla v4.4s, v8.4s, v12.4s - ld1 {v13.4s, v14.4s, v15.4s, v16.4s}, [A_PTR], #64 ld1 {v17.4s, v18.4s, v19.4s, v20.4s}, [X_PTR], #64 fmla v1.4s, v13.4s, v17.4s + prfm PLDL1KEEP, [A_PTR, #A_PREFETCH_SIZE] fmla v2.4s, v14.4s, v18.4s + prfm PLDL1KEEP, [X_PTR, #X_PREFETCH_SIZE] fmla v3.4s, v15.4s, v19.4s fmla v4.4s, v16.4s, v20.4s #else ld1 {v5.2d, v6.2d, v7.2d, v8.2d}, [A_PTR], #64 ld1 {v9.2d, v10.2d, v11.2d, v12.2d}, [X_PTR], #64 fmla v1.2d, v5.2d, v9.2d + prfm PLDL1KEEP, [A_PTR, #A_PREFETCH_SIZE] fmla v2.2d, v6.2d, v10.2d + prfm PLDL1KEEP, [X_PTR, #X_PREFETCH_SIZE] fmla v3.2d, v7.2d, v11.2d fmla v4.2d, v8.2d, v12.2d ld1 {v13.2d, v14.2d, v15.2d, v16.2d}, [A_PTR], #64 ld1 {v17.2d, v18.2d, v19.2d, v20.2d}, [X_PTR], #64 fmla v1.2d, v13.2d, v17.2d + prfm PLDL1KEEP, [A_PTR, #A_PREFETCH_SIZE] fmla v2.2d, v14.2d, v18.2d + prfm PLDL1KEEP, [X_PTR, #X_PREFETCH_SIZE] fmla v3.2d, v15.2d, v19.2d fmla v4.2d, v16.2d, v20.2d ld1 {v5.2d, v6.2d, v7.2d, v8.2d}, [A_PTR], #64 ld1 {v9.2d, v10.2d, v11.2d, v12.2d}, [X_PTR], #64 fmla v1.2d, v5.2d, v9.2d + prfm PLDL1KEEP, [A_PTR, #A_PREFETCH_SIZE] fmla v2.2d, v6.2d, v10.2d + prfm PLDL1KEEP, [X_PTR, #X_PREFETCH_SIZE] fmla v3.2d, v7.2d, v11.2d fmla v4.2d, v8.2d, v12.2d ld1 {v13.2d, v14.2d, v15.2d, v16.2d}, [A_PTR], #64 ld1 {v17.2d, v18.2d, v19.2d, v20.2d}, [X_PTR], #64 fmla v1.2d, v13.2d, v17.2d + prfm PLDL1KEEP, [A_PTR, #A_PREFETCH_SIZE] fmla v2.2d, v14.2d, v18.2d + prfm PLDL1KEEP, [X_PTR, #X_PREFETCH_SIZE] fmla v3.2d, v15.2d, v19.2d fmla v4.2d, v16.2d, v20.2d #endif diff --git a/kernel/arm64/zgemv_n.S b/kernel/arm64/zgemv_n.S index 9e285e299..a28d1b0ce 100644 --- a/kernel/arm64/zgemv_n.S +++ b/kernel/arm64/zgemv_n.S @@ -43,6 +43,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define Y_OPTR x13 /* loop Y vector address */ #define X_PTR x14 /* loop X vector address */ +#define A_PRE_SIZE 768 +#define Y_PRE_SIZE 768 + /******************************************************************************* * Macro definitions *******************************************************************************/ @@ -50,14 +53,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(DOUBLE) #define ALPHA_R s0 #define ALPHA_I s1 -#define ALPHA_R_COPY s7 -#define ALPHA_I_COPY s8 #define SHZ 3 #else #define ALPHA_R d0 #define ALPHA_I d1 -#define ALPHA_R_COPY d7 -#define ALPHA_I_COPY d8 #define SHZ 4 #endif @@ -95,20 +94,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro INIT - /********** INIT FOR F4 LOOP **********/ - fmov ALPHA_R_COPY, ALPHA_R - fmov ALPHA_I_COPY, ALPHA_I -#if !defined(DOUBLE) - ins v7.s[1], v7.s[0] // R(ALPHA), R(ALPHA) - ins v8.s[1], v8.s[0] // I(ALPHA), I(ALPHA) - ins v7.d[1], v7.d[0] - ins v8.d[1], v8.d[0] -#else - ins v7.d[1], v7.d[0] // R(ALPHA), R(ALPHA) - ins v8.d[1], v8.d[0] // I(ALPHA), I(ALPHA) -#endif - - /******* INIT FOR F1 AND S1 LOOP ******/ #if !defined(DOUBLE) ins v0.s[1], v0.s[0] // R(ALPHA), R(ALPHA) eor v2.16b, v2.16b, v2.16b @@ -129,47 +114,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro INIT_LOOP - /********** INIT_LOOP FOR F4 LOOP **********/ #if !defined(DOUBLE) - ld1 {v9.2s}, [X_PTR] // [I(X), R(X)] - ins v10.s[0], v9.s[1] - ins v9.s[1], v9.s[0] // [R(X), R(X)] - ins v10.s[1], v10.s[0] // [I(X), I(X)] - ins v9.d[1], v9.d[0] - ins v10.d[1], v10.d[0] -#if !defined(CONJ) -#if !defined(XCONJ) - fmul v11.4s, v9.4s, v7.4s // [+ R(X) * R(ALPHA)] - fmls v11.4s, v10.4s, v8.4s // [- I(X) * I(ALPHA)] - fmul v12.4s, v9.4s, v8.4s // [+ R(X) * I(ALPHA)] - fmla v12.4s, v10.4s, v7.4s // [+ I(X) * R(ALPHA)] -#else - fmul v11.4s, v9.4s, v7.4s // [+ R(X) * R(ALPHA)] - fmla v11.4s, v10.4s, v8.4s // [+ I(X) * I(ALPHA)] - fmul v12.4s, v9.4s, v8.4s // [+ R(X) * I(ALPHA)] - fmls v12.4s, v10.4s, v7.4s // [- I(X) * R(ALPHA)] -#endif -#else // CONJ -#if !defined(XCONJ) - fmul v11.4s, v9.4s, v7.4s // [+ R(X) * R(ALPHA)] - fmls v11.4s, v10.4s, v8.4s // [+ I(X) * I(ALPHA)] - fmul v12.4s, v10.4s, v7.4s // [+ I(X) * R(ALPHA)] - fmls v12.4s, v9.4s, v8.4s // [- R(X) * I(ALPHA)] -#else - fmul v11.4s, v9.4s, v7.4s // [+ R(X) * R(ALPHA)] - fmls v11.4s, v10.4s, v8.4s // [- I(X) * I(ALPHA)] - eor v12.16b, v12.16b, v12.16b - fmls v12.4s, v9.4s, v8.4s // [- R(X) * I(ALPHA)] - fmla v12.4s, v10.4s, v7.4s // [- I(X) * R(ALPHA)] -#endif -#endif // CONJ - - /****** INIT_LOOP FOR F1 AND S1 LOOP ******/ ld1 {v2.2s}, [X_PTR] // [I(X), R(X)] ext v3.8b, v2.8b, v2.8b, #4 // [R(X), I(X)] fmul v2.2s, v0.2s, v2.2s fmla v2.2s, v1.2s, v3.2s // [I(TEMP), R(TEMP)] ins v3.s[0], v2.s[1] + + /********** INIT_LOOP FOR F4 LOOP **********/ +#if !defined(CONJ) +#if !defined(XCONJ) + dup v21.4s, v2.s[0] // R[TEMP] + dup v22.4s, v2.s[0] // R[TEMP] + eor v25.16b, v25.16b, v25.16b + fsub s25, s25, s3 + dup v23.4s, v25.s[0] // -I[TEMP] + dup v24.4s, v3.s[0] // I[TEMP] +#else + dup v21.4s, v2.s[0] // R[TEMP] + dup v22.4s, v2.s[0] // R[TEMP] + dup v23.4s, v3.s[0] // I[TEMP] + eor v25.16b, v25.16b, v25.16b + fsub s25, s25, s3 + dup v24.4s, v25.s[0] // -I[TEMP] +#endif +#else // CONJ +#if !defined(XCONJ) + dup v21.4s, v2.s[0] // R[TEMP] + eor v25.16b, v25.16b, v25.16b + fsub s25, s25, s2 + dup v22.4s, v25.s[0] // R[TEMP] + dup v23.4s, v3.s[0] // I[TEMP] + dup v24.4s, v3.s[0] // I[TEMP] +#else + dup v21.4s, v2.s[0] // R[TEMP] + eor v25.16b, v25.16b, v25.16b + fsub s25, s25, s2 + dup v22.4s, v25.s[0] // R[TEMP] + + eor v25.16b, v25.16b, v25.16b + fsub s25, s25, s3 + dup v23.4s, v25.s[0] // I[TEMP] + dup v24.4s, v25.s[0] // I[TEMP] +#endif +#endif // CONJ + + + /****** INIT_LOOP FOR F1 AND S1 LOOP ******/ #if !defined(CONJ) #if !defined(XCONJ) eor v4.16b, v4.16b, v4.16b @@ -200,45 +191,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif // CONJ #else // DOUBLE - - /********** INIT_LOOP FOR F4 LOOP **********/ - ld1 {v9.2d}, [X_PTR] // [I(X), R(X)] - ins v10.d[0], v9.d[1] - ins v9.d[1], v9.d[0] // [R(X), R(X)] - ins v10.d[1], v10.d[0] // [I(X), I(X)] -#if !defined(CONJ) -#if !defined(XCONJ) - fmul v11.2d, v9.2d, v7.2d // [+ R(X) * R(ALPHA)] - fmls v11.2d, v10.2d, v8.2d // [- I(X) * I(ALPHA)] - fmul v12.2d, v9.2d, v8.2d // [+ R(X) * I(ALPHA)] - fmla v12.2d, v10.2d, v7.2d // [+ I(X) * R(ALPHA)] -#else - fmul v11.2d, v9.2d, v7.2d // [+ R(X) * R(ALPHA)] - fmla v11.2d, v10.2d, v8.2d // [+ I(X) * I(ALPHA)] - fmul v12.2d, v9.2d, v8.2d // [+ R(X) * I(ALPHA)] - fmls v12.2d, v10.2d, v7.2d // [- I(X) * R(ALPHA)] -#endif -#else // CONJ -#if !defined(XCONJ) - fmul v11.2d, v9.2d, v7.2d // [+ R(X) * R(ALPHA)] - fmls v11.2d, v10.2d, v8.2d // [+ I(X) * I(ALPHA)] - fmul v12.2d, v10.2d, v7.2d // [+ I(X) * R(ALPHA)] - fmls v12.2d, v9.2d, v8.2d // [- R(X) * I(ALPHA)] -#else - fmul v11.2d, v9.2d, v7.2d // [+ R(X) * R(ALPHA)] - fmls v11.2d, v10.2d, v8.2d // [- I(X) * I(ALPHA)] - eor v12.16b, v12.16b, v12.16b - fmls v12.2d, v9.2d, v8.2d // [- R(X) * I(ALPHA)] - fmla v12.2d, v10.2d, v7.2d // [- I(X) * R(ALPHA)] -#endif -#endif // CONJ - - /****** INIT_LOOP FOR F1 AND S1 LOOP ******/ ld1 {v2.2d}, [X_PTR] // [I(X), R(X)] ext v3.16b, v2.16b, v2.16b, #8 // [R(X), I(X)] fmul v2.2d, v0.2d, v2.2d fmla v2.2d, v1.2d, v3.2d // [I(TEMP), R(TEMP)] ins v3.d[0], v2.d[1] // I(TEMP) + + /****** INIT_LOOP FOR F4 LOOP ******/ +#if !defined(CONJ) +#if !defined(XCONJ) + dup v21.2d, v2.d[0] // R[TEMP] + dup v22.2d, v2.d[0] // R[TEMP] + eor v25.16b, v25.16b, v25.16b + fsub d25, d25, d3 + dup v23.2d, v25.d[0] // -I[TEMP] + dup v24.2d, v3.d[0] // I[TEMP] +#else + dup v21.2d, v2.d[0] // R[TEMP] + dup v22.2d, v2.d[0] // R[TEMP] + dup v23.2d, v3.d[0] // I[TEMP] + eor v25.16b, v25.16b, v25.16b + fsub d25, d25, d3 + dup v24.2d, v25.d[0] // -I[TEMP] +#endif +#else // CONJ +#if !defined(XCONJ) + dup v21.2d, v2.d[0] // R[TEMP] + eor v25.16b, v25.16b, v25.16b + fsub d25, d25, d2 + dup v22.2d, v25.d[0] // R[TEMP] + dup v23.2d, v3.d[0] // I[TEMP] + dup v24.2d, v3.d[0] // I[TEMP] +#else + dup v21.2d, v2.d[0] // R[TEMP] + eor v25.16b, v25.16b, v25.16b + fsub d25, d25, d2 + dup v22.2d, v25.d[0] // R[TEMP] + + eor v25.16b, v25.16b, v25.16b + fsub d25, d25, d3 + dup v23.2d, v25.d[0] // I[TEMP] + dup v24.2d, v25.d[0] // I[TEMP] +#endif +#endif // CONJ + + + /****** INIT_LOOP FOR F1 AND S1 LOOP ******/ #if !defined(CONJ) #if !defined(XCONJ) eor v4.16b, v4.16b, v4.16b @@ -276,91 +274,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld2 {v13.4s, v14.4s}, [A_PTR], #32 ld2 {v15.4s, v16.4s}, [Y_IPTR], #32 -#if !defined(CONJ) -#if !defined(XCONJ) - fmla v15.4s, v11.4s, v13.4s // [+ R(ALPHA * X) * A_R] - fmls v15.4s, v12.4s, v14.4s // [- I(ALPHA * X) * A_I] - fmla v16.4s, v11.4s, v14.4s // [+ R(ALPHA * X) * A_I] - fmla v16.4s, v12.4s, v13.4s // [+ I(ALPHA * X) * A_R] -#else - fmla v15.4s, v11.4s, v13.4s // [+ R(ALPHA * X) * A_R] - fmla v15.4s, v12.4s, v14.4s // [+ I(ALPHA * X) * A_I] - fmla v16.4s, v11.4s, v14.4s // [+ R(ALPHA * X) * A_I] - fmls v16.4s, v12.4s, v13.4s // [- I(ALPHA * X) * A_R] -#endif -#else // CONJ -#if !defined(XCONJ) - fmla v15.4s, v11.4s, v13.4s // [+ R(ALPHA * X) * A_R] - fmla v15.4s, v12.4s, v14.4s // [+ I(ALPHA * X) * A_I] - fmls v16.4s, v11.4s, v14.4s // [- R(ALPHA * X) * A_I] - fmla v16.4s, v12.4s, v13.4s // [+ I(ALPHA * X) * A_R] -#else - fmla v15.4s, v11.4s, v13.4s // [+ R(ALPHA * X) * A_R] - fmls v15.4s, v12.4s, v14.4s // [- I(ALPHA * X) * A_I] - fmls v16.4s, v11.4s, v14.4s // [- R(ALPHA * X) * A_I] - fmls v16.4s, v12.4s, v13.4s // [- I(ALPHA * X) * A_R] -#endif -#endif // CONJ + + prfm PLDL1KEEP, [A_PTR, #A_PRE_SIZE] + prfm PLDL1KEEP, [Y_IPTR, #Y_PRE_SIZE] + + fmla v15.4s, v21.4s, v13.4s + fmla v15.4s, v23.4s, v14.4s + fmla v16.4s, v22.4s, v14.4s + fmla v16.4s, v24.4s, v13.4s + st2 {v15.4s, v16.4s}, [Y_OPTR], #32 #else // DOUBLE ld2 {v13.2d, v14.2d}, [A_PTR], #32 ld2 {v15.2d, v16.2d}, [Y_IPTR], #32 -#if !defined(CONJ) -#if !defined(XCONJ) - fmla v15.2d, v11.2d, v13.2d // [+ R(ALPHA * X) * A_R] - fmls v15.2d, v12.2d, v14.2d // [- I(ALPHA * X) * A_I] - fmla v16.2d, v11.2d, v14.2d // [+ R(ALPHA * X) * A_I] - fmla v16.2d, v12.2d, v13.2d // [+ I(ALPHA * X) * A_R] -#else - fmla v15.2d, v11.2d, v13.2d // [+ R(ALPHA * X) * A_R] - fmla v15.2d, v12.2d, v14.2d // [+ I(ALPHA * X) * A_I] - fmla v16.2d, v11.2d, v14.2d // [+ R(ALPHA * X) * A_I] - fmls v16.2d, v12.2d, v13.2d // [- I(ALPHA * X) * A_R] -#endif -#else // CONJ -#if !defined(XCONJ) - fmla v15.2d, v11.2d, v13.2d // [+ R(ALPHA * X) * A_R] - fmla v15.2d, v12.2d, v14.2d // [+ I(ALPHA * X) * A_I] - fmls v16.2d, v11.2d, v14.2d // [- R(ALPHA * X) * A_I] - fmla v16.2d, v12.2d, v13.2d // [+ I(ALPHA * X) * A_R] -#else - fmla v15.2d, v11.2d, v13.2d // [+ R(ALPHA * X) * A_R] - fmls v15.2d, v12.2d, v14.2d // [- I(ALPHA * X) * A_I] - fmls v16.2d, v11.2d, v14.2d // [- R(ALPHA * X) * A_I] - fmls v16.2d, v12.2d, v13.2d // [- I(ALPHA * X) * A_R] -#endif -#endif // CONJ + prfm PLDL1KEEP, [A_PTR, #A_PRE_SIZE] + + fmla v15.2d, v21.2d, v13.2d + fmla v15.2d, v23.2d, v14.2d + fmla v16.2d, v22.2d, v14.2d + fmla v16.2d, v24.2d, v13.2d + st2 {v15.2d, v16.2d}, [Y_OPTR], #32 ld2 {v17.2d, v18.2d}, [A_PTR], #32 ld2 {v19.2d, v20.2d}, [Y_IPTR], #32 -#if !defined(CONJ) -#if !defined(XCONJ) - fmla v19.2d, v11.2d, v17.2d // [+ R(ALPHA * X) * A_R] - fmls v19.2d, v12.2d, v18.2d // [- I(ALPHA * X) * A_I] - fmla v20.2d, v11.2d, v18.2d // [+ R(ALPHA * X) * A_I] - fmla v20.2d, v12.2d, v17.2d // [+ I(ALPHA * X) * A_R] -#else - fmla v19.2d, v11.2d, v17.2d // [+ R(ALPHA * X) * A_R] - fmla v19.2d, v12.2d, v18.2d // [- I(ALPHA * X) * A_I] - fmla v20.2d, v11.2d, v18.2d // [+ R(ALPHA * X) * A_I] - fmls v20.2d, v12.2d, v17.2d // [+ I(ALPHA * X) * A_R] -#endif -#else // CONJ -#if !defined(XCONJ) - fmla v19.2d, v11.2d, v17.2d // [+ R(ALPHA * X) * A_R] - fmla v19.2d, v12.2d, v18.2d // [- I(ALPHA * X) * A_I] - fmls v20.2d, v11.2d, v18.2d // [+ R(ALPHA * X) * A_I] - fmla v20.2d, v12.2d, v17.2d // [+ I(ALPHA * X) * A_R] -#else - fmla v19.2d, v11.2d, v17.2d // [+ R(ALPHA * X) * A_R] - fmls v19.2d, 
v12.2d, v18.2d // [- I(ALPHA * X) * A_I] - fmls v20.2d, v11.2d, v18.2d // [+ R(ALPHA * X) * A_I] - fmls v20.2d, v12.2d, v17.2d // [+ I(ALPHA * X) * A_R] -#endif -#endif // CONJ + prfm PLDL1KEEP, [Y_IPTR, #Y_PRE_SIZE] + + fmla v19.2d, v21.2d, v17.2d + fmla v19.2d, v23.2d, v18.2d + fmla v20.2d, v22.2d, v18.2d + fmla v20.2d, v24.2d, v17.2d + st2 {v19.2d, v20.2d}, [Y_OPTR], #32 #endif @@ -445,10 +391,7 @@ zgemv_n_kernel_F_LOOP: zgemv_n_kernel_F4: - KERNEL_F1 - KERNEL_F1 - KERNEL_F1 - KERNEL_F1 + KERNEL_F4 subs I, I, #1 bne zgemv_n_kernel_F4 diff --git a/kernel/arm64/zgemv_t.S b/kernel/arm64/zgemv_t.S index e61c17152..79ce9bcf2 100644 --- a/kernel/arm64/zgemv_t.S +++ b/kernel/arm64/zgemv_t.S @@ -41,6 +41,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define J x11 /* loop variable */ #define I x12 /* loop variable */ +#define A_PRE_SIZE 768 +#define X_PRE_SIZE 768 + /******************************************************************************* * Macro definitions *******************************************************************************/ @@ -139,6 +142,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v11.4s, v12.4s}, [X_PTR], #32 ld2 {v13.4s, v14.4s}, [A_PTR], #32 + prfm PLDL1STRM, [X_PTR, #X_PRE_SIZE] + prfm PLDL1STRM, [A_PTR, #A_PRE_SIZE] #if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) fmla v9.4s, v11.4s, v13.4s // [+ R(X) * A_R] @@ -155,7 +160,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else // DOUBLE ld2 {v11.2d, v12.2d}, [X_PTR], #32 ld2 {v13.2d, v14.2d}, [A_PTR], #32 - prfm PLDL1STRM, [X_PTR, #512] + prfm PLDL1STRM, [X_PTR, #X_PRE_SIZE] #if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) fmla v9.2d, v11.2d, v13.2d // [+ R(X) * A_R] @@ -171,7 +176,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v17.2d, v18.2d}, [X_PTR], #32 ld2 {v19.2d, v20.2d}, [A_PTR], #32 - prfm PLDL1STRM, [A_PTR, #512] + prfm PLDL1STRM, [A_PTR, #A_PRE_SIZE] #if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) fmla v15.2d, v17.2d, v19.2d // [+ R(X) * A_R] From 0a5ff9f9f97e960589ca92618c677b72cb2e85fe Mon Sep 17 00:00:00 2001 From: Ashwin Sekhar T K Date: Thu, 14 Jul 2016 13:51:17 +0530 Subject: [PATCH 5/5] Improvements to TRMM and GEMM kernels --- kernel/arm64/cgemm_kernel_8x4.S | 586 ++++++++++++--------- kernel/arm64/ctrmm_kernel_8x4.S | 634 ++++++++++++++--------- kernel/arm64/dtrmm_kernel_8x4.S | 466 ++++++++++------- kernel/arm64/sgemm_kernel_16x4.S | 864 +++++++++++++++++-------------- kernel/arm64/strmm_kernel_16x4.S | 828 +++++++++++++++-------------- kernel/arm64/zgemm_kernel_4x4.S | 419 ++++++++------- kernel/arm64/ztrmm_kernel_4x4.S | 422 ++++++++------- param.h | 6 +- 8 files changed, 2396 insertions(+), 1829 deletions(-) diff --git a/kernel/arm64/cgemm_kernel_8x4.S b/kernel/arm64/cgemm_kernel_8x4.S index d58cef52d..5d1462808 100644 --- a/kernel/arm64/cgemm_kernel_8x4.S +++ b/kernel/arm64/cgemm_kernel_8x4.S @@ -46,17 +46,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define pCRow0 x12 #define pCRow1 x13 #define pCRow2 x14 -#define pA x15 +#define pCRow3 x15 +#define pA x16 +#define alphaR w17 +#define alphaI w18 #define alpha0_R s10 #define alphaV0_R v10.s[0] #define alpha0_I s11 #define alphaV0_I v11.s[0] -#define alpha1_R s14 -#define alphaV1_R v14.s[0] -#define alpha1_I s15 -#define alphaV1_I v15.s[0] +#define A_PRE_SIZE 2560 +#define B_PRE_SIZE 448 +#define C_PRE_SIZE 128 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #define OP_rr fmla @@ -95,8 +97,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // 12 pCRow0 // 13 pCRow1 // 14 pCRow2 -// 15 pA -// 16 +// 15 pCRow3 +// 16 pA // 17 // 18 must save // 19 must save @@ -121,14 +123,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. //v05 pA1_00_I, pA1_01_I, pA1_02_I, pA1_03_I //v06 pA1_04_R, pA1_05_R, pA1_06_R, pA1_07_R //v07 pA1_04_I, pA1_05_I, pA1_06_I, pA1_07_I -//v08 must save pB0_00_R, pB0_01_R, pB0_02_R, pB0_03_R -//v09 must save pB0_00_I, pB0_01_I, pB0_02_I, pB0_03_I -//v10 must save ALPHA0_R -//v11 must save ALPHA0_I -//v12 must save pB1_00_R, pB1_01_R, pB1_02_R, pB1_03_R -//v13 must save pB1_00_I, pB1_01_I, pB1_02_I, pB1_03_I -//v14 must save ALPHA1_R -//v15 must save ALPHA1_I +//v08 must save pB0_00_R, pB0_01_R +//v09 must save pB0_00_I, pB0_01_I +//v10 must save pB0_02_R, pB0_03_R --> ALPHA0_R +//v11 must save pB0_02_I, pB0_03_I --> ALPHA0_I +//v12 must save pB1_00_R, pB1_01_R +//v13 must save pB1_00_I, pB1_01_I +//v14 must save pB1_02_R, pB1_03_R +//v15 must save pB1_02_I, pB1_03_I //v16 must save pC_00_R, pC_01_R, pC_02_R, pC_03_R //v17 must save pC_00_I, pC_01_I, pC_02_I, pC_03_I //v18 pC_04_R, pC_05_R, pC_06_R, pC_07_R @@ -171,8 +173,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL8x4_I - ld2 {v8.4s, v9.4s}, [pB] - add pB, pB, #32 + ld2 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld2 {v0.4s, v1.4s}, [pA] add pA, pA, #32 ld2 {v2.4s, v3.4s}, [pA] @@ -189,6 +192,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif OP_ir v17.4s, v1.4s, v8.s[0] + ld2 {v10.2s, v11.2s}, [pB] + add pB, pB, #16 + fmul v18.4s, v2.4s, v8.s[0] OP_ii v18.4s, v3.4s, v9.s[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ @@ -200,6 +206,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif OP_ir v19.4s, v3.4s, v8.s[0] + ld2 {v12.2s, v13.2s}, [pB] + add pB, pB, #16 + fmul v20.4s, v0.4s, v8.s[1] OP_ii v20.4s, v1.4s, v9.s[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ @@ -211,6 +220,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif OP_ir v21.4s, v1.4s, v8.s[1] + ld2 {v14.2s, v15.2s}, [pB] + add pB, pB, #16 + fmul v22.4s, v2.4s, v8.s[1] OP_ii v22.4s, v3.4s, v9.s[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ @@ -222,56 +234,59 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif OP_ir v23.4s, v3.4s, v8.s[1] - fmul v24.4s, v0.4s, v8.s[2] - OP_ii v24.4s, v1.4s, v9.s[2] + ld2 {v4.4s, v5.4s}, [pA] + add pA, pA, #32 + + fmul v24.4s, v0.4s, v10.s[0] + OP_ii v24.4s, v1.4s, v11.s[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v25.16b, v25.16b, v25.16b - fmls v25.4s, v0.4s, v9.s[2] + fmls v25.4s, v0.4s, v11.s[0] #else - fmul v25.4s, v0.4s, v9.s[2] + fmul v25.4s, v0.4s, v11.s[0] #endif - OP_ir v25.4s, v1.4s, v8.s[2] + OP_ir v25.4s, v1.4s, v10.s[0] - fmul v26.4s, v2.4s, v8.s[2] - OP_ii v26.4s, v3.4s, v9.s[2] + ld2 {v6.4s, v7.4s}, [pA] + add pA, pA, #32 + + fmul v26.4s, v2.4s, v10.s[0] + OP_ii v26.4s, v3.4s, v11.s[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v27.16b, v27.16b, v27.16b - fmls v27.4s, v2.4s, v9.s[2] + fmls v27.4s, v2.4s, v11.s[0] #else - fmul v27.4s, v2.4s, v9.s[2] + fmul v27.4s, v2.4s, v11.s[0] #endif - OP_ir v27.4s, v3.4s, v8.s[2] + OP_ir v27.4s, v3.4s, v10.s[0] - fmul v28.4s, v0.4s, v8.s[3] - OP_ii v28.4s, v1.4s, v9.s[3] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + + fmul v28.4s, v0.4s, v10.s[1] + OP_ii v28.4s, v1.4s, v11.s[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v29.16b, v29.16b, v29.16b - fmls v29.4s, v0.4s, v9.s[3] + fmls v29.4s, v0.4s, v11.s[1] #else - fmul v29.4s, v0.4s, v9.s[3] + fmul v29.4s, v0.4s, v11.s[1] #endif - OP_ir v29.4s, v1.4s, v8.s[3] + OP_ir v29.4s, v1.4s, v10.s[1] - fmul v30.4s, v2.4s, v8.s[3] - OP_ii v30.4s, v3.4s, v9.s[3] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] + + fmul v30.4s, v2.4s, v10.s[1] + OP_ii v30.4s, v3.4s, v11.s[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v31.16b, v31.16b, v31.16b - fmls v31.4s, v2.4s, v9.s[3] + fmls v31.4s, v2.4s, v11.s[1] #else - fmul v31.4s, v2.4s, v9.s[3] + fmul v31.4s, v2.4s, v11.s[1] #endif - OP_ir v31.4s, v3.4s, v8.s[3] - - ld2 {v12.4s, v13.4s}, [pB] - add pB, pB, #32 - ld2 {v4.4s, v5.4s}, [pA] - add pA, pA, #32 - ld2 {v6.4s, v7.4s}, [pA] - add pA, pA, #32 + OP_ir v31.4s, v3.4s, v10.s[1] .endm .macro KERNEL8x4_M1 @@ -280,47 +295,56 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
OP_ri v17.4s, v0.4s, v9.s[0] OP_ir v17.4s, v1.4s, v8.s[0] + ld2 {v12.2s, v13.2s}, [pB] + add pB, pB, #16 + OP_rr v18.4s, v2.4s, v8.s[0] OP_ii v18.4s, v3.4s, v9.s[0] OP_ri v19.4s, v2.4s, v9.s[0] OP_ir v19.4s, v3.4s, v8.s[0] + ld2 {v4.4s, v5.4s}, [pA] + add pA, pA, #32 + OP_rr v20.4s, v0.4s, v8.s[1] OP_ii v20.4s, v1.4s, v9.s[1] OP_ri v21.4s, v0.4s, v9.s[1] OP_ir v21.4s, v1.4s, v8.s[1] + ld2 {v6.4s, v7.4s}, [pA] + add pA, pA, #32 + OP_rr v22.4s, v2.4s, v8.s[1] OP_ii v22.4s, v3.4s, v9.s[1] OP_ri v23.4s, v2.4s, v9.s[1] OP_ir v23.4s, v3.4s, v8.s[1] - OP_rr v24.4s, v0.4s, v8.s[2] - OP_ii v24.4s, v1.4s, v9.s[2] - OP_ri v25.4s, v0.4s, v9.s[2] - OP_ir v25.4s, v1.4s, v8.s[2] + ld2 {v14.2s, v15.2s}, [pB] + add pB, pB, #16 - OP_rr v26.4s, v2.4s, v8.s[2] - OP_ii v26.4s, v3.4s, v9.s[2] - OP_ri v27.4s, v2.4s, v9.s[2] - OP_ir v27.4s, v3.4s, v8.s[2] + OP_rr v24.4s, v0.4s, v10.s[0] + OP_ii v24.4s, v1.4s, v11.s[0] + OP_ri v25.4s, v0.4s, v11.s[0] + OP_ir v25.4s, v1.4s, v10.s[0] - OP_rr v28.4s, v0.4s, v8.s[3] - OP_ii v28.4s, v1.4s, v9.s[3] - OP_ri v29.4s, v0.4s, v9.s[3] - OP_ir v29.4s, v1.4s, v8.s[3] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] - OP_rr v30.4s, v2.4s, v8.s[3] - OP_ii v30.4s, v3.4s, v9.s[3] - OP_ri v31.4s, v2.4s, v9.s[3] - OP_ir v31.4s, v3.4s, v8.s[3] + OP_rr v26.4s, v2.4s, v10.s[0] + OP_ii v26.4s, v3.4s, v11.s[0] + OP_ri v27.4s, v2.4s, v11.s[0] + OP_ir v27.4s, v3.4s, v10.s[0] - ld2 {v12.4s, v13.4s}, [pB] // For next round - add pB, pB, #32 - ld2 {v4.4s, v5.4s}, [pA] // For next round - add pA, pA, #32 - ld2 {v6.4s, v7.4s}, [pA] - add pA, pA, #32 + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] + + OP_rr v28.4s, v0.4s, v10.s[1] + OP_ii v28.4s, v1.4s, v11.s[1] + OP_ri v29.4s, v0.4s, v11.s[1] + OP_ir v29.4s, v1.4s, v10.s[1] + + OP_rr v30.4s, v2.4s, v10.s[1] + OP_ii v30.4s, v3.4s, v11.s[1] + OP_ri v31.4s, v2.4s, v11.s[1] + OP_ir v31.4s, v3.4s, v10.s[1] .endm .macro KERNEL8x4_M2 @@ -329,47 +353,54 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
OP_ri v17.4s, v4.4s, v13.s[0] OP_ir v17.4s, v5.4s, v12.s[0] + ld2 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + OP_rr v18.4s, v6.4s, v12.s[0] OP_ii v18.4s, v7.4s, v13.s[0] OP_ri v19.4s, v6.4s, v13.s[0] OP_ir v19.4s, v7.4s, v12.s[0] + ld2 {v0.4s, v1.4s}, [pA] + add pA, pA, #32 + OP_rr v20.4s, v4.4s, v12.s[1] OP_ii v20.4s, v5.4s, v13.s[1] OP_ri v21.4s, v4.4s, v13.s[1] OP_ir v21.4s, v5.4s, v12.s[1] + ld2 {v2.4s, v3.4s}, [pA] + add pA, pA, #32 + OP_rr v22.4s, v6.4s, v12.s[1] OP_ii v22.4s, v7.4s, v13.s[1] OP_ri v23.4s, v6.4s, v13.s[1] OP_ir v23.4s, v7.4s, v12.s[1] - OP_rr v24.4s, v4.4s, v12.s[2] - OP_ii v24.4s, v5.4s, v13.s[2] - OP_ri v25.4s, v4.4s, v13.s[2] - OP_ir v25.4s, v5.4s, v12.s[2] + ld2 {v10.2s, v11.2s}, [pB] + add pB, pB, #16 - OP_rr v26.4s, v6.4s, v12.s[2] - OP_ii v26.4s, v7.4s, v13.s[2] - OP_ri v27.4s, v6.4s, v13.s[2] - OP_ir v27.4s, v7.4s, v12.s[2] + OP_rr v24.4s, v4.4s, v14.s[0] + OP_ii v24.4s, v5.4s, v15.s[0] + OP_ri v25.4s, v4.4s, v15.s[0] + OP_ir v25.4s, v5.4s, v14.s[0] - OP_rr v28.4s, v4.4s, v12.s[3] - OP_ii v28.4s, v5.4s, v13.s[3] - OP_ri v29.4s, v4.4s, v13.s[3] - OP_ir v29.4s, v5.4s, v12.s[3] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] - OP_rr v30.4s, v6.4s, v12.s[3] - OP_ii v30.4s, v7.4s, v13.s[3] - OP_ri v31.4s, v6.4s, v13.s[3] - OP_ir v31.4s, v7.4s, v12.s[3] + OP_rr v26.4s, v6.4s, v14.s[0] + OP_ii v26.4s, v7.4s, v15.s[0] + OP_ri v27.4s, v6.4s, v15.s[0] + OP_ir v27.4s, v7.4s, v14.s[0] - ld2 {v8.4s, v9.4s}, [pB] - add pB, pB, #32 - ld2 {v0.4s, v1.4s}, [pA] - add pA, pA, #32 - ld2 {v2.4s, v3.4s}, [pA] - add pA, pA, #32 + OP_rr v28.4s, v4.4s, v14.s[1] + OP_ii v28.4s, v5.4s, v15.s[1] + OP_ri v29.4s, v4.4s, v15.s[1] + OP_ir v29.4s, v5.4s, v14.s[1] + + OP_rr v30.4s, v6.4s, v14.s[1] + OP_ii v30.4s, v7.4s, v15.s[1] + OP_ri v31.4s, v6.4s, v15.s[1] + OP_ir v31.4s, v7.4s, v14.s[1] .endm .macro KERNEL8x4_E @@ -388,157 +419,174 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
OP_ri v21.4s, v4.4s, v13.s[1] OP_ir v21.4s, v5.4s, v12.s[1] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + OP_rr v22.4s, v6.4s, v12.s[1] OP_ii v22.4s, v7.4s, v13.s[1] OP_ri v23.4s, v6.4s, v13.s[1] OP_ir v23.4s, v7.4s, v12.s[1] - OP_rr v24.4s, v4.4s, v12.s[2] - OP_ii v24.4s, v5.4s, v13.s[2] - OP_ri v25.4s, v4.4s, v13.s[2] - OP_ir v25.4s, v5.4s, v12.s[2] + OP_rr v24.4s, v4.4s, v14.s[0] + OP_ii v24.4s, v5.4s, v15.s[0] + OP_ri v25.4s, v4.4s, v15.s[0] + OP_ir v25.4s, v5.4s, v14.s[0] - OP_rr v26.4s, v6.4s, v12.s[2] - OP_ii v26.4s, v7.4s, v13.s[2] - OP_ri v27.4s, v6.4s, v13.s[2] - OP_ir v27.4s, v7.4s, v12.s[2] + OP_rr v26.4s, v6.4s, v14.s[0] + OP_ii v26.4s, v7.4s, v15.s[0] + OP_ri v27.4s, v6.4s, v15.s[0] + OP_ir v27.4s, v7.4s, v14.s[0] - OP_rr v28.4s, v4.4s, v12.s[3] - OP_ii v28.4s, v5.4s, v13.s[3] - OP_ri v29.4s, v4.4s, v13.s[3] - OP_ir v29.4s, v5.4s, v12.s[3] - - OP_rr v30.4s, v6.4s, v12.s[3] - OP_ii v30.4s, v7.4s, v13.s[3] - OP_ri v31.4s, v6.4s, v13.s[3] - OP_ir v31.4s, v7.4s, v12.s[3] + OP_rr v28.4s, v4.4s, v14.s[1] + OP_ii v28.4s, v5.4s, v15.s[1] + OP_ri v29.4s, v4.4s, v15.s[1] + OP_ir v29.4s, v5.4s, v14.s[1] + OP_rr v30.4s, v6.4s, v14.s[1] + OP_ii v30.4s, v7.4s, v15.s[1] + OP_ri v31.4s, v6.4s, v15.s[1] + OP_ir v31.4s, v7.4s, v14.s[1] .endm .macro KERNEL8x4_SUB - ld2 {v8.4s, v9.4s}, [pB] - add pB, pB, #32 + ld2 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld2 {v0.4s, v1.4s}, [pA] add pA, pA, #32 - ld2 {v2.4s, v3.4s}, [pA] - add pA, pA, #32 OP_rr v16.4s, v0.4s, v8.s[0] OP_ii v16.4s, v1.4s, v9.s[0] OP_ri v17.4s, v0.4s, v9.s[0] OP_ir v17.4s, v1.4s, v8.s[0] - OP_rr v18.4s, v2.4s, v8.s[0] - OP_ii v18.4s, v3.4s, v9.s[0] - OP_ri v19.4s, v2.4s, v9.s[0] - OP_ir v19.4s, v3.4s, v8.s[0] + ld2 {v2.4s, v3.4s}, [pA] + add pA, pA, #32 OP_rr v20.4s, v0.4s, v8.s[1] OP_ii v20.4s, v1.4s, v9.s[1] OP_ri v21.4s, v0.4s, v9.s[1] OP_ir v21.4s, v1.4s, v8.s[1] + ld2 {v10.2s, v11.2s}, [pB] + add pB, pB, #16 + + OP_rr v18.4s, v2.4s, v8.s[0] + OP_ii v18.4s, v3.4s, v9.s[0] + OP_ri v19.4s, v2.4s, v9.s[0] + OP_ir v19.4s, v3.4s, v8.s[0] + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + OP_rr v22.4s, v2.4s, v8.s[1] OP_ii v22.4s, v3.4s, v9.s[1] OP_ri v23.4s, v2.4s, v9.s[1] OP_ir v23.4s, v3.4s, v8.s[1] - OP_rr v24.4s, v0.4s, v8.s[2] - OP_ii v24.4s, v1.4s, v9.s[2] - OP_ri v25.4s, v0.4s, v9.s[2] - OP_ir v25.4s, v1.4s, v8.s[2] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] - OP_rr v26.4s, v2.4s, v8.s[2] - OP_ii v26.4s, v3.4s, v9.s[2] - OP_ri v27.4s, v2.4s, v9.s[2] - OP_ir v27.4s, v3.4s, v8.s[2] + OP_rr v24.4s, v0.4s, v10.s[0] + OP_ii v24.4s, v1.4s, v11.s[0] + OP_ri v25.4s, v0.4s, v11.s[0] + OP_ir v25.4s, v1.4s, v10.s[0] - OP_rr v28.4s, v0.4s, v8.s[3] - OP_ii v28.4s, v1.4s, v9.s[3] - OP_ri v29.4s, v0.4s, v9.s[3] - OP_ir v29.4s, v1.4s, v8.s[3] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] - OP_rr v30.4s, v2.4s, v8.s[3] - OP_ii v30.4s, v3.4s, v9.s[3] - OP_ri v31.4s, v2.4s, v9.s[3] - OP_ir v31.4s, v3.4s, v8.s[3] + OP_rr v26.4s, v2.4s, v10.s[0] + OP_ii v26.4s, v3.4s, v11.s[0] + OP_ri v27.4s, v2.4s, v11.s[0] + OP_ir v27.4s, v3.4s, v10.s[0] + OP_rr v28.4s, v0.4s, v10.s[1] + OP_ii v28.4s, v1.4s, v11.s[1] + OP_ri v29.4s, v0.4s, v11.s[1] + OP_ir v29.4s, v1.4s, v10.s[1] + + OP_rr v30.4s, v2.4s, v10.s[1] + OP_ii v30.4s, v3.4s, v11.s[1] + OP_ri v31.4s, v2.4s, v11.s[1] + OP_ir v31.4s, v3.4s, v10.s[1] .endm .macro SAVE8x4 - mov pCRow1, pCRow0 + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI - ld2 {v0.4s, v1.4s}, [pCRow1] + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + ld2 {v0.4s, v1.4s}, [pCRow0] fmla v0.4s, v16.4s, alphaV0_R fmls v0.4s, v17.4s, alphaV0_I - fmla v1.4s, v16.4s, alphaV1_I - 
fmla v1.4s, v17.4s, alphaV1_R - st2 {v0.4s, v1.4s}, [pCRow1] + fmla v1.4s, v16.4s, alphaV0_I + fmla v1.4s, v17.4s, alphaV0_R + st2 {v0.4s, v1.4s}, [pCRow0] - add pCRow2, pCRow1, #32 + add pCRow0, pCRow0, #32 - ld2 {v2.4s, v3.4s}, [pCRow2] + ld2 {v2.4s, v3.4s}, [pCRow0] fmla v2.4s, v18.4s, alphaV0_R fmls v2.4s, v19.4s, alphaV0_I - fmla v3.4s, v18.4s, alphaV1_I - fmla v3.4s, v19.4s, alphaV1_R - st2 {v2.4s, v3.4s}, [pCRow2] + fmla v3.4s, v18.4s, alphaV0_I + fmla v3.4s, v19.4s, alphaV0_R + st2 {v2.4s, v3.4s}, [pCRow0] - add pCRow1, pCRow1, LDC + add pCRow0, pCRow0, #32 + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] ld2 {v4.4s, v5.4s}, [pCRow1] fmla v4.4s, v20.4s, alphaV0_R fmls v4.4s, v21.4s, alphaV0_I - fmla v5.4s, v20.4s, alphaV1_I - fmla v5.4s, v21.4s, alphaV1_R + fmla v5.4s, v20.4s, alphaV0_I + fmla v5.4s, v21.4s, alphaV0_R st2 {v4.4s, v5.4s}, [pCRow1] - add pCRow2, pCRow1, #32 + add pCRow1, pCRow1, #32 - ld2 {v6.4s, v7.4s}, [pCRow2] + ld2 {v6.4s, v7.4s}, [pCRow1] fmla v6.4s, v22.4s, alphaV0_R fmls v6.4s, v23.4s, alphaV0_I - fmla v7.4s, v22.4s, alphaV1_I - fmla v7.4s, v23.4s, alphaV1_R - st2 {v6.4s, v7.4s}, [pCRow2] + fmla v7.4s, v22.4s, alphaV0_I + fmla v7.4s, v23.4s, alphaV0_R + st2 {v6.4s, v7.4s}, [pCRow1] - add pCRow1, pCRow1, LDC + add pCRow1, pCRow1, #32 + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] - ld2 {v0.4s, v1.4s}, [pCRow1] + ld2 {v0.4s, v1.4s}, [pCRow2] fmla v0.4s, v24.4s, alphaV0_R fmls v0.4s, v25.4s, alphaV0_I - fmla v1.4s, v24.4s, alphaV1_I - fmla v1.4s, v25.4s, alphaV1_R - st2 {v0.4s, v1.4s}, [pCRow1] + fmla v1.4s, v24.4s, alphaV0_I + fmla v1.4s, v25.4s, alphaV0_R + st2 {v0.4s, v1.4s}, [pCRow2] - add pCRow2, pCRow1, #32 + add pCRow2, pCRow2, #32 ld2 {v2.4s, v3.4s}, [pCRow2] fmla v2.4s, v26.4s, alphaV0_R fmls v2.4s, v27.4s, alphaV0_I - fmla v3.4s, v26.4s, alphaV1_I - fmla v3.4s, v27.4s, alphaV1_R + fmla v3.4s, v26.4s, alphaV0_I + fmla v3.4s, v27.4s, alphaV0_R st2 {v2.4s, v3.4s}, [pCRow2] - add pCRow1, pCRow1, LDC + add pCRow2, pCRow2, #32 + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] - ld2 {v4.4s, v5.4s}, [pCRow1] + ld2 {v4.4s, v5.4s}, [pCRow3] fmla v4.4s, v28.4s, alphaV0_R fmls v4.4s, v29.4s, alphaV0_I - fmla v5.4s, v28.4s, alphaV1_I - fmla v5.4s, v29.4s, alphaV1_R - st2 {v4.4s, v5.4s}, [pCRow1] + fmla v5.4s, v28.4s, alphaV0_I + fmla v5.4s, v29.4s, alphaV0_R + st2 {v4.4s, v5.4s}, [pCRow3] - add pCRow2, pCRow1, #32 + add pCRow3, pCRow3, #32 - ld2 {v6.4s, v7.4s}, [pCRow2] + ld2 {v6.4s, v7.4s}, [pCRow3] fmla v6.4s, v30.4s, alphaV0_R fmls v6.4s, v31.4s, alphaV0_I - fmla v7.4s, v30.4s, alphaV1_I - fmla v7.4s, v31.4s, alphaV1_R - st2 {v6.4s, v7.4s}, [pCRow2] + fmla v7.4s, v30.4s, alphaV0_I + fmla v7.4s, v31.4s, alphaV0_R + st2 {v6.4s, v7.4s}, [pCRow3] - add pCRow0, pCRow0, #64 + add pCRow3, pCRow3, #32 .endm /******************************************************************************/ @@ -720,13 +768,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE4x4 + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI + mov pCRow1, pCRow0 ld2 {v0.4s, v1.4s}, [pCRow1] fmla v0.4s, v16.4s, alphaV0_R fmls v0.4s, v17.4s, alphaV0_I - fmla v1.4s, v16.4s, alphaV1_I - fmla v1.4s, v17.4s, alphaV1_R + fmla v1.4s, v16.4s, alphaV0_I + fmla v1.4s, v17.4s, alphaV0_R st2 {v0.4s, v1.4s}, [pCRow1] add pCRow1, pCRow1, LDC @@ -734,8 +785,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld2 {v4.4s, v5.4s}, [pCRow1] fmla v4.4s, v20.4s, alphaV0_R fmls v4.4s, v21.4s, alphaV0_I - fmla v5.4s, v20.4s, alphaV1_I - fmla v5.4s, v21.4s, alphaV1_R + fmla v5.4s, v20.4s, alphaV0_I + fmla v5.4s, v21.4s, alphaV0_R st2 {v4.4s, v5.4s}, [pCRow1] add pCRow1, pCRow1, LDC @@ -743,8 +794,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.4s, v1.4s}, [pCRow1] fmla v0.4s, v24.4s, alphaV0_R fmls v0.4s, v25.4s, alphaV0_I - fmla v1.4s, v24.4s, alphaV1_I - fmla v1.4s, v25.4s, alphaV1_R + fmla v1.4s, v24.4s, alphaV0_I + fmla v1.4s, v25.4s, alphaV0_R st2 {v0.4s, v1.4s}, [pCRow1] add pCRow1, pCRow1, LDC @@ -752,8 +803,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v4.4s, v5.4s}, [pCRow1] fmla v4.4s, v28.4s, alphaV0_R fmls v4.4s, v29.4s, alphaV0_I - fmla v5.4s, v28.4s, alphaV1_I - fmla v5.4s, v29.4s, alphaV1_R + fmla v5.4s, v28.4s, alphaV0_I + fmla v5.4s, v29.4s, alphaV0_R st2 {v4.4s, v5.4s}, [pCRow1] add pCRow0, pCRow0, #32 @@ -800,13 +851,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE2x4 + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI + mov pCRow1, pCRow0 ld2 {v0.2s, v1.2s}, [pCRow1] fmla v0.2s, v16.2s, alphaV0_R fmls v0.2s, v17.2s, alphaV0_I - fmla v1.2s, v16.2s, alphaV1_I - fmla v1.2s, v17.2s, alphaV1_R + fmla v1.2s, v16.2s, alphaV0_I + fmla v1.2s, v17.2s, alphaV0_R st2 {v0.2s, v1.2s}, [pCRow1] add pCRow1, pCRow1, LDC @@ -814,8 +868,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v4.2s, v5.2s}, [pCRow1] fmla v4.2s, v20.2s, alphaV0_R fmls v4.2s, v21.2s, alphaV0_I - fmla v5.2s, v20.2s, alphaV1_I - fmla v5.2s, v21.2s, alphaV1_R + fmla v5.2s, v20.2s, alphaV0_I + fmla v5.2s, v21.2s, alphaV0_R st2 {v4.2s, v5.2s}, [pCRow1] add pCRow1, pCRow1, LDC @@ -823,8 +877,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.2s, v1.2s}, [pCRow1] fmla v0.2s, v24.2s, alphaV0_R fmls v0.2s, v25.2s, alphaV0_I - fmla v1.2s, v24.2s, alphaV1_I - fmla v1.2s, v25.2s, alphaV1_R + fmla v1.2s, v24.2s, alphaV0_I + fmla v1.2s, v25.2s, alphaV0_R st2 {v0.2s, v1.2s}, [pCRow1] add pCRow1, pCRow1, LDC @@ -832,8 +886,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v4.2s, v5.2s}, [pCRow1] fmla v4.2s, v28.2s, alphaV0_R fmls v4.2s, v29.2s, alphaV0_I - fmla v5.2s, v28.2s, alphaV1_I - fmla v5.2s, v29.2s, alphaV1_R + fmla v5.2s, v28.2s, alphaV0_I + fmla v5.2s, v29.2s, alphaV0_R st2 {v4.2s, v5.2s}, [pCRow1] add pCRow0, pCRow0, #16 @@ -880,13 +934,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE1x4 + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI + mov pCRow1, pCRow0 ld2 {v0.s, v1.s}[0], [pCRow1] fmla s0, s16, alphaV0_R fmls s0, s17, alphaV0_I - fmla s1, s16, alphaV1_I - fmla s1, s17, alphaV1_R + fmla s1, s16, alphaV0_I + fmla s1, s17, alphaV0_R st2 {v0.s, v1.s}[0], [pCRow1] add pCRow1, pCRow1, LDC @@ -894,8 +951,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v4.s, v5.s}[0], [pCRow1] fmla s4, s20, alphaV0_R fmls s4, s21, alphaV0_I - fmla s5, s20, alphaV1_I - fmla s5, s21, alphaV1_R + fmla s5, s20, alphaV0_I + fmla s5, s21, alphaV0_R st2 {v4.s, v5.s}[0], [pCRow1] add pCRow1, pCRow1, LDC @@ -903,8 +960,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld2 {v0.s, v1.s}[0], [pCRow1] fmla s0, s24, alphaV0_R fmls s0, s25, alphaV0_I - fmla s1, s24, alphaV1_I - fmla s1, s25, alphaV1_R + fmla s1, s24, alphaV0_I + fmla s1, s25, alphaV0_R st2 {v0.s, v1.s}[0], [pCRow1] add pCRow1, pCRow1, LDC @@ -912,8 +969,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v4.s, v5.s}[0], [pCRow1] fmla s4, s28, alphaV0_R fmls s4, s29, alphaV0_I - fmla s5, s28, alphaV1_I - fmla s5, s29, alphaV1_R + fmla s5, s28, alphaV0_I + fmla s5, s29, alphaV0_R st2 {v4.s, v5.s}[0], [pCRow1] add pCRow0, pCRow0, #8 @@ -962,13 +1019,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE8x2 + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI + mov pCRow1, pCRow0 ld2 {v0.4s, v1.4s}, [pCRow1] fmla v0.4s, v16.4s, alphaV0_R fmls v0.4s, v17.4s, alphaV0_I - fmla v1.4s, v16.4s, alphaV1_I - fmla v1.4s, v17.4s, alphaV1_R + fmla v1.4s, v16.4s, alphaV0_I + fmla v1.4s, v17.4s, alphaV0_R st2 {v0.4s, v1.4s}, [pCRow1] add pCRow2, pCRow1, #32 @@ -976,8 +1036,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v2.4s, v3.4s}, [pCRow2] fmla v2.4s, v18.4s, alphaV0_R fmls v2.4s, v19.4s, alphaV0_I - fmla v3.4s, v18.4s, alphaV1_I - fmla v3.4s, v19.4s, alphaV1_R + fmla v3.4s, v18.4s, alphaV0_I + fmla v3.4s, v19.4s, alphaV0_R st2 {v2.4s, v3.4s}, [pCRow2] add pCRow1, pCRow1, LDC @@ -985,8 +1045,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v4.4s, v5.4s}, [pCRow1] fmla v4.4s, v20.4s, alphaV0_R fmls v4.4s, v21.4s, alphaV0_I - fmla v5.4s, v20.4s, alphaV1_I - fmla v5.4s, v21.4s, alphaV1_R + fmla v5.4s, v20.4s, alphaV0_I + fmla v5.4s, v21.4s, alphaV0_R st2 {v4.4s, v5.4s}, [pCRow1] add pCRow2, pCRow1, #32 @@ -994,8 +1054,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v6.4s, v7.4s}, [pCRow2] fmla v6.4s, v22.4s, alphaV0_R fmls v6.4s, v23.4s, alphaV0_I - fmla v7.4s, v22.4s, alphaV1_I - fmla v7.4s, v23.4s, alphaV1_R + fmla v7.4s, v22.4s, alphaV0_I + fmla v7.4s, v23.4s, alphaV0_R st2 {v6.4s, v7.4s}, [pCRow2] add pCRow0, pCRow0, #64 @@ -1028,13 +1088,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE4x2 + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI + mov pCRow1, pCRow0 ld2 {v0.4s, v1.4s}, [pCRow1] fmla v0.4s, v16.4s, alphaV0_R fmls v0.4s, v17.4s, alphaV0_I - fmla v1.4s, v16.4s, alphaV1_I - fmla v1.4s, v17.4s, alphaV1_R + fmla v1.4s, v16.4s, alphaV0_I + fmla v1.4s, v17.4s, alphaV0_R st2 {v0.4s, v1.4s}, [pCRow1] add pCRow1, pCRow1, LDC @@ -1042,8 +1105,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v4.4s, v5.4s}, [pCRow1] fmla v4.4s, v20.4s, alphaV0_R fmls v4.4s, v21.4s, alphaV0_I - fmla v5.4s, v20.4s, alphaV1_I - fmla v5.4s, v21.4s, alphaV1_R + fmla v5.4s, v20.4s, alphaV0_I + fmla v5.4s, v21.4s, alphaV0_R st2 {v4.4s, v5.4s}, [pCRow1] add pCRow0, pCRow0, #32 @@ -1076,13 +1139,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE2x2 + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI + mov pCRow1, pCRow0 ld2 {v0.2s, v1.2s}, [pCRow1] fmla v0.2s, v16.2s, alphaV0_R fmls v0.2s, v17.2s, alphaV0_I - fmla v1.2s, v16.2s, alphaV1_I - fmla v1.2s, v17.2s, alphaV1_R + fmla v1.2s, v16.2s, alphaV0_I + fmla v1.2s, v17.2s, alphaV0_R st2 {v0.2s, v1.2s}, [pCRow1] add pCRow1, pCRow1, LDC @@ -1090,8 +1156,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld2 {v4.2s, v5.2s}, [pCRow1] fmla v4.2s, v20.2s, alphaV0_R fmls v4.2s, v21.2s, alphaV0_I - fmla v5.2s, v20.2s, alphaV1_I - fmla v5.2s, v21.2s, alphaV1_R + fmla v5.2s, v20.2s, alphaV0_I + fmla v5.2s, v21.2s, alphaV0_R st2 {v4.2s, v5.2s}, [pCRow1] add pCRow0, pCRow0, #16 @@ -1124,13 +1190,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE1x2 + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI + mov pCRow1, pCRow0 ld2 {v0.s, v1.s}[0], [pCRow1] fmla s0, s16, alphaV0_R fmls s0, s17, alphaV0_I - fmla s1, s16, alphaV1_I - fmla s1, s17, alphaV1_R + fmla s1, s16, alphaV0_I + fmla s1, s17, alphaV0_R st2 {v0.s, v1.s}[0], [pCRow1] add pCRow1, pCRow1, LDC @@ -1138,8 +1207,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v4.s, v5.s}[0], [pCRow1] fmla s4, s20, alphaV0_R fmls s4, s21, alphaV0_I - fmla s5, s20, alphaV1_I - fmla s5, s21, alphaV1_R + fmla s5, s20, alphaV0_I + fmla s5, s21, alphaV0_R st2 {v4.s, v5.s}[0], [pCRow1] add pCRow0, pCRow0, #8 @@ -1174,13 +1243,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE8x1 + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI + mov pCRow1, pCRow0 ld2 {v0.4s, v1.4s}, [pCRow1] fmla v0.4s, v16.4s, alphaV0_R fmls v0.4s, v17.4s, alphaV0_I - fmla v1.4s, v16.4s, alphaV1_I - fmla v1.4s, v17.4s, alphaV1_R + fmla v1.4s, v16.4s, alphaV0_I + fmla v1.4s, v17.4s, alphaV0_R st2 {v0.4s, v1.4s}, [pCRow1] add pCRow1, pCRow1, #32 @@ -1188,8 +1260,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v2.4s, v3.4s}, [pCRow1] fmla v2.4s, v18.4s, alphaV0_R fmls v2.4s, v19.4s, alphaV0_I - fmla v3.4s, v18.4s, alphaV1_I - fmla v3.4s, v19.4s, alphaV1_R + fmla v3.4s, v18.4s, alphaV0_I + fmla v3.4s, v19.4s, alphaV0_R st2 {v2.4s, v3.4s}, [pCRow1] add pCRow0, pCRow0, #64 @@ -1216,13 +1288,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE4x1 + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI + mov pCRow1, pCRow0 ld2 {v0.4s, v1.4s}, [pCRow1] fmla v0.4s, v16.4s, alphaV0_R fmls v0.4s, v17.4s, alphaV0_I - fmla v1.4s, v16.4s, alphaV1_I - fmla v1.4s, v17.4s, alphaV1_R + fmla v1.4s, v16.4s, alphaV0_I + fmla v1.4s, v17.4s, alphaV0_R st2 {v0.4s, v1.4s}, [pCRow1] add pCRow0, pCRow0, #32 @@ -1248,13 +1323,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE2x1 + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI + mov pCRow1, pCRow0 ld2 {v0.2s, v1.2s}, [pCRow1] fmla v0.2s, v16.2s, alphaV0_R fmls v0.2s, v17.2s, alphaV0_I - fmla v1.2s, v16.2s, alphaV1_I - fmla v1.2s, v17.2s, alphaV1_R + fmla v1.2s, v16.2s, alphaV0_I + fmla v1.2s, v17.2s, alphaV0_R st2 {v0.2s, v1.2s}, [pCRow1] add pCRow0, pCRow0, #16 @@ -1281,13 +1359,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE1x1 + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI + mov pCRow1, pCRow0 ld2 {v0.s, v1.s}[0], [pCRow1] fmla s0, s16, alphaV0_R fmls s0, s17, alphaV0_I - fmla s1, s16, alphaV1_I - fmla s1, s17, alphaV1_R + fmla s1, s16, alphaV0_I + fmla s1, s17, alphaV0_R st2 {v0.s, v1.s}[0], [pCRow1] add pCRow0, pCRow0, #8 @@ -1313,10 +1394,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
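The prologue hunk that follows (and the matching ctrmm, dtrmm and sgemm prologues further down) drops the duplicated alpha1_R/alpha1_I copies: alpha now lives in the general-purpose registers alphaR/alphaI and is copied into alpha0_R/alpha0_I by the fmov pair at the top of each SAVE macro, which frees v14/v15 to carry the second half of the B panel inside the K loop. The fmla/fmls quartets in the SAVE macros above are the usual complex update C += alpha*AB; one element of it, written out as a rough C model for reference only (names are illustrative, the assembly handles four or two elements at a time with ld2/st2):

/* One element of the SAVE-macro update  C += alpha * AB,  spelled out the
 * way the fmla/fmls quartet computes it.  Illustrative model only.       */
static void save_elem(float *c_re, float *c_im,
                      float ab_re, float ab_im,        /* accumulators, e.g. v16/v17 */
                      float alpha_re, float alpha_im)  /* alpha0_R / alpha0_I        */
{
    *c_re += ab_re * alpha_re;   /* fmla  vC_r, vAB_r, alphaV0_R */
    *c_re -= ab_im * alpha_im;   /* fmls  vC_r, vAB_i, alphaV0_I */
    *c_im += ab_re * alpha_im;   /* fmla  vC_i, vAB_r, alphaV0_I */
    *c_im += ab_im * alpha_re;   /* fmla  vC_i, vAB_i, alphaV0_R */
}

In the trmm kernels further down the same quartet appears with fmul in place of the first fmla/fmls, since there C is overwritten (C = alpha*AB) rather than accumulated.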
stp x26, x27, [sp, #(9 * 16)] str x28, [sp, #(10 * 16)] - fmov alpha0_R, s0 - fmov alpha0_I, s1 - fmov alpha1_R, s0 - fmov alpha1_I, s1 + prfm PLDL1KEEP, [origPB] + prfm PLDL1KEEP, [origPA] + + fmov alphaR, s0 + fmov alphaI, s1 lsl LDC, LDC, #3 // ldc = ldc * 8 @@ -1330,8 +1412,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /******************************************************************************/ cgemm_kernel_L4_BEGIN: - mov pCRow0, pC // pCRow0 = C - add pC, pC, LDC, lsl #2 + mov pCRow0, pC + add pCRow1, pCRow0, LDC + add pCRow2, pCRow1, LDC + add pCRow3, pCRow2, LDC + + add pC, pCRow3, LDC mov pA, origPA // pA = start of A array @@ -1342,44 +1428,69 @@ cgemm_kernel_L4_M8_BEGIN: cmp counterI, #0 ble cgemm_kernel_L4_M4_BEGIN + .align 5 cgemm_kernel_L4_M8_20: mov pB, origPB - asr counterL , origK, #1 // L = K / 2 - cmp counterL , #2 // is there at least 4 to do? + asr counterL , origK, #3 + cmp counterL , #2 blt cgemm_kernel_L4_M8_32 - KERNEL8x4_I // do one in the K - KERNEL8x4_M2 // do another in the K + KERNEL8x4_I + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 subs counterL, counterL, #2 // subtract 2 ble cgemm_kernel_L4_M8_22a - .align 5 + .align 5 cgemm_kernel_L4_M8_22: KERNEL8x4_M1 KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 subs counterL, counterL, #1 bgt cgemm_kernel_L4_M8_22 - + .align 5 cgemm_kernel_L4_M8_22a: + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 KERNEL8x4_M1 KERNEL8x4_E b cgemm_kernel_L4_M8_44 + .align 5 cgemm_kernel_L4_M8_32: tst counterL, #1 ble cgemm_kernel_L4_M8_40 KERNEL8x4_I - + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 KERNEL8x4_E b cgemm_kernel_L4_M8_44 @@ -1390,14 +1501,21 @@ cgemm_kernel_L4_M8_40: cgemm_kernel_L4_M8_44: - ands counterL , origK, #1 + ands counterL , origK, #7 ble cgemm_kernel_L4_M8_100 + .align 5 cgemm_kernel_L4_M8_46: KERNEL8x4_SUB + subs counterL, counterL, #1 + bne cgemm_kernel_L4_M8_46 + cgemm_kernel_L4_M8_100: + prfm PLDL1KEEP, [pA] + prfm PLDL1KEEP, [pA, #64] + prfm PLDL1KEEP, [origPB] SAVE8x4 diff --git a/kernel/arm64/ctrmm_kernel_8x4.S b/kernel/arm64/ctrmm_kernel_8x4.S index ce5cb0406..680fb56c3 100644 --- a/kernel/arm64/ctrmm_kernel_8x4.S +++ b/kernel/arm64/ctrmm_kernel_8x4.S @@ -46,20 +46,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define pCRow0 x12 #define pCRow1 x13 #define pCRow2 x14 -#define pA x15 -#define temp x16 -#define tempOffset x17 -#define tempK x18 +#define pCRow3 x15 +#define pA x16 +#define alphaR w17 +#define alphaI w18 +#define temp x19 +#define tempOffset x20 +#define tempK x21 #define alpha0_R s10 #define alphaV0_R v10.s[0] #define alpha0_I s11 #define alphaV0_I v11.s[0] -#define alpha1_R s14 -#define alphaV1_R v14.s[0] -#define alpha1_I s15 -#define alphaV1_I v15.s[0] +#define A_PRE_SIZE 2560 +#define B_PRE_SIZE 448 +#define C_PRE_SIZE 128 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #define OP_rr fmla @@ -124,14 +126,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
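The cgemm L4_M8 loop above is rewritten from a two-step body (counterL = K/2) to an eight-step, software-pipelined body (counterL = K/8): KERNEL8x4_I primes the pipeline, M1/M2 alternate between the two register banks (v0-v3/v8-v11 and v4-v7/v12-v15), KERNEL8x4_E drains it, and the K%8 remainder falls through to the KERNEL8x4_SUB loop at _46. The same shape is applied to the ctrmm, dtrmm and sgemm kernels below, and the unrolling lets the ld2/ldp loads and prfm prefetches be spread between the FMA groups instead of being bunched at the end of each macro. A schematic C view of that control flow, with stand-in names and without modelling the accumulator initialisation done by INIT8x4/KERNEL8x4_I:

/* Stand-ins for the kernel macros; each call models one k step. */
static void kernel_I(void)   {}   /* prime: first FMAs + loads for next step */
static void kernel_M1(void)  {}   /* steady state, register bank 0           */
static void kernel_M2(void)  {}   /* steady state, register bank 1           */
static void kernel_E(void)   {}   /* drain: last FMAs, no further loads      */
static void kernel_SUB(void) {}   /* stand-alone single step, used for tail  */

static void k_loop(long K)
{
    long units = K >> 3;                 /* 8 k steps per unit   */
    long tail  = K & 7;                  /* leftover k steps     */

    if (units >= 2) {
        kernel_I();                      /* first unit: wind up  */
        for (int i = 0; i < 7; i++) {
            if (i & 1) kernel_M1(); else kernel_M2();
        }
        for (long l = units - 2; l > 0; l--)              /* _22  */
            for (int i = 0; i < 4; i++) { kernel_M1(); kernel_M2(); }
        for (int i = 0; i < 3; i++) { kernel_M1(); kernel_M2(); }   /* _22a */
        kernel_M1(); kernel_E();         /* last unit: wind down */
    } else if (units == 1) {             /* _32                  */
        kernel_I();
        for (int i = 0; i < 3; i++) { kernel_M2(); kernel_M1(); }
        kernel_E();
    }
    while (tail-- > 0)                   /* _44 / _46            */
        kernel_SUB();
}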
//v05 pA1_00_I, pA1_01_I, pA1_02_I, pA1_03_I //v06 pA1_04_R, pA1_05_R, pA1_06_R, pA1_07_R //v07 pA1_04_I, pA1_05_I, pA1_06_I, pA1_07_I -//v08 must save pB0_00_R, pB0_01_R, pB0_02_R, pB0_03_R -//v09 must save pB0_00_I, pB0_01_I, pB0_02_I, pB0_03_I -//v10 must save ALPHA0_R -//v11 must save ALPHA0_I -//v12 must save pB1_00_R, pB1_01_R, pB1_02_R, pB1_03_R -//v13 must save pB1_00_I, pB1_01_I, pB1_02_I, pB1_03_I -//v14 must save ALPHA1_R -//v15 must save ALPHA1_I +//v08 must save pB0_00_R, pB0_01_R +//v09 must save pB0_00_I, pB0_01_I +//v10 must save pB0_02_R, pB0_03_R --> ALPHA0_R +//v11 must save pB0_02_I, pB0_03_I --> ALPHA0_I +//v12 must save pB1_00_R, pB1_01_R +//v13 must save pB1_00_I, pB1_01_I +//v14 must save pB1_02_R, pB1_03_R +//v15 must save pB1_02_I, pB1_03_I //v16 must save pC_00_R, pC_01_R, pC_02_R, pC_03_R //v17 must save pC_00_I, pC_01_I, pC_02_I, pC_03_I //v18 pC_04_R, pC_05_R, pC_06_R, pC_07_R @@ -149,6 +151,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. //v30 pC_28_R, pC_29_R, pC_30_R, pC_31_R //v31 pC_28_I, pC_29_I, pC_30_I, pC_31_I + /******************************************************************************* * Macro definitions *******************************************************************************/ @@ -173,8 +176,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL8x4_I - ld2 {v8.4s, v9.4s}, [pB] - add pB, pB, #32 + ld2 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld2 {v0.4s, v1.4s}, [pA] add pA, pA, #32 ld2 {v2.4s, v3.4s}, [pA] @@ -191,6 +195,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif OP_ir v17.4s, v1.4s, v8.s[0] + ld2 {v10.2s, v11.2s}, [pB] + add pB, pB, #16 + fmul v18.4s, v2.4s, v8.s[0] OP_ii v18.4s, v3.4s, v9.s[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ @@ -202,6 +209,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif OP_ir v19.4s, v3.4s, v8.s[0] + ld2 {v12.2s, v13.2s}, [pB] + add pB, pB, #16 + fmul v20.4s, v0.4s, v8.s[1] OP_ii v20.4s, v1.4s, v9.s[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ @@ -213,6 +223,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif OP_ir v21.4s, v1.4s, v8.s[1] + ld2 {v14.2s, v15.2s}, [pB] + add pB, pB, #16 + fmul v22.4s, v2.4s, v8.s[1] OP_ii v22.4s, v3.4s, v9.s[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ @@ -224,56 +237,59 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif OP_ir v23.4s, v3.4s, v8.s[1] - fmul v24.4s, v0.4s, v8.s[2] - OP_ii v24.4s, v1.4s, v9.s[2] + ld2 {v4.4s, v5.4s}, [pA] + add pA, pA, #32 + + fmul v24.4s, v0.4s, v10.s[0] + OP_ii v24.4s, v1.4s, v11.s[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v25.16b, v25.16b, v25.16b - fmls v25.4s, v0.4s, v9.s[2] + fmls v25.4s, v0.4s, v11.s[0] #else - fmul v25.4s, v0.4s, v9.s[2] + fmul v25.4s, v0.4s, v11.s[0] #endif - OP_ir v25.4s, v1.4s, v8.s[2] + OP_ir v25.4s, v1.4s, v10.s[0] - fmul v26.4s, v2.4s, v8.s[2] - OP_ii v26.4s, v3.4s, v9.s[2] + ld2 {v6.4s, v7.4s}, [pA] + add pA, pA, #32 + + fmul v26.4s, v2.4s, v10.s[0] + OP_ii v26.4s, v3.4s, v11.s[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v27.16b, v27.16b, v27.16b - fmls v27.4s, v2.4s, v9.s[2] + fmls v27.4s, v2.4s, v11.s[0] #else - fmul v27.4s, v2.4s, v9.s[2] + fmul v27.4s, v2.4s, v11.s[0] #endif - OP_ir v27.4s, v3.4s, v8.s[2] + OP_ir v27.4s, v3.4s, v10.s[0] - fmul v28.4s, v0.4s, v8.s[3] - OP_ii v28.4s, v1.4s, v9.s[3] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + + fmul v28.4s, v0.4s, v10.s[1] + OP_ii v28.4s, v1.4s, v11.s[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v29.16b, v29.16b, v29.16b - fmls v29.4s, v0.4s, v9.s[3] + fmls v29.4s, v0.4s, v11.s[1] #else - fmul v29.4s, v0.4s, v9.s[3] + fmul v29.4s, v0.4s, v11.s[1] #endif - OP_ir v29.4s, v1.4s, v8.s[3] + OP_ir v29.4s, v1.4s, v10.s[1] - fmul v30.4s, v2.4s, v8.s[3] - OP_ii v30.4s, v3.4s, v9.s[3] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] + + fmul v30.4s, v2.4s, v10.s[1] + OP_ii v30.4s, v3.4s, v11.s[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ defined(RR) || defined(RC) || defined(CR) || defined(CC) eor v31.16b, v31.16b, v31.16b - fmls v31.4s, v2.4s, v9.s[3] + fmls v31.4s, v2.4s, v11.s[1] #else - fmul v31.4s, v2.4s, v9.s[3] + fmul v31.4s, v2.4s, v11.s[1] #endif - OP_ir v31.4s, v3.4s, v8.s[3] - - ld2 {v12.4s, v13.4s}, [pB] - add pB, pB, #32 - ld2 {v4.4s, v5.4s}, [pA] - add pA, pA, #32 - ld2 {v6.4s, v7.4s}, [pA] - add pA, pA, #32 + OP_ir v31.4s, v3.4s, v10.s[1] .endm .macro KERNEL8x4_M1 @@ -282,47 +298,56 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
OP_ri v17.4s, v0.4s, v9.s[0] OP_ir v17.4s, v1.4s, v8.s[0] + ld2 {v12.2s, v13.2s}, [pB] + add pB, pB, #16 + OP_rr v18.4s, v2.4s, v8.s[0] OP_ii v18.4s, v3.4s, v9.s[0] OP_ri v19.4s, v2.4s, v9.s[0] OP_ir v19.4s, v3.4s, v8.s[0] + ld2 {v4.4s, v5.4s}, [pA] + add pA, pA, #32 + OP_rr v20.4s, v0.4s, v8.s[1] OP_ii v20.4s, v1.4s, v9.s[1] OP_ri v21.4s, v0.4s, v9.s[1] OP_ir v21.4s, v1.4s, v8.s[1] + ld2 {v6.4s, v7.4s}, [pA] + add pA, pA, #32 + OP_rr v22.4s, v2.4s, v8.s[1] OP_ii v22.4s, v3.4s, v9.s[1] OP_ri v23.4s, v2.4s, v9.s[1] OP_ir v23.4s, v3.4s, v8.s[1] - OP_rr v24.4s, v0.4s, v8.s[2] - OP_ii v24.4s, v1.4s, v9.s[2] - OP_ri v25.4s, v0.4s, v9.s[2] - OP_ir v25.4s, v1.4s, v8.s[2] + ld2 {v14.2s, v15.2s}, [pB] + add pB, pB, #16 - OP_rr v26.4s, v2.4s, v8.s[2] - OP_ii v26.4s, v3.4s, v9.s[2] - OP_ri v27.4s, v2.4s, v9.s[2] - OP_ir v27.4s, v3.4s, v8.s[2] + OP_rr v24.4s, v0.4s, v10.s[0] + OP_ii v24.4s, v1.4s, v11.s[0] + OP_ri v25.4s, v0.4s, v11.s[0] + OP_ir v25.4s, v1.4s, v10.s[0] - OP_rr v28.4s, v0.4s, v8.s[3] - OP_ii v28.4s, v1.4s, v9.s[3] - OP_ri v29.4s, v0.4s, v9.s[3] - OP_ir v29.4s, v1.4s, v8.s[3] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] - OP_rr v30.4s, v2.4s, v8.s[3] - OP_ii v30.4s, v3.4s, v9.s[3] - OP_ri v31.4s, v2.4s, v9.s[3] - OP_ir v31.4s, v3.4s, v8.s[3] + OP_rr v26.4s, v2.4s, v10.s[0] + OP_ii v26.4s, v3.4s, v11.s[0] + OP_ri v27.4s, v2.4s, v11.s[0] + OP_ir v27.4s, v3.4s, v10.s[0] - ld2 {v12.4s, v13.4s}, [pB] // For next round - add pB, pB, #32 - ld2 {v4.4s, v5.4s}, [pA] // For next round - add pA, pA, #32 - ld2 {v6.4s, v7.4s}, [pA] - add pA, pA, #32 + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] + + OP_rr v28.4s, v0.4s, v10.s[1] + OP_ii v28.4s, v1.4s, v11.s[1] + OP_ri v29.4s, v0.4s, v11.s[1] + OP_ir v29.4s, v1.4s, v10.s[1] + + OP_rr v30.4s, v2.4s, v10.s[1] + OP_ii v30.4s, v3.4s, v11.s[1] + OP_ri v31.4s, v2.4s, v11.s[1] + OP_ir v31.4s, v3.4s, v10.s[1] .endm .macro KERNEL8x4_M2 @@ -331,47 +356,54 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
OP_ri v17.4s, v4.4s, v13.s[0] OP_ir v17.4s, v5.4s, v12.s[0] + ld2 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + OP_rr v18.4s, v6.4s, v12.s[0] OP_ii v18.4s, v7.4s, v13.s[0] OP_ri v19.4s, v6.4s, v13.s[0] OP_ir v19.4s, v7.4s, v12.s[0] + ld2 {v0.4s, v1.4s}, [pA] + add pA, pA, #32 + OP_rr v20.4s, v4.4s, v12.s[1] OP_ii v20.4s, v5.4s, v13.s[1] OP_ri v21.4s, v4.4s, v13.s[1] OP_ir v21.4s, v5.4s, v12.s[1] + ld2 {v2.4s, v3.4s}, [pA] + add pA, pA, #32 + OP_rr v22.4s, v6.4s, v12.s[1] OP_ii v22.4s, v7.4s, v13.s[1] OP_ri v23.4s, v6.4s, v13.s[1] OP_ir v23.4s, v7.4s, v12.s[1] - OP_rr v24.4s, v4.4s, v12.s[2] - OP_ii v24.4s, v5.4s, v13.s[2] - OP_ri v25.4s, v4.4s, v13.s[2] - OP_ir v25.4s, v5.4s, v12.s[2] + ld2 {v10.2s, v11.2s}, [pB] + add pB, pB, #16 - OP_rr v26.4s, v6.4s, v12.s[2] - OP_ii v26.4s, v7.4s, v13.s[2] - OP_ri v27.4s, v6.4s, v13.s[2] - OP_ir v27.4s, v7.4s, v12.s[2] + OP_rr v24.4s, v4.4s, v14.s[0] + OP_ii v24.4s, v5.4s, v15.s[0] + OP_ri v25.4s, v4.4s, v15.s[0] + OP_ir v25.4s, v5.4s, v14.s[0] - OP_rr v28.4s, v4.4s, v12.s[3] - OP_ii v28.4s, v5.4s, v13.s[3] - OP_ri v29.4s, v4.4s, v13.s[3] - OP_ir v29.4s, v5.4s, v12.s[3] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] - OP_rr v30.4s, v6.4s, v12.s[3] - OP_ii v30.4s, v7.4s, v13.s[3] - OP_ri v31.4s, v6.4s, v13.s[3] - OP_ir v31.4s, v7.4s, v12.s[3] + OP_rr v26.4s, v6.4s, v14.s[0] + OP_ii v26.4s, v7.4s, v15.s[0] + OP_ri v27.4s, v6.4s, v15.s[0] + OP_ir v27.4s, v7.4s, v14.s[0] - ld2 {v8.4s, v9.4s}, [pB] - add pB, pB, #32 - ld2 {v0.4s, v1.4s}, [pA] - add pA, pA, #32 - ld2 {v2.4s, v3.4s}, [pA] - add pA, pA, #32 + OP_rr v28.4s, v4.4s, v14.s[1] + OP_ii v28.4s, v5.4s, v15.s[1] + OP_ri v29.4s, v4.4s, v15.s[1] + OP_ir v29.4s, v5.4s, v14.s[1] + + OP_rr v30.4s, v6.4s, v14.s[1] + OP_ii v30.4s, v7.4s, v15.s[1] + OP_ri v31.4s, v6.4s, v15.s[1] + OP_ir v31.4s, v7.4s, v14.s[1] .endm .macro KERNEL8x4_E @@ -390,157 +422,166 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
OP_ri v21.4s, v4.4s, v13.s[1] OP_ir v21.4s, v5.4s, v12.s[1] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + OP_rr v22.4s, v6.4s, v12.s[1] OP_ii v22.4s, v7.4s, v13.s[1] OP_ri v23.4s, v6.4s, v13.s[1] OP_ir v23.4s, v7.4s, v12.s[1] - OP_rr v24.4s, v4.4s, v12.s[2] - OP_ii v24.4s, v5.4s, v13.s[2] - OP_ri v25.4s, v4.4s, v13.s[2] - OP_ir v25.4s, v5.4s, v12.s[2] + OP_rr v24.4s, v4.4s, v14.s[0] + OP_ii v24.4s, v5.4s, v15.s[0] + OP_ri v25.4s, v4.4s, v15.s[0] + OP_ir v25.4s, v5.4s, v14.s[0] - OP_rr v26.4s, v6.4s, v12.s[2] - OP_ii v26.4s, v7.4s, v13.s[2] - OP_ri v27.4s, v6.4s, v13.s[2] - OP_ir v27.4s, v7.4s, v12.s[2] + OP_rr v26.4s, v6.4s, v14.s[0] + OP_ii v26.4s, v7.4s, v15.s[0] + OP_ri v27.4s, v6.4s, v15.s[0] + OP_ir v27.4s, v7.4s, v14.s[0] - OP_rr v28.4s, v4.4s, v12.s[3] - OP_ii v28.4s, v5.4s, v13.s[3] - OP_ri v29.4s, v4.4s, v13.s[3] - OP_ir v29.4s, v5.4s, v12.s[3] - - OP_rr v30.4s, v6.4s, v12.s[3] - OP_ii v30.4s, v7.4s, v13.s[3] - OP_ri v31.4s, v6.4s, v13.s[3] - OP_ir v31.4s, v7.4s, v12.s[3] + OP_rr v28.4s, v4.4s, v14.s[1] + OP_ii v28.4s, v5.4s, v15.s[1] + OP_ri v29.4s, v4.4s, v15.s[1] + OP_ir v29.4s, v5.4s, v14.s[1] + OP_rr v30.4s, v6.4s, v14.s[1] + OP_ii v30.4s, v7.4s, v15.s[1] + OP_ri v31.4s, v6.4s, v15.s[1] + OP_ir v31.4s, v7.4s, v14.s[1] .endm .macro KERNEL8x4_SUB - ld2 {v8.4s, v9.4s}, [pB] - add pB, pB, #32 + ld2 {v8.2s, v9.2s}, [pB] + add pB, pB, #16 + ld2 {v0.4s, v1.4s}, [pA] add pA, pA, #32 - ld2 {v2.4s, v3.4s}, [pA] - add pA, pA, #32 OP_rr v16.4s, v0.4s, v8.s[0] OP_ii v16.4s, v1.4s, v9.s[0] OP_ri v17.4s, v0.4s, v9.s[0] OP_ir v17.4s, v1.4s, v8.s[0] - OP_rr v18.4s, v2.4s, v8.s[0] - OP_ii v18.4s, v3.4s, v9.s[0] - OP_ri v19.4s, v2.4s, v9.s[0] - OP_ir v19.4s, v3.4s, v8.s[0] + ld2 {v2.4s, v3.4s}, [pA] + add pA, pA, #32 OP_rr v20.4s, v0.4s, v8.s[1] OP_ii v20.4s, v1.4s, v9.s[1] OP_ri v21.4s, v0.4s, v9.s[1] OP_ir v21.4s, v1.4s, v8.s[1] + ld2 {v10.2s, v11.2s}, [pB] + add pB, pB, #16 + + OP_rr v18.4s, v2.4s, v8.s[0] + OP_ii v18.4s, v3.4s, v9.s[0] + OP_ri v19.4s, v2.4s, v9.s[0] + OP_ir v19.4s, v3.4s, v8.s[0] + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + OP_rr v22.4s, v2.4s, v8.s[1] OP_ii v22.4s, v3.4s, v9.s[1] OP_ri v23.4s, v2.4s, v9.s[1] OP_ir v23.4s, v3.4s, v8.s[1] - OP_rr v24.4s, v0.4s, v8.s[2] - OP_ii v24.4s, v1.4s, v9.s[2] - OP_ri v25.4s, v0.4s, v9.s[2] - OP_ir v25.4s, v1.4s, v8.s[2] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] - OP_rr v26.4s, v2.4s, v8.s[2] - OP_ii v26.4s, v3.4s, v9.s[2] - OP_ri v27.4s, v2.4s, v9.s[2] - OP_ir v27.4s, v3.4s, v8.s[2] + OP_rr v24.4s, v0.4s, v10.s[0] + OP_ii v24.4s, v1.4s, v11.s[0] + OP_ri v25.4s, v0.4s, v11.s[0] + OP_ir v25.4s, v1.4s, v10.s[0] - OP_rr v28.4s, v0.4s, v8.s[3] - OP_ii v28.4s, v1.4s, v9.s[3] - OP_ri v29.4s, v0.4s, v9.s[3] - OP_ir v29.4s, v1.4s, v8.s[3] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] - OP_rr v30.4s, v2.4s, v8.s[3] - OP_ii v30.4s, v3.4s, v9.s[3] - OP_ri v31.4s, v2.4s, v9.s[3] - OP_ir v31.4s, v3.4s, v8.s[3] + OP_rr v26.4s, v2.4s, v10.s[0] + OP_ii v26.4s, v3.4s, v11.s[0] + OP_ri v27.4s, v2.4s, v11.s[0] + OP_ir v27.4s, v3.4s, v10.s[0] + OP_rr v28.4s, v0.4s, v10.s[1] + OP_ii v28.4s, v1.4s, v11.s[1] + OP_ri v29.4s, v0.4s, v11.s[1] + OP_ir v29.4s, v1.4s, v10.s[1] + + OP_rr v30.4s, v2.4s, v10.s[1] + OP_ii v30.4s, v3.4s, v11.s[1] + OP_ri v31.4s, v2.4s, v11.s[1] + OP_ir v31.4s, v3.4s, v10.s[1] .endm .macro SAVE8x4 - mov pCRow1, pCRow0 + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] fmul v0.4s, v16.4s, alphaV0_R fmls v0.4s, v17.4s, alphaV0_I - fmul v1.4s, v16.4s, alphaV1_I - fmla v1.4s, v17.4s, alphaV1_R - st2 {v0.4s, v1.4s}, [pCRow1] - - 
add pCRow2, pCRow1, #32 + fmul v1.4s, v16.4s, alphaV0_I + fmla v1.4s, v17.4s, alphaV0_R + st2 {v0.4s, v1.4s}, [pCRow0] + add pCRow0, pCRow0, #32 fmul v2.4s, v18.4s, alphaV0_R fmls v2.4s, v19.4s, alphaV0_I - fmul v3.4s, v18.4s, alphaV1_I - fmla v3.4s, v19.4s, alphaV1_R - st2 {v2.4s, v3.4s}, [pCRow2] - - add pCRow1, pCRow1, LDC + fmul v3.4s, v18.4s, alphaV0_I + fmla v3.4s, v19.4s, alphaV0_R + st2 {v2.4s, v3.4s}, [pCRow0] + add pCRow0, pCRow0, #32 + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] fmul v4.4s, v20.4s, alphaV0_R fmls v4.4s, v21.4s, alphaV0_I - fmul v5.4s, v20.4s, alphaV1_I - fmla v5.4s, v21.4s, alphaV1_R + fmul v5.4s, v20.4s, alphaV0_I + fmla v5.4s, v21.4s, alphaV0_R st2 {v4.4s, v5.4s}, [pCRow1] - add pCRow2, pCRow1, #32 - + add pCRow1, pCRow1, #32 fmul v6.4s, v22.4s, alphaV0_R fmls v6.4s, v23.4s, alphaV0_I - fmul v7.4s, v22.4s, alphaV1_I - fmla v7.4s, v23.4s, alphaV1_R - st2 {v6.4s, v7.4s}, [pCRow2] - - add pCRow1, pCRow1, LDC + fmul v7.4s, v22.4s, alphaV0_I + fmla v7.4s, v23.4s, alphaV0_R + st2 {v6.4s, v7.4s}, [pCRow1] + add pCRow1, pCRow1, #32 + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] fmul v0.4s, v24.4s, alphaV0_R fmls v0.4s, v25.4s, alphaV0_I - fmul v1.4s, v24.4s, alphaV1_I - fmla v1.4s, v25.4s, alphaV1_R - st2 {v0.4s, v1.4s}, [pCRow1] - - add pCRow2, pCRow1, #32 + fmul v1.4s, v24.4s, alphaV0_I + fmla v1.4s, v25.4s, alphaV0_R + st2 {v0.4s, v1.4s}, [pCRow2] + add pCRow2, pCRow2, #32 fmul v2.4s, v26.4s, alphaV0_R fmls v2.4s, v27.4s, alphaV0_I - fmul v3.4s, v26.4s, alphaV1_I - fmla v3.4s, v27.4s, alphaV1_R + fmul v3.4s, v26.4s, alphaV0_I + fmla v3.4s, v27.4s, alphaV0_R st2 {v2.4s, v3.4s}, [pCRow2] - add pCRow1, pCRow1, LDC - + add pCRow2, pCRow2, #32 + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] fmul v4.4s, v28.4s, alphaV0_R fmls v4.4s, v29.4s, alphaV0_I - fmul v5.4s, v28.4s, alphaV1_I - fmla v5.4s, v29.4s, alphaV1_R - st2 {v4.4s, v5.4s}, [pCRow1] - - add pCRow2, pCRow1, #32 + fmul v5.4s, v28.4s, alphaV0_I + fmla v5.4s, v29.4s, alphaV0_R + st2 {v4.4s, v5.4s}, [pCRow3] + add pCRow3, pCRow3, #32 fmul v6.4s, v30.4s, alphaV0_R fmls v6.4s, v31.4s, alphaV0_I - fmul v7.4s, v30.4s, alphaV1_I - fmla v7.4s, v31.4s, alphaV1_R - st2 {v6.4s, v7.4s}, [pCRow2] + fmul v7.4s, v30.4s, alphaV0_I + fmla v7.4s, v31.4s, alphaV0_R + st2 {v6.4s, v7.4s}, [pCRow3] - add pCRow0, pCRow0, #64 + add pCRow3, pCRow3, #32 .endm /******************************************************************************/ @@ -722,13 +763,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE4x4 + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI + mov pCRow1, pCRow0 fmul v0.4s, v16.4s, alphaV0_R fmls v0.4s, v17.4s, alphaV0_I - fmul v1.4s, v16.4s, alphaV1_I - fmla v1.4s, v17.4s, alphaV1_R + fmul v1.4s, v16.4s, alphaV0_I + fmla v1.4s, v17.4s, alphaV0_R st2 {v0.4s, v1.4s}, [pCRow1] add pCRow1, pCRow1, LDC @@ -736,8 +780,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmul v4.4s, v20.4s, alphaV0_R fmls v4.4s, v21.4s, alphaV0_I - fmul v5.4s, v20.4s, alphaV1_I - fmla v5.4s, v21.4s, alphaV1_R + fmul v5.4s, v20.4s, alphaV0_I + fmla v5.4s, v21.4s, alphaV0_R st2 {v4.4s, v5.4s}, [pCRow1] add pCRow1, pCRow1, LDC @@ -745,8 +789,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
fmul v0.4s, v24.4s, alphaV0_R fmls v0.4s, v25.4s, alphaV0_I - fmul v1.4s, v24.4s, alphaV1_I - fmla v1.4s, v25.4s, alphaV1_R + fmul v1.4s, v24.4s, alphaV0_I + fmla v1.4s, v25.4s, alphaV0_R st2 {v0.4s, v1.4s}, [pCRow1] add pCRow1, pCRow1, LDC @@ -754,8 +798,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmul v4.4s, v28.4s, alphaV0_R fmls v4.4s, v29.4s, alphaV0_I - fmul v5.4s, v28.4s, alphaV1_I - fmla v5.4s, v29.4s, alphaV1_R + fmul v5.4s, v28.4s, alphaV0_I + fmla v5.4s, v29.4s, alphaV0_R st2 {v4.4s, v5.4s}, [pCRow1] add pCRow0, pCRow0, #32 @@ -802,13 +846,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE2x4 + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI + mov pCRow1, pCRow0 fmul v0.2s, v16.2s, alphaV0_R fmls v0.2s, v17.2s, alphaV0_I - fmul v1.2s, v16.2s, alphaV1_I - fmla v1.2s, v17.2s, alphaV1_R + fmul v1.2s, v16.2s, alphaV0_I + fmla v1.2s, v17.2s, alphaV0_R st2 {v0.2s, v1.2s}, [pCRow1] add pCRow1, pCRow1, LDC @@ -816,8 +863,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmul v4.2s, v20.2s, alphaV0_R fmls v4.2s, v21.2s, alphaV0_I - fmul v5.2s, v20.2s, alphaV1_I - fmla v5.2s, v21.2s, alphaV1_R + fmul v5.2s, v20.2s, alphaV0_I + fmla v5.2s, v21.2s, alphaV0_R st2 {v4.2s, v5.2s}, [pCRow1] add pCRow1, pCRow1, LDC @@ -825,8 +872,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmul v0.2s, v24.2s, alphaV0_R fmls v0.2s, v25.2s, alphaV0_I - fmul v1.2s, v24.2s, alphaV1_I - fmla v1.2s, v25.2s, alphaV1_R + fmul v1.2s, v24.2s, alphaV0_I + fmla v1.2s, v25.2s, alphaV0_R st2 {v0.2s, v1.2s}, [pCRow1] add pCRow1, pCRow1, LDC @@ -834,8 +881,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmul v4.2s, v28.2s, alphaV0_R fmls v4.2s, v29.2s, alphaV0_I - fmul v5.2s, v28.2s, alphaV1_I - fmla v5.2s, v29.2s, alphaV1_R + fmul v5.2s, v28.2s, alphaV0_I + fmla v5.2s, v29.2s, alphaV0_R st2 {v4.2s, v5.2s}, [pCRow1] add pCRow0, pCRow0, #16 @@ -882,13 +929,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE1x4 + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI + mov pCRow1, pCRow0 fmul s0, s16, alphaV0_R fmls s0, s17, alphaV0_I - fmul s1, s16, alphaV1_I - fmla s1, s17, alphaV1_R + fmul s1, s16, alphaV0_I + fmla s1, s17, alphaV0_R st2 {v0.s, v1.s}[0], [pCRow1] add pCRow1, pCRow1, LDC @@ -896,8 +946,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmul s4, s20, alphaV0_R fmls s4, s21, alphaV0_I - fmul s5, s20, alphaV1_I - fmla s5, s21, alphaV1_R + fmul s5, s20, alphaV0_I + fmla s5, s21, alphaV0_R st2 {v4.s, v5.s}[0], [pCRow1] add pCRow1, pCRow1, LDC @@ -905,8 +955,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmul s0, s24, alphaV0_R fmls s0, s25, alphaV0_I - fmul s1, s24, alphaV1_I - fmla s1, s25, alphaV1_R + fmul s1, s24, alphaV0_I + fmla s1, s25, alphaV0_R st2 {v0.s, v1.s}[0], [pCRow1] add pCRow1, pCRow1, LDC @@ -914,8 +964,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmul s4, s28, alphaV0_R fmls s4, s29, alphaV0_I - fmul s5, s28, alphaV1_I - fmla s5, s29, alphaV1_R + fmul s5, s28, alphaV0_I + fmla s5, s29, alphaV0_R st2 {v4.s, v5.s}[0], [pCRow1] add pCRow0, pCRow0, #8 @@ -964,13 +1014,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro SAVE8x2 + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI + mov pCRow1, pCRow0 fmul v0.4s, v16.4s, alphaV0_R fmls v0.4s, v17.4s, alphaV0_I - fmul v1.4s, v16.4s, alphaV1_I - fmla v1.4s, v17.4s, alphaV1_R + fmul v1.4s, v16.4s, alphaV0_I + fmla v1.4s, v17.4s, alphaV0_R st2 {v0.4s, v1.4s}, [pCRow1] add pCRow2, pCRow1, #32 @@ -978,8 +1031,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmul v2.4s, v18.4s, alphaV0_R fmls v2.4s, v19.4s, alphaV0_I - fmul v3.4s, v18.4s, alphaV1_I - fmla v3.4s, v19.4s, alphaV1_R + fmul v3.4s, v18.4s, alphaV0_I + fmla v3.4s, v19.4s, alphaV0_R st2 {v2.4s, v3.4s}, [pCRow2] add pCRow1, pCRow1, LDC @@ -987,8 +1040,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmul v4.4s, v20.4s, alphaV0_R fmls v4.4s, v21.4s, alphaV0_I - fmul v5.4s, v20.4s, alphaV1_I - fmla v5.4s, v21.4s, alphaV1_R + fmul v5.4s, v20.4s, alphaV0_I + fmla v5.4s, v21.4s, alphaV0_R st2 {v4.4s, v5.4s}, [pCRow1] add pCRow2, pCRow1, #32 @@ -996,8 +1049,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmul v6.4s, v22.4s, alphaV0_R fmls v6.4s, v23.4s, alphaV0_I - fmul v7.4s, v22.4s, alphaV1_I - fmla v7.4s, v23.4s, alphaV1_R + fmul v7.4s, v22.4s, alphaV0_I + fmla v7.4s, v23.4s, alphaV0_R st2 {v6.4s, v7.4s}, [pCRow2] add pCRow0, pCRow0, #64 @@ -1030,13 +1083,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE4x2 + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI + mov pCRow1, pCRow0 fmul v0.4s, v16.4s, alphaV0_R fmls v0.4s, v17.4s, alphaV0_I - fmul v1.4s, v16.4s, alphaV1_I - fmla v1.4s, v17.4s, alphaV1_R + fmul v1.4s, v16.4s, alphaV0_I + fmla v1.4s, v17.4s, alphaV0_R st2 {v0.4s, v1.4s}, [pCRow1] add pCRow1, pCRow1, LDC @@ -1044,8 +1100,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmul v4.4s, v20.4s, alphaV0_R fmls v4.4s, v21.4s, alphaV0_I - fmul v5.4s, v20.4s, alphaV1_I - fmla v5.4s, v21.4s, alphaV1_R + fmul v5.4s, v20.4s, alphaV0_I + fmla v5.4s, v21.4s, alphaV0_R st2 {v4.4s, v5.4s}, [pCRow1] add pCRow0, pCRow0, #32 @@ -1078,13 +1134,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE2x2 + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI + mov pCRow1, pCRow0 fmul v0.2s, v16.2s, alphaV0_R fmls v0.2s, v17.2s, alphaV0_I - fmul v1.2s, v16.2s, alphaV1_I - fmla v1.2s, v17.2s, alphaV1_R + fmul v1.2s, v16.2s, alphaV0_I + fmla v1.2s, v17.2s, alphaV0_R st2 {v0.2s, v1.2s}, [pCRow1] add pCRow1, pCRow1, LDC @@ -1092,8 +1151,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmul v4.2s, v20.2s, alphaV0_R fmls v4.2s, v21.2s, alphaV0_I - fmul v5.2s, v20.2s, alphaV1_I - fmla v5.2s, v21.2s, alphaV1_R + fmul v5.2s, v20.2s, alphaV0_I + fmla v5.2s, v21.2s, alphaV0_R st2 {v4.2s, v5.2s}, [pCRow1] add pCRow0, pCRow0, #16 @@ -1126,13 +1185,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE1x2 + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI + mov pCRow1, pCRow0 fmul s0, s16, alphaV0_R fmls s0, s17, alphaV0_I - fmul s1, s16, alphaV1_I - fmla s1, s17, alphaV1_R + fmul s1, s16, alphaV0_I + fmla s1, s17, alphaV0_R st2 {v0.s, v1.s}[0], [pCRow1] add pCRow1, pCRow1, LDC @@ -1140,8 +1202,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
fmul s4, s20, alphaV0_R fmls s4, s21, alphaV0_I - fmul s5, s20, alphaV1_I - fmla s5, s21, alphaV1_R + fmul s5, s20, alphaV0_I + fmla s5, s21, alphaV0_R st2 {v4.s, v5.s}[0], [pCRow1] add pCRow0, pCRow0, #8 @@ -1176,13 +1238,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE8x1 + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI + mov pCRow1, pCRow0 fmul v0.4s, v16.4s, alphaV0_R fmls v0.4s, v17.4s, alphaV0_I - fmul v1.4s, v16.4s, alphaV1_I - fmla v1.4s, v17.4s, alphaV1_R + fmul v1.4s, v16.4s, alphaV0_I + fmla v1.4s, v17.4s, alphaV0_R st2 {v0.4s, v1.4s}, [pCRow1] add pCRow1, pCRow1, #32 @@ -1190,8 +1255,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. fmul v2.4s, v18.4s, alphaV0_R fmls v2.4s, v19.4s, alphaV0_I - fmul v3.4s, v18.4s, alphaV1_I - fmla v3.4s, v19.4s, alphaV1_R + fmul v3.4s, v18.4s, alphaV0_I + fmla v3.4s, v19.4s, alphaV0_R st2 {v2.4s, v3.4s}, [pCRow1] add pCRow0, pCRow0, #64 @@ -1218,13 +1283,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE4x1 + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI + mov pCRow1, pCRow0 fmul v0.4s, v16.4s, alphaV0_R fmls v0.4s, v17.4s, alphaV0_I - fmul v1.4s, v16.4s, alphaV1_I - fmla v1.4s, v17.4s, alphaV1_R + fmul v1.4s, v16.4s, alphaV0_I + fmla v1.4s, v17.4s, alphaV0_R st2 {v0.4s, v1.4s}, [pCRow1] add pCRow0, pCRow0, #32 @@ -1250,13 +1318,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE2x1 + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI + mov pCRow1, pCRow0 fmul v0.2s, v16.2s, alphaV0_R fmls v0.2s, v17.2s, alphaV0_I - fmul v1.2s, v16.2s, alphaV1_I - fmla v1.2s, v17.2s, alphaV1_R + fmul v1.2s, v16.2s, alphaV0_I + fmla v1.2s, v17.2s, alphaV0_R st2 {v0.2s, v1.2s}, [pCRow1] add pCRow0, pCRow0, #16 @@ -1283,13 +1354,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE1x1 + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI + mov pCRow1, pCRow0 fmul s0, s16, alphaV0_R fmls s0, s17, alphaV0_I - fmul s1, s16, alphaV1_I - fmla s1, s17, alphaV1_R + fmul s1, s16, alphaV0_I + fmla s1, s17, alphaV0_R st2 {v0.s, v1.s}[0], [pCRow1] add pCRow0, pCRow0, #8 @@ -1315,10 +1389,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stp x26, x27, [sp, #(9 * 16)] str x28, [sp, #(10 * 16)] - fmov alpha0_R, s0 - fmov alpha0_I, s1 - fmov alpha1_R, s0 - fmov alpha1_I, s1 + prfm PLDL1KEEP, [origPB] + prfm PLDL1KEEP, [origPA] + + fmov alphaR, s0 + fmov alphaI, s1 lsl LDC, LDC, #3 // ldc = ldc * 8 @@ -1335,8 +1410,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /******************************************************************************/ ctrmm_kernel_L4_BEGIN: - mov pCRow0, pC // pCRow0 = C - add pC, pC, LDC, lsl #2 + mov pCRow0, pC + add pCRow1, pCRow0, LDC + add pCRow2, pCRow1, LDC + add pCRow3, pCRow2, LDC + + add pC, pCRow3, LDC + #if defined(LEFT) mov tempOffset, offset @@ -1370,40 +1450,64 @@ ctrmm_kernel_L4_M8_20: add tempK, tempOffset, #4 #endif - asr counterL , tempK, #1 // L = K / 2 - cmp counterL , #2 // is there at least 4 to do? 
+ asr counterL , tempK, #3 + cmp counterL , #2 blt ctrmm_kernel_L4_M8_32 - KERNEL8x4_I // do one in the K - KERNEL8x4_M2 // do another in the K + KERNEL8x4_I + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 subs counterL, counterL, #2 // subtract 2 ble ctrmm_kernel_L4_M8_22a - .align 5 + .align 5 ctrmm_kernel_L4_M8_22: KERNEL8x4_M1 KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 subs counterL, counterL, #1 bgt ctrmm_kernel_L4_M8_22 - + .align 5 ctrmm_kernel_L4_M8_22a: + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 KERNEL8x4_M1 KERNEL8x4_E b ctrmm_kernel_L4_M8_44 + .align 5 ctrmm_kernel_L4_M8_32: tst counterL, #1 ble ctrmm_kernel_L4_M8_40 KERNEL8x4_I - + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 KERNEL8x4_E b ctrmm_kernel_L4_M8_44 @@ -1414,13 +1518,17 @@ ctrmm_kernel_L4_M8_40: ctrmm_kernel_L4_M8_44: - ands counterL , tempK, #1 + ands counterL , tempK, #7 ble ctrmm_kernel_L4_M8_100 + .align 5 ctrmm_kernel_L4_M8_46: KERNEL8x4_SUB + subs counterL, counterL, #1 + bne ctrmm_kernel_L4_M8_46 + ctrmm_kernel_L4_M8_100: SAVE8x4 @@ -1440,6 +1548,9 @@ ctrmm_kernel_L4_M8_100: #if defined(LEFT) add tempOffset, tempOffset, #8 #endif + prfm PLDL1KEEP, [pA] + prfm PLDL1KEEP, [pA, #64] + prfm PLDL1KEEP, [origPB] ctrmm_kernel_L4_M8_END: subs counterI, counterI, #1 @@ -1454,9 +1565,8 @@ ctrmm_kernel_L4_M4_BEGIN: tst counterI, #4 ble ctrmm_kernel_L4_M2_BEGIN -ctrmm_kernel_L4_M4_20: - INIT4x4 +ctrmm_kernel_L4_M4_20: #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) mov pB, origPB @@ -1475,38 +1585,47 @@ ctrmm_kernel_L4_M4_20: add tempK, tempOffset, #4 #endif - asr counterL , tempK, #3 // counterL = counterL / 8 - cmp counterL , #0 - ble ctrmm_kernel_L4_M4_40 + asr counterL , tempK, #1 // L = K / 2 + cmp counterL , #2 // is there at least 4 to do? + blt ctrmm_kernel_L4_M4_32 + + KERNEL4x4_I // do one in the K + KERNEL4x4_M2 // do another in the K + + subs counterL, counterL, #2 + ble ctrmm_kernel_L4_M4_22a + .align 5 + ctrmm_kernel_L4_M4_22: - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB - KERNEL4x4_SUB + KERNEL4x4_M1 + KERNEL4x4_M2 subs counterL, counterL, #1 bgt ctrmm_kernel_L4_M4_22 - +ctrmm_kernel_L4_M4_22a: + KERNEL4x4_M1 + KERNEL4x4_E + b ctrmm_kernel_L4_M4_44 +ctrmm_kernel_L4_M4_32: + tst counterL, #1 + ble ctrmm_kernel_L4_M4_40 + KERNEL4x4_I + KERNEL4x4_E + b ctrmm_kernel_L4_M4_44 ctrmm_kernel_L4_M4_40: - ands counterL , tempK, #7 // counterL = counterL % 8 + INIT4x4 + +ctrmm_kernel_L4_M4_44: + ands counterL , tempK, #1 ble ctrmm_kernel_L4_M4_100 -ctrmm_kernel_L4_M4_42: - +ctrmm_kernel_L4_M4_46: KERNEL4x4_SUB - subs counterL, counterL, #1 - bgt ctrmm_kernel_L4_M4_42 - ctrmm_kernel_L4_M4_100: SAVE4x4 @@ -1528,7 +1647,6 @@ ctrmm_kernel_L4_M4_100: ctrmm_kernel_L4_M4_END: - ctrmm_kernel_L4_M2_BEGIN: mov counterI, origM diff --git a/kernel/arm64/dtrmm_kernel_8x4.S b/kernel/arm64/dtrmm_kernel_8x4.S index b06c7560d..2b8173715 100644 --- a/kernel/arm64/dtrmm_kernel_8x4.S +++ b/kernel/arm64/dtrmm_kernel_8x4.S @@ -46,19 +46,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define pCRow0 x12 #define pCRow1 x13 #define pCRow2 x14 -#define pA x15 -#define temp x16 -#define tempOffset x17 -#define tempK x18 +#define pCRow3 x15 +#define pA x16 +#define alpha x17 +#define temp x18 +#define tempOffset x19 +#define tempK x20 #define alpha0 d10 #define alphaV0 v10.d[0] -#define alpha1 d11 -#define alphaV1 v11.d[0] -#define alpha2 d14 -#define alphaV2 v14.d[0] -#define alpha3 d15 -#define alphaV3 v15.d[0] + +#define A_PRE_SIZE 2560 +#define B_PRE_SIZE 448 +#define C_PRE_SIZE 128 // 00 origM // 01 origN @@ -101,14 +101,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. //v05 pA1_2, pA1_3 //v06 pA1_4, pA1_5 //v07 pA1_6, pA1_7 -//v08 must save pB0_0, pB0_1 -//v09 must save pB0_2, pB0_3 -//v10 must save ALPHA0 -//v11 must save ALPHA1 -//v12 must save pB1_0, pB1_1 -//v13 must save pB1_2, pB1_3 -//v14 must save ALPHA2 -//v15 must save ALPHA3 +//v08 must save pB0_0 +//v09 must save pB0_1 +//v10 must save pB0_2 --> ALPHA0 +//v11 must save pB0_3 +//v12 must save pB1_0 +//v13 must save pB1_1 +//v14 must save pB1_2 +//v15 must save pB1_3 //v16 must save C00, C01 //v17 must save C02, C03 //v18 C04, C05 @@ -150,186 +150,249 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL8x4_I - ld1 {v0.2d, v1.2d}, [pA] - add pA, pA, #32 - ld1 {v8.2d, v9.2d}, [pB] - add pB, pB, #32 - ld1 {v2.2d, v3.2d}, [pA] - add pA, pA, #32 + ldp q0, q1, [pA], #32 + + ldp d8, d9, [pB], #16 fmul v16.2d, v0.2d, v8.d[0] + fmul v20.2d, v0.2d, v9.d[0] + + ldp d10, d11, [pB], #16 + fmul v17.2d, v1.2d, v8.d[0] + fmul v21.2d, v1.2d, v9.d[0] + + ldp q2, q3, [pA], #32 + + fmul v24.2d, v0.2d, v10.d[0] + fmul v28.2d, v0.2d, v11.d[0] + + ldp q4, q5, [pA], #32 + + fmul v25.2d, v1.2d, v10.d[0] + fmul v29.2d, v1.2d, v11.d[0] + + ldp d12, d13, [pB], #16 + fmul v18.2d, v2.2d, v8.d[0] + fmul v22.2d, v2.2d, v9.d[0] + + ldp d14, d15, [pB], #16 + + fmul v26.2d, v2.2d, v10.d[0] + fmul v30.2d, v2.2d, v11.d[0] + + ldp q6, q7, [pA], #32 + fmul v19.2d, v3.2d, v8.d[0] + fmul v27.2d, v3.2d, v10.d[0] - fmul v20.2d, v0.2d, v8.d[1] - fmul v21.2d, v1.2d, v8.d[1] - fmul v22.2d, v2.2d, v8.d[1] - fmul v23.2d, v3.2d, v8.d[1] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] - fmul v24.2d, v0.2d, v9.d[0] - fmul v25.2d, v1.2d, v9.d[0] - fmul v26.2d, v2.2d, v9.d[0] - fmul v27.2d, v3.2d, v9.d[0] + fmul v31.2d, v3.2d, v11.d[0] + fmul v23.2d, v3.2d, v9.d[0] - fmul v28.2d, v0.2d, v9.d[1] - fmul v29.2d, v1.2d, v9.d[1] - fmul v30.2d, v2.2d, v9.d[1] - fmul v31.2d, v3.2d, v9.d[1] - - ld1 {v4.2d, v5.2d}, [pA] - add pA, pA, #32 - ld1 {v12.2d, v13.2d}, [pB] - add pB, pB, #32 - ld1 {v6.2d, v7.2d}, [pA] - add pA, pA, #32 + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] .endm .macro KERNEL8x4_M1 fmla v16.2d, v0.2d, v8.d[0] + fmla v20.2d, v0.2d, v9.d[0] + + ldp q4, q5, [pA], #32 + + fmla v24.2d, v0.2d, v10.d[0] + fmla v28.2d, v0.2d, v11.d[0] + + ldp d12, d13, [pB], #16 + fmla v17.2d, v1.2d, v8.d[0] + fmla v25.2d, v1.2d, v10.d[0] + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] + + fmla v21.2d, v1.2d, v9.d[0] + fmla v29.2d, v1.2d, v11.d[0] + + ldp d14, d15, [pB], #16 + fmla v18.2d, v2.2d, v8.d[0] + fmla v22.2d, v2.2d, v9.d[0] + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + + fmla v26.2d, v2.2d, v10.d[0] + fmla v30.2d, v2.2d, v11.d[0] fmla v19.2d, v3.2d, v8.d[0] + fmla v23.2d, v3.2d, v9.d[0] - fmla v20.2d, v0.2d, v8.d[1] - fmla v21.2d, v1.2d, v8.d[1] - fmla v22.2d, v2.2d, v8.d[1] - fmla v23.2d, v3.2d, v8.d[1] + ldp q6, q7, [pA], #32 - fmla v24.2d, v0.2d, v9.d[0] - fmla v25.2d, v1.2d, v9.d[0] - fmla v26.2d, v2.2d, v9.d[0] - fmla 
v27.2d, v3.2d, v9.d[0] - - fmla v28.2d, v0.2d, v9.d[1] - fmla v29.2d, v1.2d, v9.d[1] - fmla v30.2d, v2.2d, v9.d[1] - fmla v31.2d, v3.2d, v9.d[1] - - ld1 {v4.2d, v5.2d}, [pA] - add pA, pA, #32 - ld1 {v12.2d, v13.2d}, [pB] - add pB, pB, #32 - ld1 {v6.2d, v7.2d}, [pA] - add pA, pA, #32 - - prfm PLDL1KEEP, [pA, #512] + fmla v27.2d, v3.2d, v10.d[0] + fmla v31.2d, v3.2d, v11.d[0] .endm .macro KERNEL8x4_M2 fmla v16.2d, v4.2d, v12.d[0] + fmla v20.2d, v4.2d, v13.d[0] + fmla v24.2d, v4.2d, v14.d[0] + fmla v28.2d, v4.2d, v15.d[0] + + ldp q0, q1, [pA], #32 + fmla v17.2d, v5.2d, v12.d[0] + fmla v25.2d, v5.2d, v14.d[0] + + ldp d8, d9, [pB], #16 + + fmla v21.2d, v5.2d, v13.d[0] + fmla v29.2d, v5.2d, v15.d[0] + + ldp d10, d11, [pB], #16 + fmla v18.2d, v6.2d, v12.d[0] + fmla v22.2d, v6.2d, v13.d[0] + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + + fmla v26.2d, v6.2d, v14.d[0] + fmla v30.2d, v6.2d, v15.d[0] + fmla v19.2d, v7.2d, v12.d[0] + fmla v23.2d, v7.2d, v13.d[0] - fmla v20.2d, v4.2d, v12.d[1] - fmla v21.2d, v5.2d, v12.d[1] - fmla v22.2d, v6.2d, v12.d[1] - fmla v23.2d, v7.2d, v12.d[1] + ldp q2, q3, [pA], #32 - fmla v24.2d, v4.2d, v13.d[0] - fmla v25.2d, v5.2d, v13.d[0] - fmla v26.2d, v6.2d, v13.d[0] - fmla v27.2d, v7.2d, v13.d[0] - - fmla v28.2d, v4.2d, v13.d[1] - fmla v29.2d, v5.2d, v13.d[1] - fmla v30.2d, v6.2d, v13.d[1] - fmla v31.2d, v7.2d, v13.d[1] - - ld1 {v0.2d, v1.2d}, [pA] - add pA, pA, #32 - ld1 {v8.2d, v9.2d}, [pB] - add pB, pB, #32 - ld1 {v2.2d, v3.2d}, [pA] - add pA, pA, #32 - - prfm PLDL1KEEP, [pB, #512] + fmla v27.2d, v7.2d, v14.d[0] + fmla v31.2d, v7.2d, v15.d[0] .endm .macro KERNEL8x4_E fmla v16.2d, v4.2d, v12.d[0] + fmla v20.2d, v4.2d, v13.d[0] + fmla v24.2d, v4.2d, v14.d[0] + fmla v28.2d, v4.2d, v15.d[0] + fmla v17.2d, v5.2d, v12.d[0] + fmla v25.2d, v5.2d, v14.d[0] + fmla v21.2d, v5.2d, v13.d[0] + fmla v29.2d, v5.2d, v15.d[0] + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla v18.2d, v6.2d, v12.d[0] + fmla v22.2d, v6.2d, v13.d[0] + fmla v26.2d, v6.2d, v14.d[0] + fmla v30.2d, v6.2d, v15.d[0] + fmla v19.2d, v7.2d, v12.d[0] - - fmla v20.2d, v4.2d, v12.d[1] - fmla v21.2d, v5.2d, v12.d[1] - fmla v22.2d, v6.2d, v12.d[1] - fmla v23.2d, v7.2d, v12.d[1] - - fmla v24.2d, v4.2d, v13.d[0] - fmla v25.2d, v5.2d, v13.d[0] - fmla v26.2d, v6.2d, v13.d[0] - fmla v27.2d, v7.2d, v13.d[0] - - fmla v28.2d, v4.2d, v13.d[1] - fmla v29.2d, v5.2d, v13.d[1] - fmla v30.2d, v6.2d, v13.d[1] - fmla v31.2d, v7.2d, v13.d[1] + fmla v23.2d, v7.2d, v13.d[0] + fmla v27.2d, v7.2d, v14.d[0] + fmla v31.2d, v7.2d, v15.d[0] .endm .macro KERNEL8x4_SUB - ld1 {v0.2d, v1.2d}, [pA] - add pA, pA, #32 - ld1 {v8.2d, v9.2d}, [pB] - add pB, pB, #32 - ld1 {v2.2d, v3.2d}, [pA] - add pA, pA, #32 + ldp q0, q1, [pA], #32 + + ldp d8, d9, [pB], #16 fmla v16.2d, v0.2d, v8.d[0] + fmla v20.2d, v0.2d, v9.d[0] + + ldp d10, d11, [pB], #16 + fmla v17.2d, v1.2d, v8.d[0] + fmla v21.2d, v1.2d, v9.d[0] + + ldp q2, q3, [pA], #32 + + fmla v24.2d, v0.2d, v10.d[0] + fmla v28.2d, v0.2d, v11.d[0] + + fmla v25.2d, v1.2d, v10.d[0] + fmla v29.2d, v1.2d, v11.d[0] + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmla v18.2d, v2.2d, v8.d[0] + fmla v22.2d, v2.2d, v9.d[0] + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] + + fmla v26.2d, v2.2d, v10.d[0] + fmla v30.2d, v2.2d, v11.d[0] + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla v19.2d, v3.2d, v8.d[0] + fmla v27.2d, v3.2d, v10.d[0] - fmla v20.2d, v0.2d, v8.d[1] - fmla v21.2d, v1.2d, v8.d[1] - fmla v22.2d, v2.2d, v8.d[1] - fmla v23.2d, v3.2d, v8.d[1] - - fmla v24.2d, v0.2d, v9.d[0] - fmla v25.2d, v1.2d, v9.d[0] - fmla v26.2d, v2.2d, v9.d[0] - fmla 
v27.2d, v3.2d, v9.d[0] - - fmla v28.2d, v0.2d, v9.d[1] - fmla v29.2d, v1.2d, v9.d[1] - fmla v30.2d, v2.2d, v9.d[1] - fmla v31.2d, v3.2d, v9.d[1] + fmla v31.2d, v3.2d, v11.d[0] + fmla v23.2d, v3.2d, v9.d[0] .endm .macro SAVE8x4 - add pCRow1, pCRow0, LDC + fmov alpha0, alpha + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] fmul v0.2d, v16.2d, alphaV0 - fmul v1.2d, v17.2d, alphaV1 - fmul v2.2d, v18.2d, alphaV2 - fmul v3.2d, v19.2d, alphaV3 - st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] + fmul v1.2d, v17.2d, alphaV0 + stp q0, q1, [pCRow0] - add pCRow2, pCRow1, LDC + add pCRow0, pCRow0, #32 + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + fmul v2.2d, v18.2d, alphaV0 + fmul v3.2d, v19.2d, alphaV0 + stp q2, q3, [pCRow0] + + add pCRow0, pCRow0, #32 + + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] fmul v4.2d, v20.2d, alphaV0 - fmul v5.2d, v21.2d, alphaV1 - fmul v6.2d, v22.2d, alphaV2 - fmul v7.2d, v23.2d, alphaV3 - st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1] + fmul v5.2d, v21.2d, alphaV0 + stp q4, q5, [pCRow1] - add pCRow1, pCRow2, LDC + add pCRow1, pCRow1, #32 + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + fmul v6.2d, v22.2d, alphaV0 + fmul v7.2d, v23.2d, alphaV0 + stp q6, q7, [pCRow1] + + add pCRow1, pCRow1, #32 + + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] fmul v0.2d, v24.2d, alphaV0 - fmul v1.2d, v25.2d, alphaV1 - fmul v2.2d, v26.2d, alphaV2 - fmul v3.2d, v27.2d, alphaV3 - st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow2] + fmul v1.2d, v25.2d, alphaV0 + stp q0, q1, [pCRow2] + + add pCRow2, pCRow2, #32 + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + fmul v2.2d, v26.2d, alphaV0 + fmul v3.2d, v27.2d, alphaV0 + stp q2, q3, [pCRow2] + + add pCRow2, pCRow2, #32 + + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] fmul v4.2d, v28.2d, alphaV0 - fmul v5.2d, v29.2d, alphaV1 - fmul v6.2d, v30.2d, alphaV2 - fmul v7.2d, v31.2d, alphaV3 - st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1] + fmul v5.2d, v29.2d, alphaV0 + stp q4, q5, [pCRow3] - add pCRow0, pCRow0, #64 + add pCRow3, pCRow3, #32 + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] + + fmul v6.2d, v30.2d, alphaV0 + fmul v7.2d, v31.2d, alphaV0 + stp q6, q7, [pCRow3] + + add pCRow3, pCRow3, #32 .endm /******************************************************************************/ @@ -365,26 +428,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE4x4 + fmov alpha0, alpha fmul v8.2d, v16.2d, alphaV0 - fmul v9.2d, v17.2d, alphaV1 + fmul v9.2d, v17.2d, alphaV0 st1 {v8.2d, v9.2d}, [pCRow0] add pCRow1, pCRow0, LDC - fmul v12.2d, v20.2d, alphaV2 - fmul v13.2d, v21.2d, alphaV3 + fmul v12.2d, v20.2d, alphaV0 + fmul v13.2d, v21.2d, alphaV0 st1 {v12.2d, v13.2d}, [pCRow1] add pCRow2, pCRow1, LDC fmul v8.2d, v24.2d, alphaV0 - fmul v9.2d, v25.2d, alphaV1 + fmul v9.2d, v25.2d, alphaV0 st1 {v8.2d, v9.2d}, [pCRow2] add pCRow1, pCRow2, LDC - fmul v12.2d, v28.2d, alphaV2 - fmul v13.2d, v29.2d, alphaV3 + fmul v12.2d, v28.2d, alphaV0 + fmul v13.2d, v29.2d, alphaV0 st1 {v12.2d, v13.2d}, [pCRow1] add pCRow0, pCRow0, #32 @@ -413,22 +477,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro SAVE2x4 + fmov alpha0, alpha fmul v8.2d, v16.2d, alphaV0 st1 {v8.2d}, [pCRow0] add pCRow1, pCRow0, LDC - fmul v12.2d, v20.2d, alphaV1 + fmul v12.2d, v20.2d, alphaV0 st1 {v12.2d}, [pCRow1] add pCRow2, pCRow1, LDC - fmul v8.2d, v24.2d, alphaV2 + fmul v8.2d, v24.2d, alphaV0 st1 {v8.2d}, [pCRow2] add pCRow1, pCRow2, LDC - fmul v12.2d, v28.2d, alphaV3 + fmul v12.2d, v28.2d, alphaV0 st1 {v12.2d}, [pCRow1] add pCRow0, pCRow0, #16 @@ -453,6 +518,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE1x4 + fmov alpha0, alpha + add pCRow1, pCRow0, LDC fmul v8.2d, v16.2d, alphaV0 @@ -462,7 +529,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. add pCRow2, pCRow1, LDC add pCRow1, pCRow2, LDC - fmul v12.2d, v20.2d, alphaV1 + fmul v12.2d, v20.2d, alphaV0 st1 {v12.d}[0], [pCRow2] st1 {v12.d}[1], [pCRow1] @@ -502,18 +569,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE8x2 + fmov alpha0, alpha add pCRow1, pCRow0, LDC fmul v0.2d, v16.2d, alphaV0 - fmul v1.2d, v17.2d, alphaV1 - fmul v2.2d, v18.2d, alphaV2 - fmul v3.2d, v19.2d, alphaV3 + fmul v1.2d, v17.2d, alphaV0 + fmul v2.2d, v18.2d, alphaV0 + fmul v3.2d, v19.2d, alphaV0 st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] fmul v4.2d, v20.2d, alphaV0 - fmul v5.2d, v21.2d, alphaV1 - fmul v6.2d, v22.2d, alphaV2 - fmul v7.2d, v23.2d, alphaV3 + fmul v5.2d, v21.2d, alphaV0 + fmul v6.2d, v22.2d, alphaV0 + fmul v7.2d, v23.2d, alphaV0 st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1] add pCRow0, pCRow0, #64 @@ -541,14 +609,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE4x2 + fmov alpha0, alpha fmul v8.2d, v16.2d, alphaV0 - fmul v9.2d, v17.2d, alphaV1 + fmul v9.2d, v17.2d, alphaV0 st1 {v8.2d, v9.2d}, [pCRow0] add pCRow1, pCRow0, LDC - fmul v12.2d, v20.2d, alphaV2 - fmul v13.2d, v21.2d, alphaV3 + fmul v12.2d, v20.2d, alphaV0 + fmul v13.2d, v21.2d, alphaV0 st1 {v12.2d, v13.2d}, [pCRow1] add pCRow0, pCRow0, #32 @@ -573,12 +642,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE2x2 + fmov alpha0, alpha fmul v8.2d, v16.2d, alphaV0 st1 {v8.2d}, [pCRow0] add pCRow1 , pCRow0, LDC - fmul v12.2d, v20.2d, alphaV1 + fmul v12.2d, v20.2d, alphaV0 st1 {v12.2d}, [pCRow1] add pCRow0, pCRow0, #16 @@ -601,6 +671,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE1x2 + fmov alpha0, alpha add pCRow1 , pCRow0, LDC fmul v8.2d, v16.2d, alphaV0 @@ -636,10 +707,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE8x1 + fmov alpha0, alpha fmul v0.2d, v16.2d, alphaV0 - fmul v1.2d, v17.2d, alphaV1 - fmul v2.2d, v18.2d, alphaV2 - fmul v3.2d, v19.2d, alphaV3 + fmul v1.2d, v17.2d, alphaV0 + fmul v2.2d, v18.2d, alphaV0 + fmul v3.2d, v19.2d, alphaV0 st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] add pCRow0, pCRow0, #64 @@ -665,8 +737,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE4x1 + fmov alpha0, alpha fmul v8.2d, v16.2d, alphaV0 - fmul v9.2d, v17.2d, alphaV1 + fmul v9.2d, v17.2d, alphaV0 st1 {v8.2d, v9.2d}, [pCRow0] add pCRow0, pCRow0, #32 @@ -690,6 +763,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE2x1 + fmov alpha0, alpha fmul v8.2d, v16.2d, alphaV0 st1 {v8.2d}, [pCRow0] @@ -713,6 +787,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro SAVE1x1 + fmov alpha0, alpha fmul d8, d16, alpha0 str d8, [pCRow0] @@ -739,10 +814,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stp x26, x27, [sp, #(9 * 16)] str x28, [sp, #(10 * 16)] - fmov alpha0, d0 - fmov alpha1, d0 - fmov alpha2, d0 - fmov alpha3, d0 + prfm PLDL1KEEP, [origPB] + prfm PLDL1KEEP, [origPA] + + fmov alpha, d0 lsl LDC, LDC, #3 // ldc = ldc * 8 @@ -759,8 +834,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /******************************************************************************/ dtrmm_kernel_L4_BEGIN: - mov pCRow0, pC // pCRow0 = C - add pC, pC, LDC, lsl #2 + mov pCRow0, pC + add pCRow1, pCRow0, LDC + add pCRow2, pCRow1, LDC + add pCRow3, pCRow2, LDC + + add pC, pCRow3, LDC + #if defined(LEFT) mov tempOffset, offset @@ -774,6 +854,7 @@ dtrmm_kernel_L4_M8_BEGIN: cmp counterI, #0 ble dtrmm_kernel_L4_M4_BEGIN + .align 5 dtrmm_kernel_L4_M8_20: #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) @@ -794,40 +875,64 @@ dtrmm_kernel_L4_M8_20: add tempK, tempOffset, #4 #endif - asr counterL , tempK, #1 // L = K / 2 + asr counterL , tempK, #3 // L = K / 8 cmp counterL , #2 // is there at least 4 to do? blt dtrmm_kernel_L4_M8_32 KERNEL8x4_I // do one in the K KERNEL8x4_M2 // do another in the K + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 subs counterL, counterL, #2 // subtract 2 ble dtrmm_kernel_L4_M8_22a - .align 5 + .align 5 dtrmm_kernel_L4_M8_22: KERNEL8x4_M1 KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 subs counterL, counterL, #1 bgt dtrmm_kernel_L4_M8_22 - + .align 5 dtrmm_kernel_L4_M8_22a: + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 KERNEL8x4_M1 KERNEL8x4_E b dtrmm_kernel_L4_M8_44 + .align 5 dtrmm_kernel_L4_M8_32: tst counterL, #1 ble dtrmm_kernel_L4_M8_40 KERNEL8x4_I - + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 + KERNEL8x4_M2 + KERNEL8x4_M1 KERNEL8x4_E b dtrmm_kernel_L4_M8_44 @@ -838,13 +943,17 @@ dtrmm_kernel_L4_M8_40: dtrmm_kernel_L4_M8_44: - ands counterL , tempK, #1 + ands counterL , tempK, #7 ble dtrmm_kernel_L4_M8_100 + .align 5 dtrmm_kernel_L4_M8_46: KERNEL8x4_SUB + subs counterL, counterL, #1 + bne dtrmm_kernel_L4_M8_46 + dtrmm_kernel_L4_M8_100: SAVE8x4 @@ -864,6 +973,9 @@ dtrmm_kernel_L4_M8_100: #if defined(LEFT) add tempOffset, tempOffset, #8 #endif + prfm PLDL1KEEP, [pA] + prfm PLDL1KEEP, [pA, #64] + prfm PLDL1KEEP, [origPB] dtrmm_kernel_L4_M8_END: subs counterI, counterI, #1 diff --git a/kernel/arm64/sgemm_kernel_16x4.S b/kernel/arm64/sgemm_kernel_16x4.S index 68366d9f2..6e3645b76 100644 --- a/kernel/arm64/sgemm_kernel_16x4.S +++ b/kernel/arm64/sgemm_kernel_16x4.S @@ -46,16 +46,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define pCRow0 x12 #define pCRow1 x13 #define pCRow2 x14 -#define pA x15 +#define pCRow3 x15 +#define pA x16 +#define alpha w17 #define alpha0 s10 #define alphaV0 v10.s[0] -#define alpha1 s11 -#define alphaV1 v11.s[0] -#define alpha2 s14 -#define alphaV2 v14.s[0] -#define alpha3 s15 -#define alphaV3 v15.s[0] + +#define A_PRE_SIZE 2560 +#define B_PRE_SIZE 224 +#define C_PRE_SIZE 160 + // 00 origM // 01 origN @@ -98,14 +99,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
//v05 pA1_04, pA1_05, pA1_06, pA1_07 //v06 pA1_08, pA1_09, pA1_10, pA1_11 //v07 pA1_12, pA1_13, pA1_14, pA1_15 -//v08 must save pB00, pB01 -//v09 must save pB02, pB03 -//v10 must save ALPHA0 -//v11 must save ALPHA1 -//v12 must save pB10, pB11 -//v13 must save pB12, pB13 -//v14 must save ALPHA2 -//v15 must save ALPHA3 +//v08 must save pB00 +//v09 must save pB01 +//v10 must save pB02 +//v11 must save pB03 +//v12 must save pB10 +//v13 must save pB11 +//v14 must save pB12 +//v15 must save pB13 //v16 must save C00, C01, C02, C03 //v17 must save C04, C05, C06, C07 //v18 C08, C09, C10, C11 @@ -147,206 +148,249 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL16x4_I - ld1 {v8.2s, v9.2s}, [pB] - add pB, pB, #16 - ld1 {v0.4s}, [pA] - add pA, pA, #16 - ld1 {v1.4s}, [pA] - add pA, pA, #16 - ld1 {v2.4s}, [pA] - add pA, pA, #16 - ld1 {v3.4s}, [pA] - add pA, pA, #16 + ldp q0, q1, [pA], #32 + + ldp s8, s9, [pB], #8 fmul v16.4s, v0.4s, v8.s[0] + fmul v20.4s, v0.4s, v9.s[0] + + ldp s10, s11, [pB], #8 + + fmul v24.4s, v0.4s, v10.s[0] + fmul v28.4s, v0.4s, v11.s[0] + + ldp q2, q3, [pA], #32 + fmul v17.4s, v1.4s, v8.s[0] + fmul v21.4s, v1.4s, v9.s[0] + + ldp q4, q5, [pA], #32 + + fmul v25.4s, v1.4s, v10.s[0] + fmul v29.4s, v1.4s, v11.s[0] + + ldp s12, s13, [pB], #8 + fmul v18.4s, v2.4s, v8.s[0] + fmul v22.4s, v2.4s, v9.s[0] + + ldp s14, s15, [pB], #8 + fmul v19.4s, v3.4s, v8.s[0] + fmul v23.4s, v3.4s, v9.s[0] - fmul v20.4s, v0.4s, v8.s[1] - fmul v21.4s, v1.4s, v8.s[1] - fmul v22.4s, v2.4s, v8.s[1] - fmul v23.4s, v3.4s, v8.s[1] + ldp q6, q7, [pA], #32 - fmul v24.4s, v0.4s, v9.s[0] - fmul v25.4s, v1.4s, v9.s[0] - fmul v26.4s, v2.4s, v9.s[0] - fmul v27.4s, v3.4s, v9.s[0] + fmul v26.4s, v2.4s, v10.s[0] + fmul v30.4s, v2.4s, v11.s[0] - fmul v28.4s, v0.4s, v9.s[1] - fmul v29.4s, v1.4s, v9.s[1] - fmul v30.4s, v2.4s, v9.s[1] - fmul v31.4s, v3.4s, v9.s[1] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] - ld1 {v12.2s, v13.2s}, [pB] - add pB, pB, #16 - ld1 {v4.4s}, [pA] - add pA, pA, #16 - ld1 {v5.4s}, [pA] - add pA, pA, #16 - ld1 {v6.4s}, [pA] - add pA, pA, #16 - ld1 {v7.4s}, [pA] - add pA, pA, #16 + fmul v27.4s, v3.4s, v10.s[0] + fmul v31.4s, v3.4s, v11.s[0] + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] .endm .macro KERNEL16x4_M1 fmla v16.4s, v0.4s, v8.s[0] fmla v17.4s, v1.4s, v8.s[0] + + ldp q4, q5, [pA], #32 + fmla v18.4s, v2.4s, v8.s[0] fmla v19.4s, v3.4s, v8.s[0] - fmla v20.4s, v0.4s, v8.s[1] - fmla v21.4s, v1.4s, v8.s[1] - fmla v22.4s, v2.4s, v8.s[1] - fmla v23.4s, v3.4s, v8.s[1] + fmla v20.4s, v0.4s, v9.s[0] + fmla v21.4s, v1.4s, v9.s[0] - fmla v24.4s, v0.4s, v9.s[0] - fmla v25.4s, v1.4s, v9.s[0] - fmla v26.4s, v2.4s, v9.s[0] - fmla v27.4s, v3.4s, v9.s[0] + ldp s12, s13, [pB], #8 - fmla v28.4s, v0.4s, v9.s[1] - fmla v29.4s, v1.4s, v9.s[1] - fmla v30.4s, v2.4s, v9.s[1] - fmla v31.4s, v3.4s, v9.s[1] + fmla v22.4s, v2.4s, v9.s[0] + fmla v23.4s, v3.4s, v9.s[0] - ld1 {v12.2s, v13.2s}, [pB] - add pB, pB, #16 - ld1 {v4.4s}, [pA] - add pA, pA, #16 - ld1 {v5.4s}, [pA] - add pA, pA, #16 - ld1 {v6.4s}, [pA] - add pA, pA, #16 - ld1 {v7.4s}, [pA] - add pA, pA, #16 + ldp s14, s15, [pB], #8 + + fmla v24.4s, v0.4s, v10.s[0] + fmla v25.4s, v1.4s, v10.s[0] + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] + + fmla v26.4s, v2.4s, v10.s[0] + fmla v27.4s, v3.4s, v10.s[0] + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + + fmla v28.4s, v0.4s, v11.s[0] + fmla v29.4s, v1.4s, v11.s[0] + + ldp q6, q7, [pA], #32 + + fmla v30.4s, v2.4s, v11.s[0] + fmla v31.4s, v3.4s, v11.s[0] .endm .macro KERNEL16x4_M2 fmla v16.4s, v4.4s, v12.s[0] 
fmla v17.4s, v5.4s, v12.s[0] + + ldp q0, q1, [pA], #32 + fmla v18.4s, v6.4s, v12.s[0] fmla v19.4s, v7.4s, v12.s[0] - fmla v20.4s, v4.4s, v12.s[1] - fmla v21.4s, v5.4s, v12.s[1] - fmla v22.4s, v6.4s, v12.s[1] - fmla v23.4s, v7.4s, v12.s[1] + fmla v20.4s, v4.4s, v13.s[0] + fmla v21.4s, v5.4s, v13.s[0] - fmla v24.4s, v4.4s, v13.s[0] - fmla v25.4s, v5.4s, v13.s[0] - fmla v26.4s, v6.4s, v13.s[0] - fmla v27.4s, v7.4s, v13.s[0] + ldp s8, s9, [pB], #8 - fmla v28.4s, v4.4s, v13.s[1] - fmla v29.4s, v5.4s, v13.s[1] - fmla v30.4s, v6.4s, v13.s[1] - fmla v31.4s, v7.4s, v13.s[1] + fmla v22.4s, v6.4s, v13.s[0] + fmla v23.4s, v7.4s, v13.s[0] - ld1 {v8.2s, v9.2s}, [pB] - add pB, pB, #16 - ld1 {v0.4s}, [pA] - add pA, pA, #16 - ld1 {v1.4s}, [pA] - add pA, pA, #16 - ld1 {v2.4s}, [pA] - add pA, pA, #16 - ld1 {v3.4s}, [pA] - add pA, pA, #16 + ldp s10, s11, [pB], #8 + + fmla v24.4s, v4.4s, v14.s[0] + fmla v25.4s, v5.4s, v14.s[0] + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + + fmla v26.4s, v6.4s, v14.s[0] + fmla v27.4s, v7.4s, v14.s[0] + + ldp q2, q3, [pA], #32 + + fmla v28.4s, v4.4s, v15.s[0] + fmla v29.4s, v5.4s, v15.s[0] + + fmla v30.4s, v6.4s, v15.s[0] + fmla v31.4s, v7.4s, v15.s[0] .endm .macro KERNEL16x4_E fmla v16.4s, v4.4s, v12.s[0] + fmla v20.4s, v4.4s, v13.s[0] + fmla v24.4s, v4.4s, v14.s[0] + fmla v28.4s, v4.4s, v15.s[0] + fmla v17.4s, v5.4s, v12.s[0] + fmla v21.4s, v5.4s, v13.s[0] + fmla v25.4s, v5.4s, v14.s[0] + fmla v29.4s, v5.4s, v15.s[0] + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla v18.4s, v6.4s, v12.s[0] + fmla v22.4s, v6.4s, v13.s[0] + fmla v26.4s, v6.4s, v14.s[0] + fmla v30.4s, v6.4s, v15.s[0] + fmla v19.4s, v7.4s, v12.s[0] - - fmla v20.4s, v4.4s, v12.s[1] - fmla v21.4s, v5.4s, v12.s[1] - fmla v22.4s, v6.4s, v12.s[1] - fmla v23.4s, v7.4s, v12.s[1] - - fmla v24.4s, v4.4s, v13.s[0] - fmla v25.4s, v5.4s, v13.s[0] - fmla v26.4s, v6.4s, v13.s[0] - fmla v27.4s, v7.4s, v13.s[0] - - fmla v28.4s, v4.4s, v13.s[1] - fmla v29.4s, v5.4s, v13.s[1] - fmla v30.4s, v6.4s, v13.s[1] - fmla v31.4s, v7.4s, v13.s[1] + fmla v23.4s, v7.4s, v13.s[0] + fmla v27.4s, v7.4s, v14.s[0] + fmla v31.4s, v7.4s, v15.s[0] .endm .macro KERNEL16x4_SUB - ld1 {v8.2s, v9.2s}, [pB] - add pB, pB, #16 - ld1 {v0.4s}, [pA] - add pA, pA, #16 - ld1 {v1.4s}, [pA] - add pA, pA, #16 - ld1 {v2.4s}, [pA] - add pA, pA, #16 - ld1 {v3.4s}, [pA] - add pA, pA, #16 + ldp q0, q1, [pA], #32 + ldp s8, s9, [pB], #8 fmla v16.4s, v0.4s, v8.s[0] + fmla v20.4s, v0.4s, v9.s[0] + + ldp s10, s11, [pB], #8 + + fmla v24.4s, v0.4s, v10.s[0] + fmla v28.4s, v0.4s, v11.s[0] + + ldp q2, q3, [pA], #32 + fmla v17.4s, v1.4s, v8.s[0] + fmla v21.4s, v1.4s, v9.s[0] + + fmla v25.4s, v1.4s, v10.s[0] + fmla v29.4s, v1.4s, v11.s[0] + fmla v18.4s, v2.4s, v8.s[0] + fmla v22.4s, v2.4s, v9.s[0] + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmla v19.4s, v3.4s, v8.s[0] + fmla v23.4s, v3.4s, v9.s[0] - fmla v20.4s, v0.4s, v8.s[1] - fmla v21.4s, v1.4s, v8.s[1] - fmla v22.4s, v2.4s, v8.s[1] - fmla v23.4s, v3.4s, v8.s[1] + fmla v26.4s, v2.4s, v10.s[0] + fmla v30.4s, v2.4s, v11.s[0] - fmla v24.4s, v0.4s, v9.s[0] - fmla v25.4s, v1.4s, v9.s[0] - fmla v26.4s, v2.4s, v9.s[0] - fmla v27.4s, v3.4s, v9.s[0] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] - fmla v28.4s, v0.4s, v9.s[1] - fmla v29.4s, v1.4s, v9.s[1] - fmla v30.4s, v2.4s, v9.s[1] - fmla v31.4s, v3.4s, v9.s[1] + fmla v27.4s, v3.4s, v10.s[0] + fmla v31.4s, v3.4s, v11.s[0] .endm .macro SAVE16x4 - add pCRow1, pCRow0, LDC + fmov alpha0, alpha - ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0] + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] + + ldp q0, q1, [pCRow0] fmla v0.4s, 
v16.4s, alphaV0 - fmla v1.4s, v17.4s, alphaV1 - fmla v2.4s, v18.4s, alphaV2 - fmla v3.4s, v19.4s, alphaV3 - st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0] + fmla v1.4s, v17.4s, alphaV0 + stp q0, q1, [pCRow0] - add pCRow2, pCRow1, LDC + add pCRow0, pCRow0, #32 - ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1] + ldp q2, q3, [pCRow0] + fmla v2.4s, v18.4s, alphaV0 + fmla v3.4s, v19.4s, alphaV0 + stp q2, q3, [pCRow0] + + add pCRow0, pCRow0, #32 + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] + + ldp q4, q5, [pCRow1] fmla v4.4s, v20.4s, alphaV0 - fmla v5.4s, v21.4s, alphaV1 - fmla v6.4s, v22.4s, alphaV2 - fmla v7.4s, v23.4s, alphaV3 - st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1] + fmla v5.4s, v21.4s, alphaV0 + stp q4, q5, [pCRow1] - add pCRow1, pCRow2, LDC + add pCRow1, pCRow1, #32 - ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow2] + ldp q6, q7, [pCRow1] + fmla v6.4s, v22.4s, alphaV0 + fmla v7.4s, v23.4s, alphaV0 + stp q6, q7, [pCRow1] + + add pCRow1, pCRow1, #32 + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + ldp q0, q1, [pCRow2] fmla v0.4s, v24.4s, alphaV0 - fmla v1.4s, v25.4s, alphaV1 - fmla v2.4s, v26.4s, alphaV2 - fmla v3.4s, v27.4s, alphaV3 - st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow2] + fmla v1.4s, v25.4s, alphaV0 + stp q0, q1, [pCRow2] - ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1] + add pCRow2, pCRow2, #32 + + ldp q2, q3, [pCRow2] + fmla v2.4s, v26.4s, alphaV0 + fmla v3.4s, v27.4s, alphaV0 + stp q2, q3, [pCRow2] + + add pCRow2, pCRow2, #32 + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] + + ldp q4, q5, [pCRow3] fmla v4.4s, v28.4s, alphaV0 - fmla v5.4s, v29.4s, alphaV1 - fmla v6.4s, v30.4s, alphaV2 - fmla v7.4s, v31.4s, alphaV3 - st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1] + fmla v5.4s, v29.4s, alphaV0 + stp q4, q5, [pCRow3] - add pCRow0, pCRow0, #64 + add pCRow3, pCRow3, #32 + + ldp q6, q7, [pCRow3] + fmla v6.4s, v30.4s, alphaV0 + fmla v7.4s, v31.4s, alphaV0 + stp q6, q7, [pCRow3] + + add pCRow3, pCRow3, #32 .endm /******************************************************************************/ @@ -363,264 +407,217 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL8x4_I - ld1 {v8.2s, v9.2s}, [pB] - add pB, pB, #16 - ld1 {v0.4s}, [pA] - add pA, pA, #16 - ld1 {v1.4s}, [pA] - add pA, pA, #16 + ldp s8, s9, [pB], #8 + ldp s10, s11, [pB], #8 + + ldr q0, [pA], #16 + ldr q1, [pA], #16 fmul v16.4s, v0.4s, v8.s[0] fmul v17.4s, v1.4s, v8.s[0] - fmul v20.4s, v0.4s, v8.s[1] - fmul v21.4s, v1.4s, v8.s[1] - fmul v24.4s, v0.4s, v9.s[0] - fmul v25.4s, v1.4s, v9.s[0] - fmul v28.4s, v0.4s, v9.s[1] - fmul v29.4s, v1.4s, v9.s[1] + fmul v20.4s, v0.4s, v9.s[0] + fmul v21.4s, v1.4s, v9.s[0] + fmul v24.4s, v0.4s, v10.s[0] + fmul v25.4s, v1.4s, v10.s[0] + fmul v28.4s, v0.4s, v11.s[0] + fmul v29.4s, v1.4s, v11.s[0] - ld1 {v12.2s, v13.2s}, [pB] - add pB, pB, #16 - ld1 {v4.4s}, [pA] - add pA, pA, #16 - ld1 {v5.4s}, [pA] - add pA, pA, #16 + ldp s12, s13, [pB], #8 + ldp s14, s15, [pB], #8 + + ldr q4, [pA], #16 + ldr q5, [pA], #16 .endm .macro KERNEL8x4_M1 fmla v16.4s, v0.4s, v8.s[0] fmla v17.4s, v1.4s, v8.s[0] - fmla v20.4s, v0.4s, v8.s[1] - fmla v21.4s, v1.4s, v8.s[1] - fmla v24.4s, v0.4s, v9.s[0] - fmla v25.4s, v1.4s, v9.s[0] - fmla v28.4s, v0.4s, v9.s[1] - fmla v29.4s, v1.4s, v9.s[1] + fmla v20.4s, v0.4s, v9.s[0] + fmla v21.4s, v1.4s, v9.s[0] + fmla v24.4s, v0.4s, v10.s[0] + fmla v25.4s, v1.4s, v10.s[0] + fmla v28.4s, v0.4s, v11.s[0] + fmla v29.4s, v1.4s, v11.s[0] - ld1 {v12.2s, v13.2s}, [pB] - add pB, pB, #16 - ld1 {v4.4s}, [pA] - add pA, pA, #16 - ld1 {v5.4s}, [pA] - add pA, pA, #16 + ldp s12, s13, [pB], #8 + ldp s14, s15, [pB], #8 + + ldr q4, [pA], #16 + ldr q5, [pA], #16 .endm .macro KERNEL8x4_M2 fmla v16.4s, v4.4s, v12.s[0] fmla v17.4s, v5.4s, v12.s[0] - fmla v20.4s, v4.4s, v12.s[1] - fmla v21.4s, v5.4s, v12.s[1] - fmla v24.4s, v4.4s, v13.s[0] - fmla v25.4s, v5.4s, v13.s[0] - fmla v28.4s, v4.4s, v13.s[1] - fmla v29.4s, v5.4s, v13.s[1] + fmla v20.4s, v4.4s, v13.s[0] + fmla v21.4s, v5.4s, v13.s[0] + fmla v24.4s, v4.4s, v14.s[0] + fmla v25.4s, v5.4s, v14.s[0] + fmla v28.4s, v4.4s, v15.s[0] + fmla v29.4s, v5.4s, v15.s[0] - ld1 {v8.2s, v9.2s}, [pB] - add pB, pB, #16 - ld1 {v0.4s}, [pA] - add pA, pA, #16 - ld1 {v1.4s}, [pA] - add pA, pA, #16 + ldp s8, s9, [pB], #8 + ldp s10, s11, [pB], #8 + + ldr q0, [pA], #16 + ldr q1, [pA], #16 .endm .macro KERNEL8x4_E fmla v16.4s, v4.4s, v12.s[0] fmla v17.4s, v5.4s, v12.s[0] - fmla v20.4s, v4.4s, v12.s[1] - fmla v21.4s, v5.4s, v12.s[1] - fmla v24.4s, v4.4s, v13.s[0] - fmla v25.4s, v5.4s, v13.s[0] - fmla v28.4s, v4.4s, v13.s[1] - fmla v29.4s, v5.4s, v13.s[1] + fmla v20.4s, v4.4s, v13.s[0] + fmla v21.4s, v5.4s, v13.s[0] + fmla v24.4s, v4.4s, v14.s[0] + fmla v25.4s, v5.4s, v14.s[0] + fmla v28.4s, v4.4s, v15.s[0] + fmla v29.4s, v5.4s, v15.s[0] .endm .macro KERNEL8x4_SUB - ld1 {v8.2s, v9.2s}, [pB] - add pB, pB, #16 - ld1 {v0.4s}, [pA] - add pA, pA, #16 - ld1 {v1.4s}, [pA] - add pA, pA, #16 + ldp s8, s9, [pB], #8 + ldp s10, s11, [pB], #8 + + ldr q0, [pA], #16 + ldr q1, [pA], #16 fmla v16.4s, v0.4s, v8.s[0] fmla v17.4s, v1.4s, v8.s[0] - fmla v20.4s, v0.4s, v8.s[1] - fmla v21.4s, v1.4s, v8.s[1] - fmla v24.4s, v0.4s, v9.s[0] - fmla v25.4s, v1.4s, v9.s[0] - fmla v28.4s, v0.4s, v9.s[1] - fmla v29.4s, v1.4s, v9.s[1] + fmla v20.4s, v0.4s, v9.s[0] + fmla v21.4s, v1.4s, v9.s[0] + fmla v24.4s, v0.4s, v10.s[0] + fmla v25.4s, v1.4s, v10.s[0] + fmla v28.4s, v0.4s, v11.s[0] + fmla v29.4s, v1.4s, v11.s[0] .endm .macro SAVE8x4 - add pCRow1, pCRow0, LDC + fmov alpha0, alpha - ld1 {v0.4s, v1.4s}, [pCRow0] + ldp q0, q1, [pCRow0] fmla v0.4s, v16.4s, alphaV0 - fmla v1.4s, v17.4s, alphaV1 - st1 {v0.4s, v1.4s}, [pCRow0] - - add pCRow2, pCRow1, LDC - - 
ld1 {v4.4s, v5.4s}, [pCRow1] - fmla v4.4s, v20.4s, alphaV0 - fmla v5.4s, v21.4s, alphaV1 - st1 {v4.4s, v5.4s}, [pCRow1] - - add pCRow1, pCRow2, LDC - - ld1 {v0.4s, v1.4s}, [pCRow2] - fmla v0.4s, v24.4s, alphaV0 - fmla v1.4s, v25.4s, alphaV1 - st1 {v0.4s, v1.4s}, [pCRow2] - - ld1 {v4.4s, v5.4s}, [pCRow1] - fmla v4.4s, v28.4s, alphaV0 - fmla v5.4s, v29.4s, alphaV1 - st1 {v4.4s, v5.4s}, [pCRow1] + fmla v1.4s, v17.4s, alphaV0 + stp q0, q1, [pCRow0] add pCRow0, pCRow0, #32 + + ldp q2, q3, [pCRow1] + fmla v2.4s, v20.4s, alphaV0 + fmla v3.4s, v21.4s, alphaV0 + stp q2, q3, [pCRow1] + + add pCRow1, pCRow1, #32 + + ldp q4, q5, [pCRow2] + fmla v4.4s, v24.4s, alphaV0 + fmla v5.4s, v25.4s, alphaV0 + stp q4, q5, [pCRow2] + + add pCRow2, pCRow2, #32 + + ldp q6, q7, [pCRow3] + fmla v6.4s, v28.4s, alphaV0 + fmla v7.4s, v29.4s, alphaV0 + stp q6, q7, [pCRow3] + + add pCRow3, pCRow3, #32 .endm /******************************************************************************/ .macro INIT4x4 fmov s16, wzr - fmov s17, s16 - fmov s20, s17 - fmov s21, s16 - fmov s24, s17 - fmov s25, s16 - fmov s28, s17 - fmov s29, s16 + fmov s20, wzr + fmov s24, wzr + fmov s28, wzr .endm .macro KERNEL4x4_I - ld1 {v8.2s, v9.2s}, [pB] - add pB, pB, #16 - ld1 {v0.2s, v1.2s}, [pA] - add pA, pA, #16 + ldp s8, s9, [pB], #8 + ldp s10, s11, [pB], #8 - fmul v16.2s, v0.2s, v8.s[0] - fmul v29.2s, v1.2s, v9.s[1] + ldr q0, [pA], #16 - fmul v20.2s, v0.2s, v8.s[1] - fmul v25.2s, v1.2s, v9.s[0] + fmul v16.4s, v0.4s, v8.s[0] + fmul v20.4s, v0.4s, v9.s[0] + fmul v24.4s, v0.4s, v10.s[0] + fmul v28.4s, v0.4s, v11.s[0] - fmul v24.2s, v0.2s, v9.s[0] - fmul v21.2s, v1.2s, v8.s[1] + ldp s12, s13, [pB], #8 + ldp s14, s15, [pB], #8 - fmul v28.2s, v0.2s, v9.s[1] - fmul v17.2s, v1.2s, v8.s[0] - - ld1 {v12.2s, v13.2s}, [pB] - add pB, pB, #16 - ld1 {v4.2s, v5.2s}, [pA] - add pA, pA, #16 + ldr q1, [pA], #16 .endm .macro KERNEL4x4_M1 - fmla v16.2s, v0.2s, v8.s[0] - fmla v29.2s, v1.2s, v9.s[1] + fmla v16.4s, v0.4s, v8.s[0] + fmla v20.4s, v0.4s, v9.s[0] + fmla v24.4s, v0.4s, v10.s[0] + fmla v28.4s, v0.4s, v11.s[0] - ld1 {v12.2s, v13.2s}, [pB] // For next round - add pB, pB, #16 + ldp s12, s13, [pB], #8 + ldp s14, s15, [pB], #8 - fmla v20.2s, v0.2s, v8.s[1] - fmla v25.2s, v1.2s, v9.s[0] - - ld1 {v4.2s, v5.2s}, [pA] // For next round - add pA, pA, #16 - - fmla v24.2s, v0.2s, v9.s[0] - fmla v21.2s, v1.2s, v8.s[1] - - prfm PLDL1KEEP, [pB, #512] - - fmla v28.2s, v0.2s, v9.s[1] - fmla v17.2s, v1.2s, v8.s[0] + ldr q1, [pA], #16 .endm .macro KERNEL4x4_M2 - fmla v16.2s, v4.2s, v12.s[0] - fmla v29.2s, v5.2s, v13.s[1] + fmla v16.4s, v1.4s, v12.s[0] + fmla v20.4s, v1.4s, v13.s[0] + fmla v24.4s, v1.4s, v14.s[0] + fmla v28.4s, v1.4s, v15.s[0] - ld1 {v8.2s, v9.2s}, [pB] // For next round - add pB, pB, #16 + ldp s8, s9, [pB], #8 + ldp s10, s11, [pB], #8 - fmla v20.2s, v4.2s, v12.s[1] - fmla v25.2s, v5.2s, v13.s[0] - - ld1 {v0.2s, v1.2s}, [pA] // For next round - add pA, pA, #16 - - fmla v24.2s, v4.2s, v13.s[0] - fmla v21.2s, v5.2s, v12.s[1] - - prfm PLDL1KEEP, [pA, #512] - - fmla v28.2s, v4.2s, v13.s[1] - fmla v17.2s, v5.2s, v12.s[0] + ldr q0, [pA], #16 .endm .macro KERNEL4x4_E - fmla v16.2s, v4.2s, v12.s[0] - fmla v29.2s, v5.2s, v13.s[1] - - fmla v20.2s, v4.2s, v12.s[1] - fmla v25.2s, v5.2s, v13.s[0] - - fmla v24.2s, v4.2s, v13.s[0] - fmla v21.2s, v5.2s, v12.s[1] - - fmla v28.2s, v4.2s, v13.s[1] - fmla v17.2s, v5.2s, v12.s[0] + fmla v16.4s, v1.4s, v12.s[0] + fmla v20.4s, v1.4s, v13.s[0] + fmla v24.4s, v1.4s, v14.s[0] + fmla v28.4s, v1.4s, v15.s[0] .endm .macro KERNEL4x4_SUB - ld1 
{v8.2s, v9.2s}, [pB] - add pB, pB, #16 - ld1 {v0.2s, v1.2s}, [pA] - add pA, pA, #16 + ldp s8, s9, [pB], #8 + ldp s10, s11, [pB], #8 - fmla v16.2s, v0.2s, v8.s[0] - fmla v29.2s, v1.2s, v9.s[1] + ldr q0, [pA], #16 - fmla v20.2s, v0.2s, v8.s[1] - fmla v25.2s, v1.2s, v9.s[0] - - fmla v24.2s, v0.2s, v9.s[0] - fmla v21.2s, v1.2s, v8.s[1] - - fmla v28.2s, v0.2s, v9.s[1] - fmla v17.2s, v1.2s, v8.s[0] + fmla v16.4s, v0.4s, v8.s[0] + fmla v20.4s, v0.4s, v9.s[0] + fmla v24.4s, v0.4s, v10.s[0] + fmla v28.4s, v0.4s, v11.s[0] .endm .macro SAVE4x4 - ld1 {v8.2s, v9.2s}, [pCRow0] - fmla v8.2s, v16.2s, alphaV0 - fmla v9.2s, v17.2s, alphaV1 - st1 {v8.2s, v9.2s}, [pCRow0] + fmov alpha0, alpha - add pCRow1, pCRow0, LDC - ld1 {v12.2s, v13.2s}, [pCRow1] - fmla v12.2s, v20.2s, alphaV2 - fmla v13.2s, v21.2s, alphaV3 - st1 {v12.2s, v13.2s}, [pCRow1] - - add pCRow2, pCRow1, LDC - ld1 {v8.2s, v9.2s}, [pCRow2] - fmla v8.2s, v24.2s, alphaV0 - fmla v9.2s, v25.2s, alphaV1 - st1 {v8.2s, v9.2s}, [pCRow2] - - add pCRow1, pCRow2, LDC - ld1 {v12.2s, v13.2s}, [pCRow1] - fmla v12.2s, v28.2s, alphaV2 - fmla v13.2s, v29.2s, alphaV3 - st1 {v12.2s, v13.2s}, [pCRow1] + ldr q0, [pCRow0] + fmla v0.4s, v16.4s, alphaV0 + str q0, [pCRow0] add pCRow0, pCRow0, #16 + + ldr q1, [pCRow1] + fmla v1.4s, v20.4s, alphaV0 + str q1, [pCRow1] + + add pCRow1, pCRow1, #16 + + ldr q2, [pCRow2] + fmla v2.4s, v24.4s, alphaV0 + str q2, [pCRow2] + + add pCRow2, pCRow2, #16 + + ldr q3, [pCRow3] + fmla v3.4s, v28.4s, alphaV0 + str q3, [pCRow3] + + add pCRow3, pCRow3, #16 .endm /******************************************************************************/ @@ -633,38 +630,43 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL2x4_SUB - ld1 {v8.2s, v9.2s}, [pB] - add pB, pB, #16 - ld1 {v0.2s}, [pA] - add pA, pA, #8 + ldp s8, s9, [pB], #8 + ldp s10, s11, [pB], #8 + + ldr d0, [pA], #8 fmla v16.2s, v0.2s, v8.s[0] - fmla v20.2s, v0.2s, v8.s[1] - fmla v24.2s, v0.2s, v9.s[0] - fmla v28.2s, v0.2s, v9.s[1] + fmla v20.2s, v0.2s, v9.s[0] + fmla v24.2s, v0.2s, v10.s[0] + fmla v28.2s, v0.2s, v11.s[0] .endm .macro SAVE2x4 - ld1 {v8.2s}, [pCRow0] - fmla v8.2s, v16.2s, alphaV0 - st1 {v8.2s}, [pCRow0] + fmov alpha0, alpha - add pCRow1, pCRow0, LDC - ld1 {v12.2s}, [pCRow1] - fmla v12.2s, v20.2s, alphaV1 - st1 {v12.2s}, [pCRow1] - - add pCRow2, pCRow1, LDC - ld1 {v8.2s}, [pCRow2] - fmla v8.2s, v24.2s, alphaV2 - st1 {v8.2s}, [pCRow2] - - add pCRow1, pCRow2, LDC - ld1 {v12.2s}, [pCRow1] - fmla v12.2s, v28.2s, alphaV3 - st1 {v12.2s}, [pCRow1] + ldr d0, [pCRow0] + fmla v0.2s, v16.2s, alphaV0 + str d0, [pCRow0] add pCRow0, pCRow0, #8 + + ldr d1, [pCRow1] + fmla v1.2s, v20.2s, alphaV0 + str d1, [pCRow1] + + add pCRow1, pCRow1, #8 + + ldr d0, [pCRow2] + fmla v0.2s, v24.2s, alphaV0 + str d0, [pCRow2] + + add pCRow2, pCRow2, #8 + + ldr d1, [pCRow3] + fmla v1.2s, v28.2s, alphaV0 + str d1, [pCRow3] + + add pCRow3, pCRow3, #8 .endm /******************************************************************************/ @@ -686,22 +688,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro SAVE1x4 - add pCRow1, pCRow0, LDC + fmov alpha0, alpha + ld1 {v8.s}[0], [pCRow0] ld1 {v8.s}[1], [pCRow1] fmla v8.2s, v16.2s, alphaV0 st1 {v8.s}[0], [pCRow0] st1 {v8.s}[1], [pCRow1] - add pCRow2, pCRow1, LDC - add pCRow1, pCRow2, LDC - ld1 {v12.s}[0], [pCRow2] - ld1 {v12.s}[1], [pCRow1] - fmla v12.2s, v20.2s, alphaV1 - st1 {v12.s}[0], [pCRow2] - st1 {v12.s}[1], [pCRow1] - add pCRow0, pCRow0, #4 + add pCRow1, pCRow1, #4 + + ld1 {v12.s}[0], [pCRow2] + ld1 {v12.s}[1], [pCRow3] + fmla v12.2s, v20.2s, alphaV0 + st1 {v12.s}[0], [pCRow2] + st1 {v12.s}[1], [pCRow3] + + add pCRow2, pCRow2, #4 + add pCRow3, pCRow3, #4 .endm /******************************************************************************/ @@ -741,20 +746,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE16x2 + fmov alpha0, alpha + add pCRow1, pCRow0, LDC ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0] fmla v0.4s, v16.4s, alphaV0 - fmla v1.4s, v17.4s, alphaV1 - fmla v2.4s, v18.4s, alphaV2 - fmla v3.4s, v19.4s, alphaV3 + fmla v1.4s, v17.4s, alphaV0 + fmla v2.4s, v18.4s, alphaV0 + fmla v3.4s, v19.4s, alphaV0 st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0] ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1] fmla v4.4s, v20.4s, alphaV0 - fmla v5.4s, v21.4s, alphaV1 - fmla v6.4s, v22.4s, alphaV2 - fmla v7.4s, v23.4s, alphaV3 + fmla v5.4s, v21.4s, alphaV0 + fmla v6.4s, v22.4s, alphaV0 + fmla v7.4s, v23.4s, alphaV0 st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1] add pCRow0, pCRow0, #64 @@ -785,18 +792,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE8x2 + fmov alpha0, alpha + add pCRow1, pCRow0, LDC ld1 {v0.4s, v1.4s}, [pCRow0] fmla v0.4s, v16.4s, alphaV0 - fmla v1.4s, v17.4s, alphaV1 + fmla v1.4s, v17.4s, alphaV0 st1 {v0.4s, v1.4s}, [pCRow0] add pCRow2, pCRow1, LDC ld1 {v4.4s, v5.4s}, [pCRow1] fmla v4.4s, v20.4s, alphaV0 - fmla v5.4s, v21.4s, alphaV1 + fmla v5.4s, v21.4s, alphaV0 st1 {v4.4s, v5.4s}, [pCRow1] add pCRow0, pCRow0, #32 @@ -824,15 +833,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE4x2 + fmov alpha0, alpha + ld1 {v8.2s, v9.2s}, [pCRow0] fmla v8.2s, v16.2s, alphaV0 - fmla v9.2s, v17.2s, alphaV1 + fmla v9.2s, v17.2s, alphaV0 st1 {v8.2s, v9.2s}, [pCRow0] add pCRow1, pCRow0, LDC ld1 {v12.2s, v13.2s}, [pCRow1] - fmla v12.2s, v20.2s, alphaV2 - fmla v13.2s, v21.2s, alphaV3 + fmla v12.2s, v20.2s, alphaV0 + fmla v13.2s, v21.2s, alphaV0 st1 {v12.2s, v13.2s}, [pCRow1] add pCRow0, pCRow0, #16 @@ -857,13 +868,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE2x2 + fmov alpha0, alpha + ld1 {v8.2s}, [pCRow0] fmla v8.2s, v16.2s, alphaV0 st1 {v8.2s}, [pCRow0] add pCRow1 , pCRow0, LDC ld1 {v12.2s}, [pCRow1] - fmla v12.2s, v20.2s, alphaV1 + fmla v12.2s, v20.2s, alphaV0 st1 {v12.2s}, [pCRow1] add pCRow0, pCRow0, #8 @@ -886,6 +899,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE1x2 + fmov alpha0, alpha + add pCRow1 , pCRow0, LDC ld1 {v8.s}[0], [pCRow0] ld1 {v8.s}[1], [pCRow1] @@ -925,11 +940,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro SAVE16x1 + fmov alpha0, alpha + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0] fmla v0.4s, v16.4s, alphaV0 - fmla v1.4s, v17.4s, alphaV1 - fmla v2.4s, v18.4s, alphaV2 - fmla v3.4s, v19.4s, alphaV3 + fmla v1.4s, v17.4s, alphaV0 + fmla v2.4s, v18.4s, alphaV0 + fmla v3.4s, v19.4s, alphaV0 st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0] add pCRow0, pCRow0, #64 @@ -956,9 +973,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE8x1 + fmov alpha0, alpha + ld1 {v0.4s, v1.4s}, [pCRow0] fmla v0.4s, v16.4s, alphaV0 - fmla v1.4s, v17.4s, alphaV1 + fmla v1.4s, v17.4s, alphaV0 st1 {v0.4s, v1.4s}, [pCRow0] add pCRow0, pCRow0, #32 @@ -983,9 +1002,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE4x1 + fmov alpha0, alpha + ld1 {v8.2s, v9.2s}, [pCRow0] fmla v8.2s, v16.2s, alphaV0 - fmla v9.2s, v17.2s, alphaV1 + fmla v9.2s, v17.2s, alphaV0 st1 {v8.2s, v9.2s}, [pCRow0] add pCRow0, pCRow0, #16 @@ -1008,6 +1029,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE2x1 + fmov alpha0, alpha + ld1 {v8.2s}, [pCRow0] fmla v8.2s, v16.2s, alphaV0 st1 {v8.2s}, [pCRow0] @@ -1032,6 +1055,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE1x1 + fmov alpha0, alpha + ldr s8, [pCRow0] fmla s8, s16, alphaV0 str s8, [pCRow0] @@ -1061,10 +1086,10 @@ sgemm_kernel_begin: stp x26, x27, [sp, #(9 * 16)] str x28, [sp, #(10 * 16)] - fmov alpha0, s0 - fmov alpha1, s0 - fmov alpha2, s0 - fmov alpha3, s0 + prfm PLDL1KEEP, [origPB] + prfm PLDL1KEEP, [origPA] + + fmov alpha, s0 lsl LDC, LDC, #2 // ldc = ldc * 4 @@ -1078,8 +1103,12 @@ sgemm_kernel_begin: /******************************************************************************/ sgemm_kernel_L4_BEGIN: - mov pCRow0, pC // pCRow0 = C - add pC, pC, LDC, lsl #2 + mov pCRow0, pC + add pCRow1, pCRow0, LDC + add pCRow2, pCRow1, LDC + add pCRow3, pCRow2, LDC + + add pC, pCRow3, LDC mov pA, origPA // pA = start of A array @@ -1090,42 +1119,69 @@ sgemm_kernel_L4_M16_BEGIN: cmp counterI, #0 ble sgemm_kernel_L4_M8_BEGIN + .align 5 sgemm_kernel_L4_M16_20: mov pB, origPB - asr counterL , origK, #1 // L = K / 2 - cmp counterL , #2 // is there at least 4 to do? 
+ asr counterL , origK, #3 + cmp counterL , #2 blt sgemm_kernel_L4_M16_32 - KERNEL16x4_I // do one in the K - KERNEL16x4_M2 // do another in the K + KERNEL16x4_I + KERNEL16x4_M2 + KERNEL16x4_M1 + KERNEL16x4_M2 + KERNEL16x4_M1 + KERNEL16x4_M2 + KERNEL16x4_M1 + KERNEL16x4_M2 subs counterL, counterL, #2 ble sgemm_kernel_L4_M16_22a - .align 5 + .align 5 sgemm_kernel_L4_M16_22: KERNEL16x4_M1 KERNEL16x4_M2 + KERNEL16x4_M1 + KERNEL16x4_M2 + KERNEL16x4_M1 + KERNEL16x4_M2 + KERNEL16x4_M1 + KERNEL16x4_M2 subs counterL, counterL, #1 bgt sgemm_kernel_L4_M16_22 + .align 5 sgemm_kernel_L4_M16_22a: + KERNEL16x4_M1 + KERNEL16x4_M2 + KERNEL16x4_M1 + KERNEL16x4_M2 + KERNEL16x4_M1 + KERNEL16x4_M2 KERNEL16x4_M1 KERNEL16x4_E b sgemm_kernel_L4_M16_44 + .align 5 sgemm_kernel_L4_M16_32: tst counterL, #1 ble sgemm_kernel_L4_M16_40 KERNEL16x4_I + KERNEL16x4_M2 + KERNEL16x4_M1 + KERNEL16x4_M2 + KERNEL16x4_M1 + KERNEL16x4_M2 + KERNEL16x4_M1 KERNEL16x4_E b sgemm_kernel_L4_M16_44 @@ -1136,14 +1192,20 @@ sgemm_kernel_L4_M16_40: sgemm_kernel_L4_M16_44: - ands counterL , origK, #1 + ands counterL , origK, #7 ble sgemm_kernel_L4_M16_100 + .align 5 sgemm_kernel_L4_M16_46: KERNEL16x4_SUB + subs counterL, counterL, #1 + bne sgemm_kernel_L4_M16_46 sgemm_kernel_L4_M16_100: + prfm PLDL1KEEP, [pA] + prfm PLDL1KEEP, [pA, #64] + prfm PLDL1KEEP, [origPB] SAVE16x4 diff --git a/kernel/arm64/strmm_kernel_16x4.S b/kernel/arm64/strmm_kernel_16x4.S index 28b321651..77e05103d 100644 --- a/kernel/arm64/strmm_kernel_16x4.S +++ b/kernel/arm64/strmm_kernel_16x4.S @@ -46,19 +46,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define pCRow0 x12 #define pCRow1 x13 #define pCRow2 x14 -#define pA x15 -#define temp x16 -#define tempOffset x17 -#define tempK x18 +#define pCRow3 x15 +#define pA x16 +#define alpha w17 +#define temp x18 +#define tempOffset x19 +#define tempK x20 #define alpha0 s10 #define alphaV0 v10.s[0] -#define alpha1 s11 -#define alphaV1 v11.s[0] -#define alpha2 s14 -#define alphaV2 v14.s[0] -#define alpha3 s15 -#define alphaV3 v15.s[0] + +#define A_PRE_SIZE 2560 +#define B_PRE_SIZE 224 +#define C_PRE_SIZE 160 + // 00 origM // 01 origN @@ -101,14 +102,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. //v05 pA1_04, pA1_05, pA1_06, pA1_07 //v06 pA1_08, pA1_09, pA1_10, pA1_11 //v07 pA1_12, pA1_13, pA1_14, pA1_15 -//v08 must save pB00, pB01 -//v09 must save pB02, pB03 -//v10 must save ALPHA0 -//v11 must save ALPHA1 -//v12 must save pB10, pB11 -//v13 must save pB12, pB13 -//v14 must save ALPHA2 -//v15 must save ALPHA3 +//v08 must save pB00 +//v09 must save pB01 +//v10 must save pB02 +//v11 must save pB03 +//v12 must save pB10 +//v13 must save pB11 +//v14 must save pB12 +//v15 must save pB13 //v16 must save C00, C01, C02, C03 //v17 must save C04, C05, C06, C07 //v18 C08, C09, C10, C11 @@ -150,202 +151,240 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL16x4_I - ld1 {v8.2s, v9.2s}, [pB] - add pB, pB, #16 - ld1 {v0.4s}, [pA] - add pA, pA, #16 - ld1 {v1.4s}, [pA] - add pA, pA, #16 - ld1 {v2.4s}, [pA] - add pA, pA, #16 - ld1 {v3.4s}, [pA] - add pA, pA, #16 + ldp q0, q1, [pA], #32 + + ldp s8, s9, [pB], #8 fmul v16.4s, v0.4s, v8.s[0] + fmul v20.4s, v0.4s, v9.s[0] + + ldp s10, s11, [pB], #8 + + fmul v24.4s, v0.4s, v10.s[0] + fmul v28.4s, v0.4s, v11.s[0] + + ldp q2, q3, [pA], #32 + fmul v17.4s, v1.4s, v8.s[0] + fmul v21.4s, v1.4s, v9.s[0] + + ldp q4, q5, [pA], #32 + + fmul v25.4s, v1.4s, v10.s[0] + fmul v29.4s, v1.4s, v11.s[0] + + ldp s12, s13, [pB], #8 + fmul v18.4s, v2.4s, v8.s[0] + fmul v22.4s, v2.4s, v9.s[0] + + ldp s14, s15, [pB], #8 + fmul v19.4s, v3.4s, v8.s[0] + fmul v23.4s, v3.4s, v9.s[0] - fmul v20.4s, v0.4s, v8.s[1] - fmul v21.4s, v1.4s, v8.s[1] - fmul v22.4s, v2.4s, v8.s[1] - fmul v23.4s, v3.4s, v8.s[1] + ldp q6, q7, [pA], #32 - fmul v24.4s, v0.4s, v9.s[0] - fmul v25.4s, v1.4s, v9.s[0] - fmul v26.4s, v2.4s, v9.s[0] - fmul v27.4s, v3.4s, v9.s[0] + fmul v26.4s, v2.4s, v10.s[0] + fmul v30.4s, v2.4s, v11.s[0] - fmul v28.4s, v0.4s, v9.s[1] - fmul v29.4s, v1.4s, v9.s[1] - fmul v30.4s, v2.4s, v9.s[1] - fmul v31.4s, v3.4s, v9.s[1] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] - ld1 {v12.2s, v13.2s}, [pB] - add pB, pB, #16 - ld1 {v4.4s}, [pA] - add pA, pA, #16 - ld1 {v5.4s}, [pA] - add pA, pA, #16 - ld1 {v6.4s}, [pA] - add pA, pA, #16 - ld1 {v7.4s}, [pA] - add pA, pA, #16 + fmul v27.4s, v3.4s, v10.s[0] + fmul v31.4s, v3.4s, v11.s[0] + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] .endm .macro KERNEL16x4_M1 fmla v16.4s, v0.4s, v8.s[0] fmla v17.4s, v1.4s, v8.s[0] + + ldp q4, q5, [pA], #32 + fmla v18.4s, v2.4s, v8.s[0] fmla v19.4s, v3.4s, v8.s[0] - fmla v20.4s, v0.4s, v8.s[1] - fmla v21.4s, v1.4s, v8.s[1] - fmla v22.4s, v2.4s, v8.s[1] - fmla v23.4s, v3.4s, v8.s[1] + fmla v20.4s, v0.4s, v9.s[0] + fmla v21.4s, v1.4s, v9.s[0] - fmla v24.4s, v0.4s, v9.s[0] - fmla v25.4s, v1.4s, v9.s[0] - fmla v26.4s, v2.4s, v9.s[0] - fmla v27.4s, v3.4s, v9.s[0] + ldp s12, s13, [pB], #8 - fmla v28.4s, v0.4s, v9.s[1] - fmla v29.4s, v1.4s, v9.s[1] - fmla v30.4s, v2.4s, v9.s[1] - fmla v31.4s, v3.4s, v9.s[1] + fmla v22.4s, v2.4s, v9.s[0] + fmla v23.4s, v3.4s, v9.s[0] - ld1 {v12.2s, v13.2s}, [pB] - add pB, pB, #16 - ld1 {v4.4s}, [pA] - add pA, pA, #16 - ld1 {v5.4s}, [pA] - add pA, pA, #16 - ld1 {v6.4s}, [pA] - add pA, pA, #16 - ld1 {v7.4s}, [pA] - add pA, pA, #16 + ldp s14, s15, [pB], #8 + + fmla v24.4s, v0.4s, v10.s[0] + fmla v25.4s, v1.4s, v10.s[0] + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] + + fmla v26.4s, v2.4s, v10.s[0] + fmla v27.4s, v3.4s, v10.s[0] + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + + fmla v28.4s, v0.4s, v11.s[0] + fmla v29.4s, v1.4s, v11.s[0] + + ldp q6, q7, [pA], #32 + + fmla v30.4s, v2.4s, v11.s[0] + fmla v31.4s, v3.4s, v11.s[0] .endm .macro KERNEL16x4_M2 fmla v16.4s, v4.4s, v12.s[0] fmla v17.4s, v5.4s, v12.s[0] + + ldp q0, q1, [pA], #32 + fmla v18.4s, v6.4s, v12.s[0] fmla v19.4s, v7.4s, v12.s[0] - fmla v20.4s, v4.4s, v12.s[1] - fmla v21.4s, v5.4s, v12.s[1] - fmla v22.4s, v6.4s, v12.s[1] - fmla v23.4s, v7.4s, v12.s[1] + fmla v20.4s, v4.4s, v13.s[0] + fmla v21.4s, v5.4s, v13.s[0] - fmla v24.4s, v4.4s, v13.s[0] - fmla v25.4s, v5.4s, v13.s[0] - fmla v26.4s, v6.4s, v13.s[0] - fmla v27.4s, v7.4s, v13.s[0] + ldp s8, s9, [pB], #8 - fmla v28.4s, v4.4s, v13.s[1] - fmla v29.4s, v5.4s, v13.s[1] - fmla v30.4s, v6.4s, v13.s[1] - fmla v31.4s, v7.4s, v13.s[1] + fmla v22.4s, v6.4s, v13.s[0] + fmla v23.4s, v7.4s, v13.s[0] - ld1 {v8.2s, v9.2s}, [pB] - add pB, pB, #16 - ld1 
{v0.4s}, [pA] - add pA, pA, #16 - ld1 {v1.4s}, [pA] - add pA, pA, #16 - ld1 {v2.4s}, [pA] - add pA, pA, #16 - ld1 {v3.4s}, [pA] - add pA, pA, #16 + ldp s10, s11, [pB], #8 + + fmla v24.4s, v4.4s, v14.s[0] + fmla v25.4s, v5.4s, v14.s[0] + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + + fmla v26.4s, v6.4s, v14.s[0] + fmla v27.4s, v7.4s, v14.s[0] + + ldp q2, q3, [pA], #32 + + fmla v28.4s, v4.4s, v15.s[0] + fmla v29.4s, v5.4s, v15.s[0] + + fmla v30.4s, v6.4s, v15.s[0] + fmla v31.4s, v7.4s, v15.s[0] .endm .macro KERNEL16x4_E fmla v16.4s, v4.4s, v12.s[0] + fmla v20.4s, v4.4s, v13.s[0] + fmla v24.4s, v4.4s, v14.s[0] + fmla v28.4s, v4.4s, v15.s[0] + fmla v17.4s, v5.4s, v12.s[0] + fmla v21.4s, v5.4s, v13.s[0] + fmla v25.4s, v5.4s, v14.s[0] + fmla v29.4s, v5.4s, v15.s[0] + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + fmla v18.4s, v6.4s, v12.s[0] + fmla v22.4s, v6.4s, v13.s[0] + fmla v26.4s, v6.4s, v14.s[0] + fmla v30.4s, v6.4s, v15.s[0] + fmla v19.4s, v7.4s, v12.s[0] - - fmla v20.4s, v4.4s, v12.s[1] - fmla v21.4s, v5.4s, v12.s[1] - fmla v22.4s, v6.4s, v12.s[1] - fmla v23.4s, v7.4s, v12.s[1] - - fmla v24.4s, v4.4s, v13.s[0] - fmla v25.4s, v5.4s, v13.s[0] - fmla v26.4s, v6.4s, v13.s[0] - fmla v27.4s, v7.4s, v13.s[0] - - fmla v28.4s, v4.4s, v13.s[1] - fmla v29.4s, v5.4s, v13.s[1] - fmla v30.4s, v6.4s, v13.s[1] - fmla v31.4s, v7.4s, v13.s[1] + fmla v23.4s, v7.4s, v13.s[0] + fmla v27.4s, v7.4s, v14.s[0] + fmla v31.4s, v7.4s, v15.s[0] .endm .macro KERNEL16x4_SUB - ld1 {v8.2s, v9.2s}, [pB] - add pB, pB, #16 - ld1 {v0.4s}, [pA] - add pA, pA, #16 - ld1 {v1.4s}, [pA] - add pA, pA, #16 - ld1 {v2.4s}, [pA] - add pA, pA, #16 - ld1 {v3.4s}, [pA] - add pA, pA, #16 + ldp q0, q1, [pA], #32 + ldp s8, s9, [pB], #8 fmla v16.4s, v0.4s, v8.s[0] + fmla v20.4s, v0.4s, v9.s[0] + + ldp s10, s11, [pB], #8 + + fmla v24.4s, v0.4s, v10.s[0] + fmla v28.4s, v0.4s, v11.s[0] + + ldp q2, q3, [pA], #32 + fmla v17.4s, v1.4s, v8.s[0] + fmla v21.4s, v1.4s, v9.s[0] + + fmla v25.4s, v1.4s, v10.s[0] + fmla v29.4s, v1.4s, v11.s[0] + fmla v18.4s, v2.4s, v8.s[0] + fmla v22.4s, v2.4s, v9.s[0] + + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmla v19.4s, v3.4s, v8.s[0] + fmla v23.4s, v3.4s, v9.s[0] - fmla v20.4s, v0.4s, v8.s[1] - fmla v21.4s, v1.4s, v8.s[1] - fmla v22.4s, v2.4s, v8.s[1] - fmla v23.4s, v3.4s, v8.s[1] + fmla v26.4s, v2.4s, v10.s[0] + fmla v30.4s, v2.4s, v11.s[0] - fmla v24.4s, v0.4s, v9.s[0] - fmla v25.4s, v1.4s, v9.s[0] - fmla v26.4s, v2.4s, v9.s[0] - fmla v27.4s, v3.4s, v9.s[0] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] - fmla v28.4s, v0.4s, v9.s[1] - fmla v29.4s, v1.4s, v9.s[1] - fmla v30.4s, v2.4s, v9.s[1] - fmla v31.4s, v3.4s, v9.s[1] + fmla v27.4s, v3.4s, v10.s[0] + fmla v31.4s, v3.4s, v11.s[0] .endm .macro SAVE16x4 - add pCRow1, pCRow0, LDC + fmov alpha0, alpha + + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] fmul v0.4s, v16.4s, alphaV0 - fmul v1.4s, v17.4s, alphaV1 - fmul v2.4s, v18.4s, alphaV2 - fmul v3.4s, v19.4s, alphaV3 - st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0] + fmul v1.4s, v17.4s, alphaV0 + stp q0, q1, [pCRow0] - add pCRow2, pCRow1, LDC + add pCRow0, pCRow0, #32 + + fmul v2.4s, v18.4s, alphaV0 + fmul v3.4s, v19.4s, alphaV0 + stp q2, q3, [pCRow0] + + add pCRow0, pCRow0, #32 + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] fmul v4.4s, v20.4s, alphaV0 - fmul v5.4s, v21.4s, alphaV1 - fmul v6.4s, v22.4s, alphaV2 - fmul v7.4s, v23.4s, alphaV3 - st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1] + fmul v5.4s, v21.4s, alphaV0 + stp q4, q5, [pCRow1] - add pCRow1, pCRow2, LDC + add pCRow1, pCRow1, #32 + + fmul v6.4s, v22.4s, alphaV0 + fmul v7.4s, v23.4s, alphaV0 + stp q6, q7, 
[pCRow1] + + add pCRow1, pCRow1, #32 + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] fmul v0.4s, v24.4s, alphaV0 - fmul v1.4s, v25.4s, alphaV1 - fmul v2.4s, v26.4s, alphaV2 - fmul v3.4s, v27.4s, alphaV3 - st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow2] + fmul v1.4s, v25.4s, alphaV0 + stp q0, q1, [pCRow2] + + add pCRow2, pCRow2, #32 + + fmul v2.4s, v26.4s, alphaV0 + fmul v3.4s, v27.4s, alphaV0 + stp q2, q3, [pCRow2] + + add pCRow2, pCRow2, #32 + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] fmul v4.4s, v28.4s, alphaV0 - fmul v5.4s, v29.4s, alphaV1 - fmul v6.4s, v30.4s, alphaV2 - fmul v7.4s, v31.4s, alphaV3 - st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1] + fmul v5.4s, v29.4s, alphaV0 + stp q4, q5, [pCRow3] - add pCRow0, pCRow0, #64 + add pCRow3, pCRow3, #32 + fmul v6.4s, v30.4s, alphaV0 + fmul v7.4s, v31.4s, alphaV0 + stp q6, q7, [pCRow3] + + add pCRow3, pCRow3, #32 .endm /******************************************************************************/ @@ -362,260 +401,209 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro KERNEL8x4_I - ld1 {v8.2s, v9.2s}, [pB] - add pB, pB, #16 - ld1 {v0.4s}, [pA] - add pA, pA, #16 - ld1 {v1.4s}, [pA] - add pA, pA, #16 + ldp s8, s9, [pB], #8 + ldp s10, s11, [pB], #8 + + ldr q0, [pA], #16 + ldr q1, [pA], #16 fmul v16.4s, v0.4s, v8.s[0] fmul v17.4s, v1.4s, v8.s[0] - fmul v20.4s, v0.4s, v8.s[1] - fmul v21.4s, v1.4s, v8.s[1] - fmul v24.4s, v0.4s, v9.s[0] - fmul v25.4s, v1.4s, v9.s[0] - fmul v28.4s, v0.4s, v9.s[1] - fmul v29.4s, v1.4s, v9.s[1] + fmul v20.4s, v0.4s, v9.s[0] + fmul v21.4s, v1.4s, v9.s[0] + fmul v24.4s, v0.4s, v10.s[0] + fmul v25.4s, v1.4s, v10.s[0] + fmul v28.4s, v0.4s, v11.s[0] + fmul v29.4s, v1.4s, v11.s[0] - ld1 {v12.2s, v13.2s}, [pB] - add pB, pB, #16 - ld1 {v4.4s}, [pA] - add pA, pA, #16 - ld1 {v5.4s}, [pA] - add pA, pA, #16 + ldp s12, s13, [pB], #8 + ldp s14, s15, [pB], #8 + + ldr q4, [pA], #16 + ldr q5, [pA], #16 .endm .macro KERNEL8x4_M1 fmla v16.4s, v0.4s, v8.s[0] fmla v17.4s, v1.4s, v8.s[0] - fmla v20.4s, v0.4s, v8.s[1] - fmla v21.4s, v1.4s, v8.s[1] - fmla v24.4s, v0.4s, v9.s[0] - fmla v25.4s, v1.4s, v9.s[0] - fmla v28.4s, v0.4s, v9.s[1] - fmla v29.4s, v1.4s, v9.s[1] + fmla v20.4s, v0.4s, v9.s[0] + fmla v21.4s, v1.4s, v9.s[0] + fmla v24.4s, v0.4s, v10.s[0] + fmla v25.4s, v1.4s, v10.s[0] + fmla v28.4s, v0.4s, v11.s[0] + fmla v29.4s, v1.4s, v11.s[0] - ld1 {v12.2s, v13.2s}, [pB] - add pB, pB, #16 - ld1 {v4.4s}, [pA] - add pA, pA, #16 - ld1 {v5.4s}, [pA] - add pA, pA, #16 + ldp s12, s13, [pB], #8 + ldp s14, s15, [pB], #8 + + ldr q4, [pA], #16 + ldr q5, [pA], #16 .endm .macro KERNEL8x4_M2 fmla v16.4s, v4.4s, v12.s[0] fmla v17.4s, v5.4s, v12.s[0] - fmla v20.4s, v4.4s, v12.s[1] - fmla v21.4s, v5.4s, v12.s[1] - fmla v24.4s, v4.4s, v13.s[0] - fmla v25.4s, v5.4s, v13.s[0] - fmla v28.4s, v4.4s, v13.s[1] - fmla v29.4s, v5.4s, v13.s[1] + fmla v20.4s, v4.4s, v13.s[0] + fmla v21.4s, v5.4s, v13.s[0] + fmla v24.4s, v4.4s, v14.s[0] + fmla v25.4s, v5.4s, v14.s[0] + fmla v28.4s, v4.4s, v15.s[0] + fmla v29.4s, v5.4s, v15.s[0] - ld1 {v8.2s, v9.2s}, [pB] - add pB, pB, #16 - ld1 {v0.4s}, [pA] - add pA, pA, #16 - ld1 {v1.4s}, [pA] - add pA, pA, #16 + ldp s8, s9, [pB], #8 + ldp s10, s11, [pB], #8 + + ldr q0, [pA], #16 + ldr q1, [pA], #16 .endm .macro KERNEL8x4_E fmla v16.4s, v4.4s, v12.s[0] fmla v17.4s, v5.4s, v12.s[0] - fmla v20.4s, v4.4s, v12.s[1] - fmla v21.4s, v5.4s, v12.s[1] - fmla v24.4s, v4.4s, v13.s[0] - fmla v25.4s, v5.4s, v13.s[0] - fmla v28.4s, v4.4s, v13.s[1] - fmla v29.4s, v5.4s, v13.s[1] + fmla v20.4s, v4.4s, v13.s[0] + fmla v21.4s, 
v5.4s, v13.s[0] + fmla v24.4s, v4.4s, v14.s[0] + fmla v25.4s, v5.4s, v14.s[0] + fmla v28.4s, v4.4s, v15.s[0] + fmla v29.4s, v5.4s, v15.s[0] .endm .macro KERNEL8x4_SUB - ld1 {v8.2s, v9.2s}, [pB] - add pB, pB, #16 - ld1 {v0.4s}, [pA] - add pA, pA, #16 - ld1 {v1.4s}, [pA] - add pA, pA, #16 + ldp s8, s9, [pB], #8 + ldp s10, s11, [pB], #8 + + ldr q0, [pA], #16 + ldr q1, [pA], #16 fmla v16.4s, v0.4s, v8.s[0] fmla v17.4s, v1.4s, v8.s[0] - fmla v20.4s, v0.4s, v8.s[1] - fmla v21.4s, v1.4s, v8.s[1] - fmla v24.4s, v0.4s, v9.s[0] - fmla v25.4s, v1.4s, v9.s[0] - fmla v28.4s, v0.4s, v9.s[1] - fmla v29.4s, v1.4s, v9.s[1] + fmla v20.4s, v0.4s, v9.s[0] + fmla v21.4s, v1.4s, v9.s[0] + fmla v24.4s, v0.4s, v10.s[0] + fmla v25.4s, v1.4s, v10.s[0] + fmla v28.4s, v0.4s, v11.s[0] + fmla v29.4s, v1.4s, v11.s[0] .endm .macro SAVE8x4 - add pCRow1, pCRow0, LDC + fmov alpha0, alpha fmul v0.4s, v16.4s, alphaV0 - fmul v1.4s, v17.4s, alphaV1 - st1 {v0.4s, v1.4s}, [pCRow0] - - add pCRow2, pCRow1, LDC - - fmul v4.4s, v20.4s, alphaV0 - fmul v5.4s, v21.4s, alphaV1 - st1 {v4.4s, v5.4s}, [pCRow1] - - add pCRow1, pCRow2, LDC - - fmul v0.4s, v24.4s, alphaV0 - fmul v1.4s, v25.4s, alphaV1 - st1 {v0.4s, v1.4s}, [pCRow2] - - fmul v4.4s, v28.4s, alphaV0 - fmul v5.4s, v29.4s, alphaV1 - st1 {v4.4s, v5.4s}, [pCRow1] + fmul v1.4s, v17.4s, alphaV0 + stp q0, q1, [pCRow0] add pCRow0, pCRow0, #32 + + fmul v2.4s, v20.4s, alphaV0 + fmul v3.4s, v21.4s, alphaV0 + stp q2, q3, [pCRow1] + + add pCRow1, pCRow1, #32 + + fmul v4.4s, v24.4s, alphaV0 + fmul v5.4s, v25.4s, alphaV0 + stp q4, q5, [pCRow2] + + add pCRow2, pCRow2, #32 + + fmul v6.4s, v28.4s, alphaV0 + fmul v7.4s, v29.4s, alphaV0 + stp q6, q7, [pCRow3] + + add pCRow3, pCRow3, #32 .endm /******************************************************************************/ .macro INIT4x4 fmov s16, wzr - fmov s17, s16 - fmov s20, s17 - fmov s21, s16 - fmov s24, s17 - fmov s25, s16 - fmov s28, s17 - fmov s29, s16 + fmov s20, wzr + fmov s24, wzr + fmov s28, wzr .endm .macro KERNEL4x4_I - ld1 {v8.2s, v9.2s}, [pB] - add pB, pB, #16 - ld1 {v0.2s, v1.2s}, [pA] - add pA, pA, #16 + ldp s8, s9, [pB], #8 + ldp s10, s11, [pB], #8 - fmul v16.2s, v0.2s, v8.s[0] - fmul v29.2s, v1.2s, v9.s[1] + ldr q0, [pA], #16 - fmul v20.2s, v0.2s, v8.s[1] - fmul v25.2s, v1.2s, v9.s[0] + fmul v16.4s, v0.4s, v8.s[0] + fmul v20.4s, v0.4s, v9.s[0] + fmul v24.4s, v0.4s, v10.s[0] + fmul v28.4s, v0.4s, v11.s[0] - fmul v24.2s, v0.2s, v9.s[0] - fmul v21.2s, v1.2s, v8.s[1] + ldp s12, s13, [pB], #8 + ldp s14, s15, [pB], #8 - fmul v28.2s, v0.2s, v9.s[1] - fmul v17.2s, v1.2s, v8.s[0] - - ld1 {v12.2s, v13.2s}, [pB] - add pB, pB, #16 - ld1 {v4.2s, v5.2s}, [pA] - add pA, pA, #16 + ldr q1, [pA], #16 .endm .macro KERNEL4x4_M1 - fmla v16.2s, v0.2s, v8.s[0] - fmla v29.2s, v1.2s, v9.s[1] + fmla v16.4s, v0.4s, v8.s[0] + fmla v20.4s, v0.4s, v9.s[0] + fmla v24.4s, v0.4s, v10.s[0] + fmla v28.4s, v0.4s, v11.s[0] - ld1 {v12.2s, v13.2s}, [pB] // For next round - add pB, pB, #16 + ldp s12, s13, [pB], #8 + ldp s14, s15, [pB], #8 - fmla v20.2s, v0.2s, v8.s[1] - fmla v25.2s, v1.2s, v9.s[0] - - ld1 {v4.2s, v5.2s}, [pA] // For next round - add pA, pA, #16 - - fmla v24.2s, v0.2s, v9.s[0] - fmla v21.2s, v1.2s, v8.s[1] - - prfm PLDL1KEEP, [pB, #512] - - fmla v28.2s, v0.2s, v9.s[1] - fmla v17.2s, v1.2s, v8.s[0] + ldr q1, [pA], #16 .endm .macro KERNEL4x4_M2 - fmla v16.2s, v4.2s, v12.s[0] - fmla v29.2s, v5.2s, v13.s[1] + fmla v16.4s, v1.4s, v12.s[0] + fmla v20.4s, v1.4s, v13.s[0] + fmla v24.4s, v1.4s, v14.s[0] + fmla v28.4s, v1.4s, v15.s[0] - ld1 {v8.2s, v9.2s}, [pB] 
// For next round - add pB, pB, #16 + ldp s8, s9, [pB], #8 + ldp s10, s11, [pB], #8 - fmla v20.2s, v4.2s, v12.s[1] - fmla v25.2s, v5.2s, v13.s[0] - - ld1 {v0.2s, v1.2s}, [pA] // For next round - add pA, pA, #16 - - fmla v24.2s, v4.2s, v13.s[0] - fmla v21.2s, v5.2s, v12.s[1] - - prfm PLDL1KEEP, [pA, #512] - - fmla v28.2s, v4.2s, v13.s[1] - fmla v17.2s, v5.2s, v12.s[0] + ldr q0, [pA], #16 .endm .macro KERNEL4x4_E - fmla v16.2s, v4.2s, v12.s[0] - fmla v29.2s, v5.2s, v13.s[1] - - fmla v20.2s, v4.2s, v12.s[1] - fmla v25.2s, v5.2s, v13.s[0] - - fmla v24.2s, v4.2s, v13.s[0] - fmla v21.2s, v5.2s, v12.s[1] - - fmla v28.2s, v4.2s, v13.s[1] - fmla v17.2s, v5.2s, v12.s[0] + fmla v16.4s, v1.4s, v12.s[0] + fmla v20.4s, v1.4s, v13.s[0] + fmla v24.4s, v1.4s, v14.s[0] + fmla v28.4s, v1.4s, v15.s[0] .endm .macro KERNEL4x4_SUB - ld1 {v8.2s, v9.2s}, [pB] - add pB, pB, #16 - ld1 {v0.2s, v1.2s}, [pA] - add pA, pA, #16 + ldp s8, s9, [pB], #8 + ldp s10, s11, [pB], #8 - fmla v16.2s, v0.2s, v8.s[0] - fmla v29.2s, v1.2s, v9.s[1] + ldr q0, [pA], #16 - fmla v20.2s, v0.2s, v8.s[1] - fmla v25.2s, v1.2s, v9.s[0] - - fmla v24.2s, v0.2s, v9.s[0] - fmla v21.2s, v1.2s, v8.s[1] - - fmla v28.2s, v0.2s, v9.s[1] - fmla v17.2s, v1.2s, v8.s[0] + fmla v16.4s, v0.4s, v8.s[0] + fmla v20.4s, v0.4s, v9.s[0] + fmla v24.4s, v0.4s, v10.s[0] + fmla v28.4s, v0.4s, v11.s[0] .endm .macro SAVE4x4 + fmov alpha0, alpha - fmul v8.2s, v16.2s, alphaV0 - fmul v9.2s, v17.2s, alphaV1 - st1 {v8.2s, v9.2s}, [pCRow0] - - add pCRow1, pCRow0, LDC - - fmul v12.2s, v20.2s, alphaV2 - fmul v13.2s, v21.2s, alphaV3 - st1 {v12.2s, v13.2s}, [pCRow1] - - add pCRow2, pCRow1, LDC - - fmul v8.2s, v24.2s, alphaV0 - fmul v9.2s, v25.2s, alphaV1 - st1 {v8.2s, v9.2s}, [pCRow2] - - add pCRow1, pCRow2, LDC - - fmul v12.2s, v28.2s, alphaV2 - fmul v13.2s, v29.2s, alphaV3 - st1 {v12.2s, v13.2s}, [pCRow1] + fmul v0.4s, v16.4s, alphaV0 + str q0, [pCRow0] add pCRow0, pCRow0, #16 + + fmul v1.4s, v20.4s, alphaV0 + str q1, [pCRow1] + + add pCRow1, pCRow1, #16 + + fmul v2.4s, v24.4s, alphaV0 + str q2, [pCRow2] + + add pCRow2, pCRow2, #16 + + fmul v3.4s, v28.4s, alphaV0 + str q3, [pCRow3] + + add pCRow3, pCRow3, #16 .endm /******************************************************************************/ @@ -628,34 +616,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro KERNEL2x4_SUB - ld1 {v8.2s, v9.2s}, [pB] - add pB, pB, #16 - ld1 {v0.2s}, [pA] - add pA, pA, #8 + ldp s8, s9, [pB], #8 + ldp s10, s11, [pB], #8 + + ldr d0, [pA], #8 fmla v16.2s, v0.2s, v8.s[0] - fmla v20.2s, v0.2s, v8.s[1] - fmla v24.2s, v0.2s, v9.s[0] - fmla v28.2s, v0.2s, v9.s[1] + fmla v20.2s, v0.2s, v9.s[0] + fmla v24.2s, v0.2s, v10.s[0] + fmla v28.2s, v0.2s, v11.s[0] .endm .macro SAVE2x4 - fmul v8.2s, v16.2s, alphaV0 - st1 {v8.2s}, [pCRow0] + fmov alpha0, alpha - add pCRow1, pCRow0, LDC - fmul v12.2s, v20.2s, alphaV1 - st1 {v12.2s}, [pCRow1] - - add pCRow2, pCRow1, LDC - fmul v8.2s, v24.2s, alphaV2 - st1 {v8.2s}, [pCRow2] - - add pCRow1, pCRow2, LDC - fmul v12.2s, v28.2s, alphaV3 - st1 {v12.2s}, [pCRow1] + fmul v0.2s, v16.2s, alphaV0 + str d0, [pCRow0] add pCRow0, pCRow0, #8 + + fmul v1.2s, v20.2s, alphaV0 + str d1, [pCRow1] + + add pCRow1, pCRow1, #8 + + fmul v0.2s, v24.2s, alphaV0 + str d0, [pCRow2] + + add pCRow2, pCRow2, #8 + + fmul v1.2s, v28.2s, alphaV0 + str d1, [pCRow3] + + add pCRow3, pCRow3, #8 .endm /******************************************************************************/ @@ -677,20 +670,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE1x4 - add pCRow1, pCRow0, LDC + fmov alpha0, alpha fmul v8.2s, v16.2s, alphaV0 st1 {v8.s}[0], [pCRow0] st1 {v8.s}[1], [pCRow1] - add pCRow2, pCRow1, LDC - add pCRow1, pCRow2, LDC - - fmul v12.2s, v20.2s, alphaV1 - st1 {v12.s}[0], [pCRow2] - st1 {v12.s}[1], [pCRow1] - add pCRow0, pCRow0, #4 + add pCRow1, pCRow1, #4 + + fmul v12.2s, v20.2s, alphaV0 + st1 {v12.s}[0], [pCRow2] + st1 {v12.s}[1], [pCRow3] + + add pCRow2, pCRow2, #4 + add pCRow3, pCRow3, #4 .endm /******************************************************************************/ @@ -730,18 +724,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE16x2 + fmov alpha0, alpha + add pCRow1, pCRow0, LDC fmul v0.4s, v16.4s, alphaV0 - fmul v1.4s, v17.4s, alphaV1 - fmul v2.4s, v18.4s, alphaV2 - fmul v3.4s, v19.4s, alphaV3 + fmul v1.4s, v17.4s, alphaV0 + fmul v2.4s, v18.4s, alphaV0 + fmul v3.4s, v19.4s, alphaV0 st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0] fmul v4.4s, v20.4s, alphaV0 - fmul v5.4s, v21.4s, alphaV1 - fmul v6.4s, v22.4s, alphaV2 - fmul v7.4s, v23.4s, alphaV3 + fmul v5.4s, v21.4s, alphaV0 + fmul v6.4s, v22.4s, alphaV0 + fmul v7.4s, v23.4s, alphaV0 st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1] add pCRow0, pCRow0, #64 @@ -772,16 +768,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE8x2 + fmov alpha0, alpha + add pCRow1, pCRow0, LDC fmul v0.4s, v16.4s, alphaV0 - fmul v1.4s, v17.4s, alphaV1 + fmul v1.4s, v17.4s, alphaV0 st1 {v0.4s, v1.4s}, [pCRow0] add pCRow2, pCRow1, LDC fmul v4.4s, v20.4s, alphaV0 - fmul v5.4s, v21.4s, alphaV1 + fmul v5.4s, v21.4s, alphaV0 st1 {v4.4s, v5.4s}, [pCRow1] add pCRow0, pCRow0, #32 @@ -809,15 +807,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE4x2 + fmov alpha0, alpha fmul v8.2s, v16.2s, alphaV0 - fmul v9.2s, v17.2s, alphaV1 + fmul v9.2s, v17.2s, alphaV0 st1 {v8.2s, v9.2s}, [pCRow0] add pCRow1, pCRow0, LDC - fmul v12.2s, v20.2s, alphaV2 - fmul v13.2s, v21.2s, alphaV3 + fmul v12.2s, v20.2s, alphaV0 + fmul v13.2s, v21.2s, alphaV0 st1 {v12.2s, v13.2s}, [pCRow1] add pCRow0, pCRow0, #16 @@ -842,12 +841,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro SAVE2x2 + fmov alpha0, alpha + fmul v8.2s, v16.2s, alphaV0 st1 {v8.2s}, [pCRow0] add pCRow1 , pCRow0, LDC - fmul v12.2s, v20.2s, alphaV1 + fmul v12.2s, v20.2s, alphaV0 st1 {v12.2s}, [pCRow1] add pCRow0, pCRow0, #8 @@ -870,6 +871,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE1x2 + fmov alpha0, alpha + add pCRow1 , pCRow0, LDC fmul v8.2s, v16.2s, alphaV0 @@ -908,11 +911,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE16x1 + fmov alpha0, alpha fmul v0.4s, v16.4s, alphaV0 - fmul v1.4s, v17.4s, alphaV1 - fmul v2.4s, v18.4s, alphaV2 - fmul v3.4s, v19.4s, alphaV3 + fmul v1.4s, v17.4s, alphaV0 + fmul v2.4s, v18.4s, alphaV0 + fmul v3.4s, v19.4s, alphaV0 st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow0] add pCRow0, pCRow0, #64 @@ -939,9 +943,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE8x1 + fmov alpha0, alpha fmul v0.4s, v16.4s, alphaV0 - fmul v1.4s, v17.4s, alphaV1 + fmul v1.4s, v17.4s, alphaV0 st1 {v0.4s, v1.4s}, [pCRow0] add pCRow0, pCRow0, #32 @@ -966,9 +971,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE4x1 + fmov alpha0, alpha fmul v8.2s, v16.2s, alphaV0 - fmul v9.2s, v17.2s, alphaV1 + fmul v9.2s, v17.2s, alphaV0 st1 {v8.2s, v9.2s}, [pCRow0] add pCRow0, pCRow0, #16 @@ -991,6 +997,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE2x1 + fmov alpha0, alpha fmul v8.2s, v16.2s, alphaV0 st1 {v8.2s}, [pCRow0] @@ -1015,6 +1022,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE1x1 + fmov alpha0, alpha fmul s8, s16, alpha0 str s8, [pCRow0] @@ -1043,10 +1051,10 @@ strmm_kernel_begin: stp x26, x27, [sp, #(9 * 16)] str x28, [sp, #(10 * 16)] - fmov alpha0, s0 - fmov alpha1, s0 - fmov alpha2, s0 - fmov alpha3, s0 + prfm PLDL1KEEP, [origPB] + prfm PLDL1KEEP, [origPA] + + fmov alpha, s0 lsl LDC, LDC, #2 // ldc = ldc * 4 @@ -1063,8 +1071,13 @@ strmm_kernel_begin: /******************************************************************************/ strmm_kernel_L4_BEGIN: - mov pCRow0, pC // pCRow0 = C - add pC, pC, LDC, lsl #2 + mov pCRow0, pC + add pCRow1, pCRow0, LDC + add pCRow2, pCRow1, LDC + add pCRow3, pCRow2, LDC + + add pC, pCRow3, LDC + #if defined(LEFT) mov tempOffset, offset @@ -1078,6 +1091,7 @@ strmm_kernel_L4_M16_BEGIN: cmp counterI, #0 ble strmm_kernel_L4_M8_BEGIN + .align 5 strmm_kernel_L4_M16_20: #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) @@ -1098,38 +1112,64 @@ strmm_kernel_L4_M16_20: add tempK, tempOffset, #4 #endif - asr counterL , tempK, #1 // L = K / 2 - cmp counterL , #2 // is there at least 4 to do? 
+ asr counterL , tempK, #3 + cmp counterL , #2 blt strmm_kernel_L4_M16_32 - KERNEL16x4_I // do one in the K - KERNEL16x4_M2 // do another in the K + KERNEL16x4_I + KERNEL16x4_M2 + KERNEL16x4_M1 + KERNEL16x4_M2 + KERNEL16x4_M1 + KERNEL16x4_M2 + KERNEL16x4_M1 + KERNEL16x4_M2 subs counterL, counterL, #2 ble strmm_kernel_L4_M16_22a - .align 5 + .align 5 strmm_kernel_L4_M16_22: KERNEL16x4_M1 KERNEL16x4_M2 + KERNEL16x4_M1 + KERNEL16x4_M2 + KERNEL16x4_M1 + KERNEL16x4_M2 + KERNEL16x4_M1 + KERNEL16x4_M2 subs counterL, counterL, #1 bgt strmm_kernel_L4_M16_22 + .align 5 strmm_kernel_L4_M16_22a: + KERNEL16x4_M1 + KERNEL16x4_M2 + KERNEL16x4_M1 + KERNEL16x4_M2 + KERNEL16x4_M1 + KERNEL16x4_M2 KERNEL16x4_M1 KERNEL16x4_E b strmm_kernel_L4_M16_44 + .align 5 strmm_kernel_L4_M16_32: tst counterL, #1 ble strmm_kernel_L4_M16_40 KERNEL16x4_I + KERNEL16x4_M2 + KERNEL16x4_M1 + KERNEL16x4_M2 + KERNEL16x4_M1 + KERNEL16x4_M2 + KERNEL16x4_M1 KERNEL16x4_E b strmm_kernel_L4_M16_44 @@ -1140,12 +1180,15 @@ strmm_kernel_L4_M16_40: strmm_kernel_L4_M16_44: - ands counterL , tempK, #1 + ands counterL , tempK, #7 ble strmm_kernel_L4_M16_100 + .align 5 strmm_kernel_L4_M16_46: KERNEL16x4_SUB + subs counterL, counterL, #1 + bne strmm_kernel_L4_M16_46 strmm_kernel_L4_M16_100: @@ -1166,6 +1209,9 @@ strmm_kernel_L4_M16_100: #if defined(LEFT) add tempOffset, tempOffset, #16 #endif + prfm PLDL1KEEP, [pA] + prfm PLDL1KEEP, [pA, #64] + prfm PLDL1KEEP, [origPB] strmm_kernel_L4_M16_END: subs counterI, counterI, #1 diff --git a/kernel/arm64/zgemm_kernel_4x4.S b/kernel/arm64/zgemm_kernel_4x4.S index 1cb695e56..08a1531cf 100644 --- a/kernel/arm64/zgemm_kernel_4x4.S +++ b/kernel/arm64/zgemm_kernel_4x4.S @@ -46,20 +46,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define pCRow0 x12 #define pCRow1 x13 #define pCRow2 x14 -#define pA x15 -#define alpha_save_R x16 -#define alpha_save_I x17 +#define pCRow3 x15 +#define pA x16 +#define alphaR x17 +#define alphaI x18 #define alpha0_R d10 #define alphaV0_R v10.d[0] #define alpha0_I d11 #define alphaV0_I v11.d[0] -#define alpha1_R d14 -#define alphaV1_R v14.d[0] -#define alpha1_I d15 -#define alphaV1_I v15.d[0] - +#define A_PRE_SIZE 2560 +#define B_PRE_SIZE 448 +#define C_PRE_SIZE 128 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #define OP_rr fmla @@ -98,10 +97,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // 12 pCRow0 // 13 pCRow1 // 14 pCRow2 -// 15 pA -// 16 alpha_save_R -// 17 alpha_save_I -// 18 must save +// 15 pCRow3 +// 16 pA +// 17 alpha_save_R +// 18 must save alpha_save_I // 19 must save // 20 must save // 21 must save @@ -175,12 +174,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x4_I ld2 {v8.2d, v9.2d}, [pB] add pB, pB, #32 - ld2 {v10.2d, v11.2d}, [pB] - add pB, pB, #32 ld2 {v0.2d, v1.2d}, [pA] add pA, pA, #32 - ld2 {v2.2d, v3.2d}, [pA] - add pA, pA, #32 fmul v16.2d, v0.2d, v8.d[0] OP_ii v16.2d, v1.2d, v9.d[0] @@ -193,16 +188,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif OP_ir v17.2d, v1.2d, v8.d[0] - fmul v18.2d, v2.2d, v8.d[0] - OP_ii v18.2d, v3.2d, v9.d[0] -#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ - defined(RR) || defined(RC) || defined(CR) || defined(CC) - eor v19.16b, v19.16b, v19.16b - fmls v19.2d, v2.2d, v9.d[0] -#else - fmul v19.2d, v2.2d, v9.d[0] -#endif - OP_ir v19.2d, v3.2d, v8.d[0] + ld2 {v2.2d, v3.2d}, [pA] + add pA, pA, #32 fmul v20.2d, v0.2d, v8.d[1] OP_ii v20.2d, v1.2d, v9.d[1] @@ -215,6 +202,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif OP_ir v21.2d, v1.2d, v8.d[1] + ld2 {v10.2d, v11.2d}, [pB] + add pB, pB, #32 + fmul v22.2d, v2.2d, v8.d[1] OP_ii v22.2d, v3.2d, v9.d[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ @@ -226,6 +216,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif OP_ir v23.2d, v3.2d, v8.d[1] + ld2 {v12.2d, v13.2d}, [pB] + add pB, pB, #32 + + fmul v18.2d, v2.2d, v8.d[0] + OP_ii v18.2d, v3.2d, v9.d[0] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + eor v19.16b, v19.16b, v19.16b + fmls v19.2d, v2.2d, v9.d[0] +#else + fmul v19.2d, v2.2d, v9.d[0] +#endif + OP_ir v19.2d, v3.2d, v8.d[0] + + ld2 {v4.2d, v5.2d} , [pA] + add pA, pA, #32 + fmul v24.2d, v0.2d, v10.d[0] OP_ii v24.2d, v1.2d, v11.d[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ @@ -237,6 +244,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif OP_ir v25.2d, v1.2d, v10.d[0] + ld2 {v6.2d, v7.2d} , [pA] + add pA, pA, #32 + fmul v26.2d, v2.2d, v10.d[0] OP_ii v26.2d, v3.2d, v11.d[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ @@ -248,6 +258,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif OP_ir v27.2d, v3.2d, v10.d[0] + ld2 {v14.2d, v15.2d}, [pB] + add pB, pB, #32 + fmul v28.2d, v0.2d, v10.d[1] OP_ii v28.2d, v1.2d, v11.d[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ @@ -259,6 +272,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif OP_ir v29.2d, v1.2d, v10.d[1] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmul v30.2d, v2.2d, v10.d[1] OP_ii v30.2d, v3.2d, v11.d[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ @@ -270,14 +285,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif OP_ir v31.2d, v3.2d, v10.d[1] - ld2 {v12.2d, v13.2d}, [pB] - add pB, pB, #32 - ld2 {v14.2d, v15.2d}, [pB] - add pB, pB, #32 - ld2 {v4.2d, v5.2d} , [pA] - add pA, pA, #32 - ld2 {v6.2d, v7.2d} , [pA] - add pA, pA, #32 + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] .endm .macro KERNEL4x4_M1 @@ -286,7 +294,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. OP_ri v17.2d, v0.2d, v9.d[0] OP_ir v17.2d, v1.2d, v8.d[0] - ld2 {v12.2d, v13.2d}, [pB] // For next round + ld2 {v12.2d, v13.2d}, [pB] add pB, pB, #32 OP_rr v18.2d, v2.2d, v8.d[0] @@ -294,15 +302,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
OP_ri v19.2d, v2.2d, v9.d[0] OP_ir v19.2d, v3.2d, v8.d[0] - ld2 {v14.2d, v15.2d}, [pB] // For next round - add pB, pB, #32 + ld2 {v4.2d, v5.2d} , [pA] + add pA, pA, #32 OP_rr v20.2d, v0.2d, v8.d[1] OP_ii v20.2d, v1.2d, v9.d[1] OP_ri v21.2d, v0.2d, v9.d[1] OP_ir v21.2d, v1.2d, v8.d[1] - ld2 {v4.2d, v5.2d} , [pA] // For next round + ld2 {v6.2d, v7.2d} , [pA] add pA, pA, #32 OP_rr v22.2d, v2.2d, v8.d[1] @@ -310,22 +318,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. OP_ri v23.2d, v2.2d, v9.d[1] OP_ir v23.2d, v3.2d, v8.d[1] - ld2 {v6.2d, v7.2d} , [pA] // For next round - add pA, pA, #32 + ld2 {v14.2d, v15.2d}, [pB] + add pB, pB, #32 OP_rr v24.2d, v0.2d, v10.d[0] OP_ii v24.2d, v1.2d, v11.d[0] OP_ri v25.2d, v0.2d, v11.d[0] OP_ir v25.2d, v1.2d, v10.d[0] - prfm PLDL1KEEP, [pA, #512] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] OP_rr v26.2d, v2.2d, v10.d[0] OP_ii v26.2d, v3.2d, v11.d[0] OP_ri v27.2d, v2.2d, v11.d[0] OP_ir v27.2d, v3.2d, v10.d[0] - prfm PLDL1KEEP, [pB, #512] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] OP_rr v28.2d, v0.2d, v10.d[1] OP_ii v28.2d, v1.2d, v11.d[1] @@ -344,7 +352,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. OP_ri v17.2d, v4.2d, v13.d[0] OP_ir v17.2d, v5.2d, v12.d[0] - ld2 {v8.2d, v9.2d}, [pB] // For next round + ld2 {v8.2d, v9.2d}, [pB] add pB, pB, #32 OP_rr v18.2d, v6.2d, v12.d[0] @@ -352,15 +360,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. OP_ri v19.2d, v6.2d, v13.d[0] OP_ir v19.2d, v7.2d, v12.d[0] - ld2 {v10.2d, v11.2d}, [pB] // For next round - add pB, pB, #32 + ld2 {v0.2d, v1.2d}, [pA] + add pA, pA, #32 OP_rr v20.2d, v4.2d, v12.d[1] OP_ii v20.2d, v5.2d, v13.d[1] OP_ri v21.2d, v4.2d, v13.d[1] OP_ir v21.2d, v5.2d, v12.d[1] - ld2 {v0.2d, v1.2d}, [pA] // For next round + ld2 {v2.2d, v3.2d}, [pA] add pA, pA, #32 OP_rr v22.2d, v6.2d, v12.d[1] @@ -368,22 +376,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. OP_ri v23.2d, v6.2d, v13.d[1] OP_ir v23.2d, v7.2d, v12.d[1] - ld2 {v2.2d, v3.2d}, [pA] // For next round - add pA, pA, #32 + ld2 {v10.2d, v11.2d}, [pB] + add pB, pB, #32 OP_rr v24.2d, v4.2d, v14.d[0] OP_ii v24.2d, v5.2d, v15.d[0] OP_ri v25.2d, v4.2d, v15.d[0] OP_ir v25.2d, v5.2d, v14.d[0] - prfm PLDL1KEEP, [pA, #512] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] OP_rr v26.2d, v6.2d, v14.d[0] OP_ii v26.2d, v7.2d, v15.d[0] OP_ri v27.2d, v6.2d, v15.d[0] OP_ir v27.2d, v7.2d, v14.d[0] - prfm PLDL1KEEP, [pB, #512] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] OP_rr v28.2d, v4.2d, v14.d[1] OP_ii v28.2d, v5.2d, v15.d[1] @@ -412,6 +420,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. OP_ri v21.2d, v4.2d, v13.d[1] OP_ir v21.2d, v5.2d, v12.d[1] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + OP_rr v22.2d, v6.2d, v12.d[1] OP_ii v22.2d, v7.2d, v13.d[1] OP_ri v23.2d, v6.2d, v13.d[1] @@ -422,6 +432,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. OP_ri v25.2d, v4.2d, v15.d[0] OP_ir v25.2d, v5.2d, v14.d[0] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] + OP_rr v26.2d, v6.2d, v14.d[0] OP_ii v26.2d, v7.2d, v15.d[0] OP_ri v27.2d, v6.2d, v15.d[0] @@ -441,33 +453,40 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.macro KERNEL4x4_SUB ld2 {v8.2d, v9.2d}, [pB] add pB, pB, #32 - ld2 {v10.2d, v11.2d}, [pB] - add pB, pB, #32 + ld2 {v0.2d, v1.2d}, [pA] add pA, pA, #32 - ld2 {v2.2d, v3.2d}, [pA] - add pA, pA, #32 OP_rr v16.2d, v0.2d, v8.d[0] OP_ii v16.2d, v1.2d, v9.d[0] OP_ri v17.2d, v0.2d, v9.d[0] OP_ir v17.2d, v1.2d, v8.d[0] - OP_rr v18.2d, v2.2d, v8.d[0] - OP_ii v18.2d, v3.2d, v9.d[0] - OP_ri v19.2d, v2.2d, v9.d[0] - OP_ir v19.2d, v3.2d, v8.d[0] + ld2 {v2.2d, v3.2d}, [pA] + add pA, pA, #32 OP_rr v20.2d, v0.2d, v8.d[1] OP_ii v20.2d, v1.2d, v9.d[1] OP_ri v21.2d, v0.2d, v9.d[1] OP_ir v21.2d, v1.2d, v8.d[1] + ld2 {v10.2d, v11.2d}, [pB] + add pB, pB, #32 + + OP_rr v18.2d, v2.2d, v8.d[0] + OP_ii v18.2d, v3.2d, v9.d[0] + OP_ri v19.2d, v2.2d, v9.d[0] + OP_ir v19.2d, v3.2d, v8.d[0] + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + OP_rr v22.2d, v2.2d, v8.d[1] OP_ii v22.2d, v3.2d, v9.d[1] OP_ri v23.2d, v2.2d, v9.d[1] OP_ir v23.2d, v3.2d, v8.d[1] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + OP_rr v24.2d, v0.2d, v10.d[0] OP_ii v24.2d, v1.2d, v11.d[0] OP_ri v25.2d, v0.2d, v11.d[0] @@ -490,74 +509,85 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE4x4 - fmov alpha0_R, alpha_save_R - fmov alpha0_I, alpha_save_I - fmov alpha1_R, alpha0_R - fmov alpha1_I, alpha0_I + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI - mov pCRow1, pCRow0 + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] - ld2 {v0.2d, v1.2d}, [pCRow1] + ld2 {v0.2d, v1.2d}, [pCRow0] fmla v0.2d, v16.2d, alphaV0_R fmls v0.2d, v17.2d, alphaV0_I - fmla v1.2d, v16.2d, alphaV1_I - fmla v1.2d, v17.2d, alphaV1_R - st2 {v0.2d, v1.2d}, [pCRow1] - add pCRow2, pCRow1, #32 - ld2 {v2.2d, v3.2d}, [pCRow2] + fmla v1.2d, v16.2d, alphaV0_I + fmla v1.2d, v17.2d, alphaV0_R + st2 {v0.2d, v1.2d}, [pCRow0] + + add pCRow0, pCRow0, #32 + + ld2 {v2.2d, v3.2d}, [pCRow0] fmla v2.2d, v18.2d, alphaV0_R fmls v2.2d, v19.2d, alphaV0_I - fmla v3.2d, v18.2d, alphaV1_I - fmla v3.2d, v19.2d, alphaV1_R - st2 {v2.2d, v3.2d}, [pCRow2] + fmla v3.2d, v18.2d, alphaV0_I + fmla v3.2d, v19.2d, alphaV0_R + st2 {v2.2d, v3.2d}, [pCRow0] + + add pCRow0, pCRow0, #32 + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] - add pCRow1, pCRow1, LDC ld2 {v4.2d, v5.2d}, [pCRow1] fmla v4.2d, v20.2d, alphaV0_R fmls v4.2d, v21.2d, alphaV0_I - fmla v5.2d, v20.2d, alphaV1_I - fmla v5.2d, v21.2d, alphaV1_R + fmla v5.2d, v20.2d, alphaV0_I + fmla v5.2d, v21.2d, alphaV0_R st2 {v4.2d, v5.2d}, [pCRow1] - add pCRow2, pCRow1, #32 - ld2 {v6.2d, v7.2d}, [pCRow2] + + add pCRow1, pCRow1, #32 + + ld2 {v6.2d, v7.2d}, [pCRow1] fmla v6.2d, v22.2d, alphaV0_R fmls v6.2d, v23.2d, alphaV0_I - fmla v7.2d, v22.2d, alphaV1_I - fmla v7.2d, v23.2d, alphaV1_R - st2 {v6.2d, v7.2d}, [pCRow2] + fmla v7.2d, v22.2d, alphaV0_I + fmla v7.2d, v23.2d, alphaV0_R + st2 {v6.2d, v7.2d}, [pCRow1] - add pCRow1, pCRow1, LDC - ld2 {v0.2d, v1.2d}, [pCRow1] + add pCRow1, pCRow1, #32 + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] + + ld2 {v0.2d, v1.2d}, [pCRow2] fmla v0.2d, v24.2d, alphaV0_R fmls v0.2d, v25.2d, alphaV0_I - fmla v1.2d, v24.2d, alphaV1_I - fmla v1.2d, v25.2d, alphaV1_R - st2 {v0.2d, v1.2d}, [pCRow1] - add pCRow2, pCRow1, #32 + fmla v1.2d, v24.2d, alphaV0_I + fmla v1.2d, v25.2d, alphaV0_R + st2 {v0.2d, v1.2d}, [pCRow2] + + add pCRow2, pCRow2, #32 + ld2 {v2.2d, v3.2d}, [pCRow2] fmla v2.2d, v26.2d, alphaV0_R fmls v2.2d, v27.2d, alphaV0_I - fmla v3.2d, v26.2d, alphaV1_I - fmla v3.2d, v27.2d, alphaV1_R + fmla v3.2d, v26.2d, alphaV0_I + fmla v3.2d, v27.2d, alphaV0_R st2 {v2.2d, v3.2d}, [pCRow2] - add pCRow1, pCRow1, LDC + add pCRow2, pCRow2, #32 + prfm 
PLDL2KEEP, [pCRow3, #C_PRE_SIZE] - ld2 {v4.2d, v5.2d}, [pCRow1] + ld2 {v4.2d, v5.2d}, [pCRow3] fmla v4.2d, v28.2d, alphaV0_R fmls v4.2d, v29.2d, alphaV0_I - fmla v5.2d, v28.2d, alphaV1_I - fmla v5.2d, v29.2d, alphaV1_R - st2 {v4.2d, v5.2d}, [pCRow1] - add pCRow2, pCRow1, #32 - ld2 {v6.2d, v7.2d}, [pCRow2] + fmla v5.2d, v28.2d, alphaV0_I + fmla v5.2d, v29.2d, alphaV0_R + st2 {v4.2d, v5.2d}, [pCRow3] + + add pCRow3, pCRow3, #32 + + ld2 {v6.2d, v7.2d}, [pCRow3] fmla v6.2d, v30.2d, alphaV0_R fmls v6.2d, v31.2d, alphaV0_I - fmla v7.2d, v30.2d, alphaV1_I - fmla v7.2d, v31.2d, alphaV1_R - st2 {v6.2d, v7.2d}, [pCRow2] + fmla v7.2d, v30.2d, alphaV0_I + fmla v7.2d, v31.2d, alphaV0_R + st2 {v6.2d, v7.2d}, [pCRow3] - add pCRow0, pCRow0, #64 + add pCRow3, pCRow3, #32 .endm /******************************************************************************/ @@ -604,18 +634,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE2x4 - fmov alpha0_R, alpha_save_R - fmov alpha0_I, alpha_save_I - fmov alpha1_R, alpha0_R - fmov alpha1_I, alpha0_I + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI mov pCRow1, pCRow0 ld2 {v0.2d, v1.2d}, [pCRow1] fmla v0.2d, v16.2d, alphaV0_R fmls v0.2d, v17.2d, alphaV0_I - fmla v1.2d, v16.2d, alphaV1_I - fmla v1.2d, v17.2d, alphaV1_R + fmla v1.2d, v16.2d, alphaV0_I + fmla v1.2d, v17.2d, alphaV0_R st2 {v0.2d, v1.2d}, [pCRow1] add pCRow1, pCRow1, LDC @@ -623,8 +651,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v4.2d, v5.2d}, [pCRow1] fmla v4.2d, v20.2d, alphaV0_R fmls v4.2d, v21.2d, alphaV0_I - fmla v5.2d, v20.2d, alphaV1_I - fmla v5.2d, v21.2d, alphaV1_R + fmla v5.2d, v20.2d, alphaV0_I + fmla v5.2d, v21.2d, alphaV0_R st2 {v4.2d, v5.2d}, [pCRow1] add pCRow1, pCRow1, LDC @@ -632,8 +660,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v0.2d, v1.2d}, [pCRow1] fmla v0.2d, v24.2d, alphaV0_R fmls v0.2d, v25.2d, alphaV0_I - fmla v1.2d, v24.2d, alphaV1_I - fmla v1.2d, v25.2d, alphaV1_R + fmla v1.2d, v24.2d, alphaV0_I + fmla v1.2d, v25.2d, alphaV0_R st2 {v0.2d, v1.2d}, [pCRow1] add pCRow1, pCRow1, LDC @@ -641,8 +669,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v4.2d, v5.2d}, [pCRow1] fmla v4.2d, v28.2d, alphaV0_R fmls v4.2d, v29.2d, alphaV0_I - fmla v5.2d, v28.2d, alphaV1_I - fmla v5.2d, v29.2d, alphaV1_R + fmla v5.2d, v28.2d, alphaV0_I + fmla v5.2d, v29.2d, alphaV0_R st2 {v4.2d, v5.2d}, [pCRow1] add pCRow0, pCRow0, #32 @@ -691,18 +719,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE1x4 - fmov alpha0_R, alpha_save_R - fmov alpha0_I, alpha_save_I - fmov alpha1_R, alpha0_R - fmov alpha1_I, alpha0_I + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI mov pCRow1, pCRow0 ld2 {v0.d, v1.d}[0], [pCRow1] fmla d0, d16, alphaV0_R fmls d0, d17, alphaV0_I - fmla d1, d16, alphaV1_I - fmla d1, d17, alphaV1_R + fmla d1, d16, alphaV0_I + fmla d1, d17, alphaV0_R st2 {v0.d, v1.d}[0], [pCRow1] add pCRow1, pCRow1, LDC @@ -710,8 +736,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v4.d, v5.d}[0], [pCRow1] fmla d4, d20, alphaV0_R fmls d4, d21, alphaV0_I - fmla d5, d20, alphaV1_I - fmla d5, d21, alphaV1_R + fmla d5, d20, alphaV0_I + fmla d5, d21, alphaV0_R st2 {v4.d, v5.d}[0], [pCRow1] add pCRow1, pCRow1, LDC @@ -719,8 +745,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ld2 {v0.d, v1.d}[0], [pCRow1] fmla d0, d24, alphaV0_R fmls d0, d25, alphaV0_I - fmla d1, d24, alphaV1_I - fmla d1, d25, alphaV1_R + fmla d1, d24, alphaV0_I + fmla d1, d25, alphaV0_R st2 {v0.d, v1.d}[0], [pCRow1] add pCRow1, pCRow1, LDC @@ -728,8 +754,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v4.d, v5.d}[0], [pCRow1] fmla d4, d28, alphaV0_R fmls d4, d29, alphaV0_I - fmla d5, d28, alphaV1_I - fmla d5, d29, alphaV1_R + fmla d5, d28, alphaV0_I + fmla d5, d29, alphaV0_R st2 {v4.d, v5.d}[0], [pCRow1] add pCRow0, pCRow0, #16 @@ -778,25 +804,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE4x2 - fmov alpha0_R, alpha_save_R - fmov alpha0_I, alpha_save_I - fmov alpha1_R, alpha0_R - fmov alpha1_I, alpha0_I + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI mov pCRow1, pCRow0 ld2 {v0.2d, v1.2d}, [pCRow1] fmla v0.2d, v16.2d, alphaV0_R fmls v0.2d, v17.2d, alphaV0_I - fmla v1.2d, v16.2d, alphaV1_I - fmla v1.2d, v17.2d, alphaV1_R + fmla v1.2d, v16.2d, alphaV0_I + fmla v1.2d, v17.2d, alphaV0_R st2 {v0.2d, v1.2d}, [pCRow1] add pCRow2, pCRow1, #32 ld2 {v2.2d, v3.2d}, [pCRow2] fmla v2.2d, v18.2d, alphaV0_R fmls v2.2d, v19.2d, alphaV0_I - fmla v3.2d, v18.2d, alphaV1_I - fmla v3.2d, v19.2d, alphaV1_R + fmla v3.2d, v18.2d, alphaV0_I + fmla v3.2d, v19.2d, alphaV0_R st2 {v2.2d, v3.2d}, [pCRow2] add pCRow1, pCRow1, LDC @@ -804,15 +828,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v4.2d, v5.2d}, [pCRow1] fmla v4.2d, v20.2d, alphaV0_R fmls v4.2d, v21.2d, alphaV0_I - fmla v5.2d, v20.2d, alphaV1_I - fmla v5.2d, v21.2d, alphaV1_R + fmla v5.2d, v20.2d, alphaV0_I + fmla v5.2d, v21.2d, alphaV0_R st2 {v4.2d, v5.2d}, [pCRow1] add pCRow2, pCRow1, #32 ld2 {v6.2d, v7.2d}, [pCRow2] fmla v6.2d, v22.2d, alphaV0_R fmls v6.2d, v23.2d, alphaV0_I - fmla v7.2d, v22.2d, alphaV1_I - fmla v7.2d, v23.2d, alphaV1_R + fmla v7.2d, v22.2d, alphaV0_I + fmla v7.2d, v23.2d, alphaV0_R st2 {v6.2d, v7.2d}, [pCRow2] add pCRow0, pCRow0, #64 @@ -845,18 +869,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE2x2 - fmov alpha0_R, alpha_save_R - fmov alpha0_I, alpha_save_I - fmov alpha1_R, alpha0_R - fmov alpha1_I, alpha0_I + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI mov pCRow1, pCRow0 ld2 {v0.2d, v1.2d}, [pCRow1] fmla v0.2d, v16.2d, alphaV0_R fmls v0.2d, v17.2d, alphaV0_I - fmla v1.2d, v16.2d, alphaV1_I - fmla v1.2d, v17.2d, alphaV1_R + fmla v1.2d, v16.2d, alphaV0_I + fmla v1.2d, v17.2d, alphaV0_R st2 {v0.2d, v1.2d}, [pCRow1] add pCRow1, pCRow1, LDC @@ -864,8 +886,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v4.2d, v5.2d}, [pCRow1] fmla v4.2d, v20.2d, alphaV0_R fmls v4.2d, v21.2d, alphaV0_I - fmla v5.2d, v20.2d, alphaV1_I - fmla v5.2d, v21.2d, alphaV1_R + fmla v5.2d, v20.2d, alphaV0_I + fmla v5.2d, v21.2d, alphaV0_R st2 {v4.2d, v5.2d}, [pCRow1] add pCRow0, pCRow0, #32 @@ -898,18 +920,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro SAVE1x2 - fmov alpha0_R, alpha_save_R - fmov alpha0_I, alpha_save_I - fmov alpha1_R, alpha0_R - fmov alpha1_I, alpha0_I + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI mov pCRow1, pCRow0 ld2 {v0.d, v1.d}[0], [pCRow1] fmla d0, d16, alphaV0_R fmls d0, d17, alphaV0_I - fmla d1, d16, alphaV1_I - fmla d1, d17, alphaV1_R + fmla d1, d16, alphaV0_I + fmla d1, d17, alphaV0_R st2 {v0.d, v1.d}[0], [pCRow1] add pCRow1, pCRow1, LDC @@ -917,8 +937,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ld2 {v4.d, v5.d}[0], [pCRow1] fmla d4, d20, alphaV0_R fmls d4, d21, alphaV0_I - fmla d5, d20, alphaV1_I - fmla d5, d21, alphaV1_R + fmla d5, d20, alphaV0_I + fmla d5, d21, alphaV0_R st2 {v4.d, v5.d}[0], [pCRow1] add pCRow0, pCRow0, #16 @@ -953,25 +973,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE4x1 - fmov alpha0_R, alpha_save_R - fmov alpha0_I, alpha_save_I - fmov alpha1_R, alpha0_R - fmov alpha1_I, alpha0_I + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI mov pCRow1, pCRow0 ld2 {v0.2d, v1.2d}, [pCRow1] fmla v0.2d, v16.2d, alphaV0_R fmls v0.2d, v17.2d, alphaV0_I - fmla v1.2d, v16.2d, alphaV1_I - fmla v1.2d, v17.2d, alphaV1_R + fmla v1.2d, v16.2d, alphaV0_I + fmla v1.2d, v17.2d, alphaV0_R st2 {v0.2d, v1.2d}, [pCRow1] add pCRow2, pCRow1, #32 ld2 {v2.2d, v3.2d}, [pCRow2] fmla v2.2d, v18.2d, alphaV0_R fmls v2.2d, v19.2d, alphaV0_I - fmla v3.2d, v18.2d, alphaV1_I - fmla v3.2d, v19.2d, alphaV1_R + fmla v3.2d, v18.2d, alphaV0_I + fmla v3.2d, v19.2d, alphaV0_R st2 {v2.2d, v3.2d}, [pCRow2] add pCRow0, pCRow0, #64 @@ -997,18 +1015,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE2x1 - fmov alpha0_R, alpha_save_R - fmov alpha0_I, alpha_save_I - fmov alpha1_R, alpha0_R - fmov alpha1_I, alpha0_I + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI mov pCRow1, pCRow0 ld2 {v0.2d, v1.2d}, [pCRow1] fmla v0.2d, v16.2d, alphaV0_R fmls v0.2d, v17.2d, alphaV0_I - fmla v1.2d, v16.2d, alphaV1_I - fmla v1.2d, v17.2d, alphaV1_R + fmla v1.2d, v16.2d, alphaV0_I + fmla v1.2d, v17.2d, alphaV0_R st2 {v0.2d, v1.2d}, [pCRow1] add pCRow0, pCRow0, #32 @@ -1035,18 +1051,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE1x1 - fmov alpha0_R, alpha_save_R - fmov alpha0_I, alpha_save_I - fmov alpha1_R, alpha0_R - fmov alpha1_I, alpha0_I + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI mov pCRow1, pCRow0 ld2 {v0.d, v1.d}[0], [pCRow1] fmla d0, d16, alphaV0_R fmls d0, d17, alphaV0_I - fmla d1, d16, alphaV1_I - fmla d1, d17, alphaV1_R + fmla d1, d16, alphaV0_I + fmla d1, d17, alphaV0_R st2 {v0.d, v1.d}[0], [pCRow1] add pCRow0, pCRow0, #16 @@ -1072,8 +1086,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stp x26, x27, [sp, #(9 * 16)] str x28, [sp, #(10 * 16)] - fmov alpha_save_R, d0 - fmov alpha_save_I, d1 + prfm PLDL1KEEP, [origPB] + prfm PLDL1KEEP, [origPA] + + fmov alphaR, d0 + fmov alphaI, d1 lsl LDC, LDC, #4 // ldc = ldc * 2 * 8 @@ -1085,8 +1102,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
ble zgemm_kernel_L2_BEGIN zgemm_kernel_L4_BEGIN: - mov pCRow0, pC // pCRow0 = C - add pC, pC, LDC, lsl #2 + mov pCRow0, pC + add pCRow1, pCRow0, LDC + add pCRow2, pCRow1, LDC + add pCRow3, pCRow2, LDC + + add pC, pCRow3, LDC + mov pA, origPA // pA = start of A array zgemm_kernel_L4_M4_BEGIN: @@ -1096,42 +1118,68 @@ zgemm_kernel_L4_M4_BEGIN: cmp counterI, #0 ble zgemm_kernel_L4_M2_BEGIN + .align 5 zgemm_kernel_L4_M4_20: mov pB, origPB - asr counterL , origK, #1 // L = K / 2 - cmp counterL , #2 // is there at least 4 to do? + asr counterL , origK, #3 + cmp counterL , #2 blt zgemm_kernel_L4_M4_32 - KERNEL4x4_I // do one in the K - KERNEL4x4_M2 // do another in the K + KERNEL4x4_I + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 subs counterL, counterL, #2 // subtract 2 ble zgemm_kernel_L4_M4_22a - .align 5 + .align 5 zgemm_kernel_L4_M4_22: KERNEL4x4_M1 KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 subs counterL, counterL, #1 bgt zgemm_kernel_L4_M4_22 - + .align 5 zgemm_kernel_L4_M4_22a: + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_E b zgemm_kernel_L4_M4_44 + .align 5 zgemm_kernel_L4_M4_32: tst counterL, #1 ble zgemm_kernel_L4_M4_40 KERNEL4x4_I + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 KERNEL4x4_E b zgemm_kernel_L4_M4_44 @@ -1143,13 +1191,20 @@ zgemm_kernel_L4_M4_40: zgemm_kernel_L4_M4_44: - ands counterL , origK, #1 + ands counterL , origK, #7 ble zgemm_kernel_L4_M4_100 + .align 5 zgemm_kernel_L4_M4_46: KERNEL4x4_SUB + subs counterL, counterL, #1 + bne zgemm_kernel_L4_M4_46 + zgemm_kernel_L4_M4_100: + prfm PLDL1KEEP, [pA] + prfm PLDL1KEEP, [pA, #64] + prfm PLDL1KEEP, [origPB] SAVE4x4 diff --git a/kernel/arm64/ztrmm_kernel_4x4.S b/kernel/arm64/ztrmm_kernel_4x4.S index 7945870d6..77a7857ff 100644 --- a/kernel/arm64/ztrmm_kernel_4x4.S +++ b/kernel/arm64/ztrmm_kernel_4x4.S @@ -46,23 +46,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define pCRow0 x12 #define pCRow1 x13 #define pCRow2 x14 -#define pA x15 -#define alpha_save_R x16 -#define alpha_save_I x17 -#define temp x18 -#define tempOffset x19 -#define tempK x20 +#define pCRow3 x15 +#define pA x16 +#define alphaR x17 +#define alphaI x18 +#define temp x19 +#define tempOffset x20 +#define tempK x21 #define alpha0_R d10 #define alphaV0_R v10.d[0] #define alpha0_I d11 #define alphaV0_I v11.d[0] -#define alpha1_R d14 -#define alphaV1_R v14.d[0] -#define alpha1_I d15 -#define alphaV1_I v15.d[0] - +#define A_PRE_SIZE 2560 +#define B_PRE_SIZE 448 +#define C_PRE_SIZE 128 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #define OP_rr fmla @@ -93,7 +92,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // 04 origPB // 05 pC // 06 origLDC -> LDC -// 07 offset +// 07 offset -> temp // 08 counterL // 09 counterI // 10 counterJ @@ -101,13 +100,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
// 12 pCRow0 // 13 pCRow1 // 14 pCRow2 -// 15 pA -// 16 alpha_save_R -// 17 alpha_save_I -// 18 must save temp -// 19 must save tempOffset -// 20 must save tempK -// 21 must save +// 15 pCRow3 +// 16 pA +// 17 alpha_save_R +// 18 must save alpha_save_I +// 19 must save temp +// 20 must save tempOffset +// 21 must save tempK // 22 must save // 23 must save // 24 must save @@ -178,12 +177,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x4_I ld2 {v8.2d, v9.2d}, [pB] add pB, pB, #32 - ld2 {v10.2d, v11.2d}, [pB] - add pB, pB, #32 ld2 {v0.2d, v1.2d}, [pA] add pA, pA, #32 - ld2 {v2.2d, v3.2d}, [pA] - add pA, pA, #32 fmul v16.2d, v0.2d, v8.d[0] OP_ii v16.2d, v1.2d, v9.d[0] @@ -196,16 +191,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif OP_ir v17.2d, v1.2d, v8.d[0] - fmul v18.2d, v2.2d, v8.d[0] - OP_ii v18.2d, v3.2d, v9.d[0] -#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ - defined(RR) || defined(RC) || defined(CR) || defined(CC) - eor v19.16b, v19.16b, v19.16b - fmls v19.2d, v2.2d, v9.d[0] -#else - fmul v19.2d, v2.2d, v9.d[0] -#endif - OP_ir v19.2d, v3.2d, v8.d[0] + ld2 {v2.2d, v3.2d}, [pA] + add pA, pA, #32 fmul v20.2d, v0.2d, v8.d[1] OP_ii v20.2d, v1.2d, v9.d[1] @@ -218,6 +205,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif OP_ir v21.2d, v1.2d, v8.d[1] + ld2 {v10.2d, v11.2d}, [pB] + add pB, pB, #32 + fmul v22.2d, v2.2d, v8.d[1] OP_ii v22.2d, v3.2d, v9.d[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ @@ -229,6 +219,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif OP_ir v23.2d, v3.2d, v8.d[1] + ld2 {v12.2d, v13.2d}, [pB] + add pB, pB, #32 + + fmul v18.2d, v2.2d, v8.d[0] + OP_ii v18.2d, v3.2d, v9.d[0] +#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ + defined(RR) || defined(RC) || defined(CR) || defined(CC) + eor v19.16b, v19.16b, v19.16b + fmls v19.2d, v2.2d, v9.d[0] +#else + fmul v19.2d, v2.2d, v9.d[0] +#endif + OP_ir v19.2d, v3.2d, v8.d[0] + + ld2 {v4.2d, v5.2d} , [pA] + add pA, pA, #32 + fmul v24.2d, v0.2d, v10.d[0] OP_ii v24.2d, v1.2d, v11.d[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ @@ -240,6 +247,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif OP_ir v25.2d, v1.2d, v10.d[0] + ld2 {v6.2d, v7.2d} , [pA] + add pA, pA, #32 + fmul v26.2d, v2.2d, v10.d[0] OP_ii v26.2d, v3.2d, v11.d[0] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ @@ -251,6 +261,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif OP_ir v27.2d, v3.2d, v10.d[0] + ld2 {v14.2d, v15.2d}, [pB] + add pB, pB, #32 + fmul v28.2d, v0.2d, v10.d[1] OP_ii v28.2d, v1.2d, v11.d[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ @@ -262,6 +275,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif OP_ir v29.2d, v1.2d, v10.d[1] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + fmul v30.2d, v2.2d, v10.d[1] OP_ii v30.2d, v3.2d, v11.d[1] #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ @@ -273,14 +288,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif OP_ir v31.2d, v3.2d, v10.d[1] - ld2 {v12.2d, v13.2d}, [pB] - add pB, pB, #32 - ld2 {v14.2d, v15.2d}, [pB] - add pB, pB, #32 - ld2 {v4.2d, v5.2d} , [pA] - add pA, pA, #32 - ld2 {v6.2d, v7.2d} , [pA] - add pA, pA, #32 + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] .endm .macro KERNEL4x4_M1 @@ -289,7 +297,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. OP_ri v17.2d, v0.2d, v9.d[0] OP_ir v17.2d, v1.2d, v8.d[0] - ld2 {v12.2d, v13.2d}, [pB] // For next round + ld2 {v12.2d, v13.2d}, [pB] add pB, pB, #32 OP_rr v18.2d, v2.2d, v8.d[0] @@ -297,15 +305,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. OP_ri v19.2d, v2.2d, v9.d[0] OP_ir v19.2d, v3.2d, v8.d[0] - ld2 {v14.2d, v15.2d}, [pB] // For next round - add pB, pB, #32 + ld2 {v4.2d, v5.2d} , [pA] + add pA, pA, #32 OP_rr v20.2d, v0.2d, v8.d[1] OP_ii v20.2d, v1.2d, v9.d[1] OP_ri v21.2d, v0.2d, v9.d[1] OP_ir v21.2d, v1.2d, v8.d[1] - ld2 {v4.2d, v5.2d} , [pA] // For next round + ld2 {v6.2d, v7.2d} , [pA] add pA, pA, #32 OP_rr v22.2d, v2.2d, v8.d[1] @@ -313,22 +321,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. OP_ri v23.2d, v2.2d, v9.d[1] OP_ir v23.2d, v3.2d, v8.d[1] - ld2 {v6.2d, v7.2d} , [pA] // For next round - add pA, pA, #32 + ld2 {v14.2d, v15.2d}, [pB] + add pB, pB, #32 OP_rr v24.2d, v0.2d, v10.d[0] OP_ii v24.2d, v1.2d, v11.d[0] OP_ri v25.2d, v0.2d, v11.d[0] OP_ir v25.2d, v1.2d, v10.d[0] - prfm PLDL1KEEP, [pA, #512] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] OP_rr v26.2d, v2.2d, v10.d[0] OP_ii v26.2d, v3.2d, v11.d[0] OP_ri v27.2d, v2.2d, v11.d[0] OP_ir v27.2d, v3.2d, v10.d[0] - prfm PLDL1KEEP, [pB, #512] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] OP_rr v28.2d, v0.2d, v10.d[1] OP_ii v28.2d, v1.2d, v11.d[1] @@ -347,7 +355,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. OP_ri v17.2d, v4.2d, v13.d[0] OP_ir v17.2d, v5.2d, v12.d[0] - ld2 {v8.2d, v9.2d}, [pB] // For next round + ld2 {v8.2d, v9.2d}, [pB] add pB, pB, #32 OP_rr v18.2d, v6.2d, v12.d[0] @@ -355,15 +363,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. OP_ri v19.2d, v6.2d, v13.d[0] OP_ir v19.2d, v7.2d, v12.d[0] - ld2 {v10.2d, v11.2d}, [pB] // For next round - add pB, pB, #32 + ld2 {v0.2d, v1.2d}, [pA] + add pA, pA, #32 OP_rr v20.2d, v4.2d, v12.d[1] OP_ii v20.2d, v5.2d, v13.d[1] OP_ri v21.2d, v4.2d, v13.d[1] OP_ir v21.2d, v5.2d, v12.d[1] - ld2 {v0.2d, v1.2d}, [pA] // For next round + ld2 {v2.2d, v3.2d}, [pA] add pA, pA, #32 OP_rr v22.2d, v6.2d, v12.d[1] @@ -371,22 +379,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. OP_ri v23.2d, v6.2d, v13.d[1] OP_ir v23.2d, v7.2d, v12.d[1] - ld2 {v2.2d, v3.2d}, [pA] // For next round - add pA, pA, #32 + ld2 {v10.2d, v11.2d}, [pB] + add pB, pB, #32 OP_rr v24.2d, v4.2d, v14.d[0] OP_ii v24.2d, v5.2d, v15.d[0] OP_ri v25.2d, v4.2d, v15.d[0] OP_ir v25.2d, v5.2d, v14.d[0] - prfm PLDL1KEEP, [pA, #512] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] OP_rr v26.2d, v6.2d, v14.d[0] OP_ii v26.2d, v7.2d, v15.d[0] OP_ri v27.2d, v6.2d, v15.d[0] OP_ir v27.2d, v7.2d, v14.d[0] - prfm PLDL1KEEP, [pB, #512] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] OP_rr v28.2d, v4.2d, v14.d[1] OP_ii v28.2d, v5.2d, v15.d[1] @@ -415,6 +423,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
OP_ri v21.2d, v4.2d, v13.d[1] OP_ir v21.2d, v5.2d, v12.d[1] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + OP_rr v22.2d, v6.2d, v12.d[1] OP_ii v22.2d, v7.2d, v13.d[1] OP_ri v23.2d, v6.2d, v13.d[1] @@ -425,6 +435,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. OP_ri v25.2d, v4.2d, v15.d[0] OP_ir v25.2d, v5.2d, v14.d[0] + prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] + OP_rr v26.2d, v6.2d, v14.d[0] OP_ii v26.2d, v7.2d, v15.d[0] OP_ri v27.2d, v6.2d, v15.d[0] @@ -444,33 +456,40 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .macro KERNEL4x4_SUB ld2 {v8.2d, v9.2d}, [pB] add pB, pB, #32 - ld2 {v10.2d, v11.2d}, [pB] - add pB, pB, #32 + ld2 {v0.2d, v1.2d}, [pA] add pA, pA, #32 - ld2 {v2.2d, v3.2d}, [pA] - add pA, pA, #32 OP_rr v16.2d, v0.2d, v8.d[0] OP_ii v16.2d, v1.2d, v9.d[0] OP_ri v17.2d, v0.2d, v9.d[0] OP_ir v17.2d, v1.2d, v8.d[0] - OP_rr v18.2d, v2.2d, v8.d[0] - OP_ii v18.2d, v3.2d, v9.d[0] - OP_ri v19.2d, v2.2d, v9.d[0] - OP_ir v19.2d, v3.2d, v8.d[0] + ld2 {v2.2d, v3.2d}, [pA] + add pA, pA, #32 OP_rr v20.2d, v0.2d, v8.d[1] OP_ii v20.2d, v1.2d, v9.d[1] OP_ri v21.2d, v0.2d, v9.d[1] OP_ir v21.2d, v1.2d, v8.d[1] + ld2 {v10.2d, v11.2d}, [pB] + add pB, pB, #32 + + OP_rr v18.2d, v2.2d, v8.d[0] + OP_ii v18.2d, v3.2d, v9.d[0] + OP_ri v19.2d, v2.2d, v9.d[0] + OP_ir v19.2d, v3.2d, v8.d[0] + + prfm PLDL1KEEP, [pB, #B_PRE_SIZE] + OP_rr v22.2d, v2.2d, v8.d[1] OP_ii v22.2d, v3.2d, v9.d[1] OP_ri v23.2d, v2.2d, v9.d[1] OP_ir v23.2d, v3.2d, v8.d[1] + prfm PLDL1KEEP, [pA, #A_PRE_SIZE] + OP_rr v24.2d, v0.2d, v10.d[0] OP_ii v24.2d, v1.2d, v11.d[0] OP_ri v25.2d, v0.2d, v11.d[0] @@ -493,66 +512,77 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE4x4 - fmov alpha0_R, alpha_save_R - fmov alpha0_I, alpha_save_I - fmov alpha1_R, alpha0_R - fmov alpha1_I, alpha0_I + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI - mov pCRow1, pCRow0 + prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] fmul v0.2d, v16.2d, alphaV0_R fmls v0.2d, v17.2d, alphaV0_I - fmul v1.2d, v16.2d, alphaV1_I - fmla v1.2d, v17.2d, alphaV1_R - st2 {v0.2d, v1.2d}, [pCRow1] - add pCRow2, pCRow1, #32 + fmul v1.2d, v16.2d, alphaV0_I + fmla v1.2d, v17.2d, alphaV0_R + st2 {v0.2d, v1.2d}, [pCRow0] + + add pCRow0, pCRow0, #32 + fmul v2.2d, v18.2d, alphaV0_R fmls v2.2d, v19.2d, alphaV0_I - fmul v3.2d, v18.2d, alphaV1_I - fmla v3.2d, v19.2d, alphaV1_R - st2 {v2.2d, v3.2d}, [pCRow2] + fmul v3.2d, v18.2d, alphaV0_I + fmla v3.2d, v19.2d, alphaV0_R + st2 {v2.2d, v3.2d}, [pCRow0] + + add pCRow0, pCRow0, #32 + prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] - add pCRow1, pCRow1, LDC fmul v4.2d, v20.2d, alphaV0_R fmls v4.2d, v21.2d, alphaV0_I - fmul v5.2d, v20.2d, alphaV1_I - fmla v5.2d, v21.2d, alphaV1_R + fmul v5.2d, v20.2d, alphaV0_I + fmla v5.2d, v21.2d, alphaV0_R st2 {v4.2d, v5.2d}, [pCRow1] - add pCRow2, pCRow1, #32 + + add pCRow1, pCRow1, #32 + fmul v6.2d, v22.2d, alphaV0_R fmls v6.2d, v23.2d, alphaV0_I - fmul v7.2d, v22.2d, alphaV1_I - fmla v7.2d, v23.2d, alphaV1_R - st2 {v6.2d, v7.2d}, [pCRow2] + fmul v7.2d, v22.2d, alphaV0_I + fmla v7.2d, v23.2d, alphaV0_R + st2 {v6.2d, v7.2d}, [pCRow1] + + add pCRow1, pCRow1, #32 + prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] - add pCRow1, pCRow1, LDC fmul v0.2d, v24.2d, alphaV0_R fmls v0.2d, v25.2d, alphaV0_I - fmul v1.2d, v24.2d, alphaV1_I - fmla v1.2d, v25.2d, alphaV1_R - st2 {v0.2d, v1.2d}, [pCRow1] - add pCRow2, pCRow1, #32 + fmul v1.2d, v24.2d, alphaV0_I + fmla v1.2d, v25.2d, alphaV0_R + st2 {v0.2d, v1.2d}, [pCRow2] + + add pCRow2, pCRow2, #32 + fmul v2.2d, v26.2d, 
alphaV0_R fmls v2.2d, v27.2d, alphaV0_I - fmul v3.2d, v26.2d, alphaV1_I - fmla v3.2d, v27.2d, alphaV1_R + fmul v3.2d, v26.2d, alphaV0_I + fmla v3.2d, v27.2d, alphaV0_R st2 {v2.2d, v3.2d}, [pCRow2] - add pCRow1, pCRow1, LDC + add pCRow2, pCRow2, #32 + prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] fmul v4.2d, v28.2d, alphaV0_R fmls v4.2d, v29.2d, alphaV0_I - fmul v5.2d, v28.2d, alphaV1_I - fmla v5.2d, v29.2d, alphaV1_R - st2 {v4.2d, v5.2d}, [pCRow1] - add pCRow2, pCRow1, #32 + fmul v5.2d, v28.2d, alphaV0_I + fmla v5.2d, v29.2d, alphaV0_R + st2 {v4.2d, v5.2d}, [pCRow3] + + add pCRow3, pCRow3, #32 + fmul v6.2d, v30.2d, alphaV0_R fmls v6.2d, v31.2d, alphaV0_I - fmul v7.2d, v30.2d, alphaV1_I - fmla v7.2d, v31.2d, alphaV1_R - st2 {v6.2d, v7.2d}, [pCRow2] + fmul v7.2d, v30.2d, alphaV0_I + fmla v7.2d, v31.2d, alphaV0_R + st2 {v6.2d, v7.2d}, [pCRow3] - add pCRow0, pCRow0, #64 + add pCRow3, pCRow3, #32 .endm /******************************************************************************/ @@ -599,41 +629,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE2x4 - fmov alpha0_R, alpha_save_R - fmov alpha0_I, alpha_save_I - fmov alpha1_R, alpha0_R - fmov alpha1_I, alpha0_I + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI mov pCRow1, pCRow0 fmul v0.2d, v16.2d, alphaV0_R fmls v0.2d, v17.2d, alphaV0_I - fmul v1.2d, v16.2d, alphaV1_I - fmla v1.2d, v17.2d, alphaV1_R + fmul v1.2d, v16.2d, alphaV0_I + fmla v1.2d, v17.2d, alphaV0_R st2 {v0.2d, v1.2d}, [pCRow1] add pCRow1, pCRow1, LDC fmul v4.2d, v20.2d, alphaV0_R fmls v4.2d, v21.2d, alphaV0_I - fmul v5.2d, v20.2d, alphaV1_I - fmla v5.2d, v21.2d, alphaV1_R + fmul v5.2d, v20.2d, alphaV0_I + fmla v5.2d, v21.2d, alphaV0_R st2 {v4.2d, v5.2d}, [pCRow1] add pCRow1, pCRow1, LDC fmul v0.2d, v24.2d, alphaV0_R fmls v0.2d, v25.2d, alphaV0_I - fmul v1.2d, v24.2d, alphaV1_I - fmla v1.2d, v25.2d, alphaV1_R + fmul v1.2d, v24.2d, alphaV0_I + fmla v1.2d, v25.2d, alphaV0_R st2 {v0.2d, v1.2d}, [pCRow1] add pCRow1, pCRow1, LDC fmul v4.2d, v28.2d, alphaV0_R fmls v4.2d, v29.2d, alphaV0_I - fmul v5.2d, v28.2d, alphaV1_I - fmla v5.2d, v29.2d, alphaV1_R + fmul v5.2d, v28.2d, alphaV0_I + fmla v5.2d, v29.2d, alphaV0_R st2 {v4.2d, v5.2d}, [pCRow1] add pCRow0, pCRow0, #32 @@ -682,41 +710,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE1x4 - fmov alpha0_R, alpha_save_R - fmov alpha0_I, alpha_save_I - fmov alpha1_R, alpha0_R - fmov alpha1_I, alpha0_I + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI mov pCRow1, pCRow0 fmul d0, d16, alphaV0_R fmls d0, d17, alphaV0_I - fmul d1, d16, alphaV1_I - fmla d1, d17, alphaV1_R + fmul d1, d16, alphaV0_I + fmla d1, d17, alphaV0_R st2 {v0.d, v1.d}[0], [pCRow1] add pCRow1, pCRow1, LDC fmul d4, d20, alphaV0_R fmls d4, d21, alphaV0_I - fmul d5, d20, alphaV1_I - fmla d5, d21, alphaV1_R + fmul d5, d20, alphaV0_I + fmla d5, d21, alphaV0_R st2 {v4.d, v5.d}[0], [pCRow1] add pCRow1, pCRow1, LDC fmul d0, d24, alphaV0_R fmls d0, d25, alphaV0_I - fmul d1, d24, alphaV1_I - fmla d1, d25, alphaV1_R + fmul d1, d24, alphaV0_I + fmla d1, d25, alphaV0_R st2 {v0.d, v1.d}[0], [pCRow1] add pCRow1, pCRow1, LDC fmul d4, d28, alphaV0_R fmls d4, d29, alphaV0_I - fmul d5, d28, alphaV1_I - fmla d5, d29, alphaV1_R + fmul d5, d28, alphaV0_I + fmla d5, d29, alphaV0_R st2 {v4.d, v5.d}[0], [pCRow1] add pCRow0, pCRow0, #16 @@ -765,37 +791,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro SAVE4x2 - fmov alpha0_R, alpha_save_R - fmov alpha0_I, alpha_save_I - fmov alpha1_R, alpha0_R - fmov alpha1_I, alpha0_I + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI mov pCRow1, pCRow0 fmul v0.2d, v16.2d, alphaV0_R fmls v0.2d, v17.2d, alphaV0_I - fmul v1.2d, v16.2d, alphaV1_I - fmla v1.2d, v17.2d, alphaV1_R + fmul v1.2d, v16.2d, alphaV0_I + fmla v1.2d, v17.2d, alphaV0_R st2 {v0.2d, v1.2d}, [pCRow1] add pCRow2, pCRow1, #32 fmul v2.2d, v18.2d, alphaV0_R fmls v2.2d, v19.2d, alphaV0_I - fmul v3.2d, v18.2d, alphaV1_I - fmla v3.2d, v19.2d, alphaV1_R + fmul v3.2d, v18.2d, alphaV0_I + fmla v3.2d, v19.2d, alphaV0_R st2 {v2.2d, v3.2d}, [pCRow2] add pCRow1, pCRow1, LDC fmul v4.2d, v20.2d, alphaV0_R fmls v4.2d, v21.2d, alphaV0_I - fmul v5.2d, v20.2d, alphaV1_I - fmla v5.2d, v21.2d, alphaV1_R + fmul v5.2d, v20.2d, alphaV0_I + fmla v5.2d, v21.2d, alphaV0_R st2 {v4.2d, v5.2d}, [pCRow1] add pCRow2, pCRow1, #32 fmul v6.2d, v22.2d, alphaV0_R fmls v6.2d, v23.2d, alphaV0_I - fmul v7.2d, v22.2d, alphaV1_I - fmla v7.2d, v23.2d, alphaV1_R + fmul v7.2d, v22.2d, alphaV0_I + fmla v7.2d, v23.2d, alphaV0_R st2 {v6.2d, v7.2d}, [pCRow2] add pCRow0, pCRow0, #64 @@ -828,25 +852,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE2x2 - fmov alpha0_R, alpha_save_R - fmov alpha0_I, alpha_save_I - fmov alpha1_R, alpha0_R - fmov alpha1_I, alpha0_I + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI mov pCRow1, pCRow0 fmul v0.2d, v16.2d, alphaV0_R fmls v0.2d, v17.2d, alphaV0_I - fmul v1.2d, v16.2d, alphaV1_I - fmla v1.2d, v17.2d, alphaV1_R + fmul v1.2d, v16.2d, alphaV0_I + fmla v1.2d, v17.2d, alphaV0_R st2 {v0.2d, v1.2d}, [pCRow1] add pCRow1, pCRow1, LDC fmul v4.2d, v20.2d, alphaV0_R fmls v4.2d, v21.2d, alphaV0_I - fmul v5.2d, v20.2d, alphaV1_I - fmla v5.2d, v21.2d, alphaV1_R + fmul v5.2d, v20.2d, alphaV0_I + fmla v5.2d, v21.2d, alphaV0_R st2 {v4.2d, v5.2d}, [pCRow1] add pCRow0, pCRow0, #32 @@ -879,25 +901,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE1x2 - fmov alpha0_R, alpha_save_R - fmov alpha0_I, alpha_save_I - fmov alpha1_R, alpha0_R - fmov alpha1_I, alpha0_I + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI mov pCRow1, pCRow0 fmul d0, d16, alphaV0_R fmls d0, d17, alphaV0_I - fmul d1, d16, alphaV1_I - fmla d1, d17, alphaV1_R + fmul d1, d16, alphaV0_I + fmla d1, d17, alphaV0_R st2 {v0.d, v1.d}[0], [pCRow1] add pCRow1, pCRow1, LDC fmul d4, d20, alphaV0_R fmls d4, d21, alphaV0_I - fmul d5, d20, alphaV1_I - fmla d5, d21, alphaV1_R + fmul d5, d20, alphaV0_I + fmla d5, d21, alphaV0_R st2 {v4.d, v5.d}[0], [pCRow1] add pCRow0, pCRow0, #16 @@ -932,23 +952,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE4x1 - fmov alpha0_R, alpha_save_R - fmov alpha0_I, alpha_save_I - fmov alpha1_R, alpha0_R - fmov alpha1_I, alpha0_I + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI mov pCRow1, pCRow0 fmul v0.2d, v16.2d, alphaV0_R fmls v0.2d, v17.2d, alphaV0_I - fmul v1.2d, v16.2d, alphaV1_I - fmla v1.2d, v17.2d, alphaV1_R + fmul v1.2d, v16.2d, alphaV0_I + fmla v1.2d, v17.2d, alphaV0_R st2 {v0.2d, v1.2d}, [pCRow1] add pCRow2, pCRow1, #32 fmul v2.2d, v18.2d, alphaV0_R fmls v2.2d, v19.2d, alphaV0_I - fmul v3.2d, v18.2d, alphaV1_I - fmla v3.2d, v19.2d, alphaV1_R + fmul v3.2d, v18.2d, alphaV0_I + fmla v3.2d, v19.2d, alphaV0_R st2 {v2.2d, v3.2d}, [pCRow2] add pCRow0, pCRow0, #64 @@ -974,17 +992,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
.endm .macro SAVE2x1 - fmov alpha0_R, alpha_save_R - fmov alpha0_I, alpha_save_I - fmov alpha1_R, alpha0_R - fmov alpha1_I, alpha0_I + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI mov pCRow1, pCRow0 fmul v0.2d, v16.2d, alphaV0_R fmls v0.2d, v17.2d, alphaV0_I - fmul v1.2d, v16.2d, alphaV1_I - fmla v1.2d, v17.2d, alphaV1_R + fmul v1.2d, v16.2d, alphaV0_I + fmla v1.2d, v17.2d, alphaV0_R st2 {v0.2d, v1.2d}, [pCRow1] add pCRow0, pCRow0, #32 @@ -1011,17 +1027,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. .endm .macro SAVE1x1 - fmov alpha0_R, alpha_save_R - fmov alpha0_I, alpha_save_I - fmov alpha1_R, alpha0_R - fmov alpha1_I, alpha0_I + fmov alpha0_R, alphaR + fmov alpha0_I, alphaI mov pCRow1, pCRow0 fmul d0, d16, alphaV0_R fmls d0, d17, alphaV0_I - fmul d1, d16, alphaV1_I - fmla d1, d17, alphaV1_R + fmul d1, d16, alphaV0_I + fmla d1, d17, alphaV0_R st2 {v0.d, v1.d}[0], [pCRow1] add pCRow0, pCRow0, #16 @@ -1047,8 +1061,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stp x26, x27, [sp, #(9 * 16)] str x28, [sp, #(10 * 16)] - fmov alpha_save_R, d0 - fmov alpha_save_I, d1 + prfm PLDL1KEEP, [origPB] + prfm PLDL1KEEP, [origPA] + + fmov alphaR, d0 + fmov alphaI, d1 lsl LDC, LDC, #4 // ldc = ldc * 2 * 8 @@ -1064,8 +1081,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ble ztrmm_kernel_L2_BEGIN ztrmm_kernel_L4_BEGIN: - mov pCRow0, pC // pCRow0 = C - add pC, pC, LDC, lsl #2 + mov pCRow0, pC + add pCRow1, pCRow0, LDC + add pCRow2, pCRow1, LDC + add pCRow3, pCRow2, LDC + + add pC, pCRow3, LDC + #if defined(LEFT) mov tempOffset, offset @@ -1079,6 +1101,7 @@ ztrmm_kernel_L4_M4_BEGIN: cmp counterI, #0 ble ztrmm_kernel_L4_M2_BEGIN + .align 5 ztrmm_kernel_L4_M4_20: #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) @@ -1098,39 +1121,64 @@ ztrmm_kernel_L4_M4_20: add tempK, tempOffset, #4 #endif - asr counterL , tempK, #1 // L = K / 2 - cmp counterL , #2 // is there at least 4 to do? 
+ asr counterL , tempK, #3 + cmp counterL , #2 blt ztrmm_kernel_L4_M4_32 - KERNEL4x4_I // do one in the K - KERNEL4x4_M2 // do another in the K + KERNEL4x4_I + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 subs counterL, counterL, #2 ble ztrmm_kernel_L4_M4_22a - .align 5 + .align 5 ztrmm_kernel_L4_M4_22: KERNEL4x4_M1 KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 subs counterL, counterL, #1 bgt ztrmm_kernel_L4_M4_22 - + .align 5 ztrmm_kernel_L4_M4_22a: + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 KERNEL4x4_M1 KERNEL4x4_E b ztrmm_kernel_L4_M4_44 + .align 5 ztrmm_kernel_L4_M4_32: tst counterL, #1 ble ztrmm_kernel_L4_M4_40 KERNEL4x4_I + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 + KERNEL4x4_M2 + KERNEL4x4_M1 KERNEL4x4_E b ztrmm_kernel_L4_M4_44 @@ -1142,12 +1190,16 @@ ztrmm_kernel_L4_M4_40: ztrmm_kernel_L4_M4_44: - ands counterL , tempK, #1 + ands counterL , tempK, #7 ble ztrmm_kernel_L4_M4_100 + .align 5 ztrmm_kernel_L4_M4_46: KERNEL4x4_SUB + subs counterL, counterL, #1 + bne ztrmm_kernel_L4_M4_46 + ztrmm_kernel_L4_M4_100: SAVE4x4 @@ -1167,6 +1219,10 @@ ztrmm_kernel_L4_M4_100: add tempOffset, tempOffset, #4 #endif + prfm PLDL1KEEP, [pA] + prfm PLDL1KEEP, [pA, #64] + prfm PLDL1KEEP, [origPB] + ztrmm_kernel_L4_M4_END: subs counterI, counterI, #1 bne ztrmm_kernel_L4_M4_20 diff --git a/param.h b/param.h index fdc9d1104..7635cb8fc 100644 --- a/param.h +++ b/param.h @@ -2341,13 +2341,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_B 0 #define GEMM_DEFAULT_ALIGN 0x03fffUL -#define SGEMM_DEFAULT_UNROLL_M 4 +#define SGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_N 4 -#define DGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_M 8 #define DGEMM_DEFAULT_UNROLL_N 4 -#define CGEMM_DEFAULT_UNROLL_M 4 +#define CGEMM_DEFAULT_UNROLL_M 8 #define CGEMM_DEFAULT_UNROLL_N 4 #define ZGEMM_DEFAULT_UNROLL_M 4
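
For reference (not part of the patch itself): after this change the zgemm/ztrmm SAVE macros scale the accumulators with a single alpha register pair, alphaV0_R/alphaV0_I, instead of the duplicated alpha1_* copies. Per stored element the arithmetic is ordinary complex multiplication by alpha; a minimal scalar C sketch follows, with hypothetical names (zscale_accumulate, acc_r, acc_i) that do not appear in the source tree:

    /* Illustrative sketch only: the per-element update performed by the zgemm
     * SAVE macros (fmla/fmls against alphaV0_R and alphaV0_I), written in
     * scalar C.  All identifiers here are hypothetical. */
    static inline void zscale_accumulate(double *c_r, double *c_i,
                                         double acc_r, double acc_i,
                                         double alpha_r, double alpha_i)
    {
        /* C += alpha * acc, with alpha and acc complex */
        *c_r += acc_r * alpha_r - acc_i * alpha_i;  /* fmla ..alphaV0_R ; fmls ..alphaV0_I */
        *c_i += acc_r * alpha_i + acc_i * alpha_r;  /* fmla ..alphaV0_I ; fmla ..alphaV0_R */
    }

The ztrmm variants perform the same multiplication but overwrite rather than accumulate (fmul/fmls followed by fmul/fmla), i.e. C = alpha * acc instead of C += alpha * acc.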